From cbe882298e4ddc3917dfaf239eca475fe06d62d4 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Thu, 8 Feb 2024 10:19:52 -0700 Subject: [PATCH] Add slow disk diagnosis to ZED Slow disk response times can be indicative of a failing drive. ZFS currently tracks slow I/Os (slower than zio_slow_io_ms) and generates events (ereport.fs.zfs.delay). However, no action is taken by ZED, like is done for checksum or I/O errors. This change adds slow disk diagnosis to ZED which is opt-in using new VDEV properties: VDEV_PROP_SLOW_IO_N VDEV_PROP_SLOW_IO_T If multiple VDEVs in a pool are undergoing slow I/Os, then it skips the zpool_vdev_degrade(). Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Reviewed-by: Tony Hutter Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Co-authored-by: Rob Wing Signed-off-by: Don Brady Closes #15469 --- cmd/zed/agents/fmd_api.c | 57 +++-- cmd/zed/agents/fmd_api.h | 3 +- cmd/zed/agents/fmd_serd.c | 3 +- cmd/zed/agents/fmd_serd.h | 2 +- cmd/zed/agents/zfs_diagnosis.c | 143 +++++++++--- cmd/zed/agents/zfs_retire.c | 3 + cmd/zinject/zinject.c | 16 ++ cmd/zpool/zpool_main.c | 8 +- include/sys/fm/fs/zfs.h | 2 + include/sys/fs/zfs.h | 2 + include/sys/vdev_impl.h | 5 +- lib/libzfs/libzfs.abi | 4 +- lib/libzfs/libzfs_pool.c | 2 + lib/libzfs/libzfs_util.c | 4 +- man/man7/vdevprops.7 | 4 +- man/man7/zpoolconcepts.7 | 4 +- man/man8/zinject.8 | 1 + module/zcommon/zpool_prop.c | 6 + module/zfs/vdev.c | 30 +++ module/zfs/zfs_fm.c | 26 +++ module/zfs/zio_inject.c | 4 + tests/runfiles/linux.run | 3 +- tests/zfs-tests/tests/Makefile.am | 2 + .../cli_root/zpool_get/vdev_get.cfg | 2 + .../tests/functional/events/cleanup.ksh | 6 +- .../tests/functional/events/zed_slow_io.ksh | 205 ++++++++++++++++++ .../events/zed_slow_io_many_vdevs.ksh | 177 +++++++++++++++ .../tests/functional/fault/cleanup.ksh | 1 + .../tests/functional/fault/setup.ksh | 1 + 29 files changed, 655 insertions(+), 71 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/events/zed_slow_io.ksh create mode 100755 tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c index 4a6cfbf8c..fe43e2ab9 100644 --- a/cmd/zed/agents/fmd_api.c +++ b/cmd/zed/agents/fmd_api.c @@ -22,6 +22,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2023, Klara Inc. */ /* @@ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name) if (strcmp(name, "spare_on_remove") == 0) return (1); - if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0) - return (10); /* N = 10 events */ - - return (0); -} - -int64_t -fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name) -{ - (void) hdl; - - /* - * These can be looked up in mp->modinfo->fmdi_props - * For now we just hard code for phase 2. In the - * future, there can be a ZED based override. 
- */ - if (strcmp(name, "remove_timeout") == 0) - return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */ - - if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0) - return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */ - return (0); } @@ -535,6 +514,19 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name) return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL); } +int +fmd_serd_active(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); + return (0); + } + return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp)); +} + void fmd_serd_reset(fmd_hdl_t *hdl, const char *name) { @@ -543,12 +535,10 @@ fmd_serd_reset(fmd_hdl_t *hdl, const char *name) if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); - return; + } else { + fmd_serd_eng_reset(sgp); + fmd_hdl_debug(hdl, "serd_reset %s", name); } - - fmd_serd_eng_reset(sgp); - - fmd_hdl_debug(hdl, "serd_reset %s", name); } int @@ -556,16 +546,21 @@ fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep) { fmd_module_t *mp = (fmd_module_t *)hdl; fmd_serd_eng_t *sgp; - int err; if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'", name); return (0); } - err = fmd_serd_eng_record(sgp, ep->ev_hrt); + return (fmd_serd_eng_record(sgp, ep->ev_hrt)); +} - return (err); +void +fmd_serd_gc(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL); } /* FMD Timers */ @@ -579,7 +574,7 @@ _timer_notify(union sigval sv) const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; struct itimerspec its; - fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid); + fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid); /* disarm the timer */ memset(&its, 0, sizeof (struct itimerspec)); diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h index b940d0d39..8471feecf 100644 --- a/cmd/zed/agents/fmd_api.h +++ b/cmd/zed/agents/fmd_api.h @@ -151,7 +151,6 @@ extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list); extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...); extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *); -extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *); #define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */ #define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */ @@ -195,10 +194,12 @@ extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *); extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t); extern void fmd_serd_destroy(fmd_hdl_t *, const char *); extern int fmd_serd_exists(fmd_hdl_t *, const char *); +extern int fmd_serd_active(fmd_hdl_t *, const char *); extern void fmd_serd_reset(fmd_hdl_t *, const char *); extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *); extern int fmd_serd_fired(fmd_hdl_t *, const char *); extern int fmd_serd_empty(fmd_hdl_t *, const char *); +extern void fmd_serd_gc(fmd_hdl_t *); extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t); extern void fmd_timer_remove(fmd_hdl_t *, id_t); diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c index 0bb2c535f..f942e62b3 100644 --- a/cmd/zed/agents/fmd_serd.c +++ b/cmd/zed/agents/fmd_serd.c @@ -310,8 +310,9 @@ 
fmd_serd_eng_reset(fmd_serd_eng_t *sgp) } void -fmd_serd_eng_gc(fmd_serd_eng_t *sgp) +fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg) { + (void) arg; fmd_serd_elem_t *sep, *nep; hrtime_t hrt; diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h index 25b6888e6..80ff9a3b2 100644 --- a/cmd/zed/agents/fmd_serd.h +++ b/cmd/zed/agents/fmd_serd.h @@ -77,7 +77,7 @@ extern int fmd_serd_eng_fired(fmd_serd_eng_t *); extern int fmd_serd_eng_empty(fmd_serd_eng_t *); extern void fmd_serd_eng_reset(fmd_serd_eng_t *); -extern void fmd_serd_eng_gc(fmd_serd_eng_t *); +extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *); #ifdef __cplusplus } diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c index f6ba334a3..e0ad00800 100644 --- a/cmd/zed/agents/zfs_diagnosis.c +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -23,6 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2023, Klara Inc. */ #include @@ -47,11 +48,16 @@ #define DEFAULT_CHECKSUM_T 600 /* seconds */ #define DEFAULT_IO_N 10 /* events */ #define DEFAULT_IO_T 600 /* seconds */ +#define DEFAULT_SLOW_IO_N 10 /* events */ +#define DEFAULT_SLOW_IO_T 30 /* seconds */ + +#define CASE_GC_TIMEOUT_SECS 43200 /* 12 hours */ /* - * Our serd engines are named 'zfs___{checksum,io}'. This - * #define reserves enough space for two 64-bit hex values plus the length of - * the longest string. + * Our serd engines are named in the following format: + * 'zfs___{checksum,io,slow_io}' + * This #define reserves enough space for two 64-bit hex values plus the + * length of the longest string. */ #define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum")) @@ -68,6 +74,7 @@ typedef struct zfs_case_data { int zc_pool_state; char zc_serd_checksum[MAX_SERDLEN]; char zc_serd_io[MAX_SERDLEN]; + char zc_serd_slow_io[MAX_SERDLEN]; int zc_has_remove_timer; } zfs_case_data_t; @@ -114,7 +121,8 @@ zfs_de_stats_t zfs_stats = { { "resource_drops", FMD_TYPE_UINT64, "resource related ereports" } }; -static hrtime_t zfs_remove_timeout; +/* wait 15 seconds after a removal */ +static hrtime_t zfs_remove_timeout = SEC2NSEC(15); uu_list_pool_t *zfs_case_pool; uu_list_t *zfs_cases; @@ -124,6 +132,8 @@ uu_list_t *zfs_cases; #define ZFS_MAKE_EREPORT(type) \ FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type +static void zfs_purge_cases(fmd_hdl_t *hdl); + /* * Write out the persistent representation of an active case. */ @@ -170,6 +180,42 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) return (zcp); } +/* + * count other unique slow-io cases in a pool + */ +static uint_t +zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case) +{ + zfs_case_t *zcp; + uint_t cases = 0; + static hrtime_t next_check = 0; + + /* + * Note that plumbing in some external GC would require adding locking, + * since most of this module code is not thread safe and assumes there + * is only one thread running against the module. So we perform GC here + * inline periodically so that future delay induced faults will be + * possible once the issue causing multiple vdev delays is resolved. 
+ */ + if (gethrestime_sec() > next_check) { + /* Periodically purge old SERD entries and stale cases */ + fmd_serd_gc(hdl); + zfs_purge_cases(hdl); + next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS; + } + + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid && + zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid && + zcp->zc_data.zc_serd_slow_io[0] != '\0' && + fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) { + cases++; + } + } + return (cases); +} + /* * Iterate over any active cases. If any cases are associated with a pool or * vdev which is no longer present on the system, close the associated case. @@ -376,6 +422,14 @@ zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, (long long unsigned int)vdev_guid, type); } +static void +zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp) +{ + fmd_hdl_debug(hdl, "retiring case"); + + fmd_case_close(hdl, zcp->zc_case); +} + /* * Solve a given ZFS case. This first checks to make sure the diagnosis is * still valid, as well as cleaning up any pending timer associated with the @@ -632,9 +686,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) if (strcmp(class, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 || strcmp(class, - ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 || - strcmp(class, - ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) { + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) { zfs_stats.resource_drops.fmds_value.ui64++; return; } @@ -702,6 +754,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) if (zcp->zc_data.zc_serd_checksum[0] != '\0') fmd_serd_reset(hdl, zcp->zc_data.zc_serd_checksum); + if (zcp->zc_data.zc_serd_slow_io[0] != '\0') + fmd_serd_reset(hdl, + zcp->zc_data.zc_serd_slow_io); } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) { uint64_t state = 0; @@ -730,7 +785,11 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) if (fmd_case_solved(hdl, zcp->zc_case)) return; - fmd_hdl_debug(hdl, "error event '%s'", class); + if (vdev_guid) + fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class, + vdev_guid); + else + fmd_hdl_debug(hdl, "error event '%s'", class); /* * Determine if we should solve the case and generate a fault. We solve @@ -779,6 +838,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) || fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) || + fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { const char *failmode = NULL; boolean_t checkremove = B_FALSE; @@ -814,6 +875,51 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) } if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep)) checkremove = B_TRUE; + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) { + uint64_t slow_io_n, slow_io_t; + + /* + * Create a slow io SERD engine when the VDEV has the + * 'vdev_slow_io_n' and 'vdev_slow_io_n' properties. 
+ */ + if (zcp->zc_data.zc_serd_slow_io[0] == '\0' && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, + &slow_io_n) == 0 && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, + &slow_io_t) == 0) { + zfs_serd_name(zcp->zc_data.zc_serd_slow_io, + pool_guid, vdev_guid, "slow_io"); + fmd_serd_create(hdl, + zcp->zc_data.zc_serd_slow_io, + slow_io_n, + SEC2NSEC(slow_io_t)); + zfs_case_serialize(zcp); + } + /* Pass event to SERD engine and see if this triggers */ + if (zcp->zc_data.zc_serd_slow_io[0] != '\0' && + fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io, + ep)) { + /* + * Ignore a slow io diagnosis when other + * VDEVs in the pool show signs of being slow. + */ + if (zfs_other_slow_cases(hdl, &zcp->zc_data)) { + zfs_case_retire(hdl, zcp); + fmd_hdl_debug(hdl, "pool %llu has " + "multiple slow io cases -- skip " + "degrading vdev %llu", + (u_longlong_t) + zcp->zc_data.zc_pool_guid, + (u_longlong_t) + zcp->zc_data.zc_vdev_guid); + } else { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.vdev.slow_io"); + } + } } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { /* @@ -924,6 +1030,8 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum); if (zcp->zc_data.zc_serd_io[0] != '\0') fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_serd_slow_io[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io); if (zcp->zc_data.zc_has_remove_timer) fmd_timer_remove(hdl, zcp->zc_remove_timer); @@ -932,30 +1040,15 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); } -/* - * We use the fmd gc entry point to look for old cases that no longer apply. - * This allows us to keep our set of case data small in a long running system. 
- */ -static void -zfs_fm_gc(fmd_hdl_t *hdl) -{ - zfs_purge_cases(hdl); -} - static const fmd_hdl_ops_t fmd_ops = { zfs_fm_recv, /* fmdo_recv */ zfs_fm_timeout, /* fmdo_timeout */ zfs_fm_close, /* fmdo_close */ NULL, /* fmdo_stats */ - zfs_fm_gc, /* fmdo_gc */ + NULL, /* fmdo_gc */ }; static const fmd_prop_t fmd_props[] = { - { "checksum_N", FMD_TYPE_UINT32, "10" }, - { "checksum_T", FMD_TYPE_TIME, "10min" }, - { "io_N", FMD_TYPE_UINT32, "10" }, - { "io_T", FMD_TYPE_TIME, "10min" }, - { "remove_timeout", FMD_TYPE_TIME, "15sec" }, { NULL, 0, NULL } }; @@ -996,8 +1089,6 @@ _zfs_diagnosis_init(fmd_hdl_t *hdl) (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) / sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats); - - zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout"); } void diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index a0e377a4a..1ef5c631a 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -523,6 +523,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, } else if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.checksum")) { degrade_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.vdev.slow_io")) { + degrade_device = B_TRUE; } else if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.device")) { fault_device = B_FALSE; diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index f1262ed77..a11b6d0b7 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -1083,6 +1083,22 @@ main(int argc, char **argv) libzfs_fini(g_zfs); return (1); } + + if (record.zi_nlanes) { + switch (io_type) { + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + case ZIO_TYPES: + break; + default: + (void) fprintf(stderr, "I/O type for a delay " + "must be 'read' or 'write'\n"); + usage(); + libzfs_fini(g_zfs); + return (1); + } + } + if (!error) error = ENXIO; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 8753d7263..0783271f4 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2569,7 +2569,13 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, break; case VDEV_AUX_ERR_EXCEEDED: - (void) printf(gettext("too many errors")); + if (vs->vs_read_errors + vs->vs_write_errors + + vs->vs_checksum_errors == 0 && children == 0 && + vs->vs_slow_ios > 0) { + (void) printf(gettext("too many slow I/Os")); + } else { + (void) printf(gettext("too many errors")); + } break; case VDEV_AUX_IO_FAILURE: diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index fb9e86492..c746600cd 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -82,6 +82,8 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T "vdev_cksum_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N "vdev_io_n" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index c6f7dcca7..025567e21 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -366,6 +366,8 @@ typedef enum { VDEV_PROP_IO_N, VDEV_PROP_IO_T, VDEV_PROP_RAIDZ_EXPANDING, + VDEV_PROP_SLOW_IO_N, + VDEV_PROP_SLOW_IO_T, VDEV_NUM_PROPS } vdev_prop_t; diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index dafab66c7..f39ebf031 
100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2023, Klara Inc. */ #ifndef _SYS_VDEV_IMPL_H @@ -454,12 +455,14 @@ struct vdev { zfs_ratelimit_t vdev_checksum_rl; /* - * Checksum and IO thresholds for tuning ZED + * Vdev properties for tuning ZED */ uint64_t vdev_checksum_n; uint64_t vdev_checksum_t; uint64_t vdev_io_n; uint64_t vdev_io_t; + uint64_t vdev_slow_io_n; + uint64_t vdev_slow_io_t; }; #define VDEV_PAD_SIZE (8 << 10) diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 7c39b134d..cdd2f04c2 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -5626,7 +5626,9 @@ - + + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index c7b8617ef..402c14a6b 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -5264,6 +5264,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: + case VDEV_PROP_SLOW_IO_N: + case VDEV_PROP_SLOW_IO_T: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else { diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index d0a63a4da..8e70af2e5 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -1704,7 +1704,9 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, (prop == VDEV_PROP_CHECKSUM_N || prop == VDEV_PROP_CHECKSUM_T || prop == VDEV_PROP_IO_N || - prop == VDEV_PROP_IO_T)) { + prop == VDEV_PROP_IO_T || + prop == VDEV_PROP_SLOW_IO_N || + prop == VDEV_PROP_SLOW_IO_T)) { *ivalp = UINT64_MAX; } diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 6eebfa006..3d3ebc072 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -44,7 +44,7 @@ section, below. Every vdev has a set of properties that export statistics about the vdev as well as control various behaviors. Properties are not inherited from top-level vdevs, with the exception of -checksum_n, checksum_t, io_n, and io_t. +checksum_n, checksum_t, io_n, io_t, slow_io_n, and slow_io_t. .Pp The values of numeric properties can be specified using human-readable suffixes .Po for example, @@ -117,7 +117,7 @@ If this device is currently being removed from the pool .Pp The following native properties can be used to change the behavior of a vdev. .Bl -tag -width "allocating" -.It Sy checksum_n , checksum_t , io_n , io_t +.It Sy checksum_n , checksum_t , io_n , io_t , slow_io_n , slow_io_t Tune the fault management daemon by specifying checksum/io thresholds of errors in seconds, respectively. These properties can be set on leaf and top-level vdevs. diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 index 98f3ee7cd..18dfca6dc 100644 --- a/man/man7/zpoolconcepts.7 +++ b/man/man7/zpoolconcepts.7 @@ -260,8 +260,8 @@ sufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet -compact .It -The number of checksum errors exceeds acceptable levels and the device is -degraded as an indication that something may be wrong. +The number of checksum errors or slow I/Os exceeds acceptable levels and the +device is degraded as an indication that something may be wrong. ZFS continues to use the device as necessary. .It The number of I/O errors exceeds acceptable levels. 
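As the vdevprops.7 change above documents, slow_io_n and slow_io_t are opt-in, per-vdev tunables for the fault management daemon. A minimal sketch of enabling slow-I/O diagnosis from the command line (the pool name "tank" and disk "sda" are hypothetical; the thresholds shown mirror the values used by the new tests rather than the built-in defaults of 10 events in 30 seconds):

    # Degrade the leaf vdev after 5 slow I/Os observed within a 60 second window
    zpool set slow_io_n=5 tank sda
    zpool set slow_io_t=60 tank sda

    # Confirm the per-vdev settings
    zpool get slow_io_n,slow_io_t tank sda
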
diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index 4f0bbae81..b692f1213 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -69,6 +69,7 @@ Force a vdev into the DEGRADED or FAULTED state. .Nm zinject .Fl d Ar vdev .Fl D Ar latency : Ns Ar lanes +.Op Fl T Ar read|write .Ar pool .Xc Add an artificial delay to I/O requests on a particular diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index e98063e8b..e2e3bf5be 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -431,6 +431,12 @@ vdev_prop_init(void) zprop_register_number(VDEV_PROP_IO_T, "io_t", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_VDEV, "", "IO_T", B_FALSE, sfeatures); + zprop_register_number(VDEV_PROP_SLOW_IO_N, "slow_io_n", UINT64_MAX, + PROP_DEFAULT, ZFS_TYPE_VDEV, "", "SLOW_IO_N", B_FALSE, + sfeatures); + zprop_register_number(VDEV_PROP_SLOW_IO_T, "slow_io_t", UINT64_MAX, + PROP_DEFAULT, ZFS_TYPE_VDEV, "", "SLOW_IO_T", B_FALSE, + sfeatures); /* default index (boolean) properties */ zprop_register_index(VDEV_PROP_REMOVING, "removing", 0, diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index d6286dc59..ebba453e2 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -677,6 +677,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); + vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); + vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); @@ -3755,6 +3757,18 @@ vdev_load(vdev_t *vd) if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, + &vd->vdev_slow_io_n); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, + &vd->vdev_slow_io_t); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); } /* @@ -5970,6 +5984,20 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_io_t = intval; break; + case VDEV_PROP_SLOW_IO_N: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_n = intval; + break; + case VDEV_PROP_SLOW_IO_T: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_t = intval; + break; default: /* Most processing is done in vdev_props_set_sync */ break; @@ -6313,6 +6341,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: + case VDEV_PROP_SLOW_IO_N: + case VDEV_PROP_SLOW_IO_T: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index c4eb74e87..481af2ba8 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -222,6 +222,12 @@ vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop) case VDEV_PROP_IO_T: propval = vd->vdev_io_t; break; + case VDEV_PROP_SLOW_IO_N: + propval = vd->vdev_slow_io_n; + break; + case VDEV_PROP_SLOW_IO_T: + propval = vd->vdev_slow_io_t; + break; default: propval = propdef; break; @@ -741,6 
+747,26 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, NULL); } + if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { + uint64_t slow_io_n, slow_io_t; + + slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N); + if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, + DATA_TYPE_UINT64, + slow_io_n, + NULL); + + slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T); + if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, + DATA_TYPE_UINT64, + slow_io_t, + NULL); + } + mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 3598351c4..609182f4a 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -605,6 +605,10 @@ zio_handle_io_delay(zio_t *zio) if (vd->vdev_guid != handler->zi_record.zi_guid) continue; + if (handler->zi_record.zi_iotype != ZIO_TYPES && + handler->zi_record.zi_iotype != zio->io_type) + continue; + /* * Defensive; should never happen as the array allocation * occurs prior to inserting this handler on the list. diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 6a4cd3fe6..a0b74ef4a 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -104,7 +104,8 @@ tags = ['functional', 'devices'] [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', - 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config'] + 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', + 'zed_slow_io', 'zed_slow_io_many_vdevs'] tags = ['functional', 'events'] [tests/functional/fadvise:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 01af258d5..fe9c92108 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1447,6 +1447,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/events/zed_fd_spill.ksh \ functional/events/zed_io_config.ksh \ functional/events/zed_rc_filter.ksh \ + functional/events/zed_slow_io.ksh \ + functional/events/zed_slow_io_many_vdevs.ksh \ functional/exec/cleanup.ksh \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg index 71a64d4fa..c3b9efd64 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg @@ -70,4 +70,6 @@ typeset -a properties=( checksum_t io_n io_t + slow_io_n + slow_io_t ) diff --git a/tests/zfs-tests/tests/functional/events/cleanup.ksh b/tests/zfs-tests/tests/functional/events/cleanup.ksh index ef6e098cf..669b8ae99 100755 --- a/tests/zfs-tests/tests/functional/events/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/events/cleanup.ksh @@ -26,8 +26,10 @@ . 
$STF_SUITE/include/libtest.shlib -zed_cleanup all-debug.sh all-syslog.sh all-dumpfds - zed_stop +zed_cleanup all-debug.sh all-syslog.sh all-dumpfds + +zed_events_drain + default_cleanup diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh new file mode 100755 index 000000000..d9fabb2c3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh @@ -0,0 +1,205 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +# DESCRIPTION: +# Verify that vdev properties, slow_io_n and slow_io_t, work with ZED. +# +# STRATEGY: +# 1. Create a pool with single vdev +# 2. Set slow_io_n/slow_io_t to non-default values +# 3. Inject slow io errors +# 4. Verify that ZED degrades vdev +# + +. $STF_SUITE/include/libtest.shlib + +TESTDIR="$TEST_BASE_DIR/zed_slow_io" +VDEV="$TEST_BASE_DIR/vdevfile.$$" +TESTPOOL="slow_io_pool" +FILEPATH="$TESTDIR/slow_io.testfile" + +OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) +OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) + +verify_runnable "both" + +function do_setup +{ + log_must truncate -s 1G $VDEV + default_setup_noexit $VDEV + zed_events_drain + log_must zfs set compression=off $TESTPOOL + log_must zfs set primarycache=none $TESTPOOL + log_must zfs set prefetch=none $TESTPOOL + log_must zfs set recordsize=512 $TESTPOOL + for i in {1..10}; do + dd if=/dev/urandom of=${FILEPATH}$i bs=512 count=1 2>/dev/null + done + zpool sync +} + +# intermediate cleanup +function do_clean +{ + log_must zinject -c all + log_must zpool destroy $TESTPOOL + log_must rm -f $VDEV +} + +# final cleanup +function cleanup +{ + log_must zinject -c all + + # if pool still exists then something failed so log additional info + if poolexists $TESTPOOL ; then + log_note "$(zpool status -s $TESTPOOL)" + echo "=================== zed log search ===================" + grep "Diagnosis Engine" $ZEDLET_DIR/zed.log + destroy_pool $TESTPOOL + fi + log_must zed_stop + + log_must rm -f $VDEV + + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS +} + +function start_slow_io +{ + zpool sync + log_must set_tunable64 ZIO_SLOW_IO_MS 10 + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000 + + log_must zinject -d $VDEV -D10:1 -T read $TESTPOOL + zpool sync +} + +function stop_slow_io +{ + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS + + log_must zinject -c all +} + +# Test default ZED settings: +# inject 10 events over 2.5 seconds, should not degrade. 
+function default_degrade +{ + do_setup + + start_slow_io + for i in {1..10}; do + dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null + sleep 0.25 + done + stop_slow_io + log_note "$(zpool status -s $TESTPOOL)" + + # give slow ZED a chance to process the delay events + sleep 18 + log_note "$(zpool status -s $TESTPOOL)" + + degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l) + log_note $degrades vdev degrades in ZED log + [ $degrades -eq "0" ] || \ + log_fail "expecting no degrade events, found $degrades" + + do_clean +} + +# change slow_io_n, slow_io_t to 5 events in 60 seconds +# fire more than 5 events, should degrade +function slow_io_degrade +{ + do_setup + + zpool set slow_io_n=5 $TESTPOOL $VDEV + zpool set slow_io_t=60 $TESTPOOL $VDEV + + start_slow_io + for i in {1..16}; do + dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null + sleep 0.5 + done + stop_slow_io + zpool sync + + # + # wait up to 60 seconds for kernel to produce at least 5 delay events + # + typeset -i i=0 + typeset -i events=0 + while [[ $i -lt 60 ]]; do + events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l) + [[ $events -ge "5" ]] && break + i=$((i+1)) + sleep 1 + done + log_note "$events delay events found" + + if [[ $events -ge "5" ]]; then + log_must wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 10 + fi + + do_clean +} + +# change slow_io_n, slow_io_t to 10 events in 1 second +# inject events spaced 0.5 seconds apart, should not degrade +function slow_io_no_degrade +{ + do_setup + + zpool set slow_io_n=10 $TESTPOOL $VDEV + zpool set slow_io_t=1 $TESTPOOL $VDEV + + start_slow_io + for i in {1..16}; do + dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null + sleep 0.5 + done + stop_slow_io + zpool sync + + log_mustnot wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 45 + + do_clean +} + +log_assert "Test ZED slow io configurability" +log_onexit cleanup + +log_must zed_events_drain +log_must zed_start + +default_degrade +slow_io_degrade +slow_io_no_degrade + +log_pass "Test ZED slow io configurability" diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh new file mode 100755 index 000000000..3357ae2e3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh @@ -0,0 +1,177 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +# DESCRIPTION: +# Verify that delay events from multiple vdevs doesnt degrade +# +# STRATEGY: +# 1. Create a pool with a 3 disk raidz vdev +# 2. Inject slow io errors +# 3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs +# + +. 
$STF_SUITE/include/libtest.shlib + +TESTDIR="$TEST_BASE_DIR/zed_slow_io" +VDEV1="$TEST_BASE_DIR/vdevfile1.$$" +VDEV2="$TEST_BASE_DIR/vdevfile2.$$" +VDEV3="$TEST_BASE_DIR/vdevfile3.$$" +VDEV4="$TEST_BASE_DIR/vdevfile4.$$" +VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4" +TESTPOOL="slow_io_pool" +FILEPATH="$TESTDIR/slow_io.testfile" + +OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) +OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + + # if pool still exists then something failed so log additional info + if poolexists $TESTPOOL ; then + log_note "$(zpool status -s $TESTPOOL)" + echo "=================== zed log search ===================" + grep "Diagnosis Engine" $ZEDLET_DIR/zed.log + destroy_pool $TESTPOOL + fi + log_must zed_stop + + log_must rm -f $VDEVS + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS +} + +function start_slow_io +{ + for vdev in $VDEVS + do + log_must zpool set slow_io_n=4 $TESTPOOL $vdev + log_must zpool set slow_io_t=60 $TESTPOOL $vdev + done + zpool sync + + log_must set_tunable64 ZIO_SLOW_IO_MS 10 + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000 + + for vdev in $VDEVS + do + log_must zinject -d $vdev -D10:1 $TESTPOOL + done + zpool sync +} + +function stop_slow_io +{ + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS + + log_must zinject -c all +} + +function multiple_slow_vdevs_test +{ + log_must truncate -s 1G $VDEVS + default_raidz_setup_noexit $VDEVS + + log_must zpool events -c + log_must zfs set compression=off $TESTPOOL + log_must zfs set primarycache=none $TESTPOOL + log_must zfs set recordsize=4K $TESTPOOL + + log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=20 + zpool sync + + # + # Read the file with slow io injected on the disks + # This will cause multiple errors on each disk to trip ZED SERD + # + # pool: slow_io_pool + # state: ONLINE + # config: + # + # NAME STATE READ WRITE CKSUM SLOW + # slow_io_pool ONLINE 0 0 0 - + # raidz1-0 ONLINE 0 0 0 - + # /var/tmp/vdevfile1.499278 ONLINE 0 0 0 113 + # /var/tmp/vdevfile2.499278 ONLINE 0 0 0 109 + # /var/tmp/vdevfile3.499278 ONLINE 0 0 0 96 + # /var/tmp/vdevfile4.499278 ONLINE 0 0 0 109 + # + start_slow_io + dd if=$FILEPATH of=/dev/null bs=1M count=20 2>/dev/null + stop_slow_io + + # count events available for processing + typeset -i i=0 + typeset -i events=0 + while [[ $i -lt 60 ]]; do + events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l) + [[ $events -ge "50" ]] && break + i=$((i+1)) + sleep 1 + done + log_note "$events delay events found" + if [[ $events -lt "50" ]]; then + log_note "bailing: not enough events to complete the test" + destroy_pool $TESTPOOL + return + fi + + # + # give slow ZED a chance to process the delay events + # + typeset -i i=0 + typeset -i skips=0 + while [[ $i -lt 75 ]]; do + skips=$(grep "retiring case" \ + $ZEDLET_DIR/zed.log | wc -l) + [[ $skips -gt "0" ]] && break + i=$((i+1)) + sleep 1 + done + + log_note $skips degrade skips in ZED log after $i seconds + [ $skips -gt "0" ] || log_fail "expecting to see skips" + + degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l) + log_note $degrades vdev degrades in ZED log + [ $degrades -eq "0" ] || \ + log_fail "expecting no degrade events, found $degrades" + + destroy_pool $TESTPOOL +} + +log_assert "Test ZED slow io across multiple vdevs" +log_onexit cleanup + +log_must zed_events_drain 
+log_must zed_start +multiple_slow_vdevs_test + +log_pass "Test ZED slow io across multiple vdevs" diff --git a/tests/zfs-tests/tests/functional/fault/cleanup.ksh b/tests/zfs-tests/tests/functional/fault/cleanup.ksh index 654343c0c..2959236b5 100755 --- a/tests/zfs-tests/tests/functional/fault/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/fault/cleanup.ksh @@ -32,5 +32,6 @@ cleanup_devices $DISKS zed_stop zed_cleanup resilver_finish-start-scrub.sh +zed_events_drain log_pass diff --git a/tests/zfs-tests/tests/functional/fault/setup.ksh b/tests/zfs-tests/tests/functional/fault/setup.ksh index 62f1c8ab5..61b9206ec 100755 --- a/tests/zfs-tests/tests/functional/fault/setup.ksh +++ b/tests/zfs-tests/tests/functional/fault/setup.ksh @@ -28,6 +28,7 @@ verify_runnable "global" +zed_events_drain zed_setup resilver_finish-start-scrub.sh zed_start
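
The zed_slow_io.ksh test above exercises the full diagnosis path. Condensed into an interactive sketch (assumes a Linux host with ZED running; the pool "tank" and disk "sda" are hypothetical, and the zio_slow_io_ms parameter path is the usual Linux location for the tunable named in the commit message):

    # Opt the vdev in: fault after 5 slow I/Os within 60 seconds
    zpool set slow_io_n=5 tank sda
    zpool set slow_io_t=60 tank sda

    # Lower the slow-I/O threshold so the injected 10 ms delays count as slow
    echo 10 > /sys/module/zfs/parameters/zio_slow_io_ms

    # Delay reads on the vdev by 10 ms on a single I/O lane (-T read is new in this change)
    zinject -d sda -D10:1 -T read tank

    # ...generate uncached reads against the pool...

    # Delay ereports accumulate; once the SERD engine fires, ZED degrades the vdev
    zpool events | grep ereport.fs.zfs.delay
    zpool status -s tank      # expect sda DEGRADED, "too many slow I/Os"

    # Remove the injection handler when done
    zinject -c all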