mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
Add slow disk diagnosis to ZED
Slow disk response times can be indicative of a failing drive. ZFS currently tracks slow I/Os (slower than zio_slow_io_ms) and generates events (ereport.fs.zfs.delay). However, no action is taken by ZED, like is done for checksum or I/O errors. This change adds slow disk diagnosis to ZED which is opt-in using new VDEV properties: VDEV_PROP_SLOW_IO_N VDEV_PROP_SLOW_IO_T If multiple VDEVs in a pool are undergoing slow I/Os, then it skips the zpool_vdev_degrade(). Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Allan Jude <allan@klarasystems.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Co-authored-by: Rob Wing <rob.wing@klarasystems.com> Signed-off-by: Don Brady <don.brady@klarasystems.com> Closes #15469
This commit is contained in:
+26
-31
@@ -22,6 +22,7 @@
|
||||
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2016, Intel Corporation.
|
||||
* Copyright (c) 2023, Klara Inc.
|
||||
*/
|
||||
|
||||
/*
|
||||
@@ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
|
||||
if (strcmp(name, "spare_on_remove") == 0)
|
||||
return (1);
|
||||
|
||||
if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
|
||||
return (10); /* N = 10 events */
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
int64_t
|
||||
fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
|
||||
{
|
||||
(void) hdl;
|
||||
|
||||
/*
|
||||
* These can be looked up in mp->modinfo->fmdi_props
|
||||
* For now we just hard code for phase 2. In the
|
||||
* future, there can be a ZED based override.
|
||||
*/
|
||||
if (strcmp(name, "remove_timeout") == 0)
|
||||
return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */
|
||||
|
||||
if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
|
||||
return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
@@ -535,6 +514,19 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
|
||||
return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL);
|
||||
}
|
||||
|
||||
int
|
||||
fmd_serd_active(fmd_hdl_t *hdl, const char *name)
|
||||
{
|
||||
fmd_module_t *mp = (fmd_module_t *)hdl;
|
||||
fmd_serd_eng_t *sgp;
|
||||
|
||||
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
|
||||
zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
|
||||
return (0);
|
||||
}
|
||||
return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp));
|
||||
}
|
||||
|
||||
void
|
||||
fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
|
||||
{
|
||||
@@ -543,12 +535,10 @@ fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
|
||||
|
||||
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
|
||||
zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
|
||||
return;
|
||||
} else {
|
||||
fmd_serd_eng_reset(sgp);
|
||||
fmd_hdl_debug(hdl, "serd_reset %s", name);
|
||||
}
|
||||
|
||||
fmd_serd_eng_reset(sgp);
|
||||
|
||||
fmd_hdl_debug(hdl, "serd_reset %s", name);
|
||||
}
|
||||
|
||||
int
|
||||
@@ -556,16 +546,21 @@ fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
|
||||
{
|
||||
fmd_module_t *mp = (fmd_module_t *)hdl;
|
||||
fmd_serd_eng_t *sgp;
|
||||
int err;
|
||||
|
||||
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
|
||||
zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
|
||||
name);
|
||||
return (0);
|
||||
}
|
||||
err = fmd_serd_eng_record(sgp, ep->ev_hrt);
|
||||
return (fmd_serd_eng_record(sgp, ep->ev_hrt));
|
||||
}
|
||||
|
||||
return (err);
|
||||
void
|
||||
fmd_serd_gc(fmd_hdl_t *hdl)
|
||||
{
|
||||
fmd_module_t *mp = (fmd_module_t *)hdl;
|
||||
|
||||
fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL);
|
||||
}
|
||||
|
||||
/* FMD Timers */
|
||||
@@ -579,7 +574,7 @@ _timer_notify(union sigval sv)
|
||||
const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
|
||||
struct itimerspec its;
|
||||
|
||||
fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);
|
||||
fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid);
|
||||
|
||||
/* disarm the timer */
|
||||
memset(&its, 0, sizeof (struct itimerspec));
|
||||
|
||||
@@ -151,7 +151,6 @@ extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
|
||||
extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);
|
||||
|
||||
extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
|
||||
extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);
|
||||
|
||||
#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */
|
||||
#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */
|
||||
@@ -195,10 +194,12 @@ extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);
|
||||
extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
|
||||
extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
|
||||
extern int fmd_serd_exists(fmd_hdl_t *, const char *);
|
||||
extern int fmd_serd_active(fmd_hdl_t *, const char *);
|
||||
extern void fmd_serd_reset(fmd_hdl_t *, const char *);
|
||||
extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
|
||||
extern int fmd_serd_fired(fmd_hdl_t *, const char *);
|
||||
extern int fmd_serd_empty(fmd_hdl_t *, const char *);
|
||||
extern void fmd_serd_gc(fmd_hdl_t *);
|
||||
|
||||
extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
|
||||
extern void fmd_timer_remove(fmd_hdl_t *, id_t);
|
||||
|
||||
@@ -310,8 +310,9 @@ fmd_serd_eng_reset(fmd_serd_eng_t *sgp)
|
||||
}
|
||||
|
||||
void
|
||||
fmd_serd_eng_gc(fmd_serd_eng_t *sgp)
|
||||
fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg)
|
||||
{
|
||||
(void) arg;
|
||||
fmd_serd_elem_t *sep, *nep;
|
||||
hrtime_t hrt;
|
||||
|
||||
|
||||
@@ -77,7 +77,7 @@ extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
|
||||
extern int fmd_serd_eng_empty(fmd_serd_eng_t *);
|
||||
|
||||
extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
|
||||
extern void fmd_serd_eng_gc(fmd_serd_eng_t *);
|
||||
extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
+117
-26
@@ -23,6 +23,7 @@
|
||||
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2016, Intel Corporation.
|
||||
* Copyright (c) 2023, Klara Inc.
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
@@ -47,11 +48,16 @@
|
||||
#define DEFAULT_CHECKSUM_T 600 /* seconds */
|
||||
#define DEFAULT_IO_N 10 /* events */
|
||||
#define DEFAULT_IO_T 600 /* seconds */
|
||||
#define DEFAULT_SLOW_IO_N 10 /* events */
|
||||
#define DEFAULT_SLOW_IO_T 30 /* seconds */
|
||||
|
||||
#define CASE_GC_TIMEOUT_SECS 43200 /* 12 hours */
|
||||
|
||||
/*
|
||||
* Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
|
||||
* #define reserves enough space for two 64-bit hex values plus the length of
|
||||
* the longest string.
|
||||
* Our serd engines are named in the following format:
|
||||
* 'zfs_<pool_guid>_<vdev_guid>_{checksum,io,slow_io}'
|
||||
* This #define reserves enough space for two 64-bit hex values plus the
|
||||
* length of the longest string.
|
||||
*/
|
||||
#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum"))
|
||||
|
||||
@@ -68,6 +74,7 @@ typedef struct zfs_case_data {
|
||||
int zc_pool_state;
|
||||
char zc_serd_checksum[MAX_SERDLEN];
|
||||
char zc_serd_io[MAX_SERDLEN];
|
||||
char zc_serd_slow_io[MAX_SERDLEN];
|
||||
int zc_has_remove_timer;
|
||||
} zfs_case_data_t;
|
||||
|
||||
@@ -114,7 +121,8 @@ zfs_de_stats_t zfs_stats = {
|
||||
{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
|
||||
};
|
||||
|
||||
static hrtime_t zfs_remove_timeout;
|
||||
/* wait 15 seconds after a removal */
|
||||
static hrtime_t zfs_remove_timeout = SEC2NSEC(15);
|
||||
|
||||
uu_list_pool_t *zfs_case_pool;
|
||||
uu_list_t *zfs_cases;
|
||||
@@ -124,6 +132,8 @@ uu_list_t *zfs_cases;
|
||||
#define ZFS_MAKE_EREPORT(type) \
|
||||
FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
|
||||
|
||||
static void zfs_purge_cases(fmd_hdl_t *hdl);
|
||||
|
||||
/*
|
||||
* Write out the persistent representation of an active case.
|
||||
*/
|
||||
@@ -170,6 +180,42 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
|
||||
return (zcp);
|
||||
}
|
||||
|
||||
/*
|
||||
* count other unique slow-io cases in a pool
|
||||
*/
|
||||
static uint_t
|
||||
zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
|
||||
{
|
||||
zfs_case_t *zcp;
|
||||
uint_t cases = 0;
|
||||
static hrtime_t next_check = 0;
|
||||
|
||||
/*
|
||||
* Note that plumbing in some external GC would require adding locking,
|
||||
* since most of this module code is not thread safe and assumes there
|
||||
* is only one thread running against the module. So we perform GC here
|
||||
* inline periodically so that future delay induced faults will be
|
||||
* possible once the issue causing multiple vdev delays is resolved.
|
||||
*/
|
||||
if (gethrestime_sec() > next_check) {
|
||||
/* Periodically purge old SERD entries and stale cases */
|
||||
fmd_serd_gc(hdl);
|
||||
zfs_purge_cases(hdl);
|
||||
next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS;
|
||||
}
|
||||
|
||||
for (zcp = uu_list_first(zfs_cases); zcp != NULL;
|
||||
zcp = uu_list_next(zfs_cases, zcp)) {
|
||||
if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid &&
|
||||
zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid &&
|
||||
zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
|
||||
fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) {
|
||||
cases++;
|
||||
}
|
||||
}
|
||||
return (cases);
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate over any active cases. If any cases are associated with a pool or
|
||||
* vdev which is no longer present on the system, close the associated case.
|
||||
@@ -376,6 +422,14 @@ zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
|
||||
(long long unsigned int)vdev_guid, type);
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp)
|
||||
{
|
||||
fmd_hdl_debug(hdl, "retiring case");
|
||||
|
||||
fmd_case_close(hdl, zcp->zc_case);
|
||||
}
|
||||
|
||||
/*
|
||||
* Solve a given ZFS case. This first checks to make sure the diagnosis is
|
||||
* still valid, as well as cleaning up any pending timer associated with the
|
||||
@@ -632,9 +686,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
||||
if (strcmp(class,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
|
||||
strcmp(class,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
|
||||
strcmp(class,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) {
|
||||
zfs_stats.resource_drops.fmds_value.ui64++;
|
||||
return;
|
||||
}
|
||||
@@ -702,6 +754,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
||||
if (zcp->zc_data.zc_serd_checksum[0] != '\0')
|
||||
fmd_serd_reset(hdl,
|
||||
zcp->zc_data.zc_serd_checksum);
|
||||
if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
|
||||
fmd_serd_reset(hdl,
|
||||
zcp->zc_data.zc_serd_slow_io);
|
||||
} else if (fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
|
||||
uint64_t state = 0;
|
||||
@@ -730,7 +785,11 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
||||
if (fmd_case_solved(hdl, zcp->zc_case))
|
||||
return;
|
||||
|
||||
fmd_hdl_debug(hdl, "error event '%s'", class);
|
||||
if (vdev_guid)
|
||||
fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class,
|
||||
vdev_guid);
|
||||
else
|
||||
fmd_hdl_debug(hdl, "error event '%s'", class);
|
||||
|
||||
/*
|
||||
* Determine if we should solve the case and generate a fault. We solve
|
||||
@@ -779,6 +838,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
||||
fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
|
||||
fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) ||
|
||||
fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
|
||||
const char *failmode = NULL;
|
||||
boolean_t checkremove = B_FALSE;
|
||||
@@ -814,6 +875,51 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
||||
}
|
||||
if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
|
||||
checkremove = B_TRUE;
|
||||
} else if (fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) {
|
||||
uint64_t slow_io_n, slow_io_t;
|
||||
|
||||
/*
|
||||
* Create a slow io SERD engine when the VDEV has the
|
||||
* 'vdev_slow_io_n' and 'vdev_slow_io_n' properties.
|
||||
*/
|
||||
if (zcp->zc_data.zc_serd_slow_io[0] == '\0' &&
|
||||
nvlist_lookup_uint64(nvl,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
|
||||
&slow_io_n) == 0 &&
|
||||
nvlist_lookup_uint64(nvl,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
|
||||
&slow_io_t) == 0) {
|
||||
zfs_serd_name(zcp->zc_data.zc_serd_slow_io,
|
||||
pool_guid, vdev_guid, "slow_io");
|
||||
fmd_serd_create(hdl,
|
||||
zcp->zc_data.zc_serd_slow_io,
|
||||
slow_io_n,
|
||||
SEC2NSEC(slow_io_t));
|
||||
zfs_case_serialize(zcp);
|
||||
}
|
||||
/* Pass event to SERD engine and see if this triggers */
|
||||
if (zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
|
||||
fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io,
|
||||
ep)) {
|
||||
/*
|
||||
* Ignore a slow io diagnosis when other
|
||||
* VDEVs in the pool show signs of being slow.
|
||||
*/
|
||||
if (zfs_other_slow_cases(hdl, &zcp->zc_data)) {
|
||||
zfs_case_retire(hdl, zcp);
|
||||
fmd_hdl_debug(hdl, "pool %llu has "
|
||||
"multiple slow io cases -- skip "
|
||||
"degrading vdev %llu",
|
||||
(u_longlong_t)
|
||||
zcp->zc_data.zc_pool_guid,
|
||||
(u_longlong_t)
|
||||
zcp->zc_data.zc_vdev_guid);
|
||||
} else {
|
||||
zfs_case_solve(hdl, zcp,
|
||||
"fault.fs.zfs.vdev.slow_io");
|
||||
}
|
||||
}
|
||||
} else if (fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
|
||||
/*
|
||||
@@ -924,6 +1030,8 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
|
||||
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
|
||||
if (zcp->zc_data.zc_serd_io[0] != '\0')
|
||||
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
|
||||
if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
|
||||
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io);
|
||||
if (zcp->zc_data.zc_has_remove_timer)
|
||||
fmd_timer_remove(hdl, zcp->zc_remove_timer);
|
||||
|
||||
@@ -932,30 +1040,15 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
|
||||
fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* We use the fmd gc entry point to look for old cases that no longer apply.
|
||||
* This allows us to keep our set of case data small in a long running system.
|
||||
*/
|
||||
static void
|
||||
zfs_fm_gc(fmd_hdl_t *hdl)
|
||||
{
|
||||
zfs_purge_cases(hdl);
|
||||
}
|
||||
|
||||
static const fmd_hdl_ops_t fmd_ops = {
|
||||
zfs_fm_recv, /* fmdo_recv */
|
||||
zfs_fm_timeout, /* fmdo_timeout */
|
||||
zfs_fm_close, /* fmdo_close */
|
||||
NULL, /* fmdo_stats */
|
||||
zfs_fm_gc, /* fmdo_gc */
|
||||
NULL, /* fmdo_gc */
|
||||
};
|
||||
|
||||
static const fmd_prop_t fmd_props[] = {
|
||||
{ "checksum_N", FMD_TYPE_UINT32, "10" },
|
||||
{ "checksum_T", FMD_TYPE_TIME, "10min" },
|
||||
{ "io_N", FMD_TYPE_UINT32, "10" },
|
||||
{ "io_T", FMD_TYPE_TIME, "10min" },
|
||||
{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
|
||||
{ NULL, 0, NULL }
|
||||
};
|
||||
|
||||
@@ -996,8 +1089,6 @@ _zfs_diagnosis_init(fmd_hdl_t *hdl)
|
||||
|
||||
(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
|
||||
sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
|
||||
|
||||
zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -523,6 +523,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
|
||||
} else if (fmd_nvl_class_match(hdl, fault,
|
||||
"fault.fs.zfs.vdev.checksum")) {
|
||||
degrade_device = B_TRUE;
|
||||
} else if (fmd_nvl_class_match(hdl, fault,
|
||||
"fault.fs.zfs.vdev.slow_io")) {
|
||||
degrade_device = B_TRUE;
|
||||
} else if (fmd_nvl_class_match(hdl, fault,
|
||||
"fault.fs.zfs.device")) {
|
||||
fault_device = B_FALSE;
|
||||
|
||||
Reference in New Issue
Block a user