mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-24 11:18:52 +03:00
Add slow disk diagnosis to ZED
Slow disk response times can be indicative of a failing drive. ZFS currently tracks slow I/Os (those slower than zio_slow_io_ms) and generates events (ereport.fs.zfs.delay) for them. However, ZED takes no action in response, unlike for checksum or I/O errors. This change adds slow disk diagnosis to ZED, which is opt-in via two new VDEV properties: VDEV_PROP_SLOW_IO_N and VDEV_PROP_SLOW_IO_T. If multiple VDEVs in a pool are undergoing slow I/Os, ZED skips the zpool_vdev_degrade() call. Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Allan Jude <allan@klarasystems.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Co-authored-by: Rob Wing <rob.wing@klarasystems.com> Signed-off-by: Don Brady <don.brady@klarasystems.com> Closes #15469
This commit is contained in:
+26
-31
@@ -22,6 +22,7 @@
|
||||
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2016, Intel Corporation.
|
||||
* Copyright (c) 2023, Klara Inc.
|
||||
*/
|
||||
|
||||
/*
|
||||
@@ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
|
||||
if (strcmp(name, "spare_on_remove") == 0)
|
||||
return (1);
|
||||
|
||||
if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
|
||||
return (10); /* N = 10 events */
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
 * Return the hard-coded 64-bit value of the module property 'name'.
 *
 * The hdl argument is ignored. "remove_timeout" yields 15 seconds, and
 * "io_T" / "checksum_T" yield 10 minutes, both expressed in nanoseconds;
 * any other name yields 0.
 */
int64_t
|
||||
fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
|
||||
{
|
||||
(void) hdl;
|
||||
|
||||
/*
|
||||
* These can be looked up in mp->modinfo->fmdi_props
|
||||
* For now we just hard code for phase 2. In the
|
||||
* future, there can be a ZED based override.
|
||||
*/
|
||||
if (strcmp(name, "remove_timeout") == 0)
|
||||
return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */
|
||||
|
||||
if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
|
||||
return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */
|
||||
|
||||
/* Unrecognized property names report 0. */
return (0);
|
||||
}
|
||||
|
||||
@@ -535,6 +514,19 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
|
||||
return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL);
|
||||
}
|
||||
|
||||
/*
 * Report whether the SERD engine 'name' registered on this module shows
 * activity: the result is non-zero when fmd_serd_eng_fired() is true for
 * the engine or fmd_serd_eng_empty() is false for it, and 0 otherwise.
 *
 * A name with no matching engine is logged at LOG_ERR and reported as 0.
 */
int
|
||||
fmd_serd_active(fmd_hdl_t *hdl, const char *name)
|
||||
{
|
||||
/* The handle is used as the module structure itself. */
fmd_module_t *mp = (fmd_module_t *)hdl;
|
||||
fmd_serd_eng_t *sgp;
|
||||
|
||||
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
|
||||
zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
|
||||
return (0);
|
||||
}
|
||||
/*
 * NOTE(review): presumably "not empty" means the engine still holds
 * recorded events that have not yet tripped it; confirm against
 * fmd_serd_eng_empty().
 */
return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp));
|
||||
}
|
||||
|
||||
void
|
||||
fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
|
||||
{
|
||||
@@ -543,12 +535,10 @@ fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
|
||||
|
||||
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
|
||||
zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
|
||||
return;
|
||||
} else {
|
||||
fmd_serd_eng_reset(sgp);
|
||||
fmd_hdl_debug(hdl, "serd_reset %s", name);
|
||||
}
|
||||
|
||||
fmd_serd_eng_reset(sgp);
|
||||
|
||||
fmd_hdl_debug(hdl, "serd_reset %s", name);
|
||||
}
|
||||
|
||||
int
|
||||
@@ -556,16 +546,21 @@ fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
|
||||
{
|
||||
fmd_module_t *mp = (fmd_module_t *)hdl;
|
||||
fmd_serd_eng_t *sgp;
|
||||
int err;
|
||||
|
||||
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
|
||||
zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
|
||||
name);
|
||||
return (0);
|
||||
}
|
||||
err = fmd_serd_eng_record(sgp, ep->ev_hrt);
|
||||
return (fmd_serd_eng_record(sgp, ep->ev_hrt));
|
||||
}
|
||||
|
||||
return (err);
|
||||
/*
 * Hand fmd_serd_eng_gc to fmd_serd_hash_apply() for this module's SERD
 * engine table (mod_serds), with a NULL callback argument.
 *
 * NOTE(review): presumably this visits every engine and discards events
 * that have aged out; confirm against fmd_serd_eng_gc().
 */
void
|
||||
fmd_serd_gc(fmd_hdl_t *hdl)
|
||||
{
|
||||
/* The handle is used as the module structure itself. */
fmd_module_t *mp = (fmd_module_t *)hdl;
|
||||
|
||||
fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL);
|
||||
}
|
||||
|
||||
/* FMD Timers */
|
||||
@@ -579,7 +574,7 @@ _timer_notify(union sigval sv)
|
||||
const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
|
||||
struct itimerspec its;
|
||||
|
||||
fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);
|
||||
fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid);
|
||||
|
||||
/* disarm the timer */
|
||||
memset(&its, 0, sizeof (struct itimerspec));
|
||||
|
||||
Reference in New Issue
Block a user