Configure zed's diagnosis engine with vdev properties

Introduce four new vdev properties:
    checksum_n
    checksum_t
    io_n
    io_t

These properties can be used for configuring the thresholds of zed's
diagnosis engine and are interpreted as N events in T seconds.

When one of these properties is set to a non-default value on a top-level
vdev, that threshold also applies to its leaf vdevs. This behavior can be
overridden by explicitly setting the property on the leaf vdev.

Note that these properties do not persist across vdev replacement. For
this reason, it is advisable to set the property on the top-level vdev
instead of the leaf vdev.

The default values for zed's diagnosis engine (10 events, 600 seconds)
remain unchanged.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Rob Wing <rob.wing@klarasystems.com>
Sponsored-by: Seagate Technology LLC
Closes #13805
This commit is contained in:
rob-wing
2023-01-23 12:14:25 -09:00
committed by GitHub
parent f091db9248
commit 69f024a56e
15 changed files with 618 additions and 24 deletions
+36 -4
View File
@@ -39,6 +39,15 @@
#include "zfs_agents.h"
#include "fmd_api.h"
/*
* Default values for the serd engine when processing checksum or io errors. The
* semantics are N events within T seconds.
*
* zfs_fm_recv() falls back to these values when the ereport payload does not
* carry the corresponding vdev threshold (the nvlist lookup fails). The T
* values are expressed in seconds and are converted with SEC2NSEC() before
* being handed to fmd_serd_create().
*/
#define DEFAULT_CHECKSUM_N 10 /* events */
#define DEFAULT_CHECKSUM_T 600 /* seconds */
#define DEFAULT_IO_N 10 /* events */
#define DEFAULT_IO_T 600 /* seconds */
/*
* Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
* #define reserves enough space for two 64-bit hex values plus the length of
@@ -448,6 +457,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
zfs_case_t *zcp, *dcp;
int32_t pool_state;
uint64_t ena, pool_guid, vdev_guid;
uint64_t checksum_n, checksum_t;
uint64_t io_n, io_t;
er_timeval_t pool_load;
er_timeval_t er_when;
nvlist_t *detector;
@@ -784,11 +795,21 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
if (zcp->zc_data.zc_serd_io[0] == '\0') {
if (nvlist_lookup_uint64(nvl,
FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
&io_n) != 0) {
io_n = DEFAULT_IO_N;
}
if (nvlist_lookup_uint64(nvl,
FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
&io_t) != 0) {
io_t = DEFAULT_IO_T;
}
zfs_serd_name(zcp->zc_data.zc_serd_io,
pool_guid, vdev_guid, "io");
fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
fmd_prop_get_int32(hdl, "io_N"),
fmd_prop_get_int64(hdl, "io_T"));
io_n,
SEC2NSEC(io_t));
zfs_case_serialize(zcp);
}
if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
@@ -813,12 +834,23 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
}
if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
if (nvlist_lookup_uint64(nvl,
FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
&checksum_n) != 0) {
checksum_n = DEFAULT_CHECKSUM_N;
}
if (nvlist_lookup_uint64(nvl,
FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
&checksum_t) != 0) {
checksum_t = DEFAULT_CHECKSUM_T;
}
zfs_serd_name(zcp->zc_data.zc_serd_checksum,
pool_guid, vdev_guid, "checksum");
fmd_serd_create(hdl,
zcp->zc_data.zc_serd_checksum,
fmd_prop_get_int32(hdl, "checksum_N"),
fmd_prop_get_int64(hdl, "checksum_T"));
checksum_n,
SEC2NSEC(checksum_t));
zfs_case_serialize(zcp);
}
if (fmd_serd_record(hdl,