Detect a slow raidz child during reads

A single slow responding disk can affect the overall read
performance of a raidz group.  When a raidz child disk is
determined to be a persistent slow outlier, then have it
sit out during reads for a period of time. The raidz group
can use parity to reconstruct the data that was skipped.

Each time a slow disk is placed into a sit out period, its
`vdev_stat.vs_slow_ios count` is incremented and a zevent
class `ereport.fs.zfs.delay` is posted.

The length of the sit out period can be changed using the
`raid_read_sit_out_secs` module parameter.  Setting it to
zero disables slow outlier detection.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Contributions-by: Don Brady <don.brady@klarasystems.com>
Contributions-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #17227
This commit is contained in:
Paul Dagnelie
2025-08-27 16:41:48 -07:00
committed by Brian Behlendorf
parent 0df85ec27c
commit df55ba7c49
28 changed files with 1399 additions and 13 deletions
+1
View File
@@ -58,6 +58,7 @@ extern "C" {
#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure"
#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay"
#define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write"
#define FM_EREPORT_ZFS_SITOUT "sitout"
#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode"
+3
View File
@@ -385,6 +385,8 @@ typedef enum {
VDEV_PROP_TRIM_SUPPORT,
VDEV_PROP_TRIM_ERRORS,
VDEV_PROP_SLOW_IOS,
VDEV_PROP_SIT_OUT,
VDEV_PROP_AUTOSIT,
VDEV_NUM_PROPS
} vdev_prop_t;
@@ -1673,6 +1675,7 @@ typedef enum {
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
ZFS_ERR_ASHIFT_MISMATCH,
ZFS_ERR_STREAM_LARGE_MICROZAP,
ZFS_ERR_TOO_MANY_SITOUTS,
} zfs_errno_t;
/*
+6
View File
@@ -279,10 +279,12 @@ struct vdev {
uint64_t vdev_noalloc; /* device is passivated? */
uint64_t vdev_removing; /* device is being removed? */
uint64_t vdev_failfast; /* device failfast setting */
boolean_t vdev_autosit; /* automatic sitout management */
boolean_t vdev_rz_expanding; /* raidz is being expanded? */
boolean_t vdev_ishole; /* is a hole in the namespace */
uint64_t vdev_top_zap;
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
uint64_t vdev_last_latency_check;
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
@@ -431,6 +433,10 @@ struct vdev {
hrtime_t vdev_mmp_pending; /* 0 if write finished */
uint64_t vdev_mmp_kstat_id; /* to find kstat entry */
uint64_t vdev_expansion_time; /* vdev's last expansion time */
/* used to calculate average read latency */
uint64_t *vdev_prev_histo;
int64_t vdev_outlier_count; /* read outlier amongst peers */
hrtime_t vdev_read_sit_out_expire; /* end of sit out period */
list_node_t vdev_leaf_node; /* leaf vdev list */
/*
+3
View File
@@ -61,6 +61,9 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *);
struct raidz_row *vdev_raidz_row_alloc(int, zio_t *);
void vdev_raidz_reflow_copy_scratch(spa_t *);
void raidz_dtl_reassessed(vdev_t *);
boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t);
void vdev_raidz_sit_child(vdev_t *, uint64_t);
void vdev_raidz_unsit_child(vdev_t *);
extern const zio_vsd_ops_t vdev_raidz_vsd_ops;
+2
View File
@@ -119,6 +119,7 @@ typedef struct raidz_col {
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
uint8_t rc_force_repair:1; /* Write good data to this column */
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
int rc_shadow_devidx; /* for double write during expansion */
int rc_shadow_error; /* for double write during expansion */
uint64_t rc_shadow_offset; /* for double write during expansion */
@@ -133,6 +134,7 @@ typedef struct raidz_row {
int rr_firstdatacol; /* First data column/parity count */
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
int rr_nempty; /* empty sectors included in parity */
int rr_outlier_cnt; /* Count of latency outlier devices */
#ifdef ZFS_DEBUG
uint64_t rr_offset; /* Logical offset for *_io_verify() */
uint64_t rr_size; /* Physical size for *_io_verify() */