Detect a slow raidz child during reads

A single slow responding disk can affect the overall read
performance of a raidz group.  When a raidz child disk is
determined to be a persistent slow outlier, then have it
sit out during reads for a period of time. The raidz group
can use parity to reconstruct the data that was skipped.

Each time a slow disk is placed into a sit out period, its
`vdev_stat.vs_slow_ios` count is incremented and a zevent
class `ereport.fs.zfs.sitout` is posted.

The length of the sit out period can be changed using the
`vdev_read_sit_out_secs` module parameter.  Setting it to
zero disables slow outlier detection.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Contributions-by: Don Brady <don.brady@klarasystems.com>
Contributions-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #17227
Paul Dagnelie 2025-08-27 16:41:48 -07:00 committed by Brian Behlendorf
parent 0df85ec27c
commit df55ba7c49
28 changed files with 1399 additions and 13 deletions
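For illustration, an administrator might exercise the feature roughly as follows (the pool name tank, top-level vdev raidz2-0, and leaf disk sdf are placeholders, and the module parameter path assumes the standard Linux sysfs location):

# Enable automatic slow-disk detection on a raidz2 top-level vdev.
zpool set autosit=on tank raidz2-0

# Check whether a leaf disk is currently sitting out of reads.
zpool get sit_out tank sdf

# Manually place a leaf disk into (or take it out of) a sit out period.
zpool set sit_out=on tank sdf
zpool set sit_out=off tank sdf

# Watch for sit-out events posted by the autosit logic.
zpool events -v | grep -A 2 sitout

# Shorten the sit out window to 120 seconds (0 disables sit-outs).
echo 120 > /sys/module/zfs/parameters/vdev_read_sit_out_secs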

View File

@ -62,6 +62,17 @@ typedef longlong_t hrtime_t;
#define SEC_TO_TICK(sec) ((sec) * hz)
#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz))
static __inline hrtime_t
getlrtime(void)
{
struct timespec ts;
hrtime_t nsec;
getnanouptime(&ts);
nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec;
return (nsec);
}
static __inline hrtime_t
gethrtime(void)
{

View File

@ -79,6 +79,14 @@ gethrestime_sec(void)
return (ts.tv_sec);
}
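/*
 * Return a low-resolution (tick-granularity) timestamp in nanoseconds.
 * Backed by the kernel's coarse clock, so it is cheaper than gethrtime()
 * when only coarse precision is needed.
 */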
static inline hrtime_t
getlrtime(void)
{
inode_timespec_t ts;
ktime_get_coarse_ts64(&ts);
return (((hrtime_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec);
}
static inline hrtime_t
gethrtime(void)
{

View File

@ -58,6 +58,7 @@ extern "C" {
#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure"
#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay"
#define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write"
#define FM_EREPORT_ZFS_SITOUT "sitout"
#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode"

View File

@ -385,6 +385,8 @@ typedef enum {
VDEV_PROP_TRIM_SUPPORT,
VDEV_PROP_TRIM_ERRORS,
VDEV_PROP_SLOW_IOS,
VDEV_PROP_SIT_OUT,
VDEV_PROP_AUTOSIT,
VDEV_NUM_PROPS
} vdev_prop_t;
@ -1673,6 +1675,7 @@ typedef enum {
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
ZFS_ERR_ASHIFT_MISMATCH,
ZFS_ERR_STREAM_LARGE_MICROZAP,
ZFS_ERR_TOO_MANY_SITOUTS,
} zfs_errno_t;
/*

View File

@ -279,10 +279,12 @@ struct vdev {
uint64_t vdev_noalloc; /* device is passivated? */
uint64_t vdev_removing; /* device is being removed? */
uint64_t vdev_failfast; /* device failfast setting */
boolean_t vdev_autosit; /* automatic sitout management */
boolean_t vdev_rz_expanding; /* raidz is being expanded? */
boolean_t vdev_ishole; /* is a hole in the namespace */
uint64_t vdev_top_zap;
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
uint64_t vdev_last_latency_check;
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
@ -431,6 +433,10 @@ struct vdev {
hrtime_t vdev_mmp_pending; /* 0 if write finished */
uint64_t vdev_mmp_kstat_id; /* to find kstat entry */
uint64_t vdev_expansion_time; /* vdev's last expansion time */
/* used to calculate average read latency */
uint64_t *vdev_prev_histo;
int64_t vdev_outlier_count; /* read outlier amongst peers */
hrtime_t vdev_read_sit_out_expire; /* end of sit out period */
list_node_t vdev_leaf_node; /* leaf vdev list */
/*

View File

@ -61,6 +61,9 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *);
struct raidz_row *vdev_raidz_row_alloc(int, zio_t *);
void vdev_raidz_reflow_copy_scratch(spa_t *);
void raidz_dtl_reassessed(vdev_t *);
boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t);
void vdev_raidz_sit_child(vdev_t *, uint64_t);
void vdev_raidz_unsit_child(vdev_t *);
extern const zio_vsd_ops_t vdev_raidz_vsd_ops;

View File

@ -119,6 +119,7 @@ typedef struct raidz_col {
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
uint8_t rc_force_repair:1; /* Write good data to this column */
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
int rc_shadow_devidx; /* for double write during expansion */
int rc_shadow_error; /* for double write during expansion */
uint64_t rc_shadow_offset; /* for double write during expansion */
@ -133,6 +134,7 @@ typedef struct raidz_row {
int rr_firstdatacol; /* First data column/parity count */
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
int rr_nempty; /* empty sectors included in parity */
int rr_outlier_cnt; /* Count of latency outlier devices */
#ifdef ZFS_DEBUG
uint64_t rr_offset; /* Logical offset for *_io_verify() */
uint64_t rr_size; /* Physical size for *_io_verify() */

View File

@ -97,6 +97,15 @@ gethrestime_sec(void)
return (tv.tv_sec);
}
static inline hrtime_t
getlrtime(void)
{
struct timeval tv;
(void) gettimeofday(&tv, NULL);
return ((((uint64_t)tv.tv_sec) * NANOSEC) +
((uint64_t)tv.tv_usec * NSEC_PER_USEC));
}
static inline hrtime_t
gethrtime(void)
{

View File

@ -6117,7 +6117,9 @@
<enumerator name='VDEV_PROP_TRIM_SUPPORT' value='49'/>
<enumerator name='VDEV_PROP_TRIM_ERRORS' value='50'/>
<enumerator name='VDEV_PROP_SLOW_IOS' value='51'/>
<enumerator name='VDEV_NUM_PROPS' value='52'/>
<enumerator name='VDEV_PROP_SIT_OUT' value='52'/>
<enumerator name='VDEV_PROP_AUTOSIT' value='53'/>
<enumerator name='VDEV_NUM_PROPS' value='54'/>
</enum-decl>
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>

View File

@ -5549,6 +5549,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
/* Only use if provided by the RAIDZ VDEV above */
if (prop == VDEV_PROP_RAIDZ_EXPANDING)
return (ENOENT);
if (prop == VDEV_PROP_SIT_OUT)
return (ENOENT);
}
if (vdev_prop_index_to_string(prop, intval,
(const char **)&strval) != 0)
@ -5718,8 +5720,16 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname,
nvlist_free(nvl);
nvlist_free(outnvl);
if (ret)
(void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf);
if (ret) {
if (errno == ENOTSUP) {
zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
"property not supported for this vdev"));
(void) zfs_error(zhp->zpool_hdl, EZFS_PROPTYPE, errbuf);
} else {
(void) zpool_standard_error(zhp->zpool_hdl, errno,
errbuf);
}
}
return (ret);
}

View File

@ -776,6 +776,11 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case ZFS_ERR_ASHIFT_MISMATCH:
zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap);
break;
case ZFS_ERR_TOO_MANY_SITOUTS:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "too many disks "
"already sitting out"));
zfs_verror(hdl, EZFS_BUSY, fmt, ap);
break;
default:
zfs_error_aux(hdl, "%s", zfs_strerror(error));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);

View File

@ -4,6 +4,7 @@
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
.\" Copyright (c) 2019 Datto Inc.
.\" Copyright (c) 2023, 2024, 2025, Klara, Inc.
.\"
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at
@ -601,6 +602,42 @@ new format when enabling the
feature.
The default is to convert all log entries.
.
.It Sy vdev_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
When a slow disk outlier is detected it is placed in a sit out state.
While sitting out, the disk will not participate in normal reads; instead, its
data will be reconstructed as needed from parity.
Scrub operations will always read from a disk, even if it is sitting out.
Multiple disks in a RAID-Z or dRAID vdev may sit out at the same time, up to
the number of parity devices.
Writes will still be issued to a disk which is sitting out to maintain full
redundancy.
Defaults to 600 seconds; a value of zero disables disk sit-outs entirely,
including slow disk outlier detection.
.
.It Sy vdev_raidz_outlier_check_interval_ms Ns = Ns Sy 1000 Ns ms Po 1 sec Pc Pq ulong
How often each RAID-Z and dRAID vdev will check for slow disk outliers.
Increasing this interval will reduce the sensitivity of detection (since all
I/Os since the last check are included in the statistics), but will slow the
response to a disk developing a problem.
Defaults to once per second; setting extremely small values may cause negative
performance effects.
.
.It Sy vdev_raidz_outlier_insensitivity Ns = Ns Sy 50 Pq uint
When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is
used to determine how far out an outlier must be before it counts as an event
worth considering.
This is phrased as "insensitivity" because larger values result in fewer
detections.
Smaller values will result in more aggressive sitting out of disks that may have
problems, but may significantly increase the rate of spurious sit-outs.
.Pp
To provide a more technical definition of this parameter, this is the multiple
of the inter-quartile range (IQR) that is being used in a Tukey's Fence
detection algorithm.
This is much higher than a normal Tukey's Fence k-value, because the
distribution under consideration is probably an extreme-value distribution,
rather than a more typical Gaussian distribution.
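.Pp
For example, if nine of ten children in a vdev average roughly 2 ms per read
while one child averages 200 ms, then Q1 and Q3 are both about 2 ms, the
implementation's minimum IQR of one quarter of Q1 gives 0.5 ms, and the fence
is 2 ms + 50 x 0.5 ms = 27 ms, so the 200 ms child is counted as an outlier.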
.
.It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint
During top-level vdev removal, chunks of data are copied from the vdev
which may include free space in order to trade bandwidth for IOPS.

View File

@ -19,7 +19,7 @@
.\"
.\" CDDL HEADER END
.\"
.\" Copyright (c) 2021 Klara, Inc.
.\" Copyright (c) 2021, 2025, Klara, Inc.
.\"
.Dd July 23, 2024
.Dt VDEVPROPS 7
@ -106,11 +106,17 @@ The number of children belonging to this vdev
.It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors
The number of errors of each type encountered by this vdev
.It Sy slow_ios
The number of slow I/Os encountered by this vdev,
These represent I/O operations that didn't complete in
This indicates the number of slow I/O operations encountered by this vdev.
A slow I/O is defined as an operation that did not complete within the
.Sy zio_slow_io_ms
milliseconds
threshold in milliseconds
.Pq Sy 30000 No by default .
For
.Sy RAIDZ
and
.Sy DRAID
configurations, this value also represents the number of times the vdev was
identified as an outlier and excluded from participating in read I/O operations.
.It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops
The number of I/O operations of each type performed by this vdev
.It Xo
@ -150,6 +156,31 @@ The amount of space to reserve for the EFI system partition
.It Sy failfast
If this device should propagate BIO errors back to ZFS, used to disable
failfast.
.It Sy sit_out
Only valid for
.Sy RAIDZ
and
.Sy DRAID
vdevs.
True when a slow disk outlier was detected and the vdev is currently in a sit
out state.
This property can be manually set to cause vdevs to sit out.
It will also be automatically set by the
.Sy autosit
logic if that is enabled.
While sitting out, the vdev will not participate in normal reads; instead, its
data will be reconstructed as needed from parity.
.It Sy autosit
Only valid for
.Sy RAIDZ
and
.Sy DRAID
vdevs.
If set, this enables the kernel-level slow disk detection logic.
This logic automatically causes any vdevs that are significant negative
performance outliers to sit out, as described in the
.Sy sit_out
property.
.It Sy path
The path to the device for this vdev
.It Sy allocating

View File

@ -190,6 +190,16 @@ Issued when a scrub is resumed on a pool.
.It Sy scrub.paused
Issued when a scrub is paused on a pool.
.It Sy bootfs.vdev.attach
.It Sy sitout
Issued when a
.Sy RAIDZ
or
.Sy DRAID
vdev triggers the
.Sy autosit
logic.
This logic detects when a disk in such a vdev is significantly slower than its
peers, and sits it out temporarily to preserve the performance of the pool.
.El
.
.Sh PAYLOADS

View File

@ -467,9 +467,15 @@ vdev_prop_init(void)
zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING",
boolean_table, sfeatures);
zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0,
PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table,
sfeatures);
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
boolean_table, sfeatures);
zprop_register_index(VDEV_PROP_AUTOSIT, "autosit", 0,
PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "AUTOSIT", boolean_table,
sfeatures);
/* default index properties */
zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE,

View File

@ -29,7 +29,7 @@
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Datto Inc. All rights reserved.
* Copyright (c) 2021, Klara Inc.
* Copyright (c) 2021, 2025, Klara, Inc.
* Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
*/
@ -1086,6 +1086,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
}
}
if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops))
vd->vdev_autosit =
vdev_prop_default_numeric(VDEV_PROP_AUTOSIT);
/*
* Add ourselves to the parent's list of children.
*/
@ -1187,6 +1191,9 @@ vdev_free(vdev_t *vd)
spa_spare_remove(vd);
if (vd->vdev_isl2cache)
spa_l2cache_remove(vd);
if (vd->vdev_prev_histo)
kmem_free(vd->vdev_prev_histo,
sizeof (uint64_t) * VDEV_L_HISTO_BUCKETS);
txg_list_destroy(&vd->vdev_ms_list);
txg_list_destroy(&vd->vdev_dtl_list);
@ -3857,6 +3864,26 @@ vdev_load(vdev_t *vd)
}
}
if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
spa_t *spa = vd->vdev_spa;
uint64_t autosit;
error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
vdev_prop_to_name(VDEV_PROP_AUTOSIT), sizeof (autosit),
1, &autosit);
if (error == 0) {
vd->vdev_autosit = autosit == 1;
} else if (error == ENOENT) {
vd->vdev_autosit = vdev_prop_default_numeric(
VDEV_PROP_AUTOSIT);
} else {
vdev_dbgmsg(vd,
"vdev_load: zap_lookup(top_zap=%llu) "
"failed [error=%d]",
(u_longlong_t)vd->vdev_top_zap, error);
}
}
/*
* Load any rebuild state from the top-level vdev zap.
*/
@ -4616,6 +4643,8 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_checksum_errors = 0;
vd->vdev_stat.vs_dio_verify_errors = 0;
vd->vdev_stat.vs_slow_ios = 0;
atomic_store_64(&vd->vdev_outlier_count, 0);
vd->vdev_read_sit_out_expire = 0;
for (int c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
@ -6107,6 +6136,56 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
}
vd->vdev_failfast = intval & 1;
break;
case VDEV_PROP_SIT_OUT:
/* Only expose this for a draid or raidz leaf */
if (!vd->vdev_ops->vdev_op_leaf ||
vd->vdev_top == NULL ||
(vd->vdev_top->vdev_ops != &vdev_raidz_ops &&
vd->vdev_top->vdev_ops != &vdev_draid_ops)) {
error = ENOTSUP;
break;
}
if (nvpair_value_uint64(elem, &intval) != 0) {
error = EINVAL;
break;
}
if (intval == 1) {
vdev_t *ancestor = vd;
while (ancestor->vdev_parent != vd->vdev_top)
ancestor = ancestor->vdev_parent;
vdev_t *pvd = vd->vdev_top;
uint_t sitouts = 0;
for (int i = 0; i < pvd->vdev_children; i++) {
if (pvd->vdev_child[i] == ancestor)
continue;
if (vdev_sit_out_reads(
pvd->vdev_child[i], 0)) {
sitouts++;
}
}
if (sitouts >= vdev_get_nparity(pvd)) {
error = ZFS_ERR_TOO_MANY_SITOUTS;
break;
}
if (error == 0)
vdev_raidz_sit_child(vd,
INT64_MAX - gethrestime_sec());
} else {
vdev_raidz_unsit_child(vd);
}
break;
case VDEV_PROP_AUTOSIT:
if (vd->vdev_ops != &vdev_raidz_ops &&
vd->vdev_ops != &vdev_draid_ops) {
error = ENOTSUP;
break;
}
if (nvpair_value_uint64(elem, &intval) != 0) {
error = EINVAL;
break;
}
vd->vdev_autosit = intval == 1;
break;
case VDEV_PROP_CHECKSUM_N:
if (nvpair_value_uint64(elem, &intval) != 0) {
error = EINVAL;
@ -6456,6 +6535,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
ZPROP_SRC_NONE);
}
continue;
case VDEV_PROP_SIT_OUT:
/* Only expose this for a draid or raidz leaf */
if (vd->vdev_ops->vdev_op_leaf &&
vd->vdev_top != NULL &&
(vd->vdev_top->vdev_ops ==
&vdev_raidz_ops ||
vd->vdev_top->vdev_ops ==
&vdev_draid_ops)) {
vdev_prop_add_list(outnvl, propname,
NULL, vdev_sit_out_reads(vd, 0),
ZPROP_SRC_NONE);
}
continue;
case VDEV_PROP_TRIM_SUPPORT:
/* only valid for leaf vdevs */
if (vd->vdev_ops->vdev_op_leaf) {
@ -6506,6 +6598,29 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
vdev_prop_add_list(outnvl, propname, strval,
intval, src);
break;
case VDEV_PROP_AUTOSIT:
/* Only raidz and draid vdevs have this property */
if (vd->vdev_ops != &vdev_raidz_ops &&
vd->vdev_ops != &vdev_draid_ops) {
src = ZPROP_SRC_NONE;
intval = ZPROP_BOOLEAN_NA;
} else {
err = vdev_prop_get_int(vd, prop,
&intval);
if (err && err != ENOENT)
break;
if (intval ==
vdev_prop_default_numeric(prop))
src = ZPROP_SRC_DEFAULT;
else
src = ZPROP_SRC_LOCAL;
}
vdev_prop_add_list(outnvl, propname, NULL,
intval, src);
break;
case VDEV_PROP_CHECKSUM_N:
case VDEV_PROP_CHECKSUM_T:
case VDEV_PROP_IO_N:

View File

@ -22,6 +22,7 @@
/*
* Copyright (c) 2018 Intel Corporation.
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
* Copyright (c) 2025, Klara, Inc.
*/
#include <sys/zfs_context.h>
@ -1996,6 +1997,33 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
rc->rc_allow_repair = 1;
}
}
if (vdev_sit_out_reads(cvd, zio->io_flags)) {
rr->rr_outlier_cnt++;
ASSERT0(rc->rc_latency_outlier);
rc->rc_latency_outlier = 1;
}
}
/*
* When the row contains a latency outlier and sufficient parity
* exists to reconstruct the column data, then skip reading the
* known slow child vdev as a performance optimization.
*/
if (rr->rr_outlier_cnt > 0 &&
(rr->rr_firstdatacol - rr->rr_missingparity) >=
(rr->rr_missingdata + 1)) {
for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
raidz_col_t *rc = &rr->rr_col[c];
if (rc->rc_error == 0 && rc->rc_latency_outlier) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
break;
}
}
}
/*

View File

@ -24,6 +24,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2016 Gvozden Nešković. All rights reserved.
* Copyright (c) 2025, Klara, Inc.
*/
#include <sys/zfs_context.h>
@ -355,6 +356,32 @@ unsigned long raidz_expand_max_reflow_bytes = 0;
*/
uint_t raidz_expand_pause_point = 0;
/*
* The duration, in seconds, of a slow drive's read sit out period.
*/
static unsigned long vdev_read_sit_out_secs = 600;
/*
* How often each RAID-Z and dRAID vdev will check for slow disk outliers.
* Increasing this interval will reduce the sensitivity of detection (since all
* I/Os since the last check are included in the statistics), but will slow the
* response to a disk developing a problem.
*
* Defaults to once per second; setting extremely small values may cause
* negative performance effects.
*/
static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000;
/*
* When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is
* used to determine how far out an outlier must be before it counts as an event
* worth considering.
*
* Smaller values will result in more aggressive sitting out of disks that may
* have problems, but may significantly increase the rate of spurious sit-outs.
*/
static uint32_t vdev_raidz_outlier_insensitivity = 50;
/*
* Maximum amount of copy io's outstanding at once.
*/
@ -2311,6 +2338,41 @@ vdev_raidz_min_asize(vdev_t *vd)
vd->vdev_children);
}
/*
* Return B_TRUE if a read should be skipped due to being too slow.
*
* vdev_child_slow_outlier() looks for outliers based on disk
* latency from the most recent child reads. Here we're checking if,
* over time, a disk has been an outlier too many times and is
* now in a sit out period.
*/
boolean_t
vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags)
{
if (vdev_read_sit_out_secs == 0)
return (B_FALSE);
/* Avoid skipping a data column read when scrubbing */
if (io_flags & ZIO_FLAG_SCRUB)
return (B_FALSE);
if (!vd->vdev_ops->vdev_op_leaf) {
boolean_t sitting = B_FALSE;
for (int c = 0; c < vd->vdev_children; c++) {
sitting |= vdev_sit_out_reads(vd->vdev_child[c],
io_flags);
}
return (sitting);
}
if (vd->vdev_read_sit_out_expire >= gethrestime_sec())
return (B_TRUE);
vd->vdev_read_sit_out_expire = 0;
return (B_FALSE);
}
void
vdev_raidz_child_done(zio_t *zio)
{
@ -2475,6 +2537,45 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
rc->rc_skipped = 1;
continue;
}
if (vdev_sit_out_reads(cvd, zio->io_flags)) {
rr->rr_outlier_cnt++;
ASSERT0(rc->rc_latency_outlier);
rc->rc_latency_outlier = 1;
}
}
/*
* When the row contains a latency outlier and sufficient parity
* exists to reconstruct the column data, then skip reading the
* known slow child vdev as a performance optimization.
*/
if (rr->rr_outlier_cnt > 0 &&
(rr->rr_firstdatacol - rr->rr_missingparity) >=
(rr->rr_missingdata + 1)) {
for (int c = rr->rr_cols - 1; c >= 0; c--) {
raidz_col_t *rc = &rr->rr_col[c];
if (rc->rc_error == 0 && rc->rc_latency_outlier) {
if (c >= rr->rr_firstdatacol)
rr->rr_missingdata++;
else
rr->rr_missingparity++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
break;
}
}
}
for (int c = rr->rr_cols - 1; c >= 0; c--) {
raidz_col_t *rc = &rr->rr_col[c];
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
if (rc->rc_error || rc->rc_size == 0)
continue;
if (forceparity ||
c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
@ -2498,6 +2599,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
ASSERT3U(prc->rc_devidx, ==, i);
vdev_t *cvd = vd->vdev_child[i];
if (!vdev_readable(cvd)) {
prc->rc_error = SET_ERROR(ENXIO);
prc->rc_tried = 1; /* don't even try */
@ -2774,6 +2876,239 @@ vdev_raidz_worst_error(raidz_row_t *rr)
return (error);
}
/*
* Find the median value from a set of n values
*/
static uint64_t
latency_median_value(const uint64_t *data, size_t n)
{
uint64_t m;
if (n % 2 == 0)
m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1;
else
m = data[((n + 1) >> 1) - 1];
return (m);
}
/*
* Calculate the outlier fence from a set of n latency values
*
* fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1)
*/
static uint64_t
latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr)
{
uint64_t q1 = latency_median_value(&data[0], n >> 1);
uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1);
/*
* To avoid detecting false positive outliers when N is small and
* the latency values are very close, make sure the IQR
* is at least 25% of Q1.
*/
*iqr = MAX(q3 - q1, q1 / 4);
return (q3 + (*iqr * vdev_raidz_outlier_insensitivity));
}
#define LAT_CHILDREN_MIN 5
#define LAT_OUTLIER_LIMIT 20
static int
latency_compare(const void *arg1, const void *arg2)
{
const uint64_t *l1 = (uint64_t *)arg1;
const uint64_t *l2 = (uint64_t *)arg2;
return (TREE_CMP(*l1, *l2));
}
void
vdev_raidz_sit_child(vdev_t *svd, uint64_t secs)
{
for (int c = 0; c < svd->vdev_children; c++)
vdev_raidz_sit_child(svd->vdev_child[c], secs);
if (!svd->vdev_ops->vdev_op_leaf)
return;
/* Begin a sit out period for this slow drive */
svd->vdev_read_sit_out_expire = gethrestime_sec() + secs;
/* Count each slow io period */
mutex_enter(&svd->vdev_stat_lock);
svd->vdev_stat.vs_slow_ios++;
mutex_exit(&svd->vdev_stat_lock);
}
void
vdev_raidz_unsit_child(vdev_t *vd)
{
for (int c = 0; c < vd->vdev_children; c++)
vdev_raidz_unsit_child(vd->vdev_child[c]);
if (!vd->vdev_ops->vdev_op_leaf)
return;
vd->vdev_read_sit_out_expire = 0;
}
/*
* Check for any latency outlier from latest set of child reads.
*
* Uses a Tukey's fence, with K = 50, for detecting extreme outliers. This
* rule defines extreme outliers as data points outside the fence of the
* third quartile plus fifty times the Interquartile Range (IQR). This range
* is the distance between the first and third quartile.
*
* Fifty is an extremely large value for Tukey's fence, but the outliers we're
* attempting to detect here are orders of magnitude larger than the
* median. This large value should capture any truly faulty disk quickly,
* without causing spurious sit-outs.
*
* To further avoid spurious sit-outs, vdevs must be detected multiple times
* as an outlier before they are sat, and outlier counts will gradually decay.
* Every nchildren times we have detected an outlier, we subtract 2 from the
* outlier count of all children. If detected outliers are close to uniformly
* distributed, this will result in the outlier count remaining close to 0
* (in expectation; over long enough time-scales, spurious sit-outs are still
* possible).
*/
static void
vdev_child_slow_outlier(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 ||
vd->vdev_children < LAT_CHILDREN_MIN)
return;
hrtime_t now = getlrtime();
uint64_t last = atomic_load_64(&vd->vdev_last_latency_check);
if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms))
return;
/* Allow a single winner when there are racing callers. */
if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last)
return;
int children = vd->vdev_children;
uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP);
for (int c = 0; c < children; c++) {
vdev_t *cvd = vd->vdev_child[c];
if (cvd->vdev_prev_histo == NULL) {
mutex_enter(&cvd->vdev_stat_lock);
size_t size =
sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP);
memcpy(cvd->vdev_prev_histo,
cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ],
size);
mutex_exit(&cvd->vdev_stat_lock);
}
}
uint64_t max = 0;
vdev_t *svd = NULL;
uint_t sitouts = 0;
boolean_t skip = B_FALSE, svd_sitting = B_FALSE;
for (int c = 0; c < children; c++) {
vdev_t *cvd = vd->vdev_child[c];
boolean_t sitting = vdev_sit_out_reads(cvd, 0) ||
cvd->vdev_state != VDEV_STATE_HEALTHY;
/* We can't sit out more disks than we have parity */
if (sitting && ++sitouts >= vdev_get_nparity(vd))
skip = B_TRUE;
mutex_enter(&cvd->vdev_stat_lock);
uint64_t *prev_histo = cvd->vdev_prev_histo;
uint64_t *histo =
cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ];
if (skip) {
size_t size =
sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
memcpy(prev_histo, histo, size);
mutex_exit(&cvd->vdev_stat_lock);
continue;
}
uint64_t count = 0;
lat_data[c] = 0;
for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) {
uint64_t this_count = histo[i] - prev_histo[i];
lat_data[c] += (1ULL << i) * this_count;
count += this_count;
}
size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
memcpy(prev_histo, histo, size);
mutex_exit(&cvd->vdev_stat_lock);
lat_data[c] /= MAX(1, count);
/* Wait until all disks have been read from */
if (lat_data[c] == 0 && !sitting) {
skip = B_TRUE;
continue;
}
/* Keep track of the vdev with the largest value */
if (lat_data[c] > max) {
max = lat_data[c];
svd = cvd;
svd_sitting = sitting;
}
}
if (skip) {
kmem_free(lat_data, sizeof (uint64_t) * children);
return;
}
qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare);
uint64_t iqr;
uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr);
ASSERT3U(lat_data[children - 1], ==, max);
if (max > fence && !svd_sitting) {
ASSERT3U(iqr, >, 0);
uint64_t incr = MAX(1, MIN((max - fence) / iqr,
LAT_OUTLIER_LIMIT / 4));
vd->vdev_outlier_count += incr;
if (vd->vdev_outlier_count >= children) {
for (int c = 0; c < children; c++) {
vdev_t *cvd = vd->vdev_child[c];
cvd->vdev_outlier_count -= 2;
cvd->vdev_outlier_count = MAX(0,
cvd->vdev_outlier_count);
}
vd->vdev_outlier_count = 0;
}
/*
* Keep track of how many times this child has had
* an outlier read. A disk that persistently has a
* higher outlier count than its peers will be considered
* a slow disk.
*/
svd->vdev_outlier_count += incr;
if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) {
ASSERT0(svd->vdev_read_sit_out_expire);
vdev_raidz_sit_child(svd, vdev_read_sit_out_secs);
(void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT,
zio->io_spa, svd, NULL, NULL, 0);
vdev_dbgmsg(svd, "begin read sit out for %d secs",
(int)vdev_read_sit_out_secs);
for (int c = 0; c < vd->vdev_children; c++)
vd->vdev_child[c]->vdev_outlier_count = 0;
}
}
kmem_free(lat_data, sizeof (uint64_t) * children);
}
static void
vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
{
@ -3515,6 +3850,9 @@ vdev_raidz_io_done(zio_t *zio)
raidz_row_t *rr = rm->rm_row[i];
vdev_raidz_io_done_verified(zio, rr);
}
/* Periodically check for a read outlier */
if (zio->io_type == ZIO_TYPE_READ)
vdev_child_slow_outlier(zio);
zio_checksum_verified(zio);
} else {
/*
@ -5155,3 +5493,10 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
"For expanded RAIDZ, automatically start a pool scrub when expansion "
"completes");
ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW,
"Raidz/draid slow disk sit out time period in seconds");
ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, ULONG,
ZMOD_RW, "Interval to check for slow raidz/draid children");
ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT,
ZMOD_RW, "How insensitive the slow raidz/draid child check should be");
/* END CSTYLED */

View File

@ -940,10 +940,11 @@ tags = ['functional', 'rename_dirs']
[tests/functional/replacement]
tests = ['attach_import', 'attach_multiple', 'attach_rebuild',
'attach_resilver', 'detach', 'rebuild_disabled_feature',
'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild',
'replace_resilver', 'resilver_restart_001', 'resilver_restart_002',
'scrub_cancel']
'attach_resilver', 'attach_resilver_sit_out', 'detach',
'rebuild_disabled_feature', 'rebuild_multiple', 'rebuild_raidz',
'replace_import', 'replace_rebuild', 'replace_resilver',
'replace_resilver_sit_out', 'resilver_restart_001',
'resilver_restart_002', 'scrub_cancel']
tags = ['functional', 'replacement']
[tests/functional/reservation]

View File

@ -109,7 +109,8 @@ tags = ['functional', 'direct']
[tests/functional/events:Linux]
tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill',
'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config',
'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple']
'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple',
'slow_vdev_sit_out', 'slow_vdev_sit_out_neg', 'slow_vdev_degraded_sit_out']
tags = ['functional', 'events']
[tests/functional/fallocate:Linux]

View File

@ -1112,6 +1112,16 @@ function get_pool_prop # property pool
zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool"
}
# Get the specified vdev property in parsable format or fail
function get_vdev_prop
{
typeset prop="$1"
typeset pool="$2"
typeset vdev="$3"
zpool get -Hpo value "$prop" "$pool" "$vdev" || log_fail "zpool get $prop $pool $vdev"
}
# Return 0 if a pool exists; $? otherwise
#
# $1 - pool name
@ -1970,6 +1980,28 @@ function wait_vdev_state # pool disk state timeout
return 1
}
#
# Wait for vdev 'sit_out' property to be cleared.
#
# $1 pool name
# $2 vdev name
# $3 timeout
#
function wait_sit_out #pool vdev timeout
{
typeset pool=${1:-$TESTPOOL}
typeset vdev="$2"
typeset timeout=${3:-300}
for (( timer = 0; timer < $timeout; timer++ )); do
if [ "$(get_vdev_prop sit_out "$pool" "$vdev")" = "off" ]; then
return 0
fi
sleep 1;
done
return 1
}
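#
# Example usage (pool and vdev names are illustrative):
#
#   log_must wait_sit_out $TESTPOOL2 $SLOW_VDEV 10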
#
# Check the output of 'zpool status -v <pool>',
# and to see if the content of <token> contain the <keyword> specified.

View File

@ -72,6 +72,9 @@ MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval
OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize
PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable
RAIDZ_EXPAND_MAX_REFLOW_BYTES vdev.expand_max_reflow_bytes raidz_expand_max_reflow_bytes
READ_SIT_OUT_SECS vdev.read_sit_out_secs vdev_read_sit_out_secs
SIT_OUT_CHECK_INTERVAL vdev.raidz_outlier_check_interval_ms vdev_raidz_outlier_check_interval_ms
SIT_OUT_INSENSITIVITY vdev.raidz_outlier_insensitivity vdev_raidz_outlier_insensitivity
REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled
REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress
REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment

View File

@ -1525,6 +1525,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/events/events_001_pos.ksh \
functional/events/events_002_pos.ksh \
functional/events/setup.ksh \
functional/events/slow_vdev_degraded_sit_out.ksh \
functional/events/slow_vdev_sit_out.ksh \
functional/events/slow_vdev_sit_out_neg.ksh \
functional/events/zed_cksum_config.ksh \
functional/events/zed_cksum_reported.ksh \
functional/events/zed_diagnose_multiple.ksh \
@ -1937,6 +1940,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/replacement/attach_multiple.ksh \
functional/replacement/attach_rebuild.ksh \
functional/replacement/attach_resilver.ksh \
functional/replacement/attach_resilver_sit_out.ksh \
functional/replacement/cleanup.ksh \
functional/replacement/detach.ksh \
functional/replacement/rebuild_disabled_feature.ksh \
@ -1945,6 +1949,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/replacement/replace_import.ksh \
functional/replacement/replace_rebuild.ksh \
functional/replacement/replace_resilver.ksh \
functional/replacement/replace_resilver_sit_out.ksh \
functional/replacement/resilver_restart_001.ksh \
functional/replacement/resilver_restart_002.ksh \
functional/replacement/scrub_cancel.ksh \

View File

@ -0,0 +1,106 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
# Copyright (c) 2025 by Klara, Inc.
# DESCRIPTION:
# Verify that vdevs 'sit out' when they are slow
#
# STRATEGY:
# 1. Create various raidz/draid pools
# 2. Degrade/fault one of the disks.
# 3. Inject delays into one of the disks
# 4. Verify disk is set to 'sit out' for a while.
# 5. Wait for READ_SIT_OUT_SECS and verify sit out state is lifted.
#
. $STF_SUITE/include/libtest.shlib
function cleanup
{
restore_tunable READ_SIT_OUT_SECS
restore_tunable SIT_OUT_CHECK_INTERVAL
log_must zinject -c all
log_must zpool events -c
destroy_pool $TESTPOOL2
log_must rm -f $TEST_BASE_DIR/vdev.$$.*
}
log_assert "Verify sit_out works"
log_onexit cleanup
# shorten sit out period for testing
save_tunable READ_SIT_OUT_SECS
set_tunable32 READ_SIT_OUT_SECS 5
save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20
log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9}
for raidtype in raidz2 raidz3 draid2 draid3 ; do
log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9}
log_must zpool set autosit=on $TESTPOOL2 "${raidtype}-0"
log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=400
log_must zpool export $TESTPOOL2
log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2
BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9
SLOW_VDEV=$TEST_BASE_DIR/vdev.$$.8
# Initial state should not be sitting out
log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "off" ]]
# Delay our reads 200ms to trigger sit out
log_must zinject -d $SLOW_VDEV -D200:1 -T read $TESTPOOL2
type=$((RANDOM % 2))
[[ "$type" -eq "0" ]] && action="degrade" || action="fault"
log_must zinject -d $BAD_VDEV -A $action -T read $TESTPOOL2
# Do some reads and wait for us to sit out
for i in {0..99} ; do
dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null
sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)
if [[ "$sit_out" == "on" ]] ; then
break
fi
done
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "on"
# Clear fault injection
log_must zinject -c all
# Wait for us to exit our sit out period
log_must wait_sit_out $TESTPOOL2 $SLOW_VDEV 10
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "off"
destroy_pool $TESTPOOL2
log_must zpool labelclear -f $BAD_VDEV
done
log_pass "sit_out works correctly"

View File

@ -0,0 +1,102 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
# DESCRIPTION:
# Verify that vdevs 'sit out' when they are slow
#
# STRATEGY:
# 1. Create various raidz/draid pools
# 2. Inject delays into one of the disks
# 3. Verify disk is set to 'sit out' for a while.
# 4. Wait for READ_SIT_OUT_SECS and verify sit out state is lifted.
#
. $STF_SUITE/include/libtest.shlib
function cleanup
{
restore_tunable READ_SIT_OUT_SECS
restore_tunable SIT_OUT_CHECK_INTERVAL
log_must zinject -c all
log_must zpool events -c
destroy_pool $TESTPOOL2
log_must rm -f $TEST_BASE_DIR/vdev.$$.*
}
log_assert "Verify sit_out works"
log_onexit cleanup
# shorten sit out period for testing
save_tunable READ_SIT_OUT_SECS
set_tunable32 READ_SIT_OUT_SECS 5
save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20
log_must truncate -s200M $TEST_BASE_DIR/vdev.$$.{0..9}
for raidtype in raidz raidz2 raidz3 draid1 draid2 draid3 ; do
log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9}
log_must zpool set autosit=on $TESTPOOL2 "${raidtype}-0"
log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=600
log_must zpool export $TESTPOOL2
log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2
BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9
# Initial state should not be sitting out
log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off" ]]
# Delay our reads 200ms to trigger sit out
log_must zinject -d $BAD_VDEV -D200:1 -T read $TESTPOOL2
# Do some reads and wait for us to sit out
for i in {0..99} ; do
dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null &
dd if=/$TESTPOOL2/bigfile skip=$((i + 200)) bs=2M count=1 of=/dev/null
sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)
if [[ "$sit_out" == "on" ]] ; then
break
fi
done
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "on"
# Clear fault injection
log_must zinject -c all
# Wait for us to exit our sit out period
log_must wait_sit_out $TESTPOOL2 $BAD_VDEV 10
# Verify sit_out was cleared during wait_sit_out
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off"
destroy_pool $TESTPOOL2
done
log_pass "sit_out works correctly"

View File

@ -0,0 +1,116 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
# Copyright (c) 2025 by Klara, Inc.
# DESCRIPTION:
# Verify that we don't sit out too many vdevs
#
# STRATEGY:
# 1. Create draid2 pool
# 2. Inject delays into three of the disks
# 3. Do reads to trigger sit-outs
# 4. Verify exactly 2 disks sit out
#
. $STF_SUITE/include/libtest.shlib
function cleanup
{
restore_tunable READ_SIT_OUT_SECS
restore_tunable SIT_OUT_CHECK_INTERVAL
log_must zinject -c all
log_must zpool events -c
destroy_pool $TESTPOOL2
log_must rm -f $TEST_BASE_DIR/vdev.$$.*
}
log_assert "Verify sit_out works"
log_onexit cleanup
save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20
log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9}
log_must zpool create $TESTPOOL2 draid2 $TEST_BASE_DIR/vdev.$$.{0..9}
log_must zpool set autosit=on $TESTPOOL2 draid2-0
log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=400
log_must zpool export $TESTPOOL2
log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2
BAD_VDEV1=$TEST_BASE_DIR/vdev.$$.7
BAD_VDEV2=$TEST_BASE_DIR/vdev.$$.8
BAD_VDEV3=$TEST_BASE_DIR/vdev.$$.9
# Initial state should not be sitting out
log_must eval [[ "$(get_vdev_prop autosit $TESTPOOL2 draid2-0)" == "on" ]]
log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1)" == "off" ]]
log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2)" == "off" ]]
log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3)" == "off" ]]
# Delay our reads 200ms to trigger sit out
log_must zinject -d $BAD_VDEV1 -D200:1 -T read $TESTPOOL2
# Do some reads and wait for us to sit out
for i in {0..99} ; do
dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null
sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1)
if [[ "$sit_out" == "on" ]] ; then
break
fi
done
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1)" == "on"
log_must zinject -d $BAD_VDEV2 -D200:1 -T read $TESTPOOL2
# Do some reads and wait for us to sit out
for i in {0..99} ; do
dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null
sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2)
if [[ "$sit_out" == "on" ]] ; then
break
fi
done
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2)" == "on"
log_must zinject -d $BAD_VDEV3 -D200:1 -T read $TESTPOOL2
# Do some reads and wait for us to sit out
for i in {0..99} ; do
dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null
sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3)
if [[ "$sit_out" == "on" ]] ; then
break
fi
done
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3)" == "off"
log_pass "sit_out works correctly"

View File

@ -0,0 +1,189 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
# Copyright (c) 2025, Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
# Attaching disks while a disk is sitting out reads should pass
#
# STRATEGY:
# 1. Create raidz pools
# 2. Make one disk slower and trigger a read sit out for that disk
# 3. Start some random I/O
# 4. Attach a disk to the pool.
# 5. Verify the integrity of the file system and the resilvering.
verify_runnable "global"
save_tunable READ_SIT_OUT_SECS
set_tunable32 READ_SIT_OUT_SECS 120
save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20
function cleanup
{
restore_tunable READ_SIT_OUT_SECS
restore_tunable SIT_OUT_CHECK_INTERVAL
log_must zinject -c all
log_must zpool events -c
if [[ -n "$child_pids" ]]; then
for wait_pid in $child_pids; do
kill $wait_pid
done
fi
if poolexists $TESTPOOL1; then
destroy_pool $TESTPOOL1
fi
[[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
}
log_assert "Replacing a disk during I/O with a sit out completes."
options=""
options_display="default options"
log_onexit cleanup
[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
options="$options -r "
[[ -n "$options" ]] && options_display=$options
child_pids=""
function attach_test
{
typeset vdev=$1
typeset disk=$2
typeset i=0
while [[ $i -lt $iters ]]; do
log_note "Invoking file_trunc with: $options_display on $TESTFILE.$i"
file_trunc $options $TESTDIR/$TESTFILE.$i &
typeset pid=$!
sleep 1
child_pids="$child_pids $pid"
((i = i + 1))
done
# attach disk with a slow drive still present
SECONDS=0
log_must zpool attach -w $TESTPOOL1 $vdev $disk
log_note took $SECONDS seconds to attach disk
for wait_pid in $child_pids
do
kill $wait_pid
done
child_pids=""
log_must zinject -c all
log_must zpool export $TESTPOOL1
log_must zpool import -d $TESTDIR $TESTPOOL1
log_must zfs umount $TESTPOOL1/$TESTFS1
log_must zdb -cdui $TESTPOOL1/$TESTFS1
log_must zfs mount $TESTPOOL1/$TESTFS1
verify_pool $TESTPOOL1
}
DEVSIZE="150M"
specials_list=""
i=0
while [[ $i != 10 ]]; do
truncate -s $DEVSIZE $TESTDIR/$TESTFILE1.$i
specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
((i = i + 1))
done
slow_disk=$TESTDIR/$TESTFILE1.3
log_must truncate -s $DEVSIZE $TESTDIR/$REPLACEFILE
# Test file size in MB
count=200
for type in "raidz1" "raidz2" "raidz3" ; do
create_pool $TESTPOOL1 $type $specials_list
log_must zpool set autosit=on $TESTPOOL1 "${type}-0"
log_must zfs create -o primarycache=none -o recordsize=512K \
$TESTPOOL1/$TESTFS1
log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
log_must dd if=/dev/urandom of=/$TESTDIR1/bigfile bs=1M count=$count
# Make one disk 100ms slower to trigger a sit out
log_must zinject -d $slow_disk -D100:1 -T read $TESTPOOL1
# Do some reads and wait for sit out on slow disk
SECONDS=0
typeset -i size=0
for i in $(seq 1 $count) ; do
dd if=/$TESTDIR1/bigfile skip=$i bs=1M count=1 of=/dev/null
size=$i
sit_out=$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)
if [[ "$sit_out" == "on" ]] ; then
break
fi
done
log_must test "$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)" == "on"
log_note took $SECONDS seconds to reach sit out reading ${size}M
log_must zpool status -s $TESTPOOL1
typeset top=$(zpool status -j | jq -r ".pools.$TESTPOOL1.vdevs[].vdevs[].name")
attach_test $top $TESTDIR/$REPLACEFILE
log_must eval "zpool iostat -v $TESTPOOL1 | grep \"$REPLACEFILE\""
destroy_pool $TESTPOOL1
log_must rm -rf /$TESTPOOL1
done
log_pass

View File

@ -0,0 +1,199 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
# Copyright (c) 2025, Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
# Replacing disks while a disk is sitting out reads should pass
#
# STRATEGY:
# 1. Create raidz and draid pools
# 2. Make one disk slower and trigger a read sit out for that disk
# 3. Start some random I/O
# 4. Replace a disk in the pool with another disk.
# 5. Verify the integrity of the file system and the resilvering.
#
verify_runnable "global"
save_tunable READ_SIT_OUT_SECS
set_tunable32 READ_SIT_OUT_SECS 120
save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20
function cleanup
{
restore_tunable READ_SIT_OUT_SECS
restore_tunable SIT_OUT_CHECK_INTERVAL
log_must zinject -c all
log_must zpool events -c
if [[ -n "$child_pids" ]]; then
for wait_pid in $child_pids
do
kill $wait_pid
done
fi
if poolexists $TESTPOOL1; then
destroy_pool $TESTPOOL1
fi
[[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
}
log_assert "Replacing a disk during I/O with a sit out completes."
options=""
options_display="default options"
log_onexit cleanup
[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
options="$options -r "
[[ -n "$options" ]] && options_display=$options
child_pids=""
function replace_test
{
typeset -i iters=2
typeset disk1=$1
typeset disk2=$2
typeset repl_type=$3
typeset i=0
while [[ $i -lt $iters ]]; do
log_note "Invoking file_trunc with: $options_display on $TESTFILE.$i"
file_trunc $options $TESTDIR/$TESTFILE.$i &
typeset pid=$!
sleep 1
child_pids="$child_pids $pid"
((i = i + 1))
done
typeset repl_flag="-w"
if [[ "$repl_type" == "seq" ]]; then
repl_flag="-ws"
fi
# replace disk with a slow drive still present
SECONDS=0
log_must zpool replace $repl_flag $TESTPOOL1 $disk1 $disk2
log_note took $SECONDS seconds to replace disk
for wait_pid in $child_pids
do
kill $wait_pid
done
child_pids=""
log_must zinject -c all
log_must zpool export $TESTPOOL1
log_must zpool import -d $TESTDIR $TESTPOOL1
log_must zfs umount $TESTPOOL1/$TESTFS1
log_must zdb -cdui $TESTPOOL1/$TESTFS1
log_must zfs mount $TESTPOOL1/$TESTFS1
verify_pool $TESTPOOL1
}
DEVSIZE="150M"
specials_list=""
i=0
while [[ $i != 10 ]]; do
log_must truncate -s $DEVSIZE $TESTDIR/$TESTFILE1.$i
specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
((i = i + 1))
done
slow_disk=$TESTDIR/$TESTFILE1.3
log_must truncate -s $DEVSIZE $TESTDIR/$REPLACEFILE
# Test file size in MB
count=400
for type in "raidz2" "raidz3" "draid2"; do
create_pool $TESTPOOL1 $type $specials_list
log_must zpool set autosit=on $TESTPOOL1 "${type}-0"
log_must zfs create -o primarycache=none -o recordsize=512K \
$TESTPOOL1/$TESTFS1
log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
log_must dd if=/dev/urandom of=/$TESTDIR1/bigfile bs=1M count=$count
# Make one disk 100ms slower to trigger a sit out
log_must zinject -d $slow_disk -D100:1 -T read $TESTPOOL1
# Do some reads and wait for sit out on slow disk
SECONDS=0
typeset -i size=0
for i in $(seq 1 $count) ; do
dd if=/$TESTDIR1/bigfile skip=$i bs=1M count=1 of=/dev/null
size=$i
sit_out=$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)
if [[ "$sit_out" == "on" ]] ; then
break
fi
done
log_must test "$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)" == "on"
log_note took $SECONDS seconds to reach sit out reading ${size}M
log_must zpool status -s $TESTPOOL1
typeset repl_type="replace"
if [[ "$type" == "draid2" && $((RANDOM % 2)) -eq 0 ]]; then
repl_type="seq"
fi
replace_test $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE $repl_type
log_must eval "zpool iostat -v $TESTPOOL1 | grep \"$REPLACEFILE\""
destroy_pool $TESTPOOL1
log_must rm -rf /$TESTPOOL1
done
log_pass