mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-01-15 01:32:04 +03:00
Detect a slow raidz child during reads
A single slow responding disk can affect the overall read performance of a raidz group. When a raidz child disk is determined to be a persistent slow outlier, then have it sit out during reads for a period of time. The raidz group can use parity to reconstruct the data that was skipped. Each time a slow disk is placed into a sit out period, its `vdev_stat.vs_slow_ios` count is incremented and a zevent class `ereport.fs.zfs.delay` is posted. The length of the sit out period can be changed using the `vdev_read_sit_out_secs` module parameter. Setting it to zero disables slow outlier detection. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com> Contributions-by: Don Brady <don.brady@klarasystems.com> Contributions-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #17227
This commit is contained in:
parent
0df85ec27c
commit
df55ba7c49
@ -62,6 +62,17 @@ typedef longlong_t hrtime_t;
|
||||
#define SEC_TO_TICK(sec) ((sec) * hz)
|
||||
#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz))
|
||||
|
||||
static __inline hrtime_t
|
||||
getlrtime(void)
|
||||
{
|
||||
struct timespec ts;
|
||||
hrtime_t nsec;
|
||||
|
||||
getnanouptime(&ts);
|
||||
nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec;
|
||||
return (nsec);
|
||||
}
|
||||
|
||||
static __inline hrtime_t
|
||||
gethrtime(void)
|
||||
{
|
||||
|
||||
@ -79,6 +79,14 @@ gethrestime_sec(void)
|
||||
return (ts.tv_sec);
|
||||
}
|
||||
|
||||
static inline hrtime_t
|
||||
getlrtime(void)
|
||||
{
|
||||
inode_timespec_t ts;
|
||||
ktime_get_coarse_ts64(&ts);
|
||||
return (((hrtime_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec);
|
||||
}
|
||||
|
||||
static inline hrtime_t
|
||||
gethrtime(void)
|
||||
{
|
||||
|
||||
@ -58,6 +58,7 @@ extern "C" {
|
||||
#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure"
|
||||
#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay"
|
||||
#define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write"
|
||||
#define FM_EREPORT_ZFS_SITOUT "sitout"
|
||||
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode"
|
||||
|
||||
@ -385,6 +385,8 @@ typedef enum {
|
||||
VDEV_PROP_TRIM_SUPPORT,
|
||||
VDEV_PROP_TRIM_ERRORS,
|
||||
VDEV_PROP_SLOW_IOS,
|
||||
VDEV_PROP_SIT_OUT,
|
||||
VDEV_PROP_AUTOSIT,
|
||||
VDEV_NUM_PROPS
|
||||
} vdev_prop_t;
|
||||
|
||||
@ -1673,6 +1675,7 @@ typedef enum {
|
||||
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
|
||||
ZFS_ERR_ASHIFT_MISMATCH,
|
||||
ZFS_ERR_STREAM_LARGE_MICROZAP,
|
||||
ZFS_ERR_TOO_MANY_SITOUTS,
|
||||
} zfs_errno_t;
|
||||
|
||||
/*
|
||||
|
||||
@ -279,10 +279,12 @@ struct vdev {
|
||||
uint64_t vdev_noalloc; /* device is passivated? */
|
||||
uint64_t vdev_removing; /* device is being removed? */
|
||||
uint64_t vdev_failfast; /* device failfast setting */
|
||||
boolean_t vdev_autosit; /* automatic sitout management */
|
||||
boolean_t vdev_rz_expanding; /* raidz is being expanded? */
|
||||
boolean_t vdev_ishole; /* is a hole in the namespace */
|
||||
uint64_t vdev_top_zap;
|
||||
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
|
||||
uint64_t vdev_last_latency_check;
|
||||
|
||||
/* pool checkpoint related */
|
||||
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
|
||||
@ -431,6 +433,10 @@ struct vdev {
|
||||
hrtime_t vdev_mmp_pending; /* 0 if write finished */
|
||||
uint64_t vdev_mmp_kstat_id; /* to find kstat entry */
|
||||
uint64_t vdev_expansion_time; /* vdev's last expansion time */
|
||||
/* used to calculate average read latency */
|
||||
uint64_t *vdev_prev_histo;
|
||||
int64_t vdev_outlier_count; /* read outlier amongst peers */
|
||||
hrtime_t vdev_read_sit_out_expire; /* end of sit out period */
|
||||
list_node_t vdev_leaf_node; /* leaf vdev list */
|
||||
|
||||
/*
|
||||
|
||||
@ -61,6 +61,9 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *);
|
||||
struct raidz_row *vdev_raidz_row_alloc(int, zio_t *);
|
||||
void vdev_raidz_reflow_copy_scratch(spa_t *);
|
||||
void raidz_dtl_reassessed(vdev_t *);
|
||||
boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t);
|
||||
void vdev_raidz_sit_child(vdev_t *, uint64_t);
|
||||
void vdev_raidz_unsit_child(vdev_t *);
|
||||
|
||||
extern const zio_vsd_ops_t vdev_raidz_vsd_ops;
|
||||
|
||||
|
||||
@ -119,6 +119,7 @@ typedef struct raidz_col {
|
||||
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
|
||||
uint8_t rc_force_repair:1; /* Write good data to this column */
|
||||
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
|
||||
uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
|
||||
int rc_shadow_devidx; /* for double write during expansion */
|
||||
int rc_shadow_error; /* for double write during expansion */
|
||||
uint64_t rc_shadow_offset; /* for double write during expansion */
|
||||
@ -133,6 +134,7 @@ typedef struct raidz_row {
|
||||
int rr_firstdatacol; /* First data column/parity count */
|
||||
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
|
||||
int rr_nempty; /* empty sectors included in parity */
|
||||
int rr_outlier_cnt; /* Count of latency outlier devices */
|
||||
#ifdef ZFS_DEBUG
|
||||
uint64_t rr_offset; /* Logical offset for *_io_verify() */
|
||||
uint64_t rr_size; /* Physical size for *_io_verify() */
|
||||
|
||||
@ -97,6 +97,15 @@ gethrestime_sec(void)
|
||||
return (tv.tv_sec);
|
||||
}
|
||||
|
||||
static inline hrtime_t
|
||||
getlrtime(void)
|
||||
{
|
||||
struct timeval tv;
|
||||
(void) gettimeofday(&tv, NULL);
|
||||
return ((((uint64_t)tv.tv_sec) * NANOSEC) +
|
||||
((uint64_t)tv.tv_usec * NSEC_PER_USEC));
|
||||
}
|
||||
|
||||
static inline hrtime_t
|
||||
gethrtime(void)
|
||||
{
|
||||
|
||||
@ -6117,7 +6117,9 @@
|
||||
<enumerator name='VDEV_PROP_TRIM_SUPPORT' value='49'/>
|
||||
<enumerator name='VDEV_PROP_TRIM_ERRORS' value='50'/>
|
||||
<enumerator name='VDEV_PROP_SLOW_IOS' value='51'/>
|
||||
<enumerator name='VDEV_NUM_PROPS' value='52'/>
|
||||
<enumerator name='VDEV_PROP_SIT_OUT' value='52'/>
|
||||
<enumerator name='VDEV_PROP_AUTOSIT' value='53'/>
|
||||
<enumerator name='VDEV_NUM_PROPS' value='54'/>
|
||||
</enum-decl>
|
||||
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
|
||||
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>
|
||||
|
||||
@ -5549,6 +5549,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
|
||||
/* Only use if provided by the RAIDZ VDEV above */
|
||||
if (prop == VDEV_PROP_RAIDZ_EXPANDING)
|
||||
return (ENOENT);
|
||||
if (prop == VDEV_PROP_SIT_OUT)
|
||||
return (ENOENT);
|
||||
}
|
||||
if (vdev_prop_index_to_string(prop, intval,
|
||||
(const char **)&strval) != 0)
|
||||
@ -5718,8 +5720,16 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname,
|
||||
nvlist_free(nvl);
|
||||
nvlist_free(outnvl);
|
||||
|
||||
if (ret)
|
||||
(void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf);
|
||||
if (ret) {
|
||||
if (errno == ENOTSUP) {
|
||||
zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
|
||||
"property not supported for this vdev"));
|
||||
(void) zfs_error(zhp->zpool_hdl, EZFS_PROPTYPE, errbuf);
|
||||
} else {
|
||||
(void) zpool_standard_error(zhp->zpool_hdl, errno,
|
||||
errbuf);
|
||||
}
|
||||
}
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
||||
@ -776,6 +776,11 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
|
||||
case ZFS_ERR_ASHIFT_MISMATCH:
|
||||
zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap);
|
||||
break;
|
||||
case ZFS_ERR_TOO_MANY_SITOUTS:
|
||||
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "too many disks "
|
||||
"already sitting out"));
|
||||
zfs_verror(hdl, EZFS_BUSY, fmt, ap);
|
||||
break;
|
||||
default:
|
||||
zfs_error_aux(hdl, "%s", zfs_strerror(error));
|
||||
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
|
||||
.\" Copyright (c) 2019 Datto Inc.
|
||||
.\" Copyright (c) 2023, 2024, 2025, Klara, Inc.
|
||||
.\"
|
||||
.\" The contents of this file are subject to the terms of the Common Development
|
||||
.\" and Distribution License (the "License"). You may not use this file except
|
||||
.\" in compliance with the License. You can obtain a copy of the license at
|
||||
@ -601,6 +602,42 @@ new format when enabling the
|
||||
feature.
|
||||
The default is to convert all log entries.
|
||||
.
|
||||
.It Sy vdev_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
|
||||
When a slow disk outlier is detected it is placed in a sit out state.
|
||||
While sitting out the disk will not participate in normal reads, instead its
|
||||
data will be reconstructed as needed from parity.
|
||||
Scrub operations will always read from a disk, even if it's sitting out.
|
||||
A number of disks in a RAID-Z or dRAID vdev may sit out at the same time, up
|
||||
to the number of parity devices.
|
||||
Writes will still be issued to a disk which is sitting out to maintain full
|
||||
redundancy.
|
||||
Defaults to 600 seconds and a value of zero disables disk sit-outs in general,
|
||||
including slow disk outlier detection.
|
||||
.
|
||||
.It Sy vdev_raidz_outlier_check_interval_ms Ns = Ns Sy 1000 Ns ms Po 1 sec Pc Pq ulong
|
||||
How often each RAID-Z and dRAID vdev will check for slow disk outliers.
|
||||
Increasing this interval will reduce the sensitivity of detection (since all
|
||||
I/Os since the last check are included in the statistics), but will slow the
|
||||
response to a disk developing a problem.
|
||||
Defaults to once per second; setting extremely small values may cause negative
|
||||
performance effects.
|
||||
.
|
||||
.It Sy vdev_raidz_outlier_insensitivity Ns = Ns Sy 50 Pq uint
|
||||
When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is
|
||||
used to determine how far out an outlier must be before it counts as an event
|
||||
worth considering.
|
||||
This is phrased as "insensitivity" because larger values result in fewer
|
||||
detections.
|
||||
Smaller values will result in more aggressive sitting out of disks that may have
|
||||
problems, but may significantly increase the rate of spurious sit-outs.
|
||||
.Pp
|
||||
To provide a more technical definition of this parameter, this is the multiple
|
||||
of the inter-quartile range (IQR) that is being used in a Tukey's Fence
|
||||
detection algorithm.
|
||||
This is much higher than a normal Tukey's Fence k-value, because the
|
||||
distribution under consideration is probably an extreme-value distribution,
|
||||
rather than a more typical Gaussian distribution.
|
||||
.
|
||||
.It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint
|
||||
During top-level vdev removal, chunks of data are copied from the vdev
|
||||
which may include free space in order to trade bandwidth for IOPS.
|
||||
|
||||
@ -19,7 +19,7 @@
|
||||
.\"
|
||||
.\" CDDL HEADER END
|
||||
.\"
|
||||
.\" Copyright (c) 2021 Klara, Inc.
|
||||
.\" Copyright (c) 2021, 2025, Klara, Inc.
|
||||
.\"
|
||||
.Dd July 23, 2024
|
||||
.Dt VDEVPROPS 7
|
||||
@ -106,11 +106,17 @@ The number of children belonging to this vdev
|
||||
.It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors
|
||||
The number of errors of each type encountered by this vdev
|
||||
.It Sy slow_ios
|
||||
The number of slow I/Os encountered by this vdev,
|
||||
These represent I/O operations that didn't complete in
|
||||
This indicates the number of slow I/O operations encountered by this vdev.
|
||||
A slow I/O is defined as an operation that did not complete within the
|
||||
.Sy zio_slow_io_ms
|
||||
milliseconds
|
||||
threshold in milliseconds
|
||||
.Pq Sy 30000 No by default .
|
||||
For
|
||||
.Sy RAIDZ
|
||||
and
|
||||
.Sy DRAID
|
||||
configurations, this value also represents the number of times the vdev was
|
||||
identified as an outlier and excluded from participating in read I/O operations.
|
||||
.It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops
|
||||
The number of I/O operations of each type performed by this vdev
|
||||
.It Xo
|
||||
@ -150,6 +156,31 @@ The amount of space to reserve for the EFI system partition
|
||||
.It Sy failfast
|
||||
If this device should propagate BIO errors back to ZFS, used to disable
|
||||
failfast.
|
||||
.It Sy sit_out
|
||||
Only valid for
|
||||
.Sy RAIDZ
|
||||
and
|
||||
.Sy DRAID
|
||||
vdevs.
|
||||
True when a slow disk outlier was detected and the vdev is currently in a sit
|
||||
out state.
|
||||
This property can be manually set to cause vdevs to sit out.
|
||||
It will also be automatically set by the
|
||||
.Sy autosit
|
||||
logic if that is enabled.
|
||||
While sitting out, the vdev will not participate in normal reads, instead its
|
||||
data will be reconstructed as needed from parity.
|
||||
.It Sy autosit
|
||||
Only valid for
|
||||
.Sy RAIDZ
|
||||
and
|
||||
.Sy DRAID
|
||||
vdevs.
|
||||
If set, this enables the kernel-level slow disk detection logic.
|
||||
This logic automatically causes any vdevs that are significant negative
|
||||
performance outliers to sit out, as described in the
|
||||
.Sy sit_out
|
||||
property.
|
||||
.It Sy path
|
||||
The path to the device for this vdev
|
||||
.It Sy allocating
|
||||
|
||||
@ -190,6 +190,16 @@ Issued when a scrub is resumed on a pool.
|
||||
.It Sy scrub.paused
|
||||
Issued when a scrub is paused on a pool.
|
||||
.It Sy bootfs.vdev.attach
|
||||
.It Sy sitout
|
||||
Issued when a
|
||||
.Sy RAIDZ
|
||||
or
|
||||
.Sy DRAID
|
||||
vdev triggers the
|
||||
.Sy autosit
|
||||
logic.
|
||||
This logic detects when a disk in such a vdev is significantly slower than its
|
||||
peers, and sits them out temporarily to preserve the performance of the pool.
|
||||
.El
|
||||
.
|
||||
.Sh PAYLOADS
|
||||
|
||||
@ -467,9 +467,15 @@ vdev_prop_init(void)
|
||||
zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0,
|
||||
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING",
|
||||
boolean_table, sfeatures);
|
||||
zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0,
|
||||
PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table,
|
||||
sfeatures);
|
||||
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
|
||||
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
|
||||
boolean_table, sfeatures);
|
||||
zprop_register_index(VDEV_PROP_AUTOSIT, "autosit", 0,
|
||||
PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "AUTOSIT", boolean_table,
|
||||
sfeatures);
|
||||
|
||||
/* default index properties */
|
||||
zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE,
|
||||
|
||||
@ -29,7 +29,7 @@
|
||||
* Copyright 2017 Joyent, Inc.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
* Copyright (c) 2019, Datto Inc. All rights reserved.
|
||||
* Copyright (c) 2021, Klara Inc.
|
||||
* Copyright (c) 2021, 2025, Klara, Inc.
|
||||
* Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
|
||||
*/
|
||||
|
||||
@ -1086,6 +1086,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
}
|
||||
}
|
||||
|
||||
if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops))
|
||||
vd->vdev_autosit =
|
||||
vdev_prop_default_numeric(VDEV_PROP_AUTOSIT);
|
||||
|
||||
/*
|
||||
* Add ourselves to the parent's list of children.
|
||||
*/
|
||||
@ -1187,6 +1191,9 @@ vdev_free(vdev_t *vd)
|
||||
spa_spare_remove(vd);
|
||||
if (vd->vdev_isl2cache)
|
||||
spa_l2cache_remove(vd);
|
||||
if (vd->vdev_prev_histo)
|
||||
kmem_free(vd->vdev_prev_histo,
|
||||
sizeof (uint64_t) * VDEV_L_HISTO_BUCKETS);
|
||||
|
||||
txg_list_destroy(&vd->vdev_ms_list);
|
||||
txg_list_destroy(&vd->vdev_dtl_list);
|
||||
@ -3857,6 +3864,26 @@ vdev_load(vdev_t *vd)
|
||||
}
|
||||
}
|
||||
|
||||
if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
uint64_t autosit;
|
||||
|
||||
error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
|
||||
vdev_prop_to_name(VDEV_PROP_AUTOSIT), sizeof (autosit),
|
||||
1, &autosit);
|
||||
if (error == 0) {
|
||||
vd->vdev_autosit = autosit == 1;
|
||||
} else if (error == ENOENT) {
|
||||
vd->vdev_autosit = vdev_prop_default_numeric(
|
||||
VDEV_PROP_AUTOSIT);
|
||||
} else {
|
||||
vdev_dbgmsg(vd,
|
||||
"vdev_load: zap_lookup(top_zap=%llu) "
|
||||
"failed [error=%d]",
|
||||
(u_longlong_t)vd->vdev_top_zap, error);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Load any rebuild state from the top-level vdev zap.
|
||||
*/
|
||||
@ -4616,6 +4643,8 @@ vdev_clear(spa_t *spa, vdev_t *vd)
|
||||
vd->vdev_stat.vs_checksum_errors = 0;
|
||||
vd->vdev_stat.vs_dio_verify_errors = 0;
|
||||
vd->vdev_stat.vs_slow_ios = 0;
|
||||
atomic_store_64(&vd->vdev_outlier_count, 0);
|
||||
vd->vdev_read_sit_out_expire = 0;
|
||||
|
||||
for (int c = 0; c < vd->vdev_children; c++)
|
||||
vdev_clear(spa, vd->vdev_child[c]);
|
||||
@ -6107,6 +6136,56 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
|
||||
}
|
||||
vd->vdev_failfast = intval & 1;
|
||||
break;
|
||||
case VDEV_PROP_SIT_OUT:
|
||||
/* Only expose this for a draid or raidz leaf */
|
||||
if (!vd->vdev_ops->vdev_op_leaf ||
|
||||
vd->vdev_top == NULL ||
|
||||
(vd->vdev_top->vdev_ops != &vdev_raidz_ops &&
|
||||
vd->vdev_top->vdev_ops != &vdev_draid_ops)) {
|
||||
error = ENOTSUP;
|
||||
break;
|
||||
}
|
||||
if (nvpair_value_uint64(elem, &intval) != 0) {
|
||||
error = EINVAL;
|
||||
break;
|
||||
}
|
||||
if (intval == 1) {
|
||||
vdev_t *ancestor = vd;
|
||||
while (ancestor->vdev_parent != vd->vdev_top)
|
||||
ancestor = ancestor->vdev_parent;
|
||||
vdev_t *pvd = vd->vdev_top;
|
||||
uint_t sitouts = 0;
|
||||
for (int i = 0; i < pvd->vdev_children; i++) {
|
||||
if (pvd->vdev_child[i] == ancestor)
|
||||
continue;
|
||||
if (vdev_sit_out_reads(
|
||||
pvd->vdev_child[i], 0)) {
|
||||
sitouts++;
|
||||
}
|
||||
}
|
||||
if (sitouts >= vdev_get_nparity(pvd)) {
|
||||
error = ZFS_ERR_TOO_MANY_SITOUTS;
|
||||
break;
|
||||
}
|
||||
if (error == 0)
|
||||
vdev_raidz_sit_child(vd,
|
||||
INT64_MAX - gethrestime_sec());
|
||||
} else {
|
||||
vdev_raidz_unsit_child(vd);
|
||||
}
|
||||
break;
|
||||
case VDEV_PROP_AUTOSIT:
|
||||
if (vd->vdev_ops != &vdev_raidz_ops &&
|
||||
vd->vdev_ops != &vdev_draid_ops) {
|
||||
error = ENOTSUP;
|
||||
break;
|
||||
}
|
||||
if (nvpair_value_uint64(elem, &intval) != 0) {
|
||||
error = EINVAL;
|
||||
break;
|
||||
}
|
||||
vd->vdev_autosit = intval == 1;
|
||||
break;
|
||||
case VDEV_PROP_CHECKSUM_N:
|
||||
if (nvpair_value_uint64(elem, &intval) != 0) {
|
||||
error = EINVAL;
|
||||
@ -6456,6 +6535,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
|
||||
ZPROP_SRC_NONE);
|
||||
}
|
||||
continue;
|
||||
case VDEV_PROP_SIT_OUT:
|
||||
/* Only expose this for a draid or raidz leaf */
|
||||
if (vd->vdev_ops->vdev_op_leaf &&
|
||||
vd->vdev_top != NULL &&
|
||||
(vd->vdev_top->vdev_ops ==
|
||||
&vdev_raidz_ops ||
|
||||
vd->vdev_top->vdev_ops ==
|
||||
&vdev_draid_ops)) {
|
||||
vdev_prop_add_list(outnvl, propname,
|
||||
NULL, vdev_sit_out_reads(vd, 0),
|
||||
ZPROP_SRC_NONE);
|
||||
}
|
||||
continue;
|
||||
case VDEV_PROP_TRIM_SUPPORT:
|
||||
/* only valid for leaf vdevs */
|
||||
if (vd->vdev_ops->vdev_op_leaf) {
|
||||
@ -6506,6 +6598,29 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
|
||||
vdev_prop_add_list(outnvl, propname, strval,
|
||||
intval, src);
|
||||
break;
|
||||
case VDEV_PROP_AUTOSIT:
|
||||
/* Only raidz and draid vdevs can have this property */
|
||||
if (vd->vdev_ops != &vdev_raidz_ops &&
|
||||
vd->vdev_ops != &vdev_draid_ops) {
|
||||
src = ZPROP_SRC_NONE;
|
||||
intval = ZPROP_BOOLEAN_NA;
|
||||
} else {
|
||||
err = vdev_prop_get_int(vd, prop,
|
||||
&intval);
|
||||
if (err && err != ENOENT)
|
||||
break;
|
||||
|
||||
if (intval ==
|
||||
vdev_prop_default_numeric(prop))
|
||||
src = ZPROP_SRC_DEFAULT;
|
||||
else
|
||||
src = ZPROP_SRC_LOCAL;
|
||||
}
|
||||
|
||||
vdev_prop_add_list(outnvl, propname, NULL,
|
||||
intval, src);
|
||||
break;
|
||||
|
||||
case VDEV_PROP_CHECKSUM_N:
|
||||
case VDEV_PROP_CHECKSUM_T:
|
||||
case VDEV_PROP_IO_N:
|
||||
|
||||
@ -22,6 +22,7 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Intel Corporation.
|
||||
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
|
||||
* Copyright (c) 2025, Klara, Inc.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
@ -1996,6 +1997,33 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
|
||||
rc->rc_allow_repair = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (vdev_sit_out_reads(cvd, zio->io_flags)) {
|
||||
rr->rr_outlier_cnt++;
|
||||
ASSERT0(rc->rc_latency_outlier);
|
||||
rc->rc_latency_outlier = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* When the row contains a latency outlier and sufficient parity
|
||||
* exists to reconstruct the column data, then skip reading the
|
||||
* known slow child vdev as a performance optimization.
|
||||
*/
|
||||
if (rr->rr_outlier_cnt > 0 &&
|
||||
(rr->rr_firstdatacol - rr->rr_missingparity) >=
|
||||
(rr->rr_missingdata + 1)) {
|
||||
|
||||
for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
|
||||
raidz_col_t *rc = &rr->rr_col[c];
|
||||
|
||||
if (rc->rc_error == 0 && rc->rc_latency_outlier) {
|
||||
rr->rr_missingdata++;
|
||||
rc->rc_error = SET_ERROR(EAGAIN);
|
||||
rc->rc_skipped = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@ -24,6 +24,7 @@
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2016 Gvozden Nešković. All rights reserved.
|
||||
* Copyright (c) 2025, Klara, Inc.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
@ -355,6 +356,32 @@ unsigned long raidz_expand_max_reflow_bytes = 0;
|
||||
*/
|
||||
uint_t raidz_expand_pause_point = 0;
|
||||
|
||||
/*
|
||||
* This represents the duration for a slow drive read sit out.
|
||||
*/
|
||||
static unsigned long vdev_read_sit_out_secs = 600;
|
||||
|
||||
/*
|
||||
* How often each RAID-Z and dRAID vdev will check for slow disk outliers.
|
||||
* Increasing this interval will reduce the sensitivity of detection (since all
|
||||
* I/Os since the last check are included in the statistics), but will slow the
|
||||
* response to a disk developing a problem.
|
||||
*
|
||||
* Defaults to once per second; setting extremely small values may cause
|
||||
* negative performance effects.
|
||||
*/
|
||||
static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000;
|
||||
|
||||
/*
|
||||
* When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is
|
||||
* used to determine how far out an outlier must be before it counts as an event
|
||||
* worth considering.
|
||||
*
|
||||
* Smaller values will result in more aggressive sitting out of disks that may
|
||||
* have problems, but may significantly increase the rate of spurious sit-outs.
|
||||
*/
|
||||
static uint32_t vdev_raidz_outlier_insensitivity = 50;
|
||||
|
||||
/*
|
||||
* Maximum amount of copy io's outstanding at once.
|
||||
*/
|
||||
@ -2311,6 +2338,41 @@ vdev_raidz_min_asize(vdev_t *vd)
|
||||
vd->vdev_children);
|
||||
}
|
||||
|
||||
/*
|
||||
* return B_TRUE if a read should be skipped due to being too slow.
|
||||
*
|
||||
* In vdev_child_slow_outlier() it looks for outliers based on disk
|
||||
* latency from the most recent child reads. Here we're checking if,
|
||||
* over time, a disk has been an outlier too many times and is
|
||||
* now in a sit out period.
|
||||
*/
|
||||
boolean_t
|
||||
vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags)
|
||||
{
|
||||
if (vdev_read_sit_out_secs == 0)
|
||||
return (B_FALSE);
|
||||
|
||||
/* Avoid skipping a data column read when scrubbing */
|
||||
if (io_flags & ZIO_FLAG_SCRUB)
|
||||
return (B_FALSE);
|
||||
|
||||
if (!vd->vdev_ops->vdev_op_leaf) {
|
||||
boolean_t sitting = B_FALSE;
|
||||
for (int c = 0; c < vd->vdev_children; c++) {
|
||||
sitting |= vdev_sit_out_reads(vd->vdev_child[c],
|
||||
io_flags);
|
||||
}
|
||||
return (sitting);
|
||||
}
|
||||
|
||||
if (vd->vdev_read_sit_out_expire >= gethrestime_sec())
|
||||
return (B_TRUE);
|
||||
|
||||
vd->vdev_read_sit_out_expire = 0;
|
||||
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
void
|
||||
vdev_raidz_child_done(zio_t *zio)
|
||||
{
|
||||
@ -2475,6 +2537,45 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
|
||||
rc->rc_skipped = 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (vdev_sit_out_reads(cvd, zio->io_flags)) {
|
||||
rr->rr_outlier_cnt++;
|
||||
ASSERT0(rc->rc_latency_outlier);
|
||||
rc->rc_latency_outlier = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* When the row contains a latency outlier and sufficient parity
|
||||
* exists to reconstruct the column data, then skip reading the
|
||||
* known slow child vdev as a performance optimization.
|
||||
*/
|
||||
if (rr->rr_outlier_cnt > 0 &&
|
||||
(rr->rr_firstdatacol - rr->rr_missingparity) >=
|
||||
(rr->rr_missingdata + 1)) {
|
||||
|
||||
for (int c = rr->rr_cols - 1; c >= 0; c--) {
|
||||
raidz_col_t *rc = &rr->rr_col[c];
|
||||
|
||||
if (rc->rc_error == 0 && rc->rc_latency_outlier) {
|
||||
if (c >= rr->rr_firstdatacol)
|
||||
rr->rr_missingdata++;
|
||||
else
|
||||
rr->rr_missingparity++;
|
||||
rc->rc_error = SET_ERROR(EAGAIN);
|
||||
rc->rc_skipped = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int c = rr->rr_cols - 1; c >= 0; c--) {
|
||||
raidz_col_t *rc = &rr->rr_col[c];
|
||||
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
|
||||
|
||||
if (rc->rc_error || rc->rc_size == 0)
|
||||
continue;
|
||||
|
||||
if (forceparity ||
|
||||
c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
|
||||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
|
||||
@ -2498,6 +2599,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
|
||||
|
||||
ASSERT3U(prc->rc_devidx, ==, i);
|
||||
vdev_t *cvd = vd->vdev_child[i];
|
||||
|
||||
if (!vdev_readable(cvd)) {
|
||||
prc->rc_error = SET_ERROR(ENXIO);
|
||||
prc->rc_tried = 1; /* don't even try */
|
||||
@ -2774,6 +2876,239 @@ vdev_raidz_worst_error(raidz_row_t *rr)
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the median value from a set of n values
|
||||
*/
|
||||
static uint64_t
|
||||
latency_median_value(const uint64_t *data, size_t n)
|
||||
{
|
||||
uint64_t m;
|
||||
|
||||
if (n % 2 == 0)
|
||||
m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1;
|
||||
else
|
||||
m = data[((n + 1) >> 1) - 1];
|
||||
|
||||
return (m);
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate the outlier fence from a set of n latency values
|
||||
*
|
||||
* fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1)
|
||||
*/
|
||||
static uint64_t
|
||||
latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr)
|
||||
{
|
||||
uint64_t q1 = latency_median_value(&data[0], n >> 1);
|
||||
uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1);
|
||||
|
||||
/*
|
||||
* To avoid detecting false positive outliers when N is small and
|
||||
* the latency values are very close, make sure the IQR
|
||||
* is at least 25% larger than Q1.
|
||||
*/
|
||||
*iqr = MAX(q3 - q1, q1 / 4);
|
||||
|
||||
return (q3 + (*iqr * vdev_raidz_outlier_insensitivity));
|
||||
}
|
||||
#define LAT_CHILDREN_MIN 5
|
||||
#define LAT_OUTLIER_LIMIT 20
|
||||
|
||||
static int
|
||||
latency_compare(const void *arg1, const void *arg2)
|
||||
{
|
||||
const uint64_t *l1 = (uint64_t *)arg1;
|
||||
const uint64_t *l2 = (uint64_t *)arg2;
|
||||
|
||||
return (TREE_CMP(*l1, *l2));
|
||||
}
|
||||
|
||||
void
|
||||
vdev_raidz_sit_child(vdev_t *svd, uint64_t secs)
|
||||
{
|
||||
for (int c = 0; c < svd->vdev_children; c++)
|
||||
vdev_raidz_sit_child(svd->vdev_child[c], secs);
|
||||
|
||||
if (!svd->vdev_ops->vdev_op_leaf)
|
||||
return;
|
||||
|
||||
/* Begin a sit out period for this slow drive */
|
||||
svd->vdev_read_sit_out_expire = gethrestime_sec() +
|
||||
secs;
|
||||
|
||||
/* Count each slow io period */
|
||||
mutex_enter(&svd->vdev_stat_lock);
|
||||
svd->vdev_stat.vs_slow_ios++;
|
||||
mutex_exit(&svd->vdev_stat_lock);
|
||||
}
|
||||
|
||||
void
|
||||
vdev_raidz_unsit_child(vdev_t *vd)
|
||||
{
|
||||
for (int c = 0; c < vd->vdev_children; c++)
|
||||
vdev_raidz_unsit_child(vd->vdev_child[c]);
|
||||
|
||||
if (!vd->vdev_ops->vdev_op_leaf)
|
||||
return;
|
||||
|
||||
vd->vdev_read_sit_out_expire = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check for any latency outlier from latest set of child reads.
|
||||
*
|
||||
* Uses a Tukey's fence, with K = 50, for detecting extreme outliers. This
|
||||
* rule defines extreme outliers as data points outside the fence of the
|
||||
* third quartile plus fifty times the Interquartile Range (IQR). This range
|
||||
* is the distance between the first and third quartile.
|
||||
*
|
||||
* Fifty is an extremely large value for Tukey's fence, but the outliers we're
|
||||
* attempting to detect here are orders of magnitude larger than the
|
||||
* median. This large value should capture any truly faulty disk quickly,
|
||||
* without causing spurious sit-outs.
|
||||
*
|
||||
* To further avoid spurious sit-outs, vdevs must be detected multiple times
|
||||
* as an outlier before they are sat, and outlier counts will gradually decay.
|
||||
* Every nchildren times we have detected an outlier, we subtract 2 from the
|
||||
* outlier count of all children. If detected outliers are close to uniformly
|
||||
* distributed, this will result in the outlier count remaining close to 0
|
||||
* (in expectation; over long enough time-scales, spurious sit-outs are still
|
||||
* possible).
|
||||
*/
|
||||
static void
|
||||
vdev_child_slow_outlier(zio_t *zio)
|
||||
{
|
||||
vdev_t *vd = zio->io_vd;
|
||||
if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 ||
|
||||
vd->vdev_children < LAT_CHILDREN_MIN)
|
||||
return;
|
||||
|
||||
hrtime_t now = getlrtime();
|
||||
uint64_t last = atomic_load_64(&vd->vdev_last_latency_check);
|
||||
|
||||
if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms))
|
||||
return;
|
||||
|
||||
/* Allow a single winner when there are racing callers. */
|
||||
if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last)
|
||||
return;
|
||||
|
||||
int children = vd->vdev_children;
|
||||
uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP);
|
||||
|
||||
for (int c = 0; c < children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
if (cvd->vdev_prev_histo == NULL) {
|
||||
mutex_enter(&cvd->vdev_stat_lock);
|
||||
size_t size =
|
||||
sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
|
||||
cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP);
|
||||
memcpy(cvd->vdev_prev_histo,
|
||||
cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ],
|
||||
size);
|
||||
mutex_exit(&cvd->vdev_stat_lock);
|
||||
}
|
||||
}
|
||||
uint64_t max = 0;
|
||||
vdev_t *svd = NULL;
|
||||
uint_t sitouts = 0;
|
||||
boolean_t skip = B_FALSE, svd_sitting = B_FALSE;
|
||||
for (int c = 0; c < children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
boolean_t sitting = vdev_sit_out_reads(cvd, 0) ||
|
||||
cvd->vdev_state != VDEV_STATE_HEALTHY;
|
||||
|
||||
/* We can't sit out more disks than we have parity */
|
||||
if (sitting && ++sitouts >= vdev_get_nparity(vd))
|
||||
skip = B_TRUE;
|
||||
|
||||
mutex_enter(&cvd->vdev_stat_lock);
|
||||
|
||||
uint64_t *prev_histo = cvd->vdev_prev_histo;
|
||||
uint64_t *histo =
|
||||
cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ];
|
||||
if (skip) {
|
||||
size_t size =
|
||||
sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
|
||||
memcpy(prev_histo, histo, size);
|
||||
mutex_exit(&cvd->vdev_stat_lock);
|
||||
continue;
|
||||
}
|
||||
uint64_t count = 0;
|
||||
lat_data[c] = 0;
|
||||
for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) {
|
||||
uint64_t this_count = histo[i] - prev_histo[i];
|
||||
lat_data[c] += (1ULL << i) * this_count;
|
||||
count += this_count;
|
||||
}
|
||||
size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
|
||||
memcpy(prev_histo, histo, size);
|
||||
mutex_exit(&cvd->vdev_stat_lock);
|
||||
lat_data[c] /= MAX(1, count);
|
||||
|
||||
/* Wait until all disks have been read from */
|
||||
if (lat_data[c] == 0 && !sitting) {
|
||||
skip = B_TRUE;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Keep track of the vdev with largest value */
|
||||
if (lat_data[c] > max) {
|
||||
max = lat_data[c];
|
||||
svd = cvd;
|
||||
svd_sitting = sitting;
|
||||
}
|
||||
}
|
||||
|
||||
if (skip) {
|
||||
kmem_free(lat_data, sizeof (uint64_t) * children);
|
||||
return;
|
||||
}
|
||||
|
||||
qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare);
|
||||
|
||||
uint64_t iqr;
|
||||
uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr);
|
||||
|
||||
ASSERT3U(lat_data[children - 1], ==, max);
|
||||
if (max > fence && !svd_sitting) {
|
||||
ASSERT3U(iqr, >, 0);
|
||||
uint64_t incr = MAX(1, MIN((max - fence) / iqr,
|
||||
LAT_OUTLIER_LIMIT / 4));
|
||||
vd->vdev_outlier_count += incr;
|
||||
if (vd->vdev_outlier_count >= children) {
|
||||
for (int c = 0; c < children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
cvd->vdev_outlier_count -= 2;
|
||||
cvd->vdev_outlier_count = MAX(0,
|
||||
cvd->vdev_outlier_count);
|
||||
}
|
||||
vd->vdev_outlier_count = 0;
|
||||
}
|
||||
/*
|
||||
* Keep track of how many times this child has had
|
||||
* an outlier read. A disk that persitently has a
|
||||
* higher than peers outlier count will be considered
|
||||
* a slow disk.
|
||||
*/
|
||||
svd->vdev_outlier_count += incr;
|
||||
if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) {
|
||||
ASSERT0(svd->vdev_read_sit_out_expire);
|
||||
vdev_raidz_sit_child(svd, vdev_read_sit_out_secs);
|
||||
(void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT,
|
||||
zio->io_spa, svd, NULL, NULL, 0);
|
||||
vdev_dbgmsg(svd, "begin read sit out for %d secs",
|
||||
(int)vdev_read_sit_out_secs);
|
||||
|
||||
for (int c = 0; c < vd->vdev_children; c++)
|
||||
vd->vdev_child[c]->vdev_outlier_count = 0;
|
||||
}
|
||||
}
|
||||
|
||||
kmem_free(lat_data, sizeof (uint64_t) * children);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
|
||||
{
|
||||
@ -3515,6 +3850,9 @@ vdev_raidz_io_done(zio_t *zio)
|
||||
raidz_row_t *rr = rm->rm_row[i];
|
||||
vdev_raidz_io_done_verified(zio, rr);
|
||||
}
|
||||
/* Periodically check for a read outlier */
|
||||
if (zio->io_type == ZIO_TYPE_READ)
|
||||
vdev_child_slow_outlier(zio);
|
||||
zio_checksum_verified(zio);
|
||||
} else {
|
||||
/*
|
||||
@ -5155,3 +5493,10 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
|
||||
"For expanded RAIDZ, automatically start a pool scrub when expansion "
|
||||
"completes");
|
||||
ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW,
|
||||
"Raidz/draid slow disk sit out time period in seconds");
|
||||
ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, ULONG,
|
||||
ZMOD_RW, "Interval to check for slow raidz/draid children");
|
||||
ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT,
|
||||
ZMOD_RW, "How insensitive the slow raidz/draid child check should be");
|
||||
/* END CSTYLED */
|
||||
|
||||
@ -940,10 +940,11 @@ tags = ['functional', 'rename_dirs']
|
||||
|
||||
[tests/functional/replacement]
|
||||
tests = ['attach_import', 'attach_multiple', 'attach_rebuild',
|
||||
'attach_resilver', 'detach', 'rebuild_disabled_feature',
|
||||
'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild',
|
||||
'replace_resilver', 'resilver_restart_001', 'resilver_restart_002',
|
||||
'scrub_cancel']
|
||||
'attach_resilver', 'attach_resilver_sit_out', 'detach',
|
||||
'rebuild_disabled_feature', 'rebuild_multiple', 'rebuild_raidz',
|
||||
'replace_import', 'replace_rebuild', 'replace_resilver',
|
||||
'replace_resilver_sit_out', 'resilver_restart_001',
|
||||
'resilver_restart_002', 'scrub_cancel']
|
||||
tags = ['functional', 'replacement']
|
||||
|
||||
[tests/functional/reservation]
|
||||
|
||||
@ -109,7 +109,8 @@ tags = ['functional', 'direct']
|
||||
[tests/functional/events:Linux]
|
||||
tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill',
|
||||
'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config',
|
||||
'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple']
|
||||
'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple',
|
||||
'slow_vdev_sit_out', 'slow_vdev_sit_out_neg', 'slow_vdev_degraded_sit_out']
|
||||
tags = ['functional', 'events']
|
||||
|
||||
[tests/functional/fallocate:Linux]
|
||||
|
||||
@ -1112,6 +1112,16 @@ function get_pool_prop # property pool
|
||||
zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool"
|
||||
}
|
||||
|
||||
# Get the specified vdev property in parsable format or fail
#
# $1 property name
# $2 pool name
# $3 vdev name
#
# Prints the raw (-Hp) property value on stdout; aborts the test via
# log_fail if the zpool get command fails.
function get_vdev_prop
{
	typeset prop="$1"
	typeset pool="$2"
	typeset vdev="$3"

	zpool get -Hpo value "$prop" "$pool" "$vdev" || log_fail "zpool get $prop $pool $vdev"
}
|
||||
|
||||
# Return 0 if a pool exists; $? otherwise
|
||||
#
|
||||
# $1 - pool name
|
||||
@ -1970,6 +1980,28 @@ function wait_vdev_state # pool disk state timeout
|
||||
return 1
|
||||
}
|
||||
|
||||
#
# Wait for vdev 'sit_out' property to be cleared.
#
# $1 pool name (defaults to $TESTPOOL)
# $2 vdev name
# $3 timeout in seconds (defaults to 300)
#
# Polls once per second; returns 0 as soon as sit_out reads "off",
# or 1 if the timeout expires first.
#
function wait_sit_out #pool vdev timeout
{
	typeset pool=${1:-$TESTPOOL}
	typeset vdev="$2"
	typeset timeout=${3:-300}
	for (( timer = 0; timer < $timeout; timer++ )); do
		if [ "$(get_vdev_prop sit_out "$pool" "$vdev")" = "off" ]; then
			return 0
		fi
		sleep 1;
	done

	return 1
}
|
||||
|
||||
#
|
||||
# Check the output of 'zpool status -v <pool>',
|
||||
# and to see if the content of <token> contain the <keyword> specified.
|
||||
|
||||
@ -72,6 +72,9 @@ MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval
|
||||
OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize
|
||||
PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable
|
||||
RAIDZ_EXPAND_MAX_REFLOW_BYTES vdev.expand_max_reflow_bytes raidz_expand_max_reflow_bytes
|
||||
READ_SIT_OUT_SECS vdev.read_sit_out_secs vdev_read_sit_out_secs
|
||||
SIT_OUT_CHECK_INTERVAL vdev.raidz_outlier_check_interval_ms vdev_raidz_outlier_check_interval_ms
|
||||
SIT_OUT_INSENSITIVITY vdev.raidz_outlier_insensitivity vdev_raidz_outlier_insensitivity
|
||||
REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled
|
||||
REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress
|
||||
REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment
|
||||
|
||||
@ -1525,6 +1525,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||
functional/events/events_001_pos.ksh \
|
||||
functional/events/events_002_pos.ksh \
|
||||
functional/events/setup.ksh \
|
||||
functional/events/slow_vdev_degraded_sit_out.ksh \
|
||||
functional/events/slow_vdev_sit_out.ksh \
|
||||
functional/events/slow_vdev_sit_out_neg.ksh \
|
||||
functional/events/zed_cksum_config.ksh \
|
||||
functional/events/zed_cksum_reported.ksh \
|
||||
functional/events/zed_diagnose_multiple.ksh \
|
||||
@ -1937,6 +1940,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||
functional/replacement/attach_multiple.ksh \
|
||||
functional/replacement/attach_rebuild.ksh \
|
||||
functional/replacement/attach_resilver.ksh \
|
||||
functional/replacement/attach_resilver_sit_out.ksh \
|
||||
functional/replacement/cleanup.ksh \
|
||||
functional/replacement/detach.ksh \
|
||||
functional/replacement/rebuild_disabled_feature.ksh \
|
||||
@ -1945,6 +1949,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||
functional/replacement/replace_import.ksh \
|
||||
functional/replacement/replace_rebuild.ksh \
|
||||
functional/replacement/replace_resilver.ksh \
|
||||
functional/replacement/replace_resilver_sit_out.ksh \
|
||||
functional/replacement/resilver_restart_001.ksh \
|
||||
functional/replacement/resilver_restart_002.ksh \
|
||||
functional/replacement/scrub_cancel.ksh \
|
||||
|
||||
106
tests/zfs-tests/tests/functional/events/slow_vdev_degraded_sit_out.ksh
Executable file
106
tests/zfs-tests/tests/functional/events/slow_vdev_degraded_sit_out.ksh
Executable file
@ -0,0 +1,106 @@
|
||||
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
# Copyright (c) 2025 by Klara, Inc.

# DESCRIPTION:
#	Verify that vdevs 'sit out' when they are slow
#
# STRATEGY:
#	1. Create various raidz/draid pools
#	2. Degrade/fault one of the disks.
#	3. Inject delays into one of the disks
#	4. Verify disk is set to 'sit out' for awhile.
#	5. Wait for READ_SIT_OUT_SECS and verify sit out state is lifted.
#

. $STF_SUITE/include/libtest.shlib

function cleanup
{
	restore_tunable READ_SIT_OUT_SECS
	restore_tunable SIT_OUT_CHECK_INTERVAL
	log_must zinject -c all
	log_must zpool events -c
	destroy_pool $TESTPOOL2
	log_must rm -f $TEST_BASE_DIR/vdev.$$.*
}

log_assert "Verify sit_out works"

log_onexit cleanup

# shorten sit out period for testing
save_tunable READ_SIT_OUT_SECS
set_tunable32 READ_SIT_OUT_SECS 5

save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20

log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9}

# Only double/triple-parity layouts: one disk is degraded/faulted and a
# second must still be able to sit out.
for raidtype in raidz2 raidz3 draid2 draid3 ; do
	log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9}
	log_must zpool set autosit=on $TESTPOOL2 "${raidtype}-0"
	log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=400
	log_must zpool export $TESTPOOL2
	log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2

	BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9
	SLOW_VDEV=$TEST_BASE_DIR/vdev.$$.8

	# Initial state should not be sitting out
	log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "off" ]]

	# Delay our reads 200ms to trigger sit out
	log_must zinject -d $SLOW_VDEV -D200:1 -T read $TESTPOOL2
	# Randomly degrade or fault a second disk
	type=$((RANDOM % 2))
	[[ "$type" -eq "0" ]] && action="degrade" || action="fault"
	log_must zinject -d $BAD_VDEV -A $action -T read $TESTPOOL2

	# Do some reads and wait for us to sit out
	for i in {0..99} ; do
		dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
		dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null

		sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)
		if [[ "$sit_out" == "on" ]] ; then
			break
		fi
	done

	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "on"

	# Clear fault injection
	log_must zinject -c all

	# Wait for us to exit our sit out period
	log_must wait_sit_out $TESTPOOL2 $SLOW_VDEV 10

	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "off"
	destroy_pool $TESTPOOL2
	# Scrub the label of the injected-fault disk so the next iteration
	# starts from a clean device.
	log_must zpool labelclear -f $BAD_VDEV
done

log_pass "sit_out works correctly"
|
||||
102
tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh
Executable file
102
tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh
Executable file
@ -0,0 +1,102 @@
|
||||
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.

# DESCRIPTION:
#	Verify that vdevs 'sit out' when they are slow
#
# STRATEGY:
#	1. Create various raidz/draid pools
#	2. Inject delays into one of the disks
#	3. Verify disk is set to 'sit out' for awhile.
#	4. Wait for READ_SIT_OUT_SECS and verify sit out state is lifted.
#

. $STF_SUITE/include/libtest.shlib

function cleanup
{
	restore_tunable READ_SIT_OUT_SECS
	restore_tunable SIT_OUT_CHECK_INTERVAL
	log_must zinject -c all
	log_must zpool events -c
	destroy_pool $TESTPOOL2
	log_must rm -f $TEST_BASE_DIR/vdev.$$.*
}

log_assert "Verify sit_out works"

log_onexit cleanup

# shorten sit out period for testing
save_tunable READ_SIT_OUT_SECS
set_tunable32 READ_SIT_OUT_SECS 5

save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20

log_must truncate -s200M $TEST_BASE_DIR/vdev.$$.{0..9}

# All parity levels qualify here since only one disk is slowed.
for raidtype in raidz raidz2 raidz3 draid1 draid2 draid3 ; do
	log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9}
	log_must zpool set autosit=on $TESTPOOL2 "${raidtype}-0"
	log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=600
	log_must zpool export $TESTPOOL2
	log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2

	BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9

	# Initial state should not be sitting out
	log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off" ]]

	# Delay our reads 200ms to trigger sit out
	log_must zinject -d $BAD_VDEV -D200:1 -T read $TESTPOOL2

	# Do some reads and wait for us to sit out
	for i in {0..99} ; do
		dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
		dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null &
		dd if=/$TESTPOOL2/bigfile skip=$((i + 200)) bs=2M count=1 of=/dev/null

		sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)
		if [[ "$sit_out" == "on" ]] ; then
			break
		fi
	done

	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "on"

	# Clear fault injection
	log_must zinject -c all

	# Wait for us to exit our sit out period
	log_must wait_sit_out $TESTPOOL2 $BAD_VDEV 10

	# Verify sit_out was cleared during wait_sit_out
	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off"

	destroy_pool $TESTPOOL2
done

log_pass "sit_out works correctly"
|
||||
116
tests/zfs-tests/tests/functional/events/slow_vdev_sit_out_neg.ksh
Executable file
116
tests/zfs-tests/tests/functional/events/slow_vdev_sit_out_neg.ksh
Executable file
@ -0,0 +1,116 @@
|
||||
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
# Copyright (c) 2025 by Klara, Inc.

# DESCRIPTION:
#	Verify that we don't sit out too many vdevs
#
# STRATEGY:
#	1. Create draid2 pool
#	2. Inject delays into three of the disks
#	3. Do reads to trigger sit-outs
#	4. Verify exactly 2 disks sit out
#

. $STF_SUITE/include/libtest.shlib

function cleanup
{
	restore_tunable READ_SIT_OUT_SECS
	restore_tunable SIT_OUT_CHECK_INTERVAL
	log_must zinject -c all
	log_must zpool events -c
	destroy_pool $TESTPOOL2
	log_must rm -f $TEST_BASE_DIR/vdev.$$.*
}

log_assert "Verify sit_out works"

log_onexit cleanup

# Save READ_SIT_OUT_SECS so cleanup's restore_tunable has a value to
# restore (the default sit out period is long enough for this test).
save_tunable READ_SIT_OUT_SECS

save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20

log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9}

# draid2 has two parity devices, so at most two children may sit out.
log_must zpool create $TESTPOOL2 draid2 $TEST_BASE_DIR/vdev.$$.{0..9}
log_must zpool set autosit=on $TESTPOOL2 draid2-0
log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=400
log_must zpool export $TESTPOOL2
log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2

BAD_VDEV1=$TEST_BASE_DIR/vdev.$$.7
BAD_VDEV2=$TEST_BASE_DIR/vdev.$$.8
BAD_VDEV3=$TEST_BASE_DIR/vdev.$$.9

# Initial state should not be sitting out
log_must eval [[ "$(get_vdev_prop autosit $TESTPOOL2 draid2-0)" == "on" ]]
log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1)" == "off" ]]
log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2)" == "off" ]]
log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3)" == "off" ]]

# Delay our reads 200ms to trigger sit out
log_must zinject -d $BAD_VDEV1 -D200:1 -T read $TESTPOOL2

# Do some reads and wait for us to sit out
for i in {0..99} ; do
	dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
	dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null

	sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1)
	if [[ "$sit_out" == "on" ]] ; then
		break
	fi
done
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1)" == "on"

log_must zinject -d $BAD_VDEV2 -D200:1 -T read $TESTPOOL2
# Do some reads and wait for us to sit out
for i in {0..99} ; do
	dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
	dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null

	sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2)
	if [[ "$sit_out" == "on" ]] ; then
		break
	fi
done
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2)" == "on"

log_must zinject -d $BAD_VDEV3 -D200:1 -T read $TESTPOOL2
# Do some reads and verify the third disk does NOT sit out -- we are
# already sitting out as many children as we have parity.
for i in {0..99} ; do
	dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
	dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null

	sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3)
	if [[ "$sit_out" == "on" ]] ; then
		break
	fi
done
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3)" == "off"


log_pass "sit_out works correctly"
|
||||
189
tests/zfs-tests/tests/functional/replacement/attach_resilver_sit_out.ksh
Executable file
189
tests/zfs-tests/tests/functional/replacement/attach_resilver_sit_out.ksh
Executable file
@ -0,0 +1,189 @@
|
||||
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
# Use is subject to license terms.
#

#
# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
# Copyright (c) 2025, Klara, Inc.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg

#
# DESCRIPTION:
#	Attaching disks while a disk is sitting out reads should pass
#
# STRATEGY:
#	1. Create raidz pools
#	2. Make one disk slower and trigger a read sit out for that disk
#	3. Start some random I/O
#	4. Attach a disk to the pool.
#	5. Verify the integrity of the file system and the resilvering.

verify_runnable "global"

save_tunable READ_SIT_OUT_SECS
set_tunable32 READ_SIT_OUT_SECS 120
save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20

function cleanup
{
	restore_tunable READ_SIT_OUT_SECS
	restore_tunable SIT_OUT_CHECK_INTERVAL
	log_must zinject -c all
	log_must zpool events -c

	if [[ -n "$child_pids" ]]; then
		for wait_pid in $child_pids; do
			kill $wait_pid
		done
	fi

	if poolexists $TESTPOOL1; then
		destroy_pool $TESTPOOL1
	fi

	[[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
}

log_assert "Replacing a disk during I/O with a sit out completes."

options=""
options_display="default options"

log_onexit cleanup

[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "

[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "

[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "

[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "

[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "

options="$options -r "

[[ -n "$options" ]] && options_display=$options

child_pids=""

#
# Start background file_trunc workloads, attach a new disk to the given
# top-level vdev while a slow child is sitting out, then verify pool and
# filesystem integrity.
#
# $1 top-level vdev to attach to
# $2 new disk to attach
#
function attach_test
{
	typeset vdev=$1
	typeset disk=$2
	# Number of background file_trunc workers (was previously unset,
	# which left the workload loop without an upper bound value).
	typeset -i iters=2

	typeset i=0
	while [[ $i -lt $iters ]]; do
		log_note "Invoking file_trunc with: $options_display on $TESTFILE.$i"
		file_trunc $options $TESTDIR/$TESTFILE.$i &
		typeset pid=$!

		sleep 1

		child_pids="$child_pids $pid"
		((i = i + 1))
	done

	# attach disk with a slow drive still present
	SECONDS=0
	log_must zpool attach -w $TESTPOOL1 $vdev $disk
	log_note took $SECONDS seconds to attach disk

	for wait_pid in $child_pids
	do
		kill $wait_pid
	done
	child_pids=""

	log_must zinject -c all
	log_must zpool export $TESTPOOL1
	log_must zpool import -d $TESTDIR $TESTPOOL1
	log_must zfs umount $TESTPOOL1/$TESTFS1
	log_must zdb -cdui $TESTPOOL1/$TESTFS1
	log_must zfs mount $TESTPOOL1/$TESTFS1
	verify_pool $TESTPOOL1
}

DEVSIZE="150M"
specials_list=""
i=0
while [[ $i != 10 ]]; do
	truncate -s $DEVSIZE $TESTDIR/$TESTFILE1.$i
	specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"

	((i = i + 1))
done

slow_disk=$TESTDIR/$TESTFILE1.3
log_must truncate -s $DEVSIZE $TESTDIR/$REPLACEFILE

# Test file size in MB
count=200

for type in "raidz1" "raidz2" "raidz3" ; do
	create_pool $TESTPOOL1 $type $specials_list
	log_must zpool set autosit=on $TESTPOOL1 "${type}-0"
	log_must zfs create -o primarycache=none -o recordsize=512K \
	    $TESTPOOL1/$TESTFS1
	log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1

	log_must dd if=/dev/urandom of=/$TESTDIR1/bigfile bs=1M count=$count

	# Make one disk 100ms slower to trigger a sit out
	log_must zinject -d $slow_disk -D100:1 -T read $TESTPOOL1

	# Do some reads and wait for sit out on slow disk
	SECONDS=0
	typeset -i size=0
	for i in $(seq 1 $count) ; do
		dd if=/$TESTDIR1/bigfile skip=$i bs=1M count=1 of=/dev/null
		size=$i

		sit_out=$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)
		if [[ "$sit_out" == "on" ]] ; then
			break
		fi
	done

	log_must test "$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)" == "on"
	log_note took $SECONDS seconds to reach sit out reading ${size}M
	log_must zpool status -s $TESTPOOL1

	typeset top=$(zpool status -j | jq -r ".pools.$TESTPOOL1.vdevs[].vdevs[].name")
	attach_test $top $TESTDIR/$REPLACEFILE

	log_must eval "zpool iostat -v $TESTPOOL1 | grep \"$REPLACEFILE\""

	destroy_pool $TESTPOOL1
	log_must rm -rf /$TESTPOOL1
done

log_pass
|
||||
199
tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh
Executable file
199
tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh
Executable file
@ -0,0 +1,199 @@
|
||||
#!/bin/ksh -p
|
||||
# SPDX-License-Identifier: CDDL-1.0
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
# Use is subject to license terms.
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
|
||||
# Copyright (c) 2025, Klara, Inc.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/replacement/replacement.cfg
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Replacing disks while a disk is sitting out reads should pass
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create raidz and draid pools
|
||||
# 2. Make one disk slower and trigger a read sit out for that disk
|
||||
# 3. Start some random I/O
|
||||
# 4. Replace a disk in the pool with another disk.
|
||||
# 5. Verify the integrity of the file system and the resilvering.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
save_tunable READ_SIT_OUT_SECS
|
||||
set_tunable32 READ_SIT_OUT_SECS 120
|
||||
save_tunable SIT_OUT_CHECK_INTERVAL
|
||||
set_tunable64 SIT_OUT_CHECK_INTERVAL 20
|
||||
|
||||
function cleanup
|
||||
{
|
||||
restore_tunable READ_SIT_OUT_SECS
|
||||
restore_tunable SIT_OUT_CHECK_INTERVAL
|
||||
log_must zinject -c all
|
||||
log_must zpool events -c
|
||||
|
||||
if [[ -n "$child_pids" ]]; then
|
||||
for wait_pid in $child_pids
|
||||
do
|
||||
kill $wait_pid
|
||||
done
|
||||
fi
|
||||
|
||||
if poolexists $TESTPOOL1; then
|
||||
destroy_pool $TESTPOOL1
|
||||
fi
|
||||
|
||||
[[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
|
||||
}
|
||||
|
||||
log_assert "Replacing a disk during I/O with a sit out completes."
|
||||
|
||||
options=""
|
||||
options_display="default options"
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
|
||||
|
||||
[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
|
||||
|
||||
[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
|
||||
|
||||
[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
|
||||
|
||||
[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
|
||||
|
||||
options="$options -r "
|
||||
|
||||
[[ -n "$options" ]] && options_display=$options
|
||||
|
||||
child_pids=""
|
||||
|
||||
#
# Start background file_trunc I/O workers, replace disk1 with disk2
# (as a sequential rebuild when repl_type is "seq"), then stop the
# workers and verify filesystem and pool integrity.
#
# $1 - disk to replace
# $2 - replacement disk
# $3 - replacement type: "replace" (healing) or "seq" (sequential)
#
function replace_test
{
	typeset -i iters=2
	typeset disk1=$1
	typeset disk2=$2
	typeset repl_type=$3

	# Launch background random I/O against the pool.
	typeset i=0
	while [[ $i -lt $iters ]]; do
		log_note "Invoking file_trunc with: $options_display on $TESTFILE.$i"
		file_trunc $options $TESTDIR/$TESTFILE.$i &
		typeset pid=$!

		sleep 1

		child_pids="$child_pids $pid"
		((i = i + 1))
	done

	typeset repl_flag="-w"
	if [[ "$repl_type" == "seq" ]]; then
		repl_flag="-ws"
	fi
	# replace disk with a slow drive still present
	SECONDS=0
	log_must zpool replace $repl_flag $TESTPOOL1 $disk1 $disk2
	log_note took $SECONDS seconds to replace disk

	# Stop the background workers before integrity checking.
	for wait_pid in $child_pids
	do
		kill $wait_pid
	done
	child_pids=""

	# Clear injected delays and verify integrity on a fresh import.
	log_must zinject -c all
	log_must zpool export $TESTPOOL1
	log_must zpool import -d $TESTDIR $TESTPOOL1
	log_must zfs umount $TESTPOOL1/$TESTFS1
	log_must zdb -cdui $TESTPOOL1/$TESTFS1
	log_must zfs mount $TESTPOOL1/$TESTFS1
	verify_pool $TESTPOOL1
}
DEVSIZE="150M"
specials_list=""
i=0
# Create 10 file-backed vdevs to build the pools from.
while [[ $i != 10 ]]; do
	log_must truncate -s $DEVSIZE $TESTDIR/$TESTFILE1.$i
	specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"

	((i = i + 1))
done

slow_disk=$TESTDIR/$TESTFILE1.3
log_must truncate -s $DEVSIZE $TESTDIR/$REPLACEFILE

# Test file size in MB
count=400

for type in "raidz2" "raidz3" "draid2"; do
	create_pool $TESTPOOL1 $type $specials_list
	log_must zpool set autosit=on $TESTPOOL1 "${type}-0"
	log_must zfs create -o primarycache=none -o recordsize=512K \
	    $TESTPOOL1/$TESTFS1
	log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1

	log_must dd if=/dev/urandom of=/$TESTDIR1/bigfile bs=1M count=$count

	# Make one disk 100ms slower to trigger a sit out
	log_must zinject -d $slow_disk -D100:1 -T read $TESTPOOL1

	# Do some reads and wait for sit out on slow disk
	SECONDS=0
	typeset -i size=0
	for i in $(seq 1 $count) ; do
		dd if=/$TESTDIR1/bigfile skip=$i bs=1M count=1 of=/dev/null
		size=$i

		sit_out=$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)
		if [[ "$sit_out" == "on" ]] ; then
			break
		fi
	done
	log_must test "$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)" == "on"
	log_note took $SECONDS seconds to reach sit out reading ${size}M
	log_must zpool status -s $TESTPOOL1

	# For draid pools, randomly exercise the sequential rebuild path.
	typeset repl_type="replace"
	if [[ "$type" == "draid2" && $((RANDOM % 2)) -eq 0 ]]; then
		repl_type="seq"
	fi
	replace_test $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE $repl_type

	# Confirm the replacement vdev is now part of the pool.
	log_must eval "zpool iostat -v $TESTPOOL1 | grep \"$REPLACEFILE\""

	destroy_pool $TESTPOOL1
	log_must rm -rf /$TESTPOOL1
done

log_pass