Detect a slow raidz child during reads

A single slow responding disk can affect the overall read
performance of a raidz group.  When a raidz child disk is
determined to be a persistent slow outlier, then have it
sit out during reads for a period of time. The raidz group
can use parity to reconstruct the data that was skipped.

Each time a slow disk is placed into a sit out period, its
`vdev_stat.vs_slow_ios` count is incremented and a zevent
of class `ereport.fs.zfs.delay` is posted.

The length of the sit out period can be changed using the
`vdev_read_sit_out_secs` module parameter.  Setting it to
zero disables slow outlier detection.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Contributions-by: Don Brady <don.brady@klarasystems.com>
Contributions-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #17227
This commit is contained in:
Paul Dagnelie
2025-08-27 16:41:48 -07:00
committed by Brian Behlendorf
parent 0df85ec27c
commit df55ba7c49
28 changed files with 1399 additions and 13 deletions
+32
View File
@@ -1112,6 +1112,16 @@ function get_pool_prop # property pool
zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool"
}
# Print the specified vdev property in parsable format, or call log_fail
# if 'zpool get' does not succeed.
#
# $1 - property name
# $2 - pool name
# $3 - vdev name
function get_vdev_prop
{
	typeset property="$1"
	typeset poolname="$2"
	typeset vdevname="$3"

	if ! zpool get -Hpo value "$property" "$poolname" "$vdevname"; then
		log_fail "zpool get $property $poolname $vdevname"
	fi
}
# Return 0 if a pool exists; $? otherwise
#
# $1 - pool name
@@ -1970,6 +1980,28 @@ function wait_vdev_state # pool disk state timeout
return 1
}
#
# Wait for vdev 'sit_out' property to be cleared.
#
# $1 pool name (defaults to $TESTPOOL)
# $2 vdev name
# $3 timeout in seconds (defaults to 300)
#
# Returns 0 as soon as the property reads "off", or 1 if it is still not
# "off" after polling once per second for the whole timeout.
#
function wait_sit_out #pool vdev timeout
{
	typeset pool=${1:-$TESTPOOL}
	typeset vdev="$2"
	typeset timeout=${3:-300}
	# Keep the loop counter local so polling cannot clobber a variable
	# named 'timer' in the calling test.
	typeset -i timer

	for (( timer = 0; timer < timeout; timer++ )); do
		if [[ "$(get_vdev_prop sit_out "$pool" "$vdev")" == "off" ]]; then
			return 0
		fi
		sleep 1
	done

	return 1
}
#
# Check the output of 'zpool status -v <pool>',
# and to see if the content of <token> contain the <keyword> specified.
+3
View File
@@ -72,6 +72,9 @@ MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval
OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize
PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable
RAIDZ_EXPAND_MAX_REFLOW_BYTES vdev.expand_max_reflow_bytes raidz_expand_max_reflow_bytes
READ_SIT_OUT_SECS vdev.read_sit_out_secs vdev_read_sit_out_secs
SIT_OUT_CHECK_INTERVAL vdev.raidz_outlier_check_interval_ms vdev_raidz_outlier_check_interval_ms
SIT_OUT_INSENSITIVITY vdev.raidz_outlier_insensitivity vdev_raidz_outlier_insensitivity
REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled
REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress
REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment
+5
View File
@@ -1525,6 +1525,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/events/events_001_pos.ksh \
functional/events/events_002_pos.ksh \
functional/events/setup.ksh \
functional/events/slow_vdev_degraded_sit_out.ksh \
functional/events/slow_vdev_sit_out.ksh \
functional/events/slow_vdev_sit_out_neg.ksh \
functional/events/zed_cksum_config.ksh \
functional/events/zed_cksum_reported.ksh \
functional/events/zed_diagnose_multiple.ksh \
@@ -1937,6 +1940,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/replacement/attach_multiple.ksh \
functional/replacement/attach_rebuild.ksh \
functional/replacement/attach_resilver.ksh \
functional/replacement/attach_resilver_sit_out.ksh \
functional/replacement/cleanup.ksh \
functional/replacement/detach.ksh \
functional/replacement/rebuild_disabled_feature.ksh \
@@ -1945,6 +1949,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/replacement/replace_import.ksh \
functional/replacement/replace_rebuild.ksh \
functional/replacement/replace_resilver.ksh \
functional/replacement/replace_resilver_sit_out.ksh \
functional/replacement/resilver_restart_001.ksh \
functional/replacement/resilver_restart_002.ksh \
functional/replacement/scrub_cancel.ksh \
@@ -0,0 +1,106 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
# Copyright (c) 2025 by Klara, Inc.
# DESCRIPTION:
# Verify that slow vdevs 'sit out' even when another disk in the
# pool is degraded or faulted
#
# STRATEGY:
# 1. Create various raidz/draid pools
# 2. Degrade/fault one of the disks.
# 3. Inject delays into one of the disks
# 4. Verify disk is set to 'sit out' for awhile.
# 5. Wait for READ_SIT_OUT_SECS and verify sit out state is lifted.
#
. $STF_SUITE/include/libtest.shlib
# Undo the test's changes: restore both tunables, clear zinject handlers
# and pool events, destroy the test pool and remove the vdev.$$.* files.
function cleanup
{
	typeset tunable

	for tunable in READ_SIT_OUT_SECS SIT_OUT_CHECK_INTERVAL; do
		restore_tunable $tunable
	done

	log_must zinject -c all
	log_must zpool events -c

	destroy_pool $TESTPOOL2
	log_must rm -f $TEST_BASE_DIR/vdev.$$.*
}
log_assert "Verify sit_out works"
log_onexit cleanup

# shorten sit out period for testing
save_tunable READ_SIT_OUT_SECS
set_tunable32 READ_SIT_OUT_SECS 5

# use a 20ms outlier check interval
save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20

log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9}

for raidtype in raidz2 raidz3 draid2 draid3 ; do
	log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9}
	log_must zpool set autosit=on $TESTPOOL2 "${raidtype}-0"
	log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=400

	# Export and re-import the pool (presumably so the reads below are
	# not satisfied from cached data -- confirm).
	log_must zpool export $TESTPOOL2
	log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2

	BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9
	SLOW_VDEV=$TEST_BASE_DIR/vdev.$$.8

	# Initial state should not be sitting out
	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "off"

	# Delay our reads 200ms to trigger sit out
	log_must zinject -d $SLOW_VDEV -D200:1 -T read $TESTPOOL2

	# Randomly degrade or fault a second disk
	type=$((RANDOM % 2))
	if [[ "$type" -eq "0" ]]; then
		action="degrade"
	else
		action="fault"
	fi
	log_must zinject -d $BAD_VDEV -A $action -T read $TESTPOOL2

	# Do some reads and wait for us to sit out
	for i in {0..99} ; do
		dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
		dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null

		sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)
		if [[ "$sit_out" == "on" ]] ; then
			break
		fi
	done

	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "on"

	# Reap the background readers so none of them is still using the
	# pool when the injection is cleared and the pool is destroyed.
	wait

	# Clear fault injection
	log_must zinject -c all

	# Wait for us to exit our sit out period
	log_must wait_sit_out $TESTPOOL2 $SLOW_VDEV 10

	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "off"
	destroy_pool $TESTPOOL2
	log_must zpool labelclear -f $BAD_VDEV
done

log_pass "sit_out works correctly"
@@ -0,0 +1,102 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
# DESCRIPTION:
# Verify that vdevs 'sit out' when they are slow
#
# STRATEGY:
# 1. Create various raidz/draid pools
# 2. Inject delays into one of the disks
# 3. Verify disk is set to 'sit out' for awhile.
# 4. Wait for READ_SIT_OUT_SECS and verify sit out state is lifted.
#
. $STF_SUITE/include/libtest.shlib
# Test teardown: put the saved tunables back, drop all zinject handlers
# and pending events, then remove the pool and the vdev.$$.* files.
function cleanup
{
	typeset name

	for name in READ_SIT_OUT_SECS SIT_OUT_CHECK_INTERVAL; do
		restore_tunable $name
	done

	log_must zinject -c all
	log_must zpool events -c

	destroy_pool $TESTPOOL2
	log_must rm -f $TEST_BASE_DIR/vdev.$$.*
}
log_assert "Verify sit_out works"
log_onexit cleanup

# shorten sit out period for testing
save_tunable READ_SIT_OUT_SECS
set_tunable32 READ_SIT_OUT_SECS 5

# use a 20ms outlier check interval
save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20

log_must truncate -s200M $TEST_BASE_DIR/vdev.$$.{0..9}

for raidtype in raidz raidz2 raidz3 draid1 draid2 draid3 ; do
	log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9}
	log_must zpool set autosit=on $TESTPOOL2 "${raidtype}-0"
	log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=600

	# Export and re-import the pool (presumably so the reads below are
	# not satisfied from cached data -- confirm).
	log_must zpool export $TESTPOOL2
	log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2

	BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9

	# Initial state should not be sitting out
	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off"

	# Delay our reads 200ms to trigger sit out
	log_must zinject -d $BAD_VDEV -D200:1 -T read $TESTPOOL2

	# Do some reads and wait for us to sit out
	for i in {0..99} ; do
		dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
		dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null &
		dd if=/$TESTPOOL2/bigfile skip=$((i + 200)) bs=2M count=1 of=/dev/null

		sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)
		if [[ "$sit_out" == "on" ]] ; then
			break
		fi
	done

	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "on"

	# Reap the background readers so none of them is still using the
	# pool when the injection is cleared and the pool is destroyed.
	wait

	# Clear fault injection
	log_must zinject -c all

	# Wait for us to exit our sit out period
	log_must wait_sit_out $TESTPOOL2 $BAD_VDEV 10

	# Verify sit_out was cleared during wait_sit_out
	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off"

	destroy_pool $TESTPOOL2
done

log_pass "sit_out works correctly"
@@ -0,0 +1,116 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
# Copyright (c) 2025 by Klara, Inc.
# DESCRIPTION:
# Verify that we don't sit out too many vdevs
#
# STRATEGY:
# 1. Create draid2 pool
# 2. Inject delays into three of the disks
# 3. Do reads to trigger sit-outs
# 4. Verify exactly 2 disks sit out
#
. $STF_SUITE/include/libtest.shlib
# Test teardown: restore the tunable this script saved, clear zinject
# handlers and pool events, destroy the pool and remove the vdev.$$.*
# files.
function cleanup
{
	# Only SIT_OUT_CHECK_INTERVAL is saved by this script; the sit out
	# period (READ_SIT_OUT_SECS) is never saved or changed here, so
	# there is nothing to restore for it.
	restore_tunable SIT_OUT_CHECK_INTERVAL

	log_must zinject -c all
	log_must zpool events -c

	destroy_pool $TESTPOOL2
	log_must rm -f $TEST_BASE_DIR/vdev.$$.*
}
#
# Read from the test file until the given vdev reports sit_out=on, or
# until 100 read rounds have been issued.  The caller checks the final
# property value.
#
# $1 vdev name
#
function read_until_sit_out # vdev
{
	typeset vdev="$1"
	typeset -i i

	for i in {0..99} ; do
		dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null &
		dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null

		if [[ "$(get_vdev_prop sit_out $TESTPOOL2 $vdev)" == "on" ]] ; then
			break
		fi
	done

	# Reap the background readers before the next step
	wait
}

log_assert "Verify sit_out works"
log_onexit cleanup

# use a 20ms outlier check interval
save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20

log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9}

log_must zpool create $TESTPOOL2 draid2 $TEST_BASE_DIR/vdev.$$.{0..9}
log_must zpool set autosit=on $TESTPOOL2 draid2-0
log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=400
log_must zpool export $TESTPOOL2
log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2

BAD_VDEV1=$TEST_BASE_DIR/vdev.$$.7
BAD_VDEV2=$TEST_BASE_DIR/vdev.$$.8
BAD_VDEV3=$TEST_BASE_DIR/vdev.$$.9

# Initial state should not be sitting out
log_must test "$(get_vdev_prop autosit $TESTPOOL2 draid2-0)" == "on"
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1)" == "off"
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2)" == "off"
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3)" == "off"

# Delay our reads 200ms to trigger sit out
log_must zinject -d $BAD_VDEV1 -D200:1 -T read $TESTPOOL2

# Do some reads and wait for us to sit out
read_until_sit_out $BAD_VDEV1
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1)" == "on"

# Slow down a second disk; it should sit out as well
log_must zinject -d $BAD_VDEV2 -D200:1 -T read $TESTPOOL2
read_until_sit_out $BAD_VDEV2
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2)" == "on"

# Slow down a third disk; with two disks already sitting out in a
# draid2 pool, this one must not sit out
log_must zinject -d $BAD_VDEV3 -D200:1 -T read $TESTPOOL2
read_until_sit_out $BAD_VDEV3
log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3)" == "off"

log_pass "sit_out works correctly"
@@ -0,0 +1,189 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
# Copyright (c) 2025, Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
# Attaching disks while a disk is sitting out reads should pass
#
# STRATEGY:
# 1. Create raidz pools
# 2. Make one disk slower and trigger a read sit out for that disk
# 3. Start some random I/O
# 4. Attach a disk to the pool.
# 5. Verify the integrity of the file system and the resilvering.
verify_runnable "global"
# Save the current tunable values (cleanup restores them), then use a
# 120 second sit out period -- presumably long enough for the slow disk
# to still be sitting out while the attach runs -- and a 20ms outlier
# check interval.
save_tunable READ_SIT_OUT_SECS
set_tunable32 READ_SIT_OUT_SECS 120
save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20
# Test teardown: restore tunables, clear zinject handlers and events,
# kill any background children still listed in $child_pids, destroy the
# pool if it exists and empty $TESTDIR.
function cleanup
{
	restore_tunable READ_SIT_OUT_SECS
	restore_tunable SIT_OUT_CHECK_INTERVAL

	log_must zinject -c all
	log_must zpool events -c

	# An empty $child_pids expands to nothing, so the loop body is
	# simply skipped in that case.
	for wait_pid in $child_pids; do
		kill $wait_pid
	done

	poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1

	[[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
}
# This script exercises 'zpool attach', so the assertion says so.
log_assert "Attaching a disk during I/O with a sit out completes."

# Build the file_trunc argument string from whichever HOLES_* settings
# are defined; -r is always passed.
options=""
options_display="default options"

log_onexit cleanup

[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "

[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "

[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "

[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "

[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "

options="$options -r "

[[ -n "$options" ]] && options_display=$options

# PIDs of background file_trunc processes; cleanup kills any left over
child_pids=""
#
# Start background file_trunc writers, attach a new disk while the slow
# disk injection is still active, then stop the writers and verify the
# pool after an export/import cycle.
#
# $1 existing vdev to attach to
# $2 new disk to attach
#
function attach_test
{
	# Number of background writers.  'iters' was not defined in this
	# script, so the loop below never ran and no I/O was started.
	typeset -i iters=2
	typeset vdev=$1
	typeset disk=$2

	typeset i=0
	while [[ $i -lt $iters ]]; do
		log_note "Invoking file_trunc with: $options_display on $TESTFILE.$i"
		file_trunc $options $TESTDIR/$TESTFILE.$i &
		typeset pid=$!

		sleep 1

		child_pids="$child_pids $pid"
		((i = i + 1))
	done

	# attach disk with a slow drive still present
	SECONDS=0
	log_must zpool attach -w $TESTPOOL1 $vdev $disk
	log_note took $SECONDS seconds to attach disk

	for wait_pid in $child_pids
	do
		kill $wait_pid
	done
	child_pids=""

	log_must zinject -c all
	log_must zpool export $TESTPOOL1
	log_must zpool import -d $TESTDIR $TESTPOOL1
	log_must zfs umount $TESTPOOL1/$TESTFS1
	log_must zdb -cdui $TESTPOOL1/$TESTFS1
	log_must zfs mount $TESTPOOL1/$TESTFS1
	verify_pool $TESTPOOL1
}
# Size of each file used as a vdev
DEVSIZE="150M"

# Create ten backing files and collect their paths for pool creation
specials_list=""
i=0
while [[ $i != 10 ]]; do
	# Fail the test right away if a backing file cannot be created
	log_must truncate -s $DEVSIZE $TESTDIR/$TESTFILE1.$i
	specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"

	((i = i + 1))
done

slow_disk=$TESTDIR/$TESTFILE1.3
log_must truncate -s $DEVSIZE $TESTDIR/$REPLACEFILE

# Test file size in MB
count=200

for type in "raidz1" "raidz2" "raidz3" ; do
	create_pool $TESTPOOL1 $type $specials_list
	log_must zpool set autosit=on $TESTPOOL1 "${type}-0"

	log_must zfs create -o primarycache=none -o recordsize=512K \
	    $TESTPOOL1/$TESTFS1
	log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1

	log_must dd if=/dev/urandom of=/$TESTDIR1/bigfile bs=1M count=$count

	# Make one disk 100ms slower to trigger a sit out
	log_must zinject -d $slow_disk -D100:1 -T read $TESTPOOL1

	# Do some reads and wait for sit out on slow disk
	SECONDS=0
	typeset -i size=0
	for i in $(seq 1 $count) ; do
		dd if=/$TESTDIR1/bigfile skip=$i bs=1M count=1 of=/dev/null
		size=$i

		sit_out=$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)
		if [[ "$sit_out" == "on" ]] ; then
			break
		fi
	done

	log_must test "$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)" == "on"
	log_note took $SECONDS seconds to reach sit out reading ${size}M
	log_must zpool status -s $TESTPOOL1

	# Name of the vdev two levels down in the JSON status output,
	# i.e. the vdev the new disk is attached to
	typeset top=$(zpool status -j | jq -r ".pools.$TESTPOOL1.vdevs[].vdevs[].name")
	attach_test $top $TESTDIR/$REPLACEFILE

	log_must eval "zpool iostat -v $TESTPOOL1 | grep \"$REPLACEFILE\""

	destroy_pool $TESTPOOL1
	log_must rm -rf /$TESTPOOL1
done

log_pass
@@ -0,0 +1,199 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
# Copyright (c) 2025, Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
# Replacing disks while a disk is sitting out reads should pass
#
# STRATEGY:
# 1. Create raidz and draid pools
# 2. Make one disk slower and trigger a read sit out for that disk
# 3. Start some random I/O
# 4. Replace a disk in the pool with another disk.
# 5. Verify the integrity of the file system and the resilvering.
#
verify_runnable "global"
# Save the current tunable values (cleanup restores them), then use a
# 120 second sit out period -- presumably long enough for the slow disk
# to still be sitting out while the replace runs -- and a 20ms outlier
# check interval.
save_tunable READ_SIT_OUT_SECS
set_tunable32 READ_SIT_OUT_SECS 120
save_tunable SIT_OUT_CHECK_INTERVAL
set_tunable64 SIT_OUT_CHECK_INTERVAL 20
# Test teardown: restore both tunables, clear zinject handlers and
# events, kill background children recorded in $child_pids, destroy the
# pool when present and remove everything under $TESTDIR.
function cleanup
{
	typeset tunable

	for tunable in READ_SIT_OUT_SECS SIT_OUT_CHECK_INTERVAL; do
		restore_tunable $tunable
	done

	log_must zinject -c all
	log_must zpool events -c

	# Nothing is iterated when $child_pids is empty
	for wait_pid in $child_pids; do
		kill $wait_pid
	done

	poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1

	[[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
}
log_assert "Replacing a disk during I/O with a sit out completes."
log_onexit cleanup

# Assemble the file_trunc arguments: each HOLES_* setting that is
# non-empty contributes its flag, and -r is always appended.
options=""
options_display="default options"

if [[ -n "$HOLES_FILESIZE" ]]; then
	options=" $options -f $HOLES_FILESIZE "
fi
options="$options${HOLES_BLKSIZE:+ -b $HOLES_BLKSIZE }"
options="$options${HOLES_COUNT:+ -c $HOLES_COUNT }"
options="$options${HOLES_SEED:+ -s $HOLES_SEED }"
options="$options${HOLES_FILEOFFSET:+ -o $HOLES_FILEOFFSET }"
options="$options -r "

if [[ -n "$options" ]]; then
	options_display=$options
fi

# PIDs of background file_trunc processes; cleanup kills any left over
child_pids=""
#
# Start two background file_trunc writers, replace a disk while the slow
# disk injection is still active, then stop the writers and verify the
# pool after an export/import cycle.
#
# $1 disk to be replaced
# $2 replacement disk
# $3 "seq" to pass -s to 'zpool replace'; any other value omits it
#
function replace_test
{
	typeset old_disk=$1
	typeset new_disk=$2
	typeset mode=$3
	typeset -i writers=2
	typeset -i n
	typeset pid
	typeset flag

	for (( n = 0; n < writers; n++ )); do
		log_note "Invoking file_trunc with: $options_display on $TESTFILE.$n"
		file_trunc $options $TESTDIR/$TESTFILE.$n &
		pid=$!

		sleep 1

		child_pids="$child_pids $pid"
	done

	case "$mode" in
	seq)
		flag="-ws"
		;;
	*)
		flag="-w"
		;;
	esac

	# replace disk with a slow drive still present
	SECONDS=0
	log_must zpool replace $flag $TESTPOOL1 $old_disk $new_disk
	log_note took $SECONDS seconds to replace disk

	for wait_pid in $child_pids; do
		kill $wait_pid
	done
	child_pids=""

	log_must zinject -c all
	log_must zpool export $TESTPOOL1
	log_must zpool import -d $TESTDIR $TESTPOOL1
	log_must zfs umount $TESTPOOL1/$TESTFS1
	log_must zdb -cdui $TESTPOOL1/$TESTFS1
	log_must zfs mount $TESTPOOL1/$TESTFS1
	verify_pool $TESTPOOL1
}
# Size of each file used as a vdev
DEVSIZE="150M"

# Create ten backing files and remember their paths for pool creation
specials_list=""
for (( i = 0; i < 10; i++ )); do
	log_must truncate -s $DEVSIZE $TESTDIR/$TESTFILE1.$i
	specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
done

slow_disk=$TESTDIR/$TESTFILE1.3
log_must truncate -s $DEVSIZE $TESTDIR/$REPLACEFILE

# Test file size in MB
count=400

for type in "raidz2" "raidz3" "draid2"; do
	create_pool $TESTPOOL1 $type $specials_list
	log_must zpool set autosit=on $TESTPOOL1 "${type}-0"

	log_must zfs create -o primarycache=none -o recordsize=512K \
	    $TESTPOOL1/$TESTFS1
	log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1

	log_must dd if=/dev/urandom of=/$TESTDIR1/bigfile bs=1M count=$count

	# Make one disk 100ms slower to trigger a sit out
	log_must zinject -d $slow_disk -D100:1 -T read $TESTPOOL1

	# Read the file 1M at a time until the slow disk sits out
	SECONDS=0
	typeset -i size=0
	for (( i = 1; i <= count; i++ )); do
		dd if=/$TESTDIR1/bigfile skip=$i bs=1M count=1 of=/dev/null
		size=$i

		sit_out=$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)
		[[ "$sit_out" == "on" ]] && break
	done

	log_must test "$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)" == "on"
	log_note took $SECONDS seconds to reach sit out reading ${size}M
	log_must zpool status -s $TESTPOOL1

	# For draid2, randomly pick between the default replace and the
	# "seq" variant (replace_test then adds -s)
	typeset repl_type="replace"
	if [[ "$type" == "draid2" ]] && (( RANDOM % 2 == 0 )); then
		repl_type="seq"
	fi
	replace_test $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE $repl_type

	log_must eval "zpool iostat -v $TESTPOOL1 | grep \"$REPLACEFILE\""

	destroy_pool $TESTPOOL1
	log_must rm -rf /$TESTPOOL1
done

log_pass