ZTS: add targeted redundancy_draid_spare exception

When sequentially resilvering a dRAID pool it's possible that a few
correctable checksum errors will be reported.  This is a known issue
which is occasionally observed in the CI.  Until it's resolved we
want the test case to tolerate a few checksum errors in this scenario
to prevent false positives in the CI.

As a side effect, this change also standardizes in one location how
the dRAID pool integrity is verified.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #18307
Issue #18319
Closes #18436
This commit is contained in:
Brian Behlendorf
2026-04-23 13:45:48 -07:00
committed by Tony Hutter
parent 887bfc1a64
commit b0c1dcb531
9 changed files with 96 additions and 71 deletions
@@ -52,7 +52,7 @@ function cleanup
#
function cksum_pool
{
typeset -i cksum=$(zpool status $1 | awk '
typeset -i cksum=$(zpool status -p $1 | awk '
!NF { isvdev = 0 }
isvdev { errors += $NF }
/CKSUM$/ { isvdev = 1 }
@@ -349,3 +349,77 @@ function recover_bad_missing_devs
return 0
}
#
# Given a dRAID pool, issue a scrub and verify that the current pool
# status aligns with the expected status for the 'replace_mode' passed.
# Valid modes are:
#
# 1. healing - The pool is perfectly intact. No checksum errors have
# been reported and the scrub didn't make any repairs. This is the
# expected state after a healing resilver of a healthy pool.
#
# 2. sequential - The pool is fully intact. There should never be a
# checksum error, but the occasional checksum error does occur in
# practice. Until the root cause is identified and resolved, tolerate
# up to 3 checksum errors when scrubbing after a sequential resilver.
#
# https://github.com/openzfs/zfs/issues/18307
# https://github.com/openzfs/zfs/issues/18319
#
# 3. damaged - The pool was intentionally silently damaged. Checksum
# errors are expected to be reported as the damaged blocks are
# detected and repaired.
#
# In all of these cases a scrub must be able to successfully repair the
# pool and result in no data loss.
#
function verify_draid_pool
{
	typeset target=${1:-$TESTPOOL}
	typeset mode=${2:-healing}

	log_note "verify_draid_pool $target $mode"

	# Scrub first; every check below inspects the resulting pool status.
	log_must zpool scrub -w $target

	# Total checksum error count as reported by cksum_pool.
	typeset -i nerr=$(cksum_pool $target)

	case "$mode" in
	healing)
		# Neither checksum errors nor repair IO are acceptable.
		if (( nerr > 0 )); then
			_verify_draid_pool_fail "$target" \
			    "Unexpected CKSUM errors found for $target ($nerr)"
		fi
		check_pool_status $target "scan" "repaired 0B" ||
		    _verify_draid_pool_fail "$target" \
		    "Unexpected repair IO found for $target ($nerr)"
		;;
	sequential)
		# Up to 3 checksum errors are tolerated; repair IO is not
		# inspected in this mode.
		if (( nerr > 3 )); then
			_verify_draid_pool_fail "$target" \
			    "Unexpected CKSUM errors found for $target ($nerr)"
		fi
		;;
	damaged)
		# Both checksum errors and repair IO must be present.
		if (( nerr < 1 )); then
			_verify_draid_pool_fail "$target" \
			    "Expected CKSUM errors missing for $target ($nerr)"
		fi
		if check_pool_status $target "scan" "repaired 0B"; then
			_verify_draid_pool_fail "$target" \
			    "Expected repair IO missing for $target ($nerr)"
		fi
		;;
	*)
		log_fail "Invalid replace_mode=$mode"
		;;
	esac

	# Checks applied after the mode-specific ones: the scan line must
	# report "with 0 errors" and the errors line "No known data errors".
	check_pool_status $target "scan" "with 0 errors" ||
	    _verify_draid_pool_fail "$target" \
	    "Unexpected repair errors found for $target"
	check_pool_status $target "errors" "No known data errors" ||
	    _verify_draid_pool_fail "$target" \
	    "Unexpected data errors found for $target"
}

#
# Private helper for verify_draid_pool: log the verbose pool status for
# diagnosis and then fail the test with the given message.
#
function _verify_draid_pool_fail # <pool> <message>
{
	# $1 is intentionally left unquoted so the status command receives
	# the pool argument exactly as an unquoted expansion would.
	log_must zpool status -v $1
	log_fail "$2"
}
@@ -86,8 +86,7 @@ function test_selfheal # <pool> <parity> <dir>
# from the files which were read. Before overwriting additional
# devices we need to repair all of the blocks in the pool.
#
log_must zpool scrub -w $pool
log_must check_pool_status $pool "errors" "No known data errors"
log_must verify_draid_pool $pool "damaged"
log_must zpool clear $pool
@@ -104,8 +103,7 @@ function test_selfheal # <pool> <parity> <dir>
log_must eval "find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1"
log_must check_pool_status $pool "errors" "No known data errors"
log_must zpool scrub -w $pool
log_must check_pool_status $pool "errors" "No known data errors"
log_must verify_draid_pool $pool "damaged"
log_must zpool clear $pool
}
@@ -182,8 +180,7 @@ function test_scrub # <pool> <parity> <dir>
log_must zpool import -o cachefile=none -d $dir $pool
log_must zpool scrub -w $pool
log_must check_pool_status $pool "errors" "No known data errors"
log_must verify_draid_pool $pool "damaged"
log_must zpool clear $pool
@@ -196,8 +193,7 @@ function test_scrub # <pool> <parity> <dir>
log_must zpool import -o cachefile=none -d $dir $pool
log_must zpool scrub -w $pool
log_must check_pool_status $pool "errors" "No known data errors"
log_must verify_draid_pool $pool "damaged"
log_must zpool clear $pool
}
@@ -89,12 +89,7 @@ function test_sequential_resilver # <pool> <parity> <dir>
log_must zpool replace -fsw $pool $dir/dev-$i $spare
done
log_must zpool scrub -w $pool
log_must zpool status $pool
log_mustnot check_pool_status $pool "scan" "repaired 0B"
log_must check_pool_status $pool "errors" "No known data errors"
log_must check_pool_status $pool "scan" "with 0 errors"
log_must verify_draid_pool $pool "damaged"
}
log_onexit cleanup
@@ -121,12 +121,7 @@ for nparity in 1 2 3; do
# Scrub the pool after the sequential resilver and verify
# that the silent damage was repaired by the scrub.
log_must zpool scrub -w $TESTPOOL
log_must zpool status $TESTPOOL
log_must check_pool_status $TESTPOOL "errors" \
"No known data errors"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_mustnot check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must verify_draid_pool $TESTPOOL "damaged"
done
for nspare in 0 1 2; do
@@ -145,12 +140,7 @@ for nparity in 1 2 3; do
done
log_must zpool clear $TESTPOOL
log_must zpool scrub -w $TESTPOOL
log_must zpool status $TESTPOOL
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must verify_draid_pool $TESTPOOL "healing"
log_must zpool destroy "$TESTPOOL"
done
@@ -89,12 +89,7 @@ function test_sequential_resilver # <pool> <parity> <dir>
spare=draid${nparity}-0-0
log_must zpool replace -fsw $pool $dir/dev-$nparity $spare
log_must zpool scrub -w $pool
log_must zpool status $pool
log_must check_pool_status $pool "scan" "repaired 0B"
log_must check_pool_status $pool "errors" "No known data errors"
log_must check_pool_status $pool "scan" "with 0 errors"
log_must verify_draid_pool $pool "sequential"
}
log_onexit cleanup
@@ -105,12 +105,7 @@ function test_sequential_resilver # <pool> <parity> <dir>
log_must zpool wait -t resilver $pool
log_must zpool scrub -w $pool
log_must zpool status $pool
log_must check_pool_status $pool "scan" "repaired 0B"
log_must check_pool_status $pool "errors" "No known data errors"
log_must check_pool_status $pool "scan" "with 0 errors"
log_must verify_draid_pool $pool "sequential"
}
log_onexit cleanup
@@ -85,9 +85,7 @@ for replace_mode in "healing" "sequential"; do
log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE"
# Preserve the 1st faulted vdev for the next test.
[[ $i -eq 0 ]] || log_must zpool detach $TESTPOOL $fault_vdev
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must verify_draid_pool $TESTPOOL $replace_mode
(( i += 1 ))
done
@@ -98,9 +96,7 @@ for replace_mode in "healing" "sequential"; do
# Verify that after clearing the 1st faulted vdev, all is healed.
log_must zpool clear $TESTPOOL "$BASEDIR/vdev0"
log_must wait_resilvered $TESTPOOL
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must verify_draid_pool $TESTPOOL $replace_mode
cleanup
done
@@ -60,9 +60,9 @@ log_must zpool offline -f $TESTPOOL $BASEDIR/vdev9
log_must zpool replace -w $TESTPOOL $BASEDIR/vdev9 draid1-0-2
# Verify, refill and verify the pool contents.
verify_pool $TESTPOOL
log_must verify_draid_pool $TESTPOOL "healing"
refill_test_env $TESTPOOL
verify_pool $TESTPOOL
log_must verify_draid_pool $TESTPOOL "healing"
# Bring everything back online and check for errors.
log_must zpool clear $TESTPOOL
@@ -72,9 +72,7 @@ log_must wait_hotspare_state $TESTPOOL draid1-0-0 "AVAIL"
log_must wait_hotspare_state $TESTPOOL draid1-0-1 "AVAIL"
log_must wait_hotspare_state $TESTPOOL draid1-0-2 "AVAIL"
log_must zpool scrub -w $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must verify_draid_pool $TESTPOOL "healing"
log_must is_data_valid $TESTPOOL
@@ -111,9 +111,7 @@ for replace_mode in "healing" "sequential"; do
log_must zpool detach $TESTPOOL $BASEDIR/vdev7
log_must check_vdev_state $TESTPOOL draid1-0-0 "ONLINE"
log_must check_hotspare_state $TESTPOOL draid1-0-0 "INUSE"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must verify_draid_pool $TESTPOOL $replace_mode
# Distributed spare in mirror with original device faulted
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8
@@ -122,9 +120,7 @@ for replace_mode in "healing" "sequential"; do
log_must check_vdev_state $TESTPOOL spare-8 "DEGRADED"
log_must check_vdev_state $TESTPOOL draid1-0-1 "ONLINE"
log_must check_hotspare_state $TESTPOOL draid1-0-1 "INUSE"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must verify_draid_pool $TESTPOOL $replace_mode
# Distributed spare in mirror with original device still online
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev9 "ONLINE"
@@ -132,9 +128,7 @@ for replace_mode in "healing" "sequential"; do
log_must check_vdev_state $TESTPOOL spare-9 "ONLINE"
log_must check_vdev_state $TESTPOOL draid1-0-2 "ONLINE"
log_must check_hotspare_state $TESTPOOL draid1-0-2 "INUSE"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must verify_draid_pool $TESTPOOL $replace_mode
# Normal faulted device replacement
new_vdev0="$BASEDIR/new_vdev0"
@@ -143,9 +137,7 @@ for replace_mode in "healing" "sequential"; do
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev0 "FAULTED"
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev0 $new_vdev0
log_must check_vdev_state $TESTPOOL $new_vdev0 "ONLINE"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must verify_draid_pool $TESTPOOL $replace_mode
# Distributed spare faulted device replacement
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev2
@@ -154,9 +146,7 @@ for replace_mode in "healing" "sequential"; do
log_must check_vdev_state $TESTPOOL spare-2 "DEGRADED"
log_must check_vdev_state $TESTPOOL draid1-0-3 "ONLINE"
log_must check_hotspare_state $TESTPOOL draid1-0-3 "INUSE"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must verify_draid_pool $TESTPOOL $replace_mode
# Normal online device replacement
new_vdev1="$BASEDIR/new_vdev1"
@@ -164,9 +154,7 @@ for replace_mode in "healing" "sequential"; do
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev1 "ONLINE"
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev1 $new_vdev1
log_must check_vdev_state $TESTPOOL $new_vdev1 "ONLINE"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must verify_draid_pool $TESTPOOL $replace_mode
# Distributed spare online device replacement (then fault)
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev3 draid1-0-4
@@ -176,9 +164,7 @@ for replace_mode in "healing" "sequential"; do
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev3
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev3 "FAULTED"
log_must check_vdev_state $TESTPOOL spare-3 "DEGRADED"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must verify_draid_pool $TESTPOOL $replace_mode
# Verify the original data is valid
log_must is_data_valid $TESTPOOL