From b0c1dcb5310f9cb81ec7f09b4a49b627b6051d97 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Thu, 23 Apr 2026 13:45:48 -0700
Subject: [PATCH] ZTS: add targeted redundancy_draid_spare exception

When sequentially resilvering a dRAID pool it's possible that a few
correctable checksum errors will be reported. This is a known issue
which is occasionally observed in the CI. Until it's resolved we
want the test case to tolerate a few checksum errors in this
scenario to prevent false positives in the CI.

This change also has the additional side effect of standardizing
in one location how the dRAID pool integrity is verified.

Reviewed-by: Tony Hutter
Signed-off-by: Brian Behlendorf
Issue #18307
Issue #18319
Closes #18436
---
NOTE(review): line breaks, blank context lines and tab indentation were
restored from a whitespace-collapsed copy using the hunk line counts;
confirm against the upstream commit before applying. The e-mail
addresses in the From:/Reviewed-by:/Signed-off-by: lines and the text
after '#' in the "function ... #" hunk headers appear to have been
stripped from that copy and are not restored here.

 .../functional/redundancy/redundancy.kshlib   | 76 ++++++++++++++++++-
 .../redundancy/redundancy_draid.ksh           | 12 +--
 .../redundancy/redundancy_draid_damaged1.ksh  |  7 +-
 .../redundancy/redundancy_draid_damaged2.ksh  | 14 +---
 .../redundancy/redundancy_draid_degraded1.ksh |  7 +-
 .../redundancy/redundancy_draid_degraded2.ksh |  7 +-
 .../redundancy/redundancy_draid_spare1.ksh    |  8 +-
 .../redundancy/redundancy_draid_spare2.ksh    |  8 +-
 .../redundancy/redundancy_draid_spare3.ksh    | 28 ++-----
 9 files changed, 96 insertions(+), 71 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib
index 65435554b..702268aee 100644
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib
@@ -52,7 +52,7 @@ function cleanup
 #
 function cksum_pool
 {
-	typeset -i cksum=$(zpool status $1 | awk '
+	typeset -i cksum=$(zpool status -p $1 | awk '
 	    !NF { isvdev = 0 }
 	    isvdev { errors += $NF }
 	    /CKSUM$/ { isvdev = 1 }
@@ -349,3 +349,77 @@ function recover_bad_missing_devs
 
 	return 0
 }
+
+#
+# Given a dRAID pool issue a scrub and verify the current pool status
+# aligns with the expected status based on the 'replace_mode' passed.
+# Valid modes are:
+#
+# 1. healing - The pool is perfectly intact. No checksum errors have
+#    been reported and the scrub didn't make any repairs. This is the
+#    expected state after a healing resilver of a healthy pool.
+#
+# 2. sequential - The pool is fully intact. There should never be a
+#    checksum error, but the occasional checksum error does occur in
+#    practice. Until the root cause is identified and resolved, tolerate
+#    up to 3 checksum errors when scrubbing after a sequential resilver.
+#
+#    https://github.com/openzfs/zfs/issues/18307
+#    https://github.com/openzfs/zfs/issues/18319
+#
+# 3. damaged - The pool was intentionally silently damaged. Checksum
+#    errors are expected to be reported as the damaged blocks are
+#    detected and repaired.
+#
+# In all of these cases a scrub must be able to successfully repair the
+# pool and result in no data loss.
+#
+function verify_draid_pool
+{
+	typeset pool=${1:-$TESTPOOL}
+	typeset replace_mode=${2:-healing}
+
+	log_note "verify_draid_pool $pool $replace_mode"
+	log_must zpool scrub -w $pool
+
+	typeset -i cksum=$(cksum_pool $pool)
+
+	if [[ "$replace_mode" = "healing" ]]; then
+		if [[ $cksum -gt 0 ]]; then
+			log_must zpool status -v $pool
+			log_fail "Unexpected CKSUM errors found for $pool ($cksum)"
+		fi
+
+		if ! check_pool_status $pool "scan" "repaired 0B"; then
+			log_must zpool status -v $pool
+			log_fail "Unexpected repair IO found for $pool ($cksum)"
+		fi
+	elif [[ "$replace_mode" = "sequential" ]]; then
+		if [[ $cksum -gt 3 ]]; then
+			log_must zpool status -v $pool
+			log_fail "Unexpected CKSUM errors found for $pool ($cksum)"
+		fi
+	elif [[ "$replace_mode" = "damaged" ]]; then
+		if [[ $cksum -lt 1 ]]; then
+			log_must zpool status -v $pool
+			log_fail "Expected CKSUM errors missing for $pool ($cksum)"
+		fi
+
+		if check_pool_status $pool "scan" "repaired 0B"; then
+			log_must zpool status -v $pool
+			log_fail "Expected repair IO missing for $pool ($cksum)"
+		fi
+	else
+		log_fail "Invalid replace_mode=$replace_mode"
+	fi
+
+	if ! check_pool_status $pool "scan" "with 0 errors"; then
+		log_must zpool status -v $pool
+		log_fail "Unexpected repair errors found for $pool"
+	fi
+
+	if ! check_pool_status $pool "errors" "No known data errors"; then
+		log_must zpool status -v $pool
+		log_fail "Unexpected data errors found for $pool"
+	fi
+}
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh
index a1356f619..81a01c07a 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh
@@ -86,8 +86,7 @@ function test_selfheal #
 	# from the files which were read. Before overwriting additional
 	# devices we need to repair all of the blocks in the pool.
 	#
-	log_must zpool scrub -w $pool
-	log_must check_pool_status $pool "errors" "No known data errors"
+	log_must verify_draid_pool $pool "damaged"
 
 	log_must zpool clear $pool
 
@@ -104,8 +103,7 @@ function test_selfheal #
 	log_must eval "find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1"
 	log_must check_pool_status $pool "errors" "No known data errors"
 
-	log_must zpool scrub -w $pool
-	log_must check_pool_status $pool "errors" "No known data errors"
+	log_must verify_draid_pool $pool "damaged"
 
 	log_must zpool clear $pool
 }
@@ -182,8 +180,7 @@ function test_scrub #
 
 	log_must zpool import -o cachefile=none -d $dir $pool
 
-	log_must zpool scrub -w $pool
-	log_must check_pool_status $pool "errors" "No known data errors"
+	log_must verify_draid_pool $pool "damaged"
 
 	log_must zpool clear $pool
 
@@ -196,8 +193,7 @@ function test_scrub #
 
 	log_must zpool import -o cachefile=none -d $dir $pool
 
-	log_must zpool scrub -w $pool
-	log_must check_pool_status $pool "errors" "No known data errors"
+	log_must verify_draid_pool $pool "damaged"
 
 	log_must zpool clear $pool
 }
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh
index cafd63166..56b12373a 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh
@@ -89,12 +89,7 @@ function test_sequential_resilver #
 		log_must zpool replace -fsw $pool $dir/dev-$i $spare
 	done
 
-	log_must zpool scrub -w $pool
-	log_must zpool status $pool
-
-	log_mustnot check_pool_status $pool "scan" "repaired 0B"
-	log_must check_pool_status $pool "errors" "No known data errors"
-	log_must check_pool_status $pool "scan" "with 0 errors"
+	log_must verify_draid_pool $pool "damaged"
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh
index 46bf9f950..3dfb760be 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh
@@ -121,12 +121,7 @@ for nparity in 1 2 3; do
 
 		# Scrub the pool after the sequential resilver and verify
 		# that the silent damage was repaired by the scrub.
-		log_must zpool scrub -w $TESTPOOL
-		log_must zpool status $TESTPOOL
-		log_must check_pool_status $TESTPOOL "errors" \
-		    "No known data errors"
-		log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
-		log_mustnot check_pool_status $TESTPOOL "scan" "repaired 0B"
+		log_must verify_draid_pool $TESTPOOL "damaged"
 	done
 
 	for nspare in 0 1 2; do
@@ -145,12 +140,7 @@ for nparity in 1 2 3; do
 	done
 
 	log_must zpool clear $TESTPOOL
-	log_must zpool scrub -w $TESTPOOL
-	log_must zpool status $TESTPOOL
-
-	log_must check_pool_status $TESTPOOL "errors" "No known data errors"
-	log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
-	log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
+	log_must verify_draid_pool $TESTPOOL "healing"
 
 	log_must zpool destroy "$TESTPOOL"
 done
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded1.ksh
index ae65d3a21..31444850f 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded1.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded1.ksh
@@ -89,12 +89,7 @@ function test_sequential_resilver #
 	spare=draid${nparity}-0-0
 	log_must zpool replace -fsw $pool $dir/dev-$nparity $spare
 
-	log_must zpool scrub -w $pool
-	log_must zpool status $pool
-
-	log_must check_pool_status $pool "scan" "repaired 0B"
-	log_must check_pool_status $pool "errors" "No known data errors"
-	log_must check_pool_status $pool "scan" "with 0 errors"
+	log_must verify_draid_pool $pool "sequential"
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded2.ksh
index 8d102627f..22e1f2dfb 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded2.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded2.ksh
@@ -105,12 +105,7 @@ function test_sequential_resilver #
 
 	log_must zpool wait -t resilver $pool
 
-	log_must zpool scrub -w $pool
-	log_must zpool status $pool
-
-	log_must check_pool_status $pool "scan" "repaired 0B"
-	log_must check_pool_status $pool "errors" "No known data errors"
-	log_must check_pool_status $pool "scan" "with 0 errors"
+	log_must verify_draid_pool $pool "sequential"
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh
index 0604f7f48..e5d16910d 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh
@@ -85,9 +85,7 @@ for replace_mode in "healing" "sequential"; do
 		log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE"
 		# Preserve the 1st faulted vdev for the next test.
 		[[ $i -eq 0 ]] || log_must zpool detach $TESTPOOL $fault_vdev
-		log_must verify_pool $TESTPOOL
-		log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
-		log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+		log_must verify_draid_pool $TESTPOOL $replace_mode
 
 		(( i += 1 ))
 	done
@@ -98,9 +96,7 @@ for replace_mode in "healing" "sequential"; do
 	# Verify that after clearing the 1st faulted vdev, all is healed.
 	log_must zpool clear $TESTPOOL "$BASEDIR/vdev0"
 	log_must wait_resilvered $TESTPOOL
-	log_must verify_pool $TESTPOOL
-	log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
-	log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+	log_must verify_draid_pool $TESTPOOL $replace_mode
 
 	cleanup
 done
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh
index 288f02392..6f94161b4 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh
@@ -60,9 +60,9 @@ log_must zpool offline -f $TESTPOOL $BASEDIR/vdev9
 log_must zpool replace -w $TESTPOOL $BASEDIR/vdev9 draid1-0-2
 
 # Verify, refill and verify the pool contents.
-verify_pool $TESTPOOL
+log_must verify_draid_pool $TESTPOOL "healing"
 refill_test_env $TESTPOOL
-verify_pool $TESTPOOL
+log_must verify_draid_pool $TESTPOOL "healing"
 
 # Bring everything back online and check for errors.
 log_must zpool clear $TESTPOOL
@@ -72,9 +72,7 @@ log_must wait_hotspare_state $TESTPOOL draid1-0-0 "AVAIL"
 log_must wait_hotspare_state $TESTPOOL draid1-0-1 "AVAIL"
 log_must wait_hotspare_state $TESTPOOL draid1-0-2 "AVAIL"
 
-log_must zpool scrub -w $TESTPOOL
-log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
-log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+log_must verify_draid_pool $TESTPOOL "healing"
 
 log_must is_data_valid $TESTPOOL
 
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh
index 425c30a49..f1485375f 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh
@@ -111,9 +111,7 @@ for replace_mode in "healing" "sequential"; do
 	log_must zpool detach $TESTPOOL $BASEDIR/vdev7
 	log_must check_vdev_state $TESTPOOL draid1-0-0 "ONLINE"
 	log_must check_hotspare_state $TESTPOOL draid1-0-0 "INUSE"
-	log_must verify_pool $TESTPOOL
-	log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
-	log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+	log_must verify_draid_pool $TESTPOOL $replace_mode
 
 	# Distributed spare in mirror with original device faulted
 	log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8
@@ -122,9 +120,7 @@ for replace_mode in "healing" "sequential"; do
 	log_must check_vdev_state $TESTPOOL spare-8 "DEGRADED"
 	log_must check_vdev_state $TESTPOOL draid1-0-1 "ONLINE"
 	log_must check_hotspare_state $TESTPOOL draid1-0-1 "INUSE"
-	log_must verify_pool $TESTPOOL
-	log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
-	log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+	log_must verify_draid_pool $TESTPOOL $replace_mode
 
 	# Distributed spare in mirror with original device still online
 	log_must check_vdev_state $TESTPOOL $BASEDIR/vdev9 "ONLINE"
@@ -132,9 +128,7 @@ for replace_mode in "healing" "sequential"; do
 	log_must check_vdev_state $TESTPOOL spare-9 "ONLINE"
 	log_must check_vdev_state $TESTPOOL draid1-0-2 "ONLINE"
 	log_must check_hotspare_state $TESTPOOL draid1-0-2 "INUSE"
-	log_must verify_pool $TESTPOOL
-	log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
-	log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+	log_must verify_draid_pool $TESTPOOL $replace_mode
 
 	# Normal faulted device replacement
 	new_vdev0="$BASEDIR/new_vdev0"
@@ -143,9 +137,7 @@ for replace_mode in "healing" "sequential"; do
 	log_must check_vdev_state $TESTPOOL $BASEDIR/vdev0 "FAULTED"
 	log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev0 $new_vdev0
 	log_must check_vdev_state $TESTPOOL $new_vdev0 "ONLINE"
-	log_must verify_pool $TESTPOOL
-	log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
-	log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+	log_must verify_draid_pool $TESTPOOL $replace_mode
 
 	# Distributed spare faulted device replacement
 	log_must zpool offline -f $TESTPOOL $BASEDIR/vdev2
@@ -154,9 +146,7 @@ for replace_mode in "healing" "sequential"; do
 	log_must check_vdev_state $TESTPOOL spare-2 "DEGRADED"
 	log_must check_vdev_state $TESTPOOL draid1-0-3 "ONLINE"
 	log_must check_hotspare_state $TESTPOOL draid1-0-3 "INUSE"
-	log_must verify_pool $TESTPOOL
-	log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
-	log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+	log_must verify_draid_pool $TESTPOOL $replace_mode
 
 	# Normal online device replacement
 	new_vdev1="$BASEDIR/new_vdev1"
@@ -164,9 +154,7 @@ for replace_mode in "healing" "sequential"; do
 	log_must check_vdev_state $TESTPOOL $BASEDIR/vdev1 "ONLINE"
 	log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev1 $new_vdev1
 	log_must check_vdev_state $TESTPOOL $new_vdev1 "ONLINE"
-	log_must verify_pool $TESTPOOL
-	log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
-	log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+	log_must verify_draid_pool $TESTPOOL $replace_mode
 
 	# Distributed spare online device replacement (then fault)
 	log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev3 draid1-0-4
@@ -176,9 +164,7 @@ for replace_mode in "healing" "sequential"; do
 	log_must zpool offline -f $TESTPOOL $BASEDIR/vdev3
 	log_must check_vdev_state $TESTPOOL $BASEDIR/vdev3 "FAULTED"
 	log_must check_vdev_state $TESTPOOL spare-3 "DEGRADED"
-	log_must verify_pool $TESTPOOL
-	log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
-	log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+	log_must verify_draid_pool $TESTPOOL $replace_mode
 
 	# Verify the original data is valid
 	log_must is_data_valid $TESTPOOL