diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib index 65435554b..702268aee 100644 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib @@ -52,7 +52,7 @@ function cleanup # function cksum_pool { - typeset -i cksum=$(zpool status $1 | awk ' + typeset -i cksum=$(zpool status -p $1 | awk ' !NF { isvdev = 0 } isvdev { errors += $NF } /CKSUM$/ { isvdev = 1 } @@ -349,3 +349,77 @@ function recover_bad_missing_devs return 0 } + +# +# Given a dRAID pool issue a scrub and verify the current pool status +# aligns with the expected status based on the 'replace_mode' passed. +# Valid modes are: +# +# 1. healing - The pool is perfectly intact. No checksum errors have +# been reported and the scrub didn't make any repairs. This is the +# expected state after a healing resilver of a healthy pool. +# +# 2. sequential - The pool is fully intact. There should never be a +# checksum error, but the occasional checksum error does occur in +# practice. Until the root cause is identified and resolved, tolerate +# a checksum error when scrubbing after a sequential resilver. +# +# https://github.com/openzfs/zfs/issues/18307 +# https://github.com/openzfs/zfs/issues/18319 +# +# 3. damaged - The pool was intentionally silently damaged. Checksum +# errors are expected to be reported as the damaged blocks are +# detected and repaired. +# +# In all of these cases a scrub must be able to successfully repair the +# pool and result in no data loss. +# +function verify_draid_pool +{ + typeset pool=${1:-$TESTPOOL} + typeset replace_mode=${2:-healing} + + log_note "verify_draid_pool $pool $replace_mode" + log_must zpool scrub -w $pool + + typeset -i cksum=$(cksum_pool $pool) + + if [[ "$replace_mode" = "healing" ]]; then + if [[ $cksum -gt 0 ]]; then + log_must zpool status -v $pool + log_fail "Unexpected CKSUM errors found for $pool ($cksum)" + fi + + if ! check_pool_status $pool "scan" "repaired 0B"; then + log_must zpool status -v $pool + log_fail "Unexpected repair IO found for $pool ($cksum)" + fi + elif [[ "$replace_mode" = "sequential" ]]; then + if [[ $cksum -gt 3 ]]; then + log_must zpool status -v $pool + log_fail "Unexpected CKSUM errors found for $pool ($cksum)" + fi + elif [[ "$replace_mode" = "damaged" ]]; then + if [[ $cksum -lt 1 ]]; then + log_must zpool status -v $pool + log_fail "Expected CKSUM errors missing for $pool ($cksum)" + fi + + if check_pool_status $pool "scan" "repaired 0B"; then + log_must zpool status -v $pool + log_fail "Expected repair IO missing for $pool ($cksum)" + fi + else + log_fail "Invalid replace_mode=$replace_mode" + fi + + if ! check_pool_status $pool "scan" "with 0 errors"; then + log_must zpool status -v $pool + log_fail "Unexpected repair errors found for $pool" + fi + + if ! check_pool_status $pool "errors" "No known data errors"; then + log_must zpool status -v $pool + log_fail "Unexpected data errors found for $pool" + fi +} diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh index a1356f619..81a01c07a 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh @@ -86,8 +86,7 @@ function test_selfheal # # from the files which were read. Before overwriting additional # devices we need to repair all of the blocks in the pool. # - log_must zpool scrub -w $pool - log_must check_pool_status $pool "errors" "No known data errors" + log_must verify_draid_pool $pool "damaged" log_must zpool clear $pool @@ -104,8 +103,7 @@ function test_selfheal # log_must eval "find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1" log_must check_pool_status $pool "errors" "No known data errors" - log_must zpool scrub -w $pool - log_must check_pool_status $pool "errors" "No known data errors" + log_must verify_draid_pool $pool "damaged" log_must zpool clear $pool } @@ -182,8 +180,7 @@ function test_scrub # log_must zpool import -o cachefile=none -d $dir $pool - log_must zpool scrub -w $pool - log_must check_pool_status $pool "errors" "No known data errors" + log_must verify_draid_pool $pool "damaged" log_must zpool clear $pool @@ -196,8 +193,7 @@ function test_scrub # log_must zpool import -o cachefile=none -d $dir $pool - log_must zpool scrub -w $pool - log_must check_pool_status $pool "errors" "No known data errors" + log_must verify_draid_pool $pool "damaged" log_must zpool clear $pool } diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh index cafd63166..56b12373a 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh @@ -89,12 +89,7 @@ function test_sequential_resilver # log_must zpool replace -fsw $pool $dir/dev-$i $spare done - log_must zpool scrub -w $pool - log_must zpool status $pool - - log_mustnot check_pool_status $pool "scan" "repaired 0B" - log_must check_pool_status $pool "errors" "No known data errors" - log_must check_pool_status $pool "scan" "with 0 errors" + log_must verify_draid_pool $pool "damaged" } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh index 46bf9f950..3dfb760be 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh @@ -121,12 +121,7 @@ for nparity in 1 2 3; do # Scrub the pool after the sequential resilver and verify # that the silent damage was repaired by the scrub. - log_must zpool scrub -w $TESTPOOL - log_must zpool status $TESTPOOL - log_must check_pool_status $TESTPOOL "errors" \ - "No known data errors" - log_must check_pool_status $TESTPOOL "scan" "with 0 errors" - log_mustnot check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must verify_draid_pool $TESTPOOL "damaged" done for nspare in 0 1 2; do @@ -145,12 +140,7 @@ for nparity in 1 2 3; do done log_must zpool clear $TESTPOOL - log_must zpool scrub -w $TESTPOOL - log_must zpool status $TESTPOOL - - log_must check_pool_status $TESTPOOL "errors" "No known data errors" - log_must check_pool_status $TESTPOOL "scan" "with 0 errors" - log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must verify_draid_pool $TESTPOOL "healing" log_must zpool destroy "$TESTPOOL" done diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded1.ksh index ae65d3a21..31444850f 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded1.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded1.ksh @@ -89,12 +89,7 @@ function test_sequential_resilver # spare=draid${nparity}-0-0 log_must zpool replace -fsw $pool $dir/dev-$nparity $spare - log_must zpool scrub -w $pool - log_must zpool status $pool - - log_must check_pool_status $pool "scan" "repaired 0B" - log_must check_pool_status $pool "errors" "No known data errors" - log_must check_pool_status $pool "scan" "with 0 errors" + log_must verify_draid_pool $pool "sequential" } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded2.ksh index 8d102627f..22e1f2dfb 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded2.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded2.ksh @@ -105,12 +105,7 @@ function test_sequential_resilver # log_must zpool wait -t resilver $pool - log_must zpool scrub -w $pool - log_must zpool status $pool - - log_must check_pool_status $pool "scan" "repaired 0B" - log_must check_pool_status $pool "errors" "No known data errors" - log_must check_pool_status $pool "scan" "with 0 errors" + log_must verify_draid_pool $pool "sequential" } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh index 0604f7f48..e5d16910d 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh @@ -85,9 +85,7 @@ for replace_mode in "healing" "sequential"; do log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE" # Preserve the 1st faulted vdev for the next test. [[ $i -eq 0 ]] || log_must zpool detach $TESTPOOL $fault_vdev - log_must verify_pool $TESTPOOL - log_must check_pool_status $TESTPOOL "scan" "repaired 0B" - log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + log_must verify_draid_pool $TESTPOOL $replace_mode (( i += 1 )) done @@ -98,9 +96,7 @@ for replace_mode in "healing" "sequential"; do # Verify that after clearing the 1st faulted vdev, all is healed. log_must zpool clear $TESTPOOL "$BASEDIR/vdev0" log_must wait_resilvered $TESTPOOL - log_must verify_pool $TESTPOOL - log_must check_pool_status $TESTPOOL "scan" "repaired 0B" - log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + log_must verify_draid_pool $TESTPOOL $replace_mode cleanup done diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh index 288f02392..6f94161b4 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh @@ -60,9 +60,9 @@ log_must zpool offline -f $TESTPOOL $BASEDIR/vdev9 log_must zpool replace -w $TESTPOOL $BASEDIR/vdev9 draid1-0-2 # Verify, refill and verify the pool contents. -verify_pool $TESTPOOL +log_must verify_draid_pool $TESTPOOL "healing" refill_test_env $TESTPOOL -verify_pool $TESTPOOL +log_must verify_draid_pool $TESTPOOL "healing" # Bring everything back online and check for errors. log_must zpool clear $TESTPOOL @@ -72,9 +72,7 @@ log_must wait_hotspare_state $TESTPOOL draid1-0-0 "AVAIL" log_must wait_hotspare_state $TESTPOOL draid1-0-1 "AVAIL" log_must wait_hotspare_state $TESTPOOL draid1-0-2 "AVAIL" -log_must zpool scrub -w $TESTPOOL -log_must check_pool_status $TESTPOOL "scan" "repaired 0B" -log_must check_pool_status $TESTPOOL "scan" "with 0 errors" +log_must verify_draid_pool $TESTPOOL "healing" log_must is_data_valid $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh index 425c30a49..f1485375f 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh @@ -111,9 +111,7 @@ for replace_mode in "healing" "sequential"; do log_must zpool detach $TESTPOOL $BASEDIR/vdev7 log_must check_vdev_state $TESTPOOL draid1-0-0 "ONLINE" log_must check_hotspare_state $TESTPOOL draid1-0-0 "INUSE" - log_must verify_pool $TESTPOOL - log_must check_pool_status $TESTPOOL "scan" "repaired 0B" - log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + log_must verify_draid_pool $TESTPOOL $replace_mode # Distributed spare in mirror with original device faulted log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8 @@ -122,9 +120,7 @@ for replace_mode in "healing" "sequential"; do log_must check_vdev_state $TESTPOOL spare-8 "DEGRADED" log_must check_vdev_state $TESTPOOL draid1-0-1 "ONLINE" log_must check_hotspare_state $TESTPOOL draid1-0-1 "INUSE" - log_must verify_pool $TESTPOOL - log_must check_pool_status $TESTPOOL "scan" "repaired 0B" - log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + log_must verify_draid_pool $TESTPOOL $replace_mode # Distributed spare in mirror with original device still online log_must check_vdev_state $TESTPOOL $BASEDIR/vdev9 "ONLINE" @@ -132,9 +128,7 @@ for replace_mode in "healing" "sequential"; do log_must check_vdev_state $TESTPOOL spare-9 "ONLINE" log_must check_vdev_state $TESTPOOL draid1-0-2 "ONLINE" log_must check_hotspare_state $TESTPOOL draid1-0-2 "INUSE" - log_must verify_pool $TESTPOOL - log_must check_pool_status $TESTPOOL "scan" "repaired 0B" - log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + log_must verify_draid_pool $TESTPOOL $replace_mode # Normal faulted device replacement new_vdev0="$BASEDIR/new_vdev0" @@ -143,9 +137,7 @@ for replace_mode in "healing" "sequential"; do log_must check_vdev_state $TESTPOOL $BASEDIR/vdev0 "FAULTED" log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev0 $new_vdev0 log_must check_vdev_state $TESTPOOL $new_vdev0 "ONLINE" - log_must verify_pool $TESTPOOL - log_must check_pool_status $TESTPOOL "scan" "repaired 0B" - log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + log_must verify_draid_pool $TESTPOOL $replace_mode # Distributed spare faulted device replacement log_must zpool offline -f $TESTPOOL $BASEDIR/vdev2 @@ -154,9 +146,7 @@ for replace_mode in "healing" "sequential"; do log_must check_vdev_state $TESTPOOL spare-2 "DEGRADED" log_must check_vdev_state $TESTPOOL draid1-0-3 "ONLINE" log_must check_hotspare_state $TESTPOOL draid1-0-3 "INUSE" - log_must verify_pool $TESTPOOL - log_must check_pool_status $TESTPOOL "scan" "repaired 0B" - log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + log_must verify_draid_pool $TESTPOOL $replace_mode # Normal online device replacement new_vdev1="$BASEDIR/new_vdev1" @@ -164,9 +154,7 @@ for replace_mode in "healing" "sequential"; do log_must check_vdev_state $TESTPOOL $BASEDIR/vdev1 "ONLINE" log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev1 $new_vdev1 log_must check_vdev_state $TESTPOOL $new_vdev1 "ONLINE" - log_must verify_pool $TESTPOOL - log_must check_pool_status $TESTPOOL "scan" "repaired 0B" - log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + log_must verify_draid_pool $TESTPOOL $replace_mode # Distributed spare online device replacement (then fault) log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev3 draid1-0-4 @@ -176,9 +164,7 @@ for replace_mode in "healing" "sequential"; do log_must zpool offline -f $TESTPOOL $BASEDIR/vdev3 log_must check_vdev_state $TESTPOOL $BASEDIR/vdev3 "FAULTED" log_must check_vdev_state $TESTPOOL spare-3 "DEGRADED" - log_must verify_pool $TESTPOOL - log_must check_pool_status $TESTPOOL "scan" "repaired 0B" - log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + log_must verify_draid_pool $TESTPOOL $replace_mode # Verify the original data is valid log_must is_data_valid $TESTPOOL