mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-23 02:44:41 +03:00
ZTS: add targeted redundancy_draid_spare exception
When sequentially resilvering a dRAID pool, it's possible that a few correctable checksum errors will be reported. This is a known issue which is occasionally observed in the CI. Until it's resolved we want the test case to tolerate a few checksum errors in this scenario to prevent false positives in the CI. This change also has the side effect of standardizing in one location how the dRAID pool integrity is verified. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #18307 Issue #18319 Closes #18436
This commit is contained in:
committed by
Tony Hutter
parent
887bfc1a64
commit
b0c1dcb531
@@ -52,7 +52,7 @@ function cleanup
|
||||
#
|
||||
function cksum_pool
|
||||
{
|
||||
typeset -i cksum=$(zpool status $1 | awk '
|
||||
typeset -i cksum=$(zpool status -p $1 | awk '
|
||||
!NF { isvdev = 0 }
|
||||
isvdev { errors += $NF }
|
||||
/CKSUM$/ { isvdev = 1 }
|
||||
@@ -349,3 +349,77 @@ function recover_bad_missing_devs
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
#
# Given a dRAID pool, issue a scrub and verify that the resulting pool
# status matches what is expected for the 'replace_mode' passed.
#
# Arguments:
#   $1 - pool name (defaults to $TESTPOOL)
#   $2 - replace_mode (defaults to "healing")
#
# Valid modes are:
#
# 1. healing - The pool is perfectly intact.  No checksum errors have
#    been reported and the scrub didn't make any repairs.  This is the
#    expected state after a healing resilver of a healthy pool.
#
# 2. sequential - The pool is fully intact.  There should never be a
#    checksum error, but the occasional checksum error does occur in
#    practice.  Until the root cause is identified and resolved, tolerate
#    a few checksum errors (at most 3) when scrubbing after a sequential
#    resilver.
#
#    https://github.com/openzfs/zfs/issues/18307
#    https://github.com/openzfs/zfs/issues/18319
#
# 3. damaged - The pool was intentionally silently damaged.  Checksum
#    errors are expected to be reported as the damaged blocks are
#    detected and repaired.
#
# In all of these cases a scrub must be able to successfully repair the
# pool and result in no data loss.  Any violated expectation is reported
# through log_fail after dumping 'zpool status -v' for the pool.
#
function verify_draid_pool
{
	typeset pool=${1:-$TESTPOOL}
	typeset replace_mode=${2:-healing}
	# Most CKSUM errors tolerated by the "sequential" mode (see above).
	typeset -i seq_cksum_max=3

	log_note "verify_draid_pool $pool $replace_mode"

	# Reject an unknown mode up front rather than after a full scrub.
	case "$replace_mode" in
	healing|sequential|damaged)
		;;
	*)
		log_fail "Invalid replace_mode=$replace_mode"
		;;
	esac

	log_must zpool scrub -w "$pool"

	# Total of the CKSUM column as reported by cksum_pool.
	typeset -i cksum=$(cksum_pool "$pool")

	case "$replace_mode" in
	healing)
		# A healthy pool: no checksum errors and nothing repaired.
		if [[ $cksum -gt 0 ]]; then
			log_must zpool status -v "$pool"
			log_fail "Unexpected CKSUM errors found for $pool ($cksum)"
		fi

		if ! check_pool_status "$pool" "scan" "repaired 0B"; then
			log_must zpool status -v "$pool"
			log_fail "Unexpected repair IO found for $pool ($cksum)"
		fi
		;;
	sequential)
		# Known issue: allow a small number of checksum errors.
		if [[ $cksum -gt $seq_cksum_max ]]; then
			log_must zpool status -v "$pool"
			log_fail "Unexpected CKSUM errors found for $pool ($cksum)"
		fi
		;;
	damaged)
		# Damage was injected, so errors and repair IO must show up.
		if [[ $cksum -lt 1 ]]; then
			log_must zpool status -v "$pool"
			log_fail "Expected CKSUM errors missing for $pool ($cksum)"
		fi

		if check_pool_status "$pool" "scan" "repaired 0B"; then
			log_must zpool status -v "$pool"
			log_fail "Expected repair IO missing for $pool ($cksum)"
		fi
		;;
	esac

	# Regardless of mode the scrub must finish without errors ...
	if ! check_pool_status "$pool" "scan" "with 0 errors"; then
		log_must zpool status -v "$pool"
		log_fail "Unexpected repair errors found for $pool"
	fi

	# ... and the pool must not report any data loss.
	if ! check_pool_status "$pool" "errors" "No known data errors"; then
		log_must zpool status -v "$pool"
		log_fail "Unexpected data errors found for $pool"
	fi
}
|
||||
|
||||
@@ -86,8 +86,7 @@ function test_selfheal # <pool> <parity> <dir>
|
||||
# from the files which were read. Before overwriting additional
|
||||
# devices we need to repair all of the blocks in the pool.
|
||||
#
|
||||
log_must zpool scrub -w $pool
|
||||
log_must check_pool_status $pool "errors" "No known data errors"
|
||||
log_must verify_draid_pool $pool "damaged"
|
||||
|
||||
log_must zpool clear $pool
|
||||
|
||||
@@ -104,8 +103,7 @@ function test_selfheal # <pool> <parity> <dir>
|
||||
log_must eval "find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1"
|
||||
log_must check_pool_status $pool "errors" "No known data errors"
|
||||
|
||||
log_must zpool scrub -w $pool
|
||||
log_must check_pool_status $pool "errors" "No known data errors"
|
||||
log_must verify_draid_pool $pool "damaged"
|
||||
|
||||
log_must zpool clear $pool
|
||||
}
|
||||
@@ -182,8 +180,7 @@ function test_scrub # <pool> <parity> <dir>
|
||||
|
||||
log_must zpool import -o cachefile=none -d $dir $pool
|
||||
|
||||
log_must zpool scrub -w $pool
|
||||
log_must check_pool_status $pool "errors" "No known data errors"
|
||||
log_must verify_draid_pool $pool "damaged"
|
||||
|
||||
log_must zpool clear $pool
|
||||
|
||||
@@ -196,8 +193,7 @@ function test_scrub # <pool> <parity> <dir>
|
||||
|
||||
log_must zpool import -o cachefile=none -d $dir $pool
|
||||
|
||||
log_must zpool scrub -w $pool
|
||||
log_must check_pool_status $pool "errors" "No known data errors"
|
||||
log_must verify_draid_pool $pool "damaged"
|
||||
|
||||
log_must zpool clear $pool
|
||||
}
|
||||
|
||||
@@ -89,12 +89,7 @@ function test_sequential_resilver # <pool> <parity> <dir>
|
||||
log_must zpool replace -fsw $pool $dir/dev-$i $spare
|
||||
done
|
||||
|
||||
log_must zpool scrub -w $pool
|
||||
log_must zpool status $pool
|
||||
|
||||
log_mustnot check_pool_status $pool "scan" "repaired 0B"
|
||||
log_must check_pool_status $pool "errors" "No known data errors"
|
||||
log_must check_pool_status $pool "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $pool "damaged"
|
||||
}
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
@@ -121,12 +121,7 @@ for nparity in 1 2 3; do
|
||||
|
||||
# Scrub the pool after the sequential resilver and verify
|
||||
# that the silent damage was repaired by the scrub.
|
||||
log_must zpool scrub -w $TESTPOOL
|
||||
log_must zpool status $TESTPOOL
|
||||
log_must check_pool_status $TESTPOOL "errors" \
|
||||
"No known data errors"
|
||||
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
|
||||
log_mustnot check_pool_status $TESTPOOL "scan" "repaired 0B"
|
||||
log_must verify_draid_pool $TESTPOOL "damaged"
|
||||
done
|
||||
|
||||
for nspare in 0 1 2; do
|
||||
@@ -145,12 +140,7 @@ for nparity in 1 2 3; do
|
||||
done
|
||||
|
||||
log_must zpool clear $TESTPOOL
|
||||
log_must zpool scrub -w $TESTPOOL
|
||||
log_must zpool status $TESTPOOL
|
||||
|
||||
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
|
||||
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
|
||||
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
|
||||
log_must verify_draid_pool $TESTPOOL "healing"
|
||||
|
||||
log_must zpool destroy "$TESTPOOL"
|
||||
done
|
||||
|
||||
@@ -89,12 +89,7 @@ function test_sequential_resilver # <pool> <parity> <dir>
|
||||
spare=draid${nparity}-0-0
|
||||
log_must zpool replace -fsw $pool $dir/dev-$nparity $spare
|
||||
|
||||
log_must zpool scrub -w $pool
|
||||
log_must zpool status $pool
|
||||
|
||||
log_must check_pool_status $pool "scan" "repaired 0B"
|
||||
log_must check_pool_status $pool "errors" "No known data errors"
|
||||
log_must check_pool_status $pool "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $pool "sequential"
|
||||
}
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
@@ -105,12 +105,7 @@ function test_sequential_resilver # <pool> <parity> <dir>
|
||||
|
||||
log_must zpool wait -t resilver $pool
|
||||
|
||||
log_must zpool scrub -w $pool
|
||||
log_must zpool status $pool
|
||||
|
||||
log_must check_pool_status $pool "scan" "repaired 0B"
|
||||
log_must check_pool_status $pool "errors" "No known data errors"
|
||||
log_must check_pool_status $pool "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $pool "sequential"
|
||||
}
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
@@ -85,9 +85,7 @@ for replace_mode in "healing" "sequential"; do
|
||||
log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE"
|
||||
# Preserve the 1st faulted vdev for the next test.
|
||||
[[ $i -eq 0 ]] || log_must zpool detach $TESTPOOL $fault_vdev
|
||||
log_must verify_pool $TESTPOOL
|
||||
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
|
||||
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $TESTPOOL $replace_mode
|
||||
|
||||
(( i += 1 ))
|
||||
done
|
||||
@@ -98,9 +96,7 @@ for replace_mode in "healing" "sequential"; do
|
||||
# Verify that after clearing the 1st faulted vdev, all is healed.
|
||||
log_must zpool clear $TESTPOOL "$BASEDIR/vdev0"
|
||||
log_must wait_resilvered $TESTPOOL
|
||||
log_must verify_pool $TESTPOOL
|
||||
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
|
||||
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $TESTPOOL $replace_mode
|
||||
|
||||
cleanup
|
||||
done
|
||||
|
||||
@@ -60,9 +60,9 @@ log_must zpool offline -f $TESTPOOL $BASEDIR/vdev9
|
||||
log_must zpool replace -w $TESTPOOL $BASEDIR/vdev9 draid1-0-2
|
||||
|
||||
# Verify, refill and verify the pool contents.
|
||||
verify_pool $TESTPOOL
|
||||
log_must verify_draid_pool $TESTPOOL "healing"
|
||||
refill_test_env $TESTPOOL
|
||||
verify_pool $TESTPOOL
|
||||
log_must verify_draid_pool $TESTPOOL "healing"
|
||||
|
||||
# Bring everything back online and check for errors.
|
||||
log_must zpool clear $TESTPOOL
|
||||
@@ -72,9 +72,7 @@ log_must wait_hotspare_state $TESTPOOL draid1-0-0 "AVAIL"
|
||||
log_must wait_hotspare_state $TESTPOOL draid1-0-1 "AVAIL"
|
||||
log_must wait_hotspare_state $TESTPOOL draid1-0-2 "AVAIL"
|
||||
|
||||
log_must zpool scrub -w $TESTPOOL
|
||||
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
|
||||
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $TESTPOOL "healing"
|
||||
|
||||
log_must is_data_valid $TESTPOOL
|
||||
|
||||
|
||||
@@ -111,9 +111,7 @@ for replace_mode in "healing" "sequential"; do
|
||||
log_must zpool detach $TESTPOOL $BASEDIR/vdev7
|
||||
log_must check_vdev_state $TESTPOOL draid1-0-0 "ONLINE"
|
||||
log_must check_hotspare_state $TESTPOOL draid1-0-0 "INUSE"
|
||||
log_must verify_pool $TESTPOOL
|
||||
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
|
||||
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $TESTPOOL $replace_mode
|
||||
|
||||
# Distributed spare in mirror with original device faulted
|
||||
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8
|
||||
@@ -122,9 +120,7 @@ for replace_mode in "healing" "sequential"; do
|
||||
log_must check_vdev_state $TESTPOOL spare-8 "DEGRADED"
|
||||
log_must check_vdev_state $TESTPOOL draid1-0-1 "ONLINE"
|
||||
log_must check_hotspare_state $TESTPOOL draid1-0-1 "INUSE"
|
||||
log_must verify_pool $TESTPOOL
|
||||
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
|
||||
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $TESTPOOL $replace_mode
|
||||
|
||||
# Distributed spare in mirror with original device still online
|
||||
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev9 "ONLINE"
|
||||
@@ -132,9 +128,7 @@ for replace_mode in "healing" "sequential"; do
|
||||
log_must check_vdev_state $TESTPOOL spare-9 "ONLINE"
|
||||
log_must check_vdev_state $TESTPOOL draid1-0-2 "ONLINE"
|
||||
log_must check_hotspare_state $TESTPOOL draid1-0-2 "INUSE"
|
||||
log_must verify_pool $TESTPOOL
|
||||
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
|
||||
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $TESTPOOL $replace_mode
|
||||
|
||||
# Normal faulted device replacement
|
||||
new_vdev0="$BASEDIR/new_vdev0"
|
||||
@@ -143,9 +137,7 @@ for replace_mode in "healing" "sequential"; do
|
||||
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev0 "FAULTED"
|
||||
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev0 $new_vdev0
|
||||
log_must check_vdev_state $TESTPOOL $new_vdev0 "ONLINE"
|
||||
log_must verify_pool $TESTPOOL
|
||||
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
|
||||
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $TESTPOOL $replace_mode
|
||||
|
||||
# Distributed spare faulted device replacement
|
||||
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev2
|
||||
@@ -154,9 +146,7 @@ for replace_mode in "healing" "sequential"; do
|
||||
log_must check_vdev_state $TESTPOOL spare-2 "DEGRADED"
|
||||
log_must check_vdev_state $TESTPOOL draid1-0-3 "ONLINE"
|
||||
log_must check_hotspare_state $TESTPOOL draid1-0-3 "INUSE"
|
||||
log_must verify_pool $TESTPOOL
|
||||
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
|
||||
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $TESTPOOL $replace_mode
|
||||
|
||||
# Normal online device replacement
|
||||
new_vdev1="$BASEDIR/new_vdev1"
|
||||
@@ -164,9 +154,7 @@ for replace_mode in "healing" "sequential"; do
|
||||
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev1 "ONLINE"
|
||||
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev1 $new_vdev1
|
||||
log_must check_vdev_state $TESTPOOL $new_vdev1 "ONLINE"
|
||||
log_must verify_pool $TESTPOOL
|
||||
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
|
||||
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $TESTPOOL $replace_mode
|
||||
|
||||
# Distributed spare online device replacement (then fault)
|
||||
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev3 draid1-0-4
|
||||
@@ -176,9 +164,7 @@ for replace_mode in "healing" "sequential"; do
|
||||
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev3
|
||||
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev3 "FAULTED"
|
||||
log_must check_vdev_state $TESTPOOL spare-3 "DEGRADED"
|
||||
log_must verify_pool $TESTPOOL
|
||||
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
|
||||
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
|
||||
log_must verify_draid_pool $TESTPOOL $replace_mode
|
||||
|
||||
# Verify the original data is valid
|
||||
log_must is_data_valid $TESTPOOL
|
||||
|
||||
Reference in New Issue
Block a user