draid: allow seq resilver reads from degraded vdevs

When sequentially resilvering, allow a dRAID child to be read
as long as the DTLs indicate it should have a good copy of the
data and the leaf isn't being rebuilt.  The previous check was
slightly too broad and would skip dRAID spare and replacing
vdevs if one of their children was being replaced.  As long
as there exists enough additional redundancy this is fine, but
when there isn't this vdev must be read in order to correctly
reconstruct the missing data.

A new test case has been added which exhausts the available
redundancy, faults another device causing it to be degraded,
and then performs a sequential resilver for the degraded device.
In such a situation enough redundancy exists to perform the
replacement and a scrub should detect no checksum errors.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Andriy Tkachuk <andriy.tkachuk@seagate.com>
Reviewed-by: Akash B <akash-b@hpe.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18405
This commit is contained in:
Brian Behlendorf
2026-04-07 10:48:27 -07:00
committed by Tony Hutter
parent 63b8da8ff7
commit e9a8c6e080
6 changed files with 162 additions and 35 deletions
+9 -28
View File
@@ -1191,7 +1191,7 @@ vdev_draid_min_alloc(vdev_t *vd)
}
/*
* Returns true if the txg range does not exist on any leaf vdev.
* Returns false if the txg range exists on any leaf vdev, true otherwise.
*
* A dRAID spare does not fit into the DTL model. While it has child vdevs
* there is no redundancy among them, and the effective child vdev is
@@ -1932,34 +1932,15 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
vdev_t *svd;
/*
* Sequential rebuilds need to always consider the data
* on the child being rebuilt to be stale. This is
* important when all columns are available to aid
* known reconstruction in identifing which columns
* contain incorrect data.
*
* Furthermore, all repairs need to be constrained to
* the devices being rebuilt because without a checksum
* we cannot verify the data is actually correct and
* performing an incorrect repair could result in
* locking in damage and making the data unrecoverable.
* Repairs need to be constrained to the devices being
* rebuilt since without a checksum we cannot verify the
* data is actually correct and performing an incorrect
* repair could result in locking in the damage and
* making the data unrecoverable.
*/
if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
if (vdev_draid_rebuilding(cvd)) {
if (c >= rr->rr_firstdatacol)
rr->rr_missingdata++;
else
rr->rr_missingparity++;
rc->rc_error = SET_ERROR(ESTALE);
rc->rc_skipped = 1;
rc->rc_allow_repair = 1;
continue;
} else {
rc->rc_allow_repair = 0;
}
} else {
rc->rc_allow_repair = 1;
}
if (zio->io_priority == ZIO_PRIORITY_REBUILD &&
!vdev_draid_rebuilding(cvd))
rc->rc_allow_repair = 0;
/*
* If this child is a distributed spare then the
+8 -3
View File
@@ -674,9 +674,14 @@ vdev_mirror_io_start(zio_t *zio)
/*
* When sequentially resilvering only issue write repair
* IOs to the vdev which is being rebuilt since performance
* is limited by the slowest child. This is an issue for
* faster replacement devices such as distributed spares.
* IOs to the vdev which is being rebuilt for two reasons:
* 1. The repair IO data calculated from parity has no checksum
* to validate and could be incorrect. Existing data must
* never be overwritten with unconfirmed data to ensure we
* never lock in unrecoverable damage to the pool.
* 2. Performance is limited by the slowest child device. We
* don't want a slower device to limit the rebuild rate for
* faster replacement devices such as distributed spares.
*/
if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+3 -2
View File
@@ -913,8 +913,9 @@ timeout = 1200
[tests/functional/redundancy]
tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
'redundancy_draid3', 'redundancy_draid_damaged1',
'redundancy_draid_damaged2', 'redundancy_draid_spare1',
'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror',
'redundancy_draid_damaged2', 'redundancy_draid_degraded1',
'redundancy_draid_spare1', 'redundancy_draid_spare2',
'redundancy_draid_spare3', 'redundancy_mirror',
'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',
'redundancy_raidz3', 'redundancy_stripe']
tags = ['functional', 'redundancy']
-2
View File
@@ -252,8 +252,6 @@ maybe = {
'projectquota/setup': ['SKIP', exec_reason],
'raidz/raidz_002_pos': ['FAIL', known_reason],
'raidz/raidz_expand_001_pos': ['FAIL', 16421],
'redundancy/redundancy_draid_spare1': ['FAIL', 18307],
'redundancy/redundancy_draid_spare3': ['FAIL', 18319],
'removal/removal_condense_export': ['FAIL', known_reason],
'renameat2/setup': ['SKIP', renameat2_reason],
'reservation/reservation_008_pos': ['FAIL', 7741],
+1
View File
@@ -1890,6 +1890,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/redundancy/redundancy_draid3.ksh \
functional/redundancy/redundancy_draid_damaged1.ksh \
functional/redundancy/redundancy_draid_damaged2.ksh \
functional/redundancy/redundancy_draid_degraded1.ksh \
functional/redundancy/redundancy_draid.ksh \
functional/redundancy/redundancy_draid_spare1.ksh \
functional/redundancy/redundancy_draid_spare2.ksh \
@@ -0,0 +1,141 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2026 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib
#
# DESCRIPTION:
# When sequentially resilvering a dRAID pool with multiple vdevs
# and N faulted vdevs, where N=parity, ensure that when another leaf
# is marked degraded the pool can still be sequentially resilvered
# without introducing new checksum errors. Note we've exhausted
# the available redundancy so no silent correction can be tolerated.
#
# STRATEGY:
# 1. Create block device files for the test draid pool
# 2. For each parity value [1..3]
# - create draid pool
# - fill it with some directories/files
# - fault N=parity vdevs eliminating any redundancy
# - force fault an additional vdev causing it to be degraded
# - replace the degraded (but online) vdev using a sequential
# resilver. The minimum pool redundancy requirements are met so
# reconstruction is possible when reading from all online vdevs.
# - verify that the draid spare was correctly reconstructed and
# no checksum errors were introduced.
# - destroy the draid pool
#
# Number of file-backed devices placed in each dRAID pool (dev-0 through
# dev-$((devs - 1))).  One further file, dev-$devs, is created as well and
# is removed along with the others by cleanup().
typeset -r devs=7
# Size, in megabytes, of each file-backed pool device.
typeset -r dev_size_mb=512
# Paths of the backing files handed to 'zpool create'.
typeset -a disks
# Tunable values captured before the test overrides them so that cleanup()
# can restore them.
prefetch_disable=$(get_tunable PREFETCH_DISABLE)
rebuild_scrub_enabled=$(get_tunable REBUILD_SCRUB_ENABLED)
#
# Exit handler: destroy the test pool if it is still present, delete every
# backing file (dev-0 through dev-$devs inclusive), and put the tunables
# back to the values captured when the script started.
#
function cleanup
{
	if poolexists "$TESTPOOL"; then
		destroy_pool "$TESTPOOL"
	fi

	# The upper bound is inclusive so the extra dev-$devs file is
	# removed together with the pool members.
	for (( i = 0; i <= devs; i++ )); do
		rm -f "$TEST_BASE_DIR/dev-$i"
	done

	set_tunable32 PREFETCH_DISABLE $prefetch_disable
	set_tunable32 REBUILD_SCRUB_ENABLED $rebuild_scrub_enabled
}
#
# Exhaust the pool's redundancy, degrade one more device, replace that
# device with a distributed spare using a sequential resilver, and verify
# with a scrub that nothing needed repair.
#
# $1 - pool name
# $2 - dRAID parity level; also the number of devices faulted up front
# $3 - directory holding the dev-N backing files
#
# Every step goes through log_must, so any failing command fails the test.
#
function test_sequential_resilver # <pool> <parity> <dir>
{
	typeset pool=$1
	typeset nparity=$2
	typeset dir=$3
	# Keep the helper variables function-local so they do not leak into
	# (or clobber) the caller's globals.
	typeset spare=draid${nparity}-0-0
	typeset i

	# Fault N=parity devices
	for (( i = 0; i < nparity; i++ )); do
		log_must zpool offline -f "$pool" "$dir/dev-$i"
	done

	# Parity is exhausted, faulting another device marks it degraded
	log_must zpool offline -f "$pool" "$dir/dev-$nparity"

	# Replace the degraded vdev with a distributed spare using a
	# sequential resilver (-s) and wait for it to finish (-w).
	log_must zpool replace -fsw "$pool" "$dir/dev-$nparity" "$spare"

	# A full scrub must find nothing to repair and report no errors.
	log_must zpool scrub -w "$pool"
	log_must zpool status "$pool"

	log_must check_pool_status "$pool" "scan" "repaired 0B"
	log_must check_pool_status "$pool" "errors" "No known data errors"
	log_must check_pool_status "$pool" "scan" "with 0 errors"
}
# Register the exit handler before changing any state so the tunables and
# backing files are always cleaned up.
log_onexit cleanup

# Override the tunables for the duration of the test; the original values
# were captured above and are restored by cleanup().
log_must set_tunable32 PREFETCH_DISABLE 1
log_must set_tunable32 REBUILD_SCRUB_ENABLED 0

# Disk files which will be used by pool
for i in {0..$(($devs - 1))}; do
	device=$TEST_BASE_DIR/dev-$i
	log_must truncate -s ${dev_size_mb}M $device
	disks[${#disks[*]}+1]=$device
done

# Disk file which will be attached.  Sized from the same constant as the
# pool members so the two can never drift apart.
# NOTE(review): nothing in this script passes dev-$devs to zpool; it is only
# created here and removed in cleanup() -- confirm whether it is still needed.
log_must truncate -s ${dev_size_mb}M $TEST_BASE_DIR/dev-$devs

# Run the scenario once per parity level, using as many distributed spares
# as parity devices.
for nparity in 1 2 3; do
	raid=draid${nparity}:${nparity}s
	dir=$TEST_BASE_DIR

	log_must zpool create -O compression=off -f -o cachefile=none \
	    $TESTPOOL $raid "${disks[@]}"
	log_must zfs set primarycache=metadata $TESTPOOL

	# Populate three datasets: uncompressed, compressed, and compressed
	# with an 8k recordsize.
	log_must zfs create $TESTPOOL/fs
	log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R

	log_must zfs create -o compress=on $TESTPOOL/fs2
	log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R

	log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3
	log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R

	# Export and re-import the pool, then confirm it is error free
	# before any device is faulted.
	log_must zpool export $TESTPOOL
	log_must zpool import -o cachefile=none -d $dir $TESTPOOL

	log_must check_pool_status $TESTPOOL "errors" "No known data errors"

	test_sequential_resilver $TESTPOOL $nparity $dir

	log_must zpool destroy "$TESTPOOL"
done

log_pass "draid degraded device(s) test succeeded."