draid: allow seq resilver reads from degraded vdevs

When sequentially resilvering, allow a dRAID child to be read as long as the DTLs indicate it should have a good copy of the data and the leaf isn't being rebuilt. The previous check was slightly too broad and would skip dRAID spare and replacing vdevs if one of their children was being replaced. As long as there exists enough additional redundancy this is fine, but when there isn't, this vdev must be read in order to correctly reconstruct the missing data.

A new test case has been added which exhausts the available redundancy, faults another device causing it to be degraded, and then performs a sequential resilver for the degraded device. In such a situation enough redundancy exists to perform the replacement and a scrub should detect no checksum errors.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Andriy Tkachuk <andriy.tkachuk@seagate.com>
Reviewed-by: Akash B <akash-b@hpe.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18405
2026-05-24 03:08:51 +03:00 · 2026-04-07 10:48:27 -07:00
parent 63b8da8ff7
commit e9a8c6e080
6 changed files with 162 additions and 35 deletions
@@ -1191,7 +1191,7 @@ vdev_draid_min_alloc(vdev_t *vd)
 }

 /*
- * Returns true if the txg range does not exist on any leaf vdev.
+ * Returns false if the txg range exists on any leaf vdev, true otherwise.
 *
 * A dRAID spare does not fit into the DTL model. While it has child vdevs
 * there is no redundancy among them, and the effective child vdev is
@@ -1932,34 +1932,15 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
 			vdev_t *svd;

 			/*
-			 * Sequential rebuilds need to always consider the data
-			 * on the child being rebuilt to be stale.  This is
-			 * important when all columns are available to aid
-			 * known reconstruction in identifing which columns
-			 * contain incorrect data.
-			 *
-			 * Furthermore, all repairs need to be constrained to
-			 * the devices being rebuilt because without a checksum
-			 * we cannot verify the data is actually correct and
-			 * performing an incorrect repair could result in
-			 * locking in damage and making the data unrecoverable.
+			 * Repairs need to be constrained to the devices being
+			 * rebuilt since without a checksum we cannot verify the
+			 * data is actually correct and performing an incorrect
+			 * repair could result in locking in the damage and
+			 * making the data unrecoverable.
 			 */
-			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
-				if (vdev_draid_rebuilding(cvd)) {
-					if (c >= rr->rr_firstdatacol)
-						rr->rr_missingdata++;
-					else
-						rr->rr_missingparity++;
-					rc->rc_error = SET_ERROR(ESTALE);
-					rc->rc_skipped = 1;
-					rc->rc_allow_repair = 1;
-					continue;
-				} else {
-					rc->rc_allow_repair = 0;
-				}
-			} else {
-				rc->rc_allow_repair = 1;
-			}
+			if (zio->io_priority == ZIO_PRIORITY_REBUILD &&
+			    !vdev_draid_rebuilding(cvd))
+				rc->rc_allow_repair = 0;

 			/*
 			 * If this child is a distributed spare then the
@@ -674,9 +674,14 @@ vdev_mirror_io_start(zio_t *zio)

 		/*
 		 * When sequentially resilvering only issue write repair
-		 * IOs to the vdev which is being rebuilt since performance
-		 * is limited by the slowest child.  This is an issue for
-		 * faster replacement devices such as distributed spares.
+		 * IOs to the vdev which is being rebuilt for two reasons:
+		 * 1. The repair IO data calculated from parity has no checksum
+		 *    to validate and could be incorrect.  Existing data must
+		 *    never be overwritten with unconfirmed data to ensure we
+		 *    never lock in unrecoverable damage to the pool.
+		 * 2. Performance is limited by the slowest child device.  We
+		 *    don't want a slower device to limit the rebuild rate for
+		 *    faster replacement devices such as distributed spares.
 		 */
 		if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
 		    (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
@@ -913,8 +913,9 @@ timeout = 1200
 [tests/functional/redundancy]
 tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
    'redundancy_draid3', 'redundancy_draid_damaged1',
-    'redundancy_draid_damaged2', 'redundancy_draid_spare1',
-    'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror',
+    'redundancy_draid_damaged2', 'redundancy_draid_degraded1',
+    'redundancy_draid_spare1', 'redundancy_draid_spare2',
+    'redundancy_draid_spare3', 'redundancy_mirror',
    'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',
    'redundancy_raidz3', 'redundancy_stripe']
 tags = ['functional', 'redundancy']
@@ -252,8 +252,6 @@ maybe = {
    'projectquota/setup': ['SKIP', exec_reason],
    'raidz/raidz_002_pos': ['FAIL', known_reason],
    'raidz/raidz_expand_001_pos': ['FAIL', 16421],
-    'redundancy/redundancy_draid_spare1': ['FAIL', 18307],
-    'redundancy/redundancy_draid_spare3': ['FAIL', 18319],
    'removal/removal_condense_export': ['FAIL', known_reason],
    'renameat2/setup': ['SKIP', renameat2_reason],
    'reservation/reservation_008_pos': ['FAIL', 7741],
@@ -1890,6 +1890,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/redundancy/redundancy_draid3.ksh \
 	functional/redundancy/redundancy_draid_damaged1.ksh \
 	functional/redundancy/redundancy_draid_damaged2.ksh \
+	functional/redundancy/redundancy_draid_degraded1.ksh \
 	functional/redundancy/redundancy_draid.ksh \
 	functional/redundancy/redundancy_draid_spare1.ksh \
 	functional/redundancy/redundancy_draid_spare2.ksh \
@@ -0,0 +1,141 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib
+
+#
+# DESCRIPTION:
+#	When sequentially resilvering a dRAID pool with multiple vdevs
+#	and N faulted vdevs, where N=parity, ensure that when another leaf
+#	is marked degraded the pool can still be sequentially resilvered
+#	without introducing new checksum errors.  Note we've exhausted
+#	the available redundancy so no silent correction can be tolerated.
+#
+# STRATEGY:
+#	1. Create block device files for the test draid pool
+#	2. For each parity value [1..3]
+#	    - create draid pool
+#	    - fill it with some directories/files
+#	    - fault N=parity vdevs eliminating any redundancy
+#	    - force fault an additional vdev causing it to be degraded
+#	    - replace the degraded (but online) vdev using a sequential
+#	      resilver.  The minimum pool redundancy requirements are met so
+#	      reconstruction is possible when reading from all online vdevs.
+#	    - verify that the draid spare was correctly reconstructed and
+#	      no checksum errors were introduced.
+#	    - destroy the draid pool
+#
+
+# Number of file-backed devices that make up each test pool (dev-0 .. dev-6).
+typeset -r devs=7
+# Size of each backing file in megabytes (passed to truncate as <n>M).
+typeset -r dev_size_mb=512
+
+# Paths of the backing files handed to 'zpool create'.
+typeset -a disks
+
+# Remember the current tunable values so cleanup can restore them.
+prefetch_disable=$(get_tunable PREFETCH_DISABLE)
+rebuild_scrub_enabled=$(get_tunable REBUILD_SCRUB_ENABLED)
+
+#
+# Exit handler: destroy the test pool if it still exists, delete every
+# backing file (dev-0 through dev-$devs) and restore the saved tunables.
+#
+function cleanup
+{
+	if poolexists "$TESTPOOL"; then
+		destroy_pool "$TESTPOOL"
+	fi
+
+	# Inclusive upper bound: dev-$devs is the extra file created
+	# outside of the pool member loop.
+	for (( i = 0; i <= devs; i++ )); do
+		rm -f "$TEST_BASE_DIR/dev-$i"
+	done
+
+	set_tunable32 PREFETCH_DISABLE $prefetch_disable
+	set_tunable32 REBUILD_SCRUB_ENABLED $rebuild_scrub_enabled
+}
+
+#
+# Exhaust the pool's parity by faulting <parity> devices, force-fault one
+# more device so that it is left degraded, then sequentially resilver that
+# device onto the first distributed spare.  A follow-up scrub must report
+# nothing repaired and no errors.
+#
+# $1 - pool name
+# $2 - dRAID parity level of the pool
+# $3 - directory containing the dev-<n> backing files
+#
+function test_sequential_resilver # <pool> <parity> <dir>
+{
+	typeset pool=$1
+	typeset nparity=$2
+	typeset dir=$3
+	# Keep the loop index and spare name local to this function.
+	typeset -i i
+	typeset spare=draid${nparity}-0-0
+
+	# Fault N=parity devices
+	for (( i=0; i<$nparity; i=i+1 )); do
+		log_must zpool offline -f $pool $dir/dev-$i
+	done
+
+	# Parity is exhausted, faulting another device marks it degraded
+	log_must zpool offline -f $pool $dir/dev-$nparity
+
+	# Replace the degraded vdev with a distributed spare using a
+	# sequential resilver (-s) and wait for it to finish (-w).
+	log_must zpool replace -fsw $pool $dir/dev-$nparity $spare
+
+	# Scrub to verify the rebuilt data against its checksums.
+	log_must zpool scrub -w $pool
+	log_must zpool status $pool
+
+	# The scrub must not have needed to repair anything.
+	log_must check_pool_status $pool "scan" "repaired 0B"
+	log_must check_pool_status $pool "errors" "No known data errors"
+	log_must check_pool_status $pool "scan" "with 0 errors"
+}
+
+log_onexit cleanup
+
+# Override the tunables for the duration of the test; cleanup restores
+# the values saved above.
+# NOTE(review): REBUILD_SCRUB_ENABLED is presumably turned off so that the
+# explicit scrub in test_sequential_resilver is the only verification
+# pass - confirm.
+log_must set_tunable32 PREFETCH_DISABLE 1
+log_must set_tunable32 REBUILD_SCRUB_ENABLED 0
+
+# Disk files which will be used by pool
+for i in {0..$(($devs - 1))}; do
+	device=$TEST_BASE_DIR/dev-$i
+	log_must truncate -s ${dev_size_mb}M $device
+	disks[${#disks[*]}+1]=$device
+done
+
+# Extra disk file outside of the pool, sized like the pool members
+log_must truncate -s ${dev_size_mb}M $TEST_BASE_DIR/dev-$devs
+
+# Run the scenario once for each parity value [1..3].
+for nparity in 1 2 3; do
+	# dRAID layout with <nparity> parity and <nparity> distributed spares.
+	raid=draid${nparity}:${nparity}s
+	dir=$TEST_BASE_DIR
+
+	log_must zpool create -O compression=off -f -o cachefile=none $TESTPOOL $raid ${disks[@]}
+	# NOTE(review): presumably limits caching to metadata so file data
+	# is read back from the vdevs - confirm.
+	log_must zfs set primarycache=metadata $TESTPOOL
+
+	# Populate three datasets: one inheriting compression=off, one with
+	# compression enabled, and one with compression and an 8k recordsize.
+	log_must zfs create $TESTPOOL/fs
+	log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R
+
+	log_must zfs create -o compress=on $TESTPOOL/fs2
+	log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R
+
+	log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3
+	log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R
+
+	# Export and re-import the pool before injecting any faults.
+	# NOTE(review): presumably to flush all data to disk and drop cached
+	# state - confirm.
+	log_must zpool export $TESTPOOL
+	log_must zpool import -o cachefile=none -d $dir $TESTPOOL
+
+	# The pool must be healthy before the fault sequence starts.
+	log_must check_pool_status $TESTPOOL "errors" "No known data errors"
+
+	test_sequential_resilver $TESTPOOL $nparity $dir
+
+	# Destroy the pool so the next parity value starts from scratch.
+	log_must zpool destroy "$TESTPOOL"
+done
+
+log_pass "draid degraded device(s) test succeeded."