diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 8c8dcfb07..3ff2a4d94 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -119,6 +119,7 @@ typedef struct raidz_col { uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */ uint8_t rc_force_repair:1; /* Write good data to this column */ uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */ + uint8_t rc_tgt_is_dspare:1; /* The target is draid spare vdev */ uint8_t rc_latency_outlier:1; /* Latency outlier for this device */ int rc_shadow_devidx; /* for double write during expansion */ int rc_shadow_error; /* for double write during expansion */ diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 48e5fbd4b..576d88c3c 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -23,6 +23,7 @@ * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2026, Wasabi Technologies, Inc. */ #include @@ -1249,8 +1250,7 @@ vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg, if (vd == NULL) return (B_TRUE); - return (vdev_draid_missing(vd, physical_offset, - txg, size)); + return (vdev_draid_missing(vd, physical_offset, txg, size)); } return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); @@ -1909,12 +1909,34 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) } if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) { + vdev_t *svd; + if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; else rr->rr_missingparity++; rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; + + /* + * If this child has draid spare attached, and that + * spare by rc_offset maps to another spare, the repair + * would go to that spare, and we want all mirrored + * children on it to be updated with the repaired data, + * even when we cannot vouch for it during rebuilds + * (which don't have checksums). 
Otherwise, we will have + * a lot of checksum errors on those spares during scrub. + * The worst thing that can happen in this case is that + * we will update the reserved spare column on some + * device with unverified data, which is harmless. + */ + if ((svd = vdev_draid_find_spare(cvd)) != NULL) { + svd = vdev_draid_spare_get_child(svd, + rc->rc_offset); + if (svd && (svd->vdev_ops == &vdev_spare_ops || + svd->vdev_ops == &vdev_replacing_ops)) + rc->rc_tgt_is_dspare = 1; + } continue; } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 2048aa5a2..35a4a5beb 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -669,23 +669,19 @@ vdev_mirror_io_start(zio_t *zio) } while (children--) { - mc = &mm->mm_child[c]; - c++; + mc = &mm->mm_child[c++]; /* - * When sequentially resilvering only issue write repair - * IOs to the vdev which is being rebuilt for two reasons: - * 1. The repair IO data calculated from parity has no checksum - * to validate and could be incorrect. Existing data must - * never be overwritten with unconfirmed data to ensure we - * never lock in unrecoverable damage to the pool. - * 2. Performance is limited by the slowest child device. We - * don't want a slower device to limit the rebuild rate for - * faster replacement devices such as distributed spares. + * When sequentially resilvering and the integrity of the data + * is speculative (ZIO_FLAG_SPECULATIVE), issue write repair IOs + * only to the vdev which is being rebuilt. Existing data on + * other children must never be overwritten with unconfirmed + * data to avoid unrecoverable damage to the pool. 
*/ if ((zio->io_priority == ZIO_PRIORITY_REBUILD) && (zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SCRUB) && + (zio->io_flags & ZIO_FLAG_SPECULATIVE) && mm->mm_rebuilding && !mc->mc_rebuilding) { continue; } diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 520ddd692..6eb8d44cb 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -25,6 +25,7 @@ * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2026, Wasabi Technologies, Inc. */ #include @@ -3104,6 +3105,7 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) int parity_errors = 0; int parity_untried = 0; int data_errors = 0; + zio_flag_t add_flags = 0; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); @@ -3134,10 +3136,30 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) * Note that we also regenerate parity when resilvering so we * can write it out to failed devices later. */ - if (parity_errors + parity_untried < - rr->rr_firstdatacol - data_errors || - (zio->io_flags & ZIO_FLAG_RESILVER)) { + boolean_t parity_verify = (parity_errors + parity_untried) < + (rr->rr_firstdatacol - data_errors); + if (parity_verify || (zio->io_flags & ZIO_FLAG_RESILVER)) { int n = raidz_parity_verify(zio, rr); + /* + * In Reed-Solomon encoding, if we have ndata+1 columns and + * the parity doesn't match, it means the data integrity is + * compromised. We shouldn't try to repair anything in this + * case. + */ + if (parity_verify && n > 0 && + zio->io_priority == ZIO_PRIORITY_REBUILD) + return; + /* + * If we have only ndata columns, the data integrity will + * be checked by the checksums normally, but not in case + * of rebuild when we don't have checksums. In this case, + * we add ZIO_FLAG_SPECULATIVE and try to not spread + * unverified data. 
For example, when the target vdev happens + * to be the mirroring spare vdev, we would repair only that + * child in it which is being rebuilt. + */ + if (!parity_verify && zio->io_priority == ZIO_PRIORITY_REBUILD) + add_flags |= ZIO_FLAG_SPECULATIVE; unexpected_errors += n; } @@ -3163,13 +3185,27 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) */ ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ); + /* + * When the target vdev is draid spare, we should clear + * ZIO_FLAG_SPECULATIVE. First, if that draid spare maps + * to another spare having an online/degraded disk, that + * disk must be repaired also. Otherwise, the scrub will + * detect a lot of cksum errors later. Second, since it + * is draid spare, there is no harm in updating its + * content on any vdev it maps to because the space is + * reserved as a spare anyway. + */ + zio_flag_t aflags = add_flags; + if (rc->rc_tgt_is_dspare) + aflags &= ~ZIO_FLAG_SPECULATIVE; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, zio->io_priority == ZIO_PRIORITY_REBUILD ? ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? - ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); + ZIO_FLAG_SELF_HEAL : 0) | aflags, NULL, NULL)); } } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a48854563..f94edf20a 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1669,9 +1669,11 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, /* * If we've decided to do a repair, the write is not speculative -- - * even if the original read was. + * even if the original read was. Rebuild is an exception since we + * cannot always ensure its data integrity. 
*/ - if (flags & ZIO_FLAG_IO_REPAIR) + if ((flags & ZIO_FLAG_IO_REPAIR) && + pio->io_priority != ZIO_PRIORITY_REBUILD) flags &= ~ZIO_FLAG_SPECULATIVE; /* diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index ea4d2b2f5..10bee031b 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -914,6 +914,7 @@ timeout = 1200 tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2', 'redundancy_draid3', 'redundancy_draid_damaged1', 'redundancy_draid_damaged2', 'redundancy_draid_degraded1', + 'redundancy_draid_degraded2', 'redundancy_draid_spare1', 'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2', diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index c5a448ac9..c55f40500 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1891,6 +1891,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/redundancy/redundancy_draid_damaged1.ksh \ functional/redundancy/redundancy_draid_damaged2.ksh \ functional/redundancy/redundancy_draid_degraded1.ksh \ + functional/redundancy/redundancy_draid_degraded2.ksh \ functional/redundancy/redundancy_draid.ksh \ functional/redundancy/redundancy_draid_spare1.ksh \ functional/redundancy/redundancy_draid_spare2.ksh \ diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded2.ksh new file mode 100755 index 000000000..8d102627f --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded2.ksh @@ -0,0 +1,157 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2026 by Wasabi Technologies, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# When sequentially resilvering a dRAID pool with multiple vdevs +# and N faulted vdevs, where N=parity, ensure that when another leaf +# is marked degraded the pool can still be sequentially resilvered +# without introducing new checksum errors. Note we've exhausted +# the available redundancy so no silent correction can be tolerated. +# +# This test is very similar to redundancy_draid_degraded1 and is +# based on it. The difference is that 1) we always have some faulted +# vdev which is already resilvered, and 2) we resilver the most +# recently faulted, but marked degraded due to redundancy exhaustion, +# vdev also. +# +# STRATEGY: +# 1. Create block device files for the test draid pool +# 2. For parity value 3 (the only value the loop currently runs) +# - create draid pool +# - fill it with some directories/files +# - fault one vdev and resilver it +# - fault N=parity vdevs eliminating any redundancy +# - force fault an additional vdev causing it to be degraded +# - replace faulted vdevs using a sequential resilver. +# The minimum pool redundancy requirements are met so +# reconstruction is possible when reading from all online vdevs. 
+# - verify that the draid spare was correctly reconstructed and +# no checksum errors were introduced. +# - destroy the draid pool +# + +typeset -r devs=13 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) +rebuild_scrub_enabled=$(get_tunable REBUILD_SCRUB_ENABLED) +scan_suspend_progress=$(get_tunable SCAN_SUSPEND_PROGRESS) + +function cleanup +{ + poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL" + + for i in {0..$devs}; do + rm -f "$TEST_BASE_DIR/dev-$i" + done + + set_tunable32 PREFETCH_DISABLE $prefetch_disable + set_tunable32 REBUILD_SCRUB_ENABLED $rebuild_scrub_enabled + set_tunable32 SCAN_SUSPEND_PROGRESS $scan_suspend_progress +} + +function test_sequential_resilver # +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + # Fault N=parity devices + for (( i=0; i<$nparity; i++ )); do + log_must zpool offline -f $pool $dir/dev-$i + done + + # Parity is exhausted, faulting another device marks it degraded + log_must zpool offline -f $pool $dir/dev-$nparity + + # Replace all faulted vdevs with distributed spares + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + for (( i=0; i<$((nparity+1)); i++ )); do + spare=draid${nparity}-0-$i + log_must zpool replace -fs $pool $dir/dev-$i $spare + done + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + + log_must zpool wait -t resilver $pool + + log_must zpool scrub -w $pool + log_must zpool status $pool + + log_must check_pool_status $pool "scan" "repaired 0B" + log_must check_pool_status $pool "errors" "No known data errors" + log_must check_pool_status $pool "scan" "with 0 errors" +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 +log_must set_tunable32 REBUILD_SCRUB_ENABLED 0 + +# Disk files which will be used by pool +for i in {0..$(($devs - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +# Disk file which will be attached +log_must truncate -s 512M 
$TEST_BASE_DIR/dev-$devs + +for nparity in 3; do + raid=draid${nparity}:$((nparity+2))s + dir=$TEST_BASE_DIR + + log_must zpool create -O compression=off -f -o cachefile=none $TESTPOOL $raid ${disks[@]} + log_must zfs set primarycache=metadata $TESTPOOL + + log_must zfs create $TESTPOOL/fs + log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R + + log_must zfs create -o compress=on $TESTPOOL/fs2 + log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R + + log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 + log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R + + log_must zpool export $TESTPOOL + log_must zpool import -o cachefile=none -d $dir $TESTPOOL + + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + test_sequential_resilver $TESTPOOL $nparity $dir + + log_must zpool destroy "$TESTPOOL" +done + +log_pass "draid degraded device(s) test succeeded."