diff --git a/include/sys/spa.h b/include/sys/spa.h index 1a84844c5..0de8a1867 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1082,6 +1082,7 @@ extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_load_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); +extern uint64_t spa_open_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_final_dirty_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h index 51e669c2c..b787b1d5d 100644 --- a/include/sys/vdev_rebuild.h +++ b/include/sys/vdev_rebuild.h @@ -91,6 +91,7 @@ boolean_t vdev_rebuild_active(vdev_t *); int vdev_rebuild_load(vdev_t *); void vdev_rebuild(vdev_t *, uint64_t); +void vdev_rebuild_txgs(vdev_t *, uint64_t *, uint64_t *); void vdev_rebuild_stop_wait(vdev_t *); void vdev_rebuild_stop_all(spa_t *); void vdev_rebuild_restart(spa_t *); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 37d741007..9b110f31f 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1891,6 +1891,12 @@ spa_syncing_txg(spa_t *spa) return (spa->spa_syncing_txg); } +uint64_t +spa_open_txg(spa_t *spa) +{ + return (spa->spa_dsl_pool->dp_tx.tx_open_txg); +} + /* * Return the last txg where data can be dirtied. The final txgs * will be used to just clear out any deferred frees that remain. diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 3480b884e..de4161559 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -31,6 +31,7 @@ * Copyright (c) 2019, Datto Inc. All rights reserved. * Copyright (c) 2021, 2025, Klara, Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2026, Seagate Technology, LLC. 
*/ #include @@ -3096,8 +3097,11 @@ vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); - if (!zfs_range_tree_contains(rt, txg, size)) + if (!zfs_range_tree_contains(rt, txg, size)) { + /* Clear whatever is there already. */ + zfs_range_tree_clear(rt, txg, size); zfs_range_tree_add(rt, txg, size); + } mutex_exit(&vd->vdev_dtl_lock); } @@ -5220,11 +5224,13 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || + zio->io_priority == ZIO_PRIORITY_REBUILD || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair - * made by zil_claim() during spa_load() in the first txg. + * made by zil_claim() during spa_load() in the first txg, + * or it's a repair induced by rebuild (sequential resilver). * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, @@ -5235,27 +5241,38 @@ vdev_stat_update(zio_t *zio, uint64_t psize) * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. + * + * For rebuild, since we don't have any information about BPs + * and txgs that are being rebuilt, we need to add all known + * txgs (starting from TXG_INITIAL) to DTL so that during + * healing resilver we would be able to check all txgs at + * vdev_draid_need_resilver(). 
*/ + uint64_t size = 1; if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); - vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); + vdev_dtl_dirty(vd, DTL_SCRUB, txg, size); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); + } else if (zio->io_priority == ZIO_PRIORITY_REBUILD) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + vdev_rebuild_txgs(vd->vdev_top, &txg, &size); + commit_txg = spa_open_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); - if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) - vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); + vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, size); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) - vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); + vdev_dtl_dirty(vd, DTL_MISSING, txg, size); } } diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 8588cfee3..9e02d8682 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1452,13 +1452,6 @@ vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg, /* Transaction group is known to be partially replicated. */ if (vdev_draid_partial(cvd, physical_offset, txg, size)) return (B_TRUE); - - /* - * Always check groups with active distributed spares - * because any vdev failure in the pool will affect them. 
- */ - if (vdev_draid_find_spare(cvd) != NULL) - return (B_TRUE); } return (B_FALSE); diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index bb75bcb49..4fe6dc9d6 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -233,7 +233,7 @@ vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) mutex_enter(&vd->vdev_rebuild_lock); memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; - vrp->vrp_min_txg = 0; + vrp->vrp_min_txg = TXG_INITIAL; vrp->vrp_max_txg = dmu_tx_get_txg(tx); vrp->vrp_start_time = gethrestime_sec(); vrp->vrp_scan_time_ms = 0; @@ -415,7 +415,7 @@ vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx) ASSERT0P(vd->vdev_rebuild_thread); vrp->vrp_last_offset = 0; - vrp->vrp_min_txg = 0; + vrp->vrp_min_txg = TXG_INITIAL; vrp->vrp_max_txg = dmu_tx_get_txg(tx); vrp->vrp_bytes_scanned = 0; vrp->vrp_bytes_issued = 0; @@ -1127,6 +1127,22 @@ vdev_rebuild_stop_all(spa_t *spa) vdev_rebuild_stop_wait(spa->spa_root_vdev); } +/* + * Return rebuild transaction groups range. It's used to populate DTLs + * of the non-writable devices during the rebuild so that they could be + * healed correctly, in case they are cleared, and not miss the data + * that was written to their spares during the rebuild. + */ +void +vdev_rebuild_txgs(vdev_t *vd, uint64_t *min_txg, uint64_t *size) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + *min_txg = vrp->vrp_min_txg; + *size = vrp->vrp_max_txg - vrp->vrp_min_txg; +} + /* * Rebuild statistics reported per top-level vdev. 
*/ diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 913d23d13..974e19c04 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3161,6 +3161,21 @@ function wait_scrubbed #pool timeout done } +# Wait for a pool to be resilvered +# +# $1 pool name +# $2 timeout +# +function wait_resilvered #pool timeout +{ + typeset timeout=${2:-300} + typeset pool=${1:-$TESTPOOL} + for (( timer = 0; timer < $timeout; timer++ )); do + is_pool_resilvered $pool && break; + sleep 1; + done +} + # Backup the zed.rc in our test directory so that we can edit it for our test. # # Returns: Backup file name. You will need to pass this to zed_rc_restore(). diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh index b0f312f26..0604f7f48 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh @@ -19,6 +19,7 @@ # # Copyright (c) 2019, Datto Inc. All rights reserved. # Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2026, Seagate Technology, LLC. # . $STF_SUITE/include/libtest.shlib @@ -82,7 +83,8 @@ for replace_mode in "healing" "sequential"; do log_must check_vdev_state spare-$i "DEGRADED" log_must check_vdev_state $spare_vdev "ONLINE" log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE" - log_must zpool detach $TESTPOOL $fault_vdev + # Preserve the 1st faulted vdev for the next test. 
+ [[ $i -eq 0 ]] || log_must zpool detach $TESTPOOL $fault_vdev log_must verify_pool $TESTPOOL log_must check_pool_status $TESTPOOL "scan" "repaired 0B" log_must check_pool_status $TESTPOOL "scan" "with 0 errors" @@ -93,6 +95,13 @@ for replace_mode in "healing" "sequential"; do log_must is_data_valid $TESTPOOL log_must check_pool_status $TESTPOOL "errors" "No known data errors" + # Verify that after clearing the 1st faulted vdev, all is healed. + log_must zpool clear $TESTPOOL "$BASEDIR/vdev0" + log_must wait_resilvered $TESTPOOL + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + cleanup done