From 938c8c98b1be2826b104280976fae0d3dd4a6a5c Mon Sep 17 00:00:00 2001
From: Andriy Tkachuk <andriy.tkachuk@seagate.com>
Date: Wed, 11 Mar 2026 21:54:20 +0000
Subject: [PATCH] draid: fix data corruption after disk clear

Currently, when there there are several faulted disks with attached
dRAID spares, and one of those disks is cleared from errors (zpool
clear), followed by its spare being detached, the data in all the
remaining spares that were attached while the cleared disk was in
FAULTED state might get corrupted (which can be seen by running scrub).
In some cases, when too many disks get cleared at a time, this can
result in data corruption/loss.

dRAID spare is a virtual device whose blocks are distributed among
other disks. Those disks can be also in FAULTED state with attached
spares on their own. When a disk gets sequentially resilvered (rebuilt),
the changes made by that resilvering won't get captured in the DTL
(Dirty Time Log) of other FAULTED disks with the attached spares to
which the data is written during the resilvering (as it would normally
be done for the changes made by the user if a new file is written or
some existing one is deleted). It is because sequential resilvering
works on the block level, without touching or looking into metadata,
so it doesn't know anything about the old BPs or transactions groups
that it is resilvering. So later on, when that disk gets cleared
from errors and healing resilvering is trying to sync all the data
from its spare onto it, all the changes made on its spare during the
resilvering of other disks will be missed because they won't be
captured in its DTL. That's why other dRAID spares may get corrupted.

Here's another way to explain it that might be helpful. Imagine a
scenario:

1. d1 fails and gets resilvered to some spare s1 - OK.
2. d2 fails and gets sequentially resilvered on draid spare s2. Now,
   in some slices, s2 would map to d1, which is failed. But d1 has s1
   spare attached, so the data from that resilvering goes to s1, but
   not recorded in d1's DTL.
3. Now, d1 gets cleared and its s1 gets detached. All the changes
   done by the user (writes or deletions) have their txgs captured
   in d1's DTL, so they will be resilvered by the healing resilver
   from its spare (s1) - that part works fine. But the data which
   was written during resilvering of d2 and went to s1 - that one
   will be missed from d1's DTL and won't get resilvered to it. So
   here we are:
4. s2 under d2 is corrupted in the slices which map to d1, because
   d1 doesn't have that data resilvered from s1.

Now, if there are more failed disks with draid spares attached which
were sequentially resilvered while d1 was failed, d3+s3, d4+s4 and
so on - all their spares will be corrupted. Because, in some slices,
each of them will map to d1 which will miss their data.

Solution: add all known txgs starting from TXG_INITIAL to DTLs of
non-writable devices during sequential resilvering so when healing
resilver starts on disk clear, it would be able to check and heal
blocks from all txgs.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Akash B <akash-b@hpe.com>
Signed-off-by: Andriy Tkachuk <andriy.tkachuk@seagate.com>
Closes #18286
Closes #18294
---
 include/sys/spa.h                             |  1 +
 include/sys/vdev_rebuild.h                    |  1 +
 module/zfs/spa_misc.c                         |  6 ++++
 module/zfs/vdev.c                             | 29 +++++++++++++++----
 module/zfs/vdev_draid.c                       |  7 -----
 module/zfs/vdev_rebuild.c                     | 20 +++++++++++--
 tests/zfs-tests/include/libtest.shlib         | 15 ++++++++++
 .../redundancy/redundancy_draid_spare1.ksh    | 11 ++++++-
 8 files changed, 74 insertions(+), 16 deletions(-)

diff --git a/include/sys/spa.h b/include/sys/spa.h
index 1a84844c5..0de8a1867 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1082,6 +1082,7 @@ extern uint64_t spa_guid(spa_t *spa);
 extern uint64_t spa_load_guid(spa_t *spa);
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
+extern uint64_t spa_open_txg(spa_t *spa);
 extern uint64_t spa_syncing_txg(spa_t *spa);
 extern uint64_t spa_final_dirty_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h
index 51e669c2c..b787b1d5d 100644
--- a/include/sys/vdev_rebuild.h
+++ b/include/sys/vdev_rebuild.h
@@ -91,6 +91,7 @@ boolean_t vdev_rebuild_active(vdev_t *);
 
 int vdev_rebuild_load(vdev_t *);
 void vdev_rebuild(vdev_t *, uint64_t);
+void vdev_rebuild_txgs(vdev_t *, uint64_t *, uint64_t *);
 void vdev_rebuild_stop_wait(vdev_t *);
 void vdev_rebuild_stop_all(spa_t *);
 void vdev_rebuild_restart(spa_t *);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index bed159eb7..b8887d869 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1891,6 +1891,12 @@ spa_syncing_txg(spa_t *spa)
 	return (spa->spa_syncing_txg);
 }
 
+uint64_t
+spa_open_txg(spa_t *spa)
+{
+	return (spa->spa_dsl_pool->dp_tx.tx_open_txg);
+}
+
 /*
  * Return the last txg where data can be dirtied. The final txgs
  * will be used to just clear out any deferred frees that remain.
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 6d44bd41b..71bb247c6 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -31,6 +31,7 @@
  * Copyright (c) 2019, Datto Inc. All rights reserved.
  * Copyright (c) 2021, 2025, Klara, Inc.
  * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
+ * Copyright (c) 2026, Seagate Technology, LLC.
  */
 
 #include <sys/zfs_context.h>
@@ -3094,8 +3095,11 @@ vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	mutex_enter(&vd->vdev_dtl_lock);
-	if (!zfs_range_tree_contains(rt, txg, size))
+	if (!zfs_range_tree_contains(rt, txg, size)) {
+		/* Clear whatever is there already. */
+		zfs_range_tree_clear(rt, txg, size);
 		zfs_range_tree_add(rt, txg, size);
+	}
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
@@ -5212,11 +5216,13 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
 	    (flags & ZIO_FLAG_SCAN_THREAD) ||
+	    zio->io_priority == ZIO_PRIORITY_REBUILD ||
 	    spa->spa_claiming)) {
 		/*
 		 * This is either a normal write (not a repair), or it's
 		 * a repair induced by the scrub thread, or it's a repair
-		 * made by zil_claim() during spa_load() in the first txg.
+		 * made by zil_claim() during spa_load() in the first txg,
+		 * or its repair induced by rebuild (sequential resilver).
 		 * In the normal case, we commit the DTL change in the same
 		 * txg as the block was born.  In the scrub-induced repair
 		 * case, we know that scrubs run in first-pass syncing context,
@@ -5227,27 +5233,38 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 		 * self-healing writes triggered by normal (non-scrubbing)
 		 * reads, because we have no transactional context in which to
 		 * do so -- and it's not clear that it'd be desirable anyway.
+		 *
+		 * For rebuild, since we don't have any information about BPs
+		 * and txgs that are being rebuilt, we need to add all known
+		 * txgs (starting from TXG_INITIAL) to DTL so that during
+		 * healing resilver we would be able to check all txgs at
+		 * vdev_draid_need_resilver().
 		 */
+		uint64_t size = 1;
 		if (vd->vdev_ops->vdev_op_leaf) {
 			uint64_t commit_txg = txg;
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				ASSERT(spa_sync_pass(spa) == 1);
-				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
+				vdev_dtl_dirty(vd, DTL_SCRUB, txg, size);
 				commit_txg = spa_syncing_txg(spa);
 			} else if (spa->spa_claiming) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				commit_txg = spa_first_txg(spa);
+			} else if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
+				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+				vdev_rebuild_txgs(vd->vdev_top, &txg, &size);
+				commit_txg = spa_open_txg(spa);
 			}
 			ASSERT(commit_txg >= spa_syncing_txg(spa));
-			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
+			if (vdev_dtl_contains(vd, DTL_MISSING, txg, size))
 				return;
 			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
-				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
+				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, size);
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
 		}
 		if (vd != rvd)
-			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
+			vdev_dtl_dirty(vd, DTL_MISSING, txg, size);
 	}
 }
 
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index 8588cfee3..9e02d8682 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -1452,13 +1452,6 @@ vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg,
 		/* Transaction group is known to be partially replicated. */
 		if (vdev_draid_partial(cvd, physical_offset, txg, size))
 			return (B_TRUE);
-
-		/*
-		 * Always check groups with active distributed spares
-		 * because any vdev failure in the pool will affect them.
-		 */
-		if (vdev_draid_find_spare(cvd) != NULL)
-			return (B_TRUE);
 	}
 
 	return (B_FALSE);
diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c
index 0e14d29d7..36b3f9e66 100644
--- a/module/zfs/vdev_rebuild.c
+++ b/module/zfs/vdev_rebuild.c
@@ -233,7 +233,7 @@ vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx)
 	mutex_enter(&vd->vdev_rebuild_lock);
 	memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
 	vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE;
-	vrp->vrp_min_txg = 0;
+	vrp->vrp_min_txg = TXG_INITIAL;
 	vrp->vrp_max_txg = dmu_tx_get_txg(tx);
 	vrp->vrp_start_time = gethrestime_sec();
 	vrp->vrp_scan_time_ms = 0;
@@ -415,7 +415,7 @@ vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx)
 	ASSERT0P(vd->vdev_rebuild_thread);
 
 	vrp->vrp_last_offset = 0;
-	vrp->vrp_min_txg = 0;
+	vrp->vrp_min_txg = TXG_INITIAL;
 	vrp->vrp_max_txg = dmu_tx_get_txg(tx);
 	vrp->vrp_bytes_scanned = 0;
 	vrp->vrp_bytes_issued = 0;
@@ -1126,6 +1126,22 @@ vdev_rebuild_stop_all(spa_t *spa)
 	vdev_rebuild_stop_wait(spa->spa_root_vdev);
 }
 
+/*
+ * Return rebuild transaction groups range.  It's used to populate DTLs
+ * of the non-writable devices during the rebuild so that they could be
+ * healed correctly, in case they are cleared, and not miss the data
+ * that was written to their spares during the rebuild.
+ */
+void
+vdev_rebuild_txgs(vdev_t *vd, uint64_t *min_txg, uint64_t *size)
+{
+	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+	*min_txg = vrp->vrp_min_txg;
+	*size = vrp->vrp_max_txg - vrp->vrp_min_txg;
+}
+
 /*
  * Rebuild statistics reported per top-level vdev.
  */
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 3d697726a..23460d665 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -3139,6 +3139,21 @@ function wait_scrubbed #pool timeout
        done
 }
 
+# Wait for a pool to be resilvered
+#
+# $1 pool name
+# $2 timeout
+#
+function wait_resilvered #pool timeout
+{
+       typeset timeout=${2:-300}
+       typeset pool=${1:-$TESTPOOL}
+       for (( timer = 0; timer < $timeout; timer++ )); do
+               is_pool_resilvered $pool && break;
+               sleep 1;
+       done
+}
+
 # Backup the zed.rc in our test directory so that we can edit it for our test.
 #
 # Returns: Backup file name.  You will need to pass this to zed_rc_restore().
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh
index b0f312f26..0604f7f48 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh
@@ -19,6 +19,7 @@
 #
 # Copyright (c) 2019, Datto Inc. All rights reserved.
 # Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+# Copyright (c) 2026, Seagate Technology, LLC.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -82,7 +83,8 @@ for replace_mode in "healing" "sequential"; do
 		log_must check_vdev_state spare-$i "DEGRADED"
 		log_must check_vdev_state $spare_vdev "ONLINE"
 		log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE"
-		log_must zpool detach $TESTPOOL $fault_vdev
+		# Preserve the 1st faulted vdev for the next test.
+		[[ $i -eq 0 ]] || log_must zpool detach $TESTPOOL $fault_vdev
 		log_must verify_pool $TESTPOOL
 		log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
 		log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
@@ -93,6 +95,13 @@ for replace_mode in "healing" "sequential"; do
 	log_must is_data_valid $TESTPOOL
 	log_must check_pool_status $TESTPOOL "errors" "No known data errors"
 
+	# Verify that after clearing the 1st faulted vdev, all is healed.
+	log_must zpool clear $TESTPOOL "$BASEDIR/vdev0"
+	log_must wait_resilvered $TESTPOOL
+	log_must verify_pool $TESTPOOL
+	log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
+	log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+
 	cleanup
 done