From 931deb290c304739263ad720eb447fb3542bc70b Mon Sep 17 00:00:00 2001 From: Alek P Date: Mon, 23 Mar 2026 21:34:19 -0400 Subject: [PATCH] Prevent range tree corruption race by updating dnode_sync() Switch to incremental range tree processing in dnode_sync() to avoid unsafe lock dropping during zfs_range_tree_walk(). This also ensures the free ranges remain visible to dnode_block_freed() throughout the sync process, preventing potential stale data reads. This patch: - Keeps the range tree attached during processing for visibility. - Processes segments one-by-one by restarting from the tree head. - Uses zfs_range_tree_clear() to safely handle ranges that may have been modified while the lock was dropped. - adds ASSERT()s to document that we don't expect dn_free_ranges modification outside of sync context. Reviewed-by: Paul Dagnelie Reviewed-by: Brian Behlendorf Signed-off-by: Alek Pinchuk Issue #18186 Closes #18235 --- include/sys/dnode.h | 13 +++++ module/zfs/dbuf.c | 12 +++++ module/zfs/dnode.c | 2 + module/zfs/dnode_sync.c | 105 +++++++++++++++++++++++----------------- 4 files changed, 87 insertions(+), 45 deletions(-) diff --git a/include/sys/dnode.h b/include/sys/dnode.h index 8bd1db5b7..b9ded73ef 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -652,6 +652,19 @@ extern dnode_sums_t dnode_sums; #endif +/* + * Assert that we are not modifying the range tree for the syncing TXG from + * a non-syncing thread. We verify that either the transaction group is + * strictly newer than the one currently syncing (meaning it's being modified + * in open context), OR the current thread is the sync thread itself. If this + * triggers, it indicates a race where dn_free_ranges is being modified while + * dnode_sync() may be iterating over it. 
+ */ +#define FREE_RANGE_VERIFY(tx, dn) \ ASSERT((tx)->tx_txg > spa_syncing_txg((dn)->dn_objset->os_spa) || \ dmu_objset_pool((dn)->dn_objset)->dp_tx.tx_sync_thread == \ curthread) + #ifdef __cplusplus } #endif diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index df75d3fbe..0bfd6787c 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2201,6 +2201,17 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; + + /* + * Assert that we are not modifying the range tree for the syncing + * TXG from a non-syncing thread. We verify that the tx's + * transaction group is strictly newer than the one currently + * syncing (meaning we are in open context). If this triggers, + * it indicates a race where the syncing dn_free_ranges tree is + * being modified while dnode_sync() may be iterating over it. + */ + ASSERT(tx->tx_txg > spa_syncing_txg(dn->dn_objset->os_spa)); + if (dn->dn_free_ranges[txgoff] != NULL) { zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1); } @@ -2388,6 +2399,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { + FREE_RANGE_VERIFY(tx, dn); zfs_range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index e0cc4a7e1..be0e3de9b 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2409,6 +2409,8 @@ done: mutex_enter(&dn->dn_mtx); { int txgoff = tx->tx_txg & TXG_MASK; + + FREE_RANGE_VERIFY(tx, dn); if (dn->dn_free_ranges[txgoff] == NULL) { dn->dn_free_ranges[txgoff] = zfs_range_tree_create_flags( diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 046ceddb3..0e070c69d 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -440,24 +440,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, } } -typedef struct dnode_sync_free_range_arg { - dnode_t 
*dsfra_dnode; - dmu_tx_t *dsfra_tx; - boolean_t dsfra_free_indirects; -} dnode_sync_free_range_arg_t; - -static void -dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) -{ - dnode_sync_free_range_arg_t *dsfra = arg; - dnode_t *dn = dsfra->dsfra_dnode; - - mutex_exit(&dn->dn_mtx); - dnode_sync_free_range_impl(dn, blkid, nblks, - dsfra->dsfra_free_indirects, dsfra->dsfra_tx); - mutex_enter(&dn->dn_mtx); -} - /* * Try to kick all the dnode's dbufs out of the cache... */ @@ -634,6 +616,64 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) */ } +/* + * We cannot simply detach the range tree (set dn_free_ranges to NULL) + * before processing it because dnode_block_freed() relies on it to + * correctly identify blocks that have been freed in the current TXG + * (for dbuf_read() calls on holes). If we detached it early, a concurrent + * reader might see the block as valid on disk and return stale data + * instead of zeros. + * + * We also can't use zfs_range_tree_walk() nor zfs_range_tree_vacate() + * with a callback that drops dn_mtx (dnode_sync_free_range()). This is + * unsafe because another thread (spa_sync_deferred_frees() -> + * dnode_free_range()) could acquire dn_mtx and modify the tree while the + * walk or vacate was in progress. This leads to tree corruption or panic + * when we resume. + * + * To fix the race while maintaining visibility, we process the tree + * incrementally. We pick a segment, drop the lock to sync it, and + * re-acquire the lock to remove it. By always restarting from the head + * of the tree, we ensure we are never using an invalid iterator. + * We use zfs_range_tree_clear() instead of ..._remove() because the range + * might have already been removed while the lock was dropped (specifically + * in the dbuf_dirty path mentioned above). ..._clear() handles this + * gracefully, while ..._remove() would panic on a missing segment. 
+ */ +static void +dnode_sync_free_ranges(dnode_t *dn, dmu_tx_t *tx) +{ + int txgoff = tx->tx_txg & TXG_MASK; + + mutex_enter(&dn->dn_mtx); + zfs_range_tree_t *rt = dn->dn_free_ranges[txgoff]; + if (rt != NULL) { + boolean_t freeing_dnode = dn->dn_free_txg > 0 && + dn->dn_free_txg <= tx->tx_txg; + zfs_range_seg_t *rs; + + if (freeing_dnode) { + ASSERT(zfs_range_tree_contains(rt, 0, + dn->dn_maxblkid + 1)); + } + + while ((rs = zfs_range_tree_first(rt)) != NULL) { + uint64_t start = zfs_rs_get_start(rs, rt); + uint64_t size = zfs_rs_get_end(rs, rt) - start; + + mutex_exit(&dn->dn_mtx); + dnode_sync_free_range_impl(dn, start, size, + freeing_dnode, tx); + mutex_enter(&dn->dn_mtx); + + zfs_range_tree_clear(rt, start, size); + } + zfs_range_tree_destroy(rt); + dn->dn_free_ranges[txgoff] = NULL; + } + mutex_exit(&dn->dn_mtx); +} + /* * Write out the dnode's dirty buffers. * Does not wait for zio completions. @@ -781,32 +821,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) } /* process all the "freed" ranges in the file */ - if (dn->dn_free_ranges[txgoff] != NULL) { - dnode_sync_free_range_arg_t dsfra; - dsfra.dsfra_dnode = dn; - dsfra.dsfra_tx = tx; - dsfra.dsfra_free_indirects = freeing_dnode; - mutex_enter(&dn->dn_mtx); - if (freeing_dnode) { - ASSERT(zfs_range_tree_contains( - dn->dn_free_ranges[txgoff], 0, - dn->dn_maxblkid + 1)); - } - /* - * Because dnode_sync_free_range() must drop dn_mtx during its - * processing, using it as a callback to zfs_range_tree_vacate() - * is not safe. No other operations (besides destroy) are - * allowed once zfs_range_tree_vacate() has begun, and dropping - * dn_mtx would leave a window open for another thread to - * observe that invalid (and unsafe) state. 
- */ - zfs_range_tree_walk(dn->dn_free_ranges[txgoff], - dnode_sync_free_range, &dsfra); - zfs_range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL); - zfs_range_tree_destroy(dn->dn_free_ranges[txgoff]); - dn->dn_free_ranges[txgoff] = NULL; - mutex_exit(&dn->dn_mtx); - } + dnode_sync_free_ranges(dn, tx); if (freeing_dnode) { dn->dn_objset->os_freed_dnodes++; @@ -828,7 +843,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) } /* - * This must be done after dnode_sync_free_range() + * This must be done after dnode_sync_free_ranges() * and dnode_increase_indirection(). See dnode_new_blkid() * for an explanation of the high bit being set. */