From 931deb290c304739263ad720eb447fb3542bc70b Mon Sep 17 00:00:00 2001 From: Alek P Date: Mon, 23 Mar 2026 21:34:19 -0400 Subject: [PATCH] Prevent range tree corruption race by updating dnode_sync() Switch to incremental range tree processing in dnode_sync() to avoid unsafe lock dropping during zfs_range_tree_walk(). This also ensures the free ranges remain visible to dnode_block_freed() throughout the sync process, preventing potential stale data reads. This patch: - Keeps the range tree attached during processing for visibility. - Processes segments one-by-one by restarting from the tree head. - Uses zfs_range_tree_clear() to safely handle ranges that may have been modified while the lock was dropped. - adds ASSERT()s to document that we don't expect dn_free_ranges modification outside of sync context. Reviewed-by: Paul Dagnelie Reviewed-by: Brian Behlendorf Signed-off-by: Alek Pinchuk Issue #18186 Closes #18235 --- include/sys/dnode.h | 13 +++++ module/zfs/dbuf.c | 12 +++++ module/zfs/dnode.c | 2 + module/zfs/dnode_sync.c | 105 +++++++++++++++++++++++----------------- 4 files changed, 87 insertions(+), 45 deletions(-) diff --git a/include/sys/dnode.h b/include/sys/dnode.h index 8bd1db5b7..b9ded73ef 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -652,6 +652,19 @@ extern dnode_sums_t dnode_sums; #endif +/* + * Assert that we are not modifying the range tree for the syncing TXG from + * a non-syncing thread. We verify that either the transaction group is + * strictly newer than the one currently syncing (meaning it's being modified + * in open context), OR the current thread is the sync thread itself. If this + * triggers, it indicates a race where dn_free_ranges is being modified while + * dnode_sync() may be iterating over it. 
+ */ +#define FREE_RANGE_VERIFY(tx, dn) \ ASSERT((tx)->tx_txg > spa_syncing_txg((dn)->dn_objset->os_spa) || \ dmu_objset_pool((dn)->dn_objset)->dp_tx.tx_sync_thread == \ curthread) + #ifdef __cplusplus } #endif diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index df75d3fbe..0bfd6787c 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2201,6 +2201,17 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; + + /* + * Assert that we are not modifying the range tree for the syncing + * TXG from a non-syncing thread. We verify that the tx's + * transaction group is strictly newer than the one currently + * syncing (meaning we are in open context). If this triggers, + * it indicates a race where the syncing dn_free_ranges tree is + * being modified while dnode_sync() may be iterating over it. + */ + ASSERT(tx->tx_txg > spa_syncing_txg(dn->dn_objset->os_spa)); + if (dn->dn_free_ranges[txgoff] != NULL) { zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1); } @@ -2388,6 +2399,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { + FREE_RANGE_VERIFY(tx, dn); zfs_range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index e0cc4a7e1..be0e3de9b 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2409,6 +2409,8 @@ done: mutex_enter(&dn->dn_mtx); { int txgoff = tx->tx_txg & TXG_MASK; + + FREE_RANGE_VERIFY(tx, dn); if (dn->dn_free_ranges[txgoff] == NULL) { dn->dn_free_ranges[txgoff] = zfs_range_tree_create_flags( diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 046ceddb3..0e070c69d 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -440,24 +440,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, } } -typedef struct dnode_sync_free_range_arg { - dnode_t 
*dsfra_dnode; - dmu_tx_t *dsfra_tx; - boolean_t dsfra_free_indirects; -} dnode_sync_free_range_arg_t; - -static void -dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) -{ - dnode_sync_free_range_arg_t *dsfra = arg; - dnode_t *dn = dsfra->dsfra_dnode; - - mutex_exit(&dn->dn_mtx); - dnode_sync_free_range_impl(dn, blkid, nblks, - dsfra->dsfra_free_indirects, dsfra->dsfra_tx); - mutex_enter(&dn->dn_mtx); -} - /* * Try to kick all the dnode's dbufs out of the cache... */ @@ -634,6 +616,64 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) */ } +/* + * We cannot simply detach the range tree (set dn_free_ranges to NULL) + * before processing it because dnode_block_freed() relies on it to + * correctly identify blocks that have been freed in the current TXG + * (for dbuf_read() calls on holes). If we detached it early, a concurrent + * reader might see the block as valid on disk and return stale data + * instead of zeros. + * + * We also can't use zfs_range_tree_walk() nor zfs_range_tree_vacate() + * with a callback that drops dn_mtx (dnode_sync_free_range()). This is + * unsafe because another thread (spa_sync_deferred_frees() -> + * dnode_free_range()) could acquire dn_mtx and modify the tree while the + * walk or vacate was in progress. This leads to tree corruption or panic + * when we resume. + * + * To fix the race while maintaining visibility, we process the tree + * incrementally. We pick a segment, drop the lock to sync it, and + * re-acquire the lock to remove it. By always restarting from the head + * of the tree, we ensure we are never using an invalid iterator. + * We use zfs_range_tree_clear() instead of ..._remove() because the range + * might have already been removed while the lock was dropped (specifically + * in the dbuf_dirty path mentioned above). ..._clear() handles this + * gracefully, while ..._remove() would panic on a missing segment. 
+ */ +static void +dnode_sync_free_ranges(dnode_t *dn, dmu_tx_t *tx) +{ + int txgoff = tx->tx_txg & TXG_MASK; + + mutex_enter(&dn->dn_mtx); + zfs_range_tree_t *rt = dn->dn_free_ranges[txgoff]; + if (rt != NULL) { + boolean_t freeing_dnode = dn->dn_free_txg > 0 && + dn->dn_free_txg <= tx->tx_txg; + zfs_range_seg_t *rs; + + if (freeing_dnode) { + ASSERT(zfs_range_tree_contains(rt, 0, + dn->dn_maxblkid + 1)); + } + + while ((rs = zfs_range_tree_first(rt)) != NULL) { + uint64_t start = zfs_rs_get_start(rs, rt); + uint64_t size = zfs_rs_get_end(rs, rt) - start; + + mutex_exit(&dn->dn_mtx); + dnode_sync_free_range_impl(dn, start, size, + freeing_dnode, tx); + mutex_enter(&dn->dn_mtx); + + zfs_range_tree_clear(rt, start, size); + } + zfs_range_tree_destroy(rt); + dn->dn_free_ranges[txgoff] = NULL; + } + mutex_exit(&dn->dn_mtx); +} + /* * Write out the dnode's dirty buffers. * Does not wait for zio completions. @@ -781,32 +821,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) } /* process all the "freed" ranges in the file */ - if (dn->dn_free_ranges[txgoff] != NULL) { - dnode_sync_free_range_arg_t dsfra; - dsfra.dsfra_dnode = dn; - dsfra.dsfra_tx = tx; - dsfra.dsfra_free_indirects = freeing_dnode; - mutex_enter(&dn->dn_mtx); - if (freeing_dnode) { - ASSERT(zfs_range_tree_contains( - dn->dn_free_ranges[txgoff], 0, - dn->dn_maxblkid + 1)); - } - /* - * Because dnode_sync_free_range() must drop dn_mtx during its - * processing, using it as a callback to zfs_range_tree_vacate() - * is not safe. No other operations (besides destroy) are - * allowed once zfs_range_tree_vacate() has begun, and dropping - * dn_mtx would leave a window open for another thread to - * observe that invalid (and unsafe) state. 
- */ - zfs_range_tree_walk(dn->dn_free_ranges[txgoff], - dnode_sync_free_range, &dsfra); - zfs_range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL); - zfs_range_tree_destroy(dn->dn_free_ranges[txgoff]); - dn->dn_free_ranges[txgoff] = NULL; - mutex_exit(&dn->dn_mtx); - } + dnode_sync_free_ranges(dn, tx); if (freeing_dnode) { dn->dn_objset->os_freed_dnodes++; @@ -828,7 +843,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) } /* - * This must be done after dnode_sync_free_range() + * This must be done after dnode_sync_free_ranges() * and dnode_increase_indirection(). See dnode_new_blkid() * for an explanation of the high bit being set. */