mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-04-12 22:51:46 +03:00
Prevent range tree corruption race by updating dnode_sync()
Switch to incremental range tree processing in dnode_sync() to avoid unsafe lock dropping during zfs_range_tree_walk(). This also ensures the free ranges remain visible to dnode_block_freed() throughout the sync process, preventing potential stale data reads. This patch: - Keeps the range tree attached during processing for visibility. - Processes segments one-by-one by restarting from the tree head. - Uses zfs_range_tree_clear() to safely handle ranges that may have been modified while the lock was dropped. - Adds ASSERT()s to document that we don't expect dn_free_ranges modification outside of sync context. Reviewed-by: Paul Dagnelie <paul.dagnelie@klarasystems.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alek Pinchuk <apinchuk@axcient.com> Issue #18186 Closes #18235
This commit is contained in:
parent
08d7a62673
commit
931deb290c
@ -652,6 +652,19 @@ extern dnode_sums_t dnode_sums;
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Assert that we are not modifying the range tree for the syncing TXG from
 * a non-syncing thread. We verify that either the transaction group is
 * strictly newer than the one currently syncing (meaning it's being modified
 * in open context), OR the current thread is the sync thread itself. If this
 * triggers, it indicates a race where dn_free_ranges is being modified while
 * dnode_sync() may be iterating over it.
 *
 * NOTE: callers must pass a dmu_tx_t * and a dnode_t *; both arguments are
 * evaluated (possibly more than once), so side-effecting expressions must
 * not be used here.
 */
#define	FREE_RANGE_VERIFY(tx, dn) \
	ASSERT((tx)->tx_txg > spa_syncing_txg((dn)->dn_objset->os_spa) || \
	    dmu_objset_pool((dn)->dn_objset)->dp_tx.tx_sync_thread == \
	    curthread)
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -2201,6 +2201,17 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
|
||||
|
||||
mutex_enter(&dn->dn_mtx);
|
||||
int txgoff = tx->tx_txg & TXG_MASK;
|
||||
|
||||
/*
|
||||
* Assert that we are not modifying the range tree for the syncing
|
||||
* TXG from a non-syncing thread. We verify that the tx's
|
||||
* transaction group is strictly newer than the one currently
|
||||
* syncing (meaning we are in open context). If this triggers,
|
||||
* it indicates a race where syncing dn_free_range tree is
|
||||
* being modified while dnode_sync() may be iterating over it.
|
||||
*/
|
||||
ASSERT(tx->tx_txg > spa_syncing_txg(dn->dn_objset->os_spa));
|
||||
|
||||
if (dn->dn_free_ranges[txgoff] != NULL) {
|
||||
zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
|
||||
}
|
||||
@ -2388,6 +2399,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
||||
db->db_blkid != DMU_SPILL_BLKID) {
|
||||
mutex_enter(&dn->dn_mtx);
|
||||
if (dn->dn_free_ranges[txgoff] != NULL) {
|
||||
FREE_RANGE_VERIFY(tx, dn);
|
||||
zfs_range_tree_clear(dn->dn_free_ranges[txgoff],
|
||||
db->db_blkid, 1);
|
||||
}
|
||||
|
||||
@ -2409,6 +2409,8 @@ done:
|
||||
mutex_enter(&dn->dn_mtx);
|
||||
{
|
||||
int txgoff = tx->tx_txg & TXG_MASK;
|
||||
|
||||
FREE_RANGE_VERIFY(tx, dn);
|
||||
if (dn->dn_free_ranges[txgoff] == NULL) {
|
||||
dn->dn_free_ranges[txgoff] =
|
||||
zfs_range_tree_create_flags(
|
||||
|
||||
@ -440,24 +440,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Argument bundle for the zfs_range_tree_walk() callback
 * dnode_sync_free_range(), which receives only a single void *arg.
 */
typedef struct dnode_sync_free_range_arg {
	dnode_t *dsfra_dnode;	/* dnode whose freed ranges are being synced */
	dmu_tx_t *dsfra_tx;	/* transaction the frees belong to */
	/* set when the dnode itself is being freed (freeing_dnode) */
	boolean_t dsfra_free_indirects;
} dnode_sync_free_range_arg_t;
|
||||
|
||||
static void
|
||||
dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
|
||||
{
|
||||
dnode_sync_free_range_arg_t *dsfra = arg;
|
||||
dnode_t *dn = dsfra->dsfra_dnode;
|
||||
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
dnode_sync_free_range_impl(dn, blkid, nblks,
|
||||
dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
|
||||
mutex_enter(&dn->dn_mtx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to kick all the dnode's dbufs out of the cache...
|
||||
*/
|
||||
@ -634,6 +616,64 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
|
||||
*/
|
||||
}
|
||||
|
||||
/*
 * Sync out (and then discard) the dn_free_ranges tree for this TXG.
 *
 * We cannot simply detach the range tree (set dn_free_ranges to NULL)
 * before processing it because dnode_block_freed() relies on it to
 * correctly identify blocks that have been freed in the current TXG
 * (for dbuf_read() calls on holes). If we detached it early, a concurrent
 * reader might see the block as valid on disk and return stale data
 * instead of zeros.
 *
 * We also can't use zfs_range_tree_walk() nor zfs_range_tree_vacate()
 * with a callback that drops dn_mtx (dnode_sync_free_range()). This is
 * unsafe because another thread (spa_sync_deferred_frees() ->
 * dnode_free_range()) could acquire dn_mtx and modify the tree while the
 * walk or vacate was in progress. This leads to tree corruption or panic
 * when we resume.
 *
 * To fix the race while maintaining visibility, we process the tree
 * incrementally. We pick a segment, drop the lock to sync it, and
 * re-acquire the lock to remove it. By always restarting from the head
 * of the tree, we ensure we are never using an invalid iterator.
 * We use zfs_range_tree_clear() instead of ..._remove() because the range
 * might have already been removed while the lock was dropped (specifically
 * in the dbuf_dirty path mentioned above). ..._clear() handles this
 * gracefully, while ..._remove() would panic on a missing segment.
 */
static void
dnode_sync_free_ranges(dnode_t *dn, dmu_tx_t *tx)
{
	int txgoff = tx->tx_txg & TXG_MASK;

	mutex_enter(&dn->dn_mtx);
	zfs_range_tree_t *rt = dn->dn_free_ranges[txgoff];
	if (rt != NULL) {
		/* dnode is going away entirely this TXG? */
		boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
		    dn->dn_free_txg <= tx->tx_txg;
		zfs_range_seg_t *rs;

		if (freeing_dnode) {
			/* freeing the dnode implies all its blocks are freed */
			ASSERT(zfs_range_tree_contains(rt, 0,
			    dn->dn_maxblkid + 1));
		}

		/*
		 * Always restart from the head: the segment pointer is only
		 * valid while dn_mtx is held, and the tree may change while
		 * we drop the lock below.
		 */
		while ((rs = zfs_range_tree_first(rt)) != NULL) {
			uint64_t start = zfs_rs_get_start(rs, rt);
			uint64_t size = zfs_rs_get_end(rs, rt) - start;

			/* do the actual free work without dn_mtx held */
			mutex_exit(&dn->dn_mtx);
			dnode_sync_free_range_impl(dn, start, size,
			    freeing_dnode, tx);
			mutex_enter(&dn->dn_mtx);

			/*
			 * _clear(), not _remove(): part of [start, size) may
			 * already be gone (see comment above).
			 */
			zfs_range_tree_clear(rt, start, size);
		}
		zfs_range_tree_destroy(rt);
		dn->dn_free_ranges[txgoff] = NULL;
	}
	mutex_exit(&dn->dn_mtx);
}
|
||||
|
||||
/*
|
||||
* Write out the dnode's dirty buffers.
|
||||
* Does not wait for zio completions.
|
||||
@ -781,32 +821,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
|
||||
}
|
||||
|
||||
/* process all the "freed" ranges in the file */
|
||||
if (dn->dn_free_ranges[txgoff] != NULL) {
|
||||
dnode_sync_free_range_arg_t dsfra;
|
||||
dsfra.dsfra_dnode = dn;
|
||||
dsfra.dsfra_tx = tx;
|
||||
dsfra.dsfra_free_indirects = freeing_dnode;
|
||||
mutex_enter(&dn->dn_mtx);
|
||||
if (freeing_dnode) {
|
||||
ASSERT(zfs_range_tree_contains(
|
||||
dn->dn_free_ranges[txgoff], 0,
|
||||
dn->dn_maxblkid + 1));
|
||||
}
|
||||
/*
|
||||
* Because dnode_sync_free_range() must drop dn_mtx during its
|
||||
* processing, using it as a callback to zfs_range_tree_vacate()
|
||||
* is not safe. No other operations (besides destroy) are
|
||||
* allowed once zfs_range_tree_vacate() has begun, and dropping
|
||||
* dn_mtx would leave a window open for another thread to
|
||||
* observe that invalid (and unsafe) state.
|
||||
*/
|
||||
zfs_range_tree_walk(dn->dn_free_ranges[txgoff],
|
||||
dnode_sync_free_range, &dsfra);
|
||||
zfs_range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL);
|
||||
zfs_range_tree_destroy(dn->dn_free_ranges[txgoff]);
|
||||
dn->dn_free_ranges[txgoff] = NULL;
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
}
|
||||
dnode_sync_free_ranges(dn, tx);
|
||||
|
||||
if (freeing_dnode) {
|
||||
dn->dn_objset->os_freed_dnodes++;
|
||||
@ -828,7 +843,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
|
||||
}
|
||||
|
||||
/*
|
||||
* This must be done after dnode_sync_free_range()
|
||||
* This must be done after dnode_sync_free_ranges()
|
||||
* and dnode_increase_indirection(). See dnode_new_blkid()
|
||||
* for an explanation of the high bit being set.
|
||||
*/
|
||||
|
||||
Loading…
Reference in New Issue
Block a user