Improve async destroy processing timing

Previous code effectively enforced that all async free ZIOs were
_issued_ within the TXG timeout.  But they could take forever to
complete, especially if the required metadata were not in ARC.

This patch introduces periodic waits every 2000 ZIOs, which should
give at least somewhat reasonable TXG timings even for single-HDD
pools with an empty ARC.  It also makes the frees complete within half
of the TXG timeout, since we might still need time to sync DDT and BRT.

While there, change zfs_max_async_dedup_frees semantics to also
include clone and gang blocks, which are similar.  Bump the default
value, set long ago, to be more forgiving to block cloning (which
still has no logs and benefits from large TXGs), now that we have
better working time limits.  The limiting factor now is the possible
amount of dirty data produced by BRT updates.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18043
This commit is contained in:
Alexander Motin 2025-12-11 21:46:08 -05:00 committed by GitHub
parent f72fd378c8
commit 3d76ba2737
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 47 additions and 15 deletions

View File

@ -157,7 +157,7 @@ typedef struct dsl_scan {
/* per txg statistics */
uint64_t scn_visited_this_txg; /* total bps visited this txg */
uint64_t scn_dedup_frees_this_txg; /* dedup bps freed this txg */
uint64_t scn_async_frees_this_txg; /* async frees (dedup/clone/gang) */
uint64_t scn_holes_this_txg;
uint64_t scn_lt_min_this_txg;
uint64_t scn_gt_max_this_txg;

View File

@ -1468,8 +1468,13 @@ Enable/disable the processing of the free_bpobj object.
.It Sy zfs_async_block_max_blocks Ns = Ns Sy UINT64_MAX Po unlimited Pc Pq u64
Maximum number of blocks freed in a single TXG.
.
.It Sy zfs_max_async_dedup_frees Ns = Ns Sy 100000 Po 10^5 Pc Pq u64
Maximum number of dedup blocks freed in a single TXG.
.It Sy zfs_max_async_dedup_frees Ns = Ns Sy 250000 Pq u64
Maximum number of dedup, clone or gang blocks freed in a single TXG.
These frees may require additional I/O, making them more expensive.
.
.It Sy zfs_async_free_zio_wait_interval Ns = Ns Sy 2000 Pq u64
After freeing this many dedup, clone or gang blocks, wait for all pending
I/Os to complete before continuing.
A value of 0 disables these periodic waits.
.
.It Sy zfs_vdev_async_read_max_active Ns = Ns Sy 3 Pq uint
Maximum asynchronous read I/O operations active to each device.
@ -1739,7 +1744,7 @@ but we chose the more conservative approach of not setting it,
so that there is no possibility of
leaking space in the "partial temporary" failure case.
.
.It Sy zfs_free_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq uint
.It Sy zfs_free_min_time_ms Ns = Ns Sy 500 Ns ms Po 0.5s Pc Pq uint
During a
.Nm zfs Cm destroy
operation using the

View File

@ -195,7 +195,7 @@ static uint_t zfs_scrub_min_time_ms = 1000;
static uint_t zfs_obsolete_min_time_ms = 500;
/* minimum milliseconds to free per txg */
static uint_t zfs_free_min_time_ms = 1000;
static uint_t zfs_free_min_time_ms = 500;
/* minimum milliseconds to resilver per txg */
static uint_t zfs_resilver_min_time_ms = 3000;
@ -208,7 +208,13 @@ static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
/* max number of blocks to free in a single TXG */
static uint64_t zfs_async_block_max_blocks = UINT64_MAX;
/* max number of dedup, clone or gang blocks to free in a single TXG */
static uint64_t zfs_max_async_dedup_frees = 100000;
static uint64_t zfs_max_async_dedup_frees = 250000;
/*
* After freeing this many async ZIOs (dedup, clone, gang blocks), wait for
* them to complete before continuing. This prevents unbounded I/O queueing.
*/
static uint64_t zfs_async_free_zio_wait_interval = 2000;
/* set to disable resilver deferring */
static int zfs_resilver_disable_defer = B_FALSE;
@ -3590,12 +3596,12 @@ dsl_scan_async_block_should_pause(dsl_scan_t *scn)
}
if (zfs_max_async_dedup_frees != 0 &&
scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) {
scn->scn_async_frees_this_txg >= zfs_max_async_dedup_frees) {
return (B_TRUE);
}
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
return (elapsed_nanosecs / (NANOSEC / 2) > zfs_txg_timeout ||
(NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms &&
txg_sync_waiting(scn->scn_dp)) ||
spa_shutting_down(scn->scn_dp->dp_spa));
@ -3612,14 +3618,32 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
return (SET_ERROR(ERESTART));
}
zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
dmu_tx_get_txg(tx), bp, 0));
zio_t *zio = zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
dmu_tx_get_txg(tx), bp, 0);
dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
-bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
-BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
scn->scn_visited_this_txg++;
if (BP_GET_DEDUP(bp))
scn->scn_dedup_frees_this_txg++;
if (zio != NULL) {
/*
* zio_free_sync() returned a ZIO, meaning this is an
* async I/O (dedup, clone or gang block).
*/
scn->scn_async_frees_this_txg++;
zio_nowait(zio);
/*
* After issuing N async ZIOs, wait for them to complete.
* This makes time limits work with actual I/O completion
* times, not just queuing times.
*/
uint64_t i = zfs_async_free_zio_wait_interval;
if (i != 0 && (scn->scn_async_frees_this_txg % i) == 0) {
VERIFY0(zio_wait(scn->scn_zio_root));
scn->scn_zio_root = zio_root(scn->scn_dp->dp_spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
}
}
return (0);
}
@ -3866,7 +3890,7 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
spa->spa_name, (longlong_t)tx->tx_txg, err);
scn->scn_visited_this_txg = 0;
scn->scn_dedup_frees_this_txg = 0;
scn->scn_async_frees_this_txg = 0;
/*
* Write out changes to the DDT and the BRT that may be required
@ -4408,7 +4432,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
/* reset scan statistics */
scn->scn_visited_this_txg = 0;
scn->scn_dedup_frees_this_txg = 0;
scn->scn_async_frees_this_txg = 0;
scn->scn_holes_this_txg = 0;
scn->scn_lt_min_this_txg = 0;
scn->scn_gt_max_this_txg = 0;
@ -5318,7 +5342,10 @@ ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW,
"Max number of blocks freed in one txg");
ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW,
"Max number of dedup blocks freed in one txg");
"Max number of dedup, clone or gang blocks freed in one txg");
ZFS_MODULE_PARAM(zfs, zfs_, async_free_zio_wait_interval, U64, ZMOD_RW,
"Wait for pending free I/Os after issuing this many asynchronously");
ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW,
"Enable processing of the free_bpobj");