diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 44cca1479..bcb98af40 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -157,7 +157,7 @@ typedef struct dsl_scan { /* per txg statistics */ uint64_t scn_visited_this_txg; /* total bps visited this txg */ - uint64_t scn_dedup_frees_this_txg; /* dedup bps freed this txg */ + uint64_t scn_async_frees_this_txg; /* async frees (dedup/clone/gang) */ uint64_t scn_holes_this_txg; uint64_t scn_lt_min_this_txg; uint64_t scn_gt_max_this_txg; diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 6f2a23a45..50803101f 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1468,8 +1468,13 @@ Enable/disable the processing of the free_bpobj object. .It Sy zfs_async_block_max_blocks Ns = Ns Sy UINT64_MAX Po unlimited Pc Pq u64 Maximum number of blocks freed in a single TXG. . -.It Sy zfs_max_async_dedup_frees Ns = Ns Sy 100000 Po 10^5 Pc Pq u64 -Maximum number of dedup blocks freed in a single TXG. +.It Sy zfs_max_async_dedup_frees Ns = Ns Sy 250000 Pq u64 +Maximum number of dedup, clone or gang blocks freed in a single TXG. +These frees may require additional I/O, making them more expensive. +. +.It Sy zfs_async_free_zio_wait_interval Ns = Ns Sy 2000 Pq u64 +After freeing this many dedup, clone or gang blocks wait for all pending +I/Os to complete before continuing. . .It Sy zfs_vdev_async_read_max_active Ns = Ns Sy 3 Pq uint Maximum asynchronous read I/O operations active to each device. @@ -1739,7 +1744,7 @@ but we chose the more conservative approach of not setting it, so that there is no possibility of leaking space in the "partial temporary" failure case. . 
-.It Sy zfs_free_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq uint +.It Sy zfs_free_min_time_ms Ns = Ns Sy 500 Ns ms Po 1/2s Pc Pq uint During a .Nm zfs Cm destroy operation using the diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index eb3f72bb0..4a9e831dc 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -195,7 +195,7 @@ static uint_t zfs_scrub_min_time_ms = 1000; static uint_t zfs_obsolete_min_time_ms = 500; /* minimum milliseconds to free per txg */ -static uint_t zfs_free_min_time_ms = 1000; +static uint_t zfs_free_min_time_ms = 500; /* minimum milliseconds to resilver per txg */ static uint_t zfs_resilver_min_time_ms = 3000; @@ -208,7 +208,13 @@ static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ static uint64_t zfs_async_block_max_blocks = UINT64_MAX; /* max number of dedup blocks to free in a single TXG */ -static uint64_t zfs_max_async_dedup_frees = 100000; +static uint64_t zfs_max_async_dedup_frees = 250000; + +/* + * After freeing this many async ZIOs (dedup, clone, gang blocks), wait for + * them to complete before continuing. This prevents unbounded I/O queueing. 
+ */ +static uint64_t zfs_async_free_zio_wait_interval = 2000; /* set to disable resilver deferring */ static int zfs_resilver_disable_defer = B_FALSE; @@ -3590,12 +3596,12 @@ dsl_scan_async_block_should_pause(dsl_scan_t *scn) } if (zfs_max_async_dedup_frees != 0 && - scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) { + scn->scn_async_frees_this_txg >= zfs_max_async_dedup_frees) { return (B_TRUE); } elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; - return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + return (elapsed_nanosecs / (NANOSEC / 2) > zfs_txg_timeout || (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)); @@ -3612,14 +3618,32 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) return (SET_ERROR(ERESTART)); } - zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, - dmu_tx_get_txg(tx), bp, 0)); + zio_t *zio = zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, + dmu_tx_get_txg(tx), bp, 0); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); scn->scn_visited_this_txg++; - if (BP_GET_DEDUP(bp)) - scn->scn_dedup_frees_this_txg++; + if (zio != NULL) { + /* + * zio_free_sync() returned a ZIO, meaning this is an + * async I/O (dedup, clone or gang block). + */ + scn->scn_async_frees_this_txg++; + zio_nowait(zio); + + /* + * After issuing N async ZIOs, wait for them to complete. + * This makes time limits work with actual I/O completion + * times, not just queuing times. 
+ */ + uint64_t i = zfs_async_free_zio_wait_interval; + if (i != 0 && (scn->scn_async_frees_this_txg % i) == 0) { + VERIFY0(zio_wait(scn->scn_zio_root)); + scn->scn_zio_root = zio_root(scn->scn_dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + } + } return (0); } @@ -3866,7 +3890,7 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), spa->spa_name, (longlong_t)tx->tx_txg, err); scn->scn_visited_this_txg = 0; - scn->scn_dedup_frees_this_txg = 0; + scn->scn_async_frees_this_txg = 0; /* * Write out changes to the DDT and the BRT that may be required @@ -4408,7 +4432,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) /* reset scan statistics */ scn->scn_visited_this_txg = 0; - scn->scn_dedup_frees_this_txg = 0; + scn->scn_async_frees_this_txg = 0; scn->scn_holes_this_txg = 0; scn->scn_lt_min_this_txg = 0; scn->scn_gt_max_this_txg = 0; @@ -5318,7 +5342,10 @@ ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW, "Max number of blocks freed in one txg"); ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW, - "Max number of dedup blocks freed in one txg"); + "Max number of dedup, clone or gang blocks freed in one txg"); + +ZFS_MODULE_PARAM(zfs, zfs_, async_free_zio_wait_interval, U64, ZMOD_RW, + "Wait for pending free I/Os after issuing this many asynchronously"); ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW, "Enable processing of the free_bpobj");