From c85ac731a0ec16e4277857b55ebe123c552365b6 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 25 Jan 2023 11:28:54 -0800 Subject: [PATCH] Improve resilver ETAs When resilvering the estimated time remaining is calculated using the average issue rate over the current pass. Where the current pass starts when a scan was started, or restarted, if the pool was exported/imported. For dRAID pools in particular this can result in wildly optimistic estimates since the issue rate will be very high while scanning when non-degraded regions of the pool are scanned. Once repair I/O starts being issued performance drops to a realistic number but the estimated performance is still significantly skewed. To address this we redefine a pass such that it starts after a scanning phase completes so the issue rate is more reflective of recent performance. Additionally, the zfs_scan_report_txgs module option can be set to reset the pass statistics more often. Reviewed-by: Akash B Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #14410 --- cmd/zpool/zpool_main.c | 33 ++++++++++++++++++++++----------- man/man4/zfs.4 | 7 +++++++ module/zfs/dsl_scan.c | 28 ++++++++++++++++++++++++++++ module/zfs/spa_misc.c | 1 - 4 files changed, 57 insertions(+), 12 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 93d6a1898..efb2d10e5 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -7524,19 +7524,20 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); - assert(ps->pss_func == POOL_SCAN_SCRUB || - ps->pss_func == POOL_SCAN_RESILVER); + int is_resilver = ps->pss_func == POOL_SCAN_RESILVER; + int is_scrub = ps->pss_func == POOL_SCAN_SCRUB; + assert(is_resilver || is_scrub); /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { secs_to_dhms(end - start, time_buf); - if (ps->pss_func == POOL_SCAN_SCRUB) { + if (is_scrub) { (void) printf(gettext("scrub repaired %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); - } else if (ps->pss_func == POOL_SCAN_RESILVER) { + } else if (is_resilver) { (void) printf(gettext("resilvered %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, @@ -7544,10 +7545,10 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) } return; } else if (ps->pss_state == DSS_CANCELED) { - if (ps->pss_func == POOL_SCAN_SCRUB) { + if (is_scrub) { (void) printf(gettext("scrub canceled on %s"), ctime(&end)); - } else if (ps->pss_func == POOL_SCAN_RESILVER) { + } else if (is_resilver) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); } @@ -7557,7 +7558,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) assert(ps->pss_state == DSS_SCANNING); /* Scan is in progress. Resilvers can't be paused. */ - if (ps->pss_func == POOL_SCAN_SCRUB) { + if (is_scrub) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); @@ -7567,7 +7568,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } - } else if (ps->pss_func == POOL_SCAN_RESILVER) { + } else if (is_resilver) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } @@ -7609,17 +7610,27 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) scanned_buf, issued_buf, total_buf); } - if (ps->pss_func == POOL_SCAN_RESILVER) { + if (is_resilver) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); - } else if (ps->pss_func == POOL_SCAN_SCRUB) { + } else if (is_scrub) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } if (pause == 0) { + /* + * Only provide an estimate iff: + * 1) the time remaining is valid, and + * 2) the issue rate exceeds 10 MB/s, and + * 3) it's either: + * a) a resilver which has started repairs, or + * b) a scrub which has entered the issue phase. + */ if (total_secs_left != UINT64_MAX && - issue_rate >= 10 * 1024 * 1024) { + issue_rate >= 10 * 1024 * 1024 && + ((is_resilver && ps->pss_processed > 0) || + (is_scrub && issued > 0))) { (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index e20d60134..6f260660e 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1890,6 +1890,13 @@ I/O. In this case (unless the metadata scan is done) we stop issuing verification I/O and start scanning metadata again until we get to the hard limit. . +.It Sy zfs_scan_report_txgs Ns = Ns Sy 0 Ns | Ns 1 Pq uint +When reporting resilver throughput and estimated completion time use the +performance observed over roughly the last +.Sy zfs_scan_report_txgs +TXGs. +When set to zero performance is calculated over the time between checkpoints. +. .It Sy zfs_scan_strict_mem_lim Ns = Ns Sy 0 Ns | Ns 1 Pq int Enforce tight memory limits on pool scans when a sequential scan is in progress. When disabled, the memory limit may be exceeded by fast disks. diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f9e437f0c..8a5a02cea 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -131,6 +131,15 @@ static uint64_t dsl_scan_count_data_disks(vdev_t *vd); extern uint_t zfs_vdev_async_write_active_min_dirty_percent; static int zfs_scan_blkstats = 0; +/* + * 'zpool status' uses bytes processed per pass to report throughput and + * estimate time remaining. We define a pass to start when the scanning + * phase completes for a sequential resilver. Optionally, this value + * may be used to reset the pass statistics every N txgs to provide an + * estimated completion time based on currently observed performance. + */ +static uint_t zfs_scan_report_txgs = 0; + /* * By default zfs will check to ensure it is not over the hard memory * limit before each txg. If finer-grained control of this is needed @@ -604,6 +613,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) } spa_scan_stat_init(spa); + vdev_scan_stat_init(spa->spa_root_vdev); + return (0); } @@ -763,6 +774,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) scn->scn_last_checkpoint = 0; scn->scn_checkpointing = B_FALSE; spa_scan_stat_init(spa); + vdev_scan_stat_init(spa->spa_root_vdev); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; @@ -3652,6 +3664,16 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) return; } + /* + * Disabled by default, set zfs_scan_report_txgs to report + * average performance over the last zfs_scan_report_txgs TXGs. + */ + if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 && + tx->tx_txg % zfs_scan_report_txgs == 0) { + scn->scn_issued_before_pass += spa->spa_scan_pass_issued; + spa_scan_stat_init(spa); + } + /* * It is possible to switch from unsorted to sorted at any time, * but afterwards the scan will remain sorted unless reloaded from @@ -3780,6 +3802,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (scn->scn_is_sorted) { scn->scn_checkpointing = B_TRUE; scn->scn_clearing = B_TRUE; + scn->scn_issued_before_pass += + spa->spa_scan_pass_issued; + spa_scan_stat_init(spa); } zfs_dbgmsg("scan complete for %s txg %llu", spa->spa_name, @@ -4507,5 +4532,8 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW, "Tunable to adjust bias towards more filled segments during scans"); +ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW, + "Tunable to report resilver performance over the last N txgs"); + ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, "Process all resilvers immediately"); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 822fd0ee8..53763e915 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2556,7 +2556,6 @@ spa_scan_stat_init(spa_t *spa) spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; - vdev_scan_stat_init(spa->spa_root_vdev); } /*