mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-25 03:37:45 +03:00
Resilver restarts unnecessarily when it encounters errors
When a resilver finishes, vdev_dtl_reassess is called to hopefully excise DTL_MISSING (amongst other things). If there are errors during the resilver, they are tracked in DTL_SCRUB, as spelled out in the block comment in vdev.c. DTL_SCRUB is in-core only, so it can only be used if the pool was online for the whole resilver. This state is tracked with the spa_scrub_started flag, which only gets set when the scan is initialized. Unfortunately, this flag gets cleared right before vdev_dtl_reassess gets called, so if there are any errors during the scan, DTL_MISSING will never get excised and the resilver will just continually restart. This fix simply delays clearing that flag until after the call to vdev_dtl_reassess. In addition, if a pool is imported and already has scn_errors > 0, this change will restart the resilver immediately instead of doing the rest of the scan and then restarting it from the beginning. On the other hand, if scn_errors == 0 at import, then no errors have been encountered so far, so the spa_scrub_started flag can be safely set. A test has been added to verify that resilver does not restart when relevant DTLs are available. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Paul Zuchowski <pzuchowski@datto.com> Signed-off-by: John Poduska <jpoduska@datto.com> Closes #10291
This commit is contained in:
+22
-1
@@ -542,6 +542,22 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
|
||||
zfs_dbgmsg("new-style scrub was modified "
|
||||
"by old software; restarting in txg %llu",
|
||||
(longlong_t)scn->scn_restart_txg);
|
||||
} else if (dsl_scan_resilvering(dp)) {
|
||||
/*
|
||||
* If a resilver is in progress and there are already
|
||||
* errors, restart it instead of finishing this scan and
|
||||
* then restarting it. If there haven't been any errors
|
||||
* then remember that the incore DTL is valid.
|
||||
*/
|
||||
if (scn->scn_phys.scn_errors > 0) {
|
||||
scn->scn_restart_txg = txg;
|
||||
zfs_dbgmsg("resilver can't excise DTL_MISSING "
|
||||
"when finished; restarting in txg %llu",
|
||||
(u_longlong_t)scn->scn_restart_txg);
|
||||
} else {
|
||||
/* it's safe to excise DTL when finished */
|
||||
spa->spa_scrub_started = B_TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -887,7 +903,6 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
|
||||
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
|
||||
|
||||
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
|
||||
spa->spa_scrub_started = B_FALSE;
|
||||
spa->spa_scrub_active = B_FALSE;
|
||||
|
||||
/*
|
||||
@@ -914,6 +929,12 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
|
||||
}
|
||||
spa_errlog_rotate(spa);
|
||||
|
||||
/*
|
||||
* Don't clear flag until after vdev_dtl_reassess to ensure that
|
||||
* DTL_MISSING will get updated when possible.
|
||||
*/
|
||||
spa->spa_scrub_started = B_FALSE;
|
||||
|
||||
/*
|
||||
* We may have finished replacing a device.
|
||||
* Let the async thread assess this and handle the detach.
|
||||
|
||||
Reference in New Issue
Block a user