Add device rebuild feature

The device_rebuild feature enables sequential reconstruction when
resilvering.  Mirror vdevs can be rebuilt in LBA order which may
more quickly restore redundancy depending on the pools average block
size, overall fragmentation and the performance characteristics
of the devices.  However, block checksums cannot be verified
as part of the rebuild thus a scrub is automatically started after
the sequential resilver completes.

The new '-s' option has been added to the `zpool attach` and
`zpool replace` command to request sequential reconstruction
instead of healing reconstruction when resilvering.

    zpool attach -s <pool> <existing vdev> <new vdev>
    zpool replace -s <pool> <old vdev> <new vdev>

The `zpool status` output has been updated to report the progress
of sequential resilvering in the same way as healing resilvering.
The one notable difference is that multiple sequential resilvers
may be in progress as long as they're operating on different
top-level vdevs.

The `zpool wait -t resilver` command was extended to wait on
sequential resilvers.  From this perspective they are no different
than healing resilvers.

Sequential resilvers cannot be supported for RAIDZ, but are
compatible with the dRAID feature being developed.

As part of this change the resilver_restart_* tests were moved
in to the functional/replacement directory.  Additionally, the
replacement tests were renamed and extended to verify both
resilvering and rebuilding.

Original-patch-by: Isaac Huang <he.huang@intel.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: John Poduska <jpoduska@datto.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10349
This commit is contained in:
Brian Behlendorf
2020-07-03 11:05:50 -07:00
committed by GitHub
parent 7ddb753d17
commit 9a49d3f3d3
65 changed files with 3281 additions and 362 deletions
+1
View File
@@ -94,6 +94,7 @@ $(MODULE)-objs += vdev_queue.o
$(MODULE)-objs += vdev_raidz.o
$(MODULE)-objs += vdev_raidz_math.o
$(MODULE)-objs += vdev_raidz_math_scalar.o
$(MODULE)-objs += vdev_rebuild.o
$(MODULE)-objs += vdev_removal.o
$(MODULE)-objs += vdev_root.o
$(MODULE)-objs += vdev_trim.o
+35 -7
View File
@@ -704,8 +704,9 @@ static int
dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
{
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
if (dsl_scan_is_running(scn))
if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd))
return (SET_ERROR(EBUSY));
return (0);
@@ -746,8 +747,12 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
if (vdev_resilver_needed(spa->spa_root_vdev,
&scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
spa_event_notify(spa, NULL, NULL,
nvlist_t *aux = fnvlist_alloc();
fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
"healing");
spa_event_notify(spa, NULL, aux,
ESC_ZFS_RESILVER_START);
nvlist_free(aux);
} else {
spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
}
@@ -761,6 +766,21 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
/*
* When starting a resilver clear any existing rebuild state.
* This is required to prevent stale rebuild status from
* being reported when a rebuild is run, then a resilver and
* finally a scrub. In which case only the scrub status
* should be reported by 'zpool status'.
*/
if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t i = 0; i < rvd->vdev_children; i++) {
vdev_t *vd = rvd->vdev_child[i];
vdev_rebuild_clear_sync(
(void *)(uintptr_t)vd->vdev_id, tx);
}
}
}
/* back to the generic stuff */
@@ -918,14 +938,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
if (complete &&
!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
scn->scn_phys.scn_max_txg, B_TRUE);
scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE);
spa_event_notify(spa, NULL, NULL,
scn->scn_phys.scn_min_txg ?
ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
if (scn->scn_phys.scn_min_txg) {
nvlist_t *aux = fnvlist_alloc();
fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
"healing");
spa_event_notify(spa, NULL, aux,
ESC_ZFS_RESILVER_FINISH);
nvlist_free(aux);
} else {
spa_event_notify(spa, NULL, NULL,
ESC_ZFS_SCRUB_FINISH);
}
} else {
vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
0, B_TRUE);
0, B_TRUE, B_FALSE);
}
spa_errlog_rotate(spa);
+87 -22
View File
@@ -57,6 +57,7 @@
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_rebuild.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
#include <sys/metaslab.h>
@@ -1562,6 +1563,7 @@ spa_unload(spa_t *spa)
vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_all(spa);
vdev_rebuild_stop_all(spa);
}
/*
@@ -4240,7 +4242,7 @@ spa_ld_load_vdev_metadata(spa_t *spa)
* Propagate the leaf DTLs we just loaded all the way up the vdev tree.
*/
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
spa_config_exit(spa, SCL_ALL, FTAG);
return (0);
@@ -4829,11 +4831,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
update_config_cache);
/*
* Check all DTLs to see if anything needs resilvering.
* Check if a rebuild was in progress and if so resume it.
* Then check all DTLs to see if anything needs resilvering.
* The resilver will be deferred if a rebuild was started.
*/
if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
if (vdev_rebuild_active(spa->spa_root_vdev)) {
vdev_rebuild_restart(spa);
} else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
spa_async_request(spa, SPA_ASYNC_RESILVER);
}
/*
* Log the fact that we booted up (so that we can detect if
@@ -6313,6 +6320,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_all(spa);
vdev_rebuild_stop_all(spa);
}
/*
@@ -6536,12 +6544,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* extra rules: you can't attach to it after it's been created, and upon
* completion of resilvering, the first disk (the one being replaced)
* is automatically detached.
*
* If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild)
* should be performed instead of traditional healing reconstruction. From
* an administrators perspective these are both resilver operations.
*/
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
int rebuild)
{
uint64_t txg, dtl_max_txg;
vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
char *oldvdpath, *newvdpath;
@@ -6561,6 +6574,19 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
return (spa_vdev_exit(spa, NULL, txg, error));
}
if (rebuild) {
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
if (dsl_scan_resilvering(spa_get_dsl(spa)))
return (spa_vdev_exit(spa, NULL, txg,
ZFS_ERR_RESILVER_IN_PROGRESS));
} else {
if (vdev_rebuild_active(rvd))
return (spa_vdev_exit(spa, NULL, txg,
ZFS_ERR_REBUILD_IN_PROGRESS));
}
if (spa->spa_vdev_removal != NULL)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
@@ -6593,6 +6619,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
if (rebuild) {
/*
* For rebuilds, the parent vdev must support reconstruction
* using only space maps. This means the only allowable
* parents are the root vdev or a mirror vdev.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
pvd->vdev_ops != &vdev_root_ops) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
}
}
if (!replacing) {
/*
* For attach, the only allowable parent is a mirror or the root
@@ -6646,7 +6684,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
* than the top-level vdev.
*/
if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
/*
* If this is an in-place replacement, update oldvd's path and devid
@@ -6664,9 +6702,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
}
}
/* mark the device being resilvered */
newvd->vdev_resilver_txg = txg;
/*
* If the parent is not a mirror, or if we're replacing, insert the new
* mirror/replacing/spare vdev above oldvd.
@@ -6704,8 +6739,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
dtl_max_txg = txg + TXG_CONCURRENT_STATES;
vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
dtl_max_txg - TXG_INITIAL);
vdev_dtl_dirty(newvd, DTL_MISSING,
TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
if (newvd->vdev_isspare) {
spa_spare_activate(newvd);
@@ -6722,16 +6757,25 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
vdev_dirty(tvd, VDD_DTL, newvd, txg);
/*
* Schedule the resilver to restart in the future. We do this to
* ensure that dmu_sync-ed blocks have been stitched into the
* respective datasets. We do not do this if resilvers have been
* deferred.
* Schedule the resilver or rebuild to restart in the future. We do
* this to ensure that dmu_sync-ed blocks have been stitched into the
* respective datasets.
*/
if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
vdev_defer_resilver(newvd);
else
dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg);
if (rebuild) {
newvd->vdev_rebuild_txg = txg;
vdev_rebuild(tvd);
} else {
newvd->vdev_resilver_txg = txg;
if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
vdev_defer_resilver(newvd);
} else {
dsl_scan_restart_resilver(spa->spa_dsl_pool,
dtl_max_txg);
}
}
if (spa->spa_bootfs)
spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
@@ -6774,7 +6818,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
ASSERT(spa_writeable(spa));
txg = spa_vdev_enter(spa);
txg = spa_vdev_detach_enter(spa, guid);
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
@@ -7728,6 +7772,12 @@ spa_vdev_resilver_done(spa_t *spa)
}
spa_config_exit(spa, SCL_ALL, FTAG);
/*
* If a detach was not performed above replace waiters will not have
* been notified. In which case we must do so now.
*/
spa_notify_waiters(spa);
}
/*
@@ -7970,10 +8020,22 @@ spa_async_thread(void *arg)
if (tasks & SPA_ASYNC_RESILVER_DONE)
spa_vdev_resilver_done(spa);
/*
* If any devices are done replacing, detach them. Then if no
* top-level vdevs are rebuilding attempt to kick off a scrub.
*/
if (tasks & SPA_ASYNC_REBUILD_DONE) {
spa_vdev_resilver_done(spa);
if (!vdev_rebuild_active(spa->spa_root_vdev))
(void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB);
}
/*
* Kick off a resilver.
*/
if (tasks & SPA_ASYNC_RESILVER &&
!vdev_rebuild_active(spa->spa_root_vdev) &&
(!dsl_scan_resilvering(dp) ||
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
dsl_scan_restart_resilver(dp, 0);
@@ -9470,6 +9532,9 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
DSS_SCANNING);
break;
case ZPOOL_WAIT_RESILVER:
if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
break;
/* fall through */
case ZPOOL_WAIT_SCRUB:
{
boolean_t scanning, paused, is_scrub;
+27 -2
View File
@@ -1165,6 +1165,30 @@ spa_vdev_enter(spa_t *spa)
return (spa_vdev_config_enter(spa));
}
/*
* The same as spa_vdev_enter() above but additionally takes the guid of
* the vdev being detached. When there is a rebuild in process it will be
* suspended while the vdev tree is modified then resumed by spa_vdev_exit().
* The rebuild is canceled if only a single child remains after the detach.
*/
uint64_t
spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
{
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
vdev_autotrim_stop_all(spa);
if (guid != 0) {
vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
if (vd) {
vdev_rebuild_stop_wait(vd->vdev_top);
}
}
return (spa_vdev_config_enter(spa));
}
/*
* Internal implementation for spa_vdev_enter(). Used when a vdev
* operation requires multiple syncs (i.e. removing a device) while
@@ -1198,7 +1222,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
/*
* Reassess the DTLs.
*/
vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE);
if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
config_changed = B_TRUE;
@@ -1271,6 +1295,7 @@ int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
vdev_autotrim_restart(spa);
vdev_rebuild_restart(spa);
spa_vdev_config_exit(spa, vd, txg, error, FTAG);
mutex_exit(&spa_namespace_lock);
@@ -1322,7 +1347,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
}
if (vd != NULL || error == 0)
vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE);
vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE);
if (vd != NULL) {
if (vd != spa->spa_root_vdev)
+187 -51
View File
@@ -39,6 +39,7 @@
#include <sys/dmu_tx.h>
#include <sys/dsl_dir.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_rebuild.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -551,10 +552,12 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -562,10 +565,16 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
0);
}
txg_list_create(&vd->vdev_ms_list, spa,
offsetof(struct metaslab, ms_txg_node));
txg_list_create(&vd->vdev_dtl_list, spa,
@@ -835,6 +844,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
&vd->vdev_resilver_txg);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
&vd->vdev_rebuild_txg);
if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
vdev_defer_resilver(vd);
@@ -890,6 +902,7 @@ vdev_free(vdev_t *vd)
ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
ASSERT3P(vd->vdev_trim_thread, ==, NULL);
ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
/*
* Scan queues are normally destroyed at the end of a scan. If the
@@ -998,10 +1011,12 @@ vdev_free(vdev_t *vd)
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
mutex_destroy(&vd->vdev_scan_io_queue_lock);
mutex_destroy(&vd->vdev_initialize_lock);
mutex_destroy(&vd->vdev_initialize_io_lock);
cv_destroy(&vd->vdev_initialize_io_cv);
cv_destroy(&vd->vdev_initialize_cv);
mutex_destroy(&vd->vdev_trim_lock);
mutex_destroy(&vd->vdev_autotrim_lock);
mutex_destroy(&vd->vdev_trim_io_lock);
@@ -1009,6 +1024,11 @@ vdev_free(vdev_t *vd)
cv_destroy(&vd->vdev_autotrim_cv);
cv_destroy(&vd->vdev_trim_io_cv);
mutex_destroy(&vd->vdev_rebuild_lock);
mutex_destroy(&vd->vdev_rebuild_io_lock);
cv_destroy(&vd->vdev_rebuild_cv);
cv_destroy(&vd->vdev_rebuild_io_cv);
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -1078,7 +1098,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
ASSERT0(tvd->vdev_removing);
ASSERT0(tvd->vdev_rebuilding);
tvd->vdev_removing = svd->vdev_removing;
tvd->vdev_rebuilding = svd->vdev_rebuilding;
tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
tvd->vdev_indirect_config = svd->vdev_indirect_config;
tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
tvd->vdev_indirect_births = svd->vdev_indirect_births;
@@ -1092,6 +1115,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
svd->vdev_indirect_births = NULL;
svd->vdev_obsolete_sm = NULL;
svd->vdev_removing = 0;
svd->vdev_rebuilding = 0;
for (t = 0; t < TXG_SIZE; t++) {
while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
@@ -2576,11 +2600,8 @@ vdev_dtl_max(vdev_t *vd)
* excise the DTLs.
*/
static boolean_t
vdev_dtl_should_excise(vdev_t *vd)
vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
{
spa_t *spa = vd->vdev_spa;
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
ASSERT0(vd->vdev_children);
if (vd->vdev_state < VDEV_STATE_DEGRADED)
@@ -2589,23 +2610,52 @@ vdev_dtl_should_excise(vdev_t *vd)
if (vd->vdev_resilver_deferred)
return (B_FALSE);
if (vd->vdev_resilver_txg == 0 ||
range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
return (B_TRUE);
/*
* When a resilver is initiated the scan will assign the scn_max_txg
* value to the highest txg value that exists in all DTLs. If this
* device's max DTL is not part of this scan (i.e. it is not in
* the range (scn_min_txg, scn_max_txg] then it is not eligible
* for excision.
*/
if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
return (B_TRUE);
if (rebuild_done) {
vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
/* Rebuild not initiated by attach */
if (vd->vdev_rebuild_txg == 0)
return (B_TRUE);
/*
* When a rebuild completes without error then all missing data
* up to the rebuild max txg has been reconstructed and the DTL
* is eligible for excision.
*/
if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
return (B_TRUE);
}
} else {
dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
/* Resilver not initiated by attach */
if (vd->vdev_resilver_txg == 0)
return (B_TRUE);
/*
* When a resilver is initiated the scan will assign the
* scn_max_txg value to the highest txg value that exists
* in all DTLs. If this device's max DTL is not part of this
* scan (i.e. it is not in the range (scn_min_txg, scn_max_txg]
* then it is not eligible for excision.
*/
if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
return (B_TRUE);
}
}
return (B_FALSE);
}
@@ -2614,7 +2664,8 @@ vdev_dtl_should_excise(vdev_t *vd)
* write operations will be issued to the pool.
*/
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
boolean_t scrub_done, boolean_t rebuild_done)
{
spa_t *spa = vd->vdev_spa;
avl_tree_t reftree;
@@ -2624,22 +2675,28 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
for (int c = 0; c < vd->vdev_children; c++)
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
scrub_txg, scrub_done, rebuild_done);
if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
return;
if (vd->vdev_ops->vdev_op_leaf) {
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
boolean_t check_excise = B_FALSE;
boolean_t wasempty = B_TRUE;
mutex_enter(&vd->vdev_dtl_lock);
/*
* If requested, pretend the scan completed cleanly.
* If requested, pretend the scan or rebuild completed cleanly.
*/
if (zfs_scan_ignore_errors && scn)
scn->scn_phys.scn_errors = 0;
if (zfs_scan_ignore_errors) {
if (scn != NULL)
scn->scn_phys.scn_errors = 0;
if (vr != NULL)
vr->vr_rebuild_phys.vrp_errors = 0;
}
if (scrub_txg != 0 &&
!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
@@ -2654,21 +2711,29 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
}
/*
* If we've completed a scan cleanly then determine
* if this vdev should remove any DTLs. We only want to
* excise regions on vdevs that were available during
* the entire duration of this scan.
* If we've completed a scrub/resilver or a rebuild cleanly
* then determine if this vdev should remove any DTLs. We
* only want to excise regions on vdevs that were available
* during the entire duration of this scan.
*/
if (scrub_txg != 0 &&
(spa->spa_scrub_started ||
(scn != NULL && scn->scn_phys.scn_errors == 0)) &&
vdev_dtl_should_excise(vd)) {
if (rebuild_done &&
vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
check_excise = B_TRUE;
} else {
if (spa->spa_scrub_started ||
(scn != NULL && scn->scn_phys.scn_errors == 0)) {
check_excise = B_TRUE;
}
}
if (scrub_txg && check_excise &&
vdev_dtl_should_excise(vd, rebuild_done)) {
/*
* We completed a scrub up to scrub_txg. If we
* did it without rebooting, then the scrub dtl
* will be valid, so excise the old region and
* fold in the scrub dtl. Otherwise, leave the
* dtl as-is if there was an error.
* We completed a scrub, resilver or rebuild up to
* scrub_txg. If we did it without rebooting, then
* the scrub dtl will be valid, so excise the old
* region and fold in the scrub dtl. Otherwise,
* leave the dtl as-is if there was an error.
*
* There's little trick here: to excise the beginning
* of the DTL_MISSING map, we put it into a reference
@@ -2711,15 +2776,20 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
/*
* If the vdev was resilvering and no longer has any
* DTLs then reset its resilvering flag and dirty
* If the vdev was resilvering or rebuilding and no longer
* has any DTLs then reset the appropriate flag and dirty
* the top level so that we persist the change.
*/
if (txg != 0 && vd->vdev_resilver_txg != 0 &&
if (txg != 0 &&
range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
vd->vdev_resilver_txg = 0;
vdev_config_dirty(vd->vdev_top);
if (vd->vdev_rebuild_txg != 0) {
vd->vdev_rebuild_txg = 0;
vdev_config_dirty(vd->vdev_top);
} else if (vd->vdev_resilver_txg != 0) {
vd->vdev_resilver_txg = 0;
vdev_config_dirty(vd->vdev_top);
}
}
mutex_exit(&vd->vdev_dtl_lock);
@@ -2955,10 +3025,10 @@ vdev_dtl_required(vdev_t *vd)
* If not, we can safely offline/detach/remove the device.
*/
vd->vdev_cant_read = B_TRUE;
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
vd->vdev_cant_read = cant_read;
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
if (!required && zio_injection_enabled) {
required = !!zio_handle_device_injection(vd, NULL,
@@ -3065,6 +3135,20 @@ vdev_load(vdev_t *vd)
}
}
/*
* Load any rebuild state from the top-level vdev zap.
*/
if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
error = vdev_rebuild_load(vd);
if (error && error != ENOTSUP) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
"failed [error=%d]", error);
return (error);
}
}
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
@@ -3947,6 +4031,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
if (vd->vdev_ops->vdev_op_leaf) {
vs->vs_rsize += VDEV_LABEL_START_SIZE +
VDEV_LABEL_END_SIZE;
@@ -3973,7 +4058,11 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
vs->vs_trim_state = vd->vdev_trim_state;
vs->vs_trim_action_time = vd->vdev_trim_action_time;
/* Set when there is a deferred resilver. */
vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
}
/*
* Report expandable space on top-level, non-auxiliary devices
* only. The expandable space is reported in terms of metaslab
@@ -3985,13 +4074,16 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vd->vdev_max_asize - vd->vdev_asize,
1ULL << tvd->vdev_ms_shift);
}
/*
* Report fragmentation and rebuild progress for top-level,
* non-auxiliary, concrete devices.
*/
if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
vdev_is_concrete(vd)) {
vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
vd->vdev_mg->mg_fragmentation : 0;
}
if (vd->vdev_ops->vdev_op_leaf)
vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
}
vdev_get_stats_ex_impl(vd, vs, vsx);
@@ -4072,17 +4164,35 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
mutex_enter(&vd->vdev_stat_lock);
if (flags & ZIO_FLAG_IO_REPAIR) {
/*
* Repair is the result of a resilver issued by the
* scan thread (spa_sync).
*/
if (flags & ZIO_FLAG_SCAN_THREAD) {
dsl_scan_phys_t *scn_phys =
&spa->spa_dsl_pool->dp_scan->scn_phys;
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
dsl_scan_phys_t *scn_phys = &scn->scn_phys;
uint64_t *processed = &scn_phys->scn_processed;
/* XXX cleanup? */
if (vd->vdev_ops->vdev_op_leaf)
atomic_add_64(processed, psize);
vs->vs_scan_processed += psize;
}
/*
* Repair is the result of a rebuild issued by the
* rebuild thread (vdev_rebuild_thread).
*/
if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
vdev_t *tvd = vd->vdev_top;
vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
if (vd->vdev_ops->vdev_op_leaf)
atomic_add_64(rebuilt, psize);
vs->vs_rebuild_processed += psize;
}
if (flags & ZIO_FLAG_SELF_HEAL)
vs->vs_self_healed += psize;
}
@@ -4094,6 +4204,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (vd->vdev_ops->vdev_op_leaf &&
(zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
zio_type_t vs_type = type;
zio_priority_t priority = zio->io_priority;
/*
* TRIM ops and bytes are reported to user space as
@@ -4103,19 +4214,44 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (type == ZIO_TYPE_TRIM)
vs_type = ZIO_TYPE_IOCTL;
/*
* Solely for the purposes of 'zpool iostat -lqrw'
* reporting use the priority to catagorize the IO.
* Only the following are reported to user space:
*
* ZIO_PRIORITY_SYNC_READ,
* ZIO_PRIORITY_SYNC_WRITE,
* ZIO_PRIORITY_ASYNC_READ,
* ZIO_PRIORITY_ASYNC_WRITE,
* ZIO_PRIORITY_SCRUB,
* ZIO_PRIORITY_TRIM.
*/
if (priority == ZIO_PRIORITY_REBUILD) {
priority = ((type == ZIO_TYPE_WRITE) ?
ZIO_PRIORITY_ASYNC_WRITE :
ZIO_PRIORITY_SCRUB);
} else if (priority == ZIO_PRIORITY_INITIALIZING) {
ASSERT3U(type, ==, ZIO_TYPE_WRITE);
priority = ZIO_PRIORITY_ASYNC_WRITE;
} else if (priority == ZIO_PRIORITY_REMOVAL) {
priority = ((type == ZIO_TYPE_WRITE) ?
ZIO_PRIORITY_ASYNC_WRITE :
ZIO_PRIORITY_ASYNC_READ);
}
vs->vs_ops[vs_type]++;
vs->vs_bytes[vs_type] += psize;
if (flags & ZIO_FLAG_DELEGATED) {
vsx->vsx_agg_histo[zio->io_priority]
vsx->vsx_agg_histo[priority]
[RQ_HISTO(zio->io_size)]++;
} else {
vsx->vsx_ind_histo[zio->io_priority]
vsx->vsx_ind_histo[priority]
[RQ_HISTO(zio->io_size)]++;
}
if (zio->io_delta && zio->io_delay) {
vsx->vsx_queue_histo[zio->io_priority]
vsx->vsx_queue_histo[priority]
[L_HISTO(zio->io_delta - zio->io_delay)]++;
vsx->vsx_disk_histo[type]
[L_HISTO(zio->io_delay)]++;
+17
View File
@@ -404,6 +404,19 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
}
}
static void
top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
{
if (vd == vd->vdev_top) {
vdev_rebuild_stat_t vrs;
if (vdev_rebuild_get_stats(vd, &vrs) == 0) {
fnvlist_add_uint64_array(nvl,
ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs,
sizeof (vrs) / sizeof (uint64_t));
}
}
}
/*
* Generate the nvlist representing this vdev's config.
*/
@@ -559,6 +572,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vdev_config_generate_stats(vd, nv);
root_vdev_actions_getprogress(vd, nv);
top_vdev_actions_getprogress(vd, nv);
/*
* Note: this can be called from open context
@@ -663,6 +677,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_resilver_txg != 0)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
vd->vdev_resilver_txg);
if (vd->vdev_rebuild_txg != 0)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
vd->vdev_rebuild_txg);
if (vd->vdev_faulted)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
if (vd->vdev_degraded)
+3 -2
View File
@@ -767,8 +767,9 @@ vdev_mirror_io_done(zio_t *zio)
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
zio->io_abd, zio->io_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
zio->io_priority == ZIO_PRIORITY_REBUILD ?
ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
}
+16 -2
View File
@@ -158,6 +158,8 @@ uint32_t zfs_vdev_initializing_min_active = 1;
uint32_t zfs_vdev_initializing_max_active = 1;
uint32_t zfs_vdev_trim_min_active = 1;
uint32_t zfs_vdev_trim_max_active = 2;
uint32_t zfs_vdev_rebuild_min_active = 1;
uint32_t zfs_vdev_rebuild_max_active = 3;
/*
* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -278,6 +280,8 @@ vdev_queue_class_min_active(zio_priority_t p)
return (zfs_vdev_initializing_min_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_min_active);
case ZIO_PRIORITY_REBUILD:
return (zfs_vdev_rebuild_min_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -352,6 +356,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
return (zfs_vdev_initializing_max_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_max_active);
case ZIO_PRIORITY_REBUILD:
return (zfs_vdev_rebuild_max_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -845,7 +851,8 @@ vdev_queue_io(zio_t *zio)
zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
zio->io_priority != ZIO_PRIORITY_SCRUB &&
zio->io_priority != ZIO_PRIORITY_REMOVAL &&
zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
zio->io_priority != ZIO_PRIORITY_INITIALIZING &&
zio->io_priority != ZIO_PRIORITY_REBUILD) {
zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
}
} else if (zio->io_type == ZIO_TYPE_WRITE) {
@@ -854,7 +861,8 @@ vdev_queue_io(zio_t *zio)
if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_REMOVAL &&
zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
zio->io_priority != ZIO_PRIORITY_INITIALIZING &&
zio->io_priority != ZIO_PRIORITY_REBUILD) {
zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
}
} else {
@@ -1051,6 +1059,12 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW,
"Min active trim/discard I/Os per vdev");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
"Max active rebuild I/Os per vdev");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
"Min active rebuild I/Os per vdev");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
"Queue depth percentage for each top-level vdev");
/* END CSTYLED */
File diff suppressed because it is too large Load Diff
+4 -2
View File
@@ -1938,8 +1938,9 @@ static int
zfs_ioc_vdev_attach(zfs_cmd_t *zc)
{
spa_t *spa;
int replacing = zc->zc_cookie;
nvlist_t *config;
int replacing = zc->zc_cookie;
int rebuild = zc->zc_simple;
int error;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
@@ -1947,7 +1948,8 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc)
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config)) == 0) {
error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
error = spa_vdev_attach(spa, zc->zc_guid, config, replacing,
rebuild);
nvlist_free(config);
}