mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-04-06 17:49:11 +03:00
Parallelize vdev_load
metaslab_init is the slowest part of importing a mature pool, and it must be repeated hundreds of times for each top-level vdev. But its speed is dominated by a few serialized disk accesses. That can lead to import times of > 1 hour for pools with many top-level vdevs on spinny disks. Speed up the import by using a taskqueue to parallelize vdev_load across all top-level vdevs.
This also requires adding mutex protection to metaslab_class_t.mc_histogram. The mc_histogram fields were unprotected when that code was first written in "Illumos 4976-4984 - metaslab improvements" (OpenZFS f3a7f6610f). The lock wasn't added until 3dfb57a35e, though it's unclear exactly which fields it's supposed to protect. In any case, it wasn't until vdev_load was parallelized that any code attempted concurrent access to those fields.
Sponsored by: Axcient
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alan Somers <asomers@gmail.com>
Closes #11470
This commit is contained in:
parent
dfb44c500e
commit
a0e01997ec
@ -269,6 +269,7 @@ struct vdev {
|
|||||||
boolean_t vdev_expanding; /* expand the vdev? */
|
boolean_t vdev_expanding; /* expand the vdev? */
|
||||||
boolean_t vdev_reopening; /* reopen in progress? */
|
boolean_t vdev_reopening; /* reopen in progress? */
|
||||||
boolean_t vdev_nonrot; /* true if solid state */
|
boolean_t vdev_nonrot; /* true if solid state */
|
||||||
|
int vdev_load_error; /* error on last load */
|
||||||
int vdev_open_error; /* error on last open */
|
int vdev_open_error; /* error on last open */
|
||||||
kthread_t *vdev_open_thread; /* thread opening children */
|
kthread_t *vdev_open_thread; /* thread opening children */
|
||||||
uint64_t vdev_crtxg; /* txg when top-level was added */
|
uint64_t vdev_crtxg; /* txg when top-level was added */
|
||||||
|
@ -522,6 +522,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
|
|||||||
mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
|
mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
|
||||||
KM_SLEEP);
|
KM_SLEEP);
|
||||||
|
|
||||||
|
mutex_enter(&mc->mc_lock);
|
||||||
for (int c = 0; c < rvd->vdev_children; c++) {
|
for (int c = 0; c < rvd->vdev_children; c++) {
|
||||||
vdev_t *tvd = rvd->vdev_child[c];
|
vdev_t *tvd = rvd->vdev_child[c];
|
||||||
metaslab_group_t *mg = vdev_get_mg(tvd, mc);
|
metaslab_group_t *mg = vdev_get_mg(tvd, mc);
|
||||||
@ -546,6 +547,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
|
|||||||
VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
|
VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mutex_exit(&mc->mc_lock);
|
||||||
kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
|
kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1067,6 +1069,7 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
mutex_enter(&mg->mg_lock);
|
mutex_enter(&mg->mg_lock);
|
||||||
|
mutex_enter(&mc->mc_lock);
|
||||||
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
|
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
|
||||||
IMPLY(mg == mg->mg_vd->vdev_log_mg,
|
IMPLY(mg == mg->mg_vd->vdev_log_mg,
|
||||||
mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
|
mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
|
||||||
@ -1075,6 +1078,7 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
|
|||||||
mc->mc_histogram[i + ashift] +=
|
mc->mc_histogram[i + ashift] +=
|
||||||
msp->ms_sm->sm_phys->smp_histogram[i];
|
msp->ms_sm->sm_phys->smp_histogram[i];
|
||||||
}
|
}
|
||||||
|
mutex_exit(&mc->mc_lock);
|
||||||
mutex_exit(&mg->mg_lock);
|
mutex_exit(&mg->mg_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1089,6 +1093,7 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
mutex_enter(&mg->mg_lock);
|
mutex_enter(&mg->mg_lock);
|
||||||
|
mutex_enter(&mc->mc_lock);
|
||||||
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
|
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
|
||||||
ASSERT3U(mg->mg_histogram[i + ashift], >=,
|
ASSERT3U(mg->mg_histogram[i + ashift], >=,
|
||||||
msp->ms_sm->sm_phys->smp_histogram[i]);
|
msp->ms_sm->sm_phys->smp_histogram[i]);
|
||||||
@ -1102,6 +1107,7 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
|
|||||||
mc->mc_histogram[i + ashift] -=
|
mc->mc_histogram[i + ashift] -=
|
||||||
msp->ms_sm->sm_phys->smp_histogram[i];
|
msp->ms_sm->sm_phys->smp_histogram[i];
|
||||||
}
|
}
|
||||||
|
mutex_exit(&mc->mc_lock);
|
||||||
mutex_exit(&mg->mg_lock);
|
mutex_exit(&mg->mg_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1724,6 +1724,14 @@ vdev_probe(vdev_t *vd, zio_t *zio)
|
|||||||
return (NULL);
|
return (NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
vdev_load_child(void *arg)
|
||||||
|
{
|
||||||
|
vdev_t *vd = arg;
|
||||||
|
|
||||||
|
vd->vdev_load_error = vdev_load(vd);
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
vdev_open_child(void *arg)
|
vdev_open_child(void *arg)
|
||||||
{
|
{
|
||||||
@ -3350,18 +3358,46 @@ vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
|
|||||||
int
|
int
|
||||||
vdev_load(vdev_t *vd)
|
vdev_load(vdev_t *vd)
|
||||||
{
|
{
|
||||||
|
int children = vd->vdev_children;
|
||||||
int error = 0;
|
int error = 0;
|
||||||
|
taskq_t *tq = NULL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* It's only worthwhile to use the taskq for the root vdev, because the
|
||||||
|
* slow part is metaslab_init, and that only happens for top-level
|
||||||
|
* vdevs.
|
||||||
|
*/
|
||||||
|
if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
|
||||||
|
tq = taskq_create("vdev_load", children, minclsyspri,
|
||||||
|
children, children, TASKQ_PREPOPULATE);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Recursively load all children.
|
* Recursively load all children.
|
||||||
*/
|
*/
|
||||||
for (int c = 0; c < vd->vdev_children; c++) {
|
for (int c = 0; c < vd->vdev_children; c++) {
|
||||||
error = vdev_load(vd->vdev_child[c]);
|
vdev_t *cvd = vd->vdev_child[c];
|
||||||
if (error != 0) {
|
|
||||||
return (error);
|
if (tq == NULL || vdev_uses_zvols(cvd)) {
|
||||||
|
cvd->vdev_load_error = vdev_load(cvd);
|
||||||
|
} else {
|
||||||
|
VERIFY(taskq_dispatch(tq, vdev_load_child,
|
||||||
|
cvd, TQ_SLEEP) != TASKQID_INVALID);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (tq != NULL) {
|
||||||
|
taskq_wait(tq);
|
||||||
|
taskq_destroy(tq);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int c = 0; c < vd->vdev_children; c++) {
|
||||||
|
int error = vd->vdev_child[c]->vdev_load_error;
|
||||||
|
|
||||||
|
if (error != 0)
|
||||||
|
return (error);
|
||||||
|
}
|
||||||
|
|
||||||
vdev_set_deflate_ratio(vd);
|
vdev_set_deflate_ratio(vd);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
Loading…
Reference in New Issue
Block a user