Add support for parallel pool exports

Changed spa_export_common() such that it no longer holds the
spa_namespace_lock for the entire duration and instead sets
spa_export_thread to indicate an import is in progress on the
spa.  This allows for an export to a diffent pool to proceed
in parallel while an export is still processing potentially
long operations like spa_unload_log_sm_flush_all().

Calls like spa_lookup() and spa_vdev_enter() that rely on
the spa_namespace_lock to serialize them against a concurrent
export, now wait for any in-progress export thread to complete
before proceeding.

The 'zpool import -a' sub-command also provides multi-threaded
support, using a thread pool to submit the exports in parallel.

Sponsored-By: Klara Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Wilson <gwilson@delphix.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #16153
This commit is contained in:
Don Brady
2024-05-02 19:28:10 +00:00
committed by Brian Behlendorf
parent abec7dcd30
commit 975a13259b
12 changed files with 373 additions and 33 deletions
+2 -2
View File
@@ -8143,11 +8143,11 @@ l2arc_dev_get_next(void)
ASSERT3P(next, !=, NULL);
} while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
next->l2ad_trim_all);
next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting);
/* if we were unable to find any usable vdevs, return NULL */
if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
next->l2ad_trim_all)
next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting)
next = NULL;
l2arc_dev_last = next;
+29 -7
View File
@@ -34,6 +34,7 @@
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
* Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
* Copyright (c) 2024, Klara Inc.
*/
/*
@@ -1991,7 +1992,8 @@ spa_destroy_aux_threads(spa_t *spa)
static void
spa_unload(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
spa_import_progress_remove(spa_guid(spa));
@@ -6955,7 +6957,7 @@ static int
spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
boolean_t force, boolean_t hardforce)
{
int error;
int error = 0;
spa_t *spa;
hrtime_t export_start = gethrtime();
@@ -6979,8 +6981,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
spa->spa_is_exporting = B_TRUE;
/*
* Put a hold on the pool, drop the namespace lock, stop async tasks,
* reacquire the namespace lock, and see if we can export.
* Put a hold on the pool, drop the namespace lock, stop async tasks
* and see if we can export.
*/
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
@@ -6990,10 +6992,18 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
taskq_wait(spa->spa_zvol_taskq);
}
mutex_enter(&spa_namespace_lock);
spa->spa_export_thread = curthread;
spa_close(spa, FTAG);
mutex_exit(&spa_namespace_lock);
/*
* At this point we no longer hold the spa_namespace_lock and
* the spa_export_thread indicates that an export is in progress.
*/
if (spa->spa_state == POOL_STATE_UNINITIALIZED)
goto export_spa;
/*
* The pool will be in core if it's openable, in which case we can
* modify its state. Objsets may be open only because they're dirty,
@@ -7089,6 +7099,10 @@ export_spa:
if (oldconfig && spa->spa_config)
*oldconfig = fnvlist_dup(spa->spa_config);
if (new_state == POOL_STATE_EXPORTED)
zio_handle_export_delay(spa, gethrtime() - export_start);
mutex_enter(&spa_namespace_lock);
if (new_state != POOL_STATE_UNINITIALIZED) {
if (!hardforce)
spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
@@ -7100,17 +7114,25 @@ export_spa:
* we make sure to reset the exporting flag.
*/
spa->spa_is_exporting = B_FALSE;
spa->spa_export_thread = NULL;
}
if (new_state == POOL_STATE_EXPORTED)
zio_handle_export_delay(spa, gethrtime() - export_start);
/*
* Wake up any waiters on spa_namespace_lock
* They need to re-attempt a spa_lookup()
*/
cv_broadcast(&spa_namespace_cv);
mutex_exit(&spa_namespace_lock);
return (0);
fail:
mutex_enter(&spa_namespace_lock);
spa->spa_is_exporting = B_FALSE;
spa->spa_export_thread = NULL;
spa_async_resume(spa);
/* Wake up any waiters on spa_namespace_lock */
cv_broadcast(&spa_namespace_cv);
mutex_exit(&spa_namespace_lock);
return (error);
}
+40 -10
View File
@@ -27,7 +27,7 @@
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright (c) 2023, Klara Inc.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#include <sys/zfs_context.h>
@@ -82,8 +82,8 @@
* - Check if spa_refcount is zero
* - Rename a spa_t
* - add/remove/attach/detach devices
* - Held for the duration of create/destroy/export
* - Held at the start and end of import
* - Held for the duration of create/destroy
* - Held at the start and end of import and export
*
* It does not need to handle recursion. A create or destroy may
* reference objects (files or zvols) in other pools, but by
@@ -636,8 +636,14 @@ retry:
if (spa == NULL)
return (NULL);
if (spa->spa_load_thread != NULL &&
spa->spa_load_thread != curthread) {
/*
* Avoid racing with import/export, which don't hold the namespace
* lock for their entire duration.
*/
if ((spa->spa_load_thread != NULL &&
spa->spa_load_thread != curthread) ||
(spa->spa_export_thread != NULL &&
spa->spa_export_thread != curthread)) {
cv_wait(&spa_namespace_cv, &spa_namespace_lock);
goto retry;
}
@@ -950,14 +956,15 @@ spa_open_ref(spa_t *spa, const void *tag)
/*
* Remove a reference to the given spa_t. Must have at least one reference, or
* have the namespace lock held.
* have the namespace lock held or be part of a pool import/export.
*/
void
spa_close(spa_t *spa, const void *tag)
{
ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread);
spa->spa_load_thread == curthread ||
spa->spa_export_thread == curthread);
(void) zfs_refcount_remove(&spa->spa_refcount, tag);
}
@@ -977,13 +984,15 @@ spa_async_close(spa_t *spa, const void *tag)
/*
* Check to see if the spa refcount is zero. Must be called with
* spa_namespace_lock held. We really compare against spa_minref, which is the
* number of references acquired when opening a pool
* spa_namespace_lock held or be the spa export thread. We really
* compare against spa_minref, which is the number of references
* acquired when opening a pool
*/
boolean_t
spa_refcount_zero(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref);
}
@@ -1231,6 +1240,21 @@ spa_vdev_enter(spa_t *spa)
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
/*
* We have a reference on the spa and a spa export could be
* starting but no longer holding the spa_namespace_lock. So
* check if there is an export and if so wait. It will fail
* fast (EBUSY) since we are still holding a spa reference.
*
* Note that we can be woken by a different spa transitioning
* through an import/export, so we must wait for our condition
* to change before proceeding.
*/
while (spa->spa_export_thread != NULL &&
spa->spa_export_thread != curthread) {
cv_wait(&spa_namespace_cv, &spa_namespace_lock);
}
vdev_autotrim_stop_all(spa);
return (spa_vdev_config_enter(spa));
@@ -1248,6 +1272,12 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
/* See comment in spa_vdev_enter() */
while (spa->spa_export_thread != NULL &&
spa->spa_export_thread != curthread) {
cv_wait(&spa_namespace_cv, &spa_namespace_lock);
}
vdev_autotrim_stop_all(spa);
if (guid != 0) {
+6 -3
View File
@@ -682,7 +682,8 @@ vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
(void) spa;
vdev_t *vd;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
while ((vd = list_remove_head(vd_list)) != NULL) {
mutex_enter(&vd->vdev_initialize_lock);
@@ -724,7 +725,8 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
if (vd_list == NULL) {
vdev_initialize_stop_wait_impl(vd);
} else {
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
vd->vdev_spa->spa_export_thread == curthread);
list_insert_tail(vd_list, vd);
}
}
@@ -756,7 +758,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
spa_t *spa = vd->vdev_spa;
list_t vd_list;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
list_create(&vd_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_initialize_node));
+2 -1
View File
@@ -1087,7 +1087,8 @@ vdev_rebuild_stop_wait(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
if (vd == spa->spa_root_vdev) {
for (uint64_t i = 0; i < vd->vdev_children; i++)
+6 -3
View File
@@ -1040,7 +1040,8 @@ vdev_trim_stop_wait(spa_t *spa, list_t *vd_list)
(void) spa;
vdev_t *vd;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
while ((vd = list_remove_head(vd_list)) != NULL) {
mutex_enter(&vd->vdev_trim_lock);
@@ -1079,7 +1080,8 @@ vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list)
if (vd_list == NULL) {
vdev_trim_stop_wait_impl(vd);
} else {
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
vd->vdev_spa->spa_export_thread == curthread);
list_insert_tail(vd_list, vd);
}
}
@@ -1115,7 +1117,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
list_t vd_list;
vdev_t *vd_l2cache;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
list_create(&vd_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_trim_node));