Parallel pool import

This commit allows spa_load() to drop the spa_namespace_lock so
that imports can happen concurrently. Prior to dropping the
spa_namespace_lock, the import logic will set the spa_load_thread
value to track the thread which is doing the import.

Consumers of spa_lookup() retain the same behavior by blocking
when either a thread is holding the spa_namespace_lock or the
spa_load_thread value is set. This will ensure that critical
concurrent operations cannot take place while a pool is being
imported.

The zpool command is also enhanced to provide multi-threaded support
when invoking zpool import -a.

Lastly, zinject provides a mechanism to insert artificial delays
when importing a pool and new zfs tests are added to verify parallel
import functionality.

Contributions-by: Don Brady <don.brady@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: George Wilson <gwilson@delphix.com>
Closes #16093
This commit is contained in:
George Wilson
2024-04-22 12:42:38 -04:00
committed by GitHub
parent f4f156157d
commit c183d164aa
19 changed files with 818 additions and 72 deletions
+40 -18
View File
@@ -3273,8 +3273,6 @@ spa_spawn_aux_threads(spa_t *spa)
{
ASSERT(spa_writeable(spa));
ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa_start_raidz_expansion_thread(spa);
spa_start_indirect_condensing_thread(spa);
spa_start_livelist_destroy_thread(spa);
@@ -4981,7 +4979,8 @@ spa_ld_read_checkpoint_txg(spa_t *spa)
int error = 0;
ASSERT0(spa->spa_checkpoint_txg);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread);
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
@@ -5228,6 +5227,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
boolean_t checkpoint_rewind =
(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
boolean_t update_config_cache = B_FALSE;
hrtime_t load_start = gethrtime();
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
@@ -5272,13 +5272,19 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
return (error);
}
/*
* Drop the namespace lock for the rest of the function.
*/
spa->spa_load_thread = curthread;
mutex_exit(&spa_namespace_lock);
/*
* Retrieve the checkpoint txg if the pool has a checkpoint.
*/
spa_import_progress_set_notes(spa, "Loading checkpoint txg");
error = spa_ld_read_checkpoint_txg(spa);
if (error != 0)
return (error);
goto fail;
/*
* Retrieve the mapping of indirect vdevs. Those vdevs were removed
@@ -5291,7 +5297,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
spa_import_progress_set_notes(spa, "Loading indirect vdev metadata");
error = spa_ld_open_indirect_vdev_metadata(spa);
if (error != 0)
return (error);
goto fail;
/*
* Retrieve the full list of active features from the MOS and check if
@@ -5300,7 +5306,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
spa_import_progress_set_notes(spa, "Checking feature flags");
error = spa_ld_check_features(spa, &missing_feat_write);
if (error != 0)
return (error);
goto fail;
/*
* Load several special directories from the MOS needed by the dsl_pool
@@ -5309,7 +5315,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
spa_import_progress_set_notes(spa, "Loading special MOS directories");
error = spa_ld_load_special_directories(spa);
if (error != 0)
return (error);
goto fail;
/*
* Retrieve pool properties from the MOS.
@@ -5317,7 +5323,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
spa_import_progress_set_notes(spa, "Loading properties");
error = spa_ld_get_props(spa);
if (error != 0)
return (error);
goto fail;
/*
* Retrieve the list of auxiliary devices - cache devices and spares -
@@ -5326,7 +5332,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
spa_import_progress_set_notes(spa, "Loading AUX vdevs");
error = spa_ld_open_aux_vdevs(spa, type);
if (error != 0)
return (error);
goto fail;
/*
* Load the metadata for all vdevs. Also check if unopenable devices
@@ -5335,17 +5341,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
spa_import_progress_set_notes(spa, "Loading vdev metadata");
error = spa_ld_load_vdev_metadata(spa);
if (error != 0)
return (error);
goto fail;
spa_import_progress_set_notes(spa, "Loading dedup tables");
error = spa_ld_load_dedup_tables(spa);
if (error != 0)
return (error);
goto fail;
spa_import_progress_set_notes(spa, "Loading BRT");
error = spa_ld_load_brt(spa);
if (error != 0)
return (error);
goto fail;
/*
* Verify the logs now to make sure we don't have any unexpected errors
@@ -5354,7 +5360,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
spa_import_progress_set_notes(spa, "Verifying Log Devices");
error = spa_ld_verify_logs(spa, type, ereport);
if (error != 0)
return (error);
goto fail;
if (missing_feat_write) {
ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
@@ -5364,8 +5370,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
* read-only mode but not read-write mode. We now have enough
* information and can return to userland.
*/
return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
ENOTSUP));
error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
ENOTSUP);
goto fail;
}
/*
@@ -5376,7 +5383,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
spa_import_progress_set_notes(spa, "Verifying pool data");
error = spa_ld_verify_pool_data(spa);
if (error != 0)
return (error);
goto fail;
/*
* Calculate the deflated space for the pool. This must be done before
@@ -5501,13 +5508,19 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
spa_config_exit(spa, SCL_CONFIG, FTAG);
spa_import_progress_set_notes(spa, "Finished importing");
}
zio_handle_import_delay(spa, gethrtime() - load_start);
spa_import_progress_remove(spa_guid(spa));
spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
spa_load_note(spa, "LOADED");
fail:
mutex_enter(&spa_namespace_lock);
spa->spa_load_thread = NULL;
cv_broadcast(&spa_namespace_cv);
return (error);
return (0);
}
static int
@@ -6757,9 +6770,14 @@ spa_tryimport(nvlist_t *tryconfig)
/*
* Create and initialize the spa structure.
*/
char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
(void) snprintf(name, MAXPATHLEN, "%s-%llx-%s",
TRYIMPORT_NAME, (u_longlong_t)curthread, poolname);
mutex_enter(&spa_namespace_lock);
spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
spa = spa_add(name, tryconfig, NULL);
spa_activate(spa, SPA_MODE_READ);
kmem_free(name, MAXPATHLEN);
/*
* Rewind pool if a max txg was provided.
@@ -6874,6 +6892,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
{
int error;
spa_t *spa;
hrtime_t export_start = gethrtime();
if (oldconfig)
*oldconfig = NULL;
@@ -7018,6 +7037,9 @@ export_spa:
spa->spa_is_exporting = B_FALSE;
}
if (new_state == POOL_STATE_EXPORTED)
zio_handle_export_delay(spa, gethrtime() - export_start);
mutex_exit(&spa_namespace_lock);
return (0);
+19 -7
View File
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -82,7 +82,8 @@
* - Check if spa_refcount is zero
* - Rename a spa_t
* - add/remove/attach/detach devices
* - Held for the duration of create/destroy/import/export
* - Held for the duration of create/destroy/export
* - Held at the start and end of import
*
* It does not need to handle recursion. A create or destroy may
* reference objects (files or zvols) in other pools, but by
@@ -235,9 +236,9 @@
* locking is, always, based on spa_namespace_lock and spa_config_lock[].
*/
static avl_tree_t spa_namespace_avl;
avl_tree_t spa_namespace_avl;
kmutex_t spa_namespace_lock;
static kcondvar_t spa_namespace_cv;
kcondvar_t spa_namespace_cv;
static const int spa_max_replication_override = SPA_DVAS_PER_BP;
static kmutex_t spa_spare_lock;
@@ -619,6 +620,7 @@ spa_lookup(const char *name)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
retry:
(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
/*
@@ -630,6 +632,14 @@ spa_lookup(const char *name)
*cp = '\0';
spa = avl_find(&spa_namespace_avl, &search, &where);
if (spa == NULL)
return (NULL);
if (spa->spa_load_thread != NULL &&
spa->spa_load_thread != curthread) {
cv_wait(&spa_namespace_cv, &spa_namespace_lock);
goto retry;
}
return (spa);
}
@@ -728,6 +738,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa_config_lock_init(spa);
spa_stats_init(spa);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
avl_add(&spa_namespace_avl, spa);
/*
@@ -826,7 +837,6 @@ spa_remove(spa_t *spa)
nvlist_free(spa->spa_config_splitting);
avl_remove(&spa_namespace_avl, spa);
cv_broadcast(&spa_namespace_cv);
if (spa->spa_root)
spa_strfree(spa->spa_root);
@@ -920,7 +930,8 @@ void
spa_open_ref(spa_t *spa, const void *tag)
{
ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
MUTEX_HELD(&spa_namespace_lock));
MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread);
(void) zfs_refcount_add(&spa->spa_refcount, tag);
}
@@ -932,7 +943,8 @@ void
spa_close(spa_t *spa, const void *tag)
{
ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
MUTEX_HELD(&spa_namespace_lock));
MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread);
(void) zfs_refcount_remove(&spa->spa_refcount, tag);
}
+3 -2
View File
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2016, 2024 by Delphix. All rights reserved.
*/
#include <sys/spa.h>
@@ -775,7 +775,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
void
vdev_initialize_restart(vdev_t *vd)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
vd->vdev_spa->spa_load_thread == curthread);
ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
if (vd->vdev_leaf_zap != 0) {
+3 -1
View File
@@ -23,6 +23,7 @@
* Copyright (c) 2018, Intel Corporation.
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
* Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
* Copyright (c) 2024 by Delphix. All rights reserved.
*/
#include <sys/vdev_impl.h>
@@ -1071,7 +1072,8 @@ vdev_rebuild_restart_impl(vdev_t *vd)
void
vdev_rebuild_restart(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread);
vdev_rebuild_restart_impl(spa->spa_root_vdev);
}
+5 -4
View File
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright (c) 2016, 2024 by Delphix. All rights reserved.
* Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
* Copyright (c) 2021 Hewlett Packard Enterprise Development LP
* Copyright 2023 RackTop Systems, Inc.
@@ -1148,7 +1148,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
void
vdev_trim_restart(vdev_t *vd)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
vd->vdev_spa->spa_load_thread == curthread);
ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
if (vd->vdev_leaf_zap != 0) {
@@ -1568,8 +1569,8 @@ vdev_autotrim_stop_all(spa_t *spa)
void
vdev_autotrim_restart(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread);
if (spa->spa_autotrim)
vdev_autotrim(spa);
}
+129 -9
View File
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2024, Klara Inc.
*/
/*
@@ -59,6 +60,7 @@ uint32_t zio_injection_enabled = 0;
typedef struct inject_handler {
int zi_id;
spa_t *zi_spa;
char *zi_spa_name; /* ZINJECT_DELAY_IMPORT only */
zinject_record_t zi_record;
uint64_t *zi_lanes;
int zi_next_lane;
@@ -703,6 +705,63 @@ zio_handle_io_delay(zio_t *zio)
return (min_target);
}
/*
 * Apply a one-shot zinject delay to a pool import or export.
 *
 * Walks the injection handler list looking for a handler of type
 * 'command' whose recorded pool name matches spa_name(spa). If one is
 * found and its configured duration (zi_duration, converted from seconds
 * with SEC2NSEC) is larger than 'elapsed' (nanoseconds already spent on
 * the operation), the calling thread sleeps for the remainder. The
 * matched handler is then cleared, so it applies at most once.
 *
 * spa     - pool being imported or exported
 * elapsed - time already consumed by the operation, in nanoseconds
 * command - ZINJECT_DELAY_IMPORT or ZINJECT_DELAY_EXPORT
 */
static void
zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command)
{
	inject_handler_t *handler;
	hrtime_t delay = 0;
	int id = 0;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		/*
		 * The list can hold handlers of other injection types, so
		 * skip the ones that don't match instead of ending the walk
		 * at the first mismatch; otherwise a pool delay handler
		 * queued behind any other handler would never be found.
		 */
		if (handler->zi_record.zi_cmd != command)
			continue;

		/* Pool delay handlers are matched by name, not by spa_t. */
		ASSERT3P(handler->zi_spa_name, !=, NULL);
		if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) {
			uint64_t pause =
			    SEC2NSEC(handler->zi_record.zi_duration);
			/* Only sleep for the part of the pause not yet spent. */
			if (pause > elapsed) {
				delay = pause - elapsed;
			}
			id = handler->zi_id;
			break;
		}
	}

	rw_exit(&inject_lock);

	if (delay) {
		if (command == ZINJECT_DELAY_IMPORT) {
			spa_import_progress_set_notes(spa, "injecting %llu "
			    "sec delay", (u_longlong_t)NSEC2SEC(delay));
		}
		zfs_sleep_until(gethrtime() + delay);
	}

	/*
	 * The handler is removed only after the reader lock has been
	 * dropped and the sleep (if any) has finished.
	 */
	if (id) {
		/* all done with this one-shot handler */
		zio_clear_fault(id);
	}
}
/*
 * For testing, inject a delay during an import.
 *
 * Forwards to zio_handle_pool_delay() with ZINJECT_DELAY_IMPORT.
 * 'elapsed' is the time already spent on the import; the caller in
 * spa_load_impl() passes gethrtime() minus the hrtime captured when the
 * load started.
 */
void
zio_handle_import_delay(spa_t *spa, hrtime_t elapsed)
{
zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT);
}
/*
 * For testing, inject a delay during an export.
 *
 * Forwards to zio_handle_pool_delay() with ZINJECT_DELAY_EXPORT.
 * 'elapsed' is the time already spent on the export; the caller in
 * spa_export_common() passes gethrtime() minus the hrtime captured on
 * entry, and only calls this when new_state is POOL_STATE_EXPORTED.
 */
void
zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
{
zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
}
static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
@@ -760,6 +819,28 @@ zio_calculate_range(const char *pool, zinject_record_t *record)
return (0);
}
/*
 * Report whether an injection handler of type 'command' is already
 * registered for the pool called 'name'.
 *
 * A handler's pool name comes from zi_spa_name when that is set, and
 * from spa_name(zi_spa) otherwise. The handler list is walked under
 * inject_lock held as reader.
 *
 * Returns B_TRUE if a matching handler is found, B_FALSE otherwise.
 */
static boolean_t
zio_pool_handler_exists(const char *name, zinject_type_t command)
{
	boolean_t found = B_FALSE;
	inject_handler_t *ih;

	rw_enter(&inject_lock, RW_READER);

	ih = list_head(&inject_handlers);
	while (ih != NULL) {
		if (ih->zi_record.zi_cmd == command) {
			const char *owner;

			if (ih->zi_spa_name != NULL)
				owner = ih->zi_spa_name;
			else
				owner = spa_name(ih->zi_spa);

			if (strcmp(name, owner) == 0) {
				found = B_TRUE;
				break;
			}
		}
		ih = list_next(&inject_handlers, ih);
	}

	rw_exit(&inject_lock);

	return (found);
}
/*
* Create a new handler for the given record. We add it to the list, adding
* a reference to the spa_t in the process. We increment zio_injection_enabled,
@@ -810,16 +891,42 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
if (!(flags & ZINJECT_NULL)) {
/*
* spa_inject_ref() will add an injection reference, which will
* prevent the pool from being removed from the namespace while
* still allowing it to be unloaded.
* Pool delays for import or export don't take an
* injection reference on the spa. Instead they
* rely on matching by name.
*/
if ((spa = spa_inject_addref(name)) == NULL)
return (SET_ERROR(ENOENT));
if (record->zi_cmd == ZINJECT_DELAY_IMPORT ||
record->zi_cmd == ZINJECT_DELAY_EXPORT) {
if (record->zi_duration <= 0)
return (SET_ERROR(EINVAL));
/*
* Only one import | export delay handler per pool.
*/
if (zio_pool_handler_exists(name, record->zi_cmd))
return (SET_ERROR(EEXIST));
mutex_enter(&spa_namespace_lock);
boolean_t has_spa = spa_lookup(name) != NULL;
mutex_exit(&spa_namespace_lock);
if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa)
return (SET_ERROR(EEXIST));
if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa)
return (SET_ERROR(ENOENT));
spa = NULL;
} else {
/*
* spa_inject_addref() will add an injection reference,
* which will prevent the pool from being removed
* from the namespace while still allowing it to be
* unloaded.
*/
if ((spa = spa_inject_addref(name)) == NULL)
return (SET_ERROR(ENOENT));
}
handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
handler->zi_spa = spa;
handler->zi_spa = spa; /* note: can be NULL */
handler->zi_record = *record;
if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
@@ -832,6 +939,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
handler->zi_next_lane = 0;
}
if (handler->zi_spa == NULL)
handler->zi_spa_name = spa_strdup(name);
else
handler->zi_spa_name = NULL;
rw_enter(&inject_lock, RW_WRITER);
/*
@@ -891,7 +1003,11 @@ zio_inject_list_next(int *id, char *name, size_t buflen,
if (handler) {
*record = handler->zi_record;
*id = handler->zi_id;
(void) strlcpy(name, spa_name(handler->zi_spa), buflen);
ASSERT(handler->zi_spa || handler->zi_spa_name);
if (handler->zi_spa != NULL)
(void) strlcpy(name, spa_name(handler->zi_spa), buflen);
else
(void) strlcpy(name, handler->zi_spa_name, buflen);
ret = 0;
} else {
ret = SET_ERROR(ENOENT);
@@ -941,7 +1057,11 @@ zio_clear_fault(int id)
ASSERT3P(handler->zi_lanes, ==, NULL);
}
spa_inject_delref(handler->zi_spa);
if (handler->zi_spa_name != NULL)
spa_strfree(handler->zi_spa_name);
if (handler->zi_spa != NULL)
spa_inject_delref(handler->zi_spa);
kmem_free(handler, sizeof (inject_handler_t));
atomic_dec_32(&zio_injection_enabled);