From c10d37dd9f5d56e19330add732f1a713250d24f7 Mon Sep 17 00:00:00 2001
From: George Wilson
Date: Wed, 19 Dec 2018 09:20:39 -0700
Subject: [PATCH] zfs initialize performance enhancements

PROBLEM
========
When invoking "zpool initialize" on a pool the command will
create a thread to initialize each disk. Unfortunately, it does
this serially across many transaction groups which can result
in commands taking a long time to return to the user and may
appear hung. The same thing is true when trying to suspend/cancel
the operation.

SOLUTION
=========
This change refactors the way we invoke the initialize interface
to ensure we can start or stop the initialization in just a few
transaction groups.

When stopping or cancelling a vdev initialization perform it
in two phases. First signal each vdev initialization thread
that it should exit, then after all threads have been signaled
wait for them to exit.

On a pool with 40 leaf vdevs this reduces the vdev initialize
stop/cancel time from ~10 minutes to under a second. The reason
for this is spa_vdev_initialize() no longer needs to wait on
multiple full TXGs per leaf vdev being stopped.

This commit additionally adds some missing checks for the passed
"initialize_vdevs" input nvlist. The contents of the user provided
input "initialize_vdevs" nvlist must be validated to ensure all
values are uint64s. This is done in zfs_ioc_pool_initialize() in
order to keep all of these checks in a single location.

Updated the innvl and outnvl comments to match the formatting used
for all other new style ioctls.

Reviewed by: Matt Ahrens
Reviewed-by: loli10K
Reviewed-by: Tim Chase
Signed-off-by: Brian Behlendorf
Signed-off-by: George Wilson
Closes #8230
---
 cmd/ztest/ztest.c             |  9 +++-
 include/sys/spa.h             |  3 +-
 include/sys/vdev_impl.h       |  1 +
 include/sys/vdev_initialize.h |  5 +-
 module/zfs/spa.c              | 90 +++++++++++++++++++++++++----------
 module/zfs/spa_misc.c         |  3 +-
 module/zfs/vdev.c             |  1 +
 module/zfs/vdev_initialize.c  | 83 +++++++++++++++++++++++++-------
 module/zfs/vdev_removal.c     |  2 +-
 module/zfs/zfs_ioctl.c        | 57 ++++++++++------------
 10 files changed, 175 insertions(+), 79 deletions(-)

diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 385984f84..e98fe8cd3 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -6404,7 +6404,14 @@ ztest_initialize(ztest_ds_t *zd, uint64_t id)
     spa_config_exit(spa, SCL_VDEV, FTAG);
 
     uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS);
-    error = spa_vdev_initialize(spa, guid, cmd);
+
+    nvlist_t *vdev_guids = fnvlist_alloc();
+    nvlist_t *vdev_errlist = fnvlist_alloc();
+    fnvlist_add_uint64(vdev_guids, path, guid);
+    error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist);
+    fnvlist_free(vdev_guids);
+    fnvlist_free(vdev_errlist);
+
     switch (cmd) {
     case POOL_INITIALIZE_CANCEL:
         if (ztest_opts.zo_verbose >= 4) {
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 4a66260ef..febf0e8f2 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -788,7 +788,8 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
     int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
 extern boolean_t spa_vdev_remove_active(spa_t *spa);
-extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type);
+extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
+    nvlist_t *vdev_errlist);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
 extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index ae21e037e..5fe05c54b 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -263,6 +263,7 @@ struct vdev {
     boolean_t	vdev_initialize_exit_wanted;
     vdev_initializing_state_t	vdev_initialize_state;
+    list_node_t	vdev_initialize_node;
     kthread_t	*vdev_initialize_thread;
     /* Protects vdev_initialize_thread and vdev_initialize_state. */
     kmutex_t	vdev_initialize_lock;
diff --git a/include/sys/vdev_initialize.h b/include/sys/vdev_initialize.h
index db4b0572c..319fb9bc0 100644
--- a/include/sys/vdev_initialize.h
+++ b/include/sys/vdev_initialize.h
@@ -26,15 +26,18 @@
 #ifndef _SYS_VDEV_INITIALIZE_H
 #define _SYS_VDEV_INITIALIZE_H
 
+#include <sys/spa.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 extern void vdev_initialize(vdev_t *vd);
 extern void vdev_initialize_stop(vdev_t *vd,
-    vdev_initializing_state_t tgt_state);
+    vdev_initializing_state_t tgt_state, list_t *vd_list);
 extern void vdev_initialize_stop_all(vdev_t *vd,
     vdev_initializing_state_t tgt_state);
+extern void vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list);
 extern void vdev_initialize_restart(vdev_t *vd);
 extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
     range_seg_t *physical_rs);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 622be75f9..c4ff5002b 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -6381,32 +6381,24 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
     return (error);
 }
 
-int
-spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
+static int
+spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
+    list_t *vd_list)
 {
-    /*
-     * We hold the namespace lock through the whole function
-     * to prevent any changes to the pool while we're starting or
-     * stopping initialization. The config and state locks are held so that
-     * we can properly assess the vdev state before we commit to
-     * the initializing operation.
-     */
-    mutex_enter(&spa_namespace_lock);
+    ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
     spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 
     /* Look up vdev and ensure it's a leaf. */
     vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
     if (vd == NULL || vd->vdev_detached) {
         spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
-        mutex_exit(&spa_namespace_lock);
         return (SET_ERROR(ENODEV));
     } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
         spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
-        mutex_exit(&spa_namespace_lock);
         return (SET_ERROR(EINVAL));
     } else if (!vdev_writeable(vd)) {
         spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
-        mutex_exit(&spa_namespace_lock);
         return (SET_ERROR(EROFS));
     }
     mutex_enter(&vd->vdev_initialize_lock);
@@ -6423,18 +6415,15 @@ spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
         (vd->vdev_initialize_thread != NULL ||
         vd->vdev_top->vdev_removing)) {
         mutex_exit(&vd->vdev_initialize_lock);
-        mutex_exit(&spa_namespace_lock);
         return (SET_ERROR(EBUSY));
     } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
         (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
         vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
         mutex_exit(&vd->vdev_initialize_lock);
-        mutex_exit(&spa_namespace_lock);
         return (SET_ERROR(ESRCH));
     } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
         vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
         mutex_exit(&vd->vdev_initialize_lock);
-        mutex_exit(&spa_namespace_lock);
         return (SET_ERROR(ESRCH));
     }
@@ -6443,23 +6432,65 @@ spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
         vdev_initialize(vd);
         break;
     case POOL_INITIALIZE_CANCEL:
-        vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+        vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list);
         break;
     case POOL_INITIALIZE_SUSPEND:
-        vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
+        vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
         break;
     default:
         panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
     }
     mutex_exit(&vd->vdev_initialize_lock);
 
+    return (0);
+}
+
+int
+spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
+    nvlist_t *vdev_errlist)
+{
+    int total_errors = 0;
+    list_t vd_list;
+
+    list_create(&vd_list, sizeof (vdev_t),
+        offsetof(vdev_t, vdev_initialize_node));
+
+    /*
+     * We hold the namespace lock through the whole function
+     * to prevent any changes to the pool while we're starting or
+     * stopping initialization. The config and state locks are held so that
+     * we can properly assess the vdev state before we commit to
+     * the initializing operation.
+     */
+    mutex_enter(&spa_namespace_lock);
+
+    for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
+        pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
+        uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+        int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type,
+            &vd_list);
+        if (error != 0) {
+            char guid_as_str[MAXNAMELEN];
+
+            (void) snprintf(guid_as_str, sizeof (guid_as_str),
+                "%llu", (unsigned long long)vdev_guid);
+            fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+            total_errors++;
+        }
+    }
+
+    /* Wait for all initialize threads to stop. */
+    vdev_initialize_stop_wait(spa, &vd_list);
+
     /* Sync out the initializing state */
     txg_wait_synced(spa->spa_dsl_pool, 0);
     mutex_exit(&spa_namespace_lock);
 
-    return (0);
-}
+    list_destroy(&vd_list);
+    return (total_errors);
+}
 
 /*
  * Split a set of devices from their mirrors, and create a new pool from them.
@@ -6669,18 +6700,25 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
     spa_activate(newspa, spa_mode_global);
     spa_async_suspend(newspa);
 
+    /*
+     * Temporarily stop the initializing activity. We set the state to
+     * ACTIVE so that we know to resume the initializing once the split
+     * has completed.
+     */
+    list_t vd_list;
+    list_create(&vd_list, sizeof (vdev_t),
+        offsetof(vdev_t, vdev_initialize_node));
+
     for (c = 0; c < children; c++) {
         if (vml[c] != NULL) {
-            /*
-             * Temporarily stop the initializing activity. We set
-             * the state to ACTIVE so that we know to resume
-             * the initializing once the split has completed.
-             */
             mutex_enter(&vml[c]->vdev_initialize_lock);
-            vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE);
+            vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE,
+                &vd_list);
             mutex_exit(&vml[c]->vdev_initialize_lock);
         }
     }
 
+    vdev_initialize_stop_wait(spa, &vd_list);
+    list_destroy(&vd_list);
+
     newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index dfac92d45..877f312b1 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1197,7 +1197,8 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
         ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
         if (vd->vdev_ops->vdev_op_leaf) {
             mutex_enter(&vd->vdev_initialize_lock);
-            vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+            vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
+                NULL);
             mutex_exit(&vd->vdev_initialize_lock);
         }
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index f808f8ee7..26ef5b4c5 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -530,6 +530,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
     list_link_init(&vd->vdev_config_dirty_node);
     list_link_init(&vd->vdev_state_dirty_node);
+    list_link_init(&vd->vdev_initialize_node);
 
     mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
     mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
     mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index fcd2c76f9..939a5e938 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -702,18 +702,51 @@ vdev_initialize(vdev_t *vd)
 }
 
 /*
- * Stop initializng a device, with the resultant initialing state being
- * tgt_state. Blocks until the initializing thread has exited.
- * Caller must hold vdev_initialize_lock and must not be writing to the spa
- * config, as the initializing thread may try to enter the config as a reader
- * before exiting.
+ * Wait for the initialize thread to be terminated (cancelled or stopped).
+ */
+static void
+vdev_initialize_stop_wait_impl(vdev_t *vd)
+{
+    ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+
+    while (vd->vdev_initialize_thread != NULL)
+        cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
+
+    ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+    vd->vdev_initialize_exit_wanted = B_FALSE;
+}
+
+/*
+ * Wait for vdev initialize threads which were listed to cleanly exit.
  */
 void
-vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
+vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
 {
-    ASSERTV(spa_t *spa = vd->vdev_spa);
-    ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER));
+    vdev_t *vd;
+
+    ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+    while ((vd = list_remove_head(vd_list)) != NULL) {
+        mutex_enter(&vd->vdev_initialize_lock);
+        vdev_initialize_stop_wait_impl(vd);
+        mutex_exit(&vd->vdev_initialize_lock);
+    }
+}
+
+/*
+ * Stop initializing a device, with the resultant initializing state being
+ * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when
+ * a list_t is provided the stopping vdev is inserted into the list. Callers
+ * are then required to call vdev_initialize_stop_wait() to block for all the
+ * initialization threads to exit. The caller must hold vdev_initialize_lock
+ * and must not be writing to the spa config, as the initializing thread may
+ * try to enter the config as a reader before exiting.
+ */
+void
+vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
+    list_t *vd_list)
+{
+    ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
     ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
     ASSERT(vd->vdev_ops->vdev_op_leaf);
     ASSERT(vdev_is_concrete(vd));
@@ -729,25 +762,29 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
     vdev_initialize_change_state(vd, tgt_state);
     vd->vdev_initialize_exit_wanted = B_TRUE;
-    while (vd->vdev_initialize_thread != NULL)
-        cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
-    ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
-    vd->vdev_initialize_exit_wanted = B_FALSE;
+    if (vd_list == NULL) {
+        vdev_initialize_stop_wait_impl(vd);
+    } else {
+        ASSERT(MUTEX_HELD(&spa_namespace_lock));
+        list_insert_tail(vd_list, vd);
+    }
 }
 
 static void
-vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
+vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
+    list_t *vd_list)
 {
     if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
         mutex_enter(&vd->vdev_initialize_lock);
-        vdev_initialize_stop(vd, tgt_state);
+        vdev_initialize_stop(vd, tgt_state, vd_list);
         mutex_exit(&vd->vdev_initialize_lock);
         return;
     }
 
     for (uint64_t i = 0; i < vd->vdev_children; i++) {
-        vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state);
+        vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
+            vd_list);
     }
 }
@@ -758,12 +795,23 @@ vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
 void
 vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
 {
-    vdev_initialize_stop_all_impl(vd, tgt_state);
+    spa_t *spa = vd->vdev_spa;
+    list_t vd_list;
+
+    ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+    list_create(&vd_list, sizeof (vdev_t),
+        offsetof(vdev_t, vdev_initialize_node));
+
+    vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
+    vdev_initialize_stop_wait(spa, &vd_list);
 
     if (vd->vdev_spa->spa_sync_on) {
         /* Make sure that our state has been synced to disk */
         txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
     }
+
+    list_destroy(&vd_list);
 }
 
 void
@@ -808,9 +856,10 @@ vdev_initialize_restart(vdev_t *vd)
 #if defined(_KERNEL)
 EXPORT_SYMBOL(vdev_initialize_restart);
 EXPORT_SYMBOL(vdev_xlate);
-EXPORT_SYMBOL(vdev_initialize_stop_all);
 EXPORT_SYMBOL(vdev_initialize);
 EXPORT_SYMBOL(vdev_initialize_stop);
+EXPORT_SYMBOL(vdev_initialize_stop_all);
+EXPORT_SYMBOL(vdev_initialize_stop_wait);
 
 /* CSTYLED */
 module_param(zfs_initialize_value, ulong, 0644);
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index d0824aa84..1896e824f 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -1899,7 +1899,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
     spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
 
     /* Stop initializing */
-    (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+    vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
 
     *txg = spa_vdev_config_enter(spa);
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 3c36502d8..0dfa01684 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3846,73 +3846,68 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
 /*
  * innvl: {
- *     vdevs: {
- *         guid 1, guid 2, ...
+ *     "initialize_command" -> POOL_INITIALIZE_{CANCEL|DO|SUSPEND} (uint64)
+ *     "initialize_vdevs": { -> guids to initialize (nvlist)
+ *         "vdev_path_1": vdev_guid_1, (uint64),
+ *         "vdev_path_2": vdev_guid_2, (uint64),
+ *         ...
  *     },
- *     func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND}
  * }
 *
 * outnvl: {
- *     [func: EINVAL (if provided command type didn't make sense)],
- *     [vdevs: {
- *         guid1: errno, (see function body for possible errnos)
+ *     "initialize_vdevs": { -> initialization errors (nvlist)
+ *         "vdev_path_1": errno, see function body for possible errnos (uint64)
+ *         "vdev_path_2": errno, ... (uint64)
  *         ...
- *     }]
+ *     }
 * }
 *
+ * EINVAL is returned for an unknown command or if any of the provided vdev
+ * guids have been specified with a type other than uint64.
 */
 static const zfs_ioc_key_t zfs_keys_pool_initialize[] = {
-	{ZPOOL_INITIALIZE_COMMAND,	DATA_TYPE_UINT64,	0},
+	{ZPOOL_INITIALIZE_COMMAND,		DATA_TYPE_UINT64,	0},
 	{ZPOOL_INITIALIZE_VDEVS,		DATA_TYPE_NVLIST,	0}
 };
 
 static int
 zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 {
-    spa_t *spa;
-    int error;
-
-    error = spa_open(poolname, &spa, FTAG);
-    if (error != 0)
-        return (error);
-
     uint64_t cmd_type;
     if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
         &cmd_type) != 0) {
-        spa_close(spa, FTAG);
         return (SET_ERROR(EINVAL));
     }
+
     if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
         cmd_type == POOL_INITIALIZE_DO ||
         cmd_type == POOL_INITIALIZE_SUSPEND)) {
-        spa_close(spa, FTAG);
         return (SET_ERROR(EINVAL));
     }
 
     nvlist_t *vdev_guids;
     if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
         &vdev_guids) != 0) {
-        spa_close(spa, FTAG);
         return (SET_ERROR(EINVAL));
     }
 
-    nvlist_t *vdev_errlist = fnvlist_alloc();
-    int total_errors = 0;
-
     for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
         pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
-        uint64_t vdev_guid = fnvpair_value_uint64(pair);
-
-        error = spa_vdev_initialize(spa, vdev_guid, cmd_type);
-        if (error != 0) {
-            char guid_as_str[MAXNAMELEN];
-
-            (void) snprintf(guid_as_str, sizeof (guid_as_str),
-                "%llu", (unsigned long long)vdev_guid);
-            fnvlist_add_int64(vdev_errlist, guid_as_str, error);
-            total_errors++;
+        uint64_t vdev_guid;
+        if (nvpair_value_uint64(pair, &vdev_guid) != 0) {
+            return (SET_ERROR(EINVAL));
         }
     }
+
+    spa_t *spa;
+    int error = spa_open(poolname, &spa, FTAG);
+    if (error != 0)
+        return (error);
+
+    nvlist_t *vdev_errlist = fnvlist_alloc();
+    int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type,
+        vdev_errlist);
+
     if (fnvlist_size(vdev_errlist) > 0) {
         fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
             vdev_errlist);