Vdev Properties Feature

Add properties, similar to pool properties, to each vdev.
This makes use of the existing per-vdev ZAP that was added as
part of device evacuation/removal.

A large number of read-only properties are exposed,
many of the members of struct vdev_t, that provide useful
statistics.

Adds support for read-only "removing" vdev property.
Adds the "allocating" property that defaults to "on" and
can be set to "off" to prevent future allocations from that
top-level vdev.

Supports user-defined vdev properties.
Includes support for properties.vdev in SYSFS.

Co-authored-by: Allan Jude <allan@klarasystems.com>
Co-authored-by: Mark Maybee <mark.maybee@delphix.com>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Closes #11711
This commit is contained in:
Allan Jude
2021-11-30 09:46:25 -05:00
committed by GitHub
parent 5f64bf7fde
commit 2a673e76a9
33 changed files with 2746 additions and 243 deletions
+227 -47
View File
@@ -167,6 +167,176 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
return (NULL);
}
static void
vdev_activate(vdev_t *vd)
{
metaslab_group_t *mg = vd->vdev_mg;
spa_t *spa = vd->vdev_spa;
uint64_t vdev_space = spa_deflate(spa) ?
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
ASSERT(!vd->vdev_islog);
ASSERT(vd->vdev_noalloc);
metaslab_group_activate(mg);
metaslab_group_activate(vd->vdev_log_mg);
ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space);
spa->spa_nonallocating_dspace -= vdev_space;
vd->vdev_noalloc = B_FALSE;
}
static int
vdev_passivate(vdev_t *vd, uint64_t *txg)
{
spa_t *spa = vd->vdev_spa;
int error;
ASSERT(!vd->vdev_noalloc);
vdev_t *rvd = spa->spa_root_vdev;
metaslab_group_t *mg = vd->vdev_mg;
metaslab_class_t *normal = spa_normal_class(spa);
if (mg->mg_class == normal) {
/*
* We must check that this is not the only allocating device in
* the pool before passivating, otherwise we will not be able
* to make progress because we can't allocate from any vdevs.
*/
boolean_t last = B_TRUE;
for (uint64_t id = 0; id < rvd->vdev_children; id++) {
vdev_t *cvd = rvd->vdev_child[id];
if (cvd == vd ||
cvd->vdev_ops == &vdev_indirect_ops)
continue;
metaslab_class_t *mc = cvd->vdev_mg->mg_class;
if (mc != normal)
continue;
if (!cvd->vdev_noalloc) {
last = B_FALSE;
break;
}
}
if (last)
return (SET_ERROR(EINVAL));
}
metaslab_group_passivate(mg);
ASSERT(!vd->vdev_islog);
metaslab_group_passivate(vd->vdev_log_mg);
/*
* Wait for the youngest allocations and frees to sync,
* and then wait for the deferral of those frees to finish.
*/
spa_vdev_config_exit(spa, NULL,
*txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
/*
* We must ensure that no "stubby" log blocks are allocated
* on the device to be removed. These blocks could be
* written at any time, including while we are in the middle
* of copying them.
*/
error = spa_reset_logs(spa);
*txg = spa_vdev_config_enter(spa);
if (error != 0) {
metaslab_group_activate(mg);
ASSERT(!vd->vdev_islog);
if (vd->vdev_log_mg != NULL)
metaslab_group_activate(vd->vdev_log_mg);
return (error);
}
spa->spa_nonallocating_dspace += spa_deflate(spa) ?
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
vd->vdev_noalloc = B_TRUE;
return (0);
}
/*
* Turn off allocations for a top-level device from the pool.
*
* Turning off allocations for a top-level device can take a significant
* amount of time. As a result we use the spa_vdev_config_[enter/exit]
* functions which allow us to grab and release the spa_config_lock while
* still holding the namespace lock. During each step the configuration
* is synced out.
*/
int
spa_vdev_noalloc(spa_t *spa, uint64_t guid)
{
vdev_t *vd;
uint64_t txg;
int error = 0;
ASSERT(!MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa_writeable(spa));
txg = spa_vdev_enter(spa);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
if (vd == NULL)
error = SET_ERROR(ENOENT);
else if (vd->vdev_mg == NULL)
error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP);
else if (!vd->vdev_noalloc)
error = vdev_passivate(vd, &txg);
if (error == 0) {
vdev_dirty_leaves(vd, VDD_DTL, txg);
vdev_config_dirty(vd);
}
error = spa_vdev_exit(spa, NULL, txg, error);
return (error);
}
int
spa_vdev_alloc(spa_t *spa, uint64_t guid)
{
vdev_t *vd;
uint64_t txg;
int error = 0;
ASSERT(!MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa_writeable(spa));
txg = spa_vdev_enter(spa);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
if (vd == NULL)
error = SET_ERROR(ENOENT);
else if (vd->vdev_mg == NULL)
error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP);
else if (!vd->vdev_removing)
vdev_activate(vd);
if (error == 0) {
vdev_dirty_leaves(vd, VDD_DTL, txg);
vdev_config_dirty(vd);
}
(void) spa_vdev_exit(spa, NULL, txg, error);
return (error);
}
static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
nvlist_t *dev_to_remove)
@@ -1193,6 +1363,8 @@ vdev_remove_complete(spa_t *spa)
ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
ASSERT3P(vd->vdev_trim_thread, ==, NULL);
ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
uint64_t vdev_space = spa_deflate(spa) ?
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
sysevent_t *ev = spa_event_create(spa, vd, NULL,
ESC_ZFS_VDEV_REMOVE_DEV);
@@ -1200,6 +1372,12 @@ vdev_remove_complete(spa_t *spa)
zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
(u_longlong_t)vd->vdev_id, (u_longlong_t)txg);
ASSERT3U(0, !=, vdev_space);
ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space);
/* the vdev is no longer part of the dspace */
spa->spa_nonallocating_dspace -= vdev_space;
/*
* Discard allocation state.
*/
@@ -1619,6 +1797,28 @@ spa_vdev_remove_suspend(spa_t *spa)
mutex_exit(&svr->svr_lock);
}
/*
* Return true if the "allocating" property has been set to "off"
*/
static boolean_t
vdev_prop_allocating_off(vdev_t *vd)
{
uint64_t objid = vd->vdev_top_zap;
uint64_t allocating = 1;
/* no vdev property object => no props */
if (objid != 0) {
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
mutex_enter(&spa->spa_props_lock);
(void) zap_lookup(mos, objid, "allocating", sizeof (uint64_t),
1, &allocating);
mutex_exit(&spa->spa_props_lock);
}
return (allocating == 0);
}
/* ARGSUSED */
static int
spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
@@ -1761,6 +1961,13 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
spa_finish_removal(spa, DSS_CANCELED, tx);
vd->vdev_removing = B_FALSE;
if (!vdev_prop_allocating_off(vd)) {
spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
vdev_activate(vd);
spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
}
vdev_config_dirty(vd);
zfs_dbgmsg("canceled device removal for vdev %llu in %llu",
@@ -1774,21 +1981,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
static int
spa_vdev_remove_cancel_impl(spa_t *spa)
{
uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
spa_vdev_remove_cancel_sync, NULL, 0,
ZFS_SPACE_CHECK_EXTRA_RESERVED);
if (error == 0) {
spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
vdev_t *vd = vdev_lookup_top(spa, vdid);
metaslab_group_activate(vd->vdev_mg);
ASSERT(!vd->vdev_islog);
metaslab_group_activate(vd->vdev_log_mg);
spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
}
return (error);
}
@@ -1984,6 +2179,11 @@ spa_vdev_remove_top_check(vdev_t *vd)
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
return (SET_ERROR(ENOTSUP));
/*
* This device is already being removed
*/
if (vd->vdev_removing)
return (SET_ERROR(EALREADY));
metaslab_class_t *mc = vd->vdev_mg->mg_class;
metaslab_class_t *normal = spa_normal_class(spa);
@@ -2002,20 +2202,12 @@ spa_vdev_remove_top_check(vdev_t *vd)
ASSERT3U(available, >=, vd->vdev_stat.vs_alloc);
if (available < vd->vdev_stat.vs_alloc)
return (SET_ERROR(ENOSPC));
} else {
} else if (!vd->vdev_noalloc) {
/* available space in the pool's normal class */
uint64_t available = dsl_dir_space_available(
spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
if (available <
vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
/*
* This is a normal device. There has to be enough free
* space to remove the device and leave double the
* "slop" space (i.e. we must leave at least 3% of the
* pool free, in addition to the normal slop space).
*/
if (available < vd->vdev_stat.vs_dspace)
return (SET_ERROR(ENOSPC));
}
}
/*
@@ -2108,6 +2300,7 @@ static int
spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
{
spa_t *spa = vd->vdev_spa;
boolean_t set_noalloc = B_FALSE;
int error;
/*
@@ -2116,8 +2309,6 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
* are errors.
*/
error = spa_vdev_remove_top_check(vd);
if (error != 0)
return (error);
/*
* Stop allocating from this vdev. Note that we must check
@@ -2127,31 +2318,22 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
* The above check for sufficient free space serves this
* purpose.
*/
metaslab_group_t *mg = vd->vdev_mg;
metaslab_group_passivate(mg);
ASSERT(!vd->vdev_islog);
metaslab_group_passivate(vd->vdev_log_mg);
if (error == 0 && !vd->vdev_noalloc) {
set_noalloc = B_TRUE;
error = vdev_passivate(vd, txg);
}
/*
* Wait for the youngest allocations and frees to sync,
* and then wait for the deferral of those frees to finish.
*/
spa_vdev_config_exit(spa, NULL,
*txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
/*
* We must ensure that no "stubby" log blocks are allocated
* on the device to be removed. These blocks could be
* written at any time, including while we are in the middle
* of copying them.
*/
error = spa_reset_logs(spa);
if (error != 0)
return (error);
/*
* We stop any initializing and TRIM that is currently in progress
* but leave the state as "active". This will allow the process to
* resume if the removal is canceled sometime later.
*/
spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_wait(vd);
@@ -2162,13 +2344,11 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
* Things might have changed while the config lock was dropped
* (e.g. space usage). Check for errors again.
*/
if (error == 0)
error = spa_vdev_remove_top_check(vd);
error = spa_vdev_remove_top_check(vd);
if (error != 0) {
metaslab_group_activate(mg);
ASSERT(!vd->vdev_islog);
metaslab_group_activate(vd->vdev_log_mg);
if (set_noalloc)
vdev_activate(vd);
spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);