mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
Vdev Properties Feature
Add properties, similar to pool properties, to each vdev. This makes use of the existing per-vdev ZAP that was added as part of device evacuation/removal. A large number of read-only properties are exposed, many of the members of struct vdev_t, that provide useful statistics. Adds support for read-only "removing" vdev property. Adds the "allocating" property that defaults to "on" and can be set to "off" to prevent future allocations from that top-level vdev. Supports user-defined vdev properties. Includes support for properties.vdev in SYSFS. Co-authored-by: Allan Jude <allan@klarasystems.com> Co-authored-by: Mark Maybee <mark.maybee@delphix.com> Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: Mark Maybee <mark.maybee@delphix.com> Signed-off-by: Allan Jude <allan@klarasystems.com> Closes #11711
This commit is contained in:
+227
-47
@@ -167,6 +167,176 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_activate(vdev_t *vd)
|
||||
{
|
||||
metaslab_group_t *mg = vd->vdev_mg;
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
uint64_t vdev_space = spa_deflate(spa) ?
|
||||
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
|
||||
|
||||
ASSERT(!vd->vdev_islog);
|
||||
ASSERT(vd->vdev_noalloc);
|
||||
|
||||
metaslab_group_activate(mg);
|
||||
metaslab_group_activate(vd->vdev_log_mg);
|
||||
|
||||
ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space);
|
||||
|
||||
spa->spa_nonallocating_dspace -= vdev_space;
|
||||
|
||||
vd->vdev_noalloc = B_FALSE;
|
||||
}
|
||||
|
||||
static int
|
||||
vdev_passivate(vdev_t *vd, uint64_t *txg)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
int error;
|
||||
|
||||
ASSERT(!vd->vdev_noalloc);
|
||||
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
metaslab_group_t *mg = vd->vdev_mg;
|
||||
metaslab_class_t *normal = spa_normal_class(spa);
|
||||
if (mg->mg_class == normal) {
|
||||
/*
|
||||
* We must check that this is not the only allocating device in
|
||||
* the pool before passivating, otherwise we will not be able
|
||||
* to make progress because we can't allocate from any vdevs.
|
||||
*/
|
||||
boolean_t last = B_TRUE;
|
||||
for (uint64_t id = 0; id < rvd->vdev_children; id++) {
|
||||
vdev_t *cvd = rvd->vdev_child[id];
|
||||
|
||||
if (cvd == vd ||
|
||||
cvd->vdev_ops == &vdev_indirect_ops)
|
||||
continue;
|
||||
|
||||
metaslab_class_t *mc = cvd->vdev_mg->mg_class;
|
||||
if (mc != normal)
|
||||
continue;
|
||||
|
||||
if (!cvd->vdev_noalloc) {
|
||||
last = B_FALSE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (last)
|
||||
return (SET_ERROR(EINVAL));
|
||||
}
|
||||
|
||||
metaslab_group_passivate(mg);
|
||||
ASSERT(!vd->vdev_islog);
|
||||
metaslab_group_passivate(vd->vdev_log_mg);
|
||||
|
||||
/*
|
||||
* Wait for the youngest allocations and frees to sync,
|
||||
* and then wait for the deferral of those frees to finish.
|
||||
*/
|
||||
spa_vdev_config_exit(spa, NULL,
|
||||
*txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
|
||||
|
||||
/*
|
||||
* We must ensure that no "stubby" log blocks are allocated
|
||||
* on the device to be removed. These blocks could be
|
||||
* written at any time, including while we are in the middle
|
||||
* of copying them.
|
||||
*/
|
||||
error = spa_reset_logs(spa);
|
||||
|
||||
*txg = spa_vdev_config_enter(spa);
|
||||
|
||||
if (error != 0) {
|
||||
metaslab_group_activate(mg);
|
||||
ASSERT(!vd->vdev_islog);
|
||||
if (vd->vdev_log_mg != NULL)
|
||||
metaslab_group_activate(vd->vdev_log_mg);
|
||||
return (error);
|
||||
}
|
||||
|
||||
spa->spa_nonallocating_dspace += spa_deflate(spa) ?
|
||||
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
|
||||
vd->vdev_noalloc = B_TRUE;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Turn off allocations for a top-level device from the pool.
|
||||
*
|
||||
* Turning off allocations for a top-level device can take a significant
|
||||
* amount of time. As a result we use the spa_vdev_config_[enter/exit]
|
||||
* functions which allow us to grab and release the spa_config_lock while
|
||||
* still holding the namespace lock. During each step the configuration
|
||||
* is synced out.
|
||||
*/
|
||||
int
|
||||
spa_vdev_noalloc(spa_t *spa, uint64_t guid)
|
||||
{
|
||||
vdev_t *vd;
|
||||
uint64_t txg;
|
||||
int error = 0;
|
||||
|
||||
ASSERT(!MUTEX_HELD(&spa_namespace_lock));
|
||||
ASSERT(spa_writeable(spa));
|
||||
|
||||
txg = spa_vdev_enter(spa);
|
||||
|
||||
ASSERT(MUTEX_HELD(&spa_namespace_lock));
|
||||
|
||||
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
|
||||
|
||||
if (vd == NULL)
|
||||
error = SET_ERROR(ENOENT);
|
||||
else if (vd->vdev_mg == NULL)
|
||||
error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP);
|
||||
else if (!vd->vdev_noalloc)
|
||||
error = vdev_passivate(vd, &txg);
|
||||
|
||||
if (error == 0) {
|
||||
vdev_dirty_leaves(vd, VDD_DTL, txg);
|
||||
vdev_config_dirty(vd);
|
||||
}
|
||||
|
||||
error = spa_vdev_exit(spa, NULL, txg, error);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
int
|
||||
spa_vdev_alloc(spa_t *spa, uint64_t guid)
|
||||
{
|
||||
vdev_t *vd;
|
||||
uint64_t txg;
|
||||
int error = 0;
|
||||
|
||||
ASSERT(!MUTEX_HELD(&spa_namespace_lock));
|
||||
ASSERT(spa_writeable(spa));
|
||||
|
||||
txg = spa_vdev_enter(spa);
|
||||
|
||||
ASSERT(MUTEX_HELD(&spa_namespace_lock));
|
||||
|
||||
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
|
||||
|
||||
if (vd == NULL)
|
||||
error = SET_ERROR(ENOENT);
|
||||
else if (vd->vdev_mg == NULL)
|
||||
error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP);
|
||||
else if (!vd->vdev_removing)
|
||||
vdev_activate(vd);
|
||||
|
||||
if (error == 0) {
|
||||
vdev_dirty_leaves(vd, VDD_DTL, txg);
|
||||
vdev_config_dirty(vd);
|
||||
}
|
||||
|
||||
(void) spa_vdev_exit(spa, NULL, txg, error);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
static void
|
||||
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
|
||||
nvlist_t *dev_to_remove)
|
||||
@@ -1193,6 +1363,8 @@ vdev_remove_complete(spa_t *spa)
|
||||
ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
|
||||
ASSERT3P(vd->vdev_trim_thread, ==, NULL);
|
||||
ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
|
||||
uint64_t vdev_space = spa_deflate(spa) ?
|
||||
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
|
||||
|
||||
sysevent_t *ev = spa_event_create(spa, vd, NULL,
|
||||
ESC_ZFS_VDEV_REMOVE_DEV);
|
||||
@@ -1200,6 +1372,12 @@ vdev_remove_complete(spa_t *spa)
|
||||
zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
|
||||
(u_longlong_t)vd->vdev_id, (u_longlong_t)txg);
|
||||
|
||||
ASSERT3U(0, !=, vdev_space);
|
||||
ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space);
|
||||
|
||||
/* the vdev is no longer part of the dspace */
|
||||
spa->spa_nonallocating_dspace -= vdev_space;
|
||||
|
||||
/*
|
||||
* Discard allocation state.
|
||||
*/
|
||||
@@ -1619,6 +1797,28 @@ spa_vdev_remove_suspend(spa_t *spa)
|
||||
mutex_exit(&svr->svr_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true if the "allocating" property has been set to "off"
|
||||
*/
|
||||
static boolean_t
|
||||
vdev_prop_allocating_off(vdev_t *vd)
|
||||
{
|
||||
uint64_t objid = vd->vdev_top_zap;
|
||||
uint64_t allocating = 1;
|
||||
|
||||
/* no vdev property object => no props */
|
||||
if (objid != 0) {
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
objset_t *mos = spa->spa_meta_objset;
|
||||
|
||||
mutex_enter(&spa->spa_props_lock);
|
||||
(void) zap_lookup(mos, objid, "allocating", sizeof (uint64_t),
|
||||
1, &allocating);
|
||||
mutex_exit(&spa->spa_props_lock);
|
||||
}
|
||||
return (allocating == 0);
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
static int
|
||||
spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
|
||||
@@ -1761,6 +1961,13 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
|
||||
spa_finish_removal(spa, DSS_CANCELED, tx);
|
||||
|
||||
vd->vdev_removing = B_FALSE;
|
||||
|
||||
if (!vdev_prop_allocating_off(vd)) {
|
||||
spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
|
||||
vdev_activate(vd);
|
||||
spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
|
||||
}
|
||||
|
||||
vdev_config_dirty(vd);
|
||||
|
||||
zfs_dbgmsg("canceled device removal for vdev %llu in %llu",
|
||||
@@ -1774,21 +1981,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
|
||||
static int
|
||||
spa_vdev_remove_cancel_impl(spa_t *spa)
|
||||
{
|
||||
uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
|
||||
|
||||
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
|
||||
spa_vdev_remove_cancel_sync, NULL, 0,
|
||||
ZFS_SPACE_CHECK_EXTRA_RESERVED);
|
||||
|
||||
if (error == 0) {
|
||||
spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
|
||||
vdev_t *vd = vdev_lookup_top(spa, vdid);
|
||||
metaslab_group_activate(vd->vdev_mg);
|
||||
ASSERT(!vd->vdev_islog);
|
||||
metaslab_group_activate(vd->vdev_log_mg);
|
||||
spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
|
||||
}
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
@@ -1984,6 +2179,11 @@ spa_vdev_remove_top_check(vdev_t *vd)
|
||||
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
|
||||
/*
|
||||
* This device is already being removed
|
||||
*/
|
||||
if (vd->vdev_removing)
|
||||
return (SET_ERROR(EALREADY));
|
||||
|
||||
metaslab_class_t *mc = vd->vdev_mg->mg_class;
|
||||
metaslab_class_t *normal = spa_normal_class(spa);
|
||||
@@ -2002,20 +2202,12 @@ spa_vdev_remove_top_check(vdev_t *vd)
|
||||
ASSERT3U(available, >=, vd->vdev_stat.vs_alloc);
|
||||
if (available < vd->vdev_stat.vs_alloc)
|
||||
return (SET_ERROR(ENOSPC));
|
||||
} else {
|
||||
} else if (!vd->vdev_noalloc) {
|
||||
/* available space in the pool's normal class */
|
||||
uint64_t available = dsl_dir_space_available(
|
||||
spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
|
||||
if (available <
|
||||
vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
|
||||
/*
|
||||
* This is a normal device. There has to be enough free
|
||||
* space to remove the device and leave double the
|
||||
* "slop" space (i.e. we must leave at least 3% of the
|
||||
* pool free, in addition to the normal slop space).
|
||||
*/
|
||||
if (available < vd->vdev_stat.vs_dspace)
|
||||
return (SET_ERROR(ENOSPC));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2108,6 +2300,7 @@ static int
|
||||
spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
boolean_t set_noalloc = B_FALSE;
|
||||
int error;
|
||||
|
||||
/*
|
||||
@@ -2116,8 +2309,6 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
|
||||
* are errors.
|
||||
*/
|
||||
error = spa_vdev_remove_top_check(vd);
|
||||
if (error != 0)
|
||||
return (error);
|
||||
|
||||
/*
|
||||
* Stop allocating from this vdev. Note that we must check
|
||||
@@ -2127,31 +2318,22 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
|
||||
* The above check for sufficient free space serves this
|
||||
* purpose.
|
||||
*/
|
||||
metaslab_group_t *mg = vd->vdev_mg;
|
||||
metaslab_group_passivate(mg);
|
||||
ASSERT(!vd->vdev_islog);
|
||||
metaslab_group_passivate(vd->vdev_log_mg);
|
||||
if (error == 0 && !vd->vdev_noalloc) {
|
||||
set_noalloc = B_TRUE;
|
||||
error = vdev_passivate(vd, txg);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for the youngest allocations and frees to sync,
|
||||
* and then wait for the deferral of those frees to finish.
|
||||
*/
|
||||
spa_vdev_config_exit(spa, NULL,
|
||||
*txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
|
||||
|
||||
/*
|
||||
* We must ensure that no "stubby" log blocks are allocated
|
||||
* on the device to be removed. These blocks could be
|
||||
* written at any time, including while we are in the middle
|
||||
* of copying them.
|
||||
*/
|
||||
error = spa_reset_logs(spa);
|
||||
if (error != 0)
|
||||
return (error);
|
||||
|
||||
/*
|
||||
* We stop any initializing and TRIM that is currently in progress
|
||||
* but leave the state as "active". This will allow the process to
|
||||
* resume if the removal is canceled sometime later.
|
||||
*/
|
||||
|
||||
spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
|
||||
|
||||
vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
|
||||
vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE);
|
||||
vdev_autotrim_stop_wait(vd);
|
||||
@@ -2162,13 +2344,11 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
|
||||
* Things might have changed while the config lock was dropped
|
||||
* (e.g. space usage). Check for errors again.
|
||||
*/
|
||||
if (error == 0)
|
||||
error = spa_vdev_remove_top_check(vd);
|
||||
error = spa_vdev_remove_top_check(vd);
|
||||
|
||||
if (error != 0) {
|
||||
metaslab_group_activate(mg);
|
||||
ASSERT(!vd->vdev_islog);
|
||||
metaslab_group_activate(vd->vdev_log_mg);
|
||||
if (set_noalloc)
|
||||
vdev_activate(vd);
|
||||
spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
|
||||
spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
|
||||
spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
|
||||
|
||||
Reference in New Issue
Block a user