mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-25 11:47:43 +03:00
OpenZFS 6736 - ZFS per-vdev ZAPs
6736 ZFS per-vdev ZAPs Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: John Kennedy <john.kennedy@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Don Brady <don.brady@intel.com> Reviewed by: Dan McDonald <danmcd@omniti.com> References: https://www.illumos.org/issues/6736 https://github.com/openzfs/openzfs/commit/215198a Ported-by: Don Brady <don.brady@intel.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4515
This commit is contained in:
committed by
Brian Behlendorf
parent
4cd77889b6
commit
e0ab3ab553
+227
-7
@@ -1674,6 +1674,21 @@ spa_check_removed(vdev_t *vd)
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
|
||||
{
|
||||
uint64_t i;
|
||||
|
||||
ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);
|
||||
|
||||
vd->vdev_top_zap = mvd->vdev_top_zap;
|
||||
vd->vdev_leaf_zap = mvd->vdev_leaf_zap;
|
||||
|
||||
for (i = 0; i < vd->vdev_children; i++) {
|
||||
spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Validate the current config against the MOS config
|
||||
*/
|
||||
@@ -1778,16 +1793,25 @@ spa_config_valid(spa_t *spa, nvlist_t *config)
|
||||
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
|
||||
|
||||
vdev_reopen(rvd);
|
||||
} else if (mtvd->vdev_islog) {
|
||||
} else {
|
||||
if (mtvd->vdev_islog) {
|
||||
/*
|
||||
* Load the slog device's state from the MOS
|
||||
* config since it's possible that the label
|
||||
* does not contain the most up-to-date
|
||||
* information.
|
||||
*/
|
||||
vdev_load_log_state(tvd, mtvd);
|
||||
vdev_reopen(tvd);
|
||||
}
|
||||
|
||||
/*
|
||||
* Load the slog device's state from the MOS config
|
||||
* since it's possible that the label does not
|
||||
* contain the most up-to-date information.
|
||||
* Per-vdev ZAP info is stored exclusively in the MOS.
|
||||
*/
|
||||
vdev_load_log_state(tvd, mtvd);
|
||||
vdev_reopen(tvd);
|
||||
spa_config_valid_zaps(tvd, mtvd);
|
||||
}
|
||||
}
|
||||
|
||||
vdev_free(mrvd);
|
||||
spa_config_exit(spa, SCL_ALL, FTAG);
|
||||
|
||||
@@ -2214,6 +2238,36 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Count the number of per-vdev ZAPs associated with all of the vdevs in the
|
||||
* vdev tree rooted in the given vd, and ensure that each ZAP is present in the
|
||||
* spa's per-vdev ZAP list.
|
||||
*/
|
||||
static uint64_t
|
||||
vdev_count_verify_zaps(vdev_t *vd)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
uint64_t total = 0;
|
||||
uint64_t i;
|
||||
|
||||
if (vd->vdev_top_zap != 0) {
|
||||
total++;
|
||||
ASSERT0(zap_lookup_int(spa->spa_meta_objset,
|
||||
spa->spa_all_vdev_zaps, vd->vdev_top_zap));
|
||||
}
|
||||
if (vd->vdev_leaf_zap != 0) {
|
||||
total++;
|
||||
ASSERT0(zap_lookup_int(spa->spa_meta_objset,
|
||||
spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
|
||||
}
|
||||
|
||||
for (i = 0; i < vd->vdev_children; i++) {
|
||||
total += vdev_count_verify_zaps(vd->vdev_child[i]);
|
||||
}
|
||||
|
||||
return (total);
|
||||
}
|
||||
|
||||
/*
|
||||
* Load an existing storage pool, using the pool's builtin spa_config as a
|
||||
* source of configuration information.
|
||||
@@ -2234,6 +2288,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
|
||||
int parse, i;
|
||||
uint64_t obj;
|
||||
boolean_t missing_feat_write = B_FALSE;
|
||||
nvlist_t *mos_config;
|
||||
|
||||
/*
|
||||
* If this is an untrusted config, access the pool in read-only mode.
|
||||
@@ -2632,6 +2687,38 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
|
||||
if (error != 0 && error != ENOENT)
|
||||
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
|
||||
|
||||
/*
|
||||
* Load the per-vdev ZAP map. If we have an older pool, this will not
|
||||
* be present; in this case, defer its creation to a later time to
|
||||
* avoid dirtying the MOS this early / out of sync context. See
|
||||
* spa_sync_config_object.
|
||||
*/
|
||||
|
||||
/* The sentinel is only available in the MOS config. */
|
||||
if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
|
||||
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
|
||||
|
||||
error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
|
||||
&spa->spa_all_vdev_zaps);
|
||||
|
||||
if (error != ENOENT && error != 0) {
|
||||
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
|
||||
} else if (error == 0 && !nvlist_exists(mos_config,
|
||||
ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
|
||||
/*
|
||||
* An older version of ZFS overwrote the sentinel value, so
|
||||
* we have orphaned per-vdev ZAPs in the MOS. Defer their
|
||||
* destruction to later; see spa_sync_config_object.
|
||||
*/
|
||||
spa->spa_avz_action = AVZ_ACTION_DESTROY;
|
||||
/*
|
||||
* We're assuming that no vdevs have had their ZAPs created
|
||||
* before this. Better be sure of it.
|
||||
*/
|
||||
ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
|
||||
}
|
||||
nvlist_free(mos_config);
|
||||
|
||||
/*
|
||||
* If we're assembling the pool from the split-off vdevs of
|
||||
* an existing pool, we don't want to attach the spares & cache
|
||||
@@ -5170,6 +5257,16 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
|
||||
vml[c]->vdev_top->vdev_asize) == 0);
|
||||
VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
|
||||
vml[c]->vdev_top->vdev_ashift) == 0);
|
||||
|
||||
/* transfer per-vdev ZAPs */
|
||||
ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
|
||||
VERIFY0(nvlist_add_uint64(child[c],
|
||||
ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
|
||||
|
||||
ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
|
||||
VERIFY0(nvlist_add_uint64(child[c],
|
||||
ZPOOL_CONFIG_VDEV_TOP_ZAP,
|
||||
vml[c]->vdev_parent->vdev_top_zap));
|
||||
}
|
||||
|
||||
if (error != 0) {
|
||||
@@ -5211,11 +5308,13 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
|
||||
spa->spa_config_txg) == 0);
|
||||
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
|
||||
spa_generate_guid(NULL)) == 0);
|
||||
VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
|
||||
(void) nvlist_lookup_string(props,
|
||||
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
|
||||
|
||||
/* add the new pool to the namespace */
|
||||
newspa = spa_add(newname, config, altroot);
|
||||
newspa->spa_avz_action = AVZ_ACTION_REBUILD;
|
||||
newspa->spa_config_txg = spa->spa_config_txg;
|
||||
spa_set_log_state(newspa, SPA_LOG_CLEAR);
|
||||
|
||||
@@ -5273,9 +5372,11 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
|
||||
if (error == 0)
|
||||
spa_history_log_internal(spa, "detach", tx,
|
||||
"vdev=%s", vml[c]->vdev_path);
|
||||
|
||||
vdev_free(vml[c]);
|
||||
}
|
||||
}
|
||||
spa->spa_avz_action = AVZ_ACTION_REBUILD;
|
||||
vdev_config_dirty(spa->spa_root_vdev);
|
||||
spa->spa_config_splitting = NULL;
|
||||
nvlist_free(nvl);
|
||||
@@ -6109,16 +6210,120 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
|
||||
sav->sav_sync = B_FALSE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
|
||||
* The all-vdev ZAP must be empty.
|
||||
*/
|
||||
static void
|
||||
spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
uint64_t i;
|
||||
|
||||
if (vd->vdev_top_zap != 0) {
|
||||
VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
|
||||
vd->vdev_top_zap, tx));
|
||||
}
|
||||
if (vd->vdev_leaf_zap != 0) {
|
||||
VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
|
||||
vd->vdev_leaf_zap, tx));
|
||||
}
|
||||
for (i = 0; i < vd->vdev_children; i++) {
|
||||
spa_avz_build(vd->vdev_child[i], avz, tx);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
|
||||
{
|
||||
nvlist_t *config;
|
||||
|
||||
if (list_is_empty(&spa->spa_config_dirty_list))
|
||||
/*
|
||||
* If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
|
||||
* its config may not be dirty but we still need to build per-vdev ZAPs.
|
||||
* Similarly, if the pool is being assembled (e.g. after a split), we
|
||||
* need to rebuild the AVZ although the config may not be dirty.
|
||||
*/
|
||||
if (list_is_empty(&spa->spa_config_dirty_list) &&
|
||||
spa->spa_avz_action == AVZ_ACTION_NONE)
|
||||
return;
|
||||
|
||||
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
|
||||
|
||||
ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
|
||||
spa->spa_all_vdev_zaps != 0);
|
||||
|
||||
if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
|
||||
zap_cursor_t zc;
|
||||
zap_attribute_t za;
|
||||
|
||||
/* Make and build the new AVZ */
|
||||
uint64_t new_avz = zap_create(spa->spa_meta_objset,
|
||||
DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
|
||||
spa_avz_build(spa->spa_root_vdev, new_avz, tx);
|
||||
|
||||
/* Diff old AVZ with new one */
|
||||
for (zap_cursor_init(&zc, spa->spa_meta_objset,
|
||||
spa->spa_all_vdev_zaps);
|
||||
zap_cursor_retrieve(&zc, &za) == 0;
|
||||
zap_cursor_advance(&zc)) {
|
||||
uint64_t vdzap = za.za_first_integer;
|
||||
if (zap_lookup_int(spa->spa_meta_objset, new_avz,
|
||||
vdzap) == ENOENT) {
|
||||
/*
|
||||
* ZAP is listed in old AVZ but not in new one;
|
||||
* destroy it
|
||||
*/
|
||||
VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
|
||||
tx));
|
||||
}
|
||||
}
|
||||
|
||||
zap_cursor_fini(&zc);
|
||||
|
||||
/* Destroy the old AVZ */
|
||||
VERIFY0(zap_destroy(spa->spa_meta_objset,
|
||||
spa->spa_all_vdev_zaps, tx));
|
||||
|
||||
/* Replace the old AVZ in the dir obj with the new one */
|
||||
VERIFY0(zap_update(spa->spa_meta_objset,
|
||||
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
|
||||
sizeof (new_avz), 1, &new_avz, tx));
|
||||
|
||||
spa->spa_all_vdev_zaps = new_avz;
|
||||
} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
|
||||
zap_cursor_t zc;
|
||||
zap_attribute_t za;
|
||||
|
||||
/* Walk through the AVZ and destroy all listed ZAPs */
|
||||
for (zap_cursor_init(&zc, spa->spa_meta_objset,
|
||||
spa->spa_all_vdev_zaps);
|
||||
zap_cursor_retrieve(&zc, &za) == 0;
|
||||
zap_cursor_advance(&zc)) {
|
||||
uint64_t zap = za.za_first_integer;
|
||||
VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
|
||||
}
|
||||
|
||||
zap_cursor_fini(&zc);
|
||||
|
||||
/* Destroy and unlink the AVZ itself */
|
||||
VERIFY0(zap_destroy(spa->spa_meta_objset,
|
||||
spa->spa_all_vdev_zaps, tx));
|
||||
VERIFY0(zap_remove(spa->spa_meta_objset,
|
||||
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
|
||||
spa->spa_all_vdev_zaps = 0;
|
||||
}
|
||||
|
||||
if (spa->spa_all_vdev_zaps == 0) {
|
||||
spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
|
||||
DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_VDEV_ZAP_MAP, tx);
|
||||
}
|
||||
spa->spa_avz_action = AVZ_ACTION_NONE;
|
||||
|
||||
/* Create ZAPs for vdevs that don't have them. */
|
||||
vdev_construct_zaps(spa->spa_root_vdev, tx);
|
||||
|
||||
config = spa_config_generate(spa, spa->spa_root_vdev,
|
||||
dmu_tx_get_txg(tx), B_FALSE);
|
||||
|
||||
@@ -6509,6 +6714,21 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
|
||||
} while (dmu_objset_is_dirty(mos, txg));
|
||||
|
||||
if (!list_is_empty(&spa->spa_config_dirty_list)) {
|
||||
/*
|
||||
* Make sure that the number of ZAPs for all the vdevs matches
|
||||
* the number of ZAPs in the per-vdev ZAP list. This only gets
|
||||
* called if the config is dirty; otherwise there may be
|
||||
* outstanding AVZ operations that weren't completed in
|
||||
* spa_sync_config_object.
|
||||
*/
|
||||
uint64_t all_vdev_zap_entry_count;
|
||||
ASSERT0(zap_count(spa->spa_meta_objset,
|
||||
spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
|
||||
ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
|
||||
all_vdev_zap_entry_count);
|
||||
}
|
||||
|
||||
/*
|
||||
* Rewrite the vdev configuration (which includes the uberblock)
|
||||
* to commit the transaction group.
|
||||
|
||||
Reference in New Issue
Block a user