3090 vdev_reopen() during reguid causes vdev to be treated as corrupt
3102 vdev_uberblock_load() and vdev_validate() may read the wrong label

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Christopher Siden <chris.siden@delphix.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Approved by: Eric Schrock <Eric.Schrock@delphix.com>

References:
  illumos/illumos-gate@dfbb943217
  illumos changeset: 13777:b1e53580146d
  https://www.illumos.org/issues/3090
  https://www.illumos.org/issues/3102

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #939
This commit is contained in:
George Wilson
2012-12-14 12:38:04 -08:00
committed by Brian Behlendorf
parent 5ac0c30a94
commit 3bc7e0fb0f
9 changed files with 188 additions and 100 deletions
+58 -18
View File
@@ -117,6 +117,8 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
static dsl_syncfunc_t spa_sync_version;
static dsl_syncfunc_t spa_sync_props;
static dsl_checkfunc_t spa_change_guid_check;
static dsl_syncfunc_t spa_change_guid_sync;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
@@ -676,6 +678,47 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
}
}
/*
 * Check callback paired with spa_change_guid_sync() for the pool reguid
 * sync task.
 *
 * arg1 is the spa_t whose GUID is being changed; arg2 points to the
 * proposed new GUID (only consulted by the debug assertion).  The root
 * vdev's state is sampled while holding SCL_STATE as reader.
 *
 * Returns 0 when the root vdev is VDEV_STATE_HEALTHY, ENXIO otherwise.
 */
/*ARGSUSED*/
static int
spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t root_state;
	ASSERTV(uint64_t *newguid = arg2);

	/* Snapshot the root vdev state under the state lock. */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	root_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (root_state == VDEV_STATE_HEALTHY) {
		/* The caller must have generated a GUID that differs. */
		ASSERT3U(spa_guid(spa), !=, *newguid);
		return (0);
	}

	return (ENXIO);
}
/*
 * Sync callback paired with spa_change_guid_check() that applies a new
 * pool GUID.
 *
 * arg1 is the spa_t being changed; arg2 points to the new GUID.  While
 * holding SCL_STATE as reader, the root vdev's guid is replaced, its
 * guid sum is adjusted by the difference between the new and old GUIDs,
 * and vdev_config_dirty() is called on the root vdev.  The old and new
 * GUIDs are then logged through spa_history_log_internal().
 */
static void
spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	uint64_t *newguid = arg2;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	/* Unsigned wrap-around makes this correct even if new < old. */
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	/*
	 * GUIDs are unsigned 64-bit values.  Use an unsigned conversion with
	 * explicit casts so GUIDs with the high bit set are not logged as
	 * negative numbers and the argument types match the format.
	 */
	spa_history_log_internal(LOG_POOL_GUID_CHANGE, spa, tx,
	    "old=%llu new=%llu", (unsigned long long)oldguid,
	    (unsigned long long)*newguid);
}
/*
* Change the GUID for the pool. This is done so that we can later
* re-import a pool built from a clone of our own vdevs. We will modify
@@ -688,29 +731,23 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
int
spa_change_guid(spa_t *spa)
{
uint64_t oldguid, newguid;
uint64_t txg;
int error;
uint64_t guid;
if (!(spa_mode_global & FWRITE))
return (EROFS);
mutex_enter(&spa_namespace_lock);
guid = spa_generate_guid(NULL);
txg = spa_vdev_enter(spa);
error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check,
spa_change_guid_sync, spa, &guid, 5);
if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY)
return (spa_vdev_exit(spa, NULL, txg, ENXIO));
if (error == 0) {
spa_config_sync(spa, B_FALSE, B_TRUE);
spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID);
}
oldguid = spa_guid(spa);
newguid = spa_generate_guid(NULL);
ASSERT3U(oldguid, !=, newguid);
mutex_exit(&spa_namespace_lock);
spa->spa_root_vdev->vdev_guid = newguid;
spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid);
vdev_config_dirty(spa->spa_root_vdev);
spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID);
return (spa_vdev_exit(spa, NULL, txg, 0));
return (error);
}
/*
@@ -6083,6 +6120,9 @@ spa_sync(spa_t *spa, uint64_t txg)
rvd->vdev_children, txg, B_TRUE);
}
if (error == 0)
spa->spa_last_synced_guid = rvd->vdev_guid;
spa_config_exit(spa, SCL_STATE, FTAG);
if (error == 0)
+15 -2
View File
@@ -1334,16 +1334,29 @@ spa_name(spa_t *spa)
uint64_t
spa_guid(spa_t *spa)
{
dsl_pool_t *dp = spa_get_dsl(spa);
uint64_t guid;
/*
* If we fail to parse the config during spa_load(), we can go through
* the error path (which posts an ereport) and end up here with no root
* vdev. We stash the original pool guid in 'spa_config_guid' to handle
* this case.
*/
if (spa->spa_root_vdev != NULL)
if (spa->spa_root_vdev == NULL)
return (spa->spa_config_guid);
guid = spa->spa_last_synced_guid != 0 ?
spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
/*
* Return the most recently synced out guid unless we're
* in syncing context.
*/
if (dp && dsl_pool_sync_context(dp))
return (spa->spa_root_vdev->vdev_guid);
else
return (spa->spa_config_guid);
return (guid);
}
uint64_t
+4 -4
View File
@@ -1348,9 +1348,9 @@ vdev_validate(vdev_t *vd, boolean_t strict)
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
uint64_t aux_guid = 0;
nvlist_t *nvl;
uint64_t txg = strict ? spa->spa_config_txg : -1ULL;
if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) ==
NULL) {
if ((label = vdev_label_read_config(vd, txg)) == NULL) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (0);
@@ -1533,7 +1533,7 @@ vdev_reopen(vdev_t *vd)
!l2arc_vdev_present(vd))
l2arc_add_vdev(spa, vd);
} else {
(void) vdev_validate(vd, B_TRUE);
(void) vdev_validate(vd, spa_last_synced_txg(spa));
}
/*
@@ -1994,7 +1994,7 @@ vdev_validate_aux(vdev_t *vd)
if (!vdev_readable(vd))
return (0);
if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) {
if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
return (-1);
+48 -27
View File
@@ -433,17 +433,22 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config)
}
/*
* Returns the configuration from the label of the given vdev. If 'label' is
* VDEV_BEST_LABEL, each label of the vdev will be read until a valid
* configuration is found; otherwise, only the specified label will be read.
* Returns the configuration from the label of the given vdev. For vdevs
* which don't have a txg value stored on their label (i.e. spares/cache)
* or have not been completely initialized (txg = 0) just return
* the configuration from the first valid label we find. Otherwise,
* find the most up-to-date label that does not exceed the specified
* 'txg' value.
*/
nvlist_t *
vdev_label_read_config(vdev_t *vd, int label)
vdev_label_read_config(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
nvlist_t *config = NULL;
vdev_phys_t *vp;
zio_t *zio;
uint64_t best_txg = 0;
int error = 0;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SPECULATIVE;
int l;
@@ -457,8 +462,7 @@ vdev_label_read_config(vdev_t *vd, int label)
retry:
for (l = 0; l < VDEV_LABELS; l++) {
if (label >= 0 && label < VDEV_LABELS && label != l)
continue;
nvlist_t *label = NULL;
zio = zio_root(spa, NULL, NULL, flags);
@@ -468,12 +472,31 @@ retry:
if (zio_wait(zio) == 0 &&
nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
&config, 0) == 0)
break;
&label, 0) == 0) {
uint64_t label_txg = 0;
if (config != NULL) {
nvlist_free(config);
config = NULL;
/*
* Auxiliary vdevs won't have txg values in their
* labels and newly added vdevs may not have been
* completely initialized so just return the
* configuration from the first valid label we
* encounter.
*/
error = nvlist_lookup_uint64(label,
ZPOOL_CONFIG_POOL_TXG, &label_txg);
if ((error || label_txg == 0) && !config) {
config = label;
break;
} else if (label_txg <= txg && label_txg > best_txg) {
best_txg = label_txg;
nvlist_free(config);
config = fnvlist_dup(label);
}
}
if (label != NULL) {
nvlist_free(label);
label = NULL;
}
}
@@ -508,7 +531,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
/*
* Read the label, if any, and perform some basic sanity checks.
*/
if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL)
if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
return (B_FALSE);
(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
@@ -872,7 +895,6 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
/*
 * Callback state shared by the uberblock-load zios.  The done callback
 * updates these fields whenever it encounters an uberblock that compares
 * better than the one currently recorded.
 */
struct ubl_cbdata {
uberblock_t *ubl_ubbest; /* Best uberblock */
vdev_t *ubl_vd; /* vdev associated with the above */
int ubl_label; /* Label associated with the above */
};
static void
@@ -891,15 +913,13 @@ vdev_uberblock_load_done(zio_t *zio)
if (ub->ub_txg <= spa->spa_load_max_txg &&
vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
/*
* Keep track of the vdev and label in which this
* uberblock was found. We will use this information
* later to obtain the config nvlist associated with
* Keep track of the vdev in which this uberblock
* was found. We will use this information later
* to obtain the config nvlist associated with
* this uberblock.
*/
*cbp->ubl_ubbest = *ub;
cbp->ubl_vd = vd;
cbp->ubl_label = vdev_label_number(vd->vdev_psize,
zio->io_offset);
}
mutex_exit(&rio->io_lock);
}
@@ -933,12 +953,11 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
* Reads the 'best' uberblock from disk along with its associated
* configuration. First, we read the uberblock array of each label of each
* vdev, keeping track of the uberblock with the highest txg in each array.
* Then, we read the configuration from the same label as the best uberblock.
* Then, we read the configuration from the same vdev as the best uberblock.
*/
void
vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
{
int i;
zio_t *zio;
spa_t *spa = rvd->vdev_spa;
struct ubl_cbdata cb;
@@ -958,13 +977,15 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
zio = zio_root(spa, NULL, &cb, flags);
vdev_uberblock_load_impl(zio, rvd, flags, &cb);
(void) zio_wait(zio);
if (cb.ubl_vd != NULL) {
for (i = cb.ubl_label % 2; i < VDEV_LABELS; i += 2) {
*config = vdev_label_read_config(cb.ubl_vd, i);
if (*config != NULL)
break;
}
}
/*
* It's possible that the best uberblock was discovered on a label
* that has a configuration which was written in a future txg.
* Search all labels on this vdev to find the configuration that
* matches the txg for our uberblock.
*/
if (cb.ubl_vd != NULL)
*config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
spa_config_exit(spa, SCL_ALL, FTAG);
}