mmp: claim sequence id before final import

As part of SPA_LOAD_IMPORT add an additional activity check to
detect simultaneous imports from different hosts.  This check is
only required when the timing is such that there's no activity
for the read-only tryimport check to detect.  This extra
safety check operates as follows:

1. Repeats the following MMP check 10 times:
  a. Write out an MMP uberblock with the best txg and a random
     sequence id to all primary pool vdevs.
  b. Verify a minimum number of good writes such that even if
     the pool appears degraded on the remote host it will see
     at least one of the updated MMP uberblocks.
  c. Wait for the MMP interval this leaves a window for other
     racing hosts to make similar modifications which can be
     detected.
  d. Call vdev_uberblock_load() to determine the best uberblock
     to use, this should be the MMP uberblock just written.
  e. Verify the txg and random sequence number match the MMP
     uberblock written in 1a.

2. Restore the original MMP uberblocks.  This allows the check
   to be performed again if the pool fails to import for an
   unrelated reason.

This change also includes some refactoring and minor improvements.

- Never try loading earlier txgs during import when the import
  fails with EREMOTEIO or EINTR.  These errors don't indicate
  the txg is damaged but instead that it's either in use on a
  remote host or the import was interactively cancelled.  Nor
  is a rewind performed for EBADF, which can result from a
  stale trusted config when doing a verbatim import.

- Refactor the code for consistent logging of the multihost
  activity check using spa_load_note() and console messages
  indicating when the activity check was triggered and the result.

- Added MMP_*_MASK and MMP_SEQ_CLEAR() macros to allow easier
  modification of the sequence number in an uberblock.

- Added ZFS_LOAD_INFO_DEBUG environment variable which can be
  set to dump to stdout the spa_load_info nvlist returned
  during import.  This is used by the updated mmp test cases
  to determine if an activity check was run and its result.

- Standardize the mmp messages similarly to make it easier to
  find all the relevant mmp lines in the debug log.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Reviewed-by: Akash B <akash-b@hpe.com>
This commit is contained in:
Brian Behlendorf 2026-01-22 16:20:02 -08:00 committed by Tony Hutter
parent 2f048ced4d
commit 20176224ee
10 changed files with 731 additions and 201 deletions

View File

@ -3879,6 +3879,9 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
hostid, ctime(&timestamp));
}
if (getenv("ZFS_LOAD_INFO_DEBUG"))
dump_nvlist(nvinfo, 4);
return (1);
}

View File

@ -873,6 +873,10 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_RESULT "mmp_result" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_TRYIMPORT_NS "mmp_tryimport_ns" /* not stored */
#define ZPOOL_CONFIG_MMP_IMPORT_NS "mmp_import_ns" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_CLAIM_NS "mmp_claim_ns" /* not stored on disk */
#define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */
#define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */
#define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats"

View File

@ -33,6 +33,7 @@ extern "C" {
#define MMP_DEFAULT_IMPORT_INTERVALS 20
#define MMP_DEFAULT_FAIL_INTERVALS 10
#define MMP_MIN_FAIL_INTERVALS 2 /* min if != 0 */
#define MMP_IMPORT_VERIFY_ITERS 10
#define MMP_IMPORT_SAFETY_FACTOR 200 /* pct */
#define MMP_INTERVAL_OK(interval) MAX(interval, MMP_MIN_INTERVAL)
#define MMP_FAIL_INTVS_OK(fails) (fails == 0 ? 0 : MAX(fails, \
@ -53,6 +54,9 @@ typedef struct mmp_thread {
vdev_t *mmp_last_leaf; /* last mmp write sent here */
uint64_t mmp_leaf_last_gen; /* last mmp write sent here */
uint32_t mmp_seq; /* intra-second update counter */
uint64_t mmp_tryimport_ns; /* tryimport activity check time */
uint64_t mmp_import_ns; /* import activity check time */
uint64_t mmp_claim_ns; /* claim activity check time */
} mmp_thread_t;
@ -62,6 +66,7 @@ extern void mmp_thread_start(struct spa *spa);
extern void mmp_thread_stop(struct spa *spa);
extern void mmp_update_uberblock(struct spa *spa, struct uberblock *ub);
extern void mmp_signal_all_threads(void);
extern int mmp_claim_uberblock(spa_t *spa, vdev_t *vd, uberblock_t *ub);
/* Global tuning */
extern int param_set_multihost_interval(ZFS_MODULE_PARAM_ARGS);

View File

@ -298,6 +298,7 @@ struct spa {
void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
uberblock_t spa_ubsync; /* last synced uberblock */
uberblock_t spa_uberblock; /* current uberblock */
boolean_t spa_activity_check; /* activity check required */
boolean_t spa_extreme_rewind; /* rewind past deferred frees */
kmutex_t spa_scrub_lock; /* resilver/scrub lock */
uint64_t spa_scrub_inflight; /* in-flight scrub bytes */

View File

@ -51,6 +51,12 @@ extern "C" {
#define MMP_SEQ_VALID_BIT 0x02
#define MMP_FAIL_INT_VALID_BIT 0x04
#define MMP_INTERVAL_MASK 0x00000000FFFFFF00
#define MMP_SEQ_MASK 0x0000FFFF00000000
#define MMP_FAIL_INT_MASK 0xFFFF000000000000
#define MMP_SEQ_MAX UINT16_MAX
#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \
(ubp)->ub_mmp_magic == MMP_MAGIC)
#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
@ -60,21 +66,25 @@ extern "C" {
#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_FAIL_INT_VALID_BIT))
#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \
#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & MMP_INTERVAL_MASK) \
>> 8)
#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \
#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & MMP_SEQ_MASK) \
>> 32)
#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \
#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & MMP_FAIL_INT_MASK) \
>> 48)
#define MMP_INTERVAL_SET(write) \
(((uint64_t)(write & 0xFFFFFF) << 8) | MMP_INTERVAL_VALID_BIT)
(((uint64_t)((write) & 0xFFFFFF) << 8) | MMP_INTERVAL_VALID_BIT)
#define MMP_SEQ_SET(seq) \
(((uint64_t)(seq & 0xFFFF) << 32) | MMP_SEQ_VALID_BIT)
(((uint64_t)((seq) & 0xFFFF) << 32) | MMP_SEQ_VALID_BIT)
#define MMP_FAIL_INT_SET(fail) \
(((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT)
(((uint64_t)((fail) & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT)
#define MMP_SEQ_CLEAR(ubp) \
((ubp)->ub_mmp_config &= ~(MMP_SEQ_MASK | MMP_SEQ_VALID_BIT))
/*
* RAIDZ expansion reflow information.

View File

@ -228,6 +228,8 @@ extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t
extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *);
extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *);
extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int);
extern int vdev_uberblock_compare(const struct uberblock *,
const struct uberblock *);
extern int vdev_check_boot_reserve(spa_t *, vdev_t *);
typedef enum {

View File

@ -2209,6 +2209,11 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
zpool_get_load_policy(config, &policy);
if (getenv("ZFS_LOAD_INFO_DEBUG") && nv != NULL &&
nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0) {
dump_nvlist(nvinfo, 4);
}
if (error) {
char desc[1024];
char aux[256];

View File

@ -145,6 +145,15 @@
* Additionally, the duration is then extended by a random 25% to attempt to
* detect simultaneous imports. For example, if both partner hosts are rebooted
* at the same time and automatically attempt to import the pool.
*
* Once the read-only activity check completes and the pool is determined to
* be inactive a second check is performed to claim the pool. During this
* phase the host writes out MMP uberblocks to each of the devices which are
* identical to the best uberblock but with a randomly selected sequence id.
* The "best" uberblock is then read back and it must contain this new sequence
* number. This check is performed multiple times to ensure that there is
* no window where a concurrently importing system can incorrectly determine
* the pool to be inactive.
*/
/*
@ -237,8 +246,8 @@ mmp_thread_start(spa_t *spa)
if (!mmp->mmp_thread) {
mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
spa, 0, &p0, TS_RUN, defclsyspri);
zfs_dbgmsg("MMP thread started pool '%s' "
"gethrtime %llu", spa_name(spa), gethrtime());
zfs_dbgmsg("mmp: mmp thread started spa=%s "
"gethrtime=%llu", spa_name(spa), gethrtime());
}
mutex_exit(&mmp->mmp_thread_lock);
}
@ -257,7 +266,7 @@ mmp_thread_stop(spa_t *spa)
cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
}
mutex_exit(&mmp->mmp_thread_lock);
zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu",
zfs_dbgmsg("mmp: mmp thread stopped spa=%s gethrtime=%llu",
spa_name(spa), gethrtime());
ASSERT0P(mmp->mmp_thread);
@ -449,9 +458,9 @@ mmp_write_uberblock(spa_t *spa)
spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER);
lock_acquire_time = gethrtime() - lock_acquire_time;
if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
"gethrtime %llu", spa_name(spa), lock_acquire_time,
gethrtime());
zfs_dbgmsg("mmp: long SCL_STATE acquisition, spa=%s "
"acquire_time=%llu gethrtime=%llu", spa_name(spa),
lock_acquire_time, gethrtime());
mutex_enter(&mmp->mmp_io_lock);
@ -474,8 +483,8 @@ mmp_write_uberblock(spa_t *spa)
spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
gethrestime_sec(), mmp->mmp_delay, NULL, 0,
mmp->mmp_kstat_id++, error);
zfs_dbgmsg("MMP error choosing leaf pool '%s' "
"gethrtime %llu fail_mask %#x", spa_name(spa),
zfs_dbgmsg("mmp: error choosing leaf, spa=%s "
"gethrtime=%llu fail_mask=%#x", spa_name(spa),
gethrtime(), error);
}
mutex_exit(&mmp->mmp_io_lock);
@ -485,11 +494,11 @@ mmp_write_uberblock(spa_t *spa)
vd = spa->spa_mmp.mmp_last_leaf;
if (mmp->mmp_skip_error != 0) {
mmp->mmp_skip_error = 0;
zfs_dbgmsg("MMP write after skipping due to unavailable "
"leaves, pool '%s' gethrtime %llu leaf %llu",
zfs_dbgmsg("mmp: write after skipping due to unavailable "
"leaves, spa=%s gethrtime=%llu vdev=%llu error=%d",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)vd->vdev_guid);
(u_longlong_t)vd->vdev_guid, mmp->mmp_skip_error);
mmp->mmp_skip_error = 0;
}
if (mmp->mmp_zio_root == NULL)
@ -540,6 +549,108 @@ mmp_write_uberblock(spa_t *spa)
zio_nowait(zio);
}
/*
 * Completion callback for MMP claim uberblock writes.  Increments the
 * shared good-write counter for each write that succeeded on a top-level
 * vdev with an allocated metaslab array.
 */
static void
mmp_claim_uberblock_sync_done(zio_t *zio)
{
	uint64_t *good_writes = zio->io_private;

	/*
	 * NOTE(review): the vdev_ms_array != 0 test appears intended to
	 * exclude vdevs without allocated metaslabs from the good count --
	 * confirm against vdev initialization behavior.
	 */
	if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
		atomic_inc_64(good_writes);
}
/*
 * Write the uberblock to the first label of all leaves of the specified vdev.
 * Two writes required for each mirror, one for a singleton, and parity+1 for
 * raidz or draid vdevs.
 *
 * good_writes is incremented by the write completion callback; req_writes
 * accumulates the minimum number of good writes required per top-level vdev.
 */
static void
mmp_claim_uberblock_sync(zio_t *zio, uint64_t *good_writes,
    uint64_t *req_writes, uberblock_t *ub, vdev_t *vd, int flags)
{
	/* Recurse over the children, accumulating the required write count. */
	for (uint64_t c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		/* Log, spare, and cache devices don't hold MMP uberblocks. */
		if (cvd->vdev_islog || cvd->vdev_isspare || cvd->vdev_isl2cache)
			continue;

		if (cvd->vdev_top == cvd) {
			uint64_t nparity = vdev_get_nparity(cvd);

			/*
			 * Required writes per top-level vdev: parity+1 for
			 * raidz/draid, 2 for mirrors (capped), and 1 for a
			 * singleton (no children).
			 */
			if (nparity) {
				*req_writes += nparity + 1;
			} else {
				*req_writes +=
				    MIN(MAX(cvd->vdev_children, 1), 2);
			}
		}

		mmp_claim_uberblock_sync(zio, good_writes, req_writes,
		    ub, cvd, flags);
	}

	/* Only writeable leaf vdevs are written. */
	if (!vd->vdev_ops->vdev_op_leaf)
		return;

	if (!vdev_writeable(vd))
		return;

	/* Skip draid distributed spares. */
	if (vd->vdev_ops == &vdev_draid_spare_ops)
		return;

	/* Copy the uberblock into a zero-padded, I/O sized buffer. */
	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
	abd_zero_off(ub_abd, sizeof (uberblock_t),
	    VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t));

	/*
	 * Write only the last uberblock slot of label 0.  DONT_PROPAGATE
	 * keeps a single failed leaf write from failing the root zio.
	 */
	vdev_label_write(zio, vd, 0, ub_abd,
	    VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
	    MMP_BLOCKS_PER_LABEL), VDEV_UBERBLOCK_SIZE(vd),
	    mmp_claim_uberblock_sync_done, good_writes,
	    flags | ZIO_FLAG_DONT_PROPAGATE);

	abd_free(ub_abd);
}
/*
 * Claim the pool by writing the provided MMP uberblock to all primary
 * pool vdevs under vd and verifying a minimum number of good writes.
 * The uberblock must already carry valid MMP state and a sequence id.
 *
 * Returns 0 on success, or EIO when too few writes succeeded to
 * guarantee visibility from a remote host.
 */
int
mmp_claim_uberblock(spa_t *spa, vdev_t *vd, uberblock_t *ub)
{
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
	uint64_t good_writes = 0;
	uint64_t req_writes = 0;
	zio_t *zio;

	ASSERT(MMP_VALID(ub));
	ASSERT(MMP_SEQ_VALID(ub));

	spa_config_enter(spa, SCL_ALL, mmp_tag, RW_WRITER);

	/* Sync the uberblock to all writeable leaves */
	zio = zio_root(spa, NULL, NULL, flags);
	mmp_claim_uberblock_sync(zio, &good_writes, &req_writes, ub, vd, flags);
	(void) zio_wait(zio);

	/* Flush the new uberblocks so they're immediately visible */
	zio = zio_root(spa, NULL, NULL, flags);
	zio_flush(zio, vd);
	(void) zio_wait(zio);

	spa_config_exit(spa, SCL_ALL, mmp_tag);

	zfs_dbgmsg("mmp: claiming uberblock, spa=%s txg=%llu seq=%llu "
	    "req_writes=%llu good_writes=%llu", spa_load_name(spa),
	    (u_longlong_t)ub->ub_txg, (u_longlong_t)MMP_SEQ(ub),
	    (u_longlong_t)req_writes, (u_longlong_t)good_writes);

	/*
	 * To guarantee visibility from a remote host we require a minimum
	 * number of good writes.  For raidz/draid vdevs parity+1 writes, for
	 * mirrors 2 writes, and for singletons 1 write.
	 */
	if (req_writes == 0 || good_writes < req_writes)
		return (SET_ERROR(EIO));

	return (0);
}
static __attribute__((noreturn)) void
mmp_thread(void *arg)
{
@ -616,11 +727,11 @@ mmp_thread(void *arg)
next_time = gethrtime() + mmp_interval / leaves;
if (mmp_fail_ns != last_mmp_fail_ns) {
zfs_dbgmsg("MMP interval change pool '%s' "
"gethrtime %llu last_mmp_interval %llu "
"mmp_interval %llu last_mmp_fail_intervals %u "
"mmp_fail_intervals %u mmp_fail_ns %llu "
"skip_wait %d leaves %d next_time %llu",
zfs_dbgmsg("mmp: interval change, spa=%s "
"gethrtime=%llu last_mmp_interval=%llu "
"mmp_interval=%llu last_mmp_fail_intervals=%u "
"mmp_fail_intervals=%u mmp_fail_ns=%llu "
"skip_wait=%d leaves=%d next_time=%llu",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)last_mmp_interval,
(u_longlong_t)mmp_interval, last_mmp_fail_intervals,
@ -635,9 +746,9 @@ mmp_thread(void *arg)
*/
if ((!last_spa_multihost && multihost) ||
(last_spa_suspended && !suspended)) {
zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu "
"last_spa_multihost %u multihost %u "
"last_spa_suspended %u suspended %u",
zfs_dbgmsg("mmp: state change spa=%s: gethrtime=%llu "
"last_spa_multihost=%u multihost=%u "
"last_spa_suspended=%u suspended=%u",
spa_name(spa), (u_longlong_t)gethrtime(),
last_spa_multihost, multihost, last_spa_suspended,
suspended);
@ -663,9 +774,10 @@ mmp_thread(void *arg)
*/
if (multihost && !suspended && mmp_fail_intervals &&
(gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
"mmp_last_write %llu mmp_interval %llu "
"mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
zfs_dbgmsg("mmp: suspending pool, spa=%s "
"gethrtime=%llu mmp_last_write=%llu "
"mmp_interval=%llu mmp_fail_intervals=%llu "
"mmp_fail_ns=%llu txg=%llu",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)mmp->mmp_last_write,
(u_longlong_t)mmp_interval,

View File

@ -3722,32 +3722,104 @@ vdev_count_verify_zaps(vdev_t *vd)
#endif
/*
* Determine whether the activity check is required.
* Check the load_info results from the previous tryimport.
*
* error results:
* 0 - Pool remains in an idle state
* EREMOTEIO - Pool was known to be active on the other host
* ENOENT - The config does not contain complete tryimport info
*/
static boolean_t
spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
nvlist_t *config)
static int
spa_activity_verify_config(spa_t *spa, uberblock_t *ub)
{
uint64_t state = POOL_STATE_ACTIVE;
uint64_t hostid = 0;
uint64_t tryconfig_mmp_state = MMP_STATE_ACTIVE;
uint64_t tryconfig_txg = 0;
uint64_t tryconfig_timestamp = 0;
uint16_t tryconfig_mmp_seq = 0;
nvlist_t *nvinfo;
nvlist_t *nvinfo, *config = spa->spa_config;
int error;
if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
(void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
&tryconfig_txg);
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
&tryconfig_timestamp);
(void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
&tryconfig_mmp_seq);
/* Simply a non-zero value to indicate the verify was done. */
spa->spa_mmp.mmp_import_ns = 1000;
error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo);
if (error)
return (SET_ERROR(ENOENT));
/*
* If ZPOOL_CONFIG_MMP_STATE is present an activity check was performed
* during the earlier tryimport. If the state recorded there isn't
* MMP_STATE_INACTIVE the pool is known to be active on another host.
*/
error = nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE,
&tryconfig_mmp_state);
if (error)
return (SET_ERROR(ENOENT));
if (tryconfig_mmp_state != MMP_STATE_INACTIVE) {
spa_load_failed(spa, "mmp: pool is active on remote host, "
"state=%llu", (u_longlong_t)tryconfig_mmp_state);
return (SET_ERROR(EREMOTEIO));
}
/*
* If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
* during the earlier tryimport. If the txg recorded there is 0 then
* the pool is known to be active on another host.
*/
error = nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
&tryconfig_txg);
if (error)
return (SET_ERROR(ENOENT));
if (tryconfig_txg == 0) {
spa_load_failed(spa, "mmp: pool is active on remote host, "
"tryconfig_txg=%llu", (u_longlong_t)tryconfig_txg);
return (SET_ERROR(EREMOTEIO));
}
error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
&tryconfig_timestamp);
if (error)
return (SET_ERROR(ENOENT));
error = nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
&tryconfig_mmp_seq);
if (error)
return (SET_ERROR(ENOENT));
if (tryconfig_timestamp == ub->ub_timestamp &&
tryconfig_txg == ub->ub_txg &&
MMP_SEQ_VALID(ub) && tryconfig_mmp_seq == MMP_SEQ(ub)) {
zfs_dbgmsg("mmp: verified pool mmp tryimport config, "
"spa=%s", spa_load_name(spa));
return (0);
}
spa_load_failed(spa, "mmp: pool is active on remote host, "
"tc_timestamp=%llu ub_timestamp=%llu "
"tc_txg=%llu ub_txg=%llu tc_seq=%llu ub_seq=%llu",
(u_longlong_t)tryconfig_timestamp, (u_longlong_t)ub->ub_timestamp,
(u_longlong_t)tryconfig_txg, (u_longlong_t)ub->ub_txg,
(u_longlong_t)tryconfig_mmp_seq, (u_longlong_t)MMP_SEQ(ub));
return (SET_ERROR(EREMOTEIO));
}
/*
* Determine whether the activity check is required.
*/
static boolean_t
spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label)
{
nvlist_t *config = spa->spa_config;
uint64_t state = POOL_STATE_ACTIVE;
uint64_t hostid = 0;
/*
* Disable the MMP activity check - This is used by zdb which
* is intended to be used on potentially active pools.
* is always read-only and intended to be used on potentially
* active pools.
*/
if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) {
zfs_dbgmsg("mmp: skipping check ZFS_IMPORT_SKIP_MMP is set, "
@ -3757,46 +3829,44 @@ spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
/*
* Skip the activity check when the MMP feature is disabled.
* - MMP_MAGIC not set - Legacy pool predates the MMP feature, or
* - MMP_MAGIC set && mmp_delay == 0 - MMP feature is disabled.
*/
if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) {
if ((ub->ub_mmp_magic != MMP_MAGIC) ||
(ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)) {
zfs_dbgmsg("mmp: skipping check: feature is disabled, "
"spa=%s", spa_load_name(spa));
return (B_FALSE);
}
/*
* If the tryconfig_ values are nonzero, they are the results of an
* earlier tryimport. If they all match the uberblock we just found,
* then the pool has not changed and we return false so we do not test
* a second time.
*/
if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
tryconfig_mmp_seq && tryconfig_mmp_seq ==
(MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
zfs_dbgmsg("mmp: skipping check: tryconfig values match, "
"spa=%s", spa_name(spa));
return (B_FALSE);
}
/*
* Allow the activity check to be skipped when importing a cleanly
* exported pool on the same host which last imported it. Since the
* hostid from configuration may be stale use the one read from the
* label. Imports from other hostids must perform the activity check.
*/
if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
if (label != NULL) {
if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
hostid = fnvlist_lookup_uint64(label,
ZPOOL_CONFIG_HOSTID);
if (nvlist_exists(config, ZPOOL_CONFIG_POOL_STATE))
state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE);
if (nvlist_exists(config, ZPOOL_CONFIG_POOL_STATE))
state = fnvlist_lookup_uint64(config,
ZPOOL_CONFIG_POOL_STATE);
if (spa_get_hostid(spa) && hostid == spa_get_hostid(spa) &&
state == POOL_STATE_EXPORTED) {
zfs_dbgmsg("mmp: skipping check: hostid matches and pool is "
"exported, spa=%s, hostid=%llx",
spa_load_name(spa), (u_longlong_t)hostid);
return (B_FALSE);
if (spa_get_hostid(spa) && hostid == spa_get_hostid(spa) &&
state == POOL_STATE_EXPORTED) {
zfs_dbgmsg("mmp: skipping check: hostid matches "
"and pool is exported, spa=%s, hostid=%llx",
spa_load_name(spa), (u_longlong_t)hostid);
return (B_FALSE);
}
if (state == POOL_STATE_DESTROYED) {
zfs_dbgmsg("mmp: skipping check: intentionally "
"destroyed pool, spa=%s", spa_load_name(spa));
return (B_FALSE);
}
}
return (B_TRUE);
@ -3832,9 +3902,10 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
MMP_IMPORT_SAFETY_FACTOR / 100;
zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
"mmp_fails=%llu ub_mmp mmp_interval=%llu "
"import_intervals=%llu", (u_longlong_t)import_delay,
zfs_dbgmsg("mmp: settings spa=%s fail_intvals>0 "
"import_delay=%llu mmp_fails=%llu mmp_interval=%llu "
"import_intervals=%llu", spa_load_name(spa),
(u_longlong_t)import_delay,
(u_longlong_t)MMP_FAIL_INT(ub),
(u_longlong_t)MMP_INTERVAL(ub),
(u_longlong_t)import_intervals);
@ -3846,9 +3917,10 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
ub->ub_mmp_delay) * import_intervals);
zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
"mmp_interval=%llu ub_mmp_delay=%llu "
"import_intervals=%llu", (u_longlong_t)import_delay,
zfs_dbgmsg("mmp: settings spa=%s fail_intvals=0 "
"import_delay=%llu mmp_interval=%llu ub_mmp_delay=%llu "
"import_intervals=%llu", spa_load_name(spa),
(u_longlong_t)import_delay,
(u_longlong_t)MMP_INTERVAL(ub),
(u_longlong_t)ub->ub_mmp_delay,
(u_longlong_t)import_intervals);
@ -3861,17 +3933,18 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
import_delay = MAX(import_delay, (multihost_interval +
ub->ub_mmp_delay) * import_intervals);
zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
"import_intervals=%llu leaves=%u",
(u_longlong_t)import_delay,
zfs_dbgmsg("mmp: settings spa=%s import_delay=%llu "
"ub_mmp_delay=%llu import_intervals=%llu leaves=%u",
spa_load_name(spa), (u_longlong_t)import_delay,
(u_longlong_t)ub->ub_mmp_delay,
(u_longlong_t)import_intervals,
vdev_count_leaves(spa));
} else {
/* Using local tunings is the only reasonable option */
zfs_dbgmsg("pool last imported on non-MMP aware "
"host using import_delay=%llu multihost_interval=%llu "
"import_intervals=%llu", (u_longlong_t)import_delay,
zfs_dbgmsg("mmp: pool last imported on non-MMP aware "
"host using settings spa=%s import_delay=%llu "
"multihost_interval=%llu import_intervals=%llu",
spa_load_name(spa), (u_longlong_t)import_delay,
(u_longlong_t)multihost_interval,
(u_longlong_t)import_intervals);
}
@ -3880,7 +3953,122 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
}
/*
* Remote host activity check.
* Store the observed pool status in spa->spa_load_info nvlist. If the
* remote hostname or hostid are available from configuration read from
* disk store them as well. Additionally, provide some diagnostic info
* for which activity checks were run and their duration. This allows
* 'zpool import' to generate a more useful message.
*
* Mandatory observed pool status
* - ZPOOL_CONFIG_MMP_STATE - observed pool status (active/inactive)
* - ZPOOL_CONFIG_MMP_TXG - observed pool txg number
* - ZPOOL_CONFIG_MMP_SEQ - observed pool sequence id
*
* Optional information for detailed reporting
* - ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
* - ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool
* - ZPOOL_CONFIG_MMP_RESULT - set to result of activity check
* - ZPOOL_CONFIG_MMP_TRYIMPORT_NS - tryimport duration in nanosec
* - ZPOOL_CONFIG_MMP_IMPORT_NS - import duration in nanosec
* - ZPOOL_CONFIG_MMP_CLAIM_NS - claim duration in nanosec
*
* ZPOOL_CONFIG_MMP_RESULT can be set to:
* - ENXIO - system hostid not set
* - ESRCH - activity check skipped
* - EREMOTEIO - activity check detected active pool
* - EINTR - activity check interrupted
* - 0 - activity check detected no activity
*/
static void
spa_activity_set_load_info(spa_t *spa, nvlist_t *label, mmp_state_t state,
    uint64_t txg, uint16_t seq, int error)
{
	mmp_thread_t *mmp = &spa->spa_mmp;
	const char *hostname = NULL;
	uint64_t hostid = 0;

	/* Always report a zero txg and seq id for active pools. */
	if (state == MMP_STATE_ACTIVE) {
		ASSERT0(txg);
		ASSERT0(seq);
	}

	/* Optional: remote hostname/hostid when present in the label. */
	if (label) {
		if (nvlist_exists(label, ZPOOL_CONFIG_HOSTNAME)) {
			hostname = fnvlist_lookup_string(label,
			    ZPOOL_CONFIG_HOSTNAME);
			fnvlist_add_string(spa->spa_load_info,
			    ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
		}

		if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) {
			hostid = fnvlist_lookup_uint64(label,
			    ZPOOL_CONFIG_HOSTID);
			fnvlist_add_uint64(spa->spa_load_info,
			    ZPOOL_CONFIG_MMP_HOSTID, hostid);
		}
	}

	/* Mandatory observed pool status plus the activity check result. */
	fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, state);
	fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, txg);
	fnvlist_add_uint16(spa->spa_load_info, ZPOOL_CONFIG_MMP_SEQ, seq);
	fnvlist_add_uint32(spa->spa_load_info, ZPOOL_CONFIG_MMP_RESULT, error);

	/* Durations are reported only for checks which actually ran. */
	if (mmp->mmp_tryimport_ns > 0) {
		fnvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_TRYIMPORT_NS, mmp->mmp_tryimport_ns);
	}

	if (mmp->mmp_import_ns > 0) {
		fnvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_IMPORT_NS, mmp->mmp_import_ns);
	}

	if (mmp->mmp_claim_ns > 0) {
		fnvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_CLAIM_NS, mmp->mmp_claim_ns);
	}

	zfs_dbgmsg("mmp: set spa_load_info, spa=%s hostname=%s hostid=%llx "
	    "state=%d txg=%llu seq=%llu tryimport_ns=%lld import_ns=%lld "
	    "claim_ns=%lld", spa_load_name(spa),
	    hostname != NULL ? hostname : "none", (u_longlong_t)hostid,
	    (int)state, (u_longlong_t)txg, (u_longlong_t)seq,
	    (longlong_t)mmp->mmp_tryimport_ns, (longlong_t)mmp->mmp_import_ns,
	    (longlong_t)mmp->mmp_claim_ns);
}
/*
 * Emit a console message describing the outcome of the multihost activity
 * check performed during the named load state, and return the error code
 * to report.  ENXIO (system hostid unset) is remapped to EREMOTEIO, which
 * is what userspace expects; any other unlisted error passes through with
 * no message.
 */
static int
spa_ld_activity_result(spa_t *spa, int error, const char *state)
{
	if (error == ENXIO) {
		cmn_err(CE_WARN, "pool '%s' system hostid not set, "
		    "aborted import during %s", spa_load_name(spa), state);
		/* Userspace expects EREMOTEIO for no system hostid */
		return (EREMOTEIO);
	}

	if (error == EREMOTEIO) {
		cmn_err(CE_WARN, "pool '%s' activity detected, aborted "
		    "import during %s", spa_load_name(spa), state);
	} else if (error == EINTR) {
		cmn_err(CE_WARN, "pool '%s' activity check, interrupted "
		    "import during %s", spa_load_name(spa), state);
	} else if (error == 0) {
		cmn_err(CE_NOTE, "pool '%s' activity check completed "
		    "successfully", spa_load_name(spa));
	}

	return (error);
}
/*
* Remote host activity check. Performed during tryimport when the pool
* has passed the basic sanity checks and is open read-only.
*
* error results:
* 0 - no activity detected
@ -3888,17 +4076,9 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
* EINTR - user canceled the operation
*/
static int
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
spa_activity_check_tryimport(spa_t *spa, uberblock_t *spa_ub,
boolean_t importing)
{
uint64_t txg = ub->ub_txg;
uint64_t timestamp = ub->ub_timestamp;
uint64_t mmp_config = ub->ub_mmp_config;
uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
uint64_t import_delay;
hrtime_t import_expire, now;
nvlist_t *mmp_label = NULL;
vdev_t *rvd = spa->spa_root_vdev;
kcondvar_t cv;
kmutex_t mtx;
int error = 0;
@ -3907,65 +4087,55 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_enter(&mtx);
/*
* If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
* during the earlier tryimport. If the txg recorded there is 0 then
* the pool is known to be active on another host.
*
* Otherwise, the pool might be in use on another host. Check for
* changes in the uberblocks on disk if necessary.
*/
if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
ZPOOL_CONFIG_LOAD_INFO);
if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
vdev_uberblock_load(rvd, ub, &mmp_label);
error = SET_ERROR(EREMOTEIO);
goto out;
}
}
import_delay = spa_activity_check_duration(spa, ub);
uint64_t import_delay = spa_activity_check_duration(spa, spa_ub);
hrtime_t start_time = gethrtime();
/* Add a small random factor in case of simultaneous imports (0-25%) */
import_delay += import_delay * random_in_range(250) / 1000;
import_expire = gethrtime() + import_delay;
hrtime_t import_expire = gethrtime() + import_delay;
if (importing) {
/* Console message includes tryimport and claim time */
hrtime_t extra_delay = MMP_IMPORT_VERIFY_ITERS *
MSEC2NSEC(MMP_INTERVAL_VALID(spa_ub) ?
MMP_INTERVAL(spa_ub) : MMP_MIN_INTERVAL);
cmn_err(CE_NOTE, "pool '%s' multihost activity check "
"required, %llu seconds remaining", spa_load_name(spa),
(u_longlong_t)MAX(NSEC2SEC(import_delay + extra_delay), 1));
spa_import_progress_set_notes(spa, "Checking MMP activity, "
"waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
}
int iterations = 0;
hrtime_t now;
nvlist_t *mmp_label = NULL;
while ((now = gethrtime()) < import_expire) {
if (importing && iterations++ % 30 == 0) {
spa_import_progress_set_notes(spa, "Checking MMP "
"activity, %llu ms remaining",
(u_longlong_t)NSEC2MSEC(import_expire - now));
}
vdev_t *rvd = spa->spa_root_vdev;
uberblock_t mmp_ub;
if (importing) {
(void) spa_import_progress_set_mmp_check(spa_guid(spa),
NSEC2SEC(import_expire - gethrtime()));
}
vdev_uberblock_load(rvd, ub, &mmp_label);
if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
zfs_dbgmsg("multihost activity detected "
"txg %llu ub_txg %llu "
"timestamp %llu ub_timestamp %llu "
"mmp_config %#llx ub_mmp_config %#llx",
(u_longlong_t)txg, (u_longlong_t)ub->ub_txg,
(u_longlong_t)timestamp,
(u_longlong_t)ub->ub_timestamp,
(u_longlong_t)mmp_config,
(u_longlong_t)ub->ub_mmp_config);
vdev_uberblock_load(rvd, &mmp_ub, &mmp_label);
if (vdev_uberblock_compare(spa_ub, &mmp_ub)) {
spa_load_failed(spa, "mmp: activity detected during "
"tryimport, spa_ub_txg=%llu mmp_ub_txg=%llu "
"spa_ub_seq=%llu mmp_ub_seq=%llu "
"spa_ub_timestamp=%llu mmp_ub_timestamp=%llu "
"spa_ub_config=%#llx mmp_ub_config=%#llx",
(u_longlong_t)spa_ub->ub_txg,
(u_longlong_t)mmp_ub.ub_txg,
(u_longlong_t)(MMP_SEQ_VALID(spa_ub) ?
MMP_SEQ(spa_ub) : 0),
(u_longlong_t)(MMP_SEQ_VALID(&mmp_ub) ?
MMP_SEQ(&mmp_ub) : 0),
(u_longlong_t)spa_ub->ub_timestamp,
(u_longlong_t)mmp_ub.ub_timestamp,
(u_longlong_t)spa_ub->ub_mmp_config,
(u_longlong_t)mmp_ub.ub_mmp_config);
error = SET_ERROR(EREMOTEIO);
break;
}
@ -3983,52 +4153,255 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
error = 0;
}
out:
mutex_exit(&mtx);
mutex_destroy(&mtx);
cv_destroy(&cv);
/*
* If the pool is determined to be active store the status in the
* spa->spa_load_info nvlist. If the remote hostname or hostid are
* available from configuration read from disk store them as well.
* This allows 'zpool import' to generate a more useful message.
*
* ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory)
* ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
* ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool
*/
if (error == EREMOTEIO) {
if (mmp_label) {
if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
const char *hostname = fnvlist_lookup_string(
mmp_label, ZPOOL_CONFIG_HOSTNAME);
fnvlist_add_string(spa->spa_load_info,
ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
}
if (mmp_label)
nvlist_free(mmp_label);
if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
uint64_t hostid = fnvlist_lookup_uint64(
mmp_label, ZPOOL_CONFIG_HOSTID);
fnvlist_add_uint64(spa->spa_load_info,
ZPOOL_CONFIG_MMP_HOSTID, hostid);
}
if (spa->spa_load_state == SPA_LOAD_IMPORT ||
spa->spa_load_state == SPA_LOAD_OPEN) {
spa->spa_mmp.mmp_import_ns = gethrtime() - start_time;
} else {
spa->spa_mmp.mmp_tryimport_ns = gethrtime() - start_time;
}
return (error);
}
/*
 * Remote host activity check. Performed during import when the pool has
 * passed most sanity checks and has been reopened read/write.
 *
 * Repeatedly writes an MMP uberblock carrying a randomly chosen sequence
 * id (a claim token) to the pool's vdevs, waits one MMP interval, and
 * reads the best uberblock back.  If another host is simultaneously
 * importing or writing the pool, its MMP writes will clobber the token
 * and the mismatch is detected here.  The original MMP uberblock is
 * restored on exit so the check can be retried.
 *
 * error results:
 * 0 - no activity detected
 * EREMOTEIO - remote activity detected
 * EINTR - user canceled the operation
 */
static int
spa_activity_check_claim(spa_t *spa)
{
vdev_t *rvd = spa->spa_root_vdev;
nvlist_t *mmp_label;
uberblock_t spa_ub;
kcondvar_t cv;
kmutex_t mtx;
int error = 0;
/*
 * Local cv/mutex exist only to implement the interruptible per-iteration
 * delay via cv_timedwait_sig() below; nothing ever signals the cv.
 */
cv_init(&cv, NULL, CV_DEFAULT, NULL);
mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_enter(&mtx);
hrtime_t start_time = gethrtime();
/*
 * Load the best uberblock and verify it matches the uberblock already
 * identified and stored as spa->spa_uberblock to verify the pool has
 * not changed.
 */
vdev_uberblock_load(rvd, &spa_ub, &mmp_label);
if (memcmp(&spa->spa_uberblock, &spa_ub, sizeof (uberblock_t))) {
spa_load_failed(spa, "mmp: uberblock changed on disk");
error = SET_ERROR(EREMOTEIO);
goto out;
}
/* The claim protocol requires all MMP fields to be present and valid. */
if (!MMP_VALID(&spa_ub) || !MMP_INTERVAL_VALID(&spa_ub) ||
!MMP_SEQ_VALID(&spa_ub) || !MMP_FAIL_INT_VALID(&spa_ub)) {
spa_load_failed(spa, "mmp: is not enabled in spa uberblock");
error = SET_ERROR(EREMOTEIO);
goto out;
}
nvlist_free(mmp_label);
mmp_label = NULL;
uint64_t spa_ub_interval = MMP_INTERVAL(&spa_ub);
uint16_t spa_ub_seq = MMP_SEQ(&spa_ub);
/*
 * In the highly unlikely event the sequence numbers have been
 * exhausted reset the sequence to zero. As long as the MMP
 * uberblock is updated on all of the vdevs the activity will
 * still be detected.
 */
if (MMP_SEQ_MAX == spa_ub_seq)
spa_ub_seq = 0;
spa_import_progress_set_notes(spa,
"Establishing MMP claim, waiting %llu ms",
(u_longlong_t)(MMP_IMPORT_VERIFY_ITERS * spa_ub_interval));
/*
 * Repeatedly sync out an MMP uberblock with a randomly selected
 * sequence number, then read it back after the MMP interval. This
 * random value acts as a claim token and is visible on other hosts.
 * If the same random value is read back we can be certain no other
 * pool is attempting to import the pool.
 */
for (int i = MMP_IMPORT_VERIFY_ITERS; i > 0; i--) {
uberblock_t set_ub, mmp_ub;
uint16_t mmp_seq;
/* Report remaining wait time (i intervals) to 'zpool import'. */
(void) spa_import_progress_set_mmp_check(spa_guid(spa),
NSEC2SEC(i * MSEC2NSEC(spa_ub_interval)));
set_ub = spa_ub;
/*
 * Pick a random sequence strictly greater than the on-disk one so
 * the claim uberblock always wins vdev_uberblock_compare().
 */
mmp_seq = spa_ub_seq + 1 +
random_in_range(MMP_SEQ_MAX - spa_ub_seq);
MMP_SEQ_CLEAR(&set_ub);
set_ub.ub_mmp_config |= MMP_SEQ_SET(mmp_seq);
error = mmp_claim_uberblock(spa, rvd, &set_ub);
if (error) {
spa_load_failed(spa, "mmp: uberblock claim "
"failed, error=%d", error);
error = SET_ERROR(EREMOTEIO);
break;
}
/*
 * NOTE(review): these nvlist updates mark the pool ACTIVE on every
 * iteration before the verdict is known, and appear redundant with
 * spa_activity_set_load_info() after the loop — confirm they are
 * not stale lines from an earlier revision.
 */
fnvlist_add_uint64(spa->spa_load_info,
ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
fnvlist_add_uint64(spa->spa_load_info,
ZPOOL_CONFIG_MMP_TXG, 0);
/*
 * Sleep one MMP interval.  cv_timedwait_sig() returns -1 only on
 * timeout; any other return means a signal arrived, which is
 * treated as the user interactively cancelling the import.
 */
error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() +
MSEC_TO_TICK(spa_ub_interval));
if (error != -1) {
error = SET_ERROR(EINTR);
break;
}
/*
 * NOTE(review): this unconditionally flags the root vdev as
 * VDEV_AUX_ACTIVE each pass and the returned value is overwritten
 * below — verify this line is intentional and not a stray diff
 * artifact.
 */
error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
/* Read back the best uberblock; it must be the claim just written. */
vdev_uberblock_load(rvd, &mmp_ub, &mmp_label);
if (vdev_uberblock_compare(&set_ub, &mmp_ub)) {
spa_load_failed(spa, "mmp: activity detected during "
"claim, set_ub_txg=%llu mmp_ub_txg=%llu "
"set_ub_seq=%llu mmp_ub_seq=%llu "
"set_ub_timestamp=%llu mmp_ub_timestamp=%llu "
"set_ub_config=%#llx mmp_ub_config=%#llx",
(u_longlong_t)set_ub.ub_txg,
(u_longlong_t)mmp_ub.ub_txg,
(u_longlong_t)(MMP_SEQ_VALID(&set_ub) ?
MMP_SEQ(&set_ub) : 0),
(u_longlong_t)(MMP_SEQ_VALID(&mmp_ub) ?
MMP_SEQ(&mmp_ub) : 0),
(u_longlong_t)set_ub.ub_timestamp,
(u_longlong_t)mmp_ub.ub_timestamp,
(u_longlong_t)set_ub.ub_mmp_config,
(u_longlong_t)mmp_ub.ub_mmp_config);
error = SET_ERROR(EREMOTEIO);
break;
}
if (mmp_label) {
nvlist_free(mmp_label);
mmp_label = NULL;
}
error = 0;
}
out:
/* Record how long the claim check took for kstat/debug reporting. */
spa->spa_mmp.mmp_claim_ns = gethrtime() - start_time;
(void) spa_import_progress_set_mmp_check(spa_guid(spa), 0);
/* Publish the final verdict in spa_load_info for 'zpool import'. */
if (error == EREMOTEIO) {
spa_activity_set_load_info(spa, mmp_label,
MMP_STATE_ACTIVE, 0, 0, EREMOTEIO);
} else {
spa_activity_set_load_info(spa, mmp_label,
MMP_STATE_INACTIVE, spa_ub.ub_txg, MMP_SEQ(&spa_ub), 0);
}
/*
 * Restore the original sequence, this allows us to retry the
 * import procedure if a subsequent step fails during import.
 * Failure to restore it reduces the available sequence ids for
 * the next import but shouldn't be considered fatal.
 */
int restore_error = mmp_claim_uberblock(spa, rvd, &spa_ub);
if (restore_error) {
zfs_dbgmsg("mmp: uberblock restore failed, spa=%s error=%d",
spa_load_name(spa), restore_error);
}
if (mmp_label)
nvlist_free(mmp_label);
mutex_exit(&mtx);
mutex_destroy(&mtx);
cv_destroy(&cv);
return (error);
}
/*
 * Dispatch the MMP activity check appropriate to the current load state.
 *
 * Pools with MMP enabled (ub_mmp_magic/ub_mmp_delay set) may never be
 * imported by a host without a hostid.  Tryimport performs the read-only
 * activity check; import/open first verify the config and fall back to
 * the tryimport check when that verification reports ENOENT; rewind
 * (SPA_LOAD_RECOVER) skips the check entirely.
 *
 * Returns 0 when no activity is detected, otherwise an error suitable
 * for the caller (EREMOTEIO, EINTR, or ENXIO for a missing hostid).
 */
static int
spa_ld_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *label)
{
vdev_t *rvd = spa->spa_root_vdev;
int error;
/* Refuse to import an MMP-enabled pool when the system hostid is unset. */
if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
spa_get_hostid(spa) == 0) {
spa_activity_set_load_info(spa, label, MMP_STATE_NO_HOSTID,
ub->ub_txg, MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0, ENXIO);
zfs_dbgmsg("mmp: system hostid not set, ub_mmp_magic=%llx "
"ub_mmp_delay=%llu hostid=%llx",
(u_longlong_t)ub->ub_mmp_magic,
(u_longlong_t)ub->ub_mmp_delay,
(u_longlong_t)spa_get_hostid(spa));
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, ENXIO));
}
switch (spa->spa_load_state) {
case SPA_LOAD_TRYIMPORT:
/*
 * NB: the IMPORT/OPEN case below jumps to this label (into the
 * middle of this case) when config verification returns ENOENT.
 */
tryimport:
error = spa_activity_check_tryimport(spa, ub, B_TRUE);
if (error == EREMOTEIO) {
spa_activity_set_load_info(spa, label,
MMP_STATE_ACTIVE, 0, 0, EREMOTEIO);
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
} else if (error) {
/* Only EINTR (user cancellation) is expected here. */
ASSERT3S(error, ==, EINTR);
spa_activity_set_load_info(spa, label,
MMP_STATE_ACTIVE, 0, 0, EINTR);
return (error);
}
spa_activity_set_load_info(spa, label, MMP_STATE_INACTIVE,
ub->ub_txg, MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0, 0);
break;
case SPA_LOAD_IMPORT:
case SPA_LOAD_OPEN:
error = spa_activity_verify_config(spa, ub);
if (error == EREMOTEIO) {
spa_activity_set_load_info(spa, label,
MMP_STATE_ACTIVE, 0, 0, EREMOTEIO);
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
} else if (error) {
/* ENOENT: config unusable, run the read-only check instead. */
ASSERT3S(error, ==, ENOENT);
goto tryimport;
}
/* Load info set in spa_activity_check_claim() */
break;
case SPA_LOAD_RECOVER:
/* A rewind implies the pool was already claimed; skip the check. */
zfs_dbgmsg("mmp: skipping mmp check for rewind, spa=%s",
spa_load_name(spa));
break;
default:
/* Unexpected load state: fail safe by reporting the pool active. */
spa_activity_set_load_info(spa, label, MMP_STATE_ACTIVE,
0, 0, EREMOTEIO);
zfs_dbgmsg("mmp: unreachable, spa=%s spa_load_state=%d",
spa_load_name(spa), spa->spa_load_state);
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
}
return (0);
}
/*
* Called from zfs_ioc_clear for a pool that was suspended
* after failing mmp write checks.
@ -4068,8 +4441,9 @@ spa_mmp_remote_host_activity(spa_t *spa)
if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
zfs_dbgmsg("txg mismatch detected during pool clear "
"txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
zfs_dbgmsg("mmp: txg mismatch detected during pool clear, "
"spa=%s txg=%llu ub_txg=%llu timestamp=%llu "
"ub_timestamp=%llu", spa_name(spa),
(u_longlong_t)spa->spa_uberblock.ub_txg,
(u_longlong_t)best_ub.ub_txg,
(u_longlong_t)spa->spa_uberblock.ub_timestamp,
@ -4080,8 +4454,7 @@ spa_mmp_remote_host_activity(spa_t *spa)
/*
* Perform an activity check looking for any remote writer
*/
return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config,
B_FALSE) != 0);
return (spa_activity_check_tryimport(spa, &best_ub, B_FALSE) != 0);
}
static int
@ -4341,7 +4714,6 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
vdev_t *rvd = spa->spa_root_vdev;
nvlist_t *label;
uberblock_t *ub = &spa->spa_uberblock;
boolean_t activity_check = B_FALSE;
/*
* If we are opening the checkpointed state of the pool by
@ -4393,37 +4765,25 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
(u_longlong_t)RRSS_GET_OFFSET(ub));
}
/*
* For pools which have the multihost property on determine if the
* pool is truly inactive and can be safely imported. Prevent
* hosts which don't have a hostid set from importing the pool.
*/
activity_check = spa_activity_check_required(spa, ub, label,
spa->spa_config);
if (activity_check) {
if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
spa_get_hostid(spa) == 0) {
nvlist_free(label);
fnvlist_add_uint64(spa->spa_load_info,
ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
}
int error =
spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
spa->spa_activity_check = spa_activity_check_required(spa, ub, label);
if (spa->spa_activity_check) {
int error = spa_ld_activity_check(spa, ub, label);
if (error) {
spa_load_state_t state = spa->spa_load_state;
error = spa_ld_activity_result(spa, error,
state == SPA_LOAD_TRYIMPORT ? "tryimport" :
state == SPA_LOAD_IMPORT ? "import" : "open");
nvlist_free(label);
return (error);
}
fnvlist_add_uint64(spa->spa_load_info,
ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
fnvlist_add_uint64(spa->spa_load_info,
ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
fnvlist_add_uint16(spa->spa_load_info,
ZPOOL_CONFIG_MMP_SEQ,
(MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
} else {
fnvlist_add_uint32(spa->spa_load_info,
ZPOOL_CONFIG_MMP_RESULT, ESRCH);
}
/*
@ -4706,6 +5066,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
}
}
/*
* Final sanity check for multihost pools that no other host is
* accessing the pool. All of the read-only checks have passed at
* this point, so perform targeted updates to the mmp uberblocks to
* safely force a visible change.
*/
if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
!spa->spa_extreme_rewind && spa->spa_activity_check) {
error = spa_activity_check_claim(spa);
error = spa_ld_activity_result(spa, error, "claim");
if (error == EREMOTEIO)
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
else if (error)
return (error);
}
error = spa_check_for_missing_logs(spa);
if (error != 0)
return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
@ -5931,13 +6309,21 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
if (load_error == 0)
return (0);
if (load_error == ZFS_ERR_NO_CHECKPOINT) {
/*
* When attempting checkpoint-rewind on a pool with no
* checkpoint, we should not attempt to load uberblocks
* from previous txgs when spa_load fails.
*/
/* Do not attempt to load uberblocks from previous txgs when: */
switch (load_error) {
case ZFS_ERR_NO_CHECKPOINT:
/* Attempting checkpoint-rewind on a pool with no checkpoint */
ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
zfs_fallthrough;
case EREMOTEIO:
/* MMP determines the pool is active on another host */
zfs_fallthrough;
case EBADF:
/* The config cache is out of sync (vdevs or hostid) */
zfs_fallthrough;
case EINTR:
/* The user interactively interrupted the import */
spa_import_progress_remove(spa_guid(spa));
return (load_error);
}

View File

@ -1491,7 +1491,7 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env)
* conflicting uberblocks on disk with the same txg. The solution is simple:
* among uberblocks with equal txg, choose the one with the latest timestamp.
*/
static int
int
vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
{
int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg);
@ -1622,8 +1622,10 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
* matches the txg for our uberblock.
*/
if (cb.ubl_vd != NULL) {
vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. "
"txg %llu", spa_load_name(spa), (u_longlong_t)ub->ub_txg);
vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s, "
"txg=%llu seq=%llu", spa_load_name(spa),
(u_longlong_t)ub->ub_txg,
(u_longlong_t)(MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
if (ub->ub_raidz_reflow_info !=
cb.ubl_latest.ub_raidz_reflow_info) {