mmp: claim sequence id before final import

As part of SPA_LOAD_IMPORT add an additional activity check to
detect simultaneous imports from different hosts.  This check is
only required when the timing is such that there's no activity
for the the read-only tryimport check to detect.  This extra
safety chceck operates as follows:

1. Repeats the following MMP check 10 times:
  a. Write out an MMP uberblock with the best txg and a random
     sequence id to all primary pool vdevs.
  b. Verify a minimum number of good writes such that even if
     the pool appears degraded on the remote host it will see
     at least one of the updated MMP uberblocks.
  c. Wait for the MMP interval this leaves a window for other
     racing hosts to make similar modifications which can be
     detected.
  d. Call vdev_uberblock_load() to determine the best uberblock
     to use, this should be the MMP uberblock just written.
  e. Verify the txg and random sequeunce number match the MMP
     uberblock written in 1a.

2. Restore the original MMP uberblocks.  This allows the check
   to be performed again if the pool fails to import for an
   unrelated reason.

This change also includes some refactoring and minor improvements.

- Never try loading earlier txgs during import when the import
  fails with EREMOTEIO or EINTER.  These errors don't indicate
  the txg is damaged but instead that its either in use on a
  remote host or the import was interactively cancelled.  No
  rewind is also performed for EBADD which can result from a
  stale trusted config when doing a verbatim import.

- Refactor the code for consistent logging of the multihost
  activity check using spa_load_note() and console messages
  indicating when the activity check was trigger and the result.

- Added MMP_*_MASK and MMP_SEQ_CLEAR() macros to allow easier
  modification of the sequence number in an uberblock.

- Added ZFS_LOAD_INFO_DEBUG environment variable which can be
  set to log to dump to stdout the spa_load_info nvlist returned
  during import.  This is used by the updated mmp test cases
  to determine if an activity check was run and its result.

- Standardize the mmp messages similarly to make it easier to
  find all the relevent mmp lines in the debug log.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Reviewed-by: Akash B <akash-b@hpe.com>
This commit is contained in:
Brian Behlendorf
2026-01-22 16:20:02 -08:00
committed by Tony Hutter
parent 2f048ced4d
commit 20176224ee
10 changed files with 731 additions and 201 deletions
+135 -23
View File
@@ -145,6 +145,15 @@
* Additionally, the duration is then extended by a random 25% to attempt to to
* detect simultaneous imports. For example, if both partner hosts are rebooted
* at the same time and automatically attempt to import the pool.
*
* Once the read-only activity check completes and the pool is determined to
* be inactive a second check is performed to claim the pool. During this
* phase the host writes out MMP uberblocks to each of the devices which are
* identical to the best uberblock but with a randomly selected sequence id.
* The "best" uberblock is then read back and it must contain this new sequence
* number. This check is performed multiple times to ensure that there is
* no window where a concurrently importing system can incorrectly determine
* the pool to be inactive.
*/
/*
@@ -237,8 +246,8 @@ mmp_thread_start(spa_t *spa)
if (!mmp->mmp_thread) {
mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
spa, 0, &p0, TS_RUN, defclsyspri);
zfs_dbgmsg("MMP thread started pool '%s' "
"gethrtime %llu", spa_name(spa), gethrtime());
zfs_dbgmsg("mmp: mmp thread started spa=%s "
"gethrtime=%llu", spa_name(spa), gethrtime());
}
mutex_exit(&mmp->mmp_thread_lock);
}
@@ -257,7 +266,7 @@ mmp_thread_stop(spa_t *spa)
cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
}
mutex_exit(&mmp->mmp_thread_lock);
zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu",
zfs_dbgmsg("mmp: mmp thread stopped spa=%s gethrtime=%llu",
spa_name(spa), gethrtime());
ASSERT0P(mmp->mmp_thread);
@@ -449,9 +458,9 @@ mmp_write_uberblock(spa_t *spa)
spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER);
lock_acquire_time = gethrtime() - lock_acquire_time;
if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
"gethrtime %llu", spa_name(spa), lock_acquire_time,
gethrtime());
zfs_dbgmsg("mmp: long SCL_STATE acquisition, spa=%s "
"acquire_time=%llu gethrtime=%llu", spa_name(spa),
lock_acquire_time, gethrtime());
mutex_enter(&mmp->mmp_io_lock);
@@ -474,8 +483,8 @@ mmp_write_uberblock(spa_t *spa)
spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
gethrestime_sec(), mmp->mmp_delay, NULL, 0,
mmp->mmp_kstat_id++, error);
zfs_dbgmsg("MMP error choosing leaf pool '%s' "
"gethrtime %llu fail_mask %#x", spa_name(spa),
zfs_dbgmsg("mmp: error choosing leaf, spa=%s "
"gethrtime=%llu fail_mask=%#x", spa_name(spa),
gethrtime(), error);
}
mutex_exit(&mmp->mmp_io_lock);
@@ -485,11 +494,11 @@ mmp_write_uberblock(spa_t *spa)
vd = spa->spa_mmp.mmp_last_leaf;
if (mmp->mmp_skip_error != 0) {
mmp->mmp_skip_error = 0;
zfs_dbgmsg("MMP write after skipping due to unavailable "
"leaves, pool '%s' gethrtime %llu leaf %llu",
zfs_dbgmsg("mmp: write after skipping due to unavailable "
"leaves, spa=%s gethrtime=%llu vdev=%llu error=%d",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)vd->vdev_guid);
(u_longlong_t)vd->vdev_guid, mmp->mmp_skip_error);
mmp->mmp_skip_error = 0;
}
if (mmp->mmp_zio_root == NULL)
@@ -540,6 +549,108 @@ mmp_write_uberblock(spa_t *spa)
zio_nowait(zio);
}
static void
mmp_claim_uberblock_sync_done(zio_t *zio)
{
uint64_t *good_writes = zio->io_private;
if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
atomic_inc_64(good_writes);
}
/*
* Write the uberblock to the first label of all leaves of the specified vdev.
* Two writes required for each mirror, one for a singleton, and parity+1 for
* raidz or draid vdevs.
*/
static void
mmp_claim_uberblock_sync(zio_t *zio, uint64_t *good_writes,
uint64_t *req_writes, uberblock_t *ub, vdev_t *vd, int flags)
{
for (uint64_t c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
if (cvd->vdev_islog || cvd->vdev_isspare || cvd->vdev_isl2cache)
continue;
if (cvd->vdev_top == cvd) {
uint64_t nparity = vdev_get_nparity(cvd);
if (nparity) {
*req_writes += nparity + 1;
} else {
*req_writes +=
MIN(MAX(cvd->vdev_children, 1), 2);
}
}
mmp_claim_uberblock_sync(zio, good_writes, req_writes,
ub, cvd, flags);
}
if (!vd->vdev_ops->vdev_op_leaf)
return;
if (!vdev_writeable(vd))
return;
if (vd->vdev_ops == &vdev_draid_spare_ops)
return;
abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
abd_zero_off(ub_abd, sizeof (uberblock_t),
VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t));
vdev_label_write(zio, vd, 0, ub_abd,
VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
MMP_BLOCKS_PER_LABEL), VDEV_UBERBLOCK_SIZE(vd),
mmp_claim_uberblock_sync_done, good_writes,
flags | ZIO_FLAG_DONT_PROPAGATE);
abd_free(ub_abd);
}
int
mmp_claim_uberblock(spa_t *spa, vdev_t *vd, uberblock_t *ub)
{
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
uint64_t good_writes = 0;
uint64_t req_writes = 0;
zio_t *zio;
ASSERT(MMP_VALID(ub));
ASSERT(MMP_SEQ_VALID(ub));
spa_config_enter(spa, SCL_ALL, mmp_tag, RW_WRITER);
/* Sync the uberblock to all writeable leaves */
zio = zio_root(spa, NULL, NULL, flags);
mmp_claim_uberblock_sync(zio, &good_writes, &req_writes, ub, vd, flags);
(void) zio_wait(zio);
/* Flush the new uberblocks so they're immediately visible */
zio = zio_root(spa, NULL, NULL, flags);
zio_flush(zio, vd);
(void) zio_wait(zio);
spa_config_exit(spa, SCL_ALL, mmp_tag);
zfs_dbgmsg("mmp: claiming uberblock, spa=%s txg=%llu seq=%llu "
"req_writes=%llu good_writes=%llu", spa_load_name(spa),
(u_longlong_t)ub->ub_txg, (u_longlong_t)MMP_SEQ(ub),
(u_longlong_t)req_writes, (u_longlong_t)good_writes);
/*
* To guarantee visibility from a remote host we require a minimum
* number of good writes. For raidz/draid vdevs parity+1 writes, for
* mirrors 2 writes, and for singletons 1 write.
*/
if (req_writes == 0 || good_writes < req_writes)
return (SET_ERROR(EIO));
return (0);
}
static __attribute__((noreturn)) void
mmp_thread(void *arg)
{
@@ -616,11 +727,11 @@ mmp_thread(void *arg)
next_time = gethrtime() + mmp_interval / leaves;
if (mmp_fail_ns != last_mmp_fail_ns) {
zfs_dbgmsg("MMP interval change pool '%s' "
"gethrtime %llu last_mmp_interval %llu "
"mmp_interval %llu last_mmp_fail_intervals %u "
"mmp_fail_intervals %u mmp_fail_ns %llu "
"skip_wait %d leaves %d next_time %llu",
zfs_dbgmsg("mmp: interval change, spa=%s "
"gethrtime=%llu last_mmp_interval=%llu "
"mmp_interval=%llu last_mmp_fail_intervals=%u "
"mmp_fail_intervals=%u mmp_fail_ns=%llu "
"skip_wait=%d leaves=%d next_time=%llu",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)last_mmp_interval,
(u_longlong_t)mmp_interval, last_mmp_fail_intervals,
@@ -635,9 +746,9 @@ mmp_thread(void *arg)
*/
if ((!last_spa_multihost && multihost) ||
(last_spa_suspended && !suspended)) {
zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu "
"last_spa_multihost %u multihost %u "
"last_spa_suspended %u suspended %u",
zfs_dbgmsg("mmp: state change spa=%s: gethrtime=%llu "
"last_spa_multihost=%u multihost=%u "
"last_spa_suspended=%u suspended=%u",
spa_name(spa), (u_longlong_t)gethrtime(),
last_spa_multihost, multihost, last_spa_suspended,
suspended);
@@ -663,9 +774,10 @@ mmp_thread(void *arg)
*/
if (multihost && !suspended && mmp_fail_intervals &&
(gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
"mmp_last_write %llu mmp_interval %llu "
"mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
zfs_dbgmsg("mmp: suspending pool, spa=%s "
"gethrtime=%llu mmp_last_write=%llu "
"mmp_interval=%llu mmp_fail_intervals=%llu "
"mmp_fail_ns=%llu txg=%llu",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)mmp->mmp_last_write,
(u_longlong_t)mmp_interval,