mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-25 11:47:43 +03:00
mmp: claim sequence id before final import
As part of SPA_LOAD_IMPORT add an additional activity check to
detect simultaneous imports from different hosts. This check is
only required when the timing is such that there's no activity
for the read-only tryimport check to detect. This extra
safety check operates as follows:
1. Repeats the following MMP check 10 times:
a. Write out an MMP uberblock with the best txg and a random
sequence id to all primary pool vdevs.
b. Verify a minimum number of good writes such that even if
the pool appears degraded on the remote host it will see
at least one of the updated MMP uberblocks.
c. Wait for the MMP interval; this leaves a window for other
racing hosts to make similar modifications which can be
detected.
d. Call vdev_uberblock_load() to determine the best uberblock
to use, this should be the MMP uberblock just written.
e. Verify the txg and random sequence number match the MMP
uberblock written in 1a.
2. Restore the original MMP uberblocks. This allows the check
to be performed again if the pool fails to import for an
unrelated reason.
This change also includes some refactoring and minor improvements.
- Never try loading earlier txgs during import when the import
fails with EREMOTEIO or EINTR. These errors don't indicate
the txg is damaged but instead that it's either in use on a
remote host or the import was interactively cancelled. A
rewind is likewise not performed for EBADF which can result from a
stale trusted config when doing a verbatim import.
- Refactor the code for consistent logging of the multihost
activity check using spa_load_note() and console messages
indicating when the activity check was triggered and the result.
- Added MMP_*_MASK and MMP_SEQ_CLEAR() macros to allow easier
modification of the sequence number in an uberblock.
- Added ZFS_LOAD_INFO_DEBUG environment variable which can be
set to dump to stdout the spa_load_info nvlist returned
during import. This is used by the updated mmp test cases
to determine if an activity check was run and its result.
- Standardize the mmp messages similarly to make it easier to
find all the relevant mmp lines in the debug log.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Reviewed-by: Akash B <akash-b@hpe.com>
This commit is contained in:
committed by
Tony Hutter
parent
2f048ced4d
commit
20176224ee
+135
-23
@@ -145,6 +145,15 @@
|
||||
* Additionally, the duration is then extended by a random 25% to attempt to
|
||||
* detect simultaneous imports. For example, if both partner hosts are rebooted
|
||||
* at the same time and automatically attempt to import the pool.
|
||||
*
|
||||
* Once the read-only activity check completes and the pool is determined to
|
||||
* be inactive a second check is performed to claim the pool. During this
|
||||
* phase the host writes out MMP uberblocks to each of the devices which are
|
||||
* identical to the best uberblock but with a randomly selected sequence id.
|
||||
* The "best" uberblock is then read back and it must contain this new sequence
|
||||
* number. This check is performed multiple times to ensure that there is
|
||||
* no window where a concurrently importing system can incorrectly determine
|
||||
* the pool to be inactive.
|
||||
*/
|
||||
|
||||
/*
|
||||
@@ -237,8 +246,8 @@ mmp_thread_start(spa_t *spa)
|
||||
if (!mmp->mmp_thread) {
|
||||
mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
|
||||
spa, 0, &p0, TS_RUN, defclsyspri);
|
||||
zfs_dbgmsg("MMP thread started pool '%s' "
|
||||
"gethrtime %llu", spa_name(spa), gethrtime());
|
||||
zfs_dbgmsg("mmp: mmp thread started spa=%s "
|
||||
"gethrtime=%llu", spa_name(spa), gethrtime());
|
||||
}
|
||||
mutex_exit(&mmp->mmp_thread_lock);
|
||||
}
|
||||
@@ -257,7 +266,7 @@ mmp_thread_stop(spa_t *spa)
|
||||
cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
|
||||
}
|
||||
mutex_exit(&mmp->mmp_thread_lock);
|
||||
zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu",
|
||||
zfs_dbgmsg("mmp: mmp thread stopped spa=%s gethrtime=%llu",
|
||||
spa_name(spa), gethrtime());
|
||||
|
||||
ASSERT0P(mmp->mmp_thread);
|
||||
@@ -449,9 +458,9 @@ mmp_write_uberblock(spa_t *spa)
|
||||
spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER);
|
||||
lock_acquire_time = gethrtime() - lock_acquire_time;
|
||||
if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
|
||||
zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
|
||||
"gethrtime %llu", spa_name(spa), lock_acquire_time,
|
||||
gethrtime());
|
||||
zfs_dbgmsg("mmp: long SCL_STATE acquisition, spa=%s "
|
||||
"acquire_time=%llu gethrtime=%llu", spa_name(spa),
|
||||
lock_acquire_time, gethrtime());
|
||||
|
||||
mutex_enter(&mmp->mmp_io_lock);
|
||||
|
||||
@@ -474,8 +483,8 @@ mmp_write_uberblock(spa_t *spa)
|
||||
spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
|
||||
gethrestime_sec(), mmp->mmp_delay, NULL, 0,
|
||||
mmp->mmp_kstat_id++, error);
|
||||
zfs_dbgmsg("MMP error choosing leaf pool '%s' "
|
||||
"gethrtime %llu fail_mask %#x", spa_name(spa),
|
||||
zfs_dbgmsg("mmp: error choosing leaf, spa=%s "
|
||||
"gethrtime=%llu fail_mask=%#x", spa_name(spa),
|
||||
gethrtime(), error);
|
||||
}
|
||||
mutex_exit(&mmp->mmp_io_lock);
|
||||
@@ -485,11 +494,11 @@ mmp_write_uberblock(spa_t *spa)
|
||||
|
||||
vd = spa->spa_mmp.mmp_last_leaf;
|
||||
if (mmp->mmp_skip_error != 0) {
|
||||
mmp->mmp_skip_error = 0;
|
||||
zfs_dbgmsg("MMP write after skipping due to unavailable "
|
||||
"leaves, pool '%s' gethrtime %llu leaf %llu",
|
||||
zfs_dbgmsg("mmp: write after skipping due to unavailable "
|
||||
"leaves, spa=%s gethrtime=%llu vdev=%llu error=%d",
|
||||
spa_name(spa), (u_longlong_t)gethrtime(),
|
||||
(u_longlong_t)vd->vdev_guid);
|
||||
(u_longlong_t)vd->vdev_guid, mmp->mmp_skip_error);
|
||||
mmp->mmp_skip_error = 0;
|
||||
}
|
||||
|
||||
if (mmp->mmp_zio_root == NULL)
|
||||
@@ -540,6 +549,108 @@ mmp_write_uberblock(spa_t *spa)
|
||||
zio_nowait(zio);
|
||||
}
|
||||
|
||||
static void
|
||||
mmp_claim_uberblock_sync_done(zio_t *zio)
|
||||
{
|
||||
uint64_t *good_writes = zio->io_private;
|
||||
|
||||
if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
|
||||
atomic_inc_64(good_writes);
|
||||
}
|
||||
|
||||
/*
|
||||
* Write the uberblock to the first label of all leaves of the specified vdev.
|
||||
* Two writes required for each mirror, one for a singleton, and parity+1 for
|
||||
* raidz or draid vdevs.
|
||||
*/
|
||||
static void
|
||||
mmp_claim_uberblock_sync(zio_t *zio, uint64_t *good_writes,
|
||||
uint64_t *req_writes, uberblock_t *ub, vdev_t *vd, int flags)
|
||||
{
|
||||
for (uint64_t c = 0; c < vd->vdev_children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
|
||||
if (cvd->vdev_islog || cvd->vdev_isspare || cvd->vdev_isl2cache)
|
||||
continue;
|
||||
|
||||
if (cvd->vdev_top == cvd) {
|
||||
uint64_t nparity = vdev_get_nparity(cvd);
|
||||
if (nparity) {
|
||||
*req_writes += nparity + 1;
|
||||
} else {
|
||||
*req_writes +=
|
||||
MIN(MAX(cvd->vdev_children, 1), 2);
|
||||
}
|
||||
}
|
||||
|
||||
mmp_claim_uberblock_sync(zio, good_writes, req_writes,
|
||||
ub, cvd, flags);
|
||||
}
|
||||
|
||||
if (!vd->vdev_ops->vdev_op_leaf)
|
||||
return;
|
||||
|
||||
if (!vdev_writeable(vd))
|
||||
return;
|
||||
|
||||
if (vd->vdev_ops == &vdev_draid_spare_ops)
|
||||
return;
|
||||
|
||||
abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
|
||||
abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
|
||||
abd_zero_off(ub_abd, sizeof (uberblock_t),
|
||||
VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t));
|
||||
|
||||
vdev_label_write(zio, vd, 0, ub_abd,
|
||||
VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
|
||||
MMP_BLOCKS_PER_LABEL), VDEV_UBERBLOCK_SIZE(vd),
|
||||
mmp_claim_uberblock_sync_done, good_writes,
|
||||
flags | ZIO_FLAG_DONT_PROPAGATE);
|
||||
|
||||
abd_free(ub_abd);
|
||||
}
|
||||
|
||||
int
|
||||
mmp_claim_uberblock(spa_t *spa, vdev_t *vd, uberblock_t *ub)
|
||||
{
|
||||
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
|
||||
uint64_t good_writes = 0;
|
||||
uint64_t req_writes = 0;
|
||||
zio_t *zio;
|
||||
|
||||
ASSERT(MMP_VALID(ub));
|
||||
ASSERT(MMP_SEQ_VALID(ub));
|
||||
|
||||
spa_config_enter(spa, SCL_ALL, mmp_tag, RW_WRITER);
|
||||
|
||||
/* Sync the uberblock to all writeable leaves */
|
||||
zio = zio_root(spa, NULL, NULL, flags);
|
||||
mmp_claim_uberblock_sync(zio, &good_writes, &req_writes, ub, vd, flags);
|
||||
(void) zio_wait(zio);
|
||||
|
||||
/* Flush the new uberblocks so they're immediately visible */
|
||||
zio = zio_root(spa, NULL, NULL, flags);
|
||||
zio_flush(zio, vd);
|
||||
(void) zio_wait(zio);
|
||||
|
||||
spa_config_exit(spa, SCL_ALL, mmp_tag);
|
||||
|
||||
zfs_dbgmsg("mmp: claiming uberblock, spa=%s txg=%llu seq=%llu "
|
||||
"req_writes=%llu good_writes=%llu", spa_load_name(spa),
|
||||
(u_longlong_t)ub->ub_txg, (u_longlong_t)MMP_SEQ(ub),
|
||||
(u_longlong_t)req_writes, (u_longlong_t)good_writes);
|
||||
|
||||
/*
|
||||
* To guarantee visibility from a remote host we require a minimum
|
||||
* number of good writes. For raidz/draid vdevs parity+1 writes, for
|
||||
* mirrors 2 writes, and for singletons 1 write.
|
||||
*/
|
||||
if (req_writes == 0 || good_writes < req_writes)
|
||||
return (SET_ERROR(EIO));
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
static __attribute__((noreturn)) void
|
||||
mmp_thread(void *arg)
|
||||
{
|
||||
@@ -616,11 +727,11 @@ mmp_thread(void *arg)
|
||||
next_time = gethrtime() + mmp_interval / leaves;
|
||||
|
||||
if (mmp_fail_ns != last_mmp_fail_ns) {
|
||||
zfs_dbgmsg("MMP interval change pool '%s' "
|
||||
"gethrtime %llu last_mmp_interval %llu "
|
||||
"mmp_interval %llu last_mmp_fail_intervals %u "
|
||||
"mmp_fail_intervals %u mmp_fail_ns %llu "
|
||||
"skip_wait %d leaves %d next_time %llu",
|
||||
zfs_dbgmsg("mmp: interval change, spa=%s "
|
||||
"gethrtime=%llu last_mmp_interval=%llu "
|
||||
"mmp_interval=%llu last_mmp_fail_intervals=%u "
|
||||
"mmp_fail_intervals=%u mmp_fail_ns=%llu "
|
||||
"skip_wait=%d leaves=%d next_time=%llu",
|
||||
spa_name(spa), (u_longlong_t)gethrtime(),
|
||||
(u_longlong_t)last_mmp_interval,
|
||||
(u_longlong_t)mmp_interval, last_mmp_fail_intervals,
|
||||
@@ -635,9 +746,9 @@ mmp_thread(void *arg)
|
||||
*/
|
||||
if ((!last_spa_multihost && multihost) ||
|
||||
(last_spa_suspended && !suspended)) {
|
||||
zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu "
|
||||
"last_spa_multihost %u multihost %u "
|
||||
"last_spa_suspended %u suspended %u",
|
||||
zfs_dbgmsg("mmp: state change spa=%s: gethrtime=%llu "
|
||||
"last_spa_multihost=%u multihost=%u "
|
||||
"last_spa_suspended=%u suspended=%u",
|
||||
spa_name(spa), (u_longlong_t)gethrtime(),
|
||||
last_spa_multihost, multihost, last_spa_suspended,
|
||||
suspended);
|
||||
@@ -663,9 +774,10 @@ mmp_thread(void *arg)
|
||||
*/
|
||||
if (multihost && !suspended && mmp_fail_intervals &&
|
||||
(gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
|
||||
zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
|
||||
"mmp_last_write %llu mmp_interval %llu "
|
||||
"mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
|
||||
zfs_dbgmsg("mmp: suspending pool, spa=%s "
|
||||
"gethrtime=%llu mmp_last_write=%llu "
|
||||
"mmp_interval=%llu mmp_fail_intervals=%llu "
|
||||
"mmp_fail_ns=%llu txg=%llu",
|
||||
spa_name(spa), (u_longlong_t)gethrtime(),
|
||||
(u_longlong_t)mmp->mmp_last_write,
|
||||
(u_longlong_t)mmp_interval,
|
||||
|
||||
Reference in New Issue
Block a user