MMP interval and fail_intervals in uberblock

When Multihost is enabled, and a pool is imported, uberblock writes
include ub_mmp_delay to allow an importing node to calculate the
duration of an activity test.  This value, is not enough information.

If zfs_multihost_fail_intervals > 0 on the node with the pool imported,
the safe minimum duration of the activity test is well defined, but does
not depend on ub_mmp_delay:

zfs_multihost_fail_intervals * zfs_multihost_interval

and if zfs_multihost_fail_intervals == 0 on that node, there is no such
well defined safe duration, but the importing host cannot tell whether
mmp_delay is high due to I/O delays, or due to a very large
zfs_multihost_interval setting on the host which last imported the pool.
As a result, it may use a far longer period for the activity test than
is necessary.

This patch renames ub_mmp_sequence to ub_mmp_config and uses it to
record the zfs_multihost_interval and zfs_multihost_fail_intervals
values, as well as the mmp sequence.  This allows a shorter activity
test duration to be calculated by the importing host in most situations.
These values are also added to the multihost_history kstat records.

It calculates the activity test duration differently depending on
whether the new fields are present or not; for importing pools with
only ub_mmp_delay, it uses

(zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals

Which results in an activity test duration less sensitive to the leaf
count.

In addition, it makes a few other improvements:
* It updates the "sequence" part of ub_mmp_config when MMP writes
  in between syncs occur.  This allows an importing host to detect MMP
  on the remote host sooner, when the pool is idle, as it is not limited
  to the granularity of ub_timestamp (1 second).
* It issues writes immediately when zfs_multihost_interval is changed
  so remote hosts see the updated value as soon as possible.
* It fixes a bug where setting zfs_multihost_fail_intervals = 1 results
  in immediate pool suspension.
* Update tests to verify activity check duration is based on recorded
  tunable values, not tunable values on importing host.
* Update tests to verify the expected number of uberblocks have valid
  MMP fields - fail_intervals, mmp_interval, mmp_seq (sequence number),
  that sequence number is incrementing, and that uberblock values match
  tunable settings.

Reviewed-by: Andreas Dilger <andreas.dilger@whamcloud.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #7842
This commit is contained in:
Olaf Faaland
2019-03-21 12:47:57 -07:00
committed by Brian Behlendorf
parent d10b2f1d35
commit 060f0226e6
17 changed files with 691 additions and 174 deletions
+97 -26
View File
@@ -2437,6 +2437,7 @@ spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
uint64_t hostid = 0;
uint64_t tryconfig_txg = 0;
uint64_t tryconfig_timestamp = 0;
uint16_t tryconfig_mmp_seq = 0;
nvlist_t *nvinfo;
if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
@@ -2445,6 +2446,8 @@ spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
&tryconfig_txg);
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
&tryconfig_timestamp);
(void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
&tryconfig_mmp_seq);
}
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state);
@@ -2462,13 +2465,15 @@ spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
return (B_FALSE);
/*
* If the tryconfig_* values are nonzero, they are the results of an
* earlier tryimport. If they match the uberblock we just found, then
* the pool has not changed and we return false so we do not test a
* second time.
* If the tryconfig_ values are nonzero, they are the results of an
* earlier tryimport. If they all match the uberblock we just found,
* then the pool has not changed and we return false so we do not test
* a second time.
*/
if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp)
tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
tryconfig_mmp_seq && tryconfig_mmp_seq ==
(MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
return (B_FALSE);
/*
@@ -2491,6 +2496,76 @@ spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
return (B_TRUE);
}
/*
* Nanoseconds the activity check must watch for changes on-disk.
*/
static uint64_t
spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
{
uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
uint64_t multihost_interval = MSEC2NSEC(
MMP_INTERVAL_OK(zfs_multihost_interval));
uint64_t import_delay = MAX(NANOSEC, import_intervals *
multihost_interval);
/*
* Local tunables determine a minimum duration except for the case
* where we know when the remote host will suspend the pool if MMP
* writes do not land.
*
* See Big Theory comment at the top of mmp.c for the reasoning behind
* these cases and times.
*/
ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
MMP_FAIL_INT(ub) > 0) {
/* MMP on remote host will suspend pool after failed writes */
import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
MMP_IMPORT_SAFETY_FACTOR / 100;
zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
"mmp_fails=%llu ub_mmp mmp_interval=%llu "
"import_intervals=%u", import_delay, MMP_FAIL_INT(ub),
MMP_INTERVAL(ub), import_intervals);
} else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
MMP_FAIL_INT(ub) == 0) {
/* MMP on remote host will never suspend pool */
import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
ub->ub_mmp_delay) * import_intervals);
zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
"mmp_interval=%llu ub_mmp_delay=%llu "
"import_intervals=%u", import_delay, MMP_INTERVAL(ub),
ub->ub_mmp_delay, import_intervals);
} else if (MMP_VALID(ub)) {
/*
* zfs-0.7 compatability case
*/
import_delay = MAX(import_delay, (multihost_interval +
ub->ub_mmp_delay) * import_intervals);
zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
"import_intervals=%u leaves=%u", import_delay,
ub->ub_mmp_delay, import_intervals,
vdev_count_leaves(spa));
} else {
/* Using local tunings is the only reasonable option */
zfs_dbgmsg("pool last imported on non-MMP aware "
"host using import_delay=%llu multihost_interval=%llu "
"import_intervals=%u", import_delay, multihost_interval,
import_intervals);
}
return (import_delay);
}
/*
* Perform the import activity check. If the user canceled the import or
* we detected activity then fail.
@@ -2498,10 +2573,11 @@ spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
static int
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
{
uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
uint64_t txg = ub->ub_txg;
uint64_t timestamp = ub->ub_timestamp;
uint64_t import_delay = NANOSEC;
uint64_t mmp_config = ub->ub_mmp_config;
uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
uint64_t import_delay;
hrtime_t import_expire;
nvlist_t *mmp_label = NULL;
vdev_t *rvd = spa->spa_root_vdev;
@@ -2518,7 +2594,7 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
* during the earlier tryimport. If the txg recorded there is 0 then
* the pool is known to be active on another host.
*
* Otherwise, the pool might be in use on another node. Check for
* Otherwise, the pool might be in use on another host. Check for
* changes in the uberblocks on disk if necessary.
*/
if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
@@ -2533,23 +2609,7 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
}
}
/*
* Preferentially use the zfs_multihost_interval from the node which
* last imported the pool. This value is stored in an MMP uberblock as.
*
* ub_mmp_delay * vdev_count_leaves() == zfs_multihost_interval
*/
if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay)
import_delay = MAX(import_delay, import_intervals *
ub->ub_mmp_delay * MAX(vdev_count_leaves(spa), 1));
/* Apply a floor using the local default values. */
import_delay = MAX(import_delay, import_intervals *
MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL)));
zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu import_intervals=%u "
"leaves=%u", import_delay, ub->ub_mmp_delay, import_intervals,
vdev_count_leaves(spa));
import_delay = spa_activity_check_duration(spa, ub);
/* Add a small random factor in case of simultaneous imports (0-25%) */
import_expire = gethrtime() + import_delay +
@@ -2558,7 +2618,15 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
while (gethrtime() < import_expire) {
vdev_uberblock_load(rvd, ub, &mmp_label);
if (txg != ub->ub_txg || timestamp != ub->ub_timestamp) {
if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
zfs_dbgmsg("multihost activity detected "
"txg %llu ub_txg %llu "
"timestamp %llu ub_timestamp %llu "
"mmp_config %#llx ub_mmp_config %#llx",
txg, ub->ub_txg, timestamp, ub->ub_timestamp,
mmp_config, ub->ub_mmp_config);
error = SET_ERROR(EREMOTEIO);
break;
}
@@ -2944,6 +3012,9 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
fnvlist_add_uint64(spa->spa_load_info,
ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
fnvlist_add_uint16(spa->spa_load_info,
ZPOOL_CONFIG_MMP_SEQ,
(MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
}
/*