diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index bec3f94d9..b1e1b2150 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -3879,6 +3879,9 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, hostid, ctime(×tamp)); } + if (getenv("ZFS_LOAD_INFO_DEBUG")) + dump_nvlist(nvinfo, 4); + return (1); } diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 87422658c..507d1fa2d 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -873,6 +873,10 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ +#define ZPOOL_CONFIG_MMP_RESULT "mmp_result" /* not stored on disk */ +#define ZPOOL_CONFIG_MMP_TRYIMPORT_NS "mmp_tryimport_ns" /* not stored */ +#define ZPOOL_CONFIG_MMP_IMPORT_NS "mmp_import_ns" /* not stored on disk */ +#define ZPOOL_CONFIG_MMP_CLAIM_NS "mmp_claim_ns" /* not stored on disk */ #define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ #define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */ #define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" diff --git a/include/sys/mmp.h b/include/sys/mmp.h index 287623682..239170f46 100644 --- a/include/sys/mmp.h +++ b/include/sys/mmp.h @@ -33,6 +33,7 @@ extern "C" { #define MMP_DEFAULT_IMPORT_INTERVALS 20 #define MMP_DEFAULT_FAIL_INTERVALS 10 #define MMP_MIN_FAIL_INTERVALS 2 /* min if != 0 */ +#define MMP_IMPORT_VERIFY_ITERS 10 #define MMP_IMPORT_SAFETY_FACTOR 200 /* pct */ #define MMP_INTERVAL_OK(interval) MAX(interval, MMP_MIN_INTERVAL) #define MMP_FAIL_INTVS_OK(fails) (fails == 0 ? 
0 : MAX(fails, \ @@ -53,6 +54,9 @@ typedef struct mmp_thread { vdev_t *mmp_last_leaf; /* last mmp write sent here */ uint64_t mmp_leaf_last_gen; /* last mmp write sent here */ uint32_t mmp_seq; /* intra-second update counter */ + uint64_t mmp_tryimport_ns; /* tryimport activity check time */ + uint64_t mmp_import_ns; /* import activity check time */ + uint64_t mmp_claim_ns; /* claim activity check time */ } mmp_thread_t; @@ -62,6 +66,7 @@ extern void mmp_thread_start(struct spa *spa); extern void mmp_thread_stop(struct spa *spa); extern void mmp_update_uberblock(struct spa *spa, struct uberblock *ub); extern void mmp_signal_all_threads(void); +extern int mmp_claim_uberblock(spa_t *spa, vdev_t *vd, uberblock_t *ub); /* Global tuning */ extern int param_set_multihost_interval(ZFS_MODULE_PARAM_ARGS); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index c18d955f7..62cf196ee 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -298,6 +298,7 @@ struct spa { void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS]; uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ + boolean_t spa_activity_check; /* activity check required */ boolean_t spa_extreme_rewind; /* rewind past deferred frees */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub bytes */ diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index bcd40613d..8acfcc492 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -51,6 +51,12 @@ extern "C" { #define MMP_SEQ_VALID_BIT 0x02 #define MMP_FAIL_INT_VALID_BIT 0x04 +#define MMP_INTERVAL_MASK 0x00000000FFFFFF00 +#define MMP_SEQ_MASK 0x0000FFFF00000000 +#define MMP_FAIL_INT_MASK 0xFFFF000000000000 + +#define MMP_SEQ_MAX UINT16_MAX + #define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \ (ubp)->ub_mmp_magic == MMP_MAGIC) #define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ 
@@ -60,21 +66,25 @@ extern "C" { #define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_FAIL_INT_VALID_BIT)) -#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \ +#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & MMP_INTERVAL_MASK) \ >> 8) -#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \ +#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & MMP_SEQ_MASK) \ >> 32) -#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \ +#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & MMP_FAIL_INT_MASK) \ >> 48) #define MMP_INTERVAL_SET(write) \ - (((uint64_t)(write & 0xFFFFFF) << 8) | MMP_INTERVAL_VALID_BIT) + (((uint64_t)((write) & 0xFFFFFF) << 8) | MMP_INTERVAL_VALID_BIT) #define MMP_SEQ_SET(seq) \ - (((uint64_t)(seq & 0xFFFF) << 32) | MMP_SEQ_VALID_BIT) + (((uint64_t)((seq) & 0xFFFF) << 32) | MMP_SEQ_VALID_BIT) #define MMP_FAIL_INT_SET(fail) \ - (((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT) + (((uint64_t)((fail) & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT) + + +#define MMP_SEQ_CLEAR(ubp) \ + ((ubp)->ub_mmp_config &= ~(MMP_SEQ_MASK | MMP_SEQ_VALID_BIT)) /* * RAIDZ expansion reflow information. 
diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 86f2235f0..131cfc9cd 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -228,6 +228,8 @@ extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *); extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int); +extern int vdev_uberblock_compare(const struct uberblock *, + const struct uberblock *); extern int vdev_check_boot_reserve(spa_t *, vdev_t *); typedef enum { diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 756d701e2..db5cd6dc0 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2209,6 +2209,11 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, zpool_get_load_policy(config, &policy); + if (getenv("ZFS_LOAD_INFO_DEBUG") && nv != NULL && + nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0) { + dump_nvlist(nvinfo, 4); + } + if (error) { char desc[1024]; char aux[256]; diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index b8ba40ecd..aedcf9505 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -145,6 +145,15 @@ * Additionally, the duration is then extended by a random 25% to attempt to to * detect simultaneous imports. For example, if both partner hosts are rebooted * at the same time and automatically attempt to import the pool. + * + * Once the read-only activity check completes and the pool is determined to + * be inactive a second check is performed to claim the pool. During this + * phase the host writes out MMP uberblocks to each of the devices which are + * identical to the best uberblock but with a randomly selected sequence id. + * The "best" uberblock is then read back and it must contain this new sequence + * number. 
This check is performed multiple times to ensure that there is + * no window where a concurrently importing system can incorrectly determine + * the pool to be inactive. */ /* @@ -237,8 +246,8 @@ mmp_thread_start(spa_t *spa) if (!mmp->mmp_thread) { mmp->mmp_thread = thread_create(NULL, 0, mmp_thread, spa, 0, &p0, TS_RUN, defclsyspri); - zfs_dbgmsg("MMP thread started pool '%s' " - "gethrtime %llu", spa_name(spa), gethrtime()); + zfs_dbgmsg("mmp: mmp thread started spa=%s " + "gethrtime=%llu", spa_name(spa), gethrtime()); } mutex_exit(&mmp->mmp_thread_lock); } @@ -257,7 +266,7 @@ mmp_thread_stop(spa_t *spa) cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock); } mutex_exit(&mmp->mmp_thread_lock); - zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu", + zfs_dbgmsg("mmp: mmp thread stopped spa=%s gethrtime=%llu", spa_name(spa), gethrtime()); ASSERT0P(mmp->mmp_thread); @@ -449,9 +458,9 @@ mmp_write_uberblock(spa_t *spa) spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER); lock_acquire_time = gethrtime() - lock_acquire_time; if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) - zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " - "gethrtime %llu", spa_name(spa), lock_acquire_time, - gethrtime()); + zfs_dbgmsg("mmp: long SCL_STATE acquisition, spa=%s " + "acquire_time=%llu gethrtime=%llu", spa_name(spa), + lock_acquire_time, gethrtime()); mutex_enter(&mmp->mmp_io_lock); @@ -474,8 +483,8 @@ mmp_write_uberblock(spa_t *spa) spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg, gethrestime_sec(), mmp->mmp_delay, NULL, 0, mmp->mmp_kstat_id++, error); - zfs_dbgmsg("MMP error choosing leaf pool '%s' " - "gethrtime %llu fail_mask %#x", spa_name(spa), + zfs_dbgmsg("mmp: error choosing leaf, spa=%s " + "gethrtime=%llu fail_mask=%#x", spa_name(spa), gethrtime(), error); } mutex_exit(&mmp->mmp_io_lock); @@ -485,11 +494,11 @@ mmp_write_uberblock(spa_t *spa) vd = spa->spa_mmp.mmp_last_leaf; if (mmp->mmp_skip_error != 0) { - mmp->mmp_skip_error = 0; - 
zfs_dbgmsg("MMP write after skipping due to unavailable " - "leaves, pool '%s' gethrtime %llu leaf %llu", + zfs_dbgmsg("mmp: write after skipping due to unavailable " + "leaves, spa=%s gethrtime=%llu vdev=%llu error=%d", spa_name(spa), (u_longlong_t)gethrtime(), - (u_longlong_t)vd->vdev_guid); + (u_longlong_t)vd->vdev_guid, mmp->mmp_skip_error); + mmp->mmp_skip_error = 0; } if (mmp->mmp_zio_root == NULL) @@ -540,6 +549,108 @@ mmp_write_uberblock(spa_t *spa) zio_nowait(zio); } +static void +mmp_claim_uberblock_sync_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) + atomic_inc_64(good_writes); +} + +/* + * Write the uberblock to the first label of all leaves of the specified vdev. + * Two writes required for each mirror, one for a singleton, and parity+1 for + * raidz or draid vdevs. + */ +static void +mmp_claim_uberblock_sync(zio_t *zio, uint64_t *good_writes, + uint64_t *req_writes, uberblock_t *ub, vdev_t *vd, int flags) +{ + for (uint64_t c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_islog || cvd->vdev_isspare || cvd->vdev_isl2cache) + continue; + + if (cvd->vdev_top == cvd) { + uint64_t nparity = vdev_get_nparity(cvd); + if (nparity) { + *req_writes += nparity + 1; + } else { + *req_writes += + MIN(MAX(cvd->vdev_children, 1), 2); + } + } + + mmp_claim_uberblock_sync(zio, good_writes, req_writes, + ub, cvd, flags); + } + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (!vdev_writeable(vd)) + return; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + + abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); + abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); + abd_zero_off(ub_abd, sizeof (uberblock_t), + VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t)); + + vdev_label_write(zio, vd, 0, ub_abd, + VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) - + MMP_BLOCKS_PER_LABEL), VDEV_UBERBLOCK_SIZE(vd), + 
mmp_claim_uberblock_sync_done, good_writes, + flags | ZIO_FLAG_DONT_PROPAGATE); + + abd_free(ub_abd); +} + +int +mmp_claim_uberblock(spa_t *spa, vdev_t *vd, uberblock_t *ub) +{ + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + uint64_t good_writes = 0; + uint64_t req_writes = 0; + zio_t *zio; + + ASSERT(MMP_VALID(ub)); + ASSERT(MMP_SEQ_VALID(ub)); + + spa_config_enter(spa, SCL_ALL, mmp_tag, RW_WRITER); + + /* Sync the uberblock to all writeable leaves */ + zio = zio_root(spa, NULL, NULL, flags); + mmp_claim_uberblock_sync(zio, &good_writes, &req_writes, ub, vd, flags); + (void) zio_wait(zio); + + /* Flush the new uberblocks so they're immediately visible */ + zio = zio_root(spa, NULL, NULL, flags); + zio_flush(zio, vd); + (void) zio_wait(zio); + + spa_config_exit(spa, SCL_ALL, mmp_tag); + + zfs_dbgmsg("mmp: claiming uberblock, spa=%s txg=%llu seq=%llu " + "req_writes=%llu good_writes=%llu", spa_load_name(spa), + (u_longlong_t)ub->ub_txg, (u_longlong_t)MMP_SEQ(ub), + (u_longlong_t)req_writes, (u_longlong_t)good_writes); + + /* + * To guarantee visibility from a remote host we require a minimum + * number of good writes. For raidz/draid vdevs parity+1 writes, for + * mirrors 2 writes, and for singletons 1 write. 
+ */ + if (req_writes == 0 || good_writes < req_writes) + return (SET_ERROR(EIO)); + + return (0); +} + static __attribute__((noreturn)) void mmp_thread(void *arg) { @@ -616,11 +727,11 @@ mmp_thread(void *arg) next_time = gethrtime() + mmp_interval / leaves; if (mmp_fail_ns != last_mmp_fail_ns) { - zfs_dbgmsg("MMP interval change pool '%s' " - "gethrtime %llu last_mmp_interval %llu " - "mmp_interval %llu last_mmp_fail_intervals %u " - "mmp_fail_intervals %u mmp_fail_ns %llu " - "skip_wait %d leaves %d next_time %llu", + zfs_dbgmsg("mmp: interval change, spa=%s " + "gethrtime=%llu last_mmp_interval=%llu " + "mmp_interval=%llu last_mmp_fail_intervals=%u " + "mmp_fail_intervals=%u mmp_fail_ns=%llu " + "skip_wait=%d leaves=%d next_time=%llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)last_mmp_interval, (u_longlong_t)mmp_interval, last_mmp_fail_intervals, @@ -635,9 +746,9 @@ mmp_thread(void *arg) */ if ((!last_spa_multihost && multihost) || (last_spa_suspended && !suspended)) { - zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu " - "last_spa_multihost %u multihost %u " - "last_spa_suspended %u suspended %u", + zfs_dbgmsg("mmp: state change spa=%s: gethrtime=%llu " + "last_spa_multihost=%u multihost=%u " + "last_spa_suspended=%u suspended=%u", spa_name(spa), (u_longlong_t)gethrtime(), last_spa_multihost, multihost, last_spa_suspended, suspended); @@ -663,9 +774,10 @@ mmp_thread(void *arg) */ if (multihost && !suspended && mmp_fail_intervals && (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { - zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " - "mmp_last_write %llu mmp_interval %llu " - "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu", + zfs_dbgmsg("mmp: suspending pool, spa=%s " + "gethrtime=%llu mmp_last_write=%llu " + "mmp_interval=%llu mmp_fail_intervals=%llu " + "mmp_fail_ns=%llu txg=%llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)mmp->mmp_last_write, (u_longlong_t)mmp_interval, diff --git a/module/zfs/spa.c 
b/module/zfs/spa.c index b4259dbd2..ef726733f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3722,32 +3722,104 @@ vdev_count_verify_zaps(vdev_t *vd) #endif /* - * Determine whether the activity check is required. + * Check the load_info results from the previous tryimport. + * + * error results: + * 0 - Pool remains in an idle state + * EREMOTEIO - Pool was known to be active on the other host + * ENOENT - The config does not contain complete tryimport info */ -static boolean_t -spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, - nvlist_t *config) +static int +spa_activity_verify_config(spa_t *spa, uberblock_t *ub) { - uint64_t state = POOL_STATE_ACTIVE; - uint64_t hostid = 0; + uint64_t tryconfig_mmp_state = MMP_STATE_ACTIVE; uint64_t tryconfig_txg = 0; uint64_t tryconfig_timestamp = 0; uint16_t tryconfig_mmp_seq = 0; - nvlist_t *nvinfo; + nvlist_t *nvinfo, *config = spa->spa_config; + int error; - if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { - nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); - (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, - &tryconfig_txg); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, - &tryconfig_timestamp); - (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, - &tryconfig_mmp_seq); + /* Simply a non-zero value to indicate the verify was done. */ + spa->spa_mmp.mmp_import_ns = 1000; + + error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo); + if (error) + return (SET_ERROR(ENOENT)); + + /* + * If ZPOOL_CONFIG_MMP_STATE is present an activity check was performed + * during the earlier tryimport. If the state recorded there isn't + * MMP_STATE_INACTIVE the pool is known to be active on another host. 
+ */ + error = nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE, + &tryconfig_mmp_state); + if (error) + return (SET_ERROR(ENOENT)); + + if (tryconfig_mmp_state != MMP_STATE_INACTIVE) { + spa_load_failed(spa, "mmp: pool is active on remote host, " + "state=%llu", (u_longlong_t)tryconfig_mmp_state); + return (SET_ERROR(EREMOTEIO)); } + /* + * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed + * during the earlier tryimport. If the txg recorded there is 0 then + * the pool is known to be active on another host. + */ + error = nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, + &tryconfig_txg); + if (error) + return (SET_ERROR(ENOENT)); + + if (tryconfig_txg == 0) { + spa_load_failed(spa, "mmp: pool is active on remote host, " + "tryconfig_txg=%llu", (u_longlong_t)tryconfig_txg); + return (SET_ERROR(EREMOTEIO)); + } + + error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, + &tryconfig_timestamp); + if (error) + return (SET_ERROR(ENOENT)); + + error = nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, + &tryconfig_mmp_seq); + if (error) + return (SET_ERROR(ENOENT)); + + if (tryconfig_timestamp == ub->ub_timestamp && + tryconfig_txg == ub->ub_txg && + MMP_SEQ_VALID(ub) && tryconfig_mmp_seq == MMP_SEQ(ub)) { + zfs_dbgmsg("mmp: verified pool mmp tryimport config, " + "spa=%s", spa_load_name(spa)); + return (0); + } + + spa_load_failed(spa, "mmp: pool is active on remote host, " + "tc_timestamp=%llu ub_timestamp=%llu " + "tc_txg=%llu ub_txg=%llu tc_seq=%llu ub_seq=%llu", + (u_longlong_t)tryconfig_timestamp, (u_longlong_t)ub->ub_timestamp, + (u_longlong_t)tryconfig_txg, (u_longlong_t)ub->ub_txg, + (u_longlong_t)tryconfig_mmp_seq, (u_longlong_t)MMP_SEQ(ub)); + + return (SET_ERROR(EREMOTEIO)); +} + +/* + * Determine whether the activity check is required. 
+ */ +static boolean_t +spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label) +{ + nvlist_t *config = spa->spa_config; + uint64_t state = POOL_STATE_ACTIVE; + uint64_t hostid = 0; + /* * Disable the MMP activity check - This is used by zdb which - * is intended to be used on potentially active pools. + * is always read-only and intended to be used on potentially + * active pools. */ if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) { zfs_dbgmsg("mmp: skipping check ZFS_IMPORT_SKIP_MMP is set, " @@ -3757,46 +3829,44 @@ spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, /* * Skip the activity check when the MMP feature is disabled. + * - MMP_MAGIC not set - Legacy pool predates the MMP feature, or + * - MMP_MAGIC set && mmp_delay == 0 - MMP feature is disabled. */ - if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) { + if ((ub->ub_mmp_magic != MMP_MAGIC) || + (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)) { zfs_dbgmsg("mmp: skipping check: feature is disabled, " "spa=%s", spa_load_name(spa)); return (B_FALSE); } - /* - * If the tryconfig_ values are nonzero, they are the results of an - * earlier tryimport. If they all match the uberblock we just found, - * then the pool has not changed and we return false so we do not test - * a second time. - */ - if (tryconfig_txg && tryconfig_txg == ub->ub_txg && - tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && - tryconfig_mmp_seq && tryconfig_mmp_seq == - (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { - zfs_dbgmsg("mmp: skipping check: tryconfig values match, " - "spa=%s", spa_name(spa)); - return (B_FALSE); - } - /* * Allow the activity check to be skipped when importing a cleanly * exported pool on the same host which last imported it. Since the * hostid from configuration may be stale use the one read from the * label. Imports from other hostids must perform the activity check. 
*/ - if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) - hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); + if (label != NULL) { + if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) + hostid = fnvlist_lookup_uint64(label, + ZPOOL_CONFIG_HOSTID); - if (nvlist_exists(config, ZPOOL_CONFIG_POOL_STATE)) - state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); + if (nvlist_exists(config, ZPOOL_CONFIG_POOL_STATE)) + state = fnvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_STATE); - if (spa_get_hostid(spa) && hostid == spa_get_hostid(spa) && - state == POOL_STATE_EXPORTED) { - zfs_dbgmsg("mmp: skipping check: hostid matches and pool is " - "exported, spa=%s, hostid=%llx", - spa_load_name(spa), (u_longlong_t)hostid); - return (B_FALSE); + if (spa_get_hostid(spa) && hostid == spa_get_hostid(spa) && + state == POOL_STATE_EXPORTED) { + zfs_dbgmsg("mmp: skipping check: hostid matches " + "and pool is exported, spa=%s, hostid=%llx", + spa_load_name(spa), (u_longlong_t)hostid); + return (B_FALSE); + } + + if (state == POOL_STATE_DESTROYED) { + zfs_dbgmsg("mmp: skipping check: intentionally " + "destroyed pool, spa=%s", spa_load_name(spa)); + return (B_FALSE); + } } return (B_TRUE); @@ -3832,9 +3902,10 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * MMP_IMPORT_SAFETY_FACTOR / 100; - zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " - "mmp_fails=%llu ub_mmp mmp_interval=%llu " - "import_intervals=%llu", (u_longlong_t)import_delay, + zfs_dbgmsg("mmp: settings spa=%s fail_intvals>0 " + "import_delay=%llu mmp_fails=%llu mmp_interval=%llu " + "import_intervals=%llu", spa_load_name(spa), + (u_longlong_t)import_delay, (u_longlong_t)MMP_FAIL_INT(ub), (u_longlong_t)MMP_INTERVAL(ub), (u_longlong_t)import_intervals); @@ -3846,9 +3917,10 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + ub->ub_mmp_delay) * import_intervals); - 
zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " - "mmp_interval=%llu ub_mmp_delay=%llu " - "import_intervals=%llu", (u_longlong_t)import_delay, + zfs_dbgmsg("mmp: settings spa=%s fail_intvals=0 " + "import_delay=%llu mmp_interval=%llu ub_mmp_delay=%llu " + "import_intervals=%llu", spa_load_name(spa), + (u_longlong_t)import_delay, (u_longlong_t)MMP_INTERVAL(ub), (u_longlong_t)ub->ub_mmp_delay, (u_longlong_t)import_intervals); @@ -3861,17 +3933,18 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) import_delay = MAX(import_delay, (multihost_interval + ub->ub_mmp_delay) * import_intervals); - zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " - "import_intervals=%llu leaves=%u", - (u_longlong_t)import_delay, + zfs_dbgmsg("mmp: settings spa=%s import_delay=%llu " + "ub_mmp_delay=%llu import_intervals=%llu leaves=%u", + spa_load_name(spa), (u_longlong_t)import_delay, (u_longlong_t)ub->ub_mmp_delay, (u_longlong_t)import_intervals, vdev_count_leaves(spa)); } else { /* Using local tunings is the only reasonable option */ - zfs_dbgmsg("pool last imported on non-MMP aware " - "host using import_delay=%llu multihost_interval=%llu " - "import_intervals=%llu", (u_longlong_t)import_delay, + zfs_dbgmsg("mmp: pool last imported on non-MMP aware " + "host using settings spa=%s import_delay=%llu " + "multihost_interval=%llu import_intervals=%llu", + spa_load_name(spa), (u_longlong_t)import_delay, (u_longlong_t)multihost_interval, (u_longlong_t)import_intervals); } @@ -3880,7 +3953,122 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) } /* - * Remote host activity check. + * Store the observed pool status in spa->spa_load_info nvlist. If the + * remote hostname or hostid are available from configuration read from + * disk store them as well. Additionally, provide some diagnostic info + * for which activity checks were run and their duration. This allows + * 'zpool import' to generate a more useful message. 
+ * + * Mandatory observed pool status + * - ZPOOL_CONFIG_MMP_STATE - observed pool status (active/inactive) + * - ZPOOL_CONFIG_MMP_TXG - observed pool txg number + * - ZPOOL_CONFIG_MMP_SEQ - observed pool sequence id + * + * Optional information for detailed reporting + * - ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool + * - ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool + * - ZPOOL_CONFIG_MMP_RESULT - set to result of activity check + * - ZPOOL_CONFIG_MMP_TRYIMPORT_NS - tryimport duration in nanosec + * - ZPOOL_CONFIG_MMP_IMPORT_NS - import duration in nanosec + * - ZPOOL_CONFIG_MMP_CLAIM_NS - claim duration in nanosec + * + * ZPOOL_CONFIG_MMP_RESULT can be set to: + * - ENXIO - system hostid not set + * - ESRCH - activity check skipped + * - EREMOTEIO - activity check detected active pool + * - EINTR - activity check interrupted + * - 0 - activity check detected no activity + */ +static void +spa_activity_set_load_info(spa_t *spa, nvlist_t *label, mmp_state_t state, + uint64_t txg, uint16_t seq, int error) +{ + mmp_thread_t *mmp = &spa->spa_mmp; + const char *hostname = NULL; + uint64_t hostid = 0; + + /* Always report a zero txg and seq id for active pools. 
*/ + if (state == MMP_STATE_ACTIVE) { + ASSERT0(txg); + ASSERT0(seq); + } + + if (label) { + if (nvlist_exists(label, ZPOOL_CONFIG_HOSTNAME)) { + hostname = fnvlist_lookup_string(label, + ZPOOL_CONFIG_HOSTNAME); + fnvlist_add_string(spa->spa_load_info, + ZPOOL_CONFIG_MMP_HOSTNAME, hostname); + } + + if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) { + hostid = fnvlist_lookup_uint64(label, + ZPOOL_CONFIG_HOSTID); + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_HOSTID, hostid); + } + } + + fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, state); + fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, txg); + fnvlist_add_uint16(spa->spa_load_info, ZPOOL_CONFIG_MMP_SEQ, seq); + fnvlist_add_uint32(spa->spa_load_info, ZPOOL_CONFIG_MMP_RESULT, error); + + if (mmp->mmp_tryimport_ns > 0) { + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_TRYIMPORT_NS, mmp->mmp_tryimport_ns); + } + + if (mmp->mmp_import_ns > 0) { + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_IMPORT_NS, mmp->mmp_import_ns); + } + + if (mmp->mmp_claim_ns > 0) { + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_CLAIM_NS, mmp->mmp_claim_ns); + } + + zfs_dbgmsg("mmp: set spa_load_info, spa=%s hostname=%s hostid=%llx " + "state=%d txg=%llu seq=%llu tryimport_ns=%lld import_ns=%lld " + "claim_ns=%lld", spa_load_name(spa), + hostname != NULL ? 
hostname : "none", (u_longlong_t)hostid, + (int)state, (u_longlong_t)txg, (u_longlong_t)seq, + (longlong_t)mmp->mmp_tryimport_ns, (longlong_t)mmp->mmp_import_ns, + (longlong_t)mmp->mmp_claim_ns); +} + +static int +spa_ld_activity_result(spa_t *spa, int error, const char *state) +{ + switch (error) { + case ENXIO: + cmn_err(CE_WARN, "pool '%s' system hostid not set, " + "aborted import during %s", spa_load_name(spa), state); + /* Userspace expects EREMOTEIO for no system hostid */ + error = EREMOTEIO; + break; + case EREMOTEIO: + cmn_err(CE_WARN, "pool '%s' activity detected, aborted " + "import during %s", spa_load_name(spa), state); + break; + case EINTR: + cmn_err(CE_WARN, "pool '%s' activity check, interrupted " + "import during %s", spa_load_name(spa), state); + break; + case 0: + cmn_err(CE_NOTE, "pool '%s' activity check completed " + "successfully", spa_load_name(spa)); + break; + } + + return (error); +} + + +/* + * Remote host activity check. Performed during tryimport when the pool + * has passed on the basic sanity check and is open read-only. * * error results: * 0 - no activity detected @@ -3888,17 +4076,9 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) * EINTR - user canceled the operation */ static int -spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, +spa_activity_check_tryimport(spa_t *spa, uberblock_t *spa_ub, boolean_t importing) { - uint64_t txg = ub->ub_txg; - uint64_t timestamp = ub->ub_timestamp; - uint64_t mmp_config = ub->ub_mmp_config; - uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? 
MMP_SEQ(ub) : 0; - uint64_t import_delay; - hrtime_t import_expire, now; - nvlist_t *mmp_label = NULL; - vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; kmutex_t mtx; int error = 0; @@ -3907,65 +4087,55 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); mutex_enter(&mtx); - /* - * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed - * during the earlier tryimport. If the txg recorded there is 0 then - * the pool is known to be active on another host. - * - * Otherwise, the pool might be in use on another host. Check for - * changes in the uberblocks on disk if necessary. - */ - if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { - nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, - ZPOOL_CONFIG_LOAD_INFO); - - if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && - fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { - vdev_uberblock_load(rvd, ub, &mmp_label); - error = SET_ERROR(EREMOTEIO); - goto out; - } - } - - import_delay = spa_activity_check_duration(spa, ub); + uint64_t import_delay = spa_activity_check_duration(spa, spa_ub); + hrtime_t start_time = gethrtime(); /* Add a small random factor in case of simultaneous imports (0-25%) */ import_delay += import_delay * random_in_range(250) / 1000; - - import_expire = gethrtime() + import_delay; + hrtime_t import_expire = gethrtime() + import_delay; if (importing) { + /* Console message includes tryimport and claim time */ + hrtime_t extra_delay = MMP_IMPORT_VERIFY_ITERS * + MSEC2NSEC(MMP_INTERVAL_VALID(spa_ub) ? 
+ MMP_INTERVAL(spa_ub) : MMP_MIN_INTERVAL); + cmn_err(CE_NOTE, "pool '%s' multihost activity check " + "required, %llu seconds remaining", spa_load_name(spa), + (u_longlong_t)MAX(NSEC2SEC(import_delay + extra_delay), 1)); spa_import_progress_set_notes(spa, "Checking MMP activity, " "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); } - int iterations = 0; + hrtime_t now; + nvlist_t *mmp_label = NULL; + while ((now = gethrtime()) < import_expire) { - if (importing && iterations++ % 30 == 0) { - spa_import_progress_set_notes(spa, "Checking MMP " - "activity, %llu ms remaining", - (u_longlong_t)NSEC2MSEC(import_expire - now)); - } + vdev_t *rvd = spa->spa_root_vdev; + uberblock_t mmp_ub; if (importing) { (void) spa_import_progress_set_mmp_check(spa_guid(spa), NSEC2SEC(import_expire - gethrtime())); } - vdev_uberblock_load(rvd, ub, &mmp_label); - - if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || - mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { - zfs_dbgmsg("multihost activity detected " - "txg %llu ub_txg %llu " - "timestamp %llu ub_timestamp %llu " - "mmp_config %#llx ub_mmp_config %#llx", - (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, - (u_longlong_t)timestamp, - (u_longlong_t)ub->ub_timestamp, - (u_longlong_t)mmp_config, - (u_longlong_t)ub->ub_mmp_config); + vdev_uberblock_load(rvd, &mmp_ub, &mmp_label); + if (vdev_uberblock_compare(spa_ub, &mmp_ub)) { + spa_load_failed(spa, "mmp: activity detected during " + "tryimport, spa_ub_txg=%llu mmp_ub_txg=%llu " + "spa_ub_seq=%llu mmp_ub_seq=%llu " + "spa_ub_timestamp=%llu mmp_ub_timestamp=%llu " + "spa_ub_config=%#llx mmp_ub_config=%#llx", + (u_longlong_t)spa_ub->ub_txg, + (u_longlong_t)mmp_ub.ub_txg, + (u_longlong_t)(MMP_SEQ_VALID(spa_ub) ? + MMP_SEQ(spa_ub) : 0), + (u_longlong_t)(MMP_SEQ_VALID(&mmp_ub) ? 
+ MMP_SEQ(&mmp_ub) : 0), + (u_longlong_t)spa_ub->ub_timestamp, + (u_longlong_t)mmp_ub.ub_timestamp, + (u_longlong_t)spa_ub->ub_mmp_config, + (u_longlong_t)mmp_ub.ub_mmp_config); error = SET_ERROR(EREMOTEIO); break; } @@ -3983,52 +4153,255 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, error = 0; } -out: mutex_exit(&mtx); mutex_destroy(&mtx); cv_destroy(&cv); - /* - * If the pool is determined to be active store the status in the - * spa->spa_load_info nvlist. If the remote hostname or hostid are - * available from configuration read from disk store them as well. - * This allows 'zpool import' to generate a more useful message. - * - * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) - * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool - * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool - */ - if (error == EREMOTEIO) { - if (mmp_label) { - if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { - const char *hostname = fnvlist_lookup_string( - mmp_label, ZPOOL_CONFIG_HOSTNAME); - fnvlist_add_string(spa->spa_load_info, - ZPOOL_CONFIG_MMP_HOSTNAME, hostname); - } + if (mmp_label) + nvlist_free(mmp_label); - if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { - uint64_t hostid = fnvlist_lookup_uint64( - mmp_label, ZPOOL_CONFIG_HOSTID); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_HOSTID, hostid); - } + if (spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_OPEN) { + spa->spa_mmp.mmp_import_ns = gethrtime() - start_time; + } else { + spa->spa_mmp.mmp_tryimport_ns = gethrtime() - start_time; + } + + return (error); +} + +/* + * Remote host activity check. Performed during import when the pool has + * passed most sanity check and has been reopened read/write. 
+ * + * error results: + * 0 - no activity detected + * EREMOTEIO - remote activity detected + * EINTR - user canceled the operation + */ +static int +spa_activity_check_claim(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + nvlist_t *mmp_label; + uberblock_t spa_ub; + kcondvar_t cv; + kmutex_t mtx; + int error = 0; + + cv_init(&cv, NULL, CV_DEFAULT, NULL); + mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_enter(&mtx); + + hrtime_t start_time = gethrtime(); + + /* + * Load the best uberblock and verify it matches the uberblock already + * identified and stored as spa->spa_uberblock to verify the pool has + * not changed. + */ + vdev_uberblock_load(rvd, &spa_ub, &mmp_label); + + if (memcmp(&spa->spa_uberblock, &spa_ub, sizeof (uberblock_t))) { + spa_load_failed(spa, "mmp: uberblock changed on disk"); + error = SET_ERROR(EREMOTEIO); + goto out; + } + + if (!MMP_VALID(&spa_ub) || !MMP_INTERVAL_VALID(&spa_ub) || + !MMP_SEQ_VALID(&spa_ub) || !MMP_FAIL_INT_VALID(&spa_ub)) { + spa_load_failed(spa, "mmp: is not enabled in spa uberblock"); + error = SET_ERROR(EREMOTEIO); + goto out; + } + + nvlist_free(mmp_label); + mmp_label = NULL; + + uint64_t spa_ub_interval = MMP_INTERVAL(&spa_ub); + uint16_t spa_ub_seq = MMP_SEQ(&spa_ub); + + /* + * In the highly unlikely event the sequence numbers have been + * exhausted reset the sequence to zero. As long as the MMP + * uberblock is updated on all of the vdevs the activity will + * still be detected. + */ + if (MMP_SEQ_MAX == spa_ub_seq) + spa_ub_seq = 0; + + spa_import_progress_set_notes(spa, + "Establishing MMP claim, waiting %llu ms", + (u_longlong_t)(MMP_IMPORT_VERIFY_ITERS * spa_ub_interval)); + + /* + * Repeatedly sync out an MMP uberblock with a randomly selected + * sequence number, then read it back after the MMP interval. This + * random value acts as a claim token and is visible on other hosts. + * If the same random value is read back we can be certain no other + * host is attempting to import the pool. 
+ */ + for (int i = MMP_IMPORT_VERIFY_ITERS; i > 0; i--) { + uberblock_t set_ub, mmp_ub; + uint16_t mmp_seq; + + (void) spa_import_progress_set_mmp_check(spa_guid(spa), + NSEC2SEC(i * MSEC2NSEC(spa_ub_interval))); + + set_ub = spa_ub; + mmp_seq = spa_ub_seq + 1 + + random_in_range(MMP_SEQ_MAX - spa_ub_seq); + MMP_SEQ_CLEAR(&set_ub); + set_ub.ub_mmp_config |= MMP_SEQ_SET(mmp_seq); + + error = mmp_claim_uberblock(spa, rvd, &set_ub); + if (error) { + spa_load_failed(spa, "mmp: uberblock claim " + "failed, error=%d", error); + error = SET_ERROR(EREMOTEIO); + break; } - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_TXG, 0); + error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + + MSEC_TO_TICK(spa_ub_interval)); + if (error != -1) { + error = SET_ERROR(EINTR); + break; + } - error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); + vdev_uberblock_load(rvd, &mmp_ub, &mmp_label); + + if (vdev_uberblock_compare(&set_ub, &mmp_ub)) { + spa_load_failed(spa, "mmp: activity detected during " + "claim, set_ub_txg=%llu mmp_ub_txg=%llu " + "set_ub_seq=%llu mmp_ub_seq=%llu " + "set_ub_timestamp=%llu mmp_ub_timestamp=%llu " + "set_ub_config=%#llx mmp_ub_config=%#llx", + (u_longlong_t)set_ub.ub_txg, + (u_longlong_t)mmp_ub.ub_txg, + (u_longlong_t)(MMP_SEQ_VALID(&set_ub) ? + MMP_SEQ(&set_ub) : 0), + (u_longlong_t)(MMP_SEQ_VALID(&mmp_ub) ? 
+ MMP_SEQ(&mmp_ub) : 0), + (u_longlong_t)set_ub.ub_timestamp, + (u_longlong_t)mmp_ub.ub_timestamp, + (u_longlong_t)set_ub.ub_mmp_config, + (u_longlong_t)mmp_ub.ub_mmp_config); + error = SET_ERROR(EREMOTEIO); + break; + } + + if (mmp_label) { + nvlist_free(mmp_label); + mmp_label = NULL; + } + + error = 0; + } +out: + spa->spa_mmp.mmp_claim_ns = gethrtime() - start_time; + (void) spa_import_progress_set_mmp_check(spa_guid(spa), 0); + + if (error == EREMOTEIO) { + spa_activity_set_load_info(spa, mmp_label, + MMP_STATE_ACTIVE, 0, 0, EREMOTEIO); + } else { + spa_activity_set_load_info(spa, mmp_label, + MMP_STATE_INACTIVE, spa_ub.ub_txg, MMP_SEQ(&spa_ub), 0); + } + + /* + * Restore the original sequence, this allows us to retry the + * import procedure if a subsequent step fails during import. + * Failure to restore it reduces the available sequence ids for + * the next import but shouldn't be considered fatal. + */ + int restore_error = mmp_claim_uberblock(spa, rvd, &spa_ub); + if (restore_error) { + zfs_dbgmsg("mmp: uberblock restore failed, spa=%s error=%d", + spa_load_name(spa), restore_error); } if (mmp_label) nvlist_free(mmp_label); + mutex_exit(&mtx); + mutex_destroy(&mtx); + cv_destroy(&cv); + return (error); } +static int +spa_ld_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *label) +{ + vdev_t *rvd = spa->spa_root_vdev; + int error; + + if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && + spa_get_hostid(spa) == 0) { + spa_activity_set_load_info(spa, label, MMP_STATE_NO_HOSTID, + ub->ub_txg, MMP_SEQ_VALID(ub) ? 
MMP_SEQ(ub) : 0, ENXIO); + zfs_dbgmsg("mmp: system hostid not set, ub_mmp_magic=%llx " + "ub_mmp_delay=%llu hostid=%llx", + (u_longlong_t)ub->ub_mmp_magic, + (u_longlong_t)ub->ub_mmp_delay, + (u_longlong_t)spa_get_hostid(spa)); + return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, ENXIO)); + } + + switch (spa->spa_load_state) { + case SPA_LOAD_TRYIMPORT: +tryimport: + error = spa_activity_check_tryimport(spa, ub, B_TRUE); + if (error == EREMOTEIO) { + spa_activity_set_load_info(spa, label, + MMP_STATE_ACTIVE, 0, 0, EREMOTEIO); + return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); + } else if (error) { + ASSERT3S(error, ==, EINTR); + spa_activity_set_load_info(spa, label, + MMP_STATE_ACTIVE, 0, 0, EINTR); + return (error); + } + + spa_activity_set_load_info(spa, label, MMP_STATE_INACTIVE, + ub->ub_txg, MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0, 0); + + break; + + case SPA_LOAD_IMPORT: + case SPA_LOAD_OPEN: + error = spa_activity_verify_config(spa, ub); + if (error == EREMOTEIO) { + spa_activity_set_load_info(spa, label, + MMP_STATE_ACTIVE, 0, 0, EREMOTEIO); + return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); + } else if (error) { + ASSERT3S(error, ==, ENOENT); + goto tryimport; + } + + /* Load info set in spa_activity_check_claim() */ + + break; + + case SPA_LOAD_RECOVER: + zfs_dbgmsg("mmp: skipping mmp check for rewind, spa=%s", + spa_load_name(spa)); + break; + + default: + spa_activity_set_load_info(spa, label, MMP_STATE_ACTIVE, + 0, 0, EREMOTEIO); + zfs_dbgmsg("mmp: unreachable, spa=%s spa_load_state=%d", + spa_load_name(spa), spa->spa_load_state); + return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); + } + + return (0); +} + /* * Called from zfs_ioc_clear for a pool that was suspended * after failing mmp write checks. 
@@ -4068,8 +4441,9 @@ spa_mmp_remote_host_activity(spa_t *spa) if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { - zfs_dbgmsg("txg mismatch detected during pool clear " - "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", + zfs_dbgmsg("mmp: txg mismatch detected during pool clear, " + "spa=%s txg=%llu ub_txg=%llu timestamp=%llu " + "ub_timestamp=%llu", spa_name(spa), (u_longlong_t)spa->spa_uberblock.ub_txg, (u_longlong_t)best_ub.ub_txg, (u_longlong_t)spa->spa_uberblock.ub_timestamp, @@ -4080,8 +4454,7 @@ spa_mmp_remote_host_activity(spa_t *spa) /* * Perform an activity check looking for any remote writer */ - return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, - B_FALSE) != 0); + return (spa_activity_check_tryimport(spa, &best_ub, B_FALSE) != 0); } static int @@ -4341,7 +4714,6 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) vdev_t *rvd = spa->spa_root_vdev; nvlist_t *label; uberblock_t *ub = &spa->spa_uberblock; - boolean_t activity_check = B_FALSE; /* * If we are opening the checkpointed state of the pool by @@ -4393,37 +4765,25 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) (u_longlong_t)RRSS_GET_OFFSET(ub)); } - /* * For pools which have the multihost property on determine if the * pool is truly inactive and can be safely imported. Prevent * hosts which don't have a hostid set from importing the pool. 
*/ - activity_check = spa_activity_check_required(spa, ub, label, - spa->spa_config); - if (activity_check) { - if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && - spa_get_hostid(spa) == 0) { - nvlist_free(label); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); - return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); - } - - int error = - spa_activity_check(spa, ub, spa->spa_config, B_TRUE); + spa->spa_activity_check = spa_activity_check_required(spa, ub, label); + if (spa->spa_activity_check) { + int error = spa_ld_activity_check(spa, ub, label); if (error) { + spa_load_state_t state = spa->spa_load_state; + error = spa_ld_activity_result(spa, error, + state == SPA_LOAD_TRYIMPORT ? "tryimport" : + state == SPA_LOAD_IMPORT ? "import" : "open"); nvlist_free(label); return (error); } - - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); - fnvlist_add_uint16(spa->spa_load_info, - ZPOOL_CONFIG_MMP_SEQ, - (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); + } else { + fnvlist_add_uint32(spa->spa_load_info, + ZPOOL_CONFIG_MMP_RESULT, ESRCH); } @@ -4706,6 +5066,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, } } + /* + * Final sanity check for multihost pools that no other host is + * accessing the pool. All of the read-only checks have passed at + * this point, perform targeted updates to the mmp uberblocks to + * safely force a visible change. 
+ */ + if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && + !spa->spa_extreme_rewind && spa->spa_activity_check) { + + error = spa_activity_check_claim(spa); + error = spa_ld_activity_result(spa, error, "claim"); + + if (error == EREMOTEIO) + return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); + else if (error) + return (error); + } + error = spa_check_for_missing_logs(spa); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); @@ -5931,13 +6309,21 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); - if (load_error == ZFS_ERR_NO_CHECKPOINT) { - /* - * When attempting checkpoint-rewind on a pool with no - * checkpoint, we should not attempt to load uberblocks - * from previous txgs when spa_load fails. - */ + + /* Do not attempt to load uberblocks from previous txgs when: */ + switch (load_error) { + case ZFS_ERR_NO_CHECKPOINT: + /* Attempting checkpoint-rewind on a pool with no checkpoint */ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); + zfs_fallthrough; + case EREMOTEIO: + /* MMP determines the pool is active on another host */ + zfs_fallthrough; + case EBADF: + /* The config cache is out of sync (vdevs or hostid) */ + zfs_fallthrough; + case EINTR: + /* The user interactively interrupted the import */ spa_import_progress_remove(spa_guid(spa)); return (load_error); } diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index f64c06912..16ba09c6f 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1491,7 +1491,7 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) * conflicting uberblocks on disk with the same txg. The solution is simple: * among uberblocks with equal txg, choose the one with the latest timestamp. 
*/ -static int +int vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg); @@ -1622,8 +1622,10 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) * matches the txg for our uberblock. */ if (cb.ubl_vd != NULL) { - vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " - "txg %llu", spa_load_name(spa), (u_longlong_t)ub->ub_txg); + vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s, " + "txg=%llu seq=%llu", spa_load_name(spa), + (u_longlong_t)ub->ub_txg, + (u_longlong_t)(MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); if (ub->ub_raidz_reflow_info != cb.ubl_latest.ub_raidz_reflow_info) {