mmp: claim sequence id before final import

As part of SPA_LOAD_IMPORT add an additional activity check to detect simultaneous imports from different hosts. This check is only required when the timing is such that there's no activity for the the read-only tryimport check to detect. This extra safety chceck operates as follows: 1. Repeats the following MMP check 10 times: a. Write out an MMP uberblock with the best txg and a random sequence id to all primary pool vdevs. b. Verify a minimum number of good writes such that even if the pool appears degraded on the remote host it will see at least one of the updated MMP uberblocks. c. Wait for the MMP interval this leaves a window for other racing hosts to make similar modifications which can be detected. d. Call vdev_uberblock_load() to determine the best uberblock to use, this should be the MMP uberblock just written. e. Verify the txg and random sequeunce number match the MMP uberblock written in 1a. 2. Restore the original MMP uberblocks. This allows the check to be performed again if the pool fails to import for an unrelated reason. This change also includes some refactoring and minor improvements. - Never try loading earlier txgs during import when the import fails with EREMOTEIO or EINTER. These errors don't indicate the txg is damaged but instead that its either in use on a remote host or the import was interactively cancelled. No rewind is also performed for EBADD which can result from a stale trusted config when doing a verbatim import. - Refactor the code for consistent logging of the multihost activity check using spa_load_note() and console messages indicating when the activity check was trigger and the result. - Added MMP_*_MASK and MMP_SEQ_CLEAR() macros to allow easier modification of the sequence number in an uberblock. - Added ZFS_LOAD_INFO_DEBUG environment variable which can be set to log to dump to stdout the spa_load_info nvlist returned during import. This is used by the updated mmp test cases to determine if an activity check was run and its result. - Standardize the mmp messages similarly to make it easier to find all the relevent mmp lines in the debug log. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Olaf Faaland <faaland1@llnl.gov> Reviewed-by: Akash B <akash-b@hpe.com>
2026-05-25 11:47:43 +03:00 · 2026-01-22 16:20:02 -08:00
parent 2f048ced4d
commit 20176224ee
10 changed files with 731 additions and 201 deletions
@@ -145,6 +145,15 @@
 * Additionally, the duration is then extended by a random 25% to attempt to to
 * detect simultaneous imports.  For example, if both partner hosts are rebooted
 * at the same time and automatically attempt to import the pool.
+ *
+ * Once the read-only activity check completes and the pool is determined to
+ * be inactive a second check is performed to claim the pool.  During this
+ * phase the host writes out MMP uberblocks to each of the devices which are
+ * identical to the best uberblock but with a randomly selected sequence id.
+ * The "best" uberblock is then read back and it must contain this new sequence
+ * number.  This check is performed multiple times to ensure that there is
+ * no window where a concurrently importing system can incorrectly determine
+ * the pool to be inactive.
 */

 /*
@@ -237,8 +246,8 @@ mmp_thread_start(spa_t *spa)
 		if (!mmp->mmp_thread) {
 			mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
 			    spa, 0, &p0, TS_RUN, defclsyspri);
-			zfs_dbgmsg("MMP thread started pool '%s' "
-			    "gethrtime %llu", spa_name(spa), gethrtime());
+			zfs_dbgmsg("mmp: mmp thread started spa=%s "
+			    "gethrtime=%llu", spa_name(spa), gethrtime());
 		}
 		mutex_exit(&mmp->mmp_thread_lock);
 	}
@@ -257,7 +266,7 @@ mmp_thread_stop(spa_t *spa)
 		cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
 	}
 	mutex_exit(&mmp->mmp_thread_lock);
-	zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu",
+	zfs_dbgmsg("mmp: mmp thread stopped spa=%s gethrtime=%llu",
 	    spa_name(spa), gethrtime());

 	ASSERT0P(mmp->mmp_thread);
@@ -449,9 +458,9 @@ mmp_write_uberblock(spa_t *spa)
 	spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER);
 	lock_acquire_time = gethrtime() - lock_acquire_time;
 	if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
-		zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
-		    "gethrtime %llu", spa_name(spa), lock_acquire_time,
-		    gethrtime());
+		zfs_dbgmsg("mmp: long SCL_STATE acquisition, spa=%s "
+		    "acquire_time=%llu gethrtime=%llu", spa_name(spa),
+		    lock_acquire_time, gethrtime());

 	mutex_enter(&mmp->mmp_io_lock);

@@ -474,8 +483,8 @@ mmp_write_uberblock(spa_t *spa)
 			spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
 			    gethrestime_sec(), mmp->mmp_delay, NULL, 0,
 			    mmp->mmp_kstat_id++, error);
-			zfs_dbgmsg("MMP error choosing leaf pool '%s' "
-			    "gethrtime %llu fail_mask %#x", spa_name(spa),
+			zfs_dbgmsg("mmp: error choosing leaf, spa=%s "
+			    "gethrtime=%llu fail_mask=%#x", spa_name(spa),
 			    gethrtime(), error);
 		}
 		mutex_exit(&mmp->mmp_io_lock);
@@ -485,11 +494,11 @@ mmp_write_uberblock(spa_t *spa)

 	vd = spa->spa_mmp.mmp_last_leaf;
 	if (mmp->mmp_skip_error != 0) {
-		mmp->mmp_skip_error = 0;
-		zfs_dbgmsg("MMP write after skipping due to unavailable "
-		    "leaves, pool '%s' gethrtime %llu leaf %llu",
+		zfs_dbgmsg("mmp: write after skipping due to unavailable "
+		    "leaves, spa=%s gethrtime=%llu vdev=%llu error=%d",
 		    spa_name(spa), (u_longlong_t)gethrtime(),
-		    (u_longlong_t)vd->vdev_guid);
+		    (u_longlong_t)vd->vdev_guid, mmp->mmp_skip_error);
+		mmp->mmp_skip_error = 0;
 	}

 	if (mmp->mmp_zio_root == NULL)
@@ -540,6 +549,108 @@ mmp_write_uberblock(spa_t *spa)
 	zio_nowait(zio);
 }

+static void
+mmp_claim_uberblock_sync_done(zio_t *zio)
+{
+	uint64_t *good_writes = zio->io_private;
+
+	if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
+		atomic_inc_64(good_writes);
+}
+
+/*
+ * Write the uberblock to the first label of all leaves of the specified vdev.
+ * Two writes required for each mirror, one for a singleton, and parity+1 for
+ * raidz or draid vdevs.
+ */
+static void
+mmp_claim_uberblock_sync(zio_t *zio, uint64_t *good_writes,
+    uint64_t *req_writes, uberblock_t *ub, vdev_t *vd, int flags)
+{
+	for (uint64_t c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (cvd->vdev_islog || cvd->vdev_isspare || cvd->vdev_isl2cache)
+			continue;
+
+		if (cvd->vdev_top == cvd) {
+			uint64_t nparity = vdev_get_nparity(cvd);
+			if (nparity) {
+				*req_writes += nparity + 1;
+			} else {
+				*req_writes +=
+				    MIN(MAX(cvd->vdev_children, 1), 2);
+			}
+		}
+
+		mmp_claim_uberblock_sync(zio, good_writes, req_writes,
+		    ub, cvd, flags);
+	}
+
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return;
+
+	if (!vdev_writeable(vd))
+		return;
+
+	if (vd->vdev_ops == &vdev_draid_spare_ops)
+		return;
+
+	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
+	abd_zero_off(ub_abd, sizeof (uberblock_t),
+	    VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t));
+
+	vdev_label_write(zio, vd, 0, ub_abd,
+	    VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
+	    MMP_BLOCKS_PER_LABEL), VDEV_UBERBLOCK_SIZE(vd),
+	    mmp_claim_uberblock_sync_done, good_writes,
+	    flags | ZIO_FLAG_DONT_PROPAGATE);
+
+	abd_free(ub_abd);
+}
+
+int
+mmp_claim_uberblock(spa_t *spa, vdev_t *vd, uberblock_t *ub)
+{
+	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+	uint64_t good_writes = 0;
+	uint64_t req_writes = 0;
+	zio_t *zio;
+
+	ASSERT(MMP_VALID(ub));
+	ASSERT(MMP_SEQ_VALID(ub));
+
+	spa_config_enter(spa, SCL_ALL, mmp_tag, RW_WRITER);
+
+	/* Sync the uberblock to all writeable leaves */
+	zio = zio_root(spa, NULL, NULL, flags);
+	mmp_claim_uberblock_sync(zio, &good_writes, &req_writes, ub, vd, flags);
+	(void) zio_wait(zio);
+
+	/* Flush the new uberblocks so they're immediately visible */
+	zio = zio_root(spa, NULL, NULL, flags);
+	zio_flush(zio, vd);
+	(void) zio_wait(zio);
+
+	spa_config_exit(spa, SCL_ALL, mmp_tag);
+
+	zfs_dbgmsg("mmp: claiming uberblock, spa=%s txg=%llu seq=%llu "
+	    "req_writes=%llu good_writes=%llu", spa_load_name(spa),
+	    (u_longlong_t)ub->ub_txg, (u_longlong_t)MMP_SEQ(ub),
+	    (u_longlong_t)req_writes, (u_longlong_t)good_writes);
+
+	/*
+	 * To guarantee visibility from a remote host we require a minimum
+	 * number of good writes. For raidz/draid vdevs parity+1 writes, for
+	 * mirrors 2 writes, and for singletons 1 write.
+	 */
+	if (req_writes == 0 || good_writes < req_writes)
+		return (SET_ERROR(EIO));
+
+	return (0);
+}
+
 static __attribute__((noreturn)) void
 mmp_thread(void *arg)
 {
@@ -616,11 +727,11 @@ mmp_thread(void *arg)
 			next_time = gethrtime() + mmp_interval / leaves;

 		if (mmp_fail_ns != last_mmp_fail_ns) {
-			zfs_dbgmsg("MMP interval change pool '%s' "
-			    "gethrtime %llu last_mmp_interval %llu "
-			    "mmp_interval %llu last_mmp_fail_intervals %u "
-			    "mmp_fail_intervals %u mmp_fail_ns %llu "
-			    "skip_wait %d leaves %d next_time %llu",
+			zfs_dbgmsg("mmp: interval change, spa=%s "
+			    "gethrtime=%llu last_mmp_interval=%llu "
+			    "mmp_interval=%llu last_mmp_fail_intervals=%u "
+			    "mmp_fail_intervals=%u mmp_fail_ns=%llu "
+			    "skip_wait=%d leaves=%d next_time=%llu",
 			    spa_name(spa), (u_longlong_t)gethrtime(),
 			    (u_longlong_t)last_mmp_interval,
 			    (u_longlong_t)mmp_interval, last_mmp_fail_intervals,
@@ -635,9 +746,9 @@ mmp_thread(void *arg)
 		 */
 		if ((!last_spa_multihost && multihost) ||
 		    (last_spa_suspended && !suspended)) {
-			zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu "
-			    "last_spa_multihost %u multihost %u "
-			    "last_spa_suspended %u suspended %u",
+			zfs_dbgmsg("mmp: state change spa=%s: gethrtime=%llu "
+			    "last_spa_multihost=%u multihost=%u "
+			    "last_spa_suspended=%u suspended=%u",
 			    spa_name(spa), (u_longlong_t)gethrtime(),
 			    last_spa_multihost, multihost, last_spa_suspended,
 			    suspended);
@@ -663,9 +774,10 @@ mmp_thread(void *arg)
 		 */
 		if (multihost && !suspended && mmp_fail_intervals &&
 		    (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
-			zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
-			    "mmp_last_write %llu mmp_interval %llu "
-			    "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
+			zfs_dbgmsg("mmp: suspending pool, spa=%s "
+			    "gethrtime=%llu mmp_last_write=%llu "
+			    "mmp_interval=%llu mmp_fail_intervals=%llu "
+			    "mmp_fail_ns=%llu txg=%llu",
 			    spa_name(spa), (u_longlong_t)gethrtime(),
 			    (u_longlong_t)mmp->mmp_last_write,
 			    (u_longlong_t)mmp_interval,