Update mmp_delay on sync or skipped, failed write

When an MMP write is skipped or fails, and the time since
mts->mmp_last_write is already greater than mts->mmp_delay, increase
mts->mmp_delay.  The original code only updated mts->mmp_delay when a
write succeeded, but this results in the write(s) after delays and
failed write(s) reporting an ub_mmp_delay which is too low.

Update mmp_last_write and mmp_delay if a txg sync was successful.  At
least one uberblock was written, thus extending the time we can be sure
the pool will not be imported by another host.

Do not allow mmp_delay to go below (MSEC2NSEC(zfs_multihost_interval) /
vdev_count_leaves()) so that a period of frequent successful MMP writes,
e.g. due to frequent txg syncs, does not result in an import activity
check so short it is not reliable based on mmp thread writes alone.

Remove unnecessary local variable, start.  We do not use the start time
of the loop iteration.

Add a debug message in spa_activity_check() to allow verification of the
import_delay value and to prove the activity check occurred.

Alter the tests that import pools and attempt to detect an activity
check.  Calculate the expected duration of spa_activity_check() based on
module parameters at the time the import is performed, rather than a
fixed time set in mmp.cfg.  The fixed time may be wrong.  Also, use the
default zfs_multihost_interval value so the activity check is longer and
easier to recognize.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #7330
This commit is contained in:
Olaf Faaland 2018-04-04 16:38:44 -07:00 committed by Brian Behlendorf
parent 21a4f5cc86
commit 533ea0415b
6 changed files with 93 additions and 47 deletions

View File

@ -280,6 +280,59 @@ mmp_random_leaf(vdev_t *in_vd, vdev_t **out_vd)
return (error_mask);
}
/*
 * MMP writes are issued on a fixed schedule, but may complete at variable,
 * much longer, intervals.  The mmp_delay captures long periods between
 * successful writes for any reason, including disk latency, scheduling delays,
 * etc.
 *
 * The mmp_delay is usually calculated as a decaying average, but if the latest
 * delay is higher we do not average it, so that we do not hide sudden spikes
 * which the importing host must wait for.
 *
 * If writes are occurring frequently, such as due to a high rate of txg syncs,
 * the mmp_delay could become very small.  Since those short delays depend on
 * activity we cannot count on, we never allow mmp_delay to get lower than the
 * rate expected if only mmp_thread writes occur.
 *
 * If an mmp write was skipped or fails, and we have already waited longer than
 * mmp_delay, we need to update it so the next write reflects the longer delay.
 *
 * Do not set mmp_delay if the multihost property is not on, so as not to
 * trigger an activity check on import.
 */
static void
mmp_delay_update(spa_t *spa, boolean_t write_completed)
{
	mmp_thread_t *mts = &spa->spa_mmp;
	/* Time elapsed since the last successful MMP write. */
	hrtime_t delay = gethrtime() - mts->mmp_last_write;

	ASSERT(MUTEX_HELD(&mts->mmp_io_lock));

	/* Multihost off: keep mmp_delay zero so import skips the check. */
	if (spa_multihost(spa) == B_FALSE) {
		mts->mmp_delay = 0;
		return;
	}

	/*
	 * The observed gap already exceeds the recorded delay; take it
	 * directly (no averaging) so sudden spikes are not hidden from an
	 * importing host.  This also covers skipped/failed writes.
	 */
	if (delay > mts->mmp_delay)
		mts->mmp_delay = delay;

	/* For a skipped or failed write, there is nothing more to record. */
	if (write_completed == B_FALSE)
		return;

	mts->mmp_last_write = gethrtime();

	/*
	 * Strictly less than, in case delay was changed above.  Decay toward
	 * the observed delay with a 127/128 weighted average, but never drop
	 * below the per-leaf interval expected from mmp_thread writes alone,
	 * so a burst of frequent txg syncs cannot shrink the activity-check
	 * window below what is reliable.
	 */
	if (delay < mts->mmp_delay) {
		hrtime_t min_delay = MSEC2NSEC(zfs_multihost_interval) /
		    vdev_count_leaves(spa);
		mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
		    min_delay);
	}
}
static void
mmp_write_done(zio_t *zio)
{
@ -291,38 +344,8 @@ mmp_write_done(zio_t *zio)
uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;
if (zio->io_error)
goto unlock;
mmp_delay_update(spa, (zio->io_error == 0));
/*
* Mmp writes are queued on a fixed schedule, but under many
* circumstances, such as a busy device or faulty hardware,
* the writes will complete at variable, much longer,
* intervals. In these cases, another node checking for
* activity must wait longer to account for these delays.
*
* The mmp_delay is calculated as a decaying average of the interval
* between completed mmp writes. This is used to predict how long
* the import must wait to detect activity in the pool, before
* concluding it is not in use.
*
* Do not set mmp_delay if the multihost property is not on,
* so as not to trigger an activity check on import.
*/
if (spa_multihost(spa)) {
hrtime_t delay = gethrtime() - mts->mmp_last_write;
if (delay > mts->mmp_delay)
mts->mmp_delay = delay;
else
mts->mmp_delay = (delay + mts->mmp_delay * 127) /
128;
} else {
mts->mmp_delay = 0;
}
mts->mmp_last_write = gethrtime();
unlock:
vd->vdev_mmp_pending = 0;
vd->vdev_mmp_kstat_id = 0;
@ -348,6 +371,7 @@ mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
mutex_enter(&mmp->mmp_io_lock);
mmp->mmp_ub = *ub;
mmp->mmp_ub.ub_timestamp = gethrestime_sec();
mmp_delay_update(spa, B_TRUE);
mutex_exit(&mmp->mmp_io_lock);
}
@ -386,6 +410,7 @@ mmp_write_uberblock(spa_t *spa)
*/
if (error) {
mmp_delay_update(spa, B_FALSE);
if (mmp->mmp_skip_error == error) {
spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
} else {
@ -463,15 +488,14 @@ mmp_thread(void *arg)
MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
boolean_t suspended = spa_suspended(spa);
boolean_t multihost = spa_multihost(spa);
hrtime_t start, next_time;
hrtime_t next_time;
start = gethrtime();
if (multihost) {
next_time = start + mmp_interval /
if (multihost)
next_time = gethrtime() + mmp_interval /
MAX(vdev_count_leaves(spa), 1);
} else {
next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL);
}
else
next_time = gethrtime() +
MSEC2NSEC(MMP_DEFAULT_INTERVAL);
/*
* MMP off => on, or suspended => !suspended:
@ -515,11 +539,11 @@ mmp_thread(void *arg)
* mmp_interval * mmp_fail_intervals nanoseconds.
*/
if (!suspended && mmp_fail_intervals && multihost &&
(start - mmp->mmp_last_write) > max_fail_ns) {
(gethrtime() - mmp->mmp_last_write) > max_fail_ns) {
cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
"succeeded in over %llus; suspending pool",
spa_name(spa),
NSEC2SEC(start - mmp->mmp_last_write));
NSEC2SEC(gethrtime() - mmp->mmp_last_write));
zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
}

View File

@ -2462,6 +2462,10 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
import_delay = MAX(import_delay, import_intervals *
MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL)));
zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu import_intervals=%u "
"leaves=%u", import_delay, ub->ub_mmp_delay, import_intervals,
vdev_count_leaves(spa));
/* Add a small random factor in case of simultaneous imports (0-25%) */
import_expire = gethrtime() + import_delay +
(import_delay * spa_get_random(250) / 1000);

View File

@ -1495,7 +1495,6 @@ retry:
if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
goto retry;
if (spa_multihost(spa))
mmp_update_uberblock(spa, ub);

View File

@ -38,5 +38,3 @@ export MMP_HISTORY_OFF=0
export MMP_INTERVAL_HOUR=$((60*60*1000))
export MMP_INTERVAL_DEFAULT=1000
export MMP_INTERVAL_MIN=100
export ZPOOL_IMPORT_DURATION=9

View File

@ -163,17 +163,32 @@ function mmp_pool_set_hostid # pool hostid
return 0
}
# Return the number of seconds the activity check portion of the import process
# will take.  Does not include the time to find devices and assemble the
# preliminary pool configuration passed into the kernel.
function seconds_mmp_waits_for_activity
{
	typeset intervals=$(get_tunable zfs_multihost_import_intervals)
	typeset interval_ms=$(get_tunable zfs_multihost_interval)

	# interval is in milliseconds; convert the total wait to whole seconds.
	echo $((interval_ms * intervals / 1000))
}
function import_no_activity_check # pool opts
{
typeset pool=$1
typeset opts=$2
typeset max_duration=$(seconds_mmp_waits_for_activity)
SECONDS=0
zpool import $opts $pool
typeset rc=$?
if [[ $SECONDS -gt $ZPOOL_IMPORT_DURATION ]]; then
log_fail "unexpected activity check (${SECONDS}s)"
if [[ $SECONDS -gt $max_duration ]]; then
log_fail "unexpected activity check (${SECONDS}s gt \
$max_duration)"
fi
return $rc
@ -184,12 +199,15 @@ function import_activity_check # pool opts
typeset pool=$1
typeset opts=$2
typeset min_duration=$(seconds_mmp_waits_for_activity)
SECONDS=0
zpool import $opts $pool
typeset rc=$?
if [[ $SECONDS -le $ZPOOL_IMPORT_DURATION ]]; then
log_fail "expected activity check (${SECONDS}s)"
if [[ $SECONDS -le $min_duration ]]; then
log_fail "expected activity check (${SECONDS}s le \
$min_duration)"
fi
return $rc

View File

@ -103,6 +103,9 @@ MMP_IMPORTED_MSG="pool was previously in use from another system."
log_must try_pool_import $MMP_POOL "-d $MMP_DIR" "$MMP_IMPORTED_MSG"
# 7. Verify 'zpool import -f $MMP_POOL' can now import the pool.
# The default interval results in a minimum activity test of 10s, which
# makes detection of the activity test reliable.
log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_DEFAULT
log_must import_activity_check $MMP_POOL "-f -d $MMP_DIR"
# 8 Verify pool may be exported/imported without -f argument.