vdev probe to slow disk can stall mmp write checker

Simplify vdev probes in the zio_vdev_io_done context to
avoid holding the spa config lock for a long duration.

Also allow zpool clear when there is no evidence that another
host is using the pool.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #15839
This commit is contained in:
Don Brady
2024-04-29 15:35:53 -06:00
committed by GitHub
parent b28461b7c6
commit c3f2f1aa2d
16 changed files with 242 additions and 52 deletions
+3 -2
View File
@@ -664,12 +664,13 @@ mmp_thread(void *arg)
(gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
"mmp_last_write %llu mmp_interval %llu "
"mmp_fail_intervals %llu mmp_fail_ns %llu",
"mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)mmp->mmp_last_write,
(u_longlong_t)mmp_interval,
(u_longlong_t)mmp_fail_intervals,
(u_longlong_t)mmp_fail_ns);
(u_longlong_t)mmp_fail_ns,
(u_longlong_t)spa->spa_uberblock.ub_txg);
cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
"succeeded in over %llu ms; suspending pool. "
"Hrtime %llu",
+84 -18
View File
@@ -3594,11 +3594,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
}
/*
* Perform the import activity check. If the user canceled the import or
* we detected activity then fail.
* Remote host activity check.
*
* error results:
* 0 - no activity detected
* EREMOTEIO - remote activity detected
* EINTR - user canceled the operation
*/
static int
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
boolean_t importing)
{
uint64_t txg = ub->ub_txg;
uint64_t timestamp = ub->ub_timestamp;
@@ -3643,19 +3648,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
import_expire = gethrtime() + import_delay;
spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
"%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
if (importing) {
spa_import_progress_set_notes(spa, "Checking MMP activity, "
"waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
}
int interations = 0;
int iterations = 0;
while ((now = gethrtime()) < import_expire) {
if (interations++ % 30 == 0) {
if (importing && iterations++ % 30 == 0) {
spa_import_progress_set_notes(spa, "Checking MMP "
"activity, %llu ms remaining",
(u_longlong_t)NSEC2MSEC(import_expire - now));
}
(void) spa_import_progress_set_mmp_check(spa_guid(spa),
NSEC2SEC(import_expire - gethrtime()));
if (importing) {
(void) spa_import_progress_set_mmp_check(spa_guid(spa),
NSEC2SEC(import_expire - gethrtime()));
}
vdev_uberblock_load(rvd, ub, &mmp_label);
@@ -3737,6 +3746,61 @@ out:
return (error);
}
/*
 * Decide whether a multihost pool that was suspended after failing its
 * MMP write checks may have been touched by another host.  Called from
 * zfs_ioc_clear.
 *
 * Returns B_FALSE only when every check below passes; any doubt about
 * the on-disk state is reported as B_TRUE.
 */
boolean_t
spa_mmp_remote_host_activity(spa_t *spa)
{
	ASSERT(spa_multihost(spa) && spa_suspended(spa));

	uberblock_t *cached_ub = &spa->spa_uberblock;
	uberblock_t disk_ub;
	nvlist_t *disk_label;

	/* Fetch the best uberblock currently on disk, and its label. */
	vdev_uberblock_load(spa->spa_root_vdev, &disk_ub, &disk_label);

	/* Without a label there is nothing to compare against. */
	if (disk_label == NULL)
		return (B_TRUE);

	/*
	 * If the label records a hostid, it has to be ours.  The label is
	 * not needed past this point, so release it before acting on the
	 * result.
	 */
	boolean_t foreign_hostid =
	    nvlist_exists(disk_label, ZPOOL_CONFIG_HOSTID) &&
	    spa_get_hostid(spa) !=
	    fnvlist_lookup_uint64(disk_label, ZPOOL_CONFIG_HOSTID);
	nvlist_free(disk_label);
	if (foreign_hostid)
		return (B_TRUE);

	/*
	 * The on-disk uberblock must pass MMP_VALID and MMP_FAIL_INT_VALID
	 * and carry a non-zero MMP_FAIL_INT.
	 */
	if (!(MMP_VALID(&disk_ub) &&
	    MMP_FAIL_INT_VALID(&disk_ub) &&
	    MMP_FAIL_INT(&disk_ub) != 0)) {
		return (B_TRUE);
	}

	/*
	 * The txg and timestamp found on disk must be exactly the ones
	 * held in our in-core uberblock.
	 */
	boolean_t ub_unchanged =
	    disk_ub.ub_txg == cached_ub->ub_txg &&
	    disk_ub.ub_timestamp == cached_ub->ub_timestamp;
	if (!ub_unchanged) {
		zfs_dbgmsg("txg mismatch detected during pool clear "
		    "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
		    (u_longlong_t)cached_ub->ub_txg,
		    (u_longlong_t)disk_ub.ub_txg,
		    (u_longlong_t)cached_ub->ub_timestamp,
		    (u_longlong_t)disk_ub.ub_timestamp);
		return (B_TRUE);
	}

	/*
	 * Finally run the activity check (with importing == B_FALSE) to
	 * look for a remote writer; any non-zero result counts as activity.
	 */
	int error = spa_activity_check(spa, cached_ub, spa->spa_config,
	    B_FALSE);
	return (error != 0);
}
static int
spa_verify_host(spa_t *spa, nvlist_t *mos_config)
{
@@ -4063,7 +4127,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
}
int error = spa_activity_check(spa, ub, spa->spa_config);
int error =
spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
if (error) {
nvlist_free(label);
return (error);
@@ -8771,15 +8836,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
}
static void
spa_async_probe(spa_t *spa, vdev_t *vd)
spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
{
if (vd->vdev_probe_wanted) {
vd->vdev_probe_wanted = B_FALSE;
vdev_reopen(vd); /* vdev_open() does the actual probe */
if (vd->vdev_fault_wanted) {
vd->vdev_fault_wanted = B_FALSE;
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
VDEV_AUX_ERR_EXCEEDED);
}
for (int c = 0; c < vd->vdev_children; c++)
spa_async_probe(spa, vd->vdev_child[c]);
spa_async_fault_vdev(spa, vd->vdev_child[c]);
}
static void
@@ -8867,11 +8933,11 @@ spa_async_thread(void *arg)
}
/*
* See if any devices need to be probed.
* See if any devices need to be marked faulted.
*/
if (tasks & SPA_ASYNC_PROBE) {
if (tasks & SPA_ASYNC_FAULT_VDEV) {
spa_vdev_state_enter(spa, SCL_NONE);
spa_async_probe(spa, spa->spa_root_vdev);
spa_async_fault_vdev(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
+9
View File
@@ -550,6 +550,15 @@ txg_sync_thread(void *arg)
timer = (delta > timeout ? 0 : timeout - delta);
}
/*
* While the pool is suspended, nothing should be changing. For
* MMP in particular, we don't want to bump anything, since that
* would make it harder to detect whether another host is changing
* the pool when resuming after an MMP suspend.
*/
if (spa_suspended(spa))
continue;
/*
* Wait until the quiesce thread hands off a txg to us,
* prompting it to do so if necessary.
+13 -9
View File
@@ -1664,6 +1664,7 @@ vdev_metaslab_fini(vdev_t *vd)
/*
 * Per-probe state allocated by vdev_probe() and passed to
 * vdev_probe_done() as the private argument of the probe's null zio.
 */
typedef struct vdev_probe_stats {
/* NOTE(review): presumably records that the probe read succeeded -- confirm */
boolean_t vps_readable;
/* NOTE(review): presumably records that the probe write succeeded -- confirm */
boolean_t vps_writeable;
/*
 * Set by vdev_probe() to (zio != NULL), i.e. the probe was started on
 * behalf of an existing zio.  When set, the failure path of
 * vdev_probe_done() marks vdev_fault_wanted and issues a
 * SPA_ASYNC_FAULT_VDEV request.
 */
boolean_t vps_zio_done_probe;
/* ZIO_FLAG_* bits chosen in vdev_probe() for this probe's zio */
int vps_flags;
} vdev_probe_stats_t;
@@ -1709,6 +1710,17 @@ vdev_probe_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO);
/*
* If this probe was initiated from zio pipeline, then
* change the state in a spa_async_request. Probes that
* were initiated from a vdev_open can change the state
* as part of the open call.
*/
if (vps->vps_zio_done_probe) {
vd->vdev_fault_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
}
}
mutex_enter(&vd->vdev_probe_lock);
@@ -1759,6 +1771,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
vps->vps_zio_done_probe = (zio != NULL);
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
@@ -1785,15 +1798,6 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
/*
* We can't change the vdev state in this context, so we
* kick off an async task to do it on our behalf.
*/
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
}
}
if (zio != NULL)
+3 -1
View File
@@ -2027,6 +2027,7 @@ retry:
/*
* If this isn't a resync due to I/O errors,
* and nothing changed in this transaction group,
* and multihost protection isn't enabled,
* and the vdev configuration hasn't changed,
* then there's nothing to do.
*/
@@ -2034,7 +2035,8 @@ retry:
boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
txg, spa->spa_mmp.mmp_delay);
if (!changed && list_is_empty(&spa->spa_config_dirty_list))
if (!changed && list_is_empty(&spa->spa_config_dirty_list) &&
!spa_multihost(spa))
return (0);
}
+6 -3
View File
@@ -5823,10 +5823,13 @@ zfs_ioc_clear(zfs_cmd_t *zc)
/*
* If multihost is enabled, resuming I/O is unsafe as another
* host may have imported the pool.
* host may have imported the pool. Check for remote activity.
*/
if (spa_multihost(spa) && spa_suspended(spa))
return (SET_ERROR(EINVAL));
if (spa_multihost(spa) && spa_suspended(spa) &&
spa_mmp_remote_host_activity(spa)) {
spa_close(spa, FTAG);
return (SET_ERROR(EREMOTEIO));
}
spa_vdev_state_enter(spa, SCL_NONE);
+4 -2
View File
@@ -2532,8 +2532,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and the failure mode property for this pool "
"is set to panic.", spa_name(spa));
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
"failure and has been suspended.\n", spa_name(spa));
if (reason != ZIO_SUSPEND_MMP) {
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
"I/O failure and has been suspended.\n", spa_name(spa));
}
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
NULL, NULL, 0);
+4 -2
View File
@@ -607,9 +607,11 @@ zio_handle_io_delay(zio_t *zio)
if (vd->vdev_guid != handler->zi_record.zi_guid)
continue;
/* also match on I/O type (e.g., -T read) */
if (handler->zi_record.zi_iotype != ZIO_TYPES &&
handler->zi_record.zi_iotype != zio->io_type)
continue;
handler->zi_record.zi_iotype != zio->io_type) {
continue;
}
/*
* Defensive; should never happen as the array allocation