mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
Extend deadman logic
The intent of this patch is extend the existing deadman code
such that it's flexible enough to be used by both ztest and
on production systems. The proposed changes include:
* Added a new `zfs_deadman_failmode` module option which is
used to dynamically control the behavior of the deadman. It's
loosely modeled after, but independant from, the pool failmode
property. It can be set to wait, continue, or panic.
* wait - Wait for the "hung" I/O (default)
* continue - Attempt to recover from a "hung" I/O
* panic - Panic the system
* Added a new `zfs_deadman_ziotime_ms` module option which is
analogous to `zfs_deadman_synctime_ms` except instead of
applying to a pool TXG sync it applies to zio_wait(). A
default value of 300s is used to define a "hung" zio.
* The ztest deadman thread has been re-enabled by default,
aligned with the upstream OpenZFS code, and then extended
to terminate the process when it takes significantly longer
to complete than expected.
* The -G option was added to ztest to print the internal debug
log when a fatal error is encountered. This same option was
previously added to zdb in commit fa603f82. Update zloop.sh
to unconditionally pass -G to obtain additional debugging.
* The FM_EREPORT_ZFS_DELAY event which was previously posted
when the deadman detect a "hung" pool has been replaced by
a new dedicated FM_EREPORT_ZFS_DEADMAN event.
* The proposed recovery logic attempts to restart a "hung"
zio by calling zio_interrupt() on any outstanding leaf zios.
We may want to further restrict this to zios in either the
ZIO_STAGE_VDEV_IO_START or ZIO_STAGE_VDEV_IO_DONE stages.
Calling zio_interrupt() is expected to only be useful for
cases when an IO has been submitted to the physical device
but for some reasonable the completion callback hasn't been
called by the lower layers. This shouldn't be possible but
has been observed and may be caused by kernel/driver bugs.
* The 'zfs_deadman_synctime_ms' default value was reduced from
1000s to 600s.
* Depending on how ztest fails there may be no cache file to
move. This should not be considered fatal, collect the logs
which are available and carry on.
* Add deadman test cases for spa_deadman() and zio_wait().
* Increase default zfs_deadman_checktime_ms to 60s.
Reviewed-by: Tim Chase <tim@chase2k.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #6999
This commit is contained in:
+87
-7
@@ -292,24 +292,41 @@ int zfs_free_leak_on_eio = B_FALSE;
|
||||
/*
|
||||
* Expiration time in milliseconds. This value has two meanings. First it is
|
||||
* used to determine when the spa_deadman() logic should fire. By default the
|
||||
* spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
|
||||
* spa_deadman() will fire if spa_sync() has not completed in 600 seconds.
|
||||
* Secondly, the value determines if an I/O is considered "hung". Any I/O that
|
||||
* has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
|
||||
* in a system panic.
|
||||
* in one of three behaviors controlled by zfs_deadman_failmode.
|
||||
*/
|
||||
unsigned long zfs_deadman_synctime_ms = 1000000ULL;
|
||||
unsigned long zfs_deadman_synctime_ms = 600000ULL;
|
||||
|
||||
/*
|
||||
* This value controls the maximum amount of time zio_wait() will block for an
|
||||
* outstanding IO. By default this is 300 seconds at which point the "hung"
|
||||
* behavior will be applied as described for zfs_deadman_synctime_ms.
|
||||
*/
|
||||
unsigned long zfs_deadman_ziotime_ms = 300000ULL;
|
||||
|
||||
/*
|
||||
* Check time in milliseconds. This defines the frequency at which we check
|
||||
* for hung I/O.
|
||||
*/
|
||||
unsigned long zfs_deadman_checktime_ms = 5000ULL;
|
||||
unsigned long zfs_deadman_checktime_ms = 60000ULL;
|
||||
|
||||
/*
|
||||
* By default the deadman is enabled.
|
||||
*/
|
||||
int zfs_deadman_enabled = 1;
|
||||
|
||||
/*
|
||||
* Controls the behavior of the deadman when it detects a "hung" I/O.
|
||||
* Valid values are zfs_deadman_failmode=<wait|continue|panic>.
|
||||
*
|
||||
* wait - Wait for the "hung" I/O (default)
|
||||
* continue - Attempt to recover from a "hung" I/O
|
||||
* panic - Panic the system
|
||||
*/
|
||||
char *zfs_deadman_failmode = "wait";
|
||||
|
||||
/*
|
||||
* The worst case is single-sector max-parity RAID-Z blocks, in which
|
||||
* case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
|
||||
@@ -536,7 +553,7 @@ spa_deadman(void *arg)
|
||||
(gethrtime() - spa->spa_sync_starttime) / NANOSEC,
|
||||
++spa->spa_deadman_calls);
|
||||
if (zfs_deadman_enabled)
|
||||
vdev_deadman(spa->spa_root_vdev);
|
||||
vdev_deadman(spa->spa_root_vdev, FTAG);
|
||||
|
||||
spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
|
||||
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
|
||||
@@ -590,6 +607,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
|
||||
spa->spa_proc_state = SPA_PROC_NONE;
|
||||
|
||||
spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
|
||||
spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
|
||||
spa_set_deadman_failmode(spa, zfs_deadman_failmode);
|
||||
|
||||
refcount_create(&spa->spa_refcount);
|
||||
spa_config_lock_init(spa);
|
||||
@@ -1681,7 +1700,7 @@ spa_update_dspace(spa_t *spa)
|
||||
* Return the failure mode that has been set to this pool. The default
|
||||
* behavior will be to block all I/Os when a complete failure occurs.
|
||||
*/
|
||||
uint8_t
|
||||
uint64_t
|
||||
spa_get_failmode(spa_t *spa)
|
||||
{
|
||||
return (spa->spa_failmode);
|
||||
@@ -1770,6 +1789,31 @@ spa_deadman_synctime(spa_t *spa)
|
||||
return (spa->spa_deadman_synctime);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
spa_deadman_ziotime(spa_t *spa)
|
||||
{
|
||||
return (spa->spa_deadman_ziotime);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
spa_get_deadman_failmode(spa_t *spa)
|
||||
{
|
||||
return (spa->spa_deadman_failmode);
|
||||
}
|
||||
|
||||
void
|
||||
spa_set_deadman_failmode(spa_t *spa, const char *failmode)
|
||||
{
|
||||
if (strcmp(failmode, "wait") == 0)
|
||||
spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
|
||||
else if (strcmp(failmode, "continue") == 0)
|
||||
spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE;
|
||||
else if (strcmp(failmode, "panic") == 0)
|
||||
spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
|
||||
else
|
||||
spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
|
||||
}
|
||||
|
||||
uint64_t
|
||||
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
|
||||
{
|
||||
@@ -2106,6 +2150,33 @@ spa_get_hostid(void)
|
||||
}
|
||||
|
||||
#if defined(_KERNEL) && defined(HAVE_SPL)
|
||||
|
||||
#include <linux/mod_compat.h>
|
||||
|
||||
static int
|
||||
param_set_deadman_failmode(const char *val, zfs_kernel_param_t *kp)
|
||||
{
|
||||
spa_t *spa = NULL;
|
||||
char *p;
|
||||
|
||||
if (val == NULL)
|
||||
return (SET_ERROR(-EINVAL));
|
||||
|
||||
if ((p = strchr(val, '\n')) != NULL)
|
||||
*p = '\0';
|
||||
|
||||
if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 &&
|
||||
strcmp(val, "panic"))
|
||||
return (SET_ERROR(-EINVAL));
|
||||
|
||||
mutex_enter(&spa_namespace_lock);
|
||||
while ((spa = spa_next(spa)) != NULL)
|
||||
spa_set_deadman_failmode(spa, val);
|
||||
mutex_exit(&spa_namespace_lock);
|
||||
|
||||
return (param_set_charp(val, kp));
|
||||
}
|
||||
|
||||
/* Namespace manipulation */
|
||||
EXPORT_SYMBOL(spa_lookup);
|
||||
EXPORT_SYMBOL(spa_add);
|
||||
@@ -2196,7 +2267,12 @@ MODULE_PARM_DESC(zfs_free_leak_on_eio,
|
||||
"Set to ignore IO errors during free and permanently leak the space");
|
||||
|
||||
module_param(zfs_deadman_synctime_ms, ulong, 0644);
|
||||
MODULE_PARM_DESC(zfs_deadman_synctime_ms, "Expiration time in milliseconds");
|
||||
MODULE_PARM_DESC(zfs_deadman_synctime_ms,
|
||||
"Pool sync expiration time in milliseconds");
|
||||
|
||||
module_param(zfs_deadman_ziotime_ms, ulong, 0644);
|
||||
MODULE_PARM_DESC(zfs_deadman_ziotime_ms,
|
||||
"IO expiration time in milliseconds");
|
||||
|
||||
module_param(zfs_deadman_checktime_ms, ulong, 0644);
|
||||
MODULE_PARM_DESC(zfs_deadman_checktime_ms,
|
||||
@@ -2205,6 +2281,10 @@ MODULE_PARM_DESC(zfs_deadman_checktime_ms,
|
||||
module_param(zfs_deadman_enabled, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer");
|
||||
|
||||
module_param_call(zfs_deadman_failmode, param_set_deadman_failmode,
|
||||
param_get_charp, &zfs_deadman_failmode, 0644);
|
||||
MODULE_PARM_DESC(zfs_deadman_failmode, "Failmode for deadman timer");
|
||||
|
||||
module_param(spa_asize_inflation, int, 0644);
|
||||
MODULE_PARM_DESC(spa_asize_inflation,
|
||||
"SPA size estimate multiplication factor");
|
||||
|
||||
+8
-11
@@ -3707,12 +3707,12 @@ vdev_split(vdev_t *vd)
|
||||
}
|
||||
|
||||
void
|
||||
vdev_deadman(vdev_t *vd)
|
||||
vdev_deadman(vdev_t *vd, char *tag)
|
||||
{
|
||||
for (int c = 0; c < vd->vdev_children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
|
||||
vdev_deadman(cvd);
|
||||
vdev_deadman(cvd, tag);
|
||||
}
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf) {
|
||||
@@ -3724,21 +3724,18 @@ vdev_deadman(vdev_t *vd)
|
||||
zio_t *fio;
|
||||
uint64_t delta;
|
||||
|
||||
zfs_dbgmsg("slow vdev: %s has %d active IOs",
|
||||
vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
|
||||
|
||||
/*
|
||||
* Look at the head of all the pending queues,
|
||||
* if any I/O has been outstanding for longer than
|
||||
* the spa_deadman_synctime we log a zevent.
|
||||
* the spa_deadman_synctime invoke the deadman logic.
|
||||
*/
|
||||
fio = avl_first(&vq->vq_active_tree);
|
||||
delta = gethrtime() - fio->io_timestamp;
|
||||
if (delta > spa_deadman_synctime(spa)) {
|
||||
zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
|
||||
"delta %lluns, last io %lluns",
|
||||
fio->io_timestamp, delta,
|
||||
vq->vq_io_complete_ts);
|
||||
zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
|
||||
spa, vd, &fio->io_bookmark, fio, 0, 0);
|
||||
}
|
||||
if (delta > spa_deadman_synctime(spa))
|
||||
zio_deadman(fio, tag);
|
||||
}
|
||||
mutex_exit(&vq->vq_lock);
|
||||
}
|
||||
|
||||
+88
-2
@@ -1748,6 +1748,80 @@ zio_delay_interrupt(zio_t *zio)
|
||||
zio_interrupt(zio);
|
||||
}
|
||||
|
||||
static void
|
||||
zio_deadman_impl(zio_t *pio)
|
||||
{
|
||||
zio_t *cio, *cio_next;
|
||||
zio_link_t *zl = NULL;
|
||||
vdev_t *vd = pio->io_vd;
|
||||
|
||||
if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
|
||||
vdev_queue_t *vq = &vd->vdev_queue;
|
||||
zbookmark_phys_t *zb = &pio->io_bookmark;
|
||||
uint64_t delta = gethrtime() - pio->io_timestamp;
|
||||
uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
|
||||
|
||||
zfs_dbgmsg("slow zio: zio=%p timestamp=%llu "
|
||||
"delta=%llu queued=%llu io=%llu "
|
||||
"path=%s last=%llu "
|
||||
"type=%d priority=%d flags=0x%x "
|
||||
"stage=0x%x pipeline=0x%x pipeline-trace=0x%x "
|
||||
"objset=%llu object=%llu level=%llu blkid=%llu "
|
||||
"offset=%llu size=%llu error=%d",
|
||||
pio, pio->io_timestamp,
|
||||
delta, pio->io_delta, pio->io_delay,
|
||||
vd->vdev_path, vq->vq_io_complete_ts,
|
||||
pio->io_type, pio->io_priority, pio->io_flags,
|
||||
pio->io_state, pio->io_pipeline, pio->io_pipeline_trace,
|
||||
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
|
||||
pio->io_offset, pio->io_size, pio->io_error);
|
||||
zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
|
||||
pio->io_spa, vd, zb, pio, 0, 0);
|
||||
|
||||
if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
|
||||
taskq_empty_ent(&pio->io_tqent)) {
|
||||
zio_interrupt(pio);
|
||||
}
|
||||
}
|
||||
|
||||
mutex_enter(&pio->io_lock);
|
||||
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
|
||||
cio_next = zio_walk_children(pio, &zl);
|
||||
zio_deadman_impl(cio);
|
||||
}
|
||||
mutex_exit(&pio->io_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Log the critical information describing this zio and all of its children
|
||||
* using the zfs_dbgmsg() interface then post deadman event for the ZED.
|
||||
*/
|
||||
void
|
||||
zio_deadman(zio_t *pio, char *tag)
|
||||
{
|
||||
spa_t *spa = pio->io_spa;
|
||||
char *name = spa_name(spa);
|
||||
|
||||
if (!zfs_deadman_enabled || spa_suspended(spa))
|
||||
return;
|
||||
|
||||
zio_deadman_impl(pio);
|
||||
|
||||
switch (spa_get_deadman_failmode(spa)) {
|
||||
case ZIO_FAILURE_MODE_WAIT:
|
||||
zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name);
|
||||
break;
|
||||
|
||||
case ZIO_FAILURE_MODE_CONTINUE:
|
||||
zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name);
|
||||
break;
|
||||
|
||||
case ZIO_FAILURE_MODE_PANIC:
|
||||
fm_panic("%s determined I/O to pool '%s' is hung.", tag, name);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Execute the I/O pipeline until one of the following occurs:
|
||||
* (1) the I/O completes; (2) the pipeline stalls waiting for
|
||||
@@ -1877,6 +1951,7 @@ __zio_execute(zio_t *zio)
|
||||
int
|
||||
zio_wait(zio_t *zio)
|
||||
{
|
||||
long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
|
||||
int error;
|
||||
|
||||
ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
|
||||
@@ -1889,8 +1964,19 @@ zio_wait(zio_t *zio)
|
||||
__zio_execute(zio);
|
||||
|
||||
mutex_enter(&zio->io_lock);
|
||||
while (zio->io_executor != NULL)
|
||||
cv_wait_io(&zio->io_cv, &zio->io_lock);
|
||||
while (zio->io_executor != NULL) {
|
||||
error = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
|
||||
ddi_get_lbolt() + timeout);
|
||||
|
||||
if (zfs_deadman_enabled && error == -1 &&
|
||||
gethrtime() - zio->io_queued_timestamp >
|
||||
spa_deadman_ziotime(zio->io_spa)) {
|
||||
mutex_exit(&zio->io_lock);
|
||||
timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms);
|
||||
zio_deadman(zio, FTAG);
|
||||
mutex_enter(&zio->io_lock);
|
||||
}
|
||||
}
|
||||
mutex_exit(&zio->io_lock);
|
||||
|
||||
error = zio->io_error;
|
||||
|
||||
Reference in New Issue
Block a user