Extend deadman logic

The intent of this patch is extend the existing deadman code
such that it's flexible enough to be used by both ztest and
on production systems.  The proposed changes include:

* Added a new `zfs_deadman_failmode` module option which is
  used to dynamically control the behavior of the deadman.  It's
  loosely modeled after, but independant from, the pool failmode
  property.  It can be set to wait, continue, or panic.

    * wait     - Wait for the "hung" I/O (default)
    * continue - Attempt to recover from a "hung" I/O
    * panic    - Panic the system

* Added a new `zfs_deadman_ziotime_ms` module option which is
  analogous to `zfs_deadman_synctime_ms` except instead of
  applying to a pool TXG sync it applies to zio_wait().  A
  default value of 300s is used to define a "hung" zio.

* The ztest deadman thread has been re-enabled by default,
  aligned with the upstream OpenZFS code, and then extended
  to terminate the process when it takes significantly longer
  to complete than expected.

* The -G option was added to ztest to print the internal debug
  log when a fatal error is encountered.  This same option was
  previously added to zdb in commit fa603f82.  Update zloop.sh
  to unconditionally pass -G to obtain additional debugging.

* The FM_EREPORT_ZFS_DELAY event which was previously posted
  when the deadman detect a "hung" pool has been replaced by
  a new dedicated FM_EREPORT_ZFS_DEADMAN event.

* The proposed recovery logic attempts to restart a "hung"
  zio by calling zio_interrupt() on any outstanding leaf zios.
  We may want to further restrict this to zios in either the
  ZIO_STAGE_VDEV_IO_START or ZIO_STAGE_VDEV_IO_DONE stages.
  Calling zio_interrupt() is expected to only be useful for
  cases when an IO has been submitted to the physical device
  but for some reasonable the completion callback hasn't been
  called by the lower layers.  This shouldn't be possible but
  has been observed and may be caused by kernel/driver bugs.

* The 'zfs_deadman_synctime_ms' default value was reduced from
  1000s to 600s.

* Depending on how ztest fails there may be no cache file to
  move.  This should not be considered fatal, collect the logs
  which are available and carry on.

* Add deadman test cases for spa_deadman() and zio_wait().

* Increase default zfs_deadman_checktime_ms to 60s.

Reviewed-by: Tim Chase <tim@chase2k.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #6999
This commit is contained in:
Brian Behlendorf
2017-12-18 17:06:07 -05:00
parent 1b18c6d791
commit 8fb1ede146
21 changed files with 583 additions and 58 deletions
+1
View File
@@ -37,6 +37,7 @@ extern "C" {
#define FM_EREPORT_ZFS_IO "io"
#define FM_EREPORT_ZFS_DATA "data"
#define FM_EREPORT_ZFS_DELAY "delay"
#define FM_EREPORT_ZFS_DEADMAN "deadman"
#define FM_EREPORT_ZFS_POOL "zpool"
#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown"
#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed"
+8 -1
View File
@@ -955,12 +955,15 @@ extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
extern void spa_evicting_os_wait(spa_t *spa);
extern int spa_max_replication(spa_t *spa);
extern int spa_prev_software_version(spa_t *spa);
extern uint8_t spa_get_failmode(spa_t *spa);
extern uint64_t spa_get_failmode(spa_t *spa);
extern uint64_t spa_get_deadman_failmode(spa_t *spa);
extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode);
extern boolean_t spa_suspended(spa_t *spa);
extern uint64_t spa_bootfs(spa_t *spa);
extern uint64_t spa_delegation(spa_t *spa);
extern objset_t *spa_meta_objset(spa_t *spa);
extern uint64_t spa_deadman_synctime(spa_t *spa);
extern uint64_t spa_deadman_ziotime(spa_t *spa);
/* Miscellaneous support routines */
extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
@@ -1074,6 +1077,10 @@ extern boolean_t spa_debug_enabled(spa_t *spa);
}
extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */
extern int zfs_deadman_enabled;
extern unsigned long zfs_deadman_synctime_ms;
extern unsigned long zfs_deadman_ziotime_ms;
extern unsigned long zfs_deadman_checktime_ms;
#ifdef __cplusplus
}
+3 -1
View File
@@ -228,6 +228,7 @@ struct spa {
uint64_t spa_pool_props_object; /* object for properties */
uint64_t spa_bootfs; /* default boot filesystem */
uint64_t spa_failmode; /* failure mode for the pool */
uint64_t spa_deadman_failmode; /* failure mode for deadman */
uint64_t spa_delegation; /* delegation on/off */
list_t spa_config_list; /* previous cache file(s) */
/* per-CPU array of root of async I/O: */
@@ -270,7 +271,8 @@ struct spa {
taskqid_t spa_deadman_tqid; /* Task id */
uint64_t spa_deadman_calls; /* number of deadman calls */
hrtime_t spa_sync_starttime; /* starting time of spa_sync */
uint64_t spa_deadman_synctime; /* deadman expiration timer */
uint64_t spa_deadman_synctime; /* deadman sync expiration */
uint64_t spa_deadman_ziotime; /* deadman zio expiration */
uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */
spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */
uint64_t spa_errata; /* errata issues detected */
+1 -1
View File
@@ -84,7 +84,7 @@ extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd, char *tag);
extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
+1
View File
@@ -317,6 +317,7 @@ extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
extern void cv_signal(kcondvar_t *cv);
extern void cv_broadcast(kcondvar_t *cv);
#define cv_timedwait_io(cv, mp, at) cv_timedwait(cv, mp, at)
#define cv_timedwait_sig(cv, mp, at) cv_timedwait(cv, mp, at)
#define cv_wait_sig(cv, mp) cv_wait(cv, mp)
#define cv_wait_io(cv, mp) cv_wait(cv, mp)
+1
View File
@@ -556,6 +556,7 @@ extern void zio_execute(zio_t *zio);
extern void zio_interrupt(zio_t *zio);
extern void zio_delay_init(zio_t *zio);
extern void zio_delay_interrupt(zio_t *zio);
extern void zio_deadman(zio_t *zio, char *tag);
extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);