mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 10:37:35 +03:00
3246 ZFS I/O deadman thread
Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com> Reviewed by: Eric Schrock <eric.schrock@delphix.com> Reviewed by: Christopher Siden <chris.siden@delphix.com> Approved by: Garrett D'Amore <garrett@damore.org> NOTES: This patch has been reworked from the original in the following ways to accommodate the Linux ZFS implementation: *) Usage of the cyclic interface was replaced by the delayed taskq interface. This avoids the need to implement new compatibility code and allows us to rely on the existing taskq implementation. *) An extern for zfs_txg_synctime_ms was added to sys/dsl_pool.h because declaring externs in source files, as was done in the original patch, is just plain wrong. *) Instead of panicking the system when the deadman triggers, a zevent describing the blocked vdev and the first pending I/O is posted. If the panic behavior is desired, Linux provides other generic methods to panic the system when threads are observed to hang. *) For reference, to delay zios by 30 seconds for testing you can use zinject as follows: 'zinject -d <vdev> -D30 <pool>' References: illumos/illumos-gate@283b84606b https://www.illumos.org/issues/3246 Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #1396
This commit is contained in:
committed by
Brian Behlendorf
parent
57f5a2008e
commit
cc92e9d0c3
@@ -1013,6 +1013,8 @@ spa_deactivate(spa_t *spa)
|
||||
list_destroy(&spa->spa_config_dirty_list);
|
||||
list_destroy(&spa->spa_state_dirty_list);
|
||||
|
||||
taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
|
||||
|
||||
for (t = 0; t < ZIO_TYPES; t++) {
|
||||
for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
|
||||
if (spa->spa_zio_taskq[t][q] != NULL)
|
||||
@@ -6017,6 +6019,12 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
|
||||
tx = dmu_tx_create_assigned(dp, txg);
|
||||
|
||||
spa->spa_sync_starttime = gethrtime();
|
||||
taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
|
||||
spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq,
|
||||
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
|
||||
NSEC_TO_TICK(spa->spa_deadman_synctime));
|
||||
|
||||
/*
|
||||
* If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
|
||||
* set spa_deflate if we have no raid-z vdevs.
|
||||
@@ -6145,6 +6153,9 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
}
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
|
||||
spa->spa_deadman_tqid = 0;
|
||||
|
||||
/*
|
||||
* Clear the dirty config list.
|
||||
*/
|
||||
|
||||
@@ -236,6 +236,24 @@ static avl_tree_t spa_l2cache_avl;
|
||||
kmem_cache_t *spa_buffer_pool;
|
||||
int spa_mode_global;
|
||||
|
||||
/*
|
||||
* Expiration time in units of zfs_txg_synctime_ms. This value has two
|
||||
* meanings. First it is used to determine when the spa_deadman logic
|
||||
* should fire. By default the spa_deadman will fire if spa_sync has
|
||||
* not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
|
||||
* Secondly, the value determines if an I/O is considered "hung".
|
||||
* Any I/O that has not completed in zfs_deadman_synctime is considered
|
||||
* "hung" resulting in a zevent being posted. The default is
|
||||
* 1000 zfs_txg_synctime_ms (i.e. 1000 seconds).
|
||||
*/
|
||||
unsigned long zfs_deadman_synctime = 1000ULL;
|
||||
|
||||
/*
|
||||
* By default the deadman is enabled.
|
||||
*/
|
||||
int zfs_deadman_enabled = 1;
|
||||
|
||||
|
||||
/*
|
||||
* ==========================================================================
|
||||
* SPA config locking
|
||||
@@ -412,6 +430,27 @@ spa_lookup(const char *name)
|
||||
return (spa);
|
||||
}
|
||||
|
||||
/*
|
||||
* Fires when spa_sync has not completed within zfs_deadman_synctime.
|
||||
* If the zfs_deadman_enabled flag is set then it inspects all vdev queues
|
||||
* looking for potentially hung I/Os.
|
||||
*/
|
||||
void
|
||||
spa_deadman(void *arg)
|
||||
{
|
||||
spa_t *spa = arg;
|
||||
|
||||
zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
|
||||
(gethrtime() - spa->spa_sync_starttime) / NANOSEC,
|
||||
++spa->spa_deadman_calls);
|
||||
if (zfs_deadman_enabled)
|
||||
vdev_deadman(spa->spa_root_vdev);
|
||||
|
||||
spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq,
|
||||
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
|
||||
NSEC_TO_TICK(spa->spa_deadman_synctime));
|
||||
}
|
||||
|
||||
/*
|
||||
* Create an uninitialized spa_t with the given name. Requires
|
||||
* spa_namespace_lock. The caller must ensure that the spa_t doesn't already
|
||||
@@ -454,6 +493,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
|
||||
spa->spa_proc = &p0;
|
||||
spa->spa_proc_state = SPA_PROC_NONE;
|
||||
|
||||
spa->spa_deadman_synctime = zfs_deadman_synctime *
|
||||
zfs_txg_synctime_ms * MICROSEC;
|
||||
|
||||
refcount_create(&spa->spa_refcount);
|
||||
spa_config_lock_init(spa);
|
||||
|
||||
@@ -1492,6 +1534,12 @@ spa_prev_software_version(spa_t *spa)
|
||||
return (spa->spa_prev_software_version);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
spa_deadman_synctime(spa_t *spa)
|
||||
{
|
||||
return (spa->spa_deadman_synctime);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
|
||||
{
|
||||
@@ -1812,4 +1860,10 @@ EXPORT_SYMBOL(spa_writeable);
|
||||
EXPORT_SYMBOL(spa_mode);
|
||||
|
||||
EXPORT_SYMBOL(spa_namespace_lock);
|
||||
|
||||
module_param(zfs_deadman_synctime, ulong, 0644);
|
||||
MODULE_PARM_DESC(zfs_deadman_synctime,"Expire in units of zfs_txg_synctime_ms");
|
||||
|
||||
module_param(zfs_deadman_enabled, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer");
|
||||
#endif
|
||||
|
||||
@@ -3195,6 +3195,46 @@ vdev_split(vdev_t *vd)
|
||||
vdev_propagate_state(cvd);
|
||||
}
|
||||
|
||||
void
|
||||
vdev_deadman(vdev_t *vd)
|
||||
{
|
||||
int c;
|
||||
|
||||
for (c = 0; c < vd->vdev_children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
|
||||
vdev_deadman(cvd);
|
||||
}
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf) {
|
||||
vdev_queue_t *vq = &vd->vdev_queue;
|
||||
|
||||
mutex_enter(&vq->vq_lock);
|
||||
if (avl_numnodes(&vq->vq_pending_tree) > 0) {
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
zio_t *fio;
|
||||
uint64_t delta;
|
||||
|
||||
/*
|
||||
* Look at the head of all the pending queues,
|
||||
* if any I/O has been outstanding for longer than
|
||||
* the spa_deadman_synctime we log a zevent.
|
||||
*/
|
||||
fio = avl_first(&vq->vq_pending_tree);
|
||||
delta = ddi_get_lbolt64() - fio->io_timestamp;
|
||||
if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) {
|
||||
zfs_dbgmsg("SLOW IO: zio timestamp %llu, "
|
||||
"delta %llu, last io %llu",
|
||||
fio->io_timestamp, delta,
|
||||
vq->vq_io_complete_ts);
|
||||
zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
|
||||
spa, vd, fio, 0, 0);
|
||||
}
|
||||
}
|
||||
mutex_exit(&vq->vq_lock);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(_KERNEL) && defined(HAVE_SPL)
|
||||
EXPORT_SYMBOL(vdev_fault);
|
||||
EXPORT_SYMBOL(vdev_degrade);
|
||||
|
||||
@@ -407,8 +407,7 @@ vdev_disk_dio_put(dio_request_t *dr)
|
||||
vdev_disk_dio_free(dr);
|
||||
|
||||
if (zio) {
|
||||
zio->io_delay = jiffies_to_msecs(
|
||||
jiffies_64 - zio->io_delay);
|
||||
zio->io_delay = jiffies_64 - zio->io_delay;
|
||||
zio->io_error = error;
|
||||
ASSERT3S(zio->io_error, >=, 0);
|
||||
if (zio->io_error)
|
||||
@@ -609,7 +608,7 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc)
|
||||
{
|
||||
zio_t *zio = bio->bi_private;
|
||||
|
||||
zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay);
|
||||
zio->io_delay = jiffies_64 - zio->io_delay;
|
||||
zio->io_error = -rc;
|
||||
if (rc && (rc == -EOPNOTSUPP))
|
||||
zio->io_vd->vdev_nowritecache = B_TRUE;
|
||||
|
||||
+14
-1
@@ -23,6 +23,10 @@
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/vdev_impl.h>
|
||||
#include <sys/zio.h>
|
||||
@@ -319,6 +323,7 @@ again:
|
||||
vi, size, fio->io_type, ZIO_PRIORITY_AGG,
|
||||
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
|
||||
vdev_queue_agg_io_done, NULL);
|
||||
aio->io_timestamp = fio->io_timestamp;
|
||||
|
||||
nio = fio;
|
||||
do {
|
||||
@@ -391,7 +396,8 @@ vdev_queue_io(zio_t *zio)
|
||||
|
||||
mutex_enter(&vq->vq_lock);
|
||||
|
||||
zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
|
||||
zio->io_timestamp = ddi_get_lbolt64();
|
||||
zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
|
||||
zio->io_priority;
|
||||
|
||||
vdev_queue_io_add(vq, zio);
|
||||
@@ -417,10 +423,17 @@ vdev_queue_io_done(zio_t *zio)
|
||||
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
|
||||
int i;
|
||||
|
||||
if (zio_injection_enabled)
|
||||
delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
|
||||
|
||||
mutex_enter(&vq->vq_lock);
|
||||
|
||||
avl_remove(&vq->vq_pending_tree, zio);
|
||||
|
||||
zio->io_delta = ddi_get_lbolt64() - zio->io_timestamp;
|
||||
vq->vq_io_complete_ts = ddi_get_lbolt64();
|
||||
vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
|
||||
|
||||
for (i = 0; i < zfs_vdev_ramp_rate; i++) {
|
||||
zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
|
||||
if (nio == NULL)
|
||||
|
||||
@@ -250,6 +250,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||
|
||||
if (vd != NULL) {
|
||||
vdev_t *pvd = vd->vdev_parent;
|
||||
vdev_queue_t *vq = &vd->vdev_queue;
|
||||
|
||||
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
|
||||
DATA_TYPE_UINT64, vd->vdev_guid,
|
||||
@@ -272,6 +273,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
|
||||
DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
|
||||
|
||||
if (vq != NULL) {
|
||||
fm_payload_set(ereport,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
|
||||
DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
|
||||
fm_payload_set(ereport,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
|
||||
DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
|
||||
}
|
||||
|
||||
if (pvd != NULL) {
|
||||
fm_payload_set(ereport,
|
||||
FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
|
||||
@@ -304,6 +314,12 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||
DATA_TYPE_UINT32, zio->io_pipeline, NULL);
|
||||
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
|
||||
DATA_TYPE_UINT64, zio->io_delay, NULL);
|
||||
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
|
||||
DATA_TYPE_UINT64, zio->io_timestamp, NULL);
|
||||
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE,
|
||||
DATA_TYPE_UINT64, zio->io_deadline, NULL);
|
||||
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
|
||||
DATA_TYPE_UINT64, zio->io_delta, NULL);
|
||||
|
||||
/*
|
||||
* If the 'size' parameter is non-zero, it indicates this is a
|
||||
|
||||
+5
-3
@@ -609,6 +609,9 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
||||
zio->io_vsd_ops = NULL;
|
||||
zio->io_offset = offset;
|
||||
zio->io_deadline = 0;
|
||||
zio->io_timestamp = 0;
|
||||
zio->io_delta = 0;
|
||||
zio->io_delay = 0;
|
||||
zio->io_orig_data = zio->io_data = data;
|
||||
zio->io_orig_size = zio->io_size = size;
|
||||
zio->io_orig_flags = zio->io_flags = flags;
|
||||
@@ -620,7 +623,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
||||
zio->io_bp_override = NULL;
|
||||
zio->io_walk_link = NULL;
|
||||
zio->io_transform_stack = NULL;
|
||||
zio->io_delay = 0;
|
||||
zio->io_error = 0;
|
||||
zio->io_child_count = 0;
|
||||
zio->io_parent_count = 0;
|
||||
@@ -2906,11 +2908,11 @@ zio_done(zio_t *zio)
|
||||
vdev_stat_update(zio, zio->io_size);
|
||||
|
||||
/*
|
||||
* If this I/O is attached to a particular vdev is slow, exeeding
|
||||
* If this I/O is attached to a particular vdev is slow, exceeding
|
||||
* 30 seconds to complete, post an error describing the I/O delay.
|
||||
* We ignore these errors if the device is currently unavailable.
|
||||
*/
|
||||
if (zio->io_delay >= zio_delay_max) {
|
||||
if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
|
||||
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
|
||||
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
|
||||
zio->io_vd, zio, 0, 0);
|
||||
|
||||
+37
-28
@@ -20,6 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
/*
|
||||
@@ -147,14 +148,8 @@ zio_handle_fault_injection(zio_t *zio, int error)
|
||||
for (handler = list_head(&inject_handlers); handler != NULL;
|
||||
handler = list_next(&inject_handlers, handler)) {
|
||||
|
||||
/* Ignore errors not destined for this pool */
|
||||
if (zio->io_spa != handler->zi_spa)
|
||||
continue;
|
||||
|
||||
/* Ignore device errors and panic injection */
|
||||
if (handler->zi_record.zi_guid != 0 ||
|
||||
handler->zi_record.zi_func[0] != '\0' ||
|
||||
handler->zi_record.zi_duration != 0)
|
||||
if (zio->io_spa != handler->zi_spa ||
|
||||
handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
|
||||
continue;
|
||||
|
||||
/* If this handler matches, return EIO */
|
||||
@@ -197,10 +192,7 @@ zio_handle_label_injection(zio_t *zio, int error)
|
||||
uint64_t start = handler->zi_record.zi_start;
|
||||
uint64_t end = handler->zi_record.zi_end;
|
||||
|
||||
/* Ignore device only faults or panic injection */
|
||||
if (handler->zi_record.zi_start == 0 ||
|
||||
handler->zi_record.zi_func[0] != '\0' ||
|
||||
handler->zi_record.zi_duration != 0)
|
||||
if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
|
||||
continue;
|
||||
|
||||
/*
|
||||
@@ -246,13 +238,7 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
|
||||
for (handler = list_head(&inject_handlers); handler != NULL;
|
||||
handler = list_next(&inject_handlers, handler)) {
|
||||
|
||||
/*
|
||||
* Ignore label specific faults, panic injection
|
||||
* or fake writes
|
||||
*/
|
||||
if (handler->zi_record.zi_start != 0 ||
|
||||
handler->zi_record.zi_func[0] != '\0' ||
|
||||
handler->zi_record.zi_duration != 0)
|
||||
if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
|
||||
continue;
|
||||
|
||||
if (vd->vdev_guid == handler->zi_record.zi_guid) {
|
||||
@@ -316,10 +302,8 @@ zio_handle_ignored_writes(zio_t *zio)
|
||||
handler = list_next(&inject_handlers, handler)) {
|
||||
|
||||
/* Ignore errors not destined for this pool */
|
||||
if (zio->io_spa != handler->zi_spa)
|
||||
continue;
|
||||
|
||||
if (handler->zi_record.zi_duration == 0)
|
||||
if (zio->io_spa != handler->zi_spa ||
|
||||
handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
|
||||
continue;
|
||||
|
||||
/*
|
||||
@@ -355,11 +339,8 @@ spa_handle_ignored_writes(spa_t *spa)
|
||||
for (handler = list_head(&inject_handlers); handler != NULL;
|
||||
handler = list_next(&inject_handlers, handler)) {
|
||||
|
||||
/* Ignore errors not destined for this pool */
|
||||
if (spa != handler->zi_spa)
|
||||
continue;
|
||||
|
||||
if (handler->zi_record.zi_duration == 0)
|
||||
if (spa != handler->zi_spa ||
|
||||
handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
|
||||
continue;
|
||||
|
||||
if (handler->zi_record.zi_duration > 0) {
|
||||
@@ -379,6 +360,34 @@ spa_handle_ignored_writes(spa_t *spa)
|
||||
rw_exit(&inject_lock);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
zio_handle_io_delay(zio_t *zio)
|
||||
{
|
||||
vdev_t *vd = zio->io_vd;
|
||||
inject_handler_t *handler;
|
||||
uint64_t seconds = 0;
|
||||
|
||||
if (zio_injection_enabled == 0)
|
||||
return (0);
|
||||
|
||||
rw_enter(&inject_lock, RW_READER);
|
||||
|
||||
for (handler = list_head(&inject_handlers); handler != NULL;
|
||||
handler = list_next(&inject_handlers, handler)) {
|
||||
|
||||
if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
|
||||
continue;
|
||||
|
||||
if (vd->vdev_guid == handler->zi_record.zi_guid) {
|
||||
seconds = handler->zi_record.zi_timer;
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
rw_exit(&inject_lock);
|
||||
return (seconds);
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a new handler for the given record. We add it to the list, adding
|
||||
* a reference to the spa_t in the process. We increment zio_injection_enabled,
|
||||
|
||||
Reference in New Issue
Block a user