mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
Add zpool status -s (slow I/Os) and -p (parseable)
This patch adds a new slow I/Os (-s) column to zpool status to show the
number of VDEV slow I/Os. This is the number of I/Os that didn't complete
in zio_slow_io_ms milliseconds. It also adds a new parseable (-p) flag to
display exact values.

    NAME         STATE     READ WRITE CKSUM  SLOW
    testpool     ONLINE       0     0     0     -
      mirror-0   ONLINE       0     0     0     -
        loop0    ONLINE       0     0     0    20
        loop1    ONLINE       0     0     0     0

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #7756
Closes #6885
This commit is contained in:
committed by
Brian Behlendorf
parent
877d925a9e
commit
ad796b8a3b
+13
-10
@@ -77,14 +77,14 @@ int vdev_validate_skip = B_FALSE;
|
||||
int vdev_dtl_sm_blksz = (1 << 12);
|
||||
|
||||
/*
|
||||
* Rate limit delay events to this many IO delays per second.
|
||||
* Rate limit slow IO (delay) events to this many per second.
|
||||
*/
|
||||
unsigned int zfs_delays_per_second = 20;
|
||||
unsigned int zfs_slow_io_events_per_second = 20;
|
||||
|
||||
/*
|
||||
* Rate limit checksum events after this many checksum errors per second.
|
||||
*/
|
||||
unsigned int zfs_checksums_per_second = 20;
|
||||
unsigned int zfs_checksum_events_per_second = 20;
|
||||
|
||||
/*
|
||||
* Ignore errors during scrub/resilver. Allows to work around resilver
|
||||
@@ -507,8 +507,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||
* and checksum events so that we don't overwhelm ZED with thousands
|
||||
* of events when a disk is acting up.
|
||||
*/
|
||||
zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_delays_per_second, 1);
|
||||
zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksums_per_second, 1);
|
||||
zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
|
||||
1);
|
||||
zfs_ratelimit_init(&vd->vdev_checksum_rl,
|
||||
&zfs_checksum_events_per_second, 1);
|
||||
|
||||
list_link_init(&vd->vdev_config_dirty_node);
|
||||
list_link_init(&vd->vdev_state_dirty_node);
|
||||
@@ -3591,6 +3593,7 @@ vdev_clear(spa_t *spa, vdev_t *vd)
|
||||
vd->vdev_stat.vs_read_errors = 0;
|
||||
vd->vdev_stat.vs_write_errors = 0;
|
||||
vd->vdev_stat.vs_checksum_errors = 0;
|
||||
vd->vdev_stat.vs_slow_ios = 0;
|
||||
|
||||
for (int c = 0; c < vd->vdev_children; c++)
|
||||
vdev_clear(spa, vd->vdev_child[c]);
|
||||
@@ -4630,12 +4633,12 @@ module_param(vdev_ms_count_limit, int, 0644);
|
||||
MODULE_PARM_DESC(vdev_ms_count_limit,
|
||||
"Practical upper limit of total metaslabs per top-level vdev");
|
||||
|
||||
module_param(zfs_delays_per_second, uint, 0644);
|
||||
MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many "
|
||||
"IO delays per second");
|
||||
module_param(zfs_slow_io_events_per_second, uint, 0644);
|
||||
MODULE_PARM_DESC(zfs_slow_io_events_per_second,
|
||||
"Rate limit slow IO (delay) events to this many per second");
|
||||
|
||||
module_param(zfs_checksums_per_second, uint, 0644);
|
||||
MODULE_PARM_DESC(zfs_checksums_per_second, "Rate limit checksum events "
|
||||
module_param(zfs_checksum_events_per_second, uint, 0644);
|
||||
MODULE_PARM_DESC(zfs_checksum_events_per_second, "Rate limit checksum events "
|
||||
"to this many checksum errors per second (do not set below zed"
|
||||
"threshold).");
|
||||
|
||||
|
||||
@@ -347,6 +347,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
|
||||
vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
|
||||
ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));
|
||||
|
||||
/* IO delays */
|
||||
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
|
||||
|
||||
/* Add extended stats nvlist to main nvlist */
|
||||
fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
|
||||
|
||||
|
||||
+116
-83
@@ -140,7 +140,10 @@ zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
|
||||
return (rc);
|
||||
}
|
||||
|
||||
static void
|
||||
/*
|
||||
* Return B_TRUE if the event actually posted, B_FALSE if not.
|
||||
*/
|
||||
static boolean_t
|
||||
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||
const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
|
||||
zio_t *zio, uint64_t stateoroffset, uint64_t size)
|
||||
@@ -150,78 +153,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||
uint64_t ena;
|
||||
char class[64];
|
||||
|
||||
/*
|
||||
* If we are doing a spa_tryimport() or in recovery mode,
|
||||
* ignore errors.
|
||||
*/
|
||||
if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
|
||||
spa_load_state(spa) == SPA_LOAD_RECOVER)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If we are in the middle of opening a pool, and the previous attempt
|
||||
* failed, don't bother logging any new ereports - we're just going to
|
||||
* get the same diagnosis anyway.
|
||||
*/
|
||||
if (spa_load_state(spa) != SPA_LOAD_NONE &&
|
||||
spa->spa_last_open_failed)
|
||||
return;
|
||||
|
||||
if (zio != NULL) {
|
||||
/*
|
||||
* If this is not a read or write zio, ignore the error. This
|
||||
* can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
|
||||
*/
|
||||
if (zio->io_type != ZIO_TYPE_READ &&
|
||||
zio->io_type != ZIO_TYPE_WRITE)
|
||||
return;
|
||||
|
||||
if (vd != NULL) {
|
||||
/*
|
||||
* If the vdev has already been marked as failing due
|
||||
* to a failed probe, then ignore any subsequent I/O
|
||||
* errors, as the DE will automatically fault the vdev
|
||||
* on the first such failure. This also catches cases
|
||||
* where vdev_remove_wanted is set and the device has
|
||||
* not yet been asynchronously placed into the REMOVED
|
||||
* state.
|
||||
*/
|
||||
if (zio->io_vd == vd && !vdev_accessible(vd, zio))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Ignore checksum errors for reads from DTL regions of
|
||||
* leaf vdevs.
|
||||
*/
|
||||
if (zio->io_type == ZIO_TYPE_READ &&
|
||||
zio->io_error == ECKSUM &&
|
||||
vd->vdev_ops->vdev_op_leaf &&
|
||||
vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* For probe failure, we want to avoid posting ereports if we've
|
||||
* already removed the device in the meantime.
|
||||
*/
|
||||
if (vd != NULL &&
|
||||
strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
|
||||
(vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
|
||||
return;
|
||||
|
||||
if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
|
||||
(zio != NULL) && (!zio->io_timestamp)) {
|
||||
/* Ignore bogus delay events */
|
||||
return;
|
||||
}
|
||||
if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
|
||||
return (B_FALSE);
|
||||
|
||||
if ((ereport = fm_nvlist_create(NULL)) == NULL)
|
||||
return;
|
||||
return (B_FALSE);
|
||||
|
||||
if ((detector = fm_nvlist_create(NULL)) == NULL) {
|
||||
fm_nvlist_destroy(ereport, FM_NVA_FREE);
|
||||
return;
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -332,7 +272,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
|
||||
DATA_TYPE_UINT64, vs->vs_write_errors,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
|
||||
DATA_TYPE_UINT64, vs->vs_checksum_errors, NULL);
|
||||
DATA_TYPE_UINT64, vs->vs_checksum_errors,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
|
||||
DATA_TYPE_UINT64, vs->vs_slow_ios,
|
||||
NULL);
|
||||
}
|
||||
|
||||
if (pvd != NULL) {
|
||||
@@ -427,7 +370,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||
/*
|
||||
* Payload for I/Os with corresponding logical information.
|
||||
*/
|
||||
if (zb != NULL && (zio == NULL || zio->io_logical != NULL))
|
||||
if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
|
||||
fm_payload_set(ereport,
|
||||
FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
|
||||
DATA_TYPE_UINT64, zb->zb_objset,
|
||||
@@ -437,11 +380,13 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||
DATA_TYPE_INT64, zb->zb_level,
|
||||
FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
|
||||
DATA_TYPE_UINT64, zb->zb_blkid, NULL);
|
||||
}
|
||||
|
||||
mutex_exit(&spa->spa_errlist_lock);
|
||||
|
||||
*ereport_out = ereport;
|
||||
*detector_out = detector;
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
/* if it's <= 128 bytes, save the corruption directly */
|
||||
@@ -765,27 +710,111 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
|
||||
}
|
||||
#endif
|
||||
|
||||
void
|
||||
/*
|
||||
* Make sure our event is still valid for the given zio/vdev/pool. For example,
|
||||
* we don't want to keep logging events for a faulted or missing vdev.
|
||||
*/
|
||||
boolean_t
|
||||
zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
|
||||
{
|
||||
#ifdef _KERNEL
|
||||
/*
|
||||
* If we are doing a spa_tryimport() or in recovery mode,
|
||||
* ignore errors.
|
||||
*/
|
||||
if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
|
||||
spa_load_state(spa) == SPA_LOAD_RECOVER)
|
||||
return (B_FALSE);
|
||||
|
||||
/*
|
||||
* If we are in the middle of opening a pool, and the previous attempt
|
||||
* failed, don't bother logging any new ereports - we're just going to
|
||||
* get the same diagnosis anyway.
|
||||
*/
|
||||
if (spa_load_state(spa) != SPA_LOAD_NONE &&
|
||||
spa->spa_last_open_failed)
|
||||
return (B_FALSE);
|
||||
|
||||
if (zio != NULL) {
|
||||
/*
|
||||
* If this is not a read or write zio, ignore the error. This
|
||||
* can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
|
||||
*/
|
||||
if (zio->io_type != ZIO_TYPE_READ &&
|
||||
zio->io_type != ZIO_TYPE_WRITE)
|
||||
return (B_FALSE);
|
||||
|
||||
if (vd != NULL) {
|
||||
/*
|
||||
* If the vdev has already been marked as failing due
|
||||
* to a failed probe, then ignore any subsequent I/O
|
||||
* errors, as the DE will automatically fault the vdev
|
||||
* on the first such failure. This also catches cases
|
||||
* where vdev_remove_wanted is set and the device has
|
||||
* not yet been asynchronously placed into the REMOVED
|
||||
* state.
|
||||
*/
|
||||
if (zio->io_vd == vd && !vdev_accessible(vd, zio))
|
||||
return (B_FALSE);
|
||||
|
||||
/*
|
||||
* Ignore checksum errors for reads from DTL regions of
|
||||
* leaf vdevs.
|
||||
*/
|
||||
if (zio->io_type == ZIO_TYPE_READ &&
|
||||
zio->io_error == ECKSUM &&
|
||||
vd->vdev_ops->vdev_op_leaf &&
|
||||
vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
|
||||
return (B_FALSE);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* For probe failure, we want to avoid posting ereports if we've
|
||||
* already removed the device in the meantime.
|
||||
*/
|
||||
if (vd != NULL &&
|
||||
strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
|
||||
(vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
|
||||
return (B_FALSE);
|
||||
|
||||
/* Ignore bogus delay events (like from ioctls or unqueued IOs) */
|
||||
if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
|
||||
(zio != NULL) && (!zio->io_timestamp)) {
|
||||
return (B_FALSE);
|
||||
}
|
||||
#endif
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return 0 if event was posted, EINVAL if there was a problem posting it or
|
||||
* EBUSY if the event was rate limited.
|
||||
*/
|
||||
int
|
||||
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
|
||||
const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
|
||||
uint64_t size)
|
||||
{
|
||||
int rc = 0;
|
||||
#ifdef _KERNEL
|
||||
nvlist_t *ereport = NULL;
|
||||
nvlist_t *detector = NULL;
|
||||
|
||||
if (zfs_is_ratelimiting_event(subclass, vd))
|
||||
return;
|
||||
return (SET_ERROR(EBUSY));
|
||||
|
||||
zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
|
||||
zb, zio, stateoroffset, size);
|
||||
if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
|
||||
zb, zio, stateoroffset, size))
|
||||
return (SET_ERROR(EINVAL)); /* couldn't post event */
|
||||
|
||||
if (ereport == NULL)
|
||||
return;
|
||||
return (SET_ERROR(EINVAL));
|
||||
|
||||
/* Cleanup is handled by the callback function */
|
||||
zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
|
||||
rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
|
||||
#endif
|
||||
return (rc);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -795,7 +824,6 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
|
||||
{
|
||||
zio_cksum_report_t *report;
|
||||
|
||||
|
||||
#ifdef _KERNEL
|
||||
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
|
||||
return;
|
||||
@@ -874,30 +902,34 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
int
|
||||
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
|
||||
struct zio *zio, uint64_t offset, uint64_t length,
|
||||
const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
|
||||
{
|
||||
int rc = 0;
|
||||
#ifdef _KERNEL
|
||||
nvlist_t *ereport = NULL;
|
||||
nvlist_t *detector = NULL;
|
||||
zfs_ecksum_info_t *info;
|
||||
|
||||
zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
|
||||
spa, vd, zb, zio, offset, length);
|
||||
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
|
||||
return (EBUSY);
|
||||
|
||||
if (ereport == NULL)
|
||||
return;
|
||||
if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
|
||||
spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
|
||||
return (SET_ERROR(EINVAL));
|
||||
}
|
||||
|
||||
info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
|
||||
B_FALSE);
|
||||
|
||||
if (info != NULL) {
|
||||
zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
|
||||
rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
|
||||
kmem_free(info, sizeof (*info));
|
||||
}
|
||||
#endif
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1043,6 +1075,7 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
|
||||
|
||||
#if defined(_KERNEL)
|
||||
EXPORT_SYMBOL(zfs_ereport_post);
|
||||
EXPORT_SYMBOL(zfs_ereport_is_valid);
|
||||
EXPORT_SYMBOL(zfs_ereport_post_checksum);
|
||||
EXPORT_SYMBOL(zfs_post_remove);
|
||||
EXPORT_SYMBOL(zfs_post_autoreplace);
|
||||
|
||||
+27
-7
@@ -77,7 +77,8 @@ uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
||||
uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
||||
#endif
|
||||
|
||||
int zio_delay_max = ZIO_DELAY_MAX;
|
||||
/* Mark IOs as "slow" if they take longer than 30 seconds */
|
||||
int zio_slow_io_ms = (30 * MILLISEC);
|
||||
|
||||
#define BP_SPANB(indblkshift, level) \
|
||||
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
|
||||
@@ -4431,10 +4432,28 @@ zio_done(zio_t *zio)
|
||||
* 30 seconds to complete, post an error describing the I/O delay.
|
||||
* We ignore these errors if the device is currently unavailable.
|
||||
*/
|
||||
if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) {
|
||||
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
|
||||
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
|
||||
zio->io_vd, &zio->io_bookmark, zio, 0, 0);
|
||||
if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
|
||||
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
|
||||
/*
|
||||
* We want to only increment our slow IO counters if
|
||||
* the IO is valid (i.e. not if the drive is removed).
|
||||
*
|
||||
* zfs_ereport_post() will also do these checks, but
|
||||
* it can also ratelimit and have other failures, so we
|
||||
* need to increment the slow_io counters independent
|
||||
* of it.
|
||||
*/
|
||||
if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
|
||||
zio->io_spa, zio->io_vd, zio)) {
|
||||
mutex_enter(&zio->io_vd->vdev_stat_lock);
|
||||
zio->io_vd->vdev_stat.vs_slow_ios++;
|
||||
mutex_exit(&zio->io_vd->vdev_stat_lock);
|
||||
|
||||
zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
|
||||
zio->io_spa, zio->io_vd, &zio->io_bookmark,
|
||||
zio, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (zio->io_error) {
|
||||
@@ -4823,8 +4842,9 @@ EXPORT_SYMBOL(zio_data_buf_alloc);
|
||||
EXPORT_SYMBOL(zio_buf_free);
|
||||
EXPORT_SYMBOL(zio_data_buf_free);
|
||||
|
||||
module_param(zio_delay_max, int, 0644);
|
||||
MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
|
||||
module_param(zio_slow_io_ms, int, 0644);
|
||||
MODULE_PARM_DESC(zio_slow_io_ms,
|
||||
"Max I/O completion time (milliseconds) before marking it as slow");
|
||||
|
||||
module_param(zio_requeue_io_start_cut_in_line, int, 0644);
|
||||
MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
|
||||
|
||||
Reference in New Issue
Block a user