Add zpool status -s (slow I/Os) and -p (parsable)

This patch adds a new slow I/Os (-s) column to zpool status to show the
number of VDEV slow I/Os. This is the number of I/Os that didn't
complete in zio_slow_io_ms milliseconds. It also adds a new parsable
(-p) flag to display exact values.

 	NAME         STATE     READ WRITE CKSUM  SLOW
 	testpool     ONLINE       0     0     0     -
	  mirror-0   ONLINE       0     0     0     -
 	    loop0    ONLINE       0     0     0    20
 	    loop1    ONLINE       0     0     0     0

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #7756
Closes #6885
This commit is contained in:
Tony Hutter 2018-11-08 16:47:24 -08:00 committed by Brian Behlendorf
parent 877d925a9e
commit ad796b8a3b
16 changed files with 321 additions and 136 deletions

View File

@ -365,8 +365,8 @@ get_usage(zpool_help_t idx)
case HELP_RESILVER: case HELP_RESILVER:
return (gettext("\tresilver <pool> ...\n")); return (gettext("\tresilver <pool> ...\n"));
case HELP_STATUS: case HELP_STATUS:
return (gettext("\tstatus [-c [script1,script2,...]] [-gLPvxD]" return (gettext("\tstatus [-c [script1,script2,...]] "
"[-T d|u] [pool] ... \n" "[-gLpPsvxD] [-T d|u] [pool] ... \n"
"\t [interval [count]]\n")); "\t [interval [count]]\n"));
case HELP_UPGRADE: case HELP_UPGRADE:
return (gettext("\tupgrade\n" return (gettext("\tupgrade\n"
@ -1669,10 +1669,12 @@ typedef struct status_cbdata {
int cb_namewidth; int cb_namewidth;
boolean_t cb_allpools; boolean_t cb_allpools;
boolean_t cb_verbose; boolean_t cb_verbose;
boolean_t cb_literal;
boolean_t cb_explain; boolean_t cb_explain;
boolean_t cb_first; boolean_t cb_first;
boolean_t cb_dedup_stats; boolean_t cb_dedup_stats;
boolean_t cb_print_status; boolean_t cb_print_status;
boolean_t cb_print_slow_ios;
vdev_cmd_data_list_t *vcdl; vdev_cmd_data_list_t *vcdl;
} status_cbdata_t; } status_cbdata_t;
@ -1788,10 +1790,34 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
name, state); name, state);
if (!isspare) { if (!isspare) {
if (cb->cb_literal) {
printf(" %5llu %5llu %5llu",
(u_longlong_t)vs->vs_read_errors,
(u_longlong_t)vs->vs_write_errors,
(u_longlong_t)vs->vs_checksum_errors);
} else {
zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); zfs_nicenum(vs->vs_checksum_errors, cbuf,
(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf); sizeof (cbuf));
printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
}
if (cb->cb_print_slow_ios) {
if (children == 0) {
/* Only leaf vdevs have slow IOs */
zfs_nicenum(vs->vs_slow_ios, rbuf,
sizeof (rbuf));
} else {
snprintf(rbuf, sizeof (rbuf), "-");
}
if (cb->cb_literal)
printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
else
printf(" %5s", rbuf);
}
} }
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
@ -7175,6 +7201,9 @@ status_callback(zpool_handle_t *zhp, void *data)
cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE",
"CKSUM"); "CKSUM");
if (cbp->cb_print_slow_ios)
(void) printf(" %5s", gettext("SLOW"));
if (cbp->vcdl != NULL) if (cbp->vcdl != NULL)
print_cmd_columns(cbp->vcdl, 0); print_cmd_columns(cbp->vcdl, 0);
@ -7241,13 +7270,15 @@ status_callback(zpool_handle_t *zhp, void *data)
} }
/* /*
* zpool status [-c [script1,script2,...]] [-gLPvx] [-T d|u] [pool] ... * zpool status [-c [script1,script2,...]] [-gLpPsvx] [-T d|u] [pool] ...
* [interval [count]] * [interval [count]]
* *
* -c CMD For each vdev, run command CMD * -c CMD For each vdev, run command CMD
* -g Display guid for individual vdev name. * -g Display guid for individual vdev name.
* -L Follow links when resolving vdev path name. * -L Follow links when resolving vdev path name.
* -p Display values in parsable (exact) format.
* -P Display full path for vdev name. * -P Display full path for vdev name.
* -s Display slow IOs column.
* -v Display complete error logs * -v Display complete error logs
* -x Display only pools with potential problems * -x Display only pools with potential problems
* -D Display dedup status (undocumented) * -D Display dedup status (undocumented)
@ -7266,7 +7297,7 @@ zpool_do_status(int argc, char **argv)
char *cmd = NULL; char *cmd = NULL;
/* check options */ /* check options */
while ((c = getopt(argc, argv, "c:gLPvxDT:")) != -1) { while ((c = getopt(argc, argv, "c:gLpPsvxDT:")) != -1) {
switch (c) { switch (c) {
case 'c': case 'c':
if (cmd != NULL) { if (cmd != NULL) {
@ -7298,9 +7329,15 @@ zpool_do_status(int argc, char **argv)
case 'L': case 'L':
cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
break; break;
case 'p':
cb.cb_literal = B_TRUE;
break;
case 'P': case 'P':
cb.cb_name_flags |= VDEV_NAME_PATH; cb.cb_name_flags |= VDEV_NAME_PATH;
break; break;
case 's':
cb.cb_print_slow_ios = B_TRUE;
break;
case 'v': case 'v':
cb.cb_verbose = B_TRUE; cb.cb_verbose = B_TRUE;
break; break;

View File

@ -74,6 +74,7 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS "vdev_read_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS "vdev_read_errors"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"

View File

@ -665,6 +665,9 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo"
#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo" #define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo"
/* Number of slow IOs */
#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios"
/* vdev enclosure sysfs path */ /* vdev enclosure sysfs path */
#define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" #define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path"
@ -990,6 +993,7 @@ typedef struct vdev_stat {
uint64_t vs_fragmentation; /* device fragmentation */ uint64_t vs_fragmentation; /* device fragmentation */
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
uint64_t vs_resilver_deferred; /* resilver deferred */ uint64_t vs_resilver_deferred; /* resilver deferred */
uint64_t vs_slow_ios; /* slow IOs */
} vdev_stat_t; } vdev_stat_t;
/* /*

View File

@ -1076,9 +1076,11 @@ extern const char *spa_state_to_name(spa_t *spa);
/* error handling */ /* error handling */
struct zbookmark_phys; struct zbookmark_phys;
extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb);
extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, extern int zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
uint64_t length); uint64_t length);
extern boolean_t zfs_ereport_is_valid(const char *class, spa_t *spa, vdev_t *vd,
zio_t *zio);
extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
const char *name, nvlist_t *aux); const char *name, nvlist_t *aux);
extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_remove(spa_t *spa, vdev_t *vd);

View File

@ -158,11 +158,6 @@ enum zio_encrypt {
(compress) == ZIO_COMPRESS_ON || \ (compress) == ZIO_COMPRESS_ON || \
(compress) == ZIO_COMPRESS_OFF) (compress) == ZIO_COMPRESS_OFF)
/*
* Default Linux timeout for a sd device.
*/
#define ZIO_DELAY_MAX (30 * MILLISEC)
#define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_WAIT 0
#define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2 #define ZIO_FAILURE_MODE_PANIC 2
@ -664,7 +659,7 @@ extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
/* If we have the good data in hand, this function can be used */ /* If we have the good data in hand, this function can be used */
extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
uint64_t length, const abd_t *good_data, const abd_t *bad_data, uint64_t length, const abd_t *good_data, const abd_t *bad_data,
struct zio_bad_cksum *info); struct zio_bad_cksum *info);

View File

@ -96,8 +96,9 @@ information regarding "hung" I/O detection and configuration.
.ad .ad
.RS 12n .RS 12n
Issued when a completed I/O exceeds the maximum allowed time specified Issued when a completed I/O exceeds the maximum allowed time specified
by the \fBzio_delay_max\fR module option. This can be an indicator of by the \fBzio_slow_io_ms\fR module option. This can be an indicator of
problems with the underlying storage device. problems with the underlying storage device. The number of delay events is
ratelimited by the \fBzfs_slow_io_events_per_second\fR module parameter.
.RE .RE
.sp .sp
@ -697,10 +698,9 @@ full list of all the I/O stages.
\fBzio_delay\fR \fBzio_delay\fR
.ad .ad
.RS 12n .RS 12n
The time in ticks (HZ) required for the block layer to service the I/O. Unlike The time elapsed (in nanoseconds) waiting for the block layer to complete the
\fBzio_delta\fR this does not include any vdev queuing time and is therefore I/O. Unlike \fBzio_delta\fR this does not include any vdev queuing time and is
solely a measure of the block layer performance. On most modern Linux systems therefore solely a measure of the block layer performance.
HZ is defined as 1000 making a tick equivalent to 1 millisecond.
.RE .RE
.sp .sp

View File

@ -1086,7 +1086,7 @@ Default value: \fB600,000\fR.
.ad .ad
.RS 12n .RS 12n
Interval in milliseconds after which the deadman is triggered and an Interval in milliseconds after which the deadman is triggered and an
individual IO operation is considered to be "hung". As long as the I/O individual I/O operation is considered to be "hung". As long as the I/O
remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR
milliseconds until the I/O completes. milliseconds until the I/O completes.
.sp .sp
@ -1141,10 +1141,10 @@ Default value: \fB500,000\fR.
.sp .sp
.ne 2 .ne 2
.na .na
\fBzfs_delays_per_second\fR (int) \fBzfs_slow_io_events_per_second\fR (int)
.ad .ad
.RS 12n .RS 12n
Rate limit IO delay events to this many per second. Rate limit delay zevents (which report slow I/Os) to this many per second.
.sp .sp
Default value: 20 Default value: 20
.RE .RE
@ -1655,7 +1655,7 @@ Default value: \fB50\fR.
.ad .ad
.RS 12n .RS 12n
We currently support block sizes from 512 bytes to 16MB. The benefits of We currently support block sizes from 512 bytes to 16MB. The benefits of
larger blocks, and thus larger IO, need to be weighed against the cost of larger blocks, and thus larger I/O, need to be weighed against the cost of
COWing a giant block to modify one byte. Additionally, very large blocks COWing a giant block to modify one byte. Additionally, very large blocks
can have an impact on i/o latency, and also potentially on the memory can have an impact on i/o latency, and also potentially on the memory
allocator. Therefore, we do not allow the recordsize to be set larger than allocator. Therefore, we do not allow the recordsize to be set larger than
@ -2552,12 +2552,13 @@ Default value: \fB0\fR.
.sp .sp
.ne 2 .ne 2
.na .na
\fBzio_delay_max\fR (int) \fBzio_slow_io_ms\fR (int)
.ad .ad
.RS 12n .RS 12n
A zevent will be logged if a ZIO operation takes more than N milliseconds to When an I/O operation takes more than \fBzio_slow_io_ms\fR milliseconds to
complete. Note that this is only a logging facility, not a timeout on complete, it is marked as a slow I/O. Each slow I/O causes a delay zevent. Slow
operations. I/O counters can be seen with "zpool status -s".
.sp .sp
Default value: \fB30,000\fR. Default value: \fB30,000\fR.
.RE .RE
@ -2568,7 +2569,7 @@ Default value: \fB30,000\fR.
\fBzio_dva_throttle_enabled\fR (int) \fBzio_dva_throttle_enabled\fR (int)
.ad .ad
.RS 12n .RS 12n
Throttle block allocations in the ZIO pipeline. This allows for Throttle block allocations in the I/O pipeline. This allows for
dynamic allocation distribution when devices are imbalanced. dynamic allocation distribution when devices are imbalanced.
When enabled, the maximum number of pending allocations per top-level vdev When enabled, the maximum number of pending allocations per top-level vdev
is limited by \fBzfs_vdev_queue_depth_pct\fR. is limited by \fBzfs_vdev_queue_depth_pct\fR.
@ -2594,7 +2595,7 @@ Default value: \fB0\fR.
.ad .ad
.RS 12n .RS 12n
Percentage of online CPUs (or CPU cores, etc) which will run a worker thread Percentage of online CPUs (or CPU cores, etc) which will run a worker thread
for IO. These workers are responsible for IO work such as compression and for I/O. These workers are responsible for I/O work such as compression and
checksum calculations. Fractional number of CPUs will be rounded down. checksum calculations. Fractional number of CPUs will be rounded down.
.sp .sp
The default value of 75 was chosen to avoid using all CPUs which can result in The default value of 75 was chosen to avoid using all CPUs which can result in

View File

@ -182,7 +182,7 @@
.Nm .Nm
.Cm status .Cm status
.Oo Fl c Ar SCRIPT Oc .Oo Fl c Ar SCRIPT Oc
.Op Fl gLPvxD .Op Fl DgLpPsvx
.Op Fl T Sy u Ns | Ns Sy d .Op Fl T Sy u Ns | Ns Sy d
.Oo Ar pool Oc Ns ... .Oo Ar pool Oc Ns ...
.Op Ar interval Op Ar count .Op Ar interval Op Ar count
@ -2167,7 +2167,7 @@ and automatically import it.
.Nm .Nm
.Cm status .Cm status
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ... .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ...
.Op Fl gLPvxD .Op Fl DgLpPsvx
.Op Fl T Sy u Ns | Ns Sy d .Op Fl T Sy u Ns | Ns Sy d
.Oo Ar pool Oc Ns ... .Oo Ar pool Oc Ns ...
.Op Ar interval Op Ar count .Op Ar interval Op Ar count
@ -2203,6 +2203,8 @@ Display real paths for vdevs resolving all symbolic links. This can
be used to look up the current block device name regardless of the be used to look up the current block device name regardless of the
.Pa /dev/disk/ .Pa /dev/disk/
path used to open it. path used to open it.
.It Fl p
Display numbers in parsable (exact) values.
.It Fl P .It Fl P
Display full paths for vdevs instead of only the last component of Display full paths for vdevs instead of only the last component of
the path. This can be used in conjunction with the the path. This can be used in conjunction with the
@ -2214,6 +2216,12 @@ Display a histogram of deduplication statistics, showing the allocated
and referenced and referenced
.Pq logically referenced in the pool .Pq logically referenced in the pool
block counts and sizes by reference count. block counts and sizes by reference count.
.It Fl s
Display the number of leaf VDEV slow IOs. This is the number of IOs that
didn't complete in \fBzio_slow_io_ms\fR milliseconds (default 30 seconds).
This does not necessarily mean the IOs failed to complete, just took an
unreasonably long amount of time. This may indicate a problem with the
underlying storage.
.It Fl T Sy u Ns | Ns Sy d .It Fl T Sy u Ns | Ns Sy d
Display a time stamp. Display a time stamp.
Specify Specify

View File

@ -77,14 +77,14 @@ int vdev_validate_skip = B_FALSE;
int vdev_dtl_sm_blksz = (1 << 12); int vdev_dtl_sm_blksz = (1 << 12);
/* /*
* Rate limit delay events to this many IO delays per second. * Rate limit slow IO (delay) events to this many per second.
*/ */
unsigned int zfs_delays_per_second = 20; unsigned int zfs_slow_io_events_per_second = 20;
/* /*
* Rate limit checksum events after this many checksum errors per second. * Rate limit checksum events after this many checksum errors per second.
*/ */
unsigned int zfs_checksums_per_second = 20; unsigned int zfs_checksum_events_per_second = 20;
/* /*
* Ignore errors during scrub/resilver. Allows to work around resilver * Ignore errors during scrub/resilver. Allows to work around resilver
@ -507,8 +507,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
* and checksum events so that we don't overwhelm ZED with thousands * and checksum events so that we don't overwhelm ZED with thousands
* of events when a disk is acting up. * of events when a disk is acting up.
*/ */
zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_delays_per_second, 1); zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksums_per_second, 1); 1);
zfs_ratelimit_init(&vd->vdev_checksum_rl,
&zfs_checksum_events_per_second, 1);
list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_state_dirty_node);
@ -3591,6 +3593,7 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_checksum_errors = 0;
vd->vdev_stat.vs_slow_ios = 0;
for (int c = 0; c < vd->vdev_children; c++) for (int c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]); vdev_clear(spa, vd->vdev_child[c]);
@ -4630,12 +4633,12 @@ module_param(vdev_ms_count_limit, int, 0644);
MODULE_PARM_DESC(vdev_ms_count_limit, MODULE_PARM_DESC(vdev_ms_count_limit,
"Practical upper limit of total metaslabs per top-level vdev"); "Practical upper limit of total metaslabs per top-level vdev");
module_param(zfs_delays_per_second, uint, 0644); module_param(zfs_slow_io_events_per_second, uint, 0644);
MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many " MODULE_PARM_DESC(zfs_slow_io_events_per_second,
"IO delays per second"); "Rate limit slow IO (delay) events to this many per second");
module_param(zfs_checksums_per_second, uint, 0644); module_param(zfs_checksum_events_per_second, uint, 0644);
MODULE_PARM_DESC(zfs_checksums_per_second, "Rate limit checksum events " MODULE_PARM_DESC(zfs_checksum_events_per_second, "Rate limit checksum events "
"to this many checksum errors per second (do not set below zed" "to this many checksum errors per second (do not set below zed"
"threshold)."); "threshold).");

View File

@ -347,6 +347,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB], vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB])); ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));
/* IO delays */
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
/* Add extended stats nvlist to main nvlist */ /* Add extended stats nvlist to main nvlist */
fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);

View File

@ -140,7 +140,10 @@ zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
return (rc); return (rc);
} }
static void /*
* Return B_TRUE if the event actually posted, B_FALSE if not.
*/
static boolean_t
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
zio_t *zio, uint64_t stateoroffset, uint64_t size) zio_t *zio, uint64_t stateoroffset, uint64_t size)
@ -150,78 +153,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
uint64_t ena; uint64_t ena;
char class[64]; char class[64];
/* if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
* If we are doing a spa_tryimport() or in recovery mode, return (B_FALSE);
* ignore errors.
*/
if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
spa_load_state(spa) == SPA_LOAD_RECOVER)
return;
/*
* If we are in the middle of opening a pool, and the previous attempt
* failed, don't bother logging any new ereports - we're just going to
* get the same diagnosis anyway.
*/
if (spa_load_state(spa) != SPA_LOAD_NONE &&
spa->spa_last_open_failed)
return;
if (zio != NULL) {
/*
* If this is not a read or write zio, ignore the error. This
* can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
*/
if (zio->io_type != ZIO_TYPE_READ &&
zio->io_type != ZIO_TYPE_WRITE)
return;
if (vd != NULL) {
/*
* If the vdev has already been marked as failing due
* to a failed probe, then ignore any subsequent I/O
* errors, as the DE will automatically fault the vdev
* on the first such failure. This also catches cases
* where vdev_remove_wanted is set and the device has
* not yet been asynchronously placed into the REMOVED
* state.
*/
if (zio->io_vd == vd && !vdev_accessible(vd, zio))
return;
/*
* Ignore checksum errors for reads from DTL regions of
* leaf vdevs.
*/
if (zio->io_type == ZIO_TYPE_READ &&
zio->io_error == ECKSUM &&
vd->vdev_ops->vdev_op_leaf &&
vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
return;
}
}
/*
* For probe failure, we want to avoid posting ereports if we've
* already removed the device in the meantime.
*/
if (vd != NULL &&
strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
(vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
return;
if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
(zio != NULL) && (!zio->io_timestamp)) {
/* Ignore bogus delay events */
return;
}
if ((ereport = fm_nvlist_create(NULL)) == NULL) if ((ereport = fm_nvlist_create(NULL)) == NULL)
return; return (B_FALSE);
if ((detector = fm_nvlist_create(NULL)) == NULL) { if ((detector = fm_nvlist_create(NULL)) == NULL) {
fm_nvlist_destroy(ereport, FM_NVA_FREE); fm_nvlist_destroy(ereport, FM_NVA_FREE);
return; return (B_FALSE);
} }
/* /*
@ -332,7 +272,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
DATA_TYPE_UINT64, vs->vs_write_errors, DATA_TYPE_UINT64, vs->vs_write_errors,
FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
DATA_TYPE_UINT64, vs->vs_checksum_errors, NULL); DATA_TYPE_UINT64, vs->vs_checksum_errors,
FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
DATA_TYPE_UINT64, vs->vs_slow_ios,
NULL);
} }
if (pvd != NULL) { if (pvd != NULL) {
@ -427,7 +370,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
/* /*
* Payload for I/Os with corresponding logical information. * Payload for I/Os with corresponding logical information.
*/ */
if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
fm_payload_set(ereport, fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
DATA_TYPE_UINT64, zb->zb_objset, DATA_TYPE_UINT64, zb->zb_objset,
@ -437,11 +380,13 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_INT64, zb->zb_level, DATA_TYPE_INT64, zb->zb_level,
FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
DATA_TYPE_UINT64, zb->zb_blkid, NULL); DATA_TYPE_UINT64, zb->zb_blkid, NULL);
}
mutex_exit(&spa->spa_errlist_lock); mutex_exit(&spa->spa_errlist_lock);
*ereport_out = ereport; *ereport_out = ereport;
*detector_out = detector; *detector_out = detector;
return (B_TRUE);
} }
/* if it's <= 128 bytes, save the corruption directly */ /* if it's <= 128 bytes, save the corruption directly */
@ -765,27 +710,111 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
} }
#endif #endif
void /*
* Make sure our event is still valid for the given zio/vdev/pool. For example,
* we don't want to keep logging events for a faulted or missing vdev.
*/
boolean_t
zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
{
#ifdef _KERNEL
/*
* If we are doing a spa_tryimport() or in recovery mode,
* ignore errors.
*/
if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
spa_load_state(spa) == SPA_LOAD_RECOVER)
return (B_FALSE);
/*
* If we are in the middle of opening a pool, and the previous attempt
* failed, don't bother logging any new ereports - we're just going to
* get the same diagnosis anyway.
*/
if (spa_load_state(spa) != SPA_LOAD_NONE &&
spa->spa_last_open_failed)
return (B_FALSE);
if (zio != NULL) {
/*
* If this is not a read or write zio, ignore the error. This
* can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
*/
if (zio->io_type != ZIO_TYPE_READ &&
zio->io_type != ZIO_TYPE_WRITE)
return (B_FALSE);
if (vd != NULL) {
/*
* If the vdev has already been marked as failing due
* to a failed probe, then ignore any subsequent I/O
* errors, as the DE will automatically fault the vdev
* on the first such failure. This also catches cases
* where vdev_remove_wanted is set and the device has
* not yet been asynchronously placed into the REMOVED
* state.
*/
if (zio->io_vd == vd && !vdev_accessible(vd, zio))
return (B_FALSE);
/*
* Ignore checksum errors for reads from DTL regions of
* leaf vdevs.
*/
if (zio->io_type == ZIO_TYPE_READ &&
zio->io_error == ECKSUM &&
vd->vdev_ops->vdev_op_leaf &&
vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
return (B_FALSE);
}
}
/*
* For probe failure, we want to avoid posting ereports if we've
* already removed the device in the meantime.
*/
if (vd != NULL &&
strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
(vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
return (B_FALSE);
/* Ignore bogus delay events (like from ioctls or unqueued IOs) */
if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
(zio != NULL) && (!zio->io_timestamp)) {
return (B_FALSE);
}
#endif
return (B_TRUE);
}
/*
* Return 0 if event was posted, EINVAL if there was a problem posting it or
* EBUSY if the event was rate limited.
*/
int
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
uint64_t size) uint64_t size)
{ {
int rc = 0;
#ifdef _KERNEL #ifdef _KERNEL
nvlist_t *ereport = NULL; nvlist_t *ereport = NULL;
nvlist_t *detector = NULL; nvlist_t *detector = NULL;
if (zfs_is_ratelimiting_event(subclass, vd)) if (zfs_is_ratelimiting_event(subclass, vd))
return; return (SET_ERROR(EBUSY));
zfs_ereport_start(&ereport, &detector, subclass, spa, vd, if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
zb, zio, stateoroffset, size); zb, zio, stateoroffset, size))
return (SET_ERROR(EINVAL)); /* couldn't post event */
if (ereport == NULL) if (ereport == NULL)
return; return (SET_ERROR(EINVAL));
/* Cleanup is handled by the callback function */ /* Cleanup is handled by the callback function */
zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
#endif #endif
return (rc);
} }
void void
@ -795,7 +824,6 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
{ {
zio_cksum_report_t *report; zio_cksum_report_t *report;
#ifdef _KERNEL #ifdef _KERNEL
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
return; return;
@ -874,30 +902,34 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
} }
void int
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length, struct zio *zio, uint64_t offset, uint64_t length,
const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
{ {
int rc = 0;
#ifdef _KERNEL #ifdef _KERNEL
nvlist_t *ereport = NULL; nvlist_t *ereport = NULL;
nvlist_t *detector = NULL; nvlist_t *detector = NULL;
zfs_ecksum_info_t *info; zfs_ecksum_info_t *info;
zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
spa, vd, zb, zio, offset, length); return (EBUSY);
if (ereport == NULL) if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
return; spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
return (SET_ERROR(EINVAL));
}
info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
B_FALSE); B_FALSE);
if (info != NULL) { if (info != NULL) {
zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
kmem_free(info, sizeof (*info)); kmem_free(info, sizeof (*info));
} }
#endif #endif
return (rc);
} }
/* /*
@ -1043,6 +1075,7 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
#if defined(_KERNEL) #if defined(_KERNEL)
EXPORT_SYMBOL(zfs_ereport_post); EXPORT_SYMBOL(zfs_ereport_post);
EXPORT_SYMBOL(zfs_ereport_is_valid);
EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_ereport_post_checksum);
EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_remove);
EXPORT_SYMBOL(zfs_post_autoreplace); EXPORT_SYMBOL(zfs_post_autoreplace);

View File

@ -77,7 +77,8 @@ uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
#endif #endif
int zio_delay_max = ZIO_DELAY_MAX; /* Mark IOs as "slow" if they take longer than 30 seconds */
int zio_slow_io_ms = (30 * MILLISEC);
#define BP_SPANB(indblkshift, level) \ #define BP_SPANB(indblkshift, level) \
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
@ -4431,10 +4432,28 @@ zio_done(zio_t *zio)
* 30 seconds to complete, post an error described the I/O delay. * 30 seconds to complete, post an error described the I/O delay.
* We ignore these errors if the device is currently unavailable. * We ignore these errors if the device is currently unavailable.
*/ */
if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) { if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, /*
zio->io_vd, &zio->io_bookmark, zio, 0, 0); * We want to only increment our slow IO counters if
* the IO is valid (i.e. not if the drive is removed).
*
* zfs_ereport_post() will also do these checks, but
* it can also ratelimit and have other failures, so we
* need to increment the slow_io counters independent
* of it.
*/
if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
zio->io_spa, zio->io_vd, zio)) {
mutex_enter(&zio->io_vd->vdev_stat_lock);
zio->io_vd->vdev_stat.vs_slow_ios++;
mutex_exit(&zio->io_vd->vdev_stat_lock);
zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
zio->io_spa, zio->io_vd, &zio->io_bookmark,
zio, 0, 0);
}
}
} }
if (zio->io_error) { if (zio->io_error) {
@ -4823,8 +4842,9 @@ EXPORT_SYMBOL(zio_data_buf_alloc);
EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_buf_free);
EXPORT_SYMBOL(zio_data_buf_free); EXPORT_SYMBOL(zio_data_buf_free);
module_param(zio_delay_max, int, 0644); module_param(zio_slow_io_ms, int, 0644);
MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event"); MODULE_PARM_DESC(zio_slow_io_ms,
"Max I/O completion time (milliseconds) before marking it as slow");
module_param(zio_requeue_io_start_cut_in_line, int, 0644); module_param(zio_requeue_io_start_cut_in_line, int, 0644);
MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O"); MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");

View File

@ -546,7 +546,7 @@ tags = ['functional', 'exec']
tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos', tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos',
'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple', 'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple',
'auto_spare_shared', 'scrub_after_resilver', 'decrypt_fault', 'auto_spare_shared', 'scrub_after_resilver', 'decrypt_fault',
'decompress_fault'] 'decompress_fault', 'zpool_status_-s']
tags = ['functional', 'fault'] tags = ['functional', 'fault']
[tests/functional/features/async_destroy] [tests/functional/features/async_destroy]

View File

@ -11,7 +11,8 @@ dist_pkgdata_SCRIPTS = \
auto_spare_shared.ksh \ auto_spare_shared.ksh \
decrypt_fault.ksh \ decrypt_fault.ksh \
decompress_fault.ksh \ decompress_fault.ksh \
scrub_after_resilver.ksh scrub_after_resilver.ksh \
zpool_status_-s.ksh
dist_pkgdata_DATA = \ dist_pkgdata_DATA = \
fault.cfg fault.cfg

View File

@ -0,0 +1,77 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
#
# DESCRIPTION:
# Verify zpool status -s (slow IOs) works
#
# STRATEGY:
# 1. Create a file
# 2. Inject slow IOs into the pool
# 3. Verify we can see the slow IOs with "zpool status -s".
# 4. Verify we can see delay events.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/include/zpool_script.shlib

DISK=${DISKS%% *}

verify_runnable "both"

log_must zpool create $TESTPOOL mirror ${DISKS}

function cleanup
{
	# Clear injected faults and restore the tunables saved below.
	log_must zinject -c all
	log_must set_tunable64 zio_slow_io_ms $OLD_SLOW_IO
	log_must set_tunable64 zfs_slow_io_events_per_second $OLD_SLOW_IO_EVENTS
	log_must destroy_pool $TESTPOOL
}

log_onexit cleanup

# Start from an empty event log so the delay-event count below is ours.
log_must zpool events -c

# Mark any IOs greater than 10ms as slow IOs
OLD_SLOW_IO=$(get_tunable zio_slow_io_ms)
OLD_SLOW_IO_EVENTS=$(get_tunable zfs_slow_io_events_per_second)
log_must set_tunable64 zio_slow_io_ms 10
log_must set_tunable64 zfs_slow_io_events_per_second 1000

# Create 20ms IOs
log_must zinject -d $DISK -D20:100 $TESTPOOL
log_must mkfile 1048576 /$TESTPOOL/testfile
log_must zpool sync $TESTPOOL

log_must zinject -c all

# -p prints exact (parseable) counts; SLOW is the 6th status column.
# Default to 0 when grep/awk produce nothing, so the numeric comparison
# below reports a clean log_fail instead of a ksh "[ -gt 0 ]" error.
SLOW_IOS=$(zpool status -sp | grep "$DISK" | awk '{print $6}')
SLOW_IOS=${SLOW_IOS:-0}
DELAY_EVENTS=$(zpool events | grep delay | wc -l)

if [ $SLOW_IOS -gt 0 ] && [ $DELAY_EVENTS -gt 0 ] ; then
	log_pass "Correctly saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events"
else
	log_fail "Only saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events"
fi

View File

@ -410,7 +410,7 @@ function get_system_config
zfs_vdev_async_write_max_active \ zfs_vdev_async_write_max_active \
zfs_vdev_sync_read_max_active \ zfs_vdev_sync_read_max_active \
zfs_vdev_sync_write_max_active \ zfs_vdev_sync_write_max_active \
zio_delay_max zio_slow_io_ms
do do
if [ "$tunable" != "zfs_arc_max" ] if [ "$tunable" != "zfs_arc_max" ]
then then