Add 'zpool status -e' flag to see unhealthy vdevs

When very large pools are present, it can be laborious to find
reasons for why a pool is degraded and/or where an unhealthy vdev
is. This option filters out vdevs that are ONLINE and with no errors
to make it easier to see where the issues are. Root and parents of
unhealthy vdevs will always be printed.

Testing:
ZFS errors and drive failures for multiple vdevs were simulated with
zinject.

Sample vdev listings with '-e' option
- All vdevs healthy
    NAME        STATE     READ WRITE CKSUM
    iron5       ONLINE       0     0     0

- ZFS errors
    NAME        STATE     READ WRITE CKSUM
    iron5       ONLINE       0     0     0
      raidz2-5  ONLINE       1     0     0
        L23     ONLINE       1     0     0
        L24     ONLINE       1     0     0
        L37     ONLINE       1     0     0

- Vdev faulted
    NAME        STATE     READ WRITE CKSUM
    iron5       DEGRADED     0     0     0
      raidz2-6  DEGRADED     0     0     0
        L67     FAULTED      0     0     0  too many errors

- Vdev faults and data errors
    NAME        STATE     READ WRITE CKSUM
    iron5       DEGRADED     0     0     0
      raidz2-1  DEGRADED     0     0     0
        L2      FAULTED      0     0     0  too many errors
      raidz2-5  ONLINE       1     0     0
        L23     ONLINE       1     0     0
        L24     ONLINE       1     0     0
        L37     ONLINE       1     0     0
      raidz2-6  DEGRADED     0     0     0
        L67     FAULTED      0     0     0  too many errors

- Vdev missing
    NAME        STATE     READ WRITE CKSUM
    iron5       DEGRADED     0     0     0
      raidz2-6  DEGRADED     0     0     0
        L67     UNAVAIL      3     1     0

- Slow devices when -s provided with -e
    NAME        STATE     READ WRITE CKSUM  SLOW
    iron5       DEGRADED     0     0     0     -
      raidz2-5  DEGRADED     0     0     0     -
        L10     FAULTED      0     0     0     0  external device fault
        L51     ONLINE       0     0     0    14

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Cameron Harr <harr1@llnl.gov>
Closes #15769
This commit is contained in:
Cameron Harr
2024-02-07 09:12:12 -08:00
committed by Tony Hutter
parent 9bb8d26bd5
commit 40e20d808c
7 changed files with 169 additions and 7 deletions
+54 -4
View File
@@ -2161,6 +2161,7 @@ typedef struct status_cbdata {
boolean_t cb_explain;
boolean_t cb_first;
boolean_t cb_dedup_stats;
boolean_t cb_print_unhealthy;
boolean_t cb_print_status;
boolean_t cb_print_slow_ios;
boolean_t cb_print_vdev_init;
@@ -2357,6 +2358,35 @@ health_str_to_color(const char *health)
return (NULL);
}
/*
* Called for each leaf vdev. Returns 0 if the vdev is healthy.
* A vdev is unhealthy if any of the following are true:
* 1) there are read, write, or checksum errors,
* 2) its state is not ONLINE, or
* 3) slow IO reporting was requested (-s) and there are slow IOs.
*/
static int
vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data)
{
status_cbdata_t *cb = data;
vdev_stat_t *vs;
uint_t vsc;
(void) hdl_data;
if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) != 0)
return (1);
if (vs->vs_checksum_errors || vs->vs_read_errors ||
vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY)
return (1);
if (cb->cb_print_slow_ios && vs->vs_slow_ios)
return (1);
return (0);
}
/*
* Print out configuration state as requested by status_callback.
*/
@@ -2375,7 +2405,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
const char *state;
const char *type;
const char *path = NULL;
const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL;
const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL,
*scolor = NULL;
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
@@ -2402,6 +2433,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
state = gettext("AVAIL");
}
/*
* If '-e' is specified then top-level vdevs and their children
* can be pruned if all of their leaves are healthy.
*/
if (cb->cb_print_unhealthy && depth > 0 &&
for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) {
return;
}
printf_color(health_str_to_color(state),
"\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth,
name, state);
@@ -2416,6 +2456,9 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
if (vs->vs_checksum_errors)
ccolor = ANSI_RED;
if (vs->vs_slow_ios)
scolor = ANSI_BLUE;
if (cb->cb_literal) {
fputc(' ', stdout);
printf_color(rcolor, "%5llu",
@@ -2448,9 +2491,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
}
if (cb->cb_literal)
printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
printf_color(scolor, " %5llu",
(u_longlong_t)vs->vs_slow_ios);
else
printf(" %5s", rbuf);
printf_color(scolor, " %5s", rbuf);
}
if (cb->cb_print_power) {
if (children == 0) {
@@ -8999,9 +9043,11 @@ status_callback(zpool_handle_t *zhp, void *data)
(void) printf(gettext(
"errors: No known data errors\n"));
} else if (!cbp->cb_verbose) {
color_start(ANSI_RED);
(void) printf(gettext("errors: %llu data "
"errors, use '-v' for a list\n"),
(u_longlong_t)nerr);
color_end();
} else {
print_error_log(zhp);
}
@@ -9022,6 +9068,7 @@ status_callback(zpool_handle_t *zhp, void *data)
* [pool] [interval [count]]
*
* -c CMD For each vdev, run command CMD
* -e Display only unhealthy vdevs
* -i Display vdev initialization status.
* -g Display guid for individual vdev name.
* -L Follow links when resolving vdev path name.
@@ -9053,7 +9100,7 @@ zpool_do_status(int argc, char **argv)
};
/* check options */
while ((c = getopt_long(argc, argv, "c:igLpPsvxDtT:", long_options,
while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options,
NULL)) != -1) {
switch (c) {
case 'c':
@@ -9080,6 +9127,9 @@ zpool_do_status(int argc, char **argv)
}
cmd = optarg;
break;
case 'e':
cb.cb_print_unhealthy = B_TRUE;
break;
case 'i':
cb.cb_print_vdev_init = B_TRUE;
break;