Mirror of https://git.proxmox.com/git/mirror_zfs.git (synced 2025-10-26 18:05:04 +03:00)
	Add 'zpool status -e' flag to see unhealthy vdevs
When very large pools are present, it can be laborious to find
reasons for why a pool is degraded and/or where an unhealthy vdev
is. This option filters out vdevs that are ONLINE and with no errors
to make it easier to see where the issues are. Root and parents of
unhealthy vdevs will always be printed.
Testing:
ZFS errors and drive failures for multiple vdevs were simulated with
zinject.
Sample vdev listings with '-e' option
- All vdevs healthy
    NAME        STATE     READ WRITE CKSUM
    iron5       ONLINE       0     0     0
- ZFS errors
    NAME        STATE     READ WRITE CKSUM
    iron5       ONLINE       0     0     0
      raidz2-5  ONLINE       1     0     0
        L23     ONLINE       1     0     0
        L24     ONLINE       1     0     0
        L37     ONLINE       1     0     0
- Vdev faulted
    NAME        STATE     READ WRITE CKSUM
    iron5       DEGRADED     0     0     0
      raidz2-6  DEGRADED     0     0     0
        L67     FAULTED      0     0     0  too many errors
- Vdev faults and data errors
    NAME        STATE     READ WRITE CKSUM
    iron5       DEGRADED     0     0     0
      raidz2-1  DEGRADED     0     0     0
        L2      FAULTED      0     0     0  too many errors
      raidz2-5  ONLINE       1     0     0
        L23     ONLINE       1     0     0
        L24     ONLINE       1     0     0
        L37     ONLINE       1     0     0
      raidz2-6  DEGRADED     0     0     0
        L67     FAULTED      0     0     0  too many errors
- Vdev missing
    NAME        STATE     READ WRITE CKSUM
    iron5       DEGRADED     0     0     0
      raidz2-6  DEGRADED     0     0     0
        L67     UNAVAIL      3     1     0
- Slow devices when -s provided with -e
    NAME        STATE     READ WRITE CKSUM  SLOW
    iron5       DEGRADED     0     0     0     -
      raidz2-5  DEGRADED     0     0     0     -
        L10     FAULTED      0     0     0     0  external device fault
        L51     ONLINE       0     0     0    14
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Cameron Harr <harr1@llnl.gov>
Closes #15769
			
			
This commit is contained in: commit 40e20d808c (parent 9bb8d26bd5)
				| @ -2161,6 +2161,7 @@ typedef struct status_cbdata { | ||||
| 	boolean_t	cb_explain; | ||||
| 	boolean_t	cb_first; | ||||
| 	boolean_t	cb_dedup_stats; | ||||
| 	boolean_t	cb_print_unhealthy; | ||||
| 	boolean_t	cb_print_status; | ||||
| 	boolean_t	cb_print_slow_ios; | ||||
| 	boolean_t	cb_print_vdev_init; | ||||
| @ -2357,6 +2358,35 @@ health_str_to_color(const char *health) | ||||
| 	return (NULL); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Called for each leaf vdev.  Returns 0 if the vdev is healthy. | ||||
|  * A vdev is unhealthy if any of the following are true: | ||||
|  * 1) there are read, write, or checksum errors, | ||||
|  * 2) its state is not ONLINE, or | ||||
|  * 3) slow IO reporting was requested (-s) and there are slow IOs. | ||||
|  */ | ||||
| static int | ||||
| vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data) | ||||
| { | ||||
| 	status_cbdata_t *cb = data; | ||||
| 	vdev_stat_t *vs; | ||||
| 	uint_t vsc; | ||||
| 	(void) hdl_data; | ||||
| 
 | ||||
| 	if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, | ||||
| 	    (uint64_t **)&vs, &vsc) != 0) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	if (vs->vs_checksum_errors || vs->vs_read_errors || | ||||
| 	    vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	if (cb->cb_print_slow_ios && vs->vs_slow_ios) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Print out configuration state as requested by status_callback. | ||||
|  */ | ||||
| @ -2375,7 +2405,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, | ||||
| 	const char *state; | ||||
| 	const char *type; | ||||
| 	const char *path = NULL; | ||||
| 	const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; | ||||
| 	const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL, | ||||
| 	    *scolor = NULL; | ||||
| 
 | ||||
| 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, | ||||
| 	    &child, &children) != 0) | ||||
| @ -2402,6 +2433,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, | ||||
| 			state = gettext("AVAIL"); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If '-e' is specified then top-level vdevs and their children | ||||
| 	 * can be pruned if all of their leaves are healthy. | ||||
| 	 */ | ||||
| 	if (cb->cb_print_unhealthy && depth > 0 && | ||||
| 	    for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) { | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	printf_color(health_str_to_color(state), | ||||
| 	    "\t%*s%-*s  %-8s", depth, "", cb->cb_namewidth - depth, | ||||
| 	    name, state); | ||||
| @ -2416,6 +2456,9 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, | ||||
| 		if (vs->vs_checksum_errors) | ||||
| 			ccolor = ANSI_RED; | ||||
| 
 | ||||
| 		if (vs->vs_slow_ios) | ||||
| 			scolor = ANSI_BLUE; | ||||
| 
 | ||||
| 		if (cb->cb_literal) { | ||||
| 			fputc(' ', stdout); | ||||
| 			printf_color(rcolor, "%5llu", | ||||
| @ -2448,9 +2491,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, | ||||
| 			} | ||||
| 
 | ||||
| 			if (cb->cb_literal) | ||||
| 				printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); | ||||
| 				printf_color(scolor, " %5llu", | ||||
| 				    (u_longlong_t)vs->vs_slow_ios); | ||||
| 			else | ||||
| 				printf(" %5s", rbuf); | ||||
| 				printf_color(scolor, " %5s", rbuf); | ||||
| 		} | ||||
| 		if (cb->cb_print_power) { | ||||
| 			if (children == 0)  { | ||||
| @ -8999,9 +9043,11 @@ status_callback(zpool_handle_t *zhp, void *data) | ||||
| 				(void) printf(gettext( | ||||
| 				    "errors: No known data errors\n")); | ||||
| 			} else if (!cbp->cb_verbose) { | ||||
| 				color_start(ANSI_RED); | ||||
| 				(void) printf(gettext("errors: %llu data " | ||||
| 				    "errors, use '-v' for a list\n"), | ||||
| 				    (u_longlong_t)nerr); | ||||
| 				color_end(); | ||||
| 			} else { | ||||
| 				print_error_log(zhp); | ||||
| 			} | ||||
| @ -9022,6 +9068,7 @@ status_callback(zpool_handle_t *zhp, void *data) | ||||
|  *              [pool] [interval [count]] | ||||
|  * | ||||
|  *	-c CMD	For each vdev, run command CMD | ||||
|  *	-e	Display only unhealthy vdevs | ||||
|  *	-i	Display vdev initialization status. | ||||
|  *	-g	Display guid for individual vdev name. | ||||
|  *	-L	Follow links when resolving vdev path name. | ||||
| @ -9053,7 +9100,7 @@ zpool_do_status(int argc, char **argv) | ||||
| 	}; | ||||
| 
 | ||||
| 	/* check options */ | ||||
| 	while ((c = getopt_long(argc, argv, "c:igLpPsvxDtT:", long_options, | ||||
| 	while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options, | ||||
| 	    NULL)) != -1) { | ||||
| 		switch (c) { | ||||
| 		case 'c': | ||||
| @ -9080,6 +9127,9 @@ zpool_do_status(int argc, char **argv) | ||||
| 			} | ||||
| 			cmd = optarg; | ||||
| 			break; | ||||
| 		case 'e': | ||||
| 			cb.cb_print_unhealthy = B_TRUE; | ||||
| 			break; | ||||
| 		case 'i': | ||||
| 			cb.cb_print_vdev_init = B_TRUE; | ||||
| 			break; | ||||
|  | ||||
| @ -36,7 +36,7 @@ | ||||
| .Sh SYNOPSIS | ||||
| .Nm zpool | ||||
| .Cm status | ||||
| .Op Fl DigLpPstvx | ||||
| .Op Fl DeigLpPstvx | ||||
| .Op Fl T Sy u Ns | Ns Sy d | ||||
| .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … | ||||
| .Oo Ar pool Oc Ns … | ||||
| @ -69,6 +69,8 @@ See the | ||||
| option of | ||||
| .Nm zpool Cm iostat | ||||
| for complete details. | ||||
| .It Fl e | ||||
| Only show unhealthy vdevs (not-ONLINE or with errors). | ||||
| .It Fl i | ||||
| Display vdev initialization status. | ||||
| .It Fl g | ||||
|  | ||||
| @ -535,7 +535,8 @@ tags = ['functional', 'cli_root', 'zpool_split'] | ||||
| tests = ['zpool_status_001_pos', 'zpool_status_002_pos', | ||||
|     'zpool_status_003_pos', 'zpool_status_004_pos', | ||||
|     'zpool_status_005_pos', 'zpool_status_006_pos', | ||||
|     'zpool_status_007_pos', 'zpool_status_features_001_pos'] | ||||
|     'zpool_status_007_pos', 'zpool_status_008_pos', | ||||
|     'zpool_status_features_001_pos'] | ||||
| tags = ['functional', 'cli_root', 'zpool_status'] | ||||
| 
 | ||||
| [tests/functional/cli_root/zpool_sync] | ||||
|  | ||||
| @ -1238,6 +1238,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ | ||||
| 	functional/cli_root/zpool_status/zpool_status_005_pos.ksh \
 | ||||
| 	functional/cli_root/zpool_status/zpool_status_006_pos.ksh \
 | ||||
| 	functional/cli_root/zpool_status/zpool_status_007_pos.ksh \
 | ||||
| 	functional/cli_root/zpool_status/zpool_status_008_pos.ksh \
 | ||||
| 	functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \
 | ||||
| 	functional/cli_root/zpool_sync/cleanup.ksh \
 | ||||
| 	functional/cli_root/zpool_sync/setup.ksh \
 | ||||
|  | ||||
| @ -51,7 +51,7 @@ else | ||||
| fi | ||||
| 
 | ||||
| set -A args "" "-x" "-v" "-x $testpool" "-v $testpool" "-xv $testpool" \ | ||||
| 	"-vx $testpool" | ||||
| 	"-vx $testpool" "-e $testpool" "-es $testpool" | ||||
| 
 | ||||
| log_assert "Executing 'zpool status' with correct options succeeds" | ||||
| 
 | ||||
| @ -64,4 +64,6 @@ while [[ $i -lt ${#args[*]} ]]; do | ||||
| 	(( i = i + 1 )) | ||||
| done | ||||
| 
 | ||||
| cleanup  | ||||
| 
 | ||||
| log_pass "'zpool status' with correct options succeeded" | ||||
|  | ||||
| @ -37,6 +37,7 @@ | ||||
| # 3. Read the file | ||||
| # 4. Take a snapshot and make a clone | ||||
| # 5. Verify we see "snapshot, clone and filesystem" output in 'zpool status -v' | ||||
| #      and 'zpool status -ev' | ||||
| 
 | ||||
| function cleanup | ||||
| { | ||||
| @ -68,6 +69,7 @@ log_must zpool status -v $TESTPOOL2 | ||||
| log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'" | ||||
| log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'" | ||||
| log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'" | ||||
| log_must eval "zpool status -ev | grep '$TESTPOOL2/10m_file'" | ||||
| log_mustnot eval "zpool status -v | grep '$TESTFS1'" | ||||
| 
 | ||||
| log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone" | ||||
|  | ||||
							
								
								
									
										104
									
								
								tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										104
									
								
								tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,104 @@ | ||||
#!/bin/ksh -p

#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source.  A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#

#
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
#

. $STF_SUITE/include/libtest.shlib

#
# DESCRIPTION:
# Verify 'zpool status -e' only shows unhealthy devices.
#
# STRATEGY:
# 1. Create zpool
# 2. Force DEGRADE, FAULT, or inject slow IOs for vdevs
# 3. Verify vdevs are reported correctly with -e and -s
# 4. Verify parents are reported as DEGRADED
# 5. Verify healthy children are not reported
#

# Restore the slow-IO threshold, clear injections, and remove the
# pool and its backing files regardless of how the test exits.
function cleanup
{
	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
	zinject -c all
	poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
	log_must rm -f $all_vdevs
}

log_assert "Verify 'zpool status -e'"

log_onexit cleanup

# Six sparse file vdevs back the test pool.
all_vdevs=$(echo $TESTDIR/vdev{1..6})
log_must mkdir -p $TESTDIR
log_must truncate -s $MINVDEVSIZE $all_vdevs

# Save the current slow-IO threshold so cleanup can restore it.
OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)

# Exercise both a dRAID and a raidz2 topology.
for raid_type in "draid2:3d:6c:1s" "raidz2"; do

	log_must zpool create -f $TESTPOOL2 $raid_type $all_vdevs

	# Check DEGRADED vdevs are shown.
	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev4 "ONLINE"
	log_must zinject -d $TESTDIR/vdev4 -A degrade $TESTPOOL2
	log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev4 | grep DEGRADED"

	# Check FAULTED vdevs are shown.
	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev5 "ONLINE"
	log_must zinject -d $TESTDIR/vdev5 -A fault $TESTPOOL2
	log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev5 | grep FAULTED"

	# Check no ONLINE vdevs are shown
	log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE"

	# Check no ONLINE slow vdevs are show.  Then mark IOs greater than
	# 10ms slow, delay IOs 20ms to vdev6, check slow IOs.
	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE"
	log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE"

	log_must set_tunable64 ZIO_SLOW_IO_MS 10
	log_must zinject -d $TESTDIR/vdev6 -D20:100 $TESTPOOL2
	log_must mkfile 1048576 /$TESTPOOL2/testfile
	sync_pool $TESTPOOL2
	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO

	# Check vdev6 slow IOs are only shown when requested with -s.
	log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"
	log_must eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"

	# Pool level and top-vdev level status must be DEGRADED.
	log_must eval "zpool status -e $TESTPOOL2 | grep $TESTPOOL2 | grep DEGRADED"
	log_must eval "zpool status -e $TESTPOOL2 | grep $raid_type | grep DEGRADED"

	# Check that healthy vdevs[1-3] aren't shown with -e.
	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev1 "ONLINE"
	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev2 "ONLINE"
	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev3 "ONLINE"
	log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev1 | grep ONLINE"
	log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE"
	log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE"

	# With all injections cleared, -es must still succeed cleanly.
	log_must zinject -c all
	log_must zpool status -es $TESTPOOL2

	zpool destroy $TESTPOOL2
done

log_pass "Verify zpool status -e shows only unhealthy vdevs"
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Cameron Harr
						Cameron Harr