mirror of https://git.proxmox.com/git/mirror_zfs.git (synced 2025-10-26 18:05:04 +03:00)
Add slow disk diagnosis to ZED

Slow disk response times can be indicative of a failing drive. ZFS
currently tracks slow I/Os (slower than zio_slow_io_ms) and generates
events (ereport.fs.zfs.delay). However, unlike for checksum or I/O
errors, ZED takes no action on these events. This change adds slow
disk diagnosis to ZED, which is opt-in using two new VDEV properties:

    VDEV_PROP_SLOW_IO_N
    VDEV_PROP_SLOW_IO_T

If multiple VDEVs in a pool are undergoing slow I/Os, then the
diagnosis skips the zpool_vdev_degrade().

Sponsored-By: OpenDrives Inc.
Sponsored-By: Klara Inc.
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Rob Wing <rob.wing@klarasystems.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #15469
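The diagnosis is opt-in per vdev. A minimal sketch of enabling it (pool and disk names are placeholders, threshold values illustrative):

	zpool set slow_io_n=10 tank sda       # degrade after 10 slow I/Os ...
	zpool set slow_io_t=30 tank sda       # ... within a 30-second window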
This commit is contained in:

parent db65272aef
commit c1c26a77ff
				| @ -22,6 +22,7 @@ | ||||
|  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * | ||||
|  * Copyright (c) 2016, Intel Corporation. | ||||
|  * Copyright (c) 2023, Klara Inc. | ||||
|  */ | ||||
| 
 | ||||
| /*
 | ||||
| @ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name) | ||||
| 	if (strcmp(name, "spare_on_remove") == 0) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0) | ||||
| 		return (10);	/* N = 10 events */ | ||||
| 
 | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| int64_t | ||||
| fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name) | ||||
| { | ||||
| 	(void) hdl; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * These can be looked up in mp->modinfo->fmdi_props | ||||
| 	 * For now we just hard code for phase 2. In the | ||||
| 	 * future, there can be a ZED based override. | ||||
| 	 */ | ||||
| 	if (strcmp(name, "remove_timeout") == 0) | ||||
| 		return (15ULL * 1000ULL * 1000ULL * 1000ULL);	/* 15 sec */ | ||||
| 
 | ||||
| 	if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0) | ||||
| 		return (1000ULL * 1000ULL * 1000ULL * 600ULL);	/* 10 min */ | ||||
| 
 | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| @ -535,6 +514,19 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name) | ||||
| 	return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL); | ||||
| } | ||||
| 
 | ||||
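| /* Returns nonzero if the named engine has fired or holds recorded events */ | ||||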
| int | ||||
| fmd_serd_active(fmd_hdl_t *hdl, const char *name) | ||||
| { | ||||
| 	fmd_module_t *mp = (fmd_module_t *)hdl; | ||||
| 	fmd_serd_eng_t *sgp; | ||||
| 
 | ||||
| 	if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { | ||||
| 		zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); | ||||
| 		return (0); | ||||
| 	} | ||||
| 	return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp)); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| fmd_serd_reset(fmd_hdl_t *hdl, const char *name) | ||||
| { | ||||
| @ -543,12 +535,10 @@ fmd_serd_reset(fmd_hdl_t *hdl, const char *name) | ||||
| 
 | ||||
| 	if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { | ||||
| 		zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); | ||||
| 		return; | ||||
| 	} else { | ||||
| 		fmd_serd_eng_reset(sgp); | ||||
| 		fmd_hdl_debug(hdl, "serd_reset %s", name); | ||||
| 	} | ||||
| 
 | ||||
| 	fmd_serd_eng_reset(sgp); | ||||
| 
 | ||||
| 	fmd_hdl_debug(hdl, "serd_reset %s", name); | ||||
| } | ||||
| 
 | ||||
| int | ||||
| @ -556,16 +546,21 @@ fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep) | ||||
| { | ||||
| 	fmd_module_t *mp = (fmd_module_t *)hdl; | ||||
| 	fmd_serd_eng_t *sgp; | ||||
| 	int err; | ||||
| 
 | ||||
| 	if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { | ||||
| 		zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'", | ||||
| 		    name); | ||||
| 		return (0); | ||||
| 	} | ||||
| 	err = fmd_serd_eng_record(sgp, ep->ev_hrt); | ||||
| 	return (err); | ||||
| 	return (fmd_serd_eng_record(sgp, ep->ev_hrt)); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| fmd_serd_gc(fmd_hdl_t *hdl) | ||||
| { | ||||
| 	fmd_module_t *mp = (fmd_module_t *)hdl; | ||||
| 
 | ||||
| 	fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL); | ||||
| } | ||||
| 
 | ||||
| /* FMD Timers */ | ||||
| @ -579,7 +574,7 @@ _timer_notify(union sigval sv) | ||||
| 	const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; | ||||
| 	struct itimerspec its; | ||||
| 
 | ||||
| 	fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid); | ||||
| 	fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid); | ||||
| 
 | ||||
| 	/* disarm the timer */ | ||||
| 	memset(&its, 0, sizeof (struct itimerspec)); | ||||
|  | ||||
| @ -151,7 +151,6 @@ extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list); | ||||
| extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...); | ||||
| 
 | ||||
| extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *); | ||||
| extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *); | ||||
| 
 | ||||
| #define	FMD_STAT_NOALLOC	0x0	/* fmd should use caller's memory */ | ||||
| #define	FMD_STAT_ALLOC		0x1	/* fmd should allocate stats memory */ | ||||
| @ -195,10 +194,12 @@ extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *); | ||||
| extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t); | ||||
| extern void fmd_serd_destroy(fmd_hdl_t *, const char *); | ||||
| extern int fmd_serd_exists(fmd_hdl_t *, const char *); | ||||
| extern int fmd_serd_active(fmd_hdl_t *, const char *); | ||||
| extern void fmd_serd_reset(fmd_hdl_t *, const char *); | ||||
| extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *); | ||||
| extern int fmd_serd_fired(fmd_hdl_t *, const char *); | ||||
| extern int fmd_serd_empty(fmd_hdl_t *, const char *); | ||||
| extern void fmd_serd_gc(fmd_hdl_t *); | ||||
| 
 | ||||
| extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t); | ||||
| extern void fmd_timer_remove(fmd_hdl_t *, id_t); | ||||
|  | ||||
| @ -310,8 +310,9 @@ fmd_serd_eng_reset(fmd_serd_eng_t *sgp) | ||||
| } | ||||
| 
 | ||||
| void | ||||
| fmd_serd_eng_gc(fmd_serd_eng_t *sgp) | ||||
| fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg) | ||||
| { | ||||
| 	(void) arg; | ||||
| 	fmd_serd_elem_t *sep, *nep; | ||||
| 	hrtime_t hrt; | ||||
| 
 | ||||
|  | ||||
| @ -77,7 +77,7 @@ extern int fmd_serd_eng_fired(fmd_serd_eng_t *); | ||||
| extern int fmd_serd_eng_empty(fmd_serd_eng_t *); | ||||
| 
 | ||||
| extern void fmd_serd_eng_reset(fmd_serd_eng_t *); | ||||
| extern void fmd_serd_eng_gc(fmd_serd_eng_t *); | ||||
| extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *); | ||||
| 
 | ||||
| #ifdef	__cplusplus | ||||
| } | ||||
|  | ||||
| @ -23,6 +23,7 @@ | ||||
|  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved. | ||||
|  * Copyright (c) 2016, Intel Corporation. | ||||
|  * Copyright (c) 2023, Klara Inc. | ||||
|  */ | ||||
| 
 | ||||
| #include <stddef.h> | ||||
| @ -47,11 +48,16 @@ | ||||
| #define	DEFAULT_CHECKSUM_T	600	/* seconds */ | ||||
| #define	DEFAULT_IO_N		10	/* events */ | ||||
| #define	DEFAULT_IO_T		600	/* seconds */ | ||||
| #define	DEFAULT_SLOW_IO_N	10	/* events */ | ||||
| #define	DEFAULT_SLOW_IO_T	30	/* seconds */ | ||||
| 
 | ||||
| #define	CASE_GC_TIMEOUT_SECS	43200	/* 12 hours */ | ||||
| 
 | ||||
| /*
 | ||||
|  * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'.  This | ||||
|  * #define reserves enough space for two 64-bit hex values plus the length of | ||||
|  * the longest string. | ||||
|  * Our serd engines are named in the following format: | ||||
|  *     'zfs_<pool_guid>_<vdev_guid>_{checksum,io,slow_io}' | ||||
|  * This #define reserves enough space for two 64-bit hex values plus the | ||||
|  * length of the longest string. | ||||
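|  * For example (illustrative guids): zfs_8842e3f2_4d32a8cb_slow_io. | ||||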
|  */ | ||||
| #define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum")) | ||||
| 
 | ||||
| @ -68,6 +74,7 @@ typedef struct zfs_case_data { | ||||
| 	int		zc_pool_state; | ||||
| 	char		zc_serd_checksum[MAX_SERDLEN]; | ||||
| 	char		zc_serd_io[MAX_SERDLEN]; | ||||
| 	char		zc_serd_slow_io[MAX_SERDLEN]; | ||||
| 	int		zc_has_remove_timer; | ||||
| } zfs_case_data_t; | ||||
| 
 | ||||
| @ -114,7 +121,8 @@ zfs_de_stats_t zfs_stats = { | ||||
| 	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" } | ||||
| }; | ||||
| 
 | ||||
| static hrtime_t zfs_remove_timeout; | ||||
| /* wait 15 seconds after a removal */ | ||||
| static hrtime_t zfs_remove_timeout = SEC2NSEC(15); | ||||
| 
 | ||||
| uu_list_pool_t *zfs_case_pool; | ||||
| uu_list_t *zfs_cases; | ||||
| @ -124,6 +132,8 @@ uu_list_t *zfs_cases; | ||||
| #define	ZFS_MAKE_EREPORT(type)	\ | ||||
|     FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type | ||||
| 
 | ||||
| static void zfs_purge_cases(fmd_hdl_t *hdl); | ||||
| 
 | ||||
| /*
 | ||||
|  * Write out the persistent representation of an active case. | ||||
|  */ | ||||
| @ -170,6 +180,42 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) | ||||
| 	return (zcp); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * count other unique slow-io cases in a pool | ||||
|  */ | ||||
| static uint_t | ||||
| zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case) | ||||
| { | ||||
| 	zfs_case_t *zcp; | ||||
| 	uint_t cases = 0; | ||||
| 	static hrtime_t next_check = 0; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Note that plumbing in some external GC would require adding locking, | ||||
| 	 * since most of this module code is not thread safe and assumes there | ||||
| 	 * is only one thread running against the module. So we perform GC here | ||||
| 	 * inline periodically so that future delay induced faults will be | ||||
| 	 * possible once the issue causing multiple vdev delays is resolved. | ||||
| 	 */ | ||||
| 	if (gethrestime_sec() > next_check) { | ||||
| 		/* Periodically purge old SERD entries and stale cases */ | ||||
| 		fmd_serd_gc(hdl); | ||||
| 		zfs_purge_cases(hdl); | ||||
| 		next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS; | ||||
| 	} | ||||
| 
 | ||||
| 	for (zcp = uu_list_first(zfs_cases); zcp != NULL; | ||||
| 	    zcp = uu_list_next(zfs_cases, zcp)) { | ||||
| 		if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid && | ||||
| 		    zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid && | ||||
| 		    zcp->zc_data.zc_serd_slow_io[0] != '\0' && | ||||
| 		    fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) { | ||||
| 			cases++; | ||||
| 		} | ||||
| 	} | ||||
| 	return (cases); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Iterate over any active cases.  If any cases are associated with a pool or | ||||
|  * vdev which is no longer present on the system, close the associated case. | ||||
| @ -376,6 +422,14 @@ zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, | ||||
| 	    (long long unsigned int)vdev_guid, type); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp) | ||||
| { | ||||
| 	fmd_hdl_debug(hdl, "retiring case"); | ||||
| 
 | ||||
| 	fmd_case_close(hdl, zcp->zc_case); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Solve a given ZFS case.  This first checks to make sure the diagnosis is | ||||
|  * still valid, as well as cleaning up any pending timer associated with the | ||||
| @ -632,9 +686,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) | ||||
| 		if (strcmp(class, | ||||
| 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 || | ||||
| 		    strcmp(class, | ||||
| 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 || | ||||
| 		    strcmp(class, | ||||
| 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) { | ||||
| 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) { | ||||
| 			zfs_stats.resource_drops.fmds_value.ui64++; | ||||
| 			return; | ||||
| 		} | ||||
| @ -702,6 +754,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) | ||||
| 			if (zcp->zc_data.zc_serd_checksum[0] != '\0') | ||||
| 				fmd_serd_reset(hdl, | ||||
| 				    zcp->zc_data.zc_serd_checksum); | ||||
| 			if (zcp->zc_data.zc_serd_slow_io[0] != '\0') | ||||
| 				fmd_serd_reset(hdl, | ||||
| 				    zcp->zc_data.zc_serd_slow_io); | ||||
| 		} else if (fmd_nvl_class_match(hdl, nvl, | ||||
| 		    ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) { | ||||
| 			uint64_t state = 0; | ||||
| @ -730,7 +785,11 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) | ||||
| 	if (fmd_case_solved(hdl, zcp->zc_case)) | ||||
| 		return; | ||||
| 
 | ||||
| 	fmd_hdl_debug(hdl, "error event '%s'", class); | ||||
| 	if (vdev_guid) | ||||
| 		fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class, | ||||
| 		    vdev_guid); | ||||
| 	else | ||||
| 		fmd_hdl_debug(hdl, "error event '%s'", class); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Determine if we should solve the case and generate a fault.  We solve | ||||
| @ -779,6 +838,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) | ||||
| 	    fmd_nvl_class_match(hdl, nvl, | ||||
| 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) || | ||||
| 	    fmd_nvl_class_match(hdl, nvl, | ||||
| 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) || | ||||
| 	    fmd_nvl_class_match(hdl, nvl, | ||||
| 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { | ||||
| 		const char *failmode = NULL; | ||||
| 		boolean_t checkremove = B_FALSE; | ||||
| @ -814,6 +875,51 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) | ||||
| 			} | ||||
| 			if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep)) | ||||
| 				checkremove = B_TRUE; | ||||
| 		} else if (fmd_nvl_class_match(hdl, nvl, | ||||
| 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) { | ||||
| 			uint64_t slow_io_n, slow_io_t; | ||||
| 
 | ||||
| 			/*
 | ||||
| 			 * Create a slow io SERD engine when the VDEV has the | ||||
| 			 * 'vdev_slow_io_n' and 'vdev_slow_io_t' properties. | ||||
| 			 */ | ||||
| 			if (zcp->zc_data.zc_serd_slow_io[0] == '\0' && | ||||
| 			    nvlist_lookup_uint64(nvl, | ||||
| 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, | ||||
| 			    &slow_io_n) == 0 && | ||||
| 			    nvlist_lookup_uint64(nvl, | ||||
| 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, | ||||
| 			    &slow_io_t) == 0) { | ||||
| 				zfs_serd_name(zcp->zc_data.zc_serd_slow_io, | ||||
| 				    pool_guid, vdev_guid, "slow_io"); | ||||
| 				fmd_serd_create(hdl, | ||||
| 				    zcp->zc_data.zc_serd_slow_io, | ||||
| 				    slow_io_n, | ||||
| 				    SEC2NSEC(slow_io_t)); | ||||
| 				zfs_case_serialize(zcp); | ||||
| 			} | ||||
| 			/* Pass event to SERD engine and see if this triggers */ | ||||
| 			if (zcp->zc_data.zc_serd_slow_io[0] != '\0' && | ||||
| 			    fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io, | ||||
| 			    ep)) { | ||||
| 				/*
 | ||||
| 				 * Ignore a slow io diagnosis when other | ||||
| 				 * VDEVs in the pool show signs of being slow. | ||||
| 				 */ | ||||
| 				if (zfs_other_slow_cases(hdl, &zcp->zc_data)) { | ||||
| 					zfs_case_retire(hdl, zcp); | ||||
| 					fmd_hdl_debug(hdl, "pool %llu has " | ||||
| 					    "multiple slow io cases -- skip " | ||||
| 					    "degrading vdev %llu", | ||||
| 					    (u_longlong_t) | ||||
| 					    zcp->zc_data.zc_pool_guid, | ||||
| 					    (u_longlong_t) | ||||
| 					    zcp->zc_data.zc_vdev_guid); | ||||
| 				} else { | ||||
| 					zfs_case_solve(hdl, zcp, | ||||
| 					    "fault.fs.zfs.vdev.slow_io"); | ||||
| 				} | ||||
| 			} | ||||
| 		} else if (fmd_nvl_class_match(hdl, nvl, | ||||
| 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { | ||||
| 			/*
 | ||||
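A sketch of the end-to-end behavior this hunk implements (pool and vdev names are placeholders): with slow_io_n=5 and slow_io_t=60, the fifth delay ereport within any 60-second window trips the SERD engine, and the case is solved as fault.fs.zfs.vdev.slow_io unless other vdevs in the pool also have active slow-io cases, in which case it is retired instead:

	zpool set slow_io_n=5 tank sda        # trip after 5 slow I/Os ...
	zpool set slow_io_t=60 tank sda       # ... within a 60-second window
	zinject -d sda -D10:1 -T read tank    # delay reads by 10 ms (1 lane)
	# five or more slow reads inside 60 s degrade sda, unless other
	# vdevs in tank are also showing slow I/Os
	zinject -c all                        # clear the injection handler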
| @ -924,6 +1030,8 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) | ||||
| 		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum); | ||||
| 	if (zcp->zc_data.zc_serd_io[0] != '\0') | ||||
| 		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io); | ||||
| 	if (zcp->zc_data.zc_serd_slow_io[0] != '\0') | ||||
| 		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io); | ||||
| 	if (zcp->zc_data.zc_has_remove_timer) | ||||
| 		fmd_timer_remove(hdl, zcp->zc_remove_timer); | ||||
| 
 | ||||
| @ -932,30 +1040,15 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) | ||||
| 	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * We use the fmd gc entry point to look for old cases that no longer apply. | ||||
|  * This allows us to keep our set of case data small in a long running system. | ||||
|  */ | ||||
| static void | ||||
| zfs_fm_gc(fmd_hdl_t *hdl) | ||||
| { | ||||
| 	zfs_purge_cases(hdl); | ||||
| } | ||||
| 
 | ||||
| static const fmd_hdl_ops_t fmd_ops = { | ||||
| 	zfs_fm_recv,	/* fmdo_recv */ | ||||
| 	zfs_fm_timeout,	/* fmdo_timeout */ | ||||
| 	zfs_fm_close,	/* fmdo_close */ | ||||
| 	NULL,		/* fmdo_stats */ | ||||
| 	zfs_fm_gc,	/* fmdo_gc */ | ||||
| 	NULL,	/* fmdo_gc */ | ||||
| }; | ||||
| 
 | ||||
| static const fmd_prop_t fmd_props[] = { | ||||
| 	{ "checksum_N", FMD_TYPE_UINT32, "10" }, | ||||
| 	{ "checksum_T", FMD_TYPE_TIME, "10min" }, | ||||
| 	{ "io_N", FMD_TYPE_UINT32, "10" }, | ||||
| 	{ "io_T", FMD_TYPE_TIME, "10min" }, | ||||
| 	{ "remove_timeout", FMD_TYPE_TIME, "15sec" }, | ||||
| 	{ NULL, 0, NULL } | ||||
| }; | ||||
| 
 | ||||
| @ -996,8 +1089,6 @@ _zfs_diagnosis_init(fmd_hdl_t *hdl) | ||||
| 
 | ||||
| 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) / | ||||
| 	    sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats); | ||||
| 
 | ||||
| 	zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout"); | ||||
| } | ||||
| 
 | ||||
| void | ||||
|  | ||||
| @ -523,6 +523,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, | ||||
| 		} else if (fmd_nvl_class_match(hdl, fault, | ||||
| 		    "fault.fs.zfs.vdev.checksum")) { | ||||
| 			degrade_device = B_TRUE; | ||||
| 		} else if (fmd_nvl_class_match(hdl, fault, | ||||
| 		    "fault.fs.zfs.vdev.slow_io")) { | ||||
| 			degrade_device = B_TRUE; | ||||
| 		} else if (fmd_nvl_class_match(hdl, fault, | ||||
| 		    "fault.fs.zfs.device")) { | ||||
| 			fault_device = B_FALSE; | ||||
|  | ||||
| @ -1083,6 +1083,22 @@ main(int argc, char **argv) | ||||
| 			libzfs_fini(g_zfs); | ||||
| 			return (1); | ||||
| 		} | ||||
| 
 | ||||
| 		if (record.zi_nlanes) { | ||||
| 			switch (io_type) { | ||||
| 			case ZIO_TYPE_READ: | ||||
| 			case ZIO_TYPE_WRITE: | ||||
| 			case ZIO_TYPES: | ||||
| 				break; | ||||
| 			default: | ||||
| 				(void) fprintf(stderr, "I/O type for a delay " | ||||
| 				    "must be 'read' or 'write'\n"); | ||||
| 				usage(); | ||||
| 				libzfs_fini(g_zfs); | ||||
| 				return (1); | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		if (!error) | ||||
| 			error = ENXIO; | ||||
| 
 | ||||
|  | ||||
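A hedged illustration of this check (syntax follows the test scripts below; the rejected type is an assumed example):

	zinject -d sda -D10:1 -T write tank   # accepted: delay writes
	zinject -d sda -D10:1 -T free tank    # rejected: delays apply to read/write only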
| @ -2569,7 +2569,13 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, | ||||
| 			break; | ||||
| 
 | ||||
| 		case VDEV_AUX_ERR_EXCEEDED: | ||||
| 			(void) printf(gettext("too many errors")); | ||||
| 			if (vs->vs_read_errors + vs->vs_write_errors + | ||||
| 			    vs->vs_checksum_errors == 0 && children == 0 && | ||||
| 			    vs->vs_slow_ios > 0) { | ||||
| 				(void) printf(gettext("too many slow I/Os")); | ||||
| 			} else { | ||||
| 				(void) printf(gettext("too many errors")); | ||||
| 			} | ||||
| 			break; | ||||
| 
 | ||||
| 		case VDEV_AUX_IO_FAILURE: | ||||
|  | ||||
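The net effect (hypothetical pool, illustrative output; the SLOW column requires zpool status -s): a leaf vdev degraded solely for slow I/Os now reports the specific cause:

	$ zpool status -s tank
	...
	    sda    DEGRADED     0     0     0    113  too many slow I/Os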
| @ -82,6 +82,8 @@ extern "C" { | ||||
| #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T	"vdev_cksum_t" | ||||
| #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N	"vdev_io_n" | ||||
| #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T	"vdev_io_t" | ||||
| #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N	"vdev_slow_io_n" | ||||
| #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T	"vdev_slow_io_t" | ||||
| #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS	"vdev_delays" | ||||
| #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID	"parent_guid" | ||||
| #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE	"parent_type" | ||||
|  | ||||
| @ -364,6 +364,8 @@ typedef enum { | ||||
| 	VDEV_PROP_IO_N, | ||||
| 	VDEV_PROP_IO_T, | ||||
| 	VDEV_PROP_RAIDZ_EXPANDING, | ||||
| 	VDEV_PROP_SLOW_IO_N, | ||||
| 	VDEV_PROP_SLOW_IO_T, | ||||
| 	VDEV_NUM_PROPS | ||||
| } vdev_prop_t; | ||||
| 
 | ||||
|  | ||||
| @ -22,6 +22,7 @@ | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2011, 2020 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2017, Intel Corporation. | ||||
|  * Copyright (c) 2023, Klara Inc. | ||||
|  */ | ||||
| 
 | ||||
| #ifndef _SYS_VDEV_IMPL_H | ||||
| @ -453,12 +454,14 @@ struct vdev { | ||||
| 	zfs_ratelimit_t vdev_checksum_rl; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Checksum and IO thresholds for tuning ZED | ||||
| 	 * Vdev properties for tuning ZED | ||||
| 	 */ | ||||
| 	uint64_t	vdev_checksum_n; | ||||
| 	uint64_t	vdev_checksum_t; | ||||
| 	uint64_t	vdev_io_n; | ||||
| 	uint64_t	vdev_io_t; | ||||
| 	uint64_t	vdev_slow_io_n; | ||||
| 	uint64_t	vdev_slow_io_t; | ||||
| }; | ||||
| 
 | ||||
| #define	VDEV_PAD_SIZE		(8 << 10) | ||||
|  | ||||
| @ -5672,7 +5672,9 @@ | ||||
|       <enumerator name='VDEV_PROP_IO_N' value='44'/> | ||||
|       <enumerator name='VDEV_PROP_IO_T' value='45'/> | ||||
|       <enumerator name='VDEV_PROP_RAIDZ_EXPANDING' value='46'/> | ||||
|       <enumerator name='VDEV_NUM_PROPS' value='47'/> | ||||
|       <enumerator name='VDEV_PROP_SLOW_IO_N' value='47'/> | ||||
|       <enumerator name='VDEV_PROP_SLOW_IO_T' value='48'/> | ||||
|       <enumerator name='VDEV_NUM_PROPS' value='49'/> | ||||
|     </enum-decl> | ||||
|     <typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/> | ||||
|     <class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'> | ||||
|  | ||||
| @ -5224,6 +5224,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, | ||||
| 		case VDEV_PROP_CHECKSUM_T: | ||||
| 		case VDEV_PROP_IO_N: | ||||
| 		case VDEV_PROP_IO_T: | ||||
| 		case VDEV_PROP_SLOW_IO_N: | ||||
| 		case VDEV_PROP_SLOW_IO_T: | ||||
| 			if (intval == UINT64_MAX) { | ||||
| 				(void) strlcpy(buf, "-", len); | ||||
| 			} else { | ||||
|  | ||||
| @ -1699,7 +1699,9 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, | ||||
| 		    (prop == VDEV_PROP_CHECKSUM_N || | ||||
| 		    prop == VDEV_PROP_CHECKSUM_T || | ||||
| 		    prop == VDEV_PROP_IO_N || | ||||
| 		    prop == VDEV_PROP_IO_T)) { | ||||
| 		    prop == VDEV_PROP_IO_T || | ||||
| 		    prop == VDEV_PROP_SLOW_IO_N || | ||||
| 		    prop == VDEV_PROP_SLOW_IO_T)) { | ||||
| 			*ivalp = UINT64_MAX; | ||||
| 		} | ||||
| 
 | ||||
|  | ||||
| @ -44,7 +44,7 @@ section, below. | ||||
| Every vdev has a set of properties that export statistics about the vdev | ||||
| as well as control various behaviors. | ||||
| Properties are not inherited from top-level vdevs, with the exception of | ||||
| checksum_n, checksum_t, io_n, and io_t. | ||||
| checksum_n, checksum_t, io_n, io_t, slow_io_n, and slow_io_t. | ||||
| .Pp | ||||
| The values of numeric properties can be specified using human-readable suffixes | ||||
| .Po for example, | ||||
| @ -117,7 +117,7 @@ If this device is currently being removed from the pool | ||||
| .Pp | ||||
| The following native properties can be used to change the behavior of a vdev. | ||||
| .Bl -tag -width "allocating" | ||||
| .It Sy checksum_n , checksum_t , io_n , io_t | ||||
| .It Sy checksum_n , checksum_t , io_n , io_t , slow_io_n , slow_io_t | ||||
| Tune the fault management daemon by specifying checksum/io thresholds of <N> | ||||
| errors in <T> seconds, respectively. | ||||
| These properties can be set on leaf and top-level vdevs. | ||||
|  | ||||
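For instance (hypothetical layout), a threshold set on a top-level vdev is inherited by its leaf disks:

	zpool set slow_io_n=10 tank raidz1-0
	zpool get slow_io_n tank raidz1-0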
| @ -260,8 +260,8 @@ sufficient replicas exist to continue functioning. | ||||
| The underlying conditions are as follows: | ||||
| .Bl -bullet -compact | ||||
| .It | ||||
| The number of checksum errors exceeds acceptable levels and the device is | ||||
| degraded as an indication that something may be wrong. | ||||
| The number of checksum errors or slow I/Os exceeds acceptable levels and the | ||||
| device is degraded as an indication that something may be wrong. | ||||
| ZFS continues to use the device as necessary. | ||||
| .It | ||||
| The number of I/O errors exceeds acceptable levels. | ||||
|  | ||||
| @ -69,6 +69,7 @@ Force a vdev into the DEGRADED or FAULTED state. | ||||
| .Nm zinject | ||||
| .Fl d Ar vdev | ||||
| .Fl D Ar latency : Ns Ar lanes | ||||
| .Op Fl T Ar read|write | ||||
| .Ar pool | ||||
| .Xc | ||||
| Add an artificial delay to I/O requests on a particular | ||||
|  | ||||
| @ -431,6 +431,12 @@ vdev_prop_init(void) | ||||
| 	zprop_register_number(VDEV_PROP_IO_T, "io_t", UINT64_MAX, | ||||
| 	    PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "IO_T", B_FALSE, | ||||
| 	    sfeatures); | ||||
| 	zprop_register_number(VDEV_PROP_SLOW_IO_N, "slow_io_n", UINT64_MAX, | ||||
| 	    PROP_DEFAULT, ZFS_TYPE_VDEV, "<events>", "SLOW_IO_N", B_FALSE, | ||||
| 	    sfeatures); | ||||
| 	zprop_register_number(VDEV_PROP_SLOW_IO_T, "slow_io_t", UINT64_MAX, | ||||
| 	    PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "SLOW_IO_T", B_FALSE, | ||||
| 	    sfeatures); | ||||
| 
 | ||||
| 	/* default index (boolean) properties */ | ||||
| 	zprop_register_index(VDEV_PROP_REMOVING, "removing", 0, | ||||
|  | ||||
| @ -676,6 +676,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) | ||||
| 	vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); | ||||
| 	vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); | ||||
| 	vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); | ||||
| 	vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); | ||||
| 	vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); | ||||
| 
 | ||||
| 	list_link_init(&vd->vdev_config_dirty_node); | ||||
| 	list_link_init(&vd->vdev_state_dirty_node); | ||||
| @ -3730,6 +3732,18 @@ vdev_load(vdev_t *vd) | ||||
| 		if (error && error != ENOENT) | ||||
| 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " | ||||
| 			    "failed [error=%d]", (u_longlong_t)zapobj, error); | ||||
| 
 | ||||
| 		error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, | ||||
| 		    &vd->vdev_slow_io_n); | ||||
| 		if (error && error != ENOENT) | ||||
| 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " | ||||
| 			    "failed [error=%d]", (u_longlong_t)zapobj, error); | ||||
| 
 | ||||
| 		error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, | ||||
| 		    &vd->vdev_slow_io_t); | ||||
| 		if (error && error != ENOENT) | ||||
| 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " | ||||
| 			    "failed [error=%d]", (u_longlong_t)zapobj, error); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| @ -5934,6 +5948,20 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) | ||||
| 			} | ||||
| 			vd->vdev_io_t = intval; | ||||
| 			break; | ||||
| 		case VDEV_PROP_SLOW_IO_N: | ||||
| 			if (nvpair_value_uint64(elem, &intval) != 0) { | ||||
| 				error = EINVAL; | ||||
| 				break; | ||||
| 			} | ||||
| 			vd->vdev_slow_io_n = intval; | ||||
| 			break; | ||||
| 		case VDEV_PROP_SLOW_IO_T: | ||||
| 			if (nvpair_value_uint64(elem, &intval) != 0) { | ||||
| 				error = EINVAL; | ||||
| 				break; | ||||
| 			} | ||||
| 			vd->vdev_slow_io_t = intval; | ||||
| 			break; | ||||
| 		default: | ||||
| 			/* Most processing is done in vdev_props_set_sync */ | ||||
| 			break; | ||||
| @ -6269,6 +6297,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) | ||||
| 			case VDEV_PROP_CHECKSUM_T: | ||||
| 			case VDEV_PROP_IO_N: | ||||
| 			case VDEV_PROP_IO_T: | ||||
| 			case VDEV_PROP_SLOW_IO_N: | ||||
| 			case VDEV_PROP_SLOW_IO_T: | ||||
| 				err = vdev_prop_get_int(vd, prop, &intval); | ||||
| 				if (err && err != ENOENT) | ||||
| 					break; | ||||
|  | ||||
| @ -222,6 +222,12 @@ vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop) | ||||
| 		case VDEV_PROP_IO_T: | ||||
| 			propval = vd->vdev_io_t; | ||||
| 			break; | ||||
| 		case VDEV_PROP_SLOW_IO_N: | ||||
| 			propval = vd->vdev_slow_io_n; | ||||
| 			break; | ||||
| 		case VDEV_PROP_SLOW_IO_T: | ||||
| 			propval = vd->vdev_slow_io_t; | ||||
| 			break; | ||||
| 		default: | ||||
| 			propval = propdef; | ||||
| 			break; | ||||
| @ -741,6 +747,26 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, | ||||
| 			    NULL); | ||||
| 	} | ||||
| 
 | ||||
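| 	/* Attach opt-in slow-I/O thresholds (when set) to delay ereports for ZED */ | ||||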
| 	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { | ||||
| 		uint64_t slow_io_n, slow_io_t; | ||||
| 
 | ||||
| 		slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N); | ||||
| 		if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N)) | ||||
| 			fm_payload_set(ereport, | ||||
| 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, | ||||
| 			    DATA_TYPE_UINT64, | ||||
| 			    slow_io_n, | ||||
| 			    NULL); | ||||
| 
 | ||||
| 		slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T); | ||||
| 		if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T)) | ||||
| 			fm_payload_set(ereport, | ||||
| 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, | ||||
| 			    DATA_TYPE_UINT64, | ||||
| 			    slow_io_t, | ||||
| 			    NULL); | ||||
| 	} | ||||
| 
 | ||||
| 	mutex_exit(&spa->spa_errlist_lock); | ||||
| 
 | ||||
| 	*ereport_out = ereport; | ||||
|  | ||||
| @ -605,6 +605,10 @@ zio_handle_io_delay(zio_t *zio) | ||||
| 		if (vd->vdev_guid != handler->zi_record.zi_guid) | ||||
| 			continue; | ||||
| 
 | ||||
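| 		/* ZIO_TYPES acts as a wildcard when no I/O type was specified */ | ||||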
| 		if (handler->zi_record.zi_iotype != ZIO_TYPES && | ||||
| 		    handler->zi_record.zi_iotype != zio->io_type) | ||||
| 			continue; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Defensive; should never happen as the array allocation | ||||
| 		 * occurs prior to inserting this handler on the list. | ||||
|  | ||||
| @ -104,7 +104,8 @@ tags = ['functional', 'devices'] | ||||
| 
 | ||||
| [tests/functional/events:Linux] | ||||
| tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', | ||||
|     'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config'] | ||||
|     'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', | ||||
|     'zed_slow_io', 'zed_slow_io_many_vdevs'] | ||||
| tags = ['functional', 'events'] | ||||
| 
 | ||||
| [tests/functional/fadvise:Linux] | ||||
|  | ||||
| @ -1447,6 +1447,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ | ||||
| 	functional/events/zed_fd_spill.ksh \
 | ||||
| 	functional/events/zed_io_config.ksh \
 | ||||
| 	functional/events/zed_rc_filter.ksh \
 | ||||
| 	functional/events/zed_slow_io.ksh \
 | ||||
| 	functional/events/zed_slow_io_many_vdevs.ksh \
 | ||||
| 	functional/exec/cleanup.ksh \
 | ||||
| 	functional/exec/exec_001_pos.ksh \
 | ||||
| 	functional/exec/exec_002_neg.ksh \
 | ||||
|  | ||||
| @ -70,4 +70,6 @@ typeset -a properties=( | ||||
|     checksum_t | ||||
|     io_n | ||||
|     io_t | ||||
|     slow_io_n | ||||
|     slow_io_t | ||||
| ) | ||||
|  | ||||
| @ -26,8 +26,10 @@ | ||||
| 
 | ||||
| . $STF_SUITE/include/libtest.shlib | ||||
| 
 | ||||
| zed_cleanup all-debug.sh all-syslog.sh all-dumpfds | ||||
| 
 | ||||
| zed_stop | ||||
| 
 | ||||
| zed_cleanup all-debug.sh all-syslog.sh all-dumpfds | ||||
| 
 | ||||
| zed_events_drain | ||||
| 
 | ||||
| default_cleanup | ||||
|  | ||||
							
								
								
									
tests/zfs-tests/tests/functional/events/zed_slow_io.ksh (new executable file, 205 lines)
| @ -0,0 +1,205 @@ | ||||
| #!/bin/ksh -p | ||||
| # | ||||
| # CDDL HEADER START | ||||
| # | ||||
| # The contents of this file are subject to the terms of the | ||||
| # Common Development and Distribution License (the "License"). | ||||
| # You may not use this file except in compliance with the License. | ||||
| # | ||||
| # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | ||||
| # or https://opensource.org/licenses/CDDL-1.0. | ||||
| # See the License for the specific language governing permissions | ||||
| # and limitations under the License. | ||||
| # | ||||
| # When distributing Covered Code, include this CDDL HEADER in each | ||||
| # file and include the License file at usr/src/OPENSOLARIS.LICENSE. | ||||
| # If applicable, add the following below this CDDL HEADER, with the | ||||
| # fields enclosed by brackets "[]" replaced with your own identifying | ||||
| # information: Portions Copyright [yyyy] [name of copyright owner] | ||||
| # | ||||
| # CDDL HEADER END | ||||
| # | ||||
| 
 | ||||
| # | ||||
| # Copyright (c) 2023, Klara Inc. | ||||
| # | ||||
| 
 | ||||
| # DESCRIPTION: | ||||
| #	Verify that vdev properties, slow_io_n and slow_io_t, work with ZED. | ||||
| # | ||||
| # STRATEGY: | ||||
| #	1. Create a pool with single vdev | ||||
| #	2. Set slow_io_n/slow_io_t to non-default values | ||||
| #	3. Inject slow io errors | ||||
| #	4. Verify that ZED degrades vdev | ||||
| # | ||||
| 
 | ||||
| . $STF_SUITE/include/libtest.shlib | ||||
| 
 | ||||
| TESTDIR="$TEST_BASE_DIR/zed_slow_io" | ||||
| VDEV="$TEST_BASE_DIR/vdevfile.$$" | ||||
| TESTPOOL="slow_io_pool" | ||||
| FILEPATH="$TESTDIR/slow_io.testfile" | ||||
| 
 | ||||
| OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) | ||||
| OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) | ||||
| 
 | ||||
| verify_runnable "both" | ||||
| 
 | ||||
| function do_setup | ||||
| { | ||||
| 	log_must truncate -s 1G $VDEV | ||||
| 	default_setup_noexit $VDEV | ||||
| 	zed_events_drain | ||||
| 	log_must zfs set compression=off $TESTPOOL | ||||
| 	log_must zfs set primarycache=none $TESTPOOL | ||||
| 	log_must zfs set prefetch=none $TESTPOOL | ||||
| 	log_must zfs set recordsize=512 $TESTPOOL | ||||
| 	for i in {1..10}; do | ||||
| 		dd if=/dev/urandom of=${FILEPATH}$i bs=512 count=1 2>/dev/null | ||||
| 	done | ||||
| 	zpool sync | ||||
| } | ||||
| 
 | ||||
| # intermediate cleanup | ||||
| function do_clean | ||||
| { | ||||
| 	log_must zinject -c all | ||||
| 	log_must zpool destroy $TESTPOOL | ||||
| 	log_must rm -f $VDEV | ||||
| } | ||||
| 
 | ||||
| # final cleanup | ||||
| function cleanup | ||||
| { | ||||
| 	log_must zinject -c all | ||||
| 
 | ||||
| 	# if pool still exists then something failed so log additional info | ||||
| 	if poolexists $TESTPOOL ; then | ||||
| 		log_note "$(zpool status -s $TESTPOOL)" | ||||
| 		echo "=================== zed log search ===================" | ||||
| 		grep "Diagnosis Engine" $ZEDLET_DIR/zed.log | ||||
| 		destroy_pool $TESTPOOL | ||||
| 	fi | ||||
| 	log_must zed_stop | ||||
| 
 | ||||
| 	log_must rm -f $VDEV | ||||
| 
 | ||||
| 	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO | ||||
| 	log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS | ||||
| } | ||||
| 
 | ||||
| function start_slow_io | ||||
| { | ||||
| 	zpool sync | ||||
| 	log_must set_tunable64 ZIO_SLOW_IO_MS 10 | ||||
| 	log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000 | ||||
| 
 | ||||
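| 	# -D10:1: delay each I/O by 10 ms through a single lane | ||||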
| 	log_must zinject -d $VDEV -D10:1 -T read $TESTPOOL | ||||
| 	zpool sync | ||||
| } | ||||
| 
 | ||||
| function stop_slow_io | ||||
| { | ||||
| 	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO | ||||
| 	log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS | ||||
| 
 | ||||
| 	log_must zinject -c all | ||||
| } | ||||
| 
 | ||||
| # Test default ZED settings: | ||||
| #    inject 10 events over 2.5 seconds, should not degrade | ||||
| #    (diagnosis is opt-in: no slow_io_n/slow_io_t set on this vdev) | ||||
| function default_degrade | ||||
| { | ||||
| 	do_setup | ||||
| 
 | ||||
| 	start_slow_io | ||||
| 	for i in {1..10}; do | ||||
| 		dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null | ||||
| 		sleep 0.25 | ||||
| 	done | ||||
| 	stop_slow_io | ||||
| 	log_note "$(zpool status -s $TESTPOOL)" | ||||
| 
 | ||||
| 	# give slow ZED a chance to process the delay events | ||||
| 	sleep 18 | ||||
| 	log_note "$(zpool status -s $TESTPOOL)" | ||||
| 
 | ||||
| 	degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l) | ||||
| 	log_note $degrades vdev degrades in ZED log | ||||
| 	[ $degrades -eq "0" ] || \ | ||||
| 		log_fail "expecting no degrade events, found $degrades" | ||||
| 
 | ||||
| 	do_clean | ||||
| } | ||||
| 
 | ||||
| # change slow_io_n, slow_io_t to 5 events in 60 seconds | ||||
| # fire more than 5 events, should degrade | ||||
| function slow_io_degrade | ||||
| { | ||||
| 	do_setup | ||||
| 
 | ||||
| 	zpool set slow_io_n=5 $TESTPOOL $VDEV | ||||
| 	zpool set slow_io_t=60 $TESTPOOL $VDEV | ||||
| 
 | ||||
| 	start_slow_io | ||||
| 	for i in {1..16}; do | ||||
| 		dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null | ||||
| 		sleep 0.5 | ||||
| 	done | ||||
| 	stop_slow_io | ||||
| 	zpool sync | ||||
| 
 | ||||
| 	# | ||||
| 	# wait up to 60 seconds for kernel to produce at least 5 delay events | ||||
| 	# | ||||
| 	typeset -i i=0 | ||||
| 	typeset -i events=0 | ||||
| 	while [[ $i -lt 60 ]]; do | ||||
| 		events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l) | ||||
| 		[[ $events -ge "5" ]] && break | ||||
| 		i=$((i+1)) | ||||
| 		sleep 1 | ||||
| 	done | ||||
| 	log_note "$events delay events found" | ||||
| 
 | ||||
| 	if [[ $events -ge "5" ]]; then | ||||
| 		log_must wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 10 | ||||
| 	fi | ||||
| 
 | ||||
| 	do_clean | ||||
| } | ||||
| 
 | ||||
| # change slow_io_n, slow_io_t to 10 events in 1 second | ||||
| # inject events spaced 0.5 seconds apart, should not degrade | ||||
| function slow_io_no_degrade | ||||
| { | ||||
| 	do_setup | ||||
| 
 | ||||
| 	zpool set slow_io_n=10 $TESTPOOL $VDEV | ||||
| 	zpool set slow_io_t=1 $TESTPOOL $VDEV | ||||
| 
 | ||||
| 	start_slow_io | ||||
| 	for i in {1..16}; do | ||||
| 		dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null | ||||
| 		sleep 0.5 | ||||
| 	done | ||||
| 	stop_slow_io | ||||
| 	zpool sync | ||||
| 
 | ||||
| 	log_mustnot wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 45 | ||||
| 
 | ||||
| 	do_clean | ||||
| } | ||||
| 
 | ||||
| log_assert "Test ZED slow io configurability" | ||||
| log_onexit cleanup | ||||
| 
 | ||||
| log_must zed_events_drain | ||||
| log_must zed_start | ||||
| 
 | ||||
| default_degrade | ||||
| slow_io_degrade | ||||
| slow_io_no_degrade | ||||
| 
 | ||||
| log_pass "Test ZED slow io configurability" | ||||
							
								
								
									
tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh (new executable file, 177 lines)
| @ -0,0 +1,177 @@ | ||||
| #!/bin/ksh -p | ||||
| # | ||||
| # CDDL HEADER START | ||||
| # | ||||
| # The contents of this file are subject to the terms of the | ||||
| # Common Development and Distribution License (the "License"). | ||||
| # You may not use this file except in compliance with the License. | ||||
| # | ||||
| # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | ||||
| # or https://opensource.org/licenses/CDDL-1.0. | ||||
| # See the License for the specific language governing permissions | ||||
| # and limitations under the License. | ||||
| # | ||||
| # When distributing Covered Code, include this CDDL HEADER in each | ||||
| # file and include the License file at usr/src/OPENSOLARIS.LICENSE. | ||||
| # If applicable, add the following below this CDDL HEADER, with the | ||||
| # fields enclosed by brackets "[]" replaced with your own identifying | ||||
| # information: Portions Copyright [yyyy] [name of copyright owner] | ||||
| # | ||||
| # CDDL HEADER END | ||||
| # | ||||
| 
 | ||||
| # | ||||
| # Copyright (c) 2023, Klara Inc. | ||||
| # | ||||
| 
 | ||||
| # DESCRIPTION: | ||||
| #	Verify that delay events from multiple vdevs don't degrade any vdev | ||||
| # | ||||
| # STRATEGY: | ||||
| #	1. Create a pool with a 3 disk raidz vdev | ||||
| #	2. Inject slow io errors | ||||
| #	3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs | ||||
| # | ||||
| 
 | ||||
| . $STF_SUITE/include/libtest.shlib | ||||
| 
 | ||||
| TESTDIR="$TEST_BASE_DIR/zed_slow_io" | ||||
| VDEV1="$TEST_BASE_DIR/vdevfile1.$$" | ||||
| VDEV2="$TEST_BASE_DIR/vdevfile2.$$" | ||||
| VDEV3="$TEST_BASE_DIR/vdevfile3.$$" | ||||
| VDEV4="$TEST_BASE_DIR/vdevfile4.$$" | ||||
| VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4" | ||||
| TESTPOOL="slow_io_pool" | ||||
| FILEPATH="$TESTDIR/slow_io.testfile" | ||||
| 
 | ||||
| OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) | ||||
| OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) | ||||
| 
 | ||||
| verify_runnable "both" | ||||
| 
 | ||||
| function cleanup | ||||
| { | ||||
| 	log_must zinject -c all | ||||
| 
 | ||||
| 	# if pool still exists then something failed so log additional info | ||||
| 	if poolexists $TESTPOOL ; then | ||||
| 		log_note "$(zpool status -s $TESTPOOL)" | ||||
| 		echo "=================== zed log search ===================" | ||||
| 		grep "Diagnosis Engine" $ZEDLET_DIR/zed.log | ||||
| 		destroy_pool $TESTPOOL | ||||
| 	fi | ||||
| 	log_must zed_stop | ||||
| 
 | ||||
| 	log_must rm -f $VDEVS | ||||
| 	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO | ||||
| 	log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS | ||||
| } | ||||
| 
 | ||||
| function start_slow_io | ||||
| { | ||||
| 	for vdev in $VDEVS | ||||
| 	do | ||||
| 		log_must zpool set slow_io_n=4 $TESTPOOL $vdev | ||||
| 		log_must zpool set slow_io_t=60 $TESTPOOL $vdev | ||||
| 	done | ||||
| 	zpool sync | ||||
| 
 | ||||
| 	log_must set_tunable64 ZIO_SLOW_IO_MS 10 | ||||
| 	log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000 | ||||
| 
 | ||||
| 	for vdev in $VDEVS | ||||
| 	do | ||||
| 		log_must zinject -d $vdev -D10:1 $TESTPOOL | ||||
| 	done | ||||
| 	zpool sync | ||||
| } | ||||
| 
 | ||||
| function stop_slow_io | ||||
| { | ||||
| 	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO | ||||
| 	log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS | ||||
| 
 | ||||
| 	log_must zinject -c all | ||||
| } | ||||
| 
 | ||||
| function multiple_slow_vdevs_test | ||||
| { | ||||
| 	log_must truncate -s 1G $VDEVS | ||||
| 	default_raidz_setup_noexit $VDEVS | ||||
| 
 | ||||
| 	log_must zpool events -c | ||||
| 	log_must zfs set compression=off $TESTPOOL | ||||
| 	log_must zfs set primarycache=none $TESTPOOL | ||||
| 	log_must zfs set recordsize=4K $TESTPOOL | ||||
| 
 | ||||
| 	log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=20 | ||||
| 	zpool sync | ||||
| 
 | ||||
| 	# | ||||
| 	# Read the file with slow io injected on the disks | ||||
| 	# This will cause multiple slow I/Os on each disk, tripping the ZED SERD engines | ||||
| 	# | ||||
| 	#   pool: slow_io_pool | ||||
| 	#  state: ONLINE | ||||
| 	# config: | ||||
| 	# | ||||
| 	#         NAME                           STATE  READ WRITE CKSUM  SLOW | ||||
| 	#         slow_io_pool                   ONLINE    0     0     0     - | ||||
| 	#           raidz1-0                     ONLINE    0     0     0     - | ||||
| 	#             /var/tmp/vdevfile1.499278  ONLINE    0     0     0   113 | ||||
| 	#             /var/tmp/vdevfile2.499278  ONLINE    0     0     0   109 | ||||
| 	#             /var/tmp/vdevfile3.499278  ONLINE    0     0     0    96 | ||||
| 	#             /var/tmp/vdevfile4.499278  ONLINE    0     0     0   109 | ||||
| 	# | ||||
| 	start_slow_io | ||||
| 	dd if=$FILEPATH of=/dev/null bs=1M count=20 2>/dev/null | ||||
| 	stop_slow_io | ||||
| 
 | ||||
| 	# count events available for processing | ||||
| 	typeset -i i=0 | ||||
| 	typeset -i events=0 | ||||
| 	while [[ $i -lt 60 ]]; do | ||||
| 		events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l) | ||||
| 		[[ $events -ge "50" ]] && break | ||||
| 		i=$((i+1)) | ||||
| 		sleep 1 | ||||
| 	done | ||||
| 	log_note "$events delay events found" | ||||
| 	if [[ $events -lt "50" ]]; then | ||||
| 		log_note "bailing: not enough events to complete the test" | ||||
| 		destroy_pool $TESTPOOL | ||||
| 		return | ||||
| 	fi | ||||
| 
 | ||||
| 	# | ||||
| 	# give slow ZED a chance to process the delay events | ||||
| 	# | ||||
| 	typeset -i i=0 | ||||
| 	typeset -i skips=0 | ||||
| 	while [[ $i -lt 75 ]]; do | ||||
| 		skips=$(grep "retiring case" \ | ||||
| 			$ZEDLET_DIR/zed.log | wc -l) | ||||
| 		[[ $skips -gt "0" ]] && break | ||||
| 		i=$((i+1)) | ||||
| 		sleep 1 | ||||
| 	done | ||||
| 
 | ||||
| 	log_note $skips degrade skips in ZED log after $i seconds | ||||
| 	[ $skips -gt "0" ] || log_fail "expecting to see skips" | ||||
| 
 | ||||
| 	degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l) | ||||
| 	log_note $degrades vdev degrades in ZED log | ||||
| 	[ $degrades -eq "0" ] || \ | ||||
| 		log_fail "expecting no degrade events, found $degrades" | ||||
| 
 | ||||
| 	destroy_pool $TESTPOOL | ||||
| } | ||||
| 
 | ||||
| log_assert "Test ZED slow io across multiple vdevs" | ||||
| log_onexit cleanup | ||||
| 
 | ||||
| log_must zed_events_drain | ||||
| log_must zed_start | ||||
| multiple_slow_vdevs_test | ||||
| 
 | ||||
| log_pass "Test ZED slow io across multiple vdevs" | ||||
| @ -32,5 +32,6 @@ cleanup_devices $DISKS | ||||
| 
 | ||||
| zed_stop | ||||
| zed_cleanup resilver_finish-start-scrub.sh | ||||
| zed_events_drain | ||||
| 
 | ||||
| log_pass | ||||
|  | ||||
| @ -28,6 +28,7 @@ | ||||
| 
 | ||||
| verify_runnable "global" | ||||
| 
 | ||||
| zed_events_drain | ||||
| zed_setup resilver_finish-start-scrub.sh | ||||
| zed_start | ||||
| 
 | ||||
|  | ||||