mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-26 18:05:04 +03:00 
			
		
		
		
	vdev probe to slow disk can stall mmp write checker
Simplify vdev probes in the zio_vdev_io_done context to avoid holding the spa config lock for a long duration. Also allow zpool clear if no evidence of another host is using the pool. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Olaf Faaland <faaland1@llnl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Don Brady <don.brady@klarasystems.com> Closes #15839
This commit is contained in:
		
							parent
							
								
									ea3f7c12a9
								
							
						
					
					
						commit
						706307445e
					
				| @ -8910,7 +8910,7 @@ status_callback(zpool_handle_t *zhp, void *data) | ||||
| 		printf_color(ANSI_BOLD, gettext("action: ")); | ||||
| 		printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" | ||||
| 		    " are connected, then reboot your system and\n\timport the " | ||||
| 		    "pool.\n")); | ||||
| 		    "pool or run 'zpool clear' to resume the pool.\n")); | ||||
| 		break; | ||||
| 
 | ||||
| 	case ZPOOL_STATUS_IO_FAILURE_WAIT: | ||||
|  | ||||
| @ -769,7 +769,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); | ||||
| 
 | ||||
| #define	SPA_ASYNC_CONFIG_UPDATE			0x01 | ||||
| #define	SPA_ASYNC_REMOVE			0x02 | ||||
| #define	SPA_ASYNC_PROBE				0x04 | ||||
| #define	SPA_ASYNC_FAULT_VDEV			0x04 | ||||
| #define	SPA_ASYNC_RESILVER_DONE			0x08 | ||||
| #define	SPA_ASYNC_RESILVER			0x10 | ||||
| #define	SPA_ASYNC_AUTOEXPAND			0x20 | ||||
| @ -1113,6 +1113,8 @@ extern uint32_t spa_get_hostid(spa_t *spa); | ||||
| extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); | ||||
| extern boolean_t spa_livelist_delete_check(spa_t *spa); | ||||
| 
 | ||||
| extern boolean_t spa_mmp_remote_host_activity(spa_t *spa); | ||||
| 
 | ||||
| extern spa_mode_t spa_mode(spa_t *spa); | ||||
| extern uint64_t zfs_strtonum(const char *str, char **nptr); | ||||
| 
 | ||||
|  | ||||
| @ -50,20 +50,20 @@ extern "C" { | ||||
| #define	MMP_SEQ_VALID_BIT	0x02 | ||||
| #define	MMP_FAIL_INT_VALID_BIT	0x04 | ||||
| 
 | ||||
| #define	MMP_VALID(ubp)		(ubp->ub_magic == UBERBLOCK_MAGIC && \ | ||||
| 				    ubp->ub_mmp_magic == MMP_MAGIC) | ||||
| #define	MMP_INTERVAL_VALID(ubp)	(MMP_VALID(ubp) && (ubp->ub_mmp_config & \ | ||||
| #define	MMP_VALID(ubp)		((ubp)->ub_magic == UBERBLOCK_MAGIC && \ | ||||
| 				    (ubp)->ub_mmp_magic == MMP_MAGIC) | ||||
| #define	MMP_INTERVAL_VALID(ubp)	(MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ | ||||
| 				    MMP_INTERVAL_VALID_BIT)) | ||||
| #define	MMP_SEQ_VALID(ubp)	(MMP_VALID(ubp) && (ubp->ub_mmp_config & \ | ||||
| #define	MMP_SEQ_VALID(ubp)	(MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ | ||||
| 				    MMP_SEQ_VALID_BIT)) | ||||
| #define	MMP_FAIL_INT_VALID(ubp)	(MMP_VALID(ubp) && (ubp->ub_mmp_config & \ | ||||
| #define	MMP_FAIL_INT_VALID(ubp)	(MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ | ||||
| 				    MMP_FAIL_INT_VALID_BIT)) | ||||
| 
 | ||||
| #define	MMP_INTERVAL(ubp)	((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ | ||||
| #define	MMP_INTERVAL(ubp)	(((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \ | ||||
| 				    >> 8) | ||||
| #define	MMP_SEQ(ubp)		((ubp->ub_mmp_config & 0x0000FFFF00000000) \ | ||||
| #define	MMP_SEQ(ubp)		(((ubp)->ub_mmp_config & 0x0000FFFF00000000) \ | ||||
| 				    >> 32) | ||||
| #define	MMP_FAIL_INT(ubp)	((ubp->ub_mmp_config & 0xFFFF000000000000) \ | ||||
| #define	MMP_FAIL_INT(ubp)	(((ubp)->ub_mmp_config & 0xFFFF000000000000) \ | ||||
| 				    >> 48) | ||||
| 
 | ||||
| #define	MMP_INTERVAL_SET(write) \ | ||||
|  | ||||
| @ -274,7 +274,7 @@ struct vdev { | ||||
| 	txg_list_t	vdev_dtl_list;	/* per-txg dirty DTL lists	*/ | ||||
| 	txg_node_t	vdev_txg_node;	/* per-txg dirty vdev linkage	*/ | ||||
| 	boolean_t	vdev_remove_wanted; /* async remove wanted?	*/ | ||||
| 	boolean_t	vdev_probe_wanted; /* async probe wanted?	*/ | ||||
| 	boolean_t	vdev_fault_wanted; /* async faulted wanted?	*/ | ||||
| 	list_node_t	vdev_config_dirty_node; /* config dirty list	*/ | ||||
| 	list_node_t	vdev_state_dirty_node; /* state dirty list	*/ | ||||
| 	uint64_t	vdev_deflate_ratio; /* deflation ratio (x512)	*/ | ||||
|  | ||||
| @ -50,9 +50,10 @@ If the pool was suspended it will be brought back online provided the | ||||
| devices can be accessed. | ||||
| Pools with | ||||
| .Sy multihost | ||||
| enabled which have been suspended cannot be resumed. | ||||
| While the pool was suspended, it may have been imported on | ||||
| another host, and resuming I/O could result in pool damage. | ||||
| enabled which have been suspended cannot be resumed when there is evidence | ||||
| that the pool was imported by another host. | ||||
| The same checks performed during an import will be applied before the clear | ||||
| proceeds. | ||||
| .Bl -tag -width Ds | ||||
| .It Fl -power | ||||
| Power on the devices's slot in the storage enclosure and wait for the device | ||||
|  | ||||
| @ -663,12 +663,13 @@ mmp_thread(void *arg) | ||||
| 		    (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { | ||||
| 			zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " | ||||
| 			    "mmp_last_write %llu mmp_interval %llu " | ||||
| 			    "mmp_fail_intervals %llu mmp_fail_ns %llu", | ||||
| 			    "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu", | ||||
| 			    spa_name(spa), (u_longlong_t)gethrtime(), | ||||
| 			    (u_longlong_t)mmp->mmp_last_write, | ||||
| 			    (u_longlong_t)mmp_interval, | ||||
| 			    (u_longlong_t)mmp_fail_intervals, | ||||
| 			    (u_longlong_t)mmp_fail_ns); | ||||
| 			    (u_longlong_t)mmp_fail_ns, | ||||
| 			    (u_longlong_t)spa->spa_uberblock.ub_txg); | ||||
| 			cmn_err(CE_WARN, "MMP writes to pool '%s' have not " | ||||
| 			    "succeeded in over %llu ms; suspending pool. " | ||||
| 			    "Hrtime %llu", | ||||
|  | ||||
| @ -3542,11 +3542,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Perform the import activity check.  If the user canceled the import or | ||||
|  * we detected activity then fail. | ||||
|  * Remote host activity check. | ||||
|  * | ||||
|  * error results: | ||||
|  *          0 - no activity detected | ||||
|  *  EREMOTEIO - remote activity detected | ||||
|  *      EINTR - user canceled the operation | ||||
|  */ | ||||
| static int | ||||
| spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) | ||||
| spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, | ||||
|     boolean_t importing) | ||||
| { | ||||
| 	uint64_t txg = ub->ub_txg; | ||||
| 	uint64_t timestamp = ub->ub_timestamp; | ||||
| @ -3591,19 +3596,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) | ||||
| 
 | ||||
| 	import_expire = gethrtime() + import_delay; | ||||
| 
 | ||||
| 	spa_import_progress_set_notes(spa, "Checking MMP activity, waiting " | ||||
| 	    "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); | ||||
| 	if (importing) { | ||||
| 		spa_import_progress_set_notes(spa, "Checking MMP activity, " | ||||
| 		    "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); | ||||
| 	} | ||||
| 
 | ||||
| 	int interations = 0; | ||||
| 	int iterations = 0; | ||||
| 	while ((now = gethrtime()) < import_expire) { | ||||
| 		if (interations++ % 30 == 0) { | ||||
| 		if (importing && iterations++ % 30 == 0) { | ||||
| 			spa_import_progress_set_notes(spa, "Checking MMP " | ||||
| 			    "activity, %llu ms remaining", | ||||
| 			    (u_longlong_t)NSEC2MSEC(import_expire - now)); | ||||
| 		} | ||||
| 
 | ||||
| 		if (importing) { | ||||
| 			(void) spa_import_progress_set_mmp_check(spa_guid(spa), | ||||
| 			    NSEC2SEC(import_expire - gethrtime())); | ||||
| 		} | ||||
| 
 | ||||
| 		vdev_uberblock_load(rvd, ub, &mmp_label); | ||||
| 
 | ||||
| @ -3685,6 +3694,61 @@ out: | ||||
| 	return (error); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Called from zfs_ioc_clear for a pool that was suspended | ||||
|  * after failing mmp write checks. | ||||
|  */ | ||||
| boolean_t | ||||
| spa_mmp_remote_host_activity(spa_t *spa) | ||||
| { | ||||
| 	ASSERT(spa_multihost(spa) && spa_suspended(spa)); | ||||
| 
 | ||||
| 	nvlist_t *best_label; | ||||
| 	uberblock_t best_ub; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Locate the best uberblock on disk | ||||
| 	 */ | ||||
| 	vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); | ||||
| 	if (best_label) { | ||||
| 		/*
 | ||||
| 		 * confirm that the best hostid matches our hostid | ||||
| 		 */ | ||||
| 		if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && | ||||
| 		    spa_get_hostid(spa) != | ||||
| 		    fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { | ||||
| 			nvlist_free(best_label); | ||||
| 			return (B_TRUE); | ||||
| 		} | ||||
| 		nvlist_free(best_label); | ||||
| 	} else { | ||||
| 		return (B_TRUE); | ||||
| 	} | ||||
| 
 | ||||
| 	if (!MMP_VALID(&best_ub) || | ||||
| 	    !MMP_FAIL_INT_VALID(&best_ub) || | ||||
| 	    MMP_FAIL_INT(&best_ub) == 0) { | ||||
| 		return (B_TRUE); | ||||
| 	} | ||||
| 
 | ||||
| 	if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || | ||||
| 	    best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { | ||||
| 		zfs_dbgmsg("txg mismatch detected during pool clear " | ||||
| 		    "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", | ||||
| 		    (u_longlong_t)spa->spa_uberblock.ub_txg, | ||||
| 		    (u_longlong_t)best_ub.ub_txg, | ||||
| 		    (u_longlong_t)spa->spa_uberblock.ub_timestamp, | ||||
| 		    (u_longlong_t)best_ub.ub_timestamp); | ||||
| 		return (B_TRUE); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Perform an activity check looking for any remote writer | ||||
| 	 */ | ||||
| 	return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, | ||||
| 	    B_FALSE) != 0); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| spa_verify_host(spa_t *spa, nvlist_t *mos_config) | ||||
| { | ||||
| @ -4005,7 +4069,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) | ||||
| 			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); | ||||
| 		} | ||||
| 
 | ||||
| 		int error = spa_activity_check(spa, ub, spa->spa_config); | ||||
| 		int error = | ||||
| 		    spa_activity_check(spa, ub, spa->spa_config, B_TRUE); | ||||
| 		if (error) { | ||||
| 			nvlist_free(label); | ||||
| 			return (error); | ||||
| @ -8589,15 +8654,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd) | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| spa_async_probe(spa_t *spa, vdev_t *vd) | ||||
| spa_async_fault_vdev(spa_t *spa, vdev_t *vd) | ||||
| { | ||||
| 	if (vd->vdev_probe_wanted) { | ||||
| 		vd->vdev_probe_wanted = B_FALSE; | ||||
| 		vdev_reopen(vd);	/* vdev_open() does the actual probe */ | ||||
| 	if (vd->vdev_fault_wanted) { | ||||
| 		vd->vdev_fault_wanted = B_FALSE; | ||||
| 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, | ||||
| 		    VDEV_AUX_ERR_EXCEEDED); | ||||
| 	} | ||||
| 
 | ||||
| 	for (int c = 0; c < vd->vdev_children; c++) | ||||
| 		spa_async_probe(spa, vd->vdev_child[c]); | ||||
| 		spa_async_fault_vdev(spa, vd->vdev_child[c]); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| @ -8685,11 +8751,11 @@ spa_async_thread(void *arg) | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * See if any devices need to be probed. | ||||
| 	 * See if any devices need to be marked faulted. | ||||
| 	 */ | ||||
| 	if (tasks & SPA_ASYNC_PROBE) { | ||||
| 	if (tasks & SPA_ASYNC_FAULT_VDEV) { | ||||
| 		spa_vdev_state_enter(spa, SCL_NONE); | ||||
| 		spa_async_probe(spa, spa->spa_root_vdev); | ||||
| 		spa_async_fault_vdev(spa, spa->spa_root_vdev); | ||||
| 		(void) spa_vdev_state_exit(spa, NULL, 0); | ||||
| 	} | ||||
| 
 | ||||
|  | ||||
| @ -550,6 +550,15 @@ txg_sync_thread(void *arg) | ||||
| 			timer = (delta > timeout ? 0 : timeout - delta); | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * When we're suspended, nothing should be changing and for | ||||
| 		 * MMP we don't want to bump anything that would make it | ||||
| 		 * harder to detect if another host is changing it when | ||||
| 		 * resuming after a MMP suspend. | ||||
| 		 */ | ||||
| 		if (spa_suspended(spa)) | ||||
| 			continue; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Wait until the quiesce thread hands off a txg to us, | ||||
| 		 * prompting it to do so if necessary. | ||||
|  | ||||
| @ -1661,6 +1661,7 @@ vdev_metaslab_fini(vdev_t *vd) | ||||
| typedef struct vdev_probe_stats { | ||||
| 	boolean_t	vps_readable; | ||||
| 	boolean_t	vps_writeable; | ||||
| 	boolean_t	vps_zio_done_probe; | ||||
| 	int		vps_flags; | ||||
| } vdev_probe_stats_t; | ||||
| 
 | ||||
| @ -1704,6 +1705,17 @@ vdev_probe_done(zio_t *zio) | ||||
| 			(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, | ||||
| 			    spa, vd, NULL, NULL, 0); | ||||
| 			zio->io_error = SET_ERROR(ENXIO); | ||||
| 
 | ||||
| 			/*
 | ||||
| 			 * If this probe was initiated from zio pipeline, then | ||||
| 			 * change the state in a spa_async_request. Probes that | ||||
| 			 * were initiated from a vdev_open can change the state | ||||
| 			 * as part of the open call. | ||||
| 			 */ | ||||
| 			if (vps->vps_zio_done_probe) { | ||||
| 				vd->vdev_fault_wanted = B_TRUE; | ||||
| 				spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		mutex_enter(&vd->vdev_probe_lock); | ||||
| @ -1754,6 +1766,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) | ||||
| 
 | ||||
| 		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | | ||||
| 		    ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; | ||||
| 		vps->vps_zio_done_probe = (zio != NULL); | ||||
| 
 | ||||
| 		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { | ||||
| 			/*
 | ||||
| @ -1780,15 +1793,6 @@ vdev_probe(vdev_t *vd, zio_t *zio) | ||||
| 		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, | ||||
| 		    vdev_probe_done, vps, | ||||
| 		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * We can't change the vdev state in this context, so we | ||||
| 		 * kick off an async task to do it on our behalf. | ||||
| 		 */ | ||||
| 		if (zio != NULL) { | ||||
| 			vd->vdev_probe_wanted = B_TRUE; | ||||
| 			spa_async_request(spa, SPA_ASYNC_PROBE); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (zio != NULL) | ||||
|  | ||||
| @ -1982,6 +1982,7 @@ retry: | ||||
| 	/*
 | ||||
| 	 * If this isn't a resync due to I/O errors, | ||||
| 	 * and nothing changed in this transaction group, | ||||
| 	 * and multihost protection isn't enabled, | ||||
| 	 * and the vdev configuration hasn't changed, | ||||
| 	 * then there's nothing to do. | ||||
| 	 */ | ||||
| @ -1989,7 +1990,8 @@ retry: | ||||
| 		boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, | ||||
| 		    txg, spa->spa_mmp.mmp_delay); | ||||
| 
 | ||||
| 		if (!changed && list_is_empty(&spa->spa_config_dirty_list)) | ||||
| 		if (!changed && list_is_empty(&spa->spa_config_dirty_list) && | ||||
| 		    !spa_multihost(spa)) | ||||
| 			return (0); | ||||
| 	} | ||||
| 
 | ||||
|  | ||||
| @ -5800,10 +5800,13 @@ zfs_ioc_clear(zfs_cmd_t *zc) | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If multihost is enabled, resuming I/O is unsafe as another | ||||
| 	 * host may have imported the pool. | ||||
| 	 * host may have imported the pool. Check for remote activity. | ||||
| 	 */ | ||||
| 	if (spa_multihost(spa) && spa_suspended(spa)) | ||||
| 		return (SET_ERROR(EINVAL)); | ||||
| 	if (spa_multihost(spa) && spa_suspended(spa) && | ||||
| 	    spa_mmp_remote_host_activity(spa)) { | ||||
| 		spa_close(spa, FTAG); | ||||
| 		return (SET_ERROR(EREMOTEIO)); | ||||
| 	} | ||||
| 
 | ||||
| 	spa_vdev_state_enter(spa, SCL_NONE); | ||||
| 
 | ||||
|  | ||||
| @ -2503,8 +2503,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) | ||||
| 		    "failure and the failure mode property for this pool " | ||||
| 		    "is set to panic.", spa_name(spa)); | ||||
| 
 | ||||
| 	cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " | ||||
| 	    "failure and has been suspended.\n", spa_name(spa)); | ||||
| 	if (reason != ZIO_SUSPEND_MMP) { | ||||
| 		cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " | ||||
| 		    "I/O failure and has been suspended.\n", spa_name(spa)); | ||||
| 	} | ||||
| 
 | ||||
| 	(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, | ||||
| 	    NULL, NULL, 0); | ||||
|  | ||||
| @ -605,9 +605,11 @@ zio_handle_io_delay(zio_t *zio) | ||||
| 		if (vd->vdev_guid != handler->zi_record.zi_guid) | ||||
| 			continue; | ||||
| 
 | ||||
| 		/* also match on I/O type (e.g., -T read) */ | ||||
| 		if (handler->zi_record.zi_iotype != ZIO_TYPES && | ||||
| 		    handler->zi_record.zi_iotype != zio->io_type) | ||||
| 		    handler->zi_record.zi_iotype != zio->io_type) { | ||||
| 			continue; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Defensive; should never happen as the array allocation | ||||
|  | ||||
| @ -146,7 +146,7 @@ tags = ['functional', 'mmap'] | ||||
| tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', | ||||
|     'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', | ||||
|     'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', | ||||
|     'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] | ||||
|     'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk'] | ||||
| tags = ['functional', 'mmp'] | ||||
| 
 | ||||
| [tests/functional/mount:Linux] | ||||
|  | ||||
| @ -1588,6 +1588,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ | ||||
| 	functional/mmp/mmp_on_zdb.ksh \
 | ||||
| 	functional/mmp/mmp_reset_interval.ksh \
 | ||||
| 	functional/mmp/mmp_write_distribution.ksh \
 | ||||
| 	functional/mmp/mmp_write_slow_disk.ksh \
 | ||||
| 	functional/mmp/mmp_write_uberblocks.ksh \
 | ||||
| 	functional/mmp/multihost_history.ksh \
 | ||||
| 	functional/mmp/setup.ksh \
 | ||||
|  | ||||
							
								
								
									
										97
									
								
								tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										97
									
								
								tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,97 @@ | ||||
| #!/bin/ksh -p | ||||
| # | ||||
| # CDDL HEADER START | ||||
| # | ||||
| # This file and its contents are supplied under the terms of the | ||||
| # Common Development and Distribution License ("CDDL"), version 1.0. | ||||
| # You may only use this file in accordance with the terms of version | ||||
| # 1.0 of the CDDL. | ||||
| # | ||||
| # A full copy of the text of the CDDL should have accompanied this | ||||
| # source.  A copy of the CDDL is also available via the Internet at | ||||
| # http://www.illumos.org/license/CDDL. | ||||
| # | ||||
| # CDDL HEADER END | ||||
| # | ||||
| 
 | ||||
| # | ||||
| # Copyright (c) 2024, Klara Inc | ||||
| # | ||||
| 
 | ||||
| # DESCRIPTION: | ||||
| #	Verify that long VDEV probes do not cause MMP checks to suspend pool | ||||
| #	Note: without PR-15839 fix, this test will suspend the pool. | ||||
| # | ||||
| #	A device that is returning unexpected errors will trigger a vdev_probe. | ||||
| #	When the device additionally has slow response times, the probe can hold | ||||
| #	the spa config lock as a writer for a long period of time such that the | ||||
| #	mmp uberblock updates stall when trying to acquire the spa config lock. | ||||
| # | ||||
| # STRATEGY: | ||||
| #	1. Create a pool with multiple leaf vdevs | ||||
| #	2. Enable multihost and multihost_history | ||||
| #	3. Delay for MMP writes to occur | ||||
| #	4. Verify that a long VDEV probe didn't cause MMP check to suspend pool | ||||
| # | ||||
| 
 | ||||
| . $STF_SUITE/include/libtest.shlib | ||||
| . $STF_SUITE/tests/functional/mmp/mmp.cfg | ||||
| . $STF_SUITE/tests/functional/mmp/mmp.kshlib | ||||
| 
 | ||||
| verify_runnable "both" | ||||
| 
 | ||||
| function cleanup | ||||
| { | ||||
| 	log_must zinject -c all | ||||
| 
 | ||||
| 	if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then | ||||
| 		log_must zpool clear $MMP_POOL | ||||
| 		zpool get state $MMP_POOL $MMP_DIR/file.3 | ||||
| 		zpool events | grep ".fs.zfs." | grep -v "history_event" | ||||
| 	fi | ||||
| 
 | ||||
| 	poolexists $MMP_POOL && destroy_pool $MMP_POOL | ||||
| 	log_must rm -r $MMP_DIR | ||||
| 	log_must mmp_clear_hostid | ||||
| } | ||||
| 
 | ||||
| log_assert "A long VDEV probe doesn't cause a MMP check suspend" | ||||
| log_onexit cleanup | ||||
| 
 | ||||
| MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost | ||||
| 
 | ||||
| # Create a multiple drive pool | ||||
| log_must zpool events -c | ||||
| log_must mkdir -p $MMP_DIR | ||||
| log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5} | ||||
| log_must zpool create -f $MMP_POOL \ | ||||
| 	mirror $MMP_DIR/file.{0,1,2} \ | ||||
| 	mirror $MMP_DIR/file.{3,4,5} | ||||
| 
 | ||||
| # Enable MMP | ||||
| log_must mmp_set_hostid $HOSTID1 | ||||
| log_must zpool set multihost=on $MMP_POOL | ||||
| clear_mmp_history | ||||
| 
 | ||||
| # Inject vdev write error along with a delay | ||||
| log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL | ||||
| log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL | ||||
| log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL | ||||
| 
 | ||||
| log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5 | ||||
| sleep 10 | ||||
| sync_pool $MMP_POOL | ||||
| 
 | ||||
| # Confirm mmp writes to the non-slow disks have taken place | ||||
| for x in {0,1,2,4}; do | ||||
| 	write_count=$(grep -c file.${x} $MMP_HISTORY_URL) | ||||
| 	[[ $write_count -gt 0 ]] || log_fail "expecting mmp writes" | ||||
| done | ||||
| 
 | ||||
| # Expect that the pool was not suspended | ||||
| log_must check_state $MMP_POOL "" "ONLINE" | ||||
| health=$(zpool list -H -o health $MMP_POOL) | ||||
| log_note "$MMP_POOL health is $health" | ||||
| [[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected" | ||||
| 
 | ||||
| log_pass "A long VDEV probe doesn't cause a MMP check suspend" | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Don Brady
						Don Brady