mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-26 18:05:04 +03:00 
			
		
		
		
	Skip spurious resilver IO on raidz vdev
On a raidz vdev, a block that does not span all child vdevs, excluding its skip sectors if any, may not be affected by a child vdev outage or failure. In such cases, the block does not need to be resilvered. However, current resilver algorithm simply resilvers all blocks on a degraded raidz vdev. Such spurious IO is not only wasteful, but also adds the risk of overwriting good data. This patch eliminates such spurious IOs. Reviewed-by: Gvozden Neskovic <neskovic@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Isaac Huang <he.huang@intel.com> Closes #5316
This commit is contained in:
		
							parent
							
								
									8c54ddd33a
								
							
						
					
					
						commit
						3d6da72d18
					
				| @ -65,6 +65,7 @@ extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, | ||||
| extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, | ||||
|     uint64_t txg, uint64_t size); | ||||
| extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); | ||||
| extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); | ||||
| extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, | ||||
|     int scrub_done); | ||||
| extern boolean_t vdev_dtl_required(vdev_t *vd); | ||||
|  | ||||
| @ -68,6 +68,7 @@ typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); | ||||
| typedef void	vdev_io_start_func_t(zio_t *zio); | ||||
| typedef void	vdev_io_done_func_t(zio_t *zio); | ||||
| typedef void	vdev_state_change_func_t(vdev_t *vd, int, int); | ||||
| typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); | ||||
| typedef void	vdev_hold_func_t(vdev_t *vd); | ||||
| typedef void	vdev_rele_func_t(vdev_t *vd); | ||||
| 
 | ||||
| @ -78,6 +79,7 @@ typedef const struct vdev_ops { | ||||
| 	vdev_io_start_func_t		*vdev_op_io_start; | ||||
| 	vdev_io_done_func_t		*vdev_op_io_done; | ||||
| 	vdev_state_change_func_t	*vdev_op_state_change; | ||||
| 	vdev_need_resilver_func_t	*vdev_op_need_resilver; | ||||
| 	vdev_hold_func_t		*vdev_op_hold; | ||||
| 	vdev_rele_func_t		*vdev_op_rele; | ||||
| 	char				vdev_op_type[16]; | ||||
|  | ||||
| @ -1836,12 +1836,51 @@ dsl_scan_scrub_done(zio_t *zio) | ||||
| 	mutex_exit(&spa->spa_scrub_lock); | ||||
| } | ||||
| 
 | ||||
| static boolean_t | ||||
| dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, | ||||
|     uint64_t phys_birth) | ||||
| { | ||||
| 	vdev_t *vd; | ||||
| 
 | ||||
| 	if (DVA_GET_GANG(dva)) { | ||||
| 		/*
 | ||||
| 		 * Gang members may be spread across multiple | ||||
| 		 * vdevs, so the best estimate we have is the | ||||
| 		 * scrub range, which has already been checked. | ||||
| 		 * XXX -- it would be better to change our | ||||
| 		 * allocation policy to ensure that all | ||||
| 		 * gang members reside on the same vdev. | ||||
| 		 */ | ||||
| 		return (B_TRUE); | ||||
| 	} | ||||
| 
 | ||||
| 	vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Check if the txg falls within the range which must be | ||||
| 	 * resilvered.  DVAs outside this range can always be skipped. | ||||
| 	 */ | ||||
| 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) | ||||
| 		return (B_FALSE); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Check if the top-level vdev must resilver this offset. | ||||
| 	 * When the offset does not intersect with a dirty leaf DTL | ||||
| 	 * then it may be possible to skip the resilver IO.  The psize | ||||
| 	 * is provided instead of asize to simplify the check for RAIDZ. | ||||
| 	 */ | ||||
| 	if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) | ||||
| 		return (B_FALSE); | ||||
| 
 | ||||
| 	return (B_TRUE); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| dsl_scan_scrub_cb(dsl_pool_t *dp, | ||||
|     const blkptr_t *bp, const zbookmark_phys_t *zb) | ||||
| { | ||||
| 	dsl_scan_t *scn = dp->dp_scan; | ||||
| 	size_t size = BP_GET_PSIZE(bp); | ||||
| 	size_t psize = BP_GET_PSIZE(bp); | ||||
| 	spa_t *spa = dp->dp_spa; | ||||
| 	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); | ||||
| 	boolean_t needs_io = B_FALSE; | ||||
| @ -1875,33 +1914,19 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, | ||||
| 		zio_flags |= ZIO_FLAG_SPECULATIVE; | ||||
| 
 | ||||
| 	for (d = 0; d < BP_GET_NDVAS(bp); d++) { | ||||
| 		vdev_t *vd = vdev_lookup_top(spa, | ||||
| 		    DVA_GET_VDEV(&bp->blk_dva[d])); | ||||
| 		const dva_t *dva = &bp->blk_dva[d]; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Keep track of how much data we've examined so that | ||||
| 		 * zpool(1M) status can make useful progress reports. | ||||
| 		 */ | ||||
| 		scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); | ||||
| 		spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); | ||||
| 		scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); | ||||
| 		spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); | ||||
| 
 | ||||
| 		/* if it's a resilver, this may not be in the target range */ | ||||
| 		if (!needs_io) { | ||||
| 			if (DVA_GET_GANG(&bp->blk_dva[d])) { | ||||
| 				/*
 | ||||
| 				 * Gang members may be spread across multiple | ||||
| 				 * vdevs, so the best estimate we have is the | ||||
| 				 * scrub range, which has already been checked. | ||||
| 				 * XXX -- it would be better to change our | ||||
| 				 * allocation policy to ensure that all | ||||
| 				 * gang members reside on the same vdev. | ||||
| 				 */ | ||||
| 				needs_io = B_TRUE; | ||||
| 			} else { | ||||
| 				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, | ||||
| 				    phys_birth, 1); | ||||
| 			} | ||||
| 		} | ||||
| 		if (!needs_io) | ||||
| 			needs_io = dsl_scan_need_resilver(spa, dva, psize, | ||||
| 			    phys_birth); | ||||
| 	} | ||||
| 
 | ||||
| 	if (needs_io && !zfs_no_scrub_io) { | ||||
| @ -1922,8 +1947,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, | ||||
| 			delay(scan_delay); | ||||
| 
 | ||||
| 		zio_nowait(zio_read(NULL, spa, bp, | ||||
| 		    abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done, | ||||
| 		    NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb)); | ||||
| 		    abd_alloc_for_io(psize, B_FALSE), | ||||
| 		    psize, dsl_scan_scrub_done, NULL, | ||||
| 		    ZIO_PRIORITY_SCRUB, zio_flags, zb)); | ||||
| 	} | ||||
| 
 | ||||
| 	/* do not relocate this block */ | ||||
|  | ||||
| @ -1819,6 +1819,21 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) | ||||
| 	return (empty); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Returns B_TRUE if vdev determines offset needs to be resilvered. | ||||
|  */ | ||||
| boolean_t | ||||
| vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) | ||||
| { | ||||
| 	ASSERT(vd != vd->vdev_spa->spa_root_vdev); | ||||
| 
 | ||||
| 	if (vd->vdev_ops->vdev_op_need_resilver == NULL || | ||||
| 	    vd->vdev_ops->vdev_op_leaf) | ||||
| 		return (B_TRUE); | ||||
| 
 | ||||
| 	return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Returns the lowest txg in the DTL range. | ||||
|  */ | ||||
|  | ||||
| @ -796,6 +796,7 @@ vdev_ops_t vdev_disk_ops = { | ||||
| 	vdev_disk_io_start, | ||||
| 	vdev_disk_io_done, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	vdev_disk_hold, | ||||
| 	vdev_disk_rele, | ||||
| 	VDEV_TYPE_DISK,		/* name of this vdev type */ | ||||
|  | ||||
| @ -250,6 +250,7 @@ vdev_ops_t vdev_file_ops = { | ||||
| 	vdev_file_io_start, | ||||
| 	vdev_file_io_done, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	vdev_file_hold, | ||||
| 	vdev_file_rele, | ||||
| 	VDEV_TYPE_FILE,		/* name of this vdev type */ | ||||
| @ -283,6 +284,7 @@ vdev_ops_t vdev_disk_ops = { | ||||
| 	vdev_file_io_start, | ||||
| 	vdev_file_io_done, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	vdev_file_hold, | ||||
| 	vdev_file_rele, | ||||
| 	VDEV_TYPE_DISK,		/* name of this vdev type */ | ||||
|  | ||||
| @ -615,6 +615,7 @@ vdev_ops_t vdev_mirror_ops = { | ||||
| 	vdev_mirror_state_change, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	VDEV_TYPE_MIRROR,	/* name of this vdev type */ | ||||
| 	B_FALSE			/* not a leaf vdev */ | ||||
| }; | ||||
| @ -628,6 +629,7 @@ vdev_ops_t vdev_replacing_ops = { | ||||
| 	vdev_mirror_state_change, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	VDEV_TYPE_REPLACING,	/* name of this vdev type */ | ||||
| 	B_FALSE			/* not a leaf vdev */ | ||||
| }; | ||||
| @ -641,6 +643,7 @@ vdev_ops_t vdev_spare_ops = { | ||||
| 	vdev_mirror_state_change, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	VDEV_TYPE_SPARE,	/* name of this vdev type */ | ||||
| 	B_FALSE			/* not a leaf vdev */ | ||||
| }; | ||||
|  | ||||
| @ -88,6 +88,7 @@ vdev_ops_t vdev_missing_ops = { | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	VDEV_TYPE_MISSING,	/* name of this vdev type */ | ||||
| 	B_TRUE			/* leaf vdev */ | ||||
| }; | ||||
| @ -101,6 +102,7 @@ vdev_ops_t vdev_hole_ops = { | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	VDEV_TYPE_HOLE,		/* name of this vdev type */ | ||||
| 	B_TRUE			/* leaf vdev */ | ||||
| }; | ||||
|  | ||||
| @ -330,18 +330,18 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { | ||||
|  * is this functions only caller, as small as possible on the stack. | ||||
|  */ | ||||
| noinline raidz_map_t * | ||||
| vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, | ||||
| vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, | ||||
|     uint64_t nparity) | ||||
| { | ||||
| 	raidz_map_t *rm; | ||||
| 	/* The starting RAIDZ (parent) vdev sector of the block. */ | ||||
| 	uint64_t b = zio->io_offset >> unit_shift; | ||||
| 	uint64_t b = zio->io_offset >> ashift; | ||||
| 	/* The zio's size in units of the vdev's minimum sector size. */ | ||||
| 	uint64_t s = zio->io_size >> unit_shift; | ||||
| 	uint64_t s = zio->io_size >> ashift; | ||||
| 	/* The first column for this stripe. */ | ||||
| 	uint64_t f = b % dcols; | ||||
| 	/* The starting byte offset on each child vdev. */ | ||||
| 	uint64_t o = (b / dcols) << unit_shift; | ||||
| 	uint64_t o = (b / dcols) << ashift; | ||||
| 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; | ||||
| 	uint64_t off = 0; | ||||
| 
 | ||||
| @ -400,7 +400,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, | ||||
| 		coff = o; | ||||
| 		if (col >= dcols) { | ||||
| 			col -= dcols; | ||||
| 			coff += 1ULL << unit_shift; | ||||
| 			coff += 1ULL << ashift; | ||||
| 		} | ||||
| 		rm->rm_col[c].rc_devidx = col; | ||||
| 		rm->rm_col[c].rc_offset = coff; | ||||
| @ -413,17 +413,17 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, | ||||
| 		if (c >= acols) | ||||
| 			rm->rm_col[c].rc_size = 0; | ||||
| 		else if (c < bc) | ||||
| 			rm->rm_col[c].rc_size = (q + 1) << unit_shift; | ||||
| 			rm->rm_col[c].rc_size = (q + 1) << ashift; | ||||
| 		else | ||||
| 			rm->rm_col[c].rc_size = q << unit_shift; | ||||
| 			rm->rm_col[c].rc_size = q << ashift; | ||||
| 
 | ||||
| 		asize += rm->rm_col[c].rc_size; | ||||
| 	} | ||||
| 
 | ||||
| 	ASSERT3U(asize, ==, tot << unit_shift); | ||||
| 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); | ||||
| 	ASSERT3U(asize, ==, tot << ashift); | ||||
| 	rm->rm_asize = roundup(asize, (nparity + 1) << ashift); | ||||
| 	rm->rm_nskip = roundup(tot, nparity + 1) - tot; | ||||
| 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); | ||||
| 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); | ||||
| 	ASSERT3U(rm->rm_nskip, <=, nparity); | ||||
| 
 | ||||
| 	for (c = 0; c < rm->rm_firstdatacol; c++) | ||||
| @ -2299,6 +2299,44 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) | ||||
| 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Determine if any portion of the provided block resides on a child vdev | ||||
|  * with a dirty DTL and therefore needs to be resilvered.  The function | ||||
|  * assumes that at least one DTL is dirty which imples that full stripe | ||||
|  * width blocks must be resilvered. | ||||
|  */ | ||||
| static boolean_t | ||||
| vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) | ||||
| { | ||||
| 	uint64_t dcols = vd->vdev_children; | ||||
| 	uint64_t nparity = vd->vdev_nparity; | ||||
| 	uint64_t ashift = vd->vdev_top->vdev_ashift; | ||||
| 	/* The starting RAIDZ (parent) vdev sector of the block. */ | ||||
| 	uint64_t b = offset >> ashift; | ||||
| 	/* The zio's size in units of the vdev's minimum sector size. */ | ||||
| 	uint64_t s = ((psize - 1) >> ashift) + 1; | ||||
| 	/* The first column for this stripe. */ | ||||
| 	uint64_t f = b % dcols; | ||||
| 
 | ||||
| 	if (s + nparity >= dcols) | ||||
| 		return (B_TRUE); | ||||
| 
 | ||||
| 	for (uint64_t c = 0; c < s + nparity; c++) { | ||||
| 		uint64_t devidx = (f + c) % dcols; | ||||
| 		vdev_t *cvd = vd->vdev_child[devidx]; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * dsl_scan_need_resilver() already checked vd with | ||||
| 		 * vdev_dtl_contains(). So here just check cvd with | ||||
| 		 * vdev_dtl_empty(), cheaper and a good approximation. | ||||
| 		 */ | ||||
| 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) | ||||
| 			return (B_TRUE); | ||||
| 	} | ||||
| 
 | ||||
| 	return (B_FALSE); | ||||
| } | ||||
| 
 | ||||
| vdev_ops_t vdev_raidz_ops = { | ||||
| 	vdev_raidz_open, | ||||
| 	vdev_raidz_close, | ||||
| @ -2306,6 +2344,7 @@ vdev_ops_t vdev_raidz_ops = { | ||||
| 	vdev_raidz_io_start, | ||||
| 	vdev_raidz_io_done, | ||||
| 	vdev_raidz_state_change, | ||||
| 	vdev_raidz_need_resilver, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */ | ||||
|  | ||||
| @ -120,6 +120,7 @@ vdev_ops_t vdev_root_ops = { | ||||
| 	vdev_root_state_change, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	NULL, | ||||
| 	VDEV_TYPE_ROOT,		/* name of this vdev type */ | ||||
| 	B_FALSE			/* not a leaf vdev */ | ||||
| }; | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Isaac Huang
						Isaac Huang