Skip spurious resilver IO on raidz vdev

On a raidz vdev, a block that does not span all child vdevs, excluding its skip sectors if any, may not be affected by a child vdev outage or failure. In such cases, the block does not need to be resilvered. However, current resilver algorithm simply resilvers all blocks on a degraded raidz vdev. Such spurious IO is not only wasteful, but also adds the risk of overwriting good data. This patch eliminates such spurious IOs. Reviewed-by: Gvozden Neskovic <neskovic@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Isaac Huang <he.huang@intel.com> Closes #5316
2026-05-26 04:07:45 +03:00 · 2017-05-12 18:28:03 -06:00
parent 8c54ddd33a
commit 3d6da72d18
10 changed files with 125 additions and 33 deletions
@@ -1836,12 +1836,51 @@ dsl_scan_scrub_done(zio_t *zio)
 	mutex_exit(&spa->spa_scrub_lock);
 }

+static boolean_t
+dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
+{
+	vdev_t *vd;
+
+	if (DVA_GET_GANG(dva)) {
+		/*
+		 * Gang members may be spread across multiple
+		 * vdevs, so the best estimate we have is the
+		 * scrub range, which has already been checked.
+		 * XXX -- it would be better to change our
+		 * allocation policy to ensure that all
+		 * gang members reside on the same vdev.
+		 */
+		return (B_TRUE);
+	}
+
+	vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+	/*
+	 * Check if the txg falls within the range which must be
+	 * resilvered.  DVAs outside this range can always be skipped.
+	 */
+	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
+		return (B_FALSE);
+
+	/*
+	 * Check if the top-level vdev must resilver this offset.
+	 * When the offset does not intersect with a dirty leaf DTL
+	 * then it may be possible to skip the resilver IO.  The psize
+	 * is provided instead of asize to simplify the check for RAIDZ.
+	 */
+	if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
 static int
 dsl_scan_scrub_cb(dsl_pool_t *dp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	dsl_scan_t *scn = dp->dp_scan;
-	size_t size = BP_GET_PSIZE(bp);
+	size_t psize = BP_GET_PSIZE(bp);
 	spa_t *spa = dp->dp_spa;
 	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
 	boolean_t needs_io = B_FALSE;
@@ -1875,33 +1914,19 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
 		zio_flags |= ZIO_FLAG_SPECULATIVE;

 	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
-		vdev_t *vd = vdev_lookup_top(spa,
-		    DVA_GET_VDEV(&bp->blk_dva[d]));
+		const dva_t *dva = &bp->blk_dva[d];

 		/*
 		 * Keep track of how much data we've examined so that
 		 * zpool(1M) status can make useful progress reports.
 		 */
-		scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
-		spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+		scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
+		spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);

 		/* if it's a resilver, this may not be in the target range */
-		if (!needs_io) {
-			if (DVA_GET_GANG(&bp->blk_dva[d])) {
-				/*
-				 * Gang members may be spread across multiple
-				 * vdevs, so the best estimate we have is the
-				 * scrub range, which has already been checked.
-				 * XXX -- it would be better to change our
-				 * allocation policy to ensure that all
-				 * gang members reside on the same vdev.
-				 */
-				needs_io = B_TRUE;
-			} else {
-				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
-				    phys_birth, 1);
-			}
-		}
+		if (!needs_io)
+			needs_io = dsl_scan_need_resilver(spa, dva, psize,
+			    phys_birth);
 	}

 	if (needs_io && !zfs_no_scrub_io) {
@@ -1922,8 +1947,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
 			delay(scan_delay);

 		zio_nowait(zio_read(NULL, spa, bp,
-		    abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done,
-		    NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
+		    abd_alloc_for_io(psize, B_FALSE),
+		    psize, dsl_scan_scrub_done, NULL,
+		    ZIO_PRIORITY_SCRUB, zio_flags, zb));
 	}

 	/* do not relocate this block */