Skip spurious resilver IO on raidz vdev

On a raidz vdev, a block that does not span all child vdevs, excluding
its skip sectors if any, may not be affected by a child vdev outage or
failure. In such cases, the block does not need to be resilvered.
However, current resilver algorithm simply resilvers all blocks on a
degraded raidz vdev. Such spurious IO is not only wasteful, but also
adds the risk of overwriting good data.

This patch eliminates such spurious IOs.

Reviewed-by: Gvozden Neskovic <neskovic@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Isaac Huang <he.huang@intel.com>
Closes #5316
This commit is contained in:
Isaac Huang
2017-05-12 18:28:03 -06:00
committed by Brian Behlendorf
parent 8c54ddd33a
commit 3d6da72d18
10 changed files with 125 additions and 33 deletions
+49 -23
View File
@@ -1836,12 +1836,51 @@ dsl_scan_scrub_done(zio_t *zio)
mutex_exit(&spa->spa_scrub_lock);
}
static boolean_t
dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
vdev_t *vd;
if (DVA_GET_GANG(dva)) {
/*
* Gang members may be spread across multiple
* vdevs, so the best estimate we have is the
* scrub range, which has already been checked.
* XXX -- it would be better to change our
* allocation policy to ensure that all
* gang members reside on the same vdev.
*/
return (B_TRUE);
}
vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
/*
* Check if the txg falls within the range which must be
* resilvered. DVAs outside this range can always be skipped.
*/
if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
return (B_FALSE);
/*
* Check if the top-level vdev must resilver this offset.
* When the offset does not intersect with a dirty leaf DTL
* then it may be possible to skip the resilver IO. The psize
* is provided instead of asize to simplify the check for RAIDZ.
*/
if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
return (B_FALSE);
return (B_TRUE);
}
static int
dsl_scan_scrub_cb(dsl_pool_t *dp,
const blkptr_t *bp, const zbookmark_phys_t *zb)
{
dsl_scan_t *scn = dp->dp_scan;
size_t size = BP_GET_PSIZE(bp);
size_t psize = BP_GET_PSIZE(bp);
spa_t *spa = dp->dp_spa;
uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
boolean_t needs_io = B_FALSE;
@@ -1875,33 +1914,19 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
zio_flags |= ZIO_FLAG_SPECULATIVE;
for (d = 0; d < BP_GET_NDVAS(bp); d++) {
vdev_t *vd = vdev_lookup_top(spa,
DVA_GET_VDEV(&bp->blk_dva[d]));
const dva_t *dva = &bp->blk_dva[d];
/*
* Keep track of how much data we've examined so that
* zpool(1M) status can make useful progress reports.
*/
scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
/* if it's a resilver, this may not be in the target range */
if (!needs_io) {
if (DVA_GET_GANG(&bp->blk_dva[d])) {
/*
* Gang members may be spread across multiple
* vdevs, so the best estimate we have is the
* scrub range, which has already been checked.
* XXX -- it would be better to change our
* allocation policy to ensure that all
* gang members reside on the same vdev.
*/
needs_io = B_TRUE;
} else {
needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
phys_birth, 1);
}
}
if (!needs_io)
needs_io = dsl_scan_need_resilver(spa, dva, psize,
phys_birth);
}
if (needs_io && !zfs_no_scrub_io) {
@@ -1922,8 +1947,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
delay(scan_delay);
zio_nowait(zio_read(NULL, spa, bp,
abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done,
NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
abd_alloc_for_io(psize, B_FALSE),
psize, dsl_scan_scrub_done, NULL,
ZIO_PRIORITY_SCRUB, zio_flags, zb));
}
/* do not relocate this block */