Distributed Spare (dRAID) Feature

This patch adds a new top-level vdev type called dRAID, which stands for Distributed parity RAID. This pool configuration allows all dRAID vdevs to participate when rebuilding to a distributed hot spare device. This can substantially reduce the total time required to restore full parity to pool with a failed device. A dRAID pool can be created using the new top-level `draid` type. Like `raidz`, the desired redundancy is specified after the type: `draid[1,2,3]`. No additional information is required to create the pool and reasonable default values will be chosen based on the number of child vdevs in the dRAID vdev. zpool create <pool> draid[1,2,3] <vdevs...> Unlike raidz, additional optional dRAID configuration values can be provided as part of the draid type as colon separated values. This allows administrators to fully specify a layout for either performance or capacity reasons. The supported options include: zpool create <pool> \ draid[<parity>][:<data>d][:<children>c][:<spares>s] \ <vdevs...> - draid[parity] - Parity level (default 1) - draid[:<data>d] - Data devices per group (default 8) - draid[:<children>c] - Expected number of child vdevs - draid[:<spares>s] - Distributed hot spares (default 0) Abbreviated example `zpool status` output for a 68 disk dRAID pool with two distributed spares using special allocation classes. ``` pool: tank state: ONLINE config: NAME STATE READ WRITE CKSUM slag7 ONLINE 0 0 0 draid2:8d:68c:2s-0 ONLINE 0 0 0 L0 ONLINE 0 0 0 L1 ONLINE 0 0 0 ... U25 ONLINE 0 0 0 U26 ONLINE 0 0 0 spare-53 ONLINE 0 0 0 U27 ONLINE 0 0 0 draid2-0-0 ONLINE 0 0 0 U28 ONLINE 0 0 0 U29 ONLINE 0 0 0 ... U42 ONLINE 0 0 0 U43 ONLINE 0 0 0 special mirror-1 ONLINE 0 0 0 L5 ONLINE 0 0 0 U5 ONLINE 0 0 0 mirror-2 ONLINE 0 0 0 L6 ONLINE 0 0 0 U6 ONLINE 0 0 0 spares draid2-0-0 INUSE currently in use draid2-0-1 AVAIL ``` When adding test coverage for the new dRAID vdev type the following options were added to the ztest command. These options are leverages by zloop.sh to test a wide range of dRAID configurations. -K draid|raidz|random - kind of RAID to test -D <value> - dRAID data drives per group -S <value> - dRAID distributed hot spares -R <value> - RAID parity (raidz or dRAID) The zpool_create, zpool_import, redundancy, replacement and fault test groups have all been updated provide test coverage for the dRAID feature. Co-authored-by: Isaac Huang <he.huang@intel.com> Co-authored-by: Mark Maybee <mmaybee@cray.com> Co-authored-by: Don Brady <don.brady@delphix.com> Co-authored-by: Matthew Ahrens <mahrens@delphix.com> Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Mark Maybee <mmaybee@cray.com> Reviewed-by: Matt Ahrens <matt@delphix.com> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #10102
2026-05-24 03:08:51 +03:00 · 2020-11-13 13:51:51 -08:00
parent a724db0374
commit b2255edcc0
153 changed files with 10203 additions and 1882 deletions
@@ -40,6 +40,7 @@
 #include <sys/dsl_dir.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_rebuild.h>
+#include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
@@ -51,6 +52,7 @@
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
+#include <sys/vdev_raidz.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
@@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
 static vdev_ops_t *vdev_ops_table[] = {
 	&vdev_root_ops,
 	&vdev_raidz_ops,
+	&vdev_draid_ops,
+	&vdev_draid_spare_ops,
 	&vdev_mirror_ops,
 	&vdev_replacing_ops,
 	&vdev_spare_ops,
@@ -221,10 +225,11 @@ vdev_getops(const char *type)

 /* ARGSUSED */
 void
-vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
+vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
-	res->rs_start = in->rs_start;
-	res->rs_end = in->rs_end;
+	physical_rs->rs_start = logical_rs->rs_start;
+	physical_rs->rs_end = logical_rs->rs_end;
 }

 /*
@@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
 	return (asize);
 }

+uint64_t
+vdev_default_min_asize(vdev_t *vd)
+{
+	return (vd->vdev_min_asize);
+}
+
 /*
 * Get the minimum allocatable size. We define the allocatable size as
 * the vdev's asize rounded to the nearest metaslab. This allows us to
@@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd)
 	if (vd == vd->vdev_top)
 		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));

-	/*
-	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
-	 * so each child must provide at least 1/Nth of its asize.
-	 */
-	if (pvd->vdev_ops == &vdev_raidz_ops)
-		return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
-		    pvd->vdev_children);
-
-	return (pvd->vdev_min_asize);
+	return (pvd->vdev_ops->vdev_op_min_asize(pvd));
 }

 void
@@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd)
 		vdev_set_min_asize(vd->vdev_child[c]);
 }

+/*
+ * Get the minimal allocation size for the top-level vdev.
+ */
+uint64_t
+vdev_get_min_alloc(vdev_t *vd)
+{
+	uint64_t min_alloc = 1ULL << vd->vdev_ashift;
+
+	if (vd->vdev_ops->vdev_op_min_alloc != NULL)
+		min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
+
+	return (min_alloc);
+}
+
+/*
+ * Get the parity level for a top-level vdev.
+ */
+uint64_t
+vdev_get_nparity(vdev_t *vd)
+{
+	uint64_t nparity = 0;
+
+	if (vd->vdev_ops->vdev_op_nparity != NULL)
+		nparity = vd->vdev_ops->vdev_op_nparity(vd);
+
+	return (nparity);
+}
+
+/*
+ * Get the number of data disks for a top-level vdev.
+ */
+uint64_t
+vdev_get_ndisks(vdev_t *vd)
+{
+	uint64_t ndisks = 1;
+
+	if (vd->vdev_ops->vdev_op_ndisks != NULL)
+		ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
+
+	return (ndisks);
+}
+
 vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
@@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	list_link_init(&vd->vdev_initialize_node);
 	list_link_init(&vd->vdev_leaf_node);
 	list_link_init(&vd->vdev_trim_node);
+
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);

 	mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);

 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
@@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 {
 	vdev_ops_t *ops;
 	char *type;
-	uint64_t guid = 0, islog, nparity;
+	uint64_t guid = 0, islog;
 	vdev_t *vd;
 	vdev_indirect_config_t *vic;
 	char *tmp = NULL;
@@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
 		return (SET_ERROR(ENOTSUP));

-	/*
-	 * Set the nparity property for RAID-Z vdevs.
-	 */
-	nparity = -1ULL;
-	if (ops == &vdev_raidz_ops) {
-		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
-		    &nparity) == 0) {
-			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
-				return (SET_ERROR(EINVAL));
-			/*
-			 * Previous versions could only support 1 or 2 parity
-			 * device.
-			 */
-			if (nparity > 1 &&
-			    spa_version(spa) < SPA_VERSION_RAIDZ2)
-				return (SET_ERROR(ENOTSUP));
-			if (nparity > 2 &&
-			    spa_version(spa) < SPA_VERSION_RAIDZ3)
-				return (SET_ERROR(ENOTSUP));
-		} else {
-			/*
-			 * We require the parity to be specified for SPAs that
-			 * support multiple parity levels.
-			 */
-			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
-				return (SET_ERROR(EINVAL));
-			/*
-			 * Otherwise, we default to 1 parity device for RAID-Z.
-			 */
-			nparity = 1;
-		}
-	} else {
-		nparity = 0;
-	}
-	ASSERT(nparity != -1ULL);
-
-	/*
-	 * If creating a top-level vdev, check for allocation classes input
-	 */
 	if (top_level && alloctype == VDEV_ALLOC_ADD) {
 		char *bias;

+		/*
+		 * If creating a top-level vdev, check for allocation
+		 * classes input.
+		 */
 		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
 		    &bias) == 0) {
 			alloc_bias = vdev_derive_alloc_bias(bias);
@@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 				return (SET_ERROR(ENOTSUP));
 			}
 		}
+
+		/* spa_vdev_add() expects feature to be enabled */
+		if (ops == &vdev_draid_ops &&
+		    spa->spa_load_state != SPA_LOAD_CREATE &&
+		    !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
+			return (SET_ERROR(ENOTSUP));
+		}
+	}
+
+	/*
+	 * Initialize the vdev specific data.  This is done before calling
+	 * vdev_alloc_common() since it may fail and this simplifies the
+	 * error reporting and cleanup code paths.
+	 */
+	void *tsd = NULL;
+	if (ops->vdev_op_init != NULL) {
+		rc = ops->vdev_op_init(spa, nv, &tsd);
+		if (rc != 0) {
+			return (rc);
+		}
 	}

 	vd = vdev_alloc_common(spa, id, guid, ops);
-	vic = &vd->vdev_indirect_config;
-
+	vd->vdev_tsd = tsd;
 	vd->vdev_islog = islog;
-	vd->vdev_nparity = nparity;
+
 	if (top_level && alloc_bias != VDEV_BIAS_NONE)
 		vd->vdev_alloc_bias = alloc_bias;

@@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 	    &vd->vdev_wholedisk) != 0)
 		vd->vdev_wholedisk = -1ULL;

+	vic = &vd->vdev_indirect_config;
+
 	ASSERT0(vic->vic_mapping_object);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
 	    &vic->vic_mapping_object);
@@ -937,6 +967,9 @@ vdev_free(vdev_t *vd)
 	ASSERT(vd->vdev_child == NULL);
 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

+	if (vd->vdev_ops->vdev_op_fini != NULL)
+		vd->vdev_ops->vdev_op_fini(vd);
+
 	/*
 	 * Discard allocation state.
 	 */
@@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd)
 	cv_destroy(&vd->vdev_trim_io_cv);

 	mutex_destroy(&vd->vdev_rebuild_lock);
-	mutex_destroy(&vd->vdev_rebuild_io_lock);
 	cv_destroy(&vd->vdev_rebuild_cv);
-	cv_destroy(&vd->vdev_rebuild_io_cv);

 	zfs_ratelimit_fini(&vd->vdev_delay_rl);
 	zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd)
 }

 /*
- * Add a mirror/replacing vdev above an existing vdev.
+ * Add a mirror/replacing vdev above an existing vdev.  There is no need to
+ * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
 */
 vdev_t *
 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
@@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd)
 				spa->spa_max_ashift = vd->vdev_ashift;
 			if (vd->vdev_ashift < spa->spa_min_ashift)
 				spa->spa_min_ashift = vd->vdev_ashift;
+
+			uint64_t min_alloc = vdev_get_min_alloc(vd);
+			if (min_alloc < spa->spa_min_alloc)
+				spa->spa_min_alloc = min_alloc;
 		}
 	}
 }
@@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd)
 	return (B_FALSE);
 }

+/*
+ * Returns B_TRUE if the passed child should be opened.
+ */
+static boolean_t
+vdev_default_open_children_func(vdev_t *vd)
+{
+	return (B_TRUE);
+}
+
+/*
+ * Open the requested child vdevs.  If any of the leaf vdevs are using
+ * a ZFS volume then do the opens in a single thread.  This avoids a
+ * deadlock when the current thread is holding the spa_namespace_lock.
+ */
+static void
+vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+	int children = vd->vdev_children;
+
+	taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
+	    children, children, TASKQ_PREPOPULATE);
+	vd->vdev_nonrot = B_TRUE;
+
+	for (int c = 0; c < children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (open_func(cvd) == B_FALSE)
+			continue;
+
+		if (tq == NULL || vdev_uses_zvols(vd)) {
+			cvd->vdev_open_error = vdev_open(cvd);
+		} else {
+			VERIFY(taskq_dispatch(tq, vdev_open_child,
+			    cvd, TQ_SLEEP) != TASKQID_INVALID);
+		}
+
+		vd->vdev_nonrot &= cvd->vdev_nonrot;
+	}
+
+	if (tq != NULL) {
+		taskq_wait(tq);
+		taskq_destroy(tq);
+	}
+}
+
+/*
+ * Open all child vdevs.
+ */
 void
 vdev_open_children(vdev_t *vd)
 {
-	taskq_t *tq;
-	int children = vd->vdev_children;
+	vdev_open_children_impl(vd, vdev_default_open_children_func);
+}

-	/*
-	 * in order to handle pools on top of zvols, do the opens
-	 * in a single thread so that the same thread holds the
-	 * spa_namespace_lock
-	 */
-	if (vdev_uses_zvols(vd)) {
-retry_sync:
-		for (int c = 0; c < children; c++)
-			vd->vdev_child[c]->vdev_open_error =
-			    vdev_open(vd->vdev_child[c]);
-	} else {
-		tq = taskq_create("vdev_open", children, minclsyspri,
-		    children, children, TASKQ_PREPOPULATE);
-		if (tq == NULL)
-			goto retry_sync;
-
-		for (int c = 0; c < children; c++)
-			VERIFY(taskq_dispatch(tq, vdev_open_child,
-			    vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
-
-		taskq_destroy(tq);
-	}
-
-	vd->vdev_nonrot = B_TRUE;
-
-	for (int c = 0; c < children; c++)
-		vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
+/*
+ * Conditionally open a subset of child vdevs.
+ */
+void
+vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+	vdev_open_children_impl(vd, open_func);
 }

 /*
@@ -1952,6 +2016,16 @@ vdev_open(vdev_t *vd)
 		return (error);
 	}

+	/*
+	 * Track the the minimum allocation size.
+	 */
+	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+	    vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
+		uint64_t min_alloc = vdev_get_min_alloc(vd);
+		if (min_alloc < spa->spa_min_alloc)
+			spa->spa_min_alloc = min_alloc;
+	}
+
 	/*
 	 * If this is a leaf vdev, assess whether a resilver is needed.
 	 * But don't do this if we are doing a reopen for a scrub, since
@@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd)
 	vdev_t *pvd = vd->vdev_parent;
 	spa_t *spa __maybe_unused = vd->vdev_spa;

-	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+	ASSERT(vd != NULL);
+	ASSERT(vd->vdev_open_thread == curthread ||
+	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

 	/*
 	 * If our parent is reopening, then we are as well, unless we are
@@ -2606,10 +2682,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 }

 /*
- * Returns B_TRUE if vdev determines offset needs to be resilvered.
+ * Check if the txg falls within the range which must be
+ * resilvered.  DVAs outside this range can always be skipped.
 */
 boolean_t
-vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
+{
+	/* Set by sequential resilver. */
+	if (phys_birth == TXG_UNKNOWN)
+		return (B_TRUE);
+
+	return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
+}
+
+/*
+ * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
+ */
+boolean_t
+vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
 {
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

@@ -2617,7 +2709,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
 	    vd->vdev_ops->vdev_op_leaf)
 		return (B_TRUE);

-	return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
+	return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
+	    phys_birth));
 }

 /*
@@ -2862,8 +2955,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
 			continue;			/* leaf vdevs only */
 		if (t == DTL_PARTIAL)
 			minref = 1;			/* i.e. non-zero */
-		else if (vd->vdev_nparity != 0)
-			minref = vd->vdev_nparity + 1;	/* RAID-Z */
+		else if (vdev_get_nparity(vd) != 0)
+			minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
 		else
 			minref = vd->vdev_children;	/* any kind of mirror */
 		space_reftree_create(&reftree);
@@ -3727,6 +3820,9 @@ top:
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));

+	if (vd->vdev_ops == &vdev_draid_spare_ops)
+		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
 	tvd = vd->vdev_top;
 	mg = tvd->vdev_mg;
 	generation = spa->spa_config_generation + 1;
@@ -3971,6 +4067,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 static void
 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
 {
+	/*
+	 * Exclude the dRAID spare when aggregating to avoid double counting
+	 * the ops and bytes.  These IOs are counted by the physical leaves.
+	 */
+	if (cvd->vdev_ops == &vdev_draid_spare_ops)
+		return;
+
 	for (int t = 0; t < VS_ZIO_TYPES; t++) {
 		vs->vs_ops[t] += cvs->vs_ops[t];
 		vs->vs_bytes[t] += cvs->vs_bytes[t];
@@ -4063,7 +4166,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 				vdev_get_child_stat(cvd, vs, cvs);
 			if (vsx)
 				vdev_get_child_stat_ex(cvd, vsx, cvsx);
-
 		}
 	} else {
 		/*
@@ -4248,7 +4350,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)

 			/*
 			 * Repair is the result of a rebuild issued by the
-			 * rebuild thread (vdev_rebuild_thread).
+			 * rebuild thread (vdev_rebuild_thread).  To avoid
+			 * double counting repaired bytes the virtual dRAID
+			 * spare vdev is excluded from the processed bytes.
 			 */
 			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
 				vdev_t *tvd = vd->vdev_top;
@@ -4256,8 +4360,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 				vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 				uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;

-				if (vd->vdev_ops->vdev_op_leaf)
+				if (vd->vdev_ops->vdev_op_leaf &&
+				    vd->vdev_ops != &vdev_draid_spare_ops) {
 					atomic_add_64(rebuilt, psize);
+				}
 				vs->vs_rebuild_processed += psize;
 			}

@@ -4981,31 +5087,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
 	    vdev_resilver_needed(vd, NULL, NULL));
 }

+boolean_t
+vdev_xlate_is_empty(range_seg64_t *rs)
+{
+	return (rs->rs_start == rs->rs_end);
+}
+
 /*
- * Translate a logical range to the physical range for the specified vdev_t.
- * This function is initially called with a leaf vdev and will walk each
- * parent vdev until it reaches a top-level vdev. Once the top-level is
- * reached the physical range is initialized and the recursive function
- * begins to unwind. As it unwinds it calls the parent's vdev specific
- * translation function to do the real conversion.
+ * Translate a logical range to the first contiguous physical range for the
+ * specified vdev_t.  This function is initially called with a leaf vdev and
+ * will walk each parent vdev until it reaches a top-level vdev. Once the
+ * top-level is reached the physical range is initialized and the recursive
+ * function begins to unwind. As it unwinds it calls the parent's vdev
+ * specific translation function to do the real conversion.
 */
 void
 vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
-    range_seg64_t *physical_rs)
+    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
 	/*
 	 * Walk up the vdev tree
 	 */
 	if (vd != vd->vdev_top) {
-		vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+		vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
+		    remain_rs);
 	} else {
 		/*
-		 * We've reached the top-level vdev, initialize the
-		 * physical range to the logical range and start to
-		 * unwind.
+		 * We've reached the top-level vdev, initialize the physical
+		 * range to the logical range and set an empty remaining
+		 * range then start to unwind.
 		 */
 		physical_rs->rs_start = logical_rs->rs_start;
 		physical_rs->rs_end = logical_rs->rs_end;
+
+		remain_rs->rs_start = logical_rs->rs_start;
+		remain_rs->rs_end = logical_rs->rs_start;
+
 		return;
 	}

@@ -5015,16 +5132,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,

 	/*
 	 * As this recursive function unwinds, translate the logical
-	 * range into its physical components by calling the
-	 * vdev specific translate function.
+	 * range into its physical and any remaining components by calling
+	 * the vdev specific translate function.
 	 */
 	range_seg64_t intermediate = { 0 };
-	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);

 	physical_rs->rs_start = intermediate.rs_start;
 	physical_rs->rs_end = intermediate.rs_end;
 }

+void
+vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
+    vdev_xlate_func_t *func, void *arg)
+{
+	range_seg64_t iter_rs = *logical_rs;
+	range_seg64_t physical_rs;
+	range_seg64_t remain_rs;
+
+	while (!vdev_xlate_is_empty(&iter_rs)) {
+
+		vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
+
+		/*
+		 * With raidz and dRAID, it's possible that the logical range
+		 * does not live on this leaf vdev. Only when there is a non-
+		 * zero physical size call the provided function.
+		 */
+		if (!vdev_xlate_is_empty(&physical_rs))
+			func(arg, &physical_rs);
+
+		iter_rs = remain_rs;
+	}
+}
+
 /*
 * Look at the vdev tree and determine whether any devices are currently being
 * replaced.