Distributed Spare (dRAID) Feature

This patch adds a new top-level vdev type called dRAID, which stands for Distributed parity RAID. This pool configuration allows all dRAID vdevs to participate when rebuilding to a distributed hot spare device. This can substantially reduce the total time required to restore full parity to a pool with a failed device. A dRAID pool can be created using the new top-level `draid` type. Like `raidz`, the desired redundancy is specified after the type: `draid[1,2,3]`. No additional information is required to create the pool and reasonable default values will be chosen based on the number of child vdevs in the dRAID vdev. zpool create <pool> draid[1,2,3] <vdevs...> Unlike raidz, additional optional dRAID configuration values can be provided as part of the draid type as colon separated values. This allows administrators to fully specify a layout for either performance or capacity reasons. The supported options include: zpool create <pool> \ draid[<parity>][:<data>d][:<children>c][:<spares>s] \ <vdevs...> - draid[parity] - Parity level (default 1) - draid[:<data>d] - Data devices per group (default 8) - draid[:<children>c] - Expected number of child vdevs - draid[:<spares>s] - Distributed hot spares (default 0) Abbreviated example `zpool status` output for a 68 disk dRAID pool with two distributed spares using special allocation classes. ``` pool: tank state: ONLINE config: NAME STATE READ WRITE CKSUM slag7 ONLINE 0 0 0 draid2:8d:68c:2s-0 ONLINE 0 0 0 L0 ONLINE 0 0 0 L1 ONLINE 0 0 0 ... U25 ONLINE 0 0 0 U26 ONLINE 0 0 0 spare-53 ONLINE 0 0 0 U27 ONLINE 0 0 0 draid2-0-0 ONLINE 0 0 0 U28 ONLINE 0 0 0 U29 ONLINE 0 0 0 ... U42 ONLINE 0 0 0 U43 ONLINE 0 0 0 special mirror-1 ONLINE 0 0 0 L5 ONLINE 0 0 0 U5 ONLINE 0 0 0 mirror-2 ONLINE 0 0 0 L6 ONLINE 0 0 0 U6 ONLINE 0 0 0 spares draid2-0-0 INUSE currently in use draid2-0-1 AVAIL ``` When adding test coverage for the new dRAID vdev type the following options were added to the ztest command. 
These options are leveraged by zloop.sh to test a wide range of dRAID configurations. -K draid|raidz|random - kind of RAID to test -D <value> - dRAID data drives per group -S <value> - dRAID distributed hot spares -R <value> - RAID parity (raidz or dRAID) The zpool_create, zpool_import, redundancy, replacement and fault test groups have all been updated to provide test coverage for the dRAID feature. Co-authored-by: Isaac Huang <he.huang@intel.com> Co-authored-by: Mark Maybee <mmaybee@cray.com> Co-authored-by: Don Brady <don.brady@delphix.com> Co-authored-by: Matthew Ahrens <mahrens@delphix.com> Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Mark Maybee <mmaybee@cray.com> Reviewed-by: Matt Ahrens <matt@delphix.com> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #10102
2026-05-22 18:40:43 +03:00 · 2020-11-13 13:51:51 -08:00
parent a724db0374
commit b2255edcc0
153 changed files with 10203 additions and 1882 deletions
@@ -84,6 +84,8 @@ $(MODULE)-objs += uberblock.o
 $(MODULE)-objs += unique.o
 $(MODULE)-objs += vdev.o
 $(MODULE)-objs += vdev_cache.o
+$(MODULE)-objs += vdev_draid.o
+$(MODULE)-objs += vdev_draid_rand.o
 $(MODULE)-objs += vdev_indirect.o
 $(MODULE)-objs += vdev_indirect_births.o
 $(MODULE)-objs += vdev_indirect_mapping.o
@@ -781,16 +781,17 @@ int
 abd_iterate_func(abd_t *abd, size_t off, size_t size,
    abd_iter_func_t *func, void *private)
 {
-	int ret = 0;
 	struct abd_iter aiter;
-	boolean_t abd_multi;
-	abd_t *c_abd;
+	int ret = 0;
+
+	if (size == 0)
+		return (0);

 	abd_verify(abd);
 	ASSERT3U(off + size, <=, abd->abd_size);

-	abd_multi = abd_is_gang(abd);
-	c_abd = abd_init_abd_iter(abd, &aiter, off);
+	boolean_t abd_multi = abd_is_gang(abd);
+	abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);

 	while (size > 0) {
 		/* If we are at the end of the gang ABD we are done */
@@ -920,6 +921,9 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
 	boolean_t dabd_is_gang_abd, sabd_is_gang_abd;
 	abd_t *c_dabd, *c_sabd;

+	if (size == 0)
+		return (0);
+
 	abd_verify(dabd);
 	abd_verify(sabd);

@@ -713,7 +713,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 	return (0);
 }

-static void
+void
 dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
@@ -3327,20 +3327,13 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
 		return (B_TRUE);
 	}

-	/*
-	 * Check if the txg falls within the range which must be
-	 * resilvered.  DVAs outside this range can always be skipped.
-	 */
-	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
-		return (B_FALSE);
-
 	/*
 	 * Check if the top-level vdev must resilver this offset.
 	 * When the offset does not intersect with a dirty leaf DTL
 	 * then it may be possible to skip the resilver IO.  The psize
 	 * is provided instead of asize to simplify the check for RAIDZ.
 	 */
-	if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
+	if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth))
 		return (B_FALSE);

 	/*
@@ -32,6 +32,7 @@
 #include <sys/space_map.h>
 #include <sys/metaslab_impl.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
 #include <sys/zio.h>
 #include <sys/spa_impl.h>
 #include <sys/zfeature.h>
@@ -1563,6 +1564,7 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,

 #if defined(WITH_DF_BLOCK_ALLOCATOR) || \
    defined(WITH_CF_BLOCK_ALLOCATOR)
+
 /*
 * This is a helper function that can be used by the allocator to find a
 * suitable block to allocate. This will search the specified B-tree looking
@@ -1654,6 +1656,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 		range_seg_t *rs;
 		if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
 			metaslab_size_tree_full_load(msp->ms_allocatable);
+
 		if (metaslab_df_use_largest_segment) {
 			/* use largest free segment */
 			rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
@@ -2616,6 +2619,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
 	ms->ms_allocator = -1;
 	ms->ms_new = B_TRUE;

+	vdev_ops_t *ops = vd->vdev_ops;
+	if (ops->vdev_op_metaslab_init != NULL)
+		ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
+
 	/*
 	 * We only open space map objects that already exist. All others
 	 * will be opened when we finally allocate an object for it.
@@ -5813,7 +5820,6 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
 			metaslab_group_alloc_increment(spa,
 			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
 		}
-
 	}
 	ASSERT(error == 0);
 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
@@ -307,8 +307,17 @@ mmp_next_leaf(spa_t *spa)
 		if (leaf == NULL)
 			leaf = list_head(&spa->spa_leaf_list);

-		if (!vdev_writeable(leaf)) {
+		/*
+		 * We skip unwritable, offline, detached, and dRAID spare
+		 * devices as they are either not legal targets or the write
+		 * may fail or not be seen by other hosts.  Skipped dRAID
+		 * spares can never be written so the fail mask is not set.
+		 */
+		if (!vdev_writeable(leaf) || leaf->vdev_offline ||
+		    leaf->vdev_detached) {
 			fail_mask |= MMP_FAIL_NOT_WRITABLE;
+		} else if (leaf->vdev_ops == &vdev_draid_spare_ops) {
+			continue;
 		} else if (leaf->vdev_mmp_pending != 0) {
 			fail_mask |= MMP_FAIL_WRITE_PENDING;
 		} else {
@@ -60,6 +60,7 @@
 #include <sys/vdev_rebuild.h>
 #include <sys/vdev_trim.h>
 #include <sys/vdev_disk.h>
+#include <sys/vdev_draid.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/mmp.h>
@@ -3681,7 +3682,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
 	/*
 	 * Build a new vdev tree from the trusted config
 	 */
-	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+	error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
+	if (error != 0) {
+		nvlist_free(mos_config);
+		spa_config_exit(spa, SCL_ALL, FTAG);
+		spa_load_failed(spa, "spa_config_parse failed [error=%d]",
+		    error);
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+	}

 	/*
 	 * Vdev paths in the MOS may be obsolete. If the untrusted config was
@@ -5631,7 +5639,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 	uint64_t txg = TXG_INITIAL;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
-	uint64_t version, obj;
+	uint64_t version, obj, ndraid = 0;
 	boolean_t has_features;
 	boolean_t has_encryption;
 	boolean_t has_allocclass;
@@ -5753,8 +5761,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,

 	if (error == 0 &&
 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
-	    (error = spa_validate_aux(spa, nvroot, txg,
-	    VDEV_ALLOC_ADD)) == 0) {
+	    (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
+	    (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
 		/*
 		 * instantiate the metaslab groups (this will dirty the vdevs)
 		 * we can no longer error exit past this point
@@ -5895,6 +5903,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 		spa_sync_props(props, tx);
 	}

+	for (int i = 0; i < ndraid; i++)
+		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
+
 	dmu_tx_commit(tx);

 	spa->spa_sync_on = B_TRUE;
@@ -6403,13 +6414,26 @@ spa_reset(const char *pool)
 * ==========================================================================
 */

+/*
+ * This is called as a synctask to increment the draid feature flag
+ */
+static void
+spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	int draid = (int)(uintptr_t)arg;
+
+	for (int c = 0; c < draid; c++)
+		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
+}
+
 /*
 * Add a device to a storage pool.
 */
 int
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
-	uint64_t txg;
+	uint64_t txg, ndraid = 0;
 	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *tvd;
@@ -6438,8 +6462,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 		return (spa_vdev_exit(spa, vd, txg, EINVAL));

 	if (vd->vdev_children != 0 &&
-	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
+	    (error = vdev_create(vd, txg, B_FALSE)) != 0) {
 		return (spa_vdev_exit(spa, vd, txg, error));
+	}
+
+	/*
+	 * The virtual dRAID spares must be added after vdev tree is created
+	 * and the vdev guids are generated.  The guid of their associated
+	 * dRAID is stored in the config and used when opening the spare.
+	 */
+	if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
+	    rvd->vdev_children)) == 0) {
+		if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
+		    ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
+			nspares = 0;
+	} else {
+		return (spa_vdev_exit(spa, vd, txg, error));
+	}

 	/*
 	 * We must validate the spares and l2cache devices after checking the
@@ -6452,7 +6491,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 	 * If we are in the middle of a device removal, we can only add
 	 * devices which match the existing devices in the pool.
 	 * If we are in the middle of a removal, or have some indirect
-	 * vdevs, we can not add raidz toplevels.
+	 * vdevs, we can not add raidz or dRAID top levels.
 	 */
 	if (spa->spa_vdev_removal != NULL ||
 	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
@@ -6462,10 +6501,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 			    tvd->vdev_ashift != spa->spa_max_ashift) {
 				return (spa_vdev_exit(spa, vd, txg, EINVAL));
 			}
-			/* Fail if top level vdev is raidz */
-			if (tvd->vdev_ops == &vdev_raidz_ops) {
+			/* Fail if top level vdev is raidz or a dRAID */
+			if (vdev_get_nparity(tvd) != 0)
 				return (spa_vdev_exit(spa, vd, txg, EINVAL));
-			}
+
 			/*
 			 * Need the top level mirror to be
 			 * a mirror of leaf vdevs only
@@ -6505,6 +6544,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}

+	/*
+	 * We can't increment a feature while holding spa_vdev so we
+	 * have to do it in a synctask.
+	 */
+	if (ndraid != 0) {
+		dmu_tx_t *tx;
+
+		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+		dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
+		    (void *)(uintptr_t)ndraid, tx);
+		dmu_tx_commit(tx);
+	}
+
 	/*
 	 * We have to be careful when adding new vdevs to an existing pool.
 	 * If other threads start allocating from these vdevs before we
@@ -6615,14 +6667,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
 	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

+	/*
+	 * A dRAID spare can only replace a child of its parent dRAID vdev.
+	 */
+	if (newvd->vdev_ops == &vdev_draid_spare_ops &&
+	    oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
+		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+	}
+
 	if (rebuild) {
 		/*
-		 * For rebuilds, the parent vdev must support reconstruction
+		 * For rebuilds, the top vdev must support reconstruction
 		 * using only space maps.  This means the only allowable
-		 * parents are the root vdev or a mirror vdev.
+		 * vdev types are the root vdev, a mirror, or dRAID.
 		 */
-		if (pvd->vdev_ops != &vdev_mirror_ops &&
-		    pvd->vdev_ops != &vdev_root_ops) {
+		tvd = pvd;
+		if (pvd->vdev_top != NULL)
+			tvd = pvd->vdev_top;
+
+		if (tvd->vdev_ops != &vdev_mirror_ops &&
+		    tvd->vdev_ops != &vdev_root_ops &&
+		    tvd->vdev_ops != &vdev_draid_ops) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		}
 	}
@@ -6915,14 +6980,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 	}

 	/*
-	 * If we are detaching the original disk from a spare, then it implies
-	 * that the spare should become a real disk, and be removed from the
-	 * active spare list for the pool.
+	 * If we are detaching the original disk from a normal spare, then it
+	 * implies that the spare should become a real disk, and be removed
+	 * from the active spare list for the pool.  dRAID spares on the
+	 * other hand are coupled to the pool and thus should never be removed
+	 * from the spares list.
 	 */
-	if (pvd->vdev_ops == &vdev_spare_ops &&
-	    vd->vdev_id == 0 &&
-	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
-		unspare = B_TRUE;
+	if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
+		vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
+
+		if (last_cvd->vdev_isspare &&
+		    last_cvd->vdev_ops != &vdev_draid_spare_ops) {
+			unspare = B_TRUE;
+		}
+	}

 	/*
 	 * Erase the disk labels so the disk can be used for other things.
@@ -8013,18 +8084,9 @@ spa_async_thread(void *arg)
 	/*
 	 * If any devices are done replacing, detach them.
 	 */
-	if (tasks & SPA_ASYNC_RESILVER_DONE)
+	if (tasks & SPA_ASYNC_RESILVER_DONE ||
+	    tasks & SPA_ASYNC_REBUILD_DONE) {
 		spa_vdev_resilver_done(spa);
-
-	/*
-	 * If any devices are done replacing, detach them.  Then if no
-	 * top-level vdevs are rebuilding attempt to kick off a scrub.
-	 */
-	if (tasks & SPA_ASYNC_REBUILD_DONE) {
-		spa_vdev_resilver_done(spa);
-
-		if (!vdev_rebuild_active(spa->spa_root_vdev))
-			(void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB);
 	}

 	/*
@@ -741,6 +741,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)

 	spa->spa_min_ashift = INT_MAX;
 	spa->spa_max_ashift = 0;
+	spa->spa_min_alloc = INT_MAX;

 	/* Reset cached value */
 	spa->spa_dedup_dspace = ~0ULL;
@@ -40,6 +40,7 @@
 #include <sys/dsl_dir.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_rebuild.h>
+#include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
@@ -51,6 +52,7 @@
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
+#include <sys/vdev_raidz.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
@@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
 static vdev_ops_t *vdev_ops_table[] = {
 	&vdev_root_ops,
 	&vdev_raidz_ops,
+	&vdev_draid_ops,
+	&vdev_draid_spare_ops,
 	&vdev_mirror_ops,
 	&vdev_replacing_ops,
 	&vdev_spare_ops,
@@ -221,10 +225,11 @@ vdev_getops(const char *type)

 /* ARGSUSED */
 void
-vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
+vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
-	res->rs_start = in->rs_start;
-	res->rs_end = in->rs_end;
+	physical_rs->rs_start = logical_rs->rs_start;
+	physical_rs->rs_end = logical_rs->rs_end;
 }

 /*
@@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
 	return (asize);
 }

+uint64_t
+vdev_default_min_asize(vdev_t *vd)
+{
+	return (vd->vdev_min_asize);
+}
+
 /*
 * Get the minimum allocatable size. We define the allocatable size as
 * the vdev's asize rounded to the nearest metaslab. This allows us to
@@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd)
 	if (vd == vd->vdev_top)
 		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));

-	/*
-	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
-	 * so each child must provide at least 1/Nth of its asize.
-	 */
-	if (pvd->vdev_ops == &vdev_raidz_ops)
-		return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
-		    pvd->vdev_children);
-
-	return (pvd->vdev_min_asize);
+	return (pvd->vdev_ops->vdev_op_min_asize(pvd));
 }

 void
@@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd)
 		vdev_set_min_asize(vd->vdev_child[c]);
 }

+/*
+ * Get the minimal allocation size for the top-level vdev.
+ */
+uint64_t
+vdev_get_min_alloc(vdev_t *vd)
+{
+	uint64_t min_alloc = 1ULL << vd->vdev_ashift;
+
+	if (vd->vdev_ops->vdev_op_min_alloc != NULL)
+		min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
+
+	return (min_alloc);
+}
+
+/*
+ * Get the parity level for a top-level vdev.
+ */
+uint64_t
+vdev_get_nparity(vdev_t *vd)
+{
+	uint64_t nparity = 0;
+
+	if (vd->vdev_ops->vdev_op_nparity != NULL)
+		nparity = vd->vdev_ops->vdev_op_nparity(vd);
+
+	return (nparity);
+}
+
+/*
+ * Get the number of data disks for a top-level vdev.
+ */
+uint64_t
+vdev_get_ndisks(vdev_t *vd)
+{
+	uint64_t ndisks = 1;
+
+	if (vd->vdev_ops->vdev_op_ndisks != NULL)
+		ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
+
+	return (ndisks);
+}
+
 vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
@@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	list_link_init(&vd->vdev_initialize_node);
 	list_link_init(&vd->vdev_leaf_node);
 	list_link_init(&vd->vdev_trim_node);
+
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);

 	mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);

 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
@@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 {
 	vdev_ops_t *ops;
 	char *type;
-	uint64_t guid = 0, islog, nparity;
+	uint64_t guid = 0, islog;
 	vdev_t *vd;
 	vdev_indirect_config_t *vic;
 	char *tmp = NULL;
@@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
 		return (SET_ERROR(ENOTSUP));

-	/*
-	 * Set the nparity property for RAID-Z vdevs.
-	 */
-	nparity = -1ULL;
-	if (ops == &vdev_raidz_ops) {
-		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
-		    &nparity) == 0) {
-			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
-				return (SET_ERROR(EINVAL));
-			/*
-			 * Previous versions could only support 1 or 2 parity
-			 * device.
-			 */
-			if (nparity > 1 &&
-			    spa_version(spa) < SPA_VERSION_RAIDZ2)
-				return (SET_ERROR(ENOTSUP));
-			if (nparity > 2 &&
-			    spa_version(spa) < SPA_VERSION_RAIDZ3)
-				return (SET_ERROR(ENOTSUP));
-		} else {
-			/*
-			 * We require the parity to be specified for SPAs that
-			 * support multiple parity levels.
-			 */
-			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
-				return (SET_ERROR(EINVAL));
-			/*
-			 * Otherwise, we default to 1 parity device for RAID-Z.
-			 */
-			nparity = 1;
-		}
-	} else {
-		nparity = 0;
-	}
-	ASSERT(nparity != -1ULL);
-
-	/*
-	 * If creating a top-level vdev, check for allocation classes input
-	 */
 	if (top_level && alloctype == VDEV_ALLOC_ADD) {
 		char *bias;

+		/*
+		 * If creating a top-level vdev, check for allocation
+		 * classes input.
+		 */
 		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
 		    &bias) == 0) {
 			alloc_bias = vdev_derive_alloc_bias(bias);
@@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 				return (SET_ERROR(ENOTSUP));
 			}
 		}
+
+		/* spa_vdev_add() expects feature to be enabled */
+		if (ops == &vdev_draid_ops &&
+		    spa->spa_load_state != SPA_LOAD_CREATE &&
+		    !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
+			return (SET_ERROR(ENOTSUP));
+		}
+	}
+
+	/*
+	 * Initialize the vdev specific data.  This is done before calling
+	 * vdev_alloc_common() since it may fail and this simplifies the
+	 * error reporting and cleanup code paths.
+	 */
+	void *tsd = NULL;
+	if (ops->vdev_op_init != NULL) {
+		rc = ops->vdev_op_init(spa, nv, &tsd);
+		if (rc != 0) {
+			return (rc);
+		}
 	}

 	vd = vdev_alloc_common(spa, id, guid, ops);
-	vic = &vd->vdev_indirect_config;
-
+	vd->vdev_tsd = tsd;
 	vd->vdev_islog = islog;
-	vd->vdev_nparity = nparity;
+
 	if (top_level && alloc_bias != VDEV_BIAS_NONE)
 		vd->vdev_alloc_bias = alloc_bias;

@@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 	    &vd->vdev_wholedisk) != 0)
 		vd->vdev_wholedisk = -1ULL;

+	vic = &vd->vdev_indirect_config;
+
 	ASSERT0(vic->vic_mapping_object);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
 	    &vic->vic_mapping_object);
@@ -937,6 +967,9 @@ vdev_free(vdev_t *vd)
 	ASSERT(vd->vdev_child == NULL);
 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

+	if (vd->vdev_ops->vdev_op_fini != NULL)
+		vd->vdev_ops->vdev_op_fini(vd);
+
 	/*
 	 * Discard allocation state.
 	 */
@@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd)
 	cv_destroy(&vd->vdev_trim_io_cv);

 	mutex_destroy(&vd->vdev_rebuild_lock);
-	mutex_destroy(&vd->vdev_rebuild_io_lock);
 	cv_destroy(&vd->vdev_rebuild_cv);
-	cv_destroy(&vd->vdev_rebuild_io_cv);

 	zfs_ratelimit_fini(&vd->vdev_delay_rl);
 	zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd)
 }

 /*
- * Add a mirror/replacing vdev above an existing vdev.
+ * Add a mirror/replacing vdev above an existing vdev.  There is no need to
+ * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
 */
 vdev_t *
 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
@@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd)
 				spa->spa_max_ashift = vd->vdev_ashift;
 			if (vd->vdev_ashift < spa->spa_min_ashift)
 				spa->spa_min_ashift = vd->vdev_ashift;
+
+			uint64_t min_alloc = vdev_get_min_alloc(vd);
+			if (min_alloc < spa->spa_min_alloc)
+				spa->spa_min_alloc = min_alloc;
 		}
 	}
 }
@@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd)
 	return (B_FALSE);
 }

+/*
+ * Returns B_TRUE if the passed child should be opened.
+ */
+static boolean_t
+vdev_default_open_children_func(vdev_t *vd)
+{
+	return (B_TRUE);
+}
+
+/*
+ * Open the requested child vdevs.  If any of the leaf vdevs are using
+ * a ZFS volume then do the opens in a single thread.  This avoids a
+ * deadlock when the current thread is holding the spa_namespace_lock.
+ */
+static void
+vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+	int children = vd->vdev_children;
+
+	taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
+	    children, children, TASKQ_PREPOPULATE);
+	vd->vdev_nonrot = B_TRUE;
+
+	for (int c = 0; c < children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (open_func(cvd) == B_FALSE)
+			continue;
+
+		if (tq == NULL || vdev_uses_zvols(vd)) {
+			cvd->vdev_open_error = vdev_open(cvd);
+		} else {
+			VERIFY(taskq_dispatch(tq, vdev_open_child,
+			    cvd, TQ_SLEEP) != TASKQID_INVALID);
+		}
+
+		vd->vdev_nonrot &= cvd->vdev_nonrot;
+	}
+
+	if (tq != NULL) {
+		taskq_wait(tq);
+		taskq_destroy(tq);
+	}
+}
+
+/*
+ * Open all child vdevs.
+ */
 void
 vdev_open_children(vdev_t *vd)
 {
-	taskq_t *tq;
-	int children = vd->vdev_children;
+	vdev_open_children_impl(vd, vdev_default_open_children_func);
+}

-	/*
-	 * in order to handle pools on top of zvols, do the opens
-	 * in a single thread so that the same thread holds the
-	 * spa_namespace_lock
-	 */
-	if (vdev_uses_zvols(vd)) {
-retry_sync:
-		for (int c = 0; c < children; c++)
-			vd->vdev_child[c]->vdev_open_error =
-			    vdev_open(vd->vdev_child[c]);
-	} else {
-		tq = taskq_create("vdev_open", children, minclsyspri,
-		    children, children, TASKQ_PREPOPULATE);
-		if (tq == NULL)
-			goto retry_sync;
-
-		for (int c = 0; c < children; c++)
-			VERIFY(taskq_dispatch(tq, vdev_open_child,
-			    vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
-
-		taskq_destroy(tq);
-	}
-
-	vd->vdev_nonrot = B_TRUE;
-
-	for (int c = 0; c < children; c++)
-		vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
+/*
+ * Conditionally open a subset of child vdevs.
+ */
+void
+vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+	vdev_open_children_impl(vd, open_func);
 }

 /*
@@ -1952,6 +2016,16 @@ vdev_open(vdev_t *vd)
 		return (error);
 	}

+	/*
+	 * Track the minimum allocation size.
+	 */
+	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+	    vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
+		uint64_t min_alloc = vdev_get_min_alloc(vd);
+		if (min_alloc < spa->spa_min_alloc)
+			spa->spa_min_alloc = min_alloc;
+	}
+
 	/*
 	 * If this is a leaf vdev, assess whether a resilver is needed.
 	 * But don't do this if we are doing a reopen for a scrub, since
@@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd)
 	vdev_t *pvd = vd->vdev_parent;
 	spa_t *spa __maybe_unused = vd->vdev_spa;

-	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+	ASSERT(vd != NULL);
+	ASSERT(vd->vdev_open_thread == curthread ||
+	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

 	/*
 	 * If our parent is reopening, then we are as well, unless we are
@@ -2606,10 +2682,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 }

 /*
- * Returns B_TRUE if vdev determines offset needs to be resilvered.
+ * Check if the txg falls within the range which must be
+ * resilvered.  DVAs outside this range can always be skipped.
 */
 boolean_t
-vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
+{
+	/* Set by sequential resilver. */
+	if (phys_birth == TXG_UNKNOWN)
+		return (B_TRUE);
+
+	return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
+}
+
+/*
+ * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
+ */
+boolean_t
+vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
 {
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

@@ -2617,7 +2709,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
 	    vd->vdev_ops->vdev_op_leaf)
 		return (B_TRUE);

-	return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
+	return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
+	    phys_birth));
 }

 /*
@@ -2862,8 +2955,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
 			continue;			/* leaf vdevs only */
 		if (t == DTL_PARTIAL)
 			minref = 1;			/* i.e. non-zero */
-		else if (vd->vdev_nparity != 0)
-			minref = vd->vdev_nparity + 1;	/* RAID-Z */
+		else if (vdev_get_nparity(vd) != 0)
+			minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
 		else
 			minref = vd->vdev_children;	/* any kind of mirror */
 		space_reftree_create(&reftree);
@@ -3727,6 +3820,9 @@ top:
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));

+	if (vd->vdev_ops == &vdev_draid_spare_ops)
+		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
 	tvd = vd->vdev_top;
 	mg = tvd->vdev_mg;
 	generation = spa->spa_config_generation + 1;
@@ -3971,6 +4067,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 static void
 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
 {
+	/*
+	 * Exclude the dRAID spare when aggregating to avoid double counting
+	 * the ops and bytes.  These IOs are counted by the physical leaves.
+	 */
+	if (cvd->vdev_ops == &vdev_draid_spare_ops)
+		return;
+
 	for (int t = 0; t < VS_ZIO_TYPES; t++) {
 		vs->vs_ops[t] += cvs->vs_ops[t];
 		vs->vs_bytes[t] += cvs->vs_bytes[t];
@@ -4063,7 +4166,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 				vdev_get_child_stat(cvd, vs, cvs);
 			if (vsx)
 				vdev_get_child_stat_ex(cvd, vsx, cvsx);
-
 		}
 	} else {
 		/*
@@ -4248,7 +4350,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)

 			/*
 			 * Repair is the result of a rebuild issued by the
-			 * rebuild thread (vdev_rebuild_thread).
+			 * rebuild thread (vdev_rebuild_thread).  To avoid
+			 * double counting repaired bytes the virtual dRAID
+			 * spare vdev is excluded from the processed bytes.
 			 */
 			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
 				vdev_t *tvd = vd->vdev_top;
@@ -4256,8 +4360,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 				vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 				uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;

-				if (vd->vdev_ops->vdev_op_leaf)
+				if (vd->vdev_ops->vdev_op_leaf &&
+				    vd->vdev_ops != &vdev_draid_spare_ops) {
 					atomic_add_64(rebuilt, psize);
+				}
 				vs->vs_rebuild_processed += psize;
 			}

@@ -4981,31 +5087,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
 	    vdev_resilver_needed(vd, NULL, NULL));
 }

+boolean_t
+vdev_xlate_is_empty(range_seg64_t *rs)
+{
+	return (rs->rs_start == rs->rs_end);
+}
+
 /*
- * Translate a logical range to the physical range for the specified vdev_t.
- * This function is initially called with a leaf vdev and will walk each
- * parent vdev until it reaches a top-level vdev. Once the top-level is
- * reached the physical range is initialized and the recursive function
- * begins to unwind. As it unwinds it calls the parent's vdev specific
- * translation function to do the real conversion.
+ * Translate a logical range to the first contiguous physical range for the
+ * specified vdev_t.  This function is initially called with a leaf vdev and
+ * will walk each parent vdev until it reaches a top-level vdev. Once the
+ * top-level is reached the physical range is initialized and the recursive
+ * function begins to unwind. As it unwinds it calls the parent's vdev
+ * specific translation function to do the real conversion.
 */
 void
 vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
-    range_seg64_t *physical_rs)
+    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
 	/*
 	 * Walk up the vdev tree
 	 */
 	if (vd != vd->vdev_top) {
-		vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+		vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
+		    remain_rs);
 	} else {
 		/*
-		 * We've reached the top-level vdev, initialize the
-		 * physical range to the logical range and start to
-		 * unwind.
+		 * We've reached the top-level vdev, initialize the physical
+		 * range to the logical range and set an empty remaining
+		 * range then start to unwind.
 		 */
 		physical_rs->rs_start = logical_rs->rs_start;
 		physical_rs->rs_end = logical_rs->rs_end;
+
+		remain_rs->rs_start = logical_rs->rs_start;
+		remain_rs->rs_end = logical_rs->rs_start;
+
 		return;
 	}

@@ -5015,16 +5132,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,

 	/*
 	 * As this recursive function unwinds, translate the logical
-	 * range into its physical components by calling the
-	 * vdev specific translate function.
+	 * range into its physical and any remaining components by calling
+	 * the vdev specific translate function.
 	 */
 	range_seg64_t intermediate = { 0 };
-	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);

 	physical_rs->rs_start = intermediate.rs_start;
 	physical_rs->rs_end = intermediate.rs_end;
 }

+void
+vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
+    vdev_xlate_func_t *func, void *arg)
+{
+	range_seg64_t iter_rs = *logical_rs;
+	range_seg64_t physical_rs;
+	range_seg64_t remain_rs;
+
+	while (!vdev_xlate_is_empty(&iter_rs)) {
+
+		vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
+
+		/*
+		 * With raidz and dRAID, it's possible that the logical range
+		 * does not live on this leaf vdev. Only when there is a non-
+		 * zero physical size call the provided function.
+		 */
+		if (!vdev_xlate_is_empty(&physical_rs))
+			func(arg, &physical_rs);
+
+		iter_rs = remain_rs;
+	}
+}
+
 /*
 * Look at the vdev tree and determine whether any devices are currently being
 * replaced.
@@ -0,0 +1,40 @@
+/*
+ * Xorshift Pseudo Random Number Generator based on work by David Blackman
+ * and Sebastiano Vigna (vigna@acm.org).
+ *
+ *   "Further scramblings of Marsaglia's xorshift generators"
+ *   http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
+ *   http://prng.di.unimi.it/xoroshiro128plusplus.c
+ *
+ * To the extent possible under law, the author has dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * See <http://creativecommons.org/publicdomain/zero/1.0/>.
+ *
+ * This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid,
+ * small-state generators. It is extremely (sub-ns) fast and it passes all
+ * tests we are aware of, but its state space is large enough only for
+ * mild parallelism.
+ */
+
+#include <sys/vdev_draid.h>
+
+static inline uint64_t rotl(const uint64_t x, int k)
+{
+	return (x << k) | (x >> (64 - k));
+}
+
+uint64_t
+vdev_draid_rand(uint64_t *s)
+{
+	const uint64_t s0 = s[0];
+	uint64_t s1 = s[1];
+	const uint64_t result = rotl(s0 + s1, 17) + s0;
+
+	s1 ^= s0;
+	s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b
+	s[1] = rotl(s1, 28); // c
+
+	return (result);
+}
@@ -1844,9 +1844,13 @@ vdev_indirect_io_done(zio_t *zio)
 }

 vdev_ops_t vdev_indirect_ops = {
+	.vdev_op_init = NULL,
+	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_indirect_open,
 	.vdev_op_close = vdev_indirect_close,
 	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_min_asize = vdev_default_min_asize,
+	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_indirect_io_start,
 	.vdev_op_io_done = vdev_indirect_io_done,
 	.vdev_op_state_change = NULL,
@@ -1855,6 +1859,11 @@ vdev_ops_t vdev_indirect_ops = {
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = vdev_indirect_remap,
 	.vdev_op_xlate = NULL,
+	.vdev_op_rebuild_asize = NULL,
+	.vdev_op_metaslab_init = NULL,
+	.vdev_op_config_generate = NULL,
+	.vdev_op_nparity = NULL,
+	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_INDIRECT,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* leaf vdev */
 };
@@ -121,6 +121,8 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
 	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
 		vd->vdev_initialize_action_time = gethrestime_sec();
 	}
+
+	vdev_initializing_state_t old_state = vd->vdev_initialize_state;
 	vd->vdev_initialize_state = new_state;

 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
@@ -138,8 +140,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
 		    "vdev=%s suspended", vd->vdev_path);
 		break;
 	case VDEV_INITIALIZE_CANCELED:
-		spa_history_log_internal(spa, "initialize", tx,
-		    "vdev=%s canceled", vd->vdev_path);
+		if (old_state == VDEV_INITIALIZE_ACTIVE ||
+		    old_state == VDEV_INITIALIZE_SUSPENDED)
+			spa_history_log_internal(spa, "initialize", tx,
+			    "vdev=%s canceled", vd->vdev_path);
 		break;
 	case VDEV_INITIALIZE_COMPLETE:
 		spa_history_log_internal(spa, "initialize", tx,
@@ -317,6 +321,32 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
 	return (0);
 }

+static void
+vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
+{
+	uint64_t *last_rs_end = (uint64_t *)arg;
+
+	if (physical_rs->rs_end > *last_rs_end)
+		*last_rs_end = physical_rs->rs_end;
+}
+
+static void
+vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
+{
+	vdev_t *vd = (vdev_t *)arg;
+
+	uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
+	vd->vdev_initialize_bytes_est += size;
+
+	if (vd->vdev_initialize_last_offset > physical_rs->rs_end) {
+		vd->vdev_initialize_bytes_done += size;
+	} else if (vd->vdev_initialize_last_offset > physical_rs->rs_start &&
+	    vd->vdev_initialize_last_offset < physical_rs->rs_end) {
+		vd->vdev_initialize_bytes_done +=
+		    vd->vdev_initialize_last_offset - physical_rs->rs_start;
+	}
+}
+
 static void
 vdev_initialize_calculate_progress(vdev_t *vd)
 {
@@ -331,28 +361,35 @@ vdev_initialize_calculate_progress(vdev_t *vd)
 		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
 		mutex_enter(&msp->ms_lock);

-		uint64_t ms_free = msp->ms_size -
-		    metaslab_allocated_space(msp);
-
-		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
-			ms_free /= vd->vdev_top->vdev_children;
+		uint64_t ms_free = (msp->ms_size -
+		    metaslab_allocated_space(msp)) /
+		    vdev_get_ndisks(vd->vdev_top);

 		/*
 		 * Convert the metaslab range to a physical range
 		 * on our vdev. We use this to determine if we are
 		 * in the middle of this metaslab range.
 		 */
-		range_seg64_t logical_rs, physical_rs;
+		range_seg64_t logical_rs, physical_rs, remain_rs;
 		logical_rs.rs_start = msp->ms_start;
 		logical_rs.rs_end = msp->ms_start + msp->ms_size;
-		vdev_xlate(vd, &logical_rs, &physical_rs);

+		/* Metaslab space after this offset has not been initialized */
+		vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
 		if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
 			vd->vdev_initialize_bytes_est += ms_free;
 			mutex_exit(&msp->ms_lock);
 			continue;
-		} else if (vd->vdev_initialize_last_offset >
-		    physical_rs.rs_end) {
+		}
+
+		/* Metaslab space before this offset has been initialized */
+		uint64_t last_rs_end = physical_rs.rs_end;
+		if (!vdev_xlate_is_empty(&remain_rs)) {
+			vdev_xlate_walk(vd, &remain_rs,
+			    vdev_initialize_xlate_last_rs_end, &last_rs_end);
+		}
+
+		if (vd->vdev_initialize_last_offset > last_rs_end) {
 			vd->vdev_initialize_bytes_done += ms_free;
 			vd->vdev_initialize_bytes_est += ms_free;
 			mutex_exit(&msp->ms_lock);
@@ -374,22 +411,9 @@ vdev_initialize_calculate_progress(vdev_t *vd)
 		    &where)) {
 			logical_rs.rs_start = rs_get_start(rs, rt);
 			logical_rs.rs_end = rs_get_end(rs, rt);
-			vdev_xlate(vd, &logical_rs, &physical_rs);

-			uint64_t size = physical_rs.rs_end -
-			    physical_rs.rs_start;
-			vd->vdev_initialize_bytes_est += size;
-			if (vd->vdev_initialize_last_offset >
-			    physical_rs.rs_end) {
-				vd->vdev_initialize_bytes_done += size;
-			} else if (vd->vdev_initialize_last_offset >
-			    physical_rs.rs_start &&
-			    vd->vdev_initialize_last_offset <
-			    physical_rs.rs_end) {
-				vd->vdev_initialize_bytes_done +=
-				    vd->vdev_initialize_last_offset -
-				    physical_rs.rs_start;
-			}
+			vdev_xlate_walk(vd, &logical_rs,
+			    vdev_initialize_xlate_progress, vd);
 		}
 		mutex_exit(&msp->ms_lock);
 	}
@@ -419,6 +443,34 @@ vdev_initialize_load(vdev_t *vd)
 	return (err);
 }

+static void
+vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
+{
+	vdev_t *vd = arg;
+
+	/* Only add segments that we have not visited yet */
+	if (physical_rs->rs_end <= vd->vdev_initialize_last_offset)
+		return;
+
+	/* Pick up where we left off mid-range. */
+	if (vd->vdev_initialize_last_offset > physical_rs->rs_start) {
+		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
+		    "(%llu, %llu)", vd->vdev_path,
+		    (u_longlong_t)physical_rs->rs_start,
+		    (u_longlong_t)physical_rs->rs_end,
+		    (u_longlong_t)vd->vdev_initialize_last_offset,
+		    (u_longlong_t)physical_rs->rs_end);
+		ASSERT3U(physical_rs->rs_end, >,
+		    vd->vdev_initialize_last_offset);
+		physical_rs->rs_start = vd->vdev_initialize_last_offset;
+	}
+
+	ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
+
+	range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
+	    physical_rs->rs_end - physical_rs->rs_start);
+}
+
 /*
 * Convert the logical range into a physical range and add it to our
 * avl tree.
@@ -427,47 +479,12 @@ static void
 vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
 {
 	vdev_t *vd = arg;
-	range_seg64_t logical_rs, physical_rs;
+	range_seg64_t logical_rs;
 	logical_rs.rs_start = start;
 	logical_rs.rs_end = start + size;

 	ASSERT(vd->vdev_ops->vdev_op_leaf);
-	vdev_xlate(vd, &logical_rs, &physical_rs);
-
-	IMPLY(vd->vdev_top == vd,
-	    logical_rs.rs_start == physical_rs.rs_start);
-	IMPLY(vd->vdev_top == vd,
-	    logical_rs.rs_end == physical_rs.rs_end);
-
-	/* Only add segments that we have not visited yet */
-	if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
-		return;
-
-	/* Pick up where we left off mid-range. */
-	if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
-		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
-		    "(%llu, %llu)", vd->vdev_path,
-		    (u_longlong_t)physical_rs.rs_start,
-		    (u_longlong_t)physical_rs.rs_end,
-		    (u_longlong_t)vd->vdev_initialize_last_offset,
-		    (u_longlong_t)physical_rs.rs_end);
-		ASSERT3U(physical_rs.rs_end, >,
-		    vd->vdev_initialize_last_offset);
-		physical_rs.rs_start = vd->vdev_initialize_last_offset;
-	}
-	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
-
-	/*
-	 * With raidz, it's possible that the logical range does not live on
-	 * this leaf vdev. We only add the physical range to this vdev's if it
-	 * has a length greater than 0.
-	 */
-	if (physical_rs.rs_end > physical_rs.rs_start) {
-		range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
-		    physical_rs.rs_end - physical_rs.rs_start);
-	} else {
-		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
-	}
+	vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
 }

 static void
@@ -142,6 +142,7 @@
 #include <sys/zap.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
@@ -453,31 +454,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 	if (vd->vdev_fru != NULL)
 		fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);

-	if (vd->vdev_nparity != 0) {
-		ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
-		    VDEV_TYPE_RAIDZ) == 0);
+	if (vd->vdev_ops->vdev_op_config_generate != NULL)
+		vd->vdev_ops->vdev_op_config_generate(vd, nv);

-		/*
-		 * Make sure someone hasn't managed to sneak a fancy new vdev
-		 * into a crufty old storage pool.
-		 */
-		ASSERT(vd->vdev_nparity == 1 ||
-		    (vd->vdev_nparity <= 2 &&
-		    spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
-		    (vd->vdev_nparity <= 3 &&
-		    spa_version(spa) >= SPA_VERSION_RAIDZ3));
-
-		/*
-		 * Note that we'll add the nparity tag even on storage pools
-		 * that only support a single parity device -- older software
-		 * will just ignore it.
-		 */
-		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
-	}
-
-	if (vd->vdev_wholedisk != -1ULL)
+	if (vd->vdev_wholedisk != -1ULL) {
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 		    vd->vdev_wholedisk);
+	}

 	if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
@@ -785,6 +768,14 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
 	if (!vdev_readable(vd))
 		return (NULL);

+	/*
+	 * The label for a dRAID distributed spare is not stored on disk.
+	 * Instead it is generated when needed which allows us to bypass
+	 * the pipeline when reading the config from the label.
+	 */
+	if (vd->vdev_ops == &vdev_draid_spare_ops)
+		return (vdev_draid_read_config_spare(vd));
+
 	vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
 	vp = abd_to_buf(vp_abd);

@@ -1497,7 +1488,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);

-	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
+	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) &&
+	    vd->vdev_ops != &vdev_draid_spare_ops) {
 		for (int l = 0; l < VDEV_LABELS; l++) {
 			for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
 				vdev_label_read(zio, vd, l,
@@ -1586,6 +1578,13 @@ vdev_copy_uberblocks(vdev_t *vd)
 	    SCL_STATE);
 	ASSERT(vd->vdev_ops->vdev_op_leaf);

+	/*
+	 * No uberblocks are stored on distributed spares, so they may be
+	 * safely skipped when expanding a leaf vdev.
+	 */
+	if (vd->vdev_ops == &vdev_draid_spare_ops)
+		return;
+
 	spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER);

 	ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
@@ -1647,6 +1646,15 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
 	if (!vdev_writeable(vd))
 		return;

+	/*
+	 * There's no need to write uberblocks to a distributed spare; they
+	 * are already stored on all the leaves of the parent dRAID.  For
+	 * this same reason vdev_uberblock_load_impl() skips distributed
+	 * spares when reading uberblocks.
+	 */
+	if (vd->vdev_ops == &vdev_draid_spare_ops)
+		return;
+
 	/* If the vdev was expanded, need to copy uberblock rings. */
 	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
 	    vd->vdev_copy_uberblocks == B_TRUE) {
@@ -1763,6 +1771,14 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes,
 	if (!vdev_writeable(vd))
 		return;

+	/*
+	 * The top-level config never needs to be written to a distributed
+	 * spare.  When read, vdev_draid_read_config_spare() will generate
+	 * the config for vdev_label_read_config().
+	 */
+	if (vd->vdev_ops == &vdev_draid_spare_ops)
+		return;
+
 	/*
 	 * Generate a label describing the top-level config to which we belong.
 	 */
@@ -33,6 +33,7 @@
 #include <sys/dsl_pool.h>
 #include <sys/dsl_scan.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
 #include <sys/zio.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
@@ -99,7 +100,6 @@ vdev_mirror_stat_fini(void)
 /*
 * Virtual device vector for mirroring.
 */
-
 typedef struct mirror_child {
 	vdev_t		*mc_vd;
 	uint64_t	mc_offset;
@@ -108,6 +108,7 @@ typedef struct mirror_child {
 	uint8_t		mc_tried;
 	uint8_t		mc_skipped;
 	uint8_t		mc_speculative;
+	uint8_t		mc_rebuilding;
 } mirror_child_t;

 typedef struct mirror_map {
@@ -115,6 +116,7 @@ typedef struct mirror_map {
 	int		mm_preferred_cnt;
 	int		mm_children;
 	boolean_t	mm_resilvering;
+	boolean_t	mm_rebuilding;
 	boolean_t	mm_root;
 	mirror_child_t	mm_child[];
 } mirror_map_t;
@@ -239,6 +241,21 @@ vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
 	return (load + zfs_vdev_mirror_rotating_seek_inc);
 }

+static boolean_t
+vdev_mirror_rebuilding(vdev_t *vd)
+{
+	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
+		return (B_TRUE);
+
+	for (int i = 0; i < vd->vdev_children; i++) {
+		if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
+			return (B_TRUE);
+		}
+	}
+
+	return (B_FALSE);
+}
+
 /*
 * Avoid inlining the function to keep vdev_mirror_io_start(), which
 * is this functions only caller, as small as possible on the stack.
@@ -356,6 +373,9 @@ vdev_mirror_map_init(zio_t *zio)
 			mc = &mm->mm_child[c];
 			mc->mc_vd = vd->vdev_child[c];
 			mc->mc_offset = zio->io_offset;
+
+			if (vdev_mirror_rebuilding(mc->mc_vd))
+				mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
 		}
 	}

@@ -493,12 +513,37 @@ vdev_mirror_preferred_child_randomize(zio_t *zio)
 	return (mm->mm_preferred[p]);
 }

+static boolean_t
+vdev_mirror_child_readable(mirror_child_t *mc)
+{
+	vdev_t *vd = mc->mc_vd;
+
+	if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
+		return (vdev_draid_readable(vd, mc->mc_offset));
+	else
+		return (vdev_readable(vd));
+}
+
+static boolean_t
+vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
+{
+	vdev_t *vd = mc->mc_vd;
+
+	if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
+		return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
+	else
+		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+}
+
 /*
 * Try to find a vdev whose DTL doesn't contain the block we want to read
- * preferring vdevs based on determined load.
+ * preferring vdevs based on determined load. If we can't, try the read on
+ * any vdev we haven't already tried.
 *
- * Try to find a child whose DTL doesn't contain the block we want to read.
- * If we can't, try the read on any vdev we haven't already tried.
+ * Distributed spares are an exception to the above load rule. They are
+ * always preferred in order to detect gaps in the distributed spare which
+ * are created when another disk in the dRAID fails. In order to restore
+ * redundancy those gaps must be read to trigger the required repair IO.
 */
 static int
 vdev_mirror_child_select(zio_t *zio)
@@ -518,20 +563,27 @@ vdev_mirror_child_select(zio_t *zio)
 		if (mc->mc_tried || mc->mc_skipped)
 			continue;

-		if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
+		if (mc->mc_vd == NULL ||
+		    !vdev_mirror_child_readable(mc)) {
 			mc->mc_error = SET_ERROR(ENXIO);
 			mc->mc_tried = 1;	/* don't even try */
 			mc->mc_skipped = 1;
 			continue;
 		}

-		if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+		if (vdev_mirror_child_missing(mc, txg, 1)) {
 			mc->mc_error = SET_ERROR(ESTALE);
 			mc->mc_skipped = 1;
 			mc->mc_speculative = 1;
 			continue;
 		}

+		if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
+			mm->mm_preferred[0] = c;
+			mm->mm_preferred_cnt = 1;
+			break;
+		}
+
 		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
 		if (mc->mc_load > lowest_load)
 			continue;
@@ -625,11 +677,25 @@ vdev_mirror_io_start(zio_t *zio)

 	while (children--) {
 		mc = &mm->mm_child[c];
+		c++;
+
+		/*
+		 * When sequentially resilvering only issue write repair
+		 * IOs to the vdev which is being rebuilt since performance
+		 * is limited by the slowest child.  This is an issue for
+		 * faster replacement devices such as distributed spares.
+		 */
+		if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
+		    (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+		    !(zio->io_flags & ZIO_FLAG_SCRUB) &&
+		    mm->mm_rebuilding && !mc->mc_rebuilding) {
+			continue;
+		}
+
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
 		    zio->io_type, zio->io_priority, 0,
 		    vdev_mirror_child_done, mc));
-		c++;
 	}

 	zio_execute(zio);
@@ -744,6 +810,8 @@ vdev_mirror_io_done(zio_t *zio)
 			mc = &mm->mm_child[c];

 			if (mc->mc_error == 0) {
+				vdev_ops_t *ops = mc->mc_vd->vdev_ops;
+
 				if (mc->mc_tried)
 					continue;
 				/*
@@ -752,15 +820,16 @@ vdev_mirror_io_done(zio_t *zio)
 				 * 1. it's a scrub (in which case we have
 				 * tried everything that was healthy)
 				 *  - or -
-				 * 2. it's an indirect vdev (in which case
-				 * it could point to any other vdev, which
-				 * might have a bad DTL)
+				 * 2. it's an indirect or distributed spare
+				 * vdev (in which case it could point to any
+				 * other vdev, which might have a bad DTL)
 				 *  - or -
 				 * 3. the DTL indicates that this data is
 				 * missing from this vdev
 				 */
 				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
-				    mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
+				    ops != &vdev_indirect_ops &&
+				    ops != &vdev_draid_spare_ops &&
 				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
 				    zio->io_txg, 1))
 					continue;
@@ -796,50 +865,90 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
 	}
 }

+/*
+ * Return the maximum asize for a rebuild zio in the provided range.
+ */
+static uint64_t
+vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
+    uint64_t max_segment)
+{
+	uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
+	    SPA_MAXBLOCKSIZE);
+
+	return (MIN(asize, vdev_psize_to_asize(vd, psize)));
+}
+
 vdev_ops_t vdev_mirror_ops = {
+	.vdev_op_init = NULL,
+	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
 	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_min_asize = vdev_default_min_asize,
+	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
 	.vdev_op_io_done = vdev_mirror_io_done,
 	.vdev_op_state_change = vdev_mirror_state_change,
-	.vdev_op_need_resilver = NULL,
+	.vdev_op_need_resilver = vdev_default_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
+	.vdev_op_metaslab_init = NULL,
+	.vdev_op_config_generate = NULL,
+	.vdev_op_nparity = NULL,
+	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_MIRROR,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };

 vdev_ops_t vdev_replacing_ops = {
+	.vdev_op_init = NULL,
+	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
 	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_min_asize = vdev_default_min_asize,
+	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
 	.vdev_op_io_done = vdev_mirror_io_done,
 	.vdev_op_state_change = vdev_mirror_state_change,
-	.vdev_op_need_resilver = NULL,
+	.vdev_op_need_resilver = vdev_default_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
+	.vdev_op_metaslab_init = NULL,
+	.vdev_op_config_generate = NULL,
+	.vdev_op_nparity = NULL,
+	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_REPLACING,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };

 vdev_ops_t vdev_spare_ops = {
+	.vdev_op_init = NULL,
+	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
 	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_min_asize = vdev_default_min_asize,
+	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
 	.vdev_op_io_done = vdev_mirror_io_done,
 	.vdev_op_state_change = vdev_mirror_state_change,
-	.vdev_op_need_resilver = NULL,
+	.vdev_op_need_resilver = vdev_default_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
+	.vdev_op_metaslab_init = NULL,
+	.vdev_op_config_generate = NULL,
+	.vdev_op_nparity = NULL,
+	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_SPARE,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
@@ -81,9 +81,13 @@ vdev_missing_io_done(zio_t *zio)
 }

 vdev_ops_t vdev_missing_ops = {
+	.vdev_op_init = NULL,
+	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_missing_open,
 	.vdev_op_close = vdev_missing_close,
 	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_min_asize = vdev_default_min_asize,
+	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_missing_io_start,
 	.vdev_op_io_done = vdev_missing_io_done,
 	.vdev_op_state_change = NULL,
@@ -92,14 +96,23 @@ vdev_ops_t vdev_missing_ops = {
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = NULL,
+	.vdev_op_rebuild_asize = NULL,
+	.vdev_op_metaslab_init = NULL,
+	.vdev_op_config_generate = NULL,
+	.vdev_op_nparity = NULL,
+	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_MISSING,	/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };

 vdev_ops_t vdev_hole_ops = {
+	.vdev_op_init = NULL,
+	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_missing_open,
 	.vdev_op_close = vdev_missing_close,
 	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_min_asize = vdev_default_min_asize,
+	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_missing_io_start,
 	.vdev_op_io_done = vdev_missing_io_done,
 	.vdev_op_state_change = NULL,
@@ -108,6 +121,11 @@ vdev_ops_t vdev_hole_ops = {
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = NULL,
+	.vdev_op_rebuild_asize = NULL,
+	.vdev_op_metaslab_init = NULL,
+	.vdev_op_config_generate = NULL,
+	.vdev_op_nparity = NULL,
+	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_HOLE,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
@@ -593,6 +593,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
 		return (NULL);

+	/*
+	 * I/Os to distributed spares are directly dispatched to the dRAID
+	 * leaf vdevs for aggregation.  See the comment at the end of the
+	 * zio_vdev_io_start() function.
+	 */
+	ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops);
+
 	first = last = zio;

 	if (zio->io_type == ZIO_TYPE_READ)
@@ -149,7 +149,7 @@ vdev_raidz_math_get_ops(void)
 * Select parity generation method for raidz_map
 */
 int
-vdev_raidz_math_generate(raidz_map_t *rm)
+vdev_raidz_math_generate(raidz_map_t *rm, raidz_row_t *rr)
 {
 	raidz_gen_f gen_parity = NULL;

@@ -174,7 +174,7 @@ vdev_raidz_math_generate(raidz_map_t *rm)
 	if (gen_parity == NULL)
 		return (RAIDZ_ORIGINAL_IMPL);

-	gen_parity(rm);
+	gen_parity(rr);

 	return (0);
 }
@@ -241,8 +241,8 @@ reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
 * @nbaddata     - Number of failed data columns
 */
 int
-vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
-    const int *dt, const int nbaddata)
+vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr,
+    const int *parity_valid, const int *dt, const int nbaddata)
 {
 	raidz_rec_f rec_fn = NULL;

@@ -265,7 +265,7 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
 	if (rec_fn == NULL)
 		return (RAIDZ_ORIGINAL_IMPL);
 	else
-		return (rec_fn(rm, dt));
+		return (rec_fn(rr, dt));
 }

 const char *raidz_gen_name[] = {
@@ -26,6 +26,7 @@
 #define	_VDEV_RAIDZ_MATH_IMPL_H

 #include <sys/types.h>
+#include <sys/vdev_raidz_impl.h>

 #define	raidz_inline inline __attribute__((always_inline))
 #ifndef noinline
@@ -36,33 +37,33 @@
 * Functions calculate multiplication constants for data reconstruction.
 * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and
 * used parity columns for reconstruction.
- * @rm			RAIDZ map
+ * @rr			RAIDZ row
 * @tgtidx		array of missing data indexes
 * @coeff		output array of coefficients. Array must be provided by
 *         		user and must hold minimum MUL_CNT values.
 */
 static noinline void
-raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+raidz_rec_q_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
 {
-	const unsigned ncols = raidz_ncols(rm);
+	const unsigned ncols = rr->rr_cols;
 	const unsigned x = tgtidx[TARGET_X];

 	coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1));
 }

 static noinline void
-raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+raidz_rec_r_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
 {
-	const unsigned ncols = raidz_ncols(rm);
+	const unsigned ncols = rr->rr_cols;
 	const unsigned x = tgtidx[TARGET_X];

 	coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1));
 }

 static noinline void
-raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+raidz_rec_pq_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
 {
-	const unsigned ncols = raidz_ncols(rm);
+	const unsigned ncols = rr->rr_cols;
 	const unsigned x = tgtidx[TARGET_X];
 	const unsigned y = tgtidx[TARGET_Y];
 	gf_t a, b, e;
@@ -76,9 +77,9 @@ raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
 }

 static noinline void
-raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+raidz_rec_pr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
 {
-	const unsigned ncols = raidz_ncols(rm);
+	const unsigned ncols = rr->rr_cols;
 	const unsigned x = tgtidx[TARGET_X];
 	const unsigned y = tgtidx[TARGET_Y];

@@ -93,9 +94,9 @@ raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
 }

 static noinline void
-raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+raidz_rec_qr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
 {
-	const unsigned ncols = raidz_ncols(rm);
+	const unsigned ncols = rr->rr_cols;
 	const unsigned x = tgtidx[TARGET_X];
 	const unsigned y = tgtidx[TARGET_Y];

@@ -114,9 +115,9 @@ raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
 }

 static noinline void
-raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+raidz_rec_pqr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
 {
-	const unsigned ncols = raidz_ncols(rm);
+	const unsigned ncols = rr->rr_cols;
 	const unsigned x = tgtidx[TARGET_X];
 	const unsigned y = tgtidx[TARGET_Y];
 	const unsigned z = tgtidx[TARGET_Z];
@@ -347,26 +348,26 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private)
 /*
 * Generate P parity (RAIDZ1)
 *
- * @rm	RAIDZ map
+ * @rr	RAIDZ row
 */
 static raidz_inline void
-raidz_generate_p_impl(raidz_map_t * const rm)
+raidz_generate_p_impl(raidz_row_t * const rr)
 {
 	size_t c;
-	const size_t ncols = raidz_ncols(rm);
-	const size_t psize = rm->rm_col[CODE_P].rc_size;
-	abd_t *pabd = rm->rm_col[CODE_P].rc_abd;
+	const size_t ncols = rr->rr_cols;
+	const size_t psize = rr->rr_col[CODE_P].rc_size;
+	abd_t *pabd = rr->rr_col[CODE_P].rc_abd;
 	size_t size;
 	abd_t *dabd;

 	raidz_math_begin();

 	/* start with first data column */
-	raidz_copy(pabd, rm->rm_col[1].rc_abd, psize);
+	raidz_copy(pabd, rr->rr_col[1].rc_abd, psize);

 	for (c = 2; c < ncols; c++) {
-		dabd = rm->rm_col[c].rc_abd;
-		size = rm->rm_col[c].rc_size;
+		dabd = rr->rr_col[c].rc_abd;
+		size = rr->rr_col[c].rc_size;

 		/* add data column */
 		raidz_add(pabd, dabd, size);
@@ -414,29 +415,29 @@ raidz_gen_pq_add(void **c, const void *dc, const size_t csize,
 /*
 * Generate PQ parity (RAIDZ2)
 *
- * @rm	RAIDZ map
+ * @rr	RAIDZ row
 */
 static raidz_inline void
-raidz_generate_pq_impl(raidz_map_t * const rm)
+raidz_generate_pq_impl(raidz_row_t * const rr)
 {
 	size_t c;
-	const size_t ncols = raidz_ncols(rm);
-	const size_t csize = rm->rm_col[CODE_P].rc_size;
+	const size_t ncols = rr->rr_cols;
+	const size_t csize = rr->rr_col[CODE_P].rc_size;
 	size_t dsize;
 	abd_t *dabd;
 	abd_t *cabds[] = {
-		rm->rm_col[CODE_P].rc_abd,
-		rm->rm_col[CODE_Q].rc_abd
+		rr->rr_col[CODE_P].rc_abd,
+		rr->rr_col[CODE_Q].rc_abd
 	};

 	raidz_math_begin();

-	raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize);
-	raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize);
+	raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize);
+	raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize);

 	for (c = 3; c < ncols; c++) {
-		dabd = rm->rm_col[c].rc_abd;
-		dsize = rm->rm_col[c].rc_size;
+		dabd = rr->rr_col[c].rc_abd;
+		dsize = rr->rr_col[c].rc_size;

 		abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2,
 		    raidz_gen_pq_add);
@@ -487,31 +488,31 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize,
 /*
 * Generate PQR parity (RAIDZ2)
 *
- * @rm	RAIDZ map
+ * @rr	RAIDZ row
 */
 static raidz_inline void
-raidz_generate_pqr_impl(raidz_map_t * const rm)
+raidz_generate_pqr_impl(raidz_row_t * const rr)
 {
 	size_t c;
-	const size_t ncols = raidz_ncols(rm);
-	const size_t csize = rm->rm_col[CODE_P].rc_size;
+	const size_t ncols = rr->rr_cols;
+	const size_t csize = rr->rr_col[CODE_P].rc_size;
 	size_t dsize;
 	abd_t *dabd;
 	abd_t *cabds[] = {
-		rm->rm_col[CODE_P].rc_abd,
-		rm->rm_col[CODE_Q].rc_abd,
-		rm->rm_col[CODE_R].rc_abd
+		rr->rr_col[CODE_P].rc_abd,
+		rr->rr_col[CODE_Q].rc_abd,
+		rr->rr_col[CODE_R].rc_abd
 	};

 	raidz_math_begin();

-	raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize);
-	raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize);
-	raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize);
+	raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize);
+	raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize);
+	raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize);

 	for (c = 4; c < ncols; c++) {
-		dabd = rm->rm_col[c].rc_abd;
-		dsize = rm->rm_col[c].rc_size;
+		dabd = rr->rr_col[c].rc_abd;
+		dsize = rr->rr_col[c].rc_size;

 		abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3,
 		    raidz_gen_pqr_add);
@@ -579,33 +580,36 @@ raidz_generate_pqr_impl(raidz_map_t * const rm)
 * @syn_method	raidz_add_abd()
 * @rec_method	not applicable
 *
- * @rm		RAIDZ map
+ * @rr		RAIDZ row
 * @tgtidx	array of missing data indexes
 */
 static raidz_inline int
-raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx)
 {
 	size_t c;
-	const size_t firstdc = raidz_parity(rm);
-	const size_t ncols = raidz_ncols(rm);
+	const size_t firstdc = rr->rr_firstdatacol;
+	const size_t ncols = rr->rr_cols;
 	const size_t x = tgtidx[TARGET_X];
-	const size_t xsize = rm->rm_col[x].rc_size;
-	abd_t *xabd = rm->rm_col[x].rc_abd;
+	const size_t xsize = rr->rr_col[x].rc_size;
+	abd_t *xabd = rr->rr_col[x].rc_abd;
 	size_t size;
 	abd_t *dabd;

+	if (xabd == NULL)
+		return (1 << CODE_P);
+
 	raidz_math_begin();

 	/* copy P into target */
-	raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize);
+	raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize);

 	/* generate p_syndrome */
 	for (c = firstdc; c < ncols; c++) {
 		if (c == x)
 			continue;

-		dabd = rm->rm_col[c].rc_abd;
-		size = MIN(rm->rm_col[c].rc_size, xsize);
+		dabd = rr->rr_col[c].rc_abd;
+		size = MIN(rr->rr_col[c].rc_size, xsize);

 		raidz_add(xabd, dabd, size);
 	}
@@ -653,30 +657,33 @@ raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize,
 * @syn_method	raidz_add_abd()
 * @rec_method	raidz_mul_abd_cb()
 *
- * @rm		RAIDZ map
+ * @rr		RAIDZ row
 * @tgtidx	array of missing data indexes
 */
 static raidz_inline int
-raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx)
 {
 	size_t c;
 	size_t dsize;
 	abd_t *dabd;
-	const size_t firstdc = raidz_parity(rm);
-	const size_t ncols = raidz_ncols(rm);
+	const size_t firstdc = rr->rr_firstdatacol;
+	const size_t ncols = rr->rr_cols;
 	const size_t x = tgtidx[TARGET_X];
-	abd_t *xabd = rm->rm_col[x].rc_abd;
-	const size_t xsize = rm->rm_col[x].rc_size;
+	abd_t *xabd = rr->rr_col[x].rc_abd;
+	const size_t xsize = rr->rr_col[x].rc_size;
 	abd_t *tabds[] = { xabd };

+	if (xabd == NULL)
+		return (1 << CODE_Q);
+
 	unsigned coeff[MUL_CNT];
-	raidz_rec_q_coeff(rm, tgtidx, coeff);
+	raidz_rec_q_coeff(rr, tgtidx, coeff);

 	raidz_math_begin();

 	/* Start with first data column if present */
 	if (firstdc != x) {
-		raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+		raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
 	} else {
 		raidz_zero(xabd, xsize);
 	}
@@ -687,8 +694,8 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
 			dabd = NULL;
 			dsize = 0;
 		} else {
-			dabd = rm->rm_col[c].rc_abd;
-			dsize = rm->rm_col[c].rc_size;
+			dabd = rr->rr_col[c].rc_abd;
+			dsize = rr->rr_col[c].rc_size;
 		}

 		abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
@@ -696,7 +703,7 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
 	}

 	/* add Q to the syndrome */
-	raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize);
+	raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize);

 	/* transform the syndrome */
 	abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff);
@@ -744,30 +751,33 @@ raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize,
 * @syn_method	raidz_add_abd()
 * @rec_method	raidz_mul_abd_cb()
 *
- * @rm		RAIDZ map
+ * @rr		RAIDZ row
 * @tgtidx	array of missing data indexes
 */
 static raidz_inline int
-raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx)
 {
 	size_t c;
 	size_t dsize;
 	abd_t *dabd;
-	const size_t firstdc = raidz_parity(rm);
-	const size_t ncols = raidz_ncols(rm);
+	const size_t firstdc = rr->rr_firstdatacol;
+	const size_t ncols = rr->rr_cols;
 	const size_t x = tgtidx[TARGET_X];
-	const size_t xsize = rm->rm_col[x].rc_size;
-	abd_t *xabd = rm->rm_col[x].rc_abd;
+	const size_t xsize = rr->rr_col[x].rc_size;
+	abd_t *xabd = rr->rr_col[x].rc_abd;
 	abd_t *tabds[] = { xabd };

+	if (xabd == NULL)
+		return (1 << CODE_R);
+
 	unsigned coeff[MUL_CNT];
-	raidz_rec_r_coeff(rm, tgtidx, coeff);
+	raidz_rec_r_coeff(rr, tgtidx, coeff);

 	raidz_math_begin();

 	/* Start with first data column if present */
 	if (firstdc != x) {
-		raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+		raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
 	} else {
 		raidz_zero(xabd, xsize);
 	}
@@ -779,8 +789,8 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
 			dabd = NULL;
 			dsize = 0;
 		} else {
-			dabd = rm->rm_col[c].rc_abd;
-			dsize = rm->rm_col[c].rc_size;
+			dabd = rr->rr_col[c].rc_abd;
+			dsize = rr->rr_col[c].rc_size;
 		}

 		abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
@@ -788,7 +798,7 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
 	}

 	/* add R to the syndrome */
-	raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize);
+	raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize);

 	/* transform the syndrome */
 	abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
@@ -881,31 +891,34 @@ raidz_rec_pq_abd(void **tc, const size_t tsize, void **c,
 * @syn_method	raidz_syn_pq_abd()
 * @rec_method	raidz_rec_pq_abd()
 *
- * @rm		RAIDZ map
+ * @rr		RAIDZ row
 * @tgtidx	array of missing data indexes
 */
 static raidz_inline int
-raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx)
 {
 	size_t c;
 	size_t dsize;
 	abd_t *dabd;
-	const size_t firstdc = raidz_parity(rm);
-	const size_t ncols = raidz_ncols(rm);
+	const size_t firstdc = rr->rr_firstdatacol;
+	const size_t ncols = rr->rr_cols;
 	const size_t x = tgtidx[TARGET_X];
 	const size_t y = tgtidx[TARGET_Y];
-	const size_t xsize = rm->rm_col[x].rc_size;
-	const size_t ysize = rm->rm_col[y].rc_size;
-	abd_t *xabd = rm->rm_col[x].rc_abd;
-	abd_t *yabd = rm->rm_col[y].rc_abd;
+	const size_t xsize = rr->rr_col[x].rc_size;
+	const size_t ysize = rr->rr_col[y].rc_size;
+	abd_t *xabd = rr->rr_col[x].rc_abd;
+	abd_t *yabd = rr->rr_col[y].rc_abd;
 	abd_t *tabds[2] = { xabd, yabd };
 	abd_t *cabds[] = {
-		rm->rm_col[CODE_P].rc_abd,
-		rm->rm_col[CODE_Q].rc_abd
+		rr->rr_col[CODE_P].rc_abd,
+		rr->rr_col[CODE_Q].rc_abd
 	};

+	if (xabd == NULL)
+		return ((1 << CODE_P) | (1 << CODE_Q));
+
 	unsigned coeff[MUL_CNT];
-	raidz_rec_pq_coeff(rm, tgtidx, coeff);
+	raidz_rec_pq_coeff(rr, tgtidx, coeff);

 	/*
 	 * Check if some of targets is shorter then others
@@ -921,8 +934,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)

 	/* Start with first data column if present */
 	if (firstdc != x) {
-		raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
-		raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+		raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+		raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
 	} else {
 		raidz_zero(xabd, xsize);
 		raidz_zero(yabd, xsize);
@@ -934,8 +947,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
 			dabd = NULL;
 			dsize = 0;
 		} else {
-			dabd = rm->rm_col[c].rc_abd;
-			dsize = rm->rm_col[c].rc_size;
+			dabd = rr->rr_col[c].rc_abd;
+			dsize = rr->rr_col[c].rc_size;
 		}

 		abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
@@ -946,7 +959,7 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)

 	/* Copy shorter targets back to the original abd buffer */
 	if (ysize < xsize)
-		raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+		raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);

 	raidz_math_end();

@@ -1038,30 +1051,34 @@ raidz_rec_pr_abd(void **t, const size_t tsize, void **c,
 * @syn_method	raidz_syn_pr_abd()
 * @rec_method	raidz_rec_pr_abd()
 *
- * @rm		RAIDZ map
+ * @rr		RAIDZ row
 * @tgtidx	array of missing data indexes
 */
 static raidz_inline int
-raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
 {
 	size_t c;
 	size_t dsize;
 	abd_t *dabd;
-	const size_t firstdc = raidz_parity(rm);
-	const size_t ncols = raidz_ncols(rm);
+	const size_t firstdc = rr->rr_firstdatacol;
+	const size_t ncols = rr->rr_cols;
 	const size_t x = tgtidx[0];
 	const size_t y = tgtidx[1];
-	const size_t xsize = rm->rm_col[x].rc_size;
-	const size_t ysize = rm->rm_col[y].rc_size;
-	abd_t *xabd = rm->rm_col[x].rc_abd;
-	abd_t *yabd = rm->rm_col[y].rc_abd;
+	const size_t xsize = rr->rr_col[x].rc_size;
+	const size_t ysize = rr->rr_col[y].rc_size;
+	abd_t *xabd = rr->rr_col[x].rc_abd;
+	abd_t *yabd = rr->rr_col[y].rc_abd;
 	abd_t *tabds[2] = { xabd, yabd };
 	abd_t *cabds[] = {
-		rm->rm_col[CODE_P].rc_abd,
-		rm->rm_col[CODE_R].rc_abd
+		rr->rr_col[CODE_P].rc_abd,
+		rr->rr_col[CODE_R].rc_abd
 	};
+
+	if (xabd == NULL)
+		return ((1 << CODE_P) | (1 << CODE_R));
+
 	unsigned coeff[MUL_CNT];
-	raidz_rec_pr_coeff(rm, tgtidx, coeff);
+	raidz_rec_pr_coeff(rr, tgtidx, coeff);

 	/*
 	 * Check if some of targets are shorter then others.
@@ -1077,8 +1094,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)

 	/* Start with first data column if present */
 	if (firstdc != x) {
-		raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
-		raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+		raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+		raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
 	} else {
 		raidz_zero(xabd, xsize);
 		raidz_zero(yabd, xsize);
@@ -1090,8 +1107,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
 			dabd = NULL;
 			dsize = 0;
 		} else {
-			dabd = rm->rm_col[c].rc_abd;
-			dsize = rm->rm_col[c].rc_size;
+			dabd = rr->rr_col[c].rc_abd;
+			dsize = rr->rr_col[c].rc_size;
 		}

 		abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
@@ -1104,14 +1121,14 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
 	 * Copy shorter targets back to the original abd buffer
 	 */
 	if (ysize < xsize)
-		raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+		raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);

 	raidz_math_end();

 	if (ysize < xsize)
 		abd_free(yabd);

-	return ((1 << CODE_P) | (1 << CODE_Q));
+	return ((1 << CODE_P) | (1 << CODE_R));
 }


@@ -1201,30 +1218,34 @@ raidz_rec_qr_abd(void **t, const size_t tsize, void **c,
 * @syn_method	raidz_syn_qr_abd()
 * @rec_method	raidz_rec_qr_abd()
 *
- * @rm		RAIDZ map
+ * @rr		RAIDZ row
 * @tgtidx	array of missing data indexes
 */
 static raidz_inline int
-raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
 {
 	size_t c;
 	size_t dsize;
 	abd_t *dabd;
-	const size_t firstdc = raidz_parity(rm);
-	const size_t ncols = raidz_ncols(rm);
+	const size_t firstdc = rr->rr_firstdatacol;
+	const size_t ncols = rr->rr_cols;
 	const size_t x = tgtidx[TARGET_X];
 	const size_t y = tgtidx[TARGET_Y];
-	const size_t xsize = rm->rm_col[x].rc_size;
-	const size_t ysize = rm->rm_col[y].rc_size;
-	abd_t *xabd = rm->rm_col[x].rc_abd;
-	abd_t *yabd = rm->rm_col[y].rc_abd;
+	const size_t xsize = rr->rr_col[x].rc_size;
+	const size_t ysize = rr->rr_col[y].rc_size;
+	abd_t *xabd = rr->rr_col[x].rc_abd;
+	abd_t *yabd = rr->rr_col[y].rc_abd;
 	abd_t *tabds[2] = { xabd, yabd };
 	abd_t *cabds[] = {
-		rm->rm_col[CODE_Q].rc_abd,
-		rm->rm_col[CODE_R].rc_abd
+		rr->rr_col[CODE_Q].rc_abd,
+		rr->rr_col[CODE_R].rc_abd
 	};
+
+	if (xabd == NULL)
+		return ((1 << CODE_Q) | (1 << CODE_R));
+
 	unsigned coeff[MUL_CNT];
-	raidz_rec_qr_coeff(rm, tgtidx, coeff);
+	raidz_rec_qr_coeff(rr, tgtidx, coeff);

 	/*
 	 * Check if some of targets is shorter then others
@@ -1240,8 +1261,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)

 	/* Start with first data column if present */
 	if (firstdc != x) {
-		raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
-		raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+		raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+		raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
 	} else {
 		raidz_zero(xabd, xsize);
 		raidz_zero(yabd, xsize);
@@ -1253,8 +1274,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
 			dabd = NULL;
 			dsize = 0;
 		} else {
-			dabd = rm->rm_col[c].rc_abd;
-			dsize = rm->rm_col[c].rc_size;
+			dabd = rr->rr_col[c].rc_abd;
+			dsize = rr->rr_col[c].rc_size;
 		}

 		abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
@@ -1267,7 +1288,7 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
 	 * Copy shorter targets back to the original abd buffer
 	 */
 	if (ysize < xsize)
-		raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+		raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);

 	raidz_math_end();

@@ -1384,34 +1405,38 @@ raidz_rec_pqr_abd(void **t, const size_t tsize, void **c,
 * @syn_method	raidz_syn_pqr_abd()
 * @rec_method	raidz_rec_pqr_abd()
 *
- * @rm		RAIDZ map
+ * @rr		RAIDZ row
 * @tgtidx	array of missing data indexes
 */
 static raidz_inline int
-raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
 {
 	size_t c;
 	size_t dsize;
 	abd_t *dabd;
-	const size_t firstdc = raidz_parity(rm);
-	const size_t ncols = raidz_ncols(rm);
+	const size_t firstdc = rr->rr_firstdatacol;
+	const size_t ncols = rr->rr_cols;
 	const size_t x = tgtidx[TARGET_X];
 	const size_t y = tgtidx[TARGET_Y];
 	const size_t z = tgtidx[TARGET_Z];
-	const size_t xsize = rm->rm_col[x].rc_size;
-	const size_t ysize = rm->rm_col[y].rc_size;
-	const size_t zsize = rm->rm_col[z].rc_size;
-	abd_t *xabd = rm->rm_col[x].rc_abd;
-	abd_t *yabd = rm->rm_col[y].rc_abd;
-	abd_t *zabd = rm->rm_col[z].rc_abd;
+	const size_t xsize = rr->rr_col[x].rc_size;
+	const size_t ysize = rr->rr_col[y].rc_size;
+	const size_t zsize = rr->rr_col[z].rc_size;
+	abd_t *xabd = rr->rr_col[x].rc_abd;
+	abd_t *yabd = rr->rr_col[y].rc_abd;
+	abd_t *zabd = rr->rr_col[z].rc_abd;
 	abd_t *tabds[] = { xabd, yabd, zabd };
 	abd_t *cabds[] = {
-		rm->rm_col[CODE_P].rc_abd,
-		rm->rm_col[CODE_Q].rc_abd,
-		rm->rm_col[CODE_R].rc_abd
+		rr->rr_col[CODE_P].rc_abd,
+		rr->rr_col[CODE_Q].rc_abd,
+		rr->rr_col[CODE_R].rc_abd
 	};
+
+	if (xabd == NULL)
+		return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
+
 	unsigned coeff[MUL_CNT];
-	raidz_rec_pqr_coeff(rm, tgtidx, coeff);
+	raidz_rec_pqr_coeff(rr, tgtidx, coeff);

 	/*
 	 * Check if some of targets is shorter then others
@@ -1431,9 +1456,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)

 	/* Start with first data column if present */
 	if (firstdc != x) {
-		raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
-		raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
-		raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize);
+		raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+		raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+		raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize);
 	} else {
 		raidz_zero(xabd, xsize);
 		raidz_zero(yabd, xsize);
@@ -1446,8 +1471,8 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
 			dabd = NULL;
 			dsize = 0;
 		} else {
-			dabd = rm->rm_col[c].rc_abd;
-			dsize = rm->rm_col[c].rc_size;
+			dabd = rr->rr_col[c].rc_abd;
+			dsize = rr->rr_col[c].rc_size;
 		}

 		abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
@@ -1460,9 +1485,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
 	 * Copy shorter targets back to the original abd buffer
 	 */
 	if (ysize < xsize)
-		raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+		raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
 	if (zsize < xsize)
-		raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize);
+		raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize);

 	raidz_math_end();

@@ -25,6 +25,7 @@
 */

 #include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
 #include <sys/dsl_scan.h>
 #include <sys/spa_impl.h>
 #include <sys/metaslab_impl.h>
@@ -63,13 +64,15 @@
 *
 * Limitations:
 *
- *   - Only supported for mirror vdev types.  Due to the variable stripe
- *     width used by raidz sequential reconstruction is not possible.
+ *   - Sequential reconstruction is not possible on RAIDZ due to its
+ *     variable stripe width.  Note dRAID uses a fixed stripe width which
+ *     avoids this issue, but comes at the expense of some usable capacity.
 *
- *   - Block checksums are not verified during sequential reconstuction.
+ *   - Block checksums are not verified during sequential reconstruction.
 *     Similar to traditional RAID the parity/mirror data is reconstructed
 *     but cannot be immediately double checked.  For this reason when the
- *     last active resilver completes the pool is automatically scrubbed.
+ *     last active resilver completes the pool is automatically scrubbed
+ *     by default.
 *
 *   - Deferred resilvers using sequential reconstruction are not currently
 *     supported.  When adding another vdev to an active top-level resilver
@@ -77,8 +80,8 @@
 *
 * Advantages:
 *
- *   - Sequential reconstuction is performed in LBA order which may be faster
- *     than healing reconstuction particularly when using using HDDs (or
+ *   - Sequential reconstruction is performed in LBA order which may be faster
+ *     than healing reconstruction particularly when using HDDs (or
 *     especially with SMR devices).  Only allocated capacity is resilvered.
 *
 *   - Sequential reconstruction is not constrained by ZFS block boundaries.
@@ -86,9 +89,9 @@
 *     allowing all of these logical blocks to be repaired with a single IO.
 *
 *   - Unlike a healing resilver or scrub which are pool wide operations,
- *     sequential reconstruction is handled by the top-level mirror vdevs.
- *     This allows for it to be started or canceled on a top-level vdev
- *     without impacting any other top-level vdevs in the pool.
+ *     sequential reconstruction is handled by the top-level vdevs.  This
+ *     allows for it to be started or canceled on a top-level vdev without
+ *     impacting any other top-level vdevs in the pool.
 *
 *   - Data only referenced by a pool checkpoint will be repaired because
 *     that space is reflected in the space maps.  This differs for a
@@ -97,18 +100,36 @@


 /*
- * Maximum number of queued rebuild I/Os top-level vdev.  The number of
- * concurrent rebuild I/Os issued to the device is controlled by the
- * zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module
- * options.
- */
-unsigned int zfs_rebuild_queue_limit = 20;
-
-/*
- * Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE.
+ * Size of rebuild reads; defaults to 1MiB per data disk and is capped at
+ * SPA_MAXBLOCKSIZE.
 */
 unsigned long zfs_rebuild_max_segment = 1024 * 1024;

+/*
+ * Maximum number of bytes in flight per leaf vdev issued by a sequential
+ * resilver.  We attempt to strike a balance here between keeping the vdev
+ * queues full of I/Os at all times and not overflowing the queues, which
+ * would cause long latency and in turn long txg sync times.
+ *
+ * A large default value can be safely used here because the default target
+ * segment size is also large (zfs_rebuild_max_segment=1M).  This helps keep
+ * the queue depth short.
+ *
+ * 32MB was selected as the default value to achieve good performance with
+ * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential
+ * rebuild was unable to saturate all of the drives using smaller values.
+ * With a value of 32MB the sequential resilver write rate was measured at
+ * 800MB/s sustained while rebuilding to a distributed spare.
+ */
+unsigned long zfs_rebuild_vdev_limit = 32 << 20;
+
+/*
+ * Automatically start a pool scrub when the last active sequential resilver
+ * completes in order to verify the checksums of all blocks which have been
+ * resilvered. This option is enabled by default and is strongly recommended.
+ */
+int zfs_rebuild_scrub_enabled = 1;
+
 /*
 * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
 */
@@ -293,7 +314,7 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
 	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
 	    REBUILD_PHYS_ENTRIES, vrp, tx));

-	vdev_dtl_reassess(vd,  tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
+	vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
 	spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);

 	spa_history_log_internal(spa, "rebuild",  tx,
@@ -306,7 +327,16 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
 	vd->vdev_rebuilding = B_FALSE;
 	mutex_exit(&vd->vdev_rebuild_lock);

-	spa_notify_waiters(spa);
+	/*
+	 * While we're in syncing context take the opportunity to
+	 * set up the scrub when there are no more active rebuilds.
+	 */
+	if (!vdev_rebuild_active(spa->spa_root_vdev) &&
+	    zfs_rebuild_scrub_enabled) {
+		pool_scan_func_t func = POOL_SCAN_SCRUB;
+		dsl_scan_setup_sync(&func, tx);
+	}
+
 	cv_broadcast(&vd->vdev_rebuild_cv);
 }

@@ -438,7 +468,7 @@ vdev_rebuild_cb(zio_t *zio)
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 	vdev_t *vd = vr->vr_top_vdev;

-	mutex_enter(&vd->vdev_rebuild_io_lock);
+	mutex_enter(&vr->vr_io_lock);
 	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
 		/*
 		 * The I/O failed because the top-level vdev was unavailable.
@@ -455,34 +485,30 @@ vdev_rebuild_cb(zio_t *zio)

 	abd_free(zio->io_abd);

-	ASSERT3U(vd->vdev_rebuild_inflight, >, 0);
-	vd->vdev_rebuild_inflight--;
-	cv_broadcast(&vd->vdev_rebuild_io_cv);
-	mutex_exit(&vd->vdev_rebuild_io_lock);
+	ASSERT3U(vr->vr_bytes_inflight, >, 0);
+	vr->vr_bytes_inflight -= zio->io_size;
+	cv_broadcast(&vr->vr_io_cv);
+	mutex_exit(&vr->vr_io_lock);

 	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 }

 /*
- * Rebuild the data in this range by constructing a special dummy block
- * pointer for the given range.  It has no relation to any existing blocks
- * in the pool.  But by disabling checksum verification and issuing a scrub
- * I/O mirrored vdevs will replicate the block using any available mirror
- * leaf vdevs.
+ * Initialize a block pointer that can be used to read the given segment
+ * for sequential rebuild.
 */
 static void
-vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize,
-    uint64_t txg)
+vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start,
+    uint64_t asize)
 {
-	vdev_t *vd = vr->vr_top_vdev;
-	spa_t *spa = vd->vdev_spa;
-	uint64_t psize = asize;
-
-	ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
+	ASSERT(vd->vdev_ops == &vdev_draid_ops ||
+	    vd->vdev_ops == &vdev_mirror_ops ||
 	    vd->vdev_ops == &vdev_replacing_ops ||
 	    vd->vdev_ops == &vdev_spare_ops);

-	blkptr_t blk, *bp = &blk;
+	uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
+	    vdev_draid_asize_to_psize(vd, asize) : asize;
+
 	BP_ZERO(bp);

 	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
@@ -499,19 +525,6 @@ vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize,
 	BP_SET_LEVEL(bp, 0);
 	BP_SET_DEDUP(bp, 0);
 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
-	/*
-	 * We increment the issued bytes by the asize rather than the psize
-	 * so the scanned and issued bytes may be directly compared.  This
-	 * is consistent with the scrub/resilver issued reporting.
-	 */
-	vr->vr_pass_bytes_issued += asize;
-	vr->vr_rebuild_phys.vrp_bytes_issued += asize;
-
-	zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp,
-	    abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
-	    ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
-	    ZIO_FLAG_RESILVER, NULL));
 }

 /*
@@ -525,6 +538,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
 	uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id;
 	vdev_t *vd = vr->vr_top_vdev;
 	spa_t *spa = vd->vdev_spa;
+	blkptr_t blk;

 	ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift);
 	ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift);
@@ -532,14 +546,26 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
 	vr->vr_pass_bytes_scanned += size;
 	vr->vr_rebuild_phys.vrp_bytes_scanned += size;

-	mutex_enter(&vd->vdev_rebuild_io_lock);
+	/*
+	 * Rebuild the data in this range by constructing a special block
+	 * pointer.  It has no relation to any existing blocks in the pool.
+	 * However, by disabling checksum verification and issuing a scrub IO
+	 * we can reconstruct and repair any children with missing data.
+	 */
+	vdev_rebuild_blkptr_init(&blk, vd, start, size);
+	uint64_t psize = BP_GET_PSIZE(&blk);
+
+	if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
+		return (0);
+
+	mutex_enter(&vr->vr_io_lock);

 	/* Limit in flight rebuild I/Os */
-	while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit)
-		cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
+	while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max)
+		cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);

-	vd->vdev_rebuild_inflight++;
-	mutex_exit(&vd->vdev_rebuild_io_lock);
+	vr->vr_bytes_inflight += psize;
+	mutex_exit(&vr->vr_io_lock);

 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
@@ -558,45 +584,29 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)

 	/* When exiting write out our progress. */
 	if (vdev_rebuild_should_stop(vd)) {
-		mutex_enter(&vd->vdev_rebuild_io_lock);
-		vd->vdev_rebuild_inflight--;
-		mutex_exit(&vd->vdev_rebuild_io_lock);
+		mutex_enter(&vr->vr_io_lock);
+		vr->vr_bytes_inflight -= psize;
+		mutex_exit(&vr->vr_io_lock);
 		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 		mutex_exit(&vd->vdev_rebuild_lock);
 		dmu_tx_commit(tx);
 		return (SET_ERROR(EINTR));
 	}
 	mutex_exit(&vd->vdev_rebuild_lock);
-
-	vr->vr_scan_offset[txg & TXG_MASK] = start + size;
-	vdev_rebuild_rebuild_block(vr, start, size, txg);
-
 	dmu_tx_commit(tx);

+	vr->vr_scan_offset[txg & TXG_MASK] = start + size;
+	vr->vr_pass_bytes_issued += size;
+	vr->vr_rebuild_phys.vrp_bytes_issued += size;
+
+	zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk,
+	    abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
+	    ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
+	    ZIO_FLAG_RESILVER, NULL));
+
 	return (0);
 }

-/*
- * Split range into legally-sized logical chunks given the constraints of the
- * top-level mirror vdev type.
- */
-static uint64_t
-vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size)
-{
-	uint64_t chunk_size, max_asize, max_segment;
-
-	ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
-	    vd->vdev_ops == &vdev_replacing_ops ||
-	    vd->vdev_ops == &vdev_spare_ops);
-
-	max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment,
-	    1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE);
-	max_asize = vdev_psize_to_asize(vd, max_segment);
-	chunk_size = MIN(size, max_asize);
-
-	return (chunk_size);
-}
-
 /*
 * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree.
 */
@@ -625,7 +635,14 @@ vdev_rebuild_ranges(vdev_rebuild_t *vr)
 		while (size > 0) {
 			uint64_t chunk_size;

-			chunk_size = vdev_rebuild_chunk_size(vd, start, size);
+			/*
+			 * Split range into legally-sized logical chunks
+			 * given the constraints of the top-level vdev
+			 * being rebuilt (dRAID or mirror).
+			 */
+			ASSERT3P(vd->vdev_ops, !=, NULL);
+			chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd,
+			    start, size, zfs_rebuild_max_segment);

 			error = vdev_rebuild_range(vr, start, chunk_size);
 			if (error != 0)
@@ -747,10 +764,16 @@ vdev_rebuild_thread(void *arg)
 	vr->vr_top_vdev = vd;
 	vr->vr_scan_msp = NULL;
 	vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+	mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);
+
 	vr->vr_pass_start_time = gethrtime();
 	vr->vr_pass_bytes_scanned = 0;
 	vr->vr_pass_bytes_issued = 0;

+	vr->vr_bytes_inflight_max = MAX(1ULL << 20,
+	    zfs_rebuild_vdev_limit * vd->vdev_children);
+
 	uint64_t update_est_time = gethrtime();
 	vdev_rebuild_update_bytes_est(vd, 0);

@@ -780,21 +803,32 @@ vdev_rebuild_thread(void *arg)

 		ASSERT0(range_tree_space(vr->vr_scan_tree));

-		/*
-		 * Disable any new allocations to this metaslab and wait
-		 * for any writes inflight to complete.  This is needed to
-		 * ensure all allocated ranges are rebuilt.
-		 */
+		/* Disable any new allocations to this metaslab */
 		metaslab_disable(msp);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
-		txg_wait_synced(dsl, 0);

 		mutex_enter(&msp->ms_sync_lock);
 		mutex_enter(&msp->ms_lock);

+		/*
+		 * If there are outstanding allocations wait for them to be
+		 * synced.  This is needed to ensure all allocated ranges are
+		 * on disk and therefore will be rebuilt.
+		 */
+		for (int j = 0; j < TXG_SIZE; j++) {
+			if (range_tree_space(msp->ms_allocating[j])) {
+				mutex_exit(&msp->ms_lock);
+				mutex_exit(&msp->ms_sync_lock);
+				txg_wait_synced(dsl, 0);
+				mutex_enter(&msp->ms_sync_lock);
+				mutex_enter(&msp->ms_lock);
+				break;
+			}
+		}
+
 		/*
 		 * When a metaslab has been allocated from read its allocated
-		 * ranges from the space map object in to the vr_scan_tree.
+		 * ranges from the space map object into the vr_scan_tree.
 		 * Then add inflight / unflushed ranges and remove inflight /
 		 * unflushed frees.  This is the minimum range to be rebuilt.
 		 */
@@ -827,7 +861,7 @@ vdev_rebuild_thread(void *arg)
 		/*
 		 * To provide an accurate estimate re-calculate the estimated
 		 * size every 5 minutes to account for recent allocations and
-		 * frees made space maps which have not yet been rebuilt.
+		 * frees made to space maps which have not yet been rebuilt.
 		 */
 		if (gethrtime() > update_est_time + SEC2NSEC(300)) {
 			update_est_time = gethrtime();
@@ -851,11 +885,14 @@ vdev_rebuild_thread(void *arg)
 	spa_config_exit(spa, SCL_CONFIG, FTAG);

 	/* Wait for any remaining rebuild I/O to complete */
-	mutex_enter(&vd->vdev_rebuild_io_lock);
-	while (vd->vdev_rebuild_inflight > 0)
-		cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
+	mutex_enter(&vr->vr_io_lock);
+	while (vr->vr_bytes_inflight > 0)
+		cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);

-	mutex_exit(&vd->vdev_rebuild_io_lock);
+	mutex_exit(&vr->vr_io_lock);
+
+	mutex_destroy(&vr->vr_io_lock);
+	cv_destroy(&vr->vr_io_cv);

 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

@@ -1100,5 +1137,11 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)

 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW,
-        "Max segment size in bytes of rebuild reads");
+	"Max segment size in bytes of rebuild reads");
+
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW,
+	"Max bytes in flight per leaf vdev for sequential resilvers");
+
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,
+	"Automatically scrub after sequential resilver completes");
 /* END CSTYLED */
@@ -250,7 +250,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
 	spa_vdev_removal_t *svr = NULL;
 	uint64_t txg __maybe_unused = dmu_tx_get_txg(tx);

-	ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
+	ASSERT0(vdev_get_nparity(vd));
 	svr = spa_vdev_removal_create(vd);

 	ASSERT(vd->vdev_removing);
@@ -1120,7 +1120,7 @@ static void
 vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
 {
 	ASSERT3P(zlist, !=, NULL);
-	ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
+	ASSERT0(vdev_get_nparity(vd));

 	if (vd->vdev_leaf_zap != 0) {
 		char zkey[32];
@@ -2041,7 +2041,7 @@ spa_vdev_remove_top_check(vdev_t *vd)

 	/*
 	 * All vdevs in normal class must have the same ashift
-	 * and not be raidz.
+	 * and not be raidz or draid.
 	 */
 	vdev_t *rvd = spa->spa_root_vdev;
 	int num_indirect = 0;
@@ -2064,7 +2064,7 @@ spa_vdev_remove_top_check(vdev_t *vd)
 			num_indirect++;
 		if (!vdev_is_concrete(cvd))
 			continue;
-		if (cvd->vdev_ops == &vdev_raidz_ops)
+		if (vdev_get_nparity(cvd) != 0)
 			return (SET_ERROR(EINVAL));
 		/*
 		 * Need the mirror to be mirror of leaf vdevs only
@@ -2217,18 +2217,30 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 		 * in this pool.
 		 */
 		if (vd == NULL || unspare) {
-			if (vd == NULL)
-				vd = spa_lookup_by_guid(spa, guid, B_TRUE);
-			ev = spa_event_create(spa, vd, NULL,
-			    ESC_ZFS_VDEV_REMOVE_AUX);
+			char *type;
+			boolean_t draid_spare = B_FALSE;

-			vd_type = VDEV_TYPE_SPARE;
-			vd_path = spa_strdup(fnvlist_lookup_string(
-			    nv, ZPOOL_CONFIG_PATH));
-			spa_vdev_remove_aux(spa->spa_spares.sav_config,
-			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
-			spa_load_spares(spa);
-			spa->spa_spares.sav_sync = B_TRUE;
+			if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type)
+			    == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
+				draid_spare = B_TRUE;
+
+			if (vd == NULL && draid_spare) {
+				error = SET_ERROR(ENOTSUP);
+			} else {
+				if (vd == NULL)
+					vd = spa_lookup_by_guid(spa,
+					    guid, B_TRUE);
+				ev = spa_event_create(spa, vd, NULL,
+				    ESC_ZFS_VDEV_REMOVE_AUX);
+
+				vd_type = VDEV_TYPE_SPARE;
+				vd_path = spa_strdup(fnvlist_lookup_string(
+				    nv, ZPOOL_CONFIG_PATH));
+				spa_vdev_remove_aux(spa->spa_spares.sav_config,
+				    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
+				spa_load_spares(spa);
+				spa->spa_spares.sav_sync = B_TRUE;
+			}
 		} else {
 			error = SET_ERROR(EBUSY);
 		}
@@ -142,9 +142,13 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
 }

 vdev_ops_t vdev_root_ops = {
+	.vdev_op_init = NULL,
+	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_root_open,
 	.vdev_op_close = vdev_root_close,
 	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_min_asize = vdev_default_min_asize,
+	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = NULL,	/* not applicable to the root */
 	.vdev_op_io_done = NULL,	/* not applicable to the root */
 	.vdev_op_state_change = vdev_root_state_change,
@@ -153,6 +157,11 @@ vdev_ops_t vdev_root_ops = {
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = NULL,
+	.vdev_op_rebuild_asize = NULL,
+	.vdev_op_metaslab_init = NULL,
+	.vdev_op_config_generate = NULL,
+	.vdev_op_nparity = NULL,
+	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_ROOT,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE		/* not a leaf vdev */
 };
@@ -311,7 +311,8 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
 			vd->vdev_trim_secure = secure;
 	}

-	boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED);
+	vdev_trim_state_t old_state = vd->vdev_trim_state;
+	boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED);
 	vd->vdev_trim_state = new_state;

 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
@@ -332,9 +333,12 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
 		    "vdev=%s suspended", vd->vdev_path);
 		break;
 	case VDEV_TRIM_CANCELED:
-		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
-		spa_history_log_internal(spa, "trim", tx,
-		    "vdev=%s canceled", vd->vdev_path);
+		if (old_state == VDEV_TRIM_ACTIVE ||
+		    old_state == VDEV_TRIM_SUSPENDED) {
+			spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
+			spa_history_log_internal(spa, "trim", tx,
+			    "vdev=%s canceled", vd->vdev_path);
+		}
 		break;
 	case VDEV_TRIM_COMPLETE:
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
@@ -601,6 +605,32 @@ vdev_trim_ranges(trim_args_t *ta)
 	return (0);
 }

+static void
+vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
+{
+	uint64_t *furthest_end = arg;	/* running max kept by the caller */
+
+	if (*furthest_end < physical_rs->rs_end)
+		*furthest_end = physical_rs->rs_end;
+}
+
+static void
+vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs)
+{
+	vdev_t *leaf = arg;
+	uint64_t seg_start = physical_rs->rs_start;
+	uint64_t seg_end = physical_rs->rs_end;
+	uint64_t trimmed_to = leaf->vdev_trim_last_offset;
+
+	/* The full segment always counts toward the estimate. */
+	leaf->vdev_trim_bytes_est += seg_end - seg_start;
+	/* Only the part below vdev_trim_last_offset counts as done. */
+	if (trimmed_to >= seg_end)
+		leaf->vdev_trim_bytes_done += seg_end - seg_start;
+	else if (trimmed_to > seg_start)
+		leaf->vdev_trim_bytes_done += trimmed_to - seg_start;
+}
+
 /*
 * Calculates the completion percentage of a manual TRIM.
 */
@@ -618,27 +648,35 @@ vdev_trim_calculate_progress(vdev_t *vd)
 		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
 		mutex_enter(&msp->ms_lock);

-		uint64_t ms_free = msp->ms_size -
-		    metaslab_allocated_space(msp);
-
-		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
-			ms_free /= vd->vdev_top->vdev_children;
+		uint64_t ms_free = (msp->ms_size -
+		    metaslab_allocated_space(msp)) /
+		    vdev_get_ndisks(vd->vdev_top);

 		/*
 		 * Convert the metaslab range to a physical range
 		 * on our vdev. We use this to determine if we are
 		 * in the middle of this metaslab range.
 		 */
-		range_seg64_t logical_rs, physical_rs;
+		range_seg64_t logical_rs, physical_rs, remain_rs;
 		logical_rs.rs_start = msp->ms_start;
 		logical_rs.rs_end = msp->ms_start + msp->ms_size;
-		vdev_xlate(vd, &logical_rs, &physical_rs);

+		/* Metaslab space after this offset has not been trimmed. */
+		vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
 		if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
 			vd->vdev_trim_bytes_est += ms_free;
 			mutex_exit(&msp->ms_lock);
 			continue;
-		} else if (vd->vdev_trim_last_offset > physical_rs.rs_end) {
+		}
+
+		/* Metaslab space before this offset has been trimmed */
+		uint64_t last_rs_end = physical_rs.rs_end;
+		if (!vdev_xlate_is_empty(&remain_rs)) {
+			vdev_xlate_walk(vd, &remain_rs,
+			    vdev_trim_xlate_last_rs_end, &last_rs_end);
+		}
+
+		if (vd->vdev_trim_last_offset > last_rs_end) {
 			vd->vdev_trim_bytes_done += ms_free;
 			vd->vdev_trim_bytes_est += ms_free;
 			mutex_exit(&msp->ms_lock);
@@ -659,21 +697,9 @@ vdev_trim_calculate_progress(vdev_t *vd)
 		    rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
 			logical_rs.rs_start = rs_get_start(rs, rt);
 			logical_rs.rs_end = rs_get_end(rs, rt);
-			vdev_xlate(vd, &logical_rs, &physical_rs);

-			uint64_t size = physical_rs.rs_end -
-			    physical_rs.rs_start;
-			vd->vdev_trim_bytes_est += size;
-			if (vd->vdev_trim_last_offset >= physical_rs.rs_end) {
-				vd->vdev_trim_bytes_done += size;
-			} else if (vd->vdev_trim_last_offset >
-			    physical_rs.rs_start &&
-			    vd->vdev_trim_last_offset <=
-			    physical_rs.rs_end) {
-				vd->vdev_trim_bytes_done +=
-				    vd->vdev_trim_last_offset -
-				    physical_rs.rs_start;
-			}
+			vdev_xlate_walk(vd, &logical_rs,
+			    vdev_trim_xlate_progress, vd);
 		}
 		mutex_exit(&msp->ms_lock);
 	}
@@ -741,8 +767,38 @@ vdev_trim_load(vdev_t *vd)
 	return (err);
 }

+static void
+vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs)
+{
+	trim_args_t *args = arg;
+	vdev_t *leaf = args->trim_vdev;
+	boolean_t manual = (args->trim_type == TRIM_TYPE_MANUAL);
+
+	/*
+	 * A manual trim works through the vdev in offset order, so a
+	 * segment ending at or before vdev_trim_last_offset was already
+	 * visited.  An auto trim adds every valid range it is given.
+	 */
+	if (manual && physical_rs->rs_end <= leaf->vdev_trim_last_offset)
+		return;
+
+	/* Manual trim stopped inside this segment: resume from there. */
+	if (manual && leaf->vdev_trim_last_offset > physical_rs->rs_start) {
+		ASSERT3U(physical_rs->rs_end, >,
+		    leaf->vdev_trim_last_offset);
+		physical_rs->rs_start = leaf->vdev_trim_last_offset;
+	}
+
+	/* Whatever remains must be a non-empty segment. */
+	ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
+
+	uint64_t length = physical_rs->rs_end - physical_rs->rs_start;
+
+	range_tree_add(args->trim_tree, physical_rs->rs_start, length);
+}
+
 /*
- * Convert the logical range into a physical range and add it to the
+ * Convert the logical range into physical ranges and add them to the
 * range tree passed in the trim_args_t.
 */
 static void
@@ -750,7 +806,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
 {
 	trim_args_t *ta = arg;
 	vdev_t *vd = ta->trim_vdev;
-	range_seg64_t logical_rs, physical_rs;
+	range_seg64_t logical_rs;
 	logical_rs.rs_start = start;
 	logical_rs.rs_end = start + size;

@@ -767,44 +823,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
 	}

 	ASSERT(vd->vdev_ops->vdev_op_leaf);
-	vdev_xlate(vd, &logical_rs, &physical_rs);
-
-	IMPLY(vd->vdev_top == vd,
-	    logical_rs.rs_start == physical_rs.rs_start);
-	IMPLY(vd->vdev_top == vd,
-	    logical_rs.rs_end == physical_rs.rs_end);
-
-	/*
-	 * Only a manual trim will be traversing the vdev sequentially.
-	 * For an auto trim all valid ranges should be added.
-	 */
-	if (ta->trim_type == TRIM_TYPE_MANUAL) {
-
-		/* Only add segments that we have not visited yet */
-		if (physical_rs.rs_end <= vd->vdev_trim_last_offset)
-			return;
-
-		/* Pick up where we left off mid-range. */
-		if (vd->vdev_trim_last_offset > physical_rs.rs_start) {
-			ASSERT3U(physical_rs.rs_end, >,
-			    vd->vdev_trim_last_offset);
-			physical_rs.rs_start = vd->vdev_trim_last_offset;
-		}
-	}
-
-	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
-
-	/*
-	 * With raidz, it's possible that the logical range does not live on
-	 * this leaf vdev. We only add the physical range to this vdev's if it
-	 * has a length greater than 0.
-	 */
-	if (physical_rs.rs_end > physical_rs.rs_start) {
-		range_tree_add(ta->trim_tree, physical_rs.rs_start,
-		    physical_rs.rs_end - physical_rs.rs_start);
-	} else {
-		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
-	}
+	vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg);
 }

 /*
@@ -1111,7 +1111,9 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
 		bcopy(info, report->zcr_ckinfo, sizeof (*info));
 	}

-	report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
+	report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
+	report->zcr_align =
+	    vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
 	report->zcr_length = length;

 #ifdef _KERNEL
@@ -1702,16 +1702,16 @@ zio_write_compress(zio_t *zio)
 			return (zio);
 		} else {
 			/*
-			 * Round up compressed size up to the ashift
-			 * of the smallest-ashift device, and zero the tail.
-			 * This ensures that the compressed size of the BP
-			 * (and thus compressratio property) are correct,
+			 * Round compressed size up to the minimum allocation
+			 * size of the smallest-ashift device, and zero the
+			 * tail. This ensures that the compressed size of the
+			 * BP (and thus compressratio property) are correct,
 			 * in that we charge for the padding used to fill out
 			 * the last sector.
 			 */
-			ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
-			size_t rounded = (size_t)P2ROUNDUP(psize,
-			    1ULL << spa->spa_min_ashift);
+			ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSIZE);
+			size_t rounded = (size_t)roundup(psize,
+			    spa->spa_min_alloc);
 			if (rounded >= lsize) {
 				compress = ZIO_COMPRESS_OFF;
 				zio_buf_free(cbuf, lsize);
@@ -3754,19 +3754,37 @@ zio_vdev_io_start(zio_t *zio)
 	 * However, indirect vdevs point off to other vdevs which may have
 	 * DTL's, so we never bypass them.  The child i/os on concrete vdevs
 	 * will be properly bypassed instead.
+	 *
+	 * Leaf DTL_PARTIAL can be empty when a legitimate write comes from
+	 * a dRAID spare vdev. For example, when a dRAID spare is first
+	 * used, its spare blocks need to be written to but the leaf vdevs
+	 * of such blocks can have an empty DTL_PARTIAL.
+	 *
+	 * There is no clean way to allow such writes while still bypassing
+	 * spurious ones, so for correctness all bypassing is avoided for
+	 * dRAID.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
 	    zio->io_txg != 0 &&	/* not a delegated i/o */
 	    vd->vdev_ops != &vdev_indirect_ops &&
+	    vd->vdev_top->vdev_ops != &vdev_draid_ops &&
 	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		zio_vdev_io_bypass(zio);
 		return (zio);
 	}

-	if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ ||
-	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) {
+	/*
+	 * Select the next best leaf I/O to process.  Distributed spares are
+	 * excluded since they dispatch the I/O directly to a leaf vdev after
+	 * applying the dRAID mapping.
+	 */
+	if (vd->vdev_ops->vdev_op_leaf &&
+	    vd->vdev_ops != &vdev_draid_spare_ops &&
+	    (zio->io_type == ZIO_TYPE_READ ||
+	    zio->io_type == ZIO_TYPE_WRITE ||
+	    zio->io_type == ZIO_TYPE_TRIM)) {

 		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
 			return (zio);
@@ -3803,8 +3821,8 @@ zio_vdev_io_done(zio_t *zio)
 	if (zio->io_delay)
 		zio->io_delay = gethrtime() - zio->io_delay;

-	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
-
+	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+	    vd->vdev_ops != &vdev_draid_spare_ops) {
 		vdev_queue_io_done(zio);

 		if (zio->io_type == ZIO_TYPE_WRITE)
@@ -4206,7 +4224,7 @@ zio_checksum_verify(zio_t *zio)
 		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
 			return (zio);

-		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
+		ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
 	}

 	if ((error = zio_checksum_error(zio, &info)) != 0) {
@@ -265,6 +265,12 @@ zio_handle_fault_injection(zio_t *zio, int error)
 	if (zio->io_type != ZIO_TYPE_READ)
 		return (0);

+	/*
+	 * A rebuild I/O has no checksum to verify.
+	 */
+	if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
+		return (0);
+
 	rw_enter(&inject_lock, RW_READER);

 	for (handler = list_head(&inject_handlers); handler != NULL;