RAID-Z expansion feature

This feature allows disks to be added one at a time to a RAID-Z group, expanding its capacity incrementally. This feature is especially useful for small pools (typically with only one RAID-Z group), where there isn't sufficient hardware to add capacity by adding a whole new RAID-Z group (typically doubling the number of disks). == Initiating expansion == A new device (disk) can be attached to an existing RAIDZ vdev by running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank raidz2-0 sda`. The new device will become part of the RAIDZ group. A "raidz expansion" will be initiated, and the new device will contribute additional space to the RAIDZ group once the expansion completes. The `feature@raidz_expansion` on-disk feature flag must be `enabled` to initiate an expansion, and it remains `active` for the life of the pool. In other words, pools with expanded RAIDZ vdevs cannot be imported by older releases of the ZFS software. == During expansion == The expansion entails reading all allocated space from existing disks in the RAIDZ group, and rewriting it across the new set of disks in the RAIDZ group (including the newly added device). The expansion progress can be monitored with `zpool status`. Data redundancy is maintained during (and after) the expansion. If a disk fails while the expansion is in progress, the expansion pauses until the health of the RAIDZ vdev is restored (e.g. by replacing the failed disk and waiting for reconstruction to complete). The pool remains accessible during expansion. Following a reboot or export/import, the expansion resumes where it left off. == After expansion == When the expansion completes, the additional space is available for use, and is reflected in the `available` zfs property (as seen in `zfs list`, `df`, etc.). Expansion does not change the number of failures that can be tolerated without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after expansion). A RAIDZ vdev can be expanded multiple times.
After the expansion completes, old blocks remain with their old data-to-parity ratio (e.g. a 5-wide RAIDZ2 has 3 data to 2 parity), but are distributed among the larger set of disks. New blocks will be written with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been expanded once to 6-wide has 4 data to 2 parity). However, the RAIDZ vdev's "assumed parity ratio" does not change, so slightly less space than is expected may be reported for newly-written blocks, according to `zfs list`, `df`, `ls -s`, and similar tools. Sponsored-by: The FreeBSD Foundation Sponsored-by: iXsystems, Inc. Sponsored-by: vStack Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Mark Maybee <mark.maybee@delphix.com> Authored-by: Matthew Ahrens <mahrens@delphix.com> Contributions-by: Fedor Uporov <fuporov.vstack@gmail.com> Contributions-by: Stuart Maybee <stuart.maybee@comcast.net> Contributions-by: Thorsten Behrens <tbehrens@outlook.com> Contributions-by: Fmstrat <nospam@nowsci.com> Contributions-by: Don Brady <dev.fs.zfs@gmail.com> Signed-off-by: Don Brady <dev.fs.zfs@gmail.com> Closes #15022
2026-05-22 10:37:35 +03:00 · 2023-11-08 11:19:41 -07:00
parent 9198de8f10
commit 5caeef02fa
62 changed files with 5740 additions and 876 deletions
@@ -84,10 +84,10 @@ run_gen_bench_impl(const char *impl)

 			if (rto_opts.rto_expand) {
 				rm_bench = vdev_raidz_map_alloc_expanded(
-				    zio_bench.io_abd,
-				    zio_bench.io_size, zio_bench.io_offset,
+				    &zio_bench,
 				    rto_opts.rto_ashift, ncols+1, ncols,
-				    fn+1, rto_opts.rto_expand_offset);
+				    fn+1, rto_opts.rto_expand_offset,
+				    0, B_FALSE);
 			} else {
 				rm_bench = vdev_raidz_map_alloc(&zio_bench,
 				    BENCH_ASHIFT, ncols, fn+1);
@@ -172,10 +172,10 @@ run_rec_bench_impl(const char *impl)

 			if (rto_opts.rto_expand) {
 				rm_bench = vdev_raidz_map_alloc_expanded(
-				    zio_bench.io_abd,
-				    zio_bench.io_size, zio_bench.io_offset,
+				    &zio_bench,
 				    BENCH_ASHIFT, ncols+1, ncols,
-				    PARITY_PQR, rto_opts.rto_expand_offset);
+				    PARITY_PQR,
+				    rto_opts.rto_expand_offset, 0, B_FALSE);
 			} else {
 				rm_bench = vdev_raidz_map_alloc(&zio_bench,
 				    BENCH_ASHIFT, ncols, PARITY_PQR);
@@ -327,14 +327,12 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)

 	if (opts->rto_expand) {
 		opts->rm_golden =
-		    vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
-		    opts->zio_golden->io_size, opts->zio_golden->io_offset,
+		    vdev_raidz_map_alloc_expanded(opts->zio_golden,
 		    opts->rto_ashift, total_ncols+1, total_ncols,
-		    parity, opts->rto_expand_offset);
-		rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
-		    zio_test->io_size, zio_test->io_offset,
+		    parity, opts->rto_expand_offset, 0, B_FALSE);
+		rm_test = vdev_raidz_map_alloc_expanded(zio_test,
 		    opts->rto_ashift, total_ncols+1, total_ncols,
-		    parity, opts->rto_expand_offset);
+		    parity, opts->rto_expand_offset, 0, B_FALSE);
 	} else {
 		opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
 		    opts->rto_ashift, total_ncols, parity);
@@ -361,187 +359,6 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
 	return (err);
 }

-/*
- * If reflow is not in progress, reflow_offset should be UINT64_MAX.
- * For each row, if the row is entirely before reflow_offset, it will
- * come from the new location.  Otherwise this row will come from the
- * old location.  Therefore, rows that straddle the reflow_offset will
- * come from the old location.
- *
- * NOTE: Until raidz expansion is implemented this function is only
- * needed by raidz_test.c to the multi-row raid_map_t functionality.
- */
-raidz_map_t *
-vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
-    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
-    uint64_t nparity, uint64_t reflow_offset)
-{
-	/* The zio's size in units of the vdev's minimum sector size. */
-	uint64_t s = size >> ashift;
-	uint64_t q, r, bc, devidx, asize = 0, tot;
-
-	/*
-	 * "Quotient": The number of data sectors for this stripe on all but
-	 * the "big column" child vdevs that also contain "remainder" data.
-	 * AKA "full rows"
-	 */
-	q = s / (logical_cols - nparity);
-
-	/*
-	 * "Remainder": The number of partial stripe data sectors in this I/O.
-	 * This will add a sector to some, but not all, child vdevs.
-	 */
-	r = s - q * (logical_cols - nparity);
-
-	/* The number of "big columns" - those which contain remainder data. */
-	bc = (r == 0 ? 0 : r + nparity);
-
-	/*
-	 * The total number of data and parity sectors associated with
-	 * this I/O.
-	 */
-	tot = s + nparity * (q + (r == 0 ? 0 : 1));
-
-	/* How many rows contain data (not skip) */
-	uint64_t rows = howmany(tot, logical_cols);
-	int cols = MIN(tot, logical_cols);
-
-	raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
-	    KM_SLEEP);
-	rm->rm_nrows = rows;
-
-	for (uint64_t row = 0; row < rows; row++) {
-		raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
-		    rr_col[cols]), KM_SLEEP);
-		rm->rm_row[row] = rr;
-
-		/* The starting RAIDZ (parent) vdev sector of the row. */
-		uint64_t b = (offset >> ashift) + row * logical_cols;
-
-		/*
-		 * If we are in the middle of a reflow, and any part of this
-		 * row has not been copied, then use the old location of
-		 * this row.
-		 */
-		int row_phys_cols = physical_cols;
-		if (b + (logical_cols - nparity) > reflow_offset >> ashift)
-			row_phys_cols--;
-
-		/* starting child of this row */
-		uint64_t child_id = b % row_phys_cols;
-		/* The starting byte offset on each child vdev. */
-		uint64_t child_offset = (b / row_phys_cols) << ashift;
-
-		/*
-		 * We set cols to the entire width of the block, even
-		 * if this row is shorter.  This is needed because parity
-		 * generation (for Q and R) needs to know the entire width,
-		 * because it treats the short row as though it was
-		 * full-width (and the "phantom" sectors were zero-filled).
-		 *
-		 * Another approach to this would be to set cols shorter
-		 * (to just the number of columns that we might do i/o to)
-		 * and have another mechanism to tell the parity generation
-		 * about the "entire width".  Reconstruction (at least
-		 * vdev_raidz_reconstruct_general()) would also need to
-		 * know about the "entire width".
-		 */
-		rr->rr_cols = cols;
-		rr->rr_bigcols = bc;
-		rr->rr_missingdata = 0;
-		rr->rr_missingparity = 0;
-		rr->rr_firstdatacol = nparity;
-		rr->rr_abd_empty = NULL;
-		rr->rr_nempty = 0;
-
-		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
-			if (child_id >= row_phys_cols) {
-				child_id -= row_phys_cols;
-				child_offset += 1ULL << ashift;
-			}
-			rr->rr_col[c].rc_devidx = child_id;
-			rr->rr_col[c].rc_offset = child_offset;
-			rr->rr_col[c].rc_orig_data = NULL;
-			rr->rr_col[c].rc_error = 0;
-			rr->rr_col[c].rc_tried = 0;
-			rr->rr_col[c].rc_skipped = 0;
-			rr->rr_col[c].rc_need_orig_restore = B_FALSE;
-
-			uint64_t dc = c - rr->rr_firstdatacol;
-			if (c < rr->rr_firstdatacol) {
-				rr->rr_col[c].rc_size = 1ULL << ashift;
-				rr->rr_col[c].rc_abd =
-				    abd_alloc_linear(rr->rr_col[c].rc_size,
-				    B_TRUE);
-			} else if (row == rows - 1 && bc != 0 && c >= bc) {
-				/*
-				 * Past the end, this for parity generation.
-				 */
-				rr->rr_col[c].rc_size = 0;
-				rr->rr_col[c].rc_abd = NULL;
-			} else {
-				/*
-				 * "data column" (col excluding parity)
-				 * Add an ASCII art diagram here
-				 */
-				uint64_t off;
-
-				if (c < bc || r == 0) {
-					off = dc * rows + row;
-				} else {
-					off = r * rows +
-					    (dc - r) * (rows - 1) + row;
-				}
-				rr->rr_col[c].rc_size = 1ULL << ashift;
-				rr->rr_col[c].rc_abd = abd_get_offset_struct(
-				    &rr->rr_col[c].rc_abdstruct,
-				    abd, off << ashift, 1 << ashift);
-			}
-
-			asize += rr->rr_col[c].rc_size;
-		}
-		/*
-		 * If all data stored spans all columns, there's a danger that
-		 * parity will always be on the same device and, since parity
-		 * isn't read during normal operation, that that device's I/O
-		 * bandwidth won't be used effectively. We therefore switch
-		 * the parity every 1MB.
-		 *
-		 * ...at least that was, ostensibly, the theory. As a practical
-		 * matter unless we juggle the parity between all devices
-		 * evenly, we won't see any benefit. Further, occasional writes
-		 * that aren't a multiple of the LCM of the number of children
-		 * and the minimum stripe width are sufficient to avoid pessimal
-		 * behavior. Unfortunately, this decision created an implicit
-		 * on-disk format requirement that we need to support for all
-		 * eternity, but only for single-parity RAID-Z.
-		 *
-		 * If we intend to skip a sector in the zeroth column for
-		 * padding we must make sure to note this swap. We will never
-		 * intend to skip the first column since at least one data and
-		 * one parity column must appear in each row.
-		 */
-		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
-		    (offset & (1ULL << 20))) {
-			ASSERT(rr->rr_cols >= 2);
-			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
-			devidx = rr->rr_col[0].rc_devidx;
-			uint64_t o = rr->rr_col[0].rc_offset;
-			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
-			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
-			rr->rr_col[1].rc_devidx = devidx;
-			rr->rr_col[1].rc_offset = o;
-		}
-
-	}
-	ASSERT3U(asize, ==, tot << ashift);
-
-	/* init RAIDZ parity ops */
-	rm->rm_ops = vdev_raidz_math_get_ops();
-
-	return (rm);
-}
-
 static raidz_map_t *
 init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
 {
@@ -561,10 +378,9 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
 	init_zio_abd(*zio);

 	if (opts->rto_expand) {
-		rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
-		    (*zio)->io_size, (*zio)->io_offset,
+		rm = vdev_raidz_map_alloc_expanded(*zio,
 		    opts->rto_ashift, total_ncols+1, total_ncols,
-		    parity, opts->rto_expand_offset);
+		    parity, opts->rto_expand_offset, 0, B_FALSE);
 	} else {
 		rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
 		    total_ncols, parity);
@@ -119,7 +119,4 @@ void init_zio_abd(zio_t *zio);

 void run_raidz_benchmark(void);

-struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t,
-    uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
-
 #endif /* RAIDZ_TEST_H */
@@ -4134,6 +4134,11 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
 	}
 	(void) printf("\tcheckpoint_txg = %llu\n",
 	    (u_longlong_t)ub->ub_checkpoint_txg);
+
+	(void) printf("\traidz_reflow state=%u off=%llu\n",
+	    (int)RRSS_GET_STATE(ub),
+	    (u_longlong_t)RRSS_GET_OFFSET(ub));
+
 	(void) printf("%s", footer ? footer : "");
 }

@@ -6650,9 +6650,17 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
 	ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing,
 	    rebuild);

-	if (ret == 0 && wait)
-		ret = zpool_wait(zhp,
-		    replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER);
+	if (ret == 0 && wait) {
+		zpool_wait_activity_t activity = ZPOOL_WAIT_RESILVER;
+		char raidz_prefix[] = "raidz";
+		if (replacing) {
+			activity = ZPOOL_WAIT_REPLACE;
+		} else if (strncmp(old_disk,
+		    raidz_prefix, strlen(raidz_prefix)) == 0) {
+			activity = ZPOOL_WAIT_RAIDZ_EXPAND;
+		}
+		ret = zpool_wait(zhp, activity);
+	}

 	nvlist_free(props);
 	nvlist_free(nvroot);
@@ -6678,17 +6686,21 @@ zpool_do_replace(int argc, char **argv)
 }

 /*
- * zpool attach [-fsw] [-o property=value] <pool> <device> <new_device>
+ * zpool attach [-fsw] [-o property=value] <pool> <device>|<vdev> <new_device>
 *
 *	-f	Force attach, even if <new_device> appears to be in use.
 *	-s	Use sequential instead of healing reconstruction for resilver.
 *	-o	Set property=value.
- *	-w	Wait for resilvering to complete before returning
+ *	-w	Wait for resilvering (mirror) or expansion (raidz) to complete
+ *		before returning.
 *
- * Attach <new_device> to the mirror containing <device>.  If <device> is not
- * part of a mirror, then <device> will be transformed into a mirror of
- * <device> and <new_device>.  In either case, <new_device> will begin life
- * with a DTL of [0, now], and will immediately begin to resilver itself.
+ * Attach <new_device> to a <device> or <vdev>, where the vdev can be of type
+ * mirror or raidz. If <device> is not part of a mirror, then <device> will
+ * be transformed into a mirror of <device> and <new_device>. When a mirror
+ * is involved, <new_device> will begin life with a DTL of [0, now], and will
+ * immediately begin to resilver itself. For the raidz case, an expansion will
+ * commence and reflow the raidz data across all the disks including the
+ * <new_device>.
 */
 int
 zpool_do_attach(int argc, char **argv)
@@ -8195,6 +8207,97 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs)
 	}
 }

+/*
+ * Print out detailed raidz expansion status.
+ *
+ * Prints nothing when 'pres' is NULL or its state is DSS_NONE.  Otherwise
+ * emits an "expand: " heading followed by either a completion summary
+ * (DSS_FINISHED) or a progress line with copy rate and time estimate
+ * (any other state, which is asserted to be DSS_SCANNING).  The vdev is
+ * named after the child at index pres_expanding_vdev of the pool's vdev
+ * tree, as looked up through 'zhp'.
+ */
+static void
+print_raidz_expand_status(zpool_handle_t *zhp, pool_raidz_expand_stat_t *pres)
+{
+	char copied_buf[7];
+
+	if (pres == NULL || pres->pres_state == DSS_NONE)
+		return;
+
+	/*
+	 * Determine name of vdev.
+	 */
+	nvlist_t *config = zpool_get_config(zhp, NULL);
+	nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
+	    ZPOOL_CONFIG_VDEV_TREE);
+	nvlist_t **child;
+	uint_t children;
+	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0);
+	assert(pres->pres_expanding_vdev < children);
+
+	printf_color(ANSI_BOLD, gettext("expand: "));
+
+	time_t start = pres->pres_start_time;
+	time_t end = pres->pres_end_time;
+	char *vname =
+	    zpool_vdev_name(g_zfs, zhp, child[pres->pres_expanding_vdev], 0);
+	zfs_nicenum(pres->pres_reflowed, copied_buf, sizeof (copied_buf));
+
+	if (pres->pres_state == DSS_FINISHED) {
+		/*
+		 * Expansion has finished.
+		 */
+		char time_buf[32];
+		secs_to_dhms(end - start, time_buf);
+
+		/* %u needs an unsigned argument; 'end' is already a time_t. */
+		(void) printf(gettext("expanded %s-%u copied %s in %s, "
+		    "on %s"), vname, (uint_t)pres->pres_expanding_vdev,
+		    copied_buf, time_buf, ctime(&end));
+	} else {
+		char done_buf[7], total_buf[7], rate_buf[7];
+		uint64_t copied, total, elapsed, rate, secs_left;
+		double fraction_done;
+
+		assert(pres->pres_state == DSS_SCANNING);
+
+		/*
+		 * Expansion is in progress.
+		 */
+		(void) printf(gettext(
+		    "expansion of %s-%u in progress since %s"),
+		    vname, (uint_t)pres->pres_expanding_vdev, ctime(&start));
+
+		/* The copied amount is floored at 1. */
+		copied = pres->pres_reflowed > 0 ? pres->pres_reflowed : 1;
+		total = pres->pres_to_reflow;
+		/* With nothing to reflow, report 100% instead of inf/NaN. */
+		fraction_done = total > 0 ? (double)copied / total : 1.0;
+
+		/*
+		 * Elapsed time for this pass, clamped to at least one second
+		 * (this also covers a clock reading earlier than the start).
+		 */
+		time_t now = time(NULL);
+		elapsed = now > start ? (uint64_t)(now - start) : 1;
+		/* Kept 64-bit so a rate above 32 bits is not truncated. */
+		rate = copied / elapsed;
+		rate = rate > 0 ? rate : 1;
+		/*
+		 * 'copied' can exceed 'total' (e.g. via the floor of 1
+		 * above); do not let the unsigned difference wrap around.
+		 */
+		secs_left = total > copied ? (total - copied) / rate : 0;
+
+		zfs_nicenum(copied, done_buf, sizeof (done_buf));
+		zfs_nicenum(total, total_buf, sizeof (total_buf));
+		zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
+
+		/*
+		 * do not print estimated time if secs_left is more than
+		 * 30 days
+		 */
+		(void) printf(gettext("\t%s / %s copied at %s/s, %.2f%% done"),
+		    done_buf, total_buf, rate_buf, 100 * fraction_done);
+		if (pres->pres_waiting_for_resilver) {
+			(void) printf(gettext(", paused for resilver or "
+			    "clear\n"));
+		} else if (secs_left < (30 * 24 * 3600)) {
+			char time_buf[32];
+			secs_to_dhms(secs_left, time_buf);
+			(void) printf(gettext(", %s to go\n"), time_buf);
+		} else {
+			(void) printf(gettext(
+			    ", (copy is slow, no estimated time)\n"));
+		}
+	}
+	free(vname);
+}
 static void
 print_checkpoint_status(pool_checkpoint_stat_t *pcs)
 {
@@ -8772,19 +8875,24 @@ status_callback(zpool_handle_t *zhp, void *data)
 		uint64_t nerr;
 		nvlist_t **spares, **l2cache;
 		uint_t nspares, nl2cache;
-		pool_checkpoint_stat_t *pcs = NULL;
-		pool_removal_stat_t *prs = NULL;

 		print_scan_status(zhp, nvroot);

+		pool_removal_stat_t *prs = NULL;
 		(void) nvlist_lookup_uint64_array(nvroot,
 		    ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
 		print_removal_status(zhp, prs);

+		pool_checkpoint_stat_t *pcs = NULL;
 		(void) nvlist_lookup_uint64_array(nvroot,
 		    ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
 		print_checkpoint_status(pcs);

+		pool_raidz_expand_stat_t *pres = NULL;
+		(void) nvlist_lookup_uint64_array(nvroot,
+		    ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c);
+		print_raidz_expand_status(zhp, pres);
+
 		cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
 		    cbp->cb_name_flags | VDEV_NAME_TYPE_ID);
 		if (cbp->cb_namewidth < 10)
@@ -10738,8 +10846,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
 	pool_checkpoint_stat_t *pcs = NULL;
 	pool_scan_stat_t *pss = NULL;
 	pool_removal_stat_t *prs = NULL;
+	pool_raidz_expand_stat_t *pres = NULL;
 	const char *const headers[] = {"DISCARD", "FREE", "INITIALIZE",
-	    "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM"};
+	    "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND"};
 	int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES];

 	/* Calculate the width of each column */
@@ -10798,6 +10907,13 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
 		    vdev_activity_top_remaining(nvroot);
 	}

+	(void) nvlist_lookup_uint64_array(nvroot,
+	    ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c);
+	if (pres != NULL && pres->pres_state == DSS_SCANNING) {
+		int64_t rem = pres->pres_to_reflow - pres->pres_reflowed;
+		bytes_rem[ZPOOL_WAIT_RAIDZ_EXPAND] = rem;
+	}
+
 	bytes_rem[ZPOOL_WAIT_INITIALIZE] =
 	    vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE);
 	bytes_rem[ZPOOL_WAIT_TRIM] =
@@ -10827,11 +10943,12 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
 		if (!wd->wd_enabled[i])
 			continue;

-		if (wd->wd_exact)
+		if (wd->wd_exact) {
 			(void) snprintf(buf, sizeof (buf), "%" PRIi64,
 			    bytes_rem[i]);
-		else
+		} else {
 			zfs_nicenum(bytes_rem[i], buf, sizeof (buf));
+		}

 		if (wd->wd_scripted)
 			(void) printf(i == 0 ? "%s" : "\t%s", buf);
@@ -10937,7 +11054,8 @@ zpool_do_wait(int argc, char **argv)
 			for (char *tok; (tok = strsep(&optarg, ",")); ) {
 				static const char *const col_opts[] = {
 				    "discard", "free", "initialize", "replace",
-				    "remove", "resilver", "scrub", "trim" };
+				    "remove", "resilver", "scrub", "trim",
+				    "raidz_expand" };

 				for (i = 0; i < ARRAY_SIZE(col_opts); ++i)
 					if (strcmp(tok, col_opts[i]) == 0) {