From 5caeef02fa531238b4554afc977533382e43314f Mon Sep 17 00:00:00 2001 From: Don Brady Date: Wed, 8 Nov 2023 11:19:41 -0700 Subject: [PATCH] RAID-Z expansion feature This feature allows disks to be added one at a time to a RAID-Z group, expanding its capacity incrementally. This feature is especially useful for small pools (typically with only one RAID-Z group), where there isn't sufficient hardware to add capacity by adding a whole new RAID-Z group (typically doubling the number of disks). == Initiating expansion == A new device (disk) can be attached to an existing RAIDZ vdev, by running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank raidz2-0 sda`. The new device will become part of the RAIDZ group. A "raidz expansion" will be initiated, and the new device will contribute additional space to the RAIDZ group once the expansion completes. The `feature@raidz_expansion` on-disk feature flag must be `enabled` to initiate an expansion, and it remains `active` for the life of the pool. In other words, pools with expanded RAIDZ vdevs can not be imported by older releases of the ZFS software. == During expansion == The expansion entails reading all allocated space from existing disks in the RAIDZ group, and rewriting it to the new disks in the RAIDZ group (including the newly added device). The expansion progress can be monitored with `zpool status`. Data redundancy is maintained during (and after) the expansion. If a disk fails while the expansion is in progress, the expansion pauses until the health of the RAIDZ vdev is restored (e.g. by replacing the failed disk and waiting for reconstruction to complete). The pool remains accessible during expansion. Following a reboot or export/import, the expansion resumes where it left off. == After expansion == When the expansion completes, the additional space is available for use, and is reflected in the `available` zfs property (as seen in `zfs list`, `df`, etc). 
Expansion does not change the number of failures that can be tolerated without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after expansion). A RAIDZ vdev can be expanded multiple times. After the expansion completes, old blocks remain with their old data-to-parity ratio (e.g. 5-wide RAIDZ2, has 3 data to 2 parity), but distributed among the larger set of disks. New blocks will be written with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been expanded once to 6-wide, has 4 data to 2 parity). However, the RAIDZ vdev's "assumed parity ratio" does not change, so slightly less space than is expected may be reported for newly-written blocks, according to `zfs list`, `df`, `ls -s`, and similar tools. Sponsored-by: The FreeBSD Foundation Sponsored-by: iXsystems, Inc. Sponsored-by: vStack Reviewed-by: Brian Behlendorf Reviewed-by: Mark Maybee Authored-by: Matthew Ahrens Contributions-by: Fedor Uporov Contributions-by: Stuart Maybee Contributions-by: Thorsten Behrens Contributions-by: Fmstrat Contributions-by: Don Brady Signed-off-by: Don Brady Closes #15022 --- cmd/raidz_test/raidz_bench.c | 12 +- cmd/raidz_test/raidz_test.c | 196 +- cmd/raidz_test/raidz_test.h | 3 - cmd/zdb/zdb.c | 5 + cmd/zpool/zpool_main.c | 148 +- cmd/ztest.c | 1048 +++++-- contrib/pyzfs/libzfs_core/_constants.py | 2 + .../pyzfs/libzfs_core/_error_translation.py | 3 + contrib/pyzfs/libzfs_core/exceptions.py | 6 + include/libzfs.h | 1 + include/sys/fs/zfs.h | 25 + include/sys/spa_impl.h | 4 + include/sys/uberblock_impl.h | 35 + include/sys/vdev.h | 10 +- include/sys/vdev_impl.h | 6 +- include/sys/vdev_raidz.h | 101 +- include/sys/vdev_raidz_impl.h | 47 +- include/sys/zfs_debug.h | 1 + include/zfeature_common.h | 1 + lib/libzfs/libzfs.abi | 371 ++- lib/libzfs/libzfs_pool.c | 47 +- lib/libzfs/libzfs_util.c | 5 + lib/libzfs_core/libzfs_core.abi | 12 +- lib/libzpool/Makefile.am | 1 + man/man1/ztest.1 | 22 +- man/man4/zfs.4 | 19 + man/man7/zpool-features.7 | 32 +- man/man8/zpool-attach.8 | 
53 +- man/man8/zpool-wait.8 | 6 +- module/Kbuild.in | 1 + module/os/freebsd/zfs/vdev_label_os.c | 59 + module/os/linux/zfs/vdev_label_os.c | 45 + module/os/linux/zfs/zfs_debug.c | 3 +- module/zcommon/zfeature_common.c | 5 + module/zcommon/zpool_prop.c | 3 + module/zfs/arc.c | 8 +- module/zfs/dsl_scan.c | 1 - module/zfs/metaslab.c | 12 +- module/zfs/spa.c | 242 +- module/zfs/spa_checkpoint.c | 3 + module/zfs/vdev.c | 114 +- module/zfs/vdev_draid.c | 28 +- module/zfs/vdev_initialize.c | 12 +- module/zfs/vdev_label.c | 51 +- module/zfs/vdev_raidz.c | 2556 ++++++++++++++++- module/zfs/vdev_trim.c | 17 +- scripts/zloop.sh | 63 +- tests/runfiles/common.run | 6 +- tests/zfs-tests/include/tunables.cfg | 3 + tests/zfs-tests/tests/Makefile.am | 10 +- .../cli_root/zpool_get/zpool_get.cfg | 1 + .../tests/functional/raidz/raidz_002_pos.ksh | 34 +- .../tests/functional/raidz/raidz_003_pos.ksh | 41 - .../tests/functional/raidz/raidz_004_pos.ksh | 41 - .../functional/raidz/raidz_expand_001_pos.ksh | 215 ++ .../functional/raidz/raidz_expand_002_pos.ksh | 115 + .../functional/raidz/raidz_expand_003_neg.ksh | 102 + .../functional/raidz/raidz_expand_003_pos.ksh | 141 + .../functional/raidz/raidz_expand_004_pos.ksh | 121 + .../functional/raidz/raidz_expand_005_pos.ksh | 177 ++ .../functional/raidz/raidz_expand_006_neg.ksh | 78 + .../functional/raidz/raidz_expand_007_neg.ksh | 86 + 62 files changed, 5740 insertions(+), 876 deletions(-) create mode 100644 module/os/linux/zfs/vdev_label_os.c delete mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh delete mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_006_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_007_neg.ksh diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c index 8be08558b..730e6e1a0 100644 --- a/cmd/raidz_test/raidz_bench.c +++ b/cmd/raidz_test/raidz_bench.c @@ -84,10 +84,10 @@ run_gen_bench_impl(const char *impl) if (rto_opts.rto_expand) { rm_bench = vdev_raidz_map_alloc_expanded( - zio_bench.io_abd, - zio_bench.io_size, zio_bench.io_offset, + &zio_bench, rto_opts.rto_ashift, ncols+1, ncols, - fn+1, rto_opts.rto_expand_offset); + fn+1, rto_opts.rto_expand_offset, + 0, B_FALSE); } else { rm_bench = vdev_raidz_map_alloc(&zio_bench, BENCH_ASHIFT, ncols, fn+1); @@ -172,10 +172,10 @@ run_rec_bench_impl(const char *impl) if (rto_opts.rto_expand) { rm_bench = vdev_raidz_map_alloc_expanded( - zio_bench.io_abd, - zio_bench.io_size, zio_bench.io_offset, + &zio_bench, BENCH_ASHIFT, ncols+1, ncols, - PARITY_PQR, rto_opts.rto_expand_offset); + PARITY_PQR, + rto_opts.rto_expand_offset, 0, B_FALSE); } else { rm_bench = vdev_raidz_map_alloc(&zio_bench, BENCH_ASHIFT, ncols, PARITY_PQR); diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index 195026d3a..6a018ecf0 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -327,14 +327,12 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) if (opts->rto_expand) { opts->rm_golden = - vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd, - opts->zio_golden->io_size, opts->zio_golden->io_offset, + vdev_raidz_map_alloc_expanded(opts->zio_golden, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); - rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd, - zio_test->io_size, zio_test->io_offset, + parity, 
opts->rto_expand_offset, 0, B_FALSE); + rm_test = vdev_raidz_map_alloc_expanded(zio_test, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); + parity, opts->rto_expand_offset, 0, B_FALSE); } else { opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, opts->rto_ashift, total_ncols, parity); @@ -361,187 +359,6 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) return (err); } -/* - * If reflow is not in progress, reflow_offset should be UINT64_MAX. - * For each row, if the row is entirely before reflow_offset, it will - * come from the new location. Otherwise this row will come from the - * old location. Therefore, rows that straddle the reflow_offset will - * come from the old location. - * - * NOTE: Until raidz expansion is implemented this function is only - * needed by raidz_test.c to the multi-row raid_map_t functionality. - */ -raidz_map_t * -vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, - uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, - uint64_t nparity, uint64_t reflow_offset) -{ - /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = size >> ashift; - uint64_t q, r, bc, devidx, asize = 0, tot; - - /* - * "Quotient": The number of data sectors for this stripe on all but - * the "big column" child vdevs that also contain "remainder" data. - * AKA "full rows" - */ - q = s / (logical_cols - nparity); - - /* - * "Remainder": The number of partial stripe data sectors in this I/O. - * This will add a sector to some, but not all, child vdevs. - */ - r = s - q * (logical_cols - nparity); - - /* The number of "big columns" - those which contain remainder data. */ - bc = (r == 0 ? 0 : r + nparity); - - /* - * The total number of data and parity sectors associated with - * this I/O. - */ - tot = s + nparity * (q + (r == 0 ? 
0 : 1)); - - /* How many rows contain data (not skip) */ - uint64_t rows = howmany(tot, logical_cols); - int cols = MIN(tot, logical_cols); - - raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), - KM_SLEEP); - rm->rm_nrows = rows; - - for (uint64_t row = 0; row < rows; row++) { - raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, - rr_col[cols]), KM_SLEEP); - rm->rm_row[row] = rr; - - /* The starting RAIDZ (parent) vdev sector of the row. */ - uint64_t b = (offset >> ashift) + row * logical_cols; - - /* - * If we are in the middle of a reflow, and any part of this - * row has not been copied, then use the old location of - * this row. - */ - int row_phys_cols = physical_cols; - if (b + (logical_cols - nparity) > reflow_offset >> ashift) - row_phys_cols--; - - /* starting child of this row */ - uint64_t child_id = b % row_phys_cols; - /* The starting byte offset on each child vdev. */ - uint64_t child_offset = (b / row_phys_cols) << ashift; - - /* - * We set cols to the entire width of the block, even - * if this row is shorter. This is needed because parity - * generation (for Q and R) needs to know the entire width, - * because it treats the short row as though it was - * full-width (and the "phantom" sectors were zero-filled). - * - * Another approach to this would be to set cols shorter - * (to just the number of columns that we might do i/o to) - * and have another mechanism to tell the parity generation - * about the "entire width". Reconstruction (at least - * vdev_raidz_reconstruct_general()) would also need to - * know about the "entire width". 
- */ - rr->rr_cols = cols; - rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; - rr->rr_firstdatacol = nparity; - rr->rr_abd_empty = NULL; - rr->rr_nempty = 0; - - for (int c = 0; c < rr->rr_cols; c++, child_id++) { - if (child_id >= row_phys_cols) { - child_id -= row_phys_cols; - child_offset += 1ULL << ashift; - } - rr->rr_col[c].rc_devidx = child_id; - rr->rr_col[c].rc_offset = child_offset; - rr->rr_col[c].rc_orig_data = NULL; - rr->rr_col[c].rc_error = 0; - rr->rr_col[c].rc_tried = 0; - rr->rr_col[c].rc_skipped = 0; - rr->rr_col[c].rc_need_orig_restore = B_FALSE; - - uint64_t dc = c - rr->rr_firstdatacol; - if (c < rr->rr_firstdatacol) { - rr->rr_col[c].rc_size = 1ULL << ashift; - rr->rr_col[c].rc_abd = - abd_alloc_linear(rr->rr_col[c].rc_size, - B_TRUE); - } else if (row == rows - 1 && bc != 0 && c >= bc) { - /* - * Past the end, this for parity generation. - */ - rr->rr_col[c].rc_size = 0; - rr->rr_col[c].rc_abd = NULL; - } else { - /* - * "data column" (col excluding parity) - * Add an ASCII art diagram here - */ - uint64_t off; - - if (c < bc || r == 0) { - off = dc * rows + row; - } else { - off = r * rows + - (dc - r) * (rows - 1) + row; - } - rr->rr_col[c].rc_size = 1ULL << ashift; - rr->rr_col[c].rc_abd = abd_get_offset_struct( - &rr->rr_col[c].rc_abdstruct, - abd, off << ashift, 1 << ashift); - } - - asize += rr->rr_col[c].rc_size; - } - /* - * If all data stored spans all columns, there's a danger that - * parity will always be on the same device and, since parity - * isn't read during normal operation, that that device's I/O - * bandwidth won't be used effectively. We therefore switch - * the parity every 1MB. - * - * ...at least that was, ostensibly, the theory. As a practical - * matter unless we juggle the parity between all devices - * evenly, we won't see any benefit. 
Further, occasional writes - * that aren't a multiple of the LCM of the number of children - * and the minimum stripe width are sufficient to avoid pessimal - * behavior. Unfortunately, this decision created an implicit - * on-disk format requirement that we need to support for all - * eternity, but only for single-parity RAID-Z. - * - * If we intend to skip a sector in the zeroth column for - * padding we must make sure to note this swap. We will never - * intend to skip the first column since at least one data and - * one parity column must appear in each row. - */ - if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && - (offset & (1ULL << 20))) { - ASSERT(rr->rr_cols >= 2); - ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); - devidx = rr->rr_col[0].rc_devidx; - uint64_t o = rr->rr_col[0].rc_offset; - rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; - rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; - rr->rr_col[1].rc_devidx = devidx; - rr->rr_col[1].rc_offset = o; - } - - } - ASSERT3U(asize, ==, tot << ashift); - - /* init RAIDZ parity ops */ - rm->rm_ops = vdev_raidz_math_get_ops(); - - return (rm); -} - static raidz_map_t * init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) { @@ -561,10 +378,9 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) init_zio_abd(*zio); if (opts->rto_expand) { - rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd, - (*zio)->io_size, (*zio)->io_offset, + rm = vdev_raidz_map_alloc_expanded(*zio, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); + parity, opts->rto_expand_offset, 0, B_FALSE); } else { rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, total_ncols, parity); diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h index 163929def..f912e281f 100644 --- a/cmd/raidz_test/raidz_test.h +++ b/cmd/raidz_test/raidz_test.h @@ -119,7 +119,4 @@ void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); -struct raidz_map 
*vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t, - uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); - #endif /* RAIDZ_TEST_H */ diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 3c282f3fc..18221c4b9 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -4134,6 +4134,11 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer) } (void) printf("\tcheckpoint_txg = %llu\n", (u_longlong_t)ub->ub_checkpoint_txg); + + (void) printf("\traidz_reflow state=%u off=%llu\n", + (int)RRSS_GET_STATE(ub), + (u_longlong_t)RRSS_GET_OFFSET(ub)); + (void) printf("%s", footer ? footer : ""); } diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 5507f9d3f..9dd1d2109 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -6650,9 +6650,17 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); - if (ret == 0 && wait) - ret = zpool_wait(zhp, - replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER); + if (ret == 0 && wait) { + zpool_wait_activity_t activity = ZPOOL_WAIT_RESILVER; + char raidz_prefix[] = "raidz"; + if (replacing) { + activity = ZPOOL_WAIT_REPLACE; + } else if (strncmp(old_disk, + raidz_prefix, strlen(raidz_prefix)) == 0) { + activity = ZPOOL_WAIT_RAIDZ_EXPAND; + } + ret = zpool_wait(zhp, activity); + } nvlist_free(props); nvlist_free(nvroot); @@ -6678,17 +6686,21 @@ zpool_do_replace(int argc, char **argv) } /* - * zpool attach [-fsw] [-o property=value] + * zpool attach [-fsw] [-o property=value] | * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. - * -w Wait for resilvering to complete before returning + * -w Wait for resilvering (mirror) or expansion (raidz) to complete + * before returning. * - * Attach to the mirror containing . If is not - * part of a mirror, then will be transformed into a mirror of - * and . 
In either case, will begin life - * with a DTL of [0, now], and will immediately begin to resilver itself. + * Attach to a or , where the vdev can be of type + * mirror or raidz. If is not part of a mirror, then will + * be transformed into a mirror of and . When a mirror + * is involved, will begin life with a DTL of [0, now], and will + * immediately begin to resilver itself. For the raidz case, a expansion will + * commence and reflow the raidz data across all the disks including the + * . */ int zpool_do_attach(int argc, char **argv) @@ -8195,6 +8207,97 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) } } +/* + * Print out detailed raidz expansion status. + */ +static void +print_raidz_expand_status(zpool_handle_t *zhp, pool_raidz_expand_stat_t *pres) +{ + char copied_buf[7]; + + if (pres == NULL || pres->pres_state == DSS_NONE) + return; + + /* + * Determine name of vdev. + */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + nvlist_t **child; + uint_t children; + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + assert(pres->pres_expanding_vdev < children); + + printf_color(ANSI_BOLD, gettext("expand: ")); + + time_t start = pres->pres_start_time; + time_t end = pres->pres_end_time; + char *vname = + zpool_vdev_name(g_zfs, zhp, child[pres->pres_expanding_vdev], 0); + zfs_nicenum(pres->pres_reflowed, copied_buf, sizeof (copied_buf)); + + /* + * Expansion is finished or canceled. 
+ */ + if (pres->pres_state == DSS_FINISHED) { + char time_buf[32]; + secs_to_dhms(end - start, time_buf); + + (void) printf(gettext("expanded %s-%u copied %s in %s, " + "on %s"), vname, (int)pres->pres_expanding_vdev, + copied_buf, time_buf, ctime((time_t *)&end)); + } else { + char examined_buf[7], total_buf[7], rate_buf[7]; + uint64_t copied, total, elapsed, secs_left; + double fraction_done; + uint_t rate; + + assert(pres->pres_state == DSS_SCANNING); + + /* + * Expansion is in progress. + */ + (void) printf(gettext( + "expansion of %s-%u in progress since %s"), + vname, (int)pres->pres_expanding_vdev, ctime(&start)); + + copied = pres->pres_reflowed > 0 ? pres->pres_reflowed : 1; + total = pres->pres_to_reflow; + fraction_done = (double)copied / total; + + /* elapsed time for this pass */ + elapsed = time(NULL) - pres->pres_start_time; + elapsed = elapsed > 0 ? elapsed : 1; + rate = copied / elapsed; + rate = rate > 0 ? rate : 1; + secs_left = (total - copied) / rate; + + zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); + zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); + + /* + * do not print estimated time if hours_left is more than + * 30 days + */ + (void) printf(gettext("\t%s / %s copied at %s/s, %.2f%% done"), + examined_buf, total_buf, rate_buf, 100 * fraction_done); + if (pres->pres_waiting_for_resilver) { + (void) printf(gettext(", paused for resilver or " + "clear\n")); + } else if (secs_left < (30 * 24 * 3600)) { + char time_buf[32]; + secs_to_dhms(secs_left, time_buf); + (void) printf(gettext(", %s to go\n"), time_buf); + } else { + (void) printf(gettext( + ", (copy is slow, no estimated time)\n")); + } + } + free(vname); +} static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { @@ -8772,19 +8875,24 @@ status_callback(zpool_handle_t *zhp, void *data) uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - pool_checkpoint_stat_t *pcs = NULL; - 
pool_removal_stat_t *prs = NULL; print_scan_status(zhp, nvroot); + pool_removal_stat_t *prs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_removal_status(zhp, prs); + pool_checkpoint_stat_t *pcs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); + pool_raidz_expand_stat_t *pres = NULL; + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); + print_raidz_expand_status(zhp, pres); + cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) @@ -10738,8 +10846,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; + pool_raidz_expand_stat_t *pres = NULL; const char *const headers[] = {"DISCARD", "FREE", "INITIALIZE", - "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM"}; + "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ @@ -10798,6 +10907,13 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) vdev_activity_top_remaining(nvroot); } + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); + if (pres != NULL && pres->pres_state == DSS_SCANNING) { + int64_t rem = pres->pres_to_reflow - pres->pres_reflowed; + bytes_rem[ZPOOL_WAIT_RAIDZ_EXPAND] = rem; + } + bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = @@ -10827,11 +10943,12 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) if (!wd->wd_enabled[i]) continue; - if (wd->wd_exact) + if (wd->wd_exact) { (void) snprintf(buf, sizeof (buf), "%" PRIi64, bytes_rem[i]); - else + } else { 
zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); + } if (wd->wd_scripted) (void) printf(i == 0 ? "%s" : "\t%s", buf); @@ -10937,7 +11054,8 @@ zpool_do_wait(int argc, char **argv) for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_opts[] = { "discard", "free", "initialize", "replace", - "remove", "resilver", "scrub", "trim" }; + "remove", "resilver", "scrub", "trim", + "raidz_expand" }; for (i = 0; i < ARRAY_SIZE(col_opts); ++i) if (strcmp(tok, col_opts[i]) == 0) { diff --git a/cmd/ztest.c b/cmd/ztest.c index 8cfbdfe1c..1d414a9f6 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -151,6 +151,7 @@ typedef struct ztest_shared_hdr { uint64_t zh_stats_count; uint64_t zh_ds_size; uint64_t zh_ds_count; + uint64_t zh_scratch_state_size; } ztest_shared_hdr_t; static ztest_shared_hdr_t *ztest_shared_hdr; @@ -161,6 +162,16 @@ enum ztest_class_state { ZTEST_VDEV_CLASS_RND }; +/* Dedicated RAIDZ Expansion test states */ +typedef enum { + RAIDZ_EXPAND_NONE, /* Default is none, must opt-in */ + RAIDZ_EXPAND_REQUESTED, /* The '-X' option was used */ + RAIDZ_EXPAND_STARTED, /* Testing has commenced */ + RAIDZ_EXPAND_KILLED, /* Reached the proccess kill */ + RAIDZ_EXPAND_CHECKED, /* Pool scrub verification done */ +} raidz_expand_test_state_t; + + #define ZO_GVARS_MAX_ARGLEN ((size_t)64) #define ZO_GVARS_MAX_COUNT ((size_t)10) @@ -174,6 +185,7 @@ typedef struct ztest_shared_opts { size_t zo_vdev_size; int zo_ashift; int zo_mirrors; + int zo_raid_do_expand; int zo_raid_children; int zo_raid_parity; char zo_raid_type[8]; @@ -188,6 +200,7 @@ typedef struct ztest_shared_opts { uint64_t zo_time; uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; + raidz_expand_test_state_t zo_raidz_expand_test; int zo_mmp_test; int zo_special_vdevs; int zo_dump_dbgmsg; @@ -249,6 +262,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, .zo_gvars_count = 0, + 
.zo_raidz_expand_test = RAIDZ_EXPAND_NONE, }; extern uint64_t metaslab_force_ganging; @@ -261,6 +275,8 @@ extern uint_t dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; extern unsigned long zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; +extern uint64_t raidz_expand_max_reflow_bytes; +extern uint_t raidz_expand_pause_point; static ztest_shared_opts_t *ztest_shared_opts; @@ -274,6 +290,12 @@ typedef struct ztest_shared_ds { static ztest_shared_ds_t *ztest_shared_ds; #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) +typedef struct ztest_scratch_state { + uint64_t zs_raidz_scratch_verify_pause; +} ztest_shared_scratch_state_t; + +static ztest_shared_scratch_state_t *ztest_scratch_state; + #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) @@ -311,9 +333,9 @@ typedef struct bufwad { * still need to map from object ID to rangelock_t. 
*/ typedef enum { - RL_READER, - RL_WRITER, - RL_APPEND + ZTRL_READER, + ZTRL_WRITER, + ZTRL_APPEND } rl_type_t; typedef struct rll { @@ -408,6 +430,7 @@ ztest_func_t ztest_mmp_enable_disable; ztest_func_t ztest_scrub; ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; +ztest_func_t ztest_vdev_raidz_attach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_class_add; @@ -465,6 +488,7 @@ static ztest_info_t ztest_info[] = { ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), + ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes), ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), @@ -745,7 +769,7 @@ static ztest_option_t option_table[] = { DEFAULT_RAID_CHILDREN, NULL}, { 'R', "raid-parity", "INTEGER", "Raid parity", DEFAULT_RAID_PARITY, NULL}, - { 'K', "raid-kind", "raidz|draid|random", "Raid kind", + { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind", NO_DEFAULT, "random"}, { 'D', "draid-data", "INTEGER", "Number of draid data drives", DEFAULT_DRAID_DATA, NULL}, @@ -781,6 +805,9 @@ static ztest_option_t option_table[] = { NO_DEFAULT, NULL}, { 'C', "vdev-class-state", "on|off|random", "vdev class state", NO_DEFAULT, "random"}, + { 'X', "raidz-expansion", NULL, + "Perform a dedicated raidz expansion test", + NO_DEFAULT, NULL}, { 'o', "option", "\"OPTION=INTEGER\"", "Set global variable to an unsigned 32-bit integer value", NO_DEFAULT, NULL}, @@ -853,7 +880,7 @@ usage(boolean_t requested) option_table[i].short_opt, option_table[i].long_opt); } - (void) fprintf(fp, " %-40s%s", option, + (void) fprintf(fp, " %-43s%s", option, option_table[i].comment); if (option_table[i].long_opt_param != NULL) { @@ -1027,6 +1054,9 @@ process_options(int argc, char **argv) 
case 'V': zo->zo_verbose++; break; + case 'X': + zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED; + break; case 'E': zo->zo_init = 0; break; @@ -1078,9 +1108,28 @@ process_options(int argc, char **argv) fini_options(); - /* When raid choice is 'random' add a draid pool 50% of the time */ + /* Force compatible options for raidz expansion run */ + if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) { + zo->zo_mmp_test = 0; + zo->zo_mirrors = 0; + zo->zo_vdevs = 1; + zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2; + zo->zo_raid_do_expand = B_FALSE; + raid_kind = "raidz"; + } + if (strcmp(raid_kind, "random") == 0) { - raid_kind = (ztest_random(2) == 0) ? "draid" : "raidz"; + switch (ztest_random(3)) { + case 0: + raid_kind = "raidz"; + break; + case 1: + raid_kind = "eraidz"; + break; + case 2: + raid_kind = "draid"; + break; + } if (ztest_opts.zo_verbose >= 3) (void) printf("choosing RAID type '%s'\n", raid_kind); @@ -1119,6 +1168,18 @@ process_options(int argc, char **argv) (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, sizeof (zo->zo_raid_type)); + } else if (strcmp(raid_kind, "eraidz") == 0) { + /* using eraidz (expandable raidz) */ + zo->zo_raid_do_expand = B_TRUE; + + /* tests expect top-level to be raidz */ + zo->zo_mirrors = 0; + zo->zo_vdevs = 1; + + /* Make sure parity is less than data columns */ + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + } else /* using raidz */ { ASSERT0(strcmp(raid_kind, "raidz")); @@ -1166,9 +1227,29 @@ ztest_kill(ztest_shared_t *zs) * Before we kill ourselves, make sure that the config is updated. * See comment above spa_write_cachefile(). 
*/ - mutex_enter(&spa_namespace_lock); - spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); - mutex_exit(&spa_namespace_lock); + if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) { + if (mutex_tryenter(&spa_namespace_lock)) { + spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, + B_FALSE); + mutex_exit(&spa_namespace_lock); + + ztest_scratch_state->zs_raidz_scratch_verify_pause = + raidz_expand_pause_point; + } else { + /* + * Do not verify scratch object in case if + * spa_namespace_lock cannot be acquired, + * it can cause deadlock in spa_config_update(). + */ + raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; + + return; + } + } else { + mutex_enter(&spa_namespace_lock); + spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); + mutex_exit(&spa_namespace_lock); + } (void) raise(SIGKILL); } @@ -1615,7 +1696,7 @@ ztest_rll_lock(rll_t *rll, rl_type_t type) { mutex_enter(&rll->rll_lock); - if (type == RL_READER) { + if (type == ZTRL_READER) { while (rll->rll_writer != NULL) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_readers++; @@ -2071,7 +2152,7 @@ ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); ASSERT3U(object, !=, 0); - ztest_object_lock(zd, object, RL_WRITER); + ztest_object_lock(zd, object, ZTRL_WRITER); VERIFY0(dmu_object_info(os, object, &doi)); @@ -2141,8 +2222,8 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) if (bt->bt_magic != BT_MAGIC) bt = NULL; - ztest_object_lock(zd, lr->lr_foid, RL_READER); - rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); @@ -2245,9 +2326,9 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - ztest_object_lock(zd, lr->lr_foid, RL_READER); + 
ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, - RL_WRITER); + ZTRL_WRITER); tx = dmu_tx_create(os); @@ -2287,7 +2368,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - ztest_object_lock(zd, lr->lr_foid, RL_WRITER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); @@ -2414,7 +2495,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, ASSERT3P(lwb, !=, NULL); ASSERT3U(size, !=, 0); - ztest_object_lock(zd, object, RL_READER); + ztest_object_lock(zd, object, ZTRL_READER); error = dmu_bonus_hold(os, object, FTAG, &db); if (error) { ztest_object_unlock(zd, object); @@ -2439,7 +2520,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, if (buf != NULL) { /* immediate write */ zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, - object, offset, size, RL_READER); + object, offset, size, ZTRL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); @@ -2455,7 +2536,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, } zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, - object, offset, size, RL_READER); + object, offset, size, ZTRL_READER); error = dmu_buf_hold_noread(os, object, offset, zgd, &db); @@ -2531,7 +2612,7 @@ ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) ASSERT3U(od->od_object, !=, 0); ASSERT0(missing); /* there should be no gaps */ - ztest_object_lock(zd, od->od_object, RL_READER); + ztest_object_lock(zd, od->od_object, ZTRL_READER); VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, FTAG, &db)); dmu_object_info_from_db(db, &doi); @@ -2634,7 +2715,7 @@ ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) static int ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, - void *data) + const void *data) { lr_write_t *lr; int 
error; @@ -2704,8 +2785,8 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) txg_wait_synced(dmu_objset_pool(os), 0); - ztest_object_lock(zd, object, RL_READER); - rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); + ztest_object_lock(zd, object, ZTRL_READER); + rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); tx = dmu_tx_create(os); @@ -3033,13 +3114,32 @@ ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) spa_config_exit(spa, SCL_CONFIG, FTAG); } +static int +ztest_get_raidz_children(spa_t *spa) +{ + (void) spa; + vdev_t *raidvd; + + ASSERT(MUTEX_HELD(&ztest_vdev_lock)); + + if (ztest_opts.zo_raid_do_expand) { + raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; + + ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); + + return (raidvd->vdev_children); + } + + return (ztest_opts.zo_raid_children); +} + void ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa; uint64_t initial_version = SPA_VERSION_INITIAL; - uint64_t version, newversion; + uint64_t raidz_children, version, newversion; nvlist_t *nvroot, *props; char *name; @@ -3058,8 +3158,10 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) */ (void) spa_destroy(name); + raidz_children = ztest_get_raidz_children(ztest_spa); + nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); + NULL, raidz_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the @@ -3125,6 +3227,7 @@ ztest_spa_checkpoint(spa_t *spa) case ZFS_ERR_DEVRM_IN_PROGRESS: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_CHECKPOINT_EXISTS: + case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: break; case ENOSPC: ztest_record_enospc(FTAG); @@ -3205,6 +3308,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; uint64_t leaves; uint64_t guid; + uint64_t raidz_children; + nvlist_t *nvroot; int error; @@ -3212,8 +3317,8 @@ 
ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) return; mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * - ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -3267,7 +3372,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? - "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, + "log" : NULL, raidz_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); @@ -3295,6 +3400,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; uint64_t leaves; nvlist_t *nvroot; + uint64_t raidz_children; const char *class = (ztest_random(2) == 0) ? VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; int error; @@ -3322,15 +3428,15 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) return; } - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * - ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); + class, raidz_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); fnvlist_free(nvroot); @@ -3592,6 +3698,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) uint64_t ashift = ztest_get_ashift(); uint64_t oldguid, pguid; uint64_t oldsize, newsize; + uint64_t raidz_children; char *oldpath, *newpath; int replacing; int oldvd_has_siblings = B_FALSE; @@ -3608,7 +3715,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 
mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors, 1) * raidz_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -3623,6 +3731,15 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) goto out; } + /* + * RAIDZ leaf VDEV mirrors are not currently supported while a + * RAIDZ expansion is in progress. + */ + if (ztest_opts.zo_raid_do_expand) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + /* * Decide whether to do an attach or a replace. */ @@ -3647,7 +3764,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (zs->zs_mirrors >= 1) { ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); - oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; + oldvd = oldvd->vdev_child[leaf / raidz_children]; } /* pick a child out of the raidz group */ @@ -3656,8 +3773,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); else ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); - ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); - oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; + oldvd = oldvd->vdev_child[leaf % raidz_children]; } /* @@ -3825,6 +3941,226 @@ out: umem_free(newpath, MAXPATHLEN); } +static void +raidz_scratch_verify(void) +{ + spa_t *spa; + uint64_t write_size, logical_size, offset; + raidz_reflow_scratch_state_t state; + vdev_raidz_expand_t *vre; + vdev_t *raidvd; + + ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); + + if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) + return; + + kernel_init(SPA_MODE_READ); + + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(ztest_opts.zo_pool); + ASSERT(spa); + spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; + mutex_exit(&spa_namespace_lock); + + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + + 
ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); + + vre = spa->spa_raidz_expand; + if (vre == NULL) + goto out; + + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + offset = RRSS_GET_OFFSET(&spa->spa_uberblock); + state = RRSS_GET_STATE(&spa->spa_uberblock); + write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift); + logical_size = write_size * raidvd->vdev_children; + + switch (state) { + /* + * Initial state of reflow process. RAIDZ expansion was + * requested by user, but scratch object was not created. + */ + case RRSS_SCRATCH_NOT_IN_USE: + ASSERT3U(offset, ==, 0); + break; + + /* + * Scratch object was synced and stored in boot area. + */ + case RRSS_SCRATCH_VALID: + + /* + * Scratch object was synced back to raidz start offset, + * raidz is ready for sector by sector reflow process. + */ + case RRSS_SCRATCH_INVALID_SYNCED: + + /* + * Scratch object was synced back to raidz start offset + * on zpool importing, raidz is ready for sector by sector + * reflow process. + */ + case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT: + ASSERT3U(offset, ==, logical_size); + break; + + /* + * Sector by sector reflow process started. + */ + case RRSS_SCRATCH_INVALID_SYNCED_REFLOW: + ASSERT3U(offset, >=, logical_size); + break; + } + +out: + spa_config_exit(spa, SCL_ALL, FTAG); + + mutex_exit(&ztest_vdev_lock); + + ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; + + spa_close(spa, FTAG); + kernel_fini(); +} + +static void +ztest_scratch_thread(void *arg) +{ + (void) arg; + + /* wait up to 10 seconds */ + for (int t = 100; t > 0; t -= 1) { + if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) + thread_exit(); + + (void) poll(NULL, 0, 100); + } + + /* killed when the scratch area progress reached a certain point */ + ztest_kill(ztest_shared); +} + +/* + * Verify that we can attach raidz device. 
+ */ +void +ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); + kthread_t *scratch_thread = NULL; + vdev_t *newvd, *pvd; + nvlist_t *root; + char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + int error, expected_error = 0; + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); + + /* Only allow attach when raid-kind = 'eraidz' */ + if (!ztest_opts.zo_raid_do_expand) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + if (ztest_opts.zo_mmp_test) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + if (ztest_device_removal_active) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + pvd = vdev_lookup_top(spa, 0); + + ASSERT(pvd->vdev_ops == &vdev_raidz_ops); + + /* + * Get size of a child of the raidz group, + * make sure device is a bit bigger + */ + newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; + newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); + + /* + * Get next attached leaf id + */ + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; + zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; + + if (spa->spa_raidz_expand) + expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; + + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Path to vdev to be attached + */ + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); + + /* + * Build the nvlist describing newpath. 
+ */ + root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, + 0, 0, 1); + + /* + * 50% of the time, set raidz_expand_pause_point to cause + * raidz_reflow_scratch_sync() to pause at a certain point and + * then kill the test after 10 seconds so raidz_scratch_verify() + * can confirm consistency when the pool is imported. + */ + if (ztest_random(2) == 0 && expected_error == 0) { + raidz_expand_pause_point = + ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1; + scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, + ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + } + + error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); + + nvlist_free(root); + + if (error == EOVERFLOW || error == ENXIO || + error == ZFS_ERR_CHECKPOINT_EXISTS || + error == ZFS_ERR_DISCARDING_CHECKPOINT) + expected_error = error; + + if (error != 0 && error != expected_error) { + fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", + newpath, newsize, error, expected_error); + } + + if (raidz_expand_pause_point) { + if (error != 0) { + /* + * Do not verify scratch object in case of error + * returned by vdev attaching. + */ + raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; + } + + VERIFY0(thread_join(scratch_thread)); + } +out: + mutex_exit(&ztest_vdev_lock); + + umem_free(newpath, MAXPATHLEN); +} + void ztest_device_removal(ztest_ds_t *zd, uint64_t id) { @@ -4031,6 +4367,18 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) return; } + /* + * If we are under raidz expansion, the test can fail because the + * metaslabs count will not increase immediately after the vdev is + * expanded. It will happen only after raidz expansion completion. 
+ */ + if (spa->spa_raidz_expand) { + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + top = ztest_random_vdev_top(spa, B_TRUE); tvd = spa->spa_root_vdev->vdev_child[top]; @@ -5815,7 +6163,7 @@ ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) dmu_object_info_t doi; dmu_buf_t *db; - ztest_object_lock(zd, obj, RL_READER); + ztest_object_lock(zd, obj, ZTRL_READER); if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { ztest_object_unlock(zd, obj); continue; @@ -6038,6 +6386,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) uint64_t leaves; uint64_t bad = 0x1990c0ffeedecadeull; uint64_t top, leaf; + uint64_t raidz_children; char *path0; char *pathrand; size_t fsize; @@ -6048,6 +6397,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) vdev_t *vd0 = NULL; uint64_t guid0 = 0; boolean_t islog = B_FALSE; + boolean_t injected = B_FALSE; path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); @@ -6060,15 +6410,23 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * strategy for damaging blocks does not take in to account evacuated * blocks which may have already been damaged. */ - if (ztest_device_removal_active) { - mutex_exit(&ztest_vdev_lock); + if (ztest_device_removal_active) + goto out; + + /* + * The fault injection strategy for damaging blocks cannot be used + * if raidz expansion is in progress. The leaves value + * (attached raidz children) is variable and strategy for damaging + * blocks will corrupt same data blocks on different child vdevs + * because of the reflow process. 
+ */ + if (spa->spa_raidz_expand != NULL) goto out; - } maxfaults = MAXFAULTS(zs); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; + raidz_children = ztest_get_raidz_children(spa); + leaves = MAX(zs->zs_mirrors, 1) * raidz_children; mirror_save = zs->zs_mirrors; - mutex_exit(&ztest_vdev_lock); ASSERT3U(leaves, >=, 1); @@ -6209,13 +6567,9 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * call vdev_[on|off]line without holding locks * to force unpredictable failures but the side * effects of vdev_[on|off]line prevent us from - * doing so. We grab the ztest_vdev_lock here to - * prevent a race between injection testing and - * aux_vdev removal. + * doing so. */ - mutex_enter(&ztest_vdev_lock); (void) vdev_online(spa, guid0, 0, NULL); - mutex_exit(&ztest_vdev_lock); } } @@ -6289,9 +6643,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; - mutex_enter(&ztest_vdev_lock); if (mirror_save != zs->zs_mirrors) { - mutex_exit(&ztest_vdev_lock); (void) close(fd); goto out; } @@ -6301,15 +6653,25 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) "can't inject bad word at 0x%"PRIx64" in %s", offset, pathrand); - mutex_exit(&ztest_vdev_lock); - if (ztest_opts.zo_verbose >= 7) (void) printf("injected bad word into %s," " offset 0x%"PRIx64"\n", pathrand, offset); + + injected = B_TRUE; } (void) close(fd); out: + mutex_exit(&ztest_vdev_lock); + + if (injected && ztest_opts.zo_raid_do_expand) { + int error = spa_scan(spa, POOL_SCAN_SCRUB); + if (error == 0) { + while (dsl_scan_scrubbing(spa_get_dsl(spa))) + txg_wait_synced(spa_get_dsl(spa), 0); + } + } + umem_free(path0, MAXPATHLEN); umem_free(pathrand, MAXPATHLEN); } @@ -7194,6 +7556,75 @@ ztest_execute(int test, ztest_info_t *zi, uint64_t id) (double)functime / NANOSEC, zi->zi_funcname); } +typedef struct ztest_raidz_expand_io { + uint64_t rzx_id; + uint64_t rzx_amount; + uint64_t rzx_bufsize; + const void *rzx_buffer; + uint64_t rzx_alloc_max; + 
spa_t *rzx_spa; +} ztest_expand_io_t; + +#undef OD_ARRAY_SIZE +#define OD_ARRAY_SIZE 10 + +/* + * Write a request amount of data to some dataset objects. + * There will be ztest_opts.zo_threads count of these running in parallel. + */ +static __attribute__((noreturn)) void +ztest_rzx_thread(void *arg) +{ + ztest_expand_io_t *info = (ztest_expand_io_t *)arg; + ztest_od_t *od; + int batchsize; + int od_size; + ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets]; + spa_t *spa = info->rzx_spa; + + od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; + od = umem_alloc(od_size, UMEM_NOFAIL); + batchsize = OD_ARRAY_SIZE; + + /* Create objects to write to */ + for (int b = 0; b < batchsize; b++) { + ztest_od_init(od + b, info->rzx_id, FTAG, b, + DMU_OT_UINT64_OTHER, 0, 0, 0); + } + if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) { + umem_free(od, od_size); + thread_exit(); + } + + for (uint64_t offset = 0, written = 0; written < info->rzx_amount; + offset += info->rzx_bufsize) { + /* write to 10 objects */ + for (int i = 0; i < batchsize && written < info->rzx_amount; + i++) { + (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); + ztest_write(zd, od[i].od_object, offset, + info->rzx_bufsize, info->rzx_buffer); + (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); + written += info->rzx_bufsize; + } + txg_wait_synced(spa_get_dsl(spa), 0); + /* due to inflation, we'll typically bail here */ + if (metaslab_class_get_alloc(spa_normal_class(spa)) > + info->rzx_alloc_max) { + break; + } + } + + /* Remove a few objects to leave some holes in allocation space */ + mutex_enter(&zd->zd_dirobj_lock); + (void) ztest_remove(zd, od, 2); + mutex_exit(&zd->zd_dirobj_lock); + + umem_free(od, od_size); + + thread_exit(); +} + static __attribute__((noreturn)) void ztest_thread(void *arg) { @@ -7209,8 +7640,10 @@ ztest_thread(void *arg) /* * See if it's time to force a crash. 
*/ - if (now > zs->zs_thread_kill) + if (now > zs->zs_thread_kill && + raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { ztest_kill(zs); + } /* * If we're getting ENOSPC with some regularity, stop. @@ -7400,9 +7833,14 @@ ztest_freeze(void) spa_t *spa; int numloops = 0; + /* freeze not supported during RAIDZ expansion */ + if (ztest_opts.zo_raid_do_expand) + return; + if (ztest_opts.zo_verbose >= 3) (void) printf("testing spa_freeze()...\n"); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); VERIFY0(ztest_dataset_open(0)); @@ -7470,6 +7908,7 @@ ztest_freeze(void) /* * Open and close the pool and dataset to induce log replay. */ + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); @@ -7519,6 +7958,7 @@ ztest_import(ztest_shared_t *zs) mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); ztest_import_impl(); @@ -7543,144 +7983,249 @@ ztest_import(ztest_shared_t *zs) } /* - * Kick off threads to run tests on all datasets in parallel. + * After the expansion was killed, check that the pool is healthy */ static void -ztest_run(ztest_shared_t *zs) +ztest_raidz_expand_check(spa_t *spa) { - spa_t *spa; - objset_t *os; - kthread_t *resume_thread, *deadman_thread; + ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED); + /* + * Set pool check done flag, main program will run a zdb check + * of the pool when we exit. 
+ */ + ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; + + /* Wait for reflow to finish */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("\nwaiting for reflow to finish ...\n"); + } + pool_raidz_expand_stat_t rzx_stats; + pool_raidz_expand_stat_t *pres = &rzx_stats; + do { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 500); /* wait 1/2 second */ + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } while (pres->pres_state != DSS_FINISHED && + pres->pres_reflowed < pres->pres_to_reflow); + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("verifying an interrupted raidz " + "expansion using a pool scrub ...\n"); + } + /* Will fail here if there is non-recoverable corruption detected */ + VERIFY0(ztest_scrub_impl(spa)); + if (ztest_opts.zo_verbose >= 1) { + (void) printf("raidz expansion scrub check complete\n"); + } +} + +/* + * Start a raidz expansion test. We run some I/O on the pool for a while + * to get some data in the pool. Then we grow the raidz and + * kill the test at the requested offset into the reflow, verifying that + * doing such does not lead to pool corruption. 
+ */ +static void +ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) +{ + nvlist_t *root; + pool_raidz_expand_stat_t rzx_stats; + pool_raidz_expand_stat_t *pres = &rzx_stats; kthread_t **run_threads; - uint64_t object; - int error; - int t, d; + vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0]; + int total_disks = rzvd->vdev_children; + int data_disks = total_disks - vdev_get_nparity(rzvd); + uint64_t alloc_goal; + uint64_t csize; + int error, t; + int threads = ztest_opts.zo_threads; + ztest_expand_io_t *thread_args; - ztest_exiting = B_FALSE; + ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE); + ASSERT3U(rzvd->vdev_ops, ==, &vdev_raidz_ops); + ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED; + /* Setup a 1 MiB buffer of random data */ + uint64_t bufsize = 1024 * 1024; + void *buffer = umem_alloc(bufsize, UMEM_NOFAIL); + + if (read(ztest_fd_rand, buffer, bufsize) != bufsize) { + fatal(B_TRUE, "short read from /dev/urandom"); + } /* - * Initialize parent/child shared state. + * Put some data in the pool and then attach a vdev to initiate + * reflow. 
*/ - mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); - VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); - - zs->zs_thread_start = gethrtime(); - zs->zs_thread_stop = - zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; - zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); - zs->zs_thread_kill = zs->zs_thread_stop; - if (ztest_random(100) < ztest_opts.zo_killrate) { - zs->zs_thread_kill -= - ztest_random(ztest_opts.zo_passtime * NANOSEC); + run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL); + thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t), + UMEM_NOFAIL); + /* Aim for roughly 25% of allocatable space up to 1GB */ + alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks; + alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024); + if (ztest_opts.zo_verbose >= 1) { + (void) printf("adding data to pool '%s', goal %llu bytes\n", + ztest_opts.zo_pool, (u_longlong_t)alloc_goal); } - mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); - - list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), - offsetof(ztest_cb_data_t, zcd_node)); - /* - * Open our pool. It may need to be imported first depending on - * what tests were running when the previous pass was terminated. + * Kick off all the I/O generators that run in parallel. 
*/ - kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); - error = spa_open(ztest_opts.zo_pool, &spa, FTAG); - if (error) { - VERIFY3S(error, ==, ENOENT); - ztest_import_impl(); - VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); - zs->zs_metaslab_sz = - 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; - } - - metaslab_preload_limit = ztest_random(20) + 1; - ztest_spa = spa; - - VERIFY0(vdev_raidz_impl_set("cycle")); - - dmu_objset_stats_t dds; - VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, - DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - dmu_objset_fast_stat(os, &dds); - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - dmu_objset_disown(os, B_TRUE, FTAG); - - /* - * Create a thread to periodically resume suspended I/O. - */ - resume_thread = thread_create(NULL, 0, ztest_resume_thread, - spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); - - /* - * Create a deadman thread and set to panic if we hang. - */ - deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, - zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); - - spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; - - /* - * Verify that we can safely inquire about any object, - * whether it's allocated or not. To make it interesting, - * we probe a 5-wide window around each power of two. - * This hits all edge cases, including zero and the max. 
- */ - for (t = 0; t < 64; t++) { - for (d = -5; d <= 5; d++) { - error = dmu_object_info(spa->spa_meta_objset, - (1ULL << t) + d, NULL); - ASSERT(error == 0 || error == ENOENT || - error == EINVAL); + for (t = 0; t < threads; t++) { + if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { + umem_free(run_threads, threads * sizeof (kthread_t *)); + umem_free(buffer, bufsize); + return; } + thread_args[t].rzx_id = t; + thread_args[t].rzx_amount = alloc_goal / threads; + thread_args[t].rzx_bufsize = bufsize; + thread_args[t].rzx_buffer = buffer; + thread_args[t].rzx_alloc_max = alloc_goal; + thread_args[t].rzx_spa = spa; + run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread, + &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE, + defclsyspri); } /* - * If we got any ENOSPC errors on the previous run, destroy something. + * Wait for all of the writers to complete. */ - if (zs->zs_enospc_count != 0) { - int d = ztest_random(ztest_opts.zo_datasets); - ztest_dataset_destroy(d); - } - zs->zs_enospc_count = 0; + for (t = 0; t < threads; t++) + VERIFY0(thread_join(run_threads[t])); /* - * If we were in the middle of ztest_device_removal() and were killed - * we need to ensure the removal and scrub complete before running - * any tests that check ztest_device_removal_active. The removal will - * be restarted automatically when the spa is opened, but we need to - * initiate the scrub manually if it is not already in progress. Note - * that we always run the scrub whenever an indirect vdev exists - * because we have no way of knowing for sure if ztest_device_removal() - * fully completed its scrub before the pool was reimported. + * Close all datasets. This must be done after all the threads + * are joined so we can be sure none of the datasets are in-use + * by any of the threads. 
*/ - if (spa->spa_removing_phys.sr_state == DSS_SCANNING || - spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { - while (spa->spa_removing_phys.sr_state == DSS_SCANNING) - txg_wait_synced(spa_get_dsl(spa), 0); - - error = ztest_scrub_impl(spa); - if (error == EBUSY) - error = 0; - ASSERT0(error); + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets) + ztest_dataset_close(t); } + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(buffer, bufsize); + umem_free(run_threads, threads * sizeof (kthread_t *)); + umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); + + /* Set our reflow target to 25%, 50% or 75% of allocated size */ + uint_t multiple = ztest_random(3) + 1; + uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; + raidz_expand_max_reflow_bytes = reflow_max; + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("running raidz expansion test, killing when " + "reflow reaches %llu bytes (%u/4 of allocated space)\n", + (u_longlong_t)reflow_max, multiple); + } + + /* XXX - do we want some I/O load during the reflow? */ + + /* + * Use a disk size that is larger than existing ones + */ + cvd = rzvd->vdev_child[0]; + csize = vdev_get_min_asize(cvd); + csize += csize / 10; + /* + * Path to vdev to be attached + */ + char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); + /* + * Build the nvlist describing newpath. 
+ */ + root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), + NULL, 0, 0, 1); + /* + * Expand the raidz vdev by attaching the new disk + */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("expanding raidz: %d wide to %d wide with '%s'\n", + (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1, + newpath); + } + error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE); + nvlist_free(root); + if (error != 0) { + fatal(0, "raidz expand: attach (%s %llu) returned %d", + newpath, (long long)csize, error); + } + + /* + * Wait for reflow to begin + */ + while (spa->spa_raidz_expand == NULL) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + } + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + while (pres->pres_state != DSS_SCANNING) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + ASSERT3U(pres->pres_state, ==, DSS_SCANNING); + ASSERT3U(pres->pres_to_reflow, !=, 0); + /* + * Set so when we are killed we go to raidz checking rather than + * restarting test. 
+ */ + ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; + if (ztest_opts.zo_verbose >= 1) { + (void) printf("raidz expansion reflow started, waiting for " + "%llu bytes to be copied\n", (u_longlong_t)reflow_max); + } + + /* + * Wait for reflow maximum to be reached and then kill the test + */ + while (pres->pres_reflowed < reflow_max) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + /* Reset the reflow pause before killing */ + raidz_expand_max_reflow_bytes = 0; + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("killing raidz expansion test after reflow " + "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed); + } + + /* + * Kill ourself to simulate a panic during a reflow. Our parent will + * restart the test and the changed flag value will drive the test + * through the scrub/check code to verify the pool is not corrupted. + */ + ztest_kill(zs); +} + +static void +ztest_generic_run(ztest_shared_t *zs, spa_t *spa) +{ + kthread_t **run_threads; + int t; + run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), UMEM_NOFAIL); - if (ztest_opts.zo_verbose >= 4) - (void) printf("starting main threads...\n"); - - /* - * Replay all logs of all datasets in the pool. This is primarily for - * temporary datasets which wouldn't otherwise get replayed, which - * can trigger failures when attempting to offline a SLOG in - * ztest_fault_inject(). - */ - (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, - NULL, DS_FIND_CHILDREN); - /* * Kick off all the tests that run in parallel. 
*/ @@ -7718,6 +8263,166 @@ ztest_run(ztest_shared_t *zs) zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); +} + +/* + * Setup our test context and kick off threads to run tests on all datasets + * in parallel. + */ +static void +ztest_run(ztest_shared_t *zs) +{ + spa_t *spa; + objset_t *os; + kthread_t *resume_thread, *deadman_thread; + uint64_t object; + int error; + int t, d; + + ztest_exiting = B_FALSE; + + /* + * Initialize parent/child shared state. + */ + mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); + VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + + zs->zs_thread_start = gethrtime(); + zs->zs_thread_stop = + zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; + zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); + zs->zs_thread_kill = zs->zs_thread_stop; + if (ztest_random(100) < ztest_opts.zo_killrate) { + zs->zs_thread_kill -= + ztest_random(ztest_opts.zo_passtime * NANOSEC); + } + + mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); + + list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), + offsetof(ztest_cb_data_t, zcd_node)); + + /* + * Open our pool. It may need to be imported first depending on + * what tests were running when the previous pass was terminated. 
+ */ + raidz_scratch_verify(); + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + error = spa_open(ztest_opts.zo_pool, &spa, FTAG); + if (error) { + VERIFY3S(error, ==, ENOENT); + ztest_import_impl(); + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + } + + metaslab_preload_limit = ztest_random(20) + 1; + ztest_spa = spa; + + /* + * XXX - BUGBUG raidz expansion do not run this for generic for now + */ + if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) + VERIFY0(vdev_raidz_impl_set("cycle")); + + dmu_objset_stats_t dds; + VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, + DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + dmu_objset_disown(os, B_TRUE, FTAG); + + /* Give the dedicated raidz expansion test more grace time */ + if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) + zfs_deadman_synctime_ms *= 2; + + /* + * Create a thread to periodically resume suspended I/O. + */ + resume_thread = thread_create(NULL, 0, ztest_resume_thread, + spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + + /* + * Create a deadman thread and set to panic if we hang. + */ + deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, + zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; + + /* + * Verify that we can safely inquire about any object, + * whether it's allocated or not. To make it interesting, + * we probe a 5-wide window around each power of two. + * This hits all edge cases, including zero and the max. + */ + for (t = 0; t < 64; t++) { + for (d = -5; d <= 5; d++) { + error = dmu_object_info(spa->spa_meta_objset, + (1ULL << t) + d, NULL); + ASSERT(error == 0 || error == ENOENT || + error == EINVAL); + } + } + + /* + * If we got any ENOSPC errors on the previous run, destroy something. 
+ */ + if (zs->zs_enospc_count != 0) { + /* Not expecting ENOSPC errors during raidz expansion tests */ + ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, + RAIDZ_EXPAND_NONE); + + int d = ztest_random(ztest_opts.zo_datasets); + ztest_dataset_destroy(d); + } + zs->zs_enospc_count = 0; + + /* + * If we were in the middle of ztest_device_removal() and were killed + * we need to ensure the removal and scrub complete before running + * any tests that check ztest_device_removal_active. The removal will + * be restarted automatically when the spa is opened, but we need to + * initiate the scrub manually if it is not already in progress. Note + * that we always run the scrub whenever an indirect vdev exists + * because we have no way of knowing for sure if ztest_device_removal() + * fully completed its scrub before the pool was reimported. + * + * Does not apply for the RAIDZ expansion specific test runs + */ + if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE && + (spa->spa_removing_phys.sr_state == DSS_SCANNING || + spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) { + while (spa->spa_removing_phys.sr_state == DSS_SCANNING) + txg_wait_synced(spa_get_dsl(spa), 0); + + error = ztest_scrub_impl(spa); + if (error == EBUSY) + error = 0; + ASSERT0(error); + } + + if (ztest_opts.zo_verbose >= 4) + (void) printf("starting main threads...\n"); + + /* + * Replay all logs of all datasets in the pool. This is primarily for + * temporary datasets which wouldn't otherwise get replayed, which + * can trigger failures when attempting to offline a SLOG in + * ztest_fault_inject(). 
+ */ + (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, + NULL, DS_FIND_CHILDREN); + + if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) + ztest_raidz_expand_run(zs, spa); + else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) + ztest_raidz_expand_check(spa); + else + ztest_generic_run(zs, spa); /* Kill the resume and deadman threads */ ztest_exiting = B_TRUE; @@ -7826,6 +8531,7 @@ ztest_init(ztest_shared_t *zs) mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); /* @@ -7911,6 +8617,7 @@ shared_data_size(ztest_shared_hdr_t *hdr) size += hdr->zh_size; size += hdr->zh_stats_size * hdr->zh_stats_count; size += hdr->zh_ds_size * hdr->zh_ds_count; + size += hdr->zh_scratch_state_size; return (size); } @@ -7934,6 +8641,7 @@ setup_hdr(void) hdr->zh_stats_count = ZTEST_FUNCS; hdr->zh_ds_size = sizeof (ztest_shared_ds_t); hdr->zh_ds_count = ztest_opts.zo_datasets; + hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t); size = shared_data_size(hdr); VERIFY0(ftruncate(ztest_fd_data, size)); @@ -7968,6 +8676,8 @@ setup_data(void) ztest_shared_callstate = (void *)&buf[offset]; offset += hdr->zh_stats_size * hdr->zh_stats_count; ztest_shared_ds = (void *)&buf[offset]; + offset += hdr->zh_ds_size * hdr->zh_ds_count; + ztest_scratch_state = (void *)&buf[offset]; } static boolean_t @@ -8188,13 +8898,14 @@ main(int argc, char **argv) hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); if (ztest_opts.zo_verbose >= 1) { - (void) printf("%"PRIu64" vdevs, %d datasets, %d threads," - "%d %s disks, %"PRIu64" seconds...\n\n", + (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, " + "%d %s disks, parity %d, %"PRIu64" seconds...\n\n", ztest_opts.zo_vdevs, ztest_opts.zo_datasets, ztest_opts.zo_threads, ztest_opts.zo_raid_children, ztest_opts.zo_raid_type, + ztest_opts.zo_raid_parity, 
ztest_opts.zo_time); } @@ -8307,6 +9018,9 @@ main(int argc, char **argv) if (!ztest_opts.zo_mmp_test) ztest_run_zdb(zs->zs_guid); + if (ztest_shared_opts->zo_raidz_expand_test == + RAIDZ_EXPAND_CHECKED) + break; /* raidz expand test complete */ } if (ztest_opts.zo_verbose >= 1) { diff --git a/contrib/pyzfs/libzfs_core/_constants.py b/contrib/pyzfs/libzfs_core/_constants.py index 5ee422dfa..9c40ece1a 100644 --- a/contrib/pyzfs/libzfs_core/_constants.py +++ b/contrib/pyzfs/libzfs_core/_constants.py @@ -103,6 +103,7 @@ zfs_errno = enum_with_offset(1024, [ 'ZFS_ERR_NOT_USER_NAMESPACE', 'ZFS_ERR_RESUME_EXISTS', 'ZFS_ERR_CRYPTO_NOTSUP', + 'ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS', ], {} ) @@ -115,5 +116,6 @@ ZFS_ERR_DEVRM_IN_PROGRESS = zfs_errno.ZFS_ERR_DEVRM_IN_PROGRESS ZFS_ERR_VDEV_TOO_BIG = zfs_errno.ZFS_ERR_VDEV_TOO_BIG ZFS_ERR_WRONG_PARENT = zfs_errno.ZFS_ERR_WRONG_PARENT ZFS_ERR_VDEV_NOTSUP = zfs_errno.ZFS_ERR_VDEV_NOTSUP +ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS = zfs_errno.ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS # vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/_error_translation.py b/contrib/pyzfs/libzfs_core/_error_translation.py index 26676db39..3d1a2d573 100644 --- a/contrib/pyzfs/libzfs_core/_error_translation.py +++ b/contrib/pyzfs/libzfs_core/_error_translation.py @@ -43,6 +43,7 @@ from ._constants import ( ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_WRONG_PARENT, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, zfs_errno ) @@ -596,6 +597,8 @@ def lzc_pool_checkpoint_translate_error(ret, name, discard=False): raise lzc_exc.DeviceRemovalRunning() if ret == ZFS_ERR_VDEV_TOO_BIG: raise lzc_exc.DeviceTooBig() + if ret == ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: + raise lzc_exc.RaidzExpansionRunning() if discard: raise _generic_exception( ret, name, "Failed to discard pool checkpoint") diff --git a/contrib/pyzfs/libzfs_core/exceptions.py b/contrib/pyzfs/libzfs_core/exceptions.py index e484b07b6..ba8f7e490 100644 --- 
a/contrib/pyzfs/libzfs_core/exceptions.py +++ b/contrib/pyzfs/libzfs_core/exceptions.py @@ -30,6 +30,7 @@ from ._constants import ( ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_WRONG_PARENT, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, zfs_errno ) @@ -598,4 +599,9 @@ class DeviceTooBig(ZFSError): message = "One or more top-level vdevs exceed the maximum vdev size" +class RaidzExpansionRunning(ZFSError): + errno = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS + message = "A raidz device is currently expanding" + + # vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/include/libzfs.h b/include/libzfs.h index 4adfa38e8..dbb6340b0 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -157,6 +157,7 @@ typedef enum zfs_error { EZFS_CKSUM, /* insufficient replicas */ EZFS_RESUME_EXISTS, /* Resume on existing dataset without force */ EZFS_SHAREFAILED, /* filesystem share failed */ + EZFS_RAIDZ_EXPAND_IN_PROGRESS, /* a raidz is currently expanding */ EZFS_UNKNOWN } zfs_error_t; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index a978a9db5..c6f7dcca7 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -365,6 +365,7 @@ typedef enum { VDEV_PROP_CHECKSUM_T, VDEV_PROP_IO_N, VDEV_PROP_IO_T, + VDEV_PROP_RAIDZ_EXPANDING, VDEV_NUM_PROPS } vdev_prop_t; @@ -724,6 +725,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ +#define ZPOOL_CONFIG_RAIDZ_EXPAND_STATS "raidz_expand_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ @@ -789,6 +791,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" +#define ZPOOL_CONFIG_RAIDZ_EXPANDING 
"raidz_expanding" +#define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" @@ -907,6 +911,15 @@ typedef struct zpool_load_policy { #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE \ + "org.openzfs:raidz_expand_state" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME \ + "org.openzfs:raidz_expand_start_time" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME \ + "org.openzfs:raidz_expand_end_time" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED \ + "org.openzfs:raidz_expand_bytes_copied" + /* vdev metaslab allocation bias */ #define VDEV_ALLOC_BIAS_LOG "log" #define VDEV_ALLOC_BIAS_SPECIAL "special" @@ -1138,6 +1151,16 @@ typedef struct pool_removal_stat { uint64_t prs_mapping_memory; } pool_removal_stat_t; +typedef struct pool_raidz_expand_stat { + uint64_t pres_state; /* dsl_scan_state_t */ + uint64_t pres_expanding_vdev; + uint64_t pres_start_time; + uint64_t pres_end_time; + uint64_t pres_to_reflow; /* bytes that need to be moved */ + uint64_t pres_reflowed; /* bytes moved so far */ + uint64_t pres_waiting_for_resilver; +} pool_raidz_expand_stat_t; + typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, @@ -1577,6 +1600,7 @@ typedef enum { ZFS_ERR_NOT_USER_NAMESPACE, ZFS_ERR_RESUME_EXISTS, ZFS_ERR_CRYPTO_NOTSUP, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, } zfs_errno_t; /* @@ -1601,6 +1625,7 @@ typedef enum { ZPOOL_WAIT_RESILVER, ZPOOL_WAIT_SCRUB, ZPOOL_WAIT_TRIM, + ZPOOL_WAIT_RAIDZ_EXPAND, ZPOOL_WAIT_NUM_ACTIVITIES } zpool_wait_activity_t; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index b1eb06f94..ee91816ac 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -333,6 +334,9 @@ struct spa { spa_condensing_indirect_t *spa_condensing_indirect; zthr_t *spa_condense_zthr; /* 
zthr doing condense. */ + vdev_raidz_expand_t *spa_raidz_expand; + zthr_t *spa_raidz_expand_zthr; + uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index 03bcfa8f4..d3a71cc8f 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -75,6 +75,39 @@ extern "C" { #define MMP_FAIL_INT_SET(fail) \ (((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT) +/* + * RAIDZ expansion reflow information. + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * |Scratch | Reflow | + * | State | Offset | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + */ +typedef enum raidz_reflow_scratch_state { + RRSS_SCRATCH_NOT_IN_USE = 0, + RRSS_SCRATCH_VALID, + RRSS_SCRATCH_INVALID_SYNCED, + RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, + RRSS_SCRATCH_INVALID_SYNCED_REFLOW +} raidz_reflow_scratch_state_t; + +#define RRSS_GET_OFFSET(ub) \ + BF64_GET_SB((ub)->ub_raidz_reflow_info, 0, 55, SPA_MINBLOCKSHIFT, 0) +#define RRSS_SET_OFFSET(ub, x) \ + BF64_SET_SB((ub)->ub_raidz_reflow_info, 0, 55, SPA_MINBLOCKSHIFT, 0, x) + +#define RRSS_GET_STATE(ub) \ + BF64_GET((ub)->ub_raidz_reflow_info, 55, 9) +#define RRSS_SET_STATE(ub, x) \ + BF64_SET((ub)->ub_raidz_reflow_info, 55, 9, x) + +#define RAIDZ_REFLOW_SET(ub, state, offset) do { \ + (ub)->ub_raidz_reflow_info = 0; \ + RRSS_SET_OFFSET(ub, offset); \ + RRSS_SET_STATE(ub, state); \ +} while (0) + struct uberblock { uint64_t ub_magic; /* UBERBLOCK_MAGIC */ uint64_t ub_version; /* SPA_VERSION */ @@ -136,6 +169,8 @@ struct uberblock { * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. 
*/ uint64_t ub_checkpoint_txg; + + uint64_t ub_raidz_reflow_info; }; #ifdef __cplusplus diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 03e1f438a..38f62b07d 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -132,15 +132,19 @@ extern void vdev_space_update(vdev_t *vd, extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); +extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, + uint64_t txg); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); /* - * Return the amount of space allocated for a gang block header. + * Return the amount of space allocated for a gang block header. Note that + * since the physical birth txg is not provided, this must be constant for + * a given vdev. (e.g. raidz expansion can't change this) */ static inline uint64_t vdev_gang_header_asize(vdev_t *vd) { - return (vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE)); + return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0)); } extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); @@ -204,6 +208,8 @@ extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, int flags); extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *); +extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int); +extern int vdev_check_boot_reserve(spa_t *, vdev_t *); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 3f2312c23..dafab66c7 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -72,7 +72,7 @@ typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); -typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef uint64_t 
vdev_asize_func_t(vdev_t *vd, uint64_t psize, uint64_t txg); typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); @@ -281,6 +281,7 @@ struct vdev { uint64_t vdev_noalloc; /* device is passivated? */ uint64_t vdev_removing; /* device is being removed? */ uint64_t vdev_failfast; /* device failfast setting */ + boolean_t vdev_rz_expanding; /* raidz is being expanded? */ boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ @@ -536,6 +537,7 @@ typedef struct vdev_label { /* * Size of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. + * On RAIDZ, this space is overwritten during RAIDZ expansion. */ #define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ @@ -608,7 +610,7 @@ extern vdev_ops_t vdev_indirect_ops; */ extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs); -extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index e34b6e4b1..a34bc00ca 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -26,6 +26,7 @@ #define _SYS_VDEV_RAIDZ_H #include +#include #ifdef __cplusplus extern "C" { @@ -35,6 +36,8 @@ struct zio; struct raidz_col; struct raidz_row; struct raidz_map; +struct vdev_raidz; +struct uberblock; #if !defined(_KERNEL) struct kernel_param {}; #endif @@ -44,13 +47,19 @@ struct kernel_param {}; */ struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, uint64_t); +struct raidz_map *vdev_raidz_map_alloc_expanded(struct 
zio *, + uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, boolean_t); void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_free(struct vdev_raidz *); void vdev_raidz_generate_parity_row(struct raidz_map *, struct raidz_row *); void vdev_raidz_generate_parity(struct raidz_map *); void vdev_raidz_reconstruct(struct raidz_map *, const int *, int); void vdev_raidz_child_done(zio_t *); void vdev_raidz_io_done(zio_t *); void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *); +struct raidz_row *vdev_raidz_row_alloc(int); +void vdev_raidz_reflow_copy_scratch(spa_t *); +void raidz_dtl_reassessed(vdev_t *); extern const zio_vsd_ops_t vdev_raidz_vsd_ops; @@ -65,11 +74,101 @@ int vdev_raidz_math_reconstruct(struct raidz_map *, struct raidz_row *, const int *, const int *, const int); int vdev_raidz_impl_set(const char *); +typedef struct vdev_raidz_expand { + uint64_t vre_vdev_id; + + kmutex_t vre_lock; + kcondvar_t vre_cv; + + /* + * How much i/o is outstanding (issued and not completed). + */ + uint64_t vre_outstanding_bytes; + + /* + * Next offset to issue i/o for. + */ + uint64_t vre_offset; + + /* + * Lowest offset of a failed expansion i/o. The expansion will retry + * from here. Once the expansion thread notices the failure and exits, + * vre_failed_offset is reset back to UINT64_MAX, and + * vre_waiting_for_resilver will be set. + */ + uint64_t vre_failed_offset; + boolean_t vre_waiting_for_resilver; + + /* + * Offset that is completing each txg + */ + uint64_t vre_offset_pertxg[TXG_SIZE]; + + /* + * Bytes copied in each txg. + */ + uint64_t vre_bytes_copied_pertxg[TXG_SIZE]; + + /* + * The rangelock prevents normal read/write zio's from happening while + * there are expansion (reflow) i/os in progress to the same offsets. 
+ */ + zfs_rangelock_t vre_rangelock; + + /* + * These fields are stored on-disk in the vdev_top_zap: + */ + dsl_scan_state_t vre_state; + uint64_t vre_start_time; + uint64_t vre_end_time; + uint64_t vre_bytes_copied; +} vdev_raidz_expand_t; + typedef struct vdev_raidz { - int vd_logical_width; + /* + * Number of child vdevs when this raidz vdev was created (i.e. before + * any raidz expansions). + */ + int vd_original_width; + + /* + * The current number of child vdevs, which may be more than the + * original width if an expansion is in progress or has completed. + */ + int vd_physical_width; + int vd_nparity; + + /* + * Tree of reflow_node_t's. The lock protects the avl tree only. + * The reflow_node_t's describe completed expansions, and are used + * to determine the logical width given a block's birth time. + */ + avl_tree_t vd_expand_txgs; + kmutex_t vd_expand_lock; + + /* + * If this vdev is being expanded, spa_raidz_expand is set to this + */ + vdev_raidz_expand_t vn_vre; } vdev_raidz_t; +extern int vdev_raidz_attach_check(vdev_t *); +extern void vdev_raidz_attach_sync(void *, dmu_tx_t *); +extern void spa_start_raidz_expansion_thread(spa_t *); +extern int spa_raidz_expand_get_stats(spa_t *, pool_raidz_expand_stat_t *); +extern int vdev_raidz_load(vdev_t *); + +/* RAIDZ scratch area pause points (for testing) */ +#define RAIDZ_EXPAND_PAUSE_NONE 0 +#define RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1 1 +#define RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2 2 +#define RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3 3 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_VALID 4 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED 5 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1 6 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2 7 + #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index c1037fa12..fae03f8f5 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -30,6 +30,8 @@ #include #include #include +#include +#include #ifdef __cplusplus 
extern "C" { @@ -102,28 +104,32 @@ typedef struct raidz_impl_ops { char name[RAIDZ_IMPL_NAME_MAX]; /* Name of the implementation */ } raidz_impl_ops_t; + typedef struct raidz_col { - uint64_t rc_devidx; /* child device index for I/O */ + int rc_devidx; /* child device index for I/O */ + uint32_t rc_size; /* I/O size */ uint64_t rc_offset; /* device offset */ - uint64_t rc_size; /* I/O size */ abd_t rc_abdstruct; /* rc_abd probably points here */ abd_t *rc_abd; /* I/O data */ abd_t *rc_orig_data; /* pre-reconstruction */ int rc_error; /* I/O error for this device */ - uint8_t rc_tried; /* Did we attempt this I/O column? */ - uint8_t rc_skipped; /* Did we skip this I/O column? */ - uint8_t rc_need_orig_restore; /* need to restore from orig_data? */ - uint8_t rc_force_repair; /* Write good data to this column */ - uint8_t rc_allow_repair; /* Allow repair I/O to this column */ + uint8_t rc_tried:1; /* Did we attempt this I/O column? */ + uint8_t rc_skipped:1; /* Did we skip this I/O column? */ + uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? 
*/ + uint8_t rc_force_repair:1; /* Write good data to this column */ + uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */ + int rc_shadow_devidx; /* for double write during expansion */ + int rc_shadow_error; /* for double write during expansion */ + uint64_t rc_shadow_offset; /* for double write during expansion */ } raidz_col_t; typedef struct raidz_row { - uint64_t rr_cols; /* Regular column count */ - uint64_t rr_scols; /* Count including skipped columns */ - uint64_t rr_bigcols; /* Remainder data column count */ - uint64_t rr_missingdata; /* Count of missing data devices */ - uint64_t rr_missingparity; /* Count of missing parity devices */ - uint64_t rr_firstdatacol; /* First data column/parity count */ + int rr_cols; /* Regular column count */ + int rr_scols; /* Count including skipped columns */ + int rr_bigcols; /* Remainder data column count */ + int rr_missingdata; /* Count of missing data devices */ + int rr_missingparity; /* Count of missing parity devices */ + int rr_firstdatacol; /* First data column/parity count */ abd_t *rr_abd_empty; /* dRAID empty sector buffer */ int rr_nempty; /* empty sectors included in parity */ #ifdef ZFS_DEBUG @@ -138,10 +144,25 @@ typedef struct raidz_map { int rm_nrows; /* Regular row count */ int rm_nskip; /* RAIDZ sectors skipped for padding */ int rm_skipstart; /* Column index of padding start */ + int rm_original_width; /* pre-expansion width of raidz vdev */ + int rm_nphys_cols; /* num entries in rm_phys_col[] */ + zfs_locked_range_t *rm_lr; const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + raidz_col_t *rm_phys_col; /* if non-NULL, read i/o aggregation */ raidz_row_t *rm_row[0]; /* flexible array of rows */ } raidz_map_t; +/* + * Nodes in vdev_raidz_t:vd_expand_txgs. + * Blocks with physical birth time of re_txg or later have the specified + * logical width (until the next node). 
+ */ +typedef struct reflow_node { + uint64_t re_txg; + uint64_t re_logical_width; + avl_node_t re_link; +} reflow_node_t; + #define RAIDZ_ORIGINAL_IMPL (INT_MAX) diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index a1dfef1d8..8d94557a5 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -58,6 +58,7 @@ extern int zfs_dbgmsg_enable; #define ZFS_DEBUG_LOG_SPACEMAP (1 << 12) #define ZFS_DEBUG_METASLAB_ALLOC (1 << 13) #define ZFS_DEBUG_BRT (1 << 14) +#define ZFS_DEBUG_RAIDZ_RECONSTRUCT (1 << 15) extern void __set_error(const char *file, const char *func, int line, int err); extern void __zfs_dbgmsg(char *buf); diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 1025c4473..2515ba321 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -81,6 +81,7 @@ typedef enum spa_feature { SPA_FEATURE_BLOCK_CLONING, SPA_FEATURE_AVZ_V2, SPA_FEATURE_REDACTION_LIST_SPILL, + SPA_FEATURE_RAIDZ_EXPANSION, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index bcdcff976..51b0368f8 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -602,7 +602,7 @@ - + @@ -1257,41 +1257,15 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1337,7 +1311,6 @@ - @@ -1387,13 +1360,6 @@ - - - - - - - @@ -1521,18 +1487,10 @@ - - - - - - - - @@ -1561,12 +1519,6 @@ - - - - - - @@ -5671,7 +5623,8 @@ - + + @@ -5768,6 +5721,112 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -5778,7 +5837,8 @@ - + + @@ -5824,7 +5884,8 @@ - + + @@ -5941,6 +6002,13 @@ + + + + + + + @@ -6556,6 +6624,15 @@ + + + + + + + + + @@ -6577,6 +6654,9 @@ + + + @@ -7123,25 +7203,103 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ 
-7302,6 +7460,23 @@ + + + + + + + + + + + + + + + + + @@ -7320,15 +7495,37 @@ + + + + + + + + + + + + + + + + + - - - + + + + + + + + @@ -7407,6 +7604,10 @@ + + + + @@ -8081,14 +8282,6 @@ - - - - - - - - @@ -8101,9 +8294,6 @@ - - - @@ -8385,9 +8575,6 @@ - - - @@ -8480,39 +8667,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -8523,9 +8677,6 @@ - - - @@ -8579,10 +8730,6 @@ - - - - @@ -8727,8 +8874,8 @@ - - + + @@ -8805,7 +8952,7 @@ - + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 4ebd112f4..71cf029de 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3378,6 +3378,7 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, boolean_t avail_spare, l2cache, islog; uint64_t val; char *newname; + const char *type; nvlist_t **child; uint_t children; nvlist_t *config_root; @@ -3412,6 +3413,14 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf)); } + type = fnvlist_lookup_string(tgt, ZPOOL_CONFIG_TYPE); + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 && + zfeature_lookup_guid("org.openzfs:raidz_expansion", NULL) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "the loaded zfs module doesn't support raidz expansion")); + return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf)); + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0 || children != 1) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, @@ -3479,6 +3488,10 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a replacing device")); } + } else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "raidz_expansion feature must be enabled " + "in order to attach a device to raidz")); } else { char status[64] = {0}; zpool_prop_get_feature(zhp, @@ -3508,8 +3521,7 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, break; case EBUSY: - 
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, " - "or device removal is in progress"), + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy"), new_disk); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; @@ -3540,6 +3552,34 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, (void) zfs_error(hdl, EZFS_DEVOVERFLOW, errbuf); break; + case ENXIO: + /* + * The existing raidz vdev has offline children + */ + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "raidz vdev has devices that are are offline or " + "being replaced")); + (void) zfs_error(hdl, EZFS_BADDEV, errbuf); + break; + } else { + (void) zpool_standard_error(hdl, errno, errbuf); + } + break; + + case EADDRINUSE: + /* + * The boot reserved area is already being used (FreeBSD) + */ + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "the reserved boot area needed for the expansion " + "is already being used by a boot loader")); + (void) zfs_error(hdl, EZFS_BADDEV, errbuf); + } else { + (void) zpool_standard_error(hdl, errno, errbuf); + } + break; default: (void) zpool_standard_error(hdl, errno, errbuf); } @@ -5222,6 +5262,9 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, } else { src = ZPROP_SRC_DEFAULT; intval = vdev_prop_default_numeric(prop); + /* Only use if provided by the RAIDZ VDEV above */ + if (prop == VDEV_PROP_RAIDZ_EXPANDING) + return (ENOENT); } if (vdev_prop_index_to_string(prop, intval, (const char **)&strval) != 0) diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index fdd1975fa..778e511e0 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -317,6 +317,8 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_RESUME_EXISTS: return (dgettext(TEXT_DOMAIN, "Resuming recv on existing " "dataset without force")); + case EZFS_RAIDZ_EXPAND_IN_PROGRESS: + return (dgettext(TEXT_DOMAIN, "raidz expansion in progress")); case 
EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -763,6 +765,9 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; + case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: + zfs_verror(hdl, EZFS_RAIDZ_EXPAND_IN_PROGRESS, fmt, ap); + break; default: zfs_error_aux(hdl, "%s", strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index f2087186a..5b95c8f77 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -1360,7 +1360,9 @@ + + @@ -1376,7 +1378,8 @@ - + + @@ -2538,6 +2541,13 @@ + + + + + + + diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 58d7f0752..3c986a707 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -43,6 +43,7 @@ nodist_libzpool_la_SOURCES = \ module/os/linux/zfs/arc_os.c \ module/os/linux/zfs/trace.c \ module/os/linux/zfs/vdev_file.c \ + module/os/linux/zfs/vdev_label_os.c \ module/os/linux/zfs/zfs_debug.c \ module/os/linux/zfs/zfs_racct.c \ module/os/linux/zfs/zfs_znode.c \ diff --git a/man/man1/ztest.1 b/man/man1/ztest.1 index 64514b317..bbbe751ca 100644 --- a/man/man1/ztest.1 +++ b/man/man1/ztest.1 @@ -52,6 +52,16 @@ .Op Fl T Ar time .Op Fl z Ar zil_failure_rate . +.Nm +.Fl X +.Op Fl VG +.Op Fl s Ar size_of_each_vdev +.Op Fl a Ar alignment_shift +.Op Fl r Ar raidz_disks +.Op Fl R Ar raid_parity +.Op Fl d Ar datasets +.Op Fl t Ar threads +. .Sh DESCRIPTION .Nm was written by the ZFS Developers as a ZFS unit test. @@ -122,11 +132,17 @@ Number of mirror copies. Number of raidz/draid disks. .It Fl R , -raid-parity Ns = (default: Sy 1 ) Raid parity (raidz & draid). 
-.It Fl K , -raid-kind Ns = Ns Sy raidz Ns | Ns Sy draid Ns | Ns Sy random No (default : Sy random ) +.It Xo +.Fl K , -raid-kind Ns = Ns +.Sy raidz Ns | Ns Sy eraidz Ns | Ns Sy draid Ns | Ns Sy random +(default: +.Sy random Ns +) +.Xc The kind of RAID config to use. With .Sy random -the kind alternates between raidz and draid. +the kind alternates between raidz, eraidz (expandable raidz) and draid. .It Fl D , -draid-data Ns = (default: Sy 4 ) Number of data disks in a dRAID redundancy group. .It Fl S , -draid-spares Ns = (default: Sy 1 ) @@ -181,6 +197,8 @@ to an unsigned 32-bit integer Dump zfs_dbgmsg buffer before exiting due to an error. .It Fl V , -verbose Verbose (use multiple times for ever more verbosity). +.It Fl X , -raidz-expansion +Perform a dedicated raidz expansion test. .El . .Sh EXAMPLES diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index f9824ac17..574558f9d 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -430,6 +430,19 @@ TXGs must pass before unloading will occur. .It Sy reference_history Ns = Ns Sy 3 Pq uint Maximum reference holders being tracked when reference_tracking_enable is active. +.It Sy raidz_expand_max_copy_bytes Ns = Ns Sy 160MB Pq ulong +Max amount of memory to use for RAID-Z expansion I/O. +This limits how much I/O can be outstanding at once. +. +.It Sy raidz_expand_max_reflow_bytes Ns = Ns Sy 0 Pq ulong +For testing, pause RAID-Z expansion when reflow amount reaches this value. +. +.It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong +For expanded RAID-Z, aggregate reads that have more rows than this. +. +.It Sy reference_history Ns = Ns Sy 3 Pq int +Maximum reference holders being tracked when reference_tracking_enable is +active. . .It Sy reference_tracking_enable Ns = Ns Sy 0 Ns | Ns 1 Pq int Track reference holders to @@ -1781,6 +1794,12 @@ even if there were unrepairable errors. Intended to be used during pool repair or recovery to stop resilvering when the pool is next imported. . 
+.It Sy zfs_scrub_after_expand Ns = Ns Sy 1 Ns | Ns 0 Pq int +Automatically start a pool scrub after a RAIDZ expansion completes +in order to verify the checksums of all blocks which have been +copied during the expansion. +This is enabled by default and strongly recommended. +. .It Sy zfs_scrub_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq uint Scrubs are processed by the sync thread. While scrubbing, it will spend at least this much time diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 3c7b0b345..01dec61b9 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -767,6 +767,14 @@ as soon as it is enabled and will never return to being .Sy disabled . \*[remount-upgrade] . +.feature org.openzfs raidz_expansion no none +This feature enables the +.Nm zpool Cm attach +subcommand to attach a new device to a RAID-Z group, expanding the total +amount usable space in the pool. +See +.Xr zpool-attach 8 . +. .feature com.delphix redaction_bookmarks no bookmarks extensible_dataset This feature enables the use of redacted .Nm zfs Cm send Ns s , @@ -784,6 +792,18 @@ and so cannot be safely mounted, and their contents cannot be safely read. For more information about redacted receives, see .Xr zfs-send 8 . . +.feature com.delphix redaction_list_spill no redaction_bookmarks +This feature enables the redaction list created by zfs redact to store +many more entries. +It becomes +.Sy active +when a redaction list is created with more than 36 entries, +and returns to being +.Sy enabled +when no long redaction lists remain in the pool. +For more information about redacted sends, see +.Xr zfs-send 8 . +. .feature com.datto resilver_defer yes This feature allows ZFS to postpone new resilvers if an existing one is already in progress. @@ -947,18 +967,6 @@ once all filesystems that have ever had their property set to .Sy zstd are destroyed. -. 
-.feature com.delphix redaction_list_spill no redaction_bookmarks -This feature enables the redaction list created by zfs redact to store -many more entries. -It becomes -.Sy active -when a redaction list is created with more than 36 entries, -and returns to being -.Sy enabled -when no long redaction lists remain in the pool. -For more information about redacted sends, see -.Xr zfs-send 8 . .El . .Sh SEE ALSO diff --git a/man/man8/zpool-attach.8 b/man/man8/zpool-attach.8 index 73535cbdf..22b1369b6 100644 --- a/man/man8/zpool-attach.8 +++ b/man/man8/zpool-attach.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 15, 2020 +.Dd June 28, 2023 .Dt ZPOOL-ATTACH 8 .Os . @@ -45,7 +45,15 @@ Attaches .Ar new_device to the existing .Ar device . -The existing device cannot be part of a raidz configuration. +The behavior differs depending on if the existing +.Ar device +is a RAID-Z device, or a mirror/plain device. +.Pp +If the existing device is a mirror or plain device +.Pq e.g. specified as Qo Li sda Qc or Qq Li mirror-7 , +the new device will be mirrored with the existing device, a resilver will be +initiated, and the new device will contribute to additional redundancy once the +resilver completes. If .Ar device is not currently part of a mirrored configuration, @@ -62,6 +70,42 @@ creates a three-way mirror, and so on. In either case, .Ar new_device begins to resilver immediately and any running scrub is cancelled. +.Pp +If the existing device is a RAID-Z device +.Pq e.g. specified as Qq Ar raidz2-0 , +the new device will become part of that RAID-Z group. +A "raidz expansion" will be initiated, and once the expansion completes, +the new device will contribute additional space to the RAID-Z group. +The expansion entails reading all allocated space from existing disks in the +RAID-Z group, and rewriting it to the new disks in the RAID-Z group (including +the newly added +.Ar device ) . 
+Its progress can be monitored with +.Nm zpool Cm status . +.Pp +Data redundancy is maintained during and after the expansion. +If a disk fails while the expansion is in progress, the expansion pauses until +the health of the RAID-Z vdev is restored (e.g. by replacing the failed disk +and waiting for reconstruction to complete). +Expansion does not change the number of failures that can be tolerated +without data loss (e.g. a RAID-Z2 is still a RAID-Z2 even after expansion). +A RAID-Z vdev can be expanded multiple times. +.Pp +After the expansion completes, old blocks retain their old data-to-parity +ratio +.Pq e.g. 5-wide RAID-Z2 has 3 data and 2 parity +but distributed among the larger set of disks. +New blocks will be written with the new data-to-parity ratio (e.g. a 5-wide +RAID-Z2 which has been expanded once to 6-wide, has 4 data and 2 parity). +However, the vdev's assumed parity ratio does not change, so slightly less +space than is expected may be reported for newly-written blocks, according to +.Nm zfs Cm list , +.Nm df , +.Nm ls Fl s , +and similar tools. +.Pp +A pool-wide scrub is initiated at the end of the expansion in order to verify +the checksums of all blocks which have been copied during the expansion. .Bl -tag -width Ds .It Fl f Forces use of @@ -76,16 +120,15 @@ manual page for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . .It Fl s -The +When attaching to a mirror or plain device, the .Ar new_device is reconstructed sequentially to restore redundancy as quickly as possible. Checksums are not verified during sequential reconstruction so a scrub is started when the resilver completes. -Sequential reconstruction is not supported for raidz configurations. .It Fl w Waits until .Ar new_device -has finished resilvering before returning. +has finished resilvering or expanding before returning. .El . 
.Sh SEE ALSO diff --git a/man/man8/zpool-wait.8 b/man/man8/zpool-wait.8 index 683b01414..d646d9cc3 100644 --- a/man/man8/zpool-wait.8 +++ b/man/man8/zpool-wait.8 @@ -20,7 +20,7 @@ .\" .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. -.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. @@ -57,7 +57,7 @@ immediately. These are the possible values for .Ar activity , along with what each one waits for: -.Bl -tag -compact -offset Ds -width "initialize" +.Bl -tag -compact -offset Ds -width "raidz_expand" .It Sy discard Checkpoint to be discarded .It Sy free @@ -76,6 +76,8 @@ Resilver to cease Scrub to cease .It Sy trim Manual trim to cease +.It Sy raidz_expand +Attaching to a RAID-Z vdev to complete .El .Pp If an diff --git a/module/Kbuild.in b/module/Kbuild.in index c13217159..6408fb106 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -445,6 +445,7 @@ ZFS_OBJS_OS := \ trace.o \ vdev_disk.o \ vdev_file.o \ + vdev_label_os.o \ zfs_acl.o \ zfs_ctldir.o \ zfs_debug.o \ diff --git a/module/os/freebsd/zfs/vdev_label_os.c b/module/os/freebsd/zfs/vdev_label_os.c index bc856b930..338982ff6 100644 --- a/module/os/freebsd/zfs/vdev_label_os.c +++ b/module/os/freebsd/zfs/vdev_label_os.c @@ -72,3 +72,62 @@ retry: abd_free(pad2); return (error); } + +static void +vdev_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); +} + +/* + * Check if the reserved boot area is in-use. + * + * When booting FreeBSD with an MBR partition with ZFS, the zfsboot file + * (which understands the ZFS file system) is written to the ZFS BOOT + * reserve area (at offset 512K). 
We check for that here before attaching + * a disk to raidz which would then corrupt this boot data. + */ +int +vdev_check_boot_reserve(spa_t *spa, vdev_t *childvd) +{ + ASSERT(childvd->vdev_ops->vdev_op_leaf); + + size_t size = SPA_MINBLOCKSIZE; + abd_t *abd = abd_alloc_linear(size, B_FALSE); + + zio_t *pio = zio_root(spa, NULL, NULL, 0); + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to the offset + * to calculate the physical offset to write to. Passing in a negative + * offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, childvd, + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abd, size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, 0, vdev_child_done, pio)); + zio_wait(pio); + + unsigned char *buf = abd_to_buf(abd); + + /* + * The BTX server has a special header at the beginning. + * + * btx_hdr: .byte 0xeb # Machine ID + * .byte 0xe # Header size + * .ascii "BTX" # Magic + * .byte 0x1 # Major version + * .byte 0x2 # Minor version + * .byte BTX_FLAGS # Flags + */ + if (buf[0] == 0xeb && buf[1] == 0x0e && + buf[2] == 'B' && buf[3] == 'T' && buf[4] == 'X') { + abd_free(abd); + return (EBUSY); + } + + abd_free(abd); + return (0); +} diff --git a/module/os/linux/zfs/vdev_label_os.c b/module/os/linux/zfs/vdev_label_os.c new file mode 100644 index 000000000..3d965b89a --- /dev/null +++ b/module/os/linux/zfs/vdev_label_os.c @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023 by iXsystems, Inc. + */ + +#include +#include +#include +#include +#include + +/* + * Check if the reserved boot area is in-use. + * + * This function always returns 0, as there are no known external uses + * of the reserved area on Linux. + */ +int +vdev_check_boot_reserve(spa_t *spa, vdev_t *childvd) +{ + (void) spa; + (void) childvd; + + return (0); +} diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c index b090ec684..f707959c9 100644 --- a/module/os/linux/zfs/zfs_debug.c +++ b/module/os/linux/zfs/zfs_debug.c @@ -175,7 +175,8 @@ __dprintf(boolean_t dprint, const char *file, const char *func, newfile = file; } - i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func); + i = snprintf(buf, size, "%px %s%s:%d:%s(): ", + curthread, prefix, newfile, line, func); if (i < size) { va_start(adx, fmt); diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 2c74d10f4..309d9bf14 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -749,6 +749,11 @@ zpool_feature_init(void) redact_list_spill_deps, sfeatures); } + zfeature_register(SPA_FEATURE_RAIDZ_EXPANSION, + "org.openzfs:raidz_expansion", "raidz_expansion", + "Support for raidz expansion", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index c4aca04a9..e98063e8b 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -439,6 +439,9 @@ vdev_prop_init(void) zprop_register_index(VDEV_PROP_ALLOCATING, "allocating", 1, PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "ALLOCATING", boolean_na_table, sfeatures); + 
zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0, + PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING", + boolean_table, sfeatures); /* default index properties */ zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index dfea15b74..dc2dd63b3 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4518,7 +4518,7 @@ arc_evict_cb_check(void *arg, zthr_t *zthr) static void arc_evict_cb(void *arg, zthr_t *zthr) { - (void) arg, (void) zthr; + (void) arg; uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); @@ -4542,9 +4542,13 @@ arc_evict_cb(void *arg, zthr_t *zthr) * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. + * + * Note we cancel using zthr instead of arc_evict_zthr + * because the latter may not yet be initialized when the + * callback is first invoked. */ mutex_enter(&arc_evict_lock); - arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && + arc_evict_needed = !zthr_iscancelled(zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 34012db82..e16128fdf 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -3066,7 +3066,6 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); - spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_suspending) return; diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index e0d4a6a63..0983ba143 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -4342,7 +4342,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); - if (free_space 
<= spa_get_slop_space(spa) || vd->vdev_removing) { + if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || + vd->vdev_rz_expanding) { defer_allowed = B_FALSE; } @@ -4650,6 +4651,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); + VERIFY0(msp->ms_new); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -4721,10 +4723,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, } /* - * If the selected metaslab is condensing or disabled, - * skip it. + * If the selected metaslab is condensing or disabled, or + * hasn't gone through a metaslab_sync_done(), then skip it. */ - if (msp->ms_condensing || msp->ms_disabled > 0) + if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new) continue; *was_active = msp->ms_allocator != -1; @@ -5270,7 +5272,7 @@ top: ASSERT(mg->mg_class == mc); - uint64_t asize = vdev_psize_to_asize(vd, psize); + uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); /* diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 68f367c1c..20225640f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -1709,6 +1710,10 @@ spa_destroy_aux_threads(spa_t *spa) zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } + if (spa->spa_raidz_expand_zthr != NULL) { + zthr_destroy(spa->spa_raidz_expand_zthr); + spa->spa_raidz_expand_zthr = NULL; + } } /* @@ -1861,6 +1866,8 @@ spa_unload(spa_t *spa) spa->spa_compatibility = NULL; } + spa->spa_raidz_expand = NULL; + spa_config_exit(spa, SCL_ALL, spa); } @@ -2999,6 +3006,7 @@ spa_spawn_aux_threads(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); + spa_start_raidz_expansion_thread(spa); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); 
spa_start_livelist_condensing_thread(spa); @@ -3753,6 +3761,12 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != 0) { + spa_load_note(spa, "uberblock raidz_reflow_info: " + "state=%u offset=%llu", + (int)RRSS_GET_STATE(ub), + (u_longlong_t)RRSS_GET_OFFSET(ub)); + } /* @@ -5091,6 +5105,13 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); + /* + * Before we do any zio_write's, complete the raidz expansion + * scratch space copying, if necessary. + */ + if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) + vdev_raidz_reflow_copy_scratch(spa); + /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. @@ -6905,9 +6926,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } /* - * Attach a device to a mirror. The arguments are the path to any device - * in the mirror, and the nvroot for the new device. If the path specifies - * a device that is not mirrored, we automatically insert the mirror vdev. + * Attach a device to a vdev specified by its guid. The vdev type can be + * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a + * single device). When the vdev is a single device, a mirror vdev will be + * automatically inserted. 
* * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own @@ -6930,7 +6952,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; - int newvd_isspare; + int newvd_isspare = B_FALSE; int error; ASSERT(spa_writeable(spa)); @@ -6961,16 +6983,35 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, ZFS_ERR_REBUILD_IN_PROGRESS)); } - if (spa->spa_vdev_removal != NULL) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (spa->spa_vdev_removal != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_DEVRM_IN_PROGRESS)); + } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - if (!oldvd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; - pvd = oldvd->vdev_parent; + if (raidz) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + /* + * Can't expand a raidz while prior expand is in progress. + */ + if (spa->spa_raidz_expand != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + } + } else if (!oldvd->vdev_ops->vdev_op_leaf) { + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } + + if (raidz) + pvd = oldvd; + else + pvd = oldvd->vdev_parent; if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH) != 0) @@ -7026,6 +7067,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_raidz_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); @@ -7065,7 +7107,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, /* * Make sure the new device is big enough. 
*/ - if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) + vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; + if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -7075,32 +7118,75 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + /* + * RAIDZ-expansion-specific checks. + */ + if (raidz) { + if (vdev_raidz_attach_check(newvd) != 0) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + /* + * Fail early if a child is not healthy or being replaced + */ + for (int i = 0; i < oldvd->vdev_children; i++) { + if (vdev_is_dead(oldvd->vdev_child[i]) || + !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { + return (spa_vdev_exit(spa, newrootvd, txg, + ENXIO)); + } + /* Also fail if reserved boot area is in-use */ + if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) + != 0) { + return (spa_vdev_exit(spa, newrootvd, txg, + EADDRINUSE)); + } + } + } + + if (raidz) { + /* + * Note: oldvdpath is freed by spa_strfree(), but + * kmem_asprintf() is freed by kmem_strfree(), so we have to + * move it to a spa_strdup-ed string. + */ + char *tmp = kmem_asprintf("raidz%u-%u", + (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); + oldvdpath = spa_strdup(tmp); + kmem_strfree(tmp); + } else { + oldvdpath = spa_strdup(oldvd->vdev_path); + } + newvdpath = spa_strdup(newvd->vdev_path); + /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. 
*/ - if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + if (strcmp(oldvdpath, newvdpath) == 0) { spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, KM_SLEEP); - (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, - "%s/%s", newvd->vdev_path, "old"); + (void) sprintf(oldvd->vdev_path, "%s/old", + newvdpath); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } + spa_strfree(oldvdpath); + oldvdpath = spa_strdup(oldvd->vdev_path); } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ - if (pvd->vdev_ops != pvops) + if (!raidz && pvd->vdev_ops != pvops) { pvd = vdev_add_parent(oldvd, pvops); + ASSERT(pvd->vdev_ops == pvops); + ASSERT(oldvd->vdev_parent == pvd); + } ASSERT(pvd->vdev_top->vdev_parent == rvd); - ASSERT(pvd->vdev_ops == pvops); - ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. @@ -7128,41 +7214,66 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, - TXG_INITIAL, dtl_max_txg - TXG_INITIAL); + if (raidz) { + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - if (newvd->vdev_isspare) { - spa_spare_activate(newvd); - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); - } + vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_wait(tvd); - oldvdpath = spa_strdup(oldvd->vdev_path); - newvdpath = spa_strdup(newvd->vdev_path); - newvd_isspare = newvd->vdev_isspare; + dtl_max_txg = spa_vdev_config_enter(spa); - /* - * Mark newvd's DTL dirty in this txg. 
- */ - vdev_dirty(tvd, VDD_DTL, newvd, txg); + tvd->vdev_rz_expanding = B_TRUE; - /* - * Schedule the resilver or rebuild to restart in the future. We do - * this to ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. - */ - if (rebuild) { - newvd->vdev_rebuild_txg = txg; + vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); + vdev_config_dirty(tvd); - vdev_rebuild(tvd); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + dtl_max_txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, + newvd, tx); + dmu_tx_commit(tx); } else { - newvd->vdev_resilver_txg = txg; + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); - if (dsl_scan_resilvering(spa_get_dsl(spa)) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { - vdev_defer_resilver(newvd); + if (newvd->vdev_isspare) { + spa_spare_activate(newvd); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); + } + + newvd_isspare = newvd->vdev_isspare; + + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); + + /* + * Schedule the resilver or rebuild to restart in the future. + * We do this to ensure that dmu_sync-ed blocks have been + * stitched into the respective datasets. 
+ */ + if (rebuild) { + newvd->vdev_rebuild_txg = txg; + + vdev_rebuild(tvd); } else { - dsl_scan_restart_resilver(spa->spa_dsl_pool, - dtl_max_txg); + newvd->vdev_resilver_txg = txg; + + if (dsl_scan_resilvering(spa_get_dsl(spa)) && + spa_feature_is_enabled(spa, + SPA_FEATURE_RESILVER_DEFER)) { + vdev_defer_resilver(newvd); + } else { + dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg); + } } } @@ -7487,7 +7598,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || - vd->vdev_top->vdev_removing)) { + vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && @@ -7609,7 +7720,8 @@ spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, * which has completed but the thread is not exited. */ if (cmd_type == POOL_TRIM_START && - (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { + (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && @@ -8512,6 +8624,10 @@ spa_async_suspend(spa_t *spa) if (condense_thread != NULL) zthr_cancel(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_cancel(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); @@ -8538,6 +8654,10 @@ spa_async_resume(spa_t *spa) if (condense_thread != NULL) zthr_resume(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_resume(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); @@ -9343,6 +9463,27 @@ 
spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) != NULL) vdev_sync(vd, txg); + if (pass == 1) { + /* + * dsl_pool_sync() -> dp_sync_tasks may have dirtied + * the config. If that happens, this txg should not + * be a no-op. So we must sync the config to the MOS + * before checking for no-op. + * + * Note that when the config is dirty, it will + * be written to the MOS (i.e. the MOS will be + * dirtied) every time we call spa_sync_config_object() + * in this txg. Therefore we can't call this after + * dsl_pool_sync() every pass, because it would + * prevent us from converging, since we'd dirty + * the MOS every pass. + * + * Sync tasks can only be processed in pass 1, so + * there's no need to do this in later passes. + */ + spa_sync_config_object(spa, tx); + } + /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock @@ -10100,7 +10241,8 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: - if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) + *in_progress = vdev_rebuild_active(spa->spa_root_vdev); + if (*in_progress) break; zfs_fallthrough; case ZPOOL_WAIT_SCRUB: @@ -10115,6 +10257,12 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } + case ZPOOL_WAIT_RAIDZ_EXPAND: + { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING); + break; + } default: panic("unrecognized value for activity %d", activity); } diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index b588f7041..1efff47f8 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -465,6 +465,9 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx) if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); + if (spa->spa_raidz_expand != NULL) + 
return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + if (spa->spa_checkpoint_txg != 0) return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index afb01c0ef..c10c78ebf 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include "zfs_prop.h" @@ -305,13 +306,13 @@ vdev_derive_alloc_bias(const char *bias) * all children. This is what's used by anything other than RAID-Z. */ uint64_t -vdev_default_asize(vdev_t *vd, uint64_t psize) +vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { - csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } @@ -930,6 +931,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); + vd->vdev_rz_expanding = nvlist_exists(nv, + ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } @@ -1692,6 +1695,8 @@ vdev_probe_done(zio_t *zio) vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; + vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", + vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { @@ -1913,17 +1918,20 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) } /* - * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the "typical" blocksize. - * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, - * otherwise it would inconsistently account for existing bp's. + * Compute the raidz-deflation ratio. 
Note, we hard-code 128k (1 << 17) + * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE + * changed, this algorithm can not change, otherwise it would inconsistently + * account for existing bp's. We also hard-code txg 0 for the same reason + * since expanded RAIDZ vdevs can use a different asize for different birth + * txg's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / - (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> + SPA_MINBLOCKSHIFT); } } @@ -3228,32 +3236,43 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - return; + } else { + mutex_enter(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; + if (t == DTL_SCRUB) { + /* leaf vdevs only */ + continue; + } + if (t == DTL_PARTIAL) { + /* i.e. non-zero */ + minref = 1; + } else if (vdev_get_nparity(vd) != 0) { + /* RAIDZ, DRAID */ + minref = vdev_get_nparity(vd) + 1; + } else { + /* any kind of mirror */ + minref = vd->vdev_children; + } + space_reftree_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_reftree_add_map(&reftree, + cvd->vdev_dtl[s], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_reftree_generate_map(&reftree, + vd->vdev_dtl[t], minref); + space_reftree_destroy(&reftree); + } + mutex_exit(&vd->vdev_dtl_lock); } - mutex_enter(&vd->vdev_dtl_lock); - for (int t = 0; t < DTL_TYPES; t++) { - /* account for child's outage in parent's missing map */ - int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; - if (t == DTL_SCRUB) - continue; /* leaf vdevs only */ - if (t == DTL_PARTIAL) - minref = 1; /* i.e. 
non-zero */ - else if (vdev_get_nparity(vd) != 0) - minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ - else - minref = vd->vdev_children; /* any kind of mirror */ - space_reftree_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - mutex_enter(&cvd->vdev_dtl_lock); - space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); - mutex_exit(&cvd->vdev_dtl_lock); - } - space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); - space_reftree_destroy(&reftree); + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { + raidz_dtl_reassessed(vd); } - mutex_exit(&vd->vdev_dtl_lock); } /* @@ -3628,6 +3647,12 @@ vdev_load(vdev_t *vd) vdev_set_deflate_ratio(vd); + if (vd->vdev_ops == &vdev_raidz_ops) { + error = vdev_raidz_load(vd); + if (error != 0) + return (error); + } + /* * On spa_load path, grab the allocation bias from our zap */ @@ -4005,10 +4030,22 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } +/* + * Return the amount of space that should be (or was) allocated for the given + * psize (compressed block size) in the given TXG. Note that for expanded + * RAIDZ vdevs, the size allocated for older BP's may be larger. See + * vdev_raidz_asize(). 
+ */ +uint64_t +vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); +} + uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { - return (vd->vdev_ops->vdev_op_asize(vd, psize)); + return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* @@ -4174,9 +4211,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); - wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; @@ -5457,7 +5491,9 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + if ((vd->vdev_spa->spa_raidz_expand == NULL || + vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && + (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); @@ -6209,6 +6245,14 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; + case VDEV_PROP_RAIDZ_EXPANDING: + /* Only expose this for raidz */ + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_prop_add_list(outnvl, propname, + NULL, vd->vdev_rz_expanding, + ZPROP_SRC_NONE); + } + continue; /* Numeric Properites */ case VDEV_PROP_ALLOCATING: /* Leaf vdevs cannot have this property */ diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 307e2353d..ec961255f 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -577,8 +577,9 @@ vdev_draid_permute_id(vdev_draid_config_t *vdc, * i.e. vdev_draid_psize_to_asize(). 
*/ static uint64_t -vdev_draid_asize(vdev_t *vd, uint64_t psize) +vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { + (void) txg; vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_ashift; @@ -960,7 +961,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t io_size = abd_size; - uint64_t io_asize = vdev_draid_asize(vd, io_size); + uint64_t io_asize = vdev_draid_asize(vd, io_size, 0); uint64_t group = vdev_draid_offset_to_group(vd, io_offset); uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); @@ -1025,15 +1026,9 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, ASSERT3U(vdc->vdc_nparity, >, 0); - raidz_row_t *rr; - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP); - rr->rr_cols = groupwidth; - rr->rr_scols = groupwidth; + raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth); rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = vdc->vdc_nparity; - rr->rr_abd_empty = NULL; #ifdef ZFS_DEBUG rr->rr_offset = io_offset; rr->rr_size = io_size; @@ -1053,14 +1048,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c); rc->rc_offset = physical_offset; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; if (q == 0 && i >= bc) rc->rc_size = 0; @@ -1129,7 +1116,7 @@ vdev_draid_map_alloc(zio_t *zio) if (size < abd_size) { vdev_t *vd = zio->io_vd; - io_offset += vdev_draid_asize(vd, size); + io_offset += vdev_draid_asize(vd, size, 0); abd_offset += size; abd_size -= size; nrows++; @@ -1151,7 +1138,6 @@ vdev_draid_map_alloc(zio_t *zio) rm->rm_row[0] = rr[0]; if (nrows == 2) rm->rm_row[1] = rr[1]; - 
return (rm); } @@ -1783,7 +1769,7 @@ vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t asize = vdev_draid_asize(vd, psize); + uint64_t asize = vdev_draid_asize(vd, psize, 0); if (phys_birth == TXG_UNKNOWN) { /* @@ -1840,7 +1826,7 @@ vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_draid_asize(vd, rr->rr_size); + vdev_draid_asize(vd, rr->rr_size, 0); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index ffdcef197..5aaef1a69 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -48,7 +48,8 @@ static boolean_t vdev_initialize_should_stop(vdev_t *vd) { return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } static void @@ -67,7 +68,8 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; @@ -631,6 +633,7 @@ vdev_initialize(vdev_t *vd) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); vd->vdev_initialize_thread = thread_create(NULL, 0, @@ -791,13 +794,14 @@ vdev_initialize_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); 
vd->vdev_initialize_action_time = timestamp; - if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_initialize_load(vd)); } else if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_initialize_thread == NULL) { vdev_initialize(vd); } diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index a2e5524a8..e8f562a1a 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -142,6 +142,7 @@ #include #include #include +#include #include #include #include @@ -423,6 +424,13 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, sizeof (pcs) / sizeof (uint64_t)); } + + pool_raidz_expand_stat_t pres; + if (spa_raidz_expand_get_stats(spa, &pres) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, + sizeof (pres) / sizeof (uint64_t)); + } } static void @@ -1504,7 +1512,8 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) } struct ubl_cbdata { - uberblock_t *ubl_ubbest; /* Best uberblock */ + uberblock_t ubl_latest; /* Most recent uberblock */ + uberblock_t *ubl_ubbest; /* Best uberblock (w/r/t max_txg) */ vdev_t *ubl_vd; /* vdev associated with the above */ }; @@ -1521,6 +1530,9 @@ vdev_uberblock_load_done(zio_t *zio) if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); + if (vdev_uberblock_compare(ub, &cbp->ubl_latest) > 0) { + cbp->ubl_latest = *ub; + } if (ub->ub_txg <= spa->spa_load_max_txg && vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { /* @@ -1578,10 +1590,10 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) ASSERT(config); memset(ub, 0, sizeof 
(uberblock_t)); + memset(&cb, 0, sizeof (cb)); *config = NULL; cb.ubl_ubbest = ub; - cb.ubl_vd = NULL; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); @@ -1598,6 +1610,22 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != + cb.ubl_latest.ub_raidz_reflow_info) { + vdev_dbgmsg(cb.ubl_vd, + "spa=%s best uberblock (txg=%llu info=0x%llx) " + "has different raidz_reflow_info than latest " + "uberblock (txg=%llu info=0x%llx)", + spa->spa_name, + (u_longlong_t)ub->ub_txg, + (u_longlong_t)ub->ub_raidz_reflow_info, + (u_longlong_t)cb.ubl_latest.ub_txg, + (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info); + memset(ub, 0, sizeof (uberblock_t)); + spa_config_exit(spa, SCL_ALL, FTAG); + return; + } + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); if (*config == NULL && spa->spa_extreme_rewind) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " @@ -1719,8 +1747,23 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, vd->vdev_copy_uberblocks = B_FALSE; } + /* + * We chose a slot based on the txg. If this uberblock has a special + * RAIDZ expansion state, then it is essentially an update of the + * current uberblock (it has the same txg). However, the current + * state is committed, so we want to write it to a different slot. If + * we overwrote the same slot, and we lose power during the uberblock + * write, and the disk does not do single-sector overwrites + * atomically (even though it is required to - i.e. we should see + * either the old or the new uberblock), then we could lose this + * txg's uberblock. Rewinding to the previous txg's uberblock may not + * be possible because RAIDZ expansion may have already overwritten + * some of the data, so we need the progress indicator in the + * uberblock. + */ int m = spa_multihost(vd->vdev_spa) ? 
MMP_BLOCKS_PER_LABEL : 0; - int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); + int n = (ub->ub_txg - (RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID)) % + (VDEV_UBERBLOCK_COUNT(vd) - m); /* Copy the uberblock_t into the ABD */ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); @@ -1737,7 +1780,7 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, } /* Sync the uberblocks to all vdevs in svd[] */ -static int +int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { spa_t *spa = svd[0]->vdev_spa; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 3445fa9d3..9d0b8763f 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -27,15 +27,22 @@ #include #include +#include +#include #include +#include #include #include +#include #include +#include #include #include #include #include #include +#include +#include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -135,6 +142,237 @@ VDEV_RAIDZ_64MUL_2((x), mask); \ } + +/* + * Big Theory Statement for how a RAIDZ VDEV is expanded + * + * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion + * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs + * that have been previously expanded can be expanded again. + * + * The RAIDZ VDEV must be healthy (must be able to write to all the drives in + * the VDEV) when an expansion starts. And the expansion will pause if any + * disk in the VDEV fails, and resume once the VDEV is healthy again. All other + * operations on the pool can continue while an expansion is in progress (e.g. + * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim, + * and zpool initialize which can't be run during an expansion. Following a + * reboot or export/import, the expansion resumes where it left off. 
+ * + * == Reflowing the Data == + * + * The expansion involves reflowing (copying) the data from the current set + * of disks to spread it across the new set which now has one more disk. This + * reflow operation is similar to reflowing text when the column width of a + * text editor window is expanded. The text doesn’t change but the location of + * the text changes to accommodate the new width. An example reflow result for + * a 4-wide RAIDZ1 to a 5-wide is shown below. + * + * Reflow End State + * Each letter indicates a parity group (logical stripe) + * + * Before expansion After Expansion + * D1 D2 D3 D4 D1 D2 D3 D4 D5 + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | A | A | A | A | | A | A | A | A | B | + * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | B | B | C | C | | B | C | C | C | C | + * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | C | C | D | D | | D | D | E | E | E | + * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | E | E | E | E | --> | E | F | F | G | G | + * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | F | F | G | G | | G | G | H | H | H | + * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | G | G | H | H | | H | I | I | J | J | + * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | H | H | I | I | | J | J | | | K | + * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| + * +------+------+------+------+ +------+------+------+------+------+ + * + * This 
reflow approach has several advantages. There is no need to read or + * modify the block pointers or recompute any block checksums. The reflow + * doesn’t need to know where the parity sectors reside. We can read and write + * data sequentially and the copy can occur in a background thread in open + * context. The design also allows for fast discovery of what data to copy. + * + * The VDEV metaslabs are processed, one at a time, to copy the block data to + * have it flow across all the disks. The metaslab is disabled for allocations + * during the copy. As an optimization, we only copy the allocated data which + * can be determined by looking at the metaslab range tree. During the copy we + * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still + * need to be able to survive losing parity count disks). This means we + * cannot overwrite data during the reflow that would be needed if a disk is + * lost. + * + * After the reflow completes, all newly-written blocks will have the new + * layout, i.e., they will have the parity to data ratio implied by the new + * number of disks in the RAIDZ group. Even though the reflow copies all of + * the allocated space (data and parity), it is only rearranged, not changed. + * + * This act of reflowing the data has a few implications about blocks + * that were written before the reflow completes: + * + * - Old blocks will still use the same amount of space (i.e., they will have + * the parity to data ratio implied by the old number of disks in the RAIDZ + * group). + * - Reading old blocks will be slightly slower than before the reflow, for + * two reasons. First, we will have to read from all disks in the RAIDZ + * VDEV, rather than being able to skip the children that contain only + * parity of this block (because the data of a single block is now spread + * out across all the disks). Second, in most cases there will be an extra + * bcopy, needed to rearrange the data back to its original layout in memory. 
+ * + * == Scratch Area == + * + * As we copy the block data, we can only progress to the point that writes + * will not overlap with blocks whose progress has not yet been recorded on + * disk. Since partially-copied rows are always read from the old location, + * we need to stop one row before the sector-wise overlap, to prevent any + * row-wise overlap. For example, in the diagram above, when we reflow sector + * B6 it will overwrite the original location for B5. + * + * To get around this, a scratch space is used so that we can start copying + * without risking data loss by overlapping the row. As an added benefit, it + * improves performance at the beginning of the reflow, but that small perf + * boost wouldn't be worth the complexity on its own. + * + * Ideally we want to copy at least 2 * (new_width)^2 so that we have a + * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max + * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice + * the widths will likely be single digits so we can get a substantial chunk + * size using only a few MB of scratch per disk. + * + * The scratch area is persisted to disk which holds a large amount of reflowed + * state. We can always read the partially written stripes when a disk fails or + * the copy is interrupted (crash) during the initial copying phase and also + * get past a small chunk size restriction. At a minimum, the scratch space + * must be large enough to get us to the point that one row does not overlap + * itself when moved (i.e. new_width^2). But going larger is even better. We + * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels + * as our scratch space to handle overwriting the initial part of the VDEV. + * + * 0 256K 512K 4M + * +------+------+-----------------------+----------------------------- + * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... 
+ * | L0 | L1 | Reserved | (Metaslabs) + * +------+------+-----------------------+------------------------------- + * Scratch Area + * + * == Reflow Progress Updates == + * After the initial scratch-based reflow, the expansion process works + * similarly to device removal. We create a new open context thread which + * reflows the data, and periodically kicks off sync tasks to update logical + * state. In this case, state is the committed progress (offset of next data + * to copy). We need to persist the completed offset on disk, so that if we + * crash we know which format each VDEV offset is in. + * + * == Time Dependent Geometry == + * + * In non-expanded RAIDZ, blocks are read from disk in a column by column + * fashion. For a multi-row block, the second sector is in the first column + * not in the second column. This allows us to issue full reads for each + * column directly into the request buffer. The block data is thus laid out + * sequentially in a column-by-column fashion. + * + * For example, in the before expansion diagram above, one logical block might + * be sectors G19-H26. The parity is in G19,H23; and the data is in + * G20,H24,G21,H25,G22,H26. + * + * After a block is reflowed, the sectors that were all in the original column + * data can now reside in different columns. When reading from an expanded + * VDEV, we need to know the logical stripe width for each block so we can + * reconstitute the block’s data after the reads are completed. Likewise, + * when we perform the combinatorial reconstruction we need to know the + * original width so we can retry combinations from the past layouts. + * + * Time dependent geometry is what we call having blocks with different layouts + * (stripe widths) in the same VDEV. This time-dependent geometry uses the + * block’s birth time (+ the time expansion ended) to establish the correct + * width for a given block. 
After an expansion completes, we record the time + * for blocks written with a particular width (geometry). + * + * == On Disk Format Changes == + * + * New pool feature flag, 'raidz_expansion' whose reference count is the number + * of RAIDZ VDEVs that have been expanded. + * + * The blocks on expanded RAIDZ VDEV can have different logical stripe widths. + * + * Since the uberblock can point to arbitrary blocks, which might be on the + * expanding RAIDZ, and might or might not have been expanded, we need to know + * which way a block is laid out before reading it. This info is the next + * offset that needs to be reflowed and we persist that in the uberblock, in + * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. + * After the expansion is complete, we then use the raidz_expand_txgs array + * (see below) to determine how to read a block and the ub_raidz_reflow_info + * field is no longer required. + * + * The uberblock's ub_raidz_reflow_info field also holds the scratch space + * state (i.e., active or not) which is also required before reading a block + * during the initial phase of reflowing the data. + * + * The top-level RAIDZ VDEV has two new entries in the nvlist: + * + * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here + * and used after the expansion is complete to + * determine how to read a raidz block + * 'raidz_expanding' boolean: present during reflow and removed after completion + * used during a spa import to resume an unfinished + * expansion + * + * And finally the VDEVs top zap adds the following informational entries: + * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE + * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME + * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME + * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED + */ + +/* + * For testing only: pause the raidz expansion after reflowing this amount. 
+ * (accessed by ZTS and ztest) + */ +#ifdef _KERNEL +static +#endif /* _KERNEL */ +unsigned long raidz_expand_max_reflow_bytes = 0; + +/* + * For testing only: pause the raidz expansion at a certain point. + */ +uint_t raidz_expand_pause_point = 0; + +/* + * Maximum amount of copy io's outstanding at once. + */ +static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; + +/* + * Apply raidz map abds aggregation if the number of rows in the map is equal + * or greater than the value below. + */ +static unsigned long raidz_io_aggregate_rows = 4; + +/* + * Automatically start a pool scrub when a RAIDZ expansion completes in + * order to verify the checksums of all blocks which have been copied + * during the expansion. Automatic scrubbing is enabled by default and + * is strongly recommended. + */ +static int zfs_scrub_after_expand = 1; + static void vdev_raidz_row_free(raidz_row_t *rr) { @@ -159,6 +397,17 @@ vdev_raidz_map_free(raidz_map_t *rm) for (int i = 0; i < rm->rm_nrows; i++) vdev_raidz_row_free(rm->rm_row[i]); + if (rm->rm_nphys_cols) { + for (int i = 0; i < rm->rm_nphys_cols; i++) { + if (rm->rm_phys_col[i].rc_abd != NULL) + abd_free(rm->rm_phys_col[i].rc_abd); + } + + kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * + rm->rm_nphys_cols); + } + + ASSERT3P(rm->rm_lr, ==, NULL); kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } @@ -170,10 +419,37 @@ vdev_raidz_map_free_vsd(zio_t *zio) vdev_raidz_map_free(rm); } +static int +vdev_raidz_reflow_compare(const void *x1, const void *x2) +{ + const reflow_node_t *l = x1; + const reflow_node_t *r = x2; + + return (TREE_CMP(l->re_txg, r->re_txg)); +} + const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, }; +raidz_row_t * +vdev_raidz_row_alloc(int cols) +{ + raidz_row_t *rr = + kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); + + rr->rr_cols = cols; + rr->rr_scols = cols; + + for (int c = 0; c < cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + 
rc->rc_shadow_devidx = INT_MAX; + rc->rc_shadow_offset = UINT64_MAX; + rc->rc_allow_repair = 1; + } + return (rr); +} + static void vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) { @@ -302,7 +578,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << ashift; - uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t acols, scols; raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); @@ -312,22 +588,22 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. */ - q = s / (dcols - nparity); + uint64_t q = s / (dcols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ - r = s - q * (dcols - nparity); + uint64_t r = s - q * (dcols - nparity); /* The number of "big columns" - those which contain remainder data. */ - bc = (r == 0 ? 0 : r + nparity); + uint64_t bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ - tot = s + nparity * (q + (r == 0 ? 0 : 1)); + uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* * acols: The columns that will be accessed. 
@@ -343,43 +619,28 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } ASSERT3U(acols, <=, scols); - - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); + rr = vdev_raidz_row_alloc(scols); rm->rm_row[0] = rr; - rr->rr_cols = acols; - rr->rr_scols = scols; rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = nparity; - rr->rr_abd_empty = NULL; - rr->rr_nempty = 0; #ifdef ZFS_DEBUG rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; #endif - asize = 0; + uint64_t asize = 0; - for (c = 0; c < scols; c++) { + for (uint64_t c = 0; c < scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - col = f + c; - coff = o; + uint64_t col = f + c; + uint64_t coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } rc->rc_devidx = col; rc->rc_offset = coff; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; if (c >= acols) rc->rc_size = 0; @@ -419,13 +680,12 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { - devidx = rr->rr_col[0].rc_devidx; + uint64_t devidx = rr->rr_col[0].rc_devidx; o = rr->rr_col[0].rc_offset; rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[1].rc_devidx = devidx; rr->rr_col[1].rc_offset = o; - if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } @@ -435,7 +695,338 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } else { vdev_raidz_map_alloc_read(zio, rm); } + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + return (rm); +} + +/* + * Everything before reflow_offset_synced should have been moved to the new + * location (read and write completed). 
However, this may not yet be reflected + * in the on-disk format (e.g. raidz_reflow_sync() has been called but the + * uberblock has not yet been written). If reflow is not in progress, + * reflow_offset_synced should be UINT64_MAX. For each row, if the row is + * entirely before reflow_offset_synced, it will come from the new location. + * Otherwise this row will come from the old location. Therefore, rows that + * straddle the reflow_offset_synced will come from the old location. + * + * For writes, reflow_offset_next is the next offset to copy. If a sector has + * been copied, but not yet reflected in the on-disk progress + * (reflow_offset_synced), it will also be written to the new (already copied) + * offset. + */ +noinline raidz_map_t * +vdev_raidz_map_alloc_expanded(zio_t *zio, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity, uint64_t reflow_offset_synced, + uint64_t reflow_offset_next, boolean_t use_scratch) +{ + abd_t *abd = zio->io_abd; + uint64_t offset = zio->io_offset; + uint64_t size = zio->io_size; + + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + uint64_t q = s / (logical_cols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + uint64_t r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + uint64_t bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + uint64_t tot = s + nparity * (q + (r == 0 ? 
0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + rm->rm_skipstart = bc; + uint64_t asize = 0; + + for (uint64_t row = 0; row < rows; row++) { + boolean_t row_use_scratch = B_FALSE; + raidz_row_t *rr = vdev_raidz_row_alloc(cols); + rm->rm_row[row] = rr; + + /* The starting RAIDZ (parent) vdev sector of the row. */ + uint64_t b = (offset >> ashift) + row * logical_cols; + + /* + * If we are in the middle of a reflow, and the copying has + * not yet completed for any part of this row, then use the + * old location of this row. Note that reflow_offset_synced + * reflects the i/o that's been completed, because it's + * updated by a synctask, after zio_wait(spa_txg_zio[]). + * This is sufficient for our check, even if that progress + * has not yet been recorded to disk (reflected in + * spa_ubsync). Also note that we consider the last row to + * be "full width" (`cols`-wide rather than `bc`-wide) for + * this calculation. This causes a tiny bit of unnecessary + * double-writes but is safe and simpler to calculate. + */ + int row_phys_cols = physical_cols; + if (b + cols > reflow_offset_synced >> ashift) + row_phys_cols--; + else if (use_scratch) + row_use_scratch = B_TRUE; + + /* starting child of this row */ + uint64_t child_id = b % row_phys_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / row_phys_cols) << ashift; + + /* + * Note, rr_cols is the entire width of the block, even + * if this row is shorter. This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). 
+ * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". + */ + rr->rr_firstdatacol = nparity; +#ifdef ZFS_DEBUG + /* + * note: rr_size is PSIZE, not ASIZE + */ + rr->rr_offset = b << ashift; + rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; +#endif + + for (int c = 0; c < rr->rr_cols; c++, child_id++) { + if (child_id >= row_phys_cols) { + child_id -= row_phys_cols; + child_offset += 1ULL << ashift; + } + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_devidx = child_id; + rc->rc_offset = child_offset; + + /* + * Get this from the scratch space if appropriate. + * This only happens if we crashed in the middle of + * raidz_reflow_scratch_sync() (while it's running, + * the rangelock prevents us from doing concurrent + * io), and even then only during zpool import or + * when the pool is imported readonly. + */ + if (row_use_scratch) + rc->rc_offset -= VDEV_BOOT_SIZE; + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rc->rc_size = 1ULL << ashift; + + /* + * Parity sectors' rc_abd's are set below + * after determining if this is an aggregation. + */ + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end of the block (even including + * skip sectors). This sector is part of the + * map so that we have full rows for p/q parity + * generation. 
+ */ + rc->rc_size = 0; + rc->rc_abd = NULL; + } else { + /* "data column" (col excluding parity) */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } + rc->rc_size = 1ULL << ashift; + rc->rc_abd = abd_get_offset_struct( + &rc->rc_abdstruct, abd, off << ashift, + rc->rc_size); + } + + if (rc->rc_size == 0) + continue; + + /* + * If any part of this row is in both old and new + * locations, the primary location is the old + * location. If this sector was already copied to the + * new location, we need to also write to the new, + * "shadow" location. + * + * Note, `row_phys_cols != physical_cols` indicates + * that the primary location is the old location. + * `b+c < reflow_offset_next` indicates that the copy + * to the new location has been initiated. We know + * that the copy has completed because we have the + * rangelock, which is held exclusively while the + * copy is in progress. + */ + if (row_use_scratch || + (row_phys_cols != physical_cols && + b + c < reflow_offset_next >> ashift)) { + rc->rc_shadow_devidx = (b + c) % physical_cols; + rc->rc_shadow_offset = + ((b + c) / physical_cols) << ashift; + if (row_use_scratch) + rc->rc_shadow_offset -= VDEV_BOOT_SIZE; + } + + asize += rc->rc_size; + } + + /* + * See comment in vdev_raidz_map_alloc() + */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + + int devidx0 = rr->rr_col[0].rc_devidx; + uint64_t offset0 = rr->rr_col[0].rc_offset; + int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; + uint64_t shadow_offset0 = + rr->rr_col[0].rc_shadow_offset; + + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[0].rc_shadow_devidx = + rr->rr_col[1].rc_shadow_devidx; + rr->rr_col[0].rc_shadow_offset = + rr->rr_col[1].rc_shadow_offset; + + 
rr->rr_col[1].rc_devidx = devidx0; + rr->rr_col[1].rc_offset = offset0; + rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; + rr->rr_col[1].rc_shadow_offset = shadow_offset0; + } + } + ASSERT3U(asize, ==, tot << ashift); + + /* + * Determine if the block is contiguous, in which case we can use + * an aggregation. + */ + if (rows >= raidz_io_aggregate_rows) { + rm->rm_nphys_cols = physical_cols; + rm->rm_phys_col = + kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, + KM_SLEEP); + + /* + * Determine the aggregate io's offset and size, and check + * that the io is contiguous. + */ + for (int i = 0; + i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + + if (rc->rc_size == 0) + continue; + + if (prc->rc_size == 0) { + ASSERT0(prc->rc_offset); + prc->rc_offset = rc->rc_offset; + } else if (prc->rc_offset + prc->rc_size != + rc->rc_offset) { + /* + * This block is not contiguous and + * therefore can't be aggregated. + * This is expected to be rare, so + * the cost of allocating and then + * freeing rm_phys_col is not + * significant. + */ + kmem_free(rm->rm_phys_col, + sizeof (raidz_col_t) * + rm->rm_nphys_cols); + rm->rm_phys_col = NULL; + rm->rm_nphys_cols = 0; + break; + } + prc->rc_size += rc->rc_size; + } + } + } + if (rm->rm_phys_col != NULL) { + /* + * Allocate aggregate ABD's. + */ + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + + prc->rc_devidx = i; + + if (prc->rc_size == 0) + continue; + + prc->rc_abd = + abd_alloc_linear(rm->rm_phys_col[i].rc_size, + B_FALSE); + } + + /* + * Point the parity abd's into the aggregate abd's. 
+ */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_abd = + abd_get_offset_struct(&rc->rc_abdstruct, + prc->rc_abd, + rc->rc_offset - prc->rc_offset, + rc->rc_size); + } + } + } else { + /* + * Allocate new abd's for the parity sectors. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = + abd_alloc_linear(rc->rc_size, + B_TRUE); + } + } + } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); @@ -453,11 +1044,11 @@ vdev_raidz_p_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && !pqr->q && !pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++) + for (int i = 0; i < cnt; i++, src++, pqr->p++) *pqr->p ^= *src; return (0); @@ -469,11 +1060,11 @@ vdev_raidz_pq_func(void *buf, size_t size, void *private) struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && !pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; @@ -488,11 +1079,11 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { *pqr->p ^= *src; 
VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; @@ -618,7 +1209,15 @@ vdev_raidz_generate_parity_pqr(raidz_row_t *rr) void vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { - ASSERT3U(rr->rr_cols, !=, 0); + if (rr->rr_cols == 0) { + /* + * We are handling this block one row at a time (because + * this block has a different logical vs physical width, + * due to RAIDZ expansion), and this is a pad-only row, + * which has no parity. + */ + return; + } /* Generate using the new math implementation */ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) @@ -770,6 +1369,9 @@ vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) int x = tgts[0]; abd_t *dst, *src; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); + ASSERT3U(ntgts, ==, 1); ASSERT3U(x, >=, rr->rr_firstdatacol); ASSERT3U(x, <, rr->rr_cols); @@ -802,6 +1404,9 @@ vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) int c, exp; abd_t *dst, *src; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); + ASSERT(ntgts == 1); ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); @@ -848,6 +1453,9 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) int y = tgts[1]; abd_t *xd, *yd; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); + ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rr->rr_firstdatacol); @@ -1295,11 +1903,14 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) abd_t **bufs = NULL; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate * temporary linear ABDs if any non-linear ABDs are found. 
*/ for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { + ASSERT(rr->rr_col[i].rc_abd != NULL); if (!abd_is_linear(rr->rr_col[i].rc_abd)) { bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE); @@ -1427,10 +2038,23 @@ vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", + rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, + (int)rr->rr_missingparity); + } + nbadparity = rr->rr_firstdatacol; nbaddata = rr->rr_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rr->rr_cols; c++) { + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " + "offset=%llx error=%u)", + rr, c, (int)rr->rr_col[c].rc_devidx, + (long long)rr->rr_col[c].rc_offset, + (int)rr->rr_col[c].rc_error); + } if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; @@ -1537,8 +2161,15 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *physical_ashift, cvd->vdev_physical_ashift); } - *asize *= vd->vdev_children; - *max_asize *= vd->vdev_children; + if (vd->vdev_rz_expanding) { + *asize *= vd->vdev_children - 1; + *max_asize *= vd->vdev_children - 1; + + vd->vdev_min_asize = *asize; + } else { + *asize *= vd->vdev_children; + *max_asize *= vd->vdev_children; + } if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; @@ -1557,19 +2188,71 @@ vdev_raidz_close(vdev_t *vd) } } +/* + * Return the logical width to use, given the txg in which the allocation + * happened. Note that BP_PHYSICAL_BIRTH() is usually the txg in which the + * BP was allocated. Remapped BP's (that were relocated due to device + * removal, see remap_blkptr_cb()), will have a more recent + * BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can + * ignore these because they can't be on RAIDZ (device removal doesn't + * support RAIDZ). 
+ */ static uint64_t -vdev_raidz_asize(vdev_t *vd, uint64_t psize) +vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) +{ + reflow_node_t lookup = { + .re_txg = txg, + }; + avl_index_t where; + + uint64_t width; + mutex_enter(&vdrz->vd_expand_lock); + reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); + if (re != NULL) { + width = re->re_logical_width; + } else { + re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); + if (re != NULL) + width = re->re_logical_width; + else + width = vdrz->vd_original_width; + } + mutex_exit(&vdrz->vd_expand_lock); + return (width); +} + +/* + * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated + * more space due to the lower data-to-parity ratio. In this case it's + * important to pass in the correct txg. Note that vdev_gang_header_asize() + * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, + * regardless of txg. This is assured because for a single data sector, we + * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 
+ */ +static uint64_t +vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vdrz->vd_logical_width; + uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; + cols = vdev_raidz_get_logical_width(vdrz, txg); + asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); asize = roundup(asize, nparity + 1) << ashift; +#ifdef ZFS_DEBUG + uint64_t asize_new = ((psize - 1) >> ashift) + 1; + uint64_t ncols_new = vdrz->vd_physical_width; + asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / + (ncols_new - nparity)); + asize_new = roundup(asize_new, nparity + 1) << ashift; + VERIFY3U(asize_new, <=, asize); +#endif + return (asize); } @@ -1596,21 +2279,37 @@ vdev_raidz_child_done(zio_t *zio) } static void -vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) +vdev_raidz_shadow_child_done(zio_t *zio) { -#ifdef ZFS_DEBUG - vdev_t *tvd = vd->vdev_top; + raidz_col_t *rc = zio->io_private; + rc->rc_shadow_error = zio->io_error; +} + +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) +{ + (void) rm; +#ifdef ZFS_DEBUG range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(vd, rr->rr_size); + vdev_raidz_asize(zio->io_vd, rr->rr_size, + BP_PHYSICAL_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; - vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); ASSERT(vdev_xlate_is_empty(&remain_rs)); + if (vdev_xlate_is_empty(&physical_rs)) { + /* + * If we are in the middle of expansion, the + * physical->logical mapping is changing so vdev_xlate() + * can't give us a reliable answer. 
+ */ + return; + } ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* @@ -1621,7 +2320,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) */ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + - rc->rc_size + (1 << tvd->vdev_ashift)); + rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); } else { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); } @@ -1629,7 +2328,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) } static void -vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; @@ -1641,31 +2340,66 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; /* Verify physical to logical translation */ - vdev_raidz_io_verify(vd, rr, c); + vdev_raidz_io_verify(zio, rm, rr, c); - if (rc->rc_size > 0) { - ASSERT3P(rc->rc_abd, !=, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, - abd_get_size(rc->rc_abd), zio->io_type, - zio->io_priority, 0, vdev_raidz_child_done, rc)); - } else { - /* - * Generate optional write for skip sector to improve - * aggregation contiguity. 
- */ - ASSERT3P(rc->rc_abd, ==, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, NULL, 1ULL << ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, - NULL)); + if (rc->rc_size == 0) + continue; + + ASSERT3U(rc->rc_offset + rc->rc_size, <, + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + + ASSERT3P(rc->rc_abd, !=, NULL); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), zio->io_type, + zio->io_priority, 0, vdev_raidz_child_done, rc)); + + if (rc->rc_shadow_devidx != INT_MAX) { + vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; + + ASSERT3U( + rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, + cvd2->vdev_psize - VDEV_LABEL_END_SIZE); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, + rc->rc_shadow_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), + zio->io_type, zio->io_priority, 0, + vdev_raidz_shadow_child_done, rc)); } } } +/* + * Generate optional I/Os for skip sectors to improve aggregation contiguity. + * This only works for vdev_raidz_map_alloc() (not _expanded()). 
+ */ static void -vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) +raidz_start_skip_writes(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + raidz_map_t *rm = zio->io_vsd; + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + for (int c = 0; c < rr->rr_scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + if (rc->rc_size != 0) + continue; + ASSERT3P(rc->rc_abd, ==, NULL); + + ASSERT3U(rc->rc_offset, <, + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, + NULL, 1ULL << ashift, zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } +} + +static void +vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) { vdev_t *vd = zio->io_vd; @@ -1697,7 +2431,8 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_skipped = 1; continue; } - if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + if (forceparity || + c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, @@ -1707,6 +2442,56 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) } } +static void +vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) +{ + vdev_t *vd = zio->io_vd; + + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + if (prc->rc_size == 0) + continue; + + ASSERT3U(prc->rc_devidx, ==, i); + vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { + prc->rc_error = SET_ERROR(ENXIO); + prc->rc_tried = 1; /* don't even try */ + prc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + prc->rc_error = SET_ERROR(ESTALE); + prc->rc_skipped = 1; + continue; + } + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + prc->rc_offset, prc->rc_abd, 
prc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, prc)); + } +} + +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) +{ + /* + * If there are multiple rows, we will be hitting + * all disks, so go ahead and read the parity so + * that we are reading in decent size chunks. + */ + boolean_t forceparity = rm->rm_nrows > 1; + + if (rm->rm_phys_col) { + vdev_raidz_io_start_read_phys_cols(zio, rm); + } else { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_io_start_read_row(zio, rr, forceparity); + } + } +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1730,24 +2515,83 @@ vdev_raidz_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_raidz_t *vdrz = vd->vdev_tsd; + raidz_map_t *rm; + + uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, + BP_PHYSICAL_BIRTH(zio->io_bp)); + if (logical_width != vdrz->vd_physical_width) { + zfs_locked_range_t *lr = NULL; + uint64_t synced_offset = UINT64_MAX; + uint64_t next_offset = UINT64_MAX; + boolean_t use_scratch = B_FALSE; + /* + * Note: when the expansion is completing, we set + * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) + * in a later txg than when we last update spa_ubsync's state + * (see the end of spa_raidz_expand_thread()). Therefore we + * may see vre_state!=SCANNING before + * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected + * on disk, but the copying progress has been synced to disk + * (and reflected in spa_ubsync). In this case it's fine to + * treat the expansion as completed, since if we crash there's + * no additional copying to do. 
+ */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, + &vdrz->vn_vre); + lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, + zio->io_offset, zio->io_size, RL_READER); + use_scratch = + (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == + RRSS_SCRATCH_VALID); + synced_offset = + RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); + next_offset = vdrz->vn_vre.vre_offset; + /* + * If we haven't resumed expanding since importing the + * pool, vre_offset won't have been set yet. In + * this case the next offset to be copied is the same + * as what was synced. + */ + if (next_offset == UINT64_MAX) { + next_offset = synced_offset; + } + } + if (use_scratch) { + zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" + "%lld next_offset=%lld use_scratch=%u", + zio, + zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", + (long long)zio->io_offset, + (long long)synced_offset, + (long long)next_offset, + use_scratch); + } + + rm = vdev_raidz_map_alloc_expanded(zio, + tvd->vdev_ashift, vdrz->vd_physical_width, + logical_width, vdrz->vd_nparity, + synced_offset, next_offset, use_scratch); + rm->rm_lr = lr; + } else { + rm = vdev_raidz_map_alloc(zio, + tvd->vdev_ashift, logical_width, vdrz->vd_nparity); + } + rm->rm_original_width = vdrz->vd_original_width; - raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, - vdrz->vd_logical_width, vdrz->vd_nparity); zio->io_vsd = rm; zio->io_vsd_ops = &vdev_raidz_vsd_ops; - - /* - * Until raidz expansion is implemented all maps for a raidz vdev - * contain a single row. 
- */ - ASSERT3U(rm->rm_nrows, ==, 1); - raidz_row_t *rr = rm->rm_row[0]; - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_write(zio, rm->rm_row[i]); + } + + if (logical_width == vdrz->vd_physical_width) { + raidz_start_skip_writes(zio); + } } else { ASSERT(zio->io_type == ZIO_TYPE_READ); - vdev_raidz_io_start_read(zio, rr); + vdev_raidz_io_start_read(zio, rm); } zio_execute(zio); @@ -1847,6 +2691,8 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) continue; if (abd_cmp(orig[c], rc->rc_abd) != 0) { + zfs_dbgmsg("found error on col=%u devidx=%u off %llx", + c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); vdev_raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -1862,8 +2708,10 @@ vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rr->rr_cols; c++) + for (int c = 0; c < rr->rr_cols; c++) { error = zio_worst_error(error, rr->rr_col[c].rc_error); + error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); + } return (error); } @@ -1929,6 +2777,10 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) continue; } + zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " + "offset=%llx", + zio, c, rc->rc_devidx, (long long)rc->rc_offset); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, @@ -1938,6 +2790,42 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } + + /* + * Scrub or resilver i/o's: overwrite any shadow locations with the + * good data. This ensures that if we've already copied this sector, + * it will be corrected if it was damaged. This writes more than is + * necessary, but since expansion is paused during scrub/resilver, at + * most a single row will have a shadow location. 
+ */ + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + + if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; + + /* + * Note: We don't want to update the repair stats + * because that would incorrectly indicate that there + * was bad data to repair, which we aren't sure about. + * By clearing the SCAN_THREAD flag, we prevent this + * from happening, despite having the REPAIR flag set. + * We need to set SELF_HEAL so that this i/o can't be + * bypassed by zio_vdev_io_start(). + */ + zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, + rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, + NULL, NULL); + cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; + zio_nowait(cio); + } + } } static void @@ -1956,6 +2844,43 @@ raidz_restore_orig_data(raidz_map_t *rm) } } +/* + * During raidz_reconstruct() for expanded VDEV, we need special consideration + * for failure simulations. See note in raidz_reconstruct() on simulating + * failure of a pre-expansion device. + * + * Treating logical child i as failed, return TRUE if the given column should + * be treated as failed. The idea of logical children allows us to imagine + * that a disk silently failed before a RAIDZ expansion (reads from this disk + * succeed but return the wrong data). Since the expansion doesn't verify + * checksums, the incorrect data will be moved to new locations spread among + * the children (going diagonally across them). + * + * Higher "logical child failures" (values of `i`) indicate these + * "pre-expansion failures". 
The first physical_width values imagine that a + * current child failed; the next physical_width-1 values imagine that a + * child failed before the most recent expansion; the next physical_width-2 + * values imagine a child failed in the expansion before that, etc. + */ +static boolean_t +raidz_simulate_failure(int physical_width, int original_width, int ashift, + int i, raidz_col_t *rc) +{ + uint64_t sector_id = + physical_width * (rc->rc_offset >> ashift) + + rc->rc_devidx; + + for (int w = physical_width; w >= original_width; w--) { + if (i < w) { + return (sector_id % w == i); + } else { + i -= w; + } + } + ASSERT(!"invalid logical child id"); + return (B_FALSE); +} + /* * returns EINVAL if reconstruction of the block will not be possible * returns ECKSUM if this specific reconstruction failed @@ -1965,6 +2890,15 @@ static int raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) { raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? 
+ rm->rm_original_width : physical_width; + int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; + + if (dbgmsg) { + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " + "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); + } /* Reconstruct each row */ for (int r = 0; r < rm->rm_nrows; r++) { @@ -1974,6 +2908,9 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) int dead = 0; int dead_data = 0; + if (dbgmsg) + zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); + for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; ASSERT0(rc->rc_need_orig_restore); @@ -1986,7 +2923,10 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) if (rc->rc_size == 0) continue; for (int lt = 0; lt < ntgts; lt++) { - if (rc->rc_devidx == ltgts[lt]) { + if (raidz_simulate_failure(physical_width, + original_width, + zio->io_vd->vdev_top->vdev_ashift, + ltgts[lt], rc)) { if (rc->rc_orig_data == NULL) { rc->rc_orig_data = abd_alloc_linear( @@ -1999,13 +2939,37 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) dead++; if (c >= nparity) dead_data++; - my_tgts[t++] = c; + /* + * Note: simulating failure of a + * pre-expansion device can hit more + * than one column, in which case we + * might try to simulate more failures + * than can be reconstructed, which is + * also more than the size of my_tgts. + * This check prevents accessing past + * the end of my_tgts. The "dead > + * nparity" check below will fail this + * reconstruction attempt. 
+ */ + if (t < VDEV_RAIDZ_MAXPARITY) { + my_tgts[t++] = c; + if (dbgmsg) { + zfs_dbgmsg("simulating " + "failure of col %u " + "devidx %u", c, + (int)rc->rc_devidx); + } + } break; } } } if (dead > nparity) { /* reconstruction not possible */ + if (dbgmsg) { + zfs_dbgmsg("reconstruction not possible; " + "too many failures"); + } raidz_restore_orig_data(rm); return (EINVAL); } @@ -2049,11 +3013,19 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) zio_checksum_verified(zio); + if (dbgmsg) { + zfs_dbgmsg("reconstruction successful " + "(checksum verified)"); + } return (0); } /* Reconstruction failed - restore original data */ raidz_restore_orig_data(rm); + if (dbgmsg) { + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " + "failed", zio); + } return (ECKSUM); } @@ -2068,7 +3040,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * The order that we find the various possible combinations of failed * disks is dictated by these rules: * - Examine each "slot" (the "i" in tgts[i]) - * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - Try to increment this slot (tgts[i] += 1) * - if we can't increment because it runs into the next slot, * reset our slot to the minimum, and examine the next slot * @@ -2099,18 +3071,22 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * * This strategy works for dRAID but is less efficient when there are a large * number of child vdevs and therefore permutations to check. Furthermore, - * since the raidz_map_t rows likely do not overlap reconstruction would be + * since the raidz_map_t rows likely do not overlap, reconstruction would be * possible as long as there are no more than nparity data errors per row. * These additional permutations are not currently checked but could be as * a future improvement. + * + * Returns 0 on success, ECKSUM on failure. 
*/ static int vdev_raidz_combrec(zio_t *zio) { int nparity = vdev_get_nparity(zio->io_vd); raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? + rm->rm_original_width : physical_width; - /* Check if there's enough data to attempt reconstrution. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; int total_errors = 0; @@ -2128,8 +3104,16 @@ vdev_raidz_combrec(zio_t *zio) int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *ltgts = &tstore[1]; /* value is logical child ID */ - /* Determine number of logical children, n */ - int n = zio->io_vd->vdev_children; + + /* + * Determine number of logical children, n. See comment + * above raidz_simulate_failure(). + */ + int n = 0; + for (int w = physical_width; + w >= original_width; w--) { + n += w; + } ASSERT3U(num_failures, <=, nparity); ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); @@ -2160,6 +3144,14 @@ vdev_raidz_combrec(zio_t *zio) if (ltgts[t] == n) { /* try more failures */ ASSERT3U(t, ==, num_failures - 1); + if (zfs_flags & + ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruction " + "failed for num_failures=" + "%u; tried all " + "combinations", + num_failures); + } break; } @@ -2171,7 +3163,7 @@ vdev_raidz_combrec(zio_t *zio) * Try the next combination. 
*/ if (ltgts[t] != ltgts[t + 1]) - break; + break; // found next combination /* * Otherwise, reset this tgt to the minimum, @@ -2186,7 +3178,8 @@ vdev_raidz_combrec(zio_t *zio) break; } } - + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruction failed for all num_failures"); return (ECKSUM); } @@ -2211,7 +3204,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) static void vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) { - int total_errors = 0; + int normal_errors = 0; + int shadow_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); @@ -2220,24 +3214,31 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - if (rc->rc_error) { + if (rc->rc_error != 0) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - total_errors++; + normal_errors++; + } + if (rc->rc_shadow_error != 0) { + ASSERT(rc->rc_shadow_error != ECKSUM); + shadow_errors++; } } /* * Treat partial writes as a success. If we couldn't write enough - * columns to reconstruct the data, the I/O failed. Otherwise, - * good enough. + * columns to reconstruct the data, the I/O failed. Otherwise, good + * enough. Note that in the case of a shadow write (during raidz + * expansion), depending on if we crash, either the normal (old) or + * shadow (new) location may become the "real" version of the block, + * so both locations must have sufficient redundancy. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. 
*/ - if (total_errors > rr->rr_firstdatacol) { + if (normal_errors > rr->rr_firstdatacol || + shadow_errors > rr->rr_firstdatacol) { zio->io_error = zio_worst_error(zio->io_error, vdev_raidz_worst_error(rr)); } @@ -2254,7 +3255,6 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; @@ -2337,7 +3337,7 @@ vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) * for a normal read then allocate an ABD for them now so they * may be read, verified, and any needed repairs performed. */ - if (rr->rr_nempty && rr->rr_abd_empty == NULL) + if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) vdev_draid_map_alloc_empty(zio, rr); for (int c = 0; c < rr->rr_cols; c++) { @@ -2395,11 +3395,48 @@ vdev_raidz_io_done(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; + ASSERT(zio->io_bp != NULL); if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); } } else { + if (rm->rm_phys_col) { + /* + * This is an aggregated read. Copy the data and status + * from the aggregate abd's to the individual rows. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_tried || rc->rc_size == 0) + continue; + + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_error = prc->rc_error; + rc->rc_tried = prc->rc_tried; + rc->rc_skipped = prc->rc_skipped; + if (c >= rr->rr_firstdatacol) { + /* + * Note: this is slightly faster + * than using abd_copy_off(). 
+ */ + char *physbuf = abd_to_buf( + prc->rc_abd); + void *physloc = physbuf + + rc->rc_offset - + prc->rc_offset; + + abd_copy_from_buf(rc->rc_abd, + physloc, rc->rc_size); + } + } + } + } + for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_reconstruct_known_missing(zio, @@ -2446,7 +3483,54 @@ vdev_raidz_io_done(zio_t *zio) zio_vdev_io_redone(zio); return; } - + /* + * It would be too expensive to try every possible + * combination of failed sectors in every row, so + * instead we try every combination of failed current or + * past physical disk. This means that if the incorrect + * sectors were all on Nparity disks at any point in the + * past, we will find the correct data. The only known + * case where this is less durable than a non-expanded + * RAIDZ, is if we have a silent failure during + * expansion. In that case, one block could be + * partially in the old format and partially in the + * new format, so we'd lost some sectors from the old + * format and some from the new format. + * + * e.g. logical_width=4 physical_width=6 + * the 15 (6+5+4) possible failed disks are: + * width=6 child=0 + * width=6 child=1 + * width=6 child=2 + * width=6 child=3 + * width=6 child=4 + * width=6 child=5 + * width=5 child=0 + * width=5 child=1 + * width=5 child=2 + * width=5 child=3 + * width=5 child=4 + * width=4 child=0 + * width=4 child=1 + * width=4 child=2 + * width=4 child=3 + * And we will try every combination of Nparity of these + * failing. + * + * As a first pass, we can generate every combo, + * and try reconstructing, ignoring any known + * failures. If any row has too many known + simulated + * failures, then we bail on reconstructing with this + * number of simulated failures. As an improvement, + * we could detect the number of whole known failures + * (i.e. we have known failures on these disks for + * every row; the disks never succeeded), and + * subtract that from the max # failures to simulate. 
+ * We could go even further like the current + * combrec code, but that doesn't seem like it + * gains us very much. If we simulate a failure + * that is also a known failure, that's fine. + */ zio->io_error = vdev_raidz_combrec(zio); if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -2454,6 +3538,10 @@ vdev_raidz_io_done(zio_t *zio) } } } + if (rm->rm_lr != NULL) { + zfs_rangelock_exit(rm->rm_lr); + rm->rm_lr = NULL; + } } static void @@ -2480,6 +3568,14 @@ vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_raidz_t *vdrz = vd->vdev_tsd; + + /* + * If we're in the middle of a RAIDZ expansion, this block may be in + * the old and/or new location. For simplicity, always resilver it. + */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) + return (B_TRUE); + uint64_t dcols = vd->vdev_children; uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; @@ -2524,7 +3620,24 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); - uint64_t width = raidvd->vdev_children; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + /* + * We're in the middle of expansion, in which case the + * translation is in flux. Any answer we give may be wrong + * by the time we return, so it isn't safe for the caller to + * act on it. Therefore we say that this range isn't present + * on any children. The only consumers of this are "zpool + * initialize" and trimming, both of which are "best effort" + * anyway. 
+ */ + physical_rs->rs_start = physical_rs->rs_end = 0; + remain_rs->rs_start = remain_rs->rs_end = 0; + return; + } + + uint64_t width = vdrz->vd_physical_width; uint64_t tgt_col = cvd->vdev_id; uint64_t ashift = raidvd->vdev_top->vdev_ashift; @@ -2550,15 +3663,1155 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, logical_rs->rs_end - logical_rs->rs_start); } +static void +raidz_reflow_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + /* + * Ensure there are no i/os to the range that is being committed. + */ + uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); + + mutex_enter(&vre->vre_lock); + uint64_t new_offset = + MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); + /* + * We should not have committed anything that failed. + */ + VERIFY3U(vre->vre_failed_offset, >=, old_offset); + mutex_exit(&vre->vre_lock); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + old_offset, new_offset - old_offset, + RL_WRITER); + + /* + * Update the uberblock that will be written when this txg completes. 
+ */ + RAIDZ_REFLOW_SET(&spa->spa_uberblock, + RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); + vre->vre_offset_pertxg[txgoff] = 0; + zfs_rangelock_exit(lr); + + mutex_enter(&vre->vre_lock); + vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; + vre->vre_bytes_copied_pertxg[txgoff] = 0; + mutex_exit(&vre->vre_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); +} + +static void +raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + for (int i = 0; i < TXG_SIZE; i++) + VERIFY0(vre->vre_offset_pertxg[i]); + + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; + re->re_logical_width = vdrz->vd_physical_width; + mutex_enter(&vdrz->vd_expand_lock); + avl_add(&vdrz->vd_expand_txgs, re); + mutex_exit(&vdrz->vd_expand_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + + /* + * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS + * will get written (based on vd_expand_txgs). + */ + vdev_config_dirty(vd); + + /* + * Before we change vre_state, the on-disk state must reflect that we + * have completed all copying, so that vdev_raidz_io_start() can use + * vre_state to determine if the reflow is in progress. See also the + * end of spa_raidz_expand_thread(). 
+ */ + VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, + raidvd->vdev_ms_count << raidvd->vdev_ms_shift); + + vre->vre_end_time = gethrestime_sec(); + vre->vre_state = DSS_FINISHED; + + uint64_t state = vre->vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t end_time = vre->vre_end_time; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time, tx)); + + spa->spa_uberblock.ub_raidz_reflow_info = 0; + + spa_history_log_internal(spa, "raidz vdev expansion completed", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)vd->vdev_id, + (unsigned long long)vd->vdev_children); + + spa->spa_raidz_expand = NULL; + raidvd->vdev_rz_expanding = B_FALSE; + + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); + spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); + + spa_notify_waiters(spa); + + /* + * While we're in syncing context take the opportunity to + * setup a scrub. All the data has been successfully copied + * but we have not validated any checksums. + */ + pool_scan_func_t func = POOL_SCAN_SCRUB; + if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0) + dsl_scan_setup_sync(&func, tx); +} + +/* + * Struct for one copy zio. + */ +typedef struct raidz_reflow_arg { + vdev_raidz_expand_t *rra_vre; + zfs_locked_range_t *rra_lr; + uint64_t rra_txg; +} raidz_reflow_arg_t; + +/* + * The write of the new location is done. 
+ */ +static void +raidz_reflow_write_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + abd_free(zio->io_abd); + + mutex_enter(&vre->vre_lock); + if (zio->io_error != 0) { + /* Force a reflow pause on errors */ + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + } + ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); + vre->vre_outstanding_bytes -= zio->io_size; + if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < + vre->vre_failed_offset) { + vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += + zio->io_size; + } + cv_signal(&vre->vre_cv); + mutex_exit(&vre->vre_lock); + + zfs_rangelock_exit(rra->rra_lr); + + kmem_free(rra, sizeof (*rra)); + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ +static void +raidz_reflow_read_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + /* + * If the read failed, or if it was done on a vdev that is not fully + * healthy (e.g. a child that has a resilver in progress), we may not + * have the correct data. Note that it's OK if the write proceeds. + * It may write garbage but the location is otherwise unused and we + * will retry later due to vre_failed_offset. 
+ */ + if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { + zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " + "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_lr->lr_length, + (long long)rra->rra_txg, + zio->io_error, + vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), + vdev_dtl_empty(zio->io_vd, DTL_MISSING)); + mutex_enter(&vre->vre_lock); + /* Force a reflow pause on errors */ + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + mutex_exit(&vre->vre_lock); + } + + zio_nowait(zio_unique_parent(zio)); +} + +static void +raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, + dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (offset == 0) + return; + + mutex_enter(&vre->vre_lock); + ASSERT3U(vre->vre_offset, <=, offset); + vre->vre_offset = offset; + mutex_exit(&vre->vre_lock); + + if (vre->vre_offset_pertxg[txgoff] == 0) { + dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, + spa, tx); + } + vre->vre_offset_pertxg[txgoff] = offset; +} + +static boolean_t +vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) +{ + for (int i = 0; i < raidz_vd->vdev_children; i++) { + /* Quick check if a child is being replaced */ + if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, + dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + int ashift = vd->vdev_top->vdev_ashift; + uint64_t offset, size; + + if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, + &offset, &size)) { + return (B_FALSE); + } + ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); + ASSERT3U(size, >=, 1 << ashift); + uint64_t length = 1 << ashift; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + uint64_t blkid = offset >> ashift; + + int old_children = vd->vdev_children - 1; + + /* 
+ * We can only progress to the point that writes will not overlap + * with blocks whose progress has not yet been recorded on disk. + * Since partially-copied rows are still read from the old location, + * we need to stop one row before the sector-wise overlap, to prevent + * row-wise overlap. + * + * Note that even if we are skipping over a large unallocated region, + * we can't move the on-disk progress to `offset`, because concurrent + * writes/allocations could still use the currently-unallocated + * region. + */ + uint64_t ubsync_blkid = + RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; + uint64_t next_overwrite_blkid = ubsync_blkid + + ubsync_blkid / old_children - old_children; + VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); + + if (blkid >= next_overwrite_blkid) { + raidz_reflow_record_progress(vre, + next_overwrite_blkid << ashift, tx); + return (B_TRUE); + } + + range_tree_remove(rt, offset, length); + + raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); + rra->rra_vre = vre; + rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, + offset, length, RL_WRITER); + rra->rra_txg = dmu_tx_get_txg(tx); + + raidz_reflow_record_progress(vre, offset + length, tx); + + mutex_enter(&vre->vre_lock); + vre->vre_outstanding_bytes += length; + mutex_exit(&vre->vre_lock); + + /* + * SCL_STATE will be released when the read and write are done, + * by raidz_reflow_write_done(). 
+ */ + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + /* check if a replacing vdev was added, if so treat it as an error */ + if (vdev_raidz_expand_child_replacing(vd)) { + zfs_dbgmsg("replacing vdev encountered, reflow paused at " + "offset=%llu txg=%llu", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_txg); + + mutex_enter(&vre->vre_lock); + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + cv_signal(&vre->vre_cv); + mutex_exit(&vre->vre_lock); + + /* drop everything we acquired */ + zfs_rangelock_exit(rra->rra_lr); + kmem_free(rra, sizeof (*rra)); + spa_config_exit(spa, SCL_STATE, spa); + return (B_TRUE); + } + + zio_t *pio = spa->spa_txg_zio[txgoff]; + abd_t *abd = abd_alloc_for_io(length, B_FALSE); + zio_t *write_zio = zio_vdev_child_io(pio, NULL, + vd->vdev_child[blkid % vd->vdev_children], + (blkid / vd->vdev_children) << ashift, + abd, length, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_write_done, rra); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + vd->vdev_child[blkid % old_children], + (blkid / old_children) << ashift, + abd, length, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_read_done, rra)); + + return (B_FALSE); +} + +/* + * For testing (ztest specific) + */ +static void +raidz_expand_pause(uint_t pause_point) +{ + while (raidz_expand_pause_point != 0 && + raidz_expand_pause_point <= pause_point) + delay(hz); +} + +static void +raidz_scratch_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); +} + +/* + * Reflow the beginning portion of the vdev into an intermediate scratch area + * in memory and on disk. This operation must be persisted on disk before we + * proceed to overwrite the beginning portion with the reflowed data. 
+ * + * This multi-step task can fail to complete if disk errors are encountered + * and we can return here after a pause (waiting for disk to become healthy). + */ +static void +raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) +{ + vdev_raidz_expand_t *vre = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zio_t *pio; + int error; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + int ashift = raidvd->vdev_ashift; + uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift); + uint64_t logical_size = write_size * raidvd->vdev_children; + uint64_t read_size = + P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), + 1 << ashift); + + /* + * The scratch space must be large enough to get us to the point + * that one row does not overlap itself when moved. This is checked + * by vdev_raidz_attach_check(). + */ + VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); + VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); + VERIFY3U(write_size, <=, read_size); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + 0, logical_size, RL_WRITER); + + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(read_size, B_FALSE); + } + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); + + /* + * If we have already written the scratch area then we must read from + * there, since new writes were redirected there while we were paused + * or the original location may have been partially overwritten with + * reflowed data. + */ + if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { + VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); + /* + * Read from scratch space. 
+ */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE + * to the offset to calculate the physical offset to + * write to. Passing in a negative offset makes us + * access the scratch area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, + raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d reading scratch location", + error); + goto io_error_exit; + } + goto overwrite; + } + + /* + * Read from original location. + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children - 1; i++) { + ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], read_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d reading original location", error); +io_error_exit: + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + zfs_rangelock_exit(lr); + spa_config_exit(spa, SCL_STATE, FTAG); + return; + } + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); + + /* + * Reflow in memory. 
+ */ + uint64_t logical_sectors = logical_size >> ashift; + for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { + int oldchild = i % (raidvd->vdev_children - 1); + uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; + + int newchild = i % raidvd->vdev_children; + uint64_t newoff = (i / raidvd->vdev_children) << ashift; + + /* a single sector should not be copying over itself */ + ASSERT(!(newchild == oldchild && newoff == oldoff)); + + abd_copy_off(abds[newchild], abds[oldchild], + newoff, oldoff, 1 << ashift); + } + + /* + * Verify that we filled in everything we intended to (write_size on + * each child). + */ + VERIFY0(logical_sectors % raidvd->vdev_children); + VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, + write_size); + + /* + * Write to scratch location (boot area). + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d writing scratch location", error); + goto io_error_exit; + } + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", + (long long)logical_size); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); + + /* + * Update uberblock to indicate that scratch space is valid. This is + * needed because after this point, the real location may be + * overwritten. If we crash, we need to get the data from the + * scratch space, rather than the real location. 
+ * + * Note: ub_timestamp is bumped so that vdev_uberblock_compare() + * will prefer this uberblock. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); + + /* + * Overwrite with reflow'ed data. + */ +overwrite: + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, + raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + /* + * When we exit early here and drop the range lock, new + * writes will go into the scratch area so we'll need to + * read from there when we return after pausing. + */ + zfs_dbgmsg("reflow: error %d writing real location", error); + /* + * Update the uberblock that is written when this txg completes. + */ + RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, + logical_size); + goto io_error_exit; + } + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", + (long long)logical_size); + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); + + /* + * Update uberblock to indicate that the initial part has been + * reflow'ed. 
This is needed because after this point (when we exit + * the rangelock), we allow regular writes to this region, which will + * be written to the new location only (because reflow_offset_next == + * reflow_offset_synced). If we crashed and re-copied from the + * scratch space, we would lose the regular writes. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, + logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); + + /* + * Update progress. + */ + vre->vre_offset = logical_size; + zfs_rangelock_exit(lr); + spa_config_exit(spa, SCL_STATE, FTAG); + + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + /* + * Note - raidz_reflow_sync() will update the uberblock state to + * RRSS_SCRATCH_INVALID_SYNCED_REFLOW + */ + raidz_reflow_sync(spa, tx); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); +} + +/* + * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work + * here. No other i/o can be in progress, so we don't need the vre_rangelock. 
+ */ +void +vdev_raidz_reflow_copy_scratch(spa_t *spa) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + ASSERT0(logical_size % raidvd->vdev_children); + uint64_t write_size = logical_size / raidvd->vdev_children; + + zio_t *pio; + + /* + * Read from scratch space. + */ + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(write_size, B_FALSE); + } + + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + + /* + * Overwrite real location with reflow'ed data. + */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " + "to real location", (long long)logical_size); + + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + + /* + * Update uberblock. 
+ */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, + RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); + spa->spa_ubsync.ub_timestamp++; + VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow recovery: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + spa_first_txg(spa)); + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset = logical_size; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + /* + * Note that raidz_reflow_sync() will update the uberblock once more + */ + raidz_reflow_sync(spa, tx); + + dmu_tx_commit(tx); + + spa_config_exit(spa, SCL_STATE, FTAG); +} + +static boolean_t +spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) +{ + (void) zthr; + spa_t *spa = arg; + + return (spa->spa_raidz_expand != NULL && + !spa->spa_raidz_expand->vre_waiting_for_resilver); +} + +/* + * RAIDZ expansion background thread + * + * Can be called multiple times if the reflow is paused + */ +static void +spa_raidz_expand_thread(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) + vre->vre_offset = 0; + else + vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); + + /* Reflow the beginning portion using the scratch area */ + if (vre->vre_offset == 0) { + VERIFY0(dsl_sync_task(spa_name(spa), + NULL, raidz_reflow_scratch_sync, + vre, 0, ZFS_SPACE_CHECK_NONE)); + + /* if we encountered errors then pause */ + if (vre->vre_offset == 0) { + mutex_enter(&vre->vre_lock); + vre->vre_waiting_for_resilver = B_TRUE; + mutex_exit(&vre->vre_lock); + return; + } + } + + 
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + uint64_t guid = raidvd->vdev_guid; + + /* Iterate over all the remaining metaslabs */ + for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; + i < raidvd->vdev_ms_count && + !zthr_iscancelled(zthr) && + vre->vre_failed_offset == UINT64_MAX; i++) { + metaslab_t *msp = raidvd->vdev_ms[i]; + + metaslab_disable(msp); + mutex_enter(&msp->ms_lock); + + /* + * The metaslab may be newly created (for the expanded + * space), in which case its trees won't exist yet, + * so we need to bail out early. + */ + if (msp->ms_new) { + mutex_exit(&msp->ms_lock); + metaslab_enable(msp, B_FALSE, B_FALSE); + continue; + } + + VERIFY0(metaslab_load(msp)); + + /* + * We want to copy everything except the free (allocatable) + * space. Note that there may be a little bit more free + * space (e.g. in ms_defer), and it's fine to copy that too. + */ + range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, + NULL, 0, 0); + range_tree_add(rt, msp->ms_start, msp->ms_size); + range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); + mutex_exit(&msp->ms_lock); + + /* + * Force the last sector of each metaslab to be copied. This + * ensures that we advance the on-disk progress to the end of + * this metaslab while the metaslab is disabled. Otherwise, we + * could move past this metaslab without advancing the on-disk + * progress, and then an allocation to this metaslab would not + * be copied. + */ + int sectorsz = 1 << raidvd->vdev_ashift; + uint64_t ms_last_offset = msp->ms_start + + msp->ms_size - sectorsz; + if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { + range_tree_add(rt, ms_last_offset, sectorsz); + } + + /* + * When we are resuming from a paused expansion (i.e. + * when importing a pool with an expansion in progress), + * discard any state that we have already processed. 
+ */ + range_tree_clear(rt, 0, vre->vre_offset); + + while (!zthr_iscancelled(zthr) && + !range_tree_is_empty(rt) && + vre->vre_failed_offset == UINT64_MAX) { + + /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * If requested, pause the reflow when the amount + * specified by raidz_expand_max_reflow_bytes is reached + * + * This pause is only used during testing or debugging. + */ + while (raidz_expand_max_reflow_bytes != 0 && + raidz_expand_max_reflow_bytes <= + vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { + delay(hz); + } + + mutex_enter(&vre->vre_lock); + while (vre->vre_outstanding_bytes > + raidz_expand_max_copy_bytes) { + cv_wait(&vre->vre_cv, &vre->vre_lock); + } + mutex_exit(&vre->vre_lock); + + dmu_tx_t *tx = + dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + /* + * Reacquire the vdev_config lock. Theoretically, the + * vdev_t that we're expanding may have changed. 
+ */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + boolean_t needsync = + raidz_reflow_impl(raidvd, vre, rt, tx); + + dmu_tx_commit(tx); + + if (needsync) { + spa_config_exit(spa, SCL_CONFIG, FTAG); + txg_wait_synced(spa->spa_dsl_pool, txg); + spa_config_enter(spa, SCL_CONFIG, FTAG, + RW_READER); + } + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + metaslab_enable(msp, B_FALSE, B_FALSE); + range_tree_vacate(rt, NULL, NULL); + range_tree_destroy(rt); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * The txg_wait_synced() here ensures that all reflow zio's have + * completed, and vre_failed_offset has been set if necessary. It + * also ensures that the progress of the last raidz_reflow_sync() is + * written to disk before raidz_reflow_complete_sync() changes the + * in-memory vre_state. vdev_raidz_io_start() uses vre_state to + * determine if a reflow is in progress, in which case we may need to + * write to both old and new locations. Therefore we can only change + * vre_state once this is not necessary, which is once the on-disk + * progress (in spa_ubsync) has been set past any possible writes (to + * the end of the last metaslab). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + + if (!zthr_iscancelled(zthr) && + vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { + /* + * We are not being canceled or paused, so the reflow must be + * complete. In that case also mark it as completed on disk. + */ + ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + raidz_reflow_complete_sync, spa, + 0, ZFS_SPACE_CHECK_NONE)); + (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); + } else { + /* + * Wait for all copy zio's to complete and for all the + * raidz_reflow_sync() synctasks to be run. 
+ */ + spa_history_log_internal(spa, "reflow pause", + NULL, "offset=%llu failed_offset=%lld", + (long long)vre->vre_offset, + (long long)vre->vre_failed_offset); + mutex_enter(&vre->vre_lock); + if (vre->vre_failed_offset != UINT64_MAX) { + /* + * Reset progress so that we will retry everything + * after the point that something failed. + */ + vre->vre_offset = vre->vre_failed_offset; + vre->vre_failed_offset = UINT64_MAX; + vre->vre_waiting_for_resilver = B_TRUE; + } + mutex_exit(&vre->vre_lock); + } +} + +void +spa_start_raidz_expansion_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); + spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", + spa_raidz_expand_thread_check, spa_raidz_expand_thread, + spa, defclsyspri); +} + +void +raidz_dtl_reassessed(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + if (spa->spa_raidz_expand != NULL) { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + /* + * we get called often from vdev_dtl_reassess() so make + * sure it's our vdev and any replacing is complete + */ + if (vd->vdev_top->vdev_id == vre->vre_vdev_id && + !vdev_raidz_expand_child_replacing(vd->vdev_top)) { + mutex_enter(&vre->vre_lock); + if (vre->vre_waiting_for_resilver) { + vdev_dbgmsg(vd, "DTL reassessed, " + "continuing raidz expansion"); + vre->vre_waiting_for_resilver = B_FALSE; + zthr_wakeup(spa->spa_raidz_expand_zthr); + } + mutex_exit(&vre->vre_lock); + } + } +} + +int +vdev_raidz_attach_check(vdev_t *new_child) +{ + vdev_t *raidvd = new_child->vdev_parent; + uint64_t new_children = raidvd->vdev_children; + + /* + * We use the "boot" space as scratch space to handle overwriting the + * initial part of the vdev. If it is too small, then this expansion + * is not allowed. This would be very unusual (e.g. ashift > 13 and + * >200 children). 
+ */ + if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { + return (EINVAL); + } + return (0); +} + +void +vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *new_child = arg; + spa_t *spa = new_child->vdev_spa; + vdev_t *raidvd = new_child->vdev_parent; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); + ASSERT3P(raidvd->vdev_top, ==, raidvd); + ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); + ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); + ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, + new_child); + + spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); + + vdrz->vd_physical_width++; + + VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); + vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; + vdrz->vn_vre.vre_offset = 0; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + spa->spa_raidz_expand = &vdrz->vn_vre; + zthr_wakeup(spa->spa_raidz_expand_zthr); + + /* + * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get + * written to the config. 
+ */ + vdev_config_dirty(raidvd); + + vdrz->vn_vre.vre_start_time = gethrestime_sec(); + vdrz->vn_vre.vre_end_time = 0; + vdrz->vn_vre.vre_state = DSS_SCANNING; + vdrz->vn_vre.vre_bytes_copied = 0; + + uint64_t state = vdrz->vn_vre.vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t start_time = vdrz->vn_vre.vre_start_time; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time, tx)); + + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); + + spa_history_log_internal(spa, "raidz vdev expansion started", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)raidvd->vdev_id, + (unsigned long long)raidvd->vdev_children); +} + +int +vdev_raidz_load(vdev_t *vd) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + int err; + + uint64_t state = DSS_NONE; + uint64_t start_time = 0; + uint64_t end_time = 0; + uint64_t bytes_copied = 0; + + if (vd->vdev_top_zap != 0) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (bytes_copied), 1, &bytes_copied); + if (err != 0 && err != 
ENOENT) + return (err); + } + + /* + * If we are in the middle of expansion, vre_state should have + * already been set by vdev_raidz_init(). + */ + EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); + vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; + vdrz->vn_vre.vre_start_time = start_time; + vdrz->vn_vre.vre_end_time = end_time; + vdrz->vn_vre.vre_bytes_copied = bytes_copied; + + return (0); +} + +int +spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + if (vre == NULL) { + /* no removal in progress; find most recent completed */ + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_raidz_t *vdrz = vd->vdev_tsd; + + if (vdrz->vn_vre.vre_end_time != 0 && + (vre == NULL || + vdrz->vn_vre.vre_end_time > + vre->vre_end_time)) { + vre = &vdrz->vn_vre; + } + } + } + } + + if (vre == NULL) { + return (SET_ERROR(ENOENT)); + } + + pres->pres_state = vre->vre_state; + pres->pres_expanding_vdev = vre->vre_vdev_id; + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + pres->pres_to_reflow = vd->vdev_stat.vs_alloc; + + mutex_enter(&vre->vre_lock); + pres->pres_reflowed = vre->vre_bytes_copied; + for (int i = 0; i < TXG_SIZE; i++) + pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; + mutex_exit(&vre->vre_lock); + + pres->pres_start_time = vre->vre_start_time; + pres->pres_end_time = vre->vre_end_time; + pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; + + return (0); +} + /* * Initialize private RAIDZ specific fields from the nvlist. 
*/ static int vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) { - vdev_raidz_t *vdrz; - uint64_t nparity; - uint_t children; nvlist_t **child; int error = nvlist_lookup_nvlist_array(nv, @@ -2566,6 +4819,7 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) if (error != 0) return (SET_ERROR(EINVAL)); + uint64_t nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); @@ -2592,10 +4846,56 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) nparity = 1; } - vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); - vdrz->vd_logical_width = children; + vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); + vdrz->vn_vre.vre_vdev_id = -1; + vdrz->vn_vre.vre_offset = UINT64_MAX; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); + zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); + mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, + sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); + + vdrz->vd_physical_width = children; vdrz->vd_nparity = nparity; + /* note, the ID does not exist when creating a pool */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, + &vdrz->vn_vre.vre_vdev_id); + + boolean_t reflow_in_progress = + nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + if (reflow_in_progress) { + spa->spa_raidz_expand = &vdrz->vn_vre; + vdrz->vn_vre.vre_state = DSS_SCANNING; + } + + vdrz->vd_original_width = children; + uint64_t *txgs; + unsigned int txgs_size = 0; + error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + &txgs, &txgs_size); + if (error == 0) { + for (int i = 0; i < txgs_size; i++) { + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = txgs[txgs_size - i - 1]; + re->re_logical_width = 
vdrz->vd_physical_width - i; + + if (reflow_in_progress) + re->re_logical_width--; + + avl_add(&vdrz->vd_expand_txgs, re); + } + + vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; + } + if (reflow_in_progress) { + vdrz->vd_original_width--; + zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", + children, txgs_size); + } + *tsd = vdrz; return (0); @@ -2604,7 +4904,20 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) static void vdev_raidz_fini(vdev_t *vd) { - kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t)); + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) + vd->vdev_spa->spa_raidz_expand = NULL; + reflow_node_t *re; + void *cookie = NULL; + avl_tree_t *tree = &vdrz->vd_expand_txgs; + while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) + kmem_free(re, sizeof (*re)); + avl_destroy(&vdrz->vd_expand_txgs); + mutex_destroy(&vdrz->vd_expand_lock); + mutex_destroy(&vdrz->vn_vre.vre_lock); + cv_destroy(&vdrz->vn_vre.vre_cv); + zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); + kmem_free(vdrz, sizeof (*vdrz)); } /* @@ -2632,6 +4945,29 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) * it. 
*/ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + } + + mutex_enter(&vdrz->vd_expand_lock); + if (!avl_is_empty(&vdrz->vd_expand_txgs)) { + uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); + uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, + KM_SLEEP); + uint64_t i = 0; + + for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); + re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { + txgs[i++] = re->re_txg; + } + + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + txgs, count); + + kmem_free(txgs, sizeof (uint64_t) * count); + } + mutex_exit(&vdrz->vd_expand_lock); } static uint64_t @@ -2671,3 +5007,15 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, + "For testing, pause RAIDZ expansion after reflowing this many bytes"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, + "Max amount of concurrent i/o for RAIDZ expansion"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, + "For expanded RAIDZ, aggregate reads that have more rows than this"); +ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, + "For expanded RAIDZ, automatically start a pool scrub when expansion " + "completes"); +/* END CSTYLED */ diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 03e17db02..1c54eae40 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -169,7 +169,8 @@ static boolean_t vdev_trim_should_stop(vdev_t *vd) { return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } /* @@ -180,6 +181,7 @@ 
vdev_autotrim_should_stop(vdev_t *tvd) { return (tvd->vdev_autotrim_exit_wanted || !vdev_writeable(tvd) || tvd->vdev_removing || + tvd->vdev_rz_expanding || spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); } @@ -222,7 +224,8 @@ vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK]; @@ -1005,6 +1008,7 @@ vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_rz_expanding); vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure); vd->vdev_trim_thread = thread_create(NULL, 0, @@ -1162,12 +1166,13 @@ vdev_trim_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_trim_action_time = timestamp; - if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_trim_load(vd)); } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_trim_thread == NULL) { VERIFY0(vdev_trim_load(vd)); vdev_trim(vd, vd->vdev_trim_rate, @@ -1492,7 +1497,8 @@ vdev_autotrim(spa_t *spa) mutex_enter(&tvd->vdev_autotrim_lock); if (vdev_writeable(tvd) && !tvd->vdev_removing && - tvd->vdev_autotrim_thread == NULL) { + tvd->vdev_autotrim_thread == NULL && + !tvd->vdev_rz_expanding) { ASSERT3P(tvd->vdev_top, ==, tvd); tvd->vdev_autotrim_thread = thread_create(NULL, 0, @@ -1717,6 +1723,7 @@ vdev_trim_simple(vdev_t 
*vd, uint64_t start, uint64_t size) ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); ta.trim_vdev = vd; ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); diff --git a/scripts/zloop.sh b/scripts/zloop.sh index 83160c34a..7cda23743 100755 --- a/scripts/zloop.sh +++ b/scripts/zloop.sh @@ -252,38 +252,57 @@ while (( timeout == 0 )) || (( curtime <= (starttime + timeout) )); do or_die rm -rf "$workdir" or_die mkdir "$workdir" - # switch between three types of configs - # 1/3 basic, 1/3 raidz mix, and 1/3 draid mix - choice=$((RANDOM % 3)) - # ashift range 9 - 15 align=$(((RANDOM % 2) * 3 + 9)) + # choose parity value + parity=$(((RANDOM % 3) + 1)) + + draid_data=0 + draid_spares=0 + # randomly use special classes class="special=random" - if [[ $choice -eq 0 ]]; then - # basic mirror only - parity=1 + # choose between four types of configs + # (basic, raidz mix, raidz expansion, and draid mix) + case $((RANDOM % 4)) in + + # basic mirror configuration + 0) parity=1 mirrors=2 - draid_data=0 - draid_spares=0 raid_children=0 vdevs=2 raid_type="raidz" - elif [[ $choice -eq 1 ]]; then - # fully randomized mirror/raidz (sans dRAID) - parity=$(((RANDOM % 3) + 1)) - mirrors=$(((RANDOM % 3) * 1)) - draid_data=0 - draid_spares=0 + ;; + + # fully randomized mirror/raidz (sans dRAID) + 1) mirrors=$(((RANDOM % 3) * 1)) raid_children=$((((RANDOM % 9) + parity + 1) * (RANDOM % 2))) vdevs=$(((RANDOM % 3) + 3)) raid_type="raidz" - else - # fully randomized dRAID (sans mirror/raidz) - parity=$(((RANDOM % 3) + 1)) - mirrors=0 + ;; + + # randomized raidz expansion (one top-level raidz vdev) + 2) mirrors=0 + vdevs=1 + # derive initial raidz disk count based on parity choice + # P1: 3 - 7 disks + # P2: 5 - 9 disks + # P3: 7 - 11 disks + raid_children=$(((RANDOM % 5) + (parity * 2) + 1)) + + # 1/3 of the time use a dedicated '-X' raidz expansion test + if [[ $((RANDOM % 3)) -eq 0 ]]; 
then + zopt="$zopt -X -t 16" + raid_type="raidz" + else + raid_type="eraidz" + fi + ;; + + # fully randomized dRAID (sans mirror/raidz) + 3) mirrors=0 draid_data=$(((RANDOM % 8) + 3)) draid_spares=$(((RANDOM % 2) + parity)) stripe=$((draid_data + parity)) @@ -291,7 +310,11 @@ while (( timeout == 0 )) || (( curtime <= (starttime + timeout) )); do raid_children=$(((((RANDOM % 4) + 1) * stripe) + extra)) vdevs=$((RANDOM % 3)) raid_type="draid" - fi + ;; + *) + # avoid shellcheck SC2249 + ;; + esac zopt="$zopt -K $raid_type" zopt="$zopt -m $mirrors" diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index ef787c65c..20c9b823c 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -769,8 +769,12 @@ tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', tags = ['functional', 'redacted_send'] [tests/functional/raidz] -tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos'] +tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_expand_001_pos', + 'raidz_expand_002_pos', 'raidz_expand_003_neg', 'raidz_expand_003_pos', + 'raidz_expand_004_pos', 'raidz_expand_005_pos', 'raidz_expand_006_neg', + 'raidz_expand_007_neg'] tags = ['functional', 'raidz'] +timeout = 1200 [tests/functional/redundancy] tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2', diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 80e7bcb3b..fb861f1a2 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -34,6 +34,7 @@ DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync +EMBEDDED_SLOG_MIN_MS embedded_slog_min_ms zfs_embedded_slog_min_ms INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_VALUE 
initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export @@ -62,6 +63,7 @@ MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_inter MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable +RAIDZ_EXPAND_MAX_REFLOW_BYTES vdev.expand_max_reflow_bytes raidz_expand_max_reflow_bytes REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment @@ -69,6 +71,7 @@ RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms SCAN_LEGACY scan_legacy zfs_scan_legacy SCAN_SUSPEND_PROGRESS scan_suspend_progress zfs_scan_suspend_progress SCAN_VDEV_LIMIT scan_vdev_limit zfs_scan_vdev_limit +SCRUB_AFTER_EXPAND scrub_after_expand zfs_scrub_after_expand SEND_HOLES_WITHOUT_BIRTH_TIME send_holes_without_birth_time send_holes_without_birth_time SLOW_IO_EVENTS_PER_SECOND slow_io_events_per_second zfs_slow_io_events_per_second SPA_ASIZE_INFLATION spa.asize_inflation spa_asize_inflation diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 87b50f59c..79aef1900 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1668,8 +1668,14 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/raidz/cleanup.ksh \ functional/raidz/raidz_001_neg.ksh \ functional/raidz/raidz_002_pos.ksh \ - functional/raidz/raidz_003_pos.ksh \ - functional/raidz/raidz_004_pos.ksh \ + functional/raidz/raidz_expand_001_pos.ksh \ + functional/raidz/raidz_expand_002_pos.ksh \ + functional/raidz/raidz_expand_003_neg.ksh \ + functional/raidz/raidz_expand_003_pos.ksh \ + functional/raidz/raidz_expand_004_pos.ksh \ + 
functional/raidz/raidz_expand_005_pos.ksh \ + functional/raidz/raidz_expand_006_neg.ksh \ + functional/raidz/raidz_expand_007_neg.ksh \ functional/raidz/setup.ksh \ functional/redacted_send/cleanup.ksh \ functional/redacted_send/redacted_compressed.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 4248578cd..6ebce9459 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -106,5 +106,6 @@ if is_linux || is_freebsd; then "feature@blake3" "feature@block_cloning" "feature@vdev_zaps_v2" + "feature@raidz_expansion" ) fi diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_002_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_002_pos.ksh index 746718ad9..4bd11a940 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_002_pos.ksh @@ -23,19 +23,39 @@ # # Copyright (c) 2016 by Gvozden Neskovic. All rights reserved. # Use is subject to license terms. +# Copyright (c) 2020 by vStack. All rights reserved. # . $STF_SUITE/include/libtest.shlib # # DESCRIPTION: -# Call the raidz_test tool with -S to test all supported raidz -# implementations. This options will test several raidz block geometries +# Call the raidz_test tool with sweep to test all supported raidz +# implementations. This will test several raidz block geometries # and several zio parameters that affect raidz block layout. Data -# reconstruction performs all combinations of failed disks. Wall time -# is set to 5min, but actual runtime might be longer. +# reconstruction performs all combinations of failed disks. Wall +# time is set to 5 min, but actual runtime might be longer. # -log_must raidz_test -S -t 300 - -log_pass "raidz_test parameter sweep test succeeded." 
+case $((RANDOM % 3)) in + 0) + # Basic sweep test + log_must raidz_test -S -t 300 + log_pass "raidz_test parameter sweep test succeeded." + ;; + 1) + # Using expanded raidz map to test all supported raidz + # implementations with expanded map and default reflow offset. + log_must raidz_test -S -e -t 300 + log_pass "raidz_test sweep test with expanded map succeeded." + ;; + 2) + # Using expanded raidz map ('-e') to test all supported raidz + # implementations with expanded map and zero reflow offset. + log_must raidz_test -S -e -r 0 -t 300 + log_pass "raidz_test sweep test with expanded map succeeded." + ;; + *) + # avoid shellcheck SC2249 + ;; +esac diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh deleted file mode 100755 index ce44906d5..000000000 --- a/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/ksh -p -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or https://opensource.org/licenses/CDDL-1.0. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END -# - -# -# Copyright (c) 2020 by vStack. All rights reserved. -# - -. 
$STF_SUITE/include/libtest.shlib - -# -# DESCRIPTION: -# Call the raidz_test tool with -S and -e to test all supported raidz -# implementations with expanded map and default reflow offset. -# This options will test several raidz block geometries and several zio -# parameters that affect raidz block layout. Data reconstruction performs -# all combinations of failed disks. Wall time is set to 5min, but actual -# runtime might be longer. -# - -log_must raidz_test -S -e -t 60 - -log_pass "raidz_test parameter sweep test with expanded map succeeded." diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh deleted file mode 100755 index 0e3affd51..000000000 --- a/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/ksh -p -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or https://opensource.org/licenses/CDDL-1.0. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END -# - -# -# Copyright (c) 2020 by vStack. All rights reserved. -# - -. $STF_SUITE/include/libtest.shlib - -# -# DESCRIPTION: -# Call the raidz_test tool with -S and -e to test all supported raidz -# implementations with expanded map and zero reflow offset. 
-# This options will test several raidz block geometries and several zio -# parameters that affect raidz block layout. Data reconstruction performs -# all combinations of failed disks. Wall time is set to 5min, but actual -# runtime might be longer. -# - -log_must raidz_test -S -e -r 0 -t 60 - -log_pass "raidz_test parameter sweep test with expanded map succeeded." diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh new file mode 100755 index 000000000..063d7fa73 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh @@ -0,0 +1,215 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach poolname raidz ...' should attach new device to the pool. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. 
For each parity value [1..3] +# - create raidz pool +# - fill it with some directories/files +# - attach device to the raidz pool +# - verify that device attached and the raidz pool size increase +# - verify resilver by replacing parity devices +# - verify resilver by replacing data devices +# - verify scrub by zeroing parity devices +# - verify scrub by zeroing data devices +# - verify the raidz pool +# - destroy the raidz pool + +typeset -r devs=6 +typeset -r dev_size_mb=128 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + log_pos zpool status $TESTPOOL + + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES 0 +} + +function wait_expand_paused +{ + oldcopied='0' + newcopied='1' + while [[ $oldcopied != $newcopied ]]; do + oldcopied=$newcopied + sleep 2 + newcopied=$(zpool status $TESTPOOL | \ + grep 'copied out of' | \ + awk '{print $1}') + log_note "newcopied=$newcopied" + done + log_note "paused at $newcopied" +} + +function test_resilver # +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool offline $pool $dir/dev-$i + done + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool labelclear -f $dir/dev-$i + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool replace -f $pool $dir/dev-$i + done + + log_must zpool wait -t resilver $pool + + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool offline $pool $dir/dev-$i + done + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool labelclear -f 
$dir/dev-$i + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool replace -f $pool $dir/dev-$i + done + + log_must zpool wait -t resilver $pool + + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool +} + +function test_scrub # +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + typeset combrec=$4 + + reflow_size=$(get_pool_prop allocated $pool) + randbyte=$(( ((RANDOM<<15) + RANDOM) % $reflow_size )) + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $randbyte + log_must zpool attach $TESTPOOL ${raid}-0 $dir/dev-$devs + wait_expand_paused + + log_must zpool export $pool + + # zero out parity disks + for (( i=0; i<$nparity; i=i+1 )); do + dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + log_must zpool scrub -w $pool + log_must zpool clear $pool + log_must zpool export $pool + + # zero out parity count worth of data disks + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + log_must zpool scrub -w $pool + + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES 0 + log_must zpool wait -t raidz_expand $TESTPOOL +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +# Disk file which will be attached +log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs + +nparity=$((RANDOM%(3) + 1)) +raid=raidz$nparity +dir=$TEST_BASE_DIR + +log_must zpool create -f -o cachefile=none $TESTPOOL 
$raid ${disks[@]} +log_must zfs set primarycache=metadata $TESTPOOL + +log_must zfs create $TESTPOOL/fs +log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + +log_must zfs create -o compress=on $TESTPOOL/fs2 +log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + +log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 +log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + +log_must check_pool_status $TESTPOOL "errors" "No known data errors" + +test_scrub $TESTPOOL $nparity $dir +test_resilver $TESTPOOL $nparity $dir + +zpool destroy "$TESTPOOL" + +log_pass "raidz expansion test succeeded." diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh new file mode 100755 index 000000000..004f3d1f9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh @@ -0,0 +1,115 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach poolname raidz ...' should attach new device to the pool. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. 
For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - for each free test block device +# - attach to the pool +# - verify the raidz pool +# - destroy the raidz pool + +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +nparity=$((RANDOM%(3) + 1)) +raid=raidz$nparity +dir=$TEST_BASE_DIR +pool=$TESTPOOL +opts="-o cachefile=none" + +log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} +log_must zfs set primarycache=metadata $pool + +log_must zfs create $pool/fs +log_must fill_fs /$pool/fs 1 512 100 1024 R + +log_must zfs create -o compress=on $pool/fs2 +log_must fill_fs /$pool/fs2 1 512 100 1024 R + +log_must zfs create -o compress=on -o recordsize=8k $pool/fs3 +log_must fill_fs /$pool/fs3 1 512 100 1024 R + +typeset pool_size=$(get_pool_prop size $pool) + +for disk in ${disks[$(($nparity+2))..$devs]}; do + log_must dd if=/dev/urandom of=/${pool}/FILE-$RANDOM bs=1M \ + count=64 + + log_must zpool attach -w $pool ${raid}-0 $disk + + # Wait some time for pool size increase + sleep 5 + + # Confirm that disk was attached to the pool + log_must zpool get -H path $TESTPOOL $disk + + typeset expand_size=$(get_pool_prop size $pool) + if [[ "$expand_size" -le "$pool_size" ]]; then + log_fail "pool $pool not expanded" + fi + + verify_pool $pool + + pool_size=$expand_size +done + +zpool destroy "$pool" + +log_pass "raidz expansion test succeeded." 
diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh new file mode 100755 index 000000000..4d85c4689 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach poolname raidz ...' should reject device attach if pool +# is in checkpointed state. If checkpoint creation requested on +# expanding pool, the request should be rejected. + +# +# STRATEGY: +# 1. Create block device files for the test raidz pool. +# 2. Create pool and checkpoint it. +# 3. Try to expand raidz, ensure that request rejected. +# 4. Recreate the pool. +# 5. Apply raidz expansion. +# 6. Ensure that checkpoint cannot be created. 
+ +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES 0 +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +nparity=1 +raid=raidz$nparity +pool=$TESTPOOL +opts="-o cachefile=none" + +# case 1: checkpoint exist, try to expand +log_must zpool create -f $opts $pool $raid ${disks[1..$(($devs-1))]} +log_must zfs set primarycache=metadata $pool +log_must zpool checkpoint $pool +log_mustnot zpool attach $pool ${raid}-0 ${disks[$devs]} +log_must zpool destroy $pool + +# +# case 2: expansion in progress, try to checkpoint +# +# Sets pause point at 25% of allocated space so that we know an +# expansion is still in progress when we attempt the checkpoint +# +log_must zpool create -f $opts $pool $raid ${disks[1..$(($devs-1))]} +log_must zfs set primarycache=metadata $pool +log_must zfs create $pool/fs +log_must fill_fs /$pool/fs 1 512 100 1024 R +allocated=$(zpool list -Hp -o allocated $pool) +log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $((allocated / 4)) +log_must zpool attach $pool ${raid}-0 ${disks[$devs]} +log_mustnot zpool checkpoint $pool +log_must zpool destroy $pool + +log_pass "raidz expansion test succeeded." 
diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh new file mode 100755 index 000000000..712b25261 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh @@ -0,0 +1,141 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib + +# +# DESCRIPTION: +# Check raidz expansion is able to work correctly under i/o load. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. 
For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - create couple of datasets with different recordsize and fill it +# - set a max reflow value near pool capacity +# - wait for reflow to reach this max +# - verify pool +# - set reflow bytes to max value to complete the expansion + +typeset -r devs=10 +typeset -r dev_size_mb=256 + +typeset -a disks + +embedded_slog_min_ms=$(get_tunable EMBEDDED_SLOG_MIN_MS) + +function cleanup +{ + poolexists "$TESTPOOL" && zpool status -v "$TESTPOOL" + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 EMBEDDED_SLOG_MIN_MS $embedded_slog_min_ms + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES 0 +} + +function wait_expand_paused +{ + oldcopied='0' + newcopied='1' + # wait until reflow copied value stops changing + while [[ $oldcopied != $newcopied ]]; do + oldcopied=$newcopied + sleep 1 + newcopied=$(zpool status $TESTPOOL | \ + grep 'copied out of' | \ + awk '{print $1}') + done +} + +log_onexit cleanup + +log_must set_tunable32 EMBEDDED_SLOG_MIN_MS 99999 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +nparity=$((RANDOM%(3) + 1)) +raid=raidz$nparity +pool=$TESTPOOL +opts="-o cachefile=none" + +log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} + +log_must zfs create -o recordsize=8k $pool/fs +log_must fill_fs /$pool/fs 1 256 100 1024 R + +log_must zfs create -o recordsize=128k $pool/fs2 +log_must fill_fs /$pool/fs2 1 256 100 1024 R + +for disk in ${disks[$(($nparity+2))..$devs]}; do + log_must mkfile -n 400m /$pool/fs/file + log_bkgrnd randwritecomp /$pool/fs/file 250 + pid0=$! 
+ + # start some random writes in the background during expansion + log_must mkfile -n 400m /$pool/fs2/file2 + log_bkgrnd randwritecomp /$pool/fs2/file2 250 + pid1=$! + sleep 10 + + # Pause at half total bytes to be copied for expansion + reflow_size=$(get_pool_prop allocated $pool) + log_note need to reflow $reflow_size bytes + pause=$((reflow_size/2)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $pause + + log_must zpool attach $pool ${raid}-0 $disk + wait_expand_paused + + kill_if_running $pid0 + kill_if_running $pid1 + + log_must zpool scrub -w $pool + + log_must check_pool_status $pool "errors" "No known data errors" + log_must check_pool_status $pool "scan" "with 0 errors" + log_must check_pool_status $pool "scan" "repaired 0B" + + # Set pause past largest possible value for this pool + pause=$((devs*dev_size_mb*1024*1024)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $pause + + log_must zpool wait -t raidz_expand $pool +done + +log_must zpool destroy "$pool" + +log_pass "raidz expansion test succeeded." + diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh new file mode 100755 index 000000000..2be55dae4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh @@ -0,0 +1,121 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Check device replacement during raidz expansion. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - create couple of datasets with different recordsize and fill it +# - attach new device to the pool +# - offline and zero vdevs allowed by parity +# - wait some time and start offlined vdevs replacement +# - wait replacement completion and verify pool status + +typeset -r devs=10 +typeset -r dev_size_mb=128 + +typeset -a disks + +embedded_slog_min_ms=$(get_tunable EMBEDDED_SLOG_MIN_MS) +original_scrub_after_expand=$(get_tunable SCRUB_AFTER_EXPAND) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 EMBEDDED_SLOG_MIN_MS $embedded_slog_min_ms + log_must set_tunable32 SCRUB_AFTER_EXPAND $original_scrub_after_expand +} + +log_onexit cleanup + +log_must set_tunable32 EMBEDDED_SLOG_MIN_MS 99999 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +nparity=$((RANDOM%(3) + 1)) +raid=raidz$nparity +pool=$TESTPOOL +opts="-o cachefile=none" + +log_must set_tunable32 SCRUB_AFTER_EXPAND 0 + +log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} + +log_must zfs create -o recordsize=8k $pool/fs +log_must fill_fs /$pool/fs 1 128 100 1024 R + +log_must zfs create -o recordsize=128k $pool/fs2 +log_must fill_fs /$pool/fs2 
1 128 100 1024 R + +for disk in ${disks[$(($nparity+2))..$devs]}; do + log_must zpool attach $pool ${raid}-0 $disk + + sleep 10 + + for (( i=1; i<=$nparity; i=i+1 )); do + log_must zpool offline $pool ${disks[$i]} + log_must dd if=/dev/zero of=${disks[$i]} \ + bs=1024k count=$dev_size_mb conv=notrunc + done + + sleep 3 + + for (( i=1; i<=$nparity; i=i+1 )); do + log_must zpool replace $pool ${disks[$i]} + done + + log_must zpool wait -t replace $pool + log_must check_pool_status $pool "scan" "with 0 errors" + + log_must zpool wait -t raidz_expand $pool + + log_must zpool clear $pool + log_must zpool scrub -w $pool + + log_must zpool status -v + log_must check_pool_status $pool "scan" "with 0 errors" +done + +log_must zpool destroy "$pool" + +log_pass "raidz expansion test succeeded." + diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh new file mode 100755 index 000000000..a31a7d254 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh @@ -0,0 +1,177 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Check device replacement during raidz expansion using expansion pausing. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - create couple of datasets with different recordsize and fill it +# - set raidz expand maximum reflow bytes +# - attach new device to the pool +# - wait for reflow bytes to reach the maximum +# - offline and zero vdevs allowed by parity +# - wait some time and start offlined vdevs replacement +# - wait replacement completion and verify pool status +# - loop thru vdevs replacing with the max reflow bytes increasing +# - verify pool +# - set reflow bytes to max value to complete the expansion + +typeset -r devs=10 +typeset -r dev_size_mb=128 + +typeset -a disks + +embedded_slog_min_ms=$(get_tunable EMBEDDED_SLOG_MIN_MS) +original_scrub_after_expand=$(get_tunable SCRUB_AFTER_EXPAND) + +function cleanup +{ + poolexists "$TESTPOOL" && zpool status -v "$TESTPOOL" + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 EMBEDDED_SLOG_MIN_MS $embedded_slog_min_ms + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES 0 + log_must set_tunable32 SCRUB_AFTER_EXPAND $original_scrub_after_expand +} + +function wait_expand_paused +{ + oldcopied='0' + newcopied='1' + while [[ $oldcopied != $newcopied ]]; do + oldcopied=$newcopied + sleep 1 + newcopied=$(zpool status $TESTPOOL | \ + grep 'copied out of' | \ + awk '{print $1}') + done +} + +log_onexit cleanup + +function test_replace # +{ + pool=${1} + devices=${2} + nparity=${3} + device_count=0 + + log_must echo "devices=$devices" + + for dev in ${devices}; do + device_count=$((device_count+1)) + done + + index=$((RANDOM%(device_count-nparity))) + for (( j=1; j<=$nparity; j=j+1 )); do + log_must zpool offline $pool 
${disks[$((index+j))]} + log_must dd if=/dev/zero of=${disks[$((index+j))]} \ + bs=1024k count=$dev_size_mb conv=notrunc + done + + for (( j=1; j<=$nparity; j=j+1 )); do + log_must zpool replace $pool ${disks[$((index+j))]} + done + + log_must zpool wait -t replace $pool + log_must check_pool_status $pool "scan" "with 0 errors" + + log_must zpool clear $pool + log_must zpool scrub -w $pool + + log_must zpool status -v + log_must check_pool_status $pool "scan" "repaired 0B" +} + +log_must set_tunable32 EMBEDDED_SLOG_MIN_MS 99999 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +nparity=$((RANDOM%(3) + 1)) +raid=raidz$nparity +pool=$TESTPOOL +opts="-o cachefile=none" +devices="" + +log_must set_tunable32 SCRUB_AFTER_EXPAND 0 + +log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} +devices="${disks[1..$(($nparity+1))]}" + +log_must zfs create -o recordsize=8k $pool/fs +log_must fill_fs /$pool/fs 1 128 100 1024 R + +log_must zfs create -o recordsize=128k $pool/fs2 +log_must fill_fs /$pool/fs2 1 128 100 1024 R + +for disk in ${disks[$(($nparity+2))..$devs]}; do + # Set pause to some random value near halfway point + reflow_size=$(get_pool_prop allocated $pool) + pause=$((((RANDOM << 15) + RANDOM) % reflow_size / 2)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $pause + + log_must zpool attach $pool ${raid}-0 $disk + devices="$devices $disk" + + wait_expand_paused + + for (( i=0; i<2; i++ )); do + test_replace $pool "$devices" $nparity + + # Increase pause by about 25% + pause=$((pause + (((RANDOM << 15) + RANDOM) % \ + reflow_size) / 4)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $pause + + wait_expand_paused + done + + # Set pause past largest possible value for this pool + pause=$((devs*dev_size_mb*1024*1024)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $pause + + log_must 
zpool wait -t raidz_expand $pool +done + +log_must zpool destroy "$pool" + +log_pass "raidz expansion test succeeded." + diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_006_neg.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_006_neg.ksh new file mode 100755 index 000000000..35ba8bde2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_006_neg.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach poolname raidz ...' should fail if raidz_expansion +# feature is not enabled. +# +# STRATEGY: +# 1. Create raidz pool with raidz_expansion feature disabled +# 2. Attempt to attach a device to the raidz vdev +# 3. Verify that device attached failed +# 4. 
Destroy the raidz pool + +typeset -r devs=4 +typeset -r dev_size_mb=128 +typeset -a disks + +function cleanup +{ + log_pos zpool status "$TESTPOOL" + + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done +} + +log_onexit cleanup + +for i in {0..$devs}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M "$device" + if [[ $i -ne $devs ]]; then + disks[${#disks[*]}+1]=$device + fi +done + +# create a pool with raidz_expansion feature disabled +log_must zpool create -f -o cachefile=none -o feature@raidz_expansion=disabled \ + "$TESTPOOL" raidz1 "${disks[@]}" +status=$(zpool list -H -o feature@raidz_expansion "$TESTPOOL") +if [[ "$status" != "disabled" ]]; then + log_fail "raidz_expansion feature was not disabled" +fi + +# expecting attach to fail +log_mustnot_expect "raidz_expansion feature must be enabled" zpool attach -f \ + "$TESTPOOL" raidz1-0 "$TEST_BASE_DIR/dev-$devs" +log_must zpool destroy "$TESTPOOL" + +log_pass "raidz attach failed with feature disabled as expected" diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_007_neg.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_007_neg.ksh new file mode 100755 index 000000000..78294cb9e --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_007_neg.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Negative for FreeBSD Only +# +# Attempting to expand a RAIDZ should fail if the scratch area on the +# existing disks contains BTX Server binary (used to boot FreeBSD when +# using MBR partitions with ZFS). +# +# STRATEGY: +# 1. Create raidz pool +# 2. Add a BTX header to the reserved boot area +# 3. Attempt to attach a device to the raidz vdev +# 4. Verify that device attached failed +# 5. Destroy the raidz pool + +typeset -r devs=4 +typeset -r dev_size_mb=128 +typeset -a disks + +function cleanup +{ + log_pos zpool status "$TESTPOOL" + + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done +} + +log_onexit cleanup + +for i in {0..$devs}; do + device=$TEST_BASE_DIR/dev-$i + # simulate active BTX Server data by inserting a BTX header + printf "\xeb\x0e%s\x01\x02\x80" "BTX" | dd of="$device" \ + bs=512 seek=1024 status=none + log_must truncate -s ${dev_size_mb}M "$device" + if [[ $i -ne $devs ]]; then + disks[${#disks[*]}+1]=$device + fi +done + +log_must zpool create -f -o cachefile=none "$TESTPOOL" raidz1 "${disks[@]}" + +if is_freebsd; then + # expecting attach to fail + log_mustnot_expect "the reserved boot area" zpool attach -f \ + "$TESTPOOL" raidz1-0 "$TEST_BASE_DIR/dev-$devs" + log_must zpool destroy "$TESTPOOL" + log_pass "raidz attach failed with in-use reserved boot area" +else + # expecting attach to pass everywhere else + log_must zpool attach -f "$TESTPOOL" raidz1-0 "$TEST_BASE_DIR/dev-$devs" + log_must zpool destroy "$TESTPOOL" + log_pass "raidz attach passed with in-use reserved boot area" +fi +