diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c index 8a2cec4ca..a3446c52c 100644 --- a/cmd/raidz_test/raidz_bench.c +++ b/cmd/raidz_test/raidz_bench.c @@ -83,8 +83,17 @@ run_gen_bench_impl(const char *impl) /* create suitable raidz_map */ ncols = rto_opts.rto_dcols + fn + 1; zio_bench.io_size = 1ULL << ds; - rm_bench = vdev_raidz_map_alloc(&zio_bench, - BENCH_ASHIFT, ncols, fn+1); + + if (rto_opts.rto_expand) { + rm_bench = vdev_raidz_map_alloc_expanded( + zio_bench.io_abd, + zio_bench.io_size, zio_bench.io_offset, + rto_opts.rto_ashift, ncols+1, ncols, + fn+1, rto_opts.rto_expand_offset); + } else { + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, fn+1); + } /* estimate iteration count */ iter_cnt = GEN_BENCH_MEMORY; @@ -163,8 +172,16 @@ run_rec_bench_impl(const char *impl) (1ULL << BENCH_ASHIFT)) continue; - rm_bench = vdev_raidz_map_alloc(&zio_bench, - BENCH_ASHIFT, ncols, PARITY_PQR); + if (rto_opts.rto_expand) { + rm_bench = vdev_raidz_map_alloc_expanded( + zio_bench.io_abd, + zio_bench.io_size, zio_bench.io_offset, + BENCH_ASHIFT, ncols+1, ncols, + PARITY_PQR, rto_opts.rto_expand_offset); + } else { + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, PARITY_PQR); + } /* estimate iteration count */ iter_cnt = (REC_BENCH_MEMORY); diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index 66f36b0d5..4e2639f36 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -77,16 +77,20 @@ static void print_opts(raidz_test_opts_t *opts, boolean_t force) (void) fprintf(stdout, DBLSEP "Running with options:\n" " (-a) zio ashift : %zu\n" " (-o) zio offset : 1 << %zu\n" + " (-e) expanded map : %s\n" + " (-r) reflow offset : %llx\n" " (-d) number of raidz data columns : %zu\n" " (-s) size of DATA : 1 << %zu\n" " (-S) sweep parameters : %s \n" " (-v) verbose : %s \n\n", - opts->rto_ashift, /* -a */ - ilog2(opts->rto_offset), /* -o */ - opts->rto_dcols, /* -d */ - ilog2(opts->rto_dsize), /* -s */ - opts->rto_sweep ? "yes" : "no", /* -S */ - verbose); /* -v */ + opts->rto_ashift, /* -a */ + ilog2(opts->rto_offset), /* -o */ + opts->rto_expand ? "yes" : "no", /* -e */ + (u_longlong_t)opts->rto_expand_offset, /* -r */ + opts->rto_dcols, /* -d */ + ilog2(opts->rto_dsize), /* -s */ + opts->rto_sweep ? "yes" : "no", /* -S */ + verbose); /* -v */ } } @@ -104,6 +108,8 @@ static void usage(boolean_t requested) "\t[-S parameter sweep (default: %s)]\n" "\t[-t timeout for parameter sweep test]\n" "\t[-B benchmark all raidz implementations]\n" + "\t[-e use expanded raidz map (default: %s)]\n" + "\t[-r expanded raidz map reflow offset (default: %llx)]\n" "\t[-v increase verbosity (default: %zu)]\n" "\t[-h (print help)]\n" "\t[-T test the test, see if failure would be detected]\n" @@ -114,6 +120,8 @@ static void usage(boolean_t requested) o->rto_dcols, /* -d */ ilog2(o->rto_dsize), /* -s */ rto_opts.rto_sweep ? "yes" : "no", /* -S */ + rto_opts.rto_expand ? "yes" : "no", /* -e */ + (u_longlong_t)o->rto_expand_offset, /* -r */ o->rto_v); /* -d */ exit(requested ? 
0 : 1); @@ -128,7 +136,7 @@ static void process_options(int argc, char **argv) bcopy(&rto_opts_defaults, o, sizeof (*o)); - while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) { + while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) { value = 0; switch (opt) { @@ -136,6 +144,12 @@ static void process_options(int argc, char **argv) value = strtoull(optarg, NULL, 0); o->rto_ashift = MIN(13, MAX(9, value)); break; + case 'e': + o->rto_expand = 1; + break; + case 'r': + o->rto_expand_offset = strtoull(optarg, NULL, 0); + break; case 'o': value = strtoull(optarg, NULL, 0); o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9; @@ -179,25 +193,34 @@ static void process_options(int argc, char **argv) } } -#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd) -#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size) +#define DATA_COL(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_abd) +#define DATA_COL_SIZE(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_size) -#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd) -#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size) +#define CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd) +#define CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size) static int cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) { - int i, ret = 0; + int r, i, ret = 0; VERIFY(parity >= 1 && parity <= 3); - for (i = 0; i < parity; i++) { - if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i)) - != 0) { - ret++; - LOG_OPT(D_DEBUG, opts, - "\nParity block [%d] different!\n", i); + for (r = 0; r < rm->rm_nrows; r++) { + raidz_row_t * const rr = rm->rm_row[r]; + raidz_row_t * const rrg = opts->rm_golden->rm_row[r]; + for (i = 0; i < parity; i++) { + if (CODE_COL_SIZE(rrg, i) == 0) { + VERIFY0(CODE_COL_SIZE(rr, i)); + continue; + } + + if (abd_cmp(CODE_COL(rr, i), + CODE_COL(rrg, i)) != 0) { + ret++; + LOG_OPT(D_DEBUG, opts, + "\nParity block [%d] different!\n", i); + } } } return (ret); @@ -206,16 +229,26 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) static int cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) { - int i, ret = 0; - int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden); + int r, i, dcols, ret = 0; - for (i = 0; i < dcols; i++) { - if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i)) - != 0) { - ret++; + for (r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + raidz_row_t *rrg = opts->rm_golden->rm_row[r]; + dcols = opts->rm_golden->rm_row[0]->rr_cols - + raidz_parity(opts->rm_golden); + for (i = 0; i < dcols; i++) { + if (DATA_COL_SIZE(rrg, i) == 0) { + VERIFY0(DATA_COL_SIZE(rr, i)); + continue; + } - LOG_OPT(D_DEBUG, opts, - "\nData block [%d] different!\n", i); + if (abd_cmp(DATA_COL(rrg, i), + DATA_COL(rr, i)) != 0) { + ret++; + + LOG_OPT(D_DEBUG, opts, + "\nData block [%d] different!\n", i); + } } } return (ret); @@ -236,12 +269,13 @@ init_rand(void *data, size_t size, void *private) static void corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) { - int i; - raidz_col_t *col; - - for (i = 0; i < cnt; i++) { - col = &rm->rm_col[tgts[i]]; - abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL); + for (int r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + for (int i = 0; i < cnt; i++) { + raidz_col_t *col = &rr->rr_col[tgts[i]]; + abd_iterate_func(col->rc_abd, 0, col->rc_size, + init_rand, NULL); + } } } @@ -288,10 +322,22 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const 
int parity) VERIFY0(vdev_raidz_impl_set("original")); - opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, - opts->rto_ashift, total_ncols, parity); - rm_test = vdev_raidz_map_alloc(zio_test, - opts->rto_ashift, total_ncols, parity); + if (opts->rto_expand) { + opts->rm_golden = + vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd, + opts->zio_golden->io_size, opts->zio_golden->io_offset, + opts->rto_ashift, total_ncols+1, total_ncols, + parity, opts->rto_expand_offset); + rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd, + zio_test->io_size, zio_test->io_offset, + opts->rto_ashift, total_ncols+1, total_ncols, + parity, opts->rto_expand_offset); + } else { + opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, + opts->rto_ashift, total_ncols, parity); + rm_test = vdev_raidz_map_alloc(zio_test, + opts->rto_ashift, total_ncols, parity); + } VERIFY(opts->zio_golden); VERIFY(opts->rm_golden); @@ -312,6 +358,188 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) return (err); } +/* + * If reflow is not in progress, reflow_offset should be UINT64_MAX. + * For each row, if the row is entirely before reflow_offset, it will + * come from the new location. Otherwise this row will come from the + * old location. Therefore, rows that straddle the reflow_offset will + * come from the old location. + * + * NOTE: Until raidz expansion is implemented this function is only + * needed by raidz_test.c to test the multi-row raidz_map_t functionality. + */ +raidz_map_t * +vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity, uint64_t reflow_offset) +{ + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + uint64_t q, r, bc, devidx, asize = 0, tot; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + q = s / (logical_cols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + + for (uint64_t row = 0; row < rows; row++) { + raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, + rr_col[cols]), KM_SLEEP); + rm->rm_row[row] = rr; + + /* The starting RAIDZ (parent) vdev sector of the row. */ + uint64_t b = (offset >> ashift) + row * logical_cols; + + /* + * If we are in the middle of a reflow, and any part of this + * row has not been copied, then use the old location of + * this row. + */ + int row_phys_cols = physical_cols; + if (b + (logical_cols - nparity) > reflow_offset >> ashift) + row_phys_cols--; + + /* starting child of this row */ + uint64_t child_id = b % row_phys_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / row_phys_cols) << ashift; + + /* + * We set cols to the entire width of the block, even + * if this row is shorter.
This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). + * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". + */ + rr->rr_cols = cols; + rr->rr_bigcols = bc; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_copy = NULL; + rr->rr_abd_empty = NULL; + rr->rr_nempty = 0; + + for (int c = 0; c < rr->rr_cols; c++, child_id++) { + if (child_id >= row_phys_cols) { + child_id -= row_phys_cols; + child_offset += 1ULL << ashift; + } + rr->rr_col[c].rc_devidx = child_id; + rr->rr_col[c].rc_offset = child_offset; + rr->rr_col[c].rc_gdata = NULL; + rr->rr_col[c].rc_orig_data = NULL; + rr->rr_col[c].rc_error = 0; + rr->rr_col[c].rc_tried = 0; + rr->rr_col[c].rc_skipped = 0; + rr->rr_col[c].rc_need_orig_restore = B_FALSE; + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rr->rr_col[c].rc_size = 1ULL << ashift; + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, + B_TRUE); + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end; this is only needed for parity generation. + */ + rr->rr_col[c].rc_size = 0; + rr->rr_col[c].rc_abd = NULL; + } else { + /* + * "data column" (col excluding parity) + * Add an ASCII art diagram here + */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } + rr->rr_col[c].rc_size = 1ULL << ashift; + rr->rr_col[c].rc_abd = + abd_get_offset(abd, off << ashift); + } + + asize += rr->rr_col[c].rc_size; + } + /* + * If all data stored spans all columns, there's a danger that + * parity will always be on the same device and, since parity + * isn't read during normal operation, that that device's I/O + * bandwidth won't be used effectively. We therefore switch + * the parity every 1MB. + * + * ...at least that was, ostensibly, the theory. As a practical + * matter unless we juggle the parity between all devices + * evenly, we won't see any benefit. Further, occasional writes + * that aren't a multiple of the LCM of the number of children + * and the minimum stripe width are sufficient to avoid pessimal + * behavior. Unfortunately, this decision created an implicit + * on-disk format requirement that we need to support for all + * eternity, but only for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for + * padding we must make sure to note this swap. We will never + * intend to skip the first column since at least one data and + * one parity column must appear in each row.
+ */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + devidx = rr->rr_col[0].rc_devidx; + uint64_t o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; + } + + } + ASSERT3U(asize, ==, tot << ashift); + + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + + return (rm); +} +
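To make the row geometry above concrete, here is a standalone sketch (illustrative only, not part of this change; the I/O parameters are invented) that replays the q/r/bc/tot arithmetic from vdev_raidz_map_alloc_expanded() for a single write:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/*
	 * Hypothetical example: a 64K write at ashift=12 (4K sectors)
	 * onto 5 logical columns with single parity.
	 */
	uint64_t size = 65536, ashift = 12;
	uint64_t logical_cols = 5, nparity = 1;

	uint64_t s = size >> ashift;			/* 16 data sectors */
	uint64_t q = s / (logical_cols - nparity);	/* 4 full rows */
	uint64_t r = s - q * (logical_cols - nparity);	/* no remainder */
	uint64_t bc = (r == 0 ? 0 : r + nparity);	/* no big columns */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));	/* 20 */
	/* howmany(tot, logical_cols) spelled out: 4 rows */
	uint64_t rows = (tot + logical_cols - 1) / logical_cols;

	printf("s=%llu q=%llu r=%llu bc=%llu tot=%llu rows=%llu\n",
	    (unsigned long long)s, (unsigned long long)q,
	    (unsigned long long)r, (unsigned long long)bc,
	    (unsigned long long)tot, (unsigned long long)rows);
	return (0);
}

Every row in this example is a full five-column stripe (one parity plus four data sectors), so asize comes out to tot << ashift, matching the ASSERT3U at the end of the function above.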
static raidz_map_t * init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) { @@ -330,8 +558,15 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) (*zio)->io_abd = raidz_alloc(alloc_dsize); init_zio_abd(*zio); - rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, - total_ncols, parity); + if (opts->rto_expand) { + rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd, + (*zio)->io_size, (*zio)->io_offset, + opts->rto_ashift, total_ncols+1, total_ncols, + parity, opts->rto_expand_offset); + } else { + rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, + total_ncols, parity); + } VERIFY(rm); /* Make sure code columns are destroyed */ @@ -420,7 +655,7 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) if (fn < RAIDZ_REC_PQ) { /* can reconstruct 1 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { - if (x0 >= rm->rm_cols - raidz_parity(rm)) + if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; /* Check if should stop */ @@ -445,10 +680,11 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) } else if (fn < RAIDZ_REC_PQR) { /* can reconstruct 2 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { - if (x0 >= rm->rm_cols - raidz_parity(rm)) + if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { - if (x1 >= rm->rm_cols - raidz_parity(rm)) + if (x1 >= rm->rm_row[0]->rr_cols - + raidz_parity(rm)) continue; /* Check if should stop */ @@ -475,14 +711,15 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) } else { /* can reconstruct 3 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { - if (x0 >= rm->rm_cols - raidz_parity(rm)) + if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { - if (x1 >= rm->rm_cols - raidz_parity(rm)) + if (x1 >= rm->rm_row[0]->rr_cols - + raidz_parity(rm)) continue; for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) { - if (x2 >= - rm->rm_cols - raidz_parity(rm)) + if (x2 >= rm->rm_row[0]->rr_cols - + raidz_parity(rm)) continue; /* Check if should stop */ @@ -700,6 +937,8 @@ run_sweep(void) opts->rto_dcols = dcols_v[d]; opts->rto_offset = (1 << ashift_v[a]) * rand(); opts->rto_dsize = size_v[s]; + opts->rto_expand = rto_opts.rto_expand; + opts->rto_expand_offset = rto_opts.rto_expand_offset; opts->rto_v = 0; /* be quiet */ VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts, @@ -732,6 +971,7 @@ exit: return (sweep_state == SWEEP_ERROR ?
SWEEP_ERROR : 0); } + int main(int argc, char **argv) { diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h index 09c825ae4..0f7f4cee3 100644 --- a/cmd/raidz_test/raidz_test.h +++ b/cmd/raidz_test/raidz_test.h @@ -44,13 +44,15 @@ static const char *raidz_impl_names[] = { typedef struct raidz_test_opts { size_t rto_ashift; - size_t rto_offset; + uint64_t rto_offset; size_t rto_dcols; size_t rto_dsize; size_t rto_v; size_t rto_sweep; size_t rto_sweep_timeout; size_t rto_benchmark; + size_t rto_expand; + uint64_t rto_expand_offset; size_t rto_sanity; size_t rto_gdb; @@ -69,6 +71,8 @@ static const raidz_test_opts_t rto_opts_defaults = { .rto_v = 0, .rto_sweep = 0, .rto_benchmark = 0, + .rto_expand = 0, + .rto_expand_offset = -1ULL, .rto_sanity = 0, .rto_gdb = 0, .rto_should_stop = B_FALSE @@ -113,4 +117,7 @@ void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); +struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t, + uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); + #endif /* RAIDZ_TEST_H */ diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index dbf09a652..d4a37dee0 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1642,7 +1642,11 @@ dump_metaslab(metaslab_t *msp) SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } - ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + if (vd->vdev_ops == &vdev_draid_ops) + ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); + else + ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift); + dump_spacemap(spa->spa_meta_objset, msp->ms_sm); if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { @@ -5203,8 +5207,6 @@ zdb_blkptr_done(zio_t *zio) zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; - abd_free(zio->io_abd); - mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); @@ -5231,6 +5233,8 @@ zdb_blkptr_done(zio_t *zio) blkbuf); } mutex_exit(&spa->spa_scrub_lock); + + abd_free(zio->io_abd); } static int diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 8190beb0c..4a58e1f1d 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -435,7 +435,15 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) return; } - ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE); + /* + * Prefer sequential resilvering when supported (mirrors and dRAID), + * otherwise fall back to a traditional healing resilver. + */ + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE); + if (ret != 0) { + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, + B_TRUE, B_FALSE); + } zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", fullpath, path, (ret == 0) ? "no errors" :
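The attach-then-fallback pattern added to zfs_process_add() above is small enough to restate on its own. A hedged sketch follows (the helper name attach_prefer_rebuild() is invented for illustration; it uses the rebuild flag this change adds to zpool_vdev_attach()):

#include <libzfs.h>

/*
 * Illustrative helper, not part of the change: request a sequential
 * (rebuild) resilver first and, if the vdev type cannot support one,
 * retry the attach as a traditional healing resilver.
 */
static int
attach_prefer_rebuild(zpool_handle_t *zhp, const char *oldpath,
    const char *newpath, nvlist_t *nvroot)
{
	/* The final B_TRUE requests the sequential rebuild. */
	int ret = zpool_vdev_attach(zhp, oldpath, newpath, nvroot,
	    B_TRUE, B_TRUE);
	if (ret != 0) {
		/* Fall back to a healing resilver (rebuild = B_FALSE). */
		ret = zpool_vdev_attach(zhp, oldpath, newpath, nvroot,
		    B_TRUE, B_FALSE);
	}
	return (ret);
}

Sequential resilvering is supported by mirrors and dRAID, so for other vdev types the first call fails and the unconditional retry restores the old behavior.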
diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index ba8a6de3a..89bb84e48 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -219,12 +219,18 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) * replace it.
*/ for (s = 0; s < nspares; s++) { - char *spare_name; + boolean_t rebuild = B_FALSE; + char *spare_name, *type; if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, &spare_name) != 0) continue; + /* prefer sequential resilvering for distributed spares */ + if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE, + &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) + rebuild = B_TRUE; + /* if set, add the "ashift" pool property to the spare nvlist */ if (source != ZPROP_SRC_DEFAULT) (void) nvlist_add_uint64(spares[s], @@ -237,7 +243,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) dev_name, basename(spare_name)); if (zpool_vdev_attach(zhp, dev_name, spare_name, - replacement, B_TRUE, B_FALSE) == 0) { + replacement, B_TRUE, rebuild) == 0) { free(dev_name); nvlist_free(replacement); return (B_TRUE); @@ -499,6 +505,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, * Attempt to substitute a hot spare. */ (void) replace_with_spare(hdl, zhp, vdev); + zpool_close(zhp); } diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index f609a4e70..340a7db96 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -892,6 +892,107 @@ usage: return (-1); } +/* + * Return a default volblocksize for the pool which always uses more than + * half of the data sectors. This primarily applies to dRAID which always + * writes full stripe widths. + */ +static uint64_t +default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +{ + uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + nvlist_t *tree, **vdevs; + uint_t nvdevs; + + nvlist_t *config = zpool_get_config(zhp, NULL); + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || + nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, + &vdevs, &nvdevs) != 0) { + return (ZVOL_DEFAULT_BLOCKSIZE); + } + + for (int i = 0; i < nvdevs; i++) { + nvlist_t *nv = vdevs[i]; + uint64_t ashift, ndata, nparity; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &ashift) != 0) + continue; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, + &ndata) == 0) { + /* dRAID minimum allocation width */ + asize = MAX(asize, ndata * (1ULL << ashift)); + } else if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, + &nparity) == 0) { + /* raidz minimum allocation width */ + if (nparity == 1) + asize = MAX(asize, 2 * (1ULL << ashift)); + else + asize = MAX(asize, 4 * (1ULL << ashift)); + } else { + /* mirror or (non-redundant) leaf vdev */ + asize = MAX(asize, 1ULL << ashift); + } + } + + /* + * Calculate the target volblocksize such that more than half + * of the asize is used. The following table is for 4k sectors. + * + * n asize blksz used | n asize blksz used + * -------------------------+--------------------------------- + * 1 4,096 8,192 100% | 9 36,864 32,768 88% + * 2 8,192 8,192 100% | 10 40,960 32,768 80% + * 3 12,288 8,192 66% | 11 45,056 32,768 72% + * 4 16,384 16,384 100% | 12 49,152 32,768 66% + * 5 20,480 16,384 80% | 13 53,248 32,768 61% + * 6 24,576 16,384 66% | 14 57,344 32,768 57% + * 7 28,672 16,384 57% | 15 61,440 32,768 53% + * 8 32,768 32,768 100% | 16 65,536 65,536 100% + * + * This is primarily a concern for dRAID which always allocates + * a full stripe width. For dRAID the default stripe width is + * n=8 in which case the volblocksize is set to 32k. Ignoring + * compression there are no unused sectors. This same reasoning + * applies to raidz[2,3] so target 4 sectors to minimize waste.
+ */ + uint64_t tgt_volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + while (tgt_volblocksize * 2 <= asize) + tgt_volblocksize *= 2; + + const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); + if (nvlist_lookup_uint64(props, prop, &volblocksize) == 0) { + + /* Issue a warning when a non-optimal size is requested. */ + if (volblocksize < ZVOL_DEFAULT_BLOCKSIZE) { + (void) fprintf(stderr, gettext("Warning: " + "volblocksize (%llu) is less than the default " + "minimum block size (%llu).\nTo reduce wasted " + "space a volblocksize of %llu is recommended.\n"), + (u_longlong_t)volblocksize, + (u_longlong_t)ZVOL_DEFAULT_BLOCKSIZE, + (u_longlong_t)tgt_volblocksize); + } else if (volblocksize < tgt_volblocksize) { + (void) fprintf(stderr, gettext("Warning: " + "volblocksize (%llu) is much less than the " + "minimum allocation\nunit (%llu), which wastes " + "at least %llu%% of space. To reduce wasted " + "space,\nuse a larger volblocksize (%llu is " + "recommended), fewer dRAID data disks\n" + "per group, or smaller sector size (ashift).\n"), + (u_longlong_t)volblocksize, (u_longlong_t)asize, + (u_longlong_t)((100 * (asize - volblocksize)) / + asize), (u_longlong_t)tgt_volblocksize); + } + } else { + volblocksize = tgt_volblocksize; + fnvlist_add_uint64(props, prop, volblocksize); + } + + return (volblocksize); +} +
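As a quick check of the doubling loop above, this standalone sketch (illustrative only; the 8K starting point mirrors ZVOL_DEFAULT_BLOCKSIZE) reproduces the n=8 row of the table:

#include <stdio.h>
#include <stdint.h>

#define DEFAULT_BLOCKSIZE	8192	/* ZVOL_DEFAULT_BLOCKSIZE (8K) */

int
main(void)
{
	/* dRAID with 8 data disks at ashift=12: asize = 8 * 4K = 32K */
	uint64_t asize = 8 * 4096;
	uint64_t tgt = DEFAULT_BLOCKSIZE;

	/*
	 * Double while the doubled size still fits within asize; the
	 * result is the largest power of two that uses more than half
	 * of every allocation.
	 */
	while (tgt * 2 <= asize)
		tgt *= 2;

	/* Prints asize=32768 target=32768, the n=8 table row above. */
	printf("asize=%llu target=%llu\n",
	    (unsigned long long)asize, (unsigned long long)tgt);
	return (0);
}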
/* * zfs create [-Pnpv] [-o prop=value] ... fs * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size @@ -932,6 +1033,7 @@ zfs_do_create(int argc, char **argv) int ret = 1; nvlist_t *props; uint64_t intval; + char *strval; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); @@ -1018,7 +1120,7 @@ zfs_do_create(int argc, char **argv) goto badusage; } - if (dryrun || (type == ZFS_TYPE_VOLUME && !noreserve)) { + if (dryrun || type == ZFS_TYPE_VOLUME) { char msg[ZFS_MAX_DATASET_NAME_LEN * 2]; char *p; @@ -1040,18 +1142,24 @@ zfs_do_create(int argc, char **argv) } } - /* - * if volsize is not a multiple of volblocksize, round it up to the - * nearest multiple of the volblocksize - */ if (type == ZFS_TYPE_VOLUME) { - uint64_t volblocksize; + const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); + uint64_t volblocksize = default_volblocksize(zpool_handle, + real_props); - if (nvlist_lookup_uint64(props, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - &volblocksize) != 0) - volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + if (volblocksize != ZVOL_DEFAULT_BLOCKSIZE && + nvlist_lookup_string(props, prop, &strval) != 0) { + if (asprintf(&strval, "%llu", + (u_longlong_t)volblocksize) == -1) + nomem(); + nvlist_add_string(props, prop, strval); + free(strval); + } + /* + * If volsize is not a multiple of volblocksize, round it + * up to the nearest multiple of the volblocksize. + */ if (volsize % volblocksize) { volsize = P2ROUNDUP_TYPED(volsize, volblocksize, uint64_t); @@ -1064,11 +1172,9 @@ zfs_do_create(int argc, char **argv) } } - if (type == ZFS_TYPE_VOLUME && !noreserve) { uint64_t spa_version; zfs_prop_t resv_prop; - char *strval; spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 83a9b5a5a..524cff335 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2294,7 +2294,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } } - /* Display vdev initialization and trim status for leaves */ + /* Display vdev initialization and trim status for leaves.
*/ if (children == 0) { print_status_initialize(vs, cb->cb_print_vdev_init); print_status_trim(vs, cb->cb_print_vdev_trim); @@ -9849,7 +9849,8 @@ vdev_any_spare_replacing(nvlist_t *nv) (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || - strcmp(vdev_type, VDEV_TYPE_SPARE) == 0) { + strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 || + strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) { return (B_TRUE); } diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 9aa09b18c..c86081a81 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -86,9 +86,6 @@ boolean_t error_seen; boolean_t is_force; - - - /*PRINTFLIKE1*/ void vdev_error(const char *fmt, ...) @@ -222,6 +219,9 @@ is_spare(nvlist_t *config, const char *path) uint_t i, nspares; boolean_t inuse; + if (zpool_is_draid_spare(path)) + return (B_TRUE); + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) return (B_FALSE); @@ -267,9 +267,10 @@ is_spare(nvlist_t *config, const char *path) * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for /xxx + * draid* Virtual dRAID spare */ static nvlist_t * -make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) +make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) { char path[MAXPATHLEN]; struct stat64 statbuf; @@ -309,6 +310,17 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) /* After whole disk check restore original passed path */ strlcpy(path, arg, sizeof (path)); + } else if (zpool_is_draid_spare(arg)) { + if (!is_primary) { + (void) fprintf(stderr, + gettext("cannot open '%s': dRAID spares can only " + "be used to replace primary vdevs\n"), arg); + return (NULL); + } + + wholedisk = B_TRUE; + strlcpy(path, arg, sizeof (path)); + type = VDEV_TYPE_DRAID_SPARE; } else { err = is_shorthand_path(arg, path, sizeof (path), &statbuf, &wholedisk); @@ -337,17 +349,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) } } - /* - * Determine whether this is a device or a file. - */ - if (wholedisk || S_ISBLK(statbuf.st_mode)) { - type = VDEV_TYPE_DISK; - } else if (S_ISREG(statbuf.st_mode)) { - type = VDEV_TYPE_FILE; - } else { - (void) fprintf(stderr, gettext("cannot use '%s': must be a " - "block device or regular file\n"), path); - return (NULL); + if (type == NULL) { + /* + * Determine whether this is a device or a file. + */ + if (wholedisk || S_ISBLK(statbuf.st_mode)) { + type = VDEV_TYPE_DISK; + } else if (S_ISREG(statbuf.st_mode)) { + type = VDEV_TYPE_FILE; + } else { + fprintf(stderr, gettext("cannot use '%s': must " + "be a block device or regular file\n"), path); + return (NULL); + } } /* @@ -358,10 +372,7 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); - verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); - if (is_log) - verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS, - VDEV_ALLOC_BIAS_LOG) == 0); + if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); @@ -432,11 +443,16 @@ typedef struct replication_level { #define ZPOOL_FUZZ (16 * 1024 * 1024) +/* + * N.B. For the purposes of comparing replication levels dRAID can be + * considered functionally equivalent to raidz.
+ */ static boolean_t is_raidz_mirror(replication_level_t *a, replication_level_t *b, replication_level_t **raidz, replication_level_t **mirror) { - if (strcmp(a->zprl_type, "raidz") == 0 && + if ((strcmp(a->zprl_type, "raidz") == 0 || + strcmp(a->zprl_type, "draid") == 0) && strcmp(b->zprl_type, "mirror") == 0) { *raidz = a; *mirror = b; @@ -445,6 +461,22 @@ is_raidz_mirror(replication_level_t *a, replication_level_t *b, return (B_FALSE); } +/* + * Comparison for determining if dRAID and raidz were passed in either order. + */ +static boolean_t +is_raidz_draid(replication_level_t *a, replication_level_t *b) +{ + if ((strcmp(a->zprl_type, "raidz") == 0 || + strcmp(a->zprl_type, "draid") == 0) && + (strcmp(b->zprl_type, "raidz") == 0 || + strcmp(b->zprl_type, "draid") == 0)) { + return (B_TRUE); + } + + return (B_FALSE); +} + /* * Given a list of toplevel vdevs, return the current replication level. If * the config is inconsistent, then NULL is returned. If 'fatal' is set, then @@ -511,7 +543,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_type = type; rep.zprl_children = 0; - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); @@ -677,6 +710,29 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) else return (NULL); } + } else if (is_raidz_draid(&lastrep, &rep)) { + /* + * Accept raidz and draid when they can + * handle the same number of disk failures. + */ + if (lastrep.zprl_parity != rep.zprl_parity) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication " + "level: %s and %s vdevs " + "with different " + "redundancy, %llu vs. " + "%llu are present\n"), + lastrep.zprl_type, + rep.zprl_type, + lastrep.zprl_parity, + rep.zprl_parity); + else + return (NULL); + } } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { if (ret != NULL) @@ -1103,31 +1159,87 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, return (anyinuse); } +/* + * Returns the parity level extracted from a raidz or draid type. + * If the parity cannot be determined zero is returned. + */ +static int +get_parity(const char *type) +{ + long parity = 0; + const char *p; + + if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { + p = type + strlen(VDEV_TYPE_RAIDZ); + + if (*p == '\0') { + /* when unspecified default to single parity */ + return (1); + } else if (*p == '0') { + /* no zero prefixes allowed */ + return (0); + } else { + /* 0-3, no suffixes allowed */ + char *end; + errno = 0; + parity = strtol(p, &end, 10); + if (errno != 0 || *end != '\0' || + parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { + return (0); + } + } + } else if (strncmp(type, VDEV_TYPE_DRAID, + strlen(VDEV_TYPE_DRAID)) == 0) { + p = type + strlen(VDEV_TYPE_DRAID); + + if (*p == '\0' || *p == ':') { + /* when unspecified default to single parity */ + return (1); + } else if (*p == '0') { + /* no zero prefixes allowed */ + return (0); + } else { + /* 0-3, allowed suffixes: '\0' or ':' */ + char *end; + errno = 0; + parity = strtol(p, &end, 10); + if (errno != 0 || + parity < 1 || parity > VDEV_DRAID_MAXPARITY || + (*end != '\0' && *end != ':')) { + return (0); + } + } + } + + return ((int)parity); +} +
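A few worked inputs make the parsing rules in get_parity() above concrete. This is a hypothetical harness, not part of the change, and it assumes get_parity() were made visible to it:

#include <assert.h>
#include <stdio.h>

/* The function defined above, assumed linkable for this sketch. */
int get_parity(const char *type);

int
main(void)
{
	assert(get_parity("raidz") == 1);	/* bare prefix: 1 parity */
	assert(get_parity("draid") == 1);
	assert(get_parity("raidz3") == 3);	/* explicit parity 1-3 */
	assert(get_parity("draid2:8d:51c:2s") == 2); /* ':' suffix is OK */
	assert(get_parity("raidz0") == 0);	/* zero prefix rejected */
	assert(get_parity("draid4") == 0);	/* above VDEV_DRAID_MAXPARITY */
	assert(get_parity("raidz2x") == 0);	/* trailing junk rejected */
	(void) printf("parity parsing examples hold\n");
	return (0);
}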
+/* + * Assign the minimum and maximum number of devices allowed for + * the specified type. On error NULL is returned, otherwise the + * type prefix is returned (raidz, mirror, etc).
+ */ static const char * is_grouping(const char *type, int *mindev, int *maxdev) { - if (strncmp(type, "raidz", 5) == 0) { - const char *p = type + 5; - char *end; - long nparity; - - if (*p == '\0') { - nparity = 1; - } else if (*p == '0') { - return (NULL); /* no zero prefixes allowed */ - } else { - errno = 0; - nparity = strtol(p, &end, 10); - if (errno != 0 || nparity < 1 || nparity >= 255 || - *end != '\0') - return (NULL); - } + int nparity; + if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { + nparity = get_parity(type); + if (nparity == 0) + return (NULL); if (mindev != NULL) *mindev = nparity + 1; if (maxdev != NULL) *maxdev = 255; - return (VDEV_TYPE_RAIDZ); + + if (strncmp(type, VDEV_TYPE_RAIDZ, + strlen(VDEV_TYPE_RAIDZ)) == 0) { + return (VDEV_TYPE_RAIDZ); + } else { + return (VDEV_TYPE_DRAID); + } } if (maxdev != NULL) @@ -1167,6 +1279,163 @@ is_grouping(const char *type, int *mindev, int *maxdev) return (NULL); } +/* + * Extract the configuration parameters encoded in the dRAID type and + * use them to generate a dRAID configuration. The expected format is: + * + * draid[<parity>][:<data>d][:<children>c][:<spares>s] + * + * The intent is to be able to generate a good configuration when no + * additional information is provided. The only mandatory component + * of the 'type' is the 'draid' prefix. If a value is not provided + * then reasonable defaults are used. The optional components may + * appear in any order but the d/s/c suffix is required. + * + * Valid inputs: + * - data: number of data devices per group (1-255) + * - parity: number of parity blocks per group (1-3) + * - spares: number of distributed spares (0-100) + * - children: total number of devices (1-255) + * + * Examples: + * - zpool create tank draid + * - zpool create tank draid2:8d:51c:2s + */ +static int +draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) +{ + uint64_t nparity = 1; + uint64_t nspares = 0; + uint64_t ndata = UINT64_MAX; + uint64_t ngroups = 1; + long value; + + if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) + return (EINVAL); + + nparity = (uint64_t)get_parity(type); + if (nparity == 0) + return (EINVAL); + + char *p = (char *)type; + while ((p = strchr(p, ':')) != NULL) { + char *end; + + p = p + 1; + errno = 0; + + if (!isdigit(p[0])) { + (void) fprintf(stderr, gettext("invalid dRAID " + "syntax; expected [:<number><c|d|s>] not '%s'\n"), + type); + return (EINVAL); + } + + /* Expected non-zero value with c/d/s suffix */ + value = strtol(p, &end, 10); + char suffix = tolower(*end); + if (errno != 0 || + (suffix != 'c' && suffix != 'd' && suffix != 's')) { + (void) fprintf(stderr, gettext("invalid dRAID " + "syntax; expected [:<number><c|d|s>] not '%s'\n"), + type); + return (EINVAL); + } + + if (suffix == 'c') { + if ((uint64_t)value != children) { + fprintf(stderr, + gettext("invalid number of dRAID children; " + "%llu required but %llu provided\n"), + (u_longlong_t)value, + (u_longlong_t)children); + return (EINVAL); + } + } else if (suffix == 'd') { + ndata = (uint64_t)value; + } else if (suffix == 's') { + nspares = (uint64_t)value; + } else { + verify(0); /* Unreachable */ + } + } + + /* + * When a specific number of data disks is not provided limit a + * redundancy group to 8 data disks. This value was selected to + * provide a reasonable tradeoff between capacity and performance.
+ */ + if (ndata == UINT64_MAX) { + if (children > nspares + nparity) { + ndata = MIN(children - nspares - nparity, 8); + } else { + fprintf(stderr, gettext("requested number of " + "distributed spares %llu and parity level %llu\n" + "leaves no disks available for data\n"), + (u_longlong_t)nspares, (u_longlong_t)nparity); + return (EINVAL); + } + } + + /* Verify the maximum allowed group size is never exceeded. */ + if (ndata == 0 || (ndata + nparity > children - nspares)) { + fprintf(stderr, gettext("requested number of dRAID data " + "disks per group %llu is too high,\nat most %llu disks " + "are available for data\n"), (u_longlong_t)ndata, + (u_longlong_t)(children - nspares - nparity)); + return (EINVAL); + } + + if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { + fprintf(stderr, + gettext("invalid dRAID parity level %llu; must be " + "between 1 and %d\n"), (u_longlong_t)nparity, + VDEV_DRAID_MAXPARITY); + return (EINVAL); + } + + /* + * Verify the requested number of spares can be satisfied. + * An arbitrary limit of 100 distributed spares is applied. + */ + if (nspares > 100 || nspares > (children - (ndata + nparity))) { + fprintf(stderr, + gettext("invalid number of dRAID spares %llu; additional " + "disks would be required\n"), (u_longlong_t)nspares); + return (EINVAL); + } + + /* Verify the requested number of children is sufficient. */ + if (children < (ndata + nparity + nspares)) { + fprintf(stderr, gettext("%llu disks were provided, but at " + "least %llu disks are required for this config\n"), + (u_longlong_t)children, + (u_longlong_t)(ndata + nparity + nspares)); + } + + if (children > VDEV_DRAID_MAX_CHILDREN) { + fprintf(stderr, gettext("%llu disks were provided, but " + "dRAID only supports up to %u disks"), + (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); + } + + /* + * Calculate the minimum number of groups required to fill a slice. + * This is the LCM of the stripe width (ndata + nparity) and the + * number of data drives (children - nspares). + */ + while (ngroups * (ndata + nparity) % (children - nspares) != 0) + ngroups++; + + /* Store the basic dRAID configuration. */ + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + + return (0); +} +
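A worked example of the parse and the ngroups loop above, using the draid2:8d:51c:2s spec from the comment (standalone and illustrative only):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* draid2:8d:51c:2s => parity 2, data 8, children 51, spares 2 */
	uint64_t nparity = 2, ndata = 8, children = 51, nspares = 2;
	uint64_t ngroups = 1;

	/*
	 * Smallest ngroups such that ngroups * (ndata + nparity) is a
	 * multiple of (children - nspares): the LCM condition above.
	 */
	while (ngroups * (ndata + nparity) % (children - nspares) != 0)
		ngroups++;

	/* 10 and 49 share no common factor, so ngroups = 49. */
	printf("stripe width=%llu data drives=%llu ngroups=%llu\n",
	    (unsigned long long)(ndata + nparity),
	    (unsigned long long)(children - nspares),
	    (unsigned long long)ngroups);
	return (0);
}

That is, 49 groups of 10 sectors each (490 in total) are laid out before the rotation realigns with the 49 data drives.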
/* * Construct a syntactically valid vdev specification, * and ensure that all devices and files exist and can be opened. */ @@ -1178,8 +1447,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) { nvlist_t *nvroot, *nv, **top, **spares, **l2cache; int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; - const char *type; - uint64_t is_log, is_special, is_dedup; + const char *type, *fulltype; + boolean_t is_log, is_special, is_dedup, is_spare; boolean_t seen_logs; top = NULL; @@ -1189,18 +1458,20 @@ construct_spec(nvlist_t *props, int argc, char **argv) nspares = 0; nlogs = 0; nl2cache = 0; - is_log = is_special = is_dedup = B_FALSE; + is_log = is_special = is_dedup = is_spare = B_FALSE; seen_logs = B_FALSE; nvroot = NULL; while (argc > 0) { + fulltype = argv[0]; nv = NULL; /* - * If it's a mirror or raidz, the subsequent arguments are - * its leaves -- until we encounter the next mirror or raidz. + * If it's a mirror, raidz, or draid, the subsequent arguments + * are its leaves -- until we encounter the next mirror, + * raidz or draid.
*/ - if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { + if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; @@ -1212,6 +1483,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) "specified only once\n")); goto spec_out; } + is_spare = B_TRUE; is_log = is_special = is_dedup = B_FALSE; } @@ -1225,8 +1497,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) } seen_logs = B_TRUE; is_log = B_TRUE; - is_special = B_FALSE; - is_dedup = B_FALSE; + is_special = is_dedup = is_spare = B_FALSE; argc--; argv++; /* @@ -1238,8 +1509,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { is_special = B_TRUE; - is_log = B_FALSE; - is_dedup = B_FALSE; + is_log = is_dedup = is_spare = B_FALSE; argc--; argv++; continue; @@ -1247,8 +1517,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { is_dedup = B_TRUE; - is_log = B_FALSE; - is_special = B_FALSE; + is_log = is_special = is_spare = B_FALSE; argc--; argv++; continue; @@ -1262,7 +1531,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) "specified only once\n")); goto spec_out; } - is_log = is_special = is_dedup = B_FALSE; + is_log = is_special = B_FALSE; + is_dedup = is_spare = B_FALSE; } if (is_log || is_special || is_dedup) { @@ -1280,13 +1550,15 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; + children++; child = realloc(child, children * sizeof (nvlist_t *)); if (child == NULL) zpool_no_memory(); if ((nv = make_leaf_vdev(props, argv[c], - B_FALSE)) == NULL) { + !(is_log || is_special || is_dedup || + is_spare))) == NULL) { for (c = 0; c < children - 1; c++) nvlist_free(child[c]); free(child); @@ -1335,10 +1607,11 @@ construct_spec(nvlist_t *props, int argc, char **argv) type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); - if (is_log) + if (is_log) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_LOG) == 0); + } if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, @@ -1354,6 +1627,15 @@ construct_spec(nvlist_t *props, int argc, char **argv) ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); } + if (strcmp(type, VDEV_TYPE_DRAID) == 0) { + if (draid_config_by_type(nv, + fulltype, children) != 0) { + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + } verify(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, children) == 0); @@ -1367,12 +1649,19 @@ construct_spec(nvlist_t *props, int argc, char **argv) * We have a device. Pass off to make_leaf_vdev() to * construct the appropriate nvlist describing the vdev.
*/ - if ((nv = make_leaf_vdev(props, argv[0], - is_log)) == NULL) + if ((nv = make_leaf_vdev(props, argv[0], !(is_log || + is_special || is_dedup || is_spare))) == NULL) goto spec_out; - if (is_log) + verify(nvlist_add_uint64(nv, + ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (is_log) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_LOG) == 0); nlogs++; + } + if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 31205a5bf..1c4da20e4 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -104,6 +104,7 @@ #include #include #include +#include #include #include #include @@ -167,8 +168,11 @@ typedef struct ztest_shared_opts { size_t zo_vdev_size; int zo_ashift; int zo_mirrors; - int zo_raidz; - int zo_raidz_parity; + int zo_raid_children; + int zo_raid_parity; + char zo_raid_type[8]; + int zo_draid_data; + int zo_draid_spares; int zo_datasets; int zo_threads; uint64_t zo_passtime; @@ -191,9 +195,12 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_vdevs = 5, .zo_ashift = SPA_MINBLOCKSHIFT, .zo_mirrors = 2, - .zo_raidz = 4, - .zo_raidz_parity = 1, + .zo_raid_children = 4, + .zo_raid_parity = 1, + .zo_raid_type = VDEV_TYPE_RAIDZ, .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ + .zo_draid_data = 4, /* data drives */ + .zo_draid_spares = 1, /* distributed spares */ .zo_datasets = 7, .zo_threads = 23, .zo_passtime = 60, /* 60 seconds */ @@ -232,7 +239,7 @@ static ztest_shared_ds_t *ztest_shared_ds; #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ - (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1) + (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, @@ -689,8 +696,11 @@ usage(boolean_t requested) "\t[-s size_of_each_vdev (default: %s)]\n" "\t[-a alignment_shift (default: %d)] use 0 for random\n" "\t[-m mirror_copies (default: %d)]\n" - "\t[-r raidz_disks (default: %d)]\n" - "\t[-R raidz_parity (default: %d)]\n" + "\t[-r raidz_disks / draid_disks (default: %d)]\n" + "\t[-R raid_parity (default: %d)]\n" + "\t[-K raid_kind (default: random)] raidz|draid|random\n" + "\t[-D draid_data (default: %d)] in config\n" + "\t[-S draid_spares (default: %d)]\n" "\t[-d datasets (default: %d)]\n" "\t[-t threads (default: %d)]\n" "\t[-g gang_block_threshold (default: %s)]\n" @@ -716,8 +726,10 @@ usage(boolean_t requested) nice_vdev_size, /* -s */ zo->zo_ashift, /* -a */ zo->zo_mirrors, /* -m */ - zo->zo_raidz, /* -r */ - zo->zo_raidz_parity, /* -R */ + zo->zo_raid_children, /* -r */ + zo->zo_raid_parity, /* -R */ + zo->zo_draid_data, /* -D */ + zo->zo_draid_spares, /* -S */ zo->zo_datasets, /* -d */ zo->zo_threads, /* -t */ nice_force_ganging, /* -g */ @@ -731,6 +743,21 @@ usage(boolean_t requested) exit(requested ? 
0 : 1); } +static uint64_t +ztest_random(uint64_t range) +{ + uint64_t r; + + ASSERT3S(ztest_fd_rand, >=, 0); + + if (range == 0) + return (0); + + if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) + fatal(1, "short read from /dev/urandom"); + + return (r % range); +} static void ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) @@ -780,11 +807,12 @@ process_options(int argc, char **argv) int opt; uint64_t value; char altdir[MAXNAMELEN] = { 0 }; + char raid_kind[8] = { "random" }; bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); while ((opt = getopt(argc, argv, - "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { + "v:s:a:m:r:R:K:D:S:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { value = 0; switch (opt) { case 'v': @@ -793,6 +821,8 @@ process_options(int argc, char **argv) case 'm': case 'r': case 'R': + case 'D': + case 'S': case 'd': case 't': case 'g': @@ -817,10 +847,19 @@ process_options(int argc, char **argv) zo->zo_mirrors = value; break; case 'r': - zo->zo_raidz = MAX(1, value); + zo->zo_raid_children = MAX(1, value); break; case 'R': - zo->zo_raidz_parity = MIN(MAX(value, 1), 3); + zo->zo_raid_parity = MIN(MAX(value, 1), 3); + break; + case 'K': + (void) strlcpy(raid_kind, optarg, sizeof (raid_kind)); + break; + case 'D': + zo->zo_draid_data = MAX(1, value); + break; + case 'S': + zo->zo_draid_spares = MAX(1, value); break; case 'd': zo->zo_datasets = MAX(1, value); @@ -895,7 +934,54 @@ process_options(int argc, char **argv) } } - zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); + /* When raid choice is 'random' add a draid pool 50% of the time */ + if (strcmp(raid_kind, "random") == 0) { + (void) strlcpy(raid_kind, (ztest_random(2) == 0) ? + "draid" : "raidz", sizeof (raid_kind)); + + if (ztest_opts.zo_verbose >= 3) + (void) printf("choosing RAID type '%s'\n", raid_kind); + } + + if (strcmp(raid_kind, "draid") == 0) { + uint64_t min_devsize; + + /* With fewer disks use 256M, otherwise 128M is OK */ + min_devsize = (ztest_opts.zo_raid_children < 16) ? + (256ULL << 20) : (128ULL << 20); + + /* No top-level mirrors with dRAID for now */ + zo->zo_mirrors = 0; + + /* Use more appropriate defaults for dRAID */ + if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) + zo->zo_vdevs = 1; + if (zo->zo_raid_children == + ztest_opts_defaults.zo_raid_children) + zo->zo_raid_children = 16; + if (zo->zo_ashift < 12) + zo->zo_ashift = 12; + if (zo->zo_vdev_size < min_devsize) + zo->zo_vdev_size = min_devsize; + + if (zo->zo_draid_data + zo->zo_raid_parity > + zo->zo_raid_children - zo->zo_draid_spares) { + (void) fprintf(stderr, "error: too few draid " + "children (%d) for stripe width (%d)\n", + zo->zo_raid_children, + zo->zo_draid_data + zo->zo_raid_parity); + usage(B_FALSE); + } + + (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, + sizeof (zo->zo_raid_type)); + + } else /* using raidz */ { + ASSERT0(strcmp(raid_kind, "raidz")); + + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + } zo->zo_vdevtime = (zo->zo_vdevs > 0 ?
zo->zo_time * NANOSEC / zo->zo_vdevs : @@ -966,22 +1052,6 @@ ztest_kill(ztest_shared_t *zs) (void) kill(getpid(), SIGKILL); } -static uint64_t -ztest_random(uint64_t range) -{ - uint64_t r; - - ASSERT3S(ztest_fd_rand, >=, 0); - - if (range == 0) - return (0); - - if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) - fatal(1, "short read from /dev/urandom"); - - return (r % range); -} - /* ARGSUSED */ static void ztest_record_enospc(const char *s) @@ -997,12 +1067,27 @@ ztest_get_ashift(void) return (ztest_opts.zo_ashift); } +static boolean_t +ztest_is_draid_spare(const char *name) +{ + uint64_t spare_id = 0, parity = 0, vdev_id = 0; + + if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu", + (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id, + (u_longlong_t *)&spare_id) == 3) { + return (B_TRUE); + } + + return (B_FALSE); +} + static nvlist_t * make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) { char *pathbuf; uint64_t vdev; nvlist_t *file; + boolean_t draid_spare = B_FALSE; pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); @@ -1024,9 +1109,11 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) ztest_dev_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, vdev); } + } else { + draid_spare = ztest_is_draid_spare(path); } - if (size != 0) { + if (size != 0 && !draid_spare) { int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd == -1) fatal(1, "can't open %s", path); @@ -1035,20 +1122,21 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) (void) close(fd); } - VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); - VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); - VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); + VERIFY0(nvlist_alloc(&file, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, + draid_spare ? 
VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE)); + VERIFY0(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path)); + VERIFY0(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift)); umem_free(pathbuf, MAXPATHLEN); return (file); } static nvlist_t * -make_vdev_raidz(char *path, char *aux, char *pool, size_t size, +make_vdev_raid(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int r) { - nvlist_t *raidz, **child; + nvlist_t *raid, **child; int c; if (r < 2) @@ -1058,20 +1146,41 @@ make_vdev_raidz(char *path, char *aux, char *pool, size_t size, for (c = 0; c < r; c++) child[c] = make_vdev_file(path, aux, pool, size, ashift); - VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_RAIDZ) == 0); - VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, - ztest_opts.zo_raidz_parity) == 0); - VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, - child, r) == 0); + VERIFY0(nvlist_alloc(&raid, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_string(raid, ZPOOL_CONFIG_TYPE, + ztest_opts.zo_raid_type)); + VERIFY0(nvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, + ztest_opts.zo_raid_parity)); + VERIFY0(nvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, + child, r)); + + if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { + uint64_t ndata = ztest_opts.zo_draid_data; + uint64_t nparity = ztest_opts.zo_raid_parity; + uint64_t nspares = ztest_opts.zo_draid_spares; + uint64_t children = ztest_opts.zo_raid_children; + uint64_t ngroups = 1; + + /* + * Calculate the minimum number of groups required to fill a + * slice. This is the LCM of the stripe width (data + parity) + * and the number of data drives (children - spares). + */ + while (ngroups * (ndata + nparity) % (children - nspares) != 0) + ngroups++; + + /* Store the basic dRAID configuration. */ + fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); + fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); + fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + } for (c = 0; c < r; c++) nvlist_free(child[c]); umem_free(child, r * sizeof (nvlist_t *)); - return (raidz); + return (raid); } static nvlist_t * @@ -1082,12 +1191,12 @@ make_vdev_mirror(char *path, char *aux, char *pool, size_t size, int c; if (m < 1) - return (make_vdev_raidz(path, aux, pool, size, ashift, r)); + return (make_vdev_raid(path, aux, pool, size, ashift, r)); child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < m; c++) - child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); + child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, @@ -2809,6 +2918,10 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) if (ztest_opts.zo_mmp_test) return; + /* dRAID added after feature flags, skip upgrade test. */ + if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) + return; + mutex_enter(&ztest_vdev_lock); name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); @@ -2818,13 +2931,13 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) (void) spa_destroy(name); nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); + NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the * initial version is capable of supporting that feature. 
*/ - switch (ztest_opts.zo_raidz_parity) { + switch (ztest_opts.zo_raid_parity) { case 0: case 1: initial_version = SPA_VERSION_INITIAL; break; @@ -2970,7 +3083,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) return; mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * + ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -3024,7 +3138,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? - "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, + 1); error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); @@ -3078,14 +3193,15 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) return; } - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * + ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - class, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); @@ -3134,7 +3250,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) char *aux; char *path; uint64_t guid = 0; - int error; + int error, ignore_err = 0; if (ztest_opts.zo_mmp_test) return; @@ -3157,7 +3273,13 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) /* * Pick a random device to remove. */ - guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; + vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; + + /* dRAID spares cannot be removed; try anyway to see ENOTSUP */ + if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) + ignore_err = ENOTSUP; + + guid = svd->vdev_guid; } else { /* * Find an unused device we can add.
@@ -3214,7 +3336,9 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: - fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); + if (error != ignore_err) + fatal(0, "spa_vdev_remove(%llu) = %d", guid, + error); } } @@ -3243,7 +3367,7 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id) mutex_enter(&ztest_vdev_lock); /* ensure we have a usable config; mirrors of raidz aren't supported */ - if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { + if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { mutex_exit(&ztest_vdev_lock); return; } @@ -3343,6 +3467,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) int replacing; int oldvd_has_siblings = B_FALSE; int newvd_is_spare = B_FALSE; + int newvd_is_dspare = B_FALSE; int oldvd_is_log; int error, expected_error; @@ -3353,7 +3478,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -3393,14 +3518,17 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (zs->zs_mirrors >= 1) { ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); ASSERT(oldvd->vdev_children >= zs->zs_mirrors); - oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; + oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; } /* pick a child out of the raidz group */ - if (ztest_opts.zo_raidz > 1) { - ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); - ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); - oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; + if (ztest_opts.zo_raid_children > 1) { + if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) + ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); + else + ASSERT(oldvd->vdev_ops == &vdev_draid_ops); + ASSERT(oldvd->vdev_children == ztest_opts.zo_raid_children); + oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; } /* @@ -3447,6 +3575,10 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (sav->sav_count != 0 && ztest_random(3) == 0) { newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; newvd_is_spare = B_TRUE; + + if (newvd->vdev_ops == &vdev_draid_spare_ops) + newvd_is_dspare = B_TRUE; + (void) strcpy(newpath, newvd->vdev_path); } else { (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, @@ -3480,6 +3612,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * If newvd is already part of the pool, it should fail with EBUSY. * * If newvd is too small, it should fail with EOVERFLOW. + * + * If newvd is a distributed spare and it's being attached to a + * dRAID which is not its parent it should fail with EINVAL. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && (!replacing || @@ -3492,10 +3627,12 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) expected_error = replacing ? 
0 : EBUSY; else if (vdev_lookup_by_path(rvd, newpath) != NULL) expected_error = EBUSY; - else if (newsize < oldsize) + else if (!newvd_is_dspare && newsize < oldsize) expected_error = EOVERFLOW; else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; + else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) + expected_error = ENOTSUP; else expected_error = 0; @@ -4880,13 +5017,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); - VERIFY(0 == dmu_read(os, packobj, packoff, + VERIFY0(dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); - VERIFY(0 == dmu_read(os, bigobj, bigoff, + VERIFY0(dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); - ASSERT(bcmp(packbuf, packcheck, packsize) == 0); - ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); + ASSERT0(bcmp(packbuf, packcheck, packsize)); + ASSERT0(bcmp(bigbuf, bigcheck, bigsize)); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); @@ -5761,7 +5898,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) } maxfaults = MAXFAULTS(zs); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; mirror_save = zs->zs_mirrors; mutex_exit(&ztest_vdev_lock); @@ -6011,7 +6148,7 @@ out: /* * By design ztest will never inject uncorrectable damage into the pool. * Issue a scrub, wait for it to complete, and verify there is never any - * any persistent damage. + * persistent damage. * * Only after a full scrub has been completed is it safe to start injecting * data corruption. See the comment in zfs_fault_inject(). @@ -7347,7 +7484,7 @@ ztest_init(ztest_shared_t *zs) zs->zs_splits = 0; zs->zs_mirrors = ztest_opts.zo_mirrors; nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); props = make_random_props(); /* @@ -7683,10 +7820,12 @@ main(int argc, char **argv) if (ztest_opts.zo_verbose >= 1) { (void) printf("%llu vdevs, %d datasets, %d threads," - " %llu seconds...\n", + " %d %s disks, %llu seconds...\n\n", (u_longlong_t)ztest_opts.zo_vdevs, ztest_opts.zo_datasets, ztest_opts.zo_threads, + ztest_opts.zo_raid_children, + ztest_opts.zo_raid_type, (u_longlong_t)ztest_opts.zo_time); } diff --git a/configure.ac b/configure.ac index 47f0f2f50..9ba122e58 100644 --- a/configure.ac +++ b/configure.ac @@ -209,6 +209,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/cmd/btree_test/Makefile tests/zfs-tests/cmd/chg_usr_exec/Makefile tests/zfs-tests/cmd/devname2devid/Makefile + tests/zfs-tests/cmd/draid/Makefile tests/zfs-tests/cmd/dir_rd_update/Makefile tests/zfs-tests/cmd/file_check/Makefile tests/zfs-tests/cmd/file_trunc/Makefile diff --git a/include/libzfs.h b/include/libzfs.h index 337e4934a..cb232c291 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -455,6 +455,7 @@ extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, nvlist_t *); extern int zpool_checkpoint(zpool_handle_t *); extern int zpool_discard_checkpoint(zpool_handle_t *); +extern boolean_t zpool_is_draid_spare(const char *); /* * Basic handle manipulations. These functions do not create or destroy the
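The zpool_is_draid_spare() prototype added to libzfs.h above is implemented later in this patch in libzfs_pool.c, as an sscanf() against the draid<parity>-<vdev>-<spare> spare name format. A hedged usage sketch, to be linked against libzfs; the device names are examples only:

/* Sketch: classifying device names with the new libzfs helper. */
#include <stdio.h>
#include <libzfs.h>

int
main(void)
{
	/* "draid1-2-3" names spare 3 of top-level vdev 2, single parity. */
	printf("%d\n", zpool_is_draid_spare("draid1-2-3"));	/* 1 */
	printf("%d\n", zpool_is_draid_spare("sda"));		/* 0 */
	printf("%d\n", zpool_is_draid_spare("mirror-0"));	/* 0 */
	return (0);
}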
These functions do not create or destroy the diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index cfc3d1018..c3ebf17b5 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -82,6 +82,7 @@ COMMON_H = \ vdev_disk.h \ vdev_file.h \ vdev.h \ + vdev_draid.h \ vdev_impl.h \ vdev_indirect_births.h \ vdev_indirect_mapping.h \ diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 8f929207d..19c3dd599 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -163,6 +163,7 @@ typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; void scan_init(void); void scan_fini(void); int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); +void dsl_scan_setup_sync(void *, dmu_tx_t *); void dsl_scan_fini(struct dsl_pool *dp); void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); int dsl_scan_cancel(struct dsl_pool *); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 211dd6d50..5bb7971d4 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -617,6 +617,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" +#define ZPOOL_CONFIG_SPARE_ID "spareid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" @@ -757,10 +758,17 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" +/* dRAID configuration */ +#define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata" +#define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares" +#define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups" + #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DRAID "draid" +#define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" @@ -770,6 +778,12 @@ typedef struct zpool_load_policy { #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" +#define VDEV_RAIDZ_MAXPARITY 3 + +#define VDEV_DRAID_MAXPARITY 3 +#define VDEV_DRAID_MIN_CHILDREN 2 +#define VDEV_DRAID_MAX_CHILDREN UINT8_MAX + /* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. 
*/ #define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ "com.delphix:indirect_obsolete_sm" diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 69de75fb6..93f49a311 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -240,8 +240,9 @@ struct spa { kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ - int spa_min_ashift; /* of vdevs in normal class */ - int spa_max_ashift; /* of vdevs in normal class */ + uint64_t spa_min_ashift; /* of vdevs in normal class */ + uint64_t spa_max_ashift; /* of vdevs in normal class */ + uint64_t spa_min_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ diff --git a/include/sys/txg.h b/include/sys/txg.h index 260a3b43c..22158bd1a 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -41,6 +41,7 @@ extern "C" { #define TXG_MASK (TXG_SIZE - 1) /* mask for size */ #define TXG_INITIAL TXG_SIZE /* initial txg */ #define TXG_IDX (txg & TXG_MASK) +#define TXG_UNKNOWN 0 /* Number of txgs worth of frees we defer adding to in-core spacemaps */ #define TXG_DEFER_SIZE 2 diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 309ce33be..7bc72a03d 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -49,10 +49,13 @@ typedef enum vdev_dtl_type { extern int zfs_nocacheflush; +typedef boolean_t vdev_open_children_func_t(vdev_t *vd); + extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...); extern void vdev_dbgmsg_print_tree(vdev_t *, int); extern int vdev_open(vdev_t *); extern void vdev_open_children(vdev_t *); +extern void vdev_open_children_subset(vdev_t *, vdev_open_children_func_t *); extern int vdev_validate(vdev_t *); extern int vdev_copy_path_strict(vdev_t *, vdev_t *); extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *); @@ -71,7 +74,10 @@ extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, uint64_t txg, uint64_t size); extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); -extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); +extern boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, + size_t psize, uint64_t phys_birth); +extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, + size_t psize, uint64_t phys_birth); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done); extern boolean_t vdev_dtl_required(vdev_t *vd); @@ -97,8 +103,14 @@ extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd, char *tag); + +typedef void vdev_xlate_func_t(void *arg, range_seg64_t *physical_rs); + +extern boolean_t vdev_xlate_is_empty(range_seg64_t *rs); extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs); + range_seg64_t *physical_rs, range_seg64_t *remain_rs); +extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, + vdev_xlate_func_t *func, void *arg); extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); diff --git a/include/sys/vdev_draid.h b/include/sys/vdev_draid.h new file mode 100644 index 000000000..65417a93c 
--- /dev/null +++ b/include/sys/vdev_draid.h @@ -0,0 +1,110 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#ifndef _SYS_VDEV_DRAID_H +#define _SYS_VDEV_DRAID_H + +#include <sys/types.h> +#include <sys/abd.h> +#include <sys/nvpair.h> +#include <sys/zio.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_raidz_impl.h> +#include <sys/vdev_rebuild.h> + +#ifdef __cplusplus extern "C" { +#endif + +/* + * Constants required to generate and use dRAID permutations. + */ +#define VDEV_DRAID_SEED 0xd7a1d5eed +#define VDEV_DRAID_MAX_MAPS 254 +#define VDEV_DRAID_ROWSHIFT SPA_MAXBLOCKSHIFT +#define VDEV_DRAID_ROWHEIGHT (1ULL << VDEV_DRAID_ROWSHIFT) +#define VDEV_DRAID_REFLOW_RESERVE (2 * VDEV_DRAID_ROWHEIGHT) + +/* + * dRAID permutation map. + */ +typedef struct draid_map { + uint64_t dm_children; /* # of permutation columns */ + uint64_t dm_nperms; /* # of permutation rows */ + uint64_t dm_seed; /* dRAID map seed */ + uint64_t dm_checksum; /* Checksum of generated map */ + uint8_t *dm_perms; /* base permutation array */ +} draid_map_t; + +/* + * dRAID configuration. + */ +typedef struct vdev_draid_config { + /* + * Values read from the dRAID nvlist configuration. + */ + uint64_t vdc_ndata; /* # of data devices in group */ + uint64_t vdc_nparity; /* # of parity devices in group */ + uint64_t vdc_nspares; /* # of distributed spares */ + uint64_t vdc_children; /* # of children */ + uint64_t vdc_ngroups; /* # groups per slice */ + + /* + * Immutable derived constants. + */ + uint8_t *vdc_perms; /* permutation array */ + uint64_t vdc_nperms; /* # of permutations */ + uint64_t vdc_groupwidth; /* = data + parity */ + uint64_t vdc_ndisks; /* = children - spares */ + uint64_t vdc_groupsz; /* = groupwidth * DRAID_ROWSIZE */ + uint64_t vdc_devslicesz; /* = (groupsz * groups) / ndisks */ +} vdev_draid_config_t; + +/* + * Functions for handling dRAID permutation maps. + */ +extern uint64_t vdev_draid_rand(uint64_t *); +extern int vdev_draid_lookup_map(uint64_t, const draid_map_t **); +extern int vdev_draid_generate_perms(const draid_map_t *, uint8_t **); + +/* + * General dRAID support functions. + */ +extern boolean_t vdev_draid_readable(vdev_t *, uint64_t); +extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t); +extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t); +extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *); +extern nvlist_t *vdev_draid_read_config_spare(vdev_t *); + +/* Functions for dRAID distributed spares.
*/ +extern vdev_t *vdev_draid_spare_get_child(vdev_t *, uint64_t); +extern vdev_t *vdev_draid_spare_get_parent(vdev_t *); +extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_DRAID_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 3c4c3fb5a..7d2b2743c 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -68,14 +68,19 @@ extern uint32_t zfs_vdev_async_write_max_active; /* * Virtual device operations */ +typedef int vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd); +typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); +typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); -typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); +typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, const dva_t *dva, + size_t psize, uint64_t phys_birth); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); @@ -87,13 +92,24 @@ typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, * Given a target vdev, translates the logical range "in" to the physical * range "res" */ -typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *in, - range_seg64_t *res); +typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *logical, + range_seg64_t *physical, range_seg64_t *remain); +typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start, + uint64_t size, uint64_t max_segment); +typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp, + uint64_t *sizep); +typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv); +typedef uint64_t vdev_nparity_func_t(vdev_t *vd); +typedef uint64_t vdev_ndisks_func_t(vdev_t *vd); typedef const struct vdev_ops { + vdev_init_func_t *vdev_op_init; + vdev_fini_func_t *vdev_op_fini; vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; vdev_asize_func_t *vdev_op_asize; + vdev_min_asize_func_t *vdev_op_min_asize; + vdev_min_alloc_func_t *vdev_op_min_alloc; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; @@ -101,11 +117,12 @@ typedef const struct vdev_ops { vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; - /* - * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves. - * Used when initializing vdevs. Isn't used by leaf ops. 
- */ vdev_xlation_func_t *vdev_op_xlate; + vdev_rebuild_asize_func_t *vdev_op_rebuild_asize; + vdev_metaslab_init_func_t *vdev_op_metaslab_init; + vdev_config_generate_func_t *vdev_op_config_generate; + vdev_nparity_func_t *vdev_op_nparity; + vdev_ndisks_func_t *vdev_op_ndisks; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -325,16 +342,13 @@ struct vdev { kthread_t *vdev_rebuild_thread; vdev_rebuild_t vdev_rebuild_config; - /* For limiting outstanding I/Os (initialize, TRIM, rebuild) */ + /* For limiting outstanding I/Os (initialize, TRIM) */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; kmutex_t vdev_trim_io_lock; kcondvar_t vdev_trim_io_cv; uint64_t vdev_trim_inflight[3]; - kmutex_t vdev_rebuild_io_lock; - kcondvar_t vdev_rebuild_io_cv; - uint64_t vdev_rebuild_inflight; /* * Values stored in the config for an indirect or removing vdev. @@ -392,7 +406,6 @@ struct vdev { uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ uint64_t vdev_rebuild_txg; /* persistent rebuilding state */ - uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ @@ -445,8 +458,6 @@ struct vdev { zfs_ratelimit_t vdev_checksum_rl; }; -#define VDEV_RAIDZ_MAXPARITY 3 - #define VDEV_PAD_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_be) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 @@ -532,6 +543,9 @@ typedef struct vdev_label { #define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) #define VDEV_LABELS 4 #define VDEV_BEST_LABEL VDEV_LABELS +#define VDEV_OFFSET_IS_LABEL(vd, off) \ + (((off) < VDEV_LABEL_START_SIZE) || \ + ((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE))) #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 @@ -577,6 +591,8 @@ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; +extern vdev_ops_t vdev_draid_ops; +extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; @@ -587,11 +603,15 @@ extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ -extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, - range_seg64_t *out); +extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); +extern uint64_t vdev_get_min_alloc(vdev_t *vd); +extern uint64_t vdev_get_nparity(vdev_t *vd); +extern uint64_t vdev_get_ndisks(vdev_t *vd); /* * Global variables diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 0ce2b5ea1..029fdef5f 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -32,6 +32,7 @@ extern "C" { #endif struct zio; +struct raidz_row; struct raidz_map; #if !defined(_KERNEL) struct kernel_param {}; @@ -43,8 +44,11 @@ struct kernel_param {}; struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, uint64_t); void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_generate_parity_row(struct raidz_map *, struct raidz_row *); void vdev_raidz_generate_parity(struct raidz_map *); -int 
vdev_raidz_reconstruct(struct raidz_map *, const int *, int); +void vdev_raidz_reconstruct(struct raidz_map *, const int *, int); +void vdev_raidz_child_done(zio_t *); +void vdev_raidz_io_done(zio_t *); /* * vdev_raidz_math interface @@ -52,11 +56,16 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); void vdev_raidz_math_init(void); void vdev_raidz_math_fini(void); const struct raidz_impl_ops *vdev_raidz_math_get_ops(void); -int vdev_raidz_math_generate(struct raidz_map *); -int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *, - const int); +int vdev_raidz_math_generate(struct raidz_map *, struct raidz_row *); +int vdev_raidz_math_reconstruct(struct raidz_map *, struct raidz_row *, + const int *, const int *, const int); int vdev_raidz_impl_set(const char *); +typedef struct vdev_raidz { + int vd_logical_width; + int vd_nparity; +} vdev_raidz_t; + #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 8492daedb..38d4f9e0b 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -29,6 +29,7 @@ #include <sys/debug.h> #include <sys/kstat.h> #include <sys/abd.h> +#include <sys/vdev_impl.h> #ifdef __cplusplus extern "C" { @@ -106,30 +107,45 @@ typedef struct raidz_col { uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ abd_t *rc_abd; /* I/O data */ - void *rc_gdata; /* used to store the "good" version */ + void *rc_orig_data; /* pre-reconstruction */ + abd_t *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? */ + uint8_t rc_need_orig_restore; /* need to restore from orig_data? */ + uint8_t rc_repair; /* Write good data to this column */ } raidz_col_t; +typedef struct raidz_row { + uint64_t rr_cols; /* Regular column count */ + uint64_t rr_scols; /* Count including skipped columns */ + uint64_t rr_bigcols; /* Remainder data column count */ + uint64_t rr_missingdata; /* Count of missing data devices */ + uint64_t rr_missingparity; /* Count of missing parity devices */ + uint64_t rr_firstdatacol; /* First data column/parity count */ + abd_t *rr_abd_copy; /* rm_asize-buffer of copied data */ + abd_t *rr_abd_empty; /* dRAID empty sector buffer */ + int rr_nempty; /* empty sectors included in parity */ + int rr_code; /* reconstruction code (unused) */ +#ifdef ZFS_DEBUG + uint64_t rr_offset; /* Logical offset for *_io_verify() */ + uint64_t rr_size; /* Physical size for *_io_verify() */ +#endif + raidz_col_t rr_col[0]; /* Flexible array of I/O columns */ +} raidz_row_t; + typedef struct raidz_map { - uint64_t rm_cols; /* Regular column count */ - uint64_t rm_scols; /* Count including skipped columns */ - uint64_t rm_bigcols; /* Number of oversized columns */ - uint64_t rm_asize; /* Actual total I/O size */ - uint64_t rm_missingdata; /* Count of missing data devices */ - uint64_t rm_missingparity; /* Count of missing parity devices */ - uint64_t rm_firstdatacol; /* First data column/parity count */ - uint64_t rm_nskip; /* Skipped sectors for padding */ - uint64_t rm_skipstart; /* Column index of padding start */ - abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ - uint8_t rm_freed; /* map no longer has referencing ZIO */ - uint8_t rm_ecksuminjected; /* checksum error was injected */ + boolean_t rm_freed; /* map no longer has referencing ZIO */ + boolean_t rm_ecksuminjected; /* checksum error was
injected */ + int rm_nrows; /* Regular row count */ + int rm_nskip; /* RAIDZ sectors skipped for padding */ + int rm_skipstart; /* Column index of padding start */ const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ - raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ + raidz_row_t *rm_row[0]; /* flexible array of rows */ } raidz_map_t; + #define RAIDZ_ORIGINAL_IMPL (INT_MAX) extern const raidz_impl_ops_t vdev_raidz_scalar_impl; @@ -163,14 +179,15 @@ extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl; * * raidz_parity Returns parity of the RAIDZ block * raidz_ncols Returns number of columns the block spans + * Note, all rows have the same number of columns. * raidz_nbigcols Returns number of big columns * raidz_col_p Returns pointer to a column * raidz_col_size Returns size of a column * raidz_big_size Returns size of big columns * raidz_short_size Returns size of short columns */ -#define raidz_parity(rm) ((rm)->rm_firstdatacol) -#define raidz_ncols(rm) ((rm)->rm_cols) +#define raidz_parity(rm) ((rm)->rm_row[0]->rr_firstdatacol) +#define raidz_ncols(rm) ((rm)->rm_row[0]->rr_cols) #define raidz_nbigcols(rm) ((rm)->rm_bigcols) #define raidz_col_p(rm, c) ((rm)->rm_col + (c)) #define raidz_col_size(rm, c) ((rm)->rm_col[c].rc_size) @@ -185,10 +202,10 @@ extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl; */ #define _RAIDZ_GEN_WRAP(code, impl) \ static void \ -impl ## _gen_ ## code(void *rmp) \ +impl ## _gen_ ## code(void *rrp) \ { \ - raidz_map_t *rm = (raidz_map_t *)rmp; \ - raidz_generate_## code ## _impl(rm); \ + raidz_row_t *rr = (raidz_row_t *)rrp; \ + raidz_generate_## code ## _impl(rr); \ } /* @@ -199,10 +216,10 @@ impl ## _gen_ ## code(void *rmp) \ */ #define _RAIDZ_REC_WRAP(code, impl) \ static int \ -impl ## _rec_ ## code(void *rmp, const int *tgtidx) \ +impl ## _rec_ ## code(void *rrp, const int *tgtidx) \ { \ - raidz_map_t *rm = (raidz_map_t *)rmp; \ - return (raidz_reconstruct_## code ## _impl(rm, tgtidx)); \ + raidz_row_t *rr = (raidz_row_t *)rrp; \ + return (raidz_reconstruct_## code ## _impl(rr, tgtidx)); \ } /* diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h index 3d4b8cc46..61ae15c5d 100644 --- a/include/sys/vdev_rebuild.h +++ b/include/sys/vdev_rebuild.h @@ -66,10 +66,14 @@ typedef struct vdev_rebuild { vdev_t *vr_top_vdev; /* top-level vdev to rebuild */ metaslab_t *vr_scan_msp; /* scanning disabled metaslab */ range_tree_t *vr_scan_tree; /* scan ranges (in metaslab) */ + kmutex_t vr_io_lock; /* inflight IO lock */ + kcondvar_t vr_io_cv; /* inflight IO cv */ /* In-core state and progress */ uint64_t vr_scan_offset[TXG_SIZE]; uint64_t vr_prev_scan_time_ms; /* any previous scan time */ + uint64_t vr_bytes_inflight_max; /* maximum bytes inflight */ + uint64_t vr_bytes_inflight; /* current bytes inflight */ /* Per-rebuild pass statistics for calculating bandwidth */ uint64_t vr_pass_start_time; diff --git a/include/sys/zio.h b/include/sys/zio.h index 495983171..334ca064b 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -372,6 +372,7 @@ struct zio_cksum_report { nvlist_t *zcr_detector; void *zcr_cbdata; size_t zcr_cbinfo; /* passed to zcr_free() */ + uint64_t zcr_sector; uint64_t zcr_align; uint64_t zcr_length; zio_cksum_finish_f *zcr_finish; diff --git a/include/zfeature_common.h b/include/zfeature_common.h index db0138ae8..cf05bad76 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -76,6 +76,7 @@ typedef enum spa_feature { SPA_FEATURE_LIVELIST, SPA_FEATURE_DEVICE_REBUILD, 
SPA_FEATURE_ZSTD_COMPRESS, + SPA_FEATURE_DRAID, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 1eaed435c..47418b323 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -5336,6 +5336,16 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in * the 128k block example above. * + * The situation is slightly different for dRAID since the minimum allocation + * size is the full group width. The same 8K block above would be written as + * follows in a dRAID group: + * + * +-------+-------+-------+-------+-------+ + * | disk1 | disk2 | disk3 | disk4 | disk5 | + * +-------+-------+-------+-------+-------+ + * | P0 | D0 | D1 | S0 | S1 | + * +-------+-------+-------+-------+-------+ + * * Compression may lead to a variety of block sizes being written for the same * volume or file. There is no clear way to reserve just the amount of space * that will be required, so the worst case (no compression) is assumed. @@ -5365,6 +5375,23 @@ vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, return (asize); } +/* + * Derived from function of same name in module/zfs/vdev_draid.c. Returns the + * amount of space (in bytes) that will be allocated for the specified block + * size. + */ +static uint64_t +vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, + uint64_t blksize) +{ + ASSERT3U(ndisks, >, nparity); + uint64_t ndata = ndisks - nparity; + uint64_t rows = ((blksize - 1) / (ndata << ashift)) + 1; + uint64_t asize = (rows * ndisks) << ashift; + + return (asize); +} + /* * Determine how much space will be allocated if it lands on the most space- * inefficient top-level vdev. Returns the size in bytes required to store one @@ -5374,7 +5401,7 @@ static uint64_t volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) { nvlist_t *config, *tree, **vdevs; - uint_t nvdevs, v; + uint_t nvdevs; uint64_t ret = 0; config = zpool_get_config(zhp, NULL); @@ -5384,33 +5411,61 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) return (nblocks * blksize); } - for (v = 0; v < nvdevs; v++) { + for (int v = 0; v < nvdevs; v++) { char *type; uint64_t nparity, ashift, asize, tsize; - nvlist_t **disks; - uint_t ndisks; uint64_t volsize; if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE, - &type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 || - nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY, - &nparity) != 0 || - nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT, - &ashift) != 0 || - nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN, - &disks, &ndisks) != 0) { + &type) != 0) continue; + + if (strcmp(type, VDEV_TYPE_RAIDZ) != 0 && + strcmp(type, VDEV_TYPE_DRAID) != 0) + continue; + + if (nvlist_lookup_uint64(vdevs[v], + ZPOOL_CONFIG_NPARITY, &nparity) != 0) + continue; + + if (nvlist_lookup_uint64(vdevs[v], + ZPOOL_CONFIG_ASHIFT, &ashift) != 0) + continue; + + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + nvlist_t **disks; + uint_t ndisks; + + if (nvlist_lookup_nvlist_array(vdevs[v], + ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0) + continue; + + /* allocation size for the "typical" 128k block */ + tsize = vdev_raidz_asize(ndisks, nparity, ashift, + SPA_OLD_MAXBLOCKSIZE); + + /* allocation size for the blksize block */ + asize = vdev_raidz_asize(ndisks, nparity, ashift, + blksize); + } else { + uint64_t ndata; + + if (nvlist_lookup_uint64(vdevs[v], + ZPOOL_CONFIG_DRAID_NDATA,
&ndata) != 0) + continue; + + /* allocation size for the "typical" 128k block */ + tsize = vdev_draid_asize(ndata + nparity, nparity, + ashift, SPA_OLD_MAXBLOCKSIZE); + + /* allocation size for the blksize block */ + asize = vdev_draid_asize(ndata + nparity, nparity, + ashift, blksize); } - /* allocation size for the "typical" 128k block */ - tsize = vdev_raidz_asize(ndisks, nparity, ashift, - SPA_OLD_MAXBLOCKSIZE); - /* allocation size for the blksize block */ - asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize); - /* - * Scale this size down as a ratio of 128k / tsize. See theory - * statement above. + * Scale this size down as a ratio of 128k / tsize. + * See theory statement above. */ volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize; if (volsize > ret) { diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 6c5f61836..44d3ade49 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -112,7 +112,6 @@ refresh_config_libzfs(void *handle, nvlist_t *tryconfig) return (refresh_config((libzfs_handle_t *)handle, tryconfig)); } - static int pool_active_libzfs(void *handle, const char *name, uint64_t guid, boolean_t *isactive) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 00b0b6faf..16f8e3e7f 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -42,10 +42,10 @@ #include #include #include +#include #include #include #include - #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" @@ -481,7 +481,8 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, if (err != 0) { ASSERT3U(err, ==, ENOENT); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid feature '%s'"), fname); + "feature '%s' unsupported by kernel"), + fname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } @@ -960,6 +961,7 @@ zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) if (ret == 0 && !isopen && (strncmp(pool, "mirror", 6) == 0 || strncmp(pool, "raidz", 5) == 0 || + strncmp(pool, "draid", 5) == 0 || strncmp(pool, "spare", 5) == 0 || strcmp(pool, "log") == 0)) { if (hdl != NULL) @@ -1186,6 +1188,37 @@ zpool_has_special_vdev(nvlist_t *nvroot) return (B_FALSE); } +/* + * Output a dRAID top-level vdev name into the provided buffer. + */ +static char * +zpool_draid_name(char *name, int len, uint64_t data, uint64_t parity, + uint64_t spares, uint64_t children) +{ + snprintf(name, len, "%s%llu:%llud:%lluc:%llus", + VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data, + (u_longlong_t)children, (u_longlong_t)spares); + + return (name); +} + +/* + * Return B_TRUE if the provided name is a dRAID spare name. + */ +boolean_t +zpool_is_draid_spare(const char *name) +{ + uint64_t spare_id, parity, vdev_id; + + if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu", + (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id, + (u_longlong_t *)&spare_id) == 3) { + return (B_TRUE); + } + + return (B_FALSE); +} + /* * Create the named pool, using the provided vdev list.
It is assumed * that the consumer has already validated the contents of the nvlist, so we @@ -2668,6 +2701,11 @@ zpool_vdev_is_interior(const char *name) VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); + + if (strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 && + !zpool_is_draid_spare(name)) + return (B_TRUE); + return (B_FALSE); } @@ -3101,7 +3139,8 @@ is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_SPARE) == 0 && + if ((strcmp(type, VDEV_TYPE_SPARE) == 0 || + strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) && children == 2 && child[which] == tgt) return (B_TRUE); @@ -3216,8 +3255,12 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, "cannot replace a log with a spare")); } else if (rebuild) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "only mirror vdevs support sequential " - "reconstruction")); + "only mirror and dRAID vdevs support " + "sequential reconstruction")); + } else if (zpool_is_draid_spare(new_disk)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dRAID spares can only replace child " + "devices in their parent's dRAID vdev")); } else if (version >= SPA_VERSION_MULTI_REPLACE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " @@ -3618,6 +3661,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); + if (zpool_is_draid_spare(path)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dRAID spares cannot be removed")); + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + } + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) @@ -3955,9 +4004,10 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, } /* - * Remove the partition from the path it this is a whole disk. + * Remove the partition from the path if this is a whole disk. */ - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) + if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 && + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value && !(name_flags & VDEV_NAME_PATH)) { return (zfs_strip_partition(path)); } @@ -3975,6 +4025,27 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, path = buf; } + /* + * If it's a dRAID device, we add parity, groups, and spares. + */ + if (strcmp(path, VDEV_TYPE_DRAID) == 0) { + uint64_t ndata, nparity, nspares; + nvlist_t **child; + uint_t children; + + verify(nvlist_lookup_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_NPARITY, &nparity) == 0); + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_DRAID_NDATA, &ndata) == 0); + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_DRAID_NSPARES, &nspares) == 0); + + path = zpool_draid_name(buf, sizeof (buf), ndata, + nparity, nspares, children); + } + /* * We identify each top-level vdev by using a * naming convention. 
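The zpool_draid_name() helper shown earlier renders a top-level dRAID vdev as draid<parity>:<data>d:<children>c:<spares>s, and the zpool_vdev_name() hunk above feeds it the values pulled from the vdev nvlist. A small standalone sketch of the same format string, with illustrative values:

/* Sketch: the dRAID display name produced by zpool_draid_name(). */
#include <stdio.h>

#define	VDEV_TYPE_DRAID	"draid"

int
main(void)
{
	unsigned long long parity = 2, data = 8, children = 12, spares = 1;
	char name[64];

	/* draid<parity>:<data>d:<children>c:<spares>s */
	(void) snprintf(name, sizeof (name), "%s%llu:%llud:%lluc:%llus",
	    VDEV_TYPE_DRAID, parity, data, children, spares);
	(void) printf("%s\n", name);	/* prints: draid2:8d:12c:1s */
	return (0);
}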
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index d427bda36..5b938bd4a 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -124,6 +124,8 @@ KERNEL_C = \ unique.c \ vdev.c \ vdev_cache.c \ + vdev_draid.c \ + vdev_draid_rand.c \ vdev_file.c \ vdev_indirect_births.c \ vdev_indirect.c \ @@ -216,7 +218,7 @@ libzpool_la_LIBADD = \ $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ $(abs_top_builddir)/lib/libzstd/libzstd.la -libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl +libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl -lm libzpool_la_LDFLAGS = -pthread diff --git a/man/man1/raidz_test.1 b/man/man1/raidz_test.1 index 94e48bf49..26e6b24ad 100644 --- a/man/man1/raidz_test.1 +++ b/man/man1/raidz_test.1 @@ -61,6 +61,11 @@ during testing. .IP Size of data for raidz block. Size is 1 << (zio_size_shift). .HP +.BI "\-r" " reflow_offset" " (default: uint max)" +.IP +Set raidz expansion offset. The expanded raidz map allocation function will +produce different map configurations depending on this value. +.HP .BI "\-S(weep)" .IP Sweep parameter space while verifying the raidz implementations. This option @@ -77,6 +82,10 @@ This option starts the benchmark mode. All implementations are benchmarked using increasing per disk data size. Results are given as throughput per disk, measured in MiB/s. .HP +.BI "\-e(xpansion)" +.IP +Use expanded raidz map allocation function. +.HP .BI "\-v(erbose)" .IP Increase verbosity. diff --git a/man/man1/ztest.1 b/man/man1/ztest.1 index 68c978ca0..3f30b3ed7 100644 --- a/man/man1/ztest.1 +++ b/man/man1/ztest.1 @@ -23,6 +23,7 @@ .\" Copyright (c) 2009 Oracle and/or its affiliates. All rights reserved. .\" Copyright (c) 2009 Michael Gebetsroither . All rights .\" reserved. +.\" Copyright (c) 2017, Intel Corporation. .\" .TH ZTEST 1 "Aug 24, 2020" OpenZFS @@ -82,13 +83,29 @@ Used alignment in test. .IP Number of mirror copies. .HP -.BI "\-r" " raidz_disks" " (default: 4)" +.BI "\-r" " raidz_disks / draid_disks" " (default: 4 / 16)" .IP Number of raidz disks. .HP -.BI "\-R" " raidz_parity" " (default: 1)" +.BI "\-R" " raid_parity" " (default: 1)" .IP -Raidz parity. +Raid parity (raidz & draid). +.HP +.BI "\-K" " raid_kind" " (default: 'random') raidz|draid|random" +.IP +The kind of RAID config to use. With 'random' the kind alternates between raidz and draid. +.HP +.BI "\-D" " draid_data" " (default: 4)" +.IP +Number of data disks in a dRAID redundancy group. +.HP +.BI "\-S" " draid_spares" " (default: 1)" +.IP +Number of dRAID distributed spare disks. +.HP +.BI "\-C" " vdev_class_state" " (default: random)" +.IP +The vdev allocation class state: special=on|off|random. .HP .BI "\-d" " datasets" " (default: 7)" .IP diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 469963750..5b4dac42f 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -2902,6 +2902,31 @@ top-level vdev. Default value: \fB1,048,576\fR. .RE +.sp +.ne 2 +.na +\fBzfs_rebuild_scrub_enabled\fR (int) +.ad +.RS 12n +Automatically start a pool scrub when the last active sequential resilver +completes in order to verify the checksums of all blocks which have been +resilvered. This option is enabled by default and is strongly recommended. +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_rebuild_vdev_limit\fR (ulong) +.ad +.RS 12n +Maximum amount of I/O that can be concurrently issued for a sequential +resilver per leaf device, given in bytes.
+.sp +Default value: \fB33,554,432\fR. +.RE + .sp .ne 2 .na diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index 08a84ece2..2e5ab4c37 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -306,6 +306,30 @@ This feature becomes \fBactive\fR when the \fBzpool remove\fR subcommand is used on a top-level vdev, and will never return to being \fBenabled\fR. .RE +.sp +.ne 2 +.na +\fBdraid\fR +.ad +.RS 4n +.TS +l l . +GUID org.openzfs:draid +READ\-ONLY COMPATIBLE no +DEPENDENCIES none +.TE + +This feature enables use of the \fBdraid\fR vdev type. dRAID is a variant +of raidz which provides integrated distributed hot spares that allow faster +resilvering while retaining the benefits of raidz. Data, parity, and spare +space are organized in redundancy groups and distributed evenly over all of +the devices. + +This feature becomes \fBactive\fR when creating a pool which uses the +\fBdraid\fR vdev type, or when adding a new \fBdraid\fR vdev to an +existing pool. +.RE + .sp .ne 2 .na diff --git a/man/man8/zpool-create.8 b/man/man8/zpool-create.8 index 7f3f27b9b..7406a493e 100644 --- a/man/man8/zpool-create.8 +++ b/man/man8/zpool-create.8 @@ -73,12 +73,14 @@ and period The pool names .Sy mirror , .Sy raidz , +.Sy draid , .Sy spare and .Sy log are reserved, as are names beginning with .Sy mirror , .Sy raidz , +.Sy draid , .Sy spare , and the pattern .Sy c[0-9] . diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8 index ede569978..6ff2eb261 100644 --- a/man/man8/zpool-scrub.8 +++ b/man/man8/zpool-scrub.8 @@ -52,7 +52,7 @@ Begins a scrub or resumes a paused scrub. The scrub examines all data in the specified pools to verify that it checksums correctly. For replicated -.Pq mirror or raidz +.Pq mirror, raidz, or draid devices, ZFS automatically repairs any damage discovered during the scrub. The .Nm zpool Cm status diff --git a/man/man8/zpoolconcepts.8 b/man/man8/zpoolconcepts.8 index f9c262f4b..d999b0354 100644 --- a/man/man8/zpoolconcepts.8 +++ b/man/man8/zpoolconcepts.8 @@ -64,7 +64,7 @@ A file must be specified by a full path. A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with N disks of size X can hold X bytes and can withstand (N-1) devices -failing before data integrity is compromised. +failing without losing data. .It Sy raidz , raidz1 , raidz2 , raidz3 A variation on RAID-5 that allows for better distribution of parity and eliminates the RAID-5 @@ -88,11 +88,75 @@ vdev type is an alias for .Sy raidz1 . .Pp A raidz group with N disks of size X with P parity disks can hold approximately -(N-P)*X bytes and can withstand P device(s) failing before data integrity is -compromised. +(N-P)*X bytes and can withstand P device(s) failing without losing data. The minimum number of devices in a raidz group is one more than the number of parity disks. The recommended number is between 3 and 9 to help increase performance. +.It Sy draid , draid1 , draid2 , draid3 +A variant of raidz that provides integrated distributed hot spares which +allows for faster resilvering while retaining the benefits of raidz. +A dRAID vdev is constructed from multiple internal raidz groups, each with D +data devices and P parity devices. +These groups are distributed over all of the children in order to fully +utilize the available disk performance. +.Pp +Unlike raidz, dRAID uses a fixed stripe width (padding as necessary with +zeros) to allow fully sequential resilvering. 
+This fixed stripe width significantly affects both usable capacity and IOPS. +For example, with the default D=8 and 4k disk sectors the minimum allocation +size is 32k. +If using compression, this relatively large allocation size can reduce the +effective compression ratio. +When using ZFS volumes and dRAID the default volblocksize property is increased +to account for the allocation size. +If a dRAID pool will hold a significant amount of small blocks, it is +recommended to also add a mirrored +.Sy special +vdev to store those blocks. +.Pp +In terms of IOPS, performance is similar to raidz since for any read all D +data disks must be accessed. +Delivered random IOPS can be reasonably approximated as +floor((N-S)/(D+P)) * single-drive-IOPS. +.Pp +Like raidz, a dRAID can have single-, double-, or triple-parity. The +.Sy draid1 , +.Sy draid2 , +and +.Sy draid3 +types can be used to specify the parity level. +The +.Sy draid +vdev type is an alias for +.Sy draid1 . +.Pp +A dRAID with N disks of size X, D data disks per redundancy group, P parity +level, and S distributed hot spares can hold approximately (N-S)*(D/(D+P))*X +bytes and can withstand P device(s) failing without losing data. +.It Sy draid[<parity>][:<data>d][:<children>c][:<spares>s] +A non-default dRAID configuration can be specified by appending one or more +of the following optional arguments to the +.Sy draid +keyword. +.Pp +.Em parity +- The parity level (1-3). +.Pp +.Em data +- The number of data devices per redundancy group. +In general, a smaller value of D will increase IOPS, improve the compression +ratio, and speed up resilvering at the expense of total usable capacity. +Defaults to 8, unless N-P-S is less than 8. +.Pp +.Em children +- The expected number of children. +Useful as a cross-check when listing a large number of devices. +An error is returned when the provided number of children differs. +.Pp +.Em spares +- The number of distributed hot spares. +Defaults to zero. +.Pp .It Sy spare A pseudo-vdev which keeps track of available hot spares for a pool. For more information, see the @@ -273,6 +337,14 @@ If the original faulted device is detached, then the hot spare assumes its place in the configuration, and is removed from the spare list of all active pools. .Pp +The +.Sy draid +vdev type provides distributed hot spares. +These hot spares are named after the dRAID vdev they're a part of ( +.Qq draid1-2-3 specifies spare 3 of vdev 2, which is a single parity dRAID +) and may only be used by that dRAID vdev. +Otherwise, they behave the same as normal hot spares. +.Pp Spares cannot replace log devices.
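The estimates quoted in the man page text above are easy to check numerically. The sketch below plugs a hypothetical draid1:4d:11c:1s layout into the capacity and IOPS approximations and into the row-based allocation arithmetic used by the vdev_draid_asize() helper earlier in this patch; all the configuration values are made up for illustration:

/* Sketch: numeric sanity check of the dRAID capacity/IOPS/asize estimates. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t N = 11, D = 4, P = 1, S = 1;	/* hypothetical draid1:4d:11c:1s */
	uint64_t X = 10995116277760ULL;		/* 10 TiB per disk */
	uint64_t ashift = 12;			/* 4k sectors */
	uint64_t blksize = 131072;		/* one 128k block */

	/* Usable capacity ~= (N-S) * (D/(D+P)) * X */
	uint64_t capacity = (N - S) * D * X / (D + P);

	/* Random read IOPS ~= floor((N-S)/(D+P)) * single-drive IOPS */
	uint64_t groups = (N - S) / (D + P);

	/* Space allocated for one block, as in vdev_draid_asize() */
	uint64_t rows = ((blksize - 1) / (D << ashift)) + 1;
	uint64_t asize = (rows * (D + P)) << ashift;

	/* prints: capacity=87960930222080 groups=2 asize=163840 */
	(void) printf("capacity=%llu groups=%llu asize=%llu\n",
	    (unsigned long long)capacity, (unsigned long long)groups,
	    (unsigned long long)asize);
	return (0);
}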
.Ss Intent Log The ZFS Intent Log (ZIL) satisfies POSIX requirements for synchronous diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 4a2514fd4..1acf543ac 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -243,6 +243,8 @@ SRCS+= abd.c \ unique.c \ vdev.c \ vdev_cache.c \ + vdev_draid.c \ + vdev_draid_rand.c \ vdev_indirect.c \ vdev_indirect_births.c \ vdev_indirect_mapping.c \ @@ -341,6 +343,7 @@ CFLAGS.lz4.c= -Wno-cast-qual CFLAGS.spa.c= -Wno-cast-qual CFLAGS.spa_misc.c= -Wno-cast-qual CFLAGS.sysctl_os.c= -include ../zfs_config.h +CFLAGS.vdev_draid.c= -Wno-cast-qual CFLAGS.vdev_raidz.c= -Wno-cast-qual CFLAGS.vdev_raidz_math.c= -Wno-cast-qual CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c index cf762c5fd..825bd706e 100644 --- a/module/os/freebsd/zfs/vdev_file.c +++ b/module/os/freebsd/zfs/vdev_file.c @@ -292,19 +292,28 @@ vdev_file_io_done(zio_t *zio) } vdev_ops_t vdev_file_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_FILE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; /* @@ -313,19 +322,28 @@ vdev_ops_t vdev_file_ops = { #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; #endif diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index b888cfdf0..ae7cbe60a 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1189,17 +1189,26 @@ vdev_geom_rele(vdev_t *vd) } vdev_ops_t vdev_disk_ops = { - vdev_geom_open, - vdev_geom_close, - vdev_default_asize, - vdev_geom_io_start, - vdev_geom_io_done, - 
NULL, - NULL, - vdev_geom_hold, - vdev_geom_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_geom_open, + .vdev_op_close = vdev_geom_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_geom_io_start, + .vdev_op_io_done = vdev_geom_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_geom_hold, + .vdev_op_rele = vdev_geom_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 7de5c30f7..12117655b 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -826,9 +826,13 @@ vdev_disk_rele(vdev_t *vd) } vdev_ops_t vdev_disk_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_disk_io_start, .vdev_op_io_done = vdev_disk_io_done, .vdev_op_state_change = NULL, @@ -837,6 +841,11 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_rele = vdev_disk_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c index 423ce8581..bf8a13ae6 100644 --- a/module/os/linux/zfs/vdev_file.c +++ b/module/os/linux/zfs/vdev_file.c @@ -305,9 +305,13 @@ vdev_file_io_done(zio_t *zio) } vdev_ops_t vdev_file_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_file_open, .vdev_op_close = vdev_file_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, .vdev_op_io_done = vdev_file_io_done, .vdev_op_state_change = NULL, @@ -316,6 +320,11 @@ vdev_ops_t vdev_file_ops = { .vdev_op_rele = vdev_file_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; @@ -341,9 +350,13 @@ vdev_file_fini(void) #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_file_open, .vdev_op_close = vdev_file_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, .vdev_op_io_done = vdev_file_io_done, .vdev_op_state_change = NULL, @@ -352,6 +365,11 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_rele = vdev_file_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init 
= NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 97ddacbab..599791d49 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -576,7 +576,7 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_DEVICE_REBUILD, "org.openzfs:device_rebuild", "device_rebuild", - "Support for sequential device rebuilds", + "Support for sequential mirror/dRAID device rebuilds", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); { @@ -589,6 +589,10 @@ zpool_feature_init(void) "zstd compression algorithm support.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps); } + + zfeature_register(SPA_FEATURE_DRAID, + "org.openzfs:draid", "draid", "Support for distributed parity RAID", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL); } #if defined(_KERNEL) diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index f8625042a..0011a971c 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -442,7 +442,9 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) return (-1); } - if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { + if (strcmp(pool, "mirror") == 0 || + strcmp(pool, "raidz") == 0 || + strcmp(pool, "draid") == 0) { if (why) *why = NAME_ERR_RESERVED; return (-1); diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 8ee524fff..653ea0da9 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -84,6 +84,8 @@ $(MODULE)-objs += uberblock.o $(MODULE)-objs += unique.o $(MODULE)-objs += vdev.o $(MODULE)-objs += vdev_cache.o +$(MODULE)-objs += vdev_draid.o +$(MODULE)-objs += vdev_draid_rand.o $(MODULE)-objs += vdev_indirect.o $(MODULE)-objs += vdev_indirect_births.o $(MODULE)-objs += vdev_indirect_mapping.o diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 6018a42ca..68d4aa5f5 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -781,16 +781,17 @@ int abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_iter_func_t *func, void *private) { - int ret = 0; struct abd_iter aiter; - boolean_t abd_multi; - abd_t *c_abd; + int ret = 0; + + if (size == 0) + return (0); abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - abd_multi = abd_is_gang(abd); - c_abd = abd_init_abd_iter(abd, &aiter, off); + boolean_t abd_multi = abd_is_gang(abd); + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { /* If we are at the end of the gang ABD we are done */ @@ -920,6 +921,9 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, boolean_t dabd_is_gang_abd, sabd_is_gang_abd; abd_t *c_dabd, *c_sabd; + if (size == 0) + return (0); + abd_verify(dabd); abd_verify(sabd); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f6a5ceca6..40adfbcee 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -713,7 +713,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) return (0); } -static void +void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; @@ -3327,20 +3327,13 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, return (B_TRUE); } - /* - * Check if the txg falls within the range which must be - * resilvered. DVAs outside this range can always be skipped. 
- */ - if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) - return (B_FALSE); - /* * Check if the top-level vdev must resilver this offset. * When the offset does not intersect with a dirty leaf DTL * then it may be possible to skip the resilver IO. The psize * is provided instead of asize to simplify the check for RAIDZ. */ - if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) + if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth)) return (B_FALSE); /* diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 325f505b7..fcf1285f6 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -1563,6 +1564,7 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, #if defined(WITH_DF_BLOCK_ALLOCATOR) || \ defined(WITH_CF_BLOCK_ALLOCATOR) + /* * This is a helper function that can be used by the allocator to find a * suitable block to allocate. This will search the specified B-tree looking @@ -1654,6 +1656,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) range_seg_t *rs; if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); + if (metaslab_df_use_largest_segment) { /* use largest free segment */ rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); @@ -2616,6 +2619,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, ms->ms_allocator = -1; ms->ms_new = B_TRUE; + vdev_ops_t *ops = vd->vdev_ops; + if (ops->vdev_op_metaslab_init != NULL) + ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); + /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. @@ -5813,7 +5820,6 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } - } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 99852521b..d05c9db24 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -307,8 +307,17 @@ mmp_next_leaf(spa_t *spa) if (leaf == NULL) leaf = list_head(&spa->spa_leaf_list); - if (!vdev_writeable(leaf)) { + /* + * We skip unwritable, offline, detached, and dRAID spare + * devices as they are either not legal targets or the write + * may fail or not be seen by other hosts. Skipped dRAID + * spares can never be written so the fail mask is not set. + */ + if (!vdev_writeable(leaf) || leaf->vdev_offline || + leaf->vdev_detached) { fail_mask |= MMP_FAIL_NOT_WRITABLE; + } else if (leaf->vdev_ops == &vdev_draid_spare_ops) { + continue; } else if (leaf->vdev_mmp_pending != 0) { fail_mask |= MMP_FAIL_WRITE_PENDING; } else { diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9d1d4e0cc..ae8964e6f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -3681,7 +3682,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, /* * Build a new vdev tree from the trusted config */ - VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); + if (error != 0) { + nvlist_free(mos_config); + spa_config_exit(spa, SCL_ALL, FTAG); + spa_load_failed(spa, "spa_config_parse failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } /* * Vdev paths in the MOS may be obsolete. 
If the untrusted config was @@ -5631,7 +5639,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version, obj; + uint64_t version, obj, ndraid = 0; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; @@ -5753,8 +5761,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = spa_validate_aux(spa, nvroot, txg, - VDEV_ALLOC_ADD)) == 0) { + (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && + (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point @@ -5895,6 +5903,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_sync_props(props, tx); } + + for (int i = 0; i < ndraid; i++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; @@ -6403,13 +6414,26 @@ spa_reset(const char *pool) * ========================================================================== */ +/* + * This is called as a synctask to increment the draid feature flag. + */ +static void +spa_draid_feature_incr(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + int draid = (int)(uintptr_t)arg; + + for (int c = 0; c < draid; c++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); +} + /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { - uint64_t txg; + uint64_t txg, ndraid = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; @@ -6438,8 +6462,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && - (error = vdev_create(vd, txg, B_FALSE)) != 0) + (error = vdev_create(vd, txg, B_FALSE)) != 0) { return (spa_vdev_exit(spa, vd, txg, error)); + } + + /* + * The virtual dRAID spares must be added after the vdev tree is + * created and the vdev guids are generated. The guid of their + * associated dRAID is stored in the config and used when opening + * the spare. + */ + if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, + rvd->vdev_children)) == 0) { + if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) + nspares = 0; + } else { + return (spa_vdev_exit(spa, vd, txg, error)); + } /* * We must validate the spares and l2cache devices after checking the @@ -6452,7 +6491,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect - * vdevs, we can not add raidz toplevels.
+ * vdevs, we can not add raidz or dRAID top levels. */ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { @@ -6462,10 +6501,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } - /* Fail if top level vdev is raidz */ - if (tvd->vdev_ops == &vdev_raidz_ops) { + /* Fail if top level vdev is raidz or a dRAID */ + if (vdev_get_nparity(tvd) != 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } + /* * Need the top level mirror to be * a mirror of leaf vdevs only @@ -6505,6 +6544,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) spa->spa_l2cache.sav_sync = B_TRUE; } + /* + * We can't increment a feature while holding spa_vdev so we + * have to do it in a synctask. + */ + if (ndraid != 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, + (void *)(uintptr_t)ndraid, tx); + dmu_tx_commit(tx); + } + /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we @@ -6615,14 +6667,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + /* + * A dRAID spare can only replace a child of its parent dRAID vdev. + */ + if (newvd->vdev_ops == &vdev_draid_spare_ops && + oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + if (rebuild) { /* - * For rebuilds, the parent vdev must support reconstruction + * For rebuilds, the top vdev must support reconstruction using only space maps. This means the only allowable - * parents are the root vdev or a mirror vdev. + * vdev types are the root vdev, a mirror, or dRAID. */ - if (pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_root_ops) { + tvd = pvd; + if (pvd->vdev_top != NULL) + tvd = pvd->vdev_top; + + if (tvd->vdev_ops != &vdev_mirror_ops && + tvd->vdev_ops != &vdev_root_ops && + tvd->vdev_ops != &vdev_draid_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } } @@ -6915,14 +6980,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) } /* - * If we are detaching the original disk from a spare, then it implies - * that the spare should become a real disk, and be removed from the - * active spare list for the pool. + * If we are detaching the original disk from a normal spare, then it + * implies that the spare should become a real disk, and be removed + * from the active spare list for the pool. dRAID spares on the + * other hand are coupled to the pool and thus should never be removed + * from the spares list. */ - if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0 && - pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) - unspare = B_TRUE; + if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { + vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; + + if (last_cvd->vdev_isspare && + last_cvd->vdev_ops != &vdev_draid_spare_ops) { + unspare = B_TRUE; + } + } /* * Erase the disk labels so the disk can be used for other things. @@ -8013,18 +8084,9 @@ spa_async_thread(void *arg) /* * If any devices are done replacing, detach them. */ - if (tasks & SPA_ASYNC_RESILVER_DONE) + if (tasks & SPA_ASYNC_RESILVER_DONE || + tasks & SPA_ASYNC_REBUILD_DONE) { spa_vdev_resilver_done(spa); - - /* - * If any devices are done replacing, detach them.
Then if no - * top-level vdevs are rebuilding attempt to kick off a scrub. - */ - if (tasks & SPA_ASYNC_REBUILD_DONE) { - spa_vdev_resilver_done(spa); - - if (!vdev_rebuild_active(spa->spa_root_vdev)) - (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB); } /* diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 1640dcedd..c6b3e8c11 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -741,6 +741,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; + spa->spa_min_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index e41e79ab8..38f36e52f 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent) static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, + &vdev_draid_ops, + &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, @@ -221,10 +225,11 @@ vdev_getops(const char *type) /* ARGSUSED */ void -vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res) +vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { - res->rs_start = in->rs_start; - res->rs_end = in->rs_end; + physical_rs->rs_start = logical_rs->rs_start; + physical_rs->rs_end = logical_rs->rs_end; } /* @@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) return (asize); } +uint64_t +vdev_default_min_asize(vdev_t *vd) +{ + return (vd->vdev_min_asize); +} + /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to @@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd) if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); - /* - * The allocatable space for a raidz vdev is N * sizeof(smallest child), - * so each child must provide at least 1/Nth of its asize. - */ - if (pvd->vdev_ops == &vdev_raidz_ops) - return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / - pvd->vdev_children); - - return (pvd->vdev_min_asize); + return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void @@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd) vdev_set_min_asize(vd->vdev_child[c]); } +/* + * Get the minimal allocation size for the top-level vdev. + */ +uint64_t +vdev_get_min_alloc(vdev_t *vd) +{ + uint64_t min_alloc = 1ULL << vd->vdev_ashift; + + if (vd->vdev_ops->vdev_op_min_alloc != NULL) + min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); + + return (min_alloc); +} + +/* + * Get the parity level for a top-level vdev. + */ +uint64_t +vdev_get_nparity(vdev_t *vd) +{ + uint64_t nparity = 0; + + if (vd->vdev_ops->vdev_op_nparity != NULL) + nparity = vd->vdev_ops->vdev_op_nparity(vd); + + return (nparity); +} + +/* + * Get the number of data disks for a top-level vdev. 
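+ * Defaults to 1 for vdev types which do not implement vdev_op_ndisks().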
+ */ +uint64_t +vdev_get_ndisks(vdev_t *vd) +{ + uint64_t ndisks = 1; + + if (vd->vdev_ops->vdev_op_ndisks != NULL) + ndisks = vd->vdev_ops->vdev_op_ndisks(vd); + + return (ndisks); +} + vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { @@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); + mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); @@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); - cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, @@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, { vdev_ops_t *ops; char *type; - uint64_t guid = 0, islog, nparity; + uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; char *tmp = NULL; @@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); - /* - * Set the nparity property for RAID-Z vdevs. - */ - nparity = -1ULL; - if (ops == &vdev_raidz_ops) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &nparity) == 0) { - if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) - return (SET_ERROR(EINVAL)); - /* - * Previous versions could only support 1 or 2 parity - * device. - */ - if (nparity > 1 && - spa_version(spa) < SPA_VERSION_RAIDZ2) - return (SET_ERROR(ENOTSUP)); - if (nparity > 2 && - spa_version(spa) < SPA_VERSION_RAIDZ3) - return (SET_ERROR(ENOTSUP)); - } else { - /* - * We require the parity to be specified for SPAs that - * support multiple parity levels. - */ - if (spa_version(spa) >= SPA_VERSION_RAIDZ2) - return (SET_ERROR(EINVAL)); - /* - * Otherwise, we default to 1 parity device for RAID-Z. - */ - nparity = 1; - } - } else { - nparity = 0; - } - ASSERT(nparity != -1ULL); - - /* - * If creating a top-level vdev, check for allocation classes input - */ if (top_level && alloctype == VDEV_ALLOC_ADD) { char *bias; + /* + * If creating a top-level vdev, check for allocation + * classes input. + */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); @@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, return (SET_ERROR(ENOTSUP)); } } + + /* spa_vdev_add() expects feature to be enabled */ + if (ops == &vdev_draid_ops && + spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { + return (SET_ERROR(ENOTSUP)); + } + } + + /* + * Initialize the vdev specific data. This is done before calling + * vdev_alloc_common() since it may fail and this simplifies the + * error reporting and cleanup code paths. 
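+ * Ops with no private state, e.g. mirror and replacing vdevs, simply + * leave vdev_op_init unset, in which case vdev_tsd remains NULL.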
+ */ + void *tsd = NULL; + if (ops->vdev_op_init != NULL) { + rc = ops->vdev_op_init(spa, nv, &tsd); + if (rc != 0) { + return (rc); + } } vd = vdev_alloc_common(spa, id, guid, ops); - vic = &vd->vdev_indirect_config; - + vd->vdev_tsd = tsd; vd->vdev_islog = islog; - vd->vdev_nparity = nparity; + if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; @@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; + vic = &vd->vdev_indirect_config; + ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); @@ -937,6 +967,9 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + if (vd->vdev_ops->vdev_op_fini != NULL) + vd->vdev_ops->vdev_op_fini(vd); + /* * Discard allocation state. */ @@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd) cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); - mutex_destroy(&vd->vdev_rebuild_io_lock); cv_destroy(&vd->vdev_rebuild_cv); - cv_destroy(&vd->vdev_rebuild_io_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); @@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd) } /* - * Add a mirror/replacing vdev above an existing vdev. + * Add a mirror/replacing vdev above an existing vdev. There is no need to + * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) @@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; + + uint64_t min_alloc = vdev_get_min_alloc(vd); + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; } } } @@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd) return (B_FALSE); } +/* + * Returns B_TRUE if the passed child should be opened. + */ +static boolean_t +vdev_default_open_children_func(vdev_t *vd) +{ + return (B_TRUE); +} + +/* + * Open the requested child vdevs. If any of the leaf vdevs are using + * a ZFS volume then do the opens in a single thread. This avoids a + * deadlock when the current thread is holding the spa_namespace_lock. + */ +static void +vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) +{ + int children = vd->vdev_children; + + taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + vd->vdev_nonrot = B_TRUE; + + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (open_func(cvd) == B_FALSE) + continue; + + if (tq == NULL || vdev_uses_zvols(vd)) { + cvd->vdev_open_error = vdev_open(cvd); + } else { + VERIFY(taskq_dispatch(tq, vdev_open_child, + cvd, TQ_SLEEP) != TASKQID_INVALID); + } + + vd->vdev_nonrot &= cvd->vdev_nonrot; + } + + if (tq != NULL) { + taskq_wait(tq); + taskq_destroy(tq); + } +} + +/* + * Open all child vdevs. 
+ */ void vdev_open_children(vdev_t *vd) { - taskq_t *tq; - int children = vd->vdev_children; + vdev_open_children_impl(vd, vdev_default_open_children_func); +} - /* - * in order to handle pools on top of zvols, do the opens - * in a single thread so that the same thread holds the - * spa_namespace_lock - */ - if (vdev_uses_zvols(vd)) { -retry_sync: - for (int c = 0; c < children; c++) - vd->vdev_child[c]->vdev_open_error = - vdev_open(vd->vdev_child[c]); - } else { - tq = taskq_create("vdev_open", children, minclsyspri, - children, children, TASKQ_PREPOPULATE); - if (tq == NULL) - goto retry_sync; - - for (int c = 0; c < children; c++) - VERIFY(taskq_dispatch(tq, vdev_open_child, - vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID); - - taskq_destroy(tq); - } - - vd->vdev_nonrot = B_TRUE; - - for (int c = 0; c < children; c++) - vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; +/* + * Conditionally open a subset of child vdevs. + */ +void +vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) +{ + vdev_open_children_impl(vd, open_func); } /* @@ -1952,6 +2016,16 @@ vdev_open(vdev_t *vd) return (error); } + /* + * Track the minimum allocation size. + */ + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + vd->vdev_islog == 0 && vd->vdev_aux == NULL) { + uint64_t min_alloc = vdev_get_min_alloc(vd); + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; + } + /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since @@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd) vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT(vd != NULL); + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are @@ -2606,10 +2682,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) } /* - * Returns B_TRUE if vdev determines offset needs to be resilvered. + * Check if the txg falls within the range which must be + * resilvered. DVAs outside this range can always be skipped. */ boolean_t -vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + /* Set by sequential resilver. */ + if (phys_birth == TXG_UNKNOWN) + return (B_TRUE); + + return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); +} + +/* + * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. + */ +boolean_t +vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); @@ -2617,7 +2709,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) vd->vdev_ops->vdev_op_leaf) return (B_TRUE); - return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); + return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, + phys_birth)); } /* @@ -2862,8 +2955,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e.
non-zero */ - else if (vd->vdev_nparity != 0) - minref = vd->vdev_nparity + 1; /* RAID-Z */ + else if (vdev_get_nparity(vd) != 0) + minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); @@ -3727,6 +3820,9 @@ top: if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; @@ -3971,6 +4067,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio) static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { + /* + * Exclude the dRAID spare when aggregating to avoid double counting + * the ops and bytes. These IOs are counted by the physical leaves. + */ + if (cvd->vdev_ops == &vdev_draid_spare_ops) + return; + for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; @@ -4063,7 +4166,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); - } } else { /* @@ -4248,7 +4350,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize) /* * Repair is the result of a rebuild issued by the - * rebuild thread (vdev_rebuild_thread). + * rebuild thread (vdev_rebuild_thread). To avoid + * double counting repaired bytes the virtual dRAID + * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; @@ -4256,8 +4360,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); + } vs->vs_rebuild_processed += psize; } @@ -4981,31 +5087,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) vdev_resilver_needed(vd, NULL, NULL)); } +boolean_t +vdev_xlate_is_empty(range_seg64_t *rs) +{ + return (rs->rs_start == rs->rs_end); +} + /* - * Translate a logical range to the physical range for the specified vdev_t. - * This function is initially called with a leaf vdev and will walk each - * parent vdev until it reaches a top-level vdev. Once the top-level is - * reached the physical range is initialized and the recursive function - * begins to unwind. As it unwinds it calls the parent's vdev specific - * translation function to do the real conversion. + * Translate a logical range to the first contiguous physical range for the + * specified vdev_t. This function is initially called with a leaf vdev and + * will walk each parent vdev until it reaches a top-level vdev. Once the + * top-level is reached the physical range is initialized and the recursive + * function begins to unwind. As it unwinds it calls the parent's vdev + * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs) + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { - vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); + vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, + remain_rs); } else { /* - * We've reached the top-level vdev, initialize the - * physical range to the logical range and start to - * unwind. 
+ * We've reached the top-level vdev; initialize the physical + * range to the logical range, set an empty remaining + * range, and start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; + + remain_rs->rs_start = logical_rs->rs_start; + remain_rs->rs_end = logical_rs->rs_start; + return; } @@ -5015,16 +5132,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, /* * As this recursive function unwinds, translate the logical - * range into its physical components by calling the - * vdev specific translate function. + * range into its physical and any remaining components by calling + * the vdev specific translate function. */ range_seg64_t intermediate = { 0 }; - pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); + pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } +void +vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, + vdev_xlate_func_t *func, void *arg) +{ + range_seg64_t iter_rs = *logical_rs; + range_seg64_t physical_rs; + range_seg64_t remain_rs; + + while (!vdev_xlate_is_empty(&iter_rs)) { + + vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); + + /* + * With raidz and dRAID, it's possible that the logical range + * does not live on this leaf vdev. The provided function is + * only called when the physical size is non-zero. + */ + if (!vdev_xlate_is_empty(&physical_rs)) + func(arg, &physical_rs); + + iter_rs = remain_rs; + } +} + /* * Look at the vdev tree and determine whether any devices are currently being * replaced. diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c new file mode 100644 index 000000000..6b7ad7021 --- /dev/null +++ b/module/zfs/vdev_draid.c @@ -0,0 +1,2984 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef ZFS_DEBUG +#include /* For vdev_xlate() in vdev_draid_io_verify() */ +#endif + +/* + * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is + * comprised of multiple raidz redundancy groups which are spread over the + * dRAID children. To ensure an even distribution, and avoid hot spots, a + * permutation mapping is applied to the order of the dRAID children. + * This mixing effectively distributes the parity columns evenly over all + * of the disks in the dRAID.
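+ * Concretely, a logical child index i is remapped to the physical child + * (map[i] + iter) % children, where map is one row of the fixed + * permutation array and iter cycles through 0..children-1 before the + * next row is used (see vdev_draid_get_perm() and + * vdev_draid_permute_id() below).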
+ * + * This is beneficial because it means that when resilvering, all of the + * disks can participate, thereby increasing the available IOPS and + * bandwidth. Furthermore, by reserving a small fraction of each child's + * total capacity, virtual distributed spare disks can be created. These + * spares similarly benefit from the performance gains of spanning all of + * the children. The consequence is that resilvering to a distributed + * spare can substantially reduce the time required to restore full parity + * to a pool with a failed disk. + * + * === dRAID group layout === + * + * First, let's define a "row" in the configuration to be a 16M chunk from + * each physical drive at the same offset. This is the minimum allowable + * size since it must be possible to store a full 16M block when there is + * only a single data column. Next, we define a "group" to be a set of + * sequential disks containing both the parity and data columns. We allow + * groups to span multiple rows in order to align any group size to any + * number of physical drives. Finally, a "slice" is comprised of the rows + * which contain the target number of groups. The permutation mappings + * are applied in a round robin fashion to each slice. + * + * Given D+P drives in a group (including parity drives) and C-S physical + * drives (not including the spare drives), we can distribute the groups + * across R rows without remainder by sizing a slice at the least common + * multiple of D+P and C-S drive slots; i.e. ngroups = LCM(D+P, C-S) / (D+P) + * and R = LCM(D+P, C-S) / (C-S). + * + * In the example below, there are C=14 physical drives in the configuration + * with S=2 drives worth of spare capacity. Each group has a width of 9 + * which includes D=8 data and P=1 parity drive. There are 4 groups and + * 3 rows per slice. Each group has a size of 144M (16M * 9) and the slice + * size is 576M (144M * 4). When allocating from a dRAID each group is + * filled before moving on to the next as shown in slice0 below. + * + * data disks (8 data + 1 parity) spares (2) + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * ^ | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0 + * | +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | | group 0 | group 1..| | + * | +-----------------------------------+-----------+-------| + * | | 0 1 2 3 4 5 6 7 8 | 36 37 38| | r + * | | 9 10 11 12 13 14 15 16 17| 45 46 47| | o + * | | 18 19 20 21 22 23 24 25 26| 54 55 56| | w + * | | 27 28 29 30 31 32 33 34 35| 63 64 65| | 0 + * s +-----------------------+-----------------------+-------+ + * l | ..group 1 | group 2..
| | + * i +-----------------------+-----------------------+-------+ + * c | 39 40 41 42 43 44| 72 73 74 75 76 77| | r + * e | 48 49 50 51 52 53| 81 82 83 84 85 86| | o + * 0 | 57 58 59 60 61 62| 90 91 92 93 94 95| | w + * | 66 67 68 69 70 71| 99 100 101 102 103 104| | 1 + * | +-----------+-----------+-----------------------+-------+ + * | |..group 2 | group 3 | | + * | +-----------+-----------+-----------------------+-------+ + * | | 78 79 80|108 109 110 111 112 113 114 115 116| | r + * | | 87 88 89|117 118 119 120 121 122 123 124 125| | o + * | | 96 97 98|126 127 128 129 130 131 132 133 134| | w + * v |105 106 107|135 136 137 138 139 140 141 142 143| | 2 + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 4 | group 5..| | row 3 + * i +-----------------------+-----------+-----------+-------| + * c | ..group 5 | group 6.. | | row 4 + * e +-----------+-----------+-----------------------+-------+ + * 1 |..group 6 | group 7 | | row 5 + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 8 | group 9..| | row 6 + * i +-----------------------------------------------+-------| + * c | ..group 9 | group 10.. | | row 7 + * e +-----------------------+-----------------------+-------+ + * 2 |..group 10 | group 11 | | row 8 + * +-----------+-----------------------------------+-------+ + * + * This layout has several advantages over requiring that each row contain + * a whole number of groups. + * + * 1. The group count is not a relevant parameter when defining a dRAID + * layout. Only the group width is needed, and *all* groups will have + * the desired size. + * + * 2. All possible group widths (<= physical disk count) can be supported. + * + * 3. The logic within vdev_draid.c is simplified when the group width is + * the same for all groups (although some of the logic around computing + * permutation numbers and drive offsets is more complicated). + * + * N.B. The following array describes all valid dRAID permutation maps. + * Each row is used to generate a permutation map for a different number + * of children from a unique seed. The seeds were generated and carefully + * evaluated by the 'draid' utility in order to provide balanced mappings. + * In addition to the seed a checksum of the in-memory mapping is stored + * for verification. + * + * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed, + * with a given permutation map) is the ratio of the amounts of I/O that will + * be sent to the least and most busy disks when resilvering. The average + * imbalance ratio (of a given number of disks and permutation map) is the + * average of the ratios of all possible single and double disk failures. + * + * In order to achieve a low imbalance ratio the number of permutations in + * the mapping must be significantly larger than the number of children. + * For dRAID the number of permutations has been limited to 512 to minimize + * the map size. This does result in a gradually increasing imbalance ratio + * as seen in the table below. Increasing the number of permutations for + * larger child counts would reduce the imbalance ratio. 
However, in practice + * when there are a large number of children each child is responsible for + * fewer total IOs so it's less of a concern. + * + * Note these values are hard coded and must never be changed. Existing + * pools depend on the same mapping always being generated in order to + * read and write from the correct locations. Any change would make + * existing pools completely inaccessible. + */ +static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = { + { 2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d }, /* 1.000 */ + { 3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 }, /* 1.000 */ + { 4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 }, /* 1.000 */ + { 5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 }, /* 1.010 */ + { 6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 }, /* 1.031 */ + { 7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee }, /* 1.043 */ + { 8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 }, /* 1.059 */ + { 9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 }, /* 1.056 */ + { 10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 }, /* 1.072 */ + { 11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c }, /* 1.083 */ + { 12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e }, /* 1.097 */ + { 13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 }, /* 1.100 */ + { 14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 }, /* 1.121 */ + { 15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 }, /* 1.103 */ + { 16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 }, /* 1.111 */ + { 17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe }, /* 1.133 */ + { 18, 256, 0xaa549746b1cbb81c, 0x00000026f02494c9 }, /* 1.131 */ + { 19, 256, 0x892e343f2f31d690, 0x00000029eb392835 }, /* 1.130 */ + { 20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c }, /* 1.141 */ + { 21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 }, /* 1.139 */ + { 22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 }, /* 1.150 */ + { 23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f }, /* 1.174 */ + { 24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 }, /* 1.168 */ + { 25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 }, /* 1.180 */ + { 26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba }, /* 1.226 */ + { 27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 }, /* 1.228 */ + { 28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c }, /* 1.217 */ + { 29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c }, /* 1.239 */ + { 30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 }, /* 1.238 */ + { 31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f }, /* 1.273 */ + { 32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 }, /* 1.191 */ + { 33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 }, /* 1.199 */ + { 34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 }, /* 1.195 */ + { 35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 }, /* 1.201 */ + { 36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef }, /* 1.194 */ + { 37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 }, /* 1.237 */ + { 38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 }, /* 1.242 */ + { 39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd }, /* 1.231 */ + { 40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 }, /* 1.233 */ + { 41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 }, /* 1.271 */ + { 42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 }, /* 1.263 */ + { 43, 512, 0xbaa5125faa781854, 0x000001c76789e278 }, /* 1.270 */ + { 44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb }, /* 1.281 */ + { 45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 }, /* 1.282 */ + { 46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b }, /* 1.286 */ + { 47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 }, /* 1.329 */ + { 48, 
512, 0xbecd9c2571312c5d, 0x000002320fe2872b }, /* 1.286 */ + { 49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 }, /* 1.322 */ + { 50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 }, /* 1.335 */ + { 51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 }, /* 1.305 */ + { 52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf }, /* 1.330 */ + { 53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 }, /* 1.365 */ + { 54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 }, /* 1.334 */ + { 55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 }, /* 1.364 */ + { 56, 512, 0x947a7d3bdaab44c5, 0x000003046680f72e }, /* 1.374 */ + { 57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 }, /* 1.363 */ + { 58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 }, /* 1.401 */ + { 59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c }, /* 1.392 */ + { 60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 }, /* 1.360 */ + { 61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd }, /* 1.396 */ + { 62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c }, /* 1.453 */ + { 63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 }, /* 1.437 */ + { 64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 }, /* 1.402 */ + { 65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 }, /* 1.459 */ + { 66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 }, /* 1.423 */ + { 67, 512, 0x910b9714f698a877, 0x00000451ea65d5db }, /* 1.447 */ + { 68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 }, /* 1.450 */ + { 69, 512, 0x836d4968fbaa3706, 0x000004954068a380 }, /* 1.455 */ + { 70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d }, /* 1.463 */ + { 71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 }, /* 1.463 */ + { 72, 512, 0x42763a680d5bed8e, 0x000005084275c680 }, /* 1.452 */ + { 73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab }, /* 1.498 */ + { 74, 512, 0x9fa08548b1621a44, 0x0000054708019247 }, /* 1.526 */ + { 75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 }, /* 1.491 */ + { 76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 }, /* 1.470 */ + { 77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 }, /* 1.527 */ + { 78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 }, /* 1.509 */ + { 79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e }, /* 1.569 */ + { 80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c }, /* 1.555 */ + { 81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 }, /* 1.509 */ + { 82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 }, /* 1.596 */ + { 83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e }, /* 1.568 */ + { 84, 512, 0xba02545069ddc6dc, 0x000006d19861364f }, /* 1.541 */ + { 85, 512, 0x447c73192c35073e, 0x000006fce315ce35 }, /* 1.623 */ + { 86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b }, /* 1.620 */ + { 87, 512, 0x4874cf98541a35e0, 0x00000758382a2273 }, /* 1.597 */ + { 88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b }, /* 1.575 */ + { 89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc }, /* 1.627 */ + { 90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb }, /* 1.596 */ + { 91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 }, /* 1.622 */ + { 92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e }, /* 1.695 */ + { 93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c }, /* 1.605 */ + { 94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc }, /* 1.625 */ + { 95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 }, /* 1.687 */ + { 96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a }, /* 1.621 */ + { 97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 }, /* 1.699 */ + { 98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b }, /* 1.688 */ + { 99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce }, /* 1.642 */ + { 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc }, /* 1.683 */ + { 101, 
512, 0xf7d4dd8445b46765, 0x000009e5dc542259 }, /* 1.755 */ + { 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 }, /* 1.692 */ + { 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 }, /* 1.747 */ + { 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 }, /* 1.751 */ + { 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 }, /* 1.751 */ + { 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f }, /* 1.726 */ + { 107, 512, 0xca09c50ccee2d054, 0x00000b1c359c047d }, /* 1.788 */ + { 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 }, /* 1.740 */ + { 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 }, /* 1.780 */ + { 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 }, /* 1.836 */ + { 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 }, /* 1.778 */ + { 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 }, /* 1.831 */ + { 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df }, /* 1.825 */ + { 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 }, /* 1.826 */ + { 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 }, /* 1.843 */ + { 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d }, /* 1.826 */ + { 117, 512, 0x533a6090238afd4c, 0x00000d336f115d1b }, /* 1.803 */ + { 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 }, /* 1.857 */ + { 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 }, /* 1.877 */ + { 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 }, /* 1.849 */ + { 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d }, /* 1.867 */ + { 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 }, /* 1.978 */ + { 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d }, /* 1.947 */ + { 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea }, /* 1.865 */ + { 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f }, /* 1.881 */ + { 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b }, /* 1.882 */ + { 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e }, /* 1.867 */ + { 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e }, /* 1.972 */ + { 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 }, /* 1.896 */ + { 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d }, /* 1.965 */ + { 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 }, /* 1.963 */ + { 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 }, /* 1.925 */ + { 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 }, /* 1.862 */ + { 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 }, /* 2.042 */ + { 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 }, /* 1.935 */ + { 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 }, /* 2.005 */ + { 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c }, /* 2.041 */ + { 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 }, /* 1.997 */ + { 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 }, /* 1.996 */ + { 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d }, /* 2.053 */ + { 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a }, /* 1.971 */ + { 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 }, /* 2.018 */ + { 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd }, /* 1.961 */ + { 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 }, /* 2.046 */ + { 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb }, /* 1.968 */ + { 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 }, /* 2.143 */ + { 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 }, /* 2.064 */ + { 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 }, /* 2.023 */ + { 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c }, /* 2.136 */ + { 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 }, /* 2.063 */ + { 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 }, /* 1.974 */ + { 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 }, /* 2.210 */ + { 153, 512, 
0xf7fd345307c2480e, 0x000016e251f28b6a }, /* 2.006 */ + { 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 }, /* 2.193 */ + { 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 }, /* 2.163 */ + { 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc }, /* 2.046 */ + { 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 }, /* 2.084 */ + { 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 }, /* 2.264 */ + { 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 }, /* 2.074 */ + { 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 }, /* 2.282 */ + { 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf }, /* 2.148 */ + { 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 }, /* 2.355 */ + { 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 }, /* 2.164 */ + { 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a }, /* 2.393 */ + { 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 }, /* 2.178 */ + { 166, 512, 0x896967bf495c34d2, 0x00001afb8319b9fc }, /* 2.334 */ + { 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b }, /* 2.266 */ + { 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 }, /* 2.304 */ + { 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d }, /* 2.218 */ + { 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff }, /* 2.377 */ + { 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 }, /* 2.155 */ + { 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 }, /* 2.404 */ + { 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 }, /* 2.205 */ + { 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d }, /* 2.359 */ + { 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 }, /* 2.158 */ + { 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b }, /* 2.614 */ + { 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc }, /* 2.239 */ + { 178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc }, /* 2.493 */ + { 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c }, /* 2.327 */ + { 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 }, /* 2.231 */ + { 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c }, /* 2.237 */ + { 182, 512, 0xe6035defea48f933, 0x00002038e3346658 }, /* 2.691 */ + { 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e }, /* 2.170 */ + { 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 }, /* 2.600 */ + { 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc }, /* 2.391 */ + { 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 }, /* 2.677 */ + { 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c }, /* 2.410 */ + { 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 }, /* 2.776 */ + { 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 }, /* 2.266 */ + { 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 }, /* 2.717 */ + { 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c }, /* 2.474 */ + { 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 }, /* 2.673 */ + { 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 }, /* 2.420 */ + { 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 }, /* 2.898 */ + { 195, 512, 0x08d903b4daedc2e0, 0x0000250d1e15886c }, /* 2.363 */ + { 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e }, /* 2.747 */ + { 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 }, /* 2.531 */ + { 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 }, /* 2.707 */ + { 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 }, /* 2.315 */ + { 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf }, /* 3.012 */ + { 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 }, /* 2.378 */ + { 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 }, /* 2.969 */ + { 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d }, /* 2.594 */ + { 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd }, /* 2.763 */ + { 205, 512, 
0xd47b160a1b1022c8, 0x00002903e8b52411 }, /* 2.457 */ + { 206, 512, 0xc02fc96684715a16, 0x0000297515608601 }, /* 3.057 */ + { 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 }, /* 2.590 */ + { 208, 512, 0x9e3be6e5448b4f33, 0x00002a2846ed074b }, /* 3.047 */ + { 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 }, /* 2.676 */ + { 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 }, /* 2.993 */ + { 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 }, /* 2.457 */ + { 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 }, /* 3.182 */ + { 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 }, /* 2.563 */ + { 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 }, /* 3.025 */ + { 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f }, /* 2.730 */ + { 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 }, /* 3.036 */ + { 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 }, /* 2.722 */ + { 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 }, /* 3.356 */ + { 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 }, /* 2.697 */ + { 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 }, /* 2.979 */ + { 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 }, /* 2.858 */ + { 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e }, /* 3.258 */ + { 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 }, /* 2.693 */ + { 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 }, /* 3.259 */ + { 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c }, /* 2.733 */ + { 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 }, /* 3.235 */ + { 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 }, /* 2.983 */ + { 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e }, /* 3.308 */ + { 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 }, /* 2.715 */ + { 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f }, /* 3.540 */ + { 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 }, /* 2.779 */ + { 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c }, /* 3.084 */ + { 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc }, /* 2.987 */ + { 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae }, /* 3.341 */ + { 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 }, /* 2.793 */ + { 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 }, /* 3.518 */ + { 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 }, /* 2.962 */ + { 238, 512, 0x2fcec95266f9352c, 0x00003785c8df24a9 }, /* 3.196 */ + { 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 }, /* 2.914 */ + { 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 }, /* 3.408 */ + { 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 }, /* 2.903 */ + { 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 }, /* 3.778 */ + { 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c }, /* 3.026 */ + { 244, 512, 0xc740263f0301efa8, 0x00003a147146512d }, /* 3.347 */ + { 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d }, /* 3.212 */ + { 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 }, /* 3.482 */ + { 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 }, /* 3.146 */ + { 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f }, /* 3.626 */ + { 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 }, /* 2.952 */ + { 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e }, /* 3.463 */ + { 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 }, /* 3.131 */ + { 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c }, /* 3.538 */ + { 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac }, /* 2.974 */ + { 254, 512, 0xc4674129750499de, 0x00003e99e86d3e95 }, /* 3.843 */ + { 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 }, /* 3.088 */ +}; + +/* + * Verify the map is valid. 
Each device index must appear exactly + * once in every row, and the permutation array checksum must match. + */ +static int +verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms, + uint64_t checksum) +{ + int countssz = sizeof (uint16_t) * children; + uint16_t *counts = kmem_zalloc(countssz, KM_SLEEP); + + for (int i = 0; i < nperms; i++) { + for (int j = 0; j < children; j++) { + uint8_t val = perms[(i * children) + j]; + + if (val >= children || counts[val] != i) { + kmem_free(counts, countssz); + return (EINVAL); + } + + counts[val]++; + } + } + + if (checksum != 0) { + int permssz = sizeof (uint8_t) * children * nperms; + zio_cksum_t cksum; + + fletcher_4_native_varsize(perms, permssz, &cksum); + + if (checksum != cksum.zc_word[0]) { + kmem_free(counts, countssz); + return (ECKSUM); + } + } + + kmem_free(counts, countssz); + + return (0); +} + +/* + * Generate the permutation array for the draid_map_t. These maps control + * the placement of all data in a dRAID. Therefore it's critical that the + * seed always generates the same mapping. We provide our own pseudo-random + * number generator for this purpose. + */ +int +vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) +{ + VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN); + VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN); + VERIFY3U(map->dm_seed, !=, 0); + VERIFY3U(map->dm_nperms, !=, 0); + VERIFY3P(map->dm_perms, ==, NULL); + +#ifdef _KERNEL + /* + * The kernel code always provides both a map_seed and checksum. + * Only the tests/zfs-tests/cmd/draid/draid.c utility will provide + * a zero checksum when generating new candidate maps. + */ + VERIFY3U(map->dm_checksum, !=, 0); +#endif + uint64_t children = map->dm_children; + uint64_t nperms = map->dm_nperms; + int rowsz = sizeof (uint8_t) * children; + int permssz = rowsz * nperms; + uint8_t *perms; + + /* Allocate the permutation array */ + perms = vmem_alloc(permssz, KM_SLEEP); + + /* Set up an initial row with a known pattern */ + uint8_t *initial_row = kmem_alloc(rowsz, KM_SLEEP); + for (int i = 0; i < children; i++) + initial_row[i] = i; + + uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed }; + uint8_t *current_row, *previous_row = initial_row; + + /* + * Perform a Fisher-Yates shuffle of each row using the previous + * row as the starting point. An initial_row with known pattern + * is used as the input for the first row. + */ + for (int i = 0; i < nperms; i++) { + current_row = &perms[i * children]; + memcpy(current_row, previous_row, rowsz); + + for (int j = children - 1; j > 0; j--) { + uint64_t k = vdev_draid_rand(draid_seed) % (j + 1); + uint8_t val = current_row[j]; + current_row[j] = current_row[k]; + current_row[k] = val; + } + + previous_row = current_row; + } + + kmem_free(initial_row, rowsz); + + int error = verify_perms(perms, children, nperms, map->dm_checksum); + if (error) { + vmem_free(perms, permssz); + return (error); + } + + *permsp = perms; + + return (0); +} + +/* + * Lookup the fixed draid_map_t for the requested number of children. + */ +int +vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) +{ + for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) { + if (draid_maps[i].dm_children == children) { + *mapp = &draid_maps[i]; + return (0); + } + } + + return (ENOENT); +} + +/* + * Lookup the permutation array and iteration id for the provided offset.
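+ * + * For example, with children = 10 and nperms = 256, a pindex of 2563 + * wraps to poff = 2563 % 2560 = 3, which selects row 0 (poff / 10) of + * the permutation array with an iteration id of 3 (poff % 10).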
+ */ +static void +vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex, + uint8_t **base, uint64_t *iter) +{ + uint64_t ncols = vdc->vdc_children; + uint64_t poff = pindex % (vdc->vdc_nperms * ncols); + + *base = vdc->vdc_perms + (poff / ncols) * ncols; + *iter = poff % ncols; +} + +static inline uint64_t +vdev_draid_permute_id(vdev_draid_config_t *vdc, + uint8_t *base, uint64_t iter, uint64_t index) +{ + return ((base[index] + iter) % vdc->vdc_children); +} + +/* + * Return the asize, which is the psize rounded up to a full group width; + * i.e. this performs the psize to asize conversion. + */ +static uint64_t +vdev_draid_asize(vdev_t *vd, uint64_t psize) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + uint64_t ashift = vd->vdev_ashift; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + uint64_t rows = ((psize - 1) / (vdc->vdc_ndata << ashift)) + 1; + uint64_t asize = (rows * vdc->vdc_groupwidth) << ashift; + + ASSERT3U(asize, !=, 0); + ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0); + + return (asize); +} + +/* + * Deflate the asize to the psize; this includes stripping parity. + */ +uint64_t +vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT0(asize % vdc->vdc_groupwidth); + + return ((asize / vdc->vdc_groupwidth) * vdc->vdc_ndata); +} + +/* + * Convert a logical offset to the corresponding group number. + */ +static uint64_t +vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (offset / vdc->vdc_groupsz); +} + +/* + * Convert a group number to the logical starting offset for that group. + */ +static uint64_t +vdev_draid_group_to_offset(vdev_t *vd, uint64_t group) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (group * vdc->vdc_groupsz); +} + +static void +vdev_draid_map_free_vsd(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + ASSERT0(rm->rm_freed); + rm->rm_freed = B_TRUE; + + if (rm->rm_reports == 0) { + vdev_raidz_map_free(rm); + } +} + +/*ARGSUSED*/ +static void +vdev_draid_cksum_free(void *arg, size_t ignored) +{ + raidz_map_t *rm = arg; + + ASSERT3U(rm->rm_reports, >, 0); + + if (--rm->rm_reports == 0 && rm->rm_freed) + vdev_raidz_map_free(rm); +} + +static void +vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) +{ + raidz_map_t *rm = zcr->zcr_cbdata; + const size_t c = zcr->zcr_cbinfo; + uint64_t skip_size = zcr->zcr_sector; + uint64_t parity_size; + size_t x, offset, size; + + if (good_data == NULL) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); + return; + } + + /* + * Detailed cksum reporting is currently only supported for single + * row draid mappings; this covers the vast majority of zios. Only + * a dRAID zio which spans groups will have multiple rows. + */ + if (rm->rm_nrows != 1) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); + return; + } + + raidz_row_t *rr = rm->rm_row[0]; + const abd_t *good = NULL; + const abd_t *bad = rr->rr_col[c].rc_abd; + + if (c < rr->rr_firstdatacol) { + /* + * The first time through, calculate the parity blocks for + * the good data (this relies on the fact that the good + * data never changes for a given logical zio) + */ + if (rr->rr_col[0].rc_gdata == NULL) { + abd_t *bad_parity[VDEV_DRAID_MAXPARITY]; + + /* + * Set up the rr_col[]s to generate the parity for + * good_data, first saving the parity bufs and + * replacing them with buffers to hold the result.
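+ * The generated parity is cached in rc_gdata so that later reports + * against other columns of this row can reuse it.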
+ */ + for (x = 0; x < rr->rr_firstdatacol; x++) { + bad_parity[x] = rr->rr_col[x].rc_abd; + rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata = + abd_alloc_sametype(rr->rr_col[x].rc_abd, + rr->rr_col[x].rc_size); + } + + /* + * Fill in the data columns from good_data being + * careful to pad short columns and empty columns + * with a skip sector. + */ + uint64_t good_size = abd_get_size((abd_t *)good_data); + + offset = 0; + for (; x < rr->rr_cols; x++) { + abd_put(rr->rr_col[x].rc_abd); + + if (offset == good_size) { + /* empty data column (small write) */ + rr->rr_col[x].rc_abd = + abd_get_zeros(skip_size); + } else if (x < rr->rr_bigcols) { + /* this is a "big column" */ + size = rr->rr_col[x].rc_size; + rr->rr_col[x].rc_abd = + abd_get_offset_size( + (abd_t *)good_data, offset, size); + offset += size; + } else { + /* short data column, add skip sector */ + size = rr->rr_col[x].rc_size - skip_size; + rr->rr_col[x].rc_abd = abd_alloc( + rr->rr_col[x].rc_size, B_TRUE); + abd_copy_off(rr->rr_col[x].rc_abd, + (abd_t *)good_data, 0, offset, + size); + abd_zero_off(rr->rr_col[x].rc_abd, + size, skip_size); + offset += size; + } + } + + /* + * Construct the parity from the good data. + */ + vdev_raidz_generate_parity_row(rm, rr); + + /* restore everything back to its original state */ + for (x = 0; x < rr->rr_firstdatacol; x++) + rr->rr_col[x].rc_abd = bad_parity[x]; + + offset = 0; + for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) { + if (offset == good_size || x < rr->rr_bigcols) + abd_put(rr->rr_col[x].rc_abd); + else + abd_free(rr->rr_col[x].rc_abd); + + rr->rr_col[x].rc_abd = abd_get_offset_size( + rr->rr_abd_copy, offset, + rr->rr_col[x].rc_size); + offset += rr->rr_col[x].rc_size; + } + } + + ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL); + good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0, + rr->rr_col[c].rc_size); + } else { + /* adjust good_data to point at the start of our column */ + parity_size = size = rr->rr_col[0].rc_size; + if (c >= rr->rr_bigcols) { + size -= skip_size; + zcr->zcr_length = size; + } + + /* empty column */ + if (size == 0) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_TRUE); + return; + } + + offset = 0; + for (x = rr->rr_firstdatacol; x < c; x++) { + if (x < rr->rr_bigcols) { + offset += parity_size; + } else { + offset += parity_size - skip_size; + } + } + + good = abd_get_offset_size((abd_t *)good_data, offset, size); + } + + /* we drop the ereport if it ends up that the data was good */ + zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); + abd_put((abd_t *)good); +} + +/* + * Invoked indirectly by zfs_ereport_start_checksum(), called + * below when our read operation fails completely. The main point + * is to keep a copy of everything we read from disk, so that at + * vdev_draid_cksum_finish() time we can compare it with the good data. + */ +static void +vdev_draid_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) +{ + size_t c = (size_t)(uintptr_t)arg; + raidz_map_t *rm = zio->io_vsd; + + /* set up the report and bump the refcount */ + zcr->zcr_cbdata = rm; + zcr->zcr_cbinfo = c; + zcr->zcr_finish = vdev_draid_cksum_finish; + zcr->zcr_free = vdev_draid_cksum_free; + + rm->rm_reports++; + ASSERT3U(rm->rm_reports, >, 0); + + if (rm->rm_row[0]->rr_abd_copy != NULL) + return; + + /* + * It's the first time we're called for this raidz_map_t, so we need + * to copy the data aside; there's no guarantee that our zio's buffer + * won't be re-used for something else.
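+ * (Subsequent reports for this raidz_map_t return early above once + * rr_abd_copy has been allocated.)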
+ * + * Our parity data is already in separate buffers, so there's no need + * to copy them. Furthermore, all columns should have been expanded + * by vdev_draid_map_alloc_empty() when attempting reconstruction. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + size_t offset = 0; + size_t size = 0; + + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + ASSERT3U(rr->rr_col[c].rc_size, ==, + rr->rr_col[0].rc_size); + size += rr->rr_col[c].rc_size; + } + + rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE); + + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; + abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy, + offset, col->rc_size); + + abd_copy(tmp, col->rc_abd, col->rc_size); + + if (abd_is_gang(col->rc_abd)) + abd_free(col->rc_abd); + else + abd_put(col->rc_abd); + + col->rc_abd = tmp; + offset += col->rc_size; + } + ASSERT3U(offset, ==, size); + } +} + +const zio_vsd_ops_t vdev_draid_vsd_ops = { + .vsd_free = vdev_draid_map_free_vsd, + .vsd_cksum_report = vdev_draid_cksum_report +}; + +/* + * Full stripe writes. When writing, all columns (D+P) are required. Parity + * is calculated over all the columns, including empty zero filled sectors, + * and each is written to disk. While only the data columns are needed for + * a normal read, all of the columns are required for reconstruction when + * performing a sequential resilver. + * + * For "big columns" it's sufficient to map the correct range of the zio ABD. + * Partial columns require allocating a gang ABD in order to zero fill the + * empty sectors. When the column is empty a zero filled sector must be + * mapped. In all cases the data ABDs must be the same size as the parity + * ABDs (e.g. rc->rc_size == parity_size). + */ +static void +vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t abd_off = abd_offset; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3U(parity_size, ==, abd_get_size(rr->rr_col[0].rc_abd)); + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size == 0) { + /* empty data column (small write), add a skip sector */ + ASSERT3U(skip_size, ==, parity_size); + rc->rc_abd = abd_get_zeros(skip_size); + } else if (rc->rc_size == parity_size) { + /* this is a "big column" */ + rc->rc_abd = abd_get_offset_size(zio->io_abd, + abd_off, rc->rc_size); + } else { + /* short data column, add a skip sector */ + ASSERT3U(rc->rc_size + skip_size, ==, parity_size); + rc->rc_abd = abd_alloc_gang_abd(); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + zio->io_abd, abd_off, rc->rc_size), B_TRUE); + abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size), + B_TRUE); + } + + ASSERT3U(abd_get_size(rc->rc_abd), ==, parity_size); + + abd_off += rc->rc_size; + rc->rc_size = parity_size; + } + + IMPLY(abd_offset != 0, abd_off == zio->io_size); +} + +/* + * Scrub/resilver reads. In order to store the contents of the skip sectors + * an additional ABD is allocated. The columns are handled in the same way + * as a full stripe write except instead of using the zero ABD the newly + * allocated skip ABD is used to back the skip sectors. In all cases the + * data ABD must be the same size as the parity ABDs. 
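+ *
+ * For example (illustrative numbers only): a 24K block on a draid1 group
+ * with 4 data columns and ashift=12 maps to q=1 and r=2 sectors, so the
+ * parity column and the first two data columns hold two 4K sectors each,
+ * while the last two data columns each hold one 4K data sector plus one
+ * 4K skip sector (rr_nempty == 2).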
+ */
+static void
+vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+	uint64_t parity_size = rr->rr_col[0].rc_size;
+	uint64_t abd_off = abd_offset;
+	uint64_t skip_off = 0;
+
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+	ASSERT3P(rr->rr_abd_empty, ==, NULL);
+
+	if (rr->rr_nempty > 0) {
+		rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
+		    B_FALSE);
+	}
+
+	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+		raidz_col_t *rc = &rr->rr_col[c];
+
+		if (rc->rc_size == 0) {
+			/* empty data column (small read), add a skip sector */
+			ASSERT3U(skip_size, ==, parity_size);
+			ASSERT3U(rr->rr_nempty, !=, 0);
+			rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
+			    skip_off, skip_size);
+			skip_off += skip_size;
+		} else if (rc->rc_size == parity_size) {
+			/* this is a "big column" */
+			rc->rc_abd = abd_get_offset_size(zio->io_abd,
+			    abd_off, rc->rc_size);
+		} else {
+			/* short data column, add a skip sector */
+			ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
+			ASSERT3U(rr->rr_nempty, !=, 0);
+			rc->rc_abd = abd_alloc_gang_abd();
+			abd_gang_add(rc->rc_abd, abd_get_offset_size(
+			    zio->io_abd, abd_off, rc->rc_size), B_TRUE);
+			abd_gang_add(rc->rc_abd, abd_get_offset_size(
+			    rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
+			skip_off += skip_size;
+		}
+
+		uint64_t abd_size = abd_get_size(rc->rc_abd);
+		ASSERT3U(abd_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
+
+		/*
+		 * Increase rc_size so the skip ABD is included in subsequent
+		 * parity calculations.
+		 */
+		abd_off += rc->rc_size;
+		rc->rc_size = abd_size;
+	}
+
+	IMPLY(abd_offset != 0, abd_off == zio->io_size);
+	ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
+}
+
+/*
+ * Normal reads. In this common case only the columns containing data
+ * are read into the zio ABDs. Neither the parity columns nor the empty
+ * skip sectors are read unless the checksum fails verification, in which
+ * case vdev_raidz_read_all() will call vdev_draid_map_alloc_empty() to
+ * expand the raidz map and allow reconstruction using the parity data
+ * and skip sectors.
+ */
+static void
+vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+	uint64_t abd_off = abd_offset;
+
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+		raidz_col_t *rc = &rr->rr_col[c];
+
+		if (rc->rc_size > 0) {
+			rc->rc_abd = abd_get_offset_size(zio->io_abd,
+			    abd_off, rc->rc_size);
+			abd_off += rc->rc_size;
+		}
+	}
+
+	IMPLY(abd_offset != 0, abd_off == zio->io_size);
+}
+
+/*
+ * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t. The key
+ * difference is that an ABD is allocated to back the skip sectors so they
+ * may be read into memory, verified, and repaired if needed.
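+ * This expansion is performed by vdev_draid_io_start_read() when data or
+ * parity columns are found to be missing and reconstruction may be needed.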
+ */ +void +vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t skip_off = 0; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + ASSERT3P(rr->rr_abd_empty, ==, NULL); + + if (rr->rr_nempty > 0) { + rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size, + B_FALSE); + } + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size == 0) { + /* empty data column (small read), add a skip sector */ + ASSERT3U(skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + ASSERT3P(rc->rc_abd, ==, NULL); + rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty, + skip_off, skip_size); + skip_off += skip_size; + } else if (rc->rc_size == parity_size) { + /* this is a "big column", nothing to add */ + ASSERT3P(rc->rc_abd, !=, NULL); + } else { + /* short data column, add a skip sector */ + ASSERT3U(rc->rc_size + skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + ASSERT3P(rc->rc_abd, !=, NULL); + ASSERT(!abd_is_gang(rc->rc_abd)); + abd_t *read_abd = rc->rc_abd; + rc->rc_abd = abd_alloc_gang_abd(); + abd_gang_add(rc->rc_abd, read_abd, B_TRUE); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + rr->rr_abd_empty, skip_off, skip_size), B_TRUE); + skip_off += skip_size; + } + + /* + * Increase rc_size so the empty ABD is included in subsequent + * parity calculations. + */ + rc->rc_size = parity_size; + } + + ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size); +} + +/* + * Given a logical address within a dRAID configuration, return the physical + * address on the first drive in the group that this address maps to + * (at position 'start' in permutation number 'perm'). + */ +static uint64_t +vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset, + uint64_t *perm, uint64_t *start) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + /* b is the dRAID (parent) sector offset. */ + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t b_offset = logical_offset >> ashift; + + /* + * The height of a row in units of the vdev's minimum sector size. + * This is the amount of data written to each disk of each group + * in a given permutation. + */ + uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift; + + /* + * We cycle through a disk permutation every groupsz * ngroups chunk + * of address space. Note that ngroups * groupsz must be a multiple + * of the number of data drives (ndisks) in order to guarantee + * alignment. So, for example, if our row height is 16MB, our group + * size is 10, and there are 13 data drives in the draid, then ngroups + * will be 13, we will change permutation every 2.08GB and each + * disk will have 160MB of data per chunk. + */ + uint64_t groupwidth = vdc->vdc_groupwidth; + uint64_t ngroups = vdc->vdc_ngroups; + uint64_t ndisks = vdc->vdc_ndisks; + + /* + * groupstart is where the group this IO will land in "starts" in + * the permutation array. 
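+	 * For example (illustrative): with groupwidth = 5 and ndisks = 13,
+	 * group 3 begins at permutation slot (3 * 5) % 13 = 2.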
+ */ + uint64_t group = logical_offset / vdc->vdc_groupsz; + uint64_t groupstart = (group * groupwidth) % ndisks; + ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart); + *start = groupstart; + + /* b_offset is the sector offset within a group chunk */ + b_offset = b_offset % (rowheight_sectors * groupwidth); + ASSERT0(b_offset % groupwidth); + + /* + * Find the starting byte offset on each child vdev: + * - within a permutation there are ngroups groups spread over the + * rows, where each row covers a slice portion of the disk + * - each permutation has (groupwidth * ngroups) / ndisks rows + * - so each permutation covers rows * slice portion of the disk + * - so we need to find the row where this IO group target begins + */ + *perm = group / ngroups; + uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) + + (((group % ngroups) * groupwidth) / ndisks); + + return (((rowheight_sectors * row) + + (b_offset / groupwidth)) << ashift); +} + +static uint64_t +vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, + uint64_t abd_offset, uint64_t abd_size) +{ + vdev_t *vd = zio->io_vd; + vdev_draid_config_t *vdc = vd->vdev_tsd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t io_size = abd_size; + uint64_t io_asize = vdev_draid_asize(vd, io_size); + uint64_t group = vdev_draid_offset_to_group(vd, io_offset); + uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); + + /* + * Limit the io_size to the space remaining in the group. A second + * row in the raidz_map_t is created for the remainder. + */ + if (io_offset + io_asize > start_offset) { + io_size = vdev_draid_asize_to_psize(vd, + start_offset - io_offset); + } + + /* + * At most a block may span the logical end of one group and the start + * of the next group. Therefore, at the end of a group the io_size must + * span the group width evenly and the remainder must be aligned to the + * start of the next group. + */ + IMPLY(abd_offset == 0 && io_size < zio->io_size, + (io_asize >> ashift) % vdc->vdc_groupwidth == 0); + IMPLY(abd_offset != 0, + vdev_draid_group_to_offset(vd, group) == io_offset); + + /* Lookup starting byte offset on each child vdev */ + uint64_t groupstart, perm; + uint64_t physical_offset = vdev_draid_logical_to_physical(vd, + io_offset, &perm, &groupstart); + + /* + * If there is less than groupwidth drives available after the group + * start, the group is going to wrap onto the next row. 'wrap' is the + * group disk number that starts on the next row. + */ + uint64_t ndisks = vdc->vdc_ndisks; + uint64_t groupwidth = vdc->vdc_groupwidth; + uint64_t wrap = groupwidth; + + if (groupstart + groupwidth > ndisks) + wrap = ndisks - groupstart; + + /* The io size in units of the vdev's minimum sector size. */ + const uint64_t psize = io_size >> ashift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ + uint64_t q = psize / vdc->vdc_ndata; + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + uint64_t r = psize - q * vdc->vdc_ndata; + + /* The number of "big columns" - those which contain remainder data. */ + uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity); + ASSERT3U(bc, <, groupwidth); + + /* The total number of data and parity sectors for this I/O. */ + uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 
0 : 1)));
+
+	raidz_row_t *rr;
+	rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP);
+	rr->rr_cols = groupwidth;
+	rr->rr_scols = groupwidth;
+	rr->rr_bigcols = bc;
+	rr->rr_missingdata = 0;
+	rr->rr_missingparity = 0;
+	rr->rr_firstdatacol = vdc->vdc_nparity;
+	rr->rr_abd_copy = NULL;
+	rr->rr_abd_empty = NULL;
+#ifdef ZFS_DEBUG
+	rr->rr_offset = io_offset;
+	rr->rr_size = io_size;
+#endif
+	*rrp = rr;
+
+	uint8_t *base;
+	uint64_t iter, asize = 0;
+	vdev_draid_get_perm(vdc, perm, &base, &iter);
+	for (uint64_t i = 0; i < groupwidth; i++) {
+		raidz_col_t *rc = &rr->rr_col[i];
+		uint64_t c = (groupstart + i) % ndisks;
+
+		/* increment the offset if we wrap to the next row */
+		if (i == wrap)
+			physical_offset += VDEV_DRAID_ROWHEIGHT;
+
+		rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c);
+		rc->rc_offset = physical_offset;
+		rc->rc_abd = NULL;
+		rc->rc_gdata = NULL;
+		rc->rc_orig_data = NULL;
+		rc->rc_error = 0;
+		rc->rc_tried = 0;
+		rc->rc_skipped = 0;
+		rc->rc_repair = 0;
+		rc->rc_need_orig_restore = B_FALSE;
+
+		if (q == 0 && i >= bc)
+			rc->rc_size = 0;
+		else if (i < bc)
+			rc->rc_size = (q + 1) << ashift;
+		else
+			rc->rc_size = q << ashift;
+
+		asize += rc->rc_size;
+	}
+
+	ASSERT3U(asize, ==, tot << ashift);
+	rr->rr_nempty = roundup(tot, groupwidth) - tot;
+	IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc);
+
+	/* Allocate buffers for the parity columns */
+	for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) {
+		raidz_col_t *rc = &rr->rr_col[c];
+		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
+	}
+
+	/*
+	 * Map buffers for data columns and allocate/map buffers for skip
+	 * sectors. There are three distinct cases for dRAID which are
+	 * required to support sequential rebuild.
+	 */
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		vdev_draid_map_alloc_write(zio, abd_offset, rr);
+	} else if ((rr->rr_nempty > 0) &&
+	    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+		vdev_draid_map_alloc_scrub(zio, abd_offset, rr);
+	} else {
+		ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+		vdev_draid_map_alloc_read(zio, abd_offset, rr);
+	}
+
+	return (io_size);
+}
+
+/*
+ * Allocate the raidz mapping to be applied to the dRAID I/O. The parity
+ * calculations for dRAID are identical to raidz; however, there are a few
+ * differences in the layout.
+ *
+ * - dRAID always allocates a full stripe width. Any extra sectors due
+ *   to this padding are zero filled and written to disk. They will be read
+ *   back during a scrub or repair operation since they are included in
+ *   the parity calculation. This property enables sequential resilvering.
+ *
+ * - When the block at the logical offset spans redundancy groups then two
+ *   rows are allocated in the raidz_map_t. One row resides at the end of
+ *   the first group and the other at the start of the following group.
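+ *   A block can cross at most one group boundary, so two rows are always
+ *   sufficient.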
+ */
+static raidz_map_t *
+vdev_draid_map_alloc(zio_t *zio)
+{
+	raidz_row_t *rr[2];
+	uint64_t abd_offset = 0;
+	uint64_t abd_size = zio->io_size;
+	uint64_t io_offset = zio->io_offset;
+	uint64_t size;
+	int nrows = 1;
+
+	size = vdev_draid_map_alloc_row(zio, &rr[0], io_offset,
+	    abd_offset, abd_size);
+	if (size < abd_size) {
+		vdev_t *vd = zio->io_vd;
+
+		io_offset += vdev_draid_asize(vd, size);
+		abd_offset += size;
+		abd_size -= size;
+		nrows++;
+
+		ASSERT3U(io_offset, ==, vdev_draid_group_to_offset(
+		    vd, vdev_draid_offset_to_group(vd, io_offset)));
+		ASSERT3U(abd_offset, <, zio->io_size);
+		ASSERT3U(abd_size, !=, 0);
+
+		size = vdev_draid_map_alloc_row(zio, &rr[1],
+		    io_offset, abd_offset, abd_size);
+		VERIFY3U(size, ==, abd_size);
+	}
+
+	raidz_map_t *rm;
+	rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]), KM_SLEEP);
+	rm->rm_ops = vdev_raidz_math_get_ops();
+	rm->rm_nrows = nrows;
+	rm->rm_row[0] = rr[0];
+	if (nrows == 2)
+		rm->rm_row[1] = rr[1];
+
+	zio->io_vsd = rm;
+	zio->io_vsd_ops = &vdev_draid_vsd_ops;
+
+	return (rm);
+}
+
+/*
+ * Given an offset into a dRAID, return the next group width aligned offset
+ * which can be used to start an allocation.
+ */
+static uint64_t
+vdev_draid_get_astart(vdev_t *vd, const uint64_t start)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	return (roundup(start, vdc->vdc_groupwidth << vd->vdev_ashift));
+}
+
+/*
+ * Allocatable space for dRAID is (children - nspares) * sizeof(smallest child)
+ * rounded down to the last full slice. So each child must provide at least
+ * 1 / (children - nspares) of its asize.
+ */
+static uint64_t
+vdev_draid_min_asize(vdev_t *vd)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	return ((vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks));
+}
+
+/*
+ * When using dRAID the minimum allocation size is determined by the number
+ * of data disks in the redundancy group. Full stripes are always used.
+ */
+static uint64_t
+vdev_draid_min_alloc(vdev_t *vd)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	return (vdc->vdc_ndata << vd->vdev_ashift);
+}
+
+/*
+ * Returns true if the txg range does not exist on any leaf vdev.
+ *
+ * A dRAID spare does not fit into the DTL model. While it has child vdevs
+ * there is no redundancy among them, and the effective child vdev is
+ * determined by offset. Essentially we do a vdev_dtl_reassess() on the
+ * fly by replacing a dRAID spare with the child vdev under the offset.
+ * Note that it is a recursive process because the child vdev can be
+ * another dRAID spare and so on.
+ */
+boolean_t
+vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
+    uint64_t size)
+{
+	if (vd->vdev_ops == &vdev_spare_ops ||
+	    vd->vdev_ops == &vdev_replacing_ops) {
+		/*
+		 * Check all of the readable children: if any child
+		 * contains the txg range, the data is not missing.
+		 */
+		for (int c = 0; c < vd->vdev_children; c++) {
+			vdev_t *cvd = vd->vdev_child[c];
+
+			if (!vdev_readable(cvd))
+				continue;
+
+			if (!vdev_draid_missing(cvd, physical_offset,
+			    txg, size))
+				return (B_FALSE);
+		}
+
+		return (B_TRUE);
+	}
+
+	if (vd->vdev_ops == &vdev_draid_spare_ops) {
+		/*
+		 * When sequentially resilvering we don't have a proper
+		 * txg range so instead we must presume all txgs are
+		 * missing on this vdev until the resilver completes.
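+		 * A non-zero vdev_rebuild_txg indicates that a sequential
+		 * rebuild is actively running on this vdev.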
+ */
+		if (vd->vdev_rebuild_txg != 0)
+			return (B_TRUE);
+
+		/*
+		 * DTL_MISSING is set for all prior txgs when a resilver
+		 * is started in spa_vdev_attach().
+		 */
+		if (vdev_dtl_contains(vd, DTL_MISSING, txg, size))
+			return (B_TRUE);
+
+		/*
+		 * Consult the DTL on the relevant vdev. Either a vdev
+		 * leaf or spare/replace mirror child may be returned so
+		 * we must recursively call vdev_draid_missing().
+		 */
+		vd = vdev_draid_spare_get_child(vd, physical_offset);
+		if (vd == NULL)
+			return (B_TRUE);
+
+		return (vdev_draid_missing(vd, physical_offset,
+		    txg, size));
+	}
+
+	return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+}
+
+/*
+ * Returns true if the txg is only partially replicated on the leaf vdevs.
+ */
+static boolean_t
+vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
+    uint64_t size)
+{
+	if (vd->vdev_ops == &vdev_spare_ops ||
+	    vd->vdev_ops == &vdev_replacing_ops) {
+		/*
+		 * Check all of the readable children: if any child is
+		 * missing the txg range, then it is partially replicated.
+		 */
+		for (int c = 0; c < vd->vdev_children; c++) {
+			vdev_t *cvd = vd->vdev_child[c];
+
+			if (!vdev_readable(cvd))
+				continue;
+
+			if (vdev_draid_partial(cvd, physical_offset, txg, size))
+				return (B_TRUE);
+		}
+
+		return (B_FALSE);
+	}
+
+	if (vd->vdev_ops == &vdev_draid_spare_ops) {
+		/*
+		 * When sequentially resilvering we don't have a proper
+		 * txg range so instead we must presume all txgs are
+		 * missing on this vdev until the resilver completes.
+		 */
+		if (vd->vdev_rebuild_txg != 0)
+			return (B_TRUE);
+
+		/*
+		 * DTL_MISSING is set for all prior txgs when a resilver
+		 * is started in spa_vdev_attach().
+		 */
+		if (vdev_dtl_contains(vd, DTL_MISSING, txg, size))
+			return (B_TRUE);
+
+		/*
+		 * Consult the DTL on the relevant vdev. Either a vdev
+		 * leaf or spare/replace mirror child may be returned so
+		 * we must recursively call vdev_draid_partial().
+		 */
+		vd = vdev_draid_spare_get_child(vd, physical_offset);
+		if (vd == NULL)
+			return (B_TRUE);
+
+		return (vdev_draid_partial(vd, physical_offset, txg, size));
+	}
+
+	return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+}
+
+/*
+ * Determine if the vdev is readable at the given offset.
+ */
+boolean_t
+vdev_draid_readable(vdev_t *vd, uint64_t physical_offset)
+{
+	if (vd->vdev_ops == &vdev_draid_spare_ops) {
+		vd = vdev_draid_spare_get_child(vd, physical_offset);
+		if (vd == NULL)
+			return (B_FALSE);
+	}
+
+	if (vd->vdev_ops == &vdev_spare_ops ||
+	    vd->vdev_ops == &vdev_replacing_ops) {
+
+		for (int c = 0; c < vd->vdev_children; c++) {
+			vdev_t *cvd = vd->vdev_child[c];
+
+			if (!vdev_readable(cvd))
+				continue;
+
+			if (vdev_draid_readable(cvd, physical_offset))
+				return (B_TRUE);
+		}
+
+		return (B_FALSE);
+	}
+
+	return (vdev_readable(vd));
+}
+
+/*
+ * Returns the first distributed spare found under the provided vdev tree.
+ */
+static vdev_t *
+vdev_draid_find_spare(vdev_t *vd)
+{
+	if (vd->vdev_ops == &vdev_draid_spare_ops)
+		return (vd);
+
+	for (int c = 0; c < vd->vdev_children; c++) {
+		vdev_t *svd = vdev_draid_find_spare(vd->vdev_child[c]);
+		if (svd != NULL)
+			return (svd);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Returns B_TRUE if the passed-in vdev is currently "faulted".
+ * Faulted, in this context, means that the vdev represents a
+ * replacing or sparing vdev tree.
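+ * For example, an offset which maps to a distributed spare is resolved
+ * to the underlying leaf vdev, and that leaf is treated as faulted when
+ * its parent is a replacing or sparing vdev.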
+ */
+static boolean_t
+vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset)
+{
+	if (vd->vdev_ops == &vdev_draid_spare_ops) {
+		vd = vdev_draid_spare_get_child(vd, physical_offset);
+		if (vd == NULL)
+			return (B_FALSE);
+
+		/*
+		 * After resolving the distributed spare to a leaf vdev
+		 * check the parent to determine if it's "faulted".
+		 */
+		vd = vd->vdev_parent;
+	}
+
+	return (vd->vdev_ops == &vdev_replacing_ops ||
+	    vd->vdev_ops == &vdev_spare_ops);
+}
+
+/*
+ * Determine if the dRAID block at the logical offset is degraded.
+ * Used by sequential resilver.
+ */
+static boolean_t
+vdev_draid_group_degraded(vdev_t *vd, uint64_t offset)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+	ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
+
+	uint64_t groupstart, perm;
+	uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+	    offset, &perm, &groupstart);
+
+	uint8_t *base;
+	uint64_t iter;
+	vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+		uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
+		uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c);
+		vdev_t *cvd = vd->vdev_child[cid];
+
+		/* Group contains a faulted vdev. */
+		if (vdev_draid_faulted(cvd, physical_offset))
+			return (B_TRUE);
+
+		/*
+		 * Always check groups with active distributed spares
+		 * because any vdev failure in the pool will affect them.
+		 */
+		if (vdev_draid_find_spare(cvd) != NULL)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Determine if the txg is missing. Used by healing resilver.
+ */
+static boolean_t
+vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg,
+    uint64_t size)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+	ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
+
+	uint64_t groupstart, perm;
+	uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+	    offset, &perm, &groupstart);
+
+	uint8_t *base;
+	uint64_t iter;
+	vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+		uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
+		uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c);
+		vdev_t *cvd = vd->vdev_child[cid];
+
+		/* Transaction group is known to be partially replicated. */
+		if (vdev_draid_partial(cvd, physical_offset, txg, size))
+			return (B_TRUE);
+
+		/*
+		 * Always check groups with active distributed spares
+		 * because any vdev failure in the pool will affect them.
+		 */
+		if (vdev_draid_find_spare(cvd) != NULL)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Find the smallest child asize and largest sector size to calculate the
+ * available capacity. Distributed spares are ignored since their capacity
+ * is also based on the minimum child size in the top-level dRAID.
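+ *
+ * Note: the MIN(x - 1, y - 1) + 1 construction below lets the initial
+ * value of zero wrap to UINT64_MAX so the first child examined always
+ * establishes the starting minimum.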
+ */ +static void +vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, + uint64_t *logical_ashiftp, uint64_t *physical_ashiftp) +{ + uint64_t logical_ashift = 0, physical_ashift = 0; + uint64_t asize = 0, max_asize = 0; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + continue; + + asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; + max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; + logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); + physical_ashift = MAX(physical_ashift, + cvd->vdev_physical_ashift); + } + + *asizep = asize; + *max_asizep = max_asize; + *logical_ashiftp = logical_ashift; + *physical_ashiftp = physical_ashift; +} + +/* + * Open spare vdevs. + */ +static boolean_t +vdev_draid_open_spares(vdev_t *vd) +{ + return (vd->vdev_ops == &vdev_draid_spare_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); +} + +/* + * Open all children, excluding spares. + */ +static boolean_t +vdev_draid_open_children(vdev_t *vd) +{ + return (!vdev_draid_open_spares(vd)); +} + +/* + * Open a top-level dRAID vdev. + */ +static int +vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + uint64_t nparity = vdc->vdc_nparity; + int open_errors = 0; + + if (nparity > VDEV_DRAID_MAXPARITY || + vd->vdev_children < nparity + 1) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * First open the normal children then the distributed spares. This + * ordering is important to ensure the distributed spares calculate + * the correct psize in the event that the dRAID vdevs were expanded. + */ + vdev_open_children_subset(vd, vdev_draid_open_children); + vdev_open_children_subset(vd, vdev_draid_open_spares); + + /* Verify enough of the children are available to continue. */ + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c]->vdev_open_error != 0) { + if ((++open_errors) > nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (SET_ERROR(ENXIO)); + } + } + } + + /* + * Allocatable capacity is the sum of the space on all children less + * the number of distributed spares rounded down to last full row + * and then to the last full group. An additional 32MB of scratch + * space is reserved at the end of each child for use by the dRAID + * expansion feature. + */ + uint64_t child_asize, child_max_asize; + vdev_draid_calculate_asize(vd, &child_asize, &child_max_asize, + logical_ashift, physical_ashift); + + /* + * Should be unreachable since the minimum child size is 64MB, but + * we want to make sure an underflow absolutely cannot occur here. + */ + if (child_asize < VDEV_DRAID_REFLOW_RESERVE || + child_max_asize < VDEV_DRAID_REFLOW_RESERVE) { + return (SET_ERROR(ENXIO)); + } + + child_asize = ((child_asize - VDEV_DRAID_REFLOW_RESERVE) / + VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT; + child_max_asize = ((child_max_asize - VDEV_DRAID_REFLOW_RESERVE) / + VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT; + + *asize = (((child_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) * + vdc->vdc_groupsz); + *max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) * + vdc->vdc_groupsz); + + return (0); +} + +/* + * Close a top-level dRAID vdev. 
+ */
+static void
+vdev_draid_close(vdev_t *vd)
+{
+	for (int c = 0; c < vd->vdev_children; c++) {
+		if (vd->vdev_child[c] != NULL)
+			vdev_close(vd->vdev_child[c]);
+	}
+}
+
+/*
+ * Return the maximum asize for a rebuild zio in the provided range
+ * given the following constraints. A dRAID chunk may not:
+ *
+ * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or
+ * - Span dRAID redundancy groups.
+ */
+static uint64_t
+vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
+    uint64_t max_segment)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	uint64_t ashift = vd->vdev_ashift;
+	uint64_t ndata = vdc->vdc_ndata;
+	uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift),
+	    SPA_MAXBLOCKSIZE);
+
+	ASSERT3U(vdev_draid_get_astart(vd, start), ==, start);
+	ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0);
+
+	/* Chunks must evenly span all data columns in the group. */
+	psize = (((psize >> ashift) / ndata) * ndata) << ashift;
+	uint64_t chunk_size = MIN(asize, vdev_psize_to_asize(vd, psize));
+
+	/* Reduce the chunk size to the group space remaining. */
+	uint64_t group = vdev_draid_offset_to_group(vd, start);
+	uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start;
+	chunk_size = MIN(chunk_size, left);
+
+	ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0);
+	ASSERT3U(vdev_draid_offset_to_group(vd, start), ==,
+	    vdev_draid_offset_to_group(vd, start + chunk_size - 1));
+
+	return (chunk_size);
+}
+
+/*
+ * Align the start of the metaslab to the group width and slightly reduce
+ * its size to a multiple of the group width. Since full stripe writes are
+ * required by dRAID this space is unallocable. Furthermore, aligning the
+ * metaslab start is important for vdev initialize and TRIM which both operate
+ * on metaslab boundaries which vdev_xlate() expects to be aligned.
+ */
+static void
+vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	uint64_t sz = vdc->vdc_groupwidth << vd->vdev_ashift;
+	uint64_t astart = vdev_draid_get_astart(vd, *ms_start);
+	uint64_t asize = ((*ms_size - (astart - *ms_start)) / sz) * sz;
+
+	*ms_start = astart;
+	*ms_size = asize;
+
+	ASSERT0(*ms_start % sz);
+	ASSERT0(*ms_size % sz);
+}
+
+/*
+ * Add virtual dRAID spares to the list of valid spares. In order to
+ * accomplish this, the existing array must be freed and reallocated with
+ * the additional entries.
+ */
+int
+vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp,
+    uint64_t next_vdev_id)
+{
+	uint64_t draid_nspares = 0;
+	uint64_t ndraid = 0;
+	int error;
+
+	for (uint64_t i = 0; i < vd->vdev_children; i++) {
+		vdev_t *cvd = vd->vdev_child[i];
+
+		if (cvd->vdev_ops == &vdev_draid_ops) {
+			vdev_draid_config_t *vdc = cvd->vdev_tsd;
+			draid_nspares += vdc->vdc_nspares;
+			ndraid++;
+		}
+	}
+
+	if (draid_nspares == 0) {
+		*ndraidp = ndraid;
+		return (0);
+	}
+
+	nvlist_t **old_spares, **new_spares;
+	uint_t old_nspares;
+	error = nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+	    &old_spares, &old_nspares);
+	if (error)
+		old_nspares = 0;
+
+	/* Allocate memory and copy the existing spares. */
+	new_spares = kmem_alloc(sizeof (nvlist_t *) *
+	    (draid_nspares + old_nspares), KM_SLEEP);
+	for (uint_t i = 0; i < old_nspares; i++)
+		new_spares[i] = fnvlist_dup(old_spares[i]);
+
+	/* Add new distributed spares to ZPOOL_CONFIG_SPARES.
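Each spare is named draid<parity>-<top-level vdev>-<spare id>, e.g. draid1-0-0.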
*/ + uint64_t n = old_nspares; + for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) { + vdev_t *cvd = vd->vdev_child[vdev_id]; + char path[64]; + + if (cvd->vdev_ops != &vdev_draid_ops) + continue; + + vdev_draid_config_t *vdc = cvd->vdev_tsd; + uint64_t nspares = vdc->vdc_nspares; + uint64_t nparity = vdc->vdc_nparity; + + for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) { + bzero(path, sizeof (path)); + (void) snprintf(path, sizeof (path) - 1, + "%s%llu-%llu-%llu", VDEV_TYPE_DRAID, + (u_longlong_t)nparity, + (u_longlong_t)next_vdev_id + vdev_id, + (u_longlong_t)spare_id); + + nvlist_t *spare = fnvlist_alloc(); + fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, path); + fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_DRAID_SPARE); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID, + cvd->vdev_guid); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID, + spare_id); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_LOG, 0); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_WHOLE_DISK, 1); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT, + cvd->vdev_ashift); + + new_spares[n] = spare; + n++; + } + } + + if (n > 0) { + (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + new_spares, n); + } + + for (int i = 0; i < n; i++) + nvlist_free(new_spares[i]); + + kmem_free(new_spares, sizeof (*new_spares) * n); + *ndraidp = ndraid; + + return (0); +} + +/* + * Determine if any portion of the provided block resides on a child vdev + * with a dirty DTL and therefore needs to be resilvered. + */ +static boolean_t +vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t asize = vdev_draid_asize(vd, psize); + + if (phys_birth == TXG_UNKNOWN) { + /* + * Sequential resilver. There is no meaningful phys_birth + * for this block, we can only determine if block resides + * in a degraded group in which case it must be resilvered. + */ + ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==, + vdev_draid_offset_to_group(vd, offset + asize - 1)); + + return (vdev_draid_group_degraded(vd, offset)); + } else { + /* + * Healing resilver. TXGs not in DTL_PARTIAL are intact, + * as are blocks in non-degraded groups. + */ + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + if (vdev_draid_group_missing(vd, offset, phys_birth, 1)) + return (B_TRUE); + + /* The block may span groups in which case check both. 
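The second check is done at offset + asize, which falls in the following group.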
*/ + if (vdev_draid_offset_to_group(vd, offset) != + vdev_draid_offset_to_group(vd, offset + asize - 1)) { + if (vdev_draid_group_missing(vd, + offset + asize, phys_birth, 1)) + return (B_TRUE); + } + + return (B_FALSE); + } +} + +static boolean_t +vdev_draid_rebuilding(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg) + return (B_TRUE); + + for (int i = 0; i < vd->vdev_children; i++) { + if (vdev_draid_rebuilding(vd->vdev_child[i])) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static void +vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) +{ +#ifdef ZFS_DEBUG + range_seg64_t logical_rs, physical_rs, remain_rs; + logical_rs.rs_start = rr->rr_offset; + logical_rs.rs_end = logical_rs.rs_start + + vdev_draid_asize(vd, rr->rr_size); + + raidz_col_t *rc = &rr->rr_col[col]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); + ASSERT(vdev_xlate_is_empty(&remain_rs)); + ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); + ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); + ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end); +#endif +} + +/* + * For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. A gang ABD is allocated by vdev_draid_map_alloc() + * if a skip sector needs to be added to a column. + */ +static void +vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + raidz_map_t *rm = zio->io_vsd; + + vdev_raidz_generate_parity_row(rm, rr); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + /* + * Empty columns are zero filled and included in the parity + * calculation and therefore must be written. + */ + ASSERT3U(rc->rc_size, !=, 0); + + /* Verify physical to logical translation */ + vdev_draid_io_verify(vd, rr, c); + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], rc->rc_offset, + rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, + 0, vdev_raidz_child_done, rc)); + } +} + +/* + * For read operations: + * 1. The vdev_draid_map_alloc() function will create a minimal raidz + * mapping for the read based on the zio->io_flags. There are two + * possible mappings either 1) a normal read, or 2) a scrub/resilver. + * 2. Create the zio read operations. This will include all parity + * columns and skip sectors for a scrub/resilver. + */ +static void +vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + + /* Sequential rebuild must do IO at redundancy group boundary. */ + IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0); + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last. Any errors along the way will force us to read the parity. + * For scrub/resilver IOs which verify skip sectors, a gang ABD will + * have been allocated to store them and rc->rc_size is increased. 
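+ * The parity columns are read as well for scrub/resilver IOs so that
+ * their contents can be verified and repaired.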
+ */
+	for (int c = rr->rr_cols - 1; c >= 0; c--) {
+		raidz_col_t *rc = &rr->rr_col[c];
+		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+		if (!vdev_draid_readable(cvd, rc->rc_offset)) {
+			if (c >= rr->rr_firstdatacol)
+				rr->rr_missingdata++;
+			else
+				rr->rr_missingparity++;
+			rc->rc_error = SET_ERROR(ENXIO);
+			rc->rc_tried = 1;
+			rc->rc_skipped = 1;
+			continue;
+		}
+
+		if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) {
+			if (c >= rr->rr_firstdatacol)
+				rr->rr_missingdata++;
+			else
+				rr->rr_missingparity++;
+			rc->rc_error = SET_ERROR(ESTALE);
+			rc->rc_skipped = 1;
+			continue;
+		}
+
+		/*
+		 * Empty columns may be read during vdev_draid_io_done().
+		 * Only skip them after the readable and missing checks
+		 * verify they are available.
+		 */
+		if (rc->rc_size == 0) {
+			rc->rc_skipped = 1;
+			continue;
+		}
+
+		if (zio->io_flags & ZIO_FLAG_RESILVER) {
+			vdev_t *svd;
+
+			/*
+			 * If this child is a distributed spare then the
+			 * offset might reside on the vdev being replaced,
+			 * in which case this data must be written to the
+			 * new device. Failure to do so would result in
+			 * checksum errors when the old device is detached
+			 * and the pool is scrubbed.
+			 */
+			if ((svd = vdev_draid_find_spare(cvd)) != NULL) {
+				svd = vdev_draid_spare_get_child(svd,
+				    rc->rc_offset);
+				if (svd && (svd->vdev_ops == &vdev_spare_ops ||
+				    svd->vdev_ops == &vdev_replacing_ops)) {
+					rc->rc_repair = 1;
+				}
+			}
+
+			/*
+			 * Always issue a repair IO to this child when it's
+			 * a spare or replacing vdev with an active rebuild.
+			 */
+			if ((cvd->vdev_ops == &vdev_spare_ops ||
+			    cvd->vdev_ops == &vdev_replacing_ops) &&
+			    vdev_draid_rebuilding(cvd)) {
+				rc->rc_repair = 1;
+			}
+		}
+	}
+
+	/*
+	 * If either a parity or data column is missing, a repair may be
+	 * attempted by vdev_draid_io_done(). Expand the raidz map to read
+	 * in empty columns which are needed along with the parity during
+	 * reconstruction.
+	 */
+	if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) &&
+	    rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) {
+		vdev_draid_map_alloc_empty(zio, rr);
+	}
+
+	for (int c = rr->rr_cols - 1; c >= 0; c--) {
+		raidz_col_t *rc = &rr->rr_col[c];
+		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+		if (rc->rc_error || rc->rc_size == 0)
+			continue;
+
+		if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
+		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+			    rc->rc_offset, rc->rc_abd, rc->rc_size,
+			    zio->io_type, zio->io_priority, 0,
+			    vdev_raidz_child_done, rc));
+		}
+	}
+}
+
+/*
+ * Start an IO operation to a dRAID vdev.
+ */
+static void
+vdev_draid_io_start(zio_t *zio)
+{
+	vdev_t *vd __maybe_unused = zio->io_vd;
+	raidz_map_t *rm;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+	ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset));
+
+	rm = vdev_draid_map_alloc(zio);
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		for (int i = 0; i < rm->rm_nrows; i++) {
+			vdev_draid_io_start_write(zio, rm->rm_row[i]);
+		}
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+		for (int i = 0; i < rm->rm_nrows; i++) {
+			vdev_draid_io_start_read(zio, rm->rm_row[i]);
+		}
+	}
+
+	zio_execute(zio);
+}
+
+/*
+ * Complete an IO operation on a dRAID vdev. The raidz logic can be applied
+ * to dRAID since the layout is fully described by the raidz_map_t.
+ */ +static void +vdev_draid_io_done(zio_t *zio) +{ + vdev_raidz_io_done(zio); +} + +static void +vdev_draid_state_change(vdev_t *vd, int faulted, int degraded) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + ASSERT(vd->vdev_ops == &vdev_draid_ops); + + if (faulted > vdc->vdc_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +static void +vdev_draid_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) +{ + vdev_t *raidvd = cvd->vdev_parent; + ASSERT(raidvd->vdev_ops == &vdev_draid_ops); + + vdev_draid_config_t *vdc = raidvd->vdev_tsd; + uint64_t ashift = raidvd->vdev_top->vdev_ashift; + + /* Make sure the offsets are block-aligned */ + ASSERT0(logical_rs->rs_start % (1 << ashift)); + ASSERT0(logical_rs->rs_end % (1 << ashift)); + + uint64_t logical_start = logical_rs->rs_start; + uint64_t logical_end = logical_rs->rs_end; + + /* + * Unaligned ranges must be skipped. All metaslabs are correctly + * aligned so this should not happen, but this case is handled in + * case it's needed by future callers. + */ + uint64_t astart = vdev_draid_get_astart(raidvd, logical_start); + if (astart != logical_start) { + physical_rs->rs_start = logical_start; + physical_rs->rs_end = logical_start; + remain_rs->rs_start = MIN(astart, logical_end); + remain_rs->rs_end = logical_end; + return; + } + + /* + * Unlike with mirrors and raidz a dRAID logical range can map + * to multiple non-contiguous physical ranges. This is handled by + * limiting the size of the logical range to a single group and + * setting the remain argument such that it describes the remaining + * unmapped logical range. This is stricter than absolutely + * necessary but helps simplify the logic below. + */ + uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start); + uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1); + if (logical_end > nextstart) + logical_end = nextstart; + + /* Find the starting offset for each vdev in the group */ + uint64_t perm, groupstart; + uint64_t start = vdev_draid_logical_to_physical(raidvd, + logical_start, &perm, &groupstart); + uint64_t end = start; + + uint8_t *base; + uint64_t iter, id; + vdev_draid_get_perm(vdc, perm, &base, &iter); + + /* + * Check if the passed child falls within the group. If it does + * update the start and end to reflect the physical range. + * Otherwise, leave them unmodified which will result in an empty + * (zero-length) physical range being returned. + */ + for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { + uint64_t c = (groupstart + i) % vdc->vdc_ndisks; + + if (c == 0 && i != 0) { + /* the group wrapped, increment the start */ + start += VDEV_DRAID_ROWHEIGHT; + end = start; + } + + id = vdev_draid_permute_id(vdc, base, iter, c); + if (id == cvd->vdev_id) { + uint64_t b_size = (logical_end >> ashift) - + (logical_start >> ashift); + ASSERT3U(b_size, >, 0); + end = start + ((((b_size - 1) / + vdc->vdc_groupwidth) + 1) << ashift); + break; + } + } + physical_rs->rs_start = start; + physical_rs->rs_end = end; + + /* + * Only top-level vdevs are allowed to set remain_rs because + * when .vdev_op_xlate() is called for their children the full + * logical range is not provided by vdev_xlate(). 
+ */ + remain_rs->rs_start = logical_end; + remain_rs->rs_end = logical_rs->rs_end; + + ASSERT3U(physical_rs->rs_start, <=, logical_start); + ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, + logical_end - logical_start); +} + +/* + * Add dRAID specific fields to the config nvlist. + */ +static void +vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + vdev_draid_config_t *vdc = vd->vdev_tsd; + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups); +} + +/* + * Initialize private dRAID specific fields from the nvlist. + */ +static int +vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + uint64_t ndata, nparity, nspares, ngroups; + int error; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata)) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) || + nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { + return (SET_ERROR(EINVAL)); + } + + uint_t children; + nvlist_t **child; + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0 || children == 0 || + children > VDEV_DRAID_MAX_CHILDREN) { + return (SET_ERROR(EINVAL)); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) || + nspares > 100 || nspares > (children - (ndata + nparity))) { + return (SET_ERROR(EINVAL)); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) || + ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) { + return (SET_ERROR(EINVAL)); + } + + /* + * Validate the minimum number of children exist per group for the + * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4). + */ + if (children < (ndata + nparity + nspares)) + return (SET_ERROR(EINVAL)); + + /* + * Create the dRAID configuration using the pool nvlist configuration + * and the fixed mapping for the correct number of children. + */ + vdev_draid_config_t *vdc; + const draid_map_t *map; + + error = vdev_draid_lookup_map(children, &map); + if (error) + return (SET_ERROR(EINVAL)); + + vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP); + vdc->vdc_ndata = ndata; + vdc->vdc_nparity = nparity; + vdc->vdc_nspares = nspares; + vdc->vdc_children = children; + vdc->vdc_ngroups = ngroups; + vdc->vdc_nperms = map->dm_nperms; + + error = vdev_draid_generate_perms(map, &vdc->vdc_perms); + if (error) { + kmem_free(vdc, sizeof (*vdc)); + return (SET_ERROR(EINVAL)); + } + + /* + * Derived constants. 
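+	 * For example (illustrative): a dRAID with 4 data, 1 parity, 1 spare
+	 * and 13 children has groupwidth = 5 and ndisks = 12; with a 16M row
+	 * height groupsz = 80M, and with ngroups = 12 the devslicesz works
+	 * out to (80M * 12) / 12 = 80M per child.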
+ */
+	vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity;
+	vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares;
+	vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT;
+	vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) /
+	    vdc->vdc_ndisks;
+
+	ASSERT3U(vdc->vdc_groupwidth, >=, 2);
+	ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks);
+	ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT);
+	ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT);
+	ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0);
+	ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) %
+	    vdc->vdc_ndisks, ==, 0);
+
+	*tsd = vdc;
+
+	return (0);
+}
+
+static void
+vdev_draid_fini(vdev_t *vd)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	vmem_free(vdc->vdc_perms, sizeof (uint8_t) *
+	    vdc->vdc_children * vdc->vdc_nperms);
+	kmem_free(vdc, sizeof (*vdc));
+}
+
+static uint64_t
+vdev_draid_nparity(vdev_t *vd)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	return (vdc->vdc_nparity);
+}
+
+static uint64_t
+vdev_draid_ndisks(vdev_t *vd)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	return (vdc->vdc_ndisks);
+}
+
+vdev_ops_t vdev_draid_ops = {
+	.vdev_op_init = vdev_draid_init,
+	.vdev_op_fini = vdev_draid_fini,
+	.vdev_op_open = vdev_draid_open,
+	.vdev_op_close = vdev_draid_close,
+	.vdev_op_asize = vdev_draid_asize,
+	.vdev_op_min_asize = vdev_draid_min_asize,
+	.vdev_op_min_alloc = vdev_draid_min_alloc,
+	.vdev_op_io_start = vdev_draid_io_start,
+	.vdev_op_io_done = vdev_draid_io_done,
+	.vdev_op_state_change = vdev_draid_state_change,
+	.vdev_op_need_resilver = vdev_draid_need_resilver,
+	.vdev_op_hold = NULL,
+	.vdev_op_rele = NULL,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_draid_xlate,
+	.vdev_op_rebuild_asize = vdev_draid_rebuild_asize,
+	.vdev_op_metaslab_init = vdev_draid_metaslab_init,
+	.vdev_op_config_generate = vdev_draid_config_generate,
+	.vdev_op_nparity = vdev_draid_nparity,
+	.vdev_op_ndisks = vdev_draid_ndisks,
+	.vdev_op_type = VDEV_TYPE_DRAID,
+	.vdev_op_leaf = B_FALSE,
+};
+
+
+/*
+ * A dRAID distributed spare is a virtual leaf vdev which is included in the
+ * parent dRAID configuration. The last N columns of the dRAID permutation
+ * table are used to determine on which dRAID children a specific offset
+ * should be written. These spare leaf vdevs can only be used to replace
+ * faulted children in the same dRAID configuration.
+ */
+
+/*
+ * Distributed spare state. All fields are set when the distributed spare is
+ * first opened and are immutable.
+ */
+typedef struct {
+	vdev_t *vds_draid_vdev;		/* top-level parent dRAID vdev */
+	uint64_t vds_top_guid;		/* top-level parent dRAID guid */
+	uint64_t vds_spare_id;		/* spare id (0 - vdc->vdc_nspares-1) */
+} vdev_draid_spare_t;
+
+/*
+ * Returns the parent dRAID vdev to which the distributed spare belongs.
+ * This may be safely called even when the vdev is not open.
+ */
+vdev_t *
+vdev_draid_spare_get_parent(vdev_t *vd)
+{
+	vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+	if (vds->vds_draid_vdev != NULL)
+		return (vds->vds_draid_vdev);
+
+	return (vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev,
+	    vds->vds_top_guid));
+}
+
+/*
+ * A dRAID spare is active when it's the child of a vdev using the
+ * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops.
+ */
+static boolean_t
+vdev_draid_spare_is_active(vdev_t *vd)
+{
+	vdev_t *pvd = vd->vdev_parent;
+
+	if (pvd != NULL && (pvd->vdev_ops == &vdev_spare_ops ||
+	    pvd->vdev_ops == &vdev_replacing_ops ||
+	    pvd->vdev_ops == &vdev_draid_ops)) {
+		return (B_TRUE);
+	} else {
+		return (B_FALSE);
+	}
+}
+
+/*
+ * Given a dRAID distributed spare vdev, returns the physical child vdev
+ * on which the provided offset resides. This may involve recursing through
+ * multiple layers of distributed spares. Note that offset is relative to
+ * this vdev.
+ */
+vdev_t *
+vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset)
+{
+	vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+	/* The vdev is closed */
+	if (vds->vds_draid_vdev == NULL)
+		return (NULL);
+
+	vdev_t *tvd = vds->vds_draid_vdev;
+	vdev_draid_config_t *vdc = tvd->vdev_tsd;
+
+	ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops);
+	ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares);
+
+	uint8_t *base;
+	uint64_t iter;
+	uint64_t perm = physical_offset / vdc->vdc_devslicesz;
+
+	vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+	uint64_t cid = vdev_draid_permute_id(vdc, base, iter,
+	    (tvd->vdev_children - 1) - vds->vds_spare_id);
+	vdev_t *cvd = tvd->vdev_child[cid];
+
+	if (cvd->vdev_ops == &vdev_draid_spare_ops)
+		return (vdev_draid_spare_get_child(cvd, physical_offset));
+
+	return (cvd);
+}
+
+/* ARGSUSED */
+static void
+vdev_draid_spare_close(vdev_t *vd)
+{
+	vdev_draid_spare_t *vds = vd->vdev_tsd;
+	vds->vds_draid_vdev = NULL;
+}
+
+/*
+ * Opening a dRAID spare device is done by looking up the associated dRAID
+ * top-level vdev guid from the spare configuration.
+ */
+static int
+vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+	vdev_draid_spare_t *vds = vd->vdev_tsd;
+	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+	uint64_t asize, max_asize;
+
+	vdev_t *tvd = vdev_lookup_by_guid(rvd, vds->vds_top_guid);
+	if (tvd == NULL) {
+		/*
+		 * When spa_vdev_add() is labeling new spares the
+		 * associated dRAID is not attached to the root vdev
+		 * nor does this spare have a parent. Simulate a valid
+		 * device in order to allow the label to be initialized
+		 * and the distributed spare added to the configuration.
+		 */
+		if (vd->vdev_parent == NULL) {
+			*psize = *max_psize = SPA_MINDEVSIZE;
+			*logical_ashift = *physical_ashift = ASHIFT_MIN;
+			return (0);
+		}
+
+		return (SET_ERROR(EINVAL));
+	}
+
+	vdev_draid_config_t *vdc = tvd->vdev_tsd;
+	if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL)
+		return (SET_ERROR(EINVAL));
+
+	if (vds->vds_spare_id >= vdc->vdc_nspares)
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * Neither tvd->vdev_asize nor tvd->vdev_max_asize can be used here
+	 * because the caller may be vdev_draid_open() in which case the
+	 * values are stale as they haven't yet been updated by vdev_open().
+	 * To avoid this always recalculate the dRAID asize and max_asize.
+	 */
+	vdev_draid_calculate_asize(tvd, &asize, &max_asize,
+	    logical_ashift, physical_ashift);
+
+	*psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+	*max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+
+	vds->vds_draid_vdev = tvd;
+
+	return (0);
+}
+
+/*
+ * Completed distributed spare IO. Store the result in the parent zio
+ * as if it had performed the operation itself. Only the first error is
+ * preserved if there are multiple errors.
+ */ +static void +vdev_draid_spare_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + /* + * IOs are issued to non-writable vdevs in order to keep their + * DTLs accurate. However, we don't want to propagate the + * error in to the distributed spare's DTL. When resilvering + * vdev_draid_need_resilver() will consult the relevant DTL + * to determine if the data is missing and must be repaired. + */ + if (!vdev_writeable(zio->io_vd)) + return; + + if (pio->io_error == 0) + pio->io_error = zio->io_error; +} + +/* + * Returns a valid label nvlist for the distributed spare vdev. This is + * used to bypass the IO pipeline to avoid the complexity of constructing + * a complete label with valid checksum to return when read. + */ +nvlist_t * +vdev_draid_read_config_spare(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + spa_aux_vdev_t *sav = &spa->spa_spares; + uint64_t guid = vd->vdev_guid; + + nvlist_t *nv = fnvlist_alloc(); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa)); + fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE, + vdev_draid_spare_is_active(vd) ? + POOL_STATE_ACTIVE : POOL_STATE_SPARE); + + /* Set the vdev guid based on the vdev list in sav_count. */ + for (int i = 0; i < sav->sav_count; i++) { + if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops && + strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) { + guid = sav->sav_vdevs[i]->vdev_guid; + break; + } + } + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid); + + return (nv); +} + +/* + * Handle any ioctl requested of the distributed spare. Only flushes + * are supported in which case all children must be flushed. + */ +static int +vdev_draid_spare_ioctl(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + int error = 0; + + if (zio->io_cmd == DKIOCFLUSHWRITECACHE) { + for (int c = 0; c < vd->vdev_children; c++) { + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[c], zio->io_offset, zio->io_abd, + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } else { + error = SET_ERROR(ENOTSUP); + } + + return (error); +} + +/* + * Initiate an IO to the distributed spare. For normal IOs this entails using + * the zio->io_offset and permutation table to calculate which child dRAID vdev + * is responsible for the data. Then passing along the zio to that child to + * perform the actual IO. The label ranges are not stored on disk and require + * some special handling which is described below. + */ +static void +vdev_draid_spare_io_start(zio_t *zio) +{ + vdev_t *cvd = NULL, *vd = zio->io_vd; + vdev_draid_spare_t *vds = vd->vdev_tsd; + uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (vds == NULL) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + zio->io_error = vdev_draid_spare_ioctl(zio); + break; + + case ZIO_TYPE_WRITE: + if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) { + /* + * Accept probe IOs and config writers to simulate the + * existence of an on disk label. 
vdev_label_sync(), + * vdev_uberblock_sync() and vdev_copy_uberblocks() + * skip the distributed spares. This only leaves + * vdev_label_init(), which is allowed to succeed to + * avoid adding special cases to the function. + */ + if (zio->io_flags & ZIO_FLAG_PROBE || + zio->io_flags & ZIO_FLAG_CONFIG_WRITER) { + zio->io_error = 0; + } else { + zio->io_error = SET_ERROR(EIO); + } + } else { + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } + break; + + case ZIO_TYPE_READ: + if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) { + /* + * Accept probe IOs to simulate the existence of a + * label. vdev_label_read_config() bypasses the + * pipeline to read the label configuration and + * vdev_uberblock_load() skips distributed spares + * when attempting to locate the best uberblock. + */ + if (zio->io_flags & ZIO_FLAG_PROBE) { + zio->io_error = 0; + } else { + zio->io_error = SET_ERROR(EIO); + } + } else { + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL || !vdev_readable(cvd)) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } + break; + + case ZIO_TYPE_TRIM: + /* The vdev label ranges are never trimmed */ + ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)); + + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL || !cvd->vdev_has_trim) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + break; + + default: + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + + zio_execute(zio); +} + +/* ARGSUSED */ +static void +vdev_draid_spare_io_done(zio_t *zio) +{ +} + +/* + * Look up the full spare config in spa->spa_spares.sav_config and + * return the top_guid and spare_id for the named spare.
+ */ +static int +vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp, + uint64_t *spare_idp) +{ + nvlist_t **spares; + uint_t nspares; + int error; + + if ((spa->spa_spares.sav_config == NULL) || + (nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)) { + return (SET_ERROR(ENOENT)); + } + + char *spare_name; + error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name); + if (error != 0) + return (SET_ERROR(EINVAL)); + + for (int i = 0; i < nspares; i++) { + nvlist_t *spare = spares[i]; + uint64_t top_guid, spare_id; + char *type, *path; + + /* Skip non-distributed spares */ + error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type); + if (error != 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0) + continue; + + /* Skip spares with the wrong name */ + error = nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH, &path); + if (error != 0 || strcmp(path, spare_name) != 0) + continue; + + /* Found the matching spare */ + error = nvlist_lookup_uint64(spare, + ZPOOL_CONFIG_TOP_GUID, &top_guid); + if (error == 0) { + error = nvlist_lookup_uint64(spare, + ZPOOL_CONFIG_SPARE_ID, &spare_id); + } + + if (error != 0) { + return (SET_ERROR(EINVAL)); + } else { + *top_guidp = top_guid; + *spare_idp = spare_id; + return (0); + } + } + + return (SET_ERROR(ENOENT)); +} + +/* + * Initialize the dRAID spare's private fields from the nvlist. + */ +static int +vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + vdev_draid_spare_t *vds; + uint64_t top_guid = 0; + uint64_t spare_id; + + /* + * In the normal case, check the list of spares stored in the spa + * to look up the top_guid and spare_id for the provided spare config. + * When creating a new pool or adding vdevs, the spare list is not + * yet populated and the values are provided in the passed config.
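+ * In the latter case the provided nvlist itself must carry the
+ * ZPOOL_CONFIG_TOP_GUID and ZPOOL_CONFIG_SPARE_ID values read below.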
+ */ + if (vdev_draid_spare_lookup(spa, nv, &top_guid, &spare_id) != 0) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID, + &top_guid) != 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID, + &spare_id) != 0) + return (SET_ERROR(EINVAL)); + } + + vds = kmem_alloc(sizeof (vdev_draid_spare_t), KM_SLEEP); + vds->vds_draid_vdev = NULL; + vds->vds_top_guid = top_guid; + vds->vds_spare_id = spare_id; + + *tsd = vds; + + return (0); +} + +static void +vdev_draid_spare_fini(vdev_t *vd) +{ + kmem_free(vd->vdev_tsd, sizeof (vdev_draid_spare_t)); +} + +static void +vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vds->vds_top_guid); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, vds->vds_spare_id); +} + +vdev_ops_t vdev_draid_spare_ops = { + .vdev_op_init = vdev_draid_spare_init, + .vdev_op_fini = vdev_draid_spare_fini, + .vdev_op_open = vdev_draid_spare_open, + .vdev_op_close = vdev_draid_spare_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_draid_spare_io_start, + .vdev_op_io_done = vdev_draid_spare_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = vdev_draid_spare_config_generate, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DRAID_SPARE, + .vdev_op_leaf = B_TRUE, +}; diff --git a/module/zfs/vdev_draid_rand.c b/module/zfs/vdev_draid_rand.c new file mode 100644 index 000000000..fe1a75c11 --- /dev/null +++ b/module/zfs/vdev_draid_rand.c @@ -0,0 +1,40 @@ +/* + * Xorshift Pseudo Random Number Generator based on work by David Blackman + * and Sebastiano Vigna (vigna@acm.org). + * + * "Further scramblings of Marsaglia's xorshift generators" + * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf + * http://prng.di.unimi.it/xoroshiro128plusplus.c + * + * To the extent possible under law, the author has dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide. This software is distributed without any warranty. + * + * See <http://creativecommons.org/publicdomain/zero/1.0/>. + * + * This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid, + * small-state generators. It is extremely (sub-ns) fast and it passes all + * tests we are aware of, but its state space is large enough only for + * mild parallelism.
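+ *
+ * dRAID uses it to derive its device permutation tables, a use case
+ * where speed and reproducibility from a fixed seed matter far more
+ * than cryptographic strength.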
+ */ + +#include <sys/vdev_draid.h> + +static inline uint64_t rotl(const uint64_t x, int k) +{ + return (x << k) | (x >> (64 - k)); +} + +uint64_t +vdev_draid_rand(uint64_t *s) +{ + const uint64_t s0 = s[0]; + uint64_t s1 = s[1]; + const uint64_t result = rotl(s0 + s1, 17) + s0; + + s1 ^= s0; + s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b + s[1] = rotl(s1, 28); // c + + return (result); +} diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 12ee393bd..009394bfe 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1844,9 +1844,13 @@ vdev_indirect_io_done(zio_t *zio) } vdev_ops_t vdev_indirect_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_indirect_open, .vdev_op_close = vdev_indirect_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_indirect_io_start, .vdev_op_io_done = vdev_indirect_io_done, .vdev_op_state_change = NULL, @@ -1855,6 +1859,11 @@ vdev_ops_t vdev_indirect_ops = { .vdev_op_rele = NULL, .vdev_op_remap = vdev_indirect_remap, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* leaf vdev */ }; diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 7ff7fffcc..083ad2861 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -121,6 +121,8 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { vd->vdev_initialize_action_time = gethrestime_sec(); } + + vdev_initializing_state_t old_state = vd->vdev_initialize_state; vd->vdev_initialize_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); @@ -138,8 +140,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) "vdev=%s suspended", vd->vdev_path); break; case VDEV_INITIALIZE_CANCELED: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s canceled", vd->vdev_path); + if (old_state == VDEV_INITIALIZE_ACTIVE || + old_state == VDEV_INITIALIZE_SUSPENDED) + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s canceled", vd->vdev_path); break; case VDEV_INITIALIZE_COMPLETE: spa_history_log_internal(spa, "initialize", tx, @@ -317,6 +321,32 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data) return (0); } +static void +vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) +{ + uint64_t *last_rs_end = (uint64_t *)arg; + + if (physical_rs->rs_end > *last_rs_end) + *last_rs_end = physical_rs->rs_end; +} + +static void +vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs) +{ + vdev_t *vd = (vdev_t *)arg; + + uint64_t size = physical_rs->rs_end - physical_rs->rs_start; + vd->vdev_initialize_bytes_est += size; + + if (vd->vdev_initialize_last_offset > physical_rs->rs_end) { + vd->vdev_initialize_bytes_done += size; + } else if (vd->vdev_initialize_last_offset > physical_rs->rs_start && + vd->vdev_initialize_last_offset < physical_rs->rs_end) { + vd->vdev_initialize_bytes_done += + vd->vdev_initialize_last_offset - physical_rs->rs_start; + } +} + static void vdev_initialize_calculate_progress(vdev_t *vd) { @@ -331,28 +361,35 @@ vdev_initialize_calculate_progress(vdev_t *vd) metaslab_t *msp = vd->vdev_top->vdev_ms[i]; mutex_enter(&msp->ms_lock); -
uint64_t ms_free = msp->ms_size - - metaslab_allocated_space(msp); - - if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) - ms_free /= vd->vdev_top->vdev_children; + uint64_t ms_free = (msp->ms_size - + metaslab_allocated_space(msp)) / + vdev_get_ndisks(vd->vdev_top); /* * Convert the metaslab range to a physical range * on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ - range_seg64_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; - vdev_xlate(vd, &logical_rs, &physical_rs); + /* Metaslab space after this offset has not been initialized */ + vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs); if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { + } + + /* Metaslab space before this offset has been initialized */ + uint64_t last_rs_end = physical_rs.rs_end; + if (!vdev_xlate_is_empty(&remain_rs)) { + vdev_xlate_walk(vd, &remain_rs, + vdev_initialize_xlate_last_rs_end, &last_rs_end); + } + + if (vd->vdev_initialize_last_offset > last_rs_end) { vd->vdev_initialize_bytes_done += ms_free; vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); @@ -374,22 +411,9 @@ vdev_initialize_calculate_progress(vdev_t *vd) &where)) { logical_rs.rs_start = rs_get_start(rs, rt); logical_rs.rs_end = rs_get_end(rs, rt); - vdev_xlate(vd, &logical_rs, &physical_rs); - uint64_t size = physical_rs.rs_end - - physical_rs.rs_start; - vd->vdev_initialize_bytes_est += size; - if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += size; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_start && - vd->vdev_initialize_last_offset < - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += - vd->vdev_initialize_last_offset - - physical_rs.rs_start; - } + vdev_xlate_walk(vd, &logical_rs, + vdev_initialize_xlate_progress, vd); } mutex_exit(&msp->ms_lock); } @@ -419,6 +443,34 @@ vdev_initialize_load(vdev_t *vd) return (err); } +static void +vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs) +{ + vdev_t *vd = arg; + + /* Only add segments that we have not visited yet */ + if (physical_rs->rs_end <= vd->vdev_initialize_last_offset) + return; + + /* Pick up where we left off mid-range. */ + if (vd->vdev_initialize_last_offset > physical_rs->rs_start) { + zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " + "(%llu, %llu)", vd->vdev_path, + (u_longlong_t)physical_rs->rs_start, + (u_longlong_t)physical_rs->rs_end, + (u_longlong_t)vd->vdev_initialize_last_offset, + (u_longlong_t)physical_rs->rs_end); + ASSERT3U(physical_rs->rs_end, >, + vd->vdev_initialize_last_offset); + physical_rs->rs_start = vd->vdev_initialize_last_offset; + } + + ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start); + + range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start, + physical_rs->rs_end - physical_rs->rs_start); +} + /* * Convert the logical range into a physical range and add it to our * avl tree. 
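All three vdev_initialize_xlate_*() callbacks above follow the same shape: a logical range on a raidz or dRAID top-level vdev may translate to more than one physical range on a leaf, so vdev_xlate_walk() calls back once per physical segment instead of returning a single range. A minimal sketch of the idiom (illustrative only; xlate_sum_bytes is a hypothetical helper, not part of this patch):

static void
xlate_sum_cb(void *arg, range_seg64_t *physical_rs)
{
	uint64_t *bytes = arg;

	/* Invoked once for each physical segment of the logical range. */
	*bytes += physical_rs->rs_end - physical_rs->rs_start;
}

static uint64_t
xlate_sum_bytes(vdev_t *vd, uint64_t start, uint64_t size)
{
	range_seg64_t logical_rs;
	uint64_t bytes = 0;

	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;
	vdev_xlate_walk(vd, &logical_rs, xlate_sum_cb, &bytes);
	return (bytes);
}

vdev_initialize_xlate_progress() is exactly this pattern with the accumulator replaced by the vdev's progress counters.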
@@ -427,47 +479,12 @@ static void vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; - range_seg64_t logical_rs, physical_rs; + range_seg64_t logical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; ASSERT(vd->vdev_ops->vdev_op_leaf); - vdev_xlate(vd, &logical_rs, &physical_rs); - - IMPLY(vd->vdev_top == vd, - logical_rs.rs_start == physical_rs.rs_start); - IMPLY(vd->vdev_top == vd, - logical_rs.rs_end == physical_rs.rs_end); - - /* Only add segments that we have not visited yet */ - if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) - return; - - /* Pick up where we left off mid-range. */ - if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { - zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " - "(%llu, %llu)", vd->vdev_path, - (u_longlong_t)physical_rs.rs_start, - (u_longlong_t)physical_rs.rs_end, - (u_longlong_t)vd->vdev_initialize_last_offset, - (u_longlong_t)physical_rs.rs_end); - ASSERT3U(physical_rs.rs_end, >, - vd->vdev_initialize_last_offset); - physical_rs.rs_start = vd->vdev_initialize_last_offset; - } - ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); - - /* - * With raidz, it's possible that the logical range does not live on - * this leaf vdev. We only add the physical range to this vdev's if it - * has a length greater than 0. - */ - if (physical_rs.rs_end > physical_rs.rs_start) { - range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, - physical_rs.rs_end - physical_rs.rs_start); - } else { - ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); - } + vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg); } static void diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index d063b77ea..fbd117d2d 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -142,6 +142,7 @@ #include #include #include +#include <sys/vdev_draid.h> #include #include #include @@ -453,31 +454,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); - if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); + if (vd->vdev_ops->vdev_op_config_generate != NULL) + vd->vdev_ops->vdev_op_config_generate(vd, nv); - /* - * Make sure someone hasn't managed to sneak a fancy new vdev - * into a crufty old storage pool. - */ - ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity <= 2 && - spa_version(spa) >= SPA_VERSION_RAIDZ2) || - (vd->vdev_nparity <= 3 && - spa_version(spa) >= SPA_VERSION_RAIDZ3)); - - /* - * Note that we'll add the nparity tag even on storage pools - * that only support a single parity device -- older software - * will just ignore it. - */ - fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); - } - - if (vd->vdev_wholedisk != -1ULL) + if (vd->vdev_wholedisk != -1ULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); + } if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); @@ -785,6 +768,14 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); + /* + * The label for a dRAID distributed spare is not stored on disk. + * Instead it is generated when needed, which allows us to bypass + * the pipeline when reading the config from the label.
+ */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (vdev_draid_read_config_spare(vd)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp = abd_to_buf(vp_abd); @@ -1497,7 +1488,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && + vd->vdev_ops != &vdev_draid_spare_ops) { for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, @@ -1586,6 +1578,13 @@ vdev_copy_uberblocks(vdev_t *vd) SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); + /* + * No uberblocks are stored on distributed spares; they may be + * safely skipped when expanding a leaf vdev. + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER); ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); @@ -1647,6 +1646,15 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, if (!vdev_writeable(vd)) return; + /* + * There's no need to write uberblocks to a distributed spare; they + * are already stored on all the leaves of the parent dRAID. For + * this same reason, vdev_uberblock_load_impl() skips distributed + * spares when reading uberblocks. + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + /* If the vdev was expanded, need to copy uberblock rings. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && vd->vdev_copy_uberblocks == B_TRUE) { @@ -1763,6 +1771,14 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes, if (!vdev_writeable(vd)) return; + /* + * The top-level config never needs to be written to a distributed + * spare. When read, vdev_draid_read_config_spare() will generate + * the config returned by vdev_label_read_config(). + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + /* * Generate a label describing the top-level config to which we belong. */ diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 71b5adbbd..71ca43cae 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -33,6 +33,7 @@ #include #include #include +#include <sys/vdev_draid.h> #include #include #include @@ -99,7 +100,6 @@ vdev_mirror_stat_fini(void) /* * Virtual device vector for mirroring. */ - typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; @@ -108,6 +108,7 @@ typedef struct mirror_child { uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; + uint8_t mc_rebuilding; } mirror_child_t; typedef struct mirror_map { @@ -115,6 +116,7 @@ typedef struct mirror_map { int mm_preferred_cnt; int mm_children; boolean_t mm_resilvering; + boolean_t mm_rebuilding; boolean_t mm_root; mirror_child_t mm_child[]; } mirror_map_t; @@ -239,6 +241,21 @@ vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) return (load + zfs_vdev_mirror_rotating_seek_inc); } +static boolean_t +vdev_mirror_rebuilding(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg) + return (B_TRUE); + + for (int i = 0; i < vd->vdev_children; i++) { + if (vdev_mirror_rebuilding(vd->vdev_child[i])) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * Avoid inlining the function to keep vdev_mirror_io_start(), which * is this function's only caller, as small as possible on the stack. */
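The mc_rebuilding and mm_rebuilding flags initialized in vdev_mirror_map_init() below exist to drive the repair-write gating later in vdev_mirror_io_start(). Restated as a standalone predicate (a sketch for clarity, not code from this patch):

static boolean_t
vdev_mirror_skip_repair_write(const zio_t *zio, const mirror_map_t *mm,
    const mirror_child_t *mc)
{
	/*
	 * During a sequential rebuild, repair writes (but not scrub
	 * writes) are issued only to the child being rebuilt.
	 */
	return (zio->io_priority == ZIO_PRIORITY_REBUILD &&
	    (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
	    !(zio->io_flags & ZIO_FLAG_SCRUB) &&
	    mm->mm_rebuilding && !mc->mc_rebuilding);
}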
@@ -356,6 +373,9 @@ vdev_mirror_map_init(zio_t *zio) mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; + + if (vdev_mirror_rebuilding(mc->mc_vd)) + mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE; } } @@ -493,12 +513,37 @@ vdev_mirror_preferred_child_randomize(zio_t *zio) return (mm->mm_preferred[p]); } +static boolean_t +vdev_mirror_child_readable(mirror_child_t *mc) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_readable(vd, mc->mc_offset)); + else + return (vdev_readable(vd)); +} + +static boolean_t +vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_missing(vd, mc->mc_offset, txg, size)); + else + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + /* * Try to find a vdev whose DTL doesn't contain the block we want to read - * preferring vdevs based on determined load. + * preferring vdevs based on determined load. If we can't, try the read on + * any vdev we haven't already tried. * - * Try to find a child whose DTL doesn't contain the block we want to read. - * If we can't, try the read on any vdev we haven't already tried. + * Distributed spares are an exception to the above load rule. They are + * always preferred in order to detect gaps in the distributed spare which + * are created when another disk in the dRAID fails. In order to restore + * redundancy those gaps must be read to trigger the required repair IO. */ static int vdev_mirror_child_select(zio_t *zio) @@ -518,20 +563,27 @@ vdev_mirror_child_select(zio_t *zio) if (mc->mc_tried || mc->mc_skipped) continue; - if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { + if (mc->mc_vd == NULL || + !vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + if (vdev_mirror_child_missing(mc, txg, 1)) { mc->mc_error = SET_ERROR(ESTALE); mc->mc_skipped = 1; mc->mc_speculative = 1; continue; } + if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) { + mm->mm_preferred[0] = c; + mm->mm_preferred_cnt = 1; + break; + } + mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); if (mc->mc_load > lowest_load) continue; @@ -625,11 +677,25 @@ vdev_mirror_io_start(zio_t *zio) while (children--) { mc = &mm->mm_child[c]; + c++; + + /* + * When sequentially resilvering only issue write repair + * IOs to the vdev which is being rebuilt since performance + * is limited by the slowest child. This is an issue for + * faster replacement devices such as distributed spares. + */ + if ((zio->io_priority == ZIO_PRIORITY_REBUILD) && + (zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !(zio->io_flags & ZIO_FLAG_SCRUB) && + mm->mm_rebuilding && !mc->mc_rebuilding) { + continue; + } + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); - c++; } zio_execute(zio); @@ -744,6 +810,8 @@ vdev_mirror_io_done(zio_t *zio) mc = &mm->mm_child[c]; if (mc->mc_error == 0) { + vdev_ops_t *ops = mc->mc_vd->vdev_ops; + if (mc->mc_tried) continue; /* @@ -752,15 +820,16 @@ vdev_mirror_io_done(zio_t *zio) * 1. it's a scrub (in which case we have * tried everything that was healthy) * - or - - * 2. 
it's an indirect vdev (in which case - * it could point to any other vdev, which - * might have a bad DTL) + * 2. it's an indirect or distributed spare + * vdev (in which case it could point to any + * other vdev, which might have a bad DTL) * - or - * 3. the DTL indicates that this data is * missing from this vdev */ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - mc->mc_vd->vdev_ops != &vdev_indirect_ops && + ops != &vdev_indirect_ops && + ops != &vdev_draid_spare_ops && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; @@ -796,50 +865,90 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) } } +/* + * Return the maximum asize for a rebuild zio in the provided range. + */ +static uint64_t +vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, + uint64_t max_segment) +{ + uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift), + SPA_MAXBLOCKSIZE); + + return (MIN(asize, vdev_psize_to_asize(vd, psize))); +} + vdev_ops_t vdev_mirror_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, - .vdev_op_need_resilver = NULL, + .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_replacing_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, - .vdev_op_need_resilver = NULL, + .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_spare_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, - .vdev_op_need_resilver = NULL, + .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */ 
.vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index ce90df6e8..e9145fd01 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -81,9 +81,13 @@ vdev_missing_io_done(zio_t *zio) } vdev_ops_t vdev_missing_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_missing_open, .vdev_op_close = vdev_missing_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, .vdev_op_state_change = NULL, @@ -92,14 +96,23 @@ vdev_ops_t vdev_missing_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; vdev_ops_t vdev_hole_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_missing_open, .vdev_op_close = vdev_missing_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, .vdev_op_state_change = NULL, @@ -108,6 +121,11 @@ vdev_ops_t vdev_hole_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index a8ef3d747..45d92819d 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -593,6 +593,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) return (NULL); + /* + * I/Os to distributed spares are directly dispatched to the dRAID + * leaf vdevs for aggregation. See the comment at the end of the + * zio_vdev_io_start() function. 
+ */ + ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops); + first = last = zio; if (zio->io_type == ZIO_TYPE_READ) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 47312e02f..989b90dc2 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -134,25 +135,51 @@ VDEV_RAIDZ_64MUL_2((x), mask); \ } -void -vdev_raidz_map_free(raidz_map_t *rm) +static void +vdev_raidz_row_free(raidz_row_t *rr) { int c; - for (c = 0; c < rm->rm_firstdatacol; c++) { - abd_free(rm->rm_col[c].rc_abd); + for (c = 0; c < rr->rr_firstdatacol && c < rr->rr_cols; c++) { + abd_free(rr->rr_col[c].rc_abd); - if (rm->rm_col[c].rc_gdata != NULL) - abd_free(rm->rm_col[c].rc_gdata); + if (rr->rr_col[c].rc_gdata != NULL) { + abd_free(rr->rr_col[c].rc_gdata); + } + if (rr->rr_col[c].rc_orig_data != NULL) { + zio_buf_free(rr->rr_col[c].rc_orig_data, + rr->rr_col[c].rc_size); + } + } + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + if (rr->rr_col[c].rc_size != 0) { + if (abd_is_gang(rr->rr_col[c].rc_abd)) + abd_free(rr->rr_col[c].rc_abd); + else + abd_put(rr->rr_col[c].rc_abd); + } + if (rr->rr_col[c].rc_orig_data != NULL) { + zio_buf_free(rr->rr_col[c].rc_orig_data, + rr->rr_col[c].rc_size); + } } - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - abd_put(rm->rm_col[c].rc_abd); + if (rr->rr_abd_copy != NULL) + abd_free(rr->rr_abd_copy); - if (rm->rm_abd_copy != NULL) - abd_free(rm->rm_abd_copy); + if (rr->rr_abd_empty != NULL) + abd_free(rr->rr_abd_empty); - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); + kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); +} + +void +vdev_raidz_map_free(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) + vdev_raidz_row_free(rm->rm_row[i]); + + kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } static void @@ -161,10 +188,11 @@ vdev_raidz_map_free_vsd(zio_t *zio) raidz_map_t *rm = zio->io_vsd; ASSERT0(rm->rm_freed); - rm->rm_freed = 1; + rm->rm_freed = B_TRUE; - if (rm->rm_reports == 0) + if (rm->rm_reports == 0) { vdev_raidz_map_free(rm); + } } /*ARGSUSED*/ @@ -175,7 +203,7 @@ vdev_raidz_cksum_free(void *arg, size_t ignored) ASSERT3U(rm->rm_reports, >, 0); - if (--rm->rm_reports == 0 && rm->rm_freed != 0) + if (--rm->rm_reports == 0 && rm->rm_freed) vdev_raidz_map_free(rm); } @@ -186,77 +214,79 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) const size_t c = zcr->zcr_cbinfo; size_t x, offset; - const abd_t *good = NULL; - const abd_t *bad = rm->rm_col[c].rc_abd; - if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); return; } - if (c < rm->rm_firstdatacol) { + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + + const abd_t *good = NULL; + const abd_t *bad = rr->rr_col[c].rc_abd; + + if (c < rr->rr_firstdatacol) { /* * The first time through, calculate the parity blocks for * the good data (this relies on the fact that the good * data never changes for a given logical ZIO) */ - if (rm->rm_col[0].rc_gdata == NULL) { + if (rr->rr_col[0].rc_gdata == NULL) { abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; /* - * Set up the rm_col[]s to generate the parity for + * Set up the rr_col[]s to generate the parity for * good_data, first saving the parity bufs and * replacing them with buffers to hold the result. 
*/ - for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_abd; - rm->rm_col[x].rc_abd = - rm->rm_col[x].rc_gdata = - abd_alloc_sametype(rm->rm_col[x].rc_abd, - rm->rm_col[x].rc_size); + for (x = 0; x < rr->rr_firstdatacol; x++) { + bad_parity[x] = rr->rr_col[x].rc_abd; + rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata = + abd_alloc_sametype(rr->rr_col[x].rc_abd, + rr->rr_col[x].rc_size); } /* fill in the data columns from good_data */ offset = 0; - for (; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); + for (; x < rr->rr_cols; x++) { + abd_put(rr->rr_col[x].rc_abd); - rm->rm_col[x].rc_abd = + rr->rr_col[x].rc_abd = abd_get_offset_size((abd_t *)good_data, - offset, rm->rm_col[x].rc_size); - offset += rm->rm_col[x].rc_size; + offset, rr->rr_col[x].rc_size); + offset += rr->rr_col[x].rc_size; } /* * Construct the parity from the good data. */ - vdev_raidz_generate_parity(rm); + vdev_raidz_generate_parity_row(rm, rr); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) - rm->rm_col[x].rc_abd = bad_parity[x]; + for (x = 0; x < rr->rr_firstdatacol; x++) + rr->rr_col[x].rc_abd = bad_parity[x]; offset = 0; - for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_offset_size( - rm->rm_abd_copy, offset, - rm->rm_col[x].rc_size); - offset += rm->rm_col[x].rc_size; + for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) { + abd_put(rr->rr_col[x].rc_abd); + rr->rr_col[x].rc_abd = abd_get_offset_size( + rr->rr_abd_copy, offset, + rr->rr_col[x].rc_size); + offset += rr->rr_col[x].rc_size; } } - ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); - good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, - rm->rm_col[c].rc_size); + ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL); + good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0, + rr->rr_col[c].rc_size); } else { /* adjust good_data to point at the start of our column */ offset = 0; - for (x = rm->rm_firstdatacol; x < c; x++) - offset += rm->rm_col[x].rc_size; + for (x = rr->rr_firstdatacol; x < c; x++) + offset += rr->rr_col[x].rc_size; good = abd_get_offset_size((abd_t *)good_data, offset, - rm->rm_col[c].rc_size); + rr->rr_col[c].rc_size); } /* we drop the ereport if it ends up that the data was good */ @@ -274,10 +304,7 @@ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - size_t offset; - raidz_map_t *rm = zio->io_vsd; - size_t size; /* set up the report and bump the refcount */ zcr->zcr_cbdata = rm; @@ -287,8 +314,9 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); + ASSERT3U(rm->rm_nrows, ==, 1); - if (rm->rm_abd_copy != NULL) + if (rm->rm_row[0]->rr_abd_copy != NULL) return; /* @@ -299,26 +327,30 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) * Our parity data is already in separate buffers, so there's no need * to copy them. 
*/ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + size_t offset = 0; + size_t size = 0; - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - size += rm->rm_col[c].rc_size; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) + size += rr->rr_col[c].rc_size; - rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); + rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE); - for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, - col->rc_size); + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; + abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy, + offset, col->rc_size); - abd_copy(tmp, col->rc_abd, col->rc_size); + abd_copy(tmp, col->rc_abd, col->rc_size); - abd_put(col->rc_abd); - col->rc_abd = tmp; + abd_put(col->rc_abd); + col->rc_abd = tmp; - offset += col->rc_size; + offset += col->rc_size; + } + ASSERT3U(offset, ==, size); } - ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -337,7 +369,7 @@ noinline raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t nparity) { - raidz_map_t *rm; + raidz_row_t *rr; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ @@ -349,6 +381,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; uint64_t off = 0; + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); + rm->rm_nrows = 1; + /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. @@ -370,8 +406,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); - /* acols: The columns that will be accessed. */ - /* scols: The columns that will be accessed or skipped. */ + /* + * acols: The columns that will be accessed. + * scols: The columns that will be accessed or skipped. + */ if (q == 0) { /* Our I/O request doesn't span all child vdevs. 
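 * For example (hypothetical values): with ashift = 9, dcols = 5 and
 * nparity = 1, a 1536-byte write gives s = 3, so q = 0, r = 3 and
 * bc = r + nparity = 4; only acols = bc = 4 of the 5 children are
 * accessed, each with one 512-byte sector (one parity, three data),
 * and tot = 3 + 1 * 1 = 4 sectors in all.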
*/ acols = bc; @@ -383,65 +421,70 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, ASSERT3U(acols, <=, scols); - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); + rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); + rm->rm_row[0] = rr; - rm->rm_cols = acols; - rm->rm_scols = scols; - rm->rm_bigcols = bc; - rm->rm_skipstart = bc; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - rm->rm_firstdatacol = nparity; - rm->rm_abd_copy = NULL; - rm->rm_reports = 0; - rm->rm_freed = 0; - rm->rm_ecksuminjected = 0; + rr->rr_cols = acols; + rr->rr_scols = scols; + rr->rr_bigcols = bc; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_copy = NULL; + rr->rr_abd_empty = NULL; + rr->rr_nempty = 0; +#ifdef ZFS_DEBUG + rr->rr_offset = zio->io_offset; + rr->rr_size = zio->io_size; +#endif asize = 0; for (c = 0; c < scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; col = f + c; coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } - rm->rm_col[c].rc_devidx = col; - rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_abd = NULL; - rm->rm_col[c].rc_gdata = NULL; - rm->rm_col[c].rc_error = 0; - rm->rm_col[c].rc_tried = 0; - rm->rm_col[c].rc_skipped = 0; + rc->rc_devidx = col; + rc->rc_offset = coff; + rc->rc_abd = NULL; + rc->rc_gdata = NULL; + rc->rc_orig_data = NULL; + rc->rc_error = 0; + rc->rc_tried = 0; + rc->rc_skipped = 0; + rc->rc_repair = 0; + rc->rc_need_orig_restore = B_FALSE; if (c >= acols) - rm->rm_col[c].rc_size = 0; + rc->rc_size = 0; else if (c < bc) - rm->rm_col[c].rc_size = (q + 1) << ashift; + rc->rc_size = (q + 1) << ashift; else - rm->rm_col[c].rc_size = q << ashift; + rc->rc_size = q << ashift; - asize += rm->rm_col[c].rc_size; + asize += rc->rc_size; } ASSERT3U(asize, ==, tot << ashift); - rm->rm_asize = roundup(asize, (nparity + 1) << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); - ASSERT3U(rm->rm_nskip, <=, nparity); + rm->rm_skipstart = bc; - for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_abd = - abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); + for (c = 0; c < rr->rr_firstdatacol; c++) + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); - rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, - rm->rm_col[c].rc_size); - off = rm->rm_col[c].rc_size; + rr->rr_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, + rr->rr_col[c].rc_size); + off = rr->rr_col[c].rc_size; for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, - rm->rm_col[c].rc_size); - off += rm->rm_col[c].rc_size; + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = abd_get_offset_size(zio->io_abd, off, rc->rc_size); + off += rc->rc_size; } /* @@ -464,24 +507,21 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, * skip the first column since at least one data and one parity * column must appear in each row. 
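 * For example, with single parity two adjacent 1MB regions of the
 * vdev place their parity on different children, so the otherwise
 * write-only parity sectors rotate across the disks.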
*/ - ASSERT(rm->rm_cols >= 2); - ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { - devidx = rm->rm_col[0].rc_devidx; - o = rm->rm_col[0].rc_offset; - rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; - rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; - rm->rm_col[1].rc_devidx = devidx; - rm->rm_col[1].rc_offset = o; + if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + devidx = rr->rr_col[0].rc_devidx; + o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; - /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); @@ -550,50 +590,43 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) } static void -vdev_raidz_generate_parity_p(raidz_map_t *rm) +vdev_raidz_generate_parity_p(raidz_row_t *rr) { - uint64_t *p; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + if (c == rr->rr_firstdatacol) { + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_p_func, &pqr); } } } static void -vdev_raidz_generate_parity_pq(raidz_map_t *rm) +vdev_raidz_generate_parity_pq(raidz_row_t *rr) { - uint64_t *p, *q, pcnt, ccnt, mask, i; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); - for (i = ccnt; i < pcnt; i++) { + for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; } @@ -601,14 +634,15 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) struct pqr_struct pqr = { p, q, NULL }; ASSERT(ccnt <= pcnt); - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pq_func, &pqr); /* * Treat short columns as though they are 
full of 0s. * Note that there's therefore nothing needed for P. */ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); } } @@ -616,33 +650,29 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) } static void -vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { - uint64_t *p, *q, *r, pcnt, ccnt, mask, i; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_R].rc_size); - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_R].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); - (void) memcpy(r, p, rm->rm_col[c].rc_size); + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); + (void) memcpy(r, p, rr->rr_col[c].rc_size); - for (i = ccnt; i < pcnt; i++) { + for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; r[i] = 0; @@ -651,14 +681,15 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) struct pqr_struct pqr = { p, q, r }; ASSERT(ccnt <= pcnt); - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pqr_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); VDEV_RAIDZ_64MUL_4(r[i], mask); } @@ -671,27 +702,38 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) * parity columns available. 
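 * (P is a plain XOR of the data columns; Q and R additionally scale
 * each prior term by 2 and 4 in GF(2^8), which is what the
 * VDEV_RAIDZ_64MUL_2/4 macros above implement eight bytes at a time --
 * per byte, multiply-by-2 is (x << 1) ^ ((x & 0x80) ? 0x1d : 0).)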
*/ void -vdev_raidz_generate_parity(raidz_map_t *rm) +vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { + ASSERT3U(rr->rr_cols, !=, 0); + /* Generate using the new math implementation */ - if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) + if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) return; - switch (rm->rm_firstdatacol) { + switch (rr->rr_firstdatacol) { case 1: - vdev_raidz_generate_parity_p(rm); + vdev_raidz_generate_parity_p(rr); break; case 2: - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); break; case 3: - vdev_raidz_generate_parity_pqr(rm); + vdev_raidz_generate_parity_pqr(rr); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration"); } } +void +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_generate_parity_row(rm, rr); + } +} + /* ARGSUSED */ static int vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) @@ -809,30 +851,27 @@ vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) } static int -vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; - int c; abd_t *dst, *src; - ASSERT(ntgts == 1); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(x < rm->rm_cols); + ASSERT3U(ntgts, ==, 1); + ASSERT3U(x, >=, rr->rr_firstdatacol); + ASSERT3U(x, <, rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); - ASSERT(rm->rm_col[x].rc_size > 0); + ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); - src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + dst = rr->rr_col[x].rc_abd; - abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size); + abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; if (c == x) continue; @@ -845,7 +884,7 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) } static int -vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; int c, exp; @@ -853,44 +892,44 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) ASSERT(ntgts == 1); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = (c == x) ? 
0 : MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; + dst = rr->rr_col[x].rc_abd; - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { abd_copy(dst, src, size); - if (rm->rm_col[x].rc_size > size) + if (rr->rr_col[x].rc_size > size) { abd_zero_off(dst, size, - rm->rm_col[x].rc_size - size); - + rr->rr_col[x].rc_size - size); + } } else { - ASSERT3U(size, <=, rm->rm_col[x].rc_size); + ASSERT3U(size, <=, rr->rr_col[x].rc_size); (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_q_pre_func, NULL); (void) abd_iterate_func(dst, - size, rm->rm_col[x].rc_size - size, + size, rr->rr_col[x].rc_size - size, vdev_raidz_reconst_q_pre_tail_func, NULL); } } - src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - dst = rm->rm_col[x].rc_abd; - exp = 255 - (rm->rm_cols - 1 - x); + src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + dst = rr->rr_col[x].rc_abd; + exp = 255 - (rr->rr_cols - 1 - x); struct reconst_q_struct rq = { abd_to_buf(src), exp }; - (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } static int -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; abd_t *pdata, *qdata; @@ -901,10 +940,10 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) ASSERT(ntgts == 2); ASSERT(x < y); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(y < rm->rm_cols); + ASSERT(x >= rr->rr_firstdatacol); + ASSERT(y < rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); + ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); /* * Move the parity data aside -- we're going to compute parity as @@ -913,29 +952,29 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * parity so we make those columns appear to be full of zeros by * setting their lengths to zero. 
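 * In GF(2^8) terms pxy = P + D_x + D_y and qxy drops the x and y
 * contributions from Q, so the a and b constants computed below
 * recover D_x = a * (P + pxy) + b * (Q + qxy), after which D_y
 * follows directly as D_y = (P + pxy) + D_x.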
*/ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - xsize = rm->rm_col[x].rc_size; - ysize = rm->rm_col[y].rc_size; + pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + xsize = rr->rr_col[x].rc_size; + ysize = rr->rr_col[y].rc_size; - rm->rm_col[VDEV_RAIDZ_P].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); - rm->rm_col[x].rc_size = 0; - rm->rm_col[y].rc_size = 0; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); + rr->rr_col[x].rc_size = 0; + rr->rr_col[y].rc_size = 0; - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); - rm->rm_col[x].rc_size = xsize; - rm->rm_col[y].rc_size = ysize; + rr->rr_col[x].rc_size = xsize; + rr->rr_col[y].rc_size = ysize; p = abd_to_buf(pdata); q = abd_to_buf(qdata); - pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - xd = rm->rm_col[x].rc_abd; - yd = rm->rm_col[y].rc_abd; + pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + xd = rr->rr_col[x].rc_abd; + yd = rr->rr_col[y].rc_abd; /* * We now have: @@ -953,7 +992,7 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) */ a = vdev_raidz_pow2[255 + x - y]; - b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; + b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; tmp = 255 - vdev_raidz_log2[a ^ 1]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; @@ -967,14 +1006,14 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) (void) abd_iterate_func(xd, ysize, xsize - ysize, vdev_raidz_reconst_pq_tail_func, &rpq); - abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ - rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } @@ -1134,13 +1173,13 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) /* END CSTYLED */ static void -vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, +vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; - ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); /* * Fill in the missing rows of interest. @@ -1164,7 +1203,7 @@ vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, } static void -vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, +vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; @@ -1176,10 +1215,10 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, * correspond to data columns. 
*/ for (i = 0; i < nmissing; i++) { - ASSERT3S(used[i], <, rm->rm_firstdatacol); + ASSERT3S(used[i], <, rr->rr_firstdatacol); } for (; i < n; i++) { - ASSERT3S(used[i], >=, rm->rm_firstdatacol); + ASSERT3S(used[i], >=, rr->rr_firstdatacol); } /* @@ -1196,8 +1235,8 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { - ASSERT3U(used[j], >=, rm->rm_firstdatacol); - jj = used[j] - rm->rm_firstdatacol; + ASSERT3U(used[j], >=, rr->rr_firstdatacol); + jj = used[j] - rr->rr_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; @@ -1258,7 +1297,7 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, } static void -vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, +vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; @@ -1290,22 +1329,24 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, for (i = 0; i < n; i++) { c = used[i]; - ASSERT3U(c, <, rm->rm_cols); + ASSERT3U(c, <, rr->rr_cols); - src = abd_to_buf(rm->rm_col[c].rc_abd); - ccount = rm->rm_col[c].rc_size; + ccount = rr->rr_col[c].rc_size; + ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); + if (ccount == 0) + continue; + src = abd_to_buf(rr->rr_col[c].rc_abd); for (j = 0; j < nmissing; j++) { - cc = missing[j] + rm->rm_firstdatacol; - ASSERT3U(cc, >=, rm->rm_firstdatacol); - ASSERT3U(cc, <, rm->rm_cols); + cc = missing[j] + rr->rr_firstdatacol; + ASSERT3U(cc, >=, rr->rr_firstdatacol); + ASSERT3U(cc, <, rr->rr_cols); ASSERT3U(cc, !=, c); - dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); - dcount[j] = rm->rm_col[cc].rc_size; + dcount[j] = rr->rr_col[cc].rc_size; + if (dcount[j] != 0) + dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); } - ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); - for (x = 0; x < ccount; x++, src++) { if (*src != 0) log = vdev_raidz_log2[*src]; @@ -1334,16 +1375,14 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, } static int -vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int n, i, c, t, tt; int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; - uint8_t *p, *pp; size_t psize; - uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; @@ -1354,30 +1393,39 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate - * temporary linear ABDs. + * temporary linear ABDs if any non-linear ABDs are found. 
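+ * (The matrix path reads and writes columns through abd_to_buf(),
+ * which is only valid for linear ABDs, hence the copy-in here and
+ * the copy-back once reconstruction completes.)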
*/ - if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { - bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { + if (!abd_is_linear(rr->rr_col[i].rc_abd)) { + bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), + KM_PUSHPAGE); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; - bufs[c] = col->rc_abd; - col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); - abd_copy(col->rc_abd, bufs[c], col->rc_size); + bufs[c] = col->rc_abd; + if (bufs[c] != NULL) { + col->rc_abd = abd_alloc_linear( + col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], + col->rc_size); + } + } + + break; } } - n = rm->rm_cols - rm->rm_firstdatacol; + n = rr->rr_cols - rr->rr_firstdatacol; /* * Figure out which data columns are missing. */ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { - if (tgts[t] >= rm->rm_firstdatacol) { + if (tgts[t] >= rr->rr_firstdatacol) { missing_rows[nmissing_rows++] = - tgts[t] - rm->rm_firstdatacol; + tgts[t] - rr->rr_firstdatacol; } } @@ -1387,7 +1435,7 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); - ASSERT(c < rm->rm_firstdatacol); + ASSERT(c < rr->rr_firstdatacol); /* * Skip any targeted parity columns. @@ -1422,9 +1470,9 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) used[i] = parity_map[i]; } - for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { if (tt < nmissing_rows && - c == missing_rows[tt] + rm->rm_firstdatacol) { + c == missing_rows[tt] + rr->rr_firstdatacol) { tt++; continue; } @@ -1437,18 +1485,18 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) /* * Initialize the interesting rows of the matrix. */ - vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ - vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. 
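+	 *
+	 * At this point invrows[] holds the inverse of the submatrix tying
+	 * the missing data columns to the chosen parity rows, so the
+	 * multiply below rebuilds each missing column as a GF(2^8) linear
+	 * combination of the used parity and surviving data columns.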
*/ - vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); @@ -1457,21 +1505,24 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) * copy back from temporary linear abds and free them */ if (bufs) { - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; - abd_copy(bufs[c], col->rc_abd, col->rc_size); - abd_free(col->rc_abd); + if (bufs[c] != NULL) { + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + } col->rc_abd = bufs[c]; } - kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); } return (code); } -int -vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) +static int +vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, + const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; @@ -1480,26 +1531,19 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; - /* - * The tgts list must already be sorted. - */ - for (i = 1; i < nt; i++) { - ASSERT(t[i] > t[i - 1]); - } - - nbadparity = rm->rm_firstdatacol; - nbaddata = rm->rm_cols - nbadparity; + nbadparity = rr->rr_firstdatacol; + nbaddata = rr->rr_cols - nbadparity; ntgts = 0; - for (i = 0, c = 0; c < rm->rm_cols; c++) { - if (c < rm->rm_firstdatacol) + for (i = 0, c = 0; c < rr->rr_cols; c++) { + if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; - } else if (rm->rm_col[c].rc_error != 0) { + } else if (rr->rr_col[c].rc_error != 0) { tgts[ntgts++] = c; - } else if (c >= rm->rm_firstdatacol) { + } else if (c >= rr->rr_firstdatacol) { nbaddata--; } else { parity_valid[c] = B_TRUE; @@ -1514,7 +1558,7 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ - ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); + ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); if (ret != RAIDZ_ORIGINAL_IMPL) return (ret); @@ -1524,29 +1568,29 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) switch (nbaddata) { case 1: if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + return (vdev_raidz_reconstruct_p(rr, dt, 1)); - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + return (vdev_raidz_reconstruct_q(rr, dt, 1)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; case 2: - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + return (vdev_raidz_reconstruct_pq(rr, dt, 2)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; } - code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + code = vdev_raidz_reconstruct_general(rr, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code > 0); return (code); @@ -1556,8 +1600,8 @@ static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { - vdev_t *cvd; - uint64_t nparity = vd->vdev_nparity; + vdev_raidz_t 
*vdrz = vd->vdev_tsd; + uint64_t nparity = vdrz->vd_nparity; int c; int lasterror = 0; int numerrors = 0; @@ -1573,7 +1617,7 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, vdev_open_children(vd); for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) { lasterror = cvd->vdev_open_error; @@ -1602,19 +1646,20 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, static void vdev_raidz_close(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c] != NULL) + vdev_close(vd->vdev_child[c]); + } } static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { + vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; + uint64_t cols = vdrz->vd_logical_width; + uint64_t nparity = vdrz->vd_nparity; asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); @@ -1623,7 +1668,18 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize) return (asize); } -static void +/* + * The allocatable space for a raidz vdev is N * sizeof(smallest child) + * so each child must provide at least 1/Nth of its asize. + */ +static uint64_t +vdev_raidz_min_asize(vdev_t *vd) +{ + return ((vd->vdev_min_asize + vd->vdev_children - 1) / + vd->vdev_children); +} + +void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; @@ -1634,21 +1690,21 @@ vdev_raidz_child_done(zio_t *zio) } static void -vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) +vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) { #ifdef ZFS_DEBUG - vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - range_seg64_t logical_rs, physical_rs; - logical_rs.rs_start = zio->io_offset; + range_seg64_t logical_rs, physical_rs, remain_rs; + logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(zio->io_vd, zio->io_size); + vdev_raidz_asize(vd, rr->rr_size); - raidz_col_t *rc = &rm->rm_col[col]; + raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - vdev_xlate(cvd, &logical_rs, &physical_rs); + vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); + ASSERT(vdev_xlate_is_empty(&remain_rs)); ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* @@ -1666,6 +1722,91 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) #endif } +static void +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) +{ + vdev_t *vd = zio->io_vd; + raidz_map_t *rm = zio->io_vsd; + int c, i; + + vdev_raidz_generate_parity_row(rm, rr); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + + /* Verify physical to logical translation */ + vdev_raidz_io_verify(vd, rr, c); + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], rc->rc_offset, + rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, + 0, vdev_raidz_child_done, rc)); + } + + /* + * Generate optional I/Os for skip sectors to improve aggregation + * contiguity. 
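+	 *
+	 * A row whose last stripe does not use every child leaves "skip"
+	 * sectors on the unused children; the optional 1 << ashift sized
+	 * NODATA writes issued below cover those holes so the vdev queue
+	 * can aggregate the adjacent child writes into one larger I/O.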
+ */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT(c <= rr->rr_scols); + if (c == rr->rr_scols) + c = 0; + + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } +} + +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last -- any errors along the way will force us to read the parity. + */ + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + if (!vdev_readable(cvd)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ENXIO); + rc->rc_tried = 1; /* don't even try */ + rc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + continue; + } + if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1688,96 +1829,32 @@ vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - vdev_t *cvd; + vdev_raidz_t *vdrz = vd->vdev_tsd; raidz_map_t *rm; - raidz_col_t *rc; - int c, i; - rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, - vd->vdev_nparity); - - ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); - - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_generate_parity(rm); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - - /* - * Verify physical to logical translation. - */ - vdev_raidz_io_verify(zio, rm, c); - - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } - - /* - * Generate optional I/Os for any skipped sectors to improve - * aggregation contiguity. - */ - for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { - ASSERT(c <= rm->rm_scols); - if (c == rm->rm_scols) - c = 0; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset + rc->rc_size, NULL, - 1 << tvd->vdev_ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); - } - - zio_execute(zio); - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); + rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, + vdrz->vd_logical_width, vdrz->vd_nparity); /* - * Iterate over the columns in reverse order so that we hit the parity - * last -- any errors along the way will force us to read the parity. + * Until raidz expansion is implemented all maps for a raidz vdev + * contain a single row. 
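+	 * Once expansion is supported, a block laid out across a reflow
+	 * boundary is expected to map to multiple rows of different
+	 * widths, which is why the read and write paths below already
+	 * operate on raidz_row_t.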
*/ - for (c = rm->rm_cols - 1; c >= 0; c--) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - if (!vdev_readable(cvd)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = SET_ERROR(ENXIO); - rc->rc_tried = 1; /* don't even try */ - rc->rc_skipped = 1; - continue; - } - if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = SET_ERROR(ESTALE); - rc->rc_skipped = 1; - continue; - } - if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || - (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); + } else { + ASSERT(zio->io_type == ZIO_TYPE_READ); + vdev_raidz_io_start_read(zio, rr); } zio_execute(zio); } - /* * Report a checksum error for a child of a RAID-Z device. */ @@ -1786,7 +1863,8 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; @@ -1827,13 +1905,14 @@ raidz_checksum_verify(zio_t *zio) * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. Return the - * number such failures. + * number of such failures. */ static int -raidz_parity_verify(zio_t *zio, raidz_map_t *rm) +raidz_parity_verify(zio_t *zio, raidz_row_t *rr) { abd_t *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; + raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; blkptr_t *bp = zio->io_bp; @@ -1843,8 +1922,18 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) if (checksum == ZIO_CHECKSUM_NOPARITY) return (ret); - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; + /* + * All data columns must have been successfully read in order + * to use them to generate parity columns for comparison. + */ + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + rc = &rr->rr_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + return (ret); + } + + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; @@ -1852,12 +1941,19 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) abd_copy(orig[c], rc->rc_abd, rc->rc_size); } - vdev_raidz_generate_parity(rm); + /* + * Regenerates parity even for !tried||rc_error!=0 columns. This + * isn't harmful but it does have the side effect of fixing stuff + * we didn't realize was necessary (i.e. even if we return 0). 
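+	 * In particular, a parity column that was read with an error is
+	 * refilled here with freshly generated parity, so a subsequent
+	 * repair write of that column pushes correct data to disk.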
+ */ + vdev_raidz_generate_parity_row(rm, rr); + + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; + if (abd_cmp(orig[c], rc->rc_abd) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); @@ -1870,464 +1966,606 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) } static int -vdev_raidz_worst_error(raidz_map_t *rm) +vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rm->rm_cols; c++) - error = zio_worst_error(error, rm->rm_col[c].rc_error); + for (int c = 0; c < rr->rr_cols; c++) + error = zio_worst_error(error, rr->rr_col[c].rc_error); return (error); } -/* - * Iterate over all combinations of bad data and attempt a reconstruction. - * Note that the algorithm below is non-optimal because it doesn't take into - * account how reconstruction is actually performed. For example, with - * triple-parity RAID-Z the reconstruction procedure is the same if column 4 - * is targeted as invalid as if columns 1 and 4 are targeted since in both - * cases we'd only use parity information in column 0. - */ -static int -vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) -{ - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - abd_t *orig[VDEV_RAIDZ_MAXPARITY]; - int tstore[VDEV_RAIDZ_MAXPARITY + 2]; - int *tgts = &tstore[1]; - int curr, next, i, c, n; - int code, ret = 0; - - ASSERT(total_errors < rm->rm_firstdatacol); - - /* - * This simplifies one edge condition. - */ - tgts[-1] = -1; - - for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { - /* - * Initialize the targets array by finding the first n columns - * that contain no error. - * - * If there were no data errors, we need to ensure that we're - * always explicitly attempting to reconstruct at least one - * data column. To do this, we simply push the highest target - * up into the data columns. - */ - for (c = 0, i = 0; i < n; i++) { - if (i == n - 1 && data_errors == 0 && - c < rm->rm_firstdatacol) { - c = rm->rm_firstdatacol; - } - - while (rm->rm_col[c].rc_error != 0) { - c++; - ASSERT3S(c, <, rm->rm_cols); - } - - tgts[i] = c++; - } - - /* - * Setting tgts[n] simplifies the other edge condition. - */ - tgts[n] = rm->rm_cols; - - /* - * These buffers were allocated in previous iterations. - */ - for (i = 0; i < n - 1; i++) { - ASSERT(orig[i] != NULL); - } - - orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd, - rm->rm_col[0].rc_size); - - curr = 0; - next = tgts[curr]; - - while (curr != n) { - tgts[curr] = next; - curr = 0; - - /* - * Save off the original data that we're going to - * attempt to reconstruct. - */ - for (i = 0; i < n; i++) { - ASSERT(orig[i] != NULL); - c = tgts[i]; - ASSERT3S(c, >=, 0); - ASSERT3S(c, <, rm->rm_cols); - rc = &rm->rm_col[c]; - abd_copy(orig[i], rc->rc_abd, rc->rc_size); - } - - /* - * Attempt a reconstruction and exit the outer loop on - * success. - */ - code = vdev_raidz_reconstruct(rm, tgts, n); - if (raidz_checksum_verify(zio) == 0) { - - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - ASSERT(rc->rc_error == 0); - if (rc->rc_tried) - raidz_checksum_error(zio, rc, - orig[i]); - rc->rc_error = SET_ERROR(ECKSUM); - } - - ret = code; - goto done; - } - - /* - * Restore the original data. 
- */ - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - abd_copy(rc->rc_abd, orig[i], rc->rc_size); - } - - do { - /* - * Find the next valid column after the curr - * position.. - */ - for (next = tgts[curr] + 1; - next < rm->rm_cols && - rm->rm_col[next].rc_error != 0; next++) - continue; - - ASSERT(next <= tgts[curr + 1]); - - /* - * If that spot is available, we're done here. - */ - if (next != tgts[curr + 1]) - break; - - /* - * Otherwise, find the next valid column after - * the previous position. - */ - for (c = tgts[curr - 1] + 1; - rm->rm_col[c].rc_error != 0; c++) - continue; - - tgts[curr] = c; - curr++; - - } while (curr != n); - } - } - n--; -done: - for (i = 0; i < n; i++) - abd_free(orig[i]); - - return (ret); -} - -/* - * Complete an IO operation on a RAIDZ VDev - * - * Outline: - * - For write operations: - * 1. Check for errors on the child IOs. - * 2. Return, setting an error code if too few child VDevs were written - * to reconstruct the data later. Note that partial writes are - * considered successful if they can be reconstructed at all. - * - For read operations: - * 1. Check for errors on the child IOs. - * 2. If data errors occurred: - * a. Try to reassemble the data from the parity available. - * b. If we haven't yet read the parity drives, read them now. - * c. If all parity drives have been read but the data still doesn't - * reassemble with a correct checksum, then try combinatorial - * reconstruction. - * d. If that doesn't work, return an error. - * 3. If there were unexpected errors or this is a resilver operation, - * rewrite the vdevs that had errors. - */ static void -vdev_raidz_io_done(zio_t *zio) +vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { - vdev_t *vd = zio->io_vd; - vdev_t *cvd; - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc = NULL; int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; - int total_errors = 0; - int n, c; - int tgts[VDEV_RAIDZ_MAXPARITY]; - int code; - ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); - ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error) { - ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - if (c < rm->rm_firstdatacol) + if (c < rr->rr_firstdatacol) parity_errors++; else data_errors++; if (!rc->rc_skipped) unexpected_errors++; - - total_errors++; - } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } } - if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * XXX -- for now, treat partial writes as a success. - * (If we couldn't write enough columns to reconstruct - * the data, the I/O failed. Otherwise, good enough.) - * - * Now that we support write reallocation, it would be better - * to treat partial failure as real failure unless there are - * no non-degraded top-level vdevs left, and not update DTLs - * if we intend to reallocate. - */ - /* XXPOLICY */ - if (total_errors > rm->rm_firstdatacol) - zio->io_error = vdev_raidz_worst_error(rm); - - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); /* - * There are three potential phases for a read: - * 1. produce valid data from the columns read - * 2. read all disks and try again - * 3. 
perform combinatorial reconstruction + * If we read more parity disks than were used for + * reconstruction, confirm that the other parity disks produced + * correct data. * - * Each phase is progressively both more expensive and less likely to - * occur. If we encounter more errors than we can repair or all phases - * fail, we have no choice but to return an error. + * Note that we also regenerate parity when resilvering so we + * can write it out to failed devices later. */ - - /* - * If the number of errors we saw was correctable -- less than or equal - * to the number of parity disks read -- attempt to produce data that - * has a valid checksum. Naturally, this case applies in the absence of - * any errors. - */ - if (total_errors <= rm->rm_firstdatacol - parity_untried) { - if (data_errors == 0) { - if (raidz_checksum_verify(zio) == 0) { - /* - * If we read parity information (unnecessarily - * as it happens since no reconstruction was - * needed) regenerate and verify the parity. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. - */ - if (parity_errors + parity_untried < - rm->rm_firstdatacol || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - goto done; - } - } else { - /* - * We either attempt to read all the parity columns or - * none of them. If we didn't try to read parity, we - * wouldn't be here in the correctable case. There must - * also have been fewer parity errors than parity - * columns or, again, we wouldn't be in this code path. - */ - ASSERT(parity_untried == 0); - ASSERT(parity_errors < rm->rm_firstdatacol); - - /* - * Identify the data columns that reported an error. - */ - n = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) { - ASSERT(n < VDEV_RAIDZ_MAXPARITY); - tgts[n++] = c; - } - } - - ASSERT(rm->rm_firstdatacol >= n); - - code = vdev_raidz_reconstruct(rm, tgts, n); - - if (raidz_checksum_verify(zio) == 0) { - /* - * If we read more parity disks than were used - * for reconstruction, confirm that the other - * parity disks produced correct data. This - * routine is suboptimal in that it regenerates - * the parity that we already used in addition - * to the parity that we're attempting to - * verify, but this should be a relatively - * uncommon case, and can be optimized if it - * becomes a problem. Note that we regenerate - * parity when resilvering so we can write it - * out to failed devices later. - */ - if (parity_errors < rm->rm_firstdatacol - n || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - - goto done; - } - } + if (parity_errors + parity_untried < + rr->rr_firstdatacol - data_errors || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + int n = raidz_parity_verify(zio, rr); + unexpected_errors += n; + ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); } - /* - * This isn't a typical situation -- either we got a read error or - * a child silently returned bad data. Read every block so we can - * try again with as much data and parity as we can track down. If - * we've already been through once before, all children will be marked - * as tried so we'll proceed to combinatorial reconstruction. 
- */ - unexpected_errors = 1; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - - for (c = 0; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_tried) - continue; - - zio_vdev_io_redone(zio); - do { - rc = &rm->rm_col[c]; - if (rc->rc_tried) - continue; - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } while (++c < rm->rm_cols); - - return; - } - - /* - * At this point we've attempted to reconstruct the data given the - * errors we detected, and we've attempted to read all columns. There - * must, therefore, be one or more additional problems -- silent errors - * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. We check if there is enough additional data to - * possibly reconstruct the data and then perform combinatorial - * reconstruction over all possible combinations. If that fails, - * we're cooked. - */ - if (total_errors > rm->rm_firstdatacol) { - zio->io_error = vdev_raidz_worst_error(rm); - - } else if (total_errors < rm->rm_firstdatacol && - (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { - /* - * If we didn't use all the available parity for the - * combinatorial reconstruction, verify that the remaining - * parity is correct. - */ - if (code != (1 << rm->rm_firstdatacol) - 1) - (void) raidz_parity_verify(zio, rm); - } else { - /* - * We're here because either: - * - * total_errors == rm_first_datacol, or - * vdev_raidz_combrec() failed - * - * In either case, there is enough bad data to prevent - * reconstruction. - * - * Start checksum ereports for all children which haven't - * failed, and the IO wasn't speculative. - */ - zio->io_error = SET_ERROR(ECKSUM); - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - for (c = 0; c < rm->rm_cols; c++) { - vdev_t *cvd; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - if (rc->rc_error != 0) - continue; - - zio_bad_cksum_t zbc; - zbc.zbc_has_cksum = 0; - zbc.zbc_injected = rm->rm_ecksuminjected; - - int ret = zfs_ereport_start_checksum( - zio->io_spa, cvd, &zio->io_bookmark, zio, - rc->rc_offset, rc->rc_size, - (void *)(uintptr_t)c, &zbc); - if (ret != EALREADY) { - mutex_enter(&cvd->vdev_stat_lock); - cvd->vdev_stat.vs_checksum_errors++; - mutex_exit(&cvd->vdev_stat_lock); - } - } - } - } - -done: - zio_checksum_verified(zio); - if (zio->io_error == 0 && spa_writeable(zio->io_spa) && - (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { + (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* * Use the good data we have in hand to repair damaged children. */ - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - if (rc->rc_error == 0) + if ((rc->rc_error == 0 || rc->rc_size == 0) && + (rc->rc_repair == 0)) { continue; + } zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_TYPE_WRITE, + zio->io_priority == ZIO_PRIORITY_REBUILD ? + ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } +static void +raidz_restore_orig_data(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + abd_copy_from_buf(rc->rc_abd, + rc->rc_orig_data, rc->rc_size); + rc->rc_need_orig_restore = B_FALSE; + } + } + } +} + +/* + * returns EINVAL if reconstruction of the block will not be possible + * returns ECKSUM if this specific reconstruction failed + * returns 0 on successful reconstruction + */ +static int +raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) +{ + raidz_map_t *rm = zio->io_vsd; + + /* Reconstruct each row */ + for (int r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ + int t = 0; + int dead = 0; + int dead_data = 0; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + ASSERT0(rc->rc_need_orig_restore); + if (rc->rc_error != 0) { + dead++; + if (c >= nparity) + dead_data++; + continue; + } + if (rc->rc_size == 0) + continue; + for (int lt = 0; lt < ntgts; lt++) { + if (rc->rc_devidx == ltgts[lt]) { + if (rc->rc_orig_data == NULL) { + rc->rc_orig_data = + zio_buf_alloc(rc->rc_size); + abd_copy_to_buf( + rc->rc_orig_data, + rc->rc_abd, rc->rc_size); + } + rc->rc_need_orig_restore = B_TRUE; + + dead++; + if (c >= nparity) + dead_data++; + my_tgts[t++] = c; + break; + } + } + } + if (dead > nparity) { + /* reconstruction not possible */ + raidz_restore_orig_data(rm); + return (EINVAL); + } + rr->rr_code = 0; + if (dead_data > 0) + rr->rr_code = vdev_raidz_reconstruct_row(rm, rr, + my_tgts, t); + } + + /* Check for success */ + if (raidz_checksum_verify(zio) == 0) { + + /* Reconstruction succeeded - report errors */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + /* + * Note: if this is a parity column, + * we don't really know if it's wrong. + * We need to let + * vdev_raidz_io_done_verified() check + * it, and if we set rc_error, it will + * think that it is a "known" error + * that doesn't need to be checked + * or corrected. + */ + if (rc->rc_error == 0 && + c >= rr->rr_firstdatacol) { + raidz_checksum_error(zio, + rc, rc->rc_gdata); + rc->rc_error = + SET_ERROR(ECKSUM); + } + rc->rc_need_orig_restore = B_FALSE; + } + } + + vdev_raidz_io_done_verified(zio, rr); + } + + zio_checksum_verified(zio); + + return (0); + } + + /* Reconstruction failed - restore original data */ + raidz_restore_orig_data(rm); + return (ECKSUM); +} + +/* + * Iterate over all combinations of N bad vdevs and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. 
+ *
+ * The order that we find the various possible combinations of failed
+ * disks is dictated by these rules:
+ * - Examine each "slot" (the "i" in tgts[i])
+ *   - Try to increment this slot (tgts[i] = tgts[i] + 1)
+ *     - if we can't increment because it runs into the next slot,
+ *       reset our slot to the minimum, and examine the next slot
+ *
+ * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
+ * 3 columns to reconstruct), we will generate the following sequence:
+ *
+ * STATE        ACTION
+ * 0 1 2        special case: skip since these are all parity
+ * 0 1 3        first slot: reset to 0; middle slot: increment to 2
+ * 0 2 3        first slot: increment to 1
+ * 1 2 3        first: reset to 0; middle: reset to 1; last: increment to 4
+ * 0 1 4        first: reset to 0; middle: increment to 2
+ * 0 2 4        first: increment to 1
+ * 1 2 4        first: reset to 0; middle: increment to 3
+ * 0 3 4        first: increment to 1
+ * 1 3 4        first: increment to 2
+ * 2 3 4        first: reset to 0; middle: reset to 1; last: increment to 5
+ * 0 1 5        first: reset to 0; middle: increment to 2
+ * 0 2 5        first: increment to 1
+ * 1 2 5        first: reset to 0; middle: increment to 3
+ * 0 3 5        first: increment to 1
+ * 1 3 5        first: increment to 2
+ * 2 3 5        first: reset to 0; middle: increment to 4
+ * 0 4 5        first: increment to 1
+ * 1 4 5        first: increment to 2
+ * 2 4 5        first: increment to 3
+ * 3 4 5        done
+ *
+ * This strategy works for dRAID but is less efficient when there are a large
+ * number of child vdevs and therefore permutations to check. Furthermore,
+ * since the raidz_map_t rows likely do not overlap, reconstruction would be
+ * possible as long as there are no more than nparity data errors per row.
+ * These additional permutations are not currently checked but could be as
+ * a future improvement.
+ */
+static int
+vdev_raidz_combrec(zio_t *zio)
+{
+	int nparity = vdev_get_nparity(zio->io_vd);
+	raidz_map_t *rm = zio->io_vsd;
+
+	/* Check if there's enough data to attempt reconstruction. */
+	for (int i = 0; i < rm->rm_nrows; i++) {
+		raidz_row_t *rr = rm->rm_row[i];
+		int total_errors = 0;
+
+		for (int c = 0; c < rr->rr_cols; c++) {
+			if (rr->rr_col[c].rc_error)
+				total_errors++;
+		}
+
+		if (total_errors > nparity)
+			return (vdev_raidz_worst_error(rr));
+	}
+
+	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
+		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+		int *ltgts = &tstore[1]; /* value is logical child ID */
+
+		/* Determine number of logical children, n */
+		int n = zio->io_vd->vdev_children;
+
+		ASSERT3U(num_failures, <=, nparity);
+		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
+
+		/* Handle corner cases in combrec logic */
+		ltgts[-1] = -1;
+		for (int i = 0; i < num_failures; i++) {
+			ltgts[i] = i;
+		}
+		ltgts[num_failures] = n;
+
+		for (;;) {
+			int err = raidz_reconstruct(zio, ltgts, num_failures,
+			    nparity);
+			if (err == EINVAL) {
+				/*
+				 * Reconstruction not possible with this
+				 * number of failures; try more failures.
+				 */
+				break;
+			} else if (err == 0)
+				return (0);
+
+			/* Compute next targets to try */
+			for (int t = 0; ; t++) {
+				ASSERT3U(t, <, num_failures);
+				ltgts[t]++;
+				if (ltgts[t] == n) {
+					/* try more failures */
+					ASSERT3U(t, ==, num_failures - 1);
+					break;
+				}
+
+				ASSERT3U(ltgts[t], <, n);
+				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
+
+				/*
+				 * If that spot is available, we're done here.
+				 * Try the next combination.
+				 */
+				if (ltgts[t] != ltgts[t + 1])
+					break;
+
+				/*
+				 * Otherwise, reset this tgt to the minimum,
+				 * and move on to the next tgt.
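+				 *
+				 * For example (illustrative): stepping from
+				 * ltgts = {1, 2, 4}, slot 0 increments to 2
+				 * and collides with slot 1, so it is reset
+				 * to ltgts[-1] + 1 = 0; slot 1 then
+				 * increments to 3, which is free, giving
+				 * {0, 3, 4} -- the "1 2 4" row in the table
+				 * above.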
+ */ + ltgts[t] = ltgts[t - 1] + 1; + ASSERT3U(ltgts[t], ==, t); + } + + /* Increase the number of failures and keep trying. */ + if (ltgts[num_failures - 1] == n) + break; + } + } + + return (ECKSUM); +} + +void +vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) +{ + for (uint64_t row = 0; row < rm->rm_nrows; row++) { + raidz_row_t *rr = rm->rm_row[row]; + vdev_raidz_reconstruct_row(rm, rr, t, nt); + } +} + +/* + * Complete a write IO operation on a RAIDZ VDev + * + * Outline: + * 1. Check for errors on the child IOs. + * 2. Return, setting an error code if too few child VDevs were written + * to reconstruct the data later. Note that partial writes are + * considered successful if they can be reconstructed at all. + */ +static void +vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) +{ + int total_errors = 0; + + ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); + ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + total_errors++; + } + } + + /* + * Treat partial writes as a success. If we couldn't write enough + * columns to reconstruct the data, the I/O failed. Otherwise, + * good enough. + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. + */ + if (total_errors > rr->rr_firstdatacol) { + zio->io_error = zio_worst_error(zio->io_error, + vdev_raidz_worst_error(rr)); + } +} + +/* + * return 0 if no reconstruction occurred, otherwise the "code" from + * vdev_raidz_reconstruct(). + */ +static int +vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, + raidz_row_t *rr) +{ + int parity_errors = 0; + int parity_untried = 0; + int data_errors = 0; + int total_errors = 0; + int code = 0; + + ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); + ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + if (c < rr->rr_firstdatacol) + parity_errors++; + else + data_errors++; + + total_errors++; + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { + parity_untried++; + } + } + + /* + * If there were data errors and the number of errors we saw was + * correctable -- less than or equal to the number of parity disks read + * -- reconstruct based on the missing data. + */ + if (data_errors != 0 && + total_errors <= rr->rr_firstdatacol - parity_untried) { + /* + * We either attempt to read all the parity columns or + * none of them. If we didn't try to read parity, we + * wouldn't be here in the correctable case. There must + * also have been fewer parity errors than parity + * columns or, again, we wouldn't be in this code path. + */ + ASSERT(parity_untried == 0); + ASSERT(parity_errors < rr->rr_firstdatacol); + + /* + * Identify the data columns that reported an error. 
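+		 * Only data columns need to be listed here; any parity
+		 * column with rc_error set is folded into the target list
+		 * by vdev_raidz_reconstruct_row() itself.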
+		 */
+		int n = 0;
+		int tgts[VDEV_RAIDZ_MAXPARITY];
+		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+			raidz_col_t *rc = &rr->rr_col[c];
+			if (rc->rc_error != 0) {
+				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+				tgts[n++] = c;
+			}
+		}
+
+		ASSERT(rr->rr_firstdatacol >= n);
+
+		code = vdev_raidz_reconstruct_row(rm, rr, tgts, n);
+	}
+
+	return (code);
+}
+
+/*
+ * Return the number of reads issued.
+ */
+static int
+vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
+{
+	vdev_t *vd = zio->io_vd;
+	int nread = 0;
+
+	rr->rr_missingdata = 0;
+	rr->rr_missingparity = 0;
+
+	/*
+	 * If this row contains empty sectors which are not required
+	 * for a normal read, then allocate an ABD for them now so they
+	 * may be read, verified, and any needed repairs performed.
+	 */
+	if (rr->rr_nempty && rr->rr_abd_empty == NULL)
+		vdev_draid_map_alloc_empty(zio, rr);
+
+	for (int c = 0; c < rr->rr_cols; c++) {
+		raidz_col_t *rc = &rr->rr_col[c];
+		if (rc->rc_tried || rc->rc_size == 0)
+			continue;
+
+		zio_nowait(zio_vdev_child_io(zio, NULL,
+		    vd->vdev_child[rc->rc_devidx],
+		    rc->rc_offset, rc->rc_abd, rc->rc_size,
+		    zio->io_type, zio->io_priority, 0,
+		    vdev_raidz_child_done, rc));
+		nread++;
+	}
+	return (nread);
+}
+
+/*
+ * We're here because either there were too many errors to even attempt
+ * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
+ * failed. In either case, there is enough bad data to prevent reconstruction.
+ * Start checksum ereports for all children which haven't failed.
+ */
+static void
+vdev_raidz_io_done_unrecoverable(zio_t *zio)
+{
+	raidz_map_t *rm = zio->io_vsd;
+
+	for (int i = 0; i < rm->rm_nrows; i++) {
+		raidz_row_t *rr = rm->rm_row[i];
+
+		for (int c = 0; c < rr->rr_cols; c++) {
+			raidz_col_t *rc = &rr->rr_col[c];
+			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
+
+			if (rc->rc_error != 0)
+				continue;
+
+			zio_bad_cksum_t zbc;
+			zbc.zbc_has_cksum = 0;
+			zbc.zbc_injected = rm->rm_ecksuminjected;
+
+			int ret = zfs_ereport_start_checksum(zio->io_spa,
+			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
+			    rc->rc_size, (void *)(uintptr_t)c, &zbc);
+			if (ret != EALREADY) {
+				mutex_enter(&cvd->vdev_stat_lock);
+				cvd->vdev_stat.vs_checksum_errors++;
+				mutex_exit(&cvd->vdev_stat_lock);
+			}
+		}
+	}
+}
+
+void
+vdev_raidz_io_done(zio_t *zio)
+{
+	raidz_map_t *rm = zio->io_vsd;
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		for (int i = 0; i < rm->rm_nrows; i++) {
+			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
+		}
+	} else {
+		for (int i = 0; i < rm->rm_nrows; i++) {
+			raidz_row_t *rr = rm->rm_row[i];
+			rr->rr_code =
+			    vdev_raidz_io_done_reconstruct_known_missing(zio,
+			    rm, rr);
+		}
+
+		if (raidz_checksum_verify(zio) == 0) {
+			for (int i = 0; i < rm->rm_nrows; i++) {
+				raidz_row_t *rr = rm->rm_row[i];
+				vdev_raidz_io_done_verified(zio, rr);
+			}
+			zio_checksum_verified(zio);
+		} else {
+			/*
+			 * A sequential resilver has no checksum, which makes
+			 * combinatorial reconstruction impossible. This code
+			 * path is unreachable since raidz_checksum_verify()
+			 * has no checksum to verify and must succeed.
+			 */
+			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
+
+			/*
+			 * This isn't a typical situation -- either we got a
+			 * read error or a child silently returned bad data.
+			 * Read every block so we can try again with as much
+			 * data and parity as we can track down. If we've
+			 * already been through once before, all children will
+			 * be marked as tried so we'll proceed to combinatorial
+			 * reconstruction.
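+			 *
+			 * In other words, a read moves through up to three
+			 * phases, each more expensive and less likely than
+			 * the last: produce valid data from the columns
+			 * already read; read every column and try again;
+			 * and finally attempt combinatorial reconstruction.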
+ */ + int nread = 0; + for (int i = 0; i < rm->rm_nrows; i++) { + nread += vdev_raidz_read_all(zio, + rm->rm_row[i]); + } + if (nread != 0) { + /* + * Normally our stage is VDEV_IO_DONE, but if + * we've already called redone(), it will have + * changed to VDEV_IO_START, in which case we + * don't want to call redone() again. + */ + if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) + zio_vdev_io_redone(zio); + return; + } + + zio->io_error = vdev_raidz_combrec(zio); + if (zio->io_error == ECKSUM && + !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + vdev_raidz_io_done_unrecoverable(zio); + } + } + } +} + static void vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) { - if (faulted > vd->vdev_nparity) + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (faulted > vdrz->vd_nparity) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) @@ -2343,18 +2581,26 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) * width blocks must be resilvered. */ static boolean_t -vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) { + vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t dcols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; + uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = offset >> ashift; + uint64_t b = DVA_GET_OFFSET(dva) >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = ((psize - 1) >> ashift) + 1; /* The first column for this stripe. */ uint64_t f = b % dcols; + /* Unreachable by sequential resilver. */ + ASSERT3U(phys_birth, !=, TXG_UNKNOWN); + + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + if (s + nparity >= dcols) return (B_TRUE); @@ -2375,7 +2621,8 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) } static void -vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) +vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); @@ -2385,10 +2632,10 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) uint64_t ashift = raidvd->vdev_top->vdev_ashift; /* make sure the offsets are block-aligned */ - ASSERT0(in->rs_start % (1 << ashift)); - ASSERT0(in->rs_end % (1 << ashift)); - uint64_t b_start = in->rs_start >> ashift; - uint64_t b_end = in->rs_end >> ashift; + ASSERT0(logical_rs->rs_start % (1 << ashift)); + ASSERT0(logical_rs->rs_end % (1 << ashift)); + uint64_t b_start = logical_rs->rs_start >> ashift; + uint64_t b_end = logical_rs->rs_end >> ashift; uint64_t start_row = 0; if (b_start > tgt_col) /* avoid underflow */ @@ -2398,17 +2645,119 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) if (b_end > tgt_col) end_row = ((b_end - tgt_col - 1) / width) + 1; - res->rs_start = start_row << ashift; - res->rs_end = end_row << ashift; + physical_rs->rs_start = start_row << ashift; + physical_rs->rs_end = end_row << ashift; - ASSERT3U(res->rs_start, <=, in->rs_start); - ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); + ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); + ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, + logical_rs->rs_end - logical_rs->rs_start); +} + +/* + * 
Initialize private RAIDZ-specific fields from the nvlist.
+ */
+static int
+vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+	vdev_raidz_t *vdrz;
+	uint64_t nparity;
+
+	uint_t children;
+	nvlist_t **child;
+	int error = nvlist_lookup_nvlist_array(nv,
+	    ZPOOL_CONFIG_CHILDREN, &child, &children);
+	if (error != 0)
+		return (SET_ERROR(EINVAL));
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
+		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
+			return (SET_ERROR(EINVAL));
+
+		/*
+		 * Previous versions could only support 1 or 2 parity
+		 * devices.
+		 */
+		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
+			return (SET_ERROR(EINVAL));
+		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
+			return (SET_ERROR(EINVAL));
+	} else {
+		/*
+		 * We require the parity to be specified for SPAs that
+		 * support multiple parity levels.
+		 */
+		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
+			return (SET_ERROR(EINVAL));
+
+		/*
+		 * Otherwise, we default to 1 parity device for RAID-Z.
+		 */
+		nparity = 1;
+	}
+
+	vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
+	vdrz->vd_logical_width = children;
+	vdrz->vd_nparity = nparity;
+
+	*tsd = vdrz;
+
+	return (0);
+}
+
+static void
+vdev_raidz_fini(vdev_t *vd)
+{
+	kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
+}
+
+/*
+ * Add RAIDZ-specific fields to the config nvlist.
+ */
+static void
+vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
+	vdev_raidz_t *vdrz = vd->vdev_tsd;
+
+	/*
+	 * Make sure someone hasn't managed to sneak a fancy new vdev
+	 * into a crufty old storage pool.
+	 */
+	ASSERT(vdrz->vd_nparity == 1 ||
+	    (vdrz->vd_nparity <= 2 &&
+	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
+	    (vdrz->vd_nparity <= 3 &&
+	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
+
+	/*
+	 * Note that we'll add these even on storage pools where they
+	 * aren't strictly required -- older software will just ignore
+	 * them.
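+	 *
+	 * For example (illustrative), the config entry generated for a
+	 * raidz2 vdev carries nparity=2 alongside the usual type and
+	 * children entries.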
+ */ + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); +} + +static uint64_t +vdev_raidz_nparity(vdev_t *vd) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + return (vdrz->vd_nparity); +} + +static uint64_t +vdev_raidz_ndisks(vdev_t *vd) +{ + return (vd->vdev_children); } vdev_ops_t vdev_raidz_ops = { + .vdev_op_init = vdev_raidz_init, + .vdev_op_fini = vdev_raidz_fini, .vdev_op_open = vdev_raidz_open, .vdev_op_close = vdev_raidz_close, .vdev_op_asize = vdev_raidz_asize, + .vdev_op_min_asize = vdev_raidz_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_raidz_io_start, .vdev_op_io_done = vdev_raidz_io_done, .vdev_op_state_change = vdev_raidz_state_change, @@ -2417,6 +2766,11 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_raidz_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = vdev_raidz_config_generate, + .vdev_op_nparity = vdev_raidz_nparity, + .vdev_op_ndisks = vdev_raidz_ndisks, .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c index 9595a7b95..a8eca06f9 100644 --- a/module/zfs/vdev_raidz_math.c +++ b/module/zfs/vdev_raidz_math.c @@ -149,7 +149,7 @@ vdev_raidz_math_get_ops(void) * Select parity generation method for raidz_map */ int -vdev_raidz_math_generate(raidz_map_t *rm) +vdev_raidz_math_generate(raidz_map_t *rm, raidz_row_t *rr) { raidz_gen_f gen_parity = NULL; @@ -174,7 +174,7 @@ vdev_raidz_math_generate(raidz_map_t *rm) if (gen_parity == NULL) return (RAIDZ_ORIGINAL_IMPL); - gen_parity(rm); + gen_parity(rr); return (0); } @@ -241,8 +241,8 @@ reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, * @nbaddata - Number of failed data columns */ int -vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, - const int *dt, const int nbaddata) +vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr, + const int *parity_valid, const int *dt, const int nbaddata) { raidz_rec_f rec_fn = NULL; @@ -265,7 +265,7 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, if (rec_fn == NULL) return (RAIDZ_ORIGINAL_IMPL); else - return (rec_fn(rm, dt)); + return (rec_fn(rr, dt)); } const char *raidz_gen_name[] = { diff --git a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h index 89c2082c4..35e016fc6 100644 --- a/module/zfs/vdev_raidz_math_impl.h +++ b/module/zfs/vdev_raidz_math_impl.h @@ -26,6 +26,7 @@ #define _VDEV_RAIDZ_MATH_IMPL_H #include +#include #define raidz_inline inline __attribute__((always_inline)) #ifndef noinline @@ -36,33 +37,33 @@ * Functions calculate multiplication constants for data reconstruction. * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and * used parity columns for reconstruction. - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes * @coeff output array of coefficients. Array must be provided by * user and must hold minimum MUL_CNT values. 
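+ *
+ * For example (illustrative): for a Q-only reconstruction with
+ * ncols = 5 and missing data column x = 2, raidz_rec_q_coeff()
+ * below computes coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1)) =
+ * gf_exp2(253).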
*/ static noinline void -raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_q_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1)); } static noinline void -raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_r_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1)); } static noinline void -raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_pq_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; gf_t a, b, e; @@ -76,9 +77,9 @@ raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) } static noinline void -raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_pr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; @@ -93,9 +94,9 @@ raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) } static noinline void -raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_qr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; @@ -114,9 +115,9 @@ raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) } static noinline void -raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_pqr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; const unsigned z = tgtidx[TARGET_Z]; @@ -347,26 +348,26 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private) /* * Generate P parity (RAIDZ1) * - * @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline void -raidz_generate_p_impl(raidz_map_t * const rm) +raidz_generate_p_impl(raidz_row_t * const rr) { size_t c; - const size_t ncols = raidz_ncols(rm); - const size_t psize = rm->rm_col[CODE_P].rc_size; - abd_t *pabd = rm->rm_col[CODE_P].rc_abd; + const size_t ncols = rr->rr_cols; + const size_t psize = rr->rr_col[CODE_P].rc_size; + abd_t *pabd = rr->rr_col[CODE_P].rc_abd; size_t size; abd_t *dabd; raidz_math_begin(); /* start with first data column */ - raidz_copy(pabd, rm->rm_col[1].rc_abd, psize); + raidz_copy(pabd, rr->rr_col[1].rc_abd, psize); for (c = 2; c < ncols; c++) { - dabd = rm->rm_col[c].rc_abd; - size = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + size = rr->rr_col[c].rc_size; /* add data column */ raidz_add(pabd, dabd, size); @@ -414,29 +415,29 @@ raidz_gen_pq_add(void **c, const void *dc, const size_t csize, /* * Generate PQ parity (RAIDZ2) * - * @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline void -raidz_generate_pq_impl(raidz_map_t * const rm) 
+raidz_generate_pq_impl(raidz_row_t * const rr) { size_t c; - const size_t ncols = raidz_ncols(rm); - const size_t csize = rm->rm_col[CODE_P].rc_size; + const size_t ncols = rr->rr_cols; + const size_t csize = rr->rr_col[CODE_P].rc_size; size_t dsize; abd_t *dabd; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd }; raidz_math_begin(); - raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize); for (c = 3; c < ncols; c++) { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, raidz_gen_pq_add); @@ -487,31 +488,31 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, /* * Generate PQR parity (RAIDZ2) * - * @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline void -raidz_generate_pqr_impl(raidz_map_t * const rm) +raidz_generate_pqr_impl(raidz_row_t * const rr) { size_t c; - const size_t ncols = raidz_ncols(rm); - const size_t csize = rm->rm_col[CODE_P].rc_size; + const size_t ncols = rr->rr_cols; + const size_t csize = rr->rr_col[CODE_P].rc_size; size_t dsize; abd_t *dabd; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd, + rr->rr_col[CODE_R].rc_abd }; raidz_math_begin(); - raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize); for (c = 4; c < ncols; c++) { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, raidz_gen_pqr_add); @@ -579,33 +580,36 @@ raidz_generate_pqr_impl(raidz_map_t * const rm) * @syn_method raidz_add_abd() * @rec_method not applicable * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; - const size_t xsize = rm->rm_col[x].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; size_t size; abd_t *dabd; + if (xabd == NULL) + return (1 << CODE_P); + raidz_math_begin(); /* copy P into target */ - raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize); /* generate p_syndrome */ for (c = firstdc; c < ncols; c++) { if (c == x) continue; - dabd = rm->rm_col[c].rc_abd; - size = MIN(rm->rm_col[c].rc_size, xsize); + dabd = rr->rr_col[c].rc_abd; + size = MIN(rr->rr_col[c].rc_size, xsize); raidz_add(xabd, dabd, size); } @@ -653,30 +657,33 @@ raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize, * @syn_method raidz_add_abd() * @rec_method raidz_mul_abd_cb() * - 
* @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline int -raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; - abd_t *xabd = rm->rm_col[x].rc_abd; - const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; abd_t *tabds[] = { xabd }; + if (xabd == NULL) + return (1 << CODE_Q); + unsigned coeff[MUL_CNT]; - raidz_rec_q_coeff(rm, tgtidx, coeff); + raidz_rec_q_coeff(rr, tgtidx, coeff); raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); } @@ -687,8 +694,8 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, @@ -696,7 +703,7 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) } /* add Q to the syndrome */ - raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize); + raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); @@ -744,30 +751,33 @@ raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, * @syn_method raidz_add_abd() * @rec_method raidz_mul_abd_cb() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; - const size_t xsize = rm->rm_col[x].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; abd_t *tabds[] = { xabd }; + if (xabd == NULL) + return (1 << CODE_R); + unsigned coeff[MUL_CNT]; - raidz_rec_r_coeff(rm, tgtidx, coeff); + raidz_rec_r_coeff(rr, tgtidx, coeff); raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); } @@ -779,8 +789,8 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, @@ -788,7 +798,7 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) } /* add R to the syndrome */ - raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize); + raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); @@ -881,31 +891,34 @@ raidz_rec_pq_abd(void **tc, const size_t tsize, void **c, * @syn_method
raidz_syn_pq_abd() * @rec_method raidz_rec_pq_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd }; + if (xabd == NULL) + return ((1 << CODE_P) | (1 << CODE_Q)); + unsigned coeff[MUL_CNT]; - raidz_rec_pq_coeff(rm, tgtidx, coeff); + raidz_rec_pq_coeff(rr, tgtidx, coeff); /* * Check if some of the targets are shorter than others @@ -921,8 +934,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -934,8 +947,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, @@ -946,7 +959,7 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) /* Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); raidz_math_end(); @@ -1038,30 +1051,34 @@ raidz_rec_pr_abd(void **t, const size_t tsize, void **c, * @syn_method raidz_syn_pr_abd() * @rec_method raidz_rec_pr_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[0]; const size_t y = tgtidx[1]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_R].rc_abd }; + + if (xabd == NULL) + return ((1 << CODE_P) | (1 << CODE_R)); + unsigned coeff[MUL_CNT]; - raidz_rec_pr_coeff(rm, tgtidx,
coeff); + raidz_rec_pr_coeff(rr, tgtidx, coeff); /* * Check if some of the targets are shorter than others. @@ -1077,8 +1094,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1090,8 +1107,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, @@ -1104,14 +1121,14 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); raidz_math_end(); if (ysize < xsize) abd_free(yabd); - return ((1 << CODE_P) | (1 << CODE_Q)); + return ((1 << CODE_P) | (1 << CODE_R)); } @@ -1201,30 +1218,34 @@ raidz_rec_qr_abd(void **t, const size_t tsize, void **c, * @syn_method raidz_syn_qr_abd() * @rec_method raidz_rec_qr_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { - rm->rm_col[CODE_Q].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_Q].rc_abd, + rr->rr_col[CODE_R].rc_abd }; + + if (xabd == NULL) + return ((1 << CODE_Q) | (1 << CODE_R)); + unsigned coeff[MUL_CNT]; - raidz_rec_qr_coeff(rm, tgtidx, coeff); + raidz_rec_qr_coeff(rr, tgtidx, coeff); /* * Check if some of the targets are shorter than others @@ -1240,8 +1261,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1253,8 +1274,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, @@ -1267,7 +1288,7 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); raidz_math_end(); @@ -1384,34 +1405,38 @@ raidz_rec_pqr_abd(void **t, const size_t tsize, void **c, * @syn_method raidz_syn_pqr_abd() * @rec_method raidz_rec_pqr_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; const size_t z = tgtidx[TARGET_Z]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - const size_t zsize = rm->rm_col[z].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; - abd_t *zabd = rm->rm_col[z].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + const size_t zsize = rr->rr_col[z].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; + abd_t *zabd = rr->rr_col[z].rc_abd; abd_t *tabds[] = { xabd, yabd, zabd }; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd, + rr->rr_col[CODE_R].rc_abd }; + + if (xabd == NULL) + return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R)); + unsigned coeff[MUL_CNT]; - raidz_rec_pqr_coeff(rm, tgtidx, coeff); + raidz_rec_pqr_coeff(rr, tgtidx, coeff); /* * Check if some of the targets are shorter than others @@ -1431,9 +1456,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1446,8 +1471,8 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3, @@ -1460,9 +1485,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); if (zsize < xsize) - raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize); + raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize); raidz_math_end(); diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 3362d608c..784d1af15 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -25,6 +25,7 @@ */ #include +#include #include #include #include @@ -63,13 +64,15 @@ * * Limitations: * - * - Only supported for mirror vdev types. Due to the variable stripe - * width used by raidz sequential reconstruction is not possible. + * - Sequential reconstruction is not possible on RAIDZ due to its + * variable stripe width.
Note dRAID uses a fixed stripe width which + avoids this issue, but comes at the expense of some usable capacity. * - * - Block checksums are not verified during sequential reconstuction. + * - Block checksums are not verified during sequential reconstruction. * Similar to traditional RAID the parity/mirror data is reconstructed * but cannot be immediately double checked. For this reason when the - * last active resilver completes the pool is automatically scrubbed. + * last active resilver completes the pool is automatically scrubbed + * by default. * * - Deferred resilvers using sequential reconstruction are not currently * supported. When adding another vdev to an active top-level resilver @@ -77,8 +80,8 @@ * * Advantages: * - * - Sequential reconstuction is performed in LBA order which may be faster - * than healing reconstuction particularly when using using HDDs (or + * - Sequential reconstruction is performed in LBA order which may be faster + * than healing reconstruction particularly when using HDDs (or * especially with SMR devices). Only allocated capacity is resilvered. * * - Sequential reconstruction is not constrained by ZFS block boundaries. @@ -86,9 +89,9 @@ * allowing all of these logical blocks to be repaired with a single IO. * * - Unlike a healing resilver or scrub which are pool wide operations, - * sequential reconstruction is handled by the top-level mirror vdevs. - * This allows for it to be started or canceled on a top-level vdev - * without impacting any other top-level vdevs in the pool. + * sequential reconstruction is handled by the top-level vdevs. This + * allows for it to be started or canceled on a top-level vdev without + * impacting any other top-level vdevs in the pool. * * - Data only referenced by a pool checkpoint will be repaired because * that space is reflected in the space maps. This differs for a @@ -97,18 +100,36 @@ /* - * Maximum number of queued rebuild I/Os top-level vdev. The number of - * concurrent rebuild I/Os issued to the device is controlled by the - * zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module - * options. - */ -unsigned int zfs_rebuild_queue_limit = 20; - -/* - * Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE. + * Size of rebuild reads; defaults to 1MiB per data disk and is capped at + * SPA_MAXBLOCKSIZE. */ unsigned long zfs_rebuild_max_segment = 1024 * 1024; +/* + * Maximum number of bytes in flight per leaf vdev caused by a sequential + * resilver. We attempt to strike a balance here between keeping the vdev + * queues full of I/Os at all times and not overflowing the queues, which + * would cause long latency and long txg sync times. + * + * A large default value can be safely used here because the default target + * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep + * the queue depth short. + * + * 32MB was selected as the default value to achieve good performance with + * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential + * rebuild was unable to saturate all of the drives using smaller values. + * With a value of 32MB the sequential resilver write rate was measured at + * 800MB/s sustained while rebuilding to a distributed spare. + */ +unsigned long zfs_rebuild_vdev_limit = 32 << 20; + +/* + * Automatically start a pool scrub when the last active sequential resilver + * completes in order to verify the checksums of all blocks which have been + * resilvered. This option is enabled by default and is strongly recommended. + */ +int zfs_rebuild_scrub_enabled = 1; +
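The two tunables above feed a single throttle: later in this diff, vdev_rebuild_thread() derives vr_bytes_inflight_max from zfs_rebuild_vdev_limit and the child count, and vdev_rebuild_range() sleeps on vr_io_cv until vr_bytes_inflight drops back under that cap. A minimal standalone sketch of the cap derivation follows; it merely restates the patch's logic, and the helper name is hypothetical:

#include <stdint.h>

/*
 * Sketch only, not part of the patch: restates the vr_bytes_inflight_max
 * calculation performed in vdev_rebuild_thread() below.
 */
static uint64_t
rebuild_inflight_cap(uint64_t rebuild_vdev_limit, uint64_t vdev_children)
{
	uint64_t cap = rebuild_vdev_limit * vdev_children;

	/* Clamp to a 1MiB floor so a tiny limit cannot stall the rebuild. */
	return (cap < (1ULL << 20) ? (1ULL << 20) : cap);
}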
/* * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). */ @@ -293,7 +314,7 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); - vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); + vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); spa_history_log_internal(spa, "rebuild", tx, @@ -306,7 +327,16 @@ vd->vdev_rebuilding = B_FALSE; mutex_exit(&vd->vdev_rebuild_lock); - spa_notify_waiters(spa); + /* + * While we're in syncing context, take the opportunity to + * set up the scrub when there are no more active rebuilds. + */ + if (!vdev_rebuild_active(spa->spa_root_vdev) && + zfs_rebuild_scrub_enabled) { + pool_scan_func_t func = POOL_SCAN_SCRUB; + dsl_scan_setup_sync(&func, tx); + } + cv_broadcast(&vd->vdev_rebuild_cv); } @@ -438,7 +468,7 @@ vdev_rebuild_cb(zio_t *zio) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vdev_t *vd = vr->vr_top_vdev; - mutex_enter(&vd->vdev_rebuild_io_lock); + mutex_enter(&vr->vr_io_lock); if (zio->io_error == ENXIO && !vdev_writeable(vd)) { /* * The I/O failed because the top-level vdev was unavailable. @@ -455,34 +485,30 @@ abd_free(zio->io_abd); - ASSERT3U(vd->vdev_rebuild_inflight, >, 0); - vd->vdev_rebuild_inflight--; - cv_broadcast(&vd->vdev_rebuild_io_cv); - mutex_exit(&vd->vdev_rebuild_io_lock); + ASSERT3U(vr->vr_bytes_inflight, >, 0); + vr->vr_bytes_inflight -= zio->io_size; + cv_broadcast(&vr->vr_io_cv); + mutex_exit(&vr->vr_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* - * Rebuild the data in this range by constructing a special dummy block - * pointer for the given range. It has no relation to any existing blocks - * in the pool. But by disabling checksum verification and issuing a scrub - * I/O mirrored vdevs will replicate the block using any available mirror - * leaf vdevs. + * Initialize a block pointer that can be used to read the given segment + * for sequential rebuild. */ static void -vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize, - uint64_t txg) +vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start, + uint64_t asize) { - vdev_t *vd = vr->vr_top_vdev; - spa_t *spa = vd->vdev_spa; - uint64_t psize = asize; - - ASSERT(vd->vdev_ops == &vdev_mirror_ops || + ASSERT(vd->vdev_ops == &vdev_draid_ops || + vd->vdev_ops == &vdev_mirror_ops || vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); - blkptr_t blk, *bp = &blk; + uint64_t psize = vd->vdev_ops == &vdev_draid_ops ? + vdev_draid_asize_to_psize(vd, asize) : asize; + BP_ZERO(bp); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); @@ -499,19 +525,6 @@ vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize, BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - /* - * We increment the issued bytes by the asize rather than the psize - * so the scanned and issued bytes may be directly compared. This - * is consistent with the scrub/resilver issued reporting.
- */ - vr->vr_pass_bytes_issued += asize; - vr->vr_rebuild_phys.vrp_bytes_issued += asize; - - zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp, - abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, - ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | - ZIO_FLAG_RESILVER, NULL)); } /* @@ -525,6 +538,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; vdev_t *vd = vr->vr_top_vdev; spa_t *spa = vd->vdev_spa; + blkptr_t blk; ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); @@ -532,14 +546,26 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) vr->vr_pass_bytes_scanned += size; vr->vr_rebuild_phys.vrp_bytes_scanned += size; - mutex_enter(&vd->vdev_rebuild_io_lock); + /* + * Rebuild the data in this range by constructing a special block + * pointer. It has no relation to any existing blocks in the pool. + * However, by disabling checksum verification and issuing a scrub IO + * we can reconstruct and repair any children with missing data. + */ + vdev_rebuild_blkptr_init(&blk, vd, start, size); + uint64_t psize = BP_GET_PSIZE(&blk); + + if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) + return (0); + + mutex_enter(&vr->vr_io_lock); /* Limit in flight rebuild I/Os */ - while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit) - cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max) + cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); - vd->vdev_rebuild_inflight++; - mutex_exit(&vd->vdev_rebuild_io_lock); + vr->vr_bytes_inflight += psize; + mutex_exit(&vr->vr_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); @@ -558,45 +584,29 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) /* When exiting write out our progress. */ if (vdev_rebuild_should_stop(vd)) { - mutex_enter(&vd->vdev_rebuild_io_lock); - vd->vdev_rebuild_inflight--; - mutex_exit(&vd->vdev_rebuild_io_lock); + mutex_enter(&vr->vr_io_lock); + vr->vr_bytes_inflight -= psize; + mutex_exit(&vr->vr_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); mutex_exit(&vd->vdev_rebuild_lock); dmu_tx_commit(tx); return (SET_ERROR(EINTR)); } mutex_exit(&vd->vdev_rebuild_lock); - - vr->vr_scan_offset[txg & TXG_MASK] = start + size; - vdev_rebuild_rebuild_block(vr, start, size, txg); - dmu_tx_commit(tx); + vr->vr_scan_offset[txg & TXG_MASK] = start + size; + vr->vr_pass_bytes_issued += size; + vr->vr_rebuild_phys.vrp_bytes_issued += size; + + zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk, + abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, + ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | + ZIO_FLAG_RESILVER, NULL)); + return (0); } -/* - * Split range into legally-sized logical chunks given the constraints of the - * top-level mirror vdev type. 
- */ -static uint64_t -vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size) -{ - uint64_t chunk_size, max_asize, max_segment; - - ASSERT(vd->vdev_ops == &vdev_mirror_ops || - vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - - max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment, - 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE); - max_asize = vdev_psize_to_asize(vd, max_segment); - chunk_size = MIN(size, max_asize); - - return (chunk_size); -} - /* * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree. */ @@ -625,7 +635,14 @@ vdev_rebuild_ranges(vdev_rebuild_t *vr) while (size > 0) { uint64_t chunk_size; - chunk_size = vdev_rebuild_chunk_size(vd, start, size); + /* + * Split range into legally-sized logical chunks + * given the constraints of the top-level vdev + * being rebuilt (dRAID or mirror). + */ + ASSERT3P(vd->vdev_ops, !=, NULL); + chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd, + start, size, zfs_rebuild_max_segment); error = vdev_rebuild_range(vr, start, chunk_size); if (error != 0) @@ -747,10 +764,16 @@ vdev_rebuild_thread(void *arg) vr->vr_top_vdev = vd; vr->vr_scan_msp = NULL; vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL); + vr->vr_pass_start_time = gethrtime(); vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; + vr->vr_bytes_inflight_max = MAX(1ULL << 20, + zfs_rebuild_vdev_limit * vd->vdev_children); + uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); @@ -780,21 +803,32 @@ vdev_rebuild_thread(void *arg) ASSERT0(range_tree_space(vr->vr_scan_tree)); - /* - * Disable any new allocations to this metaslab and wait - * for any writes inflight to complete. This is needed to - * ensure all allocated ranges are rebuilt. - */ + /* Disable any new allocations to this metaslab */ metaslab_disable(msp); spa_config_exit(spa, SCL_CONFIG, FTAG); - txg_wait_synced(dsl, 0); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); + /* + * If there are outstanding allocations, wait for them to be + * synced. This is needed to ensure all allocated ranges are + * on disk and therefore will be rebuilt. + */ + for (int j = 0; j < TXG_SIZE; j++) { + if (range_tree_space(msp->ms_allocating[j])) { + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + txg_wait_synced(dsl, 0); + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + break; + } + } + /* * When a metaslab has been allocated from, read its allocated - * ranges from the space map object in to the vr_scan_tree. + * ranges from the space map object into the vr_scan_tree. * Then add inflight / unflushed ranges and remove inflight / * unflushed frees. This is the minimum range to be rebuilt. */ @@ -827,7 +861,7 @@ vdev_rebuild_thread(void *arg) /* * To provide an accurate estimate, re-calculate the estimated * size every 5 minutes to account for recent allocations and - * frees made space maps which have not yet been rebuilt. + * frees made to space maps which have not yet been rebuilt.
*/ if (gethrtime() > update_est_time + SEC2NSEC(300)) { update_est_time = gethrtime(); @@ -851,11 +885,14 @@ vdev_rebuild_thread(void *arg) spa_config_exit(spa, SCL_CONFIG, FTAG); /* Wait for any remaining rebuild I/O to complete */ - mutex_enter(&vd->vdev_rebuild_io_lock); - while (vd->vdev_rebuild_inflight > 0) - cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + mutex_enter(&vr->vr_io_lock); + while (vr->vr_bytes_inflight > 0) + cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); - mutex_exit(&vd->vdev_rebuild_io_lock); + mutex_exit(&vr->vr_io_lock); + + mutex_destroy(&vr->vr_io_lock); + cv_destroy(&vr->vr_io_cv); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1100,5 +1137,11 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, - "Max segment size in bytes of rebuild reads"); + "Max segment size in bytes of rebuild reads"); + +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW, + "Max bytes in flight per leaf vdev for sequential resilvers"); + +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW, + "Automatically scrub after sequential resilver completes"); /* END CSTYLED */ diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index ed7d1d4b3..4606af9aa 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -250,7 +250,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) spa_vdev_removal_t *svr = NULL; uint64_t txg __maybe_unused = dmu_tx_get_txg(tx); - ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + ASSERT0(vdev_get_nparity(vd)); svr = spa_vdev_removal_create(vd); ASSERT(vd->vdev_removing); @@ -1120,7 +1120,7 @@ static void vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { ASSERT3P(zlist, !=, NULL); - ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + ASSERT0(vdev_get_nparity(vd)); if (vd->vdev_leaf_zap != 0) { char zkey[32]; @@ -2041,7 +2041,7 @@ spa_vdev_remove_top_check(vdev_t *vd) /* * All vdevs in normal class must have the same ashift - * and not be raidz. + * and not be raidz or draid. */ vdev_t *rvd = spa->spa_root_vdev; int num_indirect = 0; @@ -2064,7 +2064,7 @@ spa_vdev_remove_top_check(vdev_t *vd) num_indirect++; if (!vdev_is_concrete(cvd)) continue; - if (cvd->vdev_ops == &vdev_raidz_ops) + if (vdev_get_nparity(cvd) != 0) return (SET_ERROR(EINVAL)); /* * Need the mirror to be mirror of leaf vdevs only @@ -2217,18 +2217,30 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) * in this pool. 
*/ if (vd == NULL || unspare) { - if (vd == NULL) - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_AUX); + char *type; + boolean_t draid_spare = B_FALSE; - vd_type = VDEV_TYPE_SPARE; - vd_path = spa_strdup(fnvlist_lookup_string( - nv, ZPOOL_CONFIG_PATH)); - spa_vdev_remove_aux(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares, nv); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) + == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) + draid_spare = B_TRUE; + + if (vd == NULL && draid_spare) { + error = SET_ERROR(ENOTSUP); + } else { + if (vd == NULL) + vd = spa_lookup_by_guid(spa, + guid, B_TRUE); + ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_AUX); + + vd_type = VDEV_TYPE_SPARE; + vd_path = spa_strdup(fnvlist_lookup_string( + nv, ZPOOL_CONFIG_PATH)); + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } } else { error = SET_ERROR(EBUSY); } diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 9e8aac7d0..45ddc2f71 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -142,9 +142,13 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded) } vdev_ops_t vdev_root_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_root_open, .vdev_op_close = vdev_root_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = NULL, /* not applicable to the root */ .vdev_op_io_done = NULL, /* not applicable to the root */ .vdev_op_state_change = vdev_root_state_change, @@ -153,6 +157,11 @@ vdev_ops_t vdev_root_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 02b42ddd5..895957bda 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -311,7 +311,8 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, vd->vdev_trim_secure = secure; } - boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED); + vdev_trim_state_t old_state = vd->vdev_trim_state; + boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED); vd->vdev_trim_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); @@ -332,9 +333,12 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, "vdev=%s suspended", vd->vdev_path); break; case VDEV_TRIM_CANCELED: - spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL); - spa_history_log_internal(spa, "trim", tx, - "vdev=%s canceled", vd->vdev_path); + if (old_state == VDEV_TRIM_ACTIVE || + old_state == VDEV_TRIM_SUSPENDED) { + spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL); + spa_history_log_internal(spa, "trim", tx, + "vdev=%s canceled", vd->vdev_path); + } break; case VDEV_TRIM_COMPLETE: spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH); @@ -601,6 +605,32 @@ vdev_trim_ranges(trim_args_t *ta) return (0); } +static void +vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) +{ + uint64_t *last_rs_end = (uint64_t *)arg; + + if (physical_rs->rs_end > *last_rs_end) + *last_rs_end 
= physical_rs->rs_end; +} + +static void +vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs) +{ + vdev_t *vd = (vdev_t *)arg; + + uint64_t size = physical_rs->rs_end - physical_rs->rs_start; + vd->vdev_trim_bytes_est += size; + + if (vd->vdev_trim_last_offset >= physical_rs->rs_end) { + vd->vdev_trim_bytes_done += size; + } else if (vd->vdev_trim_last_offset > physical_rs->rs_start && + vd->vdev_trim_last_offset <= physical_rs->rs_end) { + vd->vdev_trim_bytes_done += + vd->vdev_trim_last_offset - physical_rs->rs_start; + } +} + /* * Calculates the completion percentage of a manual TRIM. */ @@ -618,27 +648,35 @@ vdev_trim_calculate_progress(vdev_t *vd) metaslab_t *msp = vd->vdev_top->vdev_ms[i]; mutex_enter(&msp->ms_lock); - uint64_t ms_free = msp->ms_size - - metaslab_allocated_space(msp); - - if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) - ms_free /= vd->vdev_top->vdev_children; + uint64_t ms_free = (msp->ms_size - + metaslab_allocated_space(msp)) / + vdev_get_ndisks(vd->vdev_top); /* * Convert the metaslab range to a physical range * on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ - range_seg64_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; - vdev_xlate(vd, &logical_rs, &physical_rs); + /* Metaslab space after this offset has not been trimmed. */ + vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs); if (vd->vdev_trim_last_offset <= physical_rs.rs_start) { vd->vdev_trim_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; - } else if (vd->vdev_trim_last_offset > physical_rs.rs_end) { + } + + /* Metaslab space before this offset has been trimmed */ + uint64_t last_rs_end = physical_rs.rs_end; + if (!vdev_xlate_is_empty(&remain_rs)) { + vdev_xlate_walk(vd, &remain_rs, + vdev_trim_xlate_last_rs_end, &last_rs_end); + } + + if (vd->vdev_trim_last_offset > last_rs_end) { vd->vdev_trim_bytes_done += ms_free; vd->vdev_trim_bytes_est += ms_free; mutex_exit(&msp->ms_lock); @@ -659,21 +697,9 @@ vdev_trim_calculate_progress(vdev_t *vd) rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) { logical_rs.rs_start = rs_get_start(rs, rt); logical_rs.rs_end = rs_get_end(rs, rt); - vdev_xlate(vd, &logical_rs, &physical_rs); - uint64_t size = physical_rs.rs_end - - physical_rs.rs_start; - vd->vdev_trim_bytes_est += size; - if (vd->vdev_trim_last_offset >= physical_rs.rs_end) { - vd->vdev_trim_bytes_done += size; - } else if (vd->vdev_trim_last_offset > - physical_rs.rs_start && - vd->vdev_trim_last_offset <= - physical_rs.rs_end) { - vd->vdev_trim_bytes_done += - vd->vdev_trim_last_offset - - physical_rs.rs_start; - } + vdev_xlate_walk(vd, &logical_rs, + vdev_trim_xlate_progress, vd); } mutex_exit(&msp->ms_lock); } @@ -741,8 +767,38 @@ vdev_trim_load(vdev_t *vd) return (err); } +static void +vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs) +{ + trim_args_t *ta = arg; + vdev_t *vd = ta->trim_vdev; + + /* + * Only a manual trim will be traversing the vdev sequentially. + * For an auto trim all valid ranges should be added. + */ + if (ta->trim_type == TRIM_TYPE_MANUAL) { + + /* Only add segments that we have not visited yet */ + if (physical_rs->rs_end <= vd->vdev_trim_last_offset) + return; + + /* Pick up where we left off mid-range. 
*/ + if (vd->vdev_trim_last_offset > physical_rs->rs_start) { + ASSERT3U(physical_rs->rs_end, >, + vd->vdev_trim_last_offset); + physical_rs->rs_start = vd->vdev_trim_last_offset; + } + } + + ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start); + + range_tree_add(ta->trim_tree, physical_rs->rs_start, + physical_rs->rs_end - physical_rs->rs_start); +} + /* - * Convert the logical range into a physical range and add it to the + * Convert the logical range into physical ranges and add them to the * range tree passed in the trim_args_t. */ static void @@ -750,7 +806,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) { trim_args_t *ta = arg; vdev_t *vd = ta->trim_vdev; - range_seg64_t logical_rs, physical_rs; + range_seg64_t logical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; @@ -767,44 +823,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) } ASSERT(vd->vdev_ops->vdev_op_leaf); - vdev_xlate(vd, &logical_rs, &physical_rs); - - IMPLY(vd->vdev_top == vd, - logical_rs.rs_start == physical_rs.rs_start); - IMPLY(vd->vdev_top == vd, - logical_rs.rs_end == physical_rs.rs_end); - - /* - * Only a manual trim will be traversing the vdev sequentially. - * For an auto trim all valid ranges should be added. - */ - if (ta->trim_type == TRIM_TYPE_MANUAL) { - - /* Only add segments that we have not visited yet */ - if (physical_rs.rs_end <= vd->vdev_trim_last_offset) - return; - - /* Pick up where we left off mid-range. */ - if (vd->vdev_trim_last_offset > physical_rs.rs_start) { - ASSERT3U(physical_rs.rs_end, >, - vd->vdev_trim_last_offset); - physical_rs.rs_start = vd->vdev_trim_last_offset; - } - } - - ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); - - /* - * With raidz, it's possible that the logical range does not live on - * this leaf vdev. We only add the physical range to this vdev's if it - * has a length greater than 0. - */ - if (physical_rs.rs_end > physical_rs.rs_start) { - range_tree_add(ta->trim_tree, physical_rs.rs_start, - physical_rs.rs_end - physical_rs.rs_start); - } else { - ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); - } + vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg); } /* diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index a8341f50b..ea71ef325 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1111,7 +1111,9 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, bcopy(info, report->zcr_ckinfo, sizeof (*info)); } - report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; + report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift; + report->zcr_align = + vdev_psize_to_asize(vd->vdev_top, report->zcr_sector); report->zcr_length = length; #ifdef _KERNEL diff --git a/module/zfs/zio.c b/module/zfs/zio.c index ccba6cea3..982940dbd 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1702,16 +1702,16 @@ zio_write_compress(zio_t *zio) return (zio); } else { /* - * Round up compressed size up to the ashift - * of the smallest-ashift device, and zero the tail. - * This ensures that the compressed size of the BP - * (and thus compressratio property) are correct, + * Round compressed size up to the minimum allocation + * size of the smallest-ashift device, and zero the + * tail. This ensures that the compressed size of the + * BP (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. 
*/ - ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); - size_t rounded = (size_t)P2ROUNDUP(psize, - 1ULL << spa->spa_min_ashift); + ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSIZE); + size_t rounded = (size_t)roundup(psize, + spa->spa_min_alloc); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); @@ -3754,19 +3754,37 @@ zio_vdev_io_start(zio_t *zio) * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. + * + * Leaf DTL_PARTIAL can be empty when a legitimate write comes from + * a dRAID spare vdev. For example, when a dRAID spare is first + * used, its spare blocks need to be written to but the leaf vdevs + * of such blocks can have an empty DTL_PARTIAL. + * + * There seemed to be no clean way to allow such writes while + * bypassing spurious ones. At this point, just avoid all bypassing + * for dRAID for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && + vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } - if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { + /* + * Select the next best leaf I/O to process. Distributed spares are + * excluded since they dispatch the I/O directly to a leaf vdev after + * applying the dRAID mapping. + */ + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops && + (zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_TRIM)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) return (zio); @@ -3803,8 +3821,8 @@ zio_vdev_io_done(zio_t *zio) if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; - if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { - + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) @@ -4206,7 +4224,7 @@ zio_checksum_verify(zio_t *zio) if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); - ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); + ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index fb8ce0916..e56ea8868 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -265,6 +265,12 @@ zio_handle_fault_injection(zio_t *zio, int error) if (zio->io_type != ZIO_TYPE_READ) return (0); + /* + * A rebuild I/O has no checksum to verify.
+ */ + if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM) + return (0); + rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; diff --git a/scripts/Makefile.am b/scripts/Makefile.am index 9d3994752..af7c36c8f 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -36,6 +36,7 @@ export ZPOOL_SCRIPT_DIR=$$CMD_DIR/zpool/zpool.d export ZPOOL_SCRIPTS_PATH=$$CMD_DIR/zpool/zpool.d export CONTRIB_DIR=@abs_top_builddir@/contrib export LIB_DIR=@abs_top_builddir@/lib +export SYSCONF_DIR=@abs_top_builddir@/etc export INSTALL_UDEV_DIR=@udevdir@ export INSTALL_UDEV_RULE_DIR=@udevruledir@ diff --git a/scripts/zfs-helpers.sh b/scripts/zfs-helpers.sh index 02b492200..f4edd48e8 100755 --- a/scripts/zfs-helpers.sh +++ b/scripts/zfs-helpers.sh @@ -166,6 +166,8 @@ if [ "${INSTALL}" = "yes" ]; then "$INSTALL_UDEV_RULE_DIR/90-zfs.rules" install "$CMD_DIR/zpool/zpool.d" \ "$INSTALL_SYSCONF_DIR/zfs/zpool.d" + install "$SYSCONF_DIR/zfs/draid.d" \ + "$INSTALL_SYSCONF_DIR/zfs/draid.d" install "$CONTRIB_DIR/pyzfs/libzfs_core" \ "$INSTALL_PYTHON_DIR/libzfs_core" # Ideally we would install these in the configured ${libdir}, which is @@ -185,6 +187,7 @@ else remove "$INSTALL_UDEV_RULE_DIR/69-vdev.rules" remove "$INSTALL_UDEV_RULE_DIR/90-zfs.rules" remove "$INSTALL_SYSCONF_DIR/zfs/zpool.d" + remove "$INSTALL_SYSCONF_DIR/zfs/draid.d" remove "$INSTALL_PYTHON_DIR/libzfs_core" remove "/lib/libzfs_core.so" remove "/lib/libnvpair.so" diff --git a/scripts/zloop.sh b/scripts/zloop.sh index 3d9baaf0e..bbe326aa0 100755 --- a/scripts/zloop.sh +++ b/scripts/zloop.sh @@ -18,6 +18,7 @@ # # Copyright (c) 2015 by Delphix. All rights reserved. # Copyright (C) 2016 Lawrence Livermore National Security, LLC. +# Copyright (c) 2017, Intel Corporation. 
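To restate the zio_inject change above: sequential rebuild reads are issued raw against a synthetic block pointer, so an injected ECKSUM could never actually be observed, and the handler now declines to inject it. A self-contained sketch of that guard, with simplified parameters and a hypothetical function name:

#include <errno.h>

/* ZFS defines ECKSUM internally; libspl maps it to EBADE on Linux. */
#ifndef ECKSUM
#define	ECKSUM	EBADE
#endif

/* Sketch of the guard added to zio_handle_fault_injection() above. */
static int
fault_applies_sketch(int is_read, int is_rebuild_priority, int error)
{
	if (!is_read)
		return (0);	/* only read I/O is considered */

	/* A rebuild I/O has no checksum to verify, so never feed it ECKSUM. */
	if (is_rebuild_priority && error == ECKSUM)
		return (0);

	return (1);
}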
# BASE_DIR=$(dirname "$0") @@ -246,27 +247,60 @@ while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do or_die rm -rf "$workdir" or_die mkdir "$workdir" - # switch between common arrangements & fully randomized - if [[ $((RANDOM % 2)) -eq 0 ]]; then - mirrors=2 - raidz=0 - parity=1 - vdevs=2 - else - mirrors=$(((RANDOM % 3) * 1)) - parity=$(((RANDOM % 3) + 1)) - raidz=$((((RANDOM % 9) + parity + 1) * (RANDOM % 2))) - vdevs=$(((RANDOM % 3) + 3)) - fi + # switch between three types of configs + # 1/3 basic, 1/3 raidz mix, and 1/3 draid mix + choice=$((RANDOM % 3)) + + # ashift range 9 - 15 align=$(((RANDOM % 2) * 3 + 9)) - runtime=$((RANDOM % 100)) + + # randomly use special classes + class="special=random" + + if [[ $choice -eq 0 ]]; then + # basic mirror only + parity=1 + mirrors=2 + draid_data=0 + draid_spares=0 + raid_children=0 + vdevs=2 + raid_type="raidz" + elif [[ $choice -eq 1 ]]; then + # fully randomized mirror/raidz (sans dRAID) + parity=$(((RANDOM % 3) + 1)) + mirrors=$(((RANDOM % 3) * 1)) + draid_data=0 + draid_spares=0 + raid_children=$((((RANDOM % 9) + parity + 1) * (RANDOM % 2))) + vdevs=$(((RANDOM % 3) + 3)) + raid_type="raidz" + else + # fully randomized dRAID (sans mirror/raidz) + parity=$(((RANDOM % 3) + 1)) + mirrors=0 + draid_data=$(((RANDOM % 8) + 3)) + draid_spares=$(((RANDOM % 2) + parity)) + stripe=$((draid_data + parity)) + extra=$((draid_spares + (RANDOM % 4))) + raid_children=$(((((RANDOM % 4) + 1) * stripe) + extra)) + vdevs=$((RANDOM % 3)) + raid_type="draid" + fi + + # run from 30 to 120 seconds + runtime=$(((RANDOM % 90) + 30)) passtime=$((RANDOM % (runtime / 3 + 1) + 10)) + zopt="$zopt -K $raid_type" zopt="$zopt -m $mirrors" - zopt="$zopt -r $raidz" + zopt="$zopt -r $raid_children" + zopt="$zopt -D $draid_data" + zopt="$zopt -S $draid_spares" zopt="$zopt -R $parity" zopt="$zopt -v $vdevs" zopt="$zopt -a $align" + zopt="$zopt -C $class" zopt="$zopt -T $runtime" zopt="$zopt -P $passtime" zopt="$zopt -s $size" diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 1a4693b7d..c91da0a45 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -333,6 +333,8 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', 'zpool_create_024_pos', 'zpool_create_encrypted', 'zpool_create_crypt_combos', + 'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos', + 'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_features_005_pos', @@ -375,7 +377,7 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos', 'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg', 'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos', - 'zpool_import_015_pos', + 'zpool_import_015_pos', 'zpool_import_016_pos', 'zpool_import_017_pos', 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos', 'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos', @@ -710,12 +712,14 @@ tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', tags = ['functional', 'redacted_send'] [tests/functional/raidz] -tests = ['raidz_001_neg', 'raidz_002_pos'] +tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 
'raidz_004_pos'] tags = ['functional', 'raidz'] [tests/functional/redundancy] -tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos', - 'redundancy_004_neg'] +tests = ['redundancy_draid1', 'redundancy_draid2', 'redundancy_draid3', + 'redundancy_draid_spare1', 'redundancy_draid_spare2', + 'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz1', + 'redundancy_raidz2', 'redundancy_raidz3', 'redundancy_stripe'] tags = ['functional', 'redundancy'] [tests/functional/refquota] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 6b5cd191c..50fc96475 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -218,6 +218,7 @@ maybe = { 'no_space/enospc_002_pos': ['FAIL', enospc_reason], 'projectquota/setup': ['SKIP', exec_reason], 'redundancy/redundancy_004_neg': ['FAIL', '7290'], + 'redundancy/redundancy_draid_spare3': ['SKIP', known_reason], 'reservation/reservation_008_pos': ['FAIL', '7741'], 'reservation/reservation_018_pos': ['FAIL', '5642'], 'rsend/rsend_019_pos': ['FAIL', '6086'], diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index bf54c1d45..7fe9a2c57 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -6,6 +6,7 @@ SUBDIRS = \ chg_usr_exec \ devname2devid \ dir_rd_update \ + draid \ file_check \ file_trunc \ file_write \ diff --git a/tests/zfs-tests/cmd/draid/.gitignore b/tests/zfs-tests/cmd/draid/.gitignore new file mode 100644 index 000000000..911b9f077 --- /dev/null +++ b/tests/zfs-tests/cmd/draid/.gitignore @@ -0,0 +1 @@ +/draid diff --git a/tests/zfs-tests/cmd/draid/Makefile.am b/tests/zfs-tests/cmd/draid/Makefile.am new file mode 100644 index 000000000..69fed7a6b --- /dev/null +++ b/tests/zfs-tests/cmd/draid/Makefile.am @@ -0,0 +1,15 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +AM_CFLAGS += $(ZLIB_CFLAGS) + +pkgexec_PROGRAMS = draid + +draid_SOURCES = draid.c + +draid_LDADD = \ + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +draid_LDADD += $(ZLIB_LIBS) diff --git a/tests/zfs-tests/cmd/draid/draid.c b/tests/zfs-tests/cmd/draid/draid.c new file mode 100644 index 000000000..861c6ba1a --- /dev/null +++ b/tests/zfs-tests/cmd/draid/draid.c @@ -0,0 +1,1414 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include + +/* + * The number of rows to generate for new permutation maps. 
+ */ +#define MAP_ROWS_DEFAULT 256 + +/* + * Key values for dRAID maps when stored as nvlists. + */ +#define MAP_SEED "seed" +#define MAP_CHECKSUM "checksum" +#define MAP_WORST_RATIO "worst_ratio" +#define MAP_AVG_RATIO "avg_ratio" +#define MAP_CHILDREN "children" +#define MAP_NPERMS "nperms" +#define MAP_PERMS "perms" + +static void +draid_usage(void) +{ + (void) fprintf(stderr, + "usage: draid command args ...\n" + "Available commands are:\n" + "\n" + "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n" + "\tdraid verify [-rv] FILE\n" + "\tdraid dump [-v] [-m min] [-n max] FILE\n" + "\tdraid table FILE\n" + "\tdraid merge FILE SRC SRC...\n"); + exit(1); +} + +static int +read_map(const char *filename, nvlist_t **allcfgs) +{ + int block_size = 131072; + int buf_size = 131072; + int tmp_size, error; + char *tmp_buf; + + struct stat64 stat; + if (lstat64(filename, &stat) != 0) + return (errno); + + if (stat.st_size == 0 || + !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) { + return (EINVAL); + } + + gzFile fp = gzopen(filename, "rb"); + if (fp == Z_NULL) + return (errno); + + char *buf = malloc(buf_size); + if (buf == NULL) { + (void) gzclose(fp); + return (ENOMEM); + } + + ssize_t rc, bytes = 0; + while (!gzeof(fp)) { + rc = gzread(fp, buf + bytes, block_size); + if ((rc < 0) || (rc == 0 && !gzeof(fp))) { + free(buf); + (void) gzerror(fp, &error); + (void) gzclose(fp); + return (error); + } else { + bytes += rc; + + if (bytes + block_size >= buf_size) { + tmp_size = 2 * buf_size; + tmp_buf = malloc(tmp_size); + if (tmp_buf == NULL) { + free(buf); + (void) gzclose(fp); + return (ENOMEM); + } + + memcpy(tmp_buf, buf, bytes); + free(buf); + buf = tmp_buf; + buf_size = tmp_size; + } + } + } + + (void) gzclose(fp); + + error = nvlist_unpack(buf, bytes, allcfgs, 0); + free(buf); + + return (error); +} + +/* + * Read a map from the specified filename. A file contains multiple maps + * which are indexed by the number of children. The caller is responsible + * for freeing the configuration returned. + */ +static int +read_map_key(const char *filename, char *key, nvlist_t **cfg) +{ + nvlist_t *allcfgs, *foundcfg = NULL; + int error; + + error = read_map(filename, &allcfgs); + if (error != 0) + return (error); + + nvlist_lookup_nvlist(allcfgs, key, &foundcfg); + if (foundcfg != NULL) { + nvlist_dup(foundcfg, cfg, KM_SLEEP); + error = 0; + } else { + error = ENOENT; + } + + nvlist_free(allcfgs); + + return (error); +} + +/* + * Write all mappings to the map file. + */ +static int +write_map(const char *filename, nvlist_t *allcfgs) +{ + size_t buflen = 0; + int error; + + error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR); + if (error) + return (error); + + char *buf = malloc(buflen); + if (buf == NULL) + return (ENOMEM); + + error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); + if (error) { + free(buf); + return (error); + } + + /* + * Atomically update the file using a temporary file and the + * traditional unlink then rename steps. This code provides + * no locking, it only guarantees the packed nvlist on disk + * is updated atomically and is internally consistent.
+ */ + char *tmpname = calloc(MAXPATHLEN, 1); + if (tmpname == NULL) { + free(buf); + return (ENOMEM); + } + + snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename); + + int fd = mkstemp(tmpname); + if (fd < 0) { + error = errno; + free(buf); + free(tmpname); + return (error); + } + (void) close(fd); + + gzFile fp = gzopen(tmpname, "w9b"); + if (fp == Z_NULL) { + error = errno; + free(buf); + free(tmpname); + return (error); + } + + ssize_t rc, bytes = 0; + while (bytes < buflen) { + size_t size = MIN(buflen - bytes, 131072); + rc = gzwrite(fp, buf + bytes, size); + if (rc < 0) { + free(buf); + (void) gzerror(fp, &error); + (void) gzclose(fp); + (void) unlink(tmpname); + free(tmpname); + return (error); + } else if (rc == 0) { + break; + } else { + bytes += rc; + } + } + + free(buf); + (void) gzclose(fp); + + if (bytes != buflen) { + (void) unlink(tmpname); + free(tmpname); + return (EIO); + } + + /* + * Unlink the previous config file and replace it with the updated + * version. If we're able to unlink the file then the directory is + * writable by us and the subsequent rename should never fail. + */ + error = unlink(filename); + if (error != 0 && errno != ENOENT) { + error = errno; + (void) unlink(tmpname); + free(tmpname); + return (error); + } + + error = rename(tmpname, filename); + if (error != 0) { + error = errno; + (void) unlink(tmpname); + free(tmpname); + return (error); + } + + free(tmpname); + + return (0); +} + +/* + * Add the dRAID map to the file and write it out. + */ +static int +write_map_key(const char *filename, char *key, draid_map_t *map, + double worst_ratio, double avg_ratio) +{ + nvlist_t *nv_cfg, *allcfgs; + int error; + + /* + * Add the configuration to an existing or new file. The new + * configuration will replace an existing configuration with the + * same key if it has a lower ratio and is therefore better. + */ + error = read_map(filename, &allcfgs); + if (error == ENOENT) { + allcfgs = fnvlist_alloc(); + } else if (error != 0) { + return (error); + } + + error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg); + if (error == 0) { + uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg, + MAP_WORST_RATIO); + double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0; + + if (worst_ratio < nv_worst_ratio) { + /* Replace old map with the more balanced new map. */ + fnvlist_remove(allcfgs, key); + } else { + /* The old map is preferable, keep it.
+
+/*
+ * Add the dRAID map to the file and write it out.
+ */
+static int
+write_map_key(const char *filename, char *key, draid_map_t *map,
+    double worst_ratio, double avg_ratio)
+{
+	nvlist_t *nv_cfg, *allcfgs;
+	int error;
+
+	/*
+	 * Add the configuration to an existing or new file. The new
+	 * configuration will replace an existing configuration with the
+	 * same key if it has a lower ratio and is therefore better.
+	 */
+	error = read_map(filename, &allcfgs);
+	if (error == ENOENT) {
+		allcfgs = fnvlist_alloc();
+	} else if (error != 0) {
+		return (error);
+	}
+
+	error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg);
+	if (error == 0) {
+		uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg,
+		    MAP_WORST_RATIO);
+		double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0;
+
+		if (worst_ratio < nv_worst_ratio) {
+			/* Replace old map with the more balanced new map. */
+			fnvlist_remove(allcfgs, key);
+		} else {
+			/* The old map is preferable, keep it. */
+			nvlist_free(allcfgs);
+			return (EEXIST);
+		}
+	}
+
+	nvlist_t *cfg = fnvlist_alloc();
+	fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed);
+	fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum);
+	fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children);
+	fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms);
+	fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms,
+	    map->dm_children * map->dm_nperms * sizeof (uint8_t));
+
+	fnvlist_add_uint64(cfg, MAP_WORST_RATIO,
+	    (uint64_t)(worst_ratio * 1000.0));
+	fnvlist_add_uint64(cfg, MAP_AVG_RATIO,
+	    (uint64_t)(avg_ratio * 1000.0));
+
+	error = nvlist_add_nvlist(allcfgs, key, cfg);
+	if (error == 0)
+		error = write_map(filename, allcfgs);
+
+	nvlist_free(cfg);
+	nvlist_free(allcfgs);
+	return (error);
+}
+
+static void
+dump_map(draid_map_t *map, char *key, double worst_ratio, double avg_ratio,
+    int verbose)
+{
+	if (verbose == 0) {
+		return;
+	} else if (verbose == 1) {
+		printf("  \"%s\": seed: 0x%016llx worst_ratio: %2.03f "
+		    "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed,
+		    worst_ratio, avg_ratio);
+		return;
+	} else {
+		printf("  \"%s\":\n"
+		    "      seed: 0x%016llx\n"
+		    "      checksum: 0x%016llx\n"
+		    "      worst_ratio: %2.03f\n"
+		    "      avg_ratio: %2.03f\n"
+		    "      children: %llu\n"
+		    "      nperms: %llu\n",
+		    key, (u_longlong_t)map->dm_seed,
+		    (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio,
+		    (u_longlong_t)map->dm_children,
+		    (u_longlong_t)map->dm_nperms);
+
+		if (verbose > 2) {
+			printf("      perms = {\n");
+			for (int i = 0; i < map->dm_nperms; i++) {
+				printf("          { ");
+				for (int j = 0; j < map->dm_children; j++) {
+					printf("%3d%s ", map->dm_perms[
+					    i * map->dm_children + j],
+					    j < map->dm_children - 1 ?
+					    "," : "");
+				}
+				printf(" },\n");
+			}
+			printf("      }\n");
+		} else if (verbose == 2) {
+			printf("      draid_perms = \n");
+		}
+	}
+}
+
+static void
+dump_map_nv(char *key, nvlist_t *cfg, int verbose)
+{
+	draid_map_t map;
+	uint_t c;
+
+	uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO);
+	uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
+
+	map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
+	map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
+	map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
+	map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
+	nvlist_lookup_uint8_array(cfg, MAP_PERMS, &map.dm_perms, &c);
+
+	dump_map(&map, key, (double)worst_ratio / 1000.0,
+	    (double)avg_ratio / 1000.0, verbose);
+}
+
+/*
+ * Print a summary of the mapping.
+ */
+static int
+dump_map_key(const char *filename, char *key, int verbose)
+{
+	nvlist_t *cfg;
+	int error;
+
+	error = read_map_key(filename, key, &cfg);
+	if (error != 0)
+		return (error);
+
+	dump_map_nv(key, cfg, verbose);
+	nvlist_free(cfg);
+
+	return (0);
+}
+
+/*
+ * Allocate a new permutation map for evaluation.
+ */
+static int
+alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed,
+    draid_map_t **mapp)
+{
+	draid_map_t *map;
+	int error;
+
+	map = malloc(sizeof (draid_map_t));
+	if (map == NULL)
+		return (ENOMEM);
+
+	map->dm_children = children;
+	map->dm_nperms = nperms;
+	map->dm_seed = seed;
+	map->dm_checksum = 0;
+
+	error = vdev_draid_generate_perms(map, &map->dm_perms);
+	if (error) {
+		free(map);
+		return (error);
+	}
+
+	*mapp = map;
+
+	return (0);
+}
+
+/*
+ * Allocate the fixed permutation map for N children.
+ */
+static int
+alloc_fixed_map(uint64_t children, draid_map_t **mapp)
+{
+	const draid_map_t *fixed_map;
+	draid_map_t *map;
+	int error;
+
+	error = vdev_draid_lookup_map(children, &fixed_map);
+	if (error)
+		return (error);
+
+	map = malloc(sizeof (draid_map_t));
+	if (map == NULL)
+		return (ENOMEM);
+
+	memcpy(map, fixed_map, sizeof (draid_map_t));
+	VERIFY3U(map->dm_checksum, !=, 0);
+
+	error = vdev_draid_generate_perms(map, &map->dm_perms);
+	if (error) {
+		free(map);
+		return (error);
+	}
+
+	*mapp = map;
+
+	return (0);
+}
+
+/*
+ * Free a permutation map.
+ */
+static void
+free_map(draid_map_t *map)
+{
+	free(map->dm_perms);
+	free(map);
+}
+
+/*
+ * Check if dev is in the provided list of faulted devices.
+ */
+static inline boolean_t
+is_faulted(int *faulted_devs, int nfaulted, int dev)
+{
+	for (int i = 0; i < nfaulted; i++)
+		if (faulted_devs[i] == dev)
+			return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * Evaluate how resilvering I/O will be distributed given a list of faulted
+ * vdevs. As a simplification we assume one IO is sufficient to repair each
+ * damaged device in a group.
+ */
+static double
+eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares,
+    int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios)
+{
+	uint64_t children = map->dm_children;
+	uint64_t ngroups = 1;
+	uint64_t ndisks = children - nspares;
+
+	/*
+	 * Calculate the minimum number of groups required to fill a slice.
+	 */
+	while (ngroups * (groupwidth) % (children - nspares) != 0)
+		ngroups++;
+
+	int *ios = calloc(map->dm_children, sizeof (*ios));
+
+	/* Resilver all rows */
+	for (int i = 0; i < map->dm_nperms; i++) {
+		uint8_t *row = &map->dm_perms[i * map->dm_children];
+
+		/* Resilver all groups with faulted drives */
+		for (int j = 0; j < ngroups; j++) {
+			uint64_t spareidx = map->dm_children - nspares;
+			boolean_t repair_needed = B_FALSE;
+
+			/* See if any devices in this group are faulted */
+			uint64_t groupstart = (j * groupwidth) % ndisks;
+
+			for (int k = 0; k < groupwidth; k++) {
+				uint64_t groupidx = (groupstart + k) % ndisks;
+
+				repair_needed = is_faulted(faulted_devs,
+				    nfaulted, row[groupidx]);
+				if (repair_needed)
+					break;
+			}
+
+			if (repair_needed == B_FALSE)
+				continue;
+
+			/*
+			 * This group is degraded. Calculate the number of
+			 * reads the non-faulted drives require and the number
+			 * of writes to the distributed hot spare for this row.
+			 */
+			for (int k = 0; k < groupwidth; k++) {
+				uint64_t groupidx = (groupstart + k) % ndisks;
+
+				if (!is_faulted(faulted_devs, nfaulted,
+				    row[groupidx])) {
+					ios[row[groupidx]]++;
+				} else if (nspares > 0) {
+					while (is_faulted(faulted_devs,
+					    nfaulted, row[spareidx])) {
+						spareidx++;
+					}
+
+					ASSERT3U(spareidx, <, map->dm_children);
+					ios[row[spareidx]]++;
+					spareidx++;
+				}
+			}
+		}
+	}
+
+	*min_child_ios = INT_MAX;
+	*max_child_ios = 0;
+
+	/*
+	 * Find the drives with the fewest and most required I/O. These values
+	 * are used to calculate the imbalance ratio. To avoid returning an
+	 * infinite value for permutations which have children that perform
+	 * no IO a floor of 1 IO per child is set. This ensures a meaningful
+	 * ratio is returned for comparison, and idle children are not
+	 * uncommon when there are a large number of children.
+	 */
+	for (int i = 0; i < map->dm_children; i++) {
+
+		if (is_faulted(faulted_devs, nfaulted, i)) {
+			ASSERT0(ios[i]);
+			continue;
+		}
+
+		if (ios[i] == 0)
+			ios[i] = 1;
+
+		if (ios[i] < *min_child_ios)
+			*min_child_ios = ios[i];
+
+		if (ios[i] > *max_child_ios)
+			*max_child_ios = ios[i];
+	}
+
+	ASSERT3S(*min_child_ios, !=, INT_MAX);
+	ASSERT3S(*max_child_ios, !=, 0);
+
+	double ratio = (double)(*max_child_ios) / (double)(*min_child_ios);
+
+	free(ios);
+
+	return (ratio);
+}
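
The ngroups loop at the top of eval_resilver() finds the smallest number of groups whose combined width tiles the non-spare disks evenly. A worked standalone example with illustrative values:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t children = 11, nspares = 1, groupwidth = 4;
		uint64_t ndisks = children - nspares;	/* 10 non-spare slots */
		uint64_t ngroups = 1;

		/* Same loop as eval_resilver(): grow until the slice tiles. */
		while (ngroups * groupwidth % ndisks != 0)
			ngroups++;

		/* 4-wide groups on 10 disks: 5 groups (20 slots, 2 rows). */
		(void) printf("ngroups = %llu\n", (unsigned long long)ngroups);
		return (0);
	}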
+
+/*
+ * Evaluate the quality of the permutation mapping by considering possible
+ * device failures. Returns the imbalance ratio for the worst mapping which
+ * is defined to be the largest number of child IOs over the fewest number
+ * of child IOs. A value of 1.0 indicates the mapping is perfectly balanced
+ * and all children perform an equal amount of work during reconstruction.
+ */
+static void
+eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop)
+{
+	uint64_t children = map->dm_children;
+	double worst_ratio = 1.0;
+	double sum = 0;
+	int worst_min_ios = 0, worst_max_ios = 0;
+	int n = 0;
+
+	/*
+	 * When there are only 2 children there can be no distributed
+	 * spare and no resilver to evaluate. Default to a ratio of 1.0
+	 * for this degenerate case.
+	 */
+	if (children == VDEV_DRAID_MIN_CHILDREN) {
+		*worst_ratiop = 1.0;
+		*avg_ratiop = 1.0;
+		return;
+	}
+
+	/*
+	 * Score the mapping as if it had either 1 or 2 distributed spares.
+	 */
+	for (int nspares = 1; nspares <= 2; nspares++) {
+		uint64_t faults = nspares;
+
+		/*
+		 * Score groupwidths up to 19. This value was chosen as the
+		 * largest reasonable width (16d+3p). dRAID pools may still
+		 * be created with wider stripes but they are not considered
+		 * in this analysis in order to optimize for the most common
+		 * cases.
+		 */
+		for (uint64_t groupwidth = 2;
+		    groupwidth <= MIN(children - nspares, 19);
+		    groupwidth++) {
+			int faulted_devs[2];
+			int min_ios, max_ios;
+
+			/*
+			 * Score possible device faults. This is limited
+			 * to exactly one fault per distributed spare for
+			 * the purposes of this simulation.
+			 */
+			for (int f1 = 0; f1 < children; f1++) {
+				faulted_devs[0] = f1;
+				double ratio;
+
+				if (faults == 1) {
+					ratio = eval_resilver(map, groupwidth,
+					    nspares, faulted_devs, faults,
+					    &min_ios, &max_ios);
+
+					if (ratio > worst_ratio) {
+						worst_ratio = ratio;
+						worst_min_ios = min_ios;
+						worst_max_ios = max_ios;
+					}
+
+					sum += ratio;
+					n++;
+				} else if (faults == 2) {
+					for (int f2 = f1 + 1; f2 < children;
+					    f2++) {
+						faulted_devs[1] = f2;
+
+						ratio = eval_resilver(map,
+						    groupwidth, nspares,
+						    faulted_devs, faults,
+						    &min_ios, &max_ios);
+
+						if (ratio > worst_ratio) {
+							worst_ratio = ratio;
+							worst_min_ios =
+							    min_ios;
+							worst_max_ios =
+							    max_ios;
+						}
+
+						sum += ratio;
+						n++;
+					}
+				}
+			}
+		}
+	}
+
+	*worst_ratiop = worst_ratio;
+	*avg_ratiop = sum / n;
+
+	/*
+	 * Log the min/max IO values for particularly unbalanced maps.
+	 * Since the maps are generated entirely randomly these are possible
+	 * but exceedingly unlikely. We log them for possible investigation.
+	 */
+	if (worst_ratio > 100.0) {
+		dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2);
+		printf("worst_min_ios=%d worst_max_ios=%d\n",
+		    worst_min_ios, worst_max_ios);
+	}
+}
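
For a sense of scale, the nested loops above turn into a large but tractable number of simulated resilvers per candidate map. A quick standalone count for one child count (31 is an arbitrary example):

	#include <stdio.h>

	int
	main(void)
	{
		int children = 31;
		long n = 0;

		for (int nspares = 1; nspares <= 2; nspares++) {
			/* groupwidths 2..MIN(children - nspares, 19) */
			int maxw = children - nspares < 19 ?
			    children - nspares : 19;
			long widths = maxw - 1;
			/* one fault per spare: singles or unordered pairs */
			long faults = (nspares == 1) ? children :
			    (long)children * (children - 1) / 2;

			n += widths * faults;
		}

		/* 31 children: 18*31 + 18*465 = 8928 eval_resilver() calls. */
		(void) printf("%ld\n", n);
		return (0);
	}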
+
+static int
+eval_maps(uint64_t children, int passes, uint64_t *map_seed,
+    draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop)
+{
+	draid_map_t *best_map = NULL;
+	double best_worst_ratio = 1000.0;
+	double best_avg_ratio = 1000.0;
+
+	/*
+	 * Perform the requested number of passes evaluating randomly
+	 * generated permutation maps. Only the best version is kept.
+	 */
+	for (int i = 0; i < passes; i++) {
+		double worst_ratio, avg_ratio;
+		draid_map_t *map;
+		int error;
+
+		/*
+		 * Calculate the next seed and generate a new candidate map.
+		 */
+		error = alloc_new_map(children, MAP_ROWS_DEFAULT,
+		    vdev_draid_rand(map_seed), &map);
+		if (error)
+			return (error);
+
+		/*
+		 * Consider maps with a lower worst_ratio to be of higher
+		 * quality. Some maps may have a lower avg_ratio but they
+		 * are discarded since they might include some particularly
+		 * imbalanced permutations. The average is tracked in order
+		 * to get a sense of the average permutation quality.
+		 */
+		eval_decluster(map, &worst_ratio, &avg_ratio);
+
+		if (best_map == NULL || worst_ratio < best_worst_ratio) {
+
+			if (best_map != NULL)
+				free_map(best_map);
+
+			best_map = map;
+			best_worst_ratio = worst_ratio;
+			best_avg_ratio = avg_ratio;
+		} else {
+			free_map(map);
+		}
+	}
+
+	/*
+	 * After determining the best map generate a checksum over the full
+	 * permutation array. This checksum is verified when opening a dRAID
+	 * pool to ensure the generated in-memory permutations are correct.
+	 */
+	zio_cksum_t cksum;
+	fletcher_4_native_varsize(best_map->dm_perms,
+	    sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms,
+	    &cksum);
+	best_map->dm_checksum = cksum.zc_word[0];
+
+	*best_mapp = best_map;
+	*best_ratiop = best_worst_ratio;
+	*avg_ratiop = best_avg_ratio;
+
+	return (0);
+}
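
It is worth noting why storing only a 64-bit seed plus a checksum suffices: the permutation array is regenerated deterministically from the seed, and the checksum guards against the generator ever changing. A rough standalone sketch of that principle, with a plain xorshift PRNG standing in for vdev_draid_rand() (the real generator and shuffle live in zfs_draid.c, so this is illustrative only):

	#include <stdio.h>
	#include <stdint.h>

	/* Stand-in PRNG; the on-disk format depends on the real one. */
	static uint64_t
	prng_next(uint64_t *s)
	{
		*s ^= *s << 13;
		*s ^= *s >> 7;
		*s ^= *s << 17;
		return (*s);
	}

	int
	main(void)
	{
		enum { CHILDREN = 5 };
		uint8_t perm[CHILDREN];
		uint64_t seed = 0x0123456789abcdefULL;

		/* A Fisher-Yates shuffle driven purely by the seed... */
		for (int i = 0; i < CHILDREN; i++)
			perm[i] = i;
		for (int i = CHILDREN - 1; i > 0; i--) {
			int j = prng_next(&seed) % (i + 1);
			uint8_t t = perm[i];
			perm[i] = perm[j];
			perm[j] = t;
		}

		/* ...so the same seed reproduces the same permutation. */
		for (int i = 0; i < CHILDREN; i++)
			(void) printf("%d ", perm[i]);
		(void) printf("\n");
		return (0);
	}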
+
+static int
+draid_generate(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	uint64_t map_seed;
+	int c, fd, error, verbose = 0, passes = 1, continuous = 0;
+	int min_children = VDEV_DRAID_MIN_CHILDREN;
+	int max_children = VDEV_DRAID_MAX_CHILDREN;
+	int restarts = 0;
+
+	while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) {
+		switch (c) {
+		case 'c':
+			continuous++;
+			break;
+		case 'm':
+			min_children = (int)strtol(optarg, NULL, 0);
+			if (min_children < VDEV_DRAID_MIN_CHILDREN) {
+				(void) fprintf(stderr, "A minimum of 2 "
+				    "children are required.\n");
+				return (1);
+			}
+
+			break;
+		case 'n':
+			max_children = (int)strtol(optarg, NULL, 0);
+			if (max_children > VDEV_DRAID_MAX_CHILDREN) {
+				(void) fprintf(stderr, "A maximum of %d "
+				    "children are allowed.\n",
+				    VDEV_DRAID_MAX_CHILDREN);
+				return (1);
+			}
+			break;
+		case 'p':
+			passes = (int)strtol(optarg, NULL, 0);
+			break;
+		case 'v':
+			/*
+			 * 0 - Only log when a better map is added to the file.
+			 * 1 - Log the current best map for each child count.
+			 *     Minimal output on a single summary line.
+			 * 2 - Log the current best map for each child count.
+			 *     More verbose; includes most map fields.
+			 * 3 - Log the current best map for each child count.
+			 *     Very verbose; all fields, including the full map.
+			 */
+			verbose++;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    "missing argument for '%c' option\n", optopt);
+			draid_usage();
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			draid_usage();
+			break;
+		}
+	}
+
+	if (argc > optind) {
+		bzero(filename, MAXPATHLEN);
+		strncpy(filename, argv[optind], MAXPATHLEN - 1);
+	} else {
+		(void) fprintf(stderr, "A FILE must be specified.\n");
+		return (1);
+	}
+
+restart:
+	/*
+	 * Start with a fresh seed from /dev/urandom.
+	 */
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0) {
+		printf("Unable to open /dev/urandom: %s\n", strerror(errno));
+		return (1);
+	} else {
+		ssize_t bytes = sizeof (map_seed);
+		ssize_t bytes_read = 0;
+
+		while (bytes_read < bytes) {
+			ssize_t rc = read(fd, ((char *)&map_seed) + bytes_read,
+			    bytes - bytes_read);
+			if (rc < 0) {
+				printf("Unable to read /dev/urandom: %s\n",
+				    strerror(errno));
+				(void) close(fd);
+				return (1);
+			}
+			bytes_read += rc;
+		}
+
+		(void) close(fd);
+	}
+
+	if (restarts == 0)
+		printf("Writing generated mappings to '%s':\n", filename);
+
+	/*
+	 * Generate maps for all requested child counts. The best map for
+	 * each child count is written out to the specified file. If the file
+	 * already contains a better mapping this map will not be added.
+	 */
+	for (uint64_t children = min_children;
+	    children <= max_children; children++) {
+		char key[8] = { 0 };
+		draid_map_t *map;
+		double worst_ratio = 1000.0;
+		double avg_ratio = 1000.0;
+
+		error = eval_maps(children, passes, &map_seed, &map,
+		    &worst_ratio, &avg_ratio);
+		if (error) {
+			printf("Error eval_maps(): %s\n", strerror(error));
+			return (1);
+		}
+
+		if (worst_ratio < 1.0 || avg_ratio < 1.0) {
+			printf("Error ratio < 1.0: worst_ratio = %2.03f "
+			    "avg_ratio = %2.03f\n", worst_ratio, avg_ratio);
+			return (1);
+		}
+
+		snprintf(key, 7, "%llu", (u_longlong_t)children);
+		error = write_map_key(filename, key, map, worst_ratio,
+		    avg_ratio);
+		if (error == 0) {
+			/* The new map was added to the file. */
+			dump_map(map, key, worst_ratio, avg_ratio,
+			    MAX(verbose, 1));
+		} else if (error == EEXIST) {
+			/* The existing map was preferable and kept. */
+			if (verbose > 0)
+				dump_map_key(filename, key, verbose);
+		} else {
+			printf("Error write_map_key(): %s\n", strerror(error));
+			return (1);
+		}
+
+		free_map(map);
+	}
+
+	/*
+	 * When the continuous option is set restart at the minimum number of
+	 * children instead of exiting. This option is useful as a mechanism
+	 * to continuously try and refine the discovered permutations.
+	 */
+	if (continuous) {
+		restarts++;
+		printf("Restarting by request (-c): %d\n", restarts);
+		goto restart;
+	}
+
+	return (0);
+}
+
+/*
+ * Verify each map in the file by generating its in-memory permutation array
+ * and confirming its checksum is correct.
+ */
+static int
+draid_verify(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	int n = 0, c, error, verbose = 1;
+	int check_ratios = 0;
+
+	while ((c = getopt(argc, argv, ":rv")) != -1) {
+		switch (c) {
+		case 'r':
+			check_ratios++;
+			break;
+		case 'v':
+			verbose++;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    "missing argument for '%c' option\n", optopt);
+			draid_usage();
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			draid_usage();
+			break;
+		}
+	}
+
+	if (argc > optind) {
+		char *abspath = malloc(MAXPATHLEN);
+		if (abspath == NULL)
+			return (ENOMEM);
+
+		bzero(filename, MAXPATHLEN);
+		if (realpath(argv[optind], abspath) != NULL)
+			strncpy(filename, abspath, MAXPATHLEN - 1);
+		else
+			strncpy(filename, argv[optind], MAXPATHLEN - 1);
+
+		free(abspath);
+	} else {
+		(void) fprintf(stderr, "A FILE must be specified.\n");
+		return (1);
+	}
+
+	printf("Verifying permutation maps: '%s'\n", filename);
+
+	/*
+	 * Look up the hardcoded permutation map for each valid number of
+	 * children and verify a generated map has the correct checksum.
+	 * Then compare the generated map values with the nvlist map values
+	 * read from the reference file to cross-check the permutation.
+	 */
+	for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
+	    children <= VDEV_DRAID_MAX_CHILDREN;
+	    children++) {
+		draid_map_t *map;
+		char key[8];
+
+		bzero(key, 8);
+		snprintf(key, 8, "%llu", (u_longlong_t)children);
+
+		error = alloc_fixed_map(children, &map);
+		if (error) {
+			printf("Error alloc_fixed_map() failed: %s\n",
+			    error == ECKSUM ? "Invalid checksum" :
+			    strerror(error));
+			return (1);
+		}
+
+		uint64_t nv_seed, nv_checksum, nv_children, nv_nperms;
+		uint8_t *nv_perms;
+		nvlist_t *cfg;
+		uint_t c;
+
+		error = read_map_key(filename, key, &cfg);
+		if (error != 0) {
+			printf("Error read_map_key() failed: %s\n",
+			    strerror(error));
+			free_map(map);
+			return (1);
+		}
+
+		nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
+		nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
+		nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
+		nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
+		nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c);
+
+		/*
+		 * Compare draid_map_t and nvlist reference values.
+		 */
+		if (map->dm_seed != nv_seed) {
+			printf("Error different seeds: 0x%016llx != "
+			    "0x%016llx\n", (u_longlong_t)map->dm_seed,
+			    (u_longlong_t)nv_seed);
+			error = EINVAL;
+		}
+
+		if (map->dm_checksum != nv_checksum) {
+			printf("Error different checksums: 0x%016llx "
+			    "!= 0x%016llx\n",
+			    (u_longlong_t)map->dm_checksum,
+			    (u_longlong_t)nv_checksum);
+			error = EINVAL;
+		}
+
+		if (map->dm_children != nv_children) {
+			printf("Error different children: %llu "
+			    "!= %llu\n", (u_longlong_t)map->dm_children,
+			    (u_longlong_t)nv_children);
+			error = EINVAL;
+		}
+
+		if (map->dm_nperms != nv_nperms) {
+			printf("Error different nperms: %llu "
+			    "!= %llu\n", (u_longlong_t)map->dm_nperms,
+			    (u_longlong_t)nv_nperms);
+			error = EINVAL;
+		}
+
+		for (uint64_t i = 0; i < nv_children * nv_nperms; i++) {
+			if (map->dm_perms[i] != nv_perms[i]) {
+				printf("Error different perms[%llu]: "
+				    "%d != %d\n", (u_longlong_t)i,
+				    (int)map->dm_perms[i],
+				    (int)nv_perms[i]);
+				error = EINVAL;
+				break;
+			}
+		}
+
+		/*
+		 * For good measure recalculate the worst and average
+		 * ratios and confirm they match the nvlist values.
+		 */
+		if (check_ratios) {
+			uint64_t nv_worst_ratio, nv_avg_ratio;
+			double worst_ratio, avg_ratio;
+
+			eval_decluster(map, &worst_ratio, &avg_ratio);
+
+			nv_worst_ratio = fnvlist_lookup_uint64(cfg,
+			    MAP_WORST_RATIO);
+			nv_avg_ratio = fnvlist_lookup_uint64(cfg,
+			    MAP_AVG_RATIO);
+
+			if (worst_ratio < 1.0 || avg_ratio < 1.0) {
+				printf("Error ratio out of range %2.03f, "
+				    "%2.03f\n", worst_ratio, avg_ratio);
+				error = EINVAL;
+			}
+
+			if ((uint64_t)(worst_ratio * 1000.0) !=
+			    nv_worst_ratio) {
+				printf("Error different worst_ratio %2.03f "
+				    "!= %2.03f\n", (double)nv_worst_ratio /
+				    1000.0, worst_ratio);
+				error = EINVAL;
+			}
+
+			if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) {
+				printf("Error different average_ratio %2.03f "
+				    "!= %2.03f\n", (double)nv_avg_ratio /
+				    1000.0, avg_ratio);
+				error = EINVAL;
+			}
+		}
+
+		if (error) {
+			free_map(map);
+			nvlist_free(cfg);
+			return (1);
+		}
+
+		if (verbose > 0) {
+			printf("- %llu children: good\n",
+			    (u_longlong_t)children);
+		}
+		n++;
+
+		free_map(map);
+		nvlist_free(cfg);
+	}
+
+	if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) {
+		printf("Error permutation maps missing: %d / %d checked\n",
+		    n, VDEV_DRAID_MAX_CHILDREN - 1);
+		return (1);
+	}
+
+	printf("Successfully verified %d / %d permutation maps\n",
+	    n, VDEV_DRAID_MAX_CHILDREN - 1);
+
+	return (0);
+}
+
+/*
+ * Dump the contents of the specified mapping(s) for inspection.
+ */
+static int
+draid_dump(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	int c, error, verbose = 1;
+	int min_children = VDEV_DRAID_MIN_CHILDREN;
+	int max_children = VDEV_DRAID_MAX_CHILDREN;
+
+	while ((c = getopt(argc, argv, ":vm:n:")) != -1) {
+		switch (c) {
+		case 'm':
+			min_children = (int)strtol(optarg, NULL, 0);
+			if (min_children < 2) {
+				(void) fprintf(stderr, "A minimum of 2 "
+				    "children are required.\n");
+				return (1);
+			}
+
+			break;
+		case 'n':
+			max_children = (int)strtol(optarg, NULL, 0);
+			if (max_children > VDEV_DRAID_MAX_CHILDREN) {
+				(void) fprintf(stderr, "A maximum of %d "
+				    "children are allowed.\n",
+				    VDEV_DRAID_MAX_CHILDREN);
+				return (1);
+			}
+			break;
+		case 'v':
+			verbose++;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    "missing argument for '%c' option\n", optopt);
+			draid_usage();
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			draid_usage();
+			break;
+		}
+	}
+
+	if (argc > optind) {
+		bzero(filename, MAXPATHLEN);
+		strncpy(filename, argv[optind], MAXPATHLEN - 1);
+	} else {
+		(void) fprintf(stderr, "A FILE must be specified.\n");
+		return (1);
+	}
+
+	/*
+	 * Dump maps for the requested child counts.
+	 */
+	for (uint64_t children = min_children;
+	    children <= max_children; children++) {
+		char key[8] = { 0 };
+
+		snprintf(key, 7, "%llu", (u_longlong_t)children);
+		error = dump_map_key(filename, key, verbose);
+		if (error) {
+			printf("Error dump_map_key(): %s\n", strerror(error));
+			return (1);
+		}
+	}
+
+	return (0);
+}
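
draid_table() below emits one initializer row per child count; the generated array has the following shape. The struct here simply mirrors the printed column order, and every value, including the ratio comments, is a placeholder rather than real map data:

	#include <stdint.h>

	typedef struct {
		uint64_t dm_children;	/* child count (the nvlist key) */
		uint64_t dm_nperms;	/* rows in the permutation map */
		uint64_t dm_seed;	/* placeholder seed */
		uint64_t dm_checksum;	/* placeholder fletcher-4 word */
	} demo_map_t;

	static const demo_map_t demo_maps[] = {
		{   2, 256, 0x0000000000000001, 0x0000000000000002 }, /* 1.000 */
		{   3, 256, 0x0000000000000003, 0x0000000000000004 }, /* 1.012 */
		/* ... one row per child count up to VDEV_DRAID_MAX_CHILDREN */
	};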
+
+/*
+ * Print all of the mappings as a C formatted draid_map_t array. This table
+ * is found in the module/zcommon/zfs_draid.c file and is the definitive
+ * source for all mappings used by dRAID. It cannot be updated without
+ * changing the dRAID on-disk format.
+ */
+static int
+draid_table(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	int error;
+
+	if (argc > optind) {
+		bzero(filename, MAXPATHLEN);
+		strncpy(filename, argv[optind], MAXPATHLEN - 1);
+	} else {
+		(void) fprintf(stderr, "A FILE must be specified.\n");
+		return (1);
+	}
+
+	printf("static const draid_map_t "
+	    "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");
+
+	for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
+	    children <= VDEV_DRAID_MAX_CHILDREN;
+	    children++) {
+		uint64_t seed, checksum, nperms, avg_ratio;
+		nvlist_t *cfg;
+		char key[8];
+
+		bzero(key, 8);
+		snprintf(key, 8, "%llu", (u_longlong_t)children);
+
+		error = read_map_key(filename, key, &cfg);
+		if (error != 0) {
+			printf("Error read_map_key() failed: %s\n",
+			    strerror(error));
+			return (1);
+		}
+
+		seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
+		checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
+		children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
+		nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
+		avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
+
+		printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"
+		    "/* %2.03f */\n", (u_longlong_t)children,
+		    (u_longlong_t)nperms, (u_longlong_t)seed,
+		    (u_longlong_t)checksum, (double)avg_ratio / 1000.0);
+
+		nvlist_free(cfg);
+	}
+
+	printf("};\n");
+
+	return (0);
+}
+
+static int
+draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp)
+{
+	nvlist_t *srccfgs;
+	nvpair_t *elem = NULL;
+	int error, merged = 0;
+
+	error = read_map(srcfilename, &srccfgs);
+	if (error != 0)
+		return (error);
+
+	while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) {
+		uint64_t nv_worst_ratio;
+		uint64_t allcfg_worst_ratio;
+		nvlist_t *cfg, *allcfg;
+		char *key;
+
+		switch (nvpair_type(elem)) {
+		case DATA_TYPE_NVLIST:
+
+			(void) nvpair_value_nvlist(elem, &cfg);
+			key = nvpair_name(elem);
+
+			nv_worst_ratio = fnvlist_lookup_uint64(cfg,
+			    MAP_WORST_RATIO);
+
+			error = nvlist_lookup_nvlist(allcfgs, key, &allcfg);
+			if (error == 0) {
+				allcfg_worst_ratio = fnvlist_lookup_uint64(
+				    allcfg, MAP_WORST_RATIO);
+
+				if (nv_worst_ratio < allcfg_worst_ratio) {
+					fnvlist_remove(allcfgs, key);
+					error = nvlist_add_nvlist(allcfgs,
+					    key, cfg);
+					merged++;
+				}
+			} else if (error == ENOENT) {
+				error = nvlist_add_nvlist(allcfgs, key, cfg);
+				merged++;
+			} else {
+				nvlist_free(srccfgs);
+				return (error);
+			}
+
+			break;
+		default:
+			continue;
+		}
+	}
+
+	nvlist_free(srccfgs);
+
+	*mergedp = merged;
+
+	return (0);
+}
+
+/*
+ * Merge the best map for each child count found in the listed files into
+ * a new file. This allows 'draid generate' to be run in parallel and for
+ * the resulting maps to be combined.
+ */ +static int +draid_merge(int argc, char *argv[]) +{ + char filename[MAXPATHLEN]; + int c, error, total_merged = 0, verbose = 0; + nvlist_t *allcfgs; + + while ((c = getopt(argc, argv, ":v")) != -1) { + switch (c) { + case 'v': + verbose++; + break; + case ':': + (void) fprintf(stderr, + "missing argument for '%c' option\n", optopt); + draid_usage(); + break; + case '?': + (void) fprintf(stderr, "invalid option '%c'\n", + optopt); + draid_usage(); + break; + } + } + + if (argc < 4) { + (void) fprintf(stderr, + "A FILE and multiple SRCs must be specified.\n"); + return (1); + } + + bzero(filename, MAXPATHLEN); + strncpy(filename, argv[optind], MAXPATHLEN - 1); + optind++; + + error = read_map(filename, &allcfgs); + if (error == ENOENT) { + allcfgs = fnvlist_alloc(); + } else if (error != 0) { + printf("Error read_map(): %s\n", strerror(error)); + return (error); + } + + while (optind < argc) { + char srcfilename[MAXPATHLEN]; + int merged = 0; + + bzero(srcfilename, MAXPATHLEN); + strncpy(srcfilename, argv[optind], MAXPATHLEN - 1); + + error = draid_merge_impl(allcfgs, srcfilename, &merged); + if (error) { + printf("Error draid_merge_impl(): %s\n", + strerror(error)); + nvlist_free(allcfgs); + return (1); + } + + total_merged += merged; + printf("Merged %d key(s) from '%s' into '%s'\n", merged, + srcfilename, filename); + + optind++; + } + + if (total_merged > 0) + write_map(filename, allcfgs); + + printf("Merged a total of %d key(s) into '%s'\n", total_merged, + filename); + + nvlist_free(allcfgs); + + return (0); +} + +int +main(int argc, char *argv[]) +{ + if (argc < 2) + draid_usage(); + + char *subcommand = argv[1]; + + if (strcmp(subcommand, "generate") == 0) { + return (draid_generate(argc - 1, argv + 1)); + } else if (strcmp(subcommand, "verify") == 0) { + return (draid_verify(argc - 1, argv + 1)); + } else if (strcmp(subcommand, "dump") == 0) { + return (draid_dump(argc - 1, argv + 1)); + } else if (strcmp(subcommand, "table") == 0) { + return (draid_table(argc - 1, argv + 1)); + } else if (strcmp(subcommand, "merge") == 0) { + return (draid_merge(argc - 1, argv + 1)); + } else { + draid_usage(); + } +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 2b81e1c19..299653547 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -197,6 +197,7 @@ export ZFSTEST_FILES='badsend chg_usr_exec devname2devid dir_rd_update + draid file_check file_trunc file_write diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 98ef54e4d..d494eda55 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -2336,7 +2336,7 @@ function check_pool_status # pool token keyword function is_pool_resilvering #pool { check_pool_status "$1" "scan" \ - "resilver[ ()0-9A-Za-z_-]* in progress since" $2 + "resilver[ ()0-9A-Za-z:_-]* in progress since" $2 return $? 
} diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index b9e7fe2df..e93e299ea 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -60,6 +60,7 @@ MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_inter MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable +REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib index bd45fabbc..85566e565 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib @@ -66,7 +66,8 @@ function setup_filesystem #disklist #pool #fs #mntpoint #type #vdev if [[ $vdev != "" && \ $vdev != "mirror" && \ - $vdev != "raidz" ]] ; then + $vdev != "raidz" && \ + $vdev != "draid" ]] ; then log_note "Wrong vdev: (\"$vdev\")" return 1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh index aa50de3be..191ec839a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh @@ -55,23 +55,26 @@ log_assert "'zpool add ...' can add devices to the pool." 
log_onexit cleanup -set -A keywords "" "mirror" "raidz" "raidz1" "spare" +set -A keywords "" "mirror" "raidz" "raidz1" "draid:1s" "draid1:1s" "spare" pooldevs="${DISK0} \ \"${DISK0} ${DISK1}\" \ \"${DISK0} ${DISK1} ${DISK2}\"" mirrordevs="\"${DISK0} ${DISK1}\"" raidzdevs="\"${DISK0} ${DISK1}\"" +draiddevs="\"${DISK0} ${DISK1} ${DISK2}\"" disk0=$TEST_BASE_DIR/disk0 disk1=$TEST_BASE_DIR/disk1 -truncate -s $MINVDEVSIZE $disk0 $disk1 +disk2=$TEST_BASE_DIR/disk2 +truncate -s $MINVDEVSIZE $disk0 $disk1 $disk2 typeset -i i=0 typeset vdev eval set -A poolarray $pooldevs eval set -A mirrorarray $mirrordevs eval set -A raidzarray $raidzdevs +eval set -A draidarray $draiddevs while (( $i < ${#keywords[*]} )); do @@ -107,6 +110,19 @@ while (( $i < ${#keywords[*]} )); do destroy_pool "$TESTPOOL" done + ;; + draid:1s|draid1:1s) + for vdev in "${draidarray[@]}"; do + create_pool "$TESTPOOL" "${keywords[i]}" \ + "$disk0" "$disk1" "$disk2" + log_must poolexists "$TESTPOOL" + log_must zpool add "$TESTPOOL" ${keywords[i]} $vdev + log_must vdevs_in_pool "$TESTPOOL" "$vdev" + log_must vdevs_in_pool "$TESTPOOL" "draid1-0-0" + log_must vdevs_in_pool "$TESTPOOL" "draid1-1-0" + destroy_pool "$TESTPOOL" + done + ;; esac diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am index 3c595935a..4d75851bd 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am @@ -27,6 +27,10 @@ dist_pkgdata_SCRIPTS = \ zpool_create_024_pos.ksh \ zpool_create_encrypted.ksh \ zpool_create_crypt_combos.ksh \ + zpool_create_draid_001_pos.ksh \ + zpool_create_draid_002_pos.ksh \ + zpool_create_draid_003_pos.ksh \ + zpool_create_draid_004_pos.ksh \ zpool_create_features_001_pos.ksh \ zpool_create_features_002_pos.ksh \ zpool_create_features_003_pos.ksh \ @@ -36,5 +40,6 @@ dist_pkgdata_SCRIPTS = \ zpool_create_tempname.ksh dist_pkgdata_DATA = \ + draidcfg.gz \ zpool_create.cfg \ zpool_create.shlib diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/draidcfg.gz b/tests/zfs-tests/tests/functional/cli_root/zpool_create/draidcfg.gz new file mode 100644 index 000000000..b8c0a583c Binary files /dev/null and b/tests/zfs-tests/tests/functional/cli_root/zpool_create/draidcfg.gz differ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh index 799160722..42f57beae 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh @@ -64,14 +64,16 @@ pooldevs="${DISK0} \ \"${DISK0} ${DISK1}\" \ \"${DISK0} ${DISK1} ${DISK2}\" \ \"$disk1 $disk2\"" -raidzdevs="\"${DISK0} ${DISK1} ${DISK2}\"" mirrordevs="\"${DISK0} ${DISK1}\" \ $raidzdevs \ \"$disk1 $disk2\"" +raidzdevs="\"${DISK0} ${DISK1} ${DISK2}\"" +draiddevs="\"${DISK0} ${DISK1} ${DISK2}\"" create_pool_test "$TESTPOOL" "" "$pooldevs" create_pool_test "$TESTPOOL" "mirror" "$mirrordevs" create_pool_test "$TESTPOOL" "raidz" "$raidzdevs" create_pool_test "$TESTPOOL" "raidz1" "$raidzdevs" +create_pool_test "$TESTPOOL" "draid" "$draiddevs" log_pass "'zpool create ...' success." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh index 165453e8b..e1d8cc474 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh @@ -54,7 +54,7 @@ log_assert "'zpool create [-R root][-m mountpoint] ...' can create "an alternate pool or a new pool mounted at the specified mountpoint." log_onexit cleanup -set -A pooltype "" "mirror" "raidz" "raidz1" "raidz2" +set -A pooltype "" "mirror" "raidz" "raidz1" "raidz2" "draid" "draid2" # # cleanup the pools created in previous case if zpool_create_004_pos timedout @@ -67,8 +67,8 @@ done rm -rf $TESTDIR log_must mkdir -p $TESTDIR typeset -i i=1 -while (( i < 4 )); do - log_must mkfile $FILESIZE $TESTDIR/file.$i +while (( i < 5 )); do + log_must truncate -s $FILESIZE $TESTDIR/file.$i (( i = i + 1 )) done @@ -87,7 +87,7 @@ do log_must zpool destroy -f $TESTPOOL [[ -d $TESTDIR1 ]] && rm -rf $TESTDIR1 log_must zpool create $opt $TESTPOOL ${pooltype[i]} \ - $file.1 $file.2 $file.3 + $file.1 $file.2 $file.3 $file.4 ! poolexists $TESTPOOL && \ log_fail "Creating pool with $opt fails." mpt=`zfs mount | egrep "^$TESTPOOL[^/]" | awk '{print $2}'` diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh index 15cd23e44..79b41fdae 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh @@ -97,6 +97,20 @@ set -A valid_args \ "raidz2 $vdev0 $vdev1 $vdev2 spare $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \ "raidz3 $vdev0 $vdev1 $vdev2 $vdev3 \ mirror $vdev4 $vdev5 $vdev6 $vdev7" \ + "draid $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4" \ + "draid $vdev0 $vdev1 $vdev2 raidz1 $vdev3 $vdev4 $vdev5" \ + "draid $vdev0 $vdev1 $vdev2 draid1 $vdev3 $vdev4 $vdev5" \ + "draid $vdev0 $vdev1 $vdev2 special mirror $vdev3 $vdev4" \ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 mirror $vdev4 $vdev5 $vdev6" \ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 draid2 $vdev4 $vdev5 $vdev6 $vdev7"\ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 \ + special mirror $vdev4 $vdev5 $vdev6" \ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 \ + special mirror $vdev4 $vdev5 $vdev6 \ + cache $vdev7 log mirror $vdev8 $vdev9" \ + "draid $vdev0 $vdev1 $vdev2 draid $vdev4 $vdev5 $vdev6 $vdev7 \ + special mirror $vdev8 $vdev9" \ "spare $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4 raidz $vdev5 $vdev6" set -A forced_args \ @@ -109,11 +123,19 @@ set -A forced_args \ "raidz $vdev0 $vdev1 raidz2 $vdev2 $vdev3 $vdev4" \ "raidz $vdev0 $vdev1 raidz2 $vdev2 $vdev3 $vdev4 spare $vdev5" \ "raidz $vdev0 $vdev1 spare $vdev2 raidz2 $vdev3 $vdev4 $vdev5" \ + "raidz $vdev0 $vdev1 draid2 $vdev2 $vdev3 $vdev4 $vdev5" \ + "raidz $vdev0 $vdev1 draid3 $vdev2 $vdev3 $vdev4 $vdev5 $vdev6" \ "mirror $vdev0 $vdev1 raidz $vdev2 $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \ "mirror $vdev0 $vdev1 raidz $vdev2 $vdev3 \ raidz2 $vdev4 $vdev5 $vdev6 spare $vdev7" \ "mirror $vdev0 $vdev1 raidz $vdev2 $vdev3 \ spare $vdev4 raidz2 $vdev5 $vdev6 $vdev7" \ + "mirror $vdev0 $vdev1 draid $vdev2 $vdev3 $vdev4 \ + draid2 $vdev5 $vdev6 $vdev7 $vdev8 spare $vdev9" \ + "draid $vdev0 $vdev1 $vdev2 $vdev3 \ + draid2 $vdev4 $vdev5 $vdev6 $vdev7 $vdev8" 
\ + "draid $vdev0 $vdev1 $vdev2 draid $vdev4 $vdev5 $vdev6 \ + special mirror $vdev7 $vdev8 $vdev9" \ "spare $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4 \ raidz2 $vdev5 $vdev6 $vdev7" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh index bafc238ea..2873202cc 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh @@ -54,13 +54,16 @@ set -A args "" "-?" "-n" "-f" "-nf" "-fn" "-f -n" "--f" "-e" "-s" \ "$TESTPOOL c0txd0" "$TESTPOOL c0t0dx" "$TESTPOOL cxtxdx" \ "$TESTPOOL mirror" "$TESTPOOL raidz" "$TESTPOOL mirror raidz" \ "$TESTPOOL raidz1" "$TESTPOOL mirror raidz1" \ + "$TESTPOOL draid1" "$TESTPOOL mirror draid1" \ "$TESTPOOL mirror c?t?d?" "$TESTPOOL mirror $DISK0 c0t1d?" \ "$TESTPOOL RAIDZ $DISK0 $DISK1" \ "$TESTPOOL $DISK0 log $DISK1 log $DISK2" \ "$TESTPOOL $DISK0 spare $DISK1 spare $DISK2" \ - "$TESTPOOL RAIDZ1 $DISK0 $DISK1" \ - "$TESTPOOL MIRROR $DISK0" "$TESTPOOL raidz $DISK0" \ - "$TESTPOOL raidz1 $DISK0" \ + "$TESTPOOL RAIDZ1 $DISK0 $DISK1" "$TESTPOOL MIRROR $DISK0" \ + "$TESTPOOL DRAID $DISK1 $DISK2 $DISK3" "$TESTPOOL raidz $DISK0" \ + "$TESTPOOL raidz1 $DISK0" "$TESTPOOL draid $DISK0" \ + "$TESTPOOL draid2 $DISK0 $DISK1" \ + "$TESTPOOL draid $DISK0 $DISK1 $DISK2 spare s0-draid1-0" \ "1tank $DISK0" "1234 $DISK0" "?tank $DISK0" \ "tan%k $DISK0" "ta@# $DISK0" "tan+k $DISK0" \ "$BYND_MAX_NAME $DISK0" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh index 0d7acdb40..e2f389903 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh @@ -63,7 +63,7 @@ log_onexit cleanup unset NOINUSE_CHECK typeset opt -for opt in "" "mirror" "raidz" "raidz1"; do +for opt in "" "mirror" "raidz" "draid"; do if [[ $opt == "" ]]; then typeset disks=$DISK0 else diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh index e0b3850e4..36bbaa7de 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh @@ -63,15 +63,16 @@ log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS typeset -l devsize=$(($SPA_MINDEVSIZE - 1024 * 1024)) -for files in $TESTDIR/file1 $TESTDIR/file2 +for files in $TESTDIR/file1 $TESTDIR/file2 $TESTDIR/file3 do - log_must mkfile $devsize $files + log_must truncate -s $devsize $files done set -A args \ "$TOOSMALL $TESTDIR/file1" "$TESTPOOL1 $TESTDIR/file1 $TESTDIR/file2" \ "$TOOSMALL mirror $TESTDIR/file1 $TESTDIR/file2" \ - "$TOOSMALL raidz $TESTDIR/file1 $TESTDIR/file2" + "$TOOSMALL raidz $TESTDIR/file1 $TESTDIR/file2" \ + "$TOOSMALL draid $TESTDIR/file1 $TESTDIR/file2 $TESTDIR/file3" typeset -i i=0 while [[ $i -lt ${#args[*]} ]]; do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh index 140771d4f..9437033ae 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh +++ 
b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh @@ -54,7 +54,7 @@ function cleanup destroy_pool $pool done - rm -rf $disk1 $disk2 $disk3 + rm -rf $disk1 $disk2 $disk3 $disk4 if [[ -n $saved_dump_dev ]]; then log_must dumpadm -u -d $saved_dump_dev @@ -66,12 +66,16 @@ log_onexit cleanup disk1=$(create_blockfile $FILESIZE) disk2=$(create_blockfile $FILESIZE) -disk3=$(create_blockfile $FILESIZE1) +disk3=$(create_blockfile $FILESIZE) +disk4=$(create_blockfile $FILESIZE1) mirror1="$DISK0 $DISK1" mirror2="$disk1 $disk2" raidz1=$mirror1 raidz2=$mirror2 -diff_size_dev="$disk2 $disk3" +draid1="$DISK0 $DISK1 $DISK2" +draid2="$disk1 $disk2 $disk3" +diff_size_dev="$disk2 $disk4" +draid_diff_size_dev="$disk1 $disk2 $disk4" vfstab_dev=$(find_vfstab_dev) if is_illumos; then @@ -91,13 +95,17 @@ set -A arg \ "$TESTPOOL1 mirror mirror $mirror1 mirror $mirror2" \ "$TESTPOOL1 raidz raidz $raidz1 raidz $raidz2" \ "$TESTPOOL1 raidz1 raidz1 $raidz1 raidz1 $raidz2" \ + "$TESTPOOL1 draid draid $draid draid $draid2" \ "$TESTPOOL1 mirror raidz $raidz1 raidz $raidz2" \ "$TESTPOOL1 mirror raidz1 $raidz1 raidz1 $raidz2" \ + "$TESTPOOL1 mirror draid $draid1 draid $draid2" \ "$TESTPOOL1 raidz mirror $mirror1 mirror $mirror2" \ "$TESTPOOL1 raidz1 mirror $mirror1 mirror $mirror2" \ + "$TESTPOOL1 draid1 mirror $mirror1 mirror $mirror2" \ "$TESTPOOL1 mirror $diff_size_dev" \ "$TESTPOOL1 raidz $diff_size_dev" \ "$TESTPOOL1 raidz1 $diff_size_dev" \ + "$TESTPOOL1 draid1 $draid_diff_size_dev" \ "$TESTPOOL1 mirror $mirror1 spare $mirror2 spare $diff_size_dev" \ "$TESTPOOL1 $vfstab_dev" \ "$TESTPOOL1 ${DISK0}s10" \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh new file mode 100755 index 000000000..9717af505 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Create a variety of dRAID pools using the minimal dRAID vdev syntax. +# +# STRATEGY: +# 1) Create the required number of allowed dRAID vdevs. +# 2) Create few pools of various sizes using the draid1|draid2|draid3 syntax. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + rm -f $all_vdevs + rmdir $TESTDIR +} + +log_assert "'zpool create ...' can create a pool." + +log_onexit cleanup + +all_vdevs=$(echo $TESTDIR/file.{01..84}) + +mkdir $TESTDIR +log_must truncate -s $MINVDEVSIZE $all_vdevs + +# Verify all configurations up to 24 vdevs. 
+for parity in {1..3}; do
+	for children in {$((parity + 2))..24}; do
+		vdevs=$(echo $TESTDIR/file.{01..${children}})
+		log_must zpool create $TESTPOOL draid$parity $vdevs
+		log_must poolexists $TESTPOOL
+		destroy_pool $TESTPOOL
+	done
+done
+
+# Spot check a few large configurations.
+children_counts="53 84"
+for children in $children_counts; do
+	vdevs=$(echo $TESTDIR/file.{01..${children}})
+	log_must zpool create $TESTPOOL draid $vdevs
+	log_must poolexists $TESTPOOL
+	destroy_pool $TESTPOOL
+done
+
+log_pass "'zpool create ...' success."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh
new file mode 100755
index 000000000..2e1ff3931
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh
@@ -0,0 +1,82 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020 Lawrence Livermore National Security, LLC.
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Create dRAID pool using the maximum number of vdevs (255). Then verify
+# that creating a pool with 256 fails as expected.
+#
+# STRATEGY:
+# 1) Verify a pool with fewer than the required vdevs fails.
+# 2) Verify pools with a valid number of vdevs succeed.
+# 3) Verify a pool which exceeds the maximum number of vdevs fails.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+
+	rm -f $all_vdevs
+	rmdir $TESTDIR
+}
+
+log_assert "'zpool create <pool> draid <vdevs>'"
+
+log_onexit cleanup
+
+all_vdevs=$(echo $TESTDIR/file.{01..256})
+
+mkdir $TESTDIR
+log_must truncate -s $MINVDEVSIZE $all_vdevs
+
+# Below the minimum dRAID vdev count for the specified parity level.
+log_mustnot zpool create $TESTPOOL draid1 $(echo $TESTDIR/file.{01..01})
+log_mustnot zpool create $TESTPOOL draid2 $(echo $TESTDIR/file.{01..02})
+log_mustnot zpool create $TESTPOOL draid3 $(echo $TESTDIR/file.{01..03})
+
+# Verify pool sizes from 2-10. Values in between are skipped to speed
+# up the test case but will be exercised by the random pool creation
+# done in zpool_create_draid_003_pos.ksh.
+for (( i=2; i<=10; i++ )); do
+	log_must zpool create $TESTPOOL draid:${i}c \
+	    $(echo $TESTDIR/file.{01..$i})
+	log_must destroy_pool $TESTPOOL
+done
+
+# Verify pool sizes from 254-255.
+for (( i=254; i<=255; i++ )); do
+	log_must zpool create $TESTPOOL draid:${i}c \
+	    $(echo $TESTDIR/file.{01..$i})
+	log_must destroy_pool $TESTPOOL
+done
+
+# Exceeds maximum dRAID vdev count (256).
+log_mustnot zpool create $TESTPOOL draid $(echo $TESTDIR/file.{01..256})
+
+log_pass "'zpool create <pool> draid <vdevs>'"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh
new file mode 100755
index 000000000..52cd00cf4
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh
@@ -0,0 +1,112 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020 Lawrence Livermore National Security, LLC.
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify allowed stripe widths (data+parity) and hot spares may be
+# configured at pool creation time.
+#
+# STRATEGY:
+# 1) Test valid stripe/spare combinations given the number of children.
+# 2) Test invalid stripe/spare/children combinations outside the allowed
+#    limits.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+
+	rm -f $draid_vdevs
+	rmdir $TESTDIR
+}
+
+log_assert "'zpool create <pool> draid:#d:#c:#s <vdevs>'"
+
+log_onexit cleanup
+
+mkdir $TESTDIR
+
+# Generate 10 random valid configurations to test.
+for (( i=0; i<10; i++ )); do
+	parity=$(random_int_between 1 3)
+	spares=$(random_int_between 0 3)
+	data=$(random_int_between 1 16)
+
+	(( min_children = (data + parity + spares) ))
+	children=$(random_int_between $min_children 32)
+
+	draid="draid${parity}:${data}d:${children}c:${spares}s"
+
+	draid_vdevs=$(echo $TESTDIR/file.{01..$children})
+	log_must truncate -s $MINVDEVSIZE $draid_vdevs
+
+	log_must zpool create $TESTPOOL $draid $draid_vdevs
+	log_must poolexists $TESTPOOL
+	destroy_pool $TESTPOOL
+
+	rm -f $draid_vdevs
+done
+
+children=32
+draid_vdevs=$(echo $TESTDIR/file.{01..$children})
+log_must truncate -s $MINVDEVSIZE $draid_vdevs
+
+# Out of order and unknown suffixes should fail.
+log_mustnot zpool create $TESTPOOL draid:d8 $draid_vdevs
+log_mustnot zpool create $TESTPOOL draid:s3 $draid_vdevs
+log_mustnot zpool create $TESTPOOL draid:c32 $draid_vdevs
+log_mustnot zpool create $TESTPOOL draid:10x $draid_vdevs
+log_mustnot zpool create $TESTPOOL draid:x10 $draid_vdevs
+
+# Exceeds maximum data disks (limited by total children)
+log_must zpool create $TESTPOOL draid2:30d $draid_vdevs
+log_must destroy_pool $TESTPOOL
+log_mustnot zpool create $TESTPOOL draid2:31d $draid_vdevs
+
+# At least one data disk must be requested.
+log_mustnot zpool create $TESTPOOL draid2:0d $draid_vdevs
+
+# Check invalid parity levels.
+log_mustnot zpool create $TESTPOOL draid0 $draid_vdevs +log_mustnot zpool create $TESTPOOL draid4 $draid_vdevs + +# Spares are limited: spares < children - (parity + data). +log_must zpool create $TESTPOOL draid2:20d:10s $draid_vdevs +log_must destroy_pool $TESTPOOL +log_mustnot zpool create $TESTPOOL draid2:20d:11s $draid_vdevs + +# The required children argument is enforced. +log_mustnot zpool create $TESTPOOL draid2:0c $draid_vdevs +log_mustnot zpool create $TESTPOOL draid2:31c $draid_vdevs +log_must zpool create $TESTPOOL draid2:32c $draid_vdevs +destroy_pool $TESTPOOL + +log_pass "'zpool create draid:#d:#c:#s '" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh new file mode 100755 index 000000000..6b700fa36 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh @@ -0,0 +1,43 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify generated dRAID permutation maps against the authoritative +# reference file contains the full permutations. +# + +verify_runnable "global" + +log_assert "'draid verify'" + +DRAIDCFG="$STF_SUITE/tests/functional/cli_root/zpool_create/draidcfg.gz" + +log_must draid verify $DRAIDCFG + +log_pass "'draid verify'" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh index f39e6267b..922e35125 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh @@ -72,7 +72,7 @@ log_onexit cleanup log_assert "zpool can be autoexpanded after set autoexpand=on on vdev expansion" -for type in " " mirror raidz raidz2; do +for type in " " mirror raidz draid; do log_note "Setting up loopback, scsi_debug, and file vdevs" log_must truncate -s $org_size $FILE_LO DEV1=$(losetup -f) @@ -144,6 +144,16 @@ for type in " " mirror raidz raidz2; do if [[ $? -ne 0 ]] ; then log_fail "pool $TESTPOOL1 has not expanded" fi + elif [[ $type == "draid" ]]; then + typeset expansion_size=$((2*($exp_size-$org_size))) + zpool history -il $TESTPOOL1 | \ + grep "pool '$TESTPOOL1' size:" | \ + grep "vdev online" | \ + grep "(+${expansion_size})" >/dev/null 2>&1 + + if [[ $? 
-ne 0 ]]; then + log_fail "pool $TESTPOOL has not expanded" + fi else typeset expansion_size=$((3*($exp_size-$org_size))) zpool history -il $TESTPOOL1 | \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh index a49d4fc17..62843b062 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh @@ -63,7 +63,7 @@ log_onexit cleanup log_assert "zpool can expand after zpool online -e zvol vdevs on vdev expansion" -for type in " " mirror raidz raidz2; do +for type in " " mirror raidz draid:1s; do # Initialize the file devices and the pool for i in 1 2 3; do log_must truncate -s $org_size ${TEMPFILE}.$i @@ -92,6 +92,8 @@ for type in " " mirror raidz raidz2; do if [[ $type == "mirror" ]]; then typeset expected_zpool_expandsize=$(($exp_size-$org_size)) + elif [[ $type == "draid:1s" ]]; then + typeset expected_zpool_expandsize=$((2*($exp_size-$org_size))) else typeset expected_zpool_expandsize=$((3*($exp_size-$org_size))) fi @@ -147,6 +149,17 @@ for type in " " mirror raidz raidz2; do log_fail "pool $TESTPOOL1 has not expanded " \ "after zpool online -e" fi + elif [[ $type == "draid:1s" ]]; then + typeset expansion_size=$((2*($exp_size-$org_size))) + zpool history -il $TESTPOOL1 | \ + grep "pool '$TESTPOOL1' size:" | \ + grep "vdev online" | \ + grep "(+${expansion_size})" >/dev/null 2>&1 + + if [[ $? -ne 0 ]] ; then + log_fail "pool $TESTPOOL1 has not expanded " \ + "after zpool online -e" + fi else typeset expansion_size=$((3*($exp_size-$org_size))) zpool history -il $TESTPOOL1 | \ @@ -160,9 +173,17 @@ for type in " " mirror raidz raidz2; do fi fi else - log_fail "pool $TESTPOOL1 did not expand after vdev expansion " \ - "and zpool online -e" + log_fail "pool $TESTPOOL1 did not expand after vdev " \ + "expansion and zpool online -e" fi + + # For dRAID pools verify the distributed spare was resized after + # expansion and it is large enough to be used to replace a pool vdev. + if [[ $type == "draid:1s" ]]; then + log_must zpool replace -w $TESTPOOL1 $TEMPFILE.3 draid1-0-0 + verify_pool $TESTPOOL1 + fi + log_must zpool destroy $TESTPOOL1 done log_pass "zpool can expand after zpool online -e" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh index 323d0b907..b3c71b666 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh @@ -73,7 +73,7 @@ log_onexit cleanup log_assert "zpool can not expand if set autoexpand=off after vdev expansion" -for type in " " mirror raidz raidz2; do +for type in " " mirror raidz draid; do log_note "Setting up loopback, scsi_debug, and file vdevs" log_must truncate -s $org_size $FILE_LO DEV1=$(losetup -f) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh index 8a4db824b..09e2b6da2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh @@ -61,7 +61,7 @@ log_onexit cleanup log_assert "After vdev expansion, all 4 labels have the same set of uberblocks." 
-for type in " " mirror raidz raidz2; do +for type in " " mirror raidz draid; do for i in 1 2 3; do log_must truncate -s $org_size ${TEMPFILE}.$i done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 8abef65de..3c536ca12 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -80,6 +80,7 @@ typeset -a properties=( "feature@bookmark_written" "feature@log_spacemap" "feature@device_rebuild" + "feature@draid" ) if is_linux || is_freebsd; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am index ad0f9c46e..a99c5011e 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am @@ -29,6 +29,8 @@ dist_pkgdata_SCRIPTS = \ zpool_import_013_neg.ksh \ zpool_import_014_pos.ksh \ zpool_import_015_pos.ksh \ + zpool_import_016_pos.ksh \ + zpool_import_017_pos.ksh \ zpool_import_all_001_pos.ksh \ zpool_import_features_001_pos.ksh \ zpool_import_features_002_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh index ab72042a2..3238faaa9 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh @@ -69,6 +69,8 @@ test_add_vdevs "mirror $VDEV0 $VDEV1" "mirror $VDEV2 $VDEV3" \ "mirror $VDEV0 $VDEV1 mirror $VDEV2 $VDEV3" test_add_vdevs "$VDEV0" "raidz $VDEV1 $VDEV2 $VDEV3" \ "$VDEV0 raidz $VDEV1 $VDEV2 $VDEV3" +test_add_vdevs "$VDEV0" "draid $VDEV1 $VDEV2 $VDEV3" \ + "$VDEV0 draid $VDEV1 $VDEV2 $VDEV3" test_add_vdevs "$VDEV0" "log $VDEV1" "$VDEV0 log $VDEV1" test_add_vdevs "$VDEV0 log $VDEV1" "$VDEV2" "$VDEV0 $VDEV2 log $VDEV1" test_add_vdevs "$VDEV0" "$VDEV1 log $VDEV2" "$VDEV0 $VDEV1 log $VDEV2" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh index a42c69747..8a81c18cd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh @@ -155,6 +155,12 @@ test_replacing_vdevs "raidz $VDEV0 $VDEV1 $VDEV2" \ "$VDEV0 $VDEV1 $VDEV2" \ true 20 +test_replacing_vdevs "draid:1s $VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4" \ + "$VDEV1" "$VDEV5" \ + "draid $VDEV0 $VDEV5 $VDEV2 $VDEV3 $VDEV4 spares draid1-0-0" \ + "$VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4" \ + true 30 + set_zfs_txg_timeout $ZFS_TXG_TIMEOUT log_pass "zpool import -c cachefile_unaware_of_replace passed." 
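The expected-config strings above rely on the dRAID distributed spare naming scheme: draid<parity>-<top-level vdev index>-<spare index>, so draid1-0-0 is the first distributed spare of the first single-parity dRAID vdev. A minimal sketch of how such a spare is engaged, mirroring the commands used by these tests (the pool name "tank" and reuse of the suite's $VDEV* files are illustrative):
zpool create tank draid:1s $VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4
zpool status tank                     # draid1-0-0 appears under the "spares" section
zpool replace tank $VDEV1 draid1-0-0  # resilver onto the distributed spare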
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh index 887993dfd..87942b4a5 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh @@ -108,6 +108,7 @@ test_shared_device "mirror $VDEV0 $VDEV1" "mirror $VDEV1 $VDEV2" "$VDEV1" test_shared_device "mirror $VDEV0 $VDEV1 $VDEV2" "mirror $VDEV2 $VDEV3" \ "$VDEV2" test_shared_device "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV2" "$VDEV2" +test_shared_device "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV2" "$VDEV2" test_shared_device "$VDEV0 log $VDEV1" "$VDEV2 log $VDEV1" "$VDEV1" "-m" log_pass "Pool doesn't write to a device it doesn't own anymore." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh index 7ee306e26..15f3a0a7b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh @@ -89,9 +89,11 @@ test_new_paths "$VDEV0 $VDEV1" "$VDEV0 $VDEV1" test_new_paths "mirror $VDEV0 $VDEV1" "$VDEV0 $VDEV1" test_new_paths "$VDEV0 log $VDEV1" "$VDEV1" test_new_paths "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV1" +test_new_paths "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV1" test_swap_paths "$VDEV0 $VDEV1" "$VDEV0" "$VDEV1" test_swap_paths "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV0" "$VDEV1" +test_swap_paths "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV0" "$VDEV1" test_swap_paths "mirror $VDEV0 $VDEV1 mirror $VDEV2 $VDEV3" \ "$VDEV0" "$VDEV2" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh index 74d75b6cd..3ac8c104f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh @@ -220,6 +220,7 @@ test_add_vdevs "$VDEV0 $VDEV1" "$VDEV2" test_add_vdevs "$VDEV0" "$VDEV1 $VDEV2" test_add_vdevs "mirror $VDEV0 $VDEV1" "mirror $VDEV2 $VDEV3" test_add_vdevs "$VDEV0" "raidz $VDEV1 $VDEV2 $VDEV3" +test_add_vdevs "$VDEV0" "draid $VDEV1 $VDEV2 $VDEV3" test_add_vdevs "$VDEV0" "log $VDEV1" test_add_vdevs "$VDEV0 log $VDEV1" "$VDEV2" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh index 94d1cb25d..b03b39d17 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh @@ -176,6 +176,11 @@ test_replace_vdev "raidz $VDEV0 $VDEV1 $VDEV2" \ "raidz $VDEV0 $VDEV3 $VDEV2" \ "$VDEV0 $VDEV1 $VDEV2" 10 +test_replace_vdev "draid $VDEV0 $VDEV1 $VDEV2 $VDEV3" \ + "$VDEV1" "$VDEV4" \ + "draid $VDEV0 $VDEV4 $VDEV2 $VDEV3 spares draid1-0-0" \ + "$VDEV0 $VDEV1 $VDEV2 $VDEV3" 10 + set_zfs_txg_timeout $ZFS_TXG_TIMEOUT log_pass "zpool import rewind after device replacement passed." 
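A plausible motivation for the mkfile-to-truncate conversions in the hunks that follow: mkfile physically writes every byte of the file, while truncate -s only sets the apparent size, so the device files start out sparse and test setup/cleanup become nearly instant. A quick illustration (path hypothetical):
truncate -s 512m /var/tmp/sparse.img
ls -lh /var/tmp/sparse.img   # apparent size: 512M
du -h /var/tmp/sparse.img    # blocks actually allocated: ~0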
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh index 74324c84e..22e619d74 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh @@ -49,7 +49,7 @@ log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS i=0 while (( i < $MAX_NUM )); do - log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must truncate -s $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i (( i = i + 1 )) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg index 6c1ab194e..25f541ebf 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg @@ -34,7 +34,7 @@ export DISK=${DISKS%% *} export FS_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 32))m" export FILE_SIZE="$((MINVDEVSIZE))" export SLICE_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 2))m" -export MAX_NUM=5 +export MAX_NUM=6 export DEVICE_DIR=$TEST_BASE_DIR/dev_import-test export BACKUP_DEVICE_DIR=$TEST_BASE_DIR/bakdev_import-test export DEVICE_FILE=disk @@ -60,5 +60,6 @@ export VDEV1=$DEVICE_DIR/${DEVICE_FILE}1 export VDEV2=$DEVICE_DIR/${DEVICE_FILE}2 export VDEV3=$DEVICE_DIR/${DEVICE_FILE}3 export VDEV4=$DEVICE_DIR/${DEVICE_FILE}4 +export VDEV5=$DEVICE_DIR/${DEVICE_FILE}5 export ALTER_ROOT=/alter_import-test diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib index 48794c982..8bbd668a9 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib @@ -31,7 +31,7 @@ function cleanup log_must rm -rf $DEVICE_DIR/* typeset i=0 while (( i < $MAX_NUM )); do - log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must truncate -s $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i ((i += 1)) done is_linux && set_tunable32 TXG_HISTORY 0 @@ -163,7 +163,7 @@ function increase_device_sizes typeset -i i=0 while (( i < $MAX_NUM )); do - log_must mkfile $newfilesize ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must truncate -s $newfilesize ${DEVICE_DIR}/${DEVICE_FILE}$i ((i += 1)) done } @@ -171,15 +171,18 @@ function increase_device_sizes # # Translate vdev names returned by zpool status into more generic names. # -# eg: mirror-2 --> mirror -# function _translate_vdev { typeset vdev=$1 - typeset keywords="mirror replacing raidz1 raidz2 raidz3 indirect" + # + # eg: mirror-2 --> mirror + # eg: draid2:4d:12c:1s-0 --> draid2 + # + typeset keywords="mirror replacing raidz1 raidz2 raidz3 indirect draid1 draid2 draid3" for word in $keywords; do - echo $vdev | egrep "^${word}-[0-9]+\$" > /dev/null + echo $vdev | egrep -qE \ "^${word}-[0-9]+\$|^${word}:[0-9]+d:[0-9]+c:[0-9]+s-[0-9]+\$" if [[ $?
-eq 0 ]]; then vdev=$word break @@ -188,6 +191,7 @@ function _translate_vdev [[ $vdev == "logs" ]] && echo "log" && return 0 [[ $vdev == "raidz1" ]] && echo "raidz" && return 0 + [[ $vdev == "draid1" ]] && echo "draid" && return 0 echo $vdev return 0 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh index 6e93fd471..928efebdd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh @@ -63,7 +63,7 @@ log_assert "For raidz, one destroyed pools devices was removed or used by " \ "other pool, it still can be imported correctly." log_onexit cleanup -log_must zpool create $TESTPOOL1 raidz $VDEV0 $VDEV1 $VDEV2 $VDIV3 +log_must zpool create $TESTPOOL1 raidz $VDEV0 $VDEV1 $VDEV2 $VDEV3 typeset guid=$(get_config $TESTPOOL1 pool_guid) typeset target=$TESTPOOL1 if (( RANDOM % 2 == 0 )) ; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh index 096bbe811..f8da584aa 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh @@ -63,7 +63,7 @@ log_assert "For raidz2, two destroyed pools devices was removed or used by " \ "other pool, it still can be imported correctly." log_onexit cleanup -log_must zpool create $TESTPOOL1 raidz2 $VDEV0 $VDEV1 $VDEV2 $VDIV3 +log_must zpool create $TESTPOOL1 raidz2 $VDEV0 $VDEV1 $VDEV2 $VDEV3 typeset guid=$(get_config $TESTPOOL1 pool_guid) typeset target=$TESTPOOL1 if (( RANDOM % 2 == 0 )) ; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh index b337bd00f..212024dfc 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh @@ -39,7 +39,7 @@ # STRATEGY: # 1. Create a 5 ways mirror pool A with dev0/1/2/3/4, then destroy it. # 2. Create a stripe pool B with dev1. Then destroy it. -# 3. Create a raidz2 pool C with dev2/3/4. Then destroy it. +# 3. Create a draid2 pool C with dev2/3/4/5. Then destroy it. # 4. Create a raidz pool D with dev3/4. Then destroy it. # 5. Create a stripe pool E with dev4. Then destroy it. # 6. Verify 'zpool import -D -a' recover all the pools. @@ -74,7 +74,7 @@ log_must zpool destroy $poolA log_must zpool create $poolB $VDEV1 log_must zpool destroy $poolB -log_must zpool create $poolC raidz2 $VDEV2 $VDEV3 $VDEV4 +log_must zpool create $poolC draid2 $VDEV2 $VDEV3 $VDEV4 $VDEV5 log_must zpool destroy $poolC log_must zpool create $poolD raidz $VDEV3 $VDEV4 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_016_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_016_pos.ksh new file mode 100755 index 000000000..5434625cb --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_016_pos.ksh @@ -0,0 +1,91 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg + +# +# DESCRIPTION: +# For draid, when one of a destroyed pool's devices is removed or used by +# another pool, the pool can still be imported correctly. +# +# STRATEGY: +# 1. Create a draid pool A with N disks. +# 2. Destroy this pool A. +# 3. Create another pool B with 1 disk which was used by pool A. +# 4. Verify the import of this draid pool succeeds. +# + +verify_runnable "global" + +function cleanup +{ + destroy_pool $TESTPOOL2 + destroy_pool $TESTPOOL1 + + log_must rm -rf $DEVICE_DIR/* + typeset i=0 + while (( i < $MAX_NUM )); do + log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + ((i += 1)) + done +} + +log_assert "For draid, when one of a destroyed pool's devices is removed " \ + "or used by another pool, the pool can still be imported correctly." +log_onexit cleanup + +log_must zpool create $TESTPOOL1 draid $VDEV0 $VDEV1 $VDEV2 $VDEV3 +typeset guid=$(get_config $TESTPOOL1 pool_guid) +typeset target=$TESTPOOL1 +if (( RANDOM % 2 == 0 )) ; then + target=$guid + log_note "Import by guid." +fi +log_must zpool destroy $TESTPOOL1 + +log_must zpool create $TESTPOOL2 $VDEV0 +log_must zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL1 + +log_must zpool destroy $TESTPOOL2 +log_must rm -rf $VDEV0 +log_must zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL1 + +log_note "For draid, if two of the destroyed pool's devices are used, import fails." +log_must mkfile $FILE_SIZE $VDEV0 +log_must zpool create $TESTPOOL2 $VDEV0 $VDEV1 +log_mustnot zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL2 + +log_pass "zpool import -D draid passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_017_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_017_pos.ksh new file mode 100755 index 000000000..2e6cef265 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_017_pos.ksh @@ -0,0 +1,92 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg + +# +# DESCRIPTION: +# For draid2, when two of a destroyed pool's devices are removed or used by +# another pool, the pool can still be imported correctly. +# +# STRATEGY: +# 1. Create a draid2 pool A with N disks. +# 2. Destroy this pool A. +# 3. Create another pool B with two disks which were used by pool A. +# 4. Verify the import of this draid2 pool succeeds. +# + +verify_runnable "global" + +function cleanup +{ + destroy_pool $TESTPOOL2 + destroy_pool $TESTPOOL1 + + log_must rm -rf $DEVICE_DIR/* + typeset i=0 + while (( i < $MAX_NUM )); do + log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + ((i += 1)) + done +} + +log_assert "For draid2, when two of a destroyed pool's devices are removed " \ + "or used by another pool, the pool can still be imported correctly." +log_onexit cleanup + +log_must zpool create $TESTPOOL1 draid2 $VDEV0 $VDEV1 $VDEV2 $VDEV3 +typeset guid=$(get_config $TESTPOOL1 pool_guid) +typeset target=$TESTPOOL1 +if (( RANDOM % 2 == 0 )) ; then + target=$guid + log_note "Import by guid." +fi +log_must zpool destroy $TESTPOOL1 + +log_must zpool create $TESTPOOL2 $VDEV0 $VDEV1 +log_must zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL1 + +log_must zpool destroy $TESTPOOL2 +log_must rm -rf $VDEV0 $VDEV1 +log_must zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL1 + +log_note "For draid2, if more than two of the destroyed pool's devices are " \ + "used, import fails." +log_must mkfile $FILE_SIZE $VDEV0 $VDEV1 +log_must zpool create $TESTPOOL2 $VDEV0 $VDEV1 $VDEV2 +log_mustnot zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL2 + +log_pass "zpool import -D draid2 passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh index 78e9bbf68..3b5167ff0 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh @@ -57,8 +57,8 @@ # Using the various combinations. # - Regular import # - Alternate Root Specified -# It should be succeed with single d/m device upon 'raidz' & 'mirror', -# but failed against 'regular' or more d/m devices. +# It should succeed with single d/m device upon 'raidz', 'mirror', +# 'draid', but fail against 'regular' or more d/m devices. # 6. If import succeed, verify following is true: # - The pool shows up under 'zpool list'. # - The pool's health should be DEGRADED. @@ -67,7 +67,16 @@ verify_runnable "global" -set -A vdevs "" "mirror" "raidz" +# Randomly test a subset of combinations to speed up the test.
+(( rc=RANDOM % 3 )) +if [[ $rc == 0 ]] ; then + set -A vdevs "" "mirror" "raidz" +elif [[ $rc == 1 ]] ; then + set -A vdevs "" "mirror" "draid" +else + set -A vdevs "" "raidz" "draid" +fi + set -A options "" "-R $ALTER_ROOT" function cleanup @@ -89,7 +98,8 @@ function recreate_files log_must rm -rf $DEVICE_DIR/* typeset i=0 while (( i < $MAX_NUM )); do - log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must rm -f ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must truncate -s $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i ((i += 1)) done } @@ -157,6 +167,9 @@ while (( i < ${#vdevs[*]} )); do 'raidz') (( count > 1 )) && \ action=log_mustnot ;; + 'draid') (( count > 1 )) && \ + action=log_mustnot + ;; '') action=log_mustnot ;; esac diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh index c6d263707..60af3f321 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh @@ -43,6 +43,8 @@ # before data integrity is compromised # - Raidz could withstand one devices failing # before data integrity is compromised +# - dRAID could withstand one device failing +# before data integrity is compromised # Verify that is true. # # STRATEGY: @@ -50,6 +52,7 @@ # - Regular pool # - Mirror # - Raidz +# - dRAID # 2. Create necessary filesystem and test files. # 3. Export the test pool. # 4. Move one or more device files to other directory @@ -62,7 +65,16 @@ verify_runnable "global" -set -A vdevs "" "mirror" "raidz" +# Randomly test a subset of combinations to speed up the test. +(( rc=RANDOM % 3 )) +if [[ $rc == 0 ]] ; then + set -A vdevs "" "mirror" "raidz" +elif [[ $rc == 1 ]] ; then + set -A vdevs "" "mirror" "draid" +else + set -A vdevs "" "raidz" "draid" +fi + set -A options "" "-R $ALTER_ROOT" function cleanup @@ -88,7 +100,8 @@ function cleanup_all while (( i < $MAX_NUM )); do typeset dev_file=${DEVICE_DIR}/${DEVICE_FILE}$i if [[ ! -e ${dev_file} ]]; then - log_must mkfile $FILE_SIZE ${dev_file} + log_must rm -f ${dev_file} + log_must truncate -s $FILE_SIZE ${dev_file} fi ((i += 1)) done @@ -158,7 +171,8 @@ while (( i < ${#vdevs[*]} )); do # Backup all device files while filesystem prepared. # if [[ -z $backup ]] ; then - log_must tar cf $DEVICE_DIR/$DEVICE_ARCHIVE ${DEVICE_FILE}* + log_must tar cf $DEVICE_DIR/$DEVICE_ARCHIVE \ + ${DEVICE_FILE}0 ${DEVICE_FILE}1 ${DEVICE_FILE}2 backup="true" fi @@ -174,6 +188,9 @@ while (( i < ${#vdevs[*]} )); do 'raidz') (( count == 1 )) && \ action=log_must ;; + 'draid') (( count == 1 )) && \ + action=log_must + ;; esac typeset target=$TESTPOOL1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh index 6fa55250a..9d4629a77 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh @@ -64,7 +64,7 @@ if !
is_illumos; then log_unsupported "Test case may be slow" fi -set -A vdevs "" "mirror" "raidz" +set -A vdevs "" "mirror" "raidz" "draid" function verify { @@ -207,6 +207,9 @@ while (( i < ${#vdevs[*]} )); do 'raidz') (( overlap > 1 )) && \ action=log_mustnot ;; + 'draid') (( overlap > 1 )) && \ + action=log_mustnot + ;; '') action=log_mustnot ;; esac diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh index f135de4bc..a899e9f99 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh @@ -41,6 +41,7 @@ function cleanup log_must zpool detach $TESTPOOL $DISK2 get_disklist $TESTPOOL | grep $DISK3 >/dev/null && \ log_must zpool detach $TESTPOOL $DISK3 + log_must zpool sync $TESTPOOL } typeset pid diff --git a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh index 1bf54b1a8..0abe1e2ce 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh @@ -54,7 +54,7 @@ if is_linux; then # Add one 512b scsi_debug device (4Kn would generate IO errors) # NOTE: must be larger than other "file" vdevs and minimum SPA devsize: # add 32m of fudge - load_scsi_debug $(($SPA_MINDEVSIZE/1024/1024+32)) 1 1 1 '512b' + load_scsi_debug $(($MINVDEVSIZE/1024/1024+32)) 1 1 1 '512b' else log_unsupported "scsi debug module unsupported" fi @@ -85,10 +85,10 @@ typeset poolconfs=( "mirror $filedev1 $filedev2 special mirror $filedev3 $removedev" ) -log_must truncate -s $SPA_MINDEVSIZE $filedev1 -log_must truncate -s $SPA_MINDEVSIZE $filedev2 -log_must truncate -s $SPA_MINDEVSIZE $filedev3 -log_must truncate -s $SPA_MINDEVSIZE $sparedev +log_must truncate -s $MINVDEVSIZE $filedev1 +log_must truncate -s $MINVDEVSIZE $filedev2 +log_must truncate -s $MINVDEVSIZE $filedev3 +log_must truncate -s $MINVDEVSIZE $sparedev for conf in "${poolconfs[@]}" do diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh index b6af1a3f4..a93267185 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh @@ -55,36 +55,59 @@ zed_events_drain TESTFILE="/$TESTPOOL/$TESTFS/testfile" -for type in "mirror" "raidz" "raidz2"; do - # 1. Create a pool with hot spares - truncate -s $SPA_MINDEVSIZE $VDEV_FILES $SPARE_FILE - log_must zpool create -f $TESTPOOL $type $VDEV_FILES spare $SPARE_FILE +for type in "mirror" "raidz" "raidz2" "draid:1s"; do + if [ "$type" = "draid:1s" ]; then + # 1. Create a dRAID pool with a distributed hot spare + # + # Corruption is injected in the file-2 instead of file-1 + # vdev since the dRAID permutation at these offsets maps + # to distributed spare space and not data devices. + # + log_must truncate -s $MINVDEVSIZE $VDEV_FILES + log_must zpool create -f $TESTPOOL $type $VDEV_FILES + SPARE="draid1-0-0" + FAULT="$TEST_BASE_DIR/file-2" + else + # 1. Create a pool with hot spares + log_must truncate -s $MINVDEVSIZE $VDEV_FILES $SPARE_FILE + log_must zpool create -f $TESTPOOL $type $VDEV_FILES \ + spare $SPARE_FILE + SPARE=$SPARE_FILE + FAULT=$FAULT_FILE + fi # 2. 
Create a filesystem with the primary cache disable to force reads log_must zfs create -o primarycache=none $TESTPOOL/$TESTFS log_must zfs set recordsize=16k $TESTPOOL/$TESTFS # 3. Write a file to the pool to be read back - log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=16 + log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=64 # 4. Inject IO ERRORS on read with a zinject error handler - log_must zinject -d $FAULT_FILE -e io -T read $TESTPOOL + log_must zinject -d $FAULT -e io -T read $TESTPOOL log_must cp $TESTFILE /dev/null # 5. Verify the ZED kicks in a hot spare and expected pool/device status log_note "Wait for ZED to auto-spare" - log_must wait_vdev_state $TESTPOOL $FAULT_FILE "FAULTED" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_FILE "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_FILE "INUSE" + log_must wait_vdev_state $TESTPOOL $FAULT "FAULTED" 60 + log_must wait_vdev_state $TESTPOOL $SPARE "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE "INUSE" log_must check_state $TESTPOOL "" "DEGRADED" + # The ZED will use a sequential resilver for dRAID. Wait for the + # resilver and subsequent scrub to complete before moving on. + if [ "$type" = "draid:1s" ]; then + log_must wait_scrubbed $TESTPOOL + fi + # 6. Clear the fault log_must zinject -c all - log_must zpool clear $TESTPOOL $FAULT_FILE + log_must zpool clear $TESTPOOL $FAULT # 7. Verify the hot spare is available and expected pool/device status - log_must wait_vdev_state $TESTPOOL $FAULT_FILE "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_FILE "AVAIL" + log_must wait_vdev_state $TESTPOOL $FAULT "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE "AVAIL" + log_must is_pool_resilvered $TESTPOOL log_must check_state $TESTPOOL "" "ONLINE" diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh index f6d720a01..e9517bad7 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh @@ -60,15 +60,16 @@ TESTFILE="/$TESTPOOL/$TESTFS/testfile" for type in "mirror" "raidz" "raidz2"; do # 1. Create a pool with hot spares - truncate -s $SPA_MINDEVSIZE $VDEV_FILES $SPARE_FILE - log_must zpool create -f $TESTPOOL $type $VDEV_FILES spare $SPARE_FILE + log_must truncate -s $MINVDEVSIZE $VDEV_FILES $SPARE_FILE + log_must zpool create -f $TESTPOOL $type $VDEV_FILES \ + spare $SPARE_FILE # 2. Create a filesystem with the primary cache disable to force reads log_must zfs create -o primarycache=none $TESTPOOL/$TESTFS log_must zfs set recordsize=16k $TESTPOOL/$TESTFS # 3. Write a file to the pool to be read back - log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=16 + log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=64 # 4. Inject CHECKSUM ERRORS on read with a zinject error handler log_must zinject -d $FAULT_FILE -e corrupt -f 50 -T read $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh index e9857518e..f4fd21d04 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh @@ -60,7 +60,7 @@ FAIL_DEVICE="$TEST_BASE_DIR/fail-dev" # 1. 
Create a pool from 512b devices and set "ashift" pool property accordingly for vdev in $SAFE_DEVICE $FAIL_DEVICE; do - truncate -s $SPA_MINDEVSIZE $vdev + truncate -s $MINVDEVSIZE $vdev done log_must zpool create -f $TESTPOOL mirror $SAFE_DEVICE $FAIL_DEVICE # NOTE: file VDEVs should be added as 512b devices, verify this "just in case" @@ -71,7 +71,7 @@ log_must zpool set ashift=9 $TESTPOOL # 2. Add one 512e spare device (4Kn would generate IO errors on replace) # NOTE: must be larger than the existing 512b devices, add 32m of fudge -load_scsi_debug $(($SPA_MINDEVSIZE/1024/1024+32)) $SDHOSTS $SDTGTS $SDLUNS '512e' +load_scsi_debug $(($MINVDEVSIZE/1024/1024+32)) $SDHOSTS $SDTGTS $SDLUNS '512e' SPARE_DEVICE=$(get_debug_device) log_must_busy zpool add $TESTPOOL spare $SPARE_DEVICE diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh index bec413527..8a9cf6f53 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh @@ -63,15 +63,43 @@ FAULT_DEV1="$TEST_BASE_DIR/fault-dev1" FAULT_DEV2="$TEST_BASE_DIR/fault-dev2" SAFE_DEV1="$TEST_BASE_DIR/safe-dev1" SAFE_DEV2="$TEST_BASE_DIR/safe-dev2" -DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2" +SAFE_DEV3="$TEST_BASE_DIR/safe-dev3" +SAFE_DEV4="$TEST_BASE_DIR/safe-dev4" +DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2 $SAFE_DEV3 $SAFE_DEV4" SPARE_DEV1="$TEST_BASE_DIR/spare-dev1" SPARE_DEV2="$TEST_BASE_DIR/spare-dev2" SPARE_DEVS="$SPARE_DEV1 $SPARE_DEV2" -for type in "mirror" "raidz" "raidz2" "raidz3"; do - # 1. Create a pool with two hot spares - truncate -s $SPA_MINDEVSIZE $DATA_DEVS $SPARE_DEVS - log_must zpool create -f $TESTPOOL $type $DATA_DEVS spare $SPARE_DEVS +for type in "mirror" "raidz" "raidz2" "raidz3" "draid2:1s"; do + if [ "$type" = "draid2:1s" ]; then + # 1. Create a dRAID pool with a distributed and traditional + # hot spare to provide test coverage for both configurations. + # + # Corruption is injected in the third and fourth vdevs + # since the dRAID permutation at these offsets maps to + # distributed spare space and not data devices. + # + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1 + log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \ + $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \ + spare $SPARE_DEV1 + SPARE1=$SPARE_DEV1 + SPARE2="draid2-0-0" + elif [ "$type" = "mirror" ]; then + # 1. Create a 3-way mirror pool with two hot spares + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS + log_must zpool create -f $TESTPOOL $type \ + $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS + SPARE1=$SPARE_DEV1 + SPARE2=$SPARE_DEV2 + else + # 1. Create a raidz pool with two hot spares + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS + log_must zpool create -f $TESTPOOL $type $DATA_DEVS \ + spare $SPARE_DEVS + SPARE1=$SPARE_DEV1 + SPARE2=$SPARE_DEV2 + fi # 2. Inject IO ERRORS with a zinject error handler on the first device log_must zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL @@ -79,11 +107,11 @@ for type in "mirror" "raidz" "raidz2" "raidz3"; do # 3. Start a scrub log_must zpool scrub $TESTPOOL - # 4. Verify the ZED kicks in a hot spare and expected pool/device status + # 4. 
Verify the ZED kicks in a hot spare and the pool/device status log_note "Wait for ZED to auto-spare" log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_DEV1 "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "INUSE" + log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE" log_must check_state $TESTPOOL "" "DEGRADED" # 5. Inject IO ERRORS on a second device @@ -98,10 +126,14 @@ for type in "mirror" "raidz" "raidz2" "raidz3"; do # 7. Verify the ZED kicks in a second hot spare log_note "Wait for ZED to auto-spare" log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_DEV2 "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "INUSE" + log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE" log_must check_state $TESTPOOL "" "DEGRADED" + while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do + sleep 1 + done + # 8. Clear the fault on both devices log_must zinject -c all log_must zpool clear $TESTPOOL $FAULT_DEV1 @@ -110,8 +142,8 @@ for type in "mirror" "raidz" "raidz2" "raidz3"; do # 9. Verify the hot spares are available and expected pool/device status log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "ONLINE" 60 log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "AVAIL" - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "AVAIL" + log_must wait_hotspare_state $TESTPOOL $SPARE1 "AVAIL" + log_must wait_hotspare_state $TESTPOOL $SPARE2 "AVAIL" log_must check_state $TESTPOOL "" "ONLINE" # Cleanup @@ -120,11 +152,37 @@ done # Rinse and repeat, this time faulting both devices at the same time # NOTE: "raidz" is excluded since it cannot survive 2 faulted devices -# NOTE: "mirror" is a 4-way mirror here and should survive this test -for type in "mirror" "raidz2" "raidz3"; do - # 1. Create a pool with two hot spares - truncate -s $SPA_MINDEVSIZE $DATA_DEVS $SPARE_DEVS - log_must zpool create -f $TESTPOOL $type $DATA_DEVS spare $SPARE_DEVS +# NOTE: "mirror" is a 3-way mirror here and should survive this test +for type in "mirror" "raidz2" "raidz3" "draid2:1s"; do + if [ "$type" = "draid2:1s" ]; then + # 1. Create a dRAID pool with a distributed and traditional + # hot spare to provide test coverage for both configurations. + # + # Corruption is injected in the third and fourth vdevs + # since the dRAID permutation at these offsets maps to + # distributed spare space and not data devices. + # + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1 + log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \ + $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \ + spare $SPARE_DEV1 + SPARE1=$SPARE_DEV1 + SPARE2="draid2-0-0" + elif [ "$type" = "mirror" ]; then + # 1. Create a 3-way mirror pool with two hot spares + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS + log_must zpool create -f $TESTPOOL $type \ + $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS + SPARE1=$SPARE_DEV1 + SPARE2=$SPARE_DEV2 + else + # 1. Create a raidz pool with two hot spares + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS + log_must zpool create -f $TESTPOOL $type $DATA_DEVS \ + spare $SPARE_DEVS + SPARE1=$SPARE_DEV1 + SPARE2=$SPARE_DEV2 + fi # 2. 
Inject IO ERRORS with a zinject error handler on two devices log_must eval "zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL &" @@ -133,14 +191,14 @@ for type in "mirror" "raidz2" "raidz3"; do # 3. Start a scrub log_must zpool scrub $TESTPOOL - # 4. Verify the ZED kicks in two hot spares and expected pool/device status + # 4. Verify the ZED kicks in two hot spares and the pool/device status log_note "Wait for ZED to auto-spare" log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60 log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_DEV1 "ONLINE" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_DEV2 "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "INUSE" - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "INUSE" + log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60 + log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE" + log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE" log_must check_state $TESTPOOL "" "DEGRADED" # 5. Clear the fault on both devices diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh index 467161359..4229537b3 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh @@ -42,7 +42,7 @@ if is_linux; then # Add one 512b spare device (4Kn would generate IO errors on replace) # NOTE: must be larger than other "file" vdevs and minimum SPA devsize: # add 32m of fudge - load_scsi_debug $(($SPA_MINDEVSIZE/1024/1024+32)) 1 1 1 '512b' + load_scsi_debug $(($MINVDEVSIZE/1024/1024+32)) 1 1 1 '512b' else log_unsupported "scsi debug module unsupported" fi @@ -72,7 +72,7 @@ SPARE_DISKDEV="$(get_debug_device)" for vdev in $SAFE_FILEDEVPOOL1 $SAFE_FILEDEVPOOL2 $FAIL_FILEDEVPOOL1 \ $FAIL_FILEDEVPOOL2 $SPARE_FILEDEV; do - log_must truncate -s $SPA_MINDEVSIZE $vdev + log_must truncate -s $MINVDEVSIZE $vdev done for spare in $SPARE_FILEDEV $SPARE_DISKDEV; do diff --git a/tests/zfs-tests/tests/functional/raidz/Makefile.am b/tests/zfs-tests/tests/functional/raidz/Makefile.am index 694de18a6..d93eb73cf 100644 --- a/tests/zfs-tests/tests/functional/raidz/Makefile.am +++ b/tests/zfs-tests/tests/functional/raidz/Makefile.am @@ -3,4 +3,6 @@ dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ raidz_001_neg.ksh \ - raidz_002_pos.ksh + raidz_002_pos.ksh \ + raidz_003_pos.ksh \ + raidz_004_pos.ksh diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh new file mode 100755 index 000000000..bf22632c7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh @@ -0,0 +1,41 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Call the raidz_test tool with -S and -e to test all supported raidz +# implementations with expanded map and default reflow offset. +# These options test several raidz block geometries and several zio +# parameters that affect raidz block layout. Data reconstruction is +# performed for all combinations of failed disks. Wall time is capped +# by the -t option, but actual runtime might be longer. +# + +log_must raidz_test -S -e -t 60 + +log_pass "raidz_test parameter sweep test with expanded map succeeded." diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh new file mode 100755 index 000000000..6cd2bf7c9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh @@ -0,0 +1,41 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Call the raidz_test tool with -S and -e to test all supported raidz +# implementations with expanded map and zero reflow offset. +# These options test several raidz block geometries and several zio +# parameters that affect raidz block layout. Data reconstruction is +# performed for all combinations of failed disks. Wall time is capped +# by the -t option, but actual runtime might be longer. +# + +log_must raidz_test -S -e -r 0 -t 60 + +log_pass "raidz_test parameter sweep test with expanded map succeeded."
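The two scripts above differ only in the reflow offset: raidz_003_pos.ksh sweeps the expanded raidz map at the default -r value, while raidz_004_pos.ksh pins the reflow offset to zero. The same sweeps can presumably be reproduced by hand with the tool's new flags (-e enables the expanded map, -r sets the reflow offset, -t caps the sweep wall time):
raidz_test -S -e -t 60        # sweep, expanded map, default reflow offset
raidz_test -S -e -r 0 -t 60   # sweep, expanded map, reflow offset forced to 0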
diff --git a/tests/zfs-tests/tests/functional/redundancy/Makefile.am b/tests/zfs-tests/tests/functional/redundancy/Makefile.am index 6f6cc405b..b2d4414b2 100644 --- a/tests/zfs-tests/tests/functional/redundancy/Makefile.am +++ b/tests/zfs-tests/tests/functional/redundancy/Makefile.am @@ -2,10 +2,17 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/redundancy dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ - redundancy_001_pos.ksh \ - redundancy_002_pos.ksh \ - redundancy_003_pos.ksh \ - redundancy_004_neg.ksh + redundancy_draid1.ksh \ + redundancy_draid2.ksh \ + redundancy_draid3.ksh \ + redundancy_draid_spare1.ksh \ + redundancy_draid_spare2.ksh \ + redundancy_draid_spare3.ksh \ + redundancy_mirror.ksh \ + redundancy_raidz1.ksh \ + redundancy_raidz2.ksh \ + redundancy_raidz3.ksh \ + redundancy_stripe.ksh dist_pkgdata_DATA = \ redundancy.cfg \ diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib index 9bf2df0d1..26ded8720 100644 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib @@ -66,6 +66,23 @@ function random echo $value } +# +# Get the number of checksum errors for the pool. +# +# $1 Pool +# +function cksum_pool +{ + typeset -i cksum=$(zpool status $1 | awk ' + !NF { isvdev = 0 } + isvdev { errors += $NF } + /CKSUM$/ { isvdev = 1 } + END { print errors } + ') + + echo $cksum +} + # # Record the directories construction and checksum all the files which reside # within the specified pool @@ -81,6 +98,7 @@ function record_data [[ -z $pool ]] && log_fail "No specified pool." [[ -f $recordfile ]] && log_must rm -f $recordfile + sync_pool $pool typeset mntpnt mntpnt=$(get_prop mountpoint $pool) log_must eval "du -a $mntpnt > $recordfile 2>&1" @@ -119,22 +137,44 @@ function setup_test_env destroy_pool $pool fi - log_must mkfile $MINVDEVSIZE $vdevs + log_must truncate -s $MINVDEVSIZE $vdevs - log_must zpool create -m $TESTDIR $pool $keyword $vdevs + log_must zpool create -f -m $TESTDIR $pool $keyword $vdevs log_note "Filling up the filesystem ..." typeset -i ret=0 typeset -i i=0 typeset file=$TESTDIR/file + typeset -i limit + (( limit = $(get_prop available $pool) / 4 )) + while true ; do - file_write -o create -f $file.$i \ - -b $BLOCKSZ -c $NUM_WRITES + [[ $(get_prop available $pool) -lt $limit ]] && break + file_write -o create -f $file.$i -b $BLOCKSZ -c $NUM_WRITES ret=$? (( $ret != 0 )) && break (( i = i + 1 )) done + + record_data $TESTPOOL $PRE_RECORD_FILE +} + +function refill_test_env +{ + typeset pool=$1 + log_note "Re-filling the filesystem ..." + typeset -i ret=0 + typeset -i i=0 + typeset mntpnt + mntpnt=$(get_prop mountpoint $pool) + typeset file=$mntpnt/file + while [[ -e $file.$i ]]; do + log_must rm -f $file.$i + file_write -o create -f $file.$i -b $BLOCKSZ -c $NUM_WRITES ret=$? (( $ret != 0 )) && break (( i = i + 1 )) done - (($ret != 28 )) && log_note "file_write return value($ret) is unexpected." record_data $TESTPOOL $PRE_RECORD_FILE } diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid1.ksh new file mode 100755 index 000000000..85d420ab0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid1.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A draid pool can withstand at most 1 device failing or missing. +# +# STRATEGY: +# 1. Create N(>3,<6) virtual disk files. +# 2. Create a draid pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damage one of the virtual disk files. +# 6. Verify the data is correct to prove draid can withstand 1 device +# failing. +# + +verify_runnable "global" + +log_assert "Verify draid pool can withstand one device failing." +log_onexit cleanup + +typeset -i cnt=$(random_int_between 3 6) +setup_test_env $TESTPOOL draid $cnt + +# +# Inject data corruption error for draid pool +# +damage_devs $TESTPOOL 1 "label" +log_must is_data_valid $TESTPOOL +log_must clear_errors $TESTPOOL + +# +# Inject bad device error for draid pool +# +damage_devs $TESTPOOL 1 +log_must is_data_valid $TESTPOOL +log_must recover_bad_missing_devs $TESTPOOL 1 + +# +# Inject missing device error for draid pool +# +remove_devs $TESTPOOL 1 +log_must is_data_valid $TESTPOOL + +log_pass "draid pool can withstand one device failing passed." diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid2.ksh new file mode 100755 index 000000000..04f1fdfb1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid2.ksh @@ -0,0 +1,85 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +.
$STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A draid2 pool can withstand 2 devices failing or missing. +# +# STRATEGY: +# 1. Create N(>4,<6) virtual disk files. +# 2. Create a draid2 pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damage at most two of the virtual disk files. +# 6. Verify the data is correct to prove draid2 can withstand 2 devices +# failing. +# + +verify_runnable "global" + +log_assert "Verify draid2 pool can withstand two devices failing." +log_onexit cleanup + +typeset -i cnt=$(random_int_between 4 6) +setup_test_env $TESTPOOL draid2 $cnt + +# +# Inject data corruption errors for draid2 pool +# +for i in 1 2; do + damage_devs $TESTPOOL $i "label" + log_must is_data_valid $TESTPOOL + log_must clear_errors $TESTPOOL +done + +# +# Inject bad device errors for draid2 pool +# +for i in 1 2; do + damage_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +# +# Inject missing device errors for draid2 pool +# +for i in 1 2; do + remove_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +log_pass "draid2 pool can withstand two devices failing passed." diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid3.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid3.ksh new file mode 100755 index 000000000..bddd150d0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid3.ksh @@ -0,0 +1,85 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A draid3 pool can withstand 3 devices failing or missing. +# +# STRATEGY: +# 1. Create N(>5,<6) virtual disk files. +# 2. Create a draid3 pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damage at most three of the virtual disk files. +# 6. Verify the data is correct to prove draid3 can withstand 3 devices +# failing. +# + +verify_runnable "global" + +log_assert "Verify draid3 pool can withstand three devices failing."
+log_onexit cleanup + +typeset -i cnt=$(random_int_between 5 6) +setup_test_env $TESTPOOL draid3 $cnt + +# +# Inject data corruption errors for draid3 pool +# +for i in 1 2 3; do + damage_devs $TESTPOOL $i "label" + log_must is_data_valid $TESTPOOL + log_must clear_errors $TESTPOOL +done + +# +# Inject bad device errors for draid3 pool +# +for i in 1 2 3; do + damage_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +# +# Inject missing device errors for draid3 pool +# +for i in 1 2 3; do + remove_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +log_pass "draid3 pool can withstand three devices failing passed." diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh new file mode 100755 index 000000000..3b7951596 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh @@ -0,0 +1,107 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify resilver to dRAID distributed spares. +# +# STRATEGY: +# 1. For resilvers: +# a. Create a semi-random dRAID pool configuration which can: +# - sustain N failures (1-3), and +# - provide N distributed spares to replace all faulted vdevs +# b. Fill the pool with data +# c. Systematically fault a vdev, then replace it with a spare +# d. Scrub the pool to verify no data was lost +# e.
Verify the contents of files in the pool +# + +log_assert "Verify resilver to dRAID distributed spares" + +log_onexit cleanup + +for replace_mode in "healing" "sequential"; do + + if [[ "$replace_mode" = "sequential" ]]; then + flags="-s" + else + flags="" + fi + + parity=$(random_int_between 1 3) + spares=$(random_int_between $parity 3) + data=$(random_int_between 1 8) + + (( min_children = (data + parity + spares) )) + children=$(random_int_between $min_children 16) + + draid="draid${parity}:${data}d:${children}c:${spares}s" + + setup_test_env $TESTPOOL $draid $children + + i=0 + while [[ $i -lt $spares ]]; do + fault_vdev="$BASEDIR/vdev$i" + spare_vdev="draid${parity}-0-${i}" + + log_must zpool offline -f $TESTPOOL $fault_vdev + log_must check_vdev_state $TESTPOOL $fault_vdev "FAULTED" + log_must zpool replace -w $flags $TESTPOOL \ + $fault_vdev $spare_vdev + log_must check_vdev_state $TESTPOOL spare-$i "DEGRADED" + log_must check_vdev_state $TESTPOOL $spare_vdev "ONLINE" + log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE" + log_must zpool detach $TESTPOOL $fault_vdev + + resilver_cksum=$(cksum_pool $TESTPOOL) + if [[ $resilver_cksum != 0 ]]; then + log_must zpool status -v $TESTPOOL + log_fail "$replace_mode resilver " \ + "cksum errors: $resilver_cksum" + fi + + if [[ "$replace_mode" = "healing" ]]; then + log_must zpool scrub $TESTPOOL + fi + + log_must wait_scrubbed $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + + scrub_cksum=$(cksum_pool $TESTPOOL) + if [[ $scrub_cksum != 0 ]]; then + log_must zpool status -v $TESTPOOL + log_fail "scrub cksum errors: $scrub_cksum" + fi + + (( i += 1 )) + done + + log_must is_data_valid $TESTPOOL + + cleanup +done + +log_pass "Verify resilver to dRAID distributed spares" diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh new file mode 100755 index 000000000..08fdd558f --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify multiple dRAID spares can be used. +# +# STRATEGY: +# 1. Create a pool and fill it with data. +# 2. Engage 3 distributed spares and verify the pool +# 3. Refill the filesystem with new data +# 4. Clear the pool to online previously faulted devices and resilver +# 5.
Verify the pool and its contents +# + +log_assert "Verify multiple dRAID spares" + +log_onexit cleanup + +parity=1 +spares=3 +data=$(random_int_between 1 4) +children=10 +draid="draid${parity}:${data}d:${children}c:${spares}s" + +setup_test_env $TESTPOOL $draid $children + +# Replace vdev7 -> draid1-0-0 +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev7 +log_must zpool replace -w $TESTPOOL $BASEDIR/vdev7 draid1-0-0 + +# Replace vdev8 -> draid1-0-1 +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8 +log_must zpool replace -w $TESTPOOL $BASEDIR/vdev8 draid1-0-1 + +# Replace vdev9 -> draid1-0-2 +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev9 +log_must zpool replace -w $TESTPOOL $BASEDIR/vdev9 draid1-0-2 + +# Verify, refill and verify the pool contents. +verify_pool $TESTPOOL +refill_test_env $TESTPOOL +verify_pool $TESTPOOL + +# Bring everything back online and check for errors. +log_must zpool clear $TESTPOOL +log_must zpool wait -t resilver $TESTPOOL + +log_must wait_hotspare_state $TESTPOOL draid1-0-0 "AVAIL" +log_must wait_hotspare_state $TESTPOOL draid1-0-1 "AVAIL" +log_must wait_hotspare_state $TESTPOOL draid1-0-2 "AVAIL" + +log_must zpool scrub -w $TESTPOOL +log_must check_pool_status $TESTPOOL "scan" "repaired 0B" +log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + +log_must is_data_valid $TESTPOOL + +log_pass "Verify multiple dRAID spares" diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh new file mode 100755 index 000000000..587a1be0a --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh @@ -0,0 +1,197 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify dRAID resilver to traditional and distributed spares for +# a variety of pool configurations and pool states. +# +# STRATEGY: +# 1. For resilvers: +# a. Create a semi-random dRAID pool configuration which can +# sustain 1 failure and has 5 distributed spares. +# b. Fill the pool with data +# c. Systematically fault and replace vdevs in the pool with +# spares to test resilvering in common pool states. +# d. Scrub the pool to verify no data was lost +# e. Verify the contents of files in the pool +# + +log_assert "Verify dRAID resilver" + +function cleanup_tunable +{ + log_must set_tunable32 REBUILD_SCRUB_ENABLED 1 + cleanup +} + +log_onexit cleanup_tunable + +if is_kmemleak; then + log_unsupported "Test case runs slowly when kmemleak is enabled" +fi + +# +# Disable scrubbing after a sequential resilver to verify the resilver +# alone is able to reconstruct the data without the help of a scrub. 
+# +log_must set_tunable32 REBUILD_SCRUB_ENABLED 0 + +for replace_mode in "healing" "sequential"; do + + if [[ "$replace_mode" = "sequential" ]]; then + flags="-s" + else + flags="" + fi + + parity=1 + spares=5 + data=$(random_int_between 1 4) + children=10 + draid="draid${parity}:${data}d:${children}c:${spares}s" + + setup_test_env $TESTPOOL $draid $children + + # + # Perform a variety of replacements to normal and distributed spares + # for a variety of different vdev configurations to exercise different + # resilver code paths. The final configuration is expected to be: + # + # NAME STATE READ WRITE CKSUM + # testpool DEGRADED 0 0 0 + # draid1:1d:10c:5s-0 DEGRADED 0 0 0 + # /var/tmp/basedir.28683/new_vdev0 ONLINE 0 0 0 + # /var/tmp/basedir.28683/new_vdev1 ONLINE 0 0 0 + # spare-2 DEGRADED 0 0 0 + # /var/tmp/basedir.28683/vdev2 FAULTED 0 0 0 + # draid1-0-3 ONLINE 0 0 0 + # spare-3 DEGRADED 0 0 0 + # /var/tmp/basedir.28683/vdev3 FAULTED 0 0 0 + # draid1-0-4 ONLINE 0 0 0 + # /var/tmp/basedir.28683/vdev4 ONLINE 0 0 0 + # /var/tmp/basedir.28683/vdev5 ONLINE 0 0 0 + # /var/tmp/basedir.28683/vdev6 ONLINE 0 0 0 + # draid1-0-0 ONLINE 0 0 0 + # spare-8 DEGRADED 0 0 0 + # /var/tmp/basedir.28683/vdev8 FAULTED 0 0 0 + # draid1-0-1 ONLINE 0 0 0 + # spare-9 ONLINE 0 0 0 + # /var/tmp/basedir.28683/vdev9 ONLINE 0 0 0 + # draid1-0-2 ONLINE 0 0 0 + # spares + # draid1-0-0 INUSE currently in use + # draid1-0-1 INUSE currently in use + # draid1-0-2 INUSE currently in use + # draid1-0-3 INUSE currently in use + # draid1-0-4 INUSE currently in use + # + + # Distributed spare which replaces original online device + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev7 "ONLINE" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev7 draid1-0-0 + log_must zpool detach $TESTPOOL $BASEDIR/vdev7 + log_must check_vdev_state $TESTPOOL draid1-0-0 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-0 "INUSE" + + # Distributed spare in mirror with original device faulted + log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev8 "FAULTED" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev8 draid1-0-1 + log_must check_vdev_state $TESTPOOL spare-8 "DEGRADED" + log_must check_vdev_state $TESTPOOL draid1-0-1 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-1 "INUSE" + + # Distributed spare in mirror with original device still online + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev9 "ONLINE" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev9 draid1-0-2 + log_must check_vdev_state $TESTPOOL spare-9 "ONLINE" + log_must check_vdev_state $TESTPOOL draid1-0-2 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-2 "INUSE" + + # Normal faulted device replacement + new_vdev0="$BASEDIR/new_vdev0" + log_must truncate -s $MINVDEVSIZE $new_vdev0 + log_must zpool offline -f $TESTPOOL $BASEDIR/vdev0 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev0 "FAULTED" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev0 $new_vdev0 + log_must check_vdev_state $TESTPOOL $new_vdev0 "ONLINE" + + # Distributed spare faulted device replacement + log_must zpool offline -f $TESTPOOL $BASEDIR/vdev2 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev2 "FAULTED" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev2 draid1-0-3 + log_must check_vdev_state $TESTPOOL spare-2 "DEGRADED" + log_must check_vdev_state $TESTPOOL draid1-0-3 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-3 "INUSE" + + # Normal online device replacement + 
new_vdev1="$BASEDIR/new_vdev1" + log_must truncate -s $MINVDEVSIZE $new_vdev1 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev1 "ONLINE" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev1 $new_vdev1 + log_must check_vdev_state $TESTPOOL $new_vdev1 "ONLINE" + + # Distributed spare online device replacement (then fault) + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev3 draid1-0-4 + log_must check_vdev_state $TESTPOOL spare-3 "ONLINE" + log_must check_vdev_state $TESTPOOL draid1-0-4 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-4 "INUSE" + log_must zpool offline -f $TESTPOOL $BASEDIR/vdev3 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev3 "FAULTED" + log_must check_vdev_state $TESTPOOL spare-3 "DEGRADED" + + resilver_cksum=$(cksum_pool $TESTPOOL) + if [[ $resilver_cksum != 0 ]]; then + log_must zpool status -v $TESTPOOL + log_fail "$replace_mode resilver cksum errors: $resilver_cksum" + fi + + if [[ "$replace_mode" = "healing" ]]; then + log_must zpool scrub -w $TESTPOOL + else + if [[ $(get_tunable REBUILD_SCRUB_ENABLED) -eq 0 ]]; then + log_must zpool scrub -w $TESTPOOL + else + log_must wait_scrubbed $TESTPOOL + fi + fi + + log_must is_pool_scrubbed $TESTPOOL + + scrub_cksum=$(cksum_pool $TESTPOOL) + if [[ $scrub_cksum != 0 ]]; then + log_must zpool status -v $TESTPOOL + log_fail "scrub cksum errors: $scrub_cksum" + fi + + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + + log_must is_data_valid $TESTPOOL + + cleanup +done + +log_pass "Verify dRAID resilver" diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_003_pos.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_mirror.ksh similarity index 100% rename from tests/zfs-tests/tests/functional/redundancy/redundancy_003_pos.ksh rename to tests/zfs-tests/tests/functional/redundancy/redundancy_mirror.ksh diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_001_pos.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz1.ksh similarity index 93% rename from tests/zfs-tests/tests/functional/redundancy/redundancy_001_pos.ksh rename to tests/zfs-tests/tests/functional/redundancy/redundancy_raidz1.ksh index 90d14f600..a73890e4c 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz1.ksh @@ -48,7 +48,7 @@ verify_runnable "global" -log_assert "Verify raidz pool can withstand one device is failing." +log_assert "Verify raidz pool can withstand one device failing." log_onexit cleanup typeset -i cnt=$(random_int_between 2 5) @@ -74,4 +74,4 @@ log_must recover_bad_missing_devs $TESTPOOL 1 remove_devs $TESTPOOL 1 log_must is_data_valid $TESTPOOL -log_pass "Raidz pool can withstand one devices is failing passed." +log_pass "raidz pool can withstand one device failing passed." diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_002_pos.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz2.ksh similarity index 93% rename from tests/zfs-tests/tests/functional/redundancy/redundancy_002_pos.ksh rename to tests/zfs-tests/tests/functional/redundancy/redundancy_raidz2.ksh index 74bda1999..94b9b8825 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz2.ksh @@ -48,7 +48,7 @@ verify_runnable "global" -log_assert "Verify raidz2 pool can withstand two devices are failing." 
+log_assert "Verify raidz2 pool can withstand two devices failing." log_onexit cleanup typeset -i cnt=$(random_int_between 3 5) @@ -81,4 +81,4 @@ for i in 1 2; do log_must recover_bad_missing_devs $TESTPOOL $i done -log_pass "Raidz2 pool can withstand two devices are failing passed." +log_pass "raidz2 pool can withstand two devices failing passed." diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz3.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz3.ksh new file mode 100755 index 000000000..0a01c4710 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz3.ksh @@ -0,0 +1,84 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A raidz3 pool can withstand 3 failing or missing devices. +# +# STRATEGY: +# 1. Create N (4 or 5) virtual disk files. +# 2. Create raidz3 pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damage at most three of the virtual disk files. +# 6. Verify the data is correct to prove raidz3 can withstand three +# devices failing. +# + +verify_runnable "global" + +log_assert "Verify raidz3 pool can withstand three devices failing." +log_onexit cleanup + +typeset -i cnt=$(random_int_between 4 5) +setup_test_env $TESTPOOL raidz3 $cnt + +# +# Inject data corruption errors for raidz3 pool +# +for i in 1 2 3; do + damage_devs $TESTPOOL $i "label" + log_must is_data_valid $TESTPOOL + log_must clear_errors $TESTPOOL +done + +# +# Inject bad device errors for raidz3 pool +# +for i in 1 2 3; do + damage_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +# +# Inject missing device errors for raidz3 pool +# +for i in 1 2 3; do + remove_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +log_pass "raidz3 pool can withstand three devices failing passed." 
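For readers decoding the dRAID names used throughout these tests: a vdev spec of the form draid<parity>:<data>d:<children>c:<spares>s selects the parity level, the data disks per redundancy group, the total child disks, and the number of distributed spares, and the spares it creates are addressed as draid<parity>-<top-level vdev index>-<spare index>, which is why draid1-0-0 is the first spare engaged above. A minimal sketch, with hypothetical pool and file names:

	# Ten 512M file vdevs backing a single-parity dRAID with 4 data
	# disks per redundancy group and 2 distributed spares.
	truncate -s 512M /var/tmp/dev.{0..9}
	zpool create tank draid1:4d:10c:2s /var/tmp/dev.{0..9}
	# Fault a child, then engage the first distributed spare of
	# top-level vdev 0, as the spare tests above do.
	zpool offline -f tank /var/tmp/dev.7
	zpool replace -w tank /var/tmp/dev.7 draid1-0-0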
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_004_neg.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_stripe.ksh similarity index 100% rename from tests/zfs-tests/tests/functional/redundancy/redundancy_004_neg.ksh rename to tests/zfs-tests/tests/functional/redundancy/redundancy_stripe.ksh diff --git a/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh b/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh index e9427c7ad..998d3eec7 100755 --- a/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh +++ b/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh @@ -38,7 +38,7 @@ # Attaching disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror/raidz) and +# 1. Create multidisk pools (stripe/mirror/raidz/draid) and # start some random I/O # 2. Attach a disk to the pool. # 3. Verify the integrity of the file system and the resilvering. @@ -152,7 +152,7 @@ done log_note "Verify 'zpool attach' fails with non-mirrors." -for type in "" "raidz" "raidz1"; do +for type in "" "raidz" "raidz1" "draid" "draid1"; do for op in "" "-f"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 diff --git a/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh b/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh index 4261d4d67..e99d681bb 100755 --- a/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh +++ b/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh @@ -37,7 +37,7 @@ # Attaching disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror/raidz) and +# 1. Create multidisk pools (stripe/mirror/raidz/draid) and # start some random I/O # 2. Attach a disk to the pool. # 3. Verify the integrity of the file system and the resilvering. @@ -151,7 +151,7 @@ done log_note "Verify 'zpool attach' fails with non-mirrors." -for type in "" "raidz" "raidz1"; do +for type in "" "raidz" "raidz1" "draid"; do for op in "" "-f"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 diff --git a/tests/zfs-tests/tests/functional/replacement/detach.ksh b/tests/zfs-tests/tests/functional/replacement/detach.ksh index aa3ec4f7a..f049c639d 100755 --- a/tests/zfs-tests/tests/functional/replacement/detach.ksh +++ b/tests/zfs-tests/tests/functional/replacement/detach.ksh @@ -37,7 +37,7 @@ # Detaching disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror/raidz) and +# 1. Create multidisk pools (stripe/mirror/raidz/draid) and # start some random I/O # 2. Detach a disk from the pool. # 3. Verify the integrity of the file system and the resilvering. @@ -143,7 +143,7 @@ destroy_pool $TESTPOOL1 log_note "Verify 'zpool detach' fails with non-mirrors." -for type in "" "raidz" "raidz1"; do +for type in "" "raidz" "raidz1" "draid"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh index c919b44b2..26dc6f87b 100755 --- a/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh +++ b/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh @@ -26,7 +26,7 @@ # # DESCRIPTION: # Executing 'zpool replace -s' for raidz vdevs failed. 
Sequential -# resilvers are only allowed for stripe/mirror pools. +# resilvers are only allowed for stripe/mirror/dRAID pools. # # STRATEGY: # 1. Create a raidz pool, verify 'zpool replace -s' fails @@ -67,4 +67,9 @@ log_must zpool create $TESTPOOL1 mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE destroy_pool $TESTPOOL1 +# draid +log_must zpool create $TESTPOOL1 draid ${VDEV_FILES[@]} +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +destroy_pool $TESTPOOL1 + log_pass "Sequential resilver is not allowed for raidz vdevs" diff --git a/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh b/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh index 599735228..b3c7995fd 100755 --- a/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh +++ b/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh @@ -38,7 +38,7 @@ # Replacing disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror) and +# 1. Create multidisk pools (stripe/mirror/draid) and # start some random I/O # 2. Replace a disk in the pool with another disk. # 3. Verify the integrity of the file system and the rebuilding. @@ -137,7 +137,7 @@ done # log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE -for type in "" "mirror"; do +for type in "" "mirror" "draid"; do for op in "" "-f"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 diff --git a/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh b/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh index 253cf65e4..2585397bb 100755 --- a/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh +++ b/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh @@ -37,7 +37,7 @@ # Replacing disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror/raidz) and +# 1. Create multidisk pools (stripe/mirror/raidz/draid) and # start some random I/O # 2. Replace a disk in the pool with another disk. # 3. Verify the integrity of the file system and the resilvering. @@ -134,7 +134,7 @@ done # log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE -for type in "" "raidz" "mirror"; do +for type in "" "raidz" "mirror" "draid"; do for op in "" "-f"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh index d48ee45d0..924b56935 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh @@ -70,14 +70,20 @@ log_must set_tunable64 VDEV_MIN_MS_COUNT 32 typeset VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) typeset VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) -for type in "" "mirror" "raidz2"; do +for type in "" "mirror" "raidz2" "draid"; do if [[ "$type" = "" ]]; then VDEVS="$TRIM_VDEV1" elif [[ "$type" = "mirror" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" - else + elif [[ "$type" = "raidz2" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3" + elif [[ "$type" = "draid" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" + + # The per-vdev utilization is lower due to the capacity + # reserved for the distributed spare. 
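+ # For example (hypothetical numbers, assuming the test suite's + # default MINVDEVSIZE of 256M): each vdev file is 4 * 256M = 1024M, + # so VDEV_MAX_MB drops from floor(1024M * 0.75) = 768M to + # floor(1024M * 0.50) = 512M. 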
+ VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.50 / 1024 / 1024) )) fi log_must truncate -s $((4 * MINVDEVSIZE)) $VDEVS diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh index 6af877241..78fe18fa6 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh @@ -60,7 +60,7 @@ log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "raidz2" "raidz3"; do +for type in "" "mirror" "raidz" "draid"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh index a0dd1c884..13c9b95e0 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh @@ -61,7 +61,7 @@ log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "raidz2" "raidz3"; do +for type in "" "mirror" "raidz" "raidz2" "draid" "draid2"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS diff --git a/tests/zfs-tests/tests/functional/trim/trim_config.ksh b/tests/zfs-tests/tests/functional/trim/trim_config.ksh index 44f187cc6..9a6e19e1c 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_config.ksh @@ -70,14 +70,20 @@ log_must set_tunable64 VDEV_MIN_MS_COUNT 32 typeset VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) typeset VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) -for type in "" "mirror" "raidz2"; do +for type in "" "mirror" "raidz2" "draid"; do if [[ "$type" = "" ]]; then VDEVS="$TRIM_VDEV1" elif [[ "$type" = "mirror" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" - else + elif [[ "$type" = "raidz2" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3" + elif [[ "$type" = "draid" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" + + # The per-vdev utilization is lower due to the capacity + # reserved for the distributed spare. + VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.50 / 1024 / 1024) )) fi log_must truncate -s $((4 * MINVDEVSIZE)) $VDEVS diff --git a/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh index e25b52747..38f226d7f 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh @@ -60,7 +60,7 @@ log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "raidz2" "raidz3"; do +for type in "" "mirror" "raidz" "draid"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS