mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 10:37:35 +03:00
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
This commit is contained in:
@@ -83,8 +83,17 @@ run_gen_bench_impl(const char *impl)
|
||||
/* create suitable raidz_map */
|
||||
ncols = rto_opts.rto_dcols + fn + 1;
|
||||
zio_bench.io_size = 1ULL << ds;
|
||||
rm_bench = vdev_raidz_map_alloc(&zio_bench,
|
||||
BENCH_ASHIFT, ncols, fn+1);
|
||||
|
||||
if (rto_opts.rto_expand) {
|
||||
rm_bench = vdev_raidz_map_alloc_expanded(
|
||||
zio_bench.io_abd,
|
||||
zio_bench.io_size, zio_bench.io_offset,
|
||||
rto_opts.rto_ashift, ncols+1, ncols,
|
||||
fn+1, rto_opts.rto_expand_offset);
|
||||
} else {
|
||||
rm_bench = vdev_raidz_map_alloc(&zio_bench,
|
||||
BENCH_ASHIFT, ncols, fn+1);
|
||||
}
|
||||
|
||||
/* estimate iteration count */
|
||||
iter_cnt = GEN_BENCH_MEMORY;
|
||||
@@ -163,8 +172,16 @@ run_rec_bench_impl(const char *impl)
|
||||
(1ULL << BENCH_ASHIFT))
|
||||
continue;
|
||||
|
||||
rm_bench = vdev_raidz_map_alloc(&zio_bench,
|
||||
BENCH_ASHIFT, ncols, PARITY_PQR);
|
||||
if (rto_opts.rto_expand) {
|
||||
rm_bench = vdev_raidz_map_alloc_expanded(
|
||||
zio_bench.io_abd,
|
||||
zio_bench.io_size, zio_bench.io_offset,
|
||||
BENCH_ASHIFT, ncols+1, ncols,
|
||||
PARITY_PQR, rto_opts.rto_expand_offset);
|
||||
} else {
|
||||
rm_bench = vdev_raidz_map_alloc(&zio_bench,
|
||||
BENCH_ASHIFT, ncols, PARITY_PQR);
|
||||
}
|
||||
|
||||
/* estimate iteration count */
|
||||
iter_cnt = (REC_BENCH_MEMORY);
|
||||
|
||||
+285
-45
@@ -77,16 +77,20 @@ static void print_opts(raidz_test_opts_t *opts, boolean_t force)
|
||||
(void) fprintf(stdout, DBLSEP "Running with options:\n"
|
||||
" (-a) zio ashift : %zu\n"
|
||||
" (-o) zio offset : 1 << %zu\n"
|
||||
" (-e) expanded map : %s\n"
|
||||
" (-r) reflow offset : %llx\n"
|
||||
" (-d) number of raidz data columns : %zu\n"
|
||||
" (-s) size of DATA : 1 << %zu\n"
|
||||
" (-S) sweep parameters : %s \n"
|
||||
" (-v) verbose : %s \n\n",
|
||||
opts->rto_ashift, /* -a */
|
||||
ilog2(opts->rto_offset), /* -o */
|
||||
opts->rto_dcols, /* -d */
|
||||
ilog2(opts->rto_dsize), /* -s */
|
||||
opts->rto_sweep ? "yes" : "no", /* -S */
|
||||
verbose); /* -v */
|
||||
opts->rto_ashift, /* -a */
|
||||
ilog2(opts->rto_offset), /* -o */
|
||||
opts->rto_expand ? "yes" : "no", /* -e */
|
||||
(u_longlong_t)opts->rto_expand_offset, /* -r */
|
||||
opts->rto_dcols, /* -d */
|
||||
ilog2(opts->rto_dsize), /* -s */
|
||||
opts->rto_sweep ? "yes" : "no", /* -S */
|
||||
verbose); /* -v */
|
||||
}
|
||||
}
|
||||
|
||||
@@ -104,6 +108,8 @@ static void usage(boolean_t requested)
|
||||
"\t[-S parameter sweep (default: %s)]\n"
|
||||
"\t[-t timeout for parameter sweep test]\n"
|
||||
"\t[-B benchmark all raidz implementations]\n"
|
||||
"\t[-e use expanded raidz map (default: %s)]\n"
|
||||
"\t[-r expanded raidz map reflow offset (default: %llx)]\n"
|
||||
"\t[-v increase verbosity (default: %zu)]\n"
|
||||
"\t[-h (print help)]\n"
|
||||
"\t[-T test the test, see if failure would be detected]\n"
|
||||
@@ -114,6 +120,8 @@ static void usage(boolean_t requested)
|
||||
o->rto_dcols, /* -d */
|
||||
ilog2(o->rto_dsize), /* -s */
|
||||
rto_opts.rto_sweep ? "yes" : "no", /* -S */
|
||||
rto_opts.rto_expand ? "yes" : "no", /* -e */
|
||||
(u_longlong_t)o->rto_expand_offset, /* -r */
|
||||
o->rto_v); /* -d */
|
||||
|
||||
exit(requested ? 0 : 1);
|
||||
@@ -128,7 +136,7 @@ static void process_options(int argc, char **argv)
|
||||
|
||||
bcopy(&rto_opts_defaults, o, sizeof (*o));
|
||||
|
||||
while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) {
|
||||
while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) {
|
||||
value = 0;
|
||||
|
||||
switch (opt) {
|
||||
@@ -136,6 +144,12 @@ static void process_options(int argc, char **argv)
|
||||
value = strtoull(optarg, NULL, 0);
|
||||
o->rto_ashift = MIN(13, MAX(9, value));
|
||||
break;
|
||||
case 'e':
|
||||
o->rto_expand = 1;
|
||||
break;
|
||||
case 'r':
|
||||
o->rto_expand_offset = strtoull(optarg, NULL, 0);
|
||||
break;
|
||||
case 'o':
|
||||
value = strtoull(optarg, NULL, 0);
|
||||
o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9;
|
||||
@@ -179,25 +193,34 @@ static void process_options(int argc, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd)
|
||||
#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size)
|
||||
#define DATA_COL(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_abd)
|
||||
#define DATA_COL_SIZE(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_size)
|
||||
|
||||
#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd)
|
||||
#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size)
|
||||
#define CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd)
|
||||
#define CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size)
|
||||
|
||||
static int
|
||||
cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
|
||||
{
|
||||
int i, ret = 0;
|
||||
int r, i, ret = 0;
|
||||
|
||||
VERIFY(parity >= 1 && parity <= 3);
|
||||
|
||||
for (i = 0; i < parity; i++) {
|
||||
if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i))
|
||||
!= 0) {
|
||||
ret++;
|
||||
LOG_OPT(D_DEBUG, opts,
|
||||
"\nParity block [%d] different!\n", i);
|
||||
for (r = 0; r < rm->rm_nrows; r++) {
|
||||
raidz_row_t * const rr = rm->rm_row[r];
|
||||
raidz_row_t * const rrg = opts->rm_golden->rm_row[r];
|
||||
for (i = 0; i < parity; i++) {
|
||||
if (CODE_COL_SIZE(rrg, i) == 0) {
|
||||
VERIFY0(CODE_COL_SIZE(rr, i));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (abd_cmp(CODE_COL(rr, i),
|
||||
CODE_COL(rrg, i)) != 0) {
|
||||
ret++;
|
||||
LOG_OPT(D_DEBUG, opts,
|
||||
"\nParity block [%d] different!\n", i);
|
||||
}
|
||||
}
|
||||
}
|
||||
return (ret);
|
||||
@@ -206,16 +229,26 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
|
||||
static int
|
||||
cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
|
||||
{
|
||||
int i, ret = 0;
|
||||
int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden);
|
||||
int r, i, dcols, ret = 0;
|
||||
|
||||
for (i = 0; i < dcols; i++) {
|
||||
if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i))
|
||||
!= 0) {
|
||||
ret++;
|
||||
for (r = 0; r < rm->rm_nrows; r++) {
|
||||
raidz_row_t *rr = rm->rm_row[r];
|
||||
raidz_row_t *rrg = opts->rm_golden->rm_row[r];
|
||||
dcols = opts->rm_golden->rm_row[0]->rr_cols -
|
||||
raidz_parity(opts->rm_golden);
|
||||
for (i = 0; i < dcols; i++) {
|
||||
if (DATA_COL_SIZE(rrg, i) == 0) {
|
||||
VERIFY0(DATA_COL_SIZE(rr, i));
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG_OPT(D_DEBUG, opts,
|
||||
"\nData block [%d] different!\n", i);
|
||||
if (abd_cmp(DATA_COL(rrg, i),
|
||||
DATA_COL(rr, i)) != 0) {
|
||||
ret++;
|
||||
|
||||
LOG_OPT(D_DEBUG, opts,
|
||||
"\nData block [%d] different!\n", i);
|
||||
}
|
||||
}
|
||||
}
|
||||
return (ret);
|
||||
@@ -236,12 +269,13 @@ init_rand(void *data, size_t size, void *private)
|
||||
static void
|
||||
corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
|
||||
{
|
||||
int i;
|
||||
raidz_col_t *col;
|
||||
|
||||
for (i = 0; i < cnt; i++) {
|
||||
col = &rm->rm_col[tgts[i]];
|
||||
abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL);
|
||||
for (int r = 0; r < rm->rm_nrows; r++) {
|
||||
raidz_row_t *rr = rm->rm_row[r];
|
||||
for (int i = 0; i < cnt; i++) {
|
||||
raidz_col_t *col = &rr->rr_col[tgts[i]];
|
||||
abd_iterate_func(col->rc_abd, 0, col->rc_size,
|
||||
init_rand, NULL);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -288,10 +322,22 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
|
||||
|
||||
VERIFY0(vdev_raidz_impl_set("original"));
|
||||
|
||||
opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
|
||||
opts->rto_ashift, total_ncols, parity);
|
||||
rm_test = vdev_raidz_map_alloc(zio_test,
|
||||
opts->rto_ashift, total_ncols, parity);
|
||||
if (opts->rto_expand) {
|
||||
opts->rm_golden =
|
||||
vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
|
||||
opts->zio_golden->io_size, opts->zio_golden->io_offset,
|
||||
opts->rto_ashift, total_ncols+1, total_ncols,
|
||||
parity, opts->rto_expand_offset);
|
||||
rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
|
||||
zio_test->io_size, zio_test->io_offset,
|
||||
opts->rto_ashift, total_ncols+1, total_ncols,
|
||||
parity, opts->rto_expand_offset);
|
||||
} else {
|
||||
opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
|
||||
opts->rto_ashift, total_ncols, parity);
|
||||
rm_test = vdev_raidz_map_alloc(zio_test,
|
||||
opts->rto_ashift, total_ncols, parity);
|
||||
}
|
||||
|
||||
VERIFY(opts->zio_golden);
|
||||
VERIFY(opts->rm_golden);
|
||||
@@ -312,6 +358,188 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
|
||||
return (err);
|
||||
}
|
||||
|
||||
/*
|
||||
* If reflow is not in progress, reflow_offset should be UINT64_MAX.
|
||||
* For each row, if the row is entirely before reflow_offset, it will
|
||||
* come from the new location. Otherwise this row will come from the
|
||||
* old location. Therefore, rows that straddle the reflow_offset will
|
||||
* come from the old location.
|
||||
*
|
||||
* NOTE: Until raidz expansion is implemented this function is only
|
||||
* needed by raidz_test.c to test the multi-row raidz_map_t functionality.
|
||||
*/
|
||||
raidz_map_t *
|
||||
vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
|
||||
uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
|
||||
uint64_t nparity, uint64_t reflow_offset)
|
||||
{
|
||||
/* The zio's size in units of the vdev's minimum sector size. */
|
||||
uint64_t s = size >> ashift;
|
||||
uint64_t q, r, bc, devidx, asize = 0, tot;
|
||||
|
||||
/*
|
||||
* "Quotient": The number of data sectors for this stripe on all but
|
||||
* the "big column" child vdevs that also contain "remainder" data.
|
||||
* AKA "full rows"
|
||||
*/
|
||||
q = s / (logical_cols - nparity);
|
||||
|
||||
/*
|
||||
* "Remainder": The number of partial stripe data sectors in this I/O.
|
||||
* This will add a sector to some, but not all, child vdevs.
|
||||
*/
|
||||
r = s - q * (logical_cols - nparity);
|
||||
|
||||
/* The number of "big columns" - those which contain remainder data. */
|
||||
bc = (r == 0 ? 0 : r + nparity);
|
||||
|
||||
/*
|
||||
* The total number of data and parity sectors associated with
|
||||
* this I/O.
|
||||
*/
|
||||
tot = s + nparity * (q + (r == 0 ? 0 : 1));
|
||||
|
||||
/* How many rows contain data (not skip) */
|
||||
uint64_t rows = howmany(tot, logical_cols);
|
||||
int cols = MIN(tot, logical_cols);
|
||||
|
||||
raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
|
||||
KM_SLEEP);
|
||||
rm->rm_nrows = rows;
|
||||
|
||||
for (uint64_t row = 0; row < rows; row++) {
|
||||
raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
|
||||
rr_col[cols]), KM_SLEEP);
|
||||
rm->rm_row[row] = rr;
|
||||
|
||||
/* The starting RAIDZ (parent) vdev sector of the row. */
|
||||
uint64_t b = (offset >> ashift) + row * logical_cols;
|
||||
|
||||
/*
|
||||
* If we are in the middle of a reflow, and any part of this
|
||||
* row has not been copied, then use the old location of
|
||||
* this row.
|
||||
*/
|
||||
int row_phys_cols = physical_cols;
|
||||
if (b + (logical_cols - nparity) > reflow_offset >> ashift)
|
||||
row_phys_cols--;
|
||||
|
||||
/* starting child of this row */
|
||||
uint64_t child_id = b % row_phys_cols;
|
||||
/* The starting byte offset on each child vdev. */
|
||||
uint64_t child_offset = (b / row_phys_cols) << ashift;
|
||||
|
||||
/*
|
||||
* We set cols to the entire width of the block, even
|
||||
* if this row is shorter. This is needed because parity
|
||||
* generation (for Q and R) needs to know the entire width,
|
||||
* because it treats the short row as though it was
|
||||
* full-width (and the "phantom" sectors were zero-filled).
|
||||
*
|
||||
* Another approach to this would be to set cols shorter
|
||||
* (to just the number of columns that we might do i/o to)
|
||||
* and have another mechanism to tell the parity generation
|
||||
* about the "entire width". Reconstruction (at least
|
||||
* vdev_raidz_reconstruct_general()) would also need to
|
||||
* know about the "entire width".
|
||||
*/
|
||||
rr->rr_cols = cols;
|
||||
rr->rr_bigcols = bc;
|
||||
rr->rr_missingdata = 0;
|
||||
rr->rr_missingparity = 0;
|
||||
rr->rr_firstdatacol = nparity;
|
||||
rr->rr_abd_copy = NULL;
|
||||
rr->rr_abd_empty = NULL;
|
||||
rr->rr_nempty = 0;
|
||||
|
||||
for (int c = 0; c < rr->rr_cols; c++, child_id++) {
|
||||
if (child_id >= row_phys_cols) {
|
||||
child_id -= row_phys_cols;
|
||||
child_offset += 1ULL << ashift;
|
||||
}
|
||||
rr->rr_col[c].rc_devidx = child_id;
|
||||
rr->rr_col[c].rc_offset = child_offset;
|
||||
rr->rr_col[c].rc_gdata = NULL;
|
||||
rr->rr_col[c].rc_orig_data = NULL;
|
||||
rr->rr_col[c].rc_error = 0;
|
||||
rr->rr_col[c].rc_tried = 0;
|
||||
rr->rr_col[c].rc_skipped = 0;
|
||||
rr->rr_col[c].rc_need_orig_restore = B_FALSE;
|
||||
|
||||
uint64_t dc = c - rr->rr_firstdatacol;
|
||||
if (c < rr->rr_firstdatacol) {
|
||||
rr->rr_col[c].rc_size = 1ULL << ashift;
|
||||
rr->rr_col[c].rc_abd =
|
||||
abd_alloc_linear(rr->rr_col[c].rc_size,
|
||||
B_TRUE);
|
||||
} else if (row == rows - 1 && bc != 0 && c >= bc) {
|
||||
/*
|
||||
* Past the end, this is for parity generation.
|
||||
*/
|
||||
rr->rr_col[c].rc_size = 0;
|
||||
rr->rr_col[c].rc_abd = NULL;
|
||||
} else {
|
||||
/*
|
||||
* "data column" (col excluding parity)
|
||||
* Add an ASCII art diagram here
|
||||
*/
|
||||
uint64_t off;
|
||||
|
||||
if (c < bc || r == 0) {
|
||||
off = dc * rows + row;
|
||||
} else {
|
||||
off = r * rows +
|
||||
(dc - r) * (rows - 1) + row;
|
||||
}
|
||||
rr->rr_col[c].rc_size = 1ULL << ashift;
|
||||
rr->rr_col[c].rc_abd =
|
||||
abd_get_offset(abd, off << ashift);
|
||||
}
|
||||
|
||||
asize += rr->rr_col[c].rc_size;
|
||||
}
|
||||
/*
|
||||
* If all data stored spans all columns, there's a danger that
|
||||
* parity will always be on the same device and, since parity
|
||||
* isn't read during normal operation, that that device's I/O
|
||||
* bandwidth won't be used effectively. We therefore switch
|
||||
* the parity every 1MB.
|
||||
*
|
||||
* ...at least that was, ostensibly, the theory. As a practical
|
||||
* matter unless we juggle the parity between all devices
|
||||
* evenly, we won't see any benefit. Further, occasional writes
|
||||
* that aren't a multiple of the LCM of the number of children
|
||||
* and the minimum stripe width are sufficient to avoid pessimal
|
||||
* behavior. Unfortunately, this decision created an implicit
|
||||
* on-disk format requirement that we need to support for all
|
||||
* eternity, but only for single-parity RAID-Z.
|
||||
*
|
||||
* If we intend to skip a sector in the zeroth column for
|
||||
* padding we must make sure to note this swap. We will never
|
||||
* intend to skip the first column since at least one data and
|
||||
* one parity column must appear in each row.
|
||||
*/
|
||||
if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
|
||||
(offset & (1ULL << 20))) {
|
||||
ASSERT(rr->rr_cols >= 2);
|
||||
ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
|
||||
devidx = rr->rr_col[0].rc_devidx;
|
||||
uint64_t o = rr->rr_col[0].rc_offset;
|
||||
rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
|
||||
rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
|
||||
rr->rr_col[1].rc_devidx = devidx;
|
||||
rr->rr_col[1].rc_offset = o;
|
||||
}
|
||||
|
||||
}
|
||||
ASSERT3U(asize, ==, tot << ashift);
|
||||
|
||||
/* init RAIDZ parity ops */
|
||||
rm->rm_ops = vdev_raidz_math_get_ops();
|
||||
|
||||
return (rm);
|
||||
}
|
||||
|
||||
static raidz_map_t *
|
||||
init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
|
||||
{
|
||||
@@ -330,8 +558,15 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
|
||||
(*zio)->io_abd = raidz_alloc(alloc_dsize);
|
||||
init_zio_abd(*zio);
|
||||
|
||||
rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
|
||||
total_ncols, parity);
|
||||
if (opts->rto_expand) {
|
||||
rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
|
||||
(*zio)->io_size, (*zio)->io_offset,
|
||||
opts->rto_ashift, total_ncols+1, total_ncols,
|
||||
parity, opts->rto_expand_offset);
|
||||
} else {
|
||||
rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
|
||||
total_ncols, parity);
|
||||
}
|
||||
VERIFY(rm);
|
||||
|
||||
/* Make sure code columns are destroyed */
|
||||
@@ -420,7 +655,7 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
|
||||
if (fn < RAIDZ_REC_PQ) {
|
||||
/* can reconstruct 1 failed data disk */
|
||||
for (x0 = 0; x0 < opts->rto_dcols; x0++) {
|
||||
if (x0 >= rm->rm_cols - raidz_parity(rm))
|
||||
if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
|
||||
continue;
|
||||
|
||||
/* Check if should stop */
|
||||
@@ -445,10 +680,11 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
|
||||
} else if (fn < RAIDZ_REC_PQR) {
|
||||
/* can reconstruct 2 failed data disk */
|
||||
for (x0 = 0; x0 < opts->rto_dcols; x0++) {
|
||||
if (x0 >= rm->rm_cols - raidz_parity(rm))
|
||||
if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
|
||||
continue;
|
||||
for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
|
||||
if (x1 >= rm->rm_cols - raidz_parity(rm))
|
||||
if (x1 >= rm->rm_row[0]->rr_cols -
|
||||
raidz_parity(rm))
|
||||
continue;
|
||||
|
||||
/* Check if should stop */
|
||||
@@ -475,14 +711,15 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
|
||||
} else {
|
||||
/* can reconstruct 3 failed data disk */
|
||||
for (x0 = 0; x0 < opts->rto_dcols; x0++) {
|
||||
if (x0 >= rm->rm_cols - raidz_parity(rm))
|
||||
if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
|
||||
continue;
|
||||
for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
|
||||
if (x1 >= rm->rm_cols - raidz_parity(rm))
|
||||
if (x1 >= rm->rm_row[0]->rr_cols -
|
||||
raidz_parity(rm))
|
||||
continue;
|
||||
for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) {
|
||||
if (x2 >=
|
||||
rm->rm_cols - raidz_parity(rm))
|
||||
if (x2 >= rm->rm_row[0]->rr_cols -
|
||||
raidz_parity(rm))
|
||||
continue;
|
||||
|
||||
/* Check if should stop */
|
||||
@@ -700,6 +937,8 @@ run_sweep(void)
|
||||
opts->rto_dcols = dcols_v[d];
|
||||
opts->rto_offset = (1 << ashift_v[a]) * rand();
|
||||
opts->rto_dsize = size_v[s];
|
||||
opts->rto_expand = rto_opts.rto_expand;
|
||||
opts->rto_expand_offset = rto_opts.rto_expand_offset;
|
||||
opts->rto_v = 0; /* be quiet */
|
||||
|
||||
VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts,
|
||||
@@ -732,6 +971,7 @@ exit:
|
||||
return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
|
||||
@@ -44,13 +44,15 @@ static const char *raidz_impl_names[] = {
|
||||
|
||||
typedef struct raidz_test_opts {
|
||||
size_t rto_ashift;
|
||||
size_t rto_offset;
|
||||
uint64_t rto_offset;
|
||||
size_t rto_dcols;
|
||||
size_t rto_dsize;
|
||||
size_t rto_v;
|
||||
size_t rto_sweep;
|
||||
size_t rto_sweep_timeout;
|
||||
size_t rto_benchmark;
|
||||
size_t rto_expand;
|
||||
uint64_t rto_expand_offset;
|
||||
size_t rto_sanity;
|
||||
size_t rto_gdb;
|
||||
|
||||
@@ -69,6 +71,8 @@ static const raidz_test_opts_t rto_opts_defaults = {
|
||||
.rto_v = 0,
|
||||
.rto_sweep = 0,
|
||||
.rto_benchmark = 0,
|
||||
.rto_expand = 0,
|
||||
.rto_expand_offset = -1ULL,
|
||||
.rto_sanity = 0,
|
||||
.rto_gdb = 0,
|
||||
.rto_should_stop = B_FALSE
|
||||
@@ -113,4 +117,7 @@ void init_zio_abd(zio_t *zio);
|
||||
|
||||
void run_raidz_benchmark(void);
|
||||
|
||||
struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t,
|
||||
uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
|
||||
|
||||
#endif /* RAIDZ_TEST_H */
|
||||
|
||||
+7
-3
@@ -1642,7 +1642,11 @@ dump_metaslab(metaslab_t *msp)
|
||||
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
|
||||
}
|
||||
|
||||
ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
|
||||
if (vd->vdev_ops == &vdev_draid_ops)
|
||||
ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
|
||||
else
|
||||
ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);
|
||||
|
||||
dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
|
||||
|
||||
if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
|
||||
@@ -5203,8 +5207,6 @@ zdb_blkptr_done(zio_t *zio)
|
||||
zdb_cb_t *zcb = zio->io_private;
|
||||
zbookmark_phys_t *zb = &zio->io_bookmark;
|
||||
|
||||
abd_free(zio->io_abd);
|
||||
|
||||
mutex_enter(&spa->spa_scrub_lock);
|
||||
spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
|
||||
cv_broadcast(&spa->spa_scrub_io_cv);
|
||||
@@ -5231,6 +5233,8 @@ zdb_blkptr_done(zio_t *zio)
|
||||
blkbuf);
|
||||
}
|
||||
mutex_exit(&spa->spa_scrub_lock);
|
||||
|
||||
abd_free(zio->io_abd);
|
||||
}
|
||||
|
||||
static int
|
||||
|
||||
@@ -435,7 +435,15 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
|
||||
return;
|
||||
}
|
||||
|
||||
ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE);
|
||||
/*
|
||||
* Prefer sequential resilvering when supported (mirrors and dRAID),
|
||||
* otherwise fall back to a traditional healing resilver.
|
||||
*/
|
||||
ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE);
|
||||
if (ret != 0) {
|
||||
ret = zpool_vdev_attach(zhp, fullpath, path, nvroot,
|
||||
B_TRUE, B_FALSE);
|
||||
}
|
||||
|
||||
zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)",
|
||||
fullpath, path, (ret == 0) ? "no errors" :
|
||||
|
||||
@@ -219,12 +219,18 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
|
||||
* replace it.
|
||||
*/
|
||||
for (s = 0; s < nspares; s++) {
|
||||
char *spare_name;
|
||||
boolean_t rebuild = B_FALSE;
|
||||
char *spare_name, *type;
|
||||
|
||||
if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
|
||||
&spare_name) != 0)
|
||||
continue;
|
||||
|
||||
/* prefer sequential resilvering for distributed spares */
|
||||
if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE,
|
||||
&type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
|
||||
rebuild = B_TRUE;
|
||||
|
||||
/* if set, add the "ashift" pool property to the spare nvlist */
|
||||
if (source != ZPROP_SRC_DEFAULT)
|
||||
(void) nvlist_add_uint64(spares[s],
|
||||
@@ -237,7 +243,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
|
||||
dev_name, basename(spare_name));
|
||||
|
||||
if (zpool_vdev_attach(zhp, dev_name, spare_name,
|
||||
replacement, B_TRUE, B_FALSE) == 0) {
|
||||
replacement, B_TRUE, rebuild) == 0) {
|
||||
free(dev_name);
|
||||
nvlist_free(replacement);
|
||||
return (B_TRUE);
|
||||
@@ -499,6 +505,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
|
||||
* Attempt to substitute a hot spare.
|
||||
*/
|
||||
(void) replace_with_spare(hdl, zhp, vdev);
|
||||
|
||||
zpool_close(zhp);
|
||||
}
|
||||
|
||||
|
||||
+118
-12
@@ -892,6 +892,107 @@ usage:
|
||||
return (-1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a default volblocksize for the pool which always uses more than
|
||||
* half of the data sectors. This primarily applies to dRAID which always
|
||||
* writes full stripe widths.
|
||||
*/
|
||||
static uint64_t
|
||||
default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
|
||||
{
|
||||
uint64_t volblocksize, asize = SPA_MINBLOCKSIZE;
|
||||
nvlist_t *tree, **vdevs;
|
||||
uint_t nvdevs;
|
||||
|
||||
nvlist_t *config = zpool_get_config(zhp, NULL);
|
||||
|
||||
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
|
||||
nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
|
||||
&vdevs, &nvdevs) != 0) {
|
||||
return (ZVOL_DEFAULT_BLOCKSIZE);
|
||||
}
|
||||
|
||||
for (int i = 0; i < nvdevs; i++) {
|
||||
nvlist_t *nv = vdevs[i];
|
||||
uint64_t ashift, ndata, nparity;
|
||||
|
||||
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)
|
||||
continue;
|
||||
|
||||
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA,
|
||||
&ndata) == 0) {
|
||||
/* dRAID minimum allocation width */
|
||||
asize = MAX(asize, ndata * (1ULL << ashift));
|
||||
} else if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
|
||||
&nparity) == 0) {
|
||||
/* raidz minimum allocation width */
|
||||
if (nparity == 1)
|
||||
asize = MAX(asize, 2 * (1ULL << ashift));
|
||||
else
|
||||
asize = MAX(asize, 4 * (1ULL << ashift));
|
||||
} else {
|
||||
/* mirror or (non-redundant) leaf vdev */
|
||||
asize = MAX(asize, 1ULL << ashift);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate the target volblocksize such that more than half
|
||||
* of the asize is used. The following table is for 4k sectors.
|
||||
*
|
||||
* n asize blksz used | n asize blksz used
|
||||
* -------------------------+---------------------------------
|
||||
* 1 4,096 8,192 100% | 9 36,864 32,768 88%
|
||||
* 2 8,192 8,192 100% | 10 40,960 32,768 80%
|
||||
* 3 12,288 8,192 66% | 11 45,056 32,768 72%
|
||||
* 4 16,384 16,384 100% | 12 49,152 32,768 66%
|
||||
* 5 20,480 16,384 80% | 13 53,248 32,768 61%
|
||||
* 6 24,576 16,384 66% | 14 57,344 32,768 57%
|
||||
* 7 28,672 16,384 57% | 15 61,440 32,768 53%
|
||||
* 8 32,768 32,768 100% | 16 65,536 65,536 100%
|
||||
*
|
||||
* This is primarily a concern for dRAID which always allocates
|
||||
* a full stripe width. For dRAID the default stripe width is
|
||||
* n=8 in which case the volblocksize is set to 32k. Ignoring
|
||||
* compression there are no unused sectors. This same reasoning
|
||||
* applies to raidz[2,3] so target 4 sectors to minimize waste.
|
||||
*/
|
||||
uint64_t tgt_volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
|
||||
while (tgt_volblocksize * 2 <= asize)
|
||||
tgt_volblocksize *= 2;
|
||||
|
||||
const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE);
|
||||
if (nvlist_lookup_uint64(props, prop, &volblocksize) == 0) {
|
||||
|
||||
/* Issue a warning when a non-optimal size is requested. */
|
||||
if (volblocksize < ZVOL_DEFAULT_BLOCKSIZE) {
|
||||
(void) fprintf(stderr, gettext("Warning: "
|
||||
"volblocksize (%llu) is less than the default "
|
||||
"minimum block size (%llu).\nTo reduce wasted "
|
||||
"space a volblocksize of %llu is recommended.\n"),
|
||||
(u_longlong_t)volblocksize,
|
||||
(u_longlong_t)ZVOL_DEFAULT_BLOCKSIZE,
|
||||
(u_longlong_t)tgt_volblocksize);
|
||||
} else if (volblocksize < tgt_volblocksize) {
|
||||
(void) fprintf(stderr, gettext("Warning: "
|
||||
"volblocksize (%llu) is much less than the "
|
||||
"minimum allocation\nunit (%llu), which wastes "
|
||||
"at least %llu%% of space. To reduce wasted "
|
||||
"space,\nuse a larger volblocksize (%llu is "
|
||||
"recommended), fewer dRAID data disks\n"
|
||||
"per group, or smaller sector size (ashift).\n"),
|
||||
(u_longlong_t)volblocksize, (u_longlong_t)asize,
|
||||
(u_longlong_t)((100 * (asize - volblocksize)) /
|
||||
asize), (u_longlong_t)tgt_volblocksize);
|
||||
}
|
||||
} else {
|
||||
volblocksize = tgt_volblocksize;
|
||||
fnvlist_add_uint64(props, prop, volblocksize);
|
||||
}
|
||||
|
||||
return (volblocksize);
|
||||
}
|
||||
|
||||
/*
|
||||
* zfs create [-Pnpv] [-o prop=value] ... fs
|
||||
* zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size
|
||||
@@ -932,6 +1033,7 @@ zfs_do_create(int argc, char **argv)
|
||||
int ret = 1;
|
||||
nvlist_t *props;
|
||||
uint64_t intval;
|
||||
char *strval;
|
||||
|
||||
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
|
||||
nomem();
|
||||
@@ -1018,7 +1120,7 @@ zfs_do_create(int argc, char **argv)
|
||||
goto badusage;
|
||||
}
|
||||
|
||||
if (dryrun || (type == ZFS_TYPE_VOLUME && !noreserve)) {
|
||||
if (dryrun || type == ZFS_TYPE_VOLUME) {
|
||||
char msg[ZFS_MAX_DATASET_NAME_LEN * 2];
|
||||
char *p;
|
||||
|
||||
@@ -1040,18 +1142,24 @@ zfs_do_create(int argc, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* if volsize is not a multiple of volblocksize, round it up to the
|
||||
* nearest multiple of the volblocksize
|
||||
*/
|
||||
if (type == ZFS_TYPE_VOLUME) {
|
||||
uint64_t volblocksize;
|
||||
const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE);
|
||||
uint64_t volblocksize = default_volblocksize(zpool_handle,
|
||||
real_props);
|
||||
|
||||
if (nvlist_lookup_uint64(props,
|
||||
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
|
||||
&volblocksize) != 0)
|
||||
volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
|
||||
if (volblocksize != ZVOL_DEFAULT_BLOCKSIZE &&
|
||||
nvlist_lookup_string(props, prop, &strval) != 0) {
|
||||
if (asprintf(&strval, "%llu",
|
||||
(u_longlong_t)volblocksize) == -1)
|
||||
nomem();
|
||||
nvlist_add_string(props, prop, strval);
|
||||
free(strval);
|
||||
}
|
||||
|
||||
/*
|
||||
* If volsize is not a multiple of volblocksize, round it
|
||||
* up to the nearest multiple of the volblocksize.
|
||||
*/
|
||||
if (volsize % volblocksize) {
|
||||
volsize = P2ROUNDUP_TYPED(volsize, volblocksize,
|
||||
uint64_t);
|
||||
@@ -1064,11 +1172,9 @@ zfs_do_create(int argc, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (type == ZFS_TYPE_VOLUME && !noreserve) {
|
||||
uint64_t spa_version;
|
||||
zfs_prop_t resv_prop;
|
||||
char *strval;
|
||||
|
||||
spa_version = zpool_get_prop_int(zpool_handle,
|
||||
ZPOOL_PROP_VERSION, NULL);
|
||||
|
||||
@@ -2294,7 +2294,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
||||
}
|
||||
}
|
||||
|
||||
/* Display vdev initialization and trim status for leaves */
|
||||
/* Display vdev initialization and trim status for leaves. */
|
||||
if (children == 0) {
|
||||
print_status_initialize(vs, cb->cb_print_vdev_init);
|
||||
print_status_trim(vs, cb->cb_print_vdev_trim);
|
||||
@@ -9849,7 +9849,8 @@ vdev_any_spare_replacing(nvlist_t *nv)
|
||||
(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type);
|
||||
|
||||
if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 ||
|
||||
strcmp(vdev_type, VDEV_TYPE_SPARE) == 0) {
|
||||
strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 ||
|
||||
strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) {
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
|
||||
+345
-56
@@ -86,9 +86,6 @@
|
||||
boolean_t error_seen;
|
||||
boolean_t is_force;
|
||||
|
||||
|
||||
|
||||
|
||||
/*PRINTFLIKE1*/
|
||||
void
|
||||
vdev_error(const char *fmt, ...)
|
||||
@@ -222,6 +219,9 @@ is_spare(nvlist_t *config, const char *path)
|
||||
uint_t i, nspares;
|
||||
boolean_t inuse;
|
||||
|
||||
if (zpool_is_draid_spare(path))
|
||||
return (B_TRUE);
|
||||
|
||||
if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
|
||||
return (B_FALSE);
|
||||
|
||||
@@ -267,9 +267,10 @@ is_spare(nvlist_t *config, const char *path)
|
||||
* /dev/xxx Complete disk path
|
||||
* /xxx Full path to file
|
||||
* xxx Shorthand for <zfs_vdev_paths>/xxx
|
||||
* draid* Virtual dRAID spare
|
||||
*/
|
||||
static nvlist_t *
|
||||
make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
|
||||
make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary)
|
||||
{
|
||||
char path[MAXPATHLEN];
|
||||
struct stat64 statbuf;
|
||||
@@ -309,6 +310,17 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
|
||||
|
||||
/* After whole disk check restore original passed path */
|
||||
strlcpy(path, arg, sizeof (path));
|
||||
} else if (zpool_is_draid_spare(arg)) {
|
||||
if (!is_primary) {
|
||||
(void) fprintf(stderr,
|
||||
gettext("cannot open '%s': dRAID spares can only "
|
||||
"be used to replace primary vdevs\n"), arg);
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
wholedisk = B_TRUE;
|
||||
strlcpy(path, arg, sizeof (path));
|
||||
type = VDEV_TYPE_DRAID_SPARE;
|
||||
} else {
|
||||
err = is_shorthand_path(arg, path, sizeof (path),
|
||||
&statbuf, &wholedisk);
|
||||
@@ -337,17 +349,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine whether this is a device or a file.
|
||||
*/
|
||||
if (wholedisk || S_ISBLK(statbuf.st_mode)) {
|
||||
type = VDEV_TYPE_DISK;
|
||||
} else if (S_ISREG(statbuf.st_mode)) {
|
||||
type = VDEV_TYPE_FILE;
|
||||
} else {
|
||||
(void) fprintf(stderr, gettext("cannot use '%s': must be a "
|
||||
"block device or regular file\n"), path);
|
||||
return (NULL);
|
||||
if (type == NULL) {
|
||||
/*
|
||||
* Determine whether this is a device or a file.
|
||||
*/
|
||||
if (wholedisk || S_ISBLK(statbuf.st_mode)) {
|
||||
type = VDEV_TYPE_DISK;
|
||||
} else if (S_ISREG(statbuf.st_mode)) {
|
||||
type = VDEV_TYPE_FILE;
|
||||
} else {
|
||||
fprintf(stderr, gettext("cannot use '%s': must "
|
||||
"be a block device or regular file\n"), path);
|
||||
return (NULL);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -358,10 +372,7 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
|
||||
verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
|
||||
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
|
||||
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
|
||||
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
|
||||
if (is_log)
|
||||
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
VDEV_ALLOC_BIAS_LOG) == 0);
|
||||
|
||||
if (strcmp(type, VDEV_TYPE_DISK) == 0)
|
||||
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
|
||||
(uint64_t)wholedisk) == 0);
|
||||
@@ -432,11 +443,16 @@ typedef struct replication_level {
|
||||
|
||||
#define ZPOOL_FUZZ (16 * 1024 * 1024)
|
||||
|
||||
/*
|
||||
* N.B. For the purposes of comparing replication levels dRAID can be
|
||||
* considered functionally equivalent to raidz.
|
||||
*/
|
||||
static boolean_t
|
||||
is_raidz_mirror(replication_level_t *a, replication_level_t *b,
|
||||
replication_level_t **raidz, replication_level_t **mirror)
|
||||
{
|
||||
if (strcmp(a->zprl_type, "raidz") == 0 &&
|
||||
if ((strcmp(a->zprl_type, "raidz") == 0 ||
|
||||
strcmp(a->zprl_type, "draid") == 0) &&
|
||||
strcmp(b->zprl_type, "mirror") == 0) {
|
||||
*raidz = a;
|
||||
*mirror = b;
|
||||
@@ -445,6 +461,22 @@ is_raidz_mirror(replication_level_t *a, replication_level_t *b,
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Comparison for determining if dRAID and raidz where passed in either order.
|
||||
*/
|
||||
static boolean_t
|
||||
is_raidz_draid(replication_level_t *a, replication_level_t *b)
|
||||
{
|
||||
if ((strcmp(a->zprl_type, "raidz") == 0 ||
|
||||
strcmp(a->zprl_type, "draid") == 0) &&
|
||||
(strcmp(b->zprl_type, "raidz") == 0 ||
|
||||
strcmp(b->zprl_type, "draid") == 0)) {
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a list of toplevel vdevs, return the current replication level. If
|
||||
* the config is inconsistent, then NULL is returned. If 'fatal' is set, then
|
||||
@@ -511,7 +543,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
|
||||
rep.zprl_type = type;
|
||||
rep.zprl_children = 0;
|
||||
|
||||
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
|
||||
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
|
||||
strcmp(type, VDEV_TYPE_DRAID) == 0) {
|
||||
verify(nvlist_lookup_uint64(nv,
|
||||
ZPOOL_CONFIG_NPARITY,
|
||||
&rep.zprl_parity) == 0);
|
||||
@@ -677,6 +710,29 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
|
||||
else
|
||||
return (NULL);
|
||||
}
|
||||
} else if (is_raidz_draid(&lastrep, &rep)) {
|
||||
/*
|
||||
* Accept raidz and draid when they can
|
||||
* handle the same number of disk failures.
|
||||
*/
|
||||
if (lastrep.zprl_parity != rep.zprl_parity) {
|
||||
if (ret != NULL)
|
||||
free(ret);
|
||||
ret = NULL;
|
||||
if (fatal)
|
||||
vdev_error(gettext(
|
||||
"mismatched replication "
|
||||
"level: %s and %s vdevs "
|
||||
"with different "
|
||||
"redundancy, %llu vs. "
|
||||
"%llu are present\n"),
|
||||
lastrep.zprl_type,
|
||||
rep.zprl_type,
|
||||
lastrep.zprl_parity,
|
||||
rep.zprl_parity);
|
||||
else
|
||||
return (NULL);
|
||||
}
|
||||
} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
|
||||
0) {
|
||||
if (ret != NULL)
|
||||
@@ -1103,31 +1159,87 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
|
||||
return (anyinuse);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the parity level extracted from a raidz or draid type.
|
||||
* If the parity cannot be determined zero is returned.
|
||||
*/
|
||||
static int
|
||||
get_parity(const char *type)
|
||||
{
|
||||
long parity = 0;
|
||||
const char *p;
|
||||
|
||||
if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) {
|
||||
p = type + strlen(VDEV_TYPE_RAIDZ);
|
||||
|
||||
if (*p == '\0') {
|
||||
/* when unspecified default to single parity */
|
||||
return (1);
|
||||
} else if (*p == '0') {
|
||||
/* no zero prefixes allowed */
|
||||
return (0);
|
||||
} else {
|
||||
/* 0-3, no suffixes allowed */
|
||||
char *end;
|
||||
errno = 0;
|
||||
parity = strtol(p, &end, 10);
|
||||
if (errno != 0 || *end != '\0' ||
|
||||
parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) {
|
||||
return (0);
|
||||
}
|
||||
}
|
||||
} else if (strncmp(type, VDEV_TYPE_DRAID,
|
||||
strlen(VDEV_TYPE_DRAID)) == 0) {
|
||||
p = type + strlen(VDEV_TYPE_DRAID);
|
||||
|
||||
if (*p == '\0' || *p == ':') {
|
||||
/* when unspecified default to single parity */
|
||||
return (1);
|
||||
} else if (*p == '0') {
|
||||
/* no zero prefixes allowed */
|
||||
return (0);
|
||||
} else {
|
||||
/* 0-3, allowed suffixes: '\0' or ':' */
|
||||
char *end;
|
||||
errno = 0;
|
||||
parity = strtol(p, &end, 10);
|
||||
if (errno != 0 ||
|
||||
parity < 1 || parity > VDEV_DRAID_MAXPARITY ||
|
||||
(*end != '\0' && *end != ':')) {
|
||||
return (0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ((int)parity);
|
||||
}
|
||||
|
||||
/*
|
||||
* Assign the minimum and maximum number of devices allowed for
|
||||
* the specified type. On error NULL is returned, otherwise the
|
||||
* type prefix is returned (raidz, mirror, etc).
|
||||
*/
|
||||
static const char *
|
||||
is_grouping(const char *type, int *mindev, int *maxdev)
|
||||
{
|
||||
if (strncmp(type, "raidz", 5) == 0) {
|
||||
const char *p = type + 5;
|
||||
char *end;
|
||||
long nparity;
|
||||
|
||||
if (*p == '\0') {
|
||||
nparity = 1;
|
||||
} else if (*p == '0') {
|
||||
return (NULL); /* no zero prefixes allowed */
|
||||
} else {
|
||||
errno = 0;
|
||||
nparity = strtol(p, &end, 10);
|
||||
if (errno != 0 || nparity < 1 || nparity >= 255 ||
|
||||
*end != '\0')
|
||||
return (NULL);
|
||||
}
|
||||
int nparity;
|
||||
|
||||
if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
|
||||
strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) {
|
||||
nparity = get_parity(type);
|
||||
if (nparity == 0)
|
||||
return (NULL);
|
||||
if (mindev != NULL)
|
||||
*mindev = nparity + 1;
|
||||
if (maxdev != NULL)
|
||||
*maxdev = 255;
|
||||
return (VDEV_TYPE_RAIDZ);
|
||||
|
||||
if (strncmp(type, VDEV_TYPE_RAIDZ,
|
||||
strlen(VDEV_TYPE_RAIDZ)) == 0) {
|
||||
return (VDEV_TYPE_RAIDZ);
|
||||
} else {
|
||||
return (VDEV_TYPE_DRAID);
|
||||
}
|
||||
}
|
||||
|
||||
if (maxdev != NULL)
|
||||
@@ -1167,6 +1279,163 @@ is_grouping(const char *type, int *mindev, int *maxdev)
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Extract the configuration parameters encoded in the dRAID type and
|
||||
* use them to generate a dRAID configuration. The expected format is:
|
||||
*
|
||||
* draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>]
|
||||
*
|
||||
* The intent is to be able to generate a good configuration when no
|
||||
* additional information is provided. The only mandatory component
|
||||
* of the 'type' is the 'draid' prefix. If a value is not provided
|
||||
* then reasonable defaults are used. The optional components may
|
||||
* appear in any order but the d/s/c suffix is required.
|
||||
*
|
||||
* Valid inputs:
|
||||
* - data: number of data devices per group (1-255)
|
||||
* - parity: number of parity blocks per group (1-3)
|
||||
* - spares: number of distributed spare (0-100)
|
||||
* - children: total number of devices (1-255)
|
||||
*
|
||||
* Examples:
|
||||
* - zpool create tank draid <devices...>
|
||||
* - zpool create tank draid2:8d:51c:2s <devices...>
|
||||
*/
|
||||
static int
|
||||
draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
|
||||
{
|
||||
uint64_t nparity = 1;
|
||||
uint64_t nspares = 0;
|
||||
uint64_t ndata = UINT64_MAX;
|
||||
uint64_t ngroups = 1;
|
||||
long value;
|
||||
|
||||
if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0)
|
||||
return (EINVAL);
|
||||
|
||||
nparity = (uint64_t)get_parity(type);
|
||||
if (nparity == 0)
|
||||
return (EINVAL);
|
||||
|
||||
char *p = (char *)type;
|
||||
while ((p = strchr(p, ':')) != NULL) {
|
||||
char *end;
|
||||
|
||||
p = p + 1;
|
||||
errno = 0;
|
||||
|
||||
if (!isdigit(p[0])) {
|
||||
(void) fprintf(stderr, gettext("invalid dRAID "
|
||||
"syntax; expected [:<number><c|d|s>] not '%s'\n"),
|
||||
type);
|
||||
return (EINVAL);
|
||||
}
|
||||
|
||||
/* Expected non-zero value with c/d/s suffix */
|
||||
value = strtol(p, &end, 10);
|
||||
char suffix = tolower(*end);
|
||||
if (errno != 0 ||
|
||||
(suffix != 'c' && suffix != 'd' && suffix != 's')) {
|
||||
(void) fprintf(stderr, gettext("invalid dRAID "
|
||||
"syntax; expected [:<number><c|d|s>] not '%s'\n"),
|
||||
type);
|
||||
return (EINVAL);
|
||||
}
|
||||
|
||||
if (suffix == 'c') {
|
||||
if ((uint64_t)value != children) {
|
||||
fprintf(stderr,
|
||||
gettext("invalid number of dRAID children; "
|
||||
"%llu required but %llu provided\n"),
|
||||
(u_longlong_t)value,
|
||||
(u_longlong_t)children);
|
||||
return (EINVAL);
|
||||
}
|
||||
} else if (suffix == 'd') {
|
||||
ndata = (uint64_t)value;
|
||||
} else if (suffix == 's') {
|
||||
nspares = (uint64_t)value;
|
||||
} else {
|
||||
verify(0); /* Unreachable */
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* When a specific number of data disks is not provided limit a
|
||||
* redundancy group to 8 data disks. This value was selected to
|
||||
* provide a reasonable tradeoff between capacity and performance.
|
||||
*/
|
||||
if (ndata == UINT64_MAX) {
|
||||
if (children > nspares + nparity) {
|
||||
ndata = MIN(children - nspares - nparity, 8);
|
||||
} else {
|
||||
fprintf(stderr, gettext("request number of "
|
||||
"distributed spares %llu and parity level %llu\n"
|
||||
"leaves no disks available for data\n"),
|
||||
(u_longlong_t)nspares, (u_longlong_t)nparity);
|
||||
return (EINVAL);
|
||||
}
|
||||
}
|
||||
|
||||
/* Verify the maximum allowed group size is never exceeded. */
|
||||
if (ndata == 0 || (ndata + nparity > children - nspares)) {
|
||||
fprintf(stderr, gettext("requested number of dRAID data "
|
||||
"disks per group %llu is too high,\nat most %llu disks "
|
||||
"are available for data\n"), (u_longlong_t)ndata,
|
||||
(u_longlong_t)(children - nspares - nparity));
|
||||
return (EINVAL);
|
||||
}
|
||||
|
||||
if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
|
||||
fprintf(stderr,
|
||||
gettext("invalid dRAID parity level %llu; must be "
|
||||
"between 1 and %d\n"), (u_longlong_t)nparity,
|
||||
VDEV_DRAID_MAXPARITY);
|
||||
return (EINVAL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify the requested number of spares can be satisfied.
|
||||
* An arbitrary limit of 100 distributed spares is applied.
|
||||
*/
|
||||
if (nspares > 100 || nspares > (children - (ndata + nparity))) {
|
||||
fprintf(stderr,
|
||||
gettext("invalid number of dRAID spares %llu; additional "
|
||||
"disks would be required\n"), (u_longlong_t)nspares);
|
||||
return (EINVAL);
|
||||
}
|
||||
|
||||
/* Verify the requested number children is sufficient. */
|
||||
if (children < (ndata + nparity + nspares)) {
|
||||
fprintf(stderr, gettext("%llu disks were provided, but at "
|
||||
"least %llu disks are required for this config\n"),
|
||||
(u_longlong_t)children,
|
||||
(u_longlong_t)(ndata + nparity + nspares));
|
||||
}
|
||||
|
||||
if (children > VDEV_DRAID_MAX_CHILDREN) {
|
||||
fprintf(stderr, gettext("%llu disks were provided, but "
|
||||
"dRAID only supports up to %u disks"),
|
||||
(u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN);
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate the minimum number of groups required to fill a slice.
|
||||
* This is the LCM of the stripe width (ndata + nparity) and the
|
||||
* number of data drives (children - nspares).
|
||||
*/
|
||||
while (ngroups * (ndata + nparity) % (children - nspares) != 0)
|
||||
ngroups++;
|
||||
|
||||
/* Store the basic dRAID configuration. */
|
||||
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity);
|
||||
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata);
|
||||
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
|
||||
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Construct a syntactically valid vdev specification,
|
||||
* and ensure that all devices and files exist and can be opened.
|
||||
@@ -1178,8 +1447,8 @@ construct_spec(nvlist_t *props, int argc, char **argv)
|
||||
{
|
||||
nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
|
||||
int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
|
||||
const char *type;
|
||||
uint64_t is_log, is_special, is_dedup;
|
||||
const char *type, *fulltype;
|
||||
boolean_t is_log, is_special, is_dedup, is_spare;
|
||||
boolean_t seen_logs;
|
||||
|
||||
top = NULL;
|
||||
@@ -1189,18 +1458,20 @@ construct_spec(nvlist_t *props, int argc, char **argv)
|
||||
nspares = 0;
|
||||
nlogs = 0;
|
||||
nl2cache = 0;
|
||||
is_log = is_special = is_dedup = B_FALSE;
|
||||
is_log = is_special = is_dedup = is_spare = B_FALSE;
|
||||
seen_logs = B_FALSE;
|
||||
nvroot = NULL;
|
||||
|
||||
while (argc > 0) {
|
||||
fulltype = argv[0];
|
||||
nv = NULL;
|
||||
|
||||
/*
|
||||
* If it's a mirror or raidz, the subsequent arguments are
|
||||
* its leaves -- until we encounter the next mirror or raidz.
|
||||
* If it's a mirror, raidz, or draid the subsequent arguments
|
||||
* are its leaves -- until we encounter the next mirror,
|
||||
* raidz or draid.
|
||||
*/
|
||||
if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
|
||||
if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) {
|
||||
nvlist_t **child = NULL;
|
||||
int c, children = 0;
|
||||
|
||||
@@ -1212,6 +1483,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
|
||||
"specified only once\n"));
|
||||
goto spec_out;
|
||||
}
|
||||
is_spare = B_TRUE;
|
||||
is_log = is_special = is_dedup = B_FALSE;
|
||||
}
|
||||
|
||||
@@ -1225,8 +1497,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
|
||||
}
|
||||
seen_logs = B_TRUE;
|
||||
is_log = B_TRUE;
|
||||
is_special = B_FALSE;
|
||||
is_dedup = B_FALSE;
|
||||
is_special = is_dedup = is_spare = B_FALSE;
|
||||
argc--;
|
||||
argv++;
|
||||
/*
|
||||
@@ -1238,8 +1509,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
|
||||
|
||||
if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
|
||||
is_special = B_TRUE;
|
||||
is_log = B_FALSE;
|
||||
is_dedup = B_FALSE;
|
||||
is_log = is_dedup = is_spare = B_FALSE;
|
||||
argc--;
|
||||
argv++;
|
||||
continue;
|
||||
@@ -1247,8 +1517,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
|
||||
|
||||
if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
|
||||
is_dedup = B_TRUE;
|
||||
is_log = B_FALSE;
|
||||
is_special = B_FALSE;
|
||||
is_log = is_special = is_spare = B_FALSE;
|
||||
argc--;
|
||||
argv++;
|
||||
continue;
|
||||
@@ -1262,7 +1531,8 @@ construct_spec(nvlist_t *props, int argc, char **argv)
|
||||
"specified only once\n"));
|
||||
goto spec_out;
|
||||
}
|
||||
is_log = is_special = is_dedup = B_FALSE;
|
||||
is_log = is_special = B_FALSE;
|
||||
is_dedup = is_spare = B_FALSE;
|
||||
}
|
||||
|
||||
if (is_log || is_special || is_dedup) {
|
||||
@@ -1280,13 +1550,15 @@ construct_spec(nvlist_t *props, int argc, char **argv)
|
||||
for (c = 1; c < argc; c++) {
|
||||
if (is_grouping(argv[c], NULL, NULL) != NULL)
|
||||
break;
|
||||
|
||||
children++;
|
||||
child = realloc(child,
|
||||
children * sizeof (nvlist_t *));
|
||||
if (child == NULL)
|
||||
zpool_no_memory();
|
||||
if ((nv = make_leaf_vdev(props, argv[c],
|
||||
B_FALSE)) == NULL) {
|
||||
!(is_log || is_special || is_dedup ||
|
||||
is_spare))) == NULL) {
|
||||
for (c = 0; c < children - 1; c++)
|
||||
nvlist_free(child[c]);
|
||||
free(child);
|
||||
@@ -1335,10 +1607,11 @@ construct_spec(nvlist_t *props, int argc, char **argv)
|
||||
type) == 0);
|
||||
verify(nvlist_add_uint64(nv,
|
||||
ZPOOL_CONFIG_IS_LOG, is_log) == 0);
|
||||
if (is_log)
|
||||
if (is_log) {
|
||||
verify(nvlist_add_string(nv,
|
||||
ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
VDEV_ALLOC_BIAS_LOG) == 0);
|
||||
}
|
||||
if (is_special) {
|
||||
verify(nvlist_add_string(nv,
|
||||
ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
@@ -1354,6 +1627,15 @@ construct_spec(nvlist_t *props, int argc, char **argv)
|
||||
ZPOOL_CONFIG_NPARITY,
|
||||
mindev - 1) == 0);
|
||||
}
|
||||
if (strcmp(type, VDEV_TYPE_DRAID) == 0) {
|
||||
if (draid_config_by_type(nv,
|
||||
fulltype, children) != 0) {
|
||||
for (c = 0; c < children; c++)
|
||||
nvlist_free(child[c]);
|
||||
free(child);
|
||||
goto spec_out;
|
||||
}
|
||||
}
|
||||
verify(nvlist_add_nvlist_array(nv,
|
||||
ZPOOL_CONFIG_CHILDREN, child,
|
||||
children) == 0);
|
||||
@@ -1367,12 +1649,19 @@ construct_spec(nvlist_t *props, int argc, char **argv)
|
||||
* We have a device. Pass off to make_leaf_vdev() to
|
||||
* construct the appropriate nvlist describing the vdev.
|
||||
*/
|
||||
if ((nv = make_leaf_vdev(props, argv[0],
|
||||
is_log)) == NULL)
|
||||
if ((nv = make_leaf_vdev(props, argv[0], !(is_log ||
|
||||
is_special || is_dedup || is_spare))) == NULL)
|
||||
goto spec_out;
|
||||
|
||||
if (is_log)
|
||||
verify(nvlist_add_uint64(nv,
|
||||
ZPOOL_CONFIG_IS_LOG, is_log) == 0);
|
||||
if (is_log) {
|
||||
verify(nvlist_add_string(nv,
|
||||
ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
VDEV_ALLOC_BIAS_LOG) == 0);
|
||||
nlogs++;
|
||||
}
|
||||
|
||||
if (is_special) {
|
||||
verify(nvlist_add_string(nv,
|
||||
ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
|
||||
+210
-71
@@ -104,6 +104,7 @@
|
||||
#include <sys/zio.h>
|
||||
#include <sys/zil.h>
|
||||
#include <sys/zil_impl.h>
|
||||
#include <sys/vdev_draid.h>
|
||||
#include <sys/vdev_impl.h>
|
||||
#include <sys/vdev_file.h>
|
||||
#include <sys/vdev_initialize.h>
|
||||
@@ -167,8 +168,11 @@ typedef struct ztest_shared_opts {
|
||||
size_t zo_vdev_size;
|
||||
int zo_ashift;
|
||||
int zo_mirrors;
|
||||
int zo_raidz;
|
||||
int zo_raidz_parity;
|
||||
int zo_raid_children;
|
||||
int zo_raid_parity;
|
||||
char zo_raid_type[8];
|
||||
int zo_draid_data;
|
||||
int zo_draid_spares;
|
||||
int zo_datasets;
|
||||
int zo_threads;
|
||||
uint64_t zo_passtime;
|
||||
@@ -191,9 +195,12 @@ static const ztest_shared_opts_t ztest_opts_defaults = {
|
||||
.zo_vdevs = 5,
|
||||
.zo_ashift = SPA_MINBLOCKSHIFT,
|
||||
.zo_mirrors = 2,
|
||||
.zo_raidz = 4,
|
||||
.zo_raidz_parity = 1,
|
||||
.zo_raid_children = 4,
|
||||
.zo_raid_parity = 1,
|
||||
.zo_raid_type = VDEV_TYPE_RAIDZ,
|
||||
.zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */
|
||||
.zo_draid_data = 4, /* data drives */
|
||||
.zo_draid_spares = 1, /* distributed spares */
|
||||
.zo_datasets = 7,
|
||||
.zo_threads = 23,
|
||||
.zo_passtime = 60, /* 60 seconds */
|
||||
@@ -232,7 +239,7 @@ static ztest_shared_ds_t *ztest_shared_ds;
|
||||
|
||||
#define BT_MAGIC 0x123456789abcdefULL
|
||||
#define MAXFAULTS(zs) \
|
||||
(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)
|
||||
(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)
|
||||
|
||||
enum ztest_io_type {
|
||||
ZTEST_IO_WRITE_TAG,
|
||||
@@ -689,8 +696,11 @@ usage(boolean_t requested)
|
||||
"\t[-s size_of_each_vdev (default: %s)]\n"
|
||||
"\t[-a alignment_shift (default: %d)] use 0 for random\n"
|
||||
"\t[-m mirror_copies (default: %d)]\n"
|
||||
"\t[-r raidz_disks (default: %d)]\n"
|
||||
"\t[-R raidz_parity (default: %d)]\n"
|
||||
"\t[-r raidz_disks / draid_disks (default: %d)]\n"
|
||||
"\t[-R raid_parity (default: %d)]\n"
|
||||
"\t[-K raid_kind (default: random)] raidz|draid|random\n"
|
||||
"\t[-D draid_data (default: %d)] in config\n"
|
||||
"\t[-S draid_spares (default: %d)]\n"
|
||||
"\t[-d datasets (default: %d)]\n"
|
||||
"\t[-t threads (default: %d)]\n"
|
||||
"\t[-g gang_block_threshold (default: %s)]\n"
|
||||
@@ -716,8 +726,10 @@ usage(boolean_t requested)
|
||||
nice_vdev_size, /* -s */
|
||||
zo->zo_ashift, /* -a */
|
||||
zo->zo_mirrors, /* -m */
|
||||
zo->zo_raidz, /* -r */
|
||||
zo->zo_raidz_parity, /* -R */
|
||||
zo->zo_raid_children, /* -r */
|
||||
zo->zo_raid_parity, /* -R */
|
||||
zo->zo_draid_data, /* -D */
|
||||
zo->zo_draid_spares, /* -S */
|
||||
zo->zo_datasets, /* -d */
|
||||
zo->zo_threads, /* -t */
|
||||
nice_force_ganging, /* -g */
|
||||
@@ -731,6 +743,21 @@ usage(boolean_t requested)
|
||||
exit(requested ? 0 : 1);
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
ztest_random(uint64_t range)
|
||||
{
|
||||
uint64_t r;
|
||||
|
||||
ASSERT3S(ztest_fd_rand, >=, 0);
|
||||
|
||||
if (range == 0)
|
||||
return (0);
|
||||
|
||||
if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
|
||||
fatal(1, "short read from /dev/urandom");
|
||||
|
||||
return (r % range);
|
||||
}
|
||||
|
||||
static void
|
||||
ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
|
||||
@@ -780,11 +807,12 @@ process_options(int argc, char **argv)
|
||||
int opt;
|
||||
uint64_t value;
|
||||
char altdir[MAXNAMELEN] = { 0 };
|
||||
char raid_kind[8] = { "random" };
|
||||
|
||||
bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
|
||||
|
||||
while ((opt = getopt(argc, argv,
|
||||
"v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) {
|
||||
"v:s:a:m:r:R:K:D:S:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) {
|
||||
value = 0;
|
||||
switch (opt) {
|
||||
case 'v':
|
||||
@@ -793,6 +821,8 @@ process_options(int argc, char **argv)
|
||||
case 'm':
|
||||
case 'r':
|
||||
case 'R':
|
||||
case 'D':
|
||||
case 'S':
|
||||
case 'd':
|
||||
case 't':
|
||||
case 'g':
|
||||
@@ -817,10 +847,19 @@ process_options(int argc, char **argv)
|
||||
zo->zo_mirrors = value;
|
||||
break;
|
||||
case 'r':
|
||||
zo->zo_raidz = MAX(1, value);
|
||||
zo->zo_raid_children = MAX(1, value);
|
||||
break;
|
||||
case 'R':
|
||||
zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
|
||||
zo->zo_raid_parity = MIN(MAX(value, 1), 3);
|
||||
break;
|
||||
case 'K':
|
||||
(void) strlcpy(raid_kind, optarg, sizeof (raid_kind));
|
||||
break;
|
||||
case 'D':
|
||||
zo->zo_draid_data = MAX(1, value);
|
||||
break;
|
||||
case 'S':
|
||||
zo->zo_draid_spares = MAX(1, value);
|
||||
break;
|
||||
case 'd':
|
||||
zo->zo_datasets = MAX(1, value);
|
||||
@@ -895,7 +934,54 @@ process_options(int argc, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);
|
||||
/* When raid choice is 'random' add a draid pool 50% of the time */
|
||||
if (strcmp(raid_kind, "random") == 0) {
|
||||
(void) strlcpy(raid_kind, (ztest_random(2) == 0) ?
|
||||
"draid" : "raidz", sizeof (raid_kind));
|
||||
|
||||
if (ztest_opts.zo_verbose >= 3)
|
||||
(void) printf("choosing RAID type '%s'\n", raid_kind);
|
||||
}
|
||||
|
||||
if (strcmp(raid_kind, "draid") == 0) {
|
||||
uint64_t min_devsize;
|
||||
|
||||
/* With fewer disks use 256M, otherwise 128M is OK */
|
||||
min_devsize = (ztest_opts.zo_raid_children < 16) ?
|
||||
(256ULL << 20) : (128ULL << 20);
|
||||
|
||||
/* No top-level mirrors with dRAID for now */
|
||||
zo->zo_mirrors = 0;
|
||||
|
||||
/* Use more appropriate defaults for dRAID */
|
||||
if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs)
|
||||
zo->zo_vdevs = 1;
|
||||
if (zo->zo_raid_children ==
|
||||
ztest_opts_defaults.zo_raid_children)
|
||||
zo->zo_raid_children = 16;
|
||||
if (zo->zo_ashift < 12)
|
||||
zo->zo_ashift = 12;
|
||||
if (zo->zo_vdev_size < min_devsize)
|
||||
zo->zo_vdev_size = min_devsize;
|
||||
|
||||
if (zo->zo_draid_data + zo->zo_raid_parity >
|
||||
zo->zo_raid_children - zo->zo_draid_spares) {
|
||||
(void) fprintf(stderr, "error: too few draid "
|
||||
"children (%d) for stripe width (%d)\n",
|
||||
zo->zo_raid_children,
|
||||
zo->zo_draid_data + zo->zo_raid_parity);
|
||||
usage(B_FALSE);
|
||||
}
|
||||
|
||||
(void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID,
|
||||
sizeof (zo->zo_raid_type));
|
||||
|
||||
} else /* using raidz */ {
|
||||
ASSERT0(strcmp(raid_kind, "raidz"));
|
||||
|
||||
zo->zo_raid_parity = MIN(zo->zo_raid_parity,
|
||||
zo->zo_raid_children - 1);
|
||||
}
|
||||
|
||||
zo->zo_vdevtime =
|
||||
(zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
|
||||
@@ -966,22 +1052,6 @@ ztest_kill(ztest_shared_t *zs)
|
||||
(void) kill(getpid(), SIGKILL);
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
ztest_random(uint64_t range)
|
||||
{
|
||||
uint64_t r;
|
||||
|
||||
ASSERT3S(ztest_fd_rand, >=, 0);
|
||||
|
||||
if (range == 0)
|
||||
return (0);
|
||||
|
||||
if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
|
||||
fatal(1, "short read from /dev/urandom");
|
||||
|
||||
return (r % range);
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
static void
|
||||
ztest_record_enospc(const char *s)
|
||||
@@ -997,12 +1067,27 @@ ztest_get_ashift(void)
|
||||
return (ztest_opts.zo_ashift);
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
ztest_is_draid_spare(const char *name)
|
||||
{
|
||||
uint64_t spare_id = 0, parity = 0, vdev_id = 0;
|
||||
|
||||
if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu",
|
||||
(u_longlong_t *)&parity, (u_longlong_t *)&vdev_id,
|
||||
(u_longlong_t *)&spare_id) == 3) {
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
static nvlist_t *
|
||||
make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
|
||||
{
|
||||
char *pathbuf;
|
||||
uint64_t vdev;
|
||||
nvlist_t *file;
|
||||
boolean_t draid_spare = B_FALSE;
|
||||
|
||||
pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
|
||||
|
||||
@@ -1024,9 +1109,11 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
|
||||
ztest_dev_template, ztest_opts.zo_dir,
|
||||
pool == NULL ? ztest_opts.zo_pool : pool, vdev);
|
||||
}
|
||||
} else {
|
||||
draid_spare = ztest_is_draid_spare(path);
|
||||
}
|
||||
|
||||
if (size != 0) {
|
||||
if (size != 0 && !draid_spare) {
|
||||
int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
|
||||
if (fd == -1)
|
||||
fatal(1, "can't open %s", path);
|
||||
@@ -1035,20 +1122,21 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
|
||||
(void) close(fd);
|
||||
}
|
||||
|
||||
VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
|
||||
VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
|
||||
VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
|
||||
VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
|
||||
VERIFY0(nvlist_alloc(&file, NV_UNIQUE_NAME, 0));
|
||||
VERIFY0(nvlist_add_string(file, ZPOOL_CONFIG_TYPE,
|
||||
draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE));
|
||||
VERIFY0(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path));
|
||||
VERIFY0(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift));
|
||||
umem_free(pathbuf, MAXPATHLEN);
|
||||
|
||||
return (file);
|
||||
}
|
||||
|
||||
static nvlist_t *
|
||||
make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
|
||||
make_vdev_raid(char *path, char *aux, char *pool, size_t size,
|
||||
uint64_t ashift, int r)
|
||||
{
|
||||
nvlist_t *raidz, **child;
|
||||
nvlist_t *raid, **child;
|
||||
int c;
|
||||
|
||||
if (r < 2)
|
||||
@@ -1058,20 +1146,41 @@ make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
|
||||
for (c = 0; c < r; c++)
|
||||
child[c] = make_vdev_file(path, aux, pool, size, ashift);
|
||||
|
||||
VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
|
||||
VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
|
||||
VDEV_TYPE_RAIDZ) == 0);
|
||||
VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
|
||||
ztest_opts.zo_raidz_parity) == 0);
|
||||
VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
|
||||
child, r) == 0);
|
||||
VERIFY0(nvlist_alloc(&raid, NV_UNIQUE_NAME, 0));
|
||||
VERIFY0(nvlist_add_string(raid, ZPOOL_CONFIG_TYPE,
|
||||
ztest_opts.zo_raid_type));
|
||||
VERIFY0(nvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY,
|
||||
ztest_opts.zo_raid_parity));
|
||||
VERIFY0(nvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN,
|
||||
child, r));
|
||||
|
||||
if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) {
|
||||
uint64_t ndata = ztest_opts.zo_draid_data;
|
||||
uint64_t nparity = ztest_opts.zo_raid_parity;
|
||||
uint64_t nspares = ztest_opts.zo_draid_spares;
|
||||
uint64_t children = ztest_opts.zo_raid_children;
|
||||
uint64_t ngroups = 1;
|
||||
|
||||
/*
|
||||
* Calculate the minimum number of groups required to fill a
|
||||
* slice. This is the LCM of the stripe width (data + parity)
|
||||
* and the number of data drives (children - spares).
|
||||
*/
|
||||
while (ngroups * (ndata + nparity) % (children - nspares) != 0)
|
||||
ngroups++;
|
||||
|
||||
/* Store the basic dRAID configuration. */
|
||||
fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata);
|
||||
fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
|
||||
fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
|
||||
}
|
||||
|
||||
for (c = 0; c < r; c++)
|
||||
nvlist_free(child[c]);
|
||||
|
||||
umem_free(child, r * sizeof (nvlist_t *));
|
||||
|
||||
return (raidz);
|
||||
return (raid);
|
||||
}
|
||||
|
||||
static nvlist_t *
|
||||
@@ -1082,12 +1191,12 @@ make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
|
||||
int c;
|
||||
|
||||
if (m < 1)
|
||||
return (make_vdev_raidz(path, aux, pool, size, ashift, r));
|
||||
return (make_vdev_raid(path, aux, pool, size, ashift, r));
|
||||
|
||||
child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);
|
||||
|
||||
for (c = 0; c < m; c++)
|
||||
child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r);
|
||||
child[c] = make_vdev_raid(path, aux, pool, size, ashift, r);
|
||||
|
||||
VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
|
||||
VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
|
||||
@@ -2809,6 +2918,10 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
|
||||
if (ztest_opts.zo_mmp_test)
|
||||
return;
|
||||
|
||||
/* dRAID added after feature flags, skip upgrade test. */
|
||||
if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0)
|
||||
return;
|
||||
|
||||
mutex_enter(&ztest_vdev_lock);
|
||||
name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
|
||||
|
||||
@@ -2818,13 +2931,13 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
|
||||
(void) spa_destroy(name);
|
||||
|
||||
nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
|
||||
NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
|
||||
NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1);
|
||||
|
||||
/*
|
||||
* If we're configuring a RAIDZ device then make sure that the
|
||||
* initial version is capable of supporting that feature.
|
||||
*/
|
||||
switch (ztest_opts.zo_raidz_parity) {
|
||||
switch (ztest_opts.zo_raid_parity) {
|
||||
case 0:
|
||||
case 1:
|
||||
initial_version = SPA_VERSION_INITIAL;
|
||||
@@ -2970,7 +3083,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
|
||||
return;
|
||||
|
||||
mutex_enter(&ztest_vdev_lock);
|
||||
leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
|
||||
leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) *
|
||||
ztest_opts.zo_raid_children;
|
||||
|
||||
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
|
||||
|
||||
@@ -3024,7 +3138,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
|
||||
*/
|
||||
nvroot = make_vdev_root(NULL, NULL, NULL,
|
||||
ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ?
|
||||
"log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
|
||||
"log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors,
|
||||
1);
|
||||
|
||||
error = spa_vdev_add(spa, nvroot);
|
||||
nvlist_free(nvroot);
|
||||
@@ -3078,14 +3193,15 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
|
||||
return;
|
||||
}
|
||||
|
||||
leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
|
||||
leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) *
|
||||
ztest_opts.zo_raid_children;
|
||||
|
||||
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
|
||||
ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;
|
||||
spa_config_exit(spa, SCL_VDEV, FTAG);
|
||||
|
||||
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
|
||||
class, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
|
||||
class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1);
|
||||
|
||||
error = spa_vdev_add(spa, nvroot);
|
||||
nvlist_free(nvroot);
|
||||
@@ -3134,7 +3250,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
|
||||
char *aux;
|
||||
char *path;
|
||||
uint64_t guid = 0;
|
||||
int error;
|
||||
int error, ignore_err = 0;
|
||||
|
||||
if (ztest_opts.zo_mmp_test)
|
||||
return;
|
||||
@@ -3157,7 +3273,13 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
|
||||
/*
|
||||
* Pick a random device to remove.
|
||||
*/
|
||||
guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
|
||||
vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)];
|
||||
|
||||
/* dRAID spares cannot be removed; try anyway and expect ENOTSUP */
|
||||
if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL)
|
||||
ignore_err = ENOTSUP;
|
||||
|
||||
guid = svd->vdev_guid;
|
||||
} else {
|
||||
/*
|
||||
* Find an unused device we can add.
|
||||
@@ -3214,7 +3336,9 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
|
||||
case ZFS_ERR_DISCARDING_CHECKPOINT:
|
||||
break;
|
||||
default:
|
||||
fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
|
||||
if (error != ignore_err)
|
||||
fatal(0, "spa_vdev_remove(%llu) = %d", guid,
|
||||
error);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3243,7 +3367,7 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id)
|
||||
mutex_enter(&ztest_vdev_lock);
|
||||
|
||||
/* ensure we have a usable config; mirrors of raidz aren't supported */
|
||||
if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
|
||||
if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) {
|
||||
mutex_exit(&ztest_vdev_lock);
|
||||
return;
|
||||
}
|
||||
@@ -3343,6 +3467,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
|
||||
int replacing;
|
||||
int oldvd_has_siblings = B_FALSE;
|
||||
int newvd_is_spare = B_FALSE;
|
||||
int newvd_is_dspare = B_FALSE;
|
||||
int oldvd_is_log;
|
||||
int error, expected_error;
|
||||
|
||||
@@ -3353,7 +3478,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
|
||||
newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
|
||||
|
||||
mutex_enter(&ztest_vdev_lock);
|
||||
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
|
||||
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children;
|
||||
|
||||
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
|
||||
|
||||
@@ -3393,14 +3518,17 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
|
||||
if (zs->zs_mirrors >= 1) {
|
||||
ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
|
||||
ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
|
||||
oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
|
||||
oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children];
|
||||
}
|
||||
|
||||
/* pick a child out of the raidz group */
|
||||
if (ztest_opts.zo_raidz > 1) {
|
||||
ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
|
||||
ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
|
||||
oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
|
||||
if (ztest_opts.zo_raid_children > 1) {
|
||||
if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0)
|
||||
ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
|
||||
else
|
||||
ASSERT(oldvd->vdev_ops == &vdev_draid_ops);
|
||||
ASSERT(oldvd->vdev_children == ztest_opts.zo_raid_children);
|
||||
oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children];
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -3447,6 +3575,10 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
|
||||
if (sav->sav_count != 0 && ztest_random(3) == 0) {
|
||||
newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
|
||||
newvd_is_spare = B_TRUE;
|
||||
|
||||
if (newvd->vdev_ops == &vdev_draid_spare_ops)
|
||||
newvd_is_dspare = B_TRUE;
|
||||
|
||||
(void) strcpy(newpath, newvd->vdev_path);
|
||||
} else {
|
||||
(void) snprintf(newpath, MAXPATHLEN, ztest_dev_template,
|
||||
@@ -3480,6 +3612,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
|
||||
* If newvd is already part of the pool, it should fail with EBUSY.
|
||||
*
|
||||
* If newvd is too small, it should fail with EOVERFLOW.
|
||||
*
|
||||
* If newvd is a distributed spare and it's being attached to a
|
||||
* dRAID which is not its parent it should fail with ENOTSUP.
|
||||
*/
|
||||
if (pvd->vdev_ops != &vdev_mirror_ops &&
|
||||
pvd->vdev_ops != &vdev_root_ops && (!replacing ||
|
||||
@@ -3492,10 +3627,12 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
|
||||
expected_error = replacing ? 0 : EBUSY;
|
||||
else if (vdev_lookup_by_path(rvd, newpath) != NULL)
|
||||
expected_error = EBUSY;
|
||||
else if (newsize < oldsize)
|
||||
else if (!newvd_is_dspare && newsize < oldsize)
|
||||
expected_error = EOVERFLOW;
|
||||
else if (ashift > oldvd->vdev_top->vdev_ashift)
|
||||
expected_error = EDOM;
|
||||
else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd))
|
||||
expected_error = ENOTSUP;
|
||||
else
|
||||
expected_error = 0;
|
||||
|
||||
@@ -4880,13 +5017,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
|
||||
void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
|
||||
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
|
||||
|
||||
VERIFY(0 == dmu_read(os, packobj, packoff,
|
||||
VERIFY0(dmu_read(os, packobj, packoff,
|
||||
packsize, packcheck, DMU_READ_PREFETCH));
|
||||
VERIFY(0 == dmu_read(os, bigobj, bigoff,
|
||||
VERIFY0(dmu_read(os, bigobj, bigoff,
|
||||
bigsize, bigcheck, DMU_READ_PREFETCH));
|
||||
|
||||
ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
|
||||
ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
|
||||
ASSERT0(bcmp(packbuf, packcheck, packsize));
|
||||
ASSERT0(bcmp(bigbuf, bigcheck, bigsize));
|
||||
|
||||
umem_free(packcheck, packsize);
|
||||
umem_free(bigcheck, bigsize);
|
||||
@@ -5761,7 +5898,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
|
||||
}
|
||||
|
||||
maxfaults = MAXFAULTS(zs);
|
||||
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
|
||||
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children;
|
||||
mirror_save = zs->zs_mirrors;
|
||||
mutex_exit(&ztest_vdev_lock);
|
||||
|
||||
@@ -6011,7 +6148,7 @@ out:
|
||||
/*
|
||||
* By design ztest will never inject uncorrectable damage in to the pool.
|
||||
* Issue a scrub, wait for it to complete, and verify there is never any
|
||||
* any persistent damage.
|
||||
* persistent damage.
|
||||
*
|
||||
* Only after a full scrub has been completed is it safe to start injecting
|
||||
* data corruption. See the comment in ztest_fault_inject().
|
||||
@@ -7347,7 +7484,7 @@ ztest_init(ztest_shared_t *zs)
|
||||
zs->zs_splits = 0;
|
||||
zs->zs_mirrors = ztest_opts.zo_mirrors;
|
||||
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
|
||||
NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
|
||||
NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1);
|
||||
props = make_random_props();
|
||||
|
||||
/*
|
||||
@@ -7683,10 +7820,12 @@ main(int argc, char **argv)
|
||||
|
||||
if (ztest_opts.zo_verbose >= 1) {
|
||||
(void) printf("%llu vdevs, %d datasets, %d threads,"
|
||||
" %llu seconds...\n",
|
||||
"%d %s disks, %llu seconds...\n\n",
|
||||
(u_longlong_t)ztest_opts.zo_vdevs,
|
||||
ztest_opts.zo_datasets,
|
||||
ztest_opts.zo_threads,
|
||||
ztest_opts.zo_raid_children,
|
||||
ztest_opts.zo_raid_type,
|
||||
(u_longlong_t)ztest_opts.zo_time);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user