Distributed Spare (dRAID) Feature

This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID.  This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.

A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`.  No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.

    zpool create <pool> draid[1,2,3] <vdevs...>

Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons.  The supported options include:

    zpool create <pool> \
        draid[<parity>][:<data>d][:<children>c][:<spares>s] \
        <vdevs...>

    - draid[parity]       - Parity level (default 1)
    - draid[:<data>d]     - Data devices per group (default 8)
    - draid[:<children>c] - Expected number of child vdevs
    - draid[:<spares>s]   - Distributed hot spares (default 0)

Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.

```
  pool: tank
 state: ONLINE
config:

    NAME                  STATE     READ WRITE CKSUM
    slag7                 ONLINE       0     0     0
      draid2:8d:68c:2s-0  ONLINE       0     0     0
        L0                ONLINE       0     0     0
        L1                ONLINE       0     0     0
        ...
        U25               ONLINE       0     0     0
        U26               ONLINE       0     0     0
        spare-53          ONLINE       0     0     0
          U27             ONLINE       0     0     0
          draid2-0-0      ONLINE       0     0     0
        U28               ONLINE       0     0     0
        U29               ONLINE       0     0     0
        ...
        U42               ONLINE       0     0     0
        U43               ONLINE       0     0     0
    special
      mirror-1            ONLINE       0     0     0
        L5                ONLINE       0     0     0
        U5                ONLINE       0     0     0
      mirror-2            ONLINE       0     0     0
        L6                ONLINE       0     0     0
        U6                ONLINE       0     0     0
    spares
      draid2-0-0          INUSE     currently in use
      draid2-0-1          AVAIL
```

When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command.  These options are leverages
by zloop.sh to test a wide range of dRAID configurations.

    -K draid|raidz|random - kind of RAID to test
    -D <value>            - dRAID data drives per group
    -S <value>            - dRAID distributed hot spares
    -R <value>            - RAID parity (raidz or dRAID)

The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.

Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
This commit is contained in:
Brian Behlendorf
2020-11-13 13:51:51 -08:00
committed by GitHub
parent a724db0374
commit b2255edcc0
153 changed files with 10203 additions and 1882 deletions
+39 -22
View File
@@ -29,6 +29,7 @@
#include <sys/debug.h>
#include <sys/kstat.h>
#include <sys/abd.h>
#include <sys/vdev_impl.h>
#ifdef __cplusplus
extern "C" {
@@ -106,30 +107,45 @@ typedef struct raidz_col {
uint64_t rc_offset; /* device offset */
uint64_t rc_size; /* I/O size */
abd_t *rc_abd; /* I/O data */
void *rc_gdata; /* used to store the "good" version */
void *rc_orig_data; /* pre-reconstruction */
abd_t *rc_gdata; /* used to store the "good" version */
int rc_error; /* I/O error for this device */
uint8_t rc_tried; /* Did we attempt this I/O column? */
uint8_t rc_skipped; /* Did we skip this I/O column? */
uint8_t rc_need_orig_restore; /* need to restore from orig_data? */
uint8_t rc_repair; /* Write good data to this column */
} raidz_col_t;
typedef struct raidz_row {
uint64_t rr_cols; /* Regular column count */
uint64_t rr_scols; /* Count including skipped columns */
uint64_t rr_bigcols; /* Remainder data column count */
uint64_t rr_missingdata; /* Count of missing data devices */
uint64_t rr_missingparity; /* Count of missing parity devices */
uint64_t rr_firstdatacol; /* First data column/parity count */
abd_t *rr_abd_copy; /* rm_asize-buffer of copied data */
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
int rr_nempty; /* empty sectors included in parity */
int rr_code; /* reconstruction code (unused) */
#ifdef ZFS_DEBUG
uint64_t rr_offset; /* Logical offset for *_io_verify() */
uint64_t rr_size; /* Physical size for *_io_verify() */
#endif
raidz_col_t rr_col[0]; /* Flexible array of I/O columns */
} raidz_row_t;
typedef struct raidz_map {
uint64_t rm_cols; /* Regular column count */
uint64_t rm_scols; /* Count including skipped columns */
uint64_t rm_bigcols; /* Number of oversized columns */
uint64_t rm_asize; /* Actual total I/O size */
uint64_t rm_missingdata; /* Count of missing data devices */
uint64_t rm_missingparity; /* Count of missing parity devices */
uint64_t rm_firstdatacol; /* First data column/parity count */
uint64_t rm_nskip; /* Skipped sectors for padding */
uint64_t rm_skipstart; /* Column index of padding start */
abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
uintptr_t rm_reports; /* # of referencing checksum reports */
uint8_t rm_freed; /* map no longer has referencing ZIO */
uint8_t rm_ecksuminjected; /* checksum error was injected */
boolean_t rm_freed; /* map no longer has referencing ZIO */
boolean_t rm_ecksuminjected; /* checksum error was injected */
int rm_nrows; /* Regular row count */
int rm_nskip; /* RAIDZ sectors skipped for padding */
int rm_skipstart; /* Column index of padding start */
const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
raidz_row_t *rm_row[0]; /* flexible array of rows */
} raidz_map_t;
#define RAIDZ_ORIGINAL_IMPL (INT_MAX)
extern const raidz_impl_ops_t vdev_raidz_scalar_impl;
@@ -163,14 +179,15 @@ extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl;
*
* raidz_parity Returns parity of the RAIDZ block
* raidz_ncols Returns number of columns the block spans
* Note, all rows have the same number of columns.
* raidz_nbigcols Returns number of big columns
* raidz_col_p Returns pointer to a column
* raidz_col_size Returns size of a column
* raidz_big_size Returns size of big columns
* raidz_short_size Returns size of short columns
*/
#define raidz_parity(rm) ((rm)->rm_firstdatacol)
#define raidz_ncols(rm) ((rm)->rm_cols)
#define raidz_parity(rm) ((rm)->rm_row[0]->rr_firstdatacol)
#define raidz_ncols(rm) ((rm)->rm_row[0]->rr_cols)
#define raidz_nbigcols(rm) ((rm)->rm_bigcols)
#define raidz_col_p(rm, c) ((rm)->rm_col + (c))
#define raidz_col_size(rm, c) ((rm)->rm_col[c].rc_size)
@@ -185,10 +202,10 @@ extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl;
*/
#define _RAIDZ_GEN_WRAP(code, impl) \
static void \
impl ## _gen_ ## code(void *rmp) \
impl ## _gen_ ## code(void *rrp) \
{ \
raidz_map_t *rm = (raidz_map_t *)rmp; \
raidz_generate_## code ## _impl(rm); \
raidz_row_t *rr = (raidz_row_t *)rrp; \
raidz_generate_## code ## _impl(rr); \
}
/*
@@ -199,10 +216,10 @@ impl ## _gen_ ## code(void *rmp) \
*/
#define _RAIDZ_REC_WRAP(code, impl) \
static int \
impl ## _rec_ ## code(void *rmp, const int *tgtidx) \
impl ## _rec_ ## code(void *rrp, const int *tgtidx) \
{ \
raidz_map_t *rm = (raidz_map_t *)rmp; \
return (raidz_reconstruct_## code ## _impl(rm, tgtidx)); \
raidz_row_t *rr = (raidz_row_t *)rrp; \
return (raidz_reconstruct_## code ## _impl(rr, tgtidx)); \
}
/*