mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-24 11:18:52 +03:00
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[<parity>] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
tank ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
This commit is contained in:
@@ -29,6 +29,7 @@
|
||||
#include <sys/debug.h>
|
||||
#include <sys/kstat.h>
|
||||
#include <sys/abd.h>
|
||||
#include <sys/vdev_impl.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -106,30 +107,45 @@ typedef struct raidz_col {
|
||||
uint64_t rc_offset; /* device offset */
|
||||
uint64_t rc_size; /* I/O size */
|
||||
abd_t *rc_abd; /* I/O data */
|
||||
void *rc_gdata; /* used to store the "good" version */
|
||||
void *rc_orig_data; /* pre-reconstruction */
|
||||
abd_t *rc_gdata; /* used to store the "good" version */
|
||||
int rc_error; /* I/O error for this device */
|
||||
uint8_t rc_tried; /* Did we attempt this I/O column? */
|
||||
uint8_t rc_skipped; /* Did we skip this I/O column? */
|
||||
uint8_t rc_need_orig_restore; /* need to restore from orig_data? */
|
||||
uint8_t rc_repair; /* Write good data to this column */
|
||||
} raidz_col_t;
|
||||
|
||||
typedef struct raidz_row {
|
||||
uint64_t rr_cols; /* Regular column count */
|
||||
uint64_t rr_scols; /* Count including skipped columns */
|
||||
uint64_t rr_bigcols; /* Remainder data column count */
|
||||
uint64_t rr_missingdata; /* Count of missing data devices */
|
||||
uint64_t rr_missingparity; /* Count of missing parity devices */
|
||||
uint64_t rr_firstdatacol; /* First data column/parity count */
|
||||
abd_t *rr_abd_copy; /* rm_asize-buffer of copied data */
|
||||
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
|
||||
int rr_nempty; /* empty sectors included in parity */
|
||||
int rr_code; /* reconstruction code (unused) */
|
||||
#ifdef ZFS_DEBUG
|
||||
uint64_t rr_offset; /* Logical offset for *_io_verify() */
|
||||
uint64_t rr_size; /* Physical size for *_io_verify() */
|
||||
#endif
|
||||
raidz_col_t rr_col[0]; /* Flexible array of I/O columns */
|
||||
} raidz_row_t;
|
||||
|
||||
typedef struct raidz_map {
|
||||
uint64_t rm_cols; /* Regular column count */
|
||||
uint64_t rm_scols; /* Count including skipped columns */
|
||||
uint64_t rm_bigcols; /* Number of oversized columns */
|
||||
uint64_t rm_asize; /* Actual total I/O size */
|
||||
uint64_t rm_missingdata; /* Count of missing data devices */
|
||||
uint64_t rm_missingparity; /* Count of missing parity devices */
|
||||
uint64_t rm_firstdatacol; /* First data column/parity count */
|
||||
uint64_t rm_nskip; /* Skipped sectors for padding */
|
||||
uint64_t rm_skipstart; /* Column index of padding start */
|
||||
abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
|
||||
uintptr_t rm_reports; /* # of referencing checksum reports */
|
||||
uint8_t rm_freed; /* map no longer has referencing ZIO */
|
||||
uint8_t rm_ecksuminjected; /* checksum error was injected */
|
||||
boolean_t rm_freed; /* map no longer has referencing ZIO */
|
||||
boolean_t rm_ecksuminjected; /* checksum error was injected */
|
||||
int rm_nrows; /* Regular row count */
|
||||
int rm_nskip; /* RAIDZ sectors skipped for padding */
|
||||
int rm_skipstart; /* Column index of padding start */
|
||||
const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
|
||||
raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
|
||||
raidz_row_t *rm_row[0]; /* flexible array of rows */
|
||||
} raidz_map_t;
|
||||
|
||||
|
||||
#define RAIDZ_ORIGINAL_IMPL (INT_MAX)
|
||||
|
||||
extern const raidz_impl_ops_t vdev_raidz_scalar_impl;
|
||||
@@ -163,14 +179,15 @@ extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl;
|
||||
*
|
||||
* raidz_parity Returns parity of the RAIDZ block
|
||||
* raidz_ncols Returns number of columns the block spans
|
||||
* Note, all rows have the same number of columns.
|
||||
* raidz_nbigcols Returns number of big columns
|
||||
* raidz_col_p Returns pointer to a column
|
||||
* raidz_col_size Returns size of a column
|
||||
* raidz_big_size Returns size of big columns
|
||||
* raidz_short_size Returns size of short columns
|
||||
*/
|
||||
#define raidz_parity(rm) ((rm)->rm_firstdatacol)
|
||||
#define raidz_ncols(rm) ((rm)->rm_cols)
|
||||
#define raidz_parity(rm) ((rm)->rm_row[0]->rr_firstdatacol)
|
||||
#define raidz_ncols(rm) ((rm)->rm_row[0]->rr_cols)
|
||||
#define raidz_nbigcols(rm) ((rm)->rm_bigcols)
|
||||
#define raidz_col_p(rm, c) ((rm)->rm_col + (c))
|
||||
#define raidz_col_size(rm, c) ((rm)->rm_col[c].rc_size)
|
||||
@@ -185,10 +202,10 @@ extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl;
|
||||
*/
|
||||
#define _RAIDZ_GEN_WRAP(code, impl) \
|
||||
static void \
|
||||
impl ## _gen_ ## code(void *rmp) \
|
||||
impl ## _gen_ ## code(void *rrp) \
|
||||
{ \
|
||||
raidz_map_t *rm = (raidz_map_t *)rmp; \
|
||||
raidz_generate_## code ## _impl(rm); \
|
||||
raidz_row_t *rr = (raidz_row_t *)rrp; \
|
||||
raidz_generate_## code ## _impl(rr); \
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -199,10 +216,10 @@ impl ## _gen_ ## code(void *rmp) \
|
||||
*/
|
||||
#define _RAIDZ_REC_WRAP(code, impl) \
|
||||
static int \
|
||||
impl ## _rec_ ## code(void *rmp, const int *tgtidx) \
|
||||
impl ## _rec_ ## code(void *rrp, const int *tgtidx) \
|
||||
{ \
|
||||
raidz_map_t *rm = (raidz_map_t *)rmp; \
|
||||
return (raidz_reconstruct_## code ## _impl(rm, tgtidx)); \
|
||||
raidz_row_t *rr = (raidz_row_t *)rrp; \
|
||||
return (raidz_reconstruct_## code ## _impl(rr, tgtidx)); \
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user