Distributed Spare (dRAID) Feature

This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID.  This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.

A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`.  No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.

    zpool create <pool> draid[1,2,3] <vdevs...>

Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons.  The supported options include:

    zpool create <pool> \
        draid[<parity>][:<data>d][:<children>c][:<spares>s] \
        <vdevs...>

    - draid[parity]       - Parity level (default 1)
    - draid[:<data>d]     - Data devices per group (default 8)
    - draid[:<children>c] - Expected number of child vdevs
    - draid[:<spares>s]   - Distributed hot spares (default 0)

Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.

```
  pool: tank
 state: ONLINE
config:

    NAME                  STATE     READ WRITE CKSUM
    slag7                 ONLINE       0     0     0
      draid2:8d:68c:2s-0  ONLINE       0     0     0
        L0                ONLINE       0     0     0
        L1                ONLINE       0     0     0
        ...
        U25               ONLINE       0     0     0
        U26               ONLINE       0     0     0
        spare-53          ONLINE       0     0     0
          U27             ONLINE       0     0     0
          draid2-0-0      ONLINE       0     0     0
        U28               ONLINE       0     0     0
        U29               ONLINE       0     0     0
        ...
        U42               ONLINE       0     0     0
        U43               ONLINE       0     0     0
    special
      mirror-1            ONLINE       0     0     0
        L5                ONLINE       0     0     0
        U5                ONLINE       0     0     0
      mirror-2            ONLINE       0     0     0
        L6                ONLINE       0     0     0
        U6                ONLINE       0     0     0
    spares
      draid2-0-0          INUSE     currently in use
      draid2-0-1          AVAIL
```

When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command.  These options are leverages
by zloop.sh to test a wide range of dRAID configurations.

    -K draid|raidz|random - kind of RAID to test
    -D <value>            - dRAID data drives per group
    -S <value>            - dRAID distributed hot spares
    -R <value>            - RAID parity (raidz or dRAID)

The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.

Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
This commit is contained in:
Brian Behlendorf
2020-11-13 13:51:51 -08:00
committed by GitHub
parent a724db0374
commit b2255edcc0
153 changed files with 10203 additions and 1882 deletions
+123 -14
View File
@@ -33,6 +33,7 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
@@ -99,7 +100,6 @@ vdev_mirror_stat_fini(void)
/*
* Virtual device vector for mirroring.
*/
typedef struct mirror_child {
vdev_t *mc_vd;
uint64_t mc_offset;
@@ -108,6 +108,7 @@ typedef struct mirror_child {
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
uint8_t mc_rebuilding;
} mirror_child_t;
typedef struct mirror_map {
@@ -115,6 +116,7 @@ typedef struct mirror_map {
int mm_preferred_cnt;
int mm_children;
boolean_t mm_resilvering;
boolean_t mm_rebuilding;
boolean_t mm_root;
mirror_child_t mm_child[];
} mirror_map_t;
@@ -239,6 +241,21 @@ vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
return (load + zfs_vdev_mirror_rotating_seek_inc);
}
static boolean_t
vdev_mirror_rebuilding(vdev_t *vd)
{
if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
return (B_TRUE);
for (int i = 0; i < vd->vdev_children; i++) {
if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
return (B_TRUE);
}
}
return (B_FALSE);
}
/*
* Avoid inlining the function to keep vdev_mirror_io_start(), which
* is this functions only caller, as small as possible on the stack.
@@ -356,6 +373,9 @@ vdev_mirror_map_init(zio_t *zio)
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
mc->mc_offset = zio->io_offset;
if (vdev_mirror_rebuilding(mc->mc_vd))
mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
}
}
@@ -493,12 +513,37 @@ vdev_mirror_preferred_child_randomize(zio_t *zio)
return (mm->mm_preferred[p]);
}
static boolean_t
vdev_mirror_child_readable(mirror_child_t *mc)
{
vdev_t *vd = mc->mc_vd;
if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
return (vdev_draid_readable(vd, mc->mc_offset));
else
return (vdev_readable(vd));
}
static boolean_t
vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
{
vdev_t *vd = mc->mc_vd;
if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
else
return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
}
/*
* Try to find a vdev whose DTL doesn't contain the block we want to read
* preferring vdevs based on determined load.
* preferring vdevs based on determined load. If we can't, try the read on
* any vdev we haven't already tried.
*
* Try to find a child whose DTL doesn't contain the block we want to read.
* If we can't, try the read on any vdev we haven't already tried.
* Distributed spares are an exception to the above load rule. They are
* always preferred in order to detect gaps in the distributed spare which
* are created when another disk in the dRAID fails. In order to restore
* redundancy those gaps must be read to trigger the required repair IO.
*/
static int
vdev_mirror_child_select(zio_t *zio)
@@ -518,20 +563,27 @@ vdev_mirror_child_select(zio_t *zio)
if (mc->mc_tried || mc->mc_skipped)
continue;
if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
if (mc->mc_vd == NULL ||
!vdev_mirror_child_readable(mc)) {
mc->mc_error = SET_ERROR(ENXIO);
mc->mc_tried = 1; /* don't even try */
mc->mc_skipped = 1;
continue;
}
if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
if (vdev_mirror_child_missing(mc, txg, 1)) {
mc->mc_error = SET_ERROR(ESTALE);
mc->mc_skipped = 1;
mc->mc_speculative = 1;
continue;
}
if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
mm->mm_preferred[0] = c;
mm->mm_preferred_cnt = 1;
break;
}
mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
if (mc->mc_load > lowest_load)
continue;
@@ -625,11 +677,25 @@ vdev_mirror_io_start(zio_t *zio)
while (children--) {
mc = &mm->mm_child[c];
c++;
/*
* When sequentially resilvering only issue write repair
* IOs to the vdev which is being rebuilt since performance
* is limited by the slowest child. This is an issue for
* faster replacement devices such as distributed spares.
*/
if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!(zio->io_flags & ZIO_FLAG_SCRUB) &&
mm->mm_rebuilding && !mc->mc_rebuilding) {
continue;
}
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_child_done, mc));
c++;
}
zio_execute(zio);
@@ -744,6 +810,8 @@ vdev_mirror_io_done(zio_t *zio)
mc = &mm->mm_child[c];
if (mc->mc_error == 0) {
vdev_ops_t *ops = mc->mc_vd->vdev_ops;
if (mc->mc_tried)
continue;
/*
@@ -752,15 +820,16 @@ vdev_mirror_io_done(zio_t *zio)
* 1. it's a scrub (in which case we have
* tried everything that was healthy)
* - or -
* 2. it's an indirect vdev (in which case
* it could point to any other vdev, which
* might have a bad DTL)
* 2. it's an indirect or distributed spare
* vdev (in which case it could point to any
* other vdev, which might have a bad DTL)
* - or -
* 3. the DTL indicates that this data is
* missing from this vdev
*/
if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
ops != &vdev_indirect_ops &&
ops != &vdev_draid_spare_ops &&
!vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
zio->io_txg, 1))
continue;
@@ -796,50 +865,90 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
}
}
/*
* Return the maximum asize for a rebuild zio in the provided range.
*/
static uint64_t
vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
uint64_t max_segment)
{
uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
SPA_MAXBLOCKSIZE);
return (MIN(asize, vdev_psize_to_asize(vd, psize)));
}
vdev_ops_t vdev_mirror_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_mirror_open,
.vdev_op_close = vdev_mirror_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_mirror_io_start,
.vdev_op_io_done = vdev_mirror_io_done,
.vdev_op_state_change = vdev_mirror_state_change,
.vdev_op_need_resilver = NULL,
.vdev_op_need_resilver = vdev_default_need_resilver,
.vdev_op_hold = NULL,
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
vdev_ops_t vdev_replacing_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_mirror_open,
.vdev_op_close = vdev_mirror_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_mirror_io_start,
.vdev_op_io_done = vdev_mirror_io_done,
.vdev_op_state_change = vdev_mirror_state_change,
.vdev_op_need_resilver = NULL,
.vdev_op_need_resilver = vdev_default_need_resilver,
.vdev_op_hold = NULL,
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
vdev_ops_t vdev_spare_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_mirror_open,
.vdev_op_close = vdev_mirror_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_mirror_io_start,
.vdev_op_io_done = vdev_mirror_io_done,
.vdev_op_state_change = vdev_mirror_state_change,
.vdev_op_need_resilver = NULL,
.vdev_op_need_resilver = vdev_default_need_resilver,
.vdev_op_hold = NULL,
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};