mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-25 03:37:45 +03:00
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
tank ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
This commit is contained in:
+94
-32
@@ -60,6 +60,7 @@
|
||||
#include <sys/vdev_rebuild.h>
|
||||
#include <sys/vdev_trim.h>
|
||||
#include <sys/vdev_disk.h>
|
||||
#include <sys/vdev_draid.h>
|
||||
#include <sys/metaslab.h>
|
||||
#include <sys/metaslab_impl.h>
|
||||
#include <sys/mmp.h>
|
||||
@@ -3681,7 +3682,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
|
||||
/*
|
||||
* Build a new vdev tree from the trusted config
|
||||
*/
|
||||
VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
|
||||
error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
|
||||
if (error != 0) {
|
||||
nvlist_free(mos_config);
|
||||
spa_config_exit(spa, SCL_ALL, FTAG);
|
||||
spa_load_failed(spa, "spa_config_parse failed [error=%d]",
|
||||
error);
|
||||
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
|
||||
}
|
||||
|
||||
/*
|
||||
* Vdev paths in the MOS may be obsolete. If the untrusted config was
|
||||
@@ -5631,7 +5639,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
|
||||
uint64_t txg = TXG_INITIAL;
|
||||
nvlist_t **spares, **l2cache;
|
||||
uint_t nspares, nl2cache;
|
||||
uint64_t version, obj;
|
||||
uint64_t version, obj, ndraid = 0;
|
||||
boolean_t has_features;
|
||||
boolean_t has_encryption;
|
||||
boolean_t has_allocclass;
|
||||
@@ -5753,8 +5761,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
|
||||
|
||||
if (error == 0 &&
|
||||
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
|
||||
(error = spa_validate_aux(spa, nvroot, txg,
|
||||
VDEV_ALLOC_ADD)) == 0) {
|
||||
(error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
|
||||
(error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
|
||||
/*
|
||||
* instantiate the metaslab groups (this will dirty the vdevs)
|
||||
* we can no longer error exit past this point
|
||||
@@ -5895,6 +5903,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
|
||||
spa_sync_props(props, tx);
|
||||
}
|
||||
|
||||
for (int i = 0; i < ndraid; i++)
|
||||
spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
spa->spa_sync_on = B_TRUE;
|
||||
@@ -6403,13 +6414,26 @@ spa_reset(const char *pool)
|
||||
* ==========================================================================
|
||||
*/
|
||||
|
||||
/*
|
||||
* This is called as a synctask to increment the draid feature flag.
* The number of increments is carried by value in 'arg' (an integer
* cast through uintptr_t, not a pointer to be dereferenced), and
* SPA_FEATURE_DRAID is incremented once per unit of that count.
|
||||
*/
|
||||
static void
|
||||
spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
|
||||
{
|
||||
/* The spa is looked up from the transaction's pool, not passed in. */
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
|
||||
/*
 * NOTE(review): the count is narrowed to int here, while the visible
 * caller encodes a uint64_t into 'arg' -- confirm it cannot exceed INT_MAX.
 */
int draid = (int)(uintptr_t)arg;
|
||||
|
||||
/* Issue one spa_feature_incr() per unit of the decoded count. */
for (int c = 0; c < draid; c++)
|
||||
spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Add a device to a storage pool.
|
||||
*/
|
||||
int
|
||||
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
|
||||
{
|
||||
uint64_t txg;
|
||||
uint64_t txg, ndraid = 0;
|
||||
int error;
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
vdev_t *vd, *tvd;
|
||||
@@ -6438,8 +6462,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
|
||||
return (spa_vdev_exit(spa, vd, txg, EINVAL));
|
||||
|
||||
if (vd->vdev_children != 0 &&
|
||||
(error = vdev_create(vd, txg, B_FALSE)) != 0)
|
||||
(error = vdev_create(vd, txg, B_FALSE)) != 0) {
|
||||
return (spa_vdev_exit(spa, vd, txg, error));
|
||||
}
|
||||
|
||||
/*
|
||||
* The virtual dRAID spares must be added after vdev tree is created
|
||||
* and the vdev guids are generated. The guid of their associated
|
||||
* dRAID is stored in the config and used when opening the spare.
|
||||
*/
|
||||
if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
|
||||
rvd->vdev_children)) == 0) {
|
||||
if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
|
||||
ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
|
||||
nspares = 0;
|
||||
} else {
|
||||
return (spa_vdev_exit(spa, vd, txg, error));
|
||||
}
|
||||
|
||||
/*
|
||||
* We must validate the spares and l2cache devices after checking the
|
||||
@@ -6452,7 +6491,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
|
||||
* If we are in the middle of a device removal, we can only add
|
||||
* devices which match the existing devices in the pool.
|
||||
* If we are in the middle of a removal, or have some indirect
|
||||
* vdevs, we can not add raidz toplevels.
|
||||
* vdevs, we can not add raidz or dRAID top levels.
|
||||
*/
|
||||
if (spa->spa_vdev_removal != NULL ||
|
||||
spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
|
||||
@@ -6462,10 +6501,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
|
||||
tvd->vdev_ashift != spa->spa_max_ashift) {
|
||||
return (spa_vdev_exit(spa, vd, txg, EINVAL));
|
||||
}
|
||||
/* Fail if top level vdev is raidz */
|
||||
if (tvd->vdev_ops == &vdev_raidz_ops) {
|
||||
/* Fail if top level vdev is raidz or a dRAID */
|
||||
if (vdev_get_nparity(tvd) != 0)
|
||||
return (spa_vdev_exit(spa, vd, txg, EINVAL));
|
||||
}
|
||||
|
||||
/*
|
||||
* Need the top level mirror to be
|
||||
* a mirror of leaf vdevs only
|
||||
@@ -6505,6 +6544,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
|
||||
spa->spa_l2cache.sav_sync = B_TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
* We can't increment a feature while holding spa_vdev so we
|
||||
* have to do it in a synctask.
|
||||
*/
|
||||
if (ndraid != 0) {
|
||||
dmu_tx_t *tx;
|
||||
|
||||
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
|
||||
dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
|
||||
(void *)(uintptr_t)ndraid, tx);
|
||||
dmu_tx_commit(tx);
|
||||
}
|
||||
|
||||
/*
|
||||
* We have to be careful when adding new vdevs to an existing pool.
|
||||
* If other threads start allocating from these vdevs before we
|
||||
@@ -6615,14 +6667,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
|
||||
if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
|
||||
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
|
||||
|
||||
/*
|
||||
* A dRAID spare can only replace a child of its parent dRAID vdev.
|
||||
*/
|
||||
if (newvd->vdev_ops == &vdev_draid_spare_ops &&
|
||||
oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
|
||||
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
|
||||
}
|
||||
|
||||
if (rebuild) {
|
||||
/*
|
||||
* For rebuilds, the parent vdev must support reconstruction
|
||||
* For rebuilds, the top vdev must support reconstruction
|
||||
* using only space maps. This means the only allowable
|
||||
* parents are the root vdev or a mirror vdev.
|
||||
* vdev types are the root vdev, a mirror, or dRAID.
|
||||
*/
|
||||
if (pvd->vdev_ops != &vdev_mirror_ops &&
|
||||
pvd->vdev_ops != &vdev_root_ops) {
|
||||
tvd = pvd;
|
||||
if (pvd->vdev_top != NULL)
|
||||
tvd = pvd->vdev_top;
|
||||
|
||||
if (tvd->vdev_ops != &vdev_mirror_ops &&
|
||||
tvd->vdev_ops != &vdev_root_ops &&
|
||||
tvd->vdev_ops != &vdev_draid_ops) {
|
||||
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
|
||||
}
|
||||
}
|
||||
@@ -6915,14 +6980,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
|
||||
}
|
||||
|
||||
/*
|
||||
* If we are detaching the original disk from a spare, then it implies
|
||||
* that the spare should become a real disk, and be removed from the
|
||||
* active spare list for the pool.
|
||||
* If we are detaching the original disk from a normal spare, then it
|
||||
* implies that the spare should become a real disk, and be removed
|
||||
* from the active spare list for the pool. dRAID spares on the
|
||||
* other hand are coupled to the pool and thus should never be removed
|
||||
* from the spares list.
|
||||
*/
|
||||
if (pvd->vdev_ops == &vdev_spare_ops &&
|
||||
vd->vdev_id == 0 &&
|
||||
pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
|
||||
unspare = B_TRUE;
|
||||
if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
|
||||
vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
|
||||
|
||||
if (last_cvd->vdev_isspare &&
|
||||
last_cvd->vdev_ops != &vdev_draid_spare_ops) {
|
||||
unspare = B_TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Erase the disk labels so the disk can be used for other things.
|
||||
@@ -8013,18 +8084,9 @@ spa_async_thread(void *arg)
|
||||
/*
|
||||
* If any devices are done replacing, detach them.
|
||||
*/
|
||||
if (tasks & SPA_ASYNC_RESILVER_DONE)
|
||||
if (tasks & SPA_ASYNC_RESILVER_DONE ||
|
||||
tasks & SPA_ASYNC_REBUILD_DONE) {
|
||||
spa_vdev_resilver_done(spa);
|
||||
|
||||
/*
|
||||
* If any devices are done replacing, detach them. Then if no
|
||||
* top-level vdevs are rebuilding attempt to kick off a scrub.
|
||||
*/
|
||||
if (tasks & SPA_ASYNC_REBUILD_DONE) {
|
||||
spa_vdev_resilver_done(spa);
|
||||
|
||||
if (!vdev_rebuild_active(spa->spa_root_vdev))
|
||||
(void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user