mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-24 03:08:51 +03:00
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
tank ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
This commit is contained in:
@@ -121,6 +121,8 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
|
||||
if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
|
||||
vd->vdev_initialize_action_time = gethrestime_sec();
|
||||
}
|
||||
|
||||
vdev_initializing_state_t old_state = vd->vdev_initialize_state;
|
||||
vd->vdev_initialize_state = new_state;
|
||||
|
||||
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
|
||||
@@ -138,8 +140,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
|
||||
"vdev=%s suspended", vd->vdev_path);
|
||||
break;
|
||||
case VDEV_INITIALIZE_CANCELED:
|
||||
spa_history_log_internal(spa, "initialize", tx,
|
||||
"vdev=%s canceled", vd->vdev_path);
|
||||
if (old_state == VDEV_INITIALIZE_ACTIVE ||
|
||||
old_state == VDEV_INITIALIZE_SUSPENDED)
|
||||
spa_history_log_internal(spa, "initialize", tx,
|
||||
"vdev=%s canceled", vd->vdev_path);
|
||||
break;
|
||||
case VDEV_INITIALIZE_COMPLETE:
|
||||
spa_history_log_internal(spa, "initialize", tx,
|
||||
@@ -317,6 +321,32 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
|
||||
return (0);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
|
||||
{
|
||||
uint64_t *last_rs_end = (uint64_t *)arg;
|
||||
|
||||
if (physical_rs->rs_end > *last_rs_end)
|
||||
*last_rs_end = physical_rs->rs_end;
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
|
||||
{
|
||||
vdev_t *vd = (vdev_t *)arg;
|
||||
|
||||
uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
|
||||
vd->vdev_initialize_bytes_est += size;
|
||||
|
||||
if (vd->vdev_initialize_last_offset > physical_rs->rs_end) {
|
||||
vd->vdev_initialize_bytes_done += size;
|
||||
} else if (vd->vdev_initialize_last_offset > physical_rs->rs_start &&
|
||||
vd->vdev_initialize_last_offset < physical_rs->rs_end) {
|
||||
vd->vdev_initialize_bytes_done +=
|
||||
vd->vdev_initialize_last_offset - physical_rs->rs_start;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_initialize_calculate_progress(vdev_t *vd)
|
||||
{
|
||||
@@ -331,28 +361,35 @@ vdev_initialize_calculate_progress(vdev_t *vd)
|
||||
metaslab_t *msp = vd->vdev_top->vdev_ms[i];
|
||||
mutex_enter(&msp->ms_lock);
|
||||
|
||||
uint64_t ms_free = msp->ms_size -
|
||||
metaslab_allocated_space(msp);
|
||||
|
||||
if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
|
||||
ms_free /= vd->vdev_top->vdev_children;
|
||||
uint64_t ms_free = (msp->ms_size -
|
||||
metaslab_allocated_space(msp)) /
|
||||
vdev_get_ndisks(vd->vdev_top);
|
||||
|
||||
/*
|
||||
* Convert the metaslab range to a physical range
|
||||
* on our vdev. We use this to determine if we are
|
||||
* in the middle of this metaslab range.
|
||||
*/
|
||||
range_seg64_t logical_rs, physical_rs;
|
||||
range_seg64_t logical_rs, physical_rs, remain_rs;
|
||||
logical_rs.rs_start = msp->ms_start;
|
||||
logical_rs.rs_end = msp->ms_start + msp->ms_size;
|
||||
vdev_xlate(vd, &logical_rs, &physical_rs);
|
||||
|
||||
/* Metaslab space after this offset has not been initialized */
|
||||
vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
|
||||
if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
|
||||
vd->vdev_initialize_bytes_est += ms_free;
|
||||
mutex_exit(&msp->ms_lock);
|
||||
continue;
|
||||
} else if (vd->vdev_initialize_last_offset >
|
||||
physical_rs.rs_end) {
|
||||
}
|
||||
|
||||
/* Metaslab space before this offset has been initialized */
|
||||
uint64_t last_rs_end = physical_rs.rs_end;
|
||||
if (!vdev_xlate_is_empty(&remain_rs)) {
|
||||
vdev_xlate_walk(vd, &remain_rs,
|
||||
vdev_initialize_xlate_last_rs_end, &last_rs_end);
|
||||
}
|
||||
|
||||
if (vd->vdev_initialize_last_offset > last_rs_end) {
|
||||
vd->vdev_initialize_bytes_done += ms_free;
|
||||
vd->vdev_initialize_bytes_est += ms_free;
|
||||
mutex_exit(&msp->ms_lock);
|
||||
@@ -374,22 +411,9 @@ vdev_initialize_calculate_progress(vdev_t *vd)
|
||||
&where)) {
|
||||
logical_rs.rs_start = rs_get_start(rs, rt);
|
||||
logical_rs.rs_end = rs_get_end(rs, rt);
|
||||
vdev_xlate(vd, &logical_rs, &physical_rs);
|
||||
|
||||
uint64_t size = physical_rs.rs_end -
|
||||
physical_rs.rs_start;
|
||||
vd->vdev_initialize_bytes_est += size;
|
||||
if (vd->vdev_initialize_last_offset >
|
||||
physical_rs.rs_end) {
|
||||
vd->vdev_initialize_bytes_done += size;
|
||||
} else if (vd->vdev_initialize_last_offset >
|
||||
physical_rs.rs_start &&
|
||||
vd->vdev_initialize_last_offset <
|
||||
physical_rs.rs_end) {
|
||||
vd->vdev_initialize_bytes_done +=
|
||||
vd->vdev_initialize_last_offset -
|
||||
physical_rs.rs_start;
|
||||
}
|
||||
vdev_xlate_walk(vd, &logical_rs,
|
||||
vdev_initialize_xlate_progress, vd);
|
||||
}
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
@@ -419,6 +443,34 @@ vdev_initialize_load(vdev_t *vd)
|
||||
return (err);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
|
||||
{
|
||||
vdev_t *vd = arg;
|
||||
|
||||
/* Only add segments that we have not visited yet */
|
||||
if (physical_rs->rs_end <= vd->vdev_initialize_last_offset)
|
||||
return;
|
||||
|
||||
/* Pick up where we left off mid-range. */
|
||||
if (vd->vdev_initialize_last_offset > physical_rs->rs_start) {
|
||||
zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
|
||||
"(%llu, %llu)", vd->vdev_path,
|
||||
(u_longlong_t)physical_rs->rs_start,
|
||||
(u_longlong_t)physical_rs->rs_end,
|
||||
(u_longlong_t)vd->vdev_initialize_last_offset,
|
||||
(u_longlong_t)physical_rs->rs_end);
|
||||
ASSERT3U(physical_rs->rs_end, >,
|
||||
vd->vdev_initialize_last_offset);
|
||||
physical_rs->rs_start = vd->vdev_initialize_last_offset;
|
||||
}
|
||||
|
||||
ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
|
||||
|
||||
range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
|
||||
physical_rs->rs_end - physical_rs->rs_start);
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert the logical range into a physical range and add it to our
|
||||
* avl tree.
|
||||
@@ -427,47 +479,12 @@ static void
|
||||
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
|
||||
{
|
||||
vdev_t *vd = arg;
|
||||
range_seg64_t logical_rs, physical_rs;
|
||||
range_seg64_t logical_rs;
|
||||
logical_rs.rs_start = start;
|
||||
logical_rs.rs_end = start + size;
|
||||
|
||||
ASSERT(vd->vdev_ops->vdev_op_leaf);
|
||||
vdev_xlate(vd, &logical_rs, &physical_rs);
|
||||
|
||||
IMPLY(vd->vdev_top == vd,
|
||||
logical_rs.rs_start == physical_rs.rs_start);
|
||||
IMPLY(vd->vdev_top == vd,
|
||||
logical_rs.rs_end == physical_rs.rs_end);
|
||||
|
||||
/* Only add segments that we have not visited yet */
|
||||
if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
|
||||
return;
|
||||
|
||||
/* Pick up where we left off mid-range. */
|
||||
if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
|
||||
zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
|
||||
"(%llu, %llu)", vd->vdev_path,
|
||||
(u_longlong_t)physical_rs.rs_start,
|
||||
(u_longlong_t)physical_rs.rs_end,
|
||||
(u_longlong_t)vd->vdev_initialize_last_offset,
|
||||
(u_longlong_t)physical_rs.rs_end);
|
||||
ASSERT3U(physical_rs.rs_end, >,
|
||||
vd->vdev_initialize_last_offset);
|
||||
physical_rs.rs_start = vd->vdev_initialize_last_offset;
|
||||
}
|
||||
ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
|
||||
|
||||
/*
|
||||
* With raidz, it's possible that the logical range does not live on
|
||||
* this leaf vdev. We only add the physical range to this vdev's if it
|
||||
* has a length greater than 0.
|
||||
*/
|
||||
if (physical_rs.rs_end > physical_rs.rs_start) {
|
||||
range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
|
||||
physical_rs.rs_end - physical_rs.rs_start);
|
||||
} else {
|
||||
ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
|
||||
}
|
||||
vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
Reference in New Issue
Block a user