mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-24 03:08:51 +03:00
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
This commit is contained in:
+253
-112
@@ -40,6 +40,7 @@
|
||||
#include <sys/dsl_dir.h>
|
||||
#include <sys/vdev_impl.h>
|
||||
#include <sys/vdev_rebuild.h>
|
||||
#include <sys/vdev_draid.h>
|
||||
#include <sys/uberblock_impl.h>
|
||||
#include <sys/metaslab.h>
|
||||
#include <sys/metaslab_impl.h>
|
||||
@@ -51,6 +52,7 @@
|
||||
#include <sys/arc.h>
|
||||
#include <sys/zil.h>
|
||||
#include <sys/dsl_scan.h>
|
||||
#include <sys/vdev_raidz.h>
|
||||
#include <sys/abd.h>
|
||||
#include <sys/vdev_initialize.h>
|
||||
#include <sys/vdev_trim.h>
|
||||
@@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
|
||||
static vdev_ops_t *vdev_ops_table[] = {
|
||||
&vdev_root_ops,
|
||||
&vdev_raidz_ops,
|
||||
&vdev_draid_ops,
|
||||
&vdev_draid_spare_ops,
|
||||
&vdev_mirror_ops,
|
||||
&vdev_replacing_ops,
|
||||
&vdev_spare_ops,
|
||||
@@ -221,10 +225,11 @@ vdev_getops(const char *type)
|
||||
|
||||
/* ARGSUSED */
|
||||
void
|
||||
vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
|
||||
vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
|
||||
range_seg64_t *physical_rs, range_seg64_t *remain_rs)
|
||||
{
|
||||
res->rs_start = in->rs_start;
|
||||
res->rs_end = in->rs_end;
|
||||
physical_rs->rs_start = logical_rs->rs_start;
|
||||
physical_rs->rs_end = logical_rs->rs_end;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
|
||||
return (asize);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
vdev_default_min_asize(vdev_t *vd)
|
||||
{
|
||||
return (vd->vdev_min_asize);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the minimum allocatable size. We define the allocatable size as
|
||||
* the vdev's asize rounded to the nearest metaslab. This allows us to
|
||||
@@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd)
|
||||
if (vd == vd->vdev_top)
|
||||
return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
|
||||
|
||||
/*
|
||||
* The allocatable space for a raidz vdev is N * sizeof(smallest child),
|
||||
* so each child must provide at least 1/Nth of its asize.
|
||||
*/
|
||||
if (pvd->vdev_ops == &vdev_raidz_ops)
|
||||
return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
|
||||
pvd->vdev_children);
|
||||
|
||||
return (pvd->vdev_min_asize);
|
||||
return (pvd->vdev_ops->vdev_op_min_asize(pvd));
|
||||
}
|
||||
|
||||
void
|
||||
@@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd)
|
||||
vdev_set_min_asize(vd->vdev_child[c]);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the minimal allocation size for the top-level vdev.
|
||||
*/
|
||||
uint64_t
|
||||
vdev_get_min_alloc(vdev_t *vd)
|
||||
{
|
||||
uint64_t min_alloc = 1ULL << vd->vdev_ashift;
|
||||
|
||||
if (vd->vdev_ops->vdev_op_min_alloc != NULL)
|
||||
min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
|
||||
|
||||
return (min_alloc);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the parity level for a top-level vdev.
|
||||
*/
|
||||
uint64_t
|
||||
vdev_get_nparity(vdev_t *vd)
|
||||
{
|
||||
uint64_t nparity = 0;
|
||||
|
||||
if (vd->vdev_ops->vdev_op_nparity != NULL)
|
||||
nparity = vd->vdev_ops->vdev_op_nparity(vd);
|
||||
|
||||
return (nparity);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the number of data disks for a top-level vdev.
|
||||
*/
|
||||
uint64_t
|
||||
vdev_get_ndisks(vdev_t *vd)
|
||||
{
|
||||
uint64_t ndisks = 1;
|
||||
|
||||
if (vd->vdev_ops->vdev_op_ndisks != NULL)
|
||||
ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
|
||||
|
||||
return (ndisks);
|
||||
}
|
||||
|
||||
vdev_t *
|
||||
vdev_lookup_top(spa_t *spa, uint64_t vdev)
|
||||
{
|
||||
@@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||
list_link_init(&vd->vdev_initialize_node);
|
||||
list_link_init(&vd->vdev_leaf_node);
|
||||
list_link_init(&vd->vdev_trim_node);
|
||||
|
||||
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
|
||||
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
@@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
|
||||
cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
for (int t = 0; t < DTL_TYPES; t++) {
|
||||
vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
|
||||
@@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
{
|
||||
vdev_ops_t *ops;
|
||||
char *type;
|
||||
uint64_t guid = 0, islog, nparity;
|
||||
uint64_t guid = 0, islog;
|
||||
vdev_t *vd;
|
||||
vdev_indirect_config_t *vic;
|
||||
char *tmp = NULL;
|
||||
@@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
|
||||
/*
|
||||
* Set the nparity property for RAID-Z vdevs.
|
||||
*/
|
||||
nparity = -1ULL;
|
||||
if (ops == &vdev_raidz_ops) {
|
||||
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
|
||||
&nparity) == 0) {
|
||||
if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
|
||||
return (SET_ERROR(EINVAL));
|
||||
/*
|
||||
* Previous versions could only support 1 or 2 parity
|
||||
* device.
|
||||
*/
|
||||
if (nparity > 1 &&
|
||||
spa_version(spa) < SPA_VERSION_RAIDZ2)
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
if (nparity > 2 &&
|
||||
spa_version(spa) < SPA_VERSION_RAIDZ3)
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
} else {
|
||||
/*
|
||||
* We require the parity to be specified for SPAs that
|
||||
* support multiple parity levels.
|
||||
*/
|
||||
if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
|
||||
return (SET_ERROR(EINVAL));
|
||||
/*
|
||||
* Otherwise, we default to 1 parity device for RAID-Z.
|
||||
*/
|
||||
nparity = 1;
|
||||
}
|
||||
} else {
|
||||
nparity = 0;
|
||||
}
|
||||
ASSERT(nparity != -1ULL);
|
||||
|
||||
/*
|
||||
* If creating a top-level vdev, check for allocation classes input
|
||||
*/
|
||||
if (top_level && alloctype == VDEV_ALLOC_ADD) {
|
||||
char *bias;
|
||||
|
||||
/*
|
||||
* If creating a top-level vdev, check for allocation
|
||||
* classes input.
|
||||
*/
|
||||
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
&bias) == 0) {
|
||||
alloc_bias = vdev_derive_alloc_bias(bias);
|
||||
@@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
}
|
||||
}
|
||||
|
||||
/* spa_vdev_add() expects feature to be enabled */
|
||||
if (ops == &vdev_draid_ops &&
|
||||
spa->spa_load_state != SPA_LOAD_CREATE &&
|
||||
!spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the vdev specific data. This is done before calling
|
||||
* vdev_alloc_common() since it may fail and this simplifies the
|
||||
* error reporting and cleanup code paths.
|
||||
*/
|
||||
void *tsd = NULL;
|
||||
if (ops->vdev_op_init != NULL) {
|
||||
rc = ops->vdev_op_init(spa, nv, &tsd);
|
||||
if (rc != 0) {
|
||||
return (rc);
|
||||
}
|
||||
}
|
||||
|
||||
vd = vdev_alloc_common(spa, id, guid, ops);
|
||||
vic = &vd->vdev_indirect_config;
|
||||
|
||||
vd->vdev_tsd = tsd;
|
||||
vd->vdev_islog = islog;
|
||||
vd->vdev_nparity = nparity;
|
||||
|
||||
if (top_level && alloc_bias != VDEV_BIAS_NONE)
|
||||
vd->vdev_alloc_bias = alloc_bias;
|
||||
|
||||
@@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
&vd->vdev_wholedisk) != 0)
|
||||
vd->vdev_wholedisk = -1ULL;
|
||||
|
||||
vic = &vd->vdev_indirect_config;
|
||||
|
||||
ASSERT0(vic->vic_mapping_object);
|
||||
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
|
||||
&vic->vic_mapping_object);
|
||||
@@ -937,6 +967,9 @@ vdev_free(vdev_t *vd)
|
||||
ASSERT(vd->vdev_child == NULL);
|
||||
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
|
||||
|
||||
if (vd->vdev_ops->vdev_op_fini != NULL)
|
||||
vd->vdev_ops->vdev_op_fini(vd);
|
||||
|
||||
/*
|
||||
* Discard allocation state.
|
||||
*/
|
||||
@@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd)
|
||||
cv_destroy(&vd->vdev_trim_io_cv);
|
||||
|
||||
mutex_destroy(&vd->vdev_rebuild_lock);
|
||||
mutex_destroy(&vd->vdev_rebuild_io_lock);
|
||||
cv_destroy(&vd->vdev_rebuild_cv);
|
||||
cv_destroy(&vd->vdev_rebuild_io_cv);
|
||||
|
||||
zfs_ratelimit_fini(&vd->vdev_delay_rl);
|
||||
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
|
||||
@@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd)
|
||||
}
|
||||
|
||||
/*
|
||||
* Add a mirror/replacing vdev above an existing vdev.
|
||||
* Add a mirror/replacing vdev above an existing vdev. There is no need to
|
||||
* call .vdev_op_init() since mirror/replacing vdevs do not have private state.
|
||||
*/
|
||||
vdev_t *
|
||||
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
|
||||
@@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd)
|
||||
spa->spa_max_ashift = vd->vdev_ashift;
|
||||
if (vd->vdev_ashift < spa->spa_min_ashift)
|
||||
spa->spa_min_ashift = vd->vdev_ashift;
|
||||
|
||||
uint64_t min_alloc = vdev_get_min_alloc(vd);
|
||||
if (min_alloc < spa->spa_min_alloc)
|
||||
spa->spa_min_alloc = min_alloc;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd)
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns B_TRUE if the passed child should be opened.
|
||||
*/
|
||||
static boolean_t
|
||||
vdev_default_open_children_func(vdev_t *vd)
|
||||
{
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Open the requested child vdevs. If any of the leaf vdevs are using
|
||||
* a ZFS volume then do the opens in a single thread. This avoids a
|
||||
* deadlock when the current thread is holding the spa_namespace_lock.
|
||||
*/
|
||||
static void
|
||||
vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
|
||||
{
|
||||
int children = vd->vdev_children;
|
||||
|
||||
taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
|
||||
children, children, TASKQ_PREPOPULATE);
|
||||
vd->vdev_nonrot = B_TRUE;
|
||||
|
||||
for (int c = 0; c < children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
|
||||
if (open_func(cvd) == B_FALSE)
|
||||
continue;
|
||||
|
||||
if (tq == NULL || vdev_uses_zvols(vd)) {
|
||||
cvd->vdev_open_error = vdev_open(cvd);
|
||||
} else {
|
||||
VERIFY(taskq_dispatch(tq, vdev_open_child,
|
||||
cvd, TQ_SLEEP) != TASKQID_INVALID);
|
||||
}
|
||||
|
||||
vd->vdev_nonrot &= cvd->vdev_nonrot;
|
||||
}
|
||||
|
||||
if (tq != NULL) {
|
||||
taskq_wait(tq);
|
||||
taskq_destroy(tq);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Open all child vdevs.
|
||||
*/
|
||||
void
|
||||
vdev_open_children(vdev_t *vd)
|
||||
{
|
||||
taskq_t *tq;
|
||||
int children = vd->vdev_children;
|
||||
vdev_open_children_impl(vd, vdev_default_open_children_func);
|
||||
}
|
||||
|
||||
/*
|
||||
* in order to handle pools on top of zvols, do the opens
|
||||
* in a single thread so that the same thread holds the
|
||||
* spa_namespace_lock
|
||||
*/
|
||||
if (vdev_uses_zvols(vd)) {
|
||||
retry_sync:
|
||||
for (int c = 0; c < children; c++)
|
||||
vd->vdev_child[c]->vdev_open_error =
|
||||
vdev_open(vd->vdev_child[c]);
|
||||
} else {
|
||||
tq = taskq_create("vdev_open", children, minclsyspri,
|
||||
children, children, TASKQ_PREPOPULATE);
|
||||
if (tq == NULL)
|
||||
goto retry_sync;
|
||||
|
||||
for (int c = 0; c < children; c++)
|
||||
VERIFY(taskq_dispatch(tq, vdev_open_child,
|
||||
vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
|
||||
|
||||
taskq_destroy(tq);
|
||||
}
|
||||
|
||||
vd->vdev_nonrot = B_TRUE;
|
||||
|
||||
for (int c = 0; c < children; c++)
|
||||
vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
|
||||
/*
|
||||
* Conditionally open a subset of child vdevs.
|
||||
*/
|
||||
void
|
||||
vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
|
||||
{
|
||||
vdev_open_children_impl(vd, open_func);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1952,6 +2016,16 @@ vdev_open(vdev_t *vd)
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Track the the minimum allocation size.
|
||||
*/
|
||||
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
|
||||
vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
|
||||
uint64_t min_alloc = vdev_get_min_alloc(vd);
|
||||
if (min_alloc < spa->spa_min_alloc)
|
||||
spa->spa_min_alloc = min_alloc;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this is a leaf vdev, assess whether a resilver is needed.
|
||||
* But don't do this if we are doing a reopen for a scrub, since
|
||||
@@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd)
|
||||
vdev_t *pvd = vd->vdev_parent;
|
||||
spa_t *spa __maybe_unused = vd->vdev_spa;
|
||||
|
||||
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
|
||||
ASSERT(vd != NULL);
|
||||
ASSERT(vd->vdev_open_thread == curthread ||
|
||||
spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
|
||||
|
||||
/*
|
||||
* If our parent is reopening, then we are as well, unless we are
|
||||
@@ -2606,10 +2682,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns B_TRUE if vdev determines offset needs to be resilvered.
|
||||
* Check if the txg falls within the range which must be
|
||||
* resilvered. DVAs outside this range can always be skipped.
|
||||
*/
|
||||
boolean_t
|
||||
vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
|
||||
vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
|
||||
uint64_t phys_birth)
|
||||
{
|
||||
/* Set by sequential resilver. */
|
||||
if (phys_birth == TXG_UNKNOWN)
|
||||
return (B_TRUE);
|
||||
|
||||
return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
|
||||
*/
|
||||
boolean_t
|
||||
vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
|
||||
uint64_t phys_birth)
|
||||
{
|
||||
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
|
||||
|
||||
@@ -2617,7 +2709,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
|
||||
vd->vdev_ops->vdev_op_leaf)
|
||||
return (B_TRUE);
|
||||
|
||||
return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
|
||||
return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
|
||||
phys_birth));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2862,8 +2955,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
|
||||
continue; /* leaf vdevs only */
|
||||
if (t == DTL_PARTIAL)
|
||||
minref = 1; /* i.e. non-zero */
|
||||
else if (vd->vdev_nparity != 0)
|
||||
minref = vd->vdev_nparity + 1; /* RAID-Z */
|
||||
else if (vdev_get_nparity(vd) != 0)
|
||||
minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
|
||||
else
|
||||
minref = vd->vdev_children; /* any kind of mirror */
|
||||
space_reftree_create(&reftree);
|
||||
@@ -3727,6 +3820,9 @@ top:
|
||||
if (!vd->vdev_ops->vdev_op_leaf)
|
||||
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
|
||||
|
||||
if (vd->vdev_ops == &vdev_draid_spare_ops)
|
||||
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
|
||||
|
||||
tvd = vd->vdev_top;
|
||||
mg = tvd->vdev_mg;
|
||||
generation = spa->spa_config_generation + 1;
|
||||
@@ -3971,6 +4067,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
|
||||
static void
|
||||
vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
|
||||
{
|
||||
/*
|
||||
* Exclude the dRAID spare when aggregating to avoid double counting
|
||||
* the ops and bytes. These IOs are counted by the physical leaves.
|
||||
*/
|
||||
if (cvd->vdev_ops == &vdev_draid_spare_ops)
|
||||
return;
|
||||
|
||||
for (int t = 0; t < VS_ZIO_TYPES; t++) {
|
||||
vs->vs_ops[t] += cvs->vs_ops[t];
|
||||
vs->vs_bytes[t] += cvs->vs_bytes[t];
|
||||
@@ -4063,7 +4166,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
|
||||
vdev_get_child_stat(cvd, vs, cvs);
|
||||
if (vsx)
|
||||
vdev_get_child_stat_ex(cvd, vsx, cvsx);
|
||||
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
@@ -4248,7 +4350,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
|
||||
|
||||
/*
|
||||
* Repair is the result of a rebuild issued by the
|
||||
* rebuild thread (vdev_rebuild_thread).
|
||||
* rebuild thread (vdev_rebuild_thread). To avoid
|
||||
* double counting repaired bytes the virtual dRAID
|
||||
* spare vdev is excluded from the processed bytes.
|
||||
*/
|
||||
if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
|
||||
vdev_t *tvd = vd->vdev_top;
|
||||
@@ -4256,8 +4360,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
|
||||
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
|
||||
uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf)
|
||||
if (vd->vdev_ops->vdev_op_leaf &&
|
||||
vd->vdev_ops != &vdev_draid_spare_ops) {
|
||||
atomic_add_64(rebuilt, psize);
|
||||
}
|
||||
vs->vs_rebuild_processed += psize;
|
||||
}
|
||||
|
||||
@@ -4981,31 +5087,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
|
||||
vdev_resilver_needed(vd, NULL, NULL));
|
||||
}
|
||||
|
||||
boolean_t
|
||||
vdev_xlate_is_empty(range_seg64_t *rs)
|
||||
{
|
||||
return (rs->rs_start == rs->rs_end);
|
||||
}
|
||||
|
||||
/*
|
||||
* Translate a logical range to the physical range for the specified vdev_t.
|
||||
* This function is initially called with a leaf vdev and will walk each
|
||||
* parent vdev until it reaches a top-level vdev. Once the top-level is
|
||||
* reached the physical range is initialized and the recursive function
|
||||
* begins to unwind. As it unwinds it calls the parent's vdev specific
|
||||
* translation function to do the real conversion.
|
||||
* Translate a logical range to the first contiguous physical range for the
|
||||
* specified vdev_t. This function is initially called with a leaf vdev and
|
||||
* will walk each parent vdev until it reaches a top-level vdev. Once the
|
||||
* top-level is reached the physical range is initialized and the recursive
|
||||
* function begins to unwind. As it unwinds it calls the parent's vdev
|
||||
* specific translation function to do the real conversion.
|
||||
*/
|
||||
void
|
||||
vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
|
||||
range_seg64_t *physical_rs)
|
||||
range_seg64_t *physical_rs, range_seg64_t *remain_rs)
|
||||
{
|
||||
/*
|
||||
* Walk up the vdev tree
|
||||
*/
|
||||
if (vd != vd->vdev_top) {
|
||||
vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
|
||||
vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
|
||||
remain_rs);
|
||||
} else {
|
||||
/*
|
||||
* We've reached the top-level vdev, initialize the
|
||||
* physical range to the logical range and start to
|
||||
* unwind.
|
||||
* We've reached the top-level vdev, initialize the physical
|
||||
* range to the logical range and set an empty remaining
|
||||
* range then start to unwind.
|
||||
*/
|
||||
physical_rs->rs_start = logical_rs->rs_start;
|
||||
physical_rs->rs_end = logical_rs->rs_end;
|
||||
|
||||
remain_rs->rs_start = logical_rs->rs_start;
|
||||
remain_rs->rs_end = logical_rs->rs_start;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -5015,16 +5132,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
|
||||
|
||||
/*
|
||||
* As this recursive function unwinds, translate the logical
|
||||
* range into its physical components by calling the
|
||||
* vdev specific translate function.
|
||||
* range into its physical and any remaining components by calling
|
||||
* the vdev specific translate function.
|
||||
*/
|
||||
range_seg64_t intermediate = { 0 };
|
||||
pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
|
||||
pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
|
||||
|
||||
physical_rs->rs_start = intermediate.rs_start;
|
||||
physical_rs->rs_end = intermediate.rs_end;
|
||||
}
|
||||
|
||||
void
|
||||
vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
|
||||
vdev_xlate_func_t *func, void *arg)
|
||||
{
|
||||
range_seg64_t iter_rs = *logical_rs;
|
||||
range_seg64_t physical_rs;
|
||||
range_seg64_t remain_rs;
|
||||
|
||||
while (!vdev_xlate_is_empty(&iter_rs)) {
|
||||
|
||||
vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
|
||||
|
||||
/*
|
||||
* With raidz and dRAID, it's possible that the logical range
|
||||
* does not live on this leaf vdev. Only when there is a non-
|
||||
* zero physical size call the provided function.
|
||||
*/
|
||||
if (!vdev_xlate_is_empty(&physical_rs))
|
||||
func(arg, &physical_rs);
|
||||
|
||||
iter_rs = remain_rs;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Look at the vdev tree and determine whether any devices are currently being
|
||||
* replaced.
|
||||
|
||||
Reference in New Issue
Block a user