Distributed Spare (dRAID) Feature

This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID.  This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.

A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`.  No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.

    zpool create <pool> draid[1,2,3] <vdevs...>

Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons.  The supported options include:

    zpool create <pool> \
        draid[<parity>][:<data>d][:<children>c][:<spares>s] \
        <vdevs...>

    - draid[parity]       - Parity level (default 1)
    - draid[:<data>d]     - Data devices per group (default 8)
    - draid[:<children>c] - Expected number of child vdevs
    - draid[:<spares>s]   - Distributed hot spares (default 0)

Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.

```
  pool: tank
 state: ONLINE
config:

    NAME                  STATE     READ WRITE CKSUM
    slag7                 ONLINE       0     0     0
      draid2:8d:68c:2s-0  ONLINE       0     0     0
        L0                ONLINE       0     0     0
        L1                ONLINE       0     0     0
        ...
        U25               ONLINE       0     0     0
        U26               ONLINE       0     0     0
        spare-53          ONLINE       0     0     0
          U27             ONLINE       0     0     0
          draid2-0-0      ONLINE       0     0     0
        U28               ONLINE       0     0     0
        U29               ONLINE       0     0     0
        ...
        U42               ONLINE       0     0     0
        U43               ONLINE       0     0     0
    special
      mirror-1            ONLINE       0     0     0
        L5                ONLINE       0     0     0
        U5                ONLINE       0     0     0
      mirror-2            ONLINE       0     0     0
        L6                ONLINE       0     0     0
        U6                ONLINE       0     0     0
    spares
      draid2-0-0          INUSE     currently in use
      draid2-0-1          AVAIL
```

When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command.  These options are leverages
by zloop.sh to test a wide range of dRAID configurations.

    -K draid|raidz|random - kind of RAID to test
    -D <value>            - dRAID data drives per group
    -S <value>            - dRAID distributed hot spares
    -R <value>            - RAID parity (raidz or dRAID)

The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.

Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
This commit is contained in:
Brian Behlendorf
2020-11-13 13:51:51 -08:00
committed by GitHub
parent a724db0374
commit b2255edcc0
153 changed files with 10203 additions and 1882 deletions
+2
View File
@@ -84,6 +84,8 @@ $(MODULE)-objs += uberblock.o
$(MODULE)-objs += unique.o
$(MODULE)-objs += vdev.o
$(MODULE)-objs += vdev_cache.o
$(MODULE)-objs += vdev_draid.o
$(MODULE)-objs += vdev_draid_rand.o
$(MODULE)-objs += vdev_indirect.o
$(MODULE)-objs += vdev_indirect_births.o
$(MODULE)-objs += vdev_indirect_mapping.o
+9 -5
View File
@@ -781,16 +781,17 @@ int
abd_iterate_func(abd_t *abd, size_t off, size_t size,
abd_iter_func_t *func, void *private)
{
int ret = 0;
struct abd_iter aiter;
boolean_t abd_multi;
abd_t *c_abd;
int ret = 0;
if (size == 0)
return (0);
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
abd_multi = abd_is_gang(abd);
c_abd = abd_init_abd_iter(abd, &aiter, off);
boolean_t abd_multi = abd_is_gang(abd);
abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
while (size > 0) {
/* If we are at the end of the gang ABD we are done */
@@ -920,6 +921,9 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
boolean_t dabd_is_gang_abd, sabd_is_gang_abd;
abd_t *c_dabd, *c_sabd;
if (size == 0)
return (0);
abd_verify(dabd);
abd_verify(sabd);
+2 -9
View File
@@ -713,7 +713,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
return (0);
}
static void
void
dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
{
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
@@ -3327,20 +3327,13 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
return (B_TRUE);
}
/*
* Check if the txg falls within the range which must be
* resilvered. DVAs outside this range can always be skipped.
*/
if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
return (B_FALSE);
/*
* Check if the top-level vdev must resilver this offset.
* When the offset does not intersect with a dirty leaf DTL
* then it may be possible to skip the resilver IO. The psize
* is provided instead of asize to simplify the check for RAIDZ.
*/
if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth))
return (B_FALSE);
/*
+7 -1
View File
@@ -32,6 +32,7 @@
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
@@ -1563,6 +1564,7 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
defined(WITH_CF_BLOCK_ALLOCATOR)
/*
* This is a helper function that can be used by the allocator to find a
* suitable block to allocate. This will search the specified B-tree looking
@@ -1654,6 +1656,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
range_seg_t *rs;
if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
metaslab_size_tree_full_load(msp->ms_allocatable);
if (metaslab_df_use_largest_segment) {
/* use largest free segment */
rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
@@ -2616,6 +2619,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
ms->ms_allocator = -1;
ms->ms_new = B_TRUE;
vdev_ops_t *ops = vd->vdev_ops;
if (ops->vdev_op_metaslab_init != NULL)
ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
/*
* We only open space map objects that already exist. All others
* will be opened when we finally allocate an object for it.
@@ -5813,7 +5820,6 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
metaslab_group_alloc_increment(spa,
DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
}
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
+10 -1
View File
@@ -307,8 +307,17 @@ mmp_next_leaf(spa_t *spa)
if (leaf == NULL)
leaf = list_head(&spa->spa_leaf_list);
if (!vdev_writeable(leaf)) {
/*
* We skip unwritable, offline, detached, and dRAID spare
* devices as they are either not legal targets or the write
* may fail or not be seen by other hosts. Skipped dRAID
* spares can never be written so the fail mask is not set.
*/
if (!vdev_writeable(leaf) || leaf->vdev_offline ||
leaf->vdev_detached) {
fail_mask |= MMP_FAIL_NOT_WRITABLE;
} else if (leaf->vdev_ops == &vdev_draid_spare_ops) {
continue;
} else if (leaf->vdev_mmp_pending != 0) {
fail_mask |= MMP_FAIL_WRITE_PENDING;
} else {
+94 -32
View File
@@ -60,6 +60,7 @@
#include <sys/vdev_rebuild.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_draid.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/mmp.h>
@@ -3681,7 +3682,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
/*
* Build a new vdev tree from the trusted config
*/
VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
if (error != 0) {
nvlist_free(mos_config);
spa_config_exit(spa, SCL_ALL, FTAG);
spa_load_failed(spa, "spa_config_parse failed [error=%d]",
error);
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
}
/*
* Vdev paths in the MOS may be obsolete. If the untrusted config was
@@ -5631,7 +5639,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
uint64_t version, obj;
uint64_t version, obj, ndraid = 0;
boolean_t has_features;
boolean_t has_encryption;
boolean_t has_allocclass;
@@ -5753,8 +5761,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
if (error == 0 &&
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
(error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
/*
* instantiate the metaslab groups (this will dirty the vdevs)
* we can no longer error exit past this point
@@ -5895,6 +5903,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa_sync_props(props, tx);
}
for (int i = 0; i < ndraid; i++)
spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
dmu_tx_commit(tx);
spa->spa_sync_on = B_TRUE;
@@ -6403,13 +6414,26 @@ spa_reset(const char *pool)
* ==========================================================================
*/
/*
* This is called as a synctask to increment the draid feature flag
*/
static void
spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
int draid = (int)(uintptr_t)arg;
for (int c = 0; c < draid; c++)
spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
}
/*
* Add a device to a storage pool.
*/
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
uint64_t txg;
uint64_t txg, ndraid = 0;
int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd;
@@ -6438,8 +6462,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
return (spa_vdev_exit(spa, vd, txg, EINVAL));
if (vd->vdev_children != 0 &&
(error = vdev_create(vd, txg, B_FALSE)) != 0)
(error = vdev_create(vd, txg, B_FALSE)) != 0) {
return (spa_vdev_exit(spa, vd, txg, error));
}
/*
* The virtual dRAID spares must be added after vdev tree is created
* and the vdev guids are generated. The guid of their assoicated
* dRAID is stored in the config and used when opening the spare.
*/
if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
rvd->vdev_children)) == 0) {
if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
nspares = 0;
} else {
return (spa_vdev_exit(spa, vd, txg, error));
}
/*
* We must validate the spares and l2cache devices after checking the
@@ -6452,7 +6491,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* If we are in the middle of a device removal, we can only add
* devices which match the existing devices in the pool.
* If we are in the middle of a removal, or have some indirect
* vdevs, we can not add raidz toplevels.
* vdevs, we can not add raidz or dRAID top levels.
*/
if (spa->spa_vdev_removal != NULL ||
spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
@@ -6462,10 +6501,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
tvd->vdev_ashift != spa->spa_max_ashift) {
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
/* Fail if top level vdev is raidz */
if (tvd->vdev_ops == &vdev_raidz_ops) {
/* Fail if top level vdev is raidz or a dRAID */
if (vdev_get_nparity(tvd) != 0)
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
/*
* Need the top level mirror to be
* a mirror of leaf vdevs only
@@ -6505,6 +6544,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
spa->spa_l2cache.sav_sync = B_TRUE;
}
/*
* We can't increment a feature while holding spa_vdev so we
* have to do it in a synctask.
*/
if (ndraid != 0) {
dmu_tx_t *tx;
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
(void *)(uintptr_t)ndraid, tx);
dmu_tx_commit(tx);
}
/*
* We have to be careful when adding new vdevs to an existing pool.
* If other threads start allocating from these vdevs before we
@@ -6615,14 +6667,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
/*
* A dRAID spare can only replace a child of its parent dRAID vdev.
*/
if (newvd->vdev_ops == &vdev_draid_spare_ops &&
oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
}
if (rebuild) {
/*
* For rebuilds, the parent vdev must support reconstruction
* For rebuilds, the top vdev must support reconstruction
* using only space maps. This means the only allowable
* parents are the root vdev or a mirror vdev.
* vdevs types are the root vdev, a mirror, or dRAID.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
pvd->vdev_ops != &vdev_root_ops) {
tvd = pvd;
if (pvd->vdev_top != NULL)
tvd = pvd->vdev_top;
if (tvd->vdev_ops != &vdev_mirror_ops &&
tvd->vdev_ops != &vdev_root_ops &&
tvd->vdev_ops != &vdev_draid_ops) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
}
}
@@ -6915,14 +6980,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
}
/*
* If we are detaching the original disk from a spare, then it implies
* that the spare should become a real disk, and be removed from the
* active spare list for the pool.
* If we are detaching the original disk from a normal spare, then it
* implies that the spare should become a real disk, and be removed
* from the active spare list for the pool. dRAID spares on the
* other hand are coupled to the pool and thus should never be removed
* from the spares list.
*/
if (pvd->vdev_ops == &vdev_spare_ops &&
vd->vdev_id == 0 &&
pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
unspare = B_TRUE;
if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
if (last_cvd->vdev_isspare &&
last_cvd->vdev_ops != &vdev_draid_spare_ops) {
unspare = B_TRUE;
}
}
/*
* Erase the disk labels so the disk can be used for other things.
@@ -8013,18 +8084,9 @@ spa_async_thread(void *arg)
/*
* If any devices are done replacing, detach them.
*/
if (tasks & SPA_ASYNC_RESILVER_DONE)
if (tasks & SPA_ASYNC_RESILVER_DONE ||
tasks & SPA_ASYNC_REBUILD_DONE) {
spa_vdev_resilver_done(spa);
/*
* If any devices are done replacing, detach them. Then if no
* top-level vdevs are rebuilding attempt to kick off a scrub.
*/
if (tasks & SPA_ASYNC_REBUILD_DONE) {
spa_vdev_resilver_done(spa);
if (!vdev_rebuild_active(spa->spa_root_vdev))
(void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB);
}
/*
+1
View File
@@ -741,6 +741,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_min_ashift = INT_MAX;
spa->spa_max_ashift = 0;
spa->spa_min_alloc = INT_MAX;
/* Reset cached value */
spa->spa_dedup_dspace = ~0ULL;
+253 -112
View File
@@ -40,6 +40,7 @@
#include <sys/dsl_dir.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_rebuild.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -51,6 +52,7 @@
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_raidz.h>
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
@@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
static vdev_ops_t *vdev_ops_table[] = {
&vdev_root_ops,
&vdev_raidz_ops,
&vdev_draid_ops,
&vdev_draid_spare_ops,
&vdev_mirror_ops,
&vdev_replacing_ops,
&vdev_spare_ops,
@@ -221,10 +225,11 @@ vdev_getops(const char *type)
/* ARGSUSED */
void
vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
res->rs_start = in->rs_start;
res->rs_end = in->rs_end;
physical_rs->rs_start = logical_rs->rs_start;
physical_rs->rs_end = logical_rs->rs_end;
}
/*
@@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
return (asize);
}
uint64_t
vdev_default_min_asize(vdev_t *vd)
{
return (vd->vdev_min_asize);
}
/*
* Get the minimum allocatable size. We define the allocatable size as
* the vdev's asize rounded to the nearest metaslab. This allows us to
@@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd)
if (vd == vd->vdev_top)
return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
/*
* The allocatable space for a raidz vdev is N * sizeof(smallest child),
* so each child must provide at least 1/Nth of its asize.
*/
if (pvd->vdev_ops == &vdev_raidz_ops)
return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
pvd->vdev_children);
return (pvd->vdev_min_asize);
return (pvd->vdev_ops->vdev_op_min_asize(pvd));
}
void
@@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd)
vdev_set_min_asize(vd->vdev_child[c]);
}
/*
* Get the minimal allocation size for the top-level vdev.
*/
uint64_t
vdev_get_min_alloc(vdev_t *vd)
{
uint64_t min_alloc = 1ULL << vd->vdev_ashift;
if (vd->vdev_ops->vdev_op_min_alloc != NULL)
min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
return (min_alloc);
}
/*
* Get the parity level for a top-level vdev.
*/
uint64_t
vdev_get_nparity(vdev_t *vd)
{
uint64_t nparity = 0;
if (vd->vdev_ops->vdev_op_nparity != NULL)
nparity = vd->vdev_ops->vdev_op_nparity(vd);
return (nparity);
}
/*
* Get the number of data disks for a top-level vdev.
*/
uint64_t
vdev_get_ndisks(vdev_t *vd)
{
uint64_t ndisks = 1;
if (vd->vdev_ops->vdev_op_ndisks != NULL)
ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
return (ndisks);
}
vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
@@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
list_link_init(&vd->vdev_initialize_node);
list_link_init(&vd->vdev_leaf_node);
list_link_init(&vd->vdev_trim_node);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
@@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
{
vdev_ops_t *ops;
char *type;
uint64_t guid = 0, islog, nparity;
uint64_t guid = 0, islog;
vdev_t *vd;
vdev_indirect_config_t *vic;
char *tmp = NULL;
@@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
return (SET_ERROR(ENOTSUP));
/*
* Set the nparity property for RAID-Z vdevs.
*/
nparity = -1ULL;
if (ops == &vdev_raidz_ops) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
return (SET_ERROR(EINVAL));
/*
* Previous versions could only support 1 or 2 parity
* device.
*/
if (nparity > 1 &&
spa_version(spa) < SPA_VERSION_RAIDZ2)
return (SET_ERROR(ENOTSUP));
if (nparity > 2 &&
spa_version(spa) < SPA_VERSION_RAIDZ3)
return (SET_ERROR(ENOTSUP));
} else {
/*
* We require the parity to be specified for SPAs that
* support multiple parity levels.
*/
if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
return (SET_ERROR(EINVAL));
/*
* Otherwise, we default to 1 parity device for RAID-Z.
*/
nparity = 1;
}
} else {
nparity = 0;
}
ASSERT(nparity != -1ULL);
/*
* If creating a top-level vdev, check for allocation classes input
*/
if (top_level && alloctype == VDEV_ALLOC_ADD) {
char *bias;
/*
* If creating a top-level vdev, check for allocation
* classes input.
*/
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
&bias) == 0) {
alloc_bias = vdev_derive_alloc_bias(bias);
@@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
return (SET_ERROR(ENOTSUP));
}
}
/* spa_vdev_add() expects feature to be enabled */
if (ops == &vdev_draid_ops &&
spa->spa_load_state != SPA_LOAD_CREATE &&
!spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
return (SET_ERROR(ENOTSUP));
}
}
/*
* Initialize the vdev specific data. This is done before calling
* vdev_alloc_common() since it may fail and this simplifies the
* error reporting and cleanup code paths.
*/
void *tsd = NULL;
if (ops->vdev_op_init != NULL) {
rc = ops->vdev_op_init(spa, nv, &tsd);
if (rc != 0) {
return (rc);
}
}
vd = vdev_alloc_common(spa, id, guid, ops);
vic = &vd->vdev_indirect_config;
vd->vdev_tsd = tsd;
vd->vdev_islog = islog;
vd->vdev_nparity = nparity;
if (top_level && alloc_bias != VDEV_BIAS_NONE)
vd->vdev_alloc_bias = alloc_bias;
@@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_wholedisk) != 0)
vd->vdev_wholedisk = -1ULL;
vic = &vd->vdev_indirect_config;
ASSERT0(vic->vic_mapping_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
&vic->vic_mapping_object);
@@ -937,6 +967,9 @@ vdev_free(vdev_t *vd)
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
if (vd->vdev_ops->vdev_op_fini != NULL)
vd->vdev_ops->vdev_op_fini(vd);
/*
* Discard allocation state.
*/
@@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd)
cv_destroy(&vd->vdev_trim_io_cv);
mutex_destroy(&vd->vdev_rebuild_lock);
mutex_destroy(&vd->vdev_rebuild_io_lock);
cv_destroy(&vd->vdev_rebuild_cv);
cv_destroy(&vd->vdev_rebuild_io_cv);
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd)
}
/*
* Add a mirror/replacing vdev above an existing vdev.
* Add a mirror/replacing vdev above an existing vdev. There is no need to
* call .vdev_op_init() since mirror/replacing vdevs do not have private state.
*/
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
@@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd)
spa->spa_max_ashift = vd->vdev_ashift;
if (vd->vdev_ashift < spa->spa_min_ashift)
spa->spa_min_ashift = vd->vdev_ashift;
uint64_t min_alloc = vdev_get_min_alloc(vd);
if (min_alloc < spa->spa_min_alloc)
spa->spa_min_alloc = min_alloc;
}
}
}
@@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd)
return (B_FALSE);
}
/*
* Returns B_TRUE if the passed child should be opened.
*/
static boolean_t
vdev_default_open_children_func(vdev_t *vd)
{
return (B_TRUE);
}
/*
* Open the requested child vdevs. If any of the leaf vdevs are using
* a ZFS volume then do the opens in a single thread. This avoids a
* deadlock when the current thread is holding the spa_namespace_lock.
*/
static void
vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
{
int children = vd->vdev_children;
taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
children, children, TASKQ_PREPOPULATE);
vd->vdev_nonrot = B_TRUE;
for (int c = 0; c < children; c++) {
vdev_t *cvd = vd->vdev_child[c];
if (open_func(cvd) == B_FALSE)
continue;
if (tq == NULL || vdev_uses_zvols(vd)) {
cvd->vdev_open_error = vdev_open(cvd);
} else {
VERIFY(taskq_dispatch(tq, vdev_open_child,
cvd, TQ_SLEEP) != TASKQID_INVALID);
}
vd->vdev_nonrot &= cvd->vdev_nonrot;
}
if (tq != NULL) {
taskq_wait(tq);
taskq_destroy(tq);
}
}
/*
* Open all child vdevs.
*/
void
vdev_open_children(vdev_t *vd)
{
taskq_t *tq;
int children = vd->vdev_children;
vdev_open_children_impl(vd, vdev_default_open_children_func);
}
/*
* in order to handle pools on top of zvols, do the opens
* in a single thread so that the same thread holds the
* spa_namespace_lock
*/
if (vdev_uses_zvols(vd)) {
retry_sync:
for (int c = 0; c < children; c++)
vd->vdev_child[c]->vdev_open_error =
vdev_open(vd->vdev_child[c]);
} else {
tq = taskq_create("vdev_open", children, minclsyspri,
children, children, TASKQ_PREPOPULATE);
if (tq == NULL)
goto retry_sync;
for (int c = 0; c < children; c++)
VERIFY(taskq_dispatch(tq, vdev_open_child,
vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
taskq_destroy(tq);
}
vd->vdev_nonrot = B_TRUE;
for (int c = 0; c < children; c++)
vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
/*
* Conditionally open a subset of child vdevs.
*/
void
vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
{
vdev_open_children_impl(vd, open_func);
}
/*
@@ -1952,6 +2016,16 @@ vdev_open(vdev_t *vd)
return (error);
}
/*
* Track the the minimum allocation size.
*/
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
uint64_t min_alloc = vdev_get_min_alloc(vd);
if (min_alloc < spa->spa_min_alloc)
spa->spa_min_alloc = min_alloc;
}
/*
* If this is a leaf vdev, assess whether a resilver is needed.
* But don't do this if we are doing a reopen for a scrub, since
@@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd)
vdev_t *pvd = vd->vdev_parent;
spa_t *spa __maybe_unused = vd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
ASSERT(vd != NULL);
ASSERT(vd->vdev_open_thread == curthread ||
spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
/*
* If our parent is reopening, then we are as well, unless we are
@@ -2606,10 +2682,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
}
/*
* Returns B_TRUE if vdev determines offset needs to be resilvered.
* Check if the txg falls within the range which must be
* resilvered. DVAs outside this range can always be skipped.
*/
boolean_t
vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
/* Set by sequential resilver. */
if (phys_birth == TXG_UNKNOWN)
return (B_TRUE);
return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
}
/*
* Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
*/
boolean_t
vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
@@ -2617,7 +2709,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
vd->vdev_ops->vdev_op_leaf)
return (B_TRUE);
return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
phys_birth));
}
/*
@@ -2862,8 +2955,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
continue; /* leaf vdevs only */
if (t == DTL_PARTIAL)
minref = 1; /* i.e. non-zero */
else if (vd->vdev_nparity != 0)
minref = vd->vdev_nparity + 1; /* RAID-Z */
else if (vdev_get_nparity(vd) != 0)
minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
else
minref = vd->vdev_children; /* any kind of mirror */
space_reftree_create(&reftree);
@@ -3727,6 +3820,9 @@ top:
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
if (vd->vdev_ops == &vdev_draid_spare_ops)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
tvd = vd->vdev_top;
mg = tvd->vdev_mg;
generation = spa->spa_config_generation + 1;
@@ -3971,6 +4067,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
static void
vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
{
/*
* Exclude the dRAID spare when aggregating to avoid double counting
* the ops and bytes. These IOs are counted by the physical leaves.
*/
if (cvd->vdev_ops == &vdev_draid_spare_ops)
return;
for (int t = 0; t < VS_ZIO_TYPES; t++) {
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
@@ -4063,7 +4166,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vdev_get_child_stat(cvd, vs, cvs);
if (vsx)
vdev_get_child_stat_ex(cvd, vsx, cvsx);
}
} else {
/*
@@ -4248,7 +4350,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
/*
* Repair is the result of a rebuild issued by the
* rebuild thread (vdev_rebuild_thread).
* rebuild thread (vdev_rebuild_thread). To avoid
* double counting repaired bytes the virtual dRAID
* spare vdev is excluded from the processed bytes.
*/
if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
vdev_t *tvd = vd->vdev_top;
@@ -4256,8 +4360,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
if (vd->vdev_ops->vdev_op_leaf)
if (vd->vdev_ops->vdev_op_leaf &&
vd->vdev_ops != &vdev_draid_spare_ops) {
atomic_add_64(rebuilt, psize);
}
vs->vs_rebuild_processed += psize;
}
@@ -4981,31 +5087,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
vdev_resilver_needed(vd, NULL, NULL));
}
boolean_t
vdev_xlate_is_empty(range_seg64_t *rs)
{
return (rs->rs_start == rs->rs_end);
}
/*
* Translate a logical range to the physical range for the specified vdev_t.
* This function is initially called with a leaf vdev and will walk each
* parent vdev until it reaches a top-level vdev. Once the top-level is
* reached the physical range is initialized and the recursive function
* begins to unwind. As it unwinds it calls the parent's vdev specific
* translation function to do the real conversion.
* Translate a logical range to the first contiguous physical range for the
* specified vdev_t. This function is initially called with a leaf vdev and
* will walk each parent vdev until it reaches a top-level vdev. Once the
* top-level is reached the physical range is initialized and the recursive
* function begins to unwind. As it unwinds it calls the parent's vdev
* specific translation function to do the real conversion.
*/
void
vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
range_seg64_t *physical_rs)
range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
/*
* Walk up the vdev tree
*/
if (vd != vd->vdev_top) {
vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
remain_rs);
} else {
/*
* We've reached the top-level vdev, initialize the
* physical range to the logical range and start to
* unwind.
* We've reached the top-level vdev, initialize the physical
* range to the logical range and set an empty remaining
* range then start to unwind.
*/
physical_rs->rs_start = logical_rs->rs_start;
physical_rs->rs_end = logical_rs->rs_end;
remain_rs->rs_start = logical_rs->rs_start;
remain_rs->rs_end = logical_rs->rs_start;
return;
}
@@ -5015,16 +5132,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
/*
* As this recursive function unwinds, translate the logical
* range into its physical components by calling the
* vdev specific translate function.
* range into its physical and any remaining components by calling
* the vdev specific translate function.
*/
range_seg64_t intermediate = { 0 };
pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
physical_rs->rs_start = intermediate.rs_start;
physical_rs->rs_end = intermediate.rs_end;
}
void
vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
vdev_xlate_func_t *func, void *arg)
{
range_seg64_t iter_rs = *logical_rs;
range_seg64_t physical_rs;
range_seg64_t remain_rs;
while (!vdev_xlate_is_empty(&iter_rs)) {
vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
/*
* With raidz and dRAID, it's possible that the logical range
* does not live on this leaf vdev. Only when there is a non-
* zero physical size call the provided function.
*/
if (!vdev_xlate_is_empty(&physical_rs))
func(arg, &physical_rs);
iter_rs = remain_rs;
}
}
/*
* Look at the vdev tree and determine whether any devices are currently being
* replaced.
File diff suppressed because it is too large Load Diff
+40
View File
@@ -0,0 +1,40 @@
/*
* Xorshift Pseudo Random Number Generator based on work by David Blackman
* and Sebastiano Vigna (vigna@acm.org).
*
* "Further scramblings of Marsaglia's xorshift generators"
* http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
* http://prng.di.unimi.it/xoroshiro128plusplus.c
*
* To the extent possible under law, the author has dedicated all copyright
* and related and neighboring rights to this software to the public domain
* worldwide. This software is distributed without any warranty.
*
* See <http://creativecommons.org/publicdomain/zero/1.0/>.
*
* This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid,
* small-state generators. It is extremely (sub-ns) fast and it passes all
* tests we are aware of, but its state space is large enough only for
* mild parallelism.
*/
#include <sys/vdev_draid.h>
static inline uint64_t rotl(const uint64_t x, int k)
{
return (x << k) | (x >> (64 - k));
}
uint64_t
vdev_draid_rand(uint64_t *s)
{
const uint64_t s0 = s[0];
uint64_t s1 = s[1];
const uint64_t result = rotl(s0 + s1, 17) + s0;
s1 ^= s0;
s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b
s[1] = rotl(s1, 28); // c
return (result);
}
+9
View File
@@ -1844,9 +1844,13 @@ vdev_indirect_io_done(zio_t *zio)
}
vdev_ops_t vdev_indirect_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_indirect_open,
.vdev_op_close = vdev_indirect_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_indirect_io_start,
.vdev_op_io_done = vdev_indirect_io_done,
.vdev_op_state_change = NULL,
@@ -1855,6 +1859,11 @@ vdev_ops_t vdev_indirect_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = vdev_indirect_remap,
.vdev_op_xlate = NULL,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* leaf vdev */
};
+80 -63
View File
@@ -121,6 +121,8 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
vd->vdev_initialize_action_time = gethrestime_sec();
}
vdev_initializing_state_t old_state = vd->vdev_initialize_state;
vd->vdev_initialize_state = new_state;
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
@@ -138,8 +140,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
"vdev=%s suspended", vd->vdev_path);
break;
case VDEV_INITIALIZE_CANCELED:
spa_history_log_internal(spa, "initialize", tx,
"vdev=%s canceled", vd->vdev_path);
if (old_state == VDEV_INITIALIZE_ACTIVE ||
old_state == VDEV_INITIALIZE_SUSPENDED)
spa_history_log_internal(spa, "initialize", tx,
"vdev=%s canceled", vd->vdev_path);
break;
case VDEV_INITIALIZE_COMPLETE:
spa_history_log_internal(spa, "initialize", tx,
@@ -317,6 +321,32 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
return (0);
}
static void
vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
{
uint64_t *last_rs_end = (uint64_t *)arg;
if (physical_rs->rs_end > *last_rs_end)
*last_rs_end = physical_rs->rs_end;
}
static void
vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
{
vdev_t *vd = (vdev_t *)arg;
uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
vd->vdev_initialize_bytes_est += size;
if (vd->vdev_initialize_last_offset > physical_rs->rs_end) {
vd->vdev_initialize_bytes_done += size;
} else if (vd->vdev_initialize_last_offset > physical_rs->rs_start &&
vd->vdev_initialize_last_offset < physical_rs->rs_end) {
vd->vdev_initialize_bytes_done +=
vd->vdev_initialize_last_offset - physical_rs->rs_start;
}
}
static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
@@ -331,28 +361,35 @@ vdev_initialize_calculate_progress(vdev_t *vd)
metaslab_t *msp = vd->vdev_top->vdev_ms[i];
mutex_enter(&msp->ms_lock);
uint64_t ms_free = msp->ms_size -
metaslab_allocated_space(msp);
if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
ms_free /= vd->vdev_top->vdev_children;
uint64_t ms_free = (msp->ms_size -
metaslab_allocated_space(msp)) /
vdev_get_ndisks(vd->vdev_top);
/*
* Convert the metaslab range to a physical range
* on our vdev. We use this to determine if we are
* in the middle of this metaslab range.
*/
range_seg64_t logical_rs, physical_rs;
range_seg64_t logical_rs, physical_rs, remain_rs;
logical_rs.rs_start = msp->ms_start;
logical_rs.rs_end = msp->ms_start + msp->ms_size;
vdev_xlate(vd, &logical_rs, &physical_rs);
/* Metaslab space after this offset has not been initialized */
vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
vd->vdev_initialize_bytes_est += ms_free;
mutex_exit(&msp->ms_lock);
continue;
} else if (vd->vdev_initialize_last_offset >
physical_rs.rs_end) {
}
/* Metaslab space before this offset has been initialized */
uint64_t last_rs_end = physical_rs.rs_end;
if (!vdev_xlate_is_empty(&remain_rs)) {
vdev_xlate_walk(vd, &remain_rs,
vdev_initialize_xlate_last_rs_end, &last_rs_end);
}
if (vd->vdev_initialize_last_offset > last_rs_end) {
vd->vdev_initialize_bytes_done += ms_free;
vd->vdev_initialize_bytes_est += ms_free;
mutex_exit(&msp->ms_lock);
@@ -374,22 +411,9 @@ vdev_initialize_calculate_progress(vdev_t *vd)
&where)) {
logical_rs.rs_start = rs_get_start(rs, rt);
logical_rs.rs_end = rs_get_end(rs, rt);
vdev_xlate(vd, &logical_rs, &physical_rs);
uint64_t size = physical_rs.rs_end -
physical_rs.rs_start;
vd->vdev_initialize_bytes_est += size;
if (vd->vdev_initialize_last_offset >
physical_rs.rs_end) {
vd->vdev_initialize_bytes_done += size;
} else if (vd->vdev_initialize_last_offset >
physical_rs.rs_start &&
vd->vdev_initialize_last_offset <
physical_rs.rs_end) {
vd->vdev_initialize_bytes_done +=
vd->vdev_initialize_last_offset -
physical_rs.rs_start;
}
vdev_xlate_walk(vd, &logical_rs,
vdev_initialize_xlate_progress, vd);
}
mutex_exit(&msp->ms_lock);
}
@@ -419,6 +443,34 @@ vdev_initialize_load(vdev_t *vd)
return (err);
}
static void
vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
{
vdev_t *vd = arg;
/* Only add segments that we have not visited yet */
if (physical_rs->rs_end <= vd->vdev_initialize_last_offset)
return;
/* Pick up where we left off mid-range. */
if (vd->vdev_initialize_last_offset > physical_rs->rs_start) {
zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
"(%llu, %llu)", vd->vdev_path,
(u_longlong_t)physical_rs->rs_start,
(u_longlong_t)physical_rs->rs_end,
(u_longlong_t)vd->vdev_initialize_last_offset,
(u_longlong_t)physical_rs->rs_end);
ASSERT3U(physical_rs->rs_end, >,
vd->vdev_initialize_last_offset);
physical_rs->rs_start = vd->vdev_initialize_last_offset;
}
ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
physical_rs->rs_end - physical_rs->rs_start);
}
/*
* Convert the logical range into a physical range and add it to our
* avl tree.
@@ -427,47 +479,12 @@ static void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
vdev_t *vd = arg;
range_seg64_t logical_rs, physical_rs;
range_seg64_t logical_rs;
logical_rs.rs_start = start;
logical_rs.rs_end = start + size;
ASSERT(vd->vdev_ops->vdev_op_leaf);
vdev_xlate(vd, &logical_rs, &physical_rs);
IMPLY(vd->vdev_top == vd,
logical_rs.rs_start == physical_rs.rs_start);
IMPLY(vd->vdev_top == vd,
logical_rs.rs_end == physical_rs.rs_end);
/* Only add segments that we have not visited yet */
if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
return;
/* Pick up where we left off mid-range. */
if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
"(%llu, %llu)", vd->vdev_path,
(u_longlong_t)physical_rs.rs_start,
(u_longlong_t)physical_rs.rs_end,
(u_longlong_t)vd->vdev_initialize_last_offset,
(u_longlong_t)physical_rs.rs_end);
ASSERT3U(physical_rs.rs_end, >,
vd->vdev_initialize_last_offset);
physical_rs.rs_start = vd->vdev_initialize_last_offset;
}
ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
/*
* With raidz, it's possible that the logical range does not live on
* this leaf vdev. We only add the physical range to this vdev's if it
* has a length greater than 0.
*/
if (physical_rs.rs_end > physical_rs.rs_start) {
range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
physical_rs.rs_end - physical_rs.rs_start);
} else {
ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
}
vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
}
static void
+39 -23
View File
@@ -142,6 +142,7 @@
#include <sys/zap.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -453,31 +454,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_fru != NULL)
fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
if (vd->vdev_nparity != 0) {
ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
VDEV_TYPE_RAIDZ) == 0);
if (vd->vdev_ops->vdev_op_config_generate != NULL)
vd->vdev_ops->vdev_op_config_generate(vd, nv);
/*
* Make sure someone hasn't managed to sneak a fancy new vdev
* into a crufty old storage pool.
*/
ASSERT(vd->vdev_nparity == 1 ||
(vd->vdev_nparity <= 2 &&
spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
(vd->vdev_nparity <= 3 &&
spa_version(spa) >= SPA_VERSION_RAIDZ3));
/*
* Note that we'll add the nparity tag even on storage pools
* that only support a single parity device -- older software
* will just ignore it.
*/
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
}
if (vd->vdev_wholedisk != -1ULL)
if (vd->vdev_wholedisk != -1ULL) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
vd->vdev_wholedisk);
}
if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
@@ -785,6 +768,14 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
if (!vdev_readable(vd))
return (NULL);
/*
* The label for a dRAID distributed spare is not stored on disk.
* Instead it is generated when needed which allows us to bypass
* the pipeline when reading the config from the label.
*/
if (vd->vdev_ops == &vdev_draid_spare_ops)
return (vdev_draid_read_config_spare(vd));
vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
vp = abd_to_buf(vp_abd);
@@ -1497,7 +1488,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
for (int c = 0; c < vd->vdev_children; c++)
vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) &&
vd->vdev_ops != &vdev_draid_spare_ops) {
for (int l = 0; l < VDEV_LABELS; l++) {
for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_read(zio, vd, l,
@@ -1586,6 +1578,13 @@ vdev_copy_uberblocks(vdev_t *vd)
SCL_STATE);
ASSERT(vd->vdev_ops->vdev_op_leaf);
/*
* No uberblocks are stored on distributed spares, they may be
* safely skipped when expanding a leaf vdev.
*/
if (vd->vdev_ops == &vdev_draid_spare_ops)
return;
spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER);
ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
@@ -1647,6 +1646,15 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
if (!vdev_writeable(vd))
return;
/*
* There's no need to write uberblocks to a distributed spare, they
* are already stored on all the leaves of the parent dRAID. For
* this same reason vdev_uberblock_load_impl() skips distributed
* spares when reading uberblocks.
*/
if (vd->vdev_ops == &vdev_draid_spare_ops)
return;
/* If the vdev was expanded, need to copy uberblock rings. */
if (vd->vdev_state == VDEV_STATE_HEALTHY &&
vd->vdev_copy_uberblocks == B_TRUE) {
@@ -1763,6 +1771,14 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes,
if (!vdev_writeable(vd))
return;
/*
* The top-level config never needs to be written to a distributed
* spare. When read vdev_dspare_label_read_config() will generate
* the config for the vdev_label_read_config().
*/
if (vd->vdev_ops == &vdev_draid_spare_ops)
return;
/*
* Generate a label describing the top-level config to which we belong.
*/
+123 -14
View File
@@ -33,6 +33,7 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
@@ -99,7 +100,6 @@ vdev_mirror_stat_fini(void)
/*
* Virtual device vector for mirroring.
*/
typedef struct mirror_child {
vdev_t *mc_vd;
uint64_t mc_offset;
@@ -108,6 +108,7 @@ typedef struct mirror_child {
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
uint8_t mc_rebuilding;
} mirror_child_t;
typedef struct mirror_map {
@@ -115,6 +116,7 @@ typedef struct mirror_map {
int mm_preferred_cnt;
int mm_children;
boolean_t mm_resilvering;
boolean_t mm_rebuilding;
boolean_t mm_root;
mirror_child_t mm_child[];
} mirror_map_t;
@@ -239,6 +241,21 @@ vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
return (load + zfs_vdev_mirror_rotating_seek_inc);
}
static boolean_t
vdev_mirror_rebuilding(vdev_t *vd)
{
if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
return (B_TRUE);
for (int i = 0; i < vd->vdev_children; i++) {
if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
return (B_TRUE);
}
}
return (B_FALSE);
}
/*
* Avoid inlining the function to keep vdev_mirror_io_start(), which
* is this functions only caller, as small as possible on the stack.
@@ -356,6 +373,9 @@ vdev_mirror_map_init(zio_t *zio)
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
mc->mc_offset = zio->io_offset;
if (vdev_mirror_rebuilding(mc->mc_vd))
mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
}
}
@@ -493,12 +513,37 @@ vdev_mirror_preferred_child_randomize(zio_t *zio)
return (mm->mm_preferred[p]);
}
static boolean_t
vdev_mirror_child_readable(mirror_child_t *mc)
{
vdev_t *vd = mc->mc_vd;
if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
return (vdev_draid_readable(vd, mc->mc_offset));
else
return (vdev_readable(vd));
}
static boolean_t
vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
{
vdev_t *vd = mc->mc_vd;
if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
else
return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
}
/*
* Try to find a vdev whose DTL doesn't contain the block we want to read
* preferring vdevs based on determined load.
* preferring vdevs based on determined load. If we can't, try the read on
* any vdev we haven't already tried.
*
* Try to find a child whose DTL doesn't contain the block we want to read.
* If we can't, try the read on any vdev we haven't already tried.
* Distributed spares are an exception to the above load rule. They are
* always preferred in order to detect gaps in the distributed spare which
* are created when another disk in the dRAID fails. In order to restore
* redundancy those gaps must be read to trigger the required repair IO.
*/
static int
vdev_mirror_child_select(zio_t *zio)
@@ -518,20 +563,27 @@ vdev_mirror_child_select(zio_t *zio)
if (mc->mc_tried || mc->mc_skipped)
continue;
if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
if (mc->mc_vd == NULL ||
!vdev_mirror_child_readable(mc)) {
mc->mc_error = SET_ERROR(ENXIO);
mc->mc_tried = 1; /* don't even try */
mc->mc_skipped = 1;
continue;
}
if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
if (vdev_mirror_child_missing(mc, txg, 1)) {
mc->mc_error = SET_ERROR(ESTALE);
mc->mc_skipped = 1;
mc->mc_speculative = 1;
continue;
}
if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
mm->mm_preferred[0] = c;
mm->mm_preferred_cnt = 1;
break;
}
mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
if (mc->mc_load > lowest_load)
continue;
@@ -625,11 +677,25 @@ vdev_mirror_io_start(zio_t *zio)
while (children--) {
mc = &mm->mm_child[c];
c++;
/*
* When sequentially resilvering only issue write repair
* IOs to the vdev which is being rebuilt since performance
* is limited by the slowest child. This is an issue for
* faster replacement devices such as distributed spares.
*/
if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!(zio->io_flags & ZIO_FLAG_SCRUB) &&
mm->mm_rebuilding && !mc->mc_rebuilding) {
continue;
}
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_child_done, mc));
c++;
}
zio_execute(zio);
@@ -744,6 +810,8 @@ vdev_mirror_io_done(zio_t *zio)
mc = &mm->mm_child[c];
if (mc->mc_error == 0) {
vdev_ops_t *ops = mc->mc_vd->vdev_ops;
if (mc->mc_tried)
continue;
/*
@@ -752,15 +820,16 @@ vdev_mirror_io_done(zio_t *zio)
* 1. it's a scrub (in which case we have
* tried everything that was healthy)
* - or -
* 2. it's an indirect vdev (in which case
* it could point to any other vdev, which
* might have a bad DTL)
* 2. it's an indirect or distributed spare
* vdev (in which case it could point to any
* other vdev, which might have a bad DTL)
* - or -
* 3. the DTL indicates that this data is
* missing from this vdev
*/
if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
ops != &vdev_indirect_ops &&
ops != &vdev_draid_spare_ops &&
!vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
zio->io_txg, 1))
continue;
@@ -796,50 +865,90 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
}
}
/*
* Return the maximum asize for a rebuild zio in the provided range.
*/
static uint64_t
vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
uint64_t max_segment)
{
uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
SPA_MAXBLOCKSIZE);
return (MIN(asize, vdev_psize_to_asize(vd, psize)));
}
vdev_ops_t vdev_mirror_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_mirror_open,
.vdev_op_close = vdev_mirror_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_mirror_io_start,
.vdev_op_io_done = vdev_mirror_io_done,
.vdev_op_state_change = vdev_mirror_state_change,
.vdev_op_need_resilver = NULL,
.vdev_op_need_resilver = vdev_default_need_resilver,
.vdev_op_hold = NULL,
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
vdev_ops_t vdev_replacing_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_mirror_open,
.vdev_op_close = vdev_mirror_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_mirror_io_start,
.vdev_op_io_done = vdev_mirror_io_done,
.vdev_op_state_change = vdev_mirror_state_change,
.vdev_op_need_resilver = NULL,
.vdev_op_need_resilver = vdev_default_need_resilver,
.vdev_op_hold = NULL,
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
vdev_ops_t vdev_spare_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_mirror_open,
.vdev_op_close = vdev_mirror_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_mirror_io_start,
.vdev_op_io_done = vdev_mirror_io_done,
.vdev_op_state_change = vdev_mirror_state_change,
.vdev_op_need_resilver = NULL,
.vdev_op_need_resilver = vdev_default_need_resilver,
.vdev_op_hold = NULL,
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
+18
View File
@@ -81,9 +81,13 @@ vdev_missing_io_done(zio_t *zio)
}
vdev_ops_t vdev_missing_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_missing_open,
.vdev_op_close = vdev_missing_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_missing_io_start,
.vdev_op_io_done = vdev_missing_io_done,
.vdev_op_state_change = NULL,
@@ -92,14 +96,23 @@ vdev_ops_t vdev_missing_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = NULL,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};
vdev_ops_t vdev_hole_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_missing_open,
.vdev_op_close = vdev_missing_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_missing_io_start,
.vdev_op_io_done = vdev_missing_io_done,
.vdev_op_state_change = NULL,
@@ -108,6 +121,11 @@ vdev_ops_t vdev_hole_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = NULL,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};
+7
View File
@@ -593,6 +593,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
return (NULL);
/*
* I/Os to distributed spares are directly dispatched to the dRAID
* leaf vdevs for aggregation. See the comment at the end of the
* zio_vdev_io_start() function.
*/
ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops);
first = last = zio;
if (zio->io_type == ZIO_TYPE_READ)
+1175 -821
View File
File diff suppressed because it is too large Load Diff
+5 -5
View File
@@ -149,7 +149,7 @@ vdev_raidz_math_get_ops(void)
* Select parity generation method for raidz_map
*/
int
vdev_raidz_math_generate(raidz_map_t *rm)
vdev_raidz_math_generate(raidz_map_t *rm, raidz_row_t *rr)
{
raidz_gen_f gen_parity = NULL;
@@ -174,7 +174,7 @@ vdev_raidz_math_generate(raidz_map_t *rm)
if (gen_parity == NULL)
return (RAIDZ_ORIGINAL_IMPL);
gen_parity(rm);
gen_parity(rr);
return (0);
}
@@ -241,8 +241,8 @@ reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
* @nbaddata - Number of failed data columns
*/
int
vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
const int *dt, const int nbaddata)
vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr,
const int *parity_valid, const int *dt, const int nbaddata)
{
raidz_rec_f rec_fn = NULL;
@@ -265,7 +265,7 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
if (rec_fn == NULL)
return (RAIDZ_ORIGINAL_IMPL);
else
return (rec_fn(rm, dt));
return (rec_fn(rr, dt));
}
const char *raidz_gen_name[] = {
+169 -144
View File
@@ -26,6 +26,7 @@
#define _VDEV_RAIDZ_MATH_IMPL_H
#include <sys/types.h>
#include <sys/vdev_raidz_impl.h>
#define raidz_inline inline __attribute__((always_inline))
#ifndef noinline
@@ -36,33 +37,33 @@
* Functions calculate multiplication constants for data reconstruction.
* Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and
* used parity columns for reconstruction.
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
* @coeff output array of coefficients. Array must be provided by
* user and must hold minimum MUL_CNT values.
*/
static noinline void
raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
raidz_rec_q_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
const unsigned ncols = raidz_ncols(rm);
const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1));
}
static noinline void
raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
raidz_rec_r_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
const unsigned ncols = raidz_ncols(rm);
const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1));
}
static noinline void
raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
raidz_rec_pq_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
const unsigned ncols = raidz_ncols(rm);
const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
const unsigned y = tgtidx[TARGET_Y];
gf_t a, b, e;
@@ -76,9 +77,9 @@ raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
}
static noinline void
raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
raidz_rec_pr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
const unsigned ncols = raidz_ncols(rm);
const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
const unsigned y = tgtidx[TARGET_Y];
@@ -93,9 +94,9 @@ raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
}
static noinline void
raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
raidz_rec_qr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
const unsigned ncols = raidz_ncols(rm);
const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
const unsigned y = tgtidx[TARGET_Y];
@@ -114,9 +115,9 @@ raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
}
static noinline void
raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
raidz_rec_pqr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
const unsigned ncols = raidz_ncols(rm);
const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
const unsigned y = tgtidx[TARGET_Y];
const unsigned z = tgtidx[TARGET_Z];
@@ -347,26 +348,26 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private)
/*
* Generate P parity (RAIDZ1)
*
* @rm RAIDZ map
* @rr RAIDZ row
*/
static raidz_inline void
raidz_generate_p_impl(raidz_map_t * const rm)
raidz_generate_p_impl(raidz_row_t * const rr)
{
size_t c;
const size_t ncols = raidz_ncols(rm);
const size_t psize = rm->rm_col[CODE_P].rc_size;
abd_t *pabd = rm->rm_col[CODE_P].rc_abd;
const size_t ncols = rr->rr_cols;
const size_t psize = rr->rr_col[CODE_P].rc_size;
abd_t *pabd = rr->rr_col[CODE_P].rc_abd;
size_t size;
abd_t *dabd;
raidz_math_begin();
/* start with first data column */
raidz_copy(pabd, rm->rm_col[1].rc_abd, psize);
raidz_copy(pabd, rr->rr_col[1].rc_abd, psize);
for (c = 2; c < ncols; c++) {
dabd = rm->rm_col[c].rc_abd;
size = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
size = rr->rr_col[c].rc_size;
/* add data column */
raidz_add(pabd, dabd, size);
@@ -414,29 +415,29 @@ raidz_gen_pq_add(void **c, const void *dc, const size_t csize,
/*
* Generate PQ parity (RAIDZ2)
*
* @rm RAIDZ map
* @rr RAIDZ row
*/
static raidz_inline void
raidz_generate_pq_impl(raidz_map_t * const rm)
raidz_generate_pq_impl(raidz_row_t * const rr)
{
size_t c;
const size_t ncols = raidz_ncols(rm);
const size_t csize = rm->rm_col[CODE_P].rc_size;
const size_t ncols = rr->rr_cols;
const size_t csize = rr->rr_col[CODE_P].rc_size;
size_t dsize;
abd_t *dabd;
abd_t *cabds[] = {
rm->rm_col[CODE_P].rc_abd,
rm->rm_col[CODE_Q].rc_abd
rr->rr_col[CODE_P].rc_abd,
rr->rr_col[CODE_Q].rc_abd
};
raidz_math_begin();
raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize);
raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize);
raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize);
raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize);
for (c = 3; c < ncols; c++) {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2,
raidz_gen_pq_add);
@@ -487,31 +488,31 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize,
/*
* Generate PQR parity (RAIDZ2)
*
* @rm RAIDZ map
* @rr RAIDZ row
*/
static raidz_inline void
raidz_generate_pqr_impl(raidz_map_t * const rm)
raidz_generate_pqr_impl(raidz_row_t * const rr)
{
size_t c;
const size_t ncols = raidz_ncols(rm);
const size_t csize = rm->rm_col[CODE_P].rc_size;
const size_t ncols = rr->rr_cols;
const size_t csize = rr->rr_col[CODE_P].rc_size;
size_t dsize;
abd_t *dabd;
abd_t *cabds[] = {
rm->rm_col[CODE_P].rc_abd,
rm->rm_col[CODE_Q].rc_abd,
rm->rm_col[CODE_R].rc_abd
rr->rr_col[CODE_P].rc_abd,
rr->rr_col[CODE_Q].rc_abd,
rr->rr_col[CODE_R].rc_abd
};
raidz_math_begin();
raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize);
raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize);
raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize);
raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize);
raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize);
raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize);
for (c = 4; c < ncols; c++) {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3,
raidz_gen_pqr_add);
@@ -579,33 +580,36 @@ raidz_generate_pqr_impl(raidz_map_t * const rm)
* @syn_method raidz_add_abd()
* @rec_method not applicable
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t xsize = rm->rm_col[x].rc_size;
abd_t *xabd = rm->rm_col[x].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
size_t size;
abd_t *dabd;
if (xabd == NULL)
return (1 << CODE_P);
raidz_math_begin();
/* copy P into target */
raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize);
/* generate p_syndrome */
for (c = firstdc; c < ncols; c++) {
if (c == x)
continue;
dabd = rm->rm_col[c].rc_abd;
size = MIN(rm->rm_col[c].rc_size, xsize);
dabd = rr->rr_col[c].rc_abd;
size = MIN(rr->rr_col[c].rc_size, xsize);
raidz_add(xabd, dabd, size);
}
@@ -653,30 +657,33 @@ raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize,
* @syn_method raidz_add_abd()
* @rec_method raidz_mul_abd_cb()
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
abd_t *xabd = rm->rm_col[x].rc_abd;
const size_t xsize = rm->rm_col[x].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
abd_t *tabds[] = { xabd };
if (xabd == NULL)
return (1 << CODE_Q);
unsigned coeff[MUL_CNT];
raidz_rec_q_coeff(rm, tgtidx, coeff);
raidz_rec_q_coeff(rr, tgtidx, coeff);
raidz_math_begin();
/* Start with first data column if present */
if (firstdc != x) {
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
}
@@ -687,8 +694,8 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
@@ -696,7 +703,7 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
}
/* add Q to the syndrome */
raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize);
raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize);
/* transform the syndrome */
abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff);
@@ -744,30 +751,33 @@ raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize,
* @syn_method raidz_add_abd()
* @rec_method raidz_mul_abd_cb()
*
* @rm RAIDZ map
* @rr RAIDZ rr
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t xsize = rm->rm_col[x].rc_size;
abd_t *xabd = rm->rm_col[x].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
abd_t *tabds[] = { xabd };
if (xabd == NULL)
return (1 << CODE_R);
unsigned coeff[MUL_CNT];
raidz_rec_r_coeff(rm, tgtidx, coeff);
raidz_rec_r_coeff(rr, tgtidx, coeff);
raidz_math_begin();
/* Start with first data column if present */
if (firstdc != x) {
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
}
@@ -779,8 +789,8 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
@@ -788,7 +798,7 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
}
/* add R to the syndrome */
raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize);
raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize);
/* transform the syndrome */
abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
@@ -881,31 +891,34 @@ raidz_rec_pq_abd(void **tc, const size_t tsize, void **c,
* @syn_method raidz_syn_pq_abd()
* @rec_method raidz_rec_pq_abd()
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t y = tgtidx[TARGET_Y];
const size_t xsize = rm->rm_col[x].rc_size;
const size_t ysize = rm->rm_col[y].rc_size;
abd_t *xabd = rm->rm_col[x].rc_abd;
abd_t *yabd = rm->rm_col[y].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
const size_t ysize = rr->rr_col[y].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
abd_t *yabd = rr->rr_col[y].rc_abd;
abd_t *tabds[2] = { xabd, yabd };
abd_t *cabds[] = {
rm->rm_col[CODE_P].rc_abd,
rm->rm_col[CODE_Q].rc_abd
rr->rr_col[CODE_P].rc_abd,
rr->rr_col[CODE_Q].rc_abd
};
if (xabd == NULL)
return ((1 << CODE_P) | (1 << CODE_Q));
unsigned coeff[MUL_CNT];
raidz_rec_pq_coeff(rm, tgtidx, coeff);
raidz_rec_pq_coeff(rr, tgtidx, coeff);
/*
* Check if some of targets is shorter then others
@@ -921,8 +934,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@@ -934,8 +947,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
@@ -946,7 +959,7 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
/* Copy shorter targets back to the original abd buffer */
if (ysize < xsize)
raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
raidz_math_end();
@@ -1038,30 +1051,34 @@ raidz_rec_pr_abd(void **t, const size_t tsize, void **c,
* @syn_method raidz_syn_pr_abd()
* @rec_method raidz_rec_pr_abd()
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[0];
const size_t y = tgtidx[1];
const size_t xsize = rm->rm_col[x].rc_size;
const size_t ysize = rm->rm_col[y].rc_size;
abd_t *xabd = rm->rm_col[x].rc_abd;
abd_t *yabd = rm->rm_col[y].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
const size_t ysize = rr->rr_col[y].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
abd_t *yabd = rr->rr_col[y].rc_abd;
abd_t *tabds[2] = { xabd, yabd };
abd_t *cabds[] = {
rm->rm_col[CODE_P].rc_abd,
rm->rm_col[CODE_R].rc_abd
rr->rr_col[CODE_P].rc_abd,
rr->rr_col[CODE_R].rc_abd
};
if (xabd == NULL)
return ((1 << CODE_P) | (1 << CODE_R));
unsigned coeff[MUL_CNT];
raidz_rec_pr_coeff(rm, tgtidx, coeff);
raidz_rec_pr_coeff(rr, tgtidx, coeff);
/*
* Check if some of targets are shorter then others.
@@ -1077,8 +1094,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@@ -1090,8 +1107,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
@@ -1104,14 +1121,14 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
* Copy shorter targets back to the original abd buffer
*/
if (ysize < xsize)
raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
raidz_math_end();
if (ysize < xsize)
abd_free(yabd);
return ((1 << CODE_P) | (1 << CODE_Q));
return ((1 << CODE_P) | (1 << CODE_R));
}
@@ -1201,30 +1218,34 @@ raidz_rec_qr_abd(void **t, const size_t tsize, void **c,
* @syn_method raidz_syn_qr_abd()
* @rec_method raidz_rec_qr_abd()
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t y = tgtidx[TARGET_Y];
const size_t xsize = rm->rm_col[x].rc_size;
const size_t ysize = rm->rm_col[y].rc_size;
abd_t *xabd = rm->rm_col[x].rc_abd;
abd_t *yabd = rm->rm_col[y].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
const size_t ysize = rr->rr_col[y].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
abd_t *yabd = rr->rr_col[y].rc_abd;
abd_t *tabds[2] = { xabd, yabd };
abd_t *cabds[] = {
rm->rm_col[CODE_Q].rc_abd,
rm->rm_col[CODE_R].rc_abd
rr->rr_col[CODE_Q].rc_abd,
rr->rr_col[CODE_R].rc_abd
};
if (xabd == NULL)
return ((1 << CODE_Q) | (1 << CODE_R));
unsigned coeff[MUL_CNT];
raidz_rec_qr_coeff(rm, tgtidx, coeff);
raidz_rec_qr_coeff(rr, tgtidx, coeff);
/*
* Check if some of targets is shorter then others
@@ -1240,8 +1261,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@@ -1253,8 +1274,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
@@ -1267,7 +1288,7 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
* Copy shorter targets back to the original abd buffer
*/
if (ysize < xsize)
raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
raidz_math_end();
@@ -1384,34 +1405,38 @@ raidz_rec_pqr_abd(void **t, const size_t tsize, void **c,
* @syn_method raidz_syn_pqr_abd()
* @rec_method raidz_rec_pqr_abd()
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t y = tgtidx[TARGET_Y];
const size_t z = tgtidx[TARGET_Z];
const size_t xsize = rm->rm_col[x].rc_size;
const size_t ysize = rm->rm_col[y].rc_size;
const size_t zsize = rm->rm_col[z].rc_size;
abd_t *xabd = rm->rm_col[x].rc_abd;
abd_t *yabd = rm->rm_col[y].rc_abd;
abd_t *zabd = rm->rm_col[z].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
const size_t ysize = rr->rr_col[y].rc_size;
const size_t zsize = rr->rr_col[z].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
abd_t *yabd = rr->rr_col[y].rc_abd;
abd_t *zabd = rr->rr_col[z].rc_abd;
abd_t *tabds[] = { xabd, yabd, zabd };
abd_t *cabds[] = {
rm->rm_col[CODE_P].rc_abd,
rm->rm_col[CODE_Q].rc_abd,
rm->rm_col[CODE_R].rc_abd
rr->rr_col[CODE_P].rc_abd,
rr->rr_col[CODE_Q].rc_abd,
rr->rr_col[CODE_R].rc_abd
};
if (xabd == NULL)
return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
unsigned coeff[MUL_CNT];
raidz_rec_pqr_coeff(rm, tgtidx, coeff);
raidz_rec_pqr_coeff(rr, tgtidx, coeff);
/*
* Check if some of targets is shorter then others
@@ -1431,9 +1456,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@@ -1446,8 +1471,8 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
@@ -1460,9 +1485,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
* Copy shorter targets back to the original abd buffer
*/
if (ysize < xsize)
raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
if (zsize < xsize)
raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize);
raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize);
raidz_math_end();
+141 -98
View File
@@ -25,6 +25,7 @@
*/
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/dsl_scan.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
@@ -63,13 +64,15 @@
*
* Limitations:
*
* - Only supported for mirror vdev types. Due to the variable stripe
* width used by raidz sequential reconstruction is not possible.
* - Sequential reconstruction is not possible on RAIDZ due to its
* variable stripe width. Note dRAID uses a fixed stripe width which
* avoids this issue, but comes at the expense of some usable capacity.
*
* - Block checksums are not verified during sequential reconstuction.
* - Block checksums are not verified during sequential reconstruction.
* Similar to traditional RAID the parity/mirror data is reconstructed
* but cannot be immediately double checked. For this reason when the
* last active resilver completes the pool is automatically scrubbed.
* last active resilver completes the pool is automatically scrubbed
* by default.
*
* - Deferred resilvers using sequential reconstruction are not currently
* supported. When adding another vdev to an active top-level resilver
@@ -77,8 +80,8 @@
*
* Advantages:
*
* - Sequential reconstuction is performed in LBA order which may be faster
* than healing reconstuction particularly when using using HDDs (or
* - Sequential reconstruction is performed in LBA order which may be faster
* than healing reconstruction particularly when using using HDDs (or
* especially with SMR devices). Only allocated capacity is resilvered.
*
* - Sequential reconstruction is not constrained by ZFS block boundaries.
@@ -86,9 +89,9 @@
* allowing all of these logical blocks to be repaired with a single IO.
*
* - Unlike a healing resilver or scrub which are pool wide operations,
* sequential reconstruction is handled by the top-level mirror vdevs.
* This allows for it to be started or canceled on a top-level vdev
* without impacting any other top-level vdevs in the pool.
* sequential reconstruction is handled by the top-level vdevs. This
* allows for it to be started or canceled on a top-level vdev without
* impacting any other top-level vdevs in the pool.
*
* - Data only referenced by a pool checkpoint will be repaired because
* that space is reflected in the space maps. This differs for a
@@ -97,18 +100,36 @@
/*
* Maximum number of queued rebuild I/Os top-level vdev. The number of
* concurrent rebuild I/Os issued to the device is controlled by the
* zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module
* options.
*/
unsigned int zfs_rebuild_queue_limit = 20;
/*
* Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE.
* Size of rebuild reads; defaults to 1MiB per data disk and is capped at
* SPA_MAXBLOCKSIZE.
*/
unsigned long zfs_rebuild_max_segment = 1024 * 1024;
/*
* Maximum number of parallelly executed bytes per leaf vdev caused by a
* sequential resilver. We attempt to strike a balance here between keeping
* the vdev queues full of I/Os at all times and not overflowing the queues
* to cause long latency, which would cause long txg sync times.
*
* A large default value can be safely used here because the default target
* segment size is also large (zfs_rebuild_max_segment=1M). This helps keep
* the queue depth short.
*
* 32MB was selected as the default value to achieve good performance with
* a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential
* rebuild was unable to saturate all of the drives using smaller values.
* With a value of 32MB the sequential resilver write rate was measured at
* 800MB/s sustained while rebuilding to a distributed spare.
*/
unsigned long zfs_rebuild_vdev_limit = 32 << 20;
/*
* Automatically start a pool scrub when the last active sequential resilver
* completes in order to verify the checksums of all blocks which have been
* resilvered. This option is enabled by default and is strongly recommended.
*/
int zfs_rebuild_scrub_enabled = 1;
/*
* For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
*/
@@ -293,7 +314,7 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
REBUILD_PHYS_ENTRIES, vrp, tx));
vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
spa_history_log_internal(spa, "rebuild", tx,
@@ -306,7 +327,16 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
vd->vdev_rebuilding = B_FALSE;
mutex_exit(&vd->vdev_rebuild_lock);
spa_notify_waiters(spa);
/*
* While we're in syncing context take the opportunity to
* setup the scrub when there are no more active rebuilds.
*/
if (!vdev_rebuild_active(spa->spa_root_vdev) &&
zfs_rebuild_scrub_enabled) {
pool_scan_func_t func = POOL_SCAN_SCRUB;
dsl_scan_setup_sync(&func, tx);
}
cv_broadcast(&vd->vdev_rebuild_cv);
}
@@ -438,7 +468,7 @@ vdev_rebuild_cb(zio_t *zio)
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
vdev_t *vd = vr->vr_top_vdev;
mutex_enter(&vd->vdev_rebuild_io_lock);
mutex_enter(&vr->vr_io_lock);
if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
/*
* The I/O failed because the top-level vdev was unavailable.
@@ -455,34 +485,30 @@ vdev_rebuild_cb(zio_t *zio)
abd_free(zio->io_abd);
ASSERT3U(vd->vdev_rebuild_inflight, >, 0);
vd->vdev_rebuild_inflight--;
cv_broadcast(&vd->vdev_rebuild_io_cv);
mutex_exit(&vd->vdev_rebuild_io_lock);
ASSERT3U(vr->vr_bytes_inflight, >, 0);
vr->vr_bytes_inflight -= zio->io_size;
cv_broadcast(&vr->vr_io_cv);
mutex_exit(&vr->vr_io_lock);
spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}
/*
* Rebuild the data in this range by constructing a special dummy block
* pointer for the given range. It has no relation to any existing blocks
* in the pool. But by disabling checksum verification and issuing a scrub
* I/O mirrored vdevs will replicate the block using any available mirror
* leaf vdevs.
* Initialize a block pointer that can be used to read the given segment
* for sequential rebuild.
*/
static void
vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize,
uint64_t txg)
vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start,
uint64_t asize)
{
vdev_t *vd = vr->vr_top_vdev;
spa_t *spa = vd->vdev_spa;
uint64_t psize = asize;
ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
ASSERT(vd->vdev_ops == &vdev_draid_ops ||
vd->vdev_ops == &vdev_mirror_ops ||
vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops);
blkptr_t blk, *bp = &blk;
uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
vdev_draid_asize_to_psize(vd, asize) : asize;
BP_ZERO(bp);
DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
@@ -499,19 +525,6 @@ vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize,
BP_SET_LEVEL(bp, 0);
BP_SET_DEDUP(bp, 0);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
/*
* We increment the issued bytes by the asize rather than the psize
* so the scanned and issued bytes may be directly compared. This
* is consistent with the scrub/resilver issued reporting.
*/
vr->vr_pass_bytes_issued += asize;
vr->vr_rebuild_phys.vrp_bytes_issued += asize;
zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp,
abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
ZIO_FLAG_RESILVER, NULL));
}
/*
@@ -525,6 +538,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id;
vdev_t *vd = vr->vr_top_vdev;
spa_t *spa = vd->vdev_spa;
blkptr_t blk;
ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift);
ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift);
@@ -532,14 +546,26 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
vr->vr_pass_bytes_scanned += size;
vr->vr_rebuild_phys.vrp_bytes_scanned += size;
mutex_enter(&vd->vdev_rebuild_io_lock);
/*
* Rebuild the data in this range by constructing a special block
* pointer. It has no relation to any existing blocks in the pool.
* However, by disabling checksum verification and issuing a scrub IO
* we can reconstruct and repair any children with missing data.
*/
vdev_rebuild_blkptr_init(&blk, vd, start, size);
uint64_t psize = BP_GET_PSIZE(&blk);
if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
return (0);
mutex_enter(&vr->vr_io_lock);
/* Limit in flight rebuild I/Os */
while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit)
cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max)
cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
vd->vdev_rebuild_inflight++;
mutex_exit(&vd->vdev_rebuild_io_lock);
vr->vr_bytes_inflight += psize;
mutex_exit(&vr->vr_io_lock);
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
@@ -558,45 +584,29 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
/* When exiting write out our progress. */
if (vdev_rebuild_should_stop(vd)) {
mutex_enter(&vd->vdev_rebuild_io_lock);
vd->vdev_rebuild_inflight--;
mutex_exit(&vd->vdev_rebuild_io_lock);
mutex_enter(&vr->vr_io_lock);
vr->vr_bytes_inflight -= psize;
mutex_exit(&vr->vr_io_lock);
spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
mutex_exit(&vd->vdev_rebuild_lock);
dmu_tx_commit(tx);
return (SET_ERROR(EINTR));
}
mutex_exit(&vd->vdev_rebuild_lock);
vr->vr_scan_offset[txg & TXG_MASK] = start + size;
vdev_rebuild_rebuild_block(vr, start, size, txg);
dmu_tx_commit(tx);
vr->vr_scan_offset[txg & TXG_MASK] = start + size;
vr->vr_pass_bytes_issued += size;
vr->vr_rebuild_phys.vrp_bytes_issued += size;
zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk,
abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
ZIO_FLAG_RESILVER, NULL));
return (0);
}
/*
* Split range into legally-sized logical chunks given the constraints of the
* top-level mirror vdev type.
*/
static uint64_t
vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size)
{
uint64_t chunk_size, max_asize, max_segment;
ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops);
max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment,
1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE);
max_asize = vdev_psize_to_asize(vd, max_segment);
chunk_size = MIN(size, max_asize);
return (chunk_size);
}
/*
* Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree.
*/
@@ -625,7 +635,14 @@ vdev_rebuild_ranges(vdev_rebuild_t *vr)
while (size > 0) {
uint64_t chunk_size;
chunk_size = vdev_rebuild_chunk_size(vd, start, size);
/*
* Split range into legally-sized logical chunks
* given the constraints of the top-level vdev
* being rebuilt (dRAID or mirror).
*/
ASSERT3P(vd->vdev_ops, !=, NULL);
chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd,
start, size, zfs_rebuild_max_segment);
error = vdev_rebuild_range(vr, start, chunk_size);
if (error != 0)
@@ -747,10 +764,16 @@ vdev_rebuild_thread(void *arg)
vr->vr_top_vdev = vd;
vr->vr_scan_msp = NULL;
vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);
vr->vr_pass_start_time = gethrtime();
vr->vr_pass_bytes_scanned = 0;
vr->vr_pass_bytes_issued = 0;
vr->vr_bytes_inflight_max = MAX(1ULL << 20,
zfs_rebuild_vdev_limit * vd->vdev_children);
uint64_t update_est_time = gethrtime();
vdev_rebuild_update_bytes_est(vd, 0);
@@ -780,21 +803,32 @@ vdev_rebuild_thread(void *arg)
ASSERT0(range_tree_space(vr->vr_scan_tree));
/*
* Disable any new allocations to this metaslab and wait
* for any writes inflight to complete. This is needed to
* ensure all allocated ranges are rebuilt.
*/
/* Disable any new allocations to this metaslab */
metaslab_disable(msp);
spa_config_exit(spa, SCL_CONFIG, FTAG);
txg_wait_synced(dsl, 0);
mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
/*
* If there are outstanding allocations wait for them to be
* synced. This is needed to ensure all allocated ranges are
* on disk and therefore will be rebuilt.
*/
for (int j = 0; j < TXG_SIZE; j++) {
if (range_tree_space(msp->ms_allocating[j])) {
mutex_exit(&msp->ms_lock);
mutex_exit(&msp->ms_sync_lock);
txg_wait_synced(dsl, 0);
mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
break;
}
}
/*
* When a metaslab has been allocated from read its allocated
* ranges from the space map object in to the vr_scan_tree.
* ranges from the space map object into the vr_scan_tree.
* Then add inflight / unflushed ranges and remove inflight /
* unflushed frees. This is the minimum range to be rebuilt.
*/
@@ -827,7 +861,7 @@ vdev_rebuild_thread(void *arg)
/*
* To provide an accurate estimate re-calculate the estimated
* size every 5 minutes to account for recent allocations and
* frees made space maps which have not yet been rebuilt.
* frees made to space maps which have not yet been rebuilt.
*/
if (gethrtime() > update_est_time + SEC2NSEC(300)) {
update_est_time = gethrtime();
@@ -851,11 +885,14 @@ vdev_rebuild_thread(void *arg)
spa_config_exit(spa, SCL_CONFIG, FTAG);
/* Wait for any remaining rebuild I/O to complete */
mutex_enter(&vd->vdev_rebuild_io_lock);
while (vd->vdev_rebuild_inflight > 0)
cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
mutex_enter(&vr->vr_io_lock);
while (vr->vr_bytes_inflight > 0)
cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
mutex_exit(&vd->vdev_rebuild_io_lock);
mutex_exit(&vr->vr_io_lock);
mutex_destroy(&vr->vr_io_lock);
cv_destroy(&vr->vr_io_cv);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@@ -1100,5 +1137,11 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW,
"Max segment size in bytes of rebuild reads");
"Max segment size in bytes of rebuild reads");
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW,
"Max bytes in flight per leaf vdev for sequential resilvers");
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,
"Automatically scrub after sequential resilver completes");
/* END CSTYLED */
+27 -15
View File
@@ -250,7 +250,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
spa_vdev_removal_t *svr = NULL;
uint64_t txg __maybe_unused = dmu_tx_get_txg(tx);
ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
ASSERT0(vdev_get_nparity(vd));
svr = spa_vdev_removal_create(vd);
ASSERT(vd->vdev_removing);
@@ -1120,7 +1120,7 @@ static void
vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
{
ASSERT3P(zlist, !=, NULL);
ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
ASSERT0(vdev_get_nparity(vd));
if (vd->vdev_leaf_zap != 0) {
char zkey[32];
@@ -2041,7 +2041,7 @@ spa_vdev_remove_top_check(vdev_t *vd)
/*
* All vdevs in normal class must have the same ashift
* and not be raidz.
* and not be raidz or draid.
*/
vdev_t *rvd = spa->spa_root_vdev;
int num_indirect = 0;
@@ -2064,7 +2064,7 @@ spa_vdev_remove_top_check(vdev_t *vd)
num_indirect++;
if (!vdev_is_concrete(cvd))
continue;
if (cvd->vdev_ops == &vdev_raidz_ops)
if (vdev_get_nparity(cvd) != 0)
return (SET_ERROR(EINVAL));
/*
* Need the mirror to be mirror of leaf vdevs only
@@ -2217,18 +2217,30 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
* in this pool.
*/
if (vd == NULL || unspare) {
if (vd == NULL)
vd = spa_lookup_by_guid(spa, guid, B_TRUE);
ev = spa_event_create(spa, vd, NULL,
ESC_ZFS_VDEV_REMOVE_AUX);
char *type;
boolean_t draid_spare = B_FALSE;
vd_type = VDEV_TYPE_SPARE;
vd_path = spa_strdup(fnvlist_lookup_string(
nv, ZPOOL_CONFIG_PATH));
spa_vdev_remove_aux(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, nspares, nv);
spa_load_spares(spa);
spa->spa_spares.sav_sync = B_TRUE;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type)
== 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
draid_spare = B_TRUE;
if (vd == NULL && draid_spare) {
error = SET_ERROR(ENOTSUP);
} else {
if (vd == NULL)
vd = spa_lookup_by_guid(spa,
guid, B_TRUE);
ev = spa_event_create(spa, vd, NULL,
ESC_ZFS_VDEV_REMOVE_AUX);
vd_type = VDEV_TYPE_SPARE;
vd_path = spa_strdup(fnvlist_lookup_string(
nv, ZPOOL_CONFIG_PATH));
spa_vdev_remove_aux(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, nspares, nv);
spa_load_spares(spa);
spa->spa_spares.sav_sync = B_TRUE;
}
} else {
error = SET_ERROR(EBUSY);
}
+9
View File
@@ -142,9 +142,13 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
}
vdev_ops_t vdev_root_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_root_open,
.vdev_op_close = vdev_root_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = NULL, /* not applicable to the root */
.vdev_op_io_done = NULL, /* not applicable to the root */
.vdev_op_state_change = vdev_root_state_change,
@@ -153,6 +157,11 @@ vdev_ops_t vdev_root_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = NULL,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
+85 -66
View File
@@ -311,7 +311,8 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
vd->vdev_trim_secure = secure;
}
boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED);
vdev_trim_state_t old_state = vd->vdev_trim_state;
boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED);
vd->vdev_trim_state = new_state;
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
@@ -332,9 +333,12 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
"vdev=%s suspended", vd->vdev_path);
break;
case VDEV_TRIM_CANCELED:
spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
spa_history_log_internal(spa, "trim", tx,
"vdev=%s canceled", vd->vdev_path);
if (old_state == VDEV_TRIM_ACTIVE ||
old_state == VDEV_TRIM_SUSPENDED) {
spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
spa_history_log_internal(spa, "trim", tx,
"vdev=%s canceled", vd->vdev_path);
}
break;
case VDEV_TRIM_COMPLETE:
spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
@@ -601,6 +605,32 @@ vdev_trim_ranges(trim_args_t *ta)
return (0);
}
static void
vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
{
uint64_t *last_rs_end = (uint64_t *)arg;
if (physical_rs->rs_end > *last_rs_end)
*last_rs_end = physical_rs->rs_end;
}
static void
vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs)
{
vdev_t *vd = (vdev_t *)arg;
uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
vd->vdev_trim_bytes_est += size;
if (vd->vdev_trim_last_offset >= physical_rs->rs_end) {
vd->vdev_trim_bytes_done += size;
} else if (vd->vdev_trim_last_offset > physical_rs->rs_start &&
vd->vdev_trim_last_offset <= physical_rs->rs_end) {
vd->vdev_trim_bytes_done +=
vd->vdev_trim_last_offset - physical_rs->rs_start;
}
}
/*
* Calculates the completion percentage of a manual TRIM.
*/
@@ -618,27 +648,35 @@ vdev_trim_calculate_progress(vdev_t *vd)
metaslab_t *msp = vd->vdev_top->vdev_ms[i];
mutex_enter(&msp->ms_lock);
uint64_t ms_free = msp->ms_size -
metaslab_allocated_space(msp);
if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
ms_free /= vd->vdev_top->vdev_children;
uint64_t ms_free = (msp->ms_size -
metaslab_allocated_space(msp)) /
vdev_get_ndisks(vd->vdev_top);
/*
* Convert the metaslab range to a physical range
* on our vdev. We use this to determine if we are
* in the middle of this metaslab range.
*/
range_seg64_t logical_rs, physical_rs;
range_seg64_t logical_rs, physical_rs, remain_rs;
logical_rs.rs_start = msp->ms_start;
logical_rs.rs_end = msp->ms_start + msp->ms_size;
vdev_xlate(vd, &logical_rs, &physical_rs);
/* Metaslab space after this offset has not been trimmed. */
vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
vd->vdev_trim_bytes_est += ms_free;
mutex_exit(&msp->ms_lock);
continue;
} else if (vd->vdev_trim_last_offset > physical_rs.rs_end) {
}
/* Metaslab space before this offset has been trimmed */
uint64_t last_rs_end = physical_rs.rs_end;
if (!vdev_xlate_is_empty(&remain_rs)) {
vdev_xlate_walk(vd, &remain_rs,
vdev_trim_xlate_last_rs_end, &last_rs_end);
}
if (vd->vdev_trim_last_offset > last_rs_end) {
vd->vdev_trim_bytes_done += ms_free;
vd->vdev_trim_bytes_est += ms_free;
mutex_exit(&msp->ms_lock);
@@ -659,21 +697,9 @@ vdev_trim_calculate_progress(vdev_t *vd)
rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
logical_rs.rs_start = rs_get_start(rs, rt);
logical_rs.rs_end = rs_get_end(rs, rt);
vdev_xlate(vd, &logical_rs, &physical_rs);
uint64_t size = physical_rs.rs_end -
physical_rs.rs_start;
vd->vdev_trim_bytes_est += size;
if (vd->vdev_trim_last_offset >= physical_rs.rs_end) {
vd->vdev_trim_bytes_done += size;
} else if (vd->vdev_trim_last_offset >
physical_rs.rs_start &&
vd->vdev_trim_last_offset <=
physical_rs.rs_end) {
vd->vdev_trim_bytes_done +=
vd->vdev_trim_last_offset -
physical_rs.rs_start;
}
vdev_xlate_walk(vd, &logical_rs,
vdev_trim_xlate_progress, vd);
}
mutex_exit(&msp->ms_lock);
}
@@ -741,8 +767,38 @@ vdev_trim_load(vdev_t *vd)
return (err);
}
static void
vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs)
{
trim_args_t *ta = arg;
vdev_t *vd = ta->trim_vdev;
/*
* Only a manual trim will be traversing the vdev sequentially.
* For an auto trim all valid ranges should be added.
*/
if (ta->trim_type == TRIM_TYPE_MANUAL) {
/* Only add segments that we have not visited yet */
if (physical_rs->rs_end <= vd->vdev_trim_last_offset)
return;
/* Pick up where we left off mid-range. */
if (vd->vdev_trim_last_offset > physical_rs->rs_start) {
ASSERT3U(physical_rs->rs_end, >,
vd->vdev_trim_last_offset);
physical_rs->rs_start = vd->vdev_trim_last_offset;
}
}
ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
range_tree_add(ta->trim_tree, physical_rs->rs_start,
physical_rs->rs_end - physical_rs->rs_start);
}
/*
* Convert the logical range into a physical range and add it to the
* Convert the logical range into physical ranges and add them to the
* range tree passed in the trim_args_t.
*/
static void
@@ -750,7 +806,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
{
trim_args_t *ta = arg;
vdev_t *vd = ta->trim_vdev;
range_seg64_t logical_rs, physical_rs;
range_seg64_t logical_rs;
logical_rs.rs_start = start;
logical_rs.rs_end = start + size;
@@ -767,44 +823,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
}
ASSERT(vd->vdev_ops->vdev_op_leaf);
vdev_xlate(vd, &logical_rs, &physical_rs);
IMPLY(vd->vdev_top == vd,
logical_rs.rs_start == physical_rs.rs_start);
IMPLY(vd->vdev_top == vd,
logical_rs.rs_end == physical_rs.rs_end);
/*
* Only a manual trim will be traversing the vdev sequentially.
* For an auto trim all valid ranges should be added.
*/
if (ta->trim_type == TRIM_TYPE_MANUAL) {
/* Only add segments that we have not visited yet */
if (physical_rs.rs_end <= vd->vdev_trim_last_offset)
return;
/* Pick up where we left off mid-range. */
if (vd->vdev_trim_last_offset > physical_rs.rs_start) {
ASSERT3U(physical_rs.rs_end, >,
vd->vdev_trim_last_offset);
physical_rs.rs_start = vd->vdev_trim_last_offset;
}
}
ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
/*
* With raidz, it's possible that the logical range does not live on
* this leaf vdev. We only add the physical range to this vdev's if it
* has a length greater than 0.
*/
if (physical_rs.rs_end > physical_rs.rs_start) {
range_tree_add(ta->trim_tree, physical_rs.rs_start,
physical_rs.rs_end - physical_rs.rs_start);
} else {
ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
}
vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg);
}
/*
+3 -1
View File
@@ -1111,7 +1111,9 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
bcopy(info, report->zcr_ckinfo, sizeof (*info));
}
report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
report->zcr_align =
vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
report->zcr_length = length;
#ifdef _KERNEL
+30 -12
View File
@@ -1702,16 +1702,16 @@ zio_write_compress(zio_t *zio)
return (zio);
} else {
/*
* Round up compressed size up to the ashift
* of the smallest-ashift device, and zero the tail.
* This ensures that the compressed size of the BP
* (and thus compressratio property) are correct,
* Round compressed size up to the minimum allocation
* size of the smallest-ashift device, and zero the
* tail. This ensures that the compressed size of the
* BP (and thus compressratio property) are correct,
* in that we charge for the padding used to fill out
* the last sector.
*/
ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
size_t rounded = (size_t)P2ROUNDUP(psize,
1ULL << spa->spa_min_ashift);
ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT);
size_t rounded = (size_t)roundup(psize,
spa->spa_min_alloc);
if (rounded >= lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
@@ -3754,19 +3754,37 @@ zio_vdev_io_start(zio_t *zio)
* However, indirect vdevs point off to other vdevs which may have
* DTL's, so we never bypass them. The child i/os on concrete vdevs
* will be properly bypassed instead.
*
* Leaf DTL_PARTIAL can be empty when a legitimate write comes from
* a dRAID spare vdev. For example, when a dRAID spare is first
* used, its spare blocks need to be written to but the leaf vdev's
* of such blocks can have empty DTL_PARTIAL.
*
* There seemed no clean way to allow such writes while bypassing
* spurious ones. At this point, just avoid all bypassing for dRAID
* for correctness.
*/
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
zio->io_txg != 0 && /* not a delegated i/o */
vd->vdev_ops != &vdev_indirect_ops &&
vd->vdev_top->vdev_ops != &vdev_draid_ops &&
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
zio_vdev_io_bypass(zio);
return (zio);
}
if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) {
/*
* Select the next best leaf I/O to process. Distributed spares are
* excluded since they dispatch the I/O directly to a leaf vdev after
* applying the dRAID mapping.
*/
if (vd->vdev_ops->vdev_op_leaf &&
vd->vdev_ops != &vdev_draid_spare_ops &&
(zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_WRITE ||
zio->io_type == ZIO_TYPE_TRIM)) {
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
return (zio);
@@ -3803,8 +3821,8 @@ zio_vdev_io_done(zio_t *zio)
if (zio->io_delay)
zio->io_delay = gethrtime() - zio->io_delay;
if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
vd->vdev_ops != &vdev_draid_spare_ops) {
vdev_queue_io_done(zio);
if (zio->io_type == ZIO_TYPE_WRITE)
@@ -4206,7 +4224,7 @@ zio_checksum_verify(zio_t *zio)
if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
return (zio);
ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
}
if ((error = zio_checksum_error(zio, &info)) != 0) {
+6
View File
@@ -265,6 +265,12 @@ zio_handle_fault_injection(zio_t *zio, int error)
if (zio->io_type != ZIO_TYPE_READ)
return (0);
/*
* A rebuild I/O has no checksum to verify.
*/
if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
return (0);
rw_enter(&inject_lock, RW_READER);
for (handler = list_head(&inject_handlers); handler != NULL;