mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 18:40:43 +03:00
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
This commit is contained in:
@@ -84,6 +84,8 @@ $(MODULE)-objs += uberblock.o
|
||||
$(MODULE)-objs += unique.o
|
||||
$(MODULE)-objs += vdev.o
|
||||
$(MODULE)-objs += vdev_cache.o
|
||||
$(MODULE)-objs += vdev_draid.o
|
||||
$(MODULE)-objs += vdev_draid_rand.o
|
||||
$(MODULE)-objs += vdev_indirect.o
|
||||
$(MODULE)-objs += vdev_indirect_births.o
|
||||
$(MODULE)-objs += vdev_indirect_mapping.o
|
||||
|
||||
+9
-5
@@ -781,16 +781,17 @@ int
|
||||
abd_iterate_func(abd_t *abd, size_t off, size_t size,
|
||||
abd_iter_func_t *func, void *private)
|
||||
{
|
||||
int ret = 0;
|
||||
struct abd_iter aiter;
|
||||
boolean_t abd_multi;
|
||||
abd_t *c_abd;
|
||||
int ret = 0;
|
||||
|
||||
if (size == 0)
|
||||
return (0);
|
||||
|
||||
abd_verify(abd);
|
||||
ASSERT3U(off + size, <=, abd->abd_size);
|
||||
|
||||
abd_multi = abd_is_gang(abd);
|
||||
c_abd = abd_init_abd_iter(abd, &aiter, off);
|
||||
boolean_t abd_multi = abd_is_gang(abd);
|
||||
abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
|
||||
|
||||
while (size > 0) {
|
||||
/* If we are at the end of the gang ABD we are done */
|
||||
@@ -920,6 +921,9 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
|
||||
boolean_t dabd_is_gang_abd, sabd_is_gang_abd;
|
||||
abd_t *c_dabd, *c_sabd;
|
||||
|
||||
if (size == 0)
|
||||
return (0);
|
||||
|
||||
abd_verify(dabd);
|
||||
abd_verify(sabd);
|
||||
|
||||
|
||||
@@ -713,7 +713,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
|
||||
return (0);
|
||||
}
|
||||
|
||||
static void
|
||||
void
|
||||
dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
|
||||
{
|
||||
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
|
||||
@@ -3327,20 +3327,13 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if the txg falls within the range which must be
|
||||
* resilvered. DVAs outside this range can always be skipped.
|
||||
*/
|
||||
if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
|
||||
return (B_FALSE);
|
||||
|
||||
/*
|
||||
* Check if the top-level vdev must resilver this offset.
|
||||
* When the offset does not intersect with a dirty leaf DTL
|
||||
* then it may be possible to skip the resilver IO. The psize
|
||||
* is provided instead of asize to simplify the check for RAIDZ.
|
||||
*/
|
||||
if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
|
||||
if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth))
|
||||
return (B_FALSE);
|
||||
|
||||
/*
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include <sys/space_map.h>
|
||||
#include <sys/metaslab_impl.h>
|
||||
#include <sys/vdev_impl.h>
|
||||
#include <sys/vdev_draid.h>
|
||||
#include <sys/zio.h>
|
||||
#include <sys/spa_impl.h>
|
||||
#include <sys/zfeature.h>
|
||||
@@ -1563,6 +1564,7 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
|
||||
|
||||
#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
|
||||
defined(WITH_CF_BLOCK_ALLOCATOR)
|
||||
|
||||
/*
|
||||
* This is a helper function that can be used by the allocator to find a
|
||||
* suitable block to allocate. This will search the specified B-tree looking
|
||||
@@ -1654,6 +1656,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
|
||||
range_seg_t *rs;
|
||||
if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
|
||||
metaslab_size_tree_full_load(msp->ms_allocatable);
|
||||
|
||||
if (metaslab_df_use_largest_segment) {
|
||||
/* use largest free segment */
|
||||
rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
|
||||
@@ -2616,6 +2619,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
|
||||
ms->ms_allocator = -1;
|
||||
ms->ms_new = B_TRUE;
|
||||
|
||||
vdev_ops_t *ops = vd->vdev_ops;
|
||||
if (ops->vdev_op_metaslab_init != NULL)
|
||||
ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
|
||||
|
||||
/*
|
||||
* We only open space map objects that already exist. All others
|
||||
* will be opened when we finally allocate an object for it.
|
||||
@@ -5813,7 +5820,6 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
|
||||
metaslab_group_alloc_increment(spa,
|
||||
DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
|
||||
}
|
||||
|
||||
}
|
||||
ASSERT(error == 0);
|
||||
ASSERT(BP_GET_NDVAS(bp) == ndvas);
|
||||
|
||||
+10
-1
@@ -307,8 +307,17 @@ mmp_next_leaf(spa_t *spa)
|
||||
if (leaf == NULL)
|
||||
leaf = list_head(&spa->spa_leaf_list);
|
||||
|
||||
if (!vdev_writeable(leaf)) {
|
||||
/*
|
||||
* We skip unwritable, offline, detached, and dRAID spare
|
||||
* devices as they are either not legal targets or the write
|
||||
* may fail or not be seen by other hosts. Skipped dRAID
|
||||
* spares can never be written so the fail mask is not set.
|
||||
*/
|
||||
if (!vdev_writeable(leaf) || leaf->vdev_offline ||
|
||||
leaf->vdev_detached) {
|
||||
fail_mask |= MMP_FAIL_NOT_WRITABLE;
|
||||
} else if (leaf->vdev_ops == &vdev_draid_spare_ops) {
|
||||
continue;
|
||||
} else if (leaf->vdev_mmp_pending != 0) {
|
||||
fail_mask |= MMP_FAIL_WRITE_PENDING;
|
||||
} else {
|
||||
|
||||
+94
-32
@@ -60,6 +60,7 @@
|
||||
#include <sys/vdev_rebuild.h>
|
||||
#include <sys/vdev_trim.h>
|
||||
#include <sys/vdev_disk.h>
|
||||
#include <sys/vdev_draid.h>
|
||||
#include <sys/metaslab.h>
|
||||
#include <sys/metaslab_impl.h>
|
||||
#include <sys/mmp.h>
|
||||
@@ -3681,7 +3682,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
|
||||
/*
|
||||
* Build a new vdev tree from the trusted config
|
||||
*/
|
||||
VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
|
||||
error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
|
||||
if (error != 0) {
|
||||
nvlist_free(mos_config);
|
||||
spa_config_exit(spa, SCL_ALL, FTAG);
|
||||
spa_load_failed(spa, "spa_config_parse failed [error=%d]",
|
||||
error);
|
||||
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
|
||||
}
|
||||
|
||||
/*
|
||||
* Vdev paths in the MOS may be obsolete. If the untrusted config was
|
||||
@@ -5631,7 +5639,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
|
||||
uint64_t txg = TXG_INITIAL;
|
||||
nvlist_t **spares, **l2cache;
|
||||
uint_t nspares, nl2cache;
|
||||
uint64_t version, obj;
|
||||
uint64_t version, obj, ndraid = 0;
|
||||
boolean_t has_features;
|
||||
boolean_t has_encryption;
|
||||
boolean_t has_allocclass;
|
||||
@@ -5753,8 +5761,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
|
||||
|
||||
if (error == 0 &&
|
||||
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
|
||||
(error = spa_validate_aux(spa, nvroot, txg,
|
||||
VDEV_ALLOC_ADD)) == 0) {
|
||||
(error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
|
||||
(error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
|
||||
/*
|
||||
* instantiate the metaslab groups (this will dirty the vdevs)
|
||||
* we can no longer error exit past this point
|
||||
@@ -5895,6 +5903,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
|
||||
spa_sync_props(props, tx);
|
||||
}
|
||||
|
||||
for (int i = 0; i < ndraid; i++)
|
||||
spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
spa->spa_sync_on = B_TRUE;
|
||||
@@ -6403,13 +6414,26 @@ spa_reset(const char *pool)
|
||||
* ==========================================================================
|
||||
*/
|
||||
|
||||
/*
|
||||
* This is called as a synctask to increment the draid feature flag
|
||||
*/
|
||||
static void
|
||||
spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
|
||||
{
|
||||
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
|
||||
int draid = (int)(uintptr_t)arg;
|
||||
|
||||
for (int c = 0; c < draid; c++)
|
||||
spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Add a device to a storage pool.
|
||||
*/
|
||||
int
|
||||
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
|
||||
{
|
||||
uint64_t txg;
|
||||
uint64_t txg, ndraid = 0;
|
||||
int error;
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
vdev_t *vd, *tvd;
|
||||
@@ -6438,8 +6462,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
|
||||
return (spa_vdev_exit(spa, vd, txg, EINVAL));
|
||||
|
||||
if (vd->vdev_children != 0 &&
|
||||
(error = vdev_create(vd, txg, B_FALSE)) != 0)
|
||||
(error = vdev_create(vd, txg, B_FALSE)) != 0) {
|
||||
return (spa_vdev_exit(spa, vd, txg, error));
|
||||
}
|
||||
|
||||
/*
|
||||
* The virtual dRAID spares must be added after vdev tree is created
|
||||
* and the vdev guids are generated. The guid of their associated
|
||||
* dRAID is stored in the config and used when opening the spare.
|
||||
*/
|
||||
if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
|
||||
rvd->vdev_children)) == 0) {
|
||||
if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
|
||||
ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
|
||||
nspares = 0;
|
||||
} else {
|
||||
return (spa_vdev_exit(spa, vd, txg, error));
|
||||
}
|
||||
|
||||
/*
|
||||
* We must validate the spares and l2cache devices after checking the
|
||||
@@ -6452,7 +6491,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
|
||||
* If we are in the middle of a device removal, we can only add
|
||||
* devices which match the existing devices in the pool.
|
||||
* If we are in the middle of a removal, or have some indirect
|
||||
* vdevs, we can not add raidz toplevels.
|
||||
* vdevs, we can not add raidz or dRAID top levels.
|
||||
*/
|
||||
if (spa->spa_vdev_removal != NULL ||
|
||||
spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
|
||||
@@ -6462,10 +6501,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
|
||||
tvd->vdev_ashift != spa->spa_max_ashift) {
|
||||
return (spa_vdev_exit(spa, vd, txg, EINVAL));
|
||||
}
|
||||
/* Fail if top level vdev is raidz */
|
||||
if (tvd->vdev_ops == &vdev_raidz_ops) {
|
||||
/* Fail if top level vdev is raidz or a dRAID */
|
||||
if (vdev_get_nparity(tvd) != 0)
|
||||
return (spa_vdev_exit(spa, vd, txg, EINVAL));
|
||||
}
|
||||
|
||||
/*
|
||||
* Need the top level mirror to be
|
||||
* a mirror of leaf vdevs only
|
||||
@@ -6505,6 +6544,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
|
||||
spa->spa_l2cache.sav_sync = B_TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
* We can't increment a feature while holding spa_vdev so we
|
||||
* have to do it in a synctask.
|
||||
*/
|
||||
if (ndraid != 0) {
|
||||
dmu_tx_t *tx;
|
||||
|
||||
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
|
||||
dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
|
||||
(void *)(uintptr_t)ndraid, tx);
|
||||
dmu_tx_commit(tx);
|
||||
}
|
||||
|
||||
/*
|
||||
* We have to be careful when adding new vdevs to an existing pool.
|
||||
* If other threads start allocating from these vdevs before we
|
||||
@@ -6615,14 +6667,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
|
||||
if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
|
||||
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
|
||||
|
||||
/*
|
||||
* A dRAID spare can only replace a child of its parent dRAID vdev.
|
||||
*/
|
||||
if (newvd->vdev_ops == &vdev_draid_spare_ops &&
|
||||
oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
|
||||
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
|
||||
}
|
||||
|
||||
if (rebuild) {
|
||||
/*
|
||||
* For rebuilds, the parent vdev must support reconstruction
|
||||
* For rebuilds, the top vdev must support reconstruction
|
||||
* using only space maps. This means the only allowable
|
||||
* parents are the root vdev or a mirror vdev.
|
||||
* vdev types are the root vdev, a mirror, or dRAID.
|
||||
*/
|
||||
if (pvd->vdev_ops != &vdev_mirror_ops &&
|
||||
pvd->vdev_ops != &vdev_root_ops) {
|
||||
tvd = pvd;
|
||||
if (pvd->vdev_top != NULL)
|
||||
tvd = pvd->vdev_top;
|
||||
|
||||
if (tvd->vdev_ops != &vdev_mirror_ops &&
|
||||
tvd->vdev_ops != &vdev_root_ops &&
|
||||
tvd->vdev_ops != &vdev_draid_ops) {
|
||||
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
|
||||
}
|
||||
}
|
||||
@@ -6915,14 +6980,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
|
||||
}
|
||||
|
||||
/*
|
||||
* If we are detaching the original disk from a spare, then it implies
|
||||
* that the spare should become a real disk, and be removed from the
|
||||
* active spare list for the pool.
|
||||
* If we are detaching the original disk from a normal spare, then it
|
||||
* implies that the spare should become a real disk, and be removed
|
||||
* from the active spare list for the pool. dRAID spares on the
|
||||
* other hand are coupled to the pool and thus should never be removed
|
||||
* from the spares list.
|
||||
*/
|
||||
if (pvd->vdev_ops == &vdev_spare_ops &&
|
||||
vd->vdev_id == 0 &&
|
||||
pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
|
||||
unspare = B_TRUE;
|
||||
if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
|
||||
vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
|
||||
|
||||
if (last_cvd->vdev_isspare &&
|
||||
last_cvd->vdev_ops != &vdev_draid_spare_ops) {
|
||||
unspare = B_TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Erase the disk labels so the disk can be used for other things.
|
||||
@@ -8013,18 +8084,9 @@ spa_async_thread(void *arg)
|
||||
/*
|
||||
* If any devices are done replacing, detach them.
|
||||
*/
|
||||
if (tasks & SPA_ASYNC_RESILVER_DONE)
|
||||
if (tasks & SPA_ASYNC_RESILVER_DONE ||
|
||||
tasks & SPA_ASYNC_REBUILD_DONE) {
|
||||
spa_vdev_resilver_done(spa);
|
||||
|
||||
/*
|
||||
* If any devices are done replacing, detach them. Then if no
|
||||
* top-level vdevs are rebuilding attempt to kick off a scrub.
|
||||
*/
|
||||
if (tasks & SPA_ASYNC_REBUILD_DONE) {
|
||||
spa_vdev_resilver_done(spa);
|
||||
|
||||
if (!vdev_rebuild_active(spa->spa_root_vdev))
|
||||
(void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -741,6 +741,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
|
||||
|
||||
spa->spa_min_ashift = INT_MAX;
|
||||
spa->spa_max_ashift = 0;
|
||||
spa->spa_min_alloc = INT_MAX;
|
||||
|
||||
/* Reset cached value */
|
||||
spa->spa_dedup_dspace = ~0ULL;
|
||||
|
||||
+253
-112
@@ -40,6 +40,7 @@
|
||||
#include <sys/dsl_dir.h>
|
||||
#include <sys/vdev_impl.h>
|
||||
#include <sys/vdev_rebuild.h>
|
||||
#include <sys/vdev_draid.h>
|
||||
#include <sys/uberblock_impl.h>
|
||||
#include <sys/metaslab.h>
|
||||
#include <sys/metaslab_impl.h>
|
||||
@@ -51,6 +52,7 @@
|
||||
#include <sys/arc.h>
|
||||
#include <sys/zil.h>
|
||||
#include <sys/dsl_scan.h>
|
||||
#include <sys/vdev_raidz.h>
|
||||
#include <sys/abd.h>
|
||||
#include <sys/vdev_initialize.h>
|
||||
#include <sys/vdev_trim.h>
|
||||
@@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
|
||||
static vdev_ops_t *vdev_ops_table[] = {
|
||||
&vdev_root_ops,
|
||||
&vdev_raidz_ops,
|
||||
&vdev_draid_ops,
|
||||
&vdev_draid_spare_ops,
|
||||
&vdev_mirror_ops,
|
||||
&vdev_replacing_ops,
|
||||
&vdev_spare_ops,
|
||||
@@ -221,10 +225,11 @@ vdev_getops(const char *type)
|
||||
|
||||
/* ARGSUSED */
|
||||
void
|
||||
vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
|
||||
vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
|
||||
range_seg64_t *physical_rs, range_seg64_t *remain_rs)
|
||||
{
|
||||
res->rs_start = in->rs_start;
|
||||
res->rs_end = in->rs_end;
|
||||
physical_rs->rs_start = logical_rs->rs_start;
|
||||
physical_rs->rs_end = logical_rs->rs_end;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
|
||||
return (asize);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
vdev_default_min_asize(vdev_t *vd)
|
||||
{
|
||||
return (vd->vdev_min_asize);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the minimum allocatable size. We define the allocatable size as
|
||||
* the vdev's asize rounded to the nearest metaslab. This allows us to
|
||||
@@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd)
|
||||
if (vd == vd->vdev_top)
|
||||
return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
|
||||
|
||||
/*
|
||||
* The allocatable space for a raidz vdev is N * sizeof(smallest child),
|
||||
* so each child must provide at least 1/Nth of its asize.
|
||||
*/
|
||||
if (pvd->vdev_ops == &vdev_raidz_ops)
|
||||
return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
|
||||
pvd->vdev_children);
|
||||
|
||||
return (pvd->vdev_min_asize);
|
||||
return (pvd->vdev_ops->vdev_op_min_asize(pvd));
|
||||
}
|
||||
|
||||
void
|
||||
@@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd)
|
||||
vdev_set_min_asize(vd->vdev_child[c]);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the minimal allocation size for the top-level vdev.
|
||||
*/
|
||||
uint64_t
|
||||
vdev_get_min_alloc(vdev_t *vd)
|
||||
{
|
||||
uint64_t min_alloc = 1ULL << vd->vdev_ashift;
|
||||
|
||||
if (vd->vdev_ops->vdev_op_min_alloc != NULL)
|
||||
min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
|
||||
|
||||
return (min_alloc);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the parity level for a top-level vdev.
|
||||
*/
|
||||
uint64_t
|
||||
vdev_get_nparity(vdev_t *vd)
|
||||
{
|
||||
uint64_t nparity = 0;
|
||||
|
||||
if (vd->vdev_ops->vdev_op_nparity != NULL)
|
||||
nparity = vd->vdev_ops->vdev_op_nparity(vd);
|
||||
|
||||
return (nparity);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the number of data disks for a top-level vdev.
|
||||
*/
|
||||
uint64_t
|
||||
vdev_get_ndisks(vdev_t *vd)
|
||||
{
|
||||
uint64_t ndisks = 1;
|
||||
|
||||
if (vd->vdev_ops->vdev_op_ndisks != NULL)
|
||||
ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
|
||||
|
||||
return (ndisks);
|
||||
}
|
||||
|
||||
vdev_t *
|
||||
vdev_lookup_top(spa_t *spa, uint64_t vdev)
|
||||
{
|
||||
@@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||
list_link_init(&vd->vdev_initialize_node);
|
||||
list_link_init(&vd->vdev_leaf_node);
|
||||
list_link_init(&vd->vdev_trim_node);
|
||||
|
||||
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
|
||||
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
@@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
|
||||
cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
for (int t = 0; t < DTL_TYPES; t++) {
|
||||
vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
|
||||
@@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
{
|
||||
vdev_ops_t *ops;
|
||||
char *type;
|
||||
uint64_t guid = 0, islog, nparity;
|
||||
uint64_t guid = 0, islog;
|
||||
vdev_t *vd;
|
||||
vdev_indirect_config_t *vic;
|
||||
char *tmp = NULL;
|
||||
@@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
|
||||
/*
|
||||
* Set the nparity property for RAID-Z vdevs.
|
||||
*/
|
||||
nparity = -1ULL;
|
||||
if (ops == &vdev_raidz_ops) {
|
||||
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
|
||||
&nparity) == 0) {
|
||||
if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
|
||||
return (SET_ERROR(EINVAL));
|
||||
/*
|
||||
* Previous versions could only support 1 or 2 parity
|
||||
* device.
|
||||
*/
|
||||
if (nparity > 1 &&
|
||||
spa_version(spa) < SPA_VERSION_RAIDZ2)
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
if (nparity > 2 &&
|
||||
spa_version(spa) < SPA_VERSION_RAIDZ3)
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
} else {
|
||||
/*
|
||||
* We require the parity to be specified for SPAs that
|
||||
* support multiple parity levels.
|
||||
*/
|
||||
if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
|
||||
return (SET_ERROR(EINVAL));
|
||||
/*
|
||||
* Otherwise, we default to 1 parity device for RAID-Z.
|
||||
*/
|
||||
nparity = 1;
|
||||
}
|
||||
} else {
|
||||
nparity = 0;
|
||||
}
|
||||
ASSERT(nparity != -1ULL);
|
||||
|
||||
/*
|
||||
* If creating a top-level vdev, check for allocation classes input
|
||||
*/
|
||||
if (top_level && alloctype == VDEV_ALLOC_ADD) {
|
||||
char *bias;
|
||||
|
||||
/*
|
||||
* If creating a top-level vdev, check for allocation
|
||||
* classes input.
|
||||
*/
|
||||
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
&bias) == 0) {
|
||||
alloc_bias = vdev_derive_alloc_bias(bias);
|
||||
@@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
}
|
||||
}
|
||||
|
||||
/* spa_vdev_add() expects feature to be enabled */
|
||||
if (ops == &vdev_draid_ops &&
|
||||
spa->spa_load_state != SPA_LOAD_CREATE &&
|
||||
!spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the vdev specific data. This is done before calling
|
||||
* vdev_alloc_common() since it may fail and this simplifies the
|
||||
* error reporting and cleanup code paths.
|
||||
*/
|
||||
void *tsd = NULL;
|
||||
if (ops->vdev_op_init != NULL) {
|
||||
rc = ops->vdev_op_init(spa, nv, &tsd);
|
||||
if (rc != 0) {
|
||||
return (rc);
|
||||
}
|
||||
}
|
||||
|
||||
vd = vdev_alloc_common(spa, id, guid, ops);
|
||||
vic = &vd->vdev_indirect_config;
|
||||
|
||||
vd->vdev_tsd = tsd;
|
||||
vd->vdev_islog = islog;
|
||||
vd->vdev_nparity = nparity;
|
||||
|
||||
if (top_level && alloc_bias != VDEV_BIAS_NONE)
|
||||
vd->vdev_alloc_bias = alloc_bias;
|
||||
|
||||
@@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
&vd->vdev_wholedisk) != 0)
|
||||
vd->vdev_wholedisk = -1ULL;
|
||||
|
||||
vic = &vd->vdev_indirect_config;
|
||||
|
||||
ASSERT0(vic->vic_mapping_object);
|
||||
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
|
||||
&vic->vic_mapping_object);
|
||||
@@ -937,6 +967,9 @@ vdev_free(vdev_t *vd)
|
||||
ASSERT(vd->vdev_child == NULL);
|
||||
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
|
||||
|
||||
if (vd->vdev_ops->vdev_op_fini != NULL)
|
||||
vd->vdev_ops->vdev_op_fini(vd);
|
||||
|
||||
/*
|
||||
* Discard allocation state.
|
||||
*/
|
||||
@@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd)
|
||||
cv_destroy(&vd->vdev_trim_io_cv);
|
||||
|
||||
mutex_destroy(&vd->vdev_rebuild_lock);
|
||||
mutex_destroy(&vd->vdev_rebuild_io_lock);
|
||||
cv_destroy(&vd->vdev_rebuild_cv);
|
||||
cv_destroy(&vd->vdev_rebuild_io_cv);
|
||||
|
||||
zfs_ratelimit_fini(&vd->vdev_delay_rl);
|
||||
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
|
||||
@@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd)
|
||||
}
|
||||
|
||||
/*
|
||||
* Add a mirror/replacing vdev above an existing vdev.
|
||||
* Add a mirror/replacing vdev above an existing vdev. There is no need to
|
||||
* call .vdev_op_init() since mirror/replacing vdevs do not have private state.
|
||||
*/
|
||||
vdev_t *
|
||||
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
|
||||
@@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd)
|
||||
spa->spa_max_ashift = vd->vdev_ashift;
|
||||
if (vd->vdev_ashift < spa->spa_min_ashift)
|
||||
spa->spa_min_ashift = vd->vdev_ashift;
|
||||
|
||||
uint64_t min_alloc = vdev_get_min_alloc(vd);
|
||||
if (min_alloc < spa->spa_min_alloc)
|
||||
spa->spa_min_alloc = min_alloc;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd)
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns B_TRUE if the passed child should be opened.
|
||||
*/
|
||||
static boolean_t
|
||||
vdev_default_open_children_func(vdev_t *vd)
|
||||
{
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Open the requested child vdevs. If any of the leaf vdevs are using
|
||||
* a ZFS volume then do the opens in a single thread. This avoids a
|
||||
* deadlock when the current thread is holding the spa_namespace_lock.
|
||||
*/
|
||||
static void
|
||||
vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
|
||||
{
|
||||
int children = vd->vdev_children;
|
||||
|
||||
taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
|
||||
children, children, TASKQ_PREPOPULATE);
|
||||
vd->vdev_nonrot = B_TRUE;
|
||||
|
||||
for (int c = 0; c < children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
|
||||
if (open_func(cvd) == B_FALSE)
|
||||
continue;
|
||||
|
||||
if (tq == NULL || vdev_uses_zvols(vd)) {
|
||||
cvd->vdev_open_error = vdev_open(cvd);
|
||||
} else {
|
||||
VERIFY(taskq_dispatch(tq, vdev_open_child,
|
||||
cvd, TQ_SLEEP) != TASKQID_INVALID);
|
||||
}
|
||||
|
||||
vd->vdev_nonrot &= cvd->vdev_nonrot;
|
||||
}
|
||||
|
||||
if (tq != NULL) {
|
||||
taskq_wait(tq);
|
||||
taskq_destroy(tq);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Open all child vdevs.
|
||||
*/
|
||||
void
|
||||
vdev_open_children(vdev_t *vd)
|
||||
{
|
||||
taskq_t *tq;
|
||||
int children = vd->vdev_children;
|
||||
vdev_open_children_impl(vd, vdev_default_open_children_func);
|
||||
}
|
||||
|
||||
/*
|
||||
* in order to handle pools on top of zvols, do the opens
|
||||
* in a single thread so that the same thread holds the
|
||||
* spa_namespace_lock
|
||||
*/
|
||||
if (vdev_uses_zvols(vd)) {
|
||||
retry_sync:
|
||||
for (int c = 0; c < children; c++)
|
||||
vd->vdev_child[c]->vdev_open_error =
|
||||
vdev_open(vd->vdev_child[c]);
|
||||
} else {
|
||||
tq = taskq_create("vdev_open", children, minclsyspri,
|
||||
children, children, TASKQ_PREPOPULATE);
|
||||
if (tq == NULL)
|
||||
goto retry_sync;
|
||||
|
||||
for (int c = 0; c < children; c++)
|
||||
VERIFY(taskq_dispatch(tq, vdev_open_child,
|
||||
vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
|
||||
|
||||
taskq_destroy(tq);
|
||||
}
|
||||
|
||||
vd->vdev_nonrot = B_TRUE;
|
||||
|
||||
for (int c = 0; c < children; c++)
|
||||
vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
|
||||
/*
|
||||
* Conditionally open a subset of child vdevs.
|
||||
*/
|
||||
void
|
||||
vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
|
||||
{
|
||||
vdev_open_children_impl(vd, open_func);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1952,6 +2016,16 @@ vdev_open(vdev_t *vd)
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Track the minimum allocation size.
|
||||
*/
|
||||
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
|
||||
vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
|
||||
uint64_t min_alloc = vdev_get_min_alloc(vd);
|
||||
if (min_alloc < spa->spa_min_alloc)
|
||||
spa->spa_min_alloc = min_alloc;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this is a leaf vdev, assess whether a resilver is needed.
|
||||
* But don't do this if we are doing a reopen for a scrub, since
|
||||
@@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd)
|
||||
vdev_t *pvd = vd->vdev_parent;
|
||||
spa_t *spa __maybe_unused = vd->vdev_spa;
|
||||
|
||||
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
|
||||
ASSERT(vd != NULL);
|
||||
ASSERT(vd->vdev_open_thread == curthread ||
|
||||
spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
|
||||
|
||||
/*
|
||||
* If our parent is reopening, then we are as well, unless we are
|
||||
@@ -2606,10 +2682,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns B_TRUE if vdev determines offset needs to be resilvered.
|
||||
* Check if the txg falls within the range which must be
|
||||
* resilvered. DVAs outside this range can always be skipped.
|
||||
*/
|
||||
boolean_t
|
||||
vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
|
||||
vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
|
||||
uint64_t phys_birth)
|
||||
{
|
||||
/* Set by sequential resilver. */
|
||||
if (phys_birth == TXG_UNKNOWN)
|
||||
return (B_TRUE);
|
||||
|
||||
return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
|
||||
*/
|
||||
boolean_t
|
||||
vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
|
||||
uint64_t phys_birth)
|
||||
{
|
||||
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
|
||||
|
||||
@@ -2617,7 +2709,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
|
||||
vd->vdev_ops->vdev_op_leaf)
|
||||
return (B_TRUE);
|
||||
|
||||
return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
|
||||
return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
|
||||
phys_birth));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2862,8 +2955,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
|
||||
continue; /* leaf vdevs only */
|
||||
if (t == DTL_PARTIAL)
|
||||
minref = 1; /* i.e. non-zero */
|
||||
else if (vd->vdev_nparity != 0)
|
||||
minref = vd->vdev_nparity + 1; /* RAID-Z */
|
||||
else if (vdev_get_nparity(vd) != 0)
|
||||
minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
|
||||
else
|
||||
minref = vd->vdev_children; /* any kind of mirror */
|
||||
space_reftree_create(&reftree);
|
||||
@@ -3727,6 +3820,9 @@ top:
|
||||
if (!vd->vdev_ops->vdev_op_leaf)
|
||||
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
|
||||
|
||||
if (vd->vdev_ops == &vdev_draid_spare_ops)
|
||||
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
|
||||
|
||||
tvd = vd->vdev_top;
|
||||
mg = tvd->vdev_mg;
|
||||
generation = spa->spa_config_generation + 1;
|
||||
@@ -3971,6 +4067,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
|
||||
static void
|
||||
vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
|
||||
{
|
||||
/*
|
||||
* Exclude the dRAID spare when aggregating to avoid double counting
|
||||
* the ops and bytes. These IOs are counted by the physical leaves.
|
||||
*/
|
||||
if (cvd->vdev_ops == &vdev_draid_spare_ops)
|
||||
return;
|
||||
|
||||
for (int t = 0; t < VS_ZIO_TYPES; t++) {
|
||||
vs->vs_ops[t] += cvs->vs_ops[t];
|
||||
vs->vs_bytes[t] += cvs->vs_bytes[t];
|
||||
@@ -4063,7 +4166,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
|
||||
vdev_get_child_stat(cvd, vs, cvs);
|
||||
if (vsx)
|
||||
vdev_get_child_stat_ex(cvd, vsx, cvsx);
|
||||
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
@@ -4248,7 +4350,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
|
||||
|
||||
/*
|
||||
* Repair is the result of a rebuild issued by the
|
||||
* rebuild thread (vdev_rebuild_thread).
|
||||
* rebuild thread (vdev_rebuild_thread). To avoid
|
||||
* double counting repaired bytes the virtual dRAID
|
||||
* spare vdev is excluded from the processed bytes.
|
||||
*/
|
||||
if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
|
||||
vdev_t *tvd = vd->vdev_top;
|
||||
@@ -4256,8 +4360,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
|
||||
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
|
||||
uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf)
|
||||
if (vd->vdev_ops->vdev_op_leaf &&
|
||||
vd->vdev_ops != &vdev_draid_spare_ops) {
|
||||
atomic_add_64(rebuilt, psize);
|
||||
}
|
||||
vs->vs_rebuild_processed += psize;
|
||||
}
|
||||
|
||||
@@ -4981,31 +5087,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
|
||||
vdev_resilver_needed(vd, NULL, NULL));
|
||||
}
|
||||
|
||||
boolean_t
|
||||
vdev_xlate_is_empty(range_seg64_t *rs)
|
||||
{
|
||||
return (rs->rs_start == rs->rs_end);
|
||||
}
|
||||
|
||||
/*
|
||||
* Translate a logical range to the physical range for the specified vdev_t.
|
||||
* This function is initially called with a leaf vdev and will walk each
|
||||
* parent vdev until it reaches a top-level vdev. Once the top-level is
|
||||
* reached the physical range is initialized and the recursive function
|
||||
* begins to unwind. As it unwinds it calls the parent's vdev specific
|
||||
* translation function to do the real conversion.
|
||||
* Translate a logical range to the first contiguous physical range for the
|
||||
* specified vdev_t. This function is initially called with a leaf vdev and
|
||||
* will walk each parent vdev until it reaches a top-level vdev. Once the
|
||||
* top-level is reached the physical range is initialized and the recursive
|
||||
* function begins to unwind. As it unwinds it calls the parent's vdev
|
||||
* specific translation function to do the real conversion.
|
||||
*/
|
||||
void
|
||||
vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
|
||||
range_seg64_t *physical_rs)
|
||||
range_seg64_t *physical_rs, range_seg64_t *remain_rs)
|
||||
{
|
||||
/*
|
||||
* Walk up the vdev tree
|
||||
*/
|
||||
if (vd != vd->vdev_top) {
|
||||
vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
|
||||
vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
|
||||
remain_rs);
|
||||
} else {
|
||||
/*
|
||||
* We've reached the top-level vdev, initialize the
|
||||
* physical range to the logical range and start to
|
||||
* unwind.
|
||||
* We've reached the top-level vdev, initialize the physical
|
||||
* range to the logical range and set an empty remaining
|
||||
* range then start to unwind.
|
||||
*/
|
||||
physical_rs->rs_start = logical_rs->rs_start;
|
||||
physical_rs->rs_end = logical_rs->rs_end;
|
||||
|
||||
remain_rs->rs_start = logical_rs->rs_start;
|
||||
remain_rs->rs_end = logical_rs->rs_start;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -5015,16 +5132,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
|
||||
|
||||
/*
|
||||
* As this recursive function unwinds, translate the logical
|
||||
* range into its physical components by calling the
|
||||
* vdev specific translate function.
|
||||
* range into its physical and any remaining components by calling
|
||||
* the vdev specific translate function.
|
||||
*/
|
||||
range_seg64_t intermediate = { 0 };
|
||||
pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
|
||||
pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
|
||||
|
||||
physical_rs->rs_start = intermediate.rs_start;
|
||||
physical_rs->rs_end = intermediate.rs_end;
|
||||
}
|
||||
|
||||
void
|
||||
vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
|
||||
vdev_xlate_func_t *func, void *arg)
|
||||
{
|
||||
range_seg64_t iter_rs = *logical_rs;
|
||||
range_seg64_t physical_rs;
|
||||
range_seg64_t remain_rs;
|
||||
|
||||
while (!vdev_xlate_is_empty(&iter_rs)) {
|
||||
|
||||
vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
|
||||
|
||||
/*
|
||||
* With raidz and dRAID, it's possible that the logical range
|
||||
* does not live on this leaf vdev. Only when there is a non-
|
||||
* zero physical size call the provided function.
|
||||
*/
|
||||
if (!vdev_xlate_is_empty(&physical_rs))
|
||||
func(arg, &physical_rs);
|
||||
|
||||
iter_rs = remain_rs;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Look at the vdev tree and determine whether any devices are currently being
|
||||
* replaced.
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* Xorshift Pseudo Random Number Generator based on work by David Blackman
|
||||
* and Sebastiano Vigna (vigna@acm.org).
|
||||
*
|
||||
* "Further scramblings of Marsaglia's xorshift generators"
|
||||
* http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
|
||||
* http://prng.di.unimi.it/xoroshiro128plusplus.c
|
||||
*
|
||||
* To the extent possible under law, the author has dedicated all copyright
|
||||
* and related and neighboring rights to this software to the public domain
|
||||
* worldwide. This software is distributed without any warranty.
|
||||
*
|
||||
* See <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*
|
||||
* This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid,
|
||||
* small-state generators. It is extremely (sub-ns) fast and it passes all
|
||||
* tests we are aware of, but its state space is large enough only for
|
||||
* mild parallelism.
|
||||
*/
|
||||
|
||||
#include <sys/vdev_draid.h>
|
||||
|
||||
/*
 * Rotate the 64-bit value x left by k bit positions.
 *
 * The rotation count is reduced modulo 64 so that every value of k is
 * well defined.  The naive form "x >> (64 - k)" shifts by the full width
 * of the type when k is 0, which is undefined behavior in C.
 */
static inline uint64_t rotl(const uint64_t x, int k)
{
	const unsigned int n = (unsigned int)k & 63U;

	/* (64 - n) & 63 maps n == 0 to a right shift of 0 instead of 64. */
	return ((x << n) | (x >> ((64U - n) & 63U)));
}
|
||||
|
||||
/*
 * Produce the next 64-bit value of the xoroshiro128++ sequence.
 *
 * s points to the two 64-bit words of generator state, s[0] and s[1].
 * The return value is computed from the state as found on entry, after
 * which both words are updated in place for the next call.
 */
uint64_t
vdev_draid_rand(uint64_t *s)
{
	const uint64_t lo = s[0];
	const uint64_t hi = s[1];
	const uint64_t mixed = lo ^ hi;
	const uint64_t out = rotl(lo + hi, 17) + lo;

	s[0] = rotl(lo, 49) ^ mixed ^ (mixed << 21);	/* a, b */
	s[1] = rotl(mixed, 28);				/* c */

	return (out);
}
|
||||
@@ -1844,9 +1844,13 @@ vdev_indirect_io_done(zio_t *zio)
|
||||
}
|
||||
|
||||
vdev_ops_t vdev_indirect_ops = {
|
||||
.vdev_op_init = NULL,
|
||||
.vdev_op_fini = NULL,
|
||||
.vdev_op_open = vdev_indirect_open,
|
||||
.vdev_op_close = vdev_indirect_close,
|
||||
.vdev_op_asize = vdev_default_asize,
|
||||
.vdev_op_min_asize = vdev_default_min_asize,
|
||||
.vdev_op_min_alloc = NULL,
|
||||
.vdev_op_io_start = vdev_indirect_io_start,
|
||||
.vdev_op_io_done = vdev_indirect_io_done,
|
||||
.vdev_op_state_change = NULL,
|
||||
@@ -1855,6 +1859,11 @@ vdev_ops_t vdev_indirect_ops = {
|
||||
.vdev_op_rele = NULL,
|
||||
.vdev_op_remap = vdev_indirect_remap,
|
||||
.vdev_op_xlate = NULL,
|
||||
.vdev_op_rebuild_asize = NULL,
|
||||
.vdev_op_metaslab_init = NULL,
|
||||
.vdev_op_config_generate = NULL,
|
||||
.vdev_op_nparity = NULL,
|
||||
.vdev_op_ndisks = NULL,
|
||||
.vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */
|
||||
.vdev_op_leaf = B_FALSE /* leaf vdev */
|
||||
};
|
||||
|
||||
@@ -121,6 +121,8 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
|
||||
if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
|
||||
vd->vdev_initialize_action_time = gethrestime_sec();
|
||||
}
|
||||
|
||||
vdev_initializing_state_t old_state = vd->vdev_initialize_state;
|
||||
vd->vdev_initialize_state = new_state;
|
||||
|
||||
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
|
||||
@@ -138,8 +140,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
|
||||
"vdev=%s suspended", vd->vdev_path);
|
||||
break;
|
||||
case VDEV_INITIALIZE_CANCELED:
|
||||
spa_history_log_internal(spa, "initialize", tx,
|
||||
"vdev=%s canceled", vd->vdev_path);
|
||||
if (old_state == VDEV_INITIALIZE_ACTIVE ||
|
||||
old_state == VDEV_INITIALIZE_SUSPENDED)
|
||||
spa_history_log_internal(spa, "initialize", tx,
|
||||
"vdev=%s canceled", vd->vdev_path);
|
||||
break;
|
||||
case VDEV_INITIALIZE_COMPLETE:
|
||||
spa_history_log_internal(spa, "initialize", tx,
|
||||
@@ -317,6 +321,32 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
|
||||
return (0);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
|
||||
{
|
||||
uint64_t *last_rs_end = (uint64_t *)arg;
|
||||
|
||||
if (physical_rs->rs_end > *last_rs_end)
|
||||
*last_rs_end = physical_rs->rs_end;
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
|
||||
{
|
||||
vdev_t *vd = (vdev_t *)arg;
|
||||
|
||||
uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
|
||||
vd->vdev_initialize_bytes_est += size;
|
||||
|
||||
if (vd->vdev_initialize_last_offset > physical_rs->rs_end) {
|
||||
vd->vdev_initialize_bytes_done += size;
|
||||
} else if (vd->vdev_initialize_last_offset > physical_rs->rs_start &&
|
||||
vd->vdev_initialize_last_offset < physical_rs->rs_end) {
|
||||
vd->vdev_initialize_bytes_done +=
|
||||
vd->vdev_initialize_last_offset - physical_rs->rs_start;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_initialize_calculate_progress(vdev_t *vd)
|
||||
{
|
||||
@@ -331,28 +361,35 @@ vdev_initialize_calculate_progress(vdev_t *vd)
|
||||
metaslab_t *msp = vd->vdev_top->vdev_ms[i];
|
||||
mutex_enter(&msp->ms_lock);
|
||||
|
||||
uint64_t ms_free = msp->ms_size -
|
||||
metaslab_allocated_space(msp);
|
||||
|
||||
if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
|
||||
ms_free /= vd->vdev_top->vdev_children;
|
||||
uint64_t ms_free = (msp->ms_size -
|
||||
metaslab_allocated_space(msp)) /
|
||||
vdev_get_ndisks(vd->vdev_top);
|
||||
|
||||
/*
|
||||
* Convert the metaslab range to a physical range
|
||||
* on our vdev. We use this to determine if we are
|
||||
* in the middle of this metaslab range.
|
||||
*/
|
||||
range_seg64_t logical_rs, physical_rs;
|
||||
range_seg64_t logical_rs, physical_rs, remain_rs;
|
||||
logical_rs.rs_start = msp->ms_start;
|
||||
logical_rs.rs_end = msp->ms_start + msp->ms_size;
|
||||
vdev_xlate(vd, &logical_rs, &physical_rs);
|
||||
|
||||
/* Metaslab space after this offset has not been initialized */
|
||||
vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
|
||||
if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
|
||||
vd->vdev_initialize_bytes_est += ms_free;
|
||||
mutex_exit(&msp->ms_lock);
|
||||
continue;
|
||||
} else if (vd->vdev_initialize_last_offset >
|
||||
physical_rs.rs_end) {
|
||||
}
|
||||
|
||||
/* Metaslab space before this offset has been initialized */
|
||||
uint64_t last_rs_end = physical_rs.rs_end;
|
||||
if (!vdev_xlate_is_empty(&remain_rs)) {
|
||||
vdev_xlate_walk(vd, &remain_rs,
|
||||
vdev_initialize_xlate_last_rs_end, &last_rs_end);
|
||||
}
|
||||
|
||||
if (vd->vdev_initialize_last_offset > last_rs_end) {
|
||||
vd->vdev_initialize_bytes_done += ms_free;
|
||||
vd->vdev_initialize_bytes_est += ms_free;
|
||||
mutex_exit(&msp->ms_lock);
|
||||
@@ -374,22 +411,9 @@ vdev_initialize_calculate_progress(vdev_t *vd)
|
||||
&where)) {
|
||||
logical_rs.rs_start = rs_get_start(rs, rt);
|
||||
logical_rs.rs_end = rs_get_end(rs, rt);
|
||||
vdev_xlate(vd, &logical_rs, &physical_rs);
|
||||
|
||||
uint64_t size = physical_rs.rs_end -
|
||||
physical_rs.rs_start;
|
||||
vd->vdev_initialize_bytes_est += size;
|
||||
if (vd->vdev_initialize_last_offset >
|
||||
physical_rs.rs_end) {
|
||||
vd->vdev_initialize_bytes_done += size;
|
||||
} else if (vd->vdev_initialize_last_offset >
|
||||
physical_rs.rs_start &&
|
||||
vd->vdev_initialize_last_offset <
|
||||
physical_rs.rs_end) {
|
||||
vd->vdev_initialize_bytes_done +=
|
||||
vd->vdev_initialize_last_offset -
|
||||
physical_rs.rs_start;
|
||||
}
|
||||
vdev_xlate_walk(vd, &logical_rs,
|
||||
vdev_initialize_xlate_progress, vd);
|
||||
}
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
@@ -419,6 +443,34 @@ vdev_initialize_load(vdev_t *vd)
|
||||
return (err);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
|
||||
{
|
||||
vdev_t *vd = arg;
|
||||
|
||||
/* Only add segments that we have not visited yet */
|
||||
if (physical_rs->rs_end <= vd->vdev_initialize_last_offset)
|
||||
return;
|
||||
|
||||
/* Pick up where we left off mid-range. */
|
||||
if (vd->vdev_initialize_last_offset > physical_rs->rs_start) {
|
||||
zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
|
||||
"(%llu, %llu)", vd->vdev_path,
|
||||
(u_longlong_t)physical_rs->rs_start,
|
||||
(u_longlong_t)physical_rs->rs_end,
|
||||
(u_longlong_t)vd->vdev_initialize_last_offset,
|
||||
(u_longlong_t)physical_rs->rs_end);
|
||||
ASSERT3U(physical_rs->rs_end, >,
|
||||
vd->vdev_initialize_last_offset);
|
||||
physical_rs->rs_start = vd->vdev_initialize_last_offset;
|
||||
}
|
||||
|
||||
ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
|
||||
|
||||
range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
|
||||
physical_rs->rs_end - physical_rs->rs_start);
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert the logical range into a physical range and add it to our
|
||||
* avl tree.
|
||||
@@ -427,47 +479,12 @@ static void
|
||||
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
|
||||
{
|
||||
vdev_t *vd = arg;
|
||||
range_seg64_t logical_rs, physical_rs;
|
||||
range_seg64_t logical_rs;
|
||||
logical_rs.rs_start = start;
|
||||
logical_rs.rs_end = start + size;
|
||||
|
||||
ASSERT(vd->vdev_ops->vdev_op_leaf);
|
||||
vdev_xlate(vd, &logical_rs, &physical_rs);
|
||||
|
||||
IMPLY(vd->vdev_top == vd,
|
||||
logical_rs.rs_start == physical_rs.rs_start);
|
||||
IMPLY(vd->vdev_top == vd,
|
||||
logical_rs.rs_end == physical_rs.rs_end);
|
||||
|
||||
/* Only add segments that we have not visited yet */
|
||||
if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
|
||||
return;
|
||||
|
||||
/* Pick up where we left off mid-range. */
|
||||
if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
|
||||
zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
|
||||
"(%llu, %llu)", vd->vdev_path,
|
||||
(u_longlong_t)physical_rs.rs_start,
|
||||
(u_longlong_t)physical_rs.rs_end,
|
||||
(u_longlong_t)vd->vdev_initialize_last_offset,
|
||||
(u_longlong_t)physical_rs.rs_end);
|
||||
ASSERT3U(physical_rs.rs_end, >,
|
||||
vd->vdev_initialize_last_offset);
|
||||
physical_rs.rs_start = vd->vdev_initialize_last_offset;
|
||||
}
|
||||
ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
|
||||
|
||||
/*
|
||||
* With raidz, it's possible that the logical range does not live on
|
||||
* this leaf vdev. We only add the physical range to this vdev's if it
|
||||
* has a length greater than 0.
|
||||
*/
|
||||
if (physical_rs.rs_end > physical_rs.rs_start) {
|
||||
range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
|
||||
physical_rs.rs_end - physical_rs.rs_start);
|
||||
} else {
|
||||
ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
|
||||
}
|
||||
vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
+39
-23
@@ -142,6 +142,7 @@
|
||||
#include <sys/zap.h>
|
||||
#include <sys/vdev.h>
|
||||
#include <sys/vdev_impl.h>
|
||||
#include <sys/vdev_draid.h>
|
||||
#include <sys/uberblock_impl.h>
|
||||
#include <sys/metaslab.h>
|
||||
#include <sys/metaslab_impl.h>
|
||||
@@ -453,31 +454,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
|
||||
if (vd->vdev_fru != NULL)
|
||||
fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
|
||||
|
||||
if (vd->vdev_nparity != 0) {
|
||||
ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
|
||||
VDEV_TYPE_RAIDZ) == 0);
|
||||
if (vd->vdev_ops->vdev_op_config_generate != NULL)
|
||||
vd->vdev_ops->vdev_op_config_generate(vd, nv);
|
||||
|
||||
/*
|
||||
* Make sure someone hasn't managed to sneak a fancy new vdev
|
||||
* into a crufty old storage pool.
|
||||
*/
|
||||
ASSERT(vd->vdev_nparity == 1 ||
|
||||
(vd->vdev_nparity <= 2 &&
|
||||
spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
|
||||
(vd->vdev_nparity <= 3 &&
|
||||
spa_version(spa) >= SPA_VERSION_RAIDZ3));
|
||||
|
||||
/*
|
||||
* Note that we'll add the nparity tag even on storage pools
|
||||
* that only support a single parity device -- older software
|
||||
* will just ignore it.
|
||||
*/
|
||||
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
|
||||
}
|
||||
|
||||
if (vd->vdev_wholedisk != -1ULL)
|
||||
if (vd->vdev_wholedisk != -1ULL) {
|
||||
fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
|
||||
vd->vdev_wholedisk);
|
||||
}
|
||||
|
||||
if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
|
||||
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
|
||||
@@ -785,6 +768,14 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
|
||||
if (!vdev_readable(vd))
|
||||
return (NULL);
|
||||
|
||||
/*
|
||||
* The label for a dRAID distributed spare is not stored on disk.
|
||||
* Instead it is generated when needed which allows us to bypass
|
||||
* the pipeline when reading the config from the label.
|
||||
*/
|
||||
if (vd->vdev_ops == &vdev_draid_spare_ops)
|
||||
return (vdev_draid_read_config_spare(vd));
|
||||
|
||||
vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
|
||||
vp = abd_to_buf(vp_abd);
|
||||
|
||||
@@ -1497,7 +1488,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
|
||||
for (int c = 0; c < vd->vdev_children; c++)
|
||||
vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
|
||||
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) &&
|
||||
vd->vdev_ops != &vdev_draid_spare_ops) {
|
||||
for (int l = 0; l < VDEV_LABELS; l++) {
|
||||
for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
|
||||
vdev_label_read(zio, vd, l,
|
||||
@@ -1586,6 +1578,13 @@ vdev_copy_uberblocks(vdev_t *vd)
|
||||
SCL_STATE);
|
||||
ASSERT(vd->vdev_ops->vdev_op_leaf);
|
||||
|
||||
/*
|
||||
* No uberblocks are stored on distributed spares, they may be
|
||||
* safely skipped when expanding a leaf vdev.
|
||||
*/
|
||||
if (vd->vdev_ops == &vdev_draid_spare_ops)
|
||||
return;
|
||||
|
||||
spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER);
|
||||
|
||||
ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
|
||||
@@ -1647,6 +1646,15 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
|
||||
if (!vdev_writeable(vd))
|
||||
return;
|
||||
|
||||
/*
|
||||
* There's no need to write uberblocks to a distributed spare, they
|
||||
* are already stored on all the leaves of the parent dRAID. For
|
||||
* this same reason vdev_uberblock_load_impl() skips distributed
|
||||
* spares when reading uberblocks.
|
||||
*/
|
||||
if (vd->vdev_ops == &vdev_draid_spare_ops)
|
||||
return;
|
||||
|
||||
/* If the vdev was expanded, need to copy uberblock rings. */
|
||||
if (vd->vdev_state == VDEV_STATE_HEALTHY &&
|
||||
vd->vdev_copy_uberblocks == B_TRUE) {
|
||||
@@ -1763,6 +1771,14 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes,
|
||||
if (!vdev_writeable(vd))
|
||||
return;
|
||||
|
||||
/*
|
||||
* The top-level config never needs to be written to a distributed
|
||||
* spare. When read, vdev_draid_read_config_spare() will generate
|
||||
* the config for vdev_label_read_config().
|
||||
*/
|
||||
if (vd->vdev_ops == &vdev_draid_spare_ops)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Generate a label describing the top-level config to which we belong.
|
||||
*/
|
||||
|
||||
+123
-14
@@ -33,6 +33,7 @@
|
||||
#include <sys/dsl_pool.h>
|
||||
#include <sys/dsl_scan.h>
|
||||
#include <sys/vdev_impl.h>
|
||||
#include <sys/vdev_draid.h>
|
||||
#include <sys/zio.h>
|
||||
#include <sys/abd.h>
|
||||
#include <sys/fs/zfs.h>
|
||||
@@ -99,7 +100,6 @@ vdev_mirror_stat_fini(void)
|
||||
/*
|
||||
* Virtual device vector for mirroring.
|
||||
*/
|
||||
|
||||
typedef struct mirror_child {
|
||||
vdev_t *mc_vd;
|
||||
uint64_t mc_offset;
|
||||
@@ -108,6 +108,7 @@ typedef struct mirror_child {
|
||||
uint8_t mc_tried;
|
||||
uint8_t mc_skipped;
|
||||
uint8_t mc_speculative;
|
||||
uint8_t mc_rebuilding;
|
||||
} mirror_child_t;
|
||||
|
||||
typedef struct mirror_map {
|
||||
@@ -115,6 +116,7 @@ typedef struct mirror_map {
|
||||
int mm_preferred_cnt;
|
||||
int mm_children;
|
||||
boolean_t mm_resilvering;
|
||||
boolean_t mm_rebuilding;
|
||||
boolean_t mm_root;
|
||||
mirror_child_t mm_child[];
|
||||
} mirror_map_t;
|
||||
@@ -239,6 +241,21 @@ vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
|
||||
return (load + zfs_vdev_mirror_rotating_seek_inc);
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
vdev_mirror_rebuilding(vdev_t *vd)
|
||||
{
|
||||
if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
|
||||
return (B_TRUE);
|
||||
|
||||
for (int i = 0; i < vd->vdev_children; i++) {
|
||||
if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
|
||||
return (B_TRUE);
|
||||
}
|
||||
}
|
||||
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Avoid inlining the function to keep vdev_mirror_io_start(), which
|
||||
* is this function's only caller, as small as possible on the stack.
|
||||
@@ -356,6 +373,9 @@ vdev_mirror_map_init(zio_t *zio)
|
||||
mc = &mm->mm_child[c];
|
||||
mc->mc_vd = vd->vdev_child[c];
|
||||
mc->mc_offset = zio->io_offset;
|
||||
|
||||
if (vdev_mirror_rebuilding(mc->mc_vd))
|
||||
mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -493,12 +513,37 @@ vdev_mirror_preferred_child_randomize(zio_t *zio)
|
||||
return (mm->mm_preferred[p]);
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
vdev_mirror_child_readable(mirror_child_t *mc)
|
||||
{
|
||||
vdev_t *vd = mc->mc_vd;
|
||||
|
||||
if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
|
||||
return (vdev_draid_readable(vd, mc->mc_offset));
|
||||
else
|
||||
return (vdev_readable(vd));
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
|
||||
{
|
||||
vdev_t *vd = mc->mc_vd;
|
||||
|
||||
if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
|
||||
return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
|
||||
else
|
||||
return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to find a vdev whose DTL doesn't contain the block we want to read
|
||||
* preferring vdevs based on determined load.
|
||||
* preferring vdevs based on determined load. If we can't, try the read on
|
||||
* any vdev we haven't already tried.
|
||||
*
|
||||
* Try to find a child whose DTL doesn't contain the block we want to read.
|
||||
* If we can't, try the read on any vdev we haven't already tried.
|
||||
* Distributed spares are an exception to the above load rule. They are
|
||||
* always preferred in order to detect gaps in the distributed spare which
|
||||
* are created when another disk in the dRAID fails. In order to restore
|
||||
* redundancy those gaps must be read to trigger the required repair IO.
|
||||
*/
|
||||
static int
|
||||
vdev_mirror_child_select(zio_t *zio)
|
||||
@@ -518,20 +563,27 @@ vdev_mirror_child_select(zio_t *zio)
|
||||
if (mc->mc_tried || mc->mc_skipped)
|
||||
continue;
|
||||
|
||||
if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
|
||||
if (mc->mc_vd == NULL ||
|
||||
!vdev_mirror_child_readable(mc)) {
|
||||
mc->mc_error = SET_ERROR(ENXIO);
|
||||
mc->mc_tried = 1; /* don't even try */
|
||||
mc->mc_skipped = 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
|
||||
if (vdev_mirror_child_missing(mc, txg, 1)) {
|
||||
mc->mc_error = SET_ERROR(ESTALE);
|
||||
mc->mc_skipped = 1;
|
||||
mc->mc_speculative = 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
|
||||
mm->mm_preferred[0] = c;
|
||||
mm->mm_preferred_cnt = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
|
||||
if (mc->mc_load > lowest_load)
|
||||
continue;
|
||||
@@ -625,11 +677,25 @@ vdev_mirror_io_start(zio_t *zio)
|
||||
|
||||
while (children--) {
|
||||
mc = &mm->mm_child[c];
|
||||
c++;
|
||||
|
||||
/*
|
||||
* When sequentially resilvering only issue write repair
|
||||
* IOs to the vdev which is being rebuilt since performance
|
||||
* is limited by the slowest child. This is an issue for
|
||||
* faster replacement devices such as distributed spares.
|
||||
*/
|
||||
if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
|
||||
(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
|
||||
!(zio->io_flags & ZIO_FLAG_SCRUB) &&
|
||||
mm->mm_rebuilding && !mc->mc_rebuilding) {
|
||||
continue;
|
||||
}
|
||||
|
||||
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
|
||||
mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
|
||||
zio->io_type, zio->io_priority, 0,
|
||||
vdev_mirror_child_done, mc));
|
||||
c++;
|
||||
}
|
||||
|
||||
zio_execute(zio);
|
||||
@@ -744,6 +810,8 @@ vdev_mirror_io_done(zio_t *zio)
|
||||
mc = &mm->mm_child[c];
|
||||
|
||||
if (mc->mc_error == 0) {
|
||||
vdev_ops_t *ops = mc->mc_vd->vdev_ops;
|
||||
|
||||
if (mc->mc_tried)
|
||||
continue;
|
||||
/*
|
||||
@@ -752,15 +820,16 @@ vdev_mirror_io_done(zio_t *zio)
|
||||
* 1. it's a scrub (in which case we have
|
||||
* tried everything that was healthy)
|
||||
* - or -
|
||||
* 2. it's an indirect vdev (in which case
|
||||
* it could point to any other vdev, which
|
||||
* might have a bad DTL)
|
||||
* 2. it's an indirect or distributed spare
|
||||
* vdev (in which case it could point to any
|
||||
* other vdev, which might have a bad DTL)
|
||||
* - or -
|
||||
* 3. the DTL indicates that this data is
|
||||
* missing from this vdev
|
||||
*/
|
||||
if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
|
||||
mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
|
||||
ops != &vdev_indirect_ops &&
|
||||
ops != &vdev_draid_spare_ops &&
|
||||
!vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
|
||||
zio->io_txg, 1))
|
||||
continue;
|
||||
@@ -796,50 +865,90 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the maximum asize for a rebuild zio in the provided range.
|
||||
*/
|
||||
static uint64_t
|
||||
vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
|
||||
uint64_t max_segment)
|
||||
{
|
||||
uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
|
||||
SPA_MAXBLOCKSIZE);
|
||||
|
||||
return (MIN(asize, vdev_psize_to_asize(vd, psize)));
|
||||
}
|
||||
|
||||
vdev_ops_t vdev_mirror_ops = {
|
||||
.vdev_op_init = NULL,
|
||||
.vdev_op_fini = NULL,
|
||||
.vdev_op_open = vdev_mirror_open,
|
||||
.vdev_op_close = vdev_mirror_close,
|
||||
.vdev_op_asize = vdev_default_asize,
|
||||
.vdev_op_min_asize = vdev_default_min_asize,
|
||||
.vdev_op_min_alloc = NULL,
|
||||
.vdev_op_io_start = vdev_mirror_io_start,
|
||||
.vdev_op_io_done = vdev_mirror_io_done,
|
||||
.vdev_op_state_change = vdev_mirror_state_change,
|
||||
.vdev_op_need_resilver = NULL,
|
||||
.vdev_op_need_resilver = vdev_default_need_resilver,
|
||||
.vdev_op_hold = NULL,
|
||||
.vdev_op_rele = NULL,
|
||||
.vdev_op_remap = NULL,
|
||||
.vdev_op_xlate = vdev_default_xlate,
|
||||
.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
|
||||
.vdev_op_metaslab_init = NULL,
|
||||
.vdev_op_config_generate = NULL,
|
||||
.vdev_op_nparity = NULL,
|
||||
.vdev_op_ndisks = NULL,
|
||||
.vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */
|
||||
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
|
||||
};
|
||||
|
||||
vdev_ops_t vdev_replacing_ops = {
|
||||
.vdev_op_init = NULL,
|
||||
.vdev_op_fini = NULL,
|
||||
.vdev_op_open = vdev_mirror_open,
|
||||
.vdev_op_close = vdev_mirror_close,
|
||||
.vdev_op_asize = vdev_default_asize,
|
||||
.vdev_op_min_asize = vdev_default_min_asize,
|
||||
.vdev_op_min_alloc = NULL,
|
||||
.vdev_op_io_start = vdev_mirror_io_start,
|
||||
.vdev_op_io_done = vdev_mirror_io_done,
|
||||
.vdev_op_state_change = vdev_mirror_state_change,
|
||||
.vdev_op_need_resilver = NULL,
|
||||
.vdev_op_need_resilver = vdev_default_need_resilver,
|
||||
.vdev_op_hold = NULL,
|
||||
.vdev_op_rele = NULL,
|
||||
.vdev_op_remap = NULL,
|
||||
.vdev_op_xlate = vdev_default_xlate,
|
||||
.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
|
||||
.vdev_op_metaslab_init = NULL,
|
||||
.vdev_op_config_generate = NULL,
|
||||
.vdev_op_nparity = NULL,
|
||||
.vdev_op_ndisks = NULL,
|
||||
.vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */
|
||||
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
|
||||
};
|
||||
|
||||
vdev_ops_t vdev_spare_ops = {
|
||||
.vdev_op_init = NULL,
|
||||
.vdev_op_fini = NULL,
|
||||
.vdev_op_open = vdev_mirror_open,
|
||||
.vdev_op_close = vdev_mirror_close,
|
||||
.vdev_op_asize = vdev_default_asize,
|
||||
.vdev_op_min_asize = vdev_default_min_asize,
|
||||
.vdev_op_min_alloc = NULL,
|
||||
.vdev_op_io_start = vdev_mirror_io_start,
|
||||
.vdev_op_io_done = vdev_mirror_io_done,
|
||||
.vdev_op_state_change = vdev_mirror_state_change,
|
||||
.vdev_op_need_resilver = NULL,
|
||||
.vdev_op_need_resilver = vdev_default_need_resilver,
|
||||
.vdev_op_hold = NULL,
|
||||
.vdev_op_rele = NULL,
|
||||
.vdev_op_remap = NULL,
|
||||
.vdev_op_xlate = vdev_default_xlate,
|
||||
.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
|
||||
.vdev_op_metaslab_init = NULL,
|
||||
.vdev_op_config_generate = NULL,
|
||||
.vdev_op_nparity = NULL,
|
||||
.vdev_op_ndisks = NULL,
|
||||
.vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */
|
||||
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
|
||||
};
|
||||
|
||||
@@ -81,9 +81,13 @@ vdev_missing_io_done(zio_t *zio)
|
||||
}
|
||||
|
||||
vdev_ops_t vdev_missing_ops = {
|
||||
.vdev_op_init = NULL,
|
||||
.vdev_op_fini = NULL,
|
||||
.vdev_op_open = vdev_missing_open,
|
||||
.vdev_op_close = vdev_missing_close,
|
||||
.vdev_op_asize = vdev_default_asize,
|
||||
.vdev_op_min_asize = vdev_default_min_asize,
|
||||
.vdev_op_min_alloc = NULL,
|
||||
.vdev_op_io_start = vdev_missing_io_start,
|
||||
.vdev_op_io_done = vdev_missing_io_done,
|
||||
.vdev_op_state_change = NULL,
|
||||
@@ -92,14 +96,23 @@ vdev_ops_t vdev_missing_ops = {
|
||||
.vdev_op_rele = NULL,
|
||||
.vdev_op_remap = NULL,
|
||||
.vdev_op_xlate = NULL,
|
||||
.vdev_op_rebuild_asize = NULL,
|
||||
.vdev_op_metaslab_init = NULL,
|
||||
.vdev_op_config_generate = NULL,
|
||||
.vdev_op_nparity = NULL,
|
||||
.vdev_op_ndisks = NULL,
|
||||
.vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */
|
||||
.vdev_op_leaf = B_TRUE /* leaf vdev */
|
||||
};
|
||||
|
||||
vdev_ops_t vdev_hole_ops = {
|
||||
.vdev_op_init = NULL,
|
||||
.vdev_op_fini = NULL,
|
||||
.vdev_op_open = vdev_missing_open,
|
||||
.vdev_op_close = vdev_missing_close,
|
||||
.vdev_op_asize = vdev_default_asize,
|
||||
.vdev_op_min_asize = vdev_default_min_asize,
|
||||
.vdev_op_min_alloc = NULL,
|
||||
.vdev_op_io_start = vdev_missing_io_start,
|
||||
.vdev_op_io_done = vdev_missing_io_done,
|
||||
.vdev_op_state_change = NULL,
|
||||
@@ -108,6 +121,11 @@ vdev_ops_t vdev_hole_ops = {
|
||||
.vdev_op_rele = NULL,
|
||||
.vdev_op_remap = NULL,
|
||||
.vdev_op_xlate = NULL,
|
||||
.vdev_op_rebuild_asize = NULL,
|
||||
.vdev_op_metaslab_init = NULL,
|
||||
.vdev_op_config_generate = NULL,
|
||||
.vdev_op_nparity = NULL,
|
||||
.vdev_op_ndisks = NULL,
|
||||
.vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */
|
||||
.vdev_op_leaf = B_TRUE /* leaf vdev */
|
||||
};
|
||||
|
||||
@@ -593,6 +593,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
|
||||
if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
|
||||
return (NULL);
|
||||
|
||||
/*
|
||||
* I/Os to distributed spares are directly dispatched to the dRAID
|
||||
* leaf vdevs for aggregation. See the comment at the end of the
|
||||
* zio_vdev_io_start() function.
|
||||
*/
|
||||
ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops);
|
||||
|
||||
first = last = zio;
|
||||
|
||||
if (zio->io_type == ZIO_TYPE_READ)
|
||||
|
||||
+1175
-821
File diff suppressed because it is too large
Load Diff
@@ -149,7 +149,7 @@ vdev_raidz_math_get_ops(void)
|
||||
* Select parity generation method for raidz_map
|
||||
*/
|
||||
int
|
||||
vdev_raidz_math_generate(raidz_map_t *rm)
|
||||
vdev_raidz_math_generate(raidz_map_t *rm, raidz_row_t *rr)
|
||||
{
|
||||
raidz_gen_f gen_parity = NULL;
|
||||
|
||||
@@ -174,7 +174,7 @@ vdev_raidz_math_generate(raidz_map_t *rm)
|
||||
if (gen_parity == NULL)
|
||||
return (RAIDZ_ORIGINAL_IMPL);
|
||||
|
||||
gen_parity(rm);
|
||||
gen_parity(rr);
|
||||
|
||||
return (0);
|
||||
}
|
||||
@@ -241,8 +241,8 @@ reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
|
||||
* @nbaddata - Number of failed data columns
|
||||
*/
|
||||
int
|
||||
vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
|
||||
const int *dt, const int nbaddata)
|
||||
vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr,
|
||||
const int *parity_valid, const int *dt, const int nbaddata)
|
||||
{
|
||||
raidz_rec_f rec_fn = NULL;
|
||||
|
||||
@@ -265,7 +265,7 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
|
||||
if (rec_fn == NULL)
|
||||
return (RAIDZ_ORIGINAL_IMPL);
|
||||
else
|
||||
return (rec_fn(rm, dt));
|
||||
return (rec_fn(rr, dt));
|
||||
}
|
||||
|
||||
const char *raidz_gen_name[] = {
|
||||
|
||||
+169
-144
@@ -26,6 +26,7 @@
|
||||
#define _VDEV_RAIDZ_MATH_IMPL_H
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/vdev_raidz_impl.h>
|
||||
|
||||
#define raidz_inline inline __attribute__((always_inline))
|
||||
#ifndef noinline
|
||||
@@ -36,33 +37,33 @@
|
||||
* Functions calculate multiplication constants for data reconstruction.
|
||||
* Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and
|
||||
* used parity columns for reconstruction.
|
||||
* @rm RAIDZ map
|
||||
* @rr RAIDZ row
|
||||
* @tgtidx array of missing data indexes
|
||||
* @coeff output array of coefficients. Array must be provided by
|
||||
* user and must hold minimum MUL_CNT values.
|
||||
*/
|
||||
static noinline void
|
||||
raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
|
||||
raidz_rec_q_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
|
||||
{
|
||||
const unsigned ncols = raidz_ncols(rm);
|
||||
const unsigned ncols = rr->rr_cols;
|
||||
const unsigned x = tgtidx[TARGET_X];
|
||||
|
||||
coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1));
|
||||
}
|
||||
|
||||
static noinline void
|
||||
raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
|
||||
raidz_rec_r_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
|
||||
{
|
||||
const unsigned ncols = raidz_ncols(rm);
|
||||
const unsigned ncols = rr->rr_cols;
|
||||
const unsigned x = tgtidx[TARGET_X];
|
||||
|
||||
coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1));
|
||||
}
|
||||
|
||||
static noinline void
|
||||
raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
|
||||
raidz_rec_pq_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
|
||||
{
|
||||
const unsigned ncols = raidz_ncols(rm);
|
||||
const unsigned ncols = rr->rr_cols;
|
||||
const unsigned x = tgtidx[TARGET_X];
|
||||
const unsigned y = tgtidx[TARGET_Y];
|
||||
gf_t a, b, e;
|
||||
@@ -76,9 +77,9 @@ raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
|
||||
}
|
||||
|
||||
static noinline void
|
||||
raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
|
||||
raidz_rec_pr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
|
||||
{
|
||||
const unsigned ncols = raidz_ncols(rm);
|
||||
const unsigned ncols = rr->rr_cols;
|
||||
const unsigned x = tgtidx[TARGET_X];
|
||||
const unsigned y = tgtidx[TARGET_Y];
|
||||
|
||||
@@ -93,9 +94,9 @@ raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
|
||||
}
|
||||
|
||||
static noinline void
|
||||
raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
|
||||
raidz_rec_qr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
|
||||
{
|
||||
const unsigned ncols = raidz_ncols(rm);
|
||||
const unsigned ncols = rr->rr_cols;
|
||||
const unsigned x = tgtidx[TARGET_X];
|
||||
const unsigned y = tgtidx[TARGET_Y];
|
||||
|
||||
@@ -114,9 +115,9 @@ raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
|
||||
}
|
||||
|
||||
static noinline void
|
||||
raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
|
||||
raidz_rec_pqr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
|
||||
{
|
||||
const unsigned ncols = raidz_ncols(rm);
|
||||
const unsigned ncols = rr->rr_cols;
|
||||
const unsigned x = tgtidx[TARGET_X];
|
||||
const unsigned y = tgtidx[TARGET_Y];
|
||||
const unsigned z = tgtidx[TARGET_Z];
|
||||
@@ -347,26 +348,26 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private)
|
||||
/*
|
||||
* Generate P parity (RAIDZ1)
|
||||
*
|
||||
* @rm RAIDZ map
|
||||
* @rr RAIDZ row
|
||||
*/
|
||||
static raidz_inline void
|
||||
raidz_generate_p_impl(raidz_map_t * const rm)
|
||||
raidz_generate_p_impl(raidz_row_t * const rr)
|
||||
{
|
||||
size_t c;
|
||||
const size_t ncols = raidz_ncols(rm);
|
||||
const size_t psize = rm->rm_col[CODE_P].rc_size;
|
||||
abd_t *pabd = rm->rm_col[CODE_P].rc_abd;
|
||||
const size_t ncols = rr->rr_cols;
|
||||
const size_t psize = rr->rr_col[CODE_P].rc_size;
|
||||
abd_t *pabd = rr->rr_col[CODE_P].rc_abd;
|
||||
size_t size;
|
||||
abd_t *dabd;
|
||||
|
||||
raidz_math_begin();
|
||||
|
||||
/* start with first data column */
|
||||
raidz_copy(pabd, rm->rm_col[1].rc_abd, psize);
|
||||
raidz_copy(pabd, rr->rr_col[1].rc_abd, psize);
|
||||
|
||||
for (c = 2; c < ncols; c++) {
|
||||
dabd = rm->rm_col[c].rc_abd;
|
||||
size = rm->rm_col[c].rc_size;
|
||||
dabd = rr->rr_col[c].rc_abd;
|
||||
size = rr->rr_col[c].rc_size;
|
||||
|
||||
/* add data column */
|
||||
raidz_add(pabd, dabd, size);
|
||||
@@ -414,29 +415,29 @@ raidz_gen_pq_add(void **c, const void *dc, const size_t csize,
|
||||
/*
|
||||
* Generate PQ parity (RAIDZ2)
|
||||
*
|
||||
* @rm RAIDZ map
|
||||
* @rr RAIDZ row
|
||||
*/
|
||||
static raidz_inline void
|
||||
raidz_generate_pq_impl(raidz_map_t * const rm)
|
||||
raidz_generate_pq_impl(raidz_row_t * const rr)
|
||||
{
|
||||
size_t c;
|
||||
const size_t ncols = raidz_ncols(rm);
|
||||
const size_t csize = rm->rm_col[CODE_P].rc_size;
|
||||
const size_t ncols = rr->rr_cols;
|
||||
const size_t csize = rr->rr_col[CODE_P].rc_size;
|
||||
size_t dsize;
|
||||
abd_t *dabd;
|
||||
abd_t *cabds[] = {
|
||||
rm->rm_col[CODE_P].rc_abd,
|
||||
rm->rm_col[CODE_Q].rc_abd
|
||||
rr->rr_col[CODE_P].rc_abd,
|
||||
rr->rr_col[CODE_Q].rc_abd
|
||||
};
|
||||
|
||||
raidz_math_begin();
|
||||
|
||||
raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize);
|
||||
raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize);
|
||||
raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize);
|
||||
raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize);
|
||||
|
||||
for (c = 3; c < ncols; c++) {
|
||||
dabd = rm->rm_col[c].rc_abd;
|
||||
dsize = rm->rm_col[c].rc_size;
|
||||
dabd = rr->rr_col[c].rc_abd;
|
||||
dsize = rr->rr_col[c].rc_size;
|
||||
|
||||
abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2,
|
||||
raidz_gen_pq_add);
|
||||
@@ -487,31 +488,31 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize,
|
||||
/*
|
||||
* Generate PQR parity (RAIDZ3)
|
||||
*
|
||||
* @rm RAIDZ map
|
||||
* @rr RAIDZ row
|
||||
*/
|
||||
static raidz_inline void
|
||||
raidz_generate_pqr_impl(raidz_map_t * const rm)
|
||||
raidz_generate_pqr_impl(raidz_row_t * const rr)
|
||||
{
|
||||
size_t c;
|
||||
const size_t ncols = raidz_ncols(rm);
|
||||
const size_t csize = rm->rm_col[CODE_P].rc_size;
|
||||
const size_t ncols = rr->rr_cols;
|
||||
const size_t csize = rr->rr_col[CODE_P].rc_size;
|
||||
size_t dsize;
|
||||
abd_t *dabd;
|
||||
abd_t *cabds[] = {
|
||||
rm->rm_col[CODE_P].rc_abd,
|
||||
rm->rm_col[CODE_Q].rc_abd,
|
||||
rm->rm_col[CODE_R].rc_abd
|
||||
rr->rr_col[CODE_P].rc_abd,
|
||||
rr->rr_col[CODE_Q].rc_abd,
|
||||
rr->rr_col[CODE_R].rc_abd
|
||||
};
|
||||
|
||||
raidz_math_begin();
|
||||
|
||||
raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize);
|
||||
raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize);
|
||||
raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize);
|
||||
raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize);
|
||||
raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize);
|
||||
raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize);
|
||||
|
||||
for (c = 4; c < ncols; c++) {
|
||||
dabd = rm->rm_col[c].rc_abd;
|
||||
dsize = rm->rm_col[c].rc_size;
|
||||
dabd = rr->rr_col[c].rc_abd;
|
||||
dsize = rr->rr_col[c].rc_size;
|
||||
|
||||
abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3,
|
||||
raidz_gen_pqr_add);
|
||||
@@ -579,33 +580,36 @@ raidz_generate_pqr_impl(raidz_map_t * const rm)
|
||||
* @syn_method raidz_add_abd()
|
||||
* @rec_method not applicable
|
||||
*
|
||||
* @rm RAIDZ map
|
||||
* @rr RAIDZ row
|
||||
* @tgtidx array of missing data indexes
|
||||
*/
|
||||
static raidz_inline int
|
||||
raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx)
|
||||
{
|
||||
size_t c;
|
||||
const size_t firstdc = raidz_parity(rm);
|
||||
const size_t ncols = raidz_ncols(rm);
|
||||
const size_t firstdc = rr->rr_firstdatacol;
|
||||
const size_t ncols = rr->rr_cols;
|
||||
const size_t x = tgtidx[TARGET_X];
|
||||
const size_t xsize = rm->rm_col[x].rc_size;
|
||||
abd_t *xabd = rm->rm_col[x].rc_abd;
|
||||
const size_t xsize = rr->rr_col[x].rc_size;
|
||||
abd_t *xabd = rr->rr_col[x].rc_abd;
|
||||
size_t size;
|
||||
abd_t *dabd;
|
||||
|
||||
if (xabd == NULL)
|
||||
return (1 << CODE_P);
|
||||
|
||||
raidz_math_begin();
|
||||
|
||||
/* copy P into target */
|
||||
raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize);
|
||||
raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize);
|
||||
|
||||
/* generate p_syndrome */
|
||||
for (c = firstdc; c < ncols; c++) {
|
||||
if (c == x)
|
||||
continue;
|
||||
|
||||
dabd = rm->rm_col[c].rc_abd;
|
||||
size = MIN(rm->rm_col[c].rc_size, xsize);
|
||||
dabd = rr->rr_col[c].rc_abd;
|
||||
size = MIN(rr->rr_col[c].rc_size, xsize);
|
||||
|
||||
raidz_add(xabd, dabd, size);
|
||||
}
|
||||
@@ -653,30 +657,33 @@ raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize,
|
||||
* @syn_method raidz_add_abd()
|
||||
* @rec_method raidz_mul_abd_cb()
|
||||
*
|
||||
* @rm RAIDZ map
|
||||
* @rr RAIDZ row
|
||||
* @tgtidx array of missing data indexes
|
||||
*/
|
||||
static raidz_inline int
|
||||
raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx)
|
||||
{
|
||||
size_t c;
|
||||
size_t dsize;
|
||||
abd_t *dabd;
|
||||
const size_t firstdc = raidz_parity(rm);
|
||||
const size_t ncols = raidz_ncols(rm);
|
||||
const size_t firstdc = rr->rr_firstdatacol;
|
||||
const size_t ncols = rr->rr_cols;
|
||||
const size_t x = tgtidx[TARGET_X];
|
||||
abd_t *xabd = rm->rm_col[x].rc_abd;
|
||||
const size_t xsize = rm->rm_col[x].rc_size;
|
||||
abd_t *xabd = rr->rr_col[x].rc_abd;
|
||||
const size_t xsize = rr->rr_col[x].rc_size;
|
||||
abd_t *tabds[] = { xabd };
|
||||
|
||||
if (xabd == NULL)
|
||||
return (1 << CODE_Q);
|
||||
|
||||
unsigned coeff[MUL_CNT];
|
||||
raidz_rec_q_coeff(rm, tgtidx, coeff);
|
||||
raidz_rec_q_coeff(rr, tgtidx, coeff);
|
||||
|
||||
raidz_math_begin();
|
||||
|
||||
/* Start with first data column if present */
|
||||
if (firstdc != x) {
|
||||
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
|
||||
} else {
|
||||
raidz_zero(xabd, xsize);
|
||||
}
|
||||
@@ -687,8 +694,8 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
dabd = NULL;
|
||||
dsize = 0;
|
||||
} else {
|
||||
dabd = rm->rm_col[c].rc_abd;
|
||||
dsize = rm->rm_col[c].rc_size;
|
||||
dabd = rr->rr_col[c].rc_abd;
|
||||
dsize = rr->rr_col[c].rc_size;
|
||||
}
|
||||
|
||||
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
|
||||
@@ -696,7 +703,7 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
}
|
||||
|
||||
/* add Q to the syndrome */
|
||||
raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize);
|
||||
raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize);
|
||||
|
||||
/* transform the syndrome */
|
||||
abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff);
|
||||
@@ -744,30 +751,33 @@ raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize,
|
||||
* @syn_method raidz_add_abd()
|
||||
* @rec_method raidz_mul_abd_cb()
|
||||
*
|
||||
* @rm RAIDZ map
|
||||
* @rr RAIDZ row
|
||||
* @tgtidx array of missing data indexes
|
||||
*/
|
||||
static raidz_inline int
|
||||
raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx)
|
||||
{
|
||||
size_t c;
|
||||
size_t dsize;
|
||||
abd_t *dabd;
|
||||
const size_t firstdc = raidz_parity(rm);
|
||||
const size_t ncols = raidz_ncols(rm);
|
||||
const size_t firstdc = rr->rr_firstdatacol;
|
||||
const size_t ncols = rr->rr_cols;
|
||||
const size_t x = tgtidx[TARGET_X];
|
||||
const size_t xsize = rm->rm_col[x].rc_size;
|
||||
abd_t *xabd = rm->rm_col[x].rc_abd;
|
||||
const size_t xsize = rr->rr_col[x].rc_size;
|
||||
abd_t *xabd = rr->rr_col[x].rc_abd;
|
||||
abd_t *tabds[] = { xabd };
|
||||
|
||||
if (xabd == NULL)
|
||||
return (1 << CODE_R);
|
||||
|
||||
unsigned coeff[MUL_CNT];
|
||||
raidz_rec_r_coeff(rm, tgtidx, coeff);
|
||||
raidz_rec_r_coeff(rr, tgtidx, coeff);
|
||||
|
||||
raidz_math_begin();
|
||||
|
||||
/* Start with first data column if present */
|
||||
if (firstdc != x) {
|
||||
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
|
||||
} else {
|
||||
raidz_zero(xabd, xsize);
|
||||
}
|
||||
@@ -779,8 +789,8 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
dabd = NULL;
|
||||
dsize = 0;
|
||||
} else {
|
||||
dabd = rm->rm_col[c].rc_abd;
|
||||
dsize = rm->rm_col[c].rc_size;
|
||||
dabd = rr->rr_col[c].rc_abd;
|
||||
dsize = rr->rr_col[c].rc_size;
|
||||
}
|
||||
|
||||
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
|
||||
@@ -788,7 +798,7 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
}
|
||||
|
||||
/* add R to the syndrome */
|
||||
raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize);
|
||||
raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize);
|
||||
|
||||
/* transform the syndrome */
|
||||
abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
|
||||
@@ -881,31 +891,34 @@ raidz_rec_pq_abd(void **tc, const size_t tsize, void **c,
|
||||
* @syn_method raidz_syn_pq_abd()
|
||||
* @rec_method raidz_rec_pq_abd()
|
||||
*
|
||||
* @rm RAIDZ map
|
||||
* @rr RAIDZ row
|
||||
* @tgtidx array of missing data indexes
|
||||
*/
|
||||
static raidz_inline int
|
||||
raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx)
|
||||
{
|
||||
size_t c;
|
||||
size_t dsize;
|
||||
abd_t *dabd;
|
||||
const size_t firstdc = raidz_parity(rm);
|
||||
const size_t ncols = raidz_ncols(rm);
|
||||
const size_t firstdc = rr->rr_firstdatacol;
|
||||
const size_t ncols = rr->rr_cols;
|
||||
const size_t x = tgtidx[TARGET_X];
|
||||
const size_t y = tgtidx[TARGET_Y];
|
||||
const size_t xsize = rm->rm_col[x].rc_size;
|
||||
const size_t ysize = rm->rm_col[y].rc_size;
|
||||
abd_t *xabd = rm->rm_col[x].rc_abd;
|
||||
abd_t *yabd = rm->rm_col[y].rc_abd;
|
||||
const size_t xsize = rr->rr_col[x].rc_size;
|
||||
const size_t ysize = rr->rr_col[y].rc_size;
|
||||
abd_t *xabd = rr->rr_col[x].rc_abd;
|
||||
abd_t *yabd = rr->rr_col[y].rc_abd;
|
||||
abd_t *tabds[2] = { xabd, yabd };
|
||||
abd_t *cabds[] = {
|
||||
rm->rm_col[CODE_P].rc_abd,
|
||||
rm->rm_col[CODE_Q].rc_abd
|
||||
rr->rr_col[CODE_P].rc_abd,
|
||||
rr->rr_col[CODE_Q].rc_abd
|
||||
};
|
||||
|
||||
if (xabd == NULL)
|
||||
return ((1 << CODE_P) | (1 << CODE_Q));
|
||||
|
||||
unsigned coeff[MUL_CNT];
|
||||
raidz_rec_pq_coeff(rm, tgtidx, coeff);
|
||||
raidz_rec_pq_coeff(rr, tgtidx, coeff);
|
||||
|
||||
/*
|
||||
* Check if some of the targets are shorter than others
|
||||
@@ -921,8 +934,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
|
||||
/* Start with first data column if present */
|
||||
if (firstdc != x) {
|
||||
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
|
||||
} else {
|
||||
raidz_zero(xabd, xsize);
|
||||
raidz_zero(yabd, xsize);
|
||||
@@ -934,8 +947,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
dabd = NULL;
|
||||
dsize = 0;
|
||||
} else {
|
||||
dabd = rm->rm_col[c].rc_abd;
|
||||
dsize = rm->rm_col[c].rc_size;
|
||||
dabd = rr->rr_col[c].rc_abd;
|
||||
dsize = rr->rr_col[c].rc_size;
|
||||
}
|
||||
|
||||
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
|
||||
@@ -946,7 +959,7 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
|
||||
/* Copy shorter targets back to the original abd buffer */
|
||||
if (ysize < xsize)
|
||||
raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
|
||||
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
|
||||
|
||||
raidz_math_end();
|
||||
|
||||
@@ -1038,30 +1051,34 @@ raidz_rec_pr_abd(void **t, const size_t tsize, void **c,
|
||||
* @syn_method raidz_syn_pr_abd()
|
||||
* @rec_method raidz_rec_pr_abd()
|
||||
*
|
||||
* @rm RAIDZ map
|
||||
* @rr RAIDZ row
|
||||
* @tgtidx array of missing data indexes
|
||||
*/
|
||||
static raidz_inline int
|
||||
raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
|
||||
{
|
||||
size_t c;
|
||||
size_t dsize;
|
||||
abd_t *dabd;
|
||||
const size_t firstdc = raidz_parity(rm);
|
||||
const size_t ncols = raidz_ncols(rm);
|
||||
const size_t firstdc = rr->rr_firstdatacol;
|
||||
const size_t ncols = rr->rr_cols;
|
||||
const size_t x = tgtidx[0];
|
||||
const size_t y = tgtidx[1];
|
||||
const size_t xsize = rm->rm_col[x].rc_size;
|
||||
const size_t ysize = rm->rm_col[y].rc_size;
|
||||
abd_t *xabd = rm->rm_col[x].rc_abd;
|
||||
abd_t *yabd = rm->rm_col[y].rc_abd;
|
||||
const size_t xsize = rr->rr_col[x].rc_size;
|
||||
const size_t ysize = rr->rr_col[y].rc_size;
|
||||
abd_t *xabd = rr->rr_col[x].rc_abd;
|
||||
abd_t *yabd = rr->rr_col[y].rc_abd;
|
||||
abd_t *tabds[2] = { xabd, yabd };
|
||||
abd_t *cabds[] = {
|
||||
rm->rm_col[CODE_P].rc_abd,
|
||||
rm->rm_col[CODE_R].rc_abd
|
||||
rr->rr_col[CODE_P].rc_abd,
|
||||
rr->rr_col[CODE_R].rc_abd
|
||||
};
|
||||
|
||||
if (xabd == NULL)
|
||||
return ((1 << CODE_P) | (1 << CODE_R));
|
||||
|
||||
unsigned coeff[MUL_CNT];
|
||||
raidz_rec_pr_coeff(rm, tgtidx, coeff);
|
||||
raidz_rec_pr_coeff(rr, tgtidx, coeff);
|
||||
|
||||
/*
|
||||
* Check if some of the targets are shorter than others.
|
||||
@@ -1077,8 +1094,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
|
||||
/* Start with first data column if present */
|
||||
if (firstdc != x) {
|
||||
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
|
||||
} else {
|
||||
raidz_zero(xabd, xsize);
|
||||
raidz_zero(yabd, xsize);
|
||||
@@ -1090,8 +1107,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
dabd = NULL;
|
||||
dsize = 0;
|
||||
} else {
|
||||
dabd = rm->rm_col[c].rc_abd;
|
||||
dsize = rm->rm_col[c].rc_size;
|
||||
dabd = rr->rr_col[c].rc_abd;
|
||||
dsize = rr->rr_col[c].rc_size;
|
||||
}
|
||||
|
||||
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
|
||||
@@ -1104,14 +1121,14 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
* Copy shorter targets back to the original abd buffer
|
||||
*/
|
||||
if (ysize < xsize)
|
||||
raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
|
||||
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
|
||||
|
||||
raidz_math_end();
|
||||
|
||||
if (ysize < xsize)
|
||||
abd_free(yabd);
|
||||
|
||||
return ((1 << CODE_P) | (1 << CODE_Q));
|
||||
return ((1 << CODE_P) | (1 << CODE_R));
|
||||
}
|
||||
|
||||
|
||||
@@ -1201,30 +1218,34 @@ raidz_rec_qr_abd(void **t, const size_t tsize, void **c,
|
||||
* @syn_method raidz_syn_qr_abd()
|
||||
* @rec_method raidz_rec_qr_abd()
|
||||
*
|
||||
* @rm RAIDZ map
|
||||
* @rr RAIDZ row
|
||||
* @tgtidx array of missing data indexes
|
||||
*/
|
||||
static raidz_inline int
|
||||
raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
|
||||
{
|
||||
size_t c;
|
||||
size_t dsize;
|
||||
abd_t *dabd;
|
||||
const size_t firstdc = raidz_parity(rm);
|
||||
const size_t ncols = raidz_ncols(rm);
|
||||
const size_t firstdc = rr->rr_firstdatacol;
|
||||
const size_t ncols = rr->rr_cols;
|
||||
const size_t x = tgtidx[TARGET_X];
|
||||
const size_t y = tgtidx[TARGET_Y];
|
||||
const size_t xsize = rm->rm_col[x].rc_size;
|
||||
const size_t ysize = rm->rm_col[y].rc_size;
|
||||
abd_t *xabd = rm->rm_col[x].rc_abd;
|
||||
abd_t *yabd = rm->rm_col[y].rc_abd;
|
||||
const size_t xsize = rr->rr_col[x].rc_size;
|
||||
const size_t ysize = rr->rr_col[y].rc_size;
|
||||
abd_t *xabd = rr->rr_col[x].rc_abd;
|
||||
abd_t *yabd = rr->rr_col[y].rc_abd;
|
||||
abd_t *tabds[2] = { xabd, yabd };
|
||||
abd_t *cabds[] = {
|
||||
rm->rm_col[CODE_Q].rc_abd,
|
||||
rm->rm_col[CODE_R].rc_abd
|
||||
rr->rr_col[CODE_Q].rc_abd,
|
||||
rr->rr_col[CODE_R].rc_abd
|
||||
};
|
||||
|
||||
if (xabd == NULL)
|
||||
return ((1 << CODE_Q) | (1 << CODE_R));
|
||||
|
||||
unsigned coeff[MUL_CNT];
|
||||
raidz_rec_qr_coeff(rm, tgtidx, coeff);
|
||||
raidz_rec_qr_coeff(rr, tgtidx, coeff);
|
||||
|
||||
/*
|
||||
* Check if some of the targets are shorter than others
|
||||
@@ -1240,8 +1261,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
|
||||
/* Start with first data column if present */
|
||||
if (firstdc != x) {
|
||||
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
|
||||
} else {
|
||||
raidz_zero(xabd, xsize);
|
||||
raidz_zero(yabd, xsize);
|
||||
@@ -1253,8 +1274,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
dabd = NULL;
|
||||
dsize = 0;
|
||||
} else {
|
||||
dabd = rm->rm_col[c].rc_abd;
|
||||
dsize = rm->rm_col[c].rc_size;
|
||||
dabd = rr->rr_col[c].rc_abd;
|
||||
dsize = rr->rr_col[c].rc_size;
|
||||
}
|
||||
|
||||
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
|
||||
@@ -1267,7 +1288,7 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
* Copy shorter targets back to the original abd buffer
|
||||
*/
|
||||
if (ysize < xsize)
|
||||
raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
|
||||
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
|
||||
|
||||
raidz_math_end();
|
||||
|
||||
@@ -1384,34 +1405,38 @@ raidz_rec_pqr_abd(void **t, const size_t tsize, void **c,
|
||||
* @syn_method raidz_syn_pqr_abd()
|
||||
* @rec_method raidz_rec_pqr_abd()
|
||||
*
|
||||
* @rm RAIDZ map
|
||||
* @rr RAIDZ row
|
||||
* @tgtidx array of missing data indexes
|
||||
*/
|
||||
static raidz_inline int
|
||||
raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
|
||||
{
|
||||
size_t c;
|
||||
size_t dsize;
|
||||
abd_t *dabd;
|
||||
const size_t firstdc = raidz_parity(rm);
|
||||
const size_t ncols = raidz_ncols(rm);
|
||||
const size_t firstdc = rr->rr_firstdatacol;
|
||||
const size_t ncols = rr->rr_cols;
|
||||
const size_t x = tgtidx[TARGET_X];
|
||||
const size_t y = tgtidx[TARGET_Y];
|
||||
const size_t z = tgtidx[TARGET_Z];
|
||||
const size_t xsize = rm->rm_col[x].rc_size;
|
||||
const size_t ysize = rm->rm_col[y].rc_size;
|
||||
const size_t zsize = rm->rm_col[z].rc_size;
|
||||
abd_t *xabd = rm->rm_col[x].rc_abd;
|
||||
abd_t *yabd = rm->rm_col[y].rc_abd;
|
||||
abd_t *zabd = rm->rm_col[z].rc_abd;
|
||||
const size_t xsize = rr->rr_col[x].rc_size;
|
||||
const size_t ysize = rr->rr_col[y].rc_size;
|
||||
const size_t zsize = rr->rr_col[z].rc_size;
|
||||
abd_t *xabd = rr->rr_col[x].rc_abd;
|
||||
abd_t *yabd = rr->rr_col[y].rc_abd;
|
||||
abd_t *zabd = rr->rr_col[z].rc_abd;
|
||||
abd_t *tabds[] = { xabd, yabd, zabd };
|
||||
abd_t *cabds[] = {
|
||||
rm->rm_col[CODE_P].rc_abd,
|
||||
rm->rm_col[CODE_Q].rc_abd,
|
||||
rm->rm_col[CODE_R].rc_abd
|
||||
rr->rr_col[CODE_P].rc_abd,
|
||||
rr->rr_col[CODE_Q].rc_abd,
|
||||
rr->rr_col[CODE_R].rc_abd
|
||||
};
|
||||
|
||||
if (xabd == NULL)
|
||||
return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
|
||||
|
||||
unsigned coeff[MUL_CNT];
|
||||
raidz_rec_pqr_coeff(rm, tgtidx, coeff);
|
||||
raidz_rec_pqr_coeff(rr, tgtidx, coeff);
|
||||
|
||||
/*
|
||||
* Check if some of the targets are shorter than others
|
||||
@@ -1431,9 +1456,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
|
||||
/* Start with first data column if present */
|
||||
if (firstdc != x) {
|
||||
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
|
||||
raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize);
|
||||
} else {
|
||||
raidz_zero(xabd, xsize);
|
||||
raidz_zero(yabd, xsize);
|
||||
@@ -1446,8 +1471,8 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
dabd = NULL;
|
||||
dsize = 0;
|
||||
} else {
|
||||
dabd = rm->rm_col[c].rc_abd;
|
||||
dsize = rm->rm_col[c].rc_size;
|
||||
dabd = rr->rr_col[c].rc_abd;
|
||||
dsize = rr->rr_col[c].rc_size;
|
||||
}
|
||||
|
||||
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
|
||||
@@ -1460,9 +1485,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
|
||||
* Copy shorter targets back to the original abd buffer
|
||||
*/
|
||||
if (ysize < xsize)
|
||||
raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
|
||||
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
|
||||
if (zsize < xsize)
|
||||
raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize);
|
||||
raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize);
|
||||
|
||||
raidz_math_end();
|
||||
|
||||
|
||||
+141
-98
@@ -25,6 +25,7 @@
|
||||
*/
|
||||
|
||||
#include <sys/vdev_impl.h>
|
||||
#include <sys/vdev_draid.h>
|
||||
#include <sys/dsl_scan.h>
|
||||
#include <sys/spa_impl.h>
|
||||
#include <sys/metaslab_impl.h>
|
||||
@@ -63,13 +64,15 @@
|
||||
*
|
||||
* Limitations:
|
||||
*
|
||||
* - Only supported for mirror vdev types. Due to the variable stripe
|
||||
* width used by raidz sequential reconstruction is not possible.
|
||||
* - Sequential reconstruction is not possible on RAIDZ due to its
|
||||
* variable stripe width. Note dRAID uses a fixed stripe width which
|
||||
* avoids this issue, but comes at the expense of some usable capacity.
|
||||
*
|
||||
* - Block checksums are not verified during sequential reconstuction.
|
||||
* - Block checksums are not verified during sequential reconstruction.
|
||||
* Similar to traditional RAID the parity/mirror data is reconstructed
|
||||
* but cannot be immediately double checked. For this reason when the
|
||||
* last active resilver completes the pool is automatically scrubbed.
|
||||
* last active resilver completes the pool is automatically scrubbed
|
||||
* by default.
|
||||
*
|
||||
* - Deferred resilvers using sequential reconstruction are not currently
|
||||
* supported. When adding another vdev to an active top-level resilver
|
||||
@@ -77,8 +80,8 @@
|
||||
*
|
||||
* Advantages:
|
||||
*
|
||||
* - Sequential reconstuction is performed in LBA order which may be faster
|
||||
* than healing reconstuction particularly when using using HDDs (or
|
||||
* - Sequential reconstruction is performed in LBA order which may be faster
|
||||
* than healing reconstruction particularly when using HDDs (or
|
||||
* especially with SMR devices). Only allocated capacity is resilvered.
|
||||
*
|
||||
* - Sequential reconstruction is not constrained by ZFS block boundaries.
|
||||
@@ -86,9 +89,9 @@
|
||||
* allowing all of these logical blocks to be repaired with a single IO.
|
||||
*
|
||||
* - Unlike a healing resilver or scrub which are pool wide operations,
|
||||
* sequential reconstruction is handled by the top-level mirror vdevs.
|
||||
* This allows for it to be started or canceled on a top-level vdev
|
||||
* without impacting any other top-level vdevs in the pool.
|
||||
* sequential reconstruction is handled by the top-level vdevs. This
|
||||
* allows for it to be started or canceled on a top-level vdev without
|
||||
* impacting any other top-level vdevs in the pool.
|
||||
*
|
||||
* - Data only referenced by a pool checkpoint will be repaired because
|
||||
* that space is reflected in the space maps. This differs for a
|
||||
@@ -97,18 +100,36 @@
|
||||
|
||||
|
||||
/*
|
||||
* Maximum number of queued rebuild I/Os per top-level vdev. The number of
|
||||
* concurrent rebuild I/Os issued to the device is controlled by the
|
||||
* zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module
|
||||
* options.
|
||||
*/
|
||||
unsigned int zfs_rebuild_queue_limit = 20;
|
||||
|
||||
/*
|
||||
* Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE.
|
||||
* Size of rebuild reads; defaults to 1MiB per data disk and is capped at
|
||||
* SPA_MAXBLOCKSIZE.
|
||||
*/
|
||||
unsigned long zfs_rebuild_max_segment = 1024 * 1024;
|
||||
|
||||
/*
|
||||
* Maximum number of parallelly executed bytes per leaf vdev caused by a
|
||||
* sequential resilver. We attempt to strike a balance here between keeping
|
||||
* the vdev queues full of I/Os at all times and not overflowing the queues
|
||||
* to cause long latency, which would cause long txg sync times.
|
||||
*
|
||||
* A large default value can be safely used here because the default target
|
||||
* segment size is also large (zfs_rebuild_max_segment=1M). This helps keep
|
||||
* the queue depth short.
|
||||
*
|
||||
* 32MB was selected as the default value to achieve good performance with
|
||||
* a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential
|
||||
* rebuild was unable to saturate all of the drives using smaller values.
|
||||
* With a value of 32MB the sequential resilver write rate was measured at
|
||||
* 800MB/s sustained while rebuilding to a distributed spare.
|
||||
*/
|
||||
unsigned long zfs_rebuild_vdev_limit = 32 << 20;
|
||||
|
||||
/*
|
||||
* Automatically start a pool scrub when the last active sequential resilver
|
||||
* completes in order to verify the checksums of all blocks which have been
|
||||
* resilvered. This option is enabled by default and is strongly recommended.
|
||||
*/
|
||||
int zfs_rebuild_scrub_enabled = 1;
|
||||
|
||||
/*
|
||||
* For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
|
||||
*/
|
||||
@@ -293,7 +314,7 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
|
||||
VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
|
||||
REBUILD_PHYS_ENTRIES, vrp, tx));
|
||||
|
||||
vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
|
||||
vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
|
||||
spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
|
||||
|
||||
spa_history_log_internal(spa, "rebuild", tx,
|
||||
@@ -306,7 +327,16 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
|
||||
vd->vdev_rebuilding = B_FALSE;
|
||||
mutex_exit(&vd->vdev_rebuild_lock);
|
||||
|
||||
spa_notify_waiters(spa);
|
||||
/*
|
||||
* While we're in syncing context take the opportunity to
|
||||
* set up the scrub when there are no more active rebuilds.
|
||||
*/
|
||||
if (!vdev_rebuild_active(spa->spa_root_vdev) &&
|
||||
zfs_rebuild_scrub_enabled) {
|
||||
pool_scan_func_t func = POOL_SCAN_SCRUB;
|
||||
dsl_scan_setup_sync(&func, tx);
|
||||
}
|
||||
|
||||
cv_broadcast(&vd->vdev_rebuild_cv);
|
||||
}
|
||||
|
||||
@@ -438,7 +468,7 @@ vdev_rebuild_cb(zio_t *zio)
|
||||
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
|
||||
vdev_t *vd = vr->vr_top_vdev;
|
||||
|
||||
mutex_enter(&vd->vdev_rebuild_io_lock);
|
||||
mutex_enter(&vr->vr_io_lock);
|
||||
if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
|
||||
/*
|
||||
* The I/O failed because the top-level vdev was unavailable.
|
||||
@@ -455,34 +485,30 @@ vdev_rebuild_cb(zio_t *zio)
|
||||
|
||||
abd_free(zio->io_abd);
|
||||
|
||||
ASSERT3U(vd->vdev_rebuild_inflight, >, 0);
|
||||
vd->vdev_rebuild_inflight--;
|
||||
cv_broadcast(&vd->vdev_rebuild_io_cv);
|
||||
mutex_exit(&vd->vdev_rebuild_io_lock);
|
||||
ASSERT3U(vr->vr_bytes_inflight, >, 0);
|
||||
vr->vr_bytes_inflight -= zio->io_size;
|
||||
cv_broadcast(&vr->vr_io_cv);
|
||||
mutex_exit(&vr->vr_io_lock);
|
||||
|
||||
spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
|
||||
}
|
||||
|
||||
/*
|
||||
* Rebuild the data in this range by constructing a special dummy block
|
||||
* pointer for the given range. It has no relation to any existing blocks
|
||||
* in the pool. But by disabling checksum verification and issuing a scrub
|
||||
* I/O mirrored vdevs will replicate the block using any available mirror
|
||||
* leaf vdevs.
|
||||
* Initialize a block pointer that can be used to read the given segment
|
||||
* for sequential rebuild.
|
||||
*/
|
||||
static void
|
||||
vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize,
|
||||
uint64_t txg)
|
||||
vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start,
|
||||
uint64_t asize)
|
||||
{
|
||||
vdev_t *vd = vr->vr_top_vdev;
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
uint64_t psize = asize;
|
||||
|
||||
ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
|
||||
ASSERT(vd->vdev_ops == &vdev_draid_ops ||
|
||||
vd->vdev_ops == &vdev_mirror_ops ||
|
||||
vd->vdev_ops == &vdev_replacing_ops ||
|
||||
vd->vdev_ops == &vdev_spare_ops);
|
||||
|
||||
blkptr_t blk, *bp = &blk;
|
||||
uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
|
||||
vdev_draid_asize_to_psize(vd, asize) : asize;
|
||||
|
||||
BP_ZERO(bp);
|
||||
|
||||
DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
|
||||
@@ -499,19 +525,6 @@ vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize,
|
||||
BP_SET_LEVEL(bp, 0);
|
||||
BP_SET_DEDUP(bp, 0);
|
||||
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
|
||||
|
||||
/*
|
||||
* We increment the issued bytes by the asize rather than the psize
|
||||
* so the scanned and issued bytes may be directly compared. This
|
||||
* is consistent with the scrub/resilver issued reporting.
|
||||
*/
|
||||
vr->vr_pass_bytes_issued += asize;
|
||||
vr->vr_rebuild_phys.vrp_bytes_issued += asize;
|
||||
|
||||
zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp,
|
||||
abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
|
||||
ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
|
||||
ZIO_FLAG_RESILVER, NULL));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -525,6 +538,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
|
||||
uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id;
|
||||
vdev_t *vd = vr->vr_top_vdev;
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
blkptr_t blk;
|
||||
|
||||
ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift);
|
||||
ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift);
|
||||
@@ -532,14 +546,26 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
|
||||
vr->vr_pass_bytes_scanned += size;
|
||||
vr->vr_rebuild_phys.vrp_bytes_scanned += size;
|
||||
|
||||
mutex_enter(&vd->vdev_rebuild_io_lock);
|
||||
/*
|
||||
* Rebuild the data in this range by constructing a special block
|
||||
* pointer. It has no relation to any existing blocks in the pool.
|
||||
* However, by disabling checksum verification and issuing a scrub IO
|
||||
* we can reconstruct and repair any children with missing data.
|
||||
*/
|
||||
vdev_rebuild_blkptr_init(&blk, vd, start, size);
|
||||
uint64_t psize = BP_GET_PSIZE(&blk);
|
||||
|
||||
if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
|
||||
return (0);
|
||||
|
||||
mutex_enter(&vr->vr_io_lock);
|
||||
|
||||
/* Limit in flight rebuild I/Os */
|
||||
while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit)
|
||||
cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
|
||||
while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max)
|
||||
cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
|
||||
|
||||
vd->vdev_rebuild_inflight++;
|
||||
mutex_exit(&vd->vdev_rebuild_io_lock);
|
||||
vr->vr_bytes_inflight += psize;
|
||||
mutex_exit(&vr->vr_io_lock);
|
||||
|
||||
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
|
||||
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
|
||||
@@ -558,45 +584,29 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
|
||||
|
||||
/* When exiting write out our progress. */
|
||||
if (vdev_rebuild_should_stop(vd)) {
|
||||
mutex_enter(&vd->vdev_rebuild_io_lock);
|
||||
vd->vdev_rebuild_inflight--;
|
||||
mutex_exit(&vd->vdev_rebuild_io_lock);
|
||||
mutex_enter(&vr->vr_io_lock);
|
||||
vr->vr_bytes_inflight -= psize;
|
||||
mutex_exit(&vr->vr_io_lock);
|
||||
spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
|
||||
mutex_exit(&vd->vdev_rebuild_lock);
|
||||
dmu_tx_commit(tx);
|
||||
return (SET_ERROR(EINTR));
|
||||
}
|
||||
mutex_exit(&vd->vdev_rebuild_lock);
|
||||
|
||||
vr->vr_scan_offset[txg & TXG_MASK] = start + size;
|
||||
vdev_rebuild_rebuild_block(vr, start, size, txg);
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
vr->vr_scan_offset[txg & TXG_MASK] = start + size;
|
||||
vr->vr_pass_bytes_issued += size;
|
||||
vr->vr_rebuild_phys.vrp_bytes_issued += size;
|
||||
|
||||
zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk,
|
||||
abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
|
||||
ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
|
||||
ZIO_FLAG_RESILVER, NULL));
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Split range into legally-sized logical chunks given the constraints of the
|
||||
* top-level mirror vdev type.
|
||||
*/
|
||||
static uint64_t
|
||||
vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size)
|
||||
{
|
||||
uint64_t chunk_size, max_asize, max_segment;
|
||||
|
||||
ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
|
||||
vd->vdev_ops == &vdev_replacing_ops ||
|
||||
vd->vdev_ops == &vdev_spare_ops);
|
||||
|
||||
max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment,
|
||||
1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE);
|
||||
max_asize = vdev_psize_to_asize(vd, max_segment);
|
||||
chunk_size = MIN(size, max_asize);
|
||||
|
||||
return (chunk_size);
|
||||
}
|
||||
|
||||
/*
|
||||
* Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree.
|
||||
*/
|
||||
@@ -625,7 +635,14 @@ vdev_rebuild_ranges(vdev_rebuild_t *vr)
|
||||
while (size > 0) {
|
||||
uint64_t chunk_size;
|
||||
|
||||
chunk_size = vdev_rebuild_chunk_size(vd, start, size);
|
||||
/*
|
||||
* Split range into legally-sized logical chunks
|
||||
* given the constraints of the top-level vdev
|
||||
* being rebuilt (dRAID or mirror).
|
||||
*/
|
||||
ASSERT3P(vd->vdev_ops, !=, NULL);
|
||||
chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd,
|
||||
start, size, zfs_rebuild_max_segment);
|
||||
|
||||
error = vdev_rebuild_range(vr, start, chunk_size);
|
||||
if (error != 0)
|
||||
@@ -747,10 +764,16 @@ vdev_rebuild_thread(void *arg)
|
||||
vr->vr_top_vdev = vd;
|
||||
vr->vr_scan_msp = NULL;
|
||||
vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
|
||||
mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
vr->vr_pass_start_time = gethrtime();
|
||||
vr->vr_pass_bytes_scanned = 0;
|
||||
vr->vr_pass_bytes_issued = 0;
|
||||
|
||||
vr->vr_bytes_inflight_max = MAX(1ULL << 20,
|
||||
zfs_rebuild_vdev_limit * vd->vdev_children);
|
||||
|
||||
uint64_t update_est_time = gethrtime();
|
||||
vdev_rebuild_update_bytes_est(vd, 0);
|
||||
|
||||
@@ -780,21 +803,32 @@ vdev_rebuild_thread(void *arg)
|
||||
|
||||
ASSERT0(range_tree_space(vr->vr_scan_tree));
|
||||
|
||||
/*
|
||||
* Disable any new allocations to this metaslab and wait
|
||||
* for any writes inflight to complete. This is needed to
|
||||
* ensure all allocated ranges are rebuilt.
|
||||
*/
|
||||
/* Disable any new allocations to this metaslab */
|
||||
metaslab_disable(msp);
|
||||
spa_config_exit(spa, SCL_CONFIG, FTAG);
|
||||
txg_wait_synced(dsl, 0);
|
||||
|
||||
mutex_enter(&msp->ms_sync_lock);
|
||||
mutex_enter(&msp->ms_lock);
|
||||
|
||||
/*
|
||||
* If there are outstanding allocations wait for them to be
|
||||
* synced. This is needed to ensure all allocated ranges are
|
||||
* on disk and therefore will be rebuilt.
|
||||
*/
|
||||
for (int j = 0; j < TXG_SIZE; j++) {
|
||||
if (range_tree_space(msp->ms_allocating[j])) {
|
||||
mutex_exit(&msp->ms_lock);
|
||||
mutex_exit(&msp->ms_sync_lock);
|
||||
txg_wait_synced(dsl, 0);
|
||||
mutex_enter(&msp->ms_sync_lock);
|
||||
mutex_enter(&msp->ms_lock);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* When a metaslab has been allocated from, read its allocated
|
||||
* ranges from the space map object in to the vr_scan_tree.
|
||||
* ranges from the space map object into the vr_scan_tree.
|
||||
* Then add inflight / unflushed ranges and remove inflight /
|
||||
* unflushed frees. This is the minimum range to be rebuilt.
|
||||
*/
|
||||
@@ -827,7 +861,7 @@ vdev_rebuild_thread(void *arg)
|
||||
/*
|
||||
* To provide an accurate estimate re-calculate the estimated
|
||||
* size every 5 minutes to account for recent allocations and
|
||||
* frees made space maps which have not yet been rebuilt.
|
||||
* frees made to space maps which have not yet been rebuilt.
|
||||
*/
|
||||
if (gethrtime() > update_est_time + SEC2NSEC(300)) {
|
||||
update_est_time = gethrtime();
|
||||
@@ -851,11 +885,14 @@ vdev_rebuild_thread(void *arg)
|
||||
spa_config_exit(spa, SCL_CONFIG, FTAG);
|
||||
|
||||
/* Wait for any remaining rebuild I/O to complete */
|
||||
mutex_enter(&vd->vdev_rebuild_io_lock);
|
||||
while (vd->vdev_rebuild_inflight > 0)
|
||||
cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
|
||||
mutex_enter(&vr->vr_io_lock);
|
||||
while (vr->vr_bytes_inflight > 0)
|
||||
cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
|
||||
|
||||
mutex_exit(&vd->vdev_rebuild_io_lock);
|
||||
mutex_exit(&vr->vr_io_lock);
|
||||
|
||||
mutex_destroy(&vr->vr_io_lock);
|
||||
cv_destroy(&vr->vr_io_cv);
|
||||
|
||||
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
|
||||
|
||||
@@ -1100,5 +1137,11 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
|
||||
|
||||
/* BEGIN CSTYLED */
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW,
|
||||
"Max segment size in bytes of rebuild reads");
|
||||
"Max segment size in bytes of rebuild reads");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW,
|
||||
"Max bytes in flight per leaf vdev for sequential resilvers");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,
|
||||
"Automatically scrub after sequential resilver completes");
|
||||
/* END CSTYLED */
|
||||
|
||||
+27
-15
@@ -250,7 +250,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
|
||||
spa_vdev_removal_t *svr = NULL;
|
||||
uint64_t txg __maybe_unused = dmu_tx_get_txg(tx);
|
||||
|
||||
ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
|
||||
ASSERT0(vdev_get_nparity(vd));
|
||||
svr = spa_vdev_removal_create(vd);
|
||||
|
||||
ASSERT(vd->vdev_removing);
|
||||
@@ -1120,7 +1120,7 @@ static void
|
||||
vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
|
||||
{
|
||||
ASSERT3P(zlist, !=, NULL);
|
||||
ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
|
||||
ASSERT0(vdev_get_nparity(vd));
|
||||
|
||||
if (vd->vdev_leaf_zap != 0) {
|
||||
char zkey[32];
|
||||
@@ -2041,7 +2041,7 @@ spa_vdev_remove_top_check(vdev_t *vd)
|
||||
|
||||
/*
|
||||
* All vdevs in normal class must have the same ashift
|
||||
* and not be raidz.
|
||||
* and not be raidz or draid.
|
||||
*/
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
int num_indirect = 0;
|
||||
@@ -2064,7 +2064,7 @@ spa_vdev_remove_top_check(vdev_t *vd)
|
||||
num_indirect++;
|
||||
if (!vdev_is_concrete(cvd))
|
||||
continue;
|
||||
if (cvd->vdev_ops == &vdev_raidz_ops)
|
||||
if (vdev_get_nparity(cvd) != 0)
|
||||
return (SET_ERROR(EINVAL));
|
||||
/*
|
||||
* Need the mirror to be mirror of leaf vdevs only
|
||||
@@ -2217,18 +2217,30 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
|
||||
* in this pool.
|
||||
*/
|
||||
if (vd == NULL || unspare) {
|
||||
if (vd == NULL)
|
||||
vd = spa_lookup_by_guid(spa, guid, B_TRUE);
|
||||
ev = spa_event_create(spa, vd, NULL,
|
||||
ESC_ZFS_VDEV_REMOVE_AUX);
|
||||
char *type;
|
||||
boolean_t draid_spare = B_FALSE;
|
||||
|
||||
vd_type = VDEV_TYPE_SPARE;
|
||||
vd_path = spa_strdup(fnvlist_lookup_string(
|
||||
nv, ZPOOL_CONFIG_PATH));
|
||||
spa_vdev_remove_aux(spa->spa_spares.sav_config,
|
||||
ZPOOL_CONFIG_SPARES, spares, nspares, nv);
|
||||
spa_load_spares(spa);
|
||||
spa->spa_spares.sav_sync = B_TRUE;
|
||||
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type)
|
||||
== 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
|
||||
draid_spare = B_TRUE;
|
||||
|
||||
if (vd == NULL && draid_spare) {
|
||||
error = SET_ERROR(ENOTSUP);
|
||||
} else {
|
||||
if (vd == NULL)
|
||||
vd = spa_lookup_by_guid(spa,
|
||||
guid, B_TRUE);
|
||||
ev = spa_event_create(spa, vd, NULL,
|
||||
ESC_ZFS_VDEV_REMOVE_AUX);
|
||||
|
||||
vd_type = VDEV_TYPE_SPARE;
|
||||
vd_path = spa_strdup(fnvlist_lookup_string(
|
||||
nv, ZPOOL_CONFIG_PATH));
|
||||
spa_vdev_remove_aux(spa->spa_spares.sav_config,
|
||||
ZPOOL_CONFIG_SPARES, spares, nspares, nv);
|
||||
spa_load_spares(spa);
|
||||
spa->spa_spares.sav_sync = B_TRUE;
|
||||
}
|
||||
} else {
|
||||
error = SET_ERROR(EBUSY);
|
||||
}
|
||||
|
||||
@@ -142,9 +142,13 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
|
||||
}
|
||||
|
||||
vdev_ops_t vdev_root_ops = {
|
||||
.vdev_op_init = NULL,
|
||||
.vdev_op_fini = NULL,
|
||||
.vdev_op_open = vdev_root_open,
|
||||
.vdev_op_close = vdev_root_close,
|
||||
.vdev_op_asize = vdev_default_asize,
|
||||
.vdev_op_min_asize = vdev_default_min_asize,
|
||||
.vdev_op_min_alloc = NULL,
|
||||
.vdev_op_io_start = NULL, /* not applicable to the root */
|
||||
.vdev_op_io_done = NULL, /* not applicable to the root */
|
||||
.vdev_op_state_change = vdev_root_state_change,
|
||||
@@ -153,6 +157,11 @@ vdev_ops_t vdev_root_ops = {
|
||||
.vdev_op_rele = NULL,
|
||||
.vdev_op_remap = NULL,
|
||||
.vdev_op_xlate = NULL,
|
||||
.vdev_op_rebuild_asize = NULL,
|
||||
.vdev_op_metaslab_init = NULL,
|
||||
.vdev_op_config_generate = NULL,
|
||||
.vdev_op_nparity = NULL,
|
||||
.vdev_op_ndisks = NULL,
|
||||
.vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */
|
||||
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
|
||||
};
|
||||
|
||||
+85
-66
@@ -311,7 +311,8 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
|
||||
vd->vdev_trim_secure = secure;
|
||||
}
|
||||
|
||||
boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED);
|
||||
vdev_trim_state_t old_state = vd->vdev_trim_state;
|
||||
boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED);
|
||||
vd->vdev_trim_state = new_state;
|
||||
|
||||
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
|
||||
@@ -332,9 +333,12 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
|
||||
"vdev=%s suspended", vd->vdev_path);
|
||||
break;
|
||||
case VDEV_TRIM_CANCELED:
|
||||
spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
|
||||
spa_history_log_internal(spa, "trim", tx,
|
||||
"vdev=%s canceled", vd->vdev_path);
|
||||
if (old_state == VDEV_TRIM_ACTIVE ||
|
||||
old_state == VDEV_TRIM_SUSPENDED) {
|
||||
spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
|
||||
spa_history_log_internal(spa, "trim", tx,
|
||||
"vdev=%s canceled", vd->vdev_path);
|
||||
}
|
||||
break;
|
||||
case VDEV_TRIM_COMPLETE:
|
||||
spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
|
||||
@@ -601,6 +605,32 @@ vdev_trim_ranges(trim_args_t *ta)
|
||||
return (0);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
|
||||
{
|
||||
uint64_t *last_rs_end = (uint64_t *)arg;
|
||||
|
||||
if (physical_rs->rs_end > *last_rs_end)
|
||||
*last_rs_end = physical_rs->rs_end;
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs)
|
||||
{
|
||||
vdev_t *vd = (vdev_t *)arg;
|
||||
|
||||
uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
|
||||
vd->vdev_trim_bytes_est += size;
|
||||
|
||||
if (vd->vdev_trim_last_offset >= physical_rs->rs_end) {
|
||||
vd->vdev_trim_bytes_done += size;
|
||||
} else if (vd->vdev_trim_last_offset > physical_rs->rs_start &&
|
||||
vd->vdev_trim_last_offset <= physical_rs->rs_end) {
|
||||
vd->vdev_trim_bytes_done +=
|
||||
vd->vdev_trim_last_offset - physical_rs->rs_start;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculates the completion percentage of a manual TRIM.
|
||||
*/
|
||||
@@ -618,27 +648,35 @@ vdev_trim_calculate_progress(vdev_t *vd)
|
||||
metaslab_t *msp = vd->vdev_top->vdev_ms[i];
|
||||
mutex_enter(&msp->ms_lock);
|
||||
|
||||
uint64_t ms_free = msp->ms_size -
|
||||
metaslab_allocated_space(msp);
|
||||
|
||||
if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
|
||||
ms_free /= vd->vdev_top->vdev_children;
|
||||
uint64_t ms_free = (msp->ms_size -
|
||||
metaslab_allocated_space(msp)) /
|
||||
vdev_get_ndisks(vd->vdev_top);
|
||||
|
||||
/*
|
||||
* Convert the metaslab range to a physical range
|
||||
* on our vdev. We use this to determine if we are
|
||||
* in the middle of this metaslab range.
|
||||
*/
|
||||
range_seg64_t logical_rs, physical_rs;
|
||||
range_seg64_t logical_rs, physical_rs, remain_rs;
|
||||
logical_rs.rs_start = msp->ms_start;
|
||||
logical_rs.rs_end = msp->ms_start + msp->ms_size;
|
||||
vdev_xlate(vd, &logical_rs, &physical_rs);
|
||||
|
||||
/* Metaslab space after this offset has not been trimmed. */
|
||||
vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
|
||||
if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
|
||||
vd->vdev_trim_bytes_est += ms_free;
|
||||
mutex_exit(&msp->ms_lock);
|
||||
continue;
|
||||
} else if (vd->vdev_trim_last_offset > physical_rs.rs_end) {
|
||||
}
|
||||
|
||||
/* Metaslab space before this offset has been trimmed */
|
||||
uint64_t last_rs_end = physical_rs.rs_end;
|
||||
if (!vdev_xlate_is_empty(&remain_rs)) {
|
||||
vdev_xlate_walk(vd, &remain_rs,
|
||||
vdev_trim_xlate_last_rs_end, &last_rs_end);
|
||||
}
|
||||
|
||||
if (vd->vdev_trim_last_offset > last_rs_end) {
|
||||
vd->vdev_trim_bytes_done += ms_free;
|
||||
vd->vdev_trim_bytes_est += ms_free;
|
||||
mutex_exit(&msp->ms_lock);
|
||||
@@ -659,21 +697,9 @@ vdev_trim_calculate_progress(vdev_t *vd)
|
||||
rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
|
||||
logical_rs.rs_start = rs_get_start(rs, rt);
|
||||
logical_rs.rs_end = rs_get_end(rs, rt);
|
||||
vdev_xlate(vd, &logical_rs, &physical_rs);
|
||||
|
||||
uint64_t size = physical_rs.rs_end -
|
||||
physical_rs.rs_start;
|
||||
vd->vdev_trim_bytes_est += size;
|
||||
if (vd->vdev_trim_last_offset >= physical_rs.rs_end) {
|
||||
vd->vdev_trim_bytes_done += size;
|
||||
} else if (vd->vdev_trim_last_offset >
|
||||
physical_rs.rs_start &&
|
||||
vd->vdev_trim_last_offset <=
|
||||
physical_rs.rs_end) {
|
||||
vd->vdev_trim_bytes_done +=
|
||||
vd->vdev_trim_last_offset -
|
||||
physical_rs.rs_start;
|
||||
}
|
||||
vdev_xlate_walk(vd, &logical_rs,
|
||||
vdev_trim_xlate_progress, vd);
|
||||
}
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
@@ -741,8 +767,38 @@ vdev_trim_load(vdev_t *vd)
|
||||
return (err);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs)
|
||||
{
|
||||
trim_args_t *ta = arg;
|
||||
vdev_t *vd = ta->trim_vdev;
|
||||
|
||||
/*
|
||||
* Only a manual trim will be traversing the vdev sequentially.
|
||||
* For an auto trim all valid ranges should be added.
|
||||
*/
|
||||
if (ta->trim_type == TRIM_TYPE_MANUAL) {
|
||||
|
||||
/* Only add segments that we have not visited yet */
|
||||
if (physical_rs->rs_end <= vd->vdev_trim_last_offset)
|
||||
return;
|
||||
|
||||
/* Pick up where we left off mid-range. */
|
||||
if (vd->vdev_trim_last_offset > physical_rs->rs_start) {
|
||||
ASSERT3U(physical_rs->rs_end, >,
|
||||
vd->vdev_trim_last_offset);
|
||||
physical_rs->rs_start = vd->vdev_trim_last_offset;
|
||||
}
|
||||
}
|
||||
|
||||
ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
|
||||
|
||||
range_tree_add(ta->trim_tree, physical_rs->rs_start,
|
||||
physical_rs->rs_end - physical_rs->rs_start);
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert the logical range into a physical range and add it to the
|
||||
* Convert the logical range into physical ranges and add them to the
|
||||
* range tree passed in the trim_args_t.
|
||||
*/
|
||||
static void
|
||||
@@ -750,7 +806,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
|
||||
{
|
||||
trim_args_t *ta = arg;
|
||||
vdev_t *vd = ta->trim_vdev;
|
||||
range_seg64_t logical_rs, physical_rs;
|
||||
range_seg64_t logical_rs;
|
||||
logical_rs.rs_start = start;
|
||||
logical_rs.rs_end = start + size;
|
||||
|
||||
@@ -767,44 +823,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
|
||||
}
|
||||
|
||||
ASSERT(vd->vdev_ops->vdev_op_leaf);
|
||||
vdev_xlate(vd, &logical_rs, &physical_rs);
|
||||
|
||||
IMPLY(vd->vdev_top == vd,
|
||||
logical_rs.rs_start == physical_rs.rs_start);
|
||||
IMPLY(vd->vdev_top == vd,
|
||||
logical_rs.rs_end == physical_rs.rs_end);
|
||||
|
||||
/*
|
||||
* Only a manual trim will be traversing the vdev sequentially.
|
||||
* For an auto trim all valid ranges should be added.
|
||||
*/
|
||||
if (ta->trim_type == TRIM_TYPE_MANUAL) {
|
||||
|
||||
/* Only add segments that we have not visited yet */
|
||||
if (physical_rs.rs_end <= vd->vdev_trim_last_offset)
|
||||
return;
|
||||
|
||||
/* Pick up where we left off mid-range. */
|
||||
if (vd->vdev_trim_last_offset > physical_rs.rs_start) {
|
||||
ASSERT3U(physical_rs.rs_end, >,
|
||||
vd->vdev_trim_last_offset);
|
||||
physical_rs.rs_start = vd->vdev_trim_last_offset;
|
||||
}
|
||||
}
|
||||
|
||||
ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
|
||||
|
||||
/*
|
||||
* With raidz, it's possible that the logical range does not live on
|
||||
* this leaf vdev. We only add the physical range to this vdev's if it
|
||||
* has a length greater than 0.
|
||||
*/
|
||||
if (physical_rs.rs_end > physical_rs.rs_start) {
|
||||
range_tree_add(ta->trim_tree, physical_rs.rs_start,
|
||||
physical_rs.rs_end - physical_rs.rs_start);
|
||||
} else {
|
||||
ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
|
||||
}
|
||||
vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
+3
-1
@@ -1111,7 +1111,9 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
|
||||
bcopy(info, report->zcr_ckinfo, sizeof (*info));
|
||||
}
|
||||
|
||||
report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
|
||||
report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
|
||||
report->zcr_align =
|
||||
vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
|
||||
report->zcr_length = length;
|
||||
|
||||
#ifdef _KERNEL
|
||||
|
||||
+30
-12
@@ -1702,16 +1702,16 @@ zio_write_compress(zio_t *zio)
|
||||
return (zio);
|
||||
} else {
|
||||
/*
|
||||
* Round up compressed size up to the ashift
|
||||
* of the smallest-ashift device, and zero the tail.
|
||||
* This ensures that the compressed size of the BP
|
||||
* (and thus compressratio property) are correct,
|
||||
* Round compressed size up to the minimum allocation
|
||||
* size of the smallest-ashift device, and zero the
|
||||
* tail. This ensures that the compressed size of the
|
||||
* BP (and thus compressratio property) are correct,
|
||||
* in that we charge for the padding used to fill out
|
||||
* the last sector.
|
||||
*/
|
||||
ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
|
||||
size_t rounded = (size_t)P2ROUNDUP(psize,
|
||||
1ULL << spa->spa_min_ashift);
|
||||
ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT);
|
||||
size_t rounded = (size_t)roundup(psize,
|
||||
spa->spa_min_alloc);
|
||||
if (rounded >= lsize) {
|
||||
compress = ZIO_COMPRESS_OFF;
|
||||
zio_buf_free(cbuf, lsize);
|
||||
@@ -3754,19 +3754,37 @@ zio_vdev_io_start(zio_t *zio)
|
||||
* However, indirect vdevs point off to other vdevs which may have
|
||||
* DTL's, so we never bypass them. The child i/os on concrete vdevs
|
||||
* will be properly bypassed instead.
|
||||
*
|
||||
* Leaf DTL_PARTIAL can be empty when a legitimate write comes from
|
||||
* a dRAID spare vdev. For example, when a dRAID spare is first
|
||||
* used, its spare blocks need to be written to but the leaf vdevs
|
||||
* of such blocks can have empty DTL_PARTIAL.
|
||||
*
|
||||
* There seemed no clean way to allow such writes while bypassing
|
||||
* spurious ones. At this point, just avoid all bypassing for dRAID
|
||||
* for correctness.
|
||||
*/
|
||||
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
|
||||
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
|
||||
zio->io_txg != 0 && /* not a delegated i/o */
|
||||
vd->vdev_ops != &vdev_indirect_ops &&
|
||||
vd->vdev_top->vdev_ops != &vdev_draid_ops &&
|
||||
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
|
||||
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
||||
zio_vdev_io_bypass(zio);
|
||||
return (zio);
|
||||
}
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ ||
|
||||
zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) {
|
||||
/*
|
||||
* Select the next best leaf I/O to process. Distributed spares are
|
||||
* excluded since they dispatch the I/O directly to a leaf vdev after
|
||||
* applying the dRAID mapping.
|
||||
*/
|
||||
if (vd->vdev_ops->vdev_op_leaf &&
|
||||
vd->vdev_ops != &vdev_draid_spare_ops &&
|
||||
(zio->io_type == ZIO_TYPE_READ ||
|
||||
zio->io_type == ZIO_TYPE_WRITE ||
|
||||
zio->io_type == ZIO_TYPE_TRIM)) {
|
||||
|
||||
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
|
||||
return (zio);
|
||||
@@ -3803,8 +3821,8 @@ zio_vdev_io_done(zio_t *zio)
|
||||
if (zio->io_delay)
|
||||
zio->io_delay = gethrtime() - zio->io_delay;
|
||||
|
||||
if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
|
||||
|
||||
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
|
||||
vd->vdev_ops != &vdev_draid_spare_ops) {
|
||||
vdev_queue_io_done(zio);
|
||||
|
||||
if (zio->io_type == ZIO_TYPE_WRITE)
|
||||
@@ -4206,7 +4224,7 @@ zio_checksum_verify(zio_t *zio)
|
||||
if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
|
||||
return (zio);
|
||||
|
||||
ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
|
||||
ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
|
||||
}
|
||||
|
||||
if ((error = zio_checksum_error(zio, &info)) != 0) {
|
||||
|
||||
@@ -265,6 +265,12 @@ zio_handle_fault_injection(zio_t *zio, int error)
|
||||
if (zio->io_type != ZIO_TYPE_READ)
|
||||
return (0);
|
||||
|
||||
/*
|
||||
* A rebuild I/O has no checksum to verify.
|
||||
*/
|
||||
if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
|
||||
return (0);
|
||||
|
||||
rw_enter(&inject_lock, RW_READER);
|
||||
|
||||
for (handler = list_head(&inject_handlers); handler != NULL;
|
||||
|
||||
Reference in New Issue
Block a user