mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 10:37:35 +03:00
4101 metaslab_debug should allow for fine-grained control 4102 space_maps should store more information about themselves 4103 space map object blocksize should be increased 4105 removing a mirrored log device results in a leaked object 4106 asynchronously load metaslab Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Sebastien Roy <seb@delphix.com> Approved by: Garrett D'Amore <garrett@damore.org> Prior to this patch, space_maps were preferred solely based on the amount of free space left in each. Unfortunately, this heuristic didn't contain any information about the make-up of that free space, which meant we could keep preferring and loading a highly fragmented space map that wouldn't actually have enough contiguous space to satisfy the allocation; then unloading that space_map and repeating the process. This change modifies the space_maps to store additional information about the contiguous space in the space_map, so that we can use this information to make a better decision about which space_map to load. This requires reallocating all space_map objects to increase their bonus buffer sizes enough to fit the new metadata. The above feature can be enabled via a new feature flag introduced by this change: com.delphix:spacemap_histogram In addition to the above, this patch allows the space_map block size to be increased. Currently the block size is set to be 4K in size, which has certain implications including the following: * 4K sector devices will not see any compression benefit * large space_maps require more metadata on-disk * large space_maps require more time to load (typically random reads) Now the space_map block size can adjust as needed up to the maximum size set via the space_map_max_blksz variable. A bug was fixed which resulted in potentially leaking an object when removing a mirrored log device. 
The previous logic for vdev_remove() did not deal with removing top-level vdevs that are interior vdevs (i.e. mirror) correctly. The problem would occur when removing a mirrored log device, and result in the DTL space map object being leaked; because top-level vdevs don't have DTL space map objects associated with them. References: https://www.illumos.org/issues/4101 https://www.illumos.org/issues/4102 https://www.illumos.org/issues/4103 https://www.illumos.org/issues/4105 https://www.illumos.org/issues/4106 https://github.com/illumos/illumos-gate/commit/0713e23 Porting notes: A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also, the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary. Ported-by: Tim Chase <tim@chase2k.com> Signed-off-by: Prakash Surya <surya1@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #2488
This commit is contained in:
committed by
Brian Behlendorf
parent
1be627f5c2
commit
93cf20764a
@@ -35,6 +35,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/gzip.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/lzjb.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/lz4.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/metaslab.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/range_tree.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/refcount.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/rrwlock.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/sa.o
|
||||
@@ -47,6 +48,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/space_reftree.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/txg.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o
|
||||
$(MODULE)-objs += @top_srcdir@/module/zfs/unique.o
|
||||
|
||||
+1
-1
@@ -1335,7 +1335,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
|
||||
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
|
||||
|
||||
/* Check for any allocated blocks beyond the first */
|
||||
if (dn->dn_phys->dn_maxblkid != 0)
|
||||
if (dn->dn_maxblkid != 0)
|
||||
goto fail;
|
||||
|
||||
mutex_enter(&dn->dn_dbufs_mtx);
|
||||
|
||||
+740
-538
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,391 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/spa.h>
|
||||
#include <sys/dmu.h>
|
||||
#include <sys/dnode.h>
|
||||
#include <sys/zio.h>
|
||||
#include <sys/range_tree.h>
|
||||
|
||||
static kmem_cache_t *range_seg_cache;
|
||||
|
||||
void
|
||||
range_tree_init(void)
|
||||
{
|
||||
ASSERT(range_seg_cache == NULL);
|
||||
range_seg_cache = kmem_cache_create("range_seg_cache",
|
||||
sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_fini(void)
|
||||
{
|
||||
kmem_cache_destroy(range_seg_cache);
|
||||
range_seg_cache = NULL;
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_stat_verify(range_tree_t *rt)
|
||||
{
|
||||
range_seg_t *rs;
|
||||
uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
|
||||
int i;
|
||||
|
||||
for (rs = avl_first(&rt->rt_root); rs != NULL;
|
||||
rs = AVL_NEXT(&rt->rt_root, rs)) {
|
||||
uint64_t size = rs->rs_end - rs->rs_start;
|
||||
int idx = highbit(size) - 1;
|
||||
|
||||
hist[idx]++;
|
||||
ASSERT3U(hist[idx], !=, 0);
|
||||
}
|
||||
|
||||
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
|
||||
if (hist[i] != rt->rt_histogram[i]) {
|
||||
zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
|
||||
i, hist, hist[i], rt->rt_histogram[i]);
|
||||
}
|
||||
VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
|
||||
{
|
||||
uint64_t size = rs->rs_end - rs->rs_start;
|
||||
int idx = highbit(size) - 1;
|
||||
|
||||
ASSERT3U(idx, <,
|
||||
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
rt->rt_histogram[idx]++;
|
||||
ASSERT3U(rt->rt_histogram[idx], !=, 0);
|
||||
}
|
||||
|
||||
static void
|
||||
range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
|
||||
{
|
||||
uint64_t size = rs->rs_end - rs->rs_start;
|
||||
int idx = highbit(size) - 1;
|
||||
|
||||
ASSERT3U(idx, <,
|
||||
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
ASSERT3U(rt->rt_histogram[idx], !=, 0);
|
||||
rt->rt_histogram[idx]--;
|
||||
}
|
||||
|
||||
/*
|
||||
* NOTE: caller is responsible for all locking.
|
||||
*/
|
||||
static int
|
||||
range_tree_seg_compare(const void *x1, const void *x2)
|
||||
{
|
||||
const range_seg_t *r1 = x1;
|
||||
const range_seg_t *r2 = x2;
|
||||
|
||||
if (r1->rs_start < r2->rs_start) {
|
||||
if (r1->rs_end > r2->rs_start)
|
||||
return (0);
|
||||
return (-1);
|
||||
}
|
||||
if (r1->rs_start > r2->rs_start) {
|
||||
if (r1->rs_start < r2->rs_end)
|
||||
return (0);
|
||||
return (1);
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
range_tree_t *
|
||||
range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
|
||||
{
|
||||
range_tree_t *rt;
|
||||
|
||||
rt = kmem_zalloc(sizeof (range_tree_t), KM_PUSHPAGE);
|
||||
|
||||
avl_create(&rt->rt_root, range_tree_seg_compare,
|
||||
sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
|
||||
|
||||
rt->rt_lock = lp;
|
||||
rt->rt_ops = ops;
|
||||
rt->rt_arg = arg;
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
rt->rt_ops->rtop_create(rt, rt->rt_arg);
|
||||
|
||||
return (rt);
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_destroy(range_tree_t *rt)
|
||||
{
|
||||
VERIFY0(rt->rt_space);
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
|
||||
|
||||
avl_destroy(&rt->rt_root);
|
||||
kmem_free(rt, sizeof (*rt));
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_add(void *arg, uint64_t start, uint64_t size)
|
||||
{
|
||||
range_tree_t *rt = arg;
|
||||
avl_index_t where;
|
||||
range_seg_t rsearch, *rs_before, *rs_after, *rs;
|
||||
uint64_t end = start + size;
|
||||
boolean_t merge_before, merge_after;
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
VERIFY(size != 0);
|
||||
|
||||
rsearch.rs_start = start;
|
||||
rsearch.rs_end = end;
|
||||
rs = avl_find(&rt->rt_root, &rsearch, &where);
|
||||
|
||||
if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
|
||||
zfs_panic_recover("zfs: allocating allocated segment"
|
||||
"(offset=%llu size=%llu)\n",
|
||||
(longlong_t)start, (longlong_t)size);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Make sure we don't overlap with either of our neighbors */
|
||||
VERIFY(rs == NULL);
|
||||
|
||||
rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
|
||||
rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
|
||||
|
||||
merge_before = (rs_before != NULL && rs_before->rs_end == start);
|
||||
merge_after = (rs_after != NULL && rs_after->rs_start == end);
|
||||
|
||||
if (merge_before && merge_after) {
|
||||
avl_remove(&rt->rt_root, rs_before);
|
||||
if (rt->rt_ops != NULL) {
|
||||
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
|
||||
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
|
||||
}
|
||||
|
||||
range_tree_stat_decr(rt, rs_before);
|
||||
range_tree_stat_decr(rt, rs_after);
|
||||
|
||||
rs_after->rs_start = rs_before->rs_start;
|
||||
kmem_cache_free(range_seg_cache, rs_before);
|
||||
rs = rs_after;
|
||||
} else if (merge_before) {
|
||||
if (rt->rt_ops != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
|
||||
|
||||
range_tree_stat_decr(rt, rs_before);
|
||||
|
||||
rs_before->rs_end = end;
|
||||
rs = rs_before;
|
||||
} else if (merge_after) {
|
||||
if (rt->rt_ops != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
|
||||
|
||||
range_tree_stat_decr(rt, rs_after);
|
||||
|
||||
rs_after->rs_start = start;
|
||||
rs = rs_after;
|
||||
} else {
|
||||
rs = kmem_cache_alloc(range_seg_cache, KM_PUSHPAGE);
|
||||
rs->rs_start = start;
|
||||
rs->rs_end = end;
|
||||
avl_insert(&rt->rt_root, rs, where);
|
||||
}
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
|
||||
|
||||
range_tree_stat_incr(rt, rs);
|
||||
rt->rt_space += size;
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_remove(void *arg, uint64_t start, uint64_t size)
|
||||
{
|
||||
range_tree_t *rt = arg;
|
||||
avl_index_t where;
|
||||
range_seg_t rsearch, *rs, *newseg;
|
||||
uint64_t end = start + size;
|
||||
boolean_t left_over, right_over;
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
VERIFY3U(size, !=, 0);
|
||||
VERIFY3U(size, <=, rt->rt_space);
|
||||
|
||||
rsearch.rs_start = start;
|
||||
rsearch.rs_end = end;
|
||||
rs = avl_find(&rt->rt_root, &rsearch, &where);
|
||||
|
||||
/* Make sure we completely overlap with someone */
|
||||
if (rs == NULL) {
|
||||
zfs_panic_recover("zfs: freeing free segment "
|
||||
"(offset=%llu size=%llu)",
|
||||
(longlong_t)start, (longlong_t)size);
|
||||
return;
|
||||
}
|
||||
VERIFY3U(rs->rs_start, <=, start);
|
||||
VERIFY3U(rs->rs_end, >=, end);
|
||||
|
||||
left_over = (rs->rs_start != start);
|
||||
right_over = (rs->rs_end != end);
|
||||
|
||||
range_tree_stat_decr(rt, rs);
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
|
||||
|
||||
if (left_over && right_over) {
|
||||
newseg = kmem_cache_alloc(range_seg_cache, KM_PUSHPAGE);
|
||||
newseg->rs_start = end;
|
||||
newseg->rs_end = rs->rs_end;
|
||||
range_tree_stat_incr(rt, newseg);
|
||||
|
||||
rs->rs_end = start;
|
||||
|
||||
avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
|
||||
if (rt->rt_ops != NULL)
|
||||
rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
|
||||
} else if (left_over) {
|
||||
rs->rs_end = start;
|
||||
} else if (right_over) {
|
||||
rs->rs_start = end;
|
||||
} else {
|
||||
avl_remove(&rt->rt_root, rs);
|
||||
kmem_cache_free(range_seg_cache, rs);
|
||||
rs = NULL;
|
||||
}
|
||||
|
||||
if (rs != NULL) {
|
||||
range_tree_stat_incr(rt, rs);
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
|
||||
}
|
||||
|
||||
rt->rt_space -= size;
|
||||
}
|
||||
|
||||
static range_seg_t *
|
||||
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size,
|
||||
avl_index_t *wherep)
|
||||
{
|
||||
range_seg_t rsearch, *rs;
|
||||
uint64_t end = start + size;
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
VERIFY(size != 0);
|
||||
|
||||
rsearch.rs_start = start;
|
||||
rsearch.rs_end = end;
|
||||
rs = avl_find(&rt->rt_root, &rsearch, wherep);
|
||||
|
||||
if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end)
|
||||
return (rs);
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
|
||||
{
|
||||
range_seg_t *rs;
|
||||
avl_index_t where;
|
||||
|
||||
mutex_enter(rt->rt_lock);
|
||||
rs = range_tree_find(rt, off, size, &where);
|
||||
if (rs != NULL)
|
||||
panic("freeing free block; rs=%p", (void *)rs);
|
||||
mutex_exit(rt->rt_lock);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
|
||||
{
|
||||
avl_index_t where;
|
||||
|
||||
return (range_tree_find(rt, start, size, &where) != NULL);
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
|
||||
{
|
||||
range_tree_t *rt;
|
||||
|
||||
ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
|
||||
ASSERT0(range_tree_space(*rtdst));
|
||||
ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
|
||||
|
||||
rt = *rtsrc;
|
||||
*rtsrc = *rtdst;
|
||||
*rtdst = rt;
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
|
||||
{
|
||||
range_seg_t *rs;
|
||||
void *cookie = NULL;
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
|
||||
|
||||
while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
|
||||
if (func != NULL)
|
||||
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
|
||||
kmem_cache_free(range_seg_cache, rs);
|
||||
}
|
||||
|
||||
bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
|
||||
rt->rt_space = 0;
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
|
||||
{
|
||||
range_seg_t *rs;
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
|
||||
for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
|
||||
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
range_tree_space(range_tree_t *rt)
|
||||
{
|
||||
return (rt->rt_space);
|
||||
}
|
||||
+24
-21
@@ -1259,6 +1259,15 @@ spa_unload(spa_t *spa)
|
||||
|
||||
bpobj_close(&spa->spa_deferred_bpobj);
|
||||
|
||||
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
|
||||
|
||||
/*
|
||||
* Close all vdevs.
|
||||
*/
|
||||
if (spa->spa_root_vdev)
|
||||
vdev_free(spa->spa_root_vdev);
|
||||
ASSERT(spa->spa_root_vdev == NULL);
|
||||
|
||||
/*
|
||||
* Close the dsl pool.
|
||||
*/
|
||||
@@ -1270,20 +1279,12 @@ spa_unload(spa_t *spa)
|
||||
|
||||
ddt_unload(spa);
|
||||
|
||||
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
|
||||
|
||||
/*
|
||||
* Drop and purge level 2 cache
|
||||
*/
|
||||
spa_l2cache_drop(spa);
|
||||
|
||||
/*
|
||||
* Close all vdevs.
|
||||
*/
|
||||
if (spa->spa_root_vdev)
|
||||
vdev_free(spa->spa_root_vdev);
|
||||
ASSERT(spa->spa_root_vdev == NULL);
|
||||
|
||||
for (i = 0; i < spa->spa_spares.sav_count; i++)
|
||||
vdev_free(spa->spa_spares.sav_vdevs[i]);
|
||||
if (spa->spa_spares.sav_vdevs) {
|
||||
@@ -4568,7 +4569,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
|
||||
vdev_dirty(tvd, VDD_DTL, newvd, txg);
|
||||
|
||||
/*
|
||||
* Restart the resilver
|
||||
* Schedule the resilver to restart in the future. We do this to
|
||||
* ensure that dmu_sync-ed blocks have been stitched into the
|
||||
* respective datasets.
|
||||
*/
|
||||
dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
|
||||
|
||||
@@ -5193,7 +5196,7 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
|
||||
ASSERT0(vd->vdev_stat.vs_alloc);
|
||||
txg = spa_vdev_config_enter(spa);
|
||||
vd->vdev_removing = B_TRUE;
|
||||
vdev_dirty(vd, 0, NULL, txg);
|
||||
vdev_dirty_leaves(vd, VDD_DTL, txg);
|
||||
vdev_config_dirty(vd);
|
||||
spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
|
||||
|
||||
@@ -5965,7 +5968,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
|
||||
ASSERT(zpool_prop_feature(nvpair_name(elem)));
|
||||
|
||||
fname = strchr(nvpair_name(elem), '@') + 1;
|
||||
VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
|
||||
VERIFY0(zfeature_lookup_name(fname, &feature));
|
||||
|
||||
spa_feature_enable(spa, feature, tx);
|
||||
spa_history_log_internal(spa, "set", tx,
|
||||
@@ -5973,7 +5976,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
|
||||
break;
|
||||
|
||||
case ZPOOL_PROP_VERSION:
|
||||
VERIFY(nvpair_value_uint64(elem, &intval) == 0);
|
||||
intval = fnvpair_value_uint64(elem);
|
||||
/*
|
||||
* The version is synced separately before other
|
||||
* properties and should be correct by now.
|
||||
@@ -5997,7 +6000,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
|
||||
*/
|
||||
break;
|
||||
case ZPOOL_PROP_COMMENT:
|
||||
VERIFY(nvpair_value_string(elem, &strval) == 0);
|
||||
strval = fnvpair_value_string(elem);
|
||||
if (spa->spa_comment != NULL)
|
||||
spa_strfree(spa->spa_comment);
|
||||
spa->spa_comment = spa_strdup(strval);
|
||||
@@ -6029,23 +6032,23 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
|
||||
|
||||
if (nvpair_type(elem) == DATA_TYPE_STRING) {
|
||||
ASSERT(proptype == PROP_TYPE_STRING);
|
||||
VERIFY(nvpair_value_string(elem, &strval) == 0);
|
||||
VERIFY(zap_update(mos,
|
||||
strval = fnvpair_value_string(elem);
|
||||
VERIFY0(zap_update(mos,
|
||||
spa->spa_pool_props_object, propname,
|
||||
1, strlen(strval) + 1, strval, tx) == 0);
|
||||
1, strlen(strval) + 1, strval, tx));
|
||||
spa_history_log_internal(spa, "set", tx,
|
||||
"%s=%s", nvpair_name(elem), strval);
|
||||
} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
|
||||
VERIFY(nvpair_value_uint64(elem, &intval) == 0);
|
||||
intval = fnvpair_value_uint64(elem);
|
||||
|
||||
if (proptype == PROP_TYPE_INDEX) {
|
||||
const char *unused;
|
||||
VERIFY(zpool_prop_index_to_string(
|
||||
prop, intval, &unused) == 0);
|
||||
VERIFY0(zpool_prop_index_to_string(
|
||||
prop, intval, &unused));
|
||||
}
|
||||
VERIFY(zap_update(mos,
|
||||
VERIFY0(zap_update(mos,
|
||||
spa->spa_pool_props_object, propname,
|
||||
8, 1, &intval, tx) == 0);
|
||||
8, 1, &intval, tx));
|
||||
spa_history_log_internal(spa, "set", tx,
|
||||
"%s=%lld", nvpair_name(elem), intval);
|
||||
} else {
|
||||
|
||||
@@ -986,7 +986,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
|
||||
txg_wait_synced(spa->spa_dsl_pool, txg);
|
||||
|
||||
if (vd != NULL) {
|
||||
ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
|
||||
ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
|
||||
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
|
||||
vdev_free(vd);
|
||||
spa_config_exit(spa, SCL_ALL, spa);
|
||||
@@ -1655,7 +1655,7 @@ spa_init(int mode)
|
||||
fm_init();
|
||||
refcount_init();
|
||||
unique_init();
|
||||
space_map_init();
|
||||
range_tree_init();
|
||||
ddt_init();
|
||||
zio_init();
|
||||
dmu_init();
|
||||
@@ -1682,7 +1682,7 @@ spa_fini(void)
|
||||
dmu_fini();
|
||||
zio_fini();
|
||||
ddt_fini();
|
||||
space_map_fini();
|
||||
range_tree_fini();
|
||||
unique_fini();
|
||||
refcount_fini();
|
||||
fm_fini();
|
||||
|
||||
+422
-466
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,159 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/range_tree.h>
|
||||
#include <sys/space_reftree.h>
|
||||
|
||||
/*
|
||||
* Space reference trees.
|
||||
*
|
||||
* A range tree is a collection of integers. Every integer is either
|
||||
* in the tree, or it's not. A space reference tree generalizes
|
||||
* the idea: it allows its members to have arbitrary reference counts,
|
||||
* as opposed to the implicit reference count of 0 or 1 in a range tree.
|
||||
* This representation comes in handy when computing the union or
|
||||
* intersection of multiple space maps. For example, the union of
|
||||
* N range trees is the subset of the reference tree with refcnt >= 1.
|
||||
* The intersection of N range trees is the subset with refcnt >= N.
|
||||
*
|
||||
* [It's very much like a Fourier transform. Unions and intersections
|
||||
* are hard to perform in the 'range tree domain', so we convert the trees
|
||||
* into the 'reference count domain', where it's trivial, then invert.]
|
||||
*
|
||||
* vdev_dtl_reassess() uses computations of this form to determine
|
||||
* DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
|
||||
* has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
|
||||
* has an outage wherever refcnt >= vdev_children.
|
||||
*/
|
||||
static int
|
||||
space_reftree_compare(const void *x1, const void *x2)
|
||||
{
|
||||
const space_ref_t *sr1 = x1;
|
||||
const space_ref_t *sr2 = x2;
|
||||
|
||||
if (sr1->sr_offset < sr2->sr_offset)
|
||||
return (-1);
|
||||
if (sr1->sr_offset > sr2->sr_offset)
|
||||
return (1);
|
||||
|
||||
if (sr1 < sr2)
|
||||
return (-1);
|
||||
if (sr1 > sr2)
|
||||
return (1);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
void
|
||||
space_reftree_create(avl_tree_t *t)
|
||||
{
|
||||
avl_create(t, space_reftree_compare,
|
||||
sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
|
||||
}
|
||||
|
||||
void
|
||||
space_reftree_destroy(avl_tree_t *t)
|
||||
{
|
||||
space_ref_t *sr;
|
||||
void *cookie = NULL;
|
||||
|
||||
while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
|
||||
kmem_free(sr, sizeof (*sr));
|
||||
|
||||
avl_destroy(t);
|
||||
}
|
||||
|
||||
static void
|
||||
space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
|
||||
{
|
||||
space_ref_t *sr;
|
||||
|
||||
sr = kmem_alloc(sizeof (*sr), KM_PUSHPAGE);
|
||||
sr->sr_offset = offset;
|
||||
sr->sr_refcnt = refcnt;
|
||||
|
||||
avl_add(t, sr);
|
||||
}
|
||||
|
||||
void
|
||||
space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
|
||||
int64_t refcnt)
|
||||
{
|
||||
space_reftree_add_node(t, start, refcnt);
|
||||
space_reftree_add_node(t, end, -refcnt);
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert (or add) a range tree into a reference tree.
|
||||
*/
|
||||
void
|
||||
space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt)
|
||||
{
|
||||
range_seg_t *rs;
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
|
||||
for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
|
||||
space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt);
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert a reference tree into a range tree. The range tree will contain
|
||||
* all members of the reference tree for which refcnt >= minref.
|
||||
*/
|
||||
void
|
||||
space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref)
|
||||
{
|
||||
uint64_t start = -1ULL;
|
||||
int64_t refcnt = 0;
|
||||
space_ref_t *sr;
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
|
||||
range_tree_vacate(rt, NULL, NULL);
|
||||
|
||||
for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
|
||||
refcnt += sr->sr_refcnt;
|
||||
if (refcnt >= minref) {
|
||||
if (start == -1ULL) {
|
||||
start = sr->sr_offset;
|
||||
}
|
||||
} else {
|
||||
if (start != -1ULL) {
|
||||
uint64_t end = sr->sr_offset;
|
||||
ASSERT(start <= end);
|
||||
if (end > start)
|
||||
range_tree_add(rt, start, end - start);
|
||||
start = -1ULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT(refcnt == 0);
|
||||
ASSERT(start == -1ULL);
|
||||
}
|
||||
+166
-141
@@ -36,6 +36,7 @@
|
||||
#include <sys/metaslab.h>
|
||||
#include <sys/metaslab_impl.h>
|
||||
#include <sys/space_map.h>
|
||||
#include <sys/space_reftree.h>
|
||||
#include <sys/zio.h>
|
||||
#include <sys/zap.h>
|
||||
#include <sys/fs/zfs.h>
|
||||
@@ -193,7 +194,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd)
|
||||
pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
|
||||
newsize = pvd->vdev_children * sizeof (vdev_t *);
|
||||
|
||||
newchild = kmem_zalloc(newsize, KM_PUSHPAGE);
|
||||
newchild = kmem_alloc(newsize, KM_PUSHPAGE);
|
||||
if (pvd->vdev_child != NULL) {
|
||||
bcopy(pvd->vdev_child, newchild, oldsize);
|
||||
kmem_free(pvd->vdev_child, oldsize);
|
||||
@@ -263,7 +264,7 @@ vdev_compact_children(vdev_t *pvd)
|
||||
if (pvd->vdev_child[c])
|
||||
newc++;
|
||||
|
||||
newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_PUSHPAGE);
|
||||
newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_PUSHPAGE);
|
||||
|
||||
for (c = newc = 0; c < oldc; c++) {
|
||||
if ((cvd = pvd->vdev_child[c]) != NULL) {
|
||||
@@ -324,7 +325,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
for (t = 0; t < DTL_TYPES; t++) {
|
||||
space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
|
||||
vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
|
||||
&vd->vdev_dtl_lock);
|
||||
}
|
||||
txg_list_create(&vd->vdev_ms_list,
|
||||
@@ -510,7 +511,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
alloctype == VDEV_ALLOC_ROOTPOOL)) {
|
||||
if (alloctype == VDEV_ALLOC_LOAD) {
|
||||
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
|
||||
&vd->vdev_dtl_smo.smo_object);
|
||||
&vd->vdev_dtl_object);
|
||||
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
|
||||
&vd->vdev_unspare);
|
||||
}
|
||||
@@ -633,9 +634,10 @@ vdev_free(vdev_t *vd)
|
||||
txg_list_destroy(&vd->vdev_dtl_list);
|
||||
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
space_map_close(vd->vdev_dtl_sm);
|
||||
for (t = 0; t < DTL_TYPES; t++) {
|
||||
space_map_unload(&vd->vdev_dtl[t]);
|
||||
space_map_destroy(&vd->vdev_dtl[t]);
|
||||
range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
|
||||
range_tree_destroy(vd->vdev_dtl[t]);
|
||||
}
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
|
||||
@@ -859,27 +861,16 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
|
||||
vd->vdev_ms_count = newc;
|
||||
|
||||
for (m = oldc; m < newc; m++) {
|
||||
space_map_obj_t smo = { 0, 0, 0 };
|
||||
uint64_t object = 0;
|
||||
|
||||
if (txg == 0) {
|
||||
uint64_t object = 0;
|
||||
error = dmu_read(mos, vd->vdev_ms_array,
|
||||
m * sizeof (uint64_t), sizeof (uint64_t), &object,
|
||||
DMU_READ_PREFETCH);
|
||||
if (error)
|
||||
return (error);
|
||||
if (object != 0) {
|
||||
dmu_buf_t *db;
|
||||
error = dmu_bonus_hold(mos, object, FTAG, &db);
|
||||
if (error)
|
||||
return (error);
|
||||
ASSERT3U(db->db_size, >=, sizeof (smo));
|
||||
bcopy(db->db_data, &smo, sizeof (smo));
|
||||
ASSERT3U(smo.smo_object, ==, object);
|
||||
dmu_buf_rele(db, FTAG);
|
||||
}
|
||||
}
|
||||
vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
|
||||
m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
|
||||
vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, m, object, txg);
|
||||
}
|
||||
|
||||
if (txg == 0)
|
||||
@@ -907,9 +898,12 @@ vdev_metaslab_fini(vdev_t *vd)
|
||||
|
||||
if (vd->vdev_ms != NULL) {
|
||||
metaslab_group_passivate(vd->vdev_mg);
|
||||
for (m = 0; m < count; m++)
|
||||
if (vd->vdev_ms[m] != NULL)
|
||||
metaslab_fini(vd->vdev_ms[m]);
|
||||
for (m = 0; m < count; m++) {
|
||||
metaslab_t *msp = vd->vdev_ms[m];
|
||||
|
||||
if (msp != NULL)
|
||||
metaslab_fini(msp);
|
||||
}
|
||||
kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
|
||||
vd->vdev_ms = NULL;
|
||||
}
|
||||
@@ -1572,9 +1566,10 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
|
||||
}
|
||||
|
||||
/*
|
||||
* Recursively initialize all labels.
|
||||
* Recursively load DTLs and initialize all labels.
|
||||
*/
|
||||
if ((error = vdev_label_init(vd, txg, isreplacing ?
|
||||
if ((error = vdev_dtl_load(vd)) != 0 ||
|
||||
(error = vdev_label_init(vd, txg, isreplacing ?
|
||||
VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
|
||||
vdev_close(vd);
|
||||
return (error);
|
||||
@@ -1610,6 +1605,18 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
|
||||
(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
|
||||
}
|
||||
|
||||
void
|
||||
vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
|
||||
{
|
||||
int c;
|
||||
|
||||
for (c = 0; c < vd->vdev_children; c++)
|
||||
vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf)
|
||||
vdev_dirty(vd->vdev_top, flags, vd, txg);
|
||||
}
|
||||
|
||||
/*
|
||||
* DTLs.
|
||||
*
|
||||
@@ -1651,31 +1658,31 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
|
||||
void
|
||||
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
|
||||
{
|
||||
space_map_t *sm = &vd->vdev_dtl[t];
|
||||
range_tree_t *rt = vd->vdev_dtl[t];
|
||||
|
||||
ASSERT(t < DTL_TYPES);
|
||||
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
|
||||
ASSERT(spa_writeable(vd->vdev_spa));
|
||||
|
||||
mutex_enter(sm->sm_lock);
|
||||
if (!space_map_contains(sm, txg, size))
|
||||
space_map_add(sm, txg, size);
|
||||
mutex_exit(sm->sm_lock);
|
||||
mutex_enter(rt->rt_lock);
|
||||
if (!range_tree_contains(rt, txg, size))
|
||||
range_tree_add(rt, txg, size);
|
||||
mutex_exit(rt->rt_lock);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
|
||||
{
|
||||
space_map_t *sm = &vd->vdev_dtl[t];
|
||||
range_tree_t *rt = vd->vdev_dtl[t];
|
||||
boolean_t dirty = B_FALSE;
|
||||
|
||||
ASSERT(t < DTL_TYPES);
|
||||
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
|
||||
|
||||
mutex_enter(sm->sm_lock);
|
||||
if (sm->sm_space != 0)
|
||||
dirty = space_map_contains(sm, txg, size);
|
||||
mutex_exit(sm->sm_lock);
|
||||
mutex_enter(rt->rt_lock);
|
||||
if (range_tree_space(rt) != 0)
|
||||
dirty = range_tree_contains(rt, txg, size);
|
||||
mutex_exit(rt->rt_lock);
|
||||
|
||||
return (dirty);
|
||||
}
|
||||
@@ -1683,12 +1690,12 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
|
||||
boolean_t
|
||||
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
|
||||
{
|
||||
space_map_t *sm = &vd->vdev_dtl[t];
|
||||
range_tree_t *rt = vd->vdev_dtl[t];
|
||||
boolean_t empty;
|
||||
|
||||
mutex_enter(sm->sm_lock);
|
||||
empty = (sm->sm_space == 0);
|
||||
mutex_exit(sm->sm_lock);
|
||||
mutex_enter(rt->rt_lock);
|
||||
empty = (range_tree_space(rt) == 0);
|
||||
mutex_exit(rt->rt_lock);
|
||||
|
||||
return (empty);
|
||||
}
|
||||
@@ -1699,14 +1706,14 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
|
||||
static uint64_t
|
||||
vdev_dtl_min(vdev_t *vd)
|
||||
{
|
||||
space_seg_t *ss;
|
||||
range_seg_t *rs;
|
||||
|
||||
ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
|
||||
ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
|
||||
ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
|
||||
ASSERT0(vd->vdev_children);
|
||||
|
||||
ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
|
||||
return (ss->ss_start - 1);
|
||||
rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
|
||||
return (rs->rs_start - 1);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1715,14 +1722,14 @@ vdev_dtl_min(vdev_t *vd)
|
||||
static uint64_t
|
||||
vdev_dtl_max(vdev_t *vd)
|
||||
{
|
||||
space_seg_t *ss;
|
||||
range_seg_t *rs;
|
||||
|
||||
ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
|
||||
ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
|
||||
ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
|
||||
ASSERT0(vd->vdev_children);
|
||||
|
||||
ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
|
||||
return (ss->ss_end);
|
||||
rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
|
||||
return (rs->rs_end);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1743,7 +1750,7 @@ vdev_dtl_should_excise(vdev_t *vd)
|
||||
ASSERT0(vd->vdev_children);
|
||||
|
||||
if (vd->vdev_resilver_txg == 0 ||
|
||||
vd->vdev_dtl[DTL_MISSING].sm_space == 0)
|
||||
range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
|
||||
return (B_TRUE);
|
||||
|
||||
/*
|
||||
@@ -1813,35 +1820,35 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
|
||||
* positive refcnt -- either 1 or 2. We then convert
|
||||
* the reference tree into the new DTL_MISSING map.
|
||||
*/
|
||||
space_map_ref_create(&reftree);
|
||||
space_map_ref_add_map(&reftree,
|
||||
&vd->vdev_dtl[DTL_MISSING], 1);
|
||||
space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
|
||||
space_map_ref_add_map(&reftree,
|
||||
&vd->vdev_dtl[DTL_SCRUB], 2);
|
||||
space_map_ref_generate_map(&reftree,
|
||||
&vd->vdev_dtl[DTL_MISSING], 1);
|
||||
space_map_ref_destroy(&reftree);
|
||||
space_reftree_create(&reftree);
|
||||
space_reftree_add_map(&reftree,
|
||||
vd->vdev_dtl[DTL_MISSING], 1);
|
||||
space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
|
||||
space_reftree_add_map(&reftree,
|
||||
vd->vdev_dtl[DTL_SCRUB], 2);
|
||||
space_reftree_generate_map(&reftree,
|
||||
vd->vdev_dtl[DTL_MISSING], 1);
|
||||
space_reftree_destroy(&reftree);
|
||||
}
|
||||
space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
|
||||
space_map_walk(&vd->vdev_dtl[DTL_MISSING],
|
||||
space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
|
||||
range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
|
||||
range_tree_walk(vd->vdev_dtl[DTL_MISSING],
|
||||
range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
|
||||
if (scrub_done)
|
||||
space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
|
||||
space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
|
||||
range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
|
||||
range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
|
||||
if (!vdev_readable(vd))
|
||||
space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
|
||||
range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
|
||||
else
|
||||
space_map_walk(&vd->vdev_dtl[DTL_MISSING],
|
||||
space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
|
||||
range_tree_walk(vd->vdev_dtl[DTL_MISSING],
|
||||
range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
|
||||
|
||||
/*
|
||||
* If the vdev was resilvering and no longer has any
|
||||
* DTLs then reset its resilvering flag.
|
||||
*/
|
||||
if (vd->vdev_resilver_txg != 0 &&
|
||||
vd->vdev_dtl[DTL_MISSING].sm_space == 0 &&
|
||||
vd->vdev_dtl[DTL_OUTAGE].sm_space == 0)
|
||||
range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
|
||||
range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0)
|
||||
vd->vdev_resilver_txg = 0;
|
||||
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
@@ -1853,6 +1860,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
|
||||
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
for (t = 0; t < DTL_TYPES; t++) {
|
||||
int c;
|
||||
|
||||
/* account for child's outage in parent's missing map */
|
||||
int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
|
||||
if (t == DTL_SCRUB)
|
||||
@@ -1863,46 +1872,56 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
|
||||
minref = vd->vdev_nparity + 1; /* RAID-Z */
|
||||
else
|
||||
minref = vd->vdev_children; /* any kind of mirror */
|
||||
space_map_ref_create(&reftree);
|
||||
space_reftree_create(&reftree);
|
||||
for (c = 0; c < vd->vdev_children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
mutex_enter(&cvd->vdev_dtl_lock);
|
||||
space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
|
||||
space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
|
||||
mutex_exit(&cvd->vdev_dtl_lock);
|
||||
}
|
||||
space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
|
||||
space_map_ref_destroy(&reftree);
|
||||
space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
|
||||
space_reftree_destroy(&reftree);
|
||||
}
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
}
|
||||
|
||||
static int
|
||||
int
|
||||
vdev_dtl_load(vdev_t *vd)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
space_map_obj_t *smo = &vd->vdev_dtl_smo;
|
||||
objset_t *mos = spa->spa_meta_objset;
|
||||
dmu_buf_t *db;
|
||||
int error;
|
||||
int error = 0;
|
||||
int c;
|
||||
|
||||
ASSERT(vd->vdev_children == 0);
|
||||
if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
|
||||
ASSERT(!vd->vdev_ishole);
|
||||
|
||||
if (smo->smo_object == 0)
|
||||
return (0);
|
||||
error = space_map_open(&vd->vdev_dtl_sm, mos,
|
||||
vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
|
||||
if (error)
|
||||
return (error);
|
||||
ASSERT(vd->vdev_dtl_sm != NULL);
|
||||
|
||||
ASSERT(!vd->vdev_ishole);
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
|
||||
/*
|
||||
* Now that we've opened the space_map we need to update
|
||||
* the in-core DTL.
|
||||
*/
|
||||
space_map_update(vd->vdev_dtl_sm);
|
||||
|
||||
error = space_map_load(vd->vdev_dtl_sm,
|
||||
vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
|
||||
if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
|
||||
return (error);
|
||||
}
|
||||
|
||||
ASSERT3U(db->db_size, >=, sizeof (*smo));
|
||||
bcopy(db->db_data, smo, sizeof (*smo));
|
||||
dmu_buf_rele(db, FTAG);
|
||||
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
|
||||
NULL, SM_ALLOC, smo, mos);
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
for (c = 0; c < vd->vdev_children; c++) {
|
||||
error = vdev_dtl_load(vd->vdev_child[c]);
|
||||
if (error != 0)
|
||||
break;
|
||||
}
|
||||
|
||||
return (error);
|
||||
}
|
||||
@@ -1911,64 +1930,74 @@ void
|
||||
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
space_map_obj_t *smo = &vd->vdev_dtl_smo;
|
||||
space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
|
||||
range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
|
||||
objset_t *mos = spa->spa_meta_objset;
|
||||
space_map_t smsync;
|
||||
kmutex_t smlock;
|
||||
dmu_buf_t *db;
|
||||
range_tree_t *rtsync;
|
||||
kmutex_t rtlock;
|
||||
dmu_tx_t *tx;
|
||||
uint64_t object = space_map_object(vd->vdev_dtl_sm);
|
||||
|
||||
ASSERT(!vd->vdev_ishole);
|
||||
ASSERT(vd->vdev_ops->vdev_op_leaf);
|
||||
|
||||
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
|
||||
|
||||
if (vd->vdev_detached) {
|
||||
if (smo->smo_object != 0) {
|
||||
VERIFY0(dmu_object_free(mos, smo->smo_object, tx));
|
||||
smo->smo_object = 0;
|
||||
}
|
||||
if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
space_map_free(vd->vdev_dtl_sm, tx);
|
||||
space_map_close(vd->vdev_dtl_sm);
|
||||
vd->vdev_dtl_sm = NULL;
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
dmu_tx_commit(tx);
|
||||
return;
|
||||
}
|
||||
|
||||
if (smo->smo_object == 0) {
|
||||
ASSERT(smo->smo_objsize == 0);
|
||||
ASSERT(smo->smo_alloc == 0);
|
||||
smo->smo_object = dmu_object_alloc(mos,
|
||||
DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
|
||||
DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
|
||||
ASSERT(smo->smo_object != 0);
|
||||
if (vd->vdev_dtl_sm == NULL) {
|
||||
uint64_t new_object;
|
||||
|
||||
new_object = space_map_alloc(mos, tx);
|
||||
VERIFY3U(new_object, !=, 0);
|
||||
|
||||
VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
|
||||
0, -1ULL, 0, &vd->vdev_dtl_lock));
|
||||
ASSERT(vd->vdev_dtl_sm != NULL);
|
||||
}
|
||||
|
||||
mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
|
||||
|
||||
rtsync = range_tree_create(NULL, NULL, &rtlock);
|
||||
|
||||
mutex_enter(&rtlock);
|
||||
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
range_tree_walk(rt, range_tree_add, rtsync);
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
|
||||
space_map_truncate(vd->vdev_dtl_sm, tx);
|
||||
space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
|
||||
range_tree_vacate(rtsync, NULL, NULL);
|
||||
|
||||
range_tree_destroy(rtsync);
|
||||
|
||||
mutex_exit(&rtlock);
|
||||
mutex_destroy(&rtlock);
|
||||
|
||||
/*
|
||||
* If the object for the space map has changed then dirty
|
||||
* the top level so that we update the config.
|
||||
*/
|
||||
if (object != space_map_object(vd->vdev_dtl_sm)) {
|
||||
zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
|
||||
"new object %llu", txg, spa_name(spa), object,
|
||||
space_map_object(vd->vdev_dtl_sm));
|
||||
vdev_config_dirty(vd->vdev_top);
|
||||
}
|
||||
|
||||
mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
|
||||
|
||||
space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
|
||||
&smlock);
|
||||
|
||||
mutex_enter(&smlock);
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
space_map_walk(sm, space_map_add, &smsync);
|
||||
space_map_update(vd->vdev_dtl_sm);
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
|
||||
space_map_truncate(smo, mos, tx);
|
||||
space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
|
||||
space_map_vacate(&smsync, NULL, NULL);
|
||||
|
||||
space_map_destroy(&smsync);
|
||||
|
||||
mutex_exit(&smlock);
|
||||
mutex_destroy(&smlock);
|
||||
|
||||
VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
|
||||
dmu_buf_will_dirty(db, tx);
|
||||
ASSERT3U(db->db_size, >=, sizeof (*smo));
|
||||
bcopy(smo, db->db_data, sizeof (*smo));
|
||||
dmu_buf_rele(db, FTAG);
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2018,7 +2047,7 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
|
||||
|
||||
if (vd->vdev_children == 0) {
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
|
||||
if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
|
||||
vdev_writeable(vd)) {
|
||||
|
||||
thismin = vdev_dtl_min(vd);
|
||||
@@ -2126,29 +2155,25 @@ vdev_remove(vdev_t *vd, uint64_t txg)
|
||||
|
||||
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
|
||||
|
||||
if (vd->vdev_dtl_smo.smo_object) {
|
||||
ASSERT0(vd->vdev_dtl_smo.smo_alloc);
|
||||
(void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
|
||||
vd->vdev_dtl_smo.smo_object = 0;
|
||||
}
|
||||
|
||||
if (vd->vdev_ms != NULL) {
|
||||
for (m = 0; m < vd->vdev_ms_count; m++) {
|
||||
metaslab_t *msp = vd->vdev_ms[m];
|
||||
|
||||
if (msp == NULL || msp->ms_smo.smo_object == 0)
|
||||
if (msp == NULL || msp->ms_sm == NULL)
|
||||
continue;
|
||||
|
||||
ASSERT0(msp->ms_smo.smo_alloc);
|
||||
(void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
|
||||
msp->ms_smo.smo_object = 0;
|
||||
mutex_enter(&msp->ms_lock);
|
||||
VERIFY0(space_map_allocated(msp->ms_sm));
|
||||
space_map_free(msp->ms_sm, tx);
|
||||
space_map_close(msp->ms_sm);
|
||||
msp->ms_sm = NULL;
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
}
|
||||
|
||||
if (vd->vdev_ms_array) {
|
||||
(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
|
||||
vd->vdev_ms_array = 0;
|
||||
vd->vdev_ms_shift = 0;
|
||||
}
|
||||
dmu_tx_commit(tx);
|
||||
}
|
||||
|
||||
@@ -283,9 +283,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
|
||||
vd->vdev_removing);
|
||||
}
|
||||
|
||||
if (vd->vdev_dtl_smo.smo_object != 0)
|
||||
if (vd->vdev_dtl_sm != NULL) {
|
||||
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
|
||||
vd->vdev_dtl_smo.smo_object);
|
||||
space_map_object(vd->vdev_dtl_sm));
|
||||
}
|
||||
|
||||
if (vd->vdev_crtxg)
|
||||
fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
|
||||
|
||||
+22
-12
@@ -369,36 +369,46 @@ spa_feature_enable(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
|
||||
spa->spa_feat_desc_obj, feature, FEATURE_ACTION_ENABLE, tx));
|
||||
}
|
||||
|
||||
/*
|
||||
* If the specified feature has not yet been enabled, this function returns
|
||||
* ENOTSUP; otherwise, this function increments the feature's refcount (or
|
||||
* returns EOVERFLOW if the refcount cannot be incremented). This function must
|
||||
* be called from syncing context.
|
||||
*/
|
||||
void
|
||||
spa_feature_incr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
|
||||
{
|
||||
ASSERT(dmu_tx_is_syncing(tx));
|
||||
ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
|
||||
VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
|
||||
spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
|
||||
spa->spa_feat_desc_obj, feature, FEATURE_ACTION_INCR, tx));
|
||||
}
|
||||
|
||||
/*
|
||||
* If the specified feature has not yet been enabled, this function returns
|
||||
* ENOTSUP; otherwise, this function decrements the feature's refcount (or
|
||||
* returns EOVERFLOW if the refcount is already 0). This function must
|
||||
* be called from syncing context.
|
||||
*/
|
||||
void
|
||||
spa_feature_decr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
|
||||
{
|
||||
ASSERT(dmu_tx_is_syncing(tx));
|
||||
ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
|
||||
VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
|
||||
spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
|
||||
spa->spa_feat_desc_obj, feature, FEATURE_ACTION_DECR, tx));
|
||||
}
|
||||
|
||||
/*
|
||||
* This interface is for debugging only. Normal consumers should use
|
||||
* spa_feature_is_enabled/spa_feature_is_active.
|
||||
*/
|
||||
int
|
||||
spa_feature_get_refcount(spa_t *spa, zfeature_info_t *feature)
|
||||
{
|
||||
int err;
|
||||
uint64_t refcount = 0;
|
||||
|
||||
if (spa_version(spa) < SPA_VERSION_FEATURES)
|
||||
return (B_FALSE);
|
||||
|
||||
err = feature_get_refcount(spa->spa_meta_objset,
|
||||
spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
|
||||
feature, &refcount);
|
||||
ASSERT(err == 0 || err == ENOTSUP);
|
||||
return (err == 0 ? refcount : 0);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
spa_feature_is_enabled(spa_t *spa, zfeature_info_t *feature)
|
||||
{
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||
*/
|
||||
|
||||
@@ -164,4 +164,7 @@ zpool_feature_init(void)
|
||||
zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
|
||||
"org.illumos:lz4_compress", "lz4_compress",
|
||||
"LZ4 compression algorithm support.", B_FALSE, B_FALSE, NULL);
|
||||
zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
|
||||
"com.delphix:spacemap_histogram", "spacemap_histogram",
|
||||
"Spacemaps maintain space histograms.", B_TRUE, B_FALSE, NULL);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user