mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-25 02:49:32 +03:00
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>

Prior to this patch, space maps were preferred solely based on the amount of
free space left in each. Unfortunately, this heuristic didn't contain any
information about the make-up of that free space, which meant we could keep
preferring and loading a highly fragmented space map that wouldn't actually
have enough contiguous space to satisfy the allocation, then unloading that
space map and repeating the process.

This change modifies the space maps to store additional information about the
contiguous space in each space map, so that we can use this information to
make a better decision about which space map to load. This requires
reallocating all space map objects to increase their bonus buffer sizes
enough to fit the new metadata.

The above feature can be enabled via a new feature flag introduced by this
change: com.delphix:spacemap_histogram

In addition to the above, this patch allows the space map block size to be
increased. Currently the block size is fixed at 4K, which has certain
implications, including the following:

    * 4K sector devices will not see any compression benefit
    * large space maps require more metadata on-disk
    * large space maps require more time to load (typically random reads)

Now the space map block size can grow as needed, up to the maximum size set
via the space_map_max_blksz variable.

A bug was also fixed which could result in a leaked object when removing a
mirrored log device. The previous vdev_remove() logic did not correctly
handle removing top-level vdevs that are interior vdevs (i.e. mirrors). The
problem would occur when removing a mirrored log device and result in the DTL
space map object being leaked, because top-level vdevs don't have DTL space
map objects associated with them.

References:
  https://www.illumos.org/issues/4101
  https://www.illumos.org/issues/4102
  https://www.illumos.org/issues/4103
  https://www.illumos.org/issues/4105
  https://www.illumos.org/issues/4106
  https://github.com/illumos/illumos-gate/commit/0713e23

Porting notes:

A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also, the
KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.

Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
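As an illustration only (not part of the patch): a minimal C sketch of why a
free-space histogram helps. It assumes the bucket semantics introduced by
this commit (bucket i counts free segments with
2^(i+shift) <= size < 2^(i+shift+1)); the function names below are
hypothetical and do not exist in ZFS. The idea is that an allocator can rule
out a fragmented metaslab from the histogram alone, without paying the cost
of loading its space map. The real selection logic in the patch differs in
detail.

    #include <stdint.h>

    /* Hypothetical helper: 1-based index of the highest set bit. */
    static int
    highbit64_sketch(uint64_t x)
    {
        int h = 0;
        while (x != 0) {
            h++;
            x >>= 1;
        }
        return (h);
    }

    /*
     * Conservatively decide, from the histogram alone, whether some free
     * segment is guaranteed to be at least `size` bytes. A segment in
     * bucket i is only guaranteed to be >= 2^(i+shift), so we start at the
     * first bucket whose minimum already exceeds the request.
     */
    static int
    histogram_can_satisfy(const uint64_t *histo, int nbuckets, int shift,
        uint64_t size)
    {
        int i = highbit64_sketch(size) - shift; /* 2^(i+shift) > size here */

        for (i = (i < 0 ? 0 : i); i < nbuckets; i++) {
            if (histo[i] != 0)
                return (1);
        }
        return (0);
    }

A negative answer here is definitive (no bucket above the request holds any
segment), while skipping the request's own bucket makes a positive answer
slightly conservative; that trade-off is what avoids the load/unload loop
described above.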
This commit is contained in:
parent
1be627f5c2
commit
93cf20764a
233 cmd/zdb/zdb.c
@@ -246,7 +246,7 @@ const char histo_stars[] = "****************************************";
 const int histo_width = sizeof (histo_stars) - 1;
 
 static void
-dump_histogram(const uint64_t *histo, int size)
+dump_histogram(const uint64_t *histo, int size, int offset)
 {
     int i;
     int minidx = size - 1;
@@ -267,7 +267,7 @@ dump_histogram(const uint64_t *histo, int size)
 
     for (i = minidx; i <= maxidx; i++) {
         (void) printf("\t\t\t%3u: %6llu %s\n",
-            i, (u_longlong_t)histo[i],
+            i + offset, (u_longlong_t)histo[i],
             &histo_stars[(max - histo[i]) * histo_width / max]);
     }
 }
@@ -320,19 +320,19 @@ dump_zap_stats(objset_t *os, uint64_t object)
         (u_longlong_t)zs.zs_salt);
 
     (void) printf("\t\tLeafs with 2^n pointers:\n");
-    dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE);
+    dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
 
     (void) printf("\t\tBlocks with n*5 entries:\n");
-    dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE);
+    dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
 
     (void) printf("\t\tBlocks n/10 full:\n");
-    dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE);
+    dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
 
     (void) printf("\t\tEntries with n chunks:\n");
-    dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE);
+    dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
 
     (void) printf("\t\tBuckets with n entries:\n");
-    dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE);
+    dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
 }
 
 /*ARGSUSED*/
@@ -521,26 +521,87 @@ dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
     zap_cursor_fini(&zc);
 }
 
+int
+get_dtl_refcount(vdev_t *vd)
+{
+    int refcount = 0;
+    int c;
+
+    if (vd->vdev_ops->vdev_op_leaf) {
+        space_map_t *sm = vd->vdev_dtl_sm;
+
+        if (sm != NULL &&
+            sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
+            return (1);
+        return (0);
+    }
+
+    for (c = 0; c < vd->vdev_children; c++)
+        refcount += get_dtl_refcount(vd->vdev_child[c]);
+    return (refcount);
+}
+
+int
+get_metaslab_refcount(vdev_t *vd)
+{
+    int refcount = 0;
+    int c, m;
+
+    if (vd->vdev_top == vd) {
+        for (m = 0; m < vd->vdev_ms_count; m++) {
+            space_map_t *sm = vd->vdev_ms[m]->ms_sm;
+
+            if (sm != NULL &&
+                sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
+                refcount++;
+        }
+    }
+    for (c = 0; c < vd->vdev_children; c++)
+        refcount += get_metaslab_refcount(vd->vdev_child[c]);
+
+    return (refcount);
+}
+
+static int
+verify_spacemap_refcounts(spa_t *spa)
+{
+    int expected_refcount, actual_refcount;
+
+    expected_refcount = spa_feature_get_refcount(spa,
+        &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM]);
+    actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
+    actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
+
+    if (expected_refcount != actual_refcount) {
+        (void) printf("space map refcount mismatch: expected %d != "
+            "actual %d\n", expected_refcount, actual_refcount);
+        return (2);
+    }
+    return (0);
+}
+
 static void
-dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
+dump_spacemap(objset_t *os, space_map_t *sm)
 {
     uint64_t alloc, offset, entry;
-    uint8_t mapshift = sm->sm_shift;
-    uint64_t mapstart = sm->sm_start;
     char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
         "INVALID", "INVALID", "INVALID", "INVALID" };
 
-    if (smo->smo_object == 0)
+    if (sm == NULL)
         return;
 
     /*
      * Print out the freelist entries in both encoded and decoded form.
      */
     alloc = 0;
-    for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
-        VERIFY3U(0, ==, dmu_read(os, smo->smo_object, offset,
+    for (offset = 0; offset < space_map_length(sm);
+        offset += sizeof (entry)) {
+        uint8_t mapshift = sm->sm_shift;
+
+        VERIFY0(dmu_read(os, space_map_object(sm), offset,
             sizeof (entry), &entry, DMU_READ_PREFETCH));
         if (SM_DEBUG_DECODE(entry)) {
 
             (void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
                 (u_longlong_t)(offset / sizeof (entry)),
                 ddata[SM_DEBUG_ACTION_DECODE(entry)],
@@ -552,10 +613,10 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
                 (u_longlong_t)(offset / sizeof (entry)),
                 SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
                 (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
-                mapshift) + mapstart),
+                mapshift) + sm->sm_start),
                 (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
-                mapshift) + mapstart + (SM_RUN_DECODE(entry) <<
-                mapshift)),
+                mapshift) + sm->sm_start +
+                (SM_RUN_DECODE(entry) << mapshift)),
                 (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
             if (SM_TYPE_DECODE(entry) == SM_ALLOC)
                 alloc += SM_RUN_DECODE(entry) << mapshift;
@@ -563,10 +624,10 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
                 alloc -= SM_RUN_DECODE(entry) << mapshift;
         }
     }
-    if (alloc != smo->smo_alloc) {
+    if (alloc != space_map_allocated(sm)) {
         (void) printf("space_map_object alloc (%llu) INCONSISTENT "
             "with space map summary (%llu)\n",
-            (u_longlong_t)smo->smo_alloc, (u_longlong_t)alloc);
+            (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
     }
 }
 
@@ -574,15 +635,17 @@ static void
 dump_metaslab_stats(metaslab_t *msp)
 {
     char maxbuf[32];
-    space_map_t *sm = msp->ms_map;
-    avl_tree_t *t = sm->sm_pp_root;
-    int free_pct = sm->sm_space * 100 / sm->sm_size;
+    range_tree_t *rt = msp->ms_tree;
+    avl_tree_t *t = &msp->ms_size_tree;
+    int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
-    zdb_nicenum(space_map_maxsize(sm), maxbuf);
+    zdb_nicenum(metaslab_block_maxsize(msp), maxbuf);
 
     (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
         "segments", avl_numnodes(t), "maxsize", maxbuf,
         "freepct", free_pct);
+    (void) printf("\tIn-memory histogram:\n");
+    dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 static void
@@ -590,33 +653,45 @@ dump_metaslab(metaslab_t *msp)
 {
     vdev_t *vd = msp->ms_group->mg_vd;
     spa_t *spa = vd->vdev_spa;
-    space_map_t *sm = msp->ms_map;
-    space_map_obj_t *smo = &msp->ms_smo;
+    space_map_t *sm = msp->ms_sm;
    char freebuf[32];
 
-    zdb_nicenum(sm->sm_size - smo->smo_alloc, freebuf);
+    zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf);
 
     (void) printf(
         "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free %5s\n",
-        (u_longlong_t)(sm->sm_start / sm->sm_size),
-        (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf);
+        (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
+        (u_longlong_t)space_map_object(sm), freebuf);
 
-    if (dump_opt['m'] > 1 && !dump_opt['L']) {
+    if (dump_opt['m'] > 2 && !dump_opt['L']) {
         mutex_enter(&msp->ms_lock);
-        space_map_load_wait(sm);
-        if (!sm->sm_loaded)
-            VERIFY(space_map_load(sm, zfs_metaslab_ops,
-                SM_FREE, smo, spa->spa_meta_objset) == 0);
+        metaslab_load_wait(msp);
+        if (!msp->ms_loaded) {
+            VERIFY0(metaslab_load(msp));
+            range_tree_stat_verify(msp->ms_tree);
+        }
         dump_metaslab_stats(msp);
-        space_map_unload(sm);
+        metaslab_unload(msp);
         mutex_exit(&msp->ms_lock);
     }
 
-    if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
-        ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift));
+    if (dump_opt['m'] > 1 && sm != NULL &&
+        spa_feature_is_active(spa,
+        &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM])) {
+        /*
+         * The space map histogram represents free space in chunks
+         * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
+         */
+        (void) printf("\tOn-disk histogram:\n");
+        dump_histogram(sm->sm_phys->smp_histogram,
+            SPACE_MAP_HISTOGRAM_SIZE(sm), sm->sm_shift);
+    }
+
+    if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
+        ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
 
         mutex_enter(&msp->ms_lock);
-        dump_spacemap(spa->spa_meta_objset, smo, sm);
+        dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
         mutex_exit(&msp->ms_lock);
     }
 }
@@ -812,9 +887,9 @@ dump_all_ddts(spa_t *spa)
 }
 
 static void
-dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
+dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
-    char *prefix = (void *)sm;
+    char *prefix = arg;
 
     (void) printf("%s [%llu,%llu) length %llu\n",
         prefix,
@@ -845,17 +920,17 @@ dump_dtl(vdev_t *vd, int indent)
         required ? "DTL-required" : "DTL-expendable");
 
     for (t = 0; t < DTL_TYPES; t++) {
-        space_map_t *sm = &vd->vdev_dtl[t];
-        if (sm->sm_space == 0)
+        range_tree_t *rt = vd->vdev_dtl[t];
+        if (range_tree_space(rt) == 0)
             continue;
         (void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
             indent + 2, "", name[t]);
-        mutex_enter(sm->sm_lock);
-        space_map_walk(sm, dump_dtl_seg, (void *)prefix);
-        mutex_exit(sm->sm_lock);
+        mutex_enter(rt->rt_lock);
+        range_tree_walk(rt, dump_dtl_seg, prefix);
+        mutex_exit(rt->rt_lock);
         if (dump_opt['d'] > 5 && vd->vdev_children == 0)
             dump_spacemap(spa->spa_meta_objset,
-                &vd->vdev_dtl_smo, sm);
+                vd->vdev_dtl_sm);
     }
 
     for (c = 0; c < vd->vdev_children; c++)
@@ -2261,39 +2336,17 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 }
 
 static void
-zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
+zdb_leak(void *arg, uint64_t start, uint64_t size)
 {
-    vdev_t *vd = sm->sm_ppd;
+    vdev_t *vd = arg;
 
     (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
         (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
 }
 
-/* ARGSUSED */
-static void
-zdb_space_map_load(space_map_t *sm)
-{
-}
-
-static void
-zdb_space_map_unload(space_map_t *sm)
-{
-    space_map_vacate(sm, zdb_leak, sm);
-}
-
-/* ARGSUSED */
-static void
-zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
-}
-
-static space_map_ops_t zdb_space_map_ops = {
-    zdb_space_map_load,
-    zdb_space_map_unload,
+static metaslab_ops_t zdb_metaslab_ops = {
     NULL,   /* alloc */
-    zdb_space_map_claim,
     NULL,   /* free */
-    NULL    /* maxsize */
+    NULL    /* fragmented */
 };
 
 static void
@@ -2350,11 +2403,21 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
             for (m = 0; m < vd->vdev_ms_count; m++) {
                 metaslab_t *msp = vd->vdev_ms[m];
                 mutex_enter(&msp->ms_lock);
-                space_map_unload(msp->ms_map);
-                VERIFY(space_map_load(msp->ms_map,
-                    &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo,
-                    spa->spa_meta_objset) == 0);
-                msp->ms_map->sm_ppd = vd;
+                metaslab_unload(msp);
+
+                /*
+                 * For leak detection, we overload the metaslab
+                 * ms_tree to contain allocated segments
+                 * instead of free segments. As a result,
+                 * we can't use the normal metaslab_load/unload
+                 * interfaces.
+                 */
+                if (msp->ms_sm != NULL) {
+                    msp->ms_ops = &zdb_metaslab_ops;
+                    VERIFY0(space_map_load(msp->ms_sm,
+                        msp->ms_tree, SM_ALLOC));
+                    msp->ms_loaded = B_TRUE;
+                }
                 mutex_exit(&msp->ms_lock);
             }
         }
@@ -2379,7 +2442,20 @@ zdb_leak_fini(spa_t *spa)
             for (m = 0; m < vd->vdev_ms_count; m++) {
                 metaslab_t *msp = vd->vdev_ms[m];
                 mutex_enter(&msp->ms_lock);
-                space_map_unload(msp->ms_map);
+
+                /*
+                 * The ms_tree has been overloaded to
+                 * contain allocated segments. Now that we
+                 * finished traversing all blocks, any
+                 * block that remains in the ms_tree
+                 * represents an allocated block that we
+                 * did not claim during the traversal.
+                 * Claimed blocks would have been removed
+                 * from the ms_tree.
+                 */
+                range_tree_vacate(msp->ms_tree, zdb_leak, vd);
+                msp->ms_loaded = B_FALSE;
+
                 mutex_exit(&msp->ms_lock);
             }
         }
@@ -2596,7 +2672,7 @@ dump_block_stats(spa_t *spa)
                 "(in 512-byte sectors): "
                 "number of blocks\n");
             dump_histogram(zb->zb_psize_histogram,
-                PSIZE_HISTO_SIZE);
+                PSIZE_HISTO_SIZE, 0);
         }
     }
 }
@@ -2769,6 +2845,9 @@ dump_zpool(spa_t *spa)
     if (dump_opt['b'] || dump_opt['c'])
         rc = dump_block_stats(spa);
 
+    if (rc == 0)
+        rc = verify_spacemap_refcounts(spa);
+
     if (dump_opt['s'])
         show_pool_stats(spa);
 
@@ -32,12 +32,14 @@ COMMON_H = \
 	$(top_srcdir)/include/sys/metaslab_impl.h \
 	$(top_srcdir)/include/sys/nvpair.h \
 	$(top_srcdir)/include/sys/nvpair_impl.h \
+	$(top_srcdir)/include/sys/range_tree.h \
 	$(top_srcdir)/include/sys/refcount.h \
 	$(top_srcdir)/include/sys/rrwlock.h \
 	$(top_srcdir)/include/sys/sa.h \
 	$(top_srcdir)/include/sys/sa_impl.h \
 	$(top_srcdir)/include/sys/spa_boot.h \
 	$(top_srcdir)/include/sys/space_map.h \
+	$(top_srcdir)/include/sys/space_reftree.h \
 	$(top_srcdir)/include/sys/spa.h \
 	$(top_srcdir)/include/sys/spa_impl.h \
 	$(top_srcdir)/include/sys/txg.h \
@@ -20,7 +20,7 @@
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_H
@@ -36,14 +36,25 @@
 extern "C" {
 #endif
 
-extern space_map_ops_t *zfs_metaslab_ops;
+typedef struct metaslab_ops {
+    uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size);
+    boolean_t (*msop_fragmented)(metaslab_t *msp);
+} metaslab_ops_t;
 
-extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
-    uint64_t start, uint64_t size, uint64_t txg);
-extern void metaslab_fini(metaslab_t *msp);
-extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
-extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
-extern void metaslab_sync_reassess(metaslab_group_t *mg);
+extern metaslab_ops_t *zfs_metaslab_ops;
+
+metaslab_t *metaslab_init(metaslab_group_t *mg, uint64_t id,
+    uint64_t object, uint64_t txg);
+void metaslab_fini(metaslab_t *msp);
+
+void metaslab_load_wait(metaslab_t *msp);
+int metaslab_load(metaslab_t *msp);
+void metaslab_unload(metaslab_t *msp);
+
+void metaslab_sync(metaslab_t *msp, uint64_t txg);
+void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+void metaslab_sync_reassess(metaslab_group_t *mg);
+uint64_t metaslab_block_maxsize(metaslab_t *msp);
 
 #define METASLAB_HINTBP_FAVOR  0x0
 #define METASLAB_HINTBP_AVOID  0x1
@@ -52,33 +63,30 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg);
 #define METASLAB_GANG_AVOID    0x8
 #define METASLAB_FASTWRITE     0x10
 
-extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
-extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
-    boolean_t now);
-extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-extern void metaslab_check_free(spa_t *spa, const blkptr_t *bp);
-extern void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp);
-extern void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp);
+void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now);
+int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
+void metaslab_check_free(spa_t *spa, const blkptr_t *bp);
+void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp);
+void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp);
 
-extern metaslab_class_t *metaslab_class_create(spa_t *spa,
-    space_map_ops_t *ops);
-extern void metaslab_class_destroy(metaslab_class_t *mc);
-extern int metaslab_class_validate(metaslab_class_t *mc);
+metaslab_class_t *metaslab_class_create(spa_t *spa, metaslab_ops_t *ops);
+void metaslab_class_destroy(metaslab_class_t *mc);
+int metaslab_class_validate(metaslab_class_t *mc);
 
-extern void metaslab_class_space_update(metaslab_class_t *mc,
+void metaslab_class_space_update(metaslab_class_t *mc,
     int64_t alloc_delta, int64_t defer_delta,
     int64_t space_delta, int64_t dspace_delta);
-extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
-extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);
-extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
-extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
+uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
+uint64_t metaslab_class_get_space(metaslab_class_t *mc);
+uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
+uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
 
-extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
-    vdev_t *vd);
-extern void metaslab_group_destroy(metaslab_group_t *mg);
-extern void metaslab_group_activate(metaslab_group_t *mg);
-extern void metaslab_group_passivate(metaslab_group_t *mg);
+metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd);
+void metaslab_group_destroy(metaslab_group_t *mg);
+void metaslab_group_activate(metaslab_group_t *mg);
+void metaslab_group_passivate(metaslab_group_t *mg);
 
 #ifdef __cplusplus
 }
@@ -32,6 +32,7 @@
 
 #include <sys/metaslab.h>
 #include <sys/space_map.h>
+#include <sys/range_tree.h>
 #include <sys/vdev.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
@@ -43,7 +44,7 @@ extern "C" {
 struct metaslab_class {
     spa_t            *mc_spa;
     metaslab_group_t *mc_rotor;
-    space_map_ops_t  *mc_ops;
+    metaslab_ops_t   *mc_ops;
     uint64_t         mc_aliquot;
     uint64_t         mc_alloc_groups; /* # of allocatable groups */
     uint64_t         mc_alloc;        /* total allocated space */
@@ -57,7 +58,6 @@ struct metaslab_group {
     kmutex_t         mg_lock;
     avl_tree_t       mg_metaslab_tree;
     uint64_t         mg_aliquot;
-    uint64_t         mg_bonus_area;
     uint64_t         mg_alloc_failures;
     boolean_t        mg_allocatable;     /* can we allocate? */
     uint64_t         mg_free_capacity;   /* percentage free */
@@ -65,45 +65,102 @@ struct metaslab_group {
     int64_t          mg_activation_count;
     metaslab_class_t *mg_class;
     vdev_t           *mg_vd;
+    taskq_t          *mg_taskq;
     metaslab_group_t *mg_prev;
     metaslab_group_t *mg_next;
 };
 
 /*
- * Each metaslab maintains an in-core free map (ms_map) that contains the
- * current list of free segments. As blocks are allocated, the allocated
- * segment is removed from the ms_map and added to a per txg allocation map.
- * As blocks are freed, they are added to the per txg free map. These per
- * txg maps allow us to process all allocations and frees in syncing context
- * where it is safe to update the on-disk space maps.
+ * This value defines the number of elements in the ms_lbas array. The value
+ * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
+ * This is the equivalent of highbit(UINT64_MAX).
+ */
+#define MAX_LBAS    64
+
+/*
+ * Each metaslab maintains a set of in-core trees to track metaslab operations.
+ * The in-core free tree (ms_tree) contains the current list of free segments.
+ * As blocks are allocated, the allocated segments are removed from the ms_tree
+ * and added to a per txg allocation tree (ms_alloctree). As blocks are freed,
+ * they are added to the per txg free tree (ms_freetree). These per txg
+ * trees allow us to process all allocations and frees in syncing context
+ * where it is safe to update the on-disk space maps. One additional in-core
+ * tree is maintained to track deferred frees (ms_defertree). Once a block
+ * is freed it will move from the ms_freetree to the ms_defertree. A deferred
+ * free means that a block has been freed but cannot be used by the pool
+ * until TXG_DEFER_SIZE transaction groups later. For example, a block
+ * that is freed in txg 50 will not be available for reallocation until
+ * txg 52 (50 + TXG_DEFER_SIZE). This provides a safety net for uberblock
+ * rollback. A pool could be safely rolled back TXG_DEFER_SIZE
+ * transaction groups and ensure that no block has been reallocated.
  *
- * Each metaslab's free space is tracked in a space map object in the MOS,
+ * The simplified transition diagram looks like this:
+ *
+ *
+ *      ALLOCATE
+ *         |
+ *         V
+ *    free segment (ms_tree) --------> ms_alloctree ----> (write to space map)
+ *         ^
+ *         |
+ *         |                          ms_freetree <--- FREE
+ *         |                               |
+ *         |                               |
+ *         |                               |
+ *         +----------- ms_defertree <----+---------> (write to space map)
+ *
+ *
+ * Each metaslab's space is tracked in a single space map in the MOS,
  * which is only updated in syncing context. Each time we sync a txg,
- * we append the allocs and frees from that txg to the space map object.
- * When the txg is done syncing, metaslab_sync_done() updates ms_smo
- * to ms_smo_syncing. Everything in ms_smo is always safe to allocate.
+ * we append the allocs and frees from that txg to the space map.
+ * The pool space is only updated once all metaslabs have finished syncing.
 *
- * To load the in-core free map we read the space map object from disk.
+ * To load the in-core free tree we read the space map from disk.
 * This object contains a series of alloc and free records that are
 * combined to make up the list of all free segments in this metaslab. These
- * segments are represented in-core by the ms_map and are stored in an
+ * segments are represented in-core by the ms_tree and are stored in an
 * AVL tree.
 *
- * As the space map objects grows (as a result of the appends) it will
- * eventually become space-inefficient. When the space map object is
- * zfs_condense_pct/100 times the size of the minimal on-disk representation,
- * we rewrite it in its minimized form.
+ * As the space map grows (as a result of the appends) it will
+ * eventually become space-inefficient. When the metaslab's in-core free tree
+ * is zfs_condense_pct/100 times the size of the minimal on-disk
+ * representation, we rewrite it in its minimized form. If a metaslab
+ * needs to condense then we must set the ms_condensing flag to ensure
+ * that allocations are not performed on the metaslab that is being written.
 */
 struct metaslab {
-    kmutex_t        ms_lock;        /* metaslab lock */
-    space_map_obj_t ms_smo;         /* synced space map object */
-    space_map_obj_t ms_smo_syncing; /* syncing space map object */
-    space_map_t     *ms_allocmap[TXG_SIZE]; /* allocated this txg */
-    space_map_t     *ms_freemap[TXG_SIZE];  /* freed this txg */
-    space_map_t     *ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */
-    space_map_t     *ms_map;        /* in-core free space map */
+    kmutex_t        ms_lock;
+    kcondvar_t      ms_load_cv;
+    space_map_t     *ms_sm;
+    metaslab_ops_t  *ms_ops;
+    uint64_t        ms_id;
+    uint64_t        ms_start;
+    uint64_t        ms_size;
+
+    range_tree_t    *ms_alloctree[TXG_SIZE];
+    range_tree_t    *ms_freetree[TXG_SIZE];
+    range_tree_t    *ms_defertree[TXG_DEFER_SIZE];
+    range_tree_t    *ms_tree;
+
+    boolean_t       ms_condensing;  /* condensing? */
+    boolean_t       ms_loaded;
+    boolean_t       ms_loading;
+
     int64_t         ms_deferspace;  /* sum of ms_defermap[] space */
     uint64_t        ms_weight;      /* weight vs. others in group */
-    uint64_t        ms_factor;
+    uint64_t        ms_access_txg;
+
+    /*
+     * The metaslab block allocators can optionally use a size-ordered
+     * range tree and/or an array of LBAs. Not all allocators use
+     * this functionality. The ms_size_tree should always contain the
+     * same number of segments as the ms_tree. The only difference
+     * is that the ms_size_tree is ordered by segment sizes.
+     */
+    avl_tree_t      ms_size_tree;
+    uint64_t        ms_lbas[MAX_LBAS];
+
     metaslab_group_t *ms_group;     /* metaslab group */
    avl_node_t      ms_group_node;  /* node in metaslab group tree */
    txg_node_t      ms_txg_node;    /* per-txg dirty metaslab links */

96 include/sys/range_tree.h (new file)
@@ -0,0 +1,96 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_RANGE_TREE_H
+#define _SYS_RANGE_TREE_H
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RANGE_TREE_HISTOGRAM_SIZE    64
+
+typedef struct range_tree_ops range_tree_ops_t;
+
+typedef struct range_tree {
+    avl_tree_t       rt_root;    /* offset-ordered segment AVL tree */
+    uint64_t         rt_space;   /* sum of all segments in the map */
+    range_tree_ops_t *rt_ops;
+    void             *rt_arg;
+
+    /*
+     * The rt_histogram maintains a histogram of ranges. Each bucket,
+     * rt_histogram[i], contains the number of ranges whose size is:
+     * 2^i <= size of range in bytes < 2^(i+1)
+     */
+    uint64_t         rt_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+    kmutex_t         *rt_lock;   /* pointer to lock that protects map */
+} range_tree_t;
+
+typedef struct range_seg {
+    avl_node_t  rs_node;     /* AVL node */
+    avl_node_t  rs_pp_node;  /* AVL picker-private node */
+    uint64_t    rs_start;    /* starting offset of this segment */
+    uint64_t    rs_end;      /* ending offset (non-inclusive) */
+} range_seg_t;
+
+struct range_tree_ops {
+    void    (*rtop_create)(range_tree_t *rt, void *arg);
+    void    (*rtop_destroy)(range_tree_t *rt, void *arg);
+    void    (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg);
+    void    (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg);
+    void    (*rtop_vacate)(range_tree_t *rt, void *arg);
+};
+
+typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
+
+void range_tree_init(void);
+void range_tree_fini(void);
+range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp);
+void range_tree_destroy(range_tree_t *rt);
+boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+uint64_t range_tree_space(range_tree_t *rt);
+void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
+void range_tree_stat_verify(range_tree_t *rt);
+
+void range_tree_add(void *arg, uint64_t start, uint64_t size);
+void range_tree_remove(void *arg, uint64_t start, uint64_t size);
+
+void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
+void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_RANGE_TREE_H */
@@ -24,66 +24,72 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_SPACE_MAP_H
 #define _SYS_SPACE_MAP_H
 
 #include <sys/avl.h>
+#include <sys/range_tree.h>
 #include <sys/dmu.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef const struct space_map_ops space_map_ops_t;
+/*
+ * The size of the space map object has increased to include a histogram.
+ * The SPACE_MAP_SIZE_V0 designates the original size and is used to
+ * maintain backward compatibility.
+ */
+#define SPACE_MAP_SIZE_V0    (3 * sizeof (uint64_t))
+#define SPACE_MAP_HISTOGRAM_SIZE(sm) \
+    (sizeof ((sm)->sm_phys->smp_histogram) / \
+    sizeof ((sm)->sm_phys->smp_histogram[0]))
+
+/*
+ * The space_map_phys is the on-disk representation of the space map.
+ * Consumers of space maps should never reference any of the members of this
+ * structure directly. These members may only be updated in syncing context.
+ *
+ * Note the smp_object is no longer used but remains in the structure
+ * for backward compatibility.
+ */
+typedef struct space_map_phys {
+    uint64_t    smp_object;     /* on-disk space map object */
+    uint64_t    smp_objsize;    /* size of the object */
+    uint64_t    smp_alloc;      /* space allocated from the map */
+    uint64_t    smp_pad[5];     /* reserved */
+
+    /*
+     * The smp_histogram maintains a histogram of free regions. Each
+     * bucket, smp_histogram[i], contains the number of free regions
+     * whose size is:
+     * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
+     */
+    uint64_t    smp_histogram[32]; /* histogram of free space */
+} space_map_phys_t;
 
 /*
  * The space map object defines a region of space, its size, how much is
  * allocated, and the on-disk object that stores this information.
+ * Consumers of space maps may only access the members of this structure.
  */
 typedef struct space_map {
-    avl_tree_t  sm_root;        /* offset-ordered segment AVL tree */
-    uint64_t    sm_space;       /* sum of all segments in the map */
     uint64_t    sm_start;       /* start of map */
     uint64_t    sm_size;        /* size of map */
     uint8_t     sm_shift;       /* unit shift */
-    uint8_t     sm_loaded;      /* map loaded? */
-    uint8_t     sm_loading;     /* map loading? */
-    uint8_t     sm_condensing;  /* map condensing? */
-    kcondvar_t  sm_load_cv;     /* map load completion */
-    space_map_ops_t *sm_ops;    /* space map block picker ops vector */
-    avl_tree_t  *sm_pp_root;    /* size-ordered, picker-private tree */
-    void        *sm_ppd;        /* picker-private data */
+    uint64_t    sm_length;      /* synced length */
+    uint64_t    sm_alloc;       /* synced space allocated */
+    objset_t    *sm_os;         /* objset for this map */
+    uint64_t    sm_object;      /* object id for this map */
+    uint32_t    sm_blksz;       /* block size for space map */
+    dmu_buf_t   *sm_dbuf;       /* space_map_phys_t dbuf */
+    space_map_phys_t *sm_phys;  /* on-disk space map */
     kmutex_t    *sm_lock;       /* pointer to lock that protects map */
 } space_map_t;
 
-typedef struct space_seg {
-    avl_node_t  ss_node;        /* AVL node */
-    avl_node_t  ss_pp_node;     /* AVL picker-private node */
-    uint64_t    ss_start;       /* starting offset of this segment */
-    uint64_t    ss_end;         /* ending offset (non-inclusive) */
-} space_seg_t;
-
-typedef struct space_ref {
-    avl_node_t  sr_node;        /* AVL node */
-    uint64_t    sr_offset;      /* offset (start or end) */
-    int64_t     sr_refcnt;      /* associated reference count */
-} space_ref_t;
-
-typedef struct space_map_obj {
-    uint64_t    smo_object;     /* on-disk space map object */
-    uint64_t    smo_objsize;    /* size of the object */
-    uint64_t    smo_alloc;      /* space allocated from the map */
-} space_map_obj_t;
-
-struct space_map_ops {
-    void        (*smop_load)(space_map_t *sm);
-    void        (*smop_unload)(space_map_t *sm);
-    uint64_t    (*smop_alloc)(space_map_t *sm, uint64_t size);
-    void        (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
-    void        (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
-    uint64_t    (*smop_max)(space_map_t *sm);
-    boolean_t   (*smop_fragmented)(space_map_t *sm);
-};
 
 /*
  * debug entry
  *
@@ -124,61 +130,45 @@ struct space_map_ops {
 
 #define SM_RUN_MAX    SM_RUN_DECODE(~0ULL)
 
-#define SM_ALLOC    0x0
-#define SM_FREE     0x1
+typedef enum {
+    SM_ALLOC,
+    SM_FREE
+} maptype_t;
 
 /*
  * The data for a given space map can be kept on blocks of any size.
  * Larger blocks entail fewer i/o operations, but they also cause the
  * DMU to keep more data in-core, and also to waste more i/o bandwidth
  * when only a few blocks have changed since the last transaction group.
- * This could use a lot more research, but for now, set the freelist
- * block size to 4k (2^12).
+ * Rather than having a fixed block size for all space maps the block size
+ * can adjust as needed (see space_map_max_blksz). Set the initial block
+ * size for the space map to 4k.
 */
-#define SPACE_MAP_BLOCKSHIFT    12
+#define SPACE_MAP_INITIAL_BLOCKSIZE    (1ULL << 12)
 
-typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
+int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
 
-extern void space_map_init(void);
-extern void space_map_fini(void);
-extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
-    uint8_t shift, kmutex_t *lp);
-extern void space_map_destroy(space_map_t *sm);
-extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
-extern boolean_t space_map_contains(space_map_t *sm,
-    uint64_t start, uint64_t size);
-extern space_seg_t *space_map_find(space_map_t *sm, uint64_t start,
-    uint64_t size, avl_index_t *wherep);
-extern void space_map_swap(space_map_t **msrc, space_map_t **mdest);
-extern void space_map_vacate(space_map_t *sm,
-    space_map_func_t *func, space_map_t *mdest);
-extern void space_map_walk(space_map_t *sm,
-    space_map_func_t *func, space_map_t *mdest);
+void space_map_histogram_clear(space_map_t *sm);
+void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
+    dmu_tx_t *tx);
 
-extern void space_map_load_wait(space_map_t *sm);
-extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
-    uint8_t maptype, space_map_obj_t *smo, objset_t *os);
-extern void space_map_unload(space_map_t *sm);
+void space_map_update(space_map_t *sm);
 
-extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
-extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
-extern uint64_t space_map_maxsize(space_map_t *sm);
+uint64_t space_map_object(space_map_t *sm);
+uint64_t space_map_allocated(space_map_t *sm);
+uint64_t space_map_length(space_map_t *sm);
 
-extern void space_map_sync(space_map_t *sm, uint8_t maptype,
-    space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
-extern void space_map_truncate(space_map_obj_t *smo,
-    objset_t *os, dmu_tx_t *tx);
+void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+    dmu_tx_t *tx);
+void space_map_truncate(space_map_t *sm, dmu_tx_t *tx);
+uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx);
+void space_map_free(space_map_t *sm, dmu_tx_t *tx);
 
-extern void space_map_ref_create(avl_tree_t *t);
-extern void space_map_ref_destroy(avl_tree_t *t);
-extern void space_map_ref_add_seg(avl_tree_t *t,
-    uint64_t start, uint64_t end, int64_t refcnt);
-extern void space_map_ref_add_map(avl_tree_t *t,
-    space_map_t *sm, int64_t refcnt);
-extern void space_map_ref_generate_map(avl_tree_t *t,
-    space_map_t *sm, int64_t minref);
+int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
+    uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp);
+void space_map_close(space_map_t *sm);
+
+int64_t space_map_alloc_delta(space_map_t *sm);
 
 #ifdef __cplusplus
 }

57 include/sys/space_reftree.h (new file)
@@ -0,0 +1,57 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_SPACE_REFTREE_H
+#define _SYS_SPACE_REFTREE_H
+
+#include <sys/range_tree.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct space_ref {
+    avl_node_t  sr_node;    /* AVL node */
+    uint64_t    sr_offset;  /* range offset (start or end) */
+    int64_t     sr_refcnt;  /* associated reference count */
+} space_ref_t;
+
+void space_reftree_create(avl_tree_t *t);
+void space_reftree_destroy(avl_tree_t *t);
+void space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
+    int64_t refcnt);
+void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt);
+void space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt,
+    int64_t minref);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPACE_REFTREE_H */
@@ -152,7 +152,6 @@ struct vdev {
     vdev_t      *vdev_parent;   /* parent vdev */
     vdev_t      **vdev_child;   /* array of children */
     uint64_t    vdev_children;  /* number of children */
-    space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
     vdev_stat_t vdev_stat;      /* virtual device statistics */
     boolean_t   vdev_expanding; /* expand the vdev? */
     boolean_t   vdev_reopening; /* reopen in progress? */
@@ -174,19 +173,21 @@ struct vdev {
     txg_node_t  vdev_txg_node;  /* per-txg dirty vdev linkage */
     boolean_t   vdev_remove_wanted; /* async remove wanted? */
     boolean_t   vdev_probe_wanted;  /* async probe wanted? */
-    uint64_t    vdev_removing;  /* device is being removed? */
     list_node_t vdev_config_dirty_node; /* config dirty list */
     list_node_t vdev_state_dirty_node;  /* state dirty list */
     uint64_t    vdev_deflate_ratio; /* deflation ratio (x512) */
     uint64_t    vdev_islog;     /* is an intent log device */
-    uint64_t    vdev_ishole;    /* is a hole in the namespace */
+    uint64_t    vdev_removing;  /* device is being removed? */
+    boolean_t   vdev_ishole;    /* is a hole in the namespace */
 
     /*
      * Leaf vdev state.
      */
-    uint64_t    vdev_psize;     /* physical device capacity */
-    space_map_obj_t vdev_dtl_smo; /* dirty time log space map obj */
+    range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */
+    space_map_t *vdev_dtl_sm;   /* dirty time log space map */
     txg_node_t  vdev_dtl_node;  /* per-txg dirty DTL linkage */
+    uint64_t    vdev_dtl_object; /* DTL object */
+    uint64_t    vdev_psize;     /* physical device capacity */
     uint64_t    vdev_wholedisk; /* true if this is a whole disk */
     uint64_t    vdev_offline;   /* persistent offline state */
     uint64_t    vdev_faulted;   /* persistent faulted state */
@@ -200,18 +201,17 @@ struct vdev {
     char        *vdev_fru;      /* physical FRU location */
     uint64_t    vdev_not_present; /* not present during import */
     uint64_t    vdev_unspare;   /* unspare when resilvering done */
-    hrtime_t    vdev_last_try;  /* last reopen time */
     boolean_t   vdev_nowritecache; /* true if flushwritecache failed */
     boolean_t   vdev_checkremove; /* temporary online test */
     boolean_t   vdev_forcefault; /* force online fault */
     boolean_t   vdev_splitting; /* split or repair in progress */
     boolean_t   vdev_delayed_close; /* delayed device close? */
-    uint8_t     vdev_tmpoffline; /* device taken offline temporarily? */
-    uint8_t     vdev_detached;  /* device detached? */
-    uint8_t     vdev_cant_read; /* vdev is failing all reads */
-    uint8_t     vdev_cant_write; /* vdev is failing all writes */
-    uint64_t    vdev_isspare;   /* was a hot spare */
-    uint64_t    vdev_isl2cache; /* was a l2cache device */
+    boolean_t   vdev_tmpoffline; /* device taken offline temporarily? */
+    boolean_t   vdev_detached;  /* device detached? */
+    boolean_t   vdev_cant_read; /* vdev is failing all reads */
+    boolean_t   vdev_cant_write; /* vdev is failing all writes */
+    boolean_t   vdev_isspare;   /* was a hot spare */
+    boolean_t   vdev_isl2cache; /* was a l2cache device */
     vdev_queue_t vdev_queue;    /* I/O deadline schedule queue */
     vdev_cache_t vdev_cache;    /* physical block cache */
     spa_aux_vdev_t *vdev_aux;   /* for l2cache vdevs */
@@ -312,9 +312,11 @@ extern void vdev_remove_parent(vdev_t *cvd);
 extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
 extern boolean_t vdev_log_state_valid(vdev_t *vd);
 extern void vdev_load(vdev_t *vd);
+extern int vdev_dtl_load(vdev_t *vd);
 extern void vdev_sync(vdev_t *vd, uint64_t txg);
 extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
 extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
+extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
 
 /*
  * Available vdev types.
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_ZFEATURE_H
@@ -47,6 +47,7 @@ extern void spa_feature_incr(struct spa *, zfeature_info_t *, struct dmu_tx *);
 extern void spa_feature_decr(struct spa *, zfeature_info_t *, struct dmu_tx *);
 extern boolean_t spa_feature_is_enabled(struct spa *, zfeature_info_t *);
 extern boolean_t spa_feature_is_active(struct spa *, zfeature_info_t *);
+extern int spa_feature_get_refcount(struct spa *, zfeature_info_t *);
 
 #ifdef __cplusplus
 }
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 */
 
@@ -54,6 +54,7 @@ typedef enum spa_feature {
     SPA_FEATURE_ASYNC_DESTROY,
     SPA_FEATURE_EMPTY_BPOBJ,
     SPA_FEATURE_LZ4_COMPRESS,
+    SPA_FEATURE_SPACEMAP_HISTOGRAM,
     SPA_FEATURES
 } spa_feature_t;
@@ -53,6 +53,7 @@ libzpool_la_SOURCES = \
	$(top_srcdir)/module/zfs/lzjb.c \
	$(top_srcdir)/module/zfs/lz4.c \
	$(top_srcdir)/module/zfs/metaslab.c \
+	$(top_srcdir)/module/zfs/range_tree.c \
	$(top_srcdir)/module/zfs/refcount.c \
	$(top_srcdir)/module/zfs/rrwlock.c \
	$(top_srcdir)/module/zfs/sa.c \
@@ -65,6 +66,7 @@ libzpool_la_SOURCES = \
	$(top_srcdir)/module/zfs/spa_misc.c \
	$(top_srcdir)/module/zfs/spa_stats.c \
	$(top_srcdir)/module/zfs/space_map.c \
+	$(top_srcdir)/module/zfs/space_reftree.c \
	$(top_srcdir)/module/zfs/txg.c \
	$(top_srcdir)/module/zfs/uberblock.c \
	$(top_srcdir)/module/zfs/unique.c \
@@ -1,5 +1,5 @@
 '\" te
-.\" Copyright (c) 2012 by Delphix. All rights reserved.
+.\" Copyright (c) 2013 by Delphix. All rights reserved.
 .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 .\" The contents of this file are subject to the terms of the Common Development
 .\" and Distribution License (the "License"). You may not use this file except
@@ -13,7 +13,7 @@
 .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
-.TH ZPOOL-FEATURES 5 "Feb 4, 2013"
+.TH ZPOOL-FEATURES 5 "Aug 27, 2013"
 .SH NAME
 zpool\-features \- ZFS pool feature descriptions
 .SH DESCRIPTION
@@ -228,6 +228,26 @@ read-only compatible, this operation will render the pool unimportable
 on systems without support for the \fBlz4_compress\fR feature. At the
 moment, this operation cannot be reversed. Booting off of
 \fBlz4\fR-compressed root pools is supported.
 .RE
 
+.sp
+.ne 2
+.na
+\fB\fBspacemap_histogram\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.delphix:spacemap_histogram
+READ\-ONLY COMPATIBLE	yes
+DEPENDENCIES	none
+.TE
+
+This feature allows ZFS to maintain more information about how free space
+is organized within the pool. If this feature is \fBenabled\fR, ZFS will
+set this feature to \fBactive\fR when a new space map object is created or
+an existing space map is upgraded to the new format. Once the feature is
+\fBactive\fR, it will remain in that state until the pool is destroyed.
+
+.RE
+
@@ -35,6 +35,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/gzip.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/lzjb.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/lz4.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/metaslab.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/range_tree.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/refcount.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/rrwlock.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/sa.o
@@ -47,6 +48,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/space_reftree.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/txg.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/unique.o
@@ -1335,7 +1335,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
     rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 
     /* Check for any allocated blocks beyond the first */
-    if (dn->dn_phys->dn_maxblkid != 0)
+    if (dn->dn_maxblkid != 0)
         goto fail;
 
     mutex_enter(&dn->dn_dbufs_mtx);
File diff suppressed because it is too large

391 module/zfs/range_tree.c (new file)
@ -0,0 +1,391 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/spa.h>
|
||||
#include <sys/dmu.h>
|
||||
#include <sys/dnode.h>
|
||||
#include <sys/zio.h>
|
||||
#include <sys/range_tree.h>
|
||||
|
||||
static kmem_cache_t *range_seg_cache;
|
||||
|
||||
void
|
||||
range_tree_init(void)
|
||||
{
|
||||
ASSERT(range_seg_cache == NULL);
|
||||
range_seg_cache = kmem_cache_create("range_seg_cache",
|
||||
sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_fini(void)
|
||||
{
|
||||
kmem_cache_destroy(range_seg_cache);
|
||||
range_seg_cache = NULL;
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_stat_verify(range_tree_t *rt)
|
||||
{
|
||||
range_seg_t *rs;
|
||||
uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
|
||||
int i;
|
||||
|
||||
for (rs = avl_first(&rt->rt_root); rs != NULL;
|
||||
rs = AVL_NEXT(&rt->rt_root, rs)) {
|
||||
uint64_t size = rs->rs_end - rs->rs_start;
|
||||
int idx = highbit(size) - 1;
|
||||
|
||||
hist[idx]++;
|
||||
ASSERT3U(hist[idx], !=, 0);
|
||||
}
|
||||
|
||||
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
|
||||
if (hist[i] != rt->rt_histogram[i]) {
|
||||
zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
|
||||
i, hist, hist[i], rt->rt_histogram[i]);
|
||||
}
|
||||
VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
|
||||
{
|
||||
uint64_t size = rs->rs_end - rs->rs_start;
|
||||
int idx = highbit(size) - 1;
|
||||
|
||||
ASSERT3U(idx, <,
|
||||
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
rt->rt_histogram[idx]++;
|
||||
ASSERT3U(rt->rt_histogram[idx], !=, 0);
|
||||
}
|
||||
|
||||
static void
|
||||
range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
|
||||
{
|
||||
uint64_t size = rs->rs_end - rs->rs_start;
|
||||
int idx = highbit(size) - 1;
|
||||
|
||||
ASSERT3U(idx, <,
|
||||
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
ASSERT3U(rt->rt_histogram[idx], !=, 0);
|
||||
rt->rt_histogram[idx]--;
|
||||
}

/*
 * NOTE: caller is responsible for all locking.
 */
static int
range_tree_seg_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;

	if (r1->rs_start < r2->rs_start) {
		if (r1->rs_end > r2->rs_start)
			return (0);
		return (-1);
	}
	if (r1->rs_start > r2->rs_start) {
		if (r1->rs_start < r2->rs_end)
			return (0);
		return (1);
	}
	return (0);
}
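
/*
 * Editor's note -- illustrative sketch, not part of this commit: the
 * comparator above treats any two overlapping segments as equal, which is
 * what lets avl_find() return an arbitrary intersecting entry for a search
 * range. A standalone analogue with the same three-way logic (names here
 * are invented for illustration):
 */
#include <assert.h>
#include <stdint.h>

struct seg { uint64_t start, end; };

static int
seg_compare(const struct seg *r1, const struct seg *r2)
{
	if (r1->start < r2->start)
		return (r1->end > r2->start ? 0 : -1);
	if (r1->start > r2->start)
		return (r1->start < r2->end ? 0 : 1);
	return (0);
}

int
main(void)
{
	struct seg a = { 10, 20 }, b = { 15, 30 }, c = { 20, 25 };

	assert(seg_compare(&a, &b) == 0);	/* overlap compares equal */
	assert(seg_compare(&a, &c) == -1);	/* abutting is still distinct */
	assert(seg_compare(&c, &a) == 1);
	return (0);
}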

range_tree_t *
range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
{
	range_tree_t *rt;

	rt = kmem_zalloc(sizeof (range_tree_t), KM_PUSHPAGE);

	avl_create(&rt->rt_root, range_tree_seg_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));

	rt->rt_lock = lp;
	rt->rt_ops = ops;
	rt->rt_arg = arg;

	if (rt->rt_ops != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}

void
range_tree_destroy(range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	avl_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

void
range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs_before, *rs_after, *rs;
	uint64_t end = start + size;
	boolean_t merge_before, merge_after;

	ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
		zfs_panic_recover("zfs: allocating allocated segment"
		    "(offset=%llu size=%llu)\n",
		    (longlong_t)start, (longlong_t)size);
		return;
	}

	/* Make sure we don't overlap with either of our neighbors */
	VERIFY(rs == NULL);

	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);

	merge_before = (rs_before != NULL && rs_before->rs_end == start);
	merge_after = (rs_after != NULL && rs_after->rs_start == end);

	if (merge_before && merge_after) {
		avl_remove(&rt->rt_root, rs_before);
		if (rt->rt_ops != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		range_tree_stat_decr(rt, rs_before);
		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_start = rs_before->rs_start;
		kmem_cache_free(range_seg_cache, rs_before);
		rs = rs_after;
	} else if (merge_before) {
		if (rt->rt_ops != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		range_tree_stat_decr(rt, rs_before);

		rs_before->rs_end = end;
		rs = rs_before;
	} else if (merge_after) {
		if (rt->rt_ops != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_start = start;
		rs = rs_after;
	} else {
		rs = kmem_cache_alloc(range_seg_cache, KM_PUSHPAGE);
		rs->rs_start = start;
		rs->rs_end = end;
		avl_insert(&rt->rt_root, rs, where);
	}

	if (rt->rt_ops != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	range_tree_stat_incr(rt, rs);
	rt->rt_space += size;
}
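
/*
 * Editor's note -- illustrative sketch, not part of this commit: the two
 * single-sided coalescing cases above, reduced to one segment. When the
 * new range abuts an existing segment on either side, the segment is
 * simply stretched instead of allocating a new one (helper names below are
 * invented for illustration):
 */
#include <assert.h>
#include <stdint.h>

struct seg { uint64_t start, end; };

static int
seg_try_merge(struct seg *s, uint64_t start, uint64_t size)
{
	uint64_t end = start + size;

	if (s->end == start) {		/* the merge_before case */
		s->end = end;
		return (1);
	}
	if (s->start == end) {		/* the merge_after case */
		s->start = start;
		return (1);
	}
	return (0);			/* not adjacent */
}

int
main(void)
{
	struct seg s = { 0x1000, 0x2000 };

	assert(seg_try_merge(&s, 0x2000, 0x1000));	/* -> [0x1000, 0x3000) */
	assert(s.end == 0x3000);
	assert(seg_try_merge(&s, 0x800, 0x800));	/* -> [0x800, 0x3000) */
	assert(s.start == 0x800);
	assert(!seg_try_merge(&s, 0x4000, 0x1000));	/* gap: no merge */
	return (0);
}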

void
range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs, *newseg;
	uint64_t end = start + size;
	boolean_t left_over, right_over;

	ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY3U(size, !=, 0);
	VERIFY3U(size, <=, rt->rt_space);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	/* Make sure we completely overlap with someone */
	if (rs == NULL) {
		zfs_panic_recover("zfs: freeing free segment "
		    "(offset=%llu size=%llu)",
		    (longlong_t)start, (longlong_t)size);
		return;
	}
	VERIFY3U(rs->rs_start, <=, start);
	VERIFY3U(rs->rs_end, >=, end);

	left_over = (rs->rs_start != start);
	right_over = (rs->rs_end != end);

	range_tree_stat_decr(rt, rs);

	if (rt->rt_ops != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	if (left_over && right_over) {
		newseg = kmem_cache_alloc(range_seg_cache, KM_PUSHPAGE);
		newseg->rs_start = end;
		newseg->rs_end = rs->rs_end;
		range_tree_stat_incr(rt, newseg);

		rs->rs_end = start;

		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
		if (rt->rt_ops != NULL)
			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
	} else if (left_over) {
		rs->rs_end = start;
	} else if (right_over) {
		rs->rs_start = end;
	} else {
		avl_remove(&rt->rt_root, rs);
		kmem_cache_free(range_seg_cache, rs);
		rs = NULL;
	}

	if (rs != NULL) {
		range_tree_stat_incr(rt, rs);

		if (rt->rt_ops != NULL)
			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
	}

	rt->rt_space -= size;
}
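
/*
 * Editor's note -- illustrative sketch, not part of this commit: the
 * left_over && right_over branch above splits one segment into two.
 * Carving [0x2000, 0x3000) out of [0x1000, 0x4000) truncates the original
 * segment and creates a new one for the tail:
 */
#include <assert.h>
#include <stdint.h>

struct seg { uint64_t start, end; };

int
main(void)
{
	struct seg rs = { 0x1000, 0x4000 };
	struct seg newseg;
	uint64_t start = 0x2000, end = 0x3000;

	newseg.start = end;	/* the new segment keeps the old tail */
	newseg.end = rs.end;
	rs.end = start;		/* the original keeps the head */

	assert(rs.start == 0x1000 && rs.end == 0x2000);
	assert(newseg.start == 0x3000 && newseg.end == 0x4000);
	return (0);
}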

static range_seg_t *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size,
    avl_index_t *wherep)
{
	range_seg_t rsearch, *rs;
	uint64_t end = start + size;

	ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, wherep);

	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end)
		return (rs);
	return (NULL);
}

void
range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
{
	range_seg_t *rs;
	avl_index_t where;

	mutex_enter(rt->rt_lock);
	rs = range_tree_find(rt, off, size, &where);
	if (rs != NULL)
		panic("freeing free block; rs=%p", (void *)rs);
	mutex_exit(rt->rt_lock);
}

boolean_t
range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
{
	avl_index_t where;

	return (range_tree_find(rt, start, size, &where) != NULL);
}

void
range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
{
	range_tree_t *rt;

	ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
	ASSERT0(range_tree_space(*rtdst));
	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

void
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;
	void *cookie = NULL;

	ASSERT(MUTEX_HELD(rt->rt_lock));

	if (rt->rt_ops != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
		if (func != NULL)
			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
		kmem_cache_free(range_seg_cache, rs);
	}

	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}

void
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(rt->rt_lock));

	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}

uint64_t
range_tree_space(range_tree_t *rt)
{
	return (rt->rt_space);
}
@ -1259,6 +1259,15 @@ spa_unload(spa_t *spa)

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
@ -1270,20 +1279,12 @@ spa_unload(spa_t *spa)

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
@ -4568,7 +4569,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	/*
	 * Restart the resilver
	 * Schedule the resilver to restart in the future. We do this to
	 * ensure that dmu_sync-ed blocks have been stitched into the
	 * respective datasets.
	 */
	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);

@ -5193,7 +5196,7 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
	ASSERT0(vd->vdev_stat.vs_alloc);
	txg = spa_vdev_config_enter(spa);
	vd->vdev_removing = B_TRUE;
	vdev_dirty(vd, 0, NULL, txg);
	vdev_dirty_leaves(vd, VDD_DTL, txg);
	vdev_config_dirty(vd);
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

@ -5965,7 +5968,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
			ASSERT(zpool_prop_feature(nvpair_name(elem)));

			fname = strchr(nvpair_name(elem), '@') + 1;
			VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
			VERIFY0(zfeature_lookup_name(fname, &feature));

			spa_feature_enable(spa, feature, tx);
			spa_history_log_internal(spa, "set", tx,
@ -5973,7 +5976,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
			break;

		case ZPOOL_PROP_VERSION:
			VERIFY(nvpair_value_uint64(elem, &intval) == 0);
			intval = fnvpair_value_uint64(elem);
			/*
			 * The version is synced seperatly before other
			 * properties and should be correct by now.
@ -5997,7 +6000,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
			 */
			break;
		case ZPOOL_PROP_COMMENT:
			VERIFY(nvpair_value_string(elem, &strval) == 0);
			strval = fnvpair_value_string(elem);
			if (spa->spa_comment != NULL)
				spa_strfree(spa->spa_comment);
			spa->spa_comment = spa_strdup(strval);
@ -6029,23 +6032,23 @@ spa_sync_props(void *arg, dmu_tx_t *tx)

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				strval = fnvpair_value_string(elem);
				VERIFY0(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);
				    1, strlen(strval) + 1, strval, tx));
				spa_history_log_internal(spa, "set", tx,
				    "%s=%s", nvpair_name(elem), strval);
			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
				intval = fnvpair_value_uint64(elem);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
					VERIFY0(zpool_prop_index_to_string(
					    prop, intval, &unused));
				}
				VERIFY(zap_update(mos,
				VERIFY0(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
				    8, 1, &intval, tx));
				spa_history_log_internal(spa, "set", tx,
				    "%s=%lld", nvpair_name(elem), intval);
			} else {

@ -986,7 +986,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
		txg_wait_synced(spa->spa_dsl_pool, txg);

	if (vd != NULL) {
		ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
		vdev_free(vd);
		spa_config_exit(spa, SCL_ALL, spa);
@ -1655,7 +1655,7 @@ spa_init(int mode)
	fm_init();
	refcount_init();
	unique_init();
	space_map_init();
	range_tree_init();
	ddt_init();
	zio_init();
	dmu_init();
@ -1682,7 +1682,7 @@ spa_fini(void)
	dmu_fini();
	zio_fini();
	ddt_fini();
	space_map_fini();
	range_tree_fini();
	unique_fini();
	refcount_fini();
	fm_fini();
File diff suppressed because it is too large
159
module/zfs/space_reftree.c
Normal file
@ -0,0 +1,159 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/range_tree.h>
#include <sys/space_reftree.h>

/*
 * Space reference trees.
 *
 * A range tree is a collection of integers. Every integer is either
 * in the tree, or it's not. A space reference tree generalizes
 * the idea: it allows its members to have arbitrary reference counts,
 * as opposed to the implicit reference count of 0 or 1 in a range tree.
 * This representation comes in handy when computing the union or
 * intersection of multiple space maps. For example, the union of
 * N range trees is the subset of the reference tree with refcnt >= 1.
 * The intersection of N range trees is the subset with refcnt >= N.
 *
 * [It's very much like a Fourier transform. Unions and intersections
 * are hard to perform in the 'range tree domain', so we convert the trees
 * into the 'reference count domain', where it's trivial, then invert.]
 *
 * vdev_dtl_reassess() uses computations of this form to determine
 * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
 * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
 * has an outage wherever refcnt >= vdev_children.
 */
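
/*
 * Editor's note -- illustrative, self-contained user-space sketch, not
 * part of this commit: the refcount-domain trick on two ranges, [10, 30)
 * and [20, 40). Each range contributes +refcnt at its start and -refcnt at
 * its end; a sorted sweep keeps a running sum, and emitting the spans
 * where the sum >= minref yields the union (minref = 1) or the
 * intersection (minref = N). All names below are invented for
 * illustration.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct ref { uint64_t off; int delta; };

static int
ref_compare(const void *a, const void *b)
{
	const struct ref *ra = a, *rb = b;
	return (ra->off < rb->off ? -1 : ra->off > rb->off);
}

int
main(void)
{
	struct ref refs[] = {
		{ 10, +1 }, { 30, -1 },	/* range [10, 30) */
		{ 20, +1 }, { 40, -1 },	/* range [20, 40) */
	};
	int i, refcnt = 0, minref = 2;	/* 2 of 2 -> intersection */
	uint64_t start = UINT64_MAX;

	qsort(refs, 4, sizeof (refs[0]), ref_compare);
	for (i = 0; i < 4; i++) {
		refcnt += refs[i].delta;
		if (refcnt >= minref) {
			if (start == UINT64_MAX)
				start = refs[i].off;
		} else if (start != UINT64_MAX) {
			printf("[%llu, %llu)\n", (unsigned long long)start,
			    (unsigned long long)refs[i].off);
			start = UINT64_MAX;
		}
	}
	return (0);	/* prints "[20, 30)" */
}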

static int
space_reftree_compare(const void *x1, const void *x2)
{
	const space_ref_t *sr1 = x1;
	const space_ref_t *sr2 = x2;

	if (sr1->sr_offset < sr2->sr_offset)
		return (-1);
	if (sr1->sr_offset > sr2->sr_offset)
		return (1);

	if (sr1 < sr2)
		return (-1);
	if (sr1 > sr2)
		return (1);

	return (0);
}

void
space_reftree_create(avl_tree_t *t)
{
	avl_create(t, space_reftree_compare,
	    sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
}

void
space_reftree_destroy(avl_tree_t *t)
{
	space_ref_t *sr;
	void *cookie = NULL;

	while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_free(sr, sizeof (*sr));

	avl_destroy(t);
}

static void
space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
{
	space_ref_t *sr;

	sr = kmem_alloc(sizeof (*sr), KM_PUSHPAGE);
	sr->sr_offset = offset;
	sr->sr_refcnt = refcnt;

	avl_add(t, sr);
}

void
space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
    int64_t refcnt)
{
	space_reftree_add_node(t, start, refcnt);
	space_reftree_add_node(t, end, -refcnt);
}

/*
 * Convert (or add) a range tree into a reference tree.
 */
void
space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(rt->rt_lock));

	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
		space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt);
}

/*
 * Convert a reference tree into a range tree. The range tree will contain
 * all members of the reference tree for which refcnt >= minref.
 */
void
space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref)
{
	uint64_t start = -1ULL;
	int64_t refcnt = 0;
	space_ref_t *sr;

	ASSERT(MUTEX_HELD(rt->rt_lock));

	range_tree_vacate(rt, NULL, NULL);

	for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
		refcnt += sr->sr_refcnt;
		if (refcnt >= minref) {
			if (start == -1ULL) {
				start = sr->sr_offset;
			}
		} else {
			if (start != -1ULL) {
				uint64_t end = sr->sr_offset;
				ASSERT(start <= end);
				if (end > start)
					range_tree_add(rt, start, end - start);
				start = -1ULL;
			}
		}
	}
	ASSERT(refcnt == 0);
	ASSERT(start == -1ULL);
}

@ -36,6 +36,7 @@
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/space_reftree.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
@ -193,7 +194,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd)
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_PUSHPAGE);
	newchild = kmem_alloc(newsize, KM_PUSHPAGE);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
@ -263,7 +264,7 @@ vdev_compact_children(vdev_t *pvd)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_PUSHPAGE);
	newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_PUSHPAGE);

	for (c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
@ -324,7 +325,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	for (t = 0; t < DTL_TYPES; t++) {
		space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
		vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
		    &vd->vdev_dtl_lock);
	}
	txg_list_create(&vd->vdev_ms_list,
@ -510,7 +511,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl_smo.smo_object);
			    &vd->vdev_dtl_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}
@ -633,9 +634,10 @@ vdev_free(vdev_t *vd)
	txg_list_destroy(&vd->vdev_dtl_list);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_close(vd->vdev_dtl_sm);
	for (t = 0; t < DTL_TYPES; t++) {
		space_map_unload(&vd->vdev_dtl[t]);
		space_map_destroy(&vd->vdev_dtl[t]);
		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
		range_tree_destroy(vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

@ -859,27 +861,16 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		space_map_obj_t smo = { 0, 0, 0 };
		if (txg == 0) {
		uint64_t object = 0;

		if (txg == 0) {
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
			    DMU_READ_PREFETCH);
			if (error)
				return (error);
			if (object != 0) {
				dmu_buf_t *db;
				error = dmu_bonus_hold(mos, object, FTAG, &db);
				if (error)
					return (error);
				ASSERT3U(db->db_size, >=, sizeof (smo));
				bcopy(db->db_data, &smo, sizeof (smo));
				ASSERT3U(smo.smo_object, ==, object);
				dmu_buf_rele(db, FTAG);
			}
		}
		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, m, object, txg);
	}

	if (txg == 0)
@ -907,9 +898,12 @@ vdev_metaslab_fini(vdev_t *vd)

	if (vd->vdev_ms != NULL) {
		metaslab_group_passivate(vd->vdev_mg);
		for (m = 0; m < count; m++)
			if (vd->vdev_ms[m] != NULL)
				metaslab_fini(vd->vdev_ms[m]);
		for (m = 0; m < count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			if (msp != NULL)
				metaslab_fini(msp);
		}
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}
@ -1572,9 +1566,10 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
	}

	/*
	 * Recursively initialize all labels.
	 * Recursively load DTLs and initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg, isreplacing ?
	if ((error = vdev_dtl_load(vd)) != 0 ||
	    (error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
@ -1610,6 +1605,18 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

void
vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);

	if (vd->vdev_ops->vdev_op_leaf)
		vdev_dirty(vd->vdev_top, flags, vd, txg);
}

/*
 * DTLs.
 *
@ -1651,31 +1658,31 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	range_tree_t *rt = vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
	ASSERT(spa_writeable(vd->vdev_spa));

	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
	mutex_enter(rt->rt_lock);
	if (!range_tree_contains(rt, txg, size))
		range_tree_add(rt, txg, size);
	mutex_exit(rt->rt_lock);
}

boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	mutex_enter(sm->sm_lock);
	if (sm->sm_space != 0)
		dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);
	mutex_enter(rt->rt_lock);
	if (range_tree_space(rt) != 0)
		dirty = range_tree_contains(rt, txg, size);
	mutex_exit(rt->rt_lock);

	return (dirty);
}
@ -1683,12 +1690,12 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t empty;

	mutex_enter(sm->sm_lock);
	empty = (sm->sm_space == 0);
	mutex_exit(sm->sm_lock);
	mutex_enter(rt->rt_lock);
	empty = (range_tree_space(rt) == 0);
	mutex_exit(rt->rt_lock);

	return (empty);
}
@ -1699,14 +1706,14 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
static uint64_t
vdev_dtl_min(vdev_t *vd)
{
	space_seg_t *ss;
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
	return (ss->ss_start - 1);
	rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
	return (rs->rs_start - 1);
}

/*
@ -1715,14 +1722,14 @@ vdev_dtl_min(vdev_t *vd)
static uint64_t
vdev_dtl_max(vdev_t *vd)
{
	space_seg_t *ss;
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
	return (ss->ss_end);
	rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
	return (rs->rs_end);
}

/*
@ -1743,7 +1750,7 @@ vdev_dtl_should_excise(vdev_t *vd)
	ASSERT0(vd->vdev_children);

	if (vd->vdev_resilver_txg == 0 ||
	    vd->vdev_dtl[DTL_MISSING].sm_space == 0)
	    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
		return (B_TRUE);

	/*
@ -1813,35 +1820,35 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
			 * positive refcnt -- either 1 or 2. We then convert
			 * the reference tree into the new DTL_MISSING map.
			 */
			space_map_ref_create(&reftree);
			space_map_ref_add_map(&reftree,
			    &vd->vdev_dtl[DTL_MISSING], 1);
			space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
			space_map_ref_add_map(&reftree,
			    &vd->vdev_dtl[DTL_SCRUB], 2);
			space_map_ref_generate_map(&reftree,
			    &vd->vdev_dtl[DTL_MISSING], 1);
			space_map_ref_destroy(&reftree);
			space_reftree_create(&reftree);
			space_reftree_add_map(&reftree,
			    vd->vdev_dtl[DTL_MISSING], 1);
			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
			space_reftree_add_map(&reftree,
			    vd->vdev_dtl[DTL_SCRUB], 2);
			space_reftree_generate_map(&reftree,
			    vd->vdev_dtl[DTL_MISSING], 1);
			space_reftree_destroy(&reftree);
		}
		space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
		space_map_walk(&vd->vdev_dtl[DTL_MISSING],
		    space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
		space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
		if (!vdev_readable(vd))
			space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
		else
			space_map_walk(&vd->vdev_dtl[DTL_MISSING],
			    space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);

		/*
		 * If the vdev was resilvering and no longer has any
		 * DTLs then reset its resilvering flag.
		 */
		if (vd->vdev_resilver_txg != 0 &&
		    vd->vdev_dtl[DTL_MISSING].sm_space == 0 &&
		    vd->vdev_dtl[DTL_OUTAGE].sm_space == 0)
		    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
		    range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0)
			vd->vdev_resilver_txg = 0;

		mutex_exit(&vd->vdev_dtl_lock);
@ -1853,6 +1860,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)

	mutex_enter(&vd->vdev_dtl_lock);
	for (t = 0; t < DTL_TYPES; t++) {
		int c;

		/* account for child's outage in parent's missing map */
		int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
		if (t == DTL_SCRUB)
@ -1863,47 +1872,57 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
			minref = vd->vdev_nparity + 1;	/* RAID-Z */
		else
			minref = vd->vdev_children;	/* any kind of mirror */
		space_map_ref_create(&reftree);
		space_reftree_create(&reftree);
		for (c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			mutex_enter(&cvd->vdev_dtl_lock);
			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
			space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
			mutex_exit(&cvd->vdev_dtl_lock);
		}
		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
		space_map_ref_destroy(&reftree);
		space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
		space_reftree_destroy(&reftree);
	}
	mutex_exit(&vd->vdev_dtl_lock);
}

static int
int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl_smo;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);
	int error = 0;
	int c;

	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
		ASSERT(!vd->vdev_ishole);

		if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
		error = space_map_open(&vd->vdev_dtl_sm, mos,
		    vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
		if (error)
			return (error);

		ASSERT3U(db->db_size, >=, sizeof (*smo));
		bcopy(db->db_data, smo, sizeof (*smo));
		dmu_buf_rele(db, FTAG);
		ASSERT(vd->vdev_dtl_sm != NULL);

		mutex_enter(&vd->vdev_dtl_lock);
		error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
		    NULL, SM_ALLOC, smo, mos);

		/*
		 * Now that we've opened the space_map we need to update
		 * the in-core DTL.
		 */
		space_map_update(vd->vdev_dtl_sm);

		error = space_map_load(vd->vdev_dtl_sm,
		    vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
		mutex_exit(&vd->vdev_dtl_lock);

		return (error);
	}

	for (c = 0; c < vd->vdev_children; c++) {
		error = vdev_dtl_load(vd->vdev_child[c]);
		if (error != 0)
			break;
	}

	return (error);
}

@ -1911,64 +1930,74 @@ void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl_smo;
	space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
	objset_t *mos = spa->spa_meta_objset;
	space_map_t smsync;
	kmutex_t smlock;
	dmu_buf_t *db;
	range_tree_t *rtsync;
	kmutex_t rtlock;
	dmu_tx_t *tx;
	uint64_t object = space_map_object(vd->vdev_dtl_sm);

	ASSERT(!vd->vdev_ishole);
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			VERIFY0(dmu_object_free(mos, smo->smo_object, tx));
			smo->smo_object = 0;
		}
	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_free(vd->vdev_dtl_sm, tx);
		space_map_close(vd->vdev_dtl_sm);
		vd->vdev_dtl_sm = NULL;
		mutex_exit(&vd->vdev_dtl_lock);
		dmu_tx_commit(tx);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
	if (vd->vdev_dtl_sm == NULL) {
		uint64_t new_object;

		new_object = space_map_alloc(mos, tx);
		VERIFY3U(new_object, !=, 0);

		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
		    0, -1ULL, 0, &vd->vdev_dtl_lock));
		ASSERT(vd->vdev_dtl_sm != NULL);
	}

	mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);

	rtsync = range_tree_create(NULL, NULL, &rtlock);

	mutex_enter(&rtlock);

	mutex_enter(&vd->vdev_dtl_lock);
	range_tree_walk(rt, range_tree_add, rtsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(vd->vdev_dtl_sm, tx);
	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
	range_tree_vacate(rtsync, NULL, NULL);

	range_tree_destroy(rtsync);

	mutex_exit(&rtlock);
	mutex_destroy(&rtlock);

	/*
	 * If the object for the space map has changed then dirty
	 * the top level so that we update the config.
	 */
	if (object != space_map_object(vd->vdev_dtl_sm)) {
		zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
		    "new object %llu", txg, spa_name(spa), object,
		    space_map_object(vd->vdev_dtl_sm));
		vdev_config_dirty(vd->vdev_top);
	}

	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);
	dmu_tx_commit(tx);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_walk(sm, space_map_add, &smsync);
	space_map_update(vd->vdev_dtl_sm);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(smo, mos, tx);
	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
	space_map_vacate(&smsync, NULL, NULL);

	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

/*
@ -2018,7 +2047,7 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
		if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
		    vdev_writeable(vd)) {

			thismin = vdev_dtl_min(vd);
@ -2126,29 +2155,25 @@ vdev_remove(vdev_t *vd, uint64_t txg)

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (vd->vdev_dtl_smo.smo_object) {
		ASSERT0(vd->vdev_dtl_smo.smo_alloc);
		(void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
		vd->vdev_dtl_smo.smo_object = 0;
	}

	if (vd->vdev_ms != NULL) {
		for (m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			if (msp == NULL || msp->ms_smo.smo_object == 0)
			if (msp == NULL || msp->ms_sm == NULL)
				continue;

			ASSERT0(msp->ms_smo.smo_alloc);
			(void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
			msp->ms_smo.smo_object = 0;
			mutex_enter(&msp->ms_lock);
			VERIFY0(space_map_allocated(msp->ms_sm));
			space_map_free(msp->ms_sm, tx);
			space_map_close(msp->ms_sm);
			msp->ms_sm = NULL;
			mutex_exit(&msp->ms_lock);
		}
	}

	if (vd->vdev_ms_array) {
		(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
		vd->vdev_ms_array = 0;
		vd->vdev_ms_shift = 0;
	}
	dmu_tx_commit(tx);
}

@ -283,9 +283,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
		    vd->vdev_removing);
	}

	if (vd->vdev_dtl_smo.smo_object != 0)
	if (vd->vdev_dtl_sm != NULL) {
		fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
		    vd->vdev_dtl_smo.smo_object);
		    space_map_object(vd->vdev_dtl_sm));
	}

	if (vd->vdev_crtxg)
		fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);

@ -369,36 +369,46 @@ spa_feature_enable(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
	    spa->spa_feat_desc_obj, feature, FEATURE_ACTION_ENABLE, tx));
}

/*
 * If the specified feature has not yet been enabled, this function returns
 * ENOTSUP; otherwise, this function increments the feature's refcount (or
 * returns EOVERFLOW if the refcount cannot be incremented). This function must
 * be called from syncing context.
 */
void
spa_feature_incr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
	VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
	    spa->spa_feat_desc_obj, feature, FEATURE_ACTION_INCR, tx));
}

/*
 * If the specified feature has not yet been enabled, this function returns
 * ENOTSUP; otherwise, this function decrements the feature's refcount (or
 * returns EOVERFLOW if the refcount is already 0). This function must
 * be called from syncing context.
 */
void
spa_feature_decr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
	VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
	    spa->spa_feat_desc_obj, feature, FEATURE_ACTION_DECR, tx));
}

/*
 * This interface is for debugging only. Normal consumers should use
 * spa_feature_is_enabled/spa_feature_is_active.
 */
int
spa_feature_get_refcount(spa_t *spa, zfeature_info_t *feature)
{
	int err;
	uint64_t refcount = 0;

	if (spa_version(spa) < SPA_VERSION_FEATURES)
		return (B_FALSE);

	err = feature_get_refcount(spa->spa_meta_objset,
	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
	    feature, &refcount);
	ASSERT(err == 0 || err == ENOTSUP);
	return (err == 0 ? refcount : 0);
}

boolean_t
spa_feature_is_enabled(spa_t *spa, zfeature_info_t *feature)
{

@ -20,7 +20,7 @@
 */

/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 */

@ -164,4 +164,7 @@ zpool_feature_init(void)
	zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
	    "org.illumos:lz4_compress", "lz4_compress",
	    "LZ4 compression algorithm support.", B_FALSE, B_FALSE, NULL);
	zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
	    "com.delphix:spacemap_histogram", "spacemap_histogram",
	    "Spacemaps maintain space histograms.", B_TRUE, B_FALSE, NULL);
}