mirror_zfs/module/zfs/range_tree.c
George Wilson 93cf20764a Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>

Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.

This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.

The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram

In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:

    * 4K sector devices will not see any compression benefit
    * large space_maps require more metadata on-disk
    * large space_maps require more time to load (typically random reads)

Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.

A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.

References:
  https://www.illumos.org/issues/4101
  https://www.illumos.org/issues/4102
  https://www.illumos.org/issues/4103
  https://www.illumos.org/issues/4105
  https://www.illumos.org/issues/4106
  https://github.com/illumos/illumos-gate/commit/0713e23

Porting notes:

A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.

Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2014-07-22 09:39:16 -07:00

392 lines
9.0 KiB
C

/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>
static kmem_cache_t *range_seg_cache;
void
range_tree_init(void)
{
ASSERT(range_seg_cache == NULL);
range_seg_cache = kmem_cache_create("range_seg_cache",
sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}
void
range_tree_fini(void)
{
kmem_cache_destroy(range_seg_cache);
range_seg_cache = NULL;
}
void
range_tree_stat_verify(range_tree_t *rt)
{
range_seg_t *rs;
uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
int i;
for (rs = avl_first(&rt->rt_root); rs != NULL;
rs = AVL_NEXT(&rt->rt_root, rs)) {
uint64_t size = rs->rs_end - rs->rs_start;
int idx = highbit(size) - 1;
hist[idx]++;
ASSERT3U(hist[idx], !=, 0);
}
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
if (hist[i] != rt->rt_histogram[i]) {
zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
i, hist, hist[i], rt->rt_histogram[i]);
}
VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
}
}
static void
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
{
uint64_t size = rs->rs_end - rs->rs_start;
int idx = highbit(size) - 1;
ASSERT3U(idx, <,
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
ASSERT(MUTEX_HELD(rt->rt_lock));
rt->rt_histogram[idx]++;
ASSERT3U(rt->rt_histogram[idx], !=, 0);
}
static void
range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
{
uint64_t size = rs->rs_end - rs->rs_start;
int idx = highbit(size) - 1;
ASSERT3U(idx, <,
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
ASSERT(MUTEX_HELD(rt->rt_lock));
ASSERT3U(rt->rt_histogram[idx], !=, 0);
rt->rt_histogram[idx]--;
}
/*
* NOTE: caller is responsible for all locking.
*/
static int
range_tree_seg_compare(const void *x1, const void *x2)
{
const range_seg_t *r1 = x1;
const range_seg_t *r2 = x2;
if (r1->rs_start < r2->rs_start) {
if (r1->rs_end > r2->rs_start)
return (0);
return (-1);
}
if (r1->rs_start > r2->rs_start) {
if (r1->rs_start < r2->rs_end)
return (0);
return (1);
}
return (0);
}
range_tree_t *
range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
{
range_tree_t *rt;
rt = kmem_zalloc(sizeof (range_tree_t), KM_PUSHPAGE);
avl_create(&rt->rt_root, range_tree_seg_compare,
sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
rt->rt_lock = lp;
rt->rt_ops = ops;
rt->rt_arg = arg;
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_create(rt, rt->rt_arg);
return (rt);
}
void
range_tree_destroy(range_tree_t *rt)
{
VERIFY0(rt->rt_space);
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
avl_destroy(&rt->rt_root);
kmem_free(rt, sizeof (*rt));
}
void
range_tree_add(void *arg, uint64_t start, uint64_t size)
{
range_tree_t *rt = arg;
avl_index_t where;
range_seg_t rsearch, *rs_before, *rs_after, *rs;
uint64_t end = start + size;
boolean_t merge_before, merge_after;
ASSERT(MUTEX_HELD(rt->rt_lock));
VERIFY(size != 0);
rsearch.rs_start = start;
rsearch.rs_end = end;
rs = avl_find(&rt->rt_root, &rsearch, &where);
if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
zfs_panic_recover("zfs: allocating allocated segment"
"(offset=%llu size=%llu)\n",
(longlong_t)start, (longlong_t)size);
return;
}
/* Make sure we don't overlap with either of our neighbors */
VERIFY(rs == NULL);
rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
merge_before = (rs_before != NULL && rs_before->rs_end == start);
merge_after = (rs_after != NULL && rs_after->rs_start == end);
if (merge_before && merge_after) {
avl_remove(&rt->rt_root, rs_before);
if (rt->rt_ops != NULL) {
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
}
range_tree_stat_decr(rt, rs_before);
range_tree_stat_decr(rt, rs_after);
rs_after->rs_start = rs_before->rs_start;
kmem_cache_free(range_seg_cache, rs_before);
rs = rs_after;
} else if (merge_before) {
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
range_tree_stat_decr(rt, rs_before);
rs_before->rs_end = end;
rs = rs_before;
} else if (merge_after) {
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
range_tree_stat_decr(rt, rs_after);
rs_after->rs_start = start;
rs = rs_after;
} else {
rs = kmem_cache_alloc(range_seg_cache, KM_PUSHPAGE);
rs->rs_start = start;
rs->rs_end = end;
avl_insert(&rt->rt_root, rs, where);
}
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
range_tree_stat_incr(rt, rs);
rt->rt_space += size;
}
void
range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
range_tree_t *rt = arg;
avl_index_t where;
range_seg_t rsearch, *rs, *newseg;
uint64_t end = start + size;
boolean_t left_over, right_over;
ASSERT(MUTEX_HELD(rt->rt_lock));
VERIFY3U(size, !=, 0);
VERIFY3U(size, <=, rt->rt_space);
rsearch.rs_start = start;
rsearch.rs_end = end;
rs = avl_find(&rt->rt_root, &rsearch, &where);
/* Make sure we completely overlap with someone */
if (rs == NULL) {
zfs_panic_recover("zfs: freeing free segment "
"(offset=%llu size=%llu)",
(longlong_t)start, (longlong_t)size);
return;
}
VERIFY3U(rs->rs_start, <=, start);
VERIFY3U(rs->rs_end, >=, end);
left_over = (rs->rs_start != start);
right_over = (rs->rs_end != end);
range_tree_stat_decr(rt, rs);
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
if (left_over && right_over) {
newseg = kmem_cache_alloc(range_seg_cache, KM_PUSHPAGE);
newseg->rs_start = end;
newseg->rs_end = rs->rs_end;
range_tree_stat_incr(rt, newseg);
rs->rs_end = start;
avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
} else if (left_over) {
rs->rs_end = start;
} else if (right_over) {
rs->rs_start = end;
} else {
avl_remove(&rt->rt_root, rs);
kmem_cache_free(range_seg_cache, rs);
rs = NULL;
}
if (rs != NULL) {
range_tree_stat_incr(rt, rs);
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}
rt->rt_space -= size;
}
static range_seg_t *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size,
avl_index_t *wherep)
{
range_seg_t rsearch, *rs;
uint64_t end = start + size;
ASSERT(MUTEX_HELD(rt->rt_lock));
VERIFY(size != 0);
rsearch.rs_start = start;
rsearch.rs_end = end;
rs = avl_find(&rt->rt_root, &rsearch, wherep);
if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end)
return (rs);
return (NULL);
}
void
range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
{
range_seg_t *rs;
avl_index_t where;
mutex_enter(rt->rt_lock);
rs = range_tree_find(rt, off, size, &where);
if (rs != NULL)
panic("freeing free block; rs=%p", (void *)rs);
mutex_exit(rt->rt_lock);
}
boolean_t
range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
{
avl_index_t where;
return (range_tree_find(rt, start, size, &where) != NULL);
}
void
range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
{
range_tree_t *rt;
ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
ASSERT0(range_tree_space(*rtdst));
ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
rt = *rtsrc;
*rtsrc = *rtdst;
*rtdst = rt;
}
void
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
range_seg_t *rs;
void *cookie = NULL;
ASSERT(MUTEX_HELD(rt->rt_lock));
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
if (func != NULL)
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
kmem_cache_free(range_seg_cache, rs);
}
bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
rt->rt_space = 0;
}
void
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
range_seg_t *rs;
ASSERT(MUTEX_HELD(rt->rt_lock));
for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}
uint64_t
range_tree_space(range_tree_t *rt)
{
return (rt->rt_space);
}