diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index a6368c67d..0f09ec70a 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -103,6 +103,7 @@ extern uint64_t zfs_arc_max, zfs_arc_meta_limit; extern int zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern int zfs_reconstruct_indirect_combinations_max; +extern int zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; @@ -949,7 +950,7 @@ dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &msp->ms_allocatable_by_size; + zfs_btree_t *t = &msp->ms_allocatable_by_size; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ @@ -958,7 +959,7 @@ dump_metaslab_stats(metaslab_t *msp) zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", - "segments", avl_numnodes(t), "maxsize", maxbuf, + "segments", zfs_btree_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); @@ -3141,7 +3142,7 @@ cksum_record_compare(const void *x1, const void *x2) int difference; for (int i = 0; i < arraysize; i++) { - difference = AVL_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); + difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); if (difference) break; } @@ -4063,7 +4064,7 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) ASSERT0(range_tree_space(svr->svr_allocd_segs)); - range_tree_t *allocs = range_tree_create(NULL, NULL); + range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; @@ -6088,7 +6089,8 @@ dump_zpool(spa_t *spa) if (dump_opt['d'] || dump_opt['i']) { spa_feature_t f; - mos_refd_objs = range_tree_create(NULL, NULL); + mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, + 0); dump_objset(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { @@ -6643,6 +6645,13 @@ main(int argc, char **argv) if (spa_config_path_env != NULL) spa_config_path = spa_config_path_env; + /* + * For performance reasons, we set this tunable down. We do so before + * the arg parsing section so that the user can override this value if + * they choose. 
+ */ + zfs_btree_verify_intensity = 3; + while ((c = getopt(argc, argv, "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XY")) != -1) { switch (c) { diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 734d5c161..5785bfd55 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -7,10 +7,12 @@ COMMON_H = \ $(top_srcdir)/include/sys/arc_impl.h \ $(top_srcdir)/include/sys/avl.h \ $(top_srcdir)/include/sys/avl_impl.h \ + $(top_srcdir)/include/sys/bitops.h \ $(top_srcdir)/include/sys/blkptr.h \ $(top_srcdir)/include/sys/bplist.h \ $(top_srcdir)/include/sys/bpobj.h \ $(top_srcdir)/include/sys/bptree.h \ + $(top_srcdir)/include/sys/btree.h \ $(top_srcdir)/include/sys/bqueue.h \ $(top_srcdir)/include/sys/cityhash.h \ $(top_srcdir)/include/sys/dataset_kstats.h \ diff --git a/include/sys/avl.h b/include/sys/avl.h index 962e8b1cf..6c4e7ed5c 100644 --- a/include/sys/avl.h +++ b/include/sys/avl.h @@ -108,9 +108,9 @@ extern "C" { /* * AVL comparator helpers */ -#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0)) -#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b))) -#define AVL_PCMP(a, b) \ +#define TREE_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define TREE_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define TREE_PCMP(a, b) \ (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) /* diff --git a/include/sys/bitops.h b/include/sys/bitops.h new file mode 100644 index 000000000..56d52073b --- /dev/null +++ b/include/sys/bitops.h @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017 Datto Inc. + */ + +#ifndef _SYS_BITOPS_H +#define _SYS_BITOPS_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * General-purpose 32-bit and 64-bit bitfield encodings. 
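/*
 * Illustrative sketch (not part of this patch): the renamed TREE_CMP and
 * TREE_ISIGN helpers are shared by AVL and B-Tree comparators alike, which
 * must return exactly -1, 0, or +1. A hypothetical two-key comparator might
 * look like the following; the struct and field names are invented for the
 * example.
 */
typedef struct example_node {
	uint64_t en_offset;
	uint64_t en_txg;
} example_node_t;

static int
example_node_compare(const void *x1, const void *x2)
{
	const example_node_t *n1 = x1;
	const example_node_t *n2 = x2;

	/* Primary key first; fall back to the secondary key on ties. */
	int cmp = TREE_CMP(n1->en_offset, n2->en_offset);
	if (cmp != 0)
		return (cmp);
	return (TREE_CMP(n1->en_txg, n2->en_txg));
}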
+ */ +#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) +#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) +#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) +#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) + +#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) +#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) + +#define BF32_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1U << (len)); \ + ASSERT3U(low + len, <=, 32); \ + (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ +_NOTE(CONSTCOND) } while (0) + +#define BF64_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1ULL << (len)); \ + ASSERT3U(low + len, <=, 64); \ + ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ +_NOTE(CONSTCOND) } while (0) + +#define BF32_GET_SB(x, low, len, shift, bias) \ + ((BF32_GET(x, low, len) + (bias)) << (shift)) +#define BF64_GET_SB(x, low, len, shift, bias) \ + ((BF64_GET(x, low, len) + (bias)) << (shift)) + +/* + * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in + * the case where val is a constant. We can't fix ASSERT because it's used as + * an expression in several places in the kernel; as a result, changing it to + * the do{} while() syntax to allow us to _NOTE the CONSTCOND is not an option. + */ +#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) +#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BITOPS_H */ diff --git a/include/sys/btree.h b/include/sys/btree.h new file mode 100644 index 000000000..1673b5217 --- /dev/null +++ b/include/sys/btree.h @@ -0,0 +1,238 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +#ifndef _BTREE_H +#define _BTREE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* + * This file defines the interface for a B-Tree implementation for ZFS. The + * tree can be used to store arbitrary sortable data types with low overhead + * and good operation performance. In addition the tree intelligently + * optimizes bulk in-order insertions to improve memory use and performance. + * + * Note that for all B-Tree functions, the values returned are pointers to the + * internal copies of the data in the tree. The internal data can only be + * safely mutated if the changes cannot change the ordering of the element + * with respect to any other elements in the tree. + * + * The major drawback of the B-Tree is that any returned elements or indexes + * are only valid until a side-effectful operation occurs, since these can + * result in reallocation or relocation of data. 
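/*
 * Illustrative sketch (not part of this patch): how the BF64_* helpers that
 * were just moved into bitops.h pack and unpack a bitfield. The field
 * widths, offsets, and values below are arbitrary choices for the example.
 */
static void
example_bitfield_roundtrip(void)
{
	uint64_t word = 0;

	/* Store a 10-bit value at bit offset 20 and read it back. */
	BF64_SET(word, 20, 10, 0x2a5);
	ASSERT3U(BF64_GET(word, 20, 10), ==, 0x2a5);

	/*
	 * The _SB variants additionally apply a shift and bias, which lets
	 * sector-aligned sizes be stored compactly (here: 8K with 512-byte
	 * sectors and a bias of 1).
	 */
	BF64_SET_SB(word, 32, 16, 9, 1, 8192);
	ASSERT3U(BF64_GET_SB(word, 32, 16, 9, 1), ==, 8192);
}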
Side effectful operations are + * defined as insertion, removal, and zfs_btree_destroy_nodes. + * + * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core + * nodes have an array of children pointing to other nodes, and an array of + * elements that act as separators between the elements of the subtrees rooted + * at its children. Leaf nodes only contain data elements, and form the bottom + * layer of the tree. Unlike B+ Trees, in this B-Tree implementation the + * elements in the core nodes are not copies of or references to leaf node + * elements. Each element occcurs only once in the tree, no matter what kind + * of node it is in. + * + * The tree's height is the same throughout, unlike many other forms of search + * tree. Each node (except for the root) must be between half minus one and + * completely full of elements (and children) at all times. Any operation that + * would put the node outside of that range results in a rebalancing operation + * (taking, merging, or splitting). + * + * This tree was implemented using descriptions from Wikipedia's articles on + * B-Trees and B+ Trees. + */ + +/* + * Decreasing these values results in smaller memmove operations, but more of + * them, and increased memory overhead. Increasing these values results in + * higher variance in operation time, and reduces memory overhead. + */ +#define BTREE_CORE_ELEMS 128 +#define BTREE_LEAF_SIZE 4096 + +extern kmem_cache_t *zfs_btree_leaf_cache; + +typedef struct zfs_btree_hdr { + struct zfs_btree_core *bth_parent; + boolean_t bth_core; + /* + * For both leaf and core nodes, represents the number of elements in + * the node. For core nodes, they will have bth_count + 1 children. + */ + uint32_t bth_count; +} zfs_btree_hdr_t; + +typedef struct zfs_btree_core { + zfs_btree_hdr_t btc_hdr; + zfs_btree_hdr_t *btc_children[BTREE_CORE_ELEMS + 1]; + uint8_t btc_elems[]; +} zfs_btree_core_t; + +typedef struct zfs_btree_leaf { + zfs_btree_hdr_t btl_hdr; + uint8_t btl_elems[]; +} zfs_btree_leaf_t; + +typedef struct zfs_btree_index { + zfs_btree_hdr_t *bti_node; + uint64_t bti_offset; + /* + * True if the location is before the list offset, false if it's at + * the listed offset. + */ + boolean_t bti_before; +} zfs_btree_index_t; + +typedef struct btree { + zfs_btree_hdr_t *bt_root; + int64_t bt_height; + size_t bt_elem_size; + uint64_t bt_num_elems; + uint64_t bt_num_nodes; + zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading + int (*bt_compar) (const void *, const void *); +} zfs_btree_t; + +/* + * Allocate and deallocate caches for btree nodes. + */ +void zfs_btree_init(void); +void zfs_btree_fini(void); + +/* + * Initialize an B-Tree. Arguments are: + * + * tree - the tree to be initialized + * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 + * -1 for <, 0 for ==, and +1 for > + * size - the value of sizeof(struct my_type) + */ +void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), + size_t); + +/* + * Find a node with a matching value in the tree. Returns the matching node + * found. If not found, it returns NULL and then if "where" is not NULL it sets + * "where" for use with zfs_btree_insert() or zfs_btree_nearest(). + * + * node - node that has the value being looked for + * where - position for use with zfs_btree_nearest() or zfs_btree_insert(), + * may be NULL + */ +void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *); + +/* + * Insert a node into the tree. 
+ * + * node - the node to insert + * where - position as returned from zfs_btree_find() + */ +void zfs_btree_insert(zfs_btree_t *, const void *, const zfs_btree_index_t *); + +/* + * Return the first or last valued node in the tree. Will return NULL + * if the tree is empty. + */ +void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *); +void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *); + +/* + * Return the next or previous valued node in the tree. + */ +void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *, + zfs_btree_index_t *); +void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *, + zfs_btree_index_t *); + +/* + * Get a value from a tree and an index. + */ +void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *); + +/* + * Add a single value to the tree. The value must not compare equal to any + * other node already in the tree. + */ +void zfs_btree_add(zfs_btree_t *, const void *); + +/* + * Remove a single value from the tree. The value must be in the tree. The + * pointer passed in may be a pointer into a tree-controlled buffer, but it + * need not be. + */ +void zfs_btree_remove(zfs_btree_t *, const void *); + +/* + * Remove the value at the given location from the tree. + */ +void zfs_btree_remove_from(zfs_btree_t *, zfs_btree_index_t *); + +/* + * Return the number of nodes in the tree + */ +ulong_t zfs_btree_numnodes(zfs_btree_t *); + +/* + * Used to destroy any remaining nodes in a tree. The cookie argument should + * be initialized to NULL before the first call. Returns a node that has been + * removed from the tree and may be free()'d. Returns NULL when the tree is + * empty. + * + * Once you call zfs_btree_destroy_nodes(), you can only continuing calling it + * and finally zfs_btree_destroy(). No other B-Tree routines will be valid. + * + * cookie - an index used to save state between calls to + * zfs_btree_destroy_nodes() + * + * EXAMPLE: + * zfs_btree_t *tree; + * struct my_data *node; + * zfs_btree_index_t *cookie; + * + * cookie = NULL; + * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) + * data_destroy(node); + * zfs_btree_destroy(tree); + */ +void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **); + +/* + * Destroys all nodes in the tree quickly. This doesn't give the caller an + * opportunity to iterate over each node and do its own cleanup; for that, use + * zfs_btree_destroy_nodes(). + */ +void zfs_btree_clear(zfs_btree_t *); + +/* + * Final destroy of an B-Tree. Arguments are: + * + * tree - the empty tree to destroy + */ +void zfs_btree_destroy(zfs_btree_t *tree); + +/* Runs a variety of self-checks on the btree to verify integrity. 
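/*
 * Illustrative sketch (not part of this patch): a typical consumer of the
 * interface declared above. The element type and comparator are
 * hypothetical; error handling and locking are omitted.
 */
typedef struct example_ent {
	uint64_t ee_key;
} example_ent_t;

static int
example_ent_compare(const void *a, const void *b)
{
	const example_ent_t *ea = a;
	const example_ent_t *eb = b;
	return (TREE_CMP(ea->ee_key, eb->ee_key));
}

static void
example_btree_usage(void)
{
	zfs_btree_t tree;
	zfs_btree_index_t where;
	example_ent_t ent = { .ee_key = 42 };

	zfs_btree_create(&tree, example_ent_compare, sizeof (example_ent_t));

	/* The tree copies the element; the stack copy can be reused. */
	zfs_btree_add(&tree, &ent);

	/* Lookups return a pointer into the tree's own storage. */
	example_ent_t *found = zfs_btree_find(&tree, &ent, &where);
	ASSERT3P(found, !=, NULL);

	/* Walk the tree in sorted order. */
	for (example_ent_t *e = zfs_btree_first(&tree, &where); e != NULL;
	    e = zfs_btree_next(&tree, &where, &where))
		ASSERT3U(e->ee_key, !=, 0);

	zfs_btree_clear(&tree);
	zfs_btree_destroy(&tree);
}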
*/ +void zfs_btree_verify(zfs_btree_t *tree); + +#ifdef __cplusplus +} +#endif + +#endif /* _BTREE_H */ diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 00b8b4758..f8d9c6a82 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -95,8 +95,8 @@ void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); -void metaslab_alloc_trace_init(void); -void metaslab_alloc_trace_fini(void); +void metaslab_stat_init(void); +void metaslab_stat_fini(void); void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 3ce39183e..d140f741d 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -509,8 +509,8 @@ struct metaslab { * only difference is that the ms_allocatable_by_size is ordered by * segment sizes. */ - avl_tree_t ms_allocatable_by_size; - avl_tree_t ms_unflushed_frees_by_size; + zfs_btree_t ms_allocatable_by_size; + zfs_btree_t ms_unflushed_frees_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index e299613b8..d80faa933 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -30,7 +30,7 @@ #ifndef _SYS_RANGE_TREE_H #define _SYS_RANGE_TREE_H -#include +#include #include #ifdef __cplusplus @@ -41,20 +41,35 @@ extern "C" { typedef struct range_tree_ops range_tree_ops_t; +typedef enum range_seg_type { + RANGE_SEG32, + RANGE_SEG64, + RANGE_SEG_GAP, + RANGE_SEG_NUM_TYPES, +} range_seg_type_t; + /* * Note: the range_tree may not be accessed concurrently; consumers * must provide external locking if required. */ typedef struct range_tree { - avl_tree_t rt_root; /* offset-ordered segment AVL tree */ + zfs_btree_t rt_root; /* offset-ordered segment b-tree */ uint64_t rt_space; /* sum of all segments in the map */ - uint64_t rt_gap; /* allowable inter-segment gap */ + range_seg_type_t rt_type; /* type of range_seg_t in use */ + /* + * All data that is stored in the range tree must have a start higher + * than or equal to rt_start, and all sizes and offsets must be + * multiples of 1 << rt_shift. + */ + uint8_t rt_shift; + uint64_t rt_start; range_tree_ops_t *rt_ops; - /* rt_avl_compare should only be set if rt_arg is an AVL tree */ + /* rt_btree_compare should only be set if rt_arg is a b-tree */ void *rt_arg; - int (*rt_avl_compare)(const void *, const void *); + int (*rt_btree_compare)(const void *, const void *); + uint64_t rt_gap; /* allowable inter-segment gap */ /* * The rt_histogram maintains a histogram of ranges. Each bucket, @@ -64,36 +79,217 @@ typedef struct range_tree { uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; } range_tree_t; -typedef struct range_seg { - avl_node_t rs_node; /* AVL node */ - avl_node_t rs_pp_node; /* AVL picker-private node */ +typedef struct range_seg32 { + uint32_t rs_start; /* starting offset of this segment */ + uint32_t rs_end; /* ending offset (non-inclusive) */ +} range_seg32_t; + +/* + * Extremely large metaslabs, vdev-wide trees, and dnode-wide trees may + * require 64-bit integers for ranges. 
+ */ +typedef struct range_seg64 { + uint64_t rs_start; /* starting offset of this segment */ + uint64_t rs_end; /* ending offset (non-inclusive) */ +} range_seg64_t; + +typedef struct range_seg_gap { uint64_t rs_start; /* starting offset of this segment */ uint64_t rs_end; /* ending offset (non-inclusive) */ uint64_t rs_fill; /* actual fill if gap mode is on */ -} range_seg_t; +} range_seg_gap_t; + +/* + * This type needs to be the largest of the range segs, since it will be stack + * allocated and then cast the actual type to do tree operations. + */ +typedef range_seg_gap_t range_seg_max_t; + +/* + * This is just for clarity of code purposes, so we can make it clear that a + * pointer is to a range seg of some type; when we need to do the actual math, + * we'll figure out the real type. + */ +typedef void range_seg_t; struct range_tree_ops { void (*rtop_create)(range_tree_t *rt, void *arg); void (*rtop_destroy)(range_tree_t *rt, void *arg); - void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg); - void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg); + void (*rtop_add)(range_tree_t *rt, void *rs, void *arg); + void (*rtop_remove)(range_tree_t *rt, void *rs, void *arg); void (*rtop_vacate)(range_tree_t *rt, void *arg); }; +static inline uint64_t +rs_get_start_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + return (((range_seg32_t *)rs)->rs_start); + case RANGE_SEG64: + return (((range_seg64_t *)rs)->rs_start); + case RANGE_SEG_GAP: + return (((range_seg_gap_t *)rs)->rs_start); + default: + VERIFY(0); + return (0); + } +} + +static inline uint64_t +rs_get_end_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + return (((range_seg32_t *)rs)->rs_end); + case RANGE_SEG64: + return (((range_seg64_t *)rs)->rs_end); + case RANGE_SEG_GAP: + return (((range_seg_gap_t *)rs)->rs_end); + default: + VERIFY(0); + return (0); + } +} + +static inline uint64_t +rs_get_fill_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: { + const range_seg32_t *r32 = rs; + return (r32->rs_end - r32->rs_start); + } + case RANGE_SEG64: { + const range_seg64_t *r64 = rs; + return (r64->rs_end - r64->rs_start); + } + case RANGE_SEG_GAP: + return (((range_seg_gap_t *)rs)->rs_fill); + default: + VERIFY(0); + return (0); + } + +} + +static inline uint64_t +rs_get_start(const range_seg_t *rs, const range_tree_t *rt) +{ + return ((rs_get_start_raw(rs, rt) << rt->rt_shift) + rt->rt_start); +} + +static inline uint64_t +rs_get_end(const range_seg_t *rs, const range_tree_t *rt) +{ + return ((rs_get_end_raw(rs, rt) << rt->rt_shift) + rt->rt_start); +} + +static inline uint64_t +rs_get_fill(const range_seg_t *rs, const range_tree_t *rt) +{ + return (rs_get_fill_raw(rs, rt) << rt->rt_shift); +} + +static inline void +rs_set_start_raw(range_seg_t *rs, range_tree_t *rt, uint64_t start) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + ASSERT3U(start, <=, UINT32_MAX); + ((range_seg32_t *)rs)->rs_start = (uint32_t)start; + break; + case RANGE_SEG64: + ((range_seg64_t *)rs)->rs_start = start; + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_start = start; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_end_raw(range_seg_t *rs, 
range_tree_t *rt, uint64_t end) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + ASSERT3U(end, <=, UINT32_MAX); + ((range_seg32_t *)rs)->rs_end = (uint32_t)end; + break; + case RANGE_SEG64: + ((range_seg64_t *)rs)->rs_end = end; + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_end = end; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_fill_raw(range_seg_t *rs, range_tree_t *rt, uint64_t fill) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + /* fall through */ + case RANGE_SEG64: + ASSERT3U(fill, ==, rs_get_end_raw(rs, rt) - rs_get_start_raw(rs, + rt)); + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_fill = fill; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_start(range_seg_t *rs, range_tree_t *rt, uint64_t start) +{ + ASSERT3U(start, >=, rt->rt_start); + ASSERT(IS_P2ALIGNED(start, 1ULL << rt->rt_shift)); + rs_set_start_raw(rs, rt, (start - rt->rt_start) >> rt->rt_shift); +} + +static inline void +rs_set_end(range_seg_t *rs, range_tree_t *rt, uint64_t end) +{ + ASSERT3U(end, >=, rt->rt_start); + ASSERT(IS_P2ALIGNED(end, 1ULL << rt->rt_shift)); + rs_set_end_raw(rs, rt, (end - rt->rt_start) >> rt->rt_shift); +} + +static inline void +rs_set_fill(range_seg_t *rs, range_tree_t *rt, uint64_t fill) +{ + ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift)); + rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); +} + typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); -void range_tree_init(void); -void range_tree_fini(void); -range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare) (const void *, const void *), uint64_t gap); -range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); +range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, + range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + int (*zfs_btree_compare) (const void *, const void *), uint64_t gap); +range_tree_t *range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, + void *arg, uint64_t start, uint64_t shift); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, uint64_t *ostart, uint64_t *osize); void range_tree_verify_not_present(range_tree_t *rt, uint64_t start, uint64_t size); -range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize); uint64_t range_tree_space(range_tree_t *rt); @@ -120,12 +316,12 @@ void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, range_tree_t *addto); -void rt_avl_create(range_tree_t *rt, void *arg); -void rt_avl_destroy(range_tree_t *rt, void *arg); -void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_vacate(range_tree_t *rt, void *arg); -extern struct range_tree_ops rt_avl_ops; +void rt_btree_create(range_tree_t *rt, void *arg); +void rt_btree_destroy(range_tree_t *rt, void *arg); +void rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg); +void 
rt_btree_vacate(range_tree_t *rt, void *arg); +extern range_tree_ops_t rt_btree_ops; #ifdef __cplusplus } diff --git a/include/sys/spa.h b/include/sys/spa.h index caa2157b9..18933e0d4 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -43,6 +43,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -69,51 +70,6 @@ struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; -/* - * General-purpose 32-bit and 64-bit bitfield encodings. - */ -#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) -#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) -#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) -#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) - -#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) -#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) - -#define BF32_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1U << (len)); \ - ASSERT3U(low + len, <=, 32); \ - (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ -_NOTE(CONSTCOND) } while (0) - -#define BF64_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1ULL << (len)); \ - ASSERT3U(low + len, <=, 64); \ - ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ -_NOTE(CONSTCOND) } while (0) - -#define BF32_GET_SB(x, low, len, shift, bias) \ - ((BF32_GET(x, low, len) + (bias)) << (shift)) -#define BF64_GET_SB(x, low, len, shift, bias) \ - ((BF64_GET(x, low, len) + (bias)) << (shift)) - -/* - * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in - * the case where val is a constant. We can't fix ASSERT because it's used as - * an expression in several places in the kernel; as a result, changing it to - * the do{} while() syntax to allow us to _NOTE the CONSTCOND is not an option. - */ -#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) -#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) - /* * We currently support block sizes from 512 bytes to 16MB. 
* The benefits of larger blocks, and thus larger IO, need to be weighed diff --git a/include/sys/space_reftree.h b/include/sys/space_reftree.h index 249b15be6..ca9d41dc1 100644 --- a/include/sys/space_reftree.h +++ b/include/sys/space_reftree.h @@ -31,7 +31,7 @@ #define _SYS_SPACE_REFTREE_H #include - +#include #ifdef __cplusplus extern "C" { #endif diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 0d0d0234e..c2fbcc549 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -96,8 +96,8 @@ extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd, char *tag); -extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, - range_seg_t *physical_rs); +extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs); extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index c179191e3..ae82b75c0 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -86,8 +86,8 @@ typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, * Given a target vdev, translates the logical range "in" to the physical * range "res" */ -typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in, - range_seg_t *res); +typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *in, + range_seg64_t *res); typedef const struct vdev_ops { vdev_open_func_t *vdev_op_open; @@ -526,8 +526,8 @@ extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ -extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, - range_seg_t *out); +extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, + range_seg64_t *out); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index ef638a498..e88ace53f 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -796,7 +796,7 @@ libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); - return (AVL_ISIGN(rv)); + return (TREE_ISIGN(rv)); } void diff --git a/lib/libzfs/libzfs_iter.c b/lib/libzfs/libzfs_iter.c index d3ba58a9e..31b2ca30e 100644 --- a/lib/libzfs/libzfs_iter.c +++ b/lib/libzfs/libzfs_iter.c @@ -302,7 +302,7 @@ zfs_snapshot_compare(const void *larg, const void *rarg) lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); - return (AVL_CMP(lcreate, rcreate)); + return (TREE_CMP(lcreate, rcreate)); } int diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 0b6b31568..29cd24eed 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -511,7 +511,7 @@ fsavl_compare(const void *arg1, const void *arg2) const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; - return (AVL_CMP(fn1->fn_guid, fn2->fn_guid)); + return (TREE_CMP(fn1->fn_guid, fn2->fn_guid)); } /* diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 97f9abcf4..ca3038f7d 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -45,6 +45,7 @@ KERNEL_C = \ bplist.c \ bpobj.c \ bptree.c \ + btree.c \ bqueue.c \ 
cityhash.c \ dbuf.c \ diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index e85ce4594..b8cdc118b 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -972,11 +972,11 @@ slice_cache_compare(const void *arg1, const void *arg2) uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid; int rv; - rv = AVL_ISIGN(strcmp(nm1, nm2)); + rv = TREE_ISIGN(strcmp(nm1, nm2)); if (rv) return (rv); - return (AVL_CMP(guid1, guid2)); + return (TREE_CMP(guid1, guid2)); } static int diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index a4d336abe..ebfaae67b 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -247,7 +247,7 @@ zfs_znode_hold_compare(const void *a, const void *b) const znode_hold_t *zh_a = (const znode_hold_t *)a; const znode_hold_t *zh_b = (const znode_hold_t *)b; - return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj)); + return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); } boolean_t diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index b60b799b5..c00c8e424 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -22,6 +22,7 @@ $(MODULE)-objs += blkptr.o $(MODULE)-objs += bplist.o $(MODULE)-objs += bpobj.o $(MODULE)-objs += bptree.o +$(MODULE)-objs += btree.o $(MODULE)-objs += bqueue.o $(MODULE)-objs += cityhash.o $(MODULE)-objs += dataset_kstats.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 9b90690b6..c1ad8785d 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4759,7 +4759,6 @@ arc_kmem_reap_soon(void) kmem_cache_t *prev_data_cache = NULL; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; - extern kmem_cache_t *range_seg_cache; #ifdef _KERNEL if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) && @@ -4796,7 +4795,7 @@ arc_kmem_reap_soon(void) kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); - kmem_cache_reap_now(range_seg_cache); + kmem_cache_reap_now(zfs_btree_leaf_cache); if (zio_arena != NULL) { /* diff --git a/module/zfs/btree.c b/module/zfs/btree.c new file mode 100644 index 000000000..8f514caab --- /dev/null +++ b/module/zfs/btree.c @@ -0,0 +1,2124 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +#include +#include +#include + +kmem_cache_t *zfs_btree_leaf_cache; + +/* + * Control the extent of the verification that occurs when zfs_btree_verify is + * called. Primarily used for debugging when extending the btree logic and + * functionality. As the intensity is increased, new verification steps are + * added. These steps are cumulative; intensity = 3 includes the intensity = 1 + * and intensity = 2 steps as well. + * + * Intensity 1: Verify that the tree's height is consistent throughout. + * Intensity 2: Verify that a core node's children's parent pointers point + * to the core node. + * Intensity 3: Verify that the total number of elements in the tree matches the + * sum of the number of elements in each node. 
Also verifies that each node's + * count obeys the invariants (less than or equal to maximum value, greater than + * or equal to half the maximum minus one). + * Intensity 4: Verify that each element compares less than the element + * immediately after it and greater than the one immediately before it using the + * comparator function. For core nodes, also checks that each element is greater + * than the last element in the first of the two nodes it separates, and less + * than the first element in the second of the two nodes. + * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside + * of each node is poisoned appropriately. Note that poisoning always occurs if + * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal + * operation. + * + * Intensity 4 and 5 are particularly expensive to perform; the previous levels + * are a few memory operations per node, while these levels require multiple + * operations per element. In addition, when creating large btrees, these + * operations are called at every step, resulting in extremely slow operation + * (while the asymptotic complexity of the other steps is the same, the + * importance of the constant factors cannot be denied). + */ +int zfs_btree_verify_intensity = 0; + +/* + * A convenience function to silence warnings from memmove's return value and + * change argument order to src, dest. + */ +void +bmov(const void *src, void *dest, size_t size) +{ + (void) memmove(dest, src, size); +} + +#ifdef _ILP32 +#define BTREE_POISON 0xabadb10c +#else +#define BTREE_POISON 0xabadb10cdeadbeef +#endif + +static void +zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ +#ifdef ZFS_DEBUG + size_t size = tree->bt_elem_size; + if (!hdr->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f, + BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) - + hdr->bth_count * size); + } else { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + node->btc_children[i] = + (zfs_btree_hdr_t *)BTREE_POISON; + } + (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, + (BTREE_CORE_ELEMS - hdr->bth_count) * size); + } +#endif +} + +static inline void +zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, + uint64_t offset) +{ +#ifdef ZFS_DEBUG + size_t size = tree->bt_elem_size; + ASSERT3U(offset, >=, hdr->bth_count); + if (!hdr->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + (void) memset(leaf->btl_elems + offset * size, 0x0f, size); + } else { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + node->btc_children[offset + 1] = + (zfs_btree_hdr_t *)BTREE_POISON; + (void) memset(node->btc_elems + offset * size, 0x0f, size); + } +#endif +} + +static inline void +zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, + uint64_t offset) +{ +#ifdef ZFS_DEBUG + size_t size = tree->bt_elem_size; + uint8_t eval = 0x0f; + if (hdr->bth_core) { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON; + VERIFY3P(node->btc_children[offset + 1], ==, cval); + for (int i = 0; i < size; i++) + VERIFY3U(node->btc_elems[offset * size + i], ==, eval); + } else { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + for (int i = 0; i < size; i++) + VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval); + } +#endif +} + +void +zfs_btree_init(void) +{ + zfs_btree_leaf_cache = 
kmem_cache_create("zfs_btree_leaf_cache", + BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, + NULL, 0); +} + +void +zfs_btree_fini(void) +{ + kmem_cache_destroy(zfs_btree_leaf_cache); +} + +void +zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), + size_t size) +{ + /* + * We need a minimmum of 4 elements so that when we split a node we + * always have at least two elements in each node. This simplifies the + * logic in zfs_btree_bulk_finish, since it means the last leaf will + * always have a left sibling to share with (unless it's the root). + */ + ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4); + + bzero(tree, sizeof (*tree)); + tree->bt_compar = compar; + tree->bt_elem_size = size; + tree->bt_height = -1; + tree->bt_bulk = NULL; +} + +/* + * Find value in the array of elements provided. Uses a simple binary search. + */ +static void * +zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems, + const void *value, zfs_btree_index_t *where) +{ + uint64_t max = nelems; + uint64_t min = 0; + while (max > min) { + uint64_t idx = (min + max) / 2; + uint8_t *cur = buf + idx * tree->bt_elem_size; + int comp = tree->bt_compar(cur, value); + if (comp == -1) { + min = idx + 1; + } else if (comp == 1) { + max = idx; + } else { + ASSERT0(comp); + where->bti_offset = idx; + where->bti_before = B_FALSE; + return (cur); + } + } + + where->bti_offset = max; + where->bti_before = B_TRUE; + return (NULL); +} + +/* + * Find the given value in the tree. where may be passed as null to use as a + * membership test or if the btree is being used as a map. + */ +void * +zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) +{ + if (tree->bt_height == -1) { + if (where != NULL) { + where->bti_node = NULL; + where->bti_offset = 0; + } + ASSERT0(tree->bt_num_elems); + return (NULL); + } + + /* + * If we're in bulk-insert mode, we check the last spot in the tree + * and the last leaf in the tree before doing the normal search, + * because for most workloads the vast majority of finds in + * bulk-insert mode are to insert new elements. + */ + zfs_btree_index_t idx; + if (tree->bt_bulk != NULL) { + zfs_btree_leaf_t *last_leaf = tree->bt_bulk; + int compar = tree->bt_compar(last_leaf->btl_elems + + ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size), + value); + if (compar < 0) { + /* + * If what they're looking for is after the last + * element, it's not in the tree. + */ + if (where != NULL) { + where->bti_node = (zfs_btree_hdr_t *)last_leaf; + where->bti_offset = + last_leaf->btl_hdr.bth_count; + where->bti_before = B_TRUE; + } + return (NULL); + } else if (compar == 0) { + if (where != NULL) { + where->bti_node = (zfs_btree_hdr_t *)last_leaf; + where->bti_offset = + last_leaf->btl_hdr.bth_count - 1; + where->bti_before = B_FALSE; + } + return (last_leaf->btl_elems + + ((last_leaf->btl_hdr.bth_count - 1) * + tree->bt_elem_size)); + } + if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) { + /* + * If what they're looking for is after the first + * element in the last leaf, it's in the last leaf or + * it's not in the tree. 
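/*
 * Illustrative sketch (not part of this patch): the find-then-insert idiom
 * that the "where" index supports. On a miss, zfs_btree_find() fills in the
 * position the element would occupy (bti_before is set), and that index can
 * be handed straight to zfs_btree_insert() to avoid a second search. The
 * element type is the hypothetical one from the earlier usage sketch.
 */
static void
example_find_or_insert(zfs_btree_t *tree, const example_ent_t *ent)
{
	zfs_btree_index_t where;

	if (zfs_btree_find(tree, ent, &where) == NULL)
		zfs_btree_insert(tree, ent, &where);
}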
+ */ + void *d = zfs_btree_find_in_buf(tree, + last_leaf->btl_elems, last_leaf->btl_hdr.bth_count, + value, &idx); + + if (where != NULL) { + idx.bti_node = (zfs_btree_hdr_t *)last_leaf; + *where = idx; + } + return (d); + } + } + + zfs_btree_core_t *node = NULL; + uint64_t child = 0; + uint64_t depth = 0; + + /* + * Iterate down the tree, finding which child the value should be in + * by comparing with the separators. + */ + for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; + node = (zfs_btree_core_t *)node->btc_children[child], depth++) { + ASSERT3P(node, !=, NULL); + void *d = zfs_btree_find_in_buf(tree, node->btc_elems, + node->btc_hdr.bth_count, value, &idx); + EQUIV(d != NULL, !idx.bti_before); + if (d != NULL) { + if (where != NULL) { + idx.bti_node = (zfs_btree_hdr_t *)node; + *where = idx; + } + return (d); + } + ASSERT(idx.bti_before); + child = idx.bti_offset; + } + + /* + * The value is in this leaf, or it would be if it were in the + * tree. Find its proper location and return it. + */ + zfs_btree_leaf_t *leaf = (depth == 0 ? + (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); + void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems, + leaf->btl_hdr.bth_count, value, &idx); + + if (where != NULL) { + idx.bti_node = (zfs_btree_hdr_t *)leaf; + *where = idx; + } + + return (d); +} + +/* + * To explain the following functions, it is useful to understand the four + * kinds of shifts used in btree operation. First, a shift is a movement of + * elements within a node. It is used to create gaps for inserting new + * elements and children, or cover gaps created when things are removed. A + * shift has two fundamental properties, each of which can be one of two + * values, making four types of shifts. There is the direction of the shift + * (left or right) and the shape of the shift (parallelogram or isoceles + * trapezoid (shortened to trapezoid hereafter)). The shape distinction only + * applies to shifts of core nodes. + * + * The names derive from the following imagining of the layout of a node: + * + * Elements: * * * * * * * ... * * * + * Children: * * * * * * * * ... * * * + * + * This layout follows from the fact that the elements act as separators + * between pairs of children, and that children root subtrees "below" the + * current node. A left and right shift are fairly self-explanatory; a left + * shift moves things to the left, while a right shift moves things to the + * right. A parallelogram shift is a shift with the same number of elements + * and children being moved, while a trapezoid shift is a shift that moves one + * more children than elements. An example follows: + * + * A parallelogram shift could contain the following: + * _______________ + * \* * * * \ * * * ... * * * + * * \ * * * *\ * * * ... * * * + * --------------- + * A trapezoid shift could contain the following: + * ___________ + * * / * * * \ * * * ... * * * + * * / * * * *\ * * * ... * * * + * --------------- + * + * Note that a parellelogram shift is always shaped like a "left-leaning" + * parallelogram, where the starting index of the children being moved is + * always one higher than the starting index of the elements being moved. No + * "right-leaning" parallelogram shifts are needed (shifts where the starting + * element index and starting child index being moved are the same) to achieve + * any btree operations, so we ignore them. 
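/*
 * Illustrative sketch (not part of this patch): the only difference between
 * the two shapes is whether the moved child range starts at idx (trapezoid,
 * moving count + 1 children) or at idx + 1 (parallelogram, moving count
 * children). The hypothetical helper below just restates that arithmetic.
 */
static void
example_shift_counts(uint64_t idx, uint64_t count, boolean_t trapezoid,
    uint64_t *child_start, uint64_t *child_count)
{
	*child_start = idx + (trapezoid ? 0 : 1);
	*child_count = count + (trapezoid ? 1 : 0);
}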
+ */ + +enum bt_shift_shape { + BSS_TRAPEZOID, + BSS_PARALLELOGRAM +}; + +enum bt_shift_direction { + BSD_LEFT, + BSD_RIGHT +}; + +/* + * Shift elements and children in the provided core node by off spots. The + * first element moved is idx, and count elements are moved. The shape of the + * shift is determined by shape. The direction is determined by dir. + */ +static inline void +bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, + uint64_t count, uint64_t off, enum bt_shift_shape shape, + enum bt_shift_direction dir) +{ + size_t size = tree->bt_elem_size; + ASSERT(node->btc_hdr.bth_core); + + uint8_t *e_start = node->btc_elems + idx * size; + int sign = (dir == BSD_LEFT ? -1 : +1); + uint8_t *e_out = e_start + sign * off * size; + uint64_t e_count = count; + bmov(e_start, e_out, e_count * size); + + zfs_btree_hdr_t **c_start = node->btc_children + idx + + (shape == BSS_TRAPEZOID ? 0 : 1); + zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : + c_start + off); + uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); + bmov(c_start, c_out, c_count * sizeof (*c_start)); +} + +/* + * Shift elements and children in the provided core node left by one spot. + * The first element moved is idx, and count elements are moved. The + * shape of the shift is determined by trap; true if the shift is a trapezoid, + * false if it is a parallelogram. + */ +static inline void +bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, + uint64_t count, enum bt_shift_shape shape) +{ + bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); +} + +/* + * Shift elements and children in the provided core node right by one spot. + * Starts with elements[idx] and children[idx] and one more child than element. + */ +static inline void +bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, + uint64_t count, enum bt_shift_shape shape) +{ + bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); +} + +/* + * Shift elements and children in the provided leaf node by off spots. + * The first element moved is idx, and count elements are moved. The direction + * is determined by left. + */ +static inline void +bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx, + uint64_t count, uint64_t off, enum bt_shift_direction dir) +{ + size_t size = tree->bt_elem_size; + ASSERT(!node->btl_hdr.bth_core); + + uint8_t *start = node->btl_elems + idx * size; + int sign = (dir == BSD_LEFT ? -1 : +1); + uint8_t *out = start + sign * off * size; + bmov(start, out, count * size); +} + +static inline void +bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, + uint64_t count) +{ + bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT); +} + +static inline void +bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, + uint64_t count) +{ + bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT); +} + +/* + * Move children and elements from one core node to another. The shape + * parameter behaves the same as it does in the shift logic. + */ +static inline void +bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx, + uint64_t count, zfs_btree_core_t *dest, uint64_t didx, + enum bt_shift_shape shape) +{ + size_t size = tree->bt_elem_size; + ASSERT(source->btc_hdr.bth_core); + ASSERT(dest->btc_hdr.bth_core); + + bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size, + count * size); + + uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 
1 : 0); + bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), + dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1), + c_count * sizeof (*source->btc_children)); +} + +static inline void +bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx, + uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx) +{ + size_t size = tree->bt_elem_size; + ASSERT(!source->btl_hdr.bth_core); + ASSERT(!dest->btl_hdr.bth_core); + + bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size, + count * size); +} + +/* + * Find the first element in the subtree rooted at hdr, return its value and + * put its location in where if non-null. + */ +static void * +zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) +{ + zfs_btree_hdr_t *node; + + for (node = hdr; node->bth_core; node = + ((zfs_btree_core_t *)node)->btc_children[0]) + ; + + ASSERT(!node->bth_core); + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; + if (where != NULL) { + where->bti_node = node; + where->bti_offset = 0; + where->bti_before = B_FALSE; + } + return (&leaf->btl_elems[0]); +} + +/* Insert an element and a child into a core node at the given offset. */ +static void +zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, + uint64_t offset, zfs_btree_hdr_t *new_node, void *buf) +{ + uint64_t size = tree->bt_elem_size; + zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; + ASSERT3P(par_hdr, ==, new_node->bth_parent); + ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); + + if (zfs_btree_verify_intensity >= 5) { + zfs_btree_verify_poison_at(tree, par_hdr, + par_hdr->bth_count); + } + /* Shift existing elements and children */ + uint64_t count = par_hdr->bth_count - offset; + bt_shift_core_right(tree, parent, offset, count, + BSS_PARALLELOGRAM); + + /* Insert new values */ + parent->btc_children[offset + 1] = new_node; + bmov(buf, parent->btc_elems + offset * size, size); + par_hdr->bth_count++; +} + +/* + * Insert new_node into the parent of old_node directly after old_node, with + * buf as the dividing element between the two. + */ +static void +zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, + zfs_btree_hdr_t *new_node, void *buf) +{ + ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); + uint64_t size = tree->bt_elem_size; + zfs_btree_core_t *parent = old_node->bth_parent; + zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; + + /* + * If this is the root node we were splitting, we create a new root + * and increase the height of the tree. + */ + if (parent == NULL) { + ASSERT3P(old_node, ==, tree->bt_root); + tree->bt_num_nodes++; + zfs_btree_core_t *new_root = + kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * + size, KM_SLEEP); + zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; + new_root_hdr->bth_parent = NULL; + new_root_hdr->bth_core = B_TRUE; + new_root_hdr->bth_count = 1; + + old_node->bth_parent = new_node->bth_parent = new_root; + new_root->btc_children[0] = old_node; + new_root->btc_children[1] = new_node; + bmov(buf, new_root->btc_elems, size); + + tree->bt_height++; + tree->bt_root = new_root_hdr; + zfs_btree_poison_node(tree, new_root_hdr); + return; + } + + /* + * Since we have the new separator, binary search for where to put + * new_node. 
+ */ + zfs_btree_index_t idx; + ASSERT(par_hdr->bth_core); + VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + par_hdr->bth_count, buf, &idx), ==, NULL); + ASSERT(idx.bti_before); + uint64_t offset = idx.bti_offset; + ASSERT3U(offset, <=, par_hdr->bth_count); + ASSERT3P(parent->btc_children[offset], ==, old_node); + + /* + * If the parent isn't full, shift things to accomodate our insertions + * and return. + */ + if (par_hdr->bth_count != BTREE_CORE_ELEMS) { + zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf); + return; + } + + /* + * We need to split this core node into two. Currently there are + * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for + * BTREE_CORE_ELEMS + 2. Some of the children will be part of the + * current node, and the others will be moved to the new core node. + * There are BTREE_CORE_ELEMS + 1 elements including the new one. One + * will be used as the new separator in our parent, and the others + * will be split among the two core nodes. + * + * Usually we will split the node in half evenly, with + * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we + * instead move only about a quarter of the elements (and children) to + * the new node. Since the average state after a long time is a 3/4 + * full node, shortcutting directly to that state improves efficiency. + * + * We do this in two stages: first we split into two nodes, and then we + * reuse our existing logic to insert the new element and child. + */ + uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? + 2 : 4)) - 1, 2); + uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1; + ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); + tree->bt_num_nodes++; + zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + + BTREE_CORE_ELEMS * size, KM_SLEEP); + zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; + new_par_hdr->bth_parent = par_hdr->bth_parent; + new_par_hdr->bth_core = B_TRUE; + new_par_hdr->bth_count = move_count; + zfs_btree_poison_node(tree, new_par_hdr); + + par_hdr->bth_count = keep_count; + + bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent, + 0, BSS_TRAPEZOID); + + /* Store the new separator in a buffer. */ + uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); + bmov(parent->btc_elems + keep_count * size, tmp_buf, + size); + zfs_btree_poison_node(tree, par_hdr); + + if (offset < keep_count) { + /* Insert the new node into the left half */ + zfs_btree_insert_core_impl(tree, parent, offset, new_node, + buf); + + /* + * Move the new separator to the existing buffer. + */ + bmov(tmp_buf, buf, size); + } else if (offset > keep_count) { + /* Insert the new node into the right half */ + new_node->bth_parent = new_parent; + zfs_btree_insert_core_impl(tree, new_parent, + offset - keep_count - 1, new_node, buf); + + /* + * Move the new separator to the existing buffer. + */ + bmov(tmp_buf, buf, size); + } else { + /* + * Move the new separator into the right half, and replace it + * with buf. We also need to shift back the elements in the + * right half to accomodate new_node. 
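/*
 * Illustrative arithmetic (not part of this patch) for the core-node split
 * described above, using the current BTREE_CORE_ELEMS value of 128:
 *
 *	normal insert:	move_count = 128/2 - 1 = 63, keep_count = 64
 *	bulk insert:	move_count = 128/4 - 1 = 31, keep_count = 96
 *
 * In both cases keep_count + move_count + 1 == 128; the leftover element
 * becomes the separator pushed up into the parent, and the new element is
 * then inserted into whichever half it belongs to.
 */
static void
example_core_split_counts(boolean_t bulk, uint64_t *keep, uint64_t *move)
{
	*move = MAX((BTREE_CORE_ELEMS / (bulk ? 4 : 2)) - 1, 2);
	*keep = BTREE_CORE_ELEMS - *move - 1;
}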
+ */ + bt_shift_core_right(tree, new_parent, 0, move_count, + BSS_TRAPEZOID); + new_parent->btc_children[0] = new_node; + bmov(tmp_buf, new_parent->btc_elems, size); + new_par_hdr->bth_count++; + } + kmem_free(tmp_buf, size); + zfs_btree_poison_node(tree, par_hdr); + + for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++) + new_parent->btc_children[i]->bth_parent = new_parent; + + for (int i = 0; i <= parent->btc_hdr.bth_count; i++) + ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent); + + /* + * Now that the node is split, we need to insert the new node into its + * parent. This may cause further splitting. + */ + zfs_btree_insert_into_parent(tree, &parent->btc_hdr, + &new_parent->btc_hdr, buf); +} + +/* Insert an element into a leaf node at the given offset. */ +static void +zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, + uint64_t idx, const void *value) +{ + uint64_t size = tree->bt_elem_size; + uint8_t *start = leaf->btl_elems + (idx * size); + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + ASSERTV(uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2)); + uint64_t count = leaf->btl_hdr.bth_count - idx; + ASSERT3U(leaf->btl_hdr.bth_count, <, capacity); + + if (zfs_btree_verify_intensity >= 5) { + zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, + leaf->btl_hdr.bth_count); + } + + bt_shift_leaf_right(tree, leaf, idx, count); + bmov(value, start, size); + hdr->bth_count++; +} + +/* Helper function for inserting a new value into leaf at the given index. */ +static void +zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, + const void *value, uint64_t idx) +{ + uint64_t size = tree->bt_elem_size; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + + /* + * If the leaf isn't full, shift the elements after idx and insert + * value. + */ + if (leaf->btl_hdr.bth_count != capacity) { + zfs_btree_insert_leaf_impl(tree, leaf, idx, value); + return; + } + + /* + * Otherwise, we split the leaf node into two nodes. If we're not bulk + * inserting, each is of size (capacity / 2). If we are bulk + * inserting, we move a quarter of the elements to the new node so + * inserts into the old node don't cause immediate splitting but the + * tree stays relatively dense. Since the average state after a long + * time is a 3/4 full node, shortcutting directly to that state + * improves efficiency. At the end of the bulk insertion process + * we'll need to go through and fix up any nodes (the last leaf and + * its ancestors, potentially) that are below the minimum. + * + * In either case, we're left with one extra element. The leftover + * element will become the new dividing element between the two nodes. + */ + uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) - + 1, 2); + uint64_t keep_count = capacity - move_count - 1; + ASSERT3U(capacity - move_count, >=, 2); + tree->bt_num_nodes++; + zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, + KM_SLEEP); + zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; + new_hdr->bth_parent = leaf->btl_hdr.bth_parent; + new_hdr->bth_core = B_FALSE; + new_hdr->bth_count = move_count; + zfs_btree_poison_node(tree, new_hdr); + + leaf->btl_hdr.bth_count = keep_count; + + if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) + tree->bt_bulk = new_leaf; + + /* Copy the back part to the new leaf. 
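/*
 * Illustrative arithmetic (not part of this patch): the leaf capacity used
 * above, assuming sizeof (zfs_btree_hdr_t) == 16 on an LP64 system:
 *
 *	range_seg32_t (8 bytes):    P2ALIGN((4096 - 16) / 8, 2)  == 510
 *	range_seg64_t (16 bytes):   P2ALIGN((4096 - 16) / 16, 2) == 254
 *	range_seg_gap_t (24 bytes): P2ALIGN((4096 - 16) / 24, 2) == 170
 *
 * Rounding the count down to an even number keeps capacity / 2 exact for
 * the split and rebalance logic.
 */
static uint64_t
example_leaf_capacity(size_t elem_size)
{
	return (P2ALIGN((BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) /
	    elem_size, 2));
}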
*/ + bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, + 0); + + /* We store the new separator in a buffer we control for simplicity. */ + uint8_t *buf = kmem_alloc(size, KM_SLEEP); + bmov(leaf->btl_elems + (keep_count * size), buf, size); + zfs_btree_poison_node(tree, &leaf->btl_hdr); + + if (idx < keep_count) { + /* Insert into the existing leaf. */ + zfs_btree_insert_leaf_impl(tree, leaf, idx, value); + } else if (idx > keep_count) { + /* Insert into the new leaf. */ + zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count - + 1, value); + } else { + /* + * Shift the elements in the new leaf to make room for the + * separator, and use the new value as the new separator. + */ + bt_shift_leaf_right(tree, new_leaf, 0, move_count); + bmov(buf, new_leaf->btl_elems, size); + bmov(value, buf, size); + new_hdr->bth_count++; + } + + /* + * Now that the node is split, we need to insert the new node into its + * parent. This may cause further splitting, bur only of core nodes. + */ + zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, + buf); + kmem_free(buf, size); +} + +static uint64_t +zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + void *buf; + if (hdr->bth_core) { + buf = ((zfs_btree_core_t *)hdr)->btc_elems; + } else { + buf = ((zfs_btree_leaf_t *)hdr)->btl_elems; + } + zfs_btree_index_t idx; + zfs_btree_core_t *parent = hdr->bth_parent; + VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + parent->btc_hdr.bth_count, buf, &idx), ==, NULL); + ASSERT(idx.bti_before); + ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); + ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr); + return (idx.bti_offset); +} + +/* + * Take the b-tree out of bulk insert mode. During bulk-insert mode, some + * nodes may violate the invariant that non-root nodes must be at least half + * full. All nodes violating this invariant should be the last node in their + * particular level. To correct the invariant, we take values from their left + * neighbor until they are half full. They must have a left neighbor at their + * level because the last node at a level is not the first node unless it's + * the root. + */ +static void +zfs_btree_bulk_finish(zfs_btree_t *tree) +{ + ASSERT3P(tree->bt_bulk, !=, NULL); + ASSERT3P(tree->bt_root, !=, NULL); + zfs_btree_leaf_t *leaf = tree->bt_bulk; + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + zfs_btree_core_t *parent = hdr->bth_parent; + uint64_t size = tree->bt_elem_size; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + + /* + * The invariant doesn't apply to the root node, if that's the only + * node in the tree we're done. + */ + if (parent == NULL) { + tree->bt_bulk = NULL; + return; + } + + /* First, take elements to rebalance the leaf node. */ + if (hdr->bth_count < capacity / 2) { + /* + * First, find the left neighbor. The simplest way to do this + * is to call zfs_btree_prev twice; the first time finds some + * ancestor of this node, and the second time finds the left + * neighbor. The ancestor found is the lowest common ancestor + * of leaf and the neighbor. 
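+		 *
+		 * For example (a sketch of the traversal, not extra logic):
+		 * starting from an index at offset 0 of this leaf, the first
+		 * zfs_btree_prev() call lands on a separator in that common
+		 * ancestor (a core node), and the second call lands on the
+		 * last element of the left neighbor leaf, which is exactly
+		 * the node we want to borrow elements from.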
+ */ + zfs_btree_index_t idx = { + .bti_node = hdr, + .bti_offset = 0 + }; + VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); + ASSERT(idx.bti_node->bth_core); + zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; + uint64_t common_idx = idx.bti_offset; + + VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); + ASSERT(!idx.bti_node->bth_core); + zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; + zfs_btree_hdr_t *l_hdr = idx.bti_node; + uint64_t move_count = (capacity / 2) - hdr->bth_count; + ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, + capacity / 2); + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < move_count; i++) { + zfs_btree_verify_poison_at(tree, hdr, + leaf->btl_hdr.bth_count + i); + } + } + + /* First, shift elements in leaf back. */ + bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count, + BSD_RIGHT); + + /* Next, move the separator from the common ancestor to leaf. */ + uint8_t *separator = common->btc_elems + (common_idx * size); + uint8_t *out = leaf->btl_elems + ((move_count - 1) * size); + bmov(separator, out, size); + move_count--; + + /* + * Now we move elements from the tail of the left neighbor to + * fill the remaining spots in leaf. + */ + bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - + move_count, move_count, leaf, 0); + + /* + * Finally, move the new last element in the left neighbor to + * the separator. + */ + bmov(l_neighbor->btl_elems + (l_hdr->bth_count - + move_count - 1) * size, separator, size); + + /* Adjust the node's counts, and we're done. */ + l_hdr->bth_count -= move_count + 1; + hdr->bth_count += move_count + 1; + + ASSERT3U(l_hdr->bth_count, >=, capacity / 2); + ASSERT3U(hdr->bth_count, >=, capacity / 2); + zfs_btree_poison_node(tree, l_hdr); + } + + /* + * Now we have to rebalance any ancestors of leaf that may also + * violate the invariant. + */ + capacity = BTREE_CORE_ELEMS; + while (parent->btc_hdr.bth_parent != NULL) { + zfs_btree_core_t *cur = parent; + zfs_btree_hdr_t *hdr = &cur->btc_hdr; + parent = hdr->bth_parent; + /* + * If the invariant isn't violated, move on to the next + * ancestor. + */ + if (hdr->bth_count >= capacity / 2) + continue; + + /* + * Because the smallest number of nodes we can move when + * splitting is 2, we never need to worry about not having a + * left sibling (a sibling is a neighbor with the same parent). + */ + uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + ASSERT3U(parent_idx, >, 0); + zfs_btree_core_t *l_neighbor = + (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; + uint64_t move_count = (capacity / 2) - hdr->bth_count; + ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, + capacity / 2); + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < move_count; i++) { + zfs_btree_verify_poison_at(tree, hdr, + hdr->bth_count + i); + } + } + /* First, shift things in the right node back. */ + bt_shift_core(tree, cur, 0, hdr->bth_count, move_count, + BSS_TRAPEZOID, BSD_RIGHT); + + /* Next, move the separator to the right node. */ + uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * + size); + uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); + bmov(separator, e_out, size); + + /* + * Now, move elements and children from the left node to the + * right. We move one more child than elements. 
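+		 *
+		 * For example, if this node were three elements short, the
+		 * code above shifted its contents right by three and placed
+		 * the separator in slot two; here we copy two elements (and
+		 * three children) from the tail of the left neighbor into
+		 * slots zero and one, and then promote the neighbor's new
+		 * last element to be the separator. (This is only a worked
+		 * example of the arithmetic, not additional behavior.)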
+ */ + move_count--; + uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; + bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, + BSS_TRAPEZOID); + + /* + * Finally, move the last element in the left node to the + * separator's position. + */ + move_idx--; + bmov(l_neighbor->btc_elems + move_idx * size, separator, size); + + l_neighbor->btc_hdr.bth_count -= move_count + 1; + hdr->bth_count += move_count + 1; + + ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); + ASSERT3U(hdr->bth_count, >=, capacity / 2); + + zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); + + for (int i = 0; i <= hdr->bth_count; i++) + cur->btc_children[i]->bth_parent = cur; + } + + tree->bt_bulk = NULL; +} + +/* + * Insert value into tree at the location specified by where. + */ +void +zfs_btree_insert(zfs_btree_t *tree, const void *value, + const zfs_btree_index_t *where) +{ + zfs_btree_index_t idx = {0}; + + /* If we're not inserting in the last leaf, end bulk insert mode. */ + if (tree->bt_bulk != NULL) { + if (where->bti_node != &tree->bt_bulk->btl_hdr) { + zfs_btree_bulk_finish(tree); + VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL); + where = &idx; + } + } + + tree->bt_num_elems++; + /* + * If this is the first element in the tree, create a leaf root node + * and add the value to it. + */ + if (where->bti_node == NULL) { + ASSERT3U(tree->bt_num_elems, ==, 1); + ASSERT3S(tree->bt_height, ==, -1); + ASSERT3P(tree->bt_root, ==, NULL); + ASSERT0(where->bti_offset); + + tree->bt_num_nodes++; + zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache, + KM_SLEEP); + tree->bt_root = &leaf->btl_hdr; + tree->bt_height++; + + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + hdr->bth_parent = NULL; + hdr->bth_core = B_FALSE; + hdr->bth_count = 0; + zfs_btree_poison_node(tree, hdr); + + zfs_btree_insert_into_leaf(tree, leaf, value, 0); + tree->bt_bulk = leaf; + } else if (!where->bti_node->bth_core) { + /* + * If we're inserting into a leaf, go directly to the helper + * function. + */ + zfs_btree_insert_into_leaf(tree, + (zfs_btree_leaf_t *)where->bti_node, value, + where->bti_offset); + } else { + /* + * If we're inserting into a core node, we can't just shift + * the existing element in that slot in the same node without + * breaking our ordering invariants. Instead we place the new + * value in the node at that spot and then insert the old + * separator into the first slot in the subtree to the right. + */ + ASSERT(where->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; + + /* + * We can ignore bti_before, because either way the value + * should end up in bti_offset. + */ + uint64_t off = where->bti_offset; + zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; + size_t size = tree->bt_elem_size; + uint8_t *buf = kmem_alloc(size, KM_SLEEP); + bmov(node->btc_elems + off * size, buf, size); + bmov(value, node->btc_elems + off * size, size); + + /* + * Find the first slot in the subtree to the right, insert + * there. + */ + zfs_btree_index_t new_idx; + VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL); + ASSERT0(new_idx.bti_offset); + ASSERT(!new_idx.bti_node->bth_core); + zfs_btree_insert_into_leaf(tree, + (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); + kmem_free(buf, size); + } + zfs_btree_verify(tree); +} + +/* + * Return the first element in the tree, and put its location in where if + * non-null. 
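+ *
+ * For example (a usage sketch; my_data_t and process() stand in for the
+ * caller's element type and handler), the whole tree can be walked in
+ * sorted order with zfs_btree_first() and zfs_btree_next():
+ *
+ *	zfs_btree_index_t where;
+ *	my_data_t *node;
+ *
+ *	for (node = zfs_btree_first(tree, &where); node != NULL;
+ *	    node = zfs_btree_next(tree, &where, &where))
+ *		process(node);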
+ */
+void *
+zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+	if (tree->bt_height == -1) {
+		ASSERT0(tree->bt_num_elems);
+		return (NULL);
+	}
+	return (zfs_btree_first_helper(tree->bt_root, where));
+}
+
+/*
+ * Find the last element in the subtree rooted at hdr, return its value and
+ * put its location in where if non-null.
+ */
+static void *
+zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
+    zfs_btree_index_t *where)
+{
+	zfs_btree_hdr_t *node;
+
+	for (node = hdr; node->bth_core; node =
+	    ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
+		;
+
+	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
+	if (where != NULL) {
+		where->bti_node = node;
+		where->bti_offset = node->bth_count - 1;
+		where->bti_before = B_FALSE;
+	}
+	return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size);
+}
+
+/*
+ * Return the last element in the tree, and put its location in where if
+ * non-null.
+ */
+void *
+zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+	if (tree->bt_height == -1) {
+		ASSERT0(tree->bt_num_elems);
+		return (NULL);
+	}
+	return (zfs_btree_last_helper(tree, tree->bt_root, where));
+}
+
+/*
+ * This function contains the logic to find the next node in the tree. A
+ * helper function is used because there are multiple internal consumers of
+ * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up
+ * each node after we've finished with it.
+ */
+static void *
+zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+    zfs_btree_index_t *out_idx,
+    void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
+{
+	if (idx->bti_node == NULL) {
+		ASSERT3S(tree->bt_height, ==, -1);
+		return (NULL);
+	}
+
+	uint64_t offset = idx->bti_offset;
+	if (!idx->bti_node->bth_core) {
+		/*
+		 * When finding the next element of an element in a leaf,
+		 * there are two cases. If the element isn't the last one in
+		 * the leaf, we just return the next element in the leaf.
+		 * Otherwise, we need to traverse up our parents until we find
+		 * one where our ancestor isn't the last child of its parent.
+		 * Once we do, the next element is the separator after our
+		 * ancestor in its parent.
+		 */
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+		uint64_t new_off = offset + (idx->bti_before ? 0 : 1);
+		if (leaf->btl_hdr.bth_count > new_off) {
+			out_idx->bti_node = &leaf->btl_hdr;
+			out_idx->bti_offset = new_off;
+			out_idx->bti_before = B_FALSE;
+			return (leaf->btl_elems + new_off * tree->bt_elem_size);
+		}
+
+		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
+		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
+		    node != NULL; node = node->btc_hdr.bth_parent) {
+			zfs_btree_hdr_t *hdr = &node->btc_hdr;
+			ASSERT(hdr->bth_core);
+			uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+			if (done_func != NULL)
+				done_func(tree, prev);
+			if (i == hdr->bth_count) {
+				prev = hdr;
+				continue;
+			}
+			out_idx->bti_node = hdr;
+			out_idx->bti_offset = i;
+			out_idx->bti_before = B_FALSE;
+			return (node->btc_elems + i * tree->bt_elem_size);
+		}
+		if (done_func != NULL)
+			done_func(tree, prev);
+		/*
+		 * We've traversed all the way up and been at the end of the
+		 * node every time, so this was the last element in the tree.
+		 */
+		return (NULL);
+	}
+
+	/* If we were before an element in a core node, return that element.
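+	 * (This can happen, for example, with an index returned by
+	 * zfs_btree_find() for a value that is not in the tree: such an index
+	 * can be "before" a separator in a core node, and the successor of
+	 * that position is simply the separator itself.)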
*/ + ASSERT(idx->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; + if (idx->bti_before) { + out_idx->bti_before = B_FALSE; + return (node->btc_elems + offset * tree->bt_elem_size); + } + + /* + * The next element from one in a core node is the first element in + * the subtree just to the right of the separator. + */ + zfs_btree_hdr_t *child = node->btc_children[offset + 1]; + return (zfs_btree_first_helper(child, out_idx)); +} + +/* + * Return the next valued node in the tree. The same address can be safely + * passed for idx and out_idx. + */ +void * +zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx, + zfs_btree_index_t *out_idx) +{ + return (zfs_btree_next_helper(tree, idx, out_idx, NULL)); +} + +/* + * Return the previous valued node in the tree. The same value can be safely + * passed for idx and out_idx. + */ +void * +zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, + zfs_btree_index_t *out_idx) +{ + if (idx->bti_node == NULL) { + ASSERT3S(tree->bt_height, ==, -1); + return (NULL); + } + + uint64_t offset = idx->bti_offset; + if (!idx->bti_node->bth_core) { + /* + * When finding the previous element of an element in a leaf, + * there are two cases. If the element isn't the first one in + * the leaf, in which case we just return the previous element + * in the leaf. Otherwise, we need to traverse up our parents + * until we find one where our previous ancestor isn't the + * first child. Once we do, the previous element is the + * separator after our previous ancestor. + */ + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; + if (offset != 0) { + out_idx->bti_node = &leaf->btl_hdr; + out_idx->bti_offset = offset - 1; + out_idx->bti_before = B_FALSE; + return (leaf->btl_elems + (offset - 1) * + tree->bt_elem_size); + } + zfs_btree_hdr_t *prev = &leaf->btl_hdr; + for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; + node != NULL; node = node->btc_hdr.bth_parent) { + zfs_btree_hdr_t *hdr = &node->btc_hdr; + ASSERT(hdr->bth_core); + uint64_t i = zfs_btree_find_parent_idx(tree, prev); + if (i == 0) { + prev = hdr; + continue; + } + out_idx->bti_node = hdr; + out_idx->bti_offset = i - 1; + out_idx->bti_before = B_FALSE; + return (node->btc_elems + (i - 1) * tree->bt_elem_size); + } + /* + * We've traversed all the way up and been at the start of the + * node every time, so this was the first node in the tree. + */ + return (NULL); + } + + /* + * The previous element from one in a core node is the last element in + * the subtree just to the left of the separator. + */ + ASSERT(idx->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; + zfs_btree_hdr_t *child = node->btc_children[offset]; + return (zfs_btree_last_helper(tree, child, out_idx)); +} + +/* + * Get the value at the provided index in the tree. + * + * Note that the value returned from this function can be mutated, but only + * if it will not change the ordering of the element with respect to any other + * elements that could be in the tree. + */ +void * +zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx) +{ + ASSERT(!idx->bti_before); + if (!idx->bti_node->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; + return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size); + } + ASSERT(idx->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; + return (node->btc_elems + idx->bti_offset * tree->bt_elem_size); +} + +/* Add the given value to the tree. 
Must not already be in the tree. */ +void +zfs_btree_add(zfs_btree_t *tree, const void *node) +{ + zfs_btree_index_t where = {0}; + VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL); + zfs_btree_insert(tree, node, &where); +} + +/* Helper function to free a tree node. */ +static void +zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) +{ + tree->bt_num_nodes--; + if (!node->bth_core) { + kmem_cache_free(zfs_btree_leaf_cache, node); + } else { + kmem_free(node, sizeof (zfs_btree_core_t) + + BTREE_CORE_ELEMS * tree->bt_elem_size); + } +} + +/* + * Remove the rm_hdr and the separator to its left from the parent node. The + * buffer that rm_hdr was stored in may already be freed, so its contents + * cannot be accessed. + */ +static void +zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, + zfs_btree_hdr_t *rm_hdr) +{ + size_t size = tree->bt_elem_size; + uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1; + zfs_btree_hdr_t *hdr = &node->btc_hdr; + /* + * If the node is the root node and rm_hdr is one of two children, + * promote the other child to the root. + */ + if (hdr->bth_parent == NULL && hdr->bth_count <= 1) { + ASSERT3U(hdr->bth_count, ==, 1); + ASSERT3P(tree->bt_root, ==, node); + ASSERT3P(node->btc_children[1], ==, rm_hdr); + tree->bt_root = node->btc_children[0]; + node->btc_children[0]->bth_parent = NULL; + zfs_btree_node_destroy(tree, hdr); + tree->bt_height--; + return; + } + + uint64_t idx; + for (idx = 0; idx <= hdr->bth_count; idx++) { + if (node->btc_children[idx] == rm_hdr) + break; + } + ASSERT3U(idx, <=, hdr->bth_count); + + /* + * If the node is the root or it has more than the minimum number of + * children, just remove the child and separator, and return. + */ + if (hdr->bth_parent == NULL || + hdr->bth_count > min_count) { + /* + * Shift the element and children to the right of rm_hdr to + * the left by one spot. + */ + bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, + BSS_PARALLELOGRAM); + hdr->bth_count--; + zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); + return; + } + + ASSERT3U(hdr->bth_count, ==, min_count); + + /* + * Now we try to take a node from a neighbor. We check left, then + * right. If the neighbor exists and has more than the minimum number + * of elements, we move the separator betweeen us and them to our + * node, move their closest element (last for left, first for right) + * to the separator, and move their closest child to our node. Along + * the way we need to collapse the gap made by idx, and (for our right + * neighbor) the gap made by removing their first element and child. + * + * Note: this logic currently doesn't support taking from a neighbor + * that isn't a sibling (i.e. a neighbor with a different + * parent). This isn't critical functionality, but may be worth + * implementing in the future for completeness' sake. + */ + zfs_btree_core_t *parent = hdr->bth_parent; + uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + + zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : + parent->btc_children[parent_idx - 1]); + if (l_hdr != NULL && l_hdr->bth_count > min_count) { + /* We can take a node from the left neighbor. */ + ASSERT(l_hdr->bth_core); + zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr; + + /* + * Start by shifting the elements and children in the current + * node to the right by one spot. + */ + bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID); + + /* + * Move the separator between node and neighbor to the first + * element slot in the current node. 
+ */ + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + bmov(separator, node->btc_elems, size); + + /* Move the last child of neighbor to our first child slot. */ + zfs_btree_hdr_t **take_child = neighbor->btc_children + + l_hdr->bth_count; + bmov(take_child, node->btc_children, sizeof (*take_child)); + node->btc_children[0]->bth_parent = node; + + /* Move the last element of neighbor to the separator spot. */ + uint8_t *take_elem = neighbor->btc_elems + + (l_hdr->bth_count - 1) * size; + bmov(take_elem, separator, size); + l_hdr->bth_count--; + zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); + return; + } + + zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? + NULL : parent->btc_children[parent_idx + 1]); + if (r_hdr != NULL && r_hdr->bth_count > min_count) { + /* We can take a node from the right neighbor. */ + ASSERT(r_hdr->bth_core); + zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr; + + /* + * Shift elements in node left by one spot to overwrite rm_hdr + * and the separator before it. + */ + bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, + BSS_PARALLELOGRAM); + + /* + * Move the separator between node and neighbor to the last + * element spot in node. + */ + uint8_t *separator = parent->btc_elems + parent_idx * size; + bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size, + size); + + /* + * Move the first child of neighbor to the last child spot in + * node. + */ + zfs_btree_hdr_t **take_child = neighbor->btc_children; + bmov(take_child, node->btc_children + hdr->bth_count, + sizeof (*take_child)); + node->btc_children[hdr->bth_count]->bth_parent = node; + + /* Move the first element of neighbor to the separator spot. */ + uint8_t *take_elem = neighbor->btc_elems; + bmov(take_elem, separator, size); + r_hdr->bth_count--; + + /* + * Shift the elements and children of neighbor to cover the + * stolen elements. + */ + bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, + BSS_TRAPEZOID); + zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); + return; + } + + /* + * In this case, neither of our neighbors can spare an element, so we + * need to merge with one of them. We prefer the left one, + * arabitrarily. Move the separator into the leftmost merging node + * (which may be us or the left neighbor), and then move the right + * merging node's elements. Once that's done, we go back and delete + * the element we're removing. Finally, go into the parent and delete + * the right merging node and the separator. This may cause further + * merging. + */ + zfs_btree_hdr_t *new_rm_hdr, *keep_hdr; + uint64_t new_idx = idx; + if (l_hdr != NULL) { + keep_hdr = l_hdr; + new_rm_hdr = hdr; + new_idx += keep_hdr->bth_count + 1; + } else { + ASSERT3P(r_hdr, !=, NULL); + keep_hdr = hdr; + new_rm_hdr = r_hdr; + parent_idx++; + } + + ASSERT(keep_hdr->bth_core); + ASSERT(new_rm_hdr->bth_core); + + zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr; + zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr; + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) { + zfs_btree_verify_poison_at(tree, keep_hdr, + keep_hdr->bth_count + i); + } + } + + /* Move the separator into the left node. */ + uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size; + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + bmov(separator, e_out, size); + keep_hdr->bth_count++; + + /* Move all our elements and children into the left node. 
*/ + bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep, + keep_hdr->bth_count, BSS_TRAPEZOID); + + uint64_t old_count = keep_hdr->bth_count; + + /* Update bookkeeping */ + keep_hdr->bth_count += new_rm_hdr->bth_count; + ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1); + + /* + * Shift the element and children to the right of rm_hdr to + * the left by one spot. + */ + ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr); + bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx, + BSS_PARALLELOGRAM); + keep_hdr->bth_count--; + + /* Reparent all our children to point to the left node. */ + zfs_btree_hdr_t **new_start = keep->btc_children + + old_count - 1; + for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) + new_start[i]->bth_parent = keep; + for (int i = 0; i <= keep_hdr->bth_count; i++) { + ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); + ASSERT3P(keep->btc_children[i], !=, rm_hdr); + } + zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); + + new_rm_hdr->bth_count = 0; + zfs_btree_node_destroy(tree, new_rm_hdr); + zfs_btree_remove_from_node(tree, parent, new_rm_hdr); +} + +/* Remove the element at the specific location. */ +void +zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) +{ + size_t size = tree->bt_elem_size; + zfs_btree_hdr_t *hdr = where->bti_node; + uint64_t idx = where->bti_offset; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + + ASSERT(!where->bti_before); + if (tree->bt_bulk != NULL) { + /* + * Leave bulk insert mode. Note that our index would be + * invalid after we correct the tree, so we copy the value + * we're planning to remove and find it again after + * bulk_finish. + */ + uint8_t *value = zfs_btree_get(tree, where); + uint8_t *tmp = kmem_alloc(size, KM_SLEEP); + bmov(value, tmp, size); + zfs_btree_bulk_finish(tree); + VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); + kmem_free(tmp, size); + hdr = where->bti_node; + idx = where->bti_offset; + } + + tree->bt_num_elems--; + /* + * If the element happens to be in a core node, we move a leaf node's + * element into its place and then remove the leaf node element. This + * makes the rebalance logic not need to be recursive both upwards and + * downwards. + */ + if (hdr->bth_core) { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; + void *new_value = zfs_btree_last_helper(tree, left_subtree, + where); + ASSERT3P(new_value, !=, NULL); + + bmov(new_value, node->btc_elems + idx * size, size); + + hdr = where->bti_node; + idx = where->bti_offset; + ASSERT(!where->bti_before); + } + + /* + * First, we'll update the leaf's metadata. Then, we shift any + * elements after the idx to the left. After that, we rebalance if + * needed. + */ + ASSERT(!hdr->bth_core); + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + ASSERT3U(hdr->bth_count, >, 0); + + uint64_t min_count = (capacity / 2) - 1; + + /* + * If we're over the minimum size or this is the root, just overwrite + * the value and return. 
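+	 *
+	 * (Worked example with hypothetical numbers: if capacity were 10,
+	 * min_count would be 10 / 2 - 1 = 4, so any non-root leaf still
+	 * holding at least 5 elements can shed one here without any
+	 * rebalancing.)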
+ */
+	if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
+		hdr->bth_count--;
+		bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx);
+		if (hdr->bth_parent == NULL) {
+			ASSERT0(tree->bt_height);
+			if (hdr->bth_count == 0) {
+				tree->bt_root = NULL;
+				tree->bt_height--;
+				zfs_btree_node_destroy(tree, &leaf->btl_hdr);
+			}
+		}
+		if (tree->bt_root != NULL)
+			zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+		zfs_btree_verify(tree);
+		return;
+	}
+	ASSERT3U(hdr->bth_count, ==, min_count);
+
+	/*
+	 * Now we try to take a node from a sibling. We check left, then
+	 * right. If they exist and have more than the minimum number of
+	 * elements, we move the separator between us and them to our node
+	 * and move their closest element (last for left, first for right) to
+	 * the separator. Along the way we need to collapse the gap made by
+	 * idx, and (for our right neighbor) the gap made by removing their
+	 * first element.
+	 *
+	 * Note: this logic currently doesn't support taking from a neighbor
+	 * that isn't a sibling. This isn't critical functionality, but may be
+	 * worth implementing in the future for completeness' sake.
+	 */
+	zfs_btree_core_t *parent = hdr->bth_parent;
+	uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+
+	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
+	    parent->btc_children[parent_idx - 1]);
+	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
+		/* We can take a node from the left neighbor. */
+		ASSERT(!l_hdr->bth_core);
+
+		/*
+		 * Move our elements back by one spot to make room for the
+		 * stolen element and overwrite the element being removed.
+		 */
+		bt_shift_leaf_right(tree, leaf, 0, idx);
+		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+		    size;
+		uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems +
+		    (l_hdr->bth_count - 1) * size;
+		/* Move the separator to our first spot. */
+		bmov(separator, leaf->btl_elems, size);
+
+		/* Move our neighbor's last element to the separator. */
+		bmov(take_elem, separator, size);
+
+		/* Update the bookkeeping. */
+		l_hdr->bth_count--;
+		zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count);
+
+		zfs_btree_verify(tree);
+		return;
+	}
+
+	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
+	    NULL : parent->btc_children[parent_idx + 1]);
+	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
+		/* We can take a node from the right neighbor. */
+		ASSERT(!r_hdr->bth_core);
+		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;
+
+		/*
+		 * Move our elements after the element being removed forwards
+		 * by one spot to make room for the stolen element and
+		 * overwrite the element being removed.
+		 */
+		bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx -
+		    1);
+
+		uint8_t *separator = parent->btc_elems + parent_idx * size;
+		uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems;
+		/* Move the separator between us to our last spot. */
+		bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size,
+		    size);
+
+		/* Move our neighbor's first element to the separator. */
+		bmov(take_elem, separator, size);
+
+		/* Update the bookkeeping. */
+		r_hdr->bth_count--;
+
+		/*
+		 * Move our neighbor's elements forwards to overwrite the
+		 * stolen element.
+		 */
+		bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count);
+		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+		zfs_btree_verify(tree);
+		return;
+	}
+
+	/*
+	 * In this case, neither of our neighbors can spare an element, so we
+	 * need to merge with one of them. We prefer the left one,
+	 * arbitrarily.
+	 * Move the separator into the leftmost merging node
+	 * (which may be us or the left neighbor), and then move the right
+	 * merging node's elements. Once that's done, we go back and delete
+	 * the element we're removing. Finally, go into the parent and delete
+	 * the right merging node and the separator. This may cause further
+	 * merging.
+	 */
+	zfs_btree_hdr_t *rm_hdr, *keep_hdr;
+	uint64_t new_idx = idx;
+	if (l_hdr != NULL) {
+		keep_hdr = l_hdr;
+		rm_hdr = hdr;
+		new_idx += keep_hdr->bth_count + 1;
+	} else {
+		ASSERT3P(r_hdr, !=, NULL);
+		keep_hdr = hdr;
+		rm_hdr = r_hdr;
+		parent_idx++;
+	}
+
+	ASSERT(!keep_hdr->bth_core);
+	ASSERT(!rm_hdr->bth_core);
+	ASSERT3U(keep_hdr->bth_count, ==, min_count);
+	ASSERT3U(rm_hdr->bth_count, ==, min_count);
+
+	zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr;
+	zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
+
+	if (zfs_btree_verify_intensity >= 5) {
+		for (int i = 0; i < rm_hdr->bth_count + 1; i++) {
+			zfs_btree_verify_poison_at(tree, keep_hdr,
+			    keep_hdr->bth_count + i);
+		}
+	}
+	/*
+	 * Move the separator into the first open spot in the left
+	 * neighbor.
+	 */
+	uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size;
+	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+	    size;
+	bmov(separator, out, size);
+	keep_hdr->bth_count++;
+
+	/* Move our elements to the left neighbor. */
+	bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep,
+	    keep_hdr->bth_count);
+
+	/* Update the bookkeeping. */
+	keep_hdr->bth_count += rm_hdr->bth_count;
+	ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1);
+
+	/* Remove the value from the node */
+	keep_hdr->bth_count--;
+	bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count -
+	    new_idx);
+	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+
+	rm_hdr->bth_count = 0;
+	zfs_btree_node_destroy(tree, rm_hdr);
+	/* Remove the emptied node from the parent. */
+	zfs_btree_remove_from_node(tree, parent, rm_hdr);
+	zfs_btree_verify(tree);
+}
+
+/* Remove the given value from the tree. */
+void
+zfs_btree_remove(zfs_btree_t *tree, const void *value)
+{
+	zfs_btree_index_t where = {0};
+	VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
+	zfs_btree_remove_from(tree, &where);
+}
+
+/* Return the number of elements in the tree. */
+ulong_t
+zfs_btree_numnodes(zfs_btree_t *tree)
+{
+	return (tree->bt_num_elems);
+}
+
+/*
+ * This function is used to visit all the elements in the tree before
+ * destroying the tree. This allows the calling code to perform any cleanup it
+ * needs to do. This is more efficient than just removing the first element
+ * over and over, because it avoids all of the rebalancing that individual
+ * removals would do. Once the destroy_nodes() function has been called, no
+ * other btree operations are valid until it returns NULL, at which point the
+ * only valid operation is zfs_btree_destroy().
+ * + * example: + * + * zfs_btree_index_t *cookie = NULL; + * my_data_t *node; + * + * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) + * free(node->ptr); + * zfs_btree_destroy(tree); + * + */ +void * +zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) +{ + if (*cookie == NULL) { + if (tree->bt_height == -1) + return (NULL); + *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP); + return (zfs_btree_first(tree, *cookie)); + } + + void *rval = zfs_btree_next_helper(tree, *cookie, *cookie, + zfs_btree_node_destroy); + if (rval == NULL) { + tree->bt_root = NULL; + tree->bt_height = -1; + tree->bt_num_elems = 0; + kmem_free(*cookie, sizeof (**cookie)); + tree->bt_bulk = NULL; + } + return (rval); +} + +static void +zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + if (hdr->bth_core) { + zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; + for (int i = 0; i <= hdr->bth_count; i++) { + zfs_btree_clear_helper(tree, btc->btc_children[i]); + } + } + + zfs_btree_node_destroy(tree, hdr); +} + +void +zfs_btree_clear(zfs_btree_t *tree) +{ + if (tree->bt_root == NULL) { + ASSERT0(tree->bt_num_elems); + return; + } + + zfs_btree_clear_helper(tree, tree->bt_root); + tree->bt_num_elems = 0; + tree->bt_root = NULL; + tree->bt_num_nodes = 0; + tree->bt_height = -1; + tree->bt_bulk = NULL; +} + +void +zfs_btree_destroy(zfs_btree_t *tree) +{ + ASSERT0(tree->bt_num_elems); + ASSERT3P(tree->bt_root, ==, NULL); +} + +/* Verify that every child of this node has the correct parent pointer. */ +static void +zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + if (!hdr->bth_core) + return; + + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + for (int i = 0; i <= hdr->bth_count; i++) { + VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); + zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); + } +} + +/* Verify that every node has the correct parent pointer. */ +static void +zfs_btree_verify_pointers(zfs_btree_t *tree) +{ + if (tree->bt_height == -1) { + VERIFY3P(tree->bt_root, ==, NULL); + return; + } + VERIFY3P(tree->bt_root->bth_parent, ==, NULL); + zfs_btree_verify_pointers_helper(tree, tree->bt_root); +} + +/* + * Verify that all the current node and its children satisfy the count + * invariants, and return the total count in the subtree rooted in this node. + */ +static uint64_t +zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + if (!hdr->bth_core) { + if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) { + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2); + VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1); + } + + return (hdr->bth_count); + } else { + + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + uint64_t ret = hdr->bth_count; + if (tree->bt_root != hdr && tree->bt_bulk == NULL) + VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); + for (int i = 0; i <= hdr->bth_count; i++) { + ret += zfs_btree_verify_counts_helper(tree, + node->btc_children[i]); + } + + return (ret); + } +} + +/* + * Verify that all nodes satisfy the invariants and that the total number of + * elements is correct. + */ +static void +zfs_btree_verify_counts(zfs_btree_t *tree) +{ + EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); + if (tree->bt_height == -1) { + return; + } + VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==, + tree->bt_num_elems); +} + +/* + * Check that the subtree rooted at this node has a uniform height. 
Returns
+ * the number of nodes under this node, to help verify bt_num_nodes.
+ */
+static uint64_t
+zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+    int64_t height)
+{
+	if (!hdr->bth_core) {
+		VERIFY0(height);
+		return (1);
+	}
+
+	VERIFY(hdr->bth_core);
+	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+	uint64_t ret = 1;
+	for (int i = 0; i <= hdr->bth_count; i++) {
+		ret += zfs_btree_verify_height_helper(tree,
+		    node->btc_children[i], height - 1);
+	}
+	return (ret);
+}
+
+/*
+ * Check that the tree rooted at this node has a uniform height, and that the
+ * bt_height in the tree is correct.
+ */
+static void
+zfs_btree_verify_height(zfs_btree_t *tree)
+{
+	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
+	if (tree->bt_height == -1) {
+		return;
+	}
+
+	VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
+	    tree->bt_height), ==, tree->bt_num_nodes);
+}
+
+/*
+ * Check that the elements in this node are sorted, and that if this is a core
+ * node, the separators are properly between the subtrees they separate and
+ * that the children also satisfy this requirement.
+ */
+static void
+zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+	size_t size = tree->bt_elem_size;
+	if (!hdr->bth_core) {
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+		for (int i = 1; i < hdr->bth_count; i++) {
+			VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) *
+			    size, leaf->btl_elems + i * size), ==, -1);
+		}
+		return;
+	}
+
+	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+	for (int i = 1; i < hdr->bth_count; i++) {
+		VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
+		    node->btc_elems + i * size), ==, -1);
+	}
+	for (int i = 0; i < hdr->bth_count; i++) {
+		uint8_t *left_child_last = NULL;
+		zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
+		if (left_child_hdr->bth_core) {
+			zfs_btree_core_t *left_child =
+			    (zfs_btree_core_t *)left_child_hdr;
+			left_child_last = left_child->btc_elems +
+			    (left_child_hdr->bth_count - 1) * size;
+		} else {
+			zfs_btree_leaf_t *left_child =
+			    (zfs_btree_leaf_t *)left_child_hdr;
+			left_child_last = left_child->btl_elems +
+			    (left_child_hdr->bth_count - 1) * size;
+		}
+		if (tree->bt_compar(node->btc_elems + i * size,
+		    left_child_last) != 1) {
+			panic("btree: compar returned %d (expected 1) at "
+			    "%px %d: compar(%px, %px)", tree->bt_compar(
+			    node->btc_elems + i * size, left_child_last),
+			    (void *)node, i, (void *)(node->btc_elems + i *
+			    size), (void *)left_child_last);
+		}
+
+		uint8_t *right_child_first = NULL;
+		zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
+		if (right_child_hdr->bth_core) {
+			zfs_btree_core_t *right_child =
+			    (zfs_btree_core_t *)right_child_hdr;
+			right_child_first = right_child->btc_elems;
+		} else {
+			zfs_btree_leaf_t *right_child =
+			    (zfs_btree_leaf_t *)right_child_hdr;
+			right_child_first = right_child->btl_elems;
+		}
+		if (tree->bt_compar(node->btc_elems + i * size,
+		    right_child_first) != -1) {
+			panic("btree: compar returned %d (expected -1) at "
+			    "%px %d: compar(%px, %px)", tree->bt_compar(
+			    node->btc_elems + i * size, right_child_first),
+			    (void *)node, i, (void *)(node->btc_elems + i *
+			    size), (void *)right_child_first);
+		}
+	}
+	for (int i = 0; i <= hdr->bth_count; i++) {
+		zfs_btree_verify_order_helper(tree, node->btc_children[i]);
+	}
+}
+
+/* Check that all elements in the tree are in sorted order.
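+ *
+ * This relies on the tree's comparator returning exactly -1, 0, or 1, which
+ * is what zfs_btree_verify_order_helper() above checks for. A typical
+ * comparator written against that contract looks like the following sketch
+ * (my_node_t is a hypothetical caller-side type, not part of this change):
+ *
+ *	static int
+ *	my_node_compare(const void *x1, const void *x2)
+ *	{
+ *		const my_node_t *n1 = x1, *n2 = x2;
+ *
+ *		return (TREE_CMP(n1->key, n2->key));
+ *	}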
*/ +static void +zfs_btree_verify_order(zfs_btree_t *tree) +{ + EQUIV(tree->bt_height == -1, tree->bt_root == NULL); + if (tree->bt_height == -1) { + return; + } + + zfs_btree_verify_order_helper(tree, tree->bt_root); +} + +#ifdef ZFS_DEBUG +/* Check that all unused memory is poisoned correctly. */ +static void +zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + size_t size = tree->bt_elem_size; + if (!hdr->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + uint8_t val = 0x0f; + for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t); i++) { + VERIFY3U(leaf->btl_elems[i], ==, val); + } + } else { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + uint8_t val = 0x0f; + for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size; + i++) { + VERIFY3U(node->btc_elems[i], ==, val); + } + + for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + VERIFY3P(node->btc_children[i], ==, + (zfs_btree_hdr_t *)BTREE_POISON); + } + + for (int i = 0; i <= hdr->bth_count; i++) { + zfs_btree_verify_poison_helper(tree, + node->btc_children[i]); + } + } +} +#endif + +/* Check that unused memory in the tree is still poisoned. */ +static void +zfs_btree_verify_poison(zfs_btree_t *tree) +{ +#ifdef ZFS_DEBUG + if (tree->bt_height == -1) + return; + zfs_btree_verify_poison_helper(tree, tree->bt_root); +#endif +} + +void +zfs_btree_verify(zfs_btree_t *tree) +{ + if (zfs_btree_verify_intensity == 0) + return; + zfs_btree_verify_height(tree); + if (zfs_btree_verify_intensity == 1) + return; + zfs_btree_verify_pointers(tree); + if (zfs_btree_verify_intensity == 2) + return; + zfs_btree_verify_counts(tree); + if (zfs_btree_verify_intensity == 3) + return; + zfs_btree_verify_order(tree); + + if (zfs_btree_verify_intensity == 4) + return; + zfs_btree_verify_poison(tree); +} diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index a0f90496a..5e7b31c76 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -783,7 +783,7 @@ ddt_entry_compare(const void *x1, const void *x2) break; } - return (AVL_ISIGN(cmp)); + return (TREE_ISIGN(cmp)); } static ddt_t * diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index de57b170f..f7498854a 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1788,7 +1788,7 @@ userquota_compare(const void *l, const void *r) */ rv = strcmp(luqn->uqn_id, ruqn->uqn_id); - return (AVL_ISIGN(rv)); + return (TREE_ISIGN(rv)); } static void diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 6249e165f..48c3705c6 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1209,7 +1209,7 @@ guid_compare(const void *arg1, const void *arg2) const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; - return (AVL_CMP(gmep1->guid, gmep2->guid)); + return (TREE_CMP(gmep1->guid, gmep2->guid)); } static void diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 42116a1c3..ae4f91fc3 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1329,11 +1329,11 @@ send_range_after(const struct send_range *from, const struct send_range *to) return (-1); if (from_obj >= to_end_obj) return (1); - int64_t cmp = AVL_CMP(to->type == OBJECT_RANGE, from->type == + int64_t cmp = TREE_CMP(to->type == OBJECT_RANGE, from->type == OBJECT_RANGE); if (unlikely(cmp)) return (cmp); - cmp = AVL_CMP(to->type == OBJECT, from->type == OBJECT); + cmp = TREE_CMP(to->type == OBJECT, from->type == OBJECT); if (unlikely(cmp)) 
return (cmp); if (from->end_blkid <= to->start_blkid) @@ -1402,7 +1402,7 @@ send_range_start_compare(struct send_range *r1, struct send_range *r2) uint64_t r1_l0equiv = r1->start_blkid; uint64_t r2_objequiv = r2->object; uint64_t r2_l0equiv = r2->start_blkid; - int64_t cmp = AVL_CMP(r1->eos_marker, r2->eos_marker); + int64_t cmp = TREE_CMP(r1->eos_marker, r2->eos_marker); if (unlikely(cmp)) return (cmp); if (r1->object == 0) { @@ -1414,17 +1414,17 @@ send_range_start_compare(struct send_range *r1, struct send_range *r2) r2_l0equiv = 0; } - cmp = AVL_CMP(r1_objequiv, r2_objequiv); + cmp = TREE_CMP(r1_objequiv, r2_objequiv); if (likely(cmp)) return (cmp); - cmp = AVL_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE); + cmp = TREE_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE); if (unlikely(cmp)) return (cmp); - cmp = AVL_CMP(r2->type == OBJECT, r1->type == OBJECT); + cmp = TREE_CMP(r2->type == OBJECT, r1->type == OBJECT); if (unlikely(cmp)) return (cmp); - return (AVL_CMP(r1_l0equiv, r2_l0equiv)); + return (TREE_CMP(r1_l0equiv, r2_l0equiv)); } enum q_idx { diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 95132344c..9bca72d53 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -89,11 +89,11 @@ dbuf_compare(const void *x1, const void *x2) const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; - int cmp = AVL_CMP(d1->db_level, d2->db_level); + int cmp = TREE_CMP(d1->db_level, d2->db_level); if (likely(cmp)) return (cmp); - cmp = AVL_CMP(d1->db_blkid, d2->db_blkid); + cmp = TREE_CMP(d1->db_blkid, d2->db_blkid); if (likely(cmp)) return (cmp); @@ -105,7 +105,7 @@ dbuf_compare(const void *x1, const void *x2) return (1); } - return (AVL_PCMP(d1, d2)); + return (TREE_PCMP(d1, d2)); } /* ARGSUSED */ @@ -2218,7 +2218,8 @@ done: { int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { - dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL); + dn->dn_free_ranges[txgoff] = range_tree_create(NULL, + RANGE_SEG64, NULL, 0, 0); } range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index 2126f3d9b..42c612abc 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -595,16 +595,16 @@ dsl_bookmark_compare(const void *l, const void *r) const dsl_bookmark_node_t *ldbn = l; const dsl_bookmark_node_t *rdbn = r; - int64_t cmp = AVL_CMP(ldbn->dbn_phys.zbm_creation_txg, + int64_t cmp = TREE_CMP(ldbn->dbn_phys.zbm_creation_txg, rdbn->dbn_phys.zbm_creation_txg); if (likely(cmp)) return (cmp); - cmp = AVL_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN), + cmp = TREE_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN), (rdbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)); if (likely(cmp)) return (cmp); cmp = strcmp(ldbn->dbn_name, rdbn->dbn_name); - return (AVL_ISIGN(cmp)); + return (TREE_ISIGN(cmp)); } /* diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 2e4d18ade..8cb0f90fb 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -118,7 +118,7 @@ dsl_deadlist_compare(const void *arg1, const void *arg2) const dsl_deadlist_entry_t *dle1 = arg1; const dsl_deadlist_entry_t *dle2 = 
arg2; - return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); + return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); } static int @@ -127,7 +127,7 @@ dsl_deadlist_cache_compare(const void *arg1, const void *arg2) const dsl_deadlist_cache_entry_t *dlce1 = arg1; const dsl_deadlist_cache_entry_t *dlce2 = arg2; - return (AVL_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg)); + return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg)); } static void @@ -917,14 +917,14 @@ livelist_compare(const void *larg, const void *rarg) uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); if (l_dva0_vdev != r_dva0_vdev) - return (AVL_CMP(l_dva0_vdev, r_dva0_vdev)); + return (TREE_CMP(l_dva0_vdev, r_dva0_vdev)); /* if vdevs are equal, sort by offsets. */ uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); if (l_dva0_offset == r_dva0_offset) ASSERT3U(l->blk_birth, ==, r->blk_birth); - return (AVL_CMP(l_dva0_offset, r_dva0_offset)); + return (TREE_CMP(l_dva0_offset, r_dva0_offset)); } struct livelist_iter_arg { diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index cef460f02..cf8a3c9bb 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -399,7 +399,7 @@ perm_set_compare(const void *arg1, const void *arg2) val = strcmp(node1->p_setname, node2->p_setname); - return (AVL_ISIGN(val)); + return (TREE_ISIGN(val)); } /* diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f1621c9c9..3eacb42f1 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -279,7 +279,7 @@ struct dsl_scan_io_queue { /* trees used for sorting I/Os and extents of I/Os */ range_tree_t *q_exts_by_addr; - avl_tree_t q_exts_by_size; + zfs_btree_t q_exts_by_size; avl_tree_t q_sios_by_addr; uint64_t q_sio_memused; @@ -646,7 +646,8 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) mutex_enter(&vd->vdev_scan_io_queue_lock); ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); - ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL); + ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==, + NULL); ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); mutex_exit(&vd->vdev_scan_io_queue_lock); } @@ -1242,7 +1243,7 @@ dsl_scan_should_clear(dsl_scan_t *scn) queue = tvd->vdev_scan_io_queue; if (queue != NULL) { /* # extents in exts_by_size = # in exts_by_addr */ - mused += avl_numnodes(&queue->q_exts_by_size) * + mused += zfs_btree_numnodes(&queue->q_exts_by_size) * sizeof (range_seg_t) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); @@ -2847,7 +2848,7 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) srch_sio = sio_alloc(1); srch_sio->sio_nr_dvas = 1; - SIO_SET_OFFSET(srch_sio, rs->rs_start); + SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr)); /* * The exact start of the extent might not contain any matching zios, @@ -2859,10 +2860,12 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); - while (sio != NULL && - SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) { - ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start); - ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end); + while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, + queue->q_exts_by_addr) && num_sios <= 32) { + ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs, + queue->q_exts_by_addr)); + ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs, + queue->q_exts_by_addr)); next_sio = 
AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); @@ -2880,16 +2883,19 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) * in the segment we update it to reflect the work we were able to * complete. Otherwise, we remove it from the range tree entirely. */ - if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) { + if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, + queue->q_exts_by_addr)) { range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); range_tree_resize_segment(queue->q_exts_by_addr, rs, - SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio)); + SIO_GET_OFFSET(sio), rs_get_end(rs, + queue->q_exts_by_addr) - SIO_GET_OFFSET(sio)); return (B_TRUE); } else { - range_tree_remove(queue->q_exts_by_addr, rs->rs_start, - rs->rs_end - rs->rs_start); + uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr); + uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr); + range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart); return (B_FALSE); } } @@ -2909,6 +2915,7 @@ static range_seg_t * scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; + range_tree_t *rt = queue->q_exts_by_addr; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); ASSERT(scn->scn_is_sorted); @@ -2916,9 +2923,16 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) /* handle tunable overrides */ if (scn->scn_checkpointing || scn->scn_clearing) { if (zfs_scan_issue_strategy == 1) { - return (range_tree_first(queue->q_exts_by_addr)); + return (range_tree_first(rt)); } else if (zfs_scan_issue_strategy == 2) { - return (avl_first(&queue->q_exts_by_size)); + range_seg_t *size_rs = + zfs_btree_first(&queue->q_exts_by_size, NULL); + uint64_t start = rs_get_start(size_rs, rt); + uint64_t size = rs_get_end(size_rs, rt) - start; + range_seg_t *addr_rs = range_tree_find(rt, start, + size); + ASSERT3P(addr_rs, !=, NULL); + return (addr_rs); } } @@ -2932,9 +2946,15 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) * In this case, we instead switch to issuing extents in LBA order. */ if (scn->scn_checkpointing) { - return (range_tree_first(queue->q_exts_by_addr)); + return (range_tree_first(rt)); } else if (scn->scn_clearing) { - return (avl_first(&queue->q_exts_by_size)); + range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size, + NULL); + uint64_t start = rs_get_start(size_rs, rt); + uint64_t size = rs_get_end(size_rs, rt) - start; + range_seg_t *addr_rs = range_tree_find(rt, start, size); + ASSERT3P(addr_rs, !=, NULL); + return (addr_rs); } else { return (NULL); } @@ -4038,9 +4058,10 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, static int ext_size_compare(const void *x, const void *y) { - const range_seg_t *rsa = x, *rsb = y; - uint64_t sa = rsa->rs_end - rsa->rs_start, - sb = rsb->rs_end - rsb->rs_start; + const range_seg_gap_t *rsa = x, *rsb = y; + + uint64_t sa = rsa->rs_end - rsa->rs_start; + uint64_t sb = rsb->rs_end - rsb->rs_start; uint64_t score_a, score_b; score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * @@ -4069,7 +4090,7 @@ sio_addr_compare(const void *x, const void *y) { const scan_io_t *a = x, *b = y; - return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); + return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); } /* IO queues are created on demand when they are needed. 
*/ @@ -4083,8 +4104,8 @@ scan_io_queue_create(vdev_t *vd) q->q_vd = vd; q->q_sio_memused = 0; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); - q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, - &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); + q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP, + &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap); avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index d4dacbc2f..7b8ac91b1 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -36,6 +36,7 @@ #include #include #include +#include #define WITH_DF_BLOCK_ALLOCATOR @@ -183,6 +184,13 @@ int metaslab_df_free_pct = 4; */ int metaslab_df_max_search = 16 * 1024 * 1024; +/* + * Forces the metaslab_block_picker function to search for at least this many + * segments forwards until giving up on finding a segment that the allocation + * will fit into. + */ +uint32_t metaslab_min_search_count = 100; + /* * If we are not searching forward (due to metaslab_df_max_search, * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable @@ -276,19 +284,34 @@ uint64_t metaslab_trace_max_entries = 5000; */ int max_disabled_ms = 3; -/* - * Maximum percentage of memory to use on storing loaded metaslabs. If loading - * a metaslab would take it over this percentage, the oldest selected metaslab - * is automatically unloaded. - */ -int zfs_metaslab_mem_limit = 25; - /* * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. * To avoid 64-bit overflow, don't set above UINT32_MAX. */ unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ +/* + * Maximum percentage of memory to use on storing loaded metaslabs. If loading + * a metaslab would take it over this percentage, the oldest selected metaslab + * is automatically unloaded. + */ +int zfs_metaslab_mem_limit = 75; + +/* + * Force the per-metaslab range trees to use 64-bit integers to store + * segments. Used for debugging purposes. + */ +boolean_t zfs_metaslab_force_large_segs = B_FALSE; + +/* + * By default we only store segments over a certain size in the size-sorted + * metaslab trees (ms_allocatable_by_size and + * ms_unflushed_frees_by_size). This dramatically reduces memory usage and + * improves load and unload times at the cost of causing us to use slightly + * larger segments than we would otherwise in some cases. 
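+ *
+ * For example, with the default metaslab_by_size_min_shift of 14, segments
+ * are normally only added to the size-sorted trees if they are at least
+ * 16KB (1 << 14 bytes) long; smaller free segments remain tracked in the
+ * offset-sorted range trees and are only pulled into the size trees when a
+ * full reload of the size tree is needed.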
+ */ +uint32_t metaslab_by_size_min_shift = 14; + static uint64_t metaslab_weight(metaslab_t *, boolean_t); static void metaslab_set_fragmentation(metaslab_t *, boolean_t); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); @@ -299,8 +322,66 @@ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); static unsigned int metaslab_idx_func(multilist_t *, void *); static void metaslab_evict(metaslab_t *, uint64_t); +static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); #ifdef _METASLAB_TRACING kmem_cache_t *metaslab_alloc_trace_cache; + +typedef struct metaslab_stats { + kstat_named_t metaslabstat_trace_over_limit; + kstat_named_t metaslabstat_df_find_under_floor; + kstat_named_t metaslabstat_reload_tree; +} metaslab_stats_t; + +static metaslab_stats_t metaslab_stats = { + { "trace_over_limit", KSTAT_DATA_UINT64 }, + { "df_find_under_floor", KSTAT_DATA_UINT64 }, + { "reload_tree", KSTAT_DATA_UINT64 }, +}; + +#define METASLABSTAT_BUMP(stat) \ + atomic_inc_64(&metaslab_stats.stat.value.ui64); + + +kstat_t *metaslab_ksp; + +void +metaslab_stat_init(void) +{ + ASSERT(metaslab_alloc_trace_cache == NULL); + metaslab_alloc_trace_cache = kmem_cache_create( + "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats", + "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) / + sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (metaslab_ksp != NULL) { + metaslab_ksp->ks_data = &metaslab_stats; + kstat_install(metaslab_ksp); + } +} + +void +metaslab_stat_fini(void) +{ + if (metaslab_ksp != NULL) { + kstat_delete(metaslab_ksp); + metaslab_ksp = NULL; + } + + kmem_cache_destroy(metaslab_alloc_trace_cache); + metaslab_alloc_trace_cache = NULL; +} +#else + +void +metaslab_stat_init(void) +{ +} + +void +metaslab_stat_fini(void) +{ +} #endif /* @@ -608,13 +689,13 @@ metaslab_compare(const void *x1, const void *x2) if (sort1 > sort2) return (1); - int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); + int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); - IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); + IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); - return (AVL_CMP(m1->ms_start, m2->ms_start)); + return (TREE_CMP(m1->ms_start, m2->ms_start)); } /* @@ -711,17 +792,17 @@ metaslab_sort_by_flushed(const void *va, const void *vb) const metaslab_t *a = va; const metaslab_t *b = vb; - int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); + int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); if (likely(cmp)) return (cmp); uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; - cmp = AVL_CMP(a_vdev_id, b_vdev_id); + cmp = TREE_CMP(a_vdev_id, b_vdev_id); if (cmp) return (cmp); - return (AVL_CMP(a->ms_id, b->ms_id)); + return (TREE_CMP(a->ms_id, b->ms_id)); } metaslab_group_t * @@ -1212,24 +1293,169 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, */ /* - * Comparison function for the private size-ordered tree. Tree is sorted - * by size, larger sizes at the end of the tree. + * Comparison function for the private size-ordered tree using 32-bit + * ranges. Tree is sorted by size, larger sizes at the end of the tree. 
*/ static int -metaslab_rangesize_compare(const void *x1, const void *x2) +metaslab_rangesize32_compare(const void *x1, const void *x2) { - const range_seg_t *r1 = x1; - const range_seg_t *r2 = x2; + const range_seg32_t *r1 = x1; + const range_seg32_t *r2 = x2; + uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; - int cmp = AVL_CMP(rs_size1, rs_size2); + int cmp = TREE_CMP(rs_size1, rs_size2); if (likely(cmp)) return (cmp); - return (AVL_CMP(r1->rs_start, r2->rs_start)); + return (TREE_CMP(r1->rs_start, r2->rs_start)); } +/* + * Comparison function for the private size-ordered tree using 64-bit + * ranges. Tree is sorted by size, larger sizes at the end of the tree. + */ +static int +metaslab_rangesize64_compare(const void *x1, const void *x2) +{ + const range_seg64_t *r1 = x1; + const range_seg64_t *r2 = x2; + + uint64_t rs_size1 = r1->rs_end - r1->rs_start; + uint64_t rs_size2 = r2->rs_end - r2->rs_start; + + int cmp = TREE_CMP(rs_size1, rs_size2); + if (likely(cmp)) + return (cmp); + + return (TREE_CMP(r1->rs_start, r2->rs_start)); +} +typedef struct metaslab_rt_arg { + zfs_btree_t *mra_bt; + uint32_t mra_floor_shift; +} metaslab_rt_arg_t; + +struct mssa_arg { + range_tree_t *rt; + metaslab_rt_arg_t *mra; +}; + +static void +metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) +{ + struct mssa_arg *mssap = arg; + range_tree_t *rt = mssap->rt; + metaslab_rt_arg_t *mrap = mssap->mra; + range_seg_max_t seg = {0}; + rs_set_start(&seg, rt, start); + rs_set_end(&seg, rt, start + size); + metaslab_rt_add(rt, &seg, mrap); +} + +static void +metaslab_size_tree_full_load(range_tree_t *rt) +{ + metaslab_rt_arg_t *mrap = rt->rt_arg; +#ifdef _METASLAB_TRACING + METASLABSTAT_BUMP(metaslabstat_reload_tree); +#endif + ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); + mrap->mra_floor_shift = 0; + struct mssa_arg arg = {0}; + arg.rt = rt; + arg.mra = mrap; + range_tree_walk(rt, metaslab_size_sorted_add, &arg); +} + +/* + * Create any block allocator specific components. The current allocators + * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 
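To make the size-tree filtering described above concrete: under normal operation only segments of at least 1 << metaslab_by_size_min_shift are mirrored into the size-sorted B-tree, and metaslab_size_tree_full_load rebuilds that tree with a floor of zero when an exact answer is needed. The following is a minimal standalone sketch of that idea only; seg_t and size_tree_accepts are simplified stand-ins, not the ZFS range_seg/zfs_btree types.

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t start, end; } seg_t;

    /* Mirror a segment into the size-sorted index only if it is large enough. */
    static int
    size_tree_accepts(const seg_t *s, uint32_t floor_shift)
    {
            return (s->end - s->start >= ((uint64_t)1 << floor_shift));
    }

    int
    main(void)
    {
            seg_t segs[] = { { 0, 4096 }, { 8192, 32768 }, { 65536, 65536 + 131072 } };
            uint32_t floor_shift = 14;      /* default: track only segments >= 16 KiB */

            for (int i = 0; i < 3; i++) {
                    printf("seg %d (%llu bytes): %s\n", i,
                        (unsigned long long)(segs[i].end - segs[i].start),
                        size_tree_accepts(&segs[i], floor_shift) ?
                        "kept in size tree" : "skipped (a full reload finds it later)");
            }
            return (0);
    }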
+ */ +/* ARGSUSED */ +static void +metaslab_rt_create(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + size_t size; + int (*compare) (const void *, const void *); + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + compare = metaslab_rangesize32_compare; + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + compare = metaslab_rangesize64_compare; + break; + default: + panic("Invalid range seg type %d", rt->rt_type); + } + zfs_btree_create(size_tree, compare, size); + mrap->mra_floor_shift = metaslab_by_size_min_shift; +} + +/* ARGSUSED */ +static void +metaslab_rt_destroy(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + zfs_btree_destroy(size_tree); + kmem_free(mrap, sizeof (*mrap)); +} + +/* ARGSUSED */ +static void +metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < + (1 << mrap->mra_floor_shift)) + return; + + zfs_btree_add(size_tree, rs); +} + +/* ARGSUSED */ +static void +metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 << + mrap->mra_floor_shift)) + return; + + zfs_btree_remove(size_tree, rs); +} + +/* ARGSUSED */ +static void +metaslab_rt_vacate(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + zfs_btree_clear(size_tree); + zfs_btree_destroy(size_tree); + + metaslab_rt_create(rt, arg); +} + +static range_tree_ops_t metaslab_rt_ops = { + .rtop_create = metaslab_rt_create, + .rtop_destroy = metaslab_rt_destroy, + .rtop_add = metaslab_rt_add, + .rtop_remove = metaslab_rt_remove, + .rtop_vacate = metaslab_rt_vacate +}; + /* * ========================================================================== * Common allocator routines @@ -1242,16 +1468,20 @@ metaslab_rangesize_compare(const void *x1, const void *x2) uint64_t metaslab_largest_allocatable(metaslab_t *msp) { - avl_tree_t *t = &msp->ms_allocatable_by_size; + zfs_btree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; if (t == NULL) return (0); - rs = avl_last(t); + if (zfs_btree_numnodes(t) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); + + rs = zfs_btree_last(t, NULL); if (rs == NULL) return (0); - return (rs->rs_end - rs->rs_start); + return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs, + msp->ms_allocatable)); } /* @@ -1266,7 +1496,10 @@ metaslab_largest_unflushed_free(metaslab_t *msp) if (msp->ms_unflushed_frees == NULL) return (0); - range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size); + if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) + metaslab_size_tree_full_load(msp->ms_unflushed_frees); + range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, + NULL); if (rs == NULL) return (0); @@ -1293,8 +1526,8 @@ metaslab_largest_unflushed_free(metaslab_t *msp) * the largest segment; there may be other usable chunks in the * largest segment, but we ignore them. 
*/ - uint64_t rstart = rs->rs_start; - uint64_t rsize = rs->rs_end - rstart; + uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees); + uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart; for (int t = 0; t < TXG_DEFER_SIZE; t++) { uint64_t start = 0; uint64_t size = 0; @@ -1318,17 +1551,18 @@ metaslab_largest_unflushed_free(metaslab_t *msp) } static range_seg_t * -metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) +metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, + uint64_t size, zfs_btree_index_t *where) { - range_seg_t *rs, rsearch; - avl_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch; - rsearch.rs_start = start; - rsearch.rs_end = start + size; + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, start + size); - rs = avl_find(t, &rsearch, &where); + rs = zfs_btree_find(t, &rsearch, where); if (rs == NULL) { - rs = avl_nearest(t, where, AVL_AFTER); + rs = zfs_btree_next(t, where, where); } return (rs); @@ -1337,27 +1571,34 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) #if defined(WITH_DF_BLOCK_ALLOCATOR) || \ defined(WITH_CF_BLOCK_ALLOCATOR) /* - * This is a helper function that can be used by the allocator to find - * a suitable block to allocate. This will search the specified AVL - * tree looking for a block that matches the specified criteria. + * This is a helper function that can be used by the allocator to find a + * suitable block to allocate. This will search the specified B-tree looking + * for a block that matches the specified criteria. */ static uint64_t -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, +metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, uint64_t max_search) { - range_seg_t *rs = metaslab_block_find(t, *cursor, size); + if (*cursor == 0) + *cursor = rt->rt_start; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t where; + range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where); uint64_t first_found; + int count_searched = 0; if (rs != NULL) - first_found = rs->rs_start; + first_found = rs_get_start(rs, rt); - while (rs != NULL && rs->rs_start - first_found <= max_search) { - uint64_t offset = rs->rs_start; - if (offset + size <= rs->rs_end) { + while (rs != NULL && (rs_get_start(rs, rt) - first_found <= + max_search || count_searched < metaslab_min_search_count)) { + uint64_t offset = rs_get_start(rs, rt); + if (offset + size <= rs_get_end(rs, rt)) { *cursor = offset + size; return (offset); } - rs = AVL_NEXT(t, rs); + rs = zfs_btree_next(bt, &where, &where); + count_searched++; } *cursor = 0; @@ -1403,8 +1644,6 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(&rt->rt_root), ==, - avl_numnodes(&msp->ms_allocatable_by_size)); /* * If we're running low on space, find a segment based on size, @@ -1414,22 +1653,33 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) free_pct < metaslab_df_free_pct) { offset = -1; } else { - offset = metaslab_block_picker(&rt->rt_root, + offset = metaslab_block_picker(rt, cursor, size, metaslab_df_max_search); } if (offset == -1) { range_seg_t *rs; + if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); if (metaslab_df_use_largest_segment) { /* use largest free segment */ - rs = avl_last(&msp->ms_allocatable_by_size); + rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); } else { + zfs_btree_index_t where; /* use segment of this 
size, or next largest */ +#ifdef _METASLAB_TRACING + metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg; + if (size < (1 << mrap->mra_floor_shift)) { + METASLABSTAT_BUMP( + metaslabstat_df_find_under_floor); + } +#endif rs = metaslab_block_find(&msp->ms_allocatable_by_size, - 0, size); + rt, msp->ms_start, size, &where); } - if (rs != NULL && rs->rs_start + size <= rs->rs_end) { - offset = rs->rs_start; + if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs, + rt)) { + offset = rs_get_start(rs, rt); *cursor = offset + size; } } @@ -1458,25 +1708,27 @@ static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &msp->ms_allocatable_by_size; + zfs_btree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); ASSERT3U(*cursor_end, >=, *cursor); if ((*cursor + size) > *cursor_end) { range_seg_t *rs; - rs = avl_last(&msp->ms_allocatable_by_size); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) + if (zfs_btree_numnodes(t) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); + rs = zfs_btree_last(t, NULL); + if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < + size) return (-1ULL); - *cursor = rs->rs_start; - *cursor_end = rs->rs_end; + *cursor = rs_get_start(rs, rt); + *cursor_end = rs_get_end(rs, rt); } offset = *cursor; @@ -1511,39 +1763,40 @@ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { - avl_tree_t *t = &msp->ms_allocatable->rt_root; - avl_index_t where; - range_seg_t *rs, rsearch; + zfs_btree_t *t = &msp->ms_allocatable->rt_root; + range_tree_t *rt = msp->ms_allocatable; + zfs_btree_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; uint64_t max_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, - avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); - rsearch.rs_start = *cursor; - rsearch.rs_end = *cursor + size; + rs_set_start(&rsearch, rt, *cursor); + rs_set_end(&rsearch, rt, *cursor + size); - rs = avl_find(t, &rsearch, &where); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { + rs = zfs_btree_find(t, &rsearch, &where); + if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { t = &msp->ms_allocatable_by_size; - rsearch.rs_start = 0; - rsearch.rs_end = MIN(max_size, - 1ULL << (hbit + metaslab_ndf_clump_shift)); - rs = avl_find(t, &rsearch, &where); + rs_set_start(&rsearch, rt, 0); + rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + + metaslab_ndf_clump_shift))); + + rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL) - rs = avl_nearest(t, where, AVL_AFTER); + rs = zfs_btree_next(t, &where, &where); ASSERT(rs != NULL); } - if ((rs->rs_end - rs->rs_start) >= size) { - *cursor = rs->rs_start + size; - return (rs->rs_start); + if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { + *cursor = rs_get_start(rs, rt) + size; + return (rs_get_start(rs, rt)); } return (-1ULL); } @@ -1889,9 +2142,8 @@ metaslab_potentially_evict(metaslab_class_t *mc) { #ifdef _KERNEL uint64_t allmem = arc_all_memory(); - extern kmem_cache_t *range_seg_cache; - uint64_t inuse = range_seg_cache->skc_obj_total; - uint64_t size = range_seg_cache->skc_obj_size; + uint64_t 
inuse = zfs_btree_leaf_cache->skc_obj_total; + uint64_t size = zfs_btree_leaf_cache->skc_obj_size; int tries = 0; for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2; @@ -1928,7 +2180,7 @@ metaslab_potentially_evict(metaslab_class_t *mc) */ if (msp->ms_loading) { msp = next_msp; - inuse = range_seg_cache->skc_obj_total; + inuse = zfs_btree_leaf_cache->skc_obj_total; continue; } /* @@ -1950,7 +2202,7 @@ metaslab_potentially_evict(metaslab_class_t *mc) } mutex_exit(&msp->ms_lock); msp = next_msp; - inuse = range_seg_cache->skc_obj_total; + inuse = zfs_btree_leaf_cache->skc_obj_total; } } #endif @@ -1993,10 +2245,39 @@ metaslab_load_impl(metaslab_t *msp) mutex_exit(&msp->ms_lock); hrtime_t load_start = gethrtime(); + metaslab_rt_arg_t *mrap; + if (msp->ms_allocatable->rt_arg == NULL) { + mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); + } else { + mrap = msp->ms_allocatable->rt_arg; + msp->ms_allocatable->rt_ops = NULL; + msp->ms_allocatable->rt_arg = NULL; + } + mrap->mra_bt = &msp->ms_allocatable_by_size; + mrap->mra_floor_shift = metaslab_by_size_min_shift; + if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); + + /* Now, populate the size-sorted tree. */ + metaslab_rt_create(msp->ms_allocatable, mrap); + msp->ms_allocatable->rt_ops = &metaslab_rt_ops; + msp->ms_allocatable->rt_arg = mrap; + + struct mssa_arg arg = {0}; + arg.rt = msp->ms_allocatable; + arg.mra = mrap; + range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, + &arg); } else { + /* + * Add the size-sorted tree first, since we don't need to load + * the metaslab from the spacemap. + */ + metaslab_rt_create(msp->ms_allocatable, mrap); + msp->ms_allocatable->rt_ops = &metaslab_rt_ops; + msp->ms_allocatable->rt_arg = mrap; /* * The space map has not been allocated yet, so treat * all the space in the metaslab as free and add it to the @@ -2252,6 +2533,29 @@ metaslab_unload(metaslab_t *msp) metaslab_recalculate_weight_and_sort(msp); } +/* + * We want to optimize the memory use of the per-metaslab range + * trees. To do this, we store the segments in the range trees in + * units of sectors, zero-indexing from the start of the metaslab. If + * the vdev_ms_shift - the vdev_ashift is less than 32, we can store + * the ranges using two uint32_ts, rather than two uint64_ts. + */ +static range_seg_type_t +metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, + uint64_t *start, uint64_t *shift) +{ + if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && + !zfs_metaslab_force_large_segs) { + *shift = vdev->vdev_ashift; + *start = msp->ms_start; + return (RANGE_SEG32); + } else { + *shift = 0; + *start = 0; + return (RANGE_SEG64); + } +} + void metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) { @@ -2327,6 +2631,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } + range_seg_type_t type; + uint64_t shift, start; + type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); + /* * We create the ms_allocatable here, but we don't create the * other range trees until metaslab_sync_done(). This serves @@ -2335,10 +2643,9 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, * we'd data fault on any attempt to use this metaslab before * it's ready. 
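The eviction loop in metaslab_potentially_evict above keeps unloading metaslabs while the memory held by B-tree leaf objects exceeds zfs_metaslab_mem_limit percent of system memory. A standalone arithmetic check of that condition, with made-up example numbers (the real values come from the zfs_btree_leaf_cache kstats):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint64_t allmem = 16ULL << 30;          /* 16 GiB of system memory (example) */
            uint64_t obj_total = 2000000;           /* leaf objects in use (example) */
            uint64_t obj_size = 4096;               /* bytes per leaf object (example) */
            int mem_limit_pct = 75;                 /* zfs_metaslab_mem_limit */

            uint64_t inuse_bytes = obj_total * obj_size;            /* about 7.6 GiB */
            uint64_t cap_bytes = allmem * mem_limit_pct / 100;      /* 12 GiB */

            printf("evict? %s\n", cap_bytes < inuse_bytes ? "yes" : "no");
            return (0);
    }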
*/ - ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, - &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); + ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); - ms->ms_trim = range_tree_create(NULL, NULL); + ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms, B_FALSE); @@ -2393,7 +2700,7 @@ metaslab_unflushed_changes_memused(metaslab_t *ms) { return ((range_tree_numsegs(ms->ms_unflushed_allocs) + range_tree_numsegs(ms->ms_unflushed_frees)) * - sizeof (range_seg_t)); + ms->ms_unflushed_allocs->rt_root.bt_elem_size); } void @@ -3187,7 +3494,7 @@ metaslab_should_condense(metaslab_t *msp) * We always condense metaslabs that are empty and metaslabs for * which a condense request has been made. */ - if (avl_is_empty(&msp->ms_allocatable_by_size) || + if (range_tree_numsegs(msp->ms_allocatable) == 0 || msp->ms_condense_wanted) return (B_TRUE); @@ -3233,28 +3540,29 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) * So to truncate the space map to represent all the entries of * previous TXGs we do the following: * - * 1] We create a range tree (condense tree) that is 100% allocated. - * 2] We remove from it all segments found in the ms_defer trees + * 1] We create a range tree (condense tree) that is 100% empty. + * 2] We add to it all segments found in the ms_defer trees * as those segments are marked as free in the original space * map. We do the same with the ms_allocating trees for the same - * reason. Removing these segments should be a relatively + * reason. Adding these segments should be a relatively * inexpensive operation since we expect these trees to have a * small number of nodes. - * 3] We vacate any unflushed allocs as they should already exist - * in the condense tree. Then we vacate any unflushed frees as - * they should already be part of ms_allocatable. - * 4] At this point, we would ideally like to remove all segments + * 3] We vacate any unflushed allocs, since they are not frees we + * need to add to the condense tree. Then we vacate any + * unflushed frees as they should already be part of ms_allocatable. + * 4] At this point, we would ideally like to add all segments * in the ms_allocatable tree from the condense tree. This way * we would write all the entries of the condense tree as the - * condensed space map, which would only contain allocated - * segments with everything else assumed to be freed. + * condensed space map, which would only contain freeed + * segments with everything else assumed to be allocated. * * Doing so can be prohibitively expensive as ms_allocatable can - * be large, and therefore computationally expensive to subtract - * from the condense_tree. Instead we first sync out the - * condense_tree and then the ms_allocatable, in the condensed - * space map. While this is not optimal, it is typically close to - * optimal and more importantly much cheaper to compute. + * be large, and therefore computationally expensive to add to + * the condense_tree. Instead we first sync out an entry marking + * everything as allocated, then the condense_tree and then the + * ms_allocatable, in the condensed space map. While this is not + * optimal, it is typically close to optimal and more importantly + * much cheaper to compute. 
* * 5] Finally, as both of the unflushed trees were written to our * new and condensed metaslab space map, we basically flushed @@ -3268,22 +3576,26 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, spa->spa_name, space_map_length(msp->ms_sm), - avl_numnodes(&msp->ms_allocatable->rt_root), + range_tree_numsegs(msp->ms_allocatable), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; - condense_tree = range_tree_create(NULL, NULL); - range_tree_add(condense_tree, msp->ms_start, msp->ms_size); + range_seg_type_t type; + uint64_t shift, start; + type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, + &start, &shift); + + condense_tree = range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], - range_tree_remove, condense_tree); + range_tree_add, condense_tree); } for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], - range_tree_remove, condense_tree); + range_tree_add, condense_tree); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, @@ -3331,11 +3643,17 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) * followed by FREES (due to space_map_write() in metaslab_sync()) for * sync pass 1. */ - space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); + range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start, + shift); + range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); + space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); + space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx); range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); + range_tree_vacate(tmp_tree, NULL, NULL); + range_tree_destroy(tmp_tree); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; @@ -3578,7 +3896,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) return; - VERIFY(txg <= spa_final_dirty_txg(spa)); + VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); /* * The only state that can actually be changing concurrently @@ -3867,32 +4185,46 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * range trees and add its capacity to the vdev. 
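The comment above metaslab_calculate_range_tree_type explains why 32-bit segments usually suffice: offsets are stored in units of 2^shift (the vdev ashift), relative to the metaslab's start. A minimal standalone sketch of that encode/decode round trip; the helper names are illustrative and are not the actual rs_set_*/rs_get_* accessors.

    #include <assert.h>
    #include <stdint.h>

    /* Encode an absolute byte offset as a 32-bit sector offset within a metaslab. */
    static uint32_t
    encode_off(uint64_t abs_off, uint64_t ms_start, uint64_t shift)
    {
            return ((uint32_t)((abs_off - ms_start) >> shift));
    }

    static uint64_t
    decode_off(uint32_t rel, uint64_t ms_start, uint64_t shift)
    {
            return (((uint64_t)rel << shift) + ms_start);
    }

    int
    main(void)
    {
            uint64_t ms_start = 512ULL << 30;       /* metaslab begins 512 GiB into the vdev */
            uint64_t shift = 12;                    /* 4 KiB sectors (ashift = 12) */
            uint64_t off = ms_start + (3ULL << 30); /* segment start 3 GiB into the metaslab */

            /* 3 GiB is 786432 four-KiB sectors, which easily fits in 32 bits. */
            uint32_t rel = encode_off(off, ms_start, shift);
            assert(decode_off(rel, ms_start, shift) == off);
            return (0);
    }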
*/ if (msp->ms_freed == NULL) { + range_seg_type_t type; + uint64_t shift, start; + type = metaslab_calculate_range_tree_type(vd, msp, &start, + &shift); + for (int t = 0; t < TXG_SIZE; t++) { ASSERT(msp->ms_allocating[t] == NULL); - msp->ms_allocating[t] = range_tree_create(NULL, NULL); + msp->ms_allocating[t] = range_tree_create(NULL, type, + NULL, start, shift); } ASSERT3P(msp->ms_freeing, ==, NULL); - msp->ms_freeing = range_tree_create(NULL, NULL); + msp->ms_freeing = range_tree_create(NULL, type, NULL, start, + shift); ASSERT3P(msp->ms_freed, ==, NULL); - msp->ms_freed = range_tree_create(NULL, NULL); + msp->ms_freed = range_tree_create(NULL, type, NULL, start, + shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ASSERT3P(msp->ms_defer[t], ==, NULL); - msp->ms_defer[t] = range_tree_create(NULL, NULL); + msp->ms_defer[t] = range_tree_create(NULL, type, NULL, + start, shift); } ASSERT3P(msp->ms_checkpointing, ==, NULL); - msp->ms_checkpointing = range_tree_create(NULL, NULL); + msp->ms_checkpointing = range_tree_create(NULL, type, NULL, + start, shift); ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); - msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); + msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL, + start, shift); + + metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); + mrap->mra_bt = &msp->ms_unflushed_frees_by_size; + mrap->mra_floor_shift = metaslab_by_size_min_shift; ASSERT3P(msp->ms_unflushed_frees, ==, NULL); - msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops, - &msp->ms_unflushed_frees_by_size, - metaslab_rangesize_compare, 0); + msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, + type, mrap, start, shift); metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } @@ -4052,36 +4384,6 @@ metaslab_is_unique(metaslab_t *msp, dva_t *dva) * ========================================================================== */ #ifdef _METASLAB_TRACING -kstat_t *metaslab_trace_ksp; -kstat_named_t metaslab_trace_over_limit; - -void -metaslab_alloc_trace_init(void) -{ - ASSERT(metaslab_alloc_trace_cache == NULL); - metaslab_alloc_trace_cache = kmem_cache_create( - "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); - metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", - "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); - if (metaslab_trace_ksp != NULL) { - metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; - kstat_named_init(&metaslab_trace_over_limit, - "metaslab_trace_over_limit", KSTAT_DATA_UINT64); - kstat_install(metaslab_trace_ksp); - } -} - -void -metaslab_alloc_trace_fini(void) -{ - if (metaslab_trace_ksp != NULL) { - kstat_delete(metaslab_trace_ksp); - metaslab_trace_ksp = NULL; - } - kmem_cache_destroy(metaslab_alloc_trace_cache); - metaslab_alloc_trace_cache = NULL; -} /* * Add an allocation trace element to the allocation tracing list. 
@@ -4108,7 +4410,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, #ifdef DEBUG panic("too many entries in allocation list"); #endif - atomic_inc_64(&metaslab_trace_over_limit.value.ui64); + METASLABSTAT_BUMP(metaslabstat_trace_over_limit); zal->zal_size--; mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); list_remove(&zal->zal_list, mat_next); @@ -4160,16 +4462,6 @@ metaslab_trace_fini(zio_alloc_list_t *zal) #define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) -void -metaslab_alloc_trace_init(void) -{ -} - -void -metaslab_alloc_trace_fini(void) -{ -} - void metaslab_trace_init(zio_alloc_list_t *zal) { diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 0e1297214..0b369a438 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -74,42 +74,38 @@ * support removing complete segments. */ -kmem_cache_t *range_seg_cache; - -/* Generic ops for managing an AVL tree alongside a range tree */ -struct range_tree_ops rt_avl_ops = { - .rtop_create = rt_avl_create, - .rtop_destroy = rt_avl_destroy, - .rtop_add = rt_avl_add, - .rtop_remove = rt_avl_remove, - .rtop_vacate = rt_avl_vacate, -}; - -void -range_tree_init(void) +static inline void +rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt) { - ASSERT(range_seg_cache == NULL); - range_seg_cache = kmem_cache_create("range_seg_cache", - sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -range_tree_fini(void) -{ - kmem_cache_destroy(range_seg_cache); - range_seg_cache = NULL; + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + size_t size = 0; + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + break; + default: + VERIFY(0); + } + bcopy(src, dest, size); } void range_tree_stat_verify(range_tree_t *rt) { range_seg_t *rs; + zfs_btree_index_t where; uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; int i; - for (rs = avl_first(&rt->rt_root); rs != NULL; - rs = AVL_NEXT(&rt->rt_root, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; + for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; hist[idx]++; @@ -128,7 +124,7 @@ range_tree_stat_verify(range_tree_t *rt) static void range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) { - uint64_t size = rs->rs_end - rs->rs_start; + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; ASSERT(size != 0); @@ -142,7 +138,7 @@ range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) static void range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) { - uint64_t size = rs->rs_end - rs->rs_start; + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; ASSERT(size != 0); @@ -153,14 +149,35 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) rt->rt_histogram[idx]--; } -/* - * NOTE: caller is responsible for all locking. 
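range_tree_stat_incr/decr above bucket each segment by the position of its highest set size bit (highbit64(size) - 1). A quick standalone check of which histogram bucket a given size lands in, using a compiler builtin as a stand-in for highbit64 with its usual 1-based meaning:

    #include <stdint.h>
    #include <stdio.h>

    /* highbit64(x): 1-based index of the highest set bit, 0 for x == 0. */
    static int
    highbit64(uint64_t x)
    {
            return (x == 0 ? 0 : 64 - __builtin_clzll(x));
    }

    int
    main(void)
    {
            uint64_t sizes[] = { 4096, 12288, 1ULL << 20 };

            for (int i = 0; i < 3; i++) {
                    printf("size %llu -> histogram bucket %d\n",
                        (unsigned long long)sizes[i], highbit64(sizes[i]) - 1);
            }
            /* prints buckets 12, 13, and 20 */
            return (0);
    }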
- */ static int -range_tree_seg_compare(const void *x1, const void *x2) +range_tree_seg32_compare(const void *x1, const void *x2) { - const range_seg_t *r1 = (const range_seg_t *)x1; - const range_seg_t *r2 = (const range_seg_t *)x2; + const range_seg32_t *r1 = x1; + const range_seg32_t *r2 = x2; + + ASSERT3U(r1->rs_start, <=, r1->rs_end); + ASSERT3U(r2->rs_start, <=, r2->rs_end); + + return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); +} + +static int +range_tree_seg64_compare(const void *x1, const void *x2) +{ + const range_seg64_t *r1 = x1; + const range_seg64_t *r2 = x2; + + ASSERT3U(r1->rs_start, <=, r1->rs_end); + ASSERT3U(r2->rs_start, <=, r2->rs_end); + + return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); +} + +static int +range_tree_seg_gap_compare(const void *x1, const void *x2) +{ + const range_seg_gap_t *r1 = x1; + const range_seg_gap_t *r2 = x2; ASSERT3U(r1->rs_start, <=, r1->rs_end); ASSERT3U(r2->rs_start, <=, r2->rs_end); @@ -169,18 +186,42 @@ range_tree_seg_compare(const void *x1, const void *x2) } range_tree_t * -range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare) (const void *, const void *), uint64_t gap) +range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, + uint64_t start, uint64_t shift, + int (*zfs_btree_compare) (const void *, const void *), + uint64_t gap) { range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); - avl_create(&rt->rt_root, range_tree_seg_compare, - sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); + ASSERT3U(shift, <, 64); + ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); + size_t size; + int (*compare) (const void *, const void *); + switch (type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + compare = range_tree_seg32_compare; + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + compare = range_tree_seg64_compare; + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + compare = range_tree_seg_gap_compare; + break; + default: + panic("Invalid range seg type %d", type); + } + zfs_btree_create(&rt->rt_root, compare, size); rt->rt_ops = ops; rt->rt_gap = gap; rt->rt_arg = arg; - rt->rt_avl_compare = avl_compare; + rt->rt_type = type; + rt->rt_start = start; + rt->rt_shift = shift; + rt->rt_btree_compare = zfs_btree_compare; if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); @@ -189,9 +230,10 @@ range_tree_create_impl(range_tree_ops_t *ops, void *arg, } range_tree_t * -range_tree_create(range_tree_ops_t *ops, void *arg) +range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, + void *arg, uint64_t start, uint64_t shift) { - return (range_tree_create_impl(ops, arg, NULL, 0)); + return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0)); } void @@ -202,19 +244,20 @@ range_tree_destroy(range_tree_t *rt) if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); - avl_destroy(&rt->rt_root); + zfs_btree_destroy(&rt->rt_root); kmem_free(rt, sizeof (*rt)); } void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) { - ASSERT3U(rs->rs_fill + delta, !=, 0); - ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start); + ASSERT3U(rs_get_fill(rs, rt) + delta, !=, 0); + ASSERT3U(rs_get_fill(rs, rt) + delta, <=, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - rs->rs_fill += delta; + rs_set_fill(rs, 
rt, rs_get_fill(rs, rt) + delta); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); } @@ -223,28 +266,20 @@ static void range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) { range_tree_t *rt = arg; - avl_index_t where; - range_seg_t rsearch, *rs_before, *rs_after, *rs; + zfs_btree_index_t where; + range_seg_t *rs_before, *rs_after, *rs; + range_seg_max_t tmp, rsearch; uint64_t end = start + size, gap = rt->rt_gap; uint64_t bridge_size = 0; boolean_t merge_before, merge_after; ASSERT3U(size, !=, 0); ASSERT3U(fill, <=, size); + ASSERT3U(start + size, >, start); - rsearch.rs_start = start; - rsearch.rs_end = end; - rs = avl_find(&rt->rt_root, &rsearch, &where); - - if (gap == 0 && rs != NULL && - rs->rs_start <= start && rs->rs_end >= end) { - zfs_panic_recover("zfs: allocating allocated segment" - "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n", - (longlong_t)start, (longlong_t)size, - (longlong_t)rs->rs_start, - (longlong_t)rs->rs_end - rs->rs_start); - return; - } + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); /* * If this is a gap-supporting range tree, it is possible that we @@ -255,27 +290,28 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) * the normal code paths. */ if (rs != NULL) { + ASSERT3U(rt->rt_gap, !=, 0); + uint64_t rstart = rs_get_start(rs, rt); + uint64_t rend = rs_get_end(rs, rt); ASSERT3U(gap, !=, 0); - if (rs->rs_start <= start && rs->rs_end >= end) { + if (rstart <= start && rend >= end) { range_tree_adjust_fill(rt, rs, fill); return; } - avl_remove(&rt->rt_root, rs); + zfs_btree_remove(&rt->rt_root, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); range_tree_stat_decr(rt, rs); - rt->rt_space -= rs->rs_end - rs->rs_start; + rt->rt_space -= rend - rstart; - fill += rs->rs_fill; - start = MIN(start, rs->rs_start); - end = MAX(end, rs->rs_end); + fill += rs_get_fill(rs, rt); + start = MIN(start, rstart); + end = MAX(end, rend); size = end - start; range_tree_add_impl(rt, start, size, fill); - - kmem_cache_free(range_seg_cache, rs); return; } @@ -286,19 +322,21 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) * If gap != 0, we might need to merge with our neighbors even if we * aren't directly touching. 
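The range_tree_seg*_compare functions defined above treat any two overlapping segments as equal, so a B-tree lookup with a probe segment returns whichever stored segment overlaps it. Here is a standalone model of that comparator and the property it provides; the plain seg_t struct stands in for the range_seg types.

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t start, end; } seg_t;

    /*
     * Returns -1 if s1 is entirely before s2, 1 if entirely after,
     * and 0 if the two half-open segments overlap at all.
     */
    static int
    seg_compare(const seg_t *s1, const seg_t *s2)
    {
            return ((s1->start >= s2->end) - (s1->end <= s2->start));
    }

    int
    main(void)
    {
            seg_t stored = { 100, 200 };
            seg_t before = { 0, 100 }, inside = { 150, 160 }, after = { 200, 300 };

            printf("%d %d %d\n",
                seg_compare(&before, &stored),      /* -1: ends at 100, no overlap */
                seg_compare(&inside, &stored),      /*  0: overlaps, "equal" to stored */
                seg_compare(&after, &stored));      /*  1: starts at 200, no overlap */
            return (0);
    }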
*/ - rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); - rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); + zfs_btree_index_t where_before, where_after; + rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before); + rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after); - merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap); - merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap); + merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >= + start - gap); + merge_after = (rs_after != NULL && rs_get_start(rs_after, rt) <= end + + gap); if (merge_before && gap != 0) - bridge_size += start - rs_before->rs_end; + bridge_size += start - rs_get_end(rs_before, rt); if (merge_after && gap != 0) - bridge_size += rs_after->rs_start - end; + bridge_size += rs_get_start(rs_after, rt) - end; if (merge_before && merge_after) { - avl_remove(&rt->rt_root, rs_before); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); @@ -307,9 +345,19 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) range_tree_stat_decr(rt, rs_before); range_tree_stat_decr(rt, rs_after); - rs_after->rs_fill += rs_before->rs_fill + fill; - rs_after->rs_start = rs_before->rs_start; - kmem_cache_free(range_seg_cache, rs_before); + rs_copy(rs_after, &tmp, rt); + uint64_t before_start = rs_get_start_raw(rs_before, rt); + uint64_t before_fill = rs_get_fill(rs_before, rt); + uint64_t after_fill = rs_get_fill(rs_after, rt); + zfs_btree_remove_from(&rt->rt_root, &where_before); + + /* + * We have to re-find the node because our old reference is + * invalid as soon as we do any mutating btree operations. 
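The comment above captures the key behavioral difference from the old AVL code: a zfs_btree_t stores segments by value, so mutating the tree can relocate the memory a previously returned range_seg_t pointer refers to, which is why the code copies the segment and re-finds it after the removal. A standalone analogy with a sorted array, which relocates elements the same way, shows the failure mode; the array helpers are purely illustrative.

    #include <stdio.h>
    #include <string.h>

    static int vals[8] = { 10, 20, 30, 40 };
    static int nvals = 4;

    /* Remove vals[idx], shifting later elements down (as a btree leaf would). */
    static void
    remove_at(int idx)
    {
            memmove(&vals[idx], &vals[idx + 1], (nvals - idx - 1) * sizeof (int));
            nvals--;
    }

    int
    main(void)
    {
            int *held = &vals[2];   /* points at the element 30 */
            int saved = *held;      /* copy the value before mutating */

            remove_at(1);           /* remove 20; 30 and 40 shift down */

            /* The old pointer is now stale: it reads 40, not 30. */
            printf("stale read: %d, saved copy: %d\n", *held, saved);
            return (0);
    }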
+ */ + rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after); + rs_set_start_raw(rs_after, rt, before_start); + rs_set_fill(rs_after, rt, after_fill + before_fill + fill); rs = rs_after; } else if (merge_before) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) @@ -317,8 +365,9 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) range_tree_stat_decr(rt, rs_before); - rs_before->rs_fill += fill; - rs_before->rs_end = end; + uint64_t before_fill = rs_get_fill(rs_before, rt); + rs_set_end(rs_before, rt, end); + rs_set_fill(rs_before, rt, before_fill + fill); rs = rs_before; } else if (merge_after) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) @@ -326,22 +375,26 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) range_tree_stat_decr(rt, rs_after); - rs_after->rs_fill += fill; - rs_after->rs_start = start; + uint64_t after_fill = rs_get_fill(rs_after, rt); + rs_set_start(rs_after, rt, start); + rs_set_fill(rs_after, rt, after_fill + fill); rs = rs_after; } else { - rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); + rs = &tmp; - rs->rs_fill = fill; - rs->rs_start = start; - rs->rs_end = end; - avl_insert(&rt->rt_root, rs, where); + rs_set_start(rs, rt, start); + rs_set_end(rs, rt, end); + rs_set_fill(rs, rt, fill); + zfs_btree_insert(&rt->rt_root, rs, &where); } - if (gap != 0) - ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start); - else - ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start); + if (gap != 0) { + ASSERT3U(rs_get_fill(rs, rt), <=, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } else { + ASSERT3U(rs_get_fill(rs, rt), ==, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); @@ -360,22 +413,25 @@ static void range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, boolean_t do_fill) { - avl_index_t where; - range_seg_t rsearch, *rs, *newseg; + zfs_btree_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch, rs_tmp; uint64_t end = start + size; boolean_t left_over, right_over; VERIFY3U(size, !=, 0); VERIFY3U(size, <=, rt->rt_space); + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); - rsearch.rs_start = start; - rsearch.rs_end = end; - rs = avl_find(&rt->rt_root, &rsearch, &where); + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); /* Make sure we completely overlap with someone */ if (rs == NULL) { - zfs_panic_recover("zfs: freeing free segment " - "(offset=%llu size=%llu)", + zfs_panic_recover("zfs: removing nonexistent segment from " + "range tree (offset=%llu size=%llu)", (longlong_t)start, (longlong_t)size); return; } @@ -388,30 +444,32 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, */ if (rt->rt_gap != 0) { if (do_fill) { - if (rs->rs_fill == size) { - start = rs->rs_start; - end = rs->rs_end; + if (rs_get_fill(rs, rt) == size) { + start = rs_get_start(rs, rt); + end = rs_get_end(rs, rt); size = end - start; } else { range_tree_adjust_fill(rt, rs, -size); return; } - } else if (rs->rs_start != start || rs->rs_end != end) { + } else if (rs_get_start(rs, rt) != start || + rs_get_end(rs, rt) != end) { zfs_panic_recover("zfs: freeing partial segment of " "gap tree (offset=%llu size=%llu) of " "(offset=%llu size=%llu)", (longlong_t)start, (longlong_t)size, - (longlong_t)rs->rs_start, - (longlong_t)rs->rs_end - rs->rs_start); + (longlong_t)rs_get_start(rs, 
rt), + (longlong_t)rs_get_end(rs, rt) - rs_get_start(rs, + rt)); return; } } - VERIFY3U(rs->rs_start, <=, start); - VERIFY3U(rs->rs_end, >=, end); + VERIFY3U(rs_get_start(rs, rt), <=, start); + VERIFY3U(rs_get_end(rs, rt), >=, end); - left_over = (rs->rs_start != start); - right_over = (rs->rs_end != end); + left_over = (rs_get_start(rs, rt) != start); + right_over = (rs_get_end(rs, rt) != end); range_tree_stat_decr(rt, rs); @@ -419,24 +477,33 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); if (left_over && right_over) { - newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); - newseg->rs_start = end; - newseg->rs_end = rs->rs_end; - newseg->rs_fill = newseg->rs_end - newseg->rs_start; - range_tree_stat_incr(rt, newseg); + range_seg_max_t newseg; + rs_set_start(&newseg, rt, end); + rs_set_end_raw(&newseg, rt, rs_get_end_raw(rs, rt)); + rs_set_fill(&newseg, rt, rs_get_end(rs, rt) - end); + range_tree_stat_incr(rt, &newseg); - rs->rs_end = start; + // This modifies the buffer already inside the range tree + rs_set_end(rs, rt, start); + + rs_copy(rs, &rs_tmp, rt); + if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL) + zfs_btree_insert(&rt->rt_root, &newseg, &where); + else + zfs_btree_add(&rt->rt_root, &newseg); - avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); + rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg); } else if (left_over) { - rs->rs_end = start; + // This modifies the buffer already inside the range tree + rs_set_end(rs, rt, start); + rs_copy(rs, &rs_tmp, rt); } else if (right_over) { - rs->rs_start = end; + // This modifies the buffer already inside the range tree + rs_set_start(rs, rt, end); + rs_copy(rs, &rs_tmp, rt); } else { - avl_remove(&rt->rt_root, rs); - kmem_cache_free(range_seg_cache, rs); + zfs_btree_remove_from(&rt->rt_root, &where); rs = NULL; } @@ -446,11 +513,12 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, * the size, since we do not support removing partial segments * of range trees with gaps. 
*/ - rs->rs_fill = rs->rs_end - rs->rs_start; - range_tree_stat_incr(rt, rs); + rs_set_fill_raw(rs, rt, rs_get_end_raw(rs, rt) - + rs_get_start_raw(rs, rt)); + range_tree_stat_incr(rt, &rs_tmp); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg); } rt->rt_space -= size; @@ -472,14 +540,14 @@ void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize) { - int64_t delta = newsize - (rs->rs_end - rs->rs_start); + int64_t delta = newsize - (rs_get_end(rs, rt) - rs_get_start(rs, rt)); range_tree_stat_decr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - rs->rs_start = newstart; - rs->rs_end = newstart + newsize; + rs_set_start(rs, rt, newstart); + rs_set_end(rs, rt, newstart + newsize); range_tree_stat_incr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) @@ -491,22 +559,27 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, static range_seg_t * range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) { - range_seg_t rsearch; + range_seg_max_t rsearch; uint64_t end = start + size; VERIFY(size != 0); - rsearch.rs_start = start; - rsearch.rs_end = end; - return (avl_find(&rt->rt_root, &rsearch, NULL)); + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + return (zfs_btree_find(&rt->rt_root, &rsearch, NULL)); } range_seg_t * range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) { + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + range_seg_t *rs = range_tree_find_impl(rt, start, size); - if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size) + if (rs != NULL && rs_get_start(rs, rt) <= start && + rs_get_end(rs, rt) >= start + size) { return (rs); + } return (NULL); } @@ -533,24 +606,28 @@ boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, uint64_t *ostart, uint64_t *osize) { - range_seg_t rsearch; - rsearch.rs_start = start; - rsearch.rs_end = start + 1; + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); - avl_index_t where; - range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where); + range_seg_max_t rsearch; + rs_set_start(&rsearch, rt, start); + rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1); + + zfs_btree_index_t where; + range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); if (rs != NULL) { *ostart = start; - *osize = MIN(size, rs->rs_end - start); + *osize = MIN(size, rs_get_end(rs, rt) - start); return (B_TRUE); } - rs = avl_nearest(&rt->rt_root, where, AVL_AFTER); - if (rs == NULL || rs->rs_start > start + size) + rs = zfs_btree_next(&rt->rt_root, &where, &where); + if (rs == NULL || rs_get_start(rs, rt) > start + size) return (B_FALSE); - *ostart = rs->rs_start; - *osize = MIN(start + size, rs->rs_end) - rs->rs_start; + *ostart = rs_get_start(rs, rt); + *osize = MIN(start + size, rs_get_end(rs, rt)) - + rs_get_start(rs, rt); return (B_TRUE); } @@ -566,9 +643,12 @@ range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) if (size == 0) return; + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { - uint64_t free_start = MAX(rs->rs_start, start); - uint64_t free_end = MIN(rs->rs_end, start + size); + uint64_t free_start = MAX(rs_get_start(rs, rt), start); + uint64_t free_end = MIN(rs_get_end(rs, rt), start + size); 
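Removing [start, end) from an existing segment leaves zero, one, or two pieces, which is what the left_over/right_over cases in range_tree_remove_impl above handle: the split creates a new right-hand segment and shrinks the original down to the left-hand one. A minimal standalone version of that case analysis, with a plain seg_t standing in for the real segment types:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t start, end; } seg_t;

    /* Returns how many pieces of 'rs' remain after removing [start, end) from it. */
    static int
    remove_range(seg_t rs, uint64_t start, uint64_t end, seg_t out[2])
    {
            int n = 0;

            if (rs.start < start)                   /* left_over */
                    out[n++] = (seg_t){ rs.start, start };
            if (end < rs.end)                       /* right_over */
                    out[n++] = (seg_t){ end, rs.end };
            return (n);
    }

    int
    main(void)
    {
            seg_t pieces[2];
            int n = remove_range((seg_t){ 0, 100 }, 30, 60, pieces);

            for (int i = 0; i < n; i++) {
                    printf("[%llu, %llu)\n", (unsigned long long)pieces[i].start,
                        (unsigned long long)pieces[i].end);
            }
            /* prints [0, 30) and [60, 100) */
            return (0);
    }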
range_tree_remove(rt, free_start, free_end - free_start); } } @@ -579,7 +659,7 @@ range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) range_tree_t *rt; ASSERT0(range_tree_space(*rtdst)); - ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); + ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root)); rt = *rtsrc; *rtsrc = *rtdst; @@ -589,16 +669,20 @@ range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) { - range_seg_t *rs; - void *cookie = NULL; - if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); - while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { - if (func != NULL) - func(arg, rs->rs_start, rs->rs_end - rs->rs_start); - kmem_cache_free(range_seg_cache, rs); + if (func != NULL) { + range_seg_t *rs; + zfs_btree_index_t *cookie = NULL; + + while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) != + NULL) { + func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } + } else { + zfs_btree_clear(&rt->rt_root); } bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); @@ -608,16 +692,18 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) { - for (range_seg_t *rs = avl_first(&rt->rt_root); rs; - rs = AVL_NEXT(&rt->rt_root, rs)) { - func(arg, rs->rs_start, rs->rs_end - rs->rs_start); + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); + rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - + rs_get_start(rs, rt)); } } range_seg_t * range_tree_first(range_tree_t *rt) { - return (avl_first(&rt->rt_root)); + return (zfs_btree_first(&rt->rt_root, NULL)); } uint64_t @@ -629,7 +715,7 @@ range_tree_space(range_tree_t *rt) uint64_t range_tree_numsegs(range_tree_t *rt) { - return ((rt == NULL) ? 0 : avl_numnodes(&rt->rt_root)); + return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root)); } boolean_t @@ -639,70 +725,75 @@ range_tree_is_empty(range_tree_t *rt) return (range_tree_space(rt) == 0); } -/* Generic range tree functions for maintaining segments in an AVL tree. 
*/ +/* ARGSUSED */ void -rt_avl_create(range_tree_t *rt, void *arg) +rt_btree_create(range_tree_t *rt, void *arg) { - avl_tree_t *tree = arg; + zfs_btree_t *size_tree = arg; - avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t), - offsetof(range_seg_t, rs_pp_node)); + size_t size; + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + break; + default: + panic("Invalid range seg type %d", rt->rt_type); + } + zfs_btree_create(size_tree, rt->rt_btree_compare, size); } +/* ARGSUSED */ void -rt_avl_destroy(range_tree_t *rt, void *arg) +rt_btree_destroy(range_tree_t *rt, void *arg) { - avl_tree_t *tree = arg; + zfs_btree_t *size_tree = arg; + ASSERT0(zfs_btree_numnodes(size_tree)); - ASSERT0(avl_numnodes(tree)); - avl_destroy(tree); + zfs_btree_destroy(size_tree); } +/* ARGSUSED */ void -rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg) +rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg) { - avl_tree_t *tree = arg; - avl_add(tree, rs); + zfs_btree_t *size_tree = arg; + + zfs_btree_add(size_tree, rs); } +/* ARGSUSED */ void -rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { - avl_tree_t *tree = arg; - avl_remove(tree, rs); + zfs_btree_t *size_tree = arg; + + zfs_btree_remove(size_tree, rs); } +/* ARGSUSED */ void -rt_avl_vacate(range_tree_t *rt, void *arg) +rt_btree_vacate(range_tree_t *rt, void *arg) { - /* - * Normally one would walk the tree freeing nodes along the way. - * Since the nodes are shared with the range trees we can avoid - * walking all nodes and just reinitialize the avl tree. The nodes - * will be freed by the range tree, so we don't want to free them here. - */ - rt_avl_create(rt, arg); + zfs_btree_t *size_tree = arg; + zfs_btree_clear(size_tree); + zfs_btree_destroy(size_tree); + + rt_btree_create(rt, arg); } -uint64_t -range_tree_min(range_tree_t *rt) -{ - range_seg_t *rs = avl_first(&rt->rt_root); - return (rs != NULL ? rs->rs_start : 0); -} - -uint64_t -range_tree_max(range_tree_t *rt) -{ - range_seg_t *rs = avl_last(&rt->rt_root); - return (rs != NULL ? 
rs->rs_end : 0); -} - -uint64_t -range_tree_span(range_tree_t *rt) -{ - return (range_tree_max(rt) - range_tree_min(rt)); -} +range_tree_ops_t rt_btree_ops = { + .rtop_create = rt_btree_create, + .rtop_destroy = rt_btree_destroy, + .rtop_add = rt_btree_add, + .rtop_remove = rt_btree_remove, + .rtop_vacate = rt_btree_vacate +}; /* * Remove any overlapping ranges between the given segment [start, end) @@ -712,42 +803,62 @@ void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, range_tree_t *removefrom, range_tree_t *addto) { - avl_index_t where; - range_seg_t starting_rs = { - .rs_start = start, - .rs_end = start + 1 - }; + zfs_btree_index_t where; + range_seg_max_t starting_rs; + rs_set_start(&starting_rs, removefrom, start); + rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs, + removefrom) + 1); - range_seg_t *curr = avl_find(&removefrom->rt_root, + range_seg_t *curr = zfs_btree_find(&removefrom->rt_root, &starting_rs, &where); if (curr == NULL) - curr = avl_nearest(&removefrom->rt_root, where, AVL_AFTER); + curr = zfs_btree_next(&removefrom->rt_root, &where, &where); range_seg_t *next; for (; curr != NULL; curr = next) { - next = AVL_NEXT(&removefrom->rt_root, curr); - if (start == end) return; VERIFY3U(start, <, end); /* there is no overlap */ - if (end <= curr->rs_start) { + if (end <= rs_get_start(curr, removefrom)) { range_tree_add(addto, start, end - start); return; } - uint64_t overlap_start = MAX(curr->rs_start, start); - uint64_t overlap_end = MIN(curr->rs_end, end); + uint64_t overlap_start = MAX(rs_get_start(curr, removefrom), + start); + uint64_t overlap_end = MIN(rs_get_end(curr, removefrom), + end); uint64_t overlap_size = overlap_end - overlap_start; ASSERT3S(overlap_size, >, 0); + range_seg_max_t rs; + rs_copy(curr, &rs, removefrom); + range_tree_remove(removefrom, overlap_start, overlap_size); if (start < overlap_start) range_tree_add(addto, start, overlap_start - start); start = overlap_end; + next = zfs_btree_find(&removefrom->rt_root, &rs, &where); + /* + * If we find something here, we only removed part of the + * curr segment. Either there's some left at the end + * because we've reached the end of the range we're removing, + * or there's some left at the start because we started + * partway through the range. Either way, we continue with + * the loop. If it's the former, we'll return at the start of + * the loop, and if it's the latter we'll see if there is more + * area to process. + */ + if (next != NULL) { + ASSERT(start == end || start == rs_get_end(&rs, + removefrom)); + } + + next = zfs_btree_next(&removefrom->rt_root, &where, &where); } VERIFY3P(curr, ==, NULL); @@ -767,9 +878,30 @@ void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, range_tree_t *addto) { - for (range_seg_t *rs = avl_first(&rt->rt_root); rs; - rs = AVL_NEXT(&rt->rt_root, rs)) { - range_tree_remove_xor_add_segment(rs->rs_start, rs->rs_end, - removefrom, addto); + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + range_tree_remove_xor_add_segment(rs_get_start(rs, rt), + rs_get_end(rs, rt), removefrom, addto); } } + +uint64_t +range_tree_min(range_tree_t *rt) +{ + range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL); + return (rs != NULL ? rs_get_start(rs, rt) : 0); +} + +uint64_t +range_tree_max(range_tree_t *rt) +{ + range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL); + return (rs != NULL ? 
rs_get_end(rs, rt) : 0); +} + +uint64_t +range_tree_span(range_tree_t *rt) +{ + return (range_tree_max(rt) - range_tree_min(rt)); +} diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 98bd2d330..cc2a681bd 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -252,7 +252,7 @@ layout_num_compare(const void *arg1, const void *arg2) const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; - return (AVL_CMP(node1->lot_num, node2->lot_num)); + return (TREE_CMP(node1->lot_num, node2->lot_num)); } static int @@ -261,11 +261,11 @@ layout_hash_compare(const void *arg1, const void *arg2) const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; - int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash); + int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash); if (likely(cmp)) return (cmp); - return (AVL_CMP(node1->lot_instance, node2->lot_instance)); + return (TREE_CMP(node1->lot_instance, node2->lot_instance)); } boolean_t diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 7e5c474eb..529ffc03d 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -939,7 +939,7 @@ spa_error_entry_compare(const void *a, const void *b) ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); - return (AVL_ISIGN(ret)); + return (TREE_ISIGN(ret)); } /* diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index ac0dd5ba9..78bf110b4 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -58,6 +58,7 @@ #include #include #include "zfs_prop.h" +#include #include #include @@ -619,7 +620,7 @@ spa_log_sm_sort_by_txg(const void *va, const void *vb) const spa_log_sm_t *a = va; const spa_log_sm_t *b = vb; - return (AVL_CMP(a->sls_txg, b->sls_txg)); + return (TREE_CMP(a->sls_txg, b->sls_txg)); } /* @@ -939,7 +940,7 @@ spa_aux_compare(const void *a, const void *b) const spa_aux_t *sa = (const spa_aux_t *)a; const spa_aux_t *sb = (const spa_aux_t *)b; - return (AVL_CMP(sa->aux_guid, sb->aux_guid)); + return (TREE_CMP(sa->aux_guid, sb->aux_guid)); } void @@ -2270,7 +2271,7 @@ spa_name_compare(const void *a1, const void *a2) s = strcmp(s1->spa_name, s2->spa_name); - return (AVL_ISIGN(s)); + return (TREE_ISIGN(s)); } void @@ -2318,8 +2319,8 @@ spa_init(int mode) fm_init(); zfs_refcount_init(); unique_init(); - range_tree_init(); - metaslab_alloc_trace_init(); + zfs_btree_init(); + metaslab_stat_init(); ddt_init(); zio_init(); dmu_init(); @@ -2353,8 +2354,8 @@ spa_fini(void) dmu_fini(); zio_fini(); ddt_fini(); - metaslab_alloc_trace_fini(); - range_tree_fini(); + metaslab_stat_fini(); + zfs_btree_fini(); unique_fini(); zfs_refcount_fini(); fm_fini(); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index b9467b26b..eb2c36942 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -523,8 +523,9 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) * dbuf must be dirty for the changes in sm_phys to take effect. 
*/ static void -space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, - uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx) +space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend, + maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, + void *tag, dmu_tx_t *tx) { ASSERT3U(words, !=, 0); ASSERT3U(words, <=, 2); @@ -548,14 +549,14 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, ASSERT3P(block_cursor, <=, block_end); - uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + uint64_t size = (rend - rstart) >> sm->sm_shift; + uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift; uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX; - ASSERT3U(rs->rs_start, >=, sm->sm_start); - ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size); - ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size); - ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size); + ASSERT3U(rstart, >=, sm->sm_start); + ASSERT3U(rstart, <, sm->sm_start + sm->sm_size); + ASSERT3U(rend - rstart, <=, sm->sm_size); + ASSERT3U(rend, <=, sm->sm_start + sm->sm_size); while (size != 0) { ASSERT3P(block_cursor, <=, block_end); @@ -673,10 +674,14 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, dmu_buf_will_dirty(db, tx); - avl_tree_t *t = &rt->rt_root; - for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { - uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift; - uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + zfs_btree_t *t = &rt->rt_root; + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL; + rs = zfs_btree_next(t, &where, &where)) { + uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >> + sm->sm_shift; + uint64_t length = (rs_get_end(rs, rt) - rs_get_start(rs, rt)) >> + sm->sm_shift; uint8_t words = 1; /* @@ -701,8 +706,8 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, spa_get_random(100) == 0))) words = 2; - space_map_write_seg(sm, rs, maptype, vdev_id, words, - &db, FTAG, tx); + space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs, + rt), maptype, vdev_id, words, &db, FTAG, tx); } dmu_buf_rele(db, FTAG); @@ -749,7 +754,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, else sm->sm_phys->smp_alloc -= range_tree_space(rt); - uint64_t nodes = avl_numnodes(&rt->rt_root); + uint64_t nodes = zfs_btree_numnodes(&rt->rt_root); uint64_t rt_space = range_tree_space(rt); space_map_write_impl(sm, rt, maptype, vdev_id, tx); @@ -758,7 +763,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * Ensure that the space_map's accounting wasn't changed * while we were in the middle of writing it out. */ - VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root)); + VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root)); VERIFY3U(range_tree_space(rt), ==, rt_space); } diff --git a/module/zfs/space_reftree.c b/module/zfs/space_reftree.c index aa289ba10..080fc6646 100644 --- a/module/zfs/space_reftree.c +++ b/module/zfs/space_reftree.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. 
*/ #include @@ -57,11 +57,11 @@ space_reftree_compare(const void *x1, const void *x2) const space_ref_t *sr1 = (const space_ref_t *)x1; const space_ref_t *sr2 = (const space_ref_t *)x2; - int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset); + int cmp = TREE_CMP(sr1->sr_offset, sr2->sr_offset); if (likely(cmp)) return (cmp); - return (AVL_PCMP(sr1, sr2)); + return (TREE_PCMP(sr1, sr2)); } void @@ -109,10 +109,13 @@ space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) { - range_seg_t *rs; + zfs_btree_index_t where; - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) - space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt); + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs = + zfs_btree_next(&rt->rt_root, &where, &where)) { + space_reftree_add_seg(t, rs_get_start(rs, rt), rs_get_end(rs, + rt), refcnt); + } } /* diff --git a/module/zfs/unique.c b/module/zfs/unique.c index 5cdd025f4..0e076797a 100644 --- a/module/zfs/unique.c +++ b/module/zfs/unique.c @@ -45,7 +45,7 @@ unique_compare(const void *a, const void *b) const unique_t *una = (const unique_t *)a; const unique_t *unb = (const unique_t *)b; - return (AVL_CMP(una->un_value, unb->un_value)); + return (TREE_CMP(una->un_value, unb->un_value)); } void diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index af2d1a25a..3a120b001 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -216,7 +216,7 @@ vdev_getops(const char *type) /* ARGSUSED */ void -vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) +vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res) { res->rs_start = in->rs_start; res->rs_end = in->rs_end; @@ -528,7 +528,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); - vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); + vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); /* * Initialize rate limit structs for events. 
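The space_reftree_add_map() hunk above shows the iteration pattern repeated throughout this patch: avl_first()/AVL_NEXT() over rt->rt_root become zfs_btree_first()/zfs_btree_next() with an explicit zfs_btree_index_t cursor, and segment bounds are read through rs_get_start()/rs_get_end() because the in-memory segment layout now depends on the tree's segment type. A condensed sketch of that idiom; total_segment_space() is a hypothetical helper shown only to isolate the walk:

static uint64_t
total_segment_space(range_tree_t *rt)
{
	zfs_btree_index_t where;
	uint64_t total = 0;

	/* Walk every segment in ascending order, reading bounds via accessors. */
	for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
	    rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where))
		total += rs_get_end(rs, rt) - rs_get_start(rs, rt);

	return (total);
}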
We rate limit ZIO delay @@ -561,7 +562,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { - vd->vdev_dtl[t] = range_tree_create(NULL, NULL); + vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, + 0); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); @@ -2539,14 +2541,11 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) static uint64_t vdev_dtl_min(vdev_t *vd) { - range_seg_t *rs; - ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); - rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); - return (rs->rs_start - 1); + return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* @@ -2555,14 +2554,11 @@ vdev_dtl_min(vdev_t *vd) static uint64_t vdev_dtl_max(vdev_t *vd) { - range_seg_t *rs; - ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); - rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); - return (rs->rs_end); + return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* @@ -2883,7 +2879,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) ASSERT(vd->vdev_dtl_sm != NULL); } - rtsync = range_tree_create(NULL, NULL); + rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); @@ -4730,7 +4726,8 @@ vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd) * translation function to do the real conversion. */ void -vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) +vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs) { /* * Walk up the vdev tree @@ -4757,7 +4754,7 @@ vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) * range into its physical components by calling the * vdev specific translate function. 
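vdev_xlate() and the per-vdev translate callbacks now take range_seg64_t, a plain start/end pair that lives on the caller's stack rather than a tree-resident range_seg_t, which is why the zeroed aggregate initializer in the next hunk shrinks to { 0 }. A minimal usage sketch; physical_size_of() is a hypothetical helper, and the field layout noted in the comment is inferred from how rs_start/rs_end are used in these hunks rather than quoted from the header:

/*
 * Assumed layout of range_seg64_t (inferred from usage here):
 *	uint64_t rs_start;	first byte covered by the segment
 *	uint64_t rs_end;	first byte past the segment
 */
static uint64_t
physical_size_of(vdev_t *vd, uint64_t offset, uint64_t size)
{
	range_seg64_t logical_rs, physical_rs;

	logical_rs.rs_start = offset;
	logical_rs.rs_end = offset + size;
	vdev_xlate(vd, &logical_rs, &physical_rs);

	return (physical_rs.rs_end - physical_rs.rs_start);
}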
*/ - range_seg_t intermediate = { { { 0, 0 } } }; + range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); physical_rs->rs_start = intermediate.rs_start; diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c index 35a5b480b..6cf449d4a 100644 --- a/module/zfs/vdev_cache.c +++ b/module/zfs/vdev_cache.c @@ -111,7 +111,7 @@ vdev_cache_offset_compare(const void *a1, const void *a2) const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - return (AVL_CMP(ve1->ve_offset, ve2->ve_offset)); + return (TREE_CMP(ve1->ve_offset, ve2->ve_offset)); } static int @@ -120,7 +120,7 @@ vdev_cache_lastused_compare(const void *a1, const void *a2) const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - int cmp = AVL_CMP(ve1->ve_lastused, ve2->ve_lastused); + int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused); if (likely(cmp)) return (cmp); diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 169b9282d..a27ab3c25 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -291,11 +291,13 @@ vdev_initialize_block_free(abd_t *data) static int vdev_initialize_ranges(vdev_t *vd, abd_t *data) { - avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root; + range_tree_t *rt = vd->vdev_initialize_tree; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t where; - for (range_seg_t *rs = avl_first(rt); rs != NULL; - rs = AVL_NEXT(rt, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; + for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL; + rs = zfs_btree_next(bt, &where, &where)) { + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); /* Split range into legally-sized physical chunks */ uint64_t writes_required = @@ -305,7 +307,7 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data) int error; error = vdev_initialize_write(vd, - VDEV_LABEL_START_SIZE + rs->rs_start + + VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) + (w * zfs_initialize_chunk_size), MIN(size - (w * zfs_initialize_chunk_size), zfs_initialize_chunk_size), data); @@ -341,7 +343,7 @@ vdev_initialize_calculate_progress(vdev_t *vd) * on our vdev. We use this to determine if we are * in the middle of this metaslab range. 
*/ - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; vdev_xlate(vd, &logical_rs, &physical_rs); @@ -365,10 +367,14 @@ vdev_initialize_calculate_progress(vdev_t *vd) */ VERIFY0(metaslab_load(msp)); - for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); - rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { - logical_rs.rs_start = rs->rs_start; - logical_rs.rs_end = rs->rs_end; + zfs_btree_index_t where; + range_tree_t *rt = msp->ms_allocatable; + for (range_seg_t *rs = + zfs_btree_first(&rt->rt_root, &where); rs; + rs = zfs_btree_next(&rt->rt_root, &where, + &where)) { + logical_rs.rs_start = rs_get_start(rs, rt); + logical_rs.rs_end = rs_get_end(rs, rt); vdev_xlate(vd, &logical_rs, &physical_rs); uint64_t size = physical_rs.rs_end - @@ -422,7 +428,7 @@ void vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; @@ -481,7 +487,8 @@ vdev_initialize_thread(void *arg) abd_t *deadbeef = vdev_initialize_block_alloc(); - vd->vdev_initialize_tree = range_tree_create(NULL, NULL); + vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) { diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 6320732ed..6bb3c3c68 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1197,12 +1197,12 @@ retry: static int vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { - int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); + int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg); if (likely(cmp)) return (cmp); - cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp); + cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp); if (likely(cmp)) return (cmp); @@ -1226,7 +1226,7 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) seq2 = MMP_SEQ(ub2); - return (AVL_CMP(seq1, seq2)); + return (TREE_CMP(seq1, seq2)); } struct ubl_cbdata { diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 5e4db3df6..e156e2b01 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -218,12 +218,12 @@ vdev_queue_offset_compare(const void *x1, const void *x2) const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = AVL_CMP(z1->io_offset, z2->io_offset); + int cmp = TREE_CMP(z1->io_offset, z2->io_offset); if (likely(cmp)) return (cmp); - return (AVL_PCMP(z1, z2)); + return (TREE_PCMP(z1, z2)); } static inline avl_tree_t * @@ -250,12 +250,12 @@ vdev_queue_timestamp_compare(const void *x1, const void *x2) const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = AVL_CMP(z1->io_timestamp, z2->io_timestamp); + int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); if (likely(cmp)) return (cmp); - return (AVL_PCMP(z1, z2)); + return (TREE_PCMP(z1, z2)); } static int diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index f63ccaa94..3408ddf72 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 
* Copyright (c) 2016 Gvozden Nešković. All rights reserved. */ @@ -1638,7 +1638,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = zio->io_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_raidz_asize(zio->io_vd, zio->io_size); @@ -2372,7 +2372,7 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) } static void -vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) +vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) { vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 549087163..a4fac1cc5 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -198,11 +198,12 @@ spa_vdev_removal_create(vdev_t *vd) spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); - svr->svr_allocd_segs = range_tree_create(NULL, NULL); + svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { - svr->svr_frees[i] = range_tree_create(NULL, NULL); + svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); @@ -967,18 +968,15 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, * the allocation at the end of a segment, thus avoiding * additional split blocks. */ - range_seg_t search; - avl_index_t where; - search.rs_start = start + maxalloc; - search.rs_end = search.rs_start; - range_seg_t *rs = avl_find(&segs->rt_root, &search, &where); - if (rs == NULL) { - rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE); - } else { - rs = AVL_PREV(&segs->rt_root, rs); - } + range_seg_max_t search; + zfs_btree_index_t where; + rs_set_start(&search, segs, start + maxalloc); + rs_set_end(&search, segs, start + maxalloc); + (void) zfs_btree_find(&segs->rt_root, &search, &where); + range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, + &where); if (rs != NULL) { - size = rs->rs_end - start; + size = rs_get_end(rs, segs) - start; } else { /* * There are no segments that end before maxalloc. @@ -1011,20 +1009,22 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, * relative to the start of the range to be copied (i.e. relative to the * local variable "start"). 
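The spa_vdev_copy_segment() hunk above replaces the avl_find()/avl_nearest(AVL_BEFORE) pair with a B-tree probe: a range_seg_max_t scratch key is filled in through rs_set_start()/rs_set_end() (the key encoding depends on the tree's segment type), zfs_btree_find() positions the cursor whether or not an exact match exists, and zfs_btree_prev() then steps back to the nearest earlier segment. A condensed sketch of that lookup; find_segment_before() is a hypothetical name used only to isolate the pattern:

/*
 * Hypothetical helper: return the last segment in rt that sorts before a
 * zero-length segment at 'offset', or NULL if there is none.
 */
static range_seg_t *
find_segment_before(range_tree_t *rt, uint64_t offset)
{
	range_seg_max_t search;
	zfs_btree_index_t where;

	rs_set_start(&search, rt, offset);
	rs_set_end(&search, rt, offset);

	/*
	 * On a hit 'where' points at the match, on a miss at the insertion
	 * point; either way zfs_btree_prev() yields the preceding segment.
	 */
	(void) zfs_btree_find(&rt->rt_root, &search, &where);
	return (zfs_btree_prev(&rt->rt_root, &where, &where));
}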
*/ - range_tree_t *obsolete_segs = range_tree_create(NULL, NULL); + range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); - range_seg_t *rs = avl_first(&segs->rt_root); - ASSERT3U(rs->rs_start, ==, start); - uint64_t prev_seg_end = rs->rs_end; - while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) { - if (rs->rs_start >= start + size) { + zfs_btree_index_t where; + range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); + ASSERT3U(rs_get_start(rs, segs), ==, start); + uint64_t prev_seg_end = rs_get_end(rs, segs); + while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) { + if (rs_get_start(rs, segs) >= start + size) { break; } else { range_tree_add(obsolete_segs, prev_seg_end - start, - rs->rs_start - prev_seg_end); + rs_get_start(rs, segs) - prev_seg_end); } - prev_seg_end = rs->rs_end; + prev_seg_end = rs_get_end(rs, segs); } /* We don't end in the middle of an obsolete range */ ASSERT3U(start + size, <=, prev_seg_end); @@ -1268,9 +1268,10 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, * allocated segments that we are copying. We may also be copying * free segments (of up to vdev_removal_max_span bytes). */ - range_tree_t *segs = range_tree_create(NULL, NULL); + range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); for (;;) { - range_seg_t *rs = range_tree_first(svr->svr_allocd_segs); + range_tree_t *rt = svr->svr_allocd_segs; + range_seg_t *rs = range_tree_first(rt); if (rs == NULL) break; @@ -1279,17 +1280,17 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, if (range_tree_is_empty(segs)) { /* need to truncate the first seg based on max_alloc */ - seg_length = - MIN(rs->rs_end - rs->rs_start, *max_alloc); + seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs, + rt), *max_alloc); } else { - if (rs->rs_start - range_tree_max(segs) > + if (rs_get_start(rs, rt) - range_tree_max(segs) > vdev_removal_max_span) { /* * Including this segment would cause us to * copy a larger unneeded chunk than is allowed. 
*/ break; - } else if (rs->rs_end - range_tree_min(segs) > + } else if (rs_get_end(rs, rt) - range_tree_min(segs) > *max_alloc) { /* * This additional segment would extend past @@ -1298,13 +1299,14 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, */ break; } else { - seg_length = rs->rs_end - rs->rs_start; + seg_length = rs_get_end(rs, rt) - + rs_get_start(rs, rt); } } - range_tree_add(segs, rs->rs_start, seg_length); + range_tree_add(segs, rs_get_start(rs, rt), seg_length); range_tree_remove(svr->svr_allocd_segs, - rs->rs_start, seg_length); + rs_get_start(rs, rt), seg_length); } if (range_tree_is_empty(segs)) { @@ -1483,7 +1485,7 @@ spa_vdev_remove_thread(void *arg) vca.vca_msp = msp; zfs_dbgmsg("copying %llu segments for metaslab %llu", - avl_numnodes(&svr->svr_allocd_segs->rt_root), + zfs_btree_numnodes(&svr->svr_allocd_segs->rt_root), msp->ms_id); while (!svr->svr_thread_exit && diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 771e7a159..c7c429cbd 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -523,7 +523,8 @@ static int vdev_trim_ranges(trim_args_t *ta) { vdev_t *vd = ta->trim_vdev; - avl_tree_t *rt = &ta->trim_tree->rt_root; + zfs_btree_t *t = &ta->trim_tree->rt_root; + zfs_btree_index_t idx; uint64_t extent_bytes_max = ta->trim_extent_bytes_max; uint64_t extent_bytes_min = ta->trim_extent_bytes_min; spa_t *spa = vd->vdev_spa; @@ -531,9 +532,10 @@ vdev_trim_ranges(trim_args_t *ta) ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; - for (range_seg_t *rs = avl_first(rt); rs != NULL; - rs = AVL_NEXT(rt, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; + for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; + rs = zfs_btree_next(t, &idx, &idx)) { + uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs, + ta->trim_tree); if (extent_bytes_min && size < extent_bytes_min) { spa_iostats_trim_add(spa, ta->trim_type, @@ -548,9 +550,9 @@ vdev_trim_ranges(trim_args_t *ta) int error; error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + - rs->rs_start + (w * extent_bytes_max), - MIN(size - (w * extent_bytes_max), - extent_bytes_max)); + rs_get_start(rs, ta->trim_tree) + + (w *extent_bytes_max), MIN(size - + (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { return (error); } @@ -588,7 +590,7 @@ vdev_trim_calculate_progress(vdev_t *vd) * on our vdev. We use this to determine if we are * in the middle of this metaslab range. 
*/ - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; vdev_xlate(vd, &logical_rs, &physical_rs); @@ -611,10 +613,13 @@ vdev_trim_calculate_progress(vdev_t *vd) */ VERIFY0(metaslab_load(msp)); - for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); - rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { - logical_rs.rs_start = rs->rs_start; - logical_rs.rs_end = rs->rs_end; + range_tree_t *rt = msp->ms_allocatable; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t idx; + for (range_seg_t *rs = zfs_btree_first(bt, &idx); + rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) { + logical_rs.rs_start = rs_get_start(rs, rt); + logical_rs.rs_end = rs_get_end(rs, rt); vdev_xlate(vd, &logical_rs, &physical_rs); uint64_t size = physical_rs.rs_end - @@ -706,7 +711,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) { trim_args_t *ta = arg; vdev_t *vd = ta->trim_vdev; - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; @@ -719,7 +724,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) metaslab_t *msp = ta->trim_msp; VERIFY0(metaslab_load(msp)); VERIFY3B(msp->ms_loaded, ==, B_TRUE); - VERIFY(range_tree_find(msp->ms_allocatable, start, size)); + VERIFY(range_tree_contains(msp->ms_allocatable, start, size)); } ASSERT(vd->vdev_ops->vdev_op_leaf); @@ -798,7 +803,7 @@ vdev_trim_thread(void *arg) ta.trim_vdev = vd; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min; - ta.trim_tree = range_tree_create(NULL, NULL); + ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_flags = 0; @@ -1080,7 +1085,7 @@ vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size) VERIFY3B(msp->ms_loaded, ==, B_TRUE); VERIFY3U(msp->ms_disabled, >, 0); - VERIFY(range_tree_find(msp->ms_allocatable, start, size) != NULL); + VERIFY(range_tree_contains(msp->ms_allocatable, start, size)); } /* @@ -1178,7 +1183,8 @@ vdev_autotrim_thread(void *arg) * Allocate an empty range tree which is swapped in * for the existing ms_trim tree while it is processed. 
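Two VERIFY() sites above switch from range_tree_find() to range_tree_contains(): with segments packed into B-tree leaves there is no stable node pointer worth handing back, and these callers only need to know whether the whole run [start, start + size) is covered. A small sketch of that boolean-style check; verify_extent_allocatable() is a hypothetical wrapper, and the boolean return of range_tree_contains() is assumed from how VERIFY() consumes it here:

/* Hypothetical wrapper: assert an extent is still present in ms_allocatable. */
static void
verify_extent_allocatable(metaslab_t *msp, uint64_t start, uint64_t size)
{
	VERIFY(msp->ms_loaded);
	VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
}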
*/ - trim_tree = range_tree_create(NULL, NULL); + trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); range_tree_swap(&msp->ms_trim, &trim_tree); ASSERT(range_tree_is_empty(msp->ms_trim)); @@ -1232,7 +1238,8 @@ vdev_autotrim_thread(void *arg) if (!cvd->vdev_ops->vdev_op_leaf) continue; - ta->trim_tree = range_tree_create(NULL, NULL); + ta->trim_tree = range_tree_create(NULL, + RANGE_SEG64, NULL, 0, 0); range_tree_walk(trim_tree, vdev_trim_range_add, ta); } diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 467812ff6..c3d42d0ee 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -280,11 +280,11 @@ mze_compare(const void *arg1, const void *arg2) const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; - int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash); + int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash); if (likely(cmp)) return (cmp); - return (AVL_CMP(mze1->mze_cd, mze2->mze_cd)); + return (TREE_CMP(mze1->mze_cd, mze2->mze_cd)); } static void diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 78071707a..925d1934f 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -73,7 +73,7 @@ idx_compare(const void *arg1, const void *arg2) const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; - return (AVL_CMP(node1->f_idx, node2->f_idx)); + return (TREE_CMP(node1->f_idx, node2->f_idx)); } /* @@ -88,7 +88,7 @@ domain_compare(const void *arg1, const void *arg2) val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); - return (AVL_ISIGN(val)); + return (TREE_ISIGN(val)); } void diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c index 94203a40c..db587c731 100644 --- a/module/zfs/zfs_rlock.c +++ b/module/zfs/zfs_rlock.c @@ -109,7 +109,7 @@ zfs_rangelock_compare(const void *arg1, const void *arg2) const locked_range_t *rl1 = (const locked_range_t *)arg1; const locked_range_t *rl2 = (const locked_range_t *)arg2; - return (AVL_CMP(rl1->lr_offset, rl2->lr_offset)); + return (TREE_CMP(rl1->lr_offset, rl2->lr_offset)); } /* diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 30a73515c..7e65ac090 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -146,11 +146,11 @@ zil_bp_compare(const void *x1, const void *x2) const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; - int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (likely(cmp)) return (cmp); - return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); + return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void @@ -535,7 +535,7 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; - return (AVL_CMP(v1, v2)); + return (TREE_CMP(v1, v2)); } static lwb_t * @@ -1869,7 +1869,7 @@ zil_aitx_compare(const void *x1, const void *x2) const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; - return (AVL_CMP(o1, o2)); + return (TREE_CMP(o1, o2)); } /*