mirror_zfs/include/sys/avl.h

326 lines
9.2 KiB
C
Raw Normal View History

2008-11-20 23:01:55 +03:00
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
2008-11-20 23:01:55 +03:00
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
2008-11-20 23:01:55 +03:00
* Use is subject to license terms.
*/
/*
* Copyright (c) 2014 by Delphix. All rights reserved.
*/
2008-11-20 23:01:55 +03:00
#ifndef _AVL_H
#define _AVL_H extern __attribute__((visibility("default")))
2008-11-20 23:01:55 +03:00
/*
* This is a private header file. Applications should not directly include
* this file.
*/
#ifdef __cplusplus
extern "C" {
#endif
#include <sys/types.h>
2008-11-20 23:01:55 +03:00
#include <sys/avl_impl.h>
/*
* This is a generic implementation of AVL trees for use in the Solaris kernel.
* The interfaces provide an efficient way of implementing an ordered set of
* data structures.
*
* AVL trees provide an alternative to using an ordered linked list. Using AVL
* trees will usually be faster, however they require more storage. An ordered
* linked list in general requires 2 pointers in each data structure. The
* AVL tree implementation uses 3 pointers. The following chart gives the
* approximate performance of operations with the different approaches:
*
* Operation Link List AVL tree
* --------- -------- --------
* lookup O(n) O(log(n))
*
* insert 1 node constant constant
*
* delete 1 node constant between constant and O(log(n))
*
* delete all nodes O(n) O(n)
*
* visit the next
* or prev node constant between constant and O(log(n))
*
*
* The data structure nodes are anchored at an "avl_tree_t" (the equivalent
* of a list header) and the individual nodes will have a field of
* type "avl_node_t" (corresponding to list pointers).
*
* The type "avl_index_t" is used to indicate a position in the list for
* certain calls.
*
* The usage scenario is generally:
*
* 1. Create the list/tree with: avl_create()
*
* followed by any mixture of:
*
* 2a. Insert nodes with: avl_add(), or avl_find() and avl_insert()
*
* 2b. Visit elements with:
* avl_first() - returns the lowest valued node
* avl_last() - returns the highest valued node
* AVL_NEXT() - given a node go to next higher one
* AVL_PREV() - given a node go to previous lower one
*
* 2c. Find the node with the closest value either less than or greater
* than a given value with avl_nearest().
*
* 2d. Remove individual nodes from the list/tree with avl_remove().
*
* and finally when the list is being destroyed
*
* 3. Use avl_destroy_nodes() to quickly process/free up any remaining nodes.
* Note that once you use avl_destroy_nodes(), you can no longer
* use any routine except avl_destroy_nodes() and avl_destroy().
*
* 4. Use avl_destroy() to destroy the AVL tree itself.
*
* Any locking for multiple thread access is up to the user to provide, just
* as is needed for any linked list implementation.
*/
Performance optimization of AVL tree comparator functions perf: 2.75x faster ddt_entry_compare() First 256bits of ddt_key_t is a block checksum, which are expected to be close to random data. Hence, on average, comparison only needs to look at first few bytes of the keys. To reduce number of conditional jump instructions, the result is computed as: sign(memcmp(k1, k2)). Sign of an integer 'a' can be obtained as: `(0 < a) - (a < 0)` := {-1, 0, 1} , which is computed efficiently. Synthetic performance evaluation of original and new algorithm over 1G random keys on 2.6GHz Intel(R) Xeon(R) CPU E5-2660 v3: old 6.85789 s new 2.49089 s perf: 2.8x faster vdev_queue_offset_compare() and vdev_queue_timestamp_compare() Compute the result directly instead of using conditionals perf: zfs_range_compare() Speedup between 1.1x - 2.5x, depending on compiler version and optimization level. perf: spa_error_entry_compare() `bcmp()` is not suitable for comparator use. Use `memcmp()` instead. perf: 2.8x faster metaslab_compare() and metaslab_rangesize_compare() perf: 2.8x faster zil_bp_compare() perf: 2.8x faster mze_compare() perf: faster dbuf_compare() perf: faster compares in spa_misc perf: 2.8x faster layout_hash_compare() perf: 2.8x faster space_reftree_compare() perf: libzfs: faster avl tree comparators perf: guid_compare() perf: dsl_deadlist_compare() perf: perm_set_compare() perf: 2x faster range_tree_seg_compare() perf: faster unique_compare() perf: faster vdev_cache _compare() perf: faster vdev_uberblock_compare() perf: faster fuid _compare() perf: faster zfs_znode_hold_compare() Signed-off-by: Gvozden Neskovic <neskovic@gmail.com> Signed-off-by: Richard Elling <richard.elling@gmail.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #5033
2016-08-27 21:12:53 +03:00
/*
* AVL comparator helpers
*/
Reduce loaded range tree memory usage This patch implements a new tree structure for ZFS, and uses it to store range trees more efficiently. The new structure is approximately a B-tree, though there are some small differences from the usual characterizations. The tree has core nodes and leaf nodes; each contain data elements, which the elements in the core nodes acting as separators between its children. The difference between core and leaf nodes is that the core nodes have an array of children, while leaf nodes don't. Every node in the tree may be only partially full; in most cases, they are all at least 50% full (in terms of element count) except for the root node, which can be less full. Underfull nodes will steal from their neighbors or merge to remain full enough, while overfull nodes will split in two. The data elements are contained in tree-controlled buffers; they are copied into these on insertion, and overwritten on deletion. This means that the elements are not independently allocated, which reduces overhead, but also means they can't be shared between trees (and also that pointers to them are only valid until a side-effectful tree operation occurs). The overhead varies based on how dense the tree is, but is usually on the order of about 50% of the element size; the per-node overheads are very small, and so don't make a significant difference. The trees can accept arbitrary records; they accept a size and a comparator to allow them to be used for a variety of purposes. The new trees replace the AVL trees used in the range trees today. Currently, the range_seg_t structure contains three 8 byte integers of payload and two 24 byte avl_tree_node_ts to handle its storage in both an offset-sorted tree and a size-sorted tree (total size: 64 bytes). In the new model, the range seg structures are usually two 4 byte integers, but a separate one needs to exist for the size-sorted and offset-sorted tree. 
Between the raw size, the 50% overhead, and the double storage, the new btrees are expected to use 8*1.5*2 = 24 bytes per record, or 33.3% as much memory as the AVL trees (this is for the purposes of storing metaslab range trees; for other purposes, like scrubs, they use ~50% as much memory). We reduced the size of the payload in the range segments by teaching range trees about starting offsets and shifts; since metaslabs have a fixed starting offset, and they all operate in terms of disk sectors, we can store the ranges using 4-byte integers as long as the size of the metaslab divided by the sector size is less than 2^32. For 512-byte sectors, this is a 2^41 (or 2TB) metaslab, which with the default settings corresponds to a 256PB disk. 4k sector disks can handle metaslabs up to 2^46 bytes, or 2^63 byte disks. Since we do not anticipate disks of this size in the near future, there should be almost no cases where metaslabs need 64-byte integers to store their ranges. We do still have the capability to store 64-byte integer ranges to account for cases where we are storing per-vdev (or per-dnode) trees, which could reasonably go above the limits discussed. We also do not store fill information in the compact version of the node, since it is only used for sorted scrub. We also optimized the metaslab loading process in various other ways to offset some inefficiencies in the btree model. While individual operations (find, insert, remove_from) are faster for the btree than they are for the avl tree, remove usually requires a find operation, while in the AVL tree model the element itself suffices. Some clever changes actually caused an overall speedup in metaslab loading; we use approximately 40% less cpu to load metaslabs in our tests on Illumos. Another memory and performance optimization was achieved by changing what is stored in the size-sorted trees. 
When a disk is heavily fragmented, the df algorithm used by default in ZFS will almost always find a number of small regions in its initial cursor-based search; it will usually only fall back to the size-sorted tree to find larger regions. If we increase the size of the cursor-based search slightly, and don't store segments that are smaller than a tunable size floor in the size-sorted tree, we can further cut memory usage down to below 20% of what the AVL trees store. This also results in further reductions in CPU time spent loading metaslabs. The 16KiB size floor was chosen because it results in substantial memory usage reduction while not usually resulting in situations where we can't find an appropriate chunk with the cursor and are forced to use an oversized chunk from the size-sorted tree. In addition, even if we do have to use an oversized chunk from the size-sorted tree, the chunk would be too small to use for ZIL allocations, so it isn't as big of a loss as it might otherwise be. And often, more small allocations will follow the initial one, and the cursor search will now find the remainder of the chunk we didn't use all of and use it for subsequent allocations. Practical testing has shown little or no change in fragmentation as a result of this change. If the size-sorted tree becomes empty while the offset sorted one still has entries, it will load all the entries from the offset sorted tree and disregard the size floor until it is unloaded again. This operation occurs rarely with the default setting, only on incredibly thoroughly fragmented pools. There are some other small changes to zdb to teach it to handle btrees, but nothing major. Reviewed-by: George Wilson <gwilson@delphix.com> Reviewed-by: Matt Ahrens <matt@delphix.com> Reviewed by: Sebastien Roy seb@delphix.com Reviewed-by: Igor Kozhukhov <igor@dilos.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Paul Dagnelie <pcd@delphix.com> Closes #9181
2019-10-09 20:36:03 +03:00
/* Sign of a: evaluates to -1 when a < 0, 0 when a == 0, and +1 when a > 0. */
#define TREE_ISIGN(a) ((0 < (a)) - (0 > (a)))
/*
 * Three-way comparison of a and b: -1 when a < b, 0 when they are equal,
 * and +1 when a > b. Each argument is evaluated twice.
 */
#define TREE_CMP(a, b) (((b) < (a)) - ((b) > (a)))
/*
 * Three-way comparison of two pointers by address: -1 when a is lower than b,
 * 0 when they are the same address, and +1 when a is higher. Both operands
 * are cast to uintptr_t before being compared; each is evaluated twice.
 */
#define TREE_PCMP(a, b) \
	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
2008-11-20 23:01:55 +03:00
/*
* Type used for the root of the AVL tree.
*/
typedef struct avl_tree avl_tree_t;
/*
* The data nodes in the AVL tree must have a field of this type.
*/
typedef struct avl_node avl_node_t;
/*
* An opaque type used to locate a position in the tree where a node
* would be inserted.
*/
typedef uintptr_t avl_index_t;
/*
* Direction constants used for avl_nearest().
*/
#define AVL_BEFORE (0)	/* toward the previous (lower valued) node */
#define AVL_AFTER (1)	/* toward the next (higher valued) node */
/*
* Prototypes
*
* Where not otherwise mentioned, "void *" arguments are a pointer to the
* user data structure which must contain a field of type avl_node_t.
*
* Also assume the user data structure looks like:
* struct my_type {
* ...
* avl_node_t my_link;
* ...
* };
*/
/*
 * Initialize an AVL tree. Arguments are:
 *
 * tree   - the tree to be initialized
 * compar - function to compare two nodes; it must return exactly -1, 0, or +1:
 *          -1 for <, 0 for ==, and +1 for >
 * size   - the value of sizeof (struct my_type)
 * offset - the value of offsetof(struct my_type, my_link)
 */
_AVL_H void avl_create(avl_tree_t *tree,
    int (*compar) (const void *, const void *), size_t size, size_t offset);
/*
* Find a node with a matching value in the tree. Returns the matching node
* found. If not found, it returns NULL and then if "where" is not NULL it sets
* "where" for use with avl_insert() or avl_nearest().
*
* node - node that has the value being looked for
* where - position for use with avl_nearest() or avl_insert(), may be NULL
*/
_AVL_H void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where);
2008-11-20 23:01:55 +03:00
/*
* Insert a node into the tree.
*
* node - the node to insert
* where - position as returned from avl_find()
*/
_AVL_H void avl_insert(avl_tree_t *tree, void *node, avl_index_t where);
2008-11-20 23:01:55 +03:00
/*
 * Insert "new_data" in "tree" in the given "direction" either after
 * or before the data "here".
 *
 * This might be useful for avl clients caching recently accessed
 * data to avoid doing avl_find() again for insertion.
 *
 * new_data  - new data to insert
 * here      - existing node in "tree"
 * direction - either AVL_AFTER or AVL_BEFORE the data "here".
 */
_AVL_H void avl_insert_here(avl_tree_t *tree, void *new_data, void *here,
    int direction);
/*
* Return the first or last valued node in the tree. Will return NULL
* if the tree is empty.
*
*/
_AVL_H void *avl_first(avl_tree_t *tree);
_AVL_H void *avl_last(avl_tree_t *tree);
2008-11-20 23:01:55 +03:00
/*
* Return the next or previous valued node in the tree.
* AVL_NEXT() will return NULL if at the last node.
* AVL_PREV() will return NULL if at the first node.
*
* node - the node from which the next or previous node is found
*/
/* Walk from node to the next higher valued node; NULL at the last node. */
#define AVL_NEXT(tree, node) avl_walk((tree), (node), AVL_AFTER)
/* Walk from node to the previous lower valued node; NULL at the first node. */
#define AVL_PREV(tree, node) avl_walk((tree), (node), AVL_BEFORE)
/*
* Find the node with the nearest value either greater or less than
* the value from a previous avl_find(). Returns the node or NULL if
* there isn't a matching one.
*
* where - position as returned from avl_find()
* direction - either AVL_BEFORE or AVL_AFTER
*
* EXAMPLE get the greatest node that is less than a given value:
*
* avl_tree_t *tree;
* struct my_data look_for_value = {....};
* struct my_data *node;
* struct my_data *less;
* avl_index_t where;
*
* node = avl_find(tree, &look_for_value, &where);
* if (node != NULL)
* less = AVL_PREV(tree, node);
* else
* less = avl_nearest(tree, where, AVL_BEFORE);
*/
_AVL_H void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction);
2008-11-20 23:01:55 +03:00
/*
* Add a single node to the tree.
* The node must not be in the tree, and it must not
* compare equal to any other node already in the tree.
*
* node - the node to add
*/
_AVL_H void avl_add(avl_tree_t *tree, void *node);
2008-11-20 23:01:55 +03:00
/*
* Remove a single node from the tree. The node must be in the tree.
*
* node - the node to remove
*/
_AVL_H void avl_remove(avl_tree_t *tree, void *node);
2008-11-20 23:01:55 +03:00
/*
 * Reinsert a node only if its order has changed relative to its nearest
 * neighbors. To optimize performance avl_update_lt() checks only the previous
 * node and avl_update_gt() checks only the next node. Use avl_update_lt() and
 * avl_update_gt() only if you know the direction in which the order of the
 * node may change.
 *
 * tree - the tree that contains the node
 * node - the node whose position is to be re-checked
 */
_AVL_H boolean_t avl_update(avl_tree_t *tree, void *node);
_AVL_H boolean_t avl_update_lt(avl_tree_t *tree, void *node);
_AVL_H boolean_t avl_update_gt(avl_tree_t *tree, void *node);
/*
* Swaps the contents of the two trees.
*/
_AVL_H void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2);
2008-11-20 23:01:55 +03:00
/*
* Return the number of nodes in the tree
*/
_AVL_H ulong_t avl_numnodes(avl_tree_t *tree);
2008-11-20 23:01:55 +03:00
/*
* Return B_TRUE if there are zero nodes in the tree, B_FALSE otherwise.
*/
_AVL_H boolean_t avl_is_empty(avl_tree_t *tree);
2008-11-20 23:01:55 +03:00
/*
* Used to destroy any remaining nodes in a tree. The cookie argument should
* be initialized to NULL before the first call. Returns a node that has been
* removed from the tree and may be free()'d. Returns NULL when the tree is
* empty.
*
* Once you call avl_destroy_nodes(), you can only continue calling it and
* finally avl_destroy(). No other AVL routines will be valid.
*
* cookie - a "void *" used to save state between calls to avl_destroy_nodes()
*
* EXAMPLE:
* avl_tree_t *tree;
* struct my_data *node;
* void *cookie;
*
* cookie = NULL;
* while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
* free(node);
* avl_destroy(tree);
*/
_AVL_H void *avl_destroy_nodes(avl_tree_t *tree, void **cookie);
2008-11-20 23:01:55 +03:00
/*
* Final destroy of an AVL tree. Arguments are:
*
* tree - the empty tree to destroy
*/
_AVL_H void avl_destroy(avl_tree_t *tree);
2008-11-20 23:01:55 +03:00
#ifdef __cplusplus
}
#endif
#endif /* _AVL_H */