mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 10:37:35 +03:00
Optimize microzaps
The microzap on-disk format does not include a hash tree; one is expected to be built in RAM during mzap_open(). The built tree is linked to the DMU user buffer and freed when the original DMU buffer is dropped from cache. I've found that workloads accessing many large directories and having active eviction from the DMU cache spend a significant amount of time building and then destroying the trees. I've also found that for each 64-byte mzap element an additional 64-byte tree element is allocated, which is a waste of memory and CPU caches. Improve memory efficiency of the hash tree by switching from an AVL tree to a B-tree. This saves 24 bytes per element on pointers alone. Save 32 bits on mze_hash by storing only the upper 32 bits, since the lower 32 bits are always zero for microzaps. Save 16 bits on mze_chunkid, since a microzap can never have that many elements. Correspondingly, with only 16 bits of chunk ID there can be no more than 16 bits of collision differentiators. As a result, struct mzap_ent now drops from 48 (rounded to 64) to 8 bytes. Tune B-trees for small data. Reduce BTREE_CORE_ELEMS from 128 to 126 to allow struct zfs_btree_core, in the case of 8-byte elements, to pack into 2KB instead of 4KB. Aside from the microzaps, it should also help 32-bit range trees. Allow a custom B-tree leaf size to reduce memmove() time. Split zap_name_alloc() into zap_name_alloc() and zap_name_init_str(). This avoids wasting time allocating/freeing memory when processing multiple names in a loop during mzap_open(). Together, on a pool with 10K directories of 1800 files each and the DMU cache limited to 128MB, this reduces the time of `find . -name zzz` by 41%, from 7.63s to 4.47s, and saves an additional ~30% of CPU time on DMU cache reclamation. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #14039
This commit is contained in:
+8
-7
@@ -65,7 +65,7 @@ extern "C" {
|
||||
* them, and increased memory overhead. Increasing these values results in
|
||||
* higher variance in operation time, and reduces memory overhead.
|
||||
*/
|
||||
#define BTREE_CORE_ELEMS 128
|
||||
#define BTREE_CORE_ELEMS 126
|
||||
#define BTREE_LEAF_SIZE 4096
|
||||
|
||||
extern kmem_cache_t *zfs_btree_leaf_cache;
|
||||
@@ -95,9 +95,6 @@ typedef struct zfs_btree_leaf {
|
||||
uint8_t btl_elems[];
|
||||
} zfs_btree_leaf_t;
|
||||
|
||||
#define BTREE_LEAF_ESIZE (BTREE_LEAF_SIZE - \
|
||||
offsetof(zfs_btree_leaf_t, btl_elems))
|
||||
|
||||
typedef struct zfs_btree_index {
|
||||
zfs_btree_hdr_t *bti_node;
|
||||
uint32_t bti_offset;
|
||||
@@ -109,14 +106,15 @@ typedef struct zfs_btree_index {
|
||||
} zfs_btree_index_t;
|
||||
|
||||
typedef struct btree {
|
||||
zfs_btree_hdr_t *bt_root;
|
||||
int64_t bt_height;
|
||||
int (*bt_compar) (const void *, const void *);
|
||||
size_t bt_elem_size;
|
||||
size_t bt_leaf_size;
|
||||
uint32_t bt_leaf_cap;
|
||||
int32_t bt_height;
|
||||
uint64_t bt_num_elems;
|
||||
uint64_t bt_num_nodes;
|
||||
zfs_btree_hdr_t *bt_root;
|
||||
zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading
|
||||
int (*bt_compar) (const void *, const void *);
|
||||
} zfs_btree_t;
|
||||
|
||||
/*
|
||||
@@ -132,9 +130,12 @@ void zfs_btree_fini(void);
|
||||
* compar - function to compare two nodes, it must return exactly: -1, 0, or +1
|
||||
* -1 for <, 0 for ==, and +1 for >
|
||||
* size - the value of sizeof(struct my_type)
|
||||
* lsize - custom leaf size
|
||||
*/
|
||||
void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
|
||||
size_t);
|
||||
void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *),
|
||||
size_t, size_t);
|
||||
|
||||
/*
|
||||
* Find a node with a matching value in the tree. Returns the matching node
|
||||
|
||||
Reference in New Issue
Block a user