mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-26 18:04:22 +03:00
9dcdee7889
Microzap on-disk format does not include a hash tree, expecting one to be built in RAM during mzap_open(). The built tree is linked to DMU user buffer, freed when original DMU buffer is dropped from cache. I've found that workloads accessing many large directories and having active eviction from DMU cache spend significant amount of time building and then destroying the trees. I've also found that for each 64 byte mzap element additional 64 byte tree element is allocated, that is a waste of memory and CPU caches. Improve memory efficiency of the hash tree by switching from AVL-tree to B-tree. It allows to save 24 bytes per element just on pointers. Save 32 bits on mze_hash by storing only upper 32 bits since lower 32 bits are always zero for microzaps. Save 16 bits on mze_chunkid, since microzap can never have so many elements. Respectively with the 16 bits there can be no more than 16 bits of collision differentiators. As result, struct mzap_ent now drops from 48 (rounded to 64) to 8 bytes. Tune B-trees for small data. Reduce BTREE_CORE_ELEMS from 128 to 126 to allow struct zfs_btree_core in case of 8 byte elements to pack into 2KB instead of 4KB. Aside of the microzaps it should also help 32bit range trees. Allow custom B-tree leaf size to reduce memmove() time. Split zap_name_alloc() into zap_name_alloc() and zap_name_init_str(). It allows to not waste time allocating/freeing memory when processing multiple names in a loop during mzap_open(). Together on a pool with 10K directories of 1800 files each and DMU cache limited to 128MB this reduces time of `find . -name zzz` by 41% from 7.63s to 4.47s, and saves additional ~30% of CPU time on the DMU cache reclamation. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #14039
849 lines
23 KiB
C
849 lines
23 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or https://opensource.org/licenses/CDDL-1.0.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
|
|
/*
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
|
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
|
|
* Copyright 2017 Nexenta Systems, Inc.
|
|
*/
|
|
|
|
/*
|
|
* The 512-byte leaf is broken into 32 16-byte chunks.
|
|
* chunk number n means l_chunk[n], even though the header precedes it.
|
|
* the names are stored null-terminated.
|
|
*/
|
|
|
|
#include <sys/zio.h>
|
|
#include <sys/spa.h>
|
|
#include <sys/dmu.h>
|
|
#include <sys/zfs_context.h>
|
|
#include <sys/fs/zfs.h>
|
|
#include <sys/zap.h>
|
|
#include <sys/zap_impl.h>
|
|
#include <sys/zap_leaf.h>
|
|
#include <sys/arc.h>
|
|
|
|
static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
|
|
|
|
#define CHAIN_END 0xffff /* end of the chunk chain */
|
|
|
|
#define LEAF_HASH(l, h) \
|
|
((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
|
|
((h) >> \
|
|
(64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
|
|
|
|
#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
|
|
|
|
static void
|
|
zap_memset(void *a, int c, size_t n)
|
|
{
|
|
char *cp = a;
|
|
char *cpend = cp + n;
|
|
|
|
while (cp < cpend)
|
|
*cp++ = c;
|
|
}
|
|
|
|
static void
|
|
stv(int len, void *addr, uint64_t value)
|
|
{
|
|
switch (len) {
|
|
case 1:
|
|
*(uint8_t *)addr = value;
|
|
return;
|
|
case 2:
|
|
*(uint16_t *)addr = value;
|
|
return;
|
|
case 4:
|
|
*(uint32_t *)addr = value;
|
|
return;
|
|
case 8:
|
|
*(uint64_t *)addr = value;
|
|
return;
|
|
default:
|
|
cmn_err(CE_PANIC, "bad int len %d", len);
|
|
}
|
|
}
|
|
|
|
static uint64_t
|
|
ldv(int len, const void *addr)
|
|
{
|
|
switch (len) {
|
|
case 1:
|
|
return (*(uint8_t *)addr);
|
|
case 2:
|
|
return (*(uint16_t *)addr);
|
|
case 4:
|
|
return (*(uint32_t *)addr);
|
|
case 8:
|
|
return (*(uint64_t *)addr);
|
|
default:
|
|
cmn_err(CE_PANIC, "bad int len %d", len);
|
|
}
|
|
return (0xFEEDFACEDEADBEEFULL);
|
|
}
|
|
|
|
void
|
|
zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
|
|
{
|
|
zap_leaf_t l;
|
|
dmu_buf_t l_dbuf;
|
|
|
|
l_dbuf.db_data = buf;
|
|
l.l_bs = highbit64(size) - 1;
|
|
l.l_dbuf = &l_dbuf;
|
|
|
|
buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
|
|
buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
|
|
buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
|
|
buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
|
|
buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
|
|
buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
|
|
buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
|
|
|
|
for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
|
|
buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
|
|
|
|
for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
|
|
zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
|
|
struct zap_leaf_entry *le;
|
|
|
|
switch (lc->l_free.lf_type) {
|
|
case ZAP_CHUNK_ENTRY:
|
|
le = &lc->l_entry;
|
|
|
|
le->le_type = BSWAP_8(le->le_type);
|
|
le->le_value_intlen = BSWAP_8(le->le_value_intlen);
|
|
le->le_next = BSWAP_16(le->le_next);
|
|
le->le_name_chunk = BSWAP_16(le->le_name_chunk);
|
|
le->le_name_numints = BSWAP_16(le->le_name_numints);
|
|
le->le_value_chunk = BSWAP_16(le->le_value_chunk);
|
|
le->le_value_numints = BSWAP_16(le->le_value_numints);
|
|
le->le_cd = BSWAP_32(le->le_cd);
|
|
le->le_hash = BSWAP_64(le->le_hash);
|
|
break;
|
|
case ZAP_CHUNK_FREE:
|
|
lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
|
|
lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
|
|
break;
|
|
case ZAP_CHUNK_ARRAY:
|
|
lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
|
|
lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
|
|
/* la_array doesn't need swapping */
|
|
break;
|
|
default:
|
|
cmn_err(CE_PANIC, "bad leaf type %d",
|
|
lc->l_free.lf_type);
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
zap_leaf_init(zap_leaf_t *l, boolean_t sort)
|
|
{
|
|
l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
|
|
zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
|
|
sizeof (struct zap_leaf_header));
|
|
zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
|
|
2*ZAP_LEAF_HASH_NUMENTRIES(l));
|
|
for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
|
|
ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
|
|
ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
|
|
}
|
|
ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
|
|
zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
|
|
zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
|
|
zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
|
|
if (sort)
|
|
zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
|
|
}
|
|
|
|
/*
|
|
* Routines which manipulate leaf chunks (l_chunk[]).
|
|
*/
|
|
|
|
static uint16_t
|
|
zap_leaf_chunk_alloc(zap_leaf_t *l)
|
|
{
|
|
ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
|
|
|
|
int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
|
|
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
|
|
ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
|
|
|
|
zap_leaf_phys(l)->l_hdr.lh_freelist =
|
|
ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
|
|
|
|
zap_leaf_phys(l)->l_hdr.lh_nfree--;
|
|
|
|
return (chunk);
|
|
}
|
|
|
|
static void
|
|
zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
|
|
{
|
|
struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
|
|
ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
|
|
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
|
|
ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
|
|
|
|
zlf->lf_type = ZAP_CHUNK_FREE;
|
|
zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
|
|
memset(zlf->lf_pad, 0, sizeof (zlf->lf_pad)); /* help it to compress */
|
|
zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
|
|
|
|
zap_leaf_phys(l)->l_hdr.lh_nfree++;
|
|
}
|
|
|
|
/*
|
|
* Routines which manipulate leaf arrays (zap_leaf_array type chunks).
|
|
*/
|
|
|
|
static uint16_t
|
|
zap_leaf_array_create(zap_leaf_t *l, const char *buf,
|
|
int integer_size, int num_integers)
|
|
{
|
|
uint16_t chunk_head;
|
|
uint16_t *chunkp = &chunk_head;
|
|
int byten = 0;
|
|
uint64_t value = 0;
|
|
int shift = (integer_size - 1) * 8;
|
|
int len = num_integers;
|
|
|
|
ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN);
|
|
|
|
while (len > 0) {
|
|
uint16_t chunk = zap_leaf_chunk_alloc(l);
|
|
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
|
|
|
|
la->la_type = ZAP_CHUNK_ARRAY;
|
|
for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
|
|
if (byten == 0)
|
|
value = ldv(integer_size, buf);
|
|
la->la_array[i] = value >> shift;
|
|
value <<= 8;
|
|
if (++byten == integer_size) {
|
|
byten = 0;
|
|
buf += integer_size;
|
|
if (--len == 0)
|
|
break;
|
|
}
|
|
}
|
|
|
|
*chunkp = chunk;
|
|
chunkp = &la->la_next;
|
|
}
|
|
*chunkp = CHAIN_END;
|
|
|
|
return (chunk_head);
|
|
}
|
|
|
|
static void
|
|
zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
|
|
{
|
|
uint16_t chunk = *chunkp;
|
|
|
|
*chunkp = CHAIN_END;
|
|
|
|
while (chunk != CHAIN_END) {
|
|
int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
|
|
ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
|
|
ZAP_CHUNK_ARRAY);
|
|
zap_leaf_chunk_free(l, chunk);
|
|
chunk = nextchunk;
|
|
}
|
|
}
|
|
|
|
/* array_len and buf_len are in integers, not bytes */
|
|
static void
|
|
zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
|
|
int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
|
|
void *buf)
|
|
{
|
|
int len = MIN(array_len, buf_len);
|
|
int byten = 0;
|
|
uint64_t value = 0;
|
|
char *p = buf;
|
|
|
|
ASSERT3U(array_int_len, <=, buf_int_len);
|
|
|
|
/* Fast path for one 8-byte integer */
|
|
if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
|
|
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
|
|
uint8_t *ip = la->la_array;
|
|
uint64_t *buf64 = buf;
|
|
|
|
*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
|
|
(uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
|
|
(uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
|
|
(uint64_t)ip[6] << 8 | (uint64_t)ip[7];
|
|
return;
|
|
}
|
|
|
|
/* Fast path for an array of 1-byte integers (eg. the entry name) */
|
|
if (array_int_len == 1 && buf_int_len == 1 &&
|
|
buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
|
|
while (chunk != CHAIN_END) {
|
|
struct zap_leaf_array *la =
|
|
&ZAP_LEAF_CHUNK(l, chunk).l_array;
|
|
memcpy(p, la->la_array, ZAP_LEAF_ARRAY_BYTES);
|
|
p += ZAP_LEAF_ARRAY_BYTES;
|
|
chunk = la->la_next;
|
|
}
|
|
return;
|
|
}
|
|
|
|
while (len > 0) {
|
|
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
|
|
|
|
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
|
|
for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
|
|
value = (value << 8) | la->la_array[i];
|
|
byten++;
|
|
if (byten == array_int_len) {
|
|
stv(buf_int_len, p, value);
|
|
byten = 0;
|
|
len--;
|
|
if (len == 0)
|
|
return;
|
|
p += buf_int_len;
|
|
}
|
|
}
|
|
chunk = la->la_next;
|
|
}
|
|
}
|
|
|
|
static boolean_t
|
|
zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
|
|
int chunk, int array_numints)
|
|
{
|
|
int bseen = 0;
|
|
|
|
if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
|
|
uint64_t *thiskey =
|
|
kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP);
|
|
ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
|
|
|
|
zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
|
|
sizeof (*thiskey), array_numints, thiskey);
|
|
boolean_t match = memcmp(thiskey, zn->zn_key_orig,
|
|
array_numints * sizeof (*thiskey)) == 0;
|
|
kmem_free(thiskey, array_numints * sizeof (*thiskey));
|
|
return (match);
|
|
}
|
|
|
|
ASSERT(zn->zn_key_intlen == 1);
|
|
if (zn->zn_matchtype & MT_NORMALIZE) {
|
|
char *thisname = kmem_alloc(array_numints, KM_SLEEP);
|
|
|
|
zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
|
|
sizeof (char), array_numints, thisname);
|
|
boolean_t match = zap_match(zn, thisname);
|
|
kmem_free(thisname, array_numints);
|
|
return (match);
|
|
}
|
|
|
|
/*
|
|
* Fast path for exact matching.
|
|
* First check that the lengths match, so that we don't read
|
|
* past the end of the zn_key_orig array.
|
|
*/
|
|
if (array_numints != zn->zn_key_orig_numints)
|
|
return (B_FALSE);
|
|
while (bseen < array_numints) {
|
|
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
|
|
int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
|
|
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
|
|
if (memcmp(la->la_array, (char *)zn->zn_key_orig + bseen,
|
|
toread))
|
|
break;
|
|
chunk = la->la_next;
|
|
bseen += toread;
|
|
}
|
|
return (bseen == array_numints);
|
|
}
|
|
|
|
/*
|
|
* Routines which manipulate leaf entries.
|
|
*/
|
|
|
|
int
|
|
zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
|
|
{
|
|
struct zap_leaf_entry *le;
|
|
|
|
ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
|
|
|
|
for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
|
|
*chunkp != CHAIN_END; chunkp = &le->le_next) {
|
|
uint16_t chunk = *chunkp;
|
|
le = ZAP_LEAF_ENTRY(l, chunk);
|
|
|
|
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
|
|
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
|
|
|
|
if (le->le_hash != zn->zn_hash)
|
|
continue;
|
|
|
|
/*
|
|
* NB: the entry chain is always sorted by cd on
|
|
* normalized zap objects, so this will find the
|
|
* lowest-cd match for MT_NORMALIZE.
|
|
*/
|
|
ASSERT((zn->zn_matchtype == 0) ||
|
|
(zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
|
|
if (zap_leaf_array_match(l, zn, le->le_name_chunk,
|
|
le->le_name_numints)) {
|
|
zeh->zeh_num_integers = le->le_value_numints;
|
|
zeh->zeh_integer_size = le->le_value_intlen;
|
|
zeh->zeh_cd = le->le_cd;
|
|
zeh->zeh_hash = le->le_hash;
|
|
zeh->zeh_chunkp = chunkp;
|
|
zeh->zeh_leaf = l;
|
|
return (0);
|
|
}
|
|
}
|
|
|
|
return (SET_ERROR(ENOENT));
|
|
}
|
|
|
|
/* Return (h1,cd1 >= h2,cd2) */
|
|
#define HCD_GTEQ(h1, cd1, h2, cd2) \
|
|
((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
|
|
|
|
int
|
|
zap_leaf_lookup_closest(zap_leaf_t *l,
|
|
uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
|
|
{
|
|
uint64_t besth = -1ULL;
|
|
uint32_t bestcd = -1U;
|
|
uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
|
|
struct zap_leaf_entry *le;
|
|
|
|
ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
|
|
|
|
for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
|
|
for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh];
|
|
chunk != CHAIN_END; chunk = le->le_next) {
|
|
le = ZAP_LEAF_ENTRY(l, chunk);
|
|
|
|
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
|
|
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
|
|
|
|
if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
|
|
HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
|
|
ASSERT3U(bestlh, >=, lh);
|
|
bestlh = lh;
|
|
besth = le->le_hash;
|
|
bestcd = le->le_cd;
|
|
|
|
zeh->zeh_num_integers = le->le_value_numints;
|
|
zeh->zeh_integer_size = le->le_value_intlen;
|
|
zeh->zeh_cd = le->le_cd;
|
|
zeh->zeh_hash = le->le_hash;
|
|
zeh->zeh_fakechunk = chunk;
|
|
zeh->zeh_chunkp = &zeh->zeh_fakechunk;
|
|
zeh->zeh_leaf = l;
|
|
}
|
|
}
|
|
}
|
|
|
|
return (bestcd == -1U ? SET_ERROR(ENOENT) : 0);
|
|
}
|
|
|
|
int
|
|
zap_entry_read(const zap_entry_handle_t *zeh,
|
|
uint8_t integer_size, uint64_t num_integers, void *buf)
|
|
{
|
|
struct zap_leaf_entry *le =
|
|
ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
|
|
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
|
|
|
|
if (le->le_value_intlen > integer_size)
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
|
|
le->le_value_intlen, le->le_value_numints,
|
|
integer_size, num_integers, buf);
|
|
|
|
if (zeh->zeh_num_integers > num_integers)
|
|
return (SET_ERROR(EOVERFLOW));
|
|
return (0);
|
|
|
|
}
|
|
|
|
int
|
|
zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
|
|
char *buf)
|
|
{
|
|
struct zap_leaf_entry *le =
|
|
ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
|
|
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
|
|
|
|
if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
|
|
zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
|
|
le->le_name_numints, 8, buflen / 8, buf);
|
|
} else {
|
|
zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
|
|
le->le_name_numints, 1, buflen, buf);
|
|
}
|
|
if (le->le_name_numints > buflen)
|
|
return (SET_ERROR(EOVERFLOW));
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
zap_entry_update(zap_entry_handle_t *zeh,
|
|
uint8_t integer_size, uint64_t num_integers, const void *buf)
|
|
{
|
|
zap_leaf_t *l = zeh->zeh_leaf;
|
|
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
|
|
|
|
int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
|
|
ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
|
|
|
|
if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
|
|
return (SET_ERROR(EAGAIN));
|
|
|
|
zap_leaf_array_free(l, &le->le_value_chunk);
|
|
le->le_value_chunk =
|
|
zap_leaf_array_create(l, buf, integer_size, num_integers);
|
|
le->le_value_numints = num_integers;
|
|
le->le_value_intlen = integer_size;
|
|
return (0);
|
|
}
|
|
|
|
void
|
|
zap_entry_remove(zap_entry_handle_t *zeh)
|
|
{
|
|
zap_leaf_t *l = zeh->zeh_leaf;
|
|
|
|
ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
|
|
|
|
uint16_t entry_chunk = *zeh->zeh_chunkp;
|
|
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk);
|
|
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
|
|
|
|
zap_leaf_array_free(l, &le->le_name_chunk);
|
|
zap_leaf_array_free(l, &le->le_value_chunk);
|
|
|
|
*zeh->zeh_chunkp = le->le_next;
|
|
zap_leaf_chunk_free(l, entry_chunk);
|
|
|
|
zap_leaf_phys(l)->l_hdr.lh_nentries--;
|
|
}
|
|
|
|
int
|
|
zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
|
|
uint8_t integer_size, uint64_t num_integers, const void *buf,
|
|
zap_entry_handle_t *zeh)
|
|
{
|
|
uint16_t chunk;
|
|
struct zap_leaf_entry *le;
|
|
uint64_t h = zn->zn_hash;
|
|
|
|
uint64_t valuelen = integer_size * num_integers;
|
|
|
|
int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
|
|
zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
|
|
if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
|
|
return (SET_ERROR(E2BIG));
|
|
|
|
if (cd == ZAP_NEED_CD) {
|
|
/* find the lowest unused cd */
|
|
if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
|
|
cd = 0;
|
|
|
|
for (chunk = *LEAF_HASH_ENTPTR(l, h);
|
|
chunk != CHAIN_END; chunk = le->le_next) {
|
|
le = ZAP_LEAF_ENTRY(l, chunk);
|
|
if (le->le_cd > cd)
|
|
break;
|
|
if (le->le_hash == h) {
|
|
ASSERT3U(cd, ==, le->le_cd);
|
|
cd++;
|
|
}
|
|
}
|
|
} else {
|
|
/* old unsorted format; do it the O(n^2) way */
|
|
for (cd = 0; ; cd++) {
|
|
for (chunk = *LEAF_HASH_ENTPTR(l, h);
|
|
chunk != CHAIN_END; chunk = le->le_next) {
|
|
le = ZAP_LEAF_ENTRY(l, chunk);
|
|
if (le->le_hash == h &&
|
|
le->le_cd == cd) {
|
|
break;
|
|
}
|
|
}
|
|
/* If this cd is not in use, we are good. */
|
|
if (chunk == CHAIN_END)
|
|
break;
|
|
}
|
|
}
|
|
/*
|
|
* We would run out of space in a block before we could
|
|
* store enough entries to run out of CD values.
|
|
*/
|
|
ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
|
|
}
|
|
|
|
if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
|
|
return (SET_ERROR(EAGAIN));
|
|
|
|
/* make the entry */
|
|
chunk = zap_leaf_chunk_alloc(l);
|
|
le = ZAP_LEAF_ENTRY(l, chunk);
|
|
le->le_type = ZAP_CHUNK_ENTRY;
|
|
le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
|
|
zn->zn_key_intlen, zn->zn_key_orig_numints);
|
|
le->le_name_numints = zn->zn_key_orig_numints;
|
|
le->le_value_chunk =
|
|
zap_leaf_array_create(l, buf, integer_size, num_integers);
|
|
le->le_value_numints = num_integers;
|
|
le->le_value_intlen = integer_size;
|
|
le->le_hash = h;
|
|
le->le_cd = cd;
|
|
|
|
/* link it into the hash chain */
|
|
/* XXX if we did the search above, we could just use that */
|
|
uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk);
|
|
|
|
zap_leaf_phys(l)->l_hdr.lh_nentries++;
|
|
|
|
zeh->zeh_leaf = l;
|
|
zeh->zeh_num_integers = num_integers;
|
|
zeh->zeh_integer_size = le->le_value_intlen;
|
|
zeh->zeh_cd = le->le_cd;
|
|
zeh->zeh_hash = le->le_hash;
|
|
zeh->zeh_chunkp = chunkp;
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Determine if there is another entry with the same normalized form.
|
|
* For performance purposes, either zn or name must be provided (the
|
|
* other can be NULL). Note, there usually won't be any hash
|
|
* conflicts, in which case we don't need the concatenated/normalized
|
|
* form of the name. But all callers have one of these on hand anyway,
|
|
* so might as well take advantage. A cleaner but slower interface
|
|
* would accept neither argument, and compute the normalized name as
|
|
* needed (using zap_name_alloc_str(zap_entry_read_name(zeh))).
|
|
*/
|
|
boolean_t
|
|
zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
|
|
const char *name, zap_t *zap)
|
|
{
|
|
struct zap_leaf_entry *le;
|
|
boolean_t allocdzn = B_FALSE;
|
|
|
|
if (zap->zap_normflags == 0)
|
|
return (B_FALSE);
|
|
|
|
for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
|
|
chunk != CHAIN_END; chunk = le->le_next) {
|
|
le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
|
|
if (le->le_hash != zeh->zeh_hash)
|
|
continue;
|
|
if (le->le_cd == zeh->zeh_cd)
|
|
continue;
|
|
|
|
if (zn == NULL) {
|
|
zn = zap_name_alloc_str(zap, name, MT_NORMALIZE);
|
|
allocdzn = B_TRUE;
|
|
}
|
|
if (zap_leaf_array_match(zeh->zeh_leaf, zn,
|
|
le->le_name_chunk, le->le_name_numints)) {
|
|
if (allocdzn)
|
|
zap_name_free(zn);
|
|
return (B_TRUE);
|
|
}
|
|
}
|
|
if (allocdzn)
|
|
zap_name_free(zn);
|
|
return (B_FALSE);
|
|
}
|
|
|
|
/*
|
|
* Routines for transferring entries between leafs.
|
|
*/
|
|
|
|
static uint16_t *
|
|
zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
|
|
{
|
|
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
|
|
struct zap_leaf_entry *le2;
|
|
uint16_t *chunkp;
|
|
|
|
/*
|
|
* keep the entry chain sorted by cd
|
|
* NB: this will not cause problems for unsorted leafs, though
|
|
* it is unnecessary there.
|
|
*/
|
|
for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
|
|
*chunkp != CHAIN_END; chunkp = &le2->le_next) {
|
|
le2 = ZAP_LEAF_ENTRY(l, *chunkp);
|
|
if (le2->le_cd > le->le_cd)
|
|
break;
|
|
}
|
|
|
|
le->le_next = *chunkp;
|
|
*chunkp = entry;
|
|
return (chunkp);
|
|
}
|
|
|
|
static uint16_t
|
|
zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
|
|
{
|
|
uint16_t new_chunk;
|
|
uint16_t *nchunkp = &new_chunk;
|
|
|
|
while (chunk != CHAIN_END) {
|
|
uint16_t nchunk = zap_leaf_chunk_alloc(nl);
|
|
struct zap_leaf_array *nla =
|
|
&ZAP_LEAF_CHUNK(nl, nchunk).l_array;
|
|
struct zap_leaf_array *la =
|
|
&ZAP_LEAF_CHUNK(l, chunk).l_array;
|
|
int nextchunk = la->la_next;
|
|
|
|
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
|
|
ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
|
|
|
|
*nla = *la; /* structure assignment */
|
|
|
|
zap_leaf_chunk_free(l, chunk);
|
|
chunk = nextchunk;
|
|
*nchunkp = nchunk;
|
|
nchunkp = &nla->la_next;
|
|
}
|
|
*nchunkp = CHAIN_END;
|
|
return (new_chunk);
|
|
}
|
|
|
|
static void
|
|
zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
|
|
{
|
|
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
|
|
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
|
|
|
|
uint16_t chunk = zap_leaf_chunk_alloc(nl);
|
|
struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk);
|
|
*nle = *le; /* structure assignment */
|
|
|
|
(void) zap_leaf_rehash_entry(nl, chunk);
|
|
|
|
nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
|
|
nle->le_value_chunk =
|
|
zap_leaf_transfer_array(l, le->le_value_chunk, nl);
|
|
|
|
zap_leaf_chunk_free(l, entry);
|
|
|
|
zap_leaf_phys(l)->l_hdr.lh_nentries--;
|
|
zap_leaf_phys(nl)->l_hdr.lh_nentries++;
|
|
}
|
|
|
|
/*
|
|
* Transfer the entries whose hash prefix ends in 1 to the new leaf.
|
|
*/
|
|
void
|
|
zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
|
|
{
|
|
int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
|
|
|
|
/* set new prefix and prefix_len */
|
|
zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
|
|
zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
|
|
zap_leaf_phys(nl)->l_hdr.lh_prefix =
|
|
zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
|
|
zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
|
|
zap_leaf_phys(l)->l_hdr.lh_prefix_len;
|
|
|
|
/* break existing hash chains */
|
|
zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
|
|
2*ZAP_LEAF_HASH_NUMENTRIES(l));
|
|
|
|
if (sort)
|
|
zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
|
|
|
|
/*
|
|
* Transfer entries whose hash bit 'bit' is set to nl; rehash
|
|
* the remaining entries
|
|
*
|
|
* NB: We could find entries via the hashtable instead. That
|
|
* would be O(hashents+numents) rather than O(numblks+numents),
|
|
* but this accesses memory more sequentially, and when we're
|
|
* called, the block is usually pretty full.
|
|
*/
|
|
for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
|
|
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
|
|
if (le->le_type != ZAP_CHUNK_ENTRY)
|
|
continue;
|
|
|
|
if (le->le_hash & (1ULL << bit))
|
|
zap_leaf_transfer_entry(l, i, nl);
|
|
else
|
|
(void) zap_leaf_rehash_entry(l, i);
|
|
}
|
|
}
|
|
|
|
void
|
|
zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
|
|
{
|
|
int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
|
|
zap_leaf_phys(l)->l_hdr.lh_prefix_len;
|
|
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
|
|
zs->zs_leafs_with_2n_pointers[n]++;
|
|
|
|
|
|
n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
|
|
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
|
|
zs->zs_blocks_with_n5_entries[n]++;
|
|
|
|
n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
|
|
zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
|
|
(1<<FZAP_BLOCK_SHIFT(zap));
|
|
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
|
|
zs->zs_blocks_n_tenths_full[n]++;
|
|
|
|
for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
|
|
int nentries = 0;
|
|
int chunk = zap_leaf_phys(l)->l_hash[i];
|
|
|
|
while (chunk != CHAIN_END) {
|
|
struct zap_leaf_entry *le =
|
|
ZAP_LEAF_ENTRY(l, chunk);
|
|
|
|
n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
|
|
ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
|
|
le->le_value_intlen);
|
|
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
|
|
zs->zs_entries_using_n_chunks[n]++;
|
|
|
|
chunk = le->le_next;
|
|
nentries++;
|
|
}
|
|
|
|
n = nentries;
|
|
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
|
|
zs->zs_buckets_with_n_entries[n]++;
|
|
}
|
|
}
|