mirror_zfs/module/zfs/zap_leaf.c

840 lines
23 KiB
C
Raw Normal View History

2008-11-20 23:01:55 +03:00
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
2008-11-20 23:01:55 +03:00
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
2008-11-20 23:01:55 +03:00
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
2008-11-20 23:01:55 +03:00
*/
/*
* The 512-byte leaf is broken into 32 16-byte chunks.
* chunk number n means l_chunk[n], even though the header precedes it.
* the names are stored null-terminated.
*/
#include <sys/zio.h>
2009-07-03 02:44:48 +04:00
#include <sys/spa.h>
#include <sys/dmu.h>
2008-11-20 23:01:55 +03:00
#include <sys/zfs_context.h>
2009-07-03 02:44:48 +04:00
#include <sys/fs/zfs.h>
2008-11-20 23:01:55 +03:00
#include <sys/zap.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/arc.h>
2008-11-20 23:01:55 +03:00
static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le,
uint16_t entry);
2008-11-20 23:01:55 +03:00
#define CHAIN_END 0xffff /* end of the chunk chain */
#define LEAF_HASH(l, h) \
((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
((h) >> \
(64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
2008-11-20 23:01:55 +03:00
Fix ENOSPC in "Handle zap_add() failures in ..." Commit cc63068 caused ENOSPC error when copy a large amount of files between two directories. The reason is that the patch limits zap leaf expansion to 2 retries, and return ENOSPC when failed. The intent for limiting retries is to prevent pointlessly growing table to max size when adding a block full of entries with same name in different case in mixed mode. However, it turns out we cannot use any limit on the retry. When we copy files from one directory in readdir order, we are copying in hash order, one leaf block at a time. Which means that if the leaf block in source directory has expanded 6 times, and you copy those entries in that block, by the time you need to expand the leaf in destination directory, you need to expand it 6 times in one go. So any limit on the retry will result in error where it shouldn't. Note that while we do use different salt for different directories, it seems that the salt/hash function doesn't provide enough randomization to the hash distance to prevent this from happening. Since cc63068 has already been reverted. This patch adds it back and removes the retry limit. Also, as it turn out, failing on zap_add() has a serious side effect for mzap_upgrade(). When upgrading from micro zap to fat zap, it will call zap_add() to transfer entries one at a time. If it hit any error halfway through, the remaining entries will be lost, causing those files to become orphan. This patch add a VERIFY to catch it. Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com> Reviewed-by: Richard Yao <ryao@gentoo.org> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Albert Lee <trisk@forkgnu.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Chunwei Chen <david.chen@nutanix.com> Closes #7401 Closes #7421
2018-04-19 00:19:50 +03:00
#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
2008-11-20 23:01:55 +03:00
static void
stv(int len, void *addr, uint64_t value)
{
switch (len) {
case 1:
*(uint8_t *)addr = value;
return;
case 2:
*(uint16_t *)addr = value;
return;
case 4:
*(uint32_t *)addr = value;
return;
case 8:
*(uint64_t *)addr = value;
return;
default:
PANIC("bad int len %d", len);
2008-11-20 23:01:55 +03:00
}
}
static uint64_t
ldv(int len, const void *addr)
{
switch (len) {
case 1:
return (*(uint8_t *)addr);
case 2:
return (*(uint16_t *)addr);
case 4:
return (*(uint32_t *)addr);
case 8:
return (*(uint64_t *)addr);
default:
PANIC("bad int len %d", len);
2008-11-20 23:01:55 +03:00
}
return (0xFEEDFACEDEADBEEFULL);
}
void
zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t size)
2008-11-20 23:01:55 +03:00
{
zap_leaf_t l;
dmu_buf_t l_dbuf;
l_dbuf.db_data = buf;
l.l_bs = highbit64(size) - 1;
l.l_dbuf = &l_dbuf;
2008-11-20 23:01:55 +03:00
buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
2008-11-20 23:01:55 +03:00
for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
2008-11-20 23:01:55 +03:00
buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
2008-11-20 23:01:55 +03:00
zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
struct zap_leaf_entry *le;
switch (lc->l_free.lf_type) {
case ZAP_CHUNK_ENTRY:
le = &lc->l_entry;
le->le_type = BSWAP_8(le->le_type);
le->le_value_intlen = BSWAP_8(le->le_value_intlen);
2008-11-20 23:01:55 +03:00
le->le_next = BSWAP_16(le->le_next);
le->le_name_chunk = BSWAP_16(le->le_name_chunk);
le->le_name_numints = BSWAP_16(le->le_name_numints);
2008-11-20 23:01:55 +03:00
le->le_value_chunk = BSWAP_16(le->le_value_chunk);
le->le_value_numints = BSWAP_16(le->le_value_numints);
2008-11-20 23:01:55 +03:00
le->le_cd = BSWAP_32(le->le_cd);
le->le_hash = BSWAP_64(le->le_hash);
break;
case ZAP_CHUNK_FREE:
lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
break;
case ZAP_CHUNK_ARRAY:
lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
/* la_array doesn't need swapping */
break;
default:
cmn_err(CE_PANIC, "bad leaf type %d",
lc->l_free.lf_type);
2008-11-20 23:01:55 +03:00
}
}
}
void
zap_leaf_init(zap_leaf_t *l, boolean_t sort)
{
l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
memset(&zap_leaf_phys(l)->l_hdr, 0,
sizeof (struct zap_leaf_header));
memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
2*ZAP_LEAF_HASH_NUMENTRIES(l));
for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
2008-11-20 23:01:55 +03:00
ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
}
ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
2008-11-20 23:01:55 +03:00
if (sort)
zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
2008-11-20 23:01:55 +03:00
}
/*
* Routines which manipulate leaf chunks (l_chunk[]).
*/
static uint16_t
zap_leaf_chunk_alloc(zap_leaf_t *l)
{
ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
2008-11-20 23:01:55 +03:00
uint_t chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
2008-11-20 23:01:55 +03:00
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
zap_leaf_phys(l)->l_hdr.lh_freelist =
ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
2008-11-20 23:01:55 +03:00
zap_leaf_phys(l)->l_hdr.lh_nfree--;
2008-11-20 23:01:55 +03:00
return (chunk);
}
static void
zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
{
struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
2008-11-20 23:01:55 +03:00
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
zlf->lf_type = ZAP_CHUNK_FREE;
zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
memset(zlf->lf_pad, 0, sizeof (zlf->lf_pad)); /* help it to compress */
zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
2008-11-20 23:01:55 +03:00
zap_leaf_phys(l)->l_hdr.lh_nfree++;
2008-11-20 23:01:55 +03:00
}
/*
* Routines which manipulate leaf arrays (zap_leaf_array type chunks).
*/
static uint16_t
zap_leaf_array_create(zap_leaf_t *l, const char *buf,
int integer_size, int num_integers)
2008-11-20 23:01:55 +03:00
{
uint16_t chunk_head;
uint16_t *chunkp = &chunk_head;
int byten = integer_size;
uint64_t value = 0;
int shift = (integer_size - 1) * 8;
2008-11-20 23:01:55 +03:00
int len = num_integers;
ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN);
2008-11-20 23:01:55 +03:00
if (len > 0)
value = ldv(integer_size, buf);
2008-11-20 23:01:55 +03:00
while (len > 0) {
uint16_t chunk = zap_leaf_chunk_alloc(l);
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
la->la_type = ZAP_CHUNK_ARRAY;
for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
2008-11-20 23:01:55 +03:00
la->la_array[i] = value >> shift;
value <<= 8;
if (--byten == 0) {
2008-11-20 23:01:55 +03:00
if (--len == 0)
break;
byten = integer_size;
buf += integer_size;
value = ldv(integer_size, buf);
2008-11-20 23:01:55 +03:00
}
}
*chunkp = chunk;
chunkp = &la->la_next;
}
*chunkp = CHAIN_END;
return (chunk_head);
}
static void
zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
{
uint16_t chunk = *chunkp;
*chunkp = CHAIN_END;
while (chunk != CHAIN_END) {
uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
2008-11-20 23:01:55 +03:00
ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
ZAP_CHUNK_ARRAY);
zap_leaf_chunk_free(l, chunk);
chunk = nextchunk;
}
}
/* array_len and buf_len are in integers, not bytes */
static void
zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
void *buf)
2008-11-20 23:01:55 +03:00
{
int len = MIN(array_len, buf_len);
int byten = 0;
uint64_t value = 0;
char *p = buf;
2008-11-20 23:01:55 +03:00
ASSERT3U(array_int_len, <=, buf_int_len);
/* Fast path for one 8-byte integer */
if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
uint8_t *ip = la->la_array;
uint64_t *buf64 = buf;
2008-11-20 23:01:55 +03:00
*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
(uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
(uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
(uint64_t)ip[6] << 8 | (uint64_t)ip[7];
return;
}
/* Fast path for an array of 1-byte integers (eg. the entry name) */
if (array_int_len == 1 && buf_int_len == 1 &&
buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
while (chunk != CHAIN_END) {
struct zap_leaf_array *la =
&ZAP_LEAF_CHUNK(l, chunk).l_array;
memcpy(p, la->la_array, ZAP_LEAF_ARRAY_BYTES);
p += ZAP_LEAF_ARRAY_BYTES;
2008-11-20 23:01:55 +03:00
chunk = la->la_next;
}
return;
}
while (len > 0) {
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
Cleanup: Remove constant comparisons reported by CodeQL CodeQL's cpp/constant-comparison query from its security-and-extended query set reported 4 instances where we have comparions that always evaluate the same way. In `draid_config_by_type()`, we have an early `if (nparity == 0)` check that returns `EINVAL`, making a later `if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY)` partially redundant. The later check prints an error message when parity is 0, but the early check does not. This is not useful feedback, so we move the later check to the place where the early check runs to replace the early check. In `perform_thread_merge()`, we return when `num_threads == 0`. After that block, we do `if (num_threads > 0) {`, which will always be true. We remove the `if` statement. In `sa_modify_attrs()`, we have a loop condition that is `k != 2`, but at the end of the loop, we have `if (k == 0 && hdl->sa_spill)` followed by an else that does a break. The result is that k != 2 will never be evaluated when it is false. We drop the comparison. In `zap_leaf_array_read()`, we have a for loop condition that is `i < ZAP_LEAF_ARRAY_BYTES && len > 0`. However, that loop itself is in a loop that is `while (len > 0)` and while the value of len is decremented inside the loop, when `len == 0`, it will return, such that `len > 0` inside the loop condition will always be true. We drop that part of the condition. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Closes #14575
2023-03-05 01:57:01 +03:00
for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
2008-11-20 23:01:55 +03:00
value = (value << 8) | la->la_array[i];
byten++;
if (byten == array_int_len) {
stv(buf_int_len, p, value);
2008-11-20 23:01:55 +03:00
byten = 0;
len--;
if (len == 0)
return;
p += buf_int_len;
2008-11-20 23:01:55 +03:00
}
}
chunk = la->la_next;
}
}
static boolean_t
zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
uint_t chunk, int array_numints)
2008-11-20 23:01:55 +03:00
{
int bseen = 0;
if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
uint64_t *thiskey =
kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP);
ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
sizeof (*thiskey), array_numints, thiskey);
boolean_t match = memcmp(thiskey, zn->zn_key_orig,
array_numints * sizeof (*thiskey)) == 0;
kmem_free(thiskey, array_numints * sizeof (*thiskey));
return (match);
}
ASSERT(zn->zn_key_intlen == 1);
if (zn->zn_matchtype & MT_NORMALIZE) {
char *thisname = kmem_alloc(array_numints, KM_SLEEP);
2008-11-20 23:01:55 +03:00
zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
sizeof (char), array_numints, thisname);
boolean_t match = zap_match(zn, thisname);
kmem_free(thisname, array_numints);
2008-11-20 23:01:55 +03:00
return (match);
}
/*
* Fast path for exact matching.
* First check that the lengths match, so that we don't read
* past the end of the zn_key_orig array.
*/
if (array_numints != zn->zn_key_orig_numints)
return (B_FALSE);
while (bseen < array_numints) {
2008-11-20 23:01:55 +03:00
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
2008-11-20 23:01:55 +03:00
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
if (memcmp(la->la_array, (char *)zn->zn_key_orig + bseen,
toread))
2008-11-20 23:01:55 +03:00
break;
chunk = la->la_next;
bseen += toread;
}
return (bseen == array_numints);
2008-11-20 23:01:55 +03:00
}
/*
* Routines which manipulate leaf entries.
*/
int
zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
{
struct zap_leaf_entry *le;
ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
2008-11-20 23:01:55 +03:00
for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
2008-11-20 23:01:55 +03:00
*chunkp != CHAIN_END; chunkp = &le->le_next) {
uint16_t chunk = *chunkp;
le = ZAP_LEAF_ENTRY(l, chunk);
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
if (le->le_hash != zn->zn_hash)
continue;
/*
* NB: the entry chain is always sorted by cd on
* normalized zap objects, so this will find the
* lowest-cd match for MT_NORMALIZE.
2008-11-20 23:01:55 +03:00
*/
ASSERT((zn->zn_matchtype == 0) ||
(zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
2008-11-20 23:01:55 +03:00
if (zap_leaf_array_match(l, zn, le->le_name_chunk,
le->le_name_numints)) {
zeh->zeh_num_integers = le->le_value_numints;
zeh->zeh_integer_size = le->le_value_intlen;
2008-11-20 23:01:55 +03:00
zeh->zeh_cd = le->le_cd;
zeh->zeh_hash = le->le_hash;
zeh->zeh_chunkp = chunkp;
zeh->zeh_leaf = l;
return (0);
}
}
return (SET_ERROR(ENOENT));
2008-11-20 23:01:55 +03:00
}
/* Return (h1,cd1 >= h2,cd2) */
#define HCD_GTEQ(h1, cd1, h2, cd2) \
((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
int
zap_leaf_lookup_closest(zap_leaf_t *l,
uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
{
uint64_t besth = -1ULL;
uint32_t bestcd = -1U;
2008-11-20 23:01:55 +03:00
uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
struct zap_leaf_entry *le;
ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
2008-11-20 23:01:55 +03:00
for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh];
2008-11-20 23:01:55 +03:00
chunk != CHAIN_END; chunk = le->le_next) {
le = ZAP_LEAF_ENTRY(l, chunk);
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
ASSERT3U(bestlh, >=, lh);
bestlh = lh;
besth = le->le_hash;
bestcd = le->le_cd;
zeh->zeh_num_integers = le->le_value_numints;
zeh->zeh_integer_size = le->le_value_intlen;
2008-11-20 23:01:55 +03:00
zeh->zeh_cd = le->le_cd;
zeh->zeh_hash = le->le_hash;
zeh->zeh_fakechunk = chunk;
zeh->zeh_chunkp = &zeh->zeh_fakechunk;
zeh->zeh_leaf = l;
}
}
}
return (bestcd == -1U ? SET_ERROR(ENOENT) : 0);
2008-11-20 23:01:55 +03:00
}
int
zap_entry_read(const zap_entry_handle_t *zeh,
uint8_t integer_size, uint64_t num_integers, void *buf)
{
struct zap_leaf_entry *le =
ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
if (le->le_value_intlen > integer_size)
return (SET_ERROR(EINVAL));
2008-11-20 23:01:55 +03:00
zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
le->le_value_intlen, le->le_value_numints,
integer_size, num_integers, buf);
2008-11-20 23:01:55 +03:00
if (zeh->zeh_num_integers > num_integers)
return (SET_ERROR(EOVERFLOW));
2008-11-20 23:01:55 +03:00
return (0);
}
int
zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
char *buf)
2008-11-20 23:01:55 +03:00
{
struct zap_leaf_entry *le =
ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
le->le_name_numints, 8, buflen / 8, buf);
} else {
zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
le->le_name_numints, 1, buflen, buf);
}
if (le->le_name_numints > buflen)
return (SET_ERROR(EOVERFLOW));
2008-11-20 23:01:55 +03:00
return (0);
}
int
zap_entry_update(zap_entry_handle_t *zeh,
uint8_t integer_size, uint64_t num_integers, const void *buf)
2008-11-20 23:01:55 +03:00
{
zap_leaf_t *l = zeh->zeh_leaf;
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
2008-11-20 23:01:55 +03:00
if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
return (SET_ERROR(EAGAIN));
2008-11-20 23:01:55 +03:00
zap_leaf_array_free(l, &le->le_value_chunk);
le->le_value_chunk =
zap_leaf_array_create(l, buf, integer_size, num_integers);
le->le_value_numints = num_integers;
le->le_value_intlen = integer_size;
2008-11-20 23:01:55 +03:00
return (0);
}
void
zap_entry_remove(zap_entry_handle_t *zeh)
{
zap_leaf_t *l = zeh->zeh_leaf;
ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
uint16_t entry_chunk = *zeh->zeh_chunkp;
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk);
2008-11-20 23:01:55 +03:00
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
zap_leaf_array_free(l, &le->le_name_chunk);
zap_leaf_array_free(l, &le->le_value_chunk);
*zeh->zeh_chunkp = le->le_next;
zap_leaf_chunk_free(l, entry_chunk);
zap_leaf_phys(l)->l_hdr.lh_nentries--;
2008-11-20 23:01:55 +03:00
}
int
zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
2008-11-20 23:01:55 +03:00
uint8_t integer_size, uint64_t num_integers, const void *buf,
zap_entry_handle_t *zeh)
{
uint16_t chunk;
struct zap_leaf_entry *le;
uint64_t h = zn->zn_hash;
2008-11-20 23:01:55 +03:00
uint64_t valuelen = integer_size * num_integers;
2008-11-20 23:01:55 +03:00
uint_t numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
2008-11-20 23:01:55 +03:00
if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
return (SET_ERROR(E2BIG));
2008-11-20 23:01:55 +03:00
if (cd == ZAP_NEED_CD) {
2008-11-20 23:01:55 +03:00
/* find the lowest unused cd */
if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
2008-11-20 23:01:55 +03:00
cd = 0;
for (chunk = *LEAF_HASH_ENTPTR(l, h);
chunk != CHAIN_END; chunk = le->le_next) {
le = ZAP_LEAF_ENTRY(l, chunk);
if (le->le_cd > cd)
break;
if (le->le_hash == h) {
ASSERT3U(cd, ==, le->le_cd);
cd++;
}
}
} else {
/* old unsorted format; do it the O(n^2) way */
for (cd = 0; ; cd++) {
2008-11-20 23:01:55 +03:00
for (chunk = *LEAF_HASH_ENTPTR(l, h);
chunk != CHAIN_END; chunk = le->le_next) {
le = ZAP_LEAF_ENTRY(l, chunk);
if (le->le_hash == h &&
le->le_cd == cd) {
break;
}
}
/* If this cd is not in use, we are good. */
if (chunk == CHAIN_END)
break;
}
}
/*
* We would run out of space in a block before we could
* store enough entries to run out of CD values.
2008-11-20 23:01:55 +03:00
*/
ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
2008-11-20 23:01:55 +03:00
}
if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
return (SET_ERROR(EAGAIN));
2008-11-20 23:01:55 +03:00
/* make the entry */
chunk = zap_leaf_chunk_alloc(l);
le = ZAP_LEAF_ENTRY(l, chunk);
le->le_type = ZAP_CHUNK_ENTRY;
le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
zn->zn_key_intlen, zn->zn_key_orig_numints);
le->le_name_numints = zn->zn_key_orig_numints;
2008-11-20 23:01:55 +03:00
le->le_value_chunk =
zap_leaf_array_create(l, buf, integer_size, num_integers);
le->le_value_numints = num_integers;
le->le_value_intlen = integer_size;
2008-11-20 23:01:55 +03:00
le->le_hash = h;
le->le_cd = cd;
/* link it into the hash chain */
/* XXX if we did the search above, we could just use that */
uint16_t *chunkp = zap_leaf_rehash_entry(l, le, chunk);
2008-11-20 23:01:55 +03:00
zap_leaf_phys(l)->l_hdr.lh_nentries++;
2008-11-20 23:01:55 +03:00
zeh->zeh_leaf = l;
zeh->zeh_num_integers = num_integers;
zeh->zeh_integer_size = le->le_value_intlen;
2008-11-20 23:01:55 +03:00
zeh->zeh_cd = le->le_cd;
zeh->zeh_hash = le->le_hash;
zeh->zeh_chunkp = chunkp;
return (0);
}
/*
* Determine if there is another entry with the same normalized form.
* For performance purposes, either zn or name must be provided (the
* other can be NULL). Note, there usually won't be any hash
* conflicts, in which case we don't need the concatenated/normalized
* form of the name. But all callers have one of these on hand anyway,
* so might as well take advantage. A cleaner but slower interface
* would accept neither argument, and compute the normalized name as
Optimize microzaps Microzap on-disk format does not include a hash tree, expecting one to be built in RAM during mzap_open(). The built tree is linked to DMU user buffer, freed when original DMU buffer is dropped from cache. I've found that workloads accessing many large directories and having active eviction from DMU cache spend significant amount of time building and then destroying the trees. I've also found that for each 64 byte mzap element additional 64 byte tree element is allocated, that is a waste of memory and CPU caches. Improve memory efficiency of the hash tree by switching from AVL-tree to B-tree. It allows to save 24 bytes per element just on pointers. Save 32 bits on mze_hash by storing only upper 32 bits since lower 32 bits are always zero for microzaps. Save 16 bits on mze_chunkid, since microzap can never have so many elements. Respectively with the 16 bits there can be no more than 16 bits of collision differentiators. As result, struct mzap_ent now drops from 48 (rounded to 64) to 8 bytes. Tune B-trees for small data. Reduce BTREE_CORE_ELEMS from 128 to 126 to allow struct zfs_btree_core in case of 8 byte elements to pack into 2KB instead of 4KB. Aside of the microzaps it should also help 32bit range trees. Allow custom B-tree leaf size to reduce memmove() time. Split zap_name_alloc() into zap_name_alloc() and zap_name_init_str(). It allows to not waste time allocating/freeing memory when processing multiple names in a loop during mzap_open(). Together on a pool with 10K directories of 1800 files each and DMU cache limited to 128MB this reduces time of `find . -name zzz` by 41% from 7.63s to 4.47s, and saves additional ~30% of CPU time on the DMU cache reclamation. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #14039
2022-10-20 21:57:15 +03:00
* needed (using zap_name_alloc_str(zap_entry_read_name(zeh))).
2008-11-20 23:01:55 +03:00
*/
boolean_t
zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
const char *name, zap_t *zap)
{
struct zap_leaf_entry *le;
boolean_t allocdzn = B_FALSE;
if (zap->zap_normflags == 0)
return (B_FALSE);
for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
2008-11-20 23:01:55 +03:00
chunk != CHAIN_END; chunk = le->le_next) {
le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
if (le->le_hash != zeh->zeh_hash)
continue;
if (le->le_cd == zeh->zeh_cd)
continue;
if (zn == NULL) {
Optimize microzaps Microzap on-disk format does not include a hash tree, expecting one to be built in RAM during mzap_open(). The built tree is linked to DMU user buffer, freed when original DMU buffer is dropped from cache. I've found that workloads accessing many large directories and having active eviction from DMU cache spend significant amount of time building and then destroying the trees. I've also found that for each 64 byte mzap element additional 64 byte tree element is allocated, that is a waste of memory and CPU caches. Improve memory efficiency of the hash tree by switching from AVL-tree to B-tree. It allows to save 24 bytes per element just on pointers. Save 32 bits on mze_hash by storing only upper 32 bits since lower 32 bits are always zero for microzaps. Save 16 bits on mze_chunkid, since microzap can never have so many elements. Respectively with the 16 bits there can be no more than 16 bits of collision differentiators. As result, struct mzap_ent now drops from 48 (rounded to 64) to 8 bytes. Tune B-trees for small data. Reduce BTREE_CORE_ELEMS from 128 to 126 to allow struct zfs_btree_core in case of 8 byte elements to pack into 2KB instead of 4KB. Aside of the microzaps it should also help 32bit range trees. Allow custom B-tree leaf size to reduce memmove() time. Split zap_name_alloc() into zap_name_alloc() and zap_name_init_str(). It allows to not waste time allocating/freeing memory when processing multiple names in a loop during mzap_open(). Together on a pool with 10K directories of 1800 files each and DMU cache limited to 128MB this reduces time of `find . -name zzz` by 41% from 7.63s to 4.47s, and saves additional ~30% of CPU time on the DMU cache reclamation. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #14039
2022-10-20 21:57:15 +03:00
zn = zap_name_alloc_str(zap, name, MT_NORMALIZE);
2008-11-20 23:01:55 +03:00
allocdzn = B_TRUE;
}
if (zap_leaf_array_match(zeh->zeh_leaf, zn,
le->le_name_chunk, le->le_name_numints)) {
2008-11-20 23:01:55 +03:00
if (allocdzn)
zap_name_free(zn);
return (B_TRUE);
}
}
if (allocdzn)
zap_name_free(zn);
return (B_FALSE);
}
/*
* Routines for transferring entries between leafs.
*/
static uint16_t *
zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry)
2008-11-20 23:01:55 +03:00
{
struct zap_leaf_entry *le2;
uint16_t *chunkp;
/*
* keep the entry chain sorted by cd
* NB: this will not cause problems for unsorted leafs, though
* it is unnecessary there.
*/
for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
*chunkp != CHAIN_END; chunkp = &le2->le_next) {
le2 = ZAP_LEAF_ENTRY(l, *chunkp);
if (le2->le_cd > le->le_cd)
break;
}
le->le_next = *chunkp;
*chunkp = entry;
return (chunkp);
}
static uint16_t
zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
{
uint16_t new_chunk;
uint16_t *nchunkp = &new_chunk;
while (chunk != CHAIN_END) {
uint16_t nchunk = zap_leaf_chunk_alloc(nl);
struct zap_leaf_array *nla =
&ZAP_LEAF_CHUNK(nl, nchunk).l_array;
struct zap_leaf_array *la =
&ZAP_LEAF_CHUNK(l, chunk).l_array;
uint_t nextchunk = la->la_next;
2008-11-20 23:01:55 +03:00
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
*nla = *la; /* structure assignment */
zap_leaf_chunk_free(l, chunk);
chunk = nextchunk;
*nchunkp = nchunk;
nchunkp = &nla->la_next;
}
*nchunkp = CHAIN_END;
return (new_chunk);
}
static void
zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl)
2008-11-20 23:01:55 +03:00
{
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
2008-11-20 23:01:55 +03:00
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
uint16_t chunk = zap_leaf_chunk_alloc(nl);
struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk);
2008-11-20 23:01:55 +03:00
*nle = *le; /* structure assignment */
(void) zap_leaf_rehash_entry(nl, nle, chunk);
2008-11-20 23:01:55 +03:00
nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
nle->le_value_chunk =
zap_leaf_transfer_array(l, le->le_value_chunk, nl);
zap_leaf_chunk_free(l, entry);
zap_leaf_phys(l)->l_hdr.lh_nentries--;
zap_leaf_phys(nl)->l_hdr.lh_nentries++;
2008-11-20 23:01:55 +03:00
}
/*
* Transfer the entries whose hash prefix ends in 1 to the new leaf.
*/
void
zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
{
uint_t bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
2008-11-20 23:01:55 +03:00
/* set new prefix and prefix_len */
zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
zap_leaf_phys(nl)->l_hdr.lh_prefix =
zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
zap_leaf_phys(l)->l_hdr.lh_prefix_len;
2008-11-20 23:01:55 +03:00
/* break existing hash chains */
memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
2*ZAP_LEAF_HASH_NUMENTRIES(l));
2008-11-20 23:01:55 +03:00
if (sort)
zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
2008-11-20 23:01:55 +03:00
/*
* Transfer entries whose hash bit 'bit' is set to nl; rehash
* the remaining entries
*
* NB: We could find entries via the hashtable instead. That
* would be O(hashents+numents) rather than O(numblks+numents),
* but this accesses memory more sequentially, and when we're
* called, the block is usually pretty full.
*/
for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
2008-11-20 23:01:55 +03:00
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
if (le->le_type != ZAP_CHUNK_ENTRY)
continue;
if (le->le_hash & (1ULL << bit))
zap_leaf_transfer_entry(l, i, nl);
else
(void) zap_leaf_rehash_entry(l, le, i);
2008-11-20 23:01:55 +03:00
}
}
void
zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
{
uint_t n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
zap_leaf_phys(l)->l_hdr.lh_prefix_len;
2008-11-20 23:01:55 +03:00
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_leafs_with_2n_pointers[n]++;
n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
2008-11-20 23:01:55 +03:00
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_blocks_with_n5_entries[n]++;
n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
2008-11-20 23:01:55 +03:00
(1<<FZAP_BLOCK_SHIFT(zap));
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_blocks_n_tenths_full[n]++;
for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
uint_t nentries = 0;
uint_t chunk = zap_leaf_phys(l)->l_hash[i];
2008-11-20 23:01:55 +03:00
while (chunk != CHAIN_END) {
struct zap_leaf_entry *le =
ZAP_LEAF_ENTRY(l, chunk);
n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
le->le_value_intlen);
2008-11-20 23:01:55 +03:00
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_entries_using_n_chunks[n]++;
chunk = le->le_next;
nentries++;
}
n = nentries;
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_buckets_with_n_entries[n]++;
}
}