mirror_zfs/module/zfs/refcount.c
Paul Dagnelie 246e5883bb
Implement allocation size ranges and use for gang leaves (#17111)
When forced to resort to ganging, ZFS currently allocates three child
blocks, each one third of the size of the original. It does this
regardless of whether larger allocations could be made, which would
allow us to have fewer gang leaves. Allocating fewer, larger leaves
improves performance when fragmentation is high enough to require
ganging, but not so high that all the free ranges are only just big
enough to hold a third of the recordsize. It is also useful for
improving the behavior of a future change to allow larger gang headers.

We add the ability for the allocation codepath to allocate a range of
sizes instead of a single fixed size. We then use this to pre-allocate
the DVAs for the gang children. If those allocations fail, we fall back
to the normal write path, which will likely re-gang.

Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Co-authored-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
2025-05-02 15:32:18 -07:00
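
The range-allocation idea from the commit summary, as a minimal
standalone sketch. Every name here (try_alloc, alloc_range, the toy
free-space model) is a hypothetical illustration, not the interface this
commit actually adds to the allocation codepath:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the space map: an allocation succeeds if it fits in
 * the largest contiguous free run. */
static bool
try_alloc(uint64_t size, uint64_t largest_free_run)
{
	return (size <= largest_free_run);
}

/*
 * Allocate within [min_size, max_size]: try the largest acceptable size
 * first and halve on failure, so a gang block can be built from fewer,
 * larger leaves when free space permits. Returns the size obtained, or
 * 0 to signal falling back to the normal write path (which may re-gang).
 */
static uint64_t
alloc_range(uint64_t min_size, uint64_t max_size, uint64_t largest_free_run)
{
	for (uint64_t size = max_size; size >= min_size; size /= 2) {
		if (try_alloc(size, largest_free_run))
			return (size);
	}
	return (0);
}

int
main(void)
{
	/* A 128 KiB record ganging into leaves of at least a third of
	 * the record, with a 64 KiB largest free run: the range
	 * allocator returns a 64 KiB leaf where fixed-thirds ganging
	 * would have asked for ~43 KiB. */
	uint64_t leaf = alloc_range(128 * 1024 / 3, 128 * 1024, 64 * 1024);
	printf("leaf size: %llu bytes\n", (unsigned long long)leaf);
	return (0);
}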

// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2021 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/zfs_refcount.h>

#ifdef ZFS_DEBUG
/*
 * Reference count tracking is disabled by default. Its memory requirements
 * are reasonable; however, as implemented it consumes a significant amount
 * of CPU time. Until its performance is improved, it should be manually
 * enabled.
 */
int reference_tracking_enable = B_FALSE;
static uint_t reference_history = 3; /* tunable */
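/*
 * For example (assuming the Linux module build; the sysfs path below is
 * the usual module-parameter location, verify on your system):
 *	echo 1 > /sys/module/zfs/parameters/reference_tracking_enable
 */
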
static kmem_cache_t *reference_cache;

void
zfs_refcount_init(void)
{
	reference_cache = kmem_cache_create("reference_cache",
	    sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
zfs_refcount_fini(void)
{
	kmem_cache_destroy(reference_cache);
}
static int
zfs_refcount_compare(const void *x1, const void *x2)
{
	const reference_t *r1 = (const reference_t *)x1;
	const reference_t *r2 = (const reference_t *)x2;

	int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder);
	int cmp2 = TREE_CMP(r1->ref_number, r2->ref_number);
	int cmp = cmp1 ? cmp1 : cmp2;
	return ((cmp || r1->ref_search) ? cmp : TREE_PCMP(r1, r2));
}
void
zfs_refcount_create(zfs_refcount_t *rc)
{
	mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t),
	    offsetof(reference_t, ref_link.a));
	list_create(&rc->rc_removed, sizeof (reference_t),
	    offsetof(reference_t, ref_link.l));
	rc->rc_count = 0;
	rc->rc_removed_count = 0;
	rc->rc_tracked = reference_tracking_enable;
}

void
zfs_refcount_create_tracked(zfs_refcount_t *rc)
{
	zfs_refcount_create(rc);
	rc->rc_tracked = B_TRUE;
}

void
zfs_refcount_create_untracked(zfs_refcount_t *rc)
{
	zfs_refcount_create(rc);
	rc->rc_tracked = B_FALSE;
}
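
/*
 * Tear down a refcount. The caller asserts the expected final count via
 * `number`; any remaining tracked references and the removal history are
 * freed back to the cache.
 */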
void
zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number)
{
	reference_t *ref;
	void *cookie = NULL;

	ASSERT3U(rc->rc_count, ==, number);
	while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL)
		kmem_cache_free(reference_cache, ref);
	avl_destroy(&rc->rc_tree);

	while ((ref = list_remove_head(&rc->rc_removed)))
		kmem_cache_free(reference_cache, ref);
	list_destroy(&rc->rc_removed);
	mutex_destroy(&rc->rc_mtx);
}

void
zfs_refcount_destroy(zfs_refcount_t *rc)
{
	zfs_refcount_destroy_many(rc, 0);
}

int
zfs_refcount_is_zero(zfs_refcount_t *rc)
{
	return (zfs_refcount_count(rc) == 0);
}

int64_t
zfs_refcount_count(zfs_refcount_t *rc)
{
	return (atomic_load_64(&rc->rc_count));
}
int64_t
zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
{
	reference_t *ref;
	int64_t count;

	if (likely(!rc->rc_tracked)) {
		count = atomic_add_64_nv(&(rc)->rc_count, number);
		ASSERT3U(count, >=, number);
		return (count);
	}

	ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
	ref->ref_holder = holder;
	ref->ref_number = number;
	ref->ref_search = B_FALSE;
	mutex_enter(&rc->rc_mtx);
	avl_add(&rc->rc_tree, ref);
	rc->rc_count += number;
	count = rc->rc_count;
	mutex_exit(&rc->rc_mtx);
	return (count);
}

int64_t
zfs_refcount_add(zfs_refcount_t *rc, const void *holder)
{
	return (zfs_refcount_add_many(rc, 1, holder));
}

void
zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
{
	if (likely(!rc->rc_tracked))
		(void) zfs_refcount_add_many(rc, number, holder);
	else for (; number > 0; number--)
		(void) zfs_refcount_add(rc, holder);
}
int64_t
zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
    const void *holder)
{
	reference_t *ref, s;
	int64_t count;

	if (likely(!rc->rc_tracked)) {
		count = atomic_add_64_nv(&(rc)->rc_count, -number);
		ASSERT3S(count, >=, 0);
		return (count);
	}

	s.ref_holder = holder;
	s.ref_number = number;
	s.ref_search = B_TRUE;
	mutex_enter(&rc->rc_mtx);
	ASSERT3U(rc->rc_count, >=, number);
	ref = avl_find(&rc->rc_tree, &s, NULL);
	if (unlikely(ref == NULL)) {
		PANIC("No such hold %llx on refcount %llx",
		    (u_longlong_t)(uintptr_t)holder,
		    (u_longlong_t)(uintptr_t)rc);
		return (-1);
	}
	avl_remove(&rc->rc_tree, ref);
	if (reference_history > 0) {
		list_insert_head(&rc->rc_removed, ref);
		if (rc->rc_removed_count >= reference_history) {
			ref = list_remove_tail(&rc->rc_removed);
			kmem_cache_free(reference_cache, ref);
		} else {
			rc->rc_removed_count++;
		}
	} else {
		kmem_cache_free(reference_cache, ref);
	}
	rc->rc_count -= number;
	count = rc->rc_count;
	mutex_exit(&rc->rc_mtx);
	return (count);
}

int64_t
zfs_refcount_remove(zfs_refcount_t *rc, const void *holder)
{
	return (zfs_refcount_remove_many(rc, 1, holder));
}

void
zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
{
	if (likely(!rc->rc_tracked))
		(void) zfs_refcount_remove_many(rc, number, holder);
	else for (; number > 0; number--)
		(void) zfs_refcount_remove(rc, holder);
}
void
zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
{
	avl_tree_t tree;
	list_t removed;
	reference_t *ref;
	void *cookie = NULL;
	uint64_t count;
	uint_t removed_count;

	avl_create(&tree, zfs_refcount_compare, sizeof (reference_t),
	    offsetof(reference_t, ref_link.a));
	list_create(&removed, sizeof (reference_t),
	    offsetof(reference_t, ref_link.l));

	mutex_enter(&src->rc_mtx);
	count = src->rc_count;
	removed_count = src->rc_removed_count;
	src->rc_count = 0;
	src->rc_removed_count = 0;
	avl_swap(&tree, &src->rc_tree);
	list_move_tail(&removed, &src->rc_removed);
	mutex_exit(&src->rc_mtx);

	mutex_enter(&dst->rc_mtx);
	dst->rc_count += count;
	dst->rc_removed_count += removed_count;
	if (avl_is_empty(&dst->rc_tree))
		avl_swap(&dst->rc_tree, &tree);
	else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL)
		avl_add(&dst->rc_tree, ref);
	list_move_tail(&dst->rc_removed, &removed);
	mutex_exit(&dst->rc_mtx);

	avl_destroy(&tree);
	list_destroy(&removed);
}
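
/*
 * Re-tag an existing hold of `number` from current_holder to new_holder
 * without changing the count. No-op for untracked refcounts.
 */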
void
zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number,
    const void *current_holder, const void *new_holder)
{
	reference_t *ref, s;

	if (likely(!rc->rc_tracked))
		return;

	s.ref_holder = current_holder;
	s.ref_number = number;
	s.ref_search = B_TRUE;
	mutex_enter(&rc->rc_mtx);
	ref = avl_find(&rc->rc_tree, &s, NULL);
	ASSERT(ref);
	ref->ref_holder = new_holder;
	avl_update(&rc->rc_tree, ref);
	mutex_exit(&rc->rc_mtx);
}

void
zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder,
    const void *new_holder)
{
	return (zfs_refcount_transfer_ownership_many(rc, 1, current_holder,
	    new_holder));
}

/*
 * If tracking is enabled, return true if a reference exists that matches
 * the "holder" tag. If tracking is disabled, then return true if a
 * reference might be held.
 */
boolean_t
zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
{
	reference_t *ref, s;
	avl_index_t idx;
	boolean_t res;

	if (likely(!rc->rc_tracked))
		return (zfs_refcount_count(rc) > 0);

	s.ref_holder = holder;
	s.ref_number = 0;
	s.ref_search = B_TRUE;
	mutex_enter(&rc->rc_mtx);
	ref = avl_find(&rc->rc_tree, &s, &idx);
	if (likely(ref == NULL))
		ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER);
	res = ref && ref->ref_holder == holder;
	mutex_exit(&rc->rc_mtx);
	return (res);
}

/*
 * If tracking is enabled, return true if a reference does not exist that
 * matches the "holder" tag. If tracking is disabled, always return true
 * since the reference might not be held.
 */
boolean_t
zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder)
{
	reference_t *ref, s;
	avl_index_t idx;
	boolean_t res;

	if (likely(!rc->rc_tracked))
		return (B_TRUE);

	mutex_enter(&rc->rc_mtx);
	s.ref_holder = holder;
	s.ref_number = 0;
	s.ref_search = B_TRUE;
	ref = avl_find(&rc->rc_tree, &s, &idx);
	if (likely(ref == NULL))
		ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER);
	res = ref == NULL || ref->ref_holder != holder;
	mutex_exit(&rc->rc_mtx);
	return (res);
}

EXPORT_SYMBOL(zfs_refcount_create);
EXPORT_SYMBOL(zfs_refcount_destroy);
EXPORT_SYMBOL(zfs_refcount_is_zero);
EXPORT_SYMBOL(zfs_refcount_count);
EXPORT_SYMBOL(zfs_refcount_add);
EXPORT_SYMBOL(zfs_refcount_remove);
EXPORT_SYMBOL(zfs_refcount_held);

ZFS_MODULE_PARAM(zfs, , reference_tracking_enable, INT, ZMOD_RW,
	"Track reference holders to refcount_t objects");

ZFS_MODULE_PARAM(zfs, , reference_history, UINT, ZMOD_RW,
	"Maximum reference holders being tracked");
#endif /* ZFS_DEBUG */