Implement allocation size ranges and use for gang leaves (#17111)

When forced to resort to ganging, ZFS currently allocates three child
blocks, each one third of the size of the original. This is true
regardless of whether larger allocations could be made, which would
allow us to have fewer gang leaves. This improves performance when
fragmentation is high enough to require ganging, but not so high that
all the free ranges are only just big enough to hold a third of the
recordsize. This is also useful for improving the behavior of a future
change to allow larger gang headers.

We add the ability for the allocation codepath to allocate a range of
sizes instead of a single fixed size. We then use this to pre-allocate
the DVAs for the gang children. If those allocations fail, we fall back
to the normal write path, which will likely re-gang.

Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Co-authored-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
This commit is contained in:
Paul Dagnelie
2025-05-02 15:32:18 -07:00
committed by GitHub
parent a7de203c86
commit 246e5883bb
24 changed files with 392 additions and 107 deletions
+155 -45
View File
@@ -1692,17 +1692,30 @@ metaslab_largest_unflushed_free(metaslab_t *msp)
static zfs_range_seg_t *
metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start,
uint64_t size, zfs_btree_index_t *where)
uint64_t size, uint64_t max_size, zfs_btree_index_t *where)
{
zfs_range_seg_t *rs;
zfs_range_seg_max_t rsearch;
zfs_rs_set_start(&rsearch, rt, start);
zfs_rs_set_end(&rsearch, rt, start + size);
zfs_rs_set_end(&rsearch, rt, start + max_size);
rs = zfs_btree_find(t, &rsearch, where);
if (rs == NULL) {
rs = zfs_btree_next(t, where, where);
if (size == max_size) {
rs = zfs_btree_next(t, where, where);
} else {
/*
* If we're searching for a range, get the largest
* segment in that range, or the smallest one bigger
* than it.
*/
rs = zfs_btree_prev(t, where, where);
if (rs == NULL || zfs_rs_get_end(rs, rt) -
zfs_rs_get_start(rs, rt) < size) {
rs = zfs_btree_next(t, where, where);
}
}
}
return (rs);
@@ -1715,14 +1728,14 @@ metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start,
*/
static uint64_t
metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size,
uint64_t max_search)
uint64_t max_size, uint64_t max_search, uint64_t *found_size)
{
if (*cursor == 0)
*cursor = rt->rt_start;
zfs_btree_t *bt = &rt->rt_root;
zfs_btree_index_t where;
zfs_range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size,
&where);
max_size, &where);
uint64_t first_found;
int count_searched = 0;
@@ -1733,7 +1746,9 @@ metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size,
max_search || count_searched < metaslab_min_search_count)) {
uint64_t offset = zfs_rs_get_start(rs, rt);
if (offset + size <= zfs_rs_get_end(rs, rt)) {
*cursor = offset + size;
*found_size = MIN(zfs_rs_get_end(rs, rt) - offset,
max_size);
*cursor = offset + *found_size;
return (offset);
}
rs = zfs_btree_next(bt, &where, &where);
@@ -1741,12 +1756,16 @@ metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size,
}
*cursor = 0;
*found_size = 0;
return (-1ULL);
}
static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size);
static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size);
static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size);
static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size,
uint64_t max_size, uint64_t *found_size);
static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size,
uint64_t max_size, uint64_t *found_size);
static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size,
uint64_t max_size, uint64_t *found_size);
metaslab_ops_t *metaslab_allocator(spa_t *spa);
static metaslab_ops_t metaslab_allocators[] = {
@@ -1832,7 +1851,8 @@ metaslab_allocator(spa_t *spa)
* ==========================================================================
*/
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
metaslab_df_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
uint64_t *found_size)
{
/*
* Find the largest power of 2 block size that evenly divides the
@@ -1841,7 +1861,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
* bucket) but it does not guarantee that other allocations sizes
* may exist in the same region.
*/
uint64_t align = size & -size;
uint64_t align = max_size & -max_size;
uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
zfs_range_tree_t *rt = msp->ms_allocatable;
uint_t free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size;
@@ -1855,10 +1875,18 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
*/
if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
free_pct < metaslab_df_free_pct) {
align = size & -size;
cursor = &msp->ms_lbas[highbit64(align) - 1];
offset = -1;
} else {
offset = metaslab_block_picker(rt,
cursor, size, metaslab_df_max_search);
offset = metaslab_block_picker(rt, cursor, size, max_size,
metaslab_df_max_search, found_size);
if (max_size != size && offset == -1) {
align = size & -size;
cursor = &msp->ms_lbas[highbit64(align) - 1];
offset = metaslab_block_picker(rt, cursor, size,
max_size, metaslab_df_max_search, found_size);
}
}
if (offset == -1) {
@@ -1873,12 +1901,14 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
zfs_btree_index_t where;
/* use segment of this size, or next largest */
rs = metaslab_block_find(&msp->ms_allocatable_by_size,
rt, msp->ms_start, size, &where);
rt, msp->ms_start, size, max_size, &where);
}
if (rs != NULL && zfs_rs_get_start(rs, rt) + size <=
zfs_rs_get_end(rs, rt)) {
offset = zfs_rs_get_start(rs, rt);
*cursor = offset + size;
*found_size = MIN(zfs_rs_get_end(rs, rt) - offset,
max_size);
*cursor = offset + *found_size;
}
}
@@ -1895,7 +1925,8 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
* ==========================================================================
*/
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
metaslab_cf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
uint64_t *found_size)
{
zfs_range_tree_t *rt = msp->ms_allocatable;
zfs_btree_t *t = &msp->ms_allocatable_by_size;
@@ -1922,7 +1953,8 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
}
offset = *cursor;
*cursor += size;
*found_size = MIN(*cursor_end - offset, max_size);
*cursor = offset + *found_size;
return (offset);
}
@@ -1943,33 +1975,43 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
uint64_t metaslab_ndf_clump_shift = 4;
static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
uint64_t *found_size)
{
zfs_btree_t *t = &msp->ms_allocatable->rt_root;
zfs_range_tree_t *rt = msp->ms_allocatable;
zfs_btree_index_t where;
zfs_range_seg_t *rs;
zfs_range_seg_max_t rsearch;
uint64_t hbit = highbit64(size);
uint64_t hbit = highbit64(max_size);
uint64_t *cursor = &msp->ms_lbas[hbit - 1];
uint64_t max_size = metaslab_largest_allocatable(msp);
uint64_t max_possible_size = metaslab_largest_allocatable(msp);
ASSERT(MUTEX_HELD(&msp->ms_lock));
if (max_size < size)
if (max_possible_size < size)
return (-1ULL);
zfs_rs_set_start(&rsearch, rt, *cursor);
zfs_rs_set_end(&rsearch, rt, *cursor + size);
zfs_rs_set_end(&rsearch, rt, *cursor + max_size);
rs = zfs_btree_find(t, &rsearch, &where);
if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) <
max_size) {
hbit = highbit64(size);
cursor = &msp->ms_lbas[hbit - 1];
zfs_rs_set_start(&rsearch, rt, *cursor);
zfs_rs_set_end(&rsearch, rt, *cursor + size);
rs = zfs_btree_find(t, &rsearch, &where);
}
if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) <
size) {
t = &msp->ms_allocatable_by_size;
zfs_rs_set_start(&rsearch, rt, 0);
zfs_rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
metaslab_ndf_clump_shift)));
zfs_rs_set_end(&rsearch, rt, MIN(max_possible_size,
1ULL << (hbit + metaslab_ndf_clump_shift)));
rs = zfs_btree_find(t, &rsearch, &where);
if (rs == NULL)
@@ -1978,7 +2020,9 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
}
if ((zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) >= size) {
*cursor = zfs_rs_get_start(rs, rt) + size;
*found_size = MIN(zfs_rs_get_end(rs, rt) -
zfs_rs_get_start(rs, rt), max_size);
*cursor = zfs_rs_get_start(rs, rt) + *found_size;
return (zfs_rs_get_start(rs, rt));
}
return (-1ULL);
@@ -4668,6 +4712,15 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
}
void
metaslab_trace_move(zio_alloc_list_t *old, zio_alloc_list_t *new)
{
ASSERT0(new->zal_size);
list_move_tail(&new->zal_list, &old->zal_list);
new->zal_size = old->zal_size;
list_destroy(&old->zal_list);
}
void
metaslab_trace_init(zio_alloc_list_t *zal)
{
@@ -4697,7 +4750,7 @@ static void
metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, int allocator,
int flags, uint64_t psize, const void *tag)
{
if (!(flags & METASLAB_ASYNC_ALLOC))
if (!(flags & METASLAB_ASYNC_ALLOC) || tag == NULL)
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
@@ -4708,11 +4761,22 @@ metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, int allocator,
(void) zfs_refcount_add_many(&mga->mga_queue_depth, psize, tag);
}
void
metaslab_group_alloc_increment_all(spa_t *spa, blkptr_t *bp, int allocator,
int flags, uint64_t psize, const void *tag)
{
for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[d]);
metaslab_group_alloc_increment(spa, vdev, allocator, flags,
psize, tag);
}
}
void
metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, int allocator,
int flags, uint64_t psize, const void *tag)
{
if (!(flags & METASLAB_ASYNC_ALLOC))
if (!(flags & METASLAB_ASYNC_ALLOC) || tag == NULL)
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
@@ -4724,7 +4788,8 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, int allocator,
}
static uint64_t
metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
uint64_t txg, uint64_t *actual_size)
{
uint64_t start;
zfs_range_tree_t *rt = msp->ms_allocatable;
@@ -4735,8 +4800,9 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
VERIFY0(msp->ms_disabled);
VERIFY0(msp->ms_new);
start = mc->mc_ops->msop_alloc(msp, size);
start = mc->mc_ops->msop_alloc(msp, size, max_size, actual_size);
if (start != -1ULL) {
size = *actual_size;
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
@@ -4879,8 +4945,9 @@ metaslab_active_mask_verify(metaslab_t *msp)
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
uint64_t asize, uint64_t txg, dva_t *dva, int d, int allocator,
boolean_t try_hard)
uint64_t asize, uint64_t max_asize, uint64_t txg,
dva_t *dva, int d, int allocator, boolean_t try_hard,
uint64_t *actual_asize)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
@@ -5095,16 +5162,19 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
continue;
}
offset = metaslab_block_alloc(msp, asize, txg);
metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
offset = metaslab_block_alloc(msp, asize, max_asize, txg,
actual_asize);
if (offset != -1ULL) {
metaslab_trace_add(zal, mg, msp, *actual_asize, d,
offset, allocator);
/* Proactively passivate the metaslab, if needed */
if (activated)
metaslab_segment_may_passivate(msp);
mutex_exit(&msp->ms_lock);
break;
}
metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
next:
ASSERT(msp->ms_loaded);
@@ -5243,13 +5313,10 @@ metaslab_group_allocatable(spa_t *spa, metaslab_group_t *mg, uint64_t psize,
return (B_TRUE);
}
/*
* Allocate a block for the specified i/o.
*/
int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
zio_alloc_list_t *zal, int allocator)
static int
metaslab_alloc_dva_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
uint64_t max_psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg,
int flags, zio_alloc_list_t *zal, int allocator, uint64_t *actual_psize)
{
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
metaslab_group_t *mg = NULL, *rotor;
@@ -5272,6 +5339,13 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
allocator);
return (SET_ERROR(ENOSPC));
}
if (max_psize > psize && max_psize >= metaslab_force_ganging &&
metaslab_force_ganging_pct > 0 &&
(random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
max_psize = MAX((psize + max_psize) / 2,
metaslab_force_ganging);
}
ASSERT3U(psize, <=, max_psize);
/*
* Start at the rotor and loop through all mgs until we find something.
@@ -5319,11 +5393,18 @@ top:
vd = mg->mg_vd;
uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
dva, d, allocator, try_hard);
ASSERT0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
uint64_t max_asize = vdev_psize_to_asize_txg(vd, max_psize,
txg);
ASSERT0(P2PHASE(max_asize, 1ULL << vd->vdev_ashift));
uint64_t offset = metaslab_group_alloc(mg, zal, asize,
max_asize, txg, dva, d, allocator, try_hard,
&asize);
if (offset != -1ULL) {
if (actual_psize)
*actual_psize = vdev_asize_to_psize_txg(vd,
asize, txg);
metaslab_class_rotate(mg, allocator, psize, B_TRUE);
DVA_SET_VDEV(&dva[d], vd->vdev_id);
@@ -5354,6 +5435,18 @@ next:
return (SET_ERROR(ENOSPC));
}
/*
* Allocate a block for the specified i/o.
*/
int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
zio_alloc_list_t *zal, int allocator)
{
return (metaslab_alloc_dva_range(spa, mc, psize, psize, dva, d, hintdva,
txg, flags, zal, allocator, NULL));
}
void
metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
boolean_t checkpoint)
@@ -5841,6 +5934,16 @@ int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
zio_alloc_list_t *zal, int allocator, const void *tag)
{
return (metaslab_alloc_range(spa, mc, psize, psize, bp, ndvas, txg,
hintbp, flags, zal, allocator, tag, NULL));
}
int
metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
uint64_t max_psize, blkptr_t *bp, int ndvas, uint64_t txg,
blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, int allocator,
const void *tag, uint64_t *actual_psize)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
@@ -5862,9 +5965,12 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
ASSERT3P(zal, !=, NULL);
uint64_t cur_psize = 0;
for (int d = 0; d < ndvas; d++) {
error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
txg, flags, zal, allocator);
error = metaslab_alloc_dva_range(spa, mc, psize, max_psize,
dva, d, hintdva, txg, flags, zal, allocator,
actual_psize ? &cur_psize : NULL);
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_unalloc_dva(spa, &dva[d], txg);
@@ -5883,10 +5989,14 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
metaslab_group_alloc_increment(spa,
DVA_GET_VDEV(&dva[d]), allocator, flags, psize,
tag);
if (actual_psize)
max_psize = MIN(cur_psize, max_psize);
}
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
if (actual_psize)
*actual_psize = max_psize;
spa_config_exit(spa, SCL_ALLOC, FTAG);