Implement dynamic gang header sizes

ZFS gang block headers are currently fixed at 512 bytes. This is
increasingly wasteful in the era of larger disk sector sizes. This PR
allows any size allocation to work as a gang header. It also contains
supporting changes to ZDB to make gang headers easier to work with.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Closes #17004
This commit is contained in:
Paul Dagnelie
2025-01-23 16:26:09 -08:00
committed by Brian Behlendorf
parent e845be28e7
commit a981cb69e4
18 changed files with 387 additions and 72 deletions
+7 -7
View File
@@ -5974,12 +5974,12 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
ASSERT3P(zal, !=, NULL);
uint64_t cur_psize = 0;
uint64_t smallest_psize = UINT64_MAX;
for (int d = 0; d < ndvas; d++) {
error = metaslab_alloc_dva_range(spa, mc, psize, max_psize,
dva, d, hintdva, txg, flags, zal, allocator,
actual_psize ? &cur_psize : NULL);
uint64_t cur_psize = 0;
error = metaslab_alloc_dva_range(spa, mc, psize,
MIN(smallest_psize, max_psize), dva, d, hintdva, txg,
flags, zal, allocator, actual_psize ? &cur_psize : NULL);
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_unalloc_dva(spa, &dva[d], txg);
@@ -5999,13 +5999,13 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
DVA_GET_VDEV(&dva[d]), allocator, flags, psize,
tag);
if (actual_psize)
max_psize = MIN(cur_psize, max_psize);
smallest_psize = MIN(cur_psize, smallest_psize);
}
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
if (actual_psize)
*actual_psize = max_psize;
*actual_psize = smallest_psize;
spa_config_exit(spa, SCL_ALLOC, FTAG);
+105 -38
View File
@@ -2743,11 +2743,14 @@ zio_resume_wait(spa_t *spa)
* being nearly full, it calls zio_write_gang_block() to construct the
* block from smaller fragments.
*
* A gang block consists of a gang header (zio_gbh_phys_t) and up to
* three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
* an indirect block: it's an array of block pointers. It consumes
* only one sector and hence is allocatable regardless of fragmentation.
* The gang header's bps point to its gang members, which hold the data.
* A gang block consists of a gang header and up to gbh_nblkptrs(size)
* gang members. The gang header is like an indirect block: it's an array
* of block pointers, though the header has a small tail (a zio_eck_t)
* that stores an embedded checksum. It is allocated using only a single
* sector as the requested size, and hence is allocatable regardless of
* fragmentation. Its size is determined by the smallest allocatable
* asize of the vdevs it was allocated on. The gang header's bps point
* to its gang members, which hold the data.
*
* Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
* as the verifier to ensure uniqueness of the SHA256 checksum.
@@ -2826,10 +2829,10 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
if (gn != NULL) {
abd_t *gbh_abd =
abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
abd_get_from_buf(gn->gn_gbh, gn->gn_gangblocksize);
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
gbh_abd, gn->gn_gangblocksize, zio_gang_issue_func_done,
NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark);
/*
* As we rewrite each gang header, the pipeline will compute
@@ -2900,14 +2903,16 @@ static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
static void zio_gang_tree_assemble_done(zio_t *zio);
static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
zio_gang_node_alloc(zio_gang_node_t **gnpp, uint64_t gangblocksize)
{
zio_gang_node_t *gn;
ASSERT(*gnpp == NULL);
gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
gn = kmem_zalloc(sizeof (*gn) +
(gbh_nblkptrs(gangblocksize) * sizeof (gn)), KM_SLEEP);
gn->gn_gangblocksize = gn->gn_allocsize = gangblocksize;
gn->gn_gbh = zio_buf_alloc(gangblocksize);
*gnpp = gn;
return (gn);
@@ -2918,11 +2923,12 @@ zio_gang_node_free(zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = *gnpp;
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++)
ASSERT(gn->gn_child[g] == NULL);
zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
kmem_free(gn, sizeof (*gn));
zio_buf_free(gn->gn_gbh, gn->gn_allocsize);
kmem_free(gn, sizeof (*gn) +
(gbh_nblkptrs(gn->gn_allocsize) * sizeof (gn)));
*gnpp = NULL;
}
@@ -2934,7 +2940,7 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
if (gn == NULL)
return;
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++)
zio_gang_tree_free(&gn->gn_child[g]);
zio_gang_node_free(gnpp);
@@ -2943,13 +2949,28 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
uint64_t gangblocksize = UINT64_MAX;
if (spa_feature_is_active(gio->io_spa,
SPA_FEATURE_DYNAMIC_GANG_HEADER)) {
spa_config_enter(gio->io_spa, SCL_VDEV, FTAG, RW_READER);
for (int dva = 0; dva < BP_GET_NDVAS(bp); dva++) {
vdev_t *vd = vdev_lookup_top(gio->io_spa,
DVA_GET_VDEV(&bp->blk_dva[dva]));
uint64_t asize = vdev_gang_header_asize(vd);
gangblocksize = MIN(gangblocksize, asize);
}
spa_config_exit(gio->io_spa, SCL_VDEV, FTAG);
} else {
gangblocksize = SPA_OLD_GANGBLOCKSIZE;
}
ASSERT3U(gangblocksize, !=, UINT64_MAX);
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp, gangblocksize);
abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, gangblocksize);
ASSERT(gio->io_gang_leader == gio);
ASSERT(BP_IS_GANG(bp));
zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, gangblocksize,
zio_gang_tree_assemble_done, gn, gio->io_priority,
ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
@@ -2972,13 +2993,17 @@ zio_gang_tree_assemble_done(zio_t *zio)
byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
/*
* If this was an old-style gangblock, the gangblocksize should have
* been updated in zio_checksum_error to reflect that.
*/
ASSERT3U(gbh_eck(gn->gn_gbh, gn->gn_gangblocksize)->zec_magic,
==, ZEC_MAGIC);
abd_free(zio->io_abd);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
blkptr_t *gbp = gbh_bp(gn->gn_gbh, g);
if (!BP_IS_GANG(gbp))
continue;
zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
@@ -3003,10 +3028,11 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
if (gn != NULL) {
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
ASSERT3U(gbh_eck(gn->gn_gbh,
gn->gn_gangblocksize)->zec_magic, ==, ZEC_MAGIC);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
blkptr_t *gbp = gbh_bp(gn->gn_gbh, g);
if (BP_IS_HOLE(gbp))
continue;
zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
@@ -3113,6 +3139,13 @@ zio_write_gang_done(zio_t *zio)
abd_free(zio->io_abd);
}
/*
* Callback handed to dsl_sync_task_nowait() by zio_write_gang_block().
* It calls spa_feature_incr() for one feature on the pool that owns tx.
*
* arg does not point at storage: it carries a spa_feature_t value that
* the caller cast to (void *), so it is converted back through uintptr_t
* instead of being dereferenced.
*/
static void
zio_update_feature(void *arg, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
spa_feature_incr(spa, (spa_feature_t)(uintptr_t)arg, tx);
}
static zio_t *
zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
{
@@ -3158,13 +3191,17 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
flags |= METASLAB_ASYNC_ALLOC;
}
error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
uint64_t gangblocksize = SPA_OLD_GANGBLOCKSIZE;
uint64_t candidate = gangblocksize;
error = metaslab_alloc_range(spa, mc, gangblocksize, gangblocksize,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
&pio->io_alloc_list, pio->io_allocator, pio);
&pio->io_alloc_list, pio->io_allocator, pio, &candidate);
if (error) {
pio->io_error = error;
return (pio);
}
if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER))
gangblocksize = candidate;
if (pio == gio) {
gnpp = &gio->io_gang_tree;
@@ -3173,15 +3210,15 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
ASSERT(pio->io_ready == zio_write_gang_member_ready);
}
gn = zio_gang_node_alloc(gnpp);
gn = zio_gang_node_alloc(gnpp, gangblocksize);
gbh = gn->gn_gbh;
memset(gbh, 0, SPA_GANGBLOCKSIZE);
gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
memset(gbh, 0, gangblocksize);
gbh_abd = abd_get_from_buf(gbh, gangblocksize);
/*
* Create the gang header.
*/
zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, gangblocksize,
zio_write_gang_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
@@ -3198,7 +3235,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
* opportunistic allocations. If that fails to generate enough
* space, we fall back to normal zio_write calls for nested gang.
*/
for (int g = 0; resid != 0; g++) {
int g;
boolean_t any_failed = B_FALSE;
for (g = 0; resid != 0; g++) {
flags &= METASLAB_ASYNC_ALLOC;
flags |= METASLAB_GANG_CHILD;
zp.zp_checksum = gio->io_prop.zp_checksum;
@@ -3219,9 +3258,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
uint64_t min_size = zio_roundup_alloc_size(spa,
resid / (SPA_GBH_NBLKPTRS - g));
resid / (gbh_nblkptrs(gangblocksize) - g));
min_size = MIN(min_size, resid);
bp = &gbh->zg_blkptr[g];
bp = &((blkptr_t *)gbh)[g];
zio_alloc_list_t cio_list;
metaslab_trace_init(&cio_list);
@@ -3231,6 +3270,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
flags, &cio_list, zio->io_allocator, NULL, &allocated_size);
boolean_t allocated = error == 0;
any_failed |= !allocated;
uint64_t psize = allocated ? MIN(resid, allocated_size) :
min_size;
@@ -3262,6 +3302,29 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zio_nowait(cio);
}
/*
* If we used more gang children than the old limit, we must already be
* using the new headers. No need to update anything, just move on.
*
* Otherwise, we might be in a case where we need to turn on the new
* feature, so we check that. We enable the new feature if we didn't
* manage to fit everything into 3 gang children and we could have
* written more than that.
*/
if (g > gbh_nblkptrs(SPA_OLD_GANGBLOCKSIZE)) {
ASSERT(spa_feature_is_active(spa,
SPA_FEATURE_DYNAMIC_GANG_HEADER));
} else if (any_failed && candidate > SPA_OLD_GANGBLOCKSIZE &&
spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) &&
!spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) {
dmu_tx_t *tx =
dmu_tx_create_assigned(spa->spa_dsl_pool, txg + 1);
dsl_sync_task_nowait(spa->spa_dsl_pool,
zio_update_feature,
(void *)SPA_FEATURE_DYNAMIC_GANG_HEADER, tx);
dmu_tx_commit(tx);
}
/*
* Set pio's pipeline to just wait for zio to finish.
*/
@@ -4331,9 +4394,9 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
}
if (gn != NULL) {
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
zio_dva_unallocate(zio, gn->gn_child[g],
&gn->gn_gbh->zg_blkptr[g]);
gbh_bp(gn->gn_gbh, g));
}
}
}
@@ -5262,6 +5325,7 @@ zio_dva_throttle_done(zio_t *zio)
vdev_t *vd = zio->io_vd;
int flags = METASLAB_ASYNC_ALLOC;
const void *tag = pio;
uint64_t size = pio->io_size;
ASSERT3P(zio->io_bp, !=, NULL);
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
@@ -5277,10 +5341,13 @@ zio_dva_throttle_done(zio_t *zio)
* Parents of gang children can have two flavors -- ones that allocated
* the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that
* allocated the constituent blocks. The first use their parent as tag.
* We set the size to match the original allocation call for that case.
*/
if (pio->io_child_type == ZIO_CHILD_GANG &&
(pio->io_flags & ZIO_FLAG_IO_REWRITE))
(pio->io_flags & ZIO_FLAG_IO_REWRITE)) {
tag = zio_unique_parent(pio);
size = SPA_OLD_GANGBLOCKSIZE;
}
ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG &&
(pio->io_flags & ZIO_FLAG_IO_REWRITE)));
@@ -5293,7 +5360,7 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id,
pio->io_allocator, flags, pio->io_size, tag);
pio->io_allocator, flags, size, tag);
if (metaslab_class_throttle_unreserve(pio->io_metaslab_class,
pio->io_allocator, 1, pio->io_size)) {
+23 -2
View File
@@ -545,14 +545,35 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
int error;
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t size = bp ? BP_GET_PSIZE(bp) : zio->io_size;
uint64_t offset = zio->io_offset;
abd_t *data = zio->io_abd;
spa_t *spa = zio->io_spa;
if (bp && BP_IS_GANG(bp)) {
if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER))
size = zio->io_size;
else
size = SPA_OLD_GANGBLOCKSIZE;
}
error = zio_checksum_error_impl(spa, bp, checksum, data, size,
offset, info);
if (error && bp && BP_IS_GANG(bp) && size > SPA_OLD_GANGBLOCKSIZE) {
/*
* It's possible that this is an old gang block. Rerun
* the checksum with the old size; if that passes, then
* update the gangblocksize appropriately.
*/
error = zio_checksum_error_impl(spa, bp, checksum, data,
SPA_OLD_GANGBLOCKSIZE, offset, info);
if (error == 0) {
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
zio_t *pio = zio_unique_parent(zio);
zio_gang_node_t *gn = pio->io_private;
gn->gn_gangblocksize = SPA_OLD_GANGBLOCKSIZE;
}
}
if (zio_injection_enabled && error == 0 && zio->io_error == 0) {
error = zio_handle_fault_injection(zio, ECKSUM);