Implement dynamic gang header sizes

ZFS gang block headers are currently fixed at 512 bytes. This is
increasingly wasteful in the era of larger disk sector sizes. This PR
allows any size allocation to work as a gang header. It also contains
supporting changes to ZDB to make gang headers easier to work with.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Closes #17004
This commit is contained in:
Paul Dagnelie 2025-01-23 16:26:09 -08:00 committed by Brian Behlendorf
parent e845be28e7
commit a981cb69e4
18 changed files with 387 additions and 72 deletions

View File

@ -8588,9 +8588,9 @@ zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
}
static void
zdb_dump_gbh(void *buf, int flags)
zdb_dump_gbh(void *buf, uint64_t size, int flags)
{
zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
zdb_dump_indirect((blkptr_t *)buf, gbh_nblkptrs(size), flags);
}
static void
@ -9073,7 +9073,7 @@ zdb_read_block(char *thing, spa_t *spa)
zdb_dump_indirect((blkptr_t *)buf,
orig_lsize / sizeof (blkptr_t), flags);
else if (flags & ZDB_FLAG_GBH)
zdb_dump_gbh(buf, flags);
zdb_dump_gbh(buf, lsize, flags);
else
zdb_dump_block(thing, buf, lsize, flags);

View File

@ -148,7 +148,7 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
static inline uint64_t
vdev_gang_header_asize(vdev_t *vd)
{
return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0));
return (vdev_psize_to_asize_txg(vd, SPA_OLD_GANGBLOCKSIZE, 0));
}
extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);

View File

@ -59,21 +59,36 @@ typedef struct zio_eck {
/*
* Gang block headers are self-checksumming and contain an array
* of block pointers.
* of block pointers. The old gang block size has enough room for 3 blkptrs,
* while new gang blocks can store more.
*
* Layout:
* +--------+--------+--------+-----+---------+-----------+
* | | | | | | |
* | blkptr | blkptr | blkptr | ... | padding | zio_eck_t |
* | 1 | 2 | 3 | | | |
* +--------+--------+--------+-----+---------+-----------+
* 128B 128B 128B 88B 40B
*/
#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
sizeof (zio_eck_t)) / sizeof (blkptr_t))
#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
sizeof (zio_eck_t) - \
(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
sizeof (uint64_t))
#define SPA_OLD_GANGBLOCKSIZE SPA_MINBLOCKSIZE
typedef void zio_gbh_phys_t;
typedef struct zio_gbh {
blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
uint64_t zg_filler[SPA_GBH_FILLER];
zio_eck_t zg_tail;
} zio_gbh_phys_t;
/*
 * Return how many block pointers fit in a gang header of `size` bytes:
 * the space left after the trailing zio_eck_t, divided by
 * sizeof (blkptr_t).  The division truncates, so any remainder is padding
 * between the last blkptr and the tail.
 *
 * `size` must be a multiple of sizeof (blkptr_t) and at least
 * sizeof (zio_eck_t).  The second condition matters because the
 * subtraction is unsigned: a smaller size (0 passes the alignment check)
 * would wrap and produce an enormous bogus count.
 */
static inline uint64_t
gbh_nblkptrs(uint64_t size)
{
	ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t)));
	ASSERT(size >= sizeof (zio_eck_t));
	return ((size - sizeof (zio_eck_t)) / sizeof (blkptr_t));
}
/*
 * Return a pointer to the embedded checksum tail (zio_eck_t) of a gang
 * header.  The tail occupies the last sizeof (zio_eck_t) bytes of the
 * `size`-byte buffer starting at `gbh`.
 *
 * `size` must be a multiple of sizeof (blkptr_t) and at least
 * sizeof (zio_eck_t); otherwise the computed address would lie before
 * the start of the buffer.
 */
static inline zio_eck_t *
gbh_eck(zio_gbh_phys_t *gbh, uint64_t size)
{
	ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t)));
	ASSERT(size >= sizeof (zio_eck_t));
	return ((zio_eck_t *)((uintptr_t)gbh + size - sizeof (zio_eck_t)));
}
/*
 * Return a pointer to the bp'th block pointer of a gang header, treating
 * the header buffer as a plain array of blkptr_t.  No bounds check is
 * performed here.
 */
static inline blkptr_t *
gbh_bp(zio_gbh_phys_t *gbh, int bp)
{
	blkptr_t *bps = (blkptr_t *)gbh;

	return (bps + bp);
}
enum zio_checksum {
ZIO_CHECKSUM_INHERIT = 0,
@ -398,7 +413,9 @@ typedef struct zio_vsd_ops {
typedef struct zio_gang_node {
zio_gbh_phys_t *gn_gbh;
struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS];
uint64_t gn_gangblocksize;
uint64_t gn_allocsize;
struct zio_gang_node *gn_child[];
} zio_gang_node_t;
typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,

View File

@ -87,6 +87,7 @@ typedef enum spa_feature {
SPA_FEATURE_FAST_DEDUP,
SPA_FEATURE_LONGNAME,
SPA_FEATURE_LARGE_MICROZAP,
SPA_FEATURE_DYNAMIC_GANG_HEADER,
SPA_FEATURES
} spa_feature_t;

View File

@ -631,7 +631,7 @@
<elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2464' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2520' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_deleg_perm_tab' size='528' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -6210,7 +6210,8 @@
<enumerator name='SPA_FEATURE_FAST_DEDUP' value='41'/>
<enumerator name='SPA_FEATURE_LONGNAME' value='42'/>
<enumerator name='SPA_FEATURE_LARGE_MICROZAP' value='43'/>
<enumerator name='SPA_FEATURES' value='44'/>
<enumerator name='SPA_FEATURE_DYNAMIC_GANG_HEADER' value='44'/>
<enumerator name='SPA_FEATURES' value='45'/>
</enum-decl>
<typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
<qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/>
@ -9394,8 +9395,8 @@
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='19712' id='fd4573e5'>
<subrange length='44' type-id='7359adad' id='cf8ba455'/>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='20160' id='b948da70'>
<subrange length='45' type-id='7359adad' id='cb8ddca0'/>
</array-type-def>
<enum-decl name='zfeature_flags' id='6db816a4'>
<underlying-type type-id='9cac1fee'/>
@ -9403,6 +9404,7 @@
<enumerator name='ZFEATURE_FLAG_MOS' value='2'/>
<enumerator name='ZFEATURE_FLAG_ACTIVATE_ON_ENABLE' value='4'/>
<enumerator name='ZFEATURE_FLAG_PER_DATASET' value='8'/>
<enumerator name='ZFEATURE_FLAG_NO_UPGRADE' value='16'/>
</enum-decl>
<typedef-decl name='zfeature_flags_t' type-id='6db816a4' id='fc329033'/>
<enum-decl name='zfeature_type' id='c4fa2355'>
@ -9472,7 +9474,7 @@
<pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
<qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
<pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
<var-decl name='spa_feature_table' type-id='fd4573e5' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='spa_feature_table' type-id='b948da70' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
<function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>

View File

@ -493,6 +493,19 @@ vdev type, or when adding a new
.Sy draid
vdev to an existing pool.
.
.feature com.klarasystems dynamic_gang_header no
This feature enables larger gang headers based on the sector size of the pool.
When enabled, gang headers will use the entire space allocated for them, instead
of always restricting themselves to 512 bytes.
This can reduce the need for nested gang trees in extreme fragmentation
scenarios.
.Pp
This feature becomes active when a gang header is written that is larger than
512 bytes.
This feature is not enabled by
.Xr zpool-upgrade 8 .
Instead, it must be manually enabled, or be part of a compatibility file.
.
.feature org.illumos edonr no extensible_dataset
This feature enables the use of the Edon-R hash algorithm for checksum,
including for nopwrite

View File

@ -786,6 +786,12 @@ zpool_feature_init(void)
ZFEATURE_TYPE_BOOLEAN, large_microzap_deps, sfeatures);
}
zfeature_register(SPA_FEATURE_DYNAMIC_GANG_HEADER,
"com.klarasystems:dynamic_gang_header", "dynamic_gang_header",
"Support for dynamically sized gang headers",
ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_NO_UPGRADE,
ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
zfs_mod_list_supported_free(sfeatures);
}

View File

@ -5974,12 +5974,12 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
ASSERT3P(zal, !=, NULL);
uint64_t cur_psize = 0;
uint64_t smallest_psize = UINT64_MAX;
for (int d = 0; d < ndvas; d++) {
error = metaslab_alloc_dva_range(spa, mc, psize, max_psize,
dva, d, hintdva, txg, flags, zal, allocator,
actual_psize ? &cur_psize : NULL);
uint64_t cur_psize = 0;
error = metaslab_alloc_dva_range(spa, mc, psize,
MIN(smallest_psize, max_psize), dva, d, hintdva, txg,
flags, zal, allocator, actual_psize ? &cur_psize : NULL);
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_unalloc_dva(spa, &dva[d], txg);
@ -5999,13 +5999,13 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
DVA_GET_VDEV(&dva[d]), allocator, flags, psize,
tag);
if (actual_psize)
max_psize = MIN(cur_psize, max_psize);
smallest_psize = MIN(cur_psize, smallest_psize);
}
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
if (actual_psize)
*actual_psize = max_psize;
*actual_psize = smallest_psize;
spa_config_exit(spa, SCL_ALLOC, FTAG);

View File

@ -2743,11 +2743,14 @@ zio_resume_wait(spa_t *spa)
* being nearly full, it calls zio_write_gang_block() to construct the
* block from smaller fragments.
*
* A gang block consists of a gang header (zio_gbh_phys_t) and up to
* three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
* an indirect block: it's an array of block pointers. It consumes
* only one sector and hence is allocatable regardless of fragmentation.
* The gang header's bps point to its gang members, which hold the data.
* A gang block consists of a gang header and up to gbh_nblkptrs(size)
* gang members. The gang header is like an indirect block: it's an array
* of block pointers, though the header has a small tail (a zio_eck_t)
* that stores an embedded checksum. It is allocated using only a single
* sector as the requested size, and hence is allocatable regardless of
* fragmentation. Its size is determined by the smallest allocatable
* asize of the vdevs it was allocated on. The gang header's bps point
* to its gang members, which hold the data.
*
* Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
* as the verifier to ensure uniqueness of the SHA256 checksum.
@ -2826,10 +2829,10 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
if (gn != NULL) {
abd_t *gbh_abd =
abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
abd_get_from_buf(gn->gn_gbh, gn->gn_gangblocksize);
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
gbh_abd, gn->gn_gangblocksize, zio_gang_issue_func_done,
NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark);
/*
* As we rewrite each gang header, the pipeline will compute
@ -2900,14 +2903,16 @@ static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
static void zio_gang_tree_assemble_done(zio_t *zio);
static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
zio_gang_node_alloc(zio_gang_node_t **gnpp, uint64_t gangblocksize)
{
zio_gang_node_t *gn;
ASSERT(*gnpp == NULL);
gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
gn = kmem_zalloc(sizeof (*gn) +
(gbh_nblkptrs(gangblocksize) * sizeof (gn)), KM_SLEEP);
gn->gn_gangblocksize = gn->gn_allocsize = gangblocksize;
gn->gn_gbh = zio_buf_alloc(gangblocksize);
*gnpp = gn;
return (gn);
@ -2918,11 +2923,12 @@ zio_gang_node_free(zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = *gnpp;
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++)
ASSERT(gn->gn_child[g] == NULL);
zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
kmem_free(gn, sizeof (*gn));
zio_buf_free(gn->gn_gbh, gn->gn_allocsize);
kmem_free(gn, sizeof (*gn) +
(gbh_nblkptrs(gn->gn_allocsize) * sizeof (gn)));
*gnpp = NULL;
}
@ -2934,7 +2940,7 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
if (gn == NULL)
return;
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++)
zio_gang_tree_free(&gn->gn_child[g]);
zio_gang_node_free(gnpp);
@ -2943,13 +2949,28 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
uint64_t gangblocksize = UINT64_MAX;
if (spa_feature_is_active(gio->io_spa,
SPA_FEATURE_DYNAMIC_GANG_HEADER)) {
spa_config_enter(gio->io_spa, SCL_VDEV, FTAG, RW_READER);
for (int dva = 0; dva < BP_GET_NDVAS(bp); dva++) {
vdev_t *vd = vdev_lookup_top(gio->io_spa,
DVA_GET_VDEV(&bp->blk_dva[dva]));
uint64_t asize = vdev_gang_header_asize(vd);
gangblocksize = MIN(gangblocksize, asize);
}
spa_config_exit(gio->io_spa, SCL_VDEV, FTAG);
} else {
gangblocksize = SPA_OLD_GANGBLOCKSIZE;
}
ASSERT3U(gangblocksize, !=, UINT64_MAX);
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp, gangblocksize);
abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, gangblocksize);
ASSERT(gio->io_gang_leader == gio);
ASSERT(BP_IS_GANG(bp));
zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, gangblocksize,
zio_gang_tree_assemble_done, gn, gio->io_priority,
ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
@ -2972,13 +2993,17 @@ zio_gang_tree_assemble_done(zio_t *zio)
byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
/*
* If this was an old-style gangblock, the gangblocksize should have
* been updated in zio_checksum_error to reflect that.
*/
ASSERT3U(gbh_eck(gn->gn_gbh, gn->gn_gangblocksize)->zec_magic,
==, ZEC_MAGIC);
abd_free(zio->io_abd);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
blkptr_t *gbp = gbh_bp(gn->gn_gbh, g);
if (!BP_IS_GANG(gbp))
continue;
zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
@ -3003,10 +3028,11 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
if (gn != NULL) {
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
ASSERT3U(gbh_eck(gn->gn_gbh,
gn->gn_gangblocksize)->zec_magic, ==, ZEC_MAGIC);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
blkptr_t *gbp = gbh_bp(gn->gn_gbh, g);
if (BP_IS_HOLE(gbp))
continue;
zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
@ -3113,6 +3139,13 @@ zio_write_gang_done(zio_t *zio)
abd_free(zio->io_abd);
}
/*
 * Sync task callback: increment the refcount of a pool feature.
 *
 * arg carries the spa_feature_t value itself, cast to a pointer (it is
 * not a pointer to anything); tx is the transaction the sync task runs
 * in, and also identifies the pool whose feature is bumped.
 */
static void
zio_update_feature(void *arg, dmu_tx_t *tx)
{
	spa_feature_t feature = (spa_feature_t)(uintptr_t)arg;

	spa_feature_incr(dmu_tx_pool(tx)->dp_spa, feature, tx);
}
static zio_t *
zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
{
@ -3158,13 +3191,17 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
flags |= METASLAB_ASYNC_ALLOC;
}
error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
uint64_t gangblocksize = SPA_OLD_GANGBLOCKSIZE;
uint64_t candidate = gangblocksize;
error = metaslab_alloc_range(spa, mc, gangblocksize, gangblocksize,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
&pio->io_alloc_list, pio->io_allocator, pio);
&pio->io_alloc_list, pio->io_allocator, pio, &candidate);
if (error) {
pio->io_error = error;
return (pio);
}
if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER))
gangblocksize = candidate;
if (pio == gio) {
gnpp = &gio->io_gang_tree;
@ -3173,15 +3210,15 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
ASSERT(pio->io_ready == zio_write_gang_member_ready);
}
gn = zio_gang_node_alloc(gnpp);
gn = zio_gang_node_alloc(gnpp, gangblocksize);
gbh = gn->gn_gbh;
memset(gbh, 0, SPA_GANGBLOCKSIZE);
gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
memset(gbh, 0, gangblocksize);
gbh_abd = abd_get_from_buf(gbh, gangblocksize);
/*
* Create the gang header.
*/
zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, gangblocksize,
zio_write_gang_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
@ -3198,7 +3235,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
* opportunistic allocations. If that fails to generate enough
* space, we fall back to normal zio_write calls for nested gang.
*/
for (int g = 0; resid != 0; g++) {
int g;
boolean_t any_failed = B_FALSE;
for (g = 0; resid != 0; g++) {
flags &= METASLAB_ASYNC_ALLOC;
flags |= METASLAB_GANG_CHILD;
zp.zp_checksum = gio->io_prop.zp_checksum;
@ -3219,9 +3258,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
uint64_t min_size = zio_roundup_alloc_size(spa,
resid / (SPA_GBH_NBLKPTRS - g));
resid / (gbh_nblkptrs(gangblocksize) - g));
min_size = MIN(min_size, resid);
bp = &gbh->zg_blkptr[g];
bp = &((blkptr_t *)gbh)[g];
zio_alloc_list_t cio_list;
metaslab_trace_init(&cio_list);
@ -3231,6 +3270,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
flags, &cio_list, zio->io_allocator, NULL, &allocated_size);
boolean_t allocated = error == 0;
any_failed |= !allocated;
uint64_t psize = allocated ? MIN(resid, allocated_size) :
min_size;
@ -3262,6 +3302,29 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zio_nowait(cio);
}
/*
* If we used more gang children than the old limit, we must already be
* using the new headers. No need to update anything, just move on.
*
* Otherwise, we might be in a case where we need to turn on the new
* feature, so we check that. We enable the new feature if we didn't
* manage to fit everything into 3 gang children and we could have
* written more than that.
*/
if (g > gbh_nblkptrs(SPA_OLD_GANGBLOCKSIZE)) {
ASSERT(spa_feature_is_active(spa,
SPA_FEATURE_DYNAMIC_GANG_HEADER));
} else if (any_failed && candidate > SPA_OLD_GANGBLOCKSIZE &&
spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) &&
!spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) {
dmu_tx_t *tx =
dmu_tx_create_assigned(spa->spa_dsl_pool, txg + 1);
dsl_sync_task_nowait(spa->spa_dsl_pool,
zio_update_feature,
(void *)SPA_FEATURE_DYNAMIC_GANG_HEADER, tx);
dmu_tx_commit(tx);
}
/*
* Set pio's pipeline to just wait for zio to finish.
*/
@ -4331,9 +4394,9 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
}
if (gn != NULL) {
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
zio_dva_unallocate(zio, gn->gn_child[g],
&gn->gn_gbh->zg_blkptr[g]);
gbh_bp(gn->gn_gbh, g));
}
}
}
@ -5262,6 +5325,7 @@ zio_dva_throttle_done(zio_t *zio)
vdev_t *vd = zio->io_vd;
int flags = METASLAB_ASYNC_ALLOC;
const void *tag = pio;
uint64_t size = pio->io_size;
ASSERT3P(zio->io_bp, !=, NULL);
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
@ -5277,10 +5341,13 @@ zio_dva_throttle_done(zio_t *zio)
* Parents of gang children can have two flavors -- ones that allocated
* the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that
* allocated the constituent blocks. The first use their parent as tag.
* We set the size to match the original allocation call for that case.
*/
if (pio->io_child_type == ZIO_CHILD_GANG &&
(pio->io_flags & ZIO_FLAG_IO_REWRITE))
(pio->io_flags & ZIO_FLAG_IO_REWRITE)) {
tag = zio_unique_parent(pio);
size = SPA_OLD_GANGBLOCKSIZE;
}
ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG &&
(pio->io_flags & ZIO_FLAG_IO_REWRITE)));
@ -5293,7 +5360,7 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id,
pio->io_allocator, flags, pio->io_size, tag);
pio->io_allocator, flags, size, tag);
if (metaslab_class_throttle_unreserve(pio->io_metaslab_class,
pio->io_allocator, 1, pio->io_size)) {

View File

@ -545,14 +545,35 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
int error;
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t size = bp ? BP_GET_PSIZE(bp) : zio->io_size;
uint64_t offset = zio->io_offset;
abd_t *data = zio->io_abd;
spa_t *spa = zio->io_spa;
if (bp && BP_IS_GANG(bp)) {
if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER))
size = zio->io_size;
else
size = SPA_OLD_GANGBLOCKSIZE;
}
error = zio_checksum_error_impl(spa, bp, checksum, data, size,
offset, info);
if (error && bp && BP_IS_GANG(bp) && size > SPA_OLD_GANGBLOCKSIZE) {
/*
* It's possible that this is an old gang block. Rerun
* the checksum with the old size; if that passes, then
* update the gangblocksize appropriately.
*/
error = zio_checksum_error_impl(spa, bp, checksum, data,
SPA_OLD_GANGBLOCKSIZE, offset, info);
if (error == 0) {
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
zio_t *pio = zio_unique_parent(zio);
zio_gang_node_t *gn = pio->io_private;
gn->gn_gangblocksize = SPA_OLD_GANGBLOCKSIZE;
}
}
if (zio_injection_enabled && error == 0 && zio->io_error == 0) {
error = zio_handle_fault_injection(zio, ECKSUM);

View File

@ -739,7 +739,8 @@ tags = ['functional', 'features', 'large_dnode']
[tests/functional/gang_blocks]
tests = ['gang_blocks_001_pos', 'gang_blocks_redundant',
'gang_blocks_ddt_copies']
'gang_blocks_ddt_copies', 'gang_blocks_dyn_header_pos',
'gang_blocks_dyn_header_neg', 'gang_blocks_dyn_multi']
tags = ['functional', 'gang_blocks']
[tests/functional/grow]

View File

@ -1579,6 +1579,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/gang_blocks/gang_blocks_001_pos.ksh \
functional/gang_blocks/gang_blocks_ddt_copies.ksh \
functional/gang_blocks/gang_blocks_redundant.ksh \
functional/gang_blocks/gang_blocks_dyn_header_neg.ksh \
functional/gang_blocks/gang_blocks_dyn_header_pos.ksh \
functional/gang_blocks/gang_blocks_dyn_multi.ksh \
functional/gang_blocks/setup.ksh \
functional/grow/grow_pool_001_pos.ksh \
functional/grow/grow_replicas_001_pos.ksh \

View File

@ -50,7 +50,7 @@ function cleanup
function check_features
{
for state in $(zpool get all $TESTPOOL | \
for state in $(zpool get all $TESTPOOL | grep -v "dynamic_gang_header" | \
awk '$2 ~ /feature@/ { print $3 }'); do
if [[ "$state" != "enabled" && "$state" != "active" ]]; then
log_fail "some features are not enabled on new pool"

View File

@ -58,6 +58,9 @@ function check_features
return 1;
fi
else
if [[ "feature@dynamic_gang_header" == "${2}" ]]; then
continue
fi
# All other features must be enabled or active; otherwise fail.
if [[ "${3}" != "enabled" && "${3}" != "active" ]]; then
return 2;

View File

@ -91,6 +91,7 @@ typeset -a properties=(
"feature@device_rebuild"
"feature@draid"
"feature@redaction_list_spill"
"feature@dynamic_gang_header"
)
if is_linux || is_freebsd; then

View File

@ -0,0 +1,53 @@
#!/bin/ksh
# SPDX-License-Identifier: CDDL-1.0
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2025 by Klara Inc.
#
#
# Description:
# Verify that we don't use larger gang headers on ashift=9 pools
#
# Strategy:
# 1. Create a pool with dynamic gang headers.
# 2. Set metaslab_force_ganging to force multi-level ganging.
# 3. Verify that a large file has multi-level ganging
#
# Pull in the common test library and the gang-block test helpers.
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
log_assert "Verify that we don't use large gang headers on small-ashift pools".
# cleanup and preamble are not defined in this script; presumably they come
# from gang_blocks.kshlib.
log_onexit cleanup
preamble
# Create an ashift=9 pool with the feature enabled up front.
log_must zpool create -f -o ashift=9 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS
mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
# Per the strategy above, these tunables force the write below to gang.
set_tunable64 METASLAB_FORCE_GANGING 200000
set_tunable32 METASLAB_FORCE_GANGING_PCT 100
# Write a single 1M record and make sure it has reached disk.
path="${mountpoint}/file"
log_must dd if=/dev/urandom of=$path bs=1M count=1
log_must zpool sync $TESTPOOL
# Dump the gang header of the file's first block and count the output lines
# mentioning "gang"; at least one is required, which the test treats as
# evidence of a nested (multi-level) gang tree.
first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
leaves=$(read_gang_header $TESTPOOL $first_block 200)
gangs=$(echo "$leaves" | grep -c gang)
[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed"
log_must verify_pool $TESTPOOL
# The feature must still be merely "enabled": becoming "active" would mean a
# larger-than-512-byte gang header was written on this pool.
status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers active on an ashift-9 pool"
log_pass "We don't use large gang headers on small-ashift pools".

View File

@ -0,0 +1,73 @@
#!/bin/ksh
# SPDX-License-Identifier: CDDL-1.0
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2025 by Klara Inc.
#
#
# Description:
# Verify that we use larger gang headers on ashift=12 pools
#
# Strategy:
# 1. Create a pool with dynamic gang headers.
# 2. Set metaslab_force_ganging to force ganging.
# 3. Verify that a large file's gang header has more than 3 gang children.
#
# Pull in the common test library and the gang-block test helpers.
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
# This is the positive test: it expects larger headers to be used.
log_assert "Verify that we use larger gang headers on ashift=12 pools".
# cleanup and preamble are not defined in this script; presumably they come
# from gang_blocks.kshlib.
log_onexit cleanup
preamble
# Create an ashift=12 pool with the feature enabled up front.
log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS
mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
# Per the strategy above, these tunables force the writes below to gang.
set_tunable64 METASLAB_FORCE_GANGING 200000
set_tunable32 METASLAB_FORCE_GANGING_PCT 100
# Before any ganged write the feature must be enabled but not yet active.
status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers not enabled"

# First ganged write: the feature is not active yet, so the header must
# still hold at most 3 non-hole entries, and the first entry must not
# itself be a gang block.
path="${mountpoint}/file"
log_must dd if=/dev/urandom of=$path bs=1M count=1
log_must zpool sync $TESTPOOL
first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE)
first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*<//' | sed 's/>.*//')
check_not_gang_dva $first_dva
num_leaves=$(echo "$leaves" | wc -l)
[[ "$num_leaves" -gt 3 ]] && log_fail "used a larger gang header too soon: \"$leaves\""
log_must verify_pool $TESTPOOL
# That first write is expected to have flipped the feature to active.
status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"

# Second ganged write: with the feature active, the header must now hold
# more than 3 non-hole entries.
path="${mountpoint}/file2"
log_must dd if=/dev/urandom of=$path bs=1M count=1
log_must zpool sync $TESTPOOL
first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file2)
leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE)
first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*<//' | sed 's/>.*//')
check_not_gang_dva $first_dva
num_leaves=$(echo "$leaves" | wc -l)
[[ "$num_leaves" -gt 3 ]] || log_fail "didn't use a larger gang header: \"$leaves\""
log_must verify_pool $TESTPOOL
status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
log_pass "We use larger gang headers on ashift=12 pools".

View File

@ -0,0 +1,54 @@
#!/bin/ksh
# SPDX-License-Identifier: CDDL-1.0
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2025 by Klara Inc.
#
#
# Description:
# Verify that multi-level ganging still works with dynamic headers
#
# Strategy:
# 1. Create a pool with dynamic gang headers and ashift=12.
# 2. Set metaslab_force_ganging to force multi-level ganging.
# 3. Verify that a large file has multi-level ganging
#
# Pull in the common test library and the gang-block test helpers.
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
log_assert "Verify that we can still multi-level gang with large headers."
# cleanup and preamble are not defined in this script; presumably they come
# from gang_blocks.kshlib.
log_onexit cleanup
preamble
# Create an ashift=12 pool with the feature enabled up front.
log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
log_must zfs create -o recordsize=16M $TESTPOOL/$TESTFS
mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
# Per the strategy above, these tunables force multi-level ganging for the
# write below.
set_tunable64 METASLAB_FORCE_GANGING 50000
set_tunable32 METASLAB_FORCE_GANGING_PCT 100
# Write a single 16M record and make sure it has reached disk.
path="${mountpoint}/file"
log_must dd if=/dev/urandom of=$path bs=16M count=1
log_must zpool sync $TESTPOOL
# Dump the gang header of the file's first block and count the output lines
# mentioning "gang"; at least one is required, which the test treats as
# evidence of a nested (multi-level) gang tree.
first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
leaves=$(read_gang_header $TESTPOOL $first_block 200)
gangs=$(echo "$leaves" | grep -c gang)
[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed"
log_must verify_pool $TESTPOOL
# The ganged write must have moved the feature from enabled to active.
status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
log_pass "We can still multi-level gang with large headers."