diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 45eb9c783..d6f144c0e 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -8588,9 +8588,9 @@ zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
}
static void
-zdb_dump_gbh(void *buf, int flags)
+zdb_dump_gbh(void *buf, uint64_t size, int flags)
{
- zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
+ zdb_dump_indirect((blkptr_t *)buf, gbh_nblkptrs(size), flags);
}
static void
@@ -9073,7 +9073,7 @@ zdb_read_block(char *thing, spa_t *spa)
zdb_dump_indirect((blkptr_t *)buf,
orig_lsize / sizeof (blkptr_t), flags);
else if (flags & ZDB_FLAG_GBH)
- zdb_dump_gbh(buf, flags);
+ zdb_dump_gbh(buf, lsize, flags);
else
zdb_dump_block(thing, buf, lsize, flags);
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 7f457c3a0..7f5a9aaef 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -148,7 +148,7 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
static inline uint64_t
vdev_gang_header_asize(vdev_t *vd)
{
- return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0));
+ return (vdev_psize_to_asize_txg(vd, SPA_OLD_GANGBLOCKSIZE, 0));
}
extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
diff --git a/include/sys/zio.h b/include/sys/zio.h
index e65ac2803..b139c9de4 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -59,21 +59,36 @@ typedef struct zio_eck {
/*
* Gang block headers are self-checksumming and contain an array
- * of block pointers.
+ * of block pointers. The old gang block size has enough room for 3 blkptrs,
+ * while new gang blocks can store more.
+ *
+ * Layout:
+ * +--------+--------+--------+-----+---------+-----------+
+ * | | | | | | |
+ * | blkptr | blkptr | blkptr | ... | padding | zio_eck_t |
+ * | 1 | 2 | 3 | | | |
+ * +--------+--------+--------+-----+---------+-----------+
+ * 128B 128B 128B 88B 40B
*/
-#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
-#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_eck_t)) / sizeof (blkptr_t))
-#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_eck_t) - \
- (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
- sizeof (uint64_t))
+#define SPA_OLD_GANGBLOCKSIZE SPA_MINBLOCKSIZE
+typedef void zio_gbh_phys_t;
-typedef struct zio_gbh {
- blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
- uint64_t zg_filler[SPA_GBH_FILLER];
- zio_eck_t zg_tail;
-} zio_gbh_phys_t;
+static inline uint64_t
+gbh_nblkptrs(uint64_t size) {
+ ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t)));
+ return ((size - sizeof (zio_eck_t)) / sizeof (blkptr_t));
+}
+
+static inline zio_eck_t *
+gbh_eck(zio_gbh_phys_t *gbh, uint64_t size) {
+ ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t)));
+ return ((zio_eck_t *)((uintptr_t)gbh + size - sizeof (zio_eck_t)));
+}
+
+static inline blkptr_t *
+gbh_bp(zio_gbh_phys_t *gbh, int bp) {
+ return (&((blkptr_t *)gbh)[bp]);
+}
enum zio_checksum {
ZIO_CHECKSUM_INHERIT = 0,
@@ -398,7 +413,9 @@ typedef struct zio_vsd_ops {
typedef struct zio_gang_node {
zio_gbh_phys_t *gn_gbh;
- struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS];
+ uint64_t gn_gangblocksize;
+ uint64_t gn_allocsize;
+ struct zio_gang_node *gn_child[];
} zio_gang_node_t;
typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 5d37bb956..53e1ecae3 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -87,6 +87,7 @@ typedef enum spa_feature {
SPA_FEATURE_FAST_DEDUP,
SPA_FEATURE_LONGNAME,
SPA_FEATURE_LARGE_MICROZAP,
+ SPA_FEATURE_DYNAMIC_GANG_HEADER,
SPA_FEATURES
} spa_feature_t;
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 35ecdca76..ecfd40efc 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -631,7 +631,7 @@
-
+
@@ -6210,7 +6210,8 @@
-
+
+
@@ -9394,8 +9395,8 @@
-
-
+
+
@@ -9403,6 +9404,7 @@
+
@@ -9472,7 +9474,7 @@
-
+
diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7
index 8ae1b2b3b..7ec271164 100644
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@@ -493,6 +493,19 @@ vdev type, or when adding a new
.Sy draid
vdev to an existing pool.
.
+.feature com.klarasystems dynamic_gang_header no
+This feature enables larger gang headers based on the sector size of the pool.
+When enabled, gang headers will use the entire space allocated for them, instead
+of always restricting themselves to 512 bytes.
+This can reduce the need for nested gang trees in extreme fragmentation
+scenarios.
+.Pp
+This feature becomes active when a gang header is written that is larger than
+512 bytes.
+This feature is not enabled by
+.Xr zpool-upgrade 8 .
+Instead, it must be manually enabled, or be part of a compatibility file.
+.
.feature org.illumos edonr no extensible_dataset
This feature enables the use of the Edon-R hash algorithm for checksum,
including for nopwrite
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index 0362d82ef..8ac1c7cab 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -786,6 +786,12 @@ zpool_feature_init(void)
ZFEATURE_TYPE_BOOLEAN, large_microzap_deps, sfeatures);
}
+ zfeature_register(SPA_FEATURE_DYNAMIC_GANG_HEADER,
+ "com.klarasystems:dynamic_gang_header", "dynamic_gang_header",
+ "Support for dynamically sized gang headers",
+ ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_NO_UPGRADE,
+ ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
+
zfs_mod_list_supported_free(sfeatures);
}
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 23eca0425..082d379cd 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -5974,12 +5974,12 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
ASSERT3P(zal, !=, NULL);
- uint64_t cur_psize = 0;
-
+ uint64_t smallest_psize = UINT64_MAX;
for (int d = 0; d < ndvas; d++) {
- error = metaslab_alloc_dva_range(spa, mc, psize, max_psize,
- dva, d, hintdva, txg, flags, zal, allocator,
- actual_psize ? &cur_psize : NULL);
+ uint64_t cur_psize = 0;
+ error = metaslab_alloc_dva_range(spa, mc, psize,
+ MIN(smallest_psize, max_psize), dva, d, hintdva, txg,
+ flags, zal, allocator, actual_psize ? &cur_psize : NULL);
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_unalloc_dva(spa, &dva[d], txg);
@@ -5999,13 +5999,13 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
DVA_GET_VDEV(&dva[d]), allocator, flags, psize,
tag);
if (actual_psize)
- max_psize = MIN(cur_psize, max_psize);
+ smallest_psize = MIN(cur_psize, smallest_psize);
}
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
if (actual_psize)
- *actual_psize = max_psize;
+ *actual_psize = smallest_psize;
spa_config_exit(spa, SCL_ALLOC, FTAG);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 64f3d31f5..67ee3d5ba 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2743,11 +2743,14 @@ zio_resume_wait(spa_t *spa)
* being nearly full, it calls zio_write_gang_block() to construct the
* block from smaller fragments.
*
- * A gang block consists of a gang header (zio_gbh_phys_t) and up to
- * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
- * an indirect block: it's an array of block pointers. It consumes
- * only one sector and hence is allocatable regardless of fragmentation.
- * The gang header's bps point to its gang members, which hold the data.
+ * A gang block consists of a gang header and up to gbh_nblkptrs(size)
+ * gang members. The gang header is like an indirect block: it's an array
+ * of block pointers, though the header has a small tail (a zio_eck_t)
+ * that stores an embedded checksum. It is allocated using only a single
+ * sector as the requested size, and hence is allocatable regardless of
+ * fragmentation. Its size is determined by the smallest allocatable
+ * asize of the vdevs it was allocated on. The gang header's bps point
+ * to its gang members, which hold the data.
*
* Gang blocks are self-checksumming, using the bp's
* as the verifier to ensure uniqueness of the SHA256 checksum.
@@ -2826,10 +2829,10 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
if (gn != NULL) {
abd_t *gbh_abd =
- abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+ abd_get_from_buf(gn->gn_gbh, gn->gn_gangblocksize);
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
- gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
- pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ gbh_abd, gn->gn_gangblocksize, zio_gang_issue_func_done,
+ NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark);
/*
* As we rewrite each gang header, the pipeline will compute
@@ -2900,14 +2903,16 @@ static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
static void zio_gang_tree_assemble_done(zio_t *zio);
static zio_gang_node_t *
-zio_gang_node_alloc(zio_gang_node_t **gnpp)
+zio_gang_node_alloc(zio_gang_node_t **gnpp, uint64_t gangblocksize)
{
zio_gang_node_t *gn;
ASSERT(*gnpp == NULL);
- gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
- gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
+ gn = kmem_zalloc(sizeof (*gn) +
+ (gbh_nblkptrs(gangblocksize) * sizeof (gn)), KM_SLEEP);
+ gn->gn_gangblocksize = gn->gn_allocsize = gangblocksize;
+ gn->gn_gbh = zio_buf_alloc(gangblocksize);
*gnpp = gn;
return (gn);
@@ -2918,11 +2923,12 @@ zio_gang_node_free(zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = *gnpp;
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++)
ASSERT(gn->gn_child[g] == NULL);
- zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
- kmem_free(gn, sizeof (*gn));
+ zio_buf_free(gn->gn_gbh, gn->gn_allocsize);
+ kmem_free(gn, sizeof (*gn) +
+ (gbh_nblkptrs(gn->gn_allocsize) * sizeof (gn)));
*gnpp = NULL;
}
@@ -2934,7 +2940,7 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
if (gn == NULL)
return;
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++)
zio_gang_tree_free(&gn->gn_child[g]);
zio_gang_node_free(gnpp);
@@ -2943,13 +2949,28 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
- zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
- abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+ uint64_t gangblocksize = UINT64_MAX;
+ if (spa_feature_is_active(gio->io_spa,
+ SPA_FEATURE_DYNAMIC_GANG_HEADER)) {
+ spa_config_enter(gio->io_spa, SCL_VDEV, FTAG, RW_READER);
+ for (int dva = 0; dva < BP_GET_NDVAS(bp); dva++) {
+ vdev_t *vd = vdev_lookup_top(gio->io_spa,
+ DVA_GET_VDEV(&bp->blk_dva[dva]));
+ uint64_t asize = vdev_gang_header_asize(vd);
+ gangblocksize = MIN(gangblocksize, asize);
+ }
+ spa_config_exit(gio->io_spa, SCL_VDEV, FTAG);
+ } else {
+ gangblocksize = SPA_OLD_GANGBLOCKSIZE;
+ }
+ ASSERT3U(gangblocksize, !=, UINT64_MAX);
+ zio_gang_node_t *gn = zio_gang_node_alloc(gnpp, gangblocksize);
+ abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, gangblocksize);
ASSERT(gio->io_gang_leader == gio);
ASSERT(BP_IS_GANG(bp));
- zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, gangblocksize,
zio_gang_tree_assemble_done, gn, gio->io_priority,
ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
@@ -2972,13 +2993,17 @@ zio_gang_tree_assemble_done(zio_t *zio)
byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
- ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
- ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+ /*
+ * If this was an old-style gangblock, the gangblocksize should have
+ * been updated in zio_checksum_error to reflect that.
+ */
+ ASSERT3U(gbh_eck(gn->gn_gbh, gn->gn_gangblocksize)->zec_magic,
+ ==, ZEC_MAGIC);
abd_free(zio->io_abd);
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
- blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
+ blkptr_t *gbp = gbh_bp(gn->gn_gbh, g);
if (!BP_IS_GANG(gbp))
continue;
zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
@@ -3003,10 +3028,11 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
if (gn != NULL) {
- ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+ ASSERT3U(gbh_eck(gn->gn_gbh,
+ gn->gn_gangblocksize)->zec_magic, ==, ZEC_MAGIC);
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
- blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
+ blkptr_t *gbp = gbh_bp(gn->gn_gbh, g);
if (BP_IS_HOLE(gbp))
continue;
zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
@@ -3113,6 +3139,13 @@ zio_write_gang_done(zio_t *zio)
abd_free(zio->io_abd);
}
+static void
+zio_update_feature(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_feature_incr(spa, (spa_feature_t)(uintptr_t)arg, tx);
+}
+
static zio_t *
zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
{
@@ -3158,13 +3191,17 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
flags |= METASLAB_ASYNC_ALLOC;
}
- error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+ uint64_t gangblocksize = SPA_OLD_GANGBLOCKSIZE;
+ uint64_t candidate = gangblocksize;
+ error = metaslab_alloc_range(spa, mc, gangblocksize, gangblocksize,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
- &pio->io_alloc_list, pio->io_allocator, pio);
+ &pio->io_alloc_list, pio->io_allocator, pio, &candidate);
if (error) {
pio->io_error = error;
return (pio);
}
+ if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER))
+ gangblocksize = candidate;
if (pio == gio) {
gnpp = &gio->io_gang_tree;
@@ -3173,15 +3210,15 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
ASSERT(pio->io_ready == zio_write_gang_member_ready);
}
- gn = zio_gang_node_alloc(gnpp);
+ gn = zio_gang_node_alloc(gnpp, gangblocksize);
gbh = gn->gn_gbh;
- memset(gbh, 0, SPA_GANGBLOCKSIZE);
- gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
+ memset(gbh, 0, gangblocksize);
+ gbh_abd = abd_get_from_buf(gbh, gangblocksize);
/*
* Create the gang header.
*/
- zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, gangblocksize,
zio_write_gang_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
@@ -3198,7 +3235,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
* opportunistic allocations. If that fails to generate enough
* space, we fall back to normal zio_write calls for nested gang.
*/
- for (int g = 0; resid != 0; g++) {
+ int g;
+ boolean_t any_failed = B_FALSE;
+ for (g = 0; resid != 0; g++) {
flags &= METASLAB_ASYNC_ALLOC;
flags |= METASLAB_GANG_CHILD;
zp.zp_checksum = gio->io_prop.zp_checksum;
@@ -3219,9 +3258,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
uint64_t min_size = zio_roundup_alloc_size(spa,
- resid / (SPA_GBH_NBLKPTRS - g));
+ resid / (gbh_nblkptrs(gangblocksize) - g));
min_size = MIN(min_size, resid);
- bp = &gbh->zg_blkptr[g];
+ bp = &((blkptr_t *)gbh)[g];
zio_alloc_list_t cio_list;
metaslab_trace_init(&cio_list);
@@ -3231,6 +3270,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
flags, &cio_list, zio->io_allocator, NULL, &allocated_size);
boolean_t allocated = error == 0;
+ any_failed |= !allocated;
uint64_t psize = allocated ? MIN(resid, allocated_size) :
min_size;
@@ -3262,6 +3302,29 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zio_nowait(cio);
}
+ /*
+ * If we used more gang children than the old limit, we must already be
+ * using the new headers. No need to update anything, just move on.
+ *
+ * Otherwise, we might be in a case where we need to turn on the new
+ * feature, so we check that. We enable the new feature if we didn't
+ * manage to fit everything into 3 gang children and we could have
+ * written more than that.
+ */
+ if (g > gbh_nblkptrs(SPA_OLD_GANGBLOCKSIZE)) {
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_DYNAMIC_GANG_HEADER));
+ } else if (any_failed && candidate > SPA_OLD_GANGBLOCKSIZE &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) &&
+ !spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) {
+ dmu_tx_t *tx =
+ dmu_tx_create_assigned(spa->spa_dsl_pool, txg + 1);
+ dsl_sync_task_nowait(spa->spa_dsl_pool,
+ zio_update_feature,
+ (void *)SPA_FEATURE_DYNAMIC_GANG_HEADER, tx);
+ dmu_tx_commit(tx);
+ }
+
/*
* Set pio's pipeline to just wait for zio to finish.
*/
@@ -4331,9 +4394,9 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
}
if (gn != NULL) {
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
zio_dva_unallocate(zio, gn->gn_child[g],
- &gn->gn_gbh->zg_blkptr[g]);
+ gbh_bp(gn->gn_gbh, g));
}
}
}
@@ -5262,6 +5325,7 @@ zio_dva_throttle_done(zio_t *zio)
vdev_t *vd = zio->io_vd;
int flags = METASLAB_ASYNC_ALLOC;
const void *tag = pio;
+ uint64_t size = pio->io_size;
ASSERT3P(zio->io_bp, !=, NULL);
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
@@ -5277,10 +5341,13 @@ zio_dva_throttle_done(zio_t *zio)
* Parents of gang children can have two flavors -- ones that allocated
* the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that
* allocated the constituent blocks. The first use their parent as tag.
+ * We set the size to match the original allocation call for that case.
*/
if (pio->io_child_type == ZIO_CHILD_GANG &&
- (pio->io_flags & ZIO_FLAG_IO_REWRITE))
+ (pio->io_flags & ZIO_FLAG_IO_REWRITE)) {
tag = zio_unique_parent(pio);
+ size = SPA_OLD_GANGBLOCKSIZE;
+ }
ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG &&
(pio->io_flags & ZIO_FLAG_IO_REWRITE)));
@@ -5293,7 +5360,7 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id,
- pio->io_allocator, flags, pio->io_size, tag);
+ pio->io_allocator, flags, size, tag);
if (metaslab_class_throttle_unreserve(pio->io_metaslab_class,
pio->io_allocator, 1, pio->io_size)) {
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index a91775b04..8cec3a6f5 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -545,14 +545,35 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
int error;
- uint64_t size = (bp == NULL ? zio->io_size :
- (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
+ uint64_t size = bp ? BP_GET_PSIZE(bp) : zio->io_size;
uint64_t offset = zio->io_offset;
abd_t *data = zio->io_abd;
spa_t *spa = zio->io_spa;
+ if (bp && BP_IS_GANG(bp)) {
+ if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER))
+ size = zio->io_size;
+ else
+ size = SPA_OLD_GANGBLOCKSIZE;
+ }
+
error = zio_checksum_error_impl(spa, bp, checksum, data, size,
offset, info);
+ if (error && bp && BP_IS_GANG(bp) && size > SPA_OLD_GANGBLOCKSIZE) {
+ /*
+ * It's possible that this is an old gang block. Rerun
+ * the checksum with the old size; if that passes, then
+ * update the gangblocksize appropriately.
+ */
+ error = zio_checksum_error_impl(spa, bp, checksum, data,
+ SPA_OLD_GANGBLOCKSIZE, offset, info);
+ if (error == 0) {
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ zio_t *pio = zio_unique_parent(zio);
+ zio_gang_node_t *gn = pio->io_private;
+ gn->gn_gangblocksize = SPA_OLD_GANGBLOCKSIZE;
+ }
+ }
if (zio_injection_enabled && error == 0 && zio->io_error == 0) {
error = zio_handle_fault_injection(zio, ECKSUM);
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 376518e9f..214fa70fe 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -739,7 +739,8 @@ tags = ['functional', 'features', 'large_dnode']
[tests/functional/gang_blocks]
tests = ['gang_blocks_001_pos', 'gang_blocks_redundant',
- 'gang_blocks_ddt_copies']
+ 'gang_blocks_ddt_copies', 'gang_blocks_dyn_header_pos',
+ 'gang_blocks_dyn_header_neg', 'gang_blocks_dyn_multi']
tags = ['functional', 'gang_blocks']
[tests/functional/grow]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 20a17a531..8813f2627 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1579,6 +1579,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/gang_blocks/gang_blocks_001_pos.ksh \
functional/gang_blocks/gang_blocks_ddt_copies.ksh \
functional/gang_blocks/gang_blocks_redundant.ksh \
+ functional/gang_blocks/gang_blocks_dyn_header_neg.ksh \
+ functional/gang_blocks/gang_blocks_dyn_header_pos.ksh \
+ functional/gang_blocks/gang_blocks_dyn_multi.ksh \
functional/gang_blocks/setup.ksh \
functional/grow/grow_pool_001_pos.ksh \
functional/grow/grow_replicas_001_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh
index f96d291cc..94ccabeb8 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh
@@ -50,7 +50,7 @@ function cleanup
function check_features
{
- for state in $(zpool get all $TESTPOOL | \
+ for state in $(zpool get all $TESTPOOL | grep -v "dynamic_gang_header" | \
awk '$2 ~ /feature@/ { print $3 }'); do
if [[ "$state" != "enabled" && "$state" != "active" ]]; then
log_fail "some features are not enabled on new pool"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh
index 7366a46f9..676aca1a2 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh
@@ -58,6 +58,9 @@ function check_features
return 1;
fi
else
+ if [[ "feature@dynamic_gang_header" == "${2}" ]]; then
+ continue
+ fi
# Failure other features must be enabled or active.
if [[ "${3}" != "enabled" && "${3}" != "active" ]]; then
return 2;
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index cf5e0961f..6de086976 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -91,6 +91,7 @@ typeset -a properties=(
"feature@device_rebuild"
"feature@draid"
"feature@redaction_list_spill"
+ "feature@dynamic_gang_header"
)
if is_linux || is_freebsd; then
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh
new file mode 100755
index 000000000..e9cb1d2a0
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh
@@ -0,0 +1,53 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+#
+# Description:
+# Verify that we don't use larger gang headers on ashift=9 pools
+#
+# Strategy:
+# 1. Create a pool with dynamic gang headers.
+# 2. Set metaslab_force_ganging to force multi-level ganging.
+# 3. Verify that a large file has multi-level ganging
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
+
+log_assert "Verify that we don't use large gang headers on small-ashift pools".
+
+log_onexit cleanup
+preamble
+
+log_must zpool create -f -o ashift=9 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
+log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS
+mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+set_tunable64 METASLAB_FORCE_GANGING 200000
+set_tunable32 METASLAB_FORCE_GANGING_PCT 100
+
+path="${mountpoint}/file"
+log_must dd if=/dev/urandom of=$path bs=1M count=1
+log_must zpool sync $TESTPOOL
+first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
+leaves=$(read_gang_header $TESTPOOL $first_block 200)
+gangs=$(echo "$leaves" | grep -c gang)
+[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed"
+
+log_must verify_pool $TESTPOOL
+status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers active on an ashift-9 pool"
+log_pass "We don't use large gang headers on small-ashift pools".
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh
new file mode 100755
index 000000000..e6d6629e9
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh
@@ -0,0 +1,73 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+#
+# Description:
+# Verify that we use larger gang headers on ashift=12 pools
+#
+# Strategy:
+# 1. Create a pool with dynamic gang headers.
+# 2. Set metaslab_force_ganging to force ganging.
+# 3. Verify the first file has at most 3 leaves and activates the feature.
+# 4. Verify that a second file has more than 3 leaves in its gang header.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
+
+log_assert "Verify that we use large gang headers on large-ashift pools."
+
+log_onexit cleanup
+preamble
+
+log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
+log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS
+mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+set_tunable64 METASLAB_FORCE_GANGING 200000
+set_tunable32 METASLAB_FORCE_GANGING_PCT 100
+
+status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers not enabled"
+path="${mountpoint}/file"
+log_must dd if=/dev/urandom of=$path bs=1M count=1
+log_must zpool sync $TESTPOOL
+first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
+leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE)
+first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*<//' | sed 's/>.*//')
+check_not_gang_dva $first_dva
+
+num_leaves=$(echo "$leaves" | wc -l)
+[[ "$num_leaves" -gt 3 ]] && log_fail "used a larger gang header too soon: \"$leaves\""
+log_must verify_pool $TESTPOOL
+status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
+
+path="${mountpoint}/file2"
+log_must dd if=/dev/urandom of=$path bs=1M count=1
+log_must zpool sync $TESTPOOL
+first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file2)
+leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE)
+first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*<//' | sed 's/>.*//')
+check_not_gang_dva $first_dva
+
+num_leaves=$(echo "$leaves" | wc -l)
+[[ "$num_leaves" -gt 3 ]] || log_fail "didn't use a larger gang header: \"$leaves\""
+
+log_must verify_pool $TESTPOOL
+status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
+log_pass "We use large gang headers on large-ashift pools."
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh
new file mode 100755
index 000000000..2ffe24968
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh
@@ -0,0 +1,54 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+#
+# Description:
+# Verify that multi-level ganging still works with dynamic headers
+#
+# Strategy:
+# 1. Create a pool with dynamic gang headers and ashift=12.
+# 2. Set metaslab_force_ganging to force multi-level ganging.
+# 3. Verify a large file is multi-level ganged and the feature is active.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
+
+log_assert "Verify that we can still multi-level gang with large headers."
+
+log_onexit cleanup
+preamble
+
+log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
+log_must zfs create -o recordsize=16M $TESTPOOL/$TESTFS
+mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+set_tunable64 METASLAB_FORCE_GANGING 50000
+set_tunable32 METASLAB_FORCE_GANGING_PCT 100
+
+path="${mountpoint}/file"
+log_must dd if=/dev/urandom of=$path bs=16M count=1
+log_must zpool sync $TESTPOOL
+first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
+leaves=$(read_gang_header $TESTPOOL $first_block 200)
+gangs=$(echo "$leaves" | grep -c gang)
+[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed"
+
+log_must verify_pool $TESTPOOL
+status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
+
+log_pass "We can still multi-level gang with large headers."