From a981cb69e44fc736d94f64f432941ba247143687 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 23 Jan 2025 16:26:09 -0800 Subject: [PATCH] Implement dynamic gang header sizes ZFS gang block headers are currently fixed at 512 bytes. This is increasingly wasteful in the era of larger disk sector sizes. This PR allows any size allocation to work as a gang header. It also contains supporting changes to ZDB to make gang headers easier to work with. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Reviewed-by: Allan Jude Signed-off-by: Paul Dagnelie Closes #17004 --- cmd/zdb/zdb.c | 6 +- include/sys/vdev.h | 2 +- include/sys/zio.h | 45 ++++-- include/zfeature_common.h | 1 + lib/libzfs/libzfs.abi | 12 +- man/man7/zpool-features.7 | 13 ++ module/zcommon/zfeature_common.c | 6 + module/zfs/metaslab.c | 14 +- module/zfs/zio.c | 143 +++++++++++++----- module/zfs/zio_checksum.c | 25 ++- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 3 + .../zpool_create_features_001_pos.ksh | 2 +- .../zpool_create_features_005_pos.ksh | 3 + .../cli_root/zpool_get/zpool_get.cfg | 1 + .../gang_blocks_dyn_header_neg.ksh | 53 +++++++ .../gang_blocks_dyn_header_pos.ksh | 73 +++++++++ .../gang_blocks/gang_blocks_dyn_multi.ksh | 54 +++++++ 18 files changed, 387 insertions(+), 72 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 45eb9c783..d6f144c0e 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -8588,9 +8588,9 @@ zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) } static void -zdb_dump_gbh(void *buf, int flags) +zdb_dump_gbh(void *buf, uint64_t size, int 
flags) { - zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); + zdb_dump_indirect((blkptr_t *)buf, gbh_nblkptrs(size), flags); } static void @@ -9073,7 +9073,7 @@ zdb_read_block(char *thing, spa_t *spa) zdb_dump_indirect((blkptr_t *)buf, orig_lsize / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) - zdb_dump_gbh(buf, flags); + zdb_dump_gbh(buf, lsize, flags); else zdb_dump_block(thing, buf, lsize, flags); diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 7f457c3a0..7f5a9aaef 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -148,7 +148,7 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); static inline uint64_t vdev_gang_header_asize(vdev_t *vd) { - return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0)); + return (vdev_psize_to_asize_txg(vd, SPA_OLD_GANGBLOCKSIZE, 0)); } extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); diff --git a/include/sys/zio.h b/include/sys/zio.h index e65ac2803..b139c9de4 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -59,21 +59,36 @@ typedef struct zio_eck { /* * Gang block headers are self-checksumming and contain an array - * of block pointers. + * of block pointers. The old gang block size has enough room for 3 blkptrs, + * while new gang blocks can store more. + * + * Layout: + * +--------+--------+--------+-----+---------+-----------+ + * | | | | | | | + * | blkptr | blkptr | blkptr | ... 
| padding | zio_eck_t | + * | 1 | 2 | 3 | | | | + * +--------+--------+--------+-----+---------+-----------+ + * 128B 128B 128B 88B 40B */ -#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE -#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_eck_t)) / sizeof (blkptr_t)) -#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_eck_t) - \ - (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ - sizeof (uint64_t)) +#define SPA_OLD_GANGBLOCKSIZE SPA_MINBLOCKSIZE +typedef void zio_gbh_phys_t; -typedef struct zio_gbh { - blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; - uint64_t zg_filler[SPA_GBH_FILLER]; - zio_eck_t zg_tail; -} zio_gbh_phys_t; +static inline uint64_t +gbh_nblkptrs(uint64_t size) { + ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t))); + return ((size - sizeof (zio_eck_t)) / sizeof (blkptr_t)); +} + +static inline zio_eck_t * +gbh_eck(zio_gbh_phys_t *gbh, uint64_t size) { + ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t))); + return ((zio_eck_t *)((uintptr_t)gbh + size - sizeof (zio_eck_t))); +} + +static inline blkptr_t * +gbh_bp(zio_gbh_phys_t *gbh, int bp) { + return (&((blkptr_t *)gbh)[bp]); +} enum zio_checksum { ZIO_CHECKSUM_INHERIT = 0, @@ -398,7 +413,9 @@ typedef struct zio_vsd_ops { typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; - struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; + uint64_t gn_gangblocksize; + uint64_t gn_allocsize; + struct zio_gang_node *gn_child[]; } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 5d37bb956..53e1ecae3 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -87,6 +87,7 @@ typedef enum spa_feature { SPA_FEATURE_FAST_DEDUP, SPA_FEATURE_LONGNAME, SPA_FEATURE_LARGE_MICROZAP, + SPA_FEATURE_DYNAMIC_GANG_HEADER, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 35ecdca76..ecfd40efc 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -631,7 +631,7 @@ 
- + @@ -6210,7 +6210,8 @@ - + + @@ -9394,8 +9395,8 @@ - - + + @@ -9403,6 +9404,7 @@ + @@ -9472,7 +9474,7 @@ - + diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 8ae1b2b3b..7ec271164 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -493,6 +493,19 @@ vdev type, or when adding a new .Sy draid vdev to an existing pool. . +.feature com.klarasystems dynamic_gang_header no +This feature enables larger gang headers based on the sector size of the pool. +When enabled, gang headers will use the entire space allocated for them, instead +of always restricting themselves to 512 bytes. +This can reduce the need for nested gang trees in extreme fragmentation +scenarios. +.Pp +This feature becomes active when a gang header is written that is larger than +512 bytes. +This feature is not enabled by +.Xr zpool-upgrade 8 . +Instead, it must be manually enabled, or be part of a compatibility file. +. .feature org.illumos edonr no extensible_dataset This feature enables the use of the Edon-R hash algorithm for checksum, including for nopwrite diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 0362d82ef..8ac1c7cab 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -786,6 +786,12 @@ zpool_feature_init(void) ZFEATURE_TYPE_BOOLEAN, large_microzap_deps, sfeatures); } + zfeature_register(SPA_FEATURE_DYNAMIC_GANG_HEADER, + "com.klarasystems:dynamic_gang_header", "dynamic_gang_header", + "Support for dynamically sized gang headers", + ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_NO_UPGRADE, + ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 23eca0425..082d379cd 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -5974,12 +5974,12 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ASSERT3P(zal, 
!=, NULL); - uint64_t cur_psize = 0; - + uint64_t smallest_psize = UINT64_MAX; for (int d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva_range(spa, mc, psize, max_psize, - dva, d, hintdva, txg, flags, zal, allocator, - actual_psize ? &cur_psize : NULL); + uint64_t cur_psize = 0; + error = metaslab_alloc_dva_range(spa, mc, psize, + MIN(smallest_psize, max_psize), dva, d, hintdva, txg, + flags, zal, allocator, actual_psize ? &cur_psize : NULL); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); @@ -5999,13 +5999,13 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, DVA_GET_VDEV(&dva[d]), allocator, flags, psize, tag); if (actual_psize) - max_psize = MIN(cur_psize, max_psize); + smallest_psize = MIN(cur_psize, smallest_psize); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); if (actual_psize) - *actual_psize = max_psize; + *actual_psize = smallest_psize; spa_config_exit(spa, SCL_ALLOC, FTAG); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 64f3d31f5..67ee3d5ba 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2743,11 +2743,14 @@ zio_resume_wait(spa_t *spa) * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * - * A gang block consists of a gang header (zio_gbh_phys_t) and up to - * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like - * an indirect block: it's an array of block pointers. It consumes - * only one sector and hence is allocatable regardless of fragmentation. - * The gang header's bps point to its gang members, which hold the data. + * A gang block consists of a gang header and up to gbh_nblkptrs(size) + * gang members. The gang header is like an indirect block: it's an array + * of block pointers, though the header has a small tail (a zio_eck_t) + * that stores an embedded checksum. 
It is allocated using only a single + * sector as the requested size, and hence is allocatable regardless of + * fragmentation. Its size is determined by the smallest allocatable + * asize of the vdevs it was allocated on. The gang header's bps point + * to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> * as the verifier to ensure uniqueness of the SHA256 checksum. @@ -2826,10 +2829,10 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, if (gn != NULL) { abd_t *gbh_abd = - abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); + abd_get_from_buf(gn->gn_gbh, gn->gn_gangblocksize); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + gbh_abd, gn->gn_gangblocksize, zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute @@ -2900,14 +2903,16 @@ static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * -zio_gang_node_alloc(zio_gang_node_t **gnpp) +zio_gang_node_alloc(zio_gang_node_t **gnpp, uint64_t gangblocksize) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); - gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); - gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); + gn = kmem_zalloc(sizeof (*gn) + + (gbh_nblkptrs(gangblocksize) * sizeof (gn)), KM_SLEEP); + gn->gn_gangblocksize = gn->gn_allocsize = gangblocksize; + gn->gn_gbh = zio_buf_alloc(gangblocksize); *gnpp = gn; return (gn); @@ -2918,11 +2923,12 @@ zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++) ASSERT(gn->gn_child[g] == NULL); - zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); - kmem_free(gn, sizeof (*gn)); + zio_buf_free(gn->gn_gbh, 
gn->gn_allocsize); + kmem_free(gn, sizeof (*gn) + + (gbh_nblkptrs(gn->gn_allocsize) * sizeof (gn))); *gnpp = NULL; } @@ -2934,7 +2940,7 @@ zio_gang_tree_free(zio_gang_node_t **gnpp) if (gn == NULL) return; - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); @@ -2943,13 +2949,28 @@ zio_gang_tree_free(zio_gang_node_t **gnpp) static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { - zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); - abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); + uint64_t gangblocksize = UINT64_MAX; + if (spa_feature_is_active(gio->io_spa, + SPA_FEATURE_DYNAMIC_GANG_HEADER)) { + spa_config_enter(gio->io_spa, SCL_VDEV, FTAG, RW_READER); + for (int dva = 0; dva < BP_GET_NDVAS(bp); dva++) { + vdev_t *vd = vdev_lookup_top(gio->io_spa, + DVA_GET_VDEV(&bp->blk_dva[dva])); + uint64_t asize = vdev_gang_header_asize(vd); + gangblocksize = MIN(gangblocksize, asize); + } + spa_config_exit(gio->io_spa, SCL_VDEV, FTAG); + } else { + gangblocksize = SPA_OLD_GANGBLOCKSIZE; + } + ASSERT3U(gangblocksize, !=, UINT64_MAX); + zio_gang_node_t *gn = zio_gang_node_alloc(gnpp, gangblocksize); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, gangblocksize); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, gangblocksize, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } @@ -2972,13 +2993,17 @@ zio_gang_tree_assemble_done(zio_t *zio) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); - ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); - ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + /* + * If this was an old-style gangblock, the gangblocksize should have + * been updated 
in zio_checksum_error to reflect that. + */ + ASSERT3U(gbh_eck(gn->gn_gbh, gn->gn_gangblocksize)->zec_magic, + ==, ZEC_MAGIC); abd_free(zio->io_abd); - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) { + blkptr_t *gbp = gbh_bp(gn->gn_gbh, g); if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); @@ -3003,10 +3028,11 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { - ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + ASSERT3U(gbh_eck(gn->gn_gbh, + gn->gn_gangblocksize)->zec_magic, ==, ZEC_MAGIC); - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) { + blkptr_t *gbp = gbh_bp(gn->gn_gbh, g); if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, @@ -3113,6 +3139,13 @@ zio_write_gang_done(zio_t *zio) abd_free(zio->io_abd); } +static void +zio_update_feature(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_feature_incr(spa, (spa_feature_t)(uintptr_t)arg, tx); +} + static zio_t * zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { @@ -3158,13 +3191,17 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) flags |= METASLAB_ASYNC_ALLOC; } - error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, + uint64_t gangblocksize = SPA_OLD_GANGBLOCKSIZE; + uint64_t candidate = gangblocksize; + error = metaslab_alloc_range(spa, mc, gangblocksize, gangblocksize, bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, flags, - &pio->io_alloc_list, pio->io_allocator, pio); + &pio->io_alloc_list, pio->io_allocator, pio, &candidate); if (error) { pio->io_error = error; return (pio); } + if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) + gangblocksize = candidate; if (pio == gio) { gnpp = &gio->io_gang_tree; @@ -3173,15 +3210,15 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) ASSERT(pio->io_ready == zio_write_gang_member_ready); } - gn = zio_gang_node_alloc(gnpp); + gn = zio_gang_node_alloc(gnpp, gangblocksize); gbh = gn->gn_gbh; - memset(gbh, 0, SPA_GANGBLOCKSIZE); - gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); + memset(gbh, 0, gangblocksize); + gbh_abd = abd_get_from_buf(gbh, gangblocksize); /* * Create the gang header. */ - zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, gangblocksize, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); @@ -3198,7 +3235,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) * opportunistic allocations. If that fails to generate enough * space, we fall back to normal zio_write calls for nested gang. 
*/ - for (int g = 0; resid != 0; g++) { + int g; + boolean_t any_failed = B_FALSE; + for (g = 0; resid != 0; g++) { flags &= METASLAB_ASYNC_ALLOC; flags |= METASLAB_GANG_CHILD; zp.zp_checksum = gio->io_prop.zp_checksum; @@ -3219,9 +3258,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); uint64_t min_size = zio_roundup_alloc_size(spa, - resid / (SPA_GBH_NBLKPTRS - g)); + resid / (gbh_nblkptrs(gangblocksize) - g)); min_size = MIN(min_size, resid); - bp = &gbh->zg_blkptr[g]; + bp = &((blkptr_t *)gbh)[g]; zio_alloc_list_t cio_list; metaslab_trace_init(&cio_list); @@ -3231,6 +3270,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) flags, &cio_list, zio->io_allocator, NULL, &allocated_size); boolean_t allocated = error == 0; + any_failed |= !allocated; uint64_t psize = allocated ? MIN(resid, allocated_size) : min_size; @@ -3262,6 +3302,29 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_nowait(cio); } + /* + * If we used more gang children than the old limit, we must already be + * using the new headers. No need to update anything, just move on. + * + * Otherwise, we might be in a case where we need to turn on the new + * feature, so we check that. We enable the new feature if we didn't + * manage to fit everything into 3 gang children and we could have + * written more than that. + */ + if (g > gbh_nblkptrs(SPA_OLD_GANGBLOCKSIZE)) { + ASSERT(spa_feature_is_active(spa, + SPA_FEATURE_DYNAMIC_GANG_HEADER)); + } else if (any_failed && candidate > SPA_OLD_GANGBLOCKSIZE && + spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) && + !spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) { + dmu_tx_t *tx = + dmu_tx_create_assigned(spa->spa_dsl_pool, txg + 1); + dsl_sync_task_nowait(spa->spa_dsl_pool, + zio_update_feature, + (void *)SPA_FEATURE_DYNAMIC_GANG_HEADER, tx); + dmu_tx_commit(tx); + } + /* * Set pio's pipeline to just wait for zio to finish. 
*/ @@ -4331,9 +4394,9 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) } if (gn != NULL) { - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) { zio_dva_unallocate(zio, gn->gn_child[g], - &gn->gn_gbh->zg_blkptr[g]); + gbh_bp(gn->gn_gbh, g)); } } } @@ -5262,6 +5325,7 @@ zio_dva_throttle_done(zio_t *zio) vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; const void *tag = pio; + uint64_t size = pio->io_size; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); @@ -5277,10 +5341,13 @@ zio_dva_throttle_done(zio_t *zio) * Parents of gang children can have two flavors -- ones that allocated * the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that * allocated the constituent blocks. The first use their parent as tag. + * We set the size to match the original allocation call for that case. */ if (pio->io_child_type == ZIO_CHILD_GANG && - (pio->io_flags & ZIO_FLAG_IO_REWRITE)) + (pio->io_flags & ZIO_FLAG_IO_REWRITE)) { tag = zio_unique_parent(pio); + size = SPA_OLD_GANGBLOCKSIZE; + } ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG && (pio->io_flags & ZIO_FLAG_IO_REWRITE))); @@ -5293,7 +5360,7 @@ zio_dva_throttle_done(zio_t *zio) ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, - pio->io_allocator, flags, pio->io_size, tag); + pio->io_allocator, flags, size, tag); if (metaslab_class_throttle_unreserve(pio->io_metaslab_class, pio->io_allocator, 1, pio->io_size)) { diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index a91775b04..8cec3a6f5 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -545,14 +545,35 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); int error; - uint64_t size = (bp == NULL ? 
zio->io_size : - (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t size = bp ? BP_GET_PSIZE(bp) : zio->io_size; uint64_t offset = zio->io_offset; abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; + if (bp && BP_IS_GANG(bp)) { + if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) + size = zio->io_size; + else + size = SPA_OLD_GANGBLOCKSIZE; + } + error = zio_checksum_error_impl(spa, bp, checksum, data, size, offset, info); + if (error && bp && BP_IS_GANG(bp) && size > SPA_OLD_GANGBLOCKSIZE) { + /* + * It's possible that this is an old gang block. Rerun + * the checksum with the old size; if that passes, then + * update the gangblocksize appropriately. + */ + error = zio_checksum_error_impl(spa, bp, checksum, data, + SPA_OLD_GANGBLOCKSIZE, offset, info); + if (error == 0) { + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + zio_t *pio = zio_unique_parent(zio); + zio_gang_node_t *gn = pio->io_private; + gn->gn_gangblocksize = SPA_OLD_GANGBLOCKSIZE; + } + } if (zio_injection_enabled && error == 0 && zio->io_error == 0) { error = zio_handle_fault_injection(zio, ECKSUM); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 376518e9f..214fa70fe 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -739,7 +739,8 @@ tags = ['functional', 'features', 'large_dnode'] [tests/functional/gang_blocks] tests = ['gang_blocks_001_pos', 'gang_blocks_redundant', - 'gang_blocks_ddt_copies'] + 'gang_blocks_ddt_copies', 'gang_blocks_dyn_header_pos', + 'gang_blocks_dyn_header_neg', 'gang_blocks_dyn_multi'] tags = ['functional', 'gang_blocks'] [tests/functional/grow] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 20a17a531..8813f2627 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1579,6 +1579,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/gang_blocks/gang_blocks_001_pos.ksh \ 
functional/gang_blocks/gang_blocks_ddt_copies.ksh \ functional/gang_blocks/gang_blocks_redundant.ksh \ + functional/gang_blocks/gang_blocks_dyn_header_neg.ksh \ + functional/gang_blocks/gang_blocks_dyn_header_pos.ksh \ + functional/gang_blocks/gang_blocks_dyn_multi.ksh \ functional/gang_blocks/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh index f96d291cc..94ccabeb8 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh @@ -50,7 +50,7 @@ function cleanup function check_features { - for state in $(zpool get all $TESTPOOL | \ + for state in $(zpool get all $TESTPOOL | grep -v "dynamic_gang_header" | \ awk '$2 ~ /feature@/ { print $3 }'); do if [[ "$state" != "enabled" && "$state" != "active" ]]; then log_fail "some features are not enabled on new pool" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh index 7366a46f9..676aca1a2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh @@ -58,6 +58,9 @@ function check_features return 1; fi else + if [[ "feature@dynamic_gang_header" == "${2}" ]]; then + continue + fi # Failure other features must be enabled or active. 
if [[ "${3}" != "enabled" && "${3}" != "active" ]]; then return 2; diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index cf5e0961f..6de086976 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -91,6 +91,7 @@ typeset -a properties=( "feature@device_rebuild" "feature@draid" "feature@redaction_list_spill" + "feature@dynamic_gang_header" ) if is_linux || is_freebsd; then diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh new file mode 100755 index 000000000..e9cb1d2a0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh @@ -0,0 +1,53 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that we don't use larger gang headers on ashift=9 pools +# +# Strategy: +# 1. Create a pool with dynamic gang headers. +# 2. Set metaslab_force_ganging to force multi-level ganging. +# 3. Verify that a large file has multi-level ganging +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that we don't use large gang headers on small-ashift pools". 
+ +log_onexit cleanup +preamble + +log_must zpool create -f -o ashift=9 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 200000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 + +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=1M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file) +leaves=$(read_gang_header $TESTPOOL $first_block 200) +gangs=$(echo "$leaves" | grep -c gang) +[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed" + +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers active on an ashift-9 pool" +log_pass "We don't use large gang headers on small-ashift pools". diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh new file mode 100755 index 000000000..e6d6629e9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh @@ -0,0 +1,73 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that we use larger gang headers on ashift=12 pools +# +# Strategy: +# 1. Create a pool with dynamic gang headers. +# 2. Set metaslab_force_ganging to force ganging. +# 3. 
Verify that a large file has more than 3 gang headers. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that we don't use large gang headers on small-ashift pools". + +log_onexit cleanup +preamble + +log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 200000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 + +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers not enabled" +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=1M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file) +leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE) +first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*<//' | sed 's/>.*//') +check_not_gang_dva $first_dva + +num_leaves=$(echo "$leaves" | wc -l) +[[ "$num_leaves" -gt 3 ]] && log_fail "used a larger gang header too soon: \"$leaves\"" +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active" + +path="${mountpoint}/file2" +log_must dd if=/dev/urandom of=$path bs=1M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file2) +leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE) +first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*<//' | sed 's/>.*//') +check_not_gang_dva $first_dva + +num_leaves=$(echo "$leaves" | wc -l) +[[ "$num_leaves" -gt 3 ]] || log_fail "didn't use a larger gang header: \"$leaves\"" + + +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "active" ]] || log_fail "Dynamic gang headers 
not active" +log_pass "We don't use large gang headers on small-ashift pools". diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh new file mode 100755 index 000000000..2ffe24968 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh @@ -0,0 +1,54 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that multi-level ganging still works with dynamic headers +# +# Strategy: +# 1. Create a pool with dynamic gang headers and ashift=12. +# 2. Set metaslab_force_ganging to force multi-level ganging. +# 3. Verify that a large file has multi-level ganging +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that we can still multi-level gang with large headers." 
+ +log_onexit cleanup +preamble + +log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=16M $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 50000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 + +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=16M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file) +leaves=$(read_gang_header $TESTPOOL $first_block 200) +gangs=$(echo "$leaves" | grep -c gang) +[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed" + +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active" + +log_pass "We can still multi-level gang with large headers."