diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 45eb9c783..d6f144c0e 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -8588,9 +8588,9 @@ zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) } static void -zdb_dump_gbh(void *buf, int flags) +zdb_dump_gbh(void *buf, uint64_t size, int flags) { - zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); + zdb_dump_indirect((blkptr_t *)buf, gbh_nblkptrs(size), flags); } static void @@ -9073,7 +9073,7 @@ zdb_read_block(char *thing, spa_t *spa) zdb_dump_indirect((blkptr_t *)buf, orig_lsize / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) - zdb_dump_gbh(buf, flags); + zdb_dump_gbh(buf, lsize, flags); else zdb_dump_block(thing, buf, lsize, flags); diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 7f457c3a0..7f5a9aaef 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -148,7 +148,7 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); static inline uint64_t vdev_gang_header_asize(vdev_t *vd) { - return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0)); + return (vdev_psize_to_asize_txg(vd, SPA_OLD_GANGBLOCKSIZE, 0)); } extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); diff --git a/include/sys/zio.h b/include/sys/zio.h index e65ac2803..b139c9de4 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -59,21 +59,36 @@ typedef struct zio_eck { /* * Gang block headers are self-checksumming and contain an array - * of block pointers. + * of block pointers. The old gang block size has enough room for 3 blkptrs, + * while new gang blocks can store more. + * + * Layout: + * +--------+--------+--------+-----+---------+-----------+ + * | | | | | | | + * | blkptr | blkptr | blkptr | ... 
| padding | zio_eck_t | + * | 1 | 2 | 3 | | | | + * +--------+--------+--------+-----+---------+-----------+ + * 128B 128B 128B 88B 40B */ -#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE -#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_eck_t)) / sizeof (blkptr_t)) -#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_eck_t) - \ - (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ - sizeof (uint64_t)) +#define SPA_OLD_GANGBLOCKSIZE SPA_MINBLOCKSIZE +typedef void zio_gbh_phys_t; -typedef struct zio_gbh { - blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; - uint64_t zg_filler[SPA_GBH_FILLER]; - zio_eck_t zg_tail; -} zio_gbh_phys_t; +static inline uint64_t +gbh_nblkptrs(uint64_t size) { + ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t))); + return ((size - sizeof (zio_eck_t)) / sizeof (blkptr_t)); +} + +static inline zio_eck_t * +gbh_eck(zio_gbh_phys_t *gbh, uint64_t size) { + ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t))); + return ((zio_eck_t *)((uintptr_t)gbh + size - sizeof (zio_eck_t))); +} + +static inline blkptr_t * +gbh_bp(zio_gbh_phys_t *gbh, int bp) { + return (&((blkptr_t *)gbh)[bp]); +} enum zio_checksum { ZIO_CHECKSUM_INHERIT = 0, @@ -398,7 +413,9 @@ typedef struct zio_vsd_ops { typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; - struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; + uint64_t gn_gangblocksize; + uint64_t gn_allocsize; + struct zio_gang_node *gn_child[]; } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 5d37bb956..53e1ecae3 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -87,6 +87,7 @@ typedef enum spa_feature { SPA_FEATURE_FAST_DEDUP, SPA_FEATURE_LONGNAME, SPA_FEATURE_LARGE_MICROZAP, + SPA_FEATURE_DYNAMIC_GANG_HEADER, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 35ecdca76..ecfd40efc 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -631,7 +631,7 @@ 
- + @@ -6210,7 +6210,8 @@ - + + @@ -9394,8 +9395,8 @@ - - + + @@ -9403,6 +9404,7 @@ + @@ -9472,7 +9474,7 @@ - + diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 8ae1b2b3b..7ec271164 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -493,6 +493,19 @@ vdev type, or when adding a new .Sy draid vdev to an existing pool. . +.feature com.klarasystems dynamic_gang_header no +This feature enables larger gang headers based on the sector size of the pool. +When enabled, gang headers will use the entire space allocated for them, instead +of always restricting themselves to 512 bytes. +This can reduce the need for nested gang trees in extreme fragmentation +scenarios. +.Pp +This feature becomes active when a gang header is written that is larger than +512 bytes. +This feature is not enabled by +.Xr zpool-upgrade 8 . +Instead, it must be manually enabled, or be part of a compatibility file. +. .feature org.illumos edonr no extensible_dataset This feature enables the use of the Edon-R hash algorithm for checksum, including for nopwrite diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 0362d82ef..8ac1c7cab 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -786,6 +786,12 @@ zpool_feature_init(void) ZFEATURE_TYPE_BOOLEAN, large_microzap_deps, sfeatures); } + zfeature_register(SPA_FEATURE_DYNAMIC_GANG_HEADER, + "com.klarasystems:dynamic_gang_header", "dynamic_gang_header", + "Support for dynamically sized gang headers", + ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_NO_UPGRADE, + ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 23eca0425..082d379cd 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -5974,12 +5974,12 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ASSERT3P(zal, 
!=, NULL); - uint64_t cur_psize = 0; - + uint64_t smallest_psize = UINT64_MAX; for (int d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva_range(spa, mc, psize, max_psize, - dva, d, hintdva, txg, flags, zal, allocator, - actual_psize ? &cur_psize : NULL); + uint64_t cur_psize = 0; + error = metaslab_alloc_dva_range(spa, mc, psize, + MIN(smallest_psize, max_psize), dva, d, hintdva, txg, + flags, zal, allocator, actual_psize ? &cur_psize : NULL); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); @@ -5999,13 +5999,13 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, DVA_GET_VDEV(&dva[d]), allocator, flags, psize, tag); if (actual_psize) - max_psize = MIN(cur_psize, max_psize); + smallest_psize = MIN(cur_psize, smallest_psize); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); if (actual_psize) - *actual_psize = max_psize; + *actual_psize = smallest_psize; spa_config_exit(spa, SCL_ALLOC, FTAG); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 64f3d31f5..67ee3d5ba 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2743,11 +2743,14 @@ zio_resume_wait(spa_t *spa) * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * - * A gang block consists of a gang header (zio_gbh_phys_t) and up to - * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like - * an indirect block: it's an array of block pointers. It consumes - * only one sector and hence is allocatable regardless of fragmentation. - * The gang header's bps point to its gang members, which hold the data. + * A gang block consists of a gang header and up to gbh_nblkptrs(size) + * gang members. The gang header is like an indirect block: it's an array + * of block pointers, though the header has a small tail (a zio_eck_t) + * that stores an embedded checksum. 
It is allocated using only a single + * sector as the requested size, and hence is allocatable regardless of + * fragmentation. Its size is determined by the smallest allocatable + * asize of the vdevs it was allocated on. The gang header's bps point + * to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> * as the verifier to ensure uniqueness of the SHA256 checksum. @@ -2826,10 +2829,10 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, if (gn != NULL) { abd_t *gbh_abd = - abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); + abd_get_from_buf(gn->gn_gbh, gn->gn_gangblocksize); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + gbh_abd, gn->gn_gangblocksize, zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute @@ -2900,14 +2903,16 @@ static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * -zio_gang_node_alloc(zio_gang_node_t **gnpp) +zio_gang_node_alloc(zio_gang_node_t **gnpp, uint64_t gangblocksize) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); - gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); - gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); + gn = kmem_zalloc(sizeof (*gn) + + (gbh_nblkptrs(gangblocksize) * sizeof (gn)), KM_SLEEP); + gn->gn_gangblocksize = gn->gn_allocsize = gangblocksize; + gn->gn_gbh = zio_buf_alloc(gangblocksize); *gnpp = gn; return (gn); @@ -2918,11 +2923,12 @@ zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++) ASSERT(gn->gn_child[g] == NULL); - zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); - kmem_free(gn, sizeof (*gn)); + zio_buf_free(gn->gn_gbh, 
gn->gn_allocsize); + kmem_free(gn, sizeof (*gn) + + (gbh_nblkptrs(gn->gn_allocsize) * sizeof (gn))); *gnpp = NULL; } @@ -2934,7 +2940,7 @@ zio_gang_tree_free(zio_gang_node_t **gnpp) if (gn == NULL) return; - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); @@ -2943,13 +2949,28 @@ zio_gang_tree_free(zio_gang_node_t **gnpp) static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { - zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); - abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); + uint64_t gangblocksize = UINT64_MAX; + if (spa_feature_is_active(gio->io_spa, + SPA_FEATURE_DYNAMIC_GANG_HEADER)) { + spa_config_enter(gio->io_spa, SCL_VDEV, FTAG, RW_READER); + for (int dva = 0; dva < BP_GET_NDVAS(bp); dva++) { + vdev_t *vd = vdev_lookup_top(gio->io_spa, + DVA_GET_VDEV(&bp->blk_dva[dva])); + uint64_t asize = vdev_gang_header_asize(vd); + gangblocksize = MIN(gangblocksize, asize); + } + spa_config_exit(gio->io_spa, SCL_VDEV, FTAG); + } else { + gangblocksize = SPA_OLD_GANGBLOCKSIZE; + } + ASSERT3U(gangblocksize, !=, UINT64_MAX); + zio_gang_node_t *gn = zio_gang_node_alloc(gnpp, gangblocksize); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, gangblocksize); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, gangblocksize, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } @@ -2972,13 +2993,17 @@ zio_gang_tree_assemble_done(zio_t *zio) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); - ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); - ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + /* + * If this was an old-style gangblock, the gangblocksize should have + * been updated 
in zio_checksum_error to reflect that. + */ + ASSERT3U(gbh_eck(gn->gn_gbh, gn->gn_gangblocksize)->zec_magic, + ==, ZEC_MAGIC); abd_free(zio->io_abd); - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) { + blkptr_t *gbp = gbh_bp(gn->gn_gbh, g); if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); @@ -3003,10 +3028,11 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { - ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + ASSERT3U(gbh_eck(gn->gn_gbh, + gn->gn_gangblocksize)->zec_magic, ==, ZEC_MAGIC); - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) { + blkptr_t *gbp = gbh_bp(gn->gn_gbh, g); if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, @@ -3113,6 +3139,13 @@ zio_write_gang_done(zio_t *zio) abd_free(zio->io_abd); } +static void +zio_update_feature(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_feature_incr(spa, (spa_feature_t)(uintptr_t)arg, tx); +} + static zio_t * zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { @@ -3158,13 +3191,17 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) flags |= METASLAB_ASYNC_ALLOC; } - error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, + uint64_t gangblocksize = SPA_OLD_GANGBLOCKSIZE; + uint64_t candidate = gangblocksize; + error = metaslab_alloc_range(spa, mc, gangblocksize, gangblocksize, bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, flags, - &pio->io_alloc_list, pio->io_allocator, pio); + &pio->io_alloc_list, pio->io_allocator, pio, &candidate); if (error) { pio->io_error = error; return (pio); } + if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) + gangblocksize = candidate; if (pio == gio) { gnpp = &gio->io_gang_tree; @@ -3173,15 +3210,15 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) ASSERT(pio->io_ready == zio_write_gang_member_ready); } - gn = zio_gang_node_alloc(gnpp); + gn = zio_gang_node_alloc(gnpp, gangblocksize); gbh = gn->gn_gbh; - memset(gbh, 0, SPA_GANGBLOCKSIZE); - gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); + memset(gbh, 0, gangblocksize); + gbh_abd = abd_get_from_buf(gbh, gangblocksize); /* * Create the gang header. */ - zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, gangblocksize, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); @@ -3198,7 +3235,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) * opportunistic allocations. If that fails to generate enough * space, we fall back to normal zio_write calls for nested gang. 
*/ - for (int g = 0; resid != 0; g++) { + int g; + boolean_t any_failed = B_FALSE; + for (g = 0; resid != 0; g++) { flags &= METASLAB_ASYNC_ALLOC; flags |= METASLAB_GANG_CHILD; zp.zp_checksum = gio->io_prop.zp_checksum; @@ -3219,9 +3258,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); uint64_t min_size = zio_roundup_alloc_size(spa, - resid / (SPA_GBH_NBLKPTRS - g)); + resid / (gbh_nblkptrs(gangblocksize) - g)); min_size = MIN(min_size, resid); - bp = &gbh->zg_blkptr[g]; + bp = &((blkptr_t *)gbh)[g]; zio_alloc_list_t cio_list; metaslab_trace_init(&cio_list); @@ -3231,6 +3270,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) flags, &cio_list, zio->io_allocator, NULL, &allocated_size); boolean_t allocated = error == 0; + any_failed |= !allocated; uint64_t psize = allocated ? MIN(resid, allocated_size) : min_size; @@ -3262,6 +3302,29 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_nowait(cio); } + /* + * If we used more gang children than the old limit, we must already be + * using the new headers. No need to update anything, just move on. + * + * Otherwise, we might be in a case where we need to turn on the new + * feature, so we check that. We enable the new feature if we didn't + * manage to fit everything into 3 gang children and we could have + * written more than that. + */ + if (g > gbh_nblkptrs(SPA_OLD_GANGBLOCKSIZE)) { + ASSERT(spa_feature_is_active(spa, + SPA_FEATURE_DYNAMIC_GANG_HEADER)); + } else if (any_failed && candidate > SPA_OLD_GANGBLOCKSIZE && + spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) && + !spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) { + dmu_tx_t *tx = + dmu_tx_create_assigned(spa->spa_dsl_pool, txg + 1); + dsl_sync_task_nowait(spa->spa_dsl_pool, + zio_update_feature, + (void *)SPA_FEATURE_DYNAMIC_GANG_HEADER, tx); + dmu_tx_commit(tx); + } + /* * Set pio's pipeline to just wait for zio to finish. 
*/ @@ -4331,9 +4394,9 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) } if (gn != NULL) { - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) { zio_dva_unallocate(zio, gn->gn_child[g], - &gn->gn_gbh->zg_blkptr[g]); + gbh_bp(gn->gn_gbh, g)); } } } @@ -5262,6 +5325,7 @@ zio_dva_throttle_done(zio_t *zio) vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; const void *tag = pio; + uint64_t size = pio->io_size; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); @@ -5277,10 +5341,13 @@ zio_dva_throttle_done(zio_t *zio) * Parents of gang children can have two flavors -- ones that allocated * the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that * allocated the constituent blocks. The first use their parent as tag. + * We set the size to match the original allocation call for that case. */ if (pio->io_child_type == ZIO_CHILD_GANG && - (pio->io_flags & ZIO_FLAG_IO_REWRITE)) + (pio->io_flags & ZIO_FLAG_IO_REWRITE)) { tag = zio_unique_parent(pio); + size = SPA_OLD_GANGBLOCKSIZE; + } ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG && (pio->io_flags & ZIO_FLAG_IO_REWRITE))); @@ -5293,7 +5360,7 @@ zio_dva_throttle_done(zio_t *zio) ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, - pio->io_allocator, flags, pio->io_size, tag); + pio->io_allocator, flags, size, tag); if (metaslab_class_throttle_unreserve(pio->io_metaslab_class, pio->io_allocator, 1, pio->io_size)) { diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index a91775b04..8cec3a6f5 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -545,14 +545,35 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); int error; - uint64_t size = (bp == NULL ? 
zio->io_size : - (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t size = bp ? BP_GET_PSIZE(bp) : zio->io_size; uint64_t offset = zio->io_offset; abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; + if (bp && BP_IS_GANG(bp)) { + if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) + size = zio->io_size; + else + size = SPA_OLD_GANGBLOCKSIZE; + } + error = zio_checksum_error_impl(spa, bp, checksum, data, size, offset, info); + if (error && bp && BP_IS_GANG(bp) && size > SPA_OLD_GANGBLOCKSIZE) { + /* + * It's possible that this is an old gang block. Rerun + * the checksum with the old size; if that passes, then + * update the gangblocksize appropriately. + */ + error = zio_checksum_error_impl(spa, bp, checksum, data, + SPA_OLD_GANGBLOCKSIZE, offset, info); + if (error == 0) { + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + zio_t *pio = zio_unique_parent(zio); + zio_gang_node_t *gn = pio->io_private; + gn->gn_gangblocksize = SPA_OLD_GANGBLOCKSIZE; + } + } if (zio_injection_enabled && error == 0 && zio->io_error == 0) { error = zio_handle_fault_injection(zio, ECKSUM); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 376518e9f..214fa70fe 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -739,7 +739,8 @@ tags = ['functional', 'features', 'large_dnode'] [tests/functional/gang_blocks] tests = ['gang_blocks_001_pos', 'gang_blocks_redundant', - 'gang_blocks_ddt_copies'] + 'gang_blocks_ddt_copies', 'gang_blocks_dyn_header_pos', + 'gang_blocks_dyn_header_neg', 'gang_blocks_dyn_multi'] tags = ['functional', 'gang_blocks'] [tests/functional/grow] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 20a17a531..8813f2627 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1579,6 +1579,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/gang_blocks/gang_blocks_001_pos.ksh \ 
functional/gang_blocks/gang_blocks_ddt_copies.ksh \ functional/gang_blocks/gang_blocks_redundant.ksh \ + functional/gang_blocks/gang_blocks_dyn_header_neg.ksh \ + functional/gang_blocks/gang_blocks_dyn_header_pos.ksh \ + functional/gang_blocks/gang_blocks_dyn_multi.ksh \ functional/gang_blocks/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh index f96d291cc..94ccabeb8 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh @@ -50,7 +50,7 @@ function cleanup function check_features { - for state in $(zpool get all $TESTPOOL | \ + for state in $(zpool get all $TESTPOOL | grep -v "dynamic_gang_header" | \ awk '$2 ~ /feature@/ { print $3 }'); do if [[ "$state" != "enabled" && "$state" != "active" ]]; then log_fail "some features are not enabled on new pool" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh index 7366a46f9..676aca1a2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh @@ -58,6 +58,9 @@ function check_features return 1; fi else + if [[ "feature@dynamic_gang_header" == "${2}" ]]; then + continue + fi # Failure other features must be enabled or active. 
if [[ "${3}" != "enabled" && "${3}" != "active" ]]; then return 2; diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index cf5e0961f..6de086976 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -91,6 +91,7 @@ typeset -a properties=( "feature@device_rebuild" "feature@draid" "feature@redaction_list_spill" + "feature@dynamic_gang_header" ) if is_linux || is_freebsd; then diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh new file mode 100755 index 000000000..e9cb1d2a0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh @@ -0,0 +1,53 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that we don't use larger gang headers on ashift=9 pools +# +# Strategy: +# 1. Create a pool with dynamic gang headers. +# 2. Set metaslab_force_ganging to force multi-level ganging. +# 3. Verify that a large file has multi-level ganging +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that we don't use large gang headers on small-ashift pools". 
+ +log_onexit cleanup +preamble + +log_must zpool create -f -o ashift=9 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 200000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 + +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=1M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file) +leaves=$(read_gang_header $TESTPOOL $first_block 200) +gangs=$(echo "$leaves" | grep -c gang) +[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed" + +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers active on an ashift-9 pool" +log_pass "We don't use large gang headers on small-ashift pools". diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh new file mode 100755 index 000000000..e6d6629e9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh @@ -0,0 +1,73 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that we use larger gang headers on ashift=12 pools +# +# Strategy: +# 1. Create a pool with dynamic gang headers. +# 2. Set metaslab_force_ganging to force ganging. +# 3. 
Verify that, once the feature is active, a large file's gang header has more than 3 children. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that we don't use large gang headers on small-ashift pools". + +log_onexit cleanup +preamble + +log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 200000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 + +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers not enabled" +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=1M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file) +leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE) +first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*.*//') +check_not_gang_dva $first_dva + +num_leaves=$(echo "$leaves" | wc -l) +[[ "$num_leaves" -gt 3 ]] && log_fail "used a larger gang header too soon: \"$leaves\"" +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active" + +path="${mountpoint}/file2" +log_must dd if=/dev/urandom of=$path bs=1M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file2) +leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE) +first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*.*//') +check_not_gang_dva $first_dva + +num_leaves=$(echo "$leaves" | wc -l) +[[ "$num_leaves" -gt 3 ]] || log_fail "didn't use a larger gang header: \"$leaves\"" + + +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "active" ]] || log_fail "Dynamic gang headers 
not active" +log_pass "We don't use large gang headers on small-ashift pools". diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh new file mode 100755 index 000000000..2ffe24968 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh @@ -0,0 +1,54 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that multi-level ganging still works with dynamic headers +# +# Strategy: +# 1. Create a pool with dynamic gang headers and ashift=12. +# 2. Set metaslab_force_ganging to force multi-level ganging. +# 3. Verify that a large file has multi-level ganging +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that we can still multi-level gang with large headers." 
+ +log_onexit cleanup +preamble + +log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=16M $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 50000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 + +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=16M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file) +leaves=$(read_gang_header $TESTPOOL $first_block 200) +gangs=$(echo "$leaves" | grep -c gang) +[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed" + +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active" + +log_pass "We can still multi-level gang with large headers."