diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 8bdd7ca3a..f1687d471 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -352,6 +352,8 @@ extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp); extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern int ddt_phys_is_gang(const ddt_univ_phys_t *ddp, + ddt_phys_variant_t v); extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, boolean_t encrypted); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index b0cd7f089..5ecfbc130 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -856,6 +856,17 @@ ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) return (ddp->ddp_trad[v].ddp_phys_birth); } +int +ddt_phys_is_gang(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + const dva_t *dvas = (v == DDT_PHYS_FLAT) ? + ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; + + return (DVA_GET_GANG(&dvas[0])); +} + int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, boolean_t encrypted) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 47f229dcb..eb08a6eac 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3145,6 +3145,17 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) * This value respects the redundant_metadata property. */ int gbh_copies = gio->io_prop.zp_gang_copies; + if (gbh_copies == 0) { + /* + * This should only happen in the case where we're filling in + * DDT entries for a parent that wants more copies than the DDT + * has. In that case, we cannot gang without creating a mixed + * blkptr, which is illegal. + */ + ASSERT3U(gio->io_child_type, ==, ZIO_CHILD_DDT); + pio->io_error = EAGAIN; + return (pio); + } ASSERT3S(gbh_copies, >, 0); ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP); @@ -3614,16 +3625,6 @@ zio_ddt_child_write_done(zio_t *zio) return; } - /* - * We've successfully added new DVAs to the entry. Clear the saved - * state or, if there's still outstanding IO, remember it so we can - * revert to a known good state if that IO fails. - */ - if (dde->dde_io->dde_lead_zio[p] == NULL) - ddt_phys_clear(orig, v); - else - ddt_phys_copy(orig, ddp, v); - /* * Add references for all dedup writes that were waiting on the * physical one, skipping any other physical writes that are waiting. @@ -3635,6 +3636,16 @@ zio_ddt_child_write_done(zio_t *zio) ddt_phys_addref(ddp, v); } + /* + * We've successfully added new DVAs to the entry. Clear the saved + * state or, if there's still outstanding IO, remember it so we can + * revert to a known good state if that IO fails. + */ + if (dde->dde_io->dde_lead_zio[p] == NULL) + ddt_phys_clear(orig, v); + else + ddt_phys_copy(orig, ddp, v); + ddt_exit(ddt); } @@ -3650,6 +3661,16 @@ zio_ddt_child_write_ready(zio_t *zio) int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + if (ddt_phys_is_gang(dde->dde_phys, v)) { + for (int i = 0; i < BP_GET_NDVAS(zio->io_bp); i++) { + dva_t *d = &zio->io_bp->blk_dva[i]; + metaslab_group_alloc_decrement(zio->io_spa, + DVA_GET_VDEV(d), zio->io_allocator, + METASLAB_ASYNC_ALLOC, zio->io_size, zio); + } + zio->io_error = EAGAIN; + } + if (zio->io_error != 0) return; @@ -3769,9 +3790,12 @@ zio_ddt_write(zio_t *zio) */ int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp)); IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0); + boolean_t is_ganged = ddt_phys_is_gang(ddp, v); - /* Number of DVAs requested bya the IO. */ + /* Number of DVAs requested by the IO. */ uint8_t need_dvas = zp->zp_copies; + /* Number of DVAs in outstanding writes for this dde. */ + uint8_t parent_dvas = 0; /* * What we do next depends on whether or not there's IO outstanding that @@ -3901,7 +3925,7 @@ zio_ddt_write(zio_t *zio) zio_t *pio = zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl); ASSERT(pio); - uint8_t parent_dvas = pio->io_prop.zp_copies; + parent_dvas = pio->io_prop.zp_copies; if (parent_dvas >= need_dvas) { zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); @@ -3916,13 +3940,27 @@ zio_ddt_write(zio_t *zio) need_dvas -= parent_dvas; } + if (is_ganged) { + zp->zp_dedup = B_FALSE; + BP_SET_DEDUP(bp, B_FALSE); + zio->io_pipeline = ZIO_WRITE_PIPELINE; + ddt_exit(ddt); + return (zio); + } + /* * We need to write. We will create a new write with the copies * property adjusted to match the number of DVAs we need to need to * grow the DDT entry by to satisfy the request. */ zio_prop_t czp = *zp; - czp.zp_copies = czp.zp_gang_copies = need_dvas; + if (have_dvas > 0 || parent_dvas > 0) { + czp.zp_copies = need_dvas; + czp.zp_gang_copies = 0; + } else { + ASSERT3U(czp.zp_copies, ==, need_dvas); + } + zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, zio_ddt_child_write_ready, NULL, @@ -4106,6 +4144,7 @@ zio_dva_allocate(zio_t *zio) ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); + ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); @@ -5391,6 +5430,16 @@ zio_done(zio_t *zio) } if (zio->io_error && zio == zio->io_logical) { + + /* + * A DDT child tried to create a mixed gang/non-gang BP. We're + * going to have to just retry as a non-dedup IO. + */ + if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) && + zio->io_prop.zp_dedup) { + zio->io_reexecute |= ZIO_REEXECUTE_NOW; + zio->io_prop.zp_dedup = B_FALSE; + } /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 5e99dbac5..9373b39a1 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -726,7 +726,7 @@ tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', tags = ['functional', 'features', 'large_dnode'] [tests/functional/gang_blocks] -tests = ['gang_blocks_redundant'] +tests = ['gang_blocks_redundant', 'gang_blocks_ddt_copies'] tags = ['functional', 'gang_blocks'] [tests/functional/grow] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 6d2c2f6fd..6a5d11761 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1561,6 +1561,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/features/large_dnode/large_dnode_009_pos.ksh \ functional/features/large_dnode/setup.ksh \ functional/gang_blocks/cleanup.ksh \ + functional/gang_blocks/gang_blocks_ddt_copies.ksh \ functional/gang_blocks/gang_blocks_redundant.ksh \ functional/gang_blocks/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_ddt_copies.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_ddt_copies.ksh new file mode 100755 index 000000000..12ebcec3a --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_ddt_copies.ksh @@ -0,0 +1,79 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that mixed gang blocks and copies interact correctly in FDT +# +# Strategy: +# 1. Store a block with copies = 1 in the DDT unganged. +# 2. Add a new entry with copies = 2 that gangs, ensure it doesn't panic +# 3. Store a block with copies = 1 in the DDT ganged. +# 4. Add a new entry with copies = 3 that doesn't gang, ensure that it doesn't panic. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that mixed gang blocks and copies interact correctly in FDT" + +save_tunable DEDUP_LOG_TXG_MAX + +function cleanup2 +{ + zfs destroy $TESTPOOL/fs1 + zfs destroy $TESTPOOL/fs2 + restore_tunable DEDUP_LOG_TXG_MAX + cleanup +} + +preamble +log_onexit cleanup2 + +log_must zpool create -f -o ashift=9 -o feature@block_cloning=disabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=64k -o dedup=on $TESTPOOL/fs1 +log_must zfs create -o recordsize=64k -o dedup=on -o copies=3 $TESTPOOL/fs2 +set_tunable32 DEDUP_LOG_TXG_MAX 1 +log_must dd if=/dev/urandom of=/$TESTPOOL/fs1/f1 bs=64k count=1 +log_must sync_pool $TESTPOOL +set_tunable32 METASLAB_FORCE_GANGING 20000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 +log_must dd if=/$TESTPOOL/fs1/f1 of=/$TESTPOOL/fs2/f1 bs=64k count=1 +log_must sync_pool $TESTPOOL + +log_must rm /$TESTPOOL/fs*/f1 +log_must sync_pool $TESTPOOL +log_must dd if=/dev/urandom of=/$TESTPOOL/fs1/f1 bs=64k count=1 +log_must sync_pool $TESTPOOL +log_must zdb -D $TESTPOOL +set_tunable32 METASLAB_FORCE_GANGING_PCT 0 +log_must dd if=/$TESTPOOL/fs1/f1 of=/$TESTPOOL/fs2/f1 bs=64k count=1 +log_must sync_pool $TESTPOOL + +log_must rm /$TESTPOOL/fs*/f1 +log_must sync_pool $TESTPOOL +set_tunable32 METASLAB_FORCE_GANGING_PCT 50 +set_tunable32 METASLAB_FORCE_GANGING 40000 +log_must dd if=/dev/urandom of=/$TESTPOOL/f1 bs=64k count=1 +for i in `seq 1 16`; do + log_must cp /$TESTPOOL/f1 /$TESTPOOL/fs2/f1 + log_must cp /$TESTPOOL/f1 /$TESTPOOL/fs1/f1 + log_must sync_pool $TESTPOOL + log_must zdb -D $TESTPOOL +done + +log_pass "Verify that mixed gang blocks and copies interact correctly in FDT"