mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-05-23 06:54:58 +03:00
Handle interaction between gang blocks, copies, and FDT.
With the advent of fast dedup, there are no longer separate dedup tables for different copies values. There is now logic that will add DVAs to the dedup table entry if more copies are needed for new writes. However, this interacts poorly with ganging. There are two different cases that can result in mixed gang/non-gang BPs, which are illegal in ZFS. This change modifies updates of existing FDT; if there are already gang DVAs in the FDT, we prevent the new write from extending the DDT entry. We cannot safely mix different gang trees in one block pointer. if there are non-gang DVAs in the FDT, then this allocation may not be gangs. If it would gang, we have to redo the whole write as a non-dedup write. This change also fixes a refcount leak that could occur if the lead DDT write failed. Sponsored by: iXsystems, Inc. Sponsored-by: Klara, Inc. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com> Closes: #17123
This commit is contained in:
parent
1d8f625233
commit
5f5321effa
@ -352,6 +352,8 @@ extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
|
|||||||
const ddt_entry_t *dde, const blkptr_t *bp);
|
const ddt_entry_t *dde, const blkptr_t *bp);
|
||||||
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
|
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
|
||||||
ddt_phys_variant_t v);
|
ddt_phys_variant_t v);
|
||||||
|
extern int ddt_phys_is_gang(const ddt_univ_phys_t *ddp,
|
||||||
|
ddt_phys_variant_t v);
|
||||||
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
|
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
|
||||||
boolean_t encrypted);
|
boolean_t encrypted);
|
||||||
|
|
||||||
|
@ -856,6 +856,17 @@ ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
|
|||||||
return (ddp->ddp_trad[v].ddp_phys_birth);
|
return (ddp->ddp_trad[v].ddp_phys_birth);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
ddt_phys_is_gang(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
|
||||||
|
{
|
||||||
|
ASSERT3U(v, <, DDT_PHYS_NONE);
|
||||||
|
|
||||||
|
const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
|
||||||
|
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
|
||||||
|
|
||||||
|
return (DVA_GET_GANG(&dvas[0]));
|
||||||
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
|
ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
|
||||||
boolean_t encrypted)
|
boolean_t encrypted)
|
||||||
|
@ -3145,6 +3145,17 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
|
|||||||
* This value respects the redundant_metadata property.
|
* This value respects the redundant_metadata property.
|
||||||
*/
|
*/
|
||||||
int gbh_copies = gio->io_prop.zp_gang_copies;
|
int gbh_copies = gio->io_prop.zp_gang_copies;
|
||||||
|
if (gbh_copies == 0) {
|
||||||
|
/*
|
||||||
|
* This should only happen in the case where we're filling in
|
||||||
|
* DDT entries for a parent that wants more copies than the DDT
|
||||||
|
* has. In that case, we cannot gang without creating a mixed
|
||||||
|
* blkptr, which is illegal.
|
||||||
|
*/
|
||||||
|
ASSERT3U(gio->io_child_type, ==, ZIO_CHILD_DDT);
|
||||||
|
pio->io_error = EAGAIN;
|
||||||
|
return (pio);
|
||||||
|
}
|
||||||
ASSERT3S(gbh_copies, >, 0);
|
ASSERT3S(gbh_copies, >, 0);
|
||||||
ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);
|
ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);
|
||||||
|
|
||||||
@ -3614,16 +3625,6 @@ zio_ddt_child_write_done(zio_t *zio)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* We've successfully added new DVAs to the entry. Clear the saved
|
|
||||||
* state or, if there's still outstanding IO, remember it so we can
|
|
||||||
* revert to a known good state if that IO fails.
|
|
||||||
*/
|
|
||||||
if (dde->dde_io->dde_lead_zio[p] == NULL)
|
|
||||||
ddt_phys_clear(orig, v);
|
|
||||||
else
|
|
||||||
ddt_phys_copy(orig, ddp, v);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Add references for all dedup writes that were waiting on the
|
* Add references for all dedup writes that were waiting on the
|
||||||
* physical one, skipping any other physical writes that are waiting.
|
* physical one, skipping any other physical writes that are waiting.
|
||||||
@ -3635,6 +3636,16 @@ zio_ddt_child_write_done(zio_t *zio)
|
|||||||
ddt_phys_addref(ddp, v);
|
ddt_phys_addref(ddp, v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We've successfully added new DVAs to the entry. Clear the saved
|
||||||
|
* state or, if there's still outstanding IO, remember it so we can
|
||||||
|
* revert to a known good state if that IO fails.
|
||||||
|
*/
|
||||||
|
if (dde->dde_io->dde_lead_zio[p] == NULL)
|
||||||
|
ddt_phys_clear(orig, v);
|
||||||
|
else
|
||||||
|
ddt_phys_copy(orig, ddp, v);
|
||||||
|
|
||||||
ddt_exit(ddt);
|
ddt_exit(ddt);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3650,6 +3661,16 @@ zio_ddt_child_write_ready(zio_t *zio)
|
|||||||
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
|
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
|
||||||
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
||||||
|
|
||||||
|
if (ddt_phys_is_gang(dde->dde_phys, v)) {
|
||||||
|
for (int i = 0; i < BP_GET_NDVAS(zio->io_bp); i++) {
|
||||||
|
dva_t *d = &zio->io_bp->blk_dva[i];
|
||||||
|
metaslab_group_alloc_decrement(zio->io_spa,
|
||||||
|
DVA_GET_VDEV(d), zio->io_allocator,
|
||||||
|
METASLAB_ASYNC_ALLOC, zio->io_size, zio);
|
||||||
|
}
|
||||||
|
zio->io_error = EAGAIN;
|
||||||
|
}
|
||||||
|
|
||||||
if (zio->io_error != 0)
|
if (zio->io_error != 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@ -3769,9 +3790,12 @@ zio_ddt_write(zio_t *zio)
|
|||||||
*/
|
*/
|
||||||
int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
|
int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
|
||||||
IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
|
IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
|
||||||
|
boolean_t is_ganged = ddt_phys_is_gang(ddp, v);
|
||||||
|
|
||||||
/* Number of DVAs requested bya the IO. */
|
/* Number of DVAs requested by the IO. */
|
||||||
uint8_t need_dvas = zp->zp_copies;
|
uint8_t need_dvas = zp->zp_copies;
|
||||||
|
/* Number of DVAs in outstanding writes for this dde. */
|
||||||
|
uint8_t parent_dvas = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* What we do next depends on whether or not there's IO outstanding that
|
* What we do next depends on whether or not there's IO outstanding that
|
||||||
@ -3901,7 +3925,7 @@ zio_ddt_write(zio_t *zio)
|
|||||||
zio_t *pio =
|
zio_t *pio =
|
||||||
zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl);
|
zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl);
|
||||||
ASSERT(pio);
|
ASSERT(pio);
|
||||||
uint8_t parent_dvas = pio->io_prop.zp_copies;
|
parent_dvas = pio->io_prop.zp_copies;
|
||||||
|
|
||||||
if (parent_dvas >= need_dvas) {
|
if (parent_dvas >= need_dvas) {
|
||||||
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
|
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
|
||||||
@ -3916,13 +3940,27 @@ zio_ddt_write(zio_t *zio)
|
|||||||
need_dvas -= parent_dvas;
|
need_dvas -= parent_dvas;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (is_ganged) {
|
||||||
|
zp->zp_dedup = B_FALSE;
|
||||||
|
BP_SET_DEDUP(bp, B_FALSE);
|
||||||
|
zio->io_pipeline = ZIO_WRITE_PIPELINE;
|
||||||
|
ddt_exit(ddt);
|
||||||
|
return (zio);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We need to write. We will create a new write with the copies
|
* We need to write. We will create a new write with the copies
|
||||||
* property adjusted to match the number of DVAs we need to need to
|
* property adjusted to match the number of DVAs we need to need to
|
||||||
* grow the DDT entry by to satisfy the request.
|
* grow the DDT entry by to satisfy the request.
|
||||||
*/
|
*/
|
||||||
zio_prop_t czp = *zp;
|
zio_prop_t czp = *zp;
|
||||||
czp.zp_copies = czp.zp_gang_copies = need_dvas;
|
if (have_dvas > 0 || parent_dvas > 0) {
|
||||||
|
czp.zp_copies = need_dvas;
|
||||||
|
czp.zp_gang_copies = 0;
|
||||||
|
} else {
|
||||||
|
ASSERT3U(czp.zp_copies, ==, need_dvas);
|
||||||
|
}
|
||||||
|
|
||||||
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
|
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
|
||||||
zio->io_orig_size, zio->io_orig_size, &czp,
|
zio->io_orig_size, zio->io_orig_size, &czp,
|
||||||
zio_ddt_child_write_ready, NULL,
|
zio_ddt_child_write_ready, NULL,
|
||||||
@ -4106,6 +4144,7 @@ zio_dva_allocate(zio_t *zio)
|
|||||||
ASSERT(BP_IS_HOLE(bp));
|
ASSERT(BP_IS_HOLE(bp));
|
||||||
ASSERT0(BP_GET_NDVAS(bp));
|
ASSERT0(BP_GET_NDVAS(bp));
|
||||||
ASSERT3U(zio->io_prop.zp_copies, >, 0);
|
ASSERT3U(zio->io_prop.zp_copies, >, 0);
|
||||||
|
|
||||||
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
|
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
|
||||||
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
|
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
|
||||||
|
|
||||||
@ -5391,6 +5430,16 @@ zio_done(zio_t *zio)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (zio->io_error && zio == zio->io_logical) {
|
if (zio->io_error && zio == zio->io_logical) {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A DDT child tried to create a mixed gang/non-gang BP. We're
|
||||||
|
* going to have to just retry as a non-dedup IO.
|
||||||
|
*/
|
||||||
|
if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) &&
|
||||||
|
zio->io_prop.zp_dedup) {
|
||||||
|
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
|
||||||
|
zio->io_prop.zp_dedup = B_FALSE;
|
||||||
|
}
|
||||||
/*
|
/*
|
||||||
* Determine whether zio should be reexecuted. This will
|
* Determine whether zio should be reexecuted. This will
|
||||||
* propagate all the way to the root via zio_notify_parent().
|
* propagate all the way to the root via zio_notify_parent().
|
||||||
|
@ -726,7 +726,7 @@ tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg',
|
|||||||
tags = ['functional', 'features', 'large_dnode']
|
tags = ['functional', 'features', 'large_dnode']
|
||||||
|
|
||||||
[tests/functional/gang_blocks]
|
[tests/functional/gang_blocks]
|
||||||
tests = ['gang_blocks_redundant']
|
tests = ['gang_blocks_redundant', 'gang_blocks_ddt_copies']
|
||||||
tags = ['functional', 'gang_blocks']
|
tags = ['functional', 'gang_blocks']
|
||||||
|
|
||||||
[tests/functional/grow]
|
[tests/functional/grow]
|
||||||
|
@ -1561,6 +1561,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
|||||||
functional/features/large_dnode/large_dnode_009_pos.ksh \
|
functional/features/large_dnode/large_dnode_009_pos.ksh \
|
||||||
functional/features/large_dnode/setup.ksh \
|
functional/features/large_dnode/setup.ksh \
|
||||||
functional/gang_blocks/cleanup.ksh \
|
functional/gang_blocks/cleanup.ksh \
|
||||||
|
functional/gang_blocks/gang_blocks_ddt_copies.ksh \
|
||||||
functional/gang_blocks/gang_blocks_redundant.ksh \
|
functional/gang_blocks/gang_blocks_redundant.ksh \
|
||||||
functional/gang_blocks/setup.ksh \
|
functional/gang_blocks/setup.ksh \
|
||||||
functional/grow/grow_pool_001_pos.ksh \
|
functional/grow/grow_pool_001_pos.ksh \
|
||||||
|
79
tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_ddt_copies.ksh
Executable file
79
tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_ddt_copies.ksh
Executable file
@ -0,0 +1,79 @@
|
|||||||
|
#!/bin/ksh
|
||||||
|
# SPDX-License-Identifier: CDDL-1.0
|
||||||
|
#
|
||||||
|
# This file and its contents are supplied under the terms of the
|
||||||
|
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||||
|
# You may only use this file in accordance with the terms of version
|
||||||
|
# 1.0 of the CDDL.
|
||||||
|
#
|
||||||
|
# A full copy of the text of the CDDL should have accompanied this
|
||||||
|
# source. A copy of the CDDL is also available via the Internet at
|
||||||
|
# http://www.illumos.org/license/CDDL.
|
||||||
|
#
|
||||||
|
|
||||||
|
#
|
||||||
|
# Copyright (c) 2025 by Klara Inc.
|
||||||
|
#
|
||||||
|
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Verify that mixed gang blocks and copies interact correctly in FDT
|
||||||
|
#
|
||||||
|
# Strategy:
|
||||||
|
# 1. Store a block with copies = 1 in the DDT unganged.
|
||||||
|
# 2. Add a new entry with copies = 2 that gangs, ensure it doesn't panic
|
||||||
|
# 3. Store a block with copies = 1 in the DDT ganged.
|
||||||
|
# 4. Add a new entry with copies = 3 that doesn't gang, ensure that it doesn't panic.
|
||||||
|
#
|
||||||
|
|
||||||
|
. $STF_SUITE/include/libtest.shlib
|
||||||
|
. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
|
||||||
|
|
||||||
|
log_assert "Verify that mixed gang blocks and copies interact correctly in FDT"
|
||||||
|
|
||||||
|
save_tunable DEDUP_LOG_TXG_MAX
|
||||||
|
|
||||||
|
function cleanup2
|
||||||
|
{
|
||||||
|
zfs destroy $TESTPOOL/fs1
|
||||||
|
zfs destroy $TESTPOOL/fs2
|
||||||
|
restore_tunable DEDUP_LOG_TXG_MAX
|
||||||
|
cleanup
|
||||||
|
}
|
||||||
|
|
||||||
|
preamble
|
||||||
|
log_onexit cleanup2
|
||||||
|
|
||||||
|
log_must zpool create -f -o ashift=9 -o feature@block_cloning=disabled $TESTPOOL $DISKS
|
||||||
|
log_must zfs create -o recordsize=64k -o dedup=on $TESTPOOL/fs1
|
||||||
|
log_must zfs create -o recordsize=64k -o dedup=on -o copies=3 $TESTPOOL/fs2
|
||||||
|
set_tunable32 DEDUP_LOG_TXG_MAX 1
|
||||||
|
log_must dd if=/dev/urandom of=/$TESTPOOL/fs1/f1 bs=64k count=1
|
||||||
|
log_must sync_pool $TESTPOOL
|
||||||
|
set_tunable32 METASLAB_FORCE_GANGING 20000
|
||||||
|
set_tunable32 METASLAB_FORCE_GANGING_PCT 100
|
||||||
|
log_must dd if=/$TESTPOOL/fs1/f1 of=/$TESTPOOL/fs2/f1 bs=64k count=1
|
||||||
|
log_must sync_pool $TESTPOOL
|
||||||
|
|
||||||
|
log_must rm /$TESTPOOL/fs*/f1
|
||||||
|
log_must sync_pool $TESTPOOL
|
||||||
|
log_must dd if=/dev/urandom of=/$TESTPOOL/fs1/f1 bs=64k count=1
|
||||||
|
log_must sync_pool $TESTPOOL
|
||||||
|
log_must zdb -D $TESTPOOL
|
||||||
|
set_tunable32 METASLAB_FORCE_GANGING_PCT 0
|
||||||
|
log_must dd if=/$TESTPOOL/fs1/f1 of=/$TESTPOOL/fs2/f1 bs=64k count=1
|
||||||
|
log_must sync_pool $TESTPOOL
|
||||||
|
|
||||||
|
log_must rm /$TESTPOOL/fs*/f1
|
||||||
|
log_must sync_pool $TESTPOOL
|
||||||
|
set_tunable32 METASLAB_FORCE_GANGING_PCT 50
|
||||||
|
set_tunable32 METASLAB_FORCE_GANGING 40000
|
||||||
|
log_must dd if=/dev/urandom of=/$TESTPOOL/f1 bs=64k count=1
|
||||||
|
for i in `seq 1 16`; do
|
||||||
|
log_must cp /$TESTPOOL/f1 /$TESTPOOL/fs2/f1
|
||||||
|
log_must cp /$TESTPOOL/f1 /$TESTPOOL/fs1/f1
|
||||||
|
log_must sync_pool $TESTPOOL
|
||||||
|
log_must zdb -D $TESTPOOL
|
||||||
|
done
|
||||||
|
|
||||||
|
log_pass "Verify that mixed gang blocks and copies interact correctly in FDT"
|
Loading…
Reference in New Issue
Block a user