From 0ea44e576bd45133041811b398043c2286542bae Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 13 Mar 2025 13:27:57 -0400 Subject: [PATCH] Fix deduplication of overridden blocks Implementation of DDT pruning introduced verification of DVAs in a block pointer during ddt_lookup() to not by mistake free previous pruned incarnation of the entry. But when writing a new block in zio_ddt_write() we might have the DVAs only from override pointer, which may never have "D" flag to be confused with pruned DDT entry, and we'll abandon those DVAs if we find a matching entry in DDT. This fixes deduplication for blocks written via dmu_sync() for purposes of indirect ZIL write records, that I have tested. And I suspect it might actually allow deduplication for Direct I/O, even though in an odd way -- first write block directly and then delete it later during TXG commit if found duplicate, which part I haven't tested. Reviewed-by: Tony Hutter Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #17120 --- cmd/zdb/zdb.c | 2 +- include/sys/ddt.h | 3 ++- module/zfs/ddt.c | 15 ++++++++------- module/zfs/zio.c | 10 ++++++++-- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index d594cd112..1d7fe862a 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -5864,7 +5864,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, * Find the block. This will create the entry in memory, but * we'll know if that happened by its refcount. */ - ddt_entry_t *dde = ddt_lookup(ddt, bp); + ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE); /* * ddt_lookup() can return NULL if this block didn't exist diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 4e5ccd463..5afecf287 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -378,7 +378,8 @@ extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); extern void ddt_init(void); extern void ddt_fini(void); -extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp); +extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, + boolean_t verify); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_prefetch_all(spa_t *spa); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index e8b9fb498..bc27367d8 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -1106,7 +1106,7 @@ ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde) } ddt_entry_t * -ddt_lookup(ddt_t *ddt, const blkptr_t *bp) +ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t verify) { spa_t *spa = ddt->ddt_spa; ddt_key_t search; @@ -1141,7 +1141,7 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) /* If it's already loaded, we can just return it. */ DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit); if (dde->dde_flags & DDE_FLAG_LOADED) { - if (ddt_entry_lookup_is_valid(ddt, bp, dde)) + if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde)) return (dde); return (NULL); } @@ -1165,7 +1165,7 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) DDT_KSTAT_BUMP(ddt, dds_lookup_existing); /* Make sure the loaded entry matches the BP */ - if (ddt_entry_lookup_is_valid(ddt, bp, dde)) + if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde)) return (dde); return (NULL); } else @@ -1194,7 +1194,8 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, DDT_PHYS_SIZE(ddt)); /* Whatever we found isn't valid for this BP, eject */ - if (!ddt_entry_lookup_is_valid(ddt, bp, dde)) { + if (verify && + !ddt_entry_lookup_is_valid(ddt, bp, dde)) { avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); return (NULL); @@ -1276,7 +1277,7 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) * worth the effort to deal with without taking an entry * update. */ - valid = ddt_entry_lookup_is_valid(ddt, bp, dde); + valid = !verify || ddt_entry_lookup_is_valid(ddt, bp, dde); if (!valid && dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); @@ -2469,7 +2470,7 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ddt = ddt_select(spa, bp); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp); + dde = ddt_lookup(ddt, bp, B_TRUE); /* Can be NULL if the entry for this block was pruned. */ if (dde == NULL) { @@ -2551,7 +2552,7 @@ prune_candidates_sync(void *arg, dmu_tx_t *tx) ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key, dpe->dpe_phys, DDT_PHYS_FLAT, &blk); - ddt_entry_t *dde = ddt_lookup(ddt, &blk); + ddt_entry_t *dde = ddt_lookup(ddt, &blk, B_TRUE); if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) { ASSERT(dde->dde_flags & DDE_FLAG_LOADED); /* diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 45174ae6c..6d9c20e8e 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3717,7 +3717,13 @@ zio_ddt_write(zio_t *zio) ASSERT3B(zio->io_prop.zp_direct_write, ==, B_FALSE); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp); + /* + * Search DDT for matching entry. Skip DVAs verification here, since + * they can go only from override, and once we get here the override + * pointer can't have "D" flag to be confused with pruned DDT entries. + */ + IMPLY(zio->io_bp_override, !BP_GET_DEDUP(zio->io_bp_override)); + dde = ddt_lookup(ddt, bp, B_FALSE); if (dde == NULL) { /* DDT size is over its quota so no new entries */ zp->zp_dedup = B_FALSE; @@ -3993,7 +3999,7 @@ zio_ddt_free(zio_t *zio) ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); - freedde = dde = ddt_lookup(ddt, bp); + freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); if (v != DDT_PHYS_NONE)