ddt: add "flat phys" feature

Traditional dedup keeps a separate ddt_phys_t "type" for each possible
count of DVAs (that is, each value of the copies= parameter). Each of
these is tracked independently of the others, and has its own set of
DVAs. This leads
to an (admittedly rare) situation where you can create as many as six
copies of the data, by changing the copies= parameter between copying.
This is not only a waste of storage on disk, but also a waste of space in
the stored DDT entries, since there never needs to be more than three
DVAs to handle all possible values of copies=.

This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the
first ddt_phys_t is used. Each time a block is written with the dedup
bit set, this single phys is checked to see if it has enough DVAs to
fulfill the request. If it does, the block is filled with the saved DVAs
as normal. If not, an adjusted write is issued to create as many extra
copies as are needed to fulfill the request, which are then saved into
the entry too.

Because a single phys is no longer an all-or-nothing, but can be
transitioning from fewer to more DVAs, the write path now has to keep a
copy of the previous "known good" DVA set so we can revert to it in case
an error occurs. zio_ddt_write() has been restructured and heavily
commented to make it much easier to see what's happening.

Backwards compatibility is maintained simply by allocating four
ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys
selection macros to check the flag. In the old arrangement, each number
of copies gets a whole phys, so it will always have either zero or all
necessary DVAs filled, with no in-between, so the old behaviour
naturally falls out of the new code.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15893
This commit is contained in:
Rob Norris 2023-06-20 11:09:48 +10:00 committed by Brian Behlendorf
parent 0ba5f503c5
commit f4aeb23f52
10 changed files with 757 additions and 258 deletions

View File

@ -1922,14 +1922,16 @@ dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
blkptr_t blk; blkptr_t blk;
int p; int p;
for (p = 0; p < ddlwe->ddlwe_nphys; p++) { for (p = 0; p < DDT_NPHYS(ddt); p++) {
const ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p]; const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
if (ddp->ddp_phys_birth == 0) ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
if (ddt_phys_birth(ddp, v) == 0)
continue; continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
(void) printf("index %llx refcnt %llu phys %d %s\n", (void) printf("index %llx refcnt %llu phys %d %s\n",
(u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v),
p, blkbuf); p, blkbuf);
} }
} }
@ -3311,8 +3313,7 @@ zdb_ddt_cleanup(spa_t *spa)
ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next; ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next;
while (dde) { while (dde) {
next = AVL_NEXT(&ddt->ddt_tree, dde); next = AVL_NEXT(&ddt->ddt_tree, dde);
memset(&dde->dde_lead_zio, 0, dde->dde_io = NULL;
sizeof (dde->dde_lead_zio));
ddt_remove(ddt, dde); ddt_remove(ddt, dde);
dde = next; dde = next;
} }
@ -5689,6 +5690,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
blkptr_t tempbp;
if (BP_GET_DEDUP(bp)) { if (BP_GET_DEDUP(bp)) {
/* /*
* Dedup'd blocks are special. We need to count them, so we can * Dedup'd blocks are special. We need to count them, so we can
@ -5724,35 +5726,51 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
VERIFY3P(dde, !=, NULL); VERIFY3P(dde, !=, NULL);
/* Get the phys for this variant */ /* Get the phys for this variant */
ddt_phys_t *ddp = ddt_phys_select(ddt, dde, bp); ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
VERIFY3P(ddp, !=, NULL);
/* /*
* This entry may have multiple sets of DVAs. We must claim * This entry may have multiple sets of DVAs. We must claim
* each set the first time we see them in a real block on disk, * each set the first time we see them in a real block on disk,
* or count them on subsequent occurrences. We don't have a * convenient way to track the first time we see each variant,
* convenient way to track the first time we see each variant, * convenient way to track the first time we see each variant,
* so we repurpose dde_lead_zio[] as a per-phys "seen" flag. We * so we repurpose dde_io as a set of "seen" flag bits. We can
* can do this safely in zdb because it never writes, so it * do this safely in zdb because it never writes, so it will
* will never have a writing zio for this block in that * never have a writing zio for this block in that pointer.
* pointer.
*/ */
boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));
/*
* Work out which dde_phys index was used, get the seen flag,
* and update it if necessary.
*/
uint_t idx =
((uint_t)((uintptr_t)ddp - (uintptr_t)dde->dde_phys)) /
sizeof (ddt_phys_t);
VERIFY3P(ddp, ==, &dde->dde_phys[idx]);
boolean_t seen = (boolean_t)(uintptr_t)dde->dde_lead_zio[idx];
if (!seen) if (!seen)
dde->dde_lead_zio[idx] = (zio_t *)(uintptr_t)B_TRUE; dde->dde_io =
(void *)(((uintptr_t)dde->dde_io) | (1 << v));
/* Consume a reference for this block. */ /* Consume a reference for this block. */
VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0); VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0);
ddt_phys_decref(ddp); ddt_phys_decref(dde->dde_phys, v);
/*
* If this entry has a single flat phys, it may have been
* extended with additional DVAs at some time in its life.
* This block might be from before it was fully extended, and
* so have fewer DVAs.
*
* If this is the first time we've seen this block, and we
* claimed it as-is, then we would miss the claim on some
* number of DVAs, which would then be seen as leaked.
*
* In all cases, if we've had fewer DVAs, then the asize would
* be too small, and would lead to the pool apparently using
* more space than allocated.
*
* To handle this, we copy the canonical set of DVAs from the
* entry back to the block pointer before we claim it.
*/
if (v == DDT_PHYS_FLAT) {
ASSERT3U(BP_GET_BIRTH(bp), ==,
ddt_phys_birth(dde->dde_phys, v));
tempbp = *bp;
ddt_bp_fill(dde->dde_phys, v, &tempbp,
BP_GET_BIRTH(bp));
bp = &tempbp;
}
if (seen) { if (seen) {
/* /*

View File

@ -42,8 +42,8 @@ struct abd;
/* /*
* DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). * DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
*/ */
/* No flags yet. */ #define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */
#define DDT_FLAG_MASK (0) #define DDT_FLAG_MASK (DDT_FLAG_FLAT)
/* /*
* DDT on-disk storage object types. Each one corresponds to specific * DDT on-disk storage object types. Each one corresponds to specific
@ -126,21 +126,80 @@ typedef struct {
* characteristics of the stored block, such as its location on disk (DVAs), * characteristics of the stored block, such as its location on disk (DVAs),
* birth txg and ref count. * birth txg and ref count.
* *
* Note that an entry has an array of four ddt_phys_t, one for each number of * The "traditional" entry has an array of four, one for each number of DVAs
* DVAs (copies= property) and another for additional "ditto" copies. Most * (copies= property) and another for additional "ditto" copies. Users of the
* users of ddt_phys_t will handle indexing into or counting the phys they * traditional struct will specify the variant (index) of the one they want.
* want. *
* The newer "flat" entry has only a single form that is specified using the
* DDT_PHYS_FLAT variant.
*
* Since the value size varies, use one of the size macros when interfacing
* with the ddt zap.
*/ */
typedef struct {
#define DDT_PHYS_MAX (4)
/*
* Note - this can be used in a flexible array and allocated for
* a specific size (ddp_trad or ddp_flat). So be careful not to
* copy using "=" assignment but instead use ddt_phys_copy().
*/
typedef union {
/*
* Traditional physical payload value for DDT zap (256 bytes)
*/
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP]; dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt; uint64_t ddp_refcnt;
uint64_t ddp_phys_birth; uint64_t ddp_phys_birth;
} ddt_phys_t; } ddp_trad[DDT_PHYS_MAX];
#define DDT_PHYS_MAX (4) /*
#define DDT_NPHYS(ddt) ((ddt) ? DDT_PHYS_MAX : DDT_PHYS_MAX) * Flat physical payload value for DDT zap (72 bytes)
#define DDT_PHYS_IS_DITTO(ddt, p) ((ddt) && p == 0) */
#define DDT_PHYS_FOR_COPIES(ddt, p) ((ddt) ? (p) : (p)) struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth; /* txg based from BP */
uint64_t ddp_class_start; /* in realtime seconds */
} ddp_flat;
} ddt_univ_phys_t;
/*
* This enum denotes which variant of a ddt_univ_phys_t to target. For
* a traditional DDT entry, it represents the indexes into the ddp_trad
* array. Any consumer of a ddt_univ_phys_t needs to know which variant
* is being targeted.
*
* Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
* we maintain the ability to free existing dedup-ditto blocks.
*/
typedef enum {
DDT_PHYS_DITTO = 0,
DDT_PHYS_SINGLE = 1,
DDT_PHYS_DOUBLE = 2,
DDT_PHYS_TRIPLE = 3,
DDT_PHYS_FLAT = 4,
DDT_PHYS_NONE = 5
} ddt_phys_variant_t;
#define DDT_PHYS_VARIANT(ddt, p) \
(ASSERT((p) < DDT_PHYS_NONE), \
((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p)))
#define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad)
#define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat)
#define _DDT_PHYS_SWITCH(ddt, flat, trad) \
(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))
#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \
DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)
#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)
#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p)
#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, (p == 0))
/* /*
* A "live" entry, holding changes to an entry made this txg, and other data to * A "live" entry, holding changes to an entry made this txg, and other data to
@ -159,6 +218,9 @@ typedef struct {
/* copy of data after a repair read, to be rewritten */ /* copy of data after a repair read, to be rewritten */
abd_t *dde_repair_abd; abd_t *dde_repair_abd;
/* original phys contents before update, for error handling */
ddt_univ_phys_t dde_orig_phys;
/* in-flight update IOs */ /* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_MAX]; zio_t *dde_lead_zio[DDT_PHYS_MAX];
} ddt_entry_io_t; } ddt_entry_io_t;
@ -178,7 +240,7 @@ typedef struct {
ddt_entry_io_t *dde_io; /* IO support, when required */ ddt_entry_io_t *dde_io; /* IO support, when required */
ddt_phys_t dde_phys[]; /* physical data */ ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */
} ddt_entry_t; } ddt_entry_t;
/* /*
@ -189,8 +251,7 @@ typedef struct {
ddt_key_t ddlwe_key; ddt_key_t ddlwe_key;
ddt_type_t ddlwe_type; ddt_type_t ddlwe_type;
ddt_class_t ddlwe_class; ddt_class_t ddlwe_class;
uint8_t ddlwe_nphys; ddt_univ_phys_t ddlwe_phys;
ddt_phys_t ddlwe_phys[DDT_PHYS_MAX];
} ddt_lightweight_entry_t; } ddt_lightweight_entry_t;
/* /*
@ -236,17 +297,26 @@ typedef struct {
uint64_t ddb_cursor; uint64_t ddb_cursor;
} ddt_bookmark_t; } ddt_bookmark_t;
extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
uint64_t txg); blkptr_t *bp, uint64_t txg);
extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
const ddt_phys_t *ddp, blkptr_t *bp); const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp);
extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
extern void ddt_phys_clear(ddt_phys_t *ddp);
extern void ddt_phys_addref(ddt_phys_t *ddp);
extern void ddt_phys_decref(ddt_phys_t *ddp);
extern ddt_phys_t *ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde,
const blkptr_t *bp); const blkptr_t *bp);
extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
ddt_phys_variant_t v);
extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
const ddt_entry_t *dde, const blkptr_t *bp);
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
boolean_t encrypted);
extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);

View File

@ -47,9 +47,7 @@ extern "C" {
(ddlwe)->ddlwe_key = (dde)->dde_key; \ (ddlwe)->ddlwe_key = (dde)->dde_key; \
(ddlwe)->ddlwe_type = (dde)->dde_type; \ (ddlwe)->ddlwe_type = (dde)->dde_type; \
(ddlwe)->ddlwe_class = (dde)->dde_class; \ (ddlwe)->ddlwe_class = (dde)->dde_class; \
(ddlwe)->ddlwe_nphys = DDT_NPHYS(ddt); \ memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
for (int p = 0; p < (ddlwe)->ddlwe_nphys; p++) \
(ddlwe)->ddlwe_phys[p] = (dde)->dde_phys[p]; \
} while (0) } while (0)
/* /*
@ -61,19 +59,19 @@ typedef struct {
boolean_t prehash); boolean_t prehash);
int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
int (*ddt_op_lookup)(objset_t *os, uint64_t object, int (*ddt_op_lookup)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); const ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_contains)(objset_t *os, uint64_t object, int (*ddt_op_contains)(objset_t *os, uint64_t object,
const ddt_key_t *ddk); const ddt_key_t *ddk);
void (*ddt_op_prefetch)(objset_t *os, uint64_t object, void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
const ddt_key_t *ddk); const ddt_key_t *ddk);
void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object); void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
int (*ddt_op_update)(objset_t *os, uint64_t object, int (*ddt_op_update)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, const ddt_key_t *ddk, const void *phys, size_t psize,
dmu_tx_t *tx); dmu_tx_t *tx);
int (*ddt_op_remove)(objset_t *os, uint64_t object, int (*ddt_op_remove)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, dmu_tx_t *tx); const ddt_key_t *ddk, dmu_tx_t *tx);
int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk, int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
} ddt_ops_t; } ddt_ops_t;

View File

@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,

View File

@ -572,7 +572,7 @@ typedef struct blkptr {
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
BP_GET_PSIZE(bp)) BP_GET_PSIZE(bp))
#define BP_ZERO(bp) \ #define BP_ZERO_DVAS(bp) \
{ \ { \
(bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[0] = 0; \
(bp)->blk_dva[0].dva_word[1] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \
@ -580,6 +580,11 @@ typedef struct blkptr {
(bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[1].dva_word[1] = 0; \
(bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \
(bp)->blk_dva[2].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \
}
#define BP_ZERO(bp) \
{ \
BP_ZERO_DVAS(bp); \
(bp)->blk_prop = 0; \ (bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \ (bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \ (bp)->blk_pad[1] = 0; \

View File

@ -75,12 +75,19 @@
* fill the BP with the DVAs from the entry, increment the refcount and cause * fill the BP with the DVAs from the entry, increment the refcount and cause
* the write IO to return immediately. * the write IO to return immediately.
* *
* Each ddt_phys_t slot in the entry represents a separate dedup block for the * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup
* same content/checksum. The slot is selected based on the zp_copies parameter * block for the same content/checksum. The slot is selected based on the
* the block is written with, that is, the number of DVAs in the block. The * zp_copies parameter the block is written with, that is, the number of DVAs
* "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto" * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for
* feature. These are no longer written, and will be freed if encountered on * now-removed "dedupditto" feature. These are no longer written, and will be
* old pools. * freed if encountered on old pools.
*
* If the "fast_dedup" feature is enabled, new dedup tables will be created
* with the "flat phys" option. In this mode, there is only one ddt_phys_t
* slot. If a write is issued for an entry that exists, but has fewer DVAs,
* then only as many new DVAs are allocated and written to make up the
* shortfall. The existing entry is then extended (ddt_phys_extend()) with the
* new DVAs.
* *
* ## Lifetime of an entry * ## Lifetime of an entry
* *
@ -130,6 +137,16 @@
* from the alternate block. If the block is actually damaged, this will invoke * from the alternate block. If the block is actually damaged, this will invoke
* the pool's "self-healing" mechanism, and repair the block. * the pool's "self-healing" mechanism, and repair the block.
* *
* If the "fast_dedup" feature is enabled, the "flat phys" option will be in
* use, so there is only ever one ddt_phys_t slot. The repair process will
* still happen in this case, though it is unlikely to succeed as there will
* usually be no other equivalent blocks to fall back on (though there might
* be, if this was an early version of a dedup'd block that has since been
* extended).
*
* Note that this repair mechanism is in addition to and separate from the
* regular OpenZFS scrub and self-healing mechanisms.
*
* ## Scanning (scrub/resilver) * ## Scanning (scrub/resilver)
* *
* If dedup is active, the scrub machinery will walk the dedup table first, and * If dedup is active, the scrub machinery will walk the dedup table first, and
@ -162,10 +179,15 @@
c == ZIO_CHECKSUM_BLAKE3) c == ZIO_CHECKSUM_BLAKE3)
static kmem_cache_t *ddt_cache; static kmem_cache_t *ddt_cache;
static kmem_cache_t *ddt_entry_cache;
#define DDT_ENTRY_SIZE \ static kmem_cache_t *ddt_entry_flat_cache;
(sizeof (ddt_entry_t) + sizeof (ddt_phys_t) * DDT_PHYS_MAX) static kmem_cache_t *ddt_entry_trad_cache;
#define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE)
#define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE)
#define DDT_ENTRY_SIZE(ddt) \
_DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE)
/* /*
* Enable/disable prefetching of dedup-ed blocks which are going to be freed. * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
@ -195,7 +217,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = {
*/ */
static const uint64_t ddt_version_flags[] = { static const uint64_t ddt_version_flags[] = {
[DDT_VERSION_LEGACY] = 0, [DDT_VERSION_LEGACY] = 0,
[DDT_VERSION_FDT] = 0, [DDT_VERSION_FDT] = DDT_FLAG_FLAT,
}; };
/* Dummy version to signal that configure is still necessary */ /* Dummy version to signal that configure is still necessary */
@ -346,7 +368,7 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
ddt->ddt_object[type][class], &dde->dde_key, ddt->ddt_object[type][class], &dde->dde_key,
dde->dde_phys, sizeof (ddt_phys_t) * DDT_NPHYS(ddt))); dde->dde_phys, DDT_PHYS_SIZE(ddt)));
} }
static int static int
@ -388,8 +410,8 @@ ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
ASSERT(ddt_object_exists(ddt, type, class)); ASSERT(ddt_object_exists(ddt, type, class));
return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, ddt->ddt_object[type][class], &dde->dde_key,
sizeof (ddt_phys_t) * DDT_NPHYS(ddt), tx)); dde->dde_phys, DDT_PHYS_SIZE(ddt), tx));
} }
static int static int
@ -410,11 +432,10 @@ ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os, int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key, ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key,
ddlwe->ddlwe_phys, sizeof (ddlwe->ddlwe_phys)); &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
if (error == 0) { if (error == 0) {
ddlwe->ddlwe_type = type; ddlwe->ddlwe_type = type;
ddlwe->ddlwe_class = class; ddlwe->ddlwe_class = class;
ddlwe->ddlwe_nphys = DDT_NPHYS(ddt);
return (0); return (0);
} }
return (error); return (error);
@ -451,13 +472,25 @@ ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
} }
void void
ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
blkptr_t *bp, uint64_t txg)
{ {
ASSERT3U(txg, !=, 0); ASSERT3U(txg, !=, 0);
ASSERT3U(v, <, DDT_PHYS_NONE);
uint64_t phys_birth;
const dva_t *dvap;
if (v == DDT_PHYS_FLAT) {
phys_birth = ddp->ddp_flat.ddp_phys_birth;
dvap = ddp->ddp_flat.ddp_dva;
} else {
phys_birth = ddp->ddp_trad[v].ddp_phys_birth;
dvap = ddp->ddp_trad[v].ddp_dva;
}
for (int d = 0; d < SPA_DVAS_PER_BP; d++) for (int d = 0; d < SPA_DVAS_PER_BP; d++)
bp->blk_dva[d] = ddp->ddp_dva[d]; bp->blk_dva[d] = dvap[d];
BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); BP_SET_BIRTH(bp, txg, phys_birth);
} }
/* /*
@ -465,13 +498,13 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
* will be missing the salt / IV required to do a full decrypting read. * will be missing the salt / IV required to do a full decrypting read.
*/ */
void void
ddt_bp_create(enum zio_checksum checksum, ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp)
{ {
BP_ZERO(bp); BP_ZERO(bp);
if (ddp != NULL) if (ddp != NULL)
ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v));
bp->blk_cksum = ddk->ddk_cksum; bp->blk_cksum = ddk->ddk_cksum;
@ -502,42 +535,101 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
} }
void void
ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp)
{ {
ASSERT0(ddp->ddp_phys_birth); ASSERT3U(v, <, DDT_PHYS_NONE);
int bp_ndvas = BP_GET_NDVAS(bp);
int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ?
SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
dva_t *dvas = (v == DDT_PHYS_FLAT) ?
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
for (int d = 0; d < SPA_DVAS_PER_BP; d++) int s = 0, d = 0;
ddp->ddp_dva[d] = bp->blk_dva[d]; while (s < bp_ndvas && d < ddp_max_dvas) {
ddp->ddp_phys_birth = BP_GET_BIRTH(bp); if (DVA_IS_VALID(&dvas[d])) {
d++;
continue;
}
dvas[d] = bp->blk_dva[s];
s++; d++;
}
/*
* If the caller offered us more DVAs than we can fit, something has
* gone wrong in their accounting. zio_ddt_write() should never ask for
* more than we need.
*/
ASSERT3U(s, ==, bp_ndvas);
if (BP_IS_ENCRYPTED(bp))
dvas[2] = bp->blk_dva[2];
if (ddt_phys_birth(ddp, v) == 0) {
if (v == DDT_PHYS_FLAT)
ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp);
else
ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp);
}
} }
void void
ddt_phys_clear(ddt_phys_t *ddp) ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
ddt_phys_variant_t v)
{ {
memset(ddp, 0, sizeof (*ddp)); ASSERT3U(v, <, DDT_PHYS_NONE);
if (v == DDT_PHYS_FLAT)
dst->ddp_flat = src->ddp_flat;
else
dst->ddp_trad[v] = src->ddp_trad[v];
} }
void void
ddt_phys_addref(ddt_phys_t *ddp) ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
{ {
ddp->ddp_refcnt++; ASSERT3U(v, <, DDT_PHYS_NONE);
if (v == DDT_PHYS_FLAT)
memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE);
else
memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX);
} }
void void
ddt_phys_decref(ddt_phys_t *ddp) ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
{ {
if (ddp) { ASSERT3U(v, <, DDT_PHYS_NONE);
ASSERT3U(ddp->ddp_refcnt, >, 0);
ddp->ddp_refcnt--; if (v == DDT_PHYS_FLAT)
ddp->ddp_flat.ddp_refcnt++;
else
ddp->ddp_trad[v].ddp_refcnt++;
} }
uint64_t
ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
{
ASSERT3U(v, <, DDT_PHYS_NONE);
uint64_t *refcntp;
if (v == DDT_PHYS_FLAT)
refcntp = &ddp->ddp_flat.ddp_refcnt;
else
refcntp = &ddp->ddp_trad[v].ddp_refcnt;
ASSERT3U(*refcntp, >, 0);
(*refcntp)--;
return (*refcntp);
} }
static void static void
ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp,
ddt_phys_variant_t v, uint64_t txg)
{ {
blkptr_t blk; blkptr_t blk;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
/* /*
* We clear the dedup bit so that zio_free() will actually free the * We clear the dedup bit so that zio_free() will actually free the
@ -545,20 +637,67 @@ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
*/ */
BP_SET_DEDUP(&blk, 0); BP_SET_DEDUP(&blk, 0);
ddt_phys_clear(ddp); ddt_phys_clear(ddp, v);
zio_free(ddt->ddt_spa, txg, &blk); zio_free(ddt->ddt_spa, txg, &blk);
} }
ddt_phys_t * uint64_t
ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
{
ASSERT3U(v, <, DDT_PHYS_NONE);
if (v == DDT_PHYS_FLAT)
return (ddp->ddp_flat.ddp_phys_birth);
else
return (ddp->ddp_trad[v].ddp_phys_birth);
}
int
ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
boolean_t encrypted)
{
ASSERT3U(v, <, DDT_PHYS_NONE);
const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
return (DVA_IS_VALID(&dvas[0]) +
DVA_IS_VALID(&dvas[1]) +
DVA_IS_VALID(&dvas[2]) * !encrypted);
}
ddt_phys_variant_t
ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
{ {
for (int p = 0; p < DDT_NPHYS(ddt); p++) { const ddt_univ_phys_t *ddp = dde->dde_phys;
ddt_phys_t *ddp = (ddt_phys_t *)&dde->dde_phys[p];
if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && if (ddt->ddt_flags & DDT_FLAG_FLAT) {
BP_GET_BIRTH(bp) == ddp->ddp_phys_birth) if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) &&
return (ddp); BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
return (DDT_PHYS_FLAT);
} }
return (NULL); } else /* traditional phys */ {
for (int p = 0; p < DDT_PHYS_MAX; p++) {
if (DVA_EQUAL(BP_IDENTITY(bp),
&ddp->ddp_trad[p].ddp_dva[0]) &&
BP_GET_BIRTH(bp) ==
ddp->ddp_trad[p].ddp_phys_birth) {
return (p);
}
}
}
return (DDT_PHYS_NONE);
}
uint64_t
ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
{
ASSERT3U(v, <, DDT_PHYS_NONE);
if (v == DDT_PHYS_FLAT)
return (ddp->ddp_flat.ddp_refcnt);
else
return (ddp->ddp_trad[v].ddp_refcnt);
} }
uint64_t uint64_t
@ -566,10 +705,11 @@ ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde)
{ {
uint64_t refcnt = 0; uint64_t refcnt = 0;
for (int p = 0; p < DDT_NPHYS(ddt); p++) { if (ddt->ddt_flags & DDT_FLAG_FLAT) {
if (DDT_PHYS_IS_DITTO(ddt, p)) refcnt = dde->dde_phys->ddp_flat.ddp_refcnt;
continue; } else {
refcnt += dde->dde_phys[p].ddp_refcnt; for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt;
} }
return (refcnt); return (refcnt);
@ -599,24 +739,33 @@ ddt_init(void)
{ {
ddt_cache = kmem_cache_create("ddt_cache", ddt_cache = kmem_cache_create("ddt_cache",
sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
ddt_entry_cache = kmem_cache_create("ddt_entry_cache", ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache",
DDT_ENTRY_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
} }
void void
ddt_fini(void) ddt_fini(void)
{ {
kmem_cache_destroy(ddt_entry_cache); kmem_cache_destroy(ddt_entry_trad_cache);
kmem_cache_destroy(ddt_entry_flat_cache);
kmem_cache_destroy(ddt_cache); kmem_cache_destroy(ddt_cache);
} }
static ddt_entry_t * static ddt_entry_t *
ddt_alloc(const ddt_key_t *ddk) ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk)
{ {
ddt_entry_t *dde; ddt_entry_t *dde;
dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); if (ddt->ddt_flags & DDT_FLAG_FLAT) {
memset(dde, 0, DDT_ENTRY_SIZE); dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP);
memset(dde, 0, DDT_ENTRY_FLAT_SIZE);
} else {
dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP);
memset(dde, 0, DDT_ENTRY_TRAD_SIZE);
}
cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
dde->dde_key = *ddk; dde->dde_key = *ddk;
@ -647,7 +796,8 @@ ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
} }
cv_destroy(&dde->dde_cv); cv_destroy(&dde->dde_cv);
kmem_cache_free(ddt_entry_cache, dde); kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
ddt_entry_flat_cache : ddt_entry_trad_cache, dde);
} }
void void
@ -793,7 +943,12 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
} }
/* Time to make a new entry. */ /* Time to make a new entry. */
dde = ddt_alloc(&search); dde = ddt_alloc(ddt, &search);
/* Record the time this class was created (used by ddt prune) */
if (ddt->ddt_flags & DDT_FLAG_FLAT)
dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec();
avl_insert(&ddt->ddt_tree, dde, where); avl_insert(&ddt->ddt_tree, dde, where);
/* /*
@ -1206,7 +1361,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
ddt_key_fill(&ddk, bp); ddt_key_fill(&ddk, bp);
dde = ddt_alloc(&ddk); dde = ddt_alloc(ddt, &ddk);
ddt_alloc_entry_io(dde); ddt_alloc_entry_io(dde);
for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
@ -1222,7 +1377,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
} }
} }
memset(dde->dde_phys, 0, sizeof (ddt_phys_t) * DDT_NPHYS(ddt)); memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt));
return (dde); return (dde);
} }
@ -1265,13 +1420,26 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
ddt_repair_entry_done, rdde, rio->io_flags); ddt_repair_entry_done, rdde, rio->io_flags);
for (int p = 0; p < DDT_NPHYS(ddt); p++) { for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_univ_phys_t *ddp = dde->dde_phys;
ddt_phys_t *rddp = &rdde->dde_phys[p]; ddt_univ_phys_t *rddp = rdde->dde_phys;
if (ddp->ddp_phys_birth == 0 || ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
ddp->ddp_phys_birth != rddp->ddp_phys_birth || uint64_t phys_birth = ddt_phys_birth(ddp, v);
memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) const dva_t *dvas, *rdvas;
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
dvas = ddp->ddp_flat.ddp_dva;
rdvas = rddp->ddp_flat.ddp_dva;
} else {
dvas = ddp->ddp_trad[p].ddp_dva;
rdvas = rddp->ddp_trad[p].ddp_dva;
}
if (phys_birth == 0 ||
phys_birth != ddt_phys_birth(rddp, v) ||
memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP))
continue; continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk), rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk),
NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
@ -1297,7 +1465,8 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio)
rdde_next = AVL_NEXT(t, rdde); rdde_next = AVL_NEXT(t, rdde);
avl_remove(&ddt->ddt_repair_tree, rdde); avl_remove(&ddt->ddt_repair_tree, rdde);
ddt_exit(ddt); ddt_exit(ddt);
ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL,
DDT_PHYS_NONE, &blk);
dde = ddt_repair_start(ddt, &blk); dde = ddt_repair_start(ddt, &blk);
ddt_repair_entry(ddt, dde, rdde, rio); ddt_repair_entry(ddt, dde, rdde, rio);
ddt_repair_done(ddt, dde); ddt_repair_done(ddt, dde);
@ -1322,9 +1491,12 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
for (int p = 0; p < DDT_NPHYS(ddt); p++) { for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ASSERT(dde->dde_io == NULL || ASSERT(dde->dde_io == NULL ||
dde->dde_io->dde_lead_zio[p] == NULL); dde->dde_io->dde_lead_zio[p] == NULL);
ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_univ_phys_t *ddp = dde->dde_phys;
if (ddp->ddp_phys_birth == 0) { ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
ASSERT0(ddp->ddp_refcnt); uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
if (ddt_phys_birth(ddp, v) == 0) {
ASSERT0(phys_refcnt);
continue; continue;
} }
if (DDT_PHYS_IS_DITTO(ddt, p)) { if (DDT_PHYS_IS_DITTO(ddt, p)) {
@ -1332,12 +1504,12 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
* Note, we no longer create DDT-DITTO blocks, but we * Note, we no longer create DDT-DITTO blocks, but we
* don't want to leak any written by older software. * don't want to leak any written by older software.
*/ */
ddt_phys_free(ddt, ddk, ddp, txg); ddt_phys_free(ddt, ddk, ddp, v, txg);
continue; continue;
} }
if (ddp->ddp_refcnt == 0) if (phys_refcnt == 0)
ddt_phys_free(ddt, ddk, ddp, txg); ddt_phys_free(ddt, ddk, ddp, v, txg);
total_refcnt += ddp->ddp_refcnt; total_refcnt += phys_refcnt;
} }
if (total_refcnt > 1) if (total_refcnt > 1)
@ -1371,7 +1543,7 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
ddt_lightweight_entry_t ddlwe; ddt_lightweight_entry_t ddlwe;
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
dsl_scan_ddt_entry(dp->dp_scan, dsl_scan_ddt_entry(dp->dp_scan,
ddt->ddt_checksum, &ddlwe, tx); ddt->ddt_checksum, ddt, &ddlwe, tx);
} }
} }
} }
@ -1536,12 +1708,10 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
} }
if (dde->dde_type < DDT_TYPES) { if (dde->dde_type < DDT_TYPES) {
ddt_phys_t *ddp;
ASSERT3S(dde->dde_class, <, DDT_CLASSES); ASSERT3S(dde->dde_class, <, DDT_CLASSES);
int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
ddp = &dde->dde_phys[p]; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
/* /*
* This entry already existed (dde_type is real), so it must * This entry already existed (dde_type is real), so it must
@ -1553,9 +1723,9 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
* likely further action is required to fill out the DDT entry, * likely further action is required to fill out the DDT entry,
* and this is a place that is likely to be missed in testing. * and this is a place that is likely to be missed in testing.
*/ */
ASSERT3U(ddp->ddp_refcnt, >, 0); ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0);
ddt_phys_addref(ddp); ddt_phys_addref(dde->dde_phys, v);
result = B_TRUE; result = B_TRUE;
} else { } else {
/* /*

View File

@ -43,18 +43,22 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
memset(dds, 0, sizeof (*dds)); memset(dds, 0, sizeof (*dds));
for (int p = 0; p < DDT_NPHYS(ddt); p++) { for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ddt_phys_t *ddp = &dde->dde_phys[p]; const ddt_univ_phys_t *ddp = dde->dde_phys;
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
uint64_t dsize = 0; if (ddt_phys_birth(ddp, v) == 0)
uint64_t refcnt = ddp->ddp_refcnt;
if (ddp->ddp_phys_birth == 0)
continue; continue;
int ndvas = DDK_GET_CRYPT(&dde->dde_key) ? int ndvas = ddt_phys_dva_count(ddp, v,
SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; DDK_GET_CRYPT(&dde->dde_key));
const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva;
uint64_t dsize = 0;
for (int d = 0; d < ndvas; d++) for (int d = 0; d < ndvas; d++)
dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); dsize += dva_get_dsize_sync(spa, &dvas[d]);
uint64_t refcnt = ddt_phys_refcnt(ddp, v);
dds->dds_blocks += 1; dds->dds_blocks += 1;
dds->dds_lsize += lsize; dds->dds_lsize += lsize;

View File

@ -109,7 +109,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
static int static int
ddt_zap_lookup(objset_t *os, uint64_t object, ddt_zap_lookup(objset_t *os, uint64_t object,
const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize) const ddt_key_t *ddk, void *phys, size_t psize)
{ {
uchar_t *cbuf; uchar_t *cbuf;
uint64_t one, csize; uint64_t one, csize;
@ -156,7 +156,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object)
static int static int
ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx) const void *phys, size_t psize, dmu_tx_t *tx)
{ {
const size_t cbuf_size = psize + 1; const size_t cbuf_size = psize + 1;
@ -182,7 +182,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk,
static int static int
ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk, ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
ddt_phys_t *phys, size_t psize) void *phys, size_t psize)
{ {
zap_cursor_t zc; zap_cursor_t zc;
zap_attribute_t za; zap_attribute_t za;

View File

@ -2929,7 +2929,7 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
void void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{ {
(void) tx; (void) tx;
const ddt_key_t *ddk = &ddlwe->ddlwe_key; const ddt_key_t *ddk = &ddlwe->ddlwe_key;
@ -2953,13 +2953,13 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
if (scn->scn_done_txg != 0) if (scn->scn_done_txg != 0)
return; return;
for (int p = 0; p < ddlwe->ddlwe_nphys; p++) { for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p]; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v);
if (ddp->ddp_phys_birth == 0 || if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg)
ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
continue; continue;
ddt_bp_create(checksum, ddk, ddp, &bp); ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp);
scn->scn_visited_this_txg++; scn->scn_visited_this_txg++;
scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
@ -3022,7 +3022,7 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
ASSERT(avl_first(&ddt->ddt_tree) == NULL); ASSERT(avl_first(&ddt->ddt_tree) == NULL);
dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &ddlwe, tx); dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx);
n++; n++;
if (dsl_scan_check_suspend(scn, NULL)) if (dsl_scan_check_suspend(scn, NULL))

View File

@ -3256,14 +3256,16 @@ zio_ddt_child_read_done(zio_t *zio)
blkptr_t *bp = zio->io_bp; blkptr_t *bp = zio->io_bp;
ddt_t *ddt; ddt_t *ddt;
ddt_entry_t *dde = zio->io_private; ddt_entry_t *dde = zio->io_private;
ddt_phys_t *ddp;
zio_t *pio = zio_unique_parent(zio); zio_t *pio = zio_unique_parent(zio);
mutex_enter(&pio->io_lock); mutex_enter(&pio->io_lock);
ddt = ddt_select(zio->io_spa, bp); ddt = ddt_select(zio->io_spa, bp);
ddp = ddt_phys_select(ddt, dde, bp);
if (zio->io_error == 0) if (zio->io_error == 0) {
ddt_phys_clear(ddp); /* this ddp doesn't need repair */ ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
/* this phys variant doesn't need repair */
ddt_phys_clear(dde->dde_phys, v);
}
if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL) if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL)
dde->dde_io->dde_repair_abd = zio->io_abd; dde->dde_io->dde_repair_abd = zio->io_abd;
@ -3284,21 +3286,25 @@ zio_ddt_read_start(zio_t *zio)
if (zio->io_child_error[ZIO_CHILD_DDT]) { if (zio->io_child_error[ZIO_CHILD_DDT]) {
ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_t *ddt = ddt_select(zio->io_spa, bp);
ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp);
ddt_phys_t *ddp_self = ddt_phys_select(ddt, dde, bp); ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp);
ddt_univ_phys_t *ddp = dde->dde_phys;
blkptr_t blk; blkptr_t blk;
ASSERT(zio->io_vsd == NULL); ASSERT(zio->io_vsd == NULL);
zio->io_vsd = dde; zio->io_vsd = dde;
if (ddp_self == NULL) if (v_self == DDT_PHYS_NONE)
return (zio); return (zio);
/* issue I/O for the other copies */
for (int p = 0; p < DDT_NPHYS(ddt); p++) { for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
if (ddt_phys_birth(ddp, v) == 0 || v == v_self)
continue; continue;
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
&blk); ddt_bp_create(ddt->ddt_checksum, &dde->dde_key,
ddp, v, &blk);
zio_nowait(zio_read(zio, zio->io_spa, &blk, zio_nowait(zio_read(zio, zio->io_spa, &blk,
abd_alloc_for_io(zio->io_size, B_TRUE), abd_alloc_for_io(zio->io_size, B_TRUE),
zio->io_size, zio_ddt_child_read_done, dde, zio->io_size, zio_ddt_child_read_done, dde,
@ -3378,30 +3384,32 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (DDT_PHYS_IS_DITTO(ddt, p)) if (DDT_PHYS_IS_DITTO(ddt, p))
continue; continue;
zio_t *lio = dde->dde_io->dde_lead_zio[p]; if (dde->dde_io == NULL)
continue;
if (lio != NULL && do_raw) { zio_t *lio = dde->dde_io->dde_lead_zio[p];
if (lio == NULL)
continue;
if (do_raw)
return (lio->io_size != zio->io_size || return (lio->io_size != zio->io_size ||
abd_cmp(zio->io_abd, lio->io_abd) != 0); abd_cmp(zio->io_abd, lio->io_abd) != 0);
} else if (lio != NULL) {
return (lio->io_orig_size != zio->io_orig_size || return (lio->io_orig_size != zio->io_orig_size ||
abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
} }
}
for (int p = 0; p < DDT_NPHYS(ddt); p++) { for (int p = 0; p < DDT_NPHYS(ddt); p++) {
if (DDT_PHYS_IS_DITTO(ddt, p)) ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
continue; uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v);
ddt_phys_t *ddp = &dde->dde_phys[p]; if (phys_birth != 0 && do_raw) {
if (ddp->ddp_phys_birth != 0 && do_raw) {
blkptr_t blk = *zio->io_bp; blkptr_t blk = *zio->io_bp;
uint64_t psize; uint64_t psize;
abd_t *tmpabd; abd_t *tmpabd;
int error; int error;
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
psize = BP_GET_PSIZE(&blk); psize = BP_GET_PSIZE(&blk);
if (psize != zio->io_size) if (psize != zio->io_size)
@ -3424,13 +3432,13 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
abd_free(tmpabd); abd_free(tmpabd);
ddt_enter(ddt); ddt_enter(ddt);
return (error != 0); return (error != 0);
} else if (ddp->ddp_phys_birth != 0) { } else if (phys_birth != 0) {
arc_buf_t *abuf = NULL; arc_buf_t *abuf = NULL;
arc_flags_t aflags = ARC_FLAG_WAIT; arc_flags_t aflags = ARC_FLAG_WAIT;
blkptr_t blk = *zio->io_bp; blkptr_t blk = *zio->io_bp;
int error; int error;
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
if (BP_GET_LSIZE(&blk) != zio->io_orig_size) if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
return (B_TRUE); return (B_TRUE);
@ -3457,53 +3465,88 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
return (B_FALSE); return (B_FALSE);
} }
static void
zio_ddt_child_write_ready(zio_t *zio)
{
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
ddt_entry_t *dde = zio->io_private;
zio_t *pio;
if (zio->io_error)
return;
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
ddt_phys_t *ddp = &dde->dde_phys[p];
ddt_enter(ddt);
ASSERT(dde->dde_io->dde_lead_zio[p] == zio);
ddt_phys_fill(ddp, zio->io_bp);
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(zio, &zl)) != NULL)
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
ddt_exit(ddt);
}
static void static void
zio_ddt_child_write_done(zio_t *zio) zio_ddt_child_write_done(zio_t *zio)
{ {
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
ddt_entry_t *dde = zio->io_private; ddt_entry_t *dde = zio->io_private;
zio_link_t *zl = NULL;
ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
ddt_univ_phys_t *ddp = dde->dde_phys;
ddt_enter(ddt); ddt_enter(ddt);
ASSERT(ddp->ddp_refcnt == 0); /* we're the lead, so once we're done there's no one else outstanding */
ASSERT(dde->dde_io->dde_lead_zio[p] == zio); if (dde->dde_io->dde_lead_zio[p] == zio)
dde->dde_io->dde_lead_zio[p] = NULL; dde->dde_io->dde_lead_zio[p] = NULL;
if (zio->io_error == 0) { ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys;
if (zio->io_error != 0) {
/*
* The write failed, so we're about to abort the entire IO
* chain. We need to revert the entry back to what it was at
* the last time it was successfully extended.
*/
ddt_phys_copy(ddp, orig, v);
ddt_phys_clear(orig, v);
ddt_exit(ddt);
return;
}
/*
* We've successfully added new DVAs to the entry. Clear the saved
* state or, if there's still outstanding IO, remember it so we can
* revert to a known good state if that IO fails.
*/
if (dde->dde_io->dde_lead_zio[p] == NULL)
ddt_phys_clear(orig, v);
else
ddt_phys_copy(orig, ddp, v);
/*
* Add references for all dedup writes that were waiting on the
* physical one, skipping any other physical writes that are waiting.
*/
zio_t *pio;
zl = NULL;
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
ddt_phys_addref(ddp, v);
}
ddt_exit(ddt);
}
static void
zio_ddt_child_write_ready(zio_t *zio)
{
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
ddt_entry_t *dde = zio->io_private;
zio_link_t *zl = NULL; zio_link_t *zl = NULL;
while (zio_walk_parents(zio, &zl) != NULL) ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
ddt_phys_addref(ddp);
} else { int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
ddt_phys_clear(ddp); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
if (zio->io_error != 0)
return;
ddt_enter(ddt);
ddt_phys_extend(dde->dde_phys, v, zio->io_bp);
zio_t *pio;
zl = NULL;
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg);
} }
ddt_exit(ddt); ddt_exit(ddt);
@ -3516,7 +3559,6 @@ zio_ddt_write(zio_t *zio)
blkptr_t *bp = zio->io_bp; blkptr_t *bp = zio->io_bp;
uint64_t txg = zio->io_txg; uint64_t txg = zio->io_txg;
zio_prop_t *zp = &zio->io_prop; zio_prop_t *zp = &zio->io_prop;
zio_t *cio = NULL;
ddt_t *ddt = ddt_select(spa, bp); ddt_t *ddt = ddt_select(spa, bp);
ddt_entry_t *dde; ddt_entry_t *dde;
@ -3537,9 +3579,6 @@ zio_ddt_write(zio_t *zio)
return (zio); return (zio);
} }
int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
ddt_phys_t *ddp = &dde->dde_phys[p];
if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
/* /*
* If we're using a weak checksum, upgrade to a strong checksum * If we're using a weak checksum, upgrade to a strong checksum
@ -3563,30 +3602,226 @@ zio_ddt_write(zio_t *zio)
return (zio); return (zio);
} }
ddt_alloc_entry_io(dde); int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
ddt_univ_phys_t *ddp = dde->dde_phys;
if (ddp->ddp_phys_birth != 0 || dde->dde_io->dde_lead_zio[p] != NULL) { /*
if (ddp->ddp_phys_birth != 0) * In the common cases, at this point we have a regular BP with no
ddt_bp_fill(ddp, bp, txg); * allocated DVAs, and the corresponding DDT entry for its checksum.
if (dde->dde_io->dde_lead_zio[p] != NULL) * Our goal is to fill the BP with enough DVAs to satisfy its copies=
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); * requirement.
else *
ddt_phys_addref(ddp); * One of three things needs to happen to fulfill this:
} else if (zio->io_bp_override) { *
* - if the DDT entry has enough DVAs to satisfy the BP, we just copy
* them out of the entry and return;
*
* - if the DDT entry has no DVAs (ie its brand new), then we have to
* issue the write as normal so that DVAs can be allocated and the
* data land on disk. We then copy the DVAs into the DDT entry on
* return.
*
* - if the DDT entry has some DVAs, but too few, we have to issue the
* write, adjusted to have allocate fewer copies. When it returns, we
* add the new DVAs to the DDT entry, and update the BP to have the
* full amount it originally requested.
*
* In all cases, if there's already a writing IO in flight, we need to
* defer the action until after the write is done. If our action is to
* write, we need to adjust our request for additional DVAs to match
* what will be in the DDT entry after it completes. In this way every
* IO can be guaranteed to recieve enough DVAs simply by joining the
* end of the chain and letting the sequence play out.
*/
/*
* Number of DVAs in the DDT entry. If the BP is encrypted we ignore
* the third one as normal.
*/
int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
/* Number of DVAs requested bya the IO. */
uint8_t need_dvas = zp->zp_copies;
/*
* What we do next depends on whether or not there's IO outstanding that
* will update this entry.
*/
if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) {
/*
* No IO outstanding, so we only need to worry about ourselves.
*/
/*
* Override BPs bring their own DVAs and their own problems.
*/
if (zio->io_bp_override) {
/*
* For a brand-new entry, all the work has been done
* for us, and we can just fill it out from the provided
* block and leave.
*/
if (have_dvas == 0) {
ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ASSERT(BP_EQUAL(bp, zio->io_bp_override));
ddt_phys_fill(ddp, bp); ddt_phys_extend(ddp, v, bp);
ddt_phys_addref(ddp); ddt_phys_addref(ddp, v);
ddt_exit(ddt);
return (zio);
}
/*
* If we already have this entry, then we want to treat
* it like a regular write. To do this we just wipe
* them out and proceed like a regular write.
*
* Even if there are some DVAs in the entry, we still
* have to clear them out. We can't use them to fill
* out the dedup entry, as they are all referenced
* together by a bp already on disk, and will be freed
* as a group.
*/
BP_ZERO_DVAS(bp);
BP_SET_BIRTH(bp, 0, 0);
}
/*
* If there are enough DVAs in the entry to service our request,
* then we can just use them as-is.
*/
if (have_dvas >= need_dvas) {
ddt_bp_fill(ddp, v, bp, txg);
ddt_phys_addref(ddp, v);
ddt_exit(ddt);
return (zio);
}
/*
* Otherwise, we have to issue IO to fill the entry up to the
* amount we need.
*/
need_dvas -= have_dvas;
} else { } else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, /*
zio->io_orig_size, zio->io_orig_size, zp, * There's a write in-flight. If there's already enough DVAs on
* the entry, then either there were already enough to start
* with, or the in-flight IO is between READY and DONE, and so
* has extended the entry with new DVAs. Either way, we don't
* need to do anything, we can just slot in behind it.
*/
if (zio->io_bp_override) {
/*
* If there's a write out, then we're soon going to
* have our own copies of this block, so clear out the
* override block and treat it as a regular dedup
* write. See comment above.
*/
BP_ZERO_DVAS(bp);
BP_SET_BIRTH(bp, 0, 0);
}
if (have_dvas >= need_dvas) {
/*
* A minor point: there might already be enough
* committed DVAs in the entry to service our request,
* but we don't know which are completed and which are
* allocated but not yet written. In this case, should
* the IO for the new DVAs fail, we will be on the end
* of the IO chain and will also recieve an error, even
* though our request could have been serviced.
*
* This is an extremely rare case, as it requires the
* original block to be copied with a request for a
* larger number of DVAs, then copied again requesting
* the same (or already fulfilled) number of DVAs while
* the first request is active, and then that first
* request errors. In return, the logic required to
* catch and handle it is complex. For now, I'm just
* not going to bother with it.
*/
/*
* We always fill the bp here as we may have arrived
* after the in-flight write has passed READY, and so
* missed out.
*/
ddt_bp_fill(ddp, v, bp, txg);
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
ddt_exit(ddt);
return (zio);
}
/*
* There's not enough in the entry yet, so we need to look at
* the write in-flight and see how many DVAs it will have once
* it completes.
*
* The in-flight write has potentially had its copies request
* reduced (if we're filling out an existing entry), so we need
* to reach in and get the original write to find out what it is
* expecting.
*
* Note that the parent of the lead zio will always have the
* highest zp_copies of any zio in the chain, because ones that
* can be serviced without additional IO are always added to
* the back of the chain.
*/
zio_link_t *zl = NULL;
zio_t *pio =
zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl);
ASSERT(pio);
uint8_t parent_dvas = pio->io_prop.zp_copies;
if (parent_dvas >= need_dvas) {
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
ddt_exit(ddt);
return (zio);
}
/*
* Still not enough, so we will need to issue to get the
* shortfall.
*/
need_dvas -= parent_dvas;
}
/*
* We need to write. We will create a new write with the copies
* property adjusted to match the number of DVAs we need to need to
* grow the DDT entry by to satisfy the request.
*/
zio_prop_t czp = *zp;
czp.zp_copies = need_dvas;
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp,
zio_ddt_child_write_ready, NULL, zio_ddt_child_write_ready, NULL,
zio_ddt_child_write_done, dde, zio->io_priority, zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
dde->dde_io->dde_lead_zio[p] = cio;
/*
* We are the new lead zio, because our parent has the highest
* zp_copies that has been requested for this entry so far.
*/
ddt_alloc_entry_io(dde);
if (dde->dde_io->dde_lead_zio[p] == NULL) {
/*
* First time out, take a copy of the stable entry to revert
* to if there's an error (see zio_ddt_child_write_done())
*/
ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v);
} else {
/*
* Make the existing chain our child, because it cannot
* complete until we have.
*/
zio_add_child(cio, dde->dde_io->dde_lead_zio[p]);
} }
dde->dde_io->dde_lead_zio[p] = cio;
ddt_exit(ddt); ddt_exit(ddt);
@ -3603,8 +3838,7 @@ zio_ddt_free(zio_t *zio)
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
blkptr_t *bp = zio->io_bp; blkptr_t *bp = zio->io_bp;
ddt_t *ddt = ddt_select(spa, bp); ddt_t *ddt = ddt_select(spa, bp);
ddt_entry_t *dde; ddt_entry_t *dde = NULL;
ddt_phys_t *ddp;
ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_DEDUP(bp));
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
@ -3612,9 +3846,9 @@ zio_ddt_free(zio_t *zio)
ddt_enter(ddt); ddt_enter(ddt);
freedde = dde = ddt_lookup(ddt, bp); freedde = dde = ddt_lookup(ddt, bp);
if (dde) { if (dde) {
ddp = ddt_phys_select(ddt, dde, bp); ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
if (ddp) if (v != DDT_PHYS_NONE)
ddt_phys_decref(ddp); ddt_phys_decref(dde->dde_phys, v);
} }
ddt_exit(ddt); ddt_exit(ddt);