mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
ddt: add "flat phys" feature
Traditional dedup keeps a separate ddt_phys_t "type" for each possible count of DVAs (that is, each value of the copies= parameter). Each of these is tracked independently of the others, and has its own set of DVAs. This leads to an (admittedly rare) situation where you can create as many as six copies of the data, by changing the copies= parameter between copying. This is both a waste of storage on disk and a waste of space in the stored DDT entries, since there never needs to be more than three DVAs to handle all possible values of copies=. This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the first ddt_phys_t is used. Each time a block is written with the dedup bit set, this single phys is checked to see if it has enough DVAs to fulfill the request. If it does, the block is filled with the saved DVAs as normal. If not, an adjusted write is issued to create as many extra copies as are needed to fulfill the request, which are then saved into the entry too. Because a single phys is no longer all-or-nothing, but can be transitioning from fewer to more DVAs, the write path now has to keep a copy of the previous "known good" DVA set so we can revert to it in case an error occurs. zio_ddt_write() has been restructured and heavily commented to make it much easier to see what's happening. Backwards compatibility is maintained simply by allocating four ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys selection macros to check the flag. In the old arrangement, each number of copies gets a whole phys, so it will always have either zero or all necessary DVAs filled, with no in-between, so the old behaviour naturally falls out of the new code. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Co-authored-by: Don Brady <don.brady@klarasystems.com> Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15893
This commit is contained in:
committed by
Brian Behlendorf
parent
0ba5f503c5
commit
f4aeb23f52
+96
-26
@@ -42,8 +42,8 @@ struct abd;
|
||||
/*
|
||||
* DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
|
||||
*/
|
||||
/* No flags yet. */
|
||||
#define DDT_FLAG_MASK (0)
|
||||
#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */
|
||||
/* Mask of every DDT feature flag bit currently defined (just DDT_FLAG_FLAT). */
#define DDT_FLAG_MASK (DDT_FLAG_FLAT)
|
||||
|
||||
/*
|
||||
* DDT on-disk storage object types. Each one corresponds to specific
|
||||
@@ -126,21 +126,80 @@ typedef struct {
|
||||
* characteristics of the stored block, such as its location on disk (DVAs),
|
||||
* birth txg and ref count.
|
||||
*
|
||||
* Note that an entry has an array of four ddt_phys_t, one for each number of
|
||||
* DVAs (copies= property) and another for additional "ditto" copies. Most
|
||||
* users of ddt_phys_t will handle indexing into or counting the phys they
|
||||
* want.
|
||||
* The "traditional" entry has an array of four, one for each number of DVAs
|
||||
* (copies= property) and another for additional "ditto" copies. Users of the
|
||||
* traditional struct will specify the variant (index) of the one they want.
|
||||
*
|
||||
* The newer "flat" entry has only a single form that is specified using the
|
||||
* DDT_PHYS_FLAT variant.
|
||||
*
|
||||
* Since the value size varies, use one of the size macros when interfacing
|
||||
* with the ddt zap.
|
||||
*/
|
||||
typedef struct {
|
||||
dva_t ddp_dva[SPA_DVAS_PER_BP];
|
||||
uint64_t ddp_refcnt;
|
||||
uint64_t ddp_phys_birth;
|
||||
} ddt_phys_t;
|
||||
|
||||
#define DDT_PHYS_MAX (4)
|
||||
#define DDT_NPHYS(ddt) ((ddt) ? DDT_PHYS_MAX : DDT_PHYS_MAX)
|
||||
/*
 * True when a DDT is supplied and phys index p is 0 (the ditto slot).
 * Both arguments are parenthesized so that an expression argument such as
 * `a ? b : c` is compared against 0 as a whole.
 */
#define DDT_PHYS_IS_DITTO(ddt, p) ((ddt) && (p) == 0)
|
||||
#define DDT_PHYS_FOR_COPIES(ddt, p) ((ddt) ? (p) : (p))
|
||||
#define DDT_PHYS_MAX (4)
|
||||
|
||||
/*
|
||||
* Note - this can be used in a flexible array and allocated for
|
||||
* a specific size (ddp_trad or ddp_flat). So be careful not to
|
||||
* copy using "=" assignment but instead use ddt_phys_copy().
|
||||
*/
|
||||
typedef union {
|
||||
/*
|
||||
* Traditional physical payload value for DDT zap (256 bytes)
|
||||
*/
|
||||
struct {
|
||||
dva_t ddp_dva[SPA_DVAS_PER_BP];
|
||||
uint64_t ddp_refcnt;
|
||||
uint64_t ddp_phys_birth;
|
||||
} ddp_trad[DDT_PHYS_MAX];
|
||||
|
||||
/*
|
||||
* Flat physical payload value for DDT zap (72 bytes)
|
||||
*/
|
||||
struct {
|
||||
dva_t ddp_dva[SPA_DVAS_PER_BP];
|
||||
uint64_t ddp_refcnt;
|
||||
uint64_t ddp_phys_birth; /* txg based from BP */
|
||||
uint64_t ddp_class_start; /* in realtime seconds */
|
||||
} ddp_flat;
|
||||
} ddt_univ_phys_t;
|
||||
|
||||
/*
|
||||
* This enum denotes which variant of a ddt_univ_phys_t to target. For
|
||||
* a traditional DDT entry, it represents the indexes into the ddp_trad
|
||||
* array. Any consumer of a ddt_univ_phys_t needs to know which variant
|
||||
* is being targeted.
|
||||
*
|
||||
* Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
|
||||
* we maintain the ability to free existing dedup-ditto blocks.
|
||||
*/
|
||||
|
||||
typedef enum {
|
||||
DDT_PHYS_DITTO = 0,
|
||||
DDT_PHYS_SINGLE = 1,
|
||||
DDT_PHYS_DOUBLE = 2,
|
||||
DDT_PHYS_TRIPLE = 3,
|
||||
DDT_PHYS_FLAT = 4,
|
||||
DDT_PHYS_NONE = 5
|
||||
} ddt_phys_variant_t;
|
||||
|
||||
#define DDT_PHYS_VARIANT(ddt, p) \
|
||||
(ASSERT((p) < DDT_PHYS_NONE), \
|
||||
((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p)))
|
||||
|
||||
/* Byte size of the traditional payload (the whole ddp_trad array). */
#define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad)
|
||||
/* Byte size of the flat payload (the single ddp_flat struct). */
#define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat)
|
||||
|
||||
#define _DDT_PHYS_SWITCH(ddt, flat, trad) \
|
||||
(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))
|
||||
|
||||
#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \
|
||||
DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)
|
||||
|
||||
/* Number of phys per entry: 1 when DDT_FLAG_FLAT is set, else DDT_PHYS_MAX. */
#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)
|
||||
/* Maps p to a phys index: 0 when DDT_FLAG_FLAT is set, otherwise p unchanged. */
#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p)
|
||||
/*
 * True only for a traditional (non-flat) DDT when phys index p is 0, the
 * DDT_PHYS_DITTO slot; always 0 when DDT_FLAG_FLAT is set. The argument p
 * is parenthesized so an expression argument (e.g. `a ? b : c`) is compared
 * against 0 as a whole rather than being split by operator precedence.
 */
#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, ((p) == 0))
|
||||
|
||||
/*
|
||||
* A "live" entry, holding changes to an entry made this txg, and other data to
|
||||
@@ -159,6 +218,9 @@ typedef struct {
|
||||
/* copy of data after a repair read, to be rewritten */
|
||||
abd_t *dde_repair_abd;
|
||||
|
||||
/* original phys contents before update, for error handling */
|
||||
ddt_univ_phys_t dde_orig_phys;
|
||||
|
||||
/* in-flight update IOs */
|
||||
zio_t *dde_lead_zio[DDT_PHYS_MAX];
|
||||
} ddt_entry_io_t;
|
||||
@@ -178,7 +240,7 @@ typedef struct {
|
||||
|
||||
ddt_entry_io_t *dde_io; /* IO support, when required */
|
||||
|
||||
ddt_phys_t dde_phys[]; /* physical data */
|
||||
ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */
|
||||
} ddt_entry_t;
|
||||
|
||||
/*
|
||||
@@ -189,8 +251,7 @@ typedef struct {
|
||||
ddt_key_t ddlwe_key;
|
||||
ddt_type_t ddlwe_type;
|
||||
ddt_class_t ddlwe_class;
|
||||
uint8_t ddlwe_nphys;
|
||||
ddt_phys_t ddlwe_phys[DDT_PHYS_MAX];
|
||||
ddt_univ_phys_t ddlwe_phys;
|
||||
} ddt_lightweight_entry_t;
|
||||
|
||||
/*
|
||||
@@ -236,17 +297,26 @@ typedef struct {
|
||||
uint64_t ddb_cursor;
|
||||
} ddt_bookmark_t;
|
||||
|
||||
extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
|
||||
uint64_t txg);
|
||||
extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
|
||||
blkptr_t *bp, uint64_t txg);
|
||||
extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
|
||||
const ddt_phys_t *ddp, blkptr_t *bp);
|
||||
const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp);
|
||||
|
||||
extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
|
||||
extern void ddt_phys_clear(ddt_phys_t *ddp);
|
||||
extern void ddt_phys_addref(ddt_phys_t *ddp);
|
||||
extern void ddt_phys_decref(ddt_phys_t *ddp);
|
||||
extern ddt_phys_t *ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde,
|
||||
extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
|
||||
const blkptr_t *bp);
|
||||
extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
|
||||
ddt_phys_variant_t v);
|
||||
extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
|
||||
extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
|
||||
extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
|
||||
extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp,
|
||||
ddt_phys_variant_t v);
|
||||
extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
|
||||
const ddt_entry_t *dde, const blkptr_t *bp);
|
||||
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
|
||||
ddt_phys_variant_t v);
|
||||
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
|
||||
boolean_t encrypted);
|
||||
|
||||
extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
|
||||
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
|
||||
|
||||
+9
-11
@@ -42,14 +42,12 @@ extern "C" {
|
||||
#define DDT_DIR_FLAGS "flags"
|
||||
|
||||
/* Fill a lightweight entry from a live entry. */
|
||||
#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \
|
||||
memset((ddlwe), 0, sizeof (*ddlwe)); \
|
||||
(ddlwe)->ddlwe_key = (dde)->dde_key; \
|
||||
(ddlwe)->ddlwe_type = (dde)->dde_type; \
|
||||
(ddlwe)->ddlwe_class = (dde)->dde_class; \
|
||||
(ddlwe)->ddlwe_nphys = DDT_NPHYS(ddt); \
|
||||
for (int p = 0; p < (ddlwe)->ddlwe_nphys; p++) \
|
||||
(ddlwe)->ddlwe_phys[p] = (dde)->dde_phys[p]; \
|
||||
#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \
|
||||
memset((ddlwe), 0, sizeof (*ddlwe)); \
|
||||
(ddlwe)->ddlwe_key = (dde)->dde_key; \
|
||||
(ddlwe)->ddlwe_type = (dde)->dde_type; \
|
||||
(ddlwe)->ddlwe_class = (dde)->dde_class; \
|
||||
memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
@@ -61,19 +59,19 @@ typedef struct {
|
||||
boolean_t prehash);
|
||||
int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
|
||||
int (*ddt_op_lookup)(objset_t *os, uint64_t object,
|
||||
const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
|
||||
const ddt_key_t *ddk, void *phys, size_t psize);
|
||||
int (*ddt_op_contains)(objset_t *os, uint64_t object,
|
||||
const ddt_key_t *ddk);
|
||||
void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
|
||||
const ddt_key_t *ddk);
|
||||
void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
|
||||
int (*ddt_op_update)(objset_t *os, uint64_t object,
|
||||
const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize,
|
||||
const ddt_key_t *ddk, const void *phys, size_t psize,
|
||||
dmu_tx_t *tx);
|
||||
int (*ddt_op_remove)(objset_t *os, uint64_t object,
|
||||
const ddt_key_t *ddk, dmu_tx_t *tx);
|
||||
int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
|
||||
ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
|
||||
ddt_key_t *ddk, void *phys, size_t psize);
|
||||
int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
|
||||
} ddt_ops_t;
|
||||
|
||||
|
||||
@@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
|
||||
boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
|
||||
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
|
||||
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
|
||||
ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
|
||||
ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
|
||||
void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
|
||||
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
|
||||
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
|
||||
|
||||
+6
-1
@@ -572,7 +572,7 @@ typedef struct blkptr {
|
||||
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
|
||||
BP_GET_PSIZE(bp))
|
||||
|
||||
#define BP_ZERO(bp) \
|
||||
#define BP_ZERO_DVAS(bp) \
|
||||
{ \
|
||||
(bp)->blk_dva[0].dva_word[0] = 0; \
|
||||
(bp)->blk_dva[0].dva_word[1] = 0; \
|
||||
@@ -580,6 +580,11 @@ typedef struct blkptr {
|
||||
(bp)->blk_dva[1].dva_word[1] = 0; \
|
||||
(bp)->blk_dva[2].dva_word[0] = 0; \
|
||||
(bp)->blk_dva[2].dva_word[1] = 0; \
|
||||
}
|
||||
|
||||
#define BP_ZERO(bp) \
|
||||
{ \
|
||||
BP_ZERO_DVAS(bp); \
|
||||
(bp)->blk_prop = 0; \
|
||||
(bp)->blk_pad[0] = 0; \
|
||||
(bp)->blk_pad[1] = 0; \
|
||||
|
||||
Reference in New Issue
Block a user