mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 18:40:43 +03:00
ddt: dedup log
Adds a log/journal to dedup. At the end of txg, instead of writing the entry directly to the ZAP, it is added to an in-memory tree and appended to an on-disk object. The on-disk object is only read at import, to reload the in-memory tree. Lookups first go to the log tree before going to the ZAP, so recently-used entries will remain close by in memory. This vastly reduces overhead from dedup IO, as it will not have to do so many read/update/write cycles on ZAP leaf nodes. A flushing facility is added at end of txg, to push logged entries out to the ZAP. There are actually two separate "logs" (in-memory tree and on-disk object), one active (receiving updated entries) and one flushing (writing out to disk). These are swapped (i.e. flushing begins) based on memory used by the in-memory log trees and time since we last flushed something. The flushing facility monitors the number of entries coming in and being flushed out, and calibrates itself to try to flush enough each txg to keep up with the ingest rate without competing too much with other IO. Multiple tunables are provided to control the flushing facility. All the histograms and stats are updated to accommodate the log as a separate entry store. zdb gains knowledge of how to count them and dump them. Documentation included! Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Co-authored-by: Allan Jude <allan@klarasystems.com> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15895
This commit is contained in:
committed by
Brian Behlendorf
parent
cbb9ef0a4c
commit
cd69ba3d49
+36
-3
@@ -43,7 +43,8 @@ struct abd;
|
||||
* DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
|
||||
*/
|
||||
#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */
|
||||
#define DDT_FLAG_MASK (DDT_FLAG_FLAT)
|
||||
#define DDT_FLAG_LOG (1 << 1) /* dedup log (journal) */
|
||||
#define DDT_FLAG_MASK (DDT_FLAG_FLAT|DDT_FLAG_LOG)
|
||||
|
||||
/*
|
||||
* DDT on-disk storage object types. Each one corresponds to specific
|
||||
@@ -209,6 +210,7 @@ typedef enum {
|
||||
/* State flags for dde_flags */
|
||||
#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */
|
||||
#define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */
|
||||
#define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */
|
||||
|
||||
/*
|
||||
* Additional data to support entry update or repair. This is fixed size
|
||||
@@ -254,6 +256,19 @@ typedef struct {
|
||||
ddt_univ_phys_t ddlwe_phys;
|
||||
} ddt_lightweight_entry_t;
|
||||
|
||||
/*
|
||||
* In-core DDT log. A separate struct to make it easier to switch between the
|
||||
* appending and flushing logs.
|
||||
*/
|
||||
typedef struct {
|
||||
avl_tree_t ddl_tree; /* logged entries */
|
||||
uint32_t ddl_flags; /* flags for this log */
|
||||
uint64_t ddl_object; /* log object id */
|
||||
uint64_t ddl_length; /* on-disk log size */
|
||||
uint64_t ddl_first_txg; /* txg log became active */
|
||||
ddt_key_t ddl_checkpoint; /* last checkpoint */
|
||||
} ddt_log_t;
|
||||
|
||||
/*
|
||||
* In-core DDT object. This covers all entries and stats for the whole pool
|
||||
* for a given checksum type.
|
||||
@@ -262,8 +277,22 @@ typedef struct {
|
||||
kmutex_t ddt_lock; /* protects changes to all fields */
|
||||
|
||||
avl_tree_t ddt_tree; /* "live" (changed) entries this txg */
|
||||
avl_tree_t ddt_log_tree; /* logged entries */
|
||||
|
||||
avl_tree_t ddt_repair_tree; /* entries being repaired */
|
||||
avl_tree_t ddt_repair_tree; /* entries being repaired */
|
||||
|
||||
ddt_log_t ddt_log[2]; /* active/flushing logs */
|
||||
ddt_log_t *ddt_log_active; /* pointers into ddt_log */
|
||||
ddt_log_t *ddt_log_flushing; /* swapped when flush starts */
|
||||
|
||||
hrtime_t ddt_flush_start; /* log flush start this txg */
|
||||
uint32_t ddt_flush_pass; /* log flush pass this txg */
|
||||
|
||||
int32_t ddt_flush_count; /* entries flushed this txg */
|
||||
int32_t ddt_flush_min; /* min rem entries to flush */
|
||||
int32_t ddt_log_ingest_rate; /* rolling log ingest rate */
|
||||
int32_t ddt_log_flush_rate; /* rolling log flush rate */
|
||||
int32_t ddt_log_flush_time_rate; /* avg time spent flushing */
|
||||
|
||||
enum zio_checksum ddt_checksum; /* checksum algorithm in use */
|
||||
spa_t *ddt_spa; /* pool this ddt is on */
|
||||
@@ -276,13 +305,17 @@ typedef struct {
|
||||
/* per-type/per-class entry store objects */
|
||||
uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
|
||||
|
||||
/* object ids for whole-ddt and per-type/per-class stats */
|
||||
/* object ids for stored, logged and per-type/per-class stats */
|
||||
uint64_t ddt_stat_object;
|
||||
ddt_object_t ddt_log_stats;
|
||||
ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
|
||||
|
||||
/* type/class stats by power-2-sized referenced blocks */
|
||||
ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
|
||||
ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
|
||||
|
||||
/* log stats by power-2-sized referenced blocks */
|
||||
ddt_histogram_t ddt_log_histogram;
|
||||
} ddt_t;
|
||||
|
||||
/*
|
||||
|
||||
+130
-1
@@ -28,6 +28,7 @@
|
||||
#define _SYS_DDT_IMPL_H
|
||||
|
||||
#include <sys/ddt.h>
|
||||
#include <sys/bitops.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -50,6 +51,106 @@ extern "C" {
|
||||
memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
|
||||
} while (0)
|
||||
|
||||
#define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do { \
|
||||
memset((ddlwe), 0, sizeof (*ddlwe)); \
|
||||
(ddlwe)->ddlwe_key = (ddle)->ddle_key; \
|
||||
(ddlwe)->ddlwe_type = (ddle)->ddle_type; \
|
||||
(ddlwe)->ddlwe_class = (ddle)->ddle_class; \
|
||||
memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* An entry on the log tree. These are "frozen", and a record of what's in
|
||||
* the on-disk log. They can't be used in place, but can be "loaded" back into
|
||||
* the live tree.
|
||||
*/
|
||||
typedef struct {
|
||||
ddt_key_t ddle_key; /* ddt_log_tree key */
|
||||
avl_node_t ddle_node; /* ddt_log_tree node */
|
||||
|
||||
ddt_type_t ddle_type; /* storage type */
|
||||
ddt_class_t ddle_class; /* storage class */
|
||||
|
||||
/* extra allocation for flat/trad phys */
|
||||
ddt_univ_phys_t ddle_phys[];
|
||||
} ddt_log_entry_t;
|
||||
|
||||
/* On-disk log record types. */
|
||||
typedef enum {
|
||||
DLR_INVALID = 0, /* end of block marker */
|
||||
DLR_ENTRY = 1, /* an entry to add or replace in the log tree */
|
||||
} ddt_log_record_type_t;
|
||||
|
||||
/* On-disk log record header. */
|
||||
typedef struct {
|
||||
/*
|
||||
* dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to
|
||||
* access it.
|
||||
*
|
||||
* bits 0-7: record type (ddt_log_record_type_t)
|
||||
* bits 8-23: length of record header+payload
|
||||
* bits 24-47: reserved, all zero
|
||||
* bits 48-55: if type==DLR_ENTRY, storage type (ddt_type)
|
||||
* otherwise all zero
|
||||
* bits 56-63: if type==DLR_ENTRY, storage class (ddt_class)
|
||||
* otherwise all zero
|
||||
*/
|
||||
uint64_t dlr_info;
|
||||
uint8_t dlr_payload[];
|
||||
} ddt_log_record_t;
|
||||
|
||||
#define DLR_GET_TYPE(dlr) BF64_GET((dlr)->dlr_info, 0, 8)
|
||||
#define DLR_SET_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 0, 8, v)
|
||||
#define DLR_GET_RECLEN(dlr) BF64_GET((dlr)->dlr_info, 8, 16)
|
||||
#define DLR_SET_RECLEN(dlr, v) BF64_SET((dlr)->dlr_info, 8, 16, v)
|
||||
#define DLR_GET_ENTRY_TYPE(dlr) BF64_GET((dlr)->dlr_info, 48, 8)
|
||||
#define DLR_SET_ENTRY_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 48, 8, v)
|
||||
#define DLR_GET_ENTRY_CLASS(dlr) BF64_GET((dlr)->dlr_info, 56, 8)
|
||||
#define DLR_SET_ENTRY_CLASS(dlr, v) BF64_SET((dlr)->dlr_info, 56, 8, v)
|
||||
|
||||
/* Payload for DLR_ENTRY. */
|
||||
typedef struct {
|
||||
ddt_key_t dlre_key;
|
||||
ddt_univ_phys_t dlre_phys[];
|
||||
} ddt_log_record_entry_t;
|
||||
|
||||
/* Log flags (ddl_flags, dlh_flags) */
|
||||
#define DDL_FLAG_FLUSHING (1 << 0) /* this log is being flushed */
|
||||
#define DDL_FLAG_CHECKPOINT (1 << 1) /* header has a checkpoint */
|
||||
|
||||
/* On-disk log header, stored in the bonus buffer. */
|
||||
typedef struct {
|
||||
/*
|
||||
* dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to
|
||||
* access it.
|
||||
*
|
||||
* bits 0-7: log version
|
||||
* bits 8-15: log flags
|
||||
* bits 16-63: reserved, all zero
|
||||
*/
|
||||
uint64_t dlh_info;
|
||||
|
||||
uint64_t dlh_length; /* log size in bytes */
|
||||
uint64_t dlh_first_txg; /* txg this log went active */
|
||||
ddt_key_t dlh_checkpoint; /* last checkpoint */
|
||||
} ddt_log_header_t;
|
||||
|
||||
#define DLH_GET_VERSION(dlh) BF64_GET((dlh)->dlh_info, 0, 8)
|
||||
#define DLH_SET_VERSION(dlh, v) BF64_SET((dlh)->dlh_info, 0, 8, v)
|
||||
#define DLH_GET_FLAGS(dlh) BF64_GET((dlh)->dlh_info, 8, 8)
|
||||
#define DLH_SET_FLAGS(dlh, v) BF64_SET((dlh)->dlh_info, 8, 8, v)
|
||||
|
||||
/* DDT log update state */
|
||||
typedef struct {
|
||||
dmu_tx_t *dlu_tx; /* tx the update is being applied to */
|
||||
dnode_t *dlu_dn; /* log object dnode */
|
||||
dmu_buf_t **dlu_dbp; /* array of block buffer pointers */
|
||||
int dlu_ndbp; /* number of block buffer pointers */
|
||||
uint16_t dlu_reclen; /* cached length of record */
|
||||
uint64_t dlu_block; /* block for next entry */
|
||||
uint64_t dlu_offset; /* offset for next entry */
|
||||
} ddt_log_update_t;
|
||||
|
||||
/*
|
||||
* Ops vector to access a specific DDT object type.
|
||||
*/
|
||||
@@ -77,6 +178,33 @@ typedef struct {
|
||||
|
||||
extern const ddt_ops_t ddt_zap_ops;
|
||||
|
||||
/* Dedup log API */
|
||||
extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx,
|
||||
ddt_log_update_t *dlu);
|
||||
extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde,
|
||||
ddt_log_update_t *dlu);
|
||||
extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);
|
||||
|
||||
extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
|
||||
ddt_lightweight_entry_t *ddlwe);
|
||||
extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl,
|
||||
const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe);
|
||||
|
||||
extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
|
||||
dmu_tx_t *tx);
|
||||
extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx);
|
||||
|
||||
extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx);
|
||||
|
||||
extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx);
|
||||
|
||||
extern int ddt_log_load(ddt_t *ddt);
|
||||
extern void ddt_log_alloc(ddt_t *ddt);
|
||||
extern void ddt_log_free(ddt_t *ddt);
|
||||
|
||||
extern void ddt_log_init(void);
|
||||
extern void ddt_log_fini(void);
|
||||
|
||||
/*
|
||||
* These are only exposed so that zdb can access them. Try not to use them
|
||||
* outside of the DDT implementation proper, and if you do, consider moving
|
||||
@@ -89,7 +217,8 @@ extern const ddt_ops_t ddt_zap_ops;
|
||||
*/
|
||||
#define DDT_NAMELEN 32
|
||||
|
||||
extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde);
|
||||
extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt,
|
||||
const ddt_univ_phys_t *ddp);
|
||||
|
||||
extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
|
||||
|
||||
|
||||
@@ -375,6 +375,7 @@ typedef struct dmu_buf {
|
||||
#define DMU_POOL_L2CACHE "l2cache"
|
||||
#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
|
||||
#define DMU_POOL_DDT "DDT-%s-%s-%s"
|
||||
#define DMU_POOL_DDT_LOG "DDT-log-%s-%u"
|
||||
#define DMU_POOL_DDT_STATS "DDT-statistics"
|
||||
#define DMU_POOL_DDT_DIR "DDT-%s"
|
||||
#define DMU_POOL_CREATION_VERSION "creation_version"
|
||||
|
||||
Reference in New Issue
Block a user