mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-12 19:20:28 +03:00
ddt: document the theory and the key data structures
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887
This commit is contained in:
parent
d961954688
commit
5720b00632
@ -21,6 +21,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2016 by Delphix. All rights reserved.
|
* Copyright (c) 2016 by Delphix. All rights reserved.
|
||||||
|
* Copyright (c) 2023, Klara Inc.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef _SYS_DDT_H
|
#ifndef _SYS_DDT_H
|
||||||
@ -39,10 +40,16 @@ extern "C" {
|
|||||||
struct abd;
|
struct abd;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* On-disk DDT formats, in the desired search order (newest version first).
|
* DDT on-disk storage object types. Each one corresponds to specific
|
||||||
|
* implementation, see ddt_ops_t. The value itself is not stored on disk.
|
||||||
|
*
|
||||||
|
* When searching for an entry, objects types will be searched in this order.
|
||||||
|
*
|
||||||
|
* Note that DDT_TYPES is used as the "no type" for new entries that have not
|
||||||
|
* yet been written to a storage object.
|
||||||
*/
|
*/
|
||||||
typedef enum {
|
typedef enum {
|
||||||
DDT_TYPE_ZAP = 0,
|
DDT_TYPE_ZAP = 0, /* ZAP storage object, ddt_zap */
|
||||||
DDT_TYPES
|
DDT_TYPES
|
||||||
} ddt_type_t;
|
} ddt_type_t;
|
||||||
|
|
||||||
@ -53,12 +60,18 @@ _Static_assert(DDT_TYPES <= UINT8_MAX,
|
|||||||
#define DDT_TYPE_DEFAULT (DDT_TYPE_ZAP)
|
#define DDT_TYPE_DEFAULT (DDT_TYPE_ZAP)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* DDT classes, in the desired search order (highest replication level first).
|
* DDT storage classes. Each class has a separate storage object for each type.
|
||||||
|
* The value itself is not stored on disk.
|
||||||
|
*
|
||||||
|
* When search for an entry, object classes will be searched in this order.
|
||||||
|
*
|
||||||
|
* Note that DDT_CLASSES is used as the "no class" for new entries that have not
|
||||||
|
* yet been written to a storage object.
|
||||||
*/
|
*/
|
||||||
typedef enum {
|
typedef enum {
|
||||||
DDT_CLASS_DITTO = 0,
|
DDT_CLASS_DITTO = 0, /* entry has ditto blocks (obsolete) */
|
||||||
DDT_CLASS_DUPLICATE,
|
DDT_CLASS_DUPLICATE, /* entry has multiple references */
|
||||||
DDT_CLASS_UNIQUE,
|
DDT_CLASS_UNIQUE, /* entry has a single reference */
|
||||||
DDT_CLASSES
|
DDT_CLASSES
|
||||||
} ddt_class_t;
|
} ddt_class_t;
|
||||||
|
|
||||||
@ -66,7 +79,9 @@ _Static_assert(DDT_CLASSES < UINT8_MAX,
|
|||||||
"ddt_class_t must fit in a uint8_t");
|
"ddt_class_t must fit in a uint8_t");
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* On-disk ddt entry: key (name) and physical storage (value).
|
* The "key" part of an on-disk entry. This is the unique "name" for a block,
|
||||||
|
* that is, that parts of the block pointer that will always be the same for
|
||||||
|
* the same data.
|
||||||
*/
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
zio_cksum_t ddk_cksum; /* 256-bit block checksum */
|
zio_cksum_t ddk_cksum; /* 256-bit block checksum */
|
||||||
@ -80,6 +95,10 @@ typedef struct {
|
|||||||
uint64_t ddk_prop;
|
uint64_t ddk_prop;
|
||||||
} ddt_key_t;
|
} ddt_key_t;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Macros for accessing parts of a ddt_key_t. These are similar to their BP_*
|
||||||
|
* counterparts.
|
||||||
|
*/
|
||||||
#define DDK_GET_LSIZE(ddk) \
|
#define DDK_GET_LSIZE(ddk) \
|
||||||
BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
|
BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
|
||||||
#define DDK_SET_LSIZE(ddk, x) \
|
#define DDK_SET_LSIZE(ddk, x) \
|
||||||
@ -96,6 +115,16 @@ typedef struct {
|
|||||||
#define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1)
|
#define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1)
|
||||||
#define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x)
|
#define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The "value" part for an on-disk entry. These are the "physical"
|
||||||
|
* characteristics of the stored block, such as its location on disk (DVAs),
|
||||||
|
* birth txg and ref count.
|
||||||
|
*
|
||||||
|
* Note that an entry has an array of four ddt_phys_t, one for each number of
|
||||||
|
* DVAs (copies= property) and another for additional "ditto" copies. Most
|
||||||
|
* users of ddt_phys_t will handle indexing into or counting the phys they
|
||||||
|
* want.
|
||||||
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
dva_t ddp_dva[SPA_DVAS_PER_BP];
|
dva_t ddp_dva[SPA_DVAS_PER_BP];
|
||||||
uint64_t ddp_refcnt;
|
uint64_t ddp_refcnt;
|
||||||
@ -103,6 +132,8 @@ typedef struct {
|
|||||||
} ddt_phys_t;
|
} ddt_phys_t;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
* Named indexes into the ddt_phys_t array in each entry.
|
||||||
|
*
|
||||||
* Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
|
* Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
|
||||||
* we maintain the ability to free existing dedup-ditto blocks.
|
* we maintain the ability to free existing dedup-ditto blocks.
|
||||||
*/
|
*/
|
||||||
@ -115,7 +146,8 @@ enum ddt_phys_type {
|
|||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In-core ddt entry
|
* A "live" entry, holding changes to an entry made this txg, and other data to
|
||||||
|
* support loading, updating and repairing the entry.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* State flags for dde_flags */
|
/* State flags for dde_flags */
|
||||||
@ -123,36 +155,56 @@ enum ddt_phys_type {
|
|||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
/* key must be first for ddt_key_compare */
|
/* key must be first for ddt_key_compare */
|
||||||
ddt_key_t dde_key;
|
ddt_key_t dde_key; /* ddt_tree key */
|
||||||
ddt_phys_t dde_phys[DDT_PHYS_TYPES];
|
ddt_phys_t dde_phys[DDT_PHYS_TYPES]; /* on-disk data */
|
||||||
|
|
||||||
|
/* in-flight update IOs */
|
||||||
zio_t *dde_lead_zio[DDT_PHYS_TYPES];
|
zio_t *dde_lead_zio[DDT_PHYS_TYPES];
|
||||||
|
|
||||||
|
/* copy of data after a repair read, to be rewritten */
|
||||||
struct abd *dde_repair_abd;
|
struct abd *dde_repair_abd;
|
||||||
|
|
||||||
|
/* storage type and class the entry was loaded from */
|
||||||
ddt_type_t dde_type;
|
ddt_type_t dde_type;
|
||||||
ddt_class_t dde_class;
|
ddt_class_t dde_class;
|
||||||
uint8_t dde_flags;
|
|
||||||
kcondvar_t dde_cv;
|
uint8_t dde_flags; /* load state flags */
|
||||||
avl_node_t dde_node;
|
kcondvar_t dde_cv; /* signaled when load completes */
|
||||||
|
|
||||||
|
avl_node_t dde_node; /* ddt_tree node */
|
||||||
} ddt_entry_t;
|
} ddt_entry_t;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In-core ddt
|
* In-core DDT object. This covers all entries and stats for a the whole pool
|
||||||
|
* for a given checksum type.
|
||||||
*/
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
kmutex_t ddt_lock;
|
kmutex_t ddt_lock; /* protects changes to all fields */
|
||||||
avl_tree_t ddt_tree;
|
|
||||||
avl_tree_t ddt_repair_tree;
|
avl_tree_t ddt_tree; /* "live" (changed) entries this txg */
|
||||||
enum zio_checksum ddt_checksum;
|
|
||||||
spa_t *ddt_spa;
|
avl_tree_t ddt_repair_tree; /* entries being repaired */
|
||||||
objset_t *ddt_os;
|
|
||||||
uint64_t ddt_stat_object;
|
enum zio_checksum ddt_checksum; /* checksum algorithm in use */
|
||||||
|
spa_t *ddt_spa; /* pool this ddt is on */
|
||||||
|
objset_t *ddt_os; /* ddt objset (always MOS) */
|
||||||
|
|
||||||
|
/* per-type/per-class entry store objects */
|
||||||
uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
|
uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
|
||||||
|
|
||||||
|
/* object ids for whole-ddt and per-type/per-class stats */
|
||||||
|
uint64_t ddt_stat_object;
|
||||||
|
ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
|
||||||
|
|
||||||
|
/* type/class stats by power-2-sized referenced blocks */
|
||||||
ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
|
ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
|
||||||
ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
|
ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
|
||||||
ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
|
|
||||||
} ddt_t;
|
} ddt_t;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In-core and on-disk bookmark for DDT walks
|
* In-core and on-disk bookmark for DDT walks. This is a cursor for ddt_walk(),
|
||||||
|
* and is stable across calls, even if the DDT is updated, the pool is
|
||||||
|
* restarted or loaded on another system, or OpenZFS is upgraded.
|
||||||
*/
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint64_t ddb_class;
|
uint64_t ddb_class;
|
||||||
|
@ -23,6 +23,7 @@
|
|||||||
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
|
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2022 by Pawel Jakub Dawidek
|
* Copyright (c) 2022 by Pawel Jakub Dawidek
|
||||||
|
* Copyright (c) 2023, Klara Inc.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <sys/zfs_context.h>
|
#include <sys/zfs_context.h>
|
||||||
@ -39,6 +40,101 @@
|
|||||||
#include <sys/dsl_scan.h>
|
#include <sys/dsl_scan.h>
|
||||||
#include <sys/abd.h>
|
#include <sys/abd.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* # DDT: Deduplication tables
|
||||||
|
*
|
||||||
|
* The dedup subsystem provides block-level deduplication. When enabled, blocks
|
||||||
|
* to be written will have the dedup (D) bit set, which causes them to be
|
||||||
|
* tracked in a "dedup table", or DDT. If a block has been seen before (exists
|
||||||
|
* in the DDT), instead of being written, it will instead be made to reference
|
||||||
|
* the existing on-disk data, and a refcount bumped in the DDT instead.
|
||||||
|
*
|
||||||
|
* ## Dedup tables and entries
|
||||||
|
*
|
||||||
|
* Conceptually, a DDT is a dictionary or map. Each entry has a "key"
|
||||||
|
* (ddt_key_t) made up a block's checksum and certian properties, and a "value"
|
||||||
|
* (one or more ddt_phys_t) containing valid DVAs for the block's data, birth
|
||||||
|
* time and refcount. Together these are enough to track references to a
|
||||||
|
* specific block, to build a valid block pointer to reference that block (for
|
||||||
|
* freeing, scrubbing, etc), and to fill a new block pointer with the missing
|
||||||
|
* pieces to make it seem like it was written.
|
||||||
|
*
|
||||||
|
* There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
|
||||||
|
* Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
|
||||||
|
* object data formats, each with their own implementations) and "classes"
|
||||||
|
* (ddt_class_t, instance of a storage type object, for entries with a specific
|
||||||
|
* characteristic). An entry (key) will only ever exist on one of these objects
|
||||||
|
* at any given time, but may be moved from one to another if their type or
|
||||||
|
* class changes.
|
||||||
|
*
|
||||||
|
* The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
|
||||||
|
* is to be written, before DVAs have been allocated, ddt_lookup() is called to
|
||||||
|
* see if the block has been seen before. If its not found, the write proceeds
|
||||||
|
* as normal, and after it succeeds, a new entry is created. If it is found, we
|
||||||
|
* fill the BP with the DVAs from the entry, increment the refcount and cause
|
||||||
|
* the write IO to return immediately.
|
||||||
|
*
|
||||||
|
* Each ddt_phys_t slot in the entry represents a separate dedup block for the
|
||||||
|
* same content/checksum. The slot is selected based on the zp_copies parameter
|
||||||
|
* the block is written with, that is, the number of DVAs in the block. The
|
||||||
|
* "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto"
|
||||||
|
* feature. These are no longer written, and will be freed if encountered on
|
||||||
|
* old pools.
|
||||||
|
*
|
||||||
|
* ## Lifetime of an entry
|
||||||
|
*
|
||||||
|
* A DDT can be enormous, and typically is not held in memory all at once.
|
||||||
|
* Instead, the changes to an entry are tracked in memory, and written down to
|
||||||
|
* disk at the end of each txg.
|
||||||
|
*
|
||||||
|
* A "live" in-memory entry (ddt_entry_t) is a node on the live tree
|
||||||
|
* (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is
|
||||||
|
* required for IO, ddt_lookup() is called. If an entry already exists on
|
||||||
|
* ddt_tree, it is returned. Otherwise, a new one is created, and the
|
||||||
|
* type/class objects for the DDT are searched for that key. If its found, its
|
||||||
|
* value is copied into the live entry. If not, an empty entry is created.
|
||||||
|
*
|
||||||
|
* The live entry will be modified during the txg, usually by modifying the
|
||||||
|
* refcount, but sometimes by adding or updating DVAs. At the end of the txg
|
||||||
|
* (during spa_sync()), type and class are recalculated for entry (see
|
||||||
|
* ddt_sync_entry()), and the entry is written to the appropriate storage
|
||||||
|
* object and (if necessary), removed from an old one. ddt_tree is cleared and
|
||||||
|
* the next txg can start.
|
||||||
|
*
|
||||||
|
* ## Repair IO
|
||||||
|
*
|
||||||
|
* If a read on a dedup block fails, but there are other copies of the block in
|
||||||
|
* the other ddt_phys_t slots, reads will be issued for those instead
|
||||||
|
* (zio_ddt_read_start()). If one of those succeeds, the read is returned to
|
||||||
|
* the caller, and a copy is stashed on the entry's dde_repair_abd.
|
||||||
|
*
|
||||||
|
* During the end-of-txg sync, any entries with a dde_repair_abd get a
|
||||||
|
* "rewrite" write issued for the original block pointer, with the data read
|
||||||
|
* from the alternate block. If the block is actually damaged, this will invoke
|
||||||
|
* the pool's "self-healing" mechanism, and repair the block.
|
||||||
|
*
|
||||||
|
* ## Scanning (scrub/resilver)
|
||||||
|
*
|
||||||
|
* If dedup is active, the scrub machinery will walk the dedup table first, and
|
||||||
|
* scrub all blocks with refcnt > 1 first. After that it will move on to the
|
||||||
|
* regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them.
|
||||||
|
* In this way, heavily deduplicated blocks are only scrubbed once. See the
|
||||||
|
* commentary on dsl_scan_ddt() for more details.
|
||||||
|
*
|
||||||
|
* Walking the DDT is done via ddt_walk(). The current position is stored in a
|
||||||
|
* ddt_bookmark_t, which represents a stable position in the storage object.
|
||||||
|
* This bookmark is stored by the scan machinery, and must reference the same
|
||||||
|
* position on the object even if the object changes, the pool is exported, or
|
||||||
|
* OpenZFS is upgraded.
|
||||||
|
*
|
||||||
|
* ## Interaction with block cloning
|
||||||
|
*
|
||||||
|
* If block cloning and dedup are both enabled on a pool, BRT will look for the
|
||||||
|
* dedup bit on an incoming block pointer. If set, it will call into the DDT
|
||||||
|
* (ddt_addref()) to add a reference to the block, instead of adding a
|
||||||
|
* reference to the BRT. See brt_pending_apply().
|
||||||
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* These are the only checksums valid for dedup. They must match the list
|
* These are the only checksums valid for dedup. They must match the list
|
||||||
* from dedup_table in zfs_prop.c
|
* from dedup_table in zfs_prop.c
|
||||||
|
Loading…
Reference in New Issue
Block a user