mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 10:37:35 +03:00
Persistent L2ARC
This commit makes the L2ARC persistent across reboots. We implement a lightweight persistent L2ARC metadata structure that allows L2ARC contents to be recovered after a reboot. This significantly eases the impact a reboot has on read performance on systems with large caches.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Saso Kiselkov <skiselkov@gmail.com>
Co-authored-by: Jorgen Lundman <lundman@lundman.net>
Co-authored-by: George Amanakis <gamanakis@gmail.com>
Ported-by: Yuxuan Shui <yshuiv7@gmail.com>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes #925
Closes #1823
Closes #2672
Closes #3744
Closes #9582
This commit is contained in:
+287
-18
@@ -20,9 +20,10 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013, Delphix. All rights reserved.
|
||||
* Copyright (c) 2013, Saso Kiselkov. All rights reserved.
|
||||
* Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2020, George Amanakis. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_ARC_IMPL_H
|
||||
@@ -176,6 +177,218 @@ typedef struct l1arc_buf_hdr {
|
||||
abd_t *b_pabd;
|
||||
} l1arc_buf_hdr_t;
|
||||
|
||||
/*
 * Flag bits stored in the dh_flags field of the persistent L2ARC device
 * header.
 */
typedef enum l2arc_dev_hdr_flags_t {
	L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0)	/* mirror of l2ad_first */
} l2arc_dev_hdr_flags_t;
|
||||
|
||||
/*
|
||||
* Pointer used in persistent L2ARC (for pointing to log blocks).
|
||||
*/
|
||||
typedef struct l2arc_log_blkptr {
|
||||
/*
|
||||
* Offset of log block within the device, in bytes
|
||||
*/
|
||||
uint64_t lbp_daddr;
|
||||
/*
|
||||
* Aligned payload size (in bytes) of the log block
|
||||
*/
|
||||
uint64_t lbp_payload_asize;
|
||||
/*
|
||||
* Offset in bytes of the first buffer in the payload
|
||||
*/
|
||||
uint64_t lbp_payload_start;
|
||||
/*
|
||||
* lbp_prop has the following format:
|
||||
* * logical size (in bytes)
|
||||
* * physical (compressed) size (in bytes)
|
||||
* * compression algorithm (we always LZ4-compress l2arc logs)
|
||||
* * checksum algorithm (used for lbp_cksum)
|
||||
*/
|
||||
uint64_t lbp_prop;
|
||||
zio_cksum_t lbp_cksum; /* checksum of log */
|
||||
} l2arc_log_blkptr_t;
|
||||
|
||||
/*
|
||||
* The persistent L2ARC device header.
|
||||
* Byte order of magic determines whether 64-bit bswap of fields is necessary.
|
||||
*/
|
||||
typedef struct l2arc_dev_hdr_phys {
|
||||
uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */
|
||||
uint64_t dh_version; /* Persistent L2ARC version */
|
||||
|
||||
/*
|
||||
* Global L2ARC device state and metadata.
|
||||
*/
|
||||
uint64_t dh_spa_guid;
|
||||
uint64_t dh_vdev_guid;
|
||||
uint64_t dh_log_blk_ent; /* entries per log blk */
|
||||
uint64_t dh_evict; /* evicted offset in bytes */
|
||||
uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */
|
||||
/*
|
||||
* Used in zdb.c for determining if a log block is valid, in the same
|
||||
* way that l2arc_rebuild() does.
|
||||
*/
|
||||
uint64_t dh_start;
|
||||
uint64_t dh_end;
|
||||
|
||||
/*
|
||||
* Start of log block chain. [0] -> newest log, [1] -> one older (used
|
||||
* for initiating prefetch).
|
||||
*/
|
||||
l2arc_log_blkptr_t dh_start_lbps[2];
|
||||
const uint64_t dh_pad[34]; /* pad to 512 bytes */
|
||||
zio_eck_t dh_tail;
|
||||
} l2arc_dev_hdr_phys_t;
|
||||
CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
|
||||
|
||||
/*
|
||||
* A single ARC buffer header entry in a l2arc_log_blk_phys_t.
|
||||
*/
|
||||
typedef struct l2arc_log_ent_phys {
|
||||
dva_t le_dva; /* dva of buffer */
|
||||
uint64_t le_birth; /* birth txg of buffer */
|
||||
/*
|
||||
* le_prop has the following format:
|
||||
* * logical size (in bytes)
|
||||
* * physical (compressed) size (in bytes)
|
||||
* * compression algorithm
|
||||
* * object type (used to restore arc_buf_contents_t)
|
||||
* * protected status (used for encryption)
|
||||
* * prefetch status (used in l2arc_read_done())
|
||||
*/
|
||||
uint64_t le_prop;
|
||||
uint64_t le_daddr; /* buf location on l2dev */
|
||||
/*
|
||||
* We pad the size of each entry to a power of 2 so that the size of
|
||||
* l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
|
||||
* because of the L2ARC_SET_*SIZE macros.
|
||||
*/
|
||||
const uint64_t le_pad[3]; /* pad to 64 bytes */
|
||||
} l2arc_log_ent_phys_t;
|
||||
|
||||
#define L2ARC_LOG_BLK_MAX_ENTRIES (1022)
|
||||
|
||||
/*
|
||||
* A log block of up to 1022 ARC buffer log entries, chained into the
|
||||
* persistent L2ARC metadata linked list. Byte order of magic determines
|
||||
* whether 64-bit bswap of fields is necessary.
|
||||
*/
|
||||
typedef struct l2arc_log_blk_phys {
|
||||
uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */
|
||||
/*
|
||||
* There are 2 chains (headed by dh_start_lbps[2]), and this field
|
||||
* points back to the previous block in this chain. We alternate
|
||||
* which chain we append to, so they are time-wise and offset-wise
|
||||
* interleaved, but that is an optimization rather than for
|
||||
* correctness.
|
||||
*/
|
||||
l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */
|
||||
/*
|
||||
* Pad header section to 128 bytes
|
||||
*/
|
||||
uint64_t lb_pad[7];
|
||||
/* Payload */
|
||||
l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
|
||||
} l2arc_log_blk_phys_t; /* 64K total */
|
||||
|
||||
/*
|
||||
* The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with
|
||||
* SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
|
||||
*/
|
||||
CTASSERT_GLOBAL(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
|
||||
1ULL << SPA_MINBLOCKSHIFT));
|
||||
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE);
|
||||
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE);
|
||||
|
||||
/*
|
||||
* These structures hold in-flight abd buffers for log blocks as they're being
|
||||
* written to the L2ARC device.
|
||||
*/
|
||||
typedef struct l2arc_lb_abd_buf {
|
||||
abd_t *abd;
|
||||
list_node_t node;
|
||||
} l2arc_lb_abd_buf_t;
|
||||
|
||||
/*
|
||||
* These structures hold pointers to log blocks present on the L2ARC device.
|
||||
*/
|
||||
typedef struct l2arc_lb_ptr_buf {
|
||||
l2arc_log_blkptr_t *lb_ptr;
|
||||
list_node_t node;
|
||||
} l2arc_lb_ptr_buf_t;
|
||||
|
||||
/*
 * Macros for getting and setting the fields packed into le_prop and
 * lbp_prop. Reading the numeric arguments below as (starting bit, width),
 * the 64-bit property word is laid out as:
 *
 *	bit  0	logical size		(SPA_LSIZEBITS wide)
 *	bit 16	physical size		(SPA_PSIZEBITS wide)
 *	bit 32	compression algorithm	(SPA_COMPRESSBITS wide)
 *	bit 39	prefetch status		(1 bit)
 *	bit 40	checksum algorithm	(8 bits)
 *	bit 48	object type		(8 bits)
 *	bit 56	protected status	(1 bit)
 *
 * NOTE(review): the table assumes the BF64_* helpers take
 * (word, low bit, length, ...) -- confirm against their definition.
 */
#define	L2BLK_GET_LSIZE(field)	\
	BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	L2BLK_SET_LSIZE(field, x)	\
	BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define	L2BLK_GET_PSIZE(field)	\
	BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	L2BLK_SET_PSIZE(field, x)	\
	BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define	L2BLK_GET_COMPRESS(field)	\
	BF64_GET((field), 32, SPA_COMPRESSBITS)
#define	L2BLK_SET_COMPRESS(field, x)	\
	BF64_SET((field), 32, SPA_COMPRESSBITS, x)
#define	L2BLK_GET_PREFETCH(field)	BF64_GET((field), 39, 1)
#define	L2BLK_SET_PREFETCH(field, x)	BF64_SET((field), 39, 1, x)
#define	L2BLK_GET_CHECKSUM(field)	BF64_GET((field), 40, 8)
#define	L2BLK_SET_CHECKSUM(field, x)	BF64_SET((field), 40, 8, x)
#define	L2BLK_GET_TYPE(field)		BF64_GET((field), 48, 8)
#define	L2BLK_SET_TYPE(field, x)	BF64_SET((field), 48, 8, x)
#define	L2BLK_GET_PROTECTED(field)	BF64_GET((field), 56, 1)
#define	L2BLK_SET_PROTECTED(field, x)	BF64_SET((field), 56, 1, x)
|
||||
|
||||
/*
 * Exchange the values of two pointer lvalues x and y.
 *
 * Both arguments are expanded twice, so they must be free of side effects.
 * The arguments are parenthesized and the temporary carries a name that is
 * unlikely to clash with a caller's variable, so that an invocation such
 * as PTR_SWAP(tmp, other) still swaps correctly.
 */
#define	PTR_SWAP(x, y)				\
	do {					\
		void *ptr_swap_tmp_ = (x);	\
		(x) = (y);			\
		(y) = ptr_swap_tmp_;		\
		_NOTE(CONSTCOND)		\
	} while (0)
|
||||
|
||||
/*
 * Magic numbers identifying the persistent L2ARC device header (dh_magic)
 * and log blocks (lb_magic). Each is eight ASCII characters, most
 * significant byte first.
 */
#define	L2ARC_DEV_HDR_MAGIC	0x5a46534341434845LLU	/* ASCII: "ZFSCACHE" */
#define	L2ARC_LOG_BLK_MAGIC	0x4c4f47424c4b4844LLU	/* ASCII: "LOGBLKHD" */
|
||||
|
||||
/*
|
||||
* L2ARC Internals
|
||||
*/
|
||||
typedef struct l2arc_dev {
|
||||
vdev_t *l2ad_vdev; /* vdev */
|
||||
spa_t *l2ad_spa; /* spa */
|
||||
uint64_t l2ad_hand; /* next write location */
|
||||
uint64_t l2ad_start; /* first addr on device */
|
||||
uint64_t l2ad_end; /* last addr on device */
|
||||
boolean_t l2ad_first; /* first sweep through */
|
||||
boolean_t l2ad_writing; /* currently writing */
|
||||
kmutex_t l2ad_mtx; /* lock for buffer list */
|
||||
list_t l2ad_buflist; /* buffer list */
|
||||
list_node_t l2ad_node; /* device list node */
|
||||
zfs_refcount_t l2ad_alloc; /* allocated bytes */
|
||||
/*
|
||||
* Persistence-related stuff
|
||||
*/
|
||||
l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */
|
||||
uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */
|
||||
l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
|
||||
int l2ad_log_ent_idx; /* index into cur log blk */
|
||||
/* Number of bytes in current log block's payload */
|
||||
uint64_t l2ad_log_blk_payload_asize;
|
||||
/*
|
||||
* Offset (in bytes) of the first buffer in current log block's
|
||||
* payload.
|
||||
*/
|
||||
uint64_t l2ad_log_blk_payload_start;
|
||||
/* Flag indicating whether a rebuild is scheduled or is going on */
|
||||
boolean_t l2ad_rebuild;
|
||||
boolean_t l2ad_rebuild_cancel;
|
||||
boolean_t l2ad_rebuild_began;
|
||||
uint64_t l2ad_log_entries; /* entries per log blk */
|
||||
uint64_t l2ad_evict; /* evicted offset in bytes */
|
||||
/* List of pointers to log blocks present in the L2ARC device */
|
||||
list_t l2ad_lbptr_list;
|
||||
} l2arc_dev_t;
|
||||
|
||||
/*
|
||||
* Encrypted blocks will need to be stored encrypted on the L2ARC
|
||||
* disk as they appear in the main pool. In order for this to work we
|
||||
@@ -206,32 +419,19 @@ typedef struct arc_buf_hdr_crypt {
|
||||
uint8_t b_mac[ZIO_DATA_MAC_LEN];
|
||||
} arc_buf_hdr_crypt_t;
|
||||
|
||||
typedef struct l2arc_dev {
|
||||
vdev_t *l2ad_vdev; /* vdev */
|
||||
spa_t *l2ad_spa; /* spa */
|
||||
uint64_t l2ad_hand; /* next write location */
|
||||
uint64_t l2ad_start; /* first addr on device */
|
||||
uint64_t l2ad_end; /* last addr on device */
|
||||
boolean_t l2ad_first; /* first sweep through */
|
||||
boolean_t l2ad_writing; /* currently writing */
|
||||
kmutex_t l2ad_mtx; /* lock for buffer list */
|
||||
list_t l2ad_buflist; /* buffer list */
|
||||
list_node_t l2ad_node; /* device list node */
|
||||
zfs_refcount_t l2ad_alloc; /* allocated bytes */
|
||||
} l2arc_dev_t;
|
||||
|
||||
typedef struct l2arc_buf_hdr {
|
||||
/* protected by arc_buf_hdr mutex */
|
||||
l2arc_dev_t *b_dev; /* L2ARC device */
|
||||
uint64_t b_daddr; /* disk address, offset byte */
|
||||
uint32_t b_hits;
|
||||
|
||||
list_node_t b_l2node;
|
||||
} l2arc_buf_hdr_t;
|
||||
|
||||
typedef struct l2arc_write_callback {
|
||||
l2arc_dev_t *l2wcb_dev; /* device info */
|
||||
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
|
||||
/* in-flight list of log blocks */
|
||||
list_t l2wcb_abd_list;
|
||||
} l2arc_write_callback_t;
|
||||
|
||||
struct arc_buf_hdr {
|
||||
@@ -532,6 +732,71 @@ typedef struct arc_stats {
|
||||
kstat_named_t arcstat_l2_psize;
|
||||
/* Not updated directly; only synced in arc_kstat_update. */
|
||||
kstat_named_t arcstat_l2_hdr_size;
|
||||
/*
|
||||
* Number of L2ARC log blocks written. These are used for restoring the
|
||||
* L2ARC. Updated during writing of L2ARC log blocks.
|
||||
*/
|
||||
kstat_named_t arcstat_l2_log_blk_writes;
|
||||
/*
|
||||
* Moving average of the physical size of the L2ARC log blocks, in
|
||||
* bytes. Updated during L2ARC rebuild and during writing of L2ARC
|
||||
* log blocks.
|
||||
*/
|
||||
kstat_named_t arcstat_l2_log_blk_avg_size;
|
||||
/*
|
||||
* Moving average of the physical size of L2ARC restored data, in bytes,
|
||||
* to the physical size of their metadata in ARC, in bytes.
|
||||
* Updated during L2ARC rebuild and during writing of L2ARC log blocks.
|
||||
*/
|
||||
kstat_named_t arcstat_l2_data_to_meta_ratio;
|
||||
/*
|
||||
* Number of times the L2ARC rebuild was successful for an L2ARC device.
|
||||
*/
|
||||
kstat_named_t arcstat_l2_rebuild_success;
|
||||
/*
|
||||
* Number of times the L2ARC rebuild failed because the device header
|
||||
* was in an unsupported format or corrupted.
|
||||
*/
|
||||
kstat_named_t arcstat_l2_rebuild_abort_unsupported;
|
||||
/*
|
||||
* Number of times the L2ARC rebuild failed because of IO errors
|
||||
* while reading a log block.
|
||||
*/
|
||||
kstat_named_t arcstat_l2_rebuild_abort_io_errors;
|
||||
/*
|
||||
* Number of times the L2ARC rebuild failed because of IO errors when
|
||||
* reading the device header.
|
||||
*/
|
||||
kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
|
||||
/*
|
||||
* Number of L2ARC log blocks which failed to be restored due to
|
||||
* checksum errors.
|
||||
*/
|
||||
kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
|
||||
/*
|
||||
* Number of times the L2ARC rebuild was aborted due to low system
|
||||
* memory.
|
||||
*/
|
||||
kstat_named_t arcstat_l2_rebuild_abort_lowmem;
|
||||
/* Logical size of L2ARC restored data, in bytes. */
|
||||
kstat_named_t arcstat_l2_rebuild_size;
|
||||
/*
|
||||
* Number of L2ARC log entries (buffers) that were successfully
|
||||
* restored in ARC.
|
||||
*/
|
||||
kstat_named_t arcstat_l2_rebuild_bufs;
|
||||
/*
|
||||
* Number of L2ARC log entries (buffers) already cached in ARC. These
|
||||
* were not restored again.
|
||||
*/
|
||||
kstat_named_t arcstat_l2_rebuild_bufs_precached;
|
||||
/* Physical size of L2ARC restored data, in bytes. */
|
||||
kstat_named_t arcstat_l2_rebuild_psize;
|
||||
/*
|
||||
* Number of L2ARC log blocks that were restored successfully. Each
|
||||
* log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
|
||||
*/
|
||||
kstat_named_t arcstat_l2_rebuild_log_blks;
|
||||
kstat_named_t arcstat_memory_throttle_count;
|
||||
kstat_named_t arcstat_memory_direct_count;
|
||||
kstat_named_t arcstat_memory_indirect_count;
|
||||
@@ -617,6 +882,10 @@ extern void arc_tuning_update(boolean_t);
|
||||
extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS);
|
||||
extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
|
||||
|
||||
/* used in zdb.c */
|
||||
boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
|
||||
const l2arc_log_blkptr_t *lbp);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user