Persistent L2ARC

This commit makes the L2ARC persistent across reboots. We implement
a light-weight persistent L2ARC metadata structure that allows L2ARC
contents to be recovered after a reboot. This significantly eases the
impact a reboot has on read performance on systems with large caches.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Saso Kiselkov <skiselkov@gmail.com>
Co-authored-by: Jorgen Lundman <lundman@lundman.net>
Co-authored-by: George Amanakis <gamanakis@gmail.com>
Ported-by: Yuxuan Shui <yshuiv7@gmail.com>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes #925 
Closes #1823 
Closes #2672 
Closes #3744 
Closes #9582
This commit is contained in:
George Amanakis
2020-04-10 13:33:35 -04:00
committed by GitHub
parent 36a6e2335c
commit 77f6826b83
30 changed files with 3020 additions and 88 deletions
+4
View File
@@ -310,10 +310,14 @@ void arc_fini(void);
/*
* L2ARC entry points exported to the rest of ZFS. Their implementations are
* not visible from this header.
* NOTE(review): the per-function notes below are inferred from names and
* signatures only -- confirm against the definitions.
*/
/* presumably attach / detach vd as an L2ARC device, and query for it */
void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
void l2arc_remove_vdev(vdev_t *vd);
boolean_t l2arc_vdev_present(vdev_t *vd);
/* presumably (re)starts recovery of persisted L2ARC contents for vd */
void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
/* presumably tests whether `check` lies between `bottom` and `top` */
boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top,
uint64_t check);
void l2arc_init(void);
void l2arc_fini(void);
void l2arc_start(void);
void l2arc_stop(void);
/* presumably kicks off rebuilds for the L2ARC devices of spa */
void l2arc_spa_rebuild_start(spa_t *spa);
#ifndef _KERNEL
extern boolean_t arc_watch;
+287 -18
View File
@@ -20,9 +20,10 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013, Delphix. All rights reserved.
* Copyright (c) 2013, Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2020, George Amanakis. All rights reserved.
*/
#ifndef _SYS_ARC_IMPL_H
@@ -176,6 +177,218 @@ typedef struct l1arc_buf_hdr {
abd_t *b_pabd;
} l1arc_buf_hdr_t;
/*
 * Flag bits for the persistent L2ARC device header (the dh_flags field is
 * annotated with this type).
 */
typedef enum l2arc_dev_hdr_flags_t {
	/* bit 0: mirror of l2ad_first */
	L2ARC_DEV_HDR_EVICT_FIRST = 0x1
} l2arc_dev_hdr_flags_t;
/*
* Pointer used in persistent L2ARC (for pointing to log blocks).
*
* This structure is embedded in on-device structures (dh_start_lbps[] in the
* device header and lb_prev_lbp in each log block), so the order and size of
* its fields must not change.
*/
typedef struct l2arc_log_blkptr {
/*
* Offset of log block within the device, in bytes
*/
uint64_t lbp_daddr;
/*
* Aligned payload size (in bytes) of the log block
*/
uint64_t lbp_payload_asize;
/*
* Offset in bytes of the first buffer in the payload
*/
uint64_t lbp_payload_start;
/*
* lbp_prop is a packed bit field, accessed with the L2BLK_GET_* and
* L2BLK_SET_* macros. It has the following format:
* * logical size (in bytes)
* * physical (compressed) size (in bytes)
* * compression algorithm (we always LZ4-compress l2arc logs)
* * checksum algorithm (used for lbp_cksum)
*/
uint64_t lbp_prop;
zio_cksum_t lbp_cksum; /* checksum of log */
} l2arc_log_blkptr_t;
/*
* The persistent L2ARC device header.
* Byte order of magic determines whether 64-bit bswap of fields is necessary.
*
* The compile-time assertion following the structure requires it to be
* exactly SPA_MINBLOCKSIZE bytes; any field added here must be taken out of
* dh_pad.
*/
typedef struct l2arc_dev_hdr_phys {
uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */
uint64_t dh_version; /* Persistent L2ARC version */
/*
* Global L2ARC device state and metadata.
* NOTE(review): dh_spa_guid and dh_vdev_guid presumably identify the pool
* and the cache vdev this header belongs to -- confirm in the rebuild
* code.
*/
uint64_t dh_spa_guid;
uint64_t dh_vdev_guid;
uint64_t dh_log_blk_ent; /* entries per log blk */
uint64_t dh_evict; /* evicted offset in bytes */
uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */
/*
* Used in zdb.c for determining if a log block is valid, in the same
* way that l2arc_rebuild() does.
*/
uint64_t dh_start;
uint64_t dh_end;
/*
* Start of log block chain. [0] -> newest log, [1] -> one older (used
* for initiating prefetch).
*/
l2arc_log_blkptr_t dh_start_lbps[2];
const uint64_t dh_pad[34]; /* pad to 512 bytes */
zio_eck_t dh_tail;
} l2arc_dev_hdr_phys_t;
/* The header must occupy exactly one minimum-sized block. */
CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
/*
* A single ARC buffer header entry in a l2arc_log_blk_phys_t.
*/
typedef struct l2arc_log_ent_phys {
dva_t le_dva; /* dva of buffer */
uint64_t le_birth; /* birth txg of buffer */
/*
* le_prop is a packed bit field, accessed with the L2BLK_GET_* and
* L2BLK_SET_* macros. It has the following format:
* * logical size (in bytes)
* * physical (compressed) size (in bytes)
* * compression algorithm
* * object type (used to restore arc_buf_contents_t)
* * protected status (used for encryption)
* * prefetch status (used in l2arc_read_done())
*/
uint64_t le_prop;
uint64_t le_daddr; /* buf location on l2dev */
/*
* We pad the size of each entry to a power of 2 so that the size of
* l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
* because of the L2BLK_SET_*SIZE macros.
*/
const uint64_t le_pad[3]; /* pad to 64 bytes */
} l2arc_log_ent_phys_t;
/* Capacity of the lb_entries[] array in a single log block. */
#define L2ARC_LOG_BLK_MAX_ENTRIES (1022)
/*
* A log block of up to 1022 ARC buffer log entries, chained into the
* persistent L2ARC metadata linked list. Byte order of magic determines
* whether 64-bit bswap of fields is necessary.
*
* Size budget, going by the padding comments: a 128-byte header section plus
* 1022 entries of 64 bytes each gives 128 + 65408 = 65536 bytes (64K).
*/
typedef struct l2arc_log_blk_phys {
uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */
/*
* There are 2 chains (headed by dh_start_lbps[2]), and this field
* points back to the previous block in this chain. We alternate
* which chain we append to, so they are time-wise and offset-wise
* interleaved, but that is an optimization rather than for
* correctness.
*/
l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */
/*
* Pad header section to 128 bytes
*/
uint64_t lb_pad[7];
/* Payload */
l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
} l2arc_log_blk_phys_t; /* 64K total */
/*
* Compile-time checks on the size of l2arc_log_blk_phys_t.
*
* The size has to be power-of-2 aligned with SPA_MINBLOCKSHIFT because of
* the L2BLK_SET_*SIZE macros, and it has to lie within
* [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE].
*/
CTASSERT_GLOBAL(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
1ULL << SPA_MINBLOCKSHIFT));
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE);
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE);
/*
* These structures hold in-flight abd buffers for log blocks as they're being
* written to the L2ARC device.
*/
typedef struct l2arc_lb_abd_buf {
abd_t *abd; /* buffer for one log block */
/*
* List linkage.
* NOTE(review): presumably linked on l2wcb_abd_list of the write
* callback -- confirm in the write path.
*/
list_node_t node;
} l2arc_lb_abd_buf_t;
/*
* These structures hold pointers to log blocks present on the L2ARC device.
*/
typedef struct l2arc_lb_ptr_buf {
l2arc_log_blkptr_t *lb_ptr; /* pointer to one log block */
/*
* List linkage.
* NOTE(review): presumably linked on l2ad_lbptr_list of the device --
* confirm where entries are inserted.
*/
list_node_t node;
} l2arc_lb_ptr_buf_t;
/*
* Macros for getting and setting fields in le_prop and lbp_prop.
*
* Bit layout of the 64-bit property word, as encoded below:
* * LSIZE: starts at bit 0, SPA_LSIZEBITS wide
* * PSIZE: starts at bit 16, SPA_PSIZEBITS wide
* * COMPRESS: starts at bit 32, SPA_COMPRESSBITS wide
* * PREFETCH: bit 39
* * CHECKSUM: bits 40-47
* * TYPE: bits 48-55
* * PROTECTED: bit 56
*
* NOTE(review): the two size fields go through the BF64_*_SB variants with
* SPA_MINBLOCKSHIFT and 1 as trailing arguments, presumably a shift and a
* bias, which is why sizes need to be SPA_MINBLOCKSHIFT aligned -- confirm
* against the BF64_*_SB definitions.
*/
#define L2BLK_GET_LSIZE(field) \
BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define L2BLK_SET_LSIZE(field, x) \
BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define L2BLK_GET_PSIZE(field) \
BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define L2BLK_SET_PSIZE(field, x) \
BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define L2BLK_GET_COMPRESS(field) \
BF64_GET((field), 32, SPA_COMPRESSBITS)
#define L2BLK_SET_COMPRESS(field, x) \
BF64_SET((field), 32, SPA_COMPRESSBITS, x)
#define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1)
#define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x)
#define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8)
#define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x)
#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8)
#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x)
#define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1)
#define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x)
/*
 * Exchange the values of two pointer lvalues.
 *
 * Both arguments are evaluated more than once, so they must be free of side
 * effects. The temporary carries a macro-specific name so that a caller may
 * pass a variable that is itself called "tmp" without it being shadowed
 * inside the macro body (which would silently turn the swap into a no-op).
 * The arguments are parenthesized wherever they are expanded.
 */
#define	PTR_SWAP(x, y)				\
	do {					\
		void *ptr_swap_tmp_ = (x);	\
		(x) = (y);			\
		(y) = ptr_swap_tmp_;		\
		_NOTE(CONSTCOND)		\
	} while (0)
/*
 * Magic numbers for the persistent L2ARC device header and log blocks. Each
 * value spells an eight-character ASCII tag, most significant byte first.
 */
#define	L2ARC_DEV_HDR_MAGIC	0x5A46534341434845ULL	/* "ZFSCACHE" */
#define	L2ARC_LOG_BLK_MAGIC	0x4C4F47424C4B4844ULL	/* "LOGBLKHD" */
/*
* L2ARC Internals
*
* In-memory state of one L2ARC device, including the bookkeeping needed to
* write and recover the persistent log block chain.
*/
typedef struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
spa_t *l2ad_spa; /* spa */
uint64_t l2ad_hand; /* next write location */
uint64_t l2ad_start; /* first addr on device */
uint64_t l2ad_end; /* last addr on device */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
kmutex_t l2ad_mtx; /* lock for buffer list */
list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
zfs_refcount_t l2ad_alloc; /* allocated bytes */
/*
* Persistence-related stuff
*/
l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */
uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */
/*
* The open log block is embedded by value, not by pointer, so every
* l2arc_dev_t carries a full l2arc_log_blk_phys_t (64K per that
* structure's own comment).
*/
l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
int l2ad_log_ent_idx; /* index into cur log blk */
/* Number of bytes in current log block's payload */
uint64_t l2ad_log_blk_payload_asize;
/*
* Offset (in bytes) of the first buffer in current log block's
* payload.
*/
uint64_t l2ad_log_blk_payload_start;
/* Flag indicating whether a rebuild is scheduled or is going on */
boolean_t l2ad_rebuild;
/*
* NOTE(review): presumably l2ad_rebuild_cancel asks a running rebuild to
* stop and l2ad_rebuild_began records that one has started -- confirm
* against the rebuild code.
*/
boolean_t l2ad_rebuild_cancel;
boolean_t l2ad_rebuild_began;
/*
* NOTE(review): the next two fields carry the same descriptions as
* dh_log_blk_ent and dh_evict in the device header; presumably they are
* the in-memory copies of those values -- confirm.
*/
uint64_t l2ad_log_entries; /* entries per log blk */
uint64_t l2ad_evict; /* evicted offset in bytes */
/* List of pointers to log blocks present in the L2ARC device */
list_t l2ad_lbptr_list;
} l2arc_dev_t;
/*
* Encrypted blocks will need to be stored encrypted on the L2ARC
* disk as they appear in the main pool. In order for this to work we
@@ -206,32 +419,19 @@ typedef struct arc_buf_hdr_crypt {
uint8_t b_mac[ZIO_DATA_MAC_LEN];
} arc_buf_hdr_crypt_t;
/*
* NOTE(review): this is a second definition of l2arc_dev_t. An extended
* definition with the persistence-related fields appears earlier in this
* text, ahead of the encrypted-block comment. This copy looks like the
* pre-change version that the patch relocates -- confirm, and make sure only
* one definition remains in the final header.
*/
typedef struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
spa_t *l2ad_spa; /* spa */
uint64_t l2ad_hand; /* next write location */
uint64_t l2ad_start; /* first addr on device */
uint64_t l2ad_end; /* last addr on device */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
kmutex_t l2ad_mtx; /* lock for buffer list */
list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
zfs_refcount_t l2ad_alloc; /* allocated bytes */
} l2arc_dev_t;
/*
* L2ARC-specific part of a buffer header: which L2ARC device holds the
* buffer and at what offset.
*/
typedef struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
uint64_t b_daddr; /* disk address, offset byte */
/* NOTE(review): presumably a count of L2ARC hits on this buffer */
uint32_t b_hits;
/* NOTE(review): presumably linkage on the device's l2ad_buflist */
list_node_t b_l2node;
} l2arc_buf_hdr_t;
/*
* Context carried by an L2ARC write: the target device, the head of the list
* of buffers being written, and the log block buffers that are in flight.
*/
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
/*
* in-flight list of log blocks
* NOTE(review): elements are presumably l2arc_lb_abd_buf_t -- confirm.
*/
list_t l2wcb_abd_list;
} l2arc_write_callback_t;
struct arc_buf_hdr {
@@ -532,6 +732,71 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_psize;
/* Not updated directly; only synced in arc_kstat_update. */
kstat_named_t arcstat_l2_hdr_size;
/*
* Number of L2ARC log blocks written. These are used for restoring the
* L2ARC. Updated during writing of L2ARC log blocks.
*/
kstat_named_t arcstat_l2_log_blk_writes;
/*
* Moving average of the physical size of the L2ARC log blocks, in
* bytes. Updated during L2ARC rebuild and during writing of L2ARC
* log blocks.
*/
kstat_named_t arcstat_l2_log_blk_avg_size;
/*
* Moving average of the ratio of the physical size of L2ARC restored
* data, in bytes, to the physical size of their metadata in ARC, in bytes.
* Updated during L2ARC rebuild and during writing of L2ARC log blocks.
*/
kstat_named_t arcstat_l2_data_to_meta_ratio;
/*
* Number of times the L2ARC rebuild was successful for an L2ARC device.
*/
kstat_named_t arcstat_l2_rebuild_success;
/*
* Number of times the L2ARC rebuild failed because the device header
* was in an unsupported format or corrupted.
*/
kstat_named_t arcstat_l2_rebuild_abort_unsupported;
/*
* Number of times the L2ARC rebuild failed because of IO errors
* while reading a log block.
*/
kstat_named_t arcstat_l2_rebuild_abort_io_errors;
/*
* Number of times the L2ARC rebuild failed because of IO errors when
* reading the device header.
*/
kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
/*
* Number of L2ARC log blocks which failed to be restored due to
* checksum errors.
*/
kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
/*
* Number of times the L2ARC rebuild was aborted due to low system
* memory.
*/
kstat_named_t arcstat_l2_rebuild_abort_lowmem;
/* Logical size of L2ARC restored data, in bytes. */
kstat_named_t arcstat_l2_rebuild_size;
/*
* Number of L2ARC log entries (buffers) that were successfully
* restored in ARC.
*/
kstat_named_t arcstat_l2_rebuild_bufs;
/*
* Number of L2ARC log entries (buffers) already cached in ARC. These
* were not restored again.
*/
kstat_named_t arcstat_l2_rebuild_bufs_precached;
/* Physical size of L2ARC restored data, in bytes. */
kstat_named_t arcstat_l2_rebuild_psize;
/*
* Number of L2ARC log blocks that were restored successfully. Each
* log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
*/
kstat_named_t arcstat_l2_rebuild_log_blks;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_memory_direct_count;
kstat_named_t arcstat_memory_indirect_count;
@@ -617,6 +882,10 @@ extern void arc_tuning_update(boolean_t);
extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
/*
* used in zdb.c
* NOTE(review): presumably reports whether the log block pointer lbp is
* valid for the device dev; the definition is not visible from this header --
* confirm the exact checks there.
*/
boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
const l2arc_log_blkptr_t *lbp);
#ifdef __cplusplus
}
#endif
+5
View File
@@ -573,6 +573,11 @@ typedef enum zfs_key_location {
#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
#define ZPL_VERSION_SA ZPL_VERSION_5
/*
* Persistent L2ARC version
*
* L2ARC_PERSISTENT_VERSION names the current version and
* L2ARC_PERSISTENT_VERSION_STRING is its textual form; the two must be
* changed together when a new version is introduced.
* NOTE(review): presumably this is the value stored in dh_version of the
* device header -- confirm.
*/
#define L2ARC_PERSISTENT_VERSION_1 1ULL
#define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1
#define L2ARC_PERSISTENT_VERSION_STRING "1"
/* Rewind policy information */
#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */
+1
View File
@@ -787,6 +787,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
/*
* Each flag visible here is a distinct single bit (0x100, 0x200, 0x400,
* 0x800), so they can be OR-ed together.
* NOTE(review): SPA_ASYNC_L2CACHE_REBUILD presumably requests an
* asynchronous L2ARC rebuild -- confirm in the async task handler.
*/
#define SPA_ASYNC_INITIALIZE_RESTART 0x100
#define SPA_ASYNC_TRIM_RESTART 0x200
#define SPA_ASYNC_AUTOTRIM_RESTART 0x400
#define SPA_ASYNC_L2CACHE_REBUILD 0x800
/*
* Controls the behavior of spa_vdev_remove().