mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
BRT: Rework structures and locks to be per-vdev
While the block cloning operation was per-vdev from the beginning, before this change most of its data were protected by two pool-wide locks. This created a lot of lock contention in many workloads. This change makes most of the block cloning data structures per-vdev, which allows them to be locked separately. The only pool-wide lock now is spa_brt_lock, protecting the array of per-vdev pointers and in most cases taken as reader. This also splits the per-vdev locks into three different ones: bv_pending_lock protects the AVL-tree of pending operations in open context, bv_mos_entries_lock protects the BRT ZAP object while it is being prefetched, and bv_lock protects the rest of the per-vdev context during the TXG commit process. There should be no functional difference aside from some optimizations. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Pawel Jakub Dawidek <pjd@FreeBSD.org> Reviewed-by: Brian Atkinson <batkinson@lanl.gov> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #16740
This commit is contained in:
committed by
Brian Behlendorf
parent
309ce6303f
commit
fd6e8c1d2a
+44
-49
@@ -86,28 +86,38 @@ typedef struct brt_vdev_phys {
|
||||
uint64_t bvp_savedspace;
|
||||
} brt_vdev_phys_t;
|
||||
|
||||
typedef struct brt_vdev {
|
||||
struct brt_vdev {
|
||||
/*
|
||||
* Pending changes from open contexts.
|
||||
*/
|
||||
kmutex_t bv_pending_lock;
|
||||
avl_tree_t bv_pending_tree[TXG_SIZE];
|
||||
/*
|
||||
* Protects bv_mos_*.
|
||||
*/
|
||||
krwlock_t bv_mos_entries_lock ____cacheline_aligned;
|
||||
/*
|
||||
* Protects all the fields starting from bv_initiated.
|
||||
*/
|
||||
krwlock_t bv_lock ____cacheline_aligned;
|
||||
/*
|
||||
* VDEV id.
|
||||
*/
|
||||
uint64_t bv_vdevid;
|
||||
/*
|
||||
* Is the structure initiated?
|
||||
* (bv_entcount and bv_bitmap are allocated?)
|
||||
*/
|
||||
boolean_t bv_initiated;
|
||||
uint64_t bv_vdevid ____cacheline_aligned;
|
||||
/*
|
||||
* Object number in the MOS for the entcount array and brt_vdev_phys.
|
||||
*/
|
||||
uint64_t bv_mos_brtvdev;
|
||||
/*
|
||||
* Object number in the MOS for the entries table.
|
||||
* Object number in the MOS and dnode for the entries table.
|
||||
*/
|
||||
uint64_t bv_mos_entries;
|
||||
dnode_t *bv_mos_entries_dnode;
|
||||
/*
|
||||
* Entries to sync.
|
||||
* Is the structure initiated?
|
||||
* (bv_entcount and bv_bitmap are allocated?)
|
||||
*/
|
||||
avl_tree_t bv_tree;
|
||||
boolean_t bv_initiated;
|
||||
/*
|
||||
* Does the bv_entcount[] array need byte swapping?
|
||||
*/
|
||||
@@ -120,6 +130,26 @@ typedef struct brt_vdev {
|
||||
* This is the array with BRT entry count per BRT_RANGESIZE.
|
||||
*/
|
||||
uint16_t *bv_entcount;
|
||||
/*
|
||||
* bv_entcount[] potentially can be a bit too big to synchronize it all
|
||||
* when we just changed few entcounts. The fields below allow us to
|
||||
* track updates to bv_entcount[] array since the last sync.
|
||||
* A single bit in the bv_bitmap represents as many entcounts as can
|
||||
* fit into a single BRT_BLOCKSIZE.
|
||||
* For example we have 65536 entcounts in the bv_entcount array
|
||||
* (so the whole array is 128kB). We updated bv_entcount[2] and
|
||||
* bv_entcount[5]. In that case only first bit in the bv_bitmap will
|
||||
* be set and we will write only first BRT_BLOCKSIZE out of 128kB.
|
||||
*/
|
||||
ulong_t *bv_bitmap;
|
||||
/*
|
||||
* bv_entcount[] needs updating on disk.
|
||||
*/
|
||||
boolean_t bv_entcount_dirty;
|
||||
/*
|
||||
* brt_vdev_phys needs updating on disk.
|
||||
*/
|
||||
boolean_t bv_meta_dirty;
|
||||
/*
|
||||
* Sum of all bv_entcount[]s.
|
||||
*/
|
||||
@@ -133,45 +163,10 @@ typedef struct brt_vdev {
|
||||
*/
|
||||
uint64_t bv_savedspace;
|
||||
/*
|
||||
* brt_vdev_phys needs updating on disk.
|
||||
* Entries to sync.
|
||||
*/
|
||||
boolean_t bv_meta_dirty;
|
||||
/*
|
||||
* bv_entcount[] needs updating on disk.
|
||||
*/
|
||||
boolean_t bv_entcount_dirty;
|
||||
/*
|
||||
* bv_entcount[] potentially can be a bit too big to synchronize it all
|
||||
* when we just changed few entcounts. The fields below allow us to
|
||||
* track updates to bv_entcount[] array since the last sync.
|
||||
* A single bit in the bv_bitmap represents as many entcounts as can
|
||||
* fit into a single BRT_BLOCKSIZE.
|
||||
* For example we have 65536 entcounts in the bv_entcount array
|
||||
* (so the whole array is 128kB). We updated bv_entcount[2] and
|
||||
* bv_entcount[5]. In that case only first bit in the bv_bitmap will
|
||||
* be set and we will write only first BRT_BLOCKSIZE out of 128kB.
|
||||
*/
|
||||
ulong_t *bv_bitmap;
|
||||
uint64_t bv_nblocks;
|
||||
} brt_vdev_t;
|
||||
|
||||
/*
|
||||
* In-core brt
|
||||
*/
|
||||
typedef struct brt {
|
||||
krwlock_t brt_lock;
|
||||
spa_t *brt_spa;
|
||||
#define brt_mos brt_spa->spa_meta_objset
|
||||
uint64_t brt_rangesize;
|
||||
uint64_t brt_usedspace;
|
||||
uint64_t brt_savedspace;
|
||||
avl_tree_t brt_pending_tree[TXG_SIZE];
|
||||
kmutex_t brt_pending_lock[TXG_SIZE];
|
||||
/* Sum of all entries across all bv_trees. */
|
||||
uint64_t brt_nentries;
|
||||
brt_vdev_t *brt_vdevs;
|
||||
uint64_t brt_nvdevs;
|
||||
} brt_t;
|
||||
avl_tree_t bv_tree;
|
||||
};
|
||||
|
||||
/* Size of bre_offset / sizeof (uint64_t). */
|
||||
#define BRT_KEY_WORDS (1)
|
||||
@@ -188,7 +183,7 @@ typedef struct brt_entry {
|
||||
|
||||
typedef struct brt_pending_entry {
|
||||
blkptr_t bpe_bp;
|
||||
int bpe_count;
|
||||
uint64_t bpe_count;
|
||||
avl_node_t bpe_node;
|
||||
} brt_pending_entry_t;
|
||||
|
||||
|
||||
@@ -53,6 +53,7 @@ extern "C" {
|
||||
/*
|
||||
* Forward references that lots of things need.
|
||||
*/
|
||||
typedef struct brt_vdev brt_vdev_t;
|
||||
typedef struct spa spa_t;
|
||||
typedef struct vdev vdev_t;
|
||||
typedef struct metaslab metaslab_t;
|
||||
|
||||
@@ -412,8 +412,12 @@ struct spa {
|
||||
uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
|
||||
uint64_t spa_dedup_checksum; /* default dedup checksum */
|
||||
uint64_t spa_dspace; /* dspace in normal class */
|
||||
uint64_t spa_rdspace; /* raw (non-dedup) --//-- */
|
||||
boolean_t spa_active_ddt_prune; /* ddt prune process active */
|
||||
struct brt *spa_brt; /* in-core BRT */
|
||||
brt_vdev_t **spa_brt_vdevs; /* array of per-vdev BRTs */
|
||||
uint64_t spa_brt_nvdevs; /* number of vdevs in BRT */
|
||||
uint64_t spa_brt_rangesize; /* pool's BRT range size */
|
||||
krwlock_t spa_brt_lock; /* Protects brt_vdevs/nvdevs */
|
||||
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
|
||||
kmutex_t spa_proc_lock; /* protects spa_proc* */
|
||||
kcondvar_t spa_proc_cv; /* spa_proc_state transitions */
|
||||
|
||||
Reference in New Issue
Block a user