BRT: Rework structures and locks to be per-vdev

While block cloning operations were per-vdev from the beginning,
before this change most of their data was protected by two pool-
wide locks.  This created a lot of lock contention in many workloads.

This change makes most of the block cloning data structures per-vdev,
which allows them to be locked separately.  The only pool-wide lock
now is spa_brt_lock, protecting the array of per-vdev pointers and in
most cases taken as reader.  This change also splits the per-vdev
locking into three different locks: bv_pending_lock protects the
AVL-tree of pending operations in open context, bv_mos_entries_lock
protects the BRT ZAP object while it is being prefetched, and bv_lock
protects the rest of the per-vdev context during the TXG commit
process.  There should be no functional difference aside from some
optimizations.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Pawel Jakub Dawidek <pjd@FreeBSD.org>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #16740
This commit is contained in:
Alexander Motin
2024-11-10 17:29:25 -05:00
committed by Brian Behlendorf
parent 309ce6303f
commit fd6e8c1d2a
6 changed files with 401 additions and 533 deletions
+44 -49
View File
@@ -86,28 +86,38 @@ typedef struct brt_vdev_phys {
uint64_t bvp_savedspace;
} brt_vdev_phys_t;
typedef struct brt_vdev {
struct brt_vdev {
/*
* Pending changes from open contexts.
*/
kmutex_t bv_pending_lock;
avl_tree_t bv_pending_tree[TXG_SIZE];
/*
* Protects bv_mos_*.
*/
krwlock_t bv_mos_entries_lock ____cacheline_aligned;
/*
* Protects all the fields starting from bv_initiated.
*/
krwlock_t bv_lock ____cacheline_aligned;
/*
* VDEV id.
*/
uint64_t bv_vdevid;
/*
* Is the structure initiated?
* (bv_entcount and bv_bitmap are allocated?)
*/
boolean_t bv_initiated;
uint64_t bv_vdevid ____cacheline_aligned;
/*
* Object number in the MOS for the entcount array and brt_vdev_phys.
*/
uint64_t bv_mos_brtvdev;
/*
* Object number in the MOS for the entries table.
* Object number in the MOS and dnode for the entries table.
*/
uint64_t bv_mos_entries;
dnode_t *bv_mos_entries_dnode;
/*
* Entries to sync.
* Is the structure initiated?
* (bv_entcount and bv_bitmap are allocated?)
*/
avl_tree_t bv_tree;
boolean_t bv_initiated;
/*
* Does the bv_entcount[] array need byte swapping?
*/
@@ -120,6 +130,26 @@ typedef struct brt_vdev {
* This is the array with BRT entry count per BRT_RANGESIZE.
*/
uint16_t *bv_entcount;
/*
* bv_entcount[] potentially can be a bit too big to synchronize it all
* when we just changed few entcounts. The fields below allow us to
* track updates to bv_entcount[] array since the last sync.
* A single bit in the bv_bitmap represents as many entcounts as can
* fit into a single BRT_BLOCKSIZE.
* For example we have 65536 entcounts in the bv_entcount array
* (so the whole array is 128kB). We updated bv_entcount[2] and
* bv_entcount[5]. In that case only first bit in the bv_bitmap will
* be set and we will write only first BRT_BLOCKSIZE out of 128kB.
*/
ulong_t *bv_bitmap;
/*
* bv_entcount[] needs updating on disk.
*/
boolean_t bv_entcount_dirty;
/*
* brt_vdev_phys needs updating on disk.
*/
boolean_t bv_meta_dirty;
/*
* Sum of all bv_entcount[]s.
*/
@@ -133,45 +163,10 @@ typedef struct brt_vdev {
*/
uint64_t bv_savedspace;
/*
* brt_vdev_phys needs updating on disk.
* Entries to sync.
*/
boolean_t bv_meta_dirty;
/*
* bv_entcount[] needs updating on disk.
*/
boolean_t bv_entcount_dirty;
/*
* bv_entcount[] potentially can be a bit too big to synchronize it all
* when we just changed few entcounts. The fields below allow us to
* track updates to bv_entcount[] array since the last sync.
* A single bit in the bv_bitmap represents as many entcounts as can
* fit into a single BRT_BLOCKSIZE.
* For example we have 65536 entcounts in the bv_entcount array
* (so the whole array is 128kB). We updated bv_entcount[2] and
* bv_entcount[5]. In that case only first bit in the bv_bitmap will
* be set and we will write only first BRT_BLOCKSIZE out of 128kB.
*/
ulong_t *bv_bitmap;
uint64_t bv_nblocks;
} brt_vdev_t;
/*
* In-core brt
*/
typedef struct brt {
krwlock_t brt_lock;
spa_t *brt_spa;
#define brt_mos brt_spa->spa_meta_objset
uint64_t brt_rangesize;
uint64_t brt_usedspace;
uint64_t brt_savedspace;
avl_tree_t brt_pending_tree[TXG_SIZE];
kmutex_t brt_pending_lock[TXG_SIZE];
/* Sum of all entries across all bv_trees. */
uint64_t brt_nentries;
brt_vdev_t *brt_vdevs;
uint64_t brt_nvdevs;
} brt_t;
avl_tree_t bv_tree;
};
/* Size of bre_offset / sizeof (uint64_t). */
#define BRT_KEY_WORDS (1)
@@ -188,7 +183,7 @@ typedef struct brt_entry {
typedef struct brt_pending_entry {
blkptr_t bpe_bp;
int bpe_count;
uint64_t bpe_count;
avl_node_t bpe_node;
} brt_pending_entry_t;
+1
View File
@@ -53,6 +53,7 @@ extern "C" {
/*
* Forward references that lots of things need.
*/
typedef struct brt_vdev brt_vdev_t;
typedef struct spa spa_t;
typedef struct vdev vdev_t;
typedef struct metaslab metaslab_t;
+5 -1
View File
@@ -412,8 +412,12 @@ struct spa {
uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
uint64_t spa_dedup_checksum; /* default dedup checksum */
uint64_t spa_dspace; /* dspace in normal class */
uint64_t spa_rdspace; /* raw (non-dedup) --//-- */
boolean_t spa_active_ddt_prune; /* ddt prune process active */
struct brt *spa_brt; /* in-core BRT */
brt_vdev_t **spa_brt_vdevs; /* array of per-vdev BRTs */
uint64_t spa_brt_nvdevs; /* number of vdevs in BRT */
uint64_t spa_brt_rangesize; /* pool's BRT range size */
krwlock_t spa_brt_lock; /* Protects brt_vdevs/nvdevs */
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
kmutex_t spa_proc_lock; /* protects spa_proc* */
kcondvar_t spa_proc_cv; /* spa_proc_state transitions */