BRT: Rework structures and locks to be per-vdev

While the block cloning operation has been per-vdev from the start,
before this change most of its data was protected by two pool-wide
locks, which created lots of lock contention in many workloads.

This change makes most of the block cloning data structures per-vdev,
which allows them to be locked separately.  The only remaining
pool-wide lock is spa_brt_lock, which protects the array of per-vdev
pointers and is taken as reader in most cases.  The per-vdev locks
are also split into three: bv_pending_lock protects the AVL tree of
pending operations in open context, bv_mos_entries_lock protects the
BRT ZAP object while it is being prefetched, and bv_lock protects the
rest of the per-vdev context during the TXG commit process.  There
should be no functional difference aside from some optimizations.
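
For illustration, a minimal sketch of a lookup under the new scheme
(not the commit's exact code; the helper name brt_vdev_lookup() is
hypothetical).  The pool-wide lock is held as reader only long enough
to dereference the per-vdev array; everything after that contends
only on that vdev's own locks:

    /* Sketch only: helper name is hypothetical. */
    static brt_vdev_t *
    brt_vdev_lookup(spa_t *spa, uint64_t vdevid)
    {
            brt_vdev_t *brtvd = NULL;

            /* Pool-wide lock, taken as reader in the common case. */
            rw_enter(&spa->spa_brt_lock, RW_READER);
            if (vdevid < spa->spa_brt_nvdevs)
                    brtvd = spa->spa_brt_vdevs[vdevid];
            rw_exit(&spa->spa_brt_lock);

            /*
             * Callers then take only this vdev's locks as needed:
             * bv_pending_lock for pending entries in open context,
             * bv_mos_entries_lock around ZAP prefetch, and bv_lock
             * for the rest of the per-vdev state during TXG commit.
             */
            return (brtvd);
    }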

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Pawel Jakub Dawidek <pjd@FreeBSD.org>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #16740
commit 409aad3f33 (parent 1917c26944)
Author: Alexander Motin, 2024-11-10 17:29:25 -05:00
Committer: Brian Behlendorf
6 changed files with 401 additions and 533 deletions

cmd/zdb/zdb.c

@@ -2119,9 +2119,6 @@ dump_brt(spa_t *spa)
                 return;
         }
 
-        brt_t *brt = spa->spa_brt;
-        VERIFY(brt);
-
         char count[32], used[32], saved[32];
         zdb_nicebytes(brt_get_used(spa), used, sizeof (used));
         zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));
@@ -2132,11 +2129,8 @@ dump_brt(spa_t *spa)
         if (dump_opt['T'] < 2)
                 return;
 
-        for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
-                brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
-                if (brtvd == NULL)
-                        continue;
-
+        for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+                brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
                 if (!brtvd->bv_initiated) {
                         printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
                         continue;
@@ -2160,20 +2154,21 @@ dump_brt(spa_t *spa)
         if (!do_histo)
                 printf("\n%-16s %-10s\n", "DVA", "REFCNT");
 
-        for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
-                brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
-                if (brtvd == NULL || !brtvd->bv_initiated)
+        for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+                brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
+                if (!brtvd->bv_initiated)
                         continue;
 
                 uint64_t counts[64] = {};
 
                 zap_cursor_t zc;
                 zap_attribute_t *za = zap_attribute_alloc();
-                for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
+                for (zap_cursor_init(&zc, spa->spa_meta_objset,
+                    brtvd->bv_mos_entries);
                     zap_cursor_retrieve(&zc, za) == 0;
                     zap_cursor_advance(&zc)) {
                         uint64_t refcnt;
-                        VERIFY0(zap_lookup_uint64(brt->brt_mos,
+                        VERIFY0(zap_lookup_uint64(spa->spa_meta_objset,
                             brtvd->bv_mos_entries,
                             (const uint64_t *)za->za_name, 1,
                             za->za_integer_length, za->za_num_integers,
@@ -8227,16 +8222,13 @@ dump_mos_leaks(spa_t *spa)
                 }
         }
 
-        if (spa->spa_brt != NULL) {
-                brt_t *brt = spa->spa_brt;
-                for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
-                        brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
-                        if (brtvd != NULL && brtvd->bv_initiated) {
-                                mos_obj_refd(brtvd->bv_mos_brtvdev);
-                                mos_obj_refd(brtvd->bv_mos_entries);
-                        }
-                }
-        }
+        for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+                brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
+                if (brtvd->bv_initiated) {
+                        mos_obj_refd(brtvd->bv_mos_brtvdev);
+                        mos_obj_refd(brtvd->bv_mos_entries);
+                }
+        }
 
         /*
          * Visit all allocated objects and make sure they are referenced.

include/sys/brt_impl.h

@@ -86,28 +86,38 @@ typedef struct brt_vdev_phys {
         uint64_t        bvp_savedspace;
 } brt_vdev_phys_t;
 
-typedef struct brt_vdev {
+struct brt_vdev {
+        /*
+         * Pending changes from open contexts.
+         */
+        kmutex_t        bv_pending_lock;
+        avl_tree_t      bv_pending_tree[TXG_SIZE];
+        /*
+         * Protects bv_mos_*.
+         */
+        krwlock_t       bv_mos_entries_lock ____cacheline_aligned;
+        /*
+         * Protects all the fields starting from bv_initiated.
+         */
+        krwlock_t       bv_lock ____cacheline_aligned;
         /*
          * VDEV id.
          */
-        uint64_t        bv_vdevid;
-        /*
-         * Is the structure initiated?
-         * (bv_entcount and bv_bitmap are allocated?)
-         */
-        boolean_t       bv_initiated;
+        uint64_t        bv_vdevid ____cacheline_aligned;
         /*
          * Object number in the MOS for the entcount array and brt_vdev_phys.
          */
         uint64_t        bv_mos_brtvdev;
         /*
-         * Object number in the MOS for the entries table.
+         * Object number in the MOS and dnode for the entries table.
          */
         uint64_t        bv_mos_entries;
+        dnode_t         *bv_mos_entries_dnode;
         /*
-         * Entries to sync.
+         * Is the structure initiated?
+         * (bv_entcount and bv_bitmap are allocated?)
          */
-        avl_tree_t      bv_tree;
+        boolean_t       bv_initiated;
         /*
          * Does the bv_entcount[] array needs byte swapping?
          */
@@ -120,6 +130,26 @@ typedef struct brt_vdev {
          * This is the array with BRT entry count per BRT_RANGESIZE.
          */
         uint16_t        *bv_entcount;
+        /*
+         * bv_entcount[] potentially can be a bit too big to sychronize it all
+         * when we just changed few entcounts.  The fields below allow us to
+         * track updates to bv_entcount[] array since the last sync.
+         * A single bit in the bv_bitmap represents as many entcounts as can
+         * fit into a single BRT_BLOCKSIZE.
+         * For example we have 65536 entcounts in the bv_entcount array
+         * (so the whole array is 128kB).  We updated bv_entcount[2] and
+         * bv_entcount[5].  In that case only first bit in the bv_bitmap will
+         * be set and we will write only first BRT_BLOCKSIZE out of 128kB.
+         */
+        ulong_t         *bv_bitmap;
+        /*
+         * bv_entcount[] needs updating on disk.
+         */
+        boolean_t       bv_entcount_dirty;
+        /*
+         * brt_vdev_phys needs updating on disk.
+         */
+        boolean_t       bv_meta_dirty;
         /*
          * Sum of all bv_entcount[]s.
          */
@@ -133,45 +163,10 @@ typedef struct brt_vdev {
          */
         uint64_t        bv_savedspace;
         /*
-         * brt_vdev_phys needs updating on disk.
+         * Entries to sync.
          */
-        boolean_t       bv_meta_dirty;
-        /*
-         * bv_entcount[] needs updating on disk.
-         */
-        boolean_t       bv_entcount_dirty;
-        /*
-         * bv_entcount[] potentially can be a bit too big to sychronize it all
-         * when we just changed few entcounts.  The fields below allow us to
-         * track updates to bv_entcount[] array since the last sync.
-         * A single bit in the bv_bitmap represents as many entcounts as can
-         * fit into a single BRT_BLOCKSIZE.
-         * For example we have 65536 entcounts in the bv_entcount array
-         * (so the whole array is 128kB).  We updated bv_entcount[2] and
-         * bv_entcount[5].  In that case only first bit in the bv_bitmap will
-         * be set and we will write only first BRT_BLOCKSIZE out of 128kB.
-         */
-        ulong_t         *bv_bitmap;
-        uint64_t        bv_nblocks;
-} brt_vdev_t;
-
-/*
- * In-core brt
- */
-typedef struct brt {
-        krwlock_t       brt_lock;
-        spa_t           *brt_spa;
-#define brt_mos         brt_spa->spa_meta_objset
-        uint64_t        brt_rangesize;
-        uint64_t        brt_usedspace;
-        uint64_t        brt_savedspace;
-        avl_tree_t      brt_pending_tree[TXG_SIZE];
-        kmutex_t        brt_pending_lock[TXG_SIZE];
-        /* Sum of all entries across all bv_trees. */
-        uint64_t        brt_nentries;
-        brt_vdev_t      *brt_vdevs;
-        uint64_t        brt_nvdevs;
-} brt_t;
+        avl_tree_t      bv_tree;
+};
 
 /* Size of bre_offset / sizeof (uint64_t). */
 #define BRT_KEY_WORDS   (1)
@@ -188,7 +183,7 @@ typedef struct brt_entry {
 
 typedef struct brt_pending_entry {
         blkptr_t        bpe_bp;
-        int             bpe_count;
+        uint64_t        bpe_count;
         avl_node_t      bpe_node;
 } brt_pending_entry_t;
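
To make the bv_bitmap comment in the hunk above concrete: one dirty
bit covers as many entcounts as fit into one BRT_BLOCKSIZE on-disk
block.  A small self-contained sketch of the index-to-bit mapping,
assuming an illustrative 4 kB BRT_BLOCKSIZE (so updating
bv_entcount[2] and bv_entcount[5] both set bit 0, and only the first
block is rewritten at sync):

    #include <stdint.h>
    #include <stddef.h>

    #define BRT_BLOCKSIZE   4096    /* illustrative value */
    #define ENTCNTS_PER_BIT (BRT_BLOCKSIZE / sizeof (uint16_t)) /* 2048 */
    #define BITS_PER_WORD   (8 * sizeof (unsigned long))

    /* Mark the on-disk block holding bv_entcount[idx] as dirty. */
    static inline void
    bv_bitmap_set(unsigned long *bitmap, uint64_t idx)
    {
            uint64_t bit = idx / ENTCNTS_PER_BIT;

            bitmap[bit / BITS_PER_WORD] |= 1UL << (bit % BITS_PER_WORD);
    }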

include/sys/spa.h

@@ -53,6 +53,7 @@ extern "C" {
 /*
  * Forward references that lots of things need.
  */
+typedef struct brt_vdev brt_vdev_t;
 typedef struct spa spa_t;
 typedef struct vdev vdev_t;
 typedef struct metaslab metaslab_t;

include/sys/spa_impl.h

@@ -412,8 +412,12 @@ struct spa {
         uint64_t        spa_dedup_dspace;       /* Cache get_dedup_dspace() */
         uint64_t        spa_dedup_checksum;     /* default dedup checksum */
         uint64_t        spa_dspace;             /* dspace in normal class */
+        uint64_t        spa_rdspace;            /* raw (non-dedup) --//-- */
         boolean_t       spa_active_ddt_prune;   /* ddt prune process active */
-        struct brt      *spa_brt;               /* in-core BRT */
+        brt_vdev_t      **spa_brt_vdevs;        /* array of per-vdev BRTs */
+        uint64_t        spa_brt_nvdevs;         /* number of vdevs in BRT */
+        uint64_t        spa_brt_rangesize;      /* pool's BRT range size */
+        krwlock_t       spa_brt_lock;           /* Protects brt_vdevs/nvdevs */
         kmutex_t        spa_vdev_top_lock;      /* dueling offline/remove */
         kmutex_t        spa_proc_lock;          /* protects spa_proc* */
         kcondvar_t      spa_proc_cv;            /* spa_proc_state transitions */
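
With the old pool-wide brt_usedspace/brt_savedspace counters gone,
totals are presumably derived from the per-vdev structures.  A hedged
sketch (not the commit's code; the helper name is hypothetical) of
summing bv_savedspace across spa_brt_vdevs under the new reader lock:

    /* Sketch only: aggregate per-vdev savings for reporting. */
    static uint64_t
    brt_sum_savedspace(spa_t *spa)
    {
            uint64_t sum = 0;

            rw_enter(&spa->spa_brt_lock, RW_READER);
            for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
                    sum += spa->spa_brt_vdevs[vdevid]->bv_savedspace;
            rw_exit(&spa->spa_brt_lock);

            return (sum);
    }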

module/zfs/brt.c (diff suppressed because it is too large)

module/zfs/spa_misc.c

@@ -1870,13 +1870,7 @@ spa_get_slop_space(spa_t *spa)
         if (spa->spa_dedup_dspace == ~0ULL)
                 spa_update_dspace(spa);
 
-        /*
-         * spa_get_dspace() includes the space only logically "used" by
-         * deduplicated data, so since it's not useful to reserve more
-         * space with more deduplicated data, we subtract that out here.
-         */
-        space =
-            spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa);
+        space = spa->spa_rdspace;
         slop = MIN(space >> spa_slop_shift, spa_max_slop);
 
         /*
@@ -1912,8 +1906,7 @@ spa_get_checkpoint_space(spa_t *spa)
 void
 spa_update_dspace(spa_t *spa)
 {
-        spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
-            ddt_get_dedup_dspace(spa) + brt_get_dspace(spa);
+        spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa));
         if (spa->spa_nonallocating_dspace > 0) {
                 /*
                  * Subtract the space provided by all non-allocating vdevs that
@@ -1933,9 +1926,11 @@ spa_update_dspace(spa_t *spa)
                  * doesn't matter that the data we are moving may be
                  * allocated twice (on the old device and the new device).
                  */
-                ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace);
-                spa->spa_dspace -= spa->spa_nonallocating_dspace;
+                ASSERT3U(spa->spa_rdspace, >=, spa->spa_nonallocating_dspace);
+                spa->spa_rdspace -= spa->spa_nonallocating_dspace;
         }
+        spa->spa_dspace = spa->spa_rdspace + ddt_get_dedup_dspace(spa) +
+            brt_get_dspace(spa);
 }
 
 /*
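
The accounting change is easier to see with hypothetical numbers.
spa_rdspace caches the raw (non-dedup, non-clone) space, spa_dspace
still layers the logical dedup and cloning "savings" on top for
reporting, and the slop reserve is now sized from the raw figure
alone, giving the same result the old explicit subtraction produced
without recomputing it on every spa_get_slop_space() call:

    /*
     * Illustrative figures only:
     *   metaslab raw space                 100 GiB  -> spa_rdspace
     *   ddt_get_dedup_dspace(spa)            5 GiB  (dedup savings)
     *   brt_get_dspace(spa)                  2 GiB  (cloning savings)
     *
     * spa_dspace = 100 + 5 + 2 = 107 GiB    (user-visible)
     * slop = MIN(spa_rdspace >> spa_slop_shift, spa_max_slop)
     *      = MIN(100 GiB >> 5, spa_max_slop) = 3.125 GiB
     */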