Dynamically size dbuf hash mutex array

Incorrectly sizing the array of hash locks used to protect the
dbuf hash table can lead to contention and reduce performance.
We could unconditionally allocate a larger array for the locks,
but that would be wasteful, particularly on low-memory systems.
Instead, dynamically allocate the array of locks and scale it
based on total system memory.
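
For illustration, a rough userland sketch of the sizing heuristic is
shown below.  The `total_memory` and `avg_blocksize` values are
hypothetical stand-ins for arc_all_memory() and
zfs_arc_average_blocksize; the floor of 8192 locks mirrors the new
minimum.

    /*
     * Rough sketch of the sizing heuristic, not the kernel code.  The
     * total_memory and avg_blocksize values are illustrative stand-ins
     * for arc_all_memory() and zfs_arc_average_blocksize.
     */
    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t total_memory = 16ULL << 30;    /* assume 16 GiB of RAM */
        uint64_t avg_blocksize = 8192;          /* bytes per cached block */
        uint64_t hsize = 1ULL << 16;

        /* Hash table: one bucket per average-sized block in 1/8 of memory. */
        while (hsize * avg_blocksize < total_memory / 8)
            hsize <<= 1;

        /* Mutex array: one lock per 128 buckets, floored at 8192 locks. */
        uint64_t hmsize = hsize >> 7;
        if (hmsize < (1ULL << 13))
            hmsize = 1ULL << 13;

        printf("hash buckets: %llu, hash mutexes: %llu\n",
            (unsigned long long)hsize, (unsigned long long)hmsize);
        return (0);
    }

With these illustrative defaults a 16 GiB system gets 262144 buckets
and the 8192-lock floor; the mutex array only grows beyond 8192 on
systems with more than roughly 64 GiB of memory.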

Additionally, add a new `dbuf_mutex_cache_shift` module option
which can be used to override the hash lock array size.  This is
disabled by default (`dbuf_mutex_cache_shift=0`) and can only be
set at module load time.  The minimum target array size is 8192
entries (2^13), up from the fixed DBUF_MUTEXES value of 2048 that
it replaces.
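
For example, on Linux the override can be applied at module load time
through /etc/modprobe.d/zfs.conf (the shift shown here is only
illustrative; non-zero values are clamped to 24):

    options zfs dbuf_mutex_cache_shift=14

Because the parameter is registered read-only (ZMOD_RD), changing it
requires reloading the zfs module.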

Note that the entry counts of the dbuf hash table and of the mutex
array are now reported by the /proc/spl/kstat/zfs/dbufstats kstat
as `hash_table_count` and `hash_mutex_count`.
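
For example, the new entries can be inspected with something like:

    grep 'hash_.*count' /proc/spl/kstat/zfs/dbufstats

Both values report the allocated array sizes (mask + 1).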

Finally, this change removes the _KERNEL conditional checks.
These were not required since for the user space build there
is no difference between the kmem and vmem interfaces.

Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #13928
Brian Behlendorf 2022-09-19 12:17:11 -07:00
parent 223b04d23d
commit 505df8d133
3 changed files with 64 additions and 34 deletions

include/sys/dbuf.h

@@ -321,13 +321,14 @@ typedef struct dmu_buf_impl {
 	uint8_t db_dirtycnt;
 } dmu_buf_impl_t;
 
 /* Note: the dbuf hash table is exposed only for the mdb module */
-#define	DBUF_MUTEXES 2048
-#define	DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
+#define	DBUF_HASH_MUTEX(h, idx) \
+	(&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)])
 typedef struct dbuf_hash_table {
 	uint64_t hash_table_mask;
+	uint64_t hash_mutex_mask;
 	dmu_buf_impl_t **hash_table;
-	kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned;
+	kmutex_t *hash_mutexes;
 } dbuf_hash_table_t;
 
 typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t);

man/man4/zfs.4

@@ -66,6 +66,12 @@ Set the size of the dbuf metadata cache
 .Pq Sy dbuf_metadata_cache_max_bytes
 to a log2 fraction of the target ARC size.
 .
+.It Sy dbuf_mutex_cache_shift Ns = Ns Sy 0 Pq uint
+Set the size of the mutex array for the dbuf cache.
+When set to
+.Sy 0
+the array is dynamically sized based on total system memory.
+.
 .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq int
 dnode slots allocated in a single operation as a power of 2.
 The default value minimizes lock contention for the bulk operation performed.

module/zfs/dbuf.c

@@ -99,6 +99,11 @@ typedef struct dbuf_stats {
 	 * already created and in the dbuf hash table.
 	 */
 	kstat_named_t hash_insert_race;
+	/*
+	 * Number of entries in the hash table dbuf and mutex arrays.
+	 */
+	kstat_named_t hash_table_count;
+	kstat_named_t hash_mutex_count;
 	/*
 	 * Statistics about the size of the metadata dbuf cache.
 	 */
@@ -131,6 +136,8 @@ dbuf_stats_t dbuf_stats = {
 	{ "hash_chains", KSTAT_DATA_UINT64 },
 	{ "hash_chain_max", KSTAT_DATA_UINT64 },
 	{ "hash_insert_race", KSTAT_DATA_UINT64 },
+	{ "hash_table_count", KSTAT_DATA_UINT64 },
+	{ "hash_mutex_count", KSTAT_DATA_UINT64 },
 	{ "metadata_cache_count", KSTAT_DATA_UINT64 },
 	{ "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
 	{ "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
@@ -227,6 +234,9 @@ static unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
 static int dbuf_cache_shift = 5;
 static int dbuf_metadata_cache_shift = 6;
 
+/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
+static uint32_t dbuf_mutex_cache_shift = 0;
+
 static unsigned long dbuf_cache_target_bytes(void);
 static unsigned long dbuf_metadata_cache_target_bytes(void);
@@ -838,6 +848,7 @@ static int
 dbuf_kstat_update(kstat_t *ksp, int rw)
 {
 	dbuf_stats_t *ds = ksp->ks_data;
+	dbuf_hash_table_t *h = &dbuf_hash_table;
 
 	if (rw == KSTAT_WRITE)
 		return (SET_ERROR(EACCES));
@@ -867,6 +878,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
 	    wmsum_value(&dbuf_sums.hash_chains);
 	ds->hash_insert_race.value.ui64 =
 	    wmsum_value(&dbuf_sums.hash_insert_race);
+	ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
+	ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
 	ds->metadata_cache_count.value.ui64 =
 	    wmsum_value(&dbuf_sums.metadata_cache_count);
 	ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
@@ -879,9 +892,8 @@
 void
 dbuf_init(void)
 {
-	uint64_t hsize = 1ULL << 16;
+	uint64_t hmsize, hsize = 1ULL << 16;
 	dbuf_hash_table_t *h = &dbuf_hash_table;
-	int i;
 
 	/*
 	 * The hash table is big enough to fill one eighth of physical memory
@@ -892,29 +904,42 @@
 	while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
 		hsize <<= 1;
 
-retry:
-	h->hash_table_mask = hsize - 1;
-#if defined(_KERNEL)
+	h->hash_table = NULL;
+	while (h->hash_table == NULL) {
+		h->hash_table_mask = hsize - 1;
+
+		h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+		if (h->hash_table == NULL)
+			hsize >>= 1;
+
+		ASSERT3U(hsize, >=, 1ULL << 10);
+	}
+
 	/*
-	 * Large allocations which do not require contiguous pages
-	 * should be using vmem_alloc() in the linux kernel
+	 * The hash table buckets are protected by an array of mutexes where
+	 * each mutex is reponsible for protecting 128 buckets.  A minimum
+	 * array size of 8192 is targeted to avoid contention.
 	 */
-	h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
-#else
-	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
-#endif
-	if (h->hash_table == NULL) {
-		/* XXX - we should really return an error instead of assert */
-		ASSERT(hsize > (1ULL << 10));
-		hsize >>= 1;
-		goto retry;
+	if (dbuf_mutex_cache_shift == 0)
+		hmsize = MAX(hsize >> 7, 1ULL << 13);
+	else
+		hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);
+
+	h->hash_mutexes = NULL;
+	while (h->hash_mutexes == NULL) {
+		h->hash_mutex_mask = hmsize - 1;
+
+		h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
+		    KM_SLEEP);
+		if (h->hash_mutexes == NULL)
+			hmsize >>= 1;
 	}
 
 	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 
-	for (i = 0; i < DBUF_MUTEXES; i++)
+	for (int i = 0; i < hmsize; i++)
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 
 	dbuf_stats_init(h);
@@ -941,7 +966,7 @@ retry:
 	wmsum_init(&dbuf_sums.cache_count, 0);
 	wmsum_init(&dbuf_sums.cache_total_evicts, 0);
-	for (i = 0; i < DN_MAX_LEVELS; i++) {
+	for (int i = 0; i < DN_MAX_LEVELS; i++) {
 		wmsum_init(&dbuf_sums.cache_levels[i], 0);
 		wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
 	}
@@ -957,7 +982,7 @@ retry:
 	    KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 	if (dbuf_ksp != NULL) {
-		for (i = 0; i < DN_MAX_LEVELS; i++) {
+		for (int i = 0; i < DN_MAX_LEVELS; i++) {
 			snprintf(dbuf_stats.cache_levels[i].name,
 			    KSTAT_STRLEN, "cache_level_%d", i);
 			dbuf_stats.cache_levels[i].data_type =
@@ -977,21 +1002,16 @@ void
 dbuf_fini(void)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
-	int i;
 
 	dbuf_stats_destroy();
 
-	for (i = 0; i < DBUF_MUTEXES; i++)
+	for (int i = 0; i < (h->hash_mutex_mask + 1); i++)
 		mutex_destroy(&h->hash_mutexes[i]);
-#if defined(_KERNEL)
-	/*
-	 * Large allocations which do not require contiguous pages
-	 * should be using vmem_free() in the linux kernel
-	 */
+
 	vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-#else
-	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-#endif
+	vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *
+	    sizeof (kmutex_t));
 
 	kmem_cache_destroy(dbuf_kmem_cache);
 	taskq_destroy(dbu_evict_taskq);
@@ -1018,7 +1038,7 @@ dbuf_fini(void)
 	wmsum_fini(&dbuf_sums.cache_count);
 	wmsum_fini(&dbuf_sums.cache_total_evicts);
-	for (i = 0; i < DN_MAX_LEVELS; i++) {
+	for (int i = 0; i < DN_MAX_LEVELS; i++) {
 		wmsum_fini(&dbuf_sums.cache_levels[i]);
 		wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
 	}
@@ -5117,3 +5137,6 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
 	"Set size of dbuf metadata cache to log2 fraction of arc size.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
+	"Set size of dbuf cache mutex array as log2 shift.");