diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 7886f728b..06489ea84 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -321,13 +321,14 @@ typedef struct dmu_buf_impl {
 	uint8_t db_dirtycnt;
 } dmu_buf_impl_t;
 
-/* Note: the dbuf hash table is exposed only for the mdb module */
-#define	DBUF_MUTEXES 2048
-#define	DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
+#define	DBUF_HASH_MUTEX(h, idx) \
+	(&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)])
+
 typedef struct dbuf_hash_table {
 	uint64_t hash_table_mask;
+	uint64_t hash_mutex_mask;
 	dmu_buf_impl_t **hash_table;
-	kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned;
+	kmutex_t *hash_mutexes;
 } dbuf_hash_table_t;
 
 typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t);
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 90a8ca788..805c037e3 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -66,6 +66,12 @@ Set the size of the dbuf metadata cache
 .Pq Sy dbuf_metadata_cache_max_bytes
 to a log2 fraction of the target ARC size.
 .
+.It Sy dbuf_mutex_cache_shift Ns = Ns Sy 0 Pq uint
+Set the size of the mutex array for the dbuf cache.
+When set to
+.Sy 0
+the array is dynamically sized based on total system memory.
+.
 .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq int
 dnode slots allocated in a single operation as a power of 2.
 The default value minimizes lock contention for the bulk operation performed.
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 9e36afd31..85ba4e322 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -99,6 +99,11 @@ typedef struct dbuf_stats {
 	 * already created and in the dbuf hash table.
 	 */
 	kstat_named_t hash_insert_race;
+	/*
+	 * Number of entries in the hash table dbuf and mutex arrays.
+	 */
+	kstat_named_t hash_table_count;
+	kstat_named_t hash_mutex_count;
 	/*
 	 * Statistics about the size of the metadata dbuf cache.
 	 */
@@ -131,6 +136,8 @@ dbuf_stats_t dbuf_stats = {
 	{ "hash_chains",			KSTAT_DATA_UINT64 },
 	{ "hash_chain_max",			KSTAT_DATA_UINT64 },
 	{ "hash_insert_race",			KSTAT_DATA_UINT64 },
+	{ "hash_table_count",			KSTAT_DATA_UINT64 },
+	{ "hash_mutex_count",			KSTAT_DATA_UINT64 },
 	{ "metadata_cache_count",		KSTAT_DATA_UINT64 },
 	{ "metadata_cache_size_bytes",		KSTAT_DATA_UINT64 },
 	{ "metadata_cache_size_bytes_max",	KSTAT_DATA_UINT64 },
@@ -227,6 +234,9 @@ static unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
 static int dbuf_cache_shift = 5;
 static int dbuf_metadata_cache_shift = 6;
 
+/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
+static uint32_t dbuf_mutex_cache_shift = 0;
+
 static unsigned long dbuf_cache_target_bytes(void);
 static unsigned long dbuf_metadata_cache_target_bytes(void);
 
@@ -838,6 +848,7 @@ static int
 dbuf_kstat_update(kstat_t *ksp, int rw)
 {
 	dbuf_stats_t *ds = ksp->ks_data;
+	dbuf_hash_table_t *h = &dbuf_hash_table;
 
 	if (rw == KSTAT_WRITE)
 		return (SET_ERROR(EACCES));
@@ -867,6 +878,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
 	    wmsum_value(&dbuf_sums.hash_chains);
 	ds->hash_insert_race.value.ui64 =
 	    wmsum_value(&dbuf_sums.hash_insert_race);
+	ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
+	ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
 	ds->metadata_cache_count.value.ui64 =
 	    wmsum_value(&dbuf_sums.metadata_cache_count);
 	ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
@@ -879,9 +892,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
 void
 dbuf_init(void)
 {
-	uint64_t hsize = 1ULL << 16;
+	uint64_t hmsize, hsize = 1ULL << 16;
 	dbuf_hash_table_t *h = &dbuf_hash_table;
-	int i;
 
 	/*
 	 * The hash table is big enough to fill one eighth of physical memory
@@ -892,29 +904,42 @@ dbuf_init(void)
 	while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
 		hsize <<= 1;
 
-retry:
-	h->hash_table_mask = hsize - 1;
-#if defined(_KERNEL)
+	h->hash_table = NULL;
+	while (h->hash_table == NULL) {
+		h->hash_table_mask = hsize - 1;
+
+		h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+		if (h->hash_table == NULL)
+			hsize >>= 1;
+
+		ASSERT3U(hsize, >=, 1ULL << 10);
+	}
+
 	/*
-	 * Large allocations which do not require contiguous pages
-	 * should be using vmem_alloc() in the linux kernel
+	 * The hash table buckets are protected by an array of mutexes where
+	 * each mutex is responsible for protecting 128 buckets.  A minimum
+	 * array size of 8192 is targeted to avoid contention.
 	 */
-	h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
-#else
-	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
-#endif
-	if (h->hash_table == NULL) {
-		/* XXX - we should really return an error instead of assert */
-		ASSERT(hsize > (1ULL << 10));
-		hsize >>= 1;
-		goto retry;
+	if (dbuf_mutex_cache_shift == 0)
+		hmsize = MAX(hsize >> 7, 1ULL << 13);
+	else
+		hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);
+
+	h->hash_mutexes = NULL;
+	while (h->hash_mutexes == NULL) {
+		h->hash_mutex_mask = hmsize - 1;
+
+		h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
+		    KM_SLEEP);
+		if (h->hash_mutexes == NULL)
+			hmsize >>= 1;
 	}
 
 	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL,
 	    NULL, 0);
 
-	for (i = 0; i < DBUF_MUTEXES; i++)
+	for (int i = 0; i < hmsize; i++)
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 
 	dbuf_stats_init(h);
@@ -941,7 +966,7 @@ retry:
 
 	wmsum_init(&dbuf_sums.cache_count, 0);
 	wmsum_init(&dbuf_sums.cache_total_evicts, 0);
-	for (i = 0; i < DN_MAX_LEVELS; i++) {
+	for (int i = 0; i < DN_MAX_LEVELS; i++) {
 		wmsum_init(&dbuf_sums.cache_levels[i], 0);
 		wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
 	}
@@ -957,7 +982,7 @@ retry:
 	    KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 	if (dbuf_ksp != NULL) {
-		for (i = 0; i < DN_MAX_LEVELS; i++) {
+		for (int i = 0; i < DN_MAX_LEVELS; i++) {
 			snprintf(dbuf_stats.cache_levels[i].name,
 			    KSTAT_STRLEN, "cache_level_%d", i);
 			dbuf_stats.cache_levels[i].data_type =
@@ -977,21 +1002,16 @@ void
 dbuf_fini(void)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
-	int i;
 
 	dbuf_stats_destroy();
 
-	for (i = 0; i < DBUF_MUTEXES; i++)
+	for (int i = 0; i < (h->hash_mutex_mask + 1); i++)
 		mutex_destroy(&h->hash_mutexes[i]);
 
-#if defined(_KERNEL)
-	/*
-	 * Large allocations which do not require contiguous pages
-	 * should be using vmem_free() in the linux kernel
-	 */
+
 	vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-#else
-	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-#endif
+	vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *
+	    sizeof (kmutex_t));
+
 	kmem_cache_destroy(dbuf_kmem_cache);
 	taskq_destroy(dbu_evict_taskq);
 
@@ -1018,7 +1038,7 @@ dbuf_fini(void)
 
 	wmsum_fini(&dbuf_sums.cache_count);
 	wmsum_fini(&dbuf_sums.cache_total_evicts);
-	for (i = 0; i < DN_MAX_LEVELS; i++) {
+	for (int i = 0; i < DN_MAX_LEVELS; i++) {
 		wmsum_fini(&dbuf_sums.cache_levels[i]);
 		wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
 	}
@@ -5117,3 +5137,6 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
 	"Set size of dbuf metadata cache to log2 fraction of arc size.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
+	"Set size of dbuf cache mutex array as log2 shift.");
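
A minimal user-space C sketch of the sizing policy and the DBUF_HASH_MUTEX()
masking introduced above; it is not part of the patch. The hsize value, the
local dbuf_mutex_cache_shift, and the MAX/MIN macros are stand-ins for the
kernel-side definitions, chosen only for illustration:

#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t hsize = 1ULL << 20;		/* hash table bucket count */
	uint32_t dbuf_mutex_cache_shift = 0;	/* 0 selects dynamic sizing */
	uint64_t hmsize;

	/* Dynamic policy: one mutex per 128 buckets, floored at 8192. */
	if (dbuf_mutex_cache_shift == 0)
		hmsize = MAX(hsize >> 7, 1ULL << 13);
	else
		hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);

	/*
	 * DBUF_HASH_MUTEX() picks a mutex by masking the bucket index;
	 * hmsize is a power of two, so (idx & (hmsize - 1)) is a cheap
	 * idx % hmsize.
	 */
	uint64_t hash_mutex_mask = hmsize - 1;
	uint64_t idx = 123456789;

	printf("%llu buckets, %llu mutexes, bucket %llu -> mutex %llu\n",
	    (unsigned long long)hsize, (unsigned long long)hmsize,
	    (unsigned long long)idx,
	    (unsigned long long)(idx & hash_mutex_mask));
	return (0);
}

With these example values the dynamic path sizes 1048576 buckets to 8192
mutexes (hsize >> 7 equals the 1ULL << 13 floor exactly). The two counters
added to dbuf_kstat_update() expose the resulting array sizes at runtime; on
Linux they should appear as hash_table_count and hash_mutex_count alongside
the existing dbuf statistics in /proc/spl/kstat/zfs/dbufstats.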