Improve write issue taskqs utilization

- Reduce number of allocators on small systems down to one per 4
CPU cores, keeping maximum at 4 on 16+ core systems. Small systems
should not have the lock contention multiple allocators are supposed
to solve, while having several metaslabs open and modified each
TXG is not free.
 - Reduce number of write issue taskqs down to one per 16 CPU
cores and an integer fraction of number of allocators.  On mid-
sized systems, where multiple allocators already make sense, too
many write issue taskqs may reduce write speed on single-file
workloads, since a single file is handled by only one taskq to
reduce fragmentation. On large systems, which can actually benefit
from the better IOPS of many taskqs, the bottleneck is less important,
since in the worst case there will be at least 16 cores to handle it.
 - Distribute dnodes between allocators (and taskqs) in a round-
robin fashion instead of relying on sync taskqs to be balanced.
The latter is not guaranteed and may depend on scheduling.
 - Remove io_wr_iss_tq from struct zio.  io_allocator is enough.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16130
This commit is contained in:
Alexander Motin
2024-05-01 14:07:20 -04:00
committed by GitHub
parent 8fd3a5d02f
commit 645b833079
8 changed files with 98 additions and 47 deletions
+2
View File
@@ -1664,12 +1664,14 @@ sync_dnodes_task(void *arg)
sync_objset_arg_t *soa = sda->sda_soa;
objset_t *os = soa->soa_os;
uint_t allocator = spa_acq_allocator(os->os_spa);
multilist_sublist_t *ms =
multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
dmu_objset_sync_dnodes(ms, soa->soa_tx);
multilist_sublist_unlock(ms);
spa_rel_allocator(os->os_spa, allocator);
kmem_free(sda, sizeof (*sda));
+52 -29
View File
@@ -208,7 +208,7 @@ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */
static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
#endif
static uint_t zio_taskq_wr_iss_ncpus = 0;
static uint_t zio_taskq_write_tpq = 16;
/*
* Report any spa_load_verify errors found, but do not fail spa_load.
@@ -1067,17 +1067,16 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
case ZTI_MODE_SYNC:
/*
* Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
* not to exceed the number of spa allocators.
* Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
* not to exceed the number of spa allocators, and align to it.
*/
if (zio_taskq_wr_iss_ncpus == 0) {
count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
} else {
count = MAX(1,
boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
}
cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
count = MIN(count, spa->spa_alloc_count);
while (spa->spa_alloc_count % count != 0 &&
spa->spa_alloc_count < count * 2)
count--;
/*
* zio_taskq_batch_pct is unbounded and may exceed 100%, but no
@@ -1495,15 +1494,11 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
ASSERT3P(tqs->stqs_taskq, !=, NULL);
ASSERT3U(tqs->stqs_count, !=, 0);
if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
(zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
/* dispatch to assigned write issue taskq */
tq = zio->io_wr_iss_tq;
return (tq);
}
if (tqs->stqs_count == 1) {
tq = tqs->stqs_taskq[0];
} else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
(zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) {
tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
} else {
tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
}
@@ -10233,16 +10228,10 @@ spa_sync_tq_create(spa_t *spa, const char *name)
VERIFY(spa->spa_sync_tq != NULL);
VERIFY(kthreads != NULL);
spa_taskqs_t *tqs =
&spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
spa_syncthread_info_t *ti = spa->spa_syncthreads;
for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
for (int i = 0; i < nthreads; i++, ti++) {
ti->sti_thread = kthreads[i];
if (w == tqs->stqs_count) {
w = 0;
}
ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
ti->sti_allocator = i;
}
kmem_free(kthreads, sizeof (*kthreads) * nthreads);
@@ -10261,6 +10250,42 @@ spa_sync_tq_destroy(spa_t *spa)
spa->spa_sync_tq = NULL;
}
/*
 * Reserve an allocator index for the calling thread and return it.
 *
 * When the pool has a single allocator, index 0 is returned right away.
 * Otherwise, under sau_lock, the search starts one past the saved rotor
 * and walks forward (wrapping at spa_alloc_count) until it reaches an
 * index whose in-use flag is clear.  That index is flagged as in use and
 * saved as the new rotor.  The search does not terminate until a clear
 * flag is found.
 *
 * After the lock is dropped, the chosen index is stored in the
 * spa_syncthreads entry whose sti_thread is curthread; the assertion
 * checks that such an entry was found.
 */
uint_t
spa_acq_allocator(spa_t *spa)
{
	/* A lone allocator needs no arbitration. */
	if (spa->spa_alloc_count == 1)
		return (0);

	mutex_enter(&spa->spa_allocs_use->sau_lock);
	uint_t slot = spa->spa_allocs_use->sau_rotor;
	for (;;) {
		/* Step to the next index, wrapping at the allocator count. */
		slot++;
		if (slot == spa->spa_alloc_count)
			slot = 0;
		if (!spa->spa_allocs_use->sau_inuse[slot])
			break;
	}
	spa->spa_allocs_use->sau_rotor = slot;
	spa->spa_allocs_use->sau_inuse[slot] = B_TRUE;
	mutex_exit(&spa->spa_allocs_use->sau_lock);

	/* Locate the calling thread's entry and record the chosen index. */
	int idx = 0;
	spa_syncthread_info_t *sti = spa->spa_syncthreads;
	while (idx < spa->spa_alloc_count && sti->sti_thread != curthread) {
		idx++;
		sti++;
	}
	if (idx < spa->spa_alloc_count)
		sti->sti_allocator = slot;
	ASSERT3S(idx, <, spa->spa_alloc_count);

	return (slot);
}
/*
 * Release an allocator index by clearing its in-use flag.
 *
 * Does nothing when the pool has at most one allocator.  Note that the
 * flag is cleared here without taking sau_lock.
 */
void
spa_rel_allocator(spa_t *spa, uint_t allocator)
{
	if (spa->spa_alloc_count <= 1)
		return;

	spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
}
void
spa_select_allocator(zio_t *zio)
{
@@ -10288,8 +10313,7 @@ spa_select_allocator(zio_t *zio)
spa_syncthread_info_t *ti = spa->spa_syncthreads;
for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
if (ti->sti_thread == curthread) {
zio->io_allocator = i;
zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
zio->io_allocator = ti->sti_allocator;
return;
}
}
@@ -10306,7 +10330,6 @@ spa_select_allocator(zio_t *zio)
bm->zb_blkid >> 20);
zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
zio->io_wr_iss_tq = NULL;
}
/*
@@ -10919,5 +10942,5 @@ ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
#endif
/* END CSTYLED */
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
"Number of CPUs to run write issue taskqs");
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
"Number of CPUs per write issue taskq");
+19 -3
View File
@@ -394,6 +394,7 @@ static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
* Number of allocators to use, per spa instance
*/
static int spa_num_allocators = 4;
static int spa_cpus_per_allocator = 4;
/*
* Spa active allocator.
@@ -747,8 +748,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
if (altroot)
spa->spa_root = spa_strdup(altroot);
/* Do not allow more allocators than CPUs. */
spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus);
/* Do not allow more allocators than fraction of CPUs. */
spa->spa_alloc_count = MAX(MIN(spa_num_allocators,
boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1);
spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
sizeof (spa_alloc_t), KM_SLEEP);
@@ -758,6 +760,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
}
if (spa->spa_alloc_count > 1) {
spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t,
sau_inuse[spa->spa_alloc_count]), KM_SLEEP);
mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT,
NULL);
}
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
@@ -853,6 +861,11 @@ spa_remove(spa_t *spa)
}
kmem_free(spa->spa_allocs, spa->spa_alloc_count *
sizeof (spa_alloc_t));
if (spa->spa_alloc_count > 1) {
mutex_destroy(&spa->spa_allocs_use->sau_lock);
kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t,
sau_inuse[spa->spa_alloc_count]));
}
avl_destroy(&spa->spa_metaslabs_by_flushed);
avl_destroy(&spa->spa_sm_logs_by_txg);
@@ -3097,4 +3110,7 @@ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
param_get_uint, ZMOD_RW, "Reserved free space in pool");
ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
"Number of allocators per spa, capped by ncpus");
"Number of allocators per spa");
ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW,
"Minimum number of CPUs per allocators");
-1
View File
@@ -2925,7 +2925,6 @@ static void
zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
{
cio->io_allocator = pio->io_allocator;
cio->io_wr_iss_tq = pio->io_wr_iss_tq;
}
static void