diff --git a/include/sys/spa.h b/include/sys/spa.h
index 001c221fb..3073c4d1b 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -829,6 +829,8 @@ extern uint_t zfs_sync_pass_deferred_free;
 /* spa sync taskqueues */
 taskq_t *spa_sync_tq_create(spa_t *spa, const char *name);
 void spa_sync_tq_destroy(spa_t *spa);
+uint_t spa_acq_allocator(spa_t *spa);
+void spa_rel_allocator(spa_t *spa, uint_t allocator);
 void spa_select_allocator(zio_t *zio);
 
 /* spa namespace global mutex */
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index d7da085ab..a40914ec5 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -63,6 +63,12 @@ typedef struct spa_alloc {
 	avl_tree_t	spaa_tree;
 } ____cacheline_aligned spa_alloc_t;
 
+typedef struct spa_allocs_use {
+	kmutex_t	sau_lock;
+	uint_t		sau_rotor;
+	boolean_t	sau_inuse[];
+} spa_allocs_use_t;
+
 typedef struct spa_error_entry {
 	zbookmark_phys_t	se_bookmark;
 	char			*se_name;
@@ -192,7 +198,7 @@ typedef struct spa_taskqs {
 /* one for each thread in the spa sync taskq */
 typedef struct spa_syncthread_info {
 	kthread_t	*sti_thread;
-	taskq_t		*sti_wr_iss_tq;		/* assigned wr_iss taskq */
+	uint_t		sti_allocator;
 } spa_syncthread_info_t;
 
 typedef enum spa_all_vdev_zap_action {
@@ -270,6 +276,7 @@ struct spa {
 	 * allocation performance in write-heavy workloads.
 	 */
 	spa_alloc_t	*spa_allocs;
+	spa_allocs_use_t	*spa_allocs_use;
 	int		spa_alloc_count;
 	int		spa_active_allocator;	/* selectable allocator */
 
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 4037b4299..77c70b9b4 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -528,9 +528,6 @@ struct zio {
 
 	/* Taskq dispatching state */
 	taskq_ent_t	io_tqent;
-
-	/* write issue taskq selection, based upon sync thread */
-	taskq_t		*io_wr_iss_tq;
 };
 
 enum blk_verify_flag {
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index ef0385d42..5edd80659 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -525,10 +525,17 @@ most ZPL operations (e.g. write, create) will return
 .
 .It Sy spa_num_allocators Ns = Ns Sy 4 Pq int
 Determines the number of block alloctators to use per spa instance.
-Capped by the number of actual CPUs in the system.
+Capped by the number of actual CPUs in the system via
+.Sy spa_cpus_per_allocator .
 .Pp
 Note that setting this value too high could result in performance
 degredation and/or excess fragmentation.
+Set value only applies to pools imported/created after that.
+.
+.It Sy spa_cpus_per_allocator Ns = Ns Sy 4 Pq int
+Determines the minimum number of CPUs in a system for each block allocator
+per spa instance.
+Set value only applies to pools imported/created after that.
 .
 .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint
 Limits the number of on-disk error log entries that will be converted to the
@@ -2339,21 +2346,19 @@ Set value only applies to pools imported/created after that.
 .
 .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint
 Number of worker threads per taskq.
-Lower values improve I/O ordering and CPU utilization,
-while higher reduces lock contention.
+Higher values improve I/O ordering and CPU utilization,
+while lower values reduce lock contention.
+Set value only applies to pools imported/created after that.
 .Pp
 If
 .Sy 0 ,
 generate a system-dependent value close to 6 threads per taskq.
 Set value only applies to pools imported/created after that.
 .
-.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint
-Determines the number of CPUs to run write issue taskqs.
-.Pp
-When 0 (the default), the value to use is computed internally
-as the number of actual CPUs in the system divided by the
-.Sy spa_num_allocators
-value.
+.It Sy zio_taskq_write_tpq Ns = Ns Sy 16 Pq uint
+Determines the minimum number of threads per write issue taskq.
+Higher values improve CPU utilization at high throughput,
+while lower values reduce taskq lock contention at high IOPS.
 Set value only applies to pools imported/created after that.
 .
 .It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 5ea99f742..f1818ae15 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1664,12 +1664,14 @@ sync_dnodes_task(void *arg)
 	sync_objset_arg_t *soa = sda->sda_soa;
 	objset_t *os = soa->soa_os;
 
+	uint_t allocator = spa_acq_allocator(os->os_spa);
 	multilist_sublist_t *ms =
 	    multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
 
 	dmu_objset_sync_dnodes(ms, soa->soa_tx);
 
 	multilist_sublist_unlock(ms);
+	spa_rel_allocator(os->os_spa, allocator);
 
 	kmem_free(sda, sizeof (*sda));
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 147165ee8..ec2b674fb 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -208,7 +208,7 @@ static const uint_t	zio_taskq_basedc = 80;	/* base duty cycle */
 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
 #endif
 
-static uint_t	zio_taskq_wr_iss_ncpus = 0;
+static uint_t	zio_taskq_write_tpq = 16;
 
 /*
  * Report any spa_load_verify errors found, but do not fail spa_load.
@@ -1067,17 +1067,16 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 	case ZTI_MODE_SYNC:
 
 		/*
-		 * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
-		 * not to exceed the number of spa allocators.
+		 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
+		 * not to exceed the number of spa allocators, and align to it.
 		 */
-		if (zio_taskq_wr_iss_ncpus == 0) {
-			count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
-		} else {
-			count = MAX(1,
-			    boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
-		}
+		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
+		count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
 		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
 		count = MIN(count, spa->spa_alloc_count);
+		while (spa->spa_alloc_count % count != 0 &&
+		    spa->spa_alloc_count < count * 2)
+			count--;
 
 		/*
 		 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
@@ -1495,15 +1494,11 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
 	ASSERT3U(tqs->stqs_count, !=, 0);
 
-	if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
-	    (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
-		/* dispatch to assigned write issue taskq */
-		tq = zio->io_wr_iss_tq;
-		return (tq);
-	}
-
 	if (tqs->stqs_count == 1) {
 		tq = tqs->stqs_taskq[0];
+	} else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+	    (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) {
+		tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
 	} else {
 		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
 	}
@@ -10233,16 +10228,10 @@ spa_sync_tq_create(spa_t *spa, const char *name)
 	VERIFY(spa->spa_sync_tq != NULL);
 	VERIFY(kthreads != NULL);
 
-	spa_taskqs_t *tqs =
-	    &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
-
 	spa_syncthread_info_t *ti = spa->spa_syncthreads;
-	for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+	for (int i = 0; i < nthreads; i++, ti++) {
 		ti->sti_thread = kthreads[i];
-		if (w == tqs->stqs_count) {
-			w = 0;
-		}
-		ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+		ti->sti_allocator = i;
 	}
 
 	kmem_free(kthreads, sizeof (*kthreads) * nthreads);
@@ -10261,6 +10250,42 @@ spa_sync_tq_destroy(spa_t *spa)
 	spa->spa_sync_tq = NULL;
 }
 
+uint_t
+spa_acq_allocator(spa_t *spa)
+{
+	int i;
+
+	if (spa->spa_alloc_count == 1)
+		return (0);
+
+	mutex_enter(&spa->spa_allocs_use->sau_lock);
+	uint_t r = spa->spa_allocs_use->sau_rotor;
+	do {
+		if (++r == spa->spa_alloc_count)
+			r = 0;
+	} while (spa->spa_allocs_use->sau_inuse[r]);
+	spa->spa_allocs_use->sau_inuse[r] = B_TRUE;
+	spa->spa_allocs_use->sau_rotor = r;
+	mutex_exit(&spa->spa_allocs_use->sau_lock);
+
+	spa_syncthread_info_t *ti = spa->spa_syncthreads;
+	for (i = 0; i < spa->spa_alloc_count; i++, ti++) {
+		if (ti->sti_thread == curthread) {
+			ti->sti_allocator = r;
+			break;
+		}
+	}
+	ASSERT3S(i, <, spa->spa_alloc_count);
+	return (r);
+}
+
+void
+spa_rel_allocator(spa_t *spa, uint_t allocator)
+{
+	if (spa->spa_alloc_count > 1)
+		spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
+}
+
 void
 spa_select_allocator(zio_t *zio)
 {
@@ -10288,8 +10313,7 @@ spa_select_allocator(zio_t *zio)
 	spa_syncthread_info_t *ti = spa->spa_syncthreads;
 	for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
 		if (ti->sti_thread == curthread) {
-			zio->io_allocator = i;
-			zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+			zio->io_allocator = ti->sti_allocator;
 			return;
 		}
 	}
@@ -10306,7 +10330,6 @@ spa_select_allocator(zio_t *zio)
 	    bm->zb_blkid >> 20);
 
 	zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
-	zio->io_wr_iss_tq = NULL;
 }
 
 /*
@@ -10919,5 +10942,5 @@ ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
 #endif
 /* END CSTYLED */
 
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
-	"Number of CPUs to run write issue taskqs");
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
+	"Number of CPUs per write issue taskq");
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 5fb7847b5..e6d4a9bdb 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -394,6 +394,7 @@ static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
  * Number of allocators to use, per spa instance
  */
 static int spa_num_allocators = 4;
+static int spa_cpus_per_allocator = 4;
 
 /*
  * Spa active allocator.
@@ -747,8 +748,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	if (altroot)
 		spa->spa_root = spa_strdup(altroot);
 
-	/* Do not allow more allocators than CPUs. */
-	spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus);
+	/* Do not allow more allocators than fraction of CPUs. */
+	spa->spa_alloc_count = MAX(MIN(spa_num_allocators,
+	    boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1);
 
 	spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
 	    sizeof (spa_alloc_t), KM_SLEEP);
@@ -758,6 +760,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 		avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
 		    sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
 	}
+	if (spa->spa_alloc_count > 1) {
+		spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t,
+		    sau_inuse[spa->spa_alloc_count]), KM_SLEEP);
+		mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT,
+		    NULL);
+	}
 
 	avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
@@ -853,6 +861,11 @@ spa_remove(spa_t *spa)
 	}
 	kmem_free(spa->spa_allocs, spa->spa_alloc_count *
 	    sizeof (spa_alloc_t));
+	if (spa->spa_alloc_count > 1) {
+		mutex_destroy(&spa->spa_allocs_use->sau_lock);
+		kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t,
+		    sau_inuse[spa->spa_alloc_count]));
+	}
 
 	avl_destroy(&spa->spa_metaslabs_by_flushed);
 	avl_destroy(&spa->spa_sm_logs_by_txg);
@@ -3097,4 +3110,7 @@ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
 	param_get_uint, ZMOD_RW, "Reserved free space in pool");
 
 ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
-	"Number of allocators per spa, capped by ncpus");
+	"Number of allocators per spa");
+
+ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW,
+	"Minimum number of CPUs per allocator");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 0e7993d87..870343bf4 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2925,7 +2925,6 @@ static void
 zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
 {
 	cio->io_allocator = pio->io_allocator;
-	cio->io_wr_iss_tq = pio->io_wr_iss_tq;
 }
 
 static void
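
A note on the new ZTI_MODE_SYNC sizing in spa_taskqs_init(): the write issue taskq count is now derived from the usable CPU count rather than from a dedicated tunable, then reduced until it divides the allocator count, so each taskq serves a whole number of allocators. Below is a minimal user-space sketch of that arithmetic, not part of the patch; the function name sync_taskq_count and the local MAX/MIN macros are illustrative, and the defaults assumed are zio_taskq_batch_pct=80, zio_taskq_write_tpq=16, and 4 allocators.

#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static unsigned int
sync_taskq_count(unsigned int boot_ncpus, unsigned int batch_pct,
    unsigned int write_tpq, unsigned int alloc_count)
{
	/* One taskq per 'write_tpq' usable CPUs, at least one. */
	unsigned int cpus = MAX(1, boot_ncpus * batch_pct / 100);
	unsigned int count = MAX(1, cpus / MAX(1, write_tpq));

	/* Never below the rounded-up batch percentage... */
	count = MAX(count, (batch_pct + 99) / 100);
	/* ...never above the allocator count... */
	count = MIN(count, alloc_count);
	/* ...and shrink until it divides the allocator count evenly. */
	while (alloc_count % count != 0 && alloc_count < count * 2)
		count--;
	return (count);
}

int
main(void)
{
	/* 64 CPUs: 51 usable -> 3 taskqs, aligned down to 2. */
	printf("%u\n", sync_taskq_count(64, 80, 16, 4));	/* prints 2 */
	/* 24 CPUs: 19 usable, below 2*16 -> a single taskq. */
	printf("%u\n", sync_taskq_count(24, 80, 16, 4));	/* prints 1 */
	return (0);
}

With 64 CPUs and the defaults this yields two taskqs, each backing exactly two allocators; the zio->io_allocator % tqs->stqs_count dispatch in spa_taskq_dispatch_select() then keeps each allocator's writes on a stable taskq.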
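
The spa_acq_allocator()/spa_rel_allocator() pair added in spa.c implements a simple rotor over sau_inuse[] so that concurrent sync_dnodes_task() workers never share an allocator. The sketch below is a stand-alone user-space approximation of that search, omitting the sync-thread bookkeeping; pthread mutexes stand in for kmutex_t, and the allocs_use_t/acq_allocator/rel_allocator names are illustrative, not part of the patch.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for spa_allocs_use_t; au_count replaces spa_alloc_count. */
typedef struct allocs_use {
	pthread_mutex_t	au_lock;
	unsigned int	au_rotor;
	unsigned int	au_count;
	bool		au_inuse[];
} allocs_use_t;

/*
 * Rotor search as in spa_acq_allocator(): advance past busy slots,
 * claim the first free one, and leave the rotor there so the next
 * caller starts after it.  Callers never outnumber slots, so the
 * do/while loop always terminates.
 */
static unsigned int
acq_allocator(allocs_use_t *au)
{
	pthread_mutex_lock(&au->au_lock);
	unsigned int r = au->au_rotor;
	do {
		if (++r == au->au_count)
			r = 0;
	} while (au->au_inuse[r]);
	au->au_inuse[r] = true;
	au->au_rotor = r;
	pthread_mutex_unlock(&au->au_lock);
	return (r);
}

static void
rel_allocator(allocs_use_t *au, unsigned int allocator)
{
	/* Lock-free, as in the patch: only the owner clears its slot. */
	au->au_inuse[allocator] = false;
}

int
main(void)
{
	enum { SLOTS = 4 };
	allocs_use_t *au = calloc(1, sizeof (*au) + SLOTS * sizeof (bool));
	au->au_count = SLOTS;
	pthread_mutex_init(&au->au_lock, NULL);

	unsigned int a = acq_allocator(au);	/* rotor 0 -> slot 1 */
	unsigned int b = acq_allocator(au);	/* rotor 1 -> slot 2 */
	rel_allocator(au, a);
	unsigned int c = acq_allocator(au);	/* rotor 2 -> slot 3 */
	printf("%u %u %u\n", a, b, c);		/* prints "1 2 3" */
	free(au);
	return (0);
}

Release deliberately takes no lock, mirroring spa_rel_allocator(): each slot is cleared only by the thread that claimed it, and the next acquirer re-reads it under sau_lock, so a bare store suffices.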