Scale worker threads and taskqs with number of CPUs

While use of dynamic taskqs allows to reduce number of idle threads,
hardcoded 8 taskqs of each kind is a big overkill for small systems,
complicating CPU scheduling, increasing I/O reorder, etc, while
providing no real locking benefits, just not needed there.

On another side, 12*8 worker threads per kind are able to overload
almost any system nowadays.  For example, pool of several fast SSDs
with SHA256 checksum makes system barely responsive during scrub, or
with dedup enabled barely responsive during large file deletion.

To address both problems this patch introduces ZTI_SCALE macro, alike
to ZTI_BATCH, but with multiple taskqs, depending on number of CPUs,
to be used in places where lock scalability is needed, while request
ordering is not so much.  The code is made to create new taskq for
~6 worker threads (less for small systems, but more for very large)
up to 80% of CPU cores (previous 75% was not good for rounding down).
Both number of threads and threads per taskq are now tunable in case
somebody really wants to use all of system power for ZFS.

While obviously some benchmarks show small peak performance reduction
(not so big really, especially on systems with SMT, where use of the
second threads does not give as much performance as the first ones),
they also show dramatic latency reduction and much more smooth user-
space operation in case of high CPU usage by ZFS.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #11966
This commit is contained in:
Alexander Motin 2021-05-14 12:13:53 -04:00 committed by GitHub
parent 6a13add559
commit 7457b024ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 84 additions and 26 deletions

View File

@ -4060,11 +4060,25 @@ Percentage of online CPUs (or CPU cores, etc) which will run a worker thread
for I/O. These workers are responsible for I/O work such as compression and for I/O. These workers are responsible for I/O work such as compression and
checksum calculations. Fractional number of CPUs will be rounded down. checksum calculations. Fractional number of CPUs will be rounded down.
.sp .sp
The default value of 75 was chosen to avoid using all CPUs which can result in The default value of 80 was chosen to avoid using all CPUs which can result in
latency issues and inconsistent application performance, especially when high latency issues and inconsistent application performance, especially when slower
compression is enabled. compression and/or checksumming is enabled.
.sp .sp
Default value: \fB75\fR. Default value: \fB80\fR.
.RE
.sp
.ne 2
.na
\fBzio_taskq_batch_tpq\fR (uint)
.ad
.RS 12n
Number of worker threads per taskq. Lower value improves I/O ordering and
CPU utilization, while higher reduces lock contention.
.sp
By default about 6 worker threads per taskq, depending on system size.
.sp
Default value: \fB0\fR.
.RE .RE
.sp .sp

View File

@ -108,6 +108,7 @@ int zfs_ccw_retry_interval = 300;
typedef enum zti_modes { typedef enum zti_modes {
ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_FIXED, /* value is # of threads (min 1) */
ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */
ZTI_MODE_NULL, /* don't create a taskq */ ZTI_MODE_NULL, /* don't create a taskq */
ZTI_NMODES ZTI_NMODES
} zti_modes_t; } zti_modes_t;
@ -115,6 +116,7 @@ typedef enum zti_modes {
#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
#define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 }
#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
#define ZTI_N(n) ZTI_P(n, 1) #define ZTI_N(n) ZTI_P(n, 1)
@ -141,7 +143,8 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
* point of lock contention. The ZTI_P(#, #) macro indicates that we need an * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
* additional degree of parallelism specified by the number of threads per- * additional degree of parallelism specified by the number of threads per-
* taskq and the number of taskqs; when dispatching an event in this case, the * taskq and the number of taskqs; when dispatching an event in this case, the
* particular taskq is chosen at random. * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
* but with number of taskqs also scaling with number of CPUs.
* *
* The different taskq priorities are to handle the different contexts (issue * The different taskq priorities are to handle the different contexts (issue
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
@ -150,9 +153,9 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */ /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
{ ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */ { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */
{ ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
{ ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */
@ -164,7 +167,8 @@ static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
static void spa_vdev_resilver_done(spa_t *spa); static void spa_vdev_resilver_done(spa_t *spa);
uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */
uint_t zio_taskq_batch_tpq; /* threads per taskq */
boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
uint_t zio_taskq_basedc = 80; /* base duty cycle */ uint_t zio_taskq_basedc = 80; /* base duty cycle */
@ -957,25 +961,12 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
uint_t value = ztip->zti_value; uint_t value = ztip->zti_value;
uint_t count = ztip->zti_count; uint_t count = ztip->zti_count;
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
uint_t flags = 0; uint_t cpus, flags = TASKQ_DYNAMIC;
boolean_t batch = B_FALSE; boolean_t batch = B_FALSE;
if (mode == ZTI_MODE_NULL) {
tqs->stqs_count = 0;
tqs->stqs_taskq = NULL;
return;
}
ASSERT3U(count, >, 0);
tqs->stqs_count = count;
tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
switch (mode) { switch (mode) {
case ZTI_MODE_FIXED: case ZTI_MODE_FIXED:
ASSERT3U(value, >=, 1); ASSERT3U(value, >, 0);
value = MAX(value, 1);
flags |= TASKQ_DYNAMIC;
break; break;
case ZTI_MODE_BATCH: case ZTI_MODE_BATCH:
@ -984,6 +975,48 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
value = MIN(zio_taskq_batch_pct, 100); value = MIN(zio_taskq_batch_pct, 100);
break; break;
case ZTI_MODE_SCALE:
flags |= TASKQ_THREADS_CPU_PCT;
/*
* We want more taskqs to reduce lock contention, but we want
* less for better request ordering and CPU utilization.
*/
cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
if (zio_taskq_batch_tpq > 0) {
count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) /
zio_taskq_batch_tpq);
} else {
/*
* Prefer 6 threads per taskq, but no more taskqs
* than threads in them on large systems. For 80%:
*
* taskq taskq total
* cpus taskqs percent threads threads
* ------- ------- ------- ------- -------
* 1 1 80% 1 1
* 2 1 80% 1 1
* 4 1 80% 3 3
* 8 2 40% 3 6
* 16 3 27% 4 12
* 32 5 16% 5 25
* 64 7 11% 7 49
* 128 10 8% 10 100
* 256 14 6% 15 210
*/
count = 1 + cpus / 6;
while (count * count > cpus)
count--;
}
/* Limit each taskq within 100% to not trigger assertion. */
count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
value = (zio_taskq_batch_pct + count / 2) / count;
break;
case ZTI_MODE_NULL:
tqs->stqs_count = 0;
tqs->stqs_taskq = NULL;
return;
default: default:
panic("unrecognized mode for %s_%s taskq (%u:%u) in " panic("unrecognized mode for %s_%s taskq (%u:%u) in "
"spa_activate()", "spa_activate()",
@ -991,10 +1024,18 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
break; break;
} }
ASSERT3U(count, >, 0);
tqs->stqs_count = count;
tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
for (uint_t i = 0; i < count; i++) { for (uint_t i = 0; i < count; i++) {
taskq_t *tq; taskq_t *tq;
char name[32]; char name[32];
if (count > 1)
(void) snprintf(name, sizeof (name), "%s_%s_%u",
zio_type_name[t], zio_taskq_types[q], i);
else
(void) snprintf(name, sizeof (name), "%s_%s", (void) snprintf(name, sizeof (name), "%s_%s",
zio_type_name[t], zio_taskq_types[q]); zio_type_name[t], zio_taskq_types[q]);
@ -9863,6 +9904,9 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
"Percentage of CPUs to run an IO worker thread"); "Percentage of CPUs to run an IO worker thread");
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
"Number of threads per IO worker taskqueue");
ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW,
"Allow importing pool with up to this number of missing top-level " "Allow importing pool with up to this number of missing top-level "
"vdevs (in read-only mode)"); "vdevs (in read-only mode)");