From 5fc134ff2ff55811a34c4653f4435468f58eeca5 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 4 Apr 2024 06:21:25 +0500 Subject: [PATCH] zvol: use multiple taskq Currently, zvol uses a single taskq, resulting in throughput bottleneck under heavy load due to lock contention on the single taskq. This patch addresses the performance bottleneck under heavy load conditions by utilizing multiple taskqs, thus mitigating lock contention. The number of taskqs scale dynamically based on the available CPUs in the system, as illustrated below: taskq total cpus taskqs threads threads ------- ------- ------- ------- 1 1 32 32 2 1 32 32 4 1 32 32 8 2 16 32 16 3 11 33 32 5 7 35 64 8 8 64 128 11 12 132 256 16 16 256 Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Tony Nguyen Signed-off-by: Ameer Hamza Closes #15992 --- man/man4/zfs.4 | 7 +++ module/os/linux/zfs/zvol_os.c | 102 ++++++++++++++++++++++++++++++---- 2 files changed, 99 insertions(+), 10 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index a98ec519a..bab40dd91 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2370,6 +2370,13 @@ The number of requests which can be handled concurrently is controlled by is ignored when running on a kernel that supports block multiqueue .Pq Li blk-mq . . +.It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint +Number of zvol taskqs. +If +.Sy 0 +(the default) then scaling is done internally to prefer 6 threads per taskq. +This only applies on Linux. +. .It Sy zvol_threads Ns = Ns Sy 0 Pq uint The number of system wide threads to use for processing zvol block IOs. If diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 8562e9897..f9c64a8a8 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,12 @@ static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; +/* + * Switch taskq at multiple of 512 MB offset. This can be set to a lower value + * to utilize more threads for small files but may affect prefetch hits. + */ +#define ZVOL_TASKQ_OFFSET_SHIFT 29 + #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static unsigned int zvol_open_timeout_ms = 1000; #endif @@ -74,6 +81,7 @@ static boolean_t zvol_use_blk_mq = B_FALSE; * read and write tests to a zvol in an NVMe pool (with 16 CPUs). */ static unsigned int zvol_blk_mq_blocks_per_thread = 8; +static unsigned int zvol_num_taskqs = 0; #endif #ifndef BLKDEV_DEFAULT_RQ @@ -114,7 +122,11 @@ struct zvol_state_os { boolean_t use_blk_mq; }; -static taskq_t *zvol_taskq; +typedef struct zv_taskq { + uint_t tqs_cnt; + taskq_t **tqs_taskq; +} zv_taskq_t; +static zv_taskq_t zvol_taskqs; static struct ida zvol_ida; typedef struct zv_request_stack { @@ -532,6 +544,17 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, } zv_request_task_t *task; + zv_taskq_t *ztqs = &zvol_taskqs; + uint_t blk_mq_hw_queue = 0; + uint_t tq_idx; + uint_t taskq_hash; +#ifdef HAVE_BLK_MQ + if (rq) + blk_mq_hw_queue = rq->mq_hctx->queue_num; +#endif + taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, + blk_mq_hw_queue, 0); + tq_idx = taskq_hash % ztqs->tqs_cnt; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { @@ -601,7 +624,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_discard_task, task, 0, &task->ent); } } else { @@ -609,7 +632,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_write(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_write_task, task, 0, &task->ent); } } @@ -631,7 +654,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_read(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_read_task, task, 0, &task->ent); } } @@ -1563,8 +1586,40 @@ zvol_init(void) zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); } + /* + * Use atleast 32 zvol_threads but for many core system, + * prefer 6 threads per taskq, but no more taskqs + * than threads in them on large systems. + * + * taskq total + * cpus taskqs threads threads + * ------- ------- ------- ------- + * 1 1 32 32 + * 2 1 32 32 + * 4 1 32 32 + * 8 2 16 32 + * 16 3 11 33 + * 32 5 7 35 + * 64 8 8 64 + * 128 11 12 132 + * 256 16 16 256 + */ + zv_taskq_t *ztqs = &zvol_taskqs; + uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); + if (num_tqs == 0) { + num_tqs = 1 + num_online_cpus() / 6; + while (num_tqs * num_tqs > zvol_actual_threads) + num_tqs--; + } + uint_t per_tq_thread = zvol_actual_threads / num_tqs; + if (per_tq_thread * num_tqs < zvol_actual_threads) + per_tq_thread++; + ztqs->tqs_cnt = num_tqs; + ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } @@ -1584,11 +1639,22 @@ zvol_init(void) 1024); } #endif - zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, - zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - if (zvol_taskq == NULL) { - unregister_blkdev(zvol_major, ZVOL_DRIVER); - return (-ENOMEM); + for (uint_t i = 0; i < num_tqs; i++) { + char name[32]; + (void) snprintf(name, sizeof (name), "%s_tq-%u", + ZVOL_DRIVER, i); + ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, + maxclsyspri, per_tq_thread, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (ztqs->tqs_taskq[i] == NULL) { + for (int j = i - 1; j >= 0; j--) + taskq_destroy(ztqs->tqs_taskq[j]); + unregister_blkdev(zvol_major, ZVOL_DRIVER); + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + return (-ENOMEM); + } } zvol_init_impl(); @@ -1599,9 +1665,22 @@ zvol_init(void) void zvol_fini(void) { + zv_taskq_t *ztqs = &zvol_taskqs; zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); - taskq_destroy(zvol_taskq); + + if (ztqs->tqs_taskq == NULL) { + ASSERT3U(ztqs->tqs_cnt, ==, 0); + } else { + for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { + ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); + taskq_destroy(ztqs->tqs_taskq[i]); + } + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + } + ida_destroy(&zvol_ida); } @@ -1622,6 +1701,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); +module_param(zvol_num_taskqs, uint, 0444); +MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); + module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");