From 1a8f5ad3b0ff63bf54a83941cb90786e09cbe25d Mon Sep 17 00:00:00 2001 From: Fedor Uporov <60701163+fuporovvStack@users.noreply.github.com> Date: Thu, 8 May 2025 22:25:40 +0300 Subject: [PATCH] zvol: Enable zvol threading functionality on FreeBSD Make zvol I/O request processing asynchronous on the FreeBSD side in some cases. Clone the zvol threading logic and the required module parameters from the Linux side. Make the zvol threadpool creation/destruction logic shared between Linux and FreeBSD. The IO requests are processed asynchronously in the following cases: - volmode=geom: if the IO request thread is a geom thread or cannot sleep. - volmode=cdev: if the IO request was passed through the struct cdevsw .d_strategy routine, meaning it is an AIO request. In all other cases the IO requests are processed synchronously. The volthreading zvol property is ignored on the FreeBSD side. Sponsored-by: vStack, Inc. Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Reviewed-by: @ImAwsumm Signed-off-by: Fedor Uporov Closes #17169 --- include/sys/zvol_impl.h | 32 ++++ module/os/freebsd/zfs/zvol_os.c | 167 ++++++++---------- module/os/linux/zfs/zvol_os.c | 143 +-------------- module/zfs/zvol.c | 110 ++++++++++++ tests/zfs-tests/include/libtest.shlib | 18 +- tests/zfs-tests/include/tunables.cfg | 1 + .../tests/functional/zvol/zvol_common.shlib | 8 + .../zvol/zvol_stress/zvol_stress.ksh | 18 +- 8 files changed, 253 insertions(+), 244 deletions(-) diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index 3a40b40f7..b06018aa7 100644 --- a/include/sys/zvol_impl.h +++ b/include/sys/zvol_impl.h @@ -60,6 +60,32 @@ typedef struct zvol_state { boolean_t zv_threading; /* volthreading property */ } zvol_state_t; +/* + * zvol taskqs + */ +typedef struct zv_taskq { + uint_t tqs_cnt; + taskq_t **tqs_taskq; +} zv_taskq_t; + +typedef struct zv_request_stack { + zvol_state_t *zv; + struct bio *bio; +#ifdef __linux__ + struct request *rq; +#endif +} zv_request_t; + +typedef struct zv_request_task { + zv_request_t zvr; + taskq_ent_t ent; +} 
zv_request_task_t; + +/* + * Switch taskq at multiple of 512 MB offset. This can be set to a lower value + * to utilize more threads for small files but may affect prefetch hits. + */ +#define ZVOL_TASKQ_OFFSET_SHIFT 29 extern krwlock_t zvol_state_lock; #define ZVOL_HT_SIZE 1024 @@ -69,6 +95,10 @@ extern zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE]; extern unsigned int zvol_volmode; extern unsigned int zvol_inhibit_dev; +extern unsigned int zvol_threads; +extern unsigned int zvol_num_taskqs; +extern unsigned int zvol_request_sync; +extern zv_taskq_t zvol_taskqs; /* * platform independent functions exported to platform code @@ -94,6 +124,8 @@ int zvol_clone_range(zvol_state_handle_t *, uint64_t, void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps); +zv_request_task_t *zv_request_task_create(zv_request_t zvr); +void zv_request_task_free(zv_request_task_t *task); /* * platform dependent functions exported to platform independent code diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index d18ea9d59..140016f86 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -99,6 +99,7 @@ #include #include #include +#include #include "zfs_namecheck.h" @@ -112,12 +113,6 @@ #define ZVOL_RW_READ_HELD RW_READ_HELD #endif -enum zvol_geom_state { - ZVOL_GEOM_UNINIT, - ZVOL_GEOM_STOPPED, - ZVOL_GEOM_RUNNING, -}; - struct zvol_state_os { #define zso_dev _zso_state._zso_dev #define zso_geom _zso_state._zso_geom @@ -131,9 +126,6 @@ struct zvol_state_os { /* volmode=geom */ struct zvol_state_geom { struct g_provider *zsg_provider; - struct bio_queue_head zsg_queue; - struct mtx zsg_queue_mtx; - enum zvol_geom_state zsg_state; } _zso_geom; } _zso_state; int zso_dying; @@ -169,7 +161,7 @@ static d_close_t zvol_cdev_close; static d_ioctl_t zvol_cdev_ioctl; static d_read_t zvol_cdev_read; static d_write_t zvol_cdev_write; -static 
d_strategy_t zvol_geom_bio_strategy; +static d_strategy_t zvol_cdev_bio_strategy; static d_kqfilter_t zvol_cdev_kqfilter; static struct cdevsw zvol_cdevsw = { @@ -181,7 +173,7 @@ static struct cdevsw zvol_cdevsw = { .d_ioctl = zvol_cdev_ioctl, .d_read = zvol_cdev_read, .d_write = zvol_cdev_write, - .d_strategy = zvol_geom_bio_strategy, + .d_strategy = zvol_cdev_bio_strategy, .d_kqfilter = zvol_cdev_kqfilter, }; @@ -205,13 +197,11 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); static int zvol_geom_open(struct g_provider *pp, int flag, int count); static int zvol_geom_close(struct g_provider *pp, int flag, int count); -static void zvol_geom_run(zvol_state_t *zv); static void zvol_geom_destroy(zvol_state_t *zv); static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); -static void zvol_geom_worker(void *arg); static void zvol_geom_bio_start(struct bio *bp); static int zvol_geom_bio_getattr(struct bio *bp); -/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */ +static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync); /* * GEOM mode implementation @@ -419,20 +409,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) return (0); } -static void -zvol_geom_run(zvol_state_t *zv) -{ - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); - - g_error_provider(pp, 0); - - kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0, - "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER)); -} - static void zvol_geom_destroy(zvol_state_t *zv) { @@ -443,9 +419,6 @@ zvol_geom_destroy(zvol_state_t *zv) g_topology_assert(); - mutex_enter(&zv->zv_state_lock); - VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING); - mutex_exit(&zv->zv_state_lock); zsg->zsg_provider = NULL; g_wither_geom(pp->geom, ENXIO); } @@ -516,44 +489,10 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) return (error); } -static void 
-zvol_geom_worker(void *arg) -{ - zvol_state_t *zv = arg; - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct bio *bp; - - ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); - - thread_lock(curthread); - sched_prio(curthread, PRIBIO); - thread_unlock(curthread); - - for (;;) { - mtx_lock(&zsg->zsg_queue_mtx); - bp = bioq_takefirst(&zsg->zsg_queue); - if (bp == NULL) { - if (zsg->zsg_state == ZVOL_GEOM_STOPPED) { - zsg->zsg_state = ZVOL_GEOM_RUNNING; - wakeup(&zsg->zsg_state); - mtx_unlock(&zsg->zsg_queue_mtx); - kthread_exit(); - } - msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx, - PRIBIO | PDROP, "zvol:io", 0); - continue; - } - mtx_unlock(&zsg->zsg_queue_mtx); - zvol_geom_bio_strategy(bp); - } -} - static void zvol_geom_bio_start(struct bio *bp) { zvol_state_t *zv = bp->bio_to->private; - struct zvol_state_geom *zsg; - boolean_t first; if (zv == NULL) { g_io_deliver(bp, ENXIO); @@ -565,18 +504,8 @@ zvol_geom_bio_start(struct bio *bp) return; } - if (!THREAD_CAN_SLEEP()) { - zsg = &zv->zv_zso->zso_geom; - mtx_lock(&zsg->zsg_queue_mtx); - first = (bioq_first(&zsg->zsg_queue) == NULL); - bioq_insert_tail(&zsg->zsg_queue, bp); - mtx_unlock(&zsg->zsg_queue_mtx); - if (first) - wakeup_one(&zsg->zsg_queue); - return; - } - - zvol_geom_bio_strategy(bp); + zvol_geom_bio_strategy(bp, !g_is_geom_thread(curthread) && + THREAD_CAN_SLEEP()); } static int @@ -660,9 +589,10 @@ zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn) } static void -zvol_geom_bio_strategy(struct bio *bp) +zvol_strategy_impl(zv_request_t *zvr) { zvol_state_t *zv; + struct bio *bp; uint64_t off, volsize; size_t resid; char *addr; @@ -673,11 +603,8 @@ zvol_geom_bio_strategy(struct bio *bp) boolean_t is_dumpified; boolean_t commit; - if (bp->bio_to) - zv = bp->bio_to->private; - else - zv = bp->bio_dev->si_drv2; - + bp = zvr->bio; + zv = zvr->zv; if (zv == NULL) { error = SET_ERROR(ENXIO); goto out; @@ -813,6 +740,63 @@ out: biofinish(bp, NULL, error); } +static void +zvol_strategy_task(void 
*arg) +{ + zv_request_task_t *task = arg; + + zvol_strategy_impl(&task->zvr); + zv_request_task_free(task); +} + +static void +zvol_geom_bio_strategy(struct bio *bp, boolean_t sync) +{ + zv_taskq_t *ztqs = &zvol_taskqs; + zv_request_task_t *task; + zvol_state_t *zv; + uint_t tq_idx; + uint_t taskq_hash; + int error; + + if (bp->bio_to) + zv = bp->bio_to->private; + else + zv = bp->bio_dev->si_drv2; + + if (zv == NULL) { + error = SET_ERROR(ENXIO); + if (bp->bio_to) + g_io_deliver(bp, error); + else + biofinish(bp, NULL, error); + return; + } + + zv_request_t zvr = { + .zv = zv, + .bio = bp, + }; + + if (sync || zvol_request_sync) { + zvol_strategy_impl(&zvr); + return; + } + + taskq_hash = cityhash3((uintptr_t)zv, curcpu, bp->bio_offset >> + ZVOL_TASKQ_OFFSET_SHIFT); + tq_idx = taskq_hash % ztqs->tqs_cnt; + task = zv_request_task_create(zvr); + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_strategy_task, task, + 0, &task->ent); +} + +static void +zvol_cdev_bio_strategy(struct bio *bp) +{ + zvol_geom_bio_strategy(bp, B_FALSE); +} + /* * Character device mode implementation */ @@ -1352,7 +1336,6 @@ zvol_os_free(zvol_state_t *zv) g_topology_lock(); zvol_geom_destroy(zv); g_topology_unlock(); - mtx_destroy(&zsg->zsg_queue_mtx); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; @@ -1432,9 +1415,6 @@ zvol_os_create_minor(const char *name) struct g_provider *pp; struct g_geom *gp; - zsg->zsg_state = ZVOL_GEOM_UNINIT; - mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF); - g_topology_lock(); gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); gp->start = zvol_geom_bio_start; @@ -1446,7 +1426,6 @@ zvol_os_create_minor(const char *name) pp->private = zv; zsg->zsg_provider = pp; - bioq_init(&zsg->zsg_queue); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev; @@ -1502,7 +1481,7 @@ out_dmu_objset_disown: 
dmu_objset_disown(os, B_TRUE, FTAG); if (error == 0 && volmode == ZFS_VOLMODE_GEOM) { - zvol_geom_run(zv); + g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0); g_topology_unlock(); } out_doi: @@ -1529,14 +1508,7 @@ zvol_os_clear_private(zvol_state_t *zv) if (pp->private == NULL) /* already cleared */ return; - mtx_lock(&zsg->zsg_queue_mtx); - zsg->zsg_state = ZVOL_GEOM_STOPPED; pp->private = NULL; - wakeup_one(&zsg->zsg_queue); - while (zsg->zsg_state != ZVOL_GEOM_RUNNING) - msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx, - 0, "zvol:w", 0); - mtx_unlock(&zsg->zsg_queue_mtx); ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; @@ -1606,8 +1578,7 @@ zvol_busy(void) int zvol_init(void) { - zvol_init_impl(); - return (0); + return (zvol_init_impl()); } void diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index c8a045392..70046fe31 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -51,21 +51,13 @@ static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync); static unsigned int zvol_major = ZVOL_MAJOR; -static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; -/* - * Switch taskq at multiple of 512 MB offset. This can be set to a lower value - * to utilize more threads for small files but may affect prefetch hits. 
- */ -#define ZVOL_TASKQ_OFFSET_SHIFT 29 - #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static unsigned int zvol_open_timeout_ms = 1000; #endif -static unsigned int zvol_threads = 0; static unsigned int zvol_blk_mq_threads = 0; static unsigned int zvol_blk_mq_actual_threads; static boolean_t zvol_use_blk_mq = B_FALSE; @@ -82,8 +74,6 @@ static boolean_t zvol_use_blk_mq = B_FALSE; */ static unsigned int zvol_blk_mq_blocks_per_thread = 8; -static unsigned int zvol_num_taskqs = 0; - #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ @@ -117,45 +107,8 @@ struct zvol_state_os { boolean_t use_blk_mq; }; -typedef struct zv_taskq { - uint_t tqs_cnt; - taskq_t **tqs_taskq; -} zv_taskq_t; -static zv_taskq_t zvol_taskqs; static struct ida zvol_ida; -typedef struct zv_request_stack { - zvol_state_t *zv; - struct bio *bio; - struct request *rq; -} zv_request_t; - -typedef struct zv_work { - struct request *rq; - struct work_struct work; -} zv_work_t; - -typedef struct zv_request_task { - zv_request_t zvr; - taskq_ent_t ent; -} zv_request_task_t; - -static zv_request_task_t * -zv_request_task_create(zv_request_t zvr) -{ - zv_request_task_t *task; - task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); - taskq_init_ent(&task->ent); - task->zvr = zvr; - return (task); -} - -static void -zv_request_task_free(zv_request_task_t *task) -{ - kmem_free(task, sizeof (*task)); -} - /* * This is called when a new block multiqueue request comes in. A request * contains one or more BIOs. @@ -1793,59 +1746,14 @@ zvol_init(void) { int error; - /* - * zvol_threads is the module param the user passes in. - * - * zvol_actual_threads is what we use internally, since the user can - * pass zvol_thread = 0 to mean "use all the CPUs" (the default). - */ - static unsigned int zvol_actual_threads; - - if (zvol_threads == 0) { - /* - * See dde9380a1 for why 32 was chosen here. 
This should - * probably be refined to be some multiple of the number - * of CPUs. - */ - zvol_actual_threads = MAX(num_online_cpus(), 32); - } else { - zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); + error = zvol_init_impl(); + if (error) { + printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error); + return (error); } - /* - * Use atleast 32 zvol_threads but for many core system, - * prefer 6 threads per taskq, but no more taskqs - * than threads in them on large systems. - * - * taskq total - * cpus taskqs threads threads - * ------- ------- ------- ------- - * 1 1 32 32 - * 2 1 32 32 - * 4 1 32 32 - * 8 2 16 32 - * 16 3 11 33 - * 32 5 7 35 - * 64 8 8 64 - * 128 11 12 132 - * 256 16 16 256 - */ - zv_taskq_t *ztqs = &zvol_taskqs; - uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); - if (num_tqs == 0) { - num_tqs = 1 + num_online_cpus() / 6; - while (num_tqs * num_tqs > zvol_actual_threads) - num_tqs--; - } - uint_t per_tq_thread = zvol_actual_threads / num_tqs; - if (per_tq_thread * num_tqs < zvol_actual_threads) - per_tq_thread++; - ztqs->tqs_cnt = num_tqs; - ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { - kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); - ztqs->tqs_taskq = NULL; printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } @@ -1864,25 +1772,6 @@ zvol_init(void) 1024); } - for (uint_t i = 0; i < num_tqs; i++) { - char name[32]; - (void) snprintf(name, sizeof (name), "%s_tq-%u", - ZVOL_DRIVER, i); - ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, - maxclsyspri, per_tq_thread, INT_MAX, - TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - if (ztqs->tqs_taskq[i] == NULL) { - for (int j = i - 1; j >= 0; j--) - taskq_destroy(ztqs->tqs_taskq[j]); - unregister_blkdev(zvol_major, ZVOL_DRIVER); - kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * - sizeof (taskq_t *)); - ztqs->tqs_taskq = NULL; - return (-ENOMEM); - } - } - - 
zvol_init_impl(); ida_init(&zvol_ida); return (0); } @@ -1890,21 +1779,9 @@ zvol_init(void) void zvol_fini(void) { - zv_taskq_t *ztqs = &zvol_taskqs; - zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); - if (ztqs->tqs_taskq == NULL) { - ASSERT3U(ztqs->tqs_cnt, ==, 0); - } else { - for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { - ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); - taskq_destroy(ztqs->tqs_taskq[i]); - } - kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * - sizeof (taskq_t *)); - ztqs->tqs_taskq = NULL; - } + zvol_fini_impl(); ida_destroy(&zvol_ida); } @@ -1915,19 +1792,9 @@ MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); -module_param(zvol_threads, uint, 0444); -MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set" - "to 0 to use all active CPUs"); - -module_param(zvol_request_sync, uint, 0644); -MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); - module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); -module_param(zvol_num_taskqs, uint, 0444); -MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); - module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index d985ec0d6..23d1265b2 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -90,11 +90,15 @@ unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; +unsigned int zvol_threads = 0; +unsigned int zvol_num_taskqs = 0; +unsigned int zvol_request_sync = 0; struct hlist_head *zvol_htable; static list_t zvol_state_list; krwlock_t zvol_state_lock; extern int zfs_bclone_wait_dirty; +zv_taskq_t zvol_taskqs; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, @@ -111,6 +115,22 @@ typedef struct { uint64_t value; } zvol_task_t; 
+zv_request_task_t * +zv_request_task_create(zv_request_t zvr) +{ + zv_request_task_t *task; + task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); + taskq_init_ent(&task->ent); + task->zvr = zvr; + return (task); +} + +void +zv_request_task_free(zv_request_task_t *task) +{ + kmem_free(task, sizeof (*task)); +} + uint64_t zvol_name_hash(const char *name) { @@ -2018,6 +2038,75 @@ zvol_init_impl(void) { int i; + /* + * zvol_threads is the module param the user passes in. + * + * zvol_actual_threads is what we use internally, since the user can + * pass zvol_thread = 0 to mean "use all the CPUs" (the default). + */ + static unsigned int zvol_actual_threads; + + if (zvol_threads == 0) { + /* + * See dde9380a1 for why 32 was chosen here. This should + * probably be refined to be some multiple of the number + * of CPUs. + */ + zvol_actual_threads = MAX(max_ncpus, 32); + } else { + zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); + } + + /* + * Use at least 32 zvol_threads but for many core system, + * prefer 6 threads per taskq, but no more taskqs + * than threads in them on large systems. 
+ * + * taskq total + * cpus taskqs threads threads + * ------- ------- ------- ------- + * 1 1 32 32 + * 2 1 32 32 + * 4 1 32 32 + * 8 2 16 32 + * 16 3 11 33 + * 32 5 7 35 + * 64 8 8 64 + * 128 11 12 132 + * 256 16 16 256 + */ + zv_taskq_t *ztqs = &zvol_taskqs; + int num_tqs = MIN(max_ncpus, zvol_num_taskqs); + if (num_tqs == 0) { + num_tqs = 1 + max_ncpus / 6; + while (num_tqs * num_tqs > zvol_actual_threads) + num_tqs--; + } + + int per_tq_thread = zvol_actual_threads / num_tqs; + if (per_tq_thread * num_tqs < zvol_actual_threads) + per_tq_thread++; + + ztqs->tqs_cnt = num_tqs; + ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); + + for (uint_t i = 0; i < num_tqs; i++) { + char name[32]; + (void) snprintf(name, sizeof (name), "%s_tq-%u", + ZVOL_DRIVER, i); + ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, + maxclsyspri, per_tq_thread, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (ztqs->tqs_taskq[i] == NULL) { + for (int j = i - 1; j >= 0; j--) + taskq_destroy(ztqs->tqs_taskq[j]); + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + return (SET_ERROR(ENOMEM)); + } + } + list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL); @@ -2033,6 +2122,8 @@ zvol_init_impl(void) void zvol_fini_impl(void) { + zv_taskq_t *ztqs = &zvol_taskqs; + zvol_remove_minors_impl(NULL); /* @@ -2046,4 +2137,23 @@ zvol_fini_impl(void) kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); + + if (ztqs->tqs_taskq == NULL) { + ASSERT3U(ztqs->tqs_cnt, ==, 0); + } else { + for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { + ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); + taskq_destroy(ztqs->tqs_taskq[i]); + } + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + } } + +ZFS_MODULE_PARAM(zfs, , zvol_threads, UINT, ZMOD_RW, + "Number of 
threads for I/O requests. Set to 0 to use all active CPUs"); +ZFS_MODULE_PARAM(zfs, , zvol_num_taskqs, UINT, ZMOD_RW, + "Number of zvol taskqs"); +ZFS_MODULE_PARAM(zfs, , zvol_request_sync, UINT, ZMOD_RW, + "Synchronously handle bio requests"); diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 8bffe9d82..4b8db1893 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3386,17 +3386,21 @@ function set_tunable_impl function save_tunable { - [[ ! -d $TEST_BASE_DIR ]] && return 1 - [[ -e $TEST_BASE_DIR/tunable-$1 ]] && return 2 - echo "$(get_tunable """$1""")" > "$TEST_BASE_DIR"/tunable-"$1" + if tunable_exists $1 ; then + [[ ! -d $TEST_BASE_DIR ]] && return 1 + [[ -e $TEST_BASE_DIR/tunable-$1 ]] && return 2 + echo "$(get_tunable """$1""")" > "$TEST_BASE_DIR"/tunable-"$1" + fi } function restore_tunable { - [[ ! -e $TEST_BASE_DIR/tunable-$1 ]] && return 1 - val="$(cat $TEST_BASE_DIR/tunable-"""$1""")" - set_tunable64 "$1" "$val" - rm $TEST_BASE_DIR/tunable-$1 + if tunable_exists $1 ; then + [[ ! 
-e $TEST_BASE_DIR/tunable-$1 ]] && return 1 + val="$(cat $TEST_BASE_DIR/tunable-"""$1""")" + set_tunable64 "$1" "$val" + rm $TEST_BASE_DIR/tunable-$1 + fi } # diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 79dc64ad9..c0aba27d3 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -102,6 +102,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED +VOL_REQUEST_SYNC zvol_request_sync zvol_request_sync VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq BCLONE_ENABLED bclone_enabled zfs_bclone_enabled BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib b/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib index c3069d681..8e0d8141f 100644 --- a/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib +++ b/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib @@ -140,3 +140,11 @@ function set_blk_mq log_must set_tunable32 VOL_USE_BLK_MQ $1 fi } + +# enable/disable zvol sync mode +# +# $1: 1 = enable, 0 = disable +function set_zvol_sync +{ + log_must set_tunable32 VOL_REQUEST_SYNC $1 +} diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh index 1805e597e..9047f14bc 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh @@ -60,6 +60,9 @@ typeset -f each_zvol_size=$(( floor($biggest_zvol_size_possible * 0.9 / \ typeset tmpdir="$(mktemp -t -d zvol_stress_fio_state.XXXXXX)" +log_must save_tunable VOL_USE_BLK_MQ +log_must save_tunable VOL_REQUEST_SYNC + function create_zvols { log_note "Creating $num_zvols zvols that are ${each_zvol_size}B each" @@ -124,7 +127,8 @@ function cleanup log_must zinject -c all 
log_must zpool clear $TESTPOOL destroy_zvols - set_blk_mq 0 + log_must restore_tunable VOL_USE_BLK_MQ + log_must restore_tunable VOL_REQUEST_SYNC # Remove all fio's leftover state files if [ -n "$tmpdir" ] ; then @@ -146,6 +150,18 @@ destroy_zvols set_blk_mq 1 create_zvols do_zvol_stress +destroy_zvols + +# Disable zvol sync mode, and re-run test +set_zvol_sync 0 +create_zvols +do_zvol_stress +destroy_zvols + +# Same for enabled zvol sync mode +set_zvol_sync 1 +create_zvols +do_zvol_stress # Inject some errors, and verify we see some IO errors in zpool status sync_pool $TESTPOOL