From 1a8f5ad3b0ff63bf54a83941cb90786e09cbe25d Mon Sep 17 00:00:00 2001 From: Fedor Uporov <60701163+fuporovvStack@users.noreply.github.com> Date: Thu, 8 May 2025 22:25:40 +0300 Subject: [PATCH] zvol: Enable zvol threading functionality on FreeBSD Make zvol I/O request processing asynchronous on the FreeBSD side in some cases. Clone the zvol threading logic and the required module parameters from the Linux side. Make the zvol threadpool creation/destruction logic shared between Linux and FreeBSD. The IO requests are processed asynchronously in the following cases: - volmode=geom: if the IO request thread is a geom thread or cannot sleep. - volmode=cdev: if the IO request was passed through the struct cdevsw .d_strategy routine, meaning it is an AIO request. In all other cases the IO requests are processed synchronously. The volthreading zvol property is ignored on the FreeBSD side. Sponsored-by: vStack, Inc. Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Reviewed-by: @ImAwsumm Signed-off-by: Fedor Uporov Closes #17169 --- include/sys/zvol_impl.h | 32 ++++ module/os/freebsd/zfs/zvol_os.c | 167 ++++++++---------- module/os/linux/zfs/zvol_os.c | 143 +-------------- module/zfs/zvol.c | 110 ++++++++++++ tests/zfs-tests/include/libtest.shlib | 18 +- tests/zfs-tests/include/tunables.cfg | 1 + .../tests/functional/zvol/zvol_common.shlib | 8 + .../zvol/zvol_stress/zvol_stress.ksh | 18 +- 8 files changed, 253 insertions(+), 244 deletions(-) diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index 3a40b40f7..b06018aa7 100644 --- a/include/sys/zvol_impl.h +++ b/include/sys/zvol_impl.h @@ -60,6 +60,32 @@ typedef struct zvol_state { boolean_t zv_threading; /* volthreading property */ } zvol_state_t; +/* + * zvol taskqs + */ +typedef struct zv_taskq { + uint_t tqs_cnt; + taskq_t **tqs_taskq; +} zv_taskq_t; + +typedef struct zv_request_stack { + zvol_state_t *zv; + struct bio *bio; +#ifdef __linux__ + struct request *rq; +#endif +} zv_request_t; + +typedef struct zv_request_task { + zv_request_t zvr; + taskq_ent_t ent; +} 
zv_request_task_t; + +/* + * Switch taskq at multiple of 512 MB offset. This can be set to a lower value + * to utilize more threads for small files but may affect prefetch hits. + */ +#define ZVOL_TASKQ_OFFSET_SHIFT 29 extern krwlock_t zvol_state_lock; #define ZVOL_HT_SIZE 1024 @@ -69,6 +95,10 @@ extern zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE]; extern unsigned int zvol_volmode; extern unsigned int zvol_inhibit_dev; +extern unsigned int zvol_threads; +extern unsigned int zvol_num_taskqs; +extern unsigned int zvol_request_sync; +extern zv_taskq_t zvol_taskqs; /* * platform independent functions exported to platform code @@ -94,6 +124,8 @@ int zvol_clone_range(zvol_state_handle_t *, uint64_t, void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps); +zv_request_task_t *zv_request_task_create(zv_request_t zvr); +void zv_request_task_free(zv_request_task_t *task); /* * platform dependent functions exported to platform independent code diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index d18ea9d59..140016f86 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -99,6 +99,7 @@ #include #include #include +#include #include "zfs_namecheck.h" @@ -112,12 +113,6 @@ #define ZVOL_RW_READ_HELD RW_READ_HELD #endif -enum zvol_geom_state { - ZVOL_GEOM_UNINIT, - ZVOL_GEOM_STOPPED, - ZVOL_GEOM_RUNNING, -}; - struct zvol_state_os { #define zso_dev _zso_state._zso_dev #define zso_geom _zso_state._zso_geom @@ -131,9 +126,6 @@ struct zvol_state_os { /* volmode=geom */ struct zvol_state_geom { struct g_provider *zsg_provider; - struct bio_queue_head zsg_queue; - struct mtx zsg_queue_mtx; - enum zvol_geom_state zsg_state; } _zso_geom; } _zso_state; int zso_dying; @@ -169,7 +161,7 @@ static d_close_t zvol_cdev_close; static d_ioctl_t zvol_cdev_ioctl; static d_read_t zvol_cdev_read; static d_write_t zvol_cdev_write; -static 
d_strategy_t zvol_geom_bio_strategy; +static d_strategy_t zvol_cdev_bio_strategy; static d_kqfilter_t zvol_cdev_kqfilter; static struct cdevsw zvol_cdevsw = { @@ -181,7 +173,7 @@ static struct cdevsw zvol_cdevsw = { .d_ioctl = zvol_cdev_ioctl, .d_read = zvol_cdev_read, .d_write = zvol_cdev_write, - .d_strategy = zvol_geom_bio_strategy, + .d_strategy = zvol_cdev_bio_strategy, .d_kqfilter = zvol_cdev_kqfilter, }; @@ -205,13 +197,11 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); static int zvol_geom_open(struct g_provider *pp, int flag, int count); static int zvol_geom_close(struct g_provider *pp, int flag, int count); -static void zvol_geom_run(zvol_state_t *zv); static void zvol_geom_destroy(zvol_state_t *zv); static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); -static void zvol_geom_worker(void *arg); static void zvol_geom_bio_start(struct bio *bp); static int zvol_geom_bio_getattr(struct bio *bp); -/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */ +static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync); /* * GEOM mode implementation @@ -419,20 +409,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) return (0); } -static void -zvol_geom_run(zvol_state_t *zv) -{ - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); - - g_error_provider(pp, 0); - - kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0, - "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER)); -} - static void zvol_geom_destroy(zvol_state_t *zv) { @@ -443,9 +419,6 @@ zvol_geom_destroy(zvol_state_t *zv) g_topology_assert(); - mutex_enter(&zv->zv_state_lock); - VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING); - mutex_exit(&zv->zv_state_lock); zsg->zsg_provider = NULL; g_wither_geom(pp->geom, ENXIO); } @@ -516,44 +489,10 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) return (error); } -static void 
-zvol_geom_worker(void *arg) -{ - zvol_state_t *zv = arg; - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct bio *bp; - - ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); - - thread_lock(curthread); - sched_prio(curthread, PRIBIO); - thread_unlock(curthread); - - for (;;) { - mtx_lock(&zsg->zsg_queue_mtx); - bp = bioq_takefirst(&zsg->zsg_queue); - if (bp == NULL) { - if (zsg->zsg_state == ZVOL_GEOM_STOPPED) { - zsg->zsg_state = ZVOL_GEOM_RUNNING; - wakeup(&zsg->zsg_state); - mtx_unlock(&zsg->zsg_queue_mtx); - kthread_exit(); - } - msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx, - PRIBIO | PDROP, "zvol:io", 0); - continue; - } - mtx_unlock(&zsg->zsg_queue_mtx); - zvol_geom_bio_strategy(bp); - } -} - static void zvol_geom_bio_start(struct bio *bp) { zvol_state_t *zv = bp->bio_to->private; - struct zvol_state_geom *zsg; - boolean_t first; if (zv == NULL) { g_io_deliver(bp, ENXIO); @@ -565,18 +504,8 @@ zvol_geom_bio_start(struct bio *bp) return; } - if (!THREAD_CAN_SLEEP()) { - zsg = &zv->zv_zso->zso_geom; - mtx_lock(&zsg->zsg_queue_mtx); - first = (bioq_first(&zsg->zsg_queue) == NULL); - bioq_insert_tail(&zsg->zsg_queue, bp); - mtx_unlock(&zsg->zsg_queue_mtx); - if (first) - wakeup_one(&zsg->zsg_queue); - return; - } - - zvol_geom_bio_strategy(bp); + zvol_geom_bio_strategy(bp, !g_is_geom_thread(curthread) && + THREAD_CAN_SLEEP()); } static int @@ -660,9 +589,10 @@ zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn) } static void -zvol_geom_bio_strategy(struct bio *bp) +zvol_strategy_impl(zv_request_t *zvr) { zvol_state_t *zv; + struct bio *bp; uint64_t off, volsize; size_t resid; char *addr; @@ -673,11 +603,8 @@ zvol_geom_bio_strategy(struct bio *bp) boolean_t is_dumpified; boolean_t commit; - if (bp->bio_to) - zv = bp->bio_to->private; - else - zv = bp->bio_dev->si_drv2; - + bp = zvr->bio; + zv = zvr->zv; if (zv == NULL) { error = SET_ERROR(ENXIO); goto out; @@ -813,6 +740,63 @@ out: biofinish(bp, NULL, error); } +static void +zvol_strategy_task(void 
*arg) +{ + zv_request_task_t *task = arg; + + zvol_strategy_impl(&task->zvr); + zv_request_task_free(task); +} + +static void +zvol_geom_bio_strategy(struct bio *bp, boolean_t sync) +{ + zv_taskq_t *ztqs = &zvol_taskqs; + zv_request_task_t *task; + zvol_state_t *zv; + uint_t tq_idx; + uint_t taskq_hash; + int error; + + if (bp->bio_to) + zv = bp->bio_to->private; + else + zv = bp->bio_dev->si_drv2; + + if (zv == NULL) { + error = SET_ERROR(ENXIO); + if (bp->bio_to) + g_io_deliver(bp, error); + else + biofinish(bp, NULL, error); + return; + } + + zv_request_t zvr = { + .zv = zv, + .bio = bp, + }; + + if (sync || zvol_request_sync) { + zvol_strategy_impl(&zvr); + return; + } + + taskq_hash = cityhash3((uintptr_t)zv, curcpu, bp->bio_offset >> + ZVOL_TASKQ_OFFSET_SHIFT); + tq_idx = taskq_hash % ztqs->tqs_cnt; + task = zv_request_task_create(zvr); + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_strategy_task, task, + 0, &task->ent); +} + +static void +zvol_cdev_bio_strategy(struct bio *bp) +{ + zvol_geom_bio_strategy(bp, B_FALSE); +} + /* * Character device mode implementation */ @@ -1352,7 +1336,6 @@ zvol_os_free(zvol_state_t *zv) g_topology_lock(); zvol_geom_destroy(zv); g_topology_unlock(); - mtx_destroy(&zsg->zsg_queue_mtx); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; @@ -1432,9 +1415,6 @@ zvol_os_create_minor(const char *name) struct g_provider *pp; struct g_geom *gp; - zsg->zsg_state = ZVOL_GEOM_UNINIT; - mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF); - g_topology_lock(); gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); gp->start = zvol_geom_bio_start; @@ -1446,7 +1426,6 @@ zvol_os_create_minor(const char *name) pp->private = zv; zsg->zsg_provider = pp; - bioq_init(&zsg->zsg_queue); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev; @@ -1502,7 +1481,7 @@ out_dmu_objset_disown: 
dmu_objset_disown(os, B_TRUE, FTAG); if (error == 0 && volmode == ZFS_VOLMODE_GEOM) { - zvol_geom_run(zv); + g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0); g_topology_unlock(); } out_doi: @@ -1529,14 +1508,7 @@ zvol_os_clear_private(zvol_state_t *zv) if (pp->private == NULL) /* already cleared */ return; - mtx_lock(&zsg->zsg_queue_mtx); - zsg->zsg_state = ZVOL_GEOM_STOPPED; pp->private = NULL; - wakeup_one(&zsg->zsg_queue); - while (zsg->zsg_state != ZVOL_GEOM_RUNNING) - msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx, - 0, "zvol:w", 0); - mtx_unlock(&zsg->zsg_queue_mtx); ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; @@ -1606,8 +1578,7 @@ zvol_busy(void) int zvol_init(void) { - zvol_init_impl(); - return (0); + return (zvol_init_impl()); } void diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index c8a045392..70046fe31 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -51,21 +51,13 @@ static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync); static unsigned int zvol_major = ZVOL_MAJOR; -static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; -/* - * Switch taskq at multiple of 512 MB offset. This can be set to a lower value - * to utilize more threads for small files but may affect prefetch hits. 
- */ -#define ZVOL_TASKQ_OFFSET_SHIFT 29 - #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static unsigned int zvol_open_timeout_ms = 1000; #endif -static unsigned int zvol_threads = 0; static unsigned int zvol_blk_mq_threads = 0; static unsigned int zvol_blk_mq_actual_threads; static boolean_t zvol_use_blk_mq = B_FALSE; @@ -82,8 +74,6 @@ static boolean_t zvol_use_blk_mq = B_FALSE; */ static unsigned int zvol_blk_mq_blocks_per_thread = 8; -static unsigned int zvol_num_taskqs = 0; - #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ @@ -117,45 +107,8 @@ struct zvol_state_os { boolean_t use_blk_mq; }; -typedef struct zv_taskq { - uint_t tqs_cnt; - taskq_t **tqs_taskq; -} zv_taskq_t; -static zv_taskq_t zvol_taskqs; static struct ida zvol_ida; -typedef struct zv_request_stack { - zvol_state_t *zv; - struct bio *bio; - struct request *rq; -} zv_request_t; - -typedef struct zv_work { - struct request *rq; - struct work_struct work; -} zv_work_t; - -typedef struct zv_request_task { - zv_request_t zvr; - taskq_ent_t ent; -} zv_request_task_t; - -static zv_request_task_t * -zv_request_task_create(zv_request_t zvr) -{ - zv_request_task_t *task; - task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); - taskq_init_ent(&task->ent); - task->zvr = zvr; - return (task); -} - -static void -zv_request_task_free(zv_request_task_t *task) -{ - kmem_free(task, sizeof (*task)); -} - /* * This is called when a new block multiqueue request comes in. A request * contains one or more BIOs. @@ -1793,59 +1746,14 @@ zvol_init(void) { int error; - /* - * zvol_threads is the module param the user passes in. - * - * zvol_actual_threads is what we use internally, since the user can - * pass zvol_thread = 0 to mean "use all the CPUs" (the default). - */ - static unsigned int zvol_actual_threads; - - if (zvol_threads == 0) { - /* - * See dde9380a1 for why 32 was chosen here. 
This should - * probably be refined to be some multiple of the number - * of CPUs. - */ - zvol_actual_threads = MAX(num_online_cpus(), 32); - } else { - zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); + error = zvol_init_impl(); + if (error) { + printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error); + return (error); } - /* - * Use atleast 32 zvol_threads but for many core system, - * prefer 6 threads per taskq, but no more taskqs - * than threads in them on large systems. - * - * taskq total - * cpus taskqs threads threads - * ------- ------- ------- ------- - * 1 1 32 32 - * 2 1 32 32 - * 4 1 32 32 - * 8 2 16 32 - * 16 3 11 33 - * 32 5 7 35 - * 64 8 8 64 - * 128 11 12 132 - * 256 16 16 256 - */ - zv_taskq_t *ztqs = &zvol_taskqs; - uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); - if (num_tqs == 0) { - num_tqs = 1 + num_online_cpus() / 6; - while (num_tqs * num_tqs > zvol_actual_threads) - num_tqs--; - } - uint_t per_tq_thread = zvol_actual_threads / num_tqs; - if (per_tq_thread * num_tqs < zvol_actual_threads) - per_tq_thread++; - ztqs->tqs_cnt = num_tqs; - ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { - kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); - ztqs->tqs_taskq = NULL; printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } @@ -1864,25 +1772,6 @@ zvol_init(void) 1024); } - for (uint_t i = 0; i < num_tqs; i++) { - char name[32]; - (void) snprintf(name, sizeof (name), "%s_tq-%u", - ZVOL_DRIVER, i); - ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, - maxclsyspri, per_tq_thread, INT_MAX, - TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - if (ztqs->tqs_taskq[i] == NULL) { - for (int j = i - 1; j >= 0; j--) - taskq_destroy(ztqs->tqs_taskq[j]); - unregister_blkdev(zvol_major, ZVOL_DRIVER); - kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * - sizeof (taskq_t *)); - ztqs->tqs_taskq = NULL; - return (-ENOMEM); - } - } - - 
zvol_init_impl(); ida_init(&zvol_ida); return (0); } @@ -1890,21 +1779,9 @@ zvol_init(void) void zvol_fini(void) { - zv_taskq_t *ztqs = &zvol_taskqs; - zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); - if (ztqs->tqs_taskq == NULL) { - ASSERT3U(ztqs->tqs_cnt, ==, 0); - } else { - for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { - ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); - taskq_destroy(ztqs->tqs_taskq[i]); - } - kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * - sizeof (taskq_t *)); - ztqs->tqs_taskq = NULL; - } + zvol_fini_impl(); ida_destroy(&zvol_ida); } @@ -1915,19 +1792,9 @@ MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); -module_param(zvol_threads, uint, 0444); -MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set" - "to 0 to use all active CPUs"); - -module_param(zvol_request_sync, uint, 0644); -MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); - module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); -module_param(zvol_num_taskqs, uint, 0444); -MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); - module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index d985ec0d6..23d1265b2 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -90,11 +90,15 @@ unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; +unsigned int zvol_threads = 0; +unsigned int zvol_num_taskqs = 0; +unsigned int zvol_request_sync = 0; struct hlist_head *zvol_htable; static list_t zvol_state_list; krwlock_t zvol_state_lock; extern int zfs_bclone_wait_dirty; +zv_taskq_t zvol_taskqs; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, @@ -111,6 +115,22 @@ typedef struct { uint64_t value; } zvol_task_t; 
+zv_request_task_t * +zv_request_task_create(zv_request_t zvr) +{ + zv_request_task_t *task; + task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); + taskq_init_ent(&task->ent); + task->zvr = zvr; + return (task); +} + +void +zv_request_task_free(zv_request_task_t *task) +{ + kmem_free(task, sizeof (*task)); +} + uint64_t zvol_name_hash(const char *name) { @@ -2018,6 +2038,75 @@ zvol_init_impl(void) { int i; + /* + * zvol_threads is the module param the user passes in. + * + * zvol_actual_threads is what we use internally, since the user can + * pass zvol_thread = 0 to mean "use all the CPUs" (the default). + */ + static unsigned int zvol_actual_threads; + + if (zvol_threads == 0) { + /* + * See dde9380a1 for why 32 was chosen here. This should + * probably be refined to be some multiple of the number + * of CPUs. + */ + zvol_actual_threads = MAX(max_ncpus, 32); + } else { + zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); + } + + /* + * Use at least 32 zvol_threads but for many core system, + * prefer 6 threads per taskq, but no more taskqs + * than threads in them on large systems. 
+ * + * taskq total + * cpus taskqs threads threads + * ------- ------- ------- ------- + * 1 1 32 32 + * 2 1 32 32 + * 4 1 32 32 + * 8 2 16 32 + * 16 3 11 33 + * 32 5 7 35 + * 64 8 8 64 + * 128 11 12 132 + * 256 16 16 256 + */ + zv_taskq_t *ztqs = &zvol_taskqs; + int num_tqs = MIN(max_ncpus, zvol_num_taskqs); + if (num_tqs == 0) { + num_tqs = 1 + max_ncpus / 6; + while (num_tqs * num_tqs > zvol_actual_threads) + num_tqs--; + } + + int per_tq_thread = zvol_actual_threads / num_tqs; + if (per_tq_thread * num_tqs < zvol_actual_threads) + per_tq_thread++; + + ztqs->tqs_cnt = num_tqs; + ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); + + for (uint_t i = 0; i < num_tqs; i++) { + char name[32]; + (void) snprintf(name, sizeof (name), "%s_tq-%u", + ZVOL_DRIVER, i); + ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, + maxclsyspri, per_tq_thread, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (ztqs->tqs_taskq[i] == NULL) { + for (int j = i - 1; j >= 0; j--) + taskq_destroy(ztqs->tqs_taskq[j]); + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + return (SET_ERROR(ENOMEM)); + } + } + list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL); @@ -2033,6 +2122,8 @@ zvol_init_impl(void) void zvol_fini_impl(void) { + zv_taskq_t *ztqs = &zvol_taskqs; + zvol_remove_minors_impl(NULL); /* @@ -2046,4 +2137,23 @@ zvol_fini_impl(void) kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); + + if (ztqs->tqs_taskq == NULL) { + ASSERT3U(ztqs->tqs_cnt, ==, 0); + } else { + for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { + ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); + taskq_destroy(ztqs->tqs_taskq[i]); + } + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + } } + +ZFS_MODULE_PARAM(zfs, , zvol_threads, UINT, ZMOD_RW, + "Number of 
threads for I/O requests. Set to 0 to use all active CPUs"); +ZFS_MODULE_PARAM(zfs, , zvol_num_taskqs, UINT, ZMOD_RW, + "Number of zvol taskqs"); +ZFS_MODULE_PARAM(zfs, , zvol_request_sync, UINT, ZMOD_RW, + "Synchronously handle bio requests"); diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 8bffe9d82..4b8db1893 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3386,17 +3386,21 @@ function set_tunable_impl function save_tunable { - [[ ! -d $TEST_BASE_DIR ]] && return 1 - [[ -e $TEST_BASE_DIR/tunable-$1 ]] && return 2 - echo "$(get_tunable """$1""")" > "$TEST_BASE_DIR"/tunable-"$1" + if tunable_exists $1 ; then + [[ ! -d $TEST_BASE_DIR ]] && return 1 + [[ -e $TEST_BASE_DIR/tunable-$1 ]] && return 2 + echo "$(get_tunable """$1""")" > "$TEST_BASE_DIR"/tunable-"$1" + fi } function restore_tunable { - [[ ! -e $TEST_BASE_DIR/tunable-$1 ]] && return 1 - val="$(cat $TEST_BASE_DIR/tunable-"""$1""")" - set_tunable64 "$1" "$val" - rm $TEST_BASE_DIR/tunable-$1 + if tunable_exists $1 ; then + [[ ! 
-e $TEST_BASE_DIR/tunable-$1 ]] && return 1 + val="$(cat $TEST_BASE_DIR/tunable-"""$1""")" + set_tunable64 "$1" "$val" + rm $TEST_BASE_DIR/tunable-$1 + fi } # diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 79dc64ad9..c0aba27d3 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -102,6 +102,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED +VOL_REQUEST_SYNC zvol_request_sync zvol_request_sync VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq BCLONE_ENABLED bclone_enabled zfs_bclone_enabled BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib b/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib index c3069d681..8e0d8141f 100644 --- a/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib +++ b/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib @@ -140,3 +140,11 @@ function set_blk_mq log_must set_tunable32 VOL_USE_BLK_MQ $1 fi } + +# enable/disable zvol sync mode +# +# $1: 1 = enable, 0 = disable +function set_zvol_sync +{ + log_must set_tunable32 VOL_REQUEST_SYNC $1 +} diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh index 1805e597e..9047f14bc 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh @@ -60,6 +60,9 @@ typeset -f each_zvol_size=$(( floor($biggest_zvol_size_possible * 0.9 / \ typeset tmpdir="$(mktemp -t -d zvol_stress_fio_state.XXXXXX)" +log_must save_tunable VOL_USE_BLK_MQ +log_must save_tunable VOL_REQUEST_SYNC + function create_zvols { log_note "Creating $num_zvols zvols that are ${each_zvol_size}B each" @@ -124,7 +127,8 @@ function cleanup log_must zinject -c all 
log_must zpool clear $TESTPOOL destroy_zvols - set_blk_mq 0 + log_must restore_tunable VOL_USE_BLK_MQ + log_must restore_tunable VOL_REQUEST_SYNC # Remove all fio's leftover state files if [ -n "$tmpdir" ] ; then @@ -146,6 +150,18 @@ destroy_zvols set_blk_mq 1 create_zvols do_zvol_stress +destroy_zvols + +# Disable zvol sync mode, and re-run test +set_zvol_sync 0 +create_zvols +do_zvol_stress +destroy_zvols + +# Same for enabled zvol sync mode +set_zvol_sync 1 +create_zvols +do_zvol_stress # Inject some errors, and verify we see some IO errors in zpool status sync_pool $TESTPOOL