mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-05-22 22:45:02 +03:00
zvol: Enable zvol threading functionality on FreeBSD
Make zvol I/O request processing asynchronous on the FreeBSD side in some cases. Clone the zvol threading logic and the required module parameters from the Linux side. Make the zvol threadpool creation/destruction logic shared between Linux and FreeBSD. I/O requests are processed asynchronously in the following cases: - volmode=geom: if the I/O request thread is a geom thread or cannot sleep. - volmode=cdev: if the I/O request was passed through the struct cdevsw .d_strategy routine, meaning it is an AIO request. In all other cases the I/O requests are processed synchronously. The volthreading zvol property is ignored on the FreeBSD side. Sponsored-by: vStack, Inc. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: @ImAwsumm Signed-off-by: Fedor Uporov <fuporov.vstack@gmail.com> Closes #17169
This commit is contained in:
parent
f13d760aa8
commit
1a8f5ad3b0
@ -60,6 +60,32 @@ typedef struct zvol_state {
|
||||
boolean_t zv_threading; /* volthreading property */
|
||||
} zvol_state_t;
|
||||
|
||||
/*
|
||||
* zvol taskqs
|
||||
*/
|
||||
typedef struct zv_taskq {
|
||||
uint_t tqs_cnt;
|
||||
taskq_t **tqs_taskq;
|
||||
} zv_taskq_t;
|
||||
|
||||
typedef struct zv_request_stack {
|
||||
zvol_state_t *zv;
|
||||
struct bio *bio;
|
||||
#ifdef __linux__
|
||||
struct request *rq;
|
||||
#endif
|
||||
} zv_request_t;
|
||||
|
||||
typedef struct zv_request_task {
|
||||
zv_request_t zvr;
|
||||
taskq_ent_t ent;
|
||||
} zv_request_task_t;
|
||||
|
||||
/*
|
||||
* Switch taskq at multiple of 512 MB offset. This can be set to a lower value
|
||||
* to utilize more threads for small files but may affect prefetch hits.
|
||||
*/
|
||||
#define ZVOL_TASKQ_OFFSET_SHIFT 29
|
||||
|
||||
extern krwlock_t zvol_state_lock;
|
||||
#define ZVOL_HT_SIZE 1024
|
||||
@ -69,6 +95,10 @@ extern zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE];
|
||||
|
||||
extern unsigned int zvol_volmode;
|
||||
extern unsigned int zvol_inhibit_dev;
|
||||
extern unsigned int zvol_threads;
|
||||
extern unsigned int zvol_num_taskqs;
|
||||
extern unsigned int zvol_request_sync;
|
||||
extern zv_taskq_t zvol_taskqs;
|
||||
|
||||
/*
|
||||
* platform independent functions exported to platform code
|
||||
@ -94,6 +124,8 @@ int zvol_clone_range(zvol_state_handle_t *, uint64_t,
|
||||
void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
||||
uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps,
|
||||
size_t nbps);
|
||||
zv_request_task_t *zv_request_task_create(zv_request_t zvr);
|
||||
void zv_request_task_free(zv_request_task_t *task);
|
||||
|
||||
/*
|
||||
* platform dependent functions exported to platform independent code
|
||||
|
@ -99,6 +99,7 @@
|
||||
#include <geom/geom.h>
|
||||
#include <sys/zvol.h>
|
||||
#include <sys/zvol_impl.h>
|
||||
#include <cityhash.h>
|
||||
|
||||
#include "zfs_namecheck.h"
|
||||
|
||||
@ -112,12 +113,6 @@
|
||||
#define ZVOL_RW_READ_HELD RW_READ_HELD
|
||||
#endif
|
||||
|
||||
/*
 * State of the per-zvol GEOM worker, kept in zsg_state.
 */
enum zvol_geom_state {
	/* Value assigned when the minor is created. */
	ZVOL_GEOM_UNINIT,
	/* Set when the private pointer is cleared; makes the worker exit. */
	ZVOL_GEOM_STOPPED,
	/* Set by the worker on seeing STOPPED, just before it exits. */
	ZVOL_GEOM_RUNNING,
};
|
||||
|
||||
struct zvol_state_os {
|
||||
#define zso_dev _zso_state._zso_dev
|
||||
#define zso_geom _zso_state._zso_geom
|
||||
@ -131,9 +126,6 @@ struct zvol_state_os {
|
||||
/* volmode=geom */
|
||||
struct zvol_state_geom {
|
||||
struct g_provider *zsg_provider;
|
||||
struct bio_queue_head zsg_queue;
|
||||
struct mtx zsg_queue_mtx;
|
||||
enum zvol_geom_state zsg_state;
|
||||
} _zso_geom;
|
||||
} _zso_state;
|
||||
int zso_dying;
|
||||
@ -169,7 +161,7 @@ static d_close_t zvol_cdev_close;
|
||||
static d_ioctl_t zvol_cdev_ioctl;
|
||||
static d_read_t zvol_cdev_read;
|
||||
static d_write_t zvol_cdev_write;
|
||||
static d_strategy_t zvol_geom_bio_strategy;
|
||||
static d_strategy_t zvol_cdev_bio_strategy;
|
||||
static d_kqfilter_t zvol_cdev_kqfilter;
|
||||
|
||||
static struct cdevsw zvol_cdevsw = {
|
||||
@ -181,7 +173,7 @@ static struct cdevsw zvol_cdevsw = {
|
||||
.d_ioctl = zvol_cdev_ioctl,
|
||||
.d_read = zvol_cdev_read,
|
||||
.d_write = zvol_cdev_write,
|
||||
.d_strategy = zvol_geom_bio_strategy,
|
||||
.d_strategy = zvol_cdev_bio_strategy,
|
||||
.d_kqfilter = zvol_cdev_kqfilter,
|
||||
};
|
||||
|
||||
@ -205,13 +197,11 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
|
||||
|
||||
static int zvol_geom_open(struct g_provider *pp, int flag, int count);
|
||||
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
|
||||
static void zvol_geom_run(zvol_state_t *zv);
|
||||
static void zvol_geom_destroy(zvol_state_t *zv);
|
||||
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
|
||||
static void zvol_geom_worker(void *arg);
|
||||
static void zvol_geom_bio_start(struct bio *bp);
|
||||
static int zvol_geom_bio_getattr(struct bio *bp);
|
||||
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */
|
||||
static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync);
|
||||
|
||||
/*
|
||||
* GEOM mode implementation
|
||||
@ -419,20 +409,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
|
||||
return (0);
|
||||
}
|
||||
|
||||
static void
|
||||
zvol_geom_run(zvol_state_t *zv)
|
||||
{
|
||||
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
|
||||
struct g_provider *pp = zsg->zsg_provider;
|
||||
|
||||
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
|
||||
|
||||
g_error_provider(pp, 0);
|
||||
|
||||
kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
|
||||
"zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
|
||||
}
|
||||
|
||||
static void
|
||||
zvol_geom_destroy(zvol_state_t *zv)
|
||||
{
|
||||
@ -443,9 +419,6 @@ zvol_geom_destroy(zvol_state_t *zv)
|
||||
|
||||
g_topology_assert();
|
||||
|
||||
mutex_enter(&zv->zv_state_lock);
|
||||
VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
|
||||
mutex_exit(&zv->zv_state_lock);
|
||||
zsg->zsg_provider = NULL;
|
||||
g_wither_geom(pp->geom, ENXIO);
|
||||
}
|
||||
@ -516,44 +489,10 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
|
||||
return (error);
|
||||
}
|
||||
|
||||
static void
|
||||
zvol_geom_worker(void *arg)
|
||||
{
|
||||
zvol_state_t *zv = arg;
|
||||
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
|
||||
struct bio *bp;
|
||||
|
||||
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
|
||||
|
||||
thread_lock(curthread);
|
||||
sched_prio(curthread, PRIBIO);
|
||||
thread_unlock(curthread);
|
||||
|
||||
for (;;) {
|
||||
mtx_lock(&zsg->zsg_queue_mtx);
|
||||
bp = bioq_takefirst(&zsg->zsg_queue);
|
||||
if (bp == NULL) {
|
||||
if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
|
||||
zsg->zsg_state = ZVOL_GEOM_RUNNING;
|
||||
wakeup(&zsg->zsg_state);
|
||||
mtx_unlock(&zsg->zsg_queue_mtx);
|
||||
kthread_exit();
|
||||
}
|
||||
msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
|
||||
PRIBIO | PDROP, "zvol:io", 0);
|
||||
continue;
|
||||
}
|
||||
mtx_unlock(&zsg->zsg_queue_mtx);
|
||||
zvol_geom_bio_strategy(bp);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
zvol_geom_bio_start(struct bio *bp)
|
||||
{
|
||||
zvol_state_t *zv = bp->bio_to->private;
|
||||
struct zvol_state_geom *zsg;
|
||||
boolean_t first;
|
||||
|
||||
if (zv == NULL) {
|
||||
g_io_deliver(bp, ENXIO);
|
||||
@ -565,18 +504,8 @@ zvol_geom_bio_start(struct bio *bp)
|
||||
return;
|
||||
}
|
||||
|
||||
if (!THREAD_CAN_SLEEP()) {
|
||||
zsg = &zv->zv_zso->zso_geom;
|
||||
mtx_lock(&zsg->zsg_queue_mtx);
|
||||
first = (bioq_first(&zsg->zsg_queue) == NULL);
|
||||
bioq_insert_tail(&zsg->zsg_queue, bp);
|
||||
mtx_unlock(&zsg->zsg_queue_mtx);
|
||||
if (first)
|
||||
wakeup_one(&zsg->zsg_queue);
|
||||
return;
|
||||
}
|
||||
|
||||
zvol_geom_bio_strategy(bp);
|
||||
zvol_geom_bio_strategy(bp, !g_is_geom_thread(curthread) &&
|
||||
THREAD_CAN_SLEEP());
|
||||
}
|
||||
|
||||
static int
|
||||
@ -660,9 +589,10 @@ zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
|
||||
}
|
||||
|
||||
static void
|
||||
zvol_geom_bio_strategy(struct bio *bp)
|
||||
zvol_strategy_impl(zv_request_t *zvr)
|
||||
{
|
||||
zvol_state_t *zv;
|
||||
struct bio *bp;
|
||||
uint64_t off, volsize;
|
||||
size_t resid;
|
||||
char *addr;
|
||||
@ -673,11 +603,8 @@ zvol_geom_bio_strategy(struct bio *bp)
|
||||
boolean_t is_dumpified;
|
||||
boolean_t commit;
|
||||
|
||||
if (bp->bio_to)
|
||||
zv = bp->bio_to->private;
|
||||
else
|
||||
zv = bp->bio_dev->si_drv2;
|
||||
|
||||
bp = zvr->bio;
|
||||
zv = zvr->zv;
|
||||
if (zv == NULL) {
|
||||
error = SET_ERROR(ENXIO);
|
||||
goto out;
|
||||
@ -813,6 +740,63 @@ out:
|
||||
biofinish(bp, NULL, error);
|
||||
}
|
||||
|
||||
static void
|
||||
zvol_strategy_task(void *arg)
|
||||
{
|
||||
zv_request_task_t *task = arg;
|
||||
|
||||
zvol_strategy_impl(&task->zvr);
|
||||
zv_request_task_free(task);
|
||||
}
|
||||
|
||||
static void
|
||||
zvol_geom_bio_strategy(struct bio *bp, boolean_t sync)
|
||||
{
|
||||
zv_taskq_t *ztqs = &zvol_taskqs;
|
||||
zv_request_task_t *task;
|
||||
zvol_state_t *zv;
|
||||
uint_t tq_idx;
|
||||
uint_t taskq_hash;
|
||||
int error;
|
||||
|
||||
if (bp->bio_to)
|
||||
zv = bp->bio_to->private;
|
||||
else
|
||||
zv = bp->bio_dev->si_drv2;
|
||||
|
||||
if (zv == NULL) {
|
||||
error = SET_ERROR(ENXIO);
|
||||
if (bp->bio_to)
|
||||
g_io_deliver(bp, error);
|
||||
else
|
||||
biofinish(bp, NULL, error);
|
||||
return;
|
||||
}
|
||||
|
||||
zv_request_t zvr = {
|
||||
.zv = zv,
|
||||
.bio = bp,
|
||||
};
|
||||
|
||||
if (sync || zvol_request_sync) {
|
||||
zvol_strategy_impl(&zvr);
|
||||
return;
|
||||
}
|
||||
|
||||
taskq_hash = cityhash3((uintptr_t)zv, curcpu, bp->bio_offset >>
|
||||
ZVOL_TASKQ_OFFSET_SHIFT);
|
||||
tq_idx = taskq_hash % ztqs->tqs_cnt;
|
||||
task = zv_request_task_create(zvr);
|
||||
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_strategy_task, task,
|
||||
0, &task->ent);
|
||||
}
|
||||
|
||||
static void
|
||||
zvol_cdev_bio_strategy(struct bio *bp)
|
||||
{
|
||||
zvol_geom_bio_strategy(bp, B_FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Character device mode implementation
|
||||
*/
|
||||
@ -1352,7 +1336,6 @@ zvol_os_free(zvol_state_t *zv)
|
||||
g_topology_lock();
|
||||
zvol_geom_destroy(zv);
|
||||
g_topology_unlock();
|
||||
mtx_destroy(&zsg->zsg_queue_mtx);
|
||||
} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
|
||||
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
|
||||
struct cdev *dev = zsd->zsd_cdev;
|
||||
@ -1432,9 +1415,6 @@ zvol_os_create_minor(const char *name)
|
||||
struct g_provider *pp;
|
||||
struct g_geom *gp;
|
||||
|
||||
zsg->zsg_state = ZVOL_GEOM_UNINIT;
|
||||
mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);
|
||||
|
||||
g_topology_lock();
|
||||
gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
|
||||
gp->start = zvol_geom_bio_start;
|
||||
@ -1446,7 +1426,6 @@ zvol_os_create_minor(const char *name)
|
||||
pp->private = zv;
|
||||
|
||||
zsg->zsg_provider = pp;
|
||||
bioq_init(&zsg->zsg_queue);
|
||||
} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
|
||||
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
|
||||
struct cdev *dev;
|
||||
@ -1502,7 +1481,7 @@ out_dmu_objset_disown:
|
||||
dmu_objset_disown(os, B_TRUE, FTAG);
|
||||
|
||||
if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
|
||||
zvol_geom_run(zv);
|
||||
g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0);
|
||||
g_topology_unlock();
|
||||
}
|
||||
out_doi:
|
||||
@ -1529,14 +1508,7 @@ zvol_os_clear_private(zvol_state_t *zv)
|
||||
if (pp->private == NULL) /* already cleared */
|
||||
return;
|
||||
|
||||
mtx_lock(&zsg->zsg_queue_mtx);
|
||||
zsg->zsg_state = ZVOL_GEOM_STOPPED;
|
||||
pp->private = NULL;
|
||||
wakeup_one(&zsg->zsg_queue);
|
||||
while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
|
||||
msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
|
||||
0, "zvol:w", 0);
|
||||
mtx_unlock(&zsg->zsg_queue_mtx);
|
||||
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
|
||||
} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
|
||||
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
|
||||
@ -1606,8 +1578,7 @@ zvol_busy(void)
|
||||
int
|
||||
zvol_init(void)
|
||||
{
|
||||
zvol_init_impl();
|
||||
return (0);
|
||||
return (zvol_init_impl());
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -51,21 +51,13 @@ static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
|
||||
struct request *rq, boolean_t force_sync);
|
||||
|
||||
static unsigned int zvol_major = ZVOL_MAJOR;
|
||||
static unsigned int zvol_request_sync = 0;
|
||||
static unsigned int zvol_prefetch_bytes = (128 * 1024);
|
||||
static unsigned long zvol_max_discard_blocks = 16384;
|
||||
|
||||
/*
|
||||
* Switch taskq at multiple of 512 MB offset. This can be set to a lower value
|
||||
* to utilize more threads for small files but may affect prefetch hits.
|
||||
*/
|
||||
#define ZVOL_TASKQ_OFFSET_SHIFT 29
|
||||
|
||||
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
|
||||
static unsigned int zvol_open_timeout_ms = 1000;
|
||||
#endif
|
||||
|
||||
static unsigned int zvol_threads = 0;
|
||||
static unsigned int zvol_blk_mq_threads = 0;
|
||||
static unsigned int zvol_blk_mq_actual_threads;
|
||||
static boolean_t zvol_use_blk_mq = B_FALSE;
|
||||
@ -82,8 +74,6 @@ static boolean_t zvol_use_blk_mq = B_FALSE;
|
||||
*/
|
||||
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
|
||||
|
||||
static unsigned int zvol_num_taskqs = 0;
|
||||
|
||||
#ifndef BLKDEV_DEFAULT_RQ
|
||||
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
|
||||
#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
|
||||
@ -117,45 +107,8 @@ struct zvol_state_os {
|
||||
boolean_t use_blk_mq;
|
||||
};
|
||||
|
||||
typedef struct zv_taskq {
|
||||
uint_t tqs_cnt;
|
||||
taskq_t **tqs_taskq;
|
||||
} zv_taskq_t;
|
||||
static zv_taskq_t zvol_taskqs;
|
||||
static struct ida zvol_ida;
|
||||
|
||||
typedef struct zv_request_stack {
|
||||
zvol_state_t *zv;
|
||||
struct bio *bio;
|
||||
struct request *rq;
|
||||
} zv_request_t;
|
||||
|
||||
typedef struct zv_work {
|
||||
struct request *rq;
|
||||
struct work_struct work;
|
||||
} zv_work_t;
|
||||
|
||||
typedef struct zv_request_task {
|
||||
zv_request_t zvr;
|
||||
taskq_ent_t ent;
|
||||
} zv_request_task_t;
|
||||
|
||||
static zv_request_task_t *
|
||||
zv_request_task_create(zv_request_t zvr)
|
||||
{
|
||||
zv_request_task_t *task;
|
||||
task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
|
||||
taskq_init_ent(&task->ent);
|
||||
task->zvr = zvr;
|
||||
return (task);
|
||||
}
|
||||
|
||||
static void
|
||||
zv_request_task_free(zv_request_task_t *task)
|
||||
{
|
||||
kmem_free(task, sizeof (*task));
|
||||
}
|
||||
|
||||
/*
|
||||
* This is called when a new block multiqueue request comes in. A request
|
||||
* contains one or more BIOs.
|
||||
@ -1793,59 +1746,14 @@ zvol_init(void)
|
||||
{
|
||||
int error;
|
||||
|
||||
/*
|
||||
* zvol_threads is the module param the user passes in.
|
||||
*
|
||||
* zvol_actual_threads is what we use internally, since the user can
|
||||
* pass zvol_threads = 0 to mean "use all the CPUs" (the default).
|
||||
*/
|
||||
static unsigned int zvol_actual_threads;
|
||||
|
||||
if (zvol_threads == 0) {
|
||||
/*
|
||||
* See dde9380a1 for why 32 was chosen here. This should
|
||||
* probably be refined to be some multiple of the number
|
||||
* of CPUs.
|
||||
*/
|
||||
zvol_actual_threads = MAX(num_online_cpus(), 32);
|
||||
} else {
|
||||
zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
|
||||
error = zvol_init_impl();
|
||||
if (error) {
|
||||
printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error);
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Use at least 32 zvol_threads but for many core system,
|
||||
* prefer 6 threads per taskq, but no more taskqs
|
||||
* than threads in them on large systems.
|
||||
*
|
||||
* taskq total
|
||||
* cpus taskqs threads threads
|
||||
* ------- ------- ------- -------
|
||||
* 1 1 32 32
|
||||
* 2 1 32 32
|
||||
* 4 1 32 32
|
||||
* 8 2 16 32
|
||||
* 16 3 11 33
|
||||
* 32 5 7 35
|
||||
* 64 8 8 64
|
||||
* 128 11 12 132
|
||||
* 256 16 16 256
|
||||
*/
|
||||
zv_taskq_t *ztqs = &zvol_taskqs;
|
||||
uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
|
||||
if (num_tqs == 0) {
|
||||
num_tqs = 1 + num_online_cpus() / 6;
|
||||
while (num_tqs * num_tqs > zvol_actual_threads)
|
||||
num_tqs--;
|
||||
}
|
||||
uint_t per_tq_thread = zvol_actual_threads / num_tqs;
|
||||
if (per_tq_thread * num_tqs < zvol_actual_threads)
|
||||
per_tq_thread++;
|
||||
ztqs->tqs_cnt = num_tqs;
|
||||
ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
|
||||
error = register_blkdev(zvol_major, ZVOL_DRIVER);
|
||||
if (error) {
|
||||
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
|
||||
ztqs->tqs_taskq = NULL;
|
||||
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
|
||||
return (error);
|
||||
}
|
||||
@ -1864,25 +1772,6 @@ zvol_init(void)
|
||||
1024);
|
||||
}
|
||||
|
||||
for (uint_t i = 0; i < num_tqs; i++) {
|
||||
char name[32];
|
||||
(void) snprintf(name, sizeof (name), "%s_tq-%u",
|
||||
ZVOL_DRIVER, i);
|
||||
ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
|
||||
maxclsyspri, per_tq_thread, INT_MAX,
|
||||
TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
|
||||
if (ztqs->tqs_taskq[i] == NULL) {
|
||||
for (int j = i - 1; j >= 0; j--)
|
||||
taskq_destroy(ztqs->tqs_taskq[j]);
|
||||
unregister_blkdev(zvol_major, ZVOL_DRIVER);
|
||||
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
|
||||
sizeof (taskq_t *));
|
||||
ztqs->tqs_taskq = NULL;
|
||||
return (-ENOMEM);
|
||||
}
|
||||
}
|
||||
|
||||
zvol_init_impl();
|
||||
ida_init(&zvol_ida);
|
||||
return (0);
|
||||
}
|
||||
@ -1890,21 +1779,9 @@ zvol_init(void)
|
||||
void
|
||||
zvol_fini(void)
|
||||
{
|
||||
zv_taskq_t *ztqs = &zvol_taskqs;
|
||||
zvol_fini_impl();
|
||||
unregister_blkdev(zvol_major, ZVOL_DRIVER);
|
||||
|
||||
if (ztqs->tqs_taskq == NULL) {
|
||||
ASSERT3U(ztqs->tqs_cnt, ==, 0);
|
||||
} else {
|
||||
for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
|
||||
ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
|
||||
taskq_destroy(ztqs->tqs_taskq[i]);
|
||||
}
|
||||
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
|
||||
sizeof (taskq_t *));
|
||||
ztqs->tqs_taskq = NULL;
|
||||
}
|
||||
zvol_fini_impl();
|
||||
|
||||
ida_destroy(&zvol_ida);
|
||||
}
|
||||
@ -1915,19 +1792,9 @@ MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
|
||||
module_param(zvol_major, uint, 0444);
|
||||
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
|
||||
|
||||
module_param(zvol_threads, uint, 0444);
|
||||
MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
|
||||
"to 0 to use all active CPUs");
|
||||
|
||||
module_param(zvol_request_sync, uint, 0644);
|
||||
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
|
||||
|
||||
module_param(zvol_max_discard_blocks, ulong, 0444);
|
||||
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
|
||||
|
||||
module_param(zvol_num_taskqs, uint, 0444);
|
||||
MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
|
||||
|
||||
module_param(zvol_prefetch_bytes, uint, 0644);
|
||||
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
|
||||
|
||||
|
@ -90,11 +90,15 @@
|
||||
|
||||
unsigned int zvol_inhibit_dev = 0;
|
||||
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
|
||||
unsigned int zvol_threads = 0;
|
||||
unsigned int zvol_num_taskqs = 0;
|
||||
unsigned int zvol_request_sync = 0;
|
||||
|
||||
struct hlist_head *zvol_htable;
|
||||
static list_t zvol_state_list;
|
||||
krwlock_t zvol_state_lock;
|
||||
extern int zfs_bclone_wait_dirty;
|
||||
zv_taskq_t zvol_taskqs;
|
||||
|
||||
typedef enum {
|
||||
ZVOL_ASYNC_REMOVE_MINORS,
|
||||
@ -111,6 +115,22 @@ typedef struct {
|
||||
uint64_t value;
|
||||
} zvol_task_t;
|
||||
|
||||
zv_request_task_t *
|
||||
zv_request_task_create(zv_request_t zvr)
|
||||
{
|
||||
zv_request_task_t *task;
|
||||
task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
|
||||
taskq_init_ent(&task->ent);
|
||||
task->zvr = zvr;
|
||||
return (task);
|
||||
}
|
||||
|
||||
void
|
||||
zv_request_task_free(zv_request_task_t *task)
|
||||
{
|
||||
kmem_free(task, sizeof (*task));
|
||||
}
|
||||
|
||||
uint64_t
|
||||
zvol_name_hash(const char *name)
|
||||
{
|
||||
@ -2018,6 +2038,75 @@ zvol_init_impl(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
/*
|
||||
* zvol_threads is the module param the user passes in.
|
||||
*
|
||||
* zvol_actual_threads is what we use internally, since the user can
|
||||
* pass zvol_threads = 0 to mean "use all the CPUs" (the default).
|
||||
*/
|
||||
static unsigned int zvol_actual_threads;
|
||||
|
||||
if (zvol_threads == 0) {
|
||||
/*
|
||||
* See dde9380a1 for why 32 was chosen here. This should
|
||||
* probably be refined to be some multiple of the number
|
||||
* of CPUs.
|
||||
*/
|
||||
zvol_actual_threads = MAX(max_ncpus, 32);
|
||||
} else {
|
||||
zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
|
||||
}
|
||||
|
||||
/*
|
||||
* Use at least 32 zvol_threads but for many core system,
|
||||
* prefer 6 threads per taskq, but no more taskqs
|
||||
* than threads in them on large systems.
|
||||
*
|
||||
* taskq total
|
||||
* cpus taskqs threads threads
|
||||
* ------- ------- ------- -------
|
||||
* 1 1 32 32
|
||||
* 2 1 32 32
|
||||
* 4 1 32 32
|
||||
* 8 2 16 32
|
||||
* 16 3 11 33
|
||||
* 32 5 7 35
|
||||
* 64 8 8 64
|
||||
* 128 11 12 132
|
||||
* 256 16 16 256
|
||||
*/
|
||||
zv_taskq_t *ztqs = &zvol_taskqs;
|
||||
int num_tqs = MIN(max_ncpus, zvol_num_taskqs);
|
||||
if (num_tqs == 0) {
|
||||
num_tqs = 1 + max_ncpus / 6;
|
||||
while (num_tqs * num_tqs > zvol_actual_threads)
|
||||
num_tqs--;
|
||||
}
|
||||
|
||||
int per_tq_thread = zvol_actual_threads / num_tqs;
|
||||
if (per_tq_thread * num_tqs < zvol_actual_threads)
|
||||
per_tq_thread++;
|
||||
|
||||
ztqs->tqs_cnt = num_tqs;
|
||||
ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
|
||||
|
||||
for (uint_t i = 0; i < num_tqs; i++) {
|
||||
char name[32];
|
||||
(void) snprintf(name, sizeof (name), "%s_tq-%u",
|
||||
ZVOL_DRIVER, i);
|
||||
ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
|
||||
maxclsyspri, per_tq_thread, INT_MAX,
|
||||
TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
|
||||
if (ztqs->tqs_taskq[i] == NULL) {
|
||||
for (int j = i - 1; j >= 0; j--)
|
||||
taskq_destroy(ztqs->tqs_taskq[j]);
|
||||
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
|
||||
sizeof (taskq_t *));
|
||||
ztqs->tqs_taskq = NULL;
|
||||
return (SET_ERROR(ENOMEM));
|
||||
}
|
||||
}
|
||||
|
||||
list_create(&zvol_state_list, sizeof (zvol_state_t),
|
||||
offsetof(zvol_state_t, zv_next));
|
||||
rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);
|
||||
@ -2033,6 +2122,8 @@ zvol_init_impl(void)
|
||||
void
|
||||
zvol_fini_impl(void)
|
||||
{
|
||||
zv_taskq_t *ztqs = &zvol_taskqs;
|
||||
|
||||
zvol_remove_minors_impl(NULL);
|
||||
|
||||
/*
|
||||
@ -2046,4 +2137,23 @@ zvol_fini_impl(void)
|
||||
kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
|
||||
list_destroy(&zvol_state_list);
|
||||
rw_destroy(&zvol_state_lock);
|
||||
|
||||
if (ztqs->tqs_taskq == NULL) {
|
||||
ASSERT3U(ztqs->tqs_cnt, ==, 0);
|
||||
} else {
|
||||
for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
|
||||
ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
|
||||
taskq_destroy(ztqs->tqs_taskq[i]);
|
||||
}
|
||||
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
|
||||
sizeof (taskq_t *));
|
||||
ztqs->tqs_taskq = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
ZFS_MODULE_PARAM(zfs, , zvol_threads, UINT, ZMOD_RW,
|
||||
"Number of threads for I/O requests. Set to 0 to use all active CPUs");
|
||||
ZFS_MODULE_PARAM(zfs, , zvol_num_taskqs, UINT, ZMOD_RW,
|
||||
"Number of zvol taskqs");
|
||||
ZFS_MODULE_PARAM(zfs, , zvol_request_sync, UINT, ZMOD_RW,
|
||||
"Synchronously handle bio requests");
|
||||
|
@ -3386,17 +3386,21 @@ function set_tunable_impl
|
||||
|
||||
function save_tunable
|
||||
{
|
||||
[[ ! -d $TEST_BASE_DIR ]] && return 1
|
||||
[[ -e $TEST_BASE_DIR/tunable-$1 ]] && return 2
|
||||
echo "$(get_tunable """$1""")" > "$TEST_BASE_DIR"/tunable-"$1"
|
||||
if tunable_exists $1 ; then
|
||||
[[ ! -d $TEST_BASE_DIR ]] && return 1
|
||||
[[ -e $TEST_BASE_DIR/tunable-$1 ]] && return 2
|
||||
echo "$(get_tunable """$1""")" > "$TEST_BASE_DIR"/tunable-"$1"
|
||||
fi
|
||||
}
|
||||
|
||||
function restore_tunable
|
||||
{
|
||||
[[ ! -e $TEST_BASE_DIR/tunable-$1 ]] && return 1
|
||||
val="$(cat $TEST_BASE_DIR/tunable-"""$1""")"
|
||||
set_tunable64 "$1" "$val"
|
||||
rm $TEST_BASE_DIR/tunable-$1
|
||||
if tunable_exists $1 ; then
|
||||
[[ ! -e $TEST_BASE_DIR/tunable-$1 ]] && return 1
|
||||
val="$(cat $TEST_BASE_DIR/tunable-"""$1""")"
|
||||
set_tunable64 "$1" "$val"
|
||||
rm $TEST_BASE_DIR/tunable-$1
|
||||
fi
|
||||
}
|
||||
|
||||
#
|
||||
|
@ -102,6 +102,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip
|
||||
VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev
|
||||
VOL_MODE vol.mode zvol_volmode
|
||||
VOL_RECURSIVE vol.recursive UNSUPPORTED
|
||||
VOL_REQUEST_SYNC zvol_request_sync zvol_request_sync
|
||||
VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
|
||||
BCLONE_ENABLED bclone_enabled zfs_bclone_enabled
|
||||
BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty
|
||||
|
@ -140,3 +140,11 @@ function set_blk_mq
|
||||
log_must set_tunable32 VOL_USE_BLK_MQ $1
|
||||
fi
|
||||
}
|
||||
|
||||
# enable/disable zvol sync mode
|
||||
#
|
||||
# $1: 1 = enable, 0 = disable
|
||||
function set_zvol_sync
{
	# Write the requested mode into the VOL_REQUEST_SYNC tunable.
	typeset mode=$1

	log_must set_tunable32 VOL_REQUEST_SYNC $mode
}
|
||||
|
@ -60,6 +60,9 @@ typeset -f each_zvol_size=$(( floor($biggest_zvol_size_possible * 0.9 / \
|
||||
|
||||
typeset tmpdir="$(mktemp -t -d zvol_stress_fio_state.XXXXXX)"
|
||||
|
||||
log_must save_tunable VOL_USE_BLK_MQ
|
||||
log_must save_tunable VOL_REQUEST_SYNC
|
||||
|
||||
function create_zvols
|
||||
{
|
||||
log_note "Creating $num_zvols zvols that are ${each_zvol_size}B each"
|
||||
@ -124,7 +127,8 @@ function cleanup
|
||||
log_must zinject -c all
|
||||
log_must zpool clear $TESTPOOL
|
||||
destroy_zvols
|
||||
set_blk_mq 0
|
||||
log_must restore_tunable VOL_USE_BLK_MQ
|
||||
log_must restore_tunable VOL_REQUEST_SYNC
|
||||
|
||||
# Remove all fio's leftover state files
|
||||
if [ -n "$tmpdir" ] ; then
|
||||
@ -146,6 +150,18 @@ destroy_zvols
|
||||
set_blk_mq 1
|
||||
create_zvols
|
||||
do_zvol_stress
|
||||
destroy_zvols
|
||||
|
||||
# Disable zvol sync mode, and re-run test
|
||||
set_zvol_sync 0
|
||||
create_zvols
|
||||
do_zvol_stress
|
||||
destroy_zvols
|
||||
|
||||
# Same for enabled zvol sync mode
|
||||
set_zvol_sync 1
|
||||
create_zvols
|
||||
do_zvol_stress
|
||||
|
||||
# Inject some errors, and verify we see some IO errors in zpool status
|
||||
sync_pool $TESTPOOL
|
||||
|
Loading…
Reference in New Issue
Block a user