diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4 index 6f42b9812..29b0a2829 100644 --- a/config/kernel-blk-queue.m4 +++ b/config/kernel-blk-queue.m4 @@ -359,6 +359,36 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [ ]) ]) +dnl # +dnl # See if kernel supports block multi-queue and blk_status_t. +dnl # blk_status_t represents the new status codes introduced in the 4.13 +dnl # kernel patch: +dnl # +dnl # block: introduce new block status code type +dnl # +dnl # We do not currently support the "old" block multi-queue interfaces from +dnl # prior kernels. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [ + ZFS_LINUX_TEST_SRC([blk_mq], [ + #include <linux/blk-mq.h> + ], [ + struct blk_mq_tag_set tag_set __attribute__ ((unused)) = {0}; + (void) blk_mq_alloc_tag_set(&tag_set); + return BLK_STS_OK; + ], []) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [ + AC_MSG_CHECKING([whether block multiqueue with blk_status_t is available]) + ZFS_LINUX_TEST_RESULT([blk_mq], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [ ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI @@ -370,6 +400,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [ ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS + ZFS_AC_KERNEL_SRC_BLK_MQ ]) AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [ @@ -383,4 +414,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [ ZFS_AC_KERNEL_BLK_QUEUE_FLUSH ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS + ZFS_AC_KERNEL_BLK_MQ ]) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index fd91560a3..7964937a0 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -34,6 +34,11 @@ #include #include #include /* for SECTOR_* */ +#include + +#ifdef HAVE_BLK_MQ +#include <linux/blk-mq.h> +#endif #ifndef HAVE_BLK_QUEUE_FLAG_SET static inline void @@ -608,4 +613,110 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id) } #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ +/* + * All the io_*() helper functions below can operate on a bio, or a rq, but + * not both. The older submit_bio() codepath will pass a bio, and the + * newer blk-mq codepath will pass a rq.
+ */ +static inline int +io_data_dir(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) { + if (op_is_write(req_op(rq))) { + return (WRITE); + } else { + return (READ); + } + } +#else + ASSERT3P(rq, ==, NULL); +#endif + return (bio_data_dir(bio)); +} + +static inline int +io_is_flush(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (req_op(rq) == REQ_OP_FLUSH); +#else + ASSERT3P(rq, ==, NULL); +#endif + return (bio_is_flush(bio)); +} + +static inline int +io_is_discard(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (req_op(rq) == REQ_OP_DISCARD); +#else + ASSERT3P(rq, ==, NULL); +#endif + return (bio_is_discard(bio)); +} + +static inline int +io_is_secure_erase(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (req_op(rq) == REQ_OP_SECURE_ERASE); +#else + ASSERT3P(rq, ==, NULL); +#endif + return (bio_is_secure_erase(bio)); +} + +static inline int +io_is_fua(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (rq->cmd_flags & REQ_FUA); +#else + ASSERT3P(rq, ==, NULL); +#endif + return (bio_is_fua(bio)); +} + + +static inline uint64_t +io_offset(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (blk_rq_pos(rq) << 9); +#else + ASSERT3P(rq, ==, NULL); +#endif + return (BIO_BI_SECTOR(bio) << 9); +} + +static inline uint64_t +io_size(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (blk_rq_bytes(rq)); +#else + ASSERT3P(rq, ==, NULL); +#endif + return (BIO_BI_SIZE(bio)); +} + +static inline int +io_has_data(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (bio_has_data(rq->bio)); +#else + ASSERT3P(rq, ==, NULL); +#endif + return (bio_has_data(bio)); +} #endif /* _ZFS_BLKDEV_H */ diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index 439eec986..fe2b5c07a 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -69,9 +69,20 @@ typedef struct zfs_uio { uint16_t uio_fmode; uint16_t uio_extflg; ssize_t uio_resid; + size_t uio_skip; + + struct request *rq; + + /* + * Used for saving rq_for_each_segment() state between calls + * to zfs_uiomove_bvec_rq(). 
+ */ + struct req_iterator iter; + struct bio_vec bv; +} zfs_uio_t; + + #define zfs_uio_segflg(u) (u)->uio_segflg #define zfs_uio_offset(u) (u)->uio_loffset #define zfs_uio_resid(u) (u)->uio_resid @@ -116,17 +127,33 @@ zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov, } static inline void -zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio) +zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq) { - uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; - uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); - uio->uio_loffset = BIO_BI_SECTOR(bio) << 9; + /* Either bio or rq will be set, but not both */ + ASSERT3P(uio, !=, bio); + + if (bio) { + uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); + uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; + } else { + uio->uio_bvec = NULL; + uio->uio_iovcnt = 0; + memset(&uio->iter, 0, sizeof (uio->iter)); + } + + uio->uio_loffset = io_offset(bio, rq); uio->uio_segflg = UIO_BVEC; uio->uio_fault_disable = B_FALSE; uio->uio_fmode = 0; uio->uio_extflg = 0; - uio->uio_resid = BIO_BI_SIZE(bio); - uio->uio_skip = BIO_BI_SKIP(bio); + uio->uio_resid = io_size(bio, rq); + if (bio) { + uio->uio_skip = BIO_BI_SKIP(bio); + } else { + uio->uio_skip = 0; + } + + uio->rq = rq; } #if defined(HAVE_VFS_IOV_ITER) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index a086e1a5d..a7e5408e5 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2248,9 +2248,74 @@ for each I/O submitter. When unset, requests are handled asynchronously by a thread pool. The number of requests which can be handled concurrently is controlled by .Sy zvol_threads . +.Sy zvol_request_sync +is ignored when running on a kernel that supports block multiqueue +.Pq Li blk-mq . . -.It Sy zvol_threads Ns = Ns Sy 32 Pq uint -Max number of threads which can handle zvol I/O requests concurrently. +.It Sy zvol_threads Ns = Ns Sy 0 Pq uint +The number of system-wide threads to use for processing zvol block IOs. +If +.Sy 0 +(the default) then internally set +.Sy zvol_threads +to the number of CPUs present or 32 (whichever is greater). +. +.It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint +The number of threads per zvol to use for queuing IO requests. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only read and assigned to a zvol at zvol load time. +If +.Sy 0 +(the default) then internally set +.Sy zvol_blk_mq_threads +to the number of CPUs present. +. +.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint +Set to +.Sy 1 +to use the +.Li blk-mq +API for zvols. +Set to +.Sy 0 +(the default) to use the legacy zvol APIs. +This setting can give better or worse zvol performance depending on +the workload. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only read and assigned to a zvol at zvol load time. +. +.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint +If +.Sy zvol_use_blk_mq +is enabled, then process this number of +.Sy volblocksize Ns -sized blocks per zvol thread. +This tunable can be used to favor better performance for zvol reads (lower +values) or writes (higher values). +If set to +.Sy 0 , +then the zvol layer will process the maximum number of blocks +per thread that it can. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only applied at each zvol's load time. +. +.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint +The queue_depth value for the zvol +.Li blk-mq +interface. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only applied at each zvol's load time.
+If +.Sy 0 +(the default) then use the kernel's default queue depth. +Values are clamped to the kernel's +.Dv BLKDEV_MIN_RQ +and +.Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ +limits. . .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint Defines zvol block devices behaviour when diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index 4f31bcb59..abb6dbe67 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -126,7 +126,7 @@ zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) } static int -zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) +zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) { const struct bio_vec *bv = uio->uio_bvec; size_t skip = uio->uio_skip; @@ -137,10 +137,13 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) cnt = MIN(bv->bv_len - skip, n); paddr = zfs_kmap_atomic(bv->bv_page); - if (rw == UIO_READ) + if (rw == UIO_READ) { + /* Copy from buffer 'p' to the bvec data */ memcpy(paddr + bv->bv_offset + skip, p, cnt); - else + } else { + /* Copy from bvec data to buffer 'p' */ memcpy(p, paddr + bv->bv_offset + skip, cnt); + } zfs_kunmap_atomic(paddr); skip += cnt; @@ -158,6 +161,141 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) return (0); } +#ifdef HAVE_BLK_MQ +static void +zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw, + struct bio_vec *bv) +{ + void *paddr; + + paddr = zfs_kmap_atomic(bv->bv_page); + if (rw == UIO_READ) { + /* Copy from buffer 'p' to the bvec data */ + memcpy(paddr + bv->bv_offset + skip, p, cnt); + } else { + /* Copy from bvec data to buffer 'p' */ + memcpy(p, paddr + bv->bv_offset + skip, cnt); + } + zfs_kunmap_atomic(paddr); +} + +/* + * Copy 'n' bytes of data between the buffer p[] and the data represented + * by the request in the uio. + */ +static int +zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) +{ + struct request *rq = uio->rq; + struct bio_vec bv; + struct req_iterator iter; + size_t this_seg_start; /* logical offset */ + size_t this_seg_end; /* logical offset */ + size_t skip_in_seg; + size_t copy_from_seg; + size_t orig_loffset; + int copied = 0; + + /* + * Get the original logical offset of this entire request (because + * uio->uio_loffset will be modified over time). + */ + orig_loffset = io_offset(NULL, rq); + this_seg_start = orig_loffset; + + rq_for_each_segment(bv, rq, iter) { + if (uio->iter.bio) { + /* + * If uio->iter.bio is present, then we know we've saved + * uio->iter from a previous call to this function, and + * we can skip ahead in this rq_for_each_segment() loop + * to where we last left off. That way, we don't need + * to iterate over tons of segments we've already + * processed - we can just restore the "saved state". + */ + iter = uio->iter; + bv = uio->bv; + this_seg_start = uio->uio_loffset; + memset(&uio->iter, 0, sizeof (uio->iter)); + continue; + } + + /* + * Lookup what the logical offset of the last byte of this + * segment is. + */ + this_seg_end = this_seg_start + bv.bv_len - 1; + + /* + * We only need to operate on segments that have data we're + * copying. + */ + if (uio->uio_loffset >= this_seg_start && + uio->uio_loffset <= this_seg_end) { + /* + * Some, or all, of the data in this segment needs to be + * copied. + */ + + /* + * We may not be copying from the first byte in the + * segment. Figure out how many bytes to skip copying + * from the beginning of this segment.
+ */ + skip_in_seg = uio->uio_loffset - this_seg_start; + + /* + * Calculate the total number of bytes from this + * segment that we will be copying. + */ + copy_from_seg = MIN(bv.bv_len - skip_in_seg, n); + + /* Copy the bytes */ + zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv); + p = ((char *)p) + copy_from_seg; + + n -= copy_from_seg; + uio->uio_resid -= copy_from_seg; + uio->uio_loffset += copy_from_seg; + copied = 1; /* We copied some data */ + } + + if (n == 0) { + /* + * All done copying. Save our 'iter' value to the uio. + * This allows us to "save our state" and skip ahead in + * the rq_for_each_segment() loop the next time we + * call zfs_uiomove_bvec_rq() on this uio (which we + * will be doing for any remaining data in the uio). + */ + uio->iter = iter; /* make a copy of the struct data */ + uio->bv = bv; + return (0); + } + + this_seg_start = this_seg_end + 1; + } + + if (!copied) { + /* Didn't copy anything */ + uio->uio_resid = 0; + } + return (0); +} +#endif + +static int +zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) +{ +#ifdef HAVE_BLK_MQ + if (uio->rq != NULL) + return (zfs_uiomove_bvec_rq(p, n, rw, uio)); +#else + ASSERT3P(uio->rq, ==, NULL); +#endif + return (zfs_uiomove_bvec_impl(p, n, rw, uio)); +} + #if defined(HAVE_VFS_IOV_ITER) static int zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, @@ -300,8 +438,14 @@ zfs_uioskip(zfs_uio_t *uio, size_t n) { if (n > uio->uio_resid) return; - - if (uio->uio_segflg == UIO_BVEC) { + /* + * When using a uio with a struct request, we simply + * use uio_loffset as a pointer to the next logical byte to + * copy in the request. We don't have to do any fancy + * accounting with uio_bvec/uio_iovcnt since we don't use + * them. + */ + if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) { uio->uio_skip += n; while (uio->uio_iovcnt && uio->uio_skip >= uio->uio_bvec->bv_len) { diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 39441700a..acbab55d0 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -41,20 +41,77 @@ #include #include +#ifdef HAVE_BLK_MQ +#include <linux/blk-mq.h> +#endif + +static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, + struct request *rq, boolean_t force_sync); + static unsigned int zvol_major = ZVOL_MAJOR; static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; -static unsigned int zvol_threads = 32; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static const unsigned int zvol_open_timeout_ms = 1000; #endif +static unsigned int zvol_threads = 0; #ifdef HAVE_BLK_MQ +static unsigned int zvol_blk_mq_threads = 0; +static unsigned int zvol_blk_mq_actual_threads; +static boolean_t zvol_use_blk_mq = B_FALSE; + +/* + * The maximum number of volblocksize blocks to process per thread. Typically, + * write heavy workloads perform better with higher values here, and read + * heavy workloads perform better with lower values, but that's not a hard + * and fast rule. It's basically a knob to tune between "less overhead with + * less parallelism" and "more overhead, but more parallelism". + * + * '8' was chosen as a reasonable, balanced, default based off of sequential + * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
+ */ +static unsigned int zvol_blk_mq_blocks_per_thread = 8; +#endif + +#ifndef BLKDEV_DEFAULT_RQ +/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ +#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ +#endif + +/* + * Finalize our BIO or request. + */ +#ifdef HAVE_BLK_MQ +#define END_IO(zv, bio, rq, error) do { \ + if (bio) { \ + BIO_END_IO(bio, error); \ + } else { \ + blk_mq_end_request(rq, errno_to_bi_status(error)); \ + } \ +} while (0) +#else +#define END_IO(zv, bio, rq, error) BIO_END_IO(bio, error) +#endif + +#ifdef HAVE_BLK_MQ +static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; +static unsigned int zvol_actual_blk_mq_queue_depth; +#endif + struct zvol_state_os { struct gendisk *zvo_disk; /* generic disk */ struct request_queue *zvo_queue; /* request queue */ dev_t zvo_dev; /* device id */ + +#ifdef HAVE_BLK_MQ + struct blk_mq_tag_set tag_set; +#endif + + /* Set from the global 'zvol_use_blk_mq' at zvol load */ + boolean_t use_blk_mq; }; taskq_t *zvol_taskq; @@ -63,8 +120,14 @@ static struct ida zvol_ida; typedef struct zv_request_stack { zvol_state_t *zv; struct bio *bio; + struct request *rq; } zv_request_t; +typedef struct zv_work { + struct request *rq; + struct work_struct work; +} zv_work_t; + typedef struct zv_request_task { zv_request_t zvr; taskq_ent_t ent; @@ -86,6 +149,62 @@ zv_request_task_free(zv_request_task_t *task) kmem_free(task, sizeof (*task)); } +#ifdef HAVE_BLK_MQ + +/* + * This is called when a new block multiqueue request comes in. A request + * contains one or more BIOs. + */ +static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *rq = bd->rq; + zvol_state_t *zv = rq->q->queuedata; + + /* Tell the kernel that we are starting to process this request */ + blk_mq_start_request(rq); + + if (blk_rq_is_passthrough(rq)) { + /* Skip non filesystem request */ + blk_mq_end_request(rq, BLK_STS_IOERR); + return (BLK_STS_IOERR); + } + + zvol_request_impl(zv, NULL, rq, 0); + + /* Acknowledge to the kernel that we got this request */ + return (BLK_STS_OK); +} + +static struct blk_mq_ops zvol_blk_mq_queue_ops = { + .queue_rq = zvol_mq_queue_rq, +}; + +/* Initialize our blk-mq struct */ +static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) +{ + struct zvol_state_os *zso = zv->zv_zso; + + memset(&zso->tag_set, 0, sizeof (zso->tag_set)); + + /* Initialize tag set. */ + zso->tag_set.ops = &zvol_blk_mq_queue_ops; + zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; + zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; + zso->tag_set.numa_node = NUMA_NO_NODE; + zso->tag_set.cmd_size = 0; + + /* + * We need BLK_MQ_F_BLOCKING here since we do blocking calls in + * zvol_request_impl() + */ + zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; + zso->tag_set.driver_data = zv; + + return (blk_mq_alloc_tag_set(&zso->tag_set)); +} +#endif /* HAVE_BLK_MQ */ + /* * Given a path, return TRUE if path is a ZVOL. 
*/ @@ -107,38 +226,51 @@ static void zvol_write(zv_request_t *zvr) { struct bio *bio = zvr->bio; + struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; - - zfs_uio_bvec_init(&uio, bio); - zvol_state_t *zv = zvr->zv; + struct request_queue *q; + struct gendisk *disk; + unsigned long start_time = 0; + boolean_t acct = B_FALSE; + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); + q = zv->zv_zso->zvo_queue; + disk = zv->zv_zso->zvo_disk; + /* bio marked as FLUSH need to flush before write */ - if (bio_is_flush(bio)) + if (io_is_flush(bio, rq)) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* Some requests are just for flush and nothing else. */ - if (uio.uio_resid == 0) { + if (io_size(bio, rq) == 0) { rw_exit(&zv->zv_suspend_lock); - BIO_END_IO(bio, 0); + END_IO(zv, bio, rq, 0); return; } - struct request_queue *q = zv->zv_zso->zvo_queue; - struct gendisk *disk = zv->zv_zso->zvo_disk; - ssize_t start_resid = uio.uio_resid; - unsigned long start_time; + zfs_uio_bvec_init(&uio, bio, rq); - boolean_t acct = blk_queue_io_stat(q); - if (acct) - start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); + ssize_t start_resid = uio.uio_resid; + + /* + * With use_blk_mq, accounting is done by blk_mq_start_request() + * and blk_mq_end_request(), so we can skip it here. + */ + if (bio) { + acct = blk_queue_io_stat(q); + if (acct) { + start_time = blk_generic_start_io_acct(q, disk, WRITE, + bio); + } + } boolean_t sync = - bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_WRITER); @@ -180,10 +312,11 @@ zvol_write(zv_request_t *zvr) rw_exit(&zv->zv_suspend_lock); - if (acct) + if (bio && acct) { blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); + } - BIO_END_IO(bio, -error); + END_IO(zv, bio, rq, -error); } static void @@ -198,27 +331,33 @@ static void zvol_discard(zv_request_t *zvr) { struct bio *bio = zvr->bio; + struct request *rq = zvr->rq; zvol_state_t *zv = zvr->zv; - uint64_t start = BIO_BI_SECTOR(bio) << 9; - uint64_t size = BIO_BI_SIZE(bio); + uint64_t start = io_offset(bio, rq); + uint64_t size = io_size(bio, rq); uint64_t end = start + size; boolean_t sync; int error = 0; dmu_tx_t *tx; + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; + unsigned long start_time = 0; + + boolean_t acct = blk_queue_io_stat(q); ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); - struct request_queue *q = zv->zv_zso->zvo_queue; - struct gendisk *disk = zv->zv_zso->zvo_disk; - unsigned long start_time; + if (bio) { + acct = blk_queue_io_stat(q); + if (acct) { + start_time = blk_generic_start_io_acct(q, disk, WRITE, + bio); + } + } - boolean_t acct = blk_queue_io_stat(q); - if (acct) - start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); - - sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; if (end > zv->zv_volsize) { error = SET_ERROR(EIO); @@ -231,7 +370,7 @@ zvol_discard(zv_request_t *zvr) * the unaligned parts which is slow (read-modify-write) and useless * since we are not freeing any space by doing so. 
*/ - if (!bio_is_secure_erase(bio)) { + if (!io_is_secure_erase(bio, rq)) { start = P2ROUNDUP(start, zv->zv_volblocksize); end = P2ALIGN(end, zv->zv_volblocksize); size = end - start; @@ -262,10 +401,12 @@ zvol_discard(zv_request_t *zvr) unlock: rw_exit(&zv->zv_suspend_lock); - if (acct) - blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); + if (bio && acct) { + blk_generic_end_io_acct(q, disk, WRITE, bio, + start_time); + } - BIO_END_IO(bio, -error); + END_IO(zv, bio, rq, -error); } static void @@ -280,28 +421,41 @@ static void zvol_read(zv_request_t *zvr) { struct bio *bio = zvr->bio; + struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; - - zfs_uio_bvec_init(&uio, bio); - + boolean_t acct = B_FALSE; zvol_state_t *zv = zvr->zv; + struct request_queue *q; + struct gendisk *disk; + unsigned long start_time = 0; + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); - struct request_queue *q = zv->zv_zso->zvo_queue; - struct gendisk *disk = zv->zv_zso->zvo_disk; - ssize_t start_resid = uio.uio_resid; - unsigned long start_time; + zfs_uio_bvec_init(&uio, bio, rq); - boolean_t acct = blk_queue_io_stat(q); - if (acct) - start_time = blk_generic_start_io_acct(q, disk, READ, bio); + q = zv->zv_zso->zvo_queue; + disk = zv->zv_zso->zvo_disk; + + ssize_t start_resid = uio.uio_resid; + + /* + * When blk-mq is being used, accounting is done by + * blk_mq_start_request() and blk_mq_end_request(). + */ + if (bio) { + acct = blk_queue_io_stat(q); + if (acct) + start_time = blk_generic_start_io_acct(q, disk, READ, + bio); + } zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_READER); uint64_t volsize = zv->zv_volsize; + while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); @@ -325,10 +479,11 @@ zvol_read(zv_request_t *zvr) rw_exit(&zv->zv_suspend_lock); - if (acct) + if (bio && acct) { blk_generic_end_io_acct(q, disk, READ, bio, start_time); + } - BIO_END_IO(bio, -error); + END_IO(zv, bio, rq, -error); } static void @@ -339,52 +494,49 @@ zvol_read_task(void *arg) zv_request_task_free(task); } -#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS -#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID + +/* + * Process a BIO or request + * + * Either 'bio' or 'rq' should be set depending on if we are processing a + * bio or a request (both should not be set). 
+ * + * force_sync: Set to 0 to defer processing to a background taskq + * Set to 1 to process data synchronously + */ static void -zvol_submit_bio(struct bio *bio) -#else -static blk_qc_t -zvol_submit_bio(struct bio *bio) -#endif -#else -static MAKE_REQUEST_FN_RET -zvol_request(struct request_queue *q, struct bio *bio) -#endif +zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, + boolean_t force_sync) { -#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS -#if defined(HAVE_BIO_BDEV_DISK) - struct request_queue *q = bio->bi_bdev->bd_disk->queue; -#else - struct request_queue *q = bio->bi_disk->queue; -#endif -#endif - zvol_state_t *zv = q->queuedata; fstrans_cookie_t cookie = spl_fstrans_mark(); - uint64_t offset = BIO_BI_SECTOR(bio) << 9; - uint64_t size = BIO_BI_SIZE(bio); - int rw = bio_data_dir(bio); + uint64_t offset = io_offset(bio, rq); + uint64_t size = io_size(bio, rq); + int rw = io_data_dir(bio, rq); - if (bio_has_data(bio) && offset + size > zv->zv_volsize) { - printk(KERN_INFO - "%s: bad access: offset=%llu, size=%lu\n", - zv->zv_zso->zvo_disk->disk_name, - (long long unsigned)offset, - (long unsigned)size); - - BIO_END_IO(bio, -SET_ERROR(EIO)); - goto out; - } + if (zvol_request_sync) + force_sync = 1; zv_request_t zvr = { .zv = zv, .bio = bio, + .rq = rq, }; + + if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { + printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", + zv->zv_zso->zvo_disk->disk_name, + (long long unsigned)offset, + (long unsigned)size); + + END_IO(zv, bio, rq, -SET_ERROR(EIO)); + goto out; + } + zv_request_task_t *task; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { - BIO_END_IO(bio, -SET_ERROR(EROFS)); + END_IO(zv, bio, rq, -SET_ERROR(EROFS)); goto out; } @@ -421,7 +573,7 @@ zvol_request(struct request_queue *q, struct bio *bio) * i/o may be a ZIL write (via zil_commit()), or a read of an * indirect block, or a read of a data block (if this is a * partial-block write). We will indicate that the i/o is - * complete by calling BIO_END_IO() from the taskq callback. + * complete by calling END_IO() from the taskq callback. * * This design allows the calling thread to continue and * initiate more concurrent operations by calling @@ -441,12 +593,12 @@ zvol_request(struct request_queue *q, struct bio *bio) * of one i/o at a time per zvol. However, an even better * design would be for zvol_request() to initiate the zio * directly, and then be notified by the zio_done callback, - * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL + * which would call END_IO(). Unfortunately, the DMU/ZIL * interfaces lack this functionality (they block waiting for * the i/o to complete). */ - if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { - if (zvol_request_sync) { + if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { + if (force_sync) { zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); @@ -454,7 +606,7 @@ zvol_request(struct request_queue *q, struct bio *bio) zvol_discard_task, task, 0, &task->ent); } } else { - if (zvol_request_sync) { + if (force_sync) { zvol_write(&zvr); } else { task = zv_request_task_create(zvr); @@ -469,14 +621,14 @@ zvol_request(struct request_queue *q, struct bio *bio) * data and require no additional handling. */ if (size == 0) { - BIO_END_IO(bio, 0); + END_IO(zv, bio, rq, 0); goto out; } rw_enter(&zv->zv_suspend_lock, RW_READER); /* See comment in WRITE case above. 
*/ - if (zvol_request_sync) { + if (force_sync) { zvol_read(&zvr); } else { task = zv_request_task_create(zvr); @@ -487,8 +639,33 @@ zvol_request(struct request_queue *q, struct bio *bio) out: spl_fstrans_unmark(cookie); -#if (defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ - defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)) && \ +} + +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS +#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID +static void +zvol_submit_bio(struct bio *bio) +#else +static blk_qc_t +zvol_submit_bio(struct bio *bio) +#endif +#else +static MAKE_REQUEST_FN_RET +zvol_request(struct request_queue *q, struct bio *bio) +#endif +{ +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS +#if defined(HAVE_BIO_BDEV_DISK) + struct request_queue *q = bio->bi_bdev->bd_disk->queue; +#else + struct request_queue *q = bio->bi_disk->queue; +#endif +#endif + zvol_state_t *zv = q->queuedata; + + zvol_request_impl(zv, bio, NULL, 0); +#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ + defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) return (BLK_QC_T_NONE); #endif @@ -805,6 +982,27 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) return (0); } +/* + * Why have two separate block_device_operations structs? + * + * Normally we'd just have one, and assign 'submit_bio' as needed. However, + * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we + * can't just change submit_bio dynamically at runtime. So just create two + * separate structs to get around this. + */ +static const struct block_device_operations zvol_ops_blk_mq = { + .open = zvol_open, + .release = zvol_release, + .ioctl = zvol_ioctl, + .compat_ioctl = zvol_compat_ioctl, + .check_events = zvol_check_events, +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK + .revalidate_disk = zvol_revalidate_disk, +#endif + .getgeo = zvol_getgeo, + .owner = THIS_MODULE, +}; + static const struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, @@ -821,6 +1019,87 @@ static const struct block_device_operations zvol_ops = { #endif }; +static int +zvol_alloc_non_blk_mq(struct zvol_state_os *zso) +{ +#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) +#if defined(HAVE_BLK_ALLOC_DISK) + zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); + if (zso->zvo_disk == NULL) + return (1); + + zso->zvo_disk->minors = ZVOL_MINORS; + zso->zvo_queue = zso->zvo_disk->queue; +#else + zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); + if (zso->zvo_queue == NULL) + return (1); + + zso->zvo_disk = alloc_disk(ZVOL_MINORS); + if (zso->zvo_disk == NULL) { + blk_cleanup_queue(zso->zvo_queue); + return (1); + } + + zso->zvo_disk->queue = zso->zvo_queue; +#endif /* HAVE_BLK_ALLOC_DISK */ +#else + zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); + if (zso->zvo_queue == NULL) + return (1); + + zso->zvo_disk = alloc_disk(ZVOL_MINORS); + if (zso->zvo_disk == NULL) { + blk_cleanup_queue(zso->zvo_queue); + return (1); + } + + zso->zvo_disk->queue = zso->zvo_queue; +#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ + return (0); + +} + +static int +zvol_alloc_blk_mq(zvol_state_t *zv) +{ +#ifdef HAVE_BLK_MQ + struct zvol_state_os *zso = zv->zv_zso; + + /* Allocate our blk-mq tag_set */ + if (zvol_blk_mq_alloc_tag_set(zv) != 0) + return (1); + +#if defined(HAVE_BLK_ALLOC_DISK) + zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); + if (zso->zvo_disk == NULL) { + blk_mq_free_tag_set(&zso->tag_set); + return (1); + } + zso->zvo_queue = 
zso->zvo_disk->queue; + zso->zvo_disk->minors = ZVOL_MINORS; +#else + zso->zvo_disk = alloc_disk(ZVOL_MINORS); + if (zso->zvo_disk == NULL) { + blk_cleanup_queue(zso->zvo_queue); + blk_mq_free_tag_set(&zso->tag_set); + return (1); + } + /* Allocate queue */ + zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); + if (IS_ERR(zso->zvo_queue)) { + blk_mq_free_tag_set(&zso->tag_set); + return (1); + } + + /* Our queue is now created, assign it to our disk */ + zso->zvo_disk->queue = zso->zvo_queue; + +#endif +#endif + return (0); +} + /* * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. @@ -831,6 +1110,7 @@ zvol_alloc(dev_t dev, const char *name) zvol_state_t *zv; struct zvol_state_os *zso; uint64_t volmode; + int ret; if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) return (NULL); @@ -849,48 +1129,44 @@ zvol_alloc(dev_t dev, const char *name) list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); -#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS -#ifdef HAVE_BLK_ALLOC_DISK - zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); - if (zso->zvo_disk == NULL) - goto out_kmem; +#ifdef HAVE_BLK_MQ + zv->zv_zso->use_blk_mq = zvol_use_blk_mq; +#endif - zso->zvo_disk->minors = ZVOL_MINORS; - zso->zvo_queue = zso->zvo_disk->queue; -#else - zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); - if (zso->zvo_queue == NULL) - goto out_kmem; - - zso->zvo_disk = alloc_disk(ZVOL_MINORS); - if (zso->zvo_disk == NULL) { - blk_cleanup_queue(zso->zvo_queue); - goto out_kmem; + /* + * The block layer has 3 interfaces for getting BIOs: + * + * 1. blk-mq request queues (new) + * 2. submit_bio() (oldest) + * 3. regular request queues (old). + * + * Each of those interfaces has two permutations: + * + * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates + * both the disk and its queue (5.14 kernel or newer) + * + * b) We don't have blk_*alloc_disk(), and have to allocate the + * disk and the queue separately. (5.13 kernel or older) + */ + if (zv->zv_zso->use_blk_mq) { + ret = zvol_alloc_blk_mq(zv); + zso->zvo_disk->fops = &zvol_ops_blk_mq; + } else { + ret = zvol_alloc_non_blk_mq(zso); + zso->zvo_disk->fops = &zvol_ops; } - - zso->zvo_disk->queue = zso->zvo_queue; -#endif /* HAVE_BLK_ALLOC_DISK */ -#else - zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); - if (zso->zvo_queue == NULL) + if (ret != 0) goto out_kmem; - zso->zvo_disk = alloc_disk(ZVOL_MINORS); - if (zso->zvo_disk == NULL) { - blk_cleanup_queue(zso->zvo_queue); - goto out_kmem; - } - - zso->zvo_disk->queue = zso->zvo_queue; -#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ - blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); - /* Disable write merging in favor of the ZIO pipeline. */ - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); + if (!zv->zv_zso->use_blk_mq) { + /* Disable write merging in favor of the ZIO pipeline. 
*/ + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); + } /* Enable /proc/diskstats */ blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); @@ -918,7 +1194,6 @@ zvol_alloc(dev_t dev, const char *name) } zso->zvo_disk->first_minor = (dev & MINORMASK); - zso->zvo_disk->fops = &zvol_ops; zso->zvo_disk->private_data = zv; snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); @@ -963,6 +1238,11 @@ zvol_os_free(zvol_state_t *zv) put_disk(zv->zv_zso->zvo_disk); #endif +#ifdef HAVE_BLK_MQ + if (zv->zv_zso->use_blk_mq) + blk_mq_free_tag_set(&zv->zv_zso->tag_set); +#endif + ida_simple_remove(&zvol_ida, MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); @@ -1044,8 +1324,69 @@ zvol_os_create_minor(const char *name) blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, (DMU_MAX_ACCESS / 4) >> 9); - blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); - blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); + + if (zv->zv_zso->use_blk_mq) { + /* + * IO requests can be really big (1MB). When an IO request + * comes in, it is passed off to zvol_read() or zvol_write() + * in a new thread, where it is chunked up into 'volblocksize' + * sized pieces and processed. So for example, if the request + * is a 1MB write and your volblocksize is 128k, one zvol_write + * thread will take that request and sequentially do eight 128k + * IOs. This is due to the fact that the thread needs to lock + * each volblocksize sized block. So you might be wondering: + * "instead of passing the whole 1MB request to one thread, + * why not pass eight individual 128k chunks to eight threads and + * process the whole write in parallel?" The short answer is + * that there's a sweet spot number of chunks that balances + * the greater parallelism with the added overhead of more + * threads. The sweet spot can be different depending on if you + * have a read or write heavy workload. Writes typically want + * high chunk counts while reads typically want lower ones. On + * a test pool with 6 NVMe drives in a 3x 2-disk mirror + * configuration, with volblocksize=8k, the sweet spot for good + * sequential reads and writes was at 8 chunks. + */ + + /* + * Below we tell the kernel how big we want our requests + * to be. You would think that blk_queue_io_opt() would be + * used to do this since it is used to "set optimal request + * size for the queue", but that doesn't seem to do + * anything - the kernel still gives you huge requests + * with tons of little PAGE_SIZE segments contained within it. + * + * Knowing that the kernel will just give you PAGE_SIZE segments + * no matter what, you can say "ok, I want PAGE_SIZE byte + * segments, and I want 'N' of them per request", where N is + * the correct number of segments for the volblocksize and + * number of chunks you want. + */ +#ifdef HAVE_BLK_MQ + if (zvol_blk_mq_blocks_per_thread != 0) { + unsigned int chunks; + chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); + + blk_queue_max_segment_size(zv->zv_zso->zvo_queue, + PAGE_SIZE); + blk_queue_max_segments(zv->zv_zso->zvo_queue, + (zv->zv_volblocksize * chunks) / PAGE_SIZE); + } else { + /* + * Special case: zvol_blk_mq_blocks_per_thread = 0 + * Max everything out.
+ */ + blk_queue_max_segments(zv->zv_zso->zvo_queue, + UINT16_MAX); + blk_queue_max_segment_size(zv->zv_zso->zvo_queue, + UINT_MAX); + } +#endif + } else { + blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); + blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); + } + blk_queue_physical_block_size(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); @@ -1167,19 +1508,54 @@ int zvol_init(void) { int error; - int threads = MIN(MAX(zvol_threads, 1), 1024); + + /* + * zvol_threads is the module param the user passes in. + * + * zvol_actual_threads is what we use internally, since the user can + * pass zvol_threads = 0 to mean "use all the CPUs" (the default). + */ + static unsigned int zvol_actual_threads; + + if (zvol_threads == 0) { + /* + * See dde9380a1 for why 32 was chosen here. This should + * probably be refined to be some multiple of the number + * of CPUs. + */ + zvol_actual_threads = MAX(num_online_cpus(), 32); + } else { + zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); + } error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } - zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri, - threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + +#ifdef HAVE_BLK_MQ + if (zvol_blk_mq_queue_depth == 0) { + zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; + } else { + zvol_actual_blk_mq_queue_depth = + MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); + } + + if (zvol_blk_mq_threads == 0) { + zvol_blk_mq_actual_threads = num_online_cpus(); + } else { + zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), + 1024); + } +#endif + zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, + zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (zvol_taskq == NULL) { unregister_blkdev(zvol_major, ZVOL_DRIVER); return (-ENOMEM); } + zvol_init_impl(); ida_init(&zvol_ida); return (0); @@ -1202,7 +1578,8 @@ module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_threads, uint, 0444); -MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests"); +MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests.
Set " + "to 0 to use all active CPUs"); module_param(zvol_request_sync, uint, 0644); MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); @@ -1215,4 +1592,17 @@ MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); module_param(zvol_volmode, uint, 0644); MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); + +#ifdef HAVE_BLK_MQ +module_param(zvol_blk_mq_queue_depth, uint, 0644); +MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); + +module_param(zvol_use_blk_mq, uint, 0644); +MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); + +module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); +MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, + "Process volblocksize blocks per thread"); +#endif + /* END CSTYLED */ diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 243221598..89ee0d3cb 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -937,9 +937,13 @@ tags = ['functional', 'zvol', 'zvol_cli'] [tests/functional/zvol/zvol_misc] tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse', - 'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil'] + 'zvol_misc_snapdev', 'zvol_misc_trim', 'zvol_misc_volmode', 'zvol_misc_zil'] tags = ['functional', 'zvol', 'zvol_misc'] +[tests/functional/zvol/zvol_stress] +tests = ['zvol_stress'] +tags = ['functional', 'zvol', 'zvol_stress'] + [tests/functional/zvol/zvol_swap] tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos'] tags = ['functional', 'zvol', 'zvol_swap'] diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 3985da146..fa71f412b 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -184,3 +184,8 @@ tags = ['functional', 'user_namespace'] tests = ['groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos', 'userquota_013_pos', 'userspace_003_pos'] tags = ['functional', 'userquota'] + +[tests/functional/zvol/zvol_misc:Linux] tests = ['zvol_misc_fua'] +tags = ['functional', 'zvol', 'zvol_misc'] + diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 99430bc10..1ee786d13 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -120,10 +120,12 @@ export SYSTEM_FILES_FREEBSD='chflags showmount swapctl sysctl + trim uncompress' export SYSTEM_FILES_LINUX='attr blkid + blkdiscard blockdev chattr exportfs diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 51d4e225f..cb20318f4 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -2770,20 +2770,22 @@ function is_te_enabled svcs -H -o state labeld 2>/dev/null | grep -q "enabled" } +# Return the number of CPUs (cross-platform) +function get_num_cpus +{ + if is_linux ; then + grep -c '^processor' /proc/cpuinfo + elif is_freebsd; then + sysctl -n kern.smp.cpus + else + psrinfo | wc -l + fi +} + # Utility function to determine if a system has multiple cpus.
function is_mp { - case "$UNAME" in - Linux) - (($(grep -c '^processor' /proc/cpuinfo) > 1)) - ;; - FreeBSD) - sysctl -n kern.smp.cpus - ;; - *) - (($(psrinfo | wc -l) > 1)) - ;; - esac + [[ $(get_num_cpus) -gt 1 ]] } function get_cpu_freq @@ -3320,14 +3322,23 @@ function get_tunable_impl { typeset name="$1" typeset module="${2:-zfs}" + typeset check_only="$3" eval "typeset tunable=\$$name" case "$tunable" in UNSUPPORTED) - log_unsupported "Tunable '$name' is unsupported on $UNAME" + if [ -z "$check_only" ] ; then + log_unsupported "Tunable '$name' is unsupported on $UNAME" + else + return 1 + fi ;; "") - log_fail "Tunable '$name' must be added to tunables.cfg" + if [ -z "$check_only" ] ; then + log_fail "Tunable '$name' must be added to tunables.cfg" + else + return 1 + fi ;; *) ;; @@ -3347,6 +3358,14 @@ function get_tunable_impl esac } +# Does a tunable exist? +# +# $1: Tunable name +function tunable_exists +{ + get_tunable_impl $1 "zfs" 1 +} + # # Compute MD5 digest for given file or stdin if no file given. # Note: file path must not contain spaces diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index d3838cb7c..d6a2fe5db 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -87,6 +87,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED +VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index ffc087351..d759e5196 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1966,11 +1966,16 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/zvol/zvol_misc/zvol_misc_004_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_005_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_006_pos.ksh \ + functional/zvol/zvol_misc/zvol_misc_fua.ksh \ functional/zvol/zvol_misc/zvol_misc_hierarchy.ksh \ functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh \ functional/zvol/zvol_misc/zvol_misc_snapdev.ksh \ + functional/zvol/zvol_misc/zvol_misc_trim.ksh \ functional/zvol/zvol_misc/zvol_misc_volmode.ksh \ functional/zvol/zvol_misc/zvol_misc_zil.ksh \ + functional/zvol/zvol_stress/cleanup.ksh \ + functional/zvol/zvol_stress/setup.ksh \ + functional/zvol/zvol_stress/zvol_stress.ksh \ functional/zvol/zvol_swap/cleanup.ksh \ functional/zvol/zvol_swap/setup.ksh \ functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib b/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib index c0fd90f58..c04559fe3 100644 --- a/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib +++ b/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib @@ -128,3 +128,14 @@ function is_zvol_dumpified zdb -dddd $volume 2 | grep -q "dumpsize" } + +# enable/disable blk-mq (if available) +# +# $1: 1 = enable, 0 = disable +function set_blk_mq +{ + # Not all kernels support blk-mq + if tunable_exists VOL_USE_BLK_MQ ; then + log_must set_tunable32 VOL_USE_BLK_MQ $1 + fi +} diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh new file mode 100755 index 000000000..e44107030 --- /dev/null +++ 
b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh @@ -0,0 +1,96 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/zvol/zvol_common.shlib + +# +# DESCRIPTION: +# Verify that a zvol Force Unit Access (FUA) write works. +# +# STRATEGY: +# 1. dd write 5MB of data with "oflag=dsync,direct" to a zvol. Those flags +# together do a FUA write. +# 2. Verify the data is correct. +# 3. Repeat 1-2 for both the blk-mq and non-blk-mq cases. + +verify_runnable "global" + +if ! is_physical_device $DISKS; then + log_unsupported "This directory cannot be run on raw files." +fi + +if ! is_linux ; then + log_unsupported "Only Linux supports dd with oflag=dsync for FUA writes" +fi + +typeset datafile1="$(mktemp zvol_misc_fua1.XXXXXX)" +typeset datafile2="$(mktemp zvol_misc_fua2.XXXXXX)" +typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL + +function cleanup +{ + rm "$datafile1" "$datafile2" +} + +function do_test { + # Wait for udev to create symlinks to our zvol + block_device_wait $zvolpath + + # Create a data file + log_must dd if=/dev/urandom of="$datafile1" bs=1M count=5 + + # Write the data to our zvol using FUA + log_must dd if=$datafile1 of=$zvolpath oflag=dsync,direct bs=1M count=5 + + # Extract data from our zvol + log_must dd if=$zvolpath of="$datafile2" bs=1M count=5 + + # Compare the data we expect with what's on our zvol. diff will return + # non-zero if they differ. + log_must diff $datafile1 $datafile2 + + log_must rm $datafile1 $datafile2 +} + +log_assert "Verify that a ZFS volume can do Force Unit Access (FUA)" +log_onexit cleanup + +log_must zfs set compression=off $TESTPOOL/$TESTVOL + +log_note "Testing without blk-mq" + +set_blk_mq 0 +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +do_test + +set_blk_mq 1 +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +do_test + +log_pass "ZFS volume FUA works" diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh new file mode 100755 index 000000000..2e417a0e6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh @@ -0,0 +1,136 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/math.shlib +. $STF_SUITE/tests/functional/zvol/zvol_common.shlib + +# +# DESCRIPTION: +# Verify we can TRIM a zvol +# +# STRATEGY: +# 1. TRIM the entire zvol to remove data from older tests +# 2. Create a 5MB data file +# 3. Write the file to the zvol +# 4. Observe 5MB of used space on the zvol +# 5. TRIM the first 1MB and last 2MB of the 5MB block of data. +# 6. Observe 2MB of used space on the zvol +# 7. Verify the trimmed regions are zero'd on the zvol + +verify_runnable "global" + +if is_linux ; then + # We need '--force' here since the prior tests may leave a filesystem + # on the zvol, and blkdiscard will see that filesystem and print a + # warning unless you force it. + # + # Only blkdiscard >= v2.36 supports --force, so we need to + # check for it. + if blkdiscard --help | grep -q '\-\-force' ; then + trimcmd='blkdiscard --force' + else + trimcmd='blkdiscard' + fi +else + # By default, FreeBSD 'trim' always does a dry-run. '-f' makes + # it perform the actual operation. + trimcmd='trim -f' +fi + +if ! is_physical_device $DISKS; then + log_unsupported "This directory cannot be run on raw files." +fi + +typeset datafile1="$(mktemp zvol_misc_flags1.XXXXXX)" +typeset datafile2="$(mktemp zvol_misc_flags2.XXXXXX)" +typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL + +function cleanup +{ + rm "$datafile1" "$datafile2" +} + +function do_test { + # Wait for udev to create symlinks to our zvol + block_device_wait $zvolpath + + # Create a data file + log_must dd if=/dev/urandom of="$datafile1" bs=1M count=5 + + # Write to zvol + log_must dd if=$datafile1 of=$zvolpath conv=fsync + + # Record how much space we've used (should be 5MB, with 128k + # of tolerance). + before="$(get_prop refer $TESTPOOL/$TESTVOL)" + log_must within_tolerance $before 5242880 131072 + + # We currently have 5MB of random data on the zvol. + # Trim the first 1MB and also trim 2MB at offset 3MB. + log_must $trimcmd -l $((1 * 1048576)) $zvolpath + log_must $trimcmd -o $((3 * 1048576)) -l $((2 * 1048576)) $zvolpath + sync_pool + + # After trimming 3MB, the zvol should have 2MB of data (with 128k of + # tolerance). + after="$(get_prop refer $TESTPOOL/$TESTVOL)" + log_must within_tolerance $after 2097152 131072 + + # Make the same holes in our test data + log_must dd if=/dev/zero of="$datafile1" bs=1M count=1 conv=notrunc + log_must dd if=/dev/zero of="$datafile1" bs=1M count=2 seek=3 conv=notrunc + + # Extract data from our zvol + log_must dd if=$zvolpath of="$datafile2" bs=1M count=5 + + # Compare the data we expect with what's on our zvol. diff will return + # non-zero if they differ. 
+ log_must diff $datafile1 $datafile2 + + log_must rm $datafile1 $datafile2 +} + +log_assert "Verify that a ZFS volume can be TRIMed" +log_onexit cleanup + +log_must zfs set compression=off $TESTPOOL/$TESTVOL + +# Remove old data from previous tests +log_must $trimcmd $zvolpath + + +set_blk_mq 1 +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +do_test + +set_blk_mq 0 +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +do_test + +log_pass "ZFS volumes can be trimmed" diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh new file mode 100755 index 000000000..b81a37263 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh new file mode 100755 index 000000000..9e70fc47b --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_setup "$DISKS" diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh new file mode 100755 index 000000000..c1aadcac3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh @@ -0,0 +1,169 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2022 by Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/reservation/reservation.shlib +. $STF_SUITE/tests/functional/zvol/zvol_common.shlib + +# +# DESCRIPTION: +# Stress test multithreaded transfers to multiple zvols. Also verify +# zvol errors show up in zpool status. +# +# STRATEGY: +# +# For both the normal submit_bio() codepath and the blk-mq codepath, do +# the following: +# +# 1. Create one zvol per CPU +# 2. In parallel, spawn an fio "write and verify" for each zvol +# 3. Inject write errors +# 4. Write to one of the zvols with dd and verify the errors +# + +verify_runnable "global" + +num_zvols=$(get_num_cpus) + +# If we were making one big zvol from all the pool space, it would +# be this big: +biggest_zvol_size_possible=$(largest_volsize_from_pool $TESTPOOL) + +# Crude calculation: take the biggest zvol size we could possibly +# create, knock 10% off it (for overhead) and divide by the number +# of ZVOLs we want to make. 
+# +# Round the value using a printf +typeset -f each_zvol_size=$(( floor($biggest_zvol_size_possible * 0.9 / \ + $num_zvols ))) + +typeset tmpdir="$(mktemp -d zvol_stress_fio_state.XXXXXX)" + +function create_zvols +{ + log_note "Creating $num_zvols zvols that are ${each_zvol_size}B each" + for i in $(seq $num_zvols) ; do + log_must zfs create -V $each_zvol_size $TESTPOOL/testvol$i + block_device_wait "$ZVOL_DEVDIR/$TESTPOOL/testvol$i" + done +} + +function destroy_zvols +{ + for i in $(seq $num_zvols) ; do + log_must_busy zfs destroy $TESTPOOL/testvol$i + done +} + +function do_zvol_stress +{ + # Write 10% of each zvol, or 50MB, whichever is less + zvol_write_size=$((each_zvol_size / 10)) + if [ $zvol_write_size -gt $((50 * 1048576)) ] ; then + zvol_write_size=$((50 * 1048576)) + fi + zvol_write_size_mb=$(($zvol_write_size / 1048576)) + + if is_linux ; then + engine=libaio + else + engine=psync + fi + + # Spawn off one fio per zvol in parallel + pids="" + for i in $(seq $num_zvols) ; do + # Spawn one fio per zvol as its own process + fio --ioengine=$engine --name=zvol_stress$i --direct=0 \ + --filename="$ZVOL_DEVDIR/$TESTPOOL/testvol$i" --bs=1048576 \ + --iodepth=10 --readwrite=randwrite --size=${zvol_write_size} \ + --verify_async=2 --numjobs=1 --verify=sha1 \ + --verify_fatal=1 \ + --continue_on_error=none \ + --error_dump=1 \ + --exitall_on_error \ + --aux-path="$tmpdir" --do_verify=1 & + pids="$pids $!" + done + + # Wait for all the spawned fios to finish and look for errors + fail="" + i=0 + for pid in $pids ; do + log_note "$s waiting on $pid" + if ! wait $pid ; then + log_fail "fio error on $TESTPOOL/testvol$i" + fi + i=$(($i + 1)) + done +} + +function cleanup +{ + log_must zinject -c all + log_must zpool clear $TESTPOOL + destroy_zvols + set_blk_mq 0 + + # Remove all fio's leftover state files + if [ -n "$tmpdir" ] ; then + log_must rm -fd "$tmpdir"/*.state "$tmpdir" + fi +} + +log_onexit cleanup + +log_assert "Stress test zvols" + +set_blk_mq 0 +create_zvols +# Do some fio write/verifies in parallel +do_zvol_stress +destroy_zvols + +# Enable blk-mq (block multi-queue), and re-run the same test +set_blk_mq 1 +create_zvols +do_zvol_stress + +# Inject some errors, and verify we see some IO errors in zpool status +for DISK in $DISKS ; do + log_must zinject -d $DISK -f 10 -e io -T write $TESTPOOL +done +log_must dd if=/dev/zero of=$ZVOL_DEVDIR/$TESTPOOL/testvol1 bs=512 count=50 +log_must zinject -c all + +# We should see write errors +typeset -i write_errors=$(zpool status -p | awk ' + !NF { isvdev = 0 } + isvdev { errors += $4 } + /CKSUM$/ { isvdev = 1 } + END { print errors } +') + +if [ $write_errors -eq 0 ] ; then + log_fail "Expected to see some write errors" +else + log_note "Correctly saw $write_errors write errors" +fi +log_pass "Done with zvol_stress"
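Usage sketch (not part of the patch): the new knobs are exposed as ordinary module parameters, so on a kernel where HAVE_BLK_MQ is detected they can be exercised roughly as below. The pool name "tank" is a placeholder, the /sys path assumes the standard module-parameter layout implied by the module_param() declarations above, and the setting is only read when a zvol is loaded, which is why the tests export and re-import the pool after toggling it.

# Enable blk-mq for zvols, then reload the zvols by re-importing the pool
echo 1 > /sys/module/zfs/parameters/zvol_use_blk_mq
zpool export tank && zpool import tank

# Alternatively, set it at module load time together with the chunking knob
modprobe zfs zvol_use_blk_mq=1 zvol_blk_mq_blocks_per_thread=8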