fix #5014: re-enable blk-mq optimization

While the performance gains of this optimization reportedly turned out
not to be that huge in practice, the feature still sounds like it would
benefit our use-case:
https://github.com/openzfs/zfs/pull/13148

Currently the feature is disabled in 2.2.0 (see the second patch)
because of the issues addressed by the first patch.

Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
Stoiko Ivanov, 2023-10-25 11:13:11 +02:00 (committed by Thomas Lamprecht)
commit 7e3b7d81a1 (parent 28de0abfa9)
3 changed files with 224 additions and 0 deletions
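
As a quick reference for how the feature is meant to be turned on once these
patches are in place: a minimal sketch, assuming the standard OpenZFS
module-parameter mechanism that the man-page hunk in the second patch
documents (the file name below is just an example):

    # /etc/modprobe.d/zfs.conf (example location for zfs module options)
    # Opt in to the blk-mq API for zvols; the default remains 0 (legacy codepath).
    options zfs zvol_use_blk_mq=1

Per the man-page text added below, the parameter is only read and assigned to
a zvol at zvol load time, so it takes effect for zvols loaded after the module
picks up the new option.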

debian/patches/0010-zvol-Remove-broken-blk-mq-optimization.patch (new file)

@@ -0,0 +1,99 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Mon, 23 Oct 2023 14:45:06 -0700
Subject: [PATCH] zvol: Remove broken blk-mq optimization

This fix removes a dubious optimization in zfs_uiomove_bvec_rq()
that saved the iterator contents of a rq_for_each_segment(). This
optimization allowed restoring the "saved state" from a previous
rq_for_each_segment() call on the same uio so that you wouldn't
need to iterate through each bvec on every zfs_uiomove_bvec_rq() call.
However, if the kernel is manipulating the requests/bios/bvecs under
the covers between zfs_uiomove_bvec_rq() calls, then it could result
in corruption from using the "saved state". This optimization
results in an unbootable system after installing an OS on a zvol
with blk-mq enabled.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #15351
(cherry picked from commit 7c9b6fed16ed5034fd1cdfdaedfad93dc97b1557)
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
include/os/linux/spl/sys/uio.h | 8 --------
module/os/linux/zfs/zfs_uio.c | 29 -----------------------------
2 files changed, 37 deletions(-)
diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h
index cce097e16..a4b600004 100644
--- a/include/os/linux/spl/sys/uio.h
+++ b/include/os/linux/spl/sys/uio.h
@@ -73,13 +73,6 @@ typedef struct zfs_uio {
size_t uio_skip;
struct request *rq;
-
- /*
- * Used for saving rq_for_each_segment() state between calls
- * to zfs_uiomove_bvec_rq().
- */
- struct req_iterator iter;
- struct bio_vec bv;
} zfs_uio_t;
@@ -138,7 +131,6 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq)
} else {
uio->uio_bvec = NULL;
uio->uio_iovcnt = 0;
- memset(&uio->iter, 0, sizeof (uio->iter));
}
uio->uio_loffset = io_offset(bio, rq);
diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c
index 3efd4ab15..c2ed67c43 100644
--- a/module/os/linux/zfs/zfs_uio.c
+++ b/module/os/linux/zfs/zfs_uio.c
@@ -204,22 +204,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
this_seg_start = orig_loffset;
rq_for_each_segment(bv, rq, iter) {
- if (uio->iter.bio) {
- /*
- * If uio->iter.bio is present, then we know we've saved
- * uio->iter from a previous call to this function, and
- * we can skip ahead in this rq_for_each_segment() loop
- * to where we last left off. That way, we don't need
- * to iterate over tons of segments we've already
- * processed - we can just restore the "saved state".
- */
- iter = uio->iter;
- bv = uio->bv;
- this_seg_start = uio->uio_loffset;
- memset(&uio->iter, 0, sizeof (uio->iter));
- continue;
- }
-
/*
* Lookup what the logical offset of the last byte of this
* segment is.
@@ -260,19 +244,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
copied = 1; /* We copied some data */
}
- if (n == 0) {
- /*
- * All done copying. Save our 'iter' value to the uio.
- * This allows us to "save our state" and skip ahead in
- * the rq_for_each_segment() loop the next time we call
- * call zfs_uiomove_bvec_rq() on this uio (which we
- * will be doing for any remaining data in the uio).
- */
- uio->iter = iter; /* make a copy of the struct data */
- uio->bv = bv;
- return (0);
- }
-
this_seg_start = this_seg_end + 1;
}

debian/patches/0011-Revert-zvol-Temporally-disable-blk-mq.patch (new file)

@@ -0,0 +1,123 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Mon, 23 Oct 2023 14:39:59 -0700
Subject: [PATCH] Revert "zvol: Temporally disable blk-mq"

This reverts commit aefb6a2bd6c24597cde655e9ce69edd0a4c34357.

aefb6a2bd temporally disabled blk-mq until the issue behind the
disablement could be fixed; that fix is the previous patch,
"zvol: Remove broken blk-mq optimization".

Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #15439
(cherry picked from commit 05c4710e8958832afc2868102c9535a4f18115be)
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
man/man4/zfs.4 | 57 ++++++++++++++++++++++++++++
module/os/linux/zfs/zvol_os.c | 12 ++++++
tests/zfs-tests/include/tunables.cfg | 2 +-
3 files changed, 70 insertions(+), 1 deletion(-)
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 71a3e67ee..cfadd79d8 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2317,6 +2317,63 @@ If
.Sy zvol_threads
to the number of CPUs present or 32 (whichever is greater).
.
+.It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint
+The number of threads per zvol to use for queuing IO requests.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only read and assigned to a zvol at zvol load time.
+If
+.Sy 0
+(the default) then internally set
+.Sy zvol_blk_mq_threads
+to the number of CPUs present.
+.
+.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint
+Set to
+.Sy 1
+to use the
+.Li blk-mq
+API for zvols.
+Set to
+.Sy 0
+(the default) to use the legacy zvol APIs.
+This setting can give better or worse zvol performance depending on
+the workload.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only read and assigned to a zvol at zvol load time.
+.
+.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint
+If
+.Sy zvol_use_blk_mq
+is enabled, then process this number of
+.Sy volblocksize Ns -sized blocks per zvol thread.
+This tunable can be used to favor better performance for zvol reads (lower
+values) or writes (higher values).
+If set to
+.Sy 0 ,
+then the zvol layer will process the maximum number of blocks
+per thread that it can.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only applied at each zvol's load time.
+.
+.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint
+The queue_depth value for the zvol
+.Li blk-mq
+interface.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only applied at each zvol's load time.
+If
+.Sy 0
+(the default) then use the kernel's default queue depth.
+Values are clamped to the kernel's
+.Dv BLKDEV_MIN_RQ
+and
+.Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ
+limits.
+.
.It Sy zvol_volmode Ns = Ns Sy 1 Pq uint
Defines zvol block devices behaviour when
.Sy volmode Ns = Ns Sy default :
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 76521c959..7a95b54bd 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1620,6 +1620,18 @@ MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
+#ifdef HAVE_BLK_MQ
+module_param(zvol_blk_mq_queue_depth, uint, 0644);
+MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
+
+module_param(zvol_use_blk_mq, uint, 0644);
+MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
+
+module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
+MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
+ "Process volblocksize blocks per thread");
+#endif
+
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
module_param(zvol_open_timeout_ms, uint, 0644);
MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
index 8010a9451..80e7bcb3b 100644
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -89,7 +89,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip
VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev
VOL_MODE vol.mode zvol_volmode
VOL_RECURSIVE vol.recursive UNSUPPORTED
-VOL_USE_BLK_MQ UNSUPPORTED UNSUPPORTED
+VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
XATTR_COMPAT xattr_compat zfs_xattr_compat
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max
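
The tunables above are plain module parameters (module_param(..., 0644)), so
they can also be inspected or changed at runtime; a small sketch, assuming the
usual /sys/module/zfs/parameters/ location for OpenZFS module parameters and
keeping in mind that, per the man page, the values are only applied at each
zvol's load time:

    # Check whether blk-mq will be used for newly loaded zvols
    cat /sys/module/zfs/parameters/zvol_use_blk_mq

    # Enable blk-mq and process more blocks per thread
    # (higher values favor writes, lower values favor reads, per the man page)
    echo 1  > /sys/module/zfs/parameters/zvol_use_blk_mq
    echo 16 > /sys/module/zfs/parameters/zvol_blk_mq_blocks_per_thread

As a rough worked example of zvol_blk_mq_blocks_per_thread: with a 16 KiB
volblocksize and the default of 8 blocks per thread, each zvol thread would
handle about 16 KiB * 8 = 128 KiB of a request before work moves to the next
thread (an illustration of the documented meaning, not a measured number).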

debian/patches/series

@@ -7,3 +7,5 @@
0007-Add-systemd-unit-for-importing-specific-pools.patch
0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
0010-zvol-Remove-broken-blk-mq-optimization.patch
0011-Revert-zvol-Temporally-disable-blk-mq.patch