fix #5014 reenable blk-mq optimization
While I think the huge performance optimization was at some point not really that huge in practice, the feature sounds like it would benefit our use-case (see https://github.com/openzfs/zfs/pull/13148). Currently the feature is disabled in 2.2.0 (see the second patch) because of the issues addressed by the first patch. Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com> Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
This commit is contained in:
parent
28de0abfa9
commit
7e3b7d81a1
99
debian/patches/0010-zvol-Remove-broken-blk-mq-optimization.patch
vendored
Normal file
99
debian/patches/0010-zvol-Remove-broken-blk-mq-optimization.patch
vendored
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Tony Hutter <hutter2@llnl.gov>
|
||||||
|
Date: Mon, 23 Oct 2023 14:45:06 -0700
|
||||||
|
Subject: [PATCH] zvol: Remove broken blk-mq optimization
|
||||||
|
|
||||||
|
This fix removes a dubious optimization in zfs_uiomove_bvec_rq()
|
||||||
|
that saved the iterator contents of a rq_for_each_segment(). This
|
||||||
|
optimization allowed restoring the "saved state" from a previous
|
||||||
|
rq_for_each_segment() call on the same uio so that you wouldn't
|
||||||
|
need to iterate through each bvec on every zfs_uiomove_bvec_rq() call.
|
||||||
|
However, if the kernel is manipulating the requests/bios/bvecs under
|
||||||
|
the covers between zfs_uiomove_bvec_rq() calls, then it could result
|
||||||
|
in corruption from using the "saved state". This optimization
|
||||||
|
results in an unbootable system after installing an OS on a zvol
|
||||||
|
with blk-mq enabled.
|
||||||
|
|
||||||
|
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||||
|
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
|
||||||
|
Closes #15351
|
||||||
|
(cherry picked from commit 7c9b6fed16ed5034fd1cdfdaedfad93dc97b1557)
|
||||||
|
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
|
||||||
|
---
|
||||||
|
include/os/linux/spl/sys/uio.h | 8 --------
|
||||||
|
module/os/linux/zfs/zfs_uio.c | 29 -----------------------------
|
||||||
|
2 files changed, 37 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h
|
||||||
|
index cce097e16..a4b600004 100644
|
||||||
|
--- a/include/os/linux/spl/sys/uio.h
|
||||||
|
+++ b/include/os/linux/spl/sys/uio.h
|
||||||
|
@@ -73,13 +73,6 @@ typedef struct zfs_uio {
|
||||||
|
size_t uio_skip;
|
||||||
|
|
||||||
|
struct request *rq;
|
||||||
|
-
|
||||||
|
- /*
|
||||||
|
- * Used for saving rq_for_each_segment() state between calls
|
||||||
|
- * to zfs_uiomove_bvec_rq().
|
||||||
|
- */
|
||||||
|
- struct req_iterator iter;
|
||||||
|
- struct bio_vec bv;
|
||||||
|
} zfs_uio_t;
|
||||||
|
|
||||||
|
|
||||||
|
@@ -138,7 +131,6 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq)
|
||||||
|
} else {
|
||||||
|
uio->uio_bvec = NULL;
|
||||||
|
uio->uio_iovcnt = 0;
|
||||||
|
- memset(&uio->iter, 0, sizeof (uio->iter));
|
||||||
|
}
|
||||||
|
|
||||||
|
uio->uio_loffset = io_offset(bio, rq);
|
||||||
|
diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c
|
||||||
|
index 3efd4ab15..c2ed67c43 100644
|
||||||
|
--- a/module/os/linux/zfs/zfs_uio.c
|
||||||
|
+++ b/module/os/linux/zfs/zfs_uio.c
|
||||||
|
@@ -204,22 +204,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
|
||||||
|
this_seg_start = orig_loffset;
|
||||||
|
|
||||||
|
rq_for_each_segment(bv, rq, iter) {
|
||||||
|
- if (uio->iter.bio) {
|
||||||
|
- /*
|
||||||
|
- * If uio->iter.bio is present, then we know we've saved
|
||||||
|
- * uio->iter from a previous call to this function, and
|
||||||
|
- * we can skip ahead in this rq_for_each_segment() loop
|
||||||
|
- * to where we last left off. That way, we don't need
|
||||||
|
- * to iterate over tons of segments we've already
|
||||||
|
- * processed - we can just restore the "saved state".
|
||||||
|
- */
|
||||||
|
- iter = uio->iter;
|
||||||
|
- bv = uio->bv;
|
||||||
|
- this_seg_start = uio->uio_loffset;
|
||||||
|
- memset(&uio->iter, 0, sizeof (uio->iter));
|
||||||
|
- continue;
|
||||||
|
- }
|
||||||
|
-
|
||||||
|
/*
|
||||||
|
* Lookup what the logical offset of the last byte of this
|
||||||
|
* segment is.
|
||||||
|
@@ -260,19 +244,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
|
||||||
|
copied = 1; /* We copied some data */
|
||||||
|
}
|
||||||
|
|
||||||
|
- if (n == 0) {
|
||||||
|
- /*
|
||||||
|
- * All done copying. Save our 'iter' value to the uio.
|
||||||
|
- * This allows us to "save our state" and skip ahead in
|
||||||
|
- * the rq_for_each_segment() loop the next time we call
|
||||||
|
- * call zfs_uiomove_bvec_rq() on this uio (which we
|
||||||
|
- * will be doing for any remaining data in the uio).
|
||||||
|
- */
|
||||||
|
- uio->iter = iter; /* make a copy of the struct data */
|
||||||
|
- uio->bv = bv;
|
||||||
|
- return (0);
|
||||||
|
- }
|
||||||
|
-
|
||||||
|
this_seg_start = this_seg_end + 1;
|
||||||
|
}
|
||||||
|
|
123
debian/patches/0011-Revert-zvol-Temporally-disable-blk-mq.patch
vendored
Normal file
123
debian/patches/0011-Revert-zvol-Temporally-disable-blk-mq.patch
vendored
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Tony Hutter <hutter2@llnl.gov>
|
||||||
|
Date: Mon, 23 Oct 2023 14:39:59 -0700
|
||||||
|
Subject: [PATCH] Revert "zvol: Temporally disable blk-mq"
|
||||||
|
|
||||||
|
This reverts commit aefb6a2bd6c24597cde655e9ce69edd0a4c34357.
|
||||||
|
|
||||||
|
aefb6a2bd temporarily disabled blk-mq until we could find a fix for #15351.
|
||||||
|
|
||||||
|
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
|
||||||
|
Closes #15439
|
||||||
|
(cherry picked from commit 05c4710e8958832afc2868102c9535a4f18115be)
|
||||||
|
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
|
||||||
|
---
|
||||||
|
man/man4/zfs.4 | 57 ++++++++++++++++++++++++++++
|
||||||
|
module/os/linux/zfs/zvol_os.c | 12 ++++++
|
||||||
|
tests/zfs-tests/include/tunables.cfg | 2 +-
|
||||||
|
3 files changed, 70 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
|
||||||
|
index 71a3e67ee..cfadd79d8 100644
|
||||||
|
--- a/man/man4/zfs.4
|
||||||
|
+++ b/man/man4/zfs.4
|
||||||
|
@@ -2317,6 +2317,63 @@ If
|
||||||
|
.Sy zvol_threads
|
||||||
|
to the number of CPUs present or 32 (whichever is greater).
|
||||||
|
.
|
||||||
|
+.It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint
|
||||||
|
+The number of threads per zvol to use for queuing IO requests.
|
||||||
|
+This parameter will only appear if your kernel supports
|
||||||
|
+.Li blk-mq
|
||||||
|
+and is only read and assigned to a zvol at zvol load time.
|
||||||
|
+If
|
||||||
|
+.Sy 0
|
||||||
|
+(the default) then internally set
|
||||||
|
+.Sy zvol_blk_mq_threads
|
||||||
|
+to the number of CPUs present.
|
||||||
|
+.
|
||||||
|
+.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint
|
||||||
|
+Set to
|
||||||
|
+.Sy 1
|
||||||
|
+to use the
|
||||||
|
+.Li blk-mq
|
||||||
|
+API for zvols.
|
||||||
|
+Set to
|
||||||
|
+.Sy 0
|
||||||
|
+(the default) to use the legacy zvol APIs.
|
||||||
|
+This setting can give better or worse zvol performance depending on
|
||||||
|
+the workload.
|
||||||
|
+This parameter will only appear if your kernel supports
|
||||||
|
+.Li blk-mq
|
||||||
|
+and is only read and assigned to a zvol at zvol load time.
|
||||||
|
+.
|
||||||
|
+.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint
|
||||||
|
+If
|
||||||
|
+.Sy zvol_use_blk_mq
|
||||||
|
+is enabled, then process this number of
|
||||||
|
+.Sy volblocksize Ns -sized blocks per zvol thread.
|
||||||
|
+This tunable can be used to favor better performance for zvol reads (lower
|
||||||
|
+values) or writes (higher values).
|
||||||
|
+If set to
|
||||||
|
+.Sy 0 ,
|
||||||
|
+then the zvol layer will process the maximum number of blocks
|
||||||
|
+per thread that it can.
|
||||||
|
+This parameter will only appear if your kernel supports
|
||||||
|
+.Li blk-mq
|
||||||
|
+and is only applied at each zvol's load time.
|
||||||
|
+.
|
||||||
|
+.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint
|
||||||
|
+The queue_depth value for the zvol
|
||||||
|
+.Li blk-mq
|
||||||
|
+interface.
|
||||||
|
+This parameter will only appear if your kernel supports
|
||||||
|
+.Li blk-mq
|
||||||
|
+and is only applied at each zvol's load time.
|
||||||
|
+If
|
||||||
|
+.Sy 0
|
||||||
|
+(the default) then use the kernel's default queue depth.
|
||||||
|
+Values are clamped to the kernel's
|
||||||
|
+.Dv BLKDEV_MIN_RQ
|
||||||
|
+and
|
||||||
|
+.Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ
|
||||||
|
+limits.
|
||||||
|
+.
|
||||||
|
.It Sy zvol_volmode Ns = Ns Sy 1 Pq uint
|
||||||
|
Defines zvol block devices behaviour when
|
||||||
|
.Sy volmode Ns = Ns Sy default :
|
||||||
|
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
|
||||||
|
index 76521c959..7a95b54bd 100644
|
||||||
|
--- a/module/os/linux/zfs/zvol_os.c
|
||||||
|
+++ b/module/os/linux/zfs/zvol_os.c
|
||||||
|
@@ -1620,6 +1620,18 @@ MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
|
||||||
|
module_param(zvol_volmode, uint, 0644);
|
||||||
|
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
|
||||||
|
|
||||||
|
+#ifdef HAVE_BLK_MQ
|
||||||
|
+module_param(zvol_blk_mq_queue_depth, uint, 0644);
|
||||||
|
+MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
|
||||||
|
+
|
||||||
|
+module_param(zvol_use_blk_mq, uint, 0644);
|
||||||
|
+MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
|
||||||
|
+
|
||||||
|
+module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
|
||||||
|
+MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
|
||||||
|
+ "Process volblocksize blocks per thread");
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
|
||||||
|
module_param(zvol_open_timeout_ms, uint, 0644);
|
||||||
|
MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
|
||||||
|
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
|
||||||
|
index 8010a9451..80e7bcb3b 100644
|
||||||
|
--- a/tests/zfs-tests/include/tunables.cfg
|
||||||
|
+++ b/tests/zfs-tests/include/tunables.cfg
|
||||||
|
@@ -89,7 +89,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip
|
||||||
|
VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev
|
||||||
|
VOL_MODE vol.mode zvol_volmode
|
||||||
|
VOL_RECURSIVE vol.recursive UNSUPPORTED
|
||||||
|
-VOL_USE_BLK_MQ UNSUPPORTED UNSUPPORTED
|
||||||
|
+VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
|
||||||
|
XATTR_COMPAT xattr_compat zfs_xattr_compat
|
||||||
|
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
|
||||||
|
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max
|
2
debian/patches/series
vendored
2
debian/patches/series
vendored
@ -7,3 +7,5 @@
|
|||||||
0007-Add-systemd-unit-for-importing-specific-pools.patch
|
0007-Add-systemd-unit-for-importing-specific-pools.patch
|
||||||
0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
|
0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
|
||||||
0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
|
0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
|
||||||
|
0010-zvol-Remove-broken-blk-mq-optimization.patch
|
||||||
|
0011-Revert-zvol-Temporally-disable-blk-mq.patch
|
||||||
|
Loading…
Reference in New Issue
Block a user