update zfs submodule to 2.2.4 and refresh patches
Mostly: drop all patches we had queued up to get kernel 6.8 supported.

Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
Tested-by: Max Carrara <m.carrara@proxmox.com>
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
This commit is contained in:
parent
3968b96ed4
commit
76119aa32b
@ -18,7 +18,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
|
|||||||
---
|
---
|
||||||
etc/Makefile.am | 1 +
|
etc/Makefile.am | 1 +
|
||||||
etc/systemd/system/50-zfs.preset | 1 +
|
etc/systemd/system/50-zfs.preset | 1 +
|
||||||
etc/systemd/system/zfs-import@.service.in | 18 ++++++++++++++++
|
etc/systemd/system/zfs-import@.service.in | 18 ++++++++++++++++++
|
||||||
3 files changed, 20 insertions(+)
|
3 files changed, 20 insertions(+)
|
||||||
create mode 100644 etc/systemd/system/zfs-import@.service.in
|
create mode 100644 etc/systemd/system/zfs-import@.service.in
|
||||||
|
|
||||||
@ -48,7 +48,7 @@ index e4056a92c..030611419 100644
|
|||||||
enable zfs-share.service
|
enable zfs-share.service
|
||||||
diff --git a/etc/systemd/system/zfs-import@.service.in b/etc/systemd/system/zfs-import@.service.in
|
diff --git a/etc/systemd/system/zfs-import@.service.in b/etc/systemd/system/zfs-import@.service.in
|
||||||
new file mode 100644
|
new file mode 100644
|
||||||
index 000000000..9b4ee9371
|
index 000000000..5bd19fb79
|
||||||
--- /dev/null
|
--- /dev/null
|
||||||
+++ b/etc/systemd/system/zfs-import@.service.in
|
+++ b/etc/systemd/system/zfs-import@.service.in
|
||||||
@@ -0,0 +1,18 @@
|
@@ -0,0 +1,18 @@
|
||||||
|
@ -15,7 +15,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
|
|||||||
rename man/{man1/arcstat.1 => man8/arcstat.8} (99%)
|
rename man/{man1/arcstat.1 => man8/arcstat.8} (99%)
|
||||||
|
|
||||||
diff --git a/man/Makefile.am b/man/Makefile.am
|
diff --git a/man/Makefile.am b/man/Makefile.am
|
||||||
index 45156571e..3713e9371 100644
|
index 43bb014dd..a9293468a 100644
|
||||||
--- a/man/Makefile.am
|
--- a/man/Makefile.am
|
||||||
+++ b/man/Makefile.am
|
+++ b/man/Makefile.am
|
||||||
@@ -2,7 +2,6 @@ dist_noinst_man_MANS = \
|
@@ -2,7 +2,6 @@ dist_noinst_man_MANS = \
|
||||||
|
@ -27,7 +27,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
|
|||||||
2 files changed, 21 insertions(+), 21 deletions(-)
|
2 files changed, 21 insertions(+), 21 deletions(-)
|
||||||
|
|
||||||
diff --git a/cmd/arc_summary b/cmd/arc_summary
|
diff --git a/cmd/arc_summary b/cmd/arc_summary
|
||||||
index 9c69ec4f8..edf94ea2a 100755
|
index 100fb1987..86b2260a1 100755
|
||||||
--- a/cmd/arc_summary
|
--- a/cmd/arc_summary
|
||||||
+++ b/cmd/arc_summary
|
+++ b/cmd/arc_summary
|
||||||
@@ -655,13 +655,13 @@ def section_arc(kstats_dict):
|
@@ -655,13 +655,13 @@ def section_arc(kstats_dict):
|
||||||
@ -48,7 +48,7 @@ index 9c69ec4f8..edf94ea2a 100755
|
|||||||
prt_i1('L2 ineligible evictions:',
|
prt_i1('L2 ineligible evictions:',
|
||||||
f_bytes(arc_stats['evict_l2_ineligible']))
|
f_bytes(arc_stats['evict_l2_ineligible']))
|
||||||
print()
|
print()
|
||||||
@@ -851,20 +851,20 @@ def section_l2arc(kstats_dict):
|
@@ -860,20 +860,20 @@ def section_l2arc(kstats_dict):
|
||||||
f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']),
|
f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']),
|
||||||
f_bytes(arc_stats['l2_hdr_size']))
|
f_bytes(arc_stats['l2_hdr_size']))
|
||||||
prt_i2('MFU allocated size:',
|
prt_i2('MFU allocated size:',
|
||||||
@ -80,10 +80,10 @@ index 9c69ec4f8..edf94ea2a 100755
|
|||||||
print()
|
print()
|
||||||
prt_1('L2ARC breakdown:', f_hits(l2_access_total))
|
prt_1('L2ARC breakdown:', f_hits(l2_access_total))
|
||||||
diff --git a/cmd/arcstat.in b/cmd/arcstat.in
|
diff --git a/cmd/arcstat.in b/cmd/arcstat.in
|
||||||
index 8df1c62f7..833348d0e 100755
|
index c4f10a1d6..c570dca88 100755
|
||||||
--- a/cmd/arcstat.in
|
--- a/cmd/arcstat.in
|
||||||
+++ b/cmd/arcstat.in
|
+++ b/cmd/arcstat.in
|
||||||
@@ -565,8 +565,8 @@ def calculate():
|
@@ -597,8 +597,8 @@ def calculate():
|
||||||
v["el2skip"] = d["evict_l2_skip"] // sint
|
v["el2skip"] = d["evict_l2_skip"] // sint
|
||||||
v["el2cach"] = d["evict_l2_cached"] // sint
|
v["el2cach"] = d["evict_l2_cached"] // sint
|
||||||
v["el2el"] = d["evict_l2_eligible"] // sint
|
v["el2el"] = d["evict_l2_eligible"] // sint
|
||||||
@ -93,8 +93,8 @@ index 8df1c62f7..833348d0e 100755
|
|||||||
+ v["el2mru"] = d.get("evict_l2_eligible_mru", 0) // sint
|
+ v["el2mru"] = d.get("evict_l2_eligible_mru", 0) // sint
|
||||||
v["el2inel"] = d["evict_l2_ineligible"] // sint
|
v["el2inel"] = d["evict_l2_ineligible"] // sint
|
||||||
v["mtxmis"] = d["mutex_miss"] // sint
|
v["mtxmis"] = d["mutex_miss"] // sint
|
||||||
|
v["ztotal"] = (d["zfetch_hits"] + d["zfetch_future"] + d["zfetch_stride"] +
|
||||||
@@ -581,11 +581,11 @@ def calculate():
|
@@ -624,11 +624,11 @@ def calculate():
|
||||||
v["l2size"] = cur["l2_size"]
|
v["l2size"] = cur["l2_size"]
|
||||||
v["l2bytes"] = d["l2_read_bytes"] // sint
|
v["l2bytes"] = d["l2_read_bytes"] // sint
|
||||||
|
|
||||||
|
@ -51,10 +51,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
|
|||||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
|
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
|
||||||
index 69bf9649a..fd42ce7c1 100644
|
index ed0b8d7a1..f3acc49d0 100644
|
||||||
--- a/cmd/zpool/zpool_main.c
|
--- a/cmd/zpool/zpool_main.c
|
||||||
+++ b/cmd/zpool/zpool_main.c
|
+++ b/cmd/zpool/zpool_main.c
|
||||||
@@ -2616,7 +2616,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
@@ -2663,7 +2663,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
||||||
|
|
||||||
if (vs->vs_scan_removing != 0) {
|
if (vs->vs_scan_removing != 0) {
|
||||||
(void) printf(gettext(" (removing)"));
|
(void) printf(gettext(" (removing)"));
|
||||||
|
@ -1,52 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
|
|
||||||
Date: Wed, 6 Mar 2024 10:39:06 +0100
|
|
||||||
Subject: [PATCH] udev: correctly handle partition #16 and later
|
|
||||||
MIME-Version: 1.0
|
|
||||||
Content-Type: text/plain; charset=UTF-8
|
|
||||||
Content-Transfer-Encoding: 8bit
|
|
||||||
|
|
||||||
If a zvol has more than 15 partitions, the minor device number exhausts
|
|
||||||
the slot count reserved for partitions next to the zvol itself. As a
|
|
||||||
result, the minor number cannot be used to determine the partition
|
|
||||||
number for the higher partition, and doing so results in wrong named
|
|
||||||
symlinks being generated by udev.
|
|
||||||
|
|
||||||
Since the partition number is encoded in the block device name anyway,
|
|
||||||
let's just extract it from there instead.
|
|
||||||
|
|
||||||
Fixes: #15904
|
|
||||||
|
|
||||||
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
|
|
||||||
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
|
|
||||||
---
|
|
||||||
udev/zvol_id.c | 9 +++++----
|
|
||||||
1 file changed, 5 insertions(+), 4 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/udev/zvol_id.c b/udev/zvol_id.c
|
|
||||||
index 5960b9787..609349594 100644
|
|
||||||
--- a/udev/zvol_id.c
|
|
||||||
+++ b/udev/zvol_id.c
|
|
||||||
@@ -51,7 +51,7 @@ const char *__asan_default_options(void) {
|
|
||||||
int
|
|
||||||
main(int argc, const char *const *argv)
|
|
||||||
{
|
|
||||||
- if (argc != 2) {
|
|
||||||
+ if (argc != 2 || strncmp(argv[1], "/dev/zd", 7) != 0) {
|
|
||||||
fprintf(stderr, "usage: %s /dev/zdX\n", argv[0]);
|
|
||||||
return (1);
|
|
||||||
}
|
|
||||||
@@ -72,9 +72,10 @@ main(int argc, const char *const *argv)
|
|
||||||
return (1);
|
|
||||||
}
|
|
||||||
|
|
||||||
- unsigned int dev_part = minor(sb.st_rdev) % ZVOL_MINORS;
|
|
||||||
- if (dev_part != 0)
|
|
||||||
- sprintf(zvol_name + strlen(zvol_name), "-part%u", dev_part);
|
|
||||||
+ const char *dev_part = strrchr(dev_name, 'p');
|
|
||||||
+ if (dev_part != NULL) {
|
|
||||||
+ sprintf(zvol_name + strlen(zvol_name), "-part%s", dev_part + 1);
|
|
||||||
+ }
|
|
||||||
|
|
||||||
for (size_t i = 0; i < strlen(zvol_name); ++i)
|
|
||||||
if (isblank(zvol_name[i]))
|
|
@ -1,135 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Rob N <robn@despairlabs.com>
|
|
||||||
Date: Thu, 21 Mar 2024 10:46:15 +1100
|
|
||||||
Subject: [PATCH] Linux 6.8 compat: use splice_copy_file_range() for fallback
|
|
||||||
|
|
||||||
Linux 6.8 removes generic_copy_file_range(), which had been reduced to a
|
|
||||||
simple wrapper around splice_copy_file_range(). Detect that function
|
|
||||||
directly and use it if generic_ is not available.
|
|
||||||
|
|
||||||
Sponsored-by: https://despairlabs.com/sponsor/
|
|
||||||
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
|
|
||||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
||||||
Signed-off-by: Rob Norris <robn@despairlabs.com>
|
|
||||||
Closes #15930
|
|
||||||
Closes #15931
|
|
||||||
(cherry picked from commit ef08a4d4065d21414d7fedccac20da6bfda4dfd0)
|
|
||||||
---
|
|
||||||
config/kernel-vfs-file_range.m4 | 27 +++++++++++++++++++++++++++
|
|
||||||
config/kernel.m4 | 2 ++
|
|
||||||
module/os/linux/zfs/zpl_file_range.c | 16 ++++++++++++++--
|
|
||||||
3 files changed, 43 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4
|
|
||||||
index cc96404d8..8a5cbe2ee 100644
|
|
||||||
--- a/config/kernel-vfs-file_range.m4
|
|
||||||
+++ b/config/kernel-vfs-file_range.m4
|
|
||||||
@@ -16,6 +16,9 @@ dnl #
|
|
||||||
dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
|
|
||||||
dnl # generic_copy_file_range() added to support it
|
|
||||||
dnl #
|
|
||||||
+dnl # 6.8: generic_copy_file_range() removed, replaced by
|
|
||||||
+dnl # splice_copy_file_range()
|
|
||||||
+dnl #
|
|
||||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
|
|
||||||
ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
|
|
||||||
#include <linux/fs.h>
|
|
||||||
@@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
|
|
||||||
])
|
|
||||||
])
|
|
||||||
|
|
||||||
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [
|
|
||||||
+ ZFS_LINUX_TEST_SRC([splice_copy_file_range], [
|
|
||||||
+ #include <linux/splice.h>
|
|
||||||
+ ], [
|
|
||||||
+ struct file *src_file __attribute__ ((unused)) = NULL;
|
|
||||||
+ loff_t src_off __attribute__ ((unused)) = 0;
|
|
||||||
+ struct file *dst_file __attribute__ ((unused)) = NULL;
|
|
||||||
+ loff_t dst_off __attribute__ ((unused)) = 0;
|
|
||||||
+ size_t len __attribute__ ((unused)) = 0;
|
|
||||||
+ splice_copy_file_range(src_file, src_off, dst_file, dst_off,
|
|
||||||
+ len);
|
|
||||||
+ ])
|
|
||||||
+])
|
|
||||||
+AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [
|
|
||||||
+ AC_MSG_CHECKING([whether splice_copy_file_range() is available])
|
|
||||||
+ ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [
|
|
||||||
+ AC_MSG_RESULT(yes)
|
|
||||||
+ AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1,
|
|
||||||
+ [splice_copy_file_range() is available])
|
|
||||||
+ ],[
|
|
||||||
+ AC_MSG_RESULT(no)
|
|
||||||
+ ])
|
|
||||||
+])
|
|
||||||
+
|
|
||||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
|
|
||||||
ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
|
|
||||||
#include <linux/fs.h>
|
|
||||||
diff --git a/config/kernel.m4 b/config/kernel.m4
|
|
||||||
index e3f864577..1d0c5a27f 100644
|
|
||||||
--- a/config/kernel.m4
|
|
||||||
+++ b/config/kernel.m4
|
|
||||||
@@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
|
||||||
ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
|
|
||||||
ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
|
|
||||||
ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
|
|
||||||
+ ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE
|
|
||||||
ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
|
|
||||||
ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
|
|
||||||
ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
|
|
||||||
@@ -266,6 +267,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
|
||||||
ZFS_AC_KERNEL_VFS_IOV_ITER
|
|
||||||
ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
|
|
||||||
ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
|
|
||||||
+ ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE
|
|
||||||
ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
|
|
||||||
ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
|
|
||||||
ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
|
|
||||||
diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c
|
|
||||||
index 3065d54fa..64728fdb1 100644
|
|
||||||
--- a/module/os/linux/zfs/zpl_file_range.c
|
|
||||||
+++ b/module/os/linux/zfs/zpl_file_range.c
|
|
||||||
@@ -26,6 +26,9 @@
|
|
||||||
#include <linux/compat.h>
|
|
||||||
#endif
|
|
||||||
#include <linux/fs.h>
|
|
||||||
+#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE
|
|
||||||
+#include <linux/splice.h>
|
|
||||||
+#endif
|
|
||||||
#include <sys/file.h>
|
|
||||||
#include <sys/zfs_znode.h>
|
|
||||||
#include <sys/zfs_vnops.h>
|
|
||||||
@@ -102,7 +105,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
|
||||||
ret = zpl_clone_file_range_impl(src_file, src_off,
|
|
||||||
dst_file, dst_off, len);
|
|
||||||
|
|
||||||
-#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
|
|
||||||
+#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
|
|
||||||
/*
|
|
||||||
* Since Linux 5.3 the filesystem driver is responsible for executing
|
|
||||||
* an appropriate fallback, and a generic fallback function is provided.
|
|
||||||
@@ -111,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
|
||||||
ret == -EAGAIN)
|
|
||||||
ret = generic_copy_file_range(src_file, src_off, dst_file,
|
|
||||||
dst_off, len, flags);
|
|
||||||
+#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
|
|
||||||
+ /*
|
|
||||||
+ * Since 6.8 the fallback function is called splice_copy_file_range
|
|
||||||
+ * and has a slightly different signature.
|
|
||||||
+ */
|
|
||||||
+ if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
|
|
||||||
+ ret == -EAGAIN)
|
|
||||||
+ ret = splice_copy_file_range(src_file, src_off, dst_file,
|
|
||||||
+ dst_off, len);
|
|
||||||
#else
|
|
||||||
/*
|
|
||||||
* Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
|
|
||||||
@@ -118,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
|
||||||
*/
|
|
||||||
if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
|
|
||||||
ret = -EOPNOTSUPP;
|
|
||||||
-#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
|
|
||||||
+#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */
|
|
||||||
|
|
||||||
return (ret);
|
|
||||||
}
|
|
121
debian/patches/0014-linux-5.4-compat-page_size.patch
vendored
121
debian/patches/0014-linux-5.4-compat-page_size.patch
vendored
@ -1,121 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Date: Mon, 13 Nov 2023 17:55:29 +1100
|
|
||||||
Subject: [PATCH] linux 5.4 compat: page_size()
|
|
||||||
|
|
||||||
Before 5.4 we have to do a little math.
|
|
||||||
|
|
||||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
|
||||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
||||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Sponsored-by: Klara, Inc.
|
|
||||||
Sponsored-by: Wasabi Technology, Inc.
|
|
||||||
Closes #15533
|
|
||||||
Closes #15588
|
|
||||||
(cherry picked from commit df04efe321a49c650f1fbaa6fd701fa2928cbe21)
|
|
||||||
---
|
|
||||||
config/kernel-mm-page-size.m4 | 17 +++++++++++
|
|
||||||
config/kernel.m4 | 2 ++
|
|
||||||
include/os/linux/Makefile.am | 1 +
|
|
||||||
include/os/linux/kernel/linux/mm_compat.h | 36 +++++++++++++++++++++++
|
|
||||||
4 files changed, 56 insertions(+)
|
|
||||||
create mode 100644 config/kernel-mm-page-size.m4
|
|
||||||
create mode 100644 include/os/linux/kernel/linux/mm_compat.h
|
|
||||||
|
|
||||||
diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4
|
|
||||||
new file mode 100644
|
|
||||||
index 000000000..d5ebd9269
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/config/kernel-mm-page-size.m4
|
|
||||||
@@ -0,0 +1,17 @@
|
|
||||||
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
|
|
||||||
+ ZFS_LINUX_TEST_SRC([page_size], [
|
|
||||||
+ #include <linux/mm.h>
|
|
||||||
+ ],[
|
|
||||||
+ unsigned long s;
|
|
||||||
+ s = page_size(NULL);
|
|
||||||
+ ])
|
|
||||||
+])
|
|
||||||
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
|
|
||||||
+ AC_MSG_CHECKING([whether page_size() is available])
|
|
||||||
+ ZFS_LINUX_TEST_RESULT([page_size], [
|
|
||||||
+ AC_MSG_RESULT(yes)
|
|
||||||
+ AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
|
|
||||||
+ ],[
|
|
||||||
+ AC_MSG_RESULT(no)
|
|
||||||
+ ])
|
|
||||||
+])
|
|
||||||
diff --git a/config/kernel.m4 b/config/kernel.m4
|
|
||||||
index 1d0c5a27f..548905ccd 100644
|
|
||||||
--- a/config/kernel.m4
|
|
||||||
+++ b/config/kernel.m4
|
|
||||||
@@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
|
||||||
ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
|
|
||||||
ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
|
|
||||||
ZFS_AC_KERNEL_SRC_SYNC_BDEV
|
|
||||||
+ ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
|
|
||||||
case "$host_cpu" in
|
|
||||||
powerpc*)
|
|
||||||
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
|
|
||||||
@@ -316,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
|
||||||
ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
|
|
||||||
ZFS_AC_KERNEL_COPY_SPLICE_READ
|
|
||||||
ZFS_AC_KERNEL_SYNC_BDEV
|
|
||||||
+ ZFS_AC_KERNEL_MM_PAGE_SIZE
|
|
||||||
case "$host_cpu" in
|
|
||||||
powerpc*)
|
|
||||||
ZFS_AC_KERNEL_CPU_HAS_FEATURE
|
|
||||||
diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am
|
|
||||||
index 3830d198d..51c27132b 100644
|
|
||||||
--- a/include/os/linux/Makefile.am
|
|
||||||
+++ b/include/os/linux/Makefile.am
|
|
||||||
@@ -5,6 +5,7 @@ kernel_linux_HEADERS = \
|
|
||||||
%D%/kernel/linux/compiler_compat.h \
|
|
||||||
%D%/kernel/linux/dcache_compat.h \
|
|
||||||
%D%/kernel/linux/kmap_compat.h \
|
|
||||||
+ %D%/kernel/linux/mm_compat.h \
|
|
||||||
%D%/kernel/linux/mod_compat.h \
|
|
||||||
%D%/kernel/linux/page_compat.h \
|
|
||||||
%D%/kernel/linux/percpu_compat.h \
|
|
||||||
diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h
|
|
||||||
new file mode 100644
|
|
||||||
index 000000000..40056c68d
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/include/os/linux/kernel/linux/mm_compat.h
|
|
||||||
@@ -0,0 +1,36 @@
|
|
||||||
+/*
|
|
||||||
+ * CDDL HEADER START
|
|
||||||
+ *
|
|
||||||
+ * The contents of this file are subject to the terms of the
|
|
||||||
+ * Common Development and Distribution License (the "License").
|
|
||||||
+ * You may not use this file except in compliance with the License.
|
|
||||||
+ *
|
|
||||||
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
||||||
+ * or https://opensource.org/licenses/CDDL-1.0.
|
|
||||||
+ * See the License for the specific language governing permissions
|
|
||||||
+ * and limitations under the License.
|
|
||||||
+ *
|
|
||||||
+ * When distributing Covered Code, include this CDDL HEADER in each
|
|
||||||
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
||||||
+ * If applicable, add the following below this CDDL HEADER, with the
|
|
||||||
+ * fields enclosed by brackets "[]" replaced with your own identifying
|
|
||||||
+ * information: Portions Copyright [yyyy] [name of copyright owner]
|
|
||||||
+ *
|
|
||||||
+ * CDDL HEADER END
|
|
||||||
+ */
|
|
||||||
+
|
|
||||||
+/*
|
|
||||||
+ * Copyright (c) 2023, 2024, Klara Inc.
|
|
||||||
+ */
|
|
||||||
+
|
|
||||||
+#ifndef _ZFS_MM_COMPAT_H
|
|
||||||
+#define _ZFS_MM_COMPAT_H
|
|
||||||
+
|
|
||||||
+#include <linux/mm.h>
|
|
||||||
+
|
|
||||||
+/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
|
|
||||||
+#ifndef HAVE_MM_PAGE_SIZE
|
|
||||||
+#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#endif /* _ZFS_MM_COMPAT_H */
|
|
334
debian/patches/0015-abd-add-page-iterator.patch
vendored
334
debian/patches/0015-abd-add-page-iterator.patch
vendored
@ -1,334 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Date: Mon, 11 Dec 2023 16:05:54 +1100
|
|
||||||
Subject: [PATCH] abd: add page iterator
|
|
||||||
|
|
||||||
The regular ABD iterators yield data buffers, so they have to map and
|
|
||||||
unmap pages into kernel memory. If the caller only wants to count
|
|
||||||
chunks, or can use page pointers directly, then the map/unmap is just
|
|
||||||
unnecessary overhead.
|
|
||||||
|
|
||||||
This adds abd_iterate_page_func, which yields unmapped struct page
|
|
||||||
instead.
|
|
||||||
|
|
||||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
|
||||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
||||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Sponsored-by: Klara, Inc.
|
|
||||||
Sponsored-by: Wasabi Technology, Inc.
|
|
||||||
Closes #15533
|
|
||||||
Closes #15588
|
|
||||||
(cherry picked from commit 390b448726c580999dd337be7a40b0e95cf1d50b)
|
|
||||||
---
|
|
||||||
include/sys/abd.h | 7 +++
|
|
||||||
include/sys/abd_impl.h | 26 ++++++++-
|
|
||||||
module/os/freebsd/zfs/abd_os.c | 4 +-
|
|
||||||
module/os/linux/zfs/abd_os.c | 104 ++++++++++++++++++++++++++++++---
|
|
||||||
module/zfs/abd.c | 42 +++++++++++++
|
|
||||||
5 files changed, 169 insertions(+), 14 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/include/sys/abd.h b/include/sys/abd.h
|
|
||||||
index 750f9986c..8a2df0bca 100644
|
|
||||||
--- a/include/sys/abd.h
|
|
||||||
+++ b/include/sys/abd.h
|
|
||||||
@@ -79,6 +79,9 @@ typedef struct abd {
|
|
||||||
|
|
||||||
typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
|
|
||||||
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
|
|
||||||
+#if defined(__linux__) && defined(_KERNEL)
|
|
||||||
+typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
|
|
||||||
+#endif
|
|
||||||
|
|
||||||
extern int zfs_abd_scatter_enabled;
|
|
||||||
|
|
||||||
@@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
|
|
||||||
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
|
|
||||||
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
|
|
||||||
abd_iter_func2_t *, void *);
|
|
||||||
+#if defined(__linux__) && defined(_KERNEL)
|
|
||||||
+int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
|
|
||||||
+ void *);
|
|
||||||
+#endif
|
|
||||||
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
|
|
||||||
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
|
|
||||||
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
|
|
||||||
diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h
|
|
||||||
index 40546d4af..f88ea25e2 100644
|
|
||||||
--- a/include/sys/abd_impl.h
|
|
||||||
+++ b/include/sys/abd_impl.h
|
|
||||||
@@ -21,6 +21,7 @@
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
|
|
||||||
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
|
|
||||||
+ * Copyright (c) 2023, 2024, Klara Inc.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef _ABD_IMPL_H
|
|
||||||
@@ -38,12 +39,30 @@ typedef enum abd_stats_op {
|
|
||||||
ABDSTAT_DECR /* Decrease abdstat values */
|
|
||||||
} abd_stats_op_t;
|
|
||||||
|
|
||||||
-struct scatterlist; /* forward declaration */
|
|
||||||
+/* forward declarations */
|
|
||||||
+struct scatterlist;
|
|
||||||
+struct page;
|
|
||||||
|
|
||||||
struct abd_iter {
|
|
||||||
/* public interface */
|
|
||||||
- void *iter_mapaddr; /* addr corresponding to iter_pos */
|
|
||||||
- size_t iter_mapsize; /* length of data valid at mapaddr */
|
|
||||||
+ union {
|
|
||||||
+ /* for abd_iter_map()/abd_iter_unmap() */
|
|
||||||
+ struct {
|
|
||||||
+ /* addr corresponding to iter_pos */
|
|
||||||
+ void *iter_mapaddr;
|
|
||||||
+ /* length of data valid at mapaddr */
|
|
||||||
+ size_t iter_mapsize;
|
|
||||||
+ };
|
|
||||||
+ /* for abd_iter_page() */
|
|
||||||
+ struct {
|
|
||||||
+ /* current page */
|
|
||||||
+ struct page *iter_page;
|
|
||||||
+ /* offset of data in page */
|
|
||||||
+ size_t iter_page_doff;
|
|
||||||
+ /* size of data in page */
|
|
||||||
+ size_t iter_page_dsize;
|
|
||||||
+ };
|
|
||||||
+ };
|
|
||||||
|
|
||||||
/* private */
|
|
||||||
abd_t *iter_abd; /* ABD being iterated through */
|
|
||||||
@@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
|
|
||||||
void abd_iter_advance(struct abd_iter *, size_t);
|
|
||||||
void abd_iter_map(struct abd_iter *);
|
|
||||||
void abd_iter_unmap(struct abd_iter *);
|
|
||||||
+void abd_iter_page(struct abd_iter *);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Helper macros
|
|
||||||
diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
|
|
||||||
index 58a37df62..3b812271f 100644
|
|
||||||
--- a/module/os/freebsd/zfs/abd_os.c
|
|
||||||
+++ b/module/os/freebsd/zfs/abd_os.c
|
|
||||||
@@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
|
||||||
{
|
|
||||||
ASSERT(!abd_is_gang(abd));
|
|
||||||
abd_verify(abd);
|
|
||||||
+ memset(aiter, 0, sizeof (struct abd_iter));
|
|
||||||
aiter->iter_abd = abd;
|
|
||||||
- aiter->iter_pos = 0;
|
|
||||||
- aiter->iter_mapaddr = NULL;
|
|
||||||
- aiter->iter_mapsize = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
|
|
||||||
index 24390fbbf..dae128012 100644
|
|
||||||
--- a/module/os/linux/zfs/abd_os.c
|
|
||||||
+++ b/module/os/linux/zfs/abd_os.c
|
|
||||||
@@ -21,6 +21,7 @@
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
|
|
||||||
* Copyright (c) 2019 by Delphix. All rights reserved.
|
|
||||||
+ * Copyright (c) 2023, 2024, Klara Inc.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
@@ -59,6 +60,7 @@
|
|
||||||
#include <sys/zfs_znode.h>
|
|
||||||
#ifdef _KERNEL
|
|
||||||
#include <linux/kmap_compat.h>
|
|
||||||
+#include <linux/mm_compat.h>
|
|
||||||
#include <linux/scatterlist.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
@@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
|
||||||
{
|
|
||||||
ASSERT(!abd_is_gang(abd));
|
|
||||||
abd_verify(abd);
|
|
||||||
+ memset(aiter, 0, sizeof (struct abd_iter));
|
|
||||||
aiter->iter_abd = abd;
|
|
||||||
- aiter->iter_mapaddr = NULL;
|
|
||||||
- aiter->iter_mapsize = 0;
|
|
||||||
- aiter->iter_pos = 0;
|
|
||||||
- if (abd_is_linear(abd)) {
|
|
||||||
- aiter->iter_offset = 0;
|
|
||||||
- aiter->iter_sg = NULL;
|
|
||||||
- } else {
|
|
||||||
+ if (!abd_is_linear(abd)) {
|
|
||||||
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
|
|
||||||
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
|
|
||||||
}
|
|
||||||
@@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
|
||||||
boolean_t
|
|
||||||
abd_iter_at_end(struct abd_iter *aiter)
|
|
||||||
{
|
|
||||||
+ ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
|
|
||||||
return (aiter->iter_pos == aiter->iter_abd->abd_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter)
|
|
||||||
void
|
|
||||||
abd_iter_advance(struct abd_iter *aiter, size_t amount)
|
|
||||||
{
|
|
||||||
+ /*
|
|
||||||
+ * Ensure that last chunk is not in use. abd_iterate_*() must clear
|
|
||||||
+ * this state (directly or abd_iter_unmap()) before advancing.
|
|
||||||
+ */
|
|
||||||
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
|
|
||||||
ASSERT0(aiter->iter_mapsize);
|
|
||||||
+ ASSERT3P(aiter->iter_page, ==, NULL);
|
|
||||||
+ ASSERT0(aiter->iter_page_doff);
|
|
||||||
+ ASSERT0(aiter->iter_page_dsize);
|
|
||||||
|
|
||||||
/* There's nothing left to advance to, so do nothing */
|
|
||||||
if (abd_iter_at_end(aiter))
|
|
||||||
@@ -1009,6 +1014,88 @@ abd_cache_reap_now(void)
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(_KERNEL)
|
|
||||||
+/*
|
|
||||||
+ * Yield the next page struct and data offset and size within it, without
|
|
||||||
+ * mapping it into the address space.
|
|
||||||
+ */
|
|
||||||
+void
|
|
||||||
+abd_iter_page(struct abd_iter *aiter)
|
|
||||||
+{
|
|
||||||
+ if (abd_iter_at_end(aiter)) {
|
|
||||||
+ aiter->iter_page = NULL;
|
|
||||||
+ aiter->iter_page_doff = 0;
|
|
||||||
+ aiter->iter_page_dsize = 0;
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ struct page *page;
|
|
||||||
+ size_t doff, dsize;
|
|
||||||
+
|
|
||||||
+ if (abd_is_linear(aiter->iter_abd)) {
|
|
||||||
+ ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
|
|
||||||
+
|
|
||||||
+ /* memory address at iter_pos */
|
|
||||||
+ void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
|
|
||||||
+
|
|
||||||
+ /* struct page for address */
|
|
||||||
+ page = is_vmalloc_addr(paddr) ?
|
|
||||||
+ vmalloc_to_page(paddr) : virt_to_page(paddr);
|
|
||||||
+
|
|
||||||
+ /* offset of address within the page */
|
|
||||||
+ doff = offset_in_page(paddr);
|
|
||||||
+
|
|
||||||
+ /* total data remaining in abd from this position */
|
|
||||||
+ dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
|
|
||||||
+ } else {
|
|
||||||
+ ASSERT(!abd_is_gang(aiter->iter_abd));
|
|
||||||
+
|
|
||||||
+ /* current scatter page */
|
|
||||||
+ page = sg_page(aiter->iter_sg);
|
|
||||||
+
|
|
||||||
+ /* position within page */
|
|
||||||
+ doff = aiter->iter_offset;
|
|
||||||
+
|
|
||||||
+ /* remaining data in scatterlist */
|
|
||||||
+ dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
|
|
||||||
+ aiter->iter_abd->abd_size - aiter->iter_pos);
|
|
||||||
+ }
|
|
||||||
+ ASSERT(page);
|
|
||||||
+
|
|
||||||
+ if (PageTail(page)) {
|
|
||||||
+ /*
|
|
||||||
+ * This page is part of a "compound page", which is a group of
|
|
||||||
+ * pages that can be referenced from a single struct page *.
|
|
||||||
+ * It's organised as a "head" page, followed by a series of
|
|
||||||
+ * "tail" pages.
|
|
||||||
+ *
|
|
||||||
+ * In OpenZFS, compound pages are allocated using the
|
|
||||||
+ * __GFP_COMP flag, which we get from scatter ABDs and SPL
|
|
||||||
+ * vmalloc slabs (ie >16K allocations). So a great many of the
|
|
||||||
+ * IO buffers we get are going to be of this type.
|
|
||||||
+ *
|
|
||||||
+ * The tail pages are just regular PAGE_SIZE pages, and can be
|
|
||||||
+ * safely used as-is. However, the head page has length
|
|
||||||
+ * covering itself and all the tail pages. If this ABD chunk
|
|
||||||
+ * spans multiple pages, then we can use the head page and a
|
|
||||||
+ * >PAGE_SIZE length, which is far more efficient.
|
|
||||||
+ *
|
|
||||||
+ * To do this, we need to adjust the offset to be counted from
|
|
||||||
+ * the head page. struct page for compound pages are stored
|
|
||||||
+ * contiguously, so we can just adjust by a simple offset.
|
|
||||||
+ */
|
|
||||||
+ struct page *head = compound_head(page);
|
|
||||||
+ doff += ((page - head) * PAGESIZE);
|
|
||||||
+ page = head;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ /* final page and position within it */
|
|
||||||
+ aiter->iter_page = page;
|
|
||||||
+ aiter->iter_page_doff = doff;
|
|
||||||
+
|
|
||||||
+ /* amount of data in the chunk, up to the end of the page */
|
|
||||||
+ aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
/*
|
|
||||||
* bio_nr_pages for ABD.
|
|
||||||
* @off is the offset in @abd
|
|
||||||
@@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
|
|
||||||
module_param(zfs_abd_scatter_max_order, uint, 0644);
|
|
||||||
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
|
|
||||||
"Maximum order allocation used for a scatter ABD.");
|
|
||||||
-#endif
|
|
||||||
+
|
|
||||||
+#endif /* _KERNEL */
|
|
||||||
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
|
|
||||||
index d982f201c..3388e2357 100644
|
|
||||||
--- a/module/zfs/abd.c
|
|
||||||
+++ b/module/zfs/abd.c
|
|
||||||
@@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
|
|
||||||
return (ret);
|
|
||||||
}
|
|
||||||
|
|
||||||
+#if defined(__linux__) && defined(_KERNEL)
|
|
||||||
+int
|
|
||||||
+abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
|
|
||||||
+ abd_iter_page_func_t *func, void *private)
|
|
||||||
+{
|
|
||||||
+ struct abd_iter aiter;
|
|
||||||
+ int ret = 0;
|
|
||||||
+
|
|
||||||
+ if (size == 0)
|
|
||||||
+ return (0);
|
|
||||||
+
|
|
||||||
+ abd_verify(abd);
|
|
||||||
+ ASSERT3U(off + size, <=, abd->abd_size);
|
|
||||||
+
|
|
||||||
+ abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
|
|
||||||
+
|
|
||||||
+ while (size > 0) {
|
|
||||||
+ IMPLY(abd_is_gang(abd), c_abd != NULL);
|
|
||||||
+
|
|
||||||
+ abd_iter_page(&aiter);
|
|
||||||
+
|
|
||||||
+ size_t len = MIN(aiter.iter_page_dsize, size);
|
|
||||||
+ ASSERT3U(len, >, 0);
|
|
||||||
+
|
|
||||||
+ ret = func(aiter.iter_page, aiter.iter_page_doff,
|
|
||||||
+ len, private);
|
|
||||||
+
|
|
||||||
+ aiter.iter_page = NULL;
|
|
||||||
+ aiter.iter_page_doff = 0;
|
|
||||||
+ aiter.iter_page_dsize = 0;
|
|
||||||
+
|
|
||||||
+ if (ret != 0)
|
|
||||||
+ break;
|
|
||||||
+
|
|
||||||
+ size -= len;
|
|
||||||
+ c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return (ret);
|
|
||||||
+}
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
struct buf_arg {
|
|
||||||
void *arg_buf;
|
|
||||||
};
|
|
@ -1,349 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Date: Tue, 9 Jan 2024 12:12:56 +1100
|
|
||||||
Subject: [PATCH] vdev_disk: rename existing functions to vdev_classic_*
|
|
||||||
|
|
||||||
This is just renaming the existing functions we're about to replace and
|
|
||||||
grouping them together to make the next commits easier to follow.
|
|
||||||
|
|
||||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
|
||||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
||||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Sponsored-by: Klara, Inc.
|
|
||||||
Sponsored-by: Wasabi Technology, Inc.
|
|
||||||
Closes #15533
|
|
||||||
Closes #15588
|
|
||||||
(cherry picked from commit f3b85d706bae82957d2e3e0ef1d53a1cfab60eb4)
|
|
||||||
---
|
|
||||||
include/sys/abd.h | 2 +
|
|
||||||
module/os/linux/zfs/abd_os.c | 5 +
|
|
||||||
module/os/linux/zfs/vdev_disk.c | 215 +++++++++++++++++---------------
|
|
||||||
3 files changed, 120 insertions(+), 102 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/include/sys/abd.h b/include/sys/abd.h
|
|
||||||
index 8a2df0bca..bee38b831 100644
|
|
||||||
--- a/include/sys/abd.h
|
|
||||||
+++ b/include/sys/abd.h
|
|
||||||
@@ -220,6 +220,8 @@ void abd_fini(void);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Linux ABD bio functions
|
|
||||||
+ * Note: these are only needed to support vdev_classic. See comment in
|
|
||||||
+ * vdev_disk.c.
|
|
||||||
*/
|
|
||||||
#if defined(__linux__) && defined(_KERNEL)
|
|
||||||
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
|
|
||||||
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
|
|
||||||
index dae128012..3fe01c0b7 100644
|
|
||||||
--- a/module/os/linux/zfs/abd_os.c
|
|
||||||
+++ b/module/os/linux/zfs/abd_os.c
|
|
||||||
@@ -1096,6 +1096,11 @@ abd_iter_page(struct abd_iter *aiter)
|
|
||||||
aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
|
|
||||||
}
|
|
||||||
|
|
||||||
+/*
|
|
||||||
+ * Note: ABD BIO functions only needed to support vdev_classic. See comments in
|
|
||||||
+ * vdev_disk.c.
|
|
||||||
+ */
|
|
||||||
+
|
|
||||||
/*
|
|
||||||
* bio_nr_pages for ABD.
|
|
||||||
* @off is the offset in @abd
|
|
||||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
index b0bda5fa2..957619b87 100644
|
|
||||||
--- a/module/os/linux/zfs/vdev_disk.c
|
|
||||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
@@ -83,17 +83,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000;
|
|
||||||
*/
|
|
||||||
#define EFI_MIN_RESV_SIZE (16 * 1024)
|
|
||||||
|
|
||||||
-/*
|
|
||||||
- * Virtual device vector for disks.
|
|
||||||
- */
|
|
||||||
-typedef struct dio_request {
|
|
||||||
- zio_t *dr_zio; /* Parent ZIO */
|
|
||||||
- atomic_t dr_ref; /* References */
|
|
||||||
- int dr_error; /* Bio error */
|
|
||||||
- int dr_bio_count; /* Count of bio's */
|
|
||||||
- struct bio *dr_bio[]; /* Attached bio's */
|
|
||||||
-} dio_request_t;
|
|
||||||
-
|
|
||||||
/*
|
|
||||||
* BIO request failfast mask.
|
|
||||||
*/
|
|
||||||
@@ -467,85 +456,6 @@ vdev_disk_close(vdev_t *v)
|
|
||||||
v->vdev_tsd = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
-static dio_request_t *
|
|
||||||
-vdev_disk_dio_alloc(int bio_count)
|
|
||||||
-{
|
|
||||||
- dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
|
|
||||||
- sizeof (struct bio *) * bio_count, KM_SLEEP);
|
|
||||||
- atomic_set(&dr->dr_ref, 0);
|
|
||||||
- dr->dr_bio_count = bio_count;
|
|
||||||
- dr->dr_error = 0;
|
|
||||||
-
|
|
||||||
- for (int i = 0; i < dr->dr_bio_count; i++)
|
|
||||||
- dr->dr_bio[i] = NULL;
|
|
||||||
-
|
|
||||||
- return (dr);
|
|
||||||
-}
|
|
||||||
-
|
|
||||||
-static void
|
|
||||||
-vdev_disk_dio_free(dio_request_t *dr)
|
|
||||||
-{
|
|
||||||
- int i;
|
|
||||||
-
|
|
||||||
- for (i = 0; i < dr->dr_bio_count; i++)
|
|
||||||
- if (dr->dr_bio[i])
|
|
||||||
- bio_put(dr->dr_bio[i]);
|
|
||||||
-
|
|
||||||
- kmem_free(dr, sizeof (dio_request_t) +
|
|
||||||
- sizeof (struct bio *) * dr->dr_bio_count);
|
|
||||||
-}
|
|
||||||
-
|
|
||||||
-static void
|
|
||||||
-vdev_disk_dio_get(dio_request_t *dr)
|
|
||||||
-{
|
|
||||||
- atomic_inc(&dr->dr_ref);
|
|
||||||
-}
|
|
||||||
-
|
|
||||||
-static void
|
|
||||||
-vdev_disk_dio_put(dio_request_t *dr)
|
|
||||||
-{
|
|
||||||
- int rc = atomic_dec_return(&dr->dr_ref);
|
|
||||||
-
|
|
||||||
- /*
|
|
||||||
- * Free the dio_request when the last reference is dropped and
|
|
||||||
- * ensure zio_interpret is called only once with the correct zio
|
|
||||||
- */
|
|
||||||
- if (rc == 0) {
|
|
||||||
- zio_t *zio = dr->dr_zio;
|
|
||||||
- int error = dr->dr_error;
|
|
||||||
-
|
|
||||||
- vdev_disk_dio_free(dr);
|
|
||||||
-
|
|
||||||
- if (zio) {
|
|
||||||
- zio->io_error = error;
|
|
||||||
- ASSERT3S(zio->io_error, >=, 0);
|
|
||||||
- if (zio->io_error)
|
|
||||||
- vdev_disk_error(zio);
|
|
||||||
-
|
|
||||||
- zio_delay_interrupt(zio);
|
|
||||||
- }
|
|
||||||
- }
|
|
||||||
-}
|
|
||||||
-
|
|
||||||
-BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
|
|
||||||
-{
|
|
||||||
- dio_request_t *dr = bio->bi_private;
|
|
||||||
-
|
|
||||||
- if (dr->dr_error == 0) {
|
|
||||||
-#ifdef HAVE_1ARG_BIO_END_IO_T
|
|
||||||
- dr->dr_error = BIO_END_IO_ERROR(bio);
|
|
||||||
-#else
|
|
||||||
- if (error)
|
|
||||||
- dr->dr_error = -(error);
|
|
||||||
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
|
||||||
- dr->dr_error = EIO;
|
|
||||||
-#endif
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
- /* Drop reference acquired by __vdev_disk_physio */
|
|
||||||
- vdev_disk_dio_put(dr);
|
|
||||||
-}
|
|
||||||
-
|
|
||||||
static inline void
|
|
||||||
vdev_submit_bio_impl(struct bio *bio)
|
|
||||||
{
|
|
||||||
@@ -697,8 +607,107 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
|
|
||||||
return (bio);
|
|
||||||
}
|
|
||||||
|
|
||||||
+/* ========== */
|
|
||||||
+
|
|
||||||
+/*
|
|
||||||
+ * This is the classic, battle-tested BIO submission code.
|
|
||||||
+ *
|
|
||||||
+ * These functions have been renamed to vdev_classic_* to make it clear what
|
|
||||||
+ * they belong to, but their implementations are unchanged.
|
|
||||||
+ */
|
|
||||||
+
|
|
||||||
+/*
|
|
||||||
+ * Virtual device vector for disks.
|
|
||||||
+ */
|
|
||||||
+typedef struct dio_request {
|
|
||||||
+ zio_t *dr_zio; /* Parent ZIO */
|
|
||||||
+ atomic_t dr_ref; /* References */
|
|
||||||
+ int dr_error; /* Bio error */
|
|
||||||
+ int dr_bio_count; /* Count of bio's */
|
|
||||||
+ struct bio *dr_bio[]; /* Attached bio's */
|
|
||||||
+} dio_request_t;
|
|
||||||
+
|
|
||||||
+static dio_request_t *
|
|
||||||
+vdev_classic_dio_alloc(int bio_count)
|
|
||||||
+{
|
|
||||||
+ dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
|
|
||||||
+ sizeof (struct bio *) * bio_count, KM_SLEEP);
|
|
||||||
+ atomic_set(&dr->dr_ref, 0);
|
|
||||||
+ dr->dr_bio_count = bio_count;
|
|
||||||
+ dr->dr_error = 0;
|
|
||||||
+
|
|
||||||
+ for (int i = 0; i < dr->dr_bio_count; i++)
|
|
||||||
+ dr->dr_bio[i] = NULL;
|
|
||||||
+
|
|
||||||
+ return (dr);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static void
|
|
||||||
+vdev_classic_dio_free(dio_request_t *dr)
|
|
||||||
+{
|
|
||||||
+ int i;
|
|
||||||
+
|
|
||||||
+ for (i = 0; i < dr->dr_bio_count; i++)
|
|
||||||
+ if (dr->dr_bio[i])
|
|
||||||
+ bio_put(dr->dr_bio[i]);
|
|
||||||
+
|
|
||||||
+ kmem_free(dr, sizeof (dio_request_t) +
|
|
||||||
+ sizeof (struct bio *) * dr->dr_bio_count);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static void
|
|
||||||
+vdev_classic_dio_get(dio_request_t *dr)
|
|
||||||
+{
|
|
||||||
+ atomic_inc(&dr->dr_ref);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static void
|
|
||||||
+vdev_classic_dio_put(dio_request_t *dr)
|
|
||||||
+{
|
|
||||||
+ int rc = atomic_dec_return(&dr->dr_ref);
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * Free the dio_request when the last reference is dropped and
|
|
||||||
+ * ensure zio_interpret is called only once with the correct zio
|
|
||||||
+ */
|
|
||||||
+ if (rc == 0) {
|
|
||||||
+ zio_t *zio = dr->dr_zio;
|
|
||||||
+ int error = dr->dr_error;
|
|
||||||
+
|
|
||||||
+ vdev_classic_dio_free(dr);
|
|
||||||
+
|
|
||||||
+ if (zio) {
|
|
||||||
+ zio->io_error = error;
|
|
||||||
+ ASSERT3S(zio->io_error, >=, 0);
|
|
||||||
+ if (zio->io_error)
|
|
||||||
+ vdev_disk_error(zio);
|
|
||||||
+
|
|
||||||
+ zio_delay_interrupt(zio);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
|
|
||||||
+{
|
|
||||||
+ dio_request_t *dr = bio->bi_private;
|
|
||||||
+
|
|
||||||
+ if (dr->dr_error == 0) {
|
|
||||||
+#ifdef HAVE_1ARG_BIO_END_IO_T
|
|
||||||
+ dr->dr_error = BIO_END_IO_ERROR(bio);
|
|
||||||
+#else
|
|
||||||
+ if (error)
|
|
||||||
+ dr->dr_error = -(error);
|
|
||||||
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
|
||||||
+ dr->dr_error = EIO;
|
|
||||||
+#endif
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ /* Drop reference acquired by vdev_classic_physio */
|
|
||||||
+ vdev_classic_dio_put(dr);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
static inline unsigned int
|
|
||||||
-vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
|
||||||
+vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
|
||||||
{
|
|
||||||
unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
|
|
||||||
bio_size, abd_offset);
|
|
||||||
@@ -711,7 +720,7 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
|
||||||
}
|
|
||||||
|
|
||||||
static int
|
|
||||||
-__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
|
|
||||||
+vdev_classic_physio(struct block_device *bdev, zio_t *zio,
|
|
||||||
size_t io_size, uint64_t io_offset, int rw, int flags)
|
|
||||||
{
|
|
||||||
dio_request_t *dr;
|
|
||||||
@@ -736,7 +745,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
|
|
||||||
}
|
|
||||||
|
|
||||||
retry:
|
|
||||||
- dr = vdev_disk_dio_alloc(bio_count);
|
|
||||||
+ dr = vdev_classic_dio_alloc(bio_count);
|
|
||||||
|
|
||||||
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
|
|
||||||
zio->io_vd->vdev_failfast == B_TRUE) {
|
|
||||||
@@ -771,23 +780,23 @@ retry:
|
|
||||||
* this should be rare - see the comment above.
|
|
||||||
*/
|
|
||||||
if (dr->dr_bio_count == i) {
|
|
||||||
- vdev_disk_dio_free(dr);
|
|
||||||
+ vdev_classic_dio_free(dr);
|
|
||||||
bio_count *= 2;
|
|
||||||
goto retry;
|
|
||||||
}
|
|
||||||
|
|
||||||
- nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
|
|
||||||
+ nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
|
|
||||||
dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
|
|
||||||
if (unlikely(dr->dr_bio[i] == NULL)) {
|
|
||||||
- vdev_disk_dio_free(dr);
|
|
||||||
+ vdev_classic_dio_free(dr);
|
|
||||||
return (SET_ERROR(ENOMEM));
|
|
||||||
}
|
|
||||||
|
|
||||||
- /* Matching put called by vdev_disk_physio_completion */
|
|
||||||
- vdev_disk_dio_get(dr);
|
|
||||||
+ /* Matching put called by vdev_classic_physio_completion */
|
|
||||||
+ vdev_classic_dio_get(dr);
|
|
||||||
|
|
||||||
BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
|
|
||||||
- dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
|
|
||||||
+ dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
|
|
||||||
dr->dr_bio[i]->bi_private = dr;
|
|
||||||
bio_set_op_attrs(dr->dr_bio[i], rw, flags);
|
|
||||||
|
|
||||||
@@ -801,7 +810,7 @@ retry:
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Extra reference to protect dio_request during vdev_submit_bio */
|
|
||||||
- vdev_disk_dio_get(dr);
|
|
||||||
+ vdev_classic_dio_get(dr);
|
|
||||||
|
|
||||||
if (dr->dr_bio_count > 1)
|
|
||||||
blk_start_plug(&plug);
|
|
||||||
@@ -815,11 +824,13 @@ retry:
|
|
||||||
if (dr->dr_bio_count > 1)
|
|
||||||
blk_finish_plug(&plug);
|
|
||||||
|
|
||||||
- vdev_disk_dio_put(dr);
|
|
||||||
+ vdev_classic_dio_put(dr);
|
|
||||||
|
|
||||||
return (error);
|
|
||||||
}
|
|
||||||
|
|
||||||
+/* ========== */
|
|
||||||
+
|
|
||||||
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
|
|
||||||
{
|
|
||||||
zio_t *zio = bio->bi_private;
|
|
||||||
@@ -1023,7 +1034,7 @@ vdev_disk_io_start(zio_t *zio)
|
|
||||||
}
|
|
||||||
|
|
||||||
zio->io_target_timestamp = zio_handle_io_delay(zio);
|
|
||||||
- error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio,
|
|
||||||
+ error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio,
|
|
||||||
zio->io_size, zio->io_offset, rw, 0);
|
|
||||||
rw_exit(&vd->vd_lock);
|
|
||||||
|
|
@ -1,111 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Date: Tue, 9 Jan 2024 12:23:30 +1100
|
|
||||||
Subject: [PATCH] vdev_disk: reorganise vdev_disk_io_start
|
|
||||||
|
|
||||||
Light reshuffle to make it a bit more linear to read and get rid of a
|
|
||||||
bunch of args that aren't needed in all cases.
|
|
||||||
|
|
||||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
|
||||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
||||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Sponsored-by: Klara, Inc.
|
|
||||||
Sponsored-by: Wasabi Technology, Inc.
|
|
||||||
Closes #15533
|
|
||||||
Closes #15588
|
|
||||||
(cherry picked from commit 867178ae1db28e73051c8a7ce662f2f2f81cd8e6)
|
|
||||||
---
|
|
||||||
module/os/linux/zfs/vdev_disk.c | 51 ++++++++++++++++++++-------------
|
|
||||||
1 file changed, 31 insertions(+), 20 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
index 957619b87..51e7cef2f 100644
|
|
||||||
--- a/module/os/linux/zfs/vdev_disk.c
|
|
||||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
@@ -720,9 +720,16 @@ vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
|
||||||
}
|
|
||||||
|
|
||||||
static int
|
|
||||||
-vdev_classic_physio(struct block_device *bdev, zio_t *zio,
|
|
||||||
- size_t io_size, uint64_t io_offset, int rw, int flags)
|
|
||||||
+vdev_classic_physio(zio_t *zio)
|
|
||||||
{
|
|
||||||
+ vdev_t *v = zio->io_vd;
|
|
||||||
+ vdev_disk_t *vd = v->vdev_tsd;
|
|
||||||
+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
|
|
||||||
+ size_t io_size = zio->io_size;
|
|
||||||
+ uint64_t io_offset = zio->io_offset;
|
|
||||||
+ int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
|
|
||||||
+ int flags = 0;
|
|
||||||
+
|
|
||||||
dio_request_t *dr;
|
|
||||||
uint64_t abd_offset;
|
|
||||||
uint64_t bio_offset;
|
|
||||||
@@ -944,7 +951,7 @@ vdev_disk_io_start(zio_t *zio)
|
|
||||||
{
|
|
||||||
vdev_t *v = zio->io_vd;
|
|
||||||
vdev_disk_t *vd = v->vdev_tsd;
|
|
||||||
- int rw, error;
|
|
||||||
+ int error;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If the vdev is closed, it's likely in the REMOVED or FAULTED state.
|
|
||||||
@@ -1007,13 +1014,6 @@ vdev_disk_io_start(zio_t *zio)
|
|
||||||
rw_exit(&vd->vd_lock);
|
|
||||||
zio_execute(zio);
|
|
||||||
return;
|
|
||||||
- case ZIO_TYPE_WRITE:
|
|
||||||
- rw = WRITE;
|
|
||||||
- break;
|
|
||||||
-
|
|
||||||
- case ZIO_TYPE_READ:
|
|
||||||
- rw = READ;
|
|
||||||
- break;
|
|
||||||
|
|
||||||
case ZIO_TYPE_TRIM:
|
|
||||||
zio->io_error = vdev_disk_io_trim(zio);
|
|
||||||
@@ -1026,23 +1026,34 @@ vdev_disk_io_start(zio_t *zio)
|
|
||||||
#endif
|
|
||||||
return;
|
|
||||||
|
|
||||||
- default:
|
|
||||||
+ case ZIO_TYPE_READ:
|
|
||||||
+ case ZIO_TYPE_WRITE:
|
|
||||||
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
|
|
||||||
+ error = vdev_classic_physio(zio);
|
|
||||||
rw_exit(&vd->vd_lock);
|
|
||||||
- zio->io_error = SET_ERROR(ENOTSUP);
|
|
||||||
- zio_interrupt(zio);
|
|
||||||
+ if (error) {
|
|
||||||
+ zio->io_error = error;
|
|
||||||
+ zio_interrupt(zio);
|
|
||||||
+ }
|
|
||||||
return;
|
|
||||||
- }
|
|
||||||
|
|
||||||
- zio->io_target_timestamp = zio_handle_io_delay(zio);
|
|
||||||
- error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio,
|
|
||||||
- zio->io_size, zio->io_offset, rw, 0);
|
|
||||||
- rw_exit(&vd->vd_lock);
|
|
||||||
+ default:
|
|
||||||
+ /*
|
|
||||||
+ * Getting here means our parent vdev has made a very strange
|
|
||||||
+ * request of us, and shouldn't happen. Assert here to force a
|
|
||||||
+ * crash in dev builds, but in production return the IO
|
|
||||||
+ * unhandled. The pool will likely suspend anyway but that's
|
|
||||||
+ * nicer than crashing the kernel.
|
|
||||||
+ */
|
|
||||||
+ ASSERT3S(zio->io_type, ==, -1);
|
|
||||||
|
|
||||||
- if (error) {
|
|
||||||
- zio->io_error = error;
|
|
||||||
+ rw_exit(&vd->vd_lock);
|
|
||||||
+ zio->io_error = SET_ERROR(ENOTSUP);
|
|
||||||
zio_interrupt(zio);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
+
|
|
||||||
+ __builtin_unreachable();
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
@ -1,69 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Date: Tue, 9 Jan 2024 12:29:19 +1100
|
|
||||||
Subject: [PATCH] vdev_disk: make read/write IO function configurable
|
|
||||||
|
|
||||||
This is just setting up for the next couple of commits, which will add a
|
|
||||||
new IO function and a parameter to select it.
|
|
||||||
|
|
||||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
|
||||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
||||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Sponsored-by: Klara, Inc.
|
|
||||||
Sponsored-by: Wasabi Technology, Inc.
|
|
||||||
Closes #15533
|
|
||||||
Closes #15588
|
|
||||||
(cherry picked from commit c4a13ba483f08a81aa47479d2f763a470d95b2b0)
|
|
||||||
---
|
|
||||||
module/os/linux/zfs/vdev_disk.c | 23 +++++++++++++++++++++--
|
|
||||||
1 file changed, 21 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
index 51e7cef2f..de4dba72f 100644
|
|
||||||
--- a/module/os/linux/zfs/vdev_disk.c
|
|
||||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
@@ -946,6 +946,8 @@ vdev_disk_io_trim(zio_t *zio)
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
+int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
|
|
||||||
+
|
|
||||||
static void
|
|
||||||
vdev_disk_io_start(zio_t *zio)
|
|
||||||
{
|
|
||||||
@@ -1029,7 +1031,7 @@ vdev_disk_io_start(zio_t *zio)
|
|
||||||
case ZIO_TYPE_READ:
|
|
||||||
case ZIO_TYPE_WRITE:
|
|
||||||
zio->io_target_timestamp = zio_handle_io_delay(zio);
|
|
||||||
- error = vdev_classic_physio(zio);
|
|
||||||
+ error = vdev_disk_io_rw_fn(zio);
|
|
||||||
rw_exit(&vd->vd_lock);
|
|
||||||
if (error) {
|
|
||||||
zio->io_error = error;
|
|
||||||
@@ -1102,8 +1104,25 @@ vdev_disk_rele(vdev_t *vd)
|
|
||||||
/* XXX: Implement me as a vnode rele for the device */
|
|
||||||
}
|
|
||||||
|
|
||||||
+/*
|
|
||||||
+ * At first use vdev use, set the submission function from the default value if
|
|
||||||
+ * it hasn't been set already.
|
|
||||||
+ */
|
|
||||||
+static int
|
|
||||||
+vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
|
|
||||||
+{
|
|
||||||
+ (void) spa;
|
|
||||||
+ (void) nv;
|
|
||||||
+ (void) tsd;
|
|
||||||
+
|
|
||||||
+ if (vdev_disk_io_rw_fn == NULL)
|
|
||||||
+ vdev_disk_io_rw_fn = vdev_classic_physio;
|
|
||||||
+
|
|
||||||
+ return (0);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
vdev_ops_t vdev_disk_ops = {
|
|
||||||
- .vdev_op_init = NULL,
|
|
||||||
+ .vdev_op_init = vdev_disk_init,
|
|
||||||
.vdev_op_fini = NULL,
|
|
||||||
.vdev_op_open = vdev_disk_open,
|
|
||||||
.vdev_op_close = vdev_disk_close,
|
|
@ -1,671 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Date: Tue, 18 Jul 2023 11:11:29 +1000
|
|
||||||
Subject: [PATCH] vdev_disk: rewrite BIO filling machinery to avoid split pages
|
|
||||||
|
|
||||||
This commit tackles a number of issues in the way BIOs (`struct bio`)
|
|
||||||
are constructed for submission to the Linux block layer.
|
|
||||||
|
|
||||||
The kernel has a hard upper limit on the number of pages/segments that
|
|
||||||
can be added to a BIO, as well as a separate limit for each device
|
|
||||||
(related to its queue depth and other scheduling characteristics).
|
|
||||||
|
|
||||||
ZFS counts the number of memory pages in the request ABD
|
|
||||||
(`abd_nr_pages_off()`), and then uses that as the number of segments to
|
|
||||||
put into the BIO, up to the hard upper limit. If it requires more than
|
|
||||||
the limit, it will create multiple BIOs.
|
|
||||||
|
|
||||||
Leaving aside the fact that the page count method is wrong (see below), not
|
|
||||||
limiting to the device segment max means that the device driver will
|
|
||||||
need to split the BIO in half. This alone is not necessarily a
|
|
||||||
problem, but it interacts with another issue to cause a much larger
|
|
||||||
problem.
|
|
||||||
|
|
||||||
The kernel function to add a segment to a BIO (`bio_add_page()`) takes a
|
|
||||||
`struct page` pointer, and offset+len within it. `struct page` can
|
|
||||||
represent a run of contiguous memory pages (known as a "compound page").
|
|
||||||
It can be of arbitrary length.
|
|
||||||
|
|
||||||
The ZFS functions that count ABD pages and load them into the BIO
|
|
||||||
(`abd_nr_pages_off()`, `bio_map()` and `abd_bio_map_off()`) will never
|
|
||||||
consider a page to be more than `PAGE_SIZE` (4K), even if the `struct
|
|
||||||
page` is for multiple pages. In this case, it will load the same `struct
|
|
||||||
page` into the BIO multiple times, with the offset adjusted each time.
|
|
||||||
|
|
||||||
With a sufficiently large ABD, this can easily lead to the BIO being
|
|
||||||
entirely filled much earlier than it could have been. This also
|
|
||||||
further contributes to the problem caused by the incorrect segment limit
|
|
||||||
calculation, as it's much easier to go past the device limit, and so
|
|
||||||
require a split.
|
|
||||||
|
|
||||||
Again, this is not a problem on its own.
|
|
||||||
|
|
||||||
The logic for "never submit more than `PAGE_SIZE`" is actually a little
|
|
||||||
more subtle. It will actually never submit a buffer that crosses a 4K
|
|
||||||
page boundary.
|
|
||||||
|
|
||||||
In practice, this is fine, as most ABDs are scattered, that is a list of
|
|
||||||
complete 4K pages, and so are loaded in as such.
|
|
||||||
|
|
||||||
Linear ABDs are typically allocated from slabs, and for small sizes they
|
|
||||||
are frequently not aligned to page boundaries. For example, a 12K
|
|
||||||
allocation can span four pages, eg:
|
|
||||||
|
|
||||||
-- 4K -- -- 4K -- -- 4K -- -- 4K --
|
|
||||||
| | | | |
|
|
||||||
:## ######## ######## ######: [1K, 4K, 4K, 3K]
|
|
||||||
|
|
||||||
Such an allocation would be loaded into a BIO as you see:
|
|
||||||
|
|
||||||
[1K, 4K, 4K, 3K]
|
|
||||||
|
|
||||||
This tends not to be a problem in practice, because even if the BIO were
|
|
||||||
filled and needed to be split, each half would still have either a start
|
|
||||||
or end aligned to the logical block size of the device (assuming 4K at
|
|
||||||
least).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
In ideal circumstances, these shortcomings don't cause any particular
|
|
||||||
problems. It's when they start to interact with other ZFS features that
|
|
||||||
things get interesting.
|
|
||||||
|
|
||||||
Aggregation will create a "gang" ABD, which is simply a list of other
|
|
||||||
ABDs. Iterating over a gang ABD is just iterating over each ABD within
|
|
||||||
it in turn.
|
|
||||||
|
|
||||||
Because the segments are simply loaded in order, we can end up with
|
|
||||||
uneven segments either side of the "gap" between the two ABDs. For
|
|
||||||
example, two 12K ABDs might be aggregated and then loaded as:
|
|
||||||
|
|
||||||
[1K, 4K, 4K, 3K, 2K, 4K, 4K, 2K]
|
|
||||||
|
|
||||||
Should a split occur, each individual BIO can end up either having a
|
|
||||||
start or end offset that is not aligned to the logical block size, which
|
|
||||||
some drivers (eg SCSI) will reject. However, this tends not to happen
|
|
||||||
because the default aggregation limit usually keeps the BIO small enough
|
|
||||||
to not require more than one split, and most pages are actually full 4K
|
|
||||||
pages, so hitting an uneven gap is very rare anyway.
|
|
||||||
|
|
||||||
If the pool is under particular memory pressure, then an IO can be
|
|
||||||
broken down into a "gang block", a 512-byte block composed of a header
|
|
||||||
and up to three block pointers. Each points to a fragment of the
|
|
||||||
original write, or in turn, another gang block, breaking the original
|
|
||||||
data up over and over until space can be found in the pool for each of
|
|
||||||
them.
|
|
||||||
|
|
||||||
Each gang header is a separate 512-byte memory allocation from a slab,
|
|
||||||
that needs to be written down to disk. When the gang header is added to
|
|
||||||
the BIO, it's a single 512-byte segment.
|
|
||||||
|
|
||||||
Pulling all this together, consider a large aggregated write of gang
|
|
||||||
blocks. This results in a BIO containing lots of 512-byte segments. Given
|
|
||||||
our tendency to overfill the BIO, a split is likely, and most possible
|
|
||||||
split points will yield a pair of BIOs that are misaligned. Drivers that
|
|
||||||
care, like the SCSI driver, will reject them.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
This commit is a substantial refactor and rewrite of much of `vdev_disk`
|
|
||||||
to sort all this out.
|
|
||||||
|
|
||||||
`vdev_bio_max_segs()` now returns the ideal maximum size for the device,
|
|
||||||
if available. There's also a tuneable `zfs_vdev_disk_max_segs` to
|
|
||||||
override this, to assist with testing.
|
|
||||||
|
|
||||||
We scan the ABD up front to count the number of pages within it, and to
|
|
||||||
confirm that if we submitted all those pages to one or more BIOs, it
|
|
||||||
could be split at any point without creating a misaligned BIO. If the
|
|
||||||
pages in the BIO are not usable (as in any of the above situations), the
|
|
||||||
ABD is linearised, and then checked again. This is the same technique
|
|
||||||
used in `vdev_geom` on FreeBSD, adjusted for Linux's variable page size
|
|
||||||
and allocator quirks.
|
|
||||||
|
|
||||||
`vbio_t` is a cleanup and enhancement of the old `dio_request_t`. The
|
|
||||||
idea is simply that it can hold all the state needed to create, submit
|
|
||||||
and return multiple BIOs, including all the refcounts, the ABD copy if
|
|
||||||
it was needed, and so on. Apart from what I hope is a clearer interface,
|
|
||||||
the major difference is that because we know how many BIOs we'll need up
|
|
||||||
front, we don't need the old overflow logic that would grow the BIO
|
|
||||||
array, throw away all the old work and restart. We can get it right from
|
|
||||||
the start.
|
|
||||||
|
|
||||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
|
||||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
||||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Sponsored-by: Klara, Inc.
|
|
||||||
Sponsored-by: Wasabi Technology, Inc.
|
|
||||||
Closes #15533
|
|
||||||
Closes #15588
|
|
||||||
(cherry picked from commit 06a196020e6f70d2fedbd4d0d05bbe0c1ac6e4d8)
|
|
||||||
---
|
|
||||||
include/os/linux/kernel/linux/mod_compat.h | 1 +
|
|
||||||
man/man4/zfs.4 | 10 +-
|
|
||||||
module/os/linux/zfs/vdev_disk.c | 439 ++++++++++++++++++++-
|
|
||||||
3 files changed, 447 insertions(+), 3 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h
|
|
||||||
index 8e20a9613..039865b70 100644
|
|
||||||
--- a/include/os/linux/kernel/linux/mod_compat.h
|
|
||||||
+++ b/include/os/linux/kernel/linux/mod_compat.h
|
|
||||||
@@ -68,6 +68,7 @@ enum scope_prefix_types {
|
|
||||||
zfs_trim,
|
|
||||||
zfs_txg,
|
|
||||||
zfs_vdev,
|
|
||||||
+ zfs_vdev_disk,
|
|
||||||
zfs_vdev_file,
|
|
||||||
zfs_vdev_mirror,
|
|
||||||
zfs_vnops,
|
|
||||||
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
|
|
||||||
index 352990e02..b5679f2f0 100644
|
|
||||||
--- a/man/man4/zfs.4
|
|
||||||
+++ b/man/man4/zfs.4
|
|
||||||
@@ -2,6 +2,7 @@
|
|
||||||
.\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
|
|
||||||
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
|
|
||||||
.\" Copyright (c) 2019 Datto Inc.
|
|
||||||
+.\" Copyright (c) 2023, 2024 Klara, Inc.
|
|
||||||
.\" The contents of this file are subject to the terms of the Common Development
|
|
||||||
.\" and Distribution License (the "License"). You may not use this file except
|
|
||||||
.\" in compliance with the License. You can obtain a copy of the license at
|
|
||||||
@@ -15,7 +16,7 @@
|
|
||||||
.\" own identifying information:
|
|
||||||
.\" Portions Copyright [yyyy] [name of copyright owner]
|
|
||||||
.\"
|
|
||||||
-.Dd July 21, 2023
|
|
||||||
+.Dd January 9, 2024
|
|
||||||
.Dt ZFS 4
|
|
||||||
.Os
|
|
||||||
.
|
|
||||||
@@ -1345,6 +1346,13 @@ _
|
|
||||||
4 Driver No driver retries on driver errors.
|
|
||||||
.TE
|
|
||||||
.
|
|
||||||
+.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
|
|
||||||
+Maximum number of segments to add to a BIO (min 4).
|
|
||||||
+If this is higher than the maximum allowed by the device queue or the kernel
|
|
||||||
+itself, it will be clamped.
|
|
||||||
+Setting it to zero will cause the kernel's ideal size to be used.
|
|
||||||
+This parameter only applies on Linux.
|
|
||||||
+.
|
|
||||||
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
|
|
||||||
Time before expiring
|
|
||||||
.Pa .zfs/snapshot .
|
|
||||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
index de4dba72f..0ccb9ad96 100644
|
|
||||||
--- a/module/os/linux/zfs/vdev_disk.c
|
|
||||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
@@ -24,6 +24,7 @@
|
|
||||||
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
|
|
||||||
* LLNL-CODE-403049.
|
|
||||||
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
|
|
||||||
+ * Copyright (c) 2023, 2024, Klara Inc.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <sys/zfs_context.h>
|
|
||||||
@@ -66,6 +67,13 @@ typedef struct vdev_disk {
|
|
||||||
krwlock_t vd_lock;
|
|
||||||
} vdev_disk_t;
|
|
||||||
|
|
||||||
+/*
|
|
||||||
+ * Maximum number of segments to add to a bio (min 4). If this is higher than
|
|
||||||
+ * the maximum allowed by the device queue or the kernel itself, it will be
|
|
||||||
+ * clamped. Setting it to zero will cause the kernel's ideal size to be used.
|
|
||||||
+ */
|
|
||||||
+uint_t zfs_vdev_disk_max_segs = 0;
|
|
||||||
+
|
|
||||||
/*
|
|
||||||
* Unique identifier for the exclusive vdev holder.
|
|
||||||
*/
|
|
||||||
@@ -607,10 +615,433 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
|
|
||||||
return (bio);
|
|
||||||
}
|
|
||||||
|
|
||||||
+static inline uint_t
|
|
||||||
+vdev_bio_max_segs(struct block_device *bdev)
|
|
||||||
+{
|
|
||||||
+ /*
|
|
||||||
+ * Smallest of the device max segs and the tuneable max segs. Minimum
|
|
||||||
+ * 4, so there's room to finish split pages if they come up.
|
|
||||||
+ */
|
|
||||||
+ const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
|
|
||||||
+ const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
|
|
||||||
+ MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
|
|
||||||
+ const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);
|
|
||||||
+
|
|
||||||
+#ifdef HAVE_BIO_MAX_SEGS
|
|
||||||
+ return (bio_max_segs(max_segs));
|
|
||||||
+#else
|
|
||||||
+ return (MIN(max_segs, BIO_MAX_PAGES));
|
|
||||||
+#endif
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static inline uint_t
|
|
||||||
+vdev_bio_max_bytes(struct block_device *bdev)
|
|
||||||
+{
|
|
||||||
+ return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+/*
|
|
||||||
+ * Virtual block IO object (VBIO)
|
|
||||||
+ *
|
|
||||||
+ * Linux block IO (BIO) objects have a limit on how many data segments (pages)
|
|
||||||
+ * they can hold. Depending on how they're allocated and structured, a large
|
|
||||||
+ * ZIO can require more than one BIO to be submitted to the kernel, which then
|
|
||||||
+ * all have to complete before we can return the completed ZIO back to ZFS.
|
|
||||||
+ *
|
|
||||||
+ * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
|
|
||||||
+ * translate a ZIO down into the kernel block layer and back again.
|
|
||||||
+ *
|
|
||||||
+ * Note that these are only used for data ZIOs (read/write). Meta-operations
|
|
||||||
+ * (flush/trim) don't need multiple BIOs and so can just make the call
|
|
||||||
+ * directly.
|
|
||||||
+ */
|
|
||||||
+typedef struct {
|
|
||||||
+ zio_t *vbio_zio; /* parent zio */
|
|
||||||
+
|
|
||||||
+ struct block_device *vbio_bdev; /* blockdev to submit bios to */
|
|
||||||
+
|
|
||||||
+ abd_t *vbio_abd; /* abd carrying borrowed linear buf */
|
|
||||||
+
|
|
||||||
+ atomic_t vbio_ref; /* bio refcount */
|
|
||||||
+ int vbio_error; /* error from failed bio */
|
|
||||||
+
|
|
||||||
+ uint_t vbio_max_segs; /* max segs per bio */
|
|
||||||
+
|
|
||||||
+ uint_t vbio_max_bytes; /* max bytes per bio */
|
|
||||||
+ uint_t vbio_lbs_mask; /* logical block size mask */
|
|
||||||
+
|
|
||||||
+ uint64_t vbio_offset; /* start offset of next bio */
|
|
||||||
+
|
|
||||||
+ struct bio *vbio_bio; /* pointer to the current bio */
|
|
||||||
+ struct bio *vbio_bios; /* list of all bios */
|
|
||||||
+} vbio_t;
|
|
||||||
+
|
|
||||||
+static vbio_t *
|
|
||||||
+vbio_alloc(zio_t *zio, struct block_device *bdev)
|
|
||||||
+{
|
|
||||||
+ vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
|
|
||||||
+
|
|
||||||
+ vbio->vbio_zio = zio;
|
|
||||||
+ vbio->vbio_bdev = bdev;
|
|
||||||
+ atomic_set(&vbio->vbio_ref, 0);
|
|
||||||
+ vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
|
|
||||||
+ vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
|
|
||||||
+ vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
|
|
||||||
+ vbio->vbio_offset = zio->io_offset;
|
|
||||||
+
|
|
||||||
+ return (vbio);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
|
|
||||||
+{
|
|
||||||
+ struct bio *bio;
|
|
||||||
+ uint_t ssize;
|
|
||||||
+
|
|
||||||
+ while (size > 0) {
|
|
||||||
+ bio = vbio->vbio_bio;
|
|
||||||
+ if (bio == NULL) {
|
|
||||||
+ /* New BIO, allocate and set up */
|
|
||||||
+ bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
|
|
||||||
+ vbio->vbio_max_segs);
|
|
||||||
+ if (unlikely(bio == NULL))
|
|
||||||
+ return (SET_ERROR(ENOMEM));
|
|
||||||
+ BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
|
|
||||||
+
|
|
||||||
+ bio->bi_next = vbio->vbio_bios;
|
|
||||||
+ vbio->vbio_bios = vbio->vbio_bio = bio;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * Only load as much of the current page data as will fit in
|
|
||||||
+ * the space left in the BIO, respecting lbs alignment. Older
|
|
||||||
+ * kernels will error if we try to overfill the BIO, while
|
|
||||||
+ * newer ones will accept it and split the BIO. This ensures
|
|
||||||
+ * everything works on older kernels, and avoids an additional
|
|
||||||
+ * overhead on the new.
|
|
||||||
+ */
|
|
||||||
+ ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
|
|
||||||
+ vbio->vbio_lbs_mask);
|
|
||||||
+ if (ssize > 0 &&
|
|
||||||
+ bio_add_page(bio, page, ssize, offset) == ssize) {
|
|
||||||
+ /* Accepted, adjust and load any remaining. */
|
|
||||||
+ size -= ssize;
|
|
||||||
+ offset += ssize;
|
|
||||||
+ continue;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ /* No room, set up for a new BIO and loop */
|
|
||||||
+ vbio->vbio_offset += BIO_BI_SIZE(bio);
|
|
||||||
+
|
|
||||||
+ /* Signal new BIO allocation wanted */
|
|
||||||
+ vbio->vbio_bio = NULL;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return (0);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error);
|
|
||||||
+static void vbio_put(vbio_t *vbio);
|
|
||||||
+
|
|
||||||
+static void
|
|
||||||
+vbio_submit(vbio_t *vbio, int flags)
|
|
||||||
+{
|
|
||||||
+ ASSERT(vbio->vbio_bios);
|
|
||||||
+ struct bio *bio = vbio->vbio_bios;
|
|
||||||
+ vbio->vbio_bio = vbio->vbio_bios = NULL;
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * We take a reference for each BIO as we submit it, plus one to
|
|
||||||
+ * protect us from BIOs completing before we're done submitting them
|
|
||||||
+ * all, causing vbio_put() to free vbio out from under us and/or the
|
|
||||||
+ * zio to be returned before all its IO has completed.
|
|
||||||
+ */
|
|
||||||
+ atomic_set(&vbio->vbio_ref, 1);
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * If we're submitting more than one BIO, inform the block layer so
|
|
||||||
+ * it can batch them if it wants.
|
|
||||||
+ */
|
|
||||||
+ struct blk_plug plug;
|
|
||||||
+ boolean_t do_plug = (bio->bi_next != NULL);
|
|
||||||
+ if (do_plug)
|
|
||||||
+ blk_start_plug(&plug);
|
|
||||||
+
|
|
||||||
+ /* Submit all the BIOs */
|
|
||||||
+ while (bio != NULL) {
|
|
||||||
+ atomic_inc(&vbio->vbio_ref);
|
|
||||||
+
|
|
||||||
+ struct bio *next = bio->bi_next;
|
|
||||||
+ bio->bi_next = NULL;
|
|
||||||
+
|
|
||||||
+ bio->bi_end_io = vdev_disk_io_rw_completion;
|
|
||||||
+ bio->bi_private = vbio;
|
|
||||||
+ bio_set_op_attrs(bio,
|
|
||||||
+ vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
|
|
||||||
+ WRITE : READ, flags);
|
|
||||||
+
|
|
||||||
+ vdev_submit_bio(bio);
|
|
||||||
+
|
|
||||||
+ bio = next;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ /* Finish the batch */
|
|
||||||
+ if (do_plug)
|
|
||||||
+ blk_finish_plug(&plug);
|
|
||||||
+
|
|
||||||
+ /* Release the extra reference */
|
|
||||||
+ vbio_put(vbio);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static void
|
|
||||||
+vbio_return_abd(vbio_t *vbio)
|
|
||||||
+{
|
|
||||||
+ zio_t *zio = vbio->vbio_zio;
|
|
||||||
+ if (vbio->vbio_abd == NULL)
|
|
||||||
+ return;
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * If we copied the ABD before issuing it, clean up and return the copy
|
|
||||||
+ * to the ABD, with changes if appropriate.
|
|
||||||
+ */
|
|
||||||
+ void *buf = abd_to_buf(vbio->vbio_abd);
|
|
||||||
+ abd_free(vbio->vbio_abd);
|
|
||||||
+ vbio->vbio_abd = NULL;
|
|
||||||
+
|
|
||||||
+ if (zio->io_type == ZIO_TYPE_READ)
|
|
||||||
+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
|
|
||||||
+ else
|
|
||||||
+ abd_return_buf(zio->io_abd, buf, zio->io_size);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static void
|
|
||||||
+vbio_free(vbio_t *vbio)
|
|
||||||
+{
|
|
||||||
+ VERIFY0(atomic_read(&vbio->vbio_ref));
|
|
||||||
+
|
|
||||||
+ vbio_return_abd(vbio);
|
|
||||||
+
|
|
||||||
+ kmem_free(vbio, sizeof (vbio_t));
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static void
|
|
||||||
+vbio_put(vbio_t *vbio)
|
|
||||||
+{
|
|
||||||
+ if (atomic_dec_return(&vbio->vbio_ref) > 0)
|
|
||||||
+ return;
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * This was the last reference, so the entire IO is completed. Clean
|
|
||||||
+ * up and submit it for processing.
|
|
||||||
+ */
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * Get any data buf back to the original ABD, if necessary. We do this
|
|
||||||
+ * now so we can get the ZIO into the pipeline as quickly as possible,
|
|
||||||
+ * and then do the remaining cleanup after.
|
|
||||||
+ */
|
|
||||||
+ vbio_return_abd(vbio);
|
|
||||||
+
|
|
||||||
+ zio_t *zio = vbio->vbio_zio;
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * Set the overall error. If multiple BIOs returned an error, only the
|
|
||||||
+ * first will be taken; the others are dropped (see
|
|
||||||
+ * vdev_disk_io_rw_completion()). It's pretty much impossible for
|
|
||||||
+ * multiple IOs to the same device to fail with different errors, so
|
|
||||||
+ * there's no real risk.
|
|
||||||
+ */
|
|
||||||
+ zio->io_error = vbio->vbio_error;
|
|
||||||
+ if (zio->io_error)
|
|
||||||
+ vdev_disk_error(zio);
|
|
||||||
+
|
|
||||||
+ /* All done, submit for processing */
|
|
||||||
+ zio_delay_interrupt(zio);
|
|
||||||
+
|
|
||||||
+ /* Finish cleanup */
|
|
||||||
+ vbio_free(vbio);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
|
|
||||||
+{
|
|
||||||
+ vbio_t *vbio = bio->bi_private;
|
|
||||||
+
|
|
||||||
+ if (vbio->vbio_error == 0) {
|
|
||||||
+#ifdef HAVE_1ARG_BIO_END_IO_T
|
|
||||||
+ vbio->vbio_error = BIO_END_IO_ERROR(bio);
|
|
||||||
+#else
|
|
||||||
+ if (error)
|
|
||||||
+ vbio->vbio_error = -(error);
|
|
||||||
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
|
||||||
+ vbio->vbio_error = EIO;
|
|
||||||
+#endif
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * Destroy the BIO. This is safe to do; the vbio owns its data and the
|
|
||||||
+ * kernel won't touch it again after the completion function runs.
|
|
||||||
+ */
|
|
||||||
+ bio_put(bio);
|
|
||||||
+
|
|
||||||
+ /* Drop this BIO's reference acquired by vbio_submit() */
|
|
||||||
+ vbio_put(vbio);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+/*
|
|
||||||
+ * Iterator callback to count ABD pages and check their size & alignment.
|
|
||||||
+ *
|
|
||||||
+ * On Linux, each BIO segment can take a page pointer, and an offset+length of
|
|
||||||
+ * the data within that page. A page can be arbitrarily large ("compound"
|
|
||||||
+ * pages) but we still have to ensure the data portion is correctly sized and
|
|
||||||
+ * aligned to the logical block size, to ensure that if the kernel wants to
|
|
||||||
+ * split the BIO, the two halves will still be properly aligned.
|
|
||||||
+ */
|
|
||||||
+typedef struct {
|
|
||||||
+ uint_t bmask;
|
|
||||||
+ uint_t npages;
|
|
||||||
+ uint_t end;
|
|
||||||
+} vdev_disk_check_pages_t;
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
|
|
||||||
+{
|
|
||||||
+ vdev_disk_check_pages_t *s = priv;
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * If we didn't finish on a block size boundary last time, then there
|
|
||||||
+ * would be a gap if we tried to use this ABD as-is, so abort.
|
|
||||||
+ */
|
|
||||||
+ if (s->end != 0)
|
|
||||||
+ return (1);
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * Note if we're taking less than a full block, so we can check it
|
|
||||||
+ * above on the next call.
|
|
||||||
+ */
|
|
||||||
+ s->end = len & s->bmask;
|
|
||||||
+
|
|
||||||
+ /* All blocks after the first must start on a block size boundary. */
|
|
||||||
+ if (s->npages != 0 && (off & s->bmask) != 0)
|
|
||||||
+ return (1);
|
|
||||||
+
|
|
||||||
+ s->npages++;
|
|
||||||
+ return (0);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+/*
|
|
||||||
+ * Check if we can submit the pages in this ABD to the kernel as-is. Returns
|
|
||||||
+ * B_TRUE if it can, or B_FALSE if it can't be submitted like this.
|
|
||||||
+ */
|
|
||||||
+static boolean_t
|
|
||||||
+vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
|
|
||||||
+{
|
|
||||||
+ vdev_disk_check_pages_t s = {
|
|
||||||
+ .bmask = bdev_logical_block_size(bdev)-1,
|
|
||||||
+ .npages = 0,
|
|
||||||
+ .end = 0,
|
|
||||||
+ };
|
|
||||||
+
|
|
||||||
+ if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
|
|
||||||
+ return (B_FALSE);
|
|
||||||
+
|
|
||||||
+ return (B_TRUE);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+/* Iterator callback to submit ABD pages to the vbio. */
|
|
||||||
+static int
|
|
||||||
+vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
|
|
||||||
+{
|
|
||||||
+ vbio_t *vbio = priv;
|
|
||||||
+ return (vbio_add_page(vbio, page, len, off));
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+vdev_disk_io_rw(zio_t *zio)
|
|
||||||
+{
|
|
||||||
+ vdev_t *v = zio->io_vd;
|
|
||||||
+ vdev_disk_t *vd = v->vdev_tsd;
|
|
||||||
+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
|
|
||||||
+ int flags = 0;
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * Accessing outside the block device is never allowed.
|
|
||||||
+ */
|
|
||||||
+ if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
|
|
||||||
+ vdev_dbgmsg(zio->io_vd,
|
|
||||||
+ "Illegal access %llu size %llu, device size %llu",
|
|
||||||
+ (u_longlong_t)zio->io_offset,
|
|
||||||
+ (u_longlong_t)zio->io_size,
|
|
||||||
+ (u_longlong_t)i_size_read(bdev->bd_inode));
|
|
||||||
+ return (SET_ERROR(EIO));
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
|
|
||||||
+ v->vdev_failfast == B_TRUE) {
|
|
||||||
+ bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
|
|
||||||
+ zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * Check alignment of the incoming ABD. If any part of it would require
|
|
||||||
+ * submitting a page that is not aligned to the logical block size,
|
|
||||||
+ * then we take a copy into a linear buffer and submit that instead.
|
|
||||||
+ * This should be impossible on a 512b LBS, and fairly rare on 4K,
|
|
||||||
+ * usually requiring abnormally-small data blocks (eg gang blocks)
|
|
||||||
+ * mixed into the same ABD as larger ones (eg aggregated).
|
|
||||||
+ */
|
|
||||||
+ abd_t *abd = zio->io_abd;
|
|
||||||
+ if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
|
|
||||||
+ void *buf;
|
|
||||||
+ if (zio->io_type == ZIO_TYPE_READ)
|
|
||||||
+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
|
|
||||||
+ else
|
|
||||||
+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * Wrap the copy in an abd_t, so we can use the same iterators
|
|
||||||
+ * to count and fill the vbio later.
|
|
||||||
+ */
|
|
||||||
+ abd = abd_get_from_buf(buf, zio->io_size);
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * False here would mean the borrowed copy has an invalid
|
|
||||||
+ * alignment too, which would mean we've somehow been passed a
|
|
||||||
+ * linear ABD with an interior page that has a non-zero offset
|
|
||||||
+ * or a size not a multiple of PAGE_SIZE. This is not possible.
|
|
||||||
+ * It would mean either zio_buf_alloc() or its underlying
|
|
||||||
+ * allocators have done something extremely strange, or our
|
|
||||||
+ * math in vdev_disk_check_pages() is wrong. In either case,
|
|
||||||
+ * something is seriously wrong and it's not safe to continue.
|
|
||||||
+ */
|
|
||||||
+ VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ /* Allocate vbio, with a pointer to the borrowed ABD if necessary */
|
|
||||||
+ int error = 0;
|
|
||||||
+ vbio_t *vbio = vbio_alloc(zio, bdev);
|
|
||||||
+ if (abd != zio->io_abd)
|
|
||||||
+ vbio->vbio_abd = abd;
|
|
||||||
+
|
|
||||||
+ /* Fill it with pages */
|
|
||||||
+ error = abd_iterate_page_func(abd, 0, zio->io_size,
|
|
||||||
+ vdev_disk_fill_vbio_cb, vbio);
|
|
||||||
+ if (error != 0) {
|
|
||||||
+ vbio_free(vbio);
|
|
||||||
+ return (error);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ vbio_submit(vbio, flags);
|
|
||||||
+ return (0);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
/* ========== */
|
|
||||||
|
|
||||||
/*
|
|
||||||
- * This is the classic, battle-tested BIO submission code.
|
|
||||||
+ * This is the classic, battle-tested BIO submission code. Until we're totally
|
|
||||||
+ * sure that the new code is safe and correct in all cases, this will remain
|
|
||||||
+ * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
|
|
||||||
+ * load time.
|
|
||||||
*
|
|
||||||
* These functions have been renamed to vdev_classic_* to make it clear what
|
|
||||||
* they belong to, but their implementations are unchanged.
|
|
||||||
@@ -1116,7 +1547,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
|
|
||||||
(void) tsd;
|
|
||||||
|
|
||||||
if (vdev_disk_io_rw_fn == NULL)
|
|
||||||
- vdev_disk_io_rw_fn = vdev_classic_physio;
|
|
||||||
+ /* XXX make configurable */
|
|
||||||
+ vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
|
|
||||||
|
|
||||||
return (0);
|
|
||||||
}
|
|
||||||
@@ -1215,3 +1647,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
|
|
||||||
|
|
||||||
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
|
|
||||||
"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
|
|
||||||
+
|
|
||||||
+ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
|
|
||||||
+ "Maximum number of data segments to add to an IO request (min 4)");
|
|
@ -1,104 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Date: Tue, 9 Jan 2024 13:28:57 +1100
|
|
||||||
Subject: [PATCH] vdev_disk: add module parameter to select BIO submission
|
|
||||||
method
|
|
||||||
|
|
||||||
This makes the submission method selectable at module load time via the
|
|
||||||
`zfs_vdev_disk_classic` parameter, allowing this change to be backported
|
|
||||||
to 2.2 safely, and disabled in favour of the "classic" submission method
|
|
||||||
if new problems come up.
|
|
||||||
|
|
||||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
|
||||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
||||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Sponsored-by: Klara, Inc.
|
|
||||||
Sponsored-by: Wasabi Technology, Inc.
|
|
||||||
Closes #15533
|
|
||||||
Closes #15588
|
|
||||||
(cherry picked from commit df2169d141aadc0c2cc728c5c5261d6f5c2a27f7)
|
|
||||||
---
|
|
||||||
man/man4/zfs.4 | 16 ++++++++++++++++
|
|
||||||
module/os/linux/zfs/vdev_disk.c | 31 +++++++++++++++++++++++++++++--
|
|
||||||
2 files changed, 45 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
|
|
||||||
index b5679f2f0..6a628e7f3 100644
|
|
||||||
--- a/man/man4/zfs.4
|
|
||||||
+++ b/man/man4/zfs.4
|
|
||||||
@@ -1352,6 +1352,22 @@ If this is higher than the maximum allowed by the device queue or the kernel
|
|
||||||
itself, it will be clamped.
|
|
||||||
Setting it to zero will cause the kernel's ideal size to be used.
|
|
||||||
This parameter only applies on Linux.
|
|
||||||
+This parameter is ignored if
|
|
||||||
+.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
|
|
||||||
+.
|
|
||||||
+.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
|
|
||||||
+If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
|
|
||||||
+and earlier.
|
|
||||||
+This "classic" method has known issues with highly fragmented IO requests and
|
|
||||||
+is slower on many workloads, but it has been in use for many years and is known
|
|
||||||
+to be very stable.
|
|
||||||
+If you set this parameter, please also open a bug report why you did so,
|
|
||||||
+including the workload involved and any error messages.
|
|
||||||
+.Pp
|
|
||||||
+This parameter and the classic submission method will be removed once we have
|
|
||||||
+total confidence in the new method.
|
|
||||||
+.Pp
|
|
||||||
+This parameter only applies on Linux, and can only be set at module load time.
|
|
||||||
.
|
|
||||||
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
|
|
||||||
Time before expiring
|
|
||||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
index 0ccb9ad96..a9110623a 100644
|
|
||||||
--- a/module/os/linux/zfs/vdev_disk.c
|
|
||||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
@@ -1535,6 +1535,29 @@ vdev_disk_rele(vdev_t *vd)
|
|
||||||
/* XXX: Implement me as a vnode rele for the device */
|
|
||||||
}
|
|
||||||
|
|
||||||
+/*
|
|
||||||
+ * BIO submission method. See comment above about vdev_classic.
|
|
||||||
+ * Set zfs_vdev_disk_classic=0 for new, =1 for classic
|
|
||||||
+ */
|
|
||||||
+static uint_t zfs_vdev_disk_classic = 0; /* default new */
|
|
||||||
+
|
|
||||||
+/* Set submission function from module parameter */
|
|
||||||
+static int
|
|
||||||
+vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
|
|
||||||
+{
|
|
||||||
+ int err = param_set_uint(buf, kp);
|
|
||||||
+ if (err < 0)
|
|
||||||
+ return (SET_ERROR(err));
|
|
||||||
+
|
|
||||||
+ vdev_disk_io_rw_fn =
|
|
||||||
+ zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
|
|
||||||
+
|
|
||||||
+ printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
|
|
||||||
+ zfs_vdev_disk_classic ? "classic" : "new");
|
|
||||||
+
|
|
||||||
+ return (0);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
/*
|
|
||||||
* At first vdev use, set the submission function from the default value if
|
|
||||||
* it hasn't been set already.
|
|
||||||
@@ -1547,8 +1570,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
|
|
||||||
(void) tsd;
|
|
||||||
|
|
||||||
if (vdev_disk_io_rw_fn == NULL)
|
|
||||||
- /* XXX make configurable */
|
|
||||||
- vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
|
|
||||||
+ vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
|
|
||||||
+ vdev_classic_physio : vdev_disk_io_rw;
|
|
||||||
|
|
||||||
return (0);
|
|
||||||
}
|
|
||||||
@@ -1650,3 +1673,7 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
|
|
||||||
|
|
||||||
ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
|
|
||||||
"Maximum number of data segments to add to an IO request (min 4)");
|
|
||||||
+
|
|
||||||
+ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
|
|
||||||
+ vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
|
|
||||||
+ "Use classic BIO submission method");
|
|
@ -1,363 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Date: Wed, 21 Feb 2024 11:07:21 +1100
|
|
||||||
Subject: [PATCH] vdev_disk: use bio_chain() to submit multiple BIOs
|
|
||||||
|
|
||||||
Simplifies our code a lot, so we don't have to wait for each and
|
|
||||||
reassemble them.
|
|
||||||
|
|
||||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
|
||||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
||||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Sponsored-by: Klara, Inc.
|
|
||||||
Sponsored-by: Wasabi Technology, Inc.
|
|
||||||
Closes #15533
|
|
||||||
Closes #15588
|
|
||||||
(cherry picked from commit 72fd834c47558cb10d847948d1a4615e894c77c3)
|
|
||||||
---
|
|
||||||
module/os/linux/zfs/vdev_disk.c | 231 +++++++++++---------------------
|
|
||||||
1 file changed, 80 insertions(+), 151 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
index a9110623a..36468fc21 100644
|
|
||||||
--- a/module/os/linux/zfs/vdev_disk.c
|
|
||||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
@@ -454,10 +454,9 @@ vdev_disk_close(vdev_t *v)
|
|
||||||
if (v->vdev_reopening || vd == NULL)
|
|
||||||
return;
|
|
||||||
|
|
||||||
- if (vd->vd_bdh != NULL) {
|
|
||||||
+ if (vd->vd_bdh != NULL)
|
|
||||||
vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
|
|
||||||
zfs_vdev_holder);
|
|
||||||
- }
|
|
||||||
|
|
||||||
rw_destroy(&vd->vd_lock);
|
|
||||||
kmem_free(vd, sizeof (vdev_disk_t));
|
|
||||||
@@ -663,9 +662,6 @@ typedef struct {
|
|
||||||
|
|
||||||
abd_t *vbio_abd; /* abd carrying borrowed linear buf */
|
|
||||||
|
|
||||||
- atomic_t vbio_ref; /* bio refcount */
|
|
||||||
- int vbio_error; /* error from failed bio */
|
|
||||||
-
|
|
||||||
uint_t vbio_max_segs; /* max segs per bio */
|
|
||||||
|
|
||||||
uint_t vbio_max_bytes; /* max bytes per bio */
|
|
||||||
@@ -674,43 +670,52 @@ typedef struct {
|
|
||||||
uint64_t vbio_offset; /* start offset of next bio */
|
|
||||||
|
|
||||||
struct bio *vbio_bio; /* pointer to the current bio */
|
|
||||||
- struct bio *vbio_bios; /* list of all bios */
|
|
||||||
+ int vbio_flags; /* bio flags */
|
|
||||||
} vbio_t;
|
|
||||||
|
|
||||||
static vbio_t *
|
|
||||||
-vbio_alloc(zio_t *zio, struct block_device *bdev)
|
|
||||||
+vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
|
|
||||||
{
|
|
||||||
vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
|
|
||||||
|
|
||||||
vbio->vbio_zio = zio;
|
|
||||||
vbio->vbio_bdev = bdev;
|
|
||||||
- atomic_set(&vbio->vbio_ref, 0);
|
|
||||||
+ vbio->vbio_abd = NULL;
|
|
||||||
vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
|
|
||||||
vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
|
|
||||||
vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
|
|
||||||
vbio->vbio_offset = zio->io_offset;
|
|
||||||
+ vbio->vbio_bio = NULL;
|
|
||||||
+ vbio->vbio_flags = flags;
|
|
||||||
|
|
||||||
return (vbio);
|
|
||||||
}
|
|
||||||
|
|
||||||
+BIO_END_IO_PROTO(vbio_completion, bio, error);
|
|
||||||
+
|
|
||||||
static int
|
|
||||||
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
|
|
||||||
{
|
|
||||||
- struct bio *bio;
|
|
||||||
+ struct bio *bio = vbio->vbio_bio;
|
|
||||||
uint_t ssize;
|
|
||||||
|
|
||||||
while (size > 0) {
|
|
||||||
- bio = vbio->vbio_bio;
|
|
||||||
if (bio == NULL) {
|
|
||||||
/* New BIO, allocate and set up */
|
|
||||||
bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
|
|
||||||
vbio->vbio_max_segs);
|
|
||||||
- if (unlikely(bio == NULL))
|
|
||||||
- return (SET_ERROR(ENOMEM));
|
|
||||||
+ VERIFY(bio);
|
|
||||||
+
|
|
||||||
BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
|
|
||||||
+ bio_set_op_attrs(bio,
|
|
||||||
+ vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
|
|
||||||
+ WRITE : READ, vbio->vbio_flags);
|
|
||||||
|
|
||||||
- bio->bi_next = vbio->vbio_bios;
|
|
||||||
- vbio->vbio_bios = vbio->vbio_bio = bio;
|
|
||||||
+ if (vbio->vbio_bio) {
|
|
||||||
+ bio_chain(vbio->vbio_bio, bio);
|
|
||||||
+ vdev_submit_bio(vbio->vbio_bio);
|
|
||||||
+ }
|
|
||||||
+ vbio->vbio_bio = bio;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
@@ -735,157 +740,97 @@ vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
|
|
||||||
vbio->vbio_offset += BIO_BI_SIZE(bio);
|
|
||||||
|
|
||||||
/* Signal new BIO allocation wanted */
|
|
||||||
- vbio->vbio_bio = NULL;
|
|
||||||
+ bio = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return (0);
|
|
||||||
}
|
|
||||||
|
|
||||||
-BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error);
|
|
||||||
-static void vbio_put(vbio_t *vbio);
|
|
||||||
+/* Iterator callback to submit ABD pages to the vbio. */
|
|
||||||
+static int
|
|
||||||
+vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
|
|
||||||
+{
|
|
||||||
+ vbio_t *vbio = priv;
|
|
||||||
+ return (vbio_add_page(vbio, page, len, off));
|
|
||||||
+}
|
|
||||||
|
|
||||||
+/* Create some BIOs, fill them with data and submit them */
|
|
||||||
static void
|
|
||||||
-vbio_submit(vbio_t *vbio, int flags)
|
|
||||||
+vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
|
|
||||||
{
|
|
||||||
- ASSERT(vbio->vbio_bios);
|
|
||||||
- struct bio *bio = vbio->vbio_bios;
|
|
||||||
- vbio->vbio_bio = vbio->vbio_bios = NULL;
|
|
||||||
-
|
|
||||||
- /*
|
|
||||||
- * We take a reference for each BIO as we submit it, plus one to
|
|
||||||
- * protect us from BIOs completing before we're done submitting them
|
|
||||||
- * all, causing vbio_put() to free vbio out from under us and/or the
|
|
||||||
- * zio to be returned before all its IO has completed.
|
|
||||||
- */
|
|
||||||
- atomic_set(&vbio->vbio_ref, 1);
|
|
||||||
+ ASSERT(vbio->vbio_bdev);
|
|
||||||
|
|
||||||
/*
|
|
||||||
- * If we're submitting more than one BIO, inform the block layer so
|
|
||||||
- * it can batch them if it wants.
|
|
||||||
+ * We plug so we can submit the BIOs as we go and only unplug them when
|
|
||||||
+ * they are fully created and submitted. This is important; if we don't
|
|
||||||
+ * plug, then the kernel may start executing earlier BIOs while we're
|
|
||||||
+ * still creating and executing later ones, and if the device goes
|
|
||||||
+ * away while that's happening, older kernels can get confused and
|
|
||||||
+ * trample memory.
|
|
||||||
*/
|
|
||||||
struct blk_plug plug;
|
|
||||||
- boolean_t do_plug = (bio->bi_next != NULL);
|
|
||||||
- if (do_plug)
|
|
||||||
- blk_start_plug(&plug);
|
|
||||||
+ blk_start_plug(&plug);
|
|
||||||
|
|
||||||
- /* Submit all the BIOs */
|
|
||||||
- while (bio != NULL) {
|
|
||||||
- atomic_inc(&vbio->vbio_ref);
|
|
||||||
+ (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
|
|
||||||
+ ASSERT(vbio->vbio_bio);
|
|
||||||
|
|
||||||
- struct bio *next = bio->bi_next;
|
|
||||||
- bio->bi_next = NULL;
|
|
||||||
+ vbio->vbio_bio->bi_end_io = vbio_completion;
|
|
||||||
+ vbio->vbio_bio->bi_private = vbio;
|
|
||||||
|
|
||||||
- bio->bi_end_io = vdev_disk_io_rw_completion;
|
|
||||||
- bio->bi_private = vbio;
|
|
||||||
- bio_set_op_attrs(bio,
|
|
||||||
- vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
|
|
||||||
- WRITE : READ, flags);
|
|
||||||
+ vdev_submit_bio(vbio->vbio_bio);
|
|
||||||
|
|
||||||
- vdev_submit_bio(bio);
|
|
||||||
-
|
|
||||||
- bio = next;
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
- /* Finish the batch */
|
|
||||||
- if (do_plug)
|
|
||||||
- blk_finish_plug(&plug);
|
|
||||||
+ blk_finish_plug(&plug);
|
|
||||||
|
|
||||||
- /* Release the extra reference */
|
|
||||||
- vbio_put(vbio);
|
|
||||||
+ vbio->vbio_bio = NULL;
|
|
||||||
+ vbio->vbio_bdev = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
-static void
|
|
||||||
-vbio_return_abd(vbio_t *vbio)
|
|
||||||
+/* IO completion callback */
|
|
||||||
+BIO_END_IO_PROTO(vbio_completion, bio, error)
|
|
||||||
{
|
|
||||||
+ vbio_t *vbio = bio->bi_private;
|
|
||||||
zio_t *zio = vbio->vbio_zio;
|
|
||||||
- if (vbio->vbio_abd == NULL)
|
|
||||||
- return;
|
|
||||||
-
|
|
||||||
- /*
|
|
||||||
- * If we copied the ABD before issuing it, clean up and return the copy
|
|
||||||
- * to the ADB, with changes if appropriate.
|
|
||||||
- */
|
|
||||||
- void *buf = abd_to_buf(vbio->vbio_abd);
|
|
||||||
- abd_free(vbio->vbio_abd);
|
|
||||||
- vbio->vbio_abd = NULL;
|
|
||||||
-
|
|
||||||
- if (zio->io_type == ZIO_TYPE_READ)
|
|
||||||
- abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
|
|
||||||
- else
|
|
||||||
- abd_return_buf(zio->io_abd, buf, zio->io_size);
|
|
||||||
-}
|
|
||||||
|
|
||||||
-static void
|
|
||||||
-vbio_free(vbio_t *vbio)
|
|
||||||
-{
|
|
||||||
- VERIFY0(atomic_read(&vbio->vbio_ref));
|
|
||||||
-
|
|
||||||
- vbio_return_abd(vbio);
|
|
||||||
+ ASSERT(zio);
|
|
||||||
|
|
||||||
- kmem_free(vbio, sizeof (vbio_t));
|
|
||||||
-}
|
|
||||||
+ /* Capture and log any errors */
|
|
||||||
+#ifdef HAVE_1ARG_BIO_END_IO_T
|
|
||||||
+ zio->io_error = BIO_END_IO_ERROR(bio);
|
|
||||||
+#else
|
|
||||||
+ zio->io_error = 0;
|
|
||||||
+ if (error)
|
|
||||||
+ zio->io_error = -(error);
|
|
||||||
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
|
||||||
+ zio->io_error = EIO;
|
|
||||||
+#endif
|
|
||||||
+ ASSERT3U(zio->io_error, >=, 0);
|
|
||||||
|
|
||||||
-static void
|
|
||||||
-vbio_put(vbio_t *vbio)
|
|
||||||
-{
|
|
||||||
- if (atomic_dec_return(&vbio->vbio_ref) > 0)
|
|
||||||
- return;
|
|
||||||
+ if (zio->io_error)
|
|
||||||
+ vdev_disk_error(zio);
|
|
||||||
|
|
||||||
- /*
|
|
||||||
- * This was the last reference, so the entire IO is completed. Clean
|
|
||||||
- * up and submit it for processing.
|
|
||||||
- */
|
|
||||||
+ /* Return the BIO to the kernel */
|
|
||||||
+ bio_put(bio);
|
|
||||||
|
|
||||||
/*
|
|
||||||
- * Get any data buf back to the original ABD, if necessary. We do this
|
|
||||||
- * now so we can get the ZIO into the pipeline as quickly as possible,
|
|
||||||
- * and then do the remaining cleanup after.
|
|
||||||
+ * If we copied the ABD before issuing it, clean up and return the copy
|
|
||||||
+ * to the ABD, with changes if appropriate.
|
|
||||||
*/
|
|
||||||
- vbio_return_abd(vbio);
|
|
||||||
+ if (vbio->vbio_abd != NULL) {
|
|
||||||
+ void *buf = abd_to_buf(vbio->vbio_abd);
|
|
||||||
+ abd_free(vbio->vbio_abd);
|
|
||||||
+ vbio->vbio_abd = NULL;
|
|
||||||
|
|
||||||
- zio_t *zio = vbio->vbio_zio;
|
|
||||||
+ if (zio->io_type == ZIO_TYPE_READ)
|
|
||||||
+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
|
|
||||||
+ else
|
|
||||||
+ abd_return_buf(zio->io_abd, buf, zio->io_size);
|
|
||||||
+ }
|
|
||||||
|
|
||||||
- /*
|
|
||||||
- * Set the overall error. If multiple BIOs returned an error, only the
|
|
||||||
- * first will be taken; the others are dropped (see
|
|
||||||
- * vdev_disk_io_rw_completion()). Its pretty much impossible for
|
|
||||||
- * multiple IOs to the same device to fail with different errors, so
|
|
||||||
- * there's no real risk.
|
|
||||||
- */
|
|
||||||
- zio->io_error = vbio->vbio_error;
|
|
||||||
- if (zio->io_error)
|
|
||||||
- vdev_disk_error(zio);
|
|
||||||
+ /* Final cleanup */
|
|
||||||
+ kmem_free(vbio, sizeof (vbio_t));
|
|
||||||
|
|
||||||
/* All done, submit for processing */
|
|
||||||
zio_delay_interrupt(zio);
|
|
||||||
-
|
|
||||||
- /* Finish cleanup */
|
|
||||||
- vbio_free(vbio);
|
|
||||||
-}
|
|
||||||
-
|
|
||||||
-BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
|
|
||||||
-{
|
|
||||||
- vbio_t *vbio = bio->bi_private;
|
|
||||||
-
|
|
||||||
- if (vbio->vbio_error == 0) {
|
|
||||||
-#ifdef HAVE_1ARG_BIO_END_IO_T
|
|
||||||
- vbio->vbio_error = BIO_END_IO_ERROR(bio);
|
|
||||||
-#else
|
|
||||||
- if (error)
|
|
||||||
- vbio->vbio_error = -(error);
|
|
||||||
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
|
||||||
- vbio->vbio_error = EIO;
|
|
||||||
-#endif
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
- /*
|
|
||||||
- * Destroy the BIO. This is safe to do; the vbio owns its data and the
|
|
||||||
- * kernel won't touch it again after the completion function runs.
|
|
||||||
- */
|
|
||||||
- bio_put(bio);
|
|
||||||
-
|
|
||||||
- /* Drop this BIOs reference acquired by vbio_submit() */
|
|
||||||
- vbio_put(vbio);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
@@ -948,14 +893,6 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
|
|
||||||
return (B_TRUE);
|
|
||||||
}
|
|
||||||
|
|
||||||
-/* Iterator callback to submit ABD pages to the vbio. */
|
|
||||||
-static int
|
|
||||||
-vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
|
|
||||||
-{
|
|
||||||
- vbio_t *vbio = priv;
|
|
||||||
- return (vbio_add_page(vbio, page, len, off));
|
|
||||||
-}
|
|
||||||
-
|
|
||||||
static int
|
|
||||||
vdev_disk_io_rw(zio_t *zio)
|
|
||||||
{
|
|
||||||
@@ -1018,20 +955,12 @@ vdev_disk_io_rw(zio_t *zio)
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
|
|
||||||
- int error = 0;
|
|
||||||
- vbio_t *vbio = vbio_alloc(zio, bdev);
|
|
||||||
+ vbio_t *vbio = vbio_alloc(zio, bdev, flags);
|
|
||||||
if (abd != zio->io_abd)
|
|
||||||
vbio->vbio_abd = abd;
|
|
||||||
|
|
||||||
- /* Fill it with pages */
|
|
||||||
- error = abd_iterate_page_func(abd, 0, zio->io_size,
|
|
||||||
- vdev_disk_fill_vbio_cb, vbio);
|
|
||||||
- if (error != 0) {
|
|
||||||
- vbio_free(vbio);
|
|
||||||
- return (error);
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
- vbio_submit(vbio, flags);
|
|
||||||
+ /* Fill it with data pages and submit it to the kernel */
|
|
||||||
+ vbio_submit(vbio, abd, zio->io_size);
|
|
||||||
return (0);
|
|
||||||
}
|
|
||||||
|
|
@ -1,96 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Date: Thu, 14 Mar 2024 10:57:30 +1100
|
|
||||||
Subject: [PATCH] abd_iter_page: don't use compound heads on Linux <4.5
|
|
||||||
|
|
||||||
Before 4.5 (specifically, torvalds/linux@ddc58f2), head and tail pages
|
|
||||||
in a compound page were refcounted separately. This means that using the
|
|
||||||
head page without taking a reference to it could see it cleaned up later
|
|
||||||
before we're finished with it. Specifically, bio_add_page() would take a
|
|
||||||
reference, and drop its reference after the bio completion callback
|
|
||||||
returns.
|
|
||||||
|
|
||||||
If the zio is executed immediately from the completion callback, this is
|
|
||||||
usually ok, as any data is referenced through the tail page referenced
|
|
||||||
by the ABD, and so becomes "live" that way. If there's a delay in zio
|
|
||||||
execution (high load, error injection), then the head page can be freed,
|
|
||||||
along with any dirty flags or other indicators that the underlying
|
|
||||||
memory is used. Later, when the zio completes and that memory is
|
|
||||||
accessed, it's either unmapped and an unhandled fault takes down the
|
|
||||||
entire system, or it is mapped and we end up messing around in someone
|
|
||||||
else's memory. Both of these are very bad.
|
|
||||||
|
|
||||||
The solution on these older kernels is to take a reference to the head
|
|
||||||
page when we use it, and release it when we're done. There's not really
|
|
||||||
a sensible way under our current structure to do this; the "best" would
|
|
||||||
be to keep a list of head page references in the ABD, and release them
|
|
||||||
when the ABD is freed.
|
|
||||||
|
|
||||||
Since this additional overhead is totally unnecessary on 4.5+, where
|
|
||||||
head and tail pages share refcounts, I've opted to simply not use the
|
|
||||||
compound head in ABD page iteration there. This is theoretically less
|
|
||||||
efficient (though cleaning up head page references would add overhead),
|
|
||||||
but it's safe, and we still get the other benefits of not mapping pages
|
|
||||||
before adding them to a bio and not mis-splitting pages.
|
|
||||||
|
|
||||||
There doesn't appear to be an obvious symbol name or config option we
|
|
||||||
can match on to discover this behaviour in configure (and the mm/page
|
|
||||||
APIs have changed a lot since then anyway), so I've gone with a simple
|
|
||||||
version check.
|
|
||||||
|
|
||||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
|
||||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
||||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Sponsored-by: Klara, Inc.
|
|
||||||
Sponsored-by: Wasabi Technology, Inc.
|
|
||||||
Closes #15533
|
|
||||||
Closes #15588
|
|
||||||
(cherry picked from commit c6be6ce1755a3d9a3cbe70256cd8958ef83d8542)
|
|
||||||
---
|
|
||||||
module/os/linux/zfs/abd_os.c | 14 ++++++++++++++
|
|
||||||
1 file changed, 14 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
|
|
||||||
index 3fe01c0b7..d3255dcbc 100644
|
|
||||||
--- a/module/os/linux/zfs/abd_os.c
|
|
||||||
+++ b/module/os/linux/zfs/abd_os.c
|
|
||||||
@@ -62,6 +62,7 @@
|
|
||||||
#include <linux/kmap_compat.h>
|
|
||||||
#include <linux/mm_compat.h>
|
|
||||||
#include <linux/scatterlist.h>
|
|
||||||
+#include <linux/version.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef _KERNEL
|
|
||||||
@@ -1061,6 +1062,7 @@ abd_iter_page(struct abd_iter *aiter)
|
|
||||||
}
|
|
||||||
ASSERT(page);
|
|
||||||
|
|
||||||
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
|
|
||||||
if (PageTail(page)) {
|
|
||||||
/*
|
|
||||||
* This page is part of a "compound page", which is a group of
|
|
||||||
@@ -1082,11 +1084,23 @@ abd_iter_page(struct abd_iter *aiter)
|
|
||||||
* To do this, we need to adjust the offset to be counted from
|
|
||||||
* the head page. struct page for compound pages are stored
|
|
||||||
* contiguously, so we can just adjust by a simple offset.
|
|
||||||
+ *
|
|
||||||
+ * Before kernel 4.5, compound page heads were refcounted
|
|
||||||
+ * separately, such that moving back to the head page would
|
|
||||||
+ * require us to take a reference to it and releasing it once
|
|
||||||
+ * we're completely finished with it. In practice, that means
|
|
||||||
+ * when our caller is done with the ABD, which we have no
|
|
||||||
+ * insight into from here. Rather than contort this API to
|
|
||||||
+ * track head page references on such ancient kernels, we just
|
|
||||||
+ * compile this block out and use the tail pages directly. This
|
|
||||||
+ * is slightly less efficient, but makes everything far
|
|
||||||
+ * simpler.
|
|
||||||
*/
|
|
||||||
struct page *head = compound_head(page);
|
|
||||||
doff += ((page - head) * PAGESIZE);
|
|
||||||
page = head;
|
|
||||||
}
|
|
||||||
+#endif
|
|
||||||
|
|
||||||
/* final page and position within it */
|
|
||||||
aiter->iter_page = page;
|
|
@ -1,90 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Date: Wed, 27 Mar 2024 13:11:12 +1100
|
|
||||||
Subject: [PATCH] vdev_disk: default to classic submission for 2.2.x
|
|
||||||
|
|
||||||
We don't want to change to brand-new code in the middle of a stable
|
|
||||||
series, but we want it available to test for people running into page
|
|
||||||
splitting issues.
|
|
||||||
|
|
||||||
This commit makes zfs_vdev_disk_classic=1 the default, and updates the
|
|
||||||
documentation to better explain what's going on.
|
|
||||||
|
|
||||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Sponsored-by: Klara, Inc.
|
|
||||||
Sponsored-by: Wasabi Technology, Inc.
|
|
||||||
---
|
|
||||||
man/man4/zfs.4 | 31 ++++++++++++++++++++++---------
|
|
||||||
module/os/linux/zfs/vdev_disk.c | 8 +++++---
|
|
||||||
2 files changed, 27 insertions(+), 12 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
|
|
||||||
index 6a628e7f3..a98ec519a 100644
|
|
||||||
--- a/man/man4/zfs.4
|
|
||||||
+++ b/man/man4/zfs.4
|
|
||||||
@@ -1355,17 +1355,30 @@ This parameter only applies on Linux.
|
|
||||||
This parameter is ignored if
|
|
||||||
.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
|
|
||||||
.
|
|
||||||
-.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
|
|
||||||
-If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
|
|
||||||
-and earlier.
|
|
||||||
-This "classic" method has known issues with highly fragmented IO requests and
|
|
||||||
-is slower on many workloads, but it has been in use for many years and is known
|
|
||||||
-to be very stable.
|
|
||||||
-If you set this parameter, please also open a bug report why you did so,
|
|
||||||
+.It Sy zfs_vdev_disk_classic Ns = Ns 0 Ns | Ns Sy 1 Pq uint
|
|
||||||
+Controls the method used to submit IO to the Linux block layer
|
|
||||||
+(default
|
|
||||||
+.Sy 1 "classic" Ns
|
|
||||||
+)
|
|
||||||
+.Pp
|
|
||||||
+If set to 1, the "classic" method is used.
|
|
||||||
+This is the method that has been in use since the earliest versions of
|
|
||||||
+ZFS-on-Linux.
|
|
||||||
+It has known issues with highly fragmented IO requests and is less efficient on
|
|
||||||
+many workloads, but it is well known and well understood.
|
|
||||||
+.Pp
|
|
||||||
+If set to 0, the "new" method is used.
|
|
||||||
+This method is available since 2.2.4 and should resolve all known issues and be
|
|
||||||
+far more efficient, but has not had as much testing.
|
|
||||||
+In the 2.2.x series, this parameter defaults to 1, to use the "classic" method.
|
|
||||||
+.Pp
|
|
||||||
+It is not recommended that you change it except on advice from the OpenZFS
|
|
||||||
+developers.
|
|
||||||
+If you do change it, please also open a bug report describing why you did so,
|
|
||||||
including the workload involved and any error messages.
|
|
||||||
.Pp
|
|
||||||
-This parameter and the classic submission method will be removed once we have
|
|
||||||
-total confidence in the new method.
|
|
||||||
+This parameter and the "classic" submission method will be removed in a future
|
|
||||||
+release of OpenZFS once we have total confidence in the new method.
|
|
||||||
.Pp
|
|
||||||
This parameter only applies on Linux, and can only be set at module load time.
|
|
||||||
.
|
|
||||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
index 36468fc21..e1c19a085 100644
|
|
||||||
--- a/module/os/linux/zfs/vdev_disk.c
|
|
||||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
@@ -969,8 +969,10 @@ vdev_disk_io_rw(zio_t *zio)
|
|
||||||
/*
|
|
||||||
* This is the classic, battle-tested BIO submission code. Until we're totally
|
|
||||||
* sure that the new code is safe and correct in all cases, this will remain
|
|
||||||
- * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
|
|
||||||
- * load time.
|
|
||||||
+ * available.
|
|
||||||
+ *
|
|
||||||
+ * It is enabled by setting zfs_vdev_disk_classic=1 at module load time. It is
|
|
||||||
+ * enabled (=1) by default since 2.2.4, and disabled by default (=0) on master.
|
|
||||||
*
|
|
||||||
* These functions have been renamed to vdev_classic_* to make it clear what
|
|
||||||
* they belong to, but their implementations are unchanged.
|
|
||||||
@@ -1468,7 +1470,7 @@ vdev_disk_rele(vdev_t *vd)
|
|
||||||
* BIO submission method. See comment above about vdev_classic.
|
|
||||||
* Set zfs_vdev_disk_classic=0 for new, =1 for classic
|
|
||||||
*/
|
|
||||||
-static uint_t zfs_vdev_disk_classic = 0; /* default new */
|
|
||||||
+static uint_t zfs_vdev_disk_classic = 1; /* default classic */
|
|
||||||
|
|
||||||
/* Set submission function from module parameter */
|
|
||||||
static int
|
|
@ -1,104 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Robert Evans <rrevans@gmail.com>
|
|
||||||
Date: Mon, 25 Mar 2024 17:56:49 -0400
|
|
||||||
Subject: [PATCH] Fix corruption caused by mmap flushing problems
|
|
||||||
|
|
||||||
1) Make mmap flushes synchronous. Linux may skip flushing dirty pages
|
|
||||||
already in writeback unless data-integrity sync is requested.
|
|
||||||
|
|
||||||
2) Change zfs_putpage to use TXG_WAIT. Otherwise dirty pages may be
|
|
||||||
skipped due to DMU pushing back on TX assign.
|
|
||||||
|
|
||||||
3) Add missing mmap flush when doing block cloning.
|
|
||||||
|
|
||||||
4) While here, pass errors from putpage to writepage/writepages.
|
|
||||||
|
|
||||||
This change fixes corruption edge cases, but unfortunately adds
|
|
||||||
synchronous ZIL flushes for dirty mmap pages to llseek and bclone
|
|
||||||
operations. It may be possible to avoid these sync writes later
|
|
||||||
but would need more tricky refactoring of the writeback code.
|
|
||||||
|
|
||||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
|
||||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
||||||
Signed-off-by: Robert Evans <evansr@google.com>
|
|
||||||
Closes #15933
|
|
||||||
Closes #16019
|
|
||||||
---
|
|
||||||
module/os/linux/zfs/zfs_vnops_os.c | 5 +----
|
|
||||||
module/os/linux/zfs/zpl_file.c | 8 ++++----
|
|
||||||
module/zfs/zfs_vnops.c | 6 +++++-
|
|
||||||
3 files changed, 10 insertions(+), 9 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
|
|
||||||
index c06a75662..7c473bc7e 100644
|
|
||||||
--- a/module/os/linux/zfs/zfs_vnops_os.c
|
|
||||||
+++ b/module/os/linux/zfs/zfs_vnops_os.c
|
|
||||||
@@ -3792,11 +3792,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
|
|
||||||
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
|
||||||
zfs_sa_upgrade_txholds(tx, zp);
|
|
||||||
|
|
||||||
- err = dmu_tx_assign(tx, TXG_NOWAIT);
|
|
||||||
+ err = dmu_tx_assign(tx, TXG_WAIT);
|
|
||||||
if (err != 0) {
|
|
||||||
- if (err == ERESTART)
|
|
||||||
- dmu_tx_wait(tx);
|
|
||||||
-
|
|
||||||
dmu_tx_abort(tx);
|
|
||||||
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
|
|
||||||
filemap_dirty_folio(page_mapping(pp), page_folio(pp));
|
|
||||||
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
|
|
||||||
index 3caa0fc6c..9dec52215 100644
|
|
||||||
--- a/module/os/linux/zfs/zpl_file.c
|
|
||||||
+++ b/module/os/linux/zfs/zpl_file.c
|
|
||||||
@@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
|
|
||||||
{
|
|
||||||
boolean_t *for_sync = data;
|
|
||||||
fstrans_cookie_t cookie;
|
|
||||||
+ int ret;
|
|
||||||
|
|
||||||
ASSERT(PageLocked(pp));
|
|
||||||
ASSERT(!PageWriteback(pp));
|
|
||||||
|
|
||||||
cookie = spl_fstrans_mark();
|
|
||||||
- (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
|
|
||||||
+ ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
|
|
||||||
spl_fstrans_unmark(cookie);
|
|
||||||
|
|
||||||
- return (0);
|
|
||||||
+ return (ret);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef HAVE_WRITEPAGE_T_FOLIO
|
|
||||||
static int
|
|
||||||
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
|
|
||||||
{
|
|
||||||
- (void) zpl_putpage(&pp->page, wbc, data);
|
|
||||||
- return (0);
|
|
||||||
+ return (zpl_putpage(&pp->page, wbc, data));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
|
|
||||||
index 2b37834d5..7020f88ec 100644
|
|
||||||
--- a/module/zfs/zfs_vnops.c
|
|
||||||
+++ b/module/zfs/zfs_vnops.c
|
|
||||||
@@ -130,7 +130,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
|
|
||||||
|
|
||||||
/* Flush any mmap()'d data to disk */
|
|
||||||
if (zn_has_cached_data(zp, 0, file_sz - 1))
|
|
||||||
- zn_flush_cached_data(zp, B_FALSE);
|
|
||||||
+ zn_flush_cached_data(zp, B_TRUE);
|
|
||||||
|
|
||||||
lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
|
|
||||||
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
|
|
||||||
@@ -1193,6 +1193,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
+ /* Flush any mmap()'d data to disk */
|
|
||||||
+ if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
|
|
||||||
+ zn_flush_cached_data(inzp, B_TRUE);
|
|
||||||
+
|
|
||||||
/*
|
|
||||||
* Maintain predictable lock order.
|
|
||||||
*/
|
|
@ -1,57 +0,0 @@
|
|||||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Date: Tue, 2 Apr 2024 15:14:54 +1100
|
|
||||||
Subject: [PATCH] vdev_disk: don't touch vbio after its handed off to the
|
|
||||||
kernel
|
|
||||||
|
|
||||||
After IO is unplugged, it may complete immediately and vbio_completion
|
|
||||||
be called on interrupt context. That may interrupt or deschedule our
|
|
||||||
task. If it's the last bio, the vbio will be freed. Then, we get
|
|
||||||
rescheduled, and try to write to freed memory through vbio->.
|
|
||||||
|
|
||||||
This patch just removes the cleanup, and the corresponding assert.
|
|
||||||
These were leftovers from a previous iteration of vbio_submit() and were
|
|
||||||
always "belt and suspenders" ops anyway, never strictly required.
|
|
||||||
|
|
||||||
Reported-by: Rich Ercolani <rincebrain@gmail.com>
|
|
||||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
|
||||||
Sponsored-by: Klara, Inc.
|
|
||||||
Sponsored-by: Wasabi Technology, Inc.
|
|
||||||
(cherry picked from commit 34f662ad22206af6852020fd923ceccd836a855f)
|
|
||||||
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
|
|
||||||
---
|
|
||||||
module/os/linux/zfs/vdev_disk.c | 11 ++++++-----
|
|
||||||
1 file changed, 6 insertions(+), 5 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
index e1c19a085..62c7aa14f 100644
|
|
||||||
--- a/module/os/linux/zfs/vdev_disk.c
|
|
||||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
|
||||||
@@ -758,8 +758,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
|
|
||||||
static void
|
|
||||||
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
|
|
||||||
{
|
|
||||||
- ASSERT(vbio->vbio_bdev);
|
|
||||||
-
|
|
||||||
/*
|
|
||||||
* We plug so we can submit the BIOs as we go and only unplug them when
|
|
||||||
* they are fully created and submitted. This is important; if we don't
|
|
||||||
@@ -777,12 +775,15 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
|
|
||||||
vbio->vbio_bio->bi_end_io = vbio_completion;
|
|
||||||
vbio->vbio_bio->bi_private = vbio;
|
|
||||||
|
|
||||||
+ /*
|
|
||||||
+ * Once submitted, vbio_bio now owns vbio (through bi_private) and we
|
|
||||||
+ * can't touch it again. The bio may complete and vbio_completion() be
|
|
||||||
+ * called and free the vbio before this task is run again, so we must
|
|
||||||
+ * consider it invalid from this point.
|
|
||||||
+ */
|
|
||||||
vdev_submit_bio(vbio->vbio_bio);
|
|
||||||
|
|
||||||
blk_finish_plug(&plug);
|
|
||||||
-
|
|
||||||
- vbio->vbio_bio = NULL;
|
|
||||||
- vbio->vbio_bdev = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* IO completion callback */
|
|
14
debian/patches/series
vendored
14
debian/patches/series
vendored
@ -9,17 +9,3 @@
|
|||||||
0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
|
0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
|
||||||
0010-Fix-nfs_truncate_shares-without-etc-exports.d.patch
|
0010-Fix-nfs_truncate_shares-without-etc-exports.d.patch
|
||||||
0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch
|
0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch
|
||||||
0012-udev-correctly-handle-partition-16-and-later.patch
|
|
||||||
0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch
|
|
||||||
0014-linux-5.4-compat-page_size.patch
|
|
||||||
0015-abd-add-page-iterator.patch
|
|
||||||
0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch
|
|
||||||
0017-vdev_disk-reorganise-vdev_disk_io_start.patch
|
|
||||||
0018-vdev_disk-make-read-write-IO-function-configurable.patch
|
|
||||||
0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch
|
|
||||||
0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch
|
|
||||||
0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch
|
|
||||||
0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch
|
|
||||||
0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch
|
|
||||||
0024-Fix-corruption-caused-by-mmap-flushing-problems.patch
|
|
||||||
0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch
|
|
||||||
|
2
upstream
2
upstream
@ -1 +1 @@
|
|||||||
Subproject commit c883088df83ced3a2b8b38e6d89a5e63fb153ee4
|
Subproject commit 2566592045780e7be7afc899c2496b1ae3af4f4d
|
Loading…
Reference in New Issue
Block a user