From 76119aa32b20c51b9f67dbf0cc40d633d222a82f Mon Sep 17 00:00:00 2001 From: Stoiko Ivanov Date: Tue, 7 May 2024 17:02:09 +0200 Subject: [PATCH] update zfs submodule to 2.2.4 and refresh patches mostly - drop all patches we had queued up to get kernel 6.8 supported. Signed-off-by: Stoiko Ivanov Tested-by: Max Carrara Signed-off-by: Thomas Lamprecht --- ...md-unit-for-importing-specific-pools.patch | 4 +- ...-move-manpage-arcstat-1-to-arcstat-8.patch | 2 +- ...-guard-access-to-l2arc-MFU-MRU-stats.patch | 12 +- ...hten-bounds-for-noalloc-stat-availab.patch | 4 +- ...rectly-handle-partition-16-and-later.patch | 52 -- ...-use-splice_copy_file_range-for-fall.patch | 135 ---- .../0014-linux-5.4-compat-page_size.patch | 121 ---- .../patches/0015-abd-add-page-iterator.patch | 334 --------- ...-existing-functions-to-vdev_classic_.patch | 349 --------- ...v_disk-reorganise-vdev_disk_io_start.patch | 111 --- ...-read-write-IO-function-configurable.patch | 69 -- ...e-BIO-filling-machinery-to-avoid-spl.patch | 671 ------------------ ...dule-parameter-to-select-BIO-submiss.patch | 104 --- ...se-bio_chain-to-submit-multiple-BIOs.patch | 363 ---------- ...on-t-use-compound-heads-on-Linux-4.5.patch | 96 --- ...ault-to-classic-submission-for-2.2.x.patch | 90 --- ...ion-caused-by-mmap-flushing-problems.patch | 104 --- ...touch-vbio-after-its-handed-off-to-t.patch | 57 -- debian/patches/series | 14 - upstream | 2 +- 20 files changed, 12 insertions(+), 2682 deletions(-) delete mode 100644 debian/patches/0012-udev-correctly-handle-partition-16-and-later.patch delete mode 100644 debian/patches/0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch delete mode 100644 debian/patches/0014-linux-5.4-compat-page_size.patch delete mode 100644 debian/patches/0015-abd-add-page-iterator.patch delete mode 100644 debian/patches/0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch delete mode 100644 debian/patches/0017-vdev_disk-reorganise-vdev_disk_io_start.patch delete mode 100644 debian/patches/0018-vdev_disk-make-read-write-IO-function-configurable.patch delete mode 100644 debian/patches/0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch delete mode 100644 debian/patches/0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch delete mode 100644 debian/patches/0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch delete mode 100644 debian/patches/0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch delete mode 100644 debian/patches/0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch delete mode 100644 debian/patches/0024-Fix-corruption-caused-by-mmap-flushing-problems.patch delete mode 100644 debian/patches/0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch diff --git a/debian/patches/0007-Add-systemd-unit-for-importing-specific-pools.patch b/debian/patches/0007-Add-systemd-unit-for-importing-specific-pools.patch index 8232978..0600296 100644 --- a/debian/patches/0007-Add-systemd-unit-for-importing-specific-pools.patch +++ b/debian/patches/0007-Add-systemd-unit-for-importing-specific-pools.patch @@ -18,7 +18,7 @@ Signed-off-by: Thomas Lamprecht --- etc/Makefile.am | 1 + etc/systemd/system/50-zfs.preset | 1 + - etc/systemd/system/zfs-import@.service.in | 18 ++++++++++++++++ + etc/systemd/system/zfs-import@.service.in | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+) create mode 100644 etc/systemd/system/zfs-import@.service.in @@ -48,7 +48,7 @@ index e4056a92c..030611419 100644 enable zfs-share.service diff --git a/etc/systemd/system/zfs-import@.service.in b/etc/systemd/system/zfs-import@.service.in new file mode 100644 -index 000000000..9b4ee9371 +index 000000000..5bd19fb79 --- /dev/null +++ b/etc/systemd/system/zfs-import@.service.in @@ -0,0 +1,18 @@ diff --git a/debian/patches/0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch b/debian/patches/0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch index c11c1ae..9a4aea5 100644 --- a/debian/patches/0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch +++ b/debian/patches/0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch @@ -15,7 +15,7 @@ Signed-off-by: Thomas Lamprecht rename man/{man1/arcstat.1 => man8/arcstat.8} (99%) diff --git a/man/Makefile.am b/man/Makefile.am -index 45156571e..3713e9371 100644 +index 43bb014dd..a9293468a 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -2,7 +2,6 @@ dist_noinst_man_MANS = \ diff --git a/debian/patches/0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch b/debian/patches/0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch index f8cb353..2e7c207 100644 --- a/debian/patches/0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch +++ b/debian/patches/0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch @@ -27,7 +27,7 @@ Signed-off-by: Thomas Lamprecht 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/cmd/arc_summary b/cmd/arc_summary -index 9c69ec4f8..edf94ea2a 100755 +index 100fb1987..86b2260a1 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -655,13 +655,13 @@ def section_arc(kstats_dict): @@ -48,7 +48,7 @@ index 9c69ec4f8..edf94ea2a 100755 prt_i1('L2 ineligible evictions:', f_bytes(arc_stats['evict_l2_ineligible'])) print() -@@ -851,20 +851,20 @@ def section_l2arc(kstats_dict): +@@ -860,20 +860,20 @@ def section_l2arc(kstats_dict): f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']), f_bytes(arc_stats['l2_hdr_size'])) prt_i2('MFU allocated size:', @@ -80,10 +80,10 @@ index 9c69ec4f8..edf94ea2a 100755 print() prt_1('L2ARC breakdown:', f_hits(l2_access_total)) diff --git a/cmd/arcstat.in b/cmd/arcstat.in -index 8df1c62f7..833348d0e 100755 +index c4f10a1d6..c570dca88 100755 --- a/cmd/arcstat.in +++ b/cmd/arcstat.in -@@ -565,8 +565,8 @@ def calculate(): +@@ -597,8 +597,8 @@ def calculate(): v["el2skip"] = d["evict_l2_skip"] // sint v["el2cach"] = d["evict_l2_cached"] // sint v["el2el"] = d["evict_l2_eligible"] // sint @@ -93,8 +93,8 @@ index 8df1c62f7..833348d0e 100755 + v["el2mru"] = d.get("evict_l2_eligible_mru", 0) // sint v["el2inel"] = d["evict_l2_ineligible"] // sint v["mtxmis"] = d["mutex_miss"] // sint - -@@ -581,11 +581,11 @@ def calculate(): + v["ztotal"] = (d["zfetch_hits"] + d["zfetch_future"] + d["zfetch_stride"] + +@@ -624,11 +624,11 @@ def calculate(): v["l2size"] = cur["l2_size"] v["l2bytes"] = d["l2_read_bytes"] // sint diff --git a/debian/patches/0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch b/debian/patches/0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch index 3c87b0c..29c7f9a 100644 --- a/debian/patches/0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch +++ b/debian/patches/0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch @@ -51,10 +51,10 @@ Signed-off-by: Thomas Lamprecht 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c -index 69bf9649a..fd42ce7c1 100644 +index ed0b8d7a1..f3acc49d0 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c -@@ -2616,7 +2616,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, +@@ -2663,7 +2663,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, if (vs->vs_scan_removing != 0) { (void) printf(gettext(" (removing)")); diff --git a/debian/patches/0012-udev-correctly-handle-partition-16-and-later.patch b/debian/patches/0012-udev-correctly-handle-partition-16-and-later.patch deleted file mode 100644 index 578b74b..0000000 --- a/debian/patches/0012-udev-correctly-handle-partition-16-and-later.patch +++ /dev/null @@ -1,52 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= -Date: Wed, 6 Mar 2024 10:39:06 +0100 -Subject: [PATCH] udev: correctly handle partition #16 and later -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -If a zvol has more than 15 partitions, the minor device number exhausts -the slot count reserved for partitions next to the zvol itself. As a -result, the minor number cannot be used to determine the partition -number for the higher partition, and doing so results in wrong named -symlinks being generated by udev. - -Since the partition number is encoded in the block device name anyway, -let's just extract it from there instead. - -Fixes: #15904 - -Signed-off-by: Fabian Grünbichler -Signed-off-by: Stoiko Ivanov ---- - udev/zvol_id.c | 9 +++++---- - 1 file changed, 5 insertions(+), 4 deletions(-) - -diff --git a/udev/zvol_id.c b/udev/zvol_id.c -index 5960b9787..609349594 100644 ---- a/udev/zvol_id.c -+++ b/udev/zvol_id.c -@@ -51,7 +51,7 @@ const char *__asan_default_options(void) { - int - main(int argc, const char *const *argv) - { -- if (argc != 2) { -+ if (argc != 2 || strncmp(argv[1], "/dev/zd", 7) != 0) { - fprintf(stderr, "usage: %s /dev/zdX\n", argv[0]); - return (1); - } -@@ -72,9 +72,10 @@ main(int argc, const char *const *argv) - return (1); - } - -- unsigned int dev_part = minor(sb.st_rdev) % ZVOL_MINORS; -- if (dev_part != 0) -- sprintf(zvol_name + strlen(zvol_name), "-part%u", dev_part); -+ const char *dev_part = strrchr(dev_name, 'p'); -+ if (dev_part != NULL) { -+ sprintf(zvol_name + strlen(zvol_name), "-part%s", dev_part + 1); -+ } - - for (size_t i = 0; i < strlen(zvol_name); ++i) - if (isblank(zvol_name[i])) diff --git a/debian/patches/0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch b/debian/patches/0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch deleted file mode 100644 index 380d77c..0000000 --- a/debian/patches/0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch +++ /dev/null @@ -1,135 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rob N -Date: Thu, 21 Mar 2024 10:46:15 +1100 -Subject: [PATCH] Linux 6.8 compat: use splice_copy_file_range() for fallback - -Linux 6.8 removes generic_copy_file_range(), which had been reduced to a -simple wrapper around splice_copy_file_range(). Detect that function -directly and use it if generic_ is not available. - -Sponsored-by: https://despairlabs.com/sponsor/ -Reviewed-by: Tony Hutter -Reviewed by: Brian Behlendorf -Signed-off-by: Rob Norris -Closes #15930 -Closes #15931 -(cherry picked from commit ef08a4d4065d21414d7fedccac20da6bfda4dfd0) ---- - config/kernel-vfs-file_range.m4 | 27 +++++++++++++++++++++++++++ - config/kernel.m4 | 2 ++ - module/os/linux/zfs/zpl_file_range.c | 16 ++++++++++++++-- - 3 files changed, 43 insertions(+), 2 deletions(-) - -diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4 -index cc96404d8..8a5cbe2ee 100644 ---- a/config/kernel-vfs-file_range.m4 -+++ b/config/kernel-vfs-file_range.m4 -@@ -16,6 +16,9 @@ dnl # - dnl # 5.3: VFS copy_file_range() expected to do its own fallback, - dnl # generic_copy_file_range() added to support it - dnl # -+dnl # 6.8: generic_copy_file_range() removed, replaced by -+dnl # splice_copy_file_range() -+dnl # - AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [ - ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [ - #include -@@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [ - ]) - ]) - -+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [ -+ ZFS_LINUX_TEST_SRC([splice_copy_file_range], [ -+ #include -+ ], [ -+ struct file *src_file __attribute__ ((unused)) = NULL; -+ loff_t src_off __attribute__ ((unused)) = 0; -+ struct file *dst_file __attribute__ ((unused)) = NULL; -+ loff_t dst_off __attribute__ ((unused)) = 0; -+ size_t len __attribute__ ((unused)) = 0; -+ splice_copy_file_range(src_file, src_off, dst_file, dst_off, -+ len); -+ ]) -+]) -+AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [ -+ AC_MSG_CHECKING([whether splice_copy_file_range() is available]) -+ ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1, -+ [splice_copy_file_range() is available]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ - AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [ - ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [ - #include -diff --git a/config/kernel.m4 b/config/kernel.m4 -index e3f864577..1d0c5a27f 100644 ---- a/config/kernel.m4 -+++ b/config/kernel.m4 -@@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ - ZFS_AC_KERNEL_SRC_VFS_IOV_ITER - ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE - ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE -+ ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE - ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE - ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE - ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE -@@ -266,6 +267,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ - ZFS_AC_KERNEL_VFS_IOV_ITER - ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE - ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE -+ ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE - ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE - ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE - ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE -diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c -index 3065d54fa..64728fdb1 100644 ---- a/module/os/linux/zfs/zpl_file_range.c -+++ b/module/os/linux/zfs/zpl_file_range.c -@@ -26,6 +26,9 @@ - #include - #endif - #include -+#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE -+#include -+#endif - #include - #include - #include -@@ -102,7 +105,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, - ret = zpl_clone_file_range_impl(src_file, src_off, - dst_file, dst_off, len); - --#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE -+#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE) - /* - * Since Linux 5.3 the filesystem driver is responsible for executing - * an appropriate fallback, and a generic fallback function is provided. -@@ -111,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, - ret == -EAGAIN) - ret = generic_copy_file_range(src_file, src_off, dst_file, - dst_off, len, flags); -+#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE) -+ /* -+ * Since 6.8 the fallback function is called splice_copy_file_range -+ * and has a slightly different signature. -+ */ -+ if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || -+ ret == -EAGAIN) -+ ret = splice_copy_file_range(src_file, src_off, dst_file, -+ dst_off, len); - #else - /* - * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal -@@ -118,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, - */ - if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) - ret = -EOPNOTSUPP; --#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ -+#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */ - - return (ret); - } diff --git a/debian/patches/0014-linux-5.4-compat-page_size.patch b/debian/patches/0014-linux-5.4-compat-page_size.patch deleted file mode 100644 index 258c025..0000000 --- a/debian/patches/0014-linux-5.4-compat-page_size.patch +++ /dev/null @@ -1,121 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rob Norris -Date: Mon, 13 Nov 2023 17:55:29 +1100 -Subject: [PATCH] linux 5.4 compat: page_size() - -Before 5.4 we have to do a little math. - -Reviewed-by: Alexander Motin -Reviewed-by: Brian Behlendorf -Signed-off-by: Rob Norris -Sponsored-by: Klara, Inc. -Sponsored-by: Wasabi Technology, Inc. -Closes #15533 -Closes #15588 -(cherry picked from commit df04efe321a49c650f1fbaa6fd701fa2928cbe21) ---- - config/kernel-mm-page-size.m4 | 17 +++++++++++ - config/kernel.m4 | 2 ++ - include/os/linux/Makefile.am | 1 + - include/os/linux/kernel/linux/mm_compat.h | 36 +++++++++++++++++++++++ - 4 files changed, 56 insertions(+) - create mode 100644 config/kernel-mm-page-size.m4 - create mode 100644 include/os/linux/kernel/linux/mm_compat.h - -diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4 -new file mode 100644 -index 000000000..d5ebd9269 ---- /dev/null -+++ b/config/kernel-mm-page-size.m4 -@@ -0,0 +1,17 @@ -+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [ -+ ZFS_LINUX_TEST_SRC([page_size], [ -+ #include -+ ],[ -+ unsigned long s; -+ s = page_size(NULL); -+ ]) -+]) -+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [ -+ AC_MSG_CHECKING([whether page_size() is available]) -+ ZFS_LINUX_TEST_RESULT([page_size], [ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -diff --git a/config/kernel.m4 b/config/kernel.m4 -index 1d0c5a27f..548905ccd 100644 ---- a/config/kernel.m4 -+++ b/config/kernel.m4 -@@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ - ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE - ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ - ZFS_AC_KERNEL_SRC_SYNC_BDEV -+ ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE - case "$host_cpu" in - powerpc*) - ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE -@@ -316,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ - ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE - ZFS_AC_KERNEL_COPY_SPLICE_READ - ZFS_AC_KERNEL_SYNC_BDEV -+ ZFS_AC_KERNEL_MM_PAGE_SIZE - case "$host_cpu" in - powerpc*) - ZFS_AC_KERNEL_CPU_HAS_FEATURE -diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am -index 3830d198d..51c27132b 100644 ---- a/include/os/linux/Makefile.am -+++ b/include/os/linux/Makefile.am -@@ -5,6 +5,7 @@ kernel_linux_HEADERS = \ - %D%/kernel/linux/compiler_compat.h \ - %D%/kernel/linux/dcache_compat.h \ - %D%/kernel/linux/kmap_compat.h \ -+ %D%/kernel/linux/mm_compat.h \ - %D%/kernel/linux/mod_compat.h \ - %D%/kernel/linux/page_compat.h \ - %D%/kernel/linux/percpu_compat.h \ -diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h -new file mode 100644 -index 000000000..40056c68d ---- /dev/null -+++ b/include/os/linux/kernel/linux/mm_compat.h -@@ -0,0 +1,36 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or https://opensource.org/licenses/CDDL-1.0. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Copyright (c) 2023, 2024, Klara Inc. -+ */ -+ -+#ifndef _ZFS_MM_COMPAT_H -+#define _ZFS_MM_COMPAT_H -+ -+#include -+ -+/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */ -+#ifndef HAVE_MM_PAGE_SIZE -+#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p))) -+#endif -+ -+#endif /* _ZFS_MM_COMPAT_H */ diff --git a/debian/patches/0015-abd-add-page-iterator.patch b/debian/patches/0015-abd-add-page-iterator.patch deleted file mode 100644 index bb91ea3..0000000 --- a/debian/patches/0015-abd-add-page-iterator.patch +++ /dev/null @@ -1,334 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rob Norris -Date: Mon, 11 Dec 2023 16:05:54 +1100 -Subject: [PATCH] abd: add page iterator - -The regular ABD iterators yield data buffers, so they have to map and -unmap pages into kernel memory. If the caller only wants to count -chunks, or can use page pointers directly, then the map/unmap is just -unnecessary overhead. - -This adds adb_iterate_page_func, which yields unmapped struct page -instead. - -Reviewed-by: Alexander Motin -Reviewed-by: Brian Behlendorf -Signed-off-by: Rob Norris -Sponsored-by: Klara, Inc. -Sponsored-by: Wasabi Technology, Inc. -Closes #15533 -Closes #15588 -(cherry picked from commit 390b448726c580999dd337be7a40b0e95cf1d50b) ---- - include/sys/abd.h | 7 +++ - include/sys/abd_impl.h | 26 ++++++++- - module/os/freebsd/zfs/abd_os.c | 4 +- - module/os/linux/zfs/abd_os.c | 104 ++++++++++++++++++++++++++++++--- - module/zfs/abd.c | 42 +++++++++++++ - 5 files changed, 169 insertions(+), 14 deletions(-) - -diff --git a/include/sys/abd.h b/include/sys/abd.h -index 750f9986c..8a2df0bca 100644 ---- a/include/sys/abd.h -+++ b/include/sys/abd.h -@@ -79,6 +79,9 @@ typedef struct abd { - - typedef int abd_iter_func_t(void *buf, size_t len, void *priv); - typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); -+#if defined(__linux__) && defined(_KERNEL) -+typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); -+#endif - - extern int zfs_abd_scatter_enabled; - -@@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *); - int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); - int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, - abd_iter_func2_t *, void *); -+#if defined(__linux__) && defined(_KERNEL) -+int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, -+ void *); -+#endif - void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); - void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); - void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); -diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h -index 40546d4af..f88ea25e2 100644 ---- a/include/sys/abd_impl.h -+++ b/include/sys/abd_impl.h -@@ -21,6 +21,7 @@ - /* - * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2016, 2019 by Delphix. All rights reserved. -+ * Copyright (c) 2023, 2024, Klara Inc. - */ - - #ifndef _ABD_IMPL_H -@@ -38,12 +39,30 @@ typedef enum abd_stats_op { - ABDSTAT_DECR /* Decrease abdstat values */ - } abd_stats_op_t; - --struct scatterlist; /* forward declaration */ -+/* forward declarations */ -+struct scatterlist; -+struct page; - - struct abd_iter { - /* public interface */ -- void *iter_mapaddr; /* addr corresponding to iter_pos */ -- size_t iter_mapsize; /* length of data valid at mapaddr */ -+ union { -+ /* for abd_iter_map()/abd_iter_unmap() */ -+ struct { -+ /* addr corresponding to iter_pos */ -+ void *iter_mapaddr; -+ /* length of data valid at mapaddr */ -+ size_t iter_mapsize; -+ }; -+ /* for abd_iter_page() */ -+ struct { -+ /* current page */ -+ struct page *iter_page; -+ /* offset of data in page */ -+ size_t iter_page_doff; -+ /* size of data in page */ -+ size_t iter_page_dsize; -+ }; -+ }; - - /* private */ - abd_t *iter_abd; /* ABD being iterated through */ -@@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *); - void abd_iter_advance(struct abd_iter *, size_t); - void abd_iter_map(struct abd_iter *); - void abd_iter_unmap(struct abd_iter *); -+void abd_iter_page(struct abd_iter *); - - /* - * Helper macros -diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c -index 58a37df62..3b812271f 100644 ---- a/module/os/freebsd/zfs/abd_os.c -+++ b/module/os/freebsd/zfs/abd_os.c -@@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) - { - ASSERT(!abd_is_gang(abd)); - abd_verify(abd); -+ memset(aiter, 0, sizeof (struct abd_iter)); - aiter->iter_abd = abd; -- aiter->iter_pos = 0; -- aiter->iter_mapaddr = NULL; -- aiter->iter_mapsize = 0; - } - - /* -diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c -index 24390fbbf..dae128012 100644 ---- a/module/os/linux/zfs/abd_os.c -+++ b/module/os/linux/zfs/abd_os.c -@@ -21,6 +21,7 @@ - /* - * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2019 by Delphix. All rights reserved. -+ * Copyright (c) 2023, 2024, Klara Inc. - */ - - /* -@@ -59,6 +60,7 @@ - #include - #ifdef _KERNEL - #include -+#include - #include - #endif - -@@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) - { - ASSERT(!abd_is_gang(abd)); - abd_verify(abd); -+ memset(aiter, 0, sizeof (struct abd_iter)); - aiter->iter_abd = abd; -- aiter->iter_mapaddr = NULL; -- aiter->iter_mapsize = 0; -- aiter->iter_pos = 0; -- if (abd_is_linear(abd)) { -- aiter->iter_offset = 0; -- aiter->iter_sg = NULL; -- } else { -+ if (!abd_is_linear(abd)) { - aiter->iter_offset = ABD_SCATTER(abd).abd_offset; - aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; - } -@@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) - boolean_t - abd_iter_at_end(struct abd_iter *aiter) - { -+ ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); - return (aiter->iter_pos == aiter->iter_abd->abd_size); - } - -@@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter) - void - abd_iter_advance(struct abd_iter *aiter, size_t amount) - { -+ /* -+ * Ensure that last chunk is not in use. abd_iterate_*() must clear -+ * this state (directly or abd_iter_unmap()) before advancing. -+ */ - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); -+ ASSERT3P(aiter->iter_page, ==, NULL); -+ ASSERT0(aiter->iter_page_doff); -+ ASSERT0(aiter->iter_page_dsize); - - /* There's nothing left to advance to, so do nothing */ - if (abd_iter_at_end(aiter)) -@@ -1009,6 +1014,88 @@ abd_cache_reap_now(void) - } - - #if defined(_KERNEL) -+/* -+ * Yield the next page struct and data offset and size within it, without -+ * mapping it into the address space. -+ */ -+void -+abd_iter_page(struct abd_iter *aiter) -+{ -+ if (abd_iter_at_end(aiter)) { -+ aiter->iter_page = NULL; -+ aiter->iter_page_doff = 0; -+ aiter->iter_page_dsize = 0; -+ return; -+ } -+ -+ struct page *page; -+ size_t doff, dsize; -+ -+ if (abd_is_linear(aiter->iter_abd)) { -+ ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); -+ -+ /* memory address at iter_pos */ -+ void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; -+ -+ /* struct page for address */ -+ page = is_vmalloc_addr(paddr) ? -+ vmalloc_to_page(paddr) : virt_to_page(paddr); -+ -+ /* offset of address within the page */ -+ doff = offset_in_page(paddr); -+ -+ /* total data remaining in abd from this position */ -+ dsize = aiter->iter_abd->abd_size - aiter->iter_offset; -+ } else { -+ ASSERT(!abd_is_gang(aiter->iter_abd)); -+ -+ /* current scatter page */ -+ page = sg_page(aiter->iter_sg); -+ -+ /* position within page */ -+ doff = aiter->iter_offset; -+ -+ /* remaining data in scatterlist */ -+ dsize = MIN(aiter->iter_sg->length - aiter->iter_offset, -+ aiter->iter_abd->abd_size - aiter->iter_pos); -+ } -+ ASSERT(page); -+ -+ if (PageTail(page)) { -+ /* -+ * This page is part of a "compound page", which is a group of -+ * pages that can be referenced from a single struct page *. -+ * Its organised as a "head" page, followed by a series of -+ * "tail" pages. -+ * -+ * In OpenZFS, compound pages are allocated using the -+ * __GFP_COMP flag, which we get from scatter ABDs and SPL -+ * vmalloc slabs (ie >16K allocations). So a great many of the -+ * IO buffers we get are going to be of this type. -+ * -+ * The tail pages are just regular PAGE_SIZE pages, and can be -+ * safely used as-is. However, the head page has length -+ * covering itself and all the tail pages. If this ABD chunk -+ * spans multiple pages, then we can use the head page and a -+ * >PAGE_SIZE length, which is far more efficient. -+ * -+ * To do this, we need to adjust the offset to be counted from -+ * the head page. struct page for compound pages are stored -+ * contiguously, so we can just adjust by a simple offset. -+ */ -+ struct page *head = compound_head(page); -+ doff += ((page - head) * PAGESIZE); -+ page = head; -+ } -+ -+ /* final page and position within it */ -+ aiter->iter_page = page; -+ aiter->iter_page_doff = doff; -+ -+ /* amount of data in the chunk, up to the end of the page */ -+ aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); -+} -+ - /* - * bio_nr_pages for ABD. - * @off is the offset in @abd -@@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size, - module_param(zfs_abd_scatter_max_order, uint, 0644); - MODULE_PARM_DESC(zfs_abd_scatter_max_order, - "Maximum order allocation used for a scatter ABD."); --#endif -+ -+#endif /* _KERNEL */ -diff --git a/module/zfs/abd.c b/module/zfs/abd.c -index d982f201c..3388e2357 100644 ---- a/module/zfs/abd.c -+++ b/module/zfs/abd.c -@@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, - return (ret); - } - -+#if defined(__linux__) && defined(_KERNEL) -+int -+abd_iterate_page_func(abd_t *abd, size_t off, size_t size, -+ abd_iter_page_func_t *func, void *private) -+{ -+ struct abd_iter aiter; -+ int ret = 0; -+ -+ if (size == 0) -+ return (0); -+ -+ abd_verify(abd); -+ ASSERT3U(off + size, <=, abd->abd_size); -+ -+ abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); -+ -+ while (size > 0) { -+ IMPLY(abd_is_gang(abd), c_abd != NULL); -+ -+ abd_iter_page(&aiter); -+ -+ size_t len = MIN(aiter.iter_page_dsize, size); -+ ASSERT3U(len, >, 0); -+ -+ ret = func(aiter.iter_page, aiter.iter_page_doff, -+ len, private); -+ -+ aiter.iter_page = NULL; -+ aiter.iter_page_doff = 0; -+ aiter.iter_page_dsize = 0; -+ -+ if (ret != 0) -+ break; -+ -+ size -= len; -+ c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); -+ } -+ -+ return (ret); -+} -+#endif -+ - struct buf_arg { - void *arg_buf; - }; diff --git a/debian/patches/0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch b/debian/patches/0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch deleted file mode 100644 index ebabb1c..0000000 --- a/debian/patches/0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch +++ /dev/null @@ -1,349 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rob Norris -Date: Tue, 9 Jan 2024 12:12:56 +1100 -Subject: [PATCH] vdev_disk: rename existing functions to vdev_classic_* - -This is just renaming the existing functions we're about to replace and -grouping them together to make the next commits easier to follow. - -Reviewed-by: Alexander Motin -Reviewed-by: Brian Behlendorf -Signed-off-by: Rob Norris -Sponsored-by: Klara, Inc. -Sponsored-by: Wasabi Technology, Inc. -Closes #15533 -Closes #15588 -(cherry picked from commit f3b85d706bae82957d2e3e0ef1d53a1cfab60eb4) ---- - include/sys/abd.h | 2 + - module/os/linux/zfs/abd_os.c | 5 + - module/os/linux/zfs/vdev_disk.c | 215 +++++++++++++++++--------------- - 3 files changed, 120 insertions(+), 102 deletions(-) - -diff --git a/include/sys/abd.h b/include/sys/abd.h -index 8a2df0bca..bee38b831 100644 ---- a/include/sys/abd.h -+++ b/include/sys/abd.h -@@ -220,6 +220,8 @@ void abd_fini(void); - - /* - * Linux ABD bio functions -+ * Note: these are only needed to support vdev_classic. See comment in -+ * vdev_disk.c. - */ - #if defined(__linux__) && defined(_KERNEL) - unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); -diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c -index dae128012..3fe01c0b7 100644 ---- a/module/os/linux/zfs/abd_os.c -+++ b/module/os/linux/zfs/abd_os.c -@@ -1096,6 +1096,11 @@ abd_iter_page(struct abd_iter *aiter) - aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); - } - -+/* -+ * Note: ABD BIO functions only needed to support vdev_classic. See comments in -+ * vdev_disk.c. -+ */ -+ - /* - * bio_nr_pages for ABD. - * @off is the offset in @abd -diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c -index b0bda5fa2..957619b87 100644 ---- a/module/os/linux/zfs/vdev_disk.c -+++ b/module/os/linux/zfs/vdev_disk.c -@@ -83,17 +83,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000; - */ - #define EFI_MIN_RESV_SIZE (16 * 1024) - --/* -- * Virtual device vector for disks. -- */ --typedef struct dio_request { -- zio_t *dr_zio; /* Parent ZIO */ -- atomic_t dr_ref; /* References */ -- int dr_error; /* Bio error */ -- int dr_bio_count; /* Count of bio's */ -- struct bio *dr_bio[]; /* Attached bio's */ --} dio_request_t; -- - /* - * BIO request failfast mask. - */ -@@ -467,85 +456,6 @@ vdev_disk_close(vdev_t *v) - v->vdev_tsd = NULL; - } - --static dio_request_t * --vdev_disk_dio_alloc(int bio_count) --{ -- dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + -- sizeof (struct bio *) * bio_count, KM_SLEEP); -- atomic_set(&dr->dr_ref, 0); -- dr->dr_bio_count = bio_count; -- dr->dr_error = 0; -- -- for (int i = 0; i < dr->dr_bio_count; i++) -- dr->dr_bio[i] = NULL; -- -- return (dr); --} -- --static void --vdev_disk_dio_free(dio_request_t *dr) --{ -- int i; -- -- for (i = 0; i < dr->dr_bio_count; i++) -- if (dr->dr_bio[i]) -- bio_put(dr->dr_bio[i]); -- -- kmem_free(dr, sizeof (dio_request_t) + -- sizeof (struct bio *) * dr->dr_bio_count); --} -- --static void --vdev_disk_dio_get(dio_request_t *dr) --{ -- atomic_inc(&dr->dr_ref); --} -- --static void --vdev_disk_dio_put(dio_request_t *dr) --{ -- int rc = atomic_dec_return(&dr->dr_ref); -- -- /* -- * Free the dio_request when the last reference is dropped and -- * ensure zio_interpret is called only once with the correct zio -- */ -- if (rc == 0) { -- zio_t *zio = dr->dr_zio; -- int error = dr->dr_error; -- -- vdev_disk_dio_free(dr); -- -- if (zio) { -- zio->io_error = error; -- ASSERT3S(zio->io_error, >=, 0); -- if (zio->io_error) -- vdev_disk_error(zio); -- -- zio_delay_interrupt(zio); -- } -- } --} -- --BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) --{ -- dio_request_t *dr = bio->bi_private; -- -- if (dr->dr_error == 0) { --#ifdef HAVE_1ARG_BIO_END_IO_T -- dr->dr_error = BIO_END_IO_ERROR(bio); --#else -- if (error) -- dr->dr_error = -(error); -- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) -- dr->dr_error = EIO; --#endif -- } -- -- /* Drop reference acquired by __vdev_disk_physio */ -- vdev_disk_dio_put(dr); --} -- - static inline void - vdev_submit_bio_impl(struct bio *bio) - { -@@ -697,8 +607,107 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, - return (bio); - } - -+/* ========== */ -+ -+/* -+ * This is the classic, battle-tested BIO submission code. -+ * -+ * These functions have been renamed to vdev_classic_* to make it clear what -+ * they belong to, but their implementations are unchanged. -+ */ -+ -+/* -+ * Virtual device vector for disks. -+ */ -+typedef struct dio_request { -+ zio_t *dr_zio; /* Parent ZIO */ -+ atomic_t dr_ref; /* References */ -+ int dr_error; /* Bio error */ -+ int dr_bio_count; /* Count of bio's */ -+ struct bio *dr_bio[]; /* Attached bio's */ -+} dio_request_t; -+ -+static dio_request_t * -+vdev_classic_dio_alloc(int bio_count) -+{ -+ dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + -+ sizeof (struct bio *) * bio_count, KM_SLEEP); -+ atomic_set(&dr->dr_ref, 0); -+ dr->dr_bio_count = bio_count; -+ dr->dr_error = 0; -+ -+ for (int i = 0; i < dr->dr_bio_count; i++) -+ dr->dr_bio[i] = NULL; -+ -+ return (dr); -+} -+ -+static void -+vdev_classic_dio_free(dio_request_t *dr) -+{ -+ int i; -+ -+ for (i = 0; i < dr->dr_bio_count; i++) -+ if (dr->dr_bio[i]) -+ bio_put(dr->dr_bio[i]); -+ -+ kmem_free(dr, sizeof (dio_request_t) + -+ sizeof (struct bio *) * dr->dr_bio_count); -+} -+ -+static void -+vdev_classic_dio_get(dio_request_t *dr) -+{ -+ atomic_inc(&dr->dr_ref); -+} -+ -+static void -+vdev_classic_dio_put(dio_request_t *dr) -+{ -+ int rc = atomic_dec_return(&dr->dr_ref); -+ -+ /* -+ * Free the dio_request when the last reference is dropped and -+ * ensure zio_interpret is called only once with the correct zio -+ */ -+ if (rc == 0) { -+ zio_t *zio = dr->dr_zio; -+ int error = dr->dr_error; -+ -+ vdev_classic_dio_free(dr); -+ -+ if (zio) { -+ zio->io_error = error; -+ ASSERT3S(zio->io_error, >=, 0); -+ if (zio->io_error) -+ vdev_disk_error(zio); -+ -+ zio_delay_interrupt(zio); -+ } -+ } -+} -+ -+BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error) -+{ -+ dio_request_t *dr = bio->bi_private; -+ -+ if (dr->dr_error == 0) { -+#ifdef HAVE_1ARG_BIO_END_IO_T -+ dr->dr_error = BIO_END_IO_ERROR(bio); -+#else -+ if (error) -+ dr->dr_error = -(error); -+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) -+ dr->dr_error = EIO; -+#endif -+ } -+ -+ /* Drop reference acquired by vdev_classic_physio */ -+ vdev_classic_dio_put(dr); -+} -+ - static inline unsigned int --vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) -+vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) - { - unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, - bio_size, abd_offset); -@@ -711,7 +720,7 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) - } - - static int --__vdev_disk_physio(struct block_device *bdev, zio_t *zio, -+vdev_classic_physio(struct block_device *bdev, zio_t *zio, - size_t io_size, uint64_t io_offset, int rw, int flags) - { - dio_request_t *dr; -@@ -736,7 +745,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, - } - - retry: -- dr = vdev_disk_dio_alloc(bio_count); -+ dr = vdev_classic_dio_alloc(bio_count); - - if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && - zio->io_vd->vdev_failfast == B_TRUE) { -@@ -771,23 +780,23 @@ retry: - * this should be rare - see the comment above. - */ - if (dr->dr_bio_count == i) { -- vdev_disk_dio_free(dr); -+ vdev_classic_dio_free(dr); - bio_count *= 2; - goto retry; - } - -- nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset); -+ nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); - dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); - if (unlikely(dr->dr_bio[i] == NULL)) { -- vdev_disk_dio_free(dr); -+ vdev_classic_dio_free(dr); - return (SET_ERROR(ENOMEM)); - } - -- /* Matching put called by vdev_disk_physio_completion */ -- vdev_disk_dio_get(dr); -+ /* Matching put called by vdev_classic_physio_completion */ -+ vdev_classic_dio_get(dr); - - BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; -- dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; -+ dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; - dr->dr_bio[i]->bi_private = dr; - bio_set_op_attrs(dr->dr_bio[i], rw, flags); - -@@ -801,7 +810,7 @@ retry: - } - - /* Extra reference to protect dio_request during vdev_submit_bio */ -- vdev_disk_dio_get(dr); -+ vdev_classic_dio_get(dr); - - if (dr->dr_bio_count > 1) - blk_start_plug(&plug); -@@ -815,11 +824,13 @@ retry: - if (dr->dr_bio_count > 1) - blk_finish_plug(&plug); - -- vdev_disk_dio_put(dr); -+ vdev_classic_dio_put(dr); - - return (error); - } - -+/* ========== */ -+ - BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) - { - zio_t *zio = bio->bi_private; -@@ -1023,7 +1034,7 @@ vdev_disk_io_start(zio_t *zio) - } - - zio->io_target_timestamp = zio_handle_io_delay(zio); -- error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio, -+ error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio, - zio->io_size, zio->io_offset, rw, 0); - rw_exit(&vd->vd_lock); - diff --git a/debian/patches/0017-vdev_disk-reorganise-vdev_disk_io_start.patch b/debian/patches/0017-vdev_disk-reorganise-vdev_disk_io_start.patch deleted file mode 100644 index 23a946f..0000000 --- a/debian/patches/0017-vdev_disk-reorganise-vdev_disk_io_start.patch +++ /dev/null @@ -1,111 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rob Norris -Date: Tue, 9 Jan 2024 12:23:30 +1100 -Subject: [PATCH] vdev_disk: reorganise vdev_disk_io_start - -Light reshuffle to make it a bit more linear to read and get rid of a -bunch of args that aren't needed in all cases. - -Reviewed-by: Alexander Motin -Reviewed-by: Brian Behlendorf -Signed-off-by: Rob Norris -Sponsored-by: Klara, Inc. -Sponsored-by: Wasabi Technology, Inc. -Closes #15533 -Closes #15588 -(cherry picked from commit 867178ae1db28e73051c8a7ce662f2f2f81cd8e6) ---- - module/os/linux/zfs/vdev_disk.c | 51 ++++++++++++++++++++------------- - 1 file changed, 31 insertions(+), 20 deletions(-) - -diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c -index 957619b87..51e7cef2f 100644 ---- a/module/os/linux/zfs/vdev_disk.c -+++ b/module/os/linux/zfs/vdev_disk.c -@@ -720,9 +720,16 @@ vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) - } - - static int --vdev_classic_physio(struct block_device *bdev, zio_t *zio, -- size_t io_size, uint64_t io_offset, int rw, int flags) -+vdev_classic_physio(zio_t *zio) - { -+ vdev_t *v = zio->io_vd; -+ vdev_disk_t *vd = v->vdev_tsd; -+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh); -+ size_t io_size = zio->io_size; -+ uint64_t io_offset = zio->io_offset; -+ int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; -+ int flags = 0; -+ - dio_request_t *dr; - uint64_t abd_offset; - uint64_t bio_offset; -@@ -944,7 +951,7 @@ vdev_disk_io_start(zio_t *zio) - { - vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; -- int rw, error; -+ int error; - - /* - * If the vdev is closed, it's likely in the REMOVED or FAULTED state. -@@ -1007,13 +1014,6 @@ vdev_disk_io_start(zio_t *zio) - rw_exit(&vd->vd_lock); - zio_execute(zio); - return; -- case ZIO_TYPE_WRITE: -- rw = WRITE; -- break; -- -- case ZIO_TYPE_READ: -- rw = READ; -- break; - - case ZIO_TYPE_TRIM: - zio->io_error = vdev_disk_io_trim(zio); -@@ -1026,23 +1026,34 @@ vdev_disk_io_start(zio_t *zio) - #endif - return; - -- default: -+ case ZIO_TYPE_READ: -+ case ZIO_TYPE_WRITE: -+ zio->io_target_timestamp = zio_handle_io_delay(zio); -+ error = vdev_classic_physio(zio); - rw_exit(&vd->vd_lock); -- zio->io_error = SET_ERROR(ENOTSUP); -- zio_interrupt(zio); -+ if (error) { -+ zio->io_error = error; -+ zio_interrupt(zio); -+ } - return; -- } - -- zio->io_target_timestamp = zio_handle_io_delay(zio); -- error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio, -- zio->io_size, zio->io_offset, rw, 0); -- rw_exit(&vd->vd_lock); -+ default: -+ /* -+ * Getting here means our parent vdev has made a very strange -+ * request of us, and shouldn't happen. Assert here to force a -+ * crash in dev builds, but in production return the IO -+ * unhandled. The pool will likely suspend anyway but that's -+ * nicer than crashing the kernel. -+ */ -+ ASSERT3S(zio->io_type, ==, -1); - -- if (error) { -- zio->io_error = error; -+ rw_exit(&vd->vd_lock); -+ zio->io_error = SET_ERROR(ENOTSUP); - zio_interrupt(zio); - return; - } -+ -+ __builtin_unreachable(); - } - - static void diff --git a/debian/patches/0018-vdev_disk-make-read-write-IO-function-configurable.patch b/debian/patches/0018-vdev_disk-make-read-write-IO-function-configurable.patch deleted file mode 100644 index a169979..0000000 --- a/debian/patches/0018-vdev_disk-make-read-write-IO-function-configurable.patch +++ /dev/null @@ -1,69 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rob Norris -Date: Tue, 9 Jan 2024 12:29:19 +1100 -Subject: [PATCH] vdev_disk: make read/write IO function configurable - -This is just setting up for the next couple of commits, which will add a -new IO function and a parameter to select it. - -Reviewed-by: Alexander Motin -Reviewed-by: Brian Behlendorf -Signed-off-by: Rob Norris -Sponsored-by: Klara, Inc. -Sponsored-by: Wasabi Technology, Inc. -Closes #15533 -Closes #15588 -(cherry picked from commit c4a13ba483f08a81aa47479d2f763a470d95b2b0) ---- - module/os/linux/zfs/vdev_disk.c | 23 +++++++++++++++++++++-- - 1 file changed, 21 insertions(+), 2 deletions(-) - -diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c -index 51e7cef2f..de4dba72f 100644 ---- a/module/os/linux/zfs/vdev_disk.c -+++ b/module/os/linux/zfs/vdev_disk.c -@@ -946,6 +946,8 @@ vdev_disk_io_trim(zio_t *zio) - #endif - } - -+int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; -+ - static void - vdev_disk_io_start(zio_t *zio) - { -@@ -1029,7 +1031,7 @@ vdev_disk_io_start(zio_t *zio) - case ZIO_TYPE_READ: - case ZIO_TYPE_WRITE: - zio->io_target_timestamp = zio_handle_io_delay(zio); -- error = vdev_classic_physio(zio); -+ error = vdev_disk_io_rw_fn(zio); - rw_exit(&vd->vd_lock); - if (error) { - zio->io_error = error; -@@ -1102,8 +1104,25 @@ vdev_disk_rele(vdev_t *vd) - /* XXX: Implement me as a vnode rele for the device */ - } - -+/* -+ * At first use vdev use, set the submission function from the default value if -+ * it hasn't been set already. -+ */ -+static int -+vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) -+{ -+ (void) spa; -+ (void) nv; -+ (void) tsd; -+ -+ if (vdev_disk_io_rw_fn == NULL) -+ vdev_disk_io_rw_fn = vdev_classic_physio; -+ -+ return (0); -+} -+ - vdev_ops_t vdev_disk_ops = { -- .vdev_op_init = NULL, -+ .vdev_op_init = vdev_disk_init, - .vdev_op_fini = NULL, - .vdev_op_open = vdev_disk_open, - .vdev_op_close = vdev_disk_close, diff --git a/debian/patches/0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch b/debian/patches/0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch deleted file mode 100644 index 8ccbf65..0000000 --- a/debian/patches/0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch +++ /dev/null @@ -1,671 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rob Norris -Date: Tue, 18 Jul 2023 11:11:29 +1000 -Subject: [PATCH] vdev_disk: rewrite BIO filling machinery to avoid split pages - -This commit tackles a number of issues in the way BIOs (`struct bio`) -are constructed for submission to the Linux block layer. - -The kernel has a hard upper limit on the number of pages/segments that -can be added to a BIO, as well as a separate limit for each device -(related to its queue depth and other scheduling characteristics). - -ZFS counts the number of memory pages in the request ABD -(`abd_nr_pages_off()`, and then uses that as the number of segments to -put into the BIO, up to the hard upper limit. If it requires more than -the limit, it will create multiple BIOs. - -Leaving aside the fact that page count method is wrong (see below), not -limiting to the device segment max means that the device driver will -need to split the BIO in half. This is alone is not necessarily a -problem, but it interacts with another issue to cause a much larger -problem. - -The kernel function to add a segment to a BIO (`bio_add_page()`) takes a -`struct page` pointer, and offset+len within it. `struct page` can -represent a run of contiguous memory pages (known as a "compound page"). -In can be of arbitrary length. - -The ZFS functions that count ABD pages and load them into the BIO -(`abd_nr_pages_off()`, `bio_map()` and `abd_bio_map_off()`) will never -consider a page to be more than `PAGE_SIZE` (4K), even if the `struct -page` is for multiple pages. In this case, it will load the same `struct -page` into the BIO multiple times, with the offset adjusted each time. - -With a sufficiently large ABD, this can easily lead to the BIO being -entirely filled much earlier than it could have been. This is also -further contributes to the problem caused by the incorrect segment limit -calculation, as its much easier to go past the device limit, and so -require a split. - -Again, this is not a problem on its own. - -The logic for "never submit more than `PAGE_SIZE`" is actually a little -more subtle. It will actually never submit a buffer that crosses a 4K -page boundary. - -In practice, this is fine, as most ABDs are scattered, that is a list of -complete 4K pages, and so are loaded in as such. - -Linear ABDs are typically allocated from slabs, and for small sizes they -are frequently not aligned to page boundaries. For example, a 12K -allocation can span four pages, eg: - - -- 4K -- -- 4K -- -- 4K -- -- 4K -- - | | | | | - :## ######## ######## ######: [1K, 4K, 4K, 3K] - -Such an allocation would be loaded into a BIO as you see: - - [1K, 4K, 4K, 3K] - -This tends not to be a problem in practice, because even if the BIO were -filled and needed to be split, each half would still have either a start -or end aligned to the logical block size of the device (assuming 4K at -least). - ---- - -In ideal circumstances, these shortcomings don't cause any particular -problems. Its when they start to interact with other ZFS features that -things get interesting. - -Aggregation will create a "gang" ABD, which is simply a list of other -ABDs. Iterating over a gang ABD is just iterating over each ABD within -it in turn. - -Because the segments are simply loaded in order, we can end up with -uneven segments either side of the "gap" between the two ABDs. For -example, two 12K ABDs might be aggregated and then loaded as: - - [1K, 4K, 4K, 3K, 2K, 4K, 4K, 2K] - -Should a split occur, each individual BIO can end up either having an -start or end offset that is not aligned to the logical block size, which -some drivers (eg SCSI) will reject. However, this tends not to happen -because the default aggregation limit usually keeps the BIO small enough -to not require more than one split, and most pages are actually full 4K -pages, so hitting an uneven gap is very rare anyway. - -If the pool is under particular memory pressure, then an IO can be -broken down into a "gang block", a 512-byte block composed of a header -and up to three block pointers. Each points to a fragment of the -original write, or in turn, another gang block, breaking the original -data up over and over until space can be found in the pool for each of -them. - -Each gang header is a separate 512-byte memory allocation from a slab, -that needs to be written down to disk. When the gang header is added to -the BIO, its a single 512-byte segment. - -Pulling all this together, consider a large aggregated write of gang -blocks. This results a BIO containing lots of 512-byte segments. Given -our tendency to overfill the BIO, a split is likely, and most possible -split points will yield a pair of BIOs that are misaligned. Drivers that -care, like the SCSI driver, will reject them. - ---- - -This commit is a substantial refactor and rewrite of much of `vdev_disk` -to sort all this out. - -`vdev_bio_max_segs()` now returns the ideal maximum size for the device, -if available. There's also a tuneable `zfs_vdev_disk_max_segs` to -override this, to assist with testing. - -We scan the ABD up front to count the number of pages within it, and to -confirm that if we submitted all those pages to one or more BIOs, it -could be split at any point with creating a misaligned BIO. If the -pages in the BIO are not usable (as in any of the above situations), the -ABD is linearised, and then checked again. This is the same technique -used in `vdev_geom` on FreeBSD, adjusted for Linux's variable page size -and allocator quirks. - -`vbio_t` is a cleanup and enhancement of the old `dio_request_t`. The -idea is simply that it can hold all the state needed to create, submit -and return multiple BIOs, including all the refcounts, the ABD copy if -it was needed, and so on. Apart from what I hope is a clearer interface, -the major difference is that because we know how many BIOs we'll need up -front, we don't need the old overflow logic that would grow the BIO -array, throw away all the old work and restart. We can get it right from -the start. - -Reviewed-by: Alexander Motin -Reviewed-by: Brian Behlendorf -Signed-off-by: Rob Norris -Sponsored-by: Klara, Inc. -Sponsored-by: Wasabi Technology, Inc. -Closes #15533 -Closes #15588 -(cherry picked from commit 06a196020e6f70d2fedbd4d0d05bbe0c1ac6e4d8) ---- - include/os/linux/kernel/linux/mod_compat.h | 1 + - man/man4/zfs.4 | 10 +- - module/os/linux/zfs/vdev_disk.c | 439 ++++++++++++++++++++- - 3 files changed, 447 insertions(+), 3 deletions(-) - -diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h -index 8e20a9613..039865b70 100644 ---- a/include/os/linux/kernel/linux/mod_compat.h -+++ b/include/os/linux/kernel/linux/mod_compat.h -@@ -68,6 +68,7 @@ enum scope_prefix_types { - zfs_trim, - zfs_txg, - zfs_vdev, -+ zfs_vdev_disk, - zfs_vdev_file, - zfs_vdev_mirror, - zfs_vnops, -diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 -index 352990e02..b5679f2f0 100644 ---- a/man/man4/zfs.4 -+++ b/man/man4/zfs.4 -@@ -2,6 +2,7 @@ - .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. - .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. - .\" Copyright (c) 2019 Datto Inc. -+.\" Copyright (c) 2023, 2024 Klara, Inc. - .\" The contents of this file are subject to the terms of the Common Development - .\" and Distribution License (the "License"). You may not use this file except - .\" in compliance with the License. You can obtain a copy of the license at -@@ -15,7 +16,7 @@ - .\" own identifying information: - .\" Portions Copyright [yyyy] [name of copyright owner] - .\" --.Dd July 21, 2023 -+.Dd January 9, 2024 - .Dt ZFS 4 - .Os - . -@@ -1345,6 +1346,13 @@ _ - 4 Driver No driver retries on driver errors. - .TE - . -+.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint -+Maximum number of segments to add to a BIO (min 4). -+If this is higher than the maximum allowed by the device queue or the kernel -+itself, it will be clamped. -+Setting it to zero will cause the kernel's ideal size to be used. -+This parameter only applies on Linux. -+. - .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int - Time before expiring - .Pa .zfs/snapshot . -diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c -index de4dba72f..0ccb9ad96 100644 ---- a/module/os/linux/zfs/vdev_disk.c -+++ b/module/os/linux/zfs/vdev_disk.c -@@ -24,6 +24,7 @@ - * Rewritten for Linux by Brian Behlendorf . - * LLNL-CODE-403049. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. -+ * Copyright (c) 2023, 2024, Klara Inc. - */ - - #include -@@ -66,6 +67,13 @@ typedef struct vdev_disk { - krwlock_t vd_lock; - } vdev_disk_t; - -+/* -+ * Maximum number of segments to add to a bio (min 4). If this is higher than -+ * the maximum allowed by the device queue or the kernel itself, it will be -+ * clamped. Setting it to zero will cause the kernel's ideal size to be used. -+ */ -+uint_t zfs_vdev_disk_max_segs = 0; -+ - /* - * Unique identifier for the exclusive vdev holder. - */ -@@ -607,10 +615,433 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, - return (bio); - } - -+static inline uint_t -+vdev_bio_max_segs(struct block_device *bdev) -+{ -+ /* -+ * Smallest of the device max segs and the tuneable max segs. Minimum -+ * 4, so there's room to finish split pages if they come up. -+ */ -+ const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); -+ const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ? -+ MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs; -+ const uint_t max_segs = MIN(tune_max_segs, dev_max_segs); -+ -+#ifdef HAVE_BIO_MAX_SEGS -+ return (bio_max_segs(max_segs)); -+#else -+ return (MIN(max_segs, BIO_MAX_PAGES)); -+#endif -+} -+ -+static inline uint_t -+vdev_bio_max_bytes(struct block_device *bdev) -+{ -+ return (queue_max_sectors(bdev_get_queue(bdev)) << 9); -+} -+ -+ -+/* -+ * Virtual block IO object (VBIO) -+ * -+ * Linux block IO (BIO) objects have a limit on how many data segments (pages) -+ * they can hold. Depending on how they're allocated and structured, a large -+ * ZIO can require more than one BIO to be submitted to the kernel, which then -+ * all have to complete before we can return the completed ZIO back to ZFS. -+ * -+ * A VBIO is a wrapper around multiple BIOs, carrying everything needed to -+ * translate a ZIO down into the kernel block layer and back again. -+ * -+ * Note that these are only used for data ZIOs (read/write). Meta-operations -+ * (flush/trim) don't need multiple BIOs and so can just make the call -+ * directly. -+ */ -+typedef struct { -+ zio_t *vbio_zio; /* parent zio */ -+ -+ struct block_device *vbio_bdev; /* blockdev to submit bios to */ -+ -+ abd_t *vbio_abd; /* abd carrying borrowed linear buf */ -+ -+ atomic_t vbio_ref; /* bio refcount */ -+ int vbio_error; /* error from failed bio */ -+ -+ uint_t vbio_max_segs; /* max segs per bio */ -+ -+ uint_t vbio_max_bytes; /* max bytes per bio */ -+ uint_t vbio_lbs_mask; /* logical block size mask */ -+ -+ uint64_t vbio_offset; /* start offset of next bio */ -+ -+ struct bio *vbio_bio; /* pointer to the current bio */ -+ struct bio *vbio_bios; /* list of all bios */ -+} vbio_t; -+ -+static vbio_t * -+vbio_alloc(zio_t *zio, struct block_device *bdev) -+{ -+ vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); -+ -+ vbio->vbio_zio = zio; -+ vbio->vbio_bdev = bdev; -+ atomic_set(&vbio->vbio_ref, 0); -+ vbio->vbio_max_segs = vdev_bio_max_segs(bdev); -+ vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); -+ vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); -+ vbio->vbio_offset = zio->io_offset; -+ -+ return (vbio); -+} -+ -+static int -+vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) -+{ -+ struct bio *bio; -+ uint_t ssize; -+ -+ while (size > 0) { -+ bio = vbio->vbio_bio; -+ if (bio == NULL) { -+ /* New BIO, allocate and set up */ -+ bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, -+ vbio->vbio_max_segs); -+ if (unlikely(bio == NULL)) -+ return (SET_ERROR(ENOMEM)); -+ BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; -+ -+ bio->bi_next = vbio->vbio_bios; -+ vbio->vbio_bios = vbio->vbio_bio = bio; -+ } -+ -+ /* -+ * Only load as much of the current page data as will fit in -+ * the space left in the BIO, respecting lbs alignment. Older -+ * kernels will error if we try to overfill the BIO, while -+ * newer ones will accept it and split the BIO. This ensures -+ * everything works on older kernels, and avoids an additional -+ * overhead on the new. -+ */ -+ ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) & -+ vbio->vbio_lbs_mask); -+ if (ssize > 0 && -+ bio_add_page(bio, page, ssize, offset) == ssize) { -+ /* Accepted, adjust and load any remaining. */ -+ size -= ssize; -+ offset += ssize; -+ continue; -+ } -+ -+ /* No room, set up for a new BIO and loop */ -+ vbio->vbio_offset += BIO_BI_SIZE(bio); -+ -+ /* Signal new BIO allocation wanted */ -+ vbio->vbio_bio = NULL; -+ } -+ -+ return (0); -+} -+ -+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error); -+static void vbio_put(vbio_t *vbio); -+ -+static void -+vbio_submit(vbio_t *vbio, int flags) -+{ -+ ASSERT(vbio->vbio_bios); -+ struct bio *bio = vbio->vbio_bios; -+ vbio->vbio_bio = vbio->vbio_bios = NULL; -+ -+ /* -+ * We take a reference for each BIO as we submit it, plus one to -+ * protect us from BIOs completing before we're done submitting them -+ * all, causing vbio_put() to free vbio out from under us and/or the -+ * zio to be returned before all its IO has completed. -+ */ -+ atomic_set(&vbio->vbio_ref, 1); -+ -+ /* -+ * If we're submitting more than one BIO, inform the block layer so -+ * it can batch them if it wants. -+ */ -+ struct blk_plug plug; -+ boolean_t do_plug = (bio->bi_next != NULL); -+ if (do_plug) -+ blk_start_plug(&plug); -+ -+ /* Submit all the BIOs */ -+ while (bio != NULL) { -+ atomic_inc(&vbio->vbio_ref); -+ -+ struct bio *next = bio->bi_next; -+ bio->bi_next = NULL; -+ -+ bio->bi_end_io = vdev_disk_io_rw_completion; -+ bio->bi_private = vbio; -+ bio_set_op_attrs(bio, -+ vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? -+ WRITE : READ, flags); -+ -+ vdev_submit_bio(bio); -+ -+ bio = next; -+ } -+ -+ /* Finish the batch */ -+ if (do_plug) -+ blk_finish_plug(&plug); -+ -+ /* Release the extra reference */ -+ vbio_put(vbio); -+} -+ -+static void -+vbio_return_abd(vbio_t *vbio) -+{ -+ zio_t *zio = vbio->vbio_zio; -+ if (vbio->vbio_abd == NULL) -+ return; -+ -+ /* -+ * If we copied the ABD before issuing it, clean up and return the copy -+ * to the ADB, with changes if appropriate. -+ */ -+ void *buf = abd_to_buf(vbio->vbio_abd); -+ abd_free(vbio->vbio_abd); -+ vbio->vbio_abd = NULL; -+ -+ if (zio->io_type == ZIO_TYPE_READ) -+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size); -+ else -+ abd_return_buf(zio->io_abd, buf, zio->io_size); -+} -+ -+static void -+vbio_free(vbio_t *vbio) -+{ -+ VERIFY0(atomic_read(&vbio->vbio_ref)); -+ -+ vbio_return_abd(vbio); -+ -+ kmem_free(vbio, sizeof (vbio_t)); -+} -+ -+static void -+vbio_put(vbio_t *vbio) -+{ -+ if (atomic_dec_return(&vbio->vbio_ref) > 0) -+ return; -+ -+ /* -+ * This was the last reference, so the entire IO is completed. Clean -+ * up and submit it for processing. -+ */ -+ -+ /* -+ * Get any data buf back to the original ABD, if necessary. We do this -+ * now so we can get the ZIO into the pipeline as quickly as possible, -+ * and then do the remaining cleanup after. -+ */ -+ vbio_return_abd(vbio); -+ -+ zio_t *zio = vbio->vbio_zio; -+ -+ /* -+ * Set the overall error. If multiple BIOs returned an error, only the -+ * first will be taken; the others are dropped (see -+ * vdev_disk_io_rw_completion()). Its pretty much impossible for -+ * multiple IOs to the same device to fail with different errors, so -+ * there's no real risk. -+ */ -+ zio->io_error = vbio->vbio_error; -+ if (zio->io_error) -+ vdev_disk_error(zio); -+ -+ /* All done, submit for processing */ -+ zio_delay_interrupt(zio); -+ -+ /* Finish cleanup */ -+ vbio_free(vbio); -+} -+ -+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error) -+{ -+ vbio_t *vbio = bio->bi_private; -+ -+ if (vbio->vbio_error == 0) { -+#ifdef HAVE_1ARG_BIO_END_IO_T -+ vbio->vbio_error = BIO_END_IO_ERROR(bio); -+#else -+ if (error) -+ vbio->vbio_error = -(error); -+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) -+ vbio->vbio_error = EIO; -+#endif -+ } -+ -+ /* -+ * Destroy the BIO. This is safe to do; the vbio owns its data and the -+ * kernel won't touch it again after the completion function runs. -+ */ -+ bio_put(bio); -+ -+ /* Drop this BIOs reference acquired by vbio_submit() */ -+ vbio_put(vbio); -+} -+ -+/* -+ * Iterator callback to count ABD pages and check their size & alignment. -+ * -+ * On Linux, each BIO segment can take a page pointer, and an offset+length of -+ * the data within that page. A page can be arbitrarily large ("compound" -+ * pages) but we still have to ensure the data portion is correctly sized and -+ * aligned to the logical block size, to ensure that if the kernel wants to -+ * split the BIO, the two halves will still be properly aligned. -+ */ -+typedef struct { -+ uint_t bmask; -+ uint_t npages; -+ uint_t end; -+} vdev_disk_check_pages_t; -+ -+static int -+vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) -+{ -+ vdev_disk_check_pages_t *s = priv; -+ -+ /* -+ * If we didn't finish on a block size boundary last time, then there -+ * would be a gap if we tried to use this ABD as-is, so abort. -+ */ -+ if (s->end != 0) -+ return (1); -+ -+ /* -+ * Note if we're taking less than a full block, so we can check it -+ * above on the next call. -+ */ -+ s->end = len & s->bmask; -+ -+ /* All blocks after the first must start on a block size boundary. */ -+ if (s->npages != 0 && (off & s->bmask) != 0) -+ return (1); -+ -+ s->npages++; -+ return (0); -+} -+ -+/* -+ * Check if we can submit the pages in this ABD to the kernel as-is. Returns -+ * the number of pages, or 0 if it can't be submitted like this. -+ */ -+static boolean_t -+vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) -+{ -+ vdev_disk_check_pages_t s = { -+ .bmask = bdev_logical_block_size(bdev)-1, -+ .npages = 0, -+ .end = 0, -+ }; -+ -+ if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) -+ return (B_FALSE); -+ -+ return (B_TRUE); -+} -+ -+/* Iterator callback to submit ABD pages to the vbio. */ -+static int -+vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv) -+{ -+ vbio_t *vbio = priv; -+ return (vbio_add_page(vbio, page, len, off)); -+} -+ -+static int -+vdev_disk_io_rw(zio_t *zio) -+{ -+ vdev_t *v = zio->io_vd; -+ vdev_disk_t *vd = v->vdev_tsd; -+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh); -+ int flags = 0; -+ -+ /* -+ * Accessing outside the block device is never allowed. -+ */ -+ if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) { -+ vdev_dbgmsg(zio->io_vd, -+ "Illegal access %llu size %llu, device size %llu", -+ (u_longlong_t)zio->io_offset, -+ (u_longlong_t)zio->io_size, -+ (u_longlong_t)i_size_read(bdev->bd_inode)); -+ return (SET_ERROR(EIO)); -+ } -+ -+ if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && -+ v->vdev_failfast == B_TRUE) { -+ bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, -+ zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); -+ } -+ -+ /* -+ * Check alignment of the incoming ABD. If any part of it would require -+ * submitting a page that is not aligned to the logical block size, -+ * then we take a copy into a linear buffer and submit that instead. -+ * This should be impossible on a 512b LBS, and fairly rare on 4K, -+ * usually requiring abnormally-small data blocks (eg gang blocks) -+ * mixed into the same ABD as larger ones (eg aggregated). -+ */ -+ abd_t *abd = zio->io_abd; -+ if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) { -+ void *buf; -+ if (zio->io_type == ZIO_TYPE_READ) -+ buf = abd_borrow_buf(zio->io_abd, zio->io_size); -+ else -+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); -+ -+ /* -+ * Wrap the copy in an abd_t, so we can use the same iterators -+ * to count and fill the vbio later. -+ */ -+ abd = abd_get_from_buf(buf, zio->io_size); -+ -+ /* -+ * False here would mean the borrowed copy has an invalid -+ * alignment too, which would mean we've somehow been passed a -+ * linear ABD with an interior page that has a non-zero offset -+ * or a size not a multiple of PAGE_SIZE. This is not possible. -+ * It would mean either zio_buf_alloc() or its underlying -+ * allocators have done something extremely strange, or our -+ * math in vdev_disk_check_pages() is wrong. In either case, -+ * something in seriously wrong and its not safe to continue. -+ */ -+ VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev)); -+ } -+ -+ /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ -+ int error = 0; -+ vbio_t *vbio = vbio_alloc(zio, bdev); -+ if (abd != zio->io_abd) -+ vbio->vbio_abd = abd; -+ -+ /* Fill it with pages */ -+ error = abd_iterate_page_func(abd, 0, zio->io_size, -+ vdev_disk_fill_vbio_cb, vbio); -+ if (error != 0) { -+ vbio_free(vbio); -+ return (error); -+ } -+ -+ vbio_submit(vbio, flags); -+ return (0); -+} -+ - /* ========== */ - - /* -- * This is the classic, battle-tested BIO submission code. -+ * This is the classic, battle-tested BIO submission code. Until we're totally -+ * sure that the new code is safe and correct in all cases, this will remain -+ * available and can be enabled by setting zfs_vdev_disk_classic=1 at module -+ * load time. - * - * These functions have been renamed to vdev_classic_* to make it clear what - * they belong to, but their implementations are unchanged. -@@ -1116,7 +1547,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) - (void) tsd; - - if (vdev_disk_io_rw_fn == NULL) -- vdev_disk_io_rw_fn = vdev_classic_physio; -+ /* XXX make configurable */ -+ vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw; - - return (0); - } -@@ -1215,3 +1647,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, - - ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, - "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); -+ -+ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, -+ "Maximum number of data segments to add to an IO request (min 4)"); diff --git a/debian/patches/0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch b/debian/patches/0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch deleted file mode 100644 index b7aef38..0000000 --- a/debian/patches/0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch +++ /dev/null @@ -1,104 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rob Norris -Date: Tue, 9 Jan 2024 13:28:57 +1100 -Subject: [PATCH] vdev_disk: add module parameter to select BIO submission - method - -This makes the submission method selectable at module load time via the -`zfs_vdev_disk_classic` parameter, allowing this change to be backported -to 2.2 safely, and disabled in favour of the "classic" submission method -if new problems come up. - -Reviewed-by: Alexander Motin -Reviewed-by: Brian Behlendorf -Signed-off-by: Rob Norris -Sponsored-by: Klara, Inc. -Sponsored-by: Wasabi Technology, Inc. -Closes #15533 -Closes #15588 -(cherry picked from commit df2169d141aadc0c2cc728c5c5261d6f5c2a27f7) ---- - man/man4/zfs.4 | 16 ++++++++++++++++ - module/os/linux/zfs/vdev_disk.c | 31 +++++++++++++++++++++++++++++-- - 2 files changed, 45 insertions(+), 2 deletions(-) - -diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 -index b5679f2f0..6a628e7f3 100644 ---- a/man/man4/zfs.4 -+++ b/man/man4/zfs.4 -@@ -1352,6 +1352,22 @@ If this is higher than the maximum allowed by the device queue or the kernel - itself, it will be clamped. - Setting it to zero will cause the kernel's ideal size to be used. - This parameter only applies on Linux. -+This parameter is ignored if -+.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 . -+. -+.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint -+If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2 -+and earlier. -+This "classic" method has known issues with highly fragmented IO requests and -+is slower on many workloads, but it has been in use for many years and is known -+to be very stable. -+If you set this parameter, please also open a bug report why you did so, -+including the workload involved and any error messages. -+.Pp -+This parameter and the classic submission method will be removed once we have -+total confidence in the new method. -+.Pp -+This parameter only applies on Linux, and can only be set at module load time. - . - .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int - Time before expiring -diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c -index 0ccb9ad96..a9110623a 100644 ---- a/module/os/linux/zfs/vdev_disk.c -+++ b/module/os/linux/zfs/vdev_disk.c -@@ -1535,6 +1535,29 @@ vdev_disk_rele(vdev_t *vd) - /* XXX: Implement me as a vnode rele for the device */ - } - -+/* -+ * BIO submission method. See comment above about vdev_classic. -+ * Set zfs_vdev_disk_classic=0 for new, =1 for classic -+ */ -+static uint_t zfs_vdev_disk_classic = 0; /* default new */ -+ -+/* Set submission function from module parameter */ -+static int -+vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp) -+{ -+ int err = param_set_uint(buf, kp); -+ if (err < 0) -+ return (SET_ERROR(err)); -+ -+ vdev_disk_io_rw_fn = -+ zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; -+ -+ printk(KERN_INFO "ZFS: forcing %s BIO submission\n", -+ zfs_vdev_disk_classic ? "classic" : "new"); -+ -+ return (0); -+} -+ - /* - * At first use vdev use, set the submission function from the default value if - * it hasn't been set already. -@@ -1547,8 +1570,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) - (void) tsd; - - if (vdev_disk_io_rw_fn == NULL) -- /* XXX make configurable */ -- vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw; -+ vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? -+ vdev_classic_physio : vdev_disk_io_rw; - - return (0); - } -@@ -1650,3 +1673,7 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, - - ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, - "Maximum number of data segments to add to an IO request (min 4)"); -+ -+ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic, -+ vdev_disk_param_set_classic, param_get_uint, ZMOD_RD, -+ "Use classic BIO submission method"); diff --git a/debian/patches/0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch b/debian/patches/0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch deleted file mode 100644 index 2dbf891..0000000 --- a/debian/patches/0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch +++ /dev/null @@ -1,363 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rob Norris -Date: Wed, 21 Feb 2024 11:07:21 +1100 -Subject: [PATCH] vdev_disk: use bio_chain() to submit multiple BIOs - -Simplifies our code a lot, so we don't have to wait for each and -reassemble them. - -Reviewed-by: Alexander Motin -Reviewed-by: Brian Behlendorf -Signed-off-by: Rob Norris -Sponsored-by: Klara, Inc. -Sponsored-by: Wasabi Technology, Inc. -Closes #15533 -Closes #15588 -(cherry picked from commit 72fd834c47558cb10d847948d1a4615e894c77c3) ---- - module/os/linux/zfs/vdev_disk.c | 231 +++++++++++--------------------- - 1 file changed, 80 insertions(+), 151 deletions(-) - -diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c -index a9110623a..36468fc21 100644 ---- a/module/os/linux/zfs/vdev_disk.c -+++ b/module/os/linux/zfs/vdev_disk.c -@@ -454,10 +454,9 @@ vdev_disk_close(vdev_t *v) - if (v->vdev_reopening || vd == NULL) - return; - -- if (vd->vd_bdh != NULL) { -+ if (vd->vd_bdh != NULL) - vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), - zfs_vdev_holder); -- } - - rw_destroy(&vd->vd_lock); - kmem_free(vd, sizeof (vdev_disk_t)); -@@ -663,9 +662,6 @@ typedef struct { - - abd_t *vbio_abd; /* abd carrying borrowed linear buf */ - -- atomic_t vbio_ref; /* bio refcount */ -- int vbio_error; /* error from failed bio */ -- - uint_t vbio_max_segs; /* max segs per bio */ - - uint_t vbio_max_bytes; /* max bytes per bio */ -@@ -674,43 +670,52 @@ typedef struct { - uint64_t vbio_offset; /* start offset of next bio */ - - struct bio *vbio_bio; /* pointer to the current bio */ -- struct bio *vbio_bios; /* list of all bios */ -+ int vbio_flags; /* bio flags */ - } vbio_t; - - static vbio_t * --vbio_alloc(zio_t *zio, struct block_device *bdev) -+vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) - { - vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); - - vbio->vbio_zio = zio; - vbio->vbio_bdev = bdev; -- atomic_set(&vbio->vbio_ref, 0); -+ vbio->vbio_abd = NULL; - vbio->vbio_max_segs = vdev_bio_max_segs(bdev); - vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); - vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); - vbio->vbio_offset = zio->io_offset; -+ vbio->vbio_bio = NULL; -+ vbio->vbio_flags = flags; - - return (vbio); - } - -+BIO_END_IO_PROTO(vbio_completion, bio, error); -+ - static int - vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) - { -- struct bio *bio; -+ struct bio *bio = vbio->vbio_bio; - uint_t ssize; - - while (size > 0) { -- bio = vbio->vbio_bio; - if (bio == NULL) { - /* New BIO, allocate and set up */ - bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, - vbio->vbio_max_segs); -- if (unlikely(bio == NULL)) -- return (SET_ERROR(ENOMEM)); -+ VERIFY(bio); -+ - BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; -+ bio_set_op_attrs(bio, -+ vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? -+ WRITE : READ, vbio->vbio_flags); - -- bio->bi_next = vbio->vbio_bios; -- vbio->vbio_bios = vbio->vbio_bio = bio; -+ if (vbio->vbio_bio) { -+ bio_chain(vbio->vbio_bio, bio); -+ vdev_submit_bio(vbio->vbio_bio); -+ } -+ vbio->vbio_bio = bio; - } - - /* -@@ -735,157 +740,97 @@ vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) - vbio->vbio_offset += BIO_BI_SIZE(bio); - - /* Signal new BIO allocation wanted */ -- vbio->vbio_bio = NULL; -+ bio = NULL; - } - - return (0); - } - --BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error); --static void vbio_put(vbio_t *vbio); -+/* Iterator callback to submit ABD pages to the vbio. */ -+static int -+vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) -+{ -+ vbio_t *vbio = priv; -+ return (vbio_add_page(vbio, page, len, off)); -+} - -+/* Create some BIOs, fill them with data and submit them */ - static void --vbio_submit(vbio_t *vbio, int flags) -+vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) - { -- ASSERT(vbio->vbio_bios); -- struct bio *bio = vbio->vbio_bios; -- vbio->vbio_bio = vbio->vbio_bios = NULL; -- -- /* -- * We take a reference for each BIO as we submit it, plus one to -- * protect us from BIOs completing before we're done submitting them -- * all, causing vbio_put() to free vbio out from under us and/or the -- * zio to be returned before all its IO has completed. -- */ -- atomic_set(&vbio->vbio_ref, 1); -+ ASSERT(vbio->vbio_bdev); - - /* -- * If we're submitting more than one BIO, inform the block layer so -- * it can batch them if it wants. -+ * We plug so we can submit the BIOs as we go and only unplug them when -+ * they are fully created and submitted. This is important; if we don't -+ * plug, then the kernel may start executing earlier BIOs while we're -+ * still creating and executing later ones, and if the device goes -+ * away while that's happening, older kernels can get confused and -+ * trample memory. - */ - struct blk_plug plug; -- boolean_t do_plug = (bio->bi_next != NULL); -- if (do_plug) -- blk_start_plug(&plug); -+ blk_start_plug(&plug); - -- /* Submit all the BIOs */ -- while (bio != NULL) { -- atomic_inc(&vbio->vbio_ref); -+ (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio); -+ ASSERT(vbio->vbio_bio); - -- struct bio *next = bio->bi_next; -- bio->bi_next = NULL; -+ vbio->vbio_bio->bi_end_io = vbio_completion; -+ vbio->vbio_bio->bi_private = vbio; - -- bio->bi_end_io = vdev_disk_io_rw_completion; -- bio->bi_private = vbio; -- bio_set_op_attrs(bio, -- vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? -- WRITE : READ, flags); -+ vdev_submit_bio(vbio->vbio_bio); - -- vdev_submit_bio(bio); -- -- bio = next; -- } -- -- /* Finish the batch */ -- if (do_plug) -- blk_finish_plug(&plug); -+ blk_finish_plug(&plug); - -- /* Release the extra reference */ -- vbio_put(vbio); -+ vbio->vbio_bio = NULL; -+ vbio->vbio_bdev = NULL; - } - --static void --vbio_return_abd(vbio_t *vbio) -+/* IO completion callback */ -+BIO_END_IO_PROTO(vbio_completion, bio, error) - { -+ vbio_t *vbio = bio->bi_private; - zio_t *zio = vbio->vbio_zio; -- if (vbio->vbio_abd == NULL) -- return; -- -- /* -- * If we copied the ABD before issuing it, clean up and return the copy -- * to the ADB, with changes if appropriate. -- */ -- void *buf = abd_to_buf(vbio->vbio_abd); -- abd_free(vbio->vbio_abd); -- vbio->vbio_abd = NULL; -- -- if (zio->io_type == ZIO_TYPE_READ) -- abd_return_buf_copy(zio->io_abd, buf, zio->io_size); -- else -- abd_return_buf(zio->io_abd, buf, zio->io_size); --} - --static void --vbio_free(vbio_t *vbio) --{ -- VERIFY0(atomic_read(&vbio->vbio_ref)); -- -- vbio_return_abd(vbio); -+ ASSERT(zio); - -- kmem_free(vbio, sizeof (vbio_t)); --} -+ /* Capture and log any errors */ -+#ifdef HAVE_1ARG_BIO_END_IO_T -+ zio->io_error = BIO_END_IO_ERROR(bio); -+#else -+ zio->io_error = 0; -+ if (error) -+ zio->io_error = -(error); -+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) -+ zio->io_error = EIO; -+#endif -+ ASSERT3U(zio->io_error, >=, 0); - --static void --vbio_put(vbio_t *vbio) --{ -- if (atomic_dec_return(&vbio->vbio_ref) > 0) -- return; -+ if (zio->io_error) -+ vdev_disk_error(zio); - -- /* -- * This was the last reference, so the entire IO is completed. Clean -- * up and submit it for processing. -- */ -+ /* Return the BIO to the kernel */ -+ bio_put(bio); - - /* -- * Get any data buf back to the original ABD, if necessary. We do this -- * now so we can get the ZIO into the pipeline as quickly as possible, -- * and then do the remaining cleanup after. -+ * If we copied the ABD before issuing it, clean up and return the copy -+ * to the ADB, with changes if appropriate. - */ -- vbio_return_abd(vbio); -+ if (vbio->vbio_abd != NULL) { -+ void *buf = abd_to_buf(vbio->vbio_abd); -+ abd_free(vbio->vbio_abd); -+ vbio->vbio_abd = NULL; - -- zio_t *zio = vbio->vbio_zio; -+ if (zio->io_type == ZIO_TYPE_READ) -+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size); -+ else -+ abd_return_buf(zio->io_abd, buf, zio->io_size); -+ } - -- /* -- * Set the overall error. If multiple BIOs returned an error, only the -- * first will be taken; the others are dropped (see -- * vdev_disk_io_rw_completion()). Its pretty much impossible for -- * multiple IOs to the same device to fail with different errors, so -- * there's no real risk. -- */ -- zio->io_error = vbio->vbio_error; -- if (zio->io_error) -- vdev_disk_error(zio); -+ /* Final cleanup */ -+ kmem_free(vbio, sizeof (vbio_t)); - - /* All done, submit for processing */ - zio_delay_interrupt(zio); -- -- /* Finish cleanup */ -- vbio_free(vbio); --} -- --BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error) --{ -- vbio_t *vbio = bio->bi_private; -- -- if (vbio->vbio_error == 0) { --#ifdef HAVE_1ARG_BIO_END_IO_T -- vbio->vbio_error = BIO_END_IO_ERROR(bio); --#else -- if (error) -- vbio->vbio_error = -(error); -- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) -- vbio->vbio_error = EIO; --#endif -- } -- -- /* -- * Destroy the BIO. This is safe to do; the vbio owns its data and the -- * kernel won't touch it again after the completion function runs. -- */ -- bio_put(bio); -- -- /* Drop this BIOs reference acquired by vbio_submit() */ -- vbio_put(vbio); - } - - /* -@@ -948,14 +893,6 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) - return (B_TRUE); - } - --/* Iterator callback to submit ABD pages to the vbio. */ --static int --vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv) --{ -- vbio_t *vbio = priv; -- return (vbio_add_page(vbio, page, len, off)); --} -- - static int - vdev_disk_io_rw(zio_t *zio) - { -@@ -1018,20 +955,12 @@ vdev_disk_io_rw(zio_t *zio) - } - - /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ -- int error = 0; -- vbio_t *vbio = vbio_alloc(zio, bdev); -+ vbio_t *vbio = vbio_alloc(zio, bdev, flags); - if (abd != zio->io_abd) - vbio->vbio_abd = abd; - -- /* Fill it with pages */ -- error = abd_iterate_page_func(abd, 0, zio->io_size, -- vdev_disk_fill_vbio_cb, vbio); -- if (error != 0) { -- vbio_free(vbio); -- return (error); -- } -- -- vbio_submit(vbio, flags); -+ /* Fill it with data pages and submit it to the kernel */ -+ vbio_submit(vbio, abd, zio->io_size); - return (0); - } - diff --git a/debian/patches/0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch b/debian/patches/0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch deleted file mode 100644 index 28dbbf9..0000000 --- a/debian/patches/0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch +++ /dev/null @@ -1,96 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rob Norris -Date: Thu, 14 Mar 2024 10:57:30 +1100 -Subject: [PATCH] abd_iter_page: don't use compound heads on Linux <4.5 - -Before 4.5 (specifically, torvalds/linux@ddc58f2), head and tail pages -in a compound page were refcounted separately. This means that using the -head page without taking a reference to it could see it cleaned up later -before we're finished with it. Specifically, bio_add_page() would take a -reference, and drop its reference after the bio completion callback -returns. - -If the zio is executed immediately from the completion callback, this is -usually ok, as any data is referenced through the tail page referenced -by the ABD, and so becomes "live" that way. If there's a delay in zio -execution (high load, error injection), then the head page can be freed, -along with any dirty flags or other indicators that the underlying -memory is used. Later, when the zio completes and that memory is -accessed, its either unmapped and an unhandled fault takes down the -entire system, or it is mapped and we end up messing around in someone -else's memory. Both of these are very bad. - -The solution on these older kernels is to take a reference to the head -page when we use it, and release it when we're done. There's not really -a sensible way under our current structure to do this; the "best" would -be to keep a list of head page references in the ABD, and release them -when the ABD is freed. - -Since this additional overhead is totally unnecessary on 4.5+, where -head and tail pages share refcounts, I've opted to simply not use the -compound head in ABD page iteration there. This is theoretically less -efficient (though cleaning up head page references would add overhead), -but its safe, and we still get the other benefits of not mapping pages -before adding them to a bio and not mis-splitting pages. - -There doesn't appear to be an obvious symbol name or config option we -can match on to discover this behaviour in configure (and the mm/page -APIs have changed a lot since then anyway), so I've gone with a simple -version check. - -Reviewed-by: Alexander Motin -Reviewed-by: Brian Behlendorf -Signed-off-by: Rob Norris -Sponsored-by: Klara, Inc. -Sponsored-by: Wasabi Technology, Inc. -Closes #15533 -Closes #15588 -(cherry picked from commit c6be6ce1755a3d9a3cbe70256cd8958ef83d8542) ---- - module/os/linux/zfs/abd_os.c | 14 ++++++++++++++ - 1 file changed, 14 insertions(+) - -diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c -index 3fe01c0b7..d3255dcbc 100644 ---- a/module/os/linux/zfs/abd_os.c -+++ b/module/os/linux/zfs/abd_os.c -@@ -62,6 +62,7 @@ - #include - #include - #include -+#include - #endif - - #ifdef _KERNEL -@@ -1061,6 +1062,7 @@ abd_iter_page(struct abd_iter *aiter) - } - ASSERT(page); - -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) - if (PageTail(page)) { - /* - * This page is part of a "compound page", which is a group of -@@ -1082,11 +1084,23 @@ abd_iter_page(struct abd_iter *aiter) - * To do this, we need to adjust the offset to be counted from - * the head page. struct page for compound pages are stored - * contiguously, so we can just adjust by a simple offset. -+ * -+ * Before kernel 4.5, compound page heads were refcounted -+ * separately, such that moving back to the head page would -+ * require us to take a reference to it and releasing it once -+ * we're completely finished with it. In practice, that means -+ * when our caller is done with the ABD, which we have no -+ * insight into from here. Rather than contort this API to -+ * track head page references on such ancient kernels, we just -+ * compile this block out and use the tail pages directly. This -+ * is slightly less efficient, but makes everything far -+ * simpler. - */ - struct page *head = compound_head(page); - doff += ((page - head) * PAGESIZE); - page = head; - } -+#endif - - /* final page and position within it */ - aiter->iter_page = page; diff --git a/debian/patches/0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch b/debian/patches/0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch deleted file mode 100644 index e2f1422..0000000 --- a/debian/patches/0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch +++ /dev/null @@ -1,90 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rob Norris -Date: Wed, 27 Mar 2024 13:11:12 +1100 -Subject: [PATCH] vdev_disk: default to classic submission for 2.2.x - -We don't want to change to brand-new code in the middle of a stable -series, but we want it available to test for people running into page -splitting issues. - -This commits make zfs_vdev_disk_classic=1 the default, and updates the -documentation to better explain what's going on. - -Signed-off-by: Rob Norris -Sponsored-by: Klara, Inc. -Sponsored-by: Wasabi Technology, Inc. ---- - man/man4/zfs.4 | 31 ++++++++++++++++++++++--------- - module/os/linux/zfs/vdev_disk.c | 8 +++++--- - 2 files changed, 27 insertions(+), 12 deletions(-) - -diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 -index 6a628e7f3..a98ec519a 100644 ---- a/man/man4/zfs.4 -+++ b/man/man4/zfs.4 -@@ -1355,17 +1355,30 @@ This parameter only applies on Linux. - This parameter is ignored if - .Sy zfs_vdev_disk_classic Ns = Ns Sy 1 . - . --.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint --If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2 --and earlier. --This "classic" method has known issues with highly fragmented IO requests and --is slower on many workloads, but it has been in use for many years and is known --to be very stable. --If you set this parameter, please also open a bug report why you did so, -+.It Sy zfs_vdev_disk_classic Ns = Ns 0 Ns | Ns Sy 1 Pq uint -+Controls the method used to submit IO to the Linux block layer -+(default -+.Sy 1 "classic" Ns -+) -+.Pp -+If set to 1, the "classic" method is used. -+This is the method that has been in use since the earliest versions of -+ZFS-on-Linux. -+It has known issues with highly fragmented IO requests and is less efficient on -+many workloads, but it well known and well understood. -+.Pp -+If set to 0, the "new" method is used. -+This method is available since 2.2.4 and should resolve all known issues and be -+far more efficient, but has not had as much testing. -+In the 2.2.x series, this parameter defaults to 1, to use the "classic" method. -+.Pp -+It is not recommended that you change it except on advice from the OpenZFS -+developers. -+If you do change it, please also open a bug report describing why you did so, - including the workload involved and any error messages. - .Pp --This parameter and the classic submission method will be removed once we have --total confidence in the new method. -+This parameter and the "classic" submission method will be removed in a future -+release of OpenZFS once we have total confidence in the new method. - .Pp - This parameter only applies on Linux, and can only be set at module load time. - . -diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c -index 36468fc21..e1c19a085 100644 ---- a/module/os/linux/zfs/vdev_disk.c -+++ b/module/os/linux/zfs/vdev_disk.c -@@ -969,8 +969,10 @@ vdev_disk_io_rw(zio_t *zio) - /* - * This is the classic, battle-tested BIO submission code. Until we're totally - * sure that the new code is safe and correct in all cases, this will remain -- * available and can be enabled by setting zfs_vdev_disk_classic=1 at module -- * load time. -+ * available. -+ * -+ * It is enabled by setting zfs_vdev_disk_classic=1 at module load time. It is -+ * enabled (=1) by default since 2.2.4, and disabled by default (=0) on master. - * - * These functions have been renamed to vdev_classic_* to make it clear what - * they belong to, but their implementations are unchanged. -@@ -1468,7 +1470,7 @@ vdev_disk_rele(vdev_t *vd) - * BIO submission method. See comment above about vdev_classic. - * Set zfs_vdev_disk_classic=0 for new, =1 for classic - */ --static uint_t zfs_vdev_disk_classic = 0; /* default new */ -+static uint_t zfs_vdev_disk_classic = 1; /* default classic */ - - /* Set submission function from module parameter */ - static int diff --git a/debian/patches/0024-Fix-corruption-caused-by-mmap-flushing-problems.patch b/debian/patches/0024-Fix-corruption-caused-by-mmap-flushing-problems.patch deleted file mode 100644 index 027f299..0000000 --- a/debian/patches/0024-Fix-corruption-caused-by-mmap-flushing-problems.patch +++ /dev/null @@ -1,104 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Robert Evans -Date: Mon, 25 Mar 2024 17:56:49 -0400 -Subject: [PATCH] Fix corruption caused by mmap flushing problems - -1) Make mmap flushes synchronous. Linux may skip flushing dirty pages - already in writeback unless data-integrity sync is requested. - -2) Change zfs_putpage to use TXG_WAIT. Otherwise dirty pages may be - skipped due to DMU pushing back on TX assign. - -3) Add missing mmap flush when doing block cloning. - -4) While here, pass errors from putpage to writepage/writepages. - -This change fixes corruption edge cases, but unfortunately adds -synchronous ZIL flushes for dirty mmap pages to llseek and bclone -operations. It may be possible to avoid these sync writes later -but would need more tricky refactoring of the writeback code. - -Reviewed-by: Alexander Motin -Reviewed-by: Brian Behlendorf -Signed-off-by: Robert Evans -Closes #15933 -Closes #16019 ---- - module/os/linux/zfs/zfs_vnops_os.c | 5 +---- - module/os/linux/zfs/zpl_file.c | 8 ++++---- - module/zfs/zfs_vnops.c | 6 +++++- - 3 files changed, 10 insertions(+), 9 deletions(-) - -diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c -index c06a75662..7c473bc7e 100644 ---- a/module/os/linux/zfs/zfs_vnops_os.c -+++ b/module/os/linux/zfs/zfs_vnops_os.c -@@ -3792,11 +3792,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - -- err = dmu_tx_assign(tx, TXG_NOWAIT); -+ err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { -- if (err == ERESTART) -- dmu_tx_wait(tx); -- - dmu_tx_abort(tx); - #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO - filemap_dirty_folio(page_mapping(pp), page_folio(pp)); -diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c -index 3caa0fc6c..9dec52215 100644 ---- a/module/os/linux/zfs/zpl_file.c -+++ b/module/os/linux/zfs/zpl_file.c -@@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) - { - boolean_t *for_sync = data; - fstrans_cookie_t cookie; -+ int ret; - - ASSERT(PageLocked(pp)); - ASSERT(!PageWriteback(pp)); - - cookie = spl_fstrans_mark(); -- (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); -+ ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); - spl_fstrans_unmark(cookie); - -- return (0); -+ return (ret); - } - - #ifdef HAVE_WRITEPAGE_T_FOLIO - static int - zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) - { -- (void) zpl_putpage(&pp->page, wbc, data); -- return (0); -+ return (zpl_putpage(&pp->page, wbc, data)); - } - #endif - -diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c -index 2b37834d5..7020f88ec 100644 ---- a/module/zfs/zfs_vnops.c -+++ b/module/zfs/zfs_vnops.c -@@ -130,7 +130,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) - - /* Flush any mmap()'d data to disk */ - if (zn_has_cached_data(zp, 0, file_sz - 1)) -- zn_flush_cached_data(zp, B_FALSE); -+ zn_flush_cached_data(zp, B_TRUE); - - lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); - error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); -@@ -1193,6 +1193,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, - } - } - -+ /* Flush any mmap()'d data to disk */ -+ if (zn_has_cached_data(inzp, inoff, inoff + len - 1)) -+ zn_flush_cached_data(inzp, B_TRUE); -+ - /* - * Maintain predictable lock order. - */ diff --git a/debian/patches/0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch b/debian/patches/0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch deleted file mode 100644 index 83eac37..0000000 --- a/debian/patches/0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch +++ /dev/null @@ -1,57 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rob Norris -Date: Tue, 2 Apr 2024 15:14:54 +1100 -Subject: [PATCH] vdev_disk: don't touch vbio after its handed off to the - kernel - -After IO is unplugged, it may complete immediately and vbio_completion -be called on interrupt context. That may interrupt or deschedule our -task. If its the last bio, the vbio will be freed. Then, we get -rescheduled, and try to write to freed memory through vbio->. - -This patch just removes the the cleanup, and the corresponding assert. -These were leftovers from a previous iteration of vbio_submit() and were -always "belt and suspenders" ops anyway, never strictly required. - -Reported-by: Rich Ercolani -Signed-off-by: Rob Norris -Sponsored-by: Klara, Inc. -Sponsored-by: Wasabi Technology, Inc. -(cherry picked from commit 34f662ad22206af6852020fd923ceccd836a855f) -Signed-off-by: Thomas Lamprecht ---- - module/os/linux/zfs/vdev_disk.c | 11 ++++++----- - 1 file changed, 6 insertions(+), 5 deletions(-) - -diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c -index e1c19a085..62c7aa14f 100644 ---- a/module/os/linux/zfs/vdev_disk.c -+++ b/module/os/linux/zfs/vdev_disk.c -@@ -758,8 +758,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) - static void - vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) - { -- ASSERT(vbio->vbio_bdev); -- - /* - * We plug so we can submit the BIOs as we go and only unplug them when - * they are fully created and submitted. This is important; if we don't -@@ -777,12 +775,15 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) - vbio->vbio_bio->bi_end_io = vbio_completion; - vbio->vbio_bio->bi_private = vbio; - -+ /* -+ * Once submitted, vbio_bio now owns vbio (through bi_private) and we -+ * can't touch it again. The bio may complete and vbio_completion() be -+ * called and free the vbio before this task is run again, so we must -+ * consider it invalid from this point. -+ */ - vdev_submit_bio(vbio->vbio_bio); - - blk_finish_plug(&plug); -- -- vbio->vbio_bio = NULL; -- vbio->vbio_bdev = NULL; - } - - /* IO completion callback */ diff --git a/debian/patches/series b/debian/patches/series index 7c1a5c6..35f81d1 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -9,17 +9,3 @@ 0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch 0010-Fix-nfs_truncate_shares-without-etc-exports.d.patch 0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch -0012-udev-correctly-handle-partition-16-and-later.patch -0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch -0014-linux-5.4-compat-page_size.patch -0015-abd-add-page-iterator.patch -0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch -0017-vdev_disk-reorganise-vdev_disk_io_start.patch -0018-vdev_disk-make-read-write-IO-function-configurable.patch -0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch -0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch -0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch -0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch -0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch -0024-Fix-corruption-caused-by-mmap-flushing-problems.patch -0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch diff --git a/upstream b/upstream index c883088..2566592 160000 --- a/upstream +++ b/upstream @@ -1 +1 @@ -Subproject commit c883088df83ced3a2b8b38e6d89a5e63fb153ee4 +Subproject commit 2566592045780e7be7afc899c2496b1ae3af4f4d