backport 2.2.4 staging for better 6.8 support

Use the current ZFS 2.2.4 staging tree [0] with commit deb7a8423 ("Fix
corruption caused by mmap flushing problems") on top.

Additionally, include an open, but ack'd, pull request [1] that avoids
a potential general protection fault due to touching a vbio after it
was handed off to the kernel.

[0]: https://github.com/openzfs/zfs/commits/zfs-2.2.4-staging/
[1]: https://github.com/openzfs/zfs/pull/16049

Both should mostly touch the module code.

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Rob N <robn@despairlabs.com>
Date: Thu, 21 Mar 2024 10:46:15 +1100
Subject: [PATCH] Linux 6.8 compat: use splice_copy_file_range() for fallback
Linux 6.8 removes generic_copy_file_range(), which had been reduced to a
simple wrapper around splice_copy_file_range(). Detect that function
directly and use it if generic_ is not available.
Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #15930
Closes #15931
(cherry picked from commit ef08a4d4065d21414d7fedccac20da6bfda4dfd0)
---
config/kernel-vfs-file_range.m4 | 27 +++++++++++++++++++++++++++
config/kernel.m4 | 2 ++
module/os/linux/zfs/zpl_file_range.c | 16 ++++++++++++++--
3 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4
index cc96404d8..8a5cbe2ee 100644
--- a/config/kernel-vfs-file_range.m4
+++ b/config/kernel-vfs-file_range.m4
@@ -16,6 +16,9 @@ dnl #
dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
dnl # generic_copy_file_range() added to support it
dnl #
+dnl # 6.8: generic_copy_file_range() removed, replaced by
+dnl # splice_copy_file_range()
+dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
#include <linux/fs.h>
@@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
])
])
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [
+ ZFS_LINUX_TEST_SRC([splice_copy_file_range], [
+ #include <linux/splice.h>
+ ], [
+ struct file *src_file __attribute__ ((unused)) = NULL;
+ loff_t src_off __attribute__ ((unused)) = 0;
+ struct file *dst_file __attribute__ ((unused)) = NULL;
+ loff_t dst_off __attribute__ ((unused)) = 0;
+ size_t len __attribute__ ((unused)) = 0;
+ splice_copy_file_range(src_file, src_off, dst_file, dst_off,
+ len);
+ ])
+])
+AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [
+ AC_MSG_CHECKING([whether splice_copy_file_range() is available])
+ ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1,
+ [splice_copy_file_range() is available])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
#include <linux/fs.h>
diff --git a/config/kernel.m4 b/config/kernel.m4
index e3f864577..1d0c5a27f 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
+ ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
@@ -266,6 +267,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_VFS_IOV_ITER
ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
+ ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE
ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c
index 3065d54fa..64728fdb1 100644
--- a/module/os/linux/zfs/zpl_file_range.c
+++ b/module/os/linux/zfs/zpl_file_range.c
@@ -26,6 +26,9 @@
#include <linux/compat.h>
#endif
#include <linux/fs.h>
+#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE
+#include <linux/splice.h>
+#endif
#include <sys/file.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vnops.h>
@@ -102,7 +105,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
ret = zpl_clone_file_range_impl(src_file, src_off,
dst_file, dst_off, len);
-#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
+#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
/*
* Since Linux 5.3 the filesystem driver is responsible for executing
* an appropriate fallback, and a generic fallback function is provided.
@@ -111,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
ret == -EAGAIN)
ret = generic_copy_file_range(src_file, src_off, dst_file,
dst_off, len, flags);
+#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
+ /*
+ * Since 6.8 the fallback function is called splice_copy_file_range
+ * and has a slightly different signature.
+ */
+ if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
+ ret == -EAGAIN)
+ ret = splice_copy_file_range(src_file, src_off, dst_file,
+ dst_off, len);
#else
/*
* Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
@@ -118,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
*/
if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
ret = -EOPNOTSUPP;
-#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
+#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */
return (ret);
}
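
For reference, a condensed sketch of the fallback chain in zpl_copy_file_range() as it reads after this patch (same variables as in the diff above; an illustration, not additional patch content). The only caller-visible difference is that splice_copy_file_range() drops the trailing flags argument:

    if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
        ret == -EAGAIN) {
    #if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
            /* Linux 5.3 to 6.7: generic fallback, takes flags */
            ret = generic_copy_file_range(src_file, src_off, dst_file,
                dst_off, len, flags);
    #elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
            /* Linux 6.8+: same behaviour, no flags argument */
            ret = splice_copy_file_range(src_file, src_off, dst_file,
                dst_off, len);
    #endif
    }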

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 13 Nov 2023 17:55:29 +1100
Subject: [PATCH] linux 5.4 compat: page_size()
Before 5.4 we have to do a little math.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
(cherry picked from commit df04efe321a49c650f1fbaa6fd701fa2928cbe21)
---
config/kernel-mm-page-size.m4 | 17 +++++++++++
config/kernel.m4 | 2 ++
include/os/linux/Makefile.am | 1 +
include/os/linux/kernel/linux/mm_compat.h | 36 +++++++++++++++++++++++
4 files changed, 56 insertions(+)
create mode 100644 config/kernel-mm-page-size.m4
create mode 100644 include/os/linux/kernel/linux/mm_compat.h
diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4
new file mode 100644
index 000000000..d5ebd9269
--- /dev/null
+++ b/config/kernel-mm-page-size.m4
@@ -0,0 +1,17 @@
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
+ ZFS_LINUX_TEST_SRC([page_size], [
+ #include <linux/mm.h>
+ ],[
+ unsigned long s;
+ s = page_size(NULL);
+ ])
+])
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
+ AC_MSG_CHECKING([whether page_size() is available])
+ ZFS_LINUX_TEST_RESULT([page_size], [
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index 1d0c5a27f..548905ccd 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
ZFS_AC_KERNEL_SRC_SYNC_BDEV
+ ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -316,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
ZFS_AC_KERNEL_COPY_SPLICE_READ
ZFS_AC_KERNEL_SYNC_BDEV
+ ZFS_AC_KERNEL_MM_PAGE_SIZE
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_CPU_HAS_FEATURE
diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am
index 3830d198d..51c27132b 100644
--- a/include/os/linux/Makefile.am
+++ b/include/os/linux/Makefile.am
@@ -5,6 +5,7 @@ kernel_linux_HEADERS = \
%D%/kernel/linux/compiler_compat.h \
%D%/kernel/linux/dcache_compat.h \
%D%/kernel/linux/kmap_compat.h \
+ %D%/kernel/linux/mm_compat.h \
%D%/kernel/linux/mod_compat.h \
%D%/kernel/linux/page_compat.h \
%D%/kernel/linux/percpu_compat.h \
diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h
new file mode 100644
index 000000000..40056c68d
--- /dev/null
+++ b/include/os/linux/kernel/linux/mm_compat.h
@@ -0,0 +1,36 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2023, 2024, Klara Inc.
+ */
+
+#ifndef _ZFS_MM_COMPAT_H
+#define _ZFS_MM_COMPAT_H
+
+#include <linux/mm.h>
+
+/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
+#ifndef HAVE_MM_PAGE_SIZE
+#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
+#endif
+
+#endif /* _ZFS_MM_COMPAT_H */
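
A minimal usage sketch of the compat macro (the allocation below is illustrative, not part of the patch): with mm_compat.h included, page_size() behaves the same on pre-5.4 kernels, returning the full byte size of a compound page.

    #include <linux/gfp.h>
    #include <linux/printk.h>
    #include <linux/mm_compat.h> /* provides page_size() on pre-5.4 kernels */

    static void
    page_size_demo(void)
    {
            /* order-2 compound allocation: four contiguous 4K pages */
            struct page *p = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);

            if (p == NULL)
                    return;

            /* PAGE_SIZE << compound_order(p), i.e. 16384 with 4K pages */
            pr_info("page_size = %lu\n", page_size(p));

            __free_pages(p, 2);
    }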

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 11 Dec 2023 16:05:54 +1100
Subject: [PATCH] abd: add page iterator
The regular ABD iterators yield data buffers, so they have to map and
unmap pages into kernel memory. If the caller only wants to count
chunks, or can use page pointers directly, then the map/unmap is just
unnecessary overhead.
This adds abd_iterate_page_func, which yields unmapped struct pages
instead.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
(cherry picked from commit 390b448726c580999dd337be7a40b0e95cf1d50b)
---
include/sys/abd.h | 7 +++
include/sys/abd_impl.h | 26 ++++++++-
module/os/freebsd/zfs/abd_os.c | 4 +-
module/os/linux/zfs/abd_os.c | 104 ++++++++++++++++++++++++++++++---
module/zfs/abd.c | 42 +++++++++++++
5 files changed, 169 insertions(+), 14 deletions(-)
diff --git a/include/sys/abd.h b/include/sys/abd.h
index 750f9986c..8a2df0bca 100644
--- a/include/sys/abd.h
+++ b/include/sys/abd.h
@@ -79,6 +79,9 @@ typedef struct abd {
typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
+#if defined(__linux__) && defined(_KERNEL)
+typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
+#endif
extern int zfs_abd_scatter_enabled;
@@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
abd_iter_func2_t *, void *);
+#if defined(__linux__) && defined(_KERNEL)
+int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
+ void *);
+#endif
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h
index 40546d4af..f88ea25e2 100644
--- a/include/sys/abd_impl.h
+++ b/include/sys/abd_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ABD_IMPL_H
@@ -38,12 +39,30 @@ typedef enum abd_stats_op {
ABDSTAT_DECR /* Decrease abdstat values */
} abd_stats_op_t;
-struct scatterlist; /* forward declaration */
+/* forward declarations */
+struct scatterlist;
+struct page;
struct abd_iter {
/* public interface */
- void *iter_mapaddr; /* addr corresponding to iter_pos */
- size_t iter_mapsize; /* length of data valid at mapaddr */
+ union {
+ /* for abd_iter_map()/abd_iter_unmap() */
+ struct {
+ /* addr corresponding to iter_pos */
+ void *iter_mapaddr;
+ /* length of data valid at mapaddr */
+ size_t iter_mapsize;
+ };
+ /* for abd_iter_page() */
+ struct {
+ /* current page */
+ struct page *iter_page;
+ /* offset of data in page */
+ size_t iter_page_doff;
+ /* size of data in page */
+ size_t iter_page_dsize;
+ };
+ };
/* private */
abd_t *iter_abd; /* ABD being iterated through */
@@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
void abd_iter_advance(struct abd_iter *, size_t);
void abd_iter_map(struct abd_iter *);
void abd_iter_unmap(struct abd_iter *);
+void abd_iter_page(struct abd_iter *);
/*
* Helper macros
diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
index 58a37df62..3b812271f 100644
--- a/module/os/freebsd/zfs/abd_os.c
+++ b/module/os/freebsd/zfs/abd_os.c
@@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
+ memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
- aiter->iter_pos = 0;
- aiter->iter_mapaddr = NULL;
- aiter->iter_mapsize = 0;
}
/*
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index 24390fbbf..dae128012 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
*/
/*
@@ -59,6 +60,7 @@
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
+#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#endif
@@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
+ memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
- aiter->iter_mapaddr = NULL;
- aiter->iter_mapsize = 0;
- aiter->iter_pos = 0;
- if (abd_is_linear(abd)) {
- aiter->iter_offset = 0;
- aiter->iter_sg = NULL;
- } else {
+ if (!abd_is_linear(abd)) {
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
}
@@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
+ ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
return (aiter->iter_pos == aiter->iter_abd->abd_size);
}
@@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter)
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
+ /*
+ * Ensure that the last chunk is not in use. abd_iterate_*() must clear
+ * this state (directly or via abd_iter_unmap()) before advancing.
+ */
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
+ ASSERT3P(aiter->iter_page, ==, NULL);
+ ASSERT0(aiter->iter_page_doff);
+ ASSERT0(aiter->iter_page_dsize);
/* There's nothing left to advance to, so do nothing */
if (abd_iter_at_end(aiter))
@@ -1009,6 +1014,88 @@ abd_cache_reap_now(void)
}
#if defined(_KERNEL)
+/*
+ * Yield the next page struct and data offset and size within it, without
+ * mapping it into the address space.
+ */
+void
+abd_iter_page(struct abd_iter *aiter)
+{
+ if (abd_iter_at_end(aiter)) {
+ aiter->iter_page = NULL;
+ aiter->iter_page_doff = 0;
+ aiter->iter_page_dsize = 0;
+ return;
+ }
+
+ struct page *page;
+ size_t doff, dsize;
+
+ if (abd_is_linear(aiter->iter_abd)) {
+ ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
+
+ /* memory address at iter_pos */
+ void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
+
+ /* struct page for address */
+ page = is_vmalloc_addr(paddr) ?
+ vmalloc_to_page(paddr) : virt_to_page(paddr);
+
+ /* offset of address within the page */
+ doff = offset_in_page(paddr);
+
+ /* total data remaining in abd from this position */
+ dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
+ } else {
+ ASSERT(!abd_is_gang(aiter->iter_abd));
+
+ /* current scatter page */
+ page = sg_page(aiter->iter_sg);
+
+ /* position within page */
+ doff = aiter->iter_offset;
+
+ /* remaining data in scatterlist */
+ dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
+ aiter->iter_abd->abd_size - aiter->iter_pos);
+ }
+ ASSERT(page);
+
+ if (PageTail(page)) {
+ /*
+ * This page is part of a "compound page", which is a group of
+ * pages that can be referenced from a single struct page *.
+ * It's organised as a "head" page, followed by a series of
+ * "tail" pages.
+ *
+ * In OpenZFS, compound pages are allocated using the
+ * __GFP_COMP flag, which we get from scatter ABDs and SPL
+ * vmalloc slabs (ie >16K allocations). So a great many of the
+ * IO buffers we get are going to be of this type.
+ *
+ * The tail pages are just regular PAGE_SIZE pages, and can be
+ * safely used as-is. However, the head page has length
+ * covering itself and all the tail pages. If this ABD chunk
+ * spans multiple pages, then we can use the head page and a
+ * >PAGE_SIZE length, which is far more efficient.
+ *
+ * To do this, we need to adjust the offset to be counted from
+ * the head page. struct page for compound pages are stored
+ * contiguously, so we can just adjust by a simple offset.
+ */
+ struct page *head = compound_head(page);
+ doff += ((page - head) * PAGESIZE);
+ page = head;
+ }
+
+ /* final page and position within it */
+ aiter->iter_page = page;
+ aiter->iter_page_doff = doff;
+
+ /* amount of data in the chunk, up to the end of the page */
+ aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
+}
+
/*
* bio_nr_pages for ABD.
* @off is the offset in @abd
@@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
"Maximum order allocation used for a scatter ABD.");
-#endif
+
+#endif /* _KERNEL */
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index d982f201c..3388e2357 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
return (ret);
}
+#if defined(__linux__) && defined(_KERNEL)
+int
+abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
+ abd_iter_page_func_t *func, void *private)
+{
+ struct abd_iter aiter;
+ int ret = 0;
+
+ if (size == 0)
+ return (0);
+
+ abd_verify(abd);
+ ASSERT3U(off + size, <=, abd->abd_size);
+
+ abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
+
+ while (size > 0) {
+ IMPLY(abd_is_gang(abd), c_abd != NULL);
+
+ abd_iter_page(&aiter);
+
+ size_t len = MIN(aiter.iter_page_dsize, size);
+ ASSERT3U(len, >, 0);
+
+ ret = func(aiter.iter_page, aiter.iter_page_doff,
+ len, private);
+
+ aiter.iter_page = NULL;
+ aiter.iter_page_doff = 0;
+ aiter.iter_page_dsize = 0;
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
+ }
+
+ return (ret);
+}
+#endif
+
struct buf_arg {
void *arg_buf;
};
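
As a usage sketch (a hypothetical helper, not part of the patch), a caller can now walk an ABD's pages without mapping them; the callback matches the new abd_iter_page_func_t signature:

    #include <sys/abd.h>

    /* Count the page chunks an ABD would contribute, without mapping any. */
    static int
    count_chunks_cb(struct page *page, size_t doff, size_t dsize, void *priv)
    {
            (void) page; (void) doff; (void) dsize;
            (*(uint_t *)priv)++;
            return (0);     /* returning non-zero aborts the iteration */
    }

    static uint_t
    abd_count_chunks(abd_t *abd, size_t size)
    {
            uint_t n = 0;

            VERIFY0(abd_iterate_page_func(abd, 0, size, count_chunks_cb, &n));
            return (n);
    }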

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 9 Jan 2024 12:12:56 +1100
Subject: [PATCH] vdev_disk: rename existing functions to vdev_classic_*
This is just renaming the existing functions we're about to replace and
grouping them together to make the next commits easier to follow.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
(cherry picked from commit f3b85d706bae82957d2e3e0ef1d53a1cfab60eb4)
---
include/sys/abd.h | 2 +
module/os/linux/zfs/abd_os.c | 5 +
module/os/linux/zfs/vdev_disk.c | 215 +++++++++++++++++---------------
3 files changed, 120 insertions(+), 102 deletions(-)
diff --git a/include/sys/abd.h b/include/sys/abd.h
index 8a2df0bca..bee38b831 100644
--- a/include/sys/abd.h
+++ b/include/sys/abd.h
@@ -220,6 +220,8 @@ void abd_fini(void);
/*
* Linux ABD bio functions
+ * Note: these are only needed to support vdev_classic. See comment in
+ * vdev_disk.c.
*/
#if defined(__linux__) && defined(_KERNEL)
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index dae128012..3fe01c0b7 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -1096,6 +1096,11 @@ abd_iter_page(struct abd_iter *aiter)
aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
}
+/*
+ * Note: ABD BIO functions only needed to support vdev_classic. See comments in
+ * vdev_disk.c.
+ */
+
/*
* bio_nr_pages for ABD.
* @off is the offset in @abd
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index b0bda5fa2..957619b87 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -83,17 +83,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000;
*/
#define EFI_MIN_RESV_SIZE (16 * 1024)
-/*
- * Virtual device vector for disks.
- */
-typedef struct dio_request {
- zio_t *dr_zio; /* Parent ZIO */
- atomic_t dr_ref; /* References */
- int dr_error; /* Bio error */
- int dr_bio_count; /* Count of bio's */
- struct bio *dr_bio[]; /* Attached bio's */
-} dio_request_t;
-
/*
* BIO request failfast mask.
*/
@@ -467,85 +456,6 @@ vdev_disk_close(vdev_t *v)
v->vdev_tsd = NULL;
}
-static dio_request_t *
-vdev_disk_dio_alloc(int bio_count)
-{
- dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
- sizeof (struct bio *) * bio_count, KM_SLEEP);
- atomic_set(&dr->dr_ref, 0);
- dr->dr_bio_count = bio_count;
- dr->dr_error = 0;
-
- for (int i = 0; i < dr->dr_bio_count; i++)
- dr->dr_bio[i] = NULL;
-
- return (dr);
-}
-
-static void
-vdev_disk_dio_free(dio_request_t *dr)
-{
- int i;
-
- for (i = 0; i < dr->dr_bio_count; i++)
- if (dr->dr_bio[i])
- bio_put(dr->dr_bio[i]);
-
- kmem_free(dr, sizeof (dio_request_t) +
- sizeof (struct bio *) * dr->dr_bio_count);
-}
-
-static void
-vdev_disk_dio_get(dio_request_t *dr)
-{
- atomic_inc(&dr->dr_ref);
-}
-
-static void
-vdev_disk_dio_put(dio_request_t *dr)
-{
- int rc = atomic_dec_return(&dr->dr_ref);
-
- /*
- * Free the dio_request when the last reference is dropped and
- * ensure zio_interpret is called only once with the correct zio
- */
- if (rc == 0) {
- zio_t *zio = dr->dr_zio;
- int error = dr->dr_error;
-
- vdev_disk_dio_free(dr);
-
- if (zio) {
- zio->io_error = error;
- ASSERT3S(zio->io_error, >=, 0);
- if (zio->io_error)
- vdev_disk_error(zio);
-
- zio_delay_interrupt(zio);
- }
- }
-}
-
-BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
-{
- dio_request_t *dr = bio->bi_private;
-
- if (dr->dr_error == 0) {
-#ifdef HAVE_1ARG_BIO_END_IO_T
- dr->dr_error = BIO_END_IO_ERROR(bio);
-#else
- if (error)
- dr->dr_error = -(error);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- dr->dr_error = EIO;
-#endif
- }
-
- /* Drop reference acquired by __vdev_disk_physio */
- vdev_disk_dio_put(dr);
-}
-
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
@@ -697,8 +607,107 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
return (bio);
}
+/* ========== */
+
+/*
+ * This is the classic, battle-tested BIO submission code.
+ *
+ * These functions have been renamed to vdev_classic_* to make it clear what
+ * they belong to, but their implementations are unchanged.
+ */
+
+/*
+ * Virtual device vector for disks.
+ */
+typedef struct dio_request {
+ zio_t *dr_zio; /* Parent ZIO */
+ atomic_t dr_ref; /* References */
+ int dr_error; /* Bio error */
+ int dr_bio_count; /* Count of bio's */
+ struct bio *dr_bio[]; /* Attached bio's */
+} dio_request_t;
+
+static dio_request_t *
+vdev_classic_dio_alloc(int bio_count)
+{
+ dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
+ sizeof (struct bio *) * bio_count, KM_SLEEP);
+ atomic_set(&dr->dr_ref, 0);
+ dr->dr_bio_count = bio_count;
+ dr->dr_error = 0;
+
+ for (int i = 0; i < dr->dr_bio_count; i++)
+ dr->dr_bio[i] = NULL;
+
+ return (dr);
+}
+
+static void
+vdev_classic_dio_free(dio_request_t *dr)
+{
+ int i;
+
+ for (i = 0; i < dr->dr_bio_count; i++)
+ if (dr->dr_bio[i])
+ bio_put(dr->dr_bio[i]);
+
+ kmem_free(dr, sizeof (dio_request_t) +
+ sizeof (struct bio *) * dr->dr_bio_count);
+}
+
+static void
+vdev_classic_dio_get(dio_request_t *dr)
+{
+ atomic_inc(&dr->dr_ref);
+}
+
+static void
+vdev_classic_dio_put(dio_request_t *dr)
+{
+ int rc = atomic_dec_return(&dr->dr_ref);
+
+ /*
+ * Free the dio_request when the last reference is dropped and
+ * ensure zio_interpret is called only once with the correct zio
+ */
+ if (rc == 0) {
+ zio_t *zio = dr->dr_zio;
+ int error = dr->dr_error;
+
+ vdev_classic_dio_free(dr);
+
+ if (zio) {
+ zio->io_error = error;
+ ASSERT3S(zio->io_error, >=, 0);
+ if (zio->io_error)
+ vdev_disk_error(zio);
+
+ zio_delay_interrupt(zio);
+ }
+ }
+}
+
+BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
+{
+ dio_request_t *dr = bio->bi_private;
+
+ if (dr->dr_error == 0) {
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ dr->dr_error = BIO_END_IO_ERROR(bio);
+#else
+ if (error)
+ dr->dr_error = -(error);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ dr->dr_error = EIO;
+#endif
+ }
+
+ /* Drop reference acquired by vdev_classic_physio */
+ vdev_classic_dio_put(dr);
+}
+
static inline unsigned int
-vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
+vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
bio_size, abd_offset);
@@ -711,7 +720,7 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
}
static int
-__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
+vdev_classic_physio(struct block_device *bdev, zio_t *zio,
size_t io_size, uint64_t io_offset, int rw, int flags)
{
dio_request_t *dr;
@@ -736,7 +745,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
}
retry:
- dr = vdev_disk_dio_alloc(bio_count);
+ dr = vdev_classic_dio_alloc(bio_count);
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
zio->io_vd->vdev_failfast == B_TRUE) {
@@ -771,23 +780,23 @@ retry:
* this should be rare - see the comment above.
*/
if (dr->dr_bio_count == i) {
- vdev_disk_dio_free(dr);
+ vdev_classic_dio_free(dr);
bio_count *= 2;
goto retry;
}
- nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
+ nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
if (unlikely(dr->dr_bio[i] == NULL)) {
- vdev_disk_dio_free(dr);
+ vdev_classic_dio_free(dr);
return (SET_ERROR(ENOMEM));
}
- /* Matching put called by vdev_disk_physio_completion */
- vdev_disk_dio_get(dr);
+ /* Matching put called by vdev_classic_physio_completion */
+ vdev_classic_dio_get(dr);
BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
- dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+ dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
dr->dr_bio[i]->bi_private = dr;
bio_set_op_attrs(dr->dr_bio[i], rw, flags);
@@ -801,7 +810,7 @@ retry:
}
/* Extra reference to protect dio_request during vdev_submit_bio */
- vdev_disk_dio_get(dr);
+ vdev_classic_dio_get(dr);
if (dr->dr_bio_count > 1)
blk_start_plug(&plug);
@@ -815,11 +824,13 @@ retry:
if (dr->dr_bio_count > 1)
blk_finish_plug(&plug);
- vdev_disk_dio_put(dr);
+ vdev_classic_dio_put(dr);
return (error);
}
+/* ========== */
+
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
zio_t *zio = bio->bi_private;
@@ -1023,7 +1034,7 @@ vdev_disk_io_start(zio_t *zio)
}
zio->io_target_timestamp = zio_handle_io_delay(zio);
- error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio,
+ error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio,
zio->io_size, zio->io_offset, rw, 0);
rw_exit(&vd->vd_lock);

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 9 Jan 2024 12:23:30 +1100
Subject: [PATCH] vdev_disk: reorganise vdev_disk_io_start
Light reshuffle to make it a bit more linear to read and get rid of a
bunch of args that aren't needed in all cases.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
(cherry picked from commit 867178ae1db28e73051c8a7ce662f2f2f81cd8e6)
---
module/os/linux/zfs/vdev_disk.c | 51 ++++++++++++++++++++-------------
1 file changed, 31 insertions(+), 20 deletions(-)
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 957619b87..51e7cef2f 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -720,9 +720,16 @@ vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
}
static int
-vdev_classic_physio(struct block_device *bdev, zio_t *zio,
- size_t io_size, uint64_t io_offset, int rw, int flags)
+vdev_classic_physio(zio_t *zio)
{
+ vdev_t *v = zio->io_vd;
+ vdev_disk_t *vd = v->vdev_tsd;
+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
+ size_t io_size = zio->io_size;
+ uint64_t io_offset = zio->io_offset;
+ int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
+ int flags = 0;
+
dio_request_t *dr;
uint64_t abd_offset;
uint64_t bio_offset;
@@ -944,7 +951,7 @@ vdev_disk_io_start(zio_t *zio)
{
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
- int rw, error;
+ int error;
/*
* If the vdev is closed, it's likely in the REMOVED or FAULTED state.
@@ -1007,13 +1014,6 @@ vdev_disk_io_start(zio_t *zio)
rw_exit(&vd->vd_lock);
zio_execute(zio);
return;
- case ZIO_TYPE_WRITE:
- rw = WRITE;
- break;
-
- case ZIO_TYPE_READ:
- rw = READ;
- break;
case ZIO_TYPE_TRIM:
zio->io_error = vdev_disk_io_trim(zio);
@@ -1026,23 +1026,34 @@ vdev_disk_io_start(zio_t *zio)
#endif
return;
- default:
+ case ZIO_TYPE_READ:
+ case ZIO_TYPE_WRITE:
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+ error = vdev_classic_physio(zio);
rw_exit(&vd->vd_lock);
- zio->io_error = SET_ERROR(ENOTSUP);
- zio_interrupt(zio);
+ if (error) {
+ zio->io_error = error;
+ zio_interrupt(zio);
+ }
return;
- }
- zio->io_target_timestamp = zio_handle_io_delay(zio);
- error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio,
- zio->io_size, zio->io_offset, rw, 0);
- rw_exit(&vd->vd_lock);
+ default:
+ /*
+ * Getting here means our parent vdev has made a very strange
+ * request of us, and shouldn't happen. Assert here to force a
+ * crash in dev builds, but in production return the IO
+ * unhandled. The pool will likely suspend anyway but that's
+ * nicer than crashing the kernel.
+ */
+ ASSERT3S(zio->io_type, ==, -1);
- if (error) {
- zio->io_error = error;
+ rw_exit(&vd->vd_lock);
+ zio->io_error = SET_ERROR(ENOTSUP);
zio_interrupt(zio);
return;
}
+
+ __builtin_unreachable();
}
static void

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 9 Jan 2024 12:29:19 +1100
Subject: [PATCH] vdev_disk: make read/write IO function configurable
This is just setting up for the next couple of commits, which will add a
new IO function and a parameter to select it.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
(cherry picked from commit c4a13ba483f08a81aa47479d2f763a470d95b2b0)
---
module/os/linux/zfs/vdev_disk.c | 23 +++++++++++++++++++++--
1 file changed, 21 insertions(+), 2 deletions(-)
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 51e7cef2f..de4dba72f 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -946,6 +946,8 @@ vdev_disk_io_trim(zio_t *zio)
#endif
}
+int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
+
static void
vdev_disk_io_start(zio_t *zio)
{
@@ -1029,7 +1031,7 @@ vdev_disk_io_start(zio_t *zio)
case ZIO_TYPE_READ:
case ZIO_TYPE_WRITE:
zio->io_target_timestamp = zio_handle_io_delay(zio);
- error = vdev_classic_physio(zio);
+ error = vdev_disk_io_rw_fn(zio);
rw_exit(&vd->vd_lock);
if (error) {
zio->io_error = error;
@@ -1102,8 +1104,25 @@ vdev_disk_rele(vdev_t *vd)
/* XXX: Implement me as a vnode rele for the device */
}
+/*
+ * At first vdev use, set the submission function to the default if
+ * it hasn't been set already.
+ */
+static int
+vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+ (void) spa;
+ (void) nv;
+ (void) tsd;
+
+ if (vdev_disk_io_rw_fn == NULL)
+ vdev_disk_io_rw_fn = vdev_classic_physio;
+
+ return (0);
+}
+
vdev_ops_t vdev_disk_ops = {
- .vdev_op_init = NULL,
+ .vdev_op_init = vdev_disk_init,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_disk_open,
.vdev_op_close = vdev_disk_close,

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 18 Jul 2023 11:11:29 +1000
Subject: [PATCH] vdev_disk: rewrite BIO filling machinery to avoid split pages
This commit tackles a number of issues in the way BIOs (`struct bio`)
are constructed for submission to the Linux block layer.
The kernel has a hard upper limit on the number of pages/segments that
can be added to a BIO, as well as a separate limit for each device
(related to its queue depth and other scheduling characteristics).
ZFS counts the number of memory pages in the request ABD
(`abd_nr_pages_off()`), and then uses that as the number of segments to
put into the BIO, up to the hard upper limit. If it requires more than
the limit, it will create multiple BIOs.
Leaving aside the fact that the page count method is wrong (see below),
not limiting to the device segment max means that the device driver will
need to split the BIO in half. This alone is not necessarily a problem,
but it interacts with another issue to cause a much larger problem.
The kernel function to add a segment to a BIO (`bio_add_page()`) takes a
`struct page` pointer, and offset+len within it. `struct page` can
represent a run of contiguous memory pages (known as a "compound page").
It can be of arbitrary length.
The ZFS functions that count ABD pages and load them into the BIO
(`abd_nr_pages_off()`, `bio_map()` and `abd_bio_map_off()`) will never
consider a page to be more than `PAGE_SIZE` (4K), even if the `struct
page` is for multiple pages. In this case, it will load the same `struct
page` into the BIO multiple times, with the offset adjusted each time.
With a sufficiently large ABD, this can easily lead to the BIO being
entirely filled much earlier than it could have been. This also further
contributes to the problem caused by the incorrect segment limit
calculation, as it's much easier to go past the device limit, and so
require a split.
Again, this is not a problem on its own.
The logic for "never submit more than `PAGE_SIZE`" is actually a little
more subtle. It will actually never submit a buffer that crosses a 4K
page boundary.
In practice, this is fine, as most ABDs are scattered, that is, lists of
complete 4K pages, and so are loaded in as such.
Linear ABDs are typically allocated from slabs, and for small sizes they
are frequently not aligned to page boundaries. For example, a 12K
allocation can span four pages, eg:
-- 4K -- -- 4K -- -- 4K -- -- 4K --
|        |        |        |       |
      :## ######## ######## ######:     [1K, 4K, 4K, 3K]
Such an allocation would be loaded into a BIO as you see:
[1K, 4K, 4K, 3K]
This tends not to be a problem in practice, because even if the BIO were
filled and needed to be split, each half would still have either a start
or end aligned to the logical block size of the device (assuming 4K at
least).
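
As a userspace-only illustration of that layout arithmetic (plain C, not ZFS code; the starting address is assumed), a 12K buffer that begins 3K into its first page is chopped into 1K, 4K, 4K and 3K chunks:

    #include <stdio.h>

    #define PAGE_SZ 4096UL

    int
    main(void)
    {
            unsigned long buf = 100 * PAGE_SZ + 3072; /* starts 3K into a page */
            unsigned long len = 12 * 1024;

            while (len > 0) {
                    unsigned long off = buf & (PAGE_SZ - 1); /* offset_in_page() */
                    unsigned long chunk = PAGE_SZ - off;

                    if (chunk > len)
                            chunk = len;
                    printf("%luK ", chunk / 1024); /* prints: 1K 4K 4K 3K */
                    buf += chunk;
                    len -= chunk;
            }
            printf("\n");
            return (0);
    }
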
---
In ideal circumstances, these shortcomings don't cause any particular
problems. It's when they start to interact with other ZFS features that
things get interesting.
Aggregation will create a "gang" ABD, which is simply a list of other
ABDs. Iterating over a gang ABD is just iterating over each ABD within
it in turn.
Because the segments are simply loaded in order, we can end up with
uneven segments either side of the "gap" between the two ABDs. For
example, two 12K ABDs might be aggregated and then loaded as:
[1K, 4K, 4K, 3K, 2K, 4K, 4K, 2K]
Should a split occur, each individual BIO can end up with either a start
or end offset that is not aligned to the logical block size, which
some drivers (eg SCSI) will reject. However, this tends not to happen
because the default aggregation limit usually keeps the BIO small enough
to not require more than one split, and most pages are actually full 4K
pages, so hitting an uneven gap is very rare anyway.
If the pool is under particular memory pressure, then an IO can be
broken down into a "gang block", a 512-byte block composed of a header
and up to three block pointers. Each points to a fragment of the
original write, or in turn, another gang block, breaking the original
data up over and over until space can be found in the pool for each of
them.
Each gang header is a separate 512-byte memory allocation from a slab
that needs to be written down to disk. When the gang header is added to
the BIO, it's a single 512-byte segment.
Pulling all this together, consider a large aggregated write of gang
blocks. This results in a BIO containing lots of 512-byte segments. Given
our tendency to overfill the BIO, a split is likely, and most possible
split points will yield a pair of BIOs that are misaligned. Drivers that
care, like the SCSI driver, will reject them.
---
This commit is a substantial refactor and rewrite of much of `vdev_disk`
to sort all this out.
`vdev_bio_max_segs()` now returns the ideal maximum size for the device,
if available. There's also a tuneable `zfs_vdev_disk_max_segs` to
override this, to assist with testing.
We scan the ABD up front to count the number of pages within it, and to
confirm that if we submitted all those pages to one or more BIOs, it
could be split at any point without creating a misaligned BIO. If the
pages in the BIO are not usable (as in any of the above situations), the
ABD is linearised, and then checked again. This is the same technique
used in `vdev_geom` on FreeBSD, adjusted for Linux's variable page size
and allocator quirks.
`vbio_t` is a cleanup and enhancement of the old `dio_request_t`. The
idea is simply that it can hold all the state needed to create, submit
and return multiple BIOs, including all the refcounts, the ABD copy if
it was needed, and so on. Apart from what I hope is a clearer interface,
the major difference is that because we know how many BIOs we'll need up
front, we don't need the old overflow logic that would grow the BIO
array, throw away all the old work and restart. We can get it right from
the start.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
(cherry picked from commit 06a196020e6f70d2fedbd4d0d05bbe0c1ac6e4d8)
---
include/os/linux/kernel/linux/mod_compat.h | 1 +
man/man4/zfs.4 | 10 +-
module/os/linux/zfs/vdev_disk.c | 439 ++++++++++++++++++++-
3 files changed, 447 insertions(+), 3 deletions(-)
diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h
index 8e20a9613..039865b70 100644
--- a/include/os/linux/kernel/linux/mod_compat.h
+++ b/include/os/linux/kernel/linux/mod_compat.h
@@ -68,6 +68,7 @@ enum scope_prefix_types {
zfs_trim,
zfs_txg,
zfs_vdev,
+ zfs_vdev_disk,
zfs_vdev_file,
zfs_vdev_mirror,
zfs_vnops,
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 352990e02..b5679f2f0 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2,6 +2,7 @@
.\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
.\" Copyright (c) 2019 Datto Inc.
+.\" Copyright (c) 2023, 2024 Klara, Inc.
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at
@@ -15,7 +16,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
-.Dd July 21, 2023
+.Dd January 9, 2024
.Dt ZFS 4
.Os
.
@@ -1345,6 +1346,13 @@ _
4 Driver No driver retries on driver errors.
.TE
.
+.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
+Maximum number of segments to add to a BIO (min 4).
+If this is higher than the maximum allowed by the device queue or the kernel
+itself, it will be clamped.
+Setting it to zero will cause the kernel's ideal size to be used.
+This parameter only applies on Linux.
+.
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
Time before expiring
.Pa .zfs/snapshot .
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index de4dba72f..0ccb9ad96 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -24,6 +24,7 @@
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
* LLNL-CODE-403049.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
*/
#include <sys/zfs_context.h>
@@ -66,6 +67,13 @@ typedef struct vdev_disk {
krwlock_t vd_lock;
} vdev_disk_t;
+/*
+ * Maximum number of segments to add to a bio (min 4). If this is higher than
+ * the maximum allowed by the device queue or the kernel itself, it will be
+ * clamped. Setting it to zero will cause the kernel's ideal size to be used.
+ */
+uint_t zfs_vdev_disk_max_segs = 0;
+
/*
* Unique identifier for the exclusive vdev holder.
*/
@@ -607,10 +615,433 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
return (bio);
}
+static inline uint_t
+vdev_bio_max_segs(struct block_device *bdev)
+{
+ /*
+ * Smallest of the device max segs and the tuneable max segs. Minimum
+ * 4, so there's room to finish split pages if they come up.
+ */
+ const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
+ const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
+ MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
+ const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);
+
+#ifdef HAVE_BIO_MAX_SEGS
+ return (bio_max_segs(max_segs));
+#else
+ return (MIN(max_segs, BIO_MAX_PAGES));
+#endif
+}
+
+static inline uint_t
+vdev_bio_max_bytes(struct block_device *bdev)
+{
+ return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
+}
+
+
+/*
+ * Virtual block IO object (VBIO)
+ *
+ * Linux block IO (BIO) objects have a limit on how many data segments (pages)
+ * they can hold. Depending on how they're allocated and structured, a large
+ * ZIO can require more than one BIO to be submitted to the kernel, which then
+ * all have to complete before we can return the completed ZIO back to ZFS.
+ *
+ * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
+ * translate a ZIO down into the kernel block layer and back again.
+ *
+ * Note that these are only used for data ZIOs (read/write). Meta-operations
+ * (flush/trim) don't need multiple BIOs and so can just make the call
+ * directly.
+ */
+typedef struct {
+ zio_t *vbio_zio; /* parent zio */
+
+ struct block_device *vbio_bdev; /* blockdev to submit bios to */
+
+ abd_t *vbio_abd; /* abd carrying borrowed linear buf */
+
+ atomic_t vbio_ref; /* bio refcount */
+ int vbio_error; /* error from failed bio */
+
+ uint_t vbio_max_segs; /* max segs per bio */
+
+ uint_t vbio_max_bytes; /* max bytes per bio */
+ uint_t vbio_lbs_mask; /* logical block size mask */
+
+ uint64_t vbio_offset; /* start offset of next bio */
+
+ struct bio *vbio_bio; /* pointer to the current bio */
+ struct bio *vbio_bios; /* list of all bios */
+} vbio_t;
+
+static vbio_t *
+vbio_alloc(zio_t *zio, struct block_device *bdev)
+{
+ vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
+
+ vbio->vbio_zio = zio;
+ vbio->vbio_bdev = bdev;
+ atomic_set(&vbio->vbio_ref, 0);
+ vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
+ vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
+ vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
+ vbio->vbio_offset = zio->io_offset;
+
+ return (vbio);
+}
+
+static int
+vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
+{
+ struct bio *bio;
+ uint_t ssize;
+
+ while (size > 0) {
+ bio = vbio->vbio_bio;
+ if (bio == NULL) {
+ /* New BIO, allocate and set up */
+ bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
+ vbio->vbio_max_segs);
+ if (unlikely(bio == NULL))
+ return (SET_ERROR(ENOMEM));
+ BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
+
+ bio->bi_next = vbio->vbio_bios;
+ vbio->vbio_bios = vbio->vbio_bio = bio;
+ }
+
+ /*
+ * Only load as much of the current page data as will fit in
+ * the space left in the BIO, respecting lbs alignment. Older
+ * kernels will error if we try to overfill the BIO, while
+ * newer ones will accept it and split the BIO. This ensures
+ * everything works on older kernels, and avoids an additional
+ * overhead on the new.
+ */
+ ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
+ vbio->vbio_lbs_mask);
+ if (ssize > 0 &&
+ bio_add_page(bio, page, ssize, offset) == ssize) {
+ /* Accepted, adjust and load any remaining. */
+ size -= ssize;
+ offset += ssize;
+ continue;
+ }
+
+ /* No room, set up for a new BIO and loop */
+ vbio->vbio_offset += BIO_BI_SIZE(bio);
+
+ /* Signal new BIO allocation wanted */
+ vbio->vbio_bio = NULL;
+ }
+
+ return (0);
+}
+
+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error);
+static void vbio_put(vbio_t *vbio);
+
+static void
+vbio_submit(vbio_t *vbio, int flags)
+{
+ ASSERT(vbio->vbio_bios);
+ struct bio *bio = vbio->vbio_bios;
+ vbio->vbio_bio = vbio->vbio_bios = NULL;
+
+ /*
+ * We take a reference for each BIO as we submit it, plus one to
+ * protect us from BIOs completing before we're done submitting them
+ * all, causing vbio_put() to free vbio out from under us and/or the
+ * zio to be returned before all its IO has completed.
+ */
+ atomic_set(&vbio->vbio_ref, 1);
+
+ /*
+ * If we're submitting more than one BIO, inform the block layer so
+ * it can batch them if it wants.
+ */
+ struct blk_plug plug;
+ boolean_t do_plug = (bio->bi_next != NULL);
+ if (do_plug)
+ blk_start_plug(&plug);
+
+ /* Submit all the BIOs */
+ while (bio != NULL) {
+ atomic_inc(&vbio->vbio_ref);
+
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+
+ bio->bi_end_io = vdev_disk_io_rw_completion;
+ bio->bi_private = vbio;
+ bio_set_op_attrs(bio,
+ vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
+ WRITE : READ, flags);
+
+ vdev_submit_bio(bio);
+
+ bio = next;
+ }
+
+ /* Finish the batch */
+ if (do_plug)
+ blk_finish_plug(&plug);
+
+ /* Release the extra reference */
+ vbio_put(vbio);
+}
+
+static void
+vbio_return_abd(vbio_t *vbio)
+{
+ zio_t *zio = vbio->vbio_zio;
+ if (vbio->vbio_abd == NULL)
+ return;
+
+ /*
+ * If we copied the ABD before issuing it, clean up and return the copy
+ * to the ABD, with changes if appropriate.
+ */
+ void *buf = abd_to_buf(vbio->vbio_abd);
+ abd_free(vbio->vbio_abd);
+ vbio->vbio_abd = NULL;
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
+ else
+ abd_return_buf(zio->io_abd, buf, zio->io_size);
+}
+
+static void
+vbio_free(vbio_t *vbio)
+{
+ VERIFY0(atomic_read(&vbio->vbio_ref));
+
+ vbio_return_abd(vbio);
+
+ kmem_free(vbio, sizeof (vbio_t));
+}
+
+static void
+vbio_put(vbio_t *vbio)
+{
+ if (atomic_dec_return(&vbio->vbio_ref) > 0)
+ return;
+
+ /*
+ * This was the last reference, so the entire IO is completed. Clean
+ * up and submit it for processing.
+ */
+
+ /*
+ * Get any data buf back to the original ABD, if necessary. We do this
+ * now so we can get the ZIO into the pipeline as quickly as possible,
+ * and then do the remaining cleanup after.
+ */
+ vbio_return_abd(vbio);
+
+ zio_t *zio = vbio->vbio_zio;
+
+ /*
+ * Set the overall error. If multiple BIOs returned an error, only the
+ * first will be taken; the others are dropped (see
+ * vdev_disk_io_rw_completion()). It's pretty much impossible for
+ * multiple IOs to the same device to fail with different errors, so
+ * there's no real risk.
+ */
+ zio->io_error = vbio->vbio_error;
+ if (zio->io_error)
+ vdev_disk_error(zio);
+
+ /* All done, submit for processing */
+ zio_delay_interrupt(zio);
+
+ /* Finish cleanup */
+ vbio_free(vbio);
+}
+
+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
+{
+ vbio_t *vbio = bio->bi_private;
+
+ if (vbio->vbio_error == 0) {
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ vbio->vbio_error = BIO_END_IO_ERROR(bio);
+#else
+ if (error)
+ vbio->vbio_error = -(error);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ vbio->vbio_error = EIO;
+#endif
+ }
+
+ /*
+ * Destroy the BIO. This is safe to do; the vbio owns its data and the
+ * kernel won't touch it again after the completion function runs.
+ */
+ bio_put(bio);
+
+ /* Drop this BIOs reference acquired by vbio_submit() */
+ vbio_put(vbio);
+}
+
+/*
+ * Iterator callback to count ABD pages and check their size & alignment.
+ *
+ * On Linux, each BIO segment can take a page pointer, and an offset+length of
+ * the data within that page. A page can be arbitrarily large ("compound"
+ * pages) but we still have to ensure the data portion is correctly sized and
+ * aligned to the logical block size, to ensure that if the kernel wants to
+ * split the BIO, the two halves will still be properly aligned.
+ */
+typedef struct {
+ uint_t bmask;
+ uint_t npages;
+ uint_t end;
+} vdev_disk_check_pages_t;
+
+static int
+vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+ vdev_disk_check_pages_t *s = priv;
+
+ /*
+ * If we didn't finish on a block size boundary last time, then there
+ * would be a gap if we tried to use this ABD as-is, so abort.
+ */
+ if (s->end != 0)
+ return (1);
+
+ /*
+ * Note if we're taking less than a full block, so we can check it
+ * above on the next call.
+ */
+ s->end = len & s->bmask;
+
+ /* All blocks after the first must start on a block size boundary. */
+ if (s->npages != 0 && (off & s->bmask) != 0)
+ return (1);
+
+ s->npages++;
+ return (0);
+}
+
+/*
+ * Check if we can submit the pages in this ABD to the kernel as-is.
+ * Returns B_TRUE if so, or B_FALSE if a linearised copy is needed instead.
+ */
+static boolean_t
+vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
+{
+ vdev_disk_check_pages_t s = {
+ .bmask = bdev_logical_block_size(bdev)-1,
+ .npages = 0,
+ .end = 0,
+ };
+
+ if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/* Iterator callback to submit ABD pages to the vbio. */
+static int
+vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+ vbio_t *vbio = priv;
+ return (vbio_add_page(vbio, page, len, off));
+}
+
+static int
+vdev_disk_io_rw(zio_t *zio)
+{
+ vdev_t *v = zio->io_vd;
+ vdev_disk_t *vd = v->vdev_tsd;
+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
+ int flags = 0;
+
+ /*
+ * Accessing outside the block device is never allowed.
+ */
+ if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
+ vdev_dbgmsg(zio->io_vd,
+ "Illegal access %llu size %llu, device size %llu",
+ (u_longlong_t)zio->io_offset,
+ (u_longlong_t)zio->io_size,
+ (u_longlong_t)i_size_read(bdev->bd_inode));
+ return (SET_ERROR(EIO));
+ }
+
+ if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
+ v->vdev_failfast == B_TRUE) {
+ bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
+ zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
+ }
+
+ /*
+ * Check alignment of the incoming ABD. If any part of it would require
+ * submitting a page that is not aligned to the logical block size,
+ * then we take a copy into a linear buffer and submit that instead.
+ * This should be impossible on a 512b LBS, and fairly rare on 4K,
+ * usually requiring abnormally-small data blocks (eg gang blocks)
+ * mixed into the same ABD as larger ones (eg aggregated).
+ */
+ abd_t *abd = zio->io_abd;
+ if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
+ void *buf;
+ if (zio->io_type == ZIO_TYPE_READ)
+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+ else
+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+
+ /*
+ * Wrap the copy in an abd_t, so we can use the same iterators
+ * to count and fill the vbio later.
+ */
+ abd = abd_get_from_buf(buf, zio->io_size);
+
+ /*
+ * False here would mean the borrowed copy has an invalid
+ * alignment too, which would mean we've somehow been passed a
+ * linear ABD with an interior page that has a non-zero offset
+ * or a size not a multiple of PAGE_SIZE. This is not possible.
+ * It would mean either zio_buf_alloc() or its underlying
+ * allocators have done something extremely strange, or our
+ * math in vdev_disk_check_pages() is wrong. In either case,
+ * something is seriously wrong and it's not safe to continue.
+ */
+ VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
+ }
+
+ /* Allocate vbio, with a pointer to the borrowed ABD if necessary */
+ int error = 0;
+ vbio_t *vbio = vbio_alloc(zio, bdev);
+ if (abd != zio->io_abd)
+ vbio->vbio_abd = abd;
+
+ /* Fill it with pages */
+ error = abd_iterate_page_func(abd, 0, zio->io_size,
+ vdev_disk_fill_vbio_cb, vbio);
+ if (error != 0) {
+ vbio_free(vbio);
+ return (error);
+ }
+
+ vbio_submit(vbio, flags);
+ return (0);
+}
+
/* ========== */
/*
- * This is the classic, battle-tested BIO submission code.
+ * This is the classic, battle-tested BIO submission code. Until we're totally
+ * sure that the new code is safe and correct in all cases, this will remain
+ * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
+ * load time.
*
* These functions have been renamed to vdev_classic_* to make it clear what
* they belong to, but their implementations are unchanged.
@@ -1116,7 +1547,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
(void) tsd;
if (vdev_disk_io_rw_fn == NULL)
- vdev_disk_io_rw_fn = vdev_classic_physio;
+ /* XXX make configurable */
+ vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
return (0);
}
@@ -1215,3 +1647,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
+
+ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
+ "Maximum number of data segments to add to an IO request (min 4)");

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 9 Jan 2024 13:28:57 +1100
Subject: [PATCH] vdev_disk: add module parameter to select BIO submission
method
This makes the submission method selectable at module load time via the
`zfs_vdev_disk_classic` parameter, allowing this change to be backported
to 2.2 safely, and disabled in favour of the "classic" submission method
if new problems come up.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
(cherry picked from commit df2169d141aadc0c2cc728c5c5261d6f5c2a27f7)
---
man/man4/zfs.4 | 16 ++++++++++++++++
module/os/linux/zfs/vdev_disk.c | 31 +++++++++++++++++++++++++++++--
2 files changed, 45 insertions(+), 2 deletions(-)
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index b5679f2f0..6a628e7f3 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1352,6 +1352,22 @@ If this is higher than the maximum allowed by the device queue or the kernel
itself, it will be clamped.
Setting it to zero will cause the kernel's ideal size to be used.
This parameter only applies on Linux.
+This parameter is ignored if
+.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
+.
+.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
+If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
+and earlier.
+This "classic" method has known issues with highly fragmented IO requests and
+is slower on many workloads, but it has been in use for many years and is known
+to be very stable.
+If you set this parameter, please also open a bug report why you did so,
+including the workload involved and any error messages.
+.Pp
+This parameter and the classic submission method will be removed once we have
+total confidence in the new method.
+.Pp
+This parameter only applies on Linux, and can only be set at module load time.
.
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
Time before expiring
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 0ccb9ad96..a9110623a 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -1535,6 +1535,29 @@ vdev_disk_rele(vdev_t *vd)
/* XXX: Implement me as a vnode rele for the device */
}
+/*
+ * BIO submission method. See comment above about vdev_classic.
+ * Set zfs_vdev_disk_classic=0 for new, =1 for classic
+ */
+static uint_t zfs_vdev_disk_classic = 0; /* default new */
+
+/* Set submission function from module parameter */
+static int
+vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
+{
+ int err = param_set_uint(buf, kp);
+ if (err < 0)
+ return (SET_ERROR(err));
+
+ vdev_disk_io_rw_fn =
+ zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
+
+ printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
+ zfs_vdev_disk_classic ? "classic" : "new");
+
+ return (0);
+}
+
/*
 * At first vdev use, set the submission function from the default value if
* it hasn't been set already.
@@ -1547,8 +1570,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
(void) tsd;
if (vdev_disk_io_rw_fn == NULL)
- /* XXX make configurable */
- vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
+ vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
+ vdev_classic_physio : vdev_disk_io_rw;
return (0);
}
@@ -1650,3 +1673,7 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
"Maximum number of data segments to add to an IO request (min 4)");
+
+ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
+ vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
+ "Use classic BIO submission method");


@@ -0,0 +1,363 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Wed, 21 Feb 2024 11:07:21 +1100
Subject: [PATCH] vdev_disk: use bio_chain() to submit multiple BIOs
This simplifies our code a lot, since we no longer have to wait for each
BIO and reassemble the results ourselves.
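For context, bio_chain(child, parent) makes the parent BIO's completion
wait on the child, so each BIO can be submitted as soon as it fills up
and only the final BIO needs an end_io callback. A minimal sketch of the
pattern (hypothetical demo_* names; allocation and error handling
omitted):

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    /* Submit "cur" immediately, chained to its successor "next". The
     * kernel will not run next's completion until cur has completed,
     * so a single end_io on the last BIO covers the whole chain. */
    static void
    demo_chain_and_submit(struct bio *cur, struct bio *next,
        bio_end_io_t *done, void *priv)
    {
            bio_chain(cur, next);   /* next's completion waits on cur */
            submit_bio(cur);        /* cur is now the kernel's problem */

            next->bi_end_io = done; /* fires once, for the whole chain */
            next->bi_private = priv;
            submit_bio(next);
    }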
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
(cherry picked from commit 72fd834c47558cb10d847948d1a4615e894c77c3)
---
module/os/linux/zfs/vdev_disk.c | 231 +++++++++++---------------------
1 file changed, 80 insertions(+), 151 deletions(-)
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index a9110623a..36468fc21 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -454,10 +454,9 @@ vdev_disk_close(vdev_t *v)
if (v->vdev_reopening || vd == NULL)
return;
- if (vd->vd_bdh != NULL) {
+ if (vd->vd_bdh != NULL)
vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
zfs_vdev_holder);
- }
rw_destroy(&vd->vd_lock);
kmem_free(vd, sizeof (vdev_disk_t));
@@ -663,9 +662,6 @@ typedef struct {
abd_t *vbio_abd; /* abd carrying borrowed linear buf */
- atomic_t vbio_ref; /* bio refcount */
- int vbio_error; /* error from failed bio */
-
uint_t vbio_max_segs; /* max segs per bio */
uint_t vbio_max_bytes; /* max bytes per bio */
@@ -674,43 +670,52 @@ typedef struct {
uint64_t vbio_offset; /* start offset of next bio */
struct bio *vbio_bio; /* pointer to the current bio */
- struct bio *vbio_bios; /* list of all bios */
+ int vbio_flags; /* bio flags */
} vbio_t;
static vbio_t *
-vbio_alloc(zio_t *zio, struct block_device *bdev)
+vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
{
vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
vbio->vbio_zio = zio;
vbio->vbio_bdev = bdev;
- atomic_set(&vbio->vbio_ref, 0);
+ vbio->vbio_abd = NULL;
vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
vbio->vbio_offset = zio->io_offset;
+ vbio->vbio_bio = NULL;
+ vbio->vbio_flags = flags;
return (vbio);
}
+BIO_END_IO_PROTO(vbio_completion, bio, error);
+
static int
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
{
- struct bio *bio;
+ struct bio *bio = vbio->vbio_bio;
uint_t ssize;
while (size > 0) {
- bio = vbio->vbio_bio;
if (bio == NULL) {
/* New BIO, allocate and set up */
bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
vbio->vbio_max_segs);
- if (unlikely(bio == NULL))
- return (SET_ERROR(ENOMEM));
+ VERIFY(bio);
+
BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
+ bio_set_op_attrs(bio,
+ vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
+ WRITE : READ, vbio->vbio_flags);
- bio->bi_next = vbio->vbio_bios;
- vbio->vbio_bios = vbio->vbio_bio = bio;
+ if (vbio->vbio_bio) {
+ bio_chain(vbio->vbio_bio, bio);
+ vdev_submit_bio(vbio->vbio_bio);
+ }
+ vbio->vbio_bio = bio;
}
/*
@@ -735,157 +740,97 @@ vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
vbio->vbio_offset += BIO_BI_SIZE(bio);
/* Signal new BIO allocation wanted */
- vbio->vbio_bio = NULL;
+ bio = NULL;
}
return (0);
}
-BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error);
-static void vbio_put(vbio_t *vbio);
+/* Iterator callback to submit ABD pages to the vbio. */
+static int
+vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+ vbio_t *vbio = priv;
+ return (vbio_add_page(vbio, page, len, off));
+}
+/* Create some BIOs, fill them with data and submit them */
static void
-vbio_submit(vbio_t *vbio, int flags)
+vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
{
- ASSERT(vbio->vbio_bios);
- struct bio *bio = vbio->vbio_bios;
- vbio->vbio_bio = vbio->vbio_bios = NULL;
-
- /*
- * We take a reference for each BIO as we submit it, plus one to
- * protect us from BIOs completing before we're done submitting them
- * all, causing vbio_put() to free vbio out from under us and/or the
- * zio to be returned before all its IO has completed.
- */
- atomic_set(&vbio->vbio_ref, 1);
+ ASSERT(vbio->vbio_bdev);
/*
- * If we're submitting more than one BIO, inform the block layer so
- * it can batch them if it wants.
+ * We plug so we can submit the BIOs as we go and only unplug them when
+ * they are fully created and submitted. This is important; if we don't
+ * plug, then the kernel may start executing earlier BIOs while we're
+ * still creating and executing later ones, and if the device goes
+ * away while that's happening, older kernels can get confused and
+ * trample memory.
*/
struct blk_plug plug;
- boolean_t do_plug = (bio->bi_next != NULL);
- if (do_plug)
- blk_start_plug(&plug);
+ blk_start_plug(&plug);
- /* Submit all the BIOs */
- while (bio != NULL) {
- atomic_inc(&vbio->vbio_ref);
+ (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
+ ASSERT(vbio->vbio_bio);
- struct bio *next = bio->bi_next;
- bio->bi_next = NULL;
+ vbio->vbio_bio->bi_end_io = vbio_completion;
+ vbio->vbio_bio->bi_private = vbio;
- bio->bi_end_io = vdev_disk_io_rw_completion;
- bio->bi_private = vbio;
- bio_set_op_attrs(bio,
- vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
- WRITE : READ, flags);
+ vdev_submit_bio(vbio->vbio_bio);
- vdev_submit_bio(bio);
-
- bio = next;
- }
-
- /* Finish the batch */
- if (do_plug)
- blk_finish_plug(&plug);
+ blk_finish_plug(&plug);
- /* Release the extra reference */
- vbio_put(vbio);
+ vbio->vbio_bio = NULL;
+ vbio->vbio_bdev = NULL;
}
-static void
-vbio_return_abd(vbio_t *vbio)
+/* IO completion callback */
+BIO_END_IO_PROTO(vbio_completion, bio, error)
{
+ vbio_t *vbio = bio->bi_private;
zio_t *zio = vbio->vbio_zio;
- if (vbio->vbio_abd == NULL)
- return;
-
- /*
- * If we copied the ABD before issuing it, clean up and return the copy
- * to the ADB, with changes if appropriate.
- */
- void *buf = abd_to_buf(vbio->vbio_abd);
- abd_free(vbio->vbio_abd);
- vbio->vbio_abd = NULL;
-
- if (zio->io_type == ZIO_TYPE_READ)
- abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
- else
- abd_return_buf(zio->io_abd, buf, zio->io_size);
-}
-static void
-vbio_free(vbio_t *vbio)
-{
- VERIFY0(atomic_read(&vbio->vbio_ref));
-
- vbio_return_abd(vbio);
+ ASSERT(zio);
- kmem_free(vbio, sizeof (vbio_t));
-}
+ /* Capture and log any errors */
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ zio->io_error = BIO_END_IO_ERROR(bio);
+#else
+ zio->io_error = 0;
+ if (error)
+ zio->io_error = -(error);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ zio->io_error = EIO;
+#endif
+ ASSERT3U(zio->io_error, >=, 0);
-static void
-vbio_put(vbio_t *vbio)
-{
- if (atomic_dec_return(&vbio->vbio_ref) > 0)
- return;
+ if (zio->io_error)
+ vdev_disk_error(zio);
- /*
- * This was the last reference, so the entire IO is completed. Clean
- * up and submit it for processing.
- */
+ /* Return the BIO to the kernel */
+ bio_put(bio);
/*
- * Get any data buf back to the original ABD, if necessary. We do this
- * now so we can get the ZIO into the pipeline as quickly as possible,
- * and then do the remaining cleanup after.
+ * If we copied the ABD before issuing it, clean up and return the copy
+ * to the ABD, with changes if appropriate.
*/
- vbio_return_abd(vbio);
+ if (vbio->vbio_abd != NULL) {
+ void *buf = abd_to_buf(vbio->vbio_abd);
+ abd_free(vbio->vbio_abd);
+ vbio->vbio_abd = NULL;
- zio_t *zio = vbio->vbio_zio;
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
+ else
+ abd_return_buf(zio->io_abd, buf, zio->io_size);
+ }
- /*
- * Set the overall error. If multiple BIOs returned an error, only the
- * first will be taken; the others are dropped (see
- * vdev_disk_io_rw_completion()). Its pretty much impossible for
- * multiple IOs to the same device to fail with different errors, so
- * there's no real risk.
- */
- zio->io_error = vbio->vbio_error;
- if (zio->io_error)
- vdev_disk_error(zio);
+ /* Final cleanup */
+ kmem_free(vbio, sizeof (vbio_t));
/* All done, submit for processing */
zio_delay_interrupt(zio);
-
- /* Finish cleanup */
- vbio_free(vbio);
-}
-
-BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
-{
- vbio_t *vbio = bio->bi_private;
-
- if (vbio->vbio_error == 0) {
-#ifdef HAVE_1ARG_BIO_END_IO_T
- vbio->vbio_error = BIO_END_IO_ERROR(bio);
-#else
- if (error)
- vbio->vbio_error = -(error);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- vbio->vbio_error = EIO;
-#endif
- }
-
- /*
- * Destroy the BIO. This is safe to do; the vbio owns its data and the
- * kernel won't touch it again after the completion function runs.
- */
- bio_put(bio);
-
- /* Drop this BIOs reference acquired by vbio_submit() */
- vbio_put(vbio);
}
/*
@@ -948,14 +893,6 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
return (B_TRUE);
}
-/* Iterator callback to submit ABD pages to the vbio. */
-static int
-vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
-{
- vbio_t *vbio = priv;
- return (vbio_add_page(vbio, page, len, off));
-}
-
static int
vdev_disk_io_rw(zio_t *zio)
{
@@ -1018,20 +955,12 @@ vdev_disk_io_rw(zio_t *zio)
}
/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
- int error = 0;
- vbio_t *vbio = vbio_alloc(zio, bdev);
+ vbio_t *vbio = vbio_alloc(zio, bdev, flags);
if (abd != zio->io_abd)
vbio->vbio_abd = abd;
- /* Fill it with pages */
- error = abd_iterate_page_func(abd, 0, zio->io_size,
- vdev_disk_fill_vbio_cb, vbio);
- if (error != 0) {
- vbio_free(vbio);
- return (error);
- }
-
- vbio_submit(vbio, flags);
+ /* Fill it with data pages and submit it to the kernel */
+ vbio_submit(vbio, abd, zio->io_size);
return (0);
}


@@ -0,0 +1,96 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 14 Mar 2024 10:57:30 +1100
Subject: [PATCH] abd_iter_page: don't use compound heads on Linux <4.5
Before 4.5 (specifically, torvalds/linux@ddc58f2), head and tail pages
in a compound page were refcounted separately. This means that using the
head page without taking a reference to it could see it cleaned up later
before we're finished with it. Specifically, bio_add_page() would take a
reference, and drop its reference after the bio completion callback
returns.
If the zio is executed immediately from the completion callback, this is
usually ok, as any data is referenced through the tail page referenced
by the ABD, and so becomes "live" that way. If there's a delay in zio
execution (high load, error injection), then the head page can be freed,
along with any dirty flags or other indicators that the underlying
memory is used. Later, when the zio completes and that memory is
accessed, it's either unmapped and an unhandled fault takes down the
entire system, or it is mapped and we end up messing around in someone
else's memory. Both of these are very bad.
The solution on these older kernels is to take a reference to the head
page when we use it, and release it when we're done. There's not really
a sensible way under our current structure to do this; the "best" would
be to keep a list of head page references in the ABD, and release them
when the ABD is freed.
Since this additional overhead is totally unnecessary on 4.5+, where
head and tail pages share refcounts, I've opted to simply not use the
compound head in ABD page iteration there. This is theoretically less
efficient (though cleaning up head page references would add overhead),
but it's safe, and we still get the other benefits of not mapping pages
before adding them to a bio and not mis-splitting pages.
There doesn't appear to be an obvious symbol name or config option we
can match on to discover this behaviour in configure (and the mm/page
APIs have changed a lot since then anyway), so I've gone with a simple
version check.
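For illustration, the bookkeeping that taking a head-page reference
would require on those older kernels looks roughly like this (a
hypothetical sketch of the rejected approach, not code from this patch):

    #include <linux/mm.h>

    /* Pre-4.5 sketch: the head of a compound page is refcounted
     * separately from its tail pages, so it must be pinned before use
     * and released only once the ABD is completely finished with it. */
    static struct page *
    demo_pin_head(struct page *page)
    {
            struct page *head = compound_head(page);
            get_page(head);  /* keep the head alive across the IO */
            return (head);   /* caller must put_page() at ABD teardown */
    }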
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
(cherry picked from commit c6be6ce1755a3d9a3cbe70256cd8958ef83d8542)
---
module/os/linux/zfs/abd_os.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index 3fe01c0b7..d3255dcbc 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -62,6 +62,7 @@
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
+#include <linux/version.h>
#endif
#ifdef _KERNEL
@@ -1061,6 +1062,7 @@ abd_iter_page(struct abd_iter *aiter)
}
ASSERT(page);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
if (PageTail(page)) {
/*
* This page is part of a "compound page", which is a group of
@@ -1082,11 +1084,23 @@ abd_iter_page(struct abd_iter *aiter)
* To do this, we need to adjust the offset to be counted from
* the head page. struct page for compound pages are stored
* contiguously, so we can just adjust by a simple offset.
+ *
+ * Before kernel 4.5, compound page heads were refcounted
+ * separately, such that moving back to the head page would
+ * require us to take a reference to it and releasing it once
+ * we're completely finished with it. In practice, that means
+ * when our caller is done with the ABD, which we have no
+ * insight into from here. Rather than contort this API to
+ * track head page references on such ancient kernels, we just
+ * compile this block out and use the tail pages directly. This
+ * is slightly less efficient, but makes everything far
+ * simpler.
*/
struct page *head = compound_head(page);
doff += ((page - head) * PAGESIZE);
page = head;
}
+#endif
/* final page and position within it */
aiter->iter_page = page;


@@ -0,0 +1,90 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Wed, 27 Mar 2024 13:11:12 +1100
Subject: [PATCH] vdev_disk: default to classic submission for 2.2.x
We don't want to change to brand-new code in the middle of a stable
series, but we want it available to test for people running into page
splitting issues.
This commit makes zfs_vdev_disk_classic=1 the default, and updates the
documentation to better explain what's going on.
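For anyone who still wants to test the new path on a 2.2.x system, the
method can be selected explicitly at module load time, e.g. with
`modprobe zfs zfs_vdev_disk_classic=0` (illustrative invocation; the
parameter is read-only after load, per its ZMOD_RD declaration).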
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
---
man/man4/zfs.4 | 31 ++++++++++++++++++++++---------
module/os/linux/zfs/vdev_disk.c | 8 +++++---
2 files changed, 27 insertions(+), 12 deletions(-)
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 6a628e7f3..a98ec519a 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1355,17 +1355,30 @@ This parameter only applies on Linux.
This parameter is ignored if
.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
.
-.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
-If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
-and earlier.
-This "classic" method has known issues with highly fragmented IO requests and
-is slower on many workloads, but it has been in use for many years and is known
-to be very stable.
-If you set this parameter, please also open a bug report why you did so,
+.It Sy zfs_vdev_disk_classic Ns = Ns 0 Ns | Ns Sy 1 Pq uint
+Controls the method used to submit IO to the Linux block layer
+(default
+.Sy 1 "classic" Ns
+)
+.Pp
+If set to 1, the "classic" method is used.
+This is the method that has been in use since the earliest versions of
+ZFS-on-Linux.
+It has known issues with highly fragmented IO requests and is less efficient on
+many workloads, but it is well known and well understood.
+.Pp
+If set to 0, the "new" method is used.
+This method is available since 2.2.4 and should resolve all known issues and be
+far more efficient, but has not had as much testing.
+In the 2.2.x series, this parameter defaults to 1, to use the "classic" method.
+.Pp
+It is not recommended that you change it except on advice from the OpenZFS
+developers.
+If you do change it, please also open a bug report describing why you did so,
including the workload involved and any error messages.
.Pp
-This parameter and the classic submission method will be removed once we have
-total confidence in the new method.
+This parameter and the "classic" submission method will be removed in a future
+release of OpenZFS once we have total confidence in the new method.
.Pp
This parameter only applies on Linux, and can only be set at module load time.
.
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 36468fc21..e1c19a085 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -969,8 +969,10 @@ vdev_disk_io_rw(zio_t *zio)
/*
* This is the classic, battle-tested BIO submission code. Until we're totally
* sure that the new code is safe and correct in all cases, this will remain
- * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
- * load time.
+ * available.
+ *
+ * It is enabled by setting zfs_vdev_disk_classic=1 at module load time. It is
+ * enabled (=1) by default since 2.2.4, and disabled by default (=0) on master.
*
* These functions have been renamed to vdev_classic_* to make it clear what
* they belong to, but their implementations are unchanged.
@@ -1468,7 +1470,7 @@ vdev_disk_rele(vdev_t *vd)
* BIO submission method. See comment above about vdev_classic.
* Set zfs_vdev_disk_classic=0 for new, =1 for classic
*/
-static uint_t zfs_vdev_disk_classic = 0; /* default new */
+static uint_t zfs_vdev_disk_classic = 1; /* default classic */
/* Set submission function from module parameter */
static int


@@ -0,0 +1,104 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Robert Evans <rrevans@gmail.com>
Date: Mon, 25 Mar 2024 17:56:49 -0400
Subject: [PATCH] Fix corruption caused by mmap flushing problems
1) Make mmap flushes synchronous. Linux may skip flushing dirty pages
already in writeback unless data-integrity sync is requested.
2) Change zfs_putpage to use TXG_WAIT. Otherwise dirty pages may be
skipped due to DMU pushing back on TX assign.
3) Add missing mmap flush when doing block cloning.
4) While here, pass errors from putpage to writepage/writepages.
This change fixes corruption edge cases, but unfortunately adds
synchronous ZIL flushes for dirty mmap pages to llseek and bclone
operations. It may be possible to avoid these sync writes later, but
that would need more tricky refactoring of the writeback code.
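The "data-integrity sync" requested in point 1 above is WB_SYNC_ALL
writeback; under WB_SYNC_NONE the kernel may skip pages that are already
in writeback, which is how dirty mmap data could be missed. A minimal
sketch using the generic VFS helper (generic Linux API shown for
illustration, not the ZFS zn_flush_cached_data() wrapper):

    #include <linux/fs.h>
    #include <linux/pagemap.h>

    /* A data-integrity flush of a file range: unlike a WB_SYNC_NONE
     * flush, this waits for pages already under writeback rather than
     * skipping them, so no dirty page can be left behind. */
    static int
    demo_flush_range(struct inode *inode, loff_t start, loff_t end)
    {
            return (filemap_write_and_wait_range(inode->i_mapping,
                start, end));
    }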
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Robert Evans <evansr@google.com>
Closes #15933
Closes #16019
---
module/os/linux/zfs/zfs_vnops_os.c | 5 +----
module/os/linux/zfs/zpl_file.c | 8 ++++----
module/zfs/zfs_vnops.c | 6 +++++-
3 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index c06a75662..7c473bc7e 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -3792,11 +3792,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
- err = dmu_tx_assign(tx, TXG_NOWAIT);
+ err = dmu_tx_assign(tx, TXG_WAIT);
if (err != 0) {
- if (err == ERESTART)
- dmu_tx_wait(tx);
-
dmu_tx_abort(tx);
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
filemap_dirty_folio(page_mapping(pp), page_folio(pp));
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 3caa0fc6c..9dec52215 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
boolean_t *for_sync = data;
fstrans_cookie_t cookie;
+ int ret;
ASSERT(PageLocked(pp));
ASSERT(!PageWriteback(pp));
cookie = spl_fstrans_mark();
- (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
+ ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
spl_fstrans_unmark(cookie);
- return (0);
+ return (ret);
}
#ifdef HAVE_WRITEPAGE_T_FOLIO
static int
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
{
- (void) zpl_putpage(&pp->page, wbc, data);
- return (0);
+ return (zpl_putpage(&pp->page, wbc, data));
}
#endif
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 2b37834d5..7020f88ec 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -130,7 +130,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
/* Flush any mmap()'d data to disk */
if (zn_has_cached_data(zp, 0, file_sz - 1))
- zn_flush_cached_data(zp, B_FALSE);
+ zn_flush_cached_data(zp, B_TRUE);
lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
@@ -1193,6 +1193,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
}
}
+ /* Flush any mmap()'d data to disk */
+ if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
+ zn_flush_cached_data(inzp, B_TRUE);
+
/*
* Maintain predictable lock order.
*/


@@ -0,0 +1,57 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 2 Apr 2024 15:14:54 +1100
Subject: [PATCH] vdev_disk: don't touch vbio after its handed off to the
kernel
After IO is unplugged, it may complete immediately and vbio_completion
be called in interrupt context. That may interrupt or deschedule our
task. If it's the last bio, the vbio will be freed. Then, when we get
rescheduled, we try to write to freed memory through the dangling vbio
pointer.
This patch just removes the cleanup, and the corresponding assert. These
were leftovers from a previous iteration of vbio_submit() and were
always "belt and suspenders" ops anyway, never strictly required.
Reported-by: Rich Ercolani <rincebrain@gmail.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
(cherry picked from commit 34f662ad22206af6852020fd923ceccd836a855f)
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
module/os/linux/zfs/vdev_disk.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index e1c19a085..62c7aa14f 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -758,8 +758,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
static void
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
{
- ASSERT(vbio->vbio_bdev);
-
/*
* We plug so we can submit the BIOs as we go and only unplug them when
* they are fully created and submitted. This is important; if we don't
@@ -777,12 +775,15 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
vbio->vbio_bio->bi_end_io = vbio_completion;
vbio->vbio_bio->bi_private = vbio;
+ /*
+ * Once submitted, vbio_bio now owns vbio (through bi_private) and we
+ * can't touch it again. The bio may complete and vbio_completion() be
+ * called and free the vbio before this task is run again, so we must
+ * consider it invalid from this point.
+ */
vdev_submit_bio(vbio->vbio_bio);
blk_finish_plug(&plug);
-
- vbio->vbio_bio = NULL;
- vbio->vbio_bdev = NULL;
}
/* IO completion callback */

debian/patches/series

@@ -10,3 +10,16 @@
0010-Fix-nfs_truncate_shares-without-etc-exports.d.patch
0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch
0012-udev-correctly-handle-partition-16-and-later.patch
0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch
0014-linux-5.4-compat-page_size.patch
0015-abd-add-page-iterator.patch
0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch
0017-vdev_disk-reorganise-vdev_disk_io_start.patch
0018-vdev_disk-make-read-write-IO-function-configurable.patch
0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch
0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch
0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch
0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch
0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch
0024-Fix-corruption-caused-by-mmap-flushing-problems.patch
0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch