backport 2.2.4 staging for better 6.8 support

Use the current ZFS 2.2.4 staging tree [0] with commit deb7a8423 ("Fix corruption caused by mmap flushing problems") on top. Additionally, include an open, but ack'd, pull request [1] that avoids a potential general protection fault due to touching a vbio after it was handed off to the kernel. [0]: https://github.com/openzfs/zfs/commits/zfs-2.2.4-staging/ [1]: https://github.com/openzfs/zfs/pull/16049 Both should mostly touch the module code. Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
bump version to 2.2.3-pve1
2024-04-03 09:56:31 +02:00 · 2024-03-11 13:46:05 +01:00 · 2024-03-11 13:44:37 +01:00 · 2024-03-11 13:41:25 +01:00 · 2024-02-19 17:00:17 +01:00 · 2024-02-02 19:17:28 +01:00
35 changed files with 3072 additions and 326 deletions
@@ -0,0 +1,7 @@
 /*.build
 /*.buildinfo
 /*.changes
 /*.deb
 /*.dsc
 /*.tar*
 /zfs-utils-*.*/
@@ -1,9 +1,11 @@
 include /usr/share/dpkg/default.mk
 # source from https://github.com/zfsonlinux/
-ZFSDIR=zfs-linux_$(DEB_VERSION_UPSTREAM)
+PACKAGE = zfs-linux
-ZFSSRC=upstream
+
-ORIG_SRC_TAR=$(ZFSDIR).orig.tar.gz
+SRCDIR = upstream
 BUILDDIR ?= $(PACKAGE)-$(DEB_VERSION_UPSTREAM)
 ORIG_SRC_TAR = $(PACKAGE)_$(DEB_VERSION_UPSTREAM).orig.tar.gz
 ZFS_DEB1= libnvpair3linux_$(DEB_VERSION)_amd64.deb
@@ -30,10 +32,14 @@ DEBS= $(ZFS_DEB1) $(ZFS_DEB2) $(ZFS_DBG_DEBS)
 ZFS_DSC = zfs-linux_$(DEB_VERSION).dsc
 all: deb
-.PHONY: deb
+
 .PHONY: deb dsc
 deb: $(DEBS)
-.PHONY: dsc
+
-dsc: $(ZFS_DSC)
+dsc:
 	rm -rf *.dsc $(BUILDDIR)
 	$(MAKE) $(ZFS_DSC)
 	lintian $(ZFS_DSC)
 # called from pve-kernel's Makefile to get patched sources
 .PHONY: kernel
@@ -47,38 +53,41 @@ dinstall: $(DEBS)
 .PHONY: submodule
 submodule:
-	test -f "$(ZFSSRC)/README.md" || git submodule update --init
+	test -f "$(SRCDIR)/README.md" || git submodule update --init
-$(ZFSSRC)/README.md: submodule
+
 $(SRCDIR)/README.md: submodule
 .PHONY: zfs
 zfs: $(DEBS)
 $(ZFS_DEB2) $(ZFS_DBG_DEBS): $(ZFS_DEB1)
-$(ZFS_DEB1): $(ZFSDIR)
+$(ZFS_DEB1): $(BUILDDIR)
-	cd $(ZFSDIR); dpkg-buildpackage -b -uc -us
+	cd $(BUILDDIR); dpkg-buildpackage -b -uc -us
 	lintian $(DEBS)
-$(ORIG_SRC_TAR): $(ZFSDIR)
+$(ORIG_SRC_TAR): $(BUILDDIR)
-	tar czf $(ORIG_SRC_TAR) --exclude="$(ZFSDIR)/debian" $(ZFSDIR)
+	tar czf $(ORIG_SRC_TAR) --exclude="$(BUILDDIR)/debian" $(BUILDDIR)
-$(ZFS_DSC): $(ZFSDIR) $(ORIG_SRC_TAR)
+$(ZFS_DSC): $(BUILDDIR) $(ORIG_SRC_TAR)
-	tar czf zfs-linux_$(ZFSVER).orig.tar.gz $(ZFSDIR)
+	cd $(BUILDDIR); dpkg-buildpackage -S -uc -us -d
 	cd $(ZFSDIR); dpkg-buildpackage -S -uc -us -d
 	lintian $@
-$(ZFSDIR): $(ZFSSRC)/README.md $(ZFSSRC) debian
+sbuild: $(ZFS_DSC)
-	rm -rf $(ZFSDIR) $(ZFSDIR).tmp
+	sbuild $(ZFS_DSC)
 	cp -a $(ZFSSRC) $(ZFSDIR).tmp
 	cp -a debian $(ZFSDIR).tmp/debian
 	mv $(ZFSDIR).tmp $(ZFSDIR)
 $(BUILDDIR): $(SRCDIR)/README.md $(SRCDIR) debian
 	rm -rf $@ $@.tmp
 	cp -a $(SRCDIR) $@.tmp
 	cp -a debian $@.tmp/debian
 	mv $@.tmp $@
 .PHONY: clean
 clean: 	
-	rm -rf *~ *.deb *.changes *.buildinfo *.build *.dsc *.orig.tar.* *.debian.tar.* $(ZFSDIR)
+	rm -rf $(PACKAGE)-[0-9]*/
 	rm -f *~ *.deb *.changes *.buildinfo *.build *.dsc *.orig.tar.* *.debian.tar.*
 .PHONY: distclean
 distclean: clean
 .PHONY: upload
 upload: UPLOAD_DIST ?= $(DEB_DISTRIBUTION)
 upload: $(DEBS)
-	tar -cf - $(DEBS) | ssh repoman@repo.proxmox.com -- upload --product pve,pmg,pbs --dist bullseye --arch amd64
+	tar -cf - $(DEBS) | ssh repoman@repo.proxmox.com -- upload --product pve,pmg,pbs --dist $(UPLOAD_DIST) --arch $(DEB_HOST_ARCH)
@@ -1,3 +1,87 @@
 zfs-linux (2.2.3-pve1) bookworm; urgency=medium
  * update to new ZFS upstream 2.2.3 release
  * fix #5288: correctly handle zvols with more than 15 partitions in udev
 -- Proxmox Support Team <support@proxmox.com>  Mon, 11 Mar 2024 13:42:50 +0100
 zfs-linux (2.2.2-pve2) bookworm; urgency=medium
  * fix #5101: ensure datasets that have sharenfs enabled are not unexported
    after a `zfs mount -a` call.
 -- Proxmox Support Team <support@proxmox.com>  Mon, 19 Feb 2024 16:56:37 +0100
 zfs-linux (2.2.2-pve1) bookworm; urgency=medium
  * update to new ZFS upstream 2.2.2 release, as we have all important fixes
    for recently discovered data integrity issues backported to previous
    versions, there should be no visible change in that regard.
 -- Proxmox Support Team <support@proxmox.com>  Mon, 04 Dec 2023 16:50:25 +0100
 zfs-linux (2.2.0-pve4) bookworm; urgency=medium
  * pick bug-fix staged for 2.2.2:
    - fix (rare) corruption caused by dirty dnode being treated as clean
 -- Proxmox Support Team <support@proxmox.com>  Wed, 29 Nov 2023 09:21:26 +0100
 zfs-linux (2.2.0-pve3) bookworm; urgency=medium
  * pick bug-fixes staged for 2.2.1:
    - add a tunable to disable BRT support and disable it by default
    - fix block cloning between unencrypted and encrypted datasets
    - disable block cloning by default
 -- Proxmox Support Team <support@proxmox.com>  Fri, 17 Nov 2023 17:32:58 +0100
 zfs-linux (2.2.0-pve2) bookworm; urgency=medium
  * avoid error from zfs-mount when /etc/exports.d does not exist (yet)
  * ensure vdev_stat struct layout compat between 2.1 and 2.2, avoiding
    false-positive detection of the non-allocating feature from 2.2 when the
    kernel still used the 2.1 module.
 -- Proxmox Support Team <support@proxmox.com>  Sun, 12 Nov 2023 16:02:02 +0100
 zfs-linux (2.2.0-pve1) bookworm; urgency=medium
  * update ZFS to 2.2.0
  * zfsutils-linux:
    - install new systemd units to trim a pool periodically
    - ship new `zilstat` binary
    - and new man pages for zfs lock, zfs unlock and vdev properties
    - remove man pages for zfs jail and zfs unjail, those are for FreeBSD only
      and the respective commands were never exposed for Linux
  * fix #5014: re-enable blk-mq optimization
 -- Proxmox Support Team <support@proxmox.com>  Sun, 15 Oct 2023 12:09:24 +0200
 zfs-linux (2.1.13-pve1) bookworm; urgency=medium
  * update ZFS to 2.1.13
 -- Proxmox Support Team <support@proxmox.com>  Thu, 28 Sep 2023 12:22:28 +0200
 zfs-linux (2.1.12-pve1) bookworm; urgency=medium
  * update ZFS to 2.1.12
  * zfs trim: avoid exit-failure if last pool isn't nvme-only
 -- Proxmox Support Team <support@proxmox.com>  Tue, 13 Jun 2023 15:25:16 +0200
 zfs-linux (2.1.11-pve2) bookworm; urgency=medium
  * re-build for Debian 12 Bookworm based releases
 -- Proxmox Support Team <support@proxmox.com>  Sat, 20 May 2023 19:32:04 +0200
 zfs-linux (2.1.11-pve1) bullseye; urgency=medium
  * update ZFS to 2.1.11
@@ -151,6 +151,7 @@ Section: contrib/doc
 Architecture: all
 Depends:
 ${sphinxdoc:Depends},
 ${sphinxdoc:Built-Using},
 ${misc:Depends}
 Recommends:
 python3-pyzfs
@@ -202,8 +203,8 @@ Description: OpenZFS root filesystem capabilities for Linux - dracut
 Package: zfsutils-linux
 Section: contrib/admin
 Architecture: linux-any
-Depends: python3, ${misc:Depends}, ${python3:Depends}, ${shlibs:Depends}
+Depends: python3, ${misc:Depends}, ${shlibs:Depends}
-Recommends: lsb-base, zfs-zed
+Recommends: zfs-zed
 Suggests: nfs-kernel-server,
          samba-common-bin (>= 3.0.23),
          zfs-initramfs
@@ -285,17 +286,3 @@ Description: Solaris Porting Layer user-space utilities for Linux (dummy)
 to Linux primitives.
 .
 This is a transitional dummy package. It can safely be removed.
 Package: zfs-dbg
 Section: contrib/metapackages
 Architecture: all
 Suggests: libnvpair3linux-dbgsym,
         libpam-zfs-dbgsym,
         libuutil3linux-dbgsym,
         libzfs4linux-dbgsym,
         libzfsbootenv1linux-dbgsym,
         libzpool5linux-dbgsym,
         zfs-test-dbgsym,
         zfsutils-linux-dbgsym,
         zfs-zed-dbgsym,
 Description: Transitional package. It can be safely removed.
@@ -37,25 +37,26 @@ Copyright: 2011, 2013, Nexenta Systems, Inc.
           2007, 2009, Sun Microsystems, Inc.
 License: CDDL-1.0
-Files: cmd/arc_summary/*
+Files: cmd/arc_summary
 Copyright:
  2010, 2011, Jason J. Hellenthal <jhell@DataIX.net>
  2010, Martin Matuska <mm@FreeBSD.org>
  2008, Ben Rockwood <benr@cuddletech.com>
  2017, Scot W. Stevenson <scot.stevenson@gmail.com>
 License: BSD-2-clause
-Files: cmd/arcstat/*
+Files: cmd/arcstat.in
 Source: http://github.com/mharsch/arcstat
 Copyright:
  2007, Oracle and/or its affiliates.
  2010-2015, Mike Harsch
 License: CDDL-1.0
-Files: cmd/dbufstat/*
+Files: cmd/dbufstat.in
 Copyright: 2013, Lawrence Livermore National Security, LLC
 License: CDDL-1.0
-Files: cmd/mount_zfs/*
+Files: cmd/mount_zfs.c
 Copyright: 2011, Lawrence Livermore National Security, LLC
           2005, 2010, Oracle and/or its affiliates.
 License: CDDL-1.0
@@ -64,7 +65,7 @@ Files: cmd/raidz_test/*
 Copyright: 2016 Gvozden Nešković.
 License: CDDL-1.0
-Files: cmd/vdev_id/*
+Files: udev/vdev_id
 Copyright: 2011, 2013, Nexenta Systems, Inc.
           2007, 2009, Sun Microsystems, Inc.
 License: CDDL-1.0
@@ -106,7 +107,7 @@ Copyright:
  2018 Datto Inc.
 License: CDDL-1.0
-Files: cmd/zhack/*
+Files: cmd/zhack.c
 Copyright: 2013, Steven Hartland.
           2011, 2012, 2014, Delphix.
 License: CDDL-1.0
@@ -132,14 +133,14 @@ Copyright:
 2017, Intel Corporation.
 License: CDDL-1.0
-Files: cmd/zstreamdump/*
+Files: cmd/zstream/*
 Copyright:
-  2013, Delphix.
+  2013, 2015 Delphix.
  2012, Martin Matuska <martin@matuska.org>
  2010, Sun Microsystems, Inc.
 License: CDDL-1.0
-Files: cmd/ztest/*
+Files: cmd/ztest.c
 Copyright:
  2005, 2010, Oracle and/or its affiliates.
  2011, 2018 by Delphix.
@@ -150,7 +151,7 @@ Copyright:
  2017, Intel Corporation.
 License: CDDL-1.0
-Files: cmd/zvol_id/*
+Files: udev/zvol_id.c
 Copyright: 2011, Fajar A. Nugraha.
 License: CDDL-1.0
@@ -158,27 +159,6 @@ Files: config/*
 Copyright: 1996-2012, Free Software Foundation, Inc.
 License: GPL-2+ with autoconf exception
 Files: config/ltoptions.m4
       config/lt~obsolete.m4
       config/ltversion.m4
       config/libtool.m4
       config/ltsugar.m4
 Copyright: 1996-2012, Free Software Foundation, Inc.
 License: PERMISSIVE
 This file is free software; the Free Software Foundation gives
 unlimited permission to copy and/or distribute it, with or without
 modifications, as long as this notice is preserved.
 Files: config/install-sh
 Copyright: 1994, X Consortium
 License: Expat
 Files: configure
 Copyright: 1992-1996, 1998-2010, Free Software
 License: PERMISSIVE2
 This configure script is free software; the Free Software Foundation
 gives unlimited permission to copy, distribute and modify it.
 Files: contrib/bash_completion.d/*
 Copyright: 2010, 2013, Aneurin Price <aneurin.price@gmail.com>
 License: Expat
@@ -201,14 +181,8 @@ Copyright:
 2011-2013, Darik Horn <dajhorn@vanadac.com>
 2018-2019, Mo Zhou <cdluminate@gmail.com>
 2018-2020, Mo Zhou <lumin@debian.org>
 2015-2021 Proxmox Server Solutions GmbH <support@proxmox.com>
 License: GPL-2+
 Files: debian/po/*
 Copyright:
 2013, The Debian po file translators.
 License: CDDL-1.0
 Files: etc/init.d/zfs-*.in
 Copyright:
  2016, Carlo Landmeter <clandmeter@gmail.com>
@@ -399,12 +373,7 @@ Copyright: 2009, Oracle and/or its affiliates.
  2009, Michael Gebetsroither <michael.geb@gmx.at>
 License: CDDL-1.0
-Files: man/man5/zfs-events.5
+Files: man/man7/zpool-features.7
       man/man5/zfs-module-parameters.5
 Copyright: 2013, Turbo Fredriksson <turbo@bayour.com>
 License: CDDL-1.0
 Files: man/man5/zpool-features.5
 Copyright:
  2013, Delphix
  2013, Saso Kiselkov
@@ -427,16 +396,12 @@ Copyright: 2007, Sun Microsystems, Inc.
  2013, Delphix
 License: CDDL-1.0
 Files: man/man8/zstreamdump.8
 Copyright: 2009, Sun Microsystems, Inc.
 License: CDDL-1.0
 Files: module/*
 Copyright: 2011-2014, Delphix.
  2007, 2009, 2010, Sun Microsystems, Inc.
 License: CDDL-1.0
-Files: module/lua
+Files: module/lua/*
 Copyright: 1994-2015 Lua.org, PUC-Rio.
 License: Expat
@@ -483,7 +448,7 @@ Copyright: 2013, Saso Kiselkov.
  2005, 2010, Oracle and/or its affiliates.
 License: CDDL-1.0
-Files: module/zcommon/zfs_uio.c
+Files: module/os/linux/zfs/zfs_uio.c
 Copyright: 2007, 2009, 2010, Sun Microsystems, Inc.
  1983-1989, AT&T
  1982, 1986, 1988, The Regents of the University of California
@@ -583,7 +548,6 @@ Files: module/zfs/dmu_zfetch.c
  module/zfs/rrwlock.c
  module/zfs/space_map.c
  module/zfs/space_reftree.c
  module/zfs/vdev_cache.c
  module/zfs/vdev_mirror.c
  module/zfs/vdev_missing.c
  module/zfs/vdev_queue.c
@@ -651,14 +615,16 @@ Copyright: 2013, Steven Hartland.
 License: CDDL-1.0
 Files: module/zfs/gzip.c
  module/zfs/sha256.c
  module/zfs/spa_boot.c
  module/zfs/unique.c
  module/zfs/zfs_byteswap.c
  module/zfs/zle.c
 Copyright: 2005-2010, Sun Microsystems, Inc.
 License: CDDL-1.0
 Files: module/icp/algs/sha2/*
 Copyright: 2022, Tino Reichardt <milky-zfs@mcmilk.de>
 License: CDDL-1.0
 Files: module/zfs/lz4.c
 Copyright: 2011-2013, Yann Collet
 License: BSD-2-clause
@@ -697,13 +663,14 @@ Copyright: 2011, 2014, Nexenta Systems, Inc.
  2005, 2010, Oracle and/or its affiliates.
 License: CDDL-1.0
-Files: module/zfs/vdev_disk.c
+Files: module/os/linux/zfs/vdev_disk.c
 Copyright: 2012, 2014, Delphix.
  2008-2010, Lawrence Livermore National Security, LLC
 License: CDDL-1.0
-Files: module/zfs/zfs_ctldir.c
+Files: module/os/freebsd/zfs/zfs_ctldir.c
-Copyright: 2013, Delphix.
+  module/os/linux/zfs/zfs_ctldir.c
 Copyright: 2013, 2015 Delphix.
  2011, Lawrence Livermore National Security, LLC
  2005, 2010, Oracle and/or its affiliates.
 License: CDDL-1.0
@@ -726,7 +693,8 @@ Copyright: 2013, Delphix.
  2005, 2010, Oracle and/or its affiliates.
 License: CDDL-1.0
-Files: module/zfs/zfs_vfsops.c
+Files: module/os/freebsd/zfs/zfs_vfsops.c
  module/os/linux/zfs/zfs_vfsops.c
  module/zfs/zil.c
 Copyright: 2011-2014, Delphix.
  2010, Robert Milkowski
@@ -741,8 +709,9 @@ Copyright: 2015, Chunwei Chen.
  2005, 2010, Oracle and/or its affiliates.
 License: CDDL-1.0
-Files: module/zfs/zfs_znode.c
+Files: module/os/freebsd/zfs/zfs_znode.c
-Copyright: 2013, Delphix.
+  module/os/linux/zfs/zfs_znode.c
 Copyright: 2013, 2015 Delphix.
  2007, Jeremy Teo
  2005, 2010, Oracle and/or its affiliates.
 License: CDDL-1.0
@@ -753,20 +722,20 @@ Copyright: 2013, Saso Kiselkov.
  2009, Sun Microsystems, Inc.
 License: CDDL-1.0
-Files: module/zfs/zpl_ctldir.c
+Files: module/os/linux/zfs/zpl_ctldir.c
-  module/zfs/zpl_super.c
+  module/os/linux/zfs/zpl_super.c
-  module/zfs/zpl_xattr.c
+  module/os/linux/zfs/zpl_xattr.c
  module/zfs/zvol.c
 Copyright: 2008-2011, Lawrence Livermore National Security, LLC
 License: CDDL-1.0
-Files: module/zfs/zpl_export.c
+Files: module/os/linux/zfs/zpl_export.c
 Copyright: 2012, Cyril Plisko.
  2011, Gunnar Beutner
 License: CDDL-1.0
-Files: module/zfs/zpl_file.c
+Files: module/os/linux/zfs/zpl_file.c
-  module/zfs/zpl_inode.c
+  module/os/linux/zfs/zpl_inode.c
 Copyright: 2015, Chunwei Chen.
  2011, Lawrence Livermore National Security, LLC
 License: CDDL-1.0
@@ -13,13 +13,13 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 1 file changed, 1 insertion(+), 1 deletion(-)
 diff --git a/etc/systemd/system/zfs-zed.service.in b/etc/systemd/system/zfs-zed.service.in
-index be80025a4..20ce8e632 100644
+index be2fc6734..7606604ec 100644
 --- a/etc/systemd/system/zfs-zed.service.in
 +++ b/etc/systemd/system/zfs-zed.service.in
-@@ -4,7 +4,7 @@ Documentation=man:zed(8)
+@@ -5,7 +5,7 @@ ConditionPathIsDirectory=/sys/module/zfs
 ConditionPathIsDirectory=/sys/module/zfs
 [Service]
 EnvironmentFile=-@initconfdir@/zfs
 -ExecStart=@sbindir@/zed -F
 +ExecStart=/usr/sbin/zed -F
 Restart=always
@@ -14,13 +14,13 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 1 file changed, 1 insertion(+), 1 deletion(-)
 diff --git a/etc/systemd/system/zfs-import-scan.service.in b/etc/systemd/system/zfs-import-scan.service.in
-index 598ef501b..e4f3a70c1 100644
+index c5dd45d87..1c792edf0 100644
 --- a/etc/systemd/system/zfs-import-scan.service.in
 +++ b/etc/systemd/system/zfs-import-scan.service.in
-@@ -13,7 +13,7 @@ ConditionPathIsDirectory=/sys/module/zfs
+@@ -14,7 +14,7 @@ ConditionPathIsDirectory=/sys/module/zfs
 [Service]
 Type=oneshot
 RemainAfterExit=yes
 EnvironmentFile=-@initconfdir@/zfs
 -ExecStart=@sbindir@/zpool import -aN -o cachefile=none $ZPOOL_IMPORT_OPTS
 +ExecStart=@sbindir@/zpool import -aN -d /dev/disk/by-id -o cachefile=none $ZPOOL_IMPORT_OPTS
@@ -13,10 +13,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 1 file changed, 1 insertion(+), 1 deletion(-)
 diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc
-index 227b26c26..240d0dbfa 100644
+index bc269b155..e6d4b1703 100644
 --- a/cmd/zed/zed.d/zed.rc
 +++ b/cmd/zed/zed.d/zed.rc
-@@ -42,7 +42,7 @@ ZED_EMAIL_ADDR="root"
+@@ -41,7 +41,7 @@ ZED_EMAIL_ADDR="root"
 ##
 # Minimum number of seconds between notifications for a similar event.
 #
@@ -3,29 +3,44 @@ From: Antonio Russo <antonio.e.russo@gmail.com>
 Date: Fri, 20 Mar 2020 17:28:43 +0100
 Subject: [PATCH] dont symlink zed scripts
-(cherry picked and adapted from 5cee380324d74e640d5dd7a360faba3994c8007f [0])
+Of the zedlet scripts shipped by upstream, a subset are enabled by
 default, by creating symlinks in /etc/zfs/zed.d.  These symlinks are
 shipped in the zfs-zed package.  dpkg, however, does not support
 conffile handling of symlinks, and therefore any changes (removals) to
 the symlinks are not preserved on package upgrade.
-[0] https://salsa.debian.org/zfsonlinux-team/zfs.git
+To address this policy violation, we:
-Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+ 1. During package build, create a list of enabled-by-default zedlets,
    instead of creating symlinks.
 2. On package removal, identify all enabled-by-default zedlets whose
    symlinks do not exist (i.e., were removed by the user). This is done
    by creating "whiteout" links to /dev/null in their place.
 3. On package installation, create links to enabled-by-default zedlets
    UNLESS there is already a file there (i.e., abort if there is a
    whiteout link).
 4. We also clean up broken symlinks to removed zedlets at package
    postinst.
 Description: track default symlinks, instead of symlinking
 Forwarded: no need
 (cherry picked from https://salsa.debian.org/zfsonlinux-team/zfs/-/commit/5cee380324d7)
 Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 ---
 cmd/zed/zed.d/Makefile.am | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am
-index 2c8173b3e..ad39292e4 100644
+index 812558cf6..f802cf140 100644
 --- a/cmd/zed/zed.d/Makefile.am
 +++ b/cmd/zed/zed.d/Makefile.am
-@@ -49,7 +49,7 @@ install-data-hook:
+@@ -48,7 +48,7 @@ zed-install-data-hook:
- 	for f in $(zedconfdefaults); do \
+ 	set -x; for f in $(zedconfdefaults); do \
- 	  test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \
+ 	  [ -f "$(DESTDIR)$(zedconfdir)/$${f}" ] ||\
- 	       -L "$(DESTDIR)$(zedconfdir)/$${f}" || \
+ 	    [ -L "$(DESTDIR)$(zedconfdir)/$${f}" ] || \
-	    ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \
+-	    $(LN_S) "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \
 +	    echo "$${f}" >> "$(DESTDIR)$(zedexecdir)/DEFAULT-ENABLED" ; \
 	done
 	chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc"
 SHELLCHECKSCRIPTS += $(dist_zedconf_DATA) $(dist_zedexec_SCRIPTS) $(nodist_zedexec_SCRIPTS)
@@ -13,16 +13,28 @@ can contain characters which will be escaped by systemd.
 Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 ---
- etc/systemd/system/50-zfs.preset.in       |  1 +
+ etc/Makefile.am                           |  1 +
- etc/systemd/system/Makefile.am            |  1 +
+ etc/systemd/system/50-zfs.preset          |  1 +
 etc/systemd/system/zfs-import@.service.in | 16 ++++++++++++++++
 3 files changed, 18 insertions(+)
 create mode 100644 etc/systemd/system/zfs-import@.service.in
-diff --git a/etc/systemd/system/50-zfs.preset.in b/etc/systemd/system/50-zfs.preset.in
+diff --git a/etc/Makefile.am b/etc/Makefile.am
 index 7187762d3..de131dc87 100644
 --- a/etc/Makefile.am
 +++ b/etc/Makefile.am
@@ -54,6 +54,7 @@ dist_systemdpreset_DATA = \
 systemdunit_DATA = \
 	%D%/systemd/system/zfs-import-cache.service \
 	%D%/systemd/system/zfs-import-scan.service \
 +	%D%/systemd/system/zfs-import@.service \
 	%D%/systemd/system/zfs-import.target \
 	%D%/systemd/system/zfs-mount.service \
 	%D%/systemd/system/zfs-scrub-monthly@.timer \
 diff --git a/etc/systemd/system/50-zfs.preset b/etc/systemd/system/50-zfs.preset
 index e4056a92c..030611419 100644
--- a/etc/systemd/system/50-zfs.preset.in
+--- a/etc/systemd/system/50-zfs.preset
-+++ b/etc/systemd/system/50-zfs.preset.in
+++ b/etc/systemd/system/50-zfs.preset
@@ -1,6 +1,7 @@
 # ZFS is enabled by default
 enable zfs-import-cache.service
@@ -31,18 +43,6 @@ index e4056a92c..030611419 100644
 enable zfs-import.target
 enable zfs-mount.service
 enable zfs-share.service
 diff --git a/etc/systemd/system/Makefile.am b/etc/systemd/system/Makefile.am
 index 35f833de5..af3ae597c 100644
 --- a/etc/systemd/system/Makefile.am
 +++ b/etc/systemd/system/Makefile.am
@@ -7,6 +7,7 @@ systemdunit_DATA = \
 	zfs-zed.service \
 	zfs-import-cache.service \
 	zfs-import-scan.service \
 +	zfs-import@.service \
 	zfs-mount.service \
 	zfs-share.service \
 	zfs-volume-wait.service \
 diff --git a/etc/systemd/system/zfs-import@.service.in b/etc/systemd/system/zfs-import@.service.in
 new file mode 100644
 index 000000000..9b4ee9371
@@ -15,36 +15,36 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 rename man/{man1/arcstat.1 => man8/arcstat.8} (99%)
 diff --git a/man/Makefile.am b/man/Makefile.am
-index 64650c2b9..95a66a62f 100644
+index 45156571e..3713e9371 100644
 --- a/man/Makefile.am
 +++ b/man/Makefile.am
-@@ -8,7 +8,6 @@ dist_man_MANS = \
+@@ -2,7 +2,6 @@ dist_noinst_man_MANS = \
- 	man1/ztest.1 \
+ 	%D%/man1/cstyle.1
- 	man1/raidz_test.1 \
+ 
- 	man1/zvol_wait.1 \
+ dist_man_MANS = \
-	man1/arcstat.1 \
+-	%D%/man1/arcstat.1 \
- 	\
+ 	%D%/man1/raidz_test.1 \
- 	man5/vdev_id.conf.5 \
+ 	%D%/man1/test-runner.1 \
- 	\
+ 	%D%/man1/zhack.1 \
@@ -22,6 +21,7 @@ dist_man_MANS = \
- 	man7/zpoolconcepts.7 \
+ 	%D%/man7/zpoolconcepts.7 \
- 	man7/zpoolprops.7 \
+ 	%D%/man7/zpoolprops.7 \
 	\
-+	man8/arcstat.8 \
+	%D%/man8/arcstat.8 \
- 	man8/fsck.zfs.8 \
+ 	%D%/man8/fsck.zfs.8 \
- 	man8/mount.zfs.8 \
+ 	%D%/man8/mount.zfs.8 \
- 	man8/vdev_id.8 \
+ 	%D%/man8/vdev_id.8 \
 diff --git a/man/man1/arcstat.1 b/man/man8/arcstat.8
 similarity index 99%
 rename from man/man1/arcstat.1
 rename to man/man8/arcstat.8
-index a69cd8937..dfe9c971b 100644
+index 82358fa68..a8fb55498 100644
 --- a/man/man1/arcstat.1
 +++ b/man/man8/arcstat.8
@@ -13,7 +13,7 @@
 .\" Copyright (c) 2020 by AJ Jordan. All rights reserved.
 .\"
- .Dd May 26, 2021
+ .Dd December 23, 2022
 -.Dt ARCSTAT 1
 +.Dt ARCSTAT 8
 .Os
@@ -22,15 +22,15 @@ l2_bufc_data_asize l2_bufc_metadata_asize to .get accessor
 Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 ---
- cmd/arc_summary/arc_summary3 | 28 ++++++++++++++--------------
+ cmd/arc_summary | 28 ++++++++++++++--------------
- cmd/arcstat/arcstat.in       | 14 +++++++-------
+ cmd/arcstat.in  | 14 +++++++-------
 2 files changed, 21 insertions(+), 21 deletions(-)
-diff --git a/cmd/arc_summary/arc_summary3 b/cmd/arc_summary/arc_summary3
+diff --git a/cmd/arc_summary b/cmd/arc_summary
-index 9d0c2d30d..fd2581ae2 100755
+index 9c69ec4f8..edf94ea2a 100755
--- a/cmd/arc_summary/arc_summary3
+--- a/cmd/arc_summary
-+++ b/cmd/arc_summary/arc_summary3
+++ b/cmd/arc_summary
-@@ -609,13 +609,13 @@ def section_arc(kstats_dict):
+@@ -655,13 +655,13 @@ def section_arc(kstats_dict):
     prt_i1('L2 cached evictions:', f_bytes(arc_stats['evict_l2_cached']))
     prt_i1('L2 eligible evictions:', f_bytes(arc_stats['evict_l2_eligible']))
     prt_i2('L2 eligible MFU evictions:',
@@ -48,7 +48,7 @@ index 9d0c2d30d..fd2581ae2 100755
     prt_i1('L2 ineligible evictions:',
            f_bytes(arc_stats['evict_l2_ineligible']))
     print()
-@@ -757,20 +757,20 @@ def section_l2arc(kstats_dict):
+@@ -851,20 +851,20 @@ def section_l2arc(kstats_dict):
            f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']),
            f_bytes(arc_stats['l2_hdr_size']))
     prt_i2('MFU allocated size:',
@@ -79,11 +79,11 @@ index 9d0c2d30d..fd2581ae2 100755
     print()
     prt_1('L2ARC breakdown:', f_hits(l2_access_total))
-diff --git a/cmd/arcstat/arcstat.in b/cmd/arcstat/arcstat.in
+diff --git a/cmd/arcstat.in b/cmd/arcstat.in
-index d2b2e28d1..8004940b3 100755
+index 8df1c62f7..833348d0e 100755
--- a/cmd/arcstat/arcstat.in
+--- a/cmd/arcstat.in
-+++ b/cmd/arcstat/arcstat.in
+++ b/cmd/arcstat.in
-@@ -482,8 +482,8 @@ def calculate():
+@@ -565,8 +565,8 @@ def calculate():
     v["el2skip"] = d["evict_l2_skip"] // sint
     v["el2cach"] = d["evict_l2_cached"] // sint
     v["el2el"] = d["evict_l2_eligible"] // sint
@@ -94,7 +94,7 @@ index d2b2e28d1..8004940b3 100755
     v["el2inel"] = d["evict_l2_ineligible"] // sint
     v["mtxmis"] = d["mutex_miss"] // sint
-@@ -498,11 +498,11 @@ def calculate():
+@@ -581,11 +581,11 @@ def calculate():
         v["l2size"] = cur["l2_size"]
         v["l2bytes"] = d["l2_read_bytes"] // sint
@@ -1,134 +0,0 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Valmiky Arquissandas <kayvlim@gmail.com>
 Date: Fri, 8 Oct 2021 16:32:27 +0100
 Subject: [PATCH] arcstat: Fix integer division with python3
 The arcstat script requests compatibility with python2 and python3, but
 PEP 238 modified the / operator and results in erroneous output when
 run under python3.
 This commit replaces instances of / with //, yielding the expected
 result in both versions of Python.
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Reviewed-by: John Kennedy <john.kennedy@delphix.com>
 Reviewed-by: Ryan Moeller <ryan@ixsystems.com>
 Signed-off-by: Valmiky Arquissandas <foss@kayvlim.com>
 Closes #12603
 (cherry picked from commit 2d02bba23d83ae8fede8d281edc255f01ccd28e9)
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 ---
 cmd/arcstat/arcstat.in | 66 +++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 33 deletions(-)
 diff --git a/cmd/arcstat/arcstat.in b/cmd/arcstat/arcstat.in
 index 0128fd817..d2b2e28d1 100755
 --- a/cmd/arcstat/arcstat.in
 +++ b/cmd/arcstat/arcstat.in
@@ -441,73 +441,73 @@ def calculate():
     v = dict()
     v["time"] = time.strftime("%H:%M:%S", time.localtime())
 -    v["hits"] = d["hits"] / sint
 -    v["miss"] = d["misses"] / sint
 +    v["hits"] = d["hits"] // sint
 +    v["miss"] = d["misses"] // sint
     v["read"] = v["hits"] + v["miss"]
 -    v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0
 +    v["hit%"] = 100 * v["hits"] // v["read"] if v["read"] > 0 else 0
     v["miss%"] = 100 - v["hit%"] if v["read"] > 0 else 0
 -    v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint
 -    v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint
 +    v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) // sint
 +    v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) // sint
     v["dread"] = v["dhit"] + v["dmis"]
 -    v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0
 +    v["dh%"] = 100 * v["dhit"] // v["dread"] if v["dread"] > 0 else 0
     v["dm%"] = 100 - v["dh%"] if v["dread"] > 0 else 0
 -    v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint
 +    v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) // sint
     v["pmis"] = (d["prefetch_data_misses"] +
 -                 d["prefetch_metadata_misses"]) / sint
 +                 d["prefetch_metadata_misses"]) // sint
     v["pread"] = v["phit"] + v["pmis"]
 -    v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0
 +    v["ph%"] = 100 * v["phit"] // v["pread"] if v["pread"] > 0 else 0
     v["pm%"] = 100 - v["ph%"] if v["pread"] > 0 else 0
     v["mhit"] = (d["prefetch_metadata_hits"] +
 -                 d["demand_metadata_hits"]) / sint
 +                 d["demand_metadata_hits"]) // sint
     v["mmis"] = (d["prefetch_metadata_misses"] +
 -                 d["demand_metadata_misses"]) / sint
 +                 d["demand_metadata_misses"]) // sint
     v["mread"] = v["mhit"] + v["mmis"]
 -    v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0
 +    v["mh%"] = 100 * v["mhit"] // v["mread"] if v["mread"] > 0 else 0
     v["mm%"] = 100 - v["mh%"] if v["mread"] > 0 else 0
     v["arcsz"] = cur["size"]
     v["size"] = cur["size"]
     v["c"] = cur["c"]
 -    v["mfu"] = d["mfu_hits"] / sint
 -    v["mru"] = d["mru_hits"] / sint
 -    v["mrug"] = d["mru_ghost_hits"] / sint
 -    v["mfug"] = d["mfu_ghost_hits"] / sint
 -    v["eskip"] = d["evict_skip"] / sint
 -    v["el2skip"] = d["evict_l2_skip"] / sint
 -    v["el2cach"] = d["evict_l2_cached"] / sint
 -    v["el2el"] = d["evict_l2_eligible"] / sint
 -    v["el2mfu"] = d["evict_l2_eligible_mfu"] / sint
 -    v["el2mru"] = d["evict_l2_eligible_mru"] / sint
 -    v["el2inel"] = d["evict_l2_ineligible"] / sint
 -    v["mtxmis"] = d["mutex_miss"] / sint
 +    v["mfu"] = d["mfu_hits"] // sint
 +    v["mru"] = d["mru_hits"] // sint
 +    v["mrug"] = d["mru_ghost_hits"] // sint
 +    v["mfug"] = d["mfu_ghost_hits"] // sint
 +    v["eskip"] = d["evict_skip"] // sint
 +    v["el2skip"] = d["evict_l2_skip"] // sint
 +    v["el2cach"] = d["evict_l2_cached"] // sint
 +    v["el2el"] = d["evict_l2_eligible"] // sint
 +    v["el2mfu"] = d["evict_l2_eligible_mfu"] // sint
 +    v["el2mru"] = d["evict_l2_eligible_mru"] // sint
 +    v["el2inel"] = d["evict_l2_ineligible"] // sint
 +    v["mtxmis"] = d["mutex_miss"] // sint
     if l2exist:
 -        v["l2hits"] = d["l2_hits"] / sint
 -        v["l2miss"] = d["l2_misses"] / sint
 +        v["l2hits"] = d["l2_hits"] // sint
 +        v["l2miss"] = d["l2_misses"] // sint
         v["l2read"] = v["l2hits"] + v["l2miss"]
 -        v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if v["l2read"] > 0 else 0
 +        v["l2hit%"] = 100 * v["l2hits"] // v["l2read"] if v["l2read"] > 0 else 0
         v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0
         v["l2asize"] = cur["l2_asize"]
         v["l2size"] = cur["l2_size"]
 -        v["l2bytes"] = d["l2_read_bytes"] / sint
 +        v["l2bytes"] = d["l2_read_bytes"] // sint
         v["l2pref"] = cur["l2_prefetch_asize"]
         v["l2mfu"] = cur["l2_mfu_asize"]
         v["l2mru"] = cur["l2_mru_asize"]
         v["l2data"] = cur["l2_bufc_data_asize"]
         v["l2meta"] = cur["l2_bufc_metadata_asize"]
 -        v["l2pref%"] = 100 * v["l2pref"] / v["l2asize"]
 -        v["l2mfu%"] = 100 * v["l2mfu"] / v["l2asize"]
 -        v["l2mru%"] = 100 * v["l2mru"] / v["l2asize"]
 -        v["l2data%"] = 100 * v["l2data"] / v["l2asize"]
 -        v["l2meta%"] = 100 * v["l2meta"] / v["l2asize"]
 +        v["l2pref%"] = 100 * v["l2pref"] // v["l2asize"]
 +        v["l2mfu%"] = 100 * v["l2mfu"] // v["l2asize"]
 +        v["l2mru%"] = 100 * v["l2mru"] // v["l2asize"]
 +        v["l2data%"] = 100 * v["l2data"] // v["l2asize"]
 +        v["l2meta%"] = 100 * v["l2meta"] // v["l2asize"]
     v["grow"] = 0 if cur["arc_no_grow"] else 1
     v["need"] = cur["arc_need_free"]
@@ -0,0 +1,76 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: siv0 <github@nomore.at>
 Date: Tue, 31 Oct 2023 21:57:54 +0100
 Subject: [PATCH] Fix nfs_truncate_shares without /etc/exports.d
 Calling nfs_reset_shares on Linux prints a warning:
 `failed to lock /etc/exports.d/zfs.exports.lock: No such file or
 directory`
 when /etc/exports.d does not exist. The directory gets created, when a
 filesystem is actually exported through nfs_toggle_share and
 nfs_init_share. The truncation of /etc/exports.d/zfs.exports happens
 unconditionally when calling `zfs mount -a` (via zfs_do_mount and
 share_mount in `cmd/zfs/zfs_main.c`).
 Fixing the issue only in the Linux part, since the exports file on
 FreeBSD is in `/etc/zfs/`, which seems present on 2 FreeBSD systems I
 have access to (through `/etc/zfs/compatibility.d/`), while a Debian
 box does not have the directory even if `/usr/sbin/exportfs` is
 present through the `nfs-kernel-server` package.
 The code for exports_available is copied from nfs_available above.
 Fixes: ede037cda73675f42b1452187e8dd3438fafc220
 ("Make zfs-share service resilient to stale exports")
 Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
 Closes #15369
 Closes #15468
 (cherry picked from commit 41e55b476bcfc90f1ad81c02c5375367fdace9e9)
 Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
 ---
 lib/libshare/os/linux/nfs.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 diff --git a/lib/libshare/os/linux/nfs.c b/lib/libshare/os/linux/nfs.c
 index 004946b0c..3dce81840 100644
 --- a/lib/libshare/os/linux/nfs.c
 +++ b/lib/libshare/os/linux/nfs.c
@@ -47,6 +47,7 @@
 static boolean_t nfs_available(void);
 +static boolean_t exports_available(void);
 typedef int (*nfs_shareopt_callback_t)(const char *opt, const char *value,
     void *cookie);
@@ -539,6 +540,8 @@ nfs_commit_shares(void)
 static void
 nfs_truncate_shares(void)
 {
 +	if (!exports_available())
 +		return;
 	nfs_reset_shares(ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE);
 }
@@ -566,3 +569,18 @@ nfs_available(void)
 	return (avail == 1);
 }
 +
 +static boolean_t
 +exports_available(void)
 +{
 +	static int avail;
 +
 +	if (!avail) {
 +		if (access(ZFS_EXPORTS_DIR, F_OK) != 0)
 +			avail = -1;
 +		else
 +			avail = 1;
 +	}
 +
 +	return (avail == 1);
 +}
@@ -0,0 +1,66 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Thomas Lamprecht <t.lamprecht@proxmox.com>
 Date: Sun, 12 Nov 2023 15:52:25 +0100
 Subject: [PATCH] zpool status: tighten bounds for noalloc stat availability
 When running zfs 2.2.0 userspace utilities with a kernel that still
 has 2.1.13 modules zpool status adds `(non-allocating)` next to the
 disk name of a single-disk pool.
 The reason for this seems to be that the patch adding the `vs_pspace` field was
 backported, but the one adding `vs_noalloc` was not.
 In itself that is not a problem, but in 2.2 `noalloc` was added before `pspace`,
 so the struct layouts of 2.1.13 and 2.2.0 do NOT match anymore...
 I.e., the struct looks like the following at the end for ZFS 2.1.x:
 ```
 typedef struct vdev_stat {
    hrtime_t        vs_timestamp;           /* time since vdev load */
    // snip
    uint64_t        vs_logical_ashift;      /* vdev_logical_ashift  */
    uint64_t        vs_physical_ashift;     /* vdev_physical_ashift */
    uint64_t        vs_pspace;              /* physical capacity */
 } vdev_stat_t;
 ```
 And like the following on ZFS 2.2.x:
 ```
 typedef struct vdev_stat {
    hrtime_t        vs_timestamp;           /* time since vdev load */
    // snip
    uint64_t        vs_logical_ashift;      /* vdev_logical_ashift  */
    uint64_t        vs_physical_ashift;     /* vdev_physical_ashift */
    uint64_t        vs_noalloc;             /* allocations halted?  */
    uint64_t        vs_pspace;              /* physical capacity */
 } vdev_stat_t;
 ```
 Resulting in 2.2.x user-space tooling interpreting the `vs_pspace` field from
 the 2.1.x kernel module as `vs_noalloc` field.
 For now, work-around that discrepancy by coupling the availability of
 the vs_noalloc field with the one of the vs_pspace one, as when both
 are returned from the module we can be sure that our struct layout
 matches again.
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 ---
 cmd/zpool/zpool_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
 diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
 index 69bf9649a..fd42ce7c1 100644
 --- a/cmd/zpool/zpool_main.c
 +++ b/cmd/zpool/zpool_main.c
@@ -2616,7 +2616,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
 	if (vs->vs_scan_removing != 0) {
 		(void) printf(gettext("  (removing)"));
 -	} else if (VDEV_STAT_VALID(vs_noalloc, vsc) && vs->vs_noalloc != 0) {
 +	} else if (VDEV_STAT_VALID(vs_pspace, vsc)
 +		       && VDEV_STAT_VALID(vs_noalloc, vsc) && vs->vs_noalloc != 0) {
 		(void) printf(gettext("  (non-allocating)"));
 	}
@@ -0,0 +1,52 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
 Date: Wed, 6 Mar 2024 10:39:06 +0100
 Subject: [PATCH] udev: correctly handle partition #16 and later
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 If a zvol has more than 15 partitions, the minor device number exhausts
 the slot count reserved for partitions next to the zvol itself. As a
 result, the minor number cannot be used to determine the partition
 number for the higher partition, and doing so results in wrongly named
 symlinks being generated by udev.
 Since the partition number is encoded in the block device name anyway,
 let's just extract it from there instead.
 Fixes: #15904
 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
 Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
 ---
 udev/zvol_id.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
 diff --git a/udev/zvol_id.c b/udev/zvol_id.c
 index 5960b9787..609349594 100644
 --- a/udev/zvol_id.c
 +++ b/udev/zvol_id.c
@@ -51,7 +51,7 @@ const char *__asan_default_options(void) {
 int
 main(int argc, const char *const *argv)
 {
 -	if (argc != 2) {
 +	if (argc != 2 || strncmp(argv[1], "/dev/zd", 7) != 0) {
 		fprintf(stderr, "usage: %s /dev/zdX\n", argv[0]);
 		return (1);
 	}
@@ -72,9 +72,10 @@ main(int argc, const char *const *argv)
 		return (1);
 	}
 -	unsigned int dev_part = minor(sb.st_rdev) % ZVOL_MINORS;
 -	if (dev_part != 0)
 -		sprintf(zvol_name + strlen(zvol_name), "-part%u", dev_part);
 +	const char *dev_part = strrchr(dev_name, 'p');
 +	if (dev_part != NULL) {
 +		sprintf(zvol_name + strlen(zvol_name), "-part%s", dev_part + 1);
 +	}
 	for (size_t i = 0; i < strlen(zvol_name); ++i)
 		if (isblank(zvol_name[i]))
@@ -0,0 +1,135 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Rob N <robn@despairlabs.com>
 Date: Thu, 21 Mar 2024 10:46:15 +1100
 Subject: [PATCH] Linux 6.8 compat: use splice_copy_file_range() for fallback
 Linux 6.8 removes generic_copy_file_range(), which had been reduced to a
 simple wrapper around splice_copy_file_range(). Detect that function
 directly and use it if generic_ is not available.
 Sponsored-by: https://despairlabs.com/sponsor/
 Reviewed-by: Tony Hutter <hutter2@llnl.gov>
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Rob Norris <robn@despairlabs.com>
 Closes #15930
 Closes #15931
 (cherry picked from commit ef08a4d4065d21414d7fedccac20da6bfda4dfd0)
 ---
 config/kernel-vfs-file_range.m4      | 27 +++++++++++++++++++++++++++
 config/kernel.m4                     |  2 ++
 module/os/linux/zfs/zpl_file_range.c | 16 ++++++++++++++--
 3 files changed, 43 insertions(+), 2 deletions(-)
 diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4
 index cc96404d8..8a5cbe2ee 100644
 --- a/config/kernel-vfs-file_range.m4
 +++ b/config/kernel-vfs-file_range.m4
@@ -16,6 +16,9 @@ dnl #
 dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
 dnl #      generic_copy_file_range() added to support it
 dnl #
 +dnl # 6.8: generic_copy_file_range() removed, replaced by
 +dnl #      splice_copy_file_range()
 +dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
 	ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
 		#include <linux/fs.h>
@@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
 	])
 ])
 +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [
 +	ZFS_LINUX_TEST_SRC([splice_copy_file_range], [
 +		#include <linux/splice.h>
 +	], [
 +		struct file *src_file __attribute__ ((unused)) = NULL;
 +		loff_t src_off __attribute__ ((unused)) = 0;
 +		struct file *dst_file __attribute__ ((unused)) = NULL;
 +		loff_t dst_off __attribute__ ((unused)) = 0;
 +		size_t len __attribute__ ((unused)) = 0;
 +		splice_copy_file_range(src_file, src_off, dst_file, dst_off,
 +		    len);
 +	])
 +])
 +AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [
 +	AC_MSG_CHECKING([whether splice_copy_file_range() is available])
 +	ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [
 +		AC_MSG_RESULT(yes)
 +		AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1,
 +		    [splice_copy_file_range() is available])
 +	],[
 +		AC_MSG_RESULT(no)
 +	])
 +])
 +
 AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
 	ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
 		#include <linux/fs.h>
 diff --git a/config/kernel.m4 b/config/kernel.m4
 index e3f864577..1d0c5a27f 100644
 --- a/config/kernel.m4
 +++ b/config/kernel.m4
@@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
 	ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
 +	ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
@@ -266,6 +267,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_VFS_IOV_ITER
 	ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
 +	ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
 diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c
 index 3065d54fa..64728fdb1 100644
 --- a/module/os/linux/zfs/zpl_file_range.c
 +++ b/module/os/linux/zfs/zpl_file_range.c
@@ -26,6 +26,9 @@
 #include <linux/compat.h>
 #endif
 #include <linux/fs.h>
 +#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE
 +#include <linux/splice.h>
 +#endif
 #include <sys/file.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_vnops.h>
@@ -102,7 +105,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
 	ret = zpl_clone_file_range_impl(src_file, src_off,
 	    dst_file, dst_off, len);
 -#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
 +#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
 	/*
 	 * Since Linux 5.3 the filesystem driver is responsible for executing
 	 * an appropriate fallback, and a generic fallback function is provided.
@@ -111,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
 	    ret == -EAGAIN)
 		ret = generic_copy_file_range(src_file, src_off, dst_file,
 		    dst_off, len, flags);
 +#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
 +	/*
 +	 * Since 6.8 the fallback function is called splice_copy_file_range
 +	 * and has a slightly different signature.
 +	 */
 +	if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
 +	    ret == -EAGAIN)
 +		ret = splice_copy_file_range(src_file, src_off, dst_file,
 +		    dst_off, len);
 #else
 	/*
 	 * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
@@ -118,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
 	 */
 	if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
 		ret = -EOPNOTSUPP;
 -#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
 +#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */
 	return (ret);
 }
@@ -0,0 +1,121 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Rob Norris <rob.norris@klarasystems.com>
 Date: Mon, 13 Nov 2023 17:55:29 +1100
 Subject: [PATCH] linux 5.4 compat: page_size()
 Before 5.4 we have to do a little math.
 Reviewed-by: Alexander Motin <mav@FreeBSD.org>
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
 Sponsored-by: Klara, Inc.
 Sponsored-by: Wasabi Technology, Inc.
 Closes #15533
 Closes #15588
 (cherry picked from commit df04efe321a49c650f1fbaa6fd701fa2928cbe21)
 ---
 config/kernel-mm-page-size.m4             | 17 +++++++++++
 config/kernel.m4                          |  2 ++
 include/os/linux/Makefile.am              |  1 +
 include/os/linux/kernel/linux/mm_compat.h | 36 +++++++++++++++++++++++
 4 files changed, 56 insertions(+)
 create mode 100644 config/kernel-mm-page-size.m4
 create mode 100644 include/os/linux/kernel/linux/mm_compat.h
 diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4
 new file mode 100644
 index 000000000..d5ebd9269
 --- /dev/null
 +++ b/config/kernel-mm-page-size.m4
@@ -0,0 +1,17 @@
 +AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
 +	ZFS_LINUX_TEST_SRC([page_size], [
 +		#include <linux/mm.h>
 +	],[
 +		unsigned long s;
 +		s = page_size(NULL);
 +	])
 +])
 +AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
 +	AC_MSG_CHECKING([whether page_size() is available])
 +	ZFS_LINUX_TEST_RESULT([page_size], [
 +		AC_MSG_RESULT(yes)
 +		AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
 +	],[
 +		AC_MSG_RESULT(no)
 +	])
 +])
 diff --git a/config/kernel.m4 b/config/kernel.m4
 index 1d0c5a27f..548905ccd 100644
 --- a/config/kernel.m4
 +++ b/config/kernel.m4
@@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
 	ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SRC_SYNC_BDEV
 +	ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -316,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
 	ZFS_AC_KERNEL_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SYNC_BDEV
 +	ZFS_AC_KERNEL_MM_PAGE_SIZE
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_CPU_HAS_FEATURE
 diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am
 index 3830d198d..51c27132b 100644
 --- a/include/os/linux/Makefile.am
 +++ b/include/os/linux/Makefile.am
@@ -5,6 +5,7 @@ kernel_linux_HEADERS = \
 	%D%/kernel/linux/compiler_compat.h \
 	%D%/kernel/linux/dcache_compat.h \
 	%D%/kernel/linux/kmap_compat.h \
 +	%D%/kernel/linux/mm_compat.h \
 	%D%/kernel/linux/mod_compat.h \
 	%D%/kernel/linux/page_compat.h \
 	%D%/kernel/linux/percpu_compat.h \
 diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h
 new file mode 100644
 index 000000000..40056c68d
 --- /dev/null
 +++ b/include/os/linux/kernel/linux/mm_compat.h
@@ -0,0 +1,36 @@
 +/*
 + * CDDL HEADER START
 + *
 + * The contents of this file are subject to the terms of the
 + * Common Development and Distribution License (the "License").
 + * You may not use this file except in compliance with the License.
 + *
 + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 + * or https://opensource.org/licenses/CDDL-1.0.
 + * See the License for the specific language governing permissions
 + * and limitations under the License.
 + *
 + * When distributing Covered Code, include this CDDL HEADER in each
 + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 + * If applicable, add the following below this CDDL HEADER, with the
 + * fields enclosed by brackets "[]" replaced with your own identifying
 + * information: Portions Copyright [yyyy] [name of copyright owner]
 + *
 + * CDDL HEADER END
 + */
 +
 +/*
 + * Copyright (c) 2023, 2024, Klara Inc.
 + */
 +
 +#ifndef _ZFS_MM_COMPAT_H
 +#define	_ZFS_MM_COMPAT_H
 +
 +#include <linux/mm.h>
 +
 +/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
 +#ifndef HAVE_MM_PAGE_SIZE
 +#define	page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
 +#endif
 +
 +#endif /* _ZFS_MM_COMPAT_H */
@@ -0,0 +1,334 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Rob Norris <rob.norris@klarasystems.com>
 Date: Mon, 11 Dec 2023 16:05:54 +1100
 Subject: [PATCH] abd: add page iterator
 The regular ABD iterators yield data buffers, so they have to map and
 unmap pages into kernel memory. If the caller only wants to count
 chunks, or can use page pointers directly, then the map/unmap is just
 unnecessary overhead.
 This adds abd_iterate_page_func, which yields unmapped struct page
 instead.
 Reviewed-by: Alexander Motin <mav@FreeBSD.org>
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
 Sponsored-by: Klara, Inc.
 Sponsored-by: Wasabi Technology, Inc.
 Closes #15533
 Closes #15588
 (cherry picked from commit 390b448726c580999dd337be7a40b0e95cf1d50b)
 ---
 include/sys/abd.h              |   7 +++
 include/sys/abd_impl.h         |  26 ++++++++-
 module/os/freebsd/zfs/abd_os.c |   4 +-
 module/os/linux/zfs/abd_os.c   | 104 ++++++++++++++++++++++++++++++---
 module/zfs/abd.c               |  42 +++++++++++++
 5 files changed, 169 insertions(+), 14 deletions(-)
 diff --git a/include/sys/abd.h b/include/sys/abd.h
 index 750f9986c..8a2df0bca 100644
 --- a/include/sys/abd.h
 +++ b/include/sys/abd.h
@@ -79,6 +79,9 @@ typedef struct abd {
 typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
 typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
 +#if defined(__linux__) && defined(_KERNEL)
 +typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
 +#endif
 extern int zfs_abd_scatter_enabled;
@@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
 int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
 int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
     abd_iter_func2_t *, void *);
 +#if defined(__linux__) && defined(_KERNEL)
 +int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
 +    void *);
 +#endif
 void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
 void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
 void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
 diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h
 index 40546d4af..f88ea25e2 100644
 --- a/include/sys/abd_impl.h
 +++ b/include/sys/abd_impl.h
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 + * Copyright (c) 2023, 2024, Klara Inc.
  */
 #ifndef _ABD_IMPL_H
@@ -38,12 +39,30 @@ typedef enum abd_stats_op {
 	ABDSTAT_DECR  /* Decrease abdstat values */
 } abd_stats_op_t;
 -struct scatterlist; /* forward declaration */
 +/* forward declarations */
 +struct scatterlist;
 +struct page;
 struct abd_iter {
 	/* public interface */
 -	void		*iter_mapaddr;	/* addr corresponding to iter_pos */
 -	size_t		iter_mapsize;	/* length of data valid at mapaddr */
 +	union {
 +		/* for abd_iter_map()/abd_iter_unmap() */
 +		struct {
 +			/* addr corresponding to iter_pos */
 +			void		*iter_mapaddr;
 +			/* length of data valid at mapaddr */
 +			size_t		iter_mapsize;
 +		};
 +		/* for abd_iter_page() */
 +		struct {
 +			/* current page */
 +			struct page	*iter_page;
 +			/* offset of data in page */
 +			size_t		iter_page_doff;
 +			/* size of data in page */
 +			size_t		iter_page_dsize;
 +		};
 +	};
 	/* private */
 	abd_t		*iter_abd;	/* ABD being iterated through */
@@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
 void abd_iter_advance(struct abd_iter *, size_t);
 void abd_iter_map(struct abd_iter *);
 void abd_iter_unmap(struct abd_iter *);
 +void abd_iter_page(struct abd_iter *);
 /*
  * Helper macros
 diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
 index 58a37df62..3b812271f 100644
 --- a/module/os/freebsd/zfs/abd_os.c
 +++ b/module/os/freebsd/zfs/abd_os.c
@@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
 {
 	ASSERT(!abd_is_gang(abd));
 	abd_verify(abd);
 +	memset(aiter, 0, sizeof (struct abd_iter));
 	aiter->iter_abd = abd;
 -	aiter->iter_pos = 0;
 -	aiter->iter_mapaddr = NULL;
 -	aiter->iter_mapsize = 0;
 }
 /*
 diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
 index 24390fbbf..dae128012 100644
 --- a/module/os/linux/zfs/abd_os.c
 +++ b/module/os/linux/zfs/abd_os.c
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2019 by Delphix. All rights reserved.
 + * Copyright (c) 2023, 2024, Klara Inc.
  */
 /*
@@ -59,6 +60,7 @@
 #include <sys/zfs_znode.h>
 #ifdef _KERNEL
 #include <linux/kmap_compat.h>
 +#include <linux/mm_compat.h>
 #include <linux/scatterlist.h>
 #endif
@@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
 {
 	ASSERT(!abd_is_gang(abd));
 	abd_verify(abd);
 +	memset(aiter, 0, sizeof (struct abd_iter));
 	aiter->iter_abd = abd;
 -	aiter->iter_mapaddr = NULL;
 -	aiter->iter_mapsize = 0;
 -	aiter->iter_pos = 0;
 -	if (abd_is_linear(abd)) {
 -		aiter->iter_offset = 0;
 -		aiter->iter_sg = NULL;
 -	} else {
 +	if (!abd_is_linear(abd)) {
 		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
 		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
 	}
@@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
 boolean_t
 abd_iter_at_end(struct abd_iter *aiter)
 {
 +	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
 	return (aiter->iter_pos == aiter->iter_abd->abd_size);
 }
@@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter)
 void
 abd_iter_advance(struct abd_iter *aiter, size_t amount)
 {
 +	/*
 +	 * Ensure that last chunk is not in use. abd_iterate_*() must clear
 +	 * this state (directly or abd_iter_unmap()) before advancing.
 +	 */
 	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
 	ASSERT0(aiter->iter_mapsize);
 +	ASSERT3P(aiter->iter_page, ==, NULL);
 +	ASSERT0(aiter->iter_page_doff);
 +	ASSERT0(aiter->iter_page_dsize);
 	/* There's nothing left to advance to, so do nothing */
 	if (abd_iter_at_end(aiter))
@@ -1009,6 +1014,88 @@ abd_cache_reap_now(void)
 }
 #if defined(_KERNEL)
 +/*
 + * Yield the next page struct and data offset and size within it, without
 + * mapping it into the address space.
 + */
 +void
 +abd_iter_page(struct abd_iter *aiter)
 +{
 +	if (abd_iter_at_end(aiter)) {
 +		aiter->iter_page = NULL;
 +		aiter->iter_page_doff = 0;
 +		aiter->iter_page_dsize = 0;
 +		return;
 +	}
 +
 +	struct page *page;
 +	size_t doff, dsize;
 +
 +	if (abd_is_linear(aiter->iter_abd)) {
 +		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
 +
 +		/* memory address at iter_pos */
 +		void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
 +
 +		/* struct page for address */
 +		page = is_vmalloc_addr(paddr) ?
 +		    vmalloc_to_page(paddr) : virt_to_page(paddr);
 +
 +		/* offset of address within the page */
 +		doff = offset_in_page(paddr);
 +
 +		/* total data remaining in abd from this position */
 +		dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
 +	} else {
 +		ASSERT(!abd_is_gang(aiter->iter_abd));
 +
 +		/* current scatter page */
 +		page = sg_page(aiter->iter_sg);
 +
 +		/* position within page */
 +		doff = aiter->iter_offset;
 +
 +		/* remaining data in scatterlist */
 +		dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
 +		    aiter->iter_abd->abd_size - aiter->iter_pos);
 +	}
 +	ASSERT(page);
 +
 +	if (PageTail(page)) {
 +		/*
 +		 * This page is part of a "compound page", which is a group of
 +		 * pages that can be referenced from a single struct page *.
 +		 * Its organised as a "head" page, followed by a series of
 +		 * "tail" pages.
 +		 *
 +		 * In OpenZFS, compound pages are allocated using the
 +		 * __GFP_COMP flag, which we get from scatter ABDs and SPL
 +		 * vmalloc slabs (ie >16K allocations). So a great many of the
 +		 * IO buffers we get are going to be of this type.
 +		 *
 +		 * The tail pages are just regular PAGE_SIZE pages, and can be
 +		 * safely used as-is. However, the head page has length
 +		 * covering itself and all the tail pages. If this ABD chunk
 +		 * spans multiple pages, then we can use the head page and a
 +		 * >PAGE_SIZE length, which is far more efficient.
 +		 *
 +		 * To do this, we need to adjust the offset to be counted from
 +		 * the head page. struct page for compound pages are stored
 +		 * contiguously, so we can just adjust by a simple offset.
 +		 */
 +		struct page *head = compound_head(page);
 +		doff += ((page - head) * PAGESIZE);
 +		page = head;
 +	}
 +
 +	/* final page and position within it */
 +	aiter->iter_page = page;
 +	aiter->iter_page_doff = doff;
 +
 +	/* amount of data in the chunk, up to the end of the page */
 +	aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
 +}
 +
 /*
  * bio_nr_pages for ABD.
  * @off is the offset in @abd
@@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
 module_param(zfs_abd_scatter_max_order, uint, 0644);
 MODULE_PARM_DESC(zfs_abd_scatter_max_order,
 	"Maximum order allocation used for a scatter ABD.");
 -#endif
 +
 +#endif /* _KERNEL */
 diff --git a/module/zfs/abd.c b/module/zfs/abd.c
 index d982f201c..3388e2357 100644
 --- a/module/zfs/abd.c
 +++ b/module/zfs/abd.c
@@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
 	return (ret);
 }
 +#if defined(__linux__) && defined(_KERNEL)
 +int
 +abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
 +    abd_iter_page_func_t *func, void *private)
 +{
 +	struct abd_iter aiter;
 +	int ret = 0;
 +
 +	if (size == 0)
 +		return (0);
 +
 +	abd_verify(abd);
 +	ASSERT3U(off + size, <=, abd->abd_size);
 +
 +	abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
 +
 +	while (size > 0) {
 +		IMPLY(abd_is_gang(abd), c_abd != NULL);
 +
 +		abd_iter_page(&aiter);
 +
 +		size_t len = MIN(aiter.iter_page_dsize, size);
 +		ASSERT3U(len, >, 0);
 +
 +		ret = func(aiter.iter_page, aiter.iter_page_doff,
 +		    len, private);
 +
 +		aiter.iter_page = NULL;
 +		aiter.iter_page_doff = 0;
 +		aiter.iter_page_dsize = 0;
 +
 +		if (ret != 0)
 +			break;
 +
 +		size -= len;
 +		c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
 +	}
 +
 +	return (ret);
 +}
 +#endif
 +
 struct buf_arg {
 	void *arg_buf;
 };
@@ -0,0 +1,349 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Rob Norris <rob.norris@klarasystems.com>
 Date: Tue, 9 Jan 2024 12:12:56 +1100
 Subject: [PATCH] vdev_disk: rename existing functions to vdev_classic_*
 This is just renaming the existing functions we're about to replace and
 grouping them together to make the next commits easier to follow.
 Reviewed-by: Alexander Motin <mav@FreeBSD.org>
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
 Sponsored-by: Klara, Inc.
 Sponsored-by: Wasabi Technology, Inc.
 Closes #15533
 Closes #15588
 (cherry picked from commit f3b85d706bae82957d2e3e0ef1d53a1cfab60eb4)
 ---
 include/sys/abd.h               |   2 +
 module/os/linux/zfs/abd_os.c    |   5 +
 module/os/linux/zfs/vdev_disk.c | 215 +++++++++++++++++---------------
 3 files changed, 120 insertions(+), 102 deletions(-)
 diff --git a/include/sys/abd.h b/include/sys/abd.h
 index 8a2df0bca..bee38b831 100644
 --- a/include/sys/abd.h
 +++ b/include/sys/abd.h
@@ -220,6 +220,8 @@ void abd_fini(void);
 /*
  * Linux ABD bio functions
 + * Note: these are only needed to support vdev_classic. See comment in
 + * vdev_disk.c.
  */
 #if defined(__linux__) && defined(_KERNEL)
 unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
 diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
 index dae128012..3fe01c0b7 100644
 --- a/module/os/linux/zfs/abd_os.c
 +++ b/module/os/linux/zfs/abd_os.c
@@ -1096,6 +1096,11 @@ abd_iter_page(struct abd_iter *aiter)
 	aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
 }
 +/*
 + * Note: ABD BIO functions only needed to support vdev_classic. See comments in
 + * vdev_disk.c.
 + */
 +
 /*
  * bio_nr_pages for ABD.
  * @off is the offset in @abd
 diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
 index b0bda5fa2..957619b87 100644
 --- a/module/os/linux/zfs/vdev_disk.c
 +++ b/module/os/linux/zfs/vdev_disk.c
@@ -83,17 +83,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000;
  */
 #define	EFI_MIN_RESV_SIZE	(16 * 1024)
 -/*
 - * Virtual device vector for disks.
 - */
 -typedef struct dio_request {
 -	zio_t			*dr_zio;	/* Parent ZIO */
 -	atomic_t		dr_ref;		/* References */
 -	int			dr_error;	/* Bio error */
 -	int			dr_bio_count;	/* Count of bio's */
 -	struct bio		*dr_bio[];	/* Attached bio's */
 -} dio_request_t;
 -
 /*
  * BIO request failfast mask.
  */
@@ -467,85 +456,6 @@ vdev_disk_close(vdev_t *v)
 	v->vdev_tsd = NULL;
 }
 -static dio_request_t *
 -vdev_disk_dio_alloc(int bio_count)
 -{
 -	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
 -	    sizeof (struct bio *) * bio_count, KM_SLEEP);
 -	atomic_set(&dr->dr_ref, 0);
 -	dr->dr_bio_count = bio_count;
 -	dr->dr_error = 0;
 -
 -	for (int i = 0; i < dr->dr_bio_count; i++)
 -		dr->dr_bio[i] = NULL;
 -
 -	return (dr);
 -}
 -
 -static void
 -vdev_disk_dio_free(dio_request_t *dr)
 -{
 -	int i;
 -
 -	for (i = 0; i < dr->dr_bio_count; i++)
 -		if (dr->dr_bio[i])
 -			bio_put(dr->dr_bio[i]);
 -
 -	kmem_free(dr, sizeof (dio_request_t) +
 -	    sizeof (struct bio *) * dr->dr_bio_count);
 -}
 -
 -static void
 -vdev_disk_dio_get(dio_request_t *dr)
 -{
 -	atomic_inc(&dr->dr_ref);
 -}
 -
 -static void
 -vdev_disk_dio_put(dio_request_t *dr)
 -{
 -	int rc = atomic_dec_return(&dr->dr_ref);
 -
 -	/*
 -	 * Free the dio_request when the last reference is dropped and
 -	 * ensure zio_interpret is called only once with the correct zio
 -	 */
 -	if (rc == 0) {
 -		zio_t *zio = dr->dr_zio;
 -		int error = dr->dr_error;
 -
 -		vdev_disk_dio_free(dr);
 -
 -		if (zio) {
 -			zio->io_error = error;
 -			ASSERT3S(zio->io_error, >=, 0);
 -			if (zio->io_error)
 -				vdev_disk_error(zio);
 -
 -			zio_delay_interrupt(zio);
 -		}
 -	}
 -}
 -
 -BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
 -{
 -	dio_request_t *dr = bio->bi_private;
 -
 -	if (dr->dr_error == 0) {
 -#ifdef HAVE_1ARG_BIO_END_IO_T
 -		dr->dr_error = BIO_END_IO_ERROR(bio);
 -#else
 -		if (error)
 -			dr->dr_error = -(error);
 -		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 -			dr->dr_error = EIO;
 -#endif
 -	}
 -
 -	/* Drop reference acquired by __vdev_disk_physio */
 -	vdev_disk_dio_put(dr);
 -}
 -
 static inline void
 vdev_submit_bio_impl(struct bio *bio)
 {
@@ -697,8 +607,107 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
 	return (bio);
 }
 +/* ========== */
 +
 +/*
 + * This is the classic, battle-tested BIO submission code.
 + *
 + * These functions have been renamed to vdev_classic_* to make it clear what
 + * they belong to, but their implementations are unchanged.
 + */
 +
 +/*
 + * Per-zio request state for the classic submission path.
 + *
 + * One dio_request_t tracks the BIOs built for a single zio. dr_bio[] is a
 + * flexible array of dr_bio_count slots (sized by vdev_classic_dio_alloc()),
 + * dr_ref counts outstanding references, and dr_error holds the first error
 + * reported by a completing BIO. When the last reference is dropped the zio
 + * is handed back through zio_delay_interrupt(); see vdev_classic_dio_put().
 + */
 +typedef struct dio_request {
 +	zio_t			*dr_zio;	/* Parent ZIO */
 +	atomic_t		dr_ref;		/* References */
 +	int			dr_error;	/* Bio error */
 +	int			dr_bio_count;	/* Count of bio's */
 +	struct bio		*dr_bio[];	/* Attached bio's */
 +} dio_request_t;
 +
 +/*
 + * Allocate a zeroed dio_request_t with room for bio_count BIO pointers.
 + * The request starts with no references, no error and every slot empty.
 + * The allocation uses KM_SLEEP.
 + */
 +static dio_request_t *
 +vdev_classic_dio_alloc(int bio_count)
 +{
 +	dio_request_t *req;
 +
 +	req = kmem_zalloc(sizeof (dio_request_t) +
 +	    sizeof (struct bio *) * bio_count, KM_SLEEP);
 +
 +	req->dr_bio_count = bio_count;
 +	req->dr_error = 0;
 +	atomic_set(&req->dr_ref, 0);
 +
 +	for (int slot = req->dr_bio_count - 1; slot >= 0; slot--)
 +		req->dr_bio[slot] = NULL;
 +
 +	return (req);
 +}
 +
 +/*
 + * Release a dio_request_t: drop the reference held on each attached BIO
 + * (empty slots are skipped), then free the request itself.
 + */
 +static void
 +vdev_classic_dio_free(dio_request_t *dr)
 +{
 +	const int nbios = dr->dr_bio_count;
 +
 +	for (int slot = 0; slot < nbios; slot++) {
 +		struct bio *bio = dr->dr_bio[slot];
 +
 +		if (bio != NULL)
 +			bio_put(bio);
 +	}
 +
 +	/* Size must match the one computed in vdev_classic_dio_alloc() */
 +	kmem_free(dr, sizeof (dio_request_t) +
 +	    sizeof (struct bio *) * nbios);
 +}
 +
 +/*
 + * Take an additional reference on the request. Each reference is released
 + * with vdev_classic_dio_put().
 + */
 +static void
 +vdev_classic_dio_get(dio_request_t *dr)
 +{
 +	atomic_inc(&dr->dr_ref);
 +}
 +
 +/*
 + * Drop one reference on the request. Whoever drops the last one frees the
 + * request and, if a zio is attached, records the saved error on it and hands
 + * it to zio_delay_interrupt(), so that happens only once per request.
 + */
 +static void
 +vdev_classic_dio_put(dio_request_t *dr)
 +{
 +	if (atomic_dec_return(&dr->dr_ref) != 0)
 +		return;
 +
 +	/* Save what is still needed; dr is gone after the free below. */
 +	zio_t *parent = dr->dr_zio;
 +	const int saved_error = dr->dr_error;
 +
 +	vdev_classic_dio_free(dr);
 +
 +	if (parent == NULL)
 +		return;
 +
 +	parent->io_error = saved_error;
 +	ASSERT3S(parent->io_error, >=, 0);
 +	if (parent->io_error != 0)
 +		vdev_disk_error(parent);
 +
 +	zio_delay_interrupt(parent);
 +}
 +
 +/*
 + * BIO completion callback for the classic submission path. Records the
 + * first error seen for the owning request (found through bi_private) and
 + * drops the reference that was taken for this BIO.
 + */
 +BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
 +{
 +	dio_request_t *dr = bio->bi_private;
 +
 +	/* Only the first error is kept; later ones are ignored. */
 +	if (dr->dr_error == 0) {
 +#ifdef HAVE_1ARG_BIO_END_IO_T
 +		dr->dr_error = BIO_END_IO_ERROR(bio);
 +#else
 +		/* The error argument is negated before being stored. */
 +		if (error)
 +			dr->dr_error = -(error);
 +		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 +			dr->dr_error = EIO;
 +#endif
 +	}
 +
 +	/* Drop reference acquired by vdev_classic_physio */
 +	vdev_classic_dio_put(dr);
 +}
 +
 static inline unsigned int
 -vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
 +vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
 {
 	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
 	    bio_size, abd_offset);
@@ -711,7 +720,7 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
 }
 static int
 -__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
 +vdev_classic_physio(struct block_device *bdev, zio_t *zio,
     size_t io_size, uint64_t io_offset, int rw, int flags)
 {
 	dio_request_t *dr;
@@ -736,7 +745,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
 	}
 retry:
 -	dr = vdev_disk_dio_alloc(bio_count);
 +	dr = vdev_classic_dio_alloc(bio_count);
 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
 	    zio->io_vd->vdev_failfast == B_TRUE) {
@@ -771,23 +780,23 @@ retry:
 		 * this should be rare - see the comment above.
 		 */
 		if (dr->dr_bio_count == i) {
 -			vdev_disk_dio_free(dr);
 +			vdev_classic_dio_free(dr);
 			bio_count *= 2;
 			goto retry;
 		}
 -		nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
 +		nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
 		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
 		if (unlikely(dr->dr_bio[i] == NULL)) {
 -			vdev_disk_dio_free(dr);
 +			vdev_classic_dio_free(dr);
 			return (SET_ERROR(ENOMEM));
 		}
 -		/* Matching put called by vdev_disk_physio_completion */
 -		vdev_disk_dio_get(dr);
 +		/* Matching put called by vdev_classic_physio_completion */
 +		vdev_classic_dio_get(dr);
 		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
 -		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
 +		dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
 		dr->dr_bio[i]->bi_private = dr;
 		bio_set_op_attrs(dr->dr_bio[i], rw, flags);
@@ -801,7 +810,7 @@ retry:
 	}
 	/* Extra reference to protect dio_request during vdev_submit_bio */
 -	vdev_disk_dio_get(dr);
 +	vdev_classic_dio_get(dr);
 	if (dr->dr_bio_count > 1)
 		blk_start_plug(&plug);
@@ -815,11 +824,13 @@ retry:
 	if (dr->dr_bio_count > 1)
 		blk_finish_plug(&plug);
 -	vdev_disk_dio_put(dr);
 +	vdev_classic_dio_put(dr);
 	return (error);
 }
 +/* ========== */
 +
 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
 {
 	zio_t *zio = bio->bi_private;
@@ -1023,7 +1034,7 @@ vdev_disk_io_start(zio_t *zio)
 	}
 	zio->io_target_timestamp = zio_handle_io_delay(zio);
 -	error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio,
 +	error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio,
 	    zio->io_size, zio->io_offset, rw, 0);
 	rw_exit(&vd->vd_lock);
@@ -0,0 +1,111 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Rob Norris <rob.norris@klarasystems.com>
 Date: Tue, 9 Jan 2024 12:23:30 +1100
 Subject: [PATCH] vdev_disk: reorganise vdev_disk_io_start
 Light reshuffle to make it a bit more linear to read and get rid of a
 bunch of args that aren't needed in all cases.
 Reviewed-by: Alexander Motin <mav@FreeBSD.org>
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
 Sponsored-by: Klara, Inc.
 Sponsored-by: Wasabi Technology, Inc.
 Closes #15533
 Closes #15588
 (cherry picked from commit 867178ae1db28e73051c8a7ce662f2f2f81cd8e6)
 ---
 module/os/linux/zfs/vdev_disk.c | 51 ++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 20 deletions(-)
 diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
 index 957619b87..51e7cef2f 100644
 --- a/module/os/linux/zfs/vdev_disk.c
 +++ b/module/os/linux/zfs/vdev_disk.c
@@ -720,9 +720,16 @@ vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
 }
 static int
 -vdev_classic_physio(struct block_device *bdev, zio_t *zio,
 -    size_t io_size, uint64_t io_offset, int rw, int flags)
 +vdev_classic_physio(zio_t *zio)
 {
 +	vdev_t *v = zio->io_vd;
 +	vdev_disk_t *vd = v->vdev_tsd;
 +	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
 +	size_t io_size = zio->io_size;
 +	uint64_t io_offset = zio->io_offset;
 +	int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
 +	int flags = 0;
 +
 	dio_request_t *dr;
 	uint64_t abd_offset;
 	uint64_t bio_offset;
@@ -944,7 +951,7 @@ vdev_disk_io_start(zio_t *zio)
 {
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
 -	int rw, error;
 +	int error;
 	/*
 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
@@ -1007,13 +1014,6 @@ vdev_disk_io_start(zio_t *zio)
 		rw_exit(&vd->vd_lock);
 		zio_execute(zio);
 		return;
 -	case ZIO_TYPE_WRITE:
 -		rw = WRITE;
 -		break;
 -
 -	case ZIO_TYPE_READ:
 -		rw = READ;
 -		break;
 	case ZIO_TYPE_TRIM:
 		zio->io_error = vdev_disk_io_trim(zio);
@@ -1026,23 +1026,34 @@ vdev_disk_io_start(zio_t *zio)
 #endif
 		return;
 -	default:
 +	case ZIO_TYPE_READ:
 +	case ZIO_TYPE_WRITE:
 +		zio->io_target_timestamp = zio_handle_io_delay(zio);
 +		error = vdev_classic_physio(zio);
 		rw_exit(&vd->vd_lock);
 -		zio->io_error = SET_ERROR(ENOTSUP);
 -		zio_interrupt(zio);
 +		if (error) {
 +			zio->io_error = error;
 +			zio_interrupt(zio);
 +		}
 		return;
 -	}
 -	zio->io_target_timestamp = zio_handle_io_delay(zio);
 -	error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio,
 -	    zio->io_size, zio->io_offset, rw, 0);
 -	rw_exit(&vd->vd_lock);
 +	default:
 +		/*
 +		 * Getting here means our parent vdev has made a very strange
 +		 * request of us, and shouldn't happen. Assert here to force a
 +		 * crash in dev builds, but in production return the IO
 +		 * unhandled. The pool will likely suspend anyway but that's
 +		 * nicer than crashing the kernel.
 +		 */
 +		ASSERT3S(zio->io_type, ==, -1);
 -	if (error) {
 -		zio->io_error = error;
 +		rw_exit(&vd->vd_lock);
 +		zio->io_error = SET_ERROR(ENOTSUP);
 		zio_interrupt(zio);
 		return;
 	}
 +
 +	__builtin_unreachable();
 }
 static void
@@ -0,0 +1,69 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Rob Norris <rob.norris@klarasystems.com>
 Date: Tue, 9 Jan 2024 12:29:19 +1100
 Subject: [PATCH] vdev_disk: make read/write IO function configurable
 This is just setting up for the next couple of commits, which will add a
 new IO function and a parameter to select it.
 Reviewed-by: Alexander Motin <mav@FreeBSD.org>
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
 Sponsored-by: Klara, Inc.
 Sponsored-by: Wasabi Technology, Inc.
 Closes #15533
 Closes #15588
 (cherry picked from commit c4a13ba483f08a81aa47479d2f763a470d95b2b0)
 ---
 module/os/linux/zfs/vdev_disk.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)
 diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
 index 51e7cef2f..de4dba72f 100644
 --- a/module/os/linux/zfs/vdev_disk.c
 +++ b/module/os/linux/zfs/vdev_disk.c
@@ -946,6 +946,8 @@ vdev_disk_io_trim(zio_t *zio)
 #endif
 }
 +/*
 + * Function used by vdev_disk_io_start() to submit read and write zios.
 + * Starts out NULL and is filled in by vdev_disk_init() if nothing has set
 + * it by then.
 + */
 +int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
 +
 static void
 vdev_disk_io_start(zio_t *zio)
 {
@@ -1029,7 +1031,7 @@ vdev_disk_io_start(zio_t *zio)
 	case ZIO_TYPE_READ:
 	case ZIO_TYPE_WRITE:
 		zio->io_target_timestamp = zio_handle_io_delay(zio);
 -		error = vdev_classic_physio(zio);
 +		error = vdev_disk_io_rw_fn(zio);
 		rw_exit(&vd->vd_lock);
 		if (error) {
 			zio->io_error = error;
@@ -1102,8 +1104,25 @@ vdev_disk_rele(vdev_t *vd)
 	/* XXX: Implement me as a vnode rele for the device */
 }
 +/*
 + * At first vdev use, set the submission function from the default value if
 + * it hasn't been set already. The arguments are unused, and this always
 + * returns 0.
 + */
 +static int
 +vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
 +{
 +	(void) spa;
 +	(void) nv;
 +	(void) tsd;
 +
 +	if (vdev_disk_io_rw_fn == NULL)
 +		vdev_disk_io_rw_fn = vdev_classic_physio;
 +
 +	return (0);
 +}
 +
 vdev_ops_t vdev_disk_ops = {
 -	.vdev_op_init = NULL,
 +	.vdev_op_init = vdev_disk_init,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_disk_open,
 	.vdev_op_close = vdev_disk_close,
@@ -0,0 +1,671 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Rob Norris <rob.norris@klarasystems.com>
 Date: Tue, 18 Jul 2023 11:11:29 +1000
 Subject: [PATCH] vdev_disk: rewrite BIO filling machinery to avoid split pages
 This commit tackles a number of issues in the way BIOs (`struct bio`)
 are constructed for submission to the Linux block layer.
 The kernel has a hard upper limit on the number of pages/segments that
 can be added to a BIO, as well as a separate limit for each device
 (related to its queue depth and other scheduling characteristics).
 ZFS counts the number of memory pages in the request ABD
 (`abd_nr_pages_off()`), and then uses that as the number of segments to
 put into the BIO, up to the hard upper limit. If it requires more than
 the limit, it will create multiple BIOs.
 Leaving aside the fact that the page count method is wrong (see below), not
 limiting to the device segment max means that the device driver will
 need to split the BIO in half. This alone is not necessarily a
 problem, but it interacts with another issue to cause a much larger
 problem.
 The kernel function to add a segment to a BIO (`bio_add_page()`) takes a
 `struct page` pointer, and offset+len within it. `struct page` can
 represent a run of contiguous memory pages (known as a "compound page").
 It can be of arbitrary length.
 The ZFS functions that count ABD pages and load them into the BIO
 (`abd_nr_pages_off()`, `bio_map()` and `abd_bio_map_off()`) will never
 consider a page to be more than `PAGE_SIZE` (4K), even if the `struct
 page` is for multiple pages. In this case, it will load the same `struct
 page` into the BIO multiple times, with the offset adjusted each time.
 With a sufficiently large ABD, this can easily lead to the BIO being
 entirely filled much earlier than it could have been. This also
 further contributes to the problem caused by the incorrect segment limit
 calculation, as it's much easier to go past the device limit, and so
 require a split.
 Again, this is not a problem on its own.
 The logic for "never submit more than `PAGE_SIZE`" is actually a little
 more subtle. It will actually never submit a buffer that crosses a 4K
 page boundary.
 In practice, this is fine, as most ABDs are scattered, that is a list of
 complete 4K pages, and so are loaded in as such.
 Linear ABDs are typically allocated from slabs, and for small sizes they
 are frequently not aligned to page boundaries. For example, a 12K
 allocation can span four pages, eg:
     -- 4K -- -- 4K -- -- 4K -- -- 4K --
    |        |        |        |        |
          :## ######## ######## ######:    [1K, 4K, 4K, 3K]
 Such an allocation would be loaded into a BIO as you see:
    [1K, 4K, 4K, 3K]
 This tends not to be a problem in practice, because even if the BIO were
 filled and needed to be split, each half would still have either a start
 or end aligned to the logical block size of the device (assuming 4K at
 least).
 ---
 In ideal circumstances, these shortcomings don't cause any particular
 problems. It's when they start to interact with other ZFS features that
 things get interesting.
 Aggregation will create a "gang" ABD, which is simply a list of other
 ABDs. Iterating over a gang ABD is just iterating over each ABD within
 it in turn.
 Because the segments are simply loaded in order, we can end up with
 uneven segments either side of the "gap" between the two ABDs. For
 example, two 12K ABDs might be aggregated and then loaded as:
    [1K, 4K, 4K, 3K, 2K, 4K, 4K, 2K]
 Should a split occur, each individual BIO can end up either having a
 start or end offset that is not aligned to the logical block size, which
 some drivers (eg SCSI) will reject. However, this tends not to happen
 because the default aggregation limit usually keeps the BIO small enough
 to not require more than one split, and most pages are actually full 4K
 pages, so hitting an uneven gap is very rare anyway.
 If the pool is under particular memory pressure, then an IO can be
 broken down into a "gang block", a 512-byte block composed of a header
 and up to three block pointers. Each points to a fragment of the
 original write, or in turn, another gang block, breaking the original
 data up over and over until space can be found in the pool for each of
 them.
 Each gang header is a separate 512-byte memory allocation from a slab,
 that needs to be written down to disk. When the gang header is added to
 the BIO, it's a single 512-byte segment.
 Pulling all this together, consider a large aggregated write of gang
 blocks. This results in a BIO containing lots of 512-byte segments. Given
 our tendency to overfill the BIO, a split is likely, and most possible
 split points will yield a pair of BIOs that are misaligned. Drivers that
 care, like the SCSI driver, will reject them.
 ---
 This commit is a substantial refactor and rewrite of much of `vdev_disk`
 to sort all this out.
 `vdev_bio_max_segs()` now returns the ideal maximum size for the device,
 if available. There's also a tuneable `zfs_vdev_disk_max_segs` to
 override this, to assist with testing.
 We scan the ABD up front to count the number of pages within it, and to
 confirm that if we submitted all those pages to one or more BIOs, it
 could be split at any point without creating a misaligned BIO.  If the
 pages in the BIO are not usable (as in any of the above situations), the
 ABD is linearised, and then checked again. This is the same technique
 used in `vdev_geom` on FreeBSD, adjusted for Linux's variable page size
 and allocator quirks.
 `vbio_t` is a cleanup and enhancement of the old `dio_request_t`. The
 idea is simply that it can hold all the state needed to create, submit
 and return multiple BIOs, including all the refcounts, the ABD copy if
 it was needed, and so on. Apart from what I hope is a clearer interface,
 the major difference is that because we know how many BIOs we'll need up
 front, we don't need the old overflow logic that would grow the BIO
 array, throw away all the old work and restart. We can get it right from
 the start.
 Reviewed-by: Alexander Motin <mav@FreeBSD.org>
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
 Sponsored-by: Klara, Inc.
 Sponsored-by: Wasabi Technology, Inc.
 Closes #15533
 Closes #15588
 (cherry picked from commit 06a196020e6f70d2fedbd4d0d05bbe0c1ac6e4d8)
 ---
 include/os/linux/kernel/linux/mod_compat.h |   1 +
 man/man4/zfs.4                             |  10 +-
 module/os/linux/zfs/vdev_disk.c            | 439 ++++++++++++++++++++-
 3 files changed, 447 insertions(+), 3 deletions(-)
 diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h
 index 8e20a9613..039865b70 100644
 --- a/include/os/linux/kernel/linux/mod_compat.h
 +++ b/include/os/linux/kernel/linux/mod_compat.h
@@ -68,6 +68,7 @@ enum scope_prefix_types {
 	zfs_trim,
 	zfs_txg,
 	zfs_vdev,
 +	zfs_vdev_disk,
 	zfs_vdev_file,
 	zfs_vdev_mirror,
 	zfs_vnops,
 diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
 index 352990e02..b5679f2f0 100644
 --- a/man/man4/zfs.4
 +++ b/man/man4/zfs.4
@@ -2,6 +2,7 @@
 .\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
 .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
 .\" Copyright (c) 2019 Datto Inc.
 +.\" Copyright (c) 2023, 2024 Klara, Inc.
 .\" The contents of this file are subject to the terms of the Common Development
 .\" and Distribution License (the "License").  You may not use this file except
 .\" in compliance with the License. You can obtain a copy of the license at
@@ -15,7 +16,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
 -.Dd July 21, 2023
 +.Dd January 9, 2024
 .Dt ZFS 4
 .Os
 .
@@ -1345,6 +1346,13 @@ _
 	4	Driver	No driver retries on driver errors.
 .TE
 .
 +.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
 +Maximum number of segments to add to a BIO (min 4).
 +If this is higher than the maximum allowed by the device queue or the kernel
 +itself, it will be clamped.
 +Setting it to zero will cause the kernel's ideal size to be used.
 +This parameter only applies on Linux.
 +.
 .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
 Time before expiring
 .Pa .zfs/snapshot .
 diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
 index de4dba72f..0ccb9ad96 100644
 --- a/module/os/linux/zfs/vdev_disk.c
 +++ b/module/os/linux/zfs/vdev_disk.c
@@ -24,6 +24,7 @@
  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
  * LLNL-CODE-403049.
  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 + * Copyright (c) 2023, 2024, Klara Inc.
  */
 #include <sys/zfs_context.h>
@@ -66,6 +67,13 @@ typedef struct vdev_disk {
 	krwlock_t			vd_lock;
 } vdev_disk_t;
 +/*
 + * Maximum number of segments to add to a bio (min 4). If this is higher than
 + * the maximum allowed by the device queue or the kernel itself, it will be
 + * clamped. Setting it to zero will cause the kernel's ideal size to be used.
 + */
 +uint_t zfs_vdev_disk_max_segs = 0;
 +
 /*
  * Unique identifier for the exclusive vdev holder.
  */
@@ -607,10 +615,433 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
 	return (bio);
 }
 +/*
 + * Work out how many segments to put in each BIO for this device: the device
 + * queue's limit, optionally lowered by the zfs_vdev_disk_max_segs tuneable
 + * (never treated as less than 4, so there's room to finish split pages if
 + * they come up), and finally capped at the kernel's per-BIO maximum.
 + */
 +static inline uint_t
 +vdev_bio_max_segs(struct block_device *bdev)
 +{
 +	const uint_t dev_limit = queue_max_segments(bdev_get_queue(bdev));
 +	uint_t nsegs = dev_limit;
 +
 +	/* Zero means "no override"; otherwise apply the floor of 4. */
 +	if (zfs_vdev_disk_max_segs > 0) {
 +		const uint_t tuned = MAX(4, zfs_vdev_disk_max_segs);
 +
 +		/* The tuneable can only lower the device limit */
 +		if (tuned < dev_limit)
 +			nsegs = tuned;
 +	}
 +
 +#ifdef HAVE_BIO_MAX_SEGS
 +	return (bio_max_segs(nsegs));
 +#else
 +	return (MIN(nsegs, BIO_MAX_PAGES));
 +#endif
 +}
 +
 +/*
 + * Maximum number of bytes to load into one BIO for this device, derived
 + * from the queue's max sector count (a sector is 512 bytes here, hence the
 + * shift by 9).
 + *
 + * The shift is done in 64 bits and the result clamped to UINT_MAX. Shifting
 + * the 32-bit sector count directly could wrap for a very large limit and
 + * produce a tiny or zero byte budget, which vbio_add_page() cannot make
 + * progress with.
 + */
 +static inline uint_t
 +vdev_bio_max_bytes(struct block_device *bdev)
 +{
 +	const uint64_t max_bytes =
 +	    (uint64_t)queue_max_sectors(bdev_get_queue(bdev)) << 9;
 +
 +	return ((uint_t)MIN(max_bytes, (uint64_t)UINT_MAX));
 +}
 +
 +
 +/*
 + * Virtual block IO object (VBIO)
 + *
 + * Linux block IO (BIO) objects have a limit on how many data segments (pages)
 + * they can hold. Depending on how they're allocated and structured, a large
 + * ZIO can require more than one BIO to be submitted to the kernel, which then
 + * all have to complete before we can return the completed ZIO back to ZFS.
 + *
 + * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
 + * translate a ZIO down into the kernel block layer and back again.
 + *
 + * Note that these are only used for data ZIOs (read/write). Meta-operations
 + * (flush/trim) don't need multiple BIOs and so can just make the call
 + * directly.
 + *
 + * Lifecycle: vbio_alloc() creates the vbio, vbio_add_page() loads data into
 + * the current BIO (starting a new one when nothing more fits),
 + * vbio_submit() hands every BIO to the kernel, and the final vbio_put()
 + * completes the ZIO and frees the vbio.
 + */
 +typedef struct {
 +	zio_t		*vbio_zio;	/* parent zio */
 +
 +	struct block_device *vbio_bdev;	/* blockdev to submit bios to */
 +
 +	abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */
 +
 +	/* one ref per submitted bio, plus one held during vbio_submit() */
 +	atomic_t	vbio_ref;	/* bio refcount */
 +	int		vbio_error;	/* error from failed bio */
 +
 +	uint_t		vbio_max_segs;	/* max segs per bio */
 +
 +	uint_t		vbio_max_bytes;	/* max bytes per bio */
 +	/* set by vbio_alloc() to ~(logical block size - 1) */
 +	uint_t		vbio_lbs_mask;	/* logical block size mask */
 +
 +	uint64_t	vbio_offset;	/* start offset of next bio */
 +
 +	struct bio	*vbio_bio;	/* pointer to the current bio */
 +	/* linked through bi_next, most recently created bio first */
 +	struct bio	*vbio_bios;	/* list of all bios */
 +} vbio_t;
 +
 +/*
 + * Create a vbio for the given zio and block device. The per-BIO limits
 + * (segments, bytes, logical block size mask) are taken from the device, and
 + * the first BIO will start at the zio's offset. No BIOs exist yet and the
 + * refcount is zero.
 + */
 +static vbio_t *
 +vbio_alloc(zio_t *zio, struct block_device *bdev)
 +{
 +	vbio_t *v = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
 +
 +	/* Who we're working for, and where the data is going */
 +	v->vbio_zio = zio;
 +	v->vbio_bdev = bdev;
 +	v->vbio_offset = zio->io_offset;
 +
 +	/* Nothing submitted yet */
 +	atomic_set(&v->vbio_ref, 0);
 +
 +	/* Per-BIO limits for this device */
 +	v->vbio_max_segs = vdev_bio_max_segs(bdev);
 +	v->vbio_max_bytes = vdev_bio_max_bytes(bdev);
 +	v->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
 +
 +	return (v);
 +}
 +
 +/*
 + * Append `size` bytes starting at `offset` within `page` to the vbio,
 + * spreading them over as many BIOs as needed. A new BIO is allocated
 + * whenever there is no current one; it starts at the device offset where
 + * the previous BIO ended and is pushed onto the front of the vbio's BIO
 + * list.
 + *
 + * Returns 0 on success, or ENOMEM if a BIO could not be allocated. BIOs
 + * created before a failure remain on the vbio's list.
 + */
 +static int
 +vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
 +{
 +	struct bio *bio;
 +	uint_t ssize;
 +
 +	while (size > 0) {
 +		bio = vbio->vbio_bio;
 +		if (bio == NULL) {
 +			/* New BIO, allocate and set up */
 +			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
 +			    vbio->vbio_max_segs);
 +			if (unlikely(bio == NULL))
 +				return (SET_ERROR(ENOMEM));
 +			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
 +
 +			bio->bi_next = vbio->vbio_bios;
 +			vbio->vbio_bios = vbio->vbio_bio = bio;
 +		}
 +
 +		/*
 +		 * Only load as much of the current page data as will fit in
 +		 * the space left in the BIO, respecting lbs alignment. Older
 +		 * kernels will error if we try to overfill the BIO, while
 +		 * newer ones will accept it and split the BIO. This ensures
 +		 * everything works on older kernels, and avoids an additional
 +		 * overhead on the new.
 +		 */
 +		ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
 +		    vbio->vbio_lbs_mask);
 +		if (ssize > 0 &&
 +		    bio_add_page(bio, page, ssize, offset) == ssize) {
 +			/* Accepted, adjust and load any remaining. */
 +			size -= ssize;
 +			offset += ssize;
 +			continue;
 +		}
 +
 +		/*
 +		 * NOTE(review): if a freshly allocated BIO cannot take any
 +		 * data either (ssize == 0, or bio_add_page() refuses), this
 +		 * loop allocates another BIO and tries again without making
 +		 * progress. Confirm that cannot happen with real device
 +		 * limits.
 +		 */
 +
 +		/* No room, set up for a new BIO and loop */
 +		vbio->vbio_offset += BIO_BI_SIZE(bio);
 +
 +		/* Signal new BIO allocation wanted */
 +		vbio->vbio_bio = NULL;
 +	}
 +
 +	return (0);
 +}
 +
 +/* Forward declarations; both are defined further down, after vbio_submit() */
 +BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error);
 +static void vbio_put(vbio_t *vbio);
 +
 +/*
 + * Hand every BIO on the vbio's list to the kernel. The list is detached
 + * from the vbio first, then each BIO gets its completion callback, private
 + * pointer and read/write direction (from the zio type, with the given
 + * flags) before being submitted.
 + */
 +static void
 +vbio_submit(vbio_t *vbio, int flags)
 +{
 +	ASSERT(vbio->vbio_bios);
 +	struct bio *bio = vbio->vbio_bios;
 +	vbio->vbio_bio = vbio->vbio_bios = NULL;
 +
 +	/*
 +	 * We take a reference for each BIO as we submit it, plus one to
 +	 * protect us from BIOs completing before we're done submitting them
 +	 * all, causing vbio_put() to free vbio out from under us and/or the
 +	 * zio to be returned before all its IO has completed.
 +	 */
 +	atomic_set(&vbio->vbio_ref, 1);
 +
 +	/*
 +	 * If we're submitting more than one BIO, inform the block layer so
 +	 * it can batch them if it wants.
 +	 */
 +	struct blk_plug plug;
 +	boolean_t do_plug = (bio->bi_next != NULL);
 +	if (do_plug)
 +		blk_start_plug(&plug);
 +
 +	/* Submit all the BIOs */
 +	while (bio != NULL) {
 +		atomic_inc(&vbio->vbio_ref);
 +
 +		/* Unlink before submitting; bi_next was only our list link */
 +		struct bio *next = bio->bi_next;
 +		bio->bi_next = NULL;
 +
 +		bio->bi_end_io = vdev_disk_io_rw_completion;
 +		bio->bi_private = vbio;
 +		bio_set_op_attrs(bio,
 +		    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
 +		    WRITE : READ, flags);
 +
 +		vdev_submit_bio(bio);
 +
 +		bio = next;
 +	}
 +
 +	/* Finish the batch */
 +	if (do_plug)
 +		blk_finish_plug(&plug);
 +
 +	/* Release the extra reference */
 +	vbio_put(vbio);
 +}
 +
 +/*
 + * If the zio's data was copied into a borrowed linear buffer before being
 + * issued, free the ABD wrapped around that buffer and give the buffer back
 + * to the zio's own ABD. For reads the buffer contents are copied back; for
 + * everything else the buffer is returned without a copy. Does nothing if no
 + * copy was taken.
 + */
 +static void
 +vbio_return_abd(vbio_t *vbio)
 +{
 +	abd_t *wrapper = vbio->vbio_abd;
 +	zio_t *owner = vbio->vbio_zio;
 +
 +	if (wrapper == NULL)
 +		return;
 +
 +	/* Pull the buffer out before the wrapper goes away */
 +	void *linear = abd_to_buf(wrapper);
 +	abd_free(wrapper);
 +	vbio->vbio_abd = NULL;
 +
 +	if (owner->io_type != ZIO_TYPE_READ) {
 +		abd_return_buf(owner->io_abd, linear, owner->io_size);
 +		return;
 +	}
 +
 +	abd_return_buf_copy(owner->io_abd, linear, owner->io_size);
 +}
 +
 +/*
 + * Destroy a vbio that has no outstanding references.
 + *
 + * Any BIOs still on the vbio's list are released, and any borrowed linear
 + * buffer is handed back to the zio's ABD. BIOs are only left on the list
 + * when the vbio is torn down without going through vbio_submit() (which
 + * empties the list before submitting), that is, when filling it failed
 + * part-way. Without the release here those BIOs would be leaked.
 + */
 +static void
 +vbio_free(vbio_t *vbio)
 +{
 +	VERIFY0(atomic_read(&vbio->vbio_ref));
 +
 +	/* Release BIOs that were allocated but never submitted */
 +	struct bio *bio = vbio->vbio_bios;
 +	vbio->vbio_bio = vbio->vbio_bios = NULL;
 +	while (bio != NULL) {
 +		struct bio *next = bio->bi_next;
 +		bio->bi_next = NULL;
 +		bio_put(bio);
 +		bio = next;
 +	}
 +
 +	/* Only now give back the buffer those BIOs were pointing into */
 +	vbio_return_abd(vbio);
 +
 +	kmem_free(vbio, sizeof (vbio_t));
 +}
 +
 +/*
 + * Drop one reference on the vbio. When the last reference goes, the whole
 + * IO is complete: the data buffer is returned, the error is recorded on the
 + * zio, the zio is passed to zio_delay_interrupt(), and the vbio is freed.
 + */
 +static void
 +vbio_put(vbio_t *vbio)
 +{
 +	if (atomic_dec_return(&vbio->vbio_ref) > 0)
 +		return;
 +
 +	/*
 +	 * This was the last reference, so the entire IO is completed. Clean
 +	 * up and submit it for processing.
 +	 */
 +
 +	/*
 +	 * Get any data buf back to the original ABD, if necessary. We do this
 +	 * now so we can get the ZIO into the pipeline as quickly as possible,
 +	 * and then do the remaining cleanup after.
 +	 */
 +	vbio_return_abd(vbio);
 +
 +	zio_t *zio = vbio->vbio_zio;
 +
 +	/*
 +	 * Set the overall error. If multiple BIOs returned an error, only the
 +	 * first will be taken; the others are dropped (see
 +	 * vdev_disk_io_rw_completion()). It's pretty much impossible for
 +	 * multiple IOs to the same device to fail with different errors, so
 +	 * there's no real risk.
 +	 */
 +	zio->io_error = vbio->vbio_error;
 +	if (zio->io_error)
 +		vdev_disk_error(zio);
 +
 +	/* All done, submit for processing */
 +	zio_delay_interrupt(zio);
 +
 +	/* Finish cleanup */
 +	vbio_free(vbio);
 +}
 +
 +/*
 + * BIO completion callback for BIOs submitted by vbio_submit(). Records the
 + * first error seen on the owning vbio (found through bi_private), destroys
 + * the BIO, and drops the reference taken for it.
 + */
 +BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
 +{
 +	vbio_t *vbio = bio->bi_private;
 +
 +	/* Only the first error is kept; later ones are ignored. */
 +	if (vbio->vbio_error == 0) {
 +#ifdef HAVE_1ARG_BIO_END_IO_T
 +		vbio->vbio_error = BIO_END_IO_ERROR(bio);
 +#else
 +		/* The error argument is negated before being stored. */
 +		if (error)
 +			vbio->vbio_error = -(error);
 +		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 +			vbio->vbio_error = EIO;
 +#endif
 +	}
 +
 +	/*
 +	 * Destroy the BIO. This is safe to do; the vbio owns its data and the
 +	 * kernel won't touch it again after the completion function runs.
 +	 */
 +	bio_put(bio);
 +
 +	/* Drop this BIO's reference acquired by vbio_submit() */
 +	vbio_put(vbio);
 +}
 +
 +/*
 + * Iterator callback to count ABD pages and check their size & alignment.
 + *
 + * On Linux, each BIO segment can take a page pointer, and an offset+length of
 + * the data within that page. A page can be arbitrarily large ("compound"
 + * pages) but we still have to ensure the data portion is correctly sized and
 + * aligned to the logical block size, to ensure that if the kernel wants to
 + * split the BIO, the two halves will still be properly aligned.
 + *
 + * Returns 0 to accept the segment and keep iterating, or 1 to reject the ABD.
 + */
 +typedef struct {
 +	uint_t  bmask;	/* logical block size - 1 */
 +	uint_t  npages;	/* segments accepted so far */
 +	uint_t  end;	/* non-zero if the last segment ended mid-block */
 +} vdev_disk_check_pages_t;
 +
 +static int
 +vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
 +{
 +	(void) page;
 +	vdev_disk_check_pages_t *s = priv;
 +
 +	/*
 +	 * If we didn't finish on a block size boundary last time, then there
 +	 * would be a gap if we tried to use this ABD as-is, so abort.
 +	 */
 +	if (s->end != 0)
 +		return (1);
 +
 +	/*
 +	 * Note whether this segment finishes short of a block size boundary,
 +	 * so we can check it above on the next call. The starting offset has
 +	 * to be included: a segment that begins part-way into a block and
 +	 * runs up to a boundary ends aligned even though its length alone is
 +	 * not a multiple of the block size. Looking at the length only would
 +	 * wrongly reject a linear buffer whose first page has a non-zero
 +	 * offset.
 +	 */
 +	s->end = (off + len) & s->bmask;
 +
 +	/* All blocks after the first must start on a block size boundary. */
 +	if (s->npages != 0 && (off & s->bmask) != 0)
 +		return (1);
 +
 +	s->npages++;
 +	return (0);
 +}
 +
 +/*
 + * Check if we can submit the first `size` bytes of this ABD to the kernel
 + * as-is. Returns B_TRUE if every page passed the size & alignment checks
 + * against the device's logical block size, or B_FALSE if the ABD can't be
 + * submitted like this.
 + */
 +static boolean_t
 +vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
 +{
 +	vdev_disk_check_pages_t state;
 +
 +	state.bmask = bdev_logical_block_size(bdev)-1;
 +	state.npages = 0;
 +	state.end = 0;
 +
 +	/* A non-zero result from the iteration means a page was rejected */
 +	return (abd_iterate_page_func(abd, 0, size,
 +	    vdev_disk_check_pages_cb, &state) ? B_FALSE : B_TRUE);
 +}
 +
 +/*
 + * Iterator callback that loads one ABD page segment into the vbio passed in
 + * as priv. Returns whatever vbio_add_page() returns.
 + */
 +static int
 +vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
 +{
 +	return (vbio_add_page((vbio_t *)priv, page, len, off));
 +}
 +
 +/*
 + * Submit a read or write zio to the block device using the vbio machinery.
 + *
 + * Returns 0 once the BIOs have been handed to the kernel, or an error if
 + * the IO could not be issued: EIO for an access beyond the end of the
 + * device, or the error from filling the vbio.
 + */
 +static int
 +vdev_disk_io_rw(zio_t *zio)
 +{
 +	vdev_t *v = zio->io_vd;
 +	vdev_disk_t *vd = v->vdev_tsd;
 +	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
 +	int flags = 0;
 +
 +	/*
 +	 * Accessing outside the block device is never allowed. The device
 +	 * size is read once, through i_size_read(), so the check and the
 +	 * message below always agree on the value.
 +	 */
 +	const uint64_t bdev_size = i_size_read(bdev->bd_inode);
 +	if (zio->io_offset + zio->io_size > bdev_size) {
 +		vdev_dbgmsg(zio->io_vd,
 +		    "Illegal access %llu size %llu, device size %llu",
 +		    (u_longlong_t)zio->io_offset,
 +		    (u_longlong_t)zio->io_size,
 +		    (u_longlong_t)bdev_size);
 +		return (SET_ERROR(EIO));
 +	}
 +
 +	/* Only ask for failfast on first attempts, not retries */
 +	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
 +	    v->vdev_failfast == B_TRUE) {
 +		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
 +		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
 +	}
 +
 +	/*
 +	 * Check alignment of the incoming ABD. If any part of it would require
 +	 * submitting a page that is not aligned to the logical block size,
 +	 * then we take a copy into a linear buffer and submit that instead.
 +	 * This should be impossible on a 512b LBS, and fairly rare on 4K,
 +	 * usually requiring abnormally-small data blocks (eg gang blocks)
 +	 * mixed into the same ABD as larger ones (eg aggregated).
 +	 */
 +	abd_t *abd = zio->io_abd;
 +	if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
 +		/* Reads only need the space; writes need the data as well */
 +		void *buf;
 +		if (zio->io_type == ZIO_TYPE_READ)
 +			buf = abd_borrow_buf(zio->io_abd, zio->io_size);
 +		else
 +			buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
 +
 +		/*
 +		 * Wrap the copy in an abd_t, so we can use the same iterators
 +		 * to count and fill the vbio later.
 +		 */
 +		abd = abd_get_from_buf(buf, zio->io_size);
 +
 +		/*
 +		 * False here would mean the borrowed copy has an invalid
 +		 * alignment too, which would mean we've somehow been passed a
 +		 * linear ABD with an interior page that has a non-zero offset
 +		 * or a size not a multiple of PAGE_SIZE. This is not possible.
 +		 * It would mean either zio_buf_alloc() or its underlying
 +		 * allocators have done something extremely strange, or our
 +		 * math in vdev_disk_check_pages() is wrong. In either case,
 +		 * something is seriously wrong and it's not safe to continue.
 +		 */
 +		VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
 +	}
 +
 +	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
 +	int error = 0;
 +	vbio_t *vbio = vbio_alloc(zio, bdev);
 +	if (abd != zio->io_abd)
 +		vbio->vbio_abd = abd;
 +
 +	/* Fill it with pages */
 +	error = abd_iterate_page_func(abd, 0, zio->io_size,
 +	    vdev_disk_fill_vbio_cb, vbio);
 +	if (error != 0) {
 +		/* Nothing was submitted; tear down and report the failure */
 +		vbio_free(vbio);
 +		return (error);
 +	}
 +
 +	vbio_submit(vbio, flags);
 +	return (0);
 +}
 +
 /* ========== */
 /*
 - * This is the classic, battle-tested BIO submission code.
 + * This is the classic, battle-tested BIO submission code. Until we're totally
 + * sure that the new code is safe and correct in all cases, this will remain
 + * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
 + * load time.
  *
  * These functions have been renamed to vdev_classic_* to make it clear what
  * they belong to, but their implementations are unchanged.
@@ -1116,7 +1547,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
 	(void) tsd;
 	if (vdev_disk_io_rw_fn == NULL)
 -		vdev_disk_io_rw_fn = vdev_classic_physio;
 +		/* XXX make configurable */
 +		vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
 	return (0);
 }
@@ -1215,3 +1647,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
 	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
 +
 +ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
 +	"Maximum number of data segments to add to an IO request (min 4)");
@@ -0,0 +1,104 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Rob Norris <rob.norris@klarasystems.com>
 Date: Tue, 9 Jan 2024 13:28:57 +1100
 Subject: [PATCH] vdev_disk: add module parameter to select BIO submission
 method
 This makes the submission method selectable at module load time via the
 `zfs_vdev_disk_classic` parameter, allowing this change to be backported
 to 2.2 safely, and disabled in favour of the "classic" submission method
 if new problems come up.
 Reviewed-by: Alexander Motin <mav@FreeBSD.org>
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
 Sponsored-by: Klara, Inc.
 Sponsored-by: Wasabi Technology, Inc.
 Closes #15533
 Closes #15588
 (cherry picked from commit df2169d141aadc0c2cc728c5c5261d6f5c2a27f7)
 ---
 man/man4/zfs.4                  | 16 ++++++++++++++++
 module/os/linux/zfs/vdev_disk.c | 31 +++++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 2 deletions(-)
 diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
 index b5679f2f0..6a628e7f3 100644
 --- a/man/man4/zfs.4
 +++ b/man/man4/zfs.4
@@ -1352,6 +1352,22 @@ If this is higher than the maximum allowed by the device queue or the kernel
 itself, it will be clamped.
 Setting it to zero will cause the kernel's ideal size to be used.
 This parameter only applies on Linux.
 +This parameter is ignored if
 +.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
 +.
 +.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 +If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
 +and earlier.
 +This "classic" method has known issues with highly fragmented IO requests and
 +is slower on many workloads, but it has been in use for many years and is known
 +to be very stable.
 +If you set this parameter, please also open a bug report why you did so,
 +including the workload involved and any error messages.
 +.Pp
 +This parameter and the classic submission method will be removed once we have
 +total confidence in the new method.
 +.Pp
 +This parameter only applies on Linux, and can only be set at module load time.
 .
 .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
 Time before expiring
 diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
 index 0ccb9ad96..a9110623a 100644
 --- a/module/os/linux/zfs/vdev_disk.c
 +++ b/module/os/linux/zfs/vdev_disk.c
@@ -1535,6 +1535,29 @@ vdev_disk_rele(vdev_t *vd)
 	/* XXX: Implement me as a vnode rele for the device */
 }
 +/*
 + * BIO submission method. See comment above about vdev_classic.
 + * Set zfs_vdev_disk_classic=0 for new, =1 for classic
 + */
 +static uint_t zfs_vdev_disk_classic = 0;	/* default new */
 +
 +/* Set submission function from module parameter */
 +static int
 +vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
 +{
 +	int err = param_set_uint(buf, kp);
 +	if (err < 0)
 +		return (SET_ERROR(err));
 +
 +	vdev_disk_io_rw_fn =
 +	    zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
 +
 +	printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
 +	    zfs_vdev_disk_classic ? "classic" : "new");
 +
 +	return (0);
 +}
 +
 /*
  * At first use vdev use, set the submission function from the default value if
  * it hasn't been set already.
@@ -1547,8 +1570,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
 	(void) tsd;
 	if (vdev_disk_io_rw_fn == NULL)
 -		/* XXX make configurable */
 -		vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
 +		vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
 +		    vdev_classic_physio : vdev_disk_io_rw;
 	return (0);
 }
@@ -1650,3 +1673,7 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
 	"Maximum number of data segments to add to an IO request (min 4)");
 +
 +ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
 +    vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
 +	"Use classic BIO submission method");
@@ -0,0 +1,363 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Rob Norris <rob.norris@klarasystems.com>
 Date: Wed, 21 Feb 2024 11:07:21 +1100
 Subject: [PATCH] vdev_disk: use bio_chain() to submit multiple BIOs
 Simplifies our code a lot, so we don't have to wait for each and
 reassemble them.
 Reviewed-by: Alexander Motin <mav@FreeBSD.org>
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
 Sponsored-by: Klara, Inc.
 Sponsored-by: Wasabi Technology, Inc.
 Closes #15533
 Closes #15588
 (cherry picked from commit 72fd834c47558cb10d847948d1a4615e894c77c3)
 ---
 module/os/linux/zfs/vdev_disk.c | 231 +++++++++++---------------------
 1 file changed, 80 insertions(+), 151 deletions(-)
 diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
 index a9110623a..36468fc21 100644
 --- a/module/os/linux/zfs/vdev_disk.c
 +++ b/module/os/linux/zfs/vdev_disk.c
@@ -454,10 +454,9 @@ vdev_disk_close(vdev_t *v)
 	if (v->vdev_reopening || vd == NULL)
 		return;
 -	if (vd->vd_bdh != NULL) {
 +	if (vd->vd_bdh != NULL)
 		vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
 		    zfs_vdev_holder);
 -	}
 	rw_destroy(&vd->vd_lock);
 	kmem_free(vd, sizeof (vdev_disk_t));
@@ -663,9 +662,6 @@ typedef struct {
 	abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */
 -	atomic_t	vbio_ref;	/* bio refcount */
 -	int		vbio_error;	/* error from failed bio */
 -
 	uint_t		vbio_max_segs;	/* max segs per bio */
 	uint_t		vbio_max_bytes;	/* max bytes per bio */
@@ -674,43 +670,52 @@ typedef struct {
 	uint64_t	vbio_offset;	/* start offset of next bio */
 	struct bio	*vbio_bio;	/* pointer to the current bio */
 -	struct bio	*vbio_bios;	/* list of all bios */
 +	int		vbio_flags;	/* bio flags */
 } vbio_t;
 static vbio_t *
 -vbio_alloc(zio_t *zio, struct block_device *bdev)
 +vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
 {
 	vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
 	vbio->vbio_zio = zio;
 	vbio->vbio_bdev = bdev;
 -	atomic_set(&vbio->vbio_ref, 0);
 +	vbio->vbio_abd = NULL;
 	vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
 	vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
 	vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
 	vbio->vbio_offset = zio->io_offset;
 +	vbio->vbio_bio = NULL;
 +	vbio->vbio_flags = flags;
 	return (vbio);
 }
 +BIO_END_IO_PROTO(vbio_completion, bio, error);
 +
 static int
 vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
 {
 -	struct bio *bio;
 +	struct bio *bio = vbio->vbio_bio;
 	uint_t ssize;
 	while (size > 0) {
 -		bio = vbio->vbio_bio;
 		if (bio == NULL) {
 			/* New BIO, allocate and set up */
 			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
 			    vbio->vbio_max_segs);
 -			if (unlikely(bio == NULL))
 -				return (SET_ERROR(ENOMEM));
 +			VERIFY(bio);
 +
 			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
 +			bio_set_op_attrs(bio,
 +			    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
 +			    WRITE : READ, vbio->vbio_flags);
 -			bio->bi_next = vbio->vbio_bios;
 -			vbio->vbio_bios = vbio->vbio_bio = bio;
 +			if (vbio->vbio_bio) {
 +				bio_chain(vbio->vbio_bio, bio);
 +				vdev_submit_bio(vbio->vbio_bio);
 +			}
 +			vbio->vbio_bio = bio;
 		}
 		/*
@@ -735,157 +740,97 @@ vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
 		vbio->vbio_offset += BIO_BI_SIZE(bio);
 		/* Signal new BIO allocation wanted */
 -		vbio->vbio_bio = NULL;
 +		bio = NULL;
 	}
 	return (0);
 }
 -BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error);
 -static void vbio_put(vbio_t *vbio);
 +/* Iterator callback to submit ABD pages to the vbio. */
 +static int
 +vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
 +{
 +	vbio_t *vbio = priv;
 +	return (vbio_add_page(vbio, page, len, off));
 +}
 +/* Create some BIOs, fill them with data and submit them */
 static void
 -vbio_submit(vbio_t *vbio, int flags)
 +vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 {
 -	ASSERT(vbio->vbio_bios);
 -	struct bio *bio = vbio->vbio_bios;
 -	vbio->vbio_bio = vbio->vbio_bios = NULL;
 -
 -	/*
 -	 * We take a reference for each BIO as we submit it, plus one to
 -	 * protect us from BIOs completing before we're done submitting them
 -	 * all, causing vbio_put() to free vbio out from under us and/or the
 -	 * zio to be returned before all its IO has completed.
 -	 */
 -	atomic_set(&vbio->vbio_ref, 1);
 +	ASSERT(vbio->vbio_bdev);
 	/*
 -	 * If we're submitting more than one BIO, inform the block layer so
 -	 * it can batch them if it wants.
 +	 * We plug so we can submit the BIOs as we go and only unplug them when
 +	 * they are fully created and submitted. This is important; if we don't
 +	 * plug, then the kernel may start executing earlier BIOs while we're
 +	 * still creating and executing later ones, and if the device goes
 +	 * away while that's happening, older kernels can get confused and
 +	 * trample memory.
 	 */
 	struct blk_plug plug;
 -	boolean_t do_plug = (bio->bi_next != NULL);
 -	if (do_plug)
 -		blk_start_plug(&plug);
 +	blk_start_plug(&plug);
 -	/* Submit all the BIOs */
 -	while (bio != NULL) {
 -		atomic_inc(&vbio->vbio_ref);
 +	(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
 +	ASSERT(vbio->vbio_bio);
 -		struct bio *next = bio->bi_next;
 -		bio->bi_next = NULL;
 +	vbio->vbio_bio->bi_end_io = vbio_completion;
 +	vbio->vbio_bio->bi_private = vbio;
 -		bio->bi_end_io = vdev_disk_io_rw_completion;
 -		bio->bi_private = vbio;
 -		bio_set_op_attrs(bio,
 -		    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
 -		    WRITE : READ, flags);
 +	vdev_submit_bio(vbio->vbio_bio);
 -		vdev_submit_bio(bio);
 -
 -		bio = next;
 -	}
 -
 -	/* Finish the batch */
 -	if (do_plug)
 -		blk_finish_plug(&plug);
 +	blk_finish_plug(&plug);
 -	/* Release the extra reference */
 -	vbio_put(vbio);
 +	vbio->vbio_bio = NULL;
 +	vbio->vbio_bdev = NULL;
 }
 -static void
 -vbio_return_abd(vbio_t *vbio)
 +/* IO completion callback */
 +BIO_END_IO_PROTO(vbio_completion, bio, error)
 {
 +	vbio_t *vbio = bio->bi_private;
 	zio_t *zio = vbio->vbio_zio;
 -	if (vbio->vbio_abd == NULL)
 -		return;
 -
 -	/*
 -	 * If we copied the ABD before issuing it, clean up and return the copy
 -	 * to the ADB, with changes if appropriate.
 -	 */
 -	void *buf = abd_to_buf(vbio->vbio_abd);
 -	abd_free(vbio->vbio_abd);
 -	vbio->vbio_abd = NULL;
 -
 -	if (zio->io_type == ZIO_TYPE_READ)
 -		abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
 -	else
 -		abd_return_buf(zio->io_abd, buf, zio->io_size);
 -}
 -static void
 -vbio_free(vbio_t *vbio)
 -{
 -	VERIFY0(atomic_read(&vbio->vbio_ref));
 -
 -	vbio_return_abd(vbio);
 +	ASSERT(zio);
 -	kmem_free(vbio, sizeof (vbio_t));
 -}
 +	/* Capture and log any errors */
 +#ifdef HAVE_1ARG_BIO_END_IO_T
 +	zio->io_error = BIO_END_IO_ERROR(bio);
 +#else
 +	zio->io_error = 0;
 +	if (error)
 +		zio->io_error = -(error);
 +	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 +		zio->io_error = EIO;
 +#endif
 +	ASSERT3U(zio->io_error, >=, 0);
 -static void
 -vbio_put(vbio_t *vbio)
 -{
 -	if (atomic_dec_return(&vbio->vbio_ref) > 0)
 -		return;
 +	if (zio->io_error)
 +		vdev_disk_error(zio);
 -	/*
 -	 * This was the last reference, so the entire IO is completed. Clean
 -	 * up and submit it for processing.
 -	 */
 +	/* Return the BIO to the kernel */
 +	bio_put(bio);
 	/*
 -	 * Get any data buf back to the original ABD, if necessary. We do this
 -	 * now so we can get the ZIO into the pipeline as quickly as possible,
 -	 * and then do the remaining cleanup after.
 +	 * If we copied the ABD before issuing it, clean up and return the copy
 +	 * to the ABD, with changes if appropriate.
 	 */
 -	vbio_return_abd(vbio);
 +	if (vbio->vbio_abd != NULL) {
 +		void *buf = abd_to_buf(vbio->vbio_abd);
 +		abd_free(vbio->vbio_abd);
 +		vbio->vbio_abd = NULL;
 -	zio_t *zio = vbio->vbio_zio;
 +		if (zio->io_type == ZIO_TYPE_READ)
 +			abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
 +		else
 +			abd_return_buf(zio->io_abd, buf, zio->io_size);
 +	}
 -	/*
 -	 * Set the overall error. If multiple BIOs returned an error, only the
 -	 * first will be taken; the others are dropped (see
 -	 * vdev_disk_io_rw_completion()). Its pretty much impossible for
 -	 * multiple IOs to the same device to fail with different errors, so
 -	 * there's no real risk.
 -	 */
 -	zio->io_error = vbio->vbio_error;
 -	if (zio->io_error)
 -		vdev_disk_error(zio);
 +	/* Final cleanup */
 +	kmem_free(vbio, sizeof (vbio_t));
 	/* All done, submit for processing */
 	zio_delay_interrupt(zio);
 -
 -	/* Finish cleanup */
 -	vbio_free(vbio);
 -}
 -
 -BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
 -{
 -	vbio_t *vbio = bio->bi_private;
 -
 -	if (vbio->vbio_error == 0) {
 -#ifdef HAVE_1ARG_BIO_END_IO_T
 -		vbio->vbio_error = BIO_END_IO_ERROR(bio);
 -#else
 -		if (error)
 -			vbio->vbio_error = -(error);
 -		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 -			vbio->vbio_error = EIO;
 -#endif
 -	}
 -
 -	/*
 -	 * Destroy the BIO. This is safe to do; the vbio owns its data and the
 -	 * kernel won't touch it again after the completion function runs.
 -	 */
 -	bio_put(bio);
 -
 -	/* Drop this BIOs reference acquired by vbio_submit() */
 -	vbio_put(vbio);
 }
 /*
@@ -948,14 +893,6 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
 	return (B_TRUE);
 }
 -/* Iterator callback to submit ABD pages to the vbio. */
 -static int
 -vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
 -{
 -	vbio_t *vbio = priv;
 -	return (vbio_add_page(vbio, page, len, off));
 -}
 -
 static int
 vdev_disk_io_rw(zio_t *zio)
 {
@@ -1018,20 +955,12 @@ vdev_disk_io_rw(zio_t *zio)
 	}
 	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
 -	int error = 0;
 -	vbio_t *vbio = vbio_alloc(zio, bdev);
 +	vbio_t *vbio = vbio_alloc(zio, bdev, flags);
 	if (abd != zio->io_abd)
 		vbio->vbio_abd = abd;
 -	/* Fill it with pages */
 -	error = abd_iterate_page_func(abd, 0, zio->io_size,
 -	    vdev_disk_fill_vbio_cb, vbio);
 -	if (error != 0) {
 -		vbio_free(vbio);
 -		return (error);
 -	}
 -
 -	vbio_submit(vbio, flags);
 +	/* Fill it with data pages and submit it to the kernel */
 +	vbio_submit(vbio, abd, zio->io_size);
 	return (0);
 }
@@ -0,0 +1,96 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Rob Norris <rob.norris@klarasystems.com>
 Date: Thu, 14 Mar 2024 10:57:30 +1100
 Subject: [PATCH] abd_iter_page: don't use compound heads on Linux <4.5
 Before 4.5 (specifically, torvalds/linux@ddc58f2), head and tail pages
 in a compound page were refcounted separately. This means that using the
 head page without taking a reference to it could see it cleaned up later
 before we're finished with it. Specifically, bio_add_page() would take a
 reference, and drop its reference after the bio completion callback
 returns.
 If the zio is executed immediately from the completion callback, this is
 usually ok, as any data is referenced through the tail page referenced
 by the ABD, and so becomes "live" that way. If there's a delay in zio
 execution (high load, error injection), then the head page can be freed,
 along with any dirty flags or other indicators that the underlying
 memory is used. Later, when the zio completes and that memory is
 accessed, it's either unmapped and an unhandled fault takes down the
 entire system, or it is mapped and we end up messing around in someone
 else's memory. Both of these are very bad.
 The solution on these older kernels is to take a reference to the head
 page when we use it, and release it when we're done. There's not really
 a sensible way under our current structure to do this; the "best" would
 be to keep a list of head page references in the ABD, and release them
 when the ABD is freed.
 Since this additional overhead is totally unnecessary on 4.5+, where
 head and tail pages share refcounts, I've opted to simply not use the
 compound head in ABD page iteration on the older kernels. This is
 theoretically less efficient (though cleaning up head page references
 would add overhead), but it's safe, and we still get the other benefits
 of not mapping pages before adding them to a bio and not mis-splitting
 pages.
 There doesn't appear to be an obvious symbol name or config option we
 can match on to discover this behaviour in configure (and the mm/page
 APIs have changed a lot since then anyway), so I've gone with a simple
 version check.
 Reviewed-by: Alexander Motin <mav@FreeBSD.org>
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
 Sponsored-by: Klara, Inc.
 Sponsored-by: Wasabi Technology, Inc.
 Closes #15533
 Closes #15588
 (cherry picked from commit c6be6ce1755a3d9a3cbe70256cd8958ef83d8542)
 ---
 module/os/linux/zfs/abd_os.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
 index 3fe01c0b7..d3255dcbc 100644
 --- a/module/os/linux/zfs/abd_os.c
 +++ b/module/os/linux/zfs/abd_os.c
@@ -62,6 +62,7 @@
 #include <linux/kmap_compat.h>
 #include <linux/mm_compat.h>
 #include <linux/scatterlist.h>
 +#include <linux/version.h>
 #endif
 #ifdef _KERNEL
@@ -1061,6 +1062,7 @@ abd_iter_page(struct abd_iter *aiter)
 	}
 	ASSERT(page);
 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
 	if (PageTail(page)) {
 		/*
 		 * This page is part of a "compound page", which is a group of
@@ -1082,11 +1084,23 @@ abd_iter_page(struct abd_iter *aiter)
 		 * To do this, we need to adjust the offset to be counted from
 		 * the head page. struct page for compound pages are stored
 		 * contiguously, so we can just adjust by a simple offset.
 +		 *
 +		 * Before kernel 4.5, compound page heads were refcounted
 +		 * separately, such that moving back to the head page would
 +		 * require us to take a reference to it and release it once
 +		 * we're completely finished with it. In practice, that means
 +		 * when our caller is done with the ABD, which we have no
 +		 * insight into from here. Rather than contort this API to
 +		 * track head page references on such ancient kernels, we just
 +		 * compile this block out and use the tail pages directly. This
 +		 * is slightly less efficient, but makes everything far
 +		 * simpler.
 		 */
 		struct page *head = compound_head(page);
 		doff += ((page - head) * PAGESIZE);
 		page = head;
 	}
 +#endif
 	/* final page and position within it */
 	aiter->iter_page = page;
@@ -0,0 +1,90 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Rob Norris <rob.norris@klarasystems.com>
 Date: Wed, 27 Mar 2024 13:11:12 +1100
 Subject: [PATCH] vdev_disk: default to classic submission for 2.2.x
 We don't want to change to brand-new code in the middle of a stable
 series, but we want it available to test for people running into page
 splitting issues.
 This commit makes zfs_vdev_disk_classic=1 the default, and updates the
 documentation to better explain what's going on.
 Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
 Sponsored-by: Klara, Inc.
 Sponsored-by: Wasabi Technology, Inc.
 ---
 man/man4/zfs.4                  | 31 ++++++++++++++++++++++---------
 module/os/linux/zfs/vdev_disk.c |  8 +++++---
 2 files changed, 27 insertions(+), 12 deletions(-)
 diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
 index 6a628e7f3..a98ec519a 100644
 --- a/man/man4/zfs.4
 +++ b/man/man4/zfs.4
@@ -1355,17 +1355,30 @@ This parameter only applies on Linux.
 This parameter is ignored if
 .Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
 .
 -.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 -If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
 -and earlier.
 -This "classic" method has known issues with highly fragmented IO requests and
 -is slower on many workloads, but it has been in use for many years and is known
 -to be very stable.
 -If you set this parameter, please also open a bug report why you did so,
 +.It Sy zfs_vdev_disk_classic Ns = Ns 0 Ns | Ns Sy 1 Pq uint
 +Controls the method used to submit IO to the Linux block layer
 +(default
 +.Sy 1 "classic" Ns
 +)
 +.Pp
 +If set to 1, the "classic" method is used.
 +This is the method that has been in use since the earliest versions of
 +ZFS-on-Linux.
 +It has known issues with highly fragmented IO requests and is less efficient on
 +many workloads, but it is well known and well understood.
 +.Pp
 +If set to 0, the "new" method is used.
 +This method is available since 2.2.4 and should resolve all known issues and be
 +far more efficient, but has not had as much testing.
 +In the 2.2.x series, this parameter defaults to 1, to use the "classic" method.
 +.Pp
 +It is not recommended that you change it except on advice from the OpenZFS
 +developers.
 +If you do change it, please also open a bug report describing why you did so,
 including the workload involved and any error messages.
 .Pp
 -This parameter and the classic submission method will be removed once we have
 -total confidence in the new method.
 +This parameter and the "classic" submission method will be removed in a future
 +release of OpenZFS once we have total confidence in the new method.
 .Pp
 This parameter only applies on Linux, and can only be set at module load time.
 .
 diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
 index 36468fc21..e1c19a085 100644
 --- a/module/os/linux/zfs/vdev_disk.c
 +++ b/module/os/linux/zfs/vdev_disk.c
@@ -969,8 +969,10 @@ vdev_disk_io_rw(zio_t *zio)
 /*
  * This is the classic, battle-tested BIO submission code. Until we're totally
  * sure that the new code is safe and correct in all cases, this will remain
 - * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
 - * load time.
 + * available.
 + *
 + * It is enabled by setting zfs_vdev_disk_classic=1 at module load time. It is
 + * enabled (=1) by default since 2.2.4, and disabled by default (=0) on master.
  *
  * These functions have been renamed to vdev_classic_* to make it clear what
  * they belong to, but their implementations are unchanged.
@@ -1468,7 +1470,7 @@ vdev_disk_rele(vdev_t *vd)
  * BIO submission method. See comment above about vdev_classic.
  * Set zfs_vdev_disk_classic=0 for new, =1 for classic
  */
 -static uint_t zfs_vdev_disk_classic = 0;	/* default new */
 +static uint_t zfs_vdev_disk_classic = 1;	/* default classic */
 /* Set submission function from module parameter */
 static int
@@ -0,0 +1,104 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Robert Evans <rrevans@gmail.com>
 Date: Mon, 25 Mar 2024 17:56:49 -0400
 Subject: [PATCH] Fix corruption caused by mmap flushing problems
 1) Make mmap flushes synchronous. Linux may skip flushing dirty pages
   already in writeback unless data-integrity sync is requested.
 2) Change zfs_putpage to use TXG_WAIT. Otherwise dirty pages may be
   skipped due to DMU pushing back on TX assign.
 3) Add missing mmap flush when doing block cloning.
 4) While here, pass errors from putpage to writepage/writepages.
 This change fixes corruption edge cases, but unfortunately adds
 synchronous ZIL flushes for dirty mmap pages to llseek and bclone
 operations. It may be possible to avoid these sync writes later
 but would need more tricky refactoring of the writeback code.
 Reviewed-by: Alexander Motin <mav@FreeBSD.org>
 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Robert Evans <evansr@google.com>
 Closes #15933
 Closes #16019
 ---
 module/os/linux/zfs/zfs_vnops_os.c | 5 +----
 module/os/linux/zfs/zpl_file.c     | 8 ++++----
 module/zfs/zfs_vnops.c             | 6 +++++-
 3 files changed, 10 insertions(+), 9 deletions(-)
 diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
 index c06a75662..7c473bc7e 100644
 --- a/module/os/linux/zfs/zfs_vnops_os.c
 +++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -3792,11 +3792,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 -	err = dmu_tx_assign(tx, TXG_NOWAIT);
 +	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 -		if (err == ERESTART)
 -			dmu_tx_wait(tx);
 -
 		dmu_tx_abort(tx);
 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
 		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
 diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
 index 3caa0fc6c..9dec52215 100644
 --- a/module/os/linux/zfs/zpl_file.c
 +++ b/module/os/linux/zfs/zpl_file.c
@@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 {
 	boolean_t *for_sync = data;
 	fstrans_cookie_t cookie;
 +	int ret;
 	ASSERT(PageLocked(pp));
 	ASSERT(!PageWriteback(pp));
 	cookie = spl_fstrans_mark();
 -	(void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
 +	ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
 	spl_fstrans_unmark(cookie);
 -	return (0);
 +	return (ret);
 }
 #ifdef HAVE_WRITEPAGE_T_FOLIO
 static int
 zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
 {
 -	(void) zpl_putpage(&pp->page, wbc, data);
 -	return (0);
 +	return (zpl_putpage(&pp->page, wbc, data));
 }
 #endif
 diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
 index 2b37834d5..7020f88ec 100644
 --- a/module/zfs/zfs_vnops.c
 +++ b/module/zfs/zfs_vnops.c
@@ -130,7 +130,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
 	/* Flush any mmap()'d data to disk */
 	if (zn_has_cached_data(zp, 0, file_sz - 1))
 -		zn_flush_cached_data(zp, B_FALSE);
 +		zn_flush_cached_data(zp, B_TRUE);
 	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
 	error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
@@ -1193,6 +1193,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
 		}
 	}
 +	/* Flush any mmap()'d data to disk */
 +	if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
 +		zn_flush_cached_data(inzp, B_TRUE);
 +
 	/*
 	 * Maintain predictable lock order.
 	 */
@@ -0,0 +1,57 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Rob Norris <rob.norris@klarasystems.com>
 Date: Tue, 2 Apr 2024 15:14:54 +1100
 Subject: [PATCH] vdev_disk: don't touch vbio after its handed off to the
 kernel
 After IO is unplugged, it may complete immediately and vbio_completion
 be called on interrupt context. That may interrupt or deschedule our
 task. If it's the last bio, the vbio will be freed. Then, we get
 rescheduled, and try to write to freed memory through vbio->.
 This patch just removes the cleanup, and the corresponding assert.
 These were leftovers from a previous iteration of vbio_submit() and were
 always "belt and suspenders" ops anyway, never strictly required.
 Reported-by: Rich Ercolani <rincebrain@gmail.com>
 Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
 Sponsored-by: Klara, Inc.
 Sponsored-by: Wasabi Technology, Inc.
 (cherry picked from commit 34f662ad22206af6852020fd923ceccd836a855f)
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 ---
 module/os/linux/zfs/vdev_disk.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)
 diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
 index e1c19a085..62c7aa14f 100644
 --- a/module/os/linux/zfs/vdev_disk.c
 +++ b/module/os/linux/zfs/vdev_disk.c
@@ -758,8 +758,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
 static void
 vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 {
 -	ASSERT(vbio->vbio_bdev);
 -
 	/*
 	 * We plug so we can submit the BIOs as we go and only unplug them when
 	 * they are fully created and submitted. This is important; if we don't
@@ -777,12 +775,15 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 	vbio->vbio_bio->bi_end_io = vbio_completion;
 	vbio->vbio_bio->bi_private = vbio;
 +	/*
 +	 * Once submitted, vbio_bio now owns vbio (through bi_private) and we
 +	 * can't touch it again. The bio may complete and vbio_completion() be
 +	 * called and free the vbio before this task is run again, so we must
 +	 * consider it invalid from this point.
 +	 */
 	vdev_submit_bio(vbio->vbio_bio);
 	blk_finish_plug(&plug);
 -
 -	vbio->vbio_bio = NULL;
 -	vbio->vbio_bdev = NULL;
 }
 /* IO completion callback */
@@ -6,5 +6,20 @@
 0006-dont-symlink-zed-scripts.patch
 0007-Add-systemd-unit-for-importing-specific-pools.patch
 0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
-0009-arcstat-Fix-integer-division-with-python3.patch
+0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
-0010-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
+0010-Fix-nfs_truncate_shares-without-etc-exports.d.patch
 0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch
 0012-udev-correctly-handle-partition-16-and-later.patch
 0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch
 0014-linux-5.4-compat-page_size.patch
 0015-abd-add-page-iterator.patch
 0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch
 0017-vdev_disk-reorganise-vdev_disk_io_start.patch
 0018-vdev_disk-make-read-write-IO-function-configurable.patch
 0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch
 0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch
 0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch
 0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch
 0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch
 0024-Fix-corruption-caused-by-mmap-flushing-problems.patch
 0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch
@@ -10,7 +10,7 @@ SPHINX_BUILD = $(shell dpkg -L python3-sphinx | grep -m 1 "/sphinx-build$$")
 export DEB_BUILD_MAINT_OPTIONS = hardening=+all
 %:
-	dh $@ --with autoreconf,python3,sphinxdoc --parallel
+	dh $@ --with autoreconf,python3,sphinxdoc
 adapt_meta_file:
 	@# Embed the downstream version in the module.
@@ -60,10 +60,6 @@ override_dh_auto_install:
 	@# Install the utilities.
 	$(MAKE) install DESTDIR='$(CURDIR)/debian/tmp'
 	# Use upstream's bash completion
 	install -D -t '$(CURDIR)/debian/tmp/usr/share/bash-completion/completions/' \
 		'$(CURDIR)/contrib/bash_completion.d/zfs'
 	# Move from bin_dir to /usr/sbin
 	# Remove suffix (.py) as per policy 10.4 - Scripts
 	# https://www.debian.org/doc/debian-policy/ch-files.html#s-scripts
@@ -83,7 +79,6 @@ override_dh_auto_install:
 	chmod a-x '$(CURDIR)/debian/tmp/etc/zfs/zfs-functions'
 	chmod a-x '$(CURDIR)/debian/tmp/etc/default/zfs'
 	chmod a-x '$(CURDIR)/debian/tmp/usr/share/bash-completion/completions/zfs'
 override_dh_python3:
 	dh_python3 -p python3-pyzfs
@@ -91,9 +86,6 @@ override_dh_python3:
 override_dh_makeshlibs:
 	dh_makeshlibs -a -V
 override_dh_strip:
 	dh_strip --dbgsym-migration='zfs-dbg (<< 2.0.4~)'
 override_dh_auto_clean:
 	find . -name .gitignore -delete
 	rm -rf zfs-$(DEB_VERSION_UPSTREAM)
@@ -101,7 +93,7 @@ override_dh_auto_clean:
 	@if test -e META.orig; then mv META.orig META; fi
 override_dh_install:
-	find debian/tmp/lib -name *.la -delete
+	find debian/tmp/lib -name '*.la' -delete
 	dh_install
 override_dh_missing:
@@ -60,7 +60,7 @@ do
 	case "${ret}" in
 		disable);;
 		enable)	trim_if_not_already_trimming "${pool}" ;;
-		-|auto)	pool_is_nvme_only "${pool}" && trim_if_not_already_trimming "${pool}" ;;
+		-|auto)	if pool_is_nvme_only "${pool}"; then trim_if_not_already_trimming "${pool}"; fi ;;
 		*)	cat > /dev/stderr <<EOF
 $0: [WARNING] illegal value "${ret}" for property "${PROPERTY_NAME}" of ZFS dataset "${pool}".
 $0: Acceptable choices for this property are: auto, enable, disable. The default is auto.
@@ -3,11 +3,8 @@ command-in-sbin-has-manpage-in-incorrect-section
 arch-dep-package-has-big-usr-share
 manpage-without-executable
 national-encoding *usr/share/zfs/zfs-tests/tests/functional/channel_program/lua_core/tst.lib_table.lua*
-executable-not-elf-or-script *usr/share/zfs/zfs-tests/tests/functional/cli_root/zfs_jail/jail.conf*
+executable-not-elf-or-script *usr/share/zfs/zfs-tests/tests/functional/cli_root/*
 package-contains-documentation-outside-usr-share-doc *usr/share/zfs/zfs-tests/*
-script-not-executable *usr/share/zfs/common.sh*
+script-not-executable [usr/share/zfs/common.sh]
-script-not-executable *usr/share/zfs/zfs-tests/include/default.cfg*
+script-not-executable [usr/share/zfs/zfs-tests/include/default.cfg]
-script-not-executable *usr/share/zfs/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib*
+script-not-executable [usr/share/zfs/zfs-tests/tests/functional/*]
 script-not-executable *usr/share/zfs/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib*
 script-not-executable *usr/share/zfs/zfs-tests/tests/functional/l2arc/l2arc.cfg*
 script-not-executable *usr/share/zfs/zfs-tests/tests/functional/redacted_send/redacted.kshlib*
@@ -14,6 +14,9 @@ lib/systemd/system/zfs-scrub-monthly@.timer
 lib/systemd/system/zfs-scrub-weekly@.timer
 lib/systemd/system/zfs-scrub@.service
 lib/systemd/system/zfs-share.service
 lib/systemd/system/zfs-trim-monthly@.timer
 lib/systemd/system/zfs-trim-weekly@.timer
 lib/systemd/system/zfs-trim@.service
 lib/systemd/system/zfs-volume-wait.service
 lib/systemd/system/zfs-volumes.target
 lib/systemd/system/zfs.target
@@ -30,7 +33,9 @@ sbin/zpool
 sbin/zstream
 sbin/zstreamdump
 usr/bin/zvol_wait
 usr/bin/zilstat
 usr/lib/modules-load.d/ lib/
 usr/lib/zfs-linux/zfs_prepare_disk
 usr/lib/zfs-linux/zpool.d/
 usr/lib/zfs-linux/zpool_influxdb
 usr/sbin/arc_summary
@@ -43,6 +48,7 @@ usr/share/man/man1/zvol_wait.1
 usr/share/man/man4/zfs.4
 usr/share/man/man4/spl.4
 usr/share/man/man5/
 usr/share/man/man7/vdevprops.7
 usr/share/man/man7/zfsconcepts.7
 usr/share/man/man7/zfsprops.7
 usr/share/man/man7/zpoolconcepts.7
@@ -63,11 +69,11 @@ usr/share/man/man8/zfs-get.8
 usr/share/man/man8/zfs-groupspace.8
 usr/share/man/man8/zfs-hold.8
 usr/share/man/man8/zfs-inherit.8
 usr/share/man/man8/zfs-jail.8
 usr/share/man/man8/zfs-list.8
 usr/share/man/man8/zfs-load-key.8
 usr/share/man/man8/zfs-mount-generator.8
 usr/share/man/man8/zfs-mount.8
 usr/share/man/man8/zfs_prepare_disk.8
 usr/share/man/man8/zfs-program.8
 usr/share/man/man8/zfs-project.8
 usr/share/man/man8/zfs-projectspace.8
@@ -83,9 +89,9 @@ usr/share/man/man8/zfs-set.8
 usr/share/man/man8/zfs-share.8
 usr/share/man/man8/zfs-snapshot.8
 usr/share/man/man8/zfs-unallow.8
 usr/share/man/man8/zfs-unjail.8
 usr/share/man/man8/zfs-unload-key.8
 usr/share/man/man8/zfs-unmount.8
 usr/share/man/man8/zfs-unzone.8
 usr/share/man/man8/zfs-upgrade.8
 usr/share/man/man8/zfs-userspace.8
 usr/share/man/man8/zfs-wait.8
@@ -124,6 +130,7 @@ usr/share/man/man8/zpool-sync.8
 usr/share/man/man8/zpool-trim.8
 usr/share/man/man8/zpool-upgrade.8
 usr/share/man/man8/zpool-wait.8
 usr/share/man/man8/zfs-zone.8
 usr/share/man/man8/zpool.8
 usr/share/man/man8/zstream.8
 usr/share/man/man8/zstreamdump.8
Author	SHA1	Message	Date
Thomas Lamprecht	68be554e71	backport 2.2.4 staging for better 6.8 support Use the current ZFS 2.2.4 staging tree [0] with commit deb7a8423 ("Fix corruption caused by mmap flushing problems") on top. Additionally, include an open, but ack'd, pull request [1] that avoids a potential general protection fault due to touching a vbio after it was handed off to the kernel. [0]: https://github.com/openzfs/zfs/commits/zfs-2.2.4-staging/ [1]: https://github.com/openzfs/zfs/pull/16049 Both should mostly touch the module code. Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2024-04-03 09:56:31 +02:00
Thomas Lamprecht	6c9ff9b992	bump version to 2.2.3-pve1 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2024-03-11 13:46:05 +01:00
Stoiko Ivanov	b48cfd2b15	fix #5288 : cherry-pick fix for udev-partition links > 16 If a zvol has more than 15 partitions, the minor device number exhausts the slot count reserved for partitions next to the zvol itself. As a result, the minor number cannot be used to determine the partition number for the higher partition, and doing so results in wrong named symlinks being generated by udev. Since the partition number is encoded in the block device name anyway, let's just extract it from there instead. For upstream issue and PR discussion see: https://github.com/openzfs/zfs/pull/15970 https://github.com/openzfs/zfs/issues/15904 Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com> Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2024-03-11 13:44:37 +01:00
Stoiko Ivanov	a5e0251015	update zfs submodule to 2.2.3 and refresh patches mostly support for newer kernel-versions, and fixes for the BRT bugs discovered with 2.2.0 (BRT remains disabled by default). The update contains a fix for CVE-2020-24370 in lua (which is present in ZFS for channel-programs, which we do not use) - see: https://github.com/openzfs/zfs/pull/15847 for more details. One patch from Stefan Lendl was backported and is now in the ZFS 2.2 branch. Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>	2024-03-11 13:41:25 +01:00
Thomas Lamprecht	838cd1d173	bump version to 2.2.2-pve2 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2024-02-19 17:00:17 +01:00
Stefan Lendl	5f4f0445f4	Fix #5101 : exports with sharenfs remain after zfs mount -a When running `zfs mount -a`, prevent the exported datasets (with sharenfs) to be truncated (unexported). Adds tests to verify shares persist after mount -a Signed-off-by: Stefan Lendl <s.lendl@proxmox.com>	2024-02-02 19:17:28 +01:00
Thomas Lamprecht	81d11761c3	bump version to 2.2.2-pve1 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-12-04 16:50:30 +01:00
Stoiko Ivanov	3bda92bd20	d/zfsutils-linux.install: add zfs_prepare_disk and manpage Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>	2023-12-04 16:48:29 +01:00
Stoiko Ivanov	f67eb9538f	update zfs submodule to 2.2.2 and refresh patches the removed patches were cherry-picks, which are included in 2.2.2 Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>	2023-12-04 16:48:29 +01:00
Fabian Grünbichler	00036e5a6e	bump version to 2.2.0-pve4 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>	2023-11-29 09:22:05 +01:00
Fabian Grünbichler	3db00caad9	cherry-pick fix for data corruption cherry-picked from 2.2.0-staging, fixing https://github.com/openzfs/zfs/issues/15526 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>	2023-11-29 09:18:39 +01:00
Thomas Lamprecht	e295f30e6a	bump version to 2.2.0-pve3 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-11-17 17:33:02 +01:00
Stoiko Ivanov	96c807af63	pick bug-fixes staged for 2.2.1 ZFS 2.2.1 is currently being prepared, but the 3 patches added here seem quite relevant, as they might cause data loss/panics on setups which run `zpool upgrade`. See upstream's discussion for 2.2.1: https://github.com/openzfs/zfs/pull/15498/ and the most critical issue: https://github.com/openzfs/zfs/pull/15529 finally: https://github.com/openzfs/zfs/commit/459c99ff2339a4a514abcf2255f9b3e5324ef09e should not hurt either. The change to the UBSAN patch (0013) is unrelated, cosmetic only, and happened by running export-patchqueue. Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>	2023-11-17 17:30:26 +01:00
Thomas Lamprecht	88fd6e053b	bump version to 2.2.0-pve2 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-11-12 16:04:12 +01:00
Thomas Lamprecht	4f818e9880	ensure vdev_stat struct layout compat between 2.1 and 2.2 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-11-12 15:59:55 +01:00
Thomas Lamprecht	310afb0d19	backport work-around for UBSAN-errors with variable arrays Link: https://github.com/openzfs/zfs/pull/15510 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-11-12 15:59:16 +01:00
Stoiko Ivanov	0f9a07b53e	add patch for spurious warning on `zfs mount -a` reported in our community forum: https://forum.proxmox.com/threads/135635/#post-60036 the small fix was merged upstream: https://github.com/openzfs/zfs/pull/15468 minimally tested by building with this patch and running `zfs mount -a` on an affected system. Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>	2023-11-02 20:12:02 +01:00
Thomas Lamprecht	aa99285dda	bump version to 2.2.0-pve1 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-10-27 18:25:54 +02:00
Stoiko Ivanov	7e3b7d81a1	fix #5014 reenable blk-mq optimization While I think the huge performance optimization was at some point not really that huge in practice - the feature sounds like it would benefit our use-case: https://github.com/openzfs/zfs/pull/13148 currently the feature is disabled in 2.2.0 (see the second patch), because of the issues addressed by the first patch Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com> Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-10-27 18:24:32 +02:00
Stoiko Ivanov	28de0abfa9	d/control: fix depends provided by dpkg-gencontrol Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com> Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-10-27 18:24:32 +02:00
Stoiko Ivanov	a20ffcd44f	d/rules: drop bash_completion mangling This was integrated into upstreams autoconf in commit: e69ade32e116e72d03068c03799924c3f1a15c95 (contrib: bash_completion.d: make install destination vendor dependent) Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com> Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-10-27 18:24:32 +02:00
Thomas Lamprecht	1382616c40	d/lintian: adapt zfs-test overrides to debhelper 13 format Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-10-27 18:24:32 +02:00
Thomas Lamprecht	1621cb1079	d/copyright: adapt to file moves and deletions Lots of targeted commands consisting of a single file as source got moved a level up from their command-specific sub-directory to cmd/ directory directly. A handful of build-artefacts like configure/m4 files generated by auto-tools, were removed from being tracked by git. Then some stuff was simply unused or broken and got deleted (vdev_cache, zfs_spa). Others were replaced (sha256 by general sha2 library, zstreamdump by zstream). Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-10-27 18:24:32 +02:00
Thomas Lamprecht	508220ed2c	zfsutils-linux: install new trim units, zilstat tool, and new man pages Cater to dh_missing complaints and ship new: - zilstat tool - zfs-lock and zfs-unlock manual pages in section 8 - vdevprops manual page in section 7 - systemd template unit files for trimming a specific pool, inclusive timers to do so on a weekly and/or monthly basis. Keep those covered by our default "no-stop-on-upgrade" rule for the dh_systemd helper. Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-10-27 18:24:32 +02:00
Thomas Lamprecht	6da885c3b1	d/patches: add context to ZED no-symlink patch Just copy over the existing commit message... Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-10-27 18:24:32 +02:00
Thomas Lamprecht	2840fef531	d/install: remove manpages for BSD-specific jail/unjail hooks With ZFS 2.2 they're actually only installed if ZFS is being built for FreeBSD, so not removing them here leads to a missing file error. Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-10-27 18:24:32 +02:00
Thomas Lamprecht	1f8dab1515	d/rules: add missing quotes to glob passed to find Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-10-27 18:24:32 +02:00
Thomas Lamprecht	2c95b92384	update submodule and patches for 2.2.0 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-10-27 18:24:32 +02:00
Thomas Lamprecht	9e8946d4b9	backport fix for AMX register breakage vmexit's can cause the AMX registers to "misbehave" which can break ZFS, even though ZFS doesn't use AMX at all. This causes crashes and processes hanging forever in uninterruptible sleep (the infamous D state) on Intel Xeon 4th gen HW, possible other HW too, but we only got reports on Sapphire Rapids models. Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com> Tested-by: Aaron Lauterer <a.lauterer@proxmox.com>	2023-10-11 16:05:26 +02:00
Thomas Lamprecht	8c6520d1fc	buildsys: improve clean target Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-10-10 15:44:59 +02:00
Thomas Lamprecht	aa26132525	bump version to 2.1.13-pve1 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-09-28 15:40:17 +02:00
Thomas Lamprecht	13c7e925aa	add basic gitignore Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-09-28 15:40:17 +02:00
Thomas Lamprecht	a80c5e3597	buildsys: improve DSC target Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-09-28 15:40:17 +02:00
Thomas Lamprecht	149fd91bb2	buildsys: align variable names with our commonly used ones Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-09-28 15:40:17 +02:00
Stoiko Ivanov	362d3432be	update zfs submodule to 2.1.13 and refresh patches Suggested-by: Thomas Lamprecht <t.lamprecht@proxmox.com> Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com> Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-09-28 15:40:17 +02:00
Thomas Lamprecht	f5ed5be89a	bump version to 2.1.12-pve1 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-06-13 15:25:21 +02:00
Stoiko Ivanov	5891aaec34	/lib/zfs-linux/trim: don't exit 1 if last pool isn't nvme-only (Closes: #1030316 ) (cherry picked from debian-upstream[0] commit 8ed69adac193f6463832f6ae34b5ded88b8014d8) [0] https://salsa.debian.org/zfsonlinux-team/zfs Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>	2023-06-13 15:23:49 +02:00
Stoiko Ivanov	63e591d8a9	update zfs submodule to 2.1.12 patches still applied cleanly Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>	2023-06-13 15:23:49 +02:00
Thomas Lamprecht	d855afe7be	bump version to 2.1.11-pve2 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-05-20 20:37:21 +02:00
Thomas Lamprecht	34d701d1ac	buildsys: add sbuild convenience target Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-05-20 20:37:21 +02:00
Thomas Lamprecht	40fe66e33e	buildsys: derive upload dist and arch automatically Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-05-20 20:37:21 +02:00
Thomas Lamprecht	1b7710c13c	d/copyright: update from debian upstream Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-05-20 20:37:21 +02:00
Thomas Lamprecht	2f5fca8a1a	d/control: do not depend on obsolete lsb-base Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-05-20 20:37:21 +02:00
Thomas Lamprecht	8ba2c83746	d/rules: drop --parallel flag, useless for dh-compat >= 10 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-05-20 20:37:21 +02:00
Thomas Lamprecht	dff6b68bf5	drop transitional zfs-dbg package this effectively reverts the commit `755c716` ("d/control: add transitional zfs-dbg package") Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>	2023-05-20 20:37:16 +02:00