Tag zfs-2.2.6

META file and changelog updated. Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Enable L2 cache of all (MRU+MFU) metadata but MFU data only
2026-05-24 03:08:51 +03:00 · 2024-08-27 14:53:03 -07:00 · 2024-08-27 14:53:03 -07:00 · 2024-08-26 15:10:16 -07:00 · 2024-08-26 15:10:16 -07:00 · 2024-08-26 15:10:16 -07:00
92 changed files with 1636 additions and 548 deletions
@@ -77,7 +77,10 @@ Yanping Gao <yanping.gao@xtaotech.com>
 Youzhong Yang <youzhong@gmail.com>

 # Signed-off-by: overriding Author:
+Ryan <errornointernet@envs.net> <error.nointernet@gmail.com>
+Qiuhao Chen <chenqiuhao1997@gmail.com> <haohao0924@126.com>
 Yuxin Wang <yuxinwang9999@gmail.com> <Bi11gates9999@gmail.com>
+Zhenlei Huang <zlei@FreeBSD.org> <zlei.huang@gmail.com>

 # Commits from strange places, long ago
 Brian Behlendorf <behlendorf1@llnl.gov> <behlendo@7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c>
@@ -95,6 +98,7 @@ Alek Pinchuk <apinchuk@axcient.com> <alek-p@users.noreply.github.com>
 Alexander Lobakin <alobakin@pm.me> <solbjorn@users.noreply.github.com>
 Alexey Smirnoff <fling@member.fsf.org> <fling-@users.noreply.github.com>
 Allen Holl <allen.m.holl@gmail.com> <65494904+allen-4@users.noreply.github.com>
+Alphan Yılmaz <alphanyilmaz@gmail.com> <a1ea321@users.noreply.github.com>
 Ameer Hamza <ahamza@ixsystems.com> <106930537+ixhamza@users.noreply.github.com>
 Andrew J. Hesford <ajh@sideband.org> <48421688+ahesford@users.noreply.github.com>>
 Andrew Sun <me@andrewsun.com> <as-com@users.noreply.github.com>
@@ -102,6 +106,7 @@ Aron Xu <happyaron.xu@gmail.com> <happyaron@users.noreply.github.com>
 Arun KV <arun.kv@datacore.com> <65647132+arun-kv@users.noreply.github.com>
 Ben Wolsieffer <benwolsieffer@gmail.com> <lopsided98@users.noreply.github.com>
 bernie1995 <bernie.pikes@gmail.com> <42413912+bernie1995@users.noreply.github.com>
+Bojan Novković <bnovkov@FreeBSD.org> <72801811+bnovkov@users.noreply.github.com>
 Boris Protopopov <boris.protopopov@actifio.com> <bprotopopov@users.noreply.github.com>
 Brad Forschinger <github@bnjf.id.au> <bnjf@users.noreply.github.com>
 Brandon Thetford <brandon@dodecatec.com> <dodexahedron@users.noreply.github.com>
@@ -193,6 +198,7 @@ Stefan Lendl <s.lendl@proxmox.com> <1321542+stfl@users.noreply.github.com>
 Thomas Bertschinger <bertschinger@lanl.gov> <101425190+bertschinger@users.noreply.github.com>
 Thomas Geppert <geppi@digitx.de> <geppi@users.noreply.github.com>
 Tim Crawford <tcrawford@datto.com> <crawfxrd@users.noreply.github.com>
+Todd Seidelmann <18294602+seidelma@users.noreply.github.com>
 Tom Matthews <tom@axiom-partners.com> <tomtastic@users.noreply.github.com>
 Tony Perkins <tperkins@datto.com> <62951051+tony-zfs@users.noreply.github.com>
 Torsten Wörtwein <twoertwein@gmail.com> <twoertwein@users.noreply.github.com>
@@ -46,6 +46,7 @@ CONTRIBUTORS:
    Alex Zhuravlev <alexey.zhuravlev@intel.com>
    Allan Jude <allanjude@freebsd.org>
    Allen Holl <allen.m.holl@gmail.com>
+    Alphan Yılmaz <alphanyilmaz@gmail.com>
    alteriks <alteriks@gmail.com>
    Alyssa Ross <hi@alyssa.is>
    Ameer Hamza <ahamza@ixsystems.com>
@@ -99,6 +100,7 @@ CONTRIBUTORS:
    bernie1995 <bernie.pikes@gmail.com>
    Bill McGonigle <bill-github.com-public1@bfccomputing.com>
    Bill Pijewski <wdp@joyent.com>
+    Bojan Novković <bnovkov@FreeBSD.org>
    Boris Protopopov <boris.protopopov@nexenta.com>
    Brad Forschinger <github@bnjf.id.au>
    Brad Lewis <brad.lewis@delphix.com>
@@ -168,6 +170,7 @@ CONTRIBUTORS:
    Daniel Hoffman <dj.hoffman@delphix.com>
    Daniel Kobras <d.kobras@science-computing.de>
    Daniel Kolesa <daniel@octaforge.org>
+    Daniel Perry <dtperry@amazon.com>
    Daniel Reichelt <hacking@nachtgeist.net>
    Daniel Stevenson <bot@dstev.net>
    Daniel Verite <daniel@verite.pro>
@@ -187,6 +190,7 @@ CONTRIBUTORS:
    Dennis R. Friedrichsen <dennis.r.friedrichsen@gmail.com>
    Denys Rtveliashvili <denys@rtveliashvili.name>
    Derek Dai <daiderek@gmail.com>
+    Derek Schrock <dereks@lifeofadishwasher.com>
    Dex Wood <slash2314@gmail.com>
    DHE <git@dehacked.net>
    Didier Roche <didrocks@ubuntu.com>
@@ -245,6 +249,7 @@ CONTRIBUTORS:
    Gionatan Danti <g.danti@assyoma.it>
    Giuseppe Di Natale <guss80@gmail.com>
    Glenn Washburn <development@efficientek.com>
+    glibg10b <glibg10b@users.noreply.github.com>
    gofaster <felix.gofaster@gmail.com>
    Gordan Bobic <gordan@redsleeve.org>
    Gordon Bergling <gbergling@googlemail.com>
@@ -410,6 +415,7 @@ CONTRIBUTORS:
    Mart Frauenlob <allkind@fastest.cc>
    Martin Matuska <mm@FreeBSD.org>
    Martin Rüegg <martin.rueegg@metaworx.ch>
+    Martin Wagner <martin.wagner.dev@gmail.com>
    Massimo Maggi <me@massimo-maggi.eu>
    Mateusz Guzik <mjguzik@gmail.com>
    Mateusz Piotrowski <0mp@FreeBSD.org>
@@ -488,6 +494,7 @@ CONTRIBUTORS:
    Peng <peng.hse@xtaotech.com>
    Peter Ashford <ashford@accs.com>
    Peter Dave Hello <hsu@peterdavehello.org>
+    Peter Doherty <peterd@acranox.org>
    Peter Levine <plevine457@gmail.com>
    Peter Wirdemo <peter.wirdemo@gmail.com>
    Petros Koutoupis <petros@petroskoutoupis.com>
@@ -501,6 +508,7 @@ CONTRIBUTORS:
    Prasad Joshi <prasadjoshi124@gmail.com>
    privb0x23 <privb0x23@users.noreply.github.com>
    P.SCH <p88@yahoo.com>
+    Qiuhao Chen <chenqiuhao1997@gmail.com>
    Quartz <yyhran@163.com>
    Quentin Zdanis <zdanisq@gmail.com>
    Rafael Kitover <rkitover@gmail.com>
@@ -532,6 +540,7 @@ CONTRIBUTORS:
    Roman Strashkin <roman.strashkin@nexenta.com>
    Ross Williams <ross@ross-williams.net>
    Ruben Kerkhof <ruben@rubenkerkhof.com>
+    Ryan <errornointernet@envs.net>
    Ryan Hirasaki <ryanhirasaki@gmail.com>
    Ryan Lahfa <masterancpp@gmail.com>
    Ryan Libby <rlibby@FreeBSD.org>
@@ -556,6 +565,7 @@ CONTRIBUTORS:
    Sen Haerens <sen@senhaerens.be>
    Serapheim Dimitropoulos <serapheim@delphix.com>
    Seth Forshee <seth.forshee@canonical.com>
+    Seth Troisi <sethtroisi@google.com>
    Shaan Nobee <sniper111@gmail.com>
    Shampavman <sham.pavman@nexenta.com>
    Shaun Tancheff <shaun@aeonazure.com>
@@ -602,6 +612,7 @@ CONTRIBUTORS:
    Tim Schumacher <timschumi@gmx.de>
    Tino Reichardt <milky-zfs@mcmilk.de>
    Tobin Harding <me@tobin.cc>
+    Todd Seidelmann <seidelma@users.noreply.github.com>
    Tom Caputi <tcaputi@datto.com>
    Tom Matthews <tom@axiom-partners.com>
    Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
@@ -653,6 +664,8 @@ CONTRIBUTORS:
    Zachary Bedell <zac@thebedells.org>
    Zach Dykstra <dykstra.zachary@gmail.com>
    zgock <zgock@nuc.base.zgock-lab.net>
+    Zhao Yongming <zym@apache.org>
+    Zhenlei Huang <zlei@FreeBSD.org>
    Zhu Chuang <chuang@melty.land>
    Érico Nogueira <erico.erc@gmail.com>
    Đoàn Trần Công Danh <congdanhqx@gmail.com>
@@ -1,10 +1,10 @@
 Meta:          1
 Name:          zfs
 Branch:        1.0
-Version:       2.2.4
+Version:       2.2.6
 Release:       1
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS
-Linux-Maximum: 6.8
+Linux-Maximum: 6.10
 Linux-Minimum: 3.10
@@ -48,6 +48,7 @@
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
+#include <sys/zap_impl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_sa.h>
@@ -84,6 +85,9 @@
 #include <sys/brt_impl.h>
 #include <zfs_comutil.h>
 #include <sys/zstd/zstd.h>
+#if (__GLIBC__ && !__UCLIBC__)
+#include <execinfo.h> /* for backtrace() */
+#endif

 #include <libnvpair.h>
 #include <libzutil.h>
@@ -926,11 +930,41 @@ usage(void)
 static void
 dump_debug_buffer(void)
 {
-	if (dump_opt['G']) {
-		(void) printf("\n");
-		(void) fflush(stdout);
-		zfs_dbgmsg_print("zdb");
-	}
+	ssize_t ret __attribute__((unused));
+
+	if (!dump_opt['G'])
+		return;
+	/*
+	 * We use write() instead of printf() so that this function
+	 * is safe to call from a signal handler.
+	 */
+	ret = write(STDOUT_FILENO, "\n", 1);
+	zfs_dbgmsg_print("zdb");
+}
+
+#define	BACKTRACE_SZ	100
+
+static void sig_handler(int signo)
+{
+	struct sigaction action;
+#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */
+	int nptrs;
+	void *buffer[BACKTRACE_SZ];
+
+	nptrs = backtrace(buffer, BACKTRACE_SZ);
+	backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
+#endif
+	dump_debug_buffer();
+
+	/*
+	 * Restore default action and re-raise signal so SIGSEGV and
+	 * SIGABRT can trigger a core dump.
+	 */
+	action.sa_handler = SIG_DFL;
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = 0;
+	(void) sigaction(signo, &action, NULL);
+	raise(signo);
 }

 /*
@@ -1199,16 +1233,33 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
 	for (zap_cursor_init(&zc, os, object);
 	    zap_cursor_retrieve(&zc, &attr) == 0;
 	    zap_cursor_advance(&zc)) {
-		(void) printf("\t\t%s = ", attr.za_name);
+		boolean_t key64 =
+		    !!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY);
+
+		if (key64)
+			(void) printf("\t\t0x%010lx = ",
+			    *(uint64_t *)attr.za_name);
+		else
+			(void) printf("\t\t%s = ", attr.za_name);
+
 		if (attr.za_num_integers == 0) {
 			(void) printf("\n");
 			continue;
 		}
 		prop = umem_zalloc(attr.za_num_integers *
 		    attr.za_integer_length, UMEM_NOFAIL);
-		(void) zap_lookup(os, object, attr.za_name,
-		    attr.za_integer_length, attr.za_num_integers, prop);
-		if (attr.za_integer_length == 1) {
+
+		if (key64)
+			(void) zap_lookup_uint64(os, object,
+			    (const uint64_t *)attr.za_name, 1,
+			    attr.za_integer_length, attr.za_num_integers,
+			    prop);
+		else
+			(void) zap_lookup(os, object, attr.za_name,
+			    attr.za_integer_length, attr.za_num_integers,
+			    prop);
+
+		if (attr.za_integer_length == 1 && !key64) {
 			if (strcmp(attr.za_name,
 			    DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||
 			    strcmp(attr.za_name,
@@ -1227,6 +1278,10 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
 		} else {
 			for (i = 0; i < attr.za_num_integers; i++) {
 				switch (attr.za_integer_length) {
+				case 1:
+					(void) printf("%u ",
+					    ((uint8_t *)prop)[i]);
+					break;
 				case 2:
 					(void) printf("%u ",
 					    ((uint16_t *)prop)[i]);
@@ -5217,7 +5272,7 @@ dump_label(const char *dev)
 	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));

 	psize = statbuf.st_size;
-	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
+	psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t);
 	ashift = SPA_MINBLOCKSHIFT;

 	/*
@@ -8934,9 +8989,27 @@ main(int argc, char **argv)
 	char *spa_config_path_env, *objset_str;
 	boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
 	nvlist_t *cfg = NULL;
+	struct sigaction action;

 	dprintf_setup(&argc, argv);

+	/*
+	 * Set up signal handlers, so if we crash due to bad on-disk data we
+	 * can get more info. Unlike ztest, we don't bail out if we can't set
+	 * up signal handlers, because zdb is very useful without them.
+	 */
+	action.sa_handler = sig_handler;
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = 0;
+	if (sigaction(SIGSEGV, &action, NULL) < 0) {
+		(void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n",
+		    strerror(errno));
+	}
+	if (sigaction(SIGABRT, &action, NULL) < 0) {
+		(void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n",
+		    strerror(errno));
+	}
+
 	/*
 	 * If there is an environment variable SPA_CONFIG_PATH it overrides
 	 * default spa_config_path setting. If -U flag is specified it will
@@ -186,7 +186,7 @@ static void
 zfs_redup_stream(int infd, int outfd, boolean_t verbose)
 {
 	int bufsz = SPA_MAXBLOCKSIZE;
-	dmu_replay_record_t thedrr = { 0 };
+	dmu_replay_record_t thedrr;
 	dmu_replay_record_t *drr = &thedrr;
 	redup_table_t rdt;
 	zio_cksum_t stream_cksum;
@@ -194,6 +194,8 @@ zfs_redup_stream(int infd, int outfd, boolean_t verbose)
 	uint64_t num_records = 0;
 	uint64_t num_write_byref_records = 0;

+	memset(&thedrr, 0, sizeof (dmu_replay_record_t));
+
 #ifdef _ILP32
 	uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
 #else
@@ -2448,7 +2448,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
 		ASSERT3P(zio, !=, NULL);
 		size = doi.doi_data_block_size;
 		if (ISP2(size)) {
-			offset = P2ALIGN(offset, size);
+			offset = P2ALIGN_TYPED(offset, size, uint64_t);
 		} else {
 			ASSERT3U(offset, <, size);
 			offset = 0;
@@ -4668,7 +4668,8 @@ ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
 	 */
 	mutex_enter(&os->os_obj_lock);
 	object = ztest_random(os->os_obj_next_chunk);
-	os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
+	os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk,
+	    uint64_t);
 	mutex_exit(&os->os_obj_lock);
 }

@@ -6284,7 +6285,8 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
 		 * the end of the disk (vdev_psize) is aligned to
 		 * sizeof (vdev_label_t).
 		 */
-		uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
+		uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t),
+		    uint64_t);
 		if ((leaf & 1) == 1 &&
 		    offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
 			continue;
@@ -6600,8 +6602,8 @@ ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id)
 				size_t inc = 64 * ztest_random(size / 67);
 				/* sometimes add few bytes to test non-simd */
 				if (ztest_random(100) < 10)
-					inc += P2ALIGN(ztest_random(64),
-					    sizeof (uint32_t));
+					inc += P2ALIGN_TYPED(ztest_random(64),
+					    sizeof (uint32_t), uint64_t);

 				if (inc > (size - pos))
 					inc = size - pos;
@@ -90,8 +90,8 @@ AC_DEFUN([ZFS_AC_FIND_SYSTEM_LIBRARY], [
 	AC_DEFINE([HAVE_][$1], [1], [Define if you have [$5]])
 	$7
    ],[dnl ELSE
-	AC_SUBST([$1]_CFLAGS, [])
-	AC_SUBST([$1]_LIBS, [])
+	AC_SUBST([$1]_CFLAGS, [""])
+	AC_SUBST([$1]_LIBS, [""])
 	AC_MSG_WARN([cannot find [$5] via pkg-config or in the standard locations])
 	$8
    ])
@@ -25,6 +25,8 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_PLUG], [
 dnl #
 dnl # 2.6.32 - 4.11: statically allocated bdi in request_queue
 dnl # 4.12: dynamically allocated bdi in request_queue
+dnl # 6.11: bdi no longer available through request_queue, so get it from
+dnl #       the gendisk attached to the queue
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI], [
 	ZFS_LINUX_TEST_SRC([blk_queue_bdi], [
@@ -47,6 +49,30 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_BDI], [
 	])
 ])

+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI], [
+	ZFS_LINUX_TEST_SRC([blk_queue_disk_bdi], [
+		#include <linux/blkdev.h>
+		#include <linux/backing-dev.h>
+	], [
+		struct request_queue q;
+		struct gendisk disk;
+		struct backing_dev_info bdi __attribute__ ((unused));
+		q.disk = &disk;
+		q.disk->bdi = &bdi;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI], [
+	AC_MSG_CHECKING([whether backing_dev_info is available through queue gendisk])
+	ZFS_LINUX_TEST_RESULT([blk_queue_disk_bdi], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLK_QUEUE_DISK_BDI, 1,
+		    [backing_dev_info is available through queue gendisk])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
 dnl #
 dnl # 5.9: added blk_queue_update_readahead(),
 dnl # 5.15: renamed to disk_update_readahead()
@@ -332,7 +358,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS], [
 	ZFS_LINUX_TEST_RESULT([blk_queue_max_hw_sectors], [
 		AC_MSG_RESULT(yes)
 	],[
-		ZFS_LINUX_TEST_ERROR([blk_queue_max_hw_sectors])
+		AC_MSG_RESULT(no)
 	])
 ])

@@ -355,7 +381,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [
 	ZFS_LINUX_TEST_RESULT([blk_queue_max_segments], [
 		AC_MSG_RESULT(yes)
 	], [
-		ZFS_LINUX_TEST_ERROR([blk_queue_max_segments])
+		AC_MSG_RESULT(no)
 	])
 ])

@@ -407,6 +433,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI
+	ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE
@@ -421,6 +448,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
 	ZFS_AC_KERNEL_BLK_QUEUE_PLUG
 	ZFS_AC_KERNEL_BLK_QUEUE_BDI
+	ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI
 	ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD
 	ZFS_AC_KERNEL_BLK_QUEUE_DISCARD
 	ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE
@@ -534,6 +534,30 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [
 	])
 ])

+dnl #
+dnl # 5.16 API change
+dnl # Added bdev_nr_bytes() helper.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_NR_BYTES], [
+	ZFS_LINUX_TEST_SRC([bdev_nr_bytes], [
+		#include <linux/blkdev.h>
+	],[
+		struct block_device *bdev = NULL;
+		loff_t nr_bytes __attribute__ ((unused)) = 0;
+		nr_bytes = bdev_nr_bytes(bdev);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_NR_BYTES], [
+	AC_MSG_CHECKING([whether bdev_nr_bytes() is available])
+	ZFS_LINUX_TEST_RESULT([bdev_nr_bytes], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BDEV_NR_BYTES, 1, [bdev_nr_bytes() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
 dnl #
 dnl # 5.20 API change,
 dnl # Removed bdevname(), snprintf(.., %pg) should be used.
@@ -747,6 +771,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE
+	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_NR_BYTES
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME
 	ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ
@@ -767,6 +792,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
 	ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE
 	ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE
+	ZFS_AC_KERNEL_BLKDEV_BDEV_NR_BYTES
 	ZFS_AC_KERNEL_BLKDEV_BDEVNAME
 	ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS
 	ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD
@@ -58,6 +58,13 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [
 		disk = blk_alloc_disk(lim, NUMA_NO_NODE);
 	])

+	ZFS_LINUX_TEST_SRC([blkdev_queue_limits_features], [
+		#include <linux/blkdev.h>
+	],[
+		struct queue_limits *lim = NULL;
+		lim->features = 0;
+	])
+
 	ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [
 		#include <linux/blkdev.h>
 	],[
@@ -114,6 +121,20 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
 			AC_MSG_RESULT(yes)
 			AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args])

+			dnl #
+			dnl # Linux 6.11 API change:
+			dnl # struct queue_limits gains a 'features' field,
+			dnl # used to set flushing options
+			dnl #
+			AC_MSG_CHECKING([whether struct queue_limits has a features field])
+			ZFS_LINUX_TEST_RESULT([blkdev_queue_limits_features], [
+				AC_MSG_RESULT(yes)
+				AC_DEFINE([HAVE_BLKDEV_QUEUE_LIMITS_FEATURES], 1,
+				    [struct queue_limits has a features field])
+			], [
+				AC_MSG_RESULT(no)
+			])
+
 			dnl #
 			dnl # 5.20 API change,
 			dnl # Removed blk_cleanup_disk(), put_disk() should be used.
@@ -1,17 +0,0 @@
-AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
-	ZFS_LINUX_TEST_SRC([page_size], [
-		#include <linux/mm.h>
-	],[
-		unsigned long s;
-		s = page_size(NULL);
-	])
-])
-AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
-	AC_MSG_CHECKING([whether page_size() is available])
-	ZFS_LINUX_TEST_RESULT([page_size], [
-		AC_MSG_RESULT(yes)
-		AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
-	],[
-		AC_MSG_RESULT(no)
-	])
-])
@@ -0,0 +1,36 @@
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
+	ZFS_LINUX_TEST_SRC([page_size], [
+		#include <linux/mm.h>
+	],[
+		unsigned long s;
+		s = page_size(NULL);
+	])
+])
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
+	AC_MSG_CHECKING([whether page_size() is available])
+	ZFS_LINUX_TEST_RESULT([page_size], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING], [
+	ZFS_LINUX_TEST_SRC([page_mapping], [
+		#include <linux/pagemap.h>
+	],[
+		struct page *p = NULL;
+		struct address_space *m = page_mapping(NULL);
+	])
+])
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_MAPPING], [
+	AC_MSG_CHECKING([whether page_mapping() is available])
+	ZFS_LINUX_TEST_RESULT([page_mapping], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_MM_PAGE_MAPPING, 1, [page_mapping() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
@@ -25,3 +25,62 @@ AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [
 		AC_MSG_RESULT([no])
 	])
 ])
+
+dnl #
+dnl # Linux 6.11 register_sysctl() enforces that sysctl tables no longer
+dnl # supply a sentinel end-of-table element. 6.6 introduces
+dnl # register_sysctl_sz() to enable callers to choose, so we use it if
+dnl # available for backward compatibility.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ], [
+	ZFS_LINUX_TEST_SRC([has_register_sysctl_sz], [
+		#include <linux/sysctl.h>
+	],[
+		struct ctl_table test_table[] __attribute__((unused)) = {0};
+		register_sysctl_sz("", test_table, 0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ], [
+	AC_MSG_CHECKING([whether register_sysctl_sz exists])
+	ZFS_LINUX_TEST_RESULT([has_register_sysctl_sz], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_REGISTER_SYSCTL_SZ, 1,
+			[register_sysctl_sz exists])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
+
+dnl #
+dnl # Linux 6.11 makes const the ctl_table arg of proc_handler
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST], [
+	ZFS_LINUX_TEST_SRC([has_proc_handler_ctl_table_const], [
+		#include <linux/sysctl.h>
+
+		static int test_handler(
+		    const struct ctl_table *ctl __attribute((unused)),
+		    int write __attribute((unused)),
+		    void *buffer __attribute((unused)),
+		    size_t *lenp __attribute((unused)),
+		    loff_t *ppos __attribute((unused)))
+		{
+			return (0);
+		}
+	], [
+		proc_handler *ph __attribute((unused)) =
+		    &test_handler;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST], [
+	AC_MSG_CHECKING([whether proc_handler ctl_table arg is const])
+	ZFS_LINUX_TEST_RESULT([has_proc_handler_ctl_table_const], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_PROC_HANDLER_CTL_TABLE_CONST, 1,
+		    [proc_handler ctl_table arg is const])
+	], [
+		AC_MSG_RESULT([no])
+	])
+])
@@ -0,0 +1,40 @@
+dnl #
+dnl # check if kernel provides definitions for given types
+dnl #
+
+dnl _ZFS_AC_KERNEL_SRC_TYPE(type)
+AC_DEFUN([_ZFS_AC_KERNEL_SRC_TYPE], [
+	ZFS_LINUX_TEST_SRC([type_$1], [
+		#include <linux/types.h>
+	],[
+		const $1 __attribute__((unused)) x = ($1) 0;
+	])
+])
+
+dnl _ZFS_AC_KERNEL_TYPE(type)
+AC_DEFUN([_ZFS_AC_KERNEL_TYPE], [
+	AC_MSG_CHECKING([whether kernel defines $1])
+	ZFS_LINUX_TEST_RESULT([type_$1], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE([HAVE_KERNEL_]m4_quote(m4_translit([$1], [a-z], [A-Z])),
+		    1, [kernel defines $1])
+	], [
+		AC_MSG_RESULT([no])
+	])
+])
+
+dnl ZFS_AC_KERNEL_TYPES([types...])
+AC_DEFUN([ZFS_AC_KERNEL_TYPES], [
+	AC_DEFUN([ZFS_AC_KERNEL_SRC_TYPES], [
+		m4_foreach_w([type], [$1], [
+			_ZFS_AC_KERNEL_SRC_TYPE(type)
+		])
+	])
+	AC_DEFUN([ZFS_AC_KERNEL_TYPES], [
+		m4_foreach_w([type], [$1], [
+			_ZFS_AC_KERNEL_TYPE(type)
+		])
+	])
+])
+
+ZFS_AC_KERNEL_TYPES([intptr_t])
@@ -37,6 +37,7 @@ dnl # only once the compilation can be done in parallel significantly
 dnl # speeding up the process.
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
+	ZFS_AC_KERNEL_SRC_TYPES
 	ZFS_AC_KERNEL_SRC_OBJTOOL
 	ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE
 	ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE
@@ -165,9 +166,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_WRITEPAGE_T
 	ZFS_AC_KERNEL_SRC_RECLAIMED
 	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
+	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ
+	ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST
 	ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SRC_SYNC_BDEV
 	ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
+	ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -187,6 +191,7 @@ dnl #
 dnl # Check results of kernel interface tests.
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
+	ZFS_AC_KERNEL_TYPES
 	ZFS_AC_KERNEL_ACCESS_OK_TYPE
 	ZFS_AC_KERNEL_GLOBAL_PAGE_STATE
 	ZFS_AC_KERNEL_OBJTOOL
@@ -315,9 +320,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_WRITEPAGE_T
 	ZFS_AC_KERNEL_RECLAIMED
 	ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
+	ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ
+	ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST
 	ZFS_AC_KERNEL_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SYNC_BDEV
 	ZFS_AC_KERNEL_MM_PAGE_SIZE
+	ZFS_AC_KERNEL_MM_PAGE_MAPPING
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_CPU_HAS_FEATURE
@@ -0,0 +1,14 @@
+dnl
+dnl backtrace(), for userspace assertions. glibc has this directly in libc.
+dnl FreeBSD and (sometimes) musl have it in a separate -lexecinfo. It's assumed
+dnl that this will also get the companion function backtrace_symbols().
+dnl
+AC_DEFUN([ZFS_AC_CONFIG_USER_BACKTRACE], [
+	AX_SAVE_FLAGS
+	LIBS=""
+	AC_SEARCH_LIBS([backtrace], [execinfo], [
+		AC_DEFINE(HAVE_BACKTRACE, 1, [backtrace() is available])
+		AC_SUBST([BACKTRACE_LIBS], ["$LIBS"])
+	])
+	AX_RESTORE_FLAGS
+])
@@ -0,0 +1,44 @@
+dnl
+dnl Checks for libunwind, which usually does a better job than backtrace() when
+dnl resolving symbols in the stack backtrace. Newer versions have support for
+dnl getting info about the object file the function came from, so we look for
+dnl that too and use it if found.
+dnl
+AC_DEFUN([ZFS_AC_CONFIG_USER_LIBUNWIND], [
+	AC_ARG_WITH([libunwind],
+	    AS_HELP_STRING([--with-libunwind],
+		[use libunwind for backtraces in userspace assertions]),
+	    [],
+	    [with_libunwind=auto])
+
+	AS_IF([test "x$with_libunwind" != "xno"], [
+		ZFS_AC_FIND_SYSTEM_LIBRARY(LIBUNWIND, [libunwind], [libunwind.h], [], [unwind], [], [
+			dnl unw_get_elf_filename() is sometimes a macro, other
+			dnl times a proper symbol, so we can't just do a link
+			dnl check; we need to include the header properly.
+			AX_SAVE_FLAGS
+			CFLAGS="$CFLAGS $LIBUNWIND_CFLAGS"
+			LIBS="$LIBS $LIBUNWIND_LIBS"
+			AC_MSG_CHECKING([for unw_get_elf_filename in libunwind])
+			AC_LINK_IFELSE([
+				AC_LANG_PROGRAM([
+					#define UNW_LOCAL_ONLY
+					#include <libunwind.h>
+				], [
+					unw_get_elf_filename(0, 0, 0, 0);
+				])
+			], [
+				AC_MSG_RESULT([yes])
+				AC_DEFINE(HAVE_LIBUNWIND_ELF, 1,
+				    [libunwind has unw_get_elf_filename])
+			], [
+				AC_MSG_RESULT([no])
+			])
+			AX_RESTORE_FLAGS
+		], [
+			AS_IF([test "x$with_libunwind" = "xyes"], [
+				AC_MSG_FAILURE([--with-libunwind was given, but libunwind is not available, try installing libunwind-devel])
+			])
+		])
+	])
+])
@@ -26,12 +26,14 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [
 	ZFS_AC_CONFIG_USER_AIO_H
 	ZFS_AC_CONFIG_USER_CLOCK_GETTIME
 	ZFS_AC_CONFIG_USER_PAM
+	ZFS_AC_CONFIG_USER_BACKTRACE
+	ZFS_AC_CONFIG_USER_LIBUNWIND
 	ZFS_AC_CONFIG_USER_RUNSTATEDIR
 	ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS
 	ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV
 	ZFS_AC_CONFIG_USER_ZFSEXEC

-	AC_CHECK_FUNCS([issetugid mlockall strlcat strlcpy])
+	AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid])

 	AC_SUBST(RM)
 ])
@@ -1 +1,2 @@
 /zfs
+/zpool
@@ -1,5 +1,9 @@
-nodist_bashcompletion_DATA  = %D%/zfs
-SUBSTFILES                 += $(nodist_bashcompletion_DATA)
+nodist_bashcompletion_DATA  = %D%/zfs %D%/zpool
+COMPLETION_FILES            = %D%/zfs
+SUBSTFILES                 += $(COMPLETION_FILES)

-SHELLCHECKSCRIPTS   += $(nodist_bashcompletion_DATA)
-$(call SHELLCHECK_OPTS,$(nodist_bashcompletion_DATA)): SHELLCHECK_SHELL = bash
+SHELLCHECKSCRIPTS   += $(COMPLETION_FILES)
+$(call SHELLCHECK_OPTS,$(COMPLETION_FILES)): SHELLCHECK_SHELL = bash
+
+%D%/zpool: %D%/zfs
+	$(LN_S) zfs $@
@@ -138,7 +138,8 @@ typedef int enum_t;
 #define	readdir64 readdir
 #define	dirent64 dirent
 #endif
-#define	P2ALIGN(x, align)		((x) & -(align))
+// Deprecated. Use P2ALIGN_TYPED instead.
+// #define	P2ALIGN(x, align)		((x) & -(align))
 #define	P2CROSS(x, y, align)	(((x) ^ (y)) > (align) - 1)
 #define	P2ROUNDUP(x, align)		((((x) - 1) | ((align) - 1)) + 1)
 #define	P2PHASE(x, align)		((x) & ((align) - 1))
@@ -31,13 +31,14 @@

 #include_next <sys/sdt.h>
 #ifdef KDTRACE_HOOKS
-/* CSTYLED */
+/* BEGIN CSTYLED */
 SDT_PROBE_DECLARE(sdt, , , set__error);

-#define	SET_ERROR(err) \
-	((sdt_sdt___set__error->id ? \
-	(*sdt_probe_func)(sdt_sdt___set__error->id, \
-	    (uintptr_t)err, 0, 0, 0, 0) : 0), err)
+#define	SET_ERROR(err)	({ 					\
+	SDT_PROBE1(sdt, , , set__error, (uintptr_t)err);	\
+	err;							\
+})
+/* END CSTYLED */
 #else
 #define	SET_ERROR(err) (err)
 #endif
@@ -191,7 +191,8 @@ extern unsigned char bcd_to_byte[256];
 * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align)
 * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align)
 */
-#define	P2ALIGN(x, align)		((x) & -(align))
+// Deprecated. Use P2ALIGN_TYPED instead.
+// #define	P2ALIGN(x, align)		((x) & -(align))

 /*
 * return x % (mod) align
@@ -57,6 +57,11 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
 #endif

 /*
+ * 6.11 API
+ * Setting the flush flags directly is no longer possible; flush flags are set
+ * on the queue_limits structure and passed to blk_alloc_disk(). In this case
+ * we remove this function entirely.
+ *
 * 4.7 API,
 * The blk_queue_write_cache() interface has replaced blk_queue_flush()
 * interface.  However, the new interface is GPL-only thus we implement
@@ -68,39 +73,43 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
 * new one is GPL-only.   Thus if the GPL-only version is detected we
 * implement our own trivial helper.
 */
+#if !defined(HAVE_BLK_ALLOC_DISK_2ARG) || \
+	!defined(HAVE_BLKDEV_QUEUE_LIMITS_FEATURES)
 static inline void
-blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua)
+blk_queue_set_write_cache(struct request_queue *q, bool on)
 {
 #if defined(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY)
-	if (wc)
+	if (on) {
 		blk_queue_flag_set(QUEUE_FLAG_WC, q);
-	else
-		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
-	if (fua)
 		blk_queue_flag_set(QUEUE_FLAG_FUA, q);
-	else
+	} else {
+		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
 		blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
+	}
 #elif defined(HAVE_BLK_QUEUE_WRITE_CACHE)
-	blk_queue_write_cache(q, wc, fua);
+	blk_queue_write_cache(q, on, on);
 #elif defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY)
-	if (wc)
-		q->flush_flags |= REQ_FLUSH;
-	if (fua)
-		q->flush_flags |= REQ_FUA;
+	if (on)
+		q->flush_flags |= REQ_FLUSH | REQ_FUA;
+	else
+		q->flush_flags &= ~(REQ_FLUSH | REQ_FUA);
 #elif defined(HAVE_BLK_QUEUE_FLUSH)
-	blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? REQ_FUA : 0));
+	blk_queue_flush(q, on ? (REQ_FLUSH | REQ_FUA) : 0);
 #else
 #error "Unsupported kernel"
 #endif
 }
+#endif /* !HAVE_BLK_ALLOC_DISK_2ARG || !HAVE_BLKDEV_QUEUE_LIMITS_FEATURES */

 static inline void
 blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages)
 {
 #if !defined(HAVE_BLK_QUEUE_UPDATE_READAHEAD) && \
 	!defined(HAVE_DISK_UPDATE_READAHEAD)
-#ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC
+#if defined(HAVE_BLK_QUEUE_BDI_DYNAMIC)
 	q->backing_dev_info->ra_pages = ra_pages;
+#elif defined(HAVE_BLK_QUEUE_DISK_BDI)
+	q->disk->bdi->ra_pages = ra_pages;
 #else
 	q->backing_dev_info.ra_pages = ra_pages;
 #endif
@@ -21,16 +21,23 @@

 /*
 * Copyright (c) 2023, 2024, Klara Inc.
+ * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 */

 #ifndef _ZFS_MM_COMPAT_H
 #define	_ZFS_MM_COMPAT_H

 #include <linux/mm.h>
+#include <linux/pagemap.h>

 /* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
 #ifndef HAVE_MM_PAGE_SIZE
 #define	page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
 #endif

+/* 6.11 removed page_mapping(). A simple wrapper around folio_mapping() works */
+#ifndef HAVE_MM_PAGE_MAPPING
+#define	page_mapping(p) folio_mapping(page_folio(p))
+#endif
+
 #endif /* _ZFS_MM_COMPAT_H */
@@ -192,22 +192,25 @@ extern void spl_kmem_reap(void);
 extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache);
 extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache);

+#ifndef	SPL_KMEM_CACHE_IMPLEMENTING
+/*
+ * Macros for the kmem_cache_* API expected by ZFS and SPL clients. We don't
+ * define them inside spl-kmem-cache.c, as that uses the kernel's incompatible
+ * kmem_cache_* facilities to implement ours.
+ */
+
+/* Avoid conflicts with kernel names that might be implemented as macros. */
+#undef	kmem_cache_alloc
+
 #define	kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \
    spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl)
 #define	kmem_cache_set_move(skc, move)	spl_kmem_cache_set_move(skc, move)
 #define	kmem_cache_destroy(skc)		spl_kmem_cache_destroy(skc)
-/*
- * This is necessary to be compatible with other kernel modules
- * or in-tree filesystem that may define kmem_cache_alloc,
- * like bcachefs does it now.
- */
-#ifdef kmem_cache_alloc
-#undef kmem_cache_alloc
-#endif
 #define	kmem_cache_alloc(skc, flags)	spl_kmem_cache_alloc(skc, flags)
 #define	kmem_cache_free(skc, obj)	spl_kmem_cache_free(skc, obj)
 #define	kmem_cache_reap_now(skc)	spl_kmem_cache_reap_now(skc)
 #define	kmem_reap()			spl_kmem_reap()
+#endif

 /*
 * The following functions are only available for internal use.
@@ -159,7 +159,8 @@ makedev(unsigned int major, unsigned int minor)
 /*
 * Compatibility macros/typedefs needed for Solaris -> Linux port
 */
-#define	P2ALIGN(x, align)	((x) & -(align))
+// Deprecated. Use P2ALIGN_TYPED instead.
+// #define	P2ALIGN(x, align)	((x) & -(align))
 #define	P2CROSS(x, y, align)	(((x) ^ (y)) > (align) - 1)
 #define	P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)
 #define	P2PHASE(x, align)	((x) & ((align) - 1))
@@ -38,7 +38,9 @@ typedef unsigned long		ulong_t;
 typedef unsigned long long	u_longlong_t;
 typedef long long		longlong_t;

+#ifndef HAVE_KERNEL_INTPTR_T
 typedef long			intptr_t;
+#endif
 typedef unsigned long long	rlim64_t;

 typedef struct task_struct	kthread_t;
@@ -80,7 +80,7 @@
 		snprintf(__get_str(msg), TRACE_DBUF_MSG_MAX,		\
 		    DBUF_TP_PRINTK_FMT, DBUF_TP_PRINTK_ARGS);		\
 	} else {							\
-		__assign_str(os_spa, "NULL")				\
+		__assign_str(os_spa, "NULL");				\
 		__entry->ds_object = 0;					\
 		__entry->db_object = 0;					\
 		__entry->db_level  = 0;					\
@@ -173,6 +173,7 @@ typedef struct dsl_scan {
 	dsl_scan_phys_t scn_phys;	/* on disk representation of scan */
 	dsl_scan_phys_t scn_phys_cached;
 	avl_tree_t scn_queue;		/* queue of datasets to scan */
+	kmutex_t scn_queue_lock;	/* serializes scn_queue inserts */
 	uint64_t scn_queues_pending;	/* outstanding data to issue */
 	/* members needed for syncing error scrub status to disk */
 	dsl_errorscrub_phys_t errorscrub_phys;
@@ -1175,8 +1175,8 @@ efi_use_whole_disk(int fd)
 	 * (for performance reasons). The alignment should match the
 	 * alignment used by the "zpool_label_disk" function.
 	 */
-	limit = P2ALIGN(efi_label->efi_last_lba - nblocks - EFI_MIN_RESV_SIZE,
-	    PARTITION_END_ALIGNMENT);
+	limit = P2ALIGN_TYPED(efi_label->efi_last_lba - nblocks -
+	    EFI_MIN_RESV_SIZE, PARTITION_END_ALIGNMENT, diskaddr_t);
 	if (data_start + data_size != limit || resv_start != limit)
 		sync_needed = B_TRUE;

@@ -1,6 +1,6 @@
 include $(srcdir)/%D%/include/Makefile.am

-libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS)
+libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) $(LIBUNWIND_CFLAGS)
 libspl_la_CFLAGS        = $(libspl_assert_la_CFLAGS)

 noinst_LTLIBRARIES += libspl_assert.la libspl.la
@@ -43,3 +43,9 @@ libspl_la_LIBADD = \
 	libspl_assert.la

 libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME)
+
+libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS)
+
+if BUILD_FREEBSD
+libspl_assert_la_LIBADD += -lpthread
+endif
@@ -22,8 +22,96 @@
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
+/*
+ * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
+ */

 #include <assert.h>
+#include <pthread.h>
+
+#if defined(__linux__)
+#include <errno.h>
+#include <sys/prctl.h>
+#ifdef HAVE_GETTID
+#define	libspl_gettid()		gettid()
+#else
+#include <sys/syscall.h>
+#define	libspl_gettid()		((pid_t)syscall(__NR_gettid))
+#endif
+#define	libspl_getprogname()	(program_invocation_short_name)
+#define	libspl_getthreadname(buf, len)	\
+	prctl(PR_GET_NAME, (unsigned long)(buf), 0, 0, 0)
+#elif defined(__FreeBSD__) || defined(__APPLE__)
+#if !defined(__APPLE__)
+#include <pthread_np.h>
+#define	libspl_gettid()		pthread_getthreadid_np()
+#endif
+#define	libspl_getprogname()	getprogname()
+/*
+ * Copy the calling thread's name into buf (at most len bytes). No trailing
+ * semicolon, so the macro behaves like a single expression at the call site,
+ * matching the Linux definition above.
+ */
+#define	libspl_getthreadname(buf, len)	\
+	pthread_getname_np(pthread_self(), buf, len)
+#endif
+
+#if defined(HAVE_LIBUNWIND)
+#define	UNW_LOCAL_ONLY
+#include <libunwind.h>
+
+/*
+ * Print the call stack to stderr, one frame per line, using libunwind's
+ * local unwinder. A frame whose symbol name (or, with HAVE_LIBUNWIND_ELF,
+ * object file name) cannot be looked up is shown as "???" instead of
+ * printing an uninitialised buffer.
+ */
+static inline void
+libspl_dump_backtrace(void)
+{
+	unw_context_t uc;
+	unw_cursor_t cp;
+	unw_word_t ip, off;
+	char funcname[128];
+	int err;
+#ifdef HAVE_LIBUNWIND_ELF
+	char objname[128];
+	unw_word_t objoff;
+#endif
+
+	fprintf(stderr, "Call trace:\n");
+	unw_getcontext(&uc);
+	unw_init_local(&cp, &uc);
+	while (unw_step(&cp) > 0) {
+		/* Start from known values in case a lookup below fails. */
+		ip = 0;
+		off = 0;
+		funcname[0] = '\0';
+		unw_get_reg(&cp, UNW_REG_IP, &ip);
+
+		/* -UNW_ENOMEM only means the name was truncated to fit. */
+		err = unw_get_proc_name(&cp, funcname, sizeof (funcname), &off);
+		if (err != 0 && err != -UNW_ENOMEM)
+			snprintf(funcname, sizeof (funcname), "???");
+#ifdef HAVE_LIBUNWIND_ELF
+		objoff = 0;
+		objname[0] = '\0';
+		err = unw_get_elf_filename(&cp, objname, sizeof (objname),
+		    &objoff);
+		if (err != 0 && err != -UNW_ENOMEM)
+			snprintf(objname, sizeof (objname), "???");
+		/* unw_word_t need not be unsigned long; cast to match %lx. */
+		fprintf(stderr, "  [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n",
+		    (unsigned long)ip, funcname, (unsigned long)off,
+		    objname, (unsigned long)objoff);
+#else
+		/* unw_word_t need not be unsigned long; cast to match %lx. */
+		fprintf(stderr, "  [0x%08lx] %s+0x%2lx\n",
+		    (unsigned long)ip, funcname, (unsigned long)off);
+#endif
+	}
+}
+#elif defined(HAVE_BACKTRACE)
+#include <execinfo.h>
+
+/*
+ * Print the call stack to stderr using backtrace(3). Up to 100 frames are
+ * captured. If backtrace_symbols() cannot allocate the symbol table it
+ * returns NULL; in that case the raw return addresses are printed instead,
+ * so the assertion path never dereferences a NULL pointer.
+ */
+static inline void
+libspl_dump_backtrace(void)
+{
+	void *btptrs[100];
+	size_t nptrs = backtrace(btptrs, 100);
+	char **bt = backtrace_symbols(btptrs, nptrs);
+
+	fprintf(stderr, "Call trace:\n");
+	for (size_t i = 0; i < nptrs; i++) {
+		if (bt != NULL)
+			fprintf(stderr, "  %s\n", bt[i]);
+		else
+			fprintf(stderr, "  [%p]\n", btptrs[i]);
+	}
+	/* free(NULL) is a no-op, so this is safe on the fallback path. */
+	free(bt);
+}
+#else
+#define	libspl_dump_backtrace()
+#endif
+
+#if defined(__APPLE__)
+/*
+ * Apple variant of libspl_gettid(): return the id reported by
+ * pthread_threadid_np(NULL, ...), or 0 if that call fails.
+ */
+static inline uint64_t
+libspl_gettid(void)
+{
+	uint64_t id = 0;
+
+	return (pthread_threadid_np(NULL, &id) == 0 ? id : 0);
+}
+#endif

 static boolean_t libspl_assert_ok = B_FALSE;

@@ -33,21 +121,41 @@ libspl_set_assert_ok(boolean_t val)
 	libspl_assert_ok = val;
 }

+static pthread_mutex_t assert_lock = PTHREAD_MUTEX_INITIALIZER;
+
 /* printf version of libspl_assert */
 void
 libspl_assertf(const char *file, const char *func, int line,
    const char *format, ...)
 {
+	pthread_mutex_lock(&assert_lock);
+
 	va_list args;
+	char tname[64];
+
+	libspl_getthreadname(tname, sizeof (tname));
+
+	fprintf(stderr, "ASSERT at %s:%d:%s()\n", file, line, func);

 	va_start(args, format);
 	vfprintf(stderr, format, args);
-	fprintf(stderr, "\n");
-	fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func);
 	va_end(args);

+	fprintf(stderr, "\n"
+	    "  PID: %-8u  COMM: %s\n"
+#if defined(__APPLE__)
+	    "  TID: %-8" PRIu64 "  NAME: %s\n",
+#else
+	    "  TID: %-8u  NAME: %s\n",
+#endif
+	    getpid(), libspl_getprogname(),
+	    libspl_gettid(), tname);
+
+	libspl_dump_backtrace();
+
 #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__)
 	if (libspl_assert_ok) {
+		pthread_mutex_unlock(&assert_lock);
 		return;
 	}
 #endif
@@ -57,6 +57,8 @@
 extern size_t spl_pagesize(void);
 #define	PAGESIZE	(spl_pagesize())

+#ifndef HAVE_EXECVPE
 extern int execvpe(const char *name, char * const argv[], char * const envp[]);
+#endif

 #endif
@@ -52,7 +52,8 @@
 /*
 * Compatibility macros/typedefs needed for Solaris -> Linux port
 */
-#define	P2ALIGN(x, align)	((x) & -(align))
+// Deprecated. Use P2ALIGN_TYPED instead.
+// #define	P2ALIGN(x, align)	((x) & -(align))
 #define	P2CROSS(x, y, align)	(((x) ^ (y)) > (align) - 1)
 #define	P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)
 #define	P2BOUNDARY(off, len, align) \
@@ -2170,7 +2170,8 @@ out:
 static int
 send_conclusion_record(int fd, zio_cksum_t *zc)
 {
-	dmu_replay_record_t drr = { 0 };
+	dmu_replay_record_t drr;
+	memset(&drr, 0, sizeof (dmu_replay_record_t));
 	drr.drr_type = DRR_END;
 	if (zc != NULL)
 		drr.drr_u.drr_end.drr_checksum = *zc;
@@ -2272,7 +2273,8 @@ send_prelim_records(zfs_handle_t *zhp, const char *from, int fd,
 	}

 	if (!dryrun) {
-		dmu_replay_record_t drr = { 0 };
+		dmu_replay_record_t drr;
+		memset(&drr, 0, sizeof (dmu_replay_record_t));
 		/* write first begin record */
 		drr.drr_type = DRR_BEGIN;
 		drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
@@ -38,7 +38,8 @@
 #define	ZFS_KMOD	"openzfs"
 #endif

-
+#ifndef HAVE_EXECVPE
+/* FreeBSD prior to 15 lacks execvpe */
 static int
 execvPe(const char *name, const char *path, char * const *argv,
    char * const *envp)
@@ -192,6 +193,7 @@ execvpe(const char *name, char * const argv[], char * const envp[])

 	return (execvPe(name, path, argv, envp));
 }
+#endif /* !HAVE_EXECVPE */

 static __thread char errbuf[ERRBUFLEN];

@@ -268,7 +268,8 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name)
 	if (start_block == MAXOFFSET_T)
 		start_block = NEW_START_BLOCK;
 	slice_size -= start_block;
-	slice_size = P2ALIGN(slice_size, PARTITION_END_ALIGNMENT);
+	slice_size = P2ALIGN_TYPED(slice_size, PARTITION_END_ALIGNMENT,
+	    uint64_t);

 	vtoc->efi_parts[0].p_start = start_block;
 	vtoc->efi_parts[0].p_size = slice_size;
@@ -121,20 +121,26 @@ Controls whether buffers present on special vdevs are eligible for caching
 into L2ARC.
 If set to 1, exclude dbufs on special vdevs from being cached to L2ARC.
 .
-.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq  int
+.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int
 Controls whether only MFU metadata and data are cached from ARC into L2ARC.
 This may be desired to avoid wasting space on L2ARC when reading/writing large
 amounts of data that are not expected to be accessed more than once.
 .Pp
-The default is off,
+The default is 0,
 meaning both MRU and MFU data and metadata are cached.
-When turning off this feature, some MRU buffers will still be present
-in ARC and eventually cached on L2ARC.
+When turning off this feature (setting it to 0), some MRU buffers will
+still be present in ARC and eventually cached on L2ARC.
 .No If Sy l2arc_noprefetch Ns = Ns Sy 0 ,
 some prefetched buffers will be cached to L2ARC, and those might later
 transition to MRU, in which case the
 .Sy l2arc_mru_asize No arcstat will not be Sy 0 .
 .Pp
+Setting it to 1 means to L2 cache only MFU data and metadata.
+.Pp
+Setting it to 2 means to L2 cache all metadata (MRU+MFU) but
+only MFU data (i.e., MRU data are not cached). This can be the right setting
+to cache as much metadata as possible even when having high data turnover.
+.Pp
 Regardless of
 .Sy l2arc_noprefetch ,
 some MFU buffers might be evicted from ARC,
@@ -2324,8 +2330,8 @@ Prioritize requeued I/O.
 .
 .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint
 Percentage of online CPUs which will run a worker thread for I/O.
-These workers are responsible for I/O work such as compression and
-checksum calculations.
+These workers are responsible for I/O work such as compression, encryption,
+checksum and parity calculations.
 Fractional number of CPUs will be rounded down.
 .Pp
 The default value of
@@ -2333,6 +2339,7 @@ The default value of
 was chosen to avoid using all CPUs which can result in
 latency issues and inconsistent application performance,
 especially when slower compression and/or checksumming is enabled.
+A newly set value only applies to pools imported or created afterwards.
 .
 .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint
 Number of worker threads per taskq.
@@ -2342,16 +2349,19 @@ while higher reduces lock contention.
 If
 .Sy 0 ,
 generate a system-dependent value close to 6 threads per taskq.
+A newly set value only applies to pools imported or created afterwards.
 .
 .It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
 Set the queue and thread configuration for the IO read queues.
 This is an advanced debugging parameter.
 Don't change this unless you understand what it does.
+Newly set values only apply to pools imported or created afterwards.
 .
 .It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp
 Set the queue and thread configuration for the IO write queues.
 This is an advanced debugging parameter.
 Don't change this unless you understand what it does.
+Newly set values only apply to pools imported or created afterwards.
 .
 .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 Do not create zvol device nodes.
@@ -154,7 +154,7 @@ defaults to the current kernel release.
 .
 .It Sy bootfs.rollback Ns Op Sy = Ns Ar snapshot-name
 Execute
-.Nm zfs Cm snapshot Fl Rf Ar boot-dataset Ns Sy @ Ns Ar snapshot-name
+.Nm zfs Cm rollback Fl Rf Ar boot-dataset Ns Sy @ Ns Ar snapshot-name
 before pivoting to the real root.
 .Ar snapshot-name
 defaults to the current kernel release.
@@ -16,8 +16,8 @@ src = @abs_srcdir@
 obj = @abs_builddir@
 else
 zfs_include = $(srctree)/include/zfs
-icp_include = $(srctree)/$(src)/icp/include
-zstd_include = $(srctree)/$(src)/zstd/include
+icp_include = $(src)/icp/include
+zstd_include = $(src)/zstd/include
 ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h
 endif

@@ -492,6 +492,8 @@ zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64))
 UBSAN_SANITIZE_zap_leaf.o := n
 UBSAN_SANITIZE_zap_micro.o := n
 UBSAN_SANITIZE_sa.o := n
+UBSAN_SANITIZE_zfs/zap_micro.o := n
+UBSAN_SANITIZE_zfs/sa.o := n

 # Suppress incorrect warnings from versions of objtool which are not
 # aware of x86 EVEX prefix instructions used for AVX512.
@@ -118,7 +118,15 @@ const sha256_ops_t sha256_shani_impl = {
 };
 #endif

-#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH > 6)
+#elif defined(__aarch64__) || defined(__arm__)
+extern void zfs_sha256_block_armv7(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_armv7_impl = {
+	.is_supported = sha2_is_supported,
+	.transform = zfs_sha256_block_armv7,
+	.name = "armv7"
+};
+
+#if __ARM_ARCH > 6
 static boolean_t sha256_have_neon(void)
 {
 	return (kfpu_allowed() && zfs_neon_available());
@@ -129,13 +137,6 @@ static boolean_t sha256_have_armv8ce(void)
 	return (kfpu_allowed() && zfs_sha256_available());
 }

-extern void zfs_sha256_block_armv7(uint32_t s[8], const void *, size_t);
-const sha256_ops_t sha256_armv7_impl = {
-	.is_supported = sha2_is_supported,
-	.transform = zfs_sha256_block_armv7,
-	.name = "armv7"
-};
-
 TF(zfs_sha256_block_neon, tf_sha256_neon);
 const sha256_ops_t sha256_neon_impl = {
 	.is_supported = sha256_have_neon,
@@ -149,6 +150,7 @@ const sha256_ops_t sha256_armv8_impl = {
 	.transform = tf_sha256_armv8ce,
 	.name = "armv8-ce"
 };
+#endif

 #elif defined(__PPC64__)
 static boolean_t sha256_have_isa207(void)
@@ -192,11 +194,13 @@ static const sha256_ops_t *const sha256_impls[] = {
 #if defined(__x86_64) && defined(HAVE_SSE4_1)
 	&sha256_shani_impl,
 #endif
-#if defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH > 6)
+#if defined(__aarch64__) || defined(__arm__)
 	&sha256_armv7_impl,
+#if __ARM_ARCH > 6
 	&sha256_neon_impl,
 	&sha256_armv8_impl,
 #endif
+#endif
 #if defined(__PPC64__)
 	&sha256_ppc_impl,
 	&sha256_power8_impl,
@@ -88,7 +88,7 @@ const sha512_ops_t sha512_avx2_impl = {
 };
 #endif

-#elif defined(__aarch64__)
+#elif defined(__aarch64__) || defined(__arm__)
 extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t);
 const sha512_ops_t sha512_armv7_impl = {
 	.is_supported = sha2_is_supported,
@@ -96,6 +96,7 @@ const sha512_ops_t sha512_armv7_impl = {
 	.name = "armv7"
 };

+#if defined(__aarch64__)
 static boolean_t sha512_have_armv8ce(void)
 {
 	return (kfpu_allowed() && zfs_sha512_available());
@@ -107,15 +108,9 @@ const sha512_ops_t sha512_armv8_impl = {
 	.transform = tf_sha512_armv8ce,
 	.name = "armv8-ce"
 };
+#endif

-#elif defined(__arm__) && __ARM_ARCH > 6
-extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t);
-const sha512_ops_t sha512_armv7_impl = {
-	.is_supported = sha2_is_supported,
-	.transform = zfs_sha512_block_armv7,
-	.name = "armv7"
-};
-
+#if defined(__arm__) && __ARM_ARCH > 6
 static boolean_t sha512_have_neon(void)
 {
 	return (kfpu_allowed() && zfs_neon_available());
@@ -127,6 +122,7 @@ const sha512_ops_t sha512_neon_impl = {
 	.transform = tf_sha512_neon,
 	.name = "neon"
 };
+#endif

 #elif defined(__PPC64__)
 TF(zfs_sha512_ppc, tf_sha512_ppc);
@@ -164,14 +160,15 @@ static const sha512_ops_t *const sha512_impls[] = {
 #if defined(__x86_64) && defined(HAVE_AVX2)
 	&sha512_avx2_impl,
 #endif
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__arm__)
 	&sha512_armv7_impl,
+#if defined(__aarch64__)
 	&sha512_armv8_impl,
 #endif
 #if defined(__arm__) && __ARM_ARCH > 6
-	&sha512_armv7_impl,
 	&sha512_neon_impl,
 #endif
+#endif
 #if defined(__PPC64__)
 	&sha512_ppc_impl,
 	&sha512_power8_impl,
@@ -21,8 +21,11 @@

 #if defined(__arm__)

-#define	__ARM_ARCH__      7
-#define	__ARM_MAX_ARCH__  7
+#ifndef __ARM_ARCH
+# define __ARM_ARCH__	7
+#else
+# define __ARM_ARCH__	__ARM_ARCH
+#endif

 #if defined(__thumb2__)
 .syntax unified
@@ -1834,6 +1837,7 @@ zfs_sha256_block_armv7:
 #endif
 .size	zfs_sha256_block_armv7,.-zfs_sha256_block_armv7

+#if __ARM_ARCH__ >= 7
 .arch	armv7-a
 .fpu	neon

@@ -2766,4 +2770,5 @@ zfs_sha256_block_armv8:
 	bx	lr		@ bx lr
 .size	zfs_sha256_block_armv8,.-zfs_sha256_block_armv8

-#endif
+#endif // #if __ARM_ARCH__ >= 7
+#endif // #if defined(__arm__)
@@ -21,8 +21,11 @@

 #if defined(__arm__)

-#define	__ARM_ARCH__      7
-#define	__ARM_MAX_ARCH__  7
+#ifndef __ARM_ARCH
+# define __ARM_ARCH__	7
+#else
+# define __ARM_ARCH__	__ARM_ARCH
+#endif

 #ifndef __KERNEL__
 # define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
@@ -490,6 +493,7 @@ zfs_sha512_block_armv7:
 #endif
 .size	zfs_sha512_block_armv7,.-zfs_sha512_block_armv7

+#if __ARM_ARCH__ >= 7
 .arch	armv7-a
 .fpu	neon

@@ -1819,4 +1823,5 @@ zfs_sha512_block_neon:
 	VFP_ABI_POP
 	bx	lr				@ .word	0xe12fff1e
 .size	zfs_sha512_block_neon,.-zfs_sha512_block_neon
-#endif
+#endif // #if __ARM_ARCH__ >= 7
+#endif // #if defined(__arm__)
@@ -832,12 +832,14 @@ aes_encrypt_atomic(crypto_mechanism_t *mechanism,
    crypto_key_t *key, crypto_data_t *plaintext, crypto_data_t *ciphertext,
    crypto_spi_ctx_template_t template)
 {
-	aes_ctx_t aes_ctx = {{{{0}}}};
+	aes_ctx_t aes_ctx;
 	off_t saved_offset;
 	size_t saved_length;
 	size_t length_needed;
 	int ret;

+	memset(&aes_ctx, 0, sizeof (aes_ctx_t));
+
 	ASSERT(ciphertext != NULL);

 	/*
@@ -956,12 +958,14 @@ aes_decrypt_atomic(crypto_mechanism_t *mechanism,
    crypto_key_t *key, crypto_data_t *ciphertext, crypto_data_t *plaintext,
    crypto_spi_ctx_template_t template)
 {
-	aes_ctx_t aes_ctx = {{{{0}}}};
+	aes_ctx_t aes_ctx;
 	off_t saved_offset;
 	size_t saved_length;
 	size_t length_needed;
 	int ret;

+	memset(&aes_ctx, 0, sizeof (aes_ctx_t));
+
 	ASSERT(plaintext != NULL);

 	/*
@@ -457,7 +457,7 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
 	ZFS_LOG(1, "Reading config from %s...", pp->name);

 	psize = pp->mediasize;
-	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
+	psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t);

 	size = sizeof (*vdev_lists[0]) + pp->sectorsize -
 	    ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
@@ -543,6 +543,7 @@ zfs_rmnode(znode_t *zp)
 	dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);

 	zfs_znode_delete(zp, tx);
+	zfs_znode_free(zp);

 	dmu_tx_commit(tx);

@@ -1169,10 +1169,25 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
 		return (error);
 	}
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+	error = zfs_link_create(dzp, name, zp, tx, ZNEW);
+	if (error != 0) {
+		/*
+		 * Since we failed to add the directory entry for it,
+		 * delete the newly created dnode.
+		 */
+		zfs_znode_delete(zp, tx);
+		VOP_UNLOCK1(ZTOV(zp));
+		zrele(zp);
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_commit(tx);
+		getnewvnode_drop_reserve();
+		goto out;
+	}
+
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);

-	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
 	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
 	    vsecp, acl_ids.z_fuidp, vap);
@@ -1520,13 +1535,19 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

-	if (fuid_dirtied)
-		zfs_fuid_sync(zfsvfs, tx);
-
 	/*
 	 * Now put new name in parent dir.
 	 */
-	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
+	error = zfs_link_create(dzp, dirname, zp, tx, ZNEW);
+	if (error != 0) {
+		zfs_znode_delete(zp, tx);
+		VOP_UNLOCK1(ZTOV(zp));
+		zrele(zp);
+		goto out;
+	}
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);

 	*zpp = zp;

@@ -1534,6 +1555,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
 	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
 	    acl_ids.z_fuidp, vap);

+out:
 	zfs_acl_ids_free(&acl_ids);

 	dmu_tx_commit(tx);
@@ -1544,7 +1566,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
 		zil_commit(zilog, 0);

 	zfs_exit(zfsvfs, FTAG);
-	return (0);
+	return (error);
 }

 #if	__FreeBSD_version < 1300124
@@ -3578,10 +3600,14 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
 	/*
 	 * Insert the new object into the directory.
 	 */
-	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
-
-	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
-	*zpp = zp;
+	error = zfs_link_create(dzp, name, zp, tx, ZNEW);
+	if (error != 0) {
+		zfs_znode_delete(zp, tx);
+		VOP_UNLOCK1(ZTOV(zp));
+		zrele(zp);
+	} else {
+		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+	}

 	zfs_acl_ids_free(&acl_ids);

@@ -3589,8 +3615,12 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,

 	getnewvnode_drop_reserve();

-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+	if (error == 0) {
+		*zpp = zp;
+
+		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+			zil_commit(zilog, 0);
+	}

 	zfs_exit(zfsvfs, FTAG);
 	return (error);
@@ -6238,7 +6268,6 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
 	struct vnode *invp = ap->a_invp;
 	struct vnode *outvp = ap->a_outvp;
 	struct mount *mp;
-	struct uio io;
 	int error;
 	uint64_t len = *ap->a_lenp;

@@ -6286,12 +6315,6 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
 		goto out_locked;
 #endif

-	io.uio_offset = *ap->a_outoffp;
-	io.uio_resid = *ap->a_lenp;
-	error = vn_rlimit_fsize(outvp, &io, ap->a_fsizetd);
-	if (error != 0)
-		goto out_locked;
-
 	error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
 	    ap->a_outoffp, &len, ap->a_outcred);
 	if (error == EXDEV || error == EAGAIN || error == EINVAL ||
@@ -1234,7 +1234,6 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
 	VERIFY0(dmu_object_free(os, obj, tx));
 	zfs_znode_dmu_fini(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
-	zfs_znode_free(zp);
 }

 void
@@ -21,6 +21,8 @@
 *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 */

+#define	SPL_KMEM_CACHE_IMPLEMENTING
+
 #include <linux/percpu_compat.h>
 #include <sys/kmem.h>
 #include <sys/kmem_cache.h>
@@ -33,16 +35,6 @@
 #include <linux/swap.h>
 #include <linux/prefetch.h>

-/*
- * Within the scope of spl-kmem.c file the kmem_cache_* definitions
- * are removed to allow access to the real Linux slab allocator.
- */
-#undef kmem_cache_destroy
-#undef kmem_cache_create
-#undef kmem_cache_alloc
-#undef kmem_cache_free
-
-
 /*
 * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
 * with smp_mb__{before,after}_atomic() because they were redundant. This is
@@ -22,6 +22,9 @@
 *
 *  Solaris Porting Layer (SPL) Proc Implementation.
 */
+/*
+ * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
+ */

 #include <sys/systeminfo.h>
 #include <sys/kstat.h>
@@ -43,6 +46,12 @@ typedef struct ctl_table __no_const spl_ctl_table;
 typedef struct ctl_table spl_ctl_table;
 #endif

+#ifdef HAVE_PROC_HANDLER_CTL_TABLE_CONST
+#define	CONST_CTL_TABLE		const struct ctl_table
+#else
+#define	CONST_CTL_TABLE		struct ctl_table
+#endif
+
 static unsigned long table_min = 0;
 static unsigned long table_max = ~0;

@@ -60,7 +69,7 @@ struct proc_dir_entry *proc_spl_kstat = NULL;

 #ifdef DEBUG_KMEM
 static int
-proc_domemused(struct ctl_table *table, int write,
+proc_domemused(CONST_CTL_TABLE *table, int write,
    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int rc = 0;
@@ -88,7 +97,7 @@ proc_domemused(struct ctl_table *table, int write,
 #endif /* DEBUG_KMEM */

 static int
-proc_doslab(struct ctl_table *table, int write,
+proc_doslab(CONST_CTL_TABLE *table, int write,
    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int rc = 0;
@@ -135,7 +144,7 @@ proc_doslab(struct ctl_table *table, int write,
 }

 static int
-proc_dohostid(struct ctl_table *table, int write,
+proc_dohostid(CONST_CTL_TABLE *table, int write,
    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char *end, str[32];
@@ -688,6 +697,37 @@ static void spl_proc_cleanup(void)
 	}
 }

+#ifndef HAVE_REGISTER_SYSCTL_TABLE
+
+/*
+ * Traditionally, struct ctl_table arrays have been terminated by an "empty"
+ * sentinel element (specifically, one with .procname == NULL).
+ *
+ * Linux 6.6 began migrating away from this, adding register_sysctl_sz() so
+ * that callers could provide the size directly, and redefining
+ * register_sysctl() to just call register_sysctl_sz() with the array size. It
+ * retained support for the terminating element so that existing callers would
+ * continue to work.
+ *
+ * Linux 6.11 removed support for the terminating element, instead interpreting
+ * it as a real malformed element, and rejecting it.
+ *
+ * In order to continue supporting older kernels, we retain the terminating
+ * sentinel element for our sysctl tables, but instead detect availability of
+ * register_sysctl_sz(). If it exists, we pass it the array size -1, stopping
+ * the kernel from trying to process the terminator. For pre-6.6 kernels that
+ * don't have register_sysctl_sz(), we just use register_sysctl(), which can
+ * handle the terminating element as it always has.
+ */
+#ifdef HAVE_REGISTER_SYSCTL_SZ
+#define	spl_proc_register_sysctl(p, t)	\
+	register_sysctl_sz(p, t, ARRAY_SIZE(t)-1)
+#else
+#define	spl_proc_register_sysctl(p, t)	\
+	register_sysctl(p, t)
+#endif
+#endif
+
 int
 spl_proc_init(void)
 {
@@ -698,16 +738,17 @@ spl_proc_init(void)
 	if (spl_header == NULL)
 		return (-EUNATCH);
 #else
-	spl_header = register_sysctl("kernel/spl", spl_table);
+	spl_header = spl_proc_register_sysctl("kernel/spl", spl_table);
 	if (spl_header == NULL)
 		return (-EUNATCH);

-	spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table);
+	spl_kmem = spl_proc_register_sysctl("kernel/spl/kmem", spl_kmem_table);
 	if (spl_kmem == NULL) {
 		rc = -EUNATCH;
 		goto out;
 	}
-	spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table);
+	spl_kstat = spl_proc_register_sysctl("kernel/spl/kstat",
+	    spl_kstat_table);
 	if (spl_kstat == NULL) {
 		rc = -EUNATCH;
 		goto out;
@@ -1015,10 +1015,50 @@ abd_cache_reap_now(void)
 }

 #if defined(_KERNEL)
+
 /*
- * Yield the next page struct and data offset and size within it, without
+ * This is abd_iter_page(), the function underneath abd_iterate_page_func().
+ * It yields the next page struct and data offset and size within it, without
 * mapping it into the address space.
 */
+
+/*
+ * "Compound pages" are a group of pages that can be referenced from a single
+ * struct page *. It's organised as a "head" page, followed by a series of
+ * "tail" pages.
+ *
+ * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we
+ * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a
+ * great many of the IO buffers we get are going to be of this type.
+ *
+ * The tail pages are just regular PAGESIZE pages, and can be safely used
+ * as-is. However, the head page has length covering itself and all the tail
+ * pages. If the ABD chunk spans multiple pages, then we can use the head page
+ * and a >PAGESIZE length, which is far more efficient.
+ *
+ * Before kernel 4.5 however, compound page heads were refcounted separately
+ * from tail pages, such that moving back to the head page would require us to
+ * take a reference to it and release it once we're completely finished with
+ * it. In practice, that means when our caller is done with the ABD, which we
+ * have no insight into from here. Rather than contort this API to track head
+ * page references on such ancient kernels, we disable this special compound
+ * page handling before 4.5, instead just treating each page within it as a
+ * regular PAGESIZE page (which it is). This is slightly less efficient, but
+ * makes everything far simpler.
+ *
+ * The below test sets/clears ABD_ITER_COMPOUND_PAGES to enable/disable the
+ * special handling, and also defines the ABD_ITER_PAGE_SIZE(page) macro to
+ * understand compound pages, or not, as required.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
+#define	ABD_ITER_COMPOUND_PAGES		1
+#define	ABD_ITER_PAGE_SIZE(page)	\
+	(PageCompound(page) ? page_size(page) : PAGESIZE)
+#else
+#undef ABD_ITER_COMPOUND_PAGES
+#define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE)
+#endif
+
 void
 abd_iter_page(struct abd_iter *aiter)
 {
@@ -1032,6 +1072,12 @@ abd_iter_page(struct abd_iter *aiter)
 	struct page *page;
 	size_t doff, dsize;

+	/*
+	 * Find the page, and the start of the data within it. This is computed
+	 * differently for linear and scatter ABDs; linear is referenced by
+	 * virtual memory location, while scatter is referenced by page
+	 * pointer.
+	 */
 	if (abd_is_linear(aiter->iter_abd)) {
 		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);

@@ -1044,57 +1090,24 @@ abd_iter_page(struct abd_iter *aiter)

 		/* offset of address within the page */
 		doff = offset_in_page(paddr);
-
-		/* total data remaining in abd from this position */
-		dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
 	} else {
 		ASSERT(!abd_is_gang(aiter->iter_abd));

 		/* current scatter page */
-		page = sg_page(aiter->iter_sg);
+		page = nth_page(sg_page(aiter->iter_sg),
+		    aiter->iter_offset >> PAGE_SHIFT);

 		/* position within page */
-		doff = aiter->iter_offset;
-
-		/* remaining data in scatterlist */
-		dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
-		    aiter->iter_abd->abd_size - aiter->iter_pos);
+		doff = aiter->iter_offset & (PAGESIZE - 1);
 	}
-	ASSERT(page);

-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
+#ifdef ABD_ITER_COMPOUND_PAGES
 	if (PageTail(page)) {
 		/*
-		 * This page is part of a "compound page", which is a group of
-		 * pages that can be referenced from a single struct page *.
-		 * Its organised as a "head" page, followed by a series of
-		 * "tail" pages.
-		 *
-		 * In OpenZFS, compound pages are allocated using the
-		 * __GFP_COMP flag, which we get from scatter ABDs and SPL
-		 * vmalloc slabs (ie >16K allocations). So a great many of the
-		 * IO buffers we get are going to be of this type.
-		 *
-		 * The tail pages are just regular PAGE_SIZE pages, and can be
-		 * safely used as-is. However, the head page has length
-		 * covering itself and all the tail pages. If this ABD chunk
-		 * spans multiple pages, then we can use the head page and a
-		 * >PAGE_SIZE length, which is far more efficient.
-		 *
-		 * To do this, we need to adjust the offset to be counted from
-		 * the head page. struct page for compound pages are stored
-		 * contiguously, so we can just adjust by a simple offset.
-		 *
-		 * Before kernel 4.5, compound page heads were refcounted
-		 * separately, such that moving back to the head page would
-		 * require us to take a reference to it and releasing it once
-		 * we're completely finished with it. In practice, that means
-		 * when our caller is done with the ABD, which we have no
-		 * insight into from here. Rather than contort this API to
-		 * track head page references on such ancient kernels, we just
-		 * compile this block out and use the tail pages directly. This
-		 * is slightly less efficient, but makes everything far
-		 * simpler.
+		 * If this is a compound tail page, move back to the head, and
+		 * adjust the offset to match. This may let us yield a much
+		 * larger amount of data from a single logical page, and so
+		 * leave our caller with fewer pages to process.
 		 */
 		struct page *head = compound_head(page);
 		doff += ((page - head) * PAGESIZE);
@@ -1102,12 +1115,27 @@ abd_iter_page(struct abd_iter *aiter)
 	}
 #endif

-	/* final page and position within it */
+	ASSERT(page);
+
+	/*
+	 * Compute the maximum amount of data we can take from this page. This
+	 * is the smaller of:
+	 * - the remaining space in the page
+	 * - the remaining space in this scatterlist entry (which may not cover
+	 *   the entire page)
+	 * - the remaining space in the abd (which may not cover the entire
+	 *   scatterlist entry)
+	 */
+	dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff,
+	    aiter->iter_abd->abd_size - aiter->iter_pos);
+	if (!abd_is_linear(aiter->iter_abd))
+		dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset);
+	ASSERT3U(dsize, >, 0);
+
+	/* final iterator outputs */
 	aiter->iter_page = page;
 	aiter->iter_page_doff = doff;
-
-	/* amount of data in the chunk, up to the end of the page */
-	aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
+	aiter->iter_page_dsize = dsize;
 }

 /*
@@ -150,7 +150,11 @@ vdev_bdev_mode(spa_mode_t smode)
+/*
+ * Return the size of 'bdev'.  When the kernel provides bdev_nr_bytes()
+ * (HAVE_BDEV_NR_BYTES) it is used; otherwise the size is read from the
+ * block device's inode with i_size_read().
+ */
 static uint64_t
 bdev_capacity(struct block_device *bdev)
 {
+#ifdef HAVE_BDEV_NR_BYTES
+	return (bdev_nr_bytes(bdev));
+#else
 	return (i_size_read(bdev->bd_inode));
+#endif
 }

 #if !defined(HAVE_BDEV_WHOLE)
@@ -209,7 +213,7 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
 		 * "reserved" EFI partition: in such cases return the device
 		 * usable capacity.
 		 */
-		available = i_size_read(bdev_whole(bdev)->bd_inode) -
+		available = bdev_capacity(bdev_whole(bdev)) -
 		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
 		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
 		psize = MAX(available, bdev_capacity(bdev));
@@ -916,12 +920,12 @@ vdev_disk_io_rw(zio_t *zio)
 	/*
 	 * Accessing outside the block device is never allowed.
 	 */
-	if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
+	if (zio->io_offset + zio->io_size > bdev_capacity(bdev)) {
 		vdev_dbgmsg(zio->io_vd,
 		    "Illegal access %llu size %llu, device size %llu",
 		    (u_longlong_t)zio->io_offset,
 		    (u_longlong_t)zio->io_size,
-		    (u_longlong_t)i_size_read(bdev->bd_inode));
+		    (u_longlong_t)bdev_capacity(bdev));
 		return (SET_ERROR(EIO));
 	}

@@ -1116,12 +1120,12 @@ vdev_classic_physio(zio_t *zio)
 	/*
 	 * Accessing outside the block device is never allowed.
 	 */
-	if (io_offset + io_size > bdev->bd_inode->i_size) {
+	if (io_offset + io_size > bdev_capacity(bdev)) {
 		vdev_dbgmsg(zio->io_vd,
 		    "Illegal access %llu size %llu, device size %llu",
 		    (u_longlong_t)io_offset,
 		    (u_longlong_t)io_size,
-		    (u_longlong_t)i_size_read(bdev->bd_inode));
+		    (u_longlong_t)bdev_capacity(bdev));
 		return (SET_ERROR(EIO));
 	}

@@ -110,8 +110,17 @@ zfs_kobj_fini(zfs_mod_kobj_t *zkobj)
 	}

 	/* kobject_put() will call zfs_kobj_release() to release memory */
-	kobject_del(&zkobj->zko_kobj);
-	kobject_put(&zkobj->zko_kobj);
+	/*
+	 * Special case note:
+	 *
+	 * We have to check for 'zkobj->zko_kobj.name != NULL' as
+	 * a workaround for #16249 which was added to zfs-2.2.4
+	 * and fixed (with this change) in zfs-2.2.5.
+	 */
+	if (zkobj->zko_kobj.name != NULL) {
+		kobject_del(&zkobj->zko_kobj);
+		kobject_put(&zkobj->zko_kobj);
+	}
 }

 static void
@@ -69,6 +69,7 @@
 #include <sys/zpl.h>
 #include <sys/zil.h>
 #include <sys/sa_impl.h>
+#include <linux/mm_compat.h>

 /*
 * Programming rules.
@@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 */

 #include <sys/dataset_kstats.h>
@@ -41,6 +42,7 @@

 #include <linux/blkdev_compat.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/workqueue.h>

 #ifdef HAVE_BLK_MQ
 #include <linux/blk-mq.h>
@@ -384,7 +386,7 @@ zvol_discard(zv_request_t *zvr)
 	 */
 	if (!io_is_secure_erase(bio, rq)) {
 		start = P2ROUNDUP(start, zv->zv_volblocksize);
-		end = P2ALIGN(end, zv->zv_volblocksize);
+		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
 		size = end - start;
 	}

@@ -729,7 +731,7 @@ retry:
 #endif
 	if (zv == NULL) {
 		rw_exit(&zvol_state_lock);
-		return (SET_ERROR(-ENXIO));
+		return (-SET_ERROR(ENXIO));
 	}

 	mutex_enter(&zv->zv_state_lock);
@@ -793,10 +795,10 @@ retry:

 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
 				schedule();
-				return (SET_ERROR(-ERESTARTSYS));
+				return (-SET_ERROR(ERESTARTSYS));
 #else
 				if ((gethrtime() - start) > timeout)
-					return (SET_ERROR(-ERESTARTSYS));
+					return (-SET_ERROR(ERESTARTSYS));

 				schedule_timeout(MSEC_TO_TICK(10));
 				goto retry;
@@ -818,7 +820,7 @@ retry:
 			if (zv->zv_open_count == 0)
 				zvol_last_close(zv);

-			error = SET_ERROR(-EROFS);
+			error = -SET_ERROR(EROFS);
 		} else {
 			zv->zv_open_count++;
 		}
@@ -1073,8 +1075,159 @@ static const struct block_device_operations zvol_ops = {
 #endif
 };

+/*
+ * Since 6.9, Linux has been removing queue limit setters in favour of an
+ * initial queue_limits struct applied when the device is open. Since 6.11,
+ * queue_limits is being extended to allow more things to be applied when the
+ * device is open. Setters are also being removed for this.
+ *
+ * For OpenZFS, this means that depending on kernel version, some options may
+ * be set up before the device is open, and some applied to an open device
+ * (queue) after the fact.
+ *
+ * We manage this complexity by having our own limits struct,
+ * zvol_queue_limits_t, in which we carry any queue config that we're
+ * interested in setting. This structure is the same on all kernels.
+ *
+ * These limits are then applied to the queue at device open time by the most
+ * appropriate method for the kernel.
+ *
+ * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
+ * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
+ * struct queue_limits, and passes it in. Any fields added in later kernels are
+ * (obviously) not set up here.
+ *
+ * zvol_queue_limits_apply() is called on all kernel versions after the queue
+ * is created, and applies any remaining config. Before 6.9 that will be
+ * everything, via setter methods. After 6.9 that will be whatever couldn't be
+ * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
+ * will always be a no-op on the latest kernel we support).
+ */
+typedef struct zvol_queue_limits {
+	unsigned int	zql_max_hw_sectors;		/* 512-byte sectors */
+	unsigned short	zql_max_segments;		/* count per request */
+	unsigned int	zql_max_segment_size;		/* bytes */
+	unsigned int	zql_io_opt;			/* = volblocksize */
+	unsigned int	zql_physical_block_size;	/* = volblocksize */
+	unsigned int	zql_max_discard_sectors;	/* 512-byte sectors */
+	unsigned int	zql_discard_granularity;	/* = volblocksize */
+} zvol_queue_limits_t;
+
+/*
+ * Populate 'limits' with the request queue configuration for 'zv'.
+ *
+ * Every field of 'limits' is assigned, so the caller may pass an
+ * uninitialised struct.  zv->zv_volblocksize must already be set, since
+ * most of the limits are derived from it.  'use_blk_mq' selects whether the
+ * blk-mq segment sizing described below is applied.
+ */
+static void
+zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
+    boolean_t use_blk_mq)
+{
+	limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;
+
+	/*
+	 * Start with everything maxed out.  This is the final answer for
+	 * non-blk-mq zvols, for zvol_blk_mq_blocks_per_thread = 0, and for
+	 * builds without HAVE_BLK_MQ.  Assigning it unconditionally means
+	 * these fields are never left uninitialised, whatever the config.
+	 */
+	limits->zql_max_segments = UINT16_MAX;
+	limits->zql_max_segment_size = UINT_MAX;
+
+#ifdef HAVE_BLK_MQ
+	if (use_blk_mq && zvol_blk_mq_blocks_per_thread != 0) {
+		unsigned int chunks;
+
+		/*
+		 * IO requests can be really big (1MB).  When an IO request
+		 * comes in, it is passed off to zvol_read() or zvol_write()
+		 * in a new thread, where it is chunked up into 'volblocksize'
+		 * sized pieces and processed.  So for example, if the request
+		 * is a 1MB write and your volblocksize is 128k, one zvol_write
+		 * thread will take that request and sequentially do eight
+		 * 128k IOs.  This is due to the fact that the thread needs to
+		 * lock each volblocksize sized block.  So you might be
+		 * wondering: "instead of passing the whole 1MB request to one
+		 * thread, why not pass eight individual 128k chunks to eight
+		 * threads and process the whole write in parallel?"  The short
+		 * answer is that there's a sweet spot number of chunks that
+		 * balances the greater parallelism with the added overhead of
+		 * more threads. The sweet spot can be different depending on
+		 * if you have a read- or write-heavy workload.  Writes
+		 * typically want high chunk counts while reads typically want
+		 * lower ones.  On a test pool with 6 NVMe drives in a 3x
+		 * 2-disk mirror configuration, with volblocksize=8k, the sweet
+		 * spot for good sequential reads and writes was at 8 chunks.
+		 */
+
+		/*
+		 * Below we tell the kernel how big we want our requests
+		 * to be.  You would think that blk_queue_io_opt() would be
+		 * used to do this since it is used to "set optimal request
+		 * size for the queue", but that doesn't seem to do
+		 * anything - the kernel still gives you huge requests
+		 * with tons of little PAGE_SIZE segments contained within it.
+		 *
+		 * Knowing that the kernel will just give you PAGE_SIZE segments
+		 * no matter what, you can say "ok, I want PAGE_SIZE byte
+		 * segments, and I want 'N' of them per request", where N is
+		 * the correct number of segments for the volblocksize and
+		 * number of chunks you want.
+		 */
+		chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
+
+		limits->zql_max_segment_size = PAGE_SIZE;
+		limits->zql_max_segments =
+		    (zv->zv_volblocksize * chunks) / PAGE_SIZE;
+	}
+#else
+	/* No blk-mq support compiled in; the defaults above always apply. */
+	(void) use_blk_mq;
+#endif
+
+	limits->zql_io_opt = zv->zv_volblocksize;
+
+	limits->zql_physical_block_size = zv->zv_volblocksize;
+	limits->zql_max_discard_sectors =
+	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
+	limits->zql_discard_granularity = zv->zv_volblocksize;
+}
+
+#ifdef HAVE_BLK_ALLOC_DISK_2ARG
+/*
+ * Translate our own 'limits' into a Linux struct queue_limits ('qlimits').
+ *
+ * 'qlimits' is zeroed first, so any member not assigned below is left as 0.
+ * zql_max_discard_sectors is used for both max_discard_sectors and
+ * max_hw_discard_sectors.  When HAVE_BLKDEV_QUEUE_LIMITS_FEATURES is
+ * defined, the write cache, FUA and IO stat feature flags are set here too.
+ */
+static void
+zvol_queue_limits_convert(zvol_queue_limits_t *limits,
+    struct queue_limits *qlimits)
+{
+	memset(qlimits, 0, sizeof (struct queue_limits));
+	qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
+	qlimits->max_segments = limits->zql_max_segments;
+	qlimits->max_segment_size = limits->zql_max_segment_size;
+	qlimits->io_opt = limits->zql_io_opt;
+	qlimits->physical_block_size = limits->zql_physical_block_size;
+	qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
+	qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
+	qlimits->discard_granularity = limits->zql_discard_granularity;
+#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
+	qlimits->features =
+	    BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
+#endif
+}
+#endif
+
+/*
+ * Apply to an existing 'queue' whatever part of 'limits' could not be handed
+ * over through struct queue_limits at allocation time.
+ *
+ * Without HAVE_BLK_ALLOC_DISK_2ARG every limit is applied through its setter.
+ * Without HAVE_BLKDEV_QUEUE_LIMITS_FEATURES the write cache and IO stat
+ * settings are applied to the queue directly.  When both are defined this
+ * function does nothing.
+ */
+static void
+zvol_queue_limits_apply(zvol_queue_limits_t *limits,
+    struct request_queue *queue)
+{
+#ifndef HAVE_BLK_ALLOC_DISK_2ARG
+	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
+	blk_queue_max_segments(queue, limits->zql_max_segments);
+	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
+	blk_queue_io_opt(queue, limits->zql_io_opt);
+	blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
+	blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
+	blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
+#endif
+#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
+	blk_queue_set_write_cache(queue, B_TRUE);
+	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
+#endif
+}
+
 static int
-zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
+zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
 {
 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
 #if defined(HAVE_BLK_ALLOC_DISK)
@@ -1085,7 +1238,9 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
-	struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
+	struct queue_limits qlimits;
+	zvol_queue_limits_convert(limits, &qlimits);
+	struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
 	if (IS_ERR(disk)) {
 		zso->zvo_disk = NULL;
 		return (1);
@@ -1094,6 +1249,7 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
 	zso->zvo_disk = disk;
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
+
 #else
 	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
 	if (zso->zvo_queue == NULL)
@@ -1120,12 +1276,15 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)

 	zso->zvo_disk->queue = zso->zvo_queue;
 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
+
+	zvol_queue_limits_apply(limits, zso->zvo_queue);
+
 	return (0);

 }

 static int
-zvol_alloc_blk_mq(zvol_state_t *zv)
+zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
 {
 #ifdef HAVE_BLK_MQ
 	struct zvol_state_os *zso = zv->zv_zso;
@@ -1143,7 +1302,9 @@ zvol_alloc_blk_mq(zvol_state_t *zv)
 	zso->zvo_queue = zso->zvo_disk->queue;
 	zso->zvo_disk->minors = ZVOL_MINORS;
 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
-	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv);
+	struct queue_limits qlimits;
+	zvol_queue_limits_convert(limits, &qlimits);
+	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
 	if (IS_ERR(disk)) {
 		zso->zvo_disk = NULL;
 		blk_mq_free_tag_set(&zso->tag_set);
@@ -1169,9 +1330,11 @@ zvol_alloc_blk_mq(zvol_state_t *zv)

 	/* Our queue is now created, assign it to our disk */
 	zso->zvo_disk->queue = zso->zvo_queue;
+#endif

+	zvol_queue_limits_apply(limits, zso->zvo_queue);
 #endif
-#endif
+
 	return (0);
 }

@@ -1180,7 +1343,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv)
 * request queue and generic disk structures for the block device.
 */
 static zvol_state_t *
-zvol_alloc(dev_t dev, const char *name)
+zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
 {
 	zvol_state_t *zv;
 	struct zvol_state_os *zso;
@@ -1200,6 +1363,7 @@ zvol_alloc(dev_t dev, const char *name)
 	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
 	zv->zv_zso = zso;
 	zv->zv_volmode = volmode;
+	zv->zv_volblocksize = volblocksize;

 	list_link_init(&zv->zv_next);
 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -1208,6 +1372,9 @@ zvol_alloc(dev_t dev, const char *name)
 	zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
 #endif

+	zvol_queue_limits_t limits;
+	zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);
+
 	/*
 	 * The block layer has 3 interfaces for getting BIOs:
 	 *
@@ -1224,17 +1391,15 @@ zvol_alloc(dev_t dev, const char *name)
 	 *    disk and the queue separately. (5.13 kernel or older)
 	 */
 	if (zv->zv_zso->use_blk_mq) {
-		ret = zvol_alloc_blk_mq(zv);
+		ret = zvol_alloc_blk_mq(zv, &limits);
 		zso->zvo_disk->fops = &zvol_ops_blk_mq;
 	} else {
-		ret = zvol_alloc_non_blk_mq(zso);
+		ret = zvol_alloc_non_blk_mq(zso, &limits);
 		zso->zvo_disk->fops = &zvol_ops;
 	}
 	if (ret != 0)
 		goto out_kmem;

-	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
-
 	/* Limit read-ahead to a single page to prevent over-prefetching. */
 	blk_queue_set_read_ahead(zso->zvo_queue, 1);

@@ -1243,9 +1408,6 @@ zvol_alloc(dev_t dev, const char *name)
 		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
 	}

-	/* Enable /proc/diskstats */
-	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);
-
 	zso->zvo_queue->queuedata = zv;
 	zso->zvo_dev = dev;
 	zv->zv_open_count = 0;
@@ -1337,6 +1499,101 @@ zvol_wait_close(zvol_state_t *zv)
 {
 }

+struct add_disk_work {
+	struct delayed_work work;	/* item queued by zvol_os_add_disk() */
+	struct gendisk *disk;		/* disk to pass to add_disk() */
+	int error;			/* result stored by the handler */
+};
+
+/*
+ * Call add_disk() on 'disk'.  When HAVE_ADD_DISK_RET is defined its return
+ * value is passed back to the caller; otherwise 0 is returned.
+ */
+static int
+__zvol_os_add_disk(struct gendisk *disk)
+{
+#ifdef HAVE_ADD_DISK_RET
+	return (add_disk(disk));
+#else
+	/* No result is taken from add_disk() in this configuration. */
+	add_disk(disk);
+	return (0);
+#endif
+}
+
+#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
+/*
+ * Work handler: recover the enclosing struct add_disk_work from 'work',
+ * add its disk, and record the outcome in its 'error' member.
+ */
+static void
+zvol_os_add_disk_work(struct work_struct *work)
+{
+	struct add_disk_work *adw =
+	    container_of(work, struct add_disk_work, work.work);
+
+	adw->error = __zvol_os_add_disk(adw->disk);
+}
+#endif
+
+/*
+ * SPECIAL CASE:
+ *
+ * This function basically calls add_disk() from a workqueue.  You may be
+ * thinking: why not just call add_disk() directly?
+ *
+ * When you call add_disk(), the zvol appears to the world.  When this happens,
+ * the kernel calls disk_scan_partitions() on the zvol, which behaves
+ * differently on the 6.9+ kernels:
+ *
+ * - 6.8 and older kernels -
+ * disk_scan_partitions()
+ *	handle = bdev_open_by_dev(
+ *		zvol_open()
+ *	bdev_release(handle);
+ *		zvol_release()
+ *
+ *
+ * - 6.9+ kernels -
+ * disk_scan_partitions()
+ * 	file = bdev_file_open_by_dev()
+ *		zvol_open()
+ *	fput(file)
+ *	< wait for return to userspace >
+ *		zvol_release()
+ *
+ * The difference is that the bdev_release() from the 6.8 kernel is synchronous
+ * while the fput() from the 6.9 kernel is async.  Or more specifically it's
+ * async that has to wait until we return to userspace (since it adds the fput
+ * into the caller's work queue with the TWA_RESUME flag set).  This is not the
+ * behavior we want, since we want to do things like create+destroy a zvol
+ * within a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release
+ * the reference to the zvol while we're in the IOCTL, which can't wait until
+ * we return to userspace.
+ *
+ * We can get around this since fput() has a special codepath for when it's
+ * running in a kernel thread or interrupt.  In those cases, it just puts the
+ * fput into the system workqueue, which we can force to run with
+ * __flush_workqueue().  That is why we call add_disk() from a workqueue - so
+ * it runs from a kernel thread and "tricks" the fput() codepaths.
+ *
+ * Note that __flush_workqueue() is slowly getting deprecated.  This may be ok
+ * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
+ * fput) to happen, which it eventually, naturally, will from the system_wq
+ * without us explicitly calling __flush_workqueue().
+ *
+ * Returns the error recorded by the work item on 6.9+ kernels, or the result
+ * of __zvol_os_add_disk() on older kernels.
+ */
+static int
+zvol_os_add_disk(struct gendisk *disk)
+{
+#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)	/* 6.9+ kernel */
+	struct add_disk_work add_disk_work;
+
+	INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
+	add_disk_work.disk = disk;
+	add_disk_work.error = 0;
+
+	/* Use *_delayed_work functions since they're not GPL'd */
+	schedule_delayed_work(&add_disk_work.work, 0);
+	flush_delayed_work(&add_disk_work.work);
+
+	__flush_workqueue(system_wq);
+	return (add_disk_work.error);
+#else	/* <= 6.8 kernel */
+	return (__zvol_os_add_disk(disk));
+#endif
+}
+
 /*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume.  Once this function returns the block
@@ -1393,7 +1650,8 @@ zvol_os_create_minor(const char *name)
 	if (error)
 		goto out_dmu_objset_disown;

-	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
+	zv = zvol_alloc(MKDEV(zvol_major, minor), name,
+	    doi->doi_data_block_size);
 	if (zv == NULL) {
 		error = SET_ERROR(EAGAIN);
 		goto out_dmu_objset_disown;
@@ -1403,84 +1661,11 @@ zvol_os_create_minor(const char *name)
 	if (dmu_objset_is_snapshot(os))
 		zv->zv_flags |= ZVOL_RDONLY;

-	zv->zv_volblocksize = doi->doi_data_block_size;
 	zv->zv_volsize = volsize;
 	zv->zv_objset = os;

 	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

-	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
-	    (DMU_MAX_ACCESS / 4) >> 9);
-
-	if (zv->zv_zso->use_blk_mq) {
-		/*
-		 * IO requests can be really big (1MB).  When an IO request
-		 * comes in, it is passed off to zvol_read() or zvol_write()
-		 * in a new thread, where it is chunked up into 'volblocksize'
-		 * sized pieces and processed.  So for example, if the request
-		 * is a 1MB write and your volblocksize is 128k, one zvol_write
-		 * thread will take that request and sequentially do ten 128k
-		 * IOs.  This is due to the fact that the thread needs to lock
-		 * each volblocksize sized block.  So you might be wondering:
-		 * "instead of passing the whole 1MB request to one thread,
-		 * why not pass ten individual 128k chunks to ten threads and
-		 * process the whole write in parallel?"  The short answer is
-		 * that there's a sweet spot number of chunks that balances
-		 * the greater parallelism with the added overhead of more
-		 * threads. The sweet spot can be different depending on if you
-		 * have a read or write  heavy workload.  Writes typically want
-		 * high chunk counts while reads typically want lower ones.  On
-		 * a test pool with 6 NVMe drives in a 3x 2-disk mirror
-		 * configuration, with volblocksize=8k, the sweet spot for good
-		 * sequential reads and writes was at 8 chunks.
-		 */
-
-		/*
-		 * Below we tell the kernel how big we want our requests
-		 * to be.  You would think that blk_queue_io_opt() would be
-		 * used to do this since it is used to "set optimal request
-		 * size for the queue", but that doesn't seem to do
-		 * anything - the kernel still gives you huge requests
-		 * with tons of little PAGE_SIZE segments contained within it.
-		 *
-		 * Knowing that the kernel will just give you PAGE_SIZE segments
-		 * no matter what, you can say "ok, I want PAGE_SIZE byte
-		 * segments, and I want 'N' of them per request", where N is
-		 * the correct number of segments for the volblocksize and
-		 * number of chunks you want.
-		 */
-#ifdef HAVE_BLK_MQ
-		if (zvol_blk_mq_blocks_per_thread != 0) {
-			unsigned int chunks;
-			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
-
-			blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
-			    PAGE_SIZE);
-			blk_queue_max_segments(zv->zv_zso->zvo_queue,
-			    (zv->zv_volblocksize * chunks) / PAGE_SIZE);
-		} else {
-			/*
-			 * Special case: zvol_blk_mq_blocks_per_thread = 0
-			 * Max everything out.
-			 */
-			blk_queue_max_segments(zv->zv_zso->zvo_queue,
-			    UINT16_MAX);
-			blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
-			    UINT_MAX);
-		}
-#endif
-	} else {
-		blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
-		blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
-	}
-
-	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
-	    zv->zv_volblocksize);
-	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
-	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
-	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
-	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
-	    zv->zv_volblocksize);
 #ifdef QUEUE_FLAG_DISCARD
 	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
 #endif
@@ -1541,11 +1726,7 @@ out_doi:
 		rw_enter(&zvol_state_lock, RW_WRITER);
 		zvol_insert(zv);
 		rw_exit(&zvol_state_lock);
-#ifdef HAVE_ADD_DISK_RET
-		error = add_disk(zv->zv_zso->zvo_disk);
-#else
-		add_disk(zv->zv_zso->zvo_disk);
-#endif
+		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
 	} else {
 		ida_simple_remove(&zvol_ida, idx);
 	}
@@ -471,7 +471,8 @@ fletcher_4_native(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
 {
 	(void) ctx_template;
-	const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
+	const uint64_t p2size = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE,
+	    uint64_t);

 	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

@@ -519,7 +520,8 @@ fletcher_4_byteswap(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
 {
 	(void) ctx_template;
-	const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
+	const uint64_t p2size = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE,
+	    uint64_t);

 	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

@@ -878,7 +880,7 @@ abd_fletcher_4_iter(void *data, size_t size, void *private)
 	fletcher_4_ctx_t *ctx = cdp->acd_ctx;
 	fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
 	boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
-	uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
+	uint64_t asize = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE, uint64_t);

 	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

@@ -8879,7 +8879,7 @@ out:
 		 * assertions may be violated without functional consequences
 		 * as the device is about to be removed.
 		 */
-		ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
+		ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end);
 		if (!dev->l2ad_first)
 			ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
 	}
@@ -8895,7 +8895,6 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
    abd_t **abd_out)
 {
 	int ret;
-	void *tmp = NULL;
 	abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
 	enum zio_compress compress = HDR_GET_COMPRESS(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
@@ -8916,12 +8915,11 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
 	 * and copy the data. This may be done to eliminate a dependency on a
 	 * shared buffer or to reallocate the buffer to match asize.
 	 */
-	if (HDR_HAS_RABD(hdr) && asize != psize) {
-		ASSERT3U(asize, >=, psize);
+	if (HDR_HAS_RABD(hdr)) {
+		ASSERT3U(asize, >, psize);
 		to_write = abd_alloc_for_io(asize, ismd);
 		abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
-		if (psize != asize)
-			abd_zero_off(to_write, psize, asize - psize);
+		abd_zero_off(to_write, psize, asize - psize);
 		goto out;
 	}

@@ -8930,48 +8928,31 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
 		ASSERT3U(size, ==, psize);
 		to_write = abd_alloc_for_io(asize, ismd);
 		abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
-		if (size != asize)
+		if (asize > size)
 			abd_zero_off(to_write, size, asize - size);
 		goto out;
 	}

 	if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
-		/*
-		 * In some cases, we can wind up with size > asize, so
-		 * we need to opt for the larger allocation option here.
-		 *
-		 * (We also need abd_return_buf_copy in all cases because
-		 * it's an ASSERT() to modify the buffer before returning it
-		 * with arc_return_buf(), and all the compressors
-		 * write things before deciding to fail compression in nearly
-		 * every case.)
-		 */
-		uint64_t bufsize = MAX(size, asize);
-		cabd = abd_alloc_for_io(bufsize, ismd);
-		tmp = abd_borrow_buf(cabd, bufsize);
-
-		psize = zio_compress_data(compress, to_write, &tmp, size,
-		    hdr->b_complevel);
-
-		if (psize >= asize) {
-			psize = HDR_GET_PSIZE(hdr);
-			abd_return_buf_copy(cabd, tmp, bufsize);
-			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
-			to_write = cabd;
-			abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
-			if (psize != asize)
-				abd_zero_off(to_write, psize, asize - psize);
-			goto encrypt;
+		size_t bufsize = MAX(size, asize);
+		void *buf = zio_buf_alloc(bufsize);
+		uint64_t csize = zio_compress_data(compress, to_write, &buf,
+		    size, hdr->b_complevel);
+		if (csize > psize) {
+			/*
+			 * We can't re-compress the block into the original
+			 * psize.  Even if it fits into asize, it does not
+			 * matter, since checksum will never match on read.
+			 */
+			zio_buf_free(buf, bufsize);
+			return (SET_ERROR(EIO));
 		}
-		ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
-		if (psize < asize)
-			memset((char *)tmp + psize, 0, bufsize - psize);
-		psize = HDR_GET_PSIZE(hdr);
-		abd_return_buf_copy(cabd, tmp, bufsize);
-		to_write = cabd;
+		if (asize > csize)
+			memset((char *)buf + csize, 0, asize - csize);
+		to_write = cabd = abd_get_from_buf(buf, bufsize);
+		abd_take_ownership_of_buf(cabd, B_TRUE);
 	}

-encrypt:
 	if (HDR_ENCRYPTED(hdr)) {
 		eabd = abd_alloc_for_io(asize, ismd);

@@ -9074,12 +9055,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	 */
 	for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
 		/*
-		 * If pass == 1 or 3, we cache MRU metadata and data
-		 * respectively.
+		 * pass == 0: MFU meta
+		 * pass == 1: MRU meta
+		 * pass == 2: MFU data
+		 * pass == 3: MRU data
 		 */
-		if (l2arc_mfuonly) {
+		if (l2arc_mfuonly == 1) {
 			if (pass == 1 || pass == 3)
 				continue;
+		} else if (l2arc_mfuonly > 1) {
+			if (pass == 3)
+				continue;
 		}

 		uint64_t passed_sz = 0;
@@ -218,7 +218,7 @@ zfs_btree_create_custom(zfs_btree_t *tree,
 	    zfs_btree_find_in_buf : bt_find_in_buf;
 	tree->bt_elem_size = size;
 	tree->bt_leaf_size = lsize;
-	tree->bt_leaf_cap = P2ALIGN(esize / size, 2);
+	tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t);
 	tree->bt_height = -1;
 	tree->bt_bulk = NULL;
 }
@@ -201,6 +201,9 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
 void
 dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
 {
+	if (dk->dk_kstats == NULL)
+		return;
+
 	dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
 	char *ds_name;

@@ -161,13 +161,13 @@ struct {
 } dbuf_sums;

 #define	DBUF_STAT_INCR(stat, val)	\
-	wmsum_add(&dbuf_sums.stat, val);
+	wmsum_add(&dbuf_sums.stat, val)
 #define	DBUF_STAT_DECR(stat, val)	\
-	DBUF_STAT_INCR(stat, -(val));
+	DBUF_STAT_INCR(stat, -(val))
 #define	DBUF_STAT_BUMP(stat)		\
-	DBUF_STAT_INCR(stat, 1);
+	DBUF_STAT_INCR(stat, 1)
 #define	DBUF_STAT_BUMPDOWN(stat)	\
-	DBUF_STAT_INCR(stat, -1);
+	DBUF_STAT_INCR(stat, -1)
 #define	DBUF_STAT_MAX(stat, v) {					\
 	uint64_t _m;							\
 	while ((v) > (_m = dbuf_stats.stat.value.ui64) &&		\
@@ -177,7 +177,6 @@ struct {

 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
-static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);

 /*
 * Global data structures and functions for the dbuf cache.
@@ -1403,13 +1402,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
 * a decrypted block. Otherwise success.
 */
 static int
-dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
 {
-	int bonuslen, max_bonuslen, err;
-
-	err = dbuf_read_verify_dnode_crypt(db, flags);
-	if (err)
-		return (err);
+	int bonuslen, max_bonuslen;

 	bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 	max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
@@ -1494,32 +1489,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
 * decrypt / authenticate them when we need to read an encrypted bonus buffer.
 */
 static int
-dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
+dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
 {
-	int err = 0;
 	objset_t *os = db->db_objset;
-	arc_buf_t *dnode_abuf;
-	dnode_t *dn;
+	dmu_buf_impl_t *dndb;
+	arc_buf_t *dnbuf;
 	zbookmark_phys_t zb;
-
-	ASSERT(MUTEX_HELD(&db->db_mtx));
+	int err;

 	if ((flags & DB_RF_NO_DECRYPT) != 0 ||
-	    !os->os_encrypted || os->os_raw_receive)
+	    !os->os_encrypted || os->os_raw_receive ||
+	    (dndb = dn->dn_dbuf) == NULL)
 		return (0);

-	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
-
-	if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
-		DB_DNODE_EXIT(db);
+	dnbuf = dndb->db_buf;
+	if (!arc_is_encrypted(dnbuf))
 		return (0);
-	}
+
+	mutex_enter(&dndb->db_mtx);
+
+	/*
+	 * Since dnode buffer is modified by sync process, there can be only
+	 * one copy of it.  It means we can not modify (decrypt) it while it
+	 * is being written.  I don't see how this may happen now, since
+	 * encrypted dnode writes by receive should be completed before any
+	 * plain-text reads due to txg wait, but better be safe than sorry.
+	 */
+	while (1) {
+		if (!arc_is_encrypted(dnbuf)) {
+			mutex_exit(&dndb->db_mtx);
+			return (0);
+		}
+		dbuf_dirty_record_t *dr = dndb->db_data_pending;
+		if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
+			break;
+		cv_wait(&dndb->db_changed, &dndb->db_mtx);
+	};

 	SET_BOOKMARK(&zb, dmu_objset_id(os),
-	    DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
-	err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
+	    DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
+	err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);

 	/*
 	 * An error code of EACCES tells us that the key is still not
@@ -1532,7 +1541,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
 	    !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
 		err = 0;

-	DB_DNODE_EXIT(db);
+	mutex_exit(&dndb->db_mtx);

 	return (err);
 }
@@ -1558,7 +1567,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 	    RW_LOCK_HELD(&db->db_parent->db_rwlock));

 	if (db->db_blkid == DMU_BONUS_BLKID) {
-		err = dbuf_read_bonus(db, dn, flags);
+		err = dbuf_read_bonus(db, dn);
 		goto early_unlock;
 	}

@@ -1619,10 +1628,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 		goto early_unlock;
 	}

-	err = dbuf_read_verify_dnode_crypt(db, flags);
-	if (err != 0)
-		goto early_unlock;
-
 	db->db_state = DB_READ;
 	DTRACE_SET_STATE(db, "read issued");
 	mutex_exit(&db->db_mtx);
@@ -1738,19 +1743,23 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 {
-	int err = 0;
-	boolean_t prefetch;
 	dnode_t *dn;
+	boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
+	int err;

-	/*
-	 * We don't have to hold the mutex to check db_state because it
-	 * can't be freed while we have a hold on the buffer.
-	 */
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));

 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);

+	/*
+	 * Ensure that this block's dnode has been decrypted if the caller
+	 * has requested decrypted data.
+	 */
+	err = dbuf_read_verify_dnode_crypt(db, dn, flags);
+	if (err != 0)
+		goto done;
+
 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    (flags & DB_RF_NOPREFETCH) == 0;

@@ -1759,13 +1768,38 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 		db->db_partial_read = B_TRUE;
 	else if (!(flags & DB_RF_PARTIAL_MORE))
 		db->db_partial_read = B_FALSE;
-	if (db->db_state == DB_CACHED) {
-		/*
-		 * Ensure that this block's dnode has been decrypted if
-		 * the caller has requested decrypted data.
-		 */
-		err = dbuf_read_verify_dnode_crypt(db, flags);
+	miss = (db->db_state != DB_CACHED);

+	if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+		/*
+		 * Another reader came in while the dbuf was in flight between
+		 * UNCACHED and CACHED.  Either a writer will finish filling
+		 * the buffer, sending the dbuf to CACHED, or the first reader's
+		 * request will reach the read_done callback and send the dbuf
+		 * to CACHED.  Otherwise, a failure occurred and the dbuf will
+		 * be sent to UNCACHED.
+		 */
+		if (flags & DB_RF_NEVERWAIT) {
+			mutex_exit(&db->db_mtx);
+			DB_DNODE_EXIT(db);
+			goto done;
+		}
+		do {
+			ASSERT(db->db_state == DB_READ ||
+			    (flags & DB_RF_HAVESTRUCT) == 0);
+			DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
+			    zio_t *, pio);
+			cv_wait(&db->db_changed, &db->db_mtx);
+		} while (db->db_state == DB_READ || db->db_state == DB_FILL);
+		if (db->db_state == DB_UNCACHED) {
+			err = SET_ERROR(EIO);
+			mutex_exit(&db->db_mtx);
+			DB_DNODE_EXIT(db);
+			goto done;
+		}
+	}
+
+	if (db->db_state == DB_CACHED) {
 		/*
 		 * If the arc buf is compressed or encrypted and the caller
 		 * requested uncompressed data, we need to untransform it
@@ -1773,8 +1807,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 		 * unauthenticated blocks, which will verify their MAC if
 		 * the key is now available.
 		 */
-		if (err == 0 && db->db_buf != NULL &&
-		    (flags & DB_RF_NO_DECRYPT) == 0 &&
+		if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
 		    (arc_is_encrypted(db->db_buf) ||
 		    arc_is_unauthenticated(db->db_buf) ||
 		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
@@ -1788,17 +1821,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 			dbuf_set_data(db, db->db_buf);
 		}
 		mutex_exit(&db->db_mtx);
-		if (err == 0 && prefetch) {
-			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    B_FALSE, flags & DB_RF_HAVESTRUCT);
-		}
-		DB_DNODE_EXIT(db);
-		DBUF_STAT_BUMP(hash_hits);
-	} else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
-		boolean_t need_wait = B_FALSE;
-
+	} else {
+		ASSERT(db->db_state == DB_UNCACHED ||
+		    db->db_state == DB_NOFILL);
 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
-
 		if (pio == NULL && (db->db_state == DB_NOFILL ||
 		    (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
 			spa_t *spa = dn->dn_objset->os_spa;
@@ -1806,65 +1832,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 			need_wait = B_TRUE;
 		}
 		err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
-		/*
-		 * dbuf_read_impl has dropped db_mtx and our parent's rwlock
-		 * for us
-		 */
-		if (!err && prefetch) {
-			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    db->db_state != DB_CACHED,
-			    flags & DB_RF_HAVESTRUCT);
-		}
-
-		DB_DNODE_EXIT(db);
-		DBUF_STAT_BUMP(hash_misses);
-
-		/*
-		 * If we created a zio_root we must execute it to avoid
-		 * leaking it, even if it isn't attached to any work due
-		 * to an error in dbuf_read_impl().
-		 */
-		if (need_wait) {
-			if (err == 0)
-				err = zio_wait(pio);
-			else
-				(void) zio_wait(pio);
-			pio = NULL;
-		}
-	} else {
-		/*
-		 * Another reader came in while the dbuf was in flight
-		 * between UNCACHED and CACHED.  Either a writer will finish
-		 * writing the buffer (sending the dbuf to CACHED) or the
-		 * first reader's request will reach the read_done callback
-		 * and send the dbuf to CACHED.  Otherwise, a failure
-		 * occurred and the dbuf went to UNCACHED.
-		 */
-		mutex_exit(&db->db_mtx);
-		if (prefetch) {
-			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    B_TRUE, flags & DB_RF_HAVESTRUCT);
-		}
-		DB_DNODE_EXIT(db);
-		DBUF_STAT_BUMP(hash_misses);
-
-		/* Skip the wait per the caller's request. */
-		if ((flags & DB_RF_NEVERWAIT) == 0) {
-			mutex_enter(&db->db_mtx);
-			while (db->db_state == DB_READ ||
-			    db->db_state == DB_FILL) {
-				ASSERT(db->db_state == DB_READ ||
-				    (flags & DB_RF_HAVESTRUCT) == 0);
-				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
-				    db, zio_t *, pio);
-				cv_wait(&db->db_changed, &db->db_mtx);
-			}
-			if (db->db_state == DB_UNCACHED)
-				err = SET_ERROR(EIO);
-			mutex_exit(&db->db_mtx);
-		}
+		/* dbuf_read_impl drops db_mtx and parent's rwlock. */
+		miss = (db->db_state != DB_CACHED);
 	}

+	if (err == 0 && prefetch) {
+		dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
+		    flags & DB_RF_HAVESTRUCT);
+	}
+	DB_DNODE_EXIT(db);
+
+	/*
+	 * If we created a zio we must execute it to avoid leaking it, even if
+	 * it isn't attached to any work due to an error in dbuf_read_impl().
+	 */
+	if (need_wait) {
+		if (err == 0)
+			err = zio_wait(pio);
+		else
+			(void) zio_wait(pio);
+		pio = NULL;
+	}
+
+done:
+	if (miss)
+		DBUF_STAT_BUMP(hash_misses);
+	else
+		DBUF_STAT_BUMP(hash_hits);
 	if (pio && err != 0) {
 		zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
@@ -2840,6 +2834,7 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
 			failed = B_FALSE;
 		} else if (failed) {
 			VERIFY(!dbuf_undirty(db, tx));
+			arc_buf_destroy(db->db_buf, db);
 			db->db_buf = NULL;
 			dbuf_clear_data(db);
 			DTRACE_SET_STATE(db, "fill failed");
@@ -537,7 +537,8 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 	if (dn->dn_datablkshift) {
 		int blkshift = dn->dn_datablkshift;
 		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
-		    P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
+		    P2ALIGN_TYPED(offset, 1ULL << blkshift, uint64_t))
+		    >> blkshift;
 	} else {
 		if (offset + length > dn->dn_datablksz) {
 			zfs_panic_recover("zfs: accessing past end of object "
@@ -814,6 +815,13 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)

 	ASSERT3U(minimum, <=, *start);

+	/* dn_nlevels == 1 means we don't have any L1 blocks */
+	if (dn->dn_nlevels <= 1) {
+		*l1blks = 0;
+		*start = minimum;
+		return (0);
+	}
+
 	/*
 	 * Check if we can free the entire range assuming that all of the
 	 * L1 blocks in this range have data. If we can, we use this
@@ -854,7 +862,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
 		}

 		/* set start to the beginning of this L1 indirect */
-		*start = P2ALIGN(*start, iblkrange);
+		*start = P2ALIGN_TYPED(*start, iblkrange, uint64_t);
 	}
 	if (*start < minimum)
 		*start = minimum;
@@ -160,7 +160,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
 			 * is not suitably aligned.
 			 */
 			os->os_obj_next_chunk =
-			    P2ALIGN(object, dnodes_per_chunk) +
+			    P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) +
 			    dnodes_per_chunk;
 			(void) atomic_swap_64(cpuobj, object);
 			mutex_exit(&os->os_obj_lock);
@@ -400,10 +400,10 @@ dnode_hash(const objset_t *os, uint64_t obj)

 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 	/*
-	 * The low 6 bits of the pointer don't have much entropy, because
-	 * the objset_t is larger than 2^6 bytes long.
+	 * The lower 11 bits of the pointer don't have much entropy, because
+	 * the objset_t is more than 1KB long and so likely aligned to 2KB.
 	 */
-	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF];
 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
@@ -3710,16 +3710,19 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
 	spa_history_log_internal_ds(hds, "promote", tx, " ");

 	dsl_dir_rele(odd, FTAG);
-	promote_rele(ddpa, FTAG);

 	/*
-	 * Transfer common error blocks from old head to new head.
+	 * Transfer common error blocks from old head to new head, before
+	 * calling promote_rele() on ddpa since we need to dereference
+	 * origin_head and hds.
 	 */
 	if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		uint64_t old_head = origin_head->ds_object;
 		uint64_t new_head = hds->ds_object;
 		spa_swap_errlog(dp->dp_spa, new_head, old_head, tx);
 	}
+
+	promote_rele(ddpa, FTAG);
 }

 /*
@@ -491,6 +491,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)

 	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
 	    offsetof(scan_ds_t, sds_node));
+	mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
 	    sizeof (scan_prefetch_issue_ctx_t),
 	    offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
@@ -646,6 +647,7 @@ dsl_scan_fini(dsl_pool_t *dp)

 		scan_ds_queue_clear(scn);
 		avl_destroy(&scn->scn_queue);
+		mutex_destroy(&scn->scn_queue_lock);
 		scan_ds_prefetch_queue_clear(scn);
 		avl_destroy(&scn->scn_prefetch_queue);

@@ -2727,8 +2729,10 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 			return (err);
 		ds = prev;
 	}
+	mutex_enter(&scn->scn_queue_lock);
 	scan_ds_queue_insert(scn, ds->ds_object,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
+	mutex_exit(&scn->scn_queue_lock);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
@@ -2919,8 +2923,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 		ds = prev;
 	}

+	mutex_enter(&scn->scn_queue_lock);
 	scan_ds_queue_insert(scn, ds->ds_object,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
+	mutex_exit(&scn->scn_queue_lock);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
@@ -629,8 +629,8 @@ metaslab_class_expandable_space(metaslab_class_t *mc)
 		 * metaslabs. We report the expandable space in terms
 		 * of the metaslab size since that's the unit of expansion.
 		 */
-		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
-		    1ULL << tvd->vdev_ms_shift);
+		space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize,
+		    1ULL << tvd->vdev_ms_shift, uint64_t);
 	}
 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 	return (space);
@@ -640,6 +640,7 @@ void
 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 {
 	multilist_t *ml = &mc->mc_metaslab_txg_list;
+	hrtime_t now = gethrtime();
 	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
 		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		metaslab_t *msp = multilist_sublist_head(mls);
@@ -663,8 +664,10 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 			multilist_sublist_unlock(mls);
 			if (txg >
 			    msp->ms_selected_txg + metaslab_unload_delay &&
-			    gethrtime() > msp->ms_selected_time +
-			    (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
+			    now > msp->ms_selected_time +
+			    MSEC2NSEC(metaslab_unload_delay_ms) &&
+			    (msp->ms_allocator == -1 ||
+			    !metaslab_preload_enabled)) {
 				metaslab_evict(msp, txg);
 			} else {
 				/*
@@ -9939,6 +9939,9 @@ spa_sync(spa_t *spa, uint64_t txg)

 	metaslab_class_evict_old(spa->spa_normal_class, txg);
 	metaslab_class_evict_old(spa->spa_log_class, txg);
+	/* spa_embedded_log_class has only one metaslab per vdev. */
+	metaslab_class_evict_old(spa->spa_special_class, txg);
+	metaslab_class_evict_old(spa->spa_dedup_class, txg);

 	spa_sync_close_syncing_log_sm(spa);

@@ -10561,10 +10564,10 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
 	"Print vdev tree to zfs_dbgmsg during pool import");

-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
 	"Percentage of CPUs to run an IO worker thread");

-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
 	"Number of threads per IO worker taskqueue");

 /* BEGIN CSTYLED */
@@ -10595,10 +10598,10 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,

 #ifdef _KERNEL
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
-	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
+	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
 	"Configure IO queues for read IO");
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
-	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
+	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
 	"Configure IO queues for write IO");
 #endif
 /* END CSTYLED */
@@ -347,7 +347,8 @@ vdev_get_min_asize(vdev_t *vd)
 	 * to the nearest metaslab.
 	 */
 	if (vd == vd->vdev_top)
-		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
+		return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift,
+		    uint64_t));

 	return (pvd->vdev_ops->vdev_op_min_asize(pvd));
 }
@@ -2007,6 +2008,7 @@ vdev_open(vdev_t *vd)
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	vd->vdev_cant_read = B_FALSE;
 	vd->vdev_cant_write = B_FALSE;
+	vd->vdev_fault_wanted = B_FALSE;
 	vd->vdev_min_asize = vdev_get_min_asize(vd);

 	/*
@@ -2107,8 +2109,8 @@ vdev_open(vdev_t *vd)
 		}
 	}

-	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
-	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
+	osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t);
+	max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t);

 	if (vd->vdev_children == 0) {
 		if (osize < SPA_MINDEVSIZE) {
@@ -4730,9 +4732,9 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 		 * can expand.
 		 */
 		if (vd->vdev_aux == NULL && tvd != NULL) {
-			vs->vs_esize = P2ALIGN(
+			vs->vs_esize = P2ALIGN_TYPED(
 			    vd->vdev_max_asize - vd->vdev_asize,
-			    1ULL << tvd->vdev_ms_shift);
+			    1ULL << tvd->vdev_ms_shift, uint64_t);
 		}

 		vs->vs_configured_ashift = vd->vdev_top != NULL
@@ -635,6 +635,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
 		uint64_t object = zap->zap_object;

 		zap_put_leaf(l);
+		*lp = l = NULL;
 		zap_unlockdir(zap, tag);
 		err = zap_lockdir(os, object, tx, RW_WRITER,
 		    FALSE, FALSE, tag, &zn->zn_zap);
@@ -844,21 +845,17 @@ retry:
 	} else if (err == EAGAIN) {
 		err = zap_expand_leaf(zn, l, tag, tx, &l);
 		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
-		if (err == 0) {
+		if (err == 0)
 			goto retry;
-		} else if (err == ENOSPC) {
-			/*
-			 * If we failed to expand the leaf, then bailout
-			 * as there is no point trying
-			 * zap_put_leaf_maybe_grow_ptrtbl().
-			 */
-			return (err);
-		}
 	}

 out:
-	if (zap != NULL)
-		zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+	if (l != NULL) {
+		if (err == ENOSPC)
+			zap_put_leaf(l);
+		else
+			zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+	}
 	return (err);
 }

@@ -915,8 +912,12 @@ retry:
 			goto retry;
 	}

-	if (zap != NULL)
-		zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+	if (l != NULL) {
+		if (err == ENOSPC)
+			zap_put_leaf(l);
+		else
+			zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+	}
 	return (err);
 }

@@ -903,7 +903,7 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
 	itx_t *itx;
 	lr_clone_range_t *lr;
 	uint64_t partlen, max_log_data;
-	size_t i, partnbps;
+	size_t partnbps;

 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
@@ -912,10 +912,8 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,

 	while (nbps > 0) {
 		partnbps = MIN(nbps, max_log_data / sizeof (bps[0]));
-		partlen = 0;
-		for (i = 0; i < partnbps; i++) {
-			partlen += BP_GET_LSIZE(&bps[i]);
-		}
+		partlen = partnbps * blksz;
+		ASSERT3U(partlen, <, len + blksz);
 		partlen = MIN(partlen, len);

 		itx = zil_itx_create(txtype,
@@ -513,9 +513,26 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,

 		for (; lrp < end; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
+
+			/*
+			 * Are the remaining bytes large enough to hold a
+			 * log record?
+			 */
+			if ((char *)(lr + 1) > end) {
+				cmn_err(CE_WARN, "zil_parse: lr_t overrun");
+				error = SET_ERROR(ECKSUM);
+				arc_buf_destroy(abuf, &abuf);
+				goto done;
+			}
 			reclen = lr->lrc_reclen;
-			ASSERT3U(reclen, >=, sizeof (lr_t));
-			ASSERT3U(reclen, <=, end - lrp);
+			if (reclen < sizeof (lr_t) || reclen > end - lrp) {
+				cmn_err(CE_WARN,
+				    "zil_parse: lr_t has an invalid reclen");
+				error = SET_ERROR(ECKSUM);
+				arc_buf_destroy(abuf, &abuf);
+				goto done;
+			}
+
 			if (lr->lrc_seq > claim_lr_seq) {
 				arc_buf_destroy(abuf, &abuf);
 				goto done;
@@ -145,6 +145,24 @@ for kernel_version in %{?kernel_versions}; do
        %{?kernel_cc} \
        %{?kernel_ld} \
        %{?kernel_llvm}
+
+    # Pre-6.10 kernel builds didn't need to copy over the source files to the
+    # build directory.  Post-6.10 builds do need them copied, due to these
+    # commits:
+    #
+    # b1992c3772e6 kbuild: use $(src) instead of $(srctree)/$(src) for source
+    #                      directory
+    #
+    # 9a0ebe5011f4 kbuild: use $(obj)/ instead of $(src)/ for common pattern
+    #                      rules
+    #
+    # Note that kmodtool actually copies over the source into the build
+    # directory, so what we're doing here is normal.  For efficiency reasons
+    # though we just use hardlinks instead of copying.
+    #
+    # See https://github.com/openzfs/zfs/issues/16439 for more info.
+    cp -lR ../%{module}-%{version}/module/* module/
+
    make %{?_smp_mflags}
    cd ..
 done
@@ -532,6 +532,7 @@ systemctl --system daemon-reload >/dev/null || true
 %attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/*

 %config(noreplace) %{_bashcompletiondir}/zfs
+%config(noreplace) %{_bashcompletiondir}/zpool

 %files -n libzpool5
 %{_libdir}/libzpool.so.*
@@ -26,6 +26,7 @@ PACKAGE_VERSION="${pkgver}"
 PACKAGE_CONFIG="${pkgcfg}"
 NO_WEAK_MODULES="yes"
 PRE_BUILD="configure
+  --disable-dependency-tracking
  --prefix=/usr
  --with-config=kernel
  --with-linux=\$(
@@ -32,6 +32,7 @@ SCRIPT_COMMON=${SCRIPT_COMMON:-${0%/*}/common.sh}
 PROG=zfs-tests.sh
 VERBOSE="no"
 QUIET=""
+DEBUG=""
 CLEANUP="yes"
 CLEANUPALL="no"
 KMSG=""
@@ -313,6 +314,7 @@ OPTIONS:
 	-h          Show this message
 	-v          Verbose zfs-tests.sh output
 	-q          Quiet test-runner output
+	-D          Debug; show all test output immediately (noisy)
 	-x          Remove all testpools, dm, lo, and files (unsafe)
 	-k          Disable cleanup after test failure
 	-K          Log test names to /dev/kmsg
@@ -326,7 +328,8 @@ OPTIONS:
 	-d DIR      Use world-writable DIR for files and loopback devices
 	-s SIZE     Use vdevs of SIZE (default: 4G)
 	-r RUNFILES Run tests in RUNFILES (default: ${DEFAULT_RUNFILES})
-	-t PATH     Run single test at PATH relative to test suite
+	-t PATH|NAME  Run single test at PATH relative to test suite,
+	                or search for test by NAME
 	-T TAGS     Comma separated list of tags (default: 'functional')
 	-u USER     Run single test as USER (default: root)

@@ -340,6 +343,9 @@ $0 -r linux-fast
 # Run a single test
 $0 -t tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh

+# Run a single test by name
+$0 -t zfs_bookmark_cliargs
+
 # Cleanup a previous run of the test suite prior to testing, run the
 # default ($(echo "${DEFAULT_RUNFILES}" | sed 's/\.run//')) suite of tests and perform no cleanup on exit.
 $0 -x
@@ -347,7 +353,7 @@ $0 -x
 EOF
 }

-while getopts 'hvqxkKfScRmn:d:s:r:?t:T:u:I:' OPTION; do
+while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do
 	case $OPTION in
 	h)
 		usage
@@ -393,6 +399,9 @@ while getopts 'hvqxkKfScRmn:d:s:r:?t:T:u:I:' OPTION; do
 	d)
 		FILEDIR="$OPTARG"
 		;;
+	D)
+		DEBUG="yes"
+		;;
 	I)
 		ITERATIONS="$OPTARG"
 		if [ "$ITERATIONS" -le 0 ]; then
@@ -450,8 +459,15 @@ post_user = root
 post =
 outputdir = /var/tmp/test_results
 EOF
-	SINGLETESTDIR="${SINGLETEST%/*}"
+	if [ "$SINGLETEST" = "${SINGLETEST%/*}" ] ; then
+		NEWSINGLETEST=$(find "$STF_SUITE" -name "$SINGLETEST*" -print -quit)
+		if [ -z "$NEWSINGLETEST" ] ; then
+			fail "couldn't find test matching '$SINGLETEST'"
+		fi
+		SINGLETEST=$NEWSINGLETEST
+	fi

+	SINGLETESTDIR="${SINGLETEST%/*}"
 	SETUPDIR="$SINGLETESTDIR"
 	[ "${SETUPDIR#/}" = "$SETUPDIR" ] && SETUPDIR="$STF_SUITE/$SINGLETESTDIR"
 	[ -x "$SETUPDIR/setup.ksh"   ] && SETUPSCRIPT="setup"     || SETUPSCRIPT=
@@ -680,6 +696,7 @@ REPORT_FILE=$(mktemp_file zts-report)
 #
 msg "${TEST_RUNNER}" \
    "${QUIET:+-q}" \
+    "${DEBUG:+-D}" \
    "${KMEMLEAK:+-m}" \
    "${KMSG:+-K}" \
    "-c \"${RUNFILES}\"" \
@@ -689,6 +706,7 @@ msg "${TEST_RUNNER}" \
 { PATH=$STF_PATH \
    ${TEST_RUNNER} \
    ${QUIET:+-q} \
+    ${DEBUG:+-D} \
    ${KMEMLEAK:+-m} \
    ${KMSG:+-K} \
    -c "${RUNFILES}" \
@@ -715,6 +733,7 @@ if [ "$RESULT" -eq "2" ] && [ -n "$RERUN" ]; then
 	{ PATH=$STF_PATH \
 	    ${TEST_RUNNER} \
 	        ${QUIET:+-q} \
+	        ${DEBUG:+-D} \
 	        ${KMEMLEAK:+-m} \
 	    -c "${RUNFILES}" \
 	    -T "${TAGS}" \
@@ -81,7 +81,8 @@ tests = ['block_cloning_clone_mmap_cached',
    'block_cloning_cross_enc_dataset',
    'block_cloning_copyfilerange_fallback_same_txg',
    'block_cloning_replay', 'block_cloning_replay_encrypted',
-    'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write']
+    'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write',
+    'block_cloning_rlimit_fsize']
 tags = ['functional', 'block_cloning']

 [tests/functional/bootfs]
@@ -121,7 +121,7 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
    'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
    'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
    'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
-    'scrub_after_resilver', 'zpool_status_-s']
+    'scrub_after_resilver', 'suspend_resume_single', 'zpool_status_-s']
 tags = ['functional', 'fault']

 [tests/functional/features/large_dnode:Linux]
@@ -113,8 +113,9 @@ class Output(object):
    This class is a slightly modified version of the 'Stream' class found
    here: http://goo.gl/aSGfv
    """
-    def __init__(self, stream):
+    def __init__(self, stream, debug=False):
        self.stream = stream
+        self.debug = debug
        self._buf = b''
        self.lines = []

@@ -140,6 +141,8 @@ class Output(object):
        buf = os.read(fd, 4096)
        if not buf:
            return None
+        if self.debug:
+            os.write(sys.stderr.fileno(), buf)
        if b'\n' not in buf:
            self._buf += buf
            return []
@@ -238,14 +241,14 @@ User: %s
        ret = '%s -E -u %s %s' % (SUDO, user, cmd)
        return ret.split(' ')

-    def collect_output(self, proc):
+    def collect_output(self, proc, debug=False):
        """
        Read from stdout/stderr as data becomes available, until the
        process is no longer running. Return the lines from the stdout and
        stderr Output objects.
        """
-        out = Output(proc.stdout)
-        err = Output(proc.stderr)
+        out = Output(proc.stdout, debug)
+        err = Output(proc.stderr, debug)
        res = []
        while proc.returncode is None:
            proc.poll()
@@ -308,7 +311,10 @@ User: %s

        try:
            t.start()
-            self.result.stdout, self.result.stderr = self.collect_output(proc)
+
+            out, err = self.collect_output(proc, options.debug)
+            self.result.stdout = out
+            self.result.stderr = err

            if kmemleak:
                cmd = f'{SUDO} sh -c "echo scan > {KMEMLEAK_FILE}"'
@@ -624,7 +630,7 @@ Tags: %s


 class TestRun(object):
-    props = ['quiet', 'outputdir']
+    props = ['quiet', 'outputdir', 'debug']

    def __init__(self, options):
        self.tests = {}
@@ -644,7 +650,8 @@ class TestRun(object):
            ('post_user', ''),
            ('failsafe', ''),
            ('failsafe_user', ''),
-            ('tags', [])
+            ('tags', []),
+            ('debug', False)
        ]

    def __str__(self):
@@ -1067,6 +1074,8 @@ def parse_args():
                      help='Specify tests to run via config files.')
    parser.add_option('-d', action='store_true', default=False, dest='dryrun',
                      help='Dry run. Print tests, but take no other action.')
+    parser.add_option('-D', action='store_true', default=False, dest='debug',
+                      help='Write all test output to stdout as it arrives.')
    parser.add_option('-l', action='callback', callback=options_cb,
                      default=None, dest='logfile', metavar='logfile',
                      type='string',
@@ -182,7 +182,6 @@ if sys.platform.startswith('freebsd'):
        'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
        'cp_files/cp_files_002_pos': ['SKIP', na_reason],
        'link_count/link_count_001': ['SKIP', na_reason],
-        'casenorm/mixed_create_failure': ['FAIL', 13215],
        'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
        'rsend/send_raw_ashift': ['SKIP', 14961],
    })
@@ -331,6 +330,8 @@ elif sys.platform.startswith('linux'):
            ['SKIP', cfr_reason],
        'block_cloning/block_cloning_replay_encrypted':
            ['SKIP', cfr_reason],
+        'block_cloning/block_cloning_rlimit_fsize':
+            ['SKIP', cfr_reason],
        'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
        'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
        'cp_files/cp_files_002_pos': ['SKIP', cfr_reason],
@@ -380,6 +381,7 @@ if os.environ.get('CI') == 'true':
        'fault/auto_replace_002_pos': ['SKIP', ci_reason],
        'fault/auto_spare_ashift': ['SKIP', ci_reason],
        'fault/auto_spare_shared': ['SKIP', ci_reason],
+        'fault/suspend_resume_single': ['SKIP', ci_reason],
        'procfs/pool_state': ['SKIP', ci_reason],
    })

@@ -521,13 +521,15 @@ test_send_new(const char *snapshot, int fd)
 static void
 test_recv_new(const char *dataset, int fd)
 {
-	dmu_replay_record_t drr = { 0 };
+	dmu_replay_record_t drr;
 	nvlist_t *required = fnvlist_alloc();
 	nvlist_t *optional = fnvlist_alloc();
 	nvlist_t *props = fnvlist_alloc();
 	char snapshot[MAXNAMELEN + 32];
 	ssize_t count;

+	memset(&drr, 0, sizeof (dmu_replay_record_t));
+
 	int cleanup_fd = open(ZFS_DEV, O_RDWR);
 	if (cleanup_fd == -1) {
 		(void) fprintf(stderr, "open(%s) failed: %s\n", ZFS_DEV,
@@ -62,11 +62,39 @@ function compare_version_gte
 }

 # Helper function used by linux_version() and freebsd_version()
+# $1, if provided, should be a MAJOR, MAJOR.MINOR or MAJOR.MINOR.PATCH
+# version number
 function kernel_version
 {
 	typeset ver="$1"

-	[ -z "$ver" ] && ver=$(uname -r | grep -Eo "^[0-9]+\.[0-9]+\.[0-9]+")
+	[ -z "$ver" ] && case "$UNAME" in
+	Linux)
+		# Linux version numbers are X.Y.Z followed by optional
+		# vendor/distro specific stuff
+		#   RHEL7:       3.10.0-1160.108.1.el7.x86_64
+		#   Fedora 37:   6.5.12-100.fc37.x86_64
+		#   Debian 12.6: 6.1.0-22-amd64
+		ver=$(uname -r | grep -Eo "^[0-9]+\.[0-9]+\.[0-9]+")
+		;;
+	FreeBSD)
+		# FreeBSD version numbers are X.Y-BRANCH-pZ. Depending on
+		# branch, -pZ may not be present, but this is typically only
+		# on pre-release or true .0 releases, so can be assumed 0
+		# if not present.
+		# eg:
+		#   13.2-RELEASE-p4
+		#   14.1-RELEASE
+		#   15.0-CURRENT
+		ver=$(uname -r | \
+		    grep -Eo "[0-9]+\.[0-9]+(-[A-Z0-9]+-p[0-9]+)?" | \
+		    sed -E "s/-[^-]+-p/./")
+		;;
+	*)
+		# Unknown system
+		log_fail "Don't know how to get kernel version for '$UNAME'"
+		;;
+	esac

 	typeset version major minor _
 	IFS='.' read -r version major minor _ <<<"$ver"
@@ -478,6 +478,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/block_cloning/block_cloning_replay.ksh \
 	functional/block_cloning/block_cloning_replay_encrypted.ksh \
 	functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh \
+	functional/block_cloning/block_cloning_rlimit_fsize.ksh \
 	functional/bootfs/bootfs_001_pos.ksh \
 	functional/bootfs/bootfs_002_neg.ksh \
 	functional/bootfs/bootfs_003_pos.ksh \
@@ -1476,6 +1477,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/fault/decompress_fault.ksh \
 	functional/fault/decrypt_fault.ksh \
 	functional/fault/scrub_after_resilver.ksh \
+	functional/fault/suspend_resume_single.ksh \
 	functional/fault/setup.ksh \
 	functional/fault/zpool_status_-s.ksh \
 	functional/features/async_destroy/async_destroy_001_pos.ksh \
@@ -55,7 +55,7 @@ function display_status
 	((ret |= $?))

 	typeset mntpnt=$(get_prop mountpoint $pool)
-	dd if=/dev/random of=$mntpnt/testfile.$$ &
+	dd if=/dev/urandom of=$mntpnt/testfile.$$ &
 	typeset pid=$!

 	zpool iostat -v 1 3 > /dev/null
@@ -0,0 +1,64 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
+
+#
+# DESCRIPTION:
+#	When block cloning is used to implement copy_file_range(2), the
+#	RLIMIT_FSIZE limit must be respected.
+#
+# STRATEGY:
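+#	1. Create a pool with block cloning enabled and a 1000-byte file.
+#	2. Clone it under "ulimit -f 2" (must succeed), then "-f 1" (must fail).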
+#
+
+verify_runnable "global"
+
+VDIR=$TEST_BASE_DIR/disk-bclone
+VDEV="$VDIR/a"
+
+function cleanup
+{
+	datasetexists $TESTPOOL && destroy_pool $TESTPOOL
+	rm -rf $VDIR
+}
+
+log_onexit cleanup
+
+log_assert "Test for RLIMIT_FSIZE handling with block cloning enabled"
+
+log_must rm -rf $VDIR
+log_must mkdir -p $VDIR
+log_must truncate -s 1G $VDEV
+
+log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV
+
+log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1 count=1000
+
+ulimit -f 2
+log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 all
+ulimit -f 1
+log_mustnot clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file3 0 0 all
+
+log_pass "copy_file_range(2) respects RLIMIT_FSIZE"
@@ -84,7 +84,8 @@ function do_vol_test
 	vol=$TESTPOOL/$TESTVOL1
 	vol_b_path=$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL1

-	log_must zfs create -V $VOLSIZE -o copies=$copies $vol
+	log_must zfs create -V $VOLSIZE -o compression=off -o copies=$copies \
+	    $vol
 	log_must zfs set refreservation=none $vol
 	block_device_wait $vol_b_path

@@ -116,31 +117,30 @@ function do_vol_test
 		else
 			log_must zpool create $TESTPOOL1 $vol_b_path
 		fi
-		log_must zfs create $TESTPOOL1/$TESTFS1
+		log_must zfs create -o compression=off $TESTPOOL1/$TESTFS1
+		sync_pool $TESTPOOL1
 		;;
 	*)
 		log_unsupported "$type test not implemented"
 		;;
 	esac

-	((nfilesize = copies * ${FILESIZE%m}))
+	sync_pool $TESTPOOL
 	pre_used=$(get_prop used $vol)
-	((target_size = pre_used + nfilesize))

 	if [[ $type == "zfs" ]]; then
 		log_must mkfile $FILESIZE /$TESTPOOL1/$TESTFS1/$FILE
+		sync_pool $TESTPOOL1
 	else
 		log_must mkfile $FILESIZE $mntp/$FILE
+		log_must sync
 	fi

+	sync_pool $TESTPOOL
 	post_used=$(get_prop used $vol)
-	((retries = 0))
-	while ((post_used < target_size && retries++ < 42)); do
-		sleep 1
-		post_used=$(get_prop used $vol)
-	done

 	((used = post_used - pre_used))
+	((nfilesize = copies * ${FILESIZE%m}))
 	if ((used < nfilesize)); then
 		log_fail "The space is not charged correctly while setting" \
 		    "copies as $copies ($used < $nfilesize)" \
@@ -153,5 +153,7 @@ function do_vol_test
 		log_must umount $mntp
 	fi

+	# Ubuntu 20.04 wants a sync here
+	log_must sync
 	log_must zfs destroy $vol
 }
@@ -0,0 +1,102 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+set -x
+
+DATAFILE="$TMPDIR/datafile"
+
+function cleanup	# log_onexit handler: undo everything this test set up
+{
+	destroy_pool $TESTPOOL
+	unload_scsi_debug
+	rm -f $DATAFILE		# was $DATA_FILE (never set), so the file leaked
+}
+
+log_onexit cleanup
+
+log_assert "ensure single-disk pool resumes properly after suspend and clear"
+
+# create a file, and take a checksum, so we can compare later
+log_must dd if=/dev/urandom of=$DATAFILE bs=128K count=1
+typeset sum1=$(cat $DATAFILE | md5sum)
+
+# make a debug device that we can "unplug"
+load_scsi_debug 100 1 1 1 '512b'
+sd=$(get_debug_device)
+
+# create a single-device pool
+log_must zpool create $TESTPOOL $sd
+log_must zpool sync
+
+# "pull" the disk
+log_must eval "echo offline > /sys/block/$sd/device/state"
+
+# copy data onto the pool. it'll appear to succeed, but only be in memory
+log_must cp $DATAFILE /$TESTPOOL/file
+
+# wait until sync starts, and the pool suspends
+log_note "waiting for pool to suspend"
+typeset -i tries=10
+until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do
+	if ((tries-- == 0)); then
+		log_fail "pool didn't suspend"
+	fi
+	sleep 1
+done
+
+# return the disk
+log_must eval "echo running > /sys/block/$sd/device/state"
+
+# clear the error states, which should reopen the vdev, get the pool back
+# online, and replay the failed IO
+log_must zpool clear $TESTPOOL
+
+# wait a while for everything to sync out. if something is going to go wrong,
+# this is where it will happen
+log_note "giving pool time to settle and complete txg"
+sleep 7
+
+# if the pool suspended, then everything is bad
+if [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; then
+	log_fail "pool suspended"
+fi
+
+# export the pool, to make sure it exports clean, and also to clear the file
+# out of the cache
+log_must zpool export $TESTPOOL
+
+# import the pool
+log_must zpool import $TESTPOOL
+
+# sum the file we wrote earlier
+typeset sum2=$(cat /$TESTPOOL/file | md5sum)
+
+# make sure the checksums match
+log_must test "$sum1" = "$sum2"
+
+log_pass "single-disk pool resumes properly after disk suspend and clear"
@@ -37,11 +37,7 @@ export TMP_HISTORY=$TEST_BASE_DIR/tmp_history.$$
 export NEW_HISTORY=$TEST_BASE_DIR/new_history.$$

 export MIGRATEDPOOLNAME=${MIGRATEDPOOLNAME:-history_pool}
-if is_freebsd; then
-	export TIMEZONE=${TIMEZONE:-America/Denver}
-else
-	export TIMEZONE=${TIMEZONE:-US/Mountain}
-fi
+export TIMEZONE=${TIMEZONE:-America/Denver}

 export HIST_USER="huser"
 export HIST_GROUP="hgroup"
@@ -41,13 +41,13 @@ verify_runnable "global"


 if ! $(grep -q "CONFIG_IO_URING=y" /boot/config-$(uname -r)); then
-	log_unsupported "Requires io_uring support"
+	log_unsupported "Requires io_uring support within Kernel"
 fi

 if [ -e /etc/os-release ] ; then
 	source /etc/os-release
-	if [ -n "$REDHAT_SUPPORT_PRODUCT_VERSION" ] && ((floor($REDHAT_SUPPORT_PRODUCT_VERSION) == 9)) ; then
-		log_unsupported "Disabled on CentOS 9, fails with 'Operation not permitted'"
+	if [ "$PLATFORM_ID" = "platform:el9" ]; then # PLATFORM_ID may be unset
+		log_unsupported "Disabled on RHEL 9 variants: fails with 'Operation not permitted'"
 	fi
 fi

@@ -96,7 +96,7 @@ log_must zfs destroy -R $clone2
 log_must eval "zfs send -i $sendfs#book2 --redact book3 $sendfs@snap2 >$stream"
 log_must eval "zfs recv $recvfs <$stream"
 log_must mount_redacted -f $recvfs
-log_must diff <(ls $send_mnt) <(ls $recv_mnt)
+log_must [ "$(ls $send_mnt)" == "$(ls $recv_mnt)" ]
 log_must zfs destroy -R $recvfs
 log_must zfs rollback -R $sendfs@snap

@@ -71,8 +71,7 @@ log_must ismounted $recvfs
 # deleted.
 contents=$(log_must find $recv_mnt)
 contents_orig=$(log_must find $send_mnt)
-log_must diff <(echo ${contents//$recv_mnt/}) \
-    <(echo ${contents_orig//$send_mnt/})
+log_must [ "${contents//$recv_mnt/}" == "${contents_orig//$send_mnt/}" ]
 log_must zfs redact $sendvol@snap book2 $clonevol@snap
 log_must eval "zfs send --redact book2 $sendvol@snap >$stream"
 log_must eval "zfs receive $recvvol <$stream"
@@ -103,7 +102,6 @@ log_must mount_redacted -f $recvfs
 log_must ismounted $recvfs
 contents=$(log_must find $recv_mnt)
 contents_orig=$(log_must find $send_mnt)
-log_must diff <(echo ${contents//$recv_mnt/}) \
-    <(echo ${contents_orig//$send_mnt/})
+log_must [ "${contents//$recv_mnt/}" == "${contents_orig//$send_mnt/}" ]

 log_pass "Received redacted streams can be mounted."