Tag zfs-2.2.2

META file and changelog updated. Signed-off-by: Tony Hutter <hutter2@llnl.gov>
FreeBSD: Fix ZFS so that snapshots under .zfs/snapshot are NFS visible
2026-05-23 19:04:45 +03:00 · 2023-11-29 14:08:46 -08:00 · 2023-11-29 14:08:46 -08:00 · 2023-11-29 13:08:25 -08:00 · 2023-11-29 13:08:25 -08:00 · 2023-11-28 15:19:07 -08:00
108 changed files with 1773 additions and 590 deletions
@@ -83,6 +83,7 @@
 modules.order
 Makefile
 Makefile.in
 changelog
 *.patch
 *.orig
 *.tmp
@@ -1,10 +1,10 @@
 Meta:          1
 Name:          zfs
 Branch:        1.0
-Version:       2.2.0
+Version:       2.2.2
 Release:       1
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS
-Linux-Maximum: 6.5
+Linux-Maximum: 6.6
 Linux-Minimum: 3.10
@@ -32,4 +32,4 @@ For more details see the NOTICE, LICENSE and COPYRIGHT files; `UCRL-CODE-235197`
 # Supported Kernels
  * The `META` file contains the officially recognized supported Linux kernel versions.
-  * Supported FreeBSD versions are any supported branches and releases starting from 12.2-RELEASE.
+  * Supported FreeBSD versions are any supported branches and releases starting from 12.4-RELEASE.
@@ -711,7 +711,7 @@ def section_archits(kstats_dict):
    pd_total = int(arc_stats['prefetch_data_hits']) +\
        int(arc_stats['prefetch_data_iohits']) +\
        int(arc_stats['prefetch_data_misses'])
-    prt_2('ARC prefetch metadata accesses:', f_perc(pd_total, all_accesses),
+    prt_2('ARC prefetch data accesses:', f_perc(pd_total, all_accesses),
          f_hits(pd_total))
    pd_todo = (('Prefetch data hits:', arc_stats['prefetch_data_hits']),
               ('Prefetch data I/O hits:', arc_stats['prefetch_data_iohits']),
@@ -34,6 +34,7 @@
 * Copyright (c) 2021 Allan Jude
 * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
 * Copyright (c) 2023, Klara Inc.
 * Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
 */
 #include <stdio.h>
@@ -80,6 +81,7 @@
 #include <sys/dsl_scan.h>
 #include <sys/btree.h>
 #include <sys/brt.h>
 #include <sys/brt_impl.h>
 #include <zfs_comutil.h>
 #include <sys/zstd/zstd.h>
@@ -899,6 +901,8 @@ usage(void)
 	    "don't print label contents\n");
 	(void) fprintf(stderr, "        -t --txg=INTEGER             "
 	    "highest txg to use when searching for uberblocks\n");
 	(void) fprintf(stderr, "        -T --brt-stats               "
 	    "BRT statistics\n");
 	(void) fprintf(stderr, "        -u --uberblock               "
 	    "uberblock\n");
 	(void) fprintf(stderr, "        -U --cachefile=PATH          "
@@ -999,6 +1003,15 @@ zdb_nicenum(uint64_t num, char *buf, size_t buflen)
 		nicenum(num, buf, buflen);
 }
 /*
  * Format a byte count into 'buf', writing at most 'buflen' bytes.
  *
  * When dump_opt['P'] is set the exact value is printed in decimal;
  * otherwise formatting is delegated to zfs_nicebytes().
  */
 static void
 zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen)
 {
 	if (dump_opt['P']) {
 		/* %llu consumes an unsigned argument, so cast to match. */
 		(void) snprintf(buf, buflen, "%llu",
 		    (unsigned long long)bytes);
 	} else {
 		zfs_nicebytes(bytes, buf, buflen);
 	}
 }
 static const char histo_stars[] = "****************************************";
 static const uint64_t histo_width = sizeof (histo_stars) - 1;
@@ -2081,6 +2094,76 @@ dump_all_ddts(spa_t *spa)
 	dump_dedup_ratio(&dds_total);
 }
 /*
  * Print block reference table (BRT) information for 'spa'.
  *
  * A one-line notice is printed and nothing else when the block cloning
  * feature is not enabled, or is enabled but not active.  Otherwise the
  * amount of detail depends on dump_opt['T']:
  *   below 2   pool-wide used space, saved space and ratio only
  *   2 or more per-vdev reference count, used and saved space as well
  *   3 or more every BRT entry as "vdev:offset" with its reference count
  *
  * The ratio returned by brt_get_ratio() is printed as hundredths
  * (ratio / 100 "." ratio % 100).
  */
 static void
 dump_brt(spa_t *spa)
 {
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) {
 		printf("BRT: unsupported on this pool\n");
 		return;
 	}
 	if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
 		printf("BRT: empty\n");
 		return;
 	}
 	brt_t *brt = spa->spa_brt;
 	VERIFY(brt);
 	char count[32], used[32], saved[32];
 	zdb_nicebytes(brt_get_used(spa), used, sizeof (used));
 	zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));
 	uint64_t ratio = brt_get_ratio(spa);
 	printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved,
 	    (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100));
 	if (dump_opt['T'] < 2)
 		return;
 	/*
 	 * Per-vdev summary.  brtvd is the address of an array element and
 	 * therefore never NULL, so no NULL check is needed.
 	 */
 	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
 		if (!brtvd->bv_initiated) {
 			printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
 			continue;
 		}
 		zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count));
 		zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used));
 		zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved));
 		printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n",
 		    vdevid, count, used, saved);
 	}
 	if (dump_opt['T'] < 3)
 		return;
 	/* Full listing: walk each initiated vdev's entries ZAP. */
 	char dva[64];
 	printf("\n%-16s %-10s\n", "DVA", "REFCNT");
 	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
 		if (!brtvd->bv_initiated)
 			continue;
 		zap_cursor_t zc;
 		zap_attribute_t za;
 		for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
 		    zap_cursor_retrieve(&zc, &za) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t offset;
 			uint64_t refcnt = za.za_first_integer;
 			/*
 			 * The entry name carries the raw 64-bit offset.
 			 * Copy it out instead of dereferencing a cast
 			 * pointer, which avoids aliasing and alignment
 			 * assumptions about za_name.
 			 */
 			memcpy(&offset, za.za_name, sizeof (offset));
 			(void) snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx",
 			    vdevid, (u_longlong_t)offset);
 			printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt);
 		}
 		zap_cursor_fini(&zc);
 	}
 }
 static void
 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
@@ -8093,6 +8176,9 @@ dump_zpool(spa_t *spa)
 	if (dump_opt['D'])
 		dump_all_ddts(spa);
 	if (dump_opt['T'])
 		dump_brt(spa);
 	if (dump_opt['d'] > 2 || dump_opt['m'])
 		dump_metaslabs(spa);
 	if (dump_opt['M'])
@@ -8879,6 +8965,7 @@ main(int argc, char **argv)
 		{"io-stats",		no_argument,		NULL, 's'},
 		{"simulate-dedup",	no_argument,		NULL, 'S'},
 		{"txg",			required_argument,	NULL, 't'},
 		{"brt-stats",		no_argument,		NULL, 'T'},
 		{"uberblock",		no_argument,		NULL, 'u'},
 		{"cachefile",		required_argument,	NULL, 'U'},
 		{"verbose",		no_argument,		NULL, 'v'},
@@ -8892,7 +8979,7 @@ main(int argc, char **argv)
 	};
 	while ((c = getopt_long(argc, argv,
-	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ",
+	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ",
 	    long_options, NULL)) != -1) {
 		switch (c) {
 		case 'b':
@@ -8914,6 +9001,7 @@ main(int argc, char **argv)
 		case 'R':
 		case 's':
 		case 'S':
 		case 'T':
 		case 'u':
 		case 'y':
 		case 'Z':
@@ -9076,22 +9164,6 @@ main(int argc, char **argv)
 	if (dump_opt['l'])
 		return (dump_label(argv[0]));
 	if (dump_opt['O']) {
 		if (argc != 2)
 			usage();
 		dump_opt['v'] = verbose + 3;
 		return (dump_path(argv[0], argv[1], NULL));
 	}
 	if (dump_opt['r']) {
 		target_is_spa = B_FALSE;
 		if (argc != 3)
 			usage();
 		dump_opt['v'] = verbose;
 		error = dump_path(argv[0], argv[1], &object);
 		if (error != 0)
 			fatal("internal error: %s", strerror(error));
 	}
 	if (dump_opt['X'] || dump_opt['F'])
 		rewind = ZPOOL_DO_REWIND |
 		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
@@ -9192,6 +9264,29 @@ main(int argc, char **argv)
 		searchdirs = NULL;
 	}
 	/*
 	 * We need to make sure to process -O option or call
 	 * dump_path after the -e option has been processed,
 	 * which imports the pool to the namespace if it's
 	 * not in the cachefile.
 	 */
 	if (dump_opt['O']) {
 		if (argc != 2)
 			usage();
 		dump_opt['v'] = verbose + 3;
 		return (dump_path(argv[0], argv[1], NULL));
 	}
 	if (dump_opt['r']) {
 		target_is_spa = B_FALSE;
 		if (argc != 3)
 			usage();
 		dump_opt['v'] = verbose;
 		error = dump_path(argv[0], argv[1], &object);
 		if (error != 0)
 			fatal("internal error: %s", strerror(error));
 	}
 	/*
 	 * import_checkpointed_state makes the assumption that the
 	 * target pool that we pass it is already part of the spa
@@ -24,6 +24,7 @@
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016, 2017, Intel Corporation.
 * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 * Copyright (c) 2023, Klara Inc.
 */
 /*
@@ -146,6 +147,17 @@ zfs_unavail_pool(zpool_handle_t *zhp, void *data)
 	return (0);
 }
 /*
  * Send the first 'lines_cnt' strings of 'lines' to the zed log, each as
  * its own LOG_INFO message.
  */
 static void
 lines_to_zed_log_msg(char **lines, int lines_cnt)
 {
 	int idx = 0;
 	while (idx < lines_cnt) {
 		zed_log_msg(LOG_INFO, "%s", lines[idx]);
 		idx++;
 	}
 }
 /*
 * Two stage replace on Linux
 * since we get disk notifications
@@ -193,14 +205,21 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	uint64_t is_spare = 0;
 	const char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
 	char rawpath[PATH_MAX], fullpath[PATH_MAX];
-	char devpath[PATH_MAX];
+	char pathbuf[PATH_MAX];
 	int ret;
 	int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;
 	boolean_t is_sd = B_FALSE;
 	boolean_t is_mpath_wholedisk = B_FALSE;
 	uint_t c;
 	vdev_stat_t *vs;
 	char **lines = NULL;
 	int lines_cnt = 0;
 	/*
 	 * Get the persistent path, typically under the '/dev/disk/by-id' or
 	 * '/dev/disk/by-vdev' directories.  Note that this path can change
 	 * when a vdev is replaced with a new disk.
 	 */
 	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
 		return;
@@ -357,15 +376,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
 	    is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
-	if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
+	if (realpath(rawpath, pathbuf) == NULL && !is_mpath_wholedisk) {
 		zed_log_msg(LOG_INFO, "  realpath: %s failed (%s)",
 		    rawpath, strerror(errno));
-		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
+		int err = zpool_vdev_online(zhp, fullpath,
-		    &newstate);
+		    ZFS_ONLINE_FORCEFAULT, &newstate);
-		zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s)",
+		zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s) "
-		    fullpath, libzfs_error_description(g_zfshdl));
+		    "err %d, new state %d",
 		    fullpath, libzfs_error_description(g_zfshdl), err,
 		    err ? (int)newstate : 0);
 		return;
 	}
@@ -383,6 +404,22 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	if (is_mpath_wholedisk) {
 		/* Don't label device mapper or multipath disks. */
 		zed_log_msg(LOG_INFO,
 		    "  it's a multipath wholedisk, don't label");
 		if (zpool_prepare_disk(zhp, vdev, "autoreplace", &lines,
 		    &lines_cnt) != 0) {
 			zed_log_msg(LOG_INFO,
 			    "  zpool_prepare_disk: could not "
 			    "prepare '%s' (%s)", fullpath,
 			    libzfs_error_description(g_zfshdl));
 			if (lines_cnt > 0) {
 				zed_log_msg(LOG_INFO,
 				    "  zfs_prepare_disk output:");
 				lines_to_zed_log_msg(lines, lines_cnt);
 			}
 			libzfs_free_str_array(lines, lines_cnt);
 			return;
 		}
 	} else if (!labeled) {
 		/*
 		 * we're auto-replacing a raw disk, so label it first
@@ -399,16 +436,24 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 		 * to trigger a ZFS fault for the device (and any hot spare
 		 * replacement).
 		 */
-		leafname = strrchr(devpath, '/') + 1;
+		leafname = strrchr(pathbuf, '/') + 1;
 		/*
 		 * If this is a request to label a whole disk, then attempt to
 		 * write out the label.
 		 */
-		if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) {
+		if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname,
-			zed_log_msg(LOG_INFO, "  zpool_label_disk: could not "
+		    vdev, "autoreplace", &lines, &lines_cnt) != 0) {
 			zed_log_msg(LOG_WARNING,
 			    "  zpool_prepare_and_label_disk: could not "
 			    "label '%s' (%s)", leafname,
 			    libzfs_error_description(g_zfshdl));
 			if (lines_cnt > 0) {
 				zed_log_msg(LOG_INFO,
 				"  zfs_prepare_disk output:");
 				lines_to_zed_log_msg(lines, lines_cnt);
 			}
 			libzfs_free_str_array(lines, lines_cnt);
 			(void) zpool_vdev_online(zhp, fullpath,
 			    ZFS_ONLINE_FORCEFAULT, &newstate);
@@ -431,7 +476,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 		    sizeof (device->pd_physpath));
 		list_insert_tail(&g_device_list, device);
-		zed_log_msg(LOG_INFO, "  zpool_label_disk: async '%s' (%llu)",
+		zed_log_msg(LOG_NOTICE, "  zpool_label_disk: async '%s' (%llu)",
 		    leafname, (u_longlong_t)guid);
 		return;	/* resumes at EC_DEV_ADD.ESC_DISK for partition */
@@ -454,8 +499,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 		}
 		if (!found) {
 			/* unexpected partition slice encountered */
-			zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
+			zed_log_msg(LOG_WARNING, "labeled disk %s was "
-			    fullpath);
+			    "unexpected here", fullpath);
 			(void) zpool_vdev_online(zhp, fullpath,
 			    ZFS_ONLINE_FORCEFAULT, &newstate);
 			return;
@@ -464,10 +509,21 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 		zed_log_msg(LOG_INFO, "  zpool_label_disk: resume '%s' (%llu)",
 		    physpath, (u_longlong_t)guid);
-		(void) snprintf(devpath, sizeof (devpath), "%s%s",
+		/*
-		    DEV_BYID_PATH, new_devid);
+		 * Paths that begin with '/dev/disk/by-id/' will change and so
 		 * they must be updated before calling zpool_vdev_attach().
 		 */
 		if (strncmp(path, DEV_BYID_PATH, strlen(DEV_BYID_PATH)) == 0) {
 			(void) snprintf(pathbuf, sizeof (pathbuf), "%s%s",
 			    DEV_BYID_PATH, new_devid);
 			zed_log_msg(LOG_INFO, "  zpool_label_disk: path '%s' "
 			    "replaced by '%s'", path, pathbuf);
 			path = pathbuf;
 		}
 	}
 	libzfs_free_str_array(lines, lines_cnt);
 	/*
 	 * Construct the root vdev to pass to zpool_vdev_attach().  While adding
 	 * the entire vdev structure is harmless, we construct a reduced set of
@@ -506,9 +562,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	 * Wait for udev to verify the links exist, then auto-replace
 	 * the leaf disk at same physical location.
 	 */
-	if (zpool_label_disk_wait(path, 3000) != 0) {
+	if (zpool_label_disk_wait(path, DISK_LABEL_WAIT) != 0) {
-		zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
+		zed_log_msg(LOG_WARNING, "zfs_mod: pool '%s', after labeling "
-		    "disk %s is missing", path);
+		    "replacement disk, the expected disk partition link '%s' "
 		    "is missing after waiting %u ms",
 		    zpool_get_name(zhp), path, DISK_LABEL_WAIT);
 		nvlist_free(nvroot);
 		return;
 	}
@@ -523,7 +581,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 		    B_TRUE, B_FALSE);
 	}
-	zed_log_msg(LOG_INFO, "  zpool_vdev_replace: %s with %s (%s)",
+	zed_log_msg(LOG_WARNING, "  zpool_vdev_replace: %s with %s (%s)",
 	    fullpath, path, (ret == 0) ? "no errors" :
 	    libzfs_error_description(g_zfshdl));
@@ -621,7 +679,7 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
 		    dp->dd_prop, path);
 		dp->dd_found = B_TRUE;
-		/* pass the new devid for use by replacing code */
+		/* pass the new devid for use by auto-replacing code */
 		if (dp->dd_new_devid != NULL) {
 			(void) nvlist_add_string(nvl, "new_devid",
 			    dp->dd_new_devid);
@@ -1,6 +1,9 @@
 # Features which are supported by GRUB2
 allocation_classes
 async_destroy
 block_cloning
 bookmarks
 device_rebuild
 embedded_data
 empty_bpobj
 enabled_txg
@@ -9,6 +12,12 @@ filesystem_limits
 hole_birth
 large_blocks
 livelist
 log_spacemap
 lz4_compress
 project_quota
 resilver_defer
 spacemap_histogram
 spacemap_v2
 userobj_accounting
 zilsaxattr
 zpool_checkpoint
@@ -6,7 +6,6 @@ edonr
 embedded_data
 empty_bpobj
 enabled_txg
 encryption
 extensible_dataset
 filesystem_limits
 hole_birth
@@ -443,37 +443,22 @@ vdev_run_cmd(vdev_cmd_data_t *data, char *cmd)
 {
 	int rc;
 	char *argv[2] = {cmd};
-	char *env[5] = {(char *)"PATH=/bin:/sbin:/usr/bin:/usr/sbin"};
+	char **env;
 	char **lines = NULL;
 	int lines_cnt = 0;
 	int i;
-	/* Setup our custom environment variables */
+	env = zpool_vdev_script_alloc_env(data->pool, data->path, data->upath,
-	rc = asprintf(&env[1], "VDEV_PATH=%s",
+	    data->vdev_enc_sysfs_path, NULL, NULL);
-	    data->path ? data->path : "");
+	if (env == NULL)
 	if (rc == -1) {
 		env[1] = NULL;
 		goto out;
 	}
 	rc = asprintf(&env[2], "VDEV_UPATH=%s",
 	    data->upath ? data->upath : "");
 	if (rc == -1) {
 		env[2] = NULL;
 		goto out;
 	}
 	rc = asprintf(&env[3], "VDEV_ENC_SYSFS_PATH=%s",
 	    data->vdev_enc_sysfs_path ?
 	    data->vdev_enc_sysfs_path : "");
 	if (rc == -1) {
 		env[3] = NULL;
 		goto out;
 	}
 	/* Run the command */
 	rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env, &lines,
 	    &lines_cnt);
 	zpool_vdev_script_free_env(env);
 	if (rc != 0)
 		goto out;
@@ -485,10 +470,6 @@ vdev_run_cmd(vdev_cmd_data_t *data, char *cmd)
 out:
 	if (lines != NULL)
 		libzfs_free_str_array(lines, lines_cnt);
 	/* Start with i = 1 since env[0] was statically allocated */
 	for (i = 1; i < ARRAY_SIZE(env); i++)
 		free(env[i]);
 }
 /*
@@ -126,6 +126,10 @@ vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv,
 void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl);
 void free_vdev_cmd_data(vdev_cmd_data_t *data);
 int vdev_run_cmd_simple(char *path, char *cmd);
 int check_device(const char *path, boolean_t force,
    boolean_t isspare, boolean_t iswholedisk);
 boolean_t check_sector_size_database(char *path, int *sector_size);
@@ -936,6 +936,15 @@ zero_label(const char *path)
 	return (0);
 }
 /*
  * Write the first 'lines_cnt' strings of 'lines' to stderr, each
  * followed by a newline.
  */
 static void
 lines_to_stderr(char *lines[], int lines_cnt)
 {
 	int idx = 0;
 	while (idx < lines_cnt) {
 		fprintf(stderr, "%s\n", lines[idx]);
 		idx++;
 	}
 }
 /*
 * Go through and find any whole disks in the vdev specification, labelling them
 * as appropriate.  When constructing the vdev spec, we were unable to open this
@@ -947,7 +956,7 @@ zero_label(const char *path)
 * need to get the devid after we label the disk.
 */
 static int
-make_disks(zpool_handle_t *zhp, nvlist_t *nv)
+make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing)
 {
 	nvlist_t **child;
 	uint_t c, children;
@@ -1032,6 +1041,8 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
 		 */
 		if (!is_exclusive && !is_spare(NULL, udevpath)) {
 			char *devnode = strrchr(devpath, '/') + 1;
 			char **lines = NULL;
 			int lines_cnt = 0;
 			ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
 			if (ret == 0) {
@@ -1043,9 +1054,27 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
 			/*
 			 * When labeling a pool the raw device node name
 			 * is provided as it appears under /dev/.
 			 *
 			 * Note that 'zhp' will be NULL when we're creating a
 			 * pool.
 			 */
-			if (zpool_label_disk(g_zfs, zhp, devnode) == -1)
+			if (zpool_prepare_and_label_disk(g_zfs, zhp, devnode,
 			    nv, zhp == NULL ? "create" :
 			    replacing ? "replace" : "add", &lines,
 			    &lines_cnt) != 0) {
 				(void) fprintf(stderr,
 				    gettext(
 				    "Error preparing/labeling disk.\n"));
 				if (lines_cnt > 0) {
 					(void) fprintf(stderr,
 					gettext("zfs_prepare_disk output:\n"));
 					lines_to_stderr(lines, lines_cnt);
 				}
 				libzfs_free_str_array(lines, lines_cnt);
 				return (-1);
 			}
 			libzfs_free_str_array(lines, lines_cnt);
 			/*
 			 * Wait for udev to signal the device is available
@@ -1082,19 +1111,19 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
 	}
 	for (c = 0; c < children; c++)
-		if ((ret = make_disks(zhp, child[c])) != 0)
+		if ((ret = make_disks(zhp, child[c], replacing)) != 0)
 			return (ret);
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
 	    &child, &children) == 0)
 		for (c = 0; c < children; c++)
-			if ((ret = make_disks(zhp, child[c])) != 0)
+			if ((ret = make_disks(zhp, child[c], replacing)) != 0)
 				return (ret);
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
 	    &child, &children) == 0)
 		for (c = 0; c < children; c++)
-			if ((ret = make_disks(zhp, child[c])) != 0)
+			if ((ret = make_disks(zhp, child[c], replacing)) != 0)
 				return (ret);
 	return (0);
@@ -1752,7 +1781,7 @@ split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
 			return (NULL);
 		}
-		if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
+		if (!flags.dryrun && make_disks(zhp, newroot, B_FALSE) != 0) {
 			nvlist_free(newroot);
 			return (NULL);
 		}
@@ -1873,7 +1902,7 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
 	/*
 	 * Run through the vdev specification and label any whole disks found.
 	 */
-	if (!dryrun && make_disks(zhp, newroot) != 0) {
+	if (!dryrun && make_disks(zhp, newroot, replacing) != 0) {
 		nvlist_free(newroot);
 		return (NULL);
 	}
@@ -33,6 +33,7 @@ AM_CPPFLAGS += -D_REENTRANT
 AM_CPPFLAGS += -D_FILE_OFFSET_BITS=64
 AM_CPPFLAGS += -D_LARGEFILE64_SOURCE
 AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\"
 AM_CPPFLAGS += -DZFSEXECDIR=\"$(zfsexecdir)\"
 AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\"
 AM_CPPFLAGS += -DSBINDIR=\"$(sbindir)\"
 AM_CPPFLAGS += -DSYSCONFDIR=\"$(sysconfdir)\"
@@ -0,0 +1,36 @@
 dnl #
 dnl # 6.6 API change,
 dnl # fsync_bdev was removed in favor of sync_blockdev
 dnl #
 dnl # ZFS_AC_KERNEL_SRC_SYNC_BDEV declares two test sources, named
 dnl # "fsync_bdev" and "sync_blockdev".  Each includes <linux/blkdev.h>
 dnl # and calls the corresponding function with a NULL argument.  The
 dnl # test names are the ones looked up by ZFS_AC_KERNEL_SYNC_BDEV.
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_SYNC_BDEV], [
 	ZFS_LINUX_TEST_SRC([fsync_bdev], [
 		#include <linux/blkdev.h>
 	],[
 		fsync_bdev(NULL);
 	])
 	ZFS_LINUX_TEST_SRC([sync_blockdev], [
 		#include <linux/blkdev.h>
 	],[
 		sync_blockdev(NULL);
 	])
 ])
 dnl #
 dnl # Report which block-device sync helper is available.
 dnl # The "fsync_bdev" test result is consulted first and defines
 dnl # HAVE_FSYNC_BDEV on success.  Otherwise "no" is reported for it and
 dnl # the "sync_blockdev" test result is consulted, defining
 dnl # HAVE_SYNC_BLOCKDEV on success.  If both fail, an error is raised
 dnl # through ZFS_LINUX_TEST_ERROR.
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SYNC_BDEV], [
 	AC_MSG_CHECKING([whether fsync_bdev() exists])
 	ZFS_LINUX_TEST_RESULT([fsync_bdev], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_FSYNC_BDEV, 1,
 		    [fsync_bdev() is declared in include/blkdev.h])
 	],[
 		AC_MSG_RESULT(no)
 		AC_MSG_CHECKING([whether sync_blockdev() exists])
 		ZFS_LINUX_TEST_RESULT([sync_blockdev], [
 			AC_MSG_RESULT(yes)
 			AC_DEFINE(HAVE_SYNC_BLOCKDEV, 1,
 			    [sync_blockdev() is declared in include/blkdev.h])
 		],[
 			ZFS_LINUX_TEST_ERROR(
 			    [neither fsync_bdev() nor sync_blockdev() exist])
 		])
 	])
 ])
@@ -7,6 +7,10 @@ dnl #
 dnl # 6.3 API
 dnl # generic_fillattr() now takes struct mnt_idmap* as the first argument
 dnl #
 dnl # 6.6 API
 dnl # generic_fillattr() now takes u32 as second argument, representing a
 dnl # request_mask for statx
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR], [
 	ZFS_LINUX_TEST_SRC([generic_fillattr_userns], [
 		#include <linux/fs.h>
@@ -25,22 +29,39 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR], [
 		struct kstat *k = NULL;
 		generic_fillattr(idmap, in, k);
 	])
 	ZFS_LINUX_TEST_SRC([generic_fillattr_mnt_idmap_reqmask], [
 		#include <linux/fs.h>
 	],[
 		struct mnt_idmap *idmap = NULL;
 		struct inode *in = NULL;
 		struct kstat *k = NULL;
 		generic_fillattr(idmap, 0, in, k);
 	])
 ])
 AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FILLATTR], [
-	AC_MSG_CHECKING([whether generic_fillattr requires struct mnt_idmap*])
+	AC_MSG_CHECKING(
-	ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap], [
+	    [whether generic_fillattr requires struct mnt_idmap* and request_mask])
 	ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap_reqmask], [
 		AC_MSG_RESULT([yes])
-		AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP, 1,
+		AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK, 1,
-		    [generic_fillattr requires struct mnt_idmap*])
+		    [generic_fillattr requires struct mnt_idmap* and u32 request_mask])
 	],[
-		AC_MSG_CHECKING([whether generic_fillattr requires struct user_namespace*])
+		AC_MSG_CHECKING([whether generic_fillattr requires struct mnt_idmap*])
-		ZFS_LINUX_TEST_RESULT([generic_fillattr_userns], [
+		ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap], [
 			AC_MSG_RESULT([yes])
-			AC_DEFINE(HAVE_GENERIC_FILLATTR_USERNS, 1,
+			AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP, 1,
-			    [generic_fillattr requires struct user_namespace*])
+				[generic_fillattr requires struct mnt_idmap*])
 		],[
-			AC_MSG_RESULT([no])
+			AC_MSG_CHECKING([whether generic_fillattr requires struct user_namespace*])
 			ZFS_LINUX_TEST_RESULT([generic_fillattr_userns], [
 				AC_MSG_RESULT([yes])
 				AC_DEFINE(HAVE_GENERIC_FILLATTR_USERNS, 1,
 					[generic_fillattr requires struct user_namespace*])
 			],[
 				AC_MSG_RESULT([no])
 			])
 		])
 	])
 ])
@@ -27,6 +27,31 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [
 		memset(&ip, 0, sizeof(ip));
 		ts = ip.i_mtime;
 	])
 	dnl #
 	dnl # 6.6 API change
 	dnl # i_ctime no longer directly accessible, must use
 	dnl # inode_get_ctime(ip), inode_set_ctime*(ip) to
 	dnl # read/write.
 	dnl #
 	ZFS_LINUX_TEST_SRC([inode_get_ctime], [
 		#include <linux/fs.h>
 	],[
 		struct inode ip;
 		memset(&ip, 0, sizeof(ip));
 		inode_get_ctime(&ip);
 	])
 	ZFS_LINUX_TEST_SRC([inode_set_ctime_to_ts], [
 		#include <linux/fs.h>
 	],[
 		struct inode ip;
 		struct timespec64 ts = {0};
 		memset(&ip, 0, sizeof(ip));
 		inode_set_ctime_to_ts(&ip, ts);
 	])
 ])
 AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [
@@ -47,4 +72,22 @@ AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [
 		AC_DEFINE(HAVE_INODE_TIMESPEC64_TIMES, 1,
 		    [inode->i_*time's are timespec64])
 	])
 	AC_MSG_CHECKING([whether inode_get_ctime() exists])
 	ZFS_LINUX_TEST_RESULT([inode_get_ctime], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_INODE_GET_CTIME, 1,
 		    [inode_get_ctime() exists in linux/fs.h])
 	],[
 		AC_MSG_RESULT(no)
 	])
 	AC_MSG_CHECKING([whether inode_set_ctime_to_ts() exists])
 	ZFS_LINUX_TEST_RESULT([inode_set_ctime_to_ts], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_INODE_SET_CTIME_TO_TS, 1,
 		    [inode_set_ctime_to_ts() exists in linux/fs.h])
 	],[
 		AC_MSG_RESULT(no)
 	])
 ])
@@ -162,6 +162,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_RECLAIMED
 	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
 	ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SRC_SYNC_BDEV
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -303,6 +304,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_RECLAIMED
 	ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
 	ZFS_AC_KERNEL_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SYNC_BDEV
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_CPU_HAS_FEATURE
@@ -67,6 +67,7 @@ ZFS_AC_DEBUG_INVARIANTS
 AC_CONFIG_FILES([
 	contrib/debian/rules
 	contrib/debian/changelog
 	Makefile
 	include/Makefile
 	lib/libzfs/libzfs.pc
@@ -1,3 +1,9 @@
 openzfs-linux (@VERSION@-1) unstable; urgency=low
  * OpenZFS @VERSION@ is tagged.
 -- Umer Saleem <usaleem@ixsystems.com>  Wed, 15 Nov 2023 15:00:00 +0500
 openzfs-linux (2.2.0-0) unstable; urgency=low
  * OpenZFS 2.2.0 is tagged.
@@ -197,7 +197,6 @@ Recommends: openzfs-zfs-zed, openzfs-zfsutils (>= ${source:Version}), ${linux:Re
 Suggests: debhelper
 Breaks: spl-dkms (<< 0.8.0~rc1)
 Replaces: spl-dkms, zfs-dkms
 Conflicts: zfs-dkms
 Provides: openzfs-zfs-modules
 Description: OpenZFS filesystem kernel modules for Linux
 OpenZFS is a storage platform that encompasses the functionality of
@@ -34,6 +34,7 @@ usr/bin/zvol_wait
 usr/lib/modules-load.d/ lib/
 usr/lib/zfs-linux/zpool.d/
 usr/lib/zfs-linux/zpool_influxdb
 usr/lib/zfs-linux/zfs_prepare_disk
 usr/sbin/arc_summary
 usr/sbin/arcstat
 usr/sbin/dbufstat
@@ -87,6 +88,7 @@ usr/share/man/man8/zfs-wait.8
 usr/share/man/man8/zfs-zone.8
 usr/share/man/man8/zfs.8
 usr/share/man/man8/zfs_ids_to_path.8
 usr/share/man/man8/zfs_prepare_disk.8
 usr/share/man/man7/zfsconcepts.7
 usr/share/man/man7/zfsprops.7
 usr/share/man/man8/zgenhostid.8
@@ -33,6 +33,7 @@ COMMON_H = \
 	sys/bqueue.h \
 	sys/btree.h \
 	sys/brt.h \
 	sys/brt_impl.h \
 	sys/dataset_kstats.h \
 	sys/dbuf.h \
 	sys/ddt.h \
@@ -326,6 +326,15 @@ _LIBZFS_H nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *,
    boolean_t *, boolean_t *, boolean_t *);
 _LIBZFS_H int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *,
    const char *);
 _LIBZFS_H int zpool_prepare_disk(zpool_handle_t *zhp, nvlist_t *vdev_nv,
    const char *prepare_str, char **lines[], int *lines_cnt);
 _LIBZFS_H int zpool_prepare_and_label_disk(libzfs_handle_t *hdl,
    zpool_handle_t *, const char *, nvlist_t *vdev_nv, const char *prepare_str,
    char **lines[], int *lines_cnt);
 _LIBZFS_H char ** zpool_vdev_script_alloc_env(const char *pool_name,
    const char *vdev_path, const char *vdev_upath,
    const char *vdev_enc_sysfs_path, const char *opt_key, const char *opt_val);
 _LIBZFS_H void zpool_vdev_script_free_env(char **env);
 _LIBZFS_H uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp,
    const char *path);
@@ -34,7 +34,7 @@ extern "C" {
 #endif
 /*
- * Default wait time for a device name to be created.
+ * Default wait time in milliseconds for a device name to be created.
 */
 #define	DISK_LABEL_WAIT		(30 * 1000)  /* 30 seconds */
@@ -64,6 +64,7 @@ typedef enum {
 } while (0)
 #define	mutex_destroy(lock)	sx_destroy(lock)
 #define	mutex_enter(lock)	sx_xlock(lock)
 #define	mutex_enter_interruptible(lock)	sx_xlock_sig(lock)
 #define	mutex_enter_nested(lock, type)	sx_xlock(lock)
 #define	mutex_tryenter(lock)	sx_try_xlock(lock)
 #define	mutex_exit(lock)	sx_xunlock(lock)
@@ -30,9 +30,9 @@
 #include <sys/types.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/taskqueue.h>
 #include <sys/thread.h>
 #include <sys/ck.h>
 #ifdef	__cplusplus
 extern "C" {
@@ -48,16 +48,16 @@ typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
 typedef struct taskq_ent {
-	struct task	 tqent_task;
+	union {
-	struct timeout_task tqent_timeout_task;
+		struct task	 tqent_task;
 		struct timeout_task tqent_timeout_task;
 	};
 	task_func_t	*tqent_func;
 	void		*tqent_arg;
-	taskqid_t tqent_id;
+	taskqid_t	 tqent_id;
-	CK_LIST_ENTRY(taskq_ent) tqent_hash;
+	LIST_ENTRY(taskq_ent) tqent_hash;
-	uint8_t tqent_type;
+	uint_t		 tqent_type;
-	uint8_t tqent_registered;
+	volatile uint_t	 tqent_rc;
 	uint8_t tqent_cancelled;
 	volatile uint32_t tqent_rc;
 } taskq_ent_t;
 /*
@@ -101,7 +101,7 @@ void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
 void vfs_clearmntopt(vfs_t *vfsp, const char *name);
 int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp);
 int mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype,
-    char *fspath, char *fspec, int fsflags);
+    char *fspath, char *fspec, int fsflags, vfs_t *parent_vfsp);
 typedef	uint64_t	vfs_feature_t;
@@ -56,6 +56,7 @@ enum symfollow { NO_FOLLOW = NOFOLLOW };
 #ifndef IN_BASE
 #include_next <sys/vnode.h>
 #endif
 #include <sys/ccompat.h>
 #include <sys/mount.h>
 #include <sys/cred.h>
 #include <sys/fcntl.h>
@@ -104,7 +105,7 @@ vn_flush_cached_data(vnode_t *vp, boolean_t sync)
 		zfs_vmobject_wlock(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, flags);
 		zfs_vmobject_wunlock(vp->v_object);
-		VOP_UNLOCK(vp);
+		VOP_UNLOCK1(vp);
 	}
 }
 #endif
@@ -286,6 +286,7 @@ typedef struct zfid_long {
 extern uint_t zfs_fsyncer_key;
 extern int zfs_super_owner;
 extern int zfs_bclone_enabled;
 extern void zfs_init(void);
 extern void zfs_fini(void);
@@ -461,10 +461,16 @@ zpl_is_32bit_api(void)
 * 6.3 API change
 * generic_fillattr() first arg is changed to struct mnt_idmap *
 *
 * 6.6 API change
 * generic_fillattr() gets new second arg request_mask, a u32 type
 *
 */
 #ifdef HAVE_GENERIC_FILLATTR_IDMAP
 #define	zpl_generic_fillattr(idmap, ip, sp)	\
    generic_fillattr(idmap, ip, sp)
 #elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK)
 #define	zpl_generic_fillattr(idmap, rqm, ip, sp)	\
    generic_fillattr(idmap, rqm, ip, sp)
 #elif defined(HAVE_GENERIC_FILLATTR_USERNS)
 #define	zpl_generic_fillattr(user_ns, ip, sp)	\
    generic_fillattr(user_ns, ip, sp)
@@ -108,7 +108,7 @@ typedef struct spl_kmem_magazine {
 	uint32_t		skm_refill;	/* Batch refill size */
 	struct spl_kmem_cache	*skm_cache;	/* Owned by cache */
 	unsigned int		skm_cpu;	/* Owned by cpu */
-	void			*skm_objs[0];	/* Object pointers */
+	void			*skm_objs[];	/* Object pointers */
 } spl_kmem_magazine_t;
 typedef struct spl_kmem_obj {
@@ -128,7 +128,6 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp)			\
 #define	NESTED_SINGLE 1
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define	mutex_enter_nested(mp, subclass)			\
 {								\
 	ASSERT3P(mutex_owner(mp), !=, current);			\
@@ -137,16 +136,22 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp)			\
 	spl_mutex_lockdep_on_maybe(mp);				\
 	spl_mutex_set_owner(mp);				\
 }
-#else /* CONFIG_DEBUG_LOCK_ALLOC */
+
-#define	mutex_enter_nested(mp, subclass)			\
+#define	mutex_enter_interruptible(mp)				\
-{								\
+/* CSTYLED */							\
 ({								\
 	int _rc_;						\
 								\
 	ASSERT3P(mutex_owner(mp), !=, current);			\
 	spl_mutex_lockdep_off_maybe(mp);			\
-	mutex_lock(MUTEX(mp));					\
+	_rc_ = mutex_lock_interruptible(MUTEX(mp));		\
 	spl_mutex_lockdep_on_maybe(mp);				\
-	spl_mutex_set_owner(mp);				\
+	if (!_rc_) {						\
-}
+		spl_mutex_set_owner(mp);			\
-#endif /*  CONFIG_DEBUG_LOCK_ALLOC */
+	}							\
 								\
 	_rc_;							\
 })
 #define	mutex_enter(mp) mutex_enter_nested((mp), 0)
@@ -73,13 +73,6 @@ typedef struct zfs_uio {
 	size_t		uio_skip;
 	struct request	*rq;
 	/*
 	 * Used for saving rq_for_each_segment() state between calls
 	 * to zfs_uiomove_bvec_rq().
 	 */
 	struct req_iterator iter;
 	struct bio_vec bv;
 } zfs_uio_t;
@@ -138,7 +131,6 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq)
 	} else {
 		uio->uio_bvec = NULL;
 		uio->uio_iovcnt = 0;
 		memset(&uio->iter, 0, sizeof (uio->iter));
 	}
 	uio->uio_loffset = io_offset(bio, rq);
@@ -45,6 +45,8 @@ extern "C" {
 typedef struct zfsvfs zfsvfs_t;
 struct znode;
 extern int zfs_bclone_enabled;
 /*
 * This structure emulates the vfs_t from other platforms.  It's purpose
 * is to facilitate the handling of mount options and minimize structural
@@ -56,7 +56,12 @@ extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap,
 extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd,
    cred_t *cr, int flags);
 extern int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr);
 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
 extern int zfs_getattr_fast(zidmap_t *, u32 request_mask, struct inode *ip,
    struct kstat *sp);
 #else
 extern int zfs_getattr_fast(zidmap_t *, struct inode *ip, struct kstat *sp);
 #endif
 extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr,
    zidmap_t *mnt_ns);
 extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp,
@@ -60,7 +60,7 @@ extern const struct file_operations zpl_file_operations;
 extern const struct file_operations zpl_dir_file_operations;
 /* zpl_super.c */
-extern void zpl_prune_sb(int64_t nr_to_scan, void *arg);
+extern void zpl_prune_sb(uint64_t nr_to_scan, void *arg);
 extern const struct super_operations zpl_super_operations;
 extern const struct export_operations zpl_export_operations;
@@ -263,4 +263,15 @@ extern long zpl_ioctl_fideduperange(struct file *filp, void *arg);
 #define	zpl_setattr_prepare(ns, dentry, ia)	setattr_prepare(dentry, ia)
 #endif
 /*
 * zpl_inode_get_ctime(ip) reads the change time of the inode 'ip' points to.
 * When HAVE_INODE_GET_CTIME is defined the call is forwarded to
 * inode_get_ctime(); otherwise the i_ctime member is read directly.
 * The argument is parenthesized in the fallback so that any pointer
 * expression (for example "&inode" or "ip + 1") expands correctly.
 */
 #ifdef HAVE_INODE_GET_CTIME
 #define	zpl_inode_get_ctime(ip)	inode_get_ctime(ip)
 #else
 #define	zpl_inode_get_ctime(ip)	((ip)->i_ctime)
 #endif
 /*
 * zpl_inode_set_ctime_to_ts(ip, ts) stores 'ts' as the change time of the
 * inode 'ip' points to.  When HAVE_INODE_SET_CTIME_TO_TS is defined the call
 * is forwarded to inode_set_ctime_to_ts(); otherwise i_ctime is assigned
 * directly and the expression evaluates to the assigned value.
 * Both arguments are parenthesized in the fallback so that arbitrary
 * expressions can be passed without precedence surprises.
 */
 #ifdef HAVE_INODE_SET_CTIME_TO_TS
 #define	zpl_inode_set_ctime_to_ts(ip, ts)	inode_set_ctime_to_ts(ip, ts)
 #else
 #define	zpl_inode_set_ctime_to_ts(ip, ts)	((ip)->i_ctime = (ts))
 #endif
 #endif	/* _SYS_ZPL_H */
@@ -81,7 +81,7 @@ typedef struct arc_prune arc_prune_t;
 typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb,
    const blkptr_t *bp, arc_buf_t *buf, void *priv);
 typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
-typedef void arc_prune_func_t(int64_t bytes, void *priv);
+typedef void arc_prune_func_t(uint64_t bytes, void *priv);
 /* Shared module parameters */
 extern uint_t zfs_arc_average_blocksize;
@@ -1065,7 +1065,6 @@ extern void arc_wait_for_eviction(uint64_t, boolean_t);
 extern void arc_lowmem_init(void);
 extern void arc_lowmem_fini(void);
 extern void arc_prune_async(uint64_t);
 extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg);
 extern uint64_t arc_free_memory(void);
 extern int64_t arc_available_memory(void);
@@ -0,0 +1,199 @@
 /*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
 /*
 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
 */
 #ifndef _SYS_BRT_IMPL_H
 #define	_SYS_BRT_IMPL_H
 #ifdef	__cplusplus
 extern "C" {
 #endif
 /*
 * BRT - Block Reference Table.
 */
 #define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
 /*
 * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
 * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
 * Each element in this array represents how many BRT entries do we have in this
 * chunk of storage. We always load this entire array into memory and update as
 * needed. By having it in memory we can quickly tell (during zio_free()) if
 * there are any BRT entries that we might need to update.
 *
 * This value cannot be larger than 16MB, at least as long as we support
 * 512 byte block sizes. With 512 byte block size we can have exactly
 * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
 * many for a 16bit counter.
 */
 #define	BRT_RANGESIZE	(16 * 1024 * 1024)
 _Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
 	"BRT_RANGESIZE is too large.");
 /*
 * We don't want to update the whole structure every time. Maintain bitmap
 * of dirty blocks within the regions, so that a single bit represents a
 * block size of entcounts. For example if we have a 1PB vdev then all
 * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
 * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
 * the whole 128MB on disk when we have updated only a single entcount.
 * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
 * is represented by a single bit. This gives us 4096 bits. A set bit in the
 * bitmap means that we had a change in at least one of the 16384 entcounts
 * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
 */
 #define	BRT_BLOCKSIZE	(32 * 1024)
 #define	BRT_RANGESIZE_TO_NBLOCKS(size)					\
 	(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
 #define	BRT_LITTLE_ENDIAN	0
 #define	BRT_BIG_ENDIAN		1
 #ifdef _ZFS_LITTLE_ENDIAN
 #define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
 #define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
 #else
 #define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
 #define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
 #endif
 /*
 * Per-vdev BRT state in its stored form.  Other comments in this header
 * describe brt_vdev_phys as living in the same MOS object as the entcount
 * array and as something that "needs updating on disk", so the field order
 * and widths below must not be changed casually.
 * NOTE(review): the bvp_* fields appear to mirror the like-named bv_* members
 * of brt_vdev_t -- confirm against the code that reads and writes them.
 */
 typedef struct brt_vdev_phys {
 	uint64_t	bvp_mos_entries;
 	uint64_t	bvp_size;
 	/* NOTE(review): presumably BRT_LITTLE_ENDIAN or BRT_BIG_ENDIAN. */
 	uint64_t	bvp_byteorder;
 	uint64_t	bvp_totalcount;
 	uint64_t	bvp_rangesize;
 	uint64_t	bvp_usedspace;
 	uint64_t	bvp_savedspace;
 } brt_vdev_phys_t;
 /*
 * Per-vdev BRT bookkeeping.  brt_t refers to these through its brt_vdevs
 * member.
 */
 typedef struct brt_vdev {
 	/*
 	 * VDEV id.
 	 */
 	uint64_t	bv_vdevid;
 	/*
 	 * Is the structure initiated?
 	 * (bv_entcount and bv_bitmap are allocated?)
 	 */
 	boolean_t	bv_initiated;
 	/*
 	 * Object number in the MOS for the entcount array and brt_vdev_phys.
 	 */
 	uint64_t	bv_mos_brtvdev;
 	/*
 	 * Object number in the MOS for the entries table.
 	 */
 	uint64_t	bv_mos_entries;
 	/*
 	 * Entries to sync.
 	 */
 	avl_tree_t	bv_tree;
 	/*
 	 * Does the bv_entcount[] array need byte swapping?
 	 */
 	boolean_t	bv_need_byteswap;
 	/*
 	 * Number of entries in the bv_entcount[] array.
 	 */
 	uint64_t	bv_size;
 	/*
 	 * This is the array with BRT entry count per BRT_RANGESIZE.
 	 */
 	uint16_t	*bv_entcount;
 	/*
 	 * Sum of all bv_entcount[]s.
 	 */
 	uint64_t	bv_totalcount;
 	/*
 	 * Space on disk occupied by cloned blocks (without compression).
 	 */
 	uint64_t	bv_usedspace;
 	/*
 	 * How much additional space would be occupied without block cloning.
 	 */
 	uint64_t	bv_savedspace;
 	/*
 	 * brt_vdev_phys needs updating on disk.
 	 */
 	boolean_t	bv_meta_dirty;
 	/*
 	 * bv_entcount[] needs updating on disk.
 	 */
 	boolean_t	bv_entcount_dirty;
 	/*
 	 * bv_entcount[] potentially can be a bit too big to synchronize it all
 	 * when we just changed few entcounts. The fields below allow us to
 	 * track updates to bv_entcount[] array since the last sync.
 	 * A single bit in the bv_bitmap represents as many entcounts as can
 	 * fit into a single BRT_BLOCKSIZE.
 	 * For example we have 65536 entcounts in the bv_entcount array
 	 * (so the whole array is 128kB). We updated bv_entcount[2] and
 	 * bv_entcount[5]. In that case only first bit in the bv_bitmap will
 	 * be set and we will write only first BRT_BLOCKSIZE out of 128kB.
 	 */
 	ulong_t		*bv_bitmap;
 	/*
 	 * NOTE(review): presumably the number of BRT_BLOCKSIZE blocks (and
 	 * therefore bits) tracked by bv_bitmap, cf. BRT_RANGESIZE_TO_NBLOCKS()
 	 * -- confirm against the code that sizes the bitmap.
 	 */
 	uint64_t	bv_nblocks;
 } brt_vdev_t;
 /*
 * In-core brt.
 * Top-level BRT state; it points at the per-vdev brt_vdev_t structures via
 * brt_vdevs and keeps one pending tree and one pending mutex per TXG_SIZE
 * slot.
 */
 typedef struct brt {
 	/* NOTE(review): presumably protects the fields of this structure. */
 	krwlock_t	brt_lock;
 	/* The spa_t this table is attached to. */
 	spa_t		*brt_spa;
 	/* Shorthand: "x->brt_mos" expands to "x->brt_spa->spa_meta_objset". */
 #define	brt_mos		brt_spa->spa_meta_objset
 	/* NOTE(review): presumably BRT_RANGESIZE as used by this pool. */
 	uint64_t	brt_rangesize;
 	/*
 	 * NOTE(review): presumably the totals of the per-vdev bv_usedspace and
 	 * bv_savedspace counters -- confirm.
 	 */
 	uint64_t	brt_usedspace;
 	uint64_t	brt_savedspace;
 	/* Pending trees and their mutexes, one of each per TXG_SIZE slot. */
 	avl_tree_t	brt_pending_tree[TXG_SIZE];
 	kmutex_t	brt_pending_lock[TXG_SIZE];
 	/* Sum of all entries across all bv_trees. */
 	uint64_t	brt_nentries;
 	/* NOTE(review): presumably an array of brt_nvdevs elements. */
 	brt_vdev_t	*brt_vdevs;
 	uint64_t	brt_nvdevs;
 } brt_t;
 /* Size of bre_offset / sizeof (uint64_t). */
 #define	BRT_KEY_WORDS	(1)
 /*
 * In-core brt entry.
 * On-disk we use bre_offset as the key and bre_refcount as the value.
 */
 typedef struct brt_entry {
 	/* Key of the entry; BRT_KEY_WORDS is its size in 64-bit words. */
 	uint64_t	bre_offset;
 	/* Value stored under the key. */
 	uint64_t	bre_refcount;
 	/* AVL linkage. NOTE(review): presumably for brt_vdev_t.bv_tree. */
 	avl_node_t	bre_node;
 } brt_entry_t;
 /*
 * A block pointer together with a count.
 * NOTE(review): presumably these are the nodes of brt_t.brt_pending_tree[],
 * linked through bpe_node -- confirm against the code that inserts them.
 */
 typedef struct brt_pending_entry {
 	blkptr_t	bpe_bp;
 	int		bpe_count;
 	avl_node_t	bpe_node;
 } brt_pending_entry_t;
 #ifdef	__cplusplus
 }
 #endif
 #endif	/* _SYS_BRT_IMPL_H */
@@ -1072,8 +1072,7 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
 int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, struct blkptr *bps, size_t *nbpsp);
 int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset,
-    uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps,
+    uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps);
    boolean_t replay);
 /*
 * Initial setup and final teardown.
@@ -837,7 +837,7 @@ extern kmutex_t spa_namespace_lock;
 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t);
 extern void spa_config_load(void);
-extern nvlist_t *spa_all_configs(uint64_t *);
+extern int spa_all_configs(uint64_t *generation, nvlist_t **pools);
 extern void spa_config_set(spa_t *spa, nvlist_t *config);
 extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
    int getstats);
@@ -73,8 +73,7 @@ struct tx_cpu {
 	kcondvar_t	tc_cv[TXG_SIZE];
 	uint64_t	tc_count[TXG_SIZE];	/* tx hold count on each txg */
 	list_t		tc_callbacks[TXG_SIZE]; /* commit cb list */
-	char		tc_pad[8];		/* pad to fill 3 cache lines */
+} ____cacheline_aligned;
 };
 /*
 * The tx_state structure maintains the state information about the different
@@ -131,7 +131,10 @@ typedef const struct vdev_ops {
 * Virtual device properties
 */
 typedef union vdev_queue_class {
-	list_t		vqc_list;
+	struct {
 		ulong_t 	vqc_list_numnodes;
 		list_t		vqc_list;
 	};
 	avl_tree_t	vqc_tree;
 } vdev_queue_class_t;
@@ -130,7 +130,7 @@ typedef struct raidz_row {
 	uint64_t rr_offset;		/* Logical offset for *_io_verify() */
 	uint64_t rr_size;		/* Physical size for *_io_verify() */
 #endif
-	raidz_col_t rr_col[0];		/* Flexible array of I/O columns */
+	raidz_col_t rr_col[];		/* Flexible array of I/O columns */
 } raidz_row_t;
 typedef struct raidz_map {
@@ -139,7 +139,7 @@ typedef struct raidz_map {
 	int rm_nskip;			/* RAIDZ sectors skipped for padding */
 	int rm_skipstart;		/* Column index of padding start */
 	const raidz_impl_ops_t *rm_ops;	/* RAIDZ math operations */
-	raidz_row_t *rm_row[0];		/* flexible array of rows */
+	raidz_row_t *rm_row[];		/* flexible array of rows */
 } raidz_map_t;
@@ -274,11 +274,13 @@ typedef struct kmutex {
 extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie);
 extern void mutex_destroy(kmutex_t *mp);
 extern void mutex_enter(kmutex_t *mp);
 extern int mutex_enter_check_return(kmutex_t *mp);
 extern void mutex_exit(kmutex_t *mp);
 extern int mutex_tryenter(kmutex_t *mp);
 #define	NESTED_SINGLE 1
 #define	mutex_enter_nested(mp, class) mutex_enter(mp)
 #define	mutex_enter_interruptible(mp) mutex_enter_check_return(mp)
 /*
 * RW locks
 */
@@ -515,6 +515,8 @@
    <elf-symbol name='zpool_open' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_open_canfail' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_pool_state_to_name' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_prepare_and_label_disk' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_prepare_disk' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_print_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_prop_align_right' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_prop_column_name' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -562,6 +564,8 @@
    <elf-symbol name='zpool_vdev_remove' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_vdev_remove_cancel' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_vdev_remove_wanted' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_vdev_script_alloc_env' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_vdev_script_free_env' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_vdev_split' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_wait' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
    <elf-symbol name='zpool_wait_status' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -2071,3 +2071,196 @@ printf_color(const char *color, const char *format, ...)
 	return (rc);
 }
 /* PATH + 5 env vars + a NULL entry = 7 */
 #define	ZPOOL_VDEV_SCRIPT_ENV_COUNT 7
 /*
 * ZFS runs a few external scripts (the scripts in zpool.d/ and
 * `zfs_prepare_disk`) with a reduced $PATH and a handful of vdev-specific
 * environment variables.  This function allocates and populates the
 * NULL-terminated environment array that is handed to those scripts.  The
 * caller must release it with zpool_vdev_script_free_env() when done.
 *
 * Besides PATH, the following variables are always set (a NULL argument
 * yields a blank value):
 *
 * POOL_NAME
 * VDEV_PATH
 * VDEV_UPATH
 * VDEV_ENC_SYSFS_PATH
 *
 * If 'opt_key' is not NULL, one additional variable named 'opt_key' is set
 * to 'opt_val' (blank when 'opt_val' is NULL).
 *
 * Returns the allocated env[] array on success, NULL otherwise.
 */
 char **
 zpool_vdev_script_alloc_env(const char *pool_name,
     const char *vdev_path, const char *vdev_upath,
     const char *vdev_enc_sysfs_path, const char *opt_key, const char *opt_val)
 {
 	/* calloc() zeroes every slot, so unused entries stay NULL. */
 	char **envp = calloc(ZPOOL_VDEV_SCRIPT_ENV_COUNT, sizeof (*envp));
 	if (envp == NULL)
 		return (NULL);
 	/* Slot 0: the reduced search path. */
 	envp[0] = strdup("PATH=/bin:/sbin:/usr/bin:/usr/sbin");
 	if (envp[0] == NULL)
 		goto fail;
 	/*
 	 * Slots 1-4: the vdev-specific variables.  Whenever asprintf()
 	 * reports failure the slot is reset to NULL before jumping to the
 	 * cleanup code, which calls free() on every slot.
 	 */
 	if (asprintf(&envp[1], "POOL_NAME=%s",
 	    pool_name != NULL ? pool_name : "") == -1) {
 		envp[1] = NULL;
 		goto fail;
 	}
 	if (asprintf(&envp[2], "VDEV_PATH=%s",
 	    vdev_path != NULL ? vdev_path : "") == -1) {
 		envp[2] = NULL;
 		goto fail;
 	}
 	if (asprintf(&envp[3], "VDEV_UPATH=%s",
 	    vdev_upath != NULL ? vdev_upath : "") == -1) {
 		envp[3] = NULL;
 		goto fail;
 	}
 	if (asprintf(&envp[4], "VDEV_ENC_SYSFS_PATH=%s",
 	    vdev_enc_sysfs_path != NULL ? vdev_enc_sysfs_path : "") == -1) {
 		envp[4] = NULL;
 		goto fail;
 	}
 	/* Slot 5: the caller's optional variable. */
 	if (opt_key == NULL)
 		return (envp);
 	if (asprintf(&envp[5], "%s=%s", opt_key,
 	    opt_val != NULL ? opt_val : "") == -1) {
 		envp[5] = NULL;
 		goto fail;
 	}
 	return (envp);
 fail:
 	for (int slot = 0; slot < ZPOOL_VDEV_SCRIPT_ENV_COUNT; slot++)
 		free(envp[slot]);
 	free(envp);
 	return (NULL);
 }
 /*
 * Free the env[] array that was allocated by zpool_vdev_script_alloc_env().
 * Every one of the ZPOOL_VDEV_SCRIPT_ENV_COUNT slots is released, followed by
 * the array itself.  Like free(), passing NULL is a no-op, so the function
 * can be called unconditionally on cleanup paths.
 */
 void
 zpool_vdev_script_free_env(char **env)
 {
 	if (env == NULL)
 		return;
 	for (int i = 0; i < ZPOOL_VDEV_SCRIPT_ENV_COUNT; i++)
 		free(env[i]);
 	free(env);
 }
 /*
 * Prepare a disk by (optionally) running a program before labeling the disk.
 * This can be useful for installing disk firmware or doing some pre-flight
 * checks on the disk before it becomes part of the pool.  The program run is
 * located at ZFSEXECDIR/zfs_prepare_disk
 * (e.g. /usr/local/libexec/zfs/zfs_prepare_disk).
 *
 * 'zhp' may be NULL, in which case no pool name is passed to the script.
 * 'prepare_str' is exported to the script as VDEV_PREPARE.  '*lines_cnt' is
 * reset to 0 on entry; 'lines' and 'lines_cnt' are then handed to
 * libzfs_run_process_get_stdout().
 *
 * Returns 0 when the script is not executable (nothing to do), ENOMEM when
 * the environment cannot be allocated, and otherwise whatever
 * libzfs_run_process_get_stdout() returns.
 */
 int
 zpool_prepare_disk(zpool_handle_t *zhp, nvlist_t *vdev_nv,
     const char *prepare_str, char **lines[], int *lines_cnt)
 {
 	const char *script = ZFSEXECDIR "/zfs_prepare_disk";
 	/* argv[0] is the script itself; argv[1] stays NULL. */
 	char *script_argv[2] = {(char *)script};
 	const char *dev_path = NULL;
 	const char *encl_path = NULL;
 	const char *pool = NULL;
 	char *underlying;
 	char **script_env;
 	int status;
 	*lines_cnt = 0;
 	/* No script, nothing to do */
 	if (access(script, X_OK) != 0)
 		return (0);
 	/* Lookups are best-effort: a missing key leaves the pointer NULL. */
 	(void) nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH, &dev_path);
 	(void) nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
 	    &encl_path);
 	underlying = zfs_get_underlying_path(dev_path);
 	if (zhp != NULL)
 		pool = zpool_get_name(zhp);
 	script_env = zpool_vdev_script_alloc_env(pool, dev_path, underlying,
 	    encl_path, "VDEV_PREPARE", prepare_str);
 	/* The path string is not needed once the environment call returns. */
 	free(underlying);
 	if (script_env == NULL)
 		return (ENOMEM);
 	status = libzfs_run_process_get_stdout(script, script_argv, script_env,
 	    lines, lines_cnt);
 	zpool_vdev_script_free_env(script_env);
 	return (status);
 }
 /*
 * Optionally run a script and then label a disk.  The script can be used to
 * prepare a disk for inclusion into the pool.  For example, it might update
 * the disk's firmware or check its health.
 *
 * The 'name' provided is the short name, stripped of any leading
 * /dev path, and is passed to zpool_label_disk. vdev_nv is the nvlist for
 * the vdev.  prepare_str is a string that gets passed as the VDEV_PREPARE
 * env variable to the script.  'lines' and 'lines_cnt' are passed through to
 * zpool_prepare_disk().
 *
 * The following env vars are passed to the script:
 *
 * POOL_NAME:		The pool name (blank during zpool create)
 * VDEV_PREPARE:	Reason why the disk is being prepared for inclusion:
 *			"create", "add", "replace", or "autoreplace"
 * VDEV_PATH:		Path to the disk
 * VDEV_UPATH:		One of the 'underlying paths' to the disk.  This is
 *			useful for DM devices.
 * VDEV_ENC_SYSFS_PATH:	Path to the disk's enclosure sysfs path, if available.
 *
 * Note, some of these values can be blank.
 *
 * Return 0 on success, non-zero otherwise.  The disk is only labeled when
 * the prepare step returned 0.
 */
 int
 zpool_prepare_and_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp,
     const char *name, nvlist_t *vdev_nv, const char *prepare_str,
     char **lines[], int *lines_cnt)
 {
 	int rc;
 	/* zhp will be NULL when creating a pool */
 	rc = zpool_prepare_disk(zhp, vdev_nv, prepare_str, lines, lines_cnt);
 	if (rc != 0)
 		return (rc);
 	rc = zpool_label_disk(hdl, zhp, name);
 	return (rc);
 }
@@ -205,6 +205,15 @@ mutex_enter(kmutex_t *mp)
 	mp->m_owner = pthread_self();
 }
 /*
 * Lock mp->m_lock with pthread_mutex_lock() and report the result to the
 * caller.  m_owner is set to the calling thread only when the lock call
 * returned 0; any other return value is passed back unchanged.
 */
 int
 mutex_enter_check_return(kmutex_t *mp)
 {
 	int rv;
 	rv = pthread_mutex_lock(&mp->m_lock);
 	if (rv != 0)
 		return (rv);
 	mp->m_owner = pthread_self();
 	return (0);
 }
 int
 mutex_tryenter(kmutex_t *mp)
 {
@@ -582,9 +582,8 @@ zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
 * Wait up to timeout_ms for udev to set up the device node.  The device is
 * considered ready when libudev determines it has been initialized, all of
 * the device links have been verified to exist, and it has been allowed to
- * settle.  At this point the device the device can be accessed reliably.
+ * settle.  At this point the device can be accessed reliably. Depending on
- * Depending on the complexity of the udev rules this process could take
+ * the complexity of the udev rules this process could take several seconds.
 * several seconds.
 */
 int
 zpool_label_disk_wait(const char *path, int timeout_ms)
@@ -62,6 +62,7 @@ dist_man_MANS = \
 	%D%/man8/zfs-userspace.8 \
 	%D%/man8/zfs-wait.8 \
 	%D%/man8/zfs_ids_to_path.8 \
 	%D%/man8/zfs_prepare_disk.8 \
 	%D%/man8/zgenhostid.8 \
 	%D%/man8/zinject.8 \
 	%D%/man8/zpool.8 \
@@ -1137,6 +1137,11 @@ Selecting any option other than
 results in vector instructions
 from the respective CPU instruction set being used.
 .
 .It Sy zfs_bclone_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Enable the experimental block cloning feature.
 If this setting is 0, then even if feature@block_cloning is enabled,
 attempts to clone blocks will act as though the feature is disabled.
 .
 .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
 Select a BLAKE3 implementation.
 .Pp
@@ -2172,7 +2177,7 @@ if a volatile out-of-order write cache is enabled.
 Disable intent logging replay.
 Can be disabled for recovery from corrupted ZIL.
 .
-.It Sy zil_slog_bulk Ns = Ns Sy 786432 Ns B Po 768 KiB Pc Pq u64
+.It Sy zil_slog_bulk Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64
 Limit SLOG write size per commit executed with synchronous priority.
 Any writes above that will be executed with lower (asynchronous) priority
 to limit potential SLOG device abuse by single active ZIL writer.
@@ -2317,6 +2322,63 @@ If
 .Sy zvol_threads
 to the number of CPUs present or 32 (whichever is greater).
 .
 .It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint
 The number of threads per zvol to use for queuing IO requests.
 This parameter will only appear if your kernel supports
 .Li blk-mq
 and is only read and assigned to a zvol at zvol load time.
 If
 .Sy 0
 (the default) then internally set
 .Sy zvol_blk_mq_threads
 to the number of CPUs present.
 .
 .It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 Set to
 .Sy 1
 to use the
 .Li blk-mq
 API for zvols.
 Set to
 .Sy 0
 (the default) to use the legacy zvol APIs.
 This setting can give better or worse zvol performance depending on
 the workload.
 This parameter will only appear if your kernel supports
 .Li blk-mq
 and is only read and assigned to a zvol at zvol load time.
 .
 .It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint
 If
 .Sy zvol_use_blk_mq
 is enabled, then process this number of
 .Sy volblocksize Ns -sized blocks per zvol thread.
 This tunable can be used to favor better performance for zvol reads (lower
 values) or writes (higher values).
 If set to
 .Sy 0 ,
 then the zvol layer will process the maximum number of blocks
 per thread that it can.
 This parameter will only appear if your kernel supports
 .Li blk-mq
 and is only applied at each zvol's load time.
 .
 .It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint
 The queue_depth value for the zvol
 .Li blk-mq
 interface.
 This parameter will only appear if your kernel supports
 .Li blk-mq
 and is only applied at each zvol's load time.
 If
 .Sy 0
 (the default) then use the kernel's default queue depth.
 Values are clamped to the kernel's
 .Dv BLKDEV_MIN_RQ
 and
 .Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ
 limits.
 .
 .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint
 Defines zvol block devices behaviour when
 .Sy volmode Ns = Ns Sy default :
@@ -219,8 +219,11 @@ to the end of the line is ignored.
 .Bd -literal -compact -offset 4n
 .No example# Nm cat Pa /usr/share/zfs/compatibility.d/grub2
 # Features which are supported by GRUB2
 allocation_classes
 async_destroy
 block_cloning
 bookmarks
 device_rebuild
 embedded_data
 empty_bpobj
 enabled_txg
@@ -229,8 +232,14 @@ filesystem_limits
 hole_birth
 large_blocks
 livelist
 log_spacemap
 lz4_compress
 project_quota
 resilver_defer
 spacemap_histogram
 spacemap_v2
 userobj_accounting
 zilsaxattr
 zpool_checkpoint
 .No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev
@@ -1,2 +1,3 @@
 /zed.8
 /zfs-mount-generator.8
 /zfs_prepare_disk.8
@@ -14,7 +14,7 @@
 .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
 .\" Copyright (c) 2017 Intel Corporation.
 .\"
-.Dd June 27, 2023
+.Dd November 18, 2023
 .Dt ZDB 8
 .Os
 .
@@ -23,7 +23,7 @@
 .Nd display ZFS storage pool debugging and consistency information
 .Sh SYNOPSIS
 .Nm
-.Op Fl AbcdDFGhikLMNPsvXYy
+.Op Fl AbcdDFGhikLMNPsTvXYy
 .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns …
 .Op Fl I Ar inflight-I/O-ops
 .Oo Fl o Ar var Ns = Ns Ar value Oc Ns …
@@ -403,6 +403,13 @@ Display operation counts, bandwidth, and error counts of I/O to the pool from
 Simulate the effects of deduplication, constructing a DDT and then display
 that DDT as with
 .Fl DD .
 .It Fl T , -brt-stats
 Display block reference table (BRT) statistics, including the size of unique
 blocks cloned, the space saving as a result of cloning, and the saving ratio.
 .It Fl TT
 Display the per-vdev BRT statistics, including total references.
 .It Fl TTT
 Dump the contents of the block reference tables.
 .It Fl u , -uberblock
 Display the current uberblock.
 .El
@@ -0,0 +1,70 @@
 .\"
 .\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 .\" Copyright (C) 2023 Lawrence Livermore National Security, LLC.
 .\" Refer to the OpenZFS git commit log for authoritative copyright attribution.
 .\"
 .\" The contents of this file are subject to the terms of the
 .\" Common Development and Distribution License Version 1.0 (CDDL-1.0).
 .\" You can obtain a copy of the license from the top-level file
 .\" "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
 .\" You may not use this file except in compliance with the license.
 .\"
 .\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
 .\"
 .Dd August 30, 2023
 .Dt ZFS_PREPARE_DISK 8
 .Os
 .
 .Sh NAME
 .Nm zfs_prepare_disk
 .Nd special script that gets run before bringing a disk into a pool
 .Sh DESCRIPTION
 .Nm
 is an optional script that gets called by libzfs before bringing a disk into a
 pool.
 It can be modified by the user to run whatever commands are necessary to prepare
 a disk for inclusion into the pool.
 For example, users can add lines to
 .Nm zfs_prepare_disk
 to do things like update the drive's firmware or check the drive's health.
 .Nm zfs_prepare_disk
 is optional and can be removed if not needed.
 libzfs will look for the script at @zfsexecdir@/zfs_prepare_disk.
 .
 .Ss Properties
 .Nm zfs_prepare_disk
 will be passed the following environment variables:
 .sp
 .Bl -tag -compact -width "VDEV_ENC_SYSFS_PATH"
 .
 .It Nm POOL_NAME
 .No Name of the pool
 .It Nm VDEV_PATH
 .No Path to the disk (like /dev/sda)
 .It Nm VDEV_PREPARE
 .No Reason why the disk is being prepared for inclusion
 ('create', 'add', 'replace', or 'autoreplace').
 This can be useful if you only want the script to be run under certain actions.
 .It Nm VDEV_UPATH
 .No Path to one of the underlying devices for the
 disk.
 For multipath this would return one of the /dev/sd* paths to the disk.
 If the device is not a device mapper device, then
 .Nm VDEV_UPATH
 just returns the same value as
 .Nm VDEV_PATH
 .It Nm VDEV_ENC_SYSFS_PATH
 .No Path to the disk's enclosure sysfs path, if available
 .El
 .Pp
 Note that some of these variables may have a blank value.
 .Nm POOL_NAME
 is blank at pool creation time, for example.
 .Sh ENVIRONMENT
 .Nm zfs_prepare_disk
 runs with a limited $PATH.
 .Sh EXIT STATUS
 .Nm zfs_prepare_disk
 should return 0 on success, non-zero otherwise.
 If non-zero is returned, the disk will not be included in the pool.
 .
@@ -488,6 +488,10 @@ zfs-$(CONFIG_ARM64) += $(addprefix zfs/,$(ZFS_OBJS_ARM64))
 zfs-$(CONFIG_PPC)   += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64))
 zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64))
 UBSAN_SANITIZE_zap_leaf.o := n
 UBSAN_SANITIZE_zap_micro.o := n
 UBSAN_SANITIZE_sa.o := n
 # Suppress incorrect warnings from versions of objtool which are not
 # aware of x86 EVEX prefix instructions used for AVX512.
 OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y
@@ -30,8 +30,6 @@
 __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/ck.h>
 #include <sys/epoch.h>
 #include <sys/kernel.h>
 #include <sys/kmem.h>
 #include <sys/lock.h>
@@ -66,11 +64,9 @@ taskq_t *dynamic_taskq = NULL;
 proc_t *system_proc;
 extern int uma_align_cache;
 static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures");
-static CK_LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl;
+static LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl;
 static unsigned long tqenthash;
 static unsigned long tqenthashlock;
 static struct sx *tqenthashtbl_lock;
@@ -80,8 +76,8 @@ static taskqid_t tqidnext;
 #define	TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash])
 #define	TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)])
 #define	NORMAL_TASK 0
 #define	TIMEOUT_TASK 1
 #define	NORMAL_TASK 2
 static void
 system_taskq_init(void *arg)
@@ -121,7 +117,7 @@ system_taskq_fini(void *arg)
 	for (i = 0; i < tqenthashlock + 1; i++)
 		sx_destroy(&tqenthashtbl_lock[i]);
 	for (i = 0; i < tqenthash + 1; i++)
-		VERIFY(CK_LIST_EMPTY(&tqenthashtbl[i]));
+		VERIFY(LIST_EMPTY(&tqenthashtbl[i]));
 	free(tqenthashtbl_lock, M_TASKQ);
 	free(tqenthashtbl, M_TASKQ);
 }
@@ -162,27 +158,27 @@ taskq_lookup(taskqid_t tqid)
 {
 	taskq_ent_t *ent = NULL;
-	sx_xlock(TQIDHASHLOCK(tqid));
+	if (tqid == 0)
-	CK_LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) {
+		return (NULL);
 	sx_slock(TQIDHASHLOCK(tqid));
 	LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) {
 		if (ent->tqent_id == tqid)
 			break;
 	}
 	if (ent != NULL)
 		refcount_acquire(&ent->tqent_rc);
-	sx_xunlock(TQIDHASHLOCK(tqid));
+	sx_sunlock(TQIDHASHLOCK(tqid));
 	return (ent);
 }
 static taskqid_t
 taskq_insert(taskq_ent_t *ent)
 {
-	taskqid_t tqid;
+	taskqid_t tqid = __taskq_genid();
 	tqid = __taskq_genid();
 	ent->tqent_id = tqid;
 	ent->tqent_registered = B_TRUE;
 	sx_xlock(TQIDHASHLOCK(tqid));
-	CK_LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash);
+	LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash);
 	sx_xunlock(TQIDHASHLOCK(tqid));
 	return (tqid);
 }
@@ -192,13 +188,14 @@ taskq_remove(taskq_ent_t *ent)
 {
 	taskqid_t tqid = ent->tqent_id;
-	if (!ent->tqent_registered)
+	if (tqid == 0)
 		return;
 	sx_xlock(TQIDHASHLOCK(tqid));
-	CK_LIST_REMOVE(ent, tqent_hash);
+	if (ent->tqent_id != 0) {
 		LIST_REMOVE(ent, tqent_hash);
 		ent->tqent_id = 0;
 	}
 	sx_xunlock(TQIDHASHLOCK(tqid));
 	ent->tqent_registered = B_FALSE;
 }
 static void
@@ -285,21 +282,22 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
 	int rc;
 	taskq_ent_t *ent;
 	if (tid == 0)
 		return (0);
 	if ((ent = taskq_lookup(tid)) == NULL)
 		return (0);
-	ent->tqent_cancelled = B_TRUE;
+	if (ent->tqent_type == NORMAL_TASK) {
-	if (ent->tqent_type == TIMEOUT_TASK) {
+		rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
 		if (rc == EBUSY)
 			taskqueue_drain(tq->tq_queue, &ent->tqent_task);
 	} else {
 		rc = taskqueue_cancel_timeout(tq->tq_queue,
 		    &ent->tqent_timeout_task, &pend);
-	} else
+		if (rc == EBUSY) {
-		rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
+			taskqueue_drain_timeout(tq->tq_queue,
-	if (rc == EBUSY) {
+			    &ent->tqent_timeout_task);
-		taskqueue_drain(tq->tq_queue, &ent->tqent_task);
+		}
-	} else if (pend) {
+	}
 	if (pend) {
 		/*
 		 * Tasks normally free themselves when run, but here the task
 		 * was cancelled so it did not free itself.
@@ -312,12 +310,13 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
 }
 static void
-taskq_run(void *arg, int pending __unused)
+taskq_run(void *arg, int pending)
 {
 	taskq_ent_t *task = arg;
-	if (!task->tqent_cancelled)
+	if (pending == 0)
-		task->tqent_func(task->tqent_arg);
+		return;
 	task->tqent_func(task->tqent_arg);
 	taskq_free(task);
 }
@@ -345,7 +344,6 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
 	task->tqent_func = func;
 	task->tqent_arg = arg;
 	task->tqent_type = TIMEOUT_TASK;
 	task->tqent_cancelled = B_FALSE;
 	refcount_init(&task->tqent_rc, 1);
 	tqid = taskq_insert(task);
 	TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0,
@@ -379,7 +377,6 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 	refcount_init(&task->tqent_rc, 1);
 	task->tqent_func = func;
 	task->tqent_arg = arg;
 	task->tqent_cancelled = B_FALSE;
 	task->tqent_type = NORMAL_TASK;
 	tqid = taskq_insert(task);
 	TASK_INIT(&task->tqent_task, prio, taskq_run, task);
@@ -388,10 +385,12 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 }
 static void
-taskq_run_ent(void *arg, int pending __unused)
+taskq_run_ent(void *arg, int pending)
 {
 	taskq_ent_t *task = arg;
 	if (pending == 0)
 		return;
 	task->tqent_func(task->tqent_arg);
 }
@@ -406,8 +405,6 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags,
 	 * can go at the front of the queue.
 	 */
 	prio = !!(flags & TQ_FRONT);
 	task->tqent_cancelled = B_FALSE;
 	task->tqent_registered = B_FALSE;
 	task->tqent_id = 0;
 	task->tqent_func = func;
 	task->tqent_arg = arg;
@@ -427,12 +424,13 @@ taskq_wait_id(taskq_t *tq, taskqid_t tid)
 {
 	taskq_ent_t *ent;
 	if (tid == 0)
 		return;
 	if ((ent = taskq_lookup(tid)) == NULL)
 		return;
-	taskqueue_drain(tq->tq_queue, &ent->tqent_task);
+	if (ent->tqent_type == NORMAL_TASK)
 		taskqueue_drain(tq->tq_queue, &ent->tqent_task);
 	else
 		taskqueue_drain_timeout(tq->tq_queue, &ent->tqent_timeout_task);
 	taskq_free(ent);
 }
@@ -120,7 +120,7 @@ vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
 int
 mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
-    char *fspec, int fsflags)
+    char *fspec, int fsflags, vfs_t *parent_vfsp)
 {
 	struct vfsconf *vfsp;
 	struct mount *mp;
@@ -220,6 +220,13 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
 	mp->mnt_opt = mp->mnt_optnew;
 	(void) VFS_STATFS(mp, &mp->mnt_stat);
 #ifdef VFS_SUPPORTS_EXJAIL_CLONE
 	/*
 	 * Clone the mnt_exjail credentials of the parent, as required.
 	 */
 	vfs_exjail_clone(parent_vfsp, mp);
 #endif
 	/*
 	 * Prevent external consumers of mount options from reading
 	 * mnt_optnew.
@@ -32,11 +32,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/kmem.h>
 #include <sys/kmem_cache.h>
 #include <sys/zmod.h>
 #if __FreeBSD_version >= 1300041
 #include <contrib/zlib/zlib.h>
 #else
 #include <sys/zlib.h>
 #endif
 #include <sys/kobj.h>
@@ -90,11 +86,7 @@ zlib_inflateInit(z_stream *stream)
 static int
 zlib_inflate(z_stream *stream, int finish)
 {
 #if __FreeBSD_version >= 1300024
 	return (inflate(stream, finish));
 #else
 	return (_zlib104_inflate(stream, finish));
 #endif
 }
@@ -52,11 +52,6 @@
 #include <sys/vm.h>
 #include <sys/vmmeter.h>
 #if __FreeBSD_version >= 1300139
 static struct sx arc_vnlru_lock;
 static struct vnode *arc_vnlru_marker;
 #endif
 extern struct vfsops zfs_vfsops;
 uint_t zfs_arc_free_target = 0;
@@ -131,53 +126,6 @@ arc_default_max(uint64_t min, uint64_t allmem)
 	return (MAX(allmem * 5 / 8, size));
 }
 /*
 * Helper function for arc_prune_async() it is responsible for safely
 * handling the execution of a registered arc_prune_func_t.
 */
 static void
 arc_prune_task(void *arg)
 {
 	uint64_t nr_scan = (uintptr_t)arg;
 #ifndef __ILP32__
 	if (nr_scan > INT_MAX)
 		nr_scan = INT_MAX;
 #endif
 #if __FreeBSD_version >= 1300139
 	sx_xlock(&arc_vnlru_lock);
 	vnlru_free_vfsops(nr_scan, &zfs_vfsops, arc_vnlru_marker);
 	sx_xunlock(&arc_vnlru_lock);
 #else
 	vnlru_free(nr_scan, &zfs_vfsops);
 #endif
 }
 /*
 * Notify registered consumers they must drop holds on a portion of the ARC
 * buffered they reference.  This provides a mechanism to ensure the ARC can
 * honor the metadata limit and reclaim otherwise pinned ARC buffers.  This
 * is analogous to dnlc_reduce_cache() but more generic.
 *
 * This operation is performed asynchronously so it may be safely called
 * in the context of the arc_reclaim_thread().  A reference is taken here
 * for each registered arc_prune_t and the arc_prune_task() is responsible
 * for releasing it once the registered arc_prune_func_t has completed.
 */
 void
 arc_prune_async(uint64_t adjust)
 {
 #ifndef __LP64__
 	if (adjust > UINTPTR_MAX)
 		adjust = UINTPTR_MAX;
 #endif
 	taskq_dispatch(arc_prune_taskq, arc_prune_task,
 	    (void *)(intptr_t)adjust, TQ_SLEEP);
 	ARCSTAT_BUMP(arcstat_prune);
 }
 uint64_t
 arc_all_memory(void)
 {
@@ -228,10 +176,6 @@ arc_lowmem_init(void)
 {
 	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
 	    EVENTHANDLER_PRI_FIRST);
 #if __FreeBSD_version >= 1300139
 	arc_vnlru_marker = vnlru_alloc_marker();
 	sx_init(&arc_vnlru_lock, "arc vnlru lock");
 #endif
 }
 void
@@ -239,12 +183,6 @@ arc_lowmem_fini(void)
 {
 	if (arc_event_lowmem != NULL)
 		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
 #if __FreeBSD_version >= 1300139
 	if (arc_vnlru_marker != NULL) {
 		vnlru_free_marker(arc_vnlru_marker);
 		sx_destroy(&arc_vnlru_lock);
 	}
 #endif
 }
 void
@@ -46,6 +46,7 @@ knlist_sx_xunlock(void *arg)
 	sx_xunlock((struct sx *)arg);
 }
 #if __FreeBSD_version >= 1300128
 static void
 knlist_sx_assert_lock(void *arg, int what)
 {
@@ -55,11 +56,28 @@ knlist_sx_assert_lock(void *arg, int what)
 	else
 		sx_assert((struct sx *)arg, SX_UNLOCKED);
 }
 #else
 static void
 knlist_sx_assert_locked(void *arg)
 {
 	sx_assert((struct sx *)arg, SX_LOCKED);
 }
 static void
 knlist_sx_assert_unlocked(void *arg)
 {
 	sx_assert((struct sx *)arg, SX_UNLOCKED);
 }
 #endif
 void
 knlist_init_sx(struct knlist *knl, struct sx *lock)
 {
 #if __FreeBSD_version >= 1300128
 	knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock,
 	    knlist_sx_assert_lock);
 #else
 	knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock,
 	    knlist_sx_assert_locked, knlist_sx_assert_unlocked);
 #endif
 }
@@ -1026,7 +1026,8 @@ zfsctl_snapdir_lookup(struct vop_lookup_args *ap)
 	    "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
 	    dvp->v_vfsp->mnt_stat.f_mntonname, name);
-	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
+	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0,
 	    dvp->v_vfsp);
 	kmem_free(mountpoint, mountpoint_len);
 	if (err == 0) {
 		/*
@@ -89,6 +89,10 @@ int zfs_debug_level;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
 	"Debug level");
 int zfs_bclone_enabled = 0;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN,
 	&zfs_bclone_enabled, 0, "Enable block cloning");
 struct zfs_jailparam {
 	int mount_snapshot;
 };
@@ -2070,6 +2074,26 @@ zfs_vnodes_adjust_back(void)
 #endif
 }
 #if __FreeBSD_version >= 1300139
 static struct sx zfs_vnlru_lock;
 static struct vnode *zfs_vnlru_marker;
 #endif
 static arc_prune_t *zfs_prune;
 static void
 zfs_prune_task(uint64_t nr_to_scan, void *arg __unused)
 {
 	if (nr_to_scan > INT_MAX)
 		nr_to_scan = INT_MAX;
 #if __FreeBSD_version >= 1300139
 	sx_xlock(&zfs_vnlru_lock);
 	vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker);
 	sx_xunlock(&zfs_vnlru_lock);
 #else
 	vnlru_free(nr_to_scan, &zfs_vfsops);
 #endif
 }
 void
 zfs_init(void)
 {
@@ -2096,11 +2120,23 @@ zfs_init(void)
 	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
 	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
 #if __FreeBSD_version >= 1300139
 	zfs_vnlru_marker = vnlru_alloc_marker();
 	sx_init(&zfs_vnlru_lock, "zfs vnlru lock");
 #endif
 	zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL);
 }
 void
 zfs_fini(void)
 {
 	arc_remove_prune_callback(zfs_prune);
 #if __FreeBSD_version >= 1300139
 	vnlru_free_marker(zfs_vnlru_marker);
 	sx_destroy(&zfs_vnlru_lock);
 #endif
 	taskq_destroy(zfsvfs_taskq);
 	zfsctl_fini();
 	zfs_znode_fini();
@@ -6213,6 +6213,7 @@ zfs_deallocate(struct vop_deallocate_args *ap)
 }
 #endif
 #if __FreeBSD_version >= 1300039
 #ifndef _SYS_SYSPROTO_H_
 struct vop_copy_file_range_args {
 	struct vnode *a_invp;
@@ -6243,6 +6244,11 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
 	int error;
 	uint64_t len = *ap->a_lenp;
 	if (!zfs_bclone_enabled) {
 		mp = NULL;
 		goto bad_write_fallback;
 	}
 	/*
 	 * TODO: If offset/length is not aligned to recordsize, use
 	 * vn_generic_copy_file_range() on this fragment.
@@ -6314,6 +6320,7 @@ bad_write_fallback:
 	    ap->a_incred, ap->a_outcred, ap->a_fsizetd);
 	return (error);
 }
 #endif
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
@@ -6378,7 +6385,9 @@ struct vop_vector zfs_vnodeops = {
 #if __FreeBSD_version >= 1400043
 	.vop_add_writecount =	vop_stdadd_writecount_nomsync,
 #endif
 #if __FreeBSD_version >= 1300039
 	.vop_copy_file_range =	zfs_freebsd_copy_file_range,
 #endif
 };
 VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
@@ -1364,6 +1364,19 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
 				vec++;
 				total_len += crypt_len;
 			}
 		} else if (txtype == TX_CLONE_RANGE) {
 			const size_t o = offsetof(lr_clone_range_t, lr_nbps);
 			crypt_len = o - sizeof (lr_t);
 			dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t);
 			dst_iovecs[vec].iov_len = crypt_len;
 			/* copy the bps now since they will not be encrypted */
 			memcpy(dlrp + o, slrp + o, lr_len - o);
 			memcpy(aadp, slrp + o, lr_len - o);
 			aadp += lr_len - o;
 			aad_len += lr_len - o;
 			vec++;
 			total_len += crypt_len;
 		} else {
 			crypt_len = lr_len - sizeof (lr_t);
 			dst_iovecs[vec].iov_base = (char *)dlrp +
@@ -489,56 +489,5 @@ arc_unregister_hotplug(void)
 }
 #endif /* _KERNEL */
 /*
 * Helper function for arc_prune_async() it is responsible for safely
 * handling the execution of a registered arc_prune_func_t.
 */
 static void
 arc_prune_task(void *ptr)
 {
 	arc_prune_t *ap = (arc_prune_t *)ptr;
 	arc_prune_func_t *func = ap->p_pfunc;
 	if (func != NULL)
 		func(ap->p_adjust, ap->p_private);
 	zfs_refcount_remove(&ap->p_refcnt, func);
 }
 /*
 * Notify registered consumers they must drop holds on a portion of the ARC
 * buffered they reference.  This provides a mechanism to ensure the ARC can
 * honor the metadata limit and reclaim otherwise pinned ARC buffers.  This
 * is analogous to dnlc_reduce_cache() but more generic.
 *
 * This operation is performed asynchronously so it may be safely called
 * in the context of the arc_reclaim_thread().  A reference is taken here
 * for each registered arc_prune_t and the arc_prune_task() is responsible
 * for releasing it once the registered arc_prune_func_t has completed.
 */
 void
 arc_prune_async(uint64_t adjust)
 {
 	arc_prune_t *ap;
 	mutex_enter(&arc_prune_mtx);
 	for (ap = list_head(&arc_prune_list); ap != NULL;
 	    ap = list_next(&arc_prune_list, ap)) {
 		if (zfs_refcount_count(&ap->p_refcnt) >= 2)
 			continue;
 		zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
 		ap->p_adjust = adjust;
 		if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
 		    ap, TQ_SLEEP) == TASKQID_INVALID) {
 			zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
 			continue;
 		}
 		ARCSTAT_BUMP(arcstat_prune);
 	}
 	mutex_exit(&arc_prune_mtx);
 }
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
 	"Limit on number of pages that ARC shrinker can reclaim at once");
@@ -522,7 +522,7 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
 	ip->i_blkbits = SPA_MINBLOCKSHIFT;
 	ip->i_atime = now;
 	ip->i_mtime = now;
-	ip->i_ctime = now;
+	zpl_inode_set_ctime_to_ts(ip, now);
 	ip->i_fop = fops;
 	ip->i_op = ops;
 #if defined(IOP_XATTR)
@@ -204,22 +204,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
 	this_seg_start = orig_loffset;
 	rq_for_each_segment(bv, rq, iter) {
 		if (uio->iter.bio) {
 			/*
 			 * If uio->iter.bio is present, then we know we've saved
 			 * uio->iter from a previous call to this function, and
 			 * we can skip ahead in this rq_for_each_segment() loop
 			 * to where we last left off.  That way, we don't need
 			 * to iterate over tons of segments we've already
 			 * processed - we can just restore the "saved state".
 			 */
 			iter = uio->iter;
 			bv = uio->bv;
 			this_seg_start = uio->uio_loffset;
 			memset(&uio->iter, 0, sizeof (uio->iter));
 			continue;
 		}
 		/*
 		 * Lookup what the logical offset of the last byte of this
 		 * segment is.
@@ -260,19 +244,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
 			copied = 1;	/* We copied some data */
 		}
 		if (n == 0) {
 			/*
 			 * All done copying.  Save our 'iter' value to the uio.
 			 * This allows us to "save our state" and skip ahead in
 			 * the rq_for_each_segment() loop the next time we call
 			 * call zfs_uiomove_bvec_rq() on this uio (which we
 			 * will be doing for any remaining data in the uio).
 			 */
 			uio->iter = iter; /* make a copy of the struct data */
 			uio->bv = bv;
 			return (0);
 		}
 		this_seg_start = this_seg_end + 1;
 	}
@@ -1488,7 +1488,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
 	 * read-only flag, pretend it was set, as done for snapshots.
 	 */
 	if (!canwrite)
-		vfs->vfs_readonly = true;
+		vfs->vfs_readonly = B_TRUE;
 	error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs);
 	if (error) {
@@ -1652,7 +1652,12 @@ out:
 *	RETURN:	0 (always succeeds)
 */
 int
 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
 zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip,
    struct kstat *sp)
 #else
 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
 #endif
 {
 	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
@@ -1665,7 +1670,11 @@ zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
 	mutex_enter(&zp->z_lock);
 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
 	zpl_generic_fillattr(user_ns, request_mask, ip, sp);
 #else
 	zpl_generic_fillattr(user_ns, ip, sp);
 #endif
 	/*
 	 * +1 link count for root inode with visible '.zfs' directory.
 	 */
@@ -2442,8 +2451,8 @@ top:
 	if (mask & (ATTR_CTIME | ATTR_SIZE)) {
 		ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
-		ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
+		zpl_inode_set_ctime_to_ts(ZTOI(zp),
-		    ZTOI(zp));
+		    zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)));
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    ctime, sizeof (ctime));
 	}
@@ -3648,6 +3657,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	caddr_t		va;
 	int		err = 0;
 	uint64_t	mtime[2], ctime[2];
 	inode_timespec_t tmp_ctime;
 	sa_bulk_attr_t	bulk[3];
 	int		cnt = 0;
 	struct address_space *mapping;
@@ -3812,7 +3822,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	/* Preserve the mtime and ctime provided by the inode */
 	ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
-	ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+	tmp_ctime = zpl_inode_get_ctime(ip);
 	ZFS_TIME_ENCODE(&tmp_ctime, ctime);
 	zp->z_atime_dirty = B_FALSE;
 	zp->z_seq++;
@@ -3862,6 +3873,7 @@ zfs_dirty_inode(struct inode *ip, int flags)
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	dmu_tx_t	*tx;
 	uint64_t	mode, atime[2], mtime[2], ctime[2];
 	inode_timespec_t tmp_ctime;
 	sa_bulk_attr_t	bulk[4];
 	int		error = 0;
 	int		cnt = 0;
@@ -3908,7 +3920,8 @@ zfs_dirty_inode(struct inode *ip, int flags)
 	/* Preserve the mode, mtime and ctime provided by the inode */
 	ZFS_TIME_ENCODE(&ip->i_atime, atime);
 	ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
-	ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+	tmp_ctime = zpl_inode_get_ctime(ip);
 	ZFS_TIME_ENCODE(&tmp_ctime, ctime);
 	mode = ip->i_mode;
 	zp->z_mode = mode;
@@ -4058,8 +4071,8 @@ zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
-	if ((vm_flags & VM_WRITE) && (zp->z_pflags &
+	if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) &&
-	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
+	    (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EPERM));
 	}
@@ -4229,4 +4242,8 @@ EXPORT_SYMBOL(zfs_map);
 module_param(zfs_delete_blocks, ulong, 0644);
 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
 /* CSTYLED */
 module_param(zfs_bclone_enabled, uint, 0644);
 MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning");
 #endif
@@ -542,6 +542,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	uint64_t links;
 	uint64_t z_uid, z_gid;
 	uint64_t atime[2], mtime[2], ctime[2], btime[2];
 	inode_timespec_t tmp_ctime;
 	uint64_t projid = ZFS_DEFAULT_PROJID;
 	sa_bulk_attr_t bulk[12];
 	int count = 0;
@@ -615,7 +616,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	ZFS_TIME_DECODE(&ip->i_atime, atime);
 	ZFS_TIME_DECODE(&ip->i_mtime, mtime);
-	ZFS_TIME_DECODE(&ip->i_ctime, ctime);
+	ZFS_TIME_DECODE(&tmp_ctime, ctime);
 	zpl_inode_set_ctime_to_ts(ip, tmp_ctime);
 	ZFS_TIME_DECODE(&zp->z_btime, btime);
 	ip->i_ino = zp->z_id;
@@ -1195,6 +1197,7 @@ zfs_rezget(znode_t *zp)
 	uint64_t gen;
 	uint64_t z_uid, z_gid;
 	uint64_t atime[2], mtime[2], ctime[2], btime[2];
 	inode_timespec_t tmp_ctime;
 	uint64_t projid = ZFS_DEFAULT_PROJID;
 	znode_hold_t *zh;
@@ -1289,7 +1292,8 @@ zfs_rezget(znode_t *zp)
 	ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
 	ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
-	ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
+	ZFS_TIME_DECODE(&tmp_ctime, ctime);
 	zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime);
 	ZFS_TIME_DECODE(&zp->z_btime, btime);
 	if ((uint32_t)gen != ZTOI(zp)->i_generation) {
@@ -1397,7 +1401,7 @@ zfs_zinactive(znode_t *zp)
 boolean_t
 zfs_relatime_need_update(const struct inode *ip)
 {
-	inode_timespec_t now;
+	inode_timespec_t now, tmp_ctime;
 	gethrestime(&now);
 	/*
@@ -1408,7 +1412,8 @@ zfs_relatime_need_update(const struct inode *ip)
 	if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
 		return (B_TRUE);
-	if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0)
+	tmp_ctime = zpl_inode_get_ctime(ip);
 	if (zfs_compare_timespec(&tmp_ctime, &ip->i_atime) >= 0)
 		return (B_TRUE);
 	if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
@@ -1434,7 +1439,7 @@ void
 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
    uint64_t ctime[2])
 {
-	inode_timespec_t now;
+	inode_timespec_t now, tmp_ctime;
 	gethrestime(&now);
@@ -1451,7 +1456,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
 	if (flag & ATTR_CTIME) {
 		ZFS_TIME_ENCODE(&now, ctime);
-		ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
+		ZFS_TIME_DECODE(&tmp_ctime, ctime);
 		zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime);
 		if (ZTOZSB(zp)->z_use_fuids)
 			zp->z_pflags |= ZFS_ARCHIVE;
 	}
@@ -1543,6 +1543,21 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
 				nr_iovecs++;
 				total_len += crypt_len;
 			}
 		} else if (txtype == TX_CLONE_RANGE) {
 			const size_t o = offsetof(lr_clone_range_t, lr_nbps);
 			crypt_len = o - sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_len = crypt_len;
 			dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
 			dst_iovecs[nr_iovecs].iov_len = crypt_len;
 			/* copy the bps now since they will not be encrypted */
 			memcpy(dlrp + o, slrp + o, lr_len - o);
 			memcpy(aadp, slrp + o, lr_len - o);
 			aadp += lr_len - o;
 			aad_len += lr_len - o;
 			nr_iovecs++;
 			total_len += crypt_len;
 		} else {
 			crypt_len = lr_len - sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
@@ -124,6 +124,8 @@ zpl_root_getattr_impl(const struct path *path, struct kstat *stat,
 	generic_fillattr(user_ns, ip, stat);
 #elif defined(HAVE_GENERIC_FILLATTR_IDMAP)
 	generic_fillattr(user_ns, ip, stat);
 #elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK)
 	generic_fillattr(user_ns, request_mask, ip, stat);
 #else
 	(void) user_ns;
 #endif
@@ -435,6 +437,8 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
 	generic_fillattr(user_ns, ip, stat);
 #elif defined(HAVE_GENERIC_FILLATTR_IDMAP)
 	generic_fillattr(user_ns, ip, stat);
 #elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK)
 	generic_fillattr(user_ns, request_mask, ip, stat);
 #else
 	(void) user_ns;
 #endif
@@ -609,6 +613,8 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
 		generic_fillattr(user_ns, path->dentry->d_inode, stat);
 #elif defined(HAVE_GENERIC_FILLATTR_IDMAP)
 		generic_fillattr(user_ns, path->dentry->d_inode, stat);
 #elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK)
 	generic_fillattr(user_ns, request_mask, ip, stat);
 #else
 		(void) user_ns;
 #endif
@@ -623,7 +629,10 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
 	error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
 	if (error == 0) {
-#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR))
+#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
 		error = -zfs_getattr_fast(user_ns, request_mask, ZTOI(dzp),
 		    stat);
 #elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR))
 		error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat);
 #else
 		error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat);
@@ -31,6 +31,8 @@
 #include <sys/zfs_vnops.h>
 #include <sys/zfeature.h>
 int zfs_bclone_enabled = 0;
 /*
 * Clone part of a file via block cloning.
 *
@@ -50,6 +52,9 @@ __zpl_clone_file_range(struct file *src_file, loff_t src_off,
 	fstrans_cookie_t cookie;
 	int err;
 	if (!zfs_bclone_enabled)
 		return (-EOPNOTSUPP);
 	if (!spa_feature_is_enabled(
 	    dmu_objset_spa(ITOZSB(dst_i)->z_os), SPA_FEATURE_BLOCK_CLONING))
 		return (-EOPNOTSUPP);
@@ -435,7 +435,9 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
 	 * XXX query_flags currently ignored.
 	 */
-#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR))
+#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
 	error = -zfs_getattr_fast(user_ns, request_mask, ip, stat);
 #elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR))
 	error = -zfs_getattr_fast(user_ns, ip, stat);
 #else
 	error = -zfs_getattr_fast(kcred->user_ns, ip, stat);
@@ -774,7 +776,7 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 		return (-EMLINK);
 	crhold(cr);
-	ip->i_ctime = current_time(ip);
+	zpl_inode_set_ctime_to_ts(ip, current_time(ip));
 	/* Must have an existing ref, so igrab() cannot return NULL */
 	VERIFY3P(igrab(ip), !=, NULL);
@@ -375,7 +375,7 @@ zpl_kill_sb(struct super_block *sb)
 }
 void
-zpl_prune_sb(int64_t nr_to_scan, void *arg)
+zpl_prune_sb(uint64_t nr_to_scan, void *arg)
 {
 	struct super_block *sb = (struct super_block *)arg;
 	int objects = 0;
@@ -513,7 +513,7 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
 	error = -zfs_write_simple(xzp, value, size, pos, NULL);
 out:
 	if (error == 0) {
-		ip->i_ctime = current_time(ip);
+		zpl_inode_set_ctime_to_ts(ip, current_time(ip));
 		zfs_mark_inode_dirty(ip);
 	}
@@ -1011,7 +1011,8 @@ zpl_set_acl_impl(struct inode *ip, struct posix_acl *acl, int type)
 				 */
 				if (ip->i_mode != mode) {
 					ip->i_mode = ITOZ(ip)->z_mode = mode;
-					ip->i_ctime = current_time(ip);
+					zpl_inode_set_ctime_to_ts(ip,
 					    current_time(ip));
 					zfs_mark_inode_dirty(ip);
 				}
@@ -1170,7 +1171,7 @@ zpl_init_acl(struct inode *ip, struct inode *dir)
 			return (PTR_ERR(acl));
 		if (!acl) {
 			ITOZ(ip)->z_mode = (ip->i_mode &= ~current_umask());
-			ip->i_ctime = current_time(ip);
+			zpl_inode_set_ctime_to_ts(ip, current_time(ip));
 			zfs_mark_inode_dirty(ip);
 			return (0);
 		}
@@ -873,7 +873,13 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
 	switch (cmd) {
 	case BLKFLSBUF:
 #ifdef HAVE_FSYNC_BDEV
 		fsync_bdev(bdev);
 #elif defined(HAVE_SYNC_BLOCKDEV)
 		sync_blockdev(bdev);
 #else
 #error "Neither fsync_bdev() nor sync_blockdev() found"
 #endif
 		invalidate_bdev(bdev);
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
@@ -1620,6 +1626,18 @@ MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
 module_param(zvol_volmode, uint, 0644);
 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
 #ifdef HAVE_BLK_MQ
 module_param(zvol_blk_mq_queue_depth, uint, 0644);
 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
 module_param(zvol_use_blk_mq, uint, 0644);
 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
    "Process volblocksize blocks per thread");
 #endif
 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 module_param(zvol_open_timeout_ms, uint, 0644);
 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
@@ -886,6 +886,8 @@ static void l2arc_do_free_on_write(void);
 static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
    boolean_t state_only);
 static void arc_prune_async(uint64_t adjust);
 #define	l2arc_hdr_arcstats_increment(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
 #define	l2arc_hdr_arcstats_decrement(hdr) \
@@ -1364,7 +1366,7 @@ arc_buf_is_shared(arc_buf_t *buf)
 	    abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
 	    buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
 	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
-	IMPLY(shared, ARC_BUF_SHARED(buf));
+	EQUIV(shared, ARC_BUF_SHARED(buf));
 	IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
 	/*
@@ -1998,7 +2000,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 	IMPLY(encrypted, HDR_ENCRYPTED(hdr));
 	IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
 	IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
-	IMPLY(encrypted, !ARC_BUF_SHARED(buf));
+	IMPLY(encrypted, !arc_buf_is_shared(buf));
 	/*
 	 * If the caller wanted encrypted data we just need to copy it from
@@ -2066,7 +2068,9 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 	}
 	if (hdr_compressed == compressed) {
-		if (!arc_buf_is_shared(buf)) {
+		if (ARC_BUF_SHARED(buf)) {
 			ASSERT(arc_buf_is_shared(buf));
 		} else {
 			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
 			    arc_buf_size(buf));
 		}
@@ -2078,7 +2082,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 		 * If the buf is sharing its data with the hdr, unlink it and
 		 * allocate a new data buffer for the buf.
 		 */
-		if (arc_buf_is_shared(buf)) {
+		if (ARC_BUF_SHARED(buf)) {
 			ASSERT(ARC_BUF_COMPRESSED(buf));
 			/* We need to give the buf its own b_data */
@@ -2090,6 +2094,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 			/* Previously overhead was 0; just add new overhead */
 			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
 		} else if (ARC_BUF_COMPRESSED(buf)) {
 			ASSERT(!arc_buf_is_shared(buf));
 			/* We need to reallocate the buf's b_data */
 			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
 			    buf);
@@ -2217,7 +2223,7 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
 	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 	    buf = buf->b_next) {
-		if (arc_buf_is_shared(buf))
+		if (ARC_BUF_SHARED(buf))
 			continue;
 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
 		    arc_buf_size(buf), buf);
@@ -2256,7 +2262,7 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
 	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 	    buf = buf->b_next) {
-		if (arc_buf_is_shared(buf))
+		if (ARC_BUF_SHARED(buf))
 			continue;
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    arc_buf_size(buf), buf);
@@ -2481,7 +2487,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
 				 * add to the refcount if the arc_buf_t is
 				 * not shared.
 				 */
-				if (arc_buf_is_shared(buf))
+				if (ARC_BUF_SHARED(buf))
 					continue;
 				(void) zfs_refcount_add_many(
@@ -2537,7 +2543,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
 				 * add to the refcount if the arc_buf_t is
 				 * not shared.
 				 */
-				if (arc_buf_is_shared(buf))
+				if (ARC_BUF_SHARED(buf))
 					continue;
 				(void) zfs_refcount_remove_many(
@@ -3061,9 +3067,10 @@ arc_buf_destroy_impl(arc_buf_t *buf)
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);
-		if (arc_buf_is_shared(buf)) {
+		if (ARC_BUF_SHARED(buf)) {
 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 		} else {
 			ASSERT(!arc_buf_is_shared(buf));
 			uint64_t size = arc_buf_size(buf);
 			arc_free_data_buf(hdr, buf->b_data, size, buf);
 			ARCSTAT_INCR(arcstat_overhead_size, -size);
@@ -3104,9 +3111,9 @@ arc_buf_destroy_impl(arc_buf_t *buf)
 		 */
 		if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
 			/* Only one buf can be shared at once */
-			VERIFY(!arc_buf_is_shared(lastbuf));
+			ASSERT(!arc_buf_is_shared(lastbuf));
 			/* hdr is uncompressed so can't have compressed buf */
-			VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
+			ASSERT(!ARC_BUF_COMPRESSED(lastbuf));
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			arc_hdr_free_abd(hdr, B_FALSE);
@@ -5863,12 +5870,9 @@ top:
 			 * 3. This buffer isn't currently writing to the L2ARC.
 			 * 4. The L2ARC entry wasn't evicted, which may
 			 *    also have invalidated the vdev.
 			 * 5. This isn't prefetch or l2arc_noprefetch is 0.
 			 */
 			if (HDR_HAS_L2HDR(hdr) &&
-			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
+			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
 			    !(l2arc_noprefetch &&
 			    (*arc_flags & ARC_FLAG_PREFETCH))) {
 				l2arc_read_callback_t *cb;
 				abd_t *abd;
 				uint64_t asize;
@@ -6048,6 +6052,56 @@ arc_remove_prune_callback(arc_prune_t *p)
 	kmem_free(p, sizeof (*p));
 }
 /*
 * Helper function for arc_prune_async() it is responsible for safely
 * handling the execution of a registered arc_prune_func_t.
 */
 static void
 arc_prune_task(void *ptr)
 {
 	arc_prune_t *ap = (arc_prune_t *)ptr;
 	arc_prune_func_t *func = ap->p_pfunc;
 	if (func != NULL)
 		func(ap->p_adjust, ap->p_private);
 	zfs_refcount_remove(&ap->p_refcnt, func);
 }
 /*
 * Notify registered consumers they must drop holds on a portion of the ARC
 * buffers they reference.  This provides a mechanism to ensure the ARC can
 * honor the metadata limit and reclaim otherwise pinned ARC buffers.
 *
 * This operation is performed asynchronously so it may be safely called
 * in the context of the arc_reclaim_thread().  A reference is taken here
 * for each registered arc_prune_t and the arc_prune_task() is responsible
 * for releasing it once the registered arc_prune_func_t has completed.
 */
 static void
 arc_prune_async(uint64_t adjust)
 {
 	arc_prune_t *ap;
 	mutex_enter(&arc_prune_mtx);
 	for (ap = list_head(&arc_prune_list); ap != NULL;
 	    ap = list_next(&arc_prune_list, ap)) {
 		if (zfs_refcount_count(&ap->p_refcnt) >= 2)
 			continue;
 		zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
 		ap->p_adjust = adjust;
 		if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
 		    ap, TQ_SLEEP) == TASKQID_INVALID) {
 			zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
 			continue;
 		}
 		ARCSTAT_BUMP(arcstat_prune);
 	}
 	mutex_exit(&arc_prune_mtx);
 }
 /*
 * Notify the arc that a block was freed, and thus will never be used again.
 */
@@ -6189,7 +6243,7 @@ arc_release(arc_buf_t *buf, const void *tag)
 		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
 		VERIFY3S(remove_reference(hdr, tag), >, 0);
-		if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
+		if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
 			ASSERT(ARC_BUF_LAST(buf));
 		}
@@ -6206,9 +6260,9 @@ arc_release(arc_buf_t *buf, const void *tag)
 		 * If the current arc_buf_t and the hdr are sharing their data
 		 * buffer, then we must stop sharing that block.
 		 */
-		if (arc_buf_is_shared(buf)) {
+		if (ARC_BUF_SHARED(buf)) {
 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
-			VERIFY(!arc_buf_is_shared(lastbuf));
+			ASSERT(!arc_buf_is_shared(lastbuf));
 			/*
 			 * First, sever the block sharing relationship between
@@ -6241,7 +6295,7 @@ arc_release(arc_buf_t *buf, const void *tag)
 			 */
 			ASSERT(arc_buf_is_shared(lastbuf) ||
 			    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
-			ASSERT(!ARC_BUF_SHARED(buf));
+			ASSERT(!arc_buf_is_shared(buf));
 		}
 		ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
@@ -6335,9 +6389,10 @@ arc_write_ready(zio_t *zio)
 		arc_cksum_free(hdr);
 		arc_buf_unwatch(buf);
 		if (hdr->b_l1hdr.b_pabd != NULL) {
-			if (arc_buf_is_shared(buf)) {
+			if (ARC_BUF_SHARED(buf)) {
 				arc_unshare_buf(hdr, buf);
 			} else {
 				ASSERT(!arc_buf_is_shared(buf));
 				arc_hdr_free_abd(hdr, B_FALSE);
 			}
 		}
@@ -6636,9 +6691,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
 		 * The hdr will remain with a NULL data pointer and the
 		 * buf will take sole ownership of the block.
 		 */
-		if (arc_buf_is_shared(buf)) {
+		if (ARC_BUF_SHARED(buf)) {
 			arc_unshare_buf(hdr, buf);
 		} else {
 			ASSERT(!arc_buf_is_shared(buf));
 			arc_hdr_free_abd(hdr, B_FALSE);
 		}
 		VERIFY3P(buf->b_data, !=, NULL);
@@ -28,6 +28,7 @@
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/brt.h>
 #include <sys/brt_impl.h>
 #include <sys/ddt.h>
 #include <sys/bitmap.h>
 #include <sys/zap.h>
@@ -234,178 +235,15 @@
 * destination dataset is mounted and its ZIL replayed.
 * To address this situation we leverage zil_claim() mechanism where ZFS will
 * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
- * entries, we will bump reference counters for their BPs in the BRT and then
+ * entries, we will bump reference counters for their BPs in the BRT.  Then
- * on mount and ZIL replay we will just attach BPs to the file without
+ * on mount and ZIL replay we bump the reference counters once more, while the
- * bumping reference counters.
+ * first references are dropped during ZIL destroy by zil_free_clone_range().
- * Note it is still possible that after zil_claim() we never mount the
+ * It is possible that after zil_claim() we never mount the destination, so
- * destination, so we never replay its ZIL and we destroy it. This way we would
+ * we never replay its ZIL and just destroy it.  In this case the only taken
- * end up with leaked references in BRT. We address that too as ZFS gives us
+ * references will be dropped by zil_free_clone_range(), since the cloning is
- * a chance to clean this up on dataset destroy (see zil_free_clone_range()).
+ * not going to ever take place.
 */
 /*
 * BRT - Block Reference Table.
 */
 #define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
 /*
 * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
 * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
 * Each element in this array represents how many BRT entries do we have in this
 * chunk of storage. We always load this entire array into memory and update as
 * needed. By having it in memory we can quickly tell (during zio_free()) if
 * there are any BRT entries that we might need to update.
 *
 * This value cannot be larger than 16MB, at least as long as we support
 * 512 byte block sizes. With 512 byte block size we can have exactly
 * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
 * many for a 16bit counter.
 */
 #define	BRT_RANGESIZE	(16 * 1024 * 1024)
 _Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
 	"BRT_RANGESIZE is too large.");
 /*
 * We don't want to update the whole structure every time. Maintain bitmap
 * of dirty blocks within the regions, so that a single bit represents a
 * block size of entcounts. For example if we have a 1PB vdev then all
 * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
 * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
 * the whole 128MB on disk when we have updated only a single entcount.
 * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
 * is represented by a single bit. This gives us 4096 bits. A set bit in the
 * bitmap means that we had a change in at least one of the 16384 entcounts
 * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
 */
 #define	BRT_BLOCKSIZE	(32 * 1024)
 #define	BRT_RANGESIZE_TO_NBLOCKS(size)					\
 	(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
 #define	BRT_LITTLE_ENDIAN	0
 #define	BRT_BIG_ENDIAN		1
 #ifdef _ZFS_LITTLE_ENDIAN
 #define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
 #define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
 #else
 #define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
 #define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
 #endif
 /*
 * Per-VDEV BRT metadata as kept on disk (brt_vdev_t refers to it in the
 * comments on bv_mos_brtvdev and bv_meta_dirty).
 * NOTE(review): the bvp_* fields appear to mirror the bv_* fields of the
 * same name in brt_vdev_t; confirm against the code that reads and writes
 * this structure.
 */
 typedef struct brt_vdev_phys {
 	uint64_t	bvp_mos_entries;
 	uint64_t	bvp_size;
 	/* Presumably BRT_LITTLE_ENDIAN or BRT_BIG_ENDIAN -- TODO confirm. */
 	uint64_t	bvp_byteorder;
 	uint64_t	bvp_totalcount;
 	uint64_t	bvp_rangesize;
 	uint64_t	bvp_usedspace;
 	uint64_t	bvp_savedspace;
 } brt_vdev_phys_t;
 /*
 * In-core per-VDEV BRT state.
 */
 typedef struct brt_vdev {
 	/*
 	 * VDEV id.
 	 */
 	uint64_t	bv_vdevid;
 	/*
 	 * Is the structure initiated?
 	 * (bv_entcount and bv_bitmap are allocated?)
 	 */
 	boolean_t	bv_initiated;
 	/*
 	 * Object number in the MOS for the entcount array and brt_vdev_phys.
 	 */
 	uint64_t	bv_mos_brtvdev;
 	/*
 	 * Object number in the MOS for the entries table.
 	 */
 	uint64_t	bv_mos_entries;
 	/*
 	 * Entries to sync.
 	 */
 	avl_tree_t	bv_tree;
 	/*
 	 * Does the bv_entcount[] array need byte swapping?
 	 */
 	boolean_t	bv_need_byteswap;
 	/*
 	 * Number of entries in the bv_entcount[] array.
 	 */
 	uint64_t	bv_size;
 	/*
 	 * This is the array with BRT entry count per BRT_RANGESIZE.
 	 */
 	uint16_t	*bv_entcount;
 	/*
 	 * Sum of all bv_entcount[]s.
 	 */
 	uint64_t	bv_totalcount;
 	/*
 	 * Space on disk occupied by cloned blocks (without compression).
 	 */
 	uint64_t	bv_usedspace;
 	/*
 	 * How much additional space would be occupied without block cloning.
 	 */
 	uint64_t	bv_savedspace;
 	/*
 	 * brt_vdev_phys needs updating on disk.
 	 */
 	boolean_t	bv_meta_dirty;
 	/*
 	 * bv_entcount[] needs updating on disk.
 	 */
 	boolean_t	bv_entcount_dirty;
 	/*
 	 * bv_entcount[] potentially can be a bit too big to synchronize it all
 	 * when we just changed few entcounts. The fields below allow us to
 	 * track updates to bv_entcount[] array since the last sync.
 	 * A single bit in the bv_bitmap represents as many entcounts as can
 	 * fit into a single BRT_BLOCKSIZE.
 	 * For example we have 65536 entcounts in the bv_entcount array
 	 * (so the whole array is 128kB). We updated bv_entcount[2] and
 	 * bv_entcount[5]. In that case only first bit in the bv_bitmap will
 	 * be set and we will write only first BRT_BLOCKSIZE out of 128kB.
 	 */
 	ulong_t		*bv_bitmap;
 	/*
 	 * NOTE(review): presumably the number of BRT_BLOCKSIZE blocks (and
 	 * therefore bits) tracked by bv_bitmap; confirm against the users of
 	 * BRT_RANGESIZE_TO_NBLOCKS().
 	 */
 	uint64_t	bv_nblocks;
 } brt_vdev_t;
 /*
 * In-core brt
 */
 typedef struct brt {
 	krwlock_t	brt_lock;
 	spa_t		*brt_spa;
 	/* Shorthand for the meta-objset reached through brt_spa. */
 #define	brt_mos		brt_spa->spa_meta_objset
 	uint64_t	brt_rangesize;
 	/*
 	 * NOTE(review): presumably the totals of the per-VDEV bv_usedspace
 	 * and bv_savedspace counters; confirm against the code updating them.
 	 */
 	uint64_t	brt_usedspace;
 	uint64_t	brt_savedspace;
 	/*
 	 * One pending tree and one lock per TXG_SIZE slot.
 	 * NOTE(review): presumably brt_pending_lock[i] protects
 	 * brt_pending_tree[i]; confirm.
 	 */
 	avl_tree_t	brt_pending_tree[TXG_SIZE];
 	kmutex_t	brt_pending_lock[TXG_SIZE];
 	/* Sum of all entries across all bv_trees. */
 	uint64_t	brt_nentries;
 	/*
 	 * NOTE(review): presumably an array of per-VDEV state and the number
 	 * of elements in it; confirm.
 	 */
 	brt_vdev_t	*brt_vdevs;
 	uint64_t	brt_nvdevs;
 } brt_t;
 /* Size of bre_offset / sizeof (uint64_t). */
 #define	BRT_KEY_WORDS	(1)
 /*
 * In-core brt entry.
 * On-disk we use bre_offset as the key and bre_refcount as the value.
 */
 typedef struct brt_entry {
 	uint64_t	bre_offset;
 	uint64_t	bre_refcount;
 	avl_node_t	bre_node;
 } brt_entry_t;
 /*
 * Pending BRT entry: a block pointer together with a count, linkable into
 * an AVL tree through bpe_node.
 * NOTE(review): presumably these are the nodes of brt_pending_tree[] in
 * brt_t and bpe_count is the number of pending references for bpe_bp;
 * confirm against brt_pending_add().
 */
 typedef struct brt_pending_entry {
 	blkptr_t	bpe_bp;
 	int		bpe_count;
 	avl_node_t	bpe_node;
 } brt_pending_entry_t;
 static kmem_cache_t *brt_entry_cache;
 static kmem_cache_t *brt_pending_entry_cache;
@@ -2700,15 +2700,23 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
 	 * writes and clones into this block.
 	 */
 	mutex_enter(&db->db_mtx);
 	DBUF_VERIFY(db);
 	VERIFY(!dbuf_undirty(db, tx));
 	ASSERT3P(dbuf_find_dirty_eq(db, tx->tx_txg), ==, NULL);
 	if (db->db_buf != NULL) {
 		arc_buf_destroy(db->db_buf, db);
 		db->db_buf = NULL;
 		dbuf_clear_data(db);
 	}
 	db->db_state = DB_NOFILL;
 	DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
-	dmu_buf_will_not_fill(db_fake, tx);
+	dbuf_noread(db);
 	(void) dbuf_dirty(db, tx);
 }
 void
@@ -2267,7 +2267,7 @@ out:
 int
 dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
-    dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay)
+    dmu_tx_t *tx, const blkptr_t *bps, size_t nbps)
 {
 	spa_t *spa;
 	dmu_buf_t **dbp, *dbuf;
@@ -2341,10 +2341,8 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 		 * When data is embedded into BP there is no need to create
 		 * BRT entry as there is no data block. Just copy the BP as
 		 * it contains the data.
 		 * Also, when replaying ZIL we don't want to bump references
 		 * in the BRT as it was already done during ZIL claim.
 		 */
-		if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 			brt_pending_add(spa, bp, tx);
 		}
 	}
@@ -210,10 +210,12 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 	dmu_buf_impl_t *db;
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	db = dbuf_hold_level(dn, level, blkid, FTAG);
+	err = dbuf_hold_impl(dn, level, blkid, TRUE, FALSE, FTAG, &db);
 	rw_exit(&dn->dn_struct_rwlock);
-	if (db == NULL)
+	if (err == ENOENT)
-		return (SET_ERROR(EIO));
+		return (0);
 	if (err != 0)
 		return (err);
 	/*
 	 * PARTIAL_FIRST allows caching for uncacheable blocks.  It will
 	 * be cleared after dmu_buf_will_dirty() call dbuf_read() again.
@@ -1764,7 +1764,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
 }
 /*
- * Checks if the dnode contains any uncommitted dirty records.
+ * Checks if the dnode itself is dirty, or is carrying any uncommitted records.
 * It is important to check both conditions, as some operations (eg appending
 * to a file) can dirty both as a single logical unit, but they are not synced
 * out atomically, so checking one and not the other can result in an object
 * appearing to be clean mid-way through a commit.
 *
 * Do not change this lightly! If you get it wrong, dmu_offset_next() can
 * detect a hole where there is really data, leading to silent corruption.
 */
 boolean_t
 dnode_is_dirty(dnode_t *dn)
@@ -1772,7 +1779,8 @@ dnode_is_dirty(dnode_t *dn)
 	mutex_enter(&dn->dn_mtx);
 	for (int i = 0; i < TXG_SIZE; i++) {
-		if (multilist_link_active(&dn->dn_dirty_link[i])) {
+		if (multilist_link_active(&dn->dn_dirty_link[i]) ||
 		    !list_is_empty(&dn->dn_dirty_records[i])) {
 			mutex_exit(&dn->dn_mtx);
 			return (B_TRUE);
 		}
@@ -965,18 +965,18 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 	uint64_t delay_min_bytes =
 	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
-	mutex_enter(&dp->dp_lock);
+	/*
-	uint64_t dirty = dp->dp_dirty_total;
+	 * We are not taking the dp_lock here and few other places, since torn
-	mutex_exit(&dp->dp_lock);
+	 * reads are unlikely: on 64-bit systems due to register size and on
-
+	 * 32-bit due to memory constraints.  Pool-wide locks in hot path may
-	return (dirty > delay_min_bytes);
+	 * be too expensive, while we do not need a precise result here.
 	 */
 	return (dp->dp_dirty_total > delay_min_bytes);
 }
 static boolean_t
 dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg)
 {
 	ASSERT(MUTEX_HELD(&dp->dp_lock));
 	uint64_t dirty_min_bytes =
 	    zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
 	uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
@@ -367,23 +367,24 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent,
 * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
 * information for all pool visible within the zone.
 */
-nvlist_t *
+int
-spa_all_configs(uint64_t *generation)
+spa_all_configs(uint64_t *generation, nvlist_t **pools)
 {
 	nvlist_t *pools;
 	spa_t *spa = NULL;
 	if (*generation == spa_config_generation)
-		return (NULL);
+		return (SET_ERROR(EEXIST));
-	pools = fnvlist_alloc();
+	int error = mutex_enter_interruptible(&spa_namespace_lock);
 	if (error)
 		return (SET_ERROR(EINTR));
-	mutex_enter(&spa_namespace_lock);
+	*pools = fnvlist_alloc();
 	while ((spa = spa_next(spa)) != NULL) {
 		if (INGLOBALZONE(curproc) ||
 		    zone_dataset_visible(spa_name(spa), NULL)) {
 			mutex_enter(&spa->spa_props_lock);
-			fnvlist_add_nvlist(pools, spa_name(spa),
+			fnvlist_add_nvlist(*pools, spa_name(spa),
 			    spa->spa_config);
 			mutex_exit(&spa->spa_props_lock);
 		}
@@ -391,7 +392,7 @@ spa_all_configs(uint64_t *generation)
 	*generation = spa_config_generation;
 	mutex_exit(&spa_namespace_lock);
-	return (pools);
+	return (0);
 }
 void
@@ -4215,6 +4215,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 		/* XXX - L2ARC 1.0 does not support expansion */
 		if (vd->vdev_aux)
 			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
 		spa->spa_ccw_fail_time = 0;
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
@@ -273,8 +273,10 @@ vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio)
 {
 	zio_priority_t p = zio->io_priority;
 	vq->vq_cqueued |= 1U << p;
-	if (vdev_queue_class_fifo(p))
+	if (vdev_queue_class_fifo(p)) {
 		list_insert_tail(&vq->vq_class[p].vqc_list, zio);
 		vq->vq_class[p].vqc_list_numnodes++;
 	}
 	else
 		avl_add(&vq->vq_class[p].vqc_tree, zio);
 }
@@ -288,6 +290,7 @@ vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio)
 		list_t *list = &vq->vq_class[p].vqc_list;
 		list_remove(list, zio);
 		empty = list_is_empty(list);
 		vq->vq_class[p].vqc_list_numnodes--;
 	} else {
 		avl_tree_t *tree = &vq->vq_class[p].vqc_tree;
 		avl_remove(tree, zio);
@@ -1069,7 +1072,7 @@ vdev_queue_class_length(vdev_t *vd, zio_priority_t p)
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 	if (vdev_queue_class_fifo(p))
-		return (list_is_empty(&vq->vq_class[p].vqc_list) == 0);
+		return (vq->vq_class[p].vqc_list_numnodes);
 	else
 		return (avl_numnodes(&vq->vq_class[p].vqc_tree));
 }
@@ -1582,8 +1582,9 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc)
 	nvlist_t *configs;
 	int error;
-	if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
+	error = spa_all_configs(&zc->zc_cookie, &configs);
-		return (SET_ERROR(EEXIST));
+	if (error)
 		return (error);
 	error = put_nvlist(zc, configs);
@@ -1094,6 +1094,15 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
 	ASSERT(!outzfsvfs->z_replay);
 	/*
 	 * Block cloning from an unencrypted dataset into an encrypted
 	 * dataset and vice versa is not supported.
 	 */
 	if (inos->os_encrypted != outos->os_encrypted) {
 		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
 	error = zfs_verify_zp(inzp);
 	if (error == 0)
 		error = zfs_verify_zp(outzp);
@@ -1324,7 +1333,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
 		}
 		error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
-		    bps, nbps, B_FALSE);
+		    bps, nbps);
 		if (error != 0) {
 			dmu_tx_commit(tx);
 			break;
@@ -1458,7 +1467,7 @@ zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
 	if (zp->z_blksz < blksz)
 		zfs_grow_blocksize(zp, blksz, tx);
-	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE);
+	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps);
 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
@@ -145,7 +145,7 @@ static int zil_nocacheflush = 0;
 * Any writes above that will be executed with lower (asynchronous) priority
 * to limit potential SLOG device abuse by single active ZIL writer.
 */
-static uint64_t zil_slog_bulk = 768 * 1024;
+static uint64_t zil_slog_bulk = 64 * 1024 * 1024;
 static kmem_cache_t *zil_lwb_cache;
 static kmem_cache_t *zil_zcw_cache;
@@ -24,6 +24,7 @@ BuildRoot:      %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
 BuildArch:      noarch
 Requires:       dkms >= 2.2.0.3
 Requires(pre):  dkms >= 2.2.0.3
 Requires(post): dkms >= 2.2.0.3
 Requires(preun): dkms >= 2.2.0.3
 Requires:       gcc, make, perl, diffutils
@@ -68,9 +69,93 @@ fi
 %defattr(-,root,root)
 /usr/src/%{module}-%{version}
 %pre
 echo "Running pre installation script: $0. Parameters: $*"
 # We don't want any other versions lingering around in dkms.
 # Tests with 'dnf' showed that in case of reinstall, or upgrade
 #  the preun scriptlet removed the version we are trying to install.
 # Because of this, find all zfs dkms sources in /var/lib/dkms and
 #  remove them, if we find a matching version in dkms.
 dkms_root=/var/lib/dkms
 if [ -d ${dkms_root}/%{module} ]; then
    cd ${dkms_root}/%{module}
    for x in [[:digit:]]*; do
        [ -d "$x" ] || continue
        otherver="$x"
        opath="${dkms_root}/%{module}/${otherver}"
        if [ "$otherver" != %{version} ]; then
            # This is a workaround for a broken 'dkms status', we caused in a previous version.
            # One day it might be not needed anymore, but it does not hurt to keep it.
            if dkms status -m %{module} -v "$otherver" 2>&1 | grep "${opath}/source/dkms.conf does not exist"
            then
                echo "ERROR: dkms status is broken!" >&2
                if [ -L "${opath}/source" -a ! -d "${opath}/source" ]
                then
                    echo "Trying to fix it by removing the symlink: ${opath}/source" >&2
                    echo "You should manually remove ${opath}" >&2
                    rm -f "${opath}/source" || echo "Removal failed!" >&2
                fi
            fi
            if [ `dkms status -m %{module} -v "$otherver" | grep -c %{module}` -gt 0 ]; then
                echo "Removing old %{module} dkms modules version $otherver from all kernels."
                dkms remove -m %{module} -v "$otherver" --all ||:
            fi
        fi
    done
    cd ${dkms_root}
 fi
 # Uninstall this version of zfs dkms modules before installation of the package.
 if [ `dkms status -m %{module} -v %{version} | grep -c %{module}` -gt 0 ]; then
    echo "Removing %{module} dkms modules version %{version} from all kernels."
    dkms remove -m %{module} -v %{version} --all ||:
 fi
 %post
 echo "Running post installation script: $0. Parameters: $*"
 # Add the module to dkms, as recommended in the dkms man page.
 # This is generally rpm specific.
 # But this also may help, if we have a broken 'dkms status'.
 # Because, if the sources are available and only the symlink pointing
 #  to them is missing, this will resolve the situation
 echo "Adding %{module} dkms modules version %{version} to dkms."
 dkms add -m %{module} -v %{version} %{!?not_rpm:--rpm_safe_upgrade} ||:
 # After installing the package, dkms install this zfs version for the current kernel.
 # Force the overwriting of old modules to avoid diff warnings in dkms status.
 # Or in case of a downgrade to overwrite newer versions.
 # Or if some other backed up versions have been restored before.
 echo "Installing %{module} dkms modules version %{version} for the current kernel."
 dkms install --force -m %{module} -v %{version} ||:
 %preun
-dkms remove -m %{module} -v %{version} --all
+dkms_root="/var/lib/dkms/%{module}/%{version}"
 echo "Running pre uninstall script: $0. Parameters: $*"
 # In case of upgrade we do nothing. See above comment in pre hook.
 if [ "$1" = "1" -o "$1" = "upgrade" ] ; then
    echo "This is an upgrade. Skipping pre uninstall action."
    exit 0
 fi
-%posttrans
+# Check if we uninstall the package. In that case remove the dkms modules.
-/usr/lib/dkms/common.postinst %{module} %{version}
+# '0' is the value for the first parameter for rpm packages.
 # 'remove' or 'purge' are the possible names for deb packages.
 if [ "$1" = "0" -o "$1" = "remove" -o "$1" = "purge" ] ; then
    if [ `dkms status -m %{module} -v %{version} | grep -c %{module}` -gt 0 ]; then
        echo "Removing %{module} dkms modules version %{version} from all kernels."
        dkms remove -m %{module} -v %{version} --all %{!?not_rpm:--rpm_safe_upgrade} && exit 0
    fi
    # If removing the modules failed, it might be because of the broken 'dkms status'.
    if dkms status -m %{module} -v %{version} 2>&1 | grep "${dkms_root}/source/dkms.conf does not exist"
    then
        echo "ERROR: dkms status is broken!" >&2
        echo "You should manually remove ${dkms_root}" >&2
        echo "WARNING: installed modules in /lib/modules/`uname -r`/extra could not be removed automatically!" >&2
    fi
 else
    echo "Script parameter $1 did not match any removal condition."
 fi
 exit 0
@@ -20,6 +20,8 @@ scripts_scripts = \
 if CONFIG_USER
 dist_scripts_SCRIPTS = $(scripts_scripts)
 dist_zfsexec_SCRIPTS = \
 	%D%/zfs_prepare_disk
 else
 dist_noinst_SCRIPTS += $(scripts_scripts)
 endif
@@ -0,0 +1,17 @@
 #!/bin/sh
 #
 # This is an optional helper script that is automatically called by libzfs
 # before a disk is about to be added into the pool.  It can be modified by
 # the user to run whatever commands are necessary to prepare a disk for
 # inclusion into the pool.  For example, users can add lines to this
 # script to do things like update the drive's firmware or check the drive's
 # health.  The script is optional and can be removed if it is not needed.
 #
 # See the zfs_prepare_disk(8) man page for details.
 #
 # Example:
 #
 # echo "Prepare disk $VDEV_PATH ($VDEV_UPATH) for $VDEV_PREPARE in $POOL_NAME"
 #
 exit 0
@@ -122,10 +122,10 @@ tags = ['functional', 'fallocate']
 [tests/functional/fault:Linux]
 tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
-    'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos',
+    'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
-    'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared',
+    'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
-    'decrypt_fault', 'decompress_fault', 'scrub_after_resilver',
+    'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
-    'zpool_status_-s']
+    'scrub_after_resilver', 'zpool_status_-s']
 tags = ['functional', 'fault']
 [tests/functional/features/large_dnode:Linux]
@@ -328,6 +328,7 @@ if os.environ.get('CI') == 'true':
        'fault/auto_online_001_pos': ['SKIP', ci_reason],
        'fault/auto_online_002_pos': ['SKIP', ci_reason],
        'fault/auto_replace_001_pos': ['SKIP', ci_reason],
        'fault/auto_replace_002_pos': ['SKIP', ci_reason],
        'fault/auto_spare_ashift': ['SKIP', ci_reason],
        'fault/auto_spare_shared': ['SKIP', ci_reason],
        'procfs/pool_state': ['SKIP', ci_reason],
@@ -130,12 +130,14 @@ export SYSTEM_FILES_LINUX='attr
    chattr
    exportfs
    fallocate
    flock
    free
    getfattr
    groupadd
    groupdel
    groupmod
    hostid
    logger
    losetup
    lsattr
    lsblk
@@ -145,21 +147,20 @@ export SYSTEM_FILES_LINUX='attr
    md5sum
    mkswap
    modprobe
    mountpoint
    mpstat
    nsenter
    parted
    perf
    setfattr
    setpriv
    sha256sum
    udevadm
    unshare
    useradd
    userdel
    usermod
-    setpriv
+    wipefs'
    mountpoint
    flock
    logger'
 export ZFS_FILES='zdb
    zfs
@@ -37,6 +37,12 @@
 . ${STF_SUITE}/include/math.shlib
 . ${STF_SUITE}/include/blkdev.shlib
 # On AlmaLinux 9 we will see $PWD = '.' instead of the full path.  This causes
 # some tests to fail.  Fix it up here.
 if [ "$PWD" = "." ] ; then
 	PWD="$(readlink -f $PWD)"
 fi
 #
 # Apply constrained path when available.  This is required since the
 # PATH may have been modified by sudo's secure_path behavior.
@@ -3334,6 +3340,21 @@ function set_tunable_impl
 	esac
 }
 #
 # Save the current value of tunable $1 (as reported by get_tunable) in
 # the file $TEST_BASE_DIR/tunable-$1.
 #
 # Returns 1 if $TEST_BASE_DIR is not a directory and 2 if a saved value
 # for this tunable already exists; the existing file is left untouched.
 #
 function save_tunable
 {
 	[[ ! -d $TEST_BASE_DIR ]] && return 1
 	[[ -e $TEST_BASE_DIR/tunable-$1 ]] && return 2
 	echo "$(get_tunable """$1""")" > "$TEST_BASE_DIR"/tunable-"$1"
 }
 #
 # Restore tunable $1 from the value stored in $TEST_BASE_DIR/tunable-$1
 # by passing it to set_tunable64, then delete the file.
 #
 # Returns 1 if no saved value exists for this tunable.
 #
 function restore_tunable
 {
 	[[ ! -e $TEST_BASE_DIR/tunable-$1 ]] && return 1
 	# NOTE(review): val is not declared with typeset, so it is not
 	# local to this function.
 	val="$(cat $TEST_BASE_DIR/tunable-"""$1""")"
 	set_tunable64 "$1" "$val"
 	rm $TEST_BASE_DIR/tunable-$1
 }
 #
 # Get a global system tunable
 #
@@ -89,7 +89,8 @@ VDEV_VALIDATE_SKIP		vdev.validate_skip		vdev_validate_skip
 VOL_INHIBIT_DEV			UNSUPPORTED			zvol_inhibit_dev
 VOL_MODE			vol.mode			zvol_volmode
 VOL_RECURSIVE			vol.recursive			UNSUPPORTED
-VOL_USE_BLK_MQ			UNSUPPORTED			UNSUPPORTED
+VOL_USE_BLK_MQ			UNSUPPORTED			zvol_use_blk_mq
 BCLONE_ENABLED			zfs_bclone_enabled		zfs_bclone_enabled
 XATTR_COMPAT			xattr_compat			zfs_xattr_compat
 ZEVENT_LEN_MAX			zevent.len_max			zfs_zevent_len_max
 ZEVENT_RETAIN_MAX		zevent.retain_max		zfs_zevent_retain_max
@@ -1431,6 +1431,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/fault/auto_online_001_pos.ksh \
 	functional/fault/auto_online_002_pos.ksh \
 	functional/fault/auto_replace_001_pos.ksh \
 	functional/fault/auto_replace_002_pos.ksh \
 	functional/fault/auto_spare_001_pos.ksh \
 	functional/fault/auto_spare_002_pos.ksh \
 	functional/fault/auto_spare_ashift.ksh \
@@ -31,4 +31,8 @@ verify_runnable "global"
 default_cleanup_noexit
 if tunable_exists BCLONE_ENABLED ; then
 	log_must restore_tunable BCLONE_ENABLED
 fi
 log_pass
--- a/Show More
+++ b/Show More