Tag zfs-2.2.2

META file and changelog updated. Signed-off-by: Tony Hutter <hutter2@llnl.gov>
FreeBSD: Fix ZFS so that snapshots under .zfs/snapshot are NFS visible
2026-05-24 03:08:51 +03:00 · 2023-11-29 14:08:46 -08:00 · 2023-11-29 14:08:46 -08:00 · 2023-11-29 13:08:25 -08:00 · 2023-11-29 13:08:25 -08:00 · 2023-11-28 15:19:07 -08:00
26 changed files with 474 additions and 255 deletions
@@ -1,7 +1,7 @@
 Meta:          1
 Name:          zfs
 Branch:        1.0
-Version:       2.2.1
+Version:       2.2.2
 Release:       1
 Release-Tags:  relext
 License:       CDDL
@@ -32,4 +32,4 @@ For more details see the NOTICE, LICENSE and COPYRIGHT files; `UCRL-CODE-235197`

 # Supported Kernels
  * The `META` file contains the officially recognized supported Linux kernel versions.
-  * Supported FreeBSD versions are any supported branches and releases starting from 12.2-RELEASE.
+  * Supported FreeBSD versions are any supported branches and releases starting from 12.4-RELEASE.
@@ -34,6 +34,7 @@
 * Copyright (c) 2021 Allan Jude
 * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
 * Copyright (c) 2023, Klara Inc.
+ * Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
 */

 #include <stdio.h>
@@ -80,6 +81,7 @@
 #include <sys/dsl_scan.h>
 #include <sys/btree.h>
 #include <sys/brt.h>
+#include <sys/brt_impl.h>
 #include <zfs_comutil.h>
 #include <sys/zstd/zstd.h>

@@ -899,6 +901,8 @@ usage(void)
 	    "don't print label contents\n");
 	(void) fprintf(stderr, "        -t --txg=INTEGER             "
 	    "highest txg to use when searching for uberblocks\n");
+	(void) fprintf(stderr, "        -T --brt-stats               "
+	    "BRT statistics\n");
 	(void) fprintf(stderr, "        -u --uberblock               "
 	    "uberblock\n");
 	(void) fprintf(stderr, "        -U --cachefile=PATH          "
@@ -999,6 +1003,15 @@ zdb_nicenum(uint64_t num, char *buf, size_t buflen)
 		nicenum(num, buf, buflen);
 }

+static void
+zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen)
+{
+	if (dump_opt['P'])
+		(void) snprintf(buf, buflen, "%llu", (longlong_t)bytes);
+	else
+		zfs_nicebytes(bytes, buf, buflen);
+}
+
 static const char histo_stars[] = "****************************************";
 static const uint64_t histo_width = sizeof (histo_stars) - 1;

@@ -2081,6 +2094,76 @@ dump_all_ddts(spa_t *spa)
 	dump_dedup_ratio(&dds_total);
 }

+static void
+dump_brt(spa_t *spa)
+{
+	if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) {
+		printf("BRT: unsupported on this pool\n");
+		return;
+	}
+
+	if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
+		printf("BRT: empty\n");
+		return;
+	}
+
+	brt_t *brt = spa->spa_brt;
+	VERIFY(brt);
+
+	char count[32], used[32], saved[32];
+	zdb_nicebytes(brt_get_used(spa), used, sizeof (used));
+	zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));
+	uint64_t ratio = brt_get_ratio(spa);
+	printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved,
+	    (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100));
+
+	if (dump_opt['T'] < 2)
+		return;
+
+	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
+		if (brtvd == NULL)
+			continue;
+
+		if (!brtvd->bv_initiated) {
+			printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
+			continue;
+		}
+
+		zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count));
+		zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used));
+		zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved));
+		printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n",
+		    vdevid, count, used, saved);
+	}
+
+	if (dump_opt['T'] < 3)
+		return;
+
+	char dva[64];
+	printf("\n%-16s %-10s\n", "DVA", "REFCNT");
+
+	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
+		if (brtvd == NULL || !brtvd->bv_initiated)
+			continue;
+
+		zap_cursor_t zc;
+		zap_attribute_t za;
+		for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			uint64_t offset = *(uint64_t *)za.za_name;
+			uint64_t refcnt = za.za_first_integer;
+
+			snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid,
+			    (u_longlong_t)offset);
+			printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt);
+		}
+		zap_cursor_fini(&zc);
+	}
+}
+
 static void
 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
@@ -8093,6 +8176,9 @@ dump_zpool(spa_t *spa)
 	if (dump_opt['D'])
 		dump_all_ddts(spa);

+	if (dump_opt['T'])
+		dump_brt(spa);
+
 	if (dump_opt['d'] > 2 || dump_opt['m'])
 		dump_metaslabs(spa);
 	if (dump_opt['M'])
@@ -8879,6 +8965,7 @@ main(int argc, char **argv)
 		{"io-stats",		no_argument,		NULL, 's'},
 		{"simulate-dedup",	no_argument,		NULL, 'S'},
 		{"txg",			required_argument,	NULL, 't'},
+		{"brt-stats",		no_argument,		NULL, 'T'},
 		{"uberblock",		no_argument,		NULL, 'u'},
 		{"cachefile",		required_argument,	NULL, 'U'},
 		{"verbose",		no_argument,		NULL, 'v'},
@@ -8892,7 +8979,7 @@ main(int argc, char **argv)
 	};

 	while ((c = getopt_long(argc, argv,
-	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ",
+	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ",
 	    long_options, NULL)) != -1) {
 		switch (c) {
 		case 'b':
@@ -8914,6 +9001,7 @@ main(int argc, char **argv)
 		case 'R':
 		case 's':
 		case 'S':
+		case 'T':
 		case 'u':
 		case 'y':
 		case 'Z':
@@ -9076,22 +9164,6 @@ main(int argc, char **argv)
 	if (dump_opt['l'])
 		return (dump_label(argv[0]));

-	if (dump_opt['O']) {
-		if (argc != 2)
-			usage();
-		dump_opt['v'] = verbose + 3;
-		return (dump_path(argv[0], argv[1], NULL));
-	}
-	if (dump_opt['r']) {
-		target_is_spa = B_FALSE;
-		if (argc != 3)
-			usage();
-		dump_opt['v'] = verbose;
-		error = dump_path(argv[0], argv[1], &object);
-		if (error != 0)
-			fatal("internal error: %s", strerror(error));
-	}
-
 	if (dump_opt['X'] || dump_opt['F'])
 		rewind = ZPOOL_DO_REWIND |
 		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
@@ -9192,6 +9264,29 @@ main(int argc, char **argv)
 		searchdirs = NULL;
 	}

+	/*
+	 * We need to make sure to process -O option or call
+	 * dump_path after the -e option has been processed,
+	 * which imports the pool to the namespace if it's
+	 * not in the cachefile.
+	 */
+	if (dump_opt['O']) {
+		if (argc != 2)
+			usage();
+		dump_opt['v'] = verbose + 3;
+		return (dump_path(argv[0], argv[1], NULL));
+	}
+
+	if (dump_opt['r']) {
+		target_is_spa = B_FALSE;
+		if (argc != 3)
+			usage();
+		dump_opt['v'] = verbose;
+		error = dump_path(argv[0], argv[1], &object);
+		if (error != 0)
+			fatal("internal error: %s", strerror(error));
+	}
+
 	/*
 	 * import_checkpointed_state makes the assumption that the
 	 * target pool that we pass it is already part of the spa
@@ -47,7 +47,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [
 		#include <linux/fs.h>
 	],[
 		struct inode ip;
-		struct timespec64 ts;
+		struct timespec64 ts = {0};

 		memset(&ip, 0, sizeof(ip));
 		inode_set_ctime_to_ts(&ip, ts);
@@ -33,6 +33,7 @@ COMMON_H = \
 	sys/bqueue.h \
 	sys/btree.h \
 	sys/brt.h \
+	sys/brt_impl.h \
 	sys/dataset_kstats.h \
 	sys/dbuf.h \
 	sys/ddt.h \
@@ -101,7 +101,7 @@ void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
 void vfs_clearmntopt(vfs_t *vfsp, const char *name);
 int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp);
 int mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype,
-    char *fspath, char *fspec, int fsflags);
+    char *fspath, char *fspec, int fsflags, vfs_t *parent_vfsp);

 typedef	uint64_t	vfs_feature_t;

@@ -56,6 +56,7 @@ enum symfollow { NO_FOLLOW = NOFOLLOW };
 #ifndef IN_BASE
 #include_next <sys/vnode.h>
 #endif
+#include <sys/ccompat.h>
 #include <sys/mount.h>
 #include <sys/cred.h>
 #include <sys/fcntl.h>
@@ -104,7 +105,7 @@ vn_flush_cached_data(vnode_t *vp, boolean_t sync)
 		zfs_vmobject_wlock(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, flags);
 		zfs_vmobject_wunlock(vp->v_object);
-		VOP_UNLOCK(vp);
+		VOP_UNLOCK1(vp);
 	}
 }
 #endif
@@ -0,0 +1,199 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
+ */
+
+#ifndef _SYS_BRT_IMPL_H
+#define	_SYS_BRT_IMPL_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * BRT - Block Reference Table.
+ */
+#define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
+
+/*
+ * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
+ * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
+ * Each element in this array represents how many BRT entries do we have in this
+ * chunk of storage. We always load this entire array into memory and update as
+ * needed. By having it in memory we can quickly tell (during zio_free()) if
+ * there are any BRT entries that we might need to update.
+ *
+ * This value cannot be larger than 16MB, at least as long as we support
+ * 512 byte block sizes. With 512 byte block size we can have exactly
+ * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
+ * many for a 16bit counter.
+ */
+#define	BRT_RANGESIZE	(16 * 1024 * 1024)
+_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
+	"BRT_RANGESIZE is too large.");
+/*
+ * We don't want to update the whole structure every time. Maintain bitmap
+ * of dirty blocks within the regions, so that a single bit represents a
+ * block size of entcounts. For example if we have a 1PB vdev then all
+ * entcounts take 128MB of memory ((64TB / 16MB) * 2B). We can divide this
+ * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
+ * the whole 128MB on disk when we have updated only a single entcount.
+ * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
+ * is represented by a single bit. This gives us 4096 bits. A set bit in the
+ * bitmap means that we had a change in at least one of the 16384 entcounts
+ * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
+ */
+#define	BRT_BLOCKSIZE	(32 * 1024)
+#define	BRT_RANGESIZE_TO_NBLOCKS(size)					\
+	(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
+
+#define	BRT_LITTLE_ENDIAN	0
+#define	BRT_BIG_ENDIAN		1
+#ifdef _ZFS_LITTLE_ENDIAN
+#define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
+#define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
+#else
+#define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
+#define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
+#endif
+
+typedef struct brt_vdev_phys {
+	uint64_t	bvp_mos_entries;
+	uint64_t	bvp_size;
+	uint64_t	bvp_byteorder;
+	uint64_t	bvp_totalcount;
+	uint64_t	bvp_rangesize;
+	uint64_t	bvp_usedspace;
+	uint64_t	bvp_savedspace;
+} brt_vdev_phys_t;
+
+typedef struct brt_vdev {
+	/*
+	 * VDEV id.
+	 */
+	uint64_t	bv_vdevid;
+	/*
+	 * Is the structure initiated?
+	 * (bv_entcount and bv_bitmap are allocated?)
+	 */
+	boolean_t	bv_initiated;
+	/*
+	 * Object number in the MOS for the entcount array and brt_vdev_phys.
+	 */
+	uint64_t	bv_mos_brtvdev;
+	/*
+	 * Object number in the MOS for the entries table.
+	 */
+	uint64_t	bv_mos_entries;
+	/*
+	 * Entries to sync.
+	 */
+	avl_tree_t	bv_tree;
+	/*
+	 * Does the bv_entcount[] array needs byte swapping?
+	 */
+	boolean_t	bv_need_byteswap;
+	/*
+	 * Number of entries in the bv_entcount[] array.
+	 */
+	uint64_t	bv_size;
+	/*
+	 * This is the array with BRT entry count per BRT_RANGESIZE.
+	 */
+	uint16_t	*bv_entcount;
+	/*
+	 * Sum of all bv_entcount[]s.
+	 */
+	uint64_t	bv_totalcount;
+	/*
+	 * Space on disk occupied by cloned blocks (without compression).
+	 */
+	uint64_t	bv_usedspace;
+	/*
+	 * How much additional space would be occupied without block cloning.
+	 */
+	uint64_t	bv_savedspace;
+	/*
+	 * brt_vdev_phys needs updating on disk.
+	 */
+	boolean_t	bv_meta_dirty;
+	/*
+	 * bv_entcount[] needs updating on disk.
+	 */
+	boolean_t	bv_entcount_dirty;
+	/*
+	 * bv_entcount[] potentially can be a bit too big to sychronize it all
+	 * when we just changed few entcounts. The fields below allow us to
+	 * track updates to bv_entcount[] array since the last sync.
+	 * A single bit in the bv_bitmap represents as many entcounts as can
+	 * fit into a single BRT_BLOCKSIZE.
+	 * For example we have 65536 entcounts in the bv_entcount array
+	 * (so the whole array is 128kB). We updated bv_entcount[2] and
+	 * bv_entcount[5]. In that case only first bit in the bv_bitmap will
+	 * be set and we will write only first BRT_BLOCKSIZE out of 128kB.
+	 */
+	ulong_t		*bv_bitmap;
+	uint64_t	bv_nblocks;
+} brt_vdev_t;
+
+/*
+ * In-core brt
+ */
+typedef struct brt {
+	krwlock_t	brt_lock;
+	spa_t		*brt_spa;
+#define	brt_mos		brt_spa->spa_meta_objset
+	uint64_t	brt_rangesize;
+	uint64_t	brt_usedspace;
+	uint64_t	brt_savedspace;
+	avl_tree_t	brt_pending_tree[TXG_SIZE];
+	kmutex_t	brt_pending_lock[TXG_SIZE];
+	/* Sum of all entries across all bv_trees. */
+	uint64_t	brt_nentries;
+	brt_vdev_t	*brt_vdevs;
+	uint64_t	brt_nvdevs;
+} brt_t;
+
+/* Size of bre_offset / sizeof (uint64_t). */
+#define	BRT_KEY_WORDS	(1)
+
+/*
+ * In-core brt entry.
+ * On-disk we use bre_offset as the key and bre_refcount as the value.
+ */
+typedef struct brt_entry {
+	uint64_t	bre_offset;
+	uint64_t	bre_refcount;
+	avl_node_t	bre_node;
+} brt_entry_t;
+
+typedef struct brt_pending_entry {
+	blkptr_t	bpe_bp;
+	int		bpe_count;
+	avl_node_t	bpe_node;
+} brt_pending_entry_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_BRT_IMPL_H */
@@ -1072,8 +1072,7 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
 int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, struct blkptr *bps, size_t *nbpsp);
 int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset,
-    uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps,
-    boolean_t replay);
+    uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps);

 /*
 * Initial setup and final teardown.
@@ -14,7 +14,7 @@
 .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
 .\" Copyright (c) 2017 Intel Corporation.
 .\"
-.Dd June 27, 2023
+.Dd November 18, 2023
 .Dt ZDB 8
 .Os
 .
@@ -23,7 +23,7 @@
 .Nd display ZFS storage pool debugging and consistency information
 .Sh SYNOPSIS
 .Nm
-.Op Fl AbcdDFGhikLMNPsvXYy
+.Op Fl AbcdDFGhikLMNPsTvXYy
 .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns …
 .Op Fl I Ar inflight-I/O-ops
 .Oo Fl o Ar var Ns = Ns Ar value Oc Ns …
@@ -403,6 +403,13 @@ Display operation counts, bandwidth, and error counts of I/O to the pool from
 Simulate the effects of deduplication, constructing a DDT and then display
 that DDT as with
 .Fl DD .
+.It Fl T , -brt-stats
+Display block reference table (BRT) statistics, including the size of uniques
+blocks cloned, the space saving as a result of cloning, and the saving ratio.
+.It Fl TT
+Display the per-vdev BRT statistics, including total references.
+.It Fl TTT
+Dump the contents of the block reference tables.
 .It Fl u , -uberblock
 Display the current uberblock.
 .El
@@ -120,7 +120,7 @@ vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)

 int
 mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
-    char *fspec, int fsflags)
+    char *fspec, int fsflags, vfs_t *parent_vfsp)
 {
 	struct vfsconf *vfsp;
 	struct mount *mp;
@@ -220,6 +220,13 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
 	mp->mnt_opt = mp->mnt_optnew;
 	(void) VFS_STATFS(mp, &mp->mnt_stat);

+#ifdef VFS_SUPPORTS_EXJAIL_CLONE
+	/*
+	 * Clone the mnt_exjail credentials of the parent, as required.
+	 */
+	vfs_exjail_clone(parent_vfsp, mp);
+#endif
+
 	/*
 	 * Prevent external consumers of mount options from reading
 	 * mnt_optnew.
@@ -32,11 +32,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/kmem.h>
 #include <sys/kmem_cache.h>
 #include <sys/zmod.h>
-#if __FreeBSD_version >= 1300041
 #include <contrib/zlib/zlib.h>
-#else
-#include <sys/zlib.h>
-#endif
 #include <sys/kobj.h>


@@ -90,11 +86,7 @@ zlib_inflateInit(z_stream *stream)
 static int
 zlib_inflate(z_stream *stream, int finish)
 {
-#if __FreeBSD_version >= 1300024
 	return (inflate(stream, finish));
-#else
-	return (_zlib104_inflate(stream, finish));
-#endif
 }


@@ -46,6 +46,7 @@ knlist_sx_xunlock(void *arg)
 	sx_xunlock((struct sx *)arg);
 }

+#if __FreeBSD_version >= 1300128
 static void
 knlist_sx_assert_lock(void *arg, int what)
 {
@@ -55,11 +56,28 @@ knlist_sx_assert_lock(void *arg, int what)
 	else
 		sx_assert((struct sx *)arg, SX_UNLOCKED);
 }
+#else
+static void
+knlist_sx_assert_locked(void *arg)
+{
+	sx_assert((struct sx *)arg, SX_LOCKED);
+}
+static void
+knlist_sx_assert_unlocked(void *arg)
+{
+	sx_assert((struct sx *)arg, SX_UNLOCKED);
+}
+#endif

 void
 knlist_init_sx(struct knlist *knl, struct sx *lock)
 {

+#if __FreeBSD_version >= 1300128
 	knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock,
 	    knlist_sx_assert_lock);
+#else
+	knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock,
+	    knlist_sx_assert_locked, knlist_sx_assert_unlocked);
+#endif
 }
@@ -1026,7 +1026,8 @@ zfsctl_snapdir_lookup(struct vop_lookup_args *ap)
 	    "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
 	    dvp->v_vfsp->mnt_stat.f_mntonname, name);

-	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
+	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0,
+	    dvp->v_vfsp);
 	kmem_free(mountpoint, mountpoint_len);
 	if (err == 0) {
 		/*
@@ -6213,6 +6213,7 @@ zfs_deallocate(struct vop_deallocate_args *ap)
 }
 #endif

+#if __FreeBSD_version >= 1300039
 #ifndef _SYS_SYSPROTO_H_
 struct vop_copy_file_range_args {
 	struct vnode *a_invp;
@@ -6319,6 +6320,7 @@ bad_write_fallback:
 	    ap->a_incred, ap->a_outcred, ap->a_fsizetd);
 	return (error);
 }
+#endif

 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
@@ -6383,7 +6385,9 @@ struct vop_vector zfs_vnodeops = {
 #if __FreeBSD_version >= 1400043
 	.vop_add_writecount =	vop_stdadd_writecount_nomsync,
 #endif
+#if __FreeBSD_version >= 1300039
 	.vop_copy_file_range =	zfs_freebsd_copy_file_range,
+#endif
 };
 VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);

@@ -1364,6 +1364,19 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
 				vec++;
 				total_len += crypt_len;
 			}
+		} else if (txtype == TX_CLONE_RANGE) {
+			const size_t o = offsetof(lr_clone_range_t, lr_nbps);
+			crypt_len = o - sizeof (lr_t);
+			dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t);
+			dst_iovecs[vec].iov_len = crypt_len;
+
+			/* copy the bps now since they will not be encrypted */
+			memcpy(dlrp + o, slrp + o, lr_len - o);
+			memcpy(aadp, slrp + o, lr_len - o);
+			aadp += lr_len - o;
+			aad_len += lr_len - o;
+			vec++;
+			total_len += crypt_len;
 		} else {
 			crypt_len = lr_len - sizeof (lr_t);
 			dst_iovecs[vec].iov_base = (char *)dlrp +
@@ -1543,6 +1543,21 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
 				nr_iovecs++;
 				total_len += crypt_len;
 			}
+		} else if (txtype == TX_CLONE_RANGE) {
+			const size_t o = offsetof(lr_clone_range_t, lr_nbps);
+			crypt_len = o - sizeof (lr_t);
+			src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
+			src_iovecs[nr_iovecs].iov_len = crypt_len;
+			dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
+			dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+			/* copy the bps now since they will not be encrypted */
+			memcpy(dlrp + o, slrp + o, lr_len - o);
+			memcpy(aadp, slrp + o, lr_len - o);
+			aadp += lr_len - o;
+			aad_len += lr_len - o;
+			nr_iovecs++;
+			total_len += crypt_len;
 		} else {
 			crypt_len = lr_len - sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
@@ -28,6 +28,7 @@
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/brt.h>
+#include <sys/brt_impl.h>
 #include <sys/ddt.h>
 #include <sys/bitmap.h>
 #include <sys/zap.h>
@@ -234,178 +235,15 @@
 * destination dataset is mounted and its ZIL replayed.
 * To address this situation we leverage zil_claim() mechanism where ZFS will
 * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
- * entries, we will bump reference counters for their BPs in the BRT and then
- * on mount and ZIL replay we will just attach BPs to the file without
- * bumping reference counters.
- * Note it is still possible that after zil_claim() we never mount the
- * destination, so we never replay its ZIL and we destroy it. This way we would
- * end up with leaked references in BRT. We address that too as ZFS gives us
- * a chance to clean this up on dataset destroy (see zil_free_clone_range()).
+ * entries, we will bump reference counters for their BPs in the BRT.  Then
+ * on mount and ZIL replay we bump the reference counters once more, while the
+ * first references are dropped during ZIL destroy by zil_free_clone_range().
+ * It is possible that after zil_claim() we never mount the destination, so
+ * we never replay its ZIL and just destroy it.  In this case the only taken
+ * references will be dropped by zil_free_clone_range(), since the cloning is
+ * not going to ever take place.
 */

-/*
- * BRT - Block Reference Table.
- */
-#define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
-
-/*
- * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
- * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
- * Each element in this array represents how many BRT entries do we have in this
- * chunk of storage. We always load this entire array into memory and update as
- * needed. By having it in memory we can quickly tell (during zio_free()) if
- * there are any BRT entries that we might need to update.
- *
- * This value cannot be larger than 16MB, at least as long as we support
- * 512 byte block sizes. With 512 byte block size we can have exactly
- * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
- * many for a 16bit counter.
- */
-#define	BRT_RANGESIZE	(16 * 1024 * 1024)
-_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
-	"BRT_RANGESIZE is too large.");
-/*
- * We don't want to update the whole structure every time. Maintain bitmap
- * of dirty blocks within the regions, so that a single bit represents a
- * block size of entcounts. For example if we have a 1PB vdev then all
- * entcounts take 128MB of memory ((64TB / 16MB) * 2B). We can divide this
- * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
- * the whole 128MB on disk when we have updated only a single entcount.
- * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
- * is represented by a single bit. This gives us 4096 bits. A set bit in the
- * bitmap means that we had a change in at least one of the 16384 entcounts
- * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
- */
-#define	BRT_BLOCKSIZE	(32 * 1024)
-#define	BRT_RANGESIZE_TO_NBLOCKS(size)					\
-	(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
-
-#define	BRT_LITTLE_ENDIAN	0
-#define	BRT_BIG_ENDIAN		1
-#ifdef _ZFS_LITTLE_ENDIAN
-#define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
-#define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
-#else
-#define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
-#define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
-#endif
-
-typedef struct brt_vdev_phys {
-	uint64_t	bvp_mos_entries;
-	uint64_t	bvp_size;
-	uint64_t	bvp_byteorder;
-	uint64_t	bvp_totalcount;
-	uint64_t	bvp_rangesize;
-	uint64_t	bvp_usedspace;
-	uint64_t	bvp_savedspace;
-} brt_vdev_phys_t;
-
-typedef struct brt_vdev {
-	/*
-	 * VDEV id.
-	 */
-	uint64_t	bv_vdevid;
-	/*
-	 * Is the structure initiated?
-	 * (bv_entcount and bv_bitmap are allocated?)
-	 */
-	boolean_t	bv_initiated;
-	/*
-	 * Object number in the MOS for the entcount array and brt_vdev_phys.
-	 */
-	uint64_t	bv_mos_brtvdev;
-	/*
-	 * Object number in the MOS for the entries table.
-	 */
-	uint64_t	bv_mos_entries;
-	/*
-	 * Entries to sync.
-	 */
-	avl_tree_t	bv_tree;
-	/*
-	 * Does the bv_entcount[] array needs byte swapping?
-	 */
-	boolean_t	bv_need_byteswap;
-	/*
-	 * Number of entries in the bv_entcount[] array.
-	 */
-	uint64_t	bv_size;
-	/*
-	 * This is the array with BRT entry count per BRT_RANGESIZE.
-	 */
-	uint16_t	*bv_entcount;
-	/*
-	 * Sum of all bv_entcount[]s.
-	 */
-	uint64_t	bv_totalcount;
-	/*
-	 * Space on disk occupied by cloned blocks (without compression).
-	 */
-	uint64_t	bv_usedspace;
-	/*
-	 * How much additional space would be occupied without block cloning.
-	 */
-	uint64_t	bv_savedspace;
-	/*
-	 * brt_vdev_phys needs updating on disk.
-	 */
-	boolean_t	bv_meta_dirty;
-	/*
-	 * bv_entcount[] needs updating on disk.
-	 */
-	boolean_t	bv_entcount_dirty;
-	/*
-	 * bv_entcount[] potentially can be a bit too big to sychronize it all
-	 * when we just changed few entcounts. The fields below allow us to
-	 * track updates to bv_entcount[] array since the last sync.
-	 * A single bit in the bv_bitmap represents as many entcounts as can
-	 * fit into a single BRT_BLOCKSIZE.
-	 * For example we have 65536 entcounts in the bv_entcount array
-	 * (so the whole array is 128kB). We updated bv_entcount[2] and
-	 * bv_entcount[5]. In that case only first bit in the bv_bitmap will
-	 * be set and we will write only first BRT_BLOCKSIZE out of 128kB.
-	 */
-	ulong_t		*bv_bitmap;
-	uint64_t	bv_nblocks;
-} brt_vdev_t;
-
-/*
- * In-core brt
- */
-typedef struct brt {
-	krwlock_t	brt_lock;
-	spa_t		*brt_spa;
-#define	brt_mos		brt_spa->spa_meta_objset
-	uint64_t	brt_rangesize;
-	uint64_t	brt_usedspace;
-	uint64_t	brt_savedspace;
-	avl_tree_t	brt_pending_tree[TXG_SIZE];
-	kmutex_t	brt_pending_lock[TXG_SIZE];
-	/* Sum of all entries across all bv_trees. */
-	uint64_t	brt_nentries;
-	brt_vdev_t	*brt_vdevs;
-	uint64_t	brt_nvdevs;
-} brt_t;
-
-/* Size of bre_offset / sizeof (uint64_t). */
-#define	BRT_KEY_WORDS	(1)
-
-/*
- * In-core brt entry.
- * On-disk we use bre_offset as the key and bre_refcount as the value.
- */
-typedef struct brt_entry {
-	uint64_t	bre_offset;
-	uint64_t	bre_refcount;
-	avl_node_t	bre_node;
-} brt_entry_t;
-
-typedef struct brt_pending_entry {
-	blkptr_t	bpe_bp;
-	int		bpe_count;
-	avl_node_t	bpe_node;
-} brt_pending_entry_t;
-
 static kmem_cache_t *brt_entry_cache;
 static kmem_cache_t *brt_pending_entry_cache;

@@ -2700,15 +2700,23 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
 	 * writes and clones into this block.
 	 */
 	mutex_enter(&db->db_mtx);
+	DBUF_VERIFY(db);
 	VERIFY(!dbuf_undirty(db, tx));
 	ASSERT3P(dbuf_find_dirty_eq(db, tx->tx_txg), ==, NULL);
 	if (db->db_buf != NULL) {
 		arc_buf_destroy(db->db_buf, db);
 		db->db_buf = NULL;
+		dbuf_clear_data(db);
 	}
+
+	db->db_state = DB_NOFILL;
+	DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
+
+	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);

-	dmu_buf_will_not_fill(db_fake, tx);
+	dbuf_noread(db);
+	(void) dbuf_dirty(db, tx);
 }

 void
@@ -2267,7 +2267,7 @@ out:

 int
 dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
-    dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay)
+    dmu_tx_t *tx, const blkptr_t *bps, size_t nbps)
 {
 	spa_t *spa;
 	dmu_buf_t **dbp, *dbuf;
@@ -2341,10 +2341,8 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 		 * When data in embedded into BP there is no need to create
 		 * BRT entry as there is no data block. Just copy the BP as
 		 * it contains the data.
-		 * Also, when replaying ZIL we don't want to bump references
-		 * in the BRT as it was already done during ZIL claim.
 		 */
-		if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 			brt_pending_add(spa, bp, tx);
 		}
 	}
@@ -1764,7 +1764,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
 }

 /*
- * Checks if the dnode contains any uncommitted dirty records.
+ * Checks if the dnode itself is dirty, or is carrying any uncommitted records.
+ * It is important to check both conditions, as some operations (eg appending
+ * to a file) can dirty both as a single logical unit, but they are not synced
+ * out atomically, so checking one and not the other can result in an object
+ * appearing to be clean mid-way through a commit.
+ *
+ * Do not change this lightly! If you get it wrong, dmu_offset_next() can
+ * detect a hole where there is really data, leading to silent corruption.
 */
 boolean_t
 dnode_is_dirty(dnode_t *dn)
@@ -1772,7 +1779,8 @@ dnode_is_dirty(dnode_t *dn)
 	mutex_enter(&dn->dn_mtx);

 	for (int i = 0; i < TXG_SIZE; i++) {
-		if (multilist_link_active(&dn->dn_dirty_link[i])) {
+		if (multilist_link_active(&dn->dn_dirty_link[i]) ||
+		    !list_is_empty(&dn->dn_dirty_records[i])) {
 			mutex_exit(&dn->dn_mtx);
 			return (B_TRUE);
 		}
@@ -1333,7 +1333,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
 		}

 		error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
-		    bps, nbps, B_FALSE);
+		    bps, nbps);
 		if (error != 0) {
 			dmu_tx_commit(tx);
 			break;
@@ -1467,7 +1467,7 @@ zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
 	if (zp->z_blksz < blksz)
 		zfs_grow_blocksize(zp, blksz, tx);

-	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE);
+	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps);

 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

@@ -158,22 +158,23 @@ zio_init(void)
 	zio_link_cache = kmem_cache_create("zio_link_cache",
 	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

+	/*
+	 * For small buffers, we want a cache for each multiple of
+	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
+	 * for each quarter-power of 2.
+	 */
 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
-		size_t align, cflags, data_cflags;
-		char name[32];
-
-		/*
-		 * Create cache for each half-power of 2 size, starting from
-		 * SPA_MINBLOCKSIZE.  It should give us memory space efficiency
-		 * of ~7/8, sufficient for transient allocations mostly using
-		 * these caches.
-		 */
 		size_t p2 = size;
+		size_t align = 0;
+		size_t data_cflags, cflags;
+
+		data_cflags = KMC_NODEBUG;
+		cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
+		    KMC_NODEBUG : 0;
+
 		while (!ISP2(p2))
 			p2 &= p2 - 1;
-		if (!IS_P2ALIGNED(size, p2 / 2))
-			continue;

 #ifndef _KERNEL
 		/*
@@ -184,37 +185,47 @@ zio_init(void)
 		 */
 		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 			continue;
+		/*
+		 * Here's the problem - on 4K native devices in userland on
+		 * Linux using O_DIRECT, buffers must be 4K aligned or I/O
+		 * will fail with EINVAL, causing zdb (and others) to coredump.
+		 * Since userland probably doesn't need optimized buffer caches,
+		 * we just force 4K alignment on everything.
+		 */
+		align = 8 * SPA_MINBLOCKSIZE;
+#else
+		if (size < PAGESIZE) {
+			align = SPA_MINBLOCKSIZE;
+		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
+			align = PAGESIZE;
+		}
 #endif

-		if (IS_P2ALIGNED(size, PAGESIZE))
-			align = PAGESIZE;
-		else
-			align = 1 << (highbit64(size ^ (size - 1)) - 1);
+		if (align != 0) {
+			char name[36];
+			if (cflags == data_cflags) {
+				/*
+				 * Resulting kmem caches would be identical.
+				 * Save memory by creating only one.
+				 */
+				(void) snprintf(name, sizeof (name),
+				    "zio_buf_comb_%lu", (ulong_t)size);
+				zio_buf_cache[c] = kmem_cache_create(name,
+				    size, align, NULL, NULL, NULL, NULL, NULL,
+				    cflags);
+				zio_data_buf_cache[c] = zio_buf_cache[c];
+				continue;
+			}
+			(void) snprintf(name, sizeof (name), "zio_buf_%lu",
+			    (ulong_t)size);
+			zio_buf_cache[c] = kmem_cache_create(name, size,
+			    align, NULL, NULL, NULL, NULL, NULL, cflags);

-		cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
-		    KMC_NODEBUG : 0;
-		data_cflags = KMC_NODEBUG;
-		if (cflags == data_cflags) {
-			/*
-			 * Resulting kmem caches would be identical.
-			 * Save memory by creating only one.
-			 */
-			(void) snprintf(name, sizeof (name),
-			    "zio_buf_comb_%lu", (ulong_t)size);
-			zio_buf_cache[c] = kmem_cache_create(name, size, align,
-			    NULL, NULL, NULL, NULL, NULL, cflags);
-			zio_data_buf_cache[c] = zio_buf_cache[c];
-			continue;
+			(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
+			    (ulong_t)size);
+			zio_data_buf_cache[c] = kmem_cache_create(name, size,
+			    align, NULL, NULL, NULL, NULL, NULL, data_cflags);
 		}
-		(void) snprintf(name, sizeof (name), "zio_buf_%lu",
-		    (ulong_t)size);
-		zio_buf_cache[c] = kmem_cache_create(name, size, align,
-		    NULL, NULL, NULL, NULL, NULL, cflags);
-
-		(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
-		    (ulong_t)size);
-		zio_data_buf_cache[c] = kmem_cache_create(name, size, align,
-		    NULL, NULL, NULL, NULL, NULL, data_cflags);
 	}

 	while (--c != 0) {
@@ -103,6 +103,7 @@ if [ -d ${dkms_root}/%{module} ]; then
            fi
        fi
    done
+    cd ${dkms_root}
 fi

 # Uninstall this version of zfs dkms modules before installation of the package.
@@ -58,7 +58,7 @@ set -A args "create" "add" "destroy" "import fakepool" \
    "setvprop" "blah blah" "-%" "--?" "-*" "-=" \
    "-a" "-f" "-g" "-j" "-n" "-o" "-p" "-p /tmp" \
    "-t" "-w" "-z" "-E" "-H" "-I" "-J" \
-    "-Q" "-R" "-T" "-W"
+    "-Q" "-R" "-W"

 log_assert "Execute zdb using invalid parameters."

@@ -123,7 +123,10 @@ if not httpd:
 with open('$HTTPS_PORT_FILE', 'w') as portf:
 	print(port, file=portf)

-httpd.socket = ssl.wrap_socket(httpd.socket, server_side=True, keyfile='/$TESTPOOL/snakeoil.key', certfile='$SSL_CA_CERT_FILE', ssl_version=ssl.PROTOCOL_TLS)
+sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+sslctx.check_hostname = False
+sslctx.load_cert_chain(certfile='$SSL_CA_CERT_FILE', keyfile='/$TESTPOOL/snakeoil.key')
+httpd.socket = httpd.socket = sslctx.wrap_socket(httpd.socket, server_side=True)

 os.chdir('$STF_SUITE/tests/functional/cli_root/zfs_load-key')
Author	SHA1	Message	Date
Tony Hutter	494aaaed89	Tag zfs-2.2.2 META file and changelog updated. Signed-off-by: Tony Hutter <hutter2@llnl.gov>	2023-11-29 14:08:46 -08:00
rmacklem	522414da3b	FreeBSD: Fix ZFS so that snapshots under .zfs/snapshot are NFS visible Call vfs_exjail_clone() for mounts created under .zfs/snapshot to fill in the mnt_exjail field for the mount. If this is not done, the snapshots under .zfs/snapshot with not be accessible over NFS. This version has the argument name in vfs.h fixed to match that of the name in spl_vfs.c, although it really does not matter. External-issue: https://reviews.freebsd.org/D42672 Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Rick Macklem <rmacklem@uoguelph.ca> Closes #15563	2023-11-29 14:08:46 -08:00
Alexander Motin	a8c256046b	ZIL: Call brt_pending_add() replaying TX_CLONE_RANGE zil_claim_clone_range() takes references on cloned blocks before ZIL replay. Later zil_free_clone_range() drops them after replay or on dataset destroy. The total balance is neutral. It means on actual replay we must take additional references, which would stay in BRT. Without this blocks could be freed prematurely when either original file or its clone are destroyed. I've observed BRT being emptied and the feature being deactivated after ZIL replay completion, which should not have happened. With the patch I see expected stats. Reviewed-by: Kay Pedersen <mail@mkwg.de> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Rob Norris <robn@despairlabs.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #15603	2023-11-29 13:08:25 -08:00
Martin Matuška	eb34de04d7	zdb: fix printf() length for uint64_t devid Bug introduced in `213d682967`. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Warner Losh <imp@FreeBSD.org> Signed-off-by: Martin Matuska <mm@FreeBSD.org> Closes #15606	2023-11-29 13:08:25 -08:00
Jaron Kent-Dobias	d813aa8530	Linux 6.6 compat: fix configure error with clang (#15558 ) With Linux v6.6.x and clang 16, a configure step fails on a warning that later results in an error while building, due to 'ts' being uninitialized. Add a trivial initialization to silence the warning. Signed-off-by: Jaron Kent-Dobias <jaron@kent-dobias.com> Reviewed-by: Tony Hutter <hutter2@llnl.gov>	2023-11-28 15:19:07 -08:00
AllKind	3b267e72de	zfs-dkms: fix shell-init error message If all zfs dkms modules have been removed, a shell-init error message may appear, because /var/lib/dkms/zfs does no longer exist. Resolve this by leaving the directory earlier on. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Mart Frauenlob <AllKind@fastest.cc> Closes #15576	2023-11-28 15:19:07 -08:00
Alan Somers	349fb77f11	FreeBSD: Fix the build on FreeBSD 12 It was broken for several reasons: * VOP_UNLOCK lost an argument in 13.0. So OpenZFS should be using VOP_UNLOCK1, but a few direct calls to VOP_UNLOCK snuck in. * The location of the zlib header moved in 13.0 and 12.1. We can drop support for building on 12.0, which is EoL. * knlist_init lost an argument in 13.0. OpenZFS change `9d0887402b` assumed 13.0 or later. * FreeBSD 13.0 added copy_file_range, and OpenZFS change `67a1b03791` assumed 13.0 or later. Sponsored-by: Axcient Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Alan Somers <asomers@gmail.com> Closes #15551	2023-11-28 15:19:07 -08:00
Rob N	2a953e0ac9	dmu_buf_will_clone: fix race in transition back to NOFILL Previously, dmu_buf_will_clone() would roll back any dirty record, but would not clean out the modified data nor reset the state before releasing the lock. That leaves the last-written data in db_data, but the dbuf in the wrong state. This is eventually corrected when the dbuf state is made NOFILL, and dbuf_noread() called (which clears out the old data), but at this point its too late, because the lock was already dropped with that invalid state. Any caller acquiring the lock before the call into dmu_buf_will_not_fill() can find what appears to be a clean, readable buffer, and would take the wrong state from it: it should be getting the data from the cloned block, not from earlier (unwritten) dirty data. Even after the state was switched to NOFILL, the old data was still not cleaned out until dbuf_noread(), which is another gap for a caller to take the lock and read the wrong data. This commit fixes all this by properly cleaning up the previous state and then setting the new state before dropping the lock. The DBUF_VERIFY() calls confirm that the dbuf is in a valid state when the lock is down. Sponsored-by: Klara, Inc. Sponsored-By: OpenDrives Inc. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Pawel Jakub Dawidek <pawel@dawidek.net> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Closes #15566 Closes #15526	2023-11-28 12:59:00 -08:00
Akash B	e4985bf5a1	zdb: Fix zdb '-O\|-r' options with -e/exported zpool zdb with '-e' or exported zpool doesn't work along with '-O' and '-r' options as we process them before '-e' has been processed. Below errors are seen: ~> zdb -e pool-mds65/mdt65 -O oi.9/0x200000009:0x0:0x0 failed to hold dataset 'pool-mds65/mdt65': No such file or directory ~> zdb -e pool-oss0/ost0 -r file1 /tmp/filecopy1 -p. failed to hold dataset 'pool-oss0/ost0': No such file or directory zdb: internal error: No such file or directory We need to make sure to process '-O\|-r' options after the '-e' option has been processed, which imports the pool to the namespace if it's not in the cachefile. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Akash B <akash-b@hpe.com> Closes #15532	2023-11-28 12:56:43 -08:00
Rob Norris	e96675a7b1	zdb: show BRT statistics and dump its contents Same idea as the dedup stats, but for block cloning. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Kay Pedersen <mail@mkwg.de> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Rob Norris <robn@despairlabs.com> Closes #15541	2023-11-28 12:56:43 -08:00
Rob Norris	d702f86eaf	brt: lift internal definitions into _impl header So that zdb (and others!) can get at the BRT on-disk structures. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Kay Pedersen <mail@mkwg.de> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Rob Norris <robn@despairlabs.com> Closes #15541	2023-11-28 12:56:43 -08:00
Tony Hutter	41c4599cba	ZTS: Fix zfs_load-key failures on F39 The zfs_load-key tests were failing on F39 due to their use of the deprecated ssl.wrap_socket function. This commit updates the test to instead use ssl.SSLContext() as described in https://stackoverflow.com/a/65194957. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Closes #15534 Closes #15550	2023-11-28 12:56:09 -08:00
Alexander Motin	56a2a0981e	ZIL: Do not encrypt block pointers in lr_clone_range_t In case of crash cloned blocks need to be claimed on pool import. It is only possible if they (lr_bps) and their count (lr_nbps) are not encrypted but only authenticated, similar to block pointer in lr_write_t. Few other fields can be and are still encrypted. This should fix panic on ZIL claim after crash when block cloning is actively used. Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Reviewed-by: Tom Caputi <caputit1@tcnj.edu> Reviewed-by: Sean Eric Fagan <sef@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Edmund Nadolski <edmund.nadolski@ixsystems.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #15543 Closes #15513	2023-11-28 11:17:52 -08:00
Rob N	9b9b09f452	dnode_is_dirty: check dnode and its data for dirtiness Over its history this the dirty dnode test has been changed between checking for a dnodes being on `os_dirty_dnodes` (`dn_dirty_link`) and `dn_dirty_record`. `de198f2d9` Fix lseek(SEEK_DATA/SEEK_HOLE) mmap consistency `2531ce372` Revert "Report holes when there are only metadata changes" `ec4f9b8f3` Report holes when there are only metadata changes `454365bba` Fix dirty check in dmu_offset_next() `66aca2473` SEEK_HOLE should not block on txg_wait_synced() Also illumos/illumos-gate@c543ec060d illumos/illumos-gate@2bcf0248e9 It turns out both are actually required. In the case of appending data to a newly created file, the dnode proper is dirtied (at least to change the blocksize) and dirty records are added. Thus, a single logical operation is represented by separate dirty indicators, and must not be separated. The incorrect dirty check becomes a problem when the first block of a file is being appended to while another process is calling lseek to skip holes. There is a small window where the dnode part is undirtied while there are still dirty records. In this case, `lseek(fd, 0, SEEK_DATA)` would not know that the file is dirty, and would go to `dnode_next_offset()`. Since the object has no data blocks yet, it returns `ESRCH`, indicating no data found, which results in `ENXIO` being returned to `lseek()`'s caller. Since coreutils 9.2, `cp` performs sparse copies by default, that is, it uses `SEEK_DATA` and `SEEK_HOLE` against the source file and attempts to replicate the holes in the target. When it hits the bug, its initial search for data fails, and it goes on to call `fallocate()` to create a hole over the entire destination file. This has come up more recently as users upgrade their systems, getting OpenZFS 2.2 as well as a newer coreutils. However, this problem has been reproduced against 2.1, as well as on FreeBSD 13 and 14. This change simply updates the dirty check to check both types of dirty. If there's anything dirty at all, we immediately go to the "wait for sync" stage, It doesn't really matter after that; both changes are on disk, so the dirty fields should be correct. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Rich Ercolani <rincebrain@gmail.com> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Closes #15571 Closes #15526	2023-11-28 09:15:48 -08:00
Brian Behlendorf	89fcb8c6f9	Revert "Tune zio buffer caches and their alignments" This reverts commit `bd7a02c251` which can trigger an unlikely existing bio alignment issue on Linux. This change is good, but the underlying issue it exposes needs to be resolved before this can be re-applied. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #15533	2023-11-28 09:03:58 -08:00