Tag zfs-2.0.6

META file and changelog updated. Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Linux 5.15 compat: get_acl()
2026-05-23 10:54:35 +03:00 · 2021-09-22 15:19:08 -07:00 · 2021-09-22 15:19:08 -07:00 · 2021-09-22 15:19:08 -07:00 · 2021-09-22 15:19:08 -07:00 · 2021-09-22 15:19:08 -07:00
43 changed files with 839 additions and 295 deletions
@@ -1,10 +1,10 @@
 Meta:          1
 Name:          zfs
 Branch:        1.0
-Version:       2.0.5
+Version:       2.0.6
 Release:       1
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS
-Linux-Maximum: 5.12
+Linux-Maximum: 5.14
 Linux-Minimum: 3.10
@@ -161,12 +161,6 @@ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx);

 typedef struct sublivelist_verify {
-	/* all ALLOC'd blkptr_t in one sub-livelist */
-	zfs_btree_t sv_all_allocs;
-
-	/* all FREE'd blkptr_t in one sub-livelist */
-	zfs_btree_t sv_all_frees;
-
 	/* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
 	zfs_btree_t sv_pair;

@@ -225,29 +219,68 @@ typedef struct sublivelist_verify_block {

 static void zdb_print_blkptr(const blkptr_t *bp, int flags);

+typedef struct sublivelist_verify_block_refcnt {
+	/* block pointer entry in livelist being verified */
+	blkptr_t svbr_blk;
+
+	/*
+	 * Refcount gets incremented to 1 when we encounter the first
+	 * FREE entry for the svfbr block pointer and a node for it
+	 * is created in our ZDB verification/tracking metadata.
+	 *
+	 * As we encounter more FREE entries we increment this counter
+	 * and similarly decrement it whenever we find the respective
+	 * ALLOC entries for this block.
+	 *
+	 * When the refcount gets to 0 it means that all the FREE and
+	 * ALLOC entries of this block have paired up and we no longer
+	 * need to track it in our verification logic (e.g. the node
+	 * containing this struct in our verification data structure
+	 * should be freed).
+	 *
+	 * [refer to sublivelist_verify_blkptr() for the actual code]
+	 */
+	uint32_t svbr_refcnt;
+} sublivelist_verify_block_refcnt_t;
+
+static int
+sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
+{
+	const sublivelist_verify_block_refcnt_t *l = larg;
+	const sublivelist_verify_block_refcnt_t *r = rarg;
+	return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
+}
+
 static int
 sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx)
 {
 	ASSERT3P(tx, ==, NULL);
 	struct sublivelist_verify *sv = arg;
-	char blkbuf[BP_SPRINTF_LEN];
+	sublivelist_verify_block_refcnt_t current = {
+			.svbr_blk = *bp,
+
+			/*
+			 * Start with 1 in case this is the first free entry.
+			 * This field is not used for our B-Tree comparisons
+			 * anyway.
+			 */
+			.svbr_refcnt = 1,
+	};
+
 	zfs_btree_index_t where;
+	sublivelist_verify_block_refcnt_t *pair =
+	    zfs_btree_find(&sv->sv_pair, &current, &where);
 	if (free) {
-		zfs_btree_add(&sv->sv_pair, bp);
-		/* Check if the FREE is a duplicate */
-		if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) {
-			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
-			    free);
-			(void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf);
+		if (pair == NULL) {
+			/* first free entry for this block pointer */
+			zfs_btree_add(&sv->sv_pair, &current);
 		} else {
-			zfs_btree_add_idx(&sv->sv_all_frees, bp, &where);
+			pair->svbr_refcnt++;
 		}
 	} else {
-		/* Check if the ALLOC has been freed */
-		if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) {
-			zfs_btree_remove_idx(&sv->sv_pair, &where);
-		} else {
+		if (pair == NULL) {
+			/* block that is currently marked as allocated */
 			for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
 				if (DVA_IS_EMPTY(&bp->blk_dva[i]))
 					break;
@@ -262,16 +295,16 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
 					    &svb, &where);
 				}
 			}
-		}
-		/* Check if the ALLOC is a duplicate */
-		if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) {
-			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
-			    free);
-			(void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf);
 		} else {
-			zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where);
+			/* alloc matches a free entry */
+			pair->svbr_refcnt--;
+			if (pair->svbr_refcnt == 0) {
+				/* all allocs and frees have been matched */
+				zfs_btree_remove_idx(&sv->sv_pair, &where);
+			}
 		}
 	}
+
 	return (0);
 }

@@ -279,32 +312,22 @@ static int
 sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
 {
 	int err;
-	char blkbuf[BP_SPRINTF_LEN];
 	struct sublivelist_verify *sv = args;

-	zfs_btree_create(&sv->sv_all_allocs, livelist_compare,
-	    sizeof (blkptr_t));
-
-	zfs_btree_create(&sv->sv_all_frees, livelist_compare,
-	    sizeof (blkptr_t));
-
-	zfs_btree_create(&sv->sv_pair, livelist_compare,
-	    sizeof (blkptr_t));
+	zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare,
+	    sizeof (sublivelist_verify_block_refcnt_t));

 	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
 	    sv, NULL);

-	zfs_btree_clear(&sv->sv_all_allocs);
-	zfs_btree_destroy(&sv->sv_all_allocs);
-
-	zfs_btree_clear(&sv->sv_all_frees);
-	zfs_btree_destroy(&sv->sv_all_frees);
-
-	blkptr_t *e;
+	sublivelist_verify_block_refcnt_t *e;
 	zfs_btree_index_t *cookie = NULL;
 	while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
-		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE);
-		(void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf);
+		char blkbuf[BP_SPRINTF_LEN];
+		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
+		    &e->svbr_blk, B_TRUE);
+		(void) printf("\tERROR: %d unmatched FREE(s): %s\n",
+		    e->svbr_refcnt, blkbuf);
 	}
 	zfs_btree_destroy(&sv->sv_pair);

@@ -613,10 +636,14 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
 /*
 * [Livelist Check]
 * Iterate through all the sublivelists and:
- * - report leftover frees
- * - report double ALLOCs/FREEs
+ * - report leftover frees (**)
 * - record leftover ALLOCs together with their TXG [see Cross Check]
 *
+ * (**) Note: Double ALLOCs are valid in datasets that have dedup
+ *      enabled. Similarly double FREEs are allowed as well but
+ *      only if they pair up with a corresponding ALLOC entry once
+ *      we our done with our sublivelist iteration.
+ *
 * [Spacemap Check]
 * for each metaslab:
 * - iterate over spacemap and then the metaslab's entries in the
@@ -162,6 +162,9 @@ dnl #
 dnl # 3.1 API change,
 dnl # Check if inode_operations contains the function get_acl
 dnl #
+dnl # 5.15 API change,
+dnl # Added the bool rcu argument to get_acl for rcu path walk.
+dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_GET_ACL], [
 	ZFS_LINUX_TEST_SRC([inode_operations_get_acl], [
 		#include <linux/fs.h>
@@ -174,14 +177,32 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_GET_ACL], [
 			.get_acl = get_acl_fn,
 		};
 	],[])
+
+	ZFS_LINUX_TEST_SRC([inode_operations_get_acl_rcu], [
+		#include <linux/fs.h>
+
+		struct posix_acl *get_acl_fn(struct inode *inode, int type,
+		    bool rcu) { return NULL; }
+
+		static const struct inode_operations
+		    iops __attribute__ ((unused)) = {
+			.get_acl = get_acl_fn,
+		};
+	],[])
 ])

 AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL], [
 	AC_MSG_CHECKING([whether iops->get_acl() exists])
 	ZFS_LINUX_TEST_RESULT([inode_operations_get_acl], [
 		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_GET_ACL, 1, [iops->get_acl() exists])
 	],[
-		ZFS_LINUX_TEST_ERROR([iops->get_acl()])
+		ZFS_LINUX_TEST_RESULT([inode_operations_get_acl_rcu], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_GET_ACL_RCU, 1, [iops->get_acl() takes rcu])
+		],[
+			ZFS_LINUX_TEST_ERROR([iops->get_acl()])
+		])
 	])
 ])

@@ -48,7 +48,45 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_BDI], [
 ])

 dnl #
-dnl # 2.6.32 - 4.x API,
+dnl # 5.9: added blk_queue_update_readahead(),
+dnl # 5.15: renamed to disk_update_readahead()
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD], [
+	ZFS_LINUX_TEST_SRC([blk_queue_update_readahead], [
+		#include <linux/blkdev.h>
+	],[
+		struct request_queue q;
+		blk_queue_update_readahead(&q);
+	])
+
+	ZFS_LINUX_TEST_SRC([disk_update_readahead], [
+		#include <linux/blkdev.h>
+	],[
+		struct gendisk disk;
+		disk_update_readahead(&disk);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD], [
+	AC_MSG_CHECKING([whether blk_queue_update_readahead() exists])
+	ZFS_LINUX_TEST_RESULT([blk_queue_update_readahead], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLK_QUEUE_UPDATE_READAHEAD, 1,
+		    [blk_queue_update_readahead() exists])
+	],[
+		AC_MSG_CHECKING([whether disk_update_readahead() exists])
+		ZFS_LINUX_TEST_RESULT([disk_update_readahead], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_DISK_UPDATE_READAHEAD, 1,
+			    [disk_update_readahead() exists])
+		],[
+			AC_MSG_RESULT(no)
+		])
+	])
+])
+
+dnl #
+dnl # 2.6.32 API,
 dnl #   blk_queue_discard()
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD], [
@@ -280,6 +318,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI
+	ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_SET
@@ -292,6 +331,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
 	ZFS_AC_KERNEL_BLK_QUEUE_PLUG
 	ZFS_AC_KERNEL_BLK_QUEUE_BDI
+	ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD
 	ZFS_AC_KERNEL_BLK_QUEUE_DISCARD
 	ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE
 	ZFS_AC_KERNEL_BLK_QUEUE_FLAG_SET
@@ -42,6 +42,13 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [
 		struct block_device_operations o;
 		o.submit_bio = NULL;
 	])
+
+	ZFS_LINUX_TEST_SRC([blk_alloc_disk], [
+		#include <linux/blkdev.h>
+	],[
+		struct gendisk *disk  __attribute__ ((unused));
+		disk = blk_alloc_disk(NUMA_NO_NODE);
+	])
 ])

 AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
@@ -56,6 +63,19 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [

 		AC_DEFINE(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS, 1,
 		    [submit_bio is member of struct block_device_operations])
+
+		dnl #
+		dnl # Linux 5.14 API Change:
+		dnl # blk_alloc_queue() + alloc_disk() combo replaced by
+		dnl # a single call to blk_alloc_disk().
+		dnl #
+		AC_MSG_CHECKING([whether blk_alloc_disk() exists])
+		ZFS_LINUX_TEST_RESULT([blk_alloc_disk], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE([HAVE_BLK_ALLOC_DISK], 1, [blk_alloc_disk() exists])
+		], [
+			AC_MSG_RESULT(no)
+		])
 	],[
 		AC_MSG_RESULT(no)

@@ -0,0 +1,21 @@
+dnl #
+dnl # 4.20 API change
+dnl # Added kernel_siginfo_t
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SIGINFO], [
+	ZFS_LINUX_TEST_SRC([siginfo], [
+		#include <linux/signal_types.h>
+	],[
+		kernel_siginfo_t info __attribute__ ((unused));
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SIGINFO], [
+	AC_MSG_CHECKING([whether kernel_siginfo_t tyepedef exists])
+	ZFS_LINUX_TEST_RESULT([siginfo], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_SIGINFO, 1, [kernel_siginfo_t exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
@@ -0,0 +1,21 @@
+dnl #
+dnl # 4.4 API change
+dnl # Added kernel_signal_stop
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SIGNAL_STOP], [
+	ZFS_LINUX_TEST_SRC([signal_stop], [
+		#include <linux/sched/signal.h>
+	],[
+		kernel_signal_stop();
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SIGNAL_STOP], [
+	AC_MSG_CHECKING([whether signal_stop() exists])
+	ZFS_LINUX_TEST_RESULT([signal_stop], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_SIGNAL_STOP, 1, [signal_stop() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
@@ -0,0 +1,21 @@
+dnl #
+dnl # 4.17 API change
+dnl # Added set_special_state() function
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SET_SPECIAL_STATE], [
+	ZFS_LINUX_TEST_SRC([set_special_state], [
+		#include <linux/sched.h>
+	],[
+		set_special_state(TASK_STOPPED);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SET_SPECIAL_STATE], [
+	AC_MSG_CHECKING([whether set_special_state() exists])
+	ZFS_LINUX_TEST_RESULT([set_special_state], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_SET_SPECIAL_STATE, 1, [set_special_state() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
@@ -0,0 +1,32 @@
+dnl #
+dnl # Linux 5.15 gets rid of -isystem and external <stdarg.h> inclusion
+dnl # and ships its own <linux/stdarg.h>. Check if this header file does
+dnl # exist and provide all necessary definitions for variable argument
+dnl # functions. Adjust the inclusion of <stdarg.h> according to the
+dnl # results.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG], [
+	ZFS_LINUX_TEST_SRC([has_standalone_linux_stdarg], [
+		#include <linux/stdarg.h>
+
+		#if !defined(va_start) || !defined(va_end) || \
+		    !defined(va_arg) || !defined(va_copy)
+		#error "<linux/stdarg.h> is invalid"
+		#endif
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG], [
+	dnl #
+	dnl # Linux 5.15 ships its own stdarg.h and doesn't allow to
+	dnl # include compiler headers.
+	dnl #
+	AC_MSG_CHECKING([whether standalone <linux/stdarg.h> exists])
+	ZFS_LINUX_TEST_RESULT([has_standalone_linux_stdarg], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_STANDALONE_LINUX_STDARG, 1,
+			[standalone <linux/stdarg.h> exists])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
@@ -0,0 +1,34 @@
+dnl #
+dnl # Linux 5.14 adds a change to require set_page_dirty to be manually
+dnl # wired up in struct address_space_operations. Determine if this needs
+dnl # to be done. This patch set also introduced __set_page_dirty_nobuffers
+dnl # declaration in linux/pagemap.h, so these tests look for the presence
+dnl # of that function to tell the compiler to assign set_page_dirty in
+dnl # module/os/linux/zfs/zpl_file.c
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS], [
+	ZFS_LINUX_TEST_SRC([vfs_has_set_page_dirty_nobuffers], [
+		#include <linux/pagemap.h>
+		#include <linux/fs.h>
+
+		static const struct address_space_operations
+		    aops __attribute__ ((unused)) = {
+			.set_page_dirty = __set_page_dirty_nobuffers,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS], [
+	dnl #
+	dnl # Linux 5.14 change requires set_page_dirty() to be assigned
+	dnl # in address_space_operations()
+	dnl #
+	AC_MSG_CHECKING([__set_page_dirty_nobuffers exists])
+	ZFS_LINUX_TEST_RESULT([vfs_has_set_page_dirty_nobuffers], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS, 1,
+			[__set_page_dirty_nobuffers exists])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
@@ -128,6 +128,11 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_MKNOD
 	ZFS_AC_KERNEL_SRC_SYMLINK
 	ZFS_AC_KERNEL_SRC_BIO_MAX_SEGS
+	ZFS_AC_KERNEL_SRC_SIGNAL_STOP
+	ZFS_AC_KERNEL_SRC_SIGINFO
+	ZFS_AC_KERNEL_SRC_SET_SPECIAL_STATE
+	ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS
+	ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG

 	AC_MSG_CHECKING([for available kernel interfaces])
 	ZFS_LINUX_TEST_COMPILE_ALL([kabi])
@@ -229,6 +234,11 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_MKNOD
 	ZFS_AC_KERNEL_SYMLINK
 	ZFS_AC_KERNEL_BIO_MAX_SEGS
+	ZFS_AC_KERNEL_SIGNAL_STOP
+	ZFS_AC_KERNEL_SIGINFO
+	ZFS_AC_KERNEL_SET_SPECIAL_STATE
+	ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS
+	ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG
 ])

 dnl #
@@ -92,11 +92,14 @@ blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua)
 static inline void
 blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages)
 {
+#if !defined(HAVE_BLK_QUEUE_UPDATE_READAHEAD) && \
+	!defined(HAVE_DISK_UPDATE_READAHEAD)
 #ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC
 	q->backing_dev_info->ra_pages = ra_pages;
 #else
 	q->backing_dev_info.ra_pages = ra_pages;
 #endif
+#endif
 }

 #ifdef HAVE_BIO_BVEC_ITER
@@ -24,7 +24,11 @@
 #ifndef _SPL_CMN_ERR_H
 #define	_SPL_CMN_ERR_H

+#if defined(_KERNEL) && defined(HAVE_STANDALONE_LINUX_STDARG)
+#include <linux/stdarg.h>
+#else
 #include <stdarg.h>
+#endif

 #define	CE_CONT		0 /* continuation */
 #define	CE_NOTE		1 /* notice */
@@ -33,22 +33,6 @@
 #define	FORREAL		0	/* Usual side-effects */
 #define	JUSTLOOKING	1	/* Don't stop the process */

-/*
- * The "why" argument indicates the allowable side-effects of the call:
- *
- * FORREAL:  Extract the next pending signal from p_sig into p_cursig;
- * stop the process if a stop has been requested or if a traced signal
- * is pending.
- *
- * JUSTLOOKING:  Don't stop the process, just indicate whether or not
- * a signal might be pending (FORREAL is needed to tell for sure).
- */
-static __inline__ int
-issig(int why)
-{
-	ASSERT(why == FORREAL || why == JUSTLOOKING);
-
-	return (signal_pending(current));
-}
+extern int issig(int why);

 #endif /* SPL_SIGNAL_H */
@@ -70,4 +70,17 @@ extern struct task_struct *spl_kthread_create(int (*func)(void *),

 extern proc_t p0;

+#ifdef HAVE_SIGINFO
+typedef kernel_siginfo_t spl_kernel_siginfo_t;
+#else
+typedef siginfo_t spl_kernel_siginfo_t;
+#endif
+
+#ifdef HAVE_SET_SPECIAL_STATE
+#define	spl_set_special_state(x) set_special_state((x))
+#else
+#define	spl_set_special_state(x) __set_current_state((x))
+#endif
+
+
 #endif  /* _SPL_THREAD_H */
@@ -70,7 +70,11 @@ extern int zpl_set_acl(struct user_namespace *userns, struct inode *ip,
 extern int zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type);
 #endif /* HAVE_SET_ACL_USERNS */
 #endif /* HAVE_SET_ACL */
+#if defined(HAVE_GET_ACL_RCU)
+extern struct posix_acl *zpl_get_acl(struct inode *ip, int type, bool rcu);
+#elif defined(HAVE_GET_ACL)
 extern struct posix_acl *zpl_get_acl(struct inode *ip, int type);
+#endif
 extern int zpl_init_acl(struct inode *ip, struct inode *dir);
 extern int zpl_chmod_acl(struct inode *ip);
 #else
@@ -41,10 +41,11 @@ typedef void (dsl_sigfunc_t)(void *, dmu_tx_t *);

 typedef enum zfs_space_check {
 	/*
-	 * Normal space check: if there is less than 3.2% free space,
-	 * the operation will fail.  Operations which are logically
-	 * creating things should use this (e.g. "zfs create", "zfs snapshot").
-	 * User writes (via the ZPL / ZVOL) also fail at this point.
+	 * Normal space check: if there is less than 3.2% free space (bounded
+	 * by spa_max_slop), the operation will fail.  Operations which are
+	 * logically creating things should use this (e.g. "zfs create", "zfs
+	 * snapshot").  User writes (via the ZPL / ZVOL) also fail at this
+	 * point.
 	 */
 	ZFS_SPACE_CHECK_NORMAL,

@@ -31,6 +31,7 @@ extern "C" {
 #endif

 #include <sys/nvpair.h>
+#include <sys/zfs_file.h>

 /*
 * Shared user/kernel definitions for class length, error channel name,
@@ -96,8 +97,8 @@ extern void fm_nvprint(nvlist_t *);
 extern void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector);
 extern int zfs_zevent_post(nvlist_t *, nvlist_t *, zevent_cb_t *);
 extern void zfs_zevent_drain_all(int *);
-extern int zfs_zevent_fd_hold(int, minor_t *, zfs_zevent_t **);
-extern void zfs_zevent_fd_rele(int);
+extern zfs_file_t *zfs_zevent_fd_hold(int, minor_t *, zfs_zevent_t **);
+extern void zfs_zevent_fd_rele(zfs_file_t *);
 extern int zfs_zevent_next(zfs_zevent_t *, nvlist_t **, uint64_t *, uint64_t *);
 extern int zfs_zevent_wait(zfs_zevent_t *);
 extern int zfs_zevent_seek(zfs_zevent_t *, uint64_t);
@@ -22,6 +22,8 @@
 #ifndef	_SYS_ZFS_FILE_H
 #define	_SYS_ZFS_FILE_H

+#include <sys/zfs_context.h>
+
 #ifndef _KERNEL
 typedef struct zfs_file {
 	int f_fd;
@@ -55,8 +57,8 @@ int zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len);
 loff_t zfs_file_off(zfs_file_t *fp);
 int zfs_file_unlink(const char *);

-int zfs_file_get(int fd, zfs_file_t **fp);
-void zfs_file_put(int fd);
+zfs_file_t *zfs_file_get(int fd);
+void zfs_file_put(zfs_file_t *fp);
 void *zfs_file_private(zfs_file_t *fp);

 #endif /* _SYS_ZFS_FILE_H */
@@ -566,7 +566,7 @@ typedef struct zfsdev_state {
 } zfsdev_state_t;

 extern void *zfsdev_get_state(minor_t minor, enum zfsdev_state_type which);
-extern int zfsdev_getminor(int fd, minor_t *minorp);
+extern int zfsdev_getminor(zfs_file_t *fp, minor_t *minorp);
 extern minor_t zfsdev_minor_alloc(void);

 extern uint_t zfs_fsyncer_key;
@@ -51,8 +51,8 @@ extern void zfs_onexit_destroy(zfs_onexit_t *zo);

 #endif

-extern int zfs_onexit_fd_hold(int fd, minor_t *minorp);
-extern void zfs_onexit_fd_rele(int fd);
+extern zfs_file_t *zfs_onexit_fd_hold(int fd, minor_t *minorp);
+extern void zfs_onexit_fd_rele(zfs_file_t *);
 extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
    uint64_t *action_handle);

@@ -966,16 +966,16 @@ kmem_asprintf(const char *fmt, ...)
 }

 /* ARGSUSED */
-int
+zfs_file_t *
 zfs_onexit_fd_hold(int fd, minor_t *minorp)
 {
 	*minorp = 0;
-	return (0);
+	return (NULL);
 }

 /* ARGSUSED */
 void
-zfs_onexit_fd_rele(int fd)
+zfs_onexit_fd_rele(zfs_file_t *fp)
 {
 }

@@ -1385,28 +1385,26 @@ zfs_file_unlink(const char *path)
 * Get reference to file pointer
 *
 * fd - input file descriptor
- * fpp - pointer to file pointer
 *
- * Returns 0 on success EBADF on failure.
+ * Returns pointer to file struct or NULL.
 * Unsupported in user space.
 */
-int
-zfs_file_get(int fd, zfs_file_t **fpp)
+zfs_file_t *
+zfs_file_get(int fd)
 {
 	abort();

-	return (EOPNOTSUPP);
+	return (NULL);
 }
-
 /*
 * Drop reference to file pointer
 *
- * fd - input file descriptor
+ * fp - pointer to file struct
 *
 * Unsupported in user space.
 */
 void
-zfs_file_put(int fd)
+zfs_file_put(zfs_file_t *fp)
 {
 	abort();
 }
@@ -172,11 +172,13 @@ zfs_crypto_dispatch(freebsd_crypt_session_t *session, 	struct cryptop *crp)
 			break;
 		mtx_lock(&session->fs_lock);
 		while (session->fs_done == false)
-			msleep(crp, &session->fs_lock, PRIBIO,
-			    "zfs_crypto", hz/5);
+			msleep(crp, &session->fs_lock, 0,
+			    "zfs_crypto", 0);
 		mtx_unlock(&session->fs_lock);

-		if (crp->crp_etype != EAGAIN) {
+		if (crp->crp_etype == ENOMEM) {
+			pause("zcrnomem", 1);
+		} else if (crp->crp_etype != EAGAIN) {
 			error = crp->crp_etype;
 			break;
 		}
@@ -379,7 +379,11 @@ vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
 	int i, n_bios, j;
 	size_t bios_size;

+#if __FreeBSD_version > 1300130
+	maxio = maxphys - (maxphys % cp->provider->sectorsize);
+#else
 	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
+#endif
 	n_bios = 0;

 	/* How many bios are required for all commands ? */
@@ -241,28 +241,21 @@ zfs_file_fsync(zfs_file_t *fp, int flags)
 	return (zfs_vop_fsync(fp->f_vnode));
 }

-int
-zfs_file_get(int fd, zfs_file_t **fpp)
+zfs_file_t *
+zfs_file_get(int fd)
 {
 	struct file *fp;

 	if (fget(curthread, fd, &cap_no_rights, &fp))
-		return (SET_ERROR(EBADF));
+		return (NULL);

-	*fpp = fp;
-	return (0);
+	return (fp);
 }

 void
-zfs_file_put(int fd)
+zfs_file_put(zfs_file_t *fp)
 {
-	struct file *fp;
-
-	/* No CAP_ rights required, as we're only releasing. */
-	if (fget(curthread, fd, &cap_no_rights, &fp) == 0) {
-		fdrop(fp, curthread);
-		fdrop(fp, curthread);
-	}
+	fdrop(fp, curthread);
 }

 loff_t
@@ -1229,7 +1229,11 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
 		args.mda_si_drv2 = zv;
 		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
 		    == 0) {
+#if __FreeBSD_version > 1300130
+			dev->si_iosize_max = maxphys;
+#else
 			dev->si_iosize_max = MAXPHYS;
+#endif
 			zsd->zsd_cdev = dev;
 		}
 	}
@@ -1265,9 +1269,10 @@ zvol_free(zvol_state_t *zv)
 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
 		struct cdev *dev = zsd->zsd_cdev;

-		ASSERT3P(dev->si_drv2, ==, NULL);
-
-		destroy_dev(dev);
+		if (dev != NULL) {
+			ASSERT3P(dev->si_drv2, ==, NULL);
+			destroy_dev(dev);
+		}
 	}

 	mutex_destroy(&zv->zv_state_lock);
@@ -1362,16 +1367,15 @@ zvol_create_minor_impl(const char *name)
 		args.mda_gid = GID_OPERATOR;
 		args.mda_mode = 0640;
 		args.mda_si_drv2 = zv;
-		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
-		if (error) {
-			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
-			mutex_destroy(&zv->zv_state_lock);
-			kmem_free(zv, sizeof (*zv));
-			dmu_objset_disown(os, B_TRUE, FTAG);
-			goto out_doi;
+		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
+		    == 0) {
+#if __FreeBSD_version > 1300130
+			dev->si_iosize_max = maxphys;
+#else
+			dev->si_iosize_max = MAXPHYS;
+#endif
+			zsd->zsd_cdev = dev;
 		}
-		dev->si_iosize_max = MAXPHYS;
-		zsd->zsd_cdev = dev;
 	}
 	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
@@ -1444,7 +1448,8 @@ zvol_clear_private(zvol_state_t *zv)
 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
 		struct cdev *dev = zsd->zsd_cdev;

-		dev->si_drv2 = NULL;
+		if (dev != NULL)
+			dev->si_drv2 = NULL;
 	}
 }

@@ -158,3 +158,54 @@ spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...)
 	} while (1);
 }
 EXPORT_SYMBOL(spl_kthread_create);
+
+/*
+ * The "why" argument indicates the allowable side-effects of the call:
+ *
+ * FORREAL:  Extract the next pending signal from p_sig into p_cursig;
+ * stop the process if a stop has been requested or if a traced signal
+ * is pending.
+ *
+ * JUSTLOOKING:  Don't stop the process, just indicate whether or not
+ * a signal might be pending (FORREAL is needed to tell for sure).
+ */
+int
+issig(int why)
+{
+	ASSERT(why == FORREAL || why == JUSTLOOKING);
+
+	if (!signal_pending(current))
+		return (0);
+
+	if (why != FORREAL)
+		return (1);
+
+	struct task_struct *task = current;
+	spl_kernel_siginfo_t __info;
+	sigset_t set;
+	siginitsetinv(&set, 1ULL << (SIGSTOP - 1) | 1ULL << (SIGTSTP - 1));
+	sigorsets(&set, &task->blocked, &set);
+
+	spin_lock_irq(&task->sighand->siglock);
+	int ret;
+	if ((ret = dequeue_signal(task, &set, &__info)) != 0) {
+#ifdef HAVE_SIGNAL_STOP
+		spin_unlock_irq(&task->sighand->siglock);
+		kernel_signal_stop();
+#else
+		if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+			spl_set_special_state(TASK_STOPPED);
+
+		spin_unlock_irq(&current->sighand->siglock);
+
+		schedule();
+#endif
+		return (0);
+	}
+
+	spin_unlock_irq(&task->sighand->siglock);
+
+	return (1);
+}
+
+EXPORT_SYMBOL(issig);
@@ -405,36 +405,22 @@ zfs_file_unlink(const char *path)
 * Get reference to file pointer
 *
 * fd - input file descriptor
- * fpp - pointer to file pointer
 *
- * Returns 0 on success EBADF on failure.
+ * Returns pointer to file struct or NULL
 */
-int
-zfs_file_get(int fd, zfs_file_t **fpp)
+zfs_file_t *
+zfs_file_get(int fd)
 {
-	zfs_file_t *fp;
-
-	fp = fget(fd);
-	if (fp == NULL)
-		return (EBADF);
-
-	*fpp = fp;
-
-	return (0);
+	return (fget(fd));
 }

 /*
 * Drop reference to file pointer
 *
- * fd - input file descriptor
+ * fp - input file struct pointer
 */
 void
-zfs_file_put(int fd)
+zfs_file_put(zfs_file_t *fp)
 {
-	struct file *fp;
-
-	if ((fp = fget(fd)) != NULL) {
-		fput(fp);
-		fput(fp);
-	}
+	fput(fp);
 }
@@ -33,6 +33,9 @@
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_project.h>
+#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
+#include <linux/pagemap.h>
+#endif

 /*
 * When using fallocate(2) to preallocate space, inflate the requested
@@ -1007,6 +1010,9 @@ const struct address_space_operations zpl_address_space_operations = {
 	.writepage	= zpl_writepage,
 	.writepages	= zpl_writepages,
 	.direct_IO	= zpl_direct_IO,
+#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
+	.set_page_dirty = __set_page_dirty_nobuffers,
+#endif
 };

 const struct file_operations zpl_file_operations = {
@@ -1012,13 +1012,12 @@ zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type)
 }
 #endif /* HAVE_SET_ACL */

-struct posix_acl *
-zpl_get_acl(struct inode *ip, int type)
+static struct posix_acl *
+zpl_get_acl_impl(struct inode *ip, int type)
 {
 	struct posix_acl *acl;
 	void *value = NULL;
 	char *name;
-	int size;

 	/*
 	 * As of Linux 3.14, the kernel get_acl will check this for us.
@@ -1042,7 +1041,7 @@ zpl_get_acl(struct inode *ip, int type)
 		return (ERR_PTR(-EINVAL));
 	}

-	size = zpl_xattr_get(ip, name, NULL, 0);
+	int size = zpl_xattr_get(ip, name, NULL, 0);
 	if (size > 0) {
 		value = kmem_alloc(size, KM_SLEEP);
 		size = zpl_xattr_get(ip, name, value, size);
@@ -1068,6 +1067,25 @@ zpl_get_acl(struct inode *ip, int type)
 	return (acl);
 }

+#if defined(HAVE_GET_ACL_RCU)
+struct posix_acl *
+zpl_get_acl(struct inode *ip, int type, bool rcu)
+{
+	if (rcu)
+		return (ERR_PTR(-ECHILD));
+
+	return (zpl_get_acl_impl(ip, type));
+}
+#elif defined(HAVE_GET_ACL)
+struct posix_acl *
+zpl_get_acl(struct inode *ip, int type)
+{
+	return (zpl_get_acl_impl(ip, type));
+}
+#else
+#error "Unsupported iops->get_acl() implementation"
+#endif /* HAVE_GET_ACL_RCU */
+
 int
 zpl_init_acl(struct inode *ip, struct inode *dir)
 {
@@ -1078,7 +1096,7 @@ zpl_init_acl(struct inode *ip, struct inode *dir)
 		return (0);

 	if (!S_ISLNK(ip->i_mode)) {
-		acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT);
+		acl = zpl_get_acl_impl(dir, ACL_TYPE_DEFAULT);
 		if (IS_ERR(acl))
 			return (PTR_ERR(acl));
 		if (!acl) {
@@ -1127,7 +1145,7 @@ zpl_chmod_acl(struct inode *ip)
 	if (S_ISLNK(ip->i_mode))
 		return (-EOPNOTSUPP);

-	acl = zpl_get_acl(ip, ACL_TYPE_ACCESS);
+	acl = zpl_get_acl_impl(ip, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl) || !acl)
 		return (PTR_ERR(acl));

@@ -1189,7 +1207,7 @@ __zpl_xattr_acl_get_access(struct inode *ip, const char *name,
 	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
 		return (-EOPNOTSUPP);

-	acl = zpl_get_acl(ip, type);
+	acl = zpl_get_acl_impl(ip, type);
 	if (IS_ERR(acl))
 		return (PTR_ERR(acl));
 	if (acl == NULL)
@@ -1217,7 +1235,7 @@ __zpl_xattr_acl_get_default(struct inode *ip, const char *name,
 	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
 		return (-EOPNOTSUPP);

-	acl = zpl_get_acl(ip, type);
+	acl = zpl_get_acl_impl(ip, type);
 	if (IS_ERR(acl))
 		return (PTR_ERR(acl));
 	if (acl == NULL)
@@ -727,7 +727,7 @@ static struct block_device_operations zvol_ops = {
 	.getgeo			= zvol_getgeo,
 	.owner			= THIS_MODULE,
 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-    .submit_bio		= zvol_submit_bio,
+	.submit_bio		= zvol_submit_bio,
 #endif
 };

@@ -760,13 +760,40 @@ zvol_alloc(dev_t dev, const char *name)
 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);

 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
+#ifdef HAVE_BLK_ALLOC_DISK
+	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
+	if (zso->zvo_disk == NULL)
+		goto out_kmem;
+
+	zso->zvo_disk->minors = ZVOL_MINORS;
+	zso->zvo_queue = zso->zvo_disk->queue;
 #else
-	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
-#endif
+	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
 	if (zso->zvo_queue == NULL)
 		goto out_kmem;

+	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+	if (zso->zvo_disk == NULL) {
+		blk_cleanup_queue(zso->zvo_queue);
+		goto out_kmem;
+	}
+
+	zso->zvo_disk->queue = zso->zvo_queue;
+#endif /* HAVE_BLK_ALLOC_DISK */
+#else
+	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
+	if (zso->zvo_queue == NULL)
+		goto out_kmem;
+
+	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+	if (zso->zvo_disk == NULL) {
+		blk_cleanup_queue(zso->zvo_queue);
+		goto out_kmem;
+	}
+
+	zso->zvo_disk->queue = zso->zvo_queue;
+#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
+
 	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);

 	/* Limit read-ahead to a single page to prevent over-prefetching. */
@@ -775,10 +802,6 @@ zvol_alloc(dev_t dev, const char *name)
 	/* Disable write merging in favor of the ZIO pipeline. */
 	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);

-	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
-	if (zso->zvo_disk == NULL)
-		goto out_queue;
-
 	zso->zvo_queue->queuedata = zv;
 	zso->zvo_dev = dev;
 	zv->zv_open_count = 0;
@@ -809,14 +832,11 @@ zvol_alloc(dev_t dev, const char *name)
 	zso->zvo_disk->first_minor = (dev & MINORMASK);
 	zso->zvo_disk->fops = &zvol_ops;
 	zso->zvo_disk->private_data = zv;
-	zso->zvo_disk->queue = zso->zvo_queue;
 	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
 	    ZVOL_DEV_NAME, (dev & MINORMASK));

 	return (zv);

-out_queue:
-	blk_cleanup_queue(zso->zvo_queue);
 out_kmem:
 	kmem_free(zso, sizeof (struct zvol_state_os));
 	kmem_free(zv, sizeof (zvol_state_t));
@@ -847,8 +867,13 @@ zvol_free(zvol_state_t *zv)
 	zfs_rangelock_fini(&zv->zv_rangelock);

 	del_gendisk(zv->zv_zso->zvo_disk);
+#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
+	defined(HAVE_BLK_ALLOC_DISK)
+	blk_cleanup_disk(zv->zv_zso->zvo_disk);
+#else
 	blk_cleanup_queue(zv->zv_zso->zvo_queue);
 	put_disk(zv->zv_zso->zvo_disk);
+#endif

 	ida_simple_remove(&zvol_ida,
 	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
@@ -1090,42 +1090,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 	db->db.db_data = buf->b_data;
 }

-static arc_buf_t *
-dbuf_alloc_arcbuf_from_arcbuf(dmu_buf_impl_t *db, arc_buf_t *data)
-{
-	objset_t *os = db->db_objset;
-	spa_t *spa = os->os_spa;
-	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-	enum zio_compress compress_type;
-	uint8_t complevel;
-	int psize, lsize;
-
-	psize = arc_buf_size(data);
-	lsize = arc_buf_lsize(data);
-	compress_type = arc_get_compression(data);
-	complevel = arc_get_complevel(data);
-
-	if (arc_is_encrypted(data)) {
-		boolean_t byteorder;
-		uint8_t salt[ZIO_DATA_SALT_LEN];
-		uint8_t iv[ZIO_DATA_IV_LEN];
-		uint8_t mac[ZIO_DATA_MAC_LEN];
-		dnode_t *dn = DB_DNODE(db);
-
-		arc_get_raw_params(data, &byteorder, salt, iv, mac);
-		data = arc_alloc_raw_buf(spa, db, dmu_objset_id(os),
-		    byteorder, salt, iv, mac, dn->dn_type, psize, lsize,
-		    compress_type, complevel);
-	} else if (compress_type != ZIO_COMPRESS_OFF) {
-		ASSERT3U(type, ==, ARC_BUFC_DATA);
-		data = arc_alloc_compressed_buf(spa, db,
-		    psize, lsize, compress_type, complevel);
-	} else {
-		data = arc_alloc_buf(spa, db, type, psize);
-	}
-	return (data);
-}
-
 static arc_buf_t *
 dbuf_alloc_arcbuf(dmu_buf_impl_t *db)
 {
@@ -1575,9 +1539,35 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 		arc_space_consume(bonuslen, ARC_SPACE_BONUS);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
 	} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
-		arc_buf_t *buf = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf);
-		dr->dt.dl.dr_data = buf;
-		bcopy(db->db.db_data, buf->b_data, arc_buf_size(buf));
+		dnode_t *dn = DB_DNODE(db);
+		int size = arc_buf_size(db->db_buf);
+		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+		spa_t *spa = db->db_objset->os_spa;
+		enum zio_compress compress_type =
+		    arc_get_compression(db->db_buf);
+		uint8_t complevel = arc_get_complevel(db->db_buf);
+
+		if (arc_is_encrypted(db->db_buf)) {
+			boolean_t byteorder;
+			uint8_t salt[ZIO_DATA_SALT_LEN];
+			uint8_t iv[ZIO_DATA_IV_LEN];
+			uint8_t mac[ZIO_DATA_MAC_LEN];
+
+			arc_get_raw_params(db->db_buf, &byteorder, salt,
+			    iv, mac);
+			dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
+			    dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
+			    mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
+			    compress_type, complevel);
+		} else if (compress_type != ZIO_COMPRESS_OFF) {
+			ASSERT3U(type, ==, ARC_BUFC_DATA);
+			dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
+			    size, arc_buf_lsize(db->db_buf), compress_type,
+			    complevel);
+		} else {
+			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
+		}
+		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 	} else {
 		db->db_buf = NULL;
 		dbuf_clear_data(db);
@@ -3280,10 +3270,30 @@ noinline static void
 dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
 {
 	dbuf_dirty_record_t *dr = db->db_data_pending;
-	arc_buf_t *newdata, *data = dr->dt.dl.dr_data;
+	arc_buf_t *data = dr->dt.dl.dr_data;
+	enum zio_compress compress_type = arc_get_compression(data);
+	uint8_t complevel = arc_get_complevel(data);
+
+	if (arc_is_encrypted(data)) {
+		boolean_t byteorder;
+		uint8_t salt[ZIO_DATA_SALT_LEN];
+		uint8_t iv[ZIO_DATA_IV_LEN];
+		uint8_t mac[ZIO_DATA_MAC_LEN];
+
+		arc_get_raw_params(data, &byteorder, salt, iv, mac);
+		dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db,
+		    dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,
+		    dn->dn_type, arc_buf_size(data), arc_buf_lsize(data),
+		    compress_type, complevel));
+	} else if (compress_type != ZIO_COMPRESS_OFF) {
+		dbuf_set_data(db, arc_alloc_compressed_buf(
+		    dn->dn_objset->os_spa, db, arc_buf_size(data),
+		    arc_buf_lsize(data), compress_type, complevel));
+	} else {
+		dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db,
+		    DBUF_GET_BUFC_TYPE(db), db->db.db_size));
+	}

-	newdata = dbuf_alloc_arcbuf_from_arcbuf(db, data);
-	dbuf_set_data(db, newdata);
 	rw_enter(&db->db_rwlock, RW_WRITER);
 	bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
 	rw_exit(&db->db_rwlock);
@@ -4067,8 +4077,31 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 		 * objects only modified in the syncing context (e.g.
 		 * DNONE_DNODE blocks).
 		 */
-		*datap = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf);
-		bcopy(db->db.db_data, (*datap)->b_data, arc_buf_size(*datap));
+		int psize = arc_buf_size(*datap);
+		int lsize = arc_buf_lsize(*datap);
+		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+		enum zio_compress compress_type = arc_get_compression(*datap);
+		uint8_t complevel = arc_get_complevel(*datap);
+
+		if (arc_is_encrypted(*datap)) {
+			boolean_t byteorder;
+			uint8_t salt[ZIO_DATA_SALT_LEN];
+			uint8_t iv[ZIO_DATA_IV_LEN];
+			uint8_t mac[ZIO_DATA_MAC_LEN];
+
+			arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
+			*datap = arc_alloc_raw_buf(os->os_spa, db,
+			    dmu_objset_id(os), byteorder, salt, iv, mac,
+			    dn->dn_type, psize, lsize, compress_type,
+			    complevel);
+		} else if (compress_type != ZIO_COMPRESS_OFF) {
+			ASSERT3U(type, ==, ARC_BUFC_DATA);
+			*datap = arc_alloc_compressed_buf(os->os_spa, db,
+			    psize, lsize, compress_type, complevel);
+		} else {
+			*datap = arc_alloc_buf(os->os_spa, db, type, psize);
+		}
+		bcopy(db->db.db_data, (*datap)->b_data, psize);
 	}
 	db->db_data_pending = dr;

@@ -503,7 +503,7 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 {
 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		ddt_t *ddt = spa->spa_ddt[c];
-		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+		for (enum ddt_type type = 0; type < DDT_TYPES && ddt; type++) {
 			for (enum ddt_class class = 0; class < DDT_CLASSES;
 			    class++) {
 				ddt_histogram_add(ddh,
@@ -129,6 +129,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
 	zfs_refcount_create(&dn->dn_tx_holds);
 	list_link_init(&dn->dn_link);

+	bzero(&dn->dn_next_type[0], sizeof (dn->dn_next_type));
 	bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
 	bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
 	bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
@@ -909,15 +909,16 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
 }

 typedef struct livelist_entry {
-	const blkptr_t *le_bp;
+	blkptr_t le_bp;
+	uint32_t le_refcnt;
 	avl_node_t le_node;
 } livelist_entry_t;

 static int
 livelist_compare(const void *larg, const void *rarg)
 {
-	const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
-	const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
+	const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp;
+	const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp;

 	/* Sort them according to dva[0] */
 	uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
@@ -944,6 +945,11 @@ struct livelist_iter_arg {
 * Expects an AVL tree which is incrementally filled will FREE blkptrs
 * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
 * corresponding FREE are stored in the supplied bplist.
+ *
+ * Note that multiple FREE and ALLOC entries for the same blkptr may
+ * be encountered when dedup is involved. For this reason we keep a
+ * refcount for all the FREE entries of each blkptr and ensure that
+ * each of those FREE entries has a corresponding ALLOC preceding it.
 */
 static int
 dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
@@ -957,23 +963,47 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,

 	if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
 		return (SET_ERROR(EINTR));
+
+	livelist_entry_t node;
+	node.le_bp = *bp;
+	livelist_entry_t *found = avl_find(avl, &node, NULL);
 	if (bp_freed) {
-		livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
-		    KM_SLEEP);
-		blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
-		*temp_bp = *bp;
-		node->le_bp = temp_bp;
-		avl_add(avl, node);
-	} else {
-		livelist_entry_t node;
-		node.le_bp = bp;
-		livelist_entry_t *found = avl_find(avl, &node, NULL);
-		if (found != NULL) {
-			avl_remove(avl, found);
-			kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
-			kmem_free(found, sizeof (livelist_entry_t));
+		if (found == NULL) {
+			/* first free entry for this blkptr */
+			livelist_entry_t *e =
+			    kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP);
+			e->le_bp = *bp;
+			e->le_refcnt = 1;
+			avl_add(avl, e);
 		} else {
+			/* dedup block free */
+			ASSERT(BP_GET_DEDUP(bp));
+			ASSERT3U(BP_GET_CHECKSUM(bp), ==,
+			    BP_GET_CHECKSUM(&found->le_bp));
+			ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
+			found->le_refcnt++;
+		}
+	} else {
+		if (found == NULL) {
+			/* block is currently marked as allocated */
 			bplist_append(to_free, bp);
+		} else {
+			/* alloc matches a free entry */
+			ASSERT3U(found->le_refcnt, !=, 0);
+			found->le_refcnt--;
+			if (found->le_refcnt == 0) {
+				/* all tracked free pairs have been matched */
+				avl_remove(avl, found);
+				kmem_free(found, sizeof (livelist_entry_t));
+			} else {
+				/*
+				 * This is definitely a deduped blkptr so
+				 * let's validate it.
+				 */
+				ASSERT(BP_GET_DEDUP(bp));
+				ASSERT3U(BP_GET_CHECKSUM(bp), ==,
+				    BP_GET_CHECKSUM(&found->le_bp));
+			}
 		}
 	}
 	return (0);
@@ -999,6 +1029,7 @@ dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
 	};
 	int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);

+	VERIFY0(avl_numnodes(&avl));
 	avl_destroy(&avl);
 	return (err);
 }
@@ -586,25 +586,29 @@ zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze)
 	return (0);
 }

-int
+zfs_file_t *
 zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze)
 {
-	int error;
+	zfs_file_t *fp = zfs_file_get(fd);
+	if (fp == NULL)
+		return (NULL);

-	error = zfsdev_getminor(fd, minorp);
+	int error = zfsdev_getminor(fp, minorp);
 	if (error == 0)
 		error = zfs_zevent_minor_to_state(*minorp, ze);

-	if (error)
-		zfs_zevent_fd_rele(fd);
+	if (error) {
+		zfs_zevent_fd_rele(fp);
+		fp = NULL;
+	}

-	return (error);
+	return (fp);
 }

 void
-zfs_zevent_fd_rele(int fd)
+zfs_zevent_fd_rele(zfs_file_t *fp)
 {
-	zfs_file_put(fd);
+	zfs_file_put(fp);
 }

 /*
@@ -347,10 +347,10 @@ int spa_asize_inflation = 24;

 /*
 * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
- * the pool to be consumed.  This ensures that we don't run the pool
- * completely out of space, due to unaccounted changes (e.g. to the MOS).
- * It also limits the worst-case time to allocate space.  If we have
- * less than this amount of free space, most ZPL operations (e.g. write,
+ * the pool to be consumed (bounded by spa_max_slop).  This ensures that we
+ * don't run the pool completely out of space, due to unaccounted changes (e.g.
+ * to the MOS).  It also limits the worst-case time to allocate space.  If we
+ * have less than this amount of free space, most ZPL operations (e.g.  write,
 * create) will return ENOSPC.
 *
 * Certain operations (e.g. file removal, most administrative actions) can
@@ -374,10 +374,15 @@ int spa_asize_inflation = 24;
 * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
 * but we never allow it to be more than half the pool size.
 *
+ * Further, on very large pools, the slop space will be smaller than
+ * 3.2%, to avoid reserving much more space than we actually need; bounded
+ * by spa_max_slop (128GB).
+ *
 * See also the comments in zfs_space_check_t.
 */
 int spa_slop_shift = 5;
-uint64_t spa_min_slop = 128 * 1024 * 1024;
+uint64_t spa_min_slop = 128ULL * 1024 * 1024;
+uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
 int spa_allocators = 4;


@@ -1775,17 +1780,38 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
 }

 /*
- * Return the amount of slop space in bytes.  It is 1/32 of the pool (3.2%),
- * or at least 128MB, unless that would cause it to be more than half the
- * pool size.
+ * Return the amount of slop space in bytes.  It is typically 1/32 of the pool
+ * (3.2%).  On very small pools, it may be slightly larger than this.  On very
+ * large pools, it will be capped to the value of spa_max_slop.
 *
- * See the comment above spa_slop_shift for details.
+ * See the comment above spa_slop_shift for more details.
 */
 uint64_t
 spa_get_slop_space(spa_t *spa)
 {
-	uint64_t space = spa_get_dspace(spa);
-	return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
+	uint64_t space = 0;
+	uint64_t slop = 0;
+
+	/*
+	 * Make sure spa_dedup_dspace has been set.
+	 */
+	if (spa->spa_dedup_dspace == ~0ULL)
+		spa_update_dspace(spa);
+
+	/*
+	 * spa_get_dspace() includes the space only logically "used" by
+	 * deduplicated data, so since it's not useful to reserve more
+	 * space with more deduplicated data, we subtract that out here.
+	 */
+	space = spa_get_dspace(spa) - spa->spa_dedup_dspace;
+	slop = MIN(space >> spa_slop_shift, spa_max_slop);
+
+	/*
+	 * Slop space should be at least spa_min_slop, but no more than half
+	 * the entire pool.
+	 */
+	slop = MAX(slop, MIN(space >> 1, spa_min_slop));
+	return (slop);
 }

 uint64_t
@@ -1823,7 +1849,14 @@ spa_update_dspace(spa_t *spa)
 		spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 		vdev_t *vd =
 		    vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
-		if (vd->vdev_mg->mg_class == spa_normal_class(spa)) {
+		/*
+		 * If the stars align, we can wind up here after
+		 * vdev_remove_complete() has cleared vd->vdev_mg but before
+		 * spa->spa_vdev_removal gets cleared, so we must check before
+		 * we dereference.
+		 */
+		if (vd->vdev_mg &&
+		    vd->vdev_mg->mg_class == spa_normal_class(spa)) {
 			spa->spa_dspace -= spa_deflate(spa) ?
 			    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
 		}
@@ -4834,8 +4834,8 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
 	*errors = fnvlist_alloc();
 	off = 0;

-	if ((error = zfs_file_get(input_fd, &input_fp)))
-		return (error);
+	if ((input_fp = zfs_file_get(input_fd)) == NULL)
+		return (SET_ERROR(EBADF));

 	noff = off = zfs_file_off(input_fp);
 	error = dmu_recv_begin(tofs, tosnap, begin_record, force,
@@ -5115,7 +5115,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
 		nvlist_free(inheritprops);
 	}
 out:
-	zfs_file_put(input_fd);
+	zfs_file_put(input_fp);
 	nvlist_free(origrecvd);
 	nvlist_free(origprops);

@@ -5445,8 +5445,8 @@ zfs_ioc_send(zfs_cmd_t *zc)
 		zfs_file_t *fp;
 		dmu_send_outparams_t out = {0};

-		if ((error = zfs_file_get(zc->zc_cookie, &fp)))
-			return (error);
+		if ((fp = zfs_file_get(zc->zc_cookie)) == NULL)
+			return (SET_ERROR(EBADF));

 		off = zfs_file_off(fp);
 		out.dso_outfunc = dump_bytes;
@@ -5456,7 +5456,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
 		    zc->zc_fromobj, embedok, large_block_ok, compressok,
 		    rawok, savedok, zc->zc_cookie, &off, &out);

-		zfs_file_put(zc->zc_cookie);
+		zfs_file_put(fp);
 	}
 	return (error);
 }
@@ -6020,25 +6020,24 @@ zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
 {
 	char *snap_name;
 	char *hold_name;
-	int error;
 	minor_t minor;

-	error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
-	if (error != 0)
-		return (error);
+	zfs_file_t *fp = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
+	if (fp == NULL)
+		return (SET_ERROR(EBADF));

 	snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
 	    (u_longlong_t)ddi_get_lbolt64());
 	hold_name = kmem_asprintf("%%%s", zc->zc_value);

-	error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
+	int error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
 	    hold_name);
 	if (error == 0)
 		(void) strlcpy(zc->zc_value, snap_name,
 		    sizeof (zc->zc_value));
 	kmem_strfree(snap_name);
 	kmem_strfree(hold_name);
-	zfs_onexit_fd_rele(zc->zc_cleanup_fd);
+	zfs_onexit_fd_rele(fp);
 	return (error);
 }

@@ -6058,13 +6057,13 @@ zfs_ioc_diff(zfs_cmd_t *zc)
 	offset_t off;
 	int error;

-	if ((error = zfs_file_get(zc->zc_cookie, &fp)))
-		return (error);
+	if ((fp = zfs_file_get(zc->zc_cookie)) == NULL)
+		return (SET_ERROR(EBADF));

 	off = zfs_file_off(fp);
 	error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off);

-	zfs_file_put(zc->zc_cookie);
+	zfs_file_put(fp);

 	return (error);
 }
@@ -6100,6 +6099,7 @@ zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
 	int cleanup_fd = -1;
 	int error;
 	minor_t minor = 0;
+	zfs_file_t *fp = NULL;

 	holds = fnvlist_lookup_nvlist(args, "holds");

@@ -6117,14 +6117,16 @@ zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
 	}

 	if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
-		error = zfs_onexit_fd_hold(cleanup_fd, &minor);
-		if (error != 0)
-			return (SET_ERROR(error));
+		fp = zfs_onexit_fd_hold(cleanup_fd, &minor);
+		if (fp == NULL)
+			return (SET_ERROR(EBADF));
 	}

 	error = dsl_dataset_user_hold(holds, minor, errlist);
-	if (minor != 0)
-		zfs_onexit_fd_rele(cleanup_fd);
+	if (fp != NULL) {
+		ASSERT3U(minor, !=, 0);
+		zfs_onexit_fd_rele(fp);
+	}
 	return (SET_ERROR(error));
 }

@@ -6187,9 +6189,9 @@ zfs_ioc_events_next(zfs_cmd_t *zc)
 	uint64_t dropped = 0;
 	int error;

-	error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze);
-	if (error != 0)
-		return (error);
+	zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze);
+	if (fp == NULL)
+		return (SET_ERROR(EBADF));

 	do {
 		error = zfs_zevent_next(ze, &event,
@@ -6211,7 +6213,7 @@ zfs_ioc_events_next(zfs_cmd_t *zc)
 			break;
 	} while (1);

-	zfs_zevent_fd_rele(zc->zc_cleanup_fd);
+	zfs_zevent_fd_rele(fp);

 	return (error);
 }
@@ -6243,12 +6245,12 @@ zfs_ioc_events_seek(zfs_cmd_t *zc)
 	minor_t minor;
 	int error;

-	error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze);
-	if (error != 0)
-		return (error);
+	zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze);
+	if (fp == NULL)
+		return (SET_ERROR(EBADF));

 	error = zfs_zevent_seek(ze, zc->zc_guid);
-	zfs_zevent_fd_rele(zc->zc_cleanup_fd);
+	zfs_zevent_fd_rele(fp);

 	return (error);
 }
@@ -6432,8 +6434,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)

 	(void) nvlist_lookup_string(innvl, "redactbook", &redactbook);

-	if ((error = zfs_file_get(fd, &fp)))
-		return (error);
+	if ((fp = zfs_file_get(fd)) == NULL)
+		return (SET_ERROR(EBADF));

 	off = zfs_file_off(fp);

@@ -6445,7 +6447,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 	    compressok, rawok, savedok, resumeobj, resumeoff,
 	    redactbook, fd, &off, &out);

-	zfs_file_put(fd);
+	zfs_file_put(fp);
 	return (error);
 }

@@ -7318,17 +7320,12 @@ pool_status_check(const char *name, zfs_ioc_namecheck_t type,
 }

 int
-zfsdev_getminor(int fd, minor_t *minorp)
+zfsdev_getminor(zfs_file_t *fp, minor_t *minorp)
 {
 	zfsdev_state_t *zs, *fpd;
-	zfs_file_t *fp;
-	int rc;

 	ASSERT(!MUTEX_HELD(&zfsdev_state_lock));

-	if ((rc = zfs_file_get(fd, &fp)))
-		return (rc);
-
 	fpd = zfs_file_private(fp);
 	if (fpd == NULL)
 		return (SET_ERROR(EBADF));
@@ -126,9 +126,11 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)

 	/* Now pack the attributes up in a single uint64_t */
 	attrs = (uint64_t *)bitmap;
-	crtime = attrs + 1;
-	scanstamp = (caddr_t)(crtime + 2);
 	*attrs = 0;
+	crtime = attrs + 1;
+	bzero(crtime, 2 * sizeof (uint64_t));
+	scanstamp = (caddr_t)(crtime + 2);
+	bzero(scanstamp, AV_SCANSTAMP_SZ);
 	if (XVA_ISSET_REQ(xvap, XAT_READONLY))
 		*attrs |= (xoap->xoa_readonly == 0) ? 0 :
 		    XAT0_READONLY;
@@ -107,30 +107,33 @@ zfs_onexit_destroy(zfs_onexit_t *zo)
 * of this function must call zfs_onexit_fd_rele() when they're finished
 * using the minor number.
 */
-int
+zfs_file_t *
 zfs_onexit_fd_hold(int fd, minor_t *minorp)
 {
 	zfs_onexit_t *zo = NULL;
-	int error;

-	error = zfsdev_getminor(fd, minorp);
+	zfs_file_t *fp = zfs_file_get(fd);
+	if (fp == NULL)
+		return (NULL);
+
+	int error = zfsdev_getminor(fp, minorp);
 	if (error) {
-		zfs_onexit_fd_rele(fd);
-		return (error);
+		zfs_onexit_fd_rele(fp);
+		return (NULL);
 	}

 	zo = zfsdev_get_state(*minorp, ZST_ONEXIT);
 	if (zo == NULL) {
-		zfs_onexit_fd_rele(fd);
-		return (SET_ERROR(EBADF));
+		zfs_onexit_fd_rele(fp);
+		return (NULL);
 	}
-	return (0);
+	return (fp);
 }

 void
-zfs_onexit_fd_rele(int fd)
+zfs_onexit_fd_rele(zfs_file_t *fp)
 {
-	zfs_file_put(fd);
+	zfs_file_put(fp);
 }

 static int
@@ -1617,7 +1617,7 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 	lr_t *lrcb, *lrc;
 	lr_write_t *lrwb, *lrw;
 	char *lr_buf;
-	uint64_t dlen, dnow, lwb_sp, reclen, txg, max_log_data;
+	uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data;

 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3P(lwb, !=, NULL);
@@ -1651,8 +1651,9 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
 		dlen = P2ROUNDUP_TYPED(
 		    lrw->lr_length, sizeof (uint64_t), uint64_t);
+		dpad = dlen - lrw->lr_length;
 	} else {
-		dlen = 0;
+		dlen = dpad = 0;
 	}
 	reclen = lrc->lrc_reclen;
 	zilog->zl_cur_used += (reclen + dlen);
@@ -1746,6 +1747,9 @@ cont:
 			error = zilog->zl_get_data(itx->itx_private,
 			    itx->itx_gen, lrwb, dbuf, lwb,
 			    lwb->lwb_write_zio);
+			if (dbuf != NULL && error == 0 && dnow == dlen)
+				/* Zero any padding bytes in the last block. */
+				bzero((char *)dbuf + lrwb->lr_length, dpad);

 			if (error == EIO) {
 				txg_wait_synced(zilog->zl_dmu_pool, txg);
@@ -1783,18 +1787,19 @@ cont:
 }

 itx_t *
-zil_itx_create(uint64_t txtype, size_t lrsize)
+zil_itx_create(uint64_t txtype, size_t olrsize)
 {
-	size_t itxsize;
+	size_t itxsize, lrsize;
 	itx_t *itx;

-	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
+	lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t);
 	itxsize = offsetof(itx_t, itx_lr) + lrsize;

 	itx = zio_data_buf_alloc(itxsize);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
+	bzero((char *)&itx->itx_lr + olrsize, lrsize - olrsize);
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
 	itx->itx_callback = NULL;
 	itx->itx_callback_data = NULL;
@@ -165,8 +165,8 @@ tags = ['functional', 'cli_root', 'zfs_create']

 [tests/functional/cli_root/zfs_destroy]
 tests = ['zfs_clone_livelist_condense_and_disable',
-    'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos',
-    'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
+    'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup',
+    'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
    'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg',
    'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos',
    'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos',
@@ -0,0 +1,88 @@
+#!/bin/ksh -p
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2021 by Delphix. All rights reserved.
+#
+
+# DESCRIPTION
+# Verify zfs destroy test for clones with livelists that contain
+# dedup blocks. This test is a baseline regression test created
+# to ensure that past bugs that we've encountered between dedup
+# and the livelist logic don't resurface.
+
+# STRATEGY
+# 1. Create a clone from a test filesystem and enable dedup.
+# 2. Write some data and create a livelist.
+# 3. Copy the data within the clone to create dedup blocks.
+# 4. Remove some of the dedup data to create multiple free
+#    entries for the same block pointers.
+# 5. Process all the livelist entries by destroying the clone.
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib
+
+function cleanup
+{
+	log_must zfs destroy -Rf $TESTPOOL/$TESTFS1
+	# Reset the minimum percent shared to 75
+	set_tunable32 LIVELIST_MIN_PERCENT_SHARED $ORIGINAL_MIN_SHARED
+}
+
+function test_dedup
+{
+	# Set a small percent shared threshold so the livelist is not disabled
+	set_tunable32 LIVELIST_MIN_PERCENT_SHARED 10
+	clone_dataset $TESTFS1 snap $TESTCLONE
+
+	# Enable dedup
+	log_must zfs set dedup=on $TESTPOOL/$TESTCLONE
+
+	# Create some data to be deduped
+	log_must dd if=/dev/urandom of="/$TESTPOOL/$TESTCLONE/data" bs=512 count=10k
+
+	# Create dedup blocks
+	# Note: We sync before and after so all dedup blocks belong to the
+	#       same TXG, otherwise they won't look identical to the livelist
+	#       iterator due to their logical birth TXG being different.
+	log_must zpool sync $TESTPOOL
+	log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-0
+	log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-1
+	log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-2
+	log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-3
+	log_must zpool sync $TESTPOOL
+	check_livelist_exists $TESTCLONE
+
+	# Introduce "double frees"
+	#   We want to introduce consecutive FREEs of the same block as this
+	#   was what triggered past panics.
+	# Note: Similarly to the previouys step we sync before and after our
+	#       our deletions so all the entries end up in the same TXG.
+	log_must zpool sync $TESTPOOL
+	log_must rm /$TESTPOOL/$TESTCLONE/data-dup-2
+	log_must rm /$TESTPOOL/$TESTCLONE/data-dup-3
+	log_must zpool sync $TESTPOOL
+	check_livelist_exists $TESTCLONE
+
+	log_must zfs destroy $TESTPOOL/$TESTCLONE
+	check_livelist_gone
+}
+
+ORIGINAL_MIN_SHARED=$(get_tunable LIVELIST_MIN_PERCENT_SHARED)
+
+log_onexit cleanup
+log_must zfs create $TESTPOOL/$TESTFS1
+log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile
+log_must zfs snapshot $TESTPOOL/$TESTFS1@snap
+test_dedup
+
+log_pass "Clone's livelist processes dedup blocks as expected."
Author	SHA1	Message	Date
Tony Hutter	ef686e96ec	Tag zfs-2.0.6 META file and changelog updated. Signed-off-by: Tony Hutter <hutter2@llnl.gov>	2021-09-22 15:19:08 -07:00
Brian Behlendorf	7a41ef240a	Linux 5.15 compat: get_acl() Kernel commits 332f606b32b6 ovl: enable RCU'd ->get_acl() 0cad6246621b vfs: add rcu argument to ->get_acl() callback Added compatibility code to detect the new ->get_acl() interface and correctly handle the case where the new rcu argument is set. Reviewed-by: Coleman Kane <ckane@colemankane.org> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #12548	2021-09-22 15:19:08 -07:00
Alexander	72d16a9b49	Linux 5.15 compat: standalone <linux/stdarg.h> Kernel commits 39f75da7bcc8 ("isystem: trim/fixup stdarg.h and other headers") c0891ac15f04 ("isystem: ship and use stdarg.h") 564f963eabd1 ("isystem: delete global -isystem compile option") (for now can be found in linux-next.git tree, will land into the Linus' tree during the ongoing 5.15 cycle with one of akpm merges) removed the -isystem flag and disallowed the inclusion of any compiler header files. They also introduced a minimal <linux/stdarg.h> as a replacement for <stdarg.h>. include/os/linux/spl/sys/cmn_err.h in the ZFS source tree includes <stdarg.h> unconditionally. Introduce a test for <linux/stdarg.h> and include it instead of the compiler's one to prevent module build breakage. Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Lobakin <alobakin@pm.me> Closes #12531	2021-09-22 15:19:08 -07:00
Brian Behlendorf	54c358c3f2	Linux 5.15 compat: block device readahead The 5.15 kernel moved the backing_dev_info structure out of the request queue structure which causes a build failure. Rather than look in the new location for the BDI we instead detect this upstream refactoring by the existance of either the blk_queue_update_readahead() or disk_update_readahead() functions. In either case, there's no longer any reason to manually set the ra_pages value since it will be overridden with a reasonable default (2x the block size) when blk_queue_io_opt() is called. Therefore, we update the compatibility wrapper to do nothing for 5.9 and newer kernels. While it's tempting to do the same for older kernels we want to keep the compatibility code to preserve the existing behavior. Removing it would effectively increase the default readahead to 128k. Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #12532	2021-09-22 15:19:08 -07:00
Brian Behlendorf	560e9fc817	Linux 5.14 compat: META Increase the Linux-Maximum version in the META file to 5.14. All of the required compatibility patches have been merged and the 5.14 kernel has been officially released. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #12565	2021-09-22 15:19:08 -07:00
Brian Behlendorf	d642efe83c	Linux 5.13 compat: META Increase the Linux-Maximum version in the META file to 5.13. All of the required compatibility patches have been merged and the 5.13 kernel has been officially released. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>	2021-09-22 15:19:08 -07:00
Alexander Motin	51e313f610	FreeBSD: Ignore make_dev_s() errors Since errors returned by zvol_create_minor_impl() are ignored by the common code, it is more convenient to ignore make_dev_s() errors there. It allows, for example, to get device created for the zvol after later rename instead of having it further stuck in half-created state. zvol_rename_minor() already ignores those errors. While there, switch from MAXPHYS to maxphys in FreeBSD 13+. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com> Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored-By: iXsystems, Inc. Closes #12375	2021-09-22 15:19:08 -07:00
Alexander Motin	327f12c291	FreeBSD: Switch from MAXPHYS to maxphys on FreeBSD 13+ Reviewed-by: Allan Jude <allan@klarasystems.com> Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored-By: iXsystems, Inc. Closes #12378	2021-09-22 15:19:08 -07:00
Alexander Motin	5d4f7e7566	FreeBSD: Retry OCF ENOMEM errors. ZFS does not expect transient errors from crypto. For read they are counted as checksum errors, while for write end up in panic. To not panic on random low memory conditions retry ENOMEM errors in the OCF wrapper function. While there remove unneeded timeout and priority from msleep(). External-issue: https://reviews.freebsd.org/D30339 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Mark Maybee <mark.maybee@delphix.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored-By: iXsystems, Inc. Closes #12077	2021-09-22 15:19:08 -07:00
Serapheim Dimitropoulos	abd0b59e48	Livelist logic should handle dedup blkptrs Update the logic to handle the dedup-case of consecutive FREEs in the livelist code. The logic still ensures that all the FREE entries are matched up with a respective ALLOC by keeping a refcount for each FREE blkptr that we encounter and ensuring that this refcount gets to zero by the time we are done processing the livelist. zdb -y no longer panics when encountering double frees Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: John Kennedy <john.kennedy@delphix.com> Reviewed-by: Don Brady <don.brady@delphix.com> Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com> Closes #11480 Closes #12177	2021-09-22 15:19:08 -07:00
Coleman Kane	6b9d0eda75	Linux 5.14 compat: explicity assign set_page_dirty Kernel 5.14 introduced a change where set_page_dirty of struct address_space_operations is no longer implicitly set to __set_page_dirty_buffers(), which ended up resulting in a NULL pointer deref in the kernel when it is attempted to be called. This change sets .set_page_dirty in the structure to __set_page_dirty_nobuffers(), which was introduced with the related patch set. The breaking change was introduce in commit 0af573780b0b13fceb7fabd49dc1b073cee9a507 to torvalds/linux.git. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Coleman Kane <ckane@colemankane.org> Closes #12427	2021-09-22 15:19:08 -07:00
Paul Dagnelie	744cdfd93b	Add SIGSTOP and SIGTSTP handling to issig This change adds SIGSTOP and SIGTSTP handling to the issig function; this mirrors its behavior on Solaris. This way, long running kernel tasks can be stopped with the appropriate signals. Note that doing so with ctrl-z on the command line doesn't return control of the tty to the shell, because tty handling is done separately from stopping the process. That can be future work, if people feel that it is a necessary addition. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Paul Dagnelie <pcd@delphix.com> Issue #810 Issue #10843 Closes #11801	2021-09-22 15:19:08 -07:00
Brian Behlendorf	36d50b60d8	Linux 5.14 compat: blk_alloc_disk() In Linux 5.14, blk_alloc_queue is no longer exported, and its usage has been superseded by blk_alloc_disk, which returns a gendisk struct from which we can still retrieve the struct request_queue* that is needed in the one place where it is used. This also replaces the call to alloc_disk(minors), and minors is now set via struct member assignment. Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com> Reviewed-by: Olaf Faaland <faaland1@llnl.gov> Reviewed-by: Coleman Kane <ckane@colemankane.org> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #12362 Closes #12409	2021-09-22 15:19:08 -07:00
Mark Johnston	0c4f86be74	Initialize dn_next_type[] in the dnode constructor It seems nothing ensures that this array is zeroed when a dnode is freshly allocated, so in principle it retains the values from the previous allocation. In practice it seems to be the case that the fields should end up zeroed, but we can zero the field anyway for consistency. This was found using KMSAN. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Mark Johnston <markj@FreeBSD.org> Closes #12383	2021-09-22 15:19:08 -07:00
Mark Johnston	88be308b2f	Zero pad bytes following TX_WRITE log data When logging a TX_WRITE record in the case where file data has to be copied from the DMU, we pad the log record size to a multiple of 8 bytes. In this case, any padding bytes should be zeroed, otherwise the contents of uninitialized memory are written to the ZIL. This was found using KMSAN. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Mark Johnston <markj@FreeBSD.org> Closes #12383	2021-09-22 15:19:08 -07:00
Mark Johnston	d6dc79eabc	Zero pad bytes when allocating a ZIL record When allocating a record, we round up the allocation size to a multiple of 8. In this case, any padding bytes should be zeroed, otherwise the contents of uninitialized memory are written to the ZIL. This was found using KMSAN. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Mark Johnston <markj@FreeBSD.org> Closes #12383	2021-09-22 15:19:08 -07:00
Mark Johnston	900a444107	Initialize all fields in zfs_log_xvattr() When logging TX_SETATTR, we could otherwise fail to initialize part of the corresponding ZIL record depending on which fields are present in the xvattr. Initialize the creation time and the AV scan timestamp to zero so that uninitialized bytes are not written to the ZIL. This was found using KMSAN. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Mark Johnston <markj@FreeBSD.org> Closes #12383	2021-09-22 15:19:08 -07:00
George Wilson	1885e5ebab	file reference counts can get corrupted Callers of zfs_file_get and zfs_file_put can corrupt the reference counts for the file structure resulting in a panic or a soft lockup. When zfs send/recv runs, it will add a reference count to the open file, and begin to send or recv the stream. If the file descriptor is closed, then when dmu_recv_stream() or dmu_send() return we will call zfs_file_put to remove the reference we placed on the file structure. Unfortunately, because zfs_file_put() uses the file descriptor to lookup the file structure, it may end up finding that the file descriptor table no longer contains the file struct, thus leaking the file structure. Or it might end up finding a file descriptor for a different file and blindly updating its reference counts. Other failure modes probably exists. This change reworks the zfs_file_[get\|put] interface to not rely on the file descriptor but instead pass the zfs_file_t pointer around. Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Mark Maybee <mark.maybee@delphix.com> Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Co-authored-by: Allan Jude <allan@klarasystems.com> Signed-off-by: George Wilson <gwilson@delphix.com> External-issue: DLPX-76119 Closes #12299	2021-09-22 15:19:08 -07:00
Antonio Russo	1ad8fcc054	Revert Consolidate arc_buf allocation checks This reverts commit `13fac09868`. Per the discussion in #11531, the reverted commit---which intended only to be a cleanup commit---introduced a subtle, unintended change in behavior. Care was taken to partially revert and then reapply `10b3c7f5e4` which would otherwise have caused a conflict. These changes were squashed in to this commit. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Suggested-by: @chrisrd Suggested-by: robn@despairlabs.com Signed-off-by: Antonio Russo <aerusso@aerusso.net> Closes #11531 Closes #12227	2021-09-22 15:19:08 -07:00
Rich Ercolani	05c96b438a	Fix unfortunate NULL in spa_update_dspace After `1325434b`, we can in certain circumstances end up calling spa_update_dspace with vd->vdev_mg NULL, which ends poorly during vdev removal. So let's not do that further space adjustment when we can't. Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Rich Ercolani <rincebrain@gmail.com> Closes #12380 Closes #12428	2021-09-22 15:19:08 -07:00
Rich Ercolani	ccf6d0a59b	Tinker with slop space accounting with dedup * Tinker with slop space accounting with dedup Do not include the deduplicated space usage in the slop space reservation, it leads to surprising outcomes. * Update spa_dedup_dspace sometimes Sometimes, we get into spa_get_slop_space() with spa_dedup_dspace=~0ULL, AKA "unset", while spa_dspace is correctly set. So call the code to update it before we use it if we hit that case. Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: Mark Maybee <mark.maybee@delphix.com> Signed-off-by: Rich Ercolani <rincebrain@gmail.com> Closes #12271	2021-09-22 15:19:08 -07:00
Prakash Surya	61ae6c99f7	Add upper bound for slop space calculation This change modifies the behavior of how we determine how much slop space to use in the pool, such that now it has an upper limit. The default upper limit is 128G, but is configurable via a tunable. (Backporting note: Snipped out the embedded_log portion of the changes.) Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Prakash Surya <prakash.surya@delphix.com> Closes #11023	2021-09-22 15:19:08 -07:00