OpenZFS restructuring - move platform specific sources

Move platform specific Linux source under module/os/linux/ and update the build system accordingly. Additional code restructuring will follow to make the common code fully portable. Reviewed-by: Jorgen Lundman <lundman@lundman.net> Reviewed-by: Igor Kozhukhov <igor@dilos.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Matthew Macy <mmacy@FreeBSD.org> Closes #9206
2026-05-22 18:40:43 +03:00 · 2019-09-06 11:26:26 -07:00
parent 870e7a52c1
commit bced7e3aaa
62 changed files with 167 additions and 87 deletions
@@ -0,0 +1,34 @@
+#
+# Linux specific sources included from module/zfs/Makefile.in
+#
+
+# Suppress unused-value warnings in sparc64 architecture headers
+ifeq ($(target_cpu),sparc64)
+ccflags-y += -Wno-unused-value
+endif
+
+# Add the Linux specific ZFS source directory to the include search path
+ccflags-y += -I@abs_top_srcdir@/module/os/linux/zfs
+
+# Linux specific objects appended to the $(MODULE) object list
+$(MODULE)-objs += ../os/linux/zfs/abd.o
+$(MODULE)-objs += ../os/linux/zfs/policy.o
+$(MODULE)-objs += ../os/linux/zfs/qat.o
+$(MODULE)-objs += ../os/linux/zfs/qat_compress.o
+$(MODULE)-objs += ../os/linux/zfs/qat_crypt.o
+$(MODULE)-objs += ../os/linux/zfs/spa_stats.o
+$(MODULE)-objs += ../os/linux/zfs/vdev_disk.o
+$(MODULE)-objs += ../os/linux/zfs/vdev_file.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_acl.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_ctldir.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_debug.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_dir.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_vnops.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_znode.o
+$(MODULE)-objs += ../os/linux/zfs/zio_crypt.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_export.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_file.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_inode.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_super.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_xattr.o
@@ -0,0 +1,355 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright (C) 2016 Lawrence Livermore National Security, LLC.
+ *
+ * For Linux the vast majority of this enforcement is already handled via
+ * the standard Linux VFS permission checks.  However certain administrative
+ * commands which bypass the standard mechanisms may need to make use of
+ * this functionality.
+ */
+
+#include <sys/policy.h>
+#include <linux/security.h>
+#include <linux/vfs_compat.h>
+
+/*
+ * Linux only provides an interface to check the credentials of the
+ * *current* process, so the passed credentials cannot be verified
+ * directly.  The capability test is therefore run only when the passed
+ * credentials are the current process credentials or kcred.  For any
+ * other credentials this function fails and returns the passed err.
+ *
+ * When a user namespace is supplied (and the kernel supports it) the
+ * capability is tested relative to that namespace.  Returns 0 when the
+ * capability is held, err otherwise.
+ */
+static int
+priv_policy_ns(const cred_t *cr, int capability, boolean_t all, int err,
+    struct user_namespace *ns)
+{
+	int allowed;
+
+	ASSERT3S(all, ==, B_FALSE);
+
+	if (cr != CRED() && (cr != kcred))
+		return (err);
+
+#if defined(CONFIG_USER_NS) && defined(HAVE_NS_CAPABLE)
+	if (ns)
+		allowed = ns_capable(ns, capability) ? 1 : 0;
+	else
+		allowed = capable(capability) ? 1 : 0;
+#else
+	allowed = capable(capability) ? 1 : 0;
+#endif
+
+	return (allowed ? 0 : err);
+}
+
+/*
+ * Capability check that is not tied to a user namespace; forwards to
+ * priv_policy_ns() with a NULL namespace.
+ */
+static int
+priv_policy(const cred_t *cr, int capability, boolean_t all, int err)
+{
+	struct user_namespace *no_ns = NULL;
+
+	return (priv_policy_ns(cr, capability, all, err, no_ns));
+}
+
+/*
+ * Capability check relative to the user namespace of the passed
+ * credentials, when the kernel supports it.
+ */
+static int
+priv_policy_user(const cred_t *cr, int capability, boolean_t all, int err)
+{
+	struct user_namespace *ns = NULL;
+
+	/*
+	 * Every priv_policy_user check is preceded by a
+	 * kuid/kgid_has_mapping() check.  When those cannot be performed
+	 * ns_capable() must not be used either, because there is no way to
+	 * know whether the affected files are valid in our namespace; the
+	 * namespace is left NULL in that case.  kuid_has_mapping() came
+	 * after cred->user_ns, so HAVE_CRED_USER_NS does not need to be
+	 * re-checked here.
+	 */
+#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
+	ns = cr->user_ns;
+#endif
+
+	return (priv_policy_ns(cr, capability, all, err, ns));
+}
+
+/*
+ * Policy check for operations that are either client-only or used by
+ * both clients and servers.  Returns 0 when CAP_SYS_ADMIN is held,
+ * EPERM otherwise.
+ */
+int
+secpolicy_nfs(const cred_t *cr)
+{
+	int error;
+
+	error = priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM);
+
+	return (error);
+}
+
+/*
+ * Catch-all check for system configuration.  Returns 0 when
+ * CAP_SYS_ADMIN is held, EPERM otherwise; checkonly is not consulted.
+ */
+int
+secpolicy_sys_config(const cred_t *cr, boolean_t checkonly)
+{
+	int error;
+
+	error = priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM);
+
+	return (error);
+}
+
+/*
+ * Like secpolicy_vnode_access() but we get the actual wanted mode and the
+ * current mode of the file, not the missing bits.
+ *
+ * Enforced in the Linux VFS, so no check is performed here: the arguments
+ * are not examined and the function always returns 0.
+ */
+int
+secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner,
+    mode_t curmode, mode_t wantmode)
+{
+	return (0);
+}
+
+/*
+ * Special routine for ZFS: determine whether any privilege in effect
+ * allows some form of access to the file.  There is no reason to audit
+ * or record this.  More work is needed to do the "KPLD" stuff.
+ *
+ * Returns 0 when access is possible, EPERM otherwise.
+ */
+int
+secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner)
+{
+	/* The file's owner, or an owner-capable caller, always qualifies. */
+	if (crgetfsuid(cr) == owner || zpl_inode_owner_or_capable(ip))
+		return (0);
+
+#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
+	if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+		return (EPERM);
+#endif
+
+	/* Either DAC capability is sufficient; test them in order. */
+	if (priv_policy_user(cr, CAP_DAC_OVERRIDE, B_FALSE, EPERM) != 0 &&
+	    priv_policy_user(cr, CAP_DAC_READ_SEARCH, B_FALSE, EPERM) != 0)
+		return (EPERM);
+
+	return (0);
+}
+
+/*
+ * Determine whether the subject may change the owner of a file owned by
+ * "owner".  The owner itself is always allowed; anyone else needs
+ * CAP_FOWNER.  Returns 0 when allowed, EPERM otherwise.
+ */
+int
+secpolicy_vnode_chown(const cred_t *cr, uid_t owner)
+{
+	if (crgetfsuid(cr) != owner) {
+#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
+		if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+			return (EPERM);
+#endif
+		return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM));
+	}
+
+	return (0);
+}
+
+/*
+ * Determine whether the subject may change the group ownership of a
+ * file.  Returns 0 when CAP_SETGID is held, EPERM otherwise.
+ */
+int
+secpolicy_vnode_create_gid(const cred_t *cr)
+{
+	int error;
+
+	error = priv_policy(cr, CAP_SETGID, B_FALSE, EPERM);
+
+	return (error);
+}
+
+/*
+ * Policy deciding whether an entry may be removed from a directory
+ * regardless of permission bits.  Returns 0 when CAP_FOWNER is held,
+ * EPERM otherwise.
+ */
+int
+secpolicy_vnode_remove(const cred_t *cr)
+{
+	int error;
+
+	error = priv_policy(cr, CAP_FOWNER, B_FALSE, EPERM);
+
+	return (error);
+}
+
+/*
+ * Determine whether the subject may modify the mode of a file owned by
+ * "owner".  allzone privilege needed when modifying root owned object.
+ * The owner itself is always allowed; anyone else needs CAP_FOWNER.
+ * Returns 0 when allowed, EPERM otherwise.
+ */
+int
+secpolicy_vnode_setdac(const cred_t *cr, uid_t owner)
+{
+	if (crgetfsuid(cr) != owner) {
+#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
+		if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+			return (EPERM);
+#endif
+		return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM));
+	}
+
+	return (0);
+}
+
+/*
+ * May the set-uid/set-gid bits be retained when changing ownership or
+ * when writing to a file?  "issuid" should be true when set-uid; only in
+ * that case root ownership is checked (setgid is assumed).
+ *
+ * Enforced in the Linux VFS.  This implementation tests CAP_FSETID and
+ * does not consult issuidroot; it returns 0 when the capability is held
+ * and EPERM otherwise.
+ */
+int
+secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot)
+{
+	int error;
+
+	error = priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM);
+
+	return (error);
+}
+
+/*
+ * Determine whether the subject may set the setgid flag on a file whose
+ * group is "gid".  Members of the group (by fsgid or supplementary
+ * membership) are allowed; anyone else needs CAP_FSETID.  Returns 0 when
+ * allowed, EPERM otherwise.
+ */
+int
+secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid)
+{
+#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
+	if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid)))
+		return (EPERM);
+#endif
+	if (crgetfsgid(cr) == gid || groupmember(gid, cr))
+		return (0);
+
+	return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM));
+}
+
+/*
+ * Determine whether the subject may inject faults through the ZFS fault
+ * injection framework.  Requires all privileges, tested here as
+ * CAP_SYS_ADMIN.  Returns 0 when permitted, EACCES otherwise.
+ */
+int
+secpolicy_zinject(const cred_t *cr)
+{
+	int error;
+
+	error = priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES);
+
+	return (error);
+}
+
+/*
+ * Determine whether the subject may manipulate ZFS datasets (not pools).
+ * Equivalent to the SYS_MOUNT privilege, tested here as CAP_SYS_ADMIN.
+ * Returns 0 when permitted, EACCES otherwise.
+ */
+int
+secpolicy_zfs(const cred_t *cr)
+{
+	int error;
+
+	error = priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES);
+
+	return (error);
+}
+
+/*
+ * Strip the set-uid/set-gid bits from the attributes in vap when the
+ * subject is not allowed to retain them.  When the bits are cleared
+ * AT_MODE is added to va_mask so the mode change is applied.
+ */
+void
+secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
+{
+	boolean_t issuidroot;
+
+	/* Nothing to clear when neither bit is set. */
+	if ((vap->va_mode & (S_ISUID | S_ISGID)) == 0)
+		return;
+
+	/* set-uid requested on a file whose owner is being set to root */
+	issuidroot = (vap->va_mode & S_ISUID) != 0 &&
+	    (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0;
+
+	if (secpolicy_vnode_setid_retain(cr, issuidroot) != 0) {
+		vap->va_mask |= AT_MODE;
+		vap->va_mode &= ~(S_ISUID|S_ISGID);
+	}
+}
+
+/*
+ * Determine whether the subject may set the setid flags on a file owned
+ * by "owner".  The owner itself is always allowed; anyone else needs
+ * CAP_FSETID.  Returns 0 when allowed, EPERM otherwise.
+ */
+static int
+secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner)
+{
+	if (crgetfsuid(cr) != owner) {
+#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
+		if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+			return (EPERM);
+#endif
+		return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM));
+	}
+
+	return (0);
+}
+
+/*
+ * Determine that subject can make a file a "sticky".
+ *
+ * Enforced in the Linux VFS, so no check is performed here: cr is not
+ * examined and the function always returns 0.
+ */
+static int
+secpolicy_vnode_stky_modify(const cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * Validate the set-uid, sticky and set-gid bits requested in vap against
+ * the subject's privileges, using the file's existing attributes in
+ * ovap.  A disallowed set-uid request fails with the policy error; a
+ * disallowed sticky or set-gid bit is silently removed from va_mode.
+ * Returns 0 unless the set-uid check fails.
+ */
+int
+secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap,
+    const vattr_t *ovap, cred_t *cr)
+{
+	if ((vap->va_mode & S_ISUID) != 0) {
+		int error = secpolicy_vnode_setid_modify(cr, ovap->va_uid);
+
+		if (error != 0)
+			return (error);
+	}
+
+	/*
+	 * Setting the sticky bit on a non-directory requires privilege;
+	 * drop the bit when the subject lacks it.
+	 */
+	if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0) {
+		if (secpolicy_vnode_stky_modify(cr) != 0)
+			vap->va_mode &= ~S_ISVTX;
+	}
+
+	/*
+	 * Setting the group-id bit requires privilege; drop the bit when
+	 * the subject lacks it.
+	 */
+	if ((vap->va_mode & S_ISGID) != 0) {
+		if (secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0)
+			vap->va_mode &= ~S_ISGID;
+	}
+
+	return (0);
+}
+
+/*
+ * Check privileges for setting xvattr attributes.  The decision is the
+ * same as for changing the file's owner; xvap and vtype are not
+ * consulted.
+ */
+int
+secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype)
+{
+	int error;
+
+	error = secpolicy_vnode_chown(cr, owner);
+
+	return (error);
+}
+
+/*
+ * Check privileges for setattr attributes.
+ *
+ * Enforced in the Linux VFS, so no check is performed here: none of the
+ * arguments (including the unlocked_access callback) are used and the
+ * function always returns 0.
+ */
+int
+secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap,
+    const struct vattr *ovap, int flags,
+    int unlocked_access(void *, int, cred_t *), void *node)
+{
+	return (0);
+}
+
+/*
+ * Check privileges for links.
+ *
+ * Enforced in the Linux VFS, so no check is performed here: cr is not
+ * examined and the function always returns 0.
+ */
+int
+secpolicy_basic_link(const cred_t *cr)
+{
+	return (0);
+}
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <sys/zfs_context.h>
+#include "qat.h"
+
+/*
+ * Named 64-bit kstat counters covering QAT compression, decompression,
+ * encryption, decryption and checksum activity.  qat_init() installs
+ * this structure as the data of the "qat" kstat in the "zfs" module.
+ */
+qat_stats_t qat_stats = {
+	{ "comp_requests",			KSTAT_DATA_UINT64 },
+	{ "comp_total_in_bytes",		KSTAT_DATA_UINT64 },
+	{ "comp_total_out_bytes",		KSTAT_DATA_UINT64 },
+	{ "decomp_requests",			KSTAT_DATA_UINT64 },
+	{ "decomp_total_in_bytes",		KSTAT_DATA_UINT64 },
+	{ "decomp_total_out_bytes",		KSTAT_DATA_UINT64 },
+	{ "dc_fails",				KSTAT_DATA_UINT64 },
+	{ "encrypt_requests",			KSTAT_DATA_UINT64 },
+	{ "encrypt_total_in_bytes",		KSTAT_DATA_UINT64 },
+	{ "encrypt_total_out_bytes",		KSTAT_DATA_UINT64 },
+	{ "decrypt_requests",			KSTAT_DATA_UINT64 },
+	{ "decrypt_total_in_bytes",		KSTAT_DATA_UINT64 },
+	{ "decrypt_total_out_bytes",		KSTAT_DATA_UINT64 },
+	{ "crypt_fails",			KSTAT_DATA_UINT64 },
+	{ "cksum_requests",			KSTAT_DATA_UINT64 },
+	{ "cksum_total_in_bytes",		KSTAT_DATA_UINT64 },
+	{ "cksum_fails",			KSTAT_DATA_UINT64 },
+};
+
+/* kstat created by qat_init(); NULL while no kstat is installed */
+static kstat_t *qat_ksp = NULL;
+
+/*
+ * Allocate size_bytes of memory with kmalloc(GFP_KERNEL) and store the
+ * result in *pp_mem_addr (NULL on failure).  Returns CPA_STATUS_SUCCESS,
+ * or CPA_STATUS_RESOURCE when the allocation fails.
+ */
+CpaStatus
+qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes)
+{
+	void *mem = kmalloc(size_bytes, GFP_KERNEL);
+
+	*pp_mem_addr = mem;
+
+	return (mem != NULL ? CPA_STATUS_SUCCESS : CPA_STATUS_RESOURCE);
+}
+
+/*
+ * Free the memory referenced by *pp_mem_addr, if any, and reset the
+ * caller's pointer to NULL so it cannot be freed twice.
+ */
+void
+qat_mem_free_contig(void **pp_mem_addr)
+{
+	void *mem = *pp_mem_addr;
+
+	if (mem == NULL)
+		return;
+
+	kfree(mem);
+	*pp_mem_addr = NULL;
+}
+
+/*
+ * Create the "qat" kstat and initialize the QAT compression and crypto
+ * instances.  Always returns 0: when an engine fails to initialize the
+ * matching disable flag is set instead of failing.
+ */
+int
+qat_init(void)
+{
+	kstat_t *ksp;
+
+	ksp = kstat_create("zfs", 0, "qat", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (qat_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+	qat_ksp = ksp;
+	if (ksp != NULL) {
+		ksp->ks_data = &qat_stats;
+		kstat_install(ksp);
+	}
+
+	/*
+	 * A failed init only sets the disable flag.  QAT can be turned on
+	 * again after the zfs module is loaded, e.g.:
+	 * echo 0 > /sys/module/zfs/parameters/zfs_qat_compress_disable
+	 */
+	if (qat_dc_init() != 0)
+		zfs_qat_compress_disable = 1;
+
+	/* checksum and encryption share the crypto instances */
+	if (qat_cy_init() != 0) {
+		zfs_qat_checksum_disable = 1;
+		zfs_qat_encrypt_disable = 1;
+	}
+
+	return (0);
+}
+
+/*
+ * Remove the "qat" kstat, if installed, then tear down the crypto and
+ * compression instances.
+ */
+void
+qat_fini(void)
+{
+	kstat_t *ksp = qat_ksp;
+
+	if (ksp != NULL) {
+		kstat_delete(ksp);
+		qat_ksp = NULL;
+	}
+
+	qat_cy_fini();
+	qat_dc_fini();
+}
+
+#endif
@@ -0,0 +1,574 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/completion.h>
+#include <sys/zfs_context.h>
+#include <sys/byteorder.h>
+#include <sys/zio.h>
+#include "qat.h"
+
+/*
+ * Max instances in a QAT device, each instance is a channel to submit
+ * jobs to QAT hardware, this is only for pre-allocating instance and
+ * session arrays; the actual number of instances are defined in the
+ * QAT driver's configuration file.
+ */
+#define	QAT_DC_MAX_INSTANCES	48
+
+/*
+ * ZLIB head and foot size
+ */
+#define	ZLIB_HEAD_SZ		2
+#define	ZLIB_FOOT_SZ		4
+
+/* per-instance handles, sessions and intermediate buffer lists */
+static CpaInstanceHandle dc_inst_handles[QAT_DC_MAX_INSTANCES];
+static CpaDcSessionHandle session_handles[QAT_DC_MAX_INSTANCES];
+static CpaBufferList **buffer_array[QAT_DC_MAX_INSTANCES];
+/* instances in use; set by qat_dc_init(), capped at QAT_DC_MAX_INSTANCES */
+static Cpa16U num_inst = 0;
+/* request counter; incremented atomically and reduced modulo num_inst */
+static Cpa32U inst_num = 0;
+/* B_TRUE once qat_dc_init() has completed; reset by qat_dc_clean() */
+static boolean_t qat_dc_init_done = B_FALSE;
+/* module parameter; non-zero disables QAT compression */
+int zfs_qat_compress_disable = 0;
+
+/*
+ * Decide whether a buffer of s_len bytes should be handed to QAT:
+ * compression must not be disabled, the instances must be initialized
+ * and the length must lie within [QAT_MIN_BUF_SIZE, QAT_MAX_BUF_SIZE].
+ */
+boolean_t
+qat_dc_use_accel(size_t s_len)
+{
+	if (zfs_qat_compress_disable || !qat_dc_init_done)
+		return (B_FALSE);
+
+	return (s_len >= QAT_MIN_BUF_SIZE && s_len <= QAT_MAX_BUF_SIZE);
+}
+
+/*
+ * Callback registered with cpaDcInitSession().  p_callback is the
+ * struct completion supplied with the request; it is signalled when
+ * non-NULL.  The status argument is not used.
+ */
+static void
+qat_dc_callback(void *p_callback, CpaStatus status)
+{
+	struct completion *done = p_callback;
+
+	if (done == NULL)
+		return;
+
+	complete(done);
+}
+
+/*
+ * Stop every compression instance and release its session memory and
+ * intermediate buffers, then mark QAT compression as uninitialized.
+ *
+ * The per-instance array of buffer list pointers is released as well and
+ * buffer_array[i] is reset to NULL, so that nothing is leaked and a later
+ * qat_dc_init() / qat_dc_clean() cycle never sees stale pointers.
+ */
+static void
+qat_dc_clean(void)
+{
+	Cpa16U buff_num = 0;
+	Cpa16U num_inter_buff_lists = 0;
+
+	for (Cpa16U i = 0; i < num_inst; i++) {
+		cpaDcStopInstance(dc_inst_handles[i]);
+		QAT_PHYS_CONTIG_FREE(session_handles[i]);
+		/* free intermediate buffers  */
+		if (buffer_array[i] != NULL) {
+			cpaDcGetNumIntermediateBuffers(
+			    dc_inst_handles[i], &num_inter_buff_lists);
+			for (buff_num = 0; buff_num < num_inter_buff_lists;
+			    buff_num++) {
+				CpaBufferList *buffer_inter =
+				    buffer_array[i][buff_num];
+
+				/* skip empty slots instead of using NULL */
+				if (buffer_inter == NULL)
+					continue;
+
+				if (buffer_inter->pBuffers) {
+					QAT_PHYS_CONTIG_FREE(
+					    buffer_inter->pBuffers->pData);
+					QAT_PHYS_CONTIG_FREE(
+					    buffer_inter->pBuffers);
+				}
+				QAT_PHYS_CONTIG_FREE(
+				    buffer_inter->pPrivateMetaData);
+				QAT_PHYS_CONTIG_FREE(buffer_inter);
+			}
+
+			/*
+			 * Release the pointer array itself; its entries
+			 * are dangling from this point on.
+			 */
+			QAT_PHYS_CONTIG_FREE(buffer_array[i]);
+			buffer_array[i] = NULL;
+		}
+	}
+
+	num_inst = 0;
+	qat_dc_init_done = B_FALSE;
+}
+
+/*
+ * Discover the QAT compression instances, allocate their intermediate
+ * buffers, start each instance and create one stateless deflate session
+ * per instance.
+ *
+ * Returns 0 on success, which includes the cases where initialization
+ * was already done or no compression instances are configured, and -1
+ * on failure.  Failures inside the per-instance loop release everything
+ * through qat_dc_clean().
+ */
+int
+qat_dc_init(void)
+{
+	CpaStatus status = CPA_STATUS_SUCCESS;
+	Cpa32U sess_size = 0;
+	Cpa32U ctx_size = 0;
+	Cpa16U num_inter_buff_lists = 0;
+	Cpa16U buff_num = 0;
+	Cpa32U buff_meta_size = 0;
+	CpaDcSessionSetupData sd = {0};
+
+	if (qat_dc_init_done)
+		return (0);
+
+	status = cpaDcGetNumInstances(&num_inst);
+	if (status != CPA_STATUS_SUCCESS)
+		return (-1);
+
+	/* if the user has configured no QAT compression units just return */
+	if (num_inst == 0)
+		return (0);
+
+	if (num_inst > QAT_DC_MAX_INSTANCES)
+		num_inst = QAT_DC_MAX_INSTANCES;
+
+	status = cpaDcGetInstances(num_inst, &dc_inst_handles[0]);
+	if (status != CPA_STATUS_SUCCESS)
+		return (-1);
+
+	for (Cpa16U i = 0; i < num_inst; i++) {
+		cpaDcSetAddressTranslation(dc_inst_handles[i],
+		    (void*)virt_to_phys);
+
+		status = cpaDcBufferListGetMetaSize(dc_inst_handles[i],
+		    1, &buff_meta_size);
+
+		if (status == CPA_STATUS_SUCCESS)
+			status = cpaDcGetNumIntermediateBuffers(
+			    dc_inst_handles[i], &num_inter_buff_lists);
+
+		if (status == CPA_STATUS_SUCCESS && num_inter_buff_lists != 0) {
+			status = QAT_PHYS_CONTIG_ALLOC(&buffer_array[i],
+			    num_inter_buff_lists *
+			    sizeof (CpaBufferList *));
+
+			/*
+			 * The allocation is not zeroed; clear every slot so
+			 * that cleanup never sees an indeterminate pointer.
+			 */
+			if (status == CPA_STATUS_SUCCESS) {
+				for (buff_num = 0;
+				    buff_num < num_inter_buff_lists;
+				    buff_num++)
+					buffer_array[i][buff_num] = NULL;
+			}
+		}
+
+		for (buff_num = 0; status == CPA_STATUS_SUCCESS &&
+		    buff_num < num_inter_buff_lists; buff_num++) {
+			status = QAT_PHYS_CONTIG_ALLOC(
+			    &buffer_array[i][buff_num],
+			    sizeof (CpaBufferList));
+			if (status != CPA_STATUS_SUCCESS)
+				break;
+
+			/* known state for the members cleanup inspects */
+			buffer_array[i][buff_num]->numBuffers = 0;
+			buffer_array[i][buff_num]->pBuffers = NULL;
+			buffer_array[i][buff_num]->pPrivateMetaData = NULL;
+
+			status = QAT_PHYS_CONTIG_ALLOC(
+			    &buffer_array[i][buff_num]->
+			    pPrivateMetaData,
+			    buff_meta_size);
+			if (status != CPA_STATUS_SUCCESS)
+				break;
+
+			status = QAT_PHYS_CONTIG_ALLOC(
+			    &buffer_array[i][buff_num]->pBuffers,
+			    sizeof (CpaFlatBuffer));
+			if (status != CPA_STATUS_SUCCESS)
+				break;
+
+			/*
+			 *  implementation requires an intermediate
+			 *  buffer approximately twice the size of
+			 *  output buffer, which is 2x max buffer
+			 *  size here.
+			 */
+			status = QAT_PHYS_CONTIG_ALLOC(
+			    &buffer_array[i][buff_num]->pBuffers->
+			    pData, 2 * QAT_MAX_BUF_SIZE);
+			if (status != CPA_STATUS_SUCCESS)
+				break;
+
+			buffer_array[i][buff_num]->numBuffers = 1;
+			buffer_array[i][buff_num]->pBuffers->
+			    dataLenInBytes = 2 * QAT_MAX_BUF_SIZE;
+		}
+
+		/*
+		 * Do not start the instance with a partially built buffer
+		 * array: a failure from the queries or allocations above
+		 * would otherwise be overwritten by the result of
+		 * cpaDcStartInstance().
+		 */
+		if (status != CPA_STATUS_SUCCESS)
+			goto fail;
+
+		status = cpaDcStartInstance(dc_inst_handles[i],
+		    num_inter_buff_lists, buffer_array[i]);
+		if (status != CPA_STATUS_SUCCESS)
+			goto fail;
+
+		sd.compLevel = CPA_DC_L1;
+		sd.compType = CPA_DC_DEFLATE;
+		sd.huffType = CPA_DC_HT_FULL_DYNAMIC;
+		sd.sessDirection = CPA_DC_DIR_COMBINED;
+		sd.sessState = CPA_DC_STATELESS;
+		sd.deflateWindowSize = 7;
+		sd.checksum = CPA_DC_ADLER32;
+		status = cpaDcGetSessionSize(dc_inst_handles[i],
+		    &sd, &sess_size, &ctx_size);
+		if (status != CPA_STATUS_SUCCESS)
+			goto fail;
+
+		QAT_PHYS_CONTIG_ALLOC(&session_handles[i], sess_size);
+		if (session_handles[i] == NULL)
+			goto fail;
+
+		status = cpaDcInitSession(dc_inst_handles[i],
+		    session_handles[i],
+		    &sd, NULL, qat_dc_callback);
+		if (status != CPA_STATUS_SUCCESS)
+			goto fail;
+	}
+
+	qat_dc_init_done = B_TRUE;
+	return (0);
+fail:
+	qat_dc_clean();
+	return (-1);
+}
+
+/*
+ * Tear down QAT compression; does nothing unless qat_dc_init() completed.
+ */
+void
+qat_dc_fini(void)
+{
+	if (qat_dc_init_done)
+		qat_dc_clean();
+}
+
+/*
+ * Run one compression or decompression request on a QAT instance.
+ *
+ * The source and destination buffers are split at page boundaries,
+ * kmap()ed and described to the hardware as lists of flat buffers.  On
+ * success the number of bytes produced is stored in *c_len.
+ *
+ * The "add" parameter is an additional buffer which is passed
+ * to QAT as a scratch buffer alongside the destination buffer
+ * in case the "compressed" data ends up being larger than the
+ * original source data. This is necessary to prevent QAT from
+ * generating buffer overflow warnings for incompressible data.
+ *
+ * Returns CPA_STATUS_SUCCESS on success, CPA_STATUS_INCOMPRESSIBLE when
+ * the compressed stream plus its zlib header and footer does not fit in
+ * dst, or another CPA status on failure (including allocation failure).
+ */
+static int
+qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len,
+    char *dst, int dst_len, char *add, int add_len, size_t *c_len)
+{
+	CpaInstanceHandle dc_inst_handle;
+	CpaDcSessionHandle session_handle;
+	CpaBufferList *buf_list_src = NULL;
+	CpaBufferList *buf_list_dst = NULL;
+	CpaFlatBuffer *flat_buf_src = NULL;
+	CpaFlatBuffer *flat_buf_dst = NULL;
+	Cpa8U *buffer_meta_src = NULL;
+	Cpa8U *buffer_meta_dst = NULL;
+	Cpa32U buffer_meta_size = 0;
+	CpaDcRqResults dc_results;
+	CpaStatus status = CPA_STATUS_SUCCESS;
+	Cpa32U hdr_sz = 0;
+	Cpa32U compressed_sz;
+	Cpa32U num_src_buf = (src_len >> PAGE_SHIFT) + 2;
+	Cpa32U num_dst_buf = (dst_len >> PAGE_SHIFT) + 2;
+	Cpa32U num_add_buf = (add_len >> PAGE_SHIFT) + 2;
+	Cpa32U bytes_left;
+	/* number of pages actually kmap()ed from src, dst and add */
+	Cpa32U src_pages = 0;
+	Cpa32U dst_pages = 0;
+	Cpa32U add_pages_cnt = 0;
+	Cpa32U adler32 = 0;
+	char *data;
+	struct page *page;
+	struct page **in_pages = NULL;
+	struct page **out_pages = NULL;
+	struct page **add_pages = NULL;
+	Cpa32U page_off = 0;
+	struct completion complete;
+	Cpa32U page_num = 0;
+	Cpa16U i;
+
+	/*
+	 * We increment num_src_buf and num_dst_buf by 2 to allow
+	 * us to handle non page-aligned buffer addresses and buffers
+	 * whose sizes are not divisible by PAGE_SIZE.
+	 */
+	Cpa32U src_buffer_list_mem_size = sizeof (CpaBufferList) +
+	    (num_src_buf * sizeof (CpaFlatBuffer));
+	Cpa32U dst_buffer_list_mem_size = sizeof (CpaBufferList) +
+	    ((num_dst_buf + num_add_buf) * sizeof (CpaFlatBuffer));
+
+	/*
+	 * Every allocation result is stored in status so that a failed
+	 * allocation is reported to the caller rather than returning the
+	 * initial CPA_STATUS_SUCCESS with *c_len left unset.
+	 */
+	status = QAT_PHYS_CONTIG_ALLOC(&in_pages,
+	    num_src_buf * sizeof (struct page *));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	status = QAT_PHYS_CONTIG_ALLOC(&out_pages,
+	    num_dst_buf * sizeof (struct page *));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	status = QAT_PHYS_CONTIG_ALLOC(&add_pages,
+	    num_add_buf * sizeof (struct page *));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	/* pick the instance and session that will serve this request */
+	i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+	dc_inst_handle = dc_inst_handles[i];
+	session_handle = session_handles[i];
+
+	cpaDcBufferListGetMetaSize(dc_inst_handle, num_src_buf,
+	    &buffer_meta_size);
+	status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	cpaDcBufferListGetMetaSize(dc_inst_handle, num_dst_buf + num_add_buf,
+	    &buffer_meta_size);
+	status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	/* build source buffer list */
+	status = QAT_PHYS_CONTIG_ALLOC(&buf_list_src,
+	    src_buffer_list_mem_size);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	flat_buf_src = (CpaFlatBuffer *)(buf_list_src + 1);
+
+	buf_list_src->pBuffers = flat_buf_src; /* always point to first one */
+
+	/* build destination buffer list */
+	status = QAT_PHYS_CONTIG_ALLOC(&buf_list_dst,
+	    dst_buffer_list_mem_size);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1);
+
+	buf_list_dst->pBuffers = flat_buf_dst; /* always point to first one */
+
+	buf_list_src->numBuffers = 0;
+	buf_list_src->pPrivateMetaData = buffer_meta_src;
+	bytes_left = src_len;
+	data = src;
+	page_num = 0;
+	while (bytes_left > 0) {
+		page_off = ((long)data & ~PAGE_MASK);
+		page = qat_mem_to_page(data);
+		in_pages[page_num] = page;
+		flat_buf_src->pData = kmap(page) + page_off;
+		flat_buf_src->dataLenInBytes =
+		    min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+		bytes_left -= flat_buf_src->dataLenInBytes;
+		data += flat_buf_src->dataLenInBytes;
+		flat_buf_src++;
+		buf_list_src->numBuffers++;
+		page_num++;
+		src_pages++;
+	}
+
+	buf_list_dst->numBuffers = 0;
+	buf_list_dst->pPrivateMetaData = buffer_meta_dst;
+	bytes_left = dst_len;
+	data = dst;
+	page_num = 0;
+	while (bytes_left > 0) {
+		page_off = ((long)data & ~PAGE_MASK);
+		page = qat_mem_to_page(data);
+		flat_buf_dst->pData = kmap(page) + page_off;
+		out_pages[page_num] = page;
+		flat_buf_dst->dataLenInBytes =
+		    min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+		bytes_left -= flat_buf_dst->dataLenInBytes;
+		data += flat_buf_dst->dataLenInBytes;
+		flat_buf_dst++;
+		buf_list_dst->numBuffers++;
+		page_num++;
+		dst_pages++;
+	}
+
+	/* map additional scratch pages into the destination buffer list */
+	bytes_left = add_len;
+	data = add;
+	page_num = 0;
+	while (bytes_left > 0) {
+		page_off = ((long)data & ~PAGE_MASK);
+		page = qat_mem_to_page(data);
+		flat_buf_dst->pData = kmap(page) + page_off;
+		add_pages[page_num] = page;
+		flat_buf_dst->dataLenInBytes =
+		    min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+		bytes_left -= flat_buf_dst->dataLenInBytes;
+		data += flat_buf_dst->dataLenInBytes;
+		flat_buf_dst++;
+		buf_list_dst->numBuffers++;
+		page_num++;
+		add_pages_cnt++;
+	}
+
+	init_completion(&complete);
+
+	if (dir == QAT_COMPRESS) {
+		QAT_STAT_BUMP(comp_requests);
+		QAT_STAT_INCR(comp_total_in_bytes, src_len);
+
+		cpaDcGenerateHeader(session_handle,
+		    buf_list_dst->pBuffers, &hdr_sz);
+		buf_list_dst->pBuffers->pData += hdr_sz;
+		buf_list_dst->pBuffers->dataLenInBytes -= hdr_sz;
+		status = cpaDcCompressData(
+		    dc_inst_handle, session_handle,
+		    buf_list_src, buf_list_dst,
+		    &dc_results, CPA_DC_FLUSH_FINAL,
+		    &complete);
+		if (status != CPA_STATUS_SUCCESS) {
+			goto fail;
+		}
+
+		/*
+		 * Wait until the request completes.  The wait must not be
+		 * abandoned: the completion lives in this stack frame and
+		 * the buffers are unmapped and freed on return, so leaving
+		 * early would let the callback and the hardware touch
+		 * memory that is no longer valid.  An interruptible wait
+		 * additionally returns a negative value on a signal,
+		 * which must not be mistaken for completion.
+		 */
+		wait_for_completion(&complete);
+
+		if (dc_results.status != CPA_STATUS_SUCCESS) {
+			status = CPA_STATUS_FAIL;
+			goto fail;
+		}
+
+		compressed_sz = dc_results.produced;
+		if (compressed_sz + hdr_sz + ZLIB_FOOT_SZ > dst_len) {
+			status = CPA_STATUS_INCOMPRESSIBLE;
+			goto fail;
+		}
+
+		flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1);
+		/* move to the last page */
+		flat_buf_dst += (compressed_sz + hdr_sz) >> PAGE_SHIFT;
+
+		/* no space for gzip footer in the last page */
+		if (((compressed_sz + hdr_sz) % PAGE_SIZE)
+		    + ZLIB_FOOT_SZ > PAGE_SIZE) {
+			status = CPA_STATUS_INCOMPRESSIBLE;
+			goto fail;
+		}
+
+		/* jump to the end of the buffer and append footer */
+		flat_buf_dst->pData =
+		    (char *)((unsigned long)flat_buf_dst->pData & PAGE_MASK)
+		    + ((compressed_sz + hdr_sz) % PAGE_SIZE);
+		flat_buf_dst->dataLenInBytes = ZLIB_FOOT_SZ;
+
+		dc_results.produced = 0;
+		status = cpaDcGenerateFooter(session_handle,
+		    flat_buf_dst, &dc_results);
+		if (status != CPA_STATUS_SUCCESS)
+			goto fail;
+
+		*c_len = compressed_sz + dc_results.produced + hdr_sz;
+		QAT_STAT_INCR(comp_total_out_bytes, *c_len);
+	} else {
+		ASSERT3U(dir, ==, QAT_DECOMPRESS);
+		QAT_STAT_BUMP(decomp_requests);
+		QAT_STAT_INCR(decomp_total_in_bytes, src_len);
+
+		/* skip the zlib header; only the deflate stream is passed */
+		buf_list_src->pBuffers->pData += ZLIB_HEAD_SZ;
+		buf_list_src->pBuffers->dataLenInBytes -= ZLIB_HEAD_SZ;
+		status = cpaDcDecompressData(dc_inst_handle, session_handle,
+		    buf_list_src, buf_list_dst, &dc_results, CPA_DC_FLUSH_FINAL,
+		    &complete);
+
+		if (CPA_STATUS_SUCCESS != status) {
+			status = CPA_STATUS_FAIL;
+			goto fail;
+		}
+
+		/*
+		 * Wait until the request completes; see the compression
+		 * path above for why the wait cannot be abandoned.
+		 */
+		wait_for_completion(&complete);
+
+		if (dc_results.status != CPA_STATUS_SUCCESS) {
+			status = CPA_STATUS_FAIL;
+			goto fail;
+		}
+
+		/* verify adler checksum */
+		adler32 = *(Cpa32U *)(src + dc_results.consumed + ZLIB_HEAD_SZ);
+		if (adler32 != BSWAP_32(dc_results.checksum)) {
+			status = CPA_STATUS_FAIL;
+			goto fail;
+		}
+		*c_len = dc_results.produced;
+		QAT_STAT_INCR(decomp_total_out_bytes, *c_len);
+	}
+
+fail:
+	if (status != CPA_STATUS_SUCCESS && status != CPA_STATUS_INCOMPRESSIBLE)
+		QAT_STAT_BUMP(dc_fails);
+
+	/*
+	 * Unmap exactly the pages that were mapped.  The counts are kept in
+	 * locals because the buffer lists may not have been allocated, or
+	 * not yet initialized, when an early failure jumps here.
+	 */
+	if (in_pages) {
+		for (page_num = 0; page_num < src_pages; page_num++) {
+			kunmap(in_pages[page_num]);
+		}
+		QAT_PHYS_CONTIG_FREE(in_pages);
+	}
+
+	if (out_pages) {
+		for (page_num = 0; page_num < dst_pages; page_num++) {
+			kunmap(out_pages[page_num]);
+		}
+		QAT_PHYS_CONTIG_FREE(out_pages);
+	}
+
+	if (add_pages) {
+		for (page_num = 0; page_num < add_pages_cnt; page_num++) {
+			kunmap(add_pages[page_num]);
+		}
+		QAT_PHYS_CONTIG_FREE(add_pages);
+	}
+
+	QAT_PHYS_CONTIG_FREE(buffer_meta_src);
+	QAT_PHYS_CONTIG_FREE(buffer_meta_dst);
+	QAT_PHYS_CONTIG_FREE(buf_list_src);
+	QAT_PHYS_CONTIG_FREE(buf_list_dst);
+
+	return (status);
+}
+
+/*
+ * Entry point for QAT accelerated compression / decompression.
+ *
+ * For compression a scratch buffer of dst_len bytes is allocated and
+ * handed to the implementation alongside dst, then released again.
+ * Decompression uses no scratch buffer.  Returns the status produced by
+ * qat_compress_impl().
+ */
+int
+qat_compress(qat_compress_dir_t dir, char *src, int src_len,
+    char *dst, int dst_len, size_t *c_len)
+{
+	size_t scratch_len;
+	void *scratch;
+	int ret;
+
+	if (dir != QAT_COMPRESS) {
+		return (qat_compress_impl(dir, src, src_len, dst,
+		    dst_len, NULL, 0, c_len));
+	}
+
+	scratch_len = dst_len;
+	scratch = zio_data_buf_alloc(scratch_len);
+
+	ret = qat_compress_impl(dir, src, src_len, dst,
+	    dst_len, scratch, scratch_len, c_len);
+
+	zio_data_buf_free(scratch, scratch_len);
+
+	return (ret);
+}
+
+/*
+ * Setter for the zfs_qat_compress_disable module parameter.  After the
+ * new value is stored, a value of 0 (compression enabled) triggers
+ * initialization of the QAT instances if that has not happened yet.  If
+ * initialization fails, compression is disabled again and the error is
+ * returned.
+ */
+static int
+param_set_qat_compress(const char *val, zfs_kernel_param_t *kp)
+{
+	int *pvalue = kp->arg;
+	int ret = param_set_int(val, kp);
+
+	if (ret)
+		return (ret);
+
+	/* nothing more to do when disabling or when already initialized */
+	if (*pvalue != 0 || qat_dc_init_done)
+		return (0);
+
+	ret = qat_dc_init();
+	if (ret != 0)
+		zfs_qat_compress_disable = 1;
+
+	return (ret);
+}
+
+/*
+ * Register zfs_qat_compress_disable as a module parameter with mode
+ * 0644.  Writes go through param_set_qat_compress(); reads use the
+ * stock param_get_int().
+ */
+module_param_call(zfs_qat_compress_disable, param_set_qat_compress,
+    param_get_int, &zfs_qat_compress_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_compress_disable, "Enable/Disable QAT compression");
+
+#endif
@@ -0,0 +1,631 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * This file represents the QAT implementation of checksums and encryption.
+ * Internally, QAT shares the same cryptographic instances for both of these
+ * operations, so the code has been combined here. QAT data compression uses
+ * compression instances, so that code is separated into qat_compress.c
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/completion.h>
+#include <sys/zfs_context.h>
+#include <sys/zio_crypt.h>
+#include "lac/cpa_cy_im.h"
+#include "lac/cpa_cy_common.h"
+#include "qat.h"
+
+/*
+ * Maximum number of instances in a QAT device.  Each instance is a channel
+ * for submitting jobs to the QAT hardware.  This limit is used only to
+ * pre-allocate the instance and session arrays; the actual number of
+ * instances is defined in the QAT driver's configuration file.
+ */
+#define	QAT_CRYPT_MAX_INSTANCES		48
+
+#define	MAX_PAGE_NUM			1024
+
+static Cpa32U inst_num = 0;	/* request counter, used modulo num_inst */
+static Cpa16U num_inst = 0;	/* instances obtained by qat_cy_init() */
+static CpaInstanceHandle cy_inst_handles[QAT_CRYPT_MAX_INSTANCES];
+static boolean_t qat_cy_init_done = B_FALSE;	/* set by qat_cy_init() */
+int zfs_qat_encrypt_disable = 0;	/* module parameter, see below */
+int zfs_qat_checksum_disable = 0;	/* module parameter, see below */
+
+typedef struct cy_callback {
+	CpaBoolean verify_result;	/* 'verify' flag from symcallback() */
+	struct completion complete;	/* completed by symcallback() */
+} cy_callback_t;
+
+/*
+ * Completion callback registered with cpaCySymInitSession().
+ *
+ * p_callback is the cy_callback_t handed to cpaCySymPerformOp().  When it
+ * is non-NULL the reported verify flag is recorded in it and the waiter
+ * blocked on its completion is woken.  The other arguments are not used.
+ */
+static void
+symcallback(void *p_callback, CpaStatus status, const CpaCySymOp operation,
+    void *op_data, CpaBufferList *buf_list_dst, CpaBoolean verify)
+{
+	cy_callback_t *waiter = p_callback;
+
+	if (waiter == NULL)
+		return;
+
+	/* record the outcome, then signal that the callback has run */
+	waiter->verify_result = verify;
+	complete(&waiter->complete);
+}
+
+/*
+ * Report whether a buffer of s_len bytes should be encrypted/decrypted on
+ * QAT: encryption offload must not be disabled, the crypto instances must
+ * be initialized, and s_len must lie within
+ * [QAT_MIN_BUF_SIZE, QAT_MAX_BUF_SIZE].
+ */
+boolean_t
+qat_crypt_use_accel(size_t s_len)
+{
+	if (zfs_qat_encrypt_disable || !qat_cy_init_done)
+		return (B_FALSE);
+
+	return (s_len >= QAT_MIN_BUF_SIZE && s_len <= QAT_MAX_BUF_SIZE);
+}
+
+/*
+ * Report whether a buffer of s_len bytes should be checksummed on QAT:
+ * checksum offload must not be disabled, the crypto instances must be
+ * initialized, and s_len must lie within
+ * [QAT_MIN_BUF_SIZE, QAT_MAX_BUF_SIZE].
+ */
+boolean_t
+qat_checksum_use_accel(size_t s_len)
+{
+	if (zfs_qat_checksum_disable || !qat_cy_init_done)
+		return (B_FALSE);
+
+	return (s_len >= QAT_MIN_BUF_SIZE && s_len <= QAT_MAX_BUF_SIZE);
+}
+
+/*
+ * Stop every crypto instance recorded in cy_inst_handles and reset the
+ * module state so that qat_cy_init() can be run again.
+ */
+void
+qat_cy_clean(void)
+{
+	Cpa16U idx = 0;
+
+	while (idx < num_inst) {
+		cpaCyStopInstance(cy_inst_handles[idx]);
+		idx++;
+	}
+
+	num_inst = 0;
+	qat_cy_init_done = B_FALSE;
+}
+
+/*
+ * Discover and start the QAT crypto instances.
+ *
+ * Returns 0 when the instances are (already) initialized or when the
+ * driver reports that none are configured, and -1 when any QAT call
+ * fails.  If configuring or starting an instance fails, qat_cy_clean()
+ * is run before returning.
+ */
+int
+qat_cy_init(void)
+{
+	Cpa16U idx;
+
+	if (qat_cy_init_done)
+		return (0);
+
+	if (cpaCyGetNumInstances(&num_inst) != CPA_STATUS_SUCCESS)
+		return (-1);
+
+	/* no QAT encryption units configured by the user: nothing to do */
+	if (num_inst == 0)
+		return (0);
+
+	/* never request more handles than cy_inst_handles can hold */
+	if (num_inst > QAT_CRYPT_MAX_INSTANCES)
+		num_inst = QAT_CRYPT_MAX_INSTANCES;
+
+	if (cpaCyGetInstances(num_inst, &cy_inst_handles[0]) !=
+	    CPA_STATUS_SUCCESS)
+		return (-1);
+
+	for (idx = 0; idx < num_inst; idx++) {
+		CpaInstanceHandle handle = cy_inst_handles[idx];
+
+		if (cpaCySetAddressTranslation(handle,
+		    (void *)virt_to_phys) != CPA_STATUS_SUCCESS ||
+		    cpaCyStartInstance(handle) != CPA_STATUS_SUCCESS) {
+			qat_cy_clean();
+			return (-1);
+		}
+	}
+
+	qat_cy_init_done = B_TRUE;
+	return (0);
+}
+
+/*
+ * Tear down the QAT crypto instances if qat_cy_init() brought them up.
+ */
+void
+qat_cy_fini(void)
+{
+	if (qat_cy_init_done)
+		qat_cy_clean();
+}
+
+/*
+ * Allocate and initialize a QAT symmetric session for AES-GCM
+ * encryption or decryption with the given key.
+ *
+ * CCM suites are rejected with CPA_STATUS_FAIL.  On success
+ * *cy_session_ctx points at the newly allocated session memory; if
+ * cpaCySymInitSession() fails, that memory is released again here.
+ * Returns CPA_STATUS_SUCCESS or the failing status.
+ */
+static CpaStatus
+qat_init_crypt_session_ctx(qat_encrypt_dir_t dir, CpaInstanceHandle inst_handle,
+    CpaCySymSessionCtx **cy_session_ctx, crypto_key_t *key,
+    Cpa64U crypt, Cpa32U aad_len)
+{
+	CpaCySymSessionSetupData setup = { 0 };
+	Cpa32U session_size;
+	CpaStatus rc;
+
+	/* everything that is not CCM is set up as AES-GCM */
+	if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_CCM)
+		return (CPA_STATUS_FAIL);
+
+	setup.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+	setup.symOperation = CPA_CY_SYM_OP_ALGORITHM_CHAINING;
+	setup.digestIsAppended = CPA_FALSE;
+	setup.verifyDigest = CPA_FALSE;
+
+	setup.cipherSetupData.cipherAlgorithm = CPA_CY_SYM_CIPHER_AES_GCM;
+	setup.cipherSetupData.pCipherKey = key->ck_data;
+	setup.cipherSetupData.cipherKeyLenInBytes = key->ck_length / 8;
+
+	setup.hashSetupData.hashAlgorithm = CPA_CY_SYM_HASH_AES_GCM;
+	setup.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_AUTH;
+	setup.hashSetupData.digestResultLenInBytes = ZIO_DATA_MAC_LEN;
+	setup.hashSetupData.authModeSetupData.aadLenInBytes = aad_len;
+
+	if (dir != QAT_ENCRYPT) {
+		ASSERT3U(dir, ==, QAT_DECRYPT);
+		setup.cipherSetupData.cipherDirection =
+		    CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT;
+		setup.algChainOrder =
+		    CPA_CY_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH;
+	} else {
+		setup.cipherSetupData.cipherDirection =
+		    CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT;
+		setup.algChainOrder =
+		    CPA_CY_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER;
+	}
+
+	rc = cpaCySymSessionCtxGetSize(inst_handle, &setup, &session_size);
+	if (rc != CPA_STATUS_SUCCESS)
+		return (rc);
+
+	rc = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, session_size);
+	if (rc != CPA_STATUS_SUCCESS)
+		return (rc);
+
+	rc = cpaCySymInitSession(inst_handle, symcallback, &setup,
+	    *cy_session_ctx);
+	if (rc == CPA_STATUS_SUCCESS)
+		return (CPA_STATUS_SUCCESS);
+
+	/* session setup failed: give the context memory back */
+	QAT_PHYS_CONTIG_FREE(*cy_session_ctx);
+	return (rc);
+}
+
+/*
+ * Allocate and initialize a QAT symmetric session for plain hashing.
+ *
+ * Only ZIO_CHECKSUM_SHA256 is accepted; any other cksum value yields
+ * CPA_STATUS_FAIL.  On success *cy_session_ctx points at the newly
+ * allocated session memory; if cpaCySymInitSession() fails, that memory
+ * is released again here.
+ */
+static CpaStatus
+qat_init_checksum_session_ctx(CpaInstanceHandle inst_handle,
+    CpaCySymSessionCtx **cy_session_ctx, Cpa64U cksum)
+{
+	CpaCySymSessionSetupData setup = { 0 };
+	Cpa32U session_size;
+	CpaStatus rc;
+
+	/*
+	 * ZFS's SHA512 checksum is actually SHA512/256, which uses
+	 * a different IV from standard SHA512. QAT does not support
+	 * SHA512/256, so we can only support SHA256.
+	 */
+	if (cksum != ZIO_CHECKSUM_SHA256)
+		return (CPA_STATUS_FAIL);
+
+	setup.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+	setup.symOperation = CPA_CY_SYM_OP_HASH;
+	setup.digestIsAppended = CPA_FALSE;
+	setup.verifyDigest = CPA_FALSE;
+
+	setup.hashSetupData.hashAlgorithm = CPA_CY_SYM_HASH_SHA256;
+	setup.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_PLAIN;
+	setup.hashSetupData.digestResultLenInBytes = sizeof (zio_cksum_t);
+
+	rc = cpaCySymSessionCtxGetSize(inst_handle, &setup, &session_size);
+	if (rc != CPA_STATUS_SUCCESS)
+		return (rc);
+
+	rc = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, session_size);
+	if (rc != CPA_STATUS_SUCCESS)
+		return (rc);
+
+	rc = cpaCySymInitSession(inst_handle, symcallback, &setup,
+	    *cy_session_ctx);
+	if (rc == CPA_STATUS_SUCCESS)
+		return (CPA_STATUS_SUCCESS);
+
+	/* session setup failed: give the context memory back */
+	QAT_PHYS_CONTIG_FREE(*cy_session_ctx);
+	return (rc);
+}
+
+/*
+ * Allocate the private metadata that QAT needs for buffer lists holding
+ * nr_bufs flat buffers.  src and dst may be the same list, in which case
+ * only one allocation is made.  On failure anything allocated here is
+ * freed again and the failing status is returned.
+ */
+static CpaStatus
+qat_init_cy_buffer_lists(CpaInstanceHandle inst_handle, uint32_t nr_bufs,
+    CpaBufferList *src, CpaBufferList *dst)
+{
+	Cpa32U meta_bytes = 0;
+	CpaStatus rc;
+
+	rc = cpaCyBufferListGetMetaSize(inst_handle, nr_bufs, &meta_bytes);
+	if (rc != CPA_STATUS_SUCCESS)
+		return (rc);
+
+	rc = QAT_PHYS_CONTIG_ALLOC(&src->pPrivateMetaData, meta_bytes);
+	if (rc == CPA_STATUS_SUCCESS && src != dst) {
+		rc = QAT_PHYS_CONTIG_ALLOC(&dst->pPrivateMetaData,
+		    meta_bytes);
+	}
+
+	if (rc == CPA_STATUS_SUCCESS)
+		return (CPA_STATUS_SUCCESS);
+
+	/* undo whatever was allocated above */
+	QAT_PHYS_CONTIG_FREE(src->pPrivateMetaData);
+	if (src != dst)
+		QAT_PHYS_CONTIG_FREE(dst->pPrivateMetaData);
+
+	return (rc);
+}
+
+/*
+ * Encrypt or decrypt enc_len bytes from src_buf into dst_buf on a QAT
+ * crypto instance.
+ *
+ *	dir		QAT_ENCRYPT or QAT_DECRYPT
+ *	aad_buf/aad_len	additional authenticated data, copied into a
+ *			QAT-visible buffer when aad_len > 0
+ *	iv_buf		ZIO_DATA_IV_LEN bytes of IV
+ *	digest_buf	receives ZIO_DATA_MAC_LEN bytes of digest on success
+ *	key, crypt	key material and index into zio_crypt_table
+ *
+ * Returns CPA_STATUS_SUCCESS on success or a CPA failure status.  Every
+ * failure after session setup bumps the crypt_fails statistic; a session
+ * setup failure is only counted for GCM suites.
+ */
+int
+qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf,
+    uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf,
+    crypto_key_t *key, uint64_t crypt, uint32_t enc_len)
+{
+	CpaStatus status = CPA_STATUS_SUCCESS;
+	Cpa16U i;
+	CpaInstanceHandle cy_inst_handle;
+	Cpa16U nr_bufs = (enc_len >> PAGE_SHIFT) + 2;
+	Cpa32U bytes_left = 0;
+	Cpa8S *data = NULL;
+	CpaCySymSessionCtx *cy_session_ctx = NULL;
+	cy_callback_t cb;
+	CpaCySymOpData op_data = { 0 };
+	CpaBufferList src_buffer_list = { 0 };
+	CpaBufferList dst_buffer_list = { 0 };
+	CpaFlatBuffer *flat_src_buf_array = NULL;
+	CpaFlatBuffer *flat_src_buf = NULL;
+	CpaFlatBuffer *flat_dst_buf_array = NULL;
+	CpaFlatBuffer *flat_dst_buf = NULL;
+	struct page *in_pages[MAX_PAGE_NUM];
+	struct page *out_pages[MAX_PAGE_NUM];
+	Cpa32U in_page_num = 0;
+	Cpa32U out_page_num = 0;
+	Cpa32U in_page_off = 0;
+	Cpa32U out_page_off = 0;
+
+	if (dir == QAT_ENCRYPT) {
+		QAT_STAT_BUMP(encrypt_requests);
+		QAT_STAT_INCR(encrypt_total_in_bytes, enc_len);
+	} else {
+		QAT_STAT_BUMP(decrypt_requests);
+		QAT_STAT_INCR(decrypt_total_in_bytes, enc_len);
+	}
+
+	/* spread requests over the available instances */
+	i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+	cy_inst_handle = cy_inst_handles[i];
+
+	status = qat_init_crypt_session_ctx(dir, cy_inst_handle,
+	    &cy_session_ctx, key, crypt, aad_len);
+	if (status != CPA_STATUS_SUCCESS) {
+		/* don't count CCM as a failure since it's not supported */
+		if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_GCM)
+			QAT_STAT_BUMP(crypt_fails);
+		return (status);
+	}
+
+	/*
+	 * We increment nr_bufs by 2 to allow us to handle non
+	 * page-aligned buffer addresses and buffers whose sizes
+	 * are not divisible by PAGE_SIZE.
+	 */
+	status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs,
+	    &src_buffer_list, &dst_buffer_list);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array,
+	    nr_bufs * sizeof (CpaFlatBuffer));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+	status = QAT_PHYS_CONTIG_ALLOC(&flat_dst_buf_array,
+	    nr_bufs * sizeof (CpaFlatBuffer));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+	status = QAT_PHYS_CONTIG_ALLOC(&op_data.pDigestResult,
+	    ZIO_DATA_MAC_LEN);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+	status = QAT_PHYS_CONTIG_ALLOC(&op_data.pIv,
+	    ZIO_DATA_IV_LEN);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+	if (aad_len > 0) {
+		status = QAT_PHYS_CONTIG_ALLOC(&op_data.pAdditionalAuthData,
+		    aad_len);
+		if (status != CPA_STATUS_SUCCESS)
+			goto fail;
+		bcopy(aad_buf, op_data.pAdditionalAuthData, aad_len);
+	}
+
+	/* describe the source one page-bounded flat buffer at a time */
+	bytes_left = enc_len;
+	data = src_buf;
+	flat_src_buf = flat_src_buf_array;
+	while (bytes_left > 0) {
+		in_page_off = ((long)data & ~PAGE_MASK);
+		in_pages[in_page_num] = qat_mem_to_page(data);
+		flat_src_buf->pData = kmap(in_pages[in_page_num]) + in_page_off;
+		flat_src_buf->dataLenInBytes =
+		    min((long)PAGE_SIZE - in_page_off, (long)bytes_left);
+		data += flat_src_buf->dataLenInBytes;
+		bytes_left -= flat_src_buf->dataLenInBytes;
+		flat_src_buf++;
+		in_page_num++;
+	}
+	src_buffer_list.pBuffers = flat_src_buf_array;
+	src_buffer_list.numBuffers = in_page_num;
+
+	/* and the destination likewise */
+	bytes_left = enc_len;
+	data = dst_buf;
+	flat_dst_buf = flat_dst_buf_array;
+	while (bytes_left > 0) {
+		out_page_off = ((long)data & ~PAGE_MASK);
+		out_pages[out_page_num] = qat_mem_to_page(data);
+		flat_dst_buf->pData = kmap(out_pages[out_page_num]) +
+		    out_page_off;
+		flat_dst_buf->dataLenInBytes =
+		    min((long)PAGE_SIZE - out_page_off, (long)bytes_left);
+		data += flat_dst_buf->dataLenInBytes;
+		bytes_left -= flat_dst_buf->dataLenInBytes;
+		flat_dst_buf++;
+		out_page_num++;
+	}
+	dst_buffer_list.pBuffers = flat_dst_buf_array;
+	dst_buffer_list.numBuffers = out_page_num;
+
+	op_data.sessionCtx = cy_session_ctx;
+	op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+	op_data.cryptoStartSrcOffsetInBytes = 0;
+	op_data.messageLenToCipherInBytes = 0;
+	op_data.hashStartSrcOffsetInBytes = 0;
+	op_data.messageLenToHashInBytes = 0;
+	op_data.messageLenToCipherInBytes = enc_len;
+	op_data.ivLenInBytes = ZIO_DATA_IV_LEN;
+	bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN);
+
+	cb.verify_result = CPA_FALSE;
+	init_completion(&cb.complete);
+	status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data,
+	    &src_buffer_list, &dst_buffer_list, NULL);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	/*
+	 * Once the request has been accepted we must wait for the callback
+	 * unconditionally: 'cb' lives on this stack frame and the buffers
+	 * freed below are still owned by the hardware until symcallback()
+	 * has run.  An interruptible or timed wait could return early (a
+	 * signal yields a negative, i.e. "true", return value) and let the
+	 * callback write into a dead stack frame and freed memory.
+	 */
+	wait_for_completion(&cb.complete);
+
+	if (cb.verify_result == CPA_FALSE) {
+		status = CPA_STATUS_FAIL;
+		goto fail;
+	}
+
+	/* save digest result to digest_buf */
+	bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN);
+	if (dir == QAT_ENCRYPT)
+		QAT_STAT_INCR(encrypt_total_out_bytes, enc_len);
+	else
+		QAT_STAT_INCR(decrypt_total_out_bytes, enc_len);
+
+fail:
+	/* common exit: also reached on success, with status == SUCCESS */
+	if (status != CPA_STATUS_SUCCESS)
+		QAT_STAT_BUMP(crypt_fails);
+
+	for (i = 0; i < in_page_num; i++)
+		kunmap(in_pages[i]);
+	for (i = 0; i < out_page_num; i++)
+		kunmap(out_pages[i]);
+
+	cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx);
+	if (aad_len > 0)
+		QAT_PHYS_CONTIG_FREE(op_data.pAdditionalAuthData);
+	QAT_PHYS_CONTIG_FREE(op_data.pIv);
+	QAT_PHYS_CONTIG_FREE(op_data.pDigestResult);
+	QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData);
+	QAT_PHYS_CONTIG_FREE(dst_buffer_list.pPrivateMetaData);
+	QAT_PHYS_CONTIG_FREE(cy_session_ctx);
+	QAT_PHYS_CONTIG_FREE(flat_src_buf_array);
+	QAT_PHYS_CONTIG_FREE(flat_dst_buf_array);
+
+	return (status);
+}
+
+/*
+ * Compute the checksum of the size bytes at buf on a QAT crypto instance
+ * and store sizeof (zio_cksum_t) bytes of digest in *zcp.
+ *
+ * Session setup is delegated to qat_init_checksum_session_ctx(), which
+ * only accepts ZIO_CHECKSUM_SHA256.  Returns CPA_STATUS_SUCCESS on
+ * success or a CPA failure status; failures after session setup bump the
+ * cksum_fails statistic.
+ */
+int
+qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	CpaStatus status;
+	Cpa16U i;
+	CpaInstanceHandle cy_inst_handle;
+	Cpa16U nr_bufs = (size >> PAGE_SHIFT) + 2;
+	Cpa32U bytes_left = 0;
+	Cpa8S *data = NULL;
+	CpaCySymSessionCtx *cy_session_ctx = NULL;
+	cy_callback_t cb;
+	Cpa8U *digest_buffer = NULL;
+	CpaCySymOpData op_data = { 0 };
+	CpaBufferList src_buffer_list = { 0 };
+	CpaFlatBuffer *flat_src_buf_array = NULL;
+	CpaFlatBuffer *flat_src_buf = NULL;
+	struct page *in_pages[MAX_PAGE_NUM];
+	Cpa32U page_num = 0;
+	Cpa32U page_off = 0;
+
+	QAT_STAT_BUMP(cksum_requests);
+	QAT_STAT_INCR(cksum_total_in_bytes, size);
+
+	/* spread requests over the available instances */
+	i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+	cy_inst_handle = cy_inst_handles[i];
+
+	status = qat_init_checksum_session_ctx(cy_inst_handle,
+	    &cy_session_ctx, cksum);
+	if (status != CPA_STATUS_SUCCESS) {
+		/* don't count unsupported checksums as a failure */
+		if (cksum == ZIO_CHECKSUM_SHA256 ||
+		    cksum == ZIO_CHECKSUM_SHA512)
+			QAT_STAT_BUMP(cksum_fails);
+		return (status);
+	}
+
+	/*
+	 * We increment nr_bufs by 2 to allow us to handle non
+	 * page-aligned buffer addresses and buffers whose sizes
+	 * are not divisible by PAGE_SIZE.
+	 */
+	status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs,
+	    &src_buffer_list, &src_buffer_list);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array,
+	    nr_bufs * sizeof (CpaFlatBuffer));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+	status = QAT_PHYS_CONTIG_ALLOC(&digest_buffer,
+	    sizeof (zio_cksum_t));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	/* describe the input one page-bounded flat buffer at a time */
+	bytes_left = size;
+	data = buf;
+	flat_src_buf = flat_src_buf_array;
+	while (bytes_left > 0) {
+		page_off = ((long)data & ~PAGE_MASK);
+		in_pages[page_num] = qat_mem_to_page(data);
+		flat_src_buf->pData = kmap(in_pages[page_num]) + page_off;
+		flat_src_buf->dataLenInBytes =
+		    min((long)PAGE_SIZE - page_off, (long)bytes_left);
+		data += flat_src_buf->dataLenInBytes;
+		bytes_left -= flat_src_buf->dataLenInBytes;
+		flat_src_buf++;
+		page_num++;
+	}
+	src_buffer_list.pBuffers = flat_src_buf_array;
+	src_buffer_list.numBuffers = page_num;
+
+	op_data.sessionCtx = cy_session_ctx;
+	op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+	op_data.hashStartSrcOffsetInBytes = 0;
+	op_data.messageLenToHashInBytes = size;
+	op_data.pDigestResult = digest_buffer;
+
+	cb.verify_result = CPA_FALSE;
+	init_completion(&cb.complete);
+	status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data,
+	    &src_buffer_list, &src_buffer_list, NULL);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	/*
+	 * Once the request has been accepted we must wait for the callback
+	 * unconditionally: 'cb' lives on this stack frame and the buffers
+	 * freed below are still owned by the hardware until symcallback()
+	 * has run.  An interruptible or timed wait could return early (a
+	 * signal yields a negative, i.e. "true", return value) and let the
+	 * callback write into a dead stack frame and freed memory.
+	 */
+	wait_for_completion(&cb.complete);
+
+	if (cb.verify_result == CPA_FALSE) {
+		status = CPA_STATUS_FAIL;
+		goto fail;
+	}
+
+	bcopy(digest_buffer, zcp, sizeof (zio_cksum_t));
+
+fail:
+	/* common exit: also reached on success, with status == SUCCESS */
+	if (status != CPA_STATUS_SUCCESS)
+		QAT_STAT_BUMP(cksum_fails);
+
+	for (i = 0; i < page_num; i++)
+		kunmap(in_pages[i]);
+
+	cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx);
+	QAT_PHYS_CONTIG_FREE(digest_buffer);
+	QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData);
+	QAT_PHYS_CONTIG_FREE(cy_session_ctx);
+	QAT_PHYS_CONTIG_FREE(flat_src_buf_array);
+
+	return (status);
+}
+
+/*
+ * Setter for the zfs_qat_encrypt_disable module parameter.
+ *
+ * The value is first stored with param_set_int().  If that leaves the
+ * parameter at 0 (QAT encryption enabled) while the crypto instances are
+ * not yet initialized, qat_cy_init() is attempted; should it fail, the
+ * parameter is forced back to 1 and the error is returned.
+ */
+static int
+param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp)
+{
+	int *setting = kp->arg;
+	int err = param_set_int(val, kp);
+
+	if (err != 0)
+		return (err);
+
+	/* nothing further unless QAT was just enabled before being set up */
+	if (*setting != 0 || qat_cy_init_done)
+		return (err);
+
+	err = qat_cy_init();
+	if (err != 0)
+		zfs_qat_encrypt_disable = 1;
+
+	return (err);
+}
+
+/*
+ * Setter for the zfs_qat_checksum_disable module parameter.
+ *
+ * The value is first stored with param_set_int().  If that leaves the
+ * parameter at 0 (QAT checksumming enabled) while the crypto instances
+ * are not yet initialized, qat_cy_init() is attempted; should it fail,
+ * the parameter is forced back to 1 and the error is returned.
+ */
+static int
+param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp)
+{
+	int *setting = kp->arg;
+	int err = param_set_int(val, kp);
+
+	if (err != 0)
+		return (err);
+
+	/* nothing further unless QAT was just enabled before being set up */
+	if (*setting != 0 || qat_cy_init_done)
+		return (err);
+
+	err = qat_cy_init();
+	if (err != 0)
+		zfs_qat_checksum_disable = 1;
+
+	return (err);
+}
+
+module_param_call(zfs_qat_encrypt_disable, param_set_qat_encrypt,
+    param_get_int, &zfs_qat_encrypt_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_encrypt_disable, "Enable/Disable QAT encryption");
+
+module_param_call(zfs_qat_checksum_disable, param_set_qat_checksum,
+    param_get_int, &zfs_qat_checksum_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_checksum_disable, "Enable/Disable QAT checksumming");
+
+#endif
@@ -0,0 +1,954 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * LLNL-CODE-403049.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <linux/msdos_fs.h>
+#include <linux/vfs_compat.h>
+
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+static void *zfs_vdev_holder = VDEV_HOLDER;
+
+/* size of the "reserved" partition, in blocks */
+#define	EFI_MIN_RESV_SIZE	(16 * 1024)
+
+/*
+ * Bookkeeping for one request issued to the block layer on behalf of a
+ * zio.  It is allocated by vdev_disk_dio_alloc() with room for
+ * dr_bio_count bio pointers.  When vdev_disk_dio_put() drops the last
+ * reference it frees the structure and, if dr_zio is set, stores
+ * dr_error in that zio before passing it to zio_delay_interrupt().
+ */
+typedef struct dio_request {
+	zio_t			*dr_zio;	/* Parent ZIO */
+	atomic_t		dr_ref;		/* References */
+	int			dr_error;	/* Bio error */
+	int			dr_bio_count;	/* Count of bio's */
+	struct bio		*dr_bio[0];	/* Attached bio's */
+} dio_request_t;
+
+
+#if defined(HAVE_OPEN_BDEV_EXCLUSIVE) || defined(HAVE_BLKDEV_GET_BY_PATH)
+/*
+ * Translate the FREAD / FWRITE bits of smode into the corresponding
+ * FMODE_READ / FMODE_WRITE bits.  At least one of the two must be set.
+ */
+static fmode_t
+vdev_bdev_mode(int smode)
+{
+	fmode_t fmode = 0;
+
+	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
+
+	if ((smode & FWRITE) != 0)
+		fmode |= FMODE_WRITE;
+
+	if ((smode & FREAD) != 0)
+		fmode |= FMODE_READ;
+
+	return (fmode);
+}
+#else
+/*
+ * Translate smode into SB_RDONLY when FREAD is set without FWRITE, and
+ * into 0 otherwise.  At least one of the two bits must be set.
+ */
+static int
+vdev_bdev_mode(int smode)
+{
+	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
+
+	if ((smode & FREAD) != 0 && (smode & FWRITE) == 0)
+		return (SB_RDONLY);
+
+	return (0);
+}
+#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
+
+/*
+ * Usable capacity, in bytes, of the given partition or disk, read from
+ * the size of the block device's inode.
+ */
+static uint64_t
+bdev_capacity(struct block_device *bdev)
+{
+	loff_t bytes = i_size_read(bdev->bd_inode);
+
+	return (bytes);
+}
+
+/*
+ * Returns the maximum expansion capacity of the block device (in bytes).
+ *
+ * It is possible to expand a vdev when it has been created as a wholedisk
+ * and the containing block device has increased in capacity.  Or when the
+ * partition containing the pool has been manually increased in size.
+ *
+ * This function is only responsible for calculating the potential expansion
+ * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
+ * responsible for verifying the expected partition layout in the wholedisk
+ * case, and updating the partition table if appropriate.  Once the partition
+ * size has been increased the additional capacity will be visible using
+ * bdev_capacity().
+ *
+ * The returned maximum expansion capacity is always expected to be larger, or
+ * at the very least equal, to its usable capacity to prevent overestimating
+ * the pool expandsize.
+ */
+/*
+ * bdev		block device backing the vdev
+ * wholedisk	non-zero when the vdev was created as a whole disk
+ *
+ * Returns the larger of the device's usable capacity and, for a partition
+ * of a wholedisk vdev, the estimated capacity after expansion.  The result
+ * is never smaller than bdev_capacity(bdev).
+ */
+static uint64_t
+bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
+{
+	uint64_t psize = bdev_capacity(bdev);
+	int64_t available;
+
+	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
+		/*
+		 * When reporting maximum expansion capacity for a wholedisk
+		 * deduct any capacity which is expected to be lost due to
+		 * alignment restrictions.  Over reporting this value isn't
+		 * harmful and would only result in slightly less capacity
+		 * than expected post expansion.
+		 * The estimated available space may be slightly smaller than
+		 * bdev_capacity() for devices where the number of sectors is
+		 * not a multiple of the alignment size and the partition layout
+		 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
+		 * "reserved" EFI partition: in such cases return the device
+		 * usable capacity.
+		 */
+		available = i_size_read(bdev->bd_contains->bd_inode) -
+		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
+		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
+
+		/*
+		 * 'available' is signed and is negative when the containing
+		 * device is smaller than the deducted overhead.  Check the
+		 * sign before comparing against the unsigned capacity;
+		 * a mixed signed/unsigned comparison would turn a negative
+		 * estimate into an enormous expansion size.
+		 */
+		if (available > 0 && (uint64_t)available > psize)
+			psize = available;
+	}
+
+	return (psize);
+}
+
+/*
+ * Log a warning describing a failed zio: pool name, vdev path, error,
+ * I/O type, offset, size and flags.
+ */
+static void
+vdev_disk_error(zio_t *zio)
+{
+	u_longlong_t offset = (u_longlong_t)zio->io_offset;
+	u_longlong_t size = (u_longlong_t)zio->io_size;
+
+	/*
+	 * printk() is used because it is safe from any context, and this
+	 * function can run in interrupt context, for instance while
+	 * handling IRQs coming from a misbehaving disk device.
+	 */
+	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
+	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
+	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
+	    offset, size, zio->io_flags);
+}
+
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices.  This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization.  While allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device.  This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ *
+ * The switch is applied recursively to every child of v.  For a leaf
+ * vdev with an open block device the elevator is changed either through
+ * elevator_change() or, when that is unavailable, by running /bin/sh via
+ * call_usermodehelper() to write the name into sysfs.  Nothing is
+ * returned; a failure is only recorded with zfs_dbgmsg().
+ */
+static void
+vdev_elevator_switch(vdev_t *v, char *elevator)
+{
+	vdev_disk_t *vd = v->vdev_tsd;
+	struct request_queue *q;
+	char *device;
+	int error;
+
+	for (int c = 0; c < v->vdev_children; c++)
+		vdev_elevator_switch(v->vdev_child[c], elevator);
+
+	if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
+		return;
+
+	q = bdev_get_queue(vd->vd_bdev);
+	device = vd->vd_bdev->bd_disk->disk_name;
+
+	/*
+	 * Skip devices which are not whole disks (partitions).
+	 * Device-mapper devices are excepted since they may be whole
+	 * disks despite the vdev_wholedisk flag, in which case we can
+	 * and should switch the elevator. If the device-mapper device
+	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
+	 * "Skip devices without schedulers" check below will fail.
+	 */
+	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
+		return;
+
+	/* Leave existing scheduler when set to "none" */
+	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
+		return;
+
+	/*
+	 * The elevator_change() function was available in kernels from
+	 * 2.6.36 to 4.11.  When not available fall back to using the user
+	 * mode helper functionality to set the elevator via sysfs.  This
+	 * requires /bin/echo and sysfs to be mounted which may not be true
+	 * early in the boot process.
+	 */
+#ifdef HAVE_ELEVATOR_CHANGE
+	error = elevator_change(q, elevator);
+#else
+#define	SET_SCHEDULER_CMD \
+	"exec 0</dev/null " \
+	"     1>/sys/block/%s/queue/scheduler " \
+	"     2>/dev/null; " \
+	"echo %s"
+
+	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+	char *envp[] = { NULL };
+
+	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
+	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	strfree(argv[2]);
+#endif /* HAVE_ELEVATOR_CHANGE */
+	if (error) {
+		zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
+		    elevator, v->vdev_path, device, error);
+	}
+}
+
+/*
+ * Open (or reopen) the block device backing leaf vdev v.
+ *
+ *	psize		out: usable capacity of the device in bytes
+ *	max_psize	out: capacity including possible expansion space
+ *	ashift		out: log2 of the block size to use
+ *
+ * Returns 0 on success.  Returns EINVAL when the vdev has no absolute
+ * path, or the (positive) errno from opening the device.  In both the
+ * success and open-failure cases v->vdev_tsd is left pointing at the
+ * vdev_disk_t, with vd_bdev NULL on failure.
+ */
+static int
+vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *ashift)
+{
+	struct block_device *bdev;
+	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
+	int count = 0, block_size;
+	int bdev_retry_count = 50;
+	vdev_disk_t *vd;
+
+	/* Must have a pathname and it must be absolute. */
+	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
+		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		vdev_dbgmsg(v, "invalid vdev_path");
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Reopen the device if it is currently open.  When expanding a
+	 * partition force re-scanning the partition table while closed
+	 * in order to get an accurate updated block device size.  Then
+	 * since udev may need to recreate the device links increase the
+	 * open retry count before reporting the device as unavailable.
+	 */
+	vd = v->vdev_tsd;
+	if (vd) {
+		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
+		boolean_t reread_part = B_FALSE;
+
+		rw_enter(&vd->vd_lock, RW_WRITER);
+		bdev = vd->vd_bdev;
+		vd->vd_bdev = NULL;
+
+		if (bdev) {
+			if (v->vdev_expanding && bdev != bdev->bd_contains) {
+				bdevname(bdev->bd_contains, disk_name + 5);
+				reread_part = B_TRUE;
+			}
+
+			vdev_bdev_close(bdev, mode);
+		}
+
+		if (reread_part) {
+			bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
+			if (!IS_ERR(bdev)) {
+				int error = vdev_bdev_reread_part(bdev);
+				vdev_bdev_close(bdev, mode);
+				if (error == 0)
+					bdev_retry_count = 100;
+			}
+		}
+	} else {
+		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
+		rw_enter(&vd->vd_lock, RW_WRITER);
+	}
+
+	/*
+	 * Devices are always opened by the path provided at configuration
+	 * time.  This means that if the provided path is a udev by-id path
+	 * then drives may be re-cabled without an issue.  If the provided
+	 * path is a udev by-path path, then the physical location information
+	 * will be preserved.  This can be critical for more complicated
+	 * configurations where drives are located in specific physical
+	 * locations to maximize the systems tolerance to component failure.
+	 *
+	 * Alternatively, you can provide your own udev rule to flexibly map
+	 * the drives as you see fit.  It is not advised that you use the
+	 * /dev/[hd]d devices which may be reordered due to probing order.
+	 * Devices in the wrong locations will be detected by the higher
+	 * level vdev validation.
+	 *
+	 * The specified paths may be briefly removed and recreated in
+	 * response to udev events.  This should be exceptionally unlikely
+	 * because the zpool command makes every effort to verify these paths
+	 * have already settled prior to reaching this point.  Therefore,
+	 * a ENOENT failure at this point is highly likely to be transient
+	 * and it is reasonable to sleep and retry before giving up.  In
+	 * practice delays have been observed to be on the order of 100ms.
+	 */
+	bdev = ERR_PTR(-ENXIO);
+	while (IS_ERR(bdev) && count < bdev_retry_count) {
+		bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
+		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
+			/*
+			 * A bare schedule_timeout() returns immediately
+			 * while the task is TASK_RUNNING; use the variant
+			 * that sets the task state so that each retry
+			 * really waits ~10ms for udev to settle.
+			 */
+			schedule_timeout_interruptible(MSEC_TO_TICK(10));
+			count++;
+		} else if (IS_ERR(bdev)) {
+			break;
+		}
+	}
+
+	if (IS_ERR(bdev)) {
+		int error = -PTR_ERR(bdev);
+		vdev_dbgmsg(v, "open error=%d count=%d", error, count);
+		vd->vd_bdev = NULL;
+		v->vdev_tsd = vd;
+		rw_exit(&vd->vd_lock);
+		return (SET_ERROR(error));
+	} else {
+		vd->vd_bdev = bdev;
+		v->vdev_tsd = vd;
+		rw_exit(&vd->vd_lock);
+	}
+
+	struct request_queue *q = bdev_get_queue(vd->vd_bdev);
+
+	/*  Determine the physical block size */
+	block_size = vdev_bdev_block_size(vd->vd_bdev);
+
+	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
+	v->vdev_nowritecache = B_FALSE;
+
+	/* Set when device reports it supports TRIM. */
+	v->vdev_has_trim = !!blk_queue_discard(q);
+
+	/* Set when device reports it supports secure TRIM. */
+	v->vdev_has_securetrim = !!blk_queue_discard_secure(q);
+
+	/* Inform the ZIO pipeline that we are non-rotational */
+	v->vdev_nonrot = blk_queue_nonrot(q);
+
+	/* Physical volume size in bytes for the partition */
+	*psize = bdev_capacity(vd->vd_bdev);
+
+	/* Physical volume size in bytes including possible expansion space */
+	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
+
+	/* Based on the minimum sector size set the block size */
+	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
+
+	/* Try to set the io scheduler elevator algorithm */
+	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);
+
+	return (0);
+}
+
+/*
+ * Close the block device of vdev v and release its vdev_disk_t.  Nothing
+ * is done while the vdev is being reopened or when it has no vdev_disk_t.
+ */
+static void
+vdev_disk_close(vdev_t *v)
+{
+	vdev_disk_t *disk = v->vdev_tsd;
+
+	if (disk == NULL || v->vdev_reopening)
+		return;
+
+	if (disk->vd_bdev != NULL) {
+		vdev_bdev_close(disk->vd_bdev,
+		    vdev_bdev_mode(spa_mode(v->vdev_spa)));
+	}
+
+	rw_destroy(&disk->vd_lock);
+	kmem_free(disk, sizeof (vdev_disk_t));
+	v->vdev_tsd = NULL;
+}
+
+/*
+ * Allocate a zeroed dio_request_t with room for bio_count bio pointers.
+ * The reference count starts at 0 and every bio slot is NULL.
+ */
+static dio_request_t *
+vdev_disk_dio_alloc(int bio_count)
+{
+	dio_request_t *req;
+
+	req = kmem_zalloc(sizeof (dio_request_t) +
+	    sizeof (struct bio *) * bio_count, KM_SLEEP);
+	if (req == NULL)
+		return (req);
+
+	atomic_set(&req->dr_ref, 0);
+	req->dr_bio_count = bio_count;
+	req->dr_error = 0;
+
+	for (int slot = 0; slot < req->dr_bio_count; slot++)
+		req->dr_bio[slot] = NULL;
+
+	return (req);
+}
+
+/*
+ * Drop the reference held on every attached bio and free the request.
+ */
+static void
+vdev_disk_dio_free(dio_request_t *dr)
+{
+	for (int slot = 0; slot < dr->dr_bio_count; slot++) {
+		struct bio *attached = dr->dr_bio[slot];
+
+		if (attached != NULL)
+			bio_put(attached);
+	}
+
+	kmem_free(dr, sizeof (dio_request_t) +
+	    sizeof (struct bio *) * dr->dr_bio_count);
+}
+
+/*
+ * Take a reference on the request.
+ */
+static void
+vdev_disk_dio_get(dio_request_t *dr)
+{
+	atomic_inc(&dr->dr_ref);
+}
+
+/*
+ * Drop a reference on the request and return the remaining count.
+ *
+ * When the last reference goes away the request is freed and, if it has
+ * a parent zio, the recorded error is stored in that zio (and logged if
+ * non-zero) before the zio is passed to zio_delay_interrupt().  The zio
+ * and error are read before the request is freed so this happens exactly
+ * once and with the right zio.
+ */
+static int
+vdev_disk_dio_put(dio_request_t *dr)
+{
+	int remaining = atomic_dec_return(&dr->dr_ref);
+	zio_t *parent;
+	int err;
+
+	if (remaining != 0)
+		return (remaining);
+
+	parent = dr->dr_zio;
+	err = dr->dr_error;
+	vdev_disk_dio_free(dr);
+
+	if (parent != NULL) {
+		parent->io_error = err;
+		ASSERT3S(parent->io_error, >=, 0);
+		if (parent->io_error)
+			vdev_disk_error(parent);
+
+		zio_delay_interrupt(parent);
+	}
+
+	return (remaining);
+}
+
+/*
+ * Completion callback installed on every bio built by
+ * __vdev_disk_physio().
+ *
+ * The first error observed is recorded in the shared dio_request_t and
+ * is not overwritten by later completions.  The reference taken when the
+ * bio was constructed is then dropped; the return value of the put is
+ * not needed here.
+ */
+BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
+{
+	dio_request_t *dr = bio->bi_private;
+
+	if (dr->dr_error == 0) {
+#ifdef HAVE_1ARG_BIO_END_IO_T
+		dr->dr_error = BIO_END_IO_ERROR(bio);
+#else
+		if (error)
+			dr->dr_error = -(error);
+		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+			dr->dr_error = EIO;
+#endif
+	}
+
+	/* Drop reference acquired by __vdev_disk_physio */
+	(void) vdev_disk_dio_put(dr);
+}
+
+/*
+ * Attach the 'bio_size' bytes starting at kernel address 'bio_ptr' to
+ * 'bio', one page-bounded segment at a time.
+ *
+ * Mapping stops when the whole range has been added, when bi_max_vecs
+ * segments have been attempted, or when bio_add_page() does not accept
+ * a full segment.  The number of bytes that were NOT mapped is returned.
+ */
+static unsigned int
+bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
+{
+	unsigned int page_off = offset_in_page(bio_ptr);
+	unsigned int vec = 0;
+
+	while (vec < bio->bi_max_vecs) {
+		unsigned int chunk = PAGE_SIZE - page_off;
+		struct page *page;
+
+		if (bio_size <= 0)
+			break;
+
+		/* Never map beyond the end of the caller's buffer. */
+		if (chunk > bio_size)
+			chunk = bio_size;
+
+		page = is_vmalloc_addr(bio_ptr) ?
+		    vmalloc_to_page(bio_ptr) : virt_to_page(bio_ptr);
+
+		/*
+		 * Some network related block device uses tcp_sendpage, which
+		 * doesn't behave well when using 0-count page, this is a
+		 * safety net to catch them.
+		 */
+		ASSERT3S(page_count(page), >, 0);
+
+		if (bio_add_page(bio, page, chunk, page_off) != chunk)
+			break;
+
+		/* Only the first segment can start mid-page. */
+		page_off = 0;
+		bio_ptr += chunk;
+		bio_size -= chunk;
+		vec++;
+	}
+
+	return (bio_size);
+}
+
+/*
+ * Map 'size' bytes of 'abd', starting 'off' bytes in, onto 'bio'.
+ * Linear ABDs go through bio_map() on their flat buffer; all other ABDs
+ * are delegated to abd_scatter_bio_map_off().  The callee's return
+ * value is passed straight through.
+ */
+static unsigned int
+bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
+{
+	if (abd_is_linear(abd)) {
+		char *linear = abd_to_buf(abd);
+
+		return (bio_map(bio, linear + off, size));
+	}
+
+	return (abd_scatter_bio_map_off(bio, abd, size, off));
+}
+
+/*
+ * Thin wrapper that hides the two submit_bio() signatures selected by
+ * HAVE_1ARG_SUBMIT_BIO: the one-argument form, or the older form that
+ * takes a leading argument (passed as 0 here).
+ */
+static inline void
+vdev_submit_bio_impl(struct bio *bio)
+{
+#ifdef HAVE_1ARG_SUBMIT_BIO
+	submit_bio(bio);
+#else
+	submit_bio(0, bio);
+#endif
+}
+
+/*
+ * bio_set_dev() compatibility.
+ *
+ * When the kernel provides bio_set_dev() it is used as-is, except that
+ * bio_associate_blkg is redirected to a local replacement when both
+ * CONFIG_BLK_CGROUP and HAVE_BIO_SET_DEV_GPL_ONLY are defined.  When the
+ * kernel does not provide bio_set_dev(), a local version is defined.
+ */
+#ifdef HAVE_BIO_SET_DEV
+#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
+/*
+ * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
+ * GPL-only bio_associate_blkg() symbol thus inadvertently converting
+ * the entire macro.  Provide a minimal version which always assigns the
+ * request queue's root_blkg to the bio.
+ */
+static inline void
+vdev_bio_associate_blkg(struct bio *bio)
+{
+	struct request_queue *q = bio->bi_disk->queue;
+
+	ASSERT3P(q, !=, NULL);
+	ASSERT3P(bio->bi_blkg, ==, NULL);
+
+	/* Only attach the root blkg if a reference on it was obtained. */
+	if (blkg_tryget(q->root_blkg))
+		bio->bi_blkg = q->root_blkg;
+}
+#define	bio_associate_blkg vdev_bio_associate_blkg
+#endif
+#else
+/*
+ * Provide a bio_set_dev() helper function for pre-Linux 4.14 kernels.
+ */
+static inline void
+bio_set_dev(struct bio *bio, struct block_device *bdev)
+{
+	bio->bi_bdev = bdev;
+}
+#endif /* HAVE_BIO_SET_DEV */
+
+/*
+ * Submit 'bio' with the current task's pending-bio pointer temporarily
+ * cleared, restoring the saved value afterwards.  Depending on the
+ * kernel this pointer is either current->bio_tail or current->bio_list.
+ *
+ * NOTE(review): presumably this keeps the block layer from deferring the
+ * bio onto the submitting task's own list -- confirm against the kernel
+ * version in use.
+ */
+static inline void
+vdev_submit_bio(struct bio *bio)
+{
+#ifdef HAVE_CURRENT_BIO_TAIL
+	struct bio **saved_tail = current->bio_tail;
+
+	current->bio_tail = NULL;
+	vdev_submit_bio_impl(bio);
+	current->bio_tail = saved_tail;
+#else
+	struct bio_list *saved_list = current->bio_list;
+
+	current->bio_list = NULL;
+	vdev_submit_bio_impl(bio);
+	current->bio_list = saved_list;
+#endif
+}
+
+/*
+ * Issue the I/O described by 'zio' to 'bdev' as one or more bios.
+ *
+ *	bdev		block device to submit to
+ *	zio		zio whose io_abd supplies the data; it is completed
+ *			from vdev_disk_dio_put() when the last reference on
+ *			the dio request is dropped
+ *	io_size		number of bytes to transfer
+ *	io_offset	byte offset on the device
+ *	rw, flags	handed to bio_set_op_attrs() for every bio
+ *
+ * Returns 0 once every bio has been submitted, EIO when the request
+ * extends past the end of the device, or ENOMEM on allocation failure.
+ */
+static int
+__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
+    size_t io_size, uint64_t io_offset, int rw, int flags)
+{
+	dio_request_t *dr;
+	uint64_t abd_offset;
+	uint64_t bio_offset;
+	int bio_size, bio_count = 16;
+	int i = 0, error = 0;
+#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
+	struct blk_plug plug;
+#endif
+	/*
+	 * Accessing outside the block device is never allowed.  The device
+	 * size is read with i_size_read() both for the check and for the
+	 * message, and every argument is cast to match its %llu conversion.
+	 */
+	if (io_offset + io_size > i_size_read(bdev->bd_inode)) {
+		vdev_dbgmsg(zio->io_vd,
+		    "Illegal access %llu size %llu, device size %llu",
+		    (unsigned long long)io_offset,
+		    (unsigned long long)io_size,
+		    (unsigned long long)i_size_read(bdev->bd_inode));
+		return (SET_ERROR(EIO));
+	}
+
+retry:
+	dr = vdev_disk_dio_alloc(bio_count);
+	if (dr == NULL)
+		return (SET_ERROR(ENOMEM));
+
+	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
+		bio_set_flags_failfast(bdev, &flags);
+
+	dr->dr_zio = zio;
+
+	/*
+	 * When the IO size exceeds the maximum bio size for the request
+	 * queue we are forced to break the IO in multiple bio's and wait
+	 * for them all to complete.  Ideally, all pool users will set
+	 * their volume block size to match the maximum request size and
+	 * the common case will be one bio per vdev IO request.
+	 */
+
+	abd_offset = 0;
+	bio_offset = io_offset;
+	bio_size   = io_size;
+	for (i = 0; i <= dr->dr_bio_count; i++) {
+
+		/* Finished constructing bio's for given buffer */
+		if (bio_size <= 0)
+			break;
+
+		/*
+		 * By default only 'bio_count' bio's per dio are allowed.
+		 * However, if we find ourselves in a situation where more
+		 * are needed we allocate a larger dio and start over.
+		 * Nothing has been submitted yet, so freeing the dio also
+		 * releases every bio built so far.
+		 */
+		if (dr->dr_bio_count == i) {
+			vdev_disk_dio_free(dr);
+			bio_count *= 2;
+			goto retry;
+		}
+
+		/* bio_alloc() with __GFP_WAIT never returns NULL */
+		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
+		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
+		    BIO_MAX_PAGES));
+		if (unlikely(dr->dr_bio[i] == NULL)) {
+			vdev_disk_dio_free(dr);
+			return (SET_ERROR(ENOMEM));
+		}
+
+		/* Matching put called by vdev_disk_physio_completion */
+		vdev_disk_dio_get(dr);
+
+		bio_set_dev(dr->dr_bio[i], bdev);
+		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
+		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+		dr->dr_bio[i]->bi_private = dr;
+		bio_set_op_attrs(dr->dr_bio[i], rw, flags);
+
+		/* Remaining size is returned to become the new size */
+		bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
+		    bio_size, abd_offset);
+
+		/* Advance in buffer and construct another bio if needed */
+		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
+		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
+	}
+
+	/* Extra reference to protect dio_request during vdev_submit_bio */
+	vdev_disk_dio_get(dr);
+
+#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
+	if (dr->dr_bio_count > 1)
+		blk_start_plug(&plug);
+#endif
+
+	/* Submit all bio's associated with this dio */
+	for (i = 0; i < dr->dr_bio_count; i++)
+		if (dr->dr_bio[i])
+			vdev_submit_bio(dr->dr_bio[i]);
+
+#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
+	if (dr->dr_bio_count > 1)
+		blk_finish_plug(&plug);
+#endif
+
+	/* Drop the submission reference; this may complete the zio. */
+	(void) vdev_disk_dio_put(dr);
+
+	return (error);
+}
+
+/*
+ * Completion callback for the flush bio built by vdev_disk_io_flush().
+ *
+ * The bio's result is stored in the zio as a positive errno.  An
+ * EOPNOTSUPP result marks the vdev as having no write cache.  The bio
+ * is then released and the zio is passed to zio_interrupt().
+ */
+BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
+{
+	zio_t *flush_zio = bio->bi_private;
+
+#ifdef HAVE_1ARG_BIO_END_IO_T
+	flush_zio->io_error = BIO_END_IO_ERROR(bio);
+#else
+	flush_zio->io_error = -error;
+#endif
+
+	if (flush_zio->io_error == EOPNOTSUPP)
+		flush_zio->io_vd->vdev_nowritecache = B_TRUE;
+
+	bio_put(bio);
+
+	ASSERT3S(flush_zio->io_error, >=, 0);
+	if (flush_zio->io_error != 0)
+		vdev_disk_error(flush_zio);
+
+	zio_interrupt(flush_zio);
+}
+
+/*
+ * Build and submit an empty flush bio for 'bdev' on behalf of 'zio'.
+ * The zio is finished from vdev_disk_io_flush_completion().
+ *
+ * Returns 0 when the bio was submitted, ENXIO when the device has no
+ * request queue, or ENOMEM when the bio could not be allocated.
+ */
+static int
+vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
+{
+	struct bio *flush_bio;
+
+	if (bdev_get_queue(bdev) == NULL)
+		return (SET_ERROR(ENXIO));
+
+	/* bio_alloc() with __GFP_WAIT never returns NULL */
+	flush_bio = bio_alloc(GFP_NOIO, 0);
+	if (unlikely(flush_bio == NULL))
+		return (SET_ERROR(ENOMEM));
+
+	flush_bio->bi_private = zio;
+	flush_bio->bi_end_io = vdev_disk_io_flush_completion;
+	bio_set_dev(flush_bio, bdev);
+	bio_set_flush(flush_bio);
+
+	vdev_submit_bio(flush_bio);
+	invalidate_bdev(bdev);
+
+	return (0);
+}
+
+/*
+ * Start a zio on a disk vdev.
+ *
+ * The zio is dispatched by io_type:
+ *   - IOCTL: only DKIOCFLUSHWRITECACHE is handled; everything else gets
+ *     ENOTSUP.  A successfully submitted flush completes asynchronously;
+ *     all other outcomes continue via zio_execute().
+ *   - READ / WRITE: handed to __vdev_disk_physio().
+ *   - TRIM: issued synchronously through blkdev_issue_discard().
+ *   - anything else: failed with ENOTSUP.
+ *
+ * vd_lock is held as reader while the block device is in use and is
+ * released on every return path.
+ */
+static void
+vdev_disk_io_start(zio_t *zio)
+{
+	vdev_t *v = zio->io_vd;
+	vdev_disk_t *vd = v->vdev_tsd;
+	unsigned long trim_flags = 0;
+	int rw, flags, error;
+
+	/*
+	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+	 * Nothing to be done here but return failure.
+	 */
+	if (vd == NULL) {
+		zio->io_error = ENXIO;
+		zio_interrupt(zio);
+		return;
+	}
+
+	rw_enter(&vd->vd_lock, RW_READER);
+
+	/*
+	 * If the vdev is closed, it's likely due to a failed reopen and is
+	 * in the UNAVAIL state.  Nothing to be done here but return failure.
+	 */
+	if (vd->vd_bdev == NULL) {
+		rw_exit(&vd->vd_lock);
+		zio->io_error = ENXIO;
+		zio_interrupt(zio);
+		return;
+	}
+
+	switch (zio->io_type) {
+	case ZIO_TYPE_IOCTL:
+
+		if (!vdev_readable(v)) {
+			rw_exit(&vd->vd_lock);
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
+		}
+
+		switch (zio->io_cmd) {
+		case DKIOCFLUSHWRITECACHE:
+
+			/* Flushes disabled: finish with io_error untouched. */
+			if (zfs_nocacheflush)
+				break;
+
+			if (v->vdev_nowritecache) {
+				zio->io_error = SET_ERROR(ENOTSUP);
+				break;
+			}
+
+			error = vdev_disk_io_flush(vd->vd_bdev, zio);
+			if (error == 0) {
+				/* Flush is in flight; its callback finishes */
+				rw_exit(&vd->vd_lock);
+				return;
+			}
+
+			zio->io_error = error;
+
+			break;
+
+		default:
+			zio->io_error = SET_ERROR(ENOTSUP);
+		}
+
+		rw_exit(&vd->vd_lock);
+		zio_execute(zio);
+		return;
+	case ZIO_TYPE_WRITE:
+		rw = WRITE;
+#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
+		flags = (1 << BIO_RW_UNPLUG);
+#elif defined(REQ_UNPLUG)
+		flags = REQ_UNPLUG;
+#else
+		flags = 0;
+#endif
+		break;
+
+	case ZIO_TYPE_READ:
+		rw = READ;
+#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
+		flags = (1 << BIO_RW_UNPLUG);
+#elif defined(REQ_UNPLUG)
+		flags = REQ_UNPLUG;
+#else
+		flags = 0;
+#endif
+		break;
+
+	case ZIO_TYPE_TRIM:
+#if defined(BLKDEV_DISCARD_SECURE)
+		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
+			trim_flags |= BLKDEV_DISCARD_SECURE;
+#endif
+		/* Offset and size are converted from bytes to 512B sectors. */
+		zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
+		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
+		    trim_flags);
+
+		rw_exit(&vd->vd_lock);
+		zio_interrupt(zio);
+		return;
+
+	default:
+		rw_exit(&vd->vd_lock);
+		zio->io_error = SET_ERROR(ENOTSUP);
+		zio_interrupt(zio);
+		return;
+	}
+
+	/* Only READ and WRITE reach this point. */
+	zio->io_target_timestamp = zio_handle_io_delay(zio);
+	error = __vdev_disk_physio(vd->vd_bdev, zio,
+	    zio->io_size, zio->io_offset, rw, flags);
+	rw_exit(&vd->vd_lock);
+
+	if (error) {
+		zio->io_error = error;
+		zio_interrupt(zio);
+		return;
+	}
+}
+
+/*
+ * Post-I/O hook for disk vdevs.
+ *
+ * Only EIO results are of interest: the media is revalidated and, when
+ * a change is reported, the block device is invalidated and an
+ * asynchronous removal of the vdev is requested.
+ */
+static void
+vdev_disk_io_done(zio_t *zio)
+{
+	vdev_t *v;
+	vdev_disk_t *vd;
+
+	if (zio->io_error != EIO)
+		return;
+
+	v = zio->io_vd;
+	vd = v->vdev_tsd;
+
+	if (!check_disk_change(vd->vd_bdev))
+		return;
+
+	vdev_bdev_invalidate(vd->vd_bdev);
+	v->vdev_remove_wanted = B_TRUE;
+	spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+}
+
+/*
+ * Hold hook for disk vdevs.  The caller must hold SCL_STATE as writer.
+ *
+ * Vdevs without an absolute path, and vdevs that already carry private
+ * data (i.e. have been opened), are left untouched.  For the remainder
+ * the name and devid vnode pointers are simply cleared, since no real
+ * lookup is implemented.
+ */
+static void
+vdev_disk_hold(vdev_t *vd)
+{
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+	/* A pathname is required ... */
+	if (vd->vdev_path == NULL)
+		return;
+
+	/* ... and it has to be absolute. */
+	if (vd->vdev_path[0] != '/')
+		return;
+
+	/* Path and devid info is only prefetched before the first open. */
+	if (vd->vdev_tsd != NULL)
+		return;
+
+	/* XXX: Implement me as a vnode lookup for the device */
+	vd->vdev_name_vp = NULL;
+	vd->vdev_devid_vp = NULL;
+}
+
+/*
+ * Release hook for disk vdevs.  Currently it only asserts that the
+ * caller holds SCL_STATE as writer; there is nothing to release.
+ */
+static void
+vdev_disk_rele(vdev_t *vd)
+{
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+	/* XXX: Implement me as a vnode rele for the device */
+}
+
+/*
+ * Setter for the zfs_vdev_scheduler module parameter.
+ *
+ * The first newline in 'val' is replaced by a terminator, the new
+ * scheduler is pushed to the root vdev of every active, writeable,
+ * non-suspended pool, and the value is finally stored through
+ * param_set_charp(), whose result is returned.  A NULL 'val' is
+ * rejected with -EINVAL.
+ */
+static int
+param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
+{
+	spa_t *spa = NULL;
+	char *p;
+
+	if (val == NULL)
+		return (SET_ERROR(-EINVAL));
+
+	/*
+	 * NOTE(review): this writes through a pointer derived from the
+	 * const 'val' -- presumably the caller's buffer is writable;
+	 * confirm.
+	 */
+	if ((p = strchr(val, '\n')) != NULL)
+		*p = '\0';
+
+	if (spa_mode_global != 0) {
+		mutex_enter(&spa_namespace_lock);
+		while ((spa = spa_next(spa)) != NULL) {
+			if (spa_state(spa) != POOL_STATE_ACTIVE ||
+			    !spa_writeable(spa) || spa_suspended(spa))
+				continue;
+
+			/*
+			 * Take a reference on the pool before the namespace
+			 * lock is dropped for the switch, and release it
+			 * only after the lock has been re-taken.
+			 */
+			spa_open_ref(spa, FTAG);
+			mutex_exit(&spa_namespace_lock);
+			vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
+			mutex_enter(&spa_namespace_lock);
+			spa_close(spa, FTAG);
+		}
+		mutex_exit(&spa_namespace_lock);
+	}
+
+	return (param_set_charp(val, kp));
+}
+
+/*
+ * Operations vector for disk vdevs.  Open, close, io_start, io_done,
+ * hold and rele are implemented in this file; asize and xlate use the
+ * vdev_default_* implementations and the remaining hooks are unset.
+ */
+vdev_ops_t vdev_disk_ops = {
+	.vdev_op_open = vdev_disk_open,
+	.vdev_op_close = vdev_disk_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_disk_io_start,
+	.vdev_op_io_done = vdev_disk_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = vdev_disk_hold,
+	.vdev_op_rele = vdev_disk_rele,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
+	.vdev_op_leaf = B_TRUE			/* leaf vdev */
+};
+
+/*
+ * Expose zfs_vdev_scheduler as a module parameter (mode 0644).  Writes
+ * go through param_set_vdev_scheduler(); reads use param_get_charp().
+ */
+module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
+    param_get_charp, &zfs_vdev_scheduler, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
@@ -0,0 +1,331 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+/*
+ * Task queue used to run file vdev I/O and deferred fsync requests.
+ * Created by vdev_file_init() and destroyed by vdev_file_fini().
+ */
+static taskq_t *vdev_file_taskq;
+
+/*
+ * Hold hook for file vdevs.  It only asserts that the vdev has a path.
+ */
+static void
+vdev_file_hold(vdev_t *vd)
+{
+	ASSERT(vd->vdev_path != NULL);
+}
+
+/*
+ * Release hook for file vdevs.  It only asserts that the vdev has a
+ * path.
+ */
+static void
+vdev_file_rele(vdev_t *vd)
+{
+	ASSERT(vd->vdev_path != NULL);
+}
+
+/*
+ * Open (or re-probe) a file-backed vdev.
+ *
+ * On success 0 is returned, *psize and *max_psize are both set to the
+ * file's current size and *ashift is set to SPA_MINBLOCKSHIFT.  On
+ * failure an errno is returned and vdev_stat.vs_aux records the reason.
+ *
+ * When the vdev already has private data (a reopen) the file is not
+ * opened again; only its size is refreshed.
+ *
+ * NOTE(review): on the error returns after the allocation below, 'vf'
+ * (and possibly the opened vnode) stays attached to vd->vdev_tsd;
+ * presumably vdev_file_close() is relied upon to release it -- confirm.
+ */
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *ashift)
+{
+	vdev_file_t *vf;
+	vnode_t *vp;
+	vattr_t vattr;
+	int error;
+
+	/*
+	 * Rotational optimizations only make sense on block devices.
+	 */
+	vd->vdev_nonrot = B_TRUE;
+
+	/*
+	 * Allow TRIM on file based vdevs.  This may not always be supported,
+	 * since it depends on your kernel version and underlying filesystem
+	 * type but it is always safe to attempt.
+	 */
+	vd->vdev_has_trim = B_TRUE;
+
+	/*
+	 * Disable secure TRIM on file based vdevs.  There is no way to
+	 * request this behavior from the underlying filesystem.
+	 */
+	vd->vdev_has_securetrim = B_FALSE;
+
+	/*
+	 * We must have a pathname, and it must be absolute.
+	 */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Reopen the device if it's not currently open.  Otherwise,
+	 * just update the physical size of the device.
+	 */
+	if (vd->vdev_tsd != NULL) {
+		ASSERT(vd->vdev_reopening);
+		vf = vd->vdev_tsd;
+		goto skip_open;
+	}
+
+	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+	/*
+	 * We always open the files from the root of the global zone, even if
+	 * we're in a local zone.  If the user has gotten to this point, the
+	 * administrator has already decided that the pool should be available
+	 * to local zone users, so the underlying devices should be as well.
+	 */
+	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+	/* The leading '/' is skipped; the lookup starts at rootdir. */
+	error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
+	    spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
+
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	vf->vf_vnode = vp;
+
+#ifdef _KERNEL
+	/*
+	 * Make sure it's a regular file.
+	 */
+	if (vp->v_type != VREG) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (SET_ERROR(ENODEV));
+	}
+#endif
+
+skip_open:
+	/*
+	 * Determine the physical size of the file.
+	 */
+	vattr.va_mask = AT_SIZE;
+	error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	*max_psize = *psize = vattr.va_size;
+	*ashift = SPA_MINBLOCKSHIFT;
+
+	return (0);
+}
+
+/*
+ * Close a file-backed vdev.
+ *
+ * Nothing is done while the vdev is being reopened or when it has no
+ * private data.  Otherwise the backing vnode (if any) has its pages
+ * invalidated and is closed, the delayed-close flag is cleared and the
+ * vdev_file_t is freed.
+ */
+static void
+vdev_file_close(vdev_t *vd)
+{
+	vdev_file_t *file = vd->vdev_tsd;
+
+	if (vd->vdev_reopening)
+		return;
+
+	if (file == NULL)
+		return;
+
+	if (file->vf_vnode != NULL) {
+		(void) VOP_PUTPAGE(file->vf_vnode, 0, 0, B_INVAL, kcred,
+		    NULL);
+		(void) VOP_CLOSE(file->vf_vnode, spa_mode(vd->vdev_spa), 1,
+		    0, kcred, NULL);
+	}
+
+	vd->vdev_delayed_close = B_FALSE;
+	kmem_free(file, sizeof (*file));
+	vd->vdev_tsd = NULL;
+}
+
+/*
+ * Taskq callback that performs a read or write zio against the backing
+ * file of a file vdev.
+ *
+ * The zio's ABD is borrowed as a flat buffer for the duration of the
+ * vn_rdwr() call.  Reads borrow without copying and copy the data back
+ * on return; writes copy on borrow and return without copying.  A
+ * transfer that succeeds but leaves a residual is reported as ENOSPC.
+ * The zio is finished through zio_delay_interrupt().
+ */
+static void
+vdev_file_io_strategy(void *arg)
+{
+	zio_t *zio = (zio_t *)arg;
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf = vd->vdev_tsd;
+	/*
+	 * Start at zero so the residual check below never reads an
+	 * indeterminate value.  NOTE(review): this file does not show
+	 * whether vn_rdwr() stores to 'resid' when it fails.
+	 */
+	ssize_t resid = 0;
+	void *buf;
+
+	if (zio->io_type == ZIO_TYPE_READ)
+		buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+	else
+		buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+
+	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
+	    UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size,
+	    zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+
+	if (zio->io_type == ZIO_TYPE_READ)
+		abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
+	else
+		abd_return_buf(zio->io_abd, buf, zio->io_size);
+
+	/* Only a transfer that reported success can be "short". */
+	if (zio->io_error == 0 && resid != 0)
+		zio->io_error = SET_ERROR(ENOSPC);
+
+	zio_delay_interrupt(zio);
+}
+
+/*
+ * Taskq callback that syncs the backing file of a file vdev, stores the
+ * VOP_FSYNC() result in the zio and finishes it via zio_interrupt().
+ * It is dispatched from vdev_file_io_start() when the sync cannot be
+ * issued in the caller's own context.
+ */
+static void
+vdev_file_io_fsync(void *arg)
+{
+	zio_t *zio = (zio_t *)arg;
+	vdev_file_t *vf = zio->io_vd->vdev_tsd;
+
+	zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, kcred, NULL);
+
+	zio_interrupt(zio);
+}
+
+/*
+ * Start a zio on a file vdev.
+ *
+ *   - IOCTL: DKIOCFLUSHWRITECACHE is turned into an fsync of the backing
+ *     file (inline, or on the taskq when PF_FSTRANS is set); any other
+ *     command gets ENOTSUP.
+ *   - TRIM: the range is released with VOP_SPACE(F_FREESP).
+ *   - everything else: dispatched to vdev_file_io_strategy() on the
+ *     taskq.
+ */
+static void
+vdev_file_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf = vd->vdev_tsd;
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		/* XXPOLICY */
+		if (!vdev_readable(vd)) {
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
+		}
+
+		if (zio->io_cmd != DKIOCFLUSHWRITECACHE) {
+			zio->io_error = SET_ERROR(ENOTSUP);
+		} else if (!zfs_nocacheflush) {
+			/*
+			 * We cannot safely call vfs_fsync() when PF_FSTRANS
+			 * is set in the current context.  Filesystems like
+			 * XFS include sanity checks to verify it is not
+			 * already set, see xfs_vm_writepage().  Therefore
+			 * the sync must be dispatched to a different context.
+			 */
+			if (__spl_pf_fstrans_check()) {
+				VERIFY3U(taskq_dispatch(vdev_file_taskq,
+				    vdev_file_io_fsync, zio, TQ_SLEEP), !=,
+				    TASKQID_INVALID);
+				return;
+			}
+
+			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
+			    kcred, NULL);
+		}
+
+		zio_execute(zio);
+		return;
+	}
+
+	if (zio->io_type == ZIO_TYPE_TRIM) {
+		struct flock range;
+
+		ASSERT3U(zio->io_size, !=, 0);
+
+		/* Describe the byte range that is to be released. */
+		bzero(&range, sizeof (range));
+		range.l_whence = SEEK_SET;
+		range.l_start = zio->io_offset;
+		range.l_len = zio->io_size;
+		range.l_type = F_FREESP;
+
+		zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &range,
+		    0, 0, kcred, NULL);
+
+		zio_execute(zio);
+		return;
+	}
+
+	zio->io_target_timestamp = zio_handle_io_delay(zio);
+
+	VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+	    TQ_SLEEP), !=, TASKQID_INVALID);
+}
+
+/*
+ * Post-I/O hook for file vdevs; intentionally empty.
+ */
+/* ARGSUSED */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+}
+
+/*
+ * Operations vector for file vdevs.  Open, close, io_start, io_done,
+ * hold and rele are implemented in this file; asize and xlate use the
+ * vdev_default_* implementations and the remaining hooks are unset.
+ */
+vdev_ops_t vdev_file_ops = {
+	.vdev_op_open = vdev_file_open,
+	.vdev_op_close = vdev_file_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_file_io_start,
+	.vdev_op_io_done = vdev_file_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = vdev_file_hold,
+	.vdev_op_rele = vdev_file_rele,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_type = VDEV_TYPE_FILE,		/* name of this vdev type */
+	.vdev_op_leaf = B_TRUE			/* leaf vdev */
+};
+
+/*
+ * Create the "z_vdev_file" dynamic task queue used for file vdev I/O.
+ * Creation failure is fatal (VERIFY).
+ */
+void
+vdev_file_init(void)
+{
+	vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16),
+	    minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC);
+
+	VERIFY(vdev_file_taskq);
+}
+
+/*
+ * Destroy the task queue created by vdev_file_init().
+ */
+void
+vdev_file_fini(void)
+{
+	taskq_destroy(vdev_file_taskq);
+}
+
+/*
+ * From userland we access disks just like files.
+ *
+ * Outside the kernel, the disk operations vector is therefore built
+ * entirely from the vdev_file_* implementations; only the reported
+ * vdev type differs (VDEV_TYPE_DISK).
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+	.vdev_op_open = vdev_file_open,
+	.vdev_op_close = vdev_file_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_file_io_start,
+	.vdev_op_io_done = vdev_file_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = vdev_file_hold,
+	.vdev_op_rele = vdev_file_rele,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
+	.vdev_op_leaf = B_TRUE			/* leaf vdev */
+};
+
+#endif
@@ -0,0 +1,253 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+
+/*
+ * One entry in the debug message log.
+ *
+ * Entries are allocated with room for the message text after the fixed
+ * fields; __zfs_dbgmsg() sizes them as sizeof (zfs_dbgmsg_t) plus the
+ * message length, so the one-byte zdm_msg array accounts for the
+ * terminating NUL.
+ */
+typedef struct zfs_dbgmsg {
+	procfs_list_node_t	zdm_node;	/* linkage in zfs_dbgmsgs */
+	time_t			zdm_timestamp;	/* seconds, when logged */
+	int			zdm_size;	/* bytes allocated for entry */
+	char			zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+/* The debug message log; its pl_lock protects the list and the size. */
+procfs_list_t zfs_dbgmsgs;
+/* Sum of zdm_size over all entries currently in the log. */
+int zfs_dbgmsg_size = 0;
+/* Upper bound on zfs_dbgmsg_size; older entries are purged beyond it. */
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+
+/*
+ * Internal ZFS debug messages are enabled by default.
+ *
+ * # Print debug messages
+ * cat /proc/spl/kstat/zfs/dbgmsg
+ *
+ * # Disable the kernel debug message log.
+ * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable
+ *
+ * # Clear the kernel debug message log.
+ * echo 0 >/proc/spl/kstat/zfs/dbgmsg
+ */
+int zfs_dbgmsg_enable = 1;
+
+/*
+ * procfs list callback: print the column header line.  Always
+ * returns 0.
+ */
+static int
+zfs_dbgmsg_show_header(struct seq_file *f)
+{
+	seq_printf(f, "%-12s %-8s\n", "timestamp", "message");
+	return (0);
+}
+
+/*
+ * procfs list callback: print one log entry as its timestamp followed
+ * by the message text.  'p' points at a zfs_dbgmsg_t.  Always
+ * returns 0.
+ */
+static int
+zfs_dbgmsg_show(struct seq_file *f, void *p)
+{
+	zfs_dbgmsg_t *entry = p;
+	u_longlong_t stamp = (u_longlong_t)entry->zdm_timestamp;
+
+	seq_printf(f, "%-12llu %-s\n", stamp, entry->zdm_msg);
+
+	return (0);
+}
+
+/*
+ * Free entries from the head of the log until its accounted size is no
+ * larger than 'max_size' or the list is empty.
+ */
+static void
+zfs_dbgmsg_purge(int max_size)
+{
+	zfs_dbgmsg_t *oldest;
+	int freed;
+
+	while (zfs_dbgmsg_size > max_size &&
+	    (oldest = list_remove_head(&zfs_dbgmsgs.pl_list)) != NULL) {
+		/* Read the size before the entry is released. */
+		freed = oldest->zdm_size;
+		kmem_free(oldest, freed);
+		zfs_dbgmsg_size -= freed;
+	}
+}
+
+/*
+ * procfs list callback: discard every entry in the log while holding
+ * its lock.  The 'procfs_list' argument is not used; the global
+ * zfs_dbgmsgs is operated on directly.  Always returns 0.
+ */
+static int
+zfs_dbgmsg_clear(procfs_list_t *procfs_list)
+{
+	mutex_enter(&zfs_dbgmsgs.pl_lock);
+	zfs_dbgmsg_purge(0);
+	mutex_exit(&zfs_dbgmsgs.pl_lock);
+	return (0);
+}
+
+/*
+ * Register the debug message log as the procfs list "dbgmsg" under
+ * "zfs" with mode 0600, wiring up the show, header and clear callbacks
+ * defined above.
+ */
+void
+zfs_dbgmsg_init(void)
+{
+	procfs_list_install("zfs",
+	    "dbgmsg",
+	    0600,
+	    &zfs_dbgmsgs,
+	    zfs_dbgmsg_show,
+	    zfs_dbgmsg_show_header,
+	    zfs_dbgmsg_clear,
+	    offsetof(zfs_dbgmsg_t, zdm_node));
+}
+
+/*
+ * Unregister the debug message log and free all remaining entries.
+ *
+ * NOTE(review): the purge here runs without pl_lock held, unlike
+ * zfs_dbgmsg_clear() -- presumably no other users remain once the list
+ * has been uninstalled; confirm.
+ */
+void
+zfs_dbgmsg_fini(void)
+{
+	procfs_list_uninstall(&zfs_dbgmsgs);
+	zfs_dbgmsg_purge(0);
+
+	/*
+	 * TODO - decide how to make this permanent
+	 */
+#ifdef _KERNEL
+	procfs_list_destroy(&zfs_dbgmsgs);
+#endif
+}
+
+/*
+ * Record that an error value was set at file:func:line.  A message is
+ * logged through __dprintf() only when ZFS_DEBUG_SET_ERROR is set in
+ * zfs_flags.
+ */
+void
+__set_error(const char *file, const char *func, int line, int err)
+{
+	/*
+	 * To enable this:
+	 *
+	 * $ echo 512 >/sys/module/zfs/parameters/zfs_flags
+	 */
+	if (zfs_flags & ZFS_DEBUG_SET_ERROR) {
+		/*
+		 * 'err' is an int; cast it so the variadic argument has the
+		 * type that the %lu conversion expects.
+		 */
+		__dprintf(B_FALSE, file, func, line, "error %lu",
+		    (unsigned long)err);
+	}
+}
+
+/*
+ * Append a copy of the NUL-terminated string 'buf' to the debug message
+ * log, stamped with the current time, then trim the log back down to
+ * zfs_dbgmsg_maxsize (treated as 0 when negative).
+ */
+void
+__zfs_dbgmsg(char *buf)
+{
+	zfs_dbgmsg_t *entry;
+	int bytes;
+
+	/* zdm_msg[1] in the struct already covers the terminating NUL. */
+	bytes = sizeof (zfs_dbgmsg_t) + strlen(buf);
+	entry = kmem_zalloc(bytes, KM_SLEEP);
+	entry->zdm_timestamp = gethrestime_sec();
+	entry->zdm_size = bytes;
+	strcpy(entry->zdm_msg, buf);
+
+	mutex_enter(&zfs_dbgmsgs.pl_lock);
+	procfs_list_add(&zfs_dbgmsgs, entry);
+	zfs_dbgmsg_size += bytes;
+	zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+	mutex_exit(&zfs_dbgmsgs.pl_lock);
+}
+
+#ifdef _KERNEL
+
+/*
+ * Format a debug message and publish it.
+ *
+ * The message is built in a 1024 byte scratch buffer as
+ * "<prefix><file>:<line>:<func>(): <formatted text>", where <file> has
+ * its directory components removed and <prefix> is "dprintf: " when
+ * 'dprint' is set.  For dprintf messages a single trailing newline is
+ * stripped.  The result is passed to the zfs__dprintf trace point and
+ * appended to the debug message log.
+ */
+void
+__dprintf(boolean_t dprint, const char *file, const char *func,
+    int line, const char *fmt, ...)
+{
+	const char *shortname;
+	char *prefix;
+	char *buf;
+	va_list adx;
+	size_t size = 1024;
+	int i;
+
+	if (dprint)
+		prefix = "dprintf: ";
+	else
+		prefix = "";
+
+	buf = kmem_alloc(size, KM_SLEEP);
+
+	/*
+	 * Keep only what follows the last '/' in the file name, or the
+	 * whole name when it contains none.
+	 */
+	shortname = strrchr(file, '/');
+	shortname = (shortname != NULL) ? shortname + 1 : file;
+
+	i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, shortname, line,
+	    func);
+
+	/* Append the caller's text only if the location prefix fit. */
+	if (i < size) {
+		va_start(adx, fmt);
+		(void) vsnprintf(buf + i, size - i, fmt, adx);
+		va_end(adx);
+	}
+
+	/*
+	 * dprintf logs lose one trailing newline.
+	 */
+	if (dprint && buf[0] != '\0') {
+		char *last = &buf[strlen(buf) - 1];
+
+		if (*last == '\n')
+			*last = '\0';
+	}
+
+	/*
+	 * To get this data enable the zfs__dprintf trace point as shown:
+	 *
+	 * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer
+	 * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable
+	 * $ echo 0 > /sys/kernel/debug/tracing/trace
+	 *
+	 * # Dump the ring buffer.
+	 * $ cat /sys/kernel/debug/tracing/trace
+	 */
+	DTRACE_PROBE1(zfs__dprintf, char *, buf);
+
+	/*
+	 * To get this data:
+	 *
+	 * $ cat /proc/spl/kstat/zfs/dbgmsg
+	 *
+	 * To clear the buffer:
+	 * $ echo 0 > /proc/spl/kstat/zfs/dbgmsg
+	 */
+	__zfs_dbgmsg(buf);
+
+	kmem_free(buf, size);
+}
+
+#else
+
+/*
+ * Dump the whole debug message log to standard output, one message per
+ * line, bracketed by "ZFS_DBGMSG(<tag>) START:" and
+ * "ZFS_DBGMSG(<tag>) END" marker lines.
+ */
+void
+zfs_dbgmsg_print(const char *tag)
+{
+	ssize_t ret __attribute__((unused));
+	zfs_dbgmsg_t *zdm;
+
+	/*
+	 * We use write() in this function instead of printf()
+	 * so it is safe to call from a signal handler.
+	 */
+	ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11);
+	ret = write(STDOUT_FILENO, tag, strlen(tag));
+	ret = write(STDOUT_FILENO, ") START:\n", 9);
+
+	mutex_enter(&zfs_dbgmsgs.pl_lock);
+
+	zdm = list_head(&zfs_dbgmsgs.pl_list);
+	while (zdm != NULL) {
+		ret = write(STDOUT_FILENO, zdm->zdm_msg,
+		    strlen(zdm->zdm_msg));
+		ret = write(STDOUT_FILENO, "\n", 1);
+		zdm = list_next(&zfs_dbgmsgs.pl_list, zdm);
+	}
+
+	ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11);
+	ret = write(STDOUT_FILENO, tag, strlen(tag));
+	ret = write(STDOUT_FILENO, ") END\n", 6);
+
+	mutex_exit(&zfs_dbgmsgs.pl_lock);
+}
+#endif /* _KERNEL */
+
+/*
+ * Kernel builds expose the two debug log tunables as integer module
+ * parameters with mode 0644.
+ */
+#ifdef _KERNEL
+module_param(zfs_dbgmsg_enable, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log");
+
+module_param(zfs_dbgmsg_maxsize, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size");
+#endif
@@ -0,0 +1,661 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_sysfs.h>
+#include <sys/kmem.h>
+#include <sys/fs/zfs.h>
+#include <linux/kobject.h>
+
+#include "zfs_prop.h"
+
+#if !defined(_KERNEL)
+#error kernel builds only
+#endif
+
+/*
+ * ZFS Module sysfs support
+ *
+ * This extends our sysfs '/sys/module/zfs' entry to include feature
+ * and property attributes. The primary consumer of this information
+ * is user processes, like the zfs CLI, that need to know what the
+ * current loaded ZFS module supports. The libzfs binary will consult
+ * this information when instantiating the zfs|zpool property tables
+ * and the pool features table.
+ *
+ * The added top-level directories are:
+ * /sys/module/zfs
+ *		├── features.kernel
+ *		├── features.pool
+ *		├── properties.dataset
+ *		└── properties.pool
+ *
+ * The local interface for the zfs kobjects includes:
+ *	zfs_kobj_init()
+ *	zfs_kobj_add()
+ *	zfs_kobj_release()
+ *	zfs_kobj_add_attr()
+ *	zfs_kobj_fini()
+ */
+
+/*
+ * A zfs_mod_kobj_t represents a zfs kobject under '/sys/module/zfs'
+ */
+struct zfs_mod_kobj;
+typedef struct zfs_mod_kobj zfs_mod_kobj_t;
+
+/*
+ * The tables marked "allocated" are set up by zfs_kobj_init() and freed
+ * by zfs_kobj_release().  zfs_kobj_release() recovers the enclosing
+ * zfs_mod_kobj_t from the embedded kobject with container_of().
+ */
+struct zfs_mod_kobj {
+	/* embedded kobject registered by zfs_kobj_add() */
+	struct kobject		zko_kobj;
+	/* per-object ktype; carries default attrs, sysfs ops and release */
+	struct kobj_type	zko_kobj_type;
+	/* per-object sysfs ops; only the show callback is filled in */
+	struct sysfs_ops	zko_sysfs_ops;
+	/* number of entries in zko_attr_list */
+	size_t			zko_attr_count;
+	struct attribute	*zko_attr_list;		/* allocated */
+	/* NULL terminated array of pointers into zko_attr_list */
+	struct attribute	**zko_default_attrs;	/* allocated */
+	/* number of entries in zko_children */
+	size_t			zko_child_count;
+	zfs_mod_kobj_t		*zko_children;		/* allocated */
+};
+
+/* Size in bytes of a table holding 'cnt' struct attribute entries */
+#define	ATTR_TABLE_SIZE(cnt)	(sizeof (struct attribute) * (cnt))
+/*
+ * Size in bytes of the default attribute pointer array.
+ * Note +1 for NULL terminator slot.  The argument is parenthesized so
+ * that an expression argument is evaluated before the +1 is applied.
+ */
+#define	DEFAULT_ATTR_SIZE(cnt)	(sizeof (struct attribute *) * ((cnt) + 1))
+/* Size in bytes of a table holding 'cnt' child zfs_mod_kobj_t entries */
+#define	CHILD_TABLE_SIZE(cnt)	(sizeof (zfs_mod_kobj_t) * (cnt))
+
+/*
+ * These are the top-level kobjects under '/sys/module/zfs/'
+ */
+static zfs_mod_kobj_t kernel_features_kobj;	/* features.kernel */
+static zfs_mod_kobj_t pool_features_kobj;	/* features.pool */
+static zfs_mod_kobj_t dataset_props_kobj;	/* properties.dataset */
+static zfs_mod_kobj_t pool_props_kobj;		/* properties.pool */
+
+/*
+ * The show function is used to provide the content
+ * of an attribute into a PAGE_SIZE buffer.
+ *
+ * It receives the kobject being read, the attribute being read and the
+ * output buffer.  The show functions in this file return the length
+ * reported by snprintf(), or 0 when the attribute is not recognized.
+ */
+typedef ssize_t	(*sysfs_show_func)(struct kobject *, struct attribute *,
+    char *);
+
+/*
+ * Tear down a zfs kobject together with all of its children.  Children
+ * are finalized first (recursively); the object itself is then removed
+ * with kobject_del() and its reference is dropped with kobject_put().
+ */
+static void
+zfs_kobj_fini(zfs_mod_kobj_t *zkobj)
+{
+	const size_t nchildren = zkobj->zko_child_count;
+
+	/* finalize the child kobjects, if there are any */
+	if (nchildren > 0) {
+		ASSERT(zkobj->zko_children);
+		for (size_t c = 0; c < nchildren; c++) {
+			zfs_kobj_fini(&zkobj->zko_children[c]);
+		}
+	}
+
+	/* kobject_put() will call zfs_kobj_release() to release memory */
+	kobject_del(&zkobj->zko_kobj);
+	kobject_put(&zkobj->zko_kobj);
+}
+
+/*
+ * Release callback installed in the ktype by zfs_kobj_init().  Frees the
+ * attribute table, the default attribute array and the child table that
+ * belong to the enclosing zfs_mod_kobj_t, and resets the matching
+ * pointers and counts so that a second call finds nothing to free.
+ */
+static void
+zfs_kobj_release(struct kobject *kobj)
+{
+	zfs_mod_kobj_t *zk = container_of(kobj, zfs_mod_kobj_t, zko_kobj);
+	const size_t nattrs = zk->zko_attr_count;
+	const size_t nchildren = zk->zko_child_count;
+
+	/* The attribute table only exists when attributes were requested. */
+	if (zk->zko_attr_list != NULL) {
+		ASSERT3S(nattrs, !=, 0);
+		kmem_free(zk->zko_attr_list, ATTR_TABLE_SIZE(nattrs));
+		zk->zko_attr_list = NULL;
+	}
+
+	/* The pointer array is sized from the same attribute count. */
+	if (zk->zko_default_attrs != NULL) {
+		kmem_free(zk->zko_default_attrs, DEFAULT_ATTR_SIZE(nattrs));
+		zk->zko_default_attrs = NULL;
+	}
+
+	if (nchildren != 0) {
+		ASSERT(zk->zko_children);
+
+		kmem_free(zk->zko_children, CHILD_TABLE_SIZE(nchildren));
+		zk->zko_children = NULL;
+		zk->zko_child_count = 0;
+	}
+
+	zk->zko_attr_count = 0;
+}
+
+/*
+ * Fall back to a no-op when the kernel headers do not provide a
+ * sysfs_attr_init() macro.
+ */
+#ifndef sysfs_attr_init
+#define	sysfs_attr_init(attr) do {} while (0)
+#endif
+
+/*
+ * Fill in attribute slot 'attr_num' of a zfs kobject: the attribute is
+ * given the name 'attr_name' and mode 0444, and the matching entry of
+ * the default attribute array is pointed at it.  attr_num must be less
+ * than the attribute count the object was initialized with.
+ */
+static void
+zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name)
+{
+	struct attribute *slot;
+
+	VERIFY3U(attr_num, <, zkobj->zko_attr_count);
+	ASSERT(zkobj->zko_attr_list);
+	ASSERT(zkobj->zko_default_attrs);
+
+	slot = &zkobj->zko_attr_list[attr_num];
+	slot->name = attr_name;
+	slot->mode = 0444;
+	zkobj->zko_default_attrs[attr_num] = slot;
+	sysfs_attr_init(slot);
+}
+
+/*
+ * Prepare a zfs kobject for registration: allocate the attribute table
+ * (when attr_cnt > 0), the NULL terminated default attribute array and
+ * the child table (when child_cnt > 0), then wire up the ktype with the
+ * given show function and zfs_kobj_release() as its release callback.
+ *
+ * Returns 0 on success or ENOMEM when an allocation returns NULL; in
+ * that case the tables allocated so far are freed again.
+ */
+static int
+zfs_kobj_init(zfs_mod_kobj_t *zkobj, int attr_cnt, int child_cnt,
+    sysfs_show_func show_func)
+{
+	/*
+	 * Initialize object's attributes. Count can be zero.
+	 */
+	if (attr_cnt > 0) {
+		zkobj->zko_attr_list = kmem_zalloc(ATTR_TABLE_SIZE(attr_cnt),
+		    KM_SLEEP);
+		if (zkobj->zko_attr_list == NULL)
+			return (ENOMEM);
+	}
+
+	/* this will always have at least one slot for NULL termination */
+	zkobj->zko_default_attrs = kmem_zalloc(DEFAULT_ATTR_SIZE(attr_cnt),
+	    KM_SLEEP);
+	if (zkobj->zko_default_attrs == NULL)
+		goto free_attr_list;
+
+	zkobj->zko_attr_count = attr_cnt;
+	zkobj->zko_kobj_type.default_attrs = zkobj->zko_default_attrs;
+
+	if (child_cnt > 0) {
+		zkobj->zko_children = kmem_zalloc(CHILD_TABLE_SIZE(child_cnt),
+		    KM_SLEEP);
+		if (zkobj->zko_children == NULL)
+			goto free_default_attrs;
+		zkobj->zko_child_count = child_cnt;
+	}
+
+	zkobj->zko_sysfs_ops.show = show_func;
+	zkobj->zko_kobj_type.sysfs_ops = &zkobj->zko_sysfs_ops;
+	zkobj->zko_kobj_type.release = zfs_kobj_release;
+
+	return (0);
+
+free_default_attrs:
+	/* the default attribute array is known to be allocated here */
+	kmem_free(zkobj->zko_default_attrs, DEFAULT_ATTR_SIZE(attr_cnt));
+free_attr_list:
+	if (zkobj->zko_attr_list != NULL) {
+		kmem_free(zkobj->zko_attr_list, ATTR_TABLE_SIZE(attr_cnt));
+	}
+	return (ENOMEM);
+}
+
+/*
+ * Initialize the embedded kobject with the object's own ktype and add it
+ * below 'parent' under the given name.  Returns the kobject_add() result.
+ *
+ * kobject_add() takes a printf-style format, so the name is passed as
+ * the argument of a "%s" format rather than as the format itself.
+ */
+static int
+zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name)
+{
+	/* zko_default_attrs must be NULL terminated */
+	ASSERT(zkobj->zko_default_attrs != NULL);
+	ASSERT(zkobj->zko_default_attrs[zkobj->zko_attr_count] == NULL);
+
+	kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type);
+	return (kobject_add(&zkobj->zko_kobj, parent, "%s", name));
+}
+
+/*
+ * Each zfs property has these common attributes.  zprop_to_kobj() adds
+ * the first p2k_attr_count names of this table to every property
+ * kobject, so the order of the entries matters.
+ */
+static const char *zprop_attrs[]  = {
+	"type",
+	"readonly",
+	"setonce",
+	"visible",
+	"values",
+	"default",
+	"datasets"	/* zfs properties only */
+};
+
+/* dataset properties get every attribute */
+#define	ZFS_PROP_ATTR_COUNT	ARRAY_SIZE(zprop_attrs)
+/* pool properties omit the trailing "datasets" attribute */
+#define	ZPOOL_PROP_ATTR_COUNT	(ZFS_PROP_ATTR_COUNT - 1)
+
+/*
+ * Text shown for the "type" attribute; zprop_sysfs_show() indexes this
+ * table with the property's pd_proptype.
+ */
+static const char *zprop_types[]  = {
+	"number",
+	"string",
+	"index",
+};
+
+/* Pairs a dataset type bit with the name shown in "datasets" */
+typedef struct zfs_type_map {
+	zfs_type_t	ztm_type;
+	const char	*ztm_name;
+} zfs_type_map_t;
+
+/* Dataset types listed, in this order, by the "datasets" attribute */
+static zfs_type_map_t type_map[] = {
+	{ZFS_TYPE_FILESYSTEM,	"filesystem"},
+	{ZFS_TYPE_SNAPSHOT,	"snapshot"},
+	{ZFS_TYPE_VOLUME,	"volume"},
+	{ZFS_TYPE_BOOKMARK,	"bookmark"}
+};
+
+/*
+ * Show the content for a zfs property attribute
+ *
+ * 'attr_name' selects which attribute of 'property' is formatted into
+ * 'buf', whose size is 'buflen'.  Returns the number of bytes stored in
+ * 'buf' (not counting the terminating NUL), or 0 when the attribute name
+ * or the property type is not handled here.
+ *
+ * scnprintf() is used instead of snprintf() because it returns the
+ * number of bytes actually stored.  snprintf() returns the length the
+ * output would have had without truncation, so accumulating its result
+ * could move 'len' past 'buflen', at which point 'buflen - len' wraps
+ * around and the reported length no longer matches the buffer content.
+ */
+static ssize_t
+zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property,
+    char *buf, size_t buflen)
+{
+	const char *show_str;
+	char number[32];
+
+	/* For dataset properties list the dataset types that apply */
+	if (strcmp(attr_name, "datasets") == 0 &&
+	    property->pd_types != ZFS_TYPE_POOL) {
+		int len = 0;
+
+		for (int i = 0; i < ARRAY_SIZE(type_map); i++) {
+			if (type_map[i].ztm_type & property->pd_types)  {
+				len += scnprintf(buf + len, buflen - len,
+				    "%s ", type_map[i].ztm_name);
+			}
+		}
+		len += scnprintf(buf + len, buflen - len, "\n");
+		return (len);
+	}
+
+	if (strcmp(attr_name, "type") == 0) {
+		show_str = zprop_types[property->pd_proptype];
+	} else if (strcmp(attr_name, "readonly") == 0) {
+		show_str = property->pd_attr == PROP_READONLY ? "1" : "0";
+	} else if (strcmp(attr_name, "setonce") == 0) {
+		show_str = property->pd_attr == PROP_ONETIME ? "1" : "0";
+	} else if (strcmp(attr_name, "visible") == 0) {
+		show_str = property->pd_visible ? "1" : "0";
+	} else if (strcmp(attr_name, "values") == 0) {
+		show_str = property->pd_values ? property->pd_values : "";
+	} else if (strcmp(attr_name, "default") == 0) {
+		/* the default is rendered according to the property type */
+		switch (property->pd_proptype) {
+		case PROP_TYPE_NUMBER:
+			(void) snprintf(number, sizeof (number), "%llu",
+			    (u_longlong_t)property->pd_numdefault);
+			show_str = number;
+			break;
+		case PROP_TYPE_STRING:
+			show_str = property->pd_strdefault ?
+			    property->pd_strdefault : "";
+			break;
+		case PROP_TYPE_INDEX:
+			/* an index without a name is shown as empty */
+			if (zprop_index_to_string(property->pd_propnum,
+			    property->pd_numdefault, &show_str,
+			    property->pd_types) != 0) {
+				show_str = "";
+			}
+			break;
+		default:
+			return (0);
+		}
+	} else {
+		/* also reached for "datasets" on a pool property */
+		return (0);
+	}
+
+	return (scnprintf(buf, buflen, "%s\n", show_str));
+}
+
+/*
+ * sysfs show callback for dataset property attributes.  The kobject
+ * name is mapped to a property with zfs_name_to_prop() and the matching
+ * dataset property table entry is formatted by zprop_sysfs_show().
+ */
+static ssize_t
+dataset_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	zfs_prop_t prop = zfs_name_to_prop(kobject_name(kobj));
+	zprop_desc_t *table = zfs_prop_get_table();
+
+	ASSERT3U(prop, <, ZFS_NUM_PROPS);
+
+	return (zprop_sysfs_show(attr->name, &table[prop], buf, PAGE_SIZE));
+}
+
+/*
+ * sysfs show callback for pool property attributes.  The kobject name
+ * is mapped to a property with zpool_name_to_prop() and the matching
+ * pool property table entry is formatted by zprop_sysfs_show().
+ */
+static ssize_t
+pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	zpool_prop_t prop = zpool_name_to_prop(kobject_name(kobj));
+	zprop_desc_t *table = zpool_prop_get_table();
+
+	ASSERT3U(prop, <, ZPOOL_NUM_PROPS);
+
+	return (zprop_sysfs_show(attr->name, &table[prop], buf, PAGE_SIZE));
+}
+
+/*
+ * ZFS kernel feature attributes for '/sys/module/zfs/features.kernel'
+ *
+ * This list is intended for kernel features that don't have a pool feature
+ * association or that extend existing user kernel interfaces.
+ *
+ * A user process can easily check if the running zfs kernel module
+ * supports the new feature.
+ */
+/* One kobject is created under features.kernel for each name below */
+static const char *zfs_kernel_features[] = {
+	/* --> Add new kernel features here */
+	"com.delphix:vdev_initialize",
+	"org.zfsonlinux:vdev_trim",
+};
+
+/* Number of entries in zfs_kernel_features[] */
+#define	KERNEL_FEATURE_COUNT	ARRAY_SIZE(zfs_kernel_features)
+
+/*
+ * sysfs show callback for kernel feature kobjects.  The "supported"
+ * attribute reads back as "yes"; any other attribute yields no output.
+ */
+static ssize_t
+kernel_feature_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	if (strcmp(attr->name, "supported") != 0)
+		return (0);
+
+	return (snprintf(buf, PAGE_SIZE, "yes\n"));
+}
+
+/*
+ * Create the kobject for one kernel feature in child slot 'slot' of
+ * 'parent' and give it a single "supported" attribute.  Failures are
+ * not reported to the caller; when adding the kobject fails, its
+ * tables are released again.
+ */
+static void
+kernel_feature_to_kobj(zfs_mod_kobj_t *parent, int slot, const char *name)
+{
+	zfs_mod_kobj_t *child = &parent->zko_children[slot];
+
+	ASSERT3U(slot, <, KERNEL_FEATURE_COUNT);
+	ASSERT(name);
+
+	if (zfs_kobj_init(child, 1, 0, kernel_feature_show) != 0)
+		return;
+
+	zfs_kobj_add_attr(child, 0, "supported");
+
+	if (zfs_kobj_add(child, &parent->zko_kobj, name) != 0)
+		zfs_kobj_release(&child->zko_kobj);
+}
+
+/*
+ * Build the kernel features directory: a parent kobject with one child
+ * kobject per entry of zfs_kernel_features[].  Returns 0 on success or
+ * the error from setting up the parent kobject.
+ */
+static int
+zfs_kernel_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent)
+{
+	int rc;
+
+	/*
+	 * Create a parent kobject to host kernel features.
+	 *
+	 * '/sys/module/zfs/features.kernel'
+	 */
+	rc = zfs_kobj_init(zfs_kobj, 0, KERNEL_FEATURE_COUNT,
+	    kernel_feature_show);
+	if (rc != 0)
+		return (rc);
+
+	rc = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_KERNEL_FEATURES);
+	if (rc != 0) {
+		zfs_kobj_release(&zfs_kobj->zko_kobj);
+		return (rc);
+	}
+
+	/*
+	 * Now create a kobject for each feature.
+	 *
+	 * '/sys/module/zfs/features.kernel/<feature>'
+	 */
+	for (int slot = 0; slot < KERNEL_FEATURE_COUNT; slot++) {
+		kernel_feature_to_kobj(zfs_kobj, slot,
+		    zfs_kernel_features[slot]);
+	}
+
+	return (0);
+}
+
+/*
+ * Each pool feature has these common attributes.  Every name listed
+ * here is handled by pool_feature_show().
+ */
+static const char *pool_feature_attrs[]  = {
+	"description",
+	"guid",
+	"uname",
+	"readonly_compatible",
+	"required_for_mos",
+	"activate_on_enable",
+	"per_dataset"
+};
+
+/* Number of attributes attached to every pool feature kobject */
+#define	ZPOOL_FEATURE_ATTR_COUNT	ARRAY_SIZE(pool_feature_attrs)
+
+/*
+ * Show the content for the given zfs pool feature attribute
+ */
+/*
+ * sysfs show callback for pool feature kobjects.  The kobject name is
+ * looked up as a feature guid; the selected attribute is then taken
+ * from the feature table entry, with flag attributes shown as "1" or
+ * "0".  Returns 0 when the guid or the attribute is unknown, or when
+ * the selected string is NULL.
+ */
+static ssize_t
+pool_feature_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	const char *attr_name = attr->name;
+	const char *text = NULL;
+	spa_feature_t fid;
+
+	if (zfeature_lookup_guid(kobject_name(kobj), &fid) != 0)
+		return (0);
+
+	ASSERT3U(fid, <, SPA_FEATURES);
+
+	zfeature_flags_t flags = spa_feature_table[fid].fi_flags;
+
+	if (strcmp(attr_name, "description") == 0)
+		text = spa_feature_table[fid].fi_desc;
+	else if (strcmp(attr_name, "guid") == 0)
+		text = spa_feature_table[fid].fi_guid;
+	else if (strcmp(attr_name, "uname") == 0)
+		text = spa_feature_table[fid].fi_uname;
+	else if (strcmp(attr_name, "readonly_compatible") == 0)
+		text = (flags & ZFEATURE_FLAG_READONLY_COMPAT) ? "1" : "0";
+	else if (strcmp(attr_name, "required_for_mos") == 0)
+		text = (flags & ZFEATURE_FLAG_MOS) ? "1" : "0";
+	else if (strcmp(attr_name, "activate_on_enable") == 0)
+		text = (flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? "1" : "0";
+	else if (strcmp(attr_name, "per_dataset") == 0)
+		text = (flags & ZFEATURE_FLAG_PER_DATASET) ? "1" : "0";
+
+	/* nothing to show for an unknown attribute or a NULL string */
+	if (text == NULL)
+		return (0);
+
+	return (snprintf(buf, PAGE_SIZE, "%s\n", text));
+}
+
+/*
+ * Create the kobject for pool feature 'fid' in the matching child slot
+ * of 'parent' and attach every attribute from pool_feature_attrs[].
+ * Failures are not reported to the caller; when adding the kobject
+ * fails, its tables are released again.
+ */
+static void
+pool_feature_to_kobj(zfs_mod_kobj_t *parent, spa_feature_t fid,
+    const char *name)
+{
+	zfs_mod_kobj_t *child = &parent->zko_children[fid];
+
+	ASSERT3U(fid, <, SPA_FEATURES);
+	ASSERT(name);
+
+	if (zfs_kobj_init(child, ZPOOL_FEATURE_ATTR_COUNT, 0,
+	    pool_feature_show) != 0)
+		return;
+
+	for (int a = 0; a < ZPOOL_FEATURE_ATTR_COUNT; a++) {
+		zfs_kobj_add_attr(child, a, pool_feature_attrs[a]);
+	}
+
+	if (zfs_kobj_add(child, &parent->zko_kobj, name) != 0)
+		zfs_kobj_release(&child->zko_kobj);
+}
+
+/*
+ * Build the pool features directory: a parent kobject with one child
+ * kobject per entry of spa_feature_table[], named after the feature
+ * guid.  Returns 0 on success or the error from setting up the parent.
+ */
+static int
+zfs_pool_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent)
+{
+	int rc;
+
+	/*
+	 * Create a parent kobject to host pool features.
+	 *
+	 * '/sys/module/zfs/features.pool'
+	 */
+	rc = zfs_kobj_init(zfs_kobj, 0, SPA_FEATURES, pool_feature_show);
+	if (rc != 0)
+		return (rc);
+
+	rc = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_POOL_FEATURES);
+	if (rc != 0) {
+		zfs_kobj_release(&zfs_kobj->zko_kobj);
+		return (rc);
+	}
+
+	/*
+	 * Now create a kobject for each feature.
+	 *
+	 * '/sys/module/zfs/features.pool/<feature>'
+	 */
+	for (spa_feature_t fid = 0; fid < SPA_FEATURES; fid++) {
+		pool_feature_to_kobj(zfs_kobj, fid,
+		    spa_feature_table[fid].fi_guid);
+	}
+
+	return (0);
+}
+
+/*
+ * Context handed to zprop_to_kobj() through zprop_iter_common().
+ */
+typedef struct prop_to_kobj_arg {
+	zprop_desc_t	*p2k_table;	/* property table to read names from */
+	zfs_mod_kobj_t	*p2k_parent;	/* parent whose children are filled */
+	sysfs_show_func	p2k_show_func;	/* show callback for each property */
+	int		p2k_attr_count;	/* leading zprop_attrs[] entries used */
+} prop_to_kobj_arg_t;
+
+/*
+ * Property iteration callback: create the kobject for property 'prop'
+ * in the matching child slot of the parent named in 'args' (a
+ * prop_to_kobj_arg_t) and attach the first p2k_attr_count entries of
+ * zprop_attrs[].  Always returns ZPROP_CONT, including when setting up
+ * the kobject fails.
+ */
+static int
+zprop_to_kobj(int prop, void *args)
+{
+	prop_to_kobj_arg_t *ctx = args;
+	zfs_mod_kobj_t *parent = ctx->p2k_parent;
+	zfs_mod_kobj_t *child = &parent->zko_children[prop];
+	const char *name = ctx->p2k_table[prop].pd_name;
+
+	ASSERT(name);
+
+	if (zfs_kobj_init(child, ctx->p2k_attr_count, 0,
+	    ctx->p2k_show_func) != 0)
+		return (ZPROP_CONT);
+
+	for (int a = 0; a < ctx->p2k_attr_count; a++) {
+		zfs_kobj_add_attr(child, a, zprop_attrs[a]);
+	}
+
+	/* on failure give the freshly allocated tables back */
+	if (zfs_kobj_add(child, &parent->zko_kobj, name) != 0)
+		zfs_kobj_release(&child->zko_kobj);
+
+	return (ZPROP_CONT);
+}
+
+/*
+ * Build a properties directory.  For ZFS_TYPE_POOL the pool property
+ * table is used, for any other type the dataset property table.  A
+ * parent kobject is created first, then zprop_iter_common() is used to
+ * create one child kobject per property.  Returns 0 on success or the
+ * error from setting up the parent kobject.
+ */
+static int
+zfs_sysfs_properties_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent,
+    zfs_type_t type)
+{
+	prop_to_kobj_arg_t context;
+	const char *name;
+	int nprops;
+	int err;
+
+	/* Select the table, attribute count and show callback by type. */
+	if (type == ZFS_TYPE_POOL) {
+		name = ZFS_SYSFS_POOL_PROPERTIES;
+		nprops = ZPOOL_NUM_PROPS;
+		context.p2k_table = zpool_prop_get_table();
+		context.p2k_attr_count = ZPOOL_PROP_ATTR_COUNT;
+		context.p2k_show_func = pool_property_show;
+	} else {
+		name = ZFS_SYSFS_DATASET_PROPERTIES;
+		nprops = ZFS_NUM_PROPS;
+		context.p2k_table = zfs_prop_get_table();
+		context.p2k_attr_count = ZFS_PROP_ATTR_COUNT;
+		context.p2k_show_func = dataset_property_show;
+	}
+	context.p2k_parent = zfs_kobj;
+
+	/*
+	 * Create a parent kobject to host properties.
+	 *
+	 * '/sys/module/zfs/properties.<type>'
+	 */
+	err = zfs_kobj_init(zfs_kobj, 0, nprops, context.p2k_show_func);
+	if (err != 0)
+		return (err);
+
+	err = zfs_kobj_add(zfs_kobj, parent, name);
+	if (err != 0) {
+		zfs_kobj_release(&zfs_kobj->zko_kobj);
+		return (err);
+	}
+
+	/*
+	 * Create a kobject for each property.
+	 *
+	 * '/sys/module/zfs/properties.<type>/<property>'
+	 */
+	(void) zprop_iter_common(zprop_to_kobj, &context, B_TRUE,
+	    B_FALSE, type);
+
+	return (0);
+}
+
+/*
+ * Create the four top-level zfs sysfs directories (kernel features,
+ * pool features, pool properties, dataset properties).  Errors are not
+ * reported to the caller: when one directory cannot be set up, the
+ * directories created before it are finalized again and the function
+ * returns.
+ */
+void
+zfs_sysfs_init(void)
+{
+	struct kobject *parent;
+	/*
+	 * When zfs is built into the kernel a "zfs" kobject is created
+	 * under fs_kobj; otherwise the module's own kobject is the parent.
+	 *
+	 * NOTE(review): the kobject returned by kobject_create_and_add()
+	 * is only held in this local, so nothing visible in this file
+	 * drops its reference later - confirm.
+	 */
+#if defined(CONFIG_ZFS) && !defined(CONFIG_ZFS_MODULE)
+	parent = kobject_create_and_add("zfs", fs_kobj);
+#else
+	parent = &(((struct module *)(THIS_MODULE))->mkobj).kobj;
+#endif
+	int err;
+
+	if (parent == NULL)
+		return;
+
+	err = zfs_kernel_features_init(&kernel_features_kobj, parent);
+	if (err)
+		return;
+
+	/* from here on, undo the earlier directories on failure */
+	err = zfs_pool_features_init(&pool_features_kobj, parent);
+	if (err) {
+		zfs_kobj_fini(&kernel_features_kobj);
+		return;
+	}
+
+	err = zfs_sysfs_properties_init(&pool_props_kobj, parent,
+	    ZFS_TYPE_POOL);
+	if (err) {
+		zfs_kobj_fini(&kernel_features_kobj);
+		zfs_kobj_fini(&pool_features_kobj);
+		return;
+	}
+
+	/* any type other than ZFS_TYPE_POOL selects dataset properties */
+	err = zfs_sysfs_properties_init(&dataset_props_kobj, parent,
+	    ZFS_TYPE_FILESYSTEM);
+	if (err) {
+		zfs_kobj_fini(&kernel_features_kobj);
+		zfs_kobj_fini(&pool_features_kobj);
+		zfs_kobj_fini(&pool_props_kobj);
+		return;
+	}
+}
+
+/*
+ * Finalize the four top-level zfs sysfs kobjects.
+ */
+void
+zfs_sysfs_fini(void)
+{
+	zfs_mod_kobj_t *top_level[] = {
+		&kernel_features_kobj,
+		&pool_features_kobj,
+		&dataset_props_kobj,
+		&pool_props_kobj,
+	};
+
+	/*
+	 * Remove top-level kobjects; each will remove any children kobjects
+	 */
+	for (int k = 0; k < ARRAY_SIZE(top_level); k++)
+		zfs_kobj_fini(top_level[k]);
+}
@@ -0,0 +1,572 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ *   Rohan Puri <rohan.puri15@gmail.com>
+ *   Brian Behlendorf <behlendorf1@llnl.gov>
+ */
+
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+/*
+ * Common open routine.  Disallow any write access.
+ */
+/* ARGSUSED */
+static int
+zpl_common_open(struct inode *ip, struct file *filp)
+{
+	/* Opens that ask for write access are refused outright. */
+	const int wants_write = (filp->f_mode & FMODE_WRITE) != 0;
+
+	return (wants_write ? -EACCES : generic_file_open(ip, filp));
+}
+
+/*
+ * Get root directory contents.
+ */
+static int
+zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+
+	ZFS_ENTER(zfsvfs);
+
+	/* The dot entries come first. */
+	if (!zpl_dir_emit_dots(filp, ctx))
+		goto done;
+
+	/* Position 2 is the snapshot directory entry. */
+	if (ctx->pos == 2) {
+		if (!zpl_dir_emit(ctx, ZFS_SNAPDIR_NAME,
+		    strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR))
+			goto done;
+
+		ctx->pos++;
+	}
+
+	/* Position 3 is the shares directory entry. */
+	if (ctx->pos == 3) {
+		if (!zpl_dir_emit(ctx, ZFS_SHAREDIR_NAME,
+		    strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR))
+			goto done;
+
+		ctx->pos++;
+	}
+done:
+	ZFS_EXIT(zfsvfs);
+
+	/* A failed emit only stops the listing; it is not an error. */
+	return (0);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+/*
+ * readdir entry point used when neither iterate interface is available:
+ * wraps the arguments in a directory context, runs zpl_root_iterate()
+ * and stores the resulting position back in the file.
+ */
+static int
+zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	zpl_dir_context_t dctx =
+	    ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+	const int rc = zpl_root_iterate(filp, &dctx);
+
+	filp->f_pos = dctx.pos;
+
+	return (rc);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+/*
+ * Get root directory attributes.
+ */
+/* ARGSUSED */
+static int
+zpl_root_getattr_impl(const struct path *path, struct kstat *stat,
+    u32 request_mask, unsigned int query_flags)
+{
+	struct inode *root_ip = path->dentry->d_inode;
+
+	/* Start from the generic attributes of the inode ... */
+	generic_fillattr(root_ip, stat);
+	/* ... and report the current time as the access time. */
+	stat->atime = current_time(root_ip);
+
+	return (0);
+}
+ZPL_GETATTR_WRAPPER(zpl_root_getattr);
+
+/*
+ * Look up a name in the '.zfs' control directory.  A name that does not
+ * exist is spliced in with a NULL inode; any other failure is returned
+ * as an error pointer.
+ */
+static struct dentry *
+#ifdef HAVE_LOOKUP_NAMEIDATA
+zpl_root_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd)
+#else
+zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags)
+#endif
+{
+	cred_t *cr = CRED();
+	struct inode *ip;
+	int error;
+
+	crhold(cr);
+	error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL);
+	ASSERT3S(error, <=, 0);
+	crfree(cr);
+
+	if (error == 0)
+		return (d_splice_alias(ip, dentry));
+
+	if (error == -ENOENT)
+		return (d_splice_alias(NULL, dentry));
+
+	return (ERR_PTR(error));
+}
+
+/*
+ * The '.zfs' control directory file and inode operations.
+ *
+ * The directory listing hook is chosen at build time: .iterate_shared,
+ * .iterate or the older .readdir, depending on which HAVE_VFS_ITERATE*
+ * macro is defined.
+ */
+const struct file_operations zpl_fops_root = {
+	.open		= zpl_common_open,
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+	.iterate_shared	= zpl_root_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+	.iterate	= zpl_root_iterate,
+#else
+	.readdir	= zpl_root_readdir,
+#endif
+};
+
+/* Only lookup and getattr are provided for the '.zfs' directory inode */
+const struct inode_operations zpl_ops_root = {
+	.lookup		= zpl_root_lookup,
+	.getattr	= zpl_root_getattr,
+};
+
+#ifdef HAVE_AUTOMOUNT
+/*
+ * d_automount callback for snapshot directory entries.  Asks
+ * zfsctl_snapshot_mount() to mount the snapshot and returns an error
+ * pointer when that fails.
+ */
+static struct vfsmount *
+zpl_snapdir_automount(struct path *path)
+{
+	const int error = -zfsctl_snapshot_mount(path, 0);
+
+	if (error != 0)
+		return (ERR_PTR(error));
+
+	/*
+	 * Rather than returning the new vfsmount for the snapshot we must
+	 * return NULL to indicate a mount collision.  This is done because
+	 * the user space mount calls do_add_mount() which adds the vfsmount
+	 * to the name space.  If we returned the new mount here it would be
+	 * added again to the vfsmount list resulting in list corruption.
+	 */
+	return (NULL);
+}
+#endif /* HAVE_AUTOMOUNT */
+
+/*
+ * Negative dentries must always be revalidated so newly created snapshots
+ * can be detected and automounted.  Normal dentries should be kept because
+ * as of the 3.18 kernel revalidating the mountpoint dentry will result in
+ * the snapshot being immediately unmounted.
+ */
+static int
+#ifdef HAVE_D_REVALIDATE_NAMEIDATA
+zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i)
+#else
+zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
+#endif
+{
+	/* 1 when the dentry has an inode, 0 when it is negative */
+	return (dentry->d_inode != NULL);
+}
+
+/*
+ * Dentry operations installed with d_set_d_op() on the dentries handled
+ * by zpl_snapdir_lookup() and zpl_snapdir_mkdir().
+ */
+dentry_operations_t zpl_dops_snapdirs = {
+/*
+ * Auto mounting of snapshots is only supported for 2.6.37 and
+ * newer kernels.  Prior to this kernel the ops->follow_link()
+ * callback was used as a hack to trigger the mount.  The
+ * resulting vfsmount was then explicitly grafted in to the
+ * name space.  While it might be possible to add compatibility
+ * code to accomplish this it would require considerable care.
+ */
+#ifdef HAVE_AUTOMOUNT
+	.d_automount	= zpl_snapdir_automount,
+#endif /* HAVE_AUTOMOUNT */
+	.d_revalidate	= zpl_snapdir_revalidate,
+};
+
+/*
+ * Look up a name in the '.zfs/snapshot' directory.  Whether or not the
+ * name exists, the dentry is given the snapshot dentry operations (and
+ * the automount flag when HAVE_AUTOMOUNT is defined) before it is
+ * spliced in.  Errors other than ENOENT are returned as an error
+ * pointer.
+ */
+static struct dentry *
+#ifdef HAVE_LOOKUP_NAMEIDATA
+zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
+    struct nameidata *nd)
+#else
+zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
+    unsigned int flags)
+#endif
+
+{
+	struct inode *snap_ip = NULL;
+	cred_t *cr = CRED();
+	fstrans_cookie_t cookie;
+	int error;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfsctl_snapdir_lookup(dip, dname(dentry), &snap_ip,
+	    0, cr, NULL, NULL);
+	ASSERT3S(error, <=, 0);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+
+	/* ENOENT falls through so that a negative dentry is set up. */
+	if (error != 0 && error != -ENOENT)
+		return (ERR_PTR(error));
+
+	ASSERT(error == 0 || snap_ip == NULL);
+	d_clear_d_op(dentry);
+	d_set_d_op(dentry, &zpl_dops_snapdirs);
+#ifdef HAVE_AUTOMOUNT
+	dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+#endif
+
+	return (d_splice_alias(snap_ip, dentry));
+}
+
+/*
+ * List the '.zfs/snapshot' directory: the dot entries followed by one
+ * DT_DIR entry per name returned by dmu_snapshot_list_next().  Running
+ * off the end of the list (ENOENT) is reported as success.
+ */
+static int
+zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+	fstrans_cookie_t cookie;
+	char snapname[MAXNAMELEN];
+	boolean_t case_conflict;
+	uint64_t id, pos;
+	int error = 0;
+
+	ZFS_ENTER(zfsvfs);
+	cookie = spl_fstrans_mark();
+
+	if (zpl_dir_emit_dots(filp, ctx)) {
+		pos = ctx->pos;
+		for (;;) {
+			/* the pool config is held only around the lookup */
+			dsl_pool_config_enter(
+			    dmu_objset_pool(zfsvfs->z_os), FTAG);
+			error = -dmu_snapshot_list_next(zfsvfs->z_os,
+			    MAXNAMELEN, snapname, &id, &pos, &case_conflict);
+			dsl_pool_config_exit(
+			    dmu_objset_pool(zfsvfs->z_os), FTAG);
+			if (error != 0)
+				break;
+
+			if (!zpl_dir_emit(ctx, snapname, strlen(snapname),
+			    ZFSCTL_INO_SHARES - id, DT_DIR))
+				break;
+
+			/* advance only once the entry has been emitted */
+			ctx->pos = pos;
+		}
+	}
+
+	spl_fstrans_unmark(cookie);
+	ZFS_EXIT(zfsvfs);
+
+	return (error == -ENOENT ? 0 : error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+/*
+ * readdir entry point used when neither iterate interface is available:
+ * wraps the arguments in a directory context, runs
+ * zpl_snapdir_iterate() and stores the resulting position back in the
+ * file.
+ */
+static int
+zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	zpl_dir_context_t dctx =
+	    ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+	const int rc = zpl_snapdir_iterate(filp, &dctx);
+
+	filp->f_pos = dctx.pos;
+
+	return (rc);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+/*
+ * Rename an entry of the '.zfs/snapshot' directory through
+ * zfsctl_snapdir_rename().  Any non-zero 'flags' value is rejected with
+ * -EINVAL.  Returns 0 or a negative error.
+ */
+static int
+zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry,
+    struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+{
+	cred_t *cr;
+	int rc;
+
+	/* We probably don't want to support renameat2(2) in ctldir */
+	if (flags != 0)
+		return (-EINVAL);
+
+	cr = CRED();
+	crhold(cr);
+	rc = -zfsctl_snapdir_rename(sdip, dname(sdentry),
+	    tdip, dname(tdentry), cr, 0);
+	ASSERT3S(rc, <=, 0);
+	crfree(cr);
+
+	return (rc);
+}
+
+#ifndef HAVE_RENAME_WANTS_FLAGS
+/*
+ * Rename entry point without a flags argument; forwards to
+ * zpl_snapdir_rename2() with flags set to 0.
+ */
+static int
+zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry,
+    struct inode *tdip, struct dentry *tdentry)
+{
+	const unsigned int no_flags = 0;
+
+	return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, no_flags));
+}
+#endif
+
+/*
+ * Remove an entry of the '.zfs/snapshot' directory through
+ * zfsctl_snapdir_remove().  Returns 0 or a negative error.
+ */
+static int
+zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry)
+{
+	cred_t *cr = CRED();
+	int rc;
+
+	crhold(cr);
+	rc = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0);
+	ASSERT3S(rc, <=, 0);
+	crfree(cr);
+
+	return (rc);
+}
+
+/*
+ * Create an entry in the '.zfs/snapshot' directory through
+ * zfsctl_snapdir_mkdir().  On success the dentry is given the snapshot
+ * dentry operations and is instantiated with the new inode.  Returns 0
+ * or a negative error.
+ */
+static int
+zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode)
+{
+	cred_t *cr = CRED();
+	struct inode *new_ip;
+	vattr_t *attrs;
+	int rc;
+
+	crhold(cr);
+
+	/* the attribute block is allocated only for the call below */
+	attrs = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	zpl_vap_init(attrs, dip, mode | S_IFDIR, cr);
+
+	rc = -zfsctl_snapdir_mkdir(dip, dname(dentry), attrs, &new_ip, cr, 0);
+	if (rc == 0) {
+		d_clear_d_op(dentry);
+		d_set_d_op(dentry, &zpl_dops_snapdirs);
+		d_instantiate(dentry, new_ip);
+	}
+
+	kmem_free(attrs, sizeof (vattr_t));
+	ASSERT3S(rc, <=, 0);
+	crfree(cr);
+
+	return (rc);
+}
+
+/*
+ * Get snapshot directory attributes.
+ */
+/* ARGSUSED */
+static int
+zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
+    u32 request_mask, unsigned int query_flags)
+{
+	struct inode *snapdir_ip = path->dentry->d_inode;
+	zfsvfs_t *zfsvfs = ITOZSB(snapdir_ip);
+
+	ZFS_ENTER(zfsvfs);
+	generic_fillattr(snapdir_ip, stat);
+
+	/* size and link count are both reported as 2 */
+	stat->size = 2;
+	stat->nlink = 2;
+	/* mtime and ctime both come from dmu_objset_snap_cmtime() */
+	stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+	stat->ctime = stat->mtime;
+	stat->atime = current_time(snapdir_ip);
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr);
+
+/*
+ * The '.zfs/snapshot' directory file operations.  These mainly control
+ * generating the list of available snapshots when doing an 'ls' in the
+ * directory.  See zpl_snapdir_readdir().
+ */
+/*
+ * The directory listing hook is chosen at build time: .iterate_shared,
+ * .iterate or the older .readdir, depending on which HAVE_VFS_ITERATE*
+ * macro is defined.
+ */
+const struct file_operations zpl_fops_snapdir = {
+	.open		= zpl_common_open,
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+	.iterate_shared	= zpl_snapdir_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+	.iterate	= zpl_snapdir_iterate,
+#else
+	.readdir	= zpl_snapdir_readdir,
+#endif
+
+};
+
+/*
+ * The '.zfs/snapshot' directory inode operations.  These mainly control
+ * creating an inode for a snapshot directory and initializing the needed
+ * infrastructure to automount the snapshot.  See zpl_snapdir_lookup().
+ */
+/*
+ * The rename hook depends on HAVE_RENAME_WANTS_FLAGS: with it the
+ * flags-taking zpl_snapdir_rename2() is installed directly, without it
+ * the zpl_snapdir_rename() wrapper is used.
+ */
+const struct inode_operations zpl_ops_snapdir = {
+	.lookup		= zpl_snapdir_lookup,
+	.getattr	= zpl_snapdir_getattr,
+#ifdef HAVE_RENAME_WANTS_FLAGS
+	.rename		= zpl_snapdir_rename2,
+#else
+	.rename		= zpl_snapdir_rename,
+#endif
+	.rmdir		= zpl_snapdir_rmdir,
+	.mkdir		= zpl_snapdir_mkdir,
+};
+
+/*
+ * Look up a name in the '.zfs/shares' directory.  A name that does not
+ * exist is spliced in with a NULL inode; any other failure is returned
+ * as an error pointer.
+ */
+static struct dentry *
+#ifdef HAVE_LOOKUP_NAMEIDATA
+zpl_shares_lookup(struct inode *dip, struct dentry *dentry,
+    struct nameidata *nd)
+#else
+zpl_shares_lookup(struct inode *dip, struct dentry *dentry,
+    unsigned int flags)
+#endif
+{
+	struct inode *share_ip = NULL;
+	cred_t *cr = CRED();
+	fstrans_cookie_t cookie;
+	int error;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfsctl_shares_lookup(dip, dname(dentry), &share_ip,
+	    0, cr, NULL, NULL);
+	ASSERT3S(error, <=, 0);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+
+	if (error == 0)
+		return (d_splice_alias(share_ip, dentry));
+
+	if (error == -ENOENT)
+		return (d_splice_alias(NULL, dentry));
+
+	return (ERR_PTR(error));
+}
+
+/*
+ * List the '.zfs/shares' directory.  When the filesystem has no shares
+ * directory object (z_shares_dir is 0) only the dot entries are
+ * emitted; otherwise the listing is delegated to zfs_readdir() on that
+ * object.  Returns 0 or a negative error.
+ */
+static int
+zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+	fstrans_cookie_t cookie;
+	cred_t *cr = CRED();
+	zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+	znode_t *shares_zp;
+	int error = 0;
+
+	ZFS_ENTER(zfsvfs);
+	cookie = spl_fstrans_mark();
+
+	if (zfsvfs->z_shares_dir == 0) {
+		zpl_dir_emit_dots(filp, ctx);
+	} else {
+		error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &shares_zp);
+		if (error == 0) {
+			crhold(cr);
+			error = -zfs_readdir(ZTOI(shares_zp), ctx, cr);
+			crfree(cr);
+
+			/* drop the inode reference taken by zfs_zget() */
+			iput(ZTOI(shares_zp));
+		}
+	}
+
+	spl_fstrans_unmark(cookie);
+	ZFS_EXIT(zfsvfs);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+/*
+ * readdir entry point used when neither iterate interface is available:
+ * wraps the arguments in a directory context, runs zpl_shares_iterate()
+ * and stores the resulting position back in the file.
+ */
+static int
+zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	zpl_dir_context_t dctx =
+	    ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+	const int rc = zpl_shares_iterate(filp, &dctx);
+
+	filp->f_pos = dctx.pos;
+
+	return (rc);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+/* ARGSUSED */
+/*
+ * Get '.zfs/shares' directory attributes.  When the filesystem has no
+ * shares directory object (z_shares_dir is 0) the generic attributes of
+ * the control inode are reported with size and link count set to 2;
+ * otherwise the attributes come from zfs_getattr_fast() on that object.
+ * Returns 0 or a negative error.
+ */
+static int
+zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
+    u32 request_mask, unsigned int query_flags)
+{
+	struct inode *ip = path->dentry->d_inode;
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	znode_t *shares_zp;
+	int error = 0;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (zfsvfs->z_shares_dir == 0) {
+		generic_fillattr(ip, stat);
+		stat->size = 2;
+		stat->nlink = 2;
+		stat->atime = current_time(ip);
+	} else {
+		error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &shares_zp);
+		if (error == 0) {
+			error = -zfs_getattr_fast(ZTOI(shares_zp), stat);
+			/* drop the inode reference taken by zfs_zget() */
+			iput(ZTOI(shares_zp));
+		}
+	}
+
+	ZFS_EXIT(zfsvfs);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+ZPL_GETATTR_WRAPPER(zpl_shares_getattr);
+
+/*
+ * The '.zfs/shares' directory file operations.
+ */
+/*
+ * The directory listing hook is chosen at build time: .iterate_shared,
+ * .iterate or the older .readdir, depending on which HAVE_VFS_ITERATE*
+ * macro is defined.
+ */
+const struct file_operations zpl_fops_shares = {
+	.open		= zpl_common_open,
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+	.iterate_shared	= zpl_shares_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+	.iterate	= zpl_shares_iterate,
+#else
+	.readdir	= zpl_shares_readdir,
+#endif
+
+};
+
+/*
+ * The '.zfs/shares' directory inode operations.
+ */
+/* Only lookup and getattr are provided for the shares directory inode */
+const struct inode_operations zpl_ops_shares = {
+	.lookup		= zpl_shares_lookup,
+	.getattr	= zpl_shares_getattr,
+};
@@ -0,0 +1,177 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Gunnar Beutner
+ * Copyright (c) 2012 Cyril Plisko. All rights reserved.
+ */
+
+
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+
+/*
+ * Encode a file handle for 'ip' into the caller's buffer 'fh'.
+ *
+ * On entry *max_len is the buffer capacity in 32-bit words; on return it is
+ * the number of words the fid occupies.  Control-directory inodes are
+ * encoded by zfsctl_fid(), all others by zfs_fid().  Returns
+ * FILEID_INO32_GEN on success, or 255 if the buffer cannot even hold the
+ * fid header or the fid could not be generated.
+ */
+static int
+#ifdef HAVE_ENCODE_FH_WITH_INODE
+zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent)
+{
+#else
+zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable)
+{
+	/* CSTYLED */
+	struct inode *ip = dentry->d_inode;
+#endif /* HAVE_ENCODE_FH_WITH_INODE */
+	const size_t fid_hdr = offsetof(fid_t, fid_data);
+	fid_t *fid = (fid_t *)fh;
+	fstrans_cookie_t mark;
+	int len_bytes, rc;
+
+	len_bytes = *max_len * sizeof (__u32);
+
+	if (len_bytes < fid_hdr)
+		return (255);
+
+	/* Advertise the space left for fid data after the header. */
+	fid->fid_len = len_bytes - fid_hdr;
+
+	mark = spl_fstrans_mark();
+	rc = zfsctl_is_node(ip) ? zfsctl_fid(ip, fid) : zfs_fid(ip, fid);
+	spl_fstrans_unmark(mark);
+
+	/* Report the size in whole 32-bit words, rounding up. */
+	len_bytes = fid_hdr + fid->fid_len;
+	*max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32);
+
+	if (rc != 0)
+		return (255);
+
+	return (FILEID_INO32_GEN);
+}
+
+/*
+ * Obtain a dentry for 'ip'.  With d_obtain_alias() available its result is
+ * returned directly.  Otherwise d_alloc_anon() is used, and on allocation
+ * failure the inode reference is dropped and ERR_PTR(-ENOMEM) is returned.
+ */
+static struct dentry *
+zpl_dentry_obtain_alias(struct inode *ip)
+{
+#ifdef HAVE_D_OBTAIN_ALIAS
+	return (d_obtain_alias(ip));
+#else
+	struct dentry *anon = d_alloc_anon(ip);
+
+	if (anon != NULL)
+		return (anon);
+
+	iput(ip);
+	return (ERR_PTR(-ENOMEM));
+#endif /* HAVE_D_OBTAIN_ALIAS */
+}
+
+/*
+ * Translate a file handle back into a dentry.
+ *
+ * The handle must be of type FILEID_INO32_GEN and long enough to hold both
+ * the fid header and the fid_len bytes of data it claims; anything else is
+ * rejected with -EINVAL.  The inode is then looked up with zfs_vget().
+ */
+static struct dentry *
+zpl_fh_to_dentry(struct super_block *sb, struct fid *fh,
+    int fh_len, int fh_type)
+{
+	fid_t *fid = (fid_t *)fh;
+	fstrans_cookie_t mark;
+	struct inode *ip;
+	int len_bytes, rc;
+
+	len_bytes = fh_len * sizeof (__u32);
+
+	if (fh_type != FILEID_INO32_GEN)
+		return (ERR_PTR(-EINVAL));
+
+	/* fid_len may only be read once the header is known to fit. */
+	if (len_bytes < offsetof(fid_t, fid_data))
+		return (ERR_PTR(-EINVAL));
+
+	if (len_bytes < offsetof(fid_t, fid_data) + fid->fid_len)
+		return (ERR_PTR(-EINVAL));
+
+	mark = spl_fstrans_mark();
+	rc = zfs_vget(sb, &ip, fid);
+	spl_fstrans_unmark(mark);
+
+	if (rc == 0) {
+		ASSERT((ip != NULL) && !IS_ERR(ip));
+
+		return (zpl_dentry_obtain_alias(ip));
+	}
+
+	/*
+	 * If we see ENOENT it might mean that an NFSv4 client is using a
+	 * cached inode value in a file handle and that the sought after
+	 * file has had its inode changed by a third party.  So change the
+	 * error to ESTALE which will trigger a full lookup by the client
+	 * and will find the new filename/inode pair if it still exists.
+	 */
+	if (rc == ENOENT)
+		rc = ESTALE;
+
+	return (ERR_PTR(-rc));
+}
+
+/*
+ * Return a dentry for the parent of 'child' by looking up ".." in the
+ * child's inode.  A lookup failure is returned as an ERR_PTR carrying the
+ * negative errno.
+ */
+static struct dentry *
+zpl_get_parent(struct dentry *child)
+{
+	cred_t *cred = CRED();
+	struct inode *ip;
+	fstrans_cookie_t mark;
+	int error;
+
+	crhold(cred);
+	mark = spl_fstrans_mark();
+	error = -zfs_lookup(child->d_inode, "..", &ip, 0, cred, NULL, NULL);
+	spl_fstrans_unmark(mark);
+	crfree(cred);
+	ASSERT3S(error, <=, 0);
+
+	if (error == 0)
+		return (zpl_dentry_obtain_alias(ip));
+
+	return (ERR_PTR(error));
+}
+
+#ifdef HAVE_COMMIT_METADATA
+/*
+ * ->commit_metadata() export hook.  Control-directory inodes are skipped
+ * and report success; every other inode is synced with zfs_fsync().
+ * Returns 0 or a negative errno.
+ */
+static int
+zpl_commit_metadata(struct inode *inode)
+{
+	cred_t *cred;
+	fstrans_cookie_t mark;
+	int error;
+
+	if (zfsctl_is_node(inode))
+		return (0);
+
+	cred = CRED();
+	crhold(cred);
+	mark = spl_fstrans_mark();
+	error = -zfs_fsync(inode, 0, cred);
+	spl_fstrans_unmark(mark);
+	crfree(cred);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+#endif /* HAVE_COMMIT_METADATA */
+
+/*
+ * Export operations table: file handle encoding and decoding plus parent
+ * lookup.  The .commit_metadata hook is only installed when
+ * HAVE_COMMIT_METADATA is defined.
+ */
+const struct export_operations zpl_export_operations = {
+	.encode_fh		= zpl_encode_fh,
+	.fh_to_dentry		= zpl_fh_to_dentry,
+	.get_parent		= zpl_get_parent,
+#ifdef HAVE_COMMIT_METADATA
+	.commit_metadata	= zpl_commit_metadata,
+#endif /* HAVE_COMMIT_METADATA */
+};
@@ -0,0 +1,826 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/dmu_objset.h>
+#include <sys/vfs.h>
+#include <sys/zpl.h>
+#include <sys/file.h>
+
+
+/*
+ * Directory ->lookup(): resolve the name in 'dentry' within 'dir' using
+ * zfs_lookup() and attach the result to the dentry.
+ *
+ * Names of ZAP_MAXNAMELEN bytes or more are rejected with -ENAMETOOLONG.
+ * On a case-insensitive filesystem the lookup is done with FIGNORECASE and
+ * the matched name is returned in 'pn', so the dentry can be installed
+ * under that name with d_add_ci() when it differs from the requested one.
+ *
+ * Returns the dentry to use (possibly NULL), or an ERR_PTR on failure.
+ */
+static struct dentry *
+#ifdef HAVE_LOOKUP_NAMEIDATA
+zpl_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+#else
+zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+#endif
+{
+	cred_t *cr = CRED();
+	struct inode *ip;
+	int error;
+	fstrans_cookie_t cookie;
+	pathname_t *ppn = NULL;
+	pathname_t pn;
+	int zfs_flags = 0;
+	zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+	if (dlen(dentry) >= ZAP_MAXNAMELEN)
+		return (ERR_PTR(-ENAMETOOLONG));
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+
+	/* If we are a case insensitive fs, we need the real name */
+	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+		zfs_flags = FIGNORECASE;
+		pn_alloc(&pn);
+		ppn = &pn;
+	}
+
+	error = -zfs_lookup(dir, dname(dentry), &ip, zfs_flags, cr, NULL, ppn);
+	spl_fstrans_unmark(cookie);
+	ASSERT3S(error, <=, 0);
+	crfree(cr);
+
+	/*
+	 * Stamp the dentry with the lookup time; zpl_revalidate() compares
+	 * d_time of negative dentries against the rollback time.
+	 */
+	spin_lock(&dentry->d_lock);
+	dentry->d_time = jiffies;
+#ifndef HAVE_S_D_OP
+	d_set_d_op(dentry, &zpl_dentry_operations);
+#endif /* HAVE_S_D_OP */
+	spin_unlock(&dentry->d_lock);
+
+	if (error) {
+		/*
+		 * If we have a case insensitive fs (ppn is only set in that
+		 * case), we do not want to insert negative entries, so
+		 * return NULL for ENOENT.  Fall through if the error is not
+		 * ENOENT.  Also free memory.
+		 */
+		if (ppn) {
+			pn_free(ppn);
+			if (error == -ENOENT)
+				return (NULL);
+		}
+
+		/* ENOENT here installs a negative dentry via the dcache. */
+		if (error == -ENOENT)
+			return (d_splice_alias(NULL, dentry));
+		else
+			return (ERR_PTR(error));
+	}
+
+	/*
+	 * If we are case insensitive, call the correct function
+	 * to install the name.
+	 */
+	if (ppn) {
+		struct dentry *new_dentry;
+		struct qstr ci_name;
+
+		/* Exact match: the requested name is the stored name. */
+		if (strcmp(dname(dentry), pn.pn_buf) == 0) {
+			new_dentry = d_splice_alias(ip,  dentry);
+		} else {
+			ci_name.name = pn.pn_buf;
+			ci_name.len = strlen(pn.pn_buf);
+			new_dentry = d_add_ci(dentry, ip, &ci_name);
+		}
+		pn_free(ppn);
+		return (new_dentry);
+	} else {
+		return (d_splice_alias(ip, dentry));
+	}
+}
+
+/*
+ * Initialize the attributes for a new object created in 'dir'.
+ *
+ * The mode comes from the caller and the owner from the credential's
+ * fsuid.  If 'dir' is set-group-ID the new object takes the directory's
+ * group, and a new directory additionally gets S_ISGID itself; otherwise
+ * the group is the credential's fsgid.  'dir' may be NULL.
+ */
+void
+zpl_vap_init(vattr_t *vap, struct inode *dir, zpl_umode_t mode, cred_t *cr)
+{
+	const int inherit_gid = (dir != NULL) && (dir->i_mode & S_ISGID);
+
+	vap->va_mask = ATTR_MODE;
+	vap->va_mode = mode;
+	vap->va_uid = crgetfsuid(cr);
+
+	if (!inherit_gid) {
+		vap->va_gid = crgetfsgid(cr);
+		return;
+	}
+
+	vap->va_gid = KGID_TO_SGID(dir->i_gid);
+	if (S_ISDIR(mode))
+		vap->va_mode |= S_ISGID;
+}
+
+/*
+ * Directory ->create(): create a regular file named by 'dentry' in 'dir'.
+ *
+ * After zfs_create() succeeds the security xattr and ACL are initialized.
+ * The dentry is instantiated only once all of that has succeeded.  If any
+ * step fails, the new file is removed again, its inode is unhashed and the
+ * reference returned by zfs_create() is dropped, so the dentry is never
+ * left pointing at a file that no longer exists.
+ *
+ * Returns 0 or a negative errno.
+ */
+static int
+#ifdef HAVE_CREATE_NAMEIDATA
+zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
+    struct nameidata *nd)
+#else
+zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
+    bool flag)
+#endif
+{
+	cred_t *cr = CRED();
+	struct inode *ip;
+	vattr_t *vap;
+	int error;
+	fstrans_cookie_t cookie;
+
+	crhold(cr);
+	vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	zpl_vap_init(vap, dir, mode, cr);
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL);
+	if (error == 0) {
+		error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+		if (error == 0)
+			error = zpl_init_acl(ip, dir);
+
+		if (error) {
+			/* Undo the create and release our inode reference. */
+			(void) zfs_remove(dir, dname(dentry), cr, 0);
+			remove_inode_hash(ip);
+			iput(ip);
+		} else {
+			d_instantiate(dentry, ip);
+		}
+	}
+
+	spl_fstrans_unmark(cookie);
+	kmem_free(vap, sizeof (vattr_t));
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+/*
+ * Directory ->mknod(): create a special file (device, fifo or socket)
+ * named by 'dentry' in 'dir', with device number 'rdev'.
+ *
+ * The dentry is instantiated only after the security xattr and ACL have
+ * been set up successfully.  If any step fails, the new node is removed
+ * again, its inode is unhashed and the reference returned by zfs_create()
+ * is dropped, so the dentry is never left pointing at a removed file.
+ *
+ * Returns 0 or a negative errno.
+ */
+static int
+zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
+    dev_t rdev)
+{
+	cred_t *cr = CRED();
+	struct inode *ip;
+	vattr_t *vap;
+	int error;
+	fstrans_cookie_t cookie;
+
+	/*
+	 * We currently expect Linux to supply rdev=0 for all sockets
+	 * and fifos, but we want to know if this behavior ever changes.
+	 */
+	if (S_ISSOCK(mode) || S_ISFIFO(mode))
+		ASSERT(rdev == 0);
+
+	crhold(cr);
+	vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	zpl_vap_init(vap, dir, mode, cr);
+	vap->va_rdev = rdev;
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL);
+	if (error == 0) {
+		error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+		if (error == 0)
+			error = zpl_init_acl(ip, dir);
+
+		if (error) {
+			/* Undo the create and release our inode reference. */
+			(void) zfs_remove(dir, dname(dentry), cr, 0);
+			remove_inode_hash(ip);
+			iput(ip);
+		} else {
+			d_instantiate(dentry, ip);
+		}
+	}
+
+	spl_fstrans_unmark(cookie);
+	kmem_free(vap, sizeof (vattr_t));
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+#ifdef HAVE_TMPFILE
+/*
+ * Directory ->tmpfile(): create an unnamed file in 'dir' and attach it to
+ * 'dentry', then initialize its security xattr and ACL.  Returns 0 or a
+ * negative errno.
+ */
+static int
+zpl_tmpfile(struct inode *dir, struct dentry *dentry, zpl_umode_t mode)
+{
+	cred_t *cred = CRED();
+	fstrans_cookie_t mark;
+	vattr_t *attrs;
+	struct inode *ip;
+	int error;
+
+	crhold(cred);
+	attrs = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	zpl_vap_init(attrs, dir, mode, cred);
+
+	mark = spl_fstrans_mark();
+	error = -zfs_tmpfile(dir, attrs, 0, mode, &ip, cred, 0, NULL);
+	if (error == 0) {
+		/* d_tmpfile will do drop_nlink, so we should set it first */
+		set_nlink(ip, 1);
+		d_tmpfile(dentry, ip);
+
+		/*
+		 * A failure below needs no cleanup: the file is already in
+		 * the unlinked set.
+		 */
+		error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+		if (error == 0)
+			error = zpl_init_acl(ip, dir);
+	}
+
+	spl_fstrans_unmark(mark);
+	kmem_free(attrs, sizeof (vattr_t));
+	crfree(cred);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+#endif
+
+/*
+ * Directory ->unlink(): remove the name in 'dentry' from 'dir'.  Returns 0
+ * or a negative errno.
+ */
+static int
+zpl_unlink(struct inode *dir, struct dentry *dentry)
+{
+	zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+	cred_t *cred = CRED();
+	fstrans_cookie_t mark;
+	int error;
+
+	crhold(cred);
+	mark = spl_fstrans_mark();
+	error = -zfs_remove(dir, dname(dentry), cred, 0);
+
+	/*
+	 * On a case-insensitive filesystem the dentry is invalidated after
+	 * a successful removal so that no negative entry is created.
+	 */
+	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE && error == 0)
+		d_invalidate(dentry);
+
+	spl_fstrans_unmark(mark);
+	crfree(cred);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+/*
+ * Directory ->mkdir(): create the directory named by 'dentry' in 'dir'.
+ *
+ * The dentry is instantiated only after the security xattr and ACL have
+ * been set up successfully.  If any step fails, the new directory is
+ * removed again, its inode is unhashed and the reference returned by
+ * zfs_mkdir() is dropped, so the dentry is never left pointing at a
+ * removed directory.
+ *
+ * Returns 0 or a negative errno.
+ */
+static int
+zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode)
+{
+	cred_t *cr = CRED();
+	vattr_t *vap;
+	struct inode *ip;
+	int error;
+	fstrans_cookie_t cookie;
+
+	crhold(cr);
+	vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	zpl_vap_init(vap, dir, mode | S_IFDIR, cr);
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL);
+	if (error == 0) {
+		error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+		if (error == 0)
+			error = zpl_init_acl(ip, dir);
+
+		if (error) {
+			/* Undo the mkdir and release our inode reference. */
+			(void) zfs_rmdir(dir, dname(dentry), NULL, cr, 0);
+			remove_inode_hash(ip);
+			iput(ip);
+		} else {
+			d_instantiate(dentry, ip);
+		}
+	}
+
+	spl_fstrans_unmark(cookie);
+	kmem_free(vap, sizeof (vattr_t));
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+/*
+ * Directory ->rmdir(): remove the directory named by 'dentry' from 'dir'.
+ * Returns 0 or a negative errno.
+ */
+static int
+zpl_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+	cred_t *cred = CRED();
+	fstrans_cookie_t mark;
+	int error;
+
+	crhold(cred);
+	mark = spl_fstrans_mark();
+	error = -zfs_rmdir(dir, dname(dentry), NULL, cred, 0);
+
+	/*
+	 * On a case-insensitive filesystem the dentry is invalidated after
+	 * a successful removal so that no negative entry is created.
+	 */
+	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE && error == 0)
+		d_invalidate(dentry);
+
+	spl_fstrans_unmark(mark);
+	crfree(cred);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+/*
+ * Fill in 'stat' for the inode behind 'path' using zfs_getattr_fast().
+ * request_mask and query_flags are currently ignored.  Returns 0 or a
+ * negative errno.
+ */
+static int
+zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
+    unsigned int query_flags)
+{
+	struct inode *ip = path->dentry->d_inode;
+	fstrans_cookie_t mark = spl_fstrans_mark();
+	int error;
+
+	/* XXX request_mask and query_flags currently ignored. */
+	error = -zfs_getattr_fast(ip, stat);
+
+	spl_fstrans_unmark(mark);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+ZPL_GETATTR_WRAPPER(zpl_getattr);
+
+/*
+ * ->setattr(): apply the attribute changes described by 'ia' to the inode
+ * behind 'dentry'.  The request is first checked by setattr_prepare(),
+ * then copied into a vattr_t and handed to zfs_setattr().  A successful
+ * mode change is followed by zpl_chmod_acl().  Returns 0 or a negative
+ * errno.
+ */
+static int
+zpl_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	struct inode *ip = dentry->d_inode;
+	cred_t *cred = CRED();
+	fstrans_cookie_t mark;
+	vattr_t *attrs;
+	int error;
+
+	error = setattr_prepare(dentry, ia);
+	if (error != 0)
+		return (error);
+
+	crhold(cred);
+	attrs = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+
+	/* Every field is copied; va_mask says which ones are meaningful. */
+	attrs->va_mask = ia->ia_valid & ATTR_IATTR_MASK;
+	attrs->va_mode = ia->ia_mode;
+	attrs->va_uid = KUID_TO_SUID(ia->ia_uid);
+	attrs->va_gid = KGID_TO_SGID(ia->ia_gid);
+	attrs->va_size = ia->ia_size;
+	attrs->va_atime = ia->ia_atime;
+	attrs->va_mtime = ia->ia_mtime;
+	attrs->va_ctime = ia->ia_ctime;
+
+	/* Update the in-core atime, truncated to the sb's granularity. */
+	if (attrs->va_mask & ATTR_ATIME) {
+		ip->i_atime = zpl_inode_timespec_trunc(ia->ia_atime,
+		    ip->i_sb->s_time_gran);
+	}
+
+	mark = spl_fstrans_mark();
+	error = -zfs_setattr(ip, attrs, 0, cred);
+	if (error == 0 && (ia->ia_valid & ATTR_MODE) != 0)
+		error = zpl_chmod_acl(ip);
+
+	spl_fstrans_unmark(mark);
+	kmem_free(attrs, sizeof (vattr_t));
+	crfree(cred);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+/*
+ * Rename 'sdentry' in 'sdip' to 'tdentry' in 'tdip'.  Any non-zero 'flags'
+ * value is rejected with -EINVAL because renameat2(2) flags are not
+ * supported.  Returns 0 or a negative errno.
+ */
+static int
+zpl_rename2(struct inode *sdip, struct dentry *sdentry,
+    struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+{
+	cred_t *cred;
+	fstrans_cookie_t mark;
+	int error;
+
+	/* We don't have renameat2(2) support */
+	if (flags != 0)
+		return (-EINVAL);
+
+	cred = CRED();
+	crhold(cred);
+	mark = spl_fstrans_mark();
+	error = -zfs_rename(sdip, dname(sdentry), tdip, dname(tdentry),
+	    cred, 0);
+	spl_fstrans_unmark(mark);
+	crfree(cred);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+#ifndef HAVE_RENAME_WANTS_FLAGS
+/*
+ * Flag-less ->rename() entry point, built only when
+ * HAVE_RENAME_WANTS_FLAGS is not defined.  Forwards to zpl_rename2() with
+ * flags set to 0.
+ */
+static int
+zpl_rename(struct inode *sdip, struct dentry *sdentry,
+    struct inode *tdip, struct dentry *tdentry)
+{
+	return (zpl_rename2(sdip, sdentry, tdip, tdentry, 0));
+}
+#endif
+
+/*
+ * Directory ->symlink(): create a symbolic link named by 'dentry' in 'dir'
+ * whose target is 'name'.
+ *
+ * The dentry is instantiated only after the security xattr has been set
+ * up successfully.  If that fails, the new link is removed again, its
+ * inode is unhashed and the reference returned by zfs_symlink() is
+ * dropped, so the dentry is never left pointing at a removed link.
+ *
+ * Returns 0 or a negative errno.
+ */
+static int
+zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name)
+{
+	cred_t *cr = CRED();
+	vattr_t *vap;
+	struct inode *ip;
+	int error;
+	fstrans_cookie_t cookie;
+
+	crhold(cr);
+	vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr);
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0);
+	if (error == 0) {
+		error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+		if (error) {
+			/* Undo the symlink and release our inode reference. */
+			(void) zfs_remove(dir, dname(dentry), cr, 0);
+			remove_inode_hash(ip);
+			iput(ip);
+		} else {
+			d_instantiate(dentry, ip);
+		}
+	}
+
+	spl_fstrans_unmark(cookie);
+	kmem_free(vap, sizeof (vattr_t));
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+/*
+ * Free a MAXPATHLEN-sized symlink target buffer.  One of three prototypes
+ * is compiled, selected by the HAVE_PUT_LINK_* macros; no zpl_put_link()
+ * is defined when none of them is set.
+ */
+#if defined(HAVE_PUT_LINK_COOKIE)
+static void
+zpl_put_link(struct inode *unused, void *cookie)
+{
+	kmem_free(cookie, MAXPATHLEN);
+}
+#elif defined(HAVE_PUT_LINK_NAMEIDATA)
+static void
+zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
+{
+	const char *link = nd_get_link(nd);
+
+	/* Nothing to free when the stored link is an error pointer. */
+	if (!IS_ERR(link))
+		kmem_free(link, MAXPATHLEN);
+}
+#elif defined(HAVE_PUT_LINK_DELAYED)
+static void
+zpl_put_link(void *ptr)
+{
+	kmem_free(ptr, MAXPATHLEN);
+}
+#endif
+
+/*
+ * Read the target of symlink 'ip' into a newly allocated, zero-filled
+ * MAXPATHLEN buffer.  At most MAXPATHLEN - 1 bytes are requested from
+ * zfs_readlink().  On success *link points at the buffer and 0 is
+ * returned; on failure the buffer is freed, *link is NULL and a negative
+ * errno is returned.  'dentry' is not used.
+ */
+static int
+zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link)
+{
+	cred_t *cred = CRED();
+	uio_t uio = { { 0 }, 0 };
+	struct iovec iov;
+	fstrans_cookie_t mark;
+	int error;
+
+	crhold(cred);
+	*link = NULL;
+
+	iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+	iov.iov_len = MAXPATHLEN;
+
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_iovcnt = 1;
+	uio.uio_iov = &iov;
+	uio.uio_resid = (MAXPATHLEN - 1);
+
+	mark = spl_fstrans_mark();
+	error = -zfs_readlink(ip, &uio, cred);
+	spl_fstrans_unmark(mark);
+	crfree(cred);
+
+	if (error == 0)
+		*link = iov.iov_base;
+	else
+		kmem_free(iov.iov_base, MAXPATHLEN);
+
+	return (error);
+}
+
+/*
+ * Symlink resolution entry point.  One of four prototypes is compiled,
+ * selected by the HAVE_GET_LINK_* / HAVE_FOLLOW_LINK_* macros.  Each reads
+ * the target with zpl_get_link_common() and hands the buffer, or the
+ * error, back through the mechanism that prototype uses.
+ */
+#if defined(HAVE_GET_LINK_DELAYED)
+const char *
+zpl_get_link(struct dentry *dentry, struct inode *inode,
+    struct delayed_call *done)
+{
+	char *link = NULL;
+	int error;
+
+	/* Without a dentry the request is refused with -ECHILD. */
+	if (!dentry)
+		return (ERR_PTR(-ECHILD));
+
+	error = zpl_get_link_common(dentry, inode, &link);
+	if (error)
+		return (ERR_PTR(error));
+
+	/* Register zpl_put_link() to be called with the buffer later. */
+	set_delayed_call(done, zpl_put_link, link);
+
+	return (link);
+}
+#elif defined(HAVE_GET_LINK_COOKIE)
+const char *
+zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie)
+{
+	char *link = NULL;
+	int error;
+
+	/* Without a dentry the request is refused with -ECHILD. */
+	if (!dentry)
+		return (ERR_PTR(-ECHILD));
+
+	error = zpl_get_link_common(dentry, inode, &link);
+	if (error)
+		return (ERR_PTR(error));
+
+	/* The buffer is both returned and stored in the cookie. */
+	return (*cookie = link);
+}
+#elif defined(HAVE_FOLLOW_LINK_COOKIE)
+const char *
+zpl_follow_link(struct dentry *dentry, void **cookie)
+{
+	char *link = NULL;
+	int error;
+
+	error = zpl_get_link_common(dentry, dentry->d_inode, &link);
+	if (error)
+		return (ERR_PTR(error));
+
+	/* The buffer is both returned and stored in the cookie. */
+	return (*cookie = link);
+}
+#elif defined(HAVE_FOLLOW_LINK_NAMEIDATA)
+static void *
+zpl_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	char *link = NULL;
+	int error;
+
+	/* Either the buffer or the error pointer is stored in 'nd'. */
+	error = zpl_get_link_common(dentry, dentry->d_inode, &link);
+	if (error)
+		nd_set_link(nd, ERR_PTR(error));
+	else
+		nd_set_link(nd, link);
+
+	return (NULL);
+}
+#endif
+
+/*
+ * Directory ->link(): create a hard link named by 'dentry' in 'dir' to
+ * the inode behind 'old_dentry'.  Fails with -EMLINK once the inode has
+ * ZFS_LINK_MAX links.  Returns 0 or a negative errno.
+ */
+static int
+zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	struct inode *ip = old_dentry->d_inode;
+	cred_t *cred = CRED();
+	fstrans_cookie_t mark;
+	int error;
+
+	if (ip->i_nlink >= ZFS_LINK_MAX)
+		return (-EMLINK);
+
+	crhold(cred);
+	ip->i_ctime = current_time(ip);
+	igrab(ip); /* Use ihold() if available */
+
+	mark = spl_fstrans_mark();
+	error = -zfs_link(dir, ip, dname(dentry), cred, 0);
+	if (error == 0) {
+		/* The reference taken above is handed to the dentry. */
+		d_instantiate(dentry, ip);
+	} else {
+		/* Linking failed; drop the reference taken above. */
+		iput(ip);
+	}
+
+	spl_fstrans_unmark(mark);
+	crfree(cred);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+#ifdef HAVE_INODE_TRUNCATE_RANGE
+/*
+ * ->truncate_range(): free the byte range [start, end) of 'ip' through
+ * zfs_space(F_FREESP).  An empty range is a no-op.  The result of
+ * zfs_space() is not reported to the caller.
+ */
+static void
+zpl_truncate_range(struct inode *ip, loff_t start, loff_t end)
+{
+	cred_t *cred = CRED();
+	fstrans_cookie_t mark;
+	flock64_t range;
+
+	ASSERT3S(start, <=, end);
+
+	/*
+	 * zfs_freesp() will interpret (len == 0) as meaning "truncate until
+	 * the end of the file". We don't want that.
+	 */
+	if (start == end)
+		return;
+
+	crhold(cred);
+
+	range.l_type = F_WRLCK;
+	range.l_whence = SEEK_SET;
+	range.l_start = start;
+	range.l_len = end - start;
+	range.l_pid = 0;
+
+	mark = spl_fstrans_mark();
+	zfs_space(ip, F_FREESP, &range, FWRITE, start, cred);
+	spl_fstrans_unmark(mark);
+
+	crfree(cred);
+}
+#endif /* HAVE_INODE_TRUNCATE_RANGE */
+
+#ifdef HAVE_INODE_FALLOCATE
+/*
+ * Inode-level ->fallocate() hook, built only when HAVE_INODE_FALLOCATE is
+ * defined.  Forwards all arguments to zpl_fallocate_common().
+ */
+static long
+zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len)
+{
+	return (zpl_fallocate_common(ip, mode, offset, len));
+}
+#endif /* HAVE_INODE_FALLOCATE */
+
+/*
+ * ->d_revalidate() hook.  Returns 1 when the dentry may still be used, 0
+ * when it must be invalidated, and -ECHILD when called with LOOKUP_RCU.
+ * For snapshot filesystems the call also pushes back the automatic
+ * unmount, at most once per rate-limit interval.
+ */
+static int
+#ifdef HAVE_D_REVALIDATE_NAMEIDATA
+zpl_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned int flags = (nd ? nd->flags : 0);
+#else
+zpl_revalidate(struct dentry *dentry, unsigned int flags)
+{
+#endif /* HAVE_D_REVALIDATE_NAMEIDATA */
+	/* CSTYLED */
+	zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+	int error;
+
+	if (flags & LOOKUP_RCU)
+		return (-ECHILD);
+
+	/*
+	 * Automounted snapshots rely on periodic dentry revalidation
+	 * to defer snapshots from being automatically unmounted.
+	 */
+	if (zfsvfs->z_issnap) {
+		/*
+		 * Rate limit: act only once the later of half the expiry
+		 * period and one second has passed since the last deferral.
+		 */
+		if (time_after(jiffies, zfsvfs->z_snap_defer_time +
+		    MAX(zfs_expire_snapshot * HZ / 2, HZ))) {
+			zfsvfs->z_snap_defer_time = jiffies;
+			zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa,
+			    dmu_objset_id(zfsvfs->z_os), zfs_expire_snapshot);
+		}
+	}
+
+	/*
+	 * After a rollback negative dentries created before the rollback
+	 * time must be invalidated.  Otherwise they can obscure files which
+	 * are only present in the rolled back dataset.
+	 */
+	if (dentry->d_inode == NULL) {
+		/* 'error' is used as a boolean here, not as an errno. */
+		spin_lock(&dentry->d_lock);
+		error = time_before(dentry->d_time, zfsvfs->z_rollback_time);
+		spin_unlock(&dentry->d_lock);
+
+		if (error)
+			return (0);
+	}
+
+	/*
+	 * The dentry may reference a stale inode if a mounted file system
+	 * was rolled back to a point in time where the object didn't exist.
+	 */
+	if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale)
+		return (0);
+
+	return (1);
+}
+
+/*
+ * General inode operations table: attribute and extended-attribute
+ * handling, plus truncate_range/fallocate and POSIX ACL hooks when the
+ * matching HAVE_* / CONFIG_* macros are defined.
+ */
+const struct inode_operations zpl_inode_operations = {
+	.setattr	= zpl_setattr,
+	.getattr	= zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.listxattr	= zpl_xattr_list,
+#ifdef HAVE_INODE_TRUNCATE_RANGE
+	.truncate_range = zpl_truncate_range,
+#endif /* HAVE_INODE_TRUNCATE_RANGE */
+#ifdef HAVE_INODE_FALLOCATE
+	.fallocate	= zpl_fallocate,
+#endif /* HAVE_INODE_FALLOCATE */
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+	.set_acl	= zpl_set_acl,
+#endif
+#if defined(HAVE_GET_ACL)
+	.get_acl	= zpl_get_acl,
+#elif defined(HAVE_CHECK_ACL)
+	.check_acl	= zpl_check_acl,
+#elif defined(HAVE_PERMISSION)
+	.permission	= zpl_permission,
+#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+/*
+ * Directory inode operations table: namespace operations (create, lookup,
+ * link, unlink, symlink, mkdir, rmdir, mknod, rename, and tmpfile when
+ * available) plus attribute, extended-attribute and POSIX ACL hooks.  The
+ * rename entry is zpl_rename2 when the kernel's ->rename() takes flags,
+ * otherwise zpl_rename.
+ */
+const struct inode_operations zpl_dir_inode_operations = {
+	.create		= zpl_create,
+	.lookup		= zpl_lookup,
+	.link		= zpl_link,
+	.unlink		= zpl_unlink,
+	.symlink	= zpl_symlink,
+	.mkdir		= zpl_mkdir,
+	.rmdir		= zpl_rmdir,
+	.mknod		= zpl_mknod,
+#ifdef HAVE_RENAME_WANTS_FLAGS
+	.rename		= zpl_rename2,
+#else
+	.rename		= zpl_rename,
+#endif
+#ifdef HAVE_TMPFILE
+	.tmpfile	= zpl_tmpfile,
+#endif
+	.setattr	= zpl_setattr,
+	.getattr	= zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.listxattr	= zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+	.set_acl	= zpl_set_acl,
+#endif
+#if defined(HAVE_GET_ACL)
+	.get_acl	= zpl_get_acl,
+#elif defined(HAVE_CHECK_ACL)
+	.check_acl	= zpl_check_acl,
+#elif defined(HAVE_PERMISSION)
+	.permission	= zpl_permission,
+#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+/*
+ * Symlink inode operations table.  The link-resolution hook is .get_link
+ * or .follow_link depending on which interface is available, and
+ * .put_link is installed only for the cookie and nameidata variants.  No
+ * POSIX ACL hooks are set in this table.
+ */
+const struct inode_operations zpl_symlink_inode_operations = {
+#ifdef HAVE_GENERIC_READLINK
+	.readlink	= generic_readlink,
+#endif
+#if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE)
+	.get_link	= zpl_get_link,
+#elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA)
+	.follow_link	= zpl_follow_link,
+#endif
+#if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA)
+	.put_link	= zpl_put_link,
+#endif
+	.setattr	= zpl_setattr,
+	.getattr	= zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.listxattr	= zpl_xattr_list,
+};
+
+/*
+ * Inode operations table named for special files: attribute,
+ * extended-attribute and POSIX ACL hooks only.
+ */
+const struct inode_operations zpl_special_inode_operations = {
+	.setattr	= zpl_setattr,
+	.getattr	= zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.listxattr	= zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+	.set_acl	= zpl_set_acl,
+#endif
+#if defined(HAVE_GET_ACL)
+	.get_acl	= zpl_get_acl,
+#elif defined(HAVE_CHECK_ACL)
+	.check_acl	= zpl_check_acl,
+#elif defined(HAVE_PERMISSION)
+	.permission	= zpl_permission,
+#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+/*
+ * Dentry operations table; only revalidation is provided.
+ */
+dentry_operations_t zpl_dentry_operations = {
+	.d_revalidate	= zpl_revalidate,
+};
@@ -0,0 +1,426 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ */
+
+
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+
+/*
+ * Allocate an inode for 'sb' with zfs_inode_alloc() and set its iversion
+ * to 1.  A failed allocation trips the VERIFY rather than being returned
+ * to the caller.
+ */
+static struct inode *
+zpl_inode_alloc(struct super_block *sb)
+{
+	struct inode *ip;
+
+	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
+	inode_set_iversion(ip, 1);
+
+	return (ip);
+}
+
+/*
+ * Release an inode with zfs_inode_destroy().  The inode is expected to
+ * have no remaining references (asserted).
+ */
+static void
+zpl_inode_destroy(struct inode *ip)
+{
+	ASSERT(atomic_read(&ip->i_count) == 0);
+	zfs_inode_destroy(ip);
+}
+
+/*
+ * Called from __mark_inode_dirty() to reflect that something in the
+ * inode has changed.  We use it to ensure the znode system attributes
+ * are always strictly up to date with respect to the inode.
+ */
+#ifdef HAVE_DIRTY_INODE_WITH_FLAGS
+static void
+zpl_dirty_inode(struct inode *ip, int flags)
+{
+	fstrans_cookie_t mark = spl_fstrans_mark();
+
+	/* Pass the caller's dirty flags through unchanged. */
+	zfs_dirty_inode(ip, flags);
+	spl_fstrans_unmark(mark);
+}
+#else
+static void
+zpl_dirty_inode(struct inode *ip)
+{
+	fstrans_cookie_t mark = spl_fstrans_mark();
+
+	/* This prototype carries no flags, so 0 is passed. */
+	zfs_dirty_inode(ip, 0);
+	spl_fstrans_unmark(mark);
+}
+#endif /* HAVE_DIRTY_INODE_WITH_FLAGS */
+
+/*
+ * When ->drop_inode() is called its return value indicates if the
+ * inode should be evicted from the inode cache.  If the inode is
+ * unhashed and has no links the default policy is to evict it
+ * immediately.
+ *
+ * Prior to 2.6.36 this eviction was accomplished by the vfs calling
+ * ->delete_inode().  It was ->delete_inode()'s responsibility to
+ * truncate the inode pages and call clear_inode().  The call to
+ * clear_inode() synchronously invalidates all the buffers and
+ * calls ->clear_inode().  It was ->clear_inode()'s responsibility
+ * to clean up any filesystem specific data before freeing the inode.
+ *
+ * This elaborate mechanism was replaced by ->evict_inode() which
+ * does the job of both ->delete_inode() and ->clear_inode().  It
+ * will be called exactly once, and when it returns the inode must
+ * be in a state where it can simply be freed.
+ *
+ * The ->evict_inode() callback must minimally truncate the inode pages,
+ * and call clear_inode().  For 2.6.35 and later kernels this will
+ * simply update the inode state, with the sync occurring before the
+ * truncate in evict().  For earlier kernels clear_inode() maps to
+ * end_writeback() which is responsible for completing all outstanding
+ * write back.  In either case, once this is done it is safe to cleanup
+ * any remaining inode specific data via zfs_inactive().
+ */
+#ifdef HAVE_EVICT_INODE
+/* Truncate the inode's pages, clear the inode, then run zfs_inactive(). */
+static void
+zpl_evict_inode(struct inode *ip)
+{
+	fstrans_cookie_t mark = spl_fstrans_mark();
+
+	truncate_setsize(ip, 0);
+	clear_inode(ip);
+	zfs_inactive(ip);
+
+	spl_fstrans_unmark(mark);
+}
+
+#else
+
+/* Hand the inode straight to generic_delete_inode(). */
+static void
+zpl_drop_inode(struct inode *ip)
+{
+	generic_delete_inode(ip);
+}
+
+/* Run zfs_inactive() on the inode. */
+static void
+zpl_clear_inode(struct inode *ip)
+{
+	fstrans_cookie_t mark = spl_fstrans_mark();
+
+	zfs_inactive(ip);
+
+	spl_fstrans_unmark(mark);
+}
+
+/* Truncate the inode's pages and clear the inode. */
+static void
+zpl_inode_delete(struct inode *ip)
+{
+	truncate_setsize(ip, 0);
+	clear_inode(ip);
+}
+#endif /* HAVE_EVICT_INODE */
+
+/*
+ * ->put_super(): unmount the filesystem with zfs_umount().  The result is
+ * only sanity checked, since the hook returns nothing.
+ */
+static void
+zpl_put_super(struct super_block *sb)
+{
+	fstrans_cookie_t mark = spl_fstrans_mark();
+	int error = -zfs_umount(sb);
+
+	spl_fstrans_unmark(mark);
+	ASSERT3S(error, <=, 0);
+}
+
+/*
+ * ->sync_fs(): sync the filesystem with zfs_sync(), passing 'wait'
+ * through.  Returns 0 or a negative errno.
+ */
+static int
+zpl_sync_fs(struct super_block *sb, int wait)
+{
+	cred_t *cred = CRED();
+	fstrans_cookie_t mark;
+	int error;
+
+	crhold(cred);
+	mark = spl_fstrans_mark();
+	error = -zfs_sync(sb, wait, cred);
+	spl_fstrans_unmark(mark);
+	crfree(cred);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+/*
+ * ->statfs(): fill in 'statp' with zfs_statvfs() and, for 32-bit API
+ * callers, rescale the result so it fits 32-bit counters.  Returns 0 or a
+ * negative errno.
+ */
+static int
+zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
+{
+	fstrans_cookie_t mark = spl_fstrans_mark();
+	int error = -zfs_statvfs(dentry, statp);
+
+	spl_fstrans_unmark(mark);
+	ASSERT3S(error, <=, 0);
+
+	/*
+	 * If required by a 32-bit system call, dynamically scale the
+	 * block size up to 16MiB and decrease the block counts.  This
+	 * allows for a maximum size of 64EiB to be reported.  The file
+	 * counts must be artificially capped at 2^32-1.
+	 */
+	if (unlikely(zpl_is_32bit_api())) {
+		uint64_t used;
+
+		/* Double the block size and halve the counts in step. */
+		while (statp->f_blocks > UINT32_MAX &&
+		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
+			statp->f_bsize <<= 1;
+			statp->f_frsize <<= 1;
+
+			statp->f_bavail >>= 1;
+			statp->f_bfree >>= 1;
+			statp->f_blocks >>= 1;
+		}
+
+		/*
+		 * Keep the used-object count and trim only the free count.
+		 * NOTE(review): 'UINT32_MAX - used' looks like it would wrap
+		 * if more than 2^32-1 objects are in use; confirm.
+		 */
+		used = statp->f_files - statp->f_ffree;
+		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - used);
+		statp->f_files = statp->f_ffree + used;
+	}
+
+	return (error);
+}
+
+/*
+ * ->remount_fs(): remount with the option string 'data' using
+ * zfs_remount().  The dataset name in the mount description is left NULL.
+ * Returns 0 or a negative errno.
+ */
+static int
+zpl_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	zfs_mnt_t mnt = { .mnt_osname = NULL, .mnt_data = data };
+	fstrans_cookie_t mark = spl_fstrans_mark();
+	int error = -zfs_remount(sb, flags, &mnt);
+
+	spl_fstrans_unmark(mark);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+/*
+ * Append the ZFS-specific mount options to 'seq': ",xattr" or ",noxattr"
+ * and, when POSIX ACL support is configured, ",posixacl" or ",noacl".
+ * Always returns 0.
+ */
+static int
+__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
+{
+	const char *xattr_opt;
+
+	if (zfsvfs->z_flags & ZSB_XATTR)
+		xattr_opt = "xattr";
+	else
+		xattr_opt = "noxattr";
+
+	seq_printf(seq, ",%s", xattr_opt);
+
+#ifdef CONFIG_FS_POSIX_ACL
+	/* Any ACL type other than POSIX ACLs is reported as "noacl". */
+	if (zfsvfs->z_acl_type == ZFS_ACLTYPE_POSIXACL)
+		seq_puts(seq, ",posixacl");
+	else
+		seq_puts(seq, ",noacl");
+#endif /* CONFIG_FS_POSIX_ACL */
+
+	return (0);
+}
+
+/*
+ * ->show_options(): find the zfsvfs_t behind the superblock and emit its
+ * options.  The prototype depends on whether the hook receives a dentry
+ * or a vfsmount.
+ */
+#ifdef HAVE_SHOW_OPTIONS_WITH_DENTRY
+static int
+zpl_show_options(struct seq_file *seq, struct dentry *root)
+{
+	zfsvfs_t *zfsvfs = root->d_sb->s_fs_info;
+
+	return (__zpl_show_options(seq, zfsvfs));
+}
+#else
+static int
+zpl_show_options(struct seq_file *seq, struct vfsmount *vfsp)
+{
+	zfsvfs_t *zfsvfs = vfsp->mnt_sb->s_fs_info;
+
+	return (__zpl_show_options(seq, zfsvfs));
+}
+#endif /* HAVE_SHOW_OPTIONS_WITH_DENTRY */
+
+/*
+ * Populate a new superblock by mounting the dataset described by 'data'
+ * (a zfs_mnt_t) with zfs_domount().  Returns 0 or a negative errno.
+ */
+static int
+zpl_fill_super(struct super_block *sb, void *data, int silent)
+{
+	zfs_mnt_t *mnt = data;
+	fstrans_cookie_t mark = spl_fstrans_mark();
+	int error = -zfs_domount(sb, mnt, silent);
+
+	spl_fstrans_unmark(mark);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+/*
+ * Superblock match callback: non-zero when superblock 's' already has a
+ * zfsvfs_t whose objset is the one passed in 'data'.
+ */
+static int
+zpl_test_super(struct super_block *s, void *data)
+{
+	const zfsvfs_t *zfsvfs = s->s_fs_info;
+	const objset_t *wanted = data;
+
+	/* A superblock without a zfsvfs_t never matches. */
+	return (zfsvfs != NULL && zfsvfs->z_os == wanted);
+}
+
+/*
+ * Find or create the superblock for the dataset named in 'zm'.
+ *
+ * The objset is held by name, a superblock is obtained with zpl_sget()
+ * (matching existing ones through zpl_test_super()), and a superblock
+ * without a root is populated via zpl_fill_super().  An already populated
+ * superblock whose read-only state differs from 'flags' is refused with
+ * -EBUSY.  Returns the superblock, or an ERR_PTR on failure.
+ */
+static struct super_block *
+zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
+{
+	struct super_block *s;
+	objset_t *os;
+	int err;
+
+	/* 'err' is a positive errno here, hence the negation below. */
+	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
+	if (err)
+		return (ERR_PTR(-err));
+
+	/*
+	 * The dsl pool lock must be released prior to calling sget().
+	 * It is possible sget() may block on the lock in grab_super()
+	 * while deactivate_super() holds that same lock and waits for
+	 * a txg sync.  If the dsl_pool lock is held over sget()
+	 * this can prevent the pool sync and cause a deadlock.
+	 */
+	dsl_pool_rele(dmu_objset_pool(os), FTAG);
+	s = zpl_sget(fs_type, zpl_test_super, set_anon_super, flags, os);
+	dsl_dataset_rele(dmu_objset_ds(os), FTAG);
+
+	if (IS_ERR(s))
+		return (ERR_CAST(s));
+
+	/* No root yet means this superblock still has to be filled in. */
+	if (s->s_root == NULL) {
+		/* zpl_fill_super() already returns a negative errno. */
+		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
+		if (err) {
+			deactivate_locked_super(s);
+			return (ERR_PTR(err));
+		}
+		s->s_flags |= SB_ACTIVE;
+	} else if ((flags ^ s->s_flags) & SB_RDONLY) {
+		/* Existing mount with a different read-only setting. */
+		deactivate_locked_super(s);
+		return (ERR_PTR(-EBUSY));
+	}
+
+	return (s);
+}
+
+#ifdef HAVE_FST_MOUNT
+/*
+ * Mount callback installed as zpl_fs_type.mount.  Mounts dataset
+ * 'osname' through zpl_mount_impl() and returns dget() of the super
+ * block's root dentry, or the error pointer zpl_mount_impl() produced.
+ */
+static struct dentry *
+zpl_mount(struct file_system_type *fs_type, int flags,
+    const char *osname, void *data)
+{
+	zfs_mnt_t args = { .mnt_osname = osname, .mnt_data = data };
+	struct super_block *s;
+
+	s = zpl_mount_impl(fs_type, flags, &args);
+	if (IS_ERR(s))
+		return (ERR_CAST(s));
+
+	return (dget(s->s_root));
+}
+#else
+/*
+ * Mount callback installed as zpl_fs_type.get_sb.  Mounts dataset
+ * 'osname' through zpl_mount_impl() and attaches the resulting super
+ * block to 'mnt'.  Returns 0 on success or the error encoded in the
+ * pointer returned by zpl_mount_impl().
+ */
+static int
+zpl_get_sb(struct file_system_type *fs_type, int flags,
+    const char *osname, void *data, struct vfsmount *mnt)
+{
+	zfs_mnt_t args = { .mnt_osname = osname, .mnt_data = data };
+	struct super_block *s;
+
+	s = zpl_mount_impl(fs_type, flags, &args);
+	if (IS_ERR(s))
+		return (PTR_ERR(s));
+
+	(void) simple_set_mnt(mnt, s);
+
+	return (0);
+}
+#endif /* HAVE_FST_MOUNT */
+
+/*
+ * kill_sb callback installed in zpl_fs_type.  Runs zfs_preumount() on
+ * the super block before handing it to kill_anon_super().
+ */
+static void
+zpl_kill_sb(struct super_block *sb)
+{
+	zfs_preumount(sb);
+	kill_anon_super(sb);
+
+#ifdef HAVE_S_INSTANCES_LIST_HEAD
+	/*
+	 * Re-point the super block's s_instances.next at this file
+	 * system's fs_supers list head once kill_anon_super() returns.
+	 * NOTE(review): presumably this keeps a concurrent walker of
+	 * fs_supers from following a stale link -- confirm against the
+	 * kernels that define HAVE_S_INSTANCES_LIST_HEAD.
+	 */
+	sb->s_instances.next = &(zpl_fs_type.fs_supers);
+#endif /* HAVE_S_INSTANCES_LIST_HEAD */
+}
+
+/*
+ * Prune callback: 'arg' is the super block to prune and 'nr_to_scan'
+ * is forwarded unchanged to zfs_prune().  Both the zfs_prune() return
+ * value and the object count it reports are discarded.
+ */
+void
+zpl_prune_sb(int64_t nr_to_scan, void *arg)
+{
+	struct super_block *sb = arg;
+	int nr_objects = 0;
+
+	/* Nothing is reported back to the caller. */
+	(void) zfs_prune(sb, nr_to_scan, &nr_objects);
+}
+
+#ifdef HAVE_NR_CACHED_OBJECTS
+/*
+ * nr_cached_objects hook for zpl_super_operations.  This stub ignores
+ * 'sb' and always reports zero objects.
+ */
+static int
+zpl_nr_cached_objects(struct super_block *sb)
+{
+	return (0);
+}
+#endif /* HAVE_NR_CACHED_OBJECTS */
+
+#ifdef HAVE_FREE_CACHED_OBJECTS
+/*
+ * free_cached_objects hook for zpl_super_operations.  Intentionally
+ * empty: both 'sb' and 'nr_to_scan' are ignored.
+ */
+static void
+zpl_free_cached_objects(struct super_block *sb, int nr_to_scan)
+{
+	/* noop */
+}
+#endif /* HAVE_FREE_CACHED_OBJECTS */
+
+/*
+ * Super block operations table for ZFS.
+ *
+ * Inode teardown uses evict_inode when HAVE_EVICT_INODE is defined and
+ * the drop_inode/clear_inode/delete_inode trio otherwise.  write_inode
+ * and show_stats are explicitly left NULL.  The cached-object hooks
+ * are only wired in when the corresponding HAVE_* macro is defined.
+ */
+const struct super_operations zpl_super_operations = {
+	.alloc_inode		= zpl_inode_alloc,
+	.destroy_inode		= zpl_inode_destroy,
+	.dirty_inode		= zpl_dirty_inode,
+	.write_inode		= NULL,
+#ifdef HAVE_EVICT_INODE
+	.evict_inode		= zpl_evict_inode,
+#else
+	.drop_inode		= zpl_drop_inode,
+	.clear_inode		= zpl_clear_inode,
+	.delete_inode		= zpl_inode_delete,
+#endif /* HAVE_EVICT_INODE */
+	.put_super		= zpl_put_super,
+	.sync_fs		= zpl_sync_fs,
+	.statfs			= zpl_statfs,
+	.remount_fs		= zpl_remount_fs,
+	.show_options		= zpl_show_options,
+	.show_stats		= NULL,
+#ifdef HAVE_NR_CACHED_OBJECTS
+	.nr_cached_objects	= zpl_nr_cached_objects,
+#endif /* HAVE_NR_CACHED_OBJECTS */
+#ifdef HAVE_FREE_CACHED_OBJECTS
+	.free_cached_objects	= zpl_free_cached_objects,
+#endif /* HAVE_FREE_CACHED_OBJECTS */
+};
+
+/*
+ * File system type descriptor named ZFS_DRIVER.  The mount entry point
+ * is .mount (zpl_mount) when HAVE_FST_MOUNT is defined and .get_sb
+ * (zpl_get_sb) otherwise; super block teardown goes through
+ * zpl_kill_sb().
+ */
+struct file_system_type zpl_fs_type = {
+	.owner			= THIS_MODULE,
+	.name			= ZFS_DRIVER,
+#ifdef HAVE_FST_MOUNT
+	.mount			= zpl_mount,
+#else
+	.get_sb			= zpl_get_sb,
+#endif /* HAVE_FST_MOUNT */
+	.kill_sb		= zpl_kill_sb,
+};