Linux 5.0 compat: SIMD compatibility

Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS, and 5.0 and newer kernels. This commit squashes the following commits from master into a single commit which can be applied to 0.8.2: 10fa2545 - Linux 4.14, 4.19, 5.0+ compat: SIMD save/restore; b88ca2ac - Enable SIMD for encryption; 095b5412 - Fix CONFIG_X86_DEBUG_FPU build failure; e5db3134 - Linux 5.0 compat: SIMD compatibility. Reviewed-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> TEST_ZIMPORT_SKIP="yes"
2025-08-01 20:47:39 +03:00 · 2019-07-12 09:31:20 -07:00 · 2019-07-12 09:31:20 -07:00 · 62c034f6d4
commit 62c034f6d4
parent 988b040476
30 changed files with 548 additions and 206 deletions
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@ -107,6 +107,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_raidz.h>
 #include <sys/vdev_trim.h>
 #include <sys/spa_impl.h>
 #include <sys/metaslab_impl.h>
@ -7110,6 +7111,8 @@ ztest_run(ztest_shared_t *zs)
 	metaslab_preload_limit = ztest_random(20) + 1;
 	ztest_spa = spa;

+	VERIFY0(vdev_raidz_impl_set("cycle"));
+
 	dmu_objset_stats_t dds;
 	VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
 	    DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os));
--- a/config/kernel-fpu.m4
+++ b/config/kernel-fpu.m4
@ -2,8 +2,9 @@ dnl #
 dnl # Handle differences in kernel FPU code.
 dnl #
 dnl # Kernel
-dnl # 5.0:	All kernel fpu functions are GPL only, so we can't use them.
-dnl #		(nothing defined)
+dnl # 5.0:	Wrappers have been introduced to save/restore the FPU state.
+dnl #		This change was made to the 4.19.38 and 4.14.120 LTS kernels.
+dnl #		HAVE_KERNEL_FPU_INTERNAL
 dnl #
 dnl # 4.2:	Use __kernel_fpu_{begin,end}()
 dnl #		HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU
@ -12,7 +13,11 @@ dnl # Pre-4.2:	Use kernel_fpu_{begin,end}()
 dnl #		HAVE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_FPU], [
-	AC_MSG_CHECKING([which kernel_fpu header to use])
+	dnl #
+	dnl # N.B. The header check is performed before all other checks since
+	dnl # they depend on HAVE_KERNEL_FPU_API_HEADER being set in confdefs.h.
+	dnl #
+	AC_MSG_CHECKING([whether fpu headers are available])
 	ZFS_LINUX_TRY_COMPILE([
 		#include <linux/module.h>
 		#include <asm/fpu/api.h>
@ -25,9 +30,13 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
 		AC_MSG_RESULT(i387.h & xcr.h)
 	])

-	AC_MSG_CHECKING([which kernel_fpu function to use])
+	dnl #
+	dnl # Legacy kernel
+	dnl #
+	AC_MSG_CHECKING([whether kernel fpu is available])
 	ZFS_LINUX_TRY_COMPILE_SYMBOL([
 		#include <linux/module.h>
+		#include <linux/types.h>
 		#ifdef HAVE_KERNEL_FPU_API_HEADER
 		#include <asm/fpu/api.h>
 		#else
@ -45,8 +54,12 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
 		AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
 		    [kernel exports FPU functions])
 	],[
+		dnl #
+		dnl # Linux 4.2 kernel
+		dnl #
 		ZFS_LINUX_TRY_COMPILE_SYMBOL([
 			#include <linux/module.h>
+			#include <linux/types.h>
 			#ifdef HAVE_KERNEL_FPU_API_HEADER
 			#include <asm/fpu/api.h>
 			#else
@ -57,12 +70,60 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
 		],[
 			__kernel_fpu_begin();
 			__kernel_fpu_end();
-		], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [
+		], [__kernel_fpu_begin],
+		[arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [
 			AC_MSG_RESULT(__kernel_fpu_*)
-			AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, [kernel has __kernel_fpu_* functions])
-			AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions])
+			AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1,
+			    [kernel has __kernel_fpu_* functions])
+			AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
+			    [kernel exports FPU functions])
 		],[
-			AC_MSG_RESULT(not exported)
+			ZFS_LINUX_TRY_COMPILE([
+				#include <linux/module.h>
+
+				#if defined(__x86_64) || defined(__x86_64__) || \
+				    defined(__i386) || defined(__i386__)
+				#if !defined(__x86)
+				#define __x86
+				#endif
+				#endif
+
+				#if !defined(__x86)
+				#error Unsupported architecture
+				#endif
+
+				#include <linux/types.h>
+				#ifdef HAVE_KERNEL_FPU_API_HEADER
+				#include <asm/fpu/api.h>
+				#include <asm/fpu/internal.h>
+				#else
+				#include <asm/i387.h>
+				#include <asm/xcr.h>
+				#endif
+
+				#if !defined(XSTATE_XSAVE)
+				#error XSTATE_XSAVE not defined
+				#endif
+
+				#if !defined(XSTATE_XRESTORE)
+				#error XSTATE_XRESTORE not defined
+				#endif
+			],[
+				struct fpu *fpu = &current->thread.fpu;
+				union fpregs_state *st = &fpu->state;
+				struct fregs_state *fr __attribute__ ((unused)) =
+				    &st->fsave;
+				struct fxregs_state *fxr __attribute__ ((unused)) =
+				    &st->fxsave;
+				struct xregs_state *xr __attribute__ ((unused)) =
+				    &st->xsave;
+			], [
+				AC_MSG_RESULT(internal)
+				AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1,
+				    [kernel fpu internal])
+			],[
+				AC_MSG_RESULT(unavailable)
+			])
 		])
 	])
 ])
--- a/include/linux/Makefile.am
+++ b/include/linux/Makefile.am
@ -7,6 +7,7 @@ KERNEL_H = \
 	$(top_srcdir)/include/linux/blkdev_compat.h \
 	$(top_srcdir)/include/linux/utsname_compat.h \
 	$(top_srcdir)/include/linux/kmap_compat.h \
+	$(top_srcdir)/include/linux/simd.h \
 	$(top_srcdir)/include/linux/simd_x86.h \
 	$(top_srcdir)/include/linux/simd_aarch64.h \
 	$(top_srcdir)/include/linux/mod_compat.h \
--- a/include/linux/simd.h
+++ b/include/linux/simd.h
@ -0,0 +1,42 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2019 Lawrence Livermore National Security, LLC.
+ */
+
+#ifndef _SIMD_H
+#define	_SIMD_H
+
+#if defined(__x86)
+#include <linux/simd_x86.h>
+
+#elif defined(__aarch64__)
+#include <linux/simd_aarch64.h>
+#else
+
+/*
+ * Neither __x86 nor __aarch64__ is defined: report SIMD as not allowed
+ * (kfpu_allowed() is 0) and make the remaining kfpu_* operations no-ops.
+ */
+#define	kfpu_allowed()		0
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+#define	kfpu_init()		0
+#define	kfpu_fini()		((void) 0)
+
+#endif
+#endif /* _SIMD_H */
--- a/include/linux/simd_aarch64.h
+++ b/include/linux/simd_aarch64.h
@ -27,9 +27,10 @@
 *
 * Kernel fpu methods:
 *	kfpu_allowed()
- *	kfpu_initialize()
 *	kfpu_begin()
 *	kfpu_end()
+ *	kfpu_init()
+ *	kfpu_fini()
 */

 #ifndef _SIMD_AARCH64_H
@ -43,20 +44,20 @@

 #if defined(_KERNEL)
 #include <asm/neon.h>
-#define	kfpu_begin()		\
-{					\
-	kernel_neon_begin();		\
-}
-#define	kfpu_end()			\
-{					\
-	kernel_neon_end();		\
-}
+#define	kfpu_allowed()		1
+#define	kfpu_begin()		kernel_neon_begin()
+#define	kfpu_end()		kernel_neon_end()
+#define	kfpu_init()		0
+#define	kfpu_fini()		((void) 0)
 #else
 /*
 * fpu dummy methods for userspace
 */
-#define	kfpu_begin() 	do {} while (0)
-#define	kfpu_end() 		do {} while (0)
+#define	kfpu_allowed()		1
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+#define	kfpu_init()		0
+#define	kfpu_fini()		((void) 0)
 #endif /* defined(_KERNEL) */

 #endif /* __aarch64__ */
--- a/include/linux/simd_x86.h
+++ b/include/linux/simd_x86.h
@ -27,9 +27,10 @@
 *
 * Kernel fpu methods:
 *	kfpu_allowed()
- *	kfpu_initialize()
 *	kfpu_begin()
 *	kfpu_end()
+ *	kfpu_init()
+ *	kfpu_fini()
 *
 * SIMD support:
 *
@ -84,6 +85,15 @@

 #if defined(_KERNEL)

+/*
+ * Disable the WARN_ON_FPU() macro to prevent additional dependencies
+ * when providing the kfpu_* functions.  Relevant warnings are included
+ * as appropriate and are unconditionally enabled.
+ */
+#if defined(CONFIG_X86_DEBUG_FPU) && !defined(KERNEL_EXPORTS_X86_FPU)
+#undef CONFIG_X86_DEBUG_FPU
+#endif
+
 #if defined(HAVE_KERNEL_FPU_API_HEADER)
 #include <asm/fpu/api.h>
 #include <asm/fpu/internal.h>
@ -92,33 +102,231 @@
 #include <asm/xcr.h>
 #endif

+/*
+ * The following cases are for kernels which export either the
+ * kernel_fpu_* or __kernel_fpu_* functions.
+ */
+#if defined(KERNEL_EXPORTS_X86_FPU)
+
+#define	kfpu_allowed()		1
+#define	kfpu_init()		0
+#define	kfpu_fini()		((void) 0)
+
 #if defined(HAVE_UNDERSCORE_KERNEL_FPU)
 #define	kfpu_begin()		\
-{							\
-	preempt_disable();		\
+{				\
+	preempt_disable();	\
 	__kernel_fpu_begin();	\
 }
-#define	kfpu_end()			\
-{							\
-	__kernel_fpu_end();		\
-	preempt_enable();		\
+#define	kfpu_end()		\
+{				\
+	__kernel_fpu_end();	\
+	preempt_enable();	\
 }
+
 #elif defined(HAVE_KERNEL_FPU)
-#define	kfpu_begin()	kernel_fpu_begin()
+#define	kfpu_begin()		kernel_fpu_begin()
 #define	kfpu_end()		kernel_fpu_end()
-#else
-/* Kernel doesn't export any kernel_fpu_* functions */
-#include <asm/fpu/internal.h>	/* For kernel xgetbv() */
-#define	kfpu_begin() 	panic("This code should never run")
-#define	kfpu_end() 	panic("This code should never run")
-#endif /* defined(HAVE_KERNEL_FPU) */

 #else
 /*
- * fpu dummy methods for userspace
+ * This case is unreachable.  When KERNEL_EXPORTS_X86_FPU is defined then
+ * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined.
 */
-#define	kfpu_begin() 	do {} while (0)
-#define	kfpu_end() 		do {} while (0)
+#error "Unreachable kernel configuration"
+#endif
+
+#else /* defined(KERNEL_EXPORTS_X86_FPU) */
+
+/*
+ * When the kernel_fpu_* symbols are unavailable then provide our own
+ * versions which allow the FPU to be safely used.
+ */
+#if defined(HAVE_KERNEL_FPU_INTERNAL)
+
+extern union fpregs_state **zfs_kfpu_fpregs;
+
+/*
+ * Free the per-cpu buffers used to store FPU state, along with the
+ * pointer array itself.  Does nothing when the array is not allocated,
+ * and clears the global pointer afterwards so a repeated call is harmless.
+ */
+static inline void
+kfpu_fini(void)
+{
+	int cpu;
+
+	if (zfs_kfpu_fpregs == NULL)
+		return;
+
+	/* kfree(NULL) is a no-op, so unallocated slots need no check. */
+	for_each_possible_cpu(cpu) {
+		kfree(zfs_kfpu_fpregs[cpu]);
+	}
+
+	kfree(zfs_kfpu_fpregs);
+	zfs_kfpu_fpregs = NULL;
+}
+
+/*
+ * Allocate the zfs_kfpu_fpregs pointer array and one zeroed
+ * union fpregs_state per possible CPU, each on that CPU's memory node.
+ *
+ * Returns 0 on success.  Returns -ENOMEM if any allocation fails, in
+ * which case everything allocated so far is released via kfpu_fini().
+ */
+static inline int
+kfpu_init(void)
+{
+	int cpu;
+
+	/*
+	 * The array is indexed by CPU id, so size it by nr_cpu_ids (the
+	 * highest possible id plus one) rather than num_possible_cpus(),
+	 * which is smaller whenever the possible mask is sparse.  kcalloc()
+	 * zeroes the array and checks the size multiplication for overflow.
+	 */
+	zfs_kfpu_fpregs = kcalloc(nr_cpu_ids, sizeof (*zfs_kfpu_fpregs),
+	    GFP_KERNEL);
+	if (zfs_kfpu_fpregs == NULL)
+		return (-ENOMEM);
+
+	for_each_possible_cpu(cpu) {
+		zfs_kfpu_fpregs[cpu] = kmalloc_node(sizeof (union fpregs_state),
+		    GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu));
+		if (zfs_kfpu_fpregs[cpu] == NULL) {
+			/* Slots not yet allocated are still NULL. */
+			kfpu_fini();
+			return (-ENOMEM);
+		}
+	}
+
+	return (0);
+}
+
+#define	kfpu_allowed()		1
+#define	ex_handler_fprestore	ex_handler_default
+
+/*
+ * FPU save and restore instructions.
+ *
+ * Each wrapper expands to one volatile inline-asm statement.  The save
+ * variants take *(addr) as a memory output operand; the restore variants
+ * take it as a memory input operand.
+ *
+ * NOTE(review): unlike the other wrappers, the kfpu_fxsr_clean() expansion
+ * already ends in a semicolon, so "kfpu_fxsr_clean(x);" yields an extra
+ * empty statement.  Harmless at the visible call site, but it would break
+ * an unbraced if/else.
+ */
+#define	__asm			__asm__ __volatile__
+#define	kfpu_fxsave(addr)	__asm("fxsave %0" : "=m" (*(addr)))
+#define	kfpu_fxsaveq(addr)	__asm("fxsaveq %0" : "=m" (*(addr)))
+#define	kfpu_fnsave(addr)	__asm("fnsave %0; fwait" : "=m" (*(addr)))
+#define	kfpu_fxrstor(addr)	__asm("fxrstor %0" : : "m" (*(addr)))
+#define	kfpu_fxrstorq(addr)	__asm("fxrstorq %0" : : "m" (*(addr)))
+#define	kfpu_frstor(addr)	__asm("frstor %0" : : "m" (*(addr)))
+#define	kfpu_fxsr_clean(rval)	__asm("fnclex; emms; fildl %P[addr]" \
+				    : : [addr] "m" (rval));
+
+/*
+ * Invoke the kernel's XSTATE_XSAVE() macro on addr, passing the 64-bit
+ * mask as two 32-bit halves.  Warns once if the macro sets err nonzero.
+ */
+static inline void
+kfpu_save_xsave(struct xregs_state *addr, uint64_t mask)
+{
+	uint32_t low, hi;
+	int err;
+
+	low = mask;		/* lower 32 bits of the mask */
+	hi = mask >> 32;	/* upper 32 bits of the mask */
+	XSTATE_XSAVE(addr, low, hi, err);
+	WARN_ON_ONCE(err);
+}
+
+/*
+ * Save FPU state into addr with fxsave when built for CONFIG_X86_32,
+ * and with fxsaveq otherwise.
+ */
+static inline void
+kfpu_save_fxsr(struct fxregs_state *addr)
+{
+	if (IS_ENABLED(CONFIG_X86_32))
+		kfpu_fxsave(addr);
+	else
+		kfpu_fxsaveq(addr);
+}
+
+/*
+ * Save FPU state into addr using the fnsave instruction (followed by
+ * fwait, see kfpu_fnsave()).  Used by kfpu_begin() when neither XSAVE
+ * nor FXSR is available.
+ */
+static inline void
+kfpu_save_fsave(struct fregs_state *addr)
+{
+	kfpu_fnsave(addr);
+}
+
+/*
+ * Begin a region in which the FPU may be used.  Disables preemption and
+ * local interrupts, then saves the current FPU registers into this CPU's
+ * entry of zfs_kfpu_fpregs using XSAVE, FXSR, or fnsave depending on the
+ * CPU features.  Must be paired with kfpu_end().
+ *
+ * NOTE(review): interrupts are disabled here and unconditionally
+ * re-enabled in kfpu_end(); the previous interrupt state is not saved.
+ * Confirm no caller enters with interrupts already disabled.
+ */
+static inline void
+kfpu_begin(void)
+{
+	/*
+	 * Preemption and interrupts must be disabled for the critical
+	 * region where the FPU state is being modified.
+	 */
+	preempt_disable();
+	local_irq_disable();
+
+	/*
+	 * The current FPU registers need to be preserved by kfpu_begin()
+	 * and restored by kfpu_end().  They are stored in a dedicated
+	 * per-cpu variable, not in the task struct; this allows any user
+	 * FPU state to be correctly preserved and restored.
+	 */
+	union fpregs_state *state = zfs_kfpu_fpregs[smp_processor_id()];
+
+	if (static_cpu_has(X86_FEATURE_XSAVE)) {
+		kfpu_save_xsave(&state->xsave, ~0);
+	} else if (static_cpu_has(X86_FEATURE_FXSR)) {
+		kfpu_save_fxsr(&state->fxsave);
+	} else {
+		kfpu_save_fsave(&state->fsave);
+	}
+}
+
+/*
+ * Invoke the kernel's XSTATE_XRESTORE() macro on addr, passing the
+ * 64-bit mask as two 32-bit halves.  Counterpart of kfpu_save_xsave().
+ */
+static inline void
+kfpu_restore_xsave(struct xregs_state *addr, uint64_t mask)
+{
+	uint32_t low, hi;
+
+	low = mask;		/* lower 32 bits of the mask */
+	hi = mask >> 32;	/* upper 32 bits of the mask */
+	XSTATE_XRESTORE(addr, low, hi);
+}
+
+/*
+ * Restore FPU state from addr with fxrstor when built for CONFIG_X86_32,
+ * and with fxrstorq otherwise.  On CPUs flagged with X86_BUG_FXSAVE_LEAK
+ * the x87 state is cleaned first via kfpu_fxsr_clean().
+ */
+static inline void
+kfpu_restore_fxsr(struct fxregs_state *addr)
+{
+	/*
+	 * On AuthenticAMD K7 and K8 processors the fxrstor instruction only
+	 * restores the _x87 FOP, FIP, and FDP registers when an exception
+	 * is pending.  Clean the _x87 state to force the restore.
+	 */
+	if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK)))
+		kfpu_fxsr_clean(addr);
+
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		kfpu_fxrstor(addr);
+	} else {
+		kfpu_fxrstorq(addr);
+	}
+}
+
+/*
+ * Restore FPU state from addr using the frstor instruction.  Used by
+ * kfpu_end() when neither XSAVE nor FXSR is available.
+ */
+static inline void
+kfpu_restore_fsave(struct fregs_state *addr)
+{
+	kfpu_frstor(addr);
+}
+
+/*
+ * End a region started by kfpu_begin().  Reloads the FPU registers from
+ * this CPU's entry of zfs_kfpu_fpregs, using the same feature-based
+ * choice of XSAVE, FXSR, or frstor, then re-enables local interrupts and
+ * preemption in the reverse order of kfpu_begin().
+ *
+ * NOTE(review): local_irq_enable() is unconditional, so interrupts end up
+ * enabled even if they were disabled before kfpu_begin() was called.
+ * Confirm no caller depends on the prior interrupt state.
+ */
+static inline void
+kfpu_end(void)
+{
+	union fpregs_state *state = zfs_kfpu_fpregs[smp_processor_id()];
+
+	if (static_cpu_has(X86_FEATURE_XSAVE)) {
+		kfpu_restore_xsave(&state->xsave, ~0);
+	} else if (static_cpu_has(X86_FEATURE_FXSR)) {
+		kfpu_restore_fxsr(&state->fxsave);
+	} else {
+		kfpu_restore_fsave(&state->fsave);
+	}
+
+	local_irq_enable();
+	preempt_enable();
+}
+
+#else
+
+/*
+ * FPU support is unavailable.
+ */
+#define	kfpu_allowed()		0
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+#define	kfpu_init()		0
+#define	kfpu_fini()		((void) 0)
+
+#endif /* defined(HAVE_KERNEL_FPU_INTERNAL) */
+#endif /* defined(KERNEL_EXPORTS_X86_FPU) */
+
+#else /* defined(_KERNEL) */
+/*
+ * FPU dummy methods for user space.
+ */
+#define	kfpu_allowed()		1
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
 #endif /* defined(_KERNEL) */

 /*
@ -289,7 +497,6 @@ CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ);

 #endif /* !defined(_KERNEL) */

-
 /*
 * Detect register set support
 */
@ -300,7 +507,7 @@ __simd_state_enabled(const uint64_t state)
 	uint64_t xcr0;

 #if defined(_KERNEL)
-#if defined(X86_FEATURE_OSXSAVE) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_OSXSAVE)
 	has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE);
 #else
 	has_osxsave = B_FALSE;
@ -330,11 +537,7 @@ static inline boolean_t
 zfs_sse_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse());
 #endif
@ -347,11 +550,7 @@ static inline boolean_t
 zfs_sse2_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM2));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse2());
 #endif
@ -364,11 +563,7 @@ static inline boolean_t
 zfs_sse3_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM3));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse3());
 #endif
@ -381,11 +576,7 @@ static inline boolean_t
 zfs_ssse3_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_SSSE3));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_ssse3());
 #endif
@ -398,11 +589,7 @@ static inline boolean_t
 zfs_sse4_1_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM4_1));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse4_1());
 #endif
@ -415,11 +602,7 @@ static inline boolean_t
 zfs_sse4_2_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM4_2));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse4_2());
 #endif
@ -433,11 +616,7 @@ zfs_avx_available(void)
 {
 	boolean_t has_avx;
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	has_avx = !!boot_cpu_has(X86_FEATURE_AVX);
-#else
-	has_avx = B_FALSE;
-#endif
 #elif !defined(_KERNEL)
 	has_avx = __cpuid_has_avx();
 #endif
@ -453,11 +632,7 @@ zfs_avx2_available(void)
 {
 	boolean_t has_avx2;
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX2) && defined(KERNEL_EXPORTS_X86_FPU)
 	has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2);
-#else
-	has_avx2 = B_FALSE;
-#endif
 #elif !defined(_KERNEL)
 	has_avx2 = __cpuid_has_avx2();
 #endif
@ -472,7 +647,7 @@ static inline boolean_t
 zfs_bmi1_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_BMI1) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_BMI1)
 	return (!!boot_cpu_has(X86_FEATURE_BMI1));
 #else
 	return (B_FALSE);
@ -489,7 +664,7 @@ static inline boolean_t
 zfs_bmi2_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_BMI2) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_BMI2)
 	return (!!boot_cpu_has(X86_FEATURE_BMI2));
 #else
 	return (B_FALSE);
@ -506,7 +681,7 @@ static inline boolean_t
 zfs_aes_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AES) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AES)
 	return (!!boot_cpu_has(X86_FEATURE_AES));
 #else
 	return (B_FALSE);
@ -523,7 +698,7 @@ static inline boolean_t
 zfs_pclmulqdq_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_PCLMULQDQ) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_PCLMULQDQ)
 	return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ));
 #else
 	return (B_FALSE);
@ -557,7 +732,7 @@ zfs_avx512f_available(void)
 	boolean_t has_avx512 = B_FALSE;

 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512F) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512F)
 	has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F);
 #else
 	has_avx512 = B_FALSE;
@ -576,7 +751,7 @@ zfs_avx512cd_available(void)
 	boolean_t has_avx512 = B_FALSE;

 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512CD) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512CD)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512CD);
 #else
@ -596,7 +771,7 @@ zfs_avx512er_available(void)
 	boolean_t has_avx512 = B_FALSE;

 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512ER) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512ER)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512ER);
 #else
@ -616,7 +791,7 @@ zfs_avx512pf_available(void)
 	boolean_t has_avx512 = B_FALSE;

 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512PF) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512PF)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512PF);
 #else
@ -636,7 +811,7 @@ zfs_avx512bw_available(void)
 	boolean_t has_avx512 = B_FALSE;

 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512BW) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512BW)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512BW);
 #else
@ -656,7 +831,7 @@ zfs_avx512dq_available(void)
 	boolean_t has_avx512 = B_FALSE;

 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512DQ) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512DQ)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512DQ);
 #else
@ -676,7 +851,7 @@ zfs_avx512vl_available(void)
 	boolean_t has_avx512 = B_FALSE;

 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512VL) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512VL)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512VL);
 #else
@ -696,7 +871,7 @@ zfs_avx512ifma_available(void)
 	boolean_t has_avx512 = B_FALSE;

 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512IFMA) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512IFMA)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512IFMA);
 #else
@ -716,7 +891,7 @@ zfs_avx512vbmi_available(void)
 	boolean_t has_avx512 = B_FALSE;

 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512VBMI) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512VBMI)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512VBMI);
 #else
--- a/include/sys/vdev_raidz.h
+++ b/include/sys/vdev_raidz.h
@ -51,7 +51,7 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
 */
 void vdev_raidz_math_init(void);
 void vdev_raidz_math_fini(void);
-struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
+const struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
 int vdev_raidz_math_generate(struct raidz_map *);
 int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *,
    const int);
--- a/include/sys/vdev_raidz_impl.h
+++ b/include/sys/vdev_raidz_impl.h
@ -126,7 +126,7 @@ typedef struct raidz_map {
 	uintptr_t rm_reports;		/* # of referencing checksum reports */
 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
-	raidz_impl_ops_t *rm_ops;	/* RAIDZ math operations */
+	const raidz_impl_ops_t *rm_ops;	/* RAIDZ math operations */
 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
 } raidz_map_t;

--- a/module/icp/algs/aes/aes_impl.c
+++ b/module/icp/algs/aes/aes_impl.c
@ -27,6 +27,7 @@
 #include <sys/crypto/spi.h>
 #include <modes/modes.h>
 #include <aes/aes_impl.h>
+#include <linux/simd.h>

 /*
 * Initialize AES encryption and decryption key schedules.
@ -40,9 +41,9 @@
 void
 aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
 {
-	aes_impl_ops_t	*ops = aes_impl_get_ops();
-	aes_key_t	*newbie = keysched;
-	uint_t		keysize, i, j;
+	const aes_impl_ops_t *ops = aes_impl_get_ops();
+	aes_key_t *newbie = keysched;
+	uint_t keysize, i, j;
 	union {
 		uint64_t	ka64[4];
 		uint32_t	ka32[8];
@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0;
 static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)];

 /*
- * Selects the aes operations for encrypt/decrypt/key setup
+ * Returns the AES operations for encrypt/decrypt/key setup.  When a
+ * SIMD implementation is not allowed in the current context, then
+ * fall back to the fastest generic implementation.
 */
-aes_impl_ops_t *
-aes_impl_get_ops()
+const aes_impl_ops_t *
+aes_impl_get_ops(void)
 {
-	aes_impl_ops_t *ops = NULL;
+	if (!kfpu_allowed())
+		return (&aes_generic_impl);
+
+	const aes_impl_ops_t *ops = NULL;
 	const uint32_t impl = AES_IMPL_READ(icp_aes_impl);

 	switch (impl) {
@ -266,15 +272,13 @@ aes_impl_get_ops()
 		ops = &aes_fastest_impl;
 		break;
 	case IMPL_CYCLE:
-	{
+		/* Cycle through supported implementations */
 		ASSERT(aes_impl_initialized);
 		ASSERT3U(aes_supp_impl_cnt, >, 0);
-		/* Cycle through supported implementations */
 		static size_t cycle_impl_idx = 0;
 		size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
 		ops = aes_supp_impl[idx];
-	}
-	break;
+		break;
 	default:
 		ASSERT3U(impl, <, aes_supp_impl_cnt);
 		ASSERT3U(aes_supp_impl_cnt, >, 0);
@ -288,13 +292,16 @@ aes_impl_get_ops()
 	return (ops);
 }

+/*
+ * Initialize all supported implementations.
+ */
 void
 aes_impl_init(void)
 {
 	aes_impl_ops_t *curr_impl;
 	int i, c;

-	/* move supported impl into aes_supp_impls */
+	/* Move supported implementations into aes_supp_impls */
 	for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) {
 		curr_impl = (aes_impl_ops_t *)aes_all_impl[i];

--- a/module/icp/algs/aes/aes_impl_aesni.c
+++ b/module/icp/algs/aes/aes_impl_aesni.c
@ -108,7 +108,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
 static boolean_t
 aes_aesni_will_work(void)
 {
-	return (zfs_aes_available());
+	return (kfpu_allowed() && zfs_aes_available());
 }

 const aes_impl_ops_t aes_aesni_impl = {
--- a/module/icp/algs/modes/gcm.c
+++ b/module/icp/algs/modes/gcm.c
@ -29,6 +29,7 @@
 #include <sys/crypto/impl.h>
 #include <sys/byteorder.h>
 #include <modes/gcm_impl.h>
+#include <linux/simd.h>

 #define	GHASH(c, d, t, o) \
 	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
@ -46,7 +47,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	size_t remainder = length;
 	size_t need = 0;
 	uint8_t *datap = (uint8_t *)data;
@ -168,7 +169,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
 	uint8_t *ghash, *macp = NULL;
 	int i, rv;
@ -320,7 +321,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	size_t pt_len;
 	size_t remainder;
 	uint8_t *ghash;
@ -427,7 +428,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	uint8_t *cb;
 	ulong_t remainder = iv_len;
 	ulong_t processed = 0;
@ -481,7 +482,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	uint8_t *ghash, *datap, *authp;
 	size_t remainder, processed;

@ -660,12 +661,17 @@ static size_t gcm_supp_impl_cnt = 0;
 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];

 /*
- * Selects the gcm operation
+ * Returns the GCM operations for encrypt/decrypt/key setup.  When a
+ * SIMD implementation is not allowed in the current context, then
+ * fall back to the fastest generic implementation.
 */
-gcm_impl_ops_t *
+const gcm_impl_ops_t *
 gcm_impl_get_ops()
 {
-	gcm_impl_ops_t *ops = NULL;
+	if (!kfpu_allowed())
+		return (&gcm_generic_impl);
+
+	const gcm_impl_ops_t *ops = NULL;
 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

 	switch (impl) {
@ -674,15 +680,13 @@ gcm_impl_get_ops()
 		ops = &gcm_fastest_impl;
 		break;
 	case IMPL_CYCLE:
-	{
+		/* Cycle through supported implementations */
 		ASSERT(gcm_impl_initialized);
 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
-		/* Cycle through supported implementations */
 		static size_t cycle_impl_idx = 0;
 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
 		ops = gcm_supp_impl[idx];
-	}
-	break;
+		break;
 	default:
 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
@ -696,13 +700,16 @@ gcm_impl_get_ops()
 	return (ops);
 }

+/*
+ * Initialize all supported implementations.
+ */
 void
 gcm_impl_init(void)
 {
 	gcm_impl_ops_t *curr_impl;
 	int i, c;

-	/* move supported impl into aes_supp_impls */
+	/* Move supported implementations into gcm_supp_impls */
 	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
 		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];

@ -711,7 +718,10 @@ gcm_impl_init(void)
 	}
 	gcm_supp_impl_cnt = c;

-	/* set fastest implementation. assume hardware accelerated is fastest */
+	/*
+	 * Set the fastest implementation given the assumption that the
+	 * hardware accelerated version is the fastest.
+	 */
 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
 	if (gcm_pclmulqdq_impl.is_supported()) {
 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
--- a/module/icp/algs/modes/gcm_pclmulqdq.c
+++ b/module/icp/algs/modes/gcm_pclmulqdq.c
@ -52,7 +52,7 @@ gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
 static boolean_t
 gcm_pclmulqdq_will_work(void)
 {
-	return (zfs_pclmulqdq_available());
+	return (kfpu_allowed() && zfs_pclmulqdq_available());
 }

 const gcm_impl_ops_t gcm_pclmulqdq_impl = {
--- a/module/icp/include/aes/aes_impl.h
+++ b/module/icp/include/aes/aes_impl.h
@ -201,9 +201,9 @@ extern const aes_impl_ops_t aes_aesni_impl;
 void aes_impl_init(void);

 /*
- * Get selected aes implementation
+ * Returns optimal allowed AES implementation
 */
-struct aes_impl_ops *aes_impl_get_ops(void);
+const struct aes_impl_ops *aes_impl_get_ops(void);

 #ifdef	__cplusplus
 }
--- a/module/icp/include/modes/gcm_impl.h
+++ b/module/icp/include/modes/gcm_impl.h
@ -64,9 +64,9 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl;
 void gcm_impl_init(void);

 /*
- * Get selected aes implementation
+ * Returns optimal allowed GCM implementation
 */
-struct gcm_impl_ops *gcm_impl_get_ops(void);
+const struct gcm_impl_ops *gcm_impl_get_ops(void);

 #ifdef	__cplusplus
 }
--- a/module/icp/io/aes.c
+++ b/module/icp/io/aes.c
@ -206,7 +206,7 @@ aes_mod_init(void)
 {
 	int ret;

-	/* find fastest implementations and set any requested implementations */
+	/* Determine the fastest available implementation. */
 	aes_impl_init();
 	gcm_impl_init();

--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@ -140,6 +140,7 @@
 #include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <zfs_fletcher.h>
+#include <linux/simd.h>

 #define	FLETCHER_MIN_SIMD_SIZE	64

@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector {
 	const char	*fis_name;
 	uint32_t	fis_sel;
 } fletcher_4_impl_selectors[] = {
-#if !defined(_KERNEL)
 	{ "cycle",	IMPL_CYCLE },
-#endif
 	{ "fastest",	IMPL_FASTEST },
 	{ "scalar",	IMPL_SCALAR }
 };

 #if defined(_KERNEL)
 static kstat_t *fletcher_4_kstat;
-#endif

 static struct fletcher_4_kstat {
 	uint64_t native;
 	uint64_t byteswap;
 } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
+#endif

 /* Indicate that benchmark has been completed */
 static boolean_t fletcher_4_initialized = B_FALSE;
@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val)
 	return (err);
 }

+/*
+ * Returns the Fletcher 4 operations for checksums.  When a SIMD
+ * implementation is not allowed in the current context, then fall back
+ * to the fastest generic implementation.
+ */
 static inline const fletcher_4_ops_t *
 fletcher_4_impl_get(void)
 {
-	fletcher_4_ops_t *ops = NULL;
-	const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+	if (!kfpu_allowed())
+		return (&fletcher_4_superscalar4_ops);
+
+	const fletcher_4_ops_t *ops = NULL;
+	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);

 	switch (impl) {
 	case IMPL_FASTEST:
 		ASSERT(fletcher_4_initialized);
 		ops = &fletcher_4_fastest_impl;
 		break;
-#if !defined(_KERNEL)
-	case IMPL_CYCLE: {
+	case IMPL_CYCLE:
+		/* Cycle through supported implementations */
 		ASSERT(fletcher_4_initialized);
 		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
-
 		static uint32_t cycle_count = 0;
 		uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
 		ops = fletcher_4_supp_impls[idx];
-	}
-	break;
-#endif
+		break;
 	default:
 		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
 		ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
-
 		ops = fletcher_4_supp_impls[impl];
 		break;
 	}
@ -659,6 +662,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
 typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
 					zio_cksum_t *);

+#if defined(_KERNEL)
 static void
 fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 {
@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 	/* restore original selection */
 	atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
 }
+#endif /* _KERNEL */

-void
-fletcher_4_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+fletcher_4_benchmark(void)
 {
-	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
 	fletcher_4_ops_t *curr_impl;
-	char *databuf;
 	int i, c;

-	/* move supported impl into fletcher_4_supp_impls */
+	/* Move supported implementations into fletcher_4_supp_impls */
 	for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
 		curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];

@ -735,19 +741,10 @@ fletcher_4_init(void)
 	membar_producer();	/* complete fletcher_4_supp_impls[] init */
 	fletcher_4_supp_impls_cnt = c;	/* number of supported impl */

-#if !defined(_KERNEL)
-	/* Skip benchmarking and use last implementation as fastest */
-	memcpy(&fletcher_4_fastest_impl,
-	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
-	    sizeof (fletcher_4_fastest_impl));
-	fletcher_4_fastest_impl.name = "fastest";
-	membar_producer();
+#if defined(_KERNEL)
+	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
+	char *databuf = vmem_alloc(data_size, KM_SLEEP);

-	fletcher_4_initialized = B_TRUE;
-	return;
-#endif
-	/* Benchmark all supported implementations */
-	databuf = vmem_alloc(data_size, KM_SLEEP);
 	for (i = 0; i < data_size / sizeof (uint64_t); i++)
 		((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */

@ -755,9 +752,28 @@ fletcher_4_init(void)
 	fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);

 	vmem_free(databuf, data_size);
+#else
+	/*
+	 * Skip the benchmark in user space to avoid impacting libzpool
+	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
+	 * is assumed to be the fastest and used by default.
+	 */
+	memcpy(&fletcher_4_fastest_impl,
+	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
+	    sizeof (fletcher_4_fastest_impl));
+	fletcher_4_fastest_impl.name = "fastest";
+	membar_producer();
+#endif /* _KERNEL */
+}
+
+void
+fletcher_4_init(void)
+{
+	/* Determine the fastest available implementation. */
+	fletcher_4_benchmark();

 #if defined(_KERNEL)
-	/* install kstats for all implementations */
+	/* Install kstats for all implementations */
 	fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 	if (fletcher_4_kstat != NULL) {
--- a/module/zcommon/zfs_fletcher_aarch64_neon.c
+++ b/module/zcommon/zfs_fletcher_aarch64_neon.c
@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16)));

 static boolean_t fletcher_4_aarch64_neon_valid(void)
 {
-	return (B_TRUE);
+	return (kfpu_allowed());
 }

 const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
--- a/module/zcommon/zfs_fletcher_avx512.c
+++ b/module/zcommon/zfs_fletcher_avx512.c
@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
 static boolean_t
 fletcher_4_avx512f_valid(void)
 {
-	return (zfs_avx512f_available());
+	return (kfpu_allowed() && zfs_avx512f_available());
 }

 const fletcher_4_ops_t fletcher_4_avx512f_ops = {
--- a/module/zcommon/zfs_fletcher_intel.c
+++ b/module/zcommon/zfs_fletcher_intel.c
@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)

 static boolean_t fletcher_4_avx2_valid(void)
 {
-	return (zfs_avx_available() && zfs_avx2_available());
+	return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
 }

 const fletcher_4_ops_t fletcher_4_avx2_ops = {
--- a/module/zcommon/zfs_fletcher_sse.c
+++ b/module/zcommon/zfs_fletcher_sse.c
@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)

 static boolean_t fletcher_4_sse2_valid(void)
 {
-	return (zfs_sse2_available());
+	return (kfpu_allowed() && zfs_sse2_available());
 }

 const fletcher_4_ops_t fletcher_4_sse2_ops = {
@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)

 static boolean_t fletcher_4_ssse3_valid(void)
 {
-	return (zfs_sse2_available() && zfs_ssse3_available());
+	return (kfpu_allowed() && zfs_sse2_available() &&
+	    zfs_ssse3_available());
 }

 const fletcher_4_ops_t fletcher_4_ssse3_ops = {
--- a/module/zcommon/zfs_prop.c
+++ b/module/zcommon/zfs_prop.c
@ -853,10 +853,23 @@ zfs_prop_align_right(zfs_prop_t prop)
 #endif

 #if defined(_KERNEL)
+
+#include <linux/simd.h>
+
+#if defined(HAVE_KERNEL_FPU_INTERNAL)
+union fpregs_state **zfs_kfpu_fpregs;
+EXPORT_SYMBOL(zfs_kfpu_fpregs);
+#endif /* HAVE_KERNEL_FPU_INTERNAL */
+
 static int __init
 zcommon_init(void)
 {
+	int error = kfpu_init();
+	if (error)
+		return (error);
+
 	fletcher_4_init();
+
 	return (0);
 }

@ -864,6 +877,7 @@ static void __exit
 zcommon_fini(void)
 {
 	fletcher_4_fini();
+	kfpu_fini();
 }

 module_init(zcommon_init);
--- a/module/zfs/vdev_raidz_math.c
+++ b/module/zfs/vdev_raidz_math.c
@ -27,9 +27,9 @@
 #include <sys/zio.h>
 #include <sys/debug.h>
 #include <sys/zfs_debug.h>
-
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_raidz_impl.h>
+#include <linux/simd.h>

 extern boolean_t raidz_will_scalar_work(void);

@ -87,6 +87,7 @@ static uint32_t user_sel_impl = IMPL_FASTEST;
 static size_t raidz_supp_impl_cnt = 0;
 static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];

+#if defined(_KERNEL)
 /*
 * kstats values for supported implementations
 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
@ -95,14 +96,19 @@ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];

 /* kstat for benchmarked implementations */
 static kstat_t *raidz_math_kstat = NULL;
+#endif

 /*
- * Selects the raidz operation for raidz_map
- * If rm_ops is set to NULL original raidz implementation will be used
+ * Returns the RAIDZ operations for raidz_map() parity calculations.  When
+ * a SIMD implementation is not allowed in the current context, then fall back
+ * to the fastest generic implementation.
 */
-raidz_impl_ops_t *
-vdev_raidz_math_get_ops()
+const raidz_impl_ops_t *
+vdev_raidz_math_get_ops(void)
 {
+	if (!kfpu_allowed())
+		return (&vdev_raidz_scalar_impl);
+
 	raidz_impl_ops_t *ops = NULL;
 	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);

@ -111,18 +117,14 @@ vdev_raidz_math_get_ops()
 		ASSERT(raidz_math_initialized);
 		ops = &vdev_raidz_fastest_impl;
 		break;
-#if !defined(_KERNEL)
 	case IMPL_CYCLE:
-	{
+		/* Cycle through all supported implementations */
 		ASSERT(raidz_math_initialized);
 		ASSERT3U(raidz_supp_impl_cnt, >, 0);
-		/* Cycle through all supported implementations */
 		static size_t cycle_impl_idx = 0;
 		size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
 		ops = raidz_supp_impl[idx];
-	}
-	break;
-#endif
+		break;
 	case IMPL_ORIGINAL:
 		ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
 		break;
@ -273,6 +275,8 @@ const char *raidz_rec_name[] = {
 	"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
 };

+#if defined(_KERNEL)
+
 #define	RAIDZ_KSTAT_LINE_LEN	(17 + 10*12 + 1)

 static int
@ -435,21 +439,21 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
 		}
 	}
 }
+#endif

-void
-vdev_raidz_math_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+benchmark_raidz(void)
 {
 	raidz_impl_ops_t *curr_impl;
-	zio_t *bench_zio = NULL;
-	raidz_map_t *bench_rm = NULL;
-	uint64_t bench_parity;
-	int i, c, fn;
+	int i, c;

-	/* move supported impl into raidz_supp_impl */
+	/* Move supported implementations into raidz_supp_impl */
 	for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
 		curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];

-		/* initialize impl */
 		if (curr_impl->init)
 			curr_impl->init();

@ -459,18 +463,10 @@ vdev_raidz_math_init(void)
 	membar_producer();		/* complete raidz_supp_impl[] init */
 	raidz_supp_impl_cnt = c;	/* number of supported impl */

-#if !defined(_KERNEL)
-	/* Skip benchmarking and use last implementation as fastest */
-	memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
-	    sizeof (vdev_raidz_fastest_impl));
-	strcpy(vdev_raidz_fastest_impl.name, "fastest");
-
-	raidz_math_initialized = B_TRUE;
-
-	/* Use 'cycle' math selection method for userspace */
-	VERIFY0(vdev_raidz_impl_set("cycle"));
-	return;
-#endif
+#if defined(_KERNEL)
+	zio_t *bench_zio = NULL;
+	raidz_map_t *bench_rm = NULL;
+	uint64_t bench_parity;

 	/* Fake a zio and run the benchmark on a warmed up buffer */
 	bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
@ -480,7 +476,7 @@ vdev_raidz_math_init(void)
 	memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);

 	/* Benchmark parity generation methods */
-	for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+	for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
 		bench_parity = fn + 1;
 		/* New raidz_map is needed for each generate_p/q/r */
 		bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
@ -495,7 +491,7 @@ vdev_raidz_math_init(void)
 	bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
 	    BENCH_COLS, PARITY_PQR);

-	for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
+	for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
 		benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);

 	vdev_raidz_map_free(bench_rm);
@ -503,11 +499,29 @@ vdev_raidz_math_init(void)
 	/* cleanup the bench zio */
 	abd_free(bench_zio->io_abd);
 	kmem_free(bench_zio, sizeof (zio_t));
+#else
+	/*
+	 * Skip the benchmark in user space to avoid impacting libzpool
+	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
+	 * is assumed to be the fastest and used by default.
+	 */
+	memcpy(&vdev_raidz_fastest_impl,
+	    raidz_supp_impl[raidz_supp_impl_cnt - 1],
+	    sizeof (vdev_raidz_fastest_impl));
+	strcpy(vdev_raidz_fastest_impl.name, "fastest");
+#endif /* _KERNEL */
+}

-	/* install kstats for all impl */
+void
+vdev_raidz_math_init(void)
+{
+	/* Determine the fastest available implementation. */
+	benchmark_raidz();
+
+#if defined(_KERNEL)
+	/* Install kstats for all implementations */
 	raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
-
 	if (raidz_math_kstat != NULL) {
 		raidz_math_kstat->ks_data = NULL;
 		raidz_math_kstat->ks_ndata = UINT32_MAX;
@ -517,6 +531,7 @@ vdev_raidz_math_init(void)
 		    raidz_math_kstat_addr);
 		kstat_install(raidz_math_kstat);
 	}
+#endif

 	/* Finish initialization */
 	atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
@ -527,15 +542,15 @@ void
 vdev_raidz_math_fini(void)
 {
 	raidz_impl_ops_t const *curr_impl;
-	int i;

+#if defined(_KERNEL)
 	if (raidz_math_kstat != NULL) {
 		kstat_delete(raidz_math_kstat);
 		raidz_math_kstat = NULL;
 	}
+#endif

-	/* fini impl */
-	for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+	for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
 		curr_impl = raidz_all_maths[i];
 		if (curr_impl->fini)
 			curr_impl->fini();
@ -546,9 +561,7 @@ static const struct {
 	char *name;
 	uint32_t sel;
 } math_impl_opts[] = {
-#if !defined(_KERNEL)
 		{ "cycle",	IMPL_CYCLE },
-#endif
 		{ "fastest",	IMPL_FASTEST },
 		{ "original",	IMPL_ORIGINAL },
 		{ "scalar",	IMPL_SCALAR }
--- a/module/zfs/vdev_raidz_math_aarch64_neon.c
+++ b/module/zfs/vdev_raidz_math_aarch64_neon.c
@ -207,7 +207,7 @@ DEFINE_REC_METHODS(aarch64_neon);
 static boolean_t
 raidz_will_aarch64_neon_work(void)
 {
-	return (B_TRUE); // __arch64__ requires NEON
+	return (kfpu_allowed());
 }

 const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = {
--- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c
+++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
@ -217,7 +217,7 @@ DEFINE_REC_METHODS(aarch64_neonx2);
 static boolean_t
 raidz_will_aarch64_neonx2_work(void)
 {
-	return (B_TRUE); // __arch64__ requires NEON
+	return (kfpu_allowed());
 }

 const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = {
--- a/module/zfs/vdev_raidz_math_avx2.c
+++ b/module/zfs/vdev_raidz_math_avx2.c
@ -396,7 +396,7 @@ DEFINE_REC_METHODS(avx2);
 static boolean_t
 raidz_will_avx2_work(void)
 {
-	return (zfs_avx_available() && zfs_avx2_available());
+	return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
 }

 const raidz_impl_ops_t vdev_raidz_avx2_impl = {
--- a/module/zfs/vdev_raidz_math_avx512bw.c
+++ b/module/zfs/vdev_raidz_math_avx512bw.c
@ -393,9 +393,8 @@ DEFINE_REC_METHODS(avx512bw);
 static boolean_t
 raidz_will_avx512bw_work(void)
 {
-	return (zfs_avx_available() &&
-	    zfs_avx512f_available() &&
-	    zfs_avx512bw_available());
+	return (kfpu_allowed() && zfs_avx_available() &&
+	    zfs_avx512f_available() && zfs_avx512bw_available());
 }

 const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {
--- a/module/zfs/vdev_raidz_math_avx512f.c
+++ b/module/zfs/vdev_raidz_math_avx512f.c
@ -470,9 +470,8 @@ DEFINE_REC_METHODS(avx512f);
 static boolean_t
 raidz_will_avx512f_work(void)
 {
-	return (zfs_avx_available() &&
-	    zfs_avx2_available() &&
-	    zfs_avx512f_available());
+	return (kfpu_allowed() && zfs_avx_available() &&
+	    zfs_avx2_available() && zfs_avx512f_available());
 }

 const raidz_impl_ops_t vdev_raidz_avx512f_impl = {
--- a/module/zfs/vdev_raidz_math_sse2.c
+++ b/module/zfs/vdev_raidz_math_sse2.c
@ -607,7 +607,7 @@ DEFINE_REC_METHODS(sse2);
 static boolean_t
 raidz_will_sse2_work(void)
 {
-	return (zfs_sse_available() && zfs_sse2_available());
+	return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available());
 }

 const raidz_impl_ops_t vdev_raidz_sse2_impl = {
--- a/module/zfs/vdev_raidz_math_ssse3.c
+++ b/module/zfs/vdev_raidz_math_ssse3.c
@ -399,8 +399,8 @@ DEFINE_REC_METHODS(ssse3);
 static boolean_t
 raidz_will_ssse3_work(void)
 {
-	return (zfs_sse_available() && zfs_sse2_available() &&
-	    zfs_ssse3_available());
+	return (kfpu_allowed() && zfs_sse_available() &&
+	    zfs_sse2_available() && zfs_ssse3_available());
 }

 const raidz_impl_ops_t vdev_raidz_ssse3_impl = {
--- a/module/zfs/zio_crypt.c
+++ b/module/zfs/zio_crypt.c
@ -549,12 +549,12 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
    uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
    uint8_t *mac, zio_crypt_key_t *key)
 {
-	int ret;
 	crypto_mechanism_t mech;
 	uio_t puio, cuio;
 	uint64_t aad[3];
 	iovec_t plain_iovecs[2], cipher_iovecs[3];
 	uint_t enc_len, keydata_len, aad_len;
+	int ret;

 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 	ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);