From f43dbfa75207ffa8be7aa8f969f77f9e5a7a582a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?=
Date: Thu, 8 Aug 2019 15:12:33 +0200
Subject: [PATCH] cherry-pick SIMD compat patches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Fabian Grünbichler
---
 ...-Linux-5.0-compat-SIMD-compatibility.patch | 1615 +++++++++++++++++
 ...x-CONFIG_X86_DEBUG_FPU-build-failure.patch |   44 +
 debian/patches/series                         |    2 +
 3 files changed, 1661 insertions(+)
 create mode 100644 debian/patches/0008-Linux-5.0-compat-SIMD-compatibility.patch
 create mode 100644 debian/patches/0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch

diff --git a/debian/patches/0008-Linux-5.0-compat-SIMD-compatibility.patch b/debian/patches/0008-Linux-5.0-compat-SIMD-compatibility.patch
new file mode 100644
index 0000000..9b25e0c
--- /dev/null
+++ b/debian/patches/0008-Linux-5.0-compat-SIMD-compatibility.patch
@@ -0,0 +1,1615 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf
+Date: Fri, 12 Jul 2019 09:31:20 -0700
+Subject: [PATCH] Linux 5.0 compat: SIMD compatibility
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS,
+and 5.0 and newer kernels. This is accomplished by leveraging
+the fact that by definition dedicated kernel threads never need
+to concern themselves with saving and restoring the user FPU state.
+Therefore, they may use the FPU as long as we can guarantee user
+tasks always restore their FPU state before context switching back
+to user space.
+
+For the 5.0 and 5.1 kernels disabling preemption and local
+interrupts is sufficient to allow the FPU to be used. All non-kernel
+threads will restore the preserved user FPU state.
+
+For 5.2 and later kernels the user FPU state restoration will be
+skipped if the kernel determines the registers have not changed.
+Therefore, for these kernels we need to perform the additional
+step of saving and restoring the FPU registers. Invalidating the
+per-cpu global tracking the FPU state would force a restore, but
+that functionality is private to the core x86 FPU implementation
+and unavailable.
+
+In practice, restricting SIMD to kernel threads is not a major
+restriction for ZFS. The vast majority of SIMD operations are
+already performed by the IO pipeline. The remaining cases are
+relatively infrequent and can be handled by the generic code
+without significant impact. The two most noteworthy cases are:
+
+  1) Decrypting the wrapping key for an encrypted dataset,
+     i.e. `zfs load-key`. All other encryption and decryption
+     operations will use the SIMD optimized implementations.
+
+  2) Generating the payload checksums for a `zfs send` stream.
+
+In order to avoid making any changes to the higher layers of ZFS,
+all of the `*_get_ops()` functions were updated to take into
+consideration the calling context. This allows for the fastest
+implementation to be used as appropriate (see kfpu_allowed()).
+
+The only other notable instance of SIMD operations being used
+outside a kernel thread was at module load time. This code
+was moved into a taskq in order to accommodate the new kernel
+thread restriction.
+
+Finally, a few other modifications were made in order to further
+harden this code and facilitate testing. They include updating
+each implementation's operations structure to be declared as a
+constant, and allowing "cycle" to be set when selecting the
And allowing "cycle" to be set when selecting the +preferred ops in the kernel as well as user space. + +Reviewed-by: Tony Hutter +Signed-off-by: Brian Behlendorf +Closes #8754 +Closes #8793 +Closes #8965 +(cherry picked from commit e5db31349484e5e859c7a942eb15b98d68ce5b4d) +Signed-off-by: Fabian Grünbichler +--- + include/linux/Makefile.am | 1 + + include/linux/simd.h | 41 +++++ + include/linux/simd_aarch64.h | 18 +- + include/linux/simd_x86.h | 192 +++++++++++++------- + include/sys/vdev_raidz.h | 2 +- + include/sys/vdev_raidz_impl.h | 2 +- + module/icp/include/aes/aes_impl.h | 6 +- + module/icp/include/modes/gcm_impl.h | 6 +- + cmd/ztest/ztest.c | 3 + + module/icp/algs/aes/aes_impl.c | 34 ++-- + module/icp/algs/aes/aes_impl_aesni.c | 2 +- + module/icp/algs/modes/gcm.c | 41 +++-- + module/icp/algs/modes/gcm_pclmulqdq.c | 2 +- + module/icp/io/aes.c | 32 +++- + module/spl/spl-taskq.c | 2 + + module/spl/spl-thread.c | 2 + + module/zcommon/zfs_fletcher.c | 88 ++++++--- + module/zcommon/zfs_fletcher_aarch64_neon.c | 2 +- + module/zcommon/zfs_fletcher_avx512.c | 2 +- + module/zcommon/zfs_fletcher_intel.c | 2 +- + module/zcommon/zfs_fletcher_sse.c | 5 +- + module/zfs/vdev_raidz_math.c | 105 +++++++---- + module/zfs/vdev_raidz_math_aarch64_neon.c | 2 +- + module/zfs/vdev_raidz_math_aarch64_neonx2.c | 2 +- + module/zfs/vdev_raidz_math_avx2.c | 2 +- + module/zfs/vdev_raidz_math_avx512bw.c | 5 +- + module/zfs/vdev_raidz_math_avx512f.c | 5 +- + module/zfs/vdev_raidz_math_sse2.c | 2 +- + module/zfs/vdev_raidz_math_ssse3.c | 4 +- + config/kernel-fpu.m4 | 46 ++++- + 30 files changed, 454 insertions(+), 204 deletions(-) + create mode 100644 include/linux/simd.h + +diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am +index efb49520e..2455759e8 100644 +--- a/include/linux/Makefile.am ++++ b/include/linux/Makefile.am +@@ -7,6 +7,7 @@ KERNEL_H = \ + $(top_srcdir)/include/linux/blkdev_compat.h \ + $(top_srcdir)/include/linux/utsname_compat.h \ + $(top_srcdir)/include/linux/kmap_compat.h \ ++ $(top_srcdir)/include/linux/simd.h \ + $(top_srcdir)/include/linux/simd_x86.h \ + $(top_srcdir)/include/linux/simd_aarch64.h \ + $(top_srcdir)/include/linux/mod_compat.h \ +diff --git a/include/linux/simd.h b/include/linux/simd.h +new file mode 100644 +index 000000000..d2b60996a +--- /dev/null ++++ b/include/linux/simd.h +@@ -0,0 +1,41 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (C) 2019 Lawrence Livermore National Security, LLC. 
++ */
++
++#ifndef _SIMD_H
++#define _SIMD_H
++
++#if defined(__x86)
++#include <linux/simd_x86.h>
++
++#elif defined(__aarch64__)
++#include <linux/simd_aarch64.h>
++#else
++
++#define kfpu_allowed() 1
++#define kfpu_initialize(tsk) do {} while (0)
++#define kfpu_begin() do {} while (0)
++#define kfpu_end() do {} while (0)
++
++#endif
++#endif /* _SIMD_H */
+diff --git a/include/linux/simd_aarch64.h b/include/linux/simd_aarch64.h
+index 155ef6205..1cfcd01e4 100644
+--- a/include/linux/simd_aarch64.h
++++ b/include/linux/simd_aarch64.h
+@@ -41,20 +41,18 @@
+
+ #if defined(_KERNEL)
+ #include <asm/neon.h>
+-#define kfpu_begin() \
+-{ \
+- kernel_neon_begin(); \
+-}
+-#define kfpu_end() \
+-{ \
+- kernel_neon_end(); \
+-}
++#define kfpu_allowed() 1
++#define kfpu_initialize(tsk) do {} while (0)
++#define kfpu_begin() kernel_neon_begin()
++#define kfpu_end() kernel_neon_end()
+ #else
+ /*
+ * fpu dummy methods for userspace
+ */
+-#define kfpu_begin() do {} while (0)
+-#define kfpu_end() do {} while (0)
++#define kfpu_allowed() 1
++#define kfpu_initialize(tsk) do {} while (0)
++#define kfpu_begin() do {} while (0)
++#define kfpu_end() do {} while (0)
+ #endif /* defined(_KERNEL) */
+
+ #endif /* __aarch64__ */
+diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
+index 12cd74677..2d7a1c3a5 100644
+--- a/include/linux/simd_x86.h
++++ b/include/linux/simd_x86.h
+@@ -90,33 +90,135 @@
+ #include
+ #endif
+
++/*
++ * The following cases are for kernels which export either the
++ * kernel_fpu_* or __kernel_fpu_* functions.
++ */
++#if defined(KERNEL_EXPORTS_X86_FPU)
++
++#define kfpu_allowed() 1
++#define kfpu_initialize(tsk) do {} while (0)
++
+ #if defined(HAVE_UNDERSCORE_KERNEL_FPU)
+ #define kfpu_begin() \
+-{ \
+- preempt_disable(); \
++{ \
++ preempt_disable(); \
+ __kernel_fpu_begin(); \
+ }
+-#define kfpu_end() \
+-{ \
+- __kernel_fpu_end(); \
+- preempt_enable(); \
++#define kfpu_end() \
++{ \
++ __kernel_fpu_end(); \
++ preempt_enable(); \
+ }
++
+ #elif defined(HAVE_KERNEL_FPU)
+-#define kfpu_begin() kernel_fpu_begin()
++#define kfpu_begin() kernel_fpu_begin()
+ #define kfpu_end() kernel_fpu_end()
++
+ #else
+-/* Kernel doesn't export any kernel_fpu_* functions */
+-#include /* For kernel xgetbv() */
+-#define kfpu_begin() panic("This code should never run")
+-#define kfpu_end() panic("This code should never run")
+-#endif /* defined(HAVE_KERNEL_FPU) */
++/*
++ * This case is unreachable. When KERNEL_EXPORTS_X86_FPU is defined then
++ * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined.
++ */
++#error "Unreachable kernel configuration"
++#endif
++
++#else /* defined(KERNEL_EXPORTS_X86_FPU) */
++/*
++ * When the kernel_fpu_* symbols are unavailable then provide our own
++ * versions which allow the FPU to be safely used in kernel threads.
++ * In practice, this is not a significant restriction for ZFS since the
++ * vast majority of SIMD operations are performed by the IO pipeline.
++ */
+
++/*
++ * Returns non-zero if FPU operations are allowed in the current context.
++ */ ++#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD) ++#define kfpu_allowed() ((current->flags & PF_KTHREAD) && \ ++ test_thread_flag(TIF_NEED_FPU_LOAD)) ++#elif defined(HAVE_KERNEL_FPU_INITIALIZED) ++#define kfpu_allowed() ((current->flags & PF_KTHREAD) && \ ++ current->thread.fpu.initialized) + #else ++#define kfpu_allowed() 0 ++#endif ++ ++static inline void ++kfpu_initialize(void) ++{ ++ WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); ++ ++#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD) ++ __fpu_invalidate_fpregs_state(¤t->thread.fpu); ++ set_thread_flag(TIF_NEED_FPU_LOAD); ++#elif defined(HAVE_KERNEL_FPU_INITIALIZED) ++ __fpu_invalidate_fpregs_state(¤t->thread.fpu); ++ current->thread.fpu.initialized = 1; ++#endif ++} ++ ++static inline void ++kfpu_begin(void) ++{ ++ WARN_ON_ONCE(!kfpu_allowed()); ++ ++ /* ++ * Preemption and interrupts must be disabled for the critical ++ * region where the FPU state is being modified. ++ */ ++ preempt_disable(); ++ local_irq_disable(); ++ ++#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD) ++ /* ++ * The current FPU registers need to be preserved by kfpu_begin() ++ * and restored by kfpu_end(). This is required because we can ++ * not call __cpu_invalidate_fpregs_state() to invalidate the ++ * per-cpu FPU state and force them to be restored during a ++ * context switch. ++ */ ++ copy_fpregs_to_fpstate(¤t->thread.fpu); ++#elif defined(HAVE_KERNEL_FPU_INITIALIZED) ++ /* ++ * There is no need to preserve and restore the FPU registers. ++ * They will always be restored from the task's stored FPU state ++ * when switching contexts. ++ */ ++ WARN_ON_ONCE(current->thread.fpu.initialized == 0); ++#endif ++} ++ ++static inline void ++kfpu_end(void) ++{ ++#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD) ++ union fpregs_state *state = ¤t->thread.fpu.state; ++ int error; ++ ++ if (use_xsave()) { ++ error = copy_kernel_to_xregs_err(&state->xsave, -1); ++ } else if (use_fxsr()) { ++ error = copy_kernel_to_fxregs_err(&state->fxsave); ++ } else { ++ error = copy_kernel_to_fregs_err(&state->fsave); ++ } ++ WARN_ON_ONCE(error); ++#endif ++ ++ local_irq_enable(); ++ preempt_enable(); ++} ++#endif /* defined(HAVE_KERNEL_FPU) */ ++ ++#else /* defined(_KERNEL) */ + /* +- * fpu dummy methods for userspace ++ * FPU dummy methods for user space. 
+ */ +-#define kfpu_begin() do {} while (0) +-#define kfpu_end() do {} while (0) ++#define kfpu_allowed() 1 ++#define kfpu_initialize(tsk) do {} while (0) ++#define kfpu_begin() do {} while (0) ++#define kfpu_end() do {} while (0) + #endif /* defined(_KERNEL) */ + + /* +@@ -298,7 +400,7 @@ __simd_state_enabled(const uint64_t state) + uint64_t xcr0; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_OSXSAVE) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_OSXSAVE) + has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE); + #else + has_osxsave = B_FALSE; +@@ -328,11 +430,7 @@ static inline boolean_t + zfs_sse_available(void) + { + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + return (!!boot_cpu_has(X86_FEATURE_XMM)); +-#else +- return (B_FALSE); +-#endif + #elif !defined(_KERNEL) + return (__cpuid_has_sse()); + #endif +@@ -345,11 +443,7 @@ static inline boolean_t + zfs_sse2_available(void) + { + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + return (!!boot_cpu_has(X86_FEATURE_XMM2)); +-#else +- return (B_FALSE); +-#endif + #elif !defined(_KERNEL) + return (__cpuid_has_sse2()); + #endif +@@ -362,11 +456,7 @@ static inline boolean_t + zfs_sse3_available(void) + { + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + return (!!boot_cpu_has(X86_FEATURE_XMM3)); +-#else +- return (B_FALSE); +-#endif + #elif !defined(_KERNEL) + return (__cpuid_has_sse3()); + #endif +@@ -379,11 +469,7 @@ static inline boolean_t + zfs_ssse3_available(void) + { + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + return (!!boot_cpu_has(X86_FEATURE_SSSE3)); +-#else +- return (B_FALSE); +-#endif + #elif !defined(_KERNEL) + return (__cpuid_has_ssse3()); + #endif +@@ -396,11 +482,7 @@ static inline boolean_t + zfs_sse4_1_available(void) + { + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + return (!!boot_cpu_has(X86_FEATURE_XMM4_1)); +-#else +- return (B_FALSE); +-#endif + #elif !defined(_KERNEL) + return (__cpuid_has_sse4_1()); + #endif +@@ -413,11 +495,7 @@ static inline boolean_t + zfs_sse4_2_available(void) + { + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + return (!!boot_cpu_has(X86_FEATURE_XMM4_2)); +-#else +- return (B_FALSE); +-#endif + #elif !defined(_KERNEL) + return (__cpuid_has_sse4_2()); + #endif +@@ -431,11 +509,7 @@ zfs_avx_available(void) + { + boolean_t has_avx; + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + has_avx = !!boot_cpu_has(X86_FEATURE_AVX); +-#else +- has_avx = B_FALSE; +-#endif + #elif !defined(_KERNEL) + has_avx = __cpuid_has_avx(); + #endif +@@ -451,11 +525,7 @@ zfs_avx2_available(void) + { + boolean_t has_avx2; + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX2) && defined(KERNEL_EXPORTS_X86_FPU) + has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2); +-#else +- has_avx2 = B_FALSE; +-#endif + #elif !defined(_KERNEL) + has_avx2 = __cpuid_has_avx2(); + #endif +@@ -470,7 +540,7 @@ static inline boolean_t + zfs_bmi1_available(void) + { + #if defined(_KERNEL) +-#if defined(X86_FEATURE_BMI1) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_BMI1) + return (!!boot_cpu_has(X86_FEATURE_BMI1)); + #else + return (B_FALSE); +@@ -487,7 +557,7 @@ static inline boolean_t + zfs_bmi2_available(void) + { + #if defined(_KERNEL) +-#if defined(X86_FEATURE_BMI2) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_BMI2) + return (!!boot_cpu_has(X86_FEATURE_BMI2)); + #else + return (B_FALSE); +@@ -504,7 +574,7 @@ static inline boolean_t + zfs_aes_available(void) + { + #if defined(_KERNEL) +-#if 
defined(X86_FEATURE_AES) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AES) + return (!!boot_cpu_has(X86_FEATURE_AES)); + #else + return (B_FALSE); +@@ -521,7 +591,7 @@ static inline boolean_t + zfs_pclmulqdq_available(void) + { + #if defined(_KERNEL) +-#if defined(X86_FEATURE_PCLMULQDQ) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_PCLMULQDQ) + return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ)); + #else + return (B_FALSE); +@@ -555,7 +625,7 @@ zfs_avx512f_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512F) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512F) + has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F); + #else + has_avx512 = B_FALSE; +@@ -574,7 +644,7 @@ zfs_avx512cd_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512CD) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512CD) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512CD); + #else +@@ -594,7 +664,7 @@ zfs_avx512er_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512ER) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512ER) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512ER); + #else +@@ -614,7 +684,7 @@ zfs_avx512pf_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512PF) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512PF) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512PF); + #else +@@ -634,7 +704,7 @@ zfs_avx512bw_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512BW) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512BW) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW); + #else +@@ -654,7 +724,7 @@ zfs_avx512dq_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512DQ) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512DQ) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512DQ); + #else +@@ -674,7 +744,7 @@ zfs_avx512vl_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512VL) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512VL) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL); + #else +@@ -694,7 +764,7 @@ zfs_avx512ifma_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512IFMA) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512IFMA) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512IFMA); + #else +@@ -714,7 +784,7 @@ zfs_avx512vbmi_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512VBMI) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512VBMI) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VBMI); + #else +diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h +index 2ce32469d..0ce2b5ea1 100644 +--- a/include/sys/vdev_raidz.h ++++ b/include/sys/vdev_raidz.h +@@ -51,7 +51,7 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); + */ + void 
vdev_raidz_math_init(void); + void vdev_raidz_math_fini(void); +-struct raidz_impl_ops *vdev_raidz_math_get_ops(void); ++const struct raidz_impl_ops *vdev_raidz_math_get_ops(void); + int vdev_raidz_math_generate(struct raidz_map *); + int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *, + const int); +diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h +index 0799ed19d..4969d110b 100644 +--- a/include/sys/vdev_raidz_impl.h ++++ b/include/sys/vdev_raidz_impl.h +@@ -126,7 +126,7 @@ typedef struct raidz_map { + uintptr_t rm_reports; /* # of referencing checksum reports */ + uint8_t rm_freed; /* map no longer has referencing ZIO */ + uint8_t rm_ecksuminjected; /* checksum error was injected */ +- raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ ++ const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ + } raidz_map_t; + +diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h +index 95cfddf9e..9fd9c1bd1 100644 +--- a/module/icp/include/aes/aes_impl.h ++++ b/module/icp/include/aes/aes_impl.h +@@ -198,12 +198,12 @@ extern const aes_impl_ops_t aes_aesni_impl; + /* + * Initializes fastest implementation + */ +-void aes_impl_init(void); ++void aes_impl_init(void *arg); + + /* +- * Get selected aes implementation ++ * Returns optimal allowed AES implementation + */ +-struct aes_impl_ops *aes_impl_get_ops(void); ++const struct aes_impl_ops *aes_impl_get_ops(void); + + #ifdef __cplusplus + } +diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h +index cbb904c05..138090487 100644 +--- a/module/icp/include/modes/gcm_impl.h ++++ b/module/icp/include/modes/gcm_impl.h +@@ -61,12 +61,12 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl; + /* + * Initializes fastest implementation + */ +-void gcm_impl_init(void); ++void gcm_impl_init(void *arg); + + /* +- * Get selected aes implementation ++ * Returns optimal allowed GCM implementation + */ +-struct gcm_impl_ops *gcm_impl_get_ops(void); ++const struct gcm_impl_ops *gcm_impl_get_ops(void); + + #ifdef __cplusplus + } +diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c +index 9c2cf9501..815462443 100644 +--- a/cmd/ztest/ztest.c ++++ b/cmd/ztest/ztest.c +@@ -107,6 +107,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -7094,6 +7095,8 @@ ztest_run(ztest_shared_t *zs) + metaslab_preload_limit = ztest_random(20) + 1; + ztest_spa = spa; + ++ VERIFY0(vdev_raidz_impl_set("cycle")); ++ + dmu_objset_stats_t dds; + VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, + DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); +diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c +index e15050635..457b9e45c 100644 +--- a/module/icp/algs/aes/aes_impl.c ++++ b/module/icp/algs/aes/aes_impl.c +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + + /* + * Initialize AES encryption and decryption key schedules. 
+@@ -40,9 +41,9 @@ + void + aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched) + { +- aes_impl_ops_t *ops = aes_impl_get_ops(); +- aes_key_t *newbie = keysched; +- uint_t keysize, i, j; ++ const aes_impl_ops_t *ops = aes_impl_get_ops(); ++ aes_key_t *newbie = keysched; ++ uint_t keysize, i, j; + union { + uint64_t ka64[4]; + uint32_t ka32[8]; +@@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0; + static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)]; + + /* +- * Selects the aes operations for encrypt/decrypt/key setup ++ * Returns the AES operations for encrypt/decrypt/key setup. When a ++ * SIMD implementation is not allowed in the current context, then ++ * fallback to the fastest generic implementation. + */ +-aes_impl_ops_t * +-aes_impl_get_ops() ++const aes_impl_ops_t * ++aes_impl_get_ops(void) + { +- aes_impl_ops_t *ops = NULL; ++ if (!kfpu_allowed()) ++ return (&aes_generic_impl); ++ ++ const aes_impl_ops_t *ops = NULL; + const uint32_t impl = AES_IMPL_READ(icp_aes_impl); + + switch (impl) { +@@ -266,15 +272,13 @@ aes_impl_get_ops() + ops = &aes_fastest_impl; + break; + case IMPL_CYCLE: +- { ++ /* Cycle through supported implementations */ + ASSERT(aes_impl_initialized); + ASSERT3U(aes_supp_impl_cnt, >, 0); +- /* Cycle through supported implementations */ + static size_t cycle_impl_idx = 0; + size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt; + ops = aes_supp_impl[idx]; +- } +- break; ++ break; + default: + ASSERT3U(impl, <, aes_supp_impl_cnt); + ASSERT3U(aes_supp_impl_cnt, >, 0); +@@ -288,13 +292,17 @@ aes_impl_get_ops() + return (ops); + } + ++/* ++ * Initialize all supported implementations. ++ */ ++/* ARGSUSED */ + void +-aes_impl_init(void) ++aes_impl_init(void *arg) + { + aes_impl_ops_t *curr_impl; + int i, c; + +- /* move supported impl into aes_supp_impls */ ++ /* Move supported implementations into aes_supp_impls */ + for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) { + curr_impl = (aes_impl_ops_t *)aes_all_impl[i]; + +diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c +index 97f7c3a47..222c176aa 100644 +--- a/module/icp/algs/aes/aes_impl_aesni.c ++++ b/module/icp/algs/aes/aes_impl_aesni.c +@@ -108,7 +108,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4], + static boolean_t + aes_aesni_will_work(void) + { +- return (zfs_aes_available()); ++ return (kfpu_allowed() && zfs_aes_available()); + } + + const aes_impl_ops_t aes_aesni_impl = { +diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c +index 13bceef0f..f6f8434de 100644 +--- a/module/icp/algs/modes/gcm.c ++++ b/module/icp/algs/modes/gcm.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #define GHASH(c, d, t, o) \ + xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ +@@ -46,7 +47,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { +- gcm_impl_ops_t *gops; ++ const gcm_impl_ops_t *gops; + size_t remainder = length; + size_t need = 0; + uint8_t *datap = (uint8_t *)data; +@@ -168,7 +169,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { +- gcm_impl_ops_t *gops; ++ const gcm_impl_ops_t *gops; + uint64_t counter_mask = ntohll(0x00000000ffffffffULL); + uint8_t *ghash, *macp = NULL; + int i, rv; +@@ -320,7 +321,7 @@ gcm_decrypt_final(gcm_ctx_t 
*ctx, crypto_data_t *out, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { +- gcm_impl_ops_t *gops; ++ const gcm_impl_ops_t *gops; + size_t pt_len; + size_t remainder; + uint8_t *ghash; +@@ -427,7 +428,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len, + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { +- gcm_impl_ops_t *gops; ++ const gcm_impl_ops_t *gops; + uint8_t *cb; + ulong_t remainder = iv_len; + ulong_t processed = 0; +@@ -481,7 +482,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { +- gcm_impl_ops_t *gops; ++ const gcm_impl_ops_t *gops; + uint8_t *ghash, *datap, *authp; + size_t remainder, processed; + +@@ -660,12 +661,17 @@ static size_t gcm_supp_impl_cnt = 0; + static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)]; + + /* +- * Selects the gcm operation ++ * Returns the GCM operations for encrypt/decrypt/key setup. When a ++ * SIMD implementation is not allowed in the current context, then ++ * fallback to the fastest generic implementation. + */ +-gcm_impl_ops_t * ++const gcm_impl_ops_t * + gcm_impl_get_ops() + { +- gcm_impl_ops_t *ops = NULL; ++ if (!kfpu_allowed()) ++ return (&gcm_generic_impl); ++ ++ const gcm_impl_ops_t *ops = NULL; + const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); + + switch (impl) { +@@ -674,15 +680,13 @@ gcm_impl_get_ops() + ops = &gcm_fastest_impl; + break; + case IMPL_CYCLE: +- { ++ /* Cycle through supported implementations */ + ASSERT(gcm_impl_initialized); + ASSERT3U(gcm_supp_impl_cnt, >, 0); +- /* Cycle through supported implementations */ + static size_t cycle_impl_idx = 0; + size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt; + ops = gcm_supp_impl[idx]; +- } +- break; ++ break; + default: + ASSERT3U(impl, <, gcm_supp_impl_cnt); + ASSERT3U(gcm_supp_impl_cnt, >, 0); +@@ -696,13 +700,17 @@ gcm_impl_get_ops() + return (ops); + } + ++/* ++ * Initialize all supported implementations. ++ */ ++/* ARGSUSED */ + void +-gcm_impl_init(void) ++gcm_impl_init(void *arg) + { + gcm_impl_ops_t *curr_impl; + int i, c; + +- /* move supported impl into aes_supp_impls */ ++ /* Move supported implementations into gcm_supp_impls */ + for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) { + curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i]; + +@@ -711,7 +719,10 @@ gcm_impl_init(void) + } + gcm_supp_impl_cnt = c; + +- /* set fastest implementation. assume hardware accelerated is fastest */ ++ /* ++ * Set the fastest implementation given the assumption that the ++ * hardware accelerated version is the fastest. 
++ */ + #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) + if (gcm_pclmulqdq_impl.is_supported()) + memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl, +diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c +index be00ba37b..8a43ba33a 100644 +--- a/module/icp/algs/modes/gcm_pclmulqdq.c ++++ b/module/icp/algs/modes/gcm_pclmulqdq.c +@@ -52,7 +52,7 @@ gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res) + static boolean_t + gcm_pclmulqdq_will_work(void) + { +- return (zfs_pclmulqdq_available()); ++ return (kfpu_allowed() && zfs_pclmulqdq_available()); + } + + const gcm_impl_ops_t gcm_pclmulqdq_impl = { +diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c +index 53b193693..51538bc60 100644 +--- a/module/icp/io/aes.c ++++ b/module/icp/io/aes.c +@@ -206,9 +206,35 @@ aes_mod_init(void) + { + int ret; + +- /* find fastest implementations and set any requested implementations */ +- aes_impl_init(); +- gcm_impl_init(); ++#if defined(_KERNEL) ++ /* ++ * Determine the fastest available implementation. The benchmarks ++ * are run in dedicated kernel threads to allow Linux 5.0+ kernels ++ * to use SIMD operations. If for some reason this isn't possible, ++ * fallback to the generic implementations. See the comment in ++ * include/linux/simd_x86.h for additional details. Additionally, ++ * this has the benefit of allowing them to be run in parallel. ++ */ ++ taskqid_t aes_id = taskq_dispatch(system_taskq, aes_impl_init, ++ NULL, TQ_SLEEP); ++ taskqid_t gcm_id = taskq_dispatch(system_taskq, gcm_impl_init, ++ NULL, TQ_SLEEP); ++ ++ if (aes_id != TASKQID_INVALID) { ++ taskq_wait_id(system_taskq, aes_id); ++ } else { ++ aes_impl_init(NULL); ++ } ++ ++ if (gcm_id != TASKQID_INVALID) { ++ taskq_wait_id(system_taskq, gcm_id); ++ } else { ++ gcm_impl_init(NULL); ++ } ++#else ++ aes_impl_init(NULL); ++ gcm_impl_init(NULL); ++#endif + + if ((ret = mod_install(&modlinkage)) != 0) + return (ret); +diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c +index 7684257be..de0e45190 100644 +--- a/module/spl/spl-taskq.c ++++ b/module/spl/spl-taskq.c +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + + int spl_taskq_thread_bind = 0; + module_param(spl_taskq_thread_bind, int, 0644); +@@ -869,6 +870,7 @@ taskq_thread(void *args) + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); ++ kfpu_initialize(); + + tsd_set(taskq_tsd, tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); +diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c +index d441ad65f..c4977bcf2 100644 +--- a/module/spl/spl-thread.c ++++ b/module/spl/spl-thread.c +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + + /* + * Thread interfaces +@@ -54,6 +55,7 @@ thread_generic_wrapper(void *arg) + args = tp->tp_args; + set_current_state(tp->tp_state); + set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri)); ++ kfpu_initialize(); + kmem_free(tp->tp_name, tp->tp_name_size); + kmem_free(tp, sizeof (thread_priv_t)); + +diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c +index 5a991ba60..b75d8ab00 100644 +--- a/module/zcommon/zfs_fletcher.c ++++ b/module/zcommon/zfs_fletcher.c +@@ -140,6 +140,7 @@ + #include + #include + #include ++#include + + #define FLETCHER_MIN_SIMD_SIZE 64 + +@@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector { + const char *fis_name; + uint32_t fis_sel; + } fletcher_4_impl_selectors[] = { +-#if !defined(_KERNEL) + { "cycle", IMPL_CYCLE }, +-#endif + { "fastest", 
IMPL_FASTEST }, + { "scalar", IMPL_SCALAR } + }; + + #if defined(_KERNEL) + static kstat_t *fletcher_4_kstat; +-#endif + + static struct fletcher_4_kstat { + uint64_t native; + uint64_t byteswap; + } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1]; ++#endif + + /* Indicate that benchmark has been completed */ + static boolean_t fletcher_4_initialized = B_FALSE; +@@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val) + return (err); + } + ++/* ++ * Returns the Fletcher 4 operations for checksums. When a SIMD ++ * implementation is not allowed in the current context, then fallback ++ * to the fastest generic implementation. ++ */ + static inline const fletcher_4_ops_t * + fletcher_4_impl_get(void) + { +- fletcher_4_ops_t *ops = NULL; +- const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); ++ if (!kfpu_allowed()) ++ return (&fletcher_4_superscalar4_ops); ++ ++ const fletcher_4_ops_t *ops = NULL; ++ uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); + + switch (impl) { + case IMPL_FASTEST: + ASSERT(fletcher_4_initialized); + ops = &fletcher_4_fastest_impl; + break; +-#if !defined(_KERNEL) +- case IMPL_CYCLE: { ++ case IMPL_CYCLE: ++ /* Cycle through supported implementations */ + ASSERT(fletcher_4_initialized); + ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); +- + static uint32_t cycle_count = 0; + uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt; + ops = fletcher_4_supp_impls[idx]; +- } +- break; +-#endif ++ break; + default: + ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); + ASSERT3U(impl, <, fletcher_4_supp_impls_cnt); +- + ops = fletcher_4_supp_impls[impl]; + break; + } +@@ -658,6 +661,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) + typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, + zio_cksum_t *); + ++#if defined(_KERNEL) + static void + fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) + { +@@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) + /* restore original selection */ + atomic_swap_32(&fletcher_4_impl_chosen, sel_save); + } ++#endif /* _KERNEL */ + +-void +-fletcher_4_init(void) ++/* ++ * Initialize and benchmark all supported implementations. 
++ */ ++static void ++fletcher_4_benchmark(void *arg) + { +- static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ + fletcher_4_ops_t *curr_impl; +- char *databuf; + int i, c; + +- /* move supported impl into fletcher_4_supp_impls */ ++ /* Move supported implementations into fletcher_4_supp_impls */ + for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) { + curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i]; + +@@ -735,19 +741,10 @@ fletcher_4_init(void) + membar_producer(); /* complete fletcher_4_supp_impls[] init */ + fletcher_4_supp_impls_cnt = c; /* number of supported impl */ + +-#if !defined(_KERNEL) +- /* Skip benchmarking and use last implementation as fastest */ +- memcpy(&fletcher_4_fastest_impl, +- fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1], +- sizeof (fletcher_4_fastest_impl)); +- fletcher_4_fastest_impl.name = "fastest"; +- membar_producer(); ++#if defined(_KERNEL) ++ static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ ++ char *databuf = vmem_alloc(data_size, KM_SLEEP); + +- fletcher_4_initialized = B_TRUE; +- return; +-#endif +- /* Benchmark all supported implementations */ +- databuf = vmem_alloc(data_size, KM_SLEEP); + for (i = 0; i < data_size / sizeof (uint64_t); i++) + ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */ + +@@ -755,9 +752,38 @@ fletcher_4_init(void) + fletcher_4_benchmark_impl(B_TRUE, databuf, data_size); + + vmem_free(databuf, data_size); ++#else ++ /* ++ * Skip the benchmark in user space to avoid impacting libzpool ++ * consumers (zdb, zhack, zinject, ztest). The last implementation ++ * is assumed to be the fastest and used by default. ++ */ ++ memcpy(&fletcher_4_fastest_impl, ++ fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1], ++ sizeof (fletcher_4_fastest_impl)); ++ fletcher_4_fastest_impl.name = "fastest"; ++ membar_producer(); ++#endif /* _KERNEL */ ++} + ++void ++fletcher_4_init(void) ++{ + #if defined(_KERNEL) +- /* install kstats for all implementations */ ++ /* ++ * For 5.0 and latter Linux kernels the fletcher 4 benchmarks are ++ * run in a kernel threads. This is needed to take advantage of the ++ * SIMD functionality, see include/linux/simd_x86.h for details. 
++ */ ++ taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark, ++ NULL, TQ_SLEEP); ++ if (id != TASKQID_INVALID) { ++ taskq_wait_id(system_taskq, id); ++ } else { ++ fletcher_4_benchmark(NULL); ++ } ++ ++ /* Install kstats for all implementations */ + fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + if (fletcher_4_kstat != NULL) { +@@ -769,6 +795,8 @@ fletcher_4_init(void) + fletcher_4_kstat_addr); + kstat_install(fletcher_4_kstat); + } ++#else ++ fletcher_4_benchmark(NULL); + #endif + + /* Finish initialization */ +diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c +index bd2db2b20..3b3c1b52b 100644 +--- a/module/zcommon/zfs_fletcher_aarch64_neon.c ++++ b/module/zcommon/zfs_fletcher_aarch64_neon.c +@@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16))); + + static boolean_t fletcher_4_aarch64_neon_valid(void) + { +- return (B_TRUE); ++ return (kfpu_allowed()); + } + + const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = { +diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c +index 7260a9864..0d4cff21a 100644 +--- a/module/zcommon/zfs_fletcher_avx512.c ++++ b/module/zcommon/zfs_fletcher_avx512.c +@@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap); + static boolean_t + fletcher_4_avx512f_valid(void) + { +- return (zfs_avx512f_available()); ++ return (kfpu_allowed() && zfs_avx512f_available()); + } + + const fletcher_4_ops_t fletcher_4_avx512f_ops = { +diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c +index 6dac047da..7f12efe6d 100644 +--- a/module/zcommon/zfs_fletcher_intel.c ++++ b/module/zcommon/zfs_fletcher_intel.c +@@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) + + static boolean_t fletcher_4_avx2_valid(void) + { +- return (zfs_avx_available() && zfs_avx2_available()); ++ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available()); + } + + const fletcher_4_ops_t fletcher_4_avx2_ops = { +diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c +index a0b42e5f5..e6389d6e5 100644 +--- a/module/zcommon/zfs_fletcher_sse.c ++++ b/module/zcommon/zfs_fletcher_sse.c +@@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) + + static boolean_t fletcher_4_sse2_valid(void) + { +- return (zfs_sse2_available()); ++ return (kfpu_allowed() && zfs_sse2_available()); + } + + const fletcher_4_ops_t fletcher_4_sse2_ops = { +@@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) + + static boolean_t fletcher_4_ssse3_valid(void) + { +- return (zfs_sse2_available() && zfs_ssse3_available()); ++ return (kfpu_allowed() && zfs_sse2_available() && ++ zfs_ssse3_available()); + } + + const fletcher_4_ops_t fletcher_4_ssse3_ops = { +diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c +index e6112bc02..e7a39015c 100644 +--- a/module/zfs/vdev_raidz_math.c ++++ b/module/zfs/vdev_raidz_math.c +@@ -27,9 +27,9 @@ + #include + #include + #include +- + #include + #include ++#include + + extern boolean_t raidz_will_scalar_work(void); + +@@ -87,6 +87,7 @@ static uint32_t user_sel_impl = IMPL_FASTEST; + static size_t raidz_supp_impl_cnt = 0; + static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)]; + ++#if defined(_KERNEL) + /* + * kstats values for supported implementations + * 
Values represent per disk throughput of 8 disk+parity raidz vdev [B/s] +@@ -95,14 +96,19 @@ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1]; + + /* kstat for benchmarked implementations */ + static kstat_t *raidz_math_kstat = NULL; ++#endif + + /* +- * Selects the raidz operation for raidz_map +- * If rm_ops is set to NULL original raidz implementation will be used ++ * Returns the RAIDZ operations for raidz_map() parity calculations. When ++ * a SIMD implementation is not allowed in the current context, then fallback ++ * to the fastest generic implementation. + */ +-raidz_impl_ops_t * +-vdev_raidz_math_get_ops() ++const raidz_impl_ops_t * ++vdev_raidz_math_get_ops(void) + { ++ if (!kfpu_allowed()) ++ return (&vdev_raidz_scalar_impl); ++ + raidz_impl_ops_t *ops = NULL; + const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); + +@@ -111,18 +117,14 @@ vdev_raidz_math_get_ops() + ASSERT(raidz_math_initialized); + ops = &vdev_raidz_fastest_impl; + break; +-#if !defined(_KERNEL) + case IMPL_CYCLE: +- { ++ /* Cycle through all supported implementations */ + ASSERT(raidz_math_initialized); + ASSERT3U(raidz_supp_impl_cnt, >, 0); +- /* Cycle through all supported implementations */ + static size_t cycle_impl_idx = 0; + size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; + ops = raidz_supp_impl[idx]; +- } +- break; +-#endif ++ break; + case IMPL_ORIGINAL: + ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; + break; +@@ -273,6 +275,8 @@ const char *raidz_rec_name[] = { + "rec_pq", "rec_pr", "rec_qr", "rec_pqr" + }; + ++#if defined(_KERNEL) ++ + #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1) + + static int +@@ -435,21 +439,21 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) + } + } + } ++#endif + +-void +-vdev_raidz_math_init(void) ++/* ++ * Initialize and benchmark all supported implementations. 
++ */ ++static void ++benchmark_raidz(void *arg) + { + raidz_impl_ops_t *curr_impl; +- zio_t *bench_zio = NULL; +- raidz_map_t *bench_rm = NULL; +- uint64_t bench_parity; +- int i, c, fn; ++ int i, c; + +- /* move supported impl into raidz_supp_impl */ ++ /* Move supported impl into raidz_supp_impl */ + for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; + +- /* initialize impl */ + if (curr_impl->init) + curr_impl->init(); + +@@ -459,18 +463,10 @@ vdev_raidz_math_init(void) + membar_producer(); /* complete raidz_supp_impl[] init */ + raidz_supp_impl_cnt = c; /* number of supported impl */ + +-#if !defined(_KERNEL) +- /* Skip benchmarking and use last implementation as fastest */ +- memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1], +- sizeof (vdev_raidz_fastest_impl)); +- strcpy(vdev_raidz_fastest_impl.name, "fastest"); +- +- raidz_math_initialized = B_TRUE; +- +- /* Use 'cycle' math selection method for userspace */ +- VERIFY0(vdev_raidz_impl_set("cycle")); +- return; +-#endif ++#if defined(_KERNEL) ++ zio_t *bench_zio = NULL; ++ raidz_map_t *bench_rm = NULL; ++ uint64_t bench_parity; + + /* Fake an zio and run the benchmark on a warmed up buffer */ + bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); +@@ -480,7 +476,7 @@ vdev_raidz_math_init(void) + memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); + + /* Benchmark parity generation methods */ +- for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { ++ for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + bench_parity = fn + 1; + /* New raidz_map is needed for each generate_p/q/r */ + bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, +@@ -495,7 +491,7 @@ vdev_raidz_math_init(void) + bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, + BENCH_COLS, PARITY_PQR); + +- for (fn = 0; fn < RAIDZ_REC_NUM; fn++) ++ for (int fn = 0; fn < RAIDZ_REC_NUM; fn++) + benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl); + + vdev_raidz_map_free(bench_rm); +@@ -503,11 +499,39 @@ vdev_raidz_math_init(void) + /* cleanup the bench zio */ + abd_free(bench_zio->io_abd); + kmem_free(bench_zio, sizeof (zio_t)); ++#else ++ /* ++ * Skip the benchmark in user space to avoid impacting libzpool ++ * consumers (zdb, zhack, zinject, ztest). The last implementation ++ * is assumed to be the fastest and used by default. ++ */ ++ memcpy(&vdev_raidz_fastest_impl, ++ raidz_supp_impl[raidz_supp_impl_cnt - 1], ++ sizeof (vdev_raidz_fastest_impl)); ++ strcpy(vdev_raidz_fastest_impl.name, "fastest"); ++#endif /* _KERNEL */ ++} + +- /* install kstats for all impl */ ++void ++vdev_raidz_math_init(void) ++{ ++#if defined(_KERNEL) ++ /* ++ * For 5.0 and latter Linux kernels the fletcher 4 benchmarks are ++ * run in a kernel threads. This is needed to take advantage of the ++ * SIMD functionality, see include/linux/simd_x86.h for details. 
++ */ ++ taskqid_t id = taskq_dispatch(system_taskq, benchmark_raidz, ++ NULL, TQ_SLEEP); ++ if (id != TASKQID_INVALID) { ++ taskq_wait_id(system_taskq, id); ++ } else { ++ benchmark_raidz(NULL); ++ } ++ ++ /* Install kstats for all implementations */ + raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); +- + if (raidz_math_kstat != NULL) { + raidz_math_kstat->ks_data = NULL; + raidz_math_kstat->ks_ndata = UINT32_MAX; +@@ -517,6 +541,9 @@ vdev_raidz_math_init(void) + raidz_math_kstat_addr); + kstat_install(raidz_math_kstat); + } ++#else ++ benchmark_raidz(NULL); ++#endif + + /* Finish initialization */ + atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl); +@@ -527,15 +554,15 @@ void + vdev_raidz_math_fini(void) + { + raidz_impl_ops_t const *curr_impl; +- int i; + ++#if defined(_KERNEL) + if (raidz_math_kstat != NULL) { + kstat_delete(raidz_math_kstat); + raidz_math_kstat = NULL; + } ++#endif + +- /* fini impl */ +- for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { ++ for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + curr_impl = raidz_all_maths[i]; + if (curr_impl->fini) + curr_impl->fini(); +@@ -546,9 +573,7 @@ static const struct { + char *name; + uint32_t sel; + } math_impl_opts[] = { +-#if !defined(_KERNEL) + { "cycle", IMPL_CYCLE }, +-#endif + { "fastest", IMPL_FASTEST }, + { "original", IMPL_ORIGINAL }, + { "scalar", IMPL_SCALAR } +diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c +index e3ad06776..0a67ceb84 100644 +--- a/module/zfs/vdev_raidz_math_aarch64_neon.c ++++ b/module/zfs/vdev_raidz_math_aarch64_neon.c +@@ -207,7 +207,7 @@ DEFINE_REC_METHODS(aarch64_neon); + static boolean_t + raidz_will_aarch64_neon_work(void) + { +- return (B_TRUE); // __arch64__ requires NEON ++ return (kfpu_allowed()); + } + + const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = { +diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c +index f8688a06a..e072f51cd 100644 +--- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c ++++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c +@@ -217,7 +217,7 @@ DEFINE_REC_METHODS(aarch64_neonx2); + static boolean_t + raidz_will_aarch64_neonx2_work(void) + { +- return (B_TRUE); // __arch64__ requires NEON ++ return (kfpu_allowed()); + } + + const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = { +diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c +index 063d29bcd..a12eb6720 100644 +--- a/module/zfs/vdev_raidz_math_avx2.c ++++ b/module/zfs/vdev_raidz_math_avx2.c +@@ -396,7 +396,7 @@ DEFINE_REC_METHODS(avx2); + static boolean_t + raidz_will_avx2_work(void) + { +- return (zfs_avx_available() && zfs_avx2_available()); ++ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available()); + } + + const raidz_impl_ops_t vdev_raidz_avx2_impl = { +diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c +index d605653db..2f545c9ec 100644 +--- a/module/zfs/vdev_raidz_math_avx512bw.c ++++ b/module/zfs/vdev_raidz_math_avx512bw.c +@@ -393,9 +393,8 @@ DEFINE_REC_METHODS(avx512bw); + static boolean_t + raidz_will_avx512bw_work(void) + { +- return (zfs_avx_available() && +- zfs_avx512f_available() && +- zfs_avx512bw_available()); ++ return (kfpu_allowed() && zfs_avx_available() && ++ zfs_avx512f_available() && zfs_avx512bw_available()); + } + + const raidz_impl_ops_t vdev_raidz_avx512bw_impl = { +diff --git a/module/zfs/vdev_raidz_math_avx512f.c 
b/module/zfs/vdev_raidz_math_avx512f.c +index f4e4560ce..75af7a8ee 100644 +--- a/module/zfs/vdev_raidz_math_avx512f.c ++++ b/module/zfs/vdev_raidz_math_avx512f.c +@@ -470,9 +470,8 @@ DEFINE_REC_METHODS(avx512f); + static boolean_t + raidz_will_avx512f_work(void) + { +- return (zfs_avx_available() && +- zfs_avx2_available() && +- zfs_avx512f_available()); ++ return (kfpu_allowed() && zfs_avx_available() && ++ zfs_avx2_available() && zfs_avx512f_available()); + } + + const raidz_impl_ops_t vdev_raidz_avx512f_impl = { +diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c +index 9985da273..5b3a9385c 100644 +--- a/module/zfs/vdev_raidz_math_sse2.c ++++ b/module/zfs/vdev_raidz_math_sse2.c +@@ -607,7 +607,7 @@ DEFINE_REC_METHODS(sse2); + static boolean_t + raidz_will_sse2_work(void) + { +- return (zfs_sse_available() && zfs_sse2_available()); ++ return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available()); + } + + const raidz_impl_ops_t vdev_raidz_sse2_impl = { +diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c +index 047a48d54..62247cf8e 100644 +--- a/module/zfs/vdev_raidz_math_ssse3.c ++++ b/module/zfs/vdev_raidz_math_ssse3.c +@@ -399,8 +399,8 @@ DEFINE_REC_METHODS(ssse3); + static boolean_t + raidz_will_ssse3_work(void) + { +- return (zfs_sse_available() && zfs_sse2_available() && +- zfs_ssse3_available()); ++ return (kfpu_allowed() && zfs_sse_available() && ++ zfs_sse2_available() && zfs_ssse3_available()); + } + + const raidz_impl_ops_t vdev_raidz_ssse3_impl = { +diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4 +index 5fff79a74..31bf35f83 100644 +--- a/config/kernel-fpu.m4 ++++ b/config/kernel-fpu.m4 +@@ -2,8 +2,15 @@ dnl # + dnl # Handle differences in kernel FPU code. + dnl # + dnl # Kernel +-dnl # 5.0: All kernel fpu functions are GPL only, so we can't use them. +-dnl # (nothing defined) ++dnl # 5.2: The fpu->initialized flag was replaced by TIF_NEED_FPU_LOAD. ++dnl # HAVE_KERNEL_TIF_NEED_FPU_LOAD ++dnl # ++dnl # 5.0: As an optimization SIMD operations performed by kernel ++dnl # threads can skip saving and restoring their FPU context. ++dnl # Wrappers have been introduced to determine the running ++dnl # context and use either the SIMD or generic implementation. ++dnl # This change was made to the 4.19.38 and 4.14.120 LTS kernels. 
++dnl # HAVE_KERNEL_FPU_INITIALIZED + dnl # + dnl # 4.2: Use __kernel_fpu_{begin,end}() + dnl # HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU +@@ -56,10 +63,39 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [ + __kernel_fpu_end(); + ], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [ + AC_MSG_RESULT(__kernel_fpu_*) +- AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, [kernel has __kernel_fpu_* functions]) +- AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions]) ++ AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, ++ [kernel has __kernel_fpu_* functions]) ++ AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, ++ [kernel exports FPU functions]) + ],[ +- AC_MSG_RESULT(not exported) ++ ZFS_LINUX_TRY_COMPILE([ ++ #include ++ #include ++ ],[ ++ struct fpu *fpu = ¤t->thread.fpu; ++ if (fpu->initialized) { return (0); }; ++ ],[ ++ AC_MSG_RESULT(fpu.initialized) ++ AC_DEFINE(HAVE_KERNEL_FPU_INITIALIZED, 1, ++ [kernel fpu.initialized exists]) ++ ],[ ++ ZFS_LINUX_TRY_COMPILE([ ++ #include ++ #include ++ ++ #if !defined(TIF_NEED_FPU_LOAD) ++ #error "TIF_NEED_FPU_LOAD undefined" ++ #endif ++ ],[ ++ ],[ ++ AC_MSG_RESULT(TIF_NEED_FPU_LOAD) ++ AC_DEFINE( ++ HAVE_KERNEL_TIF_NEED_FPU_LOAD, 1, ++ [kernel TIF_NEED_FPU_LOAD exists]) ++ ],[ ++ AC_MSG_RESULT(unavailable) ++ ]) ++ ]) + ]) + ]) + ]) diff --git a/debian/patches/0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch b/debian/patches/0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch new file mode 100644 index 0000000..c8c8267 --- /dev/null +++ b/debian/patches/0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch @@ -0,0 +1,44 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Brian Behlendorf +Date: Wed, 17 Jul 2019 09:14:36 -0700 +Subject: [PATCH] Fix CONFIG_X86_DEBUG_FPU build failure +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When CONFIG_X86_DEBUG_FPU is defined the alternatives_patched symbol +is pulled in as a dependency which results in a build failure. To +prevent this undefine CONFIG_X86_DEBUG_FPU to disable the WARN_ON_FPU() +macro and rely on WARN_ON_ONCE debugging checks which were previously +added. + +Reviewed-by: Tony Hutter +Signed-off-by: Brian Behlendorf +Closes #9041 +Closes #9049 +(cherry picked from commit 095b5412b31c07cad5cec74a4eb5ace011c92b27) +Signed-off-by: Fabian Grünbichler +--- + include/linux/simd_x86.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h +index 2d7a1c3a5..5f243e0cc 100644 +--- a/include/linux/simd_x86.h ++++ b/include/linux/simd_x86.h +@@ -82,6 +82,15 @@ + + #if defined(_KERNEL) + ++/* ++ * Disable the WARN_ON_FPU() macro to prevent additional dependencies ++ * when providing the kfpu_* functions. Relevant warnings are included ++ * as appropriate and are unconditionally enabled. ++ */ ++#if defined(CONFIG_X86_DEBUG_FPU) && !defined(KERNEL_EXPORTS_X86_FPU) ++#undef CONFIG_X86_DEBUG_FPU ++#endif ++ + #if defined(HAVE_KERNEL_FPU_API_HEADER) + #include + #include diff --git a/debian/patches/series b/debian/patches/series index 9da3503..9b0d7fb 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -5,3 +5,5 @@ 0005-import-with-d-dev-disk-by-id-in-scan-service.patch 0006-Enable-zed-emails.patch 0007-Fix-race-in-parallel-mount-s-thread-dispatching-algo.patch +0008-Linux-5.0-compat-SIMD-compatibility.patch +0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch