From f43dbfa75207ffa8be7aa8f969f77f9e5a7a582a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?=
Date: Thu, 8 Aug 2019 15:12:33 +0200
Subject: [PATCH] cherry-pick SIMD compat patches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Fabian Grünbichler
---
 ...-Linux-5.0-compat-SIMD-compatibility.patch | 1615 +++++++++++++++++
 ...x-CONFIG_X86_DEBUG_FPU-build-failure.patch |   44 +
 debian/patches/series                         |    2 +
 3 files changed, 1661 insertions(+)
 create mode 100644 debian/patches/0008-Linux-5.0-compat-SIMD-compatibility.patch
 create mode 100644 debian/patches/0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch

diff --git a/debian/patches/0008-Linux-5.0-compat-SIMD-compatibility.patch b/debian/patches/0008-Linux-5.0-compat-SIMD-compatibility.patch
new file mode 100644
index 0000000..9b25e0c
--- /dev/null
+++ b/debian/patches/0008-Linux-5.0-compat-SIMD-compatibility.patch
@@ -0,0 +1,1615 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf
+Date: Fri, 12 Jul 2019 09:31:20 -0700
+Subject: [PATCH] Linux 5.0 compat: SIMD compatibility
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS,
+and 5.0 and newer kernels. This is accomplished by leveraging
+the fact that by definition dedicated kernel threads never need
+to concern themselves with saving and restoring the user FPU state.
+Therefore, they may use the FPU as long as we can guarantee user
+tasks always restore their FPU state before context switching back
+to user space.
+
+For the 5.0 and 5.1 kernels disabling preemption and local
+interrupts is sufficient to allow the FPU to be used. All non-kernel
+threads will restore the preserved user FPU state.
+
+For 5.2 and later kernels the user FPU state restoration will be
+skipped if the kernel determines the registers have not changed.
+Therefore, for these kernels we need to perform the additional
+step of saving and restoring the FPU registers. Invalidating the
+per-cpu global tracking the FPU state would force a restore, but
+that functionality is private to the core x86 FPU implementation
+and unavailable.
+
+In practice, restricting SIMD to kernel threads is not a major
+restriction for ZFS. The vast majority of SIMD operations are
+already performed by the IO pipeline. The remaining cases are
+relatively infrequent and can be handled by the generic code
+without significant impact. The two most noteworthy cases are:
+
+  1) Decrypting the wrapping key for an encrypted dataset,
+     i.e. `zfs load-key`. All other encryption and decryption
+     operations will use the SIMD optimized implementations.
+
+  2) Generating the payload checksums for a `zfs send` stream.
+
+In order to avoid making any changes to the higher layers of ZFS,
+all of the `*_get_ops()` functions were updated to take into
+consideration the calling context. This allows for the fastest
+implementation to be used as appropriate (see kfpu_allowed()).
+
+The only other notable instance of SIMD operations being used
+outside a kernel thread was at module load time. This code
+was moved into a taskq in order to accommodate the new kernel
+thread restriction.
+
+Finally, a few other modifications were made in order to further
+harden this code and facilitate testing. They include updating
+each implementation's operations structure to be declared as a
+constant, and allowing "cycle" to be set when selecting the
And allowing "cycle" to be set when selecting the +preferred ops in the kernel as well as user space. + +Reviewed-by: Tony Hutter +Signed-off-by: Brian Behlendorf +Closes #8754 +Closes #8793 +Closes #8965 +(cherry picked from commit e5db31349484e5e859c7a942eb15b98d68ce5b4d) +Signed-off-by: Fabian Grünbichler +--- + include/linux/Makefile.am | 1 + + include/linux/simd.h | 41 +++++ + include/linux/simd_aarch64.h | 18 +- + include/linux/simd_x86.h | 192 +++++++++++++------- + include/sys/vdev_raidz.h | 2 +- + include/sys/vdev_raidz_impl.h | 2 +- + module/icp/include/aes/aes_impl.h | 6 +- + module/icp/include/modes/gcm_impl.h | 6 +- + cmd/ztest/ztest.c | 3 + + module/icp/algs/aes/aes_impl.c | 34 ++-- + module/icp/algs/aes/aes_impl_aesni.c | 2 +- + module/icp/algs/modes/gcm.c | 41 +++-- + module/icp/algs/modes/gcm_pclmulqdq.c | 2 +- + module/icp/io/aes.c | 32 +++- + module/spl/spl-taskq.c | 2 + + module/spl/spl-thread.c | 2 + + module/zcommon/zfs_fletcher.c | 88 ++++++--- + module/zcommon/zfs_fletcher_aarch64_neon.c | 2 +- + module/zcommon/zfs_fletcher_avx512.c | 2 +- + module/zcommon/zfs_fletcher_intel.c | 2 +- + module/zcommon/zfs_fletcher_sse.c | 5 +- + module/zfs/vdev_raidz_math.c | 105 +++++++---- + module/zfs/vdev_raidz_math_aarch64_neon.c | 2 +- + module/zfs/vdev_raidz_math_aarch64_neonx2.c | 2 +- + module/zfs/vdev_raidz_math_avx2.c | 2 +- + module/zfs/vdev_raidz_math_avx512bw.c | 5 +- + module/zfs/vdev_raidz_math_avx512f.c | 5 +- + module/zfs/vdev_raidz_math_sse2.c | 2 +- + module/zfs/vdev_raidz_math_ssse3.c | 4 +- + config/kernel-fpu.m4 | 46 ++++- + 30 files changed, 454 insertions(+), 204 deletions(-) + create mode 100644 include/linux/simd.h + +diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am +index efb49520e..2455759e8 100644 +--- a/include/linux/Makefile.am ++++ b/include/linux/Makefile.am +@@ -7,6 +7,7 @@ KERNEL_H = \ + $(top_srcdir)/include/linux/blkdev_compat.h \ + $(top_srcdir)/include/linux/utsname_compat.h \ + $(top_srcdir)/include/linux/kmap_compat.h \ ++ $(top_srcdir)/include/linux/simd.h \ + $(top_srcdir)/include/linux/simd_x86.h \ + $(top_srcdir)/include/linux/simd_aarch64.h \ + $(top_srcdir)/include/linux/mod_compat.h \ +diff --git a/include/linux/simd.h b/include/linux/simd.h +new file mode 100644 +index 000000000..d2b60996a +--- /dev/null ++++ b/include/linux/simd.h +@@ -0,0 +1,41 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (C) 2019 Lawrence Livermore National Security, LLC. 
++ */
++
++#ifndef _SIMD_H
++#define _SIMD_H
++
++#if defined(__x86)
++#include <linux/simd_x86.h>
++
++#elif defined(__aarch64__)
++#include <linux/simd_aarch64.h>
++#else
++
++#define kfpu_allowed() 1
++#define kfpu_initialize(tsk) do {} while (0)
++#define kfpu_begin() do {} while (0)
++#define kfpu_end() do {} while (0)
++
++#endif
++#endif /* _SIMD_H */
+diff --git a/include/linux/simd_aarch64.h b/include/linux/simd_aarch64.h
+index 155ef6205..1cfcd01e4 100644
+--- a/include/linux/simd_aarch64.h
++++ b/include/linux/simd_aarch64.h
+@@ -41,20 +41,18 @@
+
+ #if defined(_KERNEL)
+ #include <asm/neon.h>
+-#define kfpu_begin() \
+-{ \
+- kernel_neon_begin(); \
+-}
+-#define kfpu_end() \
+-{ \
+- kernel_neon_end(); \
+-}
++#define kfpu_allowed() 1
++#define kfpu_initialize(tsk) do {} while (0)
++#define kfpu_begin() kernel_neon_begin()
++#define kfpu_end() kernel_neon_end()
+ #else
+ /*
+ * fpu dummy methods for userspace
+ */
+-#define kfpu_begin() do {} while (0)
+-#define kfpu_end() do {} while (0)
++#define kfpu_allowed() 1
++#define kfpu_initialize(tsk) do {} while (0)
++#define kfpu_begin() do {} while (0)
++#define kfpu_end() do {} while (0)
+ #endif /* defined(_KERNEL) */
+
+ #endif /* __aarch64__ */
+diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
+index 12cd74677..2d7a1c3a5 100644
+--- a/include/linux/simd_x86.h
++++ b/include/linux/simd_x86.h
+@@ -90,33 +90,135 @@
+ #include
+ #endif
+
++/*
++ * The following cases are for kernels which export either the
++ * kernel_fpu_* or __kernel_fpu_* functions.
++ */
++#if defined(KERNEL_EXPORTS_X86_FPU)
++
++#define kfpu_allowed() 1
++#define kfpu_initialize(tsk) do {} while (0)
++
+ #if defined(HAVE_UNDERSCORE_KERNEL_FPU)
+ #define kfpu_begin() \
+-{ \
+- preempt_disable(); \
++{ \
++ preempt_disable(); \
+ __kernel_fpu_begin(); \
+ }
+-#define kfpu_end() \
+-{ \
+- __kernel_fpu_end(); \
+- preempt_enable(); \
++#define kfpu_end() \
++{ \
++ __kernel_fpu_end(); \
++ preempt_enable(); \
+ }
++
+ #elif defined(HAVE_KERNEL_FPU)
+-#define kfpu_begin() kernel_fpu_begin()
++#define kfpu_begin() kernel_fpu_begin()
+ #define kfpu_end() kernel_fpu_end()
++
+ #else
+-/* Kernel doesn't export any kernel_fpu_* functions */
+-#include /* For kernel xgetbv() */
+-#define kfpu_begin() panic("This code should never run")
+-#define kfpu_end() panic("This code should never run")
+-#endif /* defined(HAVE_KERNEL_FPU) */
++/*
++ * This case is unreachable. When KERNEL_EXPORTS_X86_FPU is defined then
++ * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined.
++ */
++#error "Unreachable kernel configuration"
++#endif
++
++#else /* defined(KERNEL_EXPORTS_X86_FPU) */
++/*
++ * When the kernel_fpu_* symbols are unavailable then provide our own
++ * versions which allow the FPU to be safely used in kernel threads.
++ * In practice, this is not a significant restriction for ZFS since the
++ * vast majority of SIMD operations are performed by the IO pipeline.
++ */
+
++/*
++ * Returns non-zero if FPU operations are allowed in the current context.
++ */ ++#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD) ++#define kfpu_allowed() ((current->flags & PF_KTHREAD) && \ ++ test_thread_flag(TIF_NEED_FPU_LOAD)) ++#elif defined(HAVE_KERNEL_FPU_INITIALIZED) ++#define kfpu_allowed() ((current->flags & PF_KTHREAD) && \ ++ current->thread.fpu.initialized) + #else ++#define kfpu_allowed() 0 ++#endif ++ ++static inline void ++kfpu_initialize(void) ++{ ++ WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); ++ ++#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD) ++ __fpu_invalidate_fpregs_state(¤t->thread.fpu); ++ set_thread_flag(TIF_NEED_FPU_LOAD); ++#elif defined(HAVE_KERNEL_FPU_INITIALIZED) ++ __fpu_invalidate_fpregs_state(¤t->thread.fpu); ++ current->thread.fpu.initialized = 1; ++#endif ++} ++ ++static inline void ++kfpu_begin(void) ++{ ++ WARN_ON_ONCE(!kfpu_allowed()); ++ ++ /* ++ * Preemption and interrupts must be disabled for the critical ++ * region where the FPU state is being modified. ++ */ ++ preempt_disable(); ++ local_irq_disable(); ++ ++#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD) ++ /* ++ * The current FPU registers need to be preserved by kfpu_begin() ++ * and restored by kfpu_end(). This is required because we can ++ * not call __cpu_invalidate_fpregs_state() to invalidate the ++ * per-cpu FPU state and force them to be restored during a ++ * context switch. ++ */ ++ copy_fpregs_to_fpstate(¤t->thread.fpu); ++#elif defined(HAVE_KERNEL_FPU_INITIALIZED) ++ /* ++ * There is no need to preserve and restore the FPU registers. ++ * They will always be restored from the task's stored FPU state ++ * when switching contexts. ++ */ ++ WARN_ON_ONCE(current->thread.fpu.initialized == 0); ++#endif ++} ++ ++static inline void ++kfpu_end(void) ++{ ++#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD) ++ union fpregs_state *state = ¤t->thread.fpu.state; ++ int error; ++ ++ if (use_xsave()) { ++ error = copy_kernel_to_xregs_err(&state->xsave, -1); ++ } else if (use_fxsr()) { ++ error = copy_kernel_to_fxregs_err(&state->fxsave); ++ } else { ++ error = copy_kernel_to_fregs_err(&state->fsave); ++ } ++ WARN_ON_ONCE(error); ++#endif ++ ++ local_irq_enable(); ++ preempt_enable(); ++} ++#endif /* defined(HAVE_KERNEL_FPU) */ ++ ++#else /* defined(_KERNEL) */ + /* +- * fpu dummy methods for userspace ++ * FPU dummy methods for user space. 
+ */ +-#define kfpu_begin() do {} while (0) +-#define kfpu_end() do {} while (0) ++#define kfpu_allowed() 1 ++#define kfpu_initialize(tsk) do {} while (0) ++#define kfpu_begin() do {} while (0) ++#define kfpu_end() do {} while (0) + #endif /* defined(_KERNEL) */ + + /* +@@ -298,7 +400,7 @@ __simd_state_enabled(const uint64_t state) + uint64_t xcr0; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_OSXSAVE) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_OSXSAVE) + has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE); + #else + has_osxsave = B_FALSE; +@@ -328,11 +430,7 @@ static inline boolean_t + zfs_sse_available(void) + { + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + return (!!boot_cpu_has(X86_FEATURE_XMM)); +-#else +- return (B_FALSE); +-#endif + #elif !defined(_KERNEL) + return (__cpuid_has_sse()); + #endif +@@ -345,11 +443,7 @@ static inline boolean_t + zfs_sse2_available(void) + { + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + return (!!boot_cpu_has(X86_FEATURE_XMM2)); +-#else +- return (B_FALSE); +-#endif + #elif !defined(_KERNEL) + return (__cpuid_has_sse2()); + #endif +@@ -362,11 +456,7 @@ static inline boolean_t + zfs_sse3_available(void) + { + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + return (!!boot_cpu_has(X86_FEATURE_XMM3)); +-#else +- return (B_FALSE); +-#endif + #elif !defined(_KERNEL) + return (__cpuid_has_sse3()); + #endif +@@ -379,11 +469,7 @@ static inline boolean_t + zfs_ssse3_available(void) + { + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + return (!!boot_cpu_has(X86_FEATURE_SSSE3)); +-#else +- return (B_FALSE); +-#endif + #elif !defined(_KERNEL) + return (__cpuid_has_ssse3()); + #endif +@@ -396,11 +482,7 @@ static inline boolean_t + zfs_sse4_1_available(void) + { + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + return (!!boot_cpu_has(X86_FEATURE_XMM4_1)); +-#else +- return (B_FALSE); +-#endif + #elif !defined(_KERNEL) + return (__cpuid_has_sse4_1()); + #endif +@@ -413,11 +495,7 @@ static inline boolean_t + zfs_sse4_2_available(void) + { + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + return (!!boot_cpu_has(X86_FEATURE_XMM4_2)); +-#else +- return (B_FALSE); +-#endif + #elif !defined(_KERNEL) + return (__cpuid_has_sse4_2()); + #endif +@@ -431,11 +509,7 @@ zfs_avx_available(void) + { + boolean_t has_avx; + #if defined(_KERNEL) +-#if defined(KERNEL_EXPORTS_X86_FPU) + has_avx = !!boot_cpu_has(X86_FEATURE_AVX); +-#else +- has_avx = B_FALSE; +-#endif + #elif !defined(_KERNEL) + has_avx = __cpuid_has_avx(); + #endif +@@ -451,11 +525,7 @@ zfs_avx2_available(void) + { + boolean_t has_avx2; + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX2) && defined(KERNEL_EXPORTS_X86_FPU) + has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2); +-#else +- has_avx2 = B_FALSE; +-#endif + #elif !defined(_KERNEL) + has_avx2 = __cpuid_has_avx2(); + #endif +@@ -470,7 +540,7 @@ static inline boolean_t + zfs_bmi1_available(void) + { + #if defined(_KERNEL) +-#if defined(X86_FEATURE_BMI1) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_BMI1) + return (!!boot_cpu_has(X86_FEATURE_BMI1)); + #else + return (B_FALSE); +@@ -487,7 +557,7 @@ static inline boolean_t + zfs_bmi2_available(void) + { + #if defined(_KERNEL) +-#if defined(X86_FEATURE_BMI2) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_BMI2) + return (!!boot_cpu_has(X86_FEATURE_BMI2)); + #else + return (B_FALSE); +@@ -504,7 +574,7 @@ static inline boolean_t + zfs_aes_available(void) + { + #if defined(_KERNEL) +-#if 
defined(X86_FEATURE_AES) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AES) + return (!!boot_cpu_has(X86_FEATURE_AES)); + #else + return (B_FALSE); +@@ -521,7 +591,7 @@ static inline boolean_t + zfs_pclmulqdq_available(void) + { + #if defined(_KERNEL) +-#if defined(X86_FEATURE_PCLMULQDQ) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_PCLMULQDQ) + return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ)); + #else + return (B_FALSE); +@@ -555,7 +625,7 @@ zfs_avx512f_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512F) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512F) + has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F); + #else + has_avx512 = B_FALSE; +@@ -574,7 +644,7 @@ zfs_avx512cd_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512CD) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512CD) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512CD); + #else +@@ -594,7 +664,7 @@ zfs_avx512er_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512ER) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512ER) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512ER); + #else +@@ -614,7 +684,7 @@ zfs_avx512pf_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512PF) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512PF) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512PF); + #else +@@ -634,7 +704,7 @@ zfs_avx512bw_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512BW) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512BW) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW); + #else +@@ -654,7 +724,7 @@ zfs_avx512dq_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512DQ) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512DQ) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512DQ); + #else +@@ -674,7 +744,7 @@ zfs_avx512vl_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512VL) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512VL) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL); + #else +@@ -694,7 +764,7 @@ zfs_avx512ifma_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512IFMA) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512IFMA) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512IFMA); + #else +@@ -714,7 +784,7 @@ zfs_avx512vbmi_available(void) + boolean_t has_avx512 = B_FALSE; + + #if defined(_KERNEL) +-#if defined(X86_FEATURE_AVX512VBMI) && defined(KERNEL_EXPORTS_X86_FPU) ++#if defined(X86_FEATURE_AVX512VBMI) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VBMI); + #else +diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h +index 2ce32469d..0ce2b5ea1 100644 +--- a/include/sys/vdev_raidz.h ++++ b/include/sys/vdev_raidz.h +@@ -51,7 +51,7 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); + */ + void 
vdev_raidz_math_init(void); + void vdev_raidz_math_fini(void); +-struct raidz_impl_ops *vdev_raidz_math_get_ops(void); ++const struct raidz_impl_ops *vdev_raidz_math_get_ops(void); + int vdev_raidz_math_generate(struct raidz_map *); + int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *, + const int); +diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h +index 0799ed19d..4969d110b 100644 +--- a/include/sys/vdev_raidz_impl.h ++++ b/include/sys/vdev_raidz_impl.h +@@ -126,7 +126,7 @@ typedef struct raidz_map { + uintptr_t rm_reports; /* # of referencing checksum reports */ + uint8_t rm_freed; /* map no longer has referencing ZIO */ + uint8_t rm_ecksuminjected; /* checksum error was injected */ +- raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ ++ const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ + } raidz_map_t; + +diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h +index 95cfddf9e..9fd9c1bd1 100644 +--- a/module/icp/include/aes/aes_impl.h ++++ b/module/icp/include/aes/aes_impl.h +@@ -198,12 +198,12 @@ extern const aes_impl_ops_t aes_aesni_impl; + /* + * Initializes fastest implementation + */ +-void aes_impl_init(void); ++void aes_impl_init(void *arg); + + /* +- * Get selected aes implementation ++ * Returns optimal allowed AES implementation + */ +-struct aes_impl_ops *aes_impl_get_ops(void); ++const struct aes_impl_ops *aes_impl_get_ops(void); + + #ifdef __cplusplus + } +diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h +index cbb904c05..138090487 100644 +--- a/module/icp/include/modes/gcm_impl.h ++++ b/module/icp/include/modes/gcm_impl.h +@@ -61,12 +61,12 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl; + /* + * Initializes fastest implementation + */ +-void gcm_impl_init(void); ++void gcm_impl_init(void *arg); + + /* +- * Get selected aes implementation ++ * Returns optimal allowed GCM implementation + */ +-struct gcm_impl_ops *gcm_impl_get_ops(void); ++const struct gcm_impl_ops *gcm_impl_get_ops(void); + + #ifdef __cplusplus + } +diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c +index 9c2cf9501..815462443 100644 +--- a/cmd/ztest/ztest.c ++++ b/cmd/ztest/ztest.c +@@ -107,6 +107,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -7094,6 +7095,8 @@ ztest_run(ztest_shared_t *zs) + metaslab_preload_limit = ztest_random(20) + 1; + ztest_spa = spa; + ++ VERIFY0(vdev_raidz_impl_set("cycle")); ++ + dmu_objset_stats_t dds; + VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, + DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); +diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c +index e15050635..457b9e45c 100644 +--- a/module/icp/algs/aes/aes_impl.c ++++ b/module/icp/algs/aes/aes_impl.c +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + + /* + * Initialize AES encryption and decryption key schedules. 
+@@ -40,9 +41,9 @@ + void + aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched) + { +- aes_impl_ops_t *ops = aes_impl_get_ops(); +- aes_key_t *newbie = keysched; +- uint_t keysize, i, j; ++ const aes_impl_ops_t *ops = aes_impl_get_ops(); ++ aes_key_t *newbie = keysched; ++ uint_t keysize, i, j; + union { + uint64_t ka64[4]; + uint32_t ka32[8]; +@@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0; + static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)]; + + /* +- * Selects the aes operations for encrypt/decrypt/key setup ++ * Returns the AES operations for encrypt/decrypt/key setup. When a ++ * SIMD implementation is not allowed in the current context, then ++ * fallback to the fastest generic implementation. + */ +-aes_impl_ops_t * +-aes_impl_get_ops() ++const aes_impl_ops_t * ++aes_impl_get_ops(void) + { +- aes_impl_ops_t *ops = NULL; ++ if (!kfpu_allowed()) ++ return (&aes_generic_impl); ++ ++ const aes_impl_ops_t *ops = NULL; + const uint32_t impl = AES_IMPL_READ(icp_aes_impl); + + switch (impl) { +@@ -266,15 +272,13 @@ aes_impl_get_ops() + ops = &aes_fastest_impl; + break; + case IMPL_CYCLE: +- { ++ /* Cycle through supported implementations */ + ASSERT(aes_impl_initialized); + ASSERT3U(aes_supp_impl_cnt, >, 0); +- /* Cycle through supported implementations */ + static size_t cycle_impl_idx = 0; + size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt; + ops = aes_supp_impl[idx]; +- } +- break; ++ break; + default: + ASSERT3U(impl, <, aes_supp_impl_cnt); + ASSERT3U(aes_supp_impl_cnt, >, 0); +@@ -288,13 +292,17 @@ aes_impl_get_ops() + return (ops); + } + ++/* ++ * Initialize all supported implementations. ++ */ ++/* ARGSUSED */ + void +-aes_impl_init(void) ++aes_impl_init(void *arg) + { + aes_impl_ops_t *curr_impl; + int i, c; + +- /* move supported impl into aes_supp_impls */ ++ /* Move supported implementations into aes_supp_impls */ + for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) { + curr_impl = (aes_impl_ops_t *)aes_all_impl[i]; + +diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c +index 97f7c3a47..222c176aa 100644 +--- a/module/icp/algs/aes/aes_impl_aesni.c ++++ b/module/icp/algs/aes/aes_impl_aesni.c +@@ -108,7 +108,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4], + static boolean_t + aes_aesni_will_work(void) + { +- return (zfs_aes_available()); ++ return (kfpu_allowed() && zfs_aes_available()); + } + + const aes_impl_ops_t aes_aesni_impl = { +diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c +index 13bceef0f..f6f8434de 100644 +--- a/module/icp/algs/modes/gcm.c ++++ b/module/icp/algs/modes/gcm.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #define GHASH(c, d, t, o) \ + xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ +@@ -46,7 +47,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { +- gcm_impl_ops_t *gops; ++ const gcm_impl_ops_t *gops; + size_t remainder = length; + size_t need = 0; + uint8_t *datap = (uint8_t *)data; +@@ -168,7 +169,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { +- gcm_impl_ops_t *gops; ++ const gcm_impl_ops_t *gops; + uint64_t counter_mask = ntohll(0x00000000ffffffffULL); + uint8_t *ghash, *macp = NULL; + int i, rv; +@@ -320,7 +321,7 @@ gcm_decrypt_final(gcm_ctx_t 
*ctx, crypto_data_t *out, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { +- gcm_impl_ops_t *gops; ++ const gcm_impl_ops_t *gops; + size_t pt_len; + size_t remainder; + uint8_t *ghash; +@@ -427,7 +428,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len, + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { +- gcm_impl_ops_t *gops; ++ const gcm_impl_ops_t *gops; + uint8_t *cb; + ulong_t remainder = iv_len; + ulong_t processed = 0; +@@ -481,7 +482,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { +- gcm_impl_ops_t *gops; ++ const gcm_impl_ops_t *gops; + uint8_t *ghash, *datap, *authp; + size_t remainder, processed; + +@@ -660,12 +661,17 @@ static size_t gcm_supp_impl_cnt = 0; + static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)]; + + /* +- * Selects the gcm operation ++ * Returns the GCM operations for encrypt/decrypt/key setup. When a ++ * SIMD implementation is not allowed in the current context, then ++ * fallback to the fastest generic implementation. + */ +-gcm_impl_ops_t * ++const gcm_impl_ops_t * + gcm_impl_get_ops() + { +- gcm_impl_ops_t *ops = NULL; ++ if (!kfpu_allowed()) ++ return (&gcm_generic_impl); ++ ++ const gcm_impl_ops_t *ops = NULL; + const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); + + switch (impl) { +@@ -674,15 +680,13 @@ gcm_impl_get_ops() + ops = &gcm_fastest_impl; + break; + case IMPL_CYCLE: +- { ++ /* Cycle through supported implementations */ + ASSERT(gcm_impl_initialized); + ASSERT3U(gcm_supp_impl_cnt, >, 0); +- /* Cycle through supported implementations */ + static size_t cycle_impl_idx = 0; + size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt; + ops = gcm_supp_impl[idx]; +- } +- break; ++ break; + default: + ASSERT3U(impl, <, gcm_supp_impl_cnt); + ASSERT3U(gcm_supp_impl_cnt, >, 0); +@@ -696,13 +700,17 @@ gcm_impl_get_ops() + return (ops); + } + ++/* ++ * Initialize all supported implementations. ++ */ ++/* ARGSUSED */ + void +-gcm_impl_init(void) ++gcm_impl_init(void *arg) + { + gcm_impl_ops_t *curr_impl; + int i, c; + +- /* move supported impl into aes_supp_impls */ ++ /* Move supported implementations into gcm_supp_impls */ + for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) { + curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i]; + +@@ -711,7 +719,10 @@ gcm_impl_init(void) + } + gcm_supp_impl_cnt = c; + +- /* set fastest implementation. assume hardware accelerated is fastest */ ++ /* ++ * Set the fastest implementation given the assumption that the ++ * hardware accelerated version is the fastest. 
++ */ + #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) + if (gcm_pclmulqdq_impl.is_supported()) + memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl, +diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c +index be00ba37b..8a43ba33a 100644 +--- a/module/icp/algs/modes/gcm_pclmulqdq.c ++++ b/module/icp/algs/modes/gcm_pclmulqdq.c +@@ -52,7 +52,7 @@ gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res) + static boolean_t + gcm_pclmulqdq_will_work(void) + { +- return (zfs_pclmulqdq_available()); ++ return (kfpu_allowed() && zfs_pclmulqdq_available()); + } + + const gcm_impl_ops_t gcm_pclmulqdq_impl = { +diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c +index 53b193693..51538bc60 100644 +--- a/module/icp/io/aes.c ++++ b/module/icp/io/aes.c +@@ -206,9 +206,35 @@ aes_mod_init(void) + { + int ret; + +- /* find fastest implementations and set any requested implementations */ +- aes_impl_init(); +- gcm_impl_init(); ++#if defined(_KERNEL) ++ /* ++ * Determine the fastest available implementation. The benchmarks ++ * are run in dedicated kernel threads to allow Linux 5.0+ kernels ++ * to use SIMD operations. If for some reason this isn't possible, ++ * fallback to the generic implementations. See the comment in ++ * include/linux/simd_x86.h for additional details. Additionally, ++ * this has the benefit of allowing them to be run in parallel. ++ */ ++ taskqid_t aes_id = taskq_dispatch(system_taskq, aes_impl_init, ++ NULL, TQ_SLEEP); ++ taskqid_t gcm_id = taskq_dispatch(system_taskq, gcm_impl_init, ++ NULL, TQ_SLEEP); ++ ++ if (aes_id != TASKQID_INVALID) { ++ taskq_wait_id(system_taskq, aes_id); ++ } else { ++ aes_impl_init(NULL); ++ } ++ ++ if (gcm_id != TASKQID_INVALID) { ++ taskq_wait_id(system_taskq, gcm_id); ++ } else { ++ gcm_impl_init(NULL); ++ } ++#else ++ aes_impl_init(NULL); ++ gcm_impl_init(NULL); ++#endif + + if ((ret = mod_install(&modlinkage)) != 0) + return (ret); +diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c +index 7684257be..de0e45190 100644 +--- a/module/spl/spl-taskq.c ++++ b/module/spl/spl-taskq.c +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + + int spl_taskq_thread_bind = 0; + module_param(spl_taskq_thread_bind, int, 0644); +@@ -869,6 +870,7 @@ taskq_thread(void *args) + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); ++ kfpu_initialize(); + + tsd_set(taskq_tsd, tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); +diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c +index d441ad65f..c4977bcf2 100644 +--- a/module/spl/spl-thread.c ++++ b/module/spl/spl-thread.c +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + + /* + * Thread interfaces +@@ -54,6 +55,7 @@ thread_generic_wrapper(void *arg) + args = tp->tp_args; + set_current_state(tp->tp_state); + set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri)); ++ kfpu_initialize(); + kmem_free(tp->tp_name, tp->tp_name_size); + kmem_free(tp, sizeof (thread_priv_t)); + +diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c +index 5a991ba60..b75d8ab00 100644 +--- a/module/zcommon/zfs_fletcher.c ++++ b/module/zcommon/zfs_fletcher.c +@@ -140,6 +140,7 @@ + #include + #include + #include ++#include + + #define FLETCHER_MIN_SIMD_SIZE 64 + +@@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector { + const char *fis_name; + uint32_t fis_sel; + } fletcher_4_impl_selectors[] = { +-#if !defined(_KERNEL) + { "cycle", IMPL_CYCLE }, +-#endif + { "fastest", 
IMPL_FASTEST }, + { "scalar", IMPL_SCALAR } + }; + + #if defined(_KERNEL) + static kstat_t *fletcher_4_kstat; +-#endif + + static struct fletcher_4_kstat { + uint64_t native; + uint64_t byteswap; + } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1]; ++#endif + + /* Indicate that benchmark has been completed */ + static boolean_t fletcher_4_initialized = B_FALSE; +@@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val) + return (err); + } + ++/* ++ * Returns the Fletcher 4 operations for checksums. When a SIMD ++ * implementation is not allowed in the current context, then fallback ++ * to the fastest generic implementation. ++ */ + static inline const fletcher_4_ops_t * + fletcher_4_impl_get(void) + { +- fletcher_4_ops_t *ops = NULL; +- const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); ++ if (!kfpu_allowed()) ++ return (&fletcher_4_superscalar4_ops); ++ ++ const fletcher_4_ops_t *ops = NULL; ++ uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); + + switch (impl) { + case IMPL_FASTEST: + ASSERT(fletcher_4_initialized); + ops = &fletcher_4_fastest_impl; + break; +-#if !defined(_KERNEL) +- case IMPL_CYCLE: { ++ case IMPL_CYCLE: ++ /* Cycle through supported implementations */ + ASSERT(fletcher_4_initialized); + ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); +- + static uint32_t cycle_count = 0; + uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt; + ops = fletcher_4_supp_impls[idx]; +- } +- break; +-#endif ++ break; + default: + ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); + ASSERT3U(impl, <, fletcher_4_supp_impls_cnt); +- + ops = fletcher_4_supp_impls[impl]; + break; + } +@@ -658,6 +661,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) + typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, + zio_cksum_t *); + ++#if defined(_KERNEL) + static void + fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) + { +@@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) + /* restore original selection */ + atomic_swap_32(&fletcher_4_impl_chosen, sel_save); + } ++#endif /* _KERNEL */ + +-void +-fletcher_4_init(void) ++/* ++ * Initialize and benchmark all supported implementations. 
++ */ ++static void ++fletcher_4_benchmark(void *arg) + { +- static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ + fletcher_4_ops_t *curr_impl; +- char *databuf; + int i, c; + +- /* move supported impl into fletcher_4_supp_impls */ ++ /* Move supported implementations into fletcher_4_supp_impls */ + for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) { + curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i]; + +@@ -735,19 +741,10 @@ fletcher_4_init(void) + membar_producer(); /* complete fletcher_4_supp_impls[] init */ + fletcher_4_supp_impls_cnt = c; /* number of supported impl */ + +-#if !defined(_KERNEL) +- /* Skip benchmarking and use last implementation as fastest */ +- memcpy(&fletcher_4_fastest_impl, +- fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1], +- sizeof (fletcher_4_fastest_impl)); +- fletcher_4_fastest_impl.name = "fastest"; +- membar_producer(); ++#if defined(_KERNEL) ++ static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ ++ char *databuf = vmem_alloc(data_size, KM_SLEEP); + +- fletcher_4_initialized = B_TRUE; +- return; +-#endif +- /* Benchmark all supported implementations */ +- databuf = vmem_alloc(data_size, KM_SLEEP); + for (i = 0; i < data_size / sizeof (uint64_t); i++) + ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */ + +@@ -755,9 +752,38 @@ fletcher_4_init(void) + fletcher_4_benchmark_impl(B_TRUE, databuf, data_size); + + vmem_free(databuf, data_size); ++#else ++ /* ++ * Skip the benchmark in user space to avoid impacting libzpool ++ * consumers (zdb, zhack, zinject, ztest). The last implementation ++ * is assumed to be the fastest and used by default. ++ */ ++ memcpy(&fletcher_4_fastest_impl, ++ fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1], ++ sizeof (fletcher_4_fastest_impl)); ++ fletcher_4_fastest_impl.name = "fastest"; ++ membar_producer(); ++#endif /* _KERNEL */ ++} + ++void ++fletcher_4_init(void) ++{ + #if defined(_KERNEL) +- /* install kstats for all implementations */ ++ /* ++ * For 5.0 and latter Linux kernels the fletcher 4 benchmarks are ++ * run in a kernel threads. This is needed to take advantage of the ++ * SIMD functionality, see include/linux/simd_x86.h for details. 
++ */ ++ taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark, ++ NULL, TQ_SLEEP); ++ if (id != TASKQID_INVALID) { ++ taskq_wait_id(system_taskq, id); ++ } else { ++ fletcher_4_benchmark(NULL); ++ } ++ ++ /* Install kstats for all implementations */ + fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + if (fletcher_4_kstat != NULL) { +@@ -769,6 +795,8 @@ fletcher_4_init(void) + fletcher_4_kstat_addr); + kstat_install(fletcher_4_kstat); + } ++#else ++ fletcher_4_benchmark(NULL); + #endif + + /* Finish initialization */ +diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c +index bd2db2b20..3b3c1b52b 100644 +--- a/module/zcommon/zfs_fletcher_aarch64_neon.c ++++ b/module/zcommon/zfs_fletcher_aarch64_neon.c +@@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16))); + + static boolean_t fletcher_4_aarch64_neon_valid(void) + { +- return (B_TRUE); ++ return (kfpu_allowed()); + } + + const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = { +diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c +index 7260a9864..0d4cff21a 100644 +--- a/module/zcommon/zfs_fletcher_avx512.c ++++ b/module/zcommon/zfs_fletcher_avx512.c +@@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap); + static boolean_t + fletcher_4_avx512f_valid(void) + { +- return (zfs_avx512f_available()); ++ return (kfpu_allowed() && zfs_avx512f_available()); + } + + const fletcher_4_ops_t fletcher_4_avx512f_ops = { +diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c +index 6dac047da..7f12efe6d 100644 +--- a/module/zcommon/zfs_fletcher_intel.c ++++ b/module/zcommon/zfs_fletcher_intel.c +@@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) + + static boolean_t fletcher_4_avx2_valid(void) + { +- return (zfs_avx_available() && zfs_avx2_available()); ++ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available()); + } + + const fletcher_4_ops_t fletcher_4_avx2_ops = { +diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c +index a0b42e5f5..e6389d6e5 100644 +--- a/module/zcommon/zfs_fletcher_sse.c ++++ b/module/zcommon/zfs_fletcher_sse.c +@@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) + + static boolean_t fletcher_4_sse2_valid(void) + { +- return (zfs_sse2_available()); ++ return (kfpu_allowed() && zfs_sse2_available()); + } + + const fletcher_4_ops_t fletcher_4_sse2_ops = { +@@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) + + static boolean_t fletcher_4_ssse3_valid(void) + { +- return (zfs_sse2_available() && zfs_ssse3_available()); ++ return (kfpu_allowed() && zfs_sse2_available() && ++ zfs_ssse3_available()); + } + + const fletcher_4_ops_t fletcher_4_ssse3_ops = { +diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c +index e6112bc02..e7a39015c 100644 +--- a/module/zfs/vdev_raidz_math.c ++++ b/module/zfs/vdev_raidz_math.c +@@ -27,9 +27,9 @@ + #include + #include + #include +- + #include + #include ++#include + + extern boolean_t raidz_will_scalar_work(void); + +@@ -87,6 +87,7 @@ static uint32_t user_sel_impl = IMPL_FASTEST; + static size_t raidz_supp_impl_cnt = 0; + static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)]; + ++#if defined(_KERNEL) + /* + * kstats values for supported implementations + * 
Values represent per disk throughput of 8 disk+parity raidz vdev [B/s] +@@ -95,14 +96,19 @@ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1]; + + /* kstat for benchmarked implementations */ + static kstat_t *raidz_math_kstat = NULL; ++#endif + + /* +- * Selects the raidz operation for raidz_map +- * If rm_ops is set to NULL original raidz implementation will be used ++ * Returns the RAIDZ operations for raidz_map() parity calculations. When ++ * a SIMD implementation is not allowed in the current context, then fallback ++ * to the fastest generic implementation. + */ +-raidz_impl_ops_t * +-vdev_raidz_math_get_ops() ++const raidz_impl_ops_t * ++vdev_raidz_math_get_ops(void) + { ++ if (!kfpu_allowed()) ++ return (&vdev_raidz_scalar_impl); ++ + raidz_impl_ops_t *ops = NULL; + const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); + +@@ -111,18 +117,14 @@ vdev_raidz_math_get_ops() + ASSERT(raidz_math_initialized); + ops = &vdev_raidz_fastest_impl; + break; +-#if !defined(_KERNEL) + case IMPL_CYCLE: +- { ++ /* Cycle through all supported implementations */ + ASSERT(raidz_math_initialized); + ASSERT3U(raidz_supp_impl_cnt, >, 0); +- /* Cycle through all supported implementations */ + static size_t cycle_impl_idx = 0; + size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; + ops = raidz_supp_impl[idx]; +- } +- break; +-#endif ++ break; + case IMPL_ORIGINAL: + ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; + break; +@@ -273,6 +275,8 @@ const char *raidz_rec_name[] = { + "rec_pq", "rec_pr", "rec_qr", "rec_pqr" + }; + ++#if defined(_KERNEL) ++ + #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1) + + static int +@@ -435,21 +439,21 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) + } + } + } ++#endif + +-void +-vdev_raidz_math_init(void) ++/* ++ * Initialize and benchmark all supported implementations. 
++ */ ++static void ++benchmark_raidz(void *arg) + { + raidz_impl_ops_t *curr_impl; +- zio_t *bench_zio = NULL; +- raidz_map_t *bench_rm = NULL; +- uint64_t bench_parity; +- int i, c, fn; ++ int i, c; + +- /* move supported impl into raidz_supp_impl */ ++ /* Move supported impl into raidz_supp_impl */ + for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; + +- /* initialize impl */ + if (curr_impl->init) + curr_impl->init(); + +@@ -459,18 +463,10 @@ vdev_raidz_math_init(void) + membar_producer(); /* complete raidz_supp_impl[] init */ + raidz_supp_impl_cnt = c; /* number of supported impl */ + +-#if !defined(_KERNEL) +- /* Skip benchmarking and use last implementation as fastest */ +- memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1], +- sizeof (vdev_raidz_fastest_impl)); +- strcpy(vdev_raidz_fastest_impl.name, "fastest"); +- +- raidz_math_initialized = B_TRUE; +- +- /* Use 'cycle' math selection method for userspace */ +- VERIFY0(vdev_raidz_impl_set("cycle")); +- return; +-#endif ++#if defined(_KERNEL) ++ zio_t *bench_zio = NULL; ++ raidz_map_t *bench_rm = NULL; ++ uint64_t bench_parity; + + /* Fake an zio and run the benchmark on a warmed up buffer */ + bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); +@@ -480,7 +476,7 @@ vdev_raidz_math_init(void) + memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); + + /* Benchmark parity generation methods */ +- for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { ++ for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + bench_parity = fn + 1; + /* New raidz_map is needed for each generate_p/q/r */ + bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, +@@ -495,7 +491,7 @@ vdev_raidz_math_init(void) + bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, + BENCH_COLS, PARITY_PQR); + +- for (fn = 0; fn < RAIDZ_REC_NUM; fn++) ++ for (int fn = 0; fn < RAIDZ_REC_NUM; fn++) + benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl); + + vdev_raidz_map_free(bench_rm); +@@ -503,11 +499,39 @@ vdev_raidz_math_init(void) + /* cleanup the bench zio */ + abd_free(bench_zio->io_abd); + kmem_free(bench_zio, sizeof (zio_t)); ++#else ++ /* ++ * Skip the benchmark in user space to avoid impacting libzpool ++ * consumers (zdb, zhack, zinject, ztest). The last implementation ++ * is assumed to be the fastest and used by default. ++ */ ++ memcpy(&vdev_raidz_fastest_impl, ++ raidz_supp_impl[raidz_supp_impl_cnt - 1], ++ sizeof (vdev_raidz_fastest_impl)); ++ strcpy(vdev_raidz_fastest_impl.name, "fastest"); ++#endif /* _KERNEL */ ++} + +- /* install kstats for all impl */ ++void ++vdev_raidz_math_init(void) ++{ ++#if defined(_KERNEL) ++ /* ++ * For 5.0 and latter Linux kernels the fletcher 4 benchmarks are ++ * run in a kernel threads. This is needed to take advantage of the ++ * SIMD functionality, see include/linux/simd_x86.h for details. 
++ */ ++ taskqid_t id = taskq_dispatch(system_taskq, benchmark_raidz, ++ NULL, TQ_SLEEP); ++ if (id != TASKQID_INVALID) { ++ taskq_wait_id(system_taskq, id); ++ } else { ++ benchmark_raidz(NULL); ++ } ++ ++ /* Install kstats for all implementations */ + raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); +- + if (raidz_math_kstat != NULL) { + raidz_math_kstat->ks_data = NULL; + raidz_math_kstat->ks_ndata = UINT32_MAX; +@@ -517,6 +541,9 @@ vdev_raidz_math_init(void) + raidz_math_kstat_addr); + kstat_install(raidz_math_kstat); + } ++#else ++ benchmark_raidz(NULL); ++#endif + + /* Finish initialization */ + atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl); +@@ -527,15 +554,15 @@ void + vdev_raidz_math_fini(void) + { + raidz_impl_ops_t const *curr_impl; +- int i; + ++#if defined(_KERNEL) + if (raidz_math_kstat != NULL) { + kstat_delete(raidz_math_kstat); + raidz_math_kstat = NULL; + } ++#endif + +- /* fini impl */ +- for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { ++ for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + curr_impl = raidz_all_maths[i]; + if (curr_impl->fini) + curr_impl->fini(); +@@ -546,9 +573,7 @@ static const struct { + char *name; + uint32_t sel; + } math_impl_opts[] = { +-#if !defined(_KERNEL) + { "cycle", IMPL_CYCLE }, +-#endif + { "fastest", IMPL_FASTEST }, + { "original", IMPL_ORIGINAL }, + { "scalar", IMPL_SCALAR } +diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c +index e3ad06776..0a67ceb84 100644 +--- a/module/zfs/vdev_raidz_math_aarch64_neon.c ++++ b/module/zfs/vdev_raidz_math_aarch64_neon.c +@@ -207,7 +207,7 @@ DEFINE_REC_METHODS(aarch64_neon); + static boolean_t + raidz_will_aarch64_neon_work(void) + { +- return (B_TRUE); // __arch64__ requires NEON ++ return (kfpu_allowed()); + } + + const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = { +diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c +index f8688a06a..e072f51cd 100644 +--- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c ++++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c +@@ -217,7 +217,7 @@ DEFINE_REC_METHODS(aarch64_neonx2); + static boolean_t + raidz_will_aarch64_neonx2_work(void) + { +- return (B_TRUE); // __arch64__ requires NEON ++ return (kfpu_allowed()); + } + + const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = { +diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c +index 063d29bcd..a12eb6720 100644 +--- a/module/zfs/vdev_raidz_math_avx2.c ++++ b/module/zfs/vdev_raidz_math_avx2.c +@@ -396,7 +396,7 @@ DEFINE_REC_METHODS(avx2); + static boolean_t + raidz_will_avx2_work(void) + { +- return (zfs_avx_available() && zfs_avx2_available()); ++ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available()); + } + + const raidz_impl_ops_t vdev_raidz_avx2_impl = { +diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c +index d605653db..2f545c9ec 100644 +--- a/module/zfs/vdev_raidz_math_avx512bw.c ++++ b/module/zfs/vdev_raidz_math_avx512bw.c +@@ -393,9 +393,8 @@ DEFINE_REC_METHODS(avx512bw); + static boolean_t + raidz_will_avx512bw_work(void) + { +- return (zfs_avx_available() && +- zfs_avx512f_available() && +- zfs_avx512bw_available()); ++ return (kfpu_allowed() && zfs_avx_available() && ++ zfs_avx512f_available() && zfs_avx512bw_available()); + } + + const raidz_impl_ops_t vdev_raidz_avx512bw_impl = { +diff --git a/module/zfs/vdev_raidz_math_avx512f.c 
b/module/zfs/vdev_raidz_math_avx512f.c +index f4e4560ce..75af7a8ee 100644 +--- a/module/zfs/vdev_raidz_math_avx512f.c ++++ b/module/zfs/vdev_raidz_math_avx512f.c +@@ -470,9 +470,8 @@ DEFINE_REC_METHODS(avx512f); + static boolean_t + raidz_will_avx512f_work(void) + { +- return (zfs_avx_available() && +- zfs_avx2_available() && +- zfs_avx512f_available()); ++ return (kfpu_allowed() && zfs_avx_available() && ++ zfs_avx2_available() && zfs_avx512f_available()); + } + + const raidz_impl_ops_t vdev_raidz_avx512f_impl = { +diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c +index 9985da273..5b3a9385c 100644 +--- a/module/zfs/vdev_raidz_math_sse2.c ++++ b/module/zfs/vdev_raidz_math_sse2.c +@@ -607,7 +607,7 @@ DEFINE_REC_METHODS(sse2); + static boolean_t + raidz_will_sse2_work(void) + { +- return (zfs_sse_available() && zfs_sse2_available()); ++ return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available()); + } + + const raidz_impl_ops_t vdev_raidz_sse2_impl = { +diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c +index 047a48d54..62247cf8e 100644 +--- a/module/zfs/vdev_raidz_math_ssse3.c ++++ b/module/zfs/vdev_raidz_math_ssse3.c +@@ -399,8 +399,8 @@ DEFINE_REC_METHODS(ssse3); + static boolean_t + raidz_will_ssse3_work(void) + { +- return (zfs_sse_available() && zfs_sse2_available() && +- zfs_ssse3_available()); ++ return (kfpu_allowed() && zfs_sse_available() && ++ zfs_sse2_available() && zfs_ssse3_available()); + } + + const raidz_impl_ops_t vdev_raidz_ssse3_impl = { +diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4 +index 5fff79a74..31bf35f83 100644 +--- a/config/kernel-fpu.m4 ++++ b/config/kernel-fpu.m4 +@@ -2,8 +2,15 @@ dnl # + dnl # Handle differences in kernel FPU code. + dnl # + dnl # Kernel +-dnl # 5.0: All kernel fpu functions are GPL only, so we can't use them. +-dnl # (nothing defined) ++dnl # 5.2: The fpu->initialized flag was replaced by TIF_NEED_FPU_LOAD. ++dnl # HAVE_KERNEL_TIF_NEED_FPU_LOAD ++dnl # ++dnl # 5.0: As an optimization SIMD operations performed by kernel ++dnl # threads can skip saving and restoring their FPU context. ++dnl # Wrappers have been introduced to determine the running ++dnl # context and use either the SIMD or generic implementation. ++dnl # This change was made to the 4.19.38 and 4.14.120 LTS kernels. 
++dnl # HAVE_KERNEL_FPU_INITIALIZED + dnl # + dnl # 4.2: Use __kernel_fpu_{begin,end}() + dnl # HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU +@@ -56,10 +63,39 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [ + __kernel_fpu_end(); + ], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [ + AC_MSG_RESULT(__kernel_fpu_*) +- AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, [kernel has __kernel_fpu_* functions]) +- AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions]) ++ AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, ++ [kernel has __kernel_fpu_* functions]) ++ AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, ++ [kernel exports FPU functions]) + ],[ +- AC_MSG_RESULT(not exported) ++ ZFS_LINUX_TRY_COMPILE([ ++ #include ++ #include ++ ],[ ++ struct fpu *fpu = ¤t->thread.fpu; ++ if (fpu->initialized) { return (0); }; ++ ],[ ++ AC_MSG_RESULT(fpu.initialized) ++ AC_DEFINE(HAVE_KERNEL_FPU_INITIALIZED, 1, ++ [kernel fpu.initialized exists]) ++ ],[ ++ ZFS_LINUX_TRY_COMPILE([ ++ #include ++ #include ++ ++ #if !defined(TIF_NEED_FPU_LOAD) ++ #error "TIF_NEED_FPU_LOAD undefined" ++ #endif ++ ],[ ++ ],[ ++ AC_MSG_RESULT(TIF_NEED_FPU_LOAD) ++ AC_DEFINE( ++ HAVE_KERNEL_TIF_NEED_FPU_LOAD, 1, ++ [kernel TIF_NEED_FPU_LOAD exists]) ++ ],[ ++ AC_MSG_RESULT(unavailable) ++ ]) ++ ]) + ]) + ]) + ]) diff --git a/debian/patches/0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch b/debian/patches/0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch new file mode 100644 index 0000000..c8c8267 --- /dev/null +++ b/debian/patches/0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch @@ -0,0 +1,44 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Brian Behlendorf +Date: Wed, 17 Jul 2019 09:14:36 -0700 +Subject: [PATCH] Fix CONFIG_X86_DEBUG_FPU build failure +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When CONFIG_X86_DEBUG_FPU is defined the alternatives_patched symbol +is pulled in as a dependency which results in a build failure. To +prevent this undefine CONFIG_X86_DEBUG_FPU to disable the WARN_ON_FPU() +macro and rely on WARN_ON_ONCE debugging checks which were previously +added. + +Reviewed-by: Tony Hutter +Signed-off-by: Brian Behlendorf +Closes #9041 +Closes #9049 +(cherry picked from commit 095b5412b31c07cad5cec74a4eb5ace011c92b27) +Signed-off-by: Fabian Grünbichler +--- + include/linux/simd_x86.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h +index 2d7a1c3a5..5f243e0cc 100644 +--- a/include/linux/simd_x86.h ++++ b/include/linux/simd_x86.h +@@ -82,6 +82,15 @@ + + #if defined(_KERNEL) + ++/* ++ * Disable the WARN_ON_FPU() macro to prevent additional dependencies ++ * when providing the kfpu_* functions. Relevant warnings are included ++ * as appropriate and are unconditionally enabled. ++ */ ++#if defined(CONFIG_X86_DEBUG_FPU) && !defined(KERNEL_EXPORTS_X86_FPU) ++#undef CONFIG_X86_DEBUG_FPU ++#endif ++ + #if defined(HAVE_KERNEL_FPU_API_HEADER) + #include + #include diff --git a/debian/patches/series b/debian/patches/series index 9da3503..9b0d7fb 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -5,3 +5,5 @@ 0005-import-with-d-dev-disk-by-id-in-scan-service.patch 0006-Enable-zed-emails.patch 0007-Fix-race-in-parallel-mount-s-thread-dispatching-algo.patch +0008-Linux-5.0-compat-SIMD-compatibility.patch +0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch