From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 12 Jul 2019 09:31:20 -0700
Subject: [PATCH] Linux 5.0 compat: SIMD compatibility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS,
and 5.0 and newer kernels.  This is accomplished by leveraging
the fact that by definition dedicated kernel threads never need
to concern themselves with saving and restoring the user FPU state.
Therefore, they may use the FPU as long as we can guarantee user
tasks always restore their FPU state before context switching back
to user space.
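
A condensed sketch of the resulting context check, taken from the
TIF_NEED_FPU_LOAD variant added to include/linux/simd_x86.h below:

    /*
     * SIMD is permitted only in dedicated kernel threads, which by
     * definition never return to user space with FPU state to lose.
     */
    #define kfpu_allowed()	((current->flags & PF_KTHREAD) && \
				test_thread_flag(TIF_NEED_FPU_LOAD))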

For the 5.0 and 5.1 kernels disabling preemption and local
interrupts is sufficient to allow the FPU to be used.  All non-kernel
threads will restore the preserved user FPU state.
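
The critical section, condensed from the kfpu_begin()/kfpu_end()
pair added below, reduces to:

    preempt_disable();		/* no task switch away from this CPU */
    local_irq_disable();	/* no interrupt may clobber the registers */
    /* ... SIMD work ... */
    local_irq_enable();
    preempt_enable();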

For 5.2 and later kernels the user FPU state restoration will be
skipped if the kernel determines the registers have not changed.
Therefore, for these kernels we need to perform the additional
step of saving and restoring the FPU registers.  Invalidating the
per-cpu global tracking the FPU state would force a restore but
that functionality is private to the core x86 FPU implementation
and unavailable.
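
Condensed from the TIF_NEED_FPU_LOAD branches of kfpu_begin() and
kfpu_end() below, the save/restore bracket amounts to:

    copy_fpregs_to_fpstate(&current->thread.fpu);	/* save */
    /* ... SIMD work ... */
    if (use_xsave())					/* restore */
	WARN_ON_ONCE(copy_kernel_to_xregs_err(
	    &current->thread.fpu.state.xsave, -1));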

In practice, restricting SIMD to kernel threads is not a major
restriction for ZFS.  The vast majority of SIMD operations are
already performed by the IO pipeline.  The remaining cases are
relatively infrequent and can be handled by the generic code
without significant impact.  The two most noteworthy cases are:

1) Decrypting the wrapping key for an encrypted dataset,
   i.e. `zfs load-key`.  All other encryption and decryption
   operations will use the SIMD optimized implementations.

2) Generating the payload checksums for a `zfs send` stream.

In order to avoid making any changes to the higher layers of ZFS
all of the `*_get_ops()` functions were updated to take into
consideration the calling context.  This allows for the fastest
implementation to be used as appropriate (see kfpu_allowed()).
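
Every selector follows the same guard-first pattern, e.g.
aes_impl_get_ops() in the diff below:

    const aes_impl_ops_t *
    aes_impl_get_ops(void)
    {
	if (!kfpu_allowed())
		return (&aes_generic_impl);
	/* ... otherwise honor the selected implementation ... */
    }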

The only other notable instance of SIMD operations being used
outside a kernel thread was at module load time.  This code
was moved into a taskq in order to accommodate the new kernel
thread restriction.
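
The dispatch pattern, as used by aes_mod_init() below, falls back to
a synchronous call should the dispatch fail:

    taskqid_t id = taskq_dispatch(system_taskq, aes_impl_init,
	NULL, TQ_SLEEP);
    if (id != TASKQID_INVALID)
	taskq_wait_id(system_taskq, id);
    else
	aes_impl_init(NULL);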

Finally, a few other modifications were made in order to further
harden this code and facilitate testing.  They include updating
each implementation's operations structure to be declared as a
constant, and allowing "cycle" to be set when selecting the
preferred ops in the kernel as well as user space.
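
The "cycle" selector rotates through every supported implementation
on successive calls, condensed here from the IMPL_CYCLE case below:

    case IMPL_CYCLE:
	/* Cycle through supported implementations */
	static size_t cycle_impl_idx = 0;
	size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
	ops = aes_supp_impl[idx];
	break;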

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8754
Closes #8793
Closes #8965
(cherry picked from commit e5db31349484e5e859c7a942eb15b98d68ce5b4d)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
 include/linux/Makefile.am                   |   1 +
 include/linux/simd.h                        |  41 +++++
 include/linux/simd_aarch64.h                |  18 +-
 include/linux/simd_x86.h                    | 192 +++++++++++++-------
 include/sys/vdev_raidz.h                    |   2 +-
 include/sys/vdev_raidz_impl.h               |   2 +-
 module/icp/include/aes/aes_impl.h           |   6 +-
 module/icp/include/modes/gcm_impl.h         |   6 +-
 cmd/ztest/ztest.c                           |   3 +
 module/icp/algs/aes/aes_impl.c              |  34 ++--
 module/icp/algs/aes/aes_impl_aesni.c        |   2 +-
 module/icp/algs/modes/gcm.c                 |  41 +++--
 module/icp/algs/modes/gcm_pclmulqdq.c       |   2 +-
 module/icp/io/aes.c                         |  32 +++-
 module/spl/spl-taskq.c                      |   2 +
 module/spl/spl-thread.c                     |   2 +
 module/zcommon/zfs_fletcher.c               |  88 ++++++---
 module/zcommon/zfs_fletcher_aarch64_neon.c  |   2 +-
 module/zcommon/zfs_fletcher_avx512.c        |   2 +-
 module/zcommon/zfs_fletcher_intel.c         |   2 +-
 module/zcommon/zfs_fletcher_sse.c           |   5 +-
 module/zfs/vdev_raidz_math.c                | 105 +++++++----
 module/zfs/vdev_raidz_math_aarch64_neon.c   |   2 +-
 module/zfs/vdev_raidz_math_aarch64_neonx2.c |   2 +-
 module/zfs/vdev_raidz_math_avx2.c           |   2 +-
 module/zfs/vdev_raidz_math_avx512bw.c       |   5 +-
 module/zfs/vdev_raidz_math_avx512f.c        |   5 +-
 module/zfs/vdev_raidz_math_sse2.c           |   2 +-
 module/zfs/vdev_raidz_math_ssse3.c          |   4 +-
 config/kernel-fpu.m4                        |  46 ++++-
 30 files changed, 454 insertions(+), 204 deletions(-)
 create mode 100644 include/linux/simd.h

diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am
index efb49520e..2455759e8 100644
--- a/include/linux/Makefile.am
+++ b/include/linux/Makefile.am
@@ -7,6 +7,7 @@ KERNEL_H = \
 	$(top_srcdir)/include/linux/blkdev_compat.h \
 	$(top_srcdir)/include/linux/utsname_compat.h \
 	$(top_srcdir)/include/linux/kmap_compat.h \
+	$(top_srcdir)/include/linux/simd.h \
 	$(top_srcdir)/include/linux/simd_x86.h \
 	$(top_srcdir)/include/linux/simd_aarch64.h \
 	$(top_srcdir)/include/linux/mod_compat.h \
diff --git a/include/linux/simd.h b/include/linux/simd.h
new file mode 100644
index 000000000..d2b60996a
--- /dev/null
+++ b/include/linux/simd.h
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2019 Lawrence Livermore National Security, LLC.
+ */
+
+#ifndef _SIMD_H
+#define	_SIMD_H
+
+#if defined(__x86)
+#include <linux/simd_x86.h>
+
+#elif defined(__aarch64__)
+#include <linux/simd_aarch64.h>
+#else
+
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+
+#endif
+#endif /* _SIMD_H */
diff --git a/include/linux/simd_aarch64.h b/include/linux/simd_aarch64.h
index 155ef6205..1cfcd01e4 100644
--- a/include/linux/simd_aarch64.h
+++ b/include/linux/simd_aarch64.h
@@ -41,20 +41,18 @@
 
 #if defined(_KERNEL)
 #include <asm/neon.h>
-#define kfpu_begin() \
-{ \
-	kernel_neon_begin(); \
-}
-#define kfpu_end() \
-{ \
-	kernel_neon_end(); \
-}
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		kernel_neon_begin()
+#define	kfpu_end()		kernel_neon_end()
 #else
 /*
  * fpu dummy methods for userspace
 */
-#define	kfpu_begin()	do {} while (0)
-#define	kfpu_end()	do {} while (0)
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
 #endif /* defined(_KERNEL) */
 
 #endif /* __aarch64__ */
diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
index 12cd74677..2d7a1c3a5 100644
--- a/include/linux/simd_x86.h
+++ b/include/linux/simd_x86.h
@@ -90,33 +90,135 @@
 #include <asm/xcr.h>
 #endif
 
+/*
+ * The following cases are for kernels which export either the
+ * kernel_fpu_* or __kernel_fpu_* functions.
+ */
+#if defined(KERNEL_EXPORTS_X86_FPU)
+
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+
 #if defined(HAVE_UNDERSCORE_KERNEL_FPU)
 #define	kfpu_begin()		\
-{							\
-	preempt_disable();		\
+{						\
+	preempt_disable();			\
 	__kernel_fpu_begin();		\
 }
-#define	kfpu_end()			\
-{							\
-	__kernel_fpu_end();		\
-	preempt_enable();		\
+#define	kfpu_end()			\
+{						\
+	__kernel_fpu_end();			\
+	preempt_enable();			\
 }
+
 #elif defined(HAVE_KERNEL_FPU)
-#define	kfpu_begin()	kernel_fpu_begin()
+#define	kfpu_begin()		kernel_fpu_begin()
 #define	kfpu_end()		kernel_fpu_end()
+
 #else
-/* Kernel doesn't export any kernel_fpu_* functions */
-#include <asm/fpu/internal.h>	/* For kernel xgetbv() */
-#define	kfpu_begin()	panic("This code should never run")
-#define	kfpu_end()	panic("This code should never run")
-#endif /* defined(HAVE_KERNEL_FPU) */
+/*
+ * This case is unreachable.  When KERNEL_EXPORTS_X86_FPU is defined then
+ * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined.
+ */
+#error "Unreachable kernel configuration"
+#endif
+
+#else /* defined(KERNEL_EXPORTS_X86_FPU) */
+/*
+ * When the kernel_fpu_* symbols are unavailable then provide our own
+ * versions which allow the FPU to be safely used in kernel threads.
+ * In practice, this is not a significant restriction for ZFS since the
+ * vast majority of SIMD operations are performed by the IO pipeline.
+ */
 
+/*
+ * Returns non-zero if FPU operations are allowed in the current context.
+ */
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+#define	kfpu_allowed()		((current->flags & PF_KTHREAD) && \
+				test_thread_flag(TIF_NEED_FPU_LOAD))
+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+#define	kfpu_allowed()		((current->flags & PF_KTHREAD) && \
+				current->thread.fpu.initialized)
 #else
+#define	kfpu_allowed()		0
+#endif
+
+static inline void
+kfpu_initialize(void)
+{
+	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+	__fpu_invalidate_fpregs_state(&current->thread.fpu);
+	set_thread_flag(TIF_NEED_FPU_LOAD);
+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+	__fpu_invalidate_fpregs_state(&current->thread.fpu);
+	current->thread.fpu.initialized = 1;
+#endif
+}
+
+static inline void
+kfpu_begin(void)
+{
+	WARN_ON_ONCE(!kfpu_allowed());
+
+	/*
+	 * Preemption and interrupts must be disabled for the critical
+	 * region where the FPU state is being modified.
+	 */
+	preempt_disable();
+	local_irq_disable();
+
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+	/*
+	 * The current FPU registers need to be preserved by kfpu_begin()
+	 * and restored by kfpu_end().  This is required because we can
+	 * not call __cpu_invalidate_fpregs_state() to invalidate the
+	 * per-cpu FPU state and force them to be restored during a
+	 * context switch.
+	 */
+	copy_fpregs_to_fpstate(&current->thread.fpu);
+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+	/*
+	 * There is no need to preserve and restore the FPU registers.
+	 * They will always be restored from the task's stored FPU state
+	 * when switching contexts.
+	 */
+	WARN_ON_ONCE(current->thread.fpu.initialized == 0);
+#endif
+}
+
+static inline void
+kfpu_end(void)
+{
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+	union fpregs_state *state = &current->thread.fpu.state;
+	int error;
+
+	if (use_xsave()) {
+		error = copy_kernel_to_xregs_err(&state->xsave, -1);
+	} else if (use_fxsr()) {
+		error = copy_kernel_to_fxregs_err(&state->fxsave);
+	} else {
+		error = copy_kernel_to_fregs_err(&state->fsave);
+	}
+	WARN_ON_ONCE(error);
+#endif
+
+	local_irq_enable();
+	preempt_enable();
+}
+#endif /* defined(HAVE_KERNEL_FPU) */
+
+#else /* defined(_KERNEL) */
 /*
- * fpu dummy methods for userspace
+ * FPU dummy methods for user space.
 */
-#define	kfpu_begin()	do {} while (0)
-#define	kfpu_end()	do {} while (0)
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
 #endif /* defined(_KERNEL) */
 
 /*
@@ -298,7 +400,7 @@ __simd_state_enabled(const uint64_t state)
 	uint64_t xcr0;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_OSXSAVE) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_OSXSAVE)
 	has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE);
 #else
 	has_osxsave = B_FALSE;
@@ -328,11 +430,7 @@ static inline boolean_t
 zfs_sse_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse());
 #endif
@@ -345,11 +443,7 @@ static inline boolean_t
 zfs_sse2_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM2));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse2());
 #endif
@@ -362,11 +456,7 @@ static inline boolean_t
 zfs_sse3_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM3));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse3());
 #endif
@@ -379,11 +469,7 @@ static inline boolean_t
 zfs_ssse3_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_SSSE3));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_ssse3());
 #endif
@@ -396,11 +482,7 @@ static inline boolean_t
 zfs_sse4_1_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM4_1));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse4_1());
 #endif
@@ -413,11 +495,7 @@ static inline boolean_t
 zfs_sse4_2_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM4_2));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse4_2());
 #endif
@@ -431,11 +509,7 @@ zfs_avx_available(void)
 {
 	boolean_t has_avx;
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	has_avx = !!boot_cpu_has(X86_FEATURE_AVX);
-#else
-	has_avx = B_FALSE;
-#endif
 #elif !defined(_KERNEL)
 	has_avx = __cpuid_has_avx();
 #endif
@@ -451,11 +525,7 @@ zfs_avx2_available(void)
 {
 	boolean_t has_avx2;
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX2) && defined(KERNEL_EXPORTS_X86_FPU)
 	has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2);
-#else
-	has_avx2 = B_FALSE;
-#endif
 #elif !defined(_KERNEL)
 	has_avx2 = __cpuid_has_avx2();
 #endif
@@ -470,7 +540,7 @@ static inline boolean_t
 zfs_bmi1_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_BMI1) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_BMI1)
 	return (!!boot_cpu_has(X86_FEATURE_BMI1));
 #else
 	return (B_FALSE);
@@ -487,7 +557,7 @@ static inline boolean_t
 zfs_bmi2_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_BMI2) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_BMI2)
 	return (!!boot_cpu_has(X86_FEATURE_BMI2));
 #else
 	return (B_FALSE);
@@ -504,7 +574,7 @@ static inline boolean_t
 zfs_aes_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AES) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AES)
 	return (!!boot_cpu_has(X86_FEATURE_AES));
 #else
 	return (B_FALSE);
@@ -521,7 +591,7 @@ static inline boolean_t
 zfs_pclmulqdq_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_PCLMULQDQ) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_PCLMULQDQ)
 	return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ));
 #else
 	return (B_FALSE);
@@ -555,7 +625,7 @@ zfs_avx512f_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512F) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512F)
 	has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F);
 #else
 	has_avx512 = B_FALSE;
@@ -574,7 +644,7 @@ zfs_avx512cd_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512CD) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512CD)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512CD);
 #else
@@ -594,7 +664,7 @@ zfs_avx512er_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512ER) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512ER)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512ER);
 #else
@@ -614,7 +684,7 @@ zfs_avx512pf_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512PF) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512PF)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512PF);
 #else
@@ -634,7 +704,7 @@ zfs_avx512bw_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512BW) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512BW)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512BW);
 #else
@@ -654,7 +724,7 @@ zfs_avx512dq_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512DQ) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512DQ)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512DQ);
 #else
@@ -674,7 +744,7 @@ zfs_avx512vl_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512VL) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512VL)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512VL);
 #else
@@ -694,7 +764,7 @@ zfs_avx512ifma_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512IFMA) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512IFMA)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512IFMA);
 #else
@@ -714,7 +784,7 @@ zfs_avx512vbmi_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512VBMI) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512VBMI)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512VBMI);
 #else
diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h
index 2ce32469d..0ce2b5ea1 100644
--- a/include/sys/vdev_raidz.h
+++ b/include/sys/vdev_raidz.h
@@ -51,7 +51,7 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
 */
 void vdev_raidz_math_init(void);
 void vdev_raidz_math_fini(void);
-struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
+const struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
 int vdev_raidz_math_generate(struct raidz_map *);
 int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *,
     const int);
diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
index 0799ed19d..4969d110b 100644
--- a/include/sys/vdev_raidz_impl.h
+++ b/include/sys/vdev_raidz_impl.h
@@ -126,7 +126,7 @@ typedef struct raidz_map {
 	uintptr_t rm_reports;		/* # of referencing checksum reports */
 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
-	raidz_impl_ops_t *rm_ops;	/* RAIDZ math operations */
+	const raidz_impl_ops_t *rm_ops;	/* RAIDZ math operations */
 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
 } raidz_map_t;
 
diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h
index 95cfddf9e..9fd9c1bd1 100644
--- a/module/icp/include/aes/aes_impl.h
+++ b/module/icp/include/aes/aes_impl.h
@@ -198,12 +198,12 @@ extern const aes_impl_ops_t aes_aesni_impl;
 /*
  * Initializes fastest implementation
 */
-void aes_impl_init(void);
+void aes_impl_init(void *arg);
 
 /*
- * Get selected aes implementation
+ * Returns optimal allowed AES implementation
 */
-struct aes_impl_ops *aes_impl_get_ops(void);
+const struct aes_impl_ops *aes_impl_get_ops(void);
 
 #ifdef __cplusplus
 }
diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h
index cbb904c05..138090487 100644
--- a/module/icp/include/modes/gcm_impl.h
+++ b/module/icp/include/modes/gcm_impl.h
@@ -61,12 +61,12 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl;
 /*
  * Initializes fastest implementation
 */
-void gcm_impl_init(void);
+void gcm_impl_init(void *arg);
 
 /*
- * Get selected aes implementation
+ * Returns optimal allowed GCM implementation
 */
-struct gcm_impl_ops *gcm_impl_get_ops(void);
+const struct gcm_impl_ops *gcm_impl_get_ops(void);
 
 #ifdef __cplusplus
 }
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 9c2cf9501..815462443 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -107,6 +107,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_raidz.h>
 #include <sys/vdev_trim.h>
 #include <sys/spa_impl.h>
 #include <sys/metaslab_impl.h>
@@ -7094,6 +7095,8 @@ ztest_run(ztest_shared_t *zs)
 	metaslab_preload_limit = ztest_random(20) + 1;
 	ztest_spa = spa;
 
+	VERIFY0(vdev_raidz_impl_set("cycle"));
+
 	dmu_objset_stats_t dds;
 	VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
 	    DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os));
diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c
index e15050635..457b9e45c 100644
--- a/module/icp/algs/aes/aes_impl.c
+++ b/module/icp/algs/aes/aes_impl.c
@@ -27,6 +27,7 @@
 #include <sys/crypto/spi.h>
 #include <modes/modes.h>
 #include <aes/aes_impl.h>
+#include <linux/simd.h>
 
 /*
  * Initialize AES encryption and decryption key schedules.
@@ -40,9 +41,9 @@
 void
 aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
 {
-	aes_impl_ops_t	*ops = aes_impl_get_ops();
-	aes_key_t	*newbie = keysched;
-	uint_t		keysize, i, j;
+	const aes_impl_ops_t *ops = aes_impl_get_ops();
+	aes_key_t *newbie = keysched;
+	uint_t keysize, i, j;
 	union {
 		uint64_t	ka64[4];
 		uint32_t	ka32[8];
@@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0;
 static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)];
 
 /*
- * Selects the aes operations for encrypt/decrypt/key setup
+ * Returns the AES operations for encrypt/decrypt/key setup.  When a
+ * SIMD implementation is not allowed in the current context, then
+ * fallback to the fastest generic implementation.
 */
-aes_impl_ops_t *
-aes_impl_get_ops()
+const aes_impl_ops_t *
+aes_impl_get_ops(void)
 {
-	aes_impl_ops_t *ops = NULL;
+	if (!kfpu_allowed())
+		return (&aes_generic_impl);
+
+	const aes_impl_ops_t *ops = NULL;
 	const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
 
 	switch (impl) {
@@ -266,15 +272,13 @@ aes_impl_get_ops()
 		ops = &aes_fastest_impl;
 		break;
 	case IMPL_CYCLE:
-	{
+		/* Cycle through supported implementations */
 		ASSERT(aes_impl_initialized);
 		ASSERT3U(aes_supp_impl_cnt, >, 0);
-		/* Cycle through supported implementations */
 		static size_t cycle_impl_idx = 0;
 		size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
 		ops = aes_supp_impl[idx];
-	}
-	break;
+		break;
 	default:
 		ASSERT3U(impl, <, aes_supp_impl_cnt);
 		ASSERT3U(aes_supp_impl_cnt, >, 0);
@@ -288,13 +292,17 @@ aes_impl_get_ops()
 	return (ops);
 }
 
+/*
+ * Initialize all supported implementations.
+ */
+/* ARGSUSED */
 void
-aes_impl_init(void)
+aes_impl_init(void *arg)
 {
 	aes_impl_ops_t *curr_impl;
 	int i, c;
 
-	/* move supported impl into aes_supp_impls */
+	/* Move supported implementations into aes_supp_impls */
 	for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) {
 		curr_impl = (aes_impl_ops_t *)aes_all_impl[i];
 
diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c
index 97f7c3a47..222c176aa 100644
--- a/module/icp/algs/aes/aes_impl_aesni.c
+++ b/module/icp/algs/aes/aes_impl_aesni.c
@@ -108,7 +108,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
 static boolean_t
 aes_aesni_will_work(void)
 {
-	return (zfs_aes_available());
+	return (kfpu_allowed() && zfs_aes_available());
 }
 
 const aes_impl_ops_t aes_aesni_impl = {
diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c
index 13bceef0f..f6f8434de 100644
--- a/module/icp/algs/modes/gcm.c
+++ b/module/icp/algs/modes/gcm.c
@@ -29,6 +29,7 @@
 #include <sys/crypto/impl.h>
 #include <sys/byteorder.h>
 #include <modes/gcm_impl.h>
+#include <linux/simd.h>
 
 #define	GHASH(c, d, t, o) \
 	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
@@ -46,7 +47,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	size_t remainder = length;
 	size_t need = 0;
 	uint8_t *datap = (uint8_t *)data;
@@ -168,7 +169,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
 	uint8_t *ghash, *macp = NULL;
 	int i, rv;
@@ -320,7 +321,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	size_t pt_len;
 	size_t remainder;
 	uint8_t *ghash;
@@ -427,7 +428,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	uint8_t *cb;
 	ulong_t remainder = iv_len;
 	ulong_t processed = 0;
@@ -481,7 +482,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	uint8_t *ghash, *datap, *authp;
 	size_t remainder, processed;
 
@@ -660,12 +661,17 @@ static size_t gcm_supp_impl_cnt = 0;
 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
 
 /*
- * Selects the gcm operation
+ * Returns the GCM operations for encrypt/decrypt/key setup.  When a
+ * SIMD implementation is not allowed in the current context, then
+ * fallback to the fastest generic implementation.
 */
-gcm_impl_ops_t *
+const gcm_impl_ops_t *
 gcm_impl_get_ops()
 {
-	gcm_impl_ops_t *ops = NULL;
+	if (!kfpu_allowed())
+		return (&gcm_generic_impl);
+
+	const gcm_impl_ops_t *ops = NULL;
 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
 
 	switch (impl) {
@@ -674,15 +680,13 @@ gcm_impl_get_ops()
 		ops = &gcm_fastest_impl;
 		break;
 	case IMPL_CYCLE:
-	{
+		/* Cycle through supported implementations */
 		ASSERT(gcm_impl_initialized);
 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
-		/* Cycle through supported implementations */
 		static size_t cycle_impl_idx = 0;
 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
 		ops = gcm_supp_impl[idx];
-	}
-	break;
+		break;
 	default:
 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
@@ -696,13 +700,17 @@ gcm_impl_get_ops()
 	return (ops);
 }
 
+/*
+ * Initialize all supported implementations.
+ */
+/* ARGSUSED */
 void
-gcm_impl_init(void)
+gcm_impl_init(void *arg)
 {
 	gcm_impl_ops_t *curr_impl;
 	int i, c;
 
-	/* move supported impl into aes_supp_impls */
+	/* Move supported implementations into gcm_supp_impls */
 	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
 		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
 
@@ -711,7 +719,10 @@ gcm_impl_init(void)
 	}
 	gcm_supp_impl_cnt = c;
 
-	/* set fastest implementation. assume hardware accelerated is fastest */
+	/*
+	 * Set the fastest implementation given the assumption that the
+	 * hardware accelerated version is the fastest.
+	 */
 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
 	if (gcm_pclmulqdq_impl.is_supported())
 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c
index be00ba37b..8a43ba33a 100644
--- a/module/icp/algs/modes/gcm_pclmulqdq.c
+++ b/module/icp/algs/modes/gcm_pclmulqdq.c
@@ -52,7 +52,7 @@ gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
 static boolean_t
 gcm_pclmulqdq_will_work(void)
 {
-	return (zfs_pclmulqdq_available());
+	return (kfpu_allowed() && zfs_pclmulqdq_available());
 }
 
 const gcm_impl_ops_t gcm_pclmulqdq_impl = {
diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c
index 53b193693..51538bc60 100644
--- a/module/icp/io/aes.c
+++ b/module/icp/io/aes.c
@@ -206,9 +206,35 @@ aes_mod_init(void)
 {
 	int ret;
 
-	/* find fastest implementations and set any requested implementations */
-	aes_impl_init();
-	gcm_impl_init();
+#if defined(_KERNEL)
+	/*
+	 * Determine the fastest available implementation.  The benchmarks
+	 * are run in dedicated kernel threads to allow Linux 5.0+ kernels
+	 * to use SIMD operations.  If for some reason this isn't possible,
+	 * fallback to the generic implementations.  See the comment in
+	 * include/linux/simd_x86.h for additional details.  Additionally,
+	 * this has the benefit of allowing them to be run in parallel.
+	 */
+	taskqid_t aes_id = taskq_dispatch(system_taskq, aes_impl_init,
+	    NULL, TQ_SLEEP);
+	taskqid_t gcm_id = taskq_dispatch(system_taskq, gcm_impl_init,
+	    NULL, TQ_SLEEP);
+
+	if (aes_id != TASKQID_INVALID) {
+		taskq_wait_id(system_taskq, aes_id);
+	} else {
+		aes_impl_init(NULL);
+	}
+
+	if (gcm_id != TASKQID_INVALID) {
+		taskq_wait_id(system_taskq, gcm_id);
+	} else {
+		gcm_impl_init(NULL);
+	}
+#else
+	aes_impl_init(NULL);
+	gcm_impl_init(NULL);
+#endif
 
 	if ((ret = mod_install(&modlinkage)) != 0)
 		return (ret);
diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c
index 7684257be..de0e45190 100644
--- a/module/spl/spl-taskq.c
+++ b/module/spl/spl-taskq.c
@@ -27,6 +27,7 @@
 #include <sys/taskq.h>
 #include <sys/kmem.h>
 #include <sys/tsd.h>
+#include <linux/simd.h>
 
 int spl_taskq_thread_bind = 0;
 module_param(spl_taskq_thread_bind, int, 0644);
@@ -869,6 +870,7 @@ taskq_thread(void *args)
 	sigfillset(&blocked);
 	sigprocmask(SIG_BLOCK, &blocked, NULL);
 	flush_signals(current);
+	kfpu_initialize();
 
 	tsd_set(taskq_tsd, tq);
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c
index d441ad65f..c4977bcf2 100644
--- a/module/spl/spl-thread.c
+++ b/module/spl/spl-thread.c
@@ -27,6 +27,7 @@
 #include <sys/thread.h>
 #include <sys/kmem.h>
 #include <sys/tsd.h>
+#include <linux/simd.h>
 
 /*
  * Thread interfaces
@@ -54,6 +55,7 @@ thread_generic_wrapper(void *arg)
 	args = tp->tp_args;
 	set_current_state(tp->tp_state);
 	set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
+	kfpu_initialize();
 	kmem_free(tp->tp_name, tp->tp_name_size);
 	kmem_free(tp, sizeof (thread_priv_t));
 
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
index 5a991ba60..b75d8ab00 100644
--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@@ -140,6 +140,7 @@
 #include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <zfs_fletcher.h>
+#include <linux/simd.h>
 
 #define	FLETCHER_MIN_SIMD_SIZE	64
 
@@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector {
 	const char	*fis_name;
 	uint32_t	fis_sel;
 } fletcher_4_impl_selectors[] = {
-#if !defined(_KERNEL)
 	{ "cycle",	IMPL_CYCLE },
-#endif
 	{ "fastest",	IMPL_FASTEST },
 	{ "scalar",	IMPL_SCALAR }
 };
 
 #if defined(_KERNEL)
 static kstat_t *fletcher_4_kstat;
-#endif
 
 static struct fletcher_4_kstat {
 	uint64_t native;
 	uint64_t byteswap;
 } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
+#endif
 
 /* Indicate that benchmark has been completed */
 static boolean_t fletcher_4_initialized = B_FALSE;
@@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val)
 	return (err);
 }
 
+/*
+ * Returns the Fletcher 4 operations for checksums.  When a SIMD
+ * implementation is not allowed in the current context, then fallback
+ * to the fastest generic implementation.
+ */
 static inline const fletcher_4_ops_t *
 fletcher_4_impl_get(void)
 {
-	fletcher_4_ops_t *ops = NULL;
-	const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+	if (!kfpu_allowed())
+		return (&fletcher_4_superscalar4_ops);
+
+	const fletcher_4_ops_t *ops = NULL;
+	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 
 	switch (impl) {
 	case IMPL_FASTEST:
 		ASSERT(fletcher_4_initialized);
 		ops = &fletcher_4_fastest_impl;
 		break;
-#if !defined(_KERNEL)
-	case IMPL_CYCLE: {
+	case IMPL_CYCLE:
+		/* Cycle through supported implementations */
 		ASSERT(fletcher_4_initialized);
 		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
-
 		static uint32_t cycle_count = 0;
 		uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
 		ops = fletcher_4_supp_impls[idx];
-	}
-	break;
-#endif
+		break;
 	default:
 		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
 		ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
-
 		ops = fletcher_4_supp_impls[impl];
 		break;
 	}
@@ -658,6 +661,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
 typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
     zio_cksum_t *);
 
+#if defined(_KERNEL)
 static void
 fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 {
@@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 	/* restore original selection */
 	atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
 }
+#endif /* _KERNEL */
 
-void
-fletcher_4_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+fletcher_4_benchmark(void *arg)
 {
-	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
 	fletcher_4_ops_t *curr_impl;
-	char *databuf;
 	int i, c;
 
-	/* move supported impl into fletcher_4_supp_impls */
+	/* Move supported implementations into fletcher_4_supp_impls */
 	for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
 		curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
 
@@ -735,19 +741,10 @@ fletcher_4_init(void)
 	membar_producer();	/* complete fletcher_4_supp_impls[] init */
 	fletcher_4_supp_impls_cnt = c;	/* number of supported impl */
 
-#if !defined(_KERNEL)
-	/* Skip benchmarking and use last implementation as fastest */
-	memcpy(&fletcher_4_fastest_impl,
-	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
-	    sizeof (fletcher_4_fastest_impl));
-	fletcher_4_fastest_impl.name = "fastest";
-	membar_producer();
+#if defined(_KERNEL)
+	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
+	char *databuf = vmem_alloc(data_size, KM_SLEEP);
 
-	fletcher_4_initialized = B_TRUE;
-	return;
-#endif
-	/* Benchmark all supported implementations */
-	databuf = vmem_alloc(data_size, KM_SLEEP);
 	for (i = 0; i < data_size / sizeof (uint64_t); i++)
 		((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
 
@@ -755,9 +752,38 @@
 	fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
 
 	vmem_free(databuf, data_size);
+#else
+	/*
+	 * Skip the benchmark in user space to avoid impacting libzpool
+	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
+	 * is assumed to be the fastest and used by default.
+	 */
+	memcpy(&fletcher_4_fastest_impl,
+	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
+	    sizeof (fletcher_4_fastest_impl));
+	fletcher_4_fastest_impl.name = "fastest";
+	membar_producer();
+#endif /* _KERNEL */
+}
 
+void
+fletcher_4_init(void)
+{
 #if defined(_KERNEL)
-	/* install kstats for all implementations */
+	/*
+	 * For 5.0 and latter Linux kernels the fletcher 4 benchmarks are
+	 * run in a kernel threads.  This is needed to take advantage of the
+	 * SIMD functionality, see include/linux/simd_x86.h for details.
+	 */
+	taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark,
+	    NULL, TQ_SLEEP);
+	if (id != TASKQID_INVALID) {
+		taskq_wait_id(system_taskq, id);
+	} else {
+		fletcher_4_benchmark(NULL);
+	}
+
+	/* Install kstats for all implementations */
 	fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 	if (fletcher_4_kstat != NULL) {
@@ -769,6 +795,8 @@ fletcher_4_init(void)
 		    fletcher_4_kstat_addr);
 		kstat_install(fletcher_4_kstat);
 	}
+#else
+	fletcher_4_benchmark(NULL);
 #endif
 
 	/* Finish initialization */
diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c
index bd2db2b20..3b3c1b52b 100644
--- a/module/zcommon/zfs_fletcher_aarch64_neon.c
+++ b/module/zcommon/zfs_fletcher_aarch64_neon.c
@@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16)));
 
 static boolean_t fletcher_4_aarch64_neon_valid(void)
 {
-	return (B_TRUE);
+	return (kfpu_allowed());
 }
 
 const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
index 7260a9864..0d4cff21a 100644
--- a/module/zcommon/zfs_fletcher_avx512.c
+++ b/module/zcommon/zfs_fletcher_avx512.c
@@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
 static boolean_t
 fletcher_4_avx512f_valid(void)
 {
-	return (zfs_avx512f_available());
+	return (kfpu_allowed() && zfs_avx512f_available());
 }
 
 const fletcher_4_ops_t fletcher_4_avx512f_ops = {
diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c
index 6dac047da..7f12efe6d 100644
--- a/module/zcommon/zfs_fletcher_intel.c
+++ b/module/zcommon/zfs_fletcher_intel.c
@@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 
 static boolean_t fletcher_4_avx2_valid(void)
 {
-	return (zfs_avx_available() && zfs_avx2_available());
+	return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
 }
 
 const fletcher_4_ops_t fletcher_4_avx2_ops = {
diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c
index a0b42e5f5..e6389d6e5 100644
--- a/module/zcommon/zfs_fletcher_sse.c
+++ b/module/zcommon/zfs_fletcher_sse.c
@@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 
 static boolean_t fletcher_4_sse2_valid(void)
 {
-	return (zfs_sse2_available());
+	return (kfpu_allowed() && zfs_sse2_available());
 }
 
 const fletcher_4_ops_t fletcher_4_sse2_ops = {
@@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 
 static boolean_t fletcher_4_ssse3_valid(void)
 {
-	return (zfs_sse2_available() && zfs_ssse3_available());
+	return (kfpu_allowed() && zfs_sse2_available() &&
+	    zfs_ssse3_available());
 }
 
 const fletcher_4_ops_t fletcher_4_ssse3_ops = {
diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c
index e6112bc02..e7a39015c 100644
--- a/module/zfs/vdev_raidz_math.c
+++ b/module/zfs/vdev_raidz_math.c
@@ -27,9 +27,9 @@
 #include <sys/zio.h>
 #include <sys/debug.h>
 #include <sys/zfs_debug.h>
-
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_raidz_impl.h>
+#include <linux/simd.h>
 
 extern boolean_t raidz_will_scalar_work(void);
 
@@ -87,6 +87,7 @@ static uint32_t user_sel_impl = IMPL_FASTEST;
 static size_t raidz_supp_impl_cnt = 0;
 static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
 
+#if defined(_KERNEL)
 /*
  * kstats values for supported implementations
  * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
@@ -95,14 +96,19 @@ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
 
 /* kstat for benchmarked implementations */
 static kstat_t *raidz_math_kstat = NULL;
+#endif
 
 /*
- * Selects the raidz operation for raidz_map
- * If rm_ops is set to NULL original raidz implementation will be used
+ * Returns the RAIDZ operations for raidz_map() parity calculations.  When
+ * a SIMD implementation is not allowed in the current context, then fallback
+ * to the fastest generic implementation.
 */
-raidz_impl_ops_t *
-vdev_raidz_math_get_ops()
+const raidz_impl_ops_t *
+vdev_raidz_math_get_ops(void)
 {
+	if (!kfpu_allowed())
+		return (&vdev_raidz_scalar_impl);
+
 	raidz_impl_ops_t *ops = NULL;
 	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
 
@@ -111,18 +117,14 @@ vdev_raidz_math_get_ops()
 		ASSERT(raidz_math_initialized);
 		ops = &vdev_raidz_fastest_impl;
 		break;
-#if !defined(_KERNEL)
 	case IMPL_CYCLE:
-	{
+		/* Cycle through all supported implementations */
 		ASSERT(raidz_math_initialized);
 		ASSERT3U(raidz_supp_impl_cnt, >, 0);
-		/* Cycle through all supported implementations */
 		static size_t cycle_impl_idx = 0;
 		size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
 		ops = raidz_supp_impl[idx];
-	}
-	break;
-#endif
+		break;
 	case IMPL_ORIGINAL:
 		ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
 		break;
@@ -273,6 +275,8 @@ const char *raidz_rec_name[] = {
 	"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
 };
 
+#if defined(_KERNEL)
+
 #define	RAIDZ_KSTAT_LINE_LEN	(17 + 10*12 + 1)
 
 static int
@@ -435,21 +439,21 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
 		}
 	}
 }
+#endif
 
-void
-vdev_raidz_math_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+benchmark_raidz(void *arg)
 {
 	raidz_impl_ops_t *curr_impl;
-	zio_t *bench_zio = NULL;
-	raidz_map_t *bench_rm = NULL;
-	uint64_t bench_parity;
-	int i, c, fn;
+	int i, c;
 
-	/* move supported impl into raidz_supp_impl */
+	/* Move supported impl into raidz_supp_impl */
 	for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
 		curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
 
-		/* initialize impl */
 		if (curr_impl->init)
 			curr_impl->init();
 
@@ -459,18 +463,10 @@ vdev_raidz_math_init(void)
 	membar_producer();		/* complete raidz_supp_impl[] init */
 	raidz_supp_impl_cnt = c;	/* number of supported impl */
 
-#if !defined(_KERNEL)
-	/* Skip benchmarking and use last implementation as fastest */
-	memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
-	    sizeof (vdev_raidz_fastest_impl));
-	strcpy(vdev_raidz_fastest_impl.name, "fastest");
-
-	raidz_math_initialized = B_TRUE;
-
-	/* Use 'cycle' math selection method for userspace */
-	VERIFY0(vdev_raidz_impl_set("cycle"));
-	return;
-#endif
+#if defined(_KERNEL)
+	zio_t *bench_zio = NULL;
+	raidz_map_t *bench_rm = NULL;
+	uint64_t bench_parity;
 
 	/* Fake an zio and run the benchmark on a warmed up buffer */
 	bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
@@ -480,7 +476,7 @@ vdev_raidz_math_init(void)
 	memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
 
 	/* Benchmark parity generation methods */
-	for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+	for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
 		bench_parity = fn + 1;
 		/* New raidz_map is needed for each generate_p/q/r */
 		bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
@@ -495,7 +491,7 @@ vdev_raidz_math_init(void)
 	bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
 	    BENCH_COLS, PARITY_PQR);
 
-	for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
+	for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
 		benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
 
 	vdev_raidz_map_free(bench_rm);
@@ -503,11 +499,39 @@ vdev_raidz_math_init(void)
 	/* cleanup the bench zio */
 	abd_free(bench_zio->io_abd);
 	kmem_free(bench_zio, sizeof (zio_t));
+#else
+	/*
+	 * Skip the benchmark in user space to avoid impacting libzpool
+	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
+	 * is assumed to be the fastest and used by default.
+	 */
+	memcpy(&vdev_raidz_fastest_impl,
+	    raidz_supp_impl[raidz_supp_impl_cnt - 1],
+	    sizeof (vdev_raidz_fastest_impl));
+	strcpy(vdev_raidz_fastest_impl.name, "fastest");
+#endif /* _KERNEL */
+}
 
-	/* install kstats for all impl */
+void
+vdev_raidz_math_init(void)
+{
+#if defined(_KERNEL)
+	/*
+	 * For 5.0 and latter Linux kernels the fletcher 4 benchmarks are
+	 * run in a kernel threads.  This is needed to take advantage of the
+	 * SIMD functionality, see include/linux/simd_x86.h for details.
+	 */
+	taskqid_t id = taskq_dispatch(system_taskq, benchmark_raidz,
+	    NULL, TQ_SLEEP);
+	if (id != TASKQID_INVALID) {
+		taskq_wait_id(system_taskq, id);
+	} else {
+		benchmark_raidz(NULL);
+	}
+
+	/* Install kstats for all implementations */
 	raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
-
 	if (raidz_math_kstat != NULL) {
 		raidz_math_kstat->ks_data = NULL;
 		raidz_math_kstat->ks_ndata = UINT32_MAX;
@@ -517,6 +541,9 @@ vdev_raidz_math_init(void)
 		    raidz_math_kstat_addr);
 		kstat_install(raidz_math_kstat);
 	}
+#else
+	benchmark_raidz(NULL);
+#endif
 
 	/* Finish initialization */
 	atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
@@ -527,15 +554,15 @@ void
 vdev_raidz_math_fini(void)
 {
 	raidz_impl_ops_t const *curr_impl;
-	int i;
 
+#if defined(_KERNEL)
 	if (raidz_math_kstat != NULL) {
 		kstat_delete(raidz_math_kstat);
 		raidz_math_kstat = NULL;
 	}
+#endif
 
-	/* fini impl */
-	for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+	for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
 		curr_impl = raidz_all_maths[i];
 		if (curr_impl->fini)
 			curr_impl->fini();
@@ -546,9 +573,7 @@ static const struct {
 	char *name;
 	uint32_t sel;
 } math_impl_opts[] = {
-#if !defined(_KERNEL)
 	{ "cycle", IMPL_CYCLE },
-#endif
 	{ "fastest", IMPL_FASTEST },
 	{ "original", IMPL_ORIGINAL },
 	{ "scalar", IMPL_SCALAR }
diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c
index e3ad06776..0a67ceb84 100644
--- a/module/zfs/vdev_raidz_math_aarch64_neon.c
+++ b/module/zfs/vdev_raidz_math_aarch64_neon.c
@@ -207,7 +207,7 @@ DEFINE_REC_METHODS(aarch64_neon);
 static boolean_t
 raidz_will_aarch64_neon_work(void)
 {
-	return (B_TRUE); // __arch64__ requires NEON
+	return (kfpu_allowed());
 }
 
 const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = {
diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
index f8688a06a..e072f51cd 100644
--- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c
+++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
@@ -217,7 +217,7 @@ DEFINE_REC_METHODS(aarch64_neonx2);
 static boolean_t
 raidz_will_aarch64_neonx2_work(void)
 {
-	return (B_TRUE); // __arch64__ requires NEON
+	return (kfpu_allowed());
 }
 
 const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = {
diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c
index 063d29bcd..a12eb6720 100644
--- a/module/zfs/vdev_raidz_math_avx2.c
+++ b/module/zfs/vdev_raidz_math_avx2.c
@@ -396,7 +396,7 @@ DEFINE_REC_METHODS(avx2);
 static boolean_t
 raidz_will_avx2_work(void)
 {
-	return (zfs_avx_available() && zfs_avx2_available());
+	return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_avx2_impl = {
diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c
index d605653db..2f545c9ec 100644
--- a/module/zfs/vdev_raidz_math_avx512bw.c
+++ b/module/zfs/vdev_raidz_math_avx512bw.c
@@ -393,9 +393,8 @@ DEFINE_REC_METHODS(avx512bw);
 static boolean_t
 raidz_will_avx512bw_work(void)
 {
-	return (zfs_avx_available() &&
-	    zfs_avx512f_available() &&
-	    zfs_avx512bw_available());
+	return (kfpu_allowed() && zfs_avx_available() &&
+	    zfs_avx512f_available() && zfs_avx512bw_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {
diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c
index f4e4560ce..75af7a8ee 100644
--- a/module/zfs/vdev_raidz_math_avx512f.c
+++ b/module/zfs/vdev_raidz_math_avx512f.c
@@ -470,9 +470,8 @@ DEFINE_REC_METHODS(avx512f);
 static boolean_t
 raidz_will_avx512f_work(void)
 {
-	return (zfs_avx_available() &&
-	    zfs_avx2_available() &&
-	    zfs_avx512f_available());
+	return (kfpu_allowed() && zfs_avx_available() &&
+	    zfs_avx2_available() && zfs_avx512f_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_avx512f_impl = {
diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c
index 9985da273..5b3a9385c 100644
--- a/module/zfs/vdev_raidz_math_sse2.c
+++ b/module/zfs/vdev_raidz_math_sse2.c
@@ -607,7 +607,7 @@ DEFINE_REC_METHODS(sse2);
 static boolean_t
 raidz_will_sse2_work(void)
 {
-	return (zfs_sse_available() && zfs_sse2_available());
+	return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_sse2_impl = {
diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c
index 047a48d54..62247cf8e 100644
--- a/module/zfs/vdev_raidz_math_ssse3.c
+++ b/module/zfs/vdev_raidz_math_ssse3.c
@@ -399,8 +399,8 @@ DEFINE_REC_METHODS(ssse3);
 static boolean_t
 raidz_will_ssse3_work(void)
 {
-	return (zfs_sse_available() && zfs_sse2_available() &&
-	    zfs_ssse3_available());
+	return (kfpu_allowed() && zfs_sse_available() &&
+	    zfs_sse2_available() && zfs_ssse3_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_ssse3_impl = {
diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4
index 5fff79a74..31bf35f83 100644
--- a/config/kernel-fpu.m4
+++ b/config/kernel-fpu.m4
@@ -2,8 +2,15 @@ dnl #
 dnl # Handle differences in kernel FPU code.
 dnl #
 dnl # Kernel
-dnl # 5.0:	All kernel fpu functions are GPL only, so we can't use them.
-dnl #		(nothing defined)
+dnl # 5.2:	The fpu->initialized flag was replaced by TIF_NEED_FPU_LOAD.
+dnl #		HAVE_KERNEL_TIF_NEED_FPU_LOAD
+dnl #
+dnl # 5.0:	As an optimization SIMD operations performed by kernel
+dnl #		threads can skip saving and restoring their FPU context.
+dnl #		Wrappers have been introduced to determine the running
+dnl #		context and use either the SIMD or generic implementation.
+dnl #		This change was made to the 4.19.38 and 4.14.120 LTS kernels.
+dnl #		HAVE_KERNEL_FPU_INITIALIZED
 dnl #
 dnl # 4.2:	Use __kernel_fpu_{begin,end}()
 dnl #		HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU
@@ -56,10 +63,39 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
 		__kernel_fpu_end();
 	], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [
 		AC_MSG_RESULT(__kernel_fpu_*)
-		AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, [kernel has __kernel_fpu_* functions])
-		AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions])
+		AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1,
+		    [kernel has __kernel_fpu_* functions])
+		AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
+		    [kernel exports FPU functions])
 	],[
-		AC_MSG_RESULT(not exported)
+		ZFS_LINUX_TRY_COMPILE([
+			#include <linux/module.h>
+			#include <linux/sched.h>
+		],[
+			struct fpu *fpu = &current->thread.fpu;
+			if (fpu->initialized) { return (0); };
+		],[
+			AC_MSG_RESULT(fpu.initialized)
+			AC_DEFINE(HAVE_KERNEL_FPU_INITIALIZED, 1,
+			    [kernel fpu.initialized exists])
+		],[
+			ZFS_LINUX_TRY_COMPILE([
+				#include <linux/module.h>
+				#include <asm/thread_info.h>
+
+				#if !defined(TIF_NEED_FPU_LOAD)
+				#error "TIF_NEED_FPU_LOAD undefined"
+				#endif
+			],[
+			],[
+				AC_MSG_RESULT(TIF_NEED_FPU_LOAD)
+				AC_DEFINE(
+				    HAVE_KERNEL_TIF_NEED_FPU_LOAD, 1,
+				    [kernel TIF_NEED_FPU_LOAD exists])
+			],[
+				AC_MSG_RESULT(unavailable)
+			])
+		])
 	])
 ])