From 62c034f6d45df04fc81d6c7ca5bd884e17bfee19 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 12 Jul 2019 09:31:20 -0700 Subject: [PATCH] Linux 5.0 compat: SIMD compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS, and 5.0 and newer kernels. This commit squashes the following commits from master in to a single commit which can be applied to 0.8.2. 10fa2545 - Linux 4.14, 4.19, 5.0+ compat: SIMD save/restore b88ca2ac - Enable SIMD for encryption 095b5412 - Fix CONFIG_X86_DEBUG_FPU build failure e5db3134 - Linux 5.0 compat: SIMD compatibility Reviewed-by: Fabian Grünbichler Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf TEST_ZIMPORT_SKIP="yes" --- cmd/ztest/ztest.c | 3 + config/kernel-fpu.m4 | 77 ++++- include/linux/Makefile.am | 1 + include/linux/simd.h | 42 +++ include/linux/simd_aarch64.h | 23 +- include/linux/simd_x86.h | 303 +++++++++++++++----- include/sys/vdev_raidz.h | 2 +- include/sys/vdev_raidz_impl.h | 2 +- module/icp/algs/aes/aes_impl.c | 31 +- module/icp/algs/aes/aes_impl_aesni.c | 2 +- module/icp/algs/modes/gcm.c | 38 ++- module/icp/algs/modes/gcm_pclmulqdq.c | 2 +- module/icp/include/aes/aes_impl.h | 4 +- module/icp/include/modes/gcm_impl.h | 4 +- module/icp/io/aes.c | 2 +- module/zcommon/zfs_fletcher.c | 76 +++-- module/zcommon/zfs_fletcher_aarch64_neon.c | 2 +- module/zcommon/zfs_fletcher_avx512.c | 2 +- module/zcommon/zfs_fletcher_intel.c | 2 +- module/zcommon/zfs_fletcher_sse.c | 5 +- module/zcommon/zfs_prop.c | 14 + module/zfs/vdev_raidz_math.c | 93 +++--- module/zfs/vdev_raidz_math_aarch64_neon.c | 2 +- module/zfs/vdev_raidz_math_aarch64_neonx2.c | 2 +- module/zfs/vdev_raidz_math_avx2.c | 2 +- module/zfs/vdev_raidz_math_avx512bw.c | 5 +- module/zfs/vdev_raidz_math_avx512f.c | 5 +- module/zfs/vdev_raidz_math_sse2.c | 2 +- module/zfs/vdev_raidz_math_ssse3.c | 4 +- module/zfs/zio_crypt.c | 2 +- 30 files changed, 548 insertions(+), 206 deletions(-) create mode 100644 include/linux/simd.h diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index e83654a32..8fe412672 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -107,6 +107,7 @@ #include #include #include +#include #include #include #include @@ -7110,6 +7111,8 @@ ztest_run(ztest_shared_t *zs) metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; + VERIFY0(vdev_raidz_impl_set("cycle")); + dmu_objset_stats_t dds; VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4 index ebb02fb09..49316aab4 100644 --- a/config/kernel-fpu.m4 +++ b/config/kernel-fpu.m4 @@ -2,8 +2,9 @@ dnl # dnl # Handle differences in kernel FPU code. dnl # dnl # Kernel -dnl # 5.0: All kernel fpu functions are GPL only, so we can't use them. -dnl # (nothing defined) +dnl # 5.0: Wrappers have been introduced to save/restore the FPU state. +dnl # This change was made to the 4.19.38 and 4.14.120 LTS kernels. +dnl # HAVE_KERNEL_FPU_INTERNAL dnl # dnl # 4.2: Use __kernel_fpu_{begin,end}() dnl # HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU @@ -12,7 +13,11 @@ dnl # Pre-4.2: Use kernel_fpu_{begin,end}() dnl # HAVE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU dnl # AC_DEFUN([ZFS_AC_KERNEL_FPU], [ - AC_MSG_CHECKING([which kernel_fpu header to use]) + dnl # + dnl # N.B. The header check is performed before all other checks since + dnl # it depends on HAVE_KERNEL_FPU_API_HEADER being set in confdefs.h. 
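+	dnl # (confdefs.h accumulates the AC_DEFINE results of completed
+	dnl # checks, so later compile tests can already rely on them.)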
+ dnl # + AC_MSG_CHECKING([whether fpu headers are available]) ZFS_LINUX_TRY_COMPILE([ #include #include @@ -25,9 +30,13 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [ AC_MSG_RESULT(i387.h & xcr.h) ]) - AC_MSG_CHECKING([which kernel_fpu function to use]) + dnl # + dnl # Legacy kernel + dnl # + AC_MSG_CHECKING([whether kernel fpu is available]) ZFS_LINUX_TRY_COMPILE_SYMBOL([ #include + #include #ifdef HAVE_KERNEL_FPU_API_HEADER #include #else @@ -45,8 +54,12 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [ AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions]) ],[ + dnl # + dnl # Linux 4.2 kernel + dnl # ZFS_LINUX_TRY_COMPILE_SYMBOL([ #include + #include #ifdef HAVE_KERNEL_FPU_API_HEADER #include #else @@ -57,12 +70,60 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [ ],[ __kernel_fpu_begin(); __kernel_fpu_end(); - ], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [ + ], [__kernel_fpu_begin], + [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [ AC_MSG_RESULT(__kernel_fpu_*) - AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, [kernel has __kernel_fpu_* functions]) - AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions]) + AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, + [kernel has __kernel_fpu_* functions]) + AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, + [kernel exports FPU functions]) ],[ - AC_MSG_RESULT(not exported) + ZFS_LINUX_TRY_COMPILE([ + #include + + #if defined(__x86_64) || defined(__x86_64__) || \ + defined(__i386) || defined(__i386__) + #if !defined(__x86) + #define __x86 + #endif + #endif + + #if !defined(__x86) + #error Unsupported architecture + #endif + + #include + #ifdef HAVE_KERNEL_FPU_API_HEADER + #include + #include + #else + #include + #include + #endif + + #if !defined(XSTATE_XSAVE) + #error XSTATE_XSAVE not defined + #endif + + #if !defined(XSTATE_XRESTORE) + #error XSTATE_XRESTORE not defined + #endif + ],[ + struct fpu *fpu = ¤t->thread.fpu; + union fpregs_state *st = &fpu->state; + struct fregs_state *fr __attribute__ ((unused)) = + &st->fsave; + struct fxregs_state *fxr __attribute__ ((unused)) = + &st->fxsave; + struct xregs_state *xr __attribute__ ((unused)) = + &st->xsave; + ], [ + AC_MSG_RESULT(internal) + AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1, + [kernel fpu internal]) + ],[ + AC_MSG_RESULT(unavailable) + ]) ]) ]) ]) diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am index efb49520e..2455759e8 100644 --- a/include/linux/Makefile.am +++ b/include/linux/Makefile.am @@ -7,6 +7,7 @@ KERNEL_H = \ $(top_srcdir)/include/linux/blkdev_compat.h \ $(top_srcdir)/include/linux/utsname_compat.h \ $(top_srcdir)/include/linux/kmap_compat.h \ + $(top_srcdir)/include/linux/simd.h \ $(top_srcdir)/include/linux/simd_x86.h \ $(top_srcdir)/include/linux/simd_aarch64.h \ $(top_srcdir)/include/linux/mod_compat.h \ diff --git a/include/linux/simd.h b/include/linux/simd.h new file mode 100644 index 000000000..bb5f0f02a --- /dev/null +++ b/include/linux/simd.h @@ -0,0 +1,42 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2019 Lawrence Livermore National Security, LLC. + */ + +#ifndef _SIMD_H +#define _SIMD_H + +#if defined(__x86) +#include + +#elif defined(__aarch64__) +#include +#else + +#define kfpu_allowed() 0 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + +#endif +#endif /* _SIMD_H */ diff --git a/include/linux/simd_aarch64.h b/include/linux/simd_aarch64.h index 56153a160..7ba308d15 100644 --- a/include/linux/simd_aarch64.h +++ b/include/linux/simd_aarch64.h @@ -27,9 +27,10 @@ * * Kernel fpu methods: * kfpu_allowed() - * kfpu_initialize() * kfpu_begin() * kfpu_end() + * kfpu_init() + * kfpu_fini() */ #ifndef _SIMD_AARCH64_H @@ -43,20 +44,20 @@ #if defined(_KERNEL) #include -#define kfpu_begin() \ -{ \ - kernel_neon_begin(); \ -} -#define kfpu_end() \ -{ \ - kernel_neon_end(); \ -} +#define kfpu_allowed() 1 +#define kfpu_begin() kernel_neon_begin() +#define kfpu_end() kernel_neon_end() +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) #else /* * fpu dummy methods for userspace */ -#define kfpu_begin() do {} while (0) -#define kfpu_end() do {} while (0) +#define kfpu_allowed() 1 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) #endif /* defined(_KERNEL) */ #endif /* __aarch64__ */ diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h index 0489bfaa3..69dbd5579 100644 --- a/include/linux/simd_x86.h +++ b/include/linux/simd_x86.h @@ -27,9 +27,10 @@ * * Kernel fpu methods: * kfpu_allowed() - * kfpu_initialize() * kfpu_begin() * kfpu_end() + * kfpu_init() + * kfpu_fini() * * SIMD support: * @@ -84,6 +85,15 @@ #if defined(_KERNEL) +/* + * Disable the WARN_ON_FPU() macro to prevent additional dependencies + * when providing the kfpu_* functions. Relevant warnings are included + * as appropriate and are unconditionally enabled. + */ +#if defined(CONFIG_X86_DEBUG_FPU) && !defined(KERNEL_EXPORTS_X86_FPU) +#undef CONFIG_X86_DEBUG_FPU +#endif + #if defined(HAVE_KERNEL_FPU_API_HEADER) #include #include @@ -92,33 +102,231 @@ #include #endif +/* + * The following cases are for kernels which export either the + * kernel_fpu_* or __kernel_fpu_* functions. + */ +#if defined(KERNEL_EXPORTS_X86_FPU) + +#define kfpu_allowed() 1 +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + #if defined(HAVE_UNDERSCORE_KERNEL_FPU) #define kfpu_begin() \ -{ \ - preempt_disable(); \ +{ \ + preempt_disable(); \ __kernel_fpu_begin(); \ } -#define kfpu_end() \ -{ \ - __kernel_fpu_end(); \ - preempt_enable(); \ +#define kfpu_end() \ +{ \ + __kernel_fpu_end(); \ + preempt_enable(); \ } + #elif defined(HAVE_KERNEL_FPU) -#define kfpu_begin() kernel_fpu_begin() +#define kfpu_begin() kernel_fpu_begin() #define kfpu_end() kernel_fpu_end() -#else -/* Kernel doesn't export any kernel_fpu_* functions */ -#include /* For kernel xgetbv() */ -#define kfpu_begin() panic("This code should never run") -#define kfpu_end() panic("This code should never run") -#endif /* defined(HAVE_KERNEL_FPU) */ #else /* - * fpu dummy methods for userspace + * This case is unreachable. When KERNEL_EXPORTS_X86_FPU is defined then + * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined. 
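+ * The configure checks in config/kernel-fpu.m4 only define
+ * KERNEL_EXPORTS_X86_FPU together with one of those two symbols.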
*/ -#define kfpu_begin() do {} while (0) -#define kfpu_end() do {} while (0) +#error "Unreachable kernel configuration" +#endif + +#else /* defined(KERNEL_EXPORTS_X86_FPU) */ + +/* + * When the kernel_fpu_* symbols are unavailable then provide our own + * versions which allow the FPU to be safely used. + */ +#if defined(HAVE_KERNEL_FPU_INTERNAL) + +extern union fpregs_state **zfs_kfpu_fpregs; + +/* + * Initialize per-cpu variables to store FPU state. + */ +static inline void +kfpu_fini(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (zfs_kfpu_fpregs[cpu] != NULL) { + kfree(zfs_kfpu_fpregs[cpu]); + } + } + + kfree(zfs_kfpu_fpregs); +} + +static inline int +kfpu_init(void) +{ + int cpu; + + zfs_kfpu_fpregs = kzalloc(num_possible_cpus() * + sizeof (union fpregs_state *), GFP_KERNEL); + if (zfs_kfpu_fpregs == NULL) + return (-ENOMEM); + + for_each_possible_cpu(cpu) { + zfs_kfpu_fpregs[cpu] = kmalloc_node(sizeof (union fpregs_state), + GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); + if (zfs_kfpu_fpregs[cpu] == NULL) { + kfpu_fini(); + return (-ENOMEM); + } + } + + return (0); +} + +#define kfpu_allowed() 1 +#define ex_handler_fprestore ex_handler_default + +/* + * FPU save and restore instructions. + */ +#define __asm __asm__ __volatile__ +#define kfpu_fxsave(addr) __asm("fxsave %0" : "=m" (*(addr))) +#define kfpu_fxsaveq(addr) __asm("fxsaveq %0" : "=m" (*(addr))) +#define kfpu_fnsave(addr) __asm("fnsave %0; fwait" : "=m" (*(addr))) +#define kfpu_fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) +#define kfpu_fxrstorq(addr) __asm("fxrstorq %0" : : "m" (*(addr))) +#define kfpu_frstor(addr) __asm("frstor %0" : : "m" (*(addr))) +#define kfpu_fxsr_clean(rval) __asm("fnclex; emms; fildl %P[addr]" \ + : : [addr] "m" (rval)); + +static inline void +kfpu_save_xsave(struct xregs_state *addr, uint64_t mask) +{ + uint32_t low, hi; + int err; + + low = mask; + hi = mask >> 32; + XSTATE_XSAVE(addr, low, hi, err); + WARN_ON_ONCE(err); +} + +static inline void +kfpu_save_fxsr(struct fxregs_state *addr) +{ + if (IS_ENABLED(CONFIG_X86_32)) + kfpu_fxsave(addr); + else + kfpu_fxsaveq(addr); +} + +static inline void +kfpu_save_fsave(struct fregs_state *addr) +{ + kfpu_fnsave(addr); +} + +static inline void +kfpu_begin(void) +{ + /* + * Preemption and interrupts must be disabled for the critical + * region where the FPU state is being modified. + */ + preempt_disable(); + local_irq_disable(); + + /* + * The current FPU registers need to be preserved by kfpu_begin() + * and restored by kfpu_end(). They are stored in a dedicated + * per-cpu variable, not in the task struct, this allows any user + * FPU state to be correctly preserved and restored. + */ + union fpregs_state *state = zfs_kfpu_fpregs[smp_processor_id()]; + + if (static_cpu_has(X86_FEATURE_XSAVE)) { + kfpu_save_xsave(&state->xsave, ~0); + } else if (static_cpu_has(X86_FEATURE_FXSR)) { + kfpu_save_fxsr(&state->fxsave); + } else { + kfpu_save_fsave(&state->fsave); + } +} + +static inline void +kfpu_restore_xsave(struct xregs_state *addr, uint64_t mask) +{ + uint32_t low, hi; + + low = mask; + hi = mask >> 32; + XSTATE_XRESTORE(addr, low, hi); +} + +static inline void +kfpu_restore_fxsr(struct fxregs_state *addr) +{ + /* + * On AuthenticAMD K7 and K8 processors the fxrstor instruction only + * restores the _x87 FOP, FIP, and FDP registers when an exception + * is pending. Clean the _x87 state to force the restore. 
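+	 * The fnclex/emms/fildl sequence in kfpu_fxsr_clean() discards the
+	 * pending exception state so fxrstor reloads all three registers.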
+ */ + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) + kfpu_fxsr_clean(addr); + + if (IS_ENABLED(CONFIG_X86_32)) { + kfpu_fxrstor(addr); + } else { + kfpu_fxrstorq(addr); + } +} + +static inline void +kfpu_restore_fsave(struct fregs_state *addr) +{ + kfpu_frstor(addr); +} + +static inline void +kfpu_end(void) +{ + union fpregs_state *state = zfs_kfpu_fpregs[smp_processor_id()]; + + if (static_cpu_has(X86_FEATURE_XSAVE)) { + kfpu_restore_xsave(&state->xsave, ~0); + } else if (static_cpu_has(X86_FEATURE_FXSR)) { + kfpu_restore_fxsr(&state->fxsave); + } else { + kfpu_restore_fsave(&state->fsave); + } + + local_irq_enable(); + preempt_enable(); +} + +#else + +/* + * FPU support is unavailable. + */ +#define kfpu_allowed() 0 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + +#endif /* defined(HAVE_KERNEL_FPU_INTERNAL) */ +#endif /* defined(KERNEL_EXPORTS_X86_FPU) */ + +#else /* defined(_KERNEL) */ +/* + * FPU dummy methods for user space. + */ +#define kfpu_allowed() 1 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) #endif /* defined(_KERNEL) */ /* @@ -289,7 +497,6 @@ CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ); #endif /* !defined(_KERNEL) */ - /* * Detect register set support */ @@ -300,7 +507,7 @@ __simd_state_enabled(const uint64_t state) uint64_t xcr0; #if defined(_KERNEL) -#if defined(X86_FEATURE_OSXSAVE) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_OSXSAVE) has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE); #else has_osxsave = B_FALSE; @@ -330,11 +537,7 @@ static inline boolean_t zfs_sse_available(void) { #if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) return (!!boot_cpu_has(X86_FEATURE_XMM)); -#else - return (B_FALSE); -#endif #elif !defined(_KERNEL) return (__cpuid_has_sse()); #endif @@ -347,11 +550,7 @@ static inline boolean_t zfs_sse2_available(void) { #if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) return (!!boot_cpu_has(X86_FEATURE_XMM2)); -#else - return (B_FALSE); -#endif #elif !defined(_KERNEL) return (__cpuid_has_sse2()); #endif @@ -364,11 +563,7 @@ static inline boolean_t zfs_sse3_available(void) { #if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) return (!!boot_cpu_has(X86_FEATURE_XMM3)); -#else - return (B_FALSE); -#endif #elif !defined(_KERNEL) return (__cpuid_has_sse3()); #endif @@ -381,11 +576,7 @@ static inline boolean_t zfs_ssse3_available(void) { #if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) return (!!boot_cpu_has(X86_FEATURE_SSSE3)); -#else - return (B_FALSE); -#endif #elif !defined(_KERNEL) return (__cpuid_has_ssse3()); #endif @@ -398,11 +589,7 @@ static inline boolean_t zfs_sse4_1_available(void) { #if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) return (!!boot_cpu_has(X86_FEATURE_XMM4_1)); -#else - return (B_FALSE); -#endif #elif !defined(_KERNEL) return (__cpuid_has_sse4_1()); #endif @@ -415,11 +602,7 @@ static inline boolean_t zfs_sse4_2_available(void) { #if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) return (!!boot_cpu_has(X86_FEATURE_XMM4_2)); -#else - return (B_FALSE); -#endif #elif !defined(_KERNEL) return (__cpuid_has_sse4_2()); #endif @@ -433,11 +616,7 @@ zfs_avx_available(void) { boolean_t has_avx; #if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) has_avx = !!boot_cpu_has(X86_FEATURE_AVX); -#else - has_avx = B_FALSE; -#endif #elif !defined(_KERNEL) has_avx = __cpuid_has_avx(); #endif @@ -453,11 +632,7 @@ zfs_avx2_available(void) { boolean_t has_avx2; #if 
defined(_KERNEL) -#if defined(X86_FEATURE_AVX2) && defined(KERNEL_EXPORTS_X86_FPU) has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2); -#else - has_avx2 = B_FALSE; -#endif #elif !defined(_KERNEL) has_avx2 = __cpuid_has_avx2(); #endif @@ -472,7 +647,7 @@ static inline boolean_t zfs_bmi1_available(void) { #if defined(_KERNEL) -#if defined(X86_FEATURE_BMI1) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_BMI1) return (!!boot_cpu_has(X86_FEATURE_BMI1)); #else return (B_FALSE); @@ -489,7 +664,7 @@ static inline boolean_t zfs_bmi2_available(void) { #if defined(_KERNEL) -#if defined(X86_FEATURE_BMI2) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_BMI2) return (!!boot_cpu_has(X86_FEATURE_BMI2)); #else return (B_FALSE); @@ -506,7 +681,7 @@ static inline boolean_t zfs_aes_available(void) { #if defined(_KERNEL) -#if defined(X86_FEATURE_AES) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_AES) return (!!boot_cpu_has(X86_FEATURE_AES)); #else return (B_FALSE); @@ -523,7 +698,7 @@ static inline boolean_t zfs_pclmulqdq_available(void) { #if defined(_KERNEL) -#if defined(X86_FEATURE_PCLMULQDQ) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_PCLMULQDQ) return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ)); #else return (B_FALSE); @@ -557,7 +732,7 @@ zfs_avx512f_available(void) boolean_t has_avx512 = B_FALSE; #if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512F) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_AVX512F) has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F); #else has_avx512 = B_FALSE; @@ -576,7 +751,7 @@ zfs_avx512cd_available(void) boolean_t has_avx512 = B_FALSE; #if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512CD) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_AVX512CD) has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512CD); #else @@ -596,7 +771,7 @@ zfs_avx512er_available(void) boolean_t has_avx512 = B_FALSE; #if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512ER) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_AVX512ER) has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512ER); #else @@ -616,7 +791,7 @@ zfs_avx512pf_available(void) boolean_t has_avx512 = B_FALSE; #if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512PF) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_AVX512PF) has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512PF); #else @@ -636,7 +811,7 @@ zfs_avx512bw_available(void) boolean_t has_avx512 = B_FALSE; #if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512BW) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_AVX512BW) has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512BW); #else @@ -656,7 +831,7 @@ zfs_avx512dq_available(void) boolean_t has_avx512 = B_FALSE; #if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512DQ) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_AVX512DQ) has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512DQ); #else @@ -676,7 +851,7 @@ zfs_avx512vl_available(void) boolean_t has_avx512 = B_FALSE; #if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512VL) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_AVX512VL) has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512VL); #else @@ -696,7 +871,7 @@ zfs_avx512ifma_available(void) boolean_t has_avx512 = B_FALSE; #if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512IFMA) && defined(KERNEL_EXPORTS_X86_FPU) +#if 
defined(X86_FEATURE_AVX512IFMA) has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512IFMA); #else @@ -716,7 +891,7 @@ zfs_avx512vbmi_available(void) boolean_t has_avx512 = B_FALSE; #if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512VBMI) && defined(KERNEL_EXPORTS_X86_FPU) +#if defined(X86_FEATURE_AVX512VBMI) has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512VBMI); #else diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 2ce32469d..0ce2b5ea1 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -51,7 +51,7 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); */ void vdev_raidz_math_init(void); void vdev_raidz_math_fini(void); -struct raidz_impl_ops *vdev_raidz_math_get_ops(void); +const struct raidz_impl_ops *vdev_raidz_math_get_ops(void); int vdev_raidz_math_generate(struct raidz_map *); int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *, const int); diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 94960ba95..2e38962cc 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -126,7 +126,7 @@ typedef struct raidz_map { uintptr_t rm_reports; /* # of referencing checksum reports */ uint8_t rm_freed; /* map no longer has referencing ZIO */ uint8_t rm_ecksuminjected; /* checksum error was injected */ - raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c index 36e0686a5..fe15d76d1 100644 --- a/module/icp/algs/aes/aes_impl.c +++ b/module/icp/algs/aes/aes_impl.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * Initialize AES encryption and decryption key schedules. @@ -40,9 +41,9 @@ void aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched) { - aes_impl_ops_t *ops = aes_impl_get_ops(); - aes_key_t *newbie = keysched; - uint_t keysize, i, j; + const aes_impl_ops_t *ops = aes_impl_get_ops(); + aes_key_t *newbie = keysched; + uint_t keysize, i, j; union { uint64_t ka64[4]; uint32_t ka32[8]; @@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0; static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)]; /* - * Selects the aes operations for encrypt/decrypt/key setup + * Returns the AES operations for encrypt/decrypt/key setup. When a + * SIMD implementation is not allowed in the current context, then + * fallback to the fastest generic implementation. */ -aes_impl_ops_t * -aes_impl_get_ops() +const aes_impl_ops_t * +aes_impl_get_ops(void) { - aes_impl_ops_t *ops = NULL; + if (!kfpu_allowed()) + return (&aes_generic_impl); + + const aes_impl_ops_t *ops = NULL; const uint32_t impl = AES_IMPL_READ(icp_aes_impl); switch (impl) { @@ -266,15 +272,13 @@ aes_impl_get_ops() ops = &aes_fastest_impl; break; case IMPL_CYCLE: - { + /* Cycle through supported implementations */ ASSERT(aes_impl_initialized); ASSERT3U(aes_supp_impl_cnt, >, 0); - /* Cycle through supported implementations */ static size_t cycle_impl_idx = 0; size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt; ops = aes_supp_impl[idx]; - } - break; + break; default: ASSERT3U(impl, <, aes_supp_impl_cnt); ASSERT3U(aes_supp_impl_cnt, >, 0); @@ -288,13 +292,16 @@ aes_impl_get_ops() return (ops); } +/* + * Initialize all supported implementations. 
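+ * Implementations whose is_supported() callback fails (including all
+ * SIMD variants when kfpu_allowed() is zero) are excluded from the
+ * aes_supp_impl[] table and can never be selected.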
+ */ void aes_impl_init(void) { aes_impl_ops_t *curr_impl; int i, c; - /* move supported impl into aes_supp_impls */ + /* Move supported implementations into aes_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) { curr_impl = (aes_impl_ops_t *)aes_all_impl[i]; diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c index 97f7c3a47..222c176aa 100644 --- a/module/icp/algs/aes/aes_impl_aesni.c +++ b/module/icp/algs/aes/aes_impl_aesni.c @@ -108,7 +108,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4], static boolean_t aes_aesni_will_work(void) { - return (zfs_aes_available()); + return (kfpu_allowed() && zfs_aes_available()); } const aes_impl_ops_t aes_aesni_impl = { diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c index 0afd957f0..efbf0fea9 100644 --- a/module/icp/algs/modes/gcm.c +++ b/module/icp/algs/modes/gcm.c @@ -29,6 +29,7 @@ #include #include #include +#include #define GHASH(c, d, t, o) \ xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ @@ -46,7 +47,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { - gcm_impl_ops_t *gops; + const gcm_impl_ops_t *gops; size_t remainder = length; size_t need = 0; uint8_t *datap = (uint8_t *)data; @@ -168,7 +169,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { - gcm_impl_ops_t *gops; + const gcm_impl_ops_t *gops; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); uint8_t *ghash, *macp = NULL; int i, rv; @@ -320,7 +321,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { - gcm_impl_ops_t *gops; + const gcm_impl_ops_t *gops; size_t pt_len; size_t remainder; uint8_t *ghash; @@ -427,7 +428,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len, void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { - gcm_impl_ops_t *gops; + const gcm_impl_ops_t *gops; uint8_t *cb; ulong_t remainder = iv_len; ulong_t processed = 0; @@ -481,7 +482,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { - gcm_impl_ops_t *gops; + const gcm_impl_ops_t *gops; uint8_t *ghash, *datap, *authp; size_t remainder, processed; @@ -660,12 +661,17 @@ static size_t gcm_supp_impl_cnt = 0; static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)]; /* - * Selects the gcm operation + * Returns the GCM operations for encrypt/decrypt/key setup. When a + * SIMD implementation is not allowed in the current context, then + * fallback to the fastest generic implementation. 
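+ * Unlike the SIMD variants, gcm_generic_impl performs the GHASH
+ * multiply in plain C and never touches the FPU registers.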
*/ -gcm_impl_ops_t * +const gcm_impl_ops_t * gcm_impl_get_ops() { - gcm_impl_ops_t *ops = NULL; + if (!kfpu_allowed()) + return (&gcm_generic_impl); + + const gcm_impl_ops_t *ops = NULL; const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); switch (impl) { @@ -674,15 +680,13 @@ gcm_impl_get_ops() ops = &gcm_fastest_impl; break; case IMPL_CYCLE: - { + /* Cycle through supported implementations */ ASSERT(gcm_impl_initialized); ASSERT3U(gcm_supp_impl_cnt, >, 0); - /* Cycle through supported implementations */ static size_t cycle_impl_idx = 0; size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt; ops = gcm_supp_impl[idx]; - } - break; + break; default: ASSERT3U(impl, <, gcm_supp_impl_cnt); ASSERT3U(gcm_supp_impl_cnt, >, 0); @@ -696,13 +700,16 @@ gcm_impl_get_ops() return (ops); } +/* + * Initialize all supported implementations. + */ void gcm_impl_init(void) { gcm_impl_ops_t *curr_impl; int i, c; - /* move supported impl into aes_supp_impls */ + /* Move supported implementations into gcm_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) { curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i]; @@ -711,7 +718,10 @@ gcm_impl_init(void) } gcm_supp_impl_cnt = c; - /* set fastest implementation. assume hardware accelerated is fastest */ + /* + * Set the fastest implementation given the assumption that the + * hardware accelerated version is the fastest. + */ #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) if (gcm_pclmulqdq_impl.is_supported()) { memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl, diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c index be00ba37b..8a43ba33a 100644 --- a/module/icp/algs/modes/gcm_pclmulqdq.c +++ b/module/icp/algs/modes/gcm_pclmulqdq.c @@ -52,7 +52,7 @@ gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res) static boolean_t gcm_pclmulqdq_will_work(void) { - return (zfs_pclmulqdq_available()); + return (kfpu_allowed() && zfs_pclmulqdq_available()); } const gcm_impl_ops_t gcm_pclmulqdq_impl = { diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h index 3a3de91cf..a0b82ade4 100644 --- a/module/icp/include/aes/aes_impl.h +++ b/module/icp/include/aes/aes_impl.h @@ -201,9 +201,9 @@ extern const aes_impl_ops_t aes_aesni_impl; void aes_impl_init(void); /* - * Get selected aes implementation + * Returns optimal allowed AES implementation */ -struct aes_impl_ops *aes_impl_get_ops(void); +const struct aes_impl_ops *aes_impl_get_ops(void); #ifdef __cplusplus } diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h index b78cc8aab..28c8f63a7 100644 --- a/module/icp/include/modes/gcm_impl.h +++ b/module/icp/include/modes/gcm_impl.h @@ -64,9 +64,9 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl; void gcm_impl_init(void); /* - * Get selected aes implementation + * Returns optimal allowed GCM implementation */ -struct gcm_impl_ops *gcm_impl_get_ops(void); +const struct gcm_impl_ops *gcm_impl_get_ops(void); #ifdef __cplusplus } diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c index 53b193693..788bcef7d 100644 --- a/module/icp/io/aes.c +++ b/module/icp/io/aes.c @@ -206,7 +206,7 @@ aes_mod_init(void) { int ret; - /* find fastest implementations and set any requested implementations */ + /* Determine the fastest available implementation. 
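+	 * Both init functions populate their supported implementation
+	 * tables, assuming the hardware accelerated variant is fastest.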
*/ aes_impl_init(); gcm_impl_init(); diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index f712ce40c..4c9db441b 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -140,6 +140,7 @@ #include #include #include +#include #define FLETCHER_MIN_SIMD_SIZE 64 @@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector { const char *fis_name; uint32_t fis_sel; } fletcher_4_impl_selectors[] = { -#if !defined(_KERNEL) { "cycle", IMPL_CYCLE }, -#endif { "fastest", IMPL_FASTEST }, { "scalar", IMPL_SCALAR } }; #if defined(_KERNEL) static kstat_t *fletcher_4_kstat; -#endif static struct fletcher_4_kstat { uint64_t native; uint64_t byteswap; } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1]; +#endif /* Indicate that benchmark has been completed */ static boolean_t fletcher_4_initialized = B_FALSE; @@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val) return (err); } +/* + * Returns the Fletcher 4 operations for checksums. When a SIMD + * implementation is not allowed in the current context, then fallback + * to the fastest generic implementation. + */ static inline const fletcher_4_ops_t * fletcher_4_impl_get(void) { - fletcher_4_ops_t *ops = NULL; - const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); + if (!kfpu_allowed()) + return (&fletcher_4_superscalar4_ops); + + const fletcher_4_ops_t *ops = NULL; + uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); switch (impl) { case IMPL_FASTEST: ASSERT(fletcher_4_initialized); ops = &fletcher_4_fastest_impl; break; -#if !defined(_KERNEL) - case IMPL_CYCLE: { + case IMPL_CYCLE: + /* Cycle through supported implementations */ ASSERT(fletcher_4_initialized); ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); - static uint32_t cycle_count = 0; uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt; ops = fletcher_4_supp_impls[idx]; - } - break; -#endif + break; default: ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); ASSERT3U(impl, <, fletcher_4_supp_impls_cnt); - ops = fletcher_4_supp_impls[impl]; break; } @@ -659,6 +662,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, zio_cksum_t *); +#if defined(_KERNEL) static void fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) { @@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) /* restore original selection */ atomic_swap_32(&fletcher_4_impl_chosen, sel_save); } +#endif /* _KERNEL */ -void -fletcher_4_init(void) +/* + * Initialize and benchmark all supported implementations. 
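+ * In kernel context every supported implementation is timed against
+ * a 128 KiB buffer; in user space the benchmark is skipped and the
+ * last entry in the table is assumed to be the fastest.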
+ */ +static void +fletcher_4_benchmark(void) { - static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ fletcher_4_ops_t *curr_impl; - char *databuf; int i, c; - /* move supported impl into fletcher_4_supp_impls */ + /* Move supported implementations into fletcher_4_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) { curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i]; @@ -735,19 +741,10 @@ fletcher_4_init(void) membar_producer(); /* complete fletcher_4_supp_impls[] init */ fletcher_4_supp_impls_cnt = c; /* number of supported impl */ -#if !defined(_KERNEL) - /* Skip benchmarking and use last implementation as fastest */ - memcpy(&fletcher_4_fastest_impl, - fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1], - sizeof (fletcher_4_fastest_impl)); - fletcher_4_fastest_impl.name = "fastest"; - membar_producer(); +#if defined(_KERNEL) + static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ + char *databuf = vmem_alloc(data_size, KM_SLEEP); - fletcher_4_initialized = B_TRUE; - return; -#endif - /* Benchmark all supported implementations */ - databuf = vmem_alloc(data_size, KM_SLEEP); for (i = 0; i < data_size / sizeof (uint64_t); i++) ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */ @@ -755,9 +752,28 @@ fletcher_4_init(void) fletcher_4_benchmark_impl(B_TRUE, databuf, data_size); vmem_free(databuf, data_size); +#else + /* + * Skip the benchmark in user space to avoid impacting libzpool + * consumers (zdb, zhack, zinject, ztest). The last implementation + * is assumed to be the fastest and used by default. + */ + memcpy(&fletcher_4_fastest_impl, + fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1], + sizeof (fletcher_4_fastest_impl)); + fletcher_4_fastest_impl.name = "fastest"; + membar_producer(); +#endif /* _KERNEL */ +} + +void +fletcher_4_init(void) +{ + /* Determine the fastest available implementation. 
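+	 * The per-implementation timings are exported through the
+	 * fletcher_4_bench kstat installed below.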
*/ + fletcher_4_benchmark(); #if defined(_KERNEL) - /* install kstats for all implementations */ + /* Install kstats for all implementations */ fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (fletcher_4_kstat != NULL) { diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c index bd2db2b20..3b3c1b52b 100644 --- a/module/zcommon/zfs_fletcher_aarch64_neon.c +++ b/module/zcommon/zfs_fletcher_aarch64_neon.c @@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16))); static boolean_t fletcher_4_aarch64_neon_valid(void) { - return (B_TRUE); + return (kfpu_allowed()); } const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = { diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c index 7260a9864..0d4cff21a 100644 --- a/module/zcommon/zfs_fletcher_avx512.c +++ b/module/zcommon/zfs_fletcher_avx512.c @@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap); static boolean_t fletcher_4_avx512f_valid(void) { - return (zfs_avx512f_available()); + return (kfpu_allowed() && zfs_avx512f_available()); } const fletcher_4_ops_t fletcher_4_avx512f_ops = { diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c index 6dac047da..7f12efe6d 100644 --- a/module/zcommon/zfs_fletcher_intel.c +++ b/module/zcommon/zfs_fletcher_intel.c @@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) static boolean_t fletcher_4_avx2_valid(void) { - return (zfs_avx_available() && zfs_avx2_available()); + return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available()); } const fletcher_4_ops_t fletcher_4_avx2_ops = { diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c index a0b42e5f5..e6389d6e5 100644 --- a/module/zcommon/zfs_fletcher_sse.c +++ b/module/zcommon/zfs_fletcher_sse.c @@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) static boolean_t fletcher_4_sse2_valid(void) { - return (zfs_sse2_available()); + return (kfpu_allowed() && zfs_sse2_available()); } const fletcher_4_ops_t fletcher_4_sse2_ops = { @@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) static boolean_t fletcher_4_ssse3_valid(void) { - return (zfs_sse2_available() && zfs_ssse3_available()); + return (kfpu_allowed() && zfs_sse2_available() && + zfs_ssse3_available()); } const fletcher_4_ops_t fletcher_4_ssse3_ops = { diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index dab749138..f1c415838 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -853,10 +853,23 @@ zfs_prop_align_right(zfs_prop_t prop) #endif #if defined(_KERNEL) + +#include + +#if defined(HAVE_KERNEL_FPU_INTERNAL) +union fpregs_state **zfs_kfpu_fpregs; +EXPORT_SYMBOL(zfs_kfpu_fpregs); +#endif /* HAVE_KERNEL_FPU_INTERNAL */ + static int __init zcommon_init(void) { + int error = kfpu_init(); + if (error) + return (error); + fletcher_4_init(); + return (0); } @@ -864,6 +877,7 @@ static void __exit zcommon_fini(void) { fletcher_4_fini(); + kfpu_fini(); } module_init(zcommon_init); diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c index 3ef67768f..576d33bef 100644 --- a/module/zfs/vdev_raidz_math.c +++ b/module/zfs/vdev_raidz_math.c @@ -27,9 +27,9 @@ #include #include #include - #include #include +#include extern boolean_t raidz_will_scalar_work(void); @@ -87,6 
+87,7 @@ static uint32_t user_sel_impl = IMPL_FASTEST; static size_t raidz_supp_impl_cnt = 0; static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)]; +#if defined(_KERNEL) /* * kstats values for supported implementations * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s] @@ -95,14 +96,19 @@ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1]; /* kstat for benchmarked implementations */ static kstat_t *raidz_math_kstat = NULL; +#endif /* - * Selects the raidz operation for raidz_map - * If rm_ops is set to NULL original raidz implementation will be used + * Returns the RAIDZ operations for raidz_map() parity calculations. When + * a SIMD implementation is not allowed in the current context, then fallback + * to the fastest generic implementation. */ -raidz_impl_ops_t * -vdev_raidz_math_get_ops() +const raidz_impl_ops_t * +vdev_raidz_math_get_ops(void) { + if (!kfpu_allowed()) + return (&vdev_raidz_scalar_impl); + raidz_impl_ops_t *ops = NULL; const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); @@ -111,18 +117,14 @@ vdev_raidz_math_get_ops() ASSERT(raidz_math_initialized); ops = &vdev_raidz_fastest_impl; break; -#if !defined(_KERNEL) case IMPL_CYCLE: - { + /* Cycle through all supported implementations */ ASSERT(raidz_math_initialized); ASSERT3U(raidz_supp_impl_cnt, >, 0); - /* Cycle through all supported implementations */ static size_t cycle_impl_idx = 0; size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; ops = raidz_supp_impl[idx]; - } - break; -#endif + break; case IMPL_ORIGINAL: ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; break; @@ -273,6 +275,8 @@ const char *raidz_rec_name[] = { "rec_pq", "rec_pr", "rec_qr", "rec_pqr" }; +#if defined(_KERNEL) + #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1) static int @@ -435,21 +439,21 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) } } } +#endif -void -vdev_raidz_math_init(void) +/* + * Initialize and benchmark all supported implementations. 
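+ * In kernel context a fake zio with a warmed up buffer is used to
+ * time parity generation and reconstruction; in user space the
+ * benchmark is skipped and the last entry is assumed to be fastest.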
+ */ +static void +benchmark_raidz(void) { raidz_impl_ops_t *curr_impl; - zio_t *bench_zio = NULL; - raidz_map_t *bench_rm = NULL; - uint64_t bench_parity; - int i, c, fn; + int i, c; - /* move supported impl into raidz_supp_impl */ + /* Move supported impl into raidz_supp_impl */ for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; - /* initialize impl */ if (curr_impl->init) curr_impl->init(); @@ -459,18 +463,10 @@ vdev_raidz_math_init(void) membar_producer(); /* complete raidz_supp_impl[] init */ raidz_supp_impl_cnt = c; /* number of supported impl */ -#if !defined(_KERNEL) - /* Skip benchmarking and use last implementation as fastest */ - memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1], - sizeof (vdev_raidz_fastest_impl)); - strcpy(vdev_raidz_fastest_impl.name, "fastest"); - - raidz_math_initialized = B_TRUE; - - /* Use 'cycle' math selection method for userspace */ - VERIFY0(vdev_raidz_impl_set("cycle")); - return; -#endif +#if defined(_KERNEL) + zio_t *bench_zio = NULL; + raidz_map_t *bench_rm = NULL; + uint64_t bench_parity; /* Fake a zio and run the benchmark on a warmed up buffer */ bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); @@ -480,7 +476,7 @@ vdev_raidz_math_init(void) memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); /* Benchmark parity generation methods */ - for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) { bench_parity = fn + 1; /* New raidz_map is needed for each generate_p/q/r */ bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, @@ -495,7 +491,7 @@ vdev_raidz_math_init(void) bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, BENCH_COLS, PARITY_PQR); - for (fn = 0; fn < RAIDZ_REC_NUM; fn++) + for (int fn = 0; fn < RAIDZ_REC_NUM; fn++) benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl); vdev_raidz_map_free(bench_rm); @@ -503,11 +499,29 @@ vdev_raidz_math_init(void) /* cleanup the bench zio */ abd_free(bench_zio->io_abd); kmem_free(bench_zio, sizeof (zio_t)); +#else + /* + * Skip the benchmark in user space to avoid impacting libzpool + * consumers (zdb, zhack, zinject, ztest). The last implementation + * is assumed to be the fastest and used by default. + */ + memcpy(&vdev_raidz_fastest_impl, + raidz_supp_impl[raidz_supp_impl_cnt - 1], + sizeof (vdev_raidz_fastest_impl)); + strcpy(vdev_raidz_fastest_impl.name, "fastest"); +#endif /* _KERNEL */ +} - /* install kstats for all impl */ +void +vdev_raidz_math_init(void) +{ + /* Determine the fastest available implementation. 
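+	 * The results are exported through the vdev_raidz_bench kstat
+	 * installed below.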
*/ + benchmark_raidz(); + +#if defined(_KERNEL) + /* Install kstats for all implementations */ raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); - if (raidz_math_kstat != NULL) { raidz_math_kstat->ks_data = NULL; raidz_math_kstat->ks_ndata = UINT32_MAX; @@ -517,6 +531,7 @@ vdev_raidz_math_init(void) raidz_math_kstat_addr); kstat_install(raidz_math_kstat); } +#endif /* Finish initialization */ atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl); @@ -527,15 +542,15 @@ void vdev_raidz_math_fini(void) { raidz_impl_ops_t const *curr_impl; - int i; +#if defined(_KERNEL) if (raidz_math_kstat != NULL) { kstat_delete(raidz_math_kstat); raidz_math_kstat = NULL; } +#endif - /* fini impl */ - for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { curr_impl = raidz_all_maths[i]; if (curr_impl->fini) curr_impl->fini(); @@ -546,9 +561,7 @@ static const struct { char *name; uint32_t sel; } math_impl_opts[] = { -#if !defined(_KERNEL) { "cycle", IMPL_CYCLE }, -#endif { "fastest", IMPL_FASTEST }, { "original", IMPL_ORIGINAL }, { "scalar", IMPL_SCALAR } diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c index e3ad06776..0a67ceb84 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neon.c +++ b/module/zfs/vdev_raidz_math_aarch64_neon.c @@ -207,7 +207,7 @@ DEFINE_REC_METHODS(aarch64_neon); static boolean_t raidz_will_aarch64_neon_work(void) { - return (B_TRUE); // __arch64__ requires NEON + return (kfpu_allowed()); } const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = { diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c index f8688a06a..e072f51cd 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c +++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c @@ -217,7 +217,7 @@ DEFINE_REC_METHODS(aarch64_neonx2); static boolean_t raidz_will_aarch64_neonx2_work(void) { - return (B_TRUE); // __arch64__ requires NEON + return (kfpu_allowed()); } const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = { diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c index 063d29bcd..a12eb6720 100644 --- a/module/zfs/vdev_raidz_math_avx2.c +++ b/module/zfs/vdev_raidz_math_avx2.c @@ -396,7 +396,7 @@ DEFINE_REC_METHODS(avx2); static boolean_t raidz_will_avx2_work(void) { - return (zfs_avx_available() && zfs_avx2_available()); + return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available()); } const raidz_impl_ops_t vdev_raidz_avx2_impl = { diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c index d605653db..2f545c9ec 100644 --- a/module/zfs/vdev_raidz_math_avx512bw.c +++ b/module/zfs/vdev_raidz_math_avx512bw.c @@ -393,9 +393,8 @@ DEFINE_REC_METHODS(avx512bw); static boolean_t raidz_will_avx512bw_work(void) { - return (zfs_avx_available() && - zfs_avx512f_available() && - zfs_avx512bw_available()); + return (kfpu_allowed() && zfs_avx_available() && + zfs_avx512f_available() && zfs_avx512bw_available()); } const raidz_impl_ops_t vdev_raidz_avx512bw_impl = { diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c index f4e4560ce..75af7a8ee 100644 --- a/module/zfs/vdev_raidz_math_avx512f.c +++ b/module/zfs/vdev_raidz_math_avx512f.c @@ -470,9 +470,8 @@ DEFINE_REC_METHODS(avx512f); static boolean_t raidz_will_avx512f_work(void) { - return (zfs_avx_available() && - zfs_avx2_available() && - zfs_avx512f_available()); + return 
(kfpu_allowed() && zfs_avx_available() && + zfs_avx2_available() && zfs_avx512f_available()); } const raidz_impl_ops_t vdev_raidz_avx512f_impl = { diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c index 9985da273..5b3a9385c 100644 --- a/module/zfs/vdev_raidz_math_sse2.c +++ b/module/zfs/vdev_raidz_math_sse2.c @@ -607,7 +607,7 @@ DEFINE_REC_METHODS(sse2); static boolean_t raidz_will_sse2_work(void) { - return (zfs_sse_available() && zfs_sse2_available()); + return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available()); } const raidz_impl_ops_t vdev_raidz_sse2_impl = { diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c index 047a48d54..62247cf8e 100644 --- a/module/zfs/vdev_raidz_math_ssse3.c +++ b/module/zfs/vdev_raidz_math_ssse3.c @@ -399,8 +399,8 @@ DEFINE_REC_METHODS(ssse3); static boolean_t raidz_will_ssse3_work(void) { - return (zfs_sse_available() && zfs_sse2_available() && - zfs_ssse3_available()); + return (kfpu_allowed() && zfs_sse_available() && + zfs_sse2_available() && zfs_ssse3_available()); } const raidz_impl_ops_t vdev_raidz_ssse3_impl = { diff --git a/module/zfs/zio_crypt.c b/module/zfs/zio_crypt.c index 7cf20f413..7ce2b1bf4 100644 --- a/module/zfs/zio_crypt.c +++ b/module/zfs/zio_crypt.c @@ -549,12 +549,12 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, uint8_t *mac, zio_crypt_key_t *key) { - int ret; crypto_mechanism_t mech; uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint_t enc_len, keydata_len, aad_len; + int ret; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
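
For reviewers, the calling convention this patch establishes can be
summarized with a short sketch. Everything below is illustrative rather
than part of the patch: example_xor_blocks() is a hypothetical consumer,
and the scalar loop stands in for real SIMD instructions.

    #include <linux/simd.h>

    /*
     * Hypothetical kfpu consumer (not part of this patch). Callers
     * check kfpu_allowed() first, so builds where no kernel FPU
     * interface is usable fall back to plain C, and bracket all SIMD
     * instructions with kfpu_begin()/kfpu_end(), which disable
     * preemption and save/restore the FPU registers (into the per-cpu
     * zfs_kfpu_fpregs buffers when HAVE_KERNEL_FPU_INTERNAL applies).
     */
    static void
    example_xor_blocks(uint64_t *dst, const uint64_t *src, size_t words)
    {
    	size_t i;

    	if (!kfpu_allowed()) {
    		/* Generic C fallback, safe in any context. */
    		for (i = 0; i < words; i++)
    			dst[i] ^= src[i];
    		return;
    	}

    	kfpu_begin();
    	/*
    	 * A real implementation would issue SIMD instructions here;
    	 * the scalar loop only keeps the sketch self-contained.
    	 */
    	for (i = 0; i < words; i++)
    		dst[i] ^= src[i];
    	kfpu_end();
    }

Note that kfpu_init() must have succeeded (as in zcommon_init() above)
before kfpu_begin() is first called, since the internal implementation
allocates the per-cpu save areas there, and kfpu_fini() releases them
at module unload.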