Linux 5.0 compat: SIMD compatibility

Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS,
and 5.0 and newer kernels.

This commit squashes the following commits from master into
a single commit which can be applied to 0.8.2.

10fa2545 - Linux 4.14, 4.19, 5.0+ compat: SIMD save/restore
b88ca2ac - Enable SIMD for encryption
095b5412 - Fix CONFIG_X86_DEBUG_FPU build failure
e5db3134 - Linux 5.0 compat: SIMD compatibility

Reviewed-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
TEST_ZIMPORT_SKIP="yes"
This commit is contained in:
Brian Behlendorf
2019-07-12 09:31:20 -07:00
committed by Tony Hutter
parent 988b040476
commit 62c034f6d4
30 changed files with 548 additions and 206 deletions
+46 -30
View File
@@ -140,6 +140,7 @@
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <zfs_fletcher.h>
#include <linux/simd.h>
#define FLETCHER_MIN_SIMD_SIZE 64
@@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector {
const char *fis_name;
uint32_t fis_sel;
} fletcher_4_impl_selectors[] = {
#if !defined(_KERNEL)
{ "cycle", IMPL_CYCLE },
#endif
{ "fastest", IMPL_FASTEST },
{ "scalar", IMPL_SCALAR }
};
#if defined(_KERNEL)
static kstat_t *fletcher_4_kstat;
#endif
static struct fletcher_4_kstat {
uint64_t native;
uint64_t byteswap;
} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
#endif
/* Indicate that benchmark has been completed */
static boolean_t fletcher_4_initialized = B_FALSE;
@@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val)
return (err);
}
/*
* Returns the Fletcher 4 operations for checksums. When a SIMD
* implementation is not allowed in the current context, then fallback
* to the fastest generic implementation.
*/
static inline const fletcher_4_ops_t *
fletcher_4_impl_get(void)
{
fletcher_4_ops_t *ops = NULL;
const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
if (!kfpu_allowed())
return (&fletcher_4_superscalar4_ops);
const fletcher_4_ops_t *ops = NULL;
uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
switch (impl) {
case IMPL_FASTEST:
ASSERT(fletcher_4_initialized);
ops = &fletcher_4_fastest_impl;
break;
#if !defined(_KERNEL)
case IMPL_CYCLE: {
case IMPL_CYCLE:
/* Cycle through supported implementations */
ASSERT(fletcher_4_initialized);
ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
static uint32_t cycle_count = 0;
uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
ops = fletcher_4_supp_impls[idx];
}
break;
#endif
break;
default:
ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
ops = fletcher_4_supp_impls[impl];
break;
}
@@ -659,6 +662,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
zio_cksum_t *);
#if defined(_KERNEL)
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
{
@@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
/* restore original selection */
atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
}
#endif /* _KERNEL */
void
fletcher_4_init(void)
/*
* Initialize and benchmark all supported implementations.
*/
static void
fletcher_4_benchmark(void)
{
static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
fletcher_4_ops_t *curr_impl;
char *databuf;
int i, c;
/* move supported impl into fletcher_4_supp_impls */
/* Move supported implementations into fletcher_4_supp_impls */
for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
@@ -735,19 +741,10 @@ fletcher_4_init(void)
membar_producer(); /* complete fletcher_4_supp_impls[] init */
fletcher_4_supp_impls_cnt = c; /* number of supported impl */
#if !defined(_KERNEL)
/* Skip benchmarking and use last implementation as fastest */
memcpy(&fletcher_4_fastest_impl,
fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
sizeof (fletcher_4_fastest_impl));
fletcher_4_fastest_impl.name = "fastest";
membar_producer();
#if defined(_KERNEL)
static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
char *databuf = vmem_alloc(data_size, KM_SLEEP);
fletcher_4_initialized = B_TRUE;
return;
#endif
/* Benchmark all supported implementations */
databuf = vmem_alloc(data_size, KM_SLEEP);
for (i = 0; i < data_size / sizeof (uint64_t); i++)
((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
@@ -755,9 +752,28 @@ fletcher_4_init(void)
fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
vmem_free(databuf, data_size);
#else
/*
* Skip the benchmark in user space to avoid impacting libzpool
* consumers (zdb, zhack, zinject, ztest). The last implementation
* is assumed to be the fastest and used by default.
*/
memcpy(&fletcher_4_fastest_impl,
fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
sizeof (fletcher_4_fastest_impl));
fletcher_4_fastest_impl.name = "fastest";
membar_producer();
#endif /* _KERNEL */
}
void
fletcher_4_init(void)
{
/* Determine the fastest available implementation. */
fletcher_4_benchmark();
#if defined(_KERNEL)
/* install kstats for all implementations */
/* Install kstats for all implementations */
fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
if (fletcher_4_kstat != NULL) {
+1 -1
View File
@@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16)));
static boolean_t fletcher_4_aarch64_neon_valid(void)
{
return (B_TRUE);
return (kfpu_allowed());
}
const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
+1 -1
View File
@@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
static boolean_t
fletcher_4_avx512f_valid(void)
{
return (zfs_avx512f_available());
return (kfpu_allowed() && zfs_avx512f_available());
}
const fletcher_4_ops_t fletcher_4_avx512f_ops = {
+1 -1
View File
@@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
static boolean_t fletcher_4_avx2_valid(void)
{
return (zfs_avx_available() && zfs_avx2_available());
return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
}
const fletcher_4_ops_t fletcher_4_avx2_ops = {
+3 -2
View File
@@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
static boolean_t fletcher_4_sse2_valid(void)
{
return (zfs_sse2_available());
return (kfpu_allowed() && zfs_sse2_available());
}
const fletcher_4_ops_t fletcher_4_sse2_ops = {
@@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
static boolean_t fletcher_4_ssse3_valid(void)
{
return (zfs_sse2_available() && zfs_ssse3_available());
return (kfpu_allowed() && zfs_sse2_available() &&
zfs_ssse3_available());
}
const fletcher_4_ops_t fletcher_4_ssse3_ops = {
+14
View File
@@ -853,10 +853,23 @@ zfs_prop_align_right(zfs_prop_t prop)
#endif
#if defined(_KERNEL)
#include <linux/simd.h>
#if defined(HAVE_KERNEL_FPU_INTERNAL)
union fpregs_state **zfs_kfpu_fpregs;
EXPORT_SYMBOL(zfs_kfpu_fpregs);
#endif /* HAVE_KERNEL_FPU_INTERNAL */
static int __init
zcommon_init(void)
{
int error = kfpu_init();
if (error)
return (error);
fletcher_4_init();
return (0);
}
@@ -864,6 +877,7 @@ static void __exit
zcommon_fini(void)
{
fletcher_4_fini();
kfpu_fini();
}
module_init(zcommon_init);