Linux 5.0 compat: SIMD compatibility

Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS, and 5.0 and newer kernels. This commit squashes the following commits from master in to a single commit which can be applied to 0.8.2. 10fa2545 - Linux 4.14, 4.19, 5.0+ compat: SIMD save/restore b88ca2ac - Enable SIMD for encryption 095b5412 - Fix CONFIG_X86_DEBUG_FPU build failure e5db3134 - Linux 5.0 compat: SIMD compatibility Reviewed-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> TEST_ZIMPORT_SKIP="yes"
2026-05-23 19:04:45 +03:00 · 2019-07-12 09:31:20 -07:00
parent 988b040476
commit 62c034f6d4
30 changed files with 548 additions and 206 deletions
@@ -140,6 +140,7 @@
 #include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <zfs_fletcher.h>
+#include <linux/simd.h>

 #define	FLETCHER_MIN_SIMD_SIZE	64

@@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector {
 	const char	*fis_name;
 	uint32_t	fis_sel;
 } fletcher_4_impl_selectors[] = {
-#if !defined(_KERNEL)
 	{ "cycle",	IMPL_CYCLE },
-#endif
 	{ "fastest",	IMPL_FASTEST },
 	{ "scalar",	IMPL_SCALAR }
 };

 #if defined(_KERNEL)
 static kstat_t *fletcher_4_kstat;
-#endif

 static struct fletcher_4_kstat {
 	uint64_t native;
 	uint64_t byteswap;
 } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
+#endif

 /* Indicate that benchmark has been completed */
 static boolean_t fletcher_4_initialized = B_FALSE;
@@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val)
 	return (err);
 }

+/*
+ * Returns the Fletcher 4 operations for checksums.   When a SIMD
+ * implementation is not allowed in the current context, then fallback
+ * to the fastest generic implementation.
+ */
 static inline const fletcher_4_ops_t *
 fletcher_4_impl_get(void)
 {
-	fletcher_4_ops_t *ops = NULL;
-	const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+	if (!kfpu_allowed())
+		return (&fletcher_4_superscalar4_ops);
+
+	const fletcher_4_ops_t *ops = NULL;
+	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);

 	switch (impl) {
 	case IMPL_FASTEST:
 		ASSERT(fletcher_4_initialized);
 		ops = &fletcher_4_fastest_impl;
 		break;
-#if !defined(_KERNEL)
-	case IMPL_CYCLE: {
+	case IMPL_CYCLE:
+		/* Cycle through supported implementations */
 		ASSERT(fletcher_4_initialized);
 		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
-
 		static uint32_t cycle_count = 0;
 		uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
 		ops = fletcher_4_supp_impls[idx];
-	}
-	break;
-#endif
+		break;
 	default:
 		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
 		ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
-
 		ops = fletcher_4_supp_impls[impl];
 		break;
 	}
@@ -659,6 +662,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
 typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
 					zio_cksum_t *);

+#if defined(_KERNEL)
 static void
 fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 {
@@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 	/* restore original selection */
 	atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
 }
+#endif /* _KERNEL */

-void
-fletcher_4_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+fletcher_4_benchmark(void)
 {
-	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
 	fletcher_4_ops_t *curr_impl;
-	char *databuf;
 	int i, c;

-	/* move supported impl into fletcher_4_supp_impls */
+	/* Move supported implementations into fletcher_4_supp_impls */
 	for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
 		curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];

@@ -735,19 +741,10 @@ fletcher_4_init(void)
 	membar_producer();	/* complete fletcher_4_supp_impls[] init */
 	fletcher_4_supp_impls_cnt = c;	/* number of supported impl */

-#if !defined(_KERNEL)
-	/* Skip benchmarking and use last implementation as fastest */
-	memcpy(&fletcher_4_fastest_impl,
-	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
-	    sizeof (fletcher_4_fastest_impl));
-	fletcher_4_fastest_impl.name = "fastest";
-	membar_producer();
+#if defined(_KERNEL)
+	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
+	char *databuf = vmem_alloc(data_size, KM_SLEEP);

-	fletcher_4_initialized = B_TRUE;
-	return;
-#endif
-	/* Benchmark all supported implementations */
-	databuf = vmem_alloc(data_size, KM_SLEEP);
 	for (i = 0; i < data_size / sizeof (uint64_t); i++)
 		((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */

@@ -755,9 +752,28 @@ fletcher_4_init(void)
 	fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);

 	vmem_free(databuf, data_size);
+#else
+	/*
+	 * Skip the benchmark in user space to avoid impacting libzpool
+	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
+	 * is assumed to be the fastest and used by default.
+	 */
+	memcpy(&fletcher_4_fastest_impl,
+	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
+	    sizeof (fletcher_4_fastest_impl));
+	fletcher_4_fastest_impl.name = "fastest";
+	membar_producer();
+#endif /* _KERNEL */
+}
+
+void
+fletcher_4_init(void)
+{
+	/* Determine the fastest available implementation. */
+	fletcher_4_benchmark();

 #if defined(_KERNEL)
-	/* install kstats for all implementations */
+	/* Install kstats for all implementations */
 	fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 	if (fletcher_4_kstat != NULL) {
@@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16)));

 static boolean_t fletcher_4_aarch64_neon_valid(void)
 {
-	return (B_TRUE);
+	return (kfpu_allowed());
 }

 const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
@@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
 static boolean_t
 fletcher_4_avx512f_valid(void)
 {
-	return (zfs_avx512f_available());
+	return (kfpu_allowed() && zfs_avx512f_available());
 }

 const fletcher_4_ops_t fletcher_4_avx512f_ops = {
@@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)

 static boolean_t fletcher_4_avx2_valid(void)
 {
-	return (zfs_avx_available() && zfs_avx2_available());
+	return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
 }

 const fletcher_4_ops_t fletcher_4_avx2_ops = {
@@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)

 static boolean_t fletcher_4_sse2_valid(void)
 {
-	return (zfs_sse2_available());
+	return (kfpu_allowed() && zfs_sse2_available());
 }

 const fletcher_4_ops_t fletcher_4_sse2_ops = {
@@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)

 static boolean_t fletcher_4_ssse3_valid(void)
 {
-	return (zfs_sse2_available() && zfs_ssse3_available());
+	return (kfpu_allowed() && zfs_sse2_available() &&
+	    zfs_ssse3_available());
 }

 const fletcher_4_ops_t fletcher_4_ssse3_ops = {
@@ -853,10 +853,23 @@ zfs_prop_align_right(zfs_prop_t prop)
 #endif

 #if defined(_KERNEL)
+
+#include <linux/simd.h>
+
+#if defined(HAVE_KERNEL_FPU_INTERNAL)
+union fpregs_state **zfs_kfpu_fpregs;
+EXPORT_SYMBOL(zfs_kfpu_fpregs);
+#endif /* HAVE_KERNEL_FPU_INTERNAL */
+
 static int __init
 zcommon_init(void)
 {
+	int error = kfpu_init();
+	if (error)
+		return (error);
+
 	fletcher_4_init();
+
 	return (0);
 }

@@ -864,6 +877,7 @@ static void __exit
 zcommon_fini(void)
 {
 	fletcher_4_fini();
+	kfpu_fini();
 }

 module_init(zcommon_init);