From 01017962908392b83764f29b21e687844636eed6 Mon Sep 17 00:00:00 2001
From: Gvozden Neskovic <neskovic@gmail.com>
Date: Fri, 16 Dec 2016 02:31:33 +0100
Subject: [PATCH] ABD: Adapt avx512bw raidz assembly

Adapt the avx512bw implementation for use with ABD buffers and re-enable it.
The mul2 implementation is rewritten to take advantage of the AVX-512BW
instruction set.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Romain Dolbeau <romain.dolbeau@atos.net>
Signed-off-by: Gvozden Neskovic <neskovic@gmail.com>
Closes #5477
---
 module/zfs/vdev_raidz_math.c          |   2 +-
 module/zfs/vdev_raidz_math_avx512bw.c | 174 +++++++++++---------------
 2 files changed, 71 insertions(+), 105 deletions(-)

diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c
index 85dd15cc3..a175bcf77 100644
--- a/module/zfs/vdev_raidz_math.c
+++ b/module/zfs/vdev_raidz_math.c
@@ -61,7 +61,7 @@ const raidz_impl_ops_t *raidz_all_maths[] = {
 	&vdev_raidz_avx512f_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX512BW)	/* only x86_64 for now */
-	// &vdev_raidz_avx512bw_impl,
+	&vdev_raidz_avx512bw_impl,
 #endif
 #if defined(__aarch64__)
 	&vdev_raidz_aarch64_neon_impl,
diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c
index 33b2d388f..3d5326b9e 100644
--- a/module/zfs/vdev_raidz_math_avx512bw.c
+++ b/module/zfs/vdev_raidz_math_avx512bw.c
@@ -20,11 +20,12 @@
  */
 /*
  * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
  */
 
 #include <sys/isa_defs.h>
 
-#if 0 // defined(__x86_64) && defined(HAVE_AVX512BW)
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
 
 #include <sys/types.h>
 #include <linux/simd_x86.h>
@@ -66,20 +67,6 @@ typedef struct v {
 	uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
 } v_t;
 
-#define	PREFETCHNTA(ptr, offset)					\
-{									\
-	__asm(								\
-	    "prefetchnta " #offset "(%[MEM])\n"				\
-	    : : [MEM] "r" (ptr));					\
-}
-
-#define	PREFETCH(ptr, offset)						\
-{									\
-	__asm(								\
-	    "prefetcht0 " #offset "(%[MEM])\n"				\
-	    : : [MEM] "r" (ptr));					\
-}
-
 #define	XOR_ACC(src, r...)						\
 {									\
 	switch (REG_CNT(r)) {						\
@@ -122,25 +109,7 @@ typedef struct v {
 	}								\
 }
 
-#define	ZERO(r...)							\
-{									\
-	switch (REG_CNT(r)) {						\
-	case 4:								\
-		__asm(							\
-		    "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n"	\
-		    "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n"	\
-		    "vpxorq %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n"	\
-		    "vpxorq %" VR3(r) ", %" VR3(r)", %" VR3(r));	\
-		break;							\
-	case 2:								\
-		__asm(							\
-		    "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n"	\
-		    "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r));	\
-		break;							\
-	default:							\
-		ASM_BUG();						\
-	}								\
-}
+#define	ZERO(r...)	XOR(r, r)	/* r ^ r == 0; see XOR() */
 
 #define	COPY(r...)							\
 {									\
@@ -206,20 +175,11 @@ typedef struct v {
 	}								\
 }
 
-#define	FLUSH()								\
-{									\
-	__asm("vzeroupper");						\
-}
-
-#define	MUL2_SETUP()							\
-{									\
-	__asm("vmovq %0,   %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d));	\
-	__asm("vpbroadcastq %xmm14, %zmm14");				\
-	__asm("vmovq %0,   %%xmm13" :: "r"(0x8080808080808080));	\
-	__asm("vpbroadcastq %xmm13, %zmm13");				\
-	__asm("vmovq %0,   %%xmm12" :: "r"(0xfefefefefefefefe));	\
-	__asm("vpbroadcastq %xmm12, %zmm12");				\
-	__asm("vpxorq       %zmm15, %zmm15 ,%zmm15");			\
+#define	MUL2_SETUP()	/* for _MUL2: zmm22 = 64 x 0x1d, zmm23 = 0 */	\
+{									\
+	__asm("vmovq %0,    %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d));	\
+	__asm("vpbroadcastq  %xmm22, %zmm22");				\
+	__asm("vpxord        %zmm23, %zmm23 ,%zmm23");			\
 }
 
 #define	_MUL2(r...)							\
@@ -227,20 +187,14 @@ typedef struct v {
 	switch	(REG_CNT(r)) {						\
 	case 2:								\
 		__asm(							\
-		    "vpandq   %" VR0(r)", %zmm13, %zmm10\n"		\
-		    "vpandq   %" VR1(r)", %zmm13, %zmm11\n"		\
-		    "vpsrlq   $7, %zmm10, %zmm8\n"			\
-		    "vpsrlq   $7, %zmm11, %zmm9\n"			\
-		    "vpsllq   $1, %zmm10, %zmm10\n"			\
-		    "vpsllq   $1, %zmm11, %zmm11\n"			\
-		    "vpsubq   %zmm8, %zmm10, %zmm10\n"			\
-		    "vpsubq   %zmm9, %zmm11, %zmm11\n"			\
-		    "vpsllq   $1, %" VR0(r)", %" VR0(r) "\n"		\
-		    "vpsllq   $1, %" VR1(r)", %" VR1(r) "\n"		\
-		    "vpandq   %zmm10, %zmm14, %zmm10\n"			\
-		    "vpandq   %zmm11, %zmm14, %zmm11\n"			\
-		    "vpternlogd $0x6c,%zmm12, %zmm10, %" VR0(r) "\n"	\
-		    "vpternlogd $0x6c,%zmm12, %zmm11, %" VR1(r));	\
+		    "vpcmpb $1, %zmm23,     %" VR0(r)", %k1\n"		\
+		    "vpcmpb $1, %zmm23,     %" VR1(r)", %k2\n"		\
+		    "vpaddb     %" VR0(r)", %" VR0(r)", %" VR0(r) "\n"	\
+		    "vpaddb     %" VR1(r)", %" VR1(r)", %" VR1(r) "\n"	\
+		    "vpxord     %zmm22,     %" VR0(r)", %zmm12\n"	\
+		    "vpxord     %zmm22,     %" VR1(r)", %zmm13\n"	\
+		    "vmovdqu8   %zmm12,     %" VR0(r) "{%k1}\n"		\
+		    "vmovdqu8   %zmm13,     %" VR1(r) "{%k2}");		\
 		break;							\
 	default:							\
 		ASM_BUG();						\
@@ -276,7 +230,7 @@ typedef struct v {
 #define	_ta		"zmm10"
 #define	_tb		"zmm15"
 
-static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
+static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F;
 
 #define	_MULx2(c, r...)							\
 {									\
@@ -339,11 +293,15 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
 }
 
 #define	raidz_math_begin()	kfpu_begin()
-#define	raidz_math_end()						\
-{									\
-	FLUSH();							\
-	kfpu_end();							\
-}
+#define	raidz_math_end()	kfpu_end()
+
+/*
+ * ZERO, COPY, and MUL operations are already 2x unrolled, which means that
+ * the stride of these operations for avx512 must not exceed 4. Otherwise, a
+ * single step (2 x stride x 64B) would exceed the 512B block size.
+ */
+
+#define	SYN_STRIDE		4
 
 #define	ZERO_STRIDE		4
 #define	ZERO_DEFINE()		{}
@@ -361,59 +319,67 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
 #define	MUL_DEFINE()		{}
 #define	MUL_D			0, 1, 2, 3
 
-#define	GEN_P_DEFINE()		{}
 #define	GEN_P_STRIDE		4
+#define	GEN_P_DEFINE()		{}
 #define	GEN_P_P			0, 1, 2, 3
 
-#define	GEN_PQ_DEFINE()		{}
 #define	GEN_PQ_STRIDE		4
+#define	GEN_PQ_DEFINE() 	{}
 #define	GEN_PQ_D		0, 1, 2, 3
-#define	GEN_PQ_P		4, 5, 6, 7
-#define	GEN_PQ_Q		20, 21, 22, 23
+#define	GEN_PQ_C		4, 5, 6, 7
 
-#define	GEN_PQR_DEFINE()	{}
-#define	GEN_PQR_STRIDE		2
-#define	GEN_PQR_D		0, 1
-#define	GEN_PQR_P		2, 3
-#define	GEN_PQR_Q		4, 5
-#define	GEN_PQR_R		6, 7
+#define	GEN_PQR_STRIDE		4
+#define	GEN_PQR_DEFINE() 	{}
+#define	GEN_PQR_D		0, 1, 2, 3
+#define	GEN_PQR_C		4, 5, 6, 7
 
-#define	REC_P_DEFINE()		{}
-#define	REC_P_STRIDE		4
-#define	REC_P_X			0, 1, 2, 3
+#define	SYN_Q_DEFINE()		{}
+#define	SYN_Q_D			0, 1, 2, 3
+#define	SYN_Q_X			4, 5, 6, 7
 
-#define	REC_Q_DEFINE()		{}
-#define	REC_Q_STRIDE		4
-#define	REC_Q_X			0, 1, 2, 3
+#define	SYN_R_DEFINE()		{}
+#define	SYN_R_D			0, 1, 2, 3
+#define	SYN_R_X			4, 5, 6, 7
 
-#define	REC_R_DEFINE()		{}
-#define	REC_R_STRIDE		4
-#define	REC_R_X			0, 1, 2, 3
+#define	SYN_PQ_DEFINE() 	{}
+#define	SYN_PQ_D		0, 1, 2, 3
+#define	SYN_PQ_X		4, 5, 6, 7
 
-#define	REC_PQ_DEFINE()		{}
-#define	REC_PQ_STRIDE		4
-#define	REC_PQ_X		0, 1, 2, 3
-#define	REC_PQ_Y		4, 5, 6, 7
-#define	REC_PQ_D		20, 21, 22, 23
+#define	REC_PQ_STRIDE		2
+#define	REC_PQ_DEFINE() 	{}
+#define	REC_PQ_X		0, 1
+#define	REC_PQ_Y		2, 3
+#define	REC_PQ_T		4, 5
 
-#define	REC_PR_DEFINE()		{}
-#define	REC_PR_STRIDE		4
-#define	REC_PR_X		0, 1, 2, 3
-#define	REC_PR_Y		4, 5, 6, 7
-#define	REC_PR_D		20, 21, 22, 23
+#define	SYN_PR_DEFINE() 	{}
+#define	SYN_PR_D		0, 1, 2, 3
+#define	SYN_PR_X		4, 5, 6, 7
 
-#define	REC_QR_DEFINE()		{}
-#define	REC_QR_STRIDE		4
-#define	REC_QR_X		0, 1, 2, 3
-#define	REC_QR_Y		4, 5, 6, 7
-#define	REC_QR_D		20, 21, 22, 23
+#define	REC_PR_STRIDE		2
+#define	REC_PR_DEFINE() 	{}
+#define	REC_PR_X		0, 1
+#define	REC_PR_Y		2, 3
+#define	REC_PR_T		4, 5
+
+#define	SYN_QR_DEFINE() 	{}
+#define	SYN_QR_D		0, 1, 2, 3
+#define	SYN_QR_X		4, 5, 6, 7
+
+#define	REC_QR_STRIDE		2
+#define	REC_QR_DEFINE() 	{}
+#define	REC_QR_X		0, 1
+#define	REC_QR_Y		2, 3
+#define	REC_QR_T		4, 5
+
+#define	SYN_PQR_DEFINE() 	{}
+#define	SYN_PQR_D		0, 1, 2, 3
+#define	SYN_PQR_X		4, 5, 6, 7
 
-#define	REC_PQR_DEFINE()	{}
 #define	REC_PQR_STRIDE		2
+#define	REC_PQR_DEFINE() 	{}
 #define	REC_PQR_X		0, 1
 #define	REC_PQR_Y		2, 3
 #define	REC_PQR_Z		4, 5
-#define	REC_PQR_D		6, 7
 #define	REC_PQR_XS		6, 7
 #define	REC_PQR_YS		8, 9