ABD: Adapt avx512bw raidz assembly

Adapt the avx512bw implementation for use with ABD buffers. The MUL2
implementation is rewritten to take advantage of the BW instruction set.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Romain Dolbeau <romain.dolbeau@atos.net>
Signed-off-by: Gvozden Neskovic <neskovic@gmail.com>
Closes #5477
Gvozden Neskovic 2016-12-16 02:31:33 +01:00 committed by Brian Behlendorf
parent f2d8bdc62e
commit 0101796290
2 changed files with 71 additions and 105 deletions
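
For reference, the Galois-field multiply-by-two that the rewritten MUL2 macro
computes reduces, per byte, to the following scalar form. This is an
illustrative sketch only; gf_mul2 is not a function added by this commit.

#include <stdint.h>

/*
 * RAID-Z "multiply by 2" in GF(2^8) with the 0x11d polynomial: shift the
 * byte left by one and, if the high bit was set, reduce by XOR-ing in 0x1d.
 * The AVX512BW _MUL2 macro in the diff below does the same thing 64 bytes
 * at a time: vpcmpb builds a mask of bytes whose sign bit is set, vpaddb
 * doubles every byte, vpxord computes the reduced copy, and a masked
 * vmovdqu8 merges the reduced bytes only where the mask is set.
 */
static inline uint8_t
gf_mul2(uint8_t a)
{
	return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
}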

module/zfs/vdev_raidz_math.c

@@ -61,7 +61,7 @@ const raidz_impl_ops_t *raidz_all_maths[] = {
 	&vdev_raidz_avx512f_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX512BW)	/* only x86_64 for now */
-//	&vdev_raidz_avx512bw_impl,
+	&vdev_raidz_avx512bw_impl,
 #endif
 #if defined(__aarch64__)
 	&vdev_raidz_aarch64_neon_impl,

module/zfs/vdev_raidz_math_avx512bw.c

@@ -20,11 +20,12 @@
  */

 /*
  * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
  */

 #include <sys/isa_defs.h>
-#if 0 // defined(__x86_64) && defined(HAVE_AVX512BW)
+#if defined(__x86_64) && defined(HAVE_AVX512BW)

 #include <sys/types.h>
 #include <linux/simd_x86.h>
@@ -66,20 +67,6 @@ typedef struct v {
 	uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
 } v_t;

-#define PREFETCHNTA(ptr, offset) \
-{ \
-	__asm( \
-	    "prefetchnta " #offset "(%[MEM])\n" \
-	    : : [MEM] "r" (ptr)); \
-}
-
-#define PREFETCH(ptr, offset) \
-{ \
-	__asm( \
-	    "prefetcht0 " #offset "(%[MEM])\n" \
-	    : : [MEM] "r" (ptr)); \
-}
-
 #define XOR_ACC(src, r...) \
 { \
 	switch (REG_CNT(r)) { \
@@ -122,25 +109,7 @@ typedef struct v {
 	} \
 }

-#define ZERO(r...) \
-{ \
-	switch (REG_CNT(r)) { \
-	case 4: \
-		__asm( \
-		    "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
-		    "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n" \
-		    "vpxorq %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n" \
-		    "vpxorq %" VR3(r) ", %" VR3(r)", %" VR3(r)); \
-		break; \
-	case 2: \
-		__asm( \
-		    "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
-		    "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r)); \
-		break; \
-	default: \
-		ASM_BUG(); \
-	} \
-}
+#define ZERO(r...) XOR(r, r)

 #define COPY(r...) \
 { \
@@ -206,20 +175,11 @@ typedef struct v {
 	} \
 }

-#define FLUSH() \
-{ \
-	__asm("vzeroupper"); \
-}
-
 #define MUL2_SETUP() \
 { \
-	__asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \
-	__asm("vpbroadcastq %xmm14, %zmm14"); \
-	__asm("vmovq %0, %%xmm13" :: "r"(0x8080808080808080)); \
-	__asm("vpbroadcastq %xmm13, %zmm13"); \
-	__asm("vmovq %0, %%xmm12" :: "r"(0xfefefefefefefefe)); \
-	__asm("vpbroadcastq %xmm12, %zmm12"); \
-	__asm("vpxorq %zmm15, %zmm15 ,%zmm15"); \
+	__asm("vmovq %0, %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+	__asm("vpbroadcastq %xmm22, %zmm22"); \
+	__asm("vpxord %zmm23, %zmm23 ,%zmm23"); \
 }

 #define _MUL2(r...) \
@@ -227,20 +187,14 @@ typedef struct v {
 	switch (REG_CNT(r)) { \
 	case 2: \
 		__asm( \
-		    "vpandq %" VR0(r)", %zmm13, %zmm10\n" \
-		    "vpandq %" VR1(r)", %zmm13, %zmm11\n" \
-		    "vpsrlq $7, %zmm10, %zmm8\n" \
-		    "vpsrlq $7, %zmm11, %zmm9\n" \
-		    "vpsllq $1, %zmm10, %zmm10\n" \
-		    "vpsllq $1, %zmm11, %zmm11\n" \
-		    "vpsubq %zmm8, %zmm10, %zmm10\n" \
-		    "vpsubq %zmm9, %zmm11, %zmm11\n" \
-		    "vpsllq $1, %" VR0(r)", %" VR0(r) "\n" \
-		    "vpsllq $1, %" VR1(r)", %" VR1(r) "\n" \
-		    "vpandq %zmm10, %zmm14, %zmm10\n" \
-		    "vpandq %zmm11, %zmm14, %zmm11\n" \
-		    "vpternlogd $0x6c,%zmm12, %zmm10, %" VR0(r) "\n" \
-		    "vpternlogd $0x6c,%zmm12, %zmm11, %" VR1(r)); \
+		    "vpcmpb $1, %zmm23, %" VR0(r)", %k1\n" \
+		    "vpcmpb $1, %zmm23, %" VR1(r)", %k2\n" \
+		    "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \
+		    "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \
+		    "vpxord %zmm22, %" VR0(r)", %zmm12\n" \
+		    "vpxord %zmm22, %" VR1(r)", %zmm13\n" \
+		    "vmovdqu8 %zmm12, %" VR0(r) "{%k1}\n" \
+		    "vmovdqu8 %zmm13, %" VR1(r) "{%k2}"); \
 		break; \
 	default: \
 		ASM_BUG(); \
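
The same sequence, written with compiler intrinsics rather than inline
assembly, looks roughly as follows. This is an illustrative sketch assuming
AVX-512BW support (compile with -mavx512bw); it is not code from the commit,
and it handles one register where the macro above handles two.

#include <immintrin.h>

/*
 * mul2_512 is an illustrative helper name, not part of the commit.
 */
static inline __m512i
mul2_512(__m512i x)
{
	const __m512i zero = _mm512_setzero_si512();
	const __m512i poly = _mm512_set1_epi8((char)0x1d);	/* zmm22 in the diff */
	/* k: one bit per byte whose top bit is set (vpcmpb $1 against zero) */
	__mmask64 k = _mm512_cmplt_epi8_mask(x, zero);
	__m512i dbl = _mm512_add_epi8(x, x);		/* vpaddb: x + x == x << 1 per byte */
	__m512i red = _mm512_xor_si512(dbl, poly);	/* vpxord with the reduction constant */
	return (_mm512_mask_mov_epi8(dbl, k, red));	/* masked vmovdqu8 merge */
}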
@@ -276,7 +230,7 @@ typedef struct v {
 #define _ta "zmm10"
 #define _tb "zmm15"

-static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
+static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F;

 #define _MULx2(c, r...) \
 { \
@@ -339,11 +293,15 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
 }

 #define raidz_math_begin() kfpu_begin()
-#define raidz_math_end() \
-{ \
-	FLUSH(); \
-	kfpu_end(); \
-}
+#define raidz_math_end() kfpu_end()
+
+/*
+ * ZERO, COPY, and MUL operations are already 2x unrolled, which means that
+ * the stride of these operations for avx512 must not exceed 4. Otherwise, a
+ * single step would exceed 512B block size.
+ */
+#define SYN_STRIDE 4

 #define ZERO_STRIDE 4
 #define ZERO_DEFINE() {}
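
The stride limit stated in the new comment follows from the register width
and the existing 2x unroll; a worked check, for illustration only (these
macro names are not part of the commit):

#define ZMM_BYTES	64	/* one zmm register */
#define UNROLL		2	/* ZERO/COPY/MUL are already 2x unrolled */
#define MAX_STRIDE	4	/* registers per step */
_Static_assert(MAX_STRIDE * ZMM_BYTES * UNROLL == 512,
	"a larger stride would exceed the 512B block size");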
@@ -361,59 +319,67 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
 #define MUL_DEFINE() {}
 #define MUL_D 0, 1, 2, 3

-#define GEN_P_DEFINE() {}
 #define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
 #define GEN_P_P 0, 1, 2, 3

-#define GEN_PQ_DEFINE() {}
 #define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
 #define GEN_PQ_D 0, 1, 2, 3
-#define GEN_PQ_P 4, 5, 6, 7
-#define GEN_PQ_Q 20, 21, 22, 23
+#define GEN_PQ_C 4, 5, 6, 7

+#define GEN_PQR_STRIDE 4
 #define GEN_PQR_DEFINE() {}
-#define GEN_PQR_STRIDE 2
-#define GEN_PQR_D 0, 1
-#define GEN_PQR_P 2, 3
-#define GEN_PQR_Q 4, 5
-#define GEN_PQR_R 6, 7
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7

-#define REC_P_DEFINE() {}
-#define REC_P_STRIDE 4
-#define REC_P_X 0, 1, 2, 3
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7

-#define REC_Q_DEFINE() {}
-#define REC_Q_STRIDE 4
-#define REC_Q_X 0, 1, 2, 3
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7

-#define REC_R_DEFINE() {}
-#define REC_R_STRIDE 4
-#define REC_R_X 0, 1, 2, 3
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7

+#define REC_PQ_STRIDE 2
 #define REC_PQ_DEFINE() {}
-#define REC_PQ_STRIDE 4
-#define REC_PQ_X 0, 1, 2, 3
-#define REC_PQ_Y 4, 5, 6, 7
-#define REC_PQ_D 20, 21, 22, 23
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5

+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
 #define REC_PR_DEFINE() {}
-#define REC_PR_STRIDE 4
-#define REC_PR_X 0, 1, 2, 3
-#define REC_PR_Y 4, 5, 6, 7
-#define REC_PR_D 20, 21, 22, 23
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5

+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
 #define REC_QR_DEFINE() {}
-#define REC_QR_STRIDE 4
-#define REC_QR_X 0, 1, 2, 3
-#define REC_QR_Y 4, 5, 6, 7
-#define REC_QR_D 20, 21, 22, 23
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5

+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
-#define REC_PQR_DEFINE() {}
 #define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
 #define REC_PQR_X 0, 1
 #define REC_PQR_Y 2, 3
 #define REC_PQR_Z 4, 5
-#define REC_PQR_D 6, 7
 #define REC_PQR_XS 6, 7
 #define REC_PQR_YS 8, 9