ABD: Adapt avx512bw raidz assembly

Adapt avx512bw implementation for use with abd buffers. Mul2 implementation
is rewritten to take advantage of the BW instruction set.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Romain Dolbeau <romain.dolbeau@atos.net>
Signed-off-by: Gvozden Neskovic <neskovic@gmail.com>
Closes #5477
This commit is contained in:
Gvozden Neskovic 2016-12-16 02:31:33 +01:00 committed by Brian Behlendorf
parent f2d8bdc62e
commit 0101796290
2 changed files with 71 additions and 105 deletions

View File

@ -61,7 +61,7 @@ const raidz_impl_ops_t *raidz_all_maths[] = {
&vdev_raidz_avx512f_impl,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */
// &vdev_raidz_avx512bw_impl,
&vdev_raidz_avx512bw_impl,
#endif
#if defined(__aarch64__)
&vdev_raidz_aarch64_neon_impl,

View File

@ -20,11 +20,12 @@
*/
/*
* Copyright (C) 2016 Romain Dolbeau. All rights reserved.
* Copyright (C) 2016 Gvozden Nešković. All rights reserved.
*/
#include <sys/isa_defs.h>
#if 0 // defined(__x86_64) && defined(HAVE_AVX512BW)
#if defined(__x86_64) && defined(HAVE_AVX512BW)
#include <sys/types.h>
#include <linux/simd_x86.h>
@ -66,20 +67,6 @@ typedef struct v {
uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;
#define PREFETCHNTA(ptr, offset) \
{ \
__asm( \
"prefetchnta " #offset "(%[MEM])\n" \
: : [MEM] "r" (ptr)); \
}
#define PREFETCH(ptr, offset) \
{ \
__asm( \
"prefetcht0 " #offset "(%[MEM])\n" \
: : [MEM] "r" (ptr)); \
}
#define XOR_ACC(src, r...) \
{ \
switch (REG_CNT(r)) { \
@ -122,25 +109,7 @@ typedef struct v {
} \
}
#define ZERO(r...) \
{ \
switch (REG_CNT(r)) { \
case 4: \
__asm( \
"vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
"vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n" \
"vpxorq %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n" \
"vpxorq %" VR3(r) ", %" VR3(r)", %" VR3(r)); \
break; \
case 2: \
__asm( \
"vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
"vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r)); \
break; \
default: \
ASM_BUG(); \
} \
}
#define ZERO(r...) XOR(r, r)
#define COPY(r...) \
{ \
@ -206,20 +175,11 @@ typedef struct v {
} \
}
#define FLUSH() \
{ \
__asm("vzeroupper"); \
}
#define MUL2_SETUP() \
{ \
__asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \
__asm("vpbroadcastq %xmm14, %zmm14"); \
__asm("vmovq %0, %%xmm13" :: "r"(0x8080808080808080)); \
__asm("vpbroadcastq %xmm13, %zmm13"); \
__asm("vmovq %0, %%xmm12" :: "r"(0xfefefefefefefefe)); \
__asm("vpbroadcastq %xmm12, %zmm12"); \
__asm("vpxorq %zmm15, %zmm15 ,%zmm15"); \
__asm("vmovq %0, %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d)); \
__asm("vpbroadcastq %xmm22, %zmm22"); \
__asm("vpxord %zmm23, %zmm23 ,%zmm23"); \
}
#define _MUL2(r...) \
@ -227,20 +187,14 @@ typedef struct v {
switch (REG_CNT(r)) { \
case 2: \
__asm( \
"vpandq %" VR0(r)", %zmm13, %zmm10\n" \
"vpandq %" VR1(r)", %zmm13, %zmm11\n" \
"vpsrlq $7, %zmm10, %zmm8\n" \
"vpsrlq $7, %zmm11, %zmm9\n" \
"vpsllq $1, %zmm10, %zmm10\n" \
"vpsllq $1, %zmm11, %zmm11\n" \
"vpsubq %zmm8, %zmm10, %zmm10\n" \
"vpsubq %zmm9, %zmm11, %zmm11\n" \
"vpsllq $1, %" VR0(r)", %" VR0(r) "\n" \
"vpsllq $1, %" VR1(r)", %" VR1(r) "\n" \
"vpandq %zmm10, %zmm14, %zmm10\n" \
"vpandq %zmm11, %zmm14, %zmm11\n" \
"vpternlogd $0x6c,%zmm12, %zmm10, %" VR0(r) "\n" \
"vpternlogd $0x6c,%zmm12, %zmm11, %" VR1(r)); \
"vpcmpb $1, %zmm23, %" VR0(r)", %k1\n" \
"vpcmpb $1, %zmm23, %" VR1(r)", %k2\n" \
"vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \
"vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \
"vpxord %zmm22, %" VR0(r)", %zmm12\n" \
"vpxord %zmm22, %" VR1(r)", %zmm13\n" \
"vmovdqu8 %zmm12, %" VR0(r) "{%k1}\n" \
"vmovdqu8 %zmm13, %" VR1(r) "{%k2}"); \
break; \
default: \
ASM_BUG(); \
@ -276,7 +230,7 @@ typedef struct v {
#define _ta "zmm10"
#define _tb "zmm15"
static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F;
#define _MULx2(c, r...) \
{ \
@ -339,11 +293,15 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
}
#define raidz_math_begin() kfpu_begin()
#define raidz_math_end() \
{ \
FLUSH(); \
kfpu_end(); \
}
#define raidz_math_end() kfpu_end()
/*
* ZERO, COPY, and MUL operations are already 2x unrolled, which means that
* the stride of these operations for avx512 must not exceed 4. Otherwise, a
* single step would exceed 512B block size.
*/
#define SYN_STRIDE 4
#define ZERO_STRIDE 4
#define ZERO_DEFINE() {}
@ -361,59 +319,67 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
#define MUL_DEFINE() {}
#define MUL_D 0, 1, 2, 3
#define GEN_P_DEFINE() {}
#define GEN_P_STRIDE 4
#define GEN_P_DEFINE() {}
#define GEN_P_P 0, 1, 2, 3
#define GEN_PQ_DEFINE() {}
#define GEN_PQ_STRIDE 4
#define GEN_PQ_DEFINE() {}
#define GEN_PQ_D 0, 1, 2, 3
#define GEN_PQ_P 4, 5, 6, 7
#define GEN_PQ_Q 20, 21, 22, 23
#define GEN_PQ_C 4, 5, 6, 7
#define GEN_PQR_STRIDE 4
#define GEN_PQR_DEFINE() {}
#define GEN_PQR_STRIDE 2
#define GEN_PQR_D 0, 1
#define GEN_PQR_P 2, 3
#define GEN_PQR_Q 4, 5
#define GEN_PQR_R 6, 7
#define GEN_PQR_D 0, 1, 2, 3
#define GEN_PQR_C 4, 5, 6, 7
#define REC_P_DEFINE() {}
#define REC_P_STRIDE 4
#define REC_P_X 0, 1, 2, 3
#define SYN_Q_DEFINE() {}
#define SYN_Q_D 0, 1, 2, 3
#define SYN_Q_X 4, 5, 6, 7
#define REC_Q_DEFINE() {}
#define REC_Q_STRIDE 4
#define REC_Q_X 0, 1, 2, 3
#define SYN_R_DEFINE() {}
#define SYN_R_D 0, 1, 2, 3
#define SYN_R_X 4, 5, 6, 7
#define REC_R_DEFINE() {}
#define REC_R_STRIDE 4
#define REC_R_X 0, 1, 2, 3
#define SYN_PQ_DEFINE() {}
#define SYN_PQ_D 0, 1, 2, 3
#define SYN_PQ_X 4, 5, 6, 7
#define REC_PQ_STRIDE 2
#define REC_PQ_DEFINE() {}
#define REC_PQ_STRIDE 4
#define REC_PQ_X 0, 1, 2, 3
#define REC_PQ_Y 4, 5, 6, 7
#define REC_PQ_D 20, 21, 22, 23
#define REC_PQ_X 0, 1
#define REC_PQ_Y 2, 3
#define REC_PQ_T 4, 5
#define SYN_PR_DEFINE() {}
#define SYN_PR_D 0, 1, 2, 3
#define SYN_PR_X 4, 5, 6, 7
#define REC_PR_STRIDE 2
#define REC_PR_DEFINE() {}
#define REC_PR_STRIDE 4
#define REC_PR_X 0, 1, 2, 3
#define REC_PR_Y 4, 5, 6, 7
#define REC_PR_D 20, 21, 22, 23
#define REC_PR_X 0, 1
#define REC_PR_Y 2, 3
#define REC_PR_T 4, 5
#define SYN_QR_DEFINE() {}
#define SYN_QR_D 0, 1, 2, 3
#define SYN_QR_X 4, 5, 6, 7
#define REC_QR_STRIDE 2
#define REC_QR_DEFINE() {}
#define REC_QR_STRIDE 4
#define REC_QR_X 0, 1, 2, 3
#define REC_QR_Y 4, 5, 6, 7
#define REC_QR_D 20, 21, 22, 23
#define REC_QR_X 0, 1
#define REC_QR_Y 2, 3
#define REC_QR_T 4, 5
#define SYN_PQR_DEFINE() {}
#define SYN_PQR_D 0, 1, 2, 3
#define SYN_PQR_X 4, 5, 6, 7
#define REC_PQR_DEFINE() {}
#define REC_PQR_STRIDE 2
#define REC_PQR_DEFINE() {}
#define REC_PQR_X 0, 1
#define REC_PQR_Y 2, 3
#define REC_PQR_Z 4, 5
#define REC_PQR_D 6, 7
#define REC_PQR_XS 6, 7
#define REC_PQR_YS 8, 9