Mirror of https://git.proxmox.com/git/mirror_zfs.git, synced 2024-12-26 19:19:32 +03:00
ABD: Adapt avx512bw raidz assembly
Adapt the avx512bw implementation for use with abd buffers. The mul2
implementation is rewritten to take advantage of the BW instruction set.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Romain Dolbeau <romain.dolbeau@atos.net>
Signed-off-by: Gvozden Neskovic <neskovic@gmail.com>
Closes #5477
This commit is contained in:
parent f2d8bdc62e
commit 0101796290
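For orientation before the hunks, here is a minimal scalar model of the operation the rewritten mul2 path vectorizes: multiplication by 2 in GF(2^8) with the reduction polynomial used by RAID-Z. This sketch is not part of the patch; the name gf_mul2 is illustrative.

#include <stdint.h>

/*
 * Scalar model of GF(2^8) multiply-by-2 (illustrative, not from the
 * patch).  Doubling is a left shift by one; if the top bit of the byte
 * was set, the result is reduced by XORing in 0x1d, the low byte of the
 * polynomial x^8 + x^4 + x^3 + x^2 + 1.
 */
static inline uint8_t
gf_mul2(uint8_t b)
{
	uint8_t doubled = (uint8_t)(b << 1);

	return (((int8_t)b < 0) ? (uint8_t)(doubled ^ 0x1d) : doubled);
}

The old AVX-512F code emulated this byte operation with 64-bit lane arithmetic (mask, shift, subtract, vpternlogd); AVX-512BW adds byte-granular compares, adds, and write masks, so each scalar step maps onto a single instruction, as the _MUL2 hunk below shows.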
module/zfs/vdev_raidz_math.c:
@@ -61,7 +61,7 @@ const raidz_impl_ops_t *raidz_all_maths[] = {
 	&vdev_raidz_avx512f_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */
-//	&vdev_raidz_avx512bw_impl,
+	&vdev_raidz_avx512bw_impl,
 #endif
 #if defined(__aarch64__)
 	&vdev_raidz_aarch64_neon_impl,
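Uncommenting the entry in raidz_all_maths[] makes the avx512bw implementation visible to the RAIDZ math benchmark again, so it can be selected at runtime now that it handles abd buffers.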
module/zfs/vdev_raidz_math_avx512bw.c:
@@ -20,11 +20,12 @@
  */
 /*
  * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
  */
 
 #include <sys/isa_defs.h>
 
-#if 0 // defined(__x86_64) && defined(HAVE_AVX512BW)
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
 
 #include <sys/types.h>
 #include <linux/simd_x86.h>
@@ -66,20 +67,6 @@ typedef struct v {
 	uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
 } v_t;
 
-#define	PREFETCHNTA(ptr, offset) \
-{ \
-	__asm( \
-	    "prefetchnta " #offset "(%[MEM])\n" \
-	    : : [MEM] "r" (ptr)); \
-}
-
-#define	PREFETCH(ptr, offset) \
-{ \
-	__asm( \
-	    "prefetcht0 " #offset "(%[MEM])\n" \
-	    : : [MEM] "r" (ptr)); \
-}
-
 #define	XOR_ACC(src, r...) \
 { \
 	switch (REG_CNT(r)) { \
@@ -122,25 +109,7 @@ typedef struct v {
 	} \
 }
 
-#define	ZERO(r...) \
-{ \
-	switch (REG_CNT(r)) { \
-	case 4: \
-		__asm( \
-		    "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
-		    "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n" \
-		    "vpxorq %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n" \
-		    "vpxorq %" VR3(r) ", %" VR3(r)", %" VR3(r)); \
-		break; \
-	case 2: \
-		__asm( \
-		    "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
-		    "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r)); \
-		break; \
-	default: \
-		ASM_BUG(); \
-	} \
-}
+#define	ZERO(r...)	XOR(r, r)
 
 #define	COPY(r...) \
 { \
@@ -206,20 +175,11 @@ typedef struct v {
 	} \
 }
 
-#define	FLUSH() \
-{ \
-	__asm("vzeroupper"); \
-}
-
 #define	MUL2_SETUP() \
 { \
-	__asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \
-	__asm("vpbroadcastq %xmm14, %zmm14"); \
-	__asm("vmovq %0, %%xmm13" :: "r"(0x8080808080808080)); \
-	__asm("vpbroadcastq %xmm13, %zmm13"); \
-	__asm("vmovq %0, %%xmm12" :: "r"(0xfefefefefefefefe)); \
-	__asm("vpbroadcastq %xmm12, %zmm12"); \
-	__asm("vpxorq %zmm15, %zmm15 ,%zmm15"); \
+	__asm("vmovq %0, %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+	__asm("vpbroadcastq %xmm22, %zmm22"); \
+	__asm("vpxord %zmm23, %zmm23 ,%zmm23"); \
 }
 
 #define	_MUL2(r...) \
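Note how the setup shrinks: zmm22 now holds the reduction byte 0x1d broadcast across all 64 lanes, and zmm23 is cleared to act as the zero reference for the signed byte compare in _MUL2; the 0x80 and 0xfe lane masks of the old quadword emulation are no longer needed.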
@@ -227,20 +187,14 @@ typedef struct v {
 	switch (REG_CNT(r)) { \
 	case 2: \
 		__asm( \
-		    "vpandq %" VR0(r)", %zmm13, %zmm10\n" \
-		    "vpandq %" VR1(r)", %zmm13, %zmm11\n" \
-		    "vpsrlq $7, %zmm10, %zmm8\n" \
-		    "vpsrlq $7, %zmm11, %zmm9\n" \
-		    "vpsllq $1, %zmm10, %zmm10\n" \
-		    "vpsllq $1, %zmm11, %zmm11\n" \
-		    "vpsubq %zmm8, %zmm10, %zmm10\n" \
-		    "vpsubq %zmm9, %zmm11, %zmm11\n" \
-		    "vpsllq $1, %" VR0(r)", %" VR0(r) "\n" \
-		    "vpsllq $1, %" VR1(r)", %" VR1(r) "\n" \
-		    "vpandq %zmm10, %zmm14, %zmm10\n" \
-		    "vpandq %zmm11, %zmm14, %zmm11\n" \
-		    "vpternlogd $0x6c,%zmm12, %zmm10, %" VR0(r) "\n" \
-		    "vpternlogd $0x6c,%zmm12, %zmm11, %" VR1(r)); \
+		    "vpcmpb $1, %zmm23, %" VR0(r)", %k1\n" \
+		    "vpcmpb $1, %zmm23, %" VR1(r)", %k2\n" \
+		    "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \
+		    "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \
+		    "vpxord %zmm22, %" VR0(r)", %zmm12\n" \
+		    "vpxord %zmm22, %" VR1(r)", %zmm13\n" \
+		    "vmovdqu8 %zmm12, %" VR0(r) "{%k1}\n" \
+		    "vmovdqu8 %zmm13, %" VR1(r) "{%k2}"); \
 		break; \
 	default: \
 		ASM_BUG(); \
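A hedged per-byte reading of the new sequence (register roles inferred from the diff, names illustrative): vpcmpb with predicate 1 (signed less-than) against the zero in zmm23 sets a mask bit for each byte whose top bit is set, vpaddb doubles every byte, vpxord XORs the broadcast 0x1d from zmm22 into a scratch register, and the write-masked vmovdqu8 commits the reduced bytes only where the mask bit is set:

#include <stdint.h>

/* Per-byte model of the new two-register _MUL2 (one zmm holds 64 bytes). */
static void
mul2_model(uint8_t v[64])
{
	int i;

	for (i = 0; i < 64; i++) {
		int hi = ((int8_t)v[i] < 0);        /* vpcmpb $1, %zmm23 */
		uint8_t d = (uint8_t)(v[i] + v[i]); /* vpaddb            */
		uint8_t t = (uint8_t)(d ^ 0x1d);    /* vpxord w/ %zmm22  */

		v[i] = hi ? t : d;                  /* vmovdqu8 {%k}     */
	}
}

Compared with the quadword version, this removes the shift/mask/subtract dance entirely and needs no per-call constants beyond the two set up in MUL2_SETUP.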
@@ -276,7 +230,7 @@ typedef struct v {
 #define	_ta	"zmm10"
 #define	_tb	"zmm15"
 
-static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
+static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F;
 
 #define	_MULx2(c, r...) \
 { \
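The bump from aligned(32) to aligned(64) presumably matches the 64-byte width of a zmm register.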
@@ -339,11 +293,15 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
 }
 
 #define	raidz_math_begin()	kfpu_begin()
-#define	raidz_math_end() \
-{ \
-	FLUSH(); \
-	kfpu_end(); \
-}
+#define	raidz_math_end()	kfpu_end()
+
+/*
+ * ZERO, COPY, and MUL operations are already 2x unrolled, which means that
+ * the stride of these operations for avx512 must not exceed 4. Otherwise, a
+ * single step would exceed 512B block size.
+ */
+
+#define	SYN_STRIDE	4
 
 #define	ZERO_STRIDE	4
 #define	ZERO_DEFINE()	{}
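The bound in that comment follows from the register widths: a stride of 4 zmm registers at 64 bytes each, unrolled 2x, touches exactly 4 × 64 × 2 = 512 bytes per step.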
@@ -361,59 +319,67 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
 #define	MUL_DEFINE()	{}
 #define	MUL_D		0, 1, 2, 3
 
-#define	GEN_P_DEFINE()	{}
 #define	GEN_P_STRIDE	4
+#define	GEN_P_DEFINE()	{}
 #define	GEN_P_P		0, 1, 2, 3
 
-#define	GEN_PQ_DEFINE()	{}
 #define	GEN_PQ_STRIDE	4
+#define	GEN_PQ_DEFINE()	{}
 #define	GEN_PQ_D	0, 1, 2, 3
-#define	GEN_PQ_P	4, 5, 6, 7
-#define	GEN_PQ_Q	20, 21, 22, 23
+#define	GEN_PQ_C	4, 5, 6, 7
 
+#define	GEN_PQR_STRIDE	4
 #define	GEN_PQR_DEFINE() {}
-#define	GEN_PQR_STRIDE	2
-#define	GEN_PQR_D	0, 1
-#define	GEN_PQR_P	2, 3
-#define	GEN_PQR_Q	4, 5
-#define	GEN_PQR_R	6, 7
+#define	GEN_PQR_D	0, 1, 2, 3
+#define	GEN_PQR_C	4, 5, 6, 7
 
-#define	REC_P_DEFINE()	{}
-#define	REC_P_STRIDE	4
-#define	REC_P_X		0, 1, 2, 3
+#define	SYN_Q_DEFINE()	{}
+#define	SYN_Q_D		0, 1, 2, 3
+#define	SYN_Q_X		4, 5, 6, 7
 
-#define	REC_Q_DEFINE()	{}
-#define	REC_Q_STRIDE	4
-#define	REC_Q_X		0, 1, 2, 3
+#define	SYN_R_DEFINE()	{}
+#define	SYN_R_D		0, 1, 2, 3
+#define	SYN_R_X		4, 5, 6, 7
 
-#define	REC_R_DEFINE()	{}
-#define	REC_R_STRIDE	4
-#define	REC_R_X		0, 1, 2, 3
+#define	SYN_PQ_DEFINE()	{}
+#define	SYN_PQ_D	0, 1, 2, 3
+#define	SYN_PQ_X	4, 5, 6, 7
 
+#define	REC_PQ_STRIDE	2
 #define	REC_PQ_DEFINE()	{}
-#define	REC_PQ_STRIDE	4
-#define	REC_PQ_X	0, 1, 2, 3
-#define	REC_PQ_Y	4, 5, 6, 7
-#define	REC_PQ_D	20, 21, 22, 23
+#define	REC_PQ_X	0, 1
+#define	REC_PQ_Y	2, 3
+#define	REC_PQ_T	4, 5
 
+#define	SYN_PR_DEFINE()	{}
+#define	SYN_PR_D	0, 1, 2, 3
+#define	SYN_PR_X	4, 5, 6, 7
+
+#define	REC_PR_STRIDE	2
 #define	REC_PR_DEFINE()	{}
-#define	REC_PR_STRIDE	4
-#define	REC_PR_X	0, 1, 2, 3
-#define	REC_PR_Y	4, 5, 6, 7
-#define	REC_PR_D	20, 21, 22, 23
+#define	REC_PR_X	0, 1
+#define	REC_PR_Y	2, 3
+#define	REC_PR_T	4, 5
 
+#define	SYN_QR_DEFINE()	{}
+#define	SYN_QR_D	0, 1, 2, 3
+#define	SYN_QR_X	4, 5, 6, 7
+
+#define	REC_QR_STRIDE	2
 #define	REC_QR_DEFINE()	{}
-#define	REC_QR_STRIDE	4
-#define	REC_QR_X	0, 1, 2, 3
-#define	REC_QR_Y	4, 5, 6, 7
-#define	REC_QR_D	20, 21, 22, 23
+#define	REC_QR_X	0, 1
+#define	REC_QR_Y	2, 3
+#define	REC_QR_T	4, 5
 
-#define	REC_PQR_DEFINE() {}
+#define	SYN_PQR_DEFINE() {}
+#define	SYN_PQR_D	0, 1, 2, 3
+#define	SYN_PQR_X	4, 5, 6, 7
+
 #define	REC_PQR_STRIDE	2
+#define	REC_PQR_DEFINE() {}
 #define	REC_PQR_X	0, 1
 #define	REC_PQR_Y	2, 3
 #define	REC_PQR_Z	4, 5
-#define	REC_PQR_D	6, 7
 #define	REC_PQR_XS	6, 7
 #define	REC_PQR_YS	8, 9
 