Add AVX512BW variant of fletcher

It is much faster than AVX512F when byteswapping on Skylake-SP
and newer, as we can do the byteswap in a single vpshufb instead
of many instructions.

Reviewed-by: Gvozden Neskovic <neskovic@gmail.com>
Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net>
Closes #9517
This commit is contained in:
Romain Dolbeau
2019-10-30 20:26:14 +01:00
committed by Brian Behlendorf
parent bae11ba8dc
commit 0b2a642351
4 changed files with 57 additions and 1 deletions
+3
View File
@@ -184,6 +184,9 @@ static const fletcher_4_ops_t *fletcher_4_impls[] = {
#if defined(__x86_64) && defined(HAVE_AVX512F)
&fletcher_4_avx512f_ops,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512BW)
&fletcher_4_avx512bw_ops,
#endif
#if defined(__aarch64__)
&fletcher_4_aarch64_neon_ops,
#endif
+49
View File
@@ -171,4 +171,53 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
.name = "avx512f"
};
#if defined(HAVE_AVX512BW)
/*
 * Accumulate the fletcher-4 sums over 'buf' into 'ctx', byte-reversing every
 * 32-bit input word first.  The byte reversal is done with a single 512-bit
 * vpshufb per iteration.
 *
 * ctx  - running checksum state, loaded into / stored from the zmm
 *        accumulators by the FLETCHER_4_AVX512_{RESTORE,SAVE}_CTX macros.
 * buf  - input data, consumed as 32-bit words.
 * size - length of 'buf' in bytes.  The loop consumes 8 words (32 bytes) per
 *        iteration and has no tail handling -- assumes size is a multiple of
 *        32; TODO confirm against callers.
 *
 * NOTE: the asm statements below communicate through zmm0-zmm5 and carry no
 * register operands or clobber lists, so their relative order must not change.
 */
static void
fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
uint64_t size)
{
/*
 * vpshufb control bytes.  In each 64-bit element the low four bytes select
 * source bytes 3,2,1,0 (11,10,9,8 for the second element of a 128-bit
 * lane, since vpshufb indexes within 128-bit lanes), i.e. they reverse the
 * low dword.  The high four bytes are 0xFF; a control byte with its top bit
 * set makes vpshufb write zero, keeping the upper dword clear.
 */
static const zfs_fletcher_avx512_t mask = {
.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
};
const uint32_t *ip = buf;
/* One-past-the-end pointer, computed from the byte length. */
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
/* Presumably brackets FPU/SIMD register use; paired with kfpu_end() below. */
kfpu_begin();
/* Presumably loads the four running sums from ctx into zmm0-zmm3. */
FLETCHER_4_AVX512_RESTORE_CTX(ctx);
/* Load the shuffle mask once; zmm5 stays constant for the whole loop. */
__asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));
for (; ip < ipend; ip += 8) {
/*
 * Zero-extend 8 dwords to 8 qwords in zmm4.  The "m" operand names only
 * *ip, but the instruction reads the full 32 bytes starting there.
 */
__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
/* Byte-reverse the low dword of every qword (requires AVX512BW). */
__asm("vpshufb %zmm5, %zmm4, %zmm4");
/* Cascade: zmm0 += data, zmm1 += zmm0, zmm2 += zmm1, zmm3 += zmm2. */
__asm("vpaddq %zmm4, %zmm0, %zmm0");
__asm("vpaddq %zmm0, %zmm1, %zmm1");
__asm("vpaddq %zmm1, %zmm2, %zmm2");
__asm("vpaddq %zmm2, %zmm3, %zmm3");
}
/* Presumably stores zmm0-zmm3 back into ctx; the macro is used without ';'. */
FLETCHER_4_AVX512_SAVE_CTX(ctx)
kfpu_end();
}
/* NOTE(review): presumably exempts this function from objtool stack checks. */
STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);
const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
.init_native = fletcher_4_avx512f_init,
.fini_native = fletcher_4_avx512f_fini,
.compute_native = fletcher_4_avx512f_native,
.init_byteswap = fletcher_4_avx512f_init,
.fini_byteswap = fletcher_4_avx512f_fini,
.compute_byteswap = fletcher_4_avx512bw_byteswap,
.valid = fletcher_4_avx512f_valid,
.name = "avx512bw"
};
#endif
#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */