From 0b2a642351f375cb9be3d2569a0ac0417340c741 Mon Sep 17 00:00:00 2001 From: Romain Dolbeau Date: Wed, 30 Oct 2019 20:26:14 +0100 Subject: [PATCH] Add AVX512BW variant of fletcher It is much faster than AVX512F when byteswapping on Skylake-SP and newer, as we can do the byteswap in a single vpshufb instead of many instructions. Reviewed-by: Gvozden Neskovic Reviewed-by: Chunwei Chen Reviewed-by: Brian Behlendorf Signed-off-by: Romain Dolbeau Closes #9517 --- include/zfs_fletcher.h | 4 +++ man/man5/zfs-module-parameters.5 | 2 +- module/zcommon/zfs_fletcher.c | 3 ++ module/zcommon/zfs_fletcher_avx512.c | 49 ++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 1 deletion(-) diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h index 5c7a61c56..9e8b2cf7c 100644 --- a/include/zfs_fletcher.h +++ b/include/zfs_fletcher.h @@ -143,6 +143,10 @@ extern const fletcher_4_ops_t fletcher_4_avx2_ops; extern const fletcher_4_ops_t fletcher_4_avx512f_ops; #endif +#if defined(__x86_64) && defined(HAVE_AVX512BW) +extern const fletcher_4_ops_t fletcher_4_avx512bw_ops; +#endif + #if defined(__aarch64__) extern const fletcher_4_ops_t fletcher_4_aarch64_neon_ops; #endif diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index c711f6de6..1c773435c 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1507,7 +1507,7 @@ Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR. Select a fletcher 4 implementation. .sp Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR, -\fBavx2\fR, \fBavx512f\fR, and \fBaarch64_neon\fR. +\fBavx2\fR, \fBavx512f\fR, \fBavx512bw\fR, and \fBaarch64_neon\fR. All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction set extensions to be available and will only appear if ZFS detects that they are present at runtime. 
If multiple implementations of fletcher 4 are available,
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
index 1280ace31..f955dc8d9 100644
--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@@ -184,6 +184,9 @@ static const fletcher_4_ops_t *fletcher_4_impls[] = {
 #if defined(__x86_64) && defined(HAVE_AVX512F)
 	&fletcher_4_avx512f_ops,
 #endif
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
+	&fletcher_4_avx512bw_ops,
+#endif
 #if defined(__aarch64__)
 	&fletcher_4_aarch64_neon_ops,
 #endif
diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
index 43806f264..d33d2dc33 100644
--- a/module/zcommon/zfs_fletcher_avx512.c
+++ b/module/zcommon/zfs_fletcher_avx512.c
@@ -171,4 +171,88 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
 	.name = "avx512f"
 };
 
+#if defined(HAVE_AVX512BW)
+/*
+ * Byteswapping fletcher-4 update built on AVX512BW.
+ *
+ * Every pass of the loop zero-extends eight 32-bit words from buf into the
+ * eight 64-bit lanes of zmm4 (vpmovzxdq), reverses the low four bytes of
+ * each lane with a single vpshufb, and folds the result into the four
+ * running sums held in zmm0-zmm3.
+ *
+ * ctx:  checksum state; presumably moved into and out of zmm0-zmm3 by the
+ *       FLETCHER_4_AVX512_{RESTORE,SAVE}_CTX macros (not visible in this
+ *       hunk).
+ * buf:  input data.
+ * size: length of buf in bytes.  The loop advances 32 bytes at a time, so
+ *       callers presumably pass a multiple of 32 (TODO confirm).
+ */
+static void
+fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
+    uint64_t size)
+{
+	/*
+	 * vpshufb control word.  Within every 128-bit lane, selectors 03..00
+	 * and 0B..08 reverse the low dword of each qword; the 0xFF selectors
+	 * have bit 7 set, which makes vpshufb write zero to the upper dword.
+	 */
+	static const zfs_fletcher_avx512_t mask = {
+		.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
+	};
+	const uint32_t *ip = buf;
+	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
+
+	kfpu_begin();
+
+	FLETCHER_4_AVX512_RESTORE_CTX(ctx);
+
+	__asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));
+
+	for (; ip < ipend; ip += 8) {
+		__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
+
+		__asm("vpshufb %zmm5, %zmm4, %zmm4");
+
+		__asm("vpaddq %zmm4, %zmm0, %zmm0");
+		__asm("vpaddq %zmm0, %zmm1, %zmm1");
+		__asm("vpaddq %zmm1, %zmm2, %zmm2");
+		__asm("vpaddq %zmm2, %zmm3, %zmm3");
+	}
+
+	FLETCHER_4_AVX512_SAVE_CTX(ctx)
+
+	kfpu_end();
+}
+STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);
+
+/*
+ * Runtime gate for the avx512bw implementation.  The byteswap routine above
+ * issues vpshufb on zmm registers, which needs AVX512BW in addition to
+ * AVX512F.  HAVE_AVX512BW is a compile-time macro, so it cannot tell whether
+ * the running CPU actually provides the extension.
+ *
+ * NOTE(review): assumes zfs_avx512bw_available() is provided by the SIMD
+ * header this file already includes for the AVX512F check - confirm.
+ */
+static boolean_t
+fletcher_4_avx512bw_valid(void)
+{
+	return (fletcher_4_avx512f_valid() && zfs_avx512bw_available());
+}
+
+const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
+	.init_native = fletcher_4_avx512f_init,
+	.fini_native = fletcher_4_avx512f_fini,
+	.compute_native = fletcher_4_avx512f_native,
+	.init_byteswap = fletcher_4_avx512f_init,
+	.fini_byteswap = fletcher_4_avx512f_fini,
+	.compute_byteswap = fletcher_4_avx512bw_byteswap,
+	.valid = fletcher_4_avx512bw_valid,
+	.name = "avx512bw"
+};
+#endif
+
 #endif /* defined(__x86_64) && defined(HAVE_AVX512F) */