diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h
index 5c7a61c56..9e8b2cf7c 100644
--- a/include/zfs_fletcher.h
+++ b/include/zfs_fletcher.h
@@ -143,6 +143,10 @@ extern const fletcher_4_ops_t fletcher_4_avx2_ops;
 extern const fletcher_4_ops_t fletcher_4_avx512f_ops;
 #endif
 
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
+extern const fletcher_4_ops_t fletcher_4_avx512bw_ops;
+#endif
+
 #if defined(__aarch64__)
 extern const fletcher_4_ops_t fletcher_4_aarch64_neon_ops;
 #endif
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index c711f6de6..1c773435c 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1507,7 +1507,7 @@ Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR.
 Select a fletcher 4 implementation.
 .sp
 Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR,
-\fBavx2\fR, \fBavx512f\fR, and \fBaarch64_neon\fR.
+\fBavx2\fR, \fBavx512f\fR, \fBavx512bw\fR, and \fBaarch64_neon\fR.
 All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction
 set extensions to be available and will only appear if ZFS detects that they are
 present at runtime. If multiple implementations of fletcher 4 are available,
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
index 1280ace31..f955dc8d9 100644
--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@@ -184,6 +184,9 @@ static const fletcher_4_ops_t *fletcher_4_impls[] = {
 #if defined(__x86_64) && defined(HAVE_AVX512F)
 	&fletcher_4_avx512f_ops,
 #endif
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
+	&fletcher_4_avx512bw_ops,
+#endif
 #if defined(__aarch64__)
 	&fletcher_4_aarch64_neon_ops,
 #endif
diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
index 43806f264..d33d2dc33 100644
--- a/module/zcommon/zfs_fletcher_avx512.c
+++ b/module/zcommon/zfs_fletcher_avx512.c
@@ -171,4 +171,77 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
 	.name = "avx512f"
 };
 
+#if defined(HAVE_AVX512BW)
+/*
+ * Fletcher 4 compute step for byteswapped input. Each 32-bit word is
+ * byte-reversed with the 512-bit form of vpshufb (an AVX512BW instruction)
+ * before being folded into the running sums kept in zmm0-zmm3.
+ *
+ * The loop consumes eight 32-bit words (32 bytes) of buf per iteration.
+ */
+static void
+fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
+    uint64_t size)
+{
+	/*
+	 * vpshufb control bytes. Indices are relative to each 128-bit lane:
+	 * the low quadword picks bytes 3..0 and the high quadword bytes 11..8,
+	 * reversing the zero-extended word held there. The 0xFF selectors
+	 * have the top bit set, so the upper four result bytes are zeroed.
+	 */
+	static const zfs_fletcher_avx512_t mask = {
+		.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
+	};
+	const uint32_t *ip = buf;
+	const uint32_t *ipend = (const uint32_t *)((const uint8_t *)ip + size);
+
+	kfpu_begin();
+
+	FLETCHER_4_AVX512_RESTORE_CTX(ctx);
+
+	__asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));
+
+	for (; ip < ipend; ip += 8) {
+		__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
+
+		__asm("vpshufb %zmm5, %zmm4, %zmm4");
+
+		__asm("vpaddq %zmm4, %zmm0, %zmm0");
+		__asm("vpaddq %zmm0, %zmm1, %zmm1");
+		__asm("vpaddq %zmm1, %zmm2, %zmm2");
+		__asm("vpaddq %zmm2, %zmm3, %zmm3");
+	}
+
+	FLETCHER_4_AVX512_SAVE_CTX(ctx);
+
+	kfpu_end();
+}
+STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);
+
+/*
+ * The byteswap path runs vpshufb on zmm registers, which requires AVX512BW
+ * on top of AVX512F. Checking only the AVX512F predicate would let CPUs
+ * without AVX512BW select this implementation and fault on that instruction.
+ */
+static boolean_t
+fletcher_4_avx512bw_valid(void)
+{
+	return (fletcher_4_avx512f_valid() && zfs_avx512bw_available());
+}
+
+const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
+	.init_native = fletcher_4_avx512f_init,
+	.fini_native = fletcher_4_avx512f_fini,
+	.compute_native = fletcher_4_avx512f_native,
+	.init_byteswap = fletcher_4_avx512f_init,
+	.fini_byteswap = fletcher_4_avx512f_fini,
+	.compute_byteswap = fletcher_4_avx512bw_byteswap,
+	.valid = fletcher_4_avx512bw_valid,
+	.name = "avx512bw"
+};
+#endif
+
 #endif /* defined(__x86_64) && defined(HAVE_AVX512F) */