mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-12 19:20:28 +03:00
Add AVX512BW variant of fletcher
It is much faster than AVX512F when byteswapping on Skylake-SP and newer, as we can do the byteswap in a single vpshufb instead of many instructions. Reviewed-by: Gvozden Neskovic <neskovic@gmail.com> Reviewed-by: Chunwei Chen <tuxoko@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net> Closes #9517
This commit is contained in:
parent
bae11ba8dc
commit
0b2a642351
@ -143,6 +143,10 @@ extern const fletcher_4_ops_t fletcher_4_avx2_ops;
|
|||||||
extern const fletcher_4_ops_t fletcher_4_avx512f_ops;
|
extern const fletcher_4_ops_t fletcher_4_avx512f_ops;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__x86_64) && defined(HAVE_AVX512BW)
|
||||||
|
/* Fletcher 4 operations table for the AVX512BW implementation. */
extern const fletcher_4_ops_t fletcher_4_avx512bw_ops;
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__aarch64__)
|
#if defined(__aarch64__)
|
||||||
extern const fletcher_4_ops_t fletcher_4_aarch64_neon_ops;
|
extern const fletcher_4_ops_t fletcher_4_aarch64_neon_ops;
|
||||||
#endif
|
#endif
|
||||||
|
@ -1507,7 +1507,7 @@ Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR.
|
|||||||
Select a fletcher 4 implementation.
|
Select a fletcher 4 implementation.
|
||||||
.sp
|
.sp
|
||||||
Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR,
|
Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR,
|
||||||
\fBavx2\fR, \fBavx512f\fR, and \fBaarch64_neon\fR.
|
\fBavx2\fR, \fBavx512f\fR, \fBavx512bw\fR, and \fBaarch64_neon\fR.
|
||||||
All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction
|
All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction
|
||||||
set extensions to be available and will only appear if ZFS detects that they are
|
set extensions to be available and will only appear if ZFS detects that they are
|
||||||
present at runtime. If multiple implementations of fletcher 4 are available,
|
present at runtime. If multiple implementations of fletcher 4 are available,
|
||||||
|
@ -184,6 +184,9 @@ static const fletcher_4_ops_t *fletcher_4_impls[] = {
|
|||||||
#if defined(__x86_64) && defined(HAVE_AVX512F)
|
#if defined(__x86_64) && defined(HAVE_AVX512F)
|
||||||
&fletcher_4_avx512f_ops,
|
&fletcher_4_avx512f_ops,
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(__x86_64) && defined(HAVE_AVX512BW)
|
||||||
|
&fletcher_4_avx512bw_ops,
|
||||||
|
#endif
|
||||||
#if defined(__aarch64__)
|
#if defined(__aarch64__)
|
||||||
&fletcher_4_aarch64_neon_ops,
|
&fletcher_4_aarch64_neon_ops,
|
||||||
#endif
|
#endif
|
||||||
|
@ -171,4 +171,53 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
|
|||||||
.name = "avx512f"
|
.name = "avx512f"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#if defined(HAVE_AVX512BW)
|
||||||
|
static void
|
||||||
|
fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
|
||||||
|
uint64_t size)
|
||||||
|
{
|
||||||
|
static const zfs_fletcher_avx512_t mask = {
|
||||||
|
.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
|
||||||
|
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
|
||||||
|
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
|
||||||
|
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
|
||||||
|
};
|
||||||
|
const uint32_t *ip = buf;
|
||||||
|
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
|
||||||
|
|
||||||
|
kfpu_begin();
|
||||||
|
|
||||||
|
FLETCHER_4_AVX512_RESTORE_CTX(ctx);
|
||||||
|
|
||||||
|
__asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));
|
||||||
|
|
||||||
|
for (; ip < ipend; ip += 8) {
|
||||||
|
__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
|
||||||
|
|
||||||
|
__asm("vpshufb %zmm5, %zmm4, %zmm4");
|
||||||
|
|
||||||
|
__asm("vpaddq %zmm4, %zmm0, %zmm0");
|
||||||
|
__asm("vpaddq %zmm0, %zmm1, %zmm1");
|
||||||
|
__asm("vpaddq %zmm1, %zmm2, %zmm2");
|
||||||
|
__asm("vpaddq %zmm2, %zmm3, %zmm3");
|
||||||
|
}
|
||||||
|
|
||||||
|
FLETCHER_4_AVX512_SAVE_CTX(ctx)
|
||||||
|
|
||||||
|
kfpu_end();
|
||||||
|
}
|
||||||
|
STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);
|
||||||
|
|
||||||
|
const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
|
||||||
|
.init_native = fletcher_4_avx512f_init,
|
||||||
|
.fini_native = fletcher_4_avx512f_fini,
|
||||||
|
.compute_native = fletcher_4_avx512f_native,
|
||||||
|
.init_byteswap = fletcher_4_avx512f_init,
|
||||||
|
.fini_byteswap = fletcher_4_avx512f_fini,
|
||||||
|
.compute_byteswap = fletcher_4_avx512bw_byteswap,
|
||||||
|
.valid = fletcher_4_avx512f_valid,
|
||||||
|
.name = "avx512bw"
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */
|
#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */
|
||||||
|
Loading…
Reference in New Issue
Block a user