Restore ASMABI and other Unify work

Make sure all SHA2 transform functions have wrappers

For ASMABI to work, the calling convention must be consistent.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Signed-off-by: Joergen Lundman <lundman@lundman.net>
Closes #14569
Jorgen Lundman, 2023-03-07 08:24:05 +09:00 (committed by GitHub)
commit 47119d60ef, parent 620a977f22
5 changed files with 56 additions and 37 deletions
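
For context, the pattern this commit restores looks roughly like the sketch
below: the assembly routine is declared with the ASMABI calling-convention
annotation and is only ever reached through a thin C wrapper, so every call
into the assembly uses one consistent ABI regardless of the caller. This is
an illustrative sketch under assumptions, not the OpenZFS code itself;
MY_ASMABI and the my_* names are hypothetical stand-ins for ASMABI and the
real zfs_* symbols.

/* Illustrative sketch only: MY_ASMABI and the my_* names are hypothetical. */
#include <stddef.h>
#include <stdint.h>

#if defined(__GNUC__) && defined(__x86_64__)
#define MY_ASMABI __attribute__((sysv_abi))	/* pin the System V ABI */
#else
#define MY_ASMABI				/* no-op elsewhere */
#endif

/* Assembly implementation; only the wrapper below should call it. */
extern void MY_ASMABI
my_sha256_transform_asm(uint32_t state[8], const void *data, size_t blocks);

/* C wrapper: callers see a plain C function with a consistent convention. */
static inline void
my_sha256_transform(uint32_t state[8], const void *data, size_t blocks)
{
	my_sha256_transform_asm(state, data, blocks);
}

On targets that already use the System V ABI the attribute changes nothing;
on targets that do not, it keeps the hand-written assembly and its C callers
in agreement. Routing every call through a wrapper also lets the ops tables
in the diff store ordinary C function pointers rather than pointers with a
non-default convention.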


@@ -34,15 +34,15 @@
     (defined(__x86_64) && defined(HAVE_SSE2)) || \
     (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
-extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags);
-extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags, uint8_t out[64]);
-extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
+extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
     size_t num_inputs, size_t blocks, const uint32_t key[8],
     uint64_t counter, boolean_t increment_counter, uint8_t flags,
     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
@@ -100,15 +100,15 @@ const blake3_ops_t blake3_sse2_impl = {
     (defined(__x86_64) && defined(HAVE_SSE2)) || \
     (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
-extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags);
-extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags, uint8_t out[64]);
-extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
+extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
     size_t num_inputs, size_t blocks, const uint32_t key[8],
     uint64_t counter, boolean_t increment_counter, uint8_t flags,
     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
@@ -163,7 +163,7 @@ const blake3_ops_t blake3_sse41_impl = {
 #endif
 #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
-extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
+extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
     size_t num_inputs, size_t blocks, const uint32_t key[8],
     uint64_t counter, boolean_t increment_counter, uint8_t flags,
     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
@@ -196,15 +196,15 @@ blake3_avx2_impl = {
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
-extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags);
-extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags, uint8_t out[64]);
-extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
+extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
     size_t num_inputs, size_t blocks, const uint32_t key[8],
     uint64_t counter, boolean_t increment_counter, uint8_t flags,
     uint8_t flags_start, uint8_t flags_end, uint8_t *out);


@@ -29,9 +29,10 @@
 #include <sys/simd.h>
 #include <sha2/sha2_impl.h>
+#include <sys/asm_linkage.h>
 #define TF(E, N) \
-	extern void E(uint32_t s[8], const void *, size_t); \
+	extern void ASMABI E(uint32_t s[8], const void *, size_t); \
 	static inline void N(uint32_t s[8], const void *d, size_t b) { \
 		kfpu_begin(); E(s, d, b); kfpu_end(); \
 	}
@@ -44,10 +45,19 @@ static inline boolean_t sha2_is_supported(void)
 #if defined(__x86_64)
-extern void zfs_sha256_transform_x64(uint32_t s[8], const void *, size_t);
+/* Users of ASMABI requires all calls to be from wrappers */
+extern void ASMABI
+zfs_sha256_transform_x64(uint32_t s[8], const void *, size_t);
+
+static inline void
+tf_sha256_transform_x64(uint32_t s[8], const void *d, size_t b)
+{
+	zfs_sha256_transform_x64(s, d, b);
+}
 const sha256_ops_t sha256_x64_impl = {
 	.is_supported = sha2_is_supported,
-	.transform = zfs_sha256_transform_x64,
+	.transform = tf_sha256_transform_x64,
 	.name = "x64"
 };
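
For reference, an instantiation of the TF() macro above expands to roughly
the sketch below. The wrapper name tf_sha256_shani is a hypothetical example,
but zfs_sha256_transform_shani (defined in the assembly diff further down) is
the kind of SIMD routine the macro is applied to, and kfpu_begin()/kfpu_end()
bracket kernel FPU/SIMD use around the assembly call:

/* Rough expansion of TF(zfs_sha256_transform_shani, tf_sha256_shani) */
extern void ASMABI zfs_sha256_transform_shani(uint32_t s[8], const void *, size_t);
static inline void tf_sha256_shani(uint32_t s[8], const void *d, size_t b)
{
	kfpu_begin();				/* enter kernel FPU/SIMD context */
	zfs_sha256_transform_shani(s, d, b);	/* ASMABI call into assembly */
	kfpu_end();				/* leave kernel FPU/SIMD context */
}

The plain x64 variant above gets a hand-written wrapper instead because it
needs no kfpu_begin()/kfpu_end(), yet it is still only called through a
wrapper so the ASMABI rule holds everywhere.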


@@ -29,9 +29,10 @@
 #include <sys/simd.h>
 #include <sha2/sha2_impl.h>
+#include <sys/asm_linkage.h>
 #define TF(E, N) \
-	extern void E(uint64_t s[8], const void *, size_t); \
+	extern void ASMABI E(uint64_t s[8], const void *, size_t); \
 	static inline void N(uint64_t s[8], const void *d, size_t b) { \
 		kfpu_begin(); E(s, d, b); kfpu_end(); \
 	}
@@ -44,10 +45,18 @@ static inline boolean_t sha2_is_supported(void)
 #if defined(__x86_64)
-extern void zfs_sha512_transform_x64(uint64_t s[8], const void *, size_t);
+/* Users of ASMABI requires all calls to be from wrappers */
+extern void ASMABI
+zfs_sha512_transform_x64(uint64_t s[8], const void *, size_t);
+
+static inline void
+tf_sha512_transform_x64(uint64_t s[8], const void *d, size_t b)
+{
+	zfs_sha512_transform_x64(s, d, b);
+}
 const sha512_ops_t sha512_x64_impl = {
 	.is_supported = sha2_is_supported,
-	.transform = zfs_sha512_transform_x64,
+	.transform = tf_sha512_transform_x64,
 	.name = "x64"
 };


@@ -26,8 +26,8 @@
 SECTION_STATIC
-.align 64
-.type K256,@object
+.balign 64
+SET_OBJ(K256)
 K256:
 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -105,7 +105,7 @@ ENTRY_ALIGN(zfs_sha256_transform_x64, 16)
 	movl 24(%rdi),%r10d
 	movl 28(%rdi),%r11d
 	jmp .Lloop
-.align 16
+.balign 16
 .Lloop:
 	movl %ebx,%edi
 	leaq K256(%rip),%rbp
@@ -622,7 +622,7 @@ ENTRY_ALIGN(zfs_sha256_transform_x64, 16)
 	addl %r12d,%eax
 	leaq 20(%rbp),%rbp
 	jmp .Lrounds_16_xx
-.align 16
+.balign 16
 .Lrounds_16_xx:
 	movl 4(%rsp),%r13d
 	movl 56(%rsp),%r15d
@@ -1436,7 +1436,7 @@ ENTRY_ALIGN(zfs_sha256_transform_shani, 64)
 	punpcklqdq %xmm0,%xmm2
 	jmp .Loop_shani
-.align 16
+.balign 16
 .Loop_shani:
 	movdqu (%rsi),%xmm3
 	movdqu 16(%rsi),%xmm4
@@ -1666,7 +1666,7 @@ ENTRY_ALIGN(zfs_sha256_transform_ssse3, 64)
 	movl 28(%rdi),%r11d
 	jmp .Lloop_ssse3
-.align 16
+.balign 16
 .Lloop_ssse3:
 	movdqa K256+512(%rip),%xmm7
 	movdqu 0(%rsi),%xmm0
@@ -1696,7 +1696,7 @@ ENTRY_ALIGN(zfs_sha256_transform_ssse3, 64)
 	movl %r8d,%r13d
 	jmp .Lssse3_00_47
-.align 16
+.balign 16
 .Lssse3_00_47:
 	subq $-128,%rbp
 	rorl $14,%r13d
@@ -2779,7 +2779,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx, 64)
 	vmovdqa K256+512+32(%rip),%xmm8
 	vmovdqa K256+512+64(%rip),%xmm9
 	jmp .Lloop_avx
-.align 16
+.balign 16
 .Lloop_avx:
 	vmovdqa K256+512(%rip),%xmm7
 	vmovdqu 0(%rsi),%xmm0
@@ -2805,7 +2805,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx, 64)
 	movl %r8d,%r13d
 	jmp .Lavx_00_47
-.align 16
+.balign 16
 .Lavx_00_47:
 	subq $-128,%rbp
 	vpalignr $4,%xmm0,%xmm1,%xmm4
@@ -3858,7 +3858,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx2, 64)
 	vmovdqa K256+512+32(%rip),%ymm8
 	vmovdqa K256+512+64(%rip),%ymm9
 	jmp .Loop_avx2
-.align 16
+.balign 16
 .Loop_avx2:
 	vmovdqa K256+512(%rip),%ymm7
 	vmovdqu -64+0(%rsi),%xmm0
@@ -3900,7 +3900,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx2, 64)
 	subq $-32*4,%rbp
 	jmp .Lavx2_00_47
-.align 16
+.balign 16
 .Lavx2_00_47:
 	leaq -64(%rsp),%rsp
 .cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
@@ -4842,7 +4842,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx2, 64)
 	xorl %ecx,%edi
 	movl %r9d,%r12d
 	jmp .Lower_avx2
-.align 16
+.balign 16
 .Lower_avx2:
 	addl 0+16(%rbp),%r11d
 	andl %r8d,%r12d


@@ -26,8 +26,8 @@
 SECTION_STATIC
-.align 64
-.type K512,@object
+.balign 64
+SET_OBJ(K512)
 K512:
 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
@@ -148,7 +148,7 @@ ENTRY_ALIGN(zfs_sha512_transform_x64, 16)
 	movq 48(%rdi),%r10
 	movq 56(%rdi),%r11
 	jmp .Lloop
-.align 16
+.balign 16
 .Lloop:
 	movq %rbx,%rdi
 	leaq K512(%rip),%rbp
@@ -665,7 +665,7 @@ ENTRY_ALIGN(zfs_sha512_transform_x64, 16)
 	addq %r12,%rax
 	leaq 24(%rbp),%rbp
 	jmp .Lrounds_16_xx
-.align 16
+.balign 16
 .Lrounds_16_xx:
 	movq 8(%rsp),%r13
 	movq 112(%rsp),%r15
@@ -1501,7 +1501,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx, 64)
 	movq 48(%rdi),%r10
 	movq 56(%rdi),%r11
 	jmp .Lloop_avx
-.align 16
+.balign 16
 .Lloop_avx:
 	vmovdqa K512+1280(%rip),%xmm11
 	vmovdqu 0(%rsi),%xmm0
@@ -1543,7 +1543,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx, 64)
 	movq %r8,%r13
 	jmp .Lavx_00_47
-.align 16
+.balign 16
 .Lavx_00_47:
 	addq $256,%rbp
 	vpalignr $8,%xmm0,%xmm1,%xmm8
@@ -2670,7 +2670,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
 	movq 48(%rdi),%r10
 	movq 56(%rdi),%r11
 	jmp .Loop_avx2
-.align 16
+.balign 16
 .Loop_avx2:
 	vmovdqu -128(%rsi),%xmm0
 	vmovdqu -128+16(%rsi),%xmm1
@@ -2732,7 +2732,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
 	addq $32*8,%rbp
 	jmp .Lavx2_00_47
-.align 16
+.balign 16
 .Lavx2_00_47:
 	leaq -128(%rsp),%rsp
 .cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
@@ -3750,7 +3750,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
 	xorq %rcx,%rdi
 	movq %r9,%r12
 	jmp .Lower_avx2
-.align 16
+.balign 16
 .Lower_avx2:
 	addq 0+16(%rbp),%r11
 	andq %r8,%r12