From 47119d60eff666b7da4d230054cf8a113baf9b92 Mon Sep 17 00:00:00 2001
From: Jorgen Lundman
Date: Tue, 7 Mar 2023 08:24:05 +0900
Subject: [PATCH] Restore ASMABI and other Unify work

Make sure all SHA2 transform functions have wrappers.

For ASMABI to work, the calling convention must be consistent.

Reviewed-by: Tino Reichardt
Reviewed-by: Richard Yao
Signed-off-by: Joergen Lundman
Closes #14569
---
 module/icp/algs/blake3/blake3_impl.c       | 20 +++++++++---------
 module/icp/algs/sha2/sha256_impl.c         | 16 ++++++++++++---
 module/icp/algs/sha2/sha512_impl.c         | 15 +++++++++++---
 module/icp/asm-x86_64/sha2/sha256-x86_64.S | 24 +++++++++++-----------
 module/icp/asm-x86_64/sha2/sha512-x86_64.S | 18 ++++++++--------
 5 files changed, 56 insertions(+), 37 deletions(-)

diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c
index f68a5edfe..b59fde1a4 100644
--- a/module/icp/algs/blake3/blake3_impl.c
+++ b/module/icp/algs/blake3/blake3_impl.c
@@ -34,15 +34,15 @@
     (defined(__x86_64) && defined(HAVE_SSE2)) || \
     (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
 
-extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags);
 
-extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags, uint8_t out[64]);
 
-extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
+extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
     size_t num_inputs, size_t blocks, const uint32_t key[8],
     uint64_t counter, boolean_t increment_counter, uint8_t flags,
     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
@@ -100,15 +100,15 @@ const blake3_ops_t blake3_sse2_impl = {
     (defined(__x86_64) && defined(HAVE_SSE2)) || \
     (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
 
-extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags);
 
-extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags, uint8_t out[64]);
 
-extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
+extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
     size_t num_inputs, size_t blocks, const uint32_t key[8],
     uint64_t counter, boolean_t increment_counter, uint8_t flags,
     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
@@ -163,7 +163,7 @@ const blake3_ops_t blake3_sse41_impl = {
 #endif
 
 #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
-extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
+extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
     size_t num_inputs, size_t blocks, const uint32_t key[8],
     uint64_t counter, boolean_t increment_counter, uint8_t flags,
     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
@@ -196,15 +196,15 @@ blake3_avx2_impl = {
 #endif
 
 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
-extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags);
 
-extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags, uint8_t out[64]);
 
-extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
+extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
     size_t num_inputs, size_t blocks, const uint32_t key[8],
     uint64_t counter, boolean_t increment_counter, uint8_t flags,
     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
diff --git a/module/icp/algs/sha2/sha256_impl.c b/module/icp/algs/sha2/sha256_impl.c
index 024cfb1e4..f85a33fb6 100644
--- a/module/icp/algs/sha2/sha256_impl.c
+++ b/module/icp/algs/sha2/sha256_impl.c
@@ -29,9 +29,10 @@
 #include
 #include
+#include
 
 #define TF(E, N) \
-    extern void E(uint32_t s[8], const void *, size_t); \
+    extern void ASMABI E(uint32_t s[8], const void *, size_t); \
     static inline void N(uint32_t s[8], const void *d, size_t b) { \
         kfpu_begin(); E(s, d, b); kfpu_end(); \
     }
@@ -44,10 +45,19 @@ static inline boolean_t sha2_is_supported(void)
 
 #if defined(__x86_64)
 
-extern void zfs_sha256_transform_x64(uint32_t s[8], const void *, size_t);
+/* Users of ASMABI requires all calls to be from wrappers */
+extern void ASMABI
+zfs_sha256_transform_x64(uint32_t s[8], const void *, size_t);
+
+static inline void
+tf_sha256_transform_x64(uint32_t s[8], const void *d, size_t b)
+{
+    zfs_sha256_transform_x64(s, d, b);
+}
+
 const sha256_ops_t sha256_x64_impl = {
     .is_supported = sha2_is_supported,
-    .transform = zfs_sha256_transform_x64,
+    .transform = tf_sha256_transform_x64,
     .name = "x64"
 };
diff --git a/module/icp/algs/sha2/sha512_impl.c b/module/icp/algs/sha2/sha512_impl.c
index d21312336..2a809ccdd 100644
--- a/module/icp/algs/sha2/sha512_impl.c
+++ b/module/icp/algs/sha2/sha512_impl.c
@@ -29,9 +29,10 @@
 #include
 #include
+#include
 
 #define TF(E, N) \
-    extern void E(uint64_t s[8], const void *, size_t); \
+    extern void ASMABI E(uint64_t s[8], const void *, size_t); \
     static inline void N(uint64_t s[8], const void *d, size_t b) { \
         kfpu_begin(); E(s, d, b); kfpu_end(); \
     }
@@ -44,10 +45,18 @@ static inline boolean_t sha2_is_supported(void)
 
 #if defined(__x86_64)
 
-extern void zfs_sha512_transform_x64(uint64_t s[8], const void *, size_t);
+/* Users of ASMABI requires all calls to be from wrappers */
+extern void ASMABI
+zfs_sha512_transform_x64(uint64_t s[8], const void *, size_t);
+
+static inline void
+tf_sha512_transform_x64(uint64_t s[8], const void *d, size_t b)
+{
+    zfs_sha512_transform_x64(s, d, b);
+}
 
 const sha512_ops_t sha512_x64_impl = {
     .is_supported = sha2_is_supported,
-    .transform = zfs_sha512_transform_x64,
+    .transform = tf_sha512_transform_x64,
     .name = "x64"
 };
diff --git a/module/icp/asm-x86_64/sha2/sha256-x86_64.S b/module/icp/asm-x86_64/sha2/sha256-x86_64.S
index da3722f80..d3e5e3f0d 100644
--- a/module/icp/asm-x86_64/sha2/sha256-x86_64.S
+++ b/module/icp/asm-x86_64/sha2/sha256-x86_64.S
@@ -26,8 +26,8 @@
 
 SECTION_STATIC
 
-.align 64
-.type K256,@object
+.balign 64
+SET_OBJ(K256)
 K256:
     .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
     .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -105,7 +105,7 @@ ENTRY_ALIGN(zfs_sha256_transform_x64, 16)
    movl 24(%rdi),%r10d
    movl 28(%rdi),%r11d
    jmp .Lloop
-.align 16
+.balign 16
 .Lloop:
    movl %ebx,%edi
    leaq K256(%rip),%rbp
@@ -622,7 +622,7 @@ ENTRY_ALIGN(zfs_sha256_transform_x64, 16)
    addl %r12d,%eax
    leaq 20(%rbp),%rbp
    jmp .Lrounds_16_xx
-.align 16
+.balign 16
 .Lrounds_16_xx:
    movl 4(%rsp),%r13d
    movl 56(%rsp),%r15d
@@ -1436,7 +1436,7 @@ ENTRY_ALIGN(zfs_sha256_transform_shani, 64)
    punpcklqdq %xmm0,%xmm2
    jmp .Loop_shani
 
-.align 16
+.balign 16
 .Loop_shani:
    movdqu (%rsi),%xmm3
    movdqu 16(%rsi),%xmm4
@@ -1666,7 +1666,7 @@ ENTRY_ALIGN(zfs_sha256_transform_ssse3, 64)
    movl 28(%rdi),%r11d
    jmp .Lloop_ssse3
 
-.align 16
+.balign 16
 .Lloop_ssse3:
    movdqa K256+512(%rip),%xmm7
    movdqu 0(%rsi),%xmm0
@@ -1696,7 +1696,7 @@ ENTRY_ALIGN(zfs_sha256_transform_ssse3, 64)
    movl %r8d,%r13d
    jmp .Lssse3_00_47
 
-.align 16
+.balign 16
 .Lssse3_00_47:
    subq $-128,%rbp
    rorl $14,%r13d
@@ -2779,7 +2779,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx, 64)
    vmovdqa K256+512+32(%rip),%xmm8
    vmovdqa K256+512+64(%rip),%xmm9
    jmp .Lloop_avx
-.align 16
+.balign 16
 .Lloop_avx:
    vmovdqa K256+512(%rip),%xmm7
    vmovdqu 0(%rsi),%xmm0
@@ -2805,7 +2805,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx, 64)
    movl %r8d,%r13d
    jmp .Lavx_00_47
 
-.align 16
+.balign 16
 .Lavx_00_47:
    subq $-128,%rbp
    vpalignr $4,%xmm0,%xmm1,%xmm4
@@ -3858,7 +3858,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx2, 64)
    vmovdqa K256+512+32(%rip),%ymm8
    vmovdqa K256+512+64(%rip),%ymm9
    jmp .Loop_avx2
-.align 16
+.balign 16
 .Loop_avx2:
    vmovdqa K256+512(%rip),%ymm7
    vmovdqu -64+0(%rsi),%xmm0
@@ -3900,7 +3900,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx2, 64)
    subq $-32*4,%rbp
    jmp .Lavx2_00_47
 
-.align 16
+.balign 16
 .Lavx2_00_47:
    leaq -64(%rsp),%rsp
    .cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
@@ -4842,7 +4842,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx2, 64)
    xorl %ecx,%edi
    movl %r9d,%r12d
    jmp .Lower_avx2
-.align 16
+.balign 16
 .Lower_avx2:
    addl 0+16(%rbp),%r11d
    andl %r8d,%r12d
diff --git a/module/icp/asm-x86_64/sha2/sha512-x86_64.S b/module/icp/asm-x86_64/sha2/sha512-x86_64.S
index 29f103965..fbbcca650 100644
--- a/module/icp/asm-x86_64/sha2/sha512-x86_64.S
+++ b/module/icp/asm-x86_64/sha2/sha512-x86_64.S
@@ -26,8 +26,8 @@
 
 SECTION_STATIC
 
-.align 64
-.type K512,@object
+.balign 64
+SET_OBJ(K512)
 K512:
    .quad 0x428a2f98d728ae22,0x7137449123ef65cd
    .quad 0x428a2f98d728ae22,0x7137449123ef65cd
@@ -148,7 +148,7 @@ ENTRY_ALIGN(zfs_sha512_transform_x64, 16)
    movq 48(%rdi),%r10
    movq 56(%rdi),%r11
    jmp .Lloop
-.align 16
+.balign 16
 .Lloop:
    movq %rbx,%rdi
    leaq K512(%rip),%rbp
@@ -665,7 +665,7 @@ ENTRY_ALIGN(zfs_sha512_transform_x64, 16)
    addq %r12,%rax
    leaq 24(%rbp),%rbp
    jmp .Lrounds_16_xx
-.align 16
+.balign 16
 .Lrounds_16_xx:
    movq 8(%rsp),%r13
    movq 112(%rsp),%r15
@@ -1501,7 +1501,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx, 64)
    movq 48(%rdi),%r10
    movq 56(%rdi),%r11
    jmp .Lloop_avx
-.align 16
+.balign 16
 .Lloop_avx:
    vmovdqa K512+1280(%rip),%xmm11
    vmovdqu 0(%rsi),%xmm0
@@ -1543,7 +1543,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx, 64)
    movq %r8,%r13
    jmp .Lavx_00_47
 
-.align 16
+.balign 16
 .Lavx_00_47:
    addq $256,%rbp
    vpalignr $8,%xmm0,%xmm1,%xmm8
@@ -2670,7 +2670,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
    movq 48(%rdi),%r10
    movq 56(%rdi),%r11
    jmp .Loop_avx2
-.align 16
+.balign 16
 .Loop_avx2:
    vmovdqu -128(%rsi),%xmm0
    vmovdqu -128+16(%rsi),%xmm1
@@ -2732,7 +2732,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
    addq $32*8,%rbp
    jmp .Lavx2_00_47
 
-.align 16
+.balign 16
 .Lavx2_00_47:
    leaq -128(%rsp),%rsp
    .cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
@@ -3750,7 +3750,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
    xorq %rcx,%rdi
    movq %r9,%r12
    jmp .Lower_avx2
-.align 16
+.balign 16
 .Lower_avx2:
    addq 0+16(%rbp),%r11
    andq %r8,%r12
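
To illustrate the calling-convention requirement stated in the commit message, the following is a minimal, self-contained C sketch of the wrapper pattern the patch applies in sha256_impl.c and sha512_impl.c. It is an illustration under stated assumptions, not the in-tree code: ASMABI is modeled here as a sysv_abi attribute, the assembly transform is replaced by a dummy C body, sha256_ops_sketch_t is a trimmed-down stand-in for the real sha256_ops_t, and the kfpu_begin()/kfpu_end() guards provided by the TF() macro are omitted.

/*
 * Sketch of the ASMABI wrapper pattern (assumptions noted above).
 * The assembly entry point is pinned to one calling convention via
 * ASMABI; the thin C wrapper is its only caller, so the ops table can
 * keep an ordinary C function pointer.
 */
#include <stdint.h>
#include <stddef.h>
#include <inttypes.h>
#include <stdio.h>

#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
#define ASMABI __attribute__((sysv_abi))  /* assumed stand-in for the real macro */
#else
#define ASMABI
#endif

/* Dummy stand-in for the assembly routine; always carries ASMABI. */
static void ASMABI
zfs_sha256_transform_x64(uint32_t s[8], const void *d, size_t b)
{
	(void) d;
	s[0] += (uint32_t) b;  /* placeholder work for the sketch */
}

/* The only caller of the ASMABI symbol: a thin wrapper with plain C ABI. */
static void
tf_sha256_transform_x64(uint32_t s[8], const void *d, size_t b)
{
	zfs_sha256_transform_x64(s, d, b);
}

/* Trimmed-down stand-in for sha256_ops_t. */
typedef struct {
	void (*transform)(uint32_t s[8], const void *, size_t);
	const char *name;
} sha256_ops_sketch_t;

static const sha256_ops_sketch_t sha256_x64_sketch = {
	.transform = tf_sha256_transform_x64,  /* wrapper, not the asm symbol */
	.name = "x64"
};

int
main(void)
{
	uint32_t state[8] = { 0 };
	uint8_t block[64] = { 0 };

	sha256_x64_sketch.transform(state, block, sizeof (block));
	printf("%s: state[0] = %" PRIu32 "\n", sha256_x64_sketch.name, state[0]);
	return (0);
}

Routing every call through a wrapper keeps the table's .transform an ordinary C function pointer, so call sites never need to know which convention the assembly expects, which is one reading of the "calling convention must be consistent" requirement above.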