From 47119d60eff666b7da4d230054cf8a113baf9b92 Mon Sep 17 00:00:00 2001
From: Jorgen Lundman
Date: Tue, 7 Mar 2023 08:24:05 +0900
Subject: [PATCH] Restore ASMABI and other Unify work

Make sure all SHA2 transform functions have wrappers.

For ASMABI to work, the calling convention must be consistent.

Reviewed-by: Tino Reichardt
Reviewed-by: Richard Yao
Signed-off-by: Joergen Lundman
Closes #14569
---
 module/icp/algs/blake3/blake3_impl.c       | 20 +++++++++---------
 module/icp/algs/sha2/sha256_impl.c         | 16 ++++++++++++---
 module/icp/algs/sha2/sha512_impl.c         | 15 +++++++++++---
 module/icp/asm-x86_64/sha2/sha256-x86_64.S | 24 +++++++++++-----------
 module/icp/asm-x86_64/sha2/sha512-x86_64.S | 18 ++++++++--------
 5 files changed, 56 insertions(+), 37 deletions(-)

diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c
index f68a5edfe..b59fde1a4 100644
--- a/module/icp/algs/blake3/blake3_impl.c
+++ b/module/icp/algs/blake3/blake3_impl.c
@@ -34,15 +34,15 @@
     (defined(__x86_64) && defined(HAVE_SSE2)) || \
     (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
 
-extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags);
 
-extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags, uint8_t out[64]);
 
-extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
+extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
     size_t num_inputs, size_t blocks, const uint32_t key[8],
     uint64_t counter, boolean_t increment_counter, uint8_t flags,
     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
@@ -100,15 +100,15 @@ const blake3_ops_t blake3_sse2_impl = {
     (defined(__x86_64) && defined(HAVE_SSE2)) || \
     (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
 
-extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags);
 
-extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags, uint8_t out[64]);
 
-extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
+extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
     size_t num_inputs, size_t blocks, const uint32_t key[8],
     uint64_t counter, boolean_t increment_counter, uint8_t flags,
     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
@@ -163,7 +163,7 @@ const blake3_ops_t blake3_sse41_impl = {
 #endif
 
 #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
-extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
+extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
     size_t num_inputs, size_t blocks, const uint32_t key[8],
     uint64_t counter, boolean_t increment_counter, uint8_t flags,
     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
@@ -196,15 +196,15 @@ blake3_avx2_impl = {
 #endif
 
 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
-extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags);
 
-extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
+extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
     uint64_t counter, uint8_t flags, uint8_t out[64]);
 
-extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
+extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
     size_t num_inputs, size_t blocks, const uint32_t key[8],
     uint64_t counter, boolean_t increment_counter, uint8_t flags,
     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
diff --git a/module/icp/algs/sha2/sha256_impl.c b/module/icp/algs/sha2/sha256_impl.c
index 024cfb1e4..f85a33fb6 100644
--- a/module/icp/algs/sha2/sha256_impl.c
+++ b/module/icp/algs/sha2/sha256_impl.c
@@ -29,9 +29,10 @@
 #include
 #include
+#include
 
 #define TF(E, N) \
-    extern void E(uint32_t s[8], const void *, size_t); \
+    extern void ASMABI E(uint32_t s[8], const void *, size_t); \
     static inline void N(uint32_t s[8], const void *d, size_t b) { \
         kfpu_begin(); E(s, d, b); kfpu_end(); \
     }
@@ -44,10 +45,19 @@ static inline boolean_t sha2_is_supported(void)
 
 #if defined(__x86_64)
 
-extern void zfs_sha256_transform_x64(uint32_t s[8], const void *, size_t);
+/* Users of ASMABI requires all calls to be from wrappers */
+extern void ASMABI
+zfs_sha256_transform_x64(uint32_t s[8], const void *, size_t);
+
+static inline void
+tf_sha256_transform_x64(uint32_t s[8], const void *d, size_t b)
+{
+    zfs_sha256_transform_x64(s, d, b);
+}
+
 const sha256_ops_t sha256_x64_impl = {
     .is_supported = sha2_is_supported,
-    .transform = zfs_sha256_transform_x64,
+    .transform = tf_sha256_transform_x64,
     .name = "x64"
 };
diff --git a/module/icp/algs/sha2/sha512_impl.c b/module/icp/algs/sha2/sha512_impl.c
index d21312336..2a809ccdd 100644
--- a/module/icp/algs/sha2/sha512_impl.c
+++ b/module/icp/algs/sha2/sha512_impl.c
@@ -29,9 +29,10 @@
 #include
 #include
+#include
 
 #define TF(E, N) \
-    extern void E(uint64_t s[8], const void *, size_t); \
+    extern void ASMABI E(uint64_t s[8], const void *, size_t); \
     static inline void N(uint64_t s[8], const void *d, size_t b) { \
         kfpu_begin(); E(s, d, b); kfpu_end(); \
     }
@@ -44,10 +45,18 @@ static inline boolean_t sha2_is_supported(void)
 
 #if defined(__x86_64)
 
-extern void zfs_sha512_transform_x64(uint64_t s[8], const void *, size_t);
+/* Users of ASMABI requires all calls to be from wrappers */
+extern void ASMABI
+zfs_sha512_transform_x64(uint64_t s[8], const void *, size_t);
+
+static inline void
+tf_sha512_transform_x64(uint64_t s[8], const void *d, size_t b)
+{
+    zfs_sha512_transform_x64(s, d, b);
+}
 
 const sha512_ops_t sha512_x64_impl = {
     .is_supported = sha2_is_supported,
-    .transform = zfs_sha512_transform_x64,
+    .transform = tf_sha512_transform_x64,
     .name = "x64"
 };
diff --git a/module/icp/asm-x86_64/sha2/sha256-x86_64.S b/module/icp/asm-x86_64/sha2/sha256-x86_64.S
index da3722f80..d3e5e3f0d 100644
--- a/module/icp/asm-x86_64/sha2/sha256-x86_64.S
+++ b/module/icp/asm-x86_64/sha2/sha256-x86_64.S
@@ -26,8 +26,8 @@
 
 SECTION_STATIC
 
-.align 64
-.type K256,@object
+.balign 64
+SET_OBJ(K256)
 K256:
     .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
     .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -105,7 +105,7 @@ ENTRY_ALIGN(zfs_sha256_transform_x64, 16)
    movl 24(%rdi),%r10d
    movl 28(%rdi),%r11d
    jmp .Lloop
-.align 16
+.balign 16
 .Lloop:
    movl %ebx,%edi
    leaq K256(%rip),%rbp
@@ -622,7 +622,7 @@ ENTRY_ALIGN(zfs_sha256_transform_x64, 16)
    addl %r12d,%eax
    leaq 20(%rbp),%rbp
    jmp .Lrounds_16_xx
-.align 16
+.balign 16
 .Lrounds_16_xx:
    movl 4(%rsp),%r13d
    movl 56(%rsp),%r15d
@@ -1436,7 +1436,7 @@ ENTRY_ALIGN(zfs_sha256_transform_shani, 64)
    punpcklqdq %xmm0,%xmm2
    jmp .Loop_shani
 
-.align 16
+.balign 16
 .Loop_shani:
    movdqu (%rsi),%xmm3
    movdqu 16(%rsi),%xmm4
@@ -1666,7 +1666,7 @@ ENTRY_ALIGN(zfs_sha256_transform_ssse3, 64)
    movl 28(%rdi),%r11d
    jmp .Lloop_ssse3
 
-.align 16
+.balign 16
 .Lloop_ssse3:
    movdqa K256+512(%rip),%xmm7
    movdqu 0(%rsi),%xmm0
@@ -1696,7 +1696,7 @@ ENTRY_ALIGN(zfs_sha256_transform_ssse3, 64)
    movl %r8d,%r13d
    jmp .Lssse3_00_47
 
-.align 16
+.balign 16
 .Lssse3_00_47:
    subq $-128,%rbp
    rorl $14,%r13d
@@ -2779,7 +2779,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx, 64)
    vmovdqa K256+512+32(%rip),%xmm8
    vmovdqa K256+512+64(%rip),%xmm9
    jmp .Lloop_avx
-.align 16
+.balign 16
 .Lloop_avx:
    vmovdqa K256+512(%rip),%xmm7
    vmovdqu 0(%rsi),%xmm0
@@ -2805,7 +2805,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx, 64)
    movl %r8d,%r13d
    jmp .Lavx_00_47
 
-.align 16
+.balign 16
 .Lavx_00_47:
    subq $-128,%rbp
    vpalignr $4,%xmm0,%xmm1,%xmm4
@@ -3858,7 +3858,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx2, 64)
    vmovdqa K256+512+32(%rip),%ymm8
    vmovdqa K256+512+64(%rip),%ymm9
    jmp .Loop_avx2
-.align 16
+.balign 16
 .Loop_avx2:
    vmovdqa K256+512(%rip),%ymm7
    vmovdqu -64+0(%rsi),%xmm0
@@ -3900,7 +3900,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx2, 64)
    subq $-32*4,%rbp
    jmp .Lavx2_00_47
 
-.align 16
+.balign 16
 .Lavx2_00_47:
    leaq -64(%rsp),%rsp
    .cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
@@ -4842,7 +4842,7 @@ ENTRY_ALIGN(zfs_sha256_transform_avx2, 64)
    xorl %ecx,%edi
    movl %r9d,%r12d
    jmp .Lower_avx2
-.align 16
+.balign 16
 .Lower_avx2:
    addl 0+16(%rbp),%r11d
    andl %r8d,%r12d
diff --git a/module/icp/asm-x86_64/sha2/sha512-x86_64.S b/module/icp/asm-x86_64/sha2/sha512-x86_64.S
index 29f103965..fbbcca650 100644
--- a/module/icp/asm-x86_64/sha2/sha512-x86_64.S
+++ b/module/icp/asm-x86_64/sha2/sha512-x86_64.S
@@ -26,8 +26,8 @@
 
 SECTION_STATIC
 
-.align 64
-.type K512,@object
+.balign 64
+SET_OBJ(K512)
 K512:
    .quad 0x428a2f98d728ae22,0x7137449123ef65cd
    .quad 0x428a2f98d728ae22,0x7137449123ef65cd
@@ -148,7 +148,7 @@ ENTRY_ALIGN(zfs_sha512_transform_x64, 16)
    movq 48(%rdi),%r10
    movq 56(%rdi),%r11
    jmp .Lloop
-.align 16
+.balign 16
 .Lloop:
    movq %rbx,%rdi
    leaq K512(%rip),%rbp
@@ -665,7 +665,7 @@ ENTRY_ALIGN(zfs_sha512_transform_x64, 16)
    addq %r12,%rax
    leaq 24(%rbp),%rbp
    jmp .Lrounds_16_xx
-.align 16
+.balign 16
 .Lrounds_16_xx:
    movq 8(%rsp),%r13
    movq 112(%rsp),%r15
@@ -1501,7 +1501,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx, 64)
    movq 48(%rdi),%r10
    movq 56(%rdi),%r11
    jmp .Lloop_avx
-.align 16
+.balign 16
 .Lloop_avx:
    vmovdqa K512+1280(%rip),%xmm11
    vmovdqu 0(%rsi),%xmm0
@@ -1543,7 +1543,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx, 64)
    movq %r8,%r13
    jmp .Lavx_00_47
 
-.align 16
+.balign 16
 .Lavx_00_47:
    addq $256,%rbp
    vpalignr $8,%xmm0,%xmm1,%xmm8
@@ -2670,7 +2670,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
    movq 48(%rdi),%r10
    movq 56(%rdi),%r11
    jmp .Loop_avx2
-.align 16
+.balign 16
 .Loop_avx2:
    vmovdqu -128(%rsi),%xmm0
    vmovdqu -128+16(%rsi),%xmm1
@@ -2732,7 +2732,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
    addq $32*8,%rbp
    jmp .Lavx2_00_47
 
-.align 16
+.balign 16
 .Lavx2_00_47:
    leaq -128(%rsp),%rsp
    .cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
@@ -3750,7 +3750,7 @@ ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
    xorq %rcx,%rdi
    movq %r9,%r12
    jmp .Lower_avx2
-.align 16
+.balign 16
 .Lower_avx2:
    addq 0+16(%rbp),%r11
    andq %r8,%r12
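
To illustrate the calling-convention requirement stated in the commit message, the following is a minimal, self-contained C sketch of the wrapper pattern the patch applies in sha256_impl.c and sha512_impl.c. It is an illustration under stated assumptions, not the in-tree code: ASMABI is modeled here as a sysv_abi attribute, the assembly transform is replaced by a dummy C body, sha256_ops_sketch_t is a trimmed-down stand-in for the real sha256_ops_t, and the kfpu_begin()/kfpu_end() guards provided by the TF() macro are omitted.

/*
 * Sketch of the ASMABI wrapper pattern (assumptions noted above).
 * The assembly entry point is pinned to one calling convention via
 * ASMABI; the thin C wrapper is its only caller, so the ops table can
 * keep an ordinary C function pointer.
 */
#include <stdint.h>
#include <stddef.h>
#include <inttypes.h>
#include <stdio.h>

#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
#define ASMABI __attribute__((sysv_abi))  /* assumed stand-in for the real macro */
#else
#define ASMABI
#endif

/* Dummy stand-in for the assembly routine; always carries ASMABI. */
static void ASMABI
zfs_sha256_transform_x64(uint32_t s[8], const void *d, size_t b)
{
	(void) d;
	s[0] += (uint32_t) b;  /* placeholder work for the sketch */
}

/* The only caller of the ASMABI symbol: a thin wrapper with plain C ABI. */
static void
tf_sha256_transform_x64(uint32_t s[8], const void *d, size_t b)
{
	zfs_sha256_transform_x64(s, d, b);
}

/* Trimmed-down stand-in for sha256_ops_t. */
typedef struct {
	void (*transform)(uint32_t s[8], const void *, size_t);
	const char *name;
} sha256_ops_sketch_t;

static const sha256_ops_sketch_t sha256_x64_sketch = {
	.transform = tf_sha256_transform_x64,  /* wrapper, not the asm symbol */
	.name = "x64"
};

int
main(void)
{
	uint32_t state[8] = { 0 };
	uint8_t block[64] = { 0 };

	sha256_x64_sketch.transform(state, block, sizeof (block));
	printf("%s: state[0] = %" PRIu32 "\n", sha256_x64_sketch.name, state[0]);
	return (0);
}

Routing every call through a wrapper keeps the table's .transform an ordinary C function pointer, so call sites never need to know which convention the assembly expects, which is one reading of the "calling convention must be consistent" requirement above.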