diff --git a/module/icp/algs/sha2/sha512_impl.c b/module/icp/algs/sha2/sha512_impl.c
index a85a71a83..4206f8f20 100644
--- a/module/icp/algs/sha2/sha512_impl.c
+++ b/module/icp/algs/sha2/sha512_impl.c
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * Copyright (c) 2026, TrueNAS.
  */
 
 #include <sys/simd.h>
 #include <sys/zfs_context.h>
@@ -92,6 +93,20 @@ const sha512_ops_t sha512_avx2_impl = {
 };
 #endif
 
+#if defined(HAVE_SHA512EXT)
+static boolean_t sha2_have_sha512ext(void)
+{
+	return (kfpu_allowed() && zfs_sha512ext_available());
+}
+
+TF(zfs_sha512_transform_sha512ext, tf_sha512_sha512ext);
+const sha512_ops_t sha512_sha512ext_impl = {
+	.is_supported = sha2_have_sha512ext,
+	.transform = tf_sha512_sha512ext,
+	.name = "sha512ext"
+};
+#endif
+
 #elif defined(__aarch64__) || defined(__arm__)
 extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t);
 const sha512_ops_t sha512_armv7_impl = {
@@ -164,6 +179,9 @@ static const sha512_ops_t *const sha512_impls[] = {
 #if defined(__x86_64) && defined(HAVE_AVX2)
 	&sha512_avx2_impl,
 #endif
+#if defined(__x86_64) && defined(HAVE_SHA512EXT)
+	&sha512_sha512ext_impl,
+#endif
 #if defined(__aarch64__) || defined(__arm__)
 	&sha512_armv7_impl,
 #if defined(__aarch64__)
diff --git a/module/icp/asm-x86_64/sha2/sha512-x86_64.S b/module/icp/asm-x86_64/sha2/sha512-x86_64.S
index 9ed50ddc7..47e4edd51 100644
--- a/module/icp/asm-x86_64/sha2/sha512-x86_64.S
+++ b/module/icp/asm-x86_64/sha2/sha512-x86_64.S
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 /*
- * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2004-2025 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -114,6 +114,50 @@ K512:
 .quad	0x0001020304050607,0x08090a0b0c0d0e0f
 .quad	0x0001020304050607,0x08090a0b0c0d0e0f
 
+.balign	64
+SET_OBJ(K512_single)
+K512_single:
+.quad 0x428a2f98d728ae22, 0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538, 0x59f111f1b605d019
+.quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242, 0x12835b0145706fbe
+.quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235, 0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+.quad 0x983e5152ee66dfab, 0xa831c66d2db43210
+.quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+.quad 0x06ca6351e003826f, 0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+.quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6, 0x92722c851482353b
+.quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
+.quad 0xd192e819d6ef5218, 0xd69906245565a910
+.quad 0xf40e35855771202a, 0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+.quad 0x90befffa23631e28, 0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+.quad 0xca273eceea26619c, 0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae, 0x1b710b35131c471b
+.quad 0x28db77f523047d84, 0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+
 ENTRY_ALIGN(zfs_sha512_transform_x64, 16)
 .cfi_startproc
 	ENDBR
@@ -4010,6 +4054,278 @@ ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
 SET_SIZE(zfs_sha512_transform_avx2)
 STACK_FRAME_NON_STANDARD zfs_sha512_transform_avx2
 
+#ifdef HAVE_SHA512EXT
+ENTRY_ALIGN(zfs_sha512_transform_sha512ext, 64)
+.cfi_startproc
+	ENDBR
+	orq	%rdx,%rdx
+	je	.Lsha512ext_done
+
+	vbroadcasti128	1280+K512(%rip),%ymm15
+
+
+
+
+
+
+
+
+
+	vmovdqu	0(%rdi),%ymm0
+	vmovdqu	32(%rdi),%ymm1
+
+	vperm2i128	$0x20,%ymm1,%ymm0,%ymm2
+	vperm2i128	$0x31,%ymm1,%ymm0,%ymm3
+
+	vpermq	$0x1b,%ymm2,%ymm13
+	vpermq	$0x1b,%ymm3,%ymm14
+
+
+	leaq	K512_single(%rip),%r9
+
+.balign	32
+.Lsha512ext_block_loop:
+
+	vmovdqa	%ymm13,%ymm11
+	vmovdqa	%ymm14,%ymm12
+
+
+	vmovdqu	0(%rsi),%ymm0
+	vpshufb	%ymm15,%ymm0,%ymm3
+	vpaddq	0(%r9),%ymm3,%ymm0
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+
+
+	vmovdqu	32(%rsi),%ymm0
+	vpshufb	%ymm15,%ymm0,%ymm4
+	vpaddq	32(%r9),%ymm4,%ymm0
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xdc
+
+
+	vmovdqu	64(%rsi),%ymm0
+	vpshufb	%ymm15,%ymm0,%ymm5
+	vpaddq	64(%r9),%ymm5,%ymm0
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xe5
+
+
+	vmovdqu	96(%rsi),%ymm0
+	vpshufb	%ymm15,%ymm0,%ymm6
+	vpaddq	96(%r9),%ymm6,%ymm0
+	vpermq	$0x1b,%ymm6,%ymm8
+	vpermq	$0x39,%ymm5,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm8
+	vpaddq	%ymm8,%ymm3,%ymm3
+.byte	0xc4,0xe2,0x7f,0xcd,0xde
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xee
+
+	vpaddq	128(%r9),%ymm3,%ymm0
+	vpermq	$0x1b,%ymm3,%ymm8
+	vpermq	$0x39,%ymm6,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm4,%ymm4
+.byte	0xc4,0xe2,0x7f,0xcd,0xe3
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xf3
+
+	vpaddq	160(%r9),%ymm4,%ymm0
+	vpermq	$0x1b,%ymm4,%ymm8
+	vpermq	$0x39,%ymm3,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm5,%ymm5
+.byte	0xc4,0xe2,0x7f,0xcd,0xec
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xdc
+
+	vpaddq	192(%r9),%ymm5,%ymm0
+	vpermq	$0x1b,%ymm5,%ymm8
+	vpermq	$0x39,%ymm4,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm6,%ymm6
+.byte	0xc4,0xe2,0x7f,0xcd,0xf5
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xe5
+
+	vpaddq	224(%r9),%ymm6,%ymm0
+	vpermq	$0x1b,%ymm6,%ymm8
+	vpermq	$0x39,%ymm5,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm3,%ymm3
+.byte	0xc4,0xe2,0x7f,0xcd,0xde
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xee
+
+	vpaddq	256(%r9),%ymm3,%ymm0
+	vpermq	$0x1b,%ymm3,%ymm8
+	vpermq	$0x39,%ymm6,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm4,%ymm4
+.byte	0xc4,0xe2,0x7f,0xcd,0xe3
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xf3
+
+	vpaddq	288(%r9),%ymm4,%ymm0
+	vpermq	$0x1b,%ymm4,%ymm8
+	vpermq	$0x39,%ymm3,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm5,%ymm5
+.byte	0xc4,0xe2,0x7f,0xcd,0xec
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xdc
+
+	vpaddq	320(%r9),%ymm5,%ymm0
+	vpermq	$0x1b,%ymm5,%ymm8
+	vpermq	$0x39,%ymm4,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm6,%ymm6
+.byte	0xc4,0xe2,0x7f,0xcd,0xf5
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xe5
+
+	vpaddq	352(%r9),%ymm6,%ymm0
+	vpermq	$0x1b,%ymm6,%ymm8
+	vpermq	$0x39,%ymm5,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm3,%ymm3
+.byte	0xc4,0xe2,0x7f,0xcd,0xde
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xee
+
+	vpaddq	384(%r9),%ymm3,%ymm0
+	vpermq	$0x1b,%ymm3,%ymm8
+	vpermq	$0x39,%ymm6,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm4,%ymm4
+.byte	0xc4,0xe2,0x7f,0xcd,0xe3
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xf3
+
+	vpaddq	416(%r9),%ymm4,%ymm0
+	vpermq	$0x1b,%ymm4,%ymm8
+	vpermq	$0x39,%ymm3,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm5,%ymm5
+.byte	0xc4,0xe2,0x7f,0xcd,0xec
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xdc
+
+	vpaddq	448(%r9),%ymm5,%ymm0
+	vpermq	$0x1b,%ymm5,%ymm8
+	vpermq	$0x39,%ymm4,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm6,%ymm6
+.byte	0xc4,0xe2,0x7f,0xcd,0xf5
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xe5
+
+	vpaddq	480(%r9),%ymm6,%ymm0
+	vpermq	$0x1b,%ymm6,%ymm8
+	vpermq	$0x39,%ymm5,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm3,%ymm3
+.byte	0xc4,0xe2,0x7f,0xcd,0xde
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xee
+
+	vpaddq	512(%r9),%ymm3,%ymm0
+	vpermq	$0x1b,%ymm3,%ymm8
+	vpermq	$0x39,%ymm6,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm4,%ymm4
+.byte	0xc4,0xe2,0x7f,0xcd,0xe3
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+.byte	0xc4,0xe2,0x7f,0xcc,0xf3
+
+
+	vpaddq	544(%r9),%ymm4,%ymm0
+	vpermq	$0x1b,%ymm4,%ymm8
+	vpermq	$0x39,%ymm3,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm5,%ymm5
+.byte	0xc4,0xe2,0x7f,0xcd,0xec
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+
+
+	vpaddq	576(%r9),%ymm5,%ymm0
+	vpermq	$0x1b,%ymm5,%ymm8
+	vpermq	$0x39,%ymm4,%ymm9
+	vpblendd	$0x3f,%ymm9,%ymm8,%ymm7
+	vpaddq	%ymm7,%ymm6,%ymm6
+.byte	0xc4,0xe2,0x7f,0xcd,0xf5
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+
+
+	vpaddq	608(%r9),%ymm6,%ymm0
+.byte	0xc4,0x62,0x27,0xcb,0xe0
+	vperm2i128	$0x1,%ymm0,%ymm0,%ymm0
+.byte	0xc4,0x62,0x1f,0xcb,0xd8
+
+
+	vpaddq	%ymm12,%ymm14,%ymm14
+	vpaddq	%ymm11,%ymm13,%ymm13
+	addq	$128,%rsi
+	decq	%rdx
+	jnz	.Lsha512ext_block_loop
+
+
+
+
+	vperm2i128	$0x31,%ymm14,%ymm13,%ymm1
+	vperm2i128	$0x20,%ymm14,%ymm13,%ymm2
+	vpermq	$0xb1,%ymm1,%ymm1
+	vpermq	$0xb1,%ymm2,%ymm2
+	vmovdqu	%ymm1,0(%rdi)
+	vmovdqu	%ymm2,32(%rdi)
+
+	vzeroupper
+.Lsha512ext_done:
+	RET
+.cfi_endproc
+SET_SIZE(zfs_sha512_transform_sha512ext)
+STACK_FRAME_NON_STANDARD zfs_sha512_transform_sha512ext
+#endif /* HAVE_SHA512EXT */
+
 /* Workaround for missing asm macro in RHEL 8. */
 #if defined(__linux__) && defined(HAVE_STACK_FRAME_NON_STANDARD) && \
 	! defined(HAVE_STACK_FRAME_NON_STANDARD_ASM)
@@ -4017,6 +4333,9 @@ STACK_FRAME_NON_STANDARD zfs_sha512_transform_avx2
 	.long zfs_sha512_transform_x64 - .
 	.long zfs_sha512_transform_avx - .
 	.long zfs_sha512_transform_avx2 - .
+#ifdef HAVE_SHA512EXT
+	.long zfs_sha512_transform_sha512ext - .
+#endif
 #endif
 
 #if defined(__ELF__)
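
Note: the is_supported hook added in sha512_impl.c gates on kfpu_allowed()
plus zfs_sha512ext_available(); on x86 the latter reduces to the SHA512
feature flag in CPUID leaf 7, sub-leaf 1 (EAX bit 0). The user-space sketch
below shows an equivalent probe for that flag; the helper name
have_sha512ext and the <cpuid.h> usage are illustrative only, not part of
this patch.

	#include <cpuid.h>
	#include <stdio.h>

	/* Probe CPUID.(EAX=7,ECX=1):EAX[0], the SHA512-extension flag. */
	static int have_sha512ext(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* Sub-leaf 0 of leaf 7 reports the maximum sub-leaf in EAX. */
		if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0)
			return (0);
		if (eax < 1)
			return (0);
		if (__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx) == 0)
			return (0);
		return (eax & 1);
	}

	int main(void)
	{
		printf("SHA512 extensions: %s\n",
		    have_sha512ext() ? "available" : "not available");
		return (0);
	}

With a build that defines HAVE_SHA512EXT, the new code path should also be
listed as "sha512ext" by the icp_sha512_impl module parameter, which can be
used to pin an implementation while testing.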