icp: add SHA512 implementation using Intel SHA512 extensions

Generated from crypto/sha/asm/sha512-x86_64.pl in
openssl/openssl@241d4826f8.

Sponsored-by: TrueNAS
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Attila Fülöp <attila@fueloep.org>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18233
This commit was authored by Rob Norris on 2026-02-17 14:54:09 +11:00 and committed by Brian Behlendorf.
parent 3547a358fd
commit 09c27a14a3
2 changed files with 338 additions and 1 deletions

View File

@ -22,6 +22,7 @@
/*
* Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
* Copyright (c) 2026, TrueNAS.
*/
#include <sys/simd.h>
@ -92,6 +93,20 @@ const sha512_ops_t sha512_avx2_impl = {
};
#endif
#if defined(HAVE_SHA512EXT)
static boolean_t sha2_have_sha512ext(void)
{
return (kfpu_allowed() && zfs_sha512ext_available());
}
TF(zfs_sha512_transform_sha512ext, tf_sha512_sha512ext);
const sha512_ops_t sha512_sha512ext_impl = {
.is_supported = sha2_have_sha512ext,
.transform = tf_sha512_sha512ext,
.name = "sha512ext"
};
#endif
#elif defined(__aarch64__) || defined(__arm__)
extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t);
const sha512_ops_t sha512_armv7_impl = {
@ -164,6 +179,9 @@ static const sha512_ops_t *const sha512_impls[] = {
#if defined(__x86_64) && defined(HAVE_AVX2)
&sha512_avx2_impl,
#endif
#if defined(__x86_64) && defined(HAVE_SHA512EXT)
&sha512_sha512ext_impl,
#endif
#if defined(__aarch64__) || defined(__arm__)
&sha512_armv7_impl,
#if defined(__aarch64__)

View File

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
/*
* Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2004-2025 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -114,6 +114,50 @@ K512:
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.balign 64
SET_OBJ(K512_single)
K512_single:
.quad 0x428a2f98d728ae22, 0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538, 0x59f111f1b605d019
.quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242, 0x12835b0145706fbe
.quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235, 0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
.quad 0x983e5152ee66dfab, 0xa831c66d2db43210
.quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
.quad 0x06ca6351e003826f, 0x142929670a0e6e70
.quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
.quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6, 0x92722c851482353b
.quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
.quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
.quad 0xd192e819d6ef5218, 0xd69906245565a910
.quad 0xf40e35855771202a, 0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
.quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
.quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
.quad 0x90befffa23631e28, 0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
.quad 0xca273eceea26619c, 0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae, 0x1b710b35131c471b
.quad 0x28db77f523047d84, 0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
ENTRY_ALIGN(zfs_sha512_transform_x64, 16)
.cfi_startproc
ENDBR
@ -4010,6 +4054,278 @@ ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
SET_SIZE(zfs_sha512_transform_avx2)
STACK_FRAME_NON_STANDARD zfs_sha512_transform_avx2
#ifdef HAVE_SHA512EXT
ENTRY_ALIGN(zfs_sha512_transform_sha512ext, 64)
.cfi_startproc
ENDBR
orq %rdx,%rdx
je .Lsha512ext_done
vbroadcasti128 1280+K512(%rip),%ymm15
vmovdqu 0(%rdi),%ymm0
vmovdqu 32(%rdi),%ymm1
vperm2i128 $0x20,%ymm1,%ymm0,%ymm2
vperm2i128 $0x31,%ymm1,%ymm0,%ymm3
vpermq $0x1b,%ymm2,%ymm13
vpermq $0x1b,%ymm3,%ymm14
leaq K512_single(%rip),%r9
.balign 32
.Lsha512ext_block_loop:
vmovdqa %ymm13,%ymm11
vmovdqa %ymm14,%ymm12
vmovdqu 0(%rsi),%ymm0
vpshufb %ymm15,%ymm0,%ymm3
vpaddq 0(%r9),%ymm3,%ymm0
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
vmovdqu 32(%rsi),%ymm0
vpshufb %ymm15,%ymm0,%ymm4
vpaddq 32(%r9),%ymm4,%ymm0
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xdc
vmovdqu 64(%rsi),%ymm0
vpshufb %ymm15,%ymm0,%ymm5
vpaddq 64(%r9),%ymm5,%ymm0
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xe5
vmovdqu 96(%rsi),%ymm0
vpshufb %ymm15,%ymm0,%ymm6
vpaddq 96(%r9),%ymm6,%ymm0
vpermq $0x1b,%ymm6,%ymm8
vpermq $0x39,%ymm5,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm8
vpaddq %ymm8,%ymm3,%ymm3
.byte 0xc4,0xe2,0x7f,0xcd,0xde
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xee
vpaddq 128(%r9),%ymm3,%ymm0
vpermq $0x1b,%ymm3,%ymm8
vpermq $0x39,%ymm6,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm4,%ymm4
.byte 0xc4,0xe2,0x7f,0xcd,0xe3
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xf3
vpaddq 160(%r9),%ymm4,%ymm0
vpermq $0x1b,%ymm4,%ymm8
vpermq $0x39,%ymm3,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm5,%ymm5
.byte 0xc4,0xe2,0x7f,0xcd,0xec
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xdc
vpaddq 192(%r9),%ymm5,%ymm0
vpermq $0x1b,%ymm5,%ymm8
vpermq $0x39,%ymm4,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm6,%ymm6
.byte 0xc4,0xe2,0x7f,0xcd,0xf5
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xe5
vpaddq 224(%r9),%ymm6,%ymm0
vpermq $0x1b,%ymm6,%ymm8
vpermq $0x39,%ymm5,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm3,%ymm3
.byte 0xc4,0xe2,0x7f,0xcd,0xde
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xee
vpaddq 256(%r9),%ymm3,%ymm0
vpermq $0x1b,%ymm3,%ymm8
vpermq $0x39,%ymm6,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm4,%ymm4
.byte 0xc4,0xe2,0x7f,0xcd,0xe3
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xf3
vpaddq 288(%r9),%ymm4,%ymm0
vpermq $0x1b,%ymm4,%ymm8
vpermq $0x39,%ymm3,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm5,%ymm5
.byte 0xc4,0xe2,0x7f,0xcd,0xec
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xdc
vpaddq 320(%r9),%ymm5,%ymm0
vpermq $0x1b,%ymm5,%ymm8
vpermq $0x39,%ymm4,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm6,%ymm6
.byte 0xc4,0xe2,0x7f,0xcd,0xf5
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xe5
vpaddq 352(%r9),%ymm6,%ymm0
vpermq $0x1b,%ymm6,%ymm8
vpermq $0x39,%ymm5,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm3,%ymm3
.byte 0xc4,0xe2,0x7f,0xcd,0xde
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xee
vpaddq 384(%r9),%ymm3,%ymm0
vpermq $0x1b,%ymm3,%ymm8
vpermq $0x39,%ymm6,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm4,%ymm4
.byte 0xc4,0xe2,0x7f,0xcd,0xe3
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xf3
vpaddq 416(%r9),%ymm4,%ymm0
vpermq $0x1b,%ymm4,%ymm8
vpermq $0x39,%ymm3,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm5,%ymm5
.byte 0xc4,0xe2,0x7f,0xcd,0xec
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xdc
vpaddq 448(%r9),%ymm5,%ymm0
vpermq $0x1b,%ymm5,%ymm8
vpermq $0x39,%ymm4,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm6,%ymm6
.byte 0xc4,0xe2,0x7f,0xcd,0xf5
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xe5
vpaddq 480(%r9),%ymm6,%ymm0
vpermq $0x1b,%ymm6,%ymm8
vpermq $0x39,%ymm5,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm3,%ymm3
.byte 0xc4,0xe2,0x7f,0xcd,0xde
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xee
vpaddq 512(%r9),%ymm3,%ymm0
vpermq $0x1b,%ymm3,%ymm8
vpermq $0x39,%ymm6,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm4,%ymm4
.byte 0xc4,0xe2,0x7f,0xcd,0xe3
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
.byte 0xc4,0xe2,0x7f,0xcc,0xf3
vpaddq 544(%r9),%ymm4,%ymm0
vpermq $0x1b,%ymm4,%ymm8
vpermq $0x39,%ymm3,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm5,%ymm5
.byte 0xc4,0xe2,0x7f,0xcd,0xec
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
vpaddq 576(%r9),%ymm5,%ymm0
vpermq $0x1b,%ymm5,%ymm8
vpermq $0x39,%ymm4,%ymm9
vpblendd $0x3f,%ymm9,%ymm8,%ymm7
vpaddq %ymm7,%ymm6,%ymm6
.byte 0xc4,0xe2,0x7f,0xcd,0xf5
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
vpaddq 608(%r9),%ymm6,%ymm0
.byte 0xc4,0x62,0x27,0xcb,0xe0
vperm2i128 $0x1,%ymm0,%ymm0,%ymm0
.byte 0xc4,0x62,0x1f,0xcb,0xd8
vpaddq %ymm12,%ymm14,%ymm14
vpaddq %ymm11,%ymm13,%ymm13
addq $128,%rsi
decq %rdx
jnz .Lsha512ext_block_loop
vperm2i128 $0x31,%ymm14,%ymm13,%ymm1
vperm2i128 $0x20,%ymm14,%ymm13,%ymm2
vpermq $0xb1,%ymm1,%ymm1
vpermq $0xb1,%ymm2,%ymm2
vmovdqu %ymm1,0(%rdi)
vmovdqu %ymm2,32(%rdi)
vzeroupper
.Lsha512ext_done:
RET
.cfi_endproc
SET_SIZE(zfs_sha512_transform_sha512ext)
STACK_FRAME_NON_STANDARD zfs_sha512_transform_sha512ext
#endif /* HAVE_SHA512EXT */
/* Workaround for missing asm macro in RHEL 8. */
#if defined(__linux__) && defined(HAVE_STACK_FRAME_NON_STANDARD) && \
! defined(HAVE_STACK_FRAME_NON_STANDARD_ASM)
@ -4017,6 +4333,9 @@ STACK_FRAME_NON_STANDARD zfs_sha512_transform_avx2
.long zfs_sha512_transform_x64 - .
.long zfs_sha512_transform_avx - .
.long zfs_sha512_transform_avx2 - .
#ifdef HAVE_SHA512EXT
.long zfs_sha512_transform_sha512ext - .
#endif
#endif
#if defined(__ELF__)