mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-13 11:40:25 +03:00
e8beeaa111
While evaluating other assembler implementations it turns out that the precomputed hash subkey tables vary in size, from 8*16 bytes (avx2/avx512) up to 48*16 bytes (avx512-vaes), depending on the implementation. To be able to handle the size differences later, allocate `gcm_Htable` dynamically rather then having a fixed size array, and adapt consumers. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Attila Fülöp <attila@fueloep.org> Closes #11102
1262 lines
31 KiB
ArmAsm
1262 lines
31 KiB
ArmAsm
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License 2.0 (the "License"). You may not use
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
# in the file LICENSE in the source distribution or at
|
|
# https://www.openssl.org/source/license.html
|
|
|
|
#
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
# project. The module is, however, dual licensed under OpenSSL and
|
|
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
# details see http://www.openssl.org/~appro/cryptogams/.
|
|
# ====================================================================
|
|
#
|
|
#
|
|
# AES-NI-CTR+GHASH stitch.
|
|
#
|
|
# February 2013
|
|
#
|
|
# OpenSSL GCM implementation is organized in such way that its
|
|
# performance is rather close to the sum of its streamed components,
|
|
# in the context parallelized AES-NI CTR and modulo-scheduled
|
|
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
|
|
# was observed to perform significantly better than the sum of the
|
|
# components on contemporary CPUs, the effort was deemed impossible to
|
|
# justify. This module is based on combination of Intel submissions,
|
|
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
|
|
# Locktyukhin of Intel Corp. who verified that it reduces shuffles
|
|
# pressure with notable relative improvement, achieving 1.0 cycle per
|
|
# byte processed with 128-bit key on Haswell processor, 0.74 - on
|
|
# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
|
|
# measurements for favourable packet size, one divisible by 96.
|
|
# Applications using the EVP interface will observe a few percent
|
|
# worse performance.]
|
|
#
|
|
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
|
|
#
|
|
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
|
|
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
|
|
|
|
# Generated once from
|
|
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
|
|
# and modified for ICP. Modification are kept at a bare minimum to ease later
|
|
# upstream merges.
|
|
|
|
#if defined(__x86_64__) && defined(HAVE_AVX) && \
|
|
defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
|
|
|
|
.extern gcm_avx_can_use_movbe
|
|
|
|
.text
|
|
|
|
#ifdef HAVE_MOVBE
|
|
.type _aesni_ctr32_ghash_6x,@function
|
|
.align 32
|
|
_aesni_ctr32_ghash_6x:
|
|
.cfi_startproc
|
|
vmovdqu 32(%r11),%xmm2
|
|
subq $6,%rdx
|
|
vpxor %xmm4,%xmm4,%xmm4
|
|
vmovdqu 0-128(%rcx),%xmm15
|
|
vpaddb %xmm2,%xmm1,%xmm10
|
|
vpaddb %xmm2,%xmm10,%xmm11
|
|
vpaddb %xmm2,%xmm11,%xmm12
|
|
vpaddb %xmm2,%xmm12,%xmm13
|
|
vpaddb %xmm2,%xmm13,%xmm14
|
|
vpxor %xmm15,%xmm1,%xmm9
|
|
vmovdqu %xmm4,16+8(%rsp)
|
|
jmp .Loop6x
|
|
|
|
.align 32
|
|
.Loop6x:
|
|
addl $100663296,%ebx
|
|
jc .Lhandle_ctr32
|
|
vmovdqu 0-32(%r9),%xmm3
|
|
vpaddb %xmm2,%xmm14,%xmm1
|
|
vpxor %xmm15,%xmm10,%xmm10
|
|
vpxor %xmm15,%xmm11,%xmm11
|
|
|
|
.Lresume_ctr32:
|
|
vmovdqu %xmm1,(%r8)
|
|
vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
|
|
vpxor %xmm15,%xmm12,%xmm12
|
|
vmovups 16-128(%rcx),%xmm2
|
|
vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
|
|
xorq %r12,%r12
|
|
cmpq %r14,%r15
|
|
|
|
vaesenc %xmm2,%xmm9,%xmm9
|
|
vmovdqu 48+8(%rsp),%xmm0
|
|
vpxor %xmm15,%xmm13,%xmm13
|
|
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
|
|
vaesenc %xmm2,%xmm10,%xmm10
|
|
vpxor %xmm15,%xmm14,%xmm14
|
|
setnc %r12b
|
|
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
|
|
vaesenc %xmm2,%xmm11,%xmm11
|
|
vmovdqu 16-32(%r9),%xmm3
|
|
negq %r12
|
|
vaesenc %xmm2,%xmm12,%xmm12
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
|
|
vpxor %xmm4,%xmm8,%xmm8
|
|
vaesenc %xmm2,%xmm13,%xmm13
|
|
vpxor %xmm5,%xmm1,%xmm4
|
|
andq $0x60,%r12
|
|
vmovups 32-128(%rcx),%xmm15
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
|
|
vaesenc %xmm2,%xmm14,%xmm14
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
|
|
leaq (%r14,%r12,1),%r14
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor 16+8(%rsp),%xmm8,%xmm8
|
|
vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
|
|
vmovdqu 64+8(%rsp),%xmm0
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movbeq 88(%r14),%r13
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movbeq 80(%r14),%r12
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,32+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,40+8(%rsp)
|
|
vmovdqu 48-32(%r9),%xmm5
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 48-128(%rcx),%xmm15
|
|
vpxor %xmm1,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm2,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpxor %xmm3,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
|
|
vmovdqu 80+8(%rsp),%xmm0
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vpxor %xmm1,%xmm4,%xmm4
|
|
vmovdqu 64-32(%r9),%xmm1
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 64-128(%rcx),%xmm15
|
|
vpxor %xmm2,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm3,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movbeq 72(%r14),%r13
|
|
vpxor %xmm5,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movbeq 64(%r14),%r12
|
|
vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
|
|
vmovdqu 96+8(%rsp),%xmm0
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,48+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,56+8(%rsp)
|
|
vpxor %xmm2,%xmm4,%xmm4
|
|
vmovdqu 96-32(%r9),%xmm2
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 80-128(%rcx),%xmm15
|
|
vpxor %xmm3,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movbeq 56(%r14),%r13
|
|
vpxor %xmm1,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
|
|
vpxor 112+8(%rsp),%xmm8,%xmm8
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movbeq 48(%r14),%r12
|
|
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,64+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,72+8(%rsp)
|
|
vpxor %xmm3,%xmm4,%xmm4
|
|
vmovdqu 112-32(%r9),%xmm3
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 96-128(%rcx),%xmm15
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm1,%xmm6,%xmm6
|
|
vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movbeq 40(%r14),%r13
|
|
vpxor %xmm2,%xmm7,%xmm7
|
|
vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movbeq 32(%r14),%r12
|
|
vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,80+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,88+8(%rsp)
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
vpxor %xmm1,%xmm6,%xmm6
|
|
|
|
vmovups 112-128(%rcx),%xmm15
|
|
vpslldq $8,%xmm6,%xmm5
|
|
vpxor %xmm2,%xmm4,%xmm4
|
|
vmovdqu 16(%r11),%xmm3
|
|
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm8,%xmm7,%xmm7
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpxor %xmm5,%xmm4,%xmm4
|
|
movbeq 24(%r14),%r13
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movbeq 16(%r14),%r12
|
|
vpalignr $8,%xmm4,%xmm4,%xmm0
|
|
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
|
|
movq %r13,96+8(%rsp)
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r12,104+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vmovups 128-128(%rcx),%xmm1
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vaesenc %xmm1,%xmm9,%xmm9
|
|
vmovups 144-128(%rcx),%xmm15
|
|
vaesenc %xmm1,%xmm10,%xmm10
|
|
vpsrldq $8,%xmm6,%xmm6
|
|
vaesenc %xmm1,%xmm11,%xmm11
|
|
vpxor %xmm6,%xmm7,%xmm7
|
|
vaesenc %xmm1,%xmm12,%xmm12
|
|
vpxor %xmm0,%xmm4,%xmm4
|
|
movbeq 8(%r14),%r13
|
|
vaesenc %xmm1,%xmm13,%xmm13
|
|
movbeq 0(%r14),%r12
|
|
vaesenc %xmm1,%xmm14,%xmm14
|
|
vmovups 160-128(%rcx),%xmm1
|
|
cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
|
|
jb .Lenc_tail
|
|
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vaesenc %xmm1,%xmm9,%xmm9
|
|
vaesenc %xmm1,%xmm10,%xmm10
|
|
vaesenc %xmm1,%xmm11,%xmm11
|
|
vaesenc %xmm1,%xmm12,%xmm12
|
|
vaesenc %xmm1,%xmm13,%xmm13
|
|
vmovups 176-128(%rcx),%xmm15
|
|
vaesenc %xmm1,%xmm14,%xmm14
|
|
vmovups 192-128(%rcx),%xmm1
|
|
cmpl $14,%ebp // ICP does not zero key schedule.
|
|
jb .Lenc_tail
|
|
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vaesenc %xmm1,%xmm9,%xmm9
|
|
vaesenc %xmm1,%xmm10,%xmm10
|
|
vaesenc %xmm1,%xmm11,%xmm11
|
|
vaesenc %xmm1,%xmm12,%xmm12
|
|
vaesenc %xmm1,%xmm13,%xmm13
|
|
vmovups 208-128(%rcx),%xmm15
|
|
vaesenc %xmm1,%xmm14,%xmm14
|
|
vmovups 224-128(%rcx),%xmm1
|
|
jmp .Lenc_tail
|
|
|
|
.align 32
|
|
.Lhandle_ctr32:
|
|
vmovdqu (%r11),%xmm0
|
|
vpshufb %xmm0,%xmm1,%xmm6
|
|
vmovdqu 48(%r11),%xmm5
|
|
vpaddd 64(%r11),%xmm6,%xmm10
|
|
vpaddd %xmm5,%xmm6,%xmm11
|
|
vmovdqu 0-32(%r9),%xmm3
|
|
vpaddd %xmm5,%xmm10,%xmm12
|
|
vpshufb %xmm0,%xmm10,%xmm10
|
|
vpaddd %xmm5,%xmm11,%xmm13
|
|
vpshufb %xmm0,%xmm11,%xmm11
|
|
vpxor %xmm15,%xmm10,%xmm10
|
|
vpaddd %xmm5,%xmm12,%xmm14
|
|
vpshufb %xmm0,%xmm12,%xmm12
|
|
vpxor %xmm15,%xmm11,%xmm11
|
|
vpaddd %xmm5,%xmm13,%xmm1
|
|
vpshufb %xmm0,%xmm13,%xmm13
|
|
vpshufb %xmm0,%xmm14,%xmm14
|
|
vpshufb %xmm0,%xmm1,%xmm1
|
|
jmp .Lresume_ctr32
|
|
|
|
.align 32
|
|
.Lenc_tail:
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vmovdqu %xmm7,16+8(%rsp)
|
|
vpalignr $8,%xmm4,%xmm4,%xmm8
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
|
|
vpxor 0(%rdi),%xmm1,%xmm2
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vpxor 16(%rdi),%xmm1,%xmm0
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vpxor 32(%rdi),%xmm1,%xmm5
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vpxor 48(%rdi),%xmm1,%xmm6
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
vpxor 64(%rdi),%xmm1,%xmm7
|
|
vpxor 80(%rdi),%xmm1,%xmm3
|
|
vmovdqu (%r8),%xmm1
|
|
|
|
vaesenclast %xmm2,%xmm9,%xmm9
|
|
vmovdqu 32(%r11),%xmm2
|
|
vaesenclast %xmm0,%xmm10,%xmm10
|
|
vpaddb %xmm2,%xmm1,%xmm0
|
|
movq %r13,112+8(%rsp)
|
|
leaq 96(%rdi),%rdi
|
|
vaesenclast %xmm5,%xmm11,%xmm11
|
|
vpaddb %xmm2,%xmm0,%xmm5
|
|
movq %r12,120+8(%rsp)
|
|
leaq 96(%rsi),%rsi
|
|
vmovdqu 0-128(%rcx),%xmm15
|
|
vaesenclast %xmm6,%xmm12,%xmm12
|
|
vpaddb %xmm2,%xmm5,%xmm6
|
|
vaesenclast %xmm7,%xmm13,%xmm13
|
|
vpaddb %xmm2,%xmm6,%xmm7
|
|
vaesenclast %xmm3,%xmm14,%xmm14
|
|
vpaddb %xmm2,%xmm7,%xmm3
|
|
|
|
addq $0x60,%r10
|
|
subq $0x6,%rdx
|
|
jc .L6x_done
|
|
|
|
vmovups %xmm9,-96(%rsi)
|
|
vpxor %xmm15,%xmm1,%xmm9
|
|
vmovups %xmm10,-80(%rsi)
|
|
vmovdqa %xmm0,%xmm10
|
|
vmovups %xmm11,-64(%rsi)
|
|
vmovdqa %xmm5,%xmm11
|
|
vmovups %xmm12,-48(%rsi)
|
|
vmovdqa %xmm6,%xmm12
|
|
vmovups %xmm13,-32(%rsi)
|
|
vmovdqa %xmm7,%xmm13
|
|
vmovups %xmm14,-16(%rsi)
|
|
vmovdqa %xmm3,%xmm14
|
|
vmovdqu 32+8(%rsp),%xmm7
|
|
jmp .Loop6x
|
|
|
|
.L6x_done:
|
|
vpxor 16+8(%rsp),%xmm8,%xmm8
|
|
vpxor %xmm4,%xmm8,%xmm8
|
|
|
|
.byte 0xf3,0xc3
|
|
.cfi_endproc
|
|
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
|
|
#endif /* ifdef HAVE_MOVBE */
|
|
|
|
.type _aesni_ctr32_ghash_no_movbe_6x,@function
|
|
.align 32
|
|
_aesni_ctr32_ghash_no_movbe_6x:
|
|
.cfi_startproc
|
|
vmovdqu 32(%r11),%xmm2
|
|
subq $6,%rdx
|
|
vpxor %xmm4,%xmm4,%xmm4
|
|
vmovdqu 0-128(%rcx),%xmm15
|
|
vpaddb %xmm2,%xmm1,%xmm10
|
|
vpaddb %xmm2,%xmm10,%xmm11
|
|
vpaddb %xmm2,%xmm11,%xmm12
|
|
vpaddb %xmm2,%xmm12,%xmm13
|
|
vpaddb %xmm2,%xmm13,%xmm14
|
|
vpxor %xmm15,%xmm1,%xmm9
|
|
vmovdqu %xmm4,16+8(%rsp)
|
|
jmp .Loop6x_nmb
|
|
|
|
.align 32
|
|
.Loop6x_nmb:
|
|
addl $100663296,%ebx
|
|
jc .Lhandle_ctr32_nmb
|
|
vmovdqu 0-32(%r9),%xmm3
|
|
vpaddb %xmm2,%xmm14,%xmm1
|
|
vpxor %xmm15,%xmm10,%xmm10
|
|
vpxor %xmm15,%xmm11,%xmm11
|
|
|
|
.Lresume_ctr32_nmb:
|
|
vmovdqu %xmm1,(%r8)
|
|
vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
|
|
vpxor %xmm15,%xmm12,%xmm12
|
|
vmovups 16-128(%rcx),%xmm2
|
|
vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
|
|
xorq %r12,%r12
|
|
cmpq %r14,%r15
|
|
|
|
vaesenc %xmm2,%xmm9,%xmm9
|
|
vmovdqu 48+8(%rsp),%xmm0
|
|
vpxor %xmm15,%xmm13,%xmm13
|
|
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
|
|
vaesenc %xmm2,%xmm10,%xmm10
|
|
vpxor %xmm15,%xmm14,%xmm14
|
|
setnc %r12b
|
|
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
|
|
vaesenc %xmm2,%xmm11,%xmm11
|
|
vmovdqu 16-32(%r9),%xmm3
|
|
negq %r12
|
|
vaesenc %xmm2,%xmm12,%xmm12
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
|
|
vpxor %xmm4,%xmm8,%xmm8
|
|
vaesenc %xmm2,%xmm13,%xmm13
|
|
vpxor %xmm5,%xmm1,%xmm4
|
|
andq $0x60,%r12
|
|
vmovups 32-128(%rcx),%xmm15
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
|
|
vaesenc %xmm2,%xmm14,%xmm14
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
|
|
leaq (%r14,%r12,1),%r14
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor 16+8(%rsp),%xmm8,%xmm8
|
|
vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
|
|
vmovdqu 64+8(%rsp),%xmm0
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movq 88(%r14),%r13
|
|
bswapq %r13
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movq 80(%r14),%r12
|
|
bswapq %r12
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,32+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,40+8(%rsp)
|
|
vmovdqu 48-32(%r9),%xmm5
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 48-128(%rcx),%xmm15
|
|
vpxor %xmm1,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm2,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpxor %xmm3,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
|
|
vmovdqu 80+8(%rsp),%xmm0
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vpxor %xmm1,%xmm4,%xmm4
|
|
vmovdqu 64-32(%r9),%xmm1
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 64-128(%rcx),%xmm15
|
|
vpxor %xmm2,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm3,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movq 72(%r14),%r13
|
|
bswapq %r13
|
|
vpxor %xmm5,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movq 64(%r14),%r12
|
|
bswapq %r12
|
|
vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
|
|
vmovdqu 96+8(%rsp),%xmm0
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,48+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,56+8(%rsp)
|
|
vpxor %xmm2,%xmm4,%xmm4
|
|
vmovdqu 96-32(%r9),%xmm2
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 80-128(%rcx),%xmm15
|
|
vpxor %xmm3,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movq 56(%r14),%r13
|
|
bswapq %r13
|
|
vpxor %xmm1,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
|
|
vpxor 112+8(%rsp),%xmm8,%xmm8
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movq 48(%r14),%r12
|
|
bswapq %r12
|
|
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,64+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,72+8(%rsp)
|
|
vpxor %xmm3,%xmm4,%xmm4
|
|
vmovdqu 112-32(%r9),%xmm3
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 96-128(%rcx),%xmm15
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm1,%xmm6,%xmm6
|
|
vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movq 40(%r14),%r13
|
|
bswapq %r13
|
|
vpxor %xmm2,%xmm7,%xmm7
|
|
vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movq 32(%r14),%r12
|
|
bswapq %r12
|
|
vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,80+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,88+8(%rsp)
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
vpxor %xmm1,%xmm6,%xmm6
|
|
|
|
vmovups 112-128(%rcx),%xmm15
|
|
vpslldq $8,%xmm6,%xmm5
|
|
vpxor %xmm2,%xmm4,%xmm4
|
|
vmovdqu 16(%r11),%xmm3
|
|
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm8,%xmm7,%xmm7
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpxor %xmm5,%xmm4,%xmm4
|
|
movq 24(%r14),%r13
|
|
bswapq %r13
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movq 16(%r14),%r12
|
|
bswapq %r12
|
|
vpalignr $8,%xmm4,%xmm4,%xmm0
|
|
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
|
|
movq %r13,96+8(%rsp)
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r12,104+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vmovups 128-128(%rcx),%xmm1
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vaesenc %xmm1,%xmm9,%xmm9
|
|
vmovups 144-128(%rcx),%xmm15
|
|
vaesenc %xmm1,%xmm10,%xmm10
|
|
vpsrldq $8,%xmm6,%xmm6
|
|
vaesenc %xmm1,%xmm11,%xmm11
|
|
vpxor %xmm6,%xmm7,%xmm7
|
|
vaesenc %xmm1,%xmm12,%xmm12
|
|
vpxor %xmm0,%xmm4,%xmm4
|
|
movq 8(%r14),%r13
|
|
bswapq %r13
|
|
vaesenc %xmm1,%xmm13,%xmm13
|
|
movq 0(%r14),%r12
|
|
bswapq %r12
|
|
vaesenc %xmm1,%xmm14,%xmm14
|
|
vmovups 160-128(%rcx),%xmm1
|
|
cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
|
|
jb .Lenc_tail_nmb
|
|
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vaesenc %xmm1,%xmm9,%xmm9
|
|
vaesenc %xmm1,%xmm10,%xmm10
|
|
vaesenc %xmm1,%xmm11,%xmm11
|
|
vaesenc %xmm1,%xmm12,%xmm12
|
|
vaesenc %xmm1,%xmm13,%xmm13
|
|
vmovups 176-128(%rcx),%xmm15
|
|
vaesenc %xmm1,%xmm14,%xmm14
|
|
vmovups 192-128(%rcx),%xmm1
|
|
cmpl $14,%ebp // ICP does not zero key schedule.
|
|
jb .Lenc_tail_nmb
|
|
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vaesenc %xmm1,%xmm9,%xmm9
|
|
vaesenc %xmm1,%xmm10,%xmm10
|
|
vaesenc %xmm1,%xmm11,%xmm11
|
|
vaesenc %xmm1,%xmm12,%xmm12
|
|
vaesenc %xmm1,%xmm13,%xmm13
|
|
vmovups 208-128(%rcx),%xmm15
|
|
vaesenc %xmm1,%xmm14,%xmm14
|
|
vmovups 224-128(%rcx),%xmm1
|
|
jmp .Lenc_tail_nmb
|
|
|
|
.align 32
|
|
.Lhandle_ctr32_nmb:
|
|
vmovdqu (%r11),%xmm0
|
|
vpshufb %xmm0,%xmm1,%xmm6
|
|
vmovdqu 48(%r11),%xmm5
|
|
vpaddd 64(%r11),%xmm6,%xmm10
|
|
vpaddd %xmm5,%xmm6,%xmm11
|
|
vmovdqu 0-32(%r9),%xmm3
|
|
vpaddd %xmm5,%xmm10,%xmm12
|
|
vpshufb %xmm0,%xmm10,%xmm10
|
|
vpaddd %xmm5,%xmm11,%xmm13
|
|
vpshufb %xmm0,%xmm11,%xmm11
|
|
vpxor %xmm15,%xmm10,%xmm10
|
|
vpaddd %xmm5,%xmm12,%xmm14
|
|
vpshufb %xmm0,%xmm12,%xmm12
|
|
vpxor %xmm15,%xmm11,%xmm11
|
|
vpaddd %xmm5,%xmm13,%xmm1
|
|
vpshufb %xmm0,%xmm13,%xmm13
|
|
vpshufb %xmm0,%xmm14,%xmm14
|
|
vpshufb %xmm0,%xmm1,%xmm1
|
|
jmp .Lresume_ctr32_nmb
|
|
|
|
.align 32
|
|
.Lenc_tail_nmb:
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vmovdqu %xmm7,16+8(%rsp)
|
|
vpalignr $8,%xmm4,%xmm4,%xmm8
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
|
|
vpxor 0(%rdi),%xmm1,%xmm2
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vpxor 16(%rdi),%xmm1,%xmm0
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vpxor 32(%rdi),%xmm1,%xmm5
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vpxor 48(%rdi),%xmm1,%xmm6
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
vpxor 64(%rdi),%xmm1,%xmm7
|
|
vpxor 80(%rdi),%xmm1,%xmm3
|
|
vmovdqu (%r8),%xmm1
|
|
|
|
vaesenclast %xmm2,%xmm9,%xmm9
|
|
vmovdqu 32(%r11),%xmm2
|
|
vaesenclast %xmm0,%xmm10,%xmm10
|
|
vpaddb %xmm2,%xmm1,%xmm0
|
|
movq %r13,112+8(%rsp)
|
|
leaq 96(%rdi),%rdi
|
|
vaesenclast %xmm5,%xmm11,%xmm11
|
|
vpaddb %xmm2,%xmm0,%xmm5
|
|
movq %r12,120+8(%rsp)
|
|
leaq 96(%rsi),%rsi
|
|
vmovdqu 0-128(%rcx),%xmm15
|
|
vaesenclast %xmm6,%xmm12,%xmm12
|
|
vpaddb %xmm2,%xmm5,%xmm6
|
|
vaesenclast %xmm7,%xmm13,%xmm13
|
|
vpaddb %xmm2,%xmm6,%xmm7
|
|
vaesenclast %xmm3,%xmm14,%xmm14
|
|
vpaddb %xmm2,%xmm7,%xmm3
|
|
|
|
addq $0x60,%r10
|
|
subq $0x6,%rdx
|
|
jc .L6x_done_nmb
|
|
|
|
vmovups %xmm9,-96(%rsi)
|
|
vpxor %xmm15,%xmm1,%xmm9
|
|
vmovups %xmm10,-80(%rsi)
|
|
vmovdqa %xmm0,%xmm10
|
|
vmovups %xmm11,-64(%rsi)
|
|
vmovdqa %xmm5,%xmm11
|
|
vmovups %xmm12,-48(%rsi)
|
|
vmovdqa %xmm6,%xmm12
|
|
vmovups %xmm13,-32(%rsi)
|
|
vmovdqa %xmm7,%xmm13
|
|
vmovups %xmm14,-16(%rsi)
|
|
vmovdqa %xmm3,%xmm14
|
|
vmovdqu 32+8(%rsp),%xmm7
|
|
jmp .Loop6x_nmb
|
|
|
|
.L6x_done_nmb:
|
|
vpxor 16+8(%rsp),%xmm8,%xmm8
|
|
vpxor %xmm4,%xmm8,%xmm8
|
|
|
|
.byte 0xf3,0xc3
|
|
.cfi_endproc
|
|
.size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x
|
|
|
|
.globl aesni_gcm_decrypt
|
|
.type aesni_gcm_decrypt,@function
|
|
.align 32
|
|
aesni_gcm_decrypt:
|
|
.cfi_startproc
|
|
xorq %r10,%r10
|
|
cmpq $0x60,%rdx
|
|
jb .Lgcm_dec_abort
|
|
|
|
leaq (%rsp),%rax
|
|
.cfi_def_cfa_register %rax
|
|
pushq %rbx
|
|
.cfi_offset %rbx,-16
|
|
pushq %rbp
|
|
.cfi_offset %rbp,-24
|
|
pushq %r12
|
|
.cfi_offset %r12,-32
|
|
pushq %r13
|
|
.cfi_offset %r13,-40
|
|
pushq %r14
|
|
.cfi_offset %r14,-48
|
|
pushq %r15
|
|
.cfi_offset %r15,-56
|
|
pushq %r9
|
|
.cfi_offset %r9,-64
|
|
vzeroupper
|
|
|
|
vmovdqu (%r8),%xmm1
|
|
addq $-128,%rsp
|
|
movl 12(%r8),%ebx
|
|
leaq .Lbswap_mask(%rip),%r11
|
|
leaq -128(%rcx),%r14
|
|
movq $0xf80,%r15
|
|
vmovdqu (%r9),%xmm8
|
|
andq $-128,%rsp
|
|
vmovdqu (%r11),%xmm0
|
|
leaq 128(%rcx),%rcx
|
|
movq 32(%r9),%r9
|
|
leaq 32(%r9),%r9
|
|
movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
|
|
vpshufb %xmm0,%xmm8,%xmm8
|
|
|
|
andq %r15,%r14
|
|
andq %rsp,%r15
|
|
subq %r14,%r15
|
|
jc .Ldec_no_key_aliasing
|
|
cmpq $768,%r15
|
|
jnc .Ldec_no_key_aliasing
|
|
subq %r15,%rsp
|
|
.Ldec_no_key_aliasing:
|
|
|
|
vmovdqu 80(%rdi),%xmm7
|
|
leaq (%rdi),%r14
|
|
vmovdqu 64(%rdi),%xmm4
|
|
leaq -192(%rdi,%rdx,1),%r15
|
|
vmovdqu 48(%rdi),%xmm5
|
|
shrq $4,%rdx
|
|
xorq %r10,%r10
|
|
vmovdqu 32(%rdi),%xmm6
|
|
vpshufb %xmm0,%xmm7,%xmm7
|
|
vmovdqu 16(%rdi),%xmm2
|
|
vpshufb %xmm0,%xmm4,%xmm4
|
|
vmovdqu (%rdi),%xmm3
|
|
vpshufb %xmm0,%xmm5,%xmm5
|
|
vmovdqu %xmm4,48(%rsp)
|
|
vpshufb %xmm0,%xmm6,%xmm6
|
|
vmovdqu %xmm5,64(%rsp)
|
|
vpshufb %xmm0,%xmm2,%xmm2
|
|
vmovdqu %xmm6,80(%rsp)
|
|
vpshufb %xmm0,%xmm3,%xmm3
|
|
vmovdqu %xmm2,96(%rsp)
|
|
vmovdqu %xmm3,112(%rsp)
|
|
|
|
#ifdef HAVE_MOVBE
|
|
#ifdef _KERNEL
|
|
testl $1,gcm_avx_can_use_movbe(%rip)
|
|
#else
|
|
testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
|
|
#endif
|
|
jz 1f
|
|
call _aesni_ctr32_ghash_6x
|
|
jmp 2f
|
|
1:
|
|
#endif
|
|
call _aesni_ctr32_ghash_no_movbe_6x
|
|
2:
|
|
vmovups %xmm9,-96(%rsi)
|
|
vmovups %xmm10,-80(%rsi)
|
|
vmovups %xmm11,-64(%rsi)
|
|
vmovups %xmm12,-48(%rsi)
|
|
vmovups %xmm13,-32(%rsi)
|
|
vmovups %xmm14,-16(%rsi)
|
|
|
|
vpshufb (%r11),%xmm8,%xmm8
|
|
movq -56(%rax),%r9
|
|
.cfi_restore %r9
|
|
vmovdqu %xmm8,(%r9)
|
|
|
|
vzeroupper
|
|
movq -48(%rax),%r15
|
|
.cfi_restore %r15
|
|
movq -40(%rax),%r14
|
|
.cfi_restore %r14
|
|
movq -32(%rax),%r13
|
|
.cfi_restore %r13
|
|
movq -24(%rax),%r12
|
|
.cfi_restore %r12
|
|
movq -16(%rax),%rbp
|
|
.cfi_restore %rbp
|
|
movq -8(%rax),%rbx
|
|
.cfi_restore %rbx
|
|
leaq (%rax),%rsp
|
|
.cfi_def_cfa_register %rsp
|
|
.Lgcm_dec_abort:
|
|
movq %r10,%rax
|
|
.byte 0xf3,0xc3
|
|
.cfi_endproc
|
|
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
|
|
.type _aesni_ctr32_6x,@function
|
|
.align 32
|
|
_aesni_ctr32_6x:
|
|
.cfi_startproc
|
|
vmovdqu 0-128(%rcx),%xmm4
|
|
vmovdqu 32(%r11),%xmm2
|
|
leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds.
|
|
vmovups 16-128(%rcx),%xmm15
|
|
leaq 32-128(%rcx),%r12
|
|
vpxor %xmm4,%xmm1,%xmm9
|
|
addl $100663296,%ebx
|
|
jc .Lhandle_ctr32_2
|
|
vpaddb %xmm2,%xmm1,%xmm10
|
|
vpaddb %xmm2,%xmm10,%xmm11
|
|
vpxor %xmm4,%xmm10,%xmm10
|
|
vpaddb %xmm2,%xmm11,%xmm12
|
|
vpxor %xmm4,%xmm11,%xmm11
|
|
vpaddb %xmm2,%xmm12,%xmm13
|
|
vpxor %xmm4,%xmm12,%xmm12
|
|
vpaddb %xmm2,%xmm13,%xmm14
|
|
vpxor %xmm4,%xmm13,%xmm13
|
|
vpaddb %xmm2,%xmm14,%xmm1
|
|
vpxor %xmm4,%xmm14,%xmm14
|
|
jmp .Loop_ctr32
|
|
|
|
.align 16
|
|
.Loop_ctr32:
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
vmovups (%r12),%xmm15
|
|
leaq 16(%r12),%r12
|
|
decl %r13d
|
|
jnz .Loop_ctr32
|
|
|
|
vmovdqu (%r12),%xmm3
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor 0(%rdi),%xmm3,%xmm4
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpxor 16(%rdi),%xmm3,%xmm5
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vpxor 32(%rdi),%xmm3,%xmm6
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vpxor 48(%rdi),%xmm3,%xmm8
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vpxor 64(%rdi),%xmm3,%xmm2
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
vpxor 80(%rdi),%xmm3,%xmm3
|
|
leaq 96(%rdi),%rdi
|
|
|
|
vaesenclast %xmm4,%xmm9,%xmm9
|
|
vaesenclast %xmm5,%xmm10,%xmm10
|
|
vaesenclast %xmm6,%xmm11,%xmm11
|
|
vaesenclast %xmm8,%xmm12,%xmm12
|
|
vaesenclast %xmm2,%xmm13,%xmm13
|
|
vaesenclast %xmm3,%xmm14,%xmm14
|
|
vmovups %xmm9,0(%rsi)
|
|
vmovups %xmm10,16(%rsi)
|
|
vmovups %xmm11,32(%rsi)
|
|
vmovups %xmm12,48(%rsi)
|
|
vmovups %xmm13,64(%rsi)
|
|
vmovups %xmm14,80(%rsi)
|
|
leaq 96(%rsi),%rsi
|
|
|
|
.byte 0xf3,0xc3
|
|
.align 32
|
|
.Lhandle_ctr32_2:
|
|
vpshufb %xmm0,%xmm1,%xmm6
|
|
vmovdqu 48(%r11),%xmm5
|
|
vpaddd 64(%r11),%xmm6,%xmm10
|
|
vpaddd %xmm5,%xmm6,%xmm11
|
|
vpaddd %xmm5,%xmm10,%xmm12
|
|
vpshufb %xmm0,%xmm10,%xmm10
|
|
vpaddd %xmm5,%xmm11,%xmm13
|
|
vpshufb %xmm0,%xmm11,%xmm11
|
|
vpxor %xmm4,%xmm10,%xmm10
|
|
vpaddd %xmm5,%xmm12,%xmm14
|
|
vpshufb %xmm0,%xmm12,%xmm12
|
|
vpxor %xmm4,%xmm11,%xmm11
|
|
vpaddd %xmm5,%xmm13,%xmm1
|
|
vpshufb %xmm0,%xmm13,%xmm13
|
|
vpxor %xmm4,%xmm12,%xmm12
|
|
vpshufb %xmm0,%xmm14,%xmm14
|
|
vpxor %xmm4,%xmm13,%xmm13
|
|
vpshufb %xmm0,%xmm1,%xmm1
|
|
vpxor %xmm4,%xmm14,%xmm14
|
|
jmp .Loop_ctr32
|
|
.cfi_endproc
|
|
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
|
|
|
|
.globl aesni_gcm_encrypt
|
|
.type aesni_gcm_encrypt,@function
|
|
.align 32
|
|
aesni_gcm_encrypt:
|
|
.cfi_startproc
|
|
xorq %r10,%r10
|
|
cmpq $288,%rdx
|
|
jb .Lgcm_enc_abort
|
|
|
|
leaq (%rsp),%rax
|
|
.cfi_def_cfa_register %rax
|
|
pushq %rbx
|
|
.cfi_offset %rbx,-16
|
|
pushq %rbp
|
|
.cfi_offset %rbp,-24
|
|
pushq %r12
|
|
.cfi_offset %r12,-32
|
|
pushq %r13
|
|
.cfi_offset %r13,-40
|
|
pushq %r14
|
|
.cfi_offset %r14,-48
|
|
pushq %r15
|
|
.cfi_offset %r15,-56
|
|
pushq %r9
|
|
.cfi_offset %r9,-64
|
|
vzeroupper
|
|
|
|
vmovdqu (%r8),%xmm1
|
|
addq $-128,%rsp
|
|
movl 12(%r8),%ebx
|
|
leaq .Lbswap_mask(%rip),%r11
|
|
leaq -128(%rcx),%r14
|
|
movq $0xf80,%r15
|
|
leaq 128(%rcx),%rcx
|
|
vmovdqu (%r11),%xmm0
|
|
andq $-128,%rsp
|
|
movl 504-128(%rcx),%ebp // ICP has an larger offset for rounds.
|
|
|
|
andq %r15,%r14
|
|
andq %rsp,%r15
|
|
subq %r14,%r15
|
|
jc .Lenc_no_key_aliasing
|
|
cmpq $768,%r15
|
|
jnc .Lenc_no_key_aliasing
|
|
subq %r15,%rsp
|
|
.Lenc_no_key_aliasing:
|
|
|
|
leaq (%rsi),%r14
|
|
leaq -192(%rsi,%rdx,1),%r15
|
|
shrq $4,%rdx
|
|
|
|
call _aesni_ctr32_6x
|
|
vpshufb %xmm0,%xmm9,%xmm8
|
|
vpshufb %xmm0,%xmm10,%xmm2
|
|
vmovdqu %xmm8,112(%rsp)
|
|
vpshufb %xmm0,%xmm11,%xmm4
|
|
vmovdqu %xmm2,96(%rsp)
|
|
vpshufb %xmm0,%xmm12,%xmm5
|
|
vmovdqu %xmm4,80(%rsp)
|
|
vpshufb %xmm0,%xmm13,%xmm6
|
|
vmovdqu %xmm5,64(%rsp)
|
|
vpshufb %xmm0,%xmm14,%xmm7
|
|
vmovdqu %xmm6,48(%rsp)
|
|
|
|
call _aesni_ctr32_6x
|
|
|
|
vmovdqu (%r9),%xmm8
|
|
movq 32(%r9),%r9
|
|
leaq 32(%r9),%r9
|
|
subq $12,%rdx
|
|
movq $192,%r10
|
|
vpshufb %xmm0,%xmm8,%xmm8
|
|
|
|
#ifdef HAVE_MOVBE
|
|
#ifdef _KERNEL
|
|
testl $1,gcm_avx_can_use_movbe(%rip)
|
|
#else
|
|
testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
|
|
#endif
|
|
jz 1f
|
|
call _aesni_ctr32_ghash_6x
|
|
jmp 2f
|
|
1:
|
|
#endif
|
|
call _aesni_ctr32_ghash_no_movbe_6x
|
|
2:
|
|
vmovdqu 32(%rsp),%xmm7
|
|
vmovdqu (%r11),%xmm0
|
|
vmovdqu 0-32(%r9),%xmm3
|
|
vpunpckhqdq %xmm7,%xmm7,%xmm1
|
|
vmovdqu 32-32(%r9),%xmm15
|
|
vmovups %xmm9,-96(%rsi)
|
|
vpshufb %xmm0,%xmm9,%xmm9
|
|
vpxor %xmm7,%xmm1,%xmm1
|
|
vmovups %xmm10,-80(%rsi)
|
|
vpshufb %xmm0,%xmm10,%xmm10
|
|
vmovups %xmm11,-64(%rsi)
|
|
vpshufb %xmm0,%xmm11,%xmm11
|
|
vmovups %xmm12,-48(%rsi)
|
|
vpshufb %xmm0,%xmm12,%xmm12
|
|
vmovups %xmm13,-32(%rsi)
|
|
vpshufb %xmm0,%xmm13,%xmm13
|
|
vmovups %xmm14,-16(%rsi)
|
|
vpshufb %xmm0,%xmm14,%xmm14
|
|
vmovdqu %xmm9,16(%rsp)
|
|
vmovdqu 48(%rsp),%xmm6
|
|
vmovdqu 16-32(%r9),%xmm0
|
|
vpunpckhqdq %xmm6,%xmm6,%xmm2
|
|
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
|
|
vpxor %xmm6,%xmm2,%xmm2
|
|
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
|
|
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
|
|
|
|
vmovdqu 64(%rsp),%xmm9
|
|
vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
|
|
vmovdqu 48-32(%r9),%xmm3
|
|
vpxor %xmm5,%xmm4,%xmm4
|
|
vpunpckhqdq %xmm9,%xmm9,%xmm5
|
|
vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
|
|
vpxor %xmm9,%xmm5,%xmm5
|
|
vpxor %xmm7,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
|
|
vmovdqu 80-32(%r9),%xmm15
|
|
vpxor %xmm1,%xmm2,%xmm2
|
|
|
|
vmovdqu 80(%rsp),%xmm1
|
|
vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
|
|
vmovdqu 64-32(%r9),%xmm0
|
|
vpxor %xmm4,%xmm7,%xmm7
|
|
vpunpckhqdq %xmm1,%xmm1,%xmm4
|
|
vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
|
|
vpxor %xmm1,%xmm4,%xmm4
|
|
vpxor %xmm6,%xmm9,%xmm9
|
|
vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
|
|
vpxor %xmm2,%xmm5,%xmm5
|
|
|
|
vmovdqu 96(%rsp),%xmm2
|
|
vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
|
|
vmovdqu 96-32(%r9),%xmm3
|
|
vpxor %xmm7,%xmm6,%xmm6
|
|
vpunpckhqdq %xmm2,%xmm2,%xmm7
|
|
vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
|
|
vpxor %xmm2,%xmm7,%xmm7
|
|
vpxor %xmm9,%xmm1,%xmm1
|
|
vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
|
|
vmovdqu 128-32(%r9),%xmm15
|
|
vpxor %xmm5,%xmm4,%xmm4
|
|
|
|
vpxor 112(%rsp),%xmm8,%xmm8
|
|
vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
|
|
vmovdqu 112-32(%r9),%xmm0
|
|
vpunpckhqdq %xmm8,%xmm8,%xmm9
|
|
vpxor %xmm6,%xmm5,%xmm5
|
|
vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
|
|
vpxor %xmm8,%xmm9,%xmm9
|
|
vpxor %xmm1,%xmm2,%xmm2
|
|
vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
|
|
vpxor %xmm4,%xmm7,%xmm4
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
|
|
vmovdqu 0-32(%r9),%xmm3
|
|
vpunpckhqdq %xmm14,%xmm14,%xmm1
|
|
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
|
|
vpxor %xmm14,%xmm1,%xmm1
|
|
vpxor %xmm5,%xmm6,%xmm5
|
|
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
|
|
vmovdqu 32-32(%r9),%xmm15
|
|
vpxor %xmm2,%xmm8,%xmm7
|
|
vpxor %xmm4,%xmm9,%xmm6
|
|
|
|
vmovdqu 16-32(%r9),%xmm0
|
|
vpxor %xmm5,%xmm7,%xmm9
|
|
vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
|
|
vpxor %xmm9,%xmm6,%xmm6
|
|
vpunpckhqdq %xmm13,%xmm13,%xmm2
|
|
vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
|
|
vpxor %xmm13,%xmm2,%xmm2
|
|
vpslldq $8,%xmm6,%xmm9
|
|
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
|
|
vpxor %xmm9,%xmm5,%xmm8
|
|
vpsrldq $8,%xmm6,%xmm6
|
|
vpxor %xmm6,%xmm7,%xmm7
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
|
|
vmovdqu 48-32(%r9),%xmm3
|
|
vpxor %xmm4,%xmm5,%xmm5
|
|
vpunpckhqdq %xmm12,%xmm12,%xmm9
|
|
vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
|
|
vpxor %xmm12,%xmm9,%xmm9
|
|
vpxor %xmm14,%xmm13,%xmm13
|
|
vpalignr $8,%xmm8,%xmm8,%xmm14
|
|
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
|
|
vmovdqu 80-32(%r9),%xmm15
|
|
vpxor %xmm1,%xmm2,%xmm2
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
|
|
vmovdqu 64-32(%r9),%xmm0
|
|
vpxor %xmm5,%xmm4,%xmm4
|
|
vpunpckhqdq %xmm11,%xmm11,%xmm1
|
|
vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
|
|
vpxor %xmm11,%xmm1,%xmm1
|
|
vpxor %xmm13,%xmm12,%xmm12
|
|
vxorps 16(%rsp),%xmm7,%xmm7
|
|
vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
|
|
vpxor %xmm2,%xmm9,%xmm9
|
|
|
|
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
|
|
vxorps %xmm14,%xmm8,%xmm8
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
|
|
vmovdqu 96-32(%r9),%xmm3
|
|
vpxor %xmm4,%xmm5,%xmm5
|
|
vpunpckhqdq %xmm10,%xmm10,%xmm2
|
|
vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
|
|
vpxor %xmm10,%xmm2,%xmm2
|
|
vpalignr $8,%xmm8,%xmm8,%xmm14
|
|
vpxor %xmm12,%xmm11,%xmm11
|
|
vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
|
|
vmovdqu 128-32(%r9),%xmm15
|
|
vpxor %xmm9,%xmm1,%xmm1
|
|
|
|
vxorps %xmm7,%xmm14,%xmm14
|
|
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
|
|
vxorps %xmm14,%xmm8,%xmm8
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
|
|
vmovdqu 112-32(%r9),%xmm0
|
|
vpxor %xmm5,%xmm4,%xmm4
|
|
vpunpckhqdq %xmm8,%xmm8,%xmm9
|
|
vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
|
|
vpxor %xmm8,%xmm9,%xmm9
|
|
vpxor %xmm11,%xmm10,%xmm10
|
|
vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
|
|
vpxor %xmm1,%xmm2,%xmm2
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
|
|
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
|
|
vpxor %xmm4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
|
|
vpxor %xmm10,%xmm7,%xmm7
|
|
vpxor %xmm2,%xmm6,%xmm6
|
|
|
|
vpxor %xmm5,%xmm7,%xmm4
|
|
vpxor %xmm4,%xmm6,%xmm6
|
|
vpslldq $8,%xmm6,%xmm1
|
|
vmovdqu 16(%r11),%xmm3
|
|
vpsrldq $8,%xmm6,%xmm6
|
|
vpxor %xmm1,%xmm5,%xmm8
|
|
vpxor %xmm6,%xmm7,%xmm7
|
|
|
|
vpalignr $8,%xmm8,%xmm8,%xmm2
|
|
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
|
|
vpxor %xmm2,%xmm8,%xmm8
|
|
|
|
vpalignr $8,%xmm8,%xmm8,%xmm2
|
|
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
|
|
vpxor %xmm7,%xmm2,%xmm2
|
|
vpxor %xmm2,%xmm8,%xmm8
|
|
vpshufb (%r11),%xmm8,%xmm8
|
|
movq -56(%rax),%r9
|
|
.cfi_restore %r9
|
|
vmovdqu %xmm8,(%r9)
|
|
|
|
vzeroupper
|
|
movq -48(%rax),%r15
|
|
.cfi_restore %r15
|
|
movq -40(%rax),%r14
|
|
.cfi_restore %r14
|
|
movq -32(%rax),%r13
|
|
.cfi_restore %r13
|
|
movq -24(%rax),%r12
|
|
.cfi_restore %r12
|
|
movq -16(%rax),%rbp
|
|
.cfi_restore %rbp
|
|
movq -8(%rax),%rbx
|
|
.cfi_restore %rbx
|
|
leaq (%rax),%rsp
|
|
.cfi_def_cfa_register %rsp
|
|
.Lgcm_enc_abort:
|
|
movq %r10,%rax
|
|
.byte 0xf3,0xc3
|
|
.cfi_endproc
|
|
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
|
|
|
|
/* Some utility routines */
|
|
|
|
/*
|
|
* clear all fpu registers
|
|
* void clear_fpu_regs_avx(void);
|
|
*/
|
|
.globl clear_fpu_regs_avx
|
|
.type clear_fpu_regs_avx,@function
|
|
.align 32
|
|
clear_fpu_regs_avx:
|
|
vzeroall
|
|
ret
|
|
.size clear_fpu_regs_avx,.-clear_fpu_regs_avx
|
|
|
|
/*
|
|
* void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
|
|
*
|
|
* XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
|
|
* stores the result at `dst'. The XOR is performed using FPU registers,
|
|
* so make sure FPU state is saved when running this in the kernel.
|
|
*/
|
|
.globl gcm_xor_avx
|
|
.type gcm_xor_avx,@function
|
|
.align 32
|
|
gcm_xor_avx:
|
|
movdqu (%rdi), %xmm0
|
|
movdqu (%rsi), %xmm1
|
|
pxor %xmm1, %xmm0
|
|
movdqu %xmm0, (%rsi)
|
|
ret
|
|
.size gcm_xor_avx,.-gcm_xor_avx
|
|
|
|
/*
|
|
* Toggle a boolean_t value atomically and return the new value.
|
|
* boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
|
|
*/
|
|
.globl atomic_toggle_boolean_nv
|
|
.type atomic_toggle_boolean_nv,@function
|
|
.align 32
|
|
atomic_toggle_boolean_nv:
|
|
xorl %eax, %eax
|
|
lock
|
|
xorl $1, (%rdi)
|
|
jz 1f
|
|
movl $1, %eax
|
|
1:
|
|
ret
|
|
.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
|
|
|
|
.align 64
|
|
.Lbswap_mask:
|
|
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
|
|
.Lpoly:
|
|
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
|
|
.Lone_msb:
|
|
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
|
|
.Ltwo_lsb:
|
|
.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
|
.Lone_lsb:
|
|
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
|
.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
|
|
.align 64
|
|
|
|
/* Mark the stack non-executable. */
|
|
#if defined(__linux__) && defined(__ELF__)
|
|
.section .note.GNU-stack,"",%progbits
|
|
#endif
|
|
|
|
#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
|