mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-16 13:07:10 +03:00
ee93cbc9d4
Commit 43569ee374
("Fix objtool: missing int3 after ret warning")
addressed replacing all `ret`s in x86 asm code with a macro in the
Linux kernel in order to enable SLS. That was done by copying the
upstream macro definitions, which fixed the objtool complaints.
Since then, several more mitigations were introduced, including
Rethunk. It requires a jump to one of the thunks in order
to work, so the RET macro was changed again. And, as the ZFS code
didn't use the mainline definition, but copied it, this is currently
missing.
Objtool reminds us about it from time to time (Clang 16, CONFIG_RETHUNK=y):
fs/zfs/lua/zlua.o: warning: objtool: setjmp+0x25: 'naked' return
found in RETHUNK build
fs/zfs/lua/zlua.o: warning: objtool: longjmp+0x27: 'naked' return
found in RETHUNK build
Do it the following way:
* if we're building under Linux, unconditionally include
<linux/linkage.h> in the related files. It is available in x86
sources since even pre-2.6 times, so doesn't need any conftests;
* then, if the RET macro is available, it will be used directly, so that
we will always have the version matching the kernel we build against;
* if there's no such macro, we define it as a simple `ret`, as it
was in pre-SLS times.
This ensures we always have the up-to-date definition with no need
to update it manually, and at the same time is safe for the whole
variety of kernels ZFS module supports.
Then, there's a couple more "naked" rets left in the code, they're
just defined as:
.byte 0xf3,0xc3
In fact, this is just:
rep ret
`rep ret` instead of just `ret` seems to mitigate performance issues
on some old AMD processors and most likely makes no sense as of
today.
Anyways, address those rets, so that they will be protected with
Rethunk and SLS. Include <sys/asm_linkage.h> here which now always
has RET definition and replace those constructs with just RET.
This wipes the last couple of places with unpatched rets objtool's
been complaining about.
Reviewed-by: Attila Fülöp <attila@fueloep.org>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Closes #14035
1265 lines
31 KiB
ArmAsm
1265 lines
31 KiB
ArmAsm
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License 2.0 (the "License"). You may not use
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
# in the file LICENSE in the source distribution or at
|
|
# https://www.openssl.org/source/license.html
|
|
|
|
#
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
# project. The module is, however, dual licensed under OpenSSL and
|
|
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
# details see http://www.openssl.org/~appro/cryptogams/.
|
|
# ====================================================================
|
|
#
|
|
#
|
|
# AES-NI-CTR+GHASH stitch.
|
|
#
|
|
# February 2013
|
|
#
|
|
# OpenSSL GCM implementation is organized in such way that its
|
|
# performance is rather close to the sum of its streamed components,
|
|
# in the context parallelized AES-NI CTR and modulo-scheduled
|
|
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
|
|
# was observed to perform significantly better than the sum of the
|
|
# components on contemporary CPUs, the effort was deemed impossible to
|
|
# justify. This module is based on combination of Intel submissions,
|
|
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
|
|
# Locktyukhin of Intel Corp. who verified that it reduces shuffles
|
|
# pressure with notable relative improvement, achieving 1.0 cycle per
|
|
# byte processed with 128-bit key on Haswell processor, 0.74 - on
|
|
# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
|
|
# measurements for favourable packet size, one divisible by 96.
|
|
# Applications using the EVP interface will observe a few percent
|
|
# worse performance.]
|
|
#
|
|
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
|
|
#
|
|
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
|
|
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
|
|
|
|
# Generated once from
|
|
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
|
|
# and modified for ICP. Modifications are kept at a bare minimum to ease later
|
|
# upstream merges.
|
|
|
|
#if defined(__x86_64__) && defined(HAVE_AVX) && \
|
|
defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
|
|
|
|
#define _ASM
|
|
#include <sys/asm_linkage.h>
|
|
|
|
.extern gcm_avx_can_use_movbe
|
|
|
|
.text
|
|
|
|
#ifdef HAVE_MOVBE
|
|
// _aesni_ctr32_ghash_6x: file-local helper, MOVBE flavour of the
// AES-NI-CTR+GHASH stitch described in the file header. It is called from
// aesni_gcm_decrypt and aesni_gcm_encrypt when their gcm_avx_can_use_movbe
// test selects it. Each pass of .Loop6x handles six 16-byte blocks.
//
// Register usage as visible in this routine and its callers:
//   %rdi   data that is XORed into the last AES round; advanced by 96 per pass
//   %rsi   output pointer; advanced by 96 per pass, stores go to -96..-16(%rsi)
//   %rdx   count of 16-byte blocks; reduced by 6 per pass, a borrow ends the loop
//   %rcx   key schedule pointer biased by +128 (all accesses are N-128(%rcx))
//   %r8    memory copy of the counter block (written and re-read every pass)
//   %r9    table pointer biased by +32 (accesses are N-32(%r9)); presumably the
//          GHASH key table -- confirm against the C caller
//   %r10   incremented by 0x60 per pass
//   %r11   constant table: (%r11) vpshufb mask, 16(%r11) vpclmulqdq operand,
//          32(%r11) vpaddb counter increment, 48/64(%r11) vpaddd increments
//   %r14   source of the 96 bytes loaded with movbeq into the stack scratch area
//   %r15   upper bound for %r14 (see the cmpq/setnc sequence below)
//   %ebx   32-bit counter word the callers load from 12(%r8)
//   %ebp   round count, compared against 12 and 14
//   %r12, %r13  scratch
//   %xmm1  counter block, %xmm9-%xmm14 the six AES lanes,
//   %xmm8  hash accumulator, %xmm4/%xmm6/%xmm7 partial products
// Stack: 16+8(%rsp) .. 120+8(%rsp) is scratch space; the "+8" skips the return
// address pushed by the call, so these are the caller's 16(%rsp)..120(%rsp).
// On return %xmm9-%xmm14 hold the last six result blocks (not yet stored) and
// %xmm8 holds the folded accumulator.
.type _aesni_ctr32_ghash_6x,@function
|
|
.align 32
|
|
_aesni_ctr32_ghash_6x:
|
|
.cfi_startproc
|
|
vmovdqu 32(%r11),%xmm2
|
|
subq $6,%rdx
|
|
vpxor %xmm4,%xmm4,%xmm4
|
|
vmovdqu 0-128(%rcx),%xmm15
|
|
vpaddb %xmm2,%xmm1,%xmm10
|
|
vpaddb %xmm2,%xmm10,%xmm11
|
|
vpaddb %xmm2,%xmm11,%xmm12
|
|
vpaddb %xmm2,%xmm12,%xmm13
|
|
vpaddb %xmm2,%xmm13,%xmm14
|
|
vpxor %xmm15,%xmm1,%xmm9
|
|
vmovdqu %xmm4,16+8(%rsp)
|
|
jmp .Loop6x
|
|
|
|
.align 32
|
|
.Loop6x:
|
|
// 100663296 == 6 << 24; a carry out of %ebx diverts to .Lhandle_ctr32.
addl $100663296,%ebx
|
|
jc .Lhandle_ctr32
|
|
vmovdqu 0-32(%r9),%xmm3
|
|
vpaddb %xmm2,%xmm14,%xmm1
|
|
vpxor %xmm15,%xmm10,%xmm10
|
|
vpxor %xmm15,%xmm11,%xmm11
|
|
|
|
.Lresume_ctr32:
|
|
vmovdqu %xmm1,(%r8)
|
|
vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
|
|
vpxor %xmm15,%xmm12,%xmm12
|
|
vmovups 16-128(%rcx),%xmm2
|
|
vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
|
|
// xorq/cmpq here plus setnc/negq/andq further down compute
// %r12 = (%r15 >= %r14, unsigned) ? 0x60 : 0, which is then added to %r14.
xorq %r12,%r12
|
|
cmpq %r14,%r15
|
|
|
|
vaesenc %xmm2,%xmm9,%xmm9
|
|
vmovdqu 48+8(%rsp),%xmm0
|
|
vpxor %xmm15,%xmm13,%xmm13
|
|
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
|
|
vaesenc %xmm2,%xmm10,%xmm10
|
|
vpxor %xmm15,%xmm14,%xmm14
|
|
setnc %r12b
|
|
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
|
|
vaesenc %xmm2,%xmm11,%xmm11
|
|
vmovdqu 16-32(%r9),%xmm3
|
|
negq %r12
|
|
vaesenc %xmm2,%xmm12,%xmm12
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
|
|
vpxor %xmm4,%xmm8,%xmm8
|
|
vaesenc %xmm2,%xmm13,%xmm13
|
|
vpxor %xmm5,%xmm1,%xmm4
|
|
andq $0x60,%r12
|
|
vmovups 32-128(%rcx),%xmm15
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
|
|
vaesenc %xmm2,%xmm14,%xmm14
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
|
|
leaq (%r14,%r12,1),%r14
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor 16+8(%rsp),%xmm8,%xmm8
|
|
vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
|
|
vmovdqu 64+8(%rsp),%xmm0
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
// From here on, 64-bit words at 88(%r14) down to 0(%r14) are loaded
// byte-swapped (movbeq) and parked in the stack scratch area.
movbeq 88(%r14),%r13
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movbeq 80(%r14),%r12
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,32+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,40+8(%rsp)
|
|
vmovdqu 48-32(%r9),%xmm5
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 48-128(%rcx),%xmm15
|
|
vpxor %xmm1,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm2,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpxor %xmm3,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
|
|
vmovdqu 80+8(%rsp),%xmm0
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vpxor %xmm1,%xmm4,%xmm4
|
|
vmovdqu 64-32(%r9),%xmm1
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 64-128(%rcx),%xmm15
|
|
vpxor %xmm2,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm3,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movbeq 72(%r14),%r13
|
|
vpxor %xmm5,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movbeq 64(%r14),%r12
|
|
vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
|
|
vmovdqu 96+8(%rsp),%xmm0
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,48+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,56+8(%rsp)
|
|
vpxor %xmm2,%xmm4,%xmm4
|
|
vmovdqu 96-32(%r9),%xmm2
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 80-128(%rcx),%xmm15
|
|
vpxor %xmm3,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movbeq 56(%r14),%r13
|
|
vpxor %xmm1,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
|
|
vpxor 112+8(%rsp),%xmm8,%xmm8
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movbeq 48(%r14),%r12
|
|
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,64+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,72+8(%rsp)
|
|
vpxor %xmm3,%xmm4,%xmm4
|
|
vmovdqu 112-32(%r9),%xmm3
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 96-128(%rcx),%xmm15
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm1,%xmm6,%xmm6
|
|
vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movbeq 40(%r14),%r13
|
|
vpxor %xmm2,%xmm7,%xmm7
|
|
vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movbeq 32(%r14),%r12
|
|
vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,80+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,88+8(%rsp)
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
vpxor %xmm1,%xmm6,%xmm6
|
|
|
|
vmovups 112-128(%rcx),%xmm15
|
|
vpslldq $8,%xmm6,%xmm5
|
|
vpxor %xmm2,%xmm4,%xmm4
|
|
vmovdqu 16(%r11),%xmm3
|
|
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm8,%xmm7,%xmm7
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpxor %xmm5,%xmm4,%xmm4
|
|
movbeq 24(%r14),%r13
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movbeq 16(%r14),%r12
|
|
vpalignr $8,%xmm4,%xmm4,%xmm0
|
|
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
|
|
movq %r13,96+8(%rsp)
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r12,104+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vmovups 128-128(%rcx),%xmm1
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vaesenc %xmm1,%xmm9,%xmm9
|
|
vmovups 144-128(%rcx),%xmm15
|
|
vaesenc %xmm1,%xmm10,%xmm10
|
|
vpsrldq $8,%xmm6,%xmm6
|
|
vaesenc %xmm1,%xmm11,%xmm11
|
|
vpxor %xmm6,%xmm7,%xmm7
|
|
vaesenc %xmm1,%xmm12,%xmm12
|
|
vpxor %xmm0,%xmm4,%xmm4
|
|
movbeq 8(%r14),%r13
|
|
vaesenc %xmm1,%xmm13,%xmm13
|
|
movbeq 0(%r14),%r12
|
|
vaesenc %xmm1,%xmm14,%xmm14
|
|
vmovups 160-128(%rcx),%xmm1
|
|
cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
|
|
jb .Lenc_tail
|
|
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vaesenc %xmm1,%xmm9,%xmm9
|
|
vaesenc %xmm1,%xmm10,%xmm10
|
|
vaesenc %xmm1,%xmm11,%xmm11
|
|
vaesenc %xmm1,%xmm12,%xmm12
|
|
vaesenc %xmm1,%xmm13,%xmm13
|
|
vmovups 176-128(%rcx),%xmm15
|
|
vaesenc %xmm1,%xmm14,%xmm14
|
|
vmovups 192-128(%rcx),%xmm1
|
|
cmpl $14,%ebp // ICP does not zero key schedule.
|
|
jb .Lenc_tail
|
|
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vaesenc %xmm1,%xmm9,%xmm9
|
|
vaesenc %xmm1,%xmm10,%xmm10
|
|
vaesenc %xmm1,%xmm11,%xmm11
|
|
vaesenc %xmm1,%xmm12,%xmm12
|
|
vaesenc %xmm1,%xmm13,%xmm13
|
|
vmovups 208-128(%rcx),%xmm15
|
|
vaesenc %xmm1,%xmm14,%xmm14
|
|
vmovups 224-128(%rcx),%xmm1
|
|
jmp .Lenc_tail
|
|
|
|
.align 32
|
|
// Taken when the addl on %ebx carries: the counter is shuffled with the
// (%r11) mask, the next blocks are built with 32-bit adds (vpaddd) and each
// result is shuffled back before rejoining the main path.
.Lhandle_ctr32:
|
|
vmovdqu (%r11),%xmm0
|
|
vpshufb %xmm0,%xmm1,%xmm6
|
|
vmovdqu 48(%r11),%xmm5
|
|
vpaddd 64(%r11),%xmm6,%xmm10
|
|
vpaddd %xmm5,%xmm6,%xmm11
|
|
vmovdqu 0-32(%r9),%xmm3
|
|
vpaddd %xmm5,%xmm10,%xmm12
|
|
vpshufb %xmm0,%xmm10,%xmm10
|
|
vpaddd %xmm5,%xmm11,%xmm13
|
|
vpshufb %xmm0,%xmm11,%xmm11
|
|
vpxor %xmm15,%xmm10,%xmm10
|
|
vpaddd %xmm5,%xmm12,%xmm14
|
|
vpshufb %xmm0,%xmm12,%xmm12
|
|
vpxor %xmm15,%xmm11,%xmm11
|
|
vpaddd %xmm5,%xmm13,%xmm1
|
|
vpshufb %xmm0,%xmm13,%xmm13
|
|
vpshufb %xmm0,%xmm14,%xmm14
|
|
vpshufb %xmm0,%xmm1,%xmm1
|
|
jmp .Lresume_ctr32
|
|
|
|
.align 32
|
|
// Last rounds: %xmm1 (the final round key) is XORed with the data at
// 0..80(%rdi) and the result is fed to vaesenclast, so the data XOR is
// folded into the last AES round. The next six counter blocks are prepared
// in %xmm0, %xmm5, %xmm6, %xmm7, %xmm3 alongside.
.Lenc_tail:
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vmovdqu %xmm7,16+8(%rsp)
|
|
vpalignr $8,%xmm4,%xmm4,%xmm8
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
|
|
vpxor 0(%rdi),%xmm1,%xmm2
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vpxor 16(%rdi),%xmm1,%xmm0
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vpxor 32(%rdi),%xmm1,%xmm5
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vpxor 48(%rdi),%xmm1,%xmm6
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
vpxor 64(%rdi),%xmm1,%xmm7
|
|
vpxor 80(%rdi),%xmm1,%xmm3
|
|
vmovdqu (%r8),%xmm1
|
|
|
|
vaesenclast %xmm2,%xmm9,%xmm9
|
|
vmovdqu 32(%r11),%xmm2
|
|
vaesenclast %xmm0,%xmm10,%xmm10
|
|
vpaddb %xmm2,%xmm1,%xmm0
|
|
movq %r13,112+8(%rsp)
|
|
leaq 96(%rdi),%rdi
|
|
vaesenclast %xmm5,%xmm11,%xmm11
|
|
vpaddb %xmm2,%xmm0,%xmm5
|
|
movq %r12,120+8(%rsp)
|
|
leaq 96(%rsi),%rsi
|
|
vmovdqu 0-128(%rcx),%xmm15
|
|
vaesenclast %xmm6,%xmm12,%xmm12
|
|
vpaddb %xmm2,%xmm5,%xmm6
|
|
vaesenclast %xmm7,%xmm13,%xmm13
|
|
vpaddb %xmm2,%xmm6,%xmm7
|
|
vaesenclast %xmm3,%xmm14,%xmm14
|
|
vpaddb %xmm2,%xmm7,%xmm3
|
|
|
|
addq $0x60,%r10
|
|
subq $0x6,%rdx
|
|
// Borrow means no further group of six blocks: leave with results in registers.
jc .L6x_done
|
|
|
|
vmovups %xmm9,-96(%rsi)
|
|
vpxor %xmm15,%xmm1,%xmm9
|
|
vmovups %xmm10,-80(%rsi)
|
|
vmovdqa %xmm0,%xmm10
|
|
vmovups %xmm11,-64(%rsi)
|
|
vmovdqa %xmm5,%xmm11
|
|
vmovups %xmm12,-48(%rsi)
|
|
vmovdqa %xmm6,%xmm12
|
|
vmovups %xmm13,-32(%rsi)
|
|
vmovdqa %xmm7,%xmm13
|
|
vmovups %xmm14,-16(%rsi)
|
|
vmovdqa %xmm3,%xmm14
|
|
vmovdqu 32+8(%rsp),%xmm7
|
|
jmp .Loop6x
|
|
|
|
// Exit: fold the value saved at 16+8(%rsp) and %xmm4 into %xmm8.
.L6x_done:
|
|
vpxor 16+8(%rsp),%xmm8,%xmm8
|
|
vpxor %xmm4,%xmm8,%xmm8
|
|
|
|
RET
|
|
.cfi_endproc
|
|
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
|
|
#endif /* ifdef HAVE_MOVBE */
|
|
|
|
// _aesni_ctr32_ghash_no_movbe_6x: file-local helper with the same structure
// as the MOVBE flavour of the AES-NI-CTR+GHASH stitch, except that every
// byte-swapped 64-bit load is done with movq followed by bswapq instead of
// movbeq. aesni_gcm_decrypt and aesni_gcm_encrypt call it when MOVBE is not
// selected (or when HAVE_MOVBE is not defined). Each pass of .Loop6x_nmb
// handles six 16-byte blocks.
//
// Register usage as visible in this routine and its callers:
//   %rdi   data that is XORed into the last AES round; advanced by 96 per pass
//   %rsi   output pointer; advanced by 96 per pass, stores go to -96..-16(%rsi)
//   %rdx   count of 16-byte blocks; reduced by 6 per pass, a borrow ends the loop
//   %rcx   key schedule pointer biased by +128 (all accesses are N-128(%rcx))
//   %r8    memory copy of the counter block (written and re-read every pass)
//   %r9    table pointer biased by +32 (accesses are N-32(%r9)); presumably the
//          GHASH key table -- confirm against the C caller
//   %r10   incremented by 0x60 per pass
//   %r11   constant table: (%r11) vpshufb mask, 16(%r11) vpclmulqdq operand,
//          32(%r11) vpaddb counter increment, 48/64(%r11) vpaddd increments
//   %r14   source of the 96 bytes loaded and byte-swapped into the stack scratch
//   %r15   upper bound for %r14 (see the cmpq/setnc sequence below)
//   %ebx   32-bit counter word the callers load from 12(%r8)
//   %ebp   round count, compared against 12 and 14
//   %r12, %r13  scratch
//   %xmm1  counter block, %xmm9-%xmm14 the six AES lanes,
//   %xmm8  hash accumulator, %xmm4/%xmm6/%xmm7 partial products
// Stack: 16+8(%rsp) .. 120+8(%rsp) is scratch space; the "+8" skips the return
// address pushed by the call, so these are the caller's 16(%rsp)..120(%rsp).
// On return %xmm9-%xmm14 hold the last six result blocks (not yet stored) and
// %xmm8 holds the folded accumulator.
.type _aesni_ctr32_ghash_no_movbe_6x,@function
|
|
.align 32
|
|
_aesni_ctr32_ghash_no_movbe_6x:
|
|
.cfi_startproc
|
|
vmovdqu 32(%r11),%xmm2
|
|
subq $6,%rdx
|
|
vpxor %xmm4,%xmm4,%xmm4
|
|
vmovdqu 0-128(%rcx),%xmm15
|
|
vpaddb %xmm2,%xmm1,%xmm10
|
|
vpaddb %xmm2,%xmm10,%xmm11
|
|
vpaddb %xmm2,%xmm11,%xmm12
|
|
vpaddb %xmm2,%xmm12,%xmm13
|
|
vpaddb %xmm2,%xmm13,%xmm14
|
|
vpxor %xmm15,%xmm1,%xmm9
|
|
vmovdqu %xmm4,16+8(%rsp)
|
|
jmp .Loop6x_nmb
|
|
|
|
.align 32
|
|
.Loop6x_nmb:
|
|
// 100663296 == 6 << 24; a carry out of %ebx diverts to .Lhandle_ctr32_nmb.
addl $100663296,%ebx
|
|
jc .Lhandle_ctr32_nmb
|
|
vmovdqu 0-32(%r9),%xmm3
|
|
vpaddb %xmm2,%xmm14,%xmm1
|
|
vpxor %xmm15,%xmm10,%xmm10
|
|
vpxor %xmm15,%xmm11,%xmm11
|
|
|
|
.Lresume_ctr32_nmb:
|
|
vmovdqu %xmm1,(%r8)
|
|
vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
|
|
vpxor %xmm15,%xmm12,%xmm12
|
|
vmovups 16-128(%rcx),%xmm2
|
|
vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
|
|
// xorq/cmpq here plus setnc/negq/andq further down compute
// %r12 = (%r15 >= %r14, unsigned) ? 0x60 : 0, which is then added to %r14.
xorq %r12,%r12
|
|
cmpq %r14,%r15
|
|
|
|
vaesenc %xmm2,%xmm9,%xmm9
|
|
vmovdqu 48+8(%rsp),%xmm0
|
|
vpxor %xmm15,%xmm13,%xmm13
|
|
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
|
|
vaesenc %xmm2,%xmm10,%xmm10
|
|
vpxor %xmm15,%xmm14,%xmm14
|
|
setnc %r12b
|
|
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
|
|
vaesenc %xmm2,%xmm11,%xmm11
|
|
vmovdqu 16-32(%r9),%xmm3
|
|
negq %r12
|
|
vaesenc %xmm2,%xmm12,%xmm12
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
|
|
vpxor %xmm4,%xmm8,%xmm8
|
|
vaesenc %xmm2,%xmm13,%xmm13
|
|
vpxor %xmm5,%xmm1,%xmm4
|
|
andq $0x60,%r12
|
|
vmovups 32-128(%rcx),%xmm15
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
|
|
vaesenc %xmm2,%xmm14,%xmm14
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
|
|
leaq (%r14,%r12,1),%r14
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor 16+8(%rsp),%xmm8,%xmm8
|
|
vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
|
|
vmovdqu 64+8(%rsp),%xmm0
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
// From here on, 64-bit words at 88(%r14) down to 0(%r14) are loaded,
// byte-swapped with bswapq and parked in the stack scratch area.
movq 88(%r14),%r13
|
|
bswapq %r13
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movq 80(%r14),%r12
|
|
bswapq %r12
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,32+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,40+8(%rsp)
|
|
vmovdqu 48-32(%r9),%xmm5
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 48-128(%rcx),%xmm15
|
|
vpxor %xmm1,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm2,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpxor %xmm3,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
|
|
vmovdqu 80+8(%rsp),%xmm0
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vpxor %xmm1,%xmm4,%xmm4
|
|
vmovdqu 64-32(%r9),%xmm1
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 64-128(%rcx),%xmm15
|
|
vpxor %xmm2,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm3,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movq 72(%r14),%r13
|
|
bswapq %r13
|
|
vpxor %xmm5,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movq 64(%r14),%r12
|
|
bswapq %r12
|
|
vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
|
|
vmovdqu 96+8(%rsp),%xmm0
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,48+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,56+8(%rsp)
|
|
vpxor %xmm2,%xmm4,%xmm4
|
|
vmovdqu 96-32(%r9),%xmm2
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 80-128(%rcx),%xmm15
|
|
vpxor %xmm3,%xmm6,%xmm6
|
|
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movq 56(%r14),%r13
|
|
bswapq %r13
|
|
vpxor %xmm1,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
|
|
vpxor 112+8(%rsp),%xmm8,%xmm8
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movq 48(%r14),%r12
|
|
bswapq %r12
|
|
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,64+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,72+8(%rsp)
|
|
vpxor %xmm3,%xmm4,%xmm4
|
|
vmovdqu 112-32(%r9),%xmm3
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vmovups 96-128(%rcx),%xmm15
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm1,%xmm6,%xmm6
|
|
vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
movq 40(%r14),%r13
|
|
bswapq %r13
|
|
vpxor %xmm2,%xmm7,%xmm7
|
|
vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movq 32(%r14),%r12
|
|
bswapq %r12
|
|
vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r13,80+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
movq %r12,88+8(%rsp)
|
|
vpxor %xmm5,%xmm6,%xmm6
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
vpxor %xmm1,%xmm6,%xmm6
|
|
|
|
vmovups 112-128(%rcx),%xmm15
|
|
vpslldq $8,%xmm6,%xmm5
|
|
vpxor %xmm2,%xmm4,%xmm4
|
|
vmovdqu 16(%r11),%xmm3
|
|
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor %xmm8,%xmm7,%xmm7
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpxor %xmm5,%xmm4,%xmm4
|
|
movq 24(%r14),%r13
|
|
bswapq %r13
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
movq 16(%r14),%r12
|
|
bswapq %r12
|
|
vpalignr $8,%xmm4,%xmm4,%xmm0
|
|
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
|
|
movq %r13,96+8(%rsp)
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
movq %r12,104+8(%rsp)
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vmovups 128-128(%rcx),%xmm1
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vaesenc %xmm1,%xmm9,%xmm9
|
|
vmovups 144-128(%rcx),%xmm15
|
|
vaesenc %xmm1,%xmm10,%xmm10
|
|
vpsrldq $8,%xmm6,%xmm6
|
|
vaesenc %xmm1,%xmm11,%xmm11
|
|
vpxor %xmm6,%xmm7,%xmm7
|
|
vaesenc %xmm1,%xmm12,%xmm12
|
|
vpxor %xmm0,%xmm4,%xmm4
|
|
movq 8(%r14),%r13
|
|
bswapq %r13
|
|
vaesenc %xmm1,%xmm13,%xmm13
|
|
movq 0(%r14),%r12
|
|
bswapq %r12
|
|
vaesenc %xmm1,%xmm14,%xmm14
|
|
vmovups 160-128(%rcx),%xmm1
|
|
cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
|
|
jb .Lenc_tail_nmb
|
|
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vaesenc %xmm1,%xmm9,%xmm9
|
|
vaesenc %xmm1,%xmm10,%xmm10
|
|
vaesenc %xmm1,%xmm11,%xmm11
|
|
vaesenc %xmm1,%xmm12,%xmm12
|
|
vaesenc %xmm1,%xmm13,%xmm13
|
|
vmovups 176-128(%rcx),%xmm15
|
|
vaesenc %xmm1,%xmm14,%xmm14
|
|
vmovups 192-128(%rcx),%xmm1
|
|
cmpl $14,%ebp // ICP does not zero key schedule.
|
|
jb .Lenc_tail_nmb
|
|
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
|
|
vaesenc %xmm1,%xmm9,%xmm9
|
|
vaesenc %xmm1,%xmm10,%xmm10
|
|
vaesenc %xmm1,%xmm11,%xmm11
|
|
vaesenc %xmm1,%xmm12,%xmm12
|
|
vaesenc %xmm1,%xmm13,%xmm13
|
|
vmovups 208-128(%rcx),%xmm15
|
|
vaesenc %xmm1,%xmm14,%xmm14
|
|
vmovups 224-128(%rcx),%xmm1
|
|
jmp .Lenc_tail_nmb
|
|
|
|
.align 32
|
|
// Taken when the addl on %ebx carries: the counter is shuffled with the
// (%r11) mask, the next blocks are built with 32-bit adds (vpaddd) and each
// result is shuffled back before rejoining the main path.
.Lhandle_ctr32_nmb:
|
|
vmovdqu (%r11),%xmm0
|
|
vpshufb %xmm0,%xmm1,%xmm6
|
|
vmovdqu 48(%r11),%xmm5
|
|
vpaddd 64(%r11),%xmm6,%xmm10
|
|
vpaddd %xmm5,%xmm6,%xmm11
|
|
vmovdqu 0-32(%r9),%xmm3
|
|
vpaddd %xmm5,%xmm10,%xmm12
|
|
vpshufb %xmm0,%xmm10,%xmm10
|
|
vpaddd %xmm5,%xmm11,%xmm13
|
|
vpshufb %xmm0,%xmm11,%xmm11
|
|
vpxor %xmm15,%xmm10,%xmm10
|
|
vpaddd %xmm5,%xmm12,%xmm14
|
|
vpshufb %xmm0,%xmm12,%xmm12
|
|
vpxor %xmm15,%xmm11,%xmm11
|
|
vpaddd %xmm5,%xmm13,%xmm1
|
|
vpshufb %xmm0,%xmm13,%xmm13
|
|
vpshufb %xmm0,%xmm14,%xmm14
|
|
vpshufb %xmm0,%xmm1,%xmm1
|
|
jmp .Lresume_ctr32_nmb
|
|
|
|
.align 32
|
|
// Last rounds: %xmm1 (the final round key) is XORed with the data at
// 0..80(%rdi) and the result is fed to vaesenclast, so the data XOR is
// folded into the last AES round. The next six counter blocks are prepared
// in %xmm0, %xmm5, %xmm6, %xmm7, %xmm3 alongside.
.Lenc_tail_nmb:
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vmovdqu %xmm7,16+8(%rsp)
|
|
vpalignr $8,%xmm4,%xmm4,%xmm8
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
|
|
vpxor 0(%rdi),%xmm1,%xmm2
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vpxor 16(%rdi),%xmm1,%xmm0
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vpxor 32(%rdi),%xmm1,%xmm5
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vpxor 48(%rdi),%xmm1,%xmm6
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
vpxor 64(%rdi),%xmm1,%xmm7
|
|
vpxor 80(%rdi),%xmm1,%xmm3
|
|
vmovdqu (%r8),%xmm1
|
|
|
|
vaesenclast %xmm2,%xmm9,%xmm9
|
|
vmovdqu 32(%r11),%xmm2
|
|
vaesenclast %xmm0,%xmm10,%xmm10
|
|
vpaddb %xmm2,%xmm1,%xmm0
|
|
movq %r13,112+8(%rsp)
|
|
leaq 96(%rdi),%rdi
|
|
vaesenclast %xmm5,%xmm11,%xmm11
|
|
vpaddb %xmm2,%xmm0,%xmm5
|
|
movq %r12,120+8(%rsp)
|
|
leaq 96(%rsi),%rsi
|
|
vmovdqu 0-128(%rcx),%xmm15
|
|
vaesenclast %xmm6,%xmm12,%xmm12
|
|
vpaddb %xmm2,%xmm5,%xmm6
|
|
vaesenclast %xmm7,%xmm13,%xmm13
|
|
vpaddb %xmm2,%xmm6,%xmm7
|
|
vaesenclast %xmm3,%xmm14,%xmm14
|
|
vpaddb %xmm2,%xmm7,%xmm3
|
|
|
|
addq $0x60,%r10
|
|
subq $0x6,%rdx
|
|
// Borrow means no further group of six blocks: leave with results in registers.
jc .L6x_done_nmb
|
|
|
|
vmovups %xmm9,-96(%rsi)
|
|
vpxor %xmm15,%xmm1,%xmm9
|
|
vmovups %xmm10,-80(%rsi)
|
|
vmovdqa %xmm0,%xmm10
|
|
vmovups %xmm11,-64(%rsi)
|
|
vmovdqa %xmm5,%xmm11
|
|
vmovups %xmm12,-48(%rsi)
|
|
vmovdqa %xmm6,%xmm12
|
|
vmovups %xmm13,-32(%rsi)
|
|
vmovdqa %xmm7,%xmm13
|
|
vmovups %xmm14,-16(%rsi)
|
|
vmovdqa %xmm3,%xmm14
|
|
vmovdqu 32+8(%rsp),%xmm7
|
|
jmp .Loop6x_nmb
|
|
|
|
// Exit: fold the value saved at 16+8(%rsp) and %xmm4 into %xmm8.
.L6x_done_nmb:
|
|
vpxor 16+8(%rsp),%xmm8,%xmm8
|
|
vpxor %xmm4,%xmm8,%xmm8
|
|
|
|
RET
|
|
.cfi_endproc
|
|
.size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x
|
|
|
|
/*
 * aesni_gcm_decrypt -- exported entry point of the AES-NI-CTR+GHASH stitch
 * for the decrypt direction (System V AMD64 argument registers).
 *
 * Inputs, as used by the code below:
 *   %rdi  input buffer (96 bytes are pre-loaded and byte-swapped for hashing)
 *   %rsi  output buffer
 *   %rdx  length in bytes; anything below 0x60 aborts immediately
 *   %rcx  key schedule; the round count is read from offset 504 of it
 *   %r8   16-byte counter block (also re-written by the helpers)
 *   %r9   hash context: 16 bytes of state at 0(%r9) and, at 32(%r9), a
 *         pointer to a table used by the helpers (presumably the GHASH key
 *         table -- confirm against the C caller)
 * Returns in %rax the value accumulated in %r10 by the helper (0x60 per
 * six-block pass), or 0 when the length check fails.
 *
 * Frame: %rax keeps the entry %rsp; %rbx, %rbp, %r12-%r15 and the incoming
 * %r9 are pushed below it, then %rsp is lowered and aligned to 128 bytes to
 * provide the scratch area shared with the helpers.
 */
.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,@function
.align	32
aesni_gcm_decrypt:
.cfi_startproc
	xorq	%r10,%r10
	cmpq	$0x60,%rdx
	jb	.Lgcm_dec_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	pushq	%r9
.cfi_offset	%r9,-64
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	vmovdqu	(%r9),%xmm8
	andq	$-128,%rsp
	vmovdqu	(%r11),%xmm0
	leaq	128(%rcx),%rcx
	movq	32(%r9),%r9
	leaq	32(%r9),%r9
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.
	vpshufb	%xmm0,%xmm8,%xmm8

	/*
	 * Compare bits 7..11 of the stack pointer with those of the key
	 * schedule address and, if the frame sits less than 768 bytes above
	 * the key in that window, move the frame down by the difference.
	 * Presumably this avoids cache-set aliasing between the two.
	 */
	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Ldec_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Ldec_no_key_aliasing
	subq	%r15,%rsp
.Ldec_no_key_aliasing:

	/*
	 * Pre-load the first six input blocks, byte-swap them and park five
	 * of them at 48..112(%rsp); the sixth stays in %xmm7. The helper
	 * reads these slots as 48+8..112+8(%rsp) after the call.
	 */
	vmovdqu	80(%rdi),%xmm7
	leaq	(%rdi),%r14
	vmovdqu	64(%rdi),%xmm4
	leaq	-192(%rdi,%rdx,1),%r15
	vmovdqu	48(%rdi),%xmm5
	shrq	$4,%rdx
	xorq	%r10,%r10
	vmovdqu	32(%rdi),%xmm6
	vpshufb	%xmm0,%xmm7,%xmm7
	vmovdqu	16(%rdi),%xmm2
	vpshufb	%xmm0,%xmm4,%xmm4
	vmovdqu	(%rdi),%xmm3
	vpshufb	%xmm0,%xmm5,%xmm5
	vmovdqu	%xmm4,48(%rsp)
	vpshufb	%xmm0,%xmm6,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm2,%xmm2
	vmovdqu	%xmm6,80(%rsp)
	vpshufb	%xmm0,%xmm3,%xmm3
	vmovdqu	%xmm2,96(%rsp)
	vmovdqu	%xmm3,112(%rsp)

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	/*
	 * sym@GOTPCREL(%rip) addresses the GOT slot, which holds the ADDRESS
	 * of the variable. Testing the slot directly would test bit 0 of that
	 * address rather than the flag, so load the address first and test
	 * the flag through it. %r12 is free here: it was saved above and both
	 * helpers overwrite it before reading it.
	 */
	movq	gcm_avx_can_use_movbe@GOTPCREL(%rip),%r12
	testl	$1,(%r12)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	/* The helper returns with the last six result blocks still in registers. */
	vmovups	%xmm9,-96(%rsi)
	vmovups	%xmm10,-80(%rsi)
	vmovups	%xmm11,-64(%rsi)
	vmovups	%xmm12,-48(%rsi)
	vmovups	%xmm13,-32(%rsi)
	vmovups	%xmm14,-16(%rsi)

	/* Byte-swap the accumulator back and store it through the saved %r9. */
	vpshufb	(%r11),%xmm8,%xmm8
	movq	-56(%rax),%r9
.cfi_restore	%r9
	vmovdqu	%xmm8,(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_dec_abort:
	movq	%r10,%rax
	RET
.cfi_endproc
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
|
|
// _aesni_ctr32_6x: file-local helper that runs six counter blocks through
// the AES rounds, XORs the result with 96 bytes at (%rdi) and stores 96 bytes
// at (%rsi). Both pointers are advanced by 96. There is no hashing here.
//
// Inputs as used below:
//   %xmm1  current counter block; replaced by the counter for the next call
//   %xmm0  vpshufb mask, only used on the .Lhandle_ctr32_2 path (the caller
//          must have loaded it)
//   %rcx   key schedule pointer biased by +128 (accesses are N-128(%rcx))
//   %rbp   round count; %rbp - 2 loop passes are made in .Loop_ctr32
//   %ebx   32-bit counter word; a carry out of the addl selects the slow path
//   %r11   constant table: 32(%r11) vpaddb increment, 48/64(%r11) vpaddd ones
// On return %xmm9-%xmm14 still hold the six blocks that were just stored.
// Overwrites %r12, %r13, %xmm2-%xmm6, %xmm8, %xmm15 and the lanes above.
.type _aesni_ctr32_6x,@function
|
|
.align 32
|
|
_aesni_ctr32_6x:
|
|
.cfi_startproc
|
|
vmovdqu 0-128(%rcx),%xmm4
|
|
vmovdqu 32(%r11),%xmm2
|
|
leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds.
|
|
vmovups 16-128(%rcx),%xmm15
|
|
leaq 32-128(%rcx),%r12
|
|
vpxor %xmm4,%xmm1,%xmm9
|
|
// 100663296 == 6 << 24; a carry out of %ebx diverts to .Lhandle_ctr32_2.
addl $100663296,%ebx
|
|
jc .Lhandle_ctr32_2
|
|
vpaddb %xmm2,%xmm1,%xmm10
|
|
vpaddb %xmm2,%xmm10,%xmm11
|
|
vpxor %xmm4,%xmm10,%xmm10
|
|
vpaddb %xmm2,%xmm11,%xmm12
|
|
vpxor %xmm4,%xmm11,%xmm11
|
|
vpaddb %xmm2,%xmm12,%xmm13
|
|
vpxor %xmm4,%xmm12,%xmm12
|
|
vpaddb %xmm2,%xmm13,%xmm14
|
|
vpxor %xmm4,%xmm13,%xmm13
|
|
vpaddb %xmm2,%xmm14,%xmm1
|
|
vpxor %xmm4,%xmm14,%xmm14
|
|
jmp .Loop_ctr32
|
|
|
|
.align 16
|
|
// One AES round on all six lanes per pass; %r12 walks the key schedule in
// 16-byte steps and %r13d counts the passes down to zero.
.Loop_ctr32:
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
vmovups (%r12),%xmm15
|
|
leaq 16(%r12),%r12
|
|
decl %r13d
|
|
jnz .Loop_ctr32
|
|
|
|
// %xmm3 = final round key; it is XORed with the data at 0..80(%rdi) so that
// vaesenclast performs the last round and the data XOR in one step.
vmovdqu (%r12),%xmm3
|
|
vaesenc %xmm15,%xmm9,%xmm9
|
|
vpxor 0(%rdi),%xmm3,%xmm4
|
|
vaesenc %xmm15,%xmm10,%xmm10
|
|
vpxor 16(%rdi),%xmm3,%xmm5
|
|
vaesenc %xmm15,%xmm11,%xmm11
|
|
vpxor 32(%rdi),%xmm3,%xmm6
|
|
vaesenc %xmm15,%xmm12,%xmm12
|
|
vpxor 48(%rdi),%xmm3,%xmm8
|
|
vaesenc %xmm15,%xmm13,%xmm13
|
|
vpxor 64(%rdi),%xmm3,%xmm2
|
|
vaesenc %xmm15,%xmm14,%xmm14
|
|
vpxor 80(%rdi),%xmm3,%xmm3
|
|
leaq 96(%rdi),%rdi
|
|
|
|
vaesenclast %xmm4,%xmm9,%xmm9
|
|
vaesenclast %xmm5,%xmm10,%xmm10
|
|
vaesenclast %xmm6,%xmm11,%xmm11
|
|
vaesenclast %xmm8,%xmm12,%xmm12
|
|
vaesenclast %xmm2,%xmm13,%xmm13
|
|
vaesenclast %xmm3,%xmm14,%xmm14
|
|
vmovups %xmm9,0(%rsi)
|
|
vmovups %xmm10,16(%rsi)
|
|
vmovups %xmm11,32(%rsi)
|
|
vmovups %xmm12,48(%rsi)
|
|
vmovups %xmm13,64(%rsi)
|
|
vmovups %xmm14,80(%rsi)
|
|
leaq 96(%rsi),%rsi
|
|
|
|
RET
|
|
.align 32
|
|
// Slow path for a carry out of %ebx: shuffle the counter with %xmm0, build
// the next blocks with 32-bit adds (vpaddd), shuffle each back and XOR in
// the round-0 key (%xmm4) before joining the round loop.
.Lhandle_ctr32_2:
|
|
vpshufb %xmm0,%xmm1,%xmm6
|
|
vmovdqu 48(%r11),%xmm5
|
|
vpaddd 64(%r11),%xmm6,%xmm10
|
|
vpaddd %xmm5,%xmm6,%xmm11
|
|
vpaddd %xmm5,%xmm10,%xmm12
|
|
vpshufb %xmm0,%xmm10,%xmm10
|
|
vpaddd %xmm5,%xmm11,%xmm13
|
|
vpshufb %xmm0,%xmm11,%xmm11
|
|
vpxor %xmm4,%xmm10,%xmm10
|
|
vpaddd %xmm5,%xmm12,%xmm14
|
|
vpshufb %xmm0,%xmm12,%xmm12
|
|
vpxor %xmm4,%xmm11,%xmm11
|
|
vpaddd %xmm5,%xmm13,%xmm1
|
|
vpshufb %xmm0,%xmm13,%xmm13
|
|
vpxor %xmm4,%xmm12,%xmm12
|
|
vpshufb %xmm0,%xmm14,%xmm14
|
|
vpxor %xmm4,%xmm13,%xmm13
|
|
vpshufb %xmm0,%xmm1,%xmm1
|
|
vpxor %xmm4,%xmm14,%xmm14
|
|
jmp .Loop_ctr32
|
|
.cfi_endproc
|
|
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
|
|
|
|
/*
 * aesni_gcm_encrypt
 *
 * Register use, as far as this routine itself shows (System V AMD64 argument
 * order).  NOTE(review): the C-level meaning of each argument is presumed
 * from how it is used below; confirm against the C prototype.
 *
 *   %rdi  not referenced in this routine; presumably the input pointer
 *         consumed by the helper routines
 *   %rsi  output pointer; six 16-byte blocks are stored at -96(%rsi) ..
 *         -16(%rsi) after the bulk helper returns
 *   %rdx  length in bytes; anything below 288 returns 0 immediately
 *   %rcx  key schedule; the 32-bit round count is read at byte offset 504
 *   %r8   16-byte counter block; the dword at offset 12 is kept in %ebx
 *   %r9   16-byte hash state, loaded after the counter-mode warm-up and
 *         written back in place on exit; the quadword at offset 32 is a
 *         pointer to the table used by the carry-less multiplies
 *
 * Returns %r10 in %rax: 0 on the early-abort path.  Otherwise %r10 is set
 * to 192 before the bulk helper is called (the helper presumably advances
 * it; that code is not part of this routine).
 *
 * The entry %rsp is kept in %rax for the whole body, so the helpers called
 * from here must leave %rax alone.  %rbx, %rbp, %r12-%r15 and %r9 are saved
 * below that address and reloaded from it in the epilogue.
 */
.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,@function
.align	32
aesni_gcm_encrypt:
.cfi_startproc
	/* Default return value; inputs shorter than 288 bytes are refused. */
	xorq	%r10,%r10
	cmpq	$288,%rdx
	jb	.Lgcm_enc_abort

	/* %rax = entry %rsp: frame base for the saves and for the epilogue. */
	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	/* %r9 is saved as well: it is repointed below and needed on exit. */
	pushq	%r9
.cfi_offset	%r9,-64
	vzeroupper

	/*
	 * Load the counter block and byte-swap mask, reserve at least 128
	 * bytes of scratch and align %rsp to 128.  %rcx is biased by +128,
	 * so the round count at original offset 504 is read as 504-128.
	 */
	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	leaq	128(%rcx),%rcx
	vmovdqu	(%r11),%xmm0
	andq	$-128,%rsp
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.

	/*
	 * Compare address bits 7..11 (mask 0xf80) of (key - 128) and of the
	 * stack pointer.  If the stack is 0..767 bytes above the key at that
	 * granularity, move the stack down by the difference.
	 */
	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Lenc_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Lenc_no_key_aliasing
	subq	%r15,%rsp
.Lenc_no_key_aliasing:

	/* %r14 = out, %r15 = out + len - 192, %rdx = len / 16. */
	leaq	(%rsi),%r14
	leaq	-192(%rsi,%rdx,1),%r15
	shrq	$4,%rdx

	/*
	 * First helper call.  Its results in %xmm9..%xmm14 are byte-swapped
	 * with the mask in %xmm0; five of them are parked on the stack at
	 * 48..112(%rsp) and the sixth stays in %xmm7.
	 */
	call	_aesni_ctr32_6x
	vpshufb	%xmm0,%xmm9,%xmm8
	vpshufb	%xmm0,%xmm10,%xmm2
	vmovdqu	%xmm8,112(%rsp)
	vpshufb	%xmm0,%xmm11,%xmm4
	vmovdqu	%xmm2,96(%rsp)
	vpshufb	%xmm0,%xmm12,%xmm5
	vmovdqu	%xmm4,80(%rsp)
	vpshufb	%xmm0,%xmm13,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm14,%xmm7
	vmovdqu	%xmm6,48(%rsp)

	call	_aesni_ctr32_6x

	/*
	 * Load the hash state, follow the pointer stored at offset 32 of it
	 * and bias that pointer by +32 (table entries are then addressed as
	 * N-32(%r9)).  12 blocks = 192 bytes are accounted for so far.
	 */
	vmovdqu	(%r9),%xmm8
	movq	32(%r9),%r9
	leaq	32(%r9),%r9
	subq	$12,%rdx
	movq	$192,%r10
	vpshufb	%xmm0,%xmm8,%xmm8

	/* Pick the bulk routine: MOVBE variant only if the flag word says so. */
#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	/*
	 * sym@GOTPCREL(%rip) names the GOT slot, which holds the ADDRESS of
	 * the variable.  Testing that slot directly would test bit 0 of the
	 * address, not the flag.  Load the address first, then test the flag.
	 * %r10 is borrowed as scratch and reloaded with 192 afterwards; mov
	 * leaves the flags from testl intact for the jz below.
	 */
	movq	gcm_avx_can_use_movbe@GOTPCREL(%rip),%r10
	testl	$1,(%r10)
	movq	$192,%r10
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	/*
	 * Tail.  Store the last six result blocks below %rsi and keep
	 * byte-swapped copies in %xmm9..%xmm14.  Then fold the blocks still
	 * pending on the stack, followed by those six, into %xmm8 using
	 * carry-less multiplies against the table at (%r9).  The reduction
	 * steps multiply by the constant at 16(%r11), which is .Lpoly
	 * provided %r11 still points at .Lbswap_mask (the helpers are
	 * presumed to preserve it).
	 */
	vmovdqu	32(%rsp),%xmm7
	vmovdqu	(%r11),%xmm0
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm7,%xmm7,%xmm1
	vmovdqu	32-32(%r9),%xmm15
	vmovups	%xmm9,-96(%rsi)
	vpshufb	%xmm0,%xmm9,%xmm9
	vpxor	%xmm7,%xmm1,%xmm1
	vmovups	%xmm10,-80(%rsi)
	vpshufb	%xmm0,%xmm10,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vpshufb	%xmm0,%xmm11,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vpshufb	%xmm0,%xmm12,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vpshufb	%xmm0,%xmm13,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vpshufb	%xmm0,%xmm14,%xmm14
	vmovdqu	%xmm9,16(%rsp)
	vmovdqu	48(%rsp),%xmm6
	vmovdqu	16-32(%r9),%xmm0
	vpunpckhqdq	%xmm6,%xmm6,%xmm2
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
	vpxor	%xmm6,%xmm2,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1

	vmovdqu	64(%rsp),%xmm9
	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm9,%xmm9,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
	vpxor	%xmm9,%xmm5,%xmm5
	vpxor	%xmm7,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vmovdqu	80(%rsp),%xmm1
	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm4,%xmm7,%xmm7
	vpunpckhqdq	%xmm1,%xmm1,%xmm4
	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpxor	%xmm6,%xmm9,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	96(%rsp),%xmm2
	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm7,%xmm6,%xmm6
	vpunpckhqdq	%xmm2,%xmm2,%xmm7
	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
	vpxor	%xmm2,%xmm7,%xmm7
	vpxor	%xmm9,%xmm1,%xmm1
	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm5,%xmm4,%xmm4

	/* The running hash in %xmm8 is folded into the block at 112(%rsp). */
	vpxor	112(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
	vmovdqu	112-32(%r9),%xmm0
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpxor	%xmm6,%xmm5,%xmm5
	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm1,%xmm2,%xmm2
	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
	vpxor	%xmm4,%xmm7,%xmm4

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm1
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
	vpxor	%xmm14,%xmm1,%xmm1
	vpxor	%xmm5,%xmm6,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
	vmovdqu	32-32(%r9),%xmm15
	vpxor	%xmm2,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm6

	vmovdqu	16-32(%r9),%xmm0
	vpxor	%xmm5,%xmm7,%xmm9
	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
	vpxor	%xmm9,%xmm6,%xmm6
	vpunpckhqdq	%xmm13,%xmm13,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
	vpxor	%xmm13,%xmm2,%xmm2
	vpslldq	$8,%xmm6,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
	vpxor	%xmm9,%xmm5,%xmm8
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm6,%xmm7,%xmm7

	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm12,%xmm12,%xmm9
	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
	vpxor	%xmm12,%xmm9,%xmm9
	vpxor	%xmm14,%xmm13,%xmm13
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm11,%xmm11,%xmm1
	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
	vpxor	%xmm11,%xmm1,%xmm1
	vpxor	%xmm13,%xmm12,%xmm12
	vxorps	16(%rsp),%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm9,%xmm9

	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm10,%xmm10,%xmm2
	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
	vpxor	%xmm10,%xmm2,%xmm2
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpxor	%xmm12,%xmm11,%xmm11
	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm9,%xmm1,%xmm1

	vxorps	%xmm7,%xmm14,%xmm14
	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
	vmovdqu	112-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm11,%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
	vpxor	%xmm4,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
	vpxor	%xmm10,%xmm7,%xmm7
	vpxor	%xmm2,%xmm6,%xmm6

	vpxor	%xmm5,%xmm7,%xmm4
	vpxor	%xmm4,%xmm6,%xmm6
	vpslldq	$8,%xmm6,%xmm1
	vmovdqu	16(%r11),%xmm3
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm1,%xmm5,%xmm8
	vpxor	%xmm6,%xmm7,%xmm7

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm2,%xmm8,%xmm8

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm7,%xmm2,%xmm2
	vpxor	%xmm2,%xmm8,%xmm8
	/* Byte-swap the result and store it through the caller's %r9. */
	vpshufb	(%r11),%xmm8,%xmm8
	movq	-56(%rax),%r9
.cfi_restore	%r9
	vmovdqu	%xmm8,(%r9)

	/* Epilogue: reload the saved registers relative to the entry %rsp. */
	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_enc_abort:
	/* Shared exit: %r10 is 0 when reached via the early abort. */
	movq	%r10,%rax
	RET
.cfi_endproc
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
|
|
|
|
/* Some utility routines */
|
|
|
|
/*
 * void clear_fpu_regs_avx(void);
 *
 * Wipes the AVX register file with a single vzeroall, which zeroes the
 * vector registers %ymm0-%ymm15.  No arguments and no return value.
 */
.globl	clear_fpu_regs_avx
.type	clear_fpu_regs_avx,@function
.p2align 5
clear_fpu_regs_avx:
	vzeroall
	RET
.size	clear_fpu_regs_avx,.-clear_fpu_regs_avx
|
|
|
|
/*
 * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 * dst[0..15] ^= src[0..15] for one 128-bit block.  Neither pointer needs
 * to be aligned: both blocks are fetched with movdqu before the XOR, since
 * pxor with a memory operand would require 16-byte alignment.
 *
 * In:    %rdi = src, %rsi = dst
 * Clobb: %xmm0 (result), %xmm1 (previous dst contents)
 *
 * Vector registers are used, so kernel callers must have the FPU state
 * saved before calling this.
 */
.globl	gcm_xor_avx
.type	gcm_xor_avx,@function
.p2align 5
gcm_xor_avx:
	movdqu	(%rsi), %xmm1		/* xmm1 = *dst */
	movdqu	(%rdi), %xmm0		/* xmm0 = *src */
	pxor	%xmm1, %xmm0		/* xmm0 = *src ^ *dst */
	movdqu	%xmm0, (%rsi)		/* *dst = xmm0 */
	RET
.size	gcm_xor_avx,.-gcm_xor_avx
|
|
|
|
/*
 * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 *
 * Atomically flips bit 0 of the 32-bit word at (%rdi) with a locked xor
 * and reports the outcome: %eax is 1 if the word is non-zero after the
 * flip and 0 if it became zero.
 *
 * In:    %rdi = pointer to the word
 * Out:   %eax = 0 or 1
 */
.globl	atomic_toggle_boolean_nv
.type	atomic_toggle_boolean_nv,@function
.p2align 5
atomic_toggle_boolean_nv:
	xorl	%eax, %eax		/* upper bits of the result stay 0 */
	lock xorl $1, (%rdi)		/* ZF = (new value == 0) */
	setnz	%al			/* al = 1 unless the new value is 0 */
	RET
.size	atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
|
|
|
|
/*
 * Constant pool, 64-byte aligned.  Every entry is exactly 16 bytes and the
 * order must not change: aesni_gcm_encrypt reaches .Lpoly as 16(%r11) with
 * %r11 pointing at .Lbswap_mask, i.e. by fixed offset rather than by label.
 */
.p2align 6
.Lbswap_mask:				/* vpshufb control: reverse 16 bytes */
	.byte	15,14,13,12,11,10,9,8
	.byte	7,6,5,4,3,2,1,0
.Lpoly:					/* multiplier used by the reduction steps */
	.byte	0,0,0,0,0,0,0,0
	.byte	0,0,0,0,0,0,0,0xc2
.Lone_msb:				/* 1 in the last byte */
	.byte	0,0,0,0,0,0,0,0
	.byte	0,0,0,0,0,0,0,1
.Ltwo_lsb:				/* 2 in the first byte */
	.byte	2,0,0,0,0,0,0,0
	.byte	0,0,0,0,0,0,0,0
.Lone_lsb:				/* 1 in the first byte */
	.byte	1,0,0,0,0,0,0,0
	.byte	0,0,0,0,0,0,0,0
	/* NUL-terminated banner; 64 bytes including the terminator. */
	.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.p2align 6
|
|
|
|
/* Mark the stack non-executable. */
|
|
#if defined(__linux__) && defined(__ELF__)
|
|
.section .note.GNU-stack,"",%progbits
|
|
#endif
|
|
|
|
#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
|