mirror_zfs/module/icp/asm-x86_64/modes/gcm_intel.S
Jason Zaman a3600a106d icp: mark asm files with noexec stack
If there is no explicit note in the .S files, the obj file will mark it
as requiring an executable stack. This is unneeded and causes issues on
hardened systems.

More info:
https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart

Signed-off-by: Jason Zaman <jason@perfinion.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #4947
Closes #4962
2016-08-12 09:51:26 -07:00

339 lines
9.0 KiB
ArmAsm

/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2009 Intel Corporation
* All Rights Reserved.
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Accelerated GHASH implementation with Intel PCLMULQDQ-NI
* instructions. This file contains an accelerated
* Galois Field Multiplication implementation.
*
* PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
* carry-less multiplication. More information about PCLMULQDQ can be
* found at:
* http://software.intel.com/en-us/articles/
* carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
*
*/
/*
* ====================================================================
* OpenSolaris OS modifications
*
* This source originates as file galois_hash_asm.c from
* Intel Corporation dated September 21, 2009.
*
* This OpenSolaris version has these major changes from the original source:
*
* 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
* /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
* definition for lint.
*
* 2. Formatted code, added comments, and added #includes and #defines.
*
* 3. If bit CR0.TS is set, clear and set the TS bit, after and before
* calling kpreempt_disable() and kpreempt_enable().
* If the TS bit is not set, save and restore %xmm registers at the beginning
* and end of function calls (%xmm* registers are not saved and restored
* during kernel thread preemption).
*
* 4. Removed code to perform hashing. This is already done with C macro
* GHASH in gcm.c. For better performance, this removed code should be
* reintegrated in the future to replace the C GHASH macro.
*
* 5. Added code to byte swap 16-byte input and output.
*
* 6. Folded in comments from the original C source with embedded assembly
* (SB_w_shift_xor.c)
*
* 7. Renamed function and reordered parameters to match OpenSolaris:
* Intel interface:
* void galois_hash_asm(unsigned char *hk, unsigned char *s,
* unsigned char *d, int length)
* OpenSolaris OS interface:
* void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
* ====================================================================
*/
#if defined(lint) || defined(__lint)
#include <sys/types.h>
/*
 * Lint-only placeholder for the assembly routine of the same name.
 * It gives lint a C definition to check callers against; it performs
 * no work and leaves *res untouched.
 */
/* ARGSUSED */
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res)
{
}
#else /* lint */
#define _ASM
#include <sys/asm_linkage.h>
#ifdef _KERNEL
/*
* Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is,
* it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
* uses it to pass P2 to syscall.
* This also occurs with the STTS macro, but we don't care if
* P2 (%rsi) is modified just before function exit.
* The CLTS and STTS macros push and pop P1 (%rdi) already.
*
* PROTECTED_CLTS therefore wraps CLTS in a push/pop of %rsi when built
* with __xpv, and is plain CLTS otherwise.
*/
#ifdef __xpv
#define PROTECTED_CLTS \
push %rsi; \
CLTS; \
pop %rsi
#else
#define PROTECTED_CLTS \
CLTS
#endif /* __xpv */
/*
* Prologue for code that uses %xmm0 - %xmm10.
*
* tmpreg receives the value of %cr0. SET_TS_OR_POP_XMM_REGISTERS tests the
* same register again, so the caller must leave tmpreg unmodified between
* the two macros.
*
* %rbp is pushed and then loaded with the incoming %rsp, so the original
* stack pointer can be restored later regardless of which path is taken.
*
* If CR0_TS is not set, %rsp is rounded down to an XMM_ALIGN boundary
* (movaps requires an aligned address), space for 11 registers of XMM_SIZE
* bytes is reserved, and %xmm0 - %xmm10 are stored there.
* Otherwise CR0_TS is cleared via PROTECTED_CLTS and nothing is saved.
*/
#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
push %rbp; \
mov %rsp, %rbp; \
movq %cr0, tmpreg; \
testq $CR0_TS, tmpreg; \
jnz 1f; \
and $-XMM_ALIGN, %rsp; \
sub $[XMM_SIZE * 11], %rsp; \
movaps %xmm0, 160(%rsp); \
movaps %xmm1, 144(%rsp); \
movaps %xmm2, 128(%rsp); \
movaps %xmm3, 112(%rsp); \
movaps %xmm4, 96(%rsp); \
movaps %xmm5, 80(%rsp); \
movaps %xmm6, 64(%rsp); \
movaps %xmm7, 48(%rsp); \
movaps %xmm8, 32(%rsp); \
movaps %xmm9, 16(%rsp); \
movaps %xmm10, (%rsp); \
jmp 2f; \
1: \
PROTECTED_CLTS; \
2:
/*
* Epilogue matching CLEAR_TS_OR_PUSH_XMM_REGISTERS.
*
* tmpreg must still hold the %cr0 value captured by that macro, and %rsp
* and %rbp must be as that macro left them.
*
* If CR0_TS was not set above, reload %xmm0 - %xmm10 from the stack,
* otherwise set CR0_TS again via STTS(tmpreg).
* On both paths %rsp is then restored from %rbp and the saved %rbp is
* popped, undoing the frame set up by the prologue macro.
*/
#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
testq $CR0_TS, tmpreg; \
jnz 1f; \
movaps (%rsp), %xmm10; \
movaps 16(%rsp), %xmm9; \
movaps 32(%rsp), %xmm8; \
movaps 48(%rsp), %xmm7; \
movaps 64(%rsp), %xmm6; \
movaps 80(%rsp), %xmm5; \
movaps 96(%rsp), %xmm4; \
movaps 112(%rsp), %xmm3; \
movaps 128(%rsp), %xmm2; \
movaps 144(%rsp), %xmm1; \
movaps 160(%rsp), %xmm0; \
jmp 2f; \
1: \
STTS(tmpreg); \
2: \
mov %rbp, %rsp; \
pop %rbp
#else
/* Not a _KERNEL build: the TS / %xmm save-restore macros expand to nothing. */
#define PROTECTED_CLTS
#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
#endif /* _KERNEL */
/*
* Use this mask to byte-swap a 16-byte integer with the pshufb instruction.
*
* C equivalent:
*	static const uint8_t byte_swap16_mask[] = {
*		15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
*
* The table is constant data, so it lives in .rodata rather than in the
* executable .text section. It is aligned to XMM_ALIGN because
* gcm_mul_pclmulqdq loads it with movaps, which requires an aligned
* memory operand. The trailing .text switches back to the code section
* for whatever follows.
*/
.section .rodata
.align XMM_ALIGN
.Lbyte_swap16_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.text
/*
* void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
*
* Perform a carry-less multiplication (that is, use XOR instead of the
* multiply operator) on P1 and P2 and place the result in P3.
*
* Byte swap the input and the output.
*
* Note: x_in, y, and res all point to a block of 16-byte numbers
* (an array of two 64-bit integers).
*
* Note2: For kernel code, caller is responsible for ensuring
* kpreempt_disable() has been called. This is because %xmm registers are
* not saved/restored. Clear and set the CR0.TS bit on entry and exit,
* respectively, if TS is set on entry. Otherwise, if TS is not set,
* save and restore %xmm registers on the stack.
*
* Note3: Original Intel definition:
* void galois_hash_asm(unsigned char *hk, unsigned char *s,
* unsigned char *d, int length)
*
* Note4: Register/parameter mapping:
* Intel:
* Parameter 1: %rcx (copied to %xmm0) hk or x_in
* Parameter 2: %rdx (copied to %xmm1) s or y
* Parameter 3: %rdi (result) d or res
* OpenSolaris:
* Parameter 1: %rdi (copied to %xmm0) x_in
* Parameter 2: %rsi (copied to %xmm1) y
* Parameter 3: %rdx (result) res
*/
ENTRY_NP(gcm_mul_pclmulqdq)
// Register use: %rdi = x_in, %rsi = y, %rdx = res; %rax is scratch;
// %xmm0 - %xmm10 are used as working registers.
// In _KERNEL builds the macro below leaves the %cr0 value in %r10 and
// SET_TS_OR_POP_XMM_REGISTERS(%r10) tests it again at the end, so %r10
// must not be modified in between.
CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)
//
// Copy Parameters
//
movdqu (%rdi), %xmm0 // P1 (unaligned 16-byte load)
movdqu (%rsi), %xmm1 // P2 (unaligned 16-byte load)
//
// Byte swap 16-byte input
//
lea .Lbyte_swap16_mask(%rip), %rax
movaps (%rax), %xmm10 // mask is XMM_ALIGN-aligned, so movaps is safe
pshufb %xmm10, %xmm0
pshufb %xmm10, %xmm1
//
// Multiply with the hash key
//
// Four 64x64 carry-less partial products of a = xmm0 and b = xmm1.
movdqu %xmm0, %xmm3
pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0
movdqu %xmm0, %xmm4
pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1
movdqu %xmm0, %xmm5
pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0
movdqu %xmm0, %xmm6
pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1
pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0
movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5
psrldq $8, %xmm4 // shift xmm4 right by 8 bytes (64 bits)
pslldq $8, %xmm5 // shift xmm5 left by 8 bytes (64 bits)
pxor %xmm5, %xmm3
pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result
// of the carry-less multiplication of
// xmm0 by xmm1.
// We shift the 256-bit result <xmm6:xmm3> left by one bit position
// to account for the fact that the bits are reversed.
// pslld only shifts within each 32-bit lane, so the bit shifted out of
// each lane (isolated by psrld $31) is carried into the next higher lane
// with pslldq $4; the bit leaving the top lane of xmm3 is moved to the
// bottom lane of xmm6 with psrldq $12.
movdqu %xmm3, %xmm7
movdqu %xmm6, %xmm8
pslld $1, %xmm3
pslld $1, %xmm6
psrld $31, %xmm7
psrld $31, %xmm8
movdqu %xmm7, %xmm9
pslldq $4, %xmm8
pslldq $4, %xmm7
psrldq $12, %xmm9
por %xmm7, %xmm3
por %xmm8, %xmm6
por %xmm9, %xmm6
//
// First phase of the reduction
//
// NOTE(review): the shift counts in both phases presumably derive from
// the GCM reduction polynomial; confirm against the Intel white paper
// cited in the file header.
//
// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
// independently.
movdqu %xmm3, %xmm7
movdqu %xmm3, %xmm8
movdqu %xmm3, %xmm9
pslld $31, %xmm7 // packed 32-bit left shift: << 31
pslld $30, %xmm8 // packed 32-bit left shift: << 30
pslld $25, %xmm9 // packed 32-bit left shift: << 25
pxor %xmm8, %xmm7 // xor the shifted versions
pxor %xmm9, %xmm7
movdqu %xmm7, %xmm8 // keep a copy; xmm8 is consumed in phase two
pslldq $12, %xmm7
psrldq $4, %xmm8
pxor %xmm7, %xmm3 // first phase of the reduction complete
//
// Second phase of the reduction
//
// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
// shift operations.
movdqu %xmm3, %xmm2
movdqu %xmm3, %xmm4
movdqu %xmm3, %xmm5
psrld $1, %xmm2 // packed 32-bit right shift: >> 1
psrld $2, %xmm4 // packed 32-bit right shift: >> 2
psrld $7, %xmm5 // packed 32-bit right shift: >> 7
pxor %xmm4, %xmm2 // xor the shifted versions
pxor %xmm5, %xmm2
pxor %xmm8, %xmm2 // fold in the bits saved from phase one
pxor %xmm2, %xmm3
pxor %xmm3, %xmm6 // the result is in xmm6
//
// Byte swap 16-byte result
//
pshufb %xmm10, %xmm6 // %xmm10 has the swap mask
//
// Store the result
//
movdqu %xmm6, (%rdx) // P3 (unaligned 16-byte store)
//
// Cleanup and Return
//
SET_TS_OR_POP_XMM_REGISTERS(%r10)
ret
SET_SIZE(gcm_mul_pclmulqdq)
#endif /* lint || __lint */
#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif