mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-23 02:44:41 +03:00
Backport AVX2 AES-GCM implementation from BoringSSL
This uses the AVX2 versions of the AESENC and PCLMULQDQ instructions; on Zen 3 this provides up to an 80% performance improvement.

Original source: https://github.com/google/boringssl/blob/d5440dd2c2c500ac2d3bba4afec47a054b4d99ae/gen/bcm/aes-gcm-avx2-x86_64-linux.S
See the original BoringSSL commit at https://github.com/google/boringssl/commit/3b6e1be4391d96e81cee022f77f7bab85d51cf4e.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Attila Fülöp <attila@fueloep.org>
Signed-off-by: Joel Low <joel@joelsplace.sg>
Closes #17058
This commit is contained in:
committed by
Brian Behlendorf
parent
885d929cf8
commit
bb9225ea86
+293
-78
@@ -46,6 +46,9 @@
|
||||
#define IMPL_CYCLE (UINT32_MAX-1)
|
||||
#ifdef CAN_USE_GCM_ASM
|
||||
#define IMPL_AVX (UINT32_MAX-2)
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
#define IMPL_AVX2 (UINT32_MAX-3)
|
||||
#endif
|
||||
#endif
|
||||
#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
|
||||
static uint32_t icp_gcm_impl = IMPL_FASTEST;
|
||||
@@ -56,17 +59,16 @@ static uint32_t user_sel_impl = IMPL_FASTEST;
|
||||
boolean_t gcm_avx_can_use_movbe = B_FALSE;
|
||||
/*
|
||||
* Whether to use the optimized openssl gcm and ghash implementations.
|
||||
* Set to true if module parameter icp_gcm_impl == "avx".
|
||||
*/
|
||||
static boolean_t gcm_use_avx = B_FALSE;
|
||||
#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx)
|
||||
static gcm_impl gcm_impl_used = GCM_IMPL_GENERIC;
|
||||
#define GCM_IMPL_USED (*(volatile gcm_impl *)&gcm_impl_used)
|
||||
|
||||
extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);
|
||||
|
||||
static inline boolean_t gcm_avx_will_work(void);
|
||||
static inline void gcm_set_avx(boolean_t);
|
||||
static inline boolean_t gcm_toggle_avx(void);
|
||||
static inline size_t gcm_simd_get_htab_size(boolean_t);
|
||||
static inline boolean_t gcm_avx2_will_work(void);
|
||||
static inline void gcm_use_impl(gcm_impl impl);
|
||||
static inline gcm_impl gcm_toggle_impl(void);
|
||||
|
||||
static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
|
||||
crypto_data_t *, size_t);
|
||||
@@ -89,7 +91,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
|
||||
void (*xor_block)(uint8_t *, uint8_t *))
|
||||
{
|
||||
#ifdef CAN_USE_GCM_ASM
|
||||
if (ctx->gcm_use_avx == B_TRUE)
|
||||
if (ctx->impl != GCM_IMPL_GENERIC)
|
||||
return (gcm_mode_encrypt_contiguous_blocks_avx(
|
||||
ctx, data, length, out, block_size));
|
||||
#endif
|
||||
@@ -208,7 +210,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
|
||||
{
|
||||
(void) copy_block;
|
||||
#ifdef CAN_USE_GCM_ASM
|
||||
if (ctx->gcm_use_avx == B_TRUE)
|
||||
if (ctx->impl != GCM_IMPL_GENERIC)
|
||||
return (gcm_encrypt_final_avx(ctx, out, block_size));
|
||||
#endif
|
||||
|
||||
@@ -374,7 +376,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
|
||||
void (*xor_block)(uint8_t *, uint8_t *))
|
||||
{
|
||||
#ifdef CAN_USE_GCM_ASM
|
||||
if (ctx->gcm_use_avx == B_TRUE)
|
||||
if (ctx->impl != GCM_IMPL_GENERIC)
|
||||
return (gcm_decrypt_final_avx(ctx, out, block_size));
|
||||
#endif
|
||||
|
||||
@@ -631,23 +633,23 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
|
||||
((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
|
||||
|
||||
if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
|
||||
gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
|
||||
gcm_ctx->impl = GCM_IMPL_USED;
|
||||
} else {
|
||||
/*
|
||||
* Handle the "cycle" implementation by creating avx and
|
||||
* non-avx contexts alternately.
|
||||
* Handle the "cycle" implementation by creating different
|
||||
* contexts, one per implementation.
|
||||
*/
|
||||
gcm_ctx->gcm_use_avx = gcm_toggle_avx();
|
||||
gcm_ctx->impl = gcm_toggle_impl();
|
||||
|
||||
/* The avx impl. doesn't handle byte swapped key schedules. */
|
||||
if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
|
||||
gcm_ctx->gcm_use_avx = B_FALSE;
|
||||
/* The AVX impl. doesn't handle byte swapped key schedules. */
|
||||
if (needs_bswap == B_TRUE) {
|
||||
gcm_ctx->impl = GCM_IMPL_GENERIC;
|
||||
}
|
||||
/*
|
||||
* If this is a GCM context, use the MOVBE and the BSWAP
|
||||
* If this is an AVX context, use the MOVBE and the BSWAP
|
||||
* variants alternately.
|
||||
*/
|
||||
if (gcm_ctx->gcm_use_avx == B_TRUE &&
|
||||
if (gcm_ctx->impl == GCM_IMPL_AVX &&
|
||||
zfs_movbe_available() == B_TRUE) {
|
||||
(void) atomic_toggle_boolean_nv(
|
||||
(volatile boolean_t *)&gcm_avx_can_use_movbe);
|
||||
@@ -658,12 +660,13 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
|
||||
* still they could be created by the aes generic implementation.
|
||||
* Make sure not to use them since we'll corrupt data if we do.
|
||||
*/
|
||||
if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
|
||||
gcm_ctx->gcm_use_avx = B_FALSE;
|
||||
if (gcm_ctx->impl != GCM_IMPL_GENERIC && needs_bswap == B_TRUE) {
|
||||
gcm_ctx->impl = GCM_IMPL_GENERIC;
|
||||
|
||||
cmn_err_once(CE_WARN,
|
||||
"ICP: Can't use the aes generic or cycle implementations "
|
||||
"in combination with the gcm avx implementation!");
|
||||
"in combination with the gcm avx or avx2-vaes "
|
||||
"implementation!");
|
||||
cmn_err_once(CE_WARN,
|
||||
"ICP: Falling back to a compatible implementation, "
|
||||
"aes-gcm performance will likely be degraded.");
|
||||
@@ -672,36 +675,20 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
|
||||
"restore performance.");
|
||||
}
|
||||
|
||||
/* Allocate Htab memory as needed. */
|
||||
if (gcm_ctx->gcm_use_avx == B_TRUE) {
|
||||
size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
|
||||
|
||||
if (htab_len == 0) {
|
||||
return (CRYPTO_MECHANISM_PARAM_INVALID);
|
||||
}
|
||||
gcm_ctx->gcm_htab_len = htab_len;
|
||||
gcm_ctx->gcm_Htable =
|
||||
kmem_alloc(htab_len, KM_SLEEP);
|
||||
|
||||
if (gcm_ctx->gcm_Htable == NULL) {
|
||||
return (CRYPTO_HOST_MEMORY);
|
||||
}
|
||||
/*
|
||||
* AVX implementations use Htable with sizes depending on
|
||||
* implementation.
|
||||
*/
|
||||
if (gcm_ctx->impl != GCM_IMPL_GENERIC) {
|
||||
rv = gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
|
||||
block_size);
|
||||
}
|
||||
/* Avx and non avx context initialization differs from here on. */
|
||||
if (gcm_ctx->gcm_use_avx == B_FALSE) {
|
||||
else
|
||||
#endif /* ifdef CAN_USE_GCM_ASM */
|
||||
if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
|
||||
encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
|
||||
rv = CRYPTO_MECHANISM_PARAM_INVALID;
|
||||
}
|
||||
#ifdef CAN_USE_GCM_ASM
|
||||
} else {
|
||||
if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
|
||||
block_size) != CRYPTO_SUCCESS) {
|
||||
rv = CRYPTO_MECHANISM_PARAM_INVALID;
|
||||
}
|
||||
if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
|
||||
encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
|
||||
rv = CRYPTO_MECHANISM_PARAM_INVALID;
|
||||
}
|
||||
#endif /* ifdef CAN_USE_GCM_ASM */
|
||||
|
||||
return (rv);
|
||||
}
|
||||
@@ -767,6 +754,9 @@ gcm_impl_get_ops(void)
|
||||
break;
|
||||
#ifdef CAN_USE_GCM_ASM
|
||||
case IMPL_AVX:
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
case IMPL_AVX2:
|
||||
#endif
|
||||
/*
|
||||
* Make sure that we return a valid implementation while
|
||||
* switching to the avx implementation since there still
|
||||
@@ -828,6 +818,13 @@ gcm_impl_init(void)
|
||||
* Use the avx implementation if it's available and the implementation
|
||||
* hasn't changed from its default value of fastest on module load.
|
||||
*/
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
if (gcm_avx2_will_work()) {
|
||||
if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
|
||||
gcm_use_impl(GCM_IMPL_AVX2);
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
if (gcm_avx_will_work()) {
|
||||
#ifdef HAVE_MOVBE
|
||||
if (zfs_movbe_available() == B_TRUE) {
|
||||
@@ -835,7 +832,7 @@ gcm_impl_init(void)
|
||||
}
|
||||
#endif
|
||||
if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
|
||||
gcm_set_avx(B_TRUE);
|
||||
gcm_use_impl(GCM_IMPL_AVX);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -852,6 +849,7 @@ static const struct {
|
||||
{ "fastest", IMPL_FASTEST },
|
||||
#ifdef CAN_USE_GCM_ASM
|
||||
{ "avx", IMPL_AVX },
|
||||
{ "avx2-vaes", IMPL_AVX2 },
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -887,7 +885,13 @@ gcm_impl_set(const char *val)
|
||||
/* Check mandatory options */
|
||||
for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
|
||||
#ifdef CAN_USE_GCM_ASM
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
/* Ignore avx implementation if it won't work. */
|
||||
if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
|
||||
!gcm_avx2_will_work()) {
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
|
||||
continue;
|
||||
}
|
||||
@@ -915,11 +919,17 @@ gcm_impl_set(const char *val)
|
||||
* Use the avx implementation if available and the requested one is
|
||||
* avx or fastest.
|
||||
*/
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
if (gcm_avx2_will_work() == B_TRUE &&
|
||||
(impl == IMPL_AVX2 || impl == IMPL_FASTEST)) {
|
||||
gcm_use_impl(GCM_IMPL_AVX2);
|
||||
} else
|
||||
#endif
|
||||
if (gcm_avx_will_work() == B_TRUE &&
|
||||
(impl == IMPL_AVX || impl == IMPL_FASTEST)) {
|
||||
gcm_set_avx(B_TRUE);
|
||||
gcm_use_impl(GCM_IMPL_AVX);
|
||||
} else {
|
||||
gcm_set_avx(B_FALSE);
|
||||
gcm_use_impl(GCM_IMPL_GENERIC);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -952,6 +962,12 @@ icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
|
||||
for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
|
||||
#ifdef CAN_USE_GCM_ASM
|
||||
/* Ignore avx implementation if it won't work. */
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
|
||||
!gcm_avx2_will_work()) {
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
|
||||
continue;
|
||||
}
|
||||
@@ -993,9 +1009,6 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
|
||||
|
||||
/* Clear the FPU registers since they hold sensitive internal state. */
|
||||
#define clear_fpu_regs() clear_fpu_regs_avx()
|
||||
#define GHASH_AVX(ctx, in, len) \
|
||||
gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
|
||||
in, len)
|
||||
|
||||
#define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
|
||||
|
||||
@@ -1010,20 +1023,77 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
|
||||
static uint32_t gcm_avx_chunk_size =
|
||||
((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
|
||||
|
||||
/*
 * GCM definitions: uint128_t is copied from include/crypto/modes.h.
 * The name u128 is avoided because it is already defined in kernel sources.
 *
 * The AVX2 routines declared in this file take their Htable argument as an
 * array of this type.
 */
typedef struct {
	uint64_t hi;
	uint64_t lo;
} uint128_t;
|
||||
|
||||
extern void ASMABI clear_fpu_regs_avx(void);
|
||||
extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
|
||||
extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
|
||||
const uint32_t pt[4], uint32_t ct[4]);
|
||||
|
||||
extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
extern void ASMABI gcm_init_vpclmulqdq_avx2(uint128_t Htable[16],
|
||||
const uint64_t H[2]);
|
||||
#endif
|
||||
extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
|
||||
const uint8_t *in, size_t len);
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
extern void ASMABI gcm_ghash_vpclmulqdq_avx2(uint64_t ghash[2],
|
||||
const uint64_t *Htable, const uint8_t *in, size_t len);
|
||||
#endif
|
||||
/*
 * Run the GHASH update routine that matches the context's implementation.
 *
 * ctx: GCM context; ctx->gcm_ghash and ctx->gcm_Htable are handed to the
 *      assembler routine, ctx->impl selects which routine is called.
 * in:  input buffer forwarded to the routine.
 * len: length forwarded to the routine.
 *
 * A context whose impl is neither GCM_IMPL_AVX nor (when compiled in)
 * GCM_IMPL_AVX2 must never reach this function; that case trips VERIFY().
 */
static inline void
GHASH_AVX(gcm_ctx_t *ctx, const uint8_t *in, size_t len)
{
	const uint64_t *htable = (const uint64_t *)ctx->gcm_Htable;

	switch (ctx->impl) {
#if CAN_USE_GCM_ASM >= 2
	case GCM_IMPL_AVX2:
		gcm_ghash_vpclmulqdq_avx2(ctx->gcm_ghash, htable, in, len);
		break;
#endif

	case GCM_IMPL_AVX:
		gcm_ghash_avx(ctx->gcm_ghash, htable, in, len);
		break;

	default:
		VERIFY(B_FALSE);
	}
}
|
||||
|
||||
typedef size_t ASMABI aesni_gcm_encrypt_impl(const uint8_t *, uint8_t *,
|
||||
size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
|
||||
extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
|
||||
const void *, uint64_t *, uint64_t *);
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
extern void ASMABI aes_gcm_enc_update_vaes_avx2(const uint8_t *in,
|
||||
uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
|
||||
const uint128_t Htable[16], uint8_t Xi[16]);
|
||||
#endif
|
||||
|
||||
typedef size_t ASMABI aesni_gcm_decrypt_impl(const uint8_t *, uint8_t *,
|
||||
size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
|
||||
extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
|
||||
const void *, uint64_t *, uint64_t *);
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
extern void ASMABI aes_gcm_dec_update_vaes_avx2(const uint8_t *in,
|
||||
uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
|
||||
const uint128_t Htable[16], uint8_t Xi[16]);
|
||||
#endif
|
||||
|
||||
static inline boolean_t
|
||||
gcm_avx2_will_work(void)
|
||||
{
|
||||
return (kfpu_allowed() &&
|
||||
zfs_avx2_available() && zfs_vaes_available() &&
|
||||
zfs_vpclmulqdq_available());
|
||||
}
|
||||
|
||||
static inline boolean_t
|
||||
gcm_avx_will_work(void)
|
||||
@@ -1035,33 +1105,67 @@ gcm_avx_will_work(void)
|
||||
}
|
||||
|
||||
static inline void
|
||||
gcm_set_avx(boolean_t val)
|
||||
gcm_use_impl(gcm_impl impl)
|
||||
{
|
||||
if (gcm_avx_will_work() == B_TRUE) {
|
||||
atomic_swap_32(&gcm_use_avx, val);
|
||||
switch (impl) {
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
case GCM_IMPL_AVX2:
|
||||
if (gcm_avx2_will_work() == B_TRUE) {
|
||||
atomic_swap_32(&gcm_impl_used, impl);
|
||||
return;
|
||||
}
|
||||
|
||||
zfs_fallthrough;
|
||||
#endif
|
||||
|
||||
case GCM_IMPL_AVX:
|
||||
if (gcm_avx_will_work() == B_TRUE) {
|
||||
atomic_swap_32(&gcm_impl_used, impl);
|
||||
return;
|
||||
}
|
||||
|
||||
zfs_fallthrough;
|
||||
|
||||
default:
|
||||
atomic_swap_32(&gcm_impl_used, GCM_IMPL_GENERIC);
|
||||
}
|
||||
}
|
||||
|
||||
static inline boolean_t
|
||||
gcm_toggle_avx(void)
|
||||
gcm_impl_will_work(gcm_impl impl)
|
||||
{
|
||||
if (gcm_avx_will_work() == B_TRUE) {
|
||||
return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
|
||||
} else {
|
||||
return (B_FALSE);
|
||||
switch (impl) {
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
case GCM_IMPL_AVX2:
|
||||
return (gcm_avx2_will_work());
|
||||
#endif
|
||||
|
||||
case GCM_IMPL_AVX:
|
||||
return (gcm_avx_will_work());
|
||||
|
||||
default:
|
||||
return (B_TRUE);
|
||||
}
|
||||
}
|
||||
|
||||
static inline size_t
|
||||
gcm_simd_get_htab_size(boolean_t simd_mode)
|
||||
static inline gcm_impl
|
||||
gcm_toggle_impl(void)
|
||||
{
|
||||
switch (simd_mode) {
|
||||
case B_TRUE:
|
||||
return (2 * 6 * 2 * sizeof (uint64_t));
|
||||
gcm_impl current_impl, new_impl;
|
||||
do { /* handle races */
|
||||
current_impl = atomic_load_32(&gcm_impl_used);
|
||||
new_impl = current_impl;
|
||||
while (B_TRUE) { /* handle incompatble implementations */
|
||||
new_impl = (new_impl + 1) % GCM_IMPL_MAX;
|
||||
if (gcm_impl_will_work(new_impl)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
default:
|
||||
return (0);
|
||||
}
|
||||
} while (atomic_cas_32(&gcm_impl_used, current_impl, new_impl) !=
|
||||
current_impl);
|
||||
|
||||
return (new_impl);
|
||||
}
|
||||
|
||||
|
||||
@@ -1077,6 +1181,50 @@ gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
|
||||
ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
|
||||
}
|
||||
|
||||
/*
 * Adapter that gives aesni_gcm_encrypt() the aesni_gcm_encrypt_impl
 * signature, so it can be called through the same function pointer as the
 * AVX2 variant.
 *
 * Htable is accepted only to satisfy that signature; aesni_gcm_encrypt()
 * has no such parameter, so it is dropped. All other arguments and the
 * return value are passed through unchanged.
 */
static size_t
aesni_gcm_encrypt_avx(const uint8_t *in, uint8_t *out, size_t len,
    const void *key, uint64_t *iv, const uint64_t *Htable, uint64_t *Xip)
{
	(void) Htable;

	return (aesni_gcm_encrypt(in, out, len, key, iv, Xip));
}
|
||||
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
/*
 * kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
 * bits of a |size_t|.
 * This is from boringssl/crypto/fipsmodule/aes/gcm.cc.inc
 */
static const size_t kSizeTWithoutLower4Bits = (size_t)-16;

/* The following CRYPTO methods are from boringssl/crypto/internal.h */

/* Reverse the byte order of a 32-bit value. */
static inline uint32_t
CRYPTO_bswap4(uint32_t x)
{
	return (__builtin_bswap32(x));
}

/*
 * Read four bytes from `in` (no alignment requirement, the bytes are copied
 * with memcpy) and return them byte-swapped.
 */
static inline uint32_t
CRYPTO_load_u32_be(const void *in)
{
	uint32_t v;

	memcpy(&v, in, sizeof (v));
	return (CRYPTO_bswap4(v));
}

/*
 * Byte-swap `v` and write the resulting four bytes to `out` (no alignment
 * requirement, the bytes are copied with memcpy).
 */
static inline void
CRYPTO_store_u32_be(void *out, uint32_t v)
{
	v = CRYPTO_bswap4(v);
	memcpy(out, &v, sizeof (v));
}
|
||||
|
||||
static size_t aesni_gcm_encrypt_avx2(const uint8_t *in, uint8_t *out,
|
||||
size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
|
||||
uint64_t *Xip)
|
||||
{
|
||||
uint8_t *ivec = (uint8_t *)iv;
|
||||
len &= kSizeTWithoutLower4Bits;
|
||||
aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec,
|
||||
(const uint128_t *)Htable, (uint8_t *)Xip);
|
||||
CRYPTO_store_u32_be(&ivec[12],
|
||||
CRYPTO_load_u32_be(&ivec[12]) + len / 16);
|
||||
return (len);
|
||||
}
|
||||
#endif /* if CAN_USE_GCM_ASM >= 2 */
|
||||
|
||||
/*
|
||||
* Encrypt multiple blocks of data in GCM mode.
|
||||
* This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
|
||||
@@ -1091,8 +1239,15 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
|
||||
size_t done = 0;
|
||||
uint8_t *datap = (uint8_t *)data;
|
||||
size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
|
||||
aesni_gcm_encrypt_impl *encrypt_blocks =
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
ctx->impl == GCM_IMPL_AVX2 ?
|
||||
aesni_gcm_encrypt_avx2 :
|
||||
#endif
|
||||
aesni_gcm_encrypt_avx;
|
||||
const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
|
||||
uint64_t *ghash = ctx->gcm_ghash;
|
||||
uint64_t *htable = ctx->gcm_Htable;
|
||||
uint64_t *cb = ctx->gcm_cb;
|
||||
uint8_t *ct_buf = NULL;
|
||||
uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
|
||||
@@ -1156,8 +1311,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
|
||||
/* Do the bulk encryption in chunk_size blocks. */
|
||||
for (; bleft >= chunk_size; bleft -= chunk_size) {
|
||||
kfpu_begin();
|
||||
done = aesni_gcm_encrypt(
|
||||
datap, ct_buf, chunk_size, key, cb, ghash);
|
||||
done = encrypt_blocks(
|
||||
datap, ct_buf, chunk_size, key, cb, htable, ghash);
|
||||
|
||||
clear_fpu_regs();
|
||||
kfpu_end();
|
||||
@@ -1180,7 +1335,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
|
||||
/* Bulk encrypt the remaining data. */
|
||||
kfpu_begin();
|
||||
if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
|
||||
done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
|
||||
done = encrypt_blocks(datap, ct_buf, bleft, key, cb, htable,
|
||||
ghash);
|
||||
if (done == 0) {
|
||||
rv = CRYPTO_FAILED;
|
||||
goto out;
|
||||
@@ -1293,6 +1449,29 @@ gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
|
||||
return (CRYPTO_SUCCESS);
|
||||
}
|
||||
|
||||
/*
 * Adapter that gives aesni_gcm_decrypt() the aesni_gcm_decrypt_impl
 * signature, so it can be called through the same function pointer as the
 * AVX2 variant.
 *
 * Htable is accepted only to satisfy that signature; aesni_gcm_decrypt()
 * has no such parameter, so it is dropped. All other arguments and the
 * return value are passed through unchanged.
 */
static size_t
aesni_gcm_decrypt_avx(const uint8_t *in, uint8_t *out, size_t len,
    const void *key, uint64_t *iv, const uint64_t *Htable, uint64_t *Xip)
{
	(void) Htable;

	return (aesni_gcm_decrypt(in, out, len, key, iv, Xip));
}
|
||||
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
static size_t aesni_gcm_decrypt_avx2(const uint8_t *in, uint8_t *out,
|
||||
size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
|
||||
uint64_t *Xip)
|
||||
{
|
||||
uint8_t *ivec = (uint8_t *)iv;
|
||||
len &= kSizeTWithoutLower4Bits;
|
||||
aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec,
|
||||
(const uint128_t *)Htable, (uint8_t *)Xip);
|
||||
CRYPTO_store_u32_be(&ivec[12],
|
||||
CRYPTO_load_u32_be(&ivec[12]) + len / 16);
|
||||
return (len);
|
||||
}
|
||||
#endif /* if CAN_USE_GCM_ASM >= 2 */
|
||||
|
||||
/*
|
||||
* Finalize decryption: We just have accumulated crypto text, so now we
|
||||
* decrypt it here inplace.
|
||||
@@ -1306,10 +1485,17 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
|
||||
B_FALSE);
|
||||
|
||||
size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
|
||||
aesni_gcm_decrypt_impl *decrypt_blocks =
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
ctx->impl == GCM_IMPL_AVX2 ?
|
||||
aesni_gcm_decrypt_avx2 :
|
||||
#endif
|
||||
aesni_gcm_decrypt_avx;
|
||||
size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
|
||||
uint8_t *datap = ctx->gcm_pt_buf;
|
||||
const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
|
||||
uint32_t *cb = (uint32_t *)ctx->gcm_cb;
|
||||
uint64_t *htable = ctx->gcm_Htable;
|
||||
uint64_t *ghash = ctx->gcm_ghash;
|
||||
uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
|
||||
int rv = CRYPTO_SUCCESS;
|
||||
@@ -1322,8 +1508,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
|
||||
*/
|
||||
for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
|
||||
kfpu_begin();
|
||||
done = aesni_gcm_decrypt(datap, datap, chunk_size,
|
||||
(const void *)key, ctx->gcm_cb, ghash);
|
||||
done = decrypt_blocks(datap, datap, chunk_size,
|
||||
(const void *)key, ctx->gcm_cb, htable, ghash);
|
||||
clear_fpu_regs();
|
||||
kfpu_end();
|
||||
if (done != chunk_size) {
|
||||
@@ -1334,8 +1520,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
|
||||
/* Decrypt remainder, which is less than chunk size, in one go. */
|
||||
kfpu_begin();
|
||||
if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
|
||||
done = aesni_gcm_decrypt(datap, datap, bleft,
|
||||
(const void *)key, ctx->gcm_cb, ghash);
|
||||
done = decrypt_blocks(datap, datap, bleft,
|
||||
(const void *)key, ctx->gcm_cb, htable, ghash);
|
||||
if (done == 0) {
|
||||
clear_fpu_regs();
|
||||
kfpu_end();
|
||||
@@ -1424,13 +1610,42 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
|
||||
ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
|
||||
B_FALSE);
|
||||
|
||||
size_t htab_len = 0;
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
if (ctx->impl == GCM_IMPL_AVX2) {
|
||||
/*
|
||||
* BoringSSL's API specifies uint128_t[16] for htab; but only
|
||||
* uint128_t[12] are used.
|
||||
* See https://github.com/google/boringssl/blob/
|
||||
* 813840dd094f9e9c1b00a7368aa25e656554221f1/crypto/fipsmodule/
|
||||
* modes/asm/aes-gcm-avx2-x86_64.pl#L198-L200
|
||||
*/
|
||||
htab_len = (2 * 8 * sizeof (uint128_t));
|
||||
} else
|
||||
#endif /* CAN_USE_GCM_ASM >= 2 */
|
||||
{
|
||||
htab_len = (2 * 6 * sizeof (uint128_t));
|
||||
}
|
||||
|
||||
ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP);
|
||||
if (ctx->gcm_Htable == NULL) {
|
||||
return (CRYPTO_HOST_MEMORY);
|
||||
}
|
||||
|
||||
/* Init H (encrypt zero block) and create the initial counter block. */
|
||||
memset(H, 0, sizeof (ctx->gcm_H));
|
||||
kfpu_begin();
|
||||
aes_encrypt_intel(keysched, aes_rounds,
|
||||
(const uint32_t *)H, (uint32_t *)H);
|
||||
|
||||
gcm_init_htab_avx(ctx->gcm_Htable, H);
|
||||
#if CAN_USE_GCM_ASM >= 2
|
||||
if (ctx->impl == GCM_IMPL_AVX2) {
|
||||
gcm_init_vpclmulqdq_avx2((uint128_t *)ctx->gcm_Htable, H);
|
||||
} else
|
||||
#endif /* if CAN_USE_GCM_ASM >= 2 */
|
||||
{
|
||||
gcm_init_htab_avx(ctx->gcm_Htable, H);
|
||||
}
|
||||
|
||||
if (iv_len == 12) {
|
||||
memcpy(cb, iv, 12);
|
||||
|
||||
@@ -171,7 +171,7 @@ gcm_clear_ctx(gcm_ctx_t *ctx)
|
||||
explicit_memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder));
|
||||
explicit_memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
|
||||
#if defined(CAN_USE_GCM_ASM)
|
||||
if (ctx->gcm_use_avx == B_TRUE) {
|
||||
if (ctx->impl != GCM_IMPL_GENERIC) {
|
||||
ASSERT3P(ctx->gcm_Htable, !=, NULL);
|
||||
explicit_memset(ctx->gcm_Htable, 0, ctx->gcm_htab_len);
|
||||
kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
|
||||
|
||||
Reference in New Issue
Block a user