Backport AVX2 AES-GCM implementation from BoringSSL

This uses the AVX2 (VAES and VPCLMULQDQ) versions of the AESENC and PCLMULQDQ
instructions; on Zen 3 this provides up to an 80% performance improvement.

Original source:
https://github.com/google/boringssl/blob/d5440dd2c2c500ac2d3bba4afec47a054b4d99ae/gen/bcm/aes-gcm-avx2-x86_64-linux.S

See the original BoringSSL commit at
https://github.com/google/boringssl/commit/3b6e1be4391d96e81cee022f77f7bab85d51cf4e.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Attila Fülöp <attila@fueloep.org>
Signed-off-by: Joel Low <joel@joelsplace.sg>
Closes #17058
This commit is contained in:
Joel Low
2025-02-15 11:37:33 +08:00
committed by Brian Behlendorf
parent 885d929cf8
commit bb9225ea86
15 changed files with 3574 additions and 82 deletions
+293 -78
View File
@@ -46,6 +46,9 @@
#define IMPL_CYCLE (UINT32_MAX-1)
#ifdef CAN_USE_GCM_ASM
#define IMPL_AVX (UINT32_MAX-2)
#if CAN_USE_GCM_ASM >= 2
#define IMPL_AVX2 (UINT32_MAX-3)
#endif
#endif
#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
static uint32_t icp_gcm_impl = IMPL_FASTEST;
@@ -56,17 +59,16 @@ static uint32_t user_sel_impl = IMPL_FASTEST;
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
* Whether to use the optimized openssl gcm and ghash implementations.
* Set to true if module parameter icp_gcm_impl == "avx".
*/
static boolean_t gcm_use_avx = B_FALSE;
#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx)
static gcm_impl gcm_impl_used = GCM_IMPL_GENERIC;
#define GCM_IMPL_USED (*(volatile gcm_impl *)&gcm_impl_used)
extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);
static inline boolean_t gcm_avx_will_work(void);
static inline void gcm_set_avx(boolean_t);
static inline boolean_t gcm_toggle_avx(void);
static inline size_t gcm_simd_get_htab_size(boolean_t);
static inline boolean_t gcm_avx2_will_work(void);
static inline void gcm_use_impl(gcm_impl impl);
static inline gcm_impl gcm_toggle_impl(void);
static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
crypto_data_t *, size_t);
@@ -89,7 +91,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
if (ctx->gcm_use_avx == B_TRUE)
if (ctx->impl != GCM_IMPL_GENERIC)
return (gcm_mode_encrypt_contiguous_blocks_avx(
ctx, data, length, out, block_size));
#endif
@@ -208,7 +210,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
{
(void) copy_block;
#ifdef CAN_USE_GCM_ASM
if (ctx->gcm_use_avx == B_TRUE)
if (ctx->impl != GCM_IMPL_GENERIC)
return (gcm_encrypt_final_avx(ctx, out, block_size));
#endif
@@ -374,7 +376,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
if (ctx->gcm_use_avx == B_TRUE)
if (ctx->impl != GCM_IMPL_GENERIC)
return (gcm_decrypt_final_avx(ctx, out, block_size));
#endif
@@ -631,23 +633,23 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
gcm_ctx->impl = GCM_IMPL_USED;
} else {
/*
* Handle the "cycle" implementation by creating avx and
* non-avx contexts alternately.
* Handle the "cycle" implementation by creating different
* contexts, one per implementation.
*/
gcm_ctx->gcm_use_avx = gcm_toggle_avx();
gcm_ctx->impl = gcm_toggle_impl();
/* The avx impl. doesn't handle byte swapped key schedules. */
if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
gcm_ctx->gcm_use_avx = B_FALSE;
/* The AVX impl. doesn't handle byte swapped key schedules. */
if (needs_bswap == B_TRUE) {
gcm_ctx->impl = GCM_IMPL_GENERIC;
}
/*
* If this is a GCM context, use the MOVBE and the BSWAP
* If this is an AVX context, use the MOVBE and the BSWAP
* variants alternately.
*/
if (gcm_ctx->gcm_use_avx == B_TRUE &&
if (gcm_ctx->impl == GCM_IMPL_AVX &&
zfs_movbe_available() == B_TRUE) {
(void) atomic_toggle_boolean_nv(
(volatile boolean_t *)&gcm_avx_can_use_movbe);
@@ -658,12 +660,13 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
* still they could be created by the aes generic implementation.
* Make sure not to use them since we'll corrupt data if we do.
*/
if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
gcm_ctx->gcm_use_avx = B_FALSE;
if (gcm_ctx->impl != GCM_IMPL_GENERIC && needs_bswap == B_TRUE) {
gcm_ctx->impl = GCM_IMPL_GENERIC;
cmn_err_once(CE_WARN,
"ICP: Can't use the aes generic or cycle implementations "
"in combination with the gcm avx implementation!");
"in combination with the gcm avx or avx2-vaes "
"implementation!");
cmn_err_once(CE_WARN,
"ICP: Falling back to a compatible implementation, "
"aes-gcm performance will likely be degraded.");
@@ -672,36 +675,20 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
"restore performance.");
}
/* Allocate Htab memory as needed. */
if (gcm_ctx->gcm_use_avx == B_TRUE) {
size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
if (htab_len == 0) {
return (CRYPTO_MECHANISM_PARAM_INVALID);
}
gcm_ctx->gcm_htab_len = htab_len;
gcm_ctx->gcm_Htable =
kmem_alloc(htab_len, KM_SLEEP);
if (gcm_ctx->gcm_Htable == NULL) {
return (CRYPTO_HOST_MEMORY);
}
/*
* AVX implementations use Htable with sizes depending on
* implementation.
*/
if (gcm_ctx->impl != GCM_IMPL_GENERIC) {
rv = gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
block_size);
}
/* Avx and non avx context initialization differs from here on. */
if (gcm_ctx->gcm_use_avx == B_FALSE) {
else
#endif /* ifdef CAN_USE_GCM_ASM */
if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
rv = CRYPTO_MECHANISM_PARAM_INVALID;
}
#ifdef CAN_USE_GCM_ASM
} else {
if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
block_size) != CRYPTO_SUCCESS) {
rv = CRYPTO_MECHANISM_PARAM_INVALID;
}
if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
rv = CRYPTO_MECHANISM_PARAM_INVALID;
}
#endif /* ifdef CAN_USE_GCM_ASM */
return (rv);
}
@@ -767,6 +754,9 @@ gcm_impl_get_ops(void)
break;
#ifdef CAN_USE_GCM_ASM
case IMPL_AVX:
#if CAN_USE_GCM_ASM >= 2
case IMPL_AVX2:
#endif
/*
* Make sure that we return a valid implementation while
* switching to the avx implementation since there still
@@ -828,6 +818,13 @@ gcm_impl_init(void)
* Use the avx implementation if it's available and the implementation
* hasn't changed from its default value of fastest on module load.
*/
#if CAN_USE_GCM_ASM >= 2
if (gcm_avx2_will_work()) {
if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
gcm_use_impl(GCM_IMPL_AVX2);
}
} else
#endif
if (gcm_avx_will_work()) {
#ifdef HAVE_MOVBE
if (zfs_movbe_available() == B_TRUE) {
@@ -835,7 +832,7 @@ gcm_impl_init(void)
}
#endif
if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
gcm_set_avx(B_TRUE);
gcm_use_impl(GCM_IMPL_AVX);
}
}
#endif
@@ -852,6 +849,7 @@ static const struct {
{ "fastest", IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
	{ "avx", IMPL_AVX },
#if CAN_USE_GCM_ASM >= 2
	/* IMPL_AVX2 is only defined when CAN_USE_GCM_ASM >= 2. */
	{ "avx2-vaes", IMPL_AVX2 },
#endif
#endif
};
@@ -887,7 +885,13 @@ gcm_impl_set(const char *val)
/* Check mandatory options */
for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
#if CAN_USE_GCM_ASM >= 2
/* Ignore avx implementation if it won't work. */
if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
!gcm_avx2_will_work()) {
continue;
}
#endif
if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
continue;
}
@@ -915,11 +919,17 @@ gcm_impl_set(const char *val)
* Use the avx implementation if available and the requested one is
* avx or fastest.
*/
#if CAN_USE_GCM_ASM >= 2
if (gcm_avx2_will_work() == B_TRUE &&
(impl == IMPL_AVX2 || impl == IMPL_FASTEST)) {
gcm_use_impl(GCM_IMPL_AVX2);
} else
#endif
if (gcm_avx_will_work() == B_TRUE &&
(impl == IMPL_AVX || impl == IMPL_FASTEST)) {
gcm_set_avx(B_TRUE);
gcm_use_impl(GCM_IMPL_AVX);
} else {
gcm_set_avx(B_FALSE);
gcm_use_impl(GCM_IMPL_GENERIC);
}
#endif
@@ -952,6 +962,12 @@ icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
/* Ignore avx implementation if it won't work. */
#if CAN_USE_GCM_ASM >= 2
if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
!gcm_avx2_will_work()) {
continue;
}
#endif
if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
continue;
}
@@ -993,9 +1009,6 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
/* Clear the FPU registers since they hold sensitive internal state. */
#define clear_fpu_regs() clear_fpu_regs_avx()
#define GHASH_AVX(ctx, in, len) \
gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
in, len)
#define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
@@ -1010,20 +1023,77 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
static uint32_t gcm_avx_chunk_size =
((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
/*
* GCM definitions: uint128_t is copied from include/crypto/modes.h
* Avoiding u128 because it is already defined in kernel sources.
*/
typedef struct {
uint64_t hi, lo;
} uint128_t;
extern void ASMABI clear_fpu_regs_avx(void);
extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
const uint32_t pt[4], uint32_t ct[4]);
extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI gcm_init_vpclmulqdq_avx2(uint128_t Htable[16],
const uint64_t H[2]);
#endif
extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
const uint8_t *in, size_t len);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI gcm_ghash_vpclmulqdq_avx2(uint64_t ghash[2],
const uint64_t *Htable, const uint8_t *in, size_t len);
#endif
/*
 * Run a GHASH update over len bytes at in for ctx, dispatching to the
 * assembly routine that matches ctx->impl. Both routines receive the
 * context's running hash (gcm_ghash) and its Htable. Any implementation
 * other than AVX or (when compiled in) AVX2 trips VERIFY.
 */
static inline void
GHASH_AVX(gcm_ctx_t *ctx, const uint8_t *in, size_t len)
{
	const uint64_t *htable = (const uint64_t *)ctx->gcm_Htable;

#if CAN_USE_GCM_ASM >= 2
	if (ctx->impl == GCM_IMPL_AVX2) {
		gcm_ghash_vpclmulqdq_avx2(ctx->gcm_ghash, htable, in, len);
		return;
	}
#endif
	if (ctx->impl == GCM_IMPL_AVX) {
		gcm_ghash_avx(ctx->gcm_ghash, htable, in, len);
		return;
	}

	/* Only contexts using one of the AVX flavours may get here. */
	VERIFY(B_FALSE);
}
typedef size_t ASMABI aesni_gcm_encrypt_impl(const uint8_t *, uint8_t *,
size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
const void *, uint64_t *, uint64_t *);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI aes_gcm_enc_update_vaes_avx2(const uint8_t *in,
uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
const uint128_t Htable[16], uint8_t Xi[16]);
#endif
typedef size_t ASMABI aesni_gcm_decrypt_impl(const uint8_t *, uint8_t *,
size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
const void *, uint64_t *, uint64_t *);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI aes_gcm_dec_update_vaes_avx2(const uint8_t *in,
uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
const uint128_t Htable[16], uint8_t Xi[16]);
#endif
/*
 * Report whether the AVX2 GCM implementation may be used: kfpu_allowed()
 * and the AVX2, VAES and VPCLMULQDQ availability checks must all pass.
 */
static inline boolean_t
gcm_avx2_will_work(void)
{
	const boolean_t usable = kfpu_allowed() &&
	    zfs_avx2_available() && zfs_vaes_available() &&
	    zfs_vpclmulqdq_available();

	return (usable);
}
static inline boolean_t
gcm_avx_will_work(void)
@@ -1035,33 +1105,67 @@ gcm_avx_will_work(void)
}
static inline void
gcm_set_avx(boolean_t val)
gcm_use_impl(gcm_impl impl)
{
if (gcm_avx_will_work() == B_TRUE) {
atomic_swap_32(&gcm_use_avx, val);
/*
 * Record the implementation that newly initialized contexts will pick up
 * via gcm_impl_used. A request the CPU can't satisfy is downgraded step
 * by step: AVX2 -> AVX -> generic. Each arm stores the implementation
 * whose check just passed rather than the requested one; otherwise an
 * AVX2 request that falls through to the AVX arm would still leave
 * GCM_IMPL_AVX2 selected on a CPU that can't run it.
 */
switch (impl) {
#if CAN_USE_GCM_ASM >= 2
case GCM_IMPL_AVX2:
	if (gcm_avx2_will_work() == B_TRUE) {
		atomic_swap_32(&gcm_impl_used, GCM_IMPL_AVX2);
		return;
	}
	zfs_fallthrough;
#endif
case GCM_IMPL_AVX:
	if (gcm_avx_will_work() == B_TRUE) {
		atomic_swap_32(&gcm_impl_used, GCM_IMPL_AVX);
		return;
	}
	zfs_fallthrough;
default:
	atomic_swap_32(&gcm_impl_used, GCM_IMPL_GENERIC);
}
}
static inline boolean_t
gcm_toggle_avx(void)
gcm_impl_will_work(gcm_impl impl)
{
if (gcm_avx_will_work() == B_TRUE) {
return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
} else {
return (B_FALSE);
switch (impl) {
#if CAN_USE_GCM_ASM >= 2
case GCM_IMPL_AVX2:
return (gcm_avx2_will_work());
#endif
case GCM_IMPL_AVX:
return (gcm_avx_will_work());
default:
return (B_TRUE);
}
}
static inline size_t
gcm_simd_get_htab_size(boolean_t simd_mode)
static inline gcm_impl
gcm_toggle_impl(void)
{
switch (simd_mode) {
case B_TRUE:
return (2 * 6 * 2 * sizeof (uint64_t));
gcm_impl current_impl, new_impl;
do { /* handle races */
current_impl = atomic_load_32(&gcm_impl_used);
new_impl = current_impl;
while (B_TRUE) { /* handle incompatible implementations */
new_impl = (new_impl + 1) % GCM_IMPL_MAX;
if (gcm_impl_will_work(new_impl)) {
break;
}
}
default:
return (0);
}
} while (atomic_cas_32(&gcm_impl_used, current_impl, new_impl) !=
current_impl);
return (new_impl);
}
@@ -1077,6 +1181,50 @@ gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
}
/*
 * Adapter that gives aesni_gcm_encrypt() the aesni_gcm_encrypt_impl
 * signature. Htable is accepted only to match that signature; it is not
 * forwarded. Returns whatever aesni_gcm_encrypt() returns.
 */
static size_t
aesni_gcm_encrypt_avx(const uint8_t *in, uint8_t *out, size_t len,
    const void *key, uint64_t *iv, const uint64_t *Htable, uint64_t *Xip)
{
	size_t processed;

	(void) Htable;
	processed = aesni_gcm_encrypt(in, out, len, key, iv, Xip);
	return (processed);
}
#if CAN_USE_GCM_ASM >= 2
/*
 * kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
 * bits of a |size_t|, i.e. to round a byte count down to a multiple of 16.
 * This is from boringssl/crypto/fipsmodule/aes/gcm.cc.inc
 */
static const size_t kSizeTWithoutLower4Bits = (size_t)-16;
/* The following CRYPTO methods are from boringssl/crypto/internal.h */
/* Return x with the order of its four bytes reversed. */
static inline uint32_t
CRYPTO_bswap4(uint32_t x)
{
	const uint32_t swapped = __builtin_bswap32(x);

	return (swapped);
}
/*
 * Read a 32-bit big-endian value from in. The bytes are fetched with
 * memcpy(), so in needs no particular alignment, and are then byte-swapped
 * unconditionally, which yields the big-endian value on a little-endian
 * host.
 */
static inline uint32_t
CRYPTO_load_u32_be(const void *in)
{
	uint32_t raw;

	memcpy(&raw, in, sizeof (raw));
	return (__builtin_bswap32(raw));
}
/*
 * Write v to out as a 32-bit big-endian value. The value is byte-swapped
 * unconditionally and copied with memcpy(), so out needs no particular
 * alignment; the result is big-endian on a little-endian host.
 */
static inline void
CRYPTO_store_u32_be(void *out, uint32_t v)
{
	const uint32_t swapped = __builtin_bswap32(v);

	memcpy(out, &swapped, sizeof (swapped));
}
/*
 * Adapter that gives aes_gcm_enc_update_vaes_avx2() the
 * aesni_gcm_encrypt_impl signature.
 *
 * len is rounded down to a multiple of 16 before the assembly routine is
 * called. Afterwards the 32-bit big-endian counter held in bytes 12..15 of
 * iv is advanced by the number of 16-byte blocks passed in. Returns the
 * rounded-down byte count.
 */
static size_t
aesni_gcm_encrypt_avx2(const uint8_t *in, uint8_t *out, size_t len,
    const void *key, uint64_t *iv, const uint64_t *Htable, uint64_t *Xip)
{
	uint8_t *counter_block = (uint8_t *)iv;
	uint8_t *counter_be = &counter_block[12];
	const size_t nbytes = len & kSizeTWithoutLower4Bits;
	uint32_t counter;

	aes_gcm_enc_update_vaes_avx2(in, out, nbytes, key, counter_block,
	    (const uint128_t *)Htable, (uint8_t *)Xip);

	/* Bump the counter by the number of blocks handed to the routine. */
	counter = CRYPTO_load_u32_be(counter_be);
	CRYPTO_store_u32_be(counter_be, counter + nbytes / 16);

	return (nbytes);
}
#endif /* if CAN_USE_GCM_ASM >= 2 */
/*
* Encrypt multiple blocks of data in GCM mode.
* This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
@@ -1091,8 +1239,15 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
size_t done = 0;
uint8_t *datap = (uint8_t *)data;
size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
aesni_gcm_encrypt_impl *encrypt_blocks =
#if CAN_USE_GCM_ASM >= 2
ctx->impl == GCM_IMPL_AVX2 ?
aesni_gcm_encrypt_avx2 :
#endif
aesni_gcm_encrypt_avx;
const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
uint64_t *ghash = ctx->gcm_ghash;
uint64_t *htable = ctx->gcm_Htable;
uint64_t *cb = ctx->gcm_cb;
uint8_t *ct_buf = NULL;
uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
@@ -1156,8 +1311,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
/* Do the bulk encryption in chunk_size blocks. */
for (; bleft >= chunk_size; bleft -= chunk_size) {
kfpu_begin();
done = aesni_gcm_encrypt(
datap, ct_buf, chunk_size, key, cb, ghash);
done = encrypt_blocks(
datap, ct_buf, chunk_size, key, cb, htable, ghash);
clear_fpu_regs();
kfpu_end();
@@ -1180,7 +1335,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
/* Bulk encrypt the remaining data. */
kfpu_begin();
if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
done = encrypt_blocks(datap, ct_buf, bleft, key, cb, htable,
ghash);
if (done == 0) {
rv = CRYPTO_FAILED;
goto out;
@@ -1293,6 +1449,29 @@ gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
return (CRYPTO_SUCCESS);
}
/*
 * Adapter that gives aesni_gcm_decrypt() the aesni_gcm_decrypt_impl
 * signature. Htable is accepted only to match that signature; it is not
 * forwarded. Returns whatever aesni_gcm_decrypt() returns.
 */
static size_t
aesni_gcm_decrypt_avx(const uint8_t *in, uint8_t *out, size_t len,
    const void *key, uint64_t *iv, const uint64_t *Htable, uint64_t *Xip)
{
	size_t processed;

	(void) Htable;
	processed = aesni_gcm_decrypt(in, out, len, key, iv, Xip);
	return (processed);
}
#if CAN_USE_GCM_ASM >= 2
/*
 * Adapter that gives aes_gcm_dec_update_vaes_avx2() the
 * aesni_gcm_decrypt_impl signature.
 *
 * len is rounded down to a multiple of 16 before the assembly routine is
 * called. Afterwards the 32-bit big-endian counter held in bytes 12..15 of
 * iv is advanced by the number of 16-byte blocks passed in. Returns the
 * rounded-down byte count.
 */
static size_t
aesni_gcm_decrypt_avx2(const uint8_t *in, uint8_t *out, size_t len,
    const void *key, uint64_t *iv, const uint64_t *Htable, uint64_t *Xip)
{
	uint8_t *counter_block = (uint8_t *)iv;
	uint8_t *counter_be = &counter_block[12];
	const size_t nbytes = len & kSizeTWithoutLower4Bits;
	uint32_t counter;

	aes_gcm_dec_update_vaes_avx2(in, out, nbytes, key, counter_block,
	    (const uint128_t *)Htable, (uint8_t *)Xip);

	/* Bump the counter by the number of blocks handed to the routine. */
	counter = CRYPTO_load_u32_be(counter_be);
	CRYPTO_store_u32_be(counter_be, counter + nbytes / 16);

	return (nbytes);
}
#endif /* if CAN_USE_GCM_ASM >= 2 */
/*
* Finalize decryption: We just have accumulated crypto text, so now we
* decrypt it here inplace.
@@ -1306,10 +1485,17 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
B_FALSE);
size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
aesni_gcm_decrypt_impl *decrypt_blocks =
#if CAN_USE_GCM_ASM >= 2
ctx->impl == GCM_IMPL_AVX2 ?
aesni_gcm_decrypt_avx2 :
#endif
aesni_gcm_decrypt_avx;
size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
uint8_t *datap = ctx->gcm_pt_buf;
const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
uint32_t *cb = (uint32_t *)ctx->gcm_cb;
uint64_t *htable = ctx->gcm_Htable;
uint64_t *ghash = ctx->gcm_ghash;
uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
int rv = CRYPTO_SUCCESS;
@@ -1322,8 +1508,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
*/
for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
kfpu_begin();
done = aesni_gcm_decrypt(datap, datap, chunk_size,
(const void *)key, ctx->gcm_cb, ghash);
done = decrypt_blocks(datap, datap, chunk_size,
(const void *)key, ctx->gcm_cb, htable, ghash);
clear_fpu_regs();
kfpu_end();
if (done != chunk_size) {
@@ -1334,8 +1520,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
/* Decrypt remainder, which is less than chunk size, in one go. */
kfpu_begin();
if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
done = aesni_gcm_decrypt(datap, datap, bleft,
(const void *)key, ctx->gcm_cb, ghash);
done = decrypt_blocks(datap, datap, bleft,
(const void *)key, ctx->gcm_cb, htable, ghash);
if (done == 0) {
clear_fpu_regs();
kfpu_end();
@@ -1424,13 +1610,42 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
B_FALSE);
size_t htab_len = 0;
#if CAN_USE_GCM_ASM >= 2
if (ctx->impl == GCM_IMPL_AVX2) {
/*
* BoringSSL's API specifies uint128_t[16] for htab; but only
* uint128_t[12] are used.
* See https://github.com/google/boringssl/blob/
* 813840dd094f9e9c1b00a7368aa25e656554221f1/crypto/fipsmodule/
* modes/asm/aes-gcm-avx2-x86_64.pl#L198-L200
*/
htab_len = (2 * 8 * sizeof (uint128_t));
} else
#endif /* CAN_USE_GCM_ASM >= 2 */
{
htab_len = (2 * 6 * sizeof (uint128_t));
}
ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP);
if (ctx->gcm_Htable == NULL) {
return (CRYPTO_HOST_MEMORY);
}
/* Init H (encrypt zero block) and create the initial counter block. */
memset(H, 0, sizeof (ctx->gcm_H));
kfpu_begin();
aes_encrypt_intel(keysched, aes_rounds,
(const uint32_t *)H, (uint32_t *)H);
gcm_init_htab_avx(ctx->gcm_Htable, H);
#if CAN_USE_GCM_ASM >= 2
if (ctx->impl == GCM_IMPL_AVX2) {
gcm_init_vpclmulqdq_avx2((uint128_t *)ctx->gcm_Htable, H);
} else
#endif /* if CAN_USE_GCM_ASM >= 2 */
{
gcm_init_htab_avx(ctx->gcm_Htable, H);
}
if (iv_len == 12) {
memcpy(cb, iv, 12);