Add support for selecting encryption backend

- Add two new module parameters to icp (icp_aes_impl, icp_gcm_impl) that control the crypto implementation. At the moment there is a choice between generic and aesni (on platforms that support it). - This enables support for AES-NI and PCLMULQDQ-NI on AMD Family 15h (bulldozer) and newer CPUs (zen). - Modify aes_key_t to track what implementation it was generated with as key schedules generated with various implementations are not necessarily interchangeable. Reviewed-by: Gvozden Neskovic <neskovic@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tom Caputi <tcaputi@datto.com> Reviewed-by: Richard Laager <rlaager@wiktel.com> Signed-off-by: Nathaniel R. Lewis <linux.robotdude@gmail.com> Closes #7102 Closes #7103
2026-06-03 21:04:08 +03:00 · 2018-08-02 11:59:24 -07:00
parent 3d503a76e8
commit 010d12474c
18 changed files with 2292 additions and 1582 deletions
@@ -129,6 +129,7 @@
 #include <zfs_fletcher.h>
 #include <libnvpair.h>
 #include <libzfs.h>
+#include <sys/crypto/icp.h>
 #ifdef __GLIBC__
 #include <execinfo.h> /* for backtrace() */
 #endif
@@ -3836,6 +3837,13 @@ ztest_dataset_create(char *dsname)
 		VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props,
 		    crypto_args, &dcp));

+		/*
+		 * Cycle through all available encryption implementations
+		 * to verify interoperability.
+		 */
+		VERIFY0(gcm_impl_set("cycle"));
+		VERIFY0(aes_impl_set("cycle"));
+
 		fnvlist_free(crypto_args);
 		fnvlist_free(props);
 	}
@@ -21,6 +21,8 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD], [
 			ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512PF
 			ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512ER
 			ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL
+			ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES
+			ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ
 			;;
 	esac
 ])
@@ -359,3 +361,43 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL], [
 		AC_MSG_RESULT([no])
 	])
 ])
+
+dnl #
+dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES
+dnl #
+dnl # Link a minimal program containing an "aesenc" instruction and
+dnl # define HAVE_AES when the host toolchain accepts it.
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES], [
+	AC_MSG_CHECKING([whether host toolchain supports AES])
+
+	AC_LINK_IFELSE([AC_LANG_SOURCE([
+	[
+		int main(void)
+		{
+			__asm__ __volatile__("aesenc %xmm0, %xmm1");
+			return (0);
+		}
+	]])], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE([HAVE_AES], 1, [Define if host toolchain supports AES])
+	], [
+		AC_MSG_RESULT([no])
+	])
+])
+
+dnl #
+dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ
+dnl #
+dnl # Link a minimal program containing a "pclmulqdq" instruction and
+dnl # define HAVE_PCLMULQDQ when the host toolchain accepts it.
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ], [
+	AC_MSG_CHECKING([whether host toolchain supports PCLMULQDQ])
+
+	AC_LINK_IFELSE([AC_LANG_SOURCE([
+	[
+		int main(void)
+		{
+			__asm__ __volatile__("pclmulqdq %0, %%xmm0, %%xmm1" :: "i"(0));
+			return (0);
+		}
+	]])], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE([HAVE_PCLMULQDQ], 1, [Define if host toolchain supports PCLMULQDQ])
+	], [
+		AC_MSG_RESULT([no])
+	])
+])
@@ -148,7 +148,9 @@ typedef enum cpuid_inst_sets {
 	AVX512VBMI,
 	AVX512PF,
 	AVX512ER,
-	AVX512VL
+	AVX512VL,
+	AES,
+	PCLMULQDQ
 } cpuid_inst_sets_t;

 /*
@@ -170,6 +172,8 @@ typedef struct cpuid_feature_desc {
 #define	_AVX512PF_BIT		(_AVX512F_BIT | (1U << 26))
 #define	_AVX512ER_BIT		(_AVX512F_BIT | (1U << 27))
 #define	_AVX512VL_BIT		(1U << 31) /* if used also check other levels */
+#define	_AES_BIT		(1U << 25)
+#define	_PCLMULQDQ_BIT		(1U << 1)

 /*
 * Descriptions of supported instruction sets
@@ -194,7 +198,9 @@ static const cpuid_feature_desc_t cpuid_features[] = {
 	[AVX512VBMI]	= {7U, 0U, _AVX512VBMI_BIT,	ECX	},
 	[AVX512PF]	= {7U, 0U, _AVX512PF_BIT,	EBX	},
 	[AVX512ER]	= {7U, 0U, _AVX512ER_BIT,	EBX	},
-	[AVX512VL]	= {7U, 0U, _AVX512ER_BIT,	EBX	}
+	[AVX512VL]	= {7U, 0U, _AVX512ER_BIT,	EBX	},
+	[AES]		= {1U, 0U, _AES_BIT,		ECX	},
+	[PCLMULQDQ]	= {1U, 0U, _PCLMULQDQ_BIT,	ECX	},
 };

 /*
@@ -265,6 +271,8 @@ CPUID_FEATURE_CHECK(avx512vbmi, AVX512VBMI);
 CPUID_FEATURE_CHECK(avx512pf, AVX512PF);
 CPUID_FEATURE_CHECK(avx512er, AVX512ER);
 CPUID_FEATURE_CHECK(avx512vl, AVX512VL);
+CPUID_FEATURE_CHECK(aes, AES);
+CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ);

 #endif /* !defined(_KERNEL) */

@@ -442,6 +450,35 @@ zfs_bmi2_available(void)
 #endif
 }

+/*
+ * Report whether the AES instruction set can be used on this CPU
+ */
+static inline boolean_t
+zfs_aes_available(void)
+{
+#if !defined(_KERNEL)
+	/* user space: rely on the CPUID-based feature check */
+	return (__cpuid_has_aes());
+#elif defined(X86_FEATURE_AES)
+	return (!!boot_cpu_has(X86_FEATURE_AES));
+#else
+	/* kernel build whose headers do not define the feature flag */
+	return (B_FALSE);
+#endif
+}
+
+/*
+ * Report whether the PCLMULQDQ instruction set can be used on this CPU
+ */
+static inline boolean_t
+zfs_pclmulqdq_available(void)
+{
+#if !defined(_KERNEL)
+	/* user space: rely on the CPUID-based feature check */
+	return (__cpuid_has_pclmulqdq());
+#elif defined(X86_FEATURE_PCLMULQDQ)
+	return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ));
+#else
+	/* kernel build whose headers do not define the feature flag */
+	return (B_FALSE);
+#endif
+}

 /*
 * AVX-512 family of instruction sets:
@@ -44,4 +44,7 @@ int skein_mod_fini(void);
 int icp_init(void);
 void icp_fini(void);

+int aes_impl_set(const char *);
+int gcm_impl_set(const char *);
+
 #endif /* _SYS_CRYPTO_ALGS_H */
@@ -18,8 +18,8 @@ if TARGET_ASM_X86_64
 ASM_SOURCES_C = asm-x86_64/aes/aeskey.c
 ASM_SOURCES_AS = \
 	asm-x86_64/aes/aes_amd64.S \
-	asm-x86_64/aes/aes_intel.S \
-	asm-x86_64/modes/gcm_intel.S \
+	asm-x86_64/aes/aes_aesni.S \
+	asm-x86_64/modes/gcm_pclmulqdq.S \
 	asm-x86_64/sha1/sha1-x86_64.S \
 	asm-x86_64/sha2/sha256_impl.S \
 	asm-x86_64/sha2/sha512_impl.S
@@ -46,11 +46,16 @@ KERNEL_C = \
 	api/kcf_cipher.c \
 	api/kcf_miscapi.c \
 	api/kcf_mac.c \
+	algs/aes/aes_impl_aesni.c \
+	algs/aes/aes_impl_generic.c \
+	algs/aes/aes_impl_x86-64.c \
 	algs/aes/aes_impl.c \
 	algs/aes/aes_modes.c \
 	algs/edonr/edonr.c \
 	algs/modes/modes.c \
 	algs/modes/cbc.c \
+	algs/modes/gcm_generic.c \
+	algs/modes/gcm_pclmulqdq.c \
 	algs/modes/gcm.c \
 	algs/modes/ctr.c \
 	algs/modes/ccm.c \
@@ -8,8 +8,8 @@ TARGET_ASM_DIR = @TARGET_ASM_DIR@
 ifeq ($(TARGET_ASM_DIR), asm-x86_64)
 ASM_SOURCES := asm-x86_64/aes/aeskey.o
 ASM_SOURCES += asm-x86_64/aes/aes_amd64.o
-ASM_SOURCES += asm-x86_64/aes/aes_intel.o
-ASM_SOURCES += asm-x86_64/modes/gcm_intel.o
+ASM_SOURCES += asm-x86_64/aes/aes_aesni.o
+ASM_SOURCES += asm-x86_64/modes/gcm_pclmulqdq.o
 ASM_SOURCES += asm-x86_64/sha1/sha1-x86_64.o
 ASM_SOURCES += asm-x86_64/sha2/sha256_impl.o
 ASM_SOURCES += asm-x86_64/sha2/sha512_impl.o
@@ -53,8 +53,10 @@ $(MODULE)-objs += algs/modes/cbc.o
 $(MODULE)-objs += algs/modes/ccm.o
 $(MODULE)-objs += algs/modes/ctr.o
 $(MODULE)-objs += algs/modes/ecb.o
+$(MODULE)-objs += algs/modes/gcm_generic.o
 $(MODULE)-objs += algs/modes/gcm.o
 $(MODULE)-objs += algs/modes/modes.o
+$(MODULE)-objs += algs/aes/aes_impl_generic.o
 $(MODULE)-objs += algs/aes/aes_impl.o
 $(MODULE)-objs += algs/aes/aes_modes.o
 $(MODULE)-objs += algs/edonr/edonr.o
@@ -66,6 +68,10 @@ $(MODULE)-objs += algs/skein/skein_block.o
 $(MODULE)-objs += algs/skein/skein_iv.o
 $(MODULE)-objs += $(ASM_SOURCES)

+$(MODULE)-$(CONFIG_X86) += algs/modes/gcm_pclmulqdq.o
+$(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_aesni.o
+$(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_x86-64.o
+
 ICP_DIRS = \
 	api \
 	core \
@@ -0,0 +1,123 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64) && defined(HAVE_AES)
+
+#include <linux/simd_x86.h>
+
+/* These functions are used to execute AES-NI instructions: */
+extern int rijndael_key_setup_enc_intel(uint32_t rk[],
+	const uint32_t cipherKey[], uint64_t keyBits);
+extern int rijndael_key_setup_dec_intel(uint32_t rk[],
+	const uint32_t cipherKey[], uint64_t keyBits);
+extern void aes_encrypt_intel(const uint32_t rk[], int Nr,
+	const uint32_t pt[4], uint32_t ct[4]);
+extern void aes_decrypt_intel(const uint32_t rk[], int Nr,
+	const uint32_t ct[4], uint32_t pt[4]);
+
+
+#include <aes/aes_impl.h>
+
+/*
+ * Expand the 32-bit AES cipher key array into the encryption and decryption
+ * key schedules using the AES-NI key setup routines. Both calls are made
+ * between kfpu_begin() and kfpu_end(), and each one stores its return
+ * value in key->nr.
+ *
+ * Parameters:
+ * key		AES key schedule to be initialized
+ * keyarr32	User key
+ * keybits	AES key size (128, 192, or 256 bits)
+ */
+static void
+aes_aesni_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits)
+{
+	kfpu_begin();
+	key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]),
+	    keyarr32, keybits);
+	key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]),
+	    keyarr32, keybits);
+	kfpu_end();
+}
+
+/*
+ * Encrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so copy for alignment (and byte-order
+ * reversal for little endian systems) might be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk		Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr		Number of rounds
+ * pt		Input block (plain text)
+ * ct		Output block (crypto text).  Can overlap with pt
+ */
+static void
+aes_aesni_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
+    uint32_t ct[4])
+{
+	kfpu_begin();
+	aes_encrypt_intel(rk, Nr, pt, ct);
+	kfpu_end();
+}
+
+/*
+ * Decrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so copy for alignment (and byte-order
+ * reversal for little endian systems) might be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk		Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr		Number of rounds
+ * ct		Input block (crypto text)
+ * pt		Output block (plain text). Can overlap with ct
+ */
+static void
+aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
+    uint32_t pt[4])
+{
+	kfpu_begin();
+	aes_decrypt_intel(rk, Nr, ct, pt);
+	kfpu_end();
+}
+
+/*
+ * Support probe for the "aesni" implementation: reports whatever
+ * zfs_aes_available() returns.
+ */
+static boolean_t
+aes_aesni_will_work(void)
+{
+	return (zfs_aes_available());
+}
+
+/*
+ * Ops vector for the AES-NI implementation, published under the
+ * name "aesni".
+ */
+const aes_impl_ops_t aes_aesni_impl = {
+	.generate = &aes_aesni_generate,
+	.encrypt = &aes_aesni_encrypt,
+	.decrypt = &aes_aesni_decrypt,
+	.is_supported = &aes_aesni_will_work,
+	.needs_byteswap = B_FALSE,
+	.name = "aesni"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_AES) */
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64)
+
+#include <linux/simd_x86.h>
+
+/* These functions are used to execute amd64 instructions for AMD or Intel: */
+extern int rijndael_key_setup_enc_amd64(uint32_t rk[],
+	const uint32_t cipherKey[], int keyBits);
+extern int rijndael_key_setup_dec_amd64(uint32_t rk[],
+	const uint32_t cipherKey[], int keyBits);
+extern void aes_encrypt_amd64(const uint32_t rk[], int Nr,
+	const uint32_t pt[4], uint32_t ct[4]);
+extern void aes_decrypt_amd64(const uint32_t rk[], int Nr,
+	const uint32_t ct[4], uint32_t pt[4]);
+
+
+#include <aes/aes_impl.h>
+
+/*
+ * Expand the 32-bit AES cipher key array into the encryption and decryption
+ * key schedules using the amd64 assembly key setup routines. Each call
+ * stores its return value in key->nr.
+ *
+ * Parameters:
+ * key		AES key schedule to be initialized
+ * keyarr32	User key
+ * keybits	AES key size (128, 192, or 256 bits)
+ */
+static void
+aes_x86_64_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits)
+{
+	key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
+	    keyarr32, keybits);
+	key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
+	    keyarr32, keybits);
+}
+
+/*
+ * Support probe for the "x86_64" implementation: unconditionally
+ * reports B_TRUE (no CPU feature check is performed).
+ */
+static boolean_t
+aes_x86_64_will_work(void)
+{
+	return (B_TRUE);
+}
+
+/*
+ * Ops vector for the amd64 assembly implementation, published under the
+ * name "x86_64". The encrypt/decrypt slots point directly at the
+ * assembly entry points.
+ */
+const aes_impl_ops_t aes_x86_64_impl = {
+	.generate = &aes_x86_64_generate,
+	.encrypt = &aes_encrypt_amd64,
+	.decrypt = &aes_decrypt_amd64,
+	.is_supported = &aes_x86_64_will_work,
+	.needs_byteswap = B_FALSE,
+	.name = "x86_64"
+};
+
+#endif /* defined(__x86_64) */
@@ -22,93 +22,19 @@
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

-#if defined(_KERNEL) && defined(__amd64)
-#include <linux/simd_x86.h>
-
-#define	KPREEMPT_DISABLE	kfpu_begin()
-#define	KPREEMPT_ENABLE		kfpu_end()
-
-#else
-#define	KPREEMPT_DISABLE
-#define	KPREEMPT_ENABLE
-#endif	/* _KERNEL */
-
 #include <sys/zfs_context.h>
 #include <modes/modes.h>
 #include <sys/crypto/common.h>
+#include <sys/crypto/icp.h>
 #include <sys/crypto/impl.h>
 #include <sys/byteorder.h>
+#include <modes/gcm_impl.h>

-#ifdef __amd64
-
-extern void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
-static int intel_pclmulqdq_instruction_present(void);
-#endif	/* __amd64 */
-
-struct aes_block {
-	uint64_t a;
-	uint64_t b;
-};
-
-
-/*
- * gcm_mul()
- * Perform a carry-less multiplication (that is, use XOR instead of the
- * multiply operator) on *x_in and *y and place the result in *res.
- *
- * Byte swap the input (*x_in and *y) and the output (*res).
- *
- * Note: x_in, y, and res all point to 16-byte numbers (an array of two
- * 64-bit integers).
- */
-void
-gcm_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
-{
-#ifdef __amd64
-	if (intel_pclmulqdq_instruction_present()) {
-		KPREEMPT_DISABLE;
-		gcm_mul_pclmulqdq(x_in, y, res);
-		KPREEMPT_ENABLE;
-	} else
-#endif	/* __amd64 */
-	{
-		static const uint64_t R = 0xe100000000000000ULL;
-		struct aes_block z = {0, 0};
-		struct aes_block v;
-		uint64_t x;
-		int i, j;
-
-		v.a = ntohll(y[0]);
-		v.b = ntohll(y[1]);
-
-		for (j = 0; j < 2; j++) {
-			x = ntohll(x_in[j]);
-			for (i = 0; i < 64; i++, x <<= 1) {
-				if (x & 0x8000000000000000ULL) {
-					z.a ^= v.a;
-					z.b ^= v.b;
-				}
-				if (v.b & 1ULL) {
-					v.b = (v.a << 63)|(v.b >> 1);
-					v.a = (v.a >> 1) ^ R;
-				} else {
-					v.b = (v.a << 63)|(v.b >> 1);
-					v.a = v.a >> 1;
-				}
-			}
-		}
-		res[0] = htonll(z.a);
-		res[1] = htonll(z.b);
-	}
-}
-
-
-#define	GHASH(c, d, t) \
+#define	GHASH(c, d, t, o) \
 	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
-	gcm_mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
+	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
 	(uint64_t *)(void *)(t));

-
 /*
 * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
 * is done in another function.
@@ -120,6 +46,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
+	gcm_impl_ops_t *gops;
 	size_t remainder = length;
 	size_t need = 0;
 	uint8_t *datap = (uint8_t *)data;
@@ -147,6 +74,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
 	if (out != NULL)
 		crypto_init_ptrs(out, &iov_or_mp, &offset);

+	gops = gcm_impl_get_ops();
 	do {
 		/* Unprocessed data from last call. */
 		if (ctx->gcm_remainder_len > 0) {
@@ -207,7 +135,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
 		}

 		/* add ciphertext to the hash */
-		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash);
+		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);

 		/* Update pointer to next block of data to be processed. */
 		if (ctx->gcm_remainder_len != 0) {
@@ -240,6 +168,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
+	gcm_impl_ops_t *gops;
 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
 	uint8_t *ghash, *macp = NULL;
 	int i, rv;
@@ -249,6 +178,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
 		return (CRYPTO_DATA_LEN_RANGE);
 	}

+	gops = gcm_impl_get_ops();
 	ghash = (uint8_t *)ctx->gcm_ghash;

 	if (ctx->gcm_remainder_len > 0) {
@@ -281,14 +211,14 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
 		}

 		/* add ciphertext to the hash */
-		GHASH(ctx, macp, ghash);
+		GHASH(ctx, macp, ghash, gops);

 		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
 	}

 	ctx->gcm_len_a_len_c[1] =
 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
-	GHASH(ctx, ctx->gcm_len_a_len_c, ghash);
+	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
 	    (uint8_t *)ctx->gcm_J0);
 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
@@ -340,7 +270,7 @@ gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
 	bcopy(datap, (uint8_t *)ctx->gcm_tmp, ctx->gcm_remainder_len);

 	/* add ciphertext to the hash */
-	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash);
+	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());

 	/* decrypt remaining ciphertext */
 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
@@ -390,6 +320,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
+	gcm_impl_ops_t *gops;
 	size_t pt_len;
 	size_t remainder;
 	uint8_t *ghash;
@@ -401,6 +332,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,

 	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);

+	gops = gcm_impl_get_ops();
 	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
 	ghash = (uint8_t *)ctx->gcm_ghash;
 	blockp = ctx->gcm_pt_buf;
@@ -420,7 +352,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
 			goto out;
 		}
 		/* add ciphertext to the hash */
-		GHASH(ctx, blockp, ghash);
+		GHASH(ctx, blockp, ghash, gops);

 		/*
 		 * Increment counter.
@@ -443,7 +375,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
 	}
 out:
 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
-	GHASH(ctx, ctx->gcm_len_a_len_c, ghash);
+	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
 	    (uint8_t *)ctx->gcm_J0);
 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
@@ -495,12 +427,14 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
+	gcm_impl_ops_t *gops;
 	uint8_t *cb;
 	ulong_t remainder = iv_len;
 	ulong_t processed = 0;
 	uint8_t *datap, *ghash;
 	uint64_t len_a_len_c[2];

+	gops = gcm_impl_get_ops();
 	ghash = (uint8_t *)ctx->gcm_ghash;
 	cb = (uint8_t *)ctx->gcm_cb;
 	if (iv_len == 12) {
@@ -524,12 +458,12 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
 				processed += block_size;
 				remainder -= block_size;
 			}
-			GHASH(ctx, datap, ghash);
+			GHASH(ctx, datap, ghash, gops);
 		} while (remainder > 0);

 		len_a_len_c[0] = 0;
 		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
-		GHASH(ctx, len_a_len_c, ctx->gcm_J0);
+		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);

 		/* J0 will be used again in the final */
 		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
@@ -547,6 +481,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
+	gcm_impl_ops_t *gops;
 	uint8_t *ghash, *datap, *authp;
 	size_t remainder, processed;

@@ -558,6 +493,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
 	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
 	    copy_block, xor_block);

+	gops = gcm_impl_get_ops();
 	authp = (uint8_t *)ctx->gcm_tmp;
 	ghash = (uint8_t *)ctx->gcm_ghash;
 	bzero(authp, block_size);
@@ -582,7 +518,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
 		}

 		/* add auth data to the hash */
-		GHASH(ctx, datap, ghash);
+		GHASH(ctx, datap, ghash, gops);

 	} while (remainder > 0);

@@ -694,55 +630,206 @@ gcm_set_kmflag(gcm_ctx_t *ctx, int kmflag)
 	ctx->gcm_kmflag = kmflag;
 }

+/*
+ * GCM implementation that contains the fastest methods. Only the name is
+ * set statically; gcm_impl_init() copies the chosen implementation over it.
+ */
+static gcm_impl_ops_t gcm_fastest_impl = {
+	.name = "fastest"
+};

-#ifdef __amd64
+/*
+ * All compiled in implementations. The pclmulqdq entry exists only on
+ * x86_64 builds where HAVE_PCLMULQDQ is defined.
+ */
+const gcm_impl_ops_t *gcm_all_impl[] = {
+	&gcm_generic_impl,
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+	&gcm_pclmulqdq_impl,
+#endif
+};

-#define	INTEL_PCLMULQDQ_FLAG (1 << 1)
+/* Indicate that benchmark has been completed */
+static boolean_t gcm_impl_initialized = B_FALSE;
+
+/* Select gcm implementation */
+#define	IMPL_FASTEST	(UINT32_MAX)
+#define	IMPL_CYCLE	(UINT32_MAX-1)
+
+#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
+
+static uint32_t icp_gcm_impl = IMPL_FASTEST;
+static uint32_t user_sel_impl = IMPL_FASTEST;
+
+/* Hold all supported implementations */
+static size_t gcm_supp_impl_cnt = 0;
+static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];

 /*
- * Return 1 if executing on Intel with PCLMULQDQ instructions,
- * otherwise 0 (i.e., Intel without PCLMULQDQ or AMD64).
- * Cache the result, as the CPU can't change.
- *
- * Note: the userland version uses getisax().  The kernel version uses
- * is_x86_featureset().
+ * Selects the gcm operation
 */
-static int
-intel_pclmulqdq_instruction_present(void)
+gcm_impl_ops_t *
+gcm_impl_get_ops()
 {
-	static int cached_result = -1;
-	unsigned eax, ebx, ecx, edx;
-	unsigned func, subfunc;
+	gcm_impl_ops_t *ops = NULL;
+	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

-	if (cached_result == -1) { /* first time */
-		/* check for an intel cpu */
-		func = 0;
-		subfunc = 0;
+	switch (impl) {
+	case IMPL_FASTEST:
+		ASSERT(gcm_impl_initialized);
+		ops = &gcm_fastest_impl;
+		break;
+	case IMPL_CYCLE:
+	{
+		ASSERT(gcm_impl_initialized);
+		ASSERT3U(gcm_supp_impl_cnt, >, 0);
+		/* Cycle through supported implementations */
+		static size_t cycle_impl_idx = 0;
+		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
+		ops = gcm_supp_impl[idx];
+	}
+	break;
+	default:
+		ASSERT3U(impl, <, gcm_supp_impl_cnt);
+		ASSERT3U(gcm_supp_impl_cnt, >, 0);
+		if (impl < ARRAY_SIZE(gcm_all_impl))
+			ops = gcm_supp_impl[impl];
+		break;
+	}

-		__asm__ __volatile__(
-		    "cpuid"
-		    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
-		    : "a"(func), "c"(subfunc));
+	ASSERT3P(ops, !=, NULL);

-		if (memcmp((char *)(&ebx), "Genu", 4) == 0 &&
-		    memcmp((char *)(&edx), "ineI", 4) == 0 &&
-		    memcmp((char *)(&ecx), "ntel", 4) == 0) {
-			func = 1;
-			subfunc = 0;
+	return (ops);
+}

-			/* check for aes-ni instruction set */
-			__asm__ __volatile__(
-			    "cpuid"
-			    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
-			    : "a"(func), "c"(subfunc));
+void
+gcm_impl_init(void)
+{
+	gcm_impl_ops_t *curr_impl;
+	int i, c;

-			cached_result = !!(ecx & INTEL_PCLMULQDQ_FLAG);
-		} else {
-			cached_result = 0;
+	/* move supported impl into gcm_supp_impl */
+	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
+		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
+
+		if (curr_impl->is_supported())
+			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
+	}
+	gcm_supp_impl_cnt = c;
+
+	/* set fastest implementation. assume hardware accelerated is fastest */
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+	if (gcm_pclmulqdq_impl.is_supported())
+		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
+		    sizeof (gcm_fastest_impl));
+	else
+#endif
+		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
+		    sizeof (gcm_fastest_impl));
+
+	strcpy(gcm_fastest_impl.name, "fastest");
+
+	/* Finish initialization */
+	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
+	gcm_impl_initialized = B_TRUE;
+}
+
+/*
+ * Selector names that gcm_impl_set() always accepts, mapped to their
+ * special IMPL_* values.
+ */
+static const struct {
+	char *name;
+	uint32_t sel;
+} gcm_impl_opts[] = {
+		{ "cycle",	IMPL_CYCLE },
+		{ "fastest",	IMPL_FASTEST },
+};
+
+/*
+ * Function sets desired gcm implementation.
+ *
+ * If we are called before init(), user preference will be saved in
+ * user_sel_impl, and applied in later init() call. This occurs when module
+ * parameter is specified on module load. Otherwise, directly update
+ * icp_gcm_impl.
+ *
+ * @val		Name of gcm implementation to use; trailing whitespace is
+ *		ignored.
+ *
+ * Returns 0 on success, or -EINVAL when the name is empty, too long, or
+ * matches neither a mandatory option nor (after init) a supported
+ * implementation.
+ */
+int
+gcm_impl_set(const char *val)
+{
+	int err = -EINVAL;
+	char req_name[GCM_IMPL_NAME_MAX];
+	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
+	size_t i;
+
+	/* sanitize input */
+	i = strnlen(val, GCM_IMPL_NAME_MAX);
+	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
+		return (err);
+
+	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
+	/* cast: isspace() is only defined for unsigned char values and EOF */
+	while (i > 0 && isspace((unsigned char)req_name[i-1]))
+		i--;
+	req_name[i] = '\0';
+
+	/* Check mandatory options */
+	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
+		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
+			impl = gcm_impl_opts[i].sel;
+			err = 0;
+			break;
 		}
 	}

-	return (cached_result);
+	/* check all supported impl if init() was already called */
+	if (err != 0 && gcm_impl_initialized) {
+		/* check all supported implementations */
+		for (i = 0; i < gcm_supp_impl_cnt; i++) {
+			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
+				impl = i;
+				err = 0;
+				break;
+			}
+		}
+	}
+
+	if (err == 0) {
+		/* before init() only the saved preference is updated */
+		if (gcm_impl_initialized)
+			atomic_swap_32(&icp_gcm_impl, impl);
+		else
+			atomic_swap_32(&user_sel_impl, impl);
+	}
+
+	return (err);
 }

-#endif	/* __amd64 */
+#if defined(_KERNEL)
+#include <linux/mod_compat.h>
+
+/*
+ * Module parameter "set" hook: forwards the string to gcm_impl_set().
+ * The kp argument is not used.
+ */
+static int
+icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
+{
+	return (gcm_impl_set(val));
+}
+
+/*
+ * Module parameter "get" hook: writes every selectable name into buffer,
+ * separated by spaces, with the active selection wrapped in brackets.
+ * Returns the number of characters written. The kp argument is not used.
+ */
+static int
+icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+	const uint32_t active = GCM_IMPL_READ(icp_gcm_impl);
+	char *pos = buffer;
+	int i;
+
+	ASSERT(gcm_impl_initialized);
+
+	/* the mandatory options come first */
+	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
+		if (active == gcm_impl_opts[i].sel)
+			pos += sprintf(pos, "[%s] ", gcm_impl_opts[i].name);
+		else
+			pos += sprintf(pos, "%s ", gcm_impl_opts[i].name);
+	}
+
+	/* then every supported implementation, in index order */
+	for (i = 0; i < gcm_supp_impl_cnt; i++) {
+		if (i == active)
+			pos += sprintf(pos, "[%s] ", gcm_supp_impl[i]->name);
+		else
+			pos += sprintf(pos, "%s ", gcm_supp_impl[i]->name);
+	}
+
+	return ((int)(pos - buffer));
+}
+
+module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
+    NULL, 0644);
+MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
+#endif
@@ -0,0 +1,83 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <modes/gcm_impl.h>
+
+/* One 16-byte block held as two 64-bit halves (a is taken from index 0) */
+struct aes_block {
+	uint64_t a;
+	uint64_t b;
+};
+
+/*
+ * Portable carry-less multiplication (XOR takes the place of addition)
+ * of *x_in by *y, with the product stored in *res.
+ *
+ * Inputs are converted with ntohll() before use and the result is
+ * converted back with htonll().
+ *
+ * Note: x_in, y, and res all point to 16-byte numbers (an array of two
+ * 64-bit integers). x_in is fully consumed before res is written.
+ */
+static void
+gcm_generic_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
+{
+	static const uint64_t R = 0xe100000000000000ULL;
+	struct aes_block acc = {0, 0};
+	struct aes_block v;
+	uint64_t word, reduce;
+	int bit, half;
+
+	v.a = ntohll(y[0]);
+	v.b = ntohll(y[1]);
+
+	for (half = 0; half < 2; half++) {
+		word = ntohll(x_in[half]);
+		for (bit = 0; bit < 64; bit++, word <<= 1) {
+			/* fold v into the result when the top bit is set */
+			if (word >> 63) {
+				acc.a ^= v.a;
+				acc.b ^= v.b;
+			}
+
+			/* shift v right by one; XOR in R if a bit fell off */
+			reduce = (v.b & 1ULL) ? R : 0;
+			v.b = (v.a << 63) | (v.b >> 1);
+			v.a = (v.a >> 1) ^ reduce;
+		}
+	}
+
+	res[0] = htonll(acc.a);
+	res[1] = htonll(acc.b);
+}
+
+/*
+ * Support probe for the "generic" implementation: unconditionally
+ * reports B_TRUE.
+ */
+static boolean_t
+gcm_generic_will_work(void)
+{
+	return (B_TRUE);
+}
+
+/* Ops vector for the portable C implementation, published as "generic" */
+const gcm_impl_ops_t gcm_generic_impl = {
+	.mul = &gcm_generic_mul,
+	.is_supported = &gcm_generic_will_work,
+	.name = "generic"
+};
@@ -0,0 +1,64 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+
+#include <linux/simd_x86.h>
+
+/* These functions are used to execute pclmulqdq based assembly methods */
+extern void gcm_mul_pclmulqdq(uint64_t *, uint64_t *, uint64_t *);
+
+
+#include <modes/gcm_impl.h>
+
+/*
+ * Perform a carry-less multiplication (that is, use XOR instead of the
+ * multiply operator) on *x_in and *y and place the result in *res.
+ *
+ * Byte swap the input (*x_in and *y) and the output (*res).
+ *
+ * Note: x_in, y, and res all point to 16-byte numbers (an array of two
+ * 64-bit integers).
+ *
+ * This is a thin wrapper: the work is done by the assembly routine
+ * gcm_mul_pclmulqdq(), called between kfpu_begin() and kfpu_end().
+ */
+static void
+gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
+{
+	kfpu_begin();
+	gcm_mul_pclmulqdq(x_in, y, res);
+	kfpu_end();
+}
+
+/*
+ * Support probe for the "pclmulqdq" implementation: reports whatever
+ * zfs_pclmulqdq_available() returns.
+ */
+static boolean_t
+gcm_pclmulqdq_will_work(void)
+{
+	return (zfs_pclmulqdq_available());
+}
+
+/* Ops vector for the PCLMULQDQ implementation, published as "pclmulqdq" */
+const gcm_impl_ops_t gcm_pclmulqdq_impl = {
+	.mul = &gcm_pclmulqdq_mul,
+	.is_supported = &gcm_pclmulqdq_will_work,
+	.name = "pclmulqdq"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_PCLMULQDQ) */
@@ -178,12 +178,11 @@ rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
 }


-#else	/* lint */
+#elif defined(HAVE_AES)	/* guard by instruction set */

 #define _ASM
 #include <sys/asm_linkage.h>

-
 /*
 * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(),
 * _key_expansion_256a(), _key_expansion_256b()
@@ -81,7 +81,7 @@
 */


-#if defined(lint) || defined(__lint)
+#if defined(lint) || defined(__lint)	/* lint */

 #include <sys/types.h>

@@ -90,7 +90,7 @@ void
 gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
 }

-#else	/* lint */
+#elif defined(HAVE_PCLMULQDQ)	/* guard by instruction set */

 #define _ASM
 #include <sys/asm_linkage.h>
@@ -106,17 +106,15 @@ typedef union {
 	uint32_t	ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
 } aes_ks_t;

-/* aes_key.flags value: */
-#define	INTEL_AES_NI_CAPABLE	0x1	/* AES-NI instructions present */
-
+typedef struct aes_impl_ops aes_impl_ops_t;
 typedef struct aes_key aes_key_t;
 struct aes_key {
 	aes_ks_t	encr_ks;  /* encryption key schedule */
 	aes_ks_t	decr_ks;  /* decryption key schedule */
 #ifdef __amd64
 	long double	align128; /* Align fields above for Intel AES-NI */
-	int		flags;	  /* implementation-dependent flags */
 #endif	/* __amd64 */
+	const aes_impl_ops_t	*ops;	/* ops associated with this schedule */
 	int		nr;	  /* number of rounds (10, 12, or 14) */
 	int		type;	  /* key schedule size (32 or 64 bits) */
 };
@@ -163,6 +161,50 @@ typedef enum aes_mech_type {

 #endif /* _AES_IMPL */

+/*
+ * Methods used to define aes implementation
+ *
+ * @aes_generate_f Key generation
+ * @aes_encrypt_f Function encrypts one block
+ * @aes_decrypt_f Function decrypts one block
+ * @aes_will_work_f Function tests whether method will function
+ */
+typedef void 		(*aes_generate_f)(aes_key_t *, const uint32_t *, int);
+typedef void		(*aes_encrypt_f)(const uint32_t[], int,
+    const uint32_t[4], uint32_t[4]);
+typedef void		(*aes_decrypt_f)(const uint32_t[], int,
+    const uint32_t[4], uint32_t[4]);
+typedef boolean_t	(*aes_will_work_f)(void);
+
+#define	AES_IMPL_NAME_MAX (16)
+
+struct aes_impl_ops {
+	aes_generate_f generate;	/* key generation */
+	aes_encrypt_f encrypt;		/* encrypts one block */
+	aes_decrypt_f decrypt;		/* decrypts one block */
+	aes_will_work_f is_supported;	/* tests whether impl will function */
+	boolean_t needs_byteswap;	/* TODO confirm what is swapped */
+	char name[AES_IMPL_NAME_MAX];	/* implementation name */
+};
+
+extern const aes_impl_ops_t aes_generic_impl;
+#if defined(__x86_64)
+extern const aes_impl_ops_t aes_x86_64_impl;
+#endif
+#if defined(__x86_64) && defined(HAVE_AES)
+extern const aes_impl_ops_t aes_aesni_impl;
+#endif
+
+/*
+ * Initializes fastest implementation
+ */
+void aes_impl_init(void);
+
+/*
+ * Get selected aes implementation
+ */
+struct aes_impl_ops *aes_impl_get_ops(void);
+
 #ifdef	__cplusplus
 }
 #endif
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_GCM_IMPL_H
+#define	_GCM_IMPL_H
+
+/*
+ * GCM function dispatcher.
+ */
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+
+/*
+ * Methods used to define gcm implementation
+ *
+ * @gcm_mul_f Perform carry-less multiplication
+ * @gcm_will_work_f Function tests whether implementation will function
+ */
+typedef void 		(*gcm_mul_f)(uint64_t *, uint64_t *, uint64_t *);
+typedef boolean_t	(*gcm_will_work_f)(void);
+
+#define	GCM_IMPL_NAME_MAX (16)
+
+typedef struct gcm_impl_ops {
+	gcm_mul_f mul;			/* carry-less multiplication */
+	gcm_will_work_f is_supported;	/* tests whether impl will function */
+	char name[GCM_IMPL_NAME_MAX];	/* implementation name */
+} gcm_impl_ops_t;
+
+extern const gcm_impl_ops_t gcm_generic_impl;
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+extern const gcm_impl_ops_t gcm_pclmulqdq_impl;
+#endif
+
+/*
+ * Initializes fastest implementation
+ */
+void gcm_impl_init(void);
+
+/*
+ * Get selected gcm implementation
+ */
+struct gcm_impl_ops *gcm_impl_get_ops(void);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _GCM_IMPL_H */
@@ -35,6 +35,7 @@
 #include <sys/modctl.h>
 #define	_AES_IMPL
 #include <aes/aes_impl.h>
+#include <modes/gcm_impl.h>

 #define	CRYPTO_PROVIDER_NAME "aes"

@@ -205,6 +206,10 @@ aes_mod_init(void)
 {
 	int ret;

+	/* find fastest implementations and set any requested implementations */
+	aes_impl_init();
+	gcm_impl_init();
+
 	if ((ret = mod_install(&modlinkage)) != 0)
 		return (ret);