Add support for selecting encryption backend

- Add two new module parameters to icp (icp_aes_impl, icp_gcm_impl) that control the crypto implementation. At the moment there is a choice between generic and aesni (on platforms that support it). - This enables support for AES-NI and PCLMULQDQ-NI on AMD Family 15h (bulldozer) and newer CPUs (zen). - Modify aes_key_t to track what implementation it was generated with as key schedules generated with various implementations are not necessarily interchangable. Reviewed by: Gvozden Neskovic <neskovic@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tom Caputi <tcaputi@datto.com> Reviewed-by: Richard Laager <rlaager@wiktel.com> Signed-off-by: Nathaniel R. Lewis <linux.robotdude@gmail.com> Closes #7102 Closes #7103
2026-05-22 10:37:35 +03:00 · 2018-08-02 11:59:24 -07:00
parent 3d503a76e8
commit 010d12474c
18 changed files with 2292 additions and 1582 deletions
@@ -0,0 +1,123 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64) && defined(HAVE_AES)
+
+#include <linux/simd_x86.h>
+
+/* These functions are used to execute AES-NI instructions: */
+extern int rijndael_key_setup_enc_intel(uint32_t rk[],
+	const uint32_t cipherKey[], uint64_t keyBits);
+extern int rijndael_key_setup_dec_intel(uint32_t rk[],
+	const uint32_t cipherKey[], uint64_t keyBits);
+extern void aes_encrypt_intel(const uint32_t rk[], int Nr,
+	const uint32_t pt[4], uint32_t ct[4]);
+extern void aes_decrypt_intel(const uint32_t rk[], int Nr,
+	const uint32_t ct[4], uint32_t pt[4]);
+
+
+#include <aes/aes_impl.h>
+
+/*
+ * Expand the 32-bit AES cipher key array into the encryption and decryption
+ * key schedules.
+ *
+ * Parameters:
+ * key		AES key schedule to be initialized
+ * keyarr32	User key
+ * keyBits	AES key size (128, 192, or 256 bits)
+ */
+static void
+aes_aesni_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits)
+{
+	kfpu_begin();
+	key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]),
+	    keyarr32, keybits);
+	key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]),
+	    keyarr32, keybits);
+	kfpu_end();
+}
+
+/*
+ * Encrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so copy for alignment (and byte-order
+ * reversal for little endian systems might be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk		Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr		Number of rounds
+ * pt		Input block (plain text)
+ * ct		Output block (crypto text).  Can overlap with pt
+ */
+static void
+aes_aesni_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
+    uint32_t ct[4])
+{
+	kfpu_begin();
+	aes_encrypt_intel(rk, Nr, pt, ct);
+	kfpu_end();
+}
+
+/*
+ * Decrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so copy for alignment (and byte-order
+ * reversal for little endian systems might be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk		Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr		Number of rounds
+ * ct		Input block (crypto text)
+ * pt		Output block (plain text). Can overlap with pt
+ */
+static void
+aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
+    uint32_t pt[4])
+{
+	kfpu_begin();
+	aes_decrypt_intel(rk, Nr, ct, pt);
+	kfpu_end();
+}
+
+static boolean_t
+aes_aesni_will_work(void)
+{
+	return (zfs_aes_available());
+}
+
+const aes_impl_ops_t aes_aesni_impl = {
+	.generate = &aes_aesni_generate,
+	.encrypt = &aes_aesni_encrypt,
+	.decrypt = &aes_aesni_decrypt,
+	.is_supported = &aes_aesni_will_work,
+	.needs_byteswap = B_FALSE,
+	.name = "aesni"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_AES) */
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64)
+
+#include <linux/simd_x86.h>
+
+/* These functions are used to execute amd64 instructions for AMD or Intel: */
+extern int rijndael_key_setup_enc_amd64(uint32_t rk[],
+	const uint32_t cipherKey[], int keyBits);
+extern int rijndael_key_setup_dec_amd64(uint32_t rk[],
+	const uint32_t cipherKey[], int keyBits);
+extern void aes_encrypt_amd64(const uint32_t rk[], int Nr,
+	const uint32_t pt[4], uint32_t ct[4]);
+extern void aes_decrypt_amd64(const uint32_t rk[], int Nr,
+	const uint32_t ct[4], uint32_t pt[4]);
+
+
+#include <aes/aes_impl.h>
+
+/*
+ * Expand the 32-bit AES cipher key array into the encryption and decryption
+ * key schedules.
+ *
+ * Parameters:
+ * key		AES key schedule to be initialized
+ * keyarr32	User key
+ * keyBits	AES key size (128, 192, or 256 bits)
+ */
+static void
+aes_x86_64_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits)
+{
+	key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
+	    keyarr32, keybits);
+	key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
+	    keyarr32, keybits);
+}
+
+static boolean_t
+aes_x86_64_will_work(void)
+{
+	return (B_TRUE);
+}
+
+const aes_impl_ops_t aes_x86_64_impl = {
+	.generate = &aes_x86_64_generate,
+	.encrypt = &aes_encrypt_amd64,
+	.decrypt = &aes_decrypt_amd64,
+	.is_supported = &aes_x86_64_will_work,
+	.needs_byteswap = B_FALSE,
+	.name = "x86_64"
+};
+
+#endif /* defined(__x86_64) */
@@ -22,93 +22,19 @@
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

-#if defined(_KERNEL) && defined(__amd64)
-#include <linux/simd_x86.h>
-
-#define	KPREEMPT_DISABLE	kfpu_begin()
-#define	KPREEMPT_ENABLE		kfpu_end()
-
-#else
-#define	KPREEMPT_DISABLE
-#define	KPREEMPT_ENABLE
-#endif	/* _KERNEL */
-
 #include <sys/zfs_context.h>
 #include <modes/modes.h>
 #include <sys/crypto/common.h>
+#include <sys/crypto/icp.h>
 #include <sys/crypto/impl.h>
 #include <sys/byteorder.h>
+#include <modes/gcm_impl.h>

-#ifdef __amd64
-
-extern void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
-static int intel_pclmulqdq_instruction_present(void);
-#endif	/* __amd64 */
-
-struct aes_block {
-	uint64_t a;
-	uint64_t b;
-};
-
-
-/*
- * gcm_mul()
- * Perform a carry-less multiplication (that is, use XOR instead of the
- * multiply operator) on *x_in and *y and place the result in *res.
- *
- * Byte swap the input (*x_in and *y) and the output (*res).
- *
- * Note: x_in, y, and res all point to 16-byte numbers (an array of two
- * 64-bit integers).
- */
-void
-gcm_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
-{
-#ifdef __amd64
-	if (intel_pclmulqdq_instruction_present()) {
-		KPREEMPT_DISABLE;
-		gcm_mul_pclmulqdq(x_in, y, res);
-		KPREEMPT_ENABLE;
-	} else
-#endif	/* __amd64 */
-	{
-		static const uint64_t R = 0xe100000000000000ULL;
-		struct aes_block z = {0, 0};
-		struct aes_block v;
-		uint64_t x;
-		int i, j;
-
-		v.a = ntohll(y[0]);
-		v.b = ntohll(y[1]);
-
-		for (j = 0; j < 2; j++) {
-			x = ntohll(x_in[j]);
-			for (i = 0; i < 64; i++, x <<= 1) {
-				if (x & 0x8000000000000000ULL) {
-					z.a ^= v.a;
-					z.b ^= v.b;
-				}
-				if (v.b & 1ULL) {
-					v.b = (v.a << 63)|(v.b >> 1);
-					v.a = (v.a >> 1) ^ R;
-				} else {
-					v.b = (v.a << 63)|(v.b >> 1);
-					v.a = v.a >> 1;
-				}
-			}
-		}
-		res[0] = htonll(z.a);
-		res[1] = htonll(z.b);
-	}
-}
-
-
-#define	GHASH(c, d, t) \
+#define	GHASH(c, d, t, o) \
 	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
-	gcm_mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
+	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
 	(uint64_t *)(void *)(t));

-
 /*
 * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
 * is done in another function.
@@ -120,6 +46,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
+	gcm_impl_ops_t *gops;
 	size_t remainder = length;
 	size_t need = 0;
 	uint8_t *datap = (uint8_t *)data;
@@ -147,6 +74,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
 	if (out != NULL)
 		crypto_init_ptrs(out, &iov_or_mp, &offset);

+	gops = gcm_impl_get_ops();
 	do {
 		/* Unprocessed data from last call. */
 		if (ctx->gcm_remainder_len > 0) {
@@ -207,7 +135,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
 		}

 		/* add ciphertext to the hash */
-		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash);
+		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);

 		/* Update pointer to next block of data to be processed. */
 		if (ctx->gcm_remainder_len != 0) {
@@ -240,6 +168,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
+	gcm_impl_ops_t *gops;
 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
 	uint8_t *ghash, *macp = NULL;
 	int i, rv;
@@ -249,6 +178,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
 		return (CRYPTO_DATA_LEN_RANGE);
 	}

+	gops = gcm_impl_get_ops();
 	ghash = (uint8_t *)ctx->gcm_ghash;

 	if (ctx->gcm_remainder_len > 0) {
@@ -281,14 +211,14 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
 		}

 		/* add ciphertext to the hash */
-		GHASH(ctx, macp, ghash);
+		GHASH(ctx, macp, ghash, gops);

 		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
 	}

 	ctx->gcm_len_a_len_c[1] =
 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
-	GHASH(ctx, ctx->gcm_len_a_len_c, ghash);
+	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
 	    (uint8_t *)ctx->gcm_J0);
 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
@@ -340,7 +270,7 @@ gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
 	bcopy(datap, (uint8_t *)ctx->gcm_tmp, ctx->gcm_remainder_len);

 	/* add ciphertext to the hash */
-	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash);
+	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());

 	/* decrypt remaining ciphertext */
 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
@@ -390,6 +320,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
+	gcm_impl_ops_t *gops;
 	size_t pt_len;
 	size_t remainder;
 	uint8_t *ghash;
@@ -401,6 +332,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,

 	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);

+	gops = gcm_impl_get_ops();
 	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
 	ghash = (uint8_t *)ctx->gcm_ghash;
 	blockp = ctx->gcm_pt_buf;
@@ -420,7 +352,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
 			goto out;
 		}
 		/* add ciphertext to the hash */
-		GHASH(ctx, blockp, ghash);
+		GHASH(ctx, blockp, ghash, gops);

 		/*
 		 * Increment counter.
@@ -443,7 +375,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
 	}
 out:
 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
-	GHASH(ctx, ctx->gcm_len_a_len_c, ghash);
+	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
 	    (uint8_t *)ctx->gcm_J0);
 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
@@ -495,12 +427,14 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
+	gcm_impl_ops_t *gops;
 	uint8_t *cb;
 	ulong_t remainder = iv_len;
 	ulong_t processed = 0;
 	uint8_t *datap, *ghash;
 	uint64_t len_a_len_c[2];

+	gops = gcm_impl_get_ops();
 	ghash = (uint8_t *)ctx->gcm_ghash;
 	cb = (uint8_t *)ctx->gcm_cb;
 	if (iv_len == 12) {
@@ -524,12 +458,12 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
 				processed += block_size;
 				remainder -= block_size;
 			}
-			GHASH(ctx, datap, ghash);
+			GHASH(ctx, datap, ghash, gops);
 		} while (remainder > 0);

 		len_a_len_c[0] = 0;
 		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
-		GHASH(ctx, len_a_len_c, ctx->gcm_J0);
+		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);

 		/* J0 will be used again in the final */
 		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
@@ -547,6 +481,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
 {
+	gcm_impl_ops_t *gops;
 	uint8_t *ghash, *datap, *authp;
 	size_t remainder, processed;

@@ -558,6 +493,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
 	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
 	    copy_block, xor_block);

+	gops = gcm_impl_get_ops();
 	authp = (uint8_t *)ctx->gcm_tmp;
 	ghash = (uint8_t *)ctx->gcm_ghash;
 	bzero(authp, block_size);
@@ -582,7 +518,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
 		}

 		/* add auth data to the hash */
-		GHASH(ctx, datap, ghash);
+		GHASH(ctx, datap, ghash, gops);

 	} while (remainder > 0);

@@ -694,55 +630,206 @@ gcm_set_kmflag(gcm_ctx_t *ctx, int kmflag)
 	ctx->gcm_kmflag = kmflag;
 }

+/* GCM implementation that contains the fastest methods */
+static gcm_impl_ops_t gcm_fastest_impl = {
+	.name = "fastest"
+};

-#ifdef __amd64
+/* All compiled in implementations */
+const gcm_impl_ops_t *gcm_all_impl[] = {
+	&gcm_generic_impl,
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+	&gcm_pclmulqdq_impl,
+#endif
+};

-#define	INTEL_PCLMULQDQ_FLAG (1 << 1)
+/* Indicate that benchmark has been completed */
+static boolean_t gcm_impl_initialized = B_FALSE;
+
+/* Select aes implementation */
+#define	IMPL_FASTEST	(UINT32_MAX)
+#define	IMPL_CYCLE	(UINT32_MAX-1)
+
+#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
+
+static uint32_t icp_gcm_impl = IMPL_FASTEST;
+static uint32_t user_sel_impl = IMPL_FASTEST;
+
+/* Hold all supported implementations */
+static size_t gcm_supp_impl_cnt = 0;
+static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];

 /*
- * Return 1 if executing on Intel with PCLMULQDQ instructions,
- * otherwise 0 (i.e., Intel without PCLMULQDQ or AMD64).
- * Cache the result, as the CPU can't change.
- *
- * Note: the userland version uses getisax().  The kernel version uses
- * is_x86_featureset().
+ * Selects the gcm operation
 */
-static int
-intel_pclmulqdq_instruction_present(void)
+gcm_impl_ops_t *
+gcm_impl_get_ops()
 {
-	static int cached_result = -1;
-	unsigned eax, ebx, ecx, edx;
-	unsigned func, subfunc;
+	gcm_impl_ops_t *ops = NULL;
+	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

-	if (cached_result == -1) { /* first time */
-		/* check for an intel cpu */
-		func = 0;
-		subfunc = 0;
+	switch (impl) {
+	case IMPL_FASTEST:
+		ASSERT(gcm_impl_initialized);
+		ops = &gcm_fastest_impl;
+		break;
+	case IMPL_CYCLE:
+	{
+		ASSERT(gcm_impl_initialized);
+		ASSERT3U(gcm_supp_impl_cnt, >, 0);
+		/* Cycle through supported implementations */
+		static size_t cycle_impl_idx = 0;
+		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
+		ops = gcm_supp_impl[idx];
+	}
+	break;
+	default:
+		ASSERT3U(impl, <, gcm_supp_impl_cnt);
+		ASSERT3U(gcm_supp_impl_cnt, >, 0);
+		if (impl < ARRAY_SIZE(gcm_all_impl))
+			ops = gcm_supp_impl[impl];
+		break;
+	}

-		__asm__ __volatile__(
-		    "cpuid"
-		    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
-		    : "a"(func), "c"(subfunc));
+	ASSERT3P(ops, !=, NULL);

-		if (memcmp((char *)(&ebx), "Genu", 4) == 0 &&
-		    memcmp((char *)(&edx), "ineI", 4) == 0 &&
-		    memcmp((char *)(&ecx), "ntel", 4) == 0) {
-			func = 1;
-			subfunc = 0;
+	return (ops);
+}

-			/* check for aes-ni instruction set */
-			__asm__ __volatile__(
-			    "cpuid"
-			    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
-			    : "a"(func), "c"(subfunc));
+void
+gcm_impl_init(void)
+{
+	gcm_impl_ops_t *curr_impl;
+	int i, c;

-			cached_result = !!(ecx & INTEL_PCLMULQDQ_FLAG);
-		} else {
-			cached_result = 0;
+	/* move supported impl into aes_supp_impls */
+	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
+		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
+
+		if (curr_impl->is_supported())
+			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
+	}
+	gcm_supp_impl_cnt = c;
+
+	/* set fastest implementation. assume hardware accelerated is fastest */
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+	if (gcm_pclmulqdq_impl.is_supported())
+		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
+		    sizeof (gcm_fastest_impl));
+	else
+#endif
+		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
+		    sizeof (gcm_fastest_impl));
+
+	strcpy(gcm_fastest_impl.name, "fastest");
+
+	/* Finish initialization */
+	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
+	gcm_impl_initialized = B_TRUE;
+}
+
+static const struct {
+	char *name;
+	uint32_t sel;
+} gcm_impl_opts[] = {
+		{ "cycle",	IMPL_CYCLE },
+		{ "fastest",	IMPL_FASTEST },
+};
+
+/*
+ * Function sets desired gcm implementation.
+ *
+ * If we are called before init(), user preference will be saved in
+ * user_sel_impl, and applied in later init() call. This occurs when module
+ * parameter is specified on module load. Otherwise, directly update
+ * icp_aes_impl.
+ *
+ * @val		Name of gcm implementation to use
+ * @param	Unused.
+ */
+int
+gcm_impl_set(const char *val)
+{
+	int err = -EINVAL;
+	char req_name[GCM_IMPL_NAME_MAX];
+	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
+	size_t i;
+
+	/* sanitize input */
+	i = strnlen(val, GCM_IMPL_NAME_MAX);
+	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
+		return (err);
+
+	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
+	while (i > 0 && isspace(req_name[i-1]))
+		i--;
+	req_name[i] = '\0';
+
+	/* Check mandatory options */
+	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
+		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
+			impl = gcm_impl_opts[i].sel;
+			err = 0;
+			break;
 		}
 	}

-	return (cached_result);
+	/* check all supported impl if init() was already called */
+	if (err != 0 && gcm_impl_initialized) {
+		/* check all supported implementations */
+		for (i = 0; i < gcm_supp_impl_cnt; i++) {
+			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
+				impl = i;
+				err = 0;
+				break;
+			}
+		}
+	}
+
+	if (err == 0) {
+		if (gcm_impl_initialized)
+			atomic_swap_32(&icp_gcm_impl, impl);
+		else
+			atomic_swap_32(&user_sel_impl, impl);
+	}
+
+	return (err);
 }

-#endif	/* __amd64 */
+#if defined(_KERNEL)
+#include <linux/mod_compat.h>
+
+static int
+icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
+{
+	return (gcm_impl_set(val));
+}
+
+static int
+icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+	int i, cnt = 0;
+	char *fmt;
+	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
+
+	ASSERT(gcm_impl_initialized);
+
+	/* list mandatory options */
+	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
+		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
+		cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name);
+	}
+
+	/* list all supported implementations */
+	for (i = 0; i < gcm_supp_impl_cnt; i++) {
+		fmt = (i == impl) ? "[%s] " : "%s ";
+		cnt += sprintf(buffer + cnt, fmt, gcm_supp_impl[i]->name);
+	}
+
+	return (cnt);
+}
+
+module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
+    NULL, 0644);
+MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
+#endif
@@ -0,0 +1,83 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <modes/gcm_impl.h>
+
+struct aes_block {
+	uint64_t a;
+	uint64_t b;
+};
+
+/*
+ * Perform a carry-less multiplication (that is, use XOR instead of the
+ * multiply operator) on *x_in and *y and place the result in *res.
+ *
+ * Byte swap the input (*x_in and *y) and the output (*res).
+ *
+ * Note: x_in, y, and res all point to 16-byte numbers (an array of two
+ * 64-bit integers).
+ */
+static void
+gcm_generic_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
+{
+	static const uint64_t R = 0xe100000000000000ULL;
+	struct aes_block z = {0, 0};
+	struct aes_block v;
+	uint64_t x;
+	int i, j;
+
+	v.a = ntohll(y[0]);
+	v.b = ntohll(y[1]);
+
+	for (j = 0; j < 2; j++) {
+		x = ntohll(x_in[j]);
+		for (i = 0; i < 64; i++, x <<= 1) {
+			if (x & 0x8000000000000000ULL) {
+				z.a ^= v.a;
+				z.b ^= v.b;
+			}
+			if (v.b & 1ULL) {
+				v.b = (v.a << 63)|(v.b >> 1);
+				v.a = (v.a >> 1) ^ R;
+			} else {
+				v.b = (v.a << 63)|(v.b >> 1);
+				v.a = v.a >> 1;
+			}
+		}
+	}
+	res[0] = htonll(z.a);
+	res[1] = htonll(z.b);
+}
+
+static boolean_t
+gcm_generic_will_work(void)
+{
+	return (B_TRUE);
+}
+
+const gcm_impl_ops_t gcm_generic_impl = {
+	.mul = &gcm_generic_mul,
+	.is_supported = &gcm_generic_will_work,
+	.name = "generic"
+};
@@ -0,0 +1,64 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+
+#include <linux/simd_x86.h>
+
+/* These functions are used to execute pclmulqdq based assembly methods */
+extern void gcm_mul_pclmulqdq(uint64_t *, uint64_t *, uint64_t *);
+
+
+#include <modes/gcm_impl.h>
+
+/*
+ * Perform a carry-less multiplication (that is, use XOR instead of the
+ * multiply operator) on *x_in and *y and place the result in *res.
+ *
+ * Byte swap the input (*x_in and *y) and the output (*res).
+ *
+ * Note: x_in, y, and res all point to 16-byte numbers (an array of two
+ * 64-bit integers).
+ */
+static void
+gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
+{
+	kfpu_begin();
+	gcm_mul_pclmulqdq(x_in, y, res);
+	kfpu_end();
+}
+
+static boolean_t
+gcm_pclmulqdq_will_work(void)
+{
+	return (zfs_pclmulqdq_available());
+}
+
+const gcm_impl_ops_t gcm_pclmulqdq_impl = {
+	.mul = &gcm_pclmulqdq_mul,
+	.is_supported = &gcm_pclmulqdq_will_work,
+	.name = "pclmulqdq"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_PCLMULQDQ) */