Linux 4.14, 4.19, 5.0+ compat: SIMD save/restore

Contrary to initial testing we cannot rely on these kernels to
invalidate the per-cpu FPU state and restore the FPU registers.
Nor can we guarantee that the kernel won't modify the FPU state
which we saved in the task struct.

Therefore, the kfpu_begin() and kfpu_end() functions have been
updated to save and restore the FPU state using our own dedicated
per-cpu FPU state variables.

This has the additional advantage of allowing us to use the FPU
again in user threads.  So we remove the code which was added to
use task queues to ensure some functions ran in kernel threads.

Reviewed-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #9346
Closes #9403
This commit is contained in:
Brian Behlendorf
2019-10-24 10:17:33 -07:00
committed by GitHub
parent b834b58ae6
commit 10fa254539
19 changed files with 276 additions and 294 deletions
+1 -2
View File
@@ -295,9 +295,8 @@ aes_impl_get_ops(void)
/*
* Initialize all supported implementations.
*/
/* ARGSUSED */
void
aes_impl_init(void *arg)
aes_impl_init(void)
{
aes_impl_ops_t *curr_impl;
int i, c;
+1 -2
View File
@@ -703,9 +703,8 @@ gcm_impl_get_ops()
/*
* Initialize all supported implementations.
*/
/* ARGSUSED */
void
gcm_impl_init(void *arg)
gcm_impl_init(void)
{
gcm_impl_ops_t *curr_impl;
int i, c;
+1 -1
View File
@@ -198,7 +198,7 @@ extern const aes_impl_ops_t aes_aesni_impl;
/*
* Initializes fastest implementation
*/
void aes_impl_init(void *arg);
void aes_impl_init(void);
/*
* Returns optimal allowed AES implementation
+1 -1
View File
@@ -61,7 +61,7 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl;
/*
* Initializes fastest implementation
*/
void gcm_impl_init(void *arg);
void gcm_impl_init(void);
/*
* Returns optimal allowed GCM implementation
+3 -29
View File
@@ -206,35 +206,9 @@ aes_mod_init(void)
{
int ret;
#if defined(_KERNEL)
/*
* Determine the fastest available implementation. The benchmarks
* are run in dedicated kernel threads to allow Linux 5.0+ kernels
* to use SIMD operations. If for some reason this isn't possible,
* fallback to the generic implementations. See the comment in
* linux/simd_x86.h for additional details. Additionally, this has
* the benefit of allowing them to be run in parallel.
*/
taskqid_t aes_id = taskq_dispatch(system_taskq, aes_impl_init,
NULL, TQ_SLEEP);
taskqid_t gcm_id = taskq_dispatch(system_taskq, gcm_impl_init,
NULL, TQ_SLEEP);
if (aes_id != TASKQID_INVALID) {
taskq_wait_id(system_taskq, aes_id);
} else {
aes_impl_init(NULL);
}
if (gcm_id != TASKQID_INVALID) {
taskq_wait_id(system_taskq, gcm_id);
} else {
gcm_impl_init(NULL);
}
#else
aes_impl_init(NULL);
gcm_impl_init(NULL);
#endif
/* Determine the fastest available implementation. */
aes_impl_init();
gcm_impl_init();
if ((ret = mod_install(&modlinkage)) != 0)
return (ret);
-2
View File
@@ -28,7 +28,6 @@
#include <sys/taskq.h>
#include <sys/kmem.h>
#include <sys/tsd.h>
#include <sys/simd.h>
int spl_taskq_thread_bind = 0;
module_param(spl_taskq_thread_bind, int, 0644);
@@ -854,7 +853,6 @@ taskq_thread(void *args)
sigfillset(&blocked);
sigprocmask(SIG_BLOCK, &blocked, NULL);
flush_signals(current);
kfpu_initialize();
tsd_set(taskq_tsd, tq);
spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
-2
View File
@@ -27,7 +27,6 @@
#include <sys/thread.h>
#include <sys/kmem.h>
#include <sys/tsd.h>
#include <sys/simd.h>
/*
* Thread interfaces
@@ -55,7 +54,6 @@ thread_generic_wrapper(void *arg)
args = tp->tp_args;
set_current_state(tp->tp_state);
set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
kfpu_initialize();
kmem_free(tp->tp_name, tp->tp_name_size);
kmem_free(tp, sizeof (thread_priv_t));
+32 -113
View File
@@ -25,8 +25,6 @@
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/sha2.h>
#include <sys/simd.h>
#include <sys/spa_impl.h>
#include <sys/hkdf.h>
#include <sys/qat.h>
@@ -376,7 +374,7 @@ error:
* plaintext / ciphertext alone.
*/
static int
zio_do_crypt_uio_impl(boolean_t encrypt, uint64_t crypt, crypto_key_t *key,
zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key,
crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen,
uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len)
{
@@ -476,75 +474,9 @@ error:
return (ret);
}
typedef struct crypt_uio_arg {
boolean_t cu_encrypt;
uint64_t cu_crypt;
crypto_key_t *cu_key;
crypto_ctx_template_t cu_tmpl;
uint8_t *cu_ivbuf;
uint_t cu_datalen;
uio_t *cu_puio;
uio_t *cu_cuio;
uint8_t *cu_authbuf;
uint_t cu_auth_len;
int cu_error;
} crypt_uio_arg_t;
static void
zio_do_crypt_uio_func(void *arg)
{
crypt_uio_arg_t *cu = (crypt_uio_arg_t *)arg;
cu->cu_error = zio_do_crypt_uio_impl(cu->cu_encrypt, cu->cu_crypt,
cu->cu_key, cu->cu_tmpl, cu->cu_ivbuf, cu->cu_datalen,
cu->cu_puio, cu->cu_cuio, cu->cu_authbuf, cu->cu_auth_len);
}
static int
zio_do_crypt_uio(spa_t *spa, boolean_t encrypt, uint64_t crypt,
crypto_key_t *key, crypto_ctx_template_t tmpl, uint8_t *ivbuf,
uint_t datalen, uio_t *puio, uio_t *cuio, uint8_t *authbuf,
uint_t auth_len)
{
int error;
/*
* Dispatch to the I/O pipeline as required by the context in order
* to take advantage of the SIMD optimization when available.
*/
if (kfpu_allowed()) {
error = zio_do_crypt_uio_impl(encrypt, crypt, key, tmpl,
ivbuf, datalen, puio, cuio, authbuf, auth_len);
} else {
crypt_uio_arg_t *cu;
cu = kmem_alloc(sizeof (*cu), KM_SLEEP);
cu->cu_encrypt = encrypt;
cu->cu_crypt = crypt;
cu->cu_key = key;
cu->cu_tmpl = tmpl;
cu->cu_ivbuf = ivbuf;
cu->cu_datalen = datalen;
cu->cu_puio = puio;
cu->cu_cuio = cuio;
cu->cu_authbuf = authbuf;
cu->cu_auth_len = auth_len;
cu->cu_error = 0;
spa_taskq_dispatch_sync(spa,
encrypt ? ZIO_TYPE_WRITE : ZIO_TYPE_READ,
ZIO_TASKQ_ISSUE, zio_do_crypt_uio_func, cu, TQ_SLEEP);
error = cu->cu_error;
kmem_free(cu, sizeof (*cu));
}
return (error);
}
int
zio_crypt_key_wrap(spa_t *spa, crypto_key_t *cwkey, zio_crypt_key_t *key,
uint8_t *iv, uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
{
int ret;
uio_t puio, cuio;
@@ -601,7 +533,7 @@ zio_crypt_key_wrap(spa_t *spa, crypto_key_t *cwkey, zio_crypt_key_t *key,
cuio.uio_segflg = UIO_SYSSPACE;
/* encrypt the keys and store the resulting ciphertext and mac */
ret = zio_do_crypt_uio(spa, B_TRUE, crypt, cwkey, NULL, iv, enc_len,
ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len,
&puio, &cuio, (uint8_t *)aad, aad_len);
if (ret != 0)
goto error;
@@ -612,33 +544,12 @@ error:
return (ret);
}
static void
zio_crypt_create_ctx_templates(void *arg)
{
zio_crypt_key_t *key = (zio_crypt_key_t *)arg;
crypto_mechanism_t mech;
int ret;
mech.cm_type = crypto_mech2id(
zio_crypt_table[key->zk_crypt].ci_mechname);
ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
&key->zk_current_tmpl, KM_SLEEP);
if (ret != CRYPTO_SUCCESS)
key->zk_current_tmpl = NULL;
mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
&key->zk_hmac_tmpl, KM_SLEEP);
if (ret != CRYPTO_SUCCESS)
key->zk_hmac_tmpl = NULL;
}
int
zio_crypt_key_unwrap(spa_t *spa, crypto_key_t *cwkey, uint64_t crypt,
uint64_t version, uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata,
uint8_t *iv, uint8_t *mac, zio_crypt_key_t *key)
zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
uint8_t *mac, zio_crypt_key_t *key)
{
crypto_mechanism_t mech;
uio_t puio, cuio;
uint64_t aad[3];
iovec_t plain_iovecs[2], cipher_iovecs[3];
@@ -685,7 +596,7 @@ zio_crypt_key_unwrap(spa_t *spa, crypto_key_t *cwkey, uint64_t crypt,
cuio.uio_segflg = UIO_SYSSPACE;
/* decrypt the keys and store the result in the output buffers */
ret = zio_do_crypt_uio(spa, B_FALSE, crypt, cwkey, NULL, iv, enc_len,
ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len,
&puio, &cuio, (uint8_t *)aad, aad_len);
if (ret != 0)
goto error;
@@ -711,18 +622,27 @@ zio_crypt_key_unwrap(spa_t *spa, crypto_key_t *cwkey, uint64_t crypt,
key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
/*
* Initialize the crypto templates. It's ok if this fails because
* this is just an optimization.
*/
mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
&key->zk_current_tmpl, KM_SLEEP);
if (ret != CRYPTO_SUCCESS)
key->zk_current_tmpl = NULL;
mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
&key->zk_hmac_tmpl, KM_SLEEP);
if (ret != CRYPTO_SUCCESS)
key->zk_hmac_tmpl = NULL;
key->zk_crypt = crypt;
key->zk_version = version;
key->zk_guid = guid;
key->zk_salt_count = 0;
/*
* Initialize the crypto templates in the context they will be
* primarily used. It's ok if this fails, it's just an optimization.
*/
spa_taskq_dispatch_sync(spa, ZIO_TYPE_READ, ZIO_TASKQ_ISSUE,
zio_crypt_create_ctx_templates, key, TQ_SLEEP);
return (0);
error:
@@ -1941,7 +1861,7 @@ error:
* Primary encryption / decryption entrypoint for zio data.
*/
int
zio_do_crypt_data(spa_t *spa, boolean_t encrypt, zio_crypt_key_t *key,
zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
boolean_t *no_crypt)
@@ -2028,8 +1948,8 @@ zio_do_crypt_data(spa_t *spa, boolean_t encrypt, zio_crypt_key_t *key,
goto error;
/* perform the encryption / decryption in software */
ret = zio_do_crypt_uio(spa, encrypt, key->zk_crypt, ckey, tmpl, iv,
enc_len, &puio, &cuio, authbuf, auth_len);
ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len,
&puio, &cuio, authbuf, auth_len);
if (ret != 0)
goto error;
@@ -2065,10 +1985,9 @@ error:
* linear buffers.
*/
int
zio_do_crypt_abd(spa_t *spa, boolean_t encrypt, zio_crypt_key_t *key,
dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd,
boolean_t *no_crypt)
zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
{
int ret;
void *ptmp, *ctmp;
@@ -2081,7 +2000,7 @@ zio_do_crypt_abd(spa_t *spa, boolean_t encrypt, zio_crypt_key_t *key,
ctmp = abd_borrow_buf_copy(cabd, datalen);
}
ret = zio_do_crypt_data(spa, encrypt, key, ot, byteswap, salt, iv, mac,
ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
datalen, ptmp, ctmp, no_crypt);
if (ret != 0)
goto error;
+4 -16
View File
@@ -726,7 +726,7 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
* Initialize and benchmark all supported implementations.
*/
static void
fletcher_4_benchmark(void *arg)
fletcher_4_benchmark(void)
{
fletcher_4_ops_t *curr_impl;
int i, c;
@@ -769,20 +769,10 @@ fletcher_4_benchmark(void *arg)
void
fletcher_4_init(void)
{
#if defined(_KERNEL)
/*
* For 5.0 and latter Linux kernels the fletcher 4 benchmarks are
* run in a kernel threads. This is needed to take advantage of the
* SIMD functionality, see linux/simd_x86.h for details.
*/
taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark,
NULL, TQ_SLEEP);
if (id != TASKQID_INVALID) {
taskq_wait_id(system_taskq, id);
} else {
fletcher_4_benchmark(NULL);
}
/* Determine the fastest available implementation. */
fletcher_4_benchmark();
#if defined(_KERNEL)
/* Install kstats for all implementations */
fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
@@ -795,8 +785,6 @@ fletcher_4_init(void)
fletcher_4_kstat_addr);
kstat_install(fletcher_4_kstat);
}
#else
fletcher_4_benchmark(NULL);
#endif
/* Finish initialization */
+14
View File
@@ -865,10 +865,23 @@ zfs_prop_align_right(zfs_prop_t prop)
#endif
#if defined(_KERNEL)
#include <sys/simd.h>
#if defined(HAVE_KERNEL_FPU_INTERNAL)
union fpregs_state **zfs_kfpu_fpregs;
EXPORT_SYMBOL(zfs_kfpu_fpregs);
#endif /* HAVE_KERNEL_FPU_INTERNAL */
static int __init
zcommon_init(void)
{
int error = kfpu_init();
if (error)
return (error);
fletcher_4_init();
return (0);
}
@@ -876,6 +889,7 @@ static void __exit
zcommon_fini(void)
{
fletcher_4_fini();
kfpu_fini();
}
module_init(zcommon_init);
+1 -1
View File
@@ -8136,7 +8136,7 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
if (ret != 0)
goto error;
ret = zio_do_crypt_abd(spa, B_TRUE, &dck->dck_key,
ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
&no_crypt);
+10 -10
View File
@@ -601,8 +601,8 @@ dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey,
* Unwrap the keys. If there is an error return EACCES to indicate
* an authentication failure.
*/
ret = zio_crypt_key_unwrap(mos->os_spa, &wkey->wk_key, crypt, version,
guid, raw_keydata, raw_hmac_keydata, iv, mac, &dck->dck_key);
ret = zio_crypt_key_unwrap(&wkey->wk_key, crypt, version, guid,
raw_keydata, raw_hmac_keydata, iv, mac, &dck->dck_key);
if (ret != 0) {
ret = SET_ERROR(EACCES);
goto error;
@@ -1221,7 +1221,6 @@ dsl_crypto_key_sync(dsl_crypto_key_t *dck, dmu_tx_t *tx)
{
zio_crypt_key_t *key = &dck->dck_key;
dsl_wrapping_key_t *wkey = dck->dck_wkey;
objset_t *mos = tx->tx_pool->dp_meta_objset;
uint8_t keydata[MASTER_KEY_MAX_LEN];
uint8_t hmac_keydata[SHA512_HMAC_KEYLEN];
uint8_t iv[WRAPPING_IV_LEN];
@@ -1231,13 +1230,14 @@ dsl_crypto_key_sync(dsl_crypto_key_t *dck, dmu_tx_t *tx)
ASSERT3U(key->zk_crypt, <, ZIO_CRYPT_FUNCTIONS);
/* encrypt and store the keys along with the IV and MAC */
VERIFY0(zio_crypt_key_wrap(mos->os_spa, &dck->dck_wkey->wk_key, key,
iv, mac, keydata, hmac_keydata));
VERIFY0(zio_crypt_key_wrap(&dck->dck_wkey->wk_key, key, iv, mac,
keydata, hmac_keydata));
/* update the ZAP with the obtained values */
dsl_crypto_key_sync_impl(mos, dck->dck_obj, key->zk_crypt,
wkey->wk_ddobj, key->zk_guid, iv, mac, keydata, hmac_keydata,
wkey->wk_keyformat, wkey->wk_salt, wkey->wk_iters, tx);
dsl_crypto_key_sync_impl(tx->tx_pool->dp_meta_objset, dck->dck_obj,
key->zk_crypt, wkey->wk_ddobj, key->zk_guid, iv, mac, keydata,
hmac_keydata, wkey->wk_keyformat, wkey->wk_salt, wkey->wk_iters,
tx);
}
typedef struct spa_keystore_change_key_args {
@@ -2846,8 +2846,8 @@ spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb,
}
/* call lower level function to perform encryption / decryption */
ret = zio_do_crypt_data(spa, encrypt, &dck->dck_key, ot, bswap, salt,
iv, mac, datalen, plainbuf, cipherbuf, no_crypt);
ret = zio_do_crypt_data(encrypt, &dck->dck_key, ot, bswap, salt, iv,
mac, datalen, plainbuf, cipherbuf, no_crypt);
/*
* Handle injected decryption faults. Unfortunately, we cannot inject
+4 -16
View File
@@ -445,7 +445,7 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
* Initialize and benchmark all supported implementations.
*/
static void
benchmark_raidz(void *arg)
benchmark_raidz(void)
{
raidz_impl_ops_t *curr_impl;
int i, c;
@@ -515,20 +515,10 @@ benchmark_raidz(void *arg)
void
vdev_raidz_math_init(void)
{
#if defined(_KERNEL)
/*
* For 5.0 and latter Linux kernels the fletcher 4 benchmarks are
* run in a kernel threads. This is needed to take advantage of the
* SIMD functionality, see include/linux/simd_x86.h for details.
*/
taskqid_t id = taskq_dispatch(system_taskq, benchmark_raidz,
NULL, TQ_SLEEP);
if (id != TASKQID_INVALID) {
taskq_wait_id(system_taskq, id);
} else {
benchmark_raidz(NULL);
}
/* Determine the fastest available implementation. */
benchmark_raidz();
#if defined(_KERNEL)
/* Install kstats for all implementations */
raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
@@ -541,8 +531,6 @@ vdev_raidz_math_init(void)
raidz_math_kstat_addr);
kstat_install(raidz_math_kstat);
}
#else
benchmark_raidz(NULL);
#endif
/* Finish initialization */