mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
zcommon: Refactor FPU state handling in fletcher4
Currently, calls to kfpu_begin() and kfpu_end() are split between the init() and fini() functions of the particular SIMD implementation. This was done in #14247 as an optimization measure for the ABD adapter. Unfortunately, the split complicates FPU handling on platforms that use a local FPU state buffer, like Windows and macOS.

To ease porting, we introduce a boolean struct member (uses_fpu) in fletcher_4_ops_t, indicating use of the FPU, and move the FPU state handling from the SIMD implementations to the call sites.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Attila Fülöp <attila@fueloep.org>
Closes #14600
This commit is contained in:
@@ -160,6 +160,7 @@ static const fletcher_4_ops_t fletcher_4_scalar_ops = {
|
||||
.fini_byteswap = fletcher_4_scalar_fini,
|
||||
.compute_byteswap = fletcher_4_scalar_byteswap,
|
||||
.valid = fletcher_4_scalar_valid,
|
||||
.uses_fpu = B_FALSE,
|
||||
.name = "scalar"
|
||||
};
|
||||
|
||||
@@ -458,9 +459,15 @@ fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
|
||||
fletcher_4_ctx_t ctx;
|
||||
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
|
||||
|
||||
if (ops->uses_fpu == B_TRUE) {
|
||||
kfpu_begin();
|
||||
}
|
||||
ops->init_native(&ctx);
|
||||
ops->compute_native(&ctx, buf, size);
|
||||
ops->fini_native(&ctx, zcp);
|
||||
if (ops->uses_fpu == B_TRUE) {
|
||||
kfpu_end();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -500,9 +507,15 @@ fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
|
||||
fletcher_4_ctx_t ctx;
|
||||
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
|
||||
|
||||
if (ops->uses_fpu == B_TRUE) {
|
||||
kfpu_begin();
|
||||
}
|
||||
ops->init_byteswap(&ctx);
|
||||
ops->compute_byteswap(&ctx, buf, size);
|
||||
ops->fini_byteswap(&ctx, zcp);
|
||||
if (ops->uses_fpu == B_TRUE) {
|
||||
kfpu_end();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -661,6 +674,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
|
||||
fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \
|
||||
fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \
|
||||
fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
|
||||
fletcher_4_fastest_impl.uses_fpu = src->uses_fpu; \
|
||||
}
|
||||
|
||||
#define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */
|
||||
@@ -816,10 +830,14 @@ abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
|
||||
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
|
||||
cdp->acd_private = (void *) ops;
|
||||
|
||||
if (ops->uses_fpu == B_TRUE) {
|
||||
kfpu_begin();
|
||||
}
|
||||
if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
|
||||
ops->init_native(cdp->acd_ctx);
|
||||
else
|
||||
ops->init_byteswap(cdp->acd_ctx);
|
||||
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -833,8 +851,13 @@ abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
|
||||
ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
|
||||
else
|
||||
ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
|
||||
|
||||
if (ops->uses_fpu == B_TRUE) {
|
||||
kfpu_end();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
|
||||
zio_abd_checksum_data_t *cdp)
|
||||
|
||||
@@ -52,7 +52,6 @@ ZFS_NO_SANITIZE_UNDEFINED
|
||||
static void
|
||||
fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx)
|
||||
{
|
||||
kfpu_begin();
|
||||
memset(ctx->aarch64_neon, 0, 4 * sizeof (zfs_fletcher_aarch64_neon_t));
|
||||
}
|
||||
|
||||
@@ -70,7 +69,6 @@ fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
|
||||
8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] +
|
||||
ctx->aarch64_neon[1].v[1];
|
||||
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
#define NEON_INIT_LOOP() \
|
||||
@@ -205,6 +203,7 @@ const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
|
||||
.compute_byteswap = fletcher_4_aarch64_neon_byteswap,
|
||||
.fini_byteswap = fletcher_4_aarch64_neon_fini,
|
||||
.valid = fletcher_4_aarch64_neon_valid,
|
||||
.uses_fpu = B_TRUE,
|
||||
.name = "aarch64_neon"
|
||||
};
|
||||
|
||||
|
||||
@@ -39,7 +39,6 @@ ZFS_NO_SANITIZE_UNDEFINED
|
||||
static void
|
||||
fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
|
||||
{
|
||||
kfpu_begin();
|
||||
memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t));
|
||||
}
|
||||
|
||||
@@ -73,7 +72,6 @@ fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
|
||||
}
|
||||
|
||||
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
#define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \
|
||||
@@ -166,6 +164,7 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
|
||||
.fini_byteswap = fletcher_4_avx512f_fini,
|
||||
.compute_byteswap = fletcher_4_avx512f_byteswap,
|
||||
.valid = fletcher_4_avx512f_valid,
|
||||
.uses_fpu = B_TRUE,
|
||||
.name = "avx512f"
|
||||
};
|
||||
|
||||
@@ -216,6 +215,7 @@ const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
|
||||
.fini_byteswap = fletcher_4_avx512f_fini,
|
||||
.compute_byteswap = fletcher_4_avx512bw_byteswap,
|
||||
.valid = fletcher_4_avx512bw_valid,
|
||||
.uses_fpu = B_TRUE,
|
||||
.name = "avx512bw"
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -51,7 +51,6 @@ ZFS_NO_SANITIZE_UNDEFINED
|
||||
static void
|
||||
fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
|
||||
{
|
||||
kfpu_begin();
|
||||
memset(ctx->avx, 0, 4 * sizeof (zfs_fletcher_avx_t));
|
||||
}
|
||||
|
||||
@@ -82,7 +81,6 @@ fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
|
||||
64 * ctx->avx[3].v[3];
|
||||
|
||||
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
#define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \
|
||||
@@ -163,6 +161,7 @@ const fletcher_4_ops_t fletcher_4_avx2_ops = {
|
||||
.fini_byteswap = fletcher_4_avx2_fini,
|
||||
.compute_byteswap = fletcher_4_avx2_byteswap,
|
||||
.valid = fletcher_4_avx2_valid,
|
||||
.uses_fpu = B_TRUE,
|
||||
.name = "avx2"
|
||||
};
|
||||
|
||||
|
||||
@@ -53,7 +53,6 @@ ZFS_NO_SANITIZE_UNDEFINED
|
||||
static void
|
||||
fletcher_4_sse2_init(fletcher_4_ctx_t *ctx)
|
||||
{
|
||||
kfpu_begin();
|
||||
memset(ctx->sse, 0, 4 * sizeof (zfs_fletcher_sse_t));
|
||||
}
|
||||
|
||||
@@ -81,7 +80,6 @@ fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
|
||||
8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];
|
||||
|
||||
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
#define FLETCHER_4_SSE_RESTORE_CTX(ctx) \
|
||||
@@ -164,6 +162,7 @@ const fletcher_4_ops_t fletcher_4_sse2_ops = {
|
||||
.fini_byteswap = fletcher_4_sse2_fini,
|
||||
.compute_byteswap = fletcher_4_sse2_byteswap,
|
||||
.valid = fletcher_4_sse2_valid,
|
||||
.uses_fpu = B_TRUE,
|
||||
.name = "sse2"
|
||||
};
|
||||
|
||||
@@ -218,6 +217,7 @@ const fletcher_4_ops_t fletcher_4_ssse3_ops = {
|
||||
.fini_byteswap = fletcher_4_sse2_fini,
|
||||
.compute_byteswap = fletcher_4_ssse3_byteswap,
|
||||
.valid = fletcher_4_ssse3_valid,
|
||||
.uses_fpu = B_TRUE,
|
||||
.name = "ssse3"
|
||||
};
|
||||
|
||||
|
||||
@@ -163,5 +163,6 @@ const fletcher_4_ops_t fletcher_4_superscalar_ops = {
|
||||
.compute_byteswap = fletcher_4_superscalar_byteswap,
|
||||
.fini_byteswap = fletcher_4_superscalar_fini,
|
||||
.valid = fletcher_4_superscalar_valid,
|
||||
.uses_fpu = B_FALSE,
|
||||
.name = "superscalar"
|
||||
};
|
||||
|
||||
@@ -229,5 +229,6 @@ const fletcher_4_ops_t fletcher_4_superscalar4_ops = {
|
||||
.compute_byteswap = fletcher_4_superscalar4_byteswap,
|
||||
.fini_byteswap = fletcher_4_superscalar4_fini,
|
||||
.valid = fletcher_4_superscalar4_valid,
|
||||
.uses_fpu = B_FALSE,
|
||||
.name = "superscalar4"
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user