mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-25 18:59:33 +03:00
Fletcher4: save/reload implementation context
The init, compute, and fini methods are changed to operate on an internal context object. This is necessary because the ABI does not guarantee that SIMD registers are preserved across function calls. In the Linux kernel the registers do happen to be preserved between `kfpu_begin()` and `kfpu_end()`, but relying on that breaks user-space tests and kernels that do not need to disable preemption in order to use SIMD (e.g. OS X). Scalar compute methods are now used in place for small buffers, and for the remainder of a buffer whose size is not aligned to the SIMD block size. Signed-off-by: Gvozden Neskovic <neskovic@gmail.com>
This commit is contained in:
parent
37f520db2d
commit
5bf703b8f3
@ -62,12 +62,43 @@ void fletcher_4_init(void);
|
||||
void fletcher_4_fini(void);
|
||||
|
||||
|
||||
|
||||
/* Internal fletcher ctx */
|
||||
|
||||
/*
 * Memory image of one 128-bit XMM register: two 64-bit lanes,
 * aligned to 16 bytes. Used for the saved SSE accumulators in
 * fletcher_4_ctx_t and for SSE shuffle masks.
 */
typedef struct zfs_fletcher_sse {
|
||||
uint64_t v[2] __attribute__((aligned(16)));
|
||||
} zfs_fletcher_sse_t;
|
||||
|
||||
/*
 * Memory image of one 256-bit YMM register: four 64-bit lanes,
 * aligned to 32 bytes. Used for the saved AVX2 accumulators in
 * fletcher_4_ctx_t and for the AVX2 byteswap mask.
 */
typedef struct zfs_fletcher_avx {
|
||||
uint64_t v[4] __attribute__((aligned(32)));
|
||||
} zfs_fletcher_avx_t;
|
||||
|
||||
/*
 * Memory image of one 512-bit ZMM register: eight 64-bit lanes,
 * aligned to 64 bytes. Used for the saved AVX512 accumulators in
 * fletcher_4_ctx_t.
 */
typedef struct zfs_fletcher_avx512 {
|
||||
uint64_t v[8] __attribute__((aligned(64)));
|
||||
} zfs_fletcher_avx512_t;
|
||||
|
||||
|
||||
/*
 * Working state of one Fletcher-4 computation. Which member is used
 * depends on the implementation operating on the context:
 *
 *   scalar  - the four running sums, kept directly as a zio_cksum_t;
 *   sse     - saved images of accumulator registers xmm0..xmm3;
 *   avx     - saved images of accumulator registers ymm0..ymm3;
 *   avx512  - saved images of accumulator registers zmm0..zmm3.
 *
 * The SIMD compute routines reload these images on entry and store
 * them back before returning; the matching fini routine reduces them
 * to the final zio_cksum_t.
 */
typedef union fletcher_4_ctx {
|
||||
zio_cksum_t scalar;
|
||||
|
||||
/*
 * NOTE(review): the parenthesised operand already requires HAVE_SSE2,
 * so this condition is equivalent to plain defined(HAVE_SSE2).
 */
#if defined(HAVE_SSE2) || (defined(HAVE_SSE2) && defined(HAVE_SSSE3))
|
||||
zfs_fletcher_sse_t sse[4];
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
|
||||
zfs_fletcher_avx_t avx[4];
|
||||
#endif
|
||||
#if defined(__x86_64) && defined(HAVE_AVX512F)
|
||||
zfs_fletcher_avx512_t avx512[4];
|
||||
#endif
|
||||
} fletcher_4_ctx_t;
|
||||
|
||||
/*
|
||||
* fletcher checksum struct
|
||||
*/
|
||||
typedef void (*fletcher_4_init_f)(zio_cksum_t *);
|
||||
typedef void (*fletcher_4_fini_f)(zio_cksum_t *);
|
||||
typedef void (*fletcher_4_compute_f)(const void *, uint64_t, zio_cksum_t *);
|
||||
/* Reset the context before a new checksum computation. */
typedef void (*fletcher_4_init_f)(fletcher_4_ctx_t *);
|
||||
/* Reduce the context to the final checksum, stored through the zio_cksum_t pointer. */
typedef void (*fletcher_4_fini_f)(fletcher_4_ctx_t *, zio_cksum_t *);
|
||||
/* Fold a buffer (pointer, then size in bytes) into the context. */
typedef void (*fletcher_4_compute_f)(fletcher_4_ctx_t *,
|
||||
const void *, uint64_t);
|
||||
|
||||
typedef struct fletcher_4_func {
|
||||
fletcher_4_init_f init_native;
|
||||
@ -80,6 +111,7 @@ typedef struct fletcher_4_func {
|
||||
const char *name;
|
||||
} fletcher_4_ops_t;
|
||||
|
||||
|
||||
#if defined(HAVE_SSE2)
|
||||
extern const fletcher_4_ops_t fletcher_4_sse2_ops;
|
||||
#endif
|
||||
|
@ -138,17 +138,20 @@
|
||||
#include <zfs_fletcher.h>
|
||||
|
||||
|
||||
static void fletcher_4_scalar_init(zio_cksum_t *zcp);
|
||||
static void fletcher_4_scalar_native(const void *buf, uint64_t size,
|
||||
zio_cksum_t *zcp);
|
||||
static void fletcher_4_scalar_byteswap(const void *buf, uint64_t size,
|
||||
zio_cksum_t *zcp);
|
||||
static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
|
||||
static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
|
||||
static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
|
||||
const void *buf, uint64_t size);
|
||||
static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
|
||||
const void *buf, uint64_t size);
|
||||
static boolean_t fletcher_4_scalar_valid(void);
|
||||
|
||||
static const fletcher_4_ops_t fletcher_4_scalar_ops = {
|
||||
.init_native = fletcher_4_scalar_init,
|
||||
.fini_native = fletcher_4_scalar_fini,
|
||||
.compute_native = fletcher_4_scalar_native,
|
||||
.init_byteswap = fletcher_4_scalar_init,
|
||||
.fini_byteswap = fletcher_4_scalar_fini,
|
||||
.compute_byteswap = fletcher_4_scalar_byteswap,
|
||||
.valid = fletcher_4_scalar_valid,
|
||||
.name = "scalar"
|
||||
@ -248,22 +251,29 @@ fletcher_2_byteswap(const void *buf, uint64_t size,
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_scalar_init(zio_cksum_t *zcp)
|
||||
fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
|
||||
{
|
||||
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
|
||||
ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_scalar_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
|
||||
fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
|
||||
{
|
||||
memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,
|
||||
uint64_t size)
|
||||
{
|
||||
const uint32_t *ip = buf;
|
||||
const uint32_t *ipend = ip + (size / sizeof (uint32_t));
|
||||
uint64_t a, b, c, d;
|
||||
|
||||
a = zcp->zc_word[0];
|
||||
b = zcp->zc_word[1];
|
||||
c = zcp->zc_word[2];
|
||||
d = zcp->zc_word[3];
|
||||
a = ctx->scalar.zc_word[0];
|
||||
b = ctx->scalar.zc_word[1];
|
||||
c = ctx->scalar.zc_word[2];
|
||||
d = ctx->scalar.zc_word[3];
|
||||
|
||||
for (; ip < ipend; ip++) {
|
||||
a += ip[0];
|
||||
@ -272,20 +282,21 @@ fletcher_4_scalar_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
|
||||
d += c;
|
||||
}
|
||||
|
||||
ZIO_SET_CHECKSUM(zcp, a, b, c, d);
|
||||
ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
|
||||
fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
|
||||
uint64_t size)
|
||||
{
|
||||
const uint32_t *ip = buf;
|
||||
const uint32_t *ipend = ip + (size / sizeof (uint32_t));
|
||||
uint64_t a, b, c, d;
|
||||
|
||||
a = zcp->zc_word[0];
|
||||
b = zcp->zc_word[1];
|
||||
c = zcp->zc_word[2];
|
||||
d = zcp->zc_word[3];
|
||||
a = ctx->scalar.zc_word[0];
|
||||
b = ctx->scalar.zc_word[1];
|
||||
c = ctx->scalar.zc_word[2];
|
||||
d = ctx->scalar.zc_word[3];
|
||||
|
||||
for (; ip < ipend; ip++) {
|
||||
a += BSWAP_32(ip[0]);
|
||||
@ -294,7 +305,7 @@ fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
|
||||
d += c;
|
||||
}
|
||||
|
||||
ZIO_SET_CHECKSUM(zcp, a, b, c, d);
|
||||
ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
@ -384,13 +395,14 @@ fletcher_4_impl_get(void)
|
||||
}
|
||||
|
||||
static inline void
|
||||
fletcher_4_native_impl(const fletcher_4_ops_t *ops, const void *buf,
|
||||
uint64_t size, zio_cksum_t *zcp)
|
||||
fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
|
||||
{
|
||||
ops->init_native(zcp);
|
||||
ops->compute_native(buf, size, zcp);
|
||||
if (ops->fini_native != NULL)
|
||||
ops->fini_native(zcp);
|
||||
fletcher_4_ctx_t ctx;
|
||||
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
|
||||
|
||||
ops->init_native(&ctx);
|
||||
ops->compute_native(&ctx, buf, size);
|
||||
ops->fini_native(&ctx, zcp);
|
||||
}
|
||||
|
||||
/*ARGSUSED*/
|
||||
@ -398,40 +410,41 @@ void
|
||||
fletcher_4_native(const void *buf, uint64_t size,
|
||||
const void *ctx_template, zio_cksum_t *zcp)
|
||||
{
|
||||
const fletcher_4_ops_t *ops;
|
||||
uint64_t p2size = P2ALIGN(size, 64);
|
||||
const uint64_t p2size = P2ALIGN(size, 64);
|
||||
|
||||
ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
|
||||
|
||||
if (size == 0) {
|
||||
if (size == 0 || p2size == 0) {
|
||||
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
|
||||
} else if (p2size == 0) {
|
||||
ops = &fletcher_4_scalar_ops;
|
||||
fletcher_4_native_impl(ops, buf, size, zcp);
|
||||
|
||||
if (size > 0)
|
||||
fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
|
||||
buf, size);
|
||||
} else {
|
||||
ops = fletcher_4_impl_get();
|
||||
fletcher_4_native_impl(ops, buf, p2size, zcp);
|
||||
fletcher_4_native_impl(buf, p2size, zcp);
|
||||
|
||||
if (p2size < size)
|
||||
fletcher_4_incremental_native((char *)buf + p2size,
|
||||
size - p2size, zcp);
|
||||
fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
|
||||
(char *)buf + p2size, size - p2size);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
|
||||
{
|
||||
fletcher_4_native_impl(&fletcher_4_scalar_ops, buf, size, zcp);
|
||||
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
|
||||
fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
|
||||
}
|
||||
|
||||
static inline void
|
||||
fletcher_4_byteswap_impl(const fletcher_4_ops_t *ops, const void *buf,
|
||||
uint64_t size, zio_cksum_t *zcp)
|
||||
fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
|
||||
{
|
||||
ops->init_byteswap(zcp);
|
||||
ops->compute_byteswap(buf, size, zcp);
|
||||
if (ops->fini_byteswap != NULL)
|
||||
ops->fini_byteswap(zcp);
|
||||
fletcher_4_ctx_t ctx;
|
||||
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
|
||||
|
||||
ops->init_byteswap(&ctx);
|
||||
ops->compute_byteswap(&ctx, buf, size);
|
||||
ops->fini_byteswap(&ctx, zcp);
|
||||
}
|
||||
|
||||
/*ARGSUSED*/
|
||||
@ -439,28 +452,29 @@ void
|
||||
fletcher_4_byteswap(const void *buf, uint64_t size,
|
||||
const void *ctx_template, zio_cksum_t *zcp)
|
||||
{
|
||||
const fletcher_4_ops_t *ops;
|
||||
uint64_t p2size = P2ALIGN(size, 64);
|
||||
const uint64_t p2size = P2ALIGN(size, 64);
|
||||
|
||||
ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
|
||||
|
||||
if (size == 0) {
|
||||
if (size == 0 || p2size == 0) {
|
||||
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
|
||||
} else if (p2size == 0) {
|
||||
ops = &fletcher_4_scalar_ops;
|
||||
fletcher_4_byteswap_impl(ops, buf, size, zcp);
|
||||
|
||||
if (size > 0)
|
||||
fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
|
||||
buf, size);
|
||||
} else {
|
||||
ops = fletcher_4_impl_get();
|
||||
fletcher_4_byteswap_impl(ops, buf, p2size, zcp);
|
||||
fletcher_4_byteswap_impl(buf, p2size, zcp);
|
||||
|
||||
if (p2size < size)
|
||||
fletcher_4_incremental_byteswap((char *)buf + p2size,
|
||||
size - p2size, zcp);
|
||||
fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
|
||||
(char *)buf + p2size, size - p2size);
|
||||
}
|
||||
}
|
||||
|
||||
/* Incremental Fletcher 4 */
|
||||
|
||||
/*
 * Upper bound (8 MiB) on the chunk size processed per step of the
 * incremental Fletcher-4 computation.
 */
#define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20)
|
||||
|
||||
static inline void
|
||||
fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
|
||||
const zio_cksum_t *nzcp)
|
||||
@ -469,6 +483,13 @@ fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
|
||||
const uint64_t c2 = c1 * (c1 + 1) / 2;
|
||||
const uint64_t c3 = c2 * (c1 + 2) / 3;
|
||||
|
||||
/*
|
||||
* Value of 'c3' overflows on buffer sizes close to 16MiB. For that
|
||||
* reason we split incremental fletcher4 computation of large buffers
|
||||
* to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size.
|
||||
*/
|
||||
ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);
|
||||
|
||||
zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
|
||||
c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
|
||||
zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
|
||||
@ -481,13 +502,9 @@ static inline void
|
||||
fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
|
||||
zio_cksum_t *zcp)
|
||||
{
|
||||
static const uint64_t FLETCHER_4_INC_MAX = 8ULL << 20;
|
||||
uint64_t len;
|
||||
|
||||
while (size > 0) {
|
||||
zio_cksum_t nzc;
|
||||
|
||||
len = MIN(size, FLETCHER_4_INC_MAX);
|
||||
uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);
|
||||
|
||||
if (native)
|
||||
fletcher_4_native(buf, len, NULL, &nzc);
|
||||
@ -504,14 +521,22 @@ fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
|
||||
void
|
||||
fletcher_4_incremental_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
|
||||
{
|
||||
fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
|
||||
/* Use scalar impl to directly update cksum of small blocks */
|
||||
if (size < SPA_MINBLOCKSIZE)
|
||||
fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
|
||||
else
|
||||
fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
|
||||
}
|
||||
|
||||
void
|
||||
fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
|
||||
zio_cksum_t *zcp)
|
||||
{
|
||||
fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
|
||||
/* Use scalar impl to directly update cksum of small blocks */
|
||||
if (size < SPA_MINBLOCKSIZE)
|
||||
fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
|
||||
else
|
||||
fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
|
||||
}
|
||||
|
||||
|
||||
@ -662,9 +687,6 @@ fletcher_4_init(void)
|
||||
membar_producer();
|
||||
|
||||
fletcher_4_initialized = B_TRUE;
|
||||
|
||||
/* Use 'cycle' math selection method for userspace */
|
||||
VERIFY0(fletcher_4_impl_set("cycle"));
|
||||
return;
|
||||
#endif
|
||||
/* Benchmark all supported implementations */
|
||||
|
@ -28,31 +28,73 @@
|
||||
#include <sys/byteorder.h>
|
||||
#include <sys/spa_checksum.h>
|
||||
#include <zfs_fletcher.h>
|
||||
#include <strings.h>
|
||||
|
||||
#define __asm __asm__ __volatile__
|
||||
|
||||
typedef struct {
|
||||
uint64_t v[8] __attribute__((aligned(64)));
|
||||
} zfs_avx512_t;
|
||||
|
||||
static void
|
||||
fletcher_4_avx512f_init(zio_cksum_t *zcp)
|
||||
fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
|
||||
{
|
||||
kfpu_begin();
|
||||
|
||||
/* clear registers */
|
||||
__asm("vpxorq %zmm0, %zmm0, %zmm0");
|
||||
__asm("vpxorq %zmm1, %zmm1, %zmm1");
|
||||
__asm("vpxorq %zmm2, %zmm2, %zmm2");
|
||||
__asm("vpxorq %zmm3, %zmm3, %zmm3");
|
||||
bzero(ctx->avx512, 4 * sizeof (zfs_fletcher_avx512_t));
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_avx512f_native(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
|
||||
{
|
||||
static const uint64_t
|
||||
CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 },
|
||||
CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 },
|
||||
DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 },
|
||||
DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 },
|
||||
DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 };
|
||||
|
||||
uint64_t A, B, C, D;
|
||||
uint64_t i;
|
||||
|
||||
A = ctx->avx512[0].v[0];
|
||||
B = 8 * ctx->avx512[1].v[0];
|
||||
C = 64 * ctx->avx512[2].v[0] - CcB[0] * ctx->avx512[1].v[0];
|
||||
D = 512 * ctx->avx512[3].v[0] - DcC[0] * ctx->avx512[2].v[0] +
|
||||
DcB[0] * ctx->avx512[1].v[0];
|
||||
|
||||
for (i = 1; i < 8; i++) {
|
||||
A += ctx->avx512[0].v[i];
|
||||
B += 8 * ctx->avx512[1].v[i] - i * ctx->avx512[0].v[i];
|
||||
C += 64 * ctx->avx512[2].v[i] - CcB[i] * ctx->avx512[1].v[i] +
|
||||
CcA[i] * ctx->avx512[0].v[i];
|
||||
D += 512 * ctx->avx512[3].v[i] - DcC[i] * ctx->avx512[2].v[i] +
|
||||
DcB[i] * ctx->avx512[1].v[i] - DcA[i] * ctx->avx512[0].v[i];
|
||||
}
|
||||
|
||||
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
|
||||
}
|
||||
|
||||
/*
 * Load the four saved accumulators ctx->avx512[0..3] into registers
 * zmm0..zmm3 using unaligned 64-byte moves.
 *
 * 'ctx' is a pointer to a fletcher_4_ctx_t and is evaluated four times.
 * The macro expands to a bare brace block, not do { } while (0): a
 * trailing semicolon at the call site produces an extra empty statement,
 * so do not use it as the unbraced body of an if that has an else.
 */
#define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \
|
||||
{ \
|
||||
__asm("vmovdqu64 %0, %%zmm0" :: "m" ((ctx)->avx512[0])); \
|
||||
__asm("vmovdqu64 %0, %%zmm1" :: "m" ((ctx)->avx512[1])); \
|
||||
__asm("vmovdqu64 %0, %%zmm2" :: "m" ((ctx)->avx512[2])); \
|
||||
__asm("vmovdqu64 %0, %%zmm3" :: "m" ((ctx)->avx512[3])); \
|
||||
}
|
||||
|
||||
/*
 * Store registers zmm0..zmm3 into ctx->avx512[0..3] using unaligned
 * 64-byte moves, so the accumulator state survives after the compute
 * routine returns.
 *
 * 'ctx' is a pointer to a fletcher_4_ctx_t and is evaluated four times.
 * The macro expands to a bare brace block, not do { } while (0): a
 * trailing semicolon at the call site produces an extra empty statement,
 * so do not use it as the unbraced body of an if that has an else.
 */
#define FLETCHER_4_AVX512_SAVE_CTX(ctx) \
|
||||
{ \
|
||||
__asm("vmovdqu64 %%zmm0, %0" : "=m" ((ctx)->avx512[0])); \
|
||||
__asm("vmovdqu64 %%zmm1, %0" : "=m" ((ctx)->avx512[1])); \
|
||||
__asm("vmovdqu64 %%zmm2, %0" : "=m" ((ctx)->avx512[2])); \
|
||||
__asm("vmovdqu64 %%zmm3, %0" : "=m" ((ctx)->avx512[3])); \
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_avx512f_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
|
||||
{
|
||||
const uint32_t *ip = buf;
|
||||
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
|
||||
|
||||
kfpu_begin();
|
||||
|
||||
FLETCHER_4_AVX512_RESTORE_CTX(ctx);
|
||||
|
||||
for (; ip < ipend; ip += 8) {
|
||||
__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
|
||||
__asm("vpaddq %zmm4, %zmm0, %zmm0");
|
||||
@ -60,15 +102,24 @@ fletcher_4_avx512f_native(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
__asm("vpaddq %zmm1, %zmm2, %zmm2");
|
||||
__asm("vpaddq %zmm2, %zmm3, %zmm3");
|
||||
}
|
||||
|
||||
FLETCHER_4_AVX512_SAVE_CTX(ctx);
|
||||
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_avx512f_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
fletcher_4_avx512f_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
|
||||
uint64_t size)
|
||||
{
|
||||
static const uint64_t byteswap_mask = 0xFFULL;
|
||||
const uint32_t *ip = buf;
|
||||
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
|
||||
|
||||
kfpu_begin();
|
||||
|
||||
FLETCHER_4_AVX512_RESTORE_CTX(ctx);
|
||||
|
||||
__asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask));
|
||||
__asm("vpsllq $8, %zmm8, %zmm9");
|
||||
__asm("vpsllq $16, %zmm8, %zmm10");
|
||||
@ -94,49 +145,10 @@ fletcher_4_avx512f_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
__asm("vpaddq %zmm1, %zmm2, %zmm2");
|
||||
__asm("vpaddq %zmm2, %zmm3, %zmm3");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_avx512f_fini(zio_cksum_t *zcp)
|
||||
{
|
||||
static const uint64_t
|
||||
CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 },
|
||||
CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 },
|
||||
DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 },
|
||||
DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 },
|
||||
DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 };
|
||||
|
||||
zfs_avx512_t a, b, c, b8, c64, d512;
|
||||
uint64_t A, B, C, D;
|
||||
uint64_t i;
|
||||
|
||||
__asm("vmovdqu64 %%zmm0, %0":"=m" (a));
|
||||
__asm("vmovdqu64 %%zmm1, %0":"=m" (b));
|
||||
__asm("vmovdqu64 %%zmm2, %0":"=m" (c));
|
||||
__asm("vpsllq $3, %zmm1, %zmm1");
|
||||
__asm("vpsllq $6, %zmm2, %zmm2");
|
||||
__asm("vpsllq $9, %zmm3, %zmm3");
|
||||
|
||||
__asm("vmovdqu64 %%zmm1, %0":"=m" (b8));
|
||||
__asm("vmovdqu64 %%zmm2, %0":"=m" (c64));
|
||||
__asm("vmovdqu64 %%zmm3, %0":"=m" (d512));
|
||||
FLETCHER_4_AVX512_SAVE_CTX(ctx)
|
||||
|
||||
kfpu_end();
|
||||
|
||||
A = a.v[0];
|
||||
B = b8.v[0];
|
||||
C = c64.v[0] - CcB[0] * b.v[0];
|
||||
D = d512.v[0] - DcC[0] * c.v[0] + DcB[0] * b.v[0];
|
||||
|
||||
for (i = 1; i < 8; i++) {
|
||||
A += a.v[i];
|
||||
B += b8.v[i] - i * a.v[i];
|
||||
C += c64.v[i] - CcB[i] * b.v[i] + CcA[i] * a.v[i];
|
||||
D += d512.v[i] - DcC[i] * c.v[i] + DcB[i] * b.v[i] -
|
||||
DcA[i] * a.v[i];
|
||||
}
|
||||
|
||||
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
|
@ -45,58 +45,69 @@
|
||||
#include <linux/simd_x86.h>
|
||||
#include <sys/spa_checksum.h>
|
||||
#include <zfs_fletcher.h>
|
||||
#include <strings.h>
|
||||
|
||||
static void
|
||||
fletcher_4_avx2_init(zio_cksum_t *zcp)
|
||||
fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
|
||||
{
|
||||
kfpu_begin();
|
||||
|
||||
/* clear avx2 registers */
|
||||
asm volatile("vpxor %ymm0, %ymm0, %ymm0");
|
||||
asm volatile("vpxor %ymm1, %ymm1, %ymm1");
|
||||
asm volatile("vpxor %ymm2, %ymm2, %ymm2");
|
||||
asm volatile("vpxor %ymm3, %ymm3, %ymm3");
|
||||
bzero(ctx->avx, 4 * sizeof (zfs_fletcher_avx_t));
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_avx2_fini(zio_cksum_t *zcp)
|
||||
fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
|
||||
{
|
||||
uint64_t __attribute__((aligned(32))) a[4];
|
||||
uint64_t __attribute__((aligned(32))) b[4];
|
||||
uint64_t __attribute__((aligned(32))) c[4];
|
||||
uint64_t __attribute__((aligned(32))) d[4];
|
||||
uint64_t A, B, C, D;
|
||||
|
||||
asm volatile("vmovdqu %%ymm0, %0":"=m" (a));
|
||||
asm volatile("vmovdqu %%ymm1, %0":"=m" (b));
|
||||
asm volatile("vmovdqu %%ymm2, %0":"=m" (c));
|
||||
asm volatile("vmovdqu %%ymm3, %0":"=m" (d));
|
||||
asm volatile("vzeroupper");
|
||||
A = ctx->avx[0].v[0] + ctx->avx[0].v[1] +
|
||||
ctx->avx[0].v[2] + ctx->avx[0].v[3];
|
||||
B = 0 - ctx->avx[0].v[1] - 2 * ctx->avx[0].v[2] - 3 * ctx->avx[0].v[3] +
|
||||
4 * ctx->avx[1].v[0] + 4 * ctx->avx[1].v[1] + 4 * ctx->avx[1].v[2] +
|
||||
4 * ctx->avx[1].v[3];
|
||||
|
||||
kfpu_end();
|
||||
C = ctx->avx[0].v[2] + 3 * ctx->avx[0].v[3] - 6 * ctx->avx[1].v[0] -
|
||||
10 * ctx->avx[1].v[1] - 14 * ctx->avx[1].v[2] -
|
||||
18 * ctx->avx[1].v[3] + 16 * ctx->avx[2].v[0] +
|
||||
16 * ctx->avx[2].v[1] + 16 * ctx->avx[2].v[2] +
|
||||
16 * ctx->avx[2].v[3];
|
||||
|
||||
A = a[0] + a[1] + a[2] + a[3];
|
||||
B = 0 - a[1] - 2*a[2] - 3*a[3]
|
||||
+ 4*b[0] + 4*b[1] + 4*b[2] + 4*b[3];
|
||||
|
||||
C = a[2] + 3*a[3]
|
||||
- 6*b[0] - 10*b[1] - 14*b[2] - 18*b[3]
|
||||
+ 16*c[0] + 16*c[1] + 16*c[2] + 16*c[3];
|
||||
|
||||
D = 0 - a[3]
|
||||
+ 4*b[0] + 10*b[1] + 20*b[2] + 34*b[3]
|
||||
- 48*c[0] - 64*c[1] - 80*c[2] - 96*c[3]
|
||||
+ 64*d[0] + 64*d[1] + 64*d[2] + 64*d[3];
|
||||
D = 0 - ctx->avx[0].v[3] + 4 * ctx->avx[1].v[0] +
|
||||
10 * ctx->avx[1].v[1] + 20 * ctx->avx[1].v[2] +
|
||||
34 * ctx->avx[1].v[3] - 48 * ctx->avx[2].v[0] -
|
||||
64 * ctx->avx[2].v[1] - 80 * ctx->avx[2].v[2] -
|
||||
96 * ctx->avx[2].v[3] + 64 * ctx->avx[3].v[0] +
|
||||
64 * ctx->avx[3].v[1] + 64 * ctx->avx[3].v[2] +
|
||||
64 * ctx->avx[3].v[3];
|
||||
|
||||
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
|
||||
}
|
||||
|
||||
/*
 * Load the four saved accumulators ctx->avx[0..3] into registers
 * ymm0..ymm3 using unaligned 32-byte moves.
 *
 * 'ctx' is a pointer to a fletcher_4_ctx_t and is evaluated four times.
 * The macro expands to a bare brace block, not do { } while (0): a
 * trailing semicolon at the call site produces an extra empty statement,
 * so do not use it as the unbraced body of an if that has an else.
 */
#define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \
|
||||
{ \
|
||||
asm volatile("vmovdqu %0, %%ymm0" :: "m" ((ctx)->avx[0])); \
|
||||
asm volatile("vmovdqu %0, %%ymm1" :: "m" ((ctx)->avx[1])); \
|
||||
asm volatile("vmovdqu %0, %%ymm2" :: "m" ((ctx)->avx[2])); \
|
||||
asm volatile("vmovdqu %0, %%ymm3" :: "m" ((ctx)->avx[3])); \
|
||||
}
|
||||
|
||||
/*
 * Store registers ymm0..ymm3 into ctx->avx[0..3] using unaligned
 * 32-byte moves, so the accumulator state survives after the compute
 * routine returns.
 *
 * 'ctx' is a pointer to a fletcher_4_ctx_t and is evaluated four times.
 * The macro expands to a bare brace block, not do { } while (0): a
 * trailing semicolon at the call site produces an extra empty statement,
 * so do not use it as the unbraced body of an if that has an else.
 */
#define FLETCHER_4_AVX2_SAVE_CTX(ctx) \
|
||||
{ \
|
||||
asm volatile("vmovdqu %%ymm0, %0" : "=m" ((ctx)->avx[0])); \
|
||||
asm volatile("vmovdqu %%ymm1, %0" : "=m" ((ctx)->avx[1])); \
|
||||
asm volatile("vmovdqu %%ymm2, %0" : "=m" ((ctx)->avx[2])); \
|
||||
asm volatile("vmovdqu %%ymm3, %0" : "=m" ((ctx)->avx[3])); \
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
fletcher_4_avx2_native(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
fletcher_4_avx2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
|
||||
{
|
||||
const uint64_t *ip = buf;
|
||||
const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
|
||||
|
||||
kfpu_begin();
|
||||
|
||||
FLETCHER_4_AVX2_RESTORE_CTX(ctx);
|
||||
|
||||
for (; ip < ipend; ip += 2) {
|
||||
asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip));
|
||||
asm volatile("vpaddq %ymm4, %ymm0, %ymm0");
|
||||
@ -104,21 +115,28 @@ fletcher_4_avx2_native(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
asm volatile("vpaddq %ymm1, %ymm2, %ymm2");
|
||||
asm volatile("vpaddq %ymm2, %ymm3, %ymm3");
|
||||
}
|
||||
|
||||
FLETCHER_4_AVX2_SAVE_CTX(ctx);
|
||||
asm volatile("vzeroupper");
|
||||
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_avx2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
|
||||
{
|
||||
static const struct {
|
||||
uint64_t v[4] __attribute__((aligned(32)));
|
||||
} mask = {
|
||||
static const zfs_fletcher_avx_t mask = {
|
||||
.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
|
||||
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
|
||||
};
|
||||
const uint64_t *ip = buf;
|
||||
const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
|
||||
|
||||
asm volatile("vmovdqa %0, %%ymm5"::"m"(mask));
|
||||
kfpu_begin();
|
||||
|
||||
FLETCHER_4_AVX2_RESTORE_CTX(ctx);
|
||||
|
||||
asm volatile("vmovdqu %0, %%ymm5" :: "m" (mask));
|
||||
|
||||
for (; ip < ipend; ip += 2) {
|
||||
asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip));
|
||||
@ -129,6 +147,11 @@ fletcher_4_avx2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
asm volatile("vpaddq %ymm1, %ymm2, %ymm2");
|
||||
asm volatile("vpaddq %ymm2, %ymm3, %ymm3");
|
||||
}
|
||||
|
||||
FLETCHER_4_AVX2_SAVE_CTX(ctx);
|
||||
asm volatile("vzeroupper");
|
||||
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
static boolean_t fletcher_4_avx2_valid(void)
|
||||
|
@ -45,39 +45,19 @@
|
||||
|
||||
#include <linux/simd_x86.h>
|
||||
#include <sys/spa_checksum.h>
|
||||
#include <sys/byteorder.h>
|
||||
#include <zfs_fletcher.h>
|
||||
|
||||
struct zfs_fletcher_sse_array {
|
||||
uint64_t v[2] __attribute__((aligned(16)));
|
||||
};
|
||||
#include <strings.h>
|
||||
|
||||
static void
|
||||
fletcher_4_sse2_init(zio_cksum_t *zcp)
|
||||
{
|
||||
kfpu_begin();
|
||||
|
||||
/* clear sse registers */
|
||||
asm volatile("pxor %xmm0, %xmm0");
|
||||
asm volatile("pxor %xmm1, %xmm1");
|
||||
asm volatile("pxor %xmm2, %xmm2");
|
||||
asm volatile("pxor %xmm3, %xmm3");
|
||||
fletcher_4_sse2_init(fletcher_4_ctx_t *ctx) {
|
||||
bzero(ctx->sse, 4 * sizeof (zfs_fletcher_sse_t));
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_sse2_fini(zio_cksum_t *zcp)
|
||||
{
|
||||
struct zfs_fletcher_sse_array a, b, c, d;
|
||||
fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) {
|
||||
uint64_t A, B, C, D;
|
||||
|
||||
asm volatile("movdqu %%xmm0, %0":"=m" (a.v));
|
||||
asm volatile("movdqu %%xmm1, %0":"=m" (b.v));
|
||||
asm volatile("psllq $0x2, %xmm2");
|
||||
asm volatile("movdqu %%xmm2, %0":"=m" (c.v));
|
||||
asm volatile("psllq $0x3, %xmm3");
|
||||
asm volatile("movdqu %%xmm3, %0":"=m" (d.v));
|
||||
|
||||
kfpu_end();
|
||||
|
||||
/*
|
||||
* The mixing matrix for checksum calculation is:
|
||||
* a = a0 + a1
|
||||
@ -88,20 +68,42 @@ fletcher_4_sse2_fini(zio_cksum_t *zcp)
|
||||
* c and d are multiplied by 4 and 8, respectively,
|
||||
* when the saved vector lanes are combined below.
|
||||
*/
|
||||
A = a.v[0] + a.v[1];
|
||||
B = 2*b.v[0] + 2*b.v[1] - a.v[1];
|
||||
C = c.v[0] - b.v[0] + c.v[1] - 3*b.v[1];
|
||||
D = d.v[0] - c.v[0] + d.v[1] - 2*c.v[1] + b.v[1];
|
||||
A = ctx->sse[0].v[0] + ctx->sse[0].v[1];
|
||||
B = 2 * ctx->sse[1].v[0] + 2 * ctx->sse[1].v[1] - ctx->sse[0].v[1];
|
||||
C = 4 * ctx->sse[2].v[0] - ctx->sse[1].v[0] + 4 * ctx->sse[2].v[1] -
|
||||
3 * ctx->sse[1].v[1];
|
||||
D = 8 * ctx->sse[3].v[0] - 4 * ctx->sse[2].v[0] + 8 * ctx->sse[3].v[1] -
|
||||
8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];
|
||||
|
||||
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
|
||||
}
|
||||
|
||||
/*
 * Load the four saved accumulators ctx->sse[0..3] into registers
 * xmm0..xmm3 using unaligned 16-byte moves.
 *
 * 'ctx' is a pointer to a fletcher_4_ctx_t and is evaluated four times.
 * The macro expands to a bare brace block, not do { } while (0): a
 * trailing semicolon at the call site produces an extra empty statement,
 * so do not use it as the unbraced body of an if that has an else.
 */
#define FLETCHER_4_SSE_RESTORE_CTX(ctx) \
|
||||
{ \
|
||||
asm volatile("movdqu %0, %%xmm0" :: "m" ((ctx)->sse[0])); \
|
||||
asm volatile("movdqu %0, %%xmm1" :: "m" ((ctx)->sse[1])); \
|
||||
asm volatile("movdqu %0, %%xmm2" :: "m" ((ctx)->sse[2])); \
|
||||
asm volatile("movdqu %0, %%xmm3" :: "m" ((ctx)->sse[3])); \
|
||||
}
|
||||
|
||||
/*
 * Store registers xmm0..xmm3 into ctx->sse[0..3] using unaligned
 * 16-byte moves, so the accumulator state survives after the compute
 * routine returns.
 *
 * 'ctx' is a pointer to a fletcher_4_ctx_t and is evaluated four times.
 * The macro expands to a bare brace block, not do { } while (0): a
 * trailing semicolon at the call site produces an extra empty statement,
 * so do not use it as the unbraced body of an if that has an else.
 */
#define FLETCHER_4_SSE_SAVE_CTX(ctx) \
|
||||
{ \
|
||||
asm volatile("movdqu %%xmm0, %0" : "=m" ((ctx)->sse[0])); \
|
||||
asm volatile("movdqu %%xmm1, %0" : "=m" ((ctx)->sse[1])); \
|
||||
asm volatile("movdqu %%xmm2, %0" : "=m" ((ctx)->sse[2])); \
|
||||
asm volatile("movdqu %%xmm3, %0" : "=m" ((ctx)->sse[3])); \
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_sse2_native(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
fletcher_4_sse2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
|
||||
{
|
||||
const uint64_t *ip = buf;
|
||||
const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
|
||||
|
||||
kfpu_begin();
|
||||
|
||||
FLETCHER_4_SSE_RESTORE_CTX(ctx);
|
||||
|
||||
asm volatile("pxor %xmm4, %xmm4");
|
||||
|
||||
for (; ip < ipend; ip += 2) {
|
||||
@ -118,27 +120,37 @@ fletcher_4_sse2_native(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
asm volatile("paddq %xmm1, %xmm2");
|
||||
asm volatile("paddq %xmm2, %xmm3");
|
||||
}
|
||||
|
||||
FLETCHER_4_SSE_SAVE_CTX(ctx);
|
||||
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
fletcher_4_sse2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
|
||||
{
|
||||
const uint32_t *ip = buf;
|
||||
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
|
||||
|
||||
for (; ip < ipend; ip += 2) {
|
||||
uint32_t scratch;
|
||||
kfpu_begin();
|
||||
|
||||
asm volatile("bswapl %0" : "=r"(scratch) : "0"(*ip));
|
||||
asm volatile("movd %0, %%xmm5" :: "r"(scratch));
|
||||
asm volatile("bswapl %0" : "=r"(scratch) : "0"(*(ip + 1)));
|
||||
asm volatile("movd %0, %%xmm6" :: "r"(scratch));
|
||||
FLETCHER_4_SSE_RESTORE_CTX(ctx);
|
||||
|
||||
for (; ip < ipend; ip += 2) {
|
||||
uint32_t scratch1 = BSWAP_32(ip[0]);
|
||||
uint32_t scratch2 = BSWAP_32(ip[1]);
|
||||
asm volatile("movd %0, %%xmm5" :: "r"(scratch1));
|
||||
asm volatile("movd %0, %%xmm6" :: "r"(scratch2));
|
||||
asm volatile("punpcklqdq %xmm6, %xmm5");
|
||||
asm volatile("paddq %xmm5, %xmm0");
|
||||
asm volatile("paddq %xmm0, %xmm1");
|
||||
asm volatile("paddq %xmm1, %xmm2");
|
||||
asm volatile("paddq %xmm2, %xmm3");
|
||||
}
|
||||
|
||||
FLETCHER_4_SSE_SAVE_CTX(ctx);
|
||||
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
static boolean_t fletcher_4_sse2_valid(void)
|
||||
@ -161,15 +173,19 @@ const fletcher_4_ops_t fletcher_4_sse2_ops = {
|
||||
|
||||
#if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
|
||||
static void
|
||||
fletcher_4_ssse3_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
|
||||
{
|
||||
static const struct zfs_fletcher_sse_array mask = {
|
||||
static const zfs_fletcher_sse_t mask = {
|
||||
.v = { 0x0405060700010203, 0x0C0D0E0F08090A0B }
|
||||
};
|
||||
|
||||
const uint64_t *ip = buf;
|
||||
const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
|
||||
|
||||
kfpu_begin();
|
||||
|
||||
FLETCHER_4_SSE_RESTORE_CTX(ctx);
|
||||
|
||||
asm volatile("movdqu %0, %%xmm7"::"m" (mask));
|
||||
asm volatile("pxor %xmm4, %xmm4");
|
||||
|
||||
@ -188,6 +204,10 @@ fletcher_4_ssse3_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
|
||||
asm volatile("paddq %xmm1, %xmm2");
|
||||
asm volatile("paddq %xmm2, %xmm3");
|
||||
}
|
||||
|
||||
FLETCHER_4_SSE_SAVE_CTX(ctx);
|
||||
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
static boolean_t fletcher_4_ssse3_valid(void)
|
||||
|
Loading…
Reference in New Issue
Block a user