Fletcher4: Incremental updates and ctx calculation

Fixes ABI issues with fletcher4 code, adds support for
incremental updates, and adds ztest method for testing.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Gvozden Neskovic <neskovic@gmail.com>
Closes #5164
This commit is contained in:
Brian Behlendorf 2016-10-07 12:44:12 -07:00 committed by GitHub
commit 482cd9ee69
7 changed files with 436 additions and 206 deletions

View File

@ -332,6 +332,7 @@ ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid; ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade; ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt; ztest_func_t ztest_verify_dnode_bt;
uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
@ -379,6 +380,7 @@ ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
}; };
@ -5674,6 +5676,82 @@ ztest_fletcher(ztest_ds_t *zd, uint64_t id)
} }
} }
/*
 * ztest: check that incremental fletcher4 agrees with the one-shot checksum.
 *
 * Until gethrtime() passes the start time plus NANOSEC, repeatedly:
 *   - allocate a buffer of ztest_random_blocksize() bytes and fill it with
 *     random 32-bit words;
 *   - compute reference native and byteswap checksums with the "scalar"
 *     implementation selected;
 *   - switch to the "cycle" implementation selection and, 100 times, feed
 *     the buffer to the incremental functions in randomly sized pieces and
 *     then once as a whole, verifying both results equal the references.
 *
 * zd and id are part of the ztest_func_t signature and are not used here.
 */
void
ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id)
{
	void *buf;
	size_t size;
	uint32_t *ptr;
	size_t i;
	zio_cksum_t zc_ref;
	zio_cksum_t zc_ref_bswap;

	hrtime_t end = gethrtime() + NANOSEC;

	while (gethrtime() <= end) {
		int run_count = 100;
		const char *bytes;

		size = ztest_random_blocksize();
		buf = umem_alloc(size, UMEM_NOFAIL);
		/* byte-typed view of buf: keeps pointer arithmetic standard C */
		bytes = buf;

		for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++)
			*ptr = ztest_random(UINT_MAX);

		VERIFY0(fletcher_4_impl_set("scalar"));
		fletcher_4_native(buf, size, NULL, &zc_ref);
		fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap);

		VERIFY0(fletcher_4_impl_set("cycle"));

		while (run_count-- > 0) {
			zio_cksum_t zc;
			zio_cksum_t zc_bswap;
			size_t pos = 0;

			ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
			ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0);

			while (pos < size) {
				/* piece length: a random multiple of 64 bytes */
				size_t inc = 64 * ztest_random(size / 67);

				/*
				 * About one time in ten, add a few extra
				 * 4-byte words so the piece is not a multiple
				 * of 64 and the non-SIMD tail path is hit.
				 */
				if (ztest_random(100) < 10)
					inc += P2ALIGN(ztest_random(64),
					    sizeof (uint32_t));

				/* never run past the end of the buffer */
				if (inc > (size - pos))
					inc = size - pos;

				fletcher_4_incremental_native(bytes + pos, inc,
				    &zc);
				fletcher_4_incremental_byteswap(bytes + pos,
				    inc, &zc_bswap);

				pos += inc;
			}

			VERIFY3U(pos, ==, size);

			VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
			VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));

			/*
			 * verify if incremental on the whole buffer is
			 * equivalent to non-incremental version
			 */
			ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
			ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0);

			fletcher_4_incremental_native(buf, size, &zc);
			fletcher_4_incremental_byteswap(buf, size, &zc_bswap);

			VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
			VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));
		}

		umem_free(buf, size);
	}
}
static int static int
ztest_check_path(char *path) ztest_check_path(char *path)
{ {

View File

@ -62,12 +62,43 @@ void fletcher_4_init(void);
void fletcher_4_fini(void); void fletcher_4_fini(void);
/* Internal fletcher ctx */
/*
 * One SSE-sized accumulator: two 64-bit lanes with 16-byte alignment.
 * The SSE save/restore macros move each element of this type to and from
 * a single xmm register.
 */
typedef struct zfs_fletcher_sse {
uint64_t v[2] __attribute__((aligned(16)));
} zfs_fletcher_sse_t;
/*
 * One AVX-sized accumulator: four 64-bit lanes with 32-byte alignment.
 * The AVX2 save/restore macros move each element of this type to and from
 * a single ymm register.
 */
typedef struct zfs_fletcher_avx {
uint64_t v[4] __attribute__((aligned(32)));
} zfs_fletcher_avx_t;
/*
 * One AVX-512-sized accumulator: eight 64-bit lanes with 64-byte alignment.
 * The AVX512 save/restore macros move each element of this type to and from
 * a single zmm register.
 */
typedef struct zfs_fletcher_avx512 {
uint64_t v[8] __attribute__((aligned(64)));
} zfs_fletcher_avx512_t;
/*
 * Running state of a fletcher4 computation, passed to the init, compute
 * and fini callbacks of an implementation.
 *
 * Each implementation uses exactly one member: the scalar code keeps the
 * four checksum words directly, while each SIMD variant keeps four vector
 * accumulators that its compute routine loads into registers on entry and
 * stores back on exit.
 */
typedef union fletcher_4_ctx {
	zio_cksum_t scalar;

	/* used by both the SSE2 and the SSSE3 implementations */
#if defined(HAVE_SSE2)
	zfs_fletcher_sse_t sse[4];
#endif
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
	zfs_fletcher_avx_t avx[4];
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
	zfs_fletcher_avx512_t avx512[4];
#endif
} fletcher_4_ctx_t;
/* /*
* fletcher checksum struct * fletcher checksum struct
*/ */
typedef void (*fletcher_4_init_f)(zio_cksum_t *); typedef void (*fletcher_4_init_f)(fletcher_4_ctx_t *);
typedef void (*fletcher_4_fini_f)(zio_cksum_t *); typedef void (*fletcher_4_fini_f)(fletcher_4_ctx_t *, zio_cksum_t *);
typedef void (*fletcher_4_compute_f)(const void *, uint64_t, zio_cksum_t *); typedef void (*fletcher_4_compute_f)(fletcher_4_ctx_t *,
const void *, uint64_t);
typedef struct fletcher_4_func { typedef struct fletcher_4_func {
fletcher_4_init_f init_native; fletcher_4_init_f init_native;
@ -80,6 +111,7 @@ typedef struct fletcher_4_func {
const char *name; const char *name;
} fletcher_4_ops_t; } fletcher_4_ops_t;
#if defined(HAVE_SSE2) #if defined(HAVE_SSE2)
extern const fletcher_4_ops_t fletcher_4_sse2_ops; extern const fletcher_4_ops_t fletcher_4_sse2_ops;
#endif #endif

View File

@ -51,6 +51,7 @@
#include "libzfs_impl.h" #include "libzfs_impl.h"
#include "zfs_prop.h" #include "zfs_prop.h"
#include "zfeature_common.h" #include "zfeature_common.h"
#include <zfs_fletcher.h>
int int
libzfs_errno(libzfs_handle_t *hdl) libzfs_errno(libzfs_handle_t *hdl)
@ -876,6 +877,7 @@ libzfs_init(void)
zpool_prop_init(); zpool_prop_init();
zpool_feature_init(); zpool_feature_init();
libzfs_mnttab_init(hdl); libzfs_mnttab_init(hdl);
fletcher_4_init();
return (hdl); return (hdl);
} }
@ -898,6 +900,7 @@ libzfs_fini(libzfs_handle_t *hdl)
namespace_clear(hdl); namespace_clear(hdl);
libzfs_mnttab_fini(hdl); libzfs_mnttab_fini(hdl);
libzfs_core_fini(); libzfs_core_fini();
fletcher_4_fini();
free(hdl); free(hdl);
} }

View File

@ -138,17 +138,20 @@
#include <zfs_fletcher.h> #include <zfs_fletcher.h>
static void fletcher_4_scalar_init(zio_cksum_t *zcp); static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
static void fletcher_4_scalar_native(const void *buf, uint64_t size, static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
zio_cksum_t *zcp); static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
static void fletcher_4_scalar_byteswap(const void *buf, uint64_t size, const void *buf, uint64_t size);
zio_cksum_t *zcp); static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
const void *buf, uint64_t size);
static boolean_t fletcher_4_scalar_valid(void); static boolean_t fletcher_4_scalar_valid(void);
static const fletcher_4_ops_t fletcher_4_scalar_ops = { static const fletcher_4_ops_t fletcher_4_scalar_ops = {
.init_native = fletcher_4_scalar_init, .init_native = fletcher_4_scalar_init,
.fini_native = fletcher_4_scalar_fini,
.compute_native = fletcher_4_scalar_native, .compute_native = fletcher_4_scalar_native,
.init_byteswap = fletcher_4_scalar_init, .init_byteswap = fletcher_4_scalar_init,
.fini_byteswap = fletcher_4_scalar_fini,
.compute_byteswap = fletcher_4_scalar_byteswap, .compute_byteswap = fletcher_4_scalar_byteswap,
.valid = fletcher_4_scalar_valid, .valid = fletcher_4_scalar_valid,
.name = "scalar" .name = "scalar"
@ -248,22 +251,29 @@ fletcher_2_byteswap(const void *buf, uint64_t size,
} }
static void static void
fletcher_4_scalar_init(zio_cksum_t *zcp) fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
{ {
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
} }
static void static void
fletcher_4_scalar_native(const void *buf, uint64_t size, zio_cksum_t *zcp) fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
}
static void
fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,
uint64_t size)
{ {
const uint32_t *ip = buf; const uint32_t *ip = buf;
const uint32_t *ipend = ip + (size / sizeof (uint32_t)); const uint32_t *ipend = ip + (size / sizeof (uint32_t));
uint64_t a, b, c, d; uint64_t a, b, c, d;
a = zcp->zc_word[0]; a = ctx->scalar.zc_word[0];
b = zcp->zc_word[1]; b = ctx->scalar.zc_word[1];
c = zcp->zc_word[2]; c = ctx->scalar.zc_word[2];
d = zcp->zc_word[3]; d = ctx->scalar.zc_word[3];
for (; ip < ipend; ip++) { for (; ip < ipend; ip++) {
a += ip[0]; a += ip[0];
@ -272,20 +282,21 @@ fletcher_4_scalar_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
d += c; d += c;
} }
ZIO_SET_CHECKSUM(zcp, a, b, c, d); ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
} }
static void static void
fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
uint64_t size)
{ {
const uint32_t *ip = buf; const uint32_t *ip = buf;
const uint32_t *ipend = ip + (size / sizeof (uint32_t)); const uint32_t *ipend = ip + (size / sizeof (uint32_t));
uint64_t a, b, c, d; uint64_t a, b, c, d;
a = zcp->zc_word[0]; a = ctx->scalar.zc_word[0];
b = zcp->zc_word[1]; b = ctx->scalar.zc_word[1];
c = zcp->zc_word[2]; c = ctx->scalar.zc_word[2];
d = zcp->zc_word[3]; d = ctx->scalar.zc_word[3];
for (; ip < ipend; ip++) { for (; ip < ipend; ip++) {
a += BSWAP_32(ip[0]); a += BSWAP_32(ip[0]);
@ -294,7 +305,7 @@ fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
d += c; d += c;
} }
ZIO_SET_CHECKSUM(zcp, a, b, c, d); ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
} }
static boolean_t static boolean_t
@ -383,32 +394,15 @@ fletcher_4_impl_get(void)
return (ops); return (ops);
} }
void
fletcher_4_incremental_native(const void *buf, uint64_t size,
zio_cksum_t *zcp)
{
ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
fletcher_4_scalar_native(buf, size, zcp);
}
void
fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
zio_cksum_t *zcp)
{
ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
fletcher_4_scalar_byteswap(buf, size, zcp);
}
static inline void static inline void
fletcher_4_native_impl(const fletcher_4_ops_t *ops, const void *buf, fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
uint64_t size, zio_cksum_t *zcp)
{ {
ops->init_native(zcp); fletcher_4_ctx_t ctx;
ops->compute_native(buf, size, zcp); const fletcher_4_ops_t *ops = fletcher_4_impl_get();
if (ops->fini_native != NULL)
ops->fini_native(zcp); ops->init_native(&ctx);
ops->compute_native(&ctx, buf, size);
ops->fini_native(&ctx, zcp);
} }
/*ARGSUSED*/ /*ARGSUSED*/
@ -416,40 +410,41 @@ void
fletcher_4_native(const void *buf, uint64_t size, fletcher_4_native(const void *buf, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp) const void *ctx_template, zio_cksum_t *zcp)
{ {
const fletcher_4_ops_t *ops; const uint64_t p2size = P2ALIGN(size, 64);
uint64_t p2size = P2ALIGN(size, 64);
ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
if (size == 0) { if (size == 0 || p2size == 0) {
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
} else if (p2size == 0) {
ops = &fletcher_4_scalar_ops; if (size > 0)
fletcher_4_native_impl(ops, buf, size, zcp); fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
buf, size);
} else { } else {
ops = fletcher_4_impl_get(); fletcher_4_native_impl(buf, p2size, zcp);
fletcher_4_native_impl(ops, buf, p2size, zcp);
if (p2size < size) if (p2size < size)
fletcher_4_incremental_native((char *)buf + p2size, fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
size - p2size, zcp); (char *)buf + p2size, size - p2size);
} }
} }
void void
fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp) fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
{ {
fletcher_4_native_impl(&fletcher_4_scalar_ops, buf, size, zcp); ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
} }
static inline void static inline void
fletcher_4_byteswap_impl(const fletcher_4_ops_t *ops, const void *buf, fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
uint64_t size, zio_cksum_t *zcp)
{ {
ops->init_byteswap(zcp); fletcher_4_ctx_t ctx;
ops->compute_byteswap(buf, size, zcp); const fletcher_4_ops_t *ops = fletcher_4_impl_get();
if (ops->fini_byteswap != NULL)
ops->fini_byteswap(zcp); ops->init_byteswap(&ctx);
ops->compute_byteswap(&ctx, buf, size);
ops->fini_byteswap(&ctx, zcp);
} }
/*ARGSUSED*/ /*ARGSUSED*/
@ -457,26 +452,96 @@ void
fletcher_4_byteswap(const void *buf, uint64_t size, fletcher_4_byteswap(const void *buf, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp) const void *ctx_template, zio_cksum_t *zcp)
{ {
const fletcher_4_ops_t *ops; const uint64_t p2size = P2ALIGN(size, 64);
uint64_t p2size = P2ALIGN(size, 64);
ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
if (size == 0) { if (size == 0 || p2size == 0) {
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
} else if (p2size == 0) {
ops = &fletcher_4_scalar_ops; if (size > 0)
fletcher_4_byteswap_impl(ops, buf, size, zcp); fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
buf, size);
} else { } else {
ops = fletcher_4_impl_get(); fletcher_4_byteswap_impl(buf, p2size, zcp);
fletcher_4_byteswap_impl(ops, buf, p2size, zcp);
if (p2size < size) if (p2size < size)
fletcher_4_incremental_byteswap((char *)buf + p2size, fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
size - p2size, zcp); (char *)buf + p2size, size - p2size);
} }
} }
/* Incremental Fletcher 4 */
#define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20)
/*
 * Fold the checksum of the next chunk into a running fletcher4 checksum.
 *
 * zcp  - running checksum of all data before the chunk; updated in place
 * size - length of the chunk in bytes
 * nzcp - checksum of the chunk alone, computed from a zero start
 *
 * The running words are advanced as if 'size / 4' additional 32-bit words
 * had been processed, using the word count and two derived multipliers.
 */
static inline void
fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
    const zio_cksum_t *nzcp)
{
	const uint64_t nwords = size / sizeof (uint32_t);
	const uint64_t mul2 = nwords * (nwords + 1) / 2;
	const uint64_t mul3 = mul2 * (nwords + 2) / 3;

	/* running words as they were before this chunk */
	const uint64_t a = zcp->zc_word[0];
	const uint64_t b = zcp->zc_word[1];
	const uint64_t c = zcp->zc_word[2];
	const uint64_t d = zcp->zc_word[3];

	/*
	 * 'mul3' overflows for chunk sizes close to 16MiB, which is why
	 * callers split large buffers into pieces no bigger than
	 * ZFS_FLETCHER_4_INC_MAX_SIZE before combining.
	 */
	ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);

	/* highest word first, matching the order each word is consumed */
	zcp->zc_word[3] = d + (nzcp->zc_word[3] + nwords * c + mul2 * b +
	    mul3 * a);
	zcp->zc_word[2] = c + (nzcp->zc_word[2] + nwords * b + mul2 * a);
	zcp->zc_word[1] = b + (nzcp->zc_word[1] + nwords * a);
	zcp->zc_word[0] = a + nzcp->zc_word[0];
}
/*
 * Extend the running checksum *zcp with 'size' bytes at 'buf'.
 *
 * The buffer is consumed in pieces of at most ZFS_FLETCHER_4_INC_MAX_SIZE
 * bytes. Each piece is checksummed on its own with fletcher_4_native()
 * (native == true) or fletcher_4_byteswap() (otherwise) and the result is
 * merged into *zcp by fletcher_4_incremental_combine().
 */
static inline void
fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
    zio_cksum_t *zcp)
{
	/*
	 * Walk the buffer through a byte pointer: arithmetic on 'void *'
	 * is a compiler extension, not standard C.
	 */
	const char *cursor = buf;

	while (size > 0) {
		zio_cksum_t nzc;
		uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);

		if (native)
			fletcher_4_native(cursor, len, NULL, &nzc);
		else
			fletcher_4_byteswap(cursor, len, NULL, &nzc);

		fletcher_4_incremental_combine(zcp, len, &nzc);

		size -= len;
		cursor += len;
	}
}
/*
 * Extend the running native-endian fletcher4 checksum *zcp with 'size'
 * bytes at 'buf'.
 */
void
fletcher_4_incremental_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	if (size >= SPA_MINBLOCKSIZE) {
		fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
		return;
	}

	/* Small input: let the scalar code update the checksum in place */
	fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
}
/*
 * Extend the running byteswapped fletcher4 checksum *zcp with 'size'
 * bytes at 'buf'.
 */
void
fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
    zio_cksum_t *zcp)
{
	if (size >= SPA_MINBLOCKSIZE) {
		fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
		return;
	}

	/* Small input: let the scalar code update the checksum in place */
	fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
}
/* Fletcher 4 kstats */
static int static int
fletcher_4_kstat_headers(char *buf, size_t size) fletcher_4_kstat_headers(char *buf, size_t size)
{ {
@ -622,9 +687,6 @@ fletcher_4_init(void)
membar_producer(); membar_producer();
fletcher_4_initialized = B_TRUE; fletcher_4_initialized = B_TRUE;
/* Use 'cycle' math selection method for userspace */
VERIFY0(fletcher_4_impl_set("cycle"));
return; return;
#endif #endif
/* Benchmark all supported implementations */ /* Benchmark all supported implementations */

View File

@ -28,31 +28,73 @@
#include <sys/byteorder.h> #include <sys/byteorder.h>
#include <sys/spa_checksum.h> #include <sys/spa_checksum.h>
#include <zfs_fletcher.h> #include <zfs_fletcher.h>
#include <strings.h>
#define __asm __asm__ __volatile__ #define __asm __asm__ __volatile__
typedef struct {
uint64_t v[8] __attribute__((aligned(64)));
} zfs_avx512_t;
static void static void
fletcher_4_avx512f_init(zio_cksum_t *zcp) fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
{ {
kfpu_begin(); bzero(ctx->avx512, 4 * sizeof (zfs_fletcher_avx512_t));
/* clear registers */
__asm("vpxorq %zmm0, %zmm0, %zmm0");
__asm("vpxorq %zmm1, %zmm1, %zmm1");
__asm("vpxorq %zmm2, %zmm2, %zmm2");
__asm("vpxorq %zmm3, %zmm3, %zmm3");
} }
static void static void
fletcher_4_avx512f_native(const void *buf, uint64_t size, zio_cksum_t *unused) fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
static const uint64_t
CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 },
CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 },
DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 },
DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 },
DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 };
uint64_t A, B, C, D;
uint64_t i;
A = ctx->avx512[0].v[0];
B = 8 * ctx->avx512[1].v[0];
C = 64 * ctx->avx512[2].v[0] - CcB[0] * ctx->avx512[1].v[0];
D = 512 * ctx->avx512[3].v[0] - DcC[0] * ctx->avx512[2].v[0] +
DcB[0] * ctx->avx512[1].v[0];
for (i = 1; i < 8; i++) {
A += ctx->avx512[0].v[i];
B += 8 * ctx->avx512[1].v[i] - i * ctx->avx512[0].v[i];
C += 64 * ctx->avx512[2].v[i] - CcB[i] * ctx->avx512[1].v[i] +
CcA[i] * ctx->avx512[0].v[i];
D += 512 * ctx->avx512[3].v[i] - DcC[i] * ctx->avx512[2].v[i] +
DcB[i] * ctx->avx512[1].v[i] - DcA[i] * ctx->avx512[0].v[i];
}
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
}
/*
 * Load the four accumulators ctx->avx512[0..3] into zmm0..zmm3 using
 * unaligned 64-byte moves. The compute routines in this file invoke it
 * right after kfpu_begin(), before their main loop.
 *
 * Expands to a brace-enclosed block, not a do { } while (0) statement.
 */
#define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \
{ \
__asm("vmovdqu64 %0, %%zmm0" :: "m" ((ctx)->avx512[0])); \
__asm("vmovdqu64 %0, %%zmm1" :: "m" ((ctx)->avx512[1])); \
__asm("vmovdqu64 %0, %%zmm2" :: "m" ((ctx)->avx512[2])); \
__asm("vmovdqu64 %0, %%zmm3" :: "m" ((ctx)->avx512[3])); \
}
/*
 * Store zmm0..zmm3 back into the four accumulators ctx->avx512[0..3] using
 * unaligned 64-byte moves. The compute routines in this file invoke it
 * after their main loop, before kfpu_end().
 *
 * Expands to a brace-enclosed block, not a do { } while (0) statement.
 */
#define FLETCHER_4_AVX512_SAVE_CTX(ctx) \
{ \
__asm("vmovdqu64 %%zmm0, %0" : "=m" ((ctx)->avx512[0])); \
__asm("vmovdqu64 %%zmm1, %0" : "=m" ((ctx)->avx512[1])); \
__asm("vmovdqu64 %%zmm2, %0" : "=m" ((ctx)->avx512[2])); \
__asm("vmovdqu64 %%zmm3, %0" : "=m" ((ctx)->avx512[3])); \
}
static void
fletcher_4_avx512f_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
{ {
const uint32_t *ip = buf; const uint32_t *ip = buf;
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
kfpu_begin();
FLETCHER_4_AVX512_RESTORE_CTX(ctx);
for (; ip < ipend; ip += 8) { for (; ip < ipend; ip += 8) {
__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip)); __asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
__asm("vpaddq %zmm4, %zmm0, %zmm0"); __asm("vpaddq %zmm4, %zmm0, %zmm0");
@ -60,15 +102,24 @@ fletcher_4_avx512f_native(const void *buf, uint64_t size, zio_cksum_t *unused)
__asm("vpaddq %zmm1, %zmm2, %zmm2"); __asm("vpaddq %zmm1, %zmm2, %zmm2");
__asm("vpaddq %zmm2, %zmm3, %zmm3"); __asm("vpaddq %zmm2, %zmm3, %zmm3");
} }
FLETCHER_4_AVX512_SAVE_CTX(ctx);
kfpu_end();
} }
static void static void
fletcher_4_avx512f_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused) fletcher_4_avx512f_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
uint64_t size)
{ {
static const uint64_t byteswap_mask = 0xFFULL; static const uint64_t byteswap_mask = 0xFFULL;
const uint32_t *ip = buf; const uint32_t *ip = buf;
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
kfpu_begin();
FLETCHER_4_AVX512_RESTORE_CTX(ctx);
__asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask)); __asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask));
__asm("vpsllq $8, %zmm8, %zmm9"); __asm("vpsllq $8, %zmm8, %zmm9");
__asm("vpsllq $16, %zmm8, %zmm10"); __asm("vpsllq $16, %zmm8, %zmm10");
@ -94,49 +145,10 @@ fletcher_4_avx512f_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
__asm("vpaddq %zmm1, %zmm2, %zmm2"); __asm("vpaddq %zmm1, %zmm2, %zmm2");
__asm("vpaddq %zmm2, %zmm3, %zmm3"); __asm("vpaddq %zmm2, %zmm3, %zmm3");
} }
}
static void FLETCHER_4_AVX512_SAVE_CTX(ctx)
fletcher_4_avx512f_fini(zio_cksum_t *zcp)
{
static const uint64_t
CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 },
CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 },
DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 },
DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 },
DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 };
zfs_avx512_t a, b, c, b8, c64, d512;
uint64_t A, B, C, D;
uint64_t i;
__asm("vmovdqu64 %%zmm0, %0":"=m" (a));
__asm("vmovdqu64 %%zmm1, %0":"=m" (b));
__asm("vmovdqu64 %%zmm2, %0":"=m" (c));
__asm("vpsllq $3, %zmm1, %zmm1");
__asm("vpsllq $6, %zmm2, %zmm2");
__asm("vpsllq $9, %zmm3, %zmm3");
__asm("vmovdqu64 %%zmm1, %0":"=m" (b8));
__asm("vmovdqu64 %%zmm2, %0":"=m" (c64));
__asm("vmovdqu64 %%zmm3, %0":"=m" (d512));
kfpu_end(); kfpu_end();
A = a.v[0];
B = b8.v[0];
C = c64.v[0] - CcB[0] * b.v[0];
D = d512.v[0] - DcC[0] * c.v[0] + DcB[0] * b.v[0];
for (i = 1; i < 8; i++) {
A += a.v[i];
B += b8.v[i] - i * a.v[i];
C += c64.v[i] - CcB[i] * b.v[i] + CcA[i] * a.v[i];
D += d512.v[i] - DcC[i] * c.v[i] + DcB[i] * b.v[i] -
DcA[i] * a.v[i];
}
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
} }
static boolean_t static boolean_t

View File

@ -45,58 +45,69 @@
#include <linux/simd_x86.h> #include <linux/simd_x86.h>
#include <sys/spa_checksum.h> #include <sys/spa_checksum.h>
#include <zfs_fletcher.h> #include <zfs_fletcher.h>
#include <strings.h>
static void static void
fletcher_4_avx2_init(zio_cksum_t *zcp) fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
{ {
kfpu_begin(); bzero(ctx->avx, 4 * sizeof (zfs_fletcher_avx_t));
/* clear avx2 registers */
asm volatile("vpxor %ymm0, %ymm0, %ymm0");
asm volatile("vpxor %ymm1, %ymm1, %ymm1");
asm volatile("vpxor %ymm2, %ymm2, %ymm2");
asm volatile("vpxor %ymm3, %ymm3, %ymm3");
} }
static void static void
fletcher_4_avx2_fini(zio_cksum_t *zcp) fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{ {
uint64_t __attribute__((aligned(32))) a[4];
uint64_t __attribute__((aligned(32))) b[4];
uint64_t __attribute__((aligned(32))) c[4];
uint64_t __attribute__((aligned(32))) d[4];
uint64_t A, B, C, D; uint64_t A, B, C, D;
asm volatile("vmovdqu %%ymm0, %0":"=m" (a)); A = ctx->avx[0].v[0] + ctx->avx[0].v[1] +
asm volatile("vmovdqu %%ymm1, %0":"=m" (b)); ctx->avx[0].v[2] + ctx->avx[0].v[3];
asm volatile("vmovdqu %%ymm2, %0":"=m" (c)); B = 0 - ctx->avx[0].v[1] - 2 * ctx->avx[0].v[2] - 3 * ctx->avx[0].v[3] +
asm volatile("vmovdqu %%ymm3, %0":"=m" (d)); 4 * ctx->avx[1].v[0] + 4 * ctx->avx[1].v[1] + 4 * ctx->avx[1].v[2] +
asm volatile("vzeroupper"); 4 * ctx->avx[1].v[3];
kfpu_end(); C = ctx->avx[0].v[2] + 3 * ctx->avx[0].v[3] - 6 * ctx->avx[1].v[0] -
10 * ctx->avx[1].v[1] - 14 * ctx->avx[1].v[2] -
18 * ctx->avx[1].v[3] + 16 * ctx->avx[2].v[0] +
16 * ctx->avx[2].v[1] + 16 * ctx->avx[2].v[2] +
16 * ctx->avx[2].v[3];
A = a[0] + a[1] + a[2] + a[3]; D = 0 - ctx->avx[0].v[3] + 4 * ctx->avx[1].v[0] +
B = 0 - a[1] - 2*a[2] - 3*a[3] 10 * ctx->avx[1].v[1] + 20 * ctx->avx[1].v[2] +
+ 4*b[0] + 4*b[1] + 4*b[2] + 4*b[3]; 34 * ctx->avx[1].v[3] - 48 * ctx->avx[2].v[0] -
64 * ctx->avx[2].v[1] - 80 * ctx->avx[2].v[2] -
C = a[2] + 3*a[3] 96 * ctx->avx[2].v[3] + 64 * ctx->avx[3].v[0] +
- 6*b[0] - 10*b[1] - 14*b[2] - 18*b[3] 64 * ctx->avx[3].v[1] + 64 * ctx->avx[3].v[2] +
+ 16*c[0] + 16*c[1] + 16*c[2] + 16*c[3]; 64 * ctx->avx[3].v[3];
D = 0 - a[3]
+ 4*b[0] + 10*b[1] + 20*b[2] + 34*b[3]
- 48*c[0] - 64*c[1] - 80*c[2] - 96*c[3]
+ 64*d[0] + 64*d[1] + 64*d[2] + 64*d[3];
ZIO_SET_CHECKSUM(zcp, A, B, C, D); ZIO_SET_CHECKSUM(zcp, A, B, C, D);
} }
/*
 * Load the four accumulators ctx->avx[0..3] into ymm0..ymm3 using
 * unaligned 32-byte moves. The compute routines in this file invoke it
 * right after kfpu_begin(), before their main loop.
 *
 * Expands to a brace-enclosed block, not a do { } while (0) statement.
 */
#define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \
{ \
asm volatile("vmovdqu %0, %%ymm0" :: "m" ((ctx)->avx[0])); \
asm volatile("vmovdqu %0, %%ymm1" :: "m" ((ctx)->avx[1])); \
asm volatile("vmovdqu %0, %%ymm2" :: "m" ((ctx)->avx[2])); \
asm volatile("vmovdqu %0, %%ymm3" :: "m" ((ctx)->avx[3])); \
}
/*
 * Store ymm0..ymm3 back into the four accumulators ctx->avx[0..3] using
 * unaligned 32-byte moves. The compute routines in this file invoke it
 * after their main loop, before vzeroupper and kfpu_end().
 *
 * Expands to a brace-enclosed block, not a do { } while (0) statement.
 */
#define FLETCHER_4_AVX2_SAVE_CTX(ctx) \
{ \
asm volatile("vmovdqu %%ymm0, %0" : "=m" ((ctx)->avx[0])); \
asm volatile("vmovdqu %%ymm1, %0" : "=m" ((ctx)->avx[1])); \
asm volatile("vmovdqu %%ymm2, %0" : "=m" ((ctx)->avx[2])); \
asm volatile("vmovdqu %%ymm3, %0" : "=m" ((ctx)->avx[3])); \
}
static void static void
fletcher_4_avx2_native(const void *buf, uint64_t size, zio_cksum_t *unused) fletcher_4_avx2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
{ {
const uint64_t *ip = buf; const uint64_t *ip = buf;
const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
kfpu_begin();
FLETCHER_4_AVX2_RESTORE_CTX(ctx);
for (; ip < ipend; ip += 2) { for (; ip < ipend; ip += 2) {
asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip)); asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip));
asm volatile("vpaddq %ymm4, %ymm0, %ymm0"); asm volatile("vpaddq %ymm4, %ymm0, %ymm0");
@ -104,21 +115,28 @@ fletcher_4_avx2_native(const void *buf, uint64_t size, zio_cksum_t *unused)
asm volatile("vpaddq %ymm1, %ymm2, %ymm2"); asm volatile("vpaddq %ymm1, %ymm2, %ymm2");
asm volatile("vpaddq %ymm2, %ymm3, %ymm3"); asm volatile("vpaddq %ymm2, %ymm3, %ymm3");
} }
FLETCHER_4_AVX2_SAVE_CTX(ctx);
asm volatile("vzeroupper");
kfpu_end();
} }
static void static void
fletcher_4_avx2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused) fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
{ {
static const struct { static const zfs_fletcher_avx_t mask = {
uint64_t v[4] __attribute__((aligned(32)));
} mask = {
.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B, .v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B } 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
}; };
const uint64_t *ip = buf; const uint64_t *ip = buf;
const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
asm volatile("vmovdqa %0, %%ymm5"::"m"(mask)); kfpu_begin();
FLETCHER_4_AVX2_RESTORE_CTX(ctx);
asm volatile("vmovdqu %0, %%ymm5" :: "m" (mask));
for (; ip < ipend; ip += 2) { for (; ip < ipend; ip += 2) {
asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip)); asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip));
@ -129,6 +147,11 @@ fletcher_4_avx2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
asm volatile("vpaddq %ymm1, %ymm2, %ymm2"); asm volatile("vpaddq %ymm1, %ymm2, %ymm2");
asm volatile("vpaddq %ymm2, %ymm3, %ymm3"); asm volatile("vpaddq %ymm2, %ymm3, %ymm3");
} }
FLETCHER_4_AVX2_SAVE_CTX(ctx);
asm volatile("vzeroupper");
kfpu_end();
} }
static boolean_t fletcher_4_avx2_valid(void) static boolean_t fletcher_4_avx2_valid(void)

View File

@ -45,39 +45,19 @@
#include <linux/simd_x86.h> #include <linux/simd_x86.h>
#include <sys/spa_checksum.h> #include <sys/spa_checksum.h>
#include <sys/byteorder.h>
#include <zfs_fletcher.h> #include <zfs_fletcher.h>
#include <strings.h>
struct zfs_fletcher_sse_array {
uint64_t v[2] __attribute__((aligned(16)));
};
static void static void
fletcher_4_sse2_init(zio_cksum_t *zcp) fletcher_4_sse2_init(fletcher_4_ctx_t *ctx) {
{ bzero(ctx->sse, 4 * sizeof (zfs_fletcher_sse_t));
kfpu_begin();
/* clear sse registers */
asm volatile("pxor %xmm0, %xmm0");
asm volatile("pxor %xmm1, %xmm1");
asm volatile("pxor %xmm2, %xmm2");
asm volatile("pxor %xmm3, %xmm3");
} }
static void static void
fletcher_4_sse2_fini(zio_cksum_t *zcp) fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) {
{
struct zfs_fletcher_sse_array a, b, c, d;
uint64_t A, B, C, D; uint64_t A, B, C, D;
asm volatile("movdqu %%xmm0, %0":"=m" (a.v));
asm volatile("movdqu %%xmm1, %0":"=m" (b.v));
asm volatile("psllq $0x2, %xmm2");
asm volatile("movdqu %%xmm2, %0":"=m" (c.v));
asm volatile("psllq $0x3, %xmm3");
asm volatile("movdqu %%xmm3, %0":"=m" (d.v));
kfpu_end();
/* /*
* The mixing matrix for checksum calculation is: * The mixing matrix for checksum calculation is:
* a = a0 + a1 * a = a0 + a1
@ -88,20 +68,42 @@ fletcher_4_sse2_fini(zio_cksum_t *zcp)
* c and d are multiplied by 4 and 8, respectively, * c and d are multiplied by 4 and 8, respectively,
* before spilling the vectors out to memory. * before spilling the vectors out to memory.
*/ */
A = a.v[0] + a.v[1]; A = ctx->sse[0].v[0] + ctx->sse[0].v[1];
B = 2*b.v[0] + 2*b.v[1] - a.v[1]; B = 2 * ctx->sse[1].v[0] + 2 * ctx->sse[1].v[1] - ctx->sse[0].v[1];
C = c.v[0] - b.v[0] + c.v[1] - 3*b.v[1]; C = 4 * ctx->sse[2].v[0] - ctx->sse[1].v[0] + 4 * ctx->sse[2].v[1] -
D = d.v[0] - c.v[0] + d.v[1] - 2*c.v[1] + b.v[1]; 3 * ctx->sse[1].v[1];
D = 8 * ctx->sse[3].v[0] - 4 * ctx->sse[2].v[0] + 8 * ctx->sse[3].v[1] -
8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];
ZIO_SET_CHECKSUM(zcp, A, B, C, D); ZIO_SET_CHECKSUM(zcp, A, B, C, D);
} }
/*
 * Load the four accumulators ctx->sse[0..3] into xmm0..xmm3 using
 * unaligned 16-byte moves. The SSE2 and SSSE3 compute routines in this
 * file invoke it right after kfpu_begin(), before their main loop.
 *
 * Expands to a brace-enclosed block, not a do { } while (0) statement.
 */
#define FLETCHER_4_SSE_RESTORE_CTX(ctx) \
{ \
asm volatile("movdqu %0, %%xmm0" :: "m" ((ctx)->sse[0])); \
asm volatile("movdqu %0, %%xmm1" :: "m" ((ctx)->sse[1])); \
asm volatile("movdqu %0, %%xmm2" :: "m" ((ctx)->sse[2])); \
asm volatile("movdqu %0, %%xmm3" :: "m" ((ctx)->sse[3])); \
}
/*
 * Store xmm0..xmm3 back into the four accumulators ctx->sse[0..3] using
 * unaligned 16-byte moves. The SSE2 and SSSE3 compute routines in this
 * file invoke it after their main loop, before kfpu_end().
 *
 * Expands to a brace-enclosed block, not a do { } while (0) statement.
 */
#define FLETCHER_4_SSE_SAVE_CTX(ctx) \
{ \
asm volatile("movdqu %%xmm0, %0" : "=m" ((ctx)->sse[0])); \
asm volatile("movdqu %%xmm1, %0" : "=m" ((ctx)->sse[1])); \
asm volatile("movdqu %%xmm2, %0" : "=m" ((ctx)->sse[2])); \
asm volatile("movdqu %%xmm3, %0" : "=m" ((ctx)->sse[3])); \
}
static void static void
fletcher_4_sse2_native(const void *buf, uint64_t size, zio_cksum_t *unused) fletcher_4_sse2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
{ {
const uint64_t *ip = buf; const uint64_t *ip = buf;
const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
kfpu_begin();
FLETCHER_4_SSE_RESTORE_CTX(ctx);
asm volatile("pxor %xmm4, %xmm4"); asm volatile("pxor %xmm4, %xmm4");
for (; ip < ipend; ip += 2) { for (; ip < ipend; ip += 2) {
@ -118,27 +120,37 @@ fletcher_4_sse2_native(const void *buf, uint64_t size, zio_cksum_t *unused)
asm volatile("paddq %xmm1, %xmm2"); asm volatile("paddq %xmm1, %xmm2");
asm volatile("paddq %xmm2, %xmm3"); asm volatile("paddq %xmm2, %xmm3");
} }
FLETCHER_4_SSE_SAVE_CTX(ctx);
kfpu_end();
} }
static void static void
fletcher_4_sse2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused) fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
{ {
const uint32_t *ip = buf; const uint32_t *ip = buf;
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
for (; ip < ipend; ip += 2) { kfpu_begin();
uint32_t scratch;
asm volatile("bswapl %0" : "=r"(scratch) : "0"(*ip)); FLETCHER_4_SSE_RESTORE_CTX(ctx);
asm volatile("movd %0, %%xmm5" :: "r"(scratch));
asm volatile("bswapl %0" : "=r"(scratch) : "0"(*(ip + 1))); for (; ip < ipend; ip += 2) {
asm volatile("movd %0, %%xmm6" :: "r"(scratch)); uint32_t scratch1 = BSWAP_32(ip[0]);
uint32_t scratch2 = BSWAP_32(ip[1]);
asm volatile("movd %0, %%xmm5" :: "r"(scratch1));
asm volatile("movd %0, %%xmm6" :: "r"(scratch2));
asm volatile("punpcklqdq %xmm6, %xmm5"); asm volatile("punpcklqdq %xmm6, %xmm5");
asm volatile("paddq %xmm5, %xmm0"); asm volatile("paddq %xmm5, %xmm0");
asm volatile("paddq %xmm0, %xmm1"); asm volatile("paddq %xmm0, %xmm1");
asm volatile("paddq %xmm1, %xmm2"); asm volatile("paddq %xmm1, %xmm2");
asm volatile("paddq %xmm2, %xmm3"); asm volatile("paddq %xmm2, %xmm3");
} }
FLETCHER_4_SSE_SAVE_CTX(ctx);
kfpu_end();
} }
static boolean_t fletcher_4_sse2_valid(void) static boolean_t fletcher_4_sse2_valid(void)
@ -161,15 +173,19 @@ const fletcher_4_ops_t fletcher_4_sse2_ops = {
#if defined(HAVE_SSE2) && defined(HAVE_SSSE3) #if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
static void static void
fletcher_4_ssse3_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused) fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
{ {
static const struct zfs_fletcher_sse_array mask = { static const zfs_fletcher_sse_t mask = {
.v = { 0x0405060700010203, 0x0C0D0E0F08090A0B } .v = { 0x0405060700010203, 0x0C0D0E0F08090A0B }
}; };
const uint64_t *ip = buf; const uint64_t *ip = buf;
const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
kfpu_begin();
FLETCHER_4_SSE_RESTORE_CTX(ctx);
asm volatile("movdqu %0, %%xmm7"::"m" (mask)); asm volatile("movdqu %0, %%xmm7"::"m" (mask));
asm volatile("pxor %xmm4, %xmm4"); asm volatile("pxor %xmm4, %xmm4");
@ -188,6 +204,10 @@ fletcher_4_ssse3_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
asm volatile("paddq %xmm1, %xmm2"); asm volatile("paddq %xmm1, %xmm2");
asm volatile("paddq %xmm2, %xmm3"); asm volatile("paddq %xmm2, %xmm3");
} }
FLETCHER_4_SSE_SAVE_CTX(ctx);
kfpu_end();
} }
static boolean_t fletcher_4_ssse3_valid(void) static boolean_t fletcher_4_ssse3_valid(void)