Fletcher4 implementation using avx512f instruction set

Algorithm runs 8 parallel sums, consuming 8x uint32_t elements per
loop iteration. Size alignment of main fletcher4 methods is adjusted
accordingly. New implementation is called 'avx512f'.

Note: byteswap method can be implemented more efficiently when avx512bw hardware
becomes available. Currently, it is ~ 2x slower than native method.

Table shows result of full (native) fletcher4 calculation for different buffer size:

fletcher4   4KB     16KB    64KB    128KB   256KB   1MB     16MB
--------------------------------------------------------------------
[scalar]    1213    1228    1231    1231    1225    1200    1160
[sse2]      2374    2442    2459    2456    2462    2250    2220
[avx2]      4288    4753    4871    4893    4900    4050    3882
[avx512f]   5975    8445    9196    9221    9262    6307    5620

Signed-off-by: Gvozden Neskovic <neskovic@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4952
This commit is contained in:
Gvozden Neskovic
2016-07-06 13:42:04 +02:00
committed by Brian Behlendorf
parent 32ffaa3de5
commit 70b258fc96
6 changed files with 182 additions and 10 deletions
+11 -2
View File
@@ -158,6 +158,9 @@ static const fletcher_4_ops_t *fletcher_4_algos[] = {
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
&fletcher_4_avx2_ops,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
&fletcher_4_avx512f_ops,
#endif
};
static enum fletcher_selector {
@@ -171,6 +174,9 @@ static enum fletcher_selector {
#endif
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
FLETCHER_AVX2,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
FLETCHER_AVX512F,
#endif
FLETCHER_CYCLE
} fletcher_4_impl_chosen = FLETCHER_SCALAR;
@@ -190,6 +196,9 @@ static struct fletcher_4_impl_selector {
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
[ FLETCHER_AVX2 ] = { "avx2", &fletcher_4_avx2_ops },
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
[ FLETCHER_AVX512F ] = { "avx512f", &fletcher_4_avx512f_ops },
#endif
#if !defined(_KERNEL)
[ FLETCHER_CYCLE ] = { "cycle", &fletcher_4_scalar_ops }
#endif
@@ -354,7 +363,7 @@ fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
const fletcher_4_ops_t *ops;
if (IS_P2ALIGNED(size, 4 * sizeof (uint32_t)))
if (IS_P2ALIGNED(size, 8 * sizeof (uint32_t)))
ops = fletcher_4_impl_get();
else
ops = &fletcher_4_scalar_ops;
@@ -370,7 +379,7 @@ fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
const fletcher_4_ops_t *ops;
if (IS_P2ALIGNED(size, 4 * sizeof (uint32_t)))
if (IS_P2ALIGNED(size, 8 * sizeof (uint32_t)))
ops = fletcher_4_impl_get();
else
ops = &fletcher_4_scalar_ops;