mirror_zfs/include/zfs_fletcher.h

165 lines
5.0 KiB
C
Raw Permalink Normal View History

/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
OpenZFS 4185 - add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Prakash Surya <prakash.surya@delphix.com> Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com> Reviewed by: Richard Lowe <richlowe@richlowe.net> Approved by: Garrett D'Amore <garrett@damore.org> Ported by: Tony Hutter <hutter2@llnl.gov> OpenZFS-issue: https://www.illumos.org/issues/4185 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/45818ee Porting Notes: This code is ported on top of the Illumos Crypto Framework code: https://github.com/zfsonlinux/zfs/pull/4329/commits/b5e030c8dbb9cd393d313571dee4756fbba8c22d The list of porting changes includes: - Copied module/icp/include/sha2/sha2.h directly from illumos - Removed from module/icp/algs/sha2/sha2.c: #pragma inline(SHA256Init, SHA384Init, SHA512Init) - Added 'ctx' to lib/libzfs/libzfs_sendrecv.c:zio_checksum_SHA256() since it now takes in an extra parameter. - Added CTASSERT() to assert.h from for module/zfs/edonr_zfs.c - Added skein & edonr to libicp/Makefile.am - Added sha512.S. It was generated from sha512-x86_64.pl in Illumos. - Updated ztest.c with new fletcher_4_*() args; used NULL for new CTX argument. - In icp/algs/edonr/edonr_byteorder.h, Removed the #if defined(__linux) section to not #include the non-existant endian.h. - In skein_test.c, renane NULL to 0 in "no test vector" array entries to get around a compiler warning. - Fixup test files: - Rename <sys/varargs.h> -> <varargs.h>, <strings.h> -> <string.h>, - Remove <note.h> and define NOTE() as NOP. - Define u_longlong_t - Rename "#!/usr/bin/ksh" -> "#!/bin/ksh -p" - Rename NULL to 0 in "no test vector" array entries to get around a compiler warning. - Remove "for isa in $($ISAINFO); do" stuff - Add/update Makefiles - Add some userspace headers like stdio.h/stdlib.h in places of sys/types.h. - EXPORT_SYMBOL *_Init/*_Update/*_Final... routines in ICP modules. - Update scripts/zfs2zol-patch.sed - include <sys/sha2.h> in sha2_impl.h - Add sha2.h to include/sys/Makefile.am - Add skein and edonr dirs to icp Makefile - Add new checksums to zpool_get.cfg - Move checksum switch block from zfs_secpolicy_setprop() to zfs_check_settable() - Fix -Wuninitialized error in edonr_byteorder.h on PPC - Fix stack frame size errors on ARM32 - Don't unroll loops in Skein on 32-bit to save stack space - Add memory barriers in sha2.c on 32-bit to save stack space - Add filetest_001_pos.ksh checksum sanity test - Add option to write psudorandom data in file_write utility
2016-06-16 01:47:05 +03:00
/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
*/
#ifndef _ZFS_FLETCHER_H
#define _ZFS_FLETCHER_H extern __attribute__((visibility("default")))
#include <sys/types.h>
#include <sys/spa_checksum.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* fletcher checksum functions
Rework of fletcher_4 module - Benchmark memory block is increased to 128kiB to reflect real block sizes more accurately. Measurements include all three stages needed for checksum generation, i.e. `init()/compute()/fini()`. The inner loop is repeated multiple times to offset overhead of time function. - Fastest implementation selects native and byteswap methods independently in benchmark. To support this new function pointers `init_byteswap()/fini_byteswap()` are introduced. - Implementation mutex lock is replaced by atomic variable. - To save time, benchmark is not executed in userspace. Instead, highest supported implementation is used for fastest. Default userspace selector is still 'cycle'. - `fletcher_4_native/byteswap()` methods use incremental methods to finish calculation if data size is not multiple of vector stride (currently 64B). - Added `fletcher_4_native_varsize()` special purpose method for use when buffer size is not known in advance. The method does not enforce 4B alignment on buffer size, and will ignore last (size % 4) bytes of the data buffer. - Benchmark `kstat` is changed to match the one of vdev_raidz. It now shows throughput for all supported implementations (in B/s), native and byteswap, as well as the code [fastest] is running. Example of `fletcher_4_bench` running on `Intel(R) Xeon(R) CPU E5-2660 v3 @ 2.60GHz`: implementation native byteswap scalar 4768120823 3426105750 sse2 7947841777 4318964249 ssse3 7951922722 6112191941 avx2 13269714358 11043200912 fastest avx2 avx2 Example of `fletcher_4_bench` running on `Intel(R) Xeon Phi(TM) CPU 7210 @ 1.30GHz`: implementation native byteswap scalar 1291115967 1031555336 sse2 2539571138 1280970926 ssse3 2537778746 1080016762 avx2 4950749767 1078493449 avx512f 9581379998 4010029046 fastest avx512f avx512f Signed-off-by: Gvozden Neskovic <neskovic@gmail.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4952
2016-07-12 18:50:54 +03:00
*
* Note: Fletcher checksum methods expect buffer size to be 4B aligned. This
* limitation stems from the algorithm design. Performing incremental checksum
* without said alignment would yield different results. Therefore, the code
* includes assertions for the size alignment.
* For compatibility, it is required that some code paths calculate checksum of
* non-aligned buffer sizes. For this purpose, `fletcher_4_native_varsize()`
* checksum method is added. This method will ignore last (size % 4) bytes of
* the data buffer.
*/
_ZFS_FLETCHER_H void fletcher_init(zio_cksum_t *);
_ZFS_FLETCHER_H void fletcher_2_native(const void *, uint64_t, const void *,
zio_cksum_t *);
_ZFS_FLETCHER_H void fletcher_2_byteswap(const void *, uint64_t, const void *,
zio_cksum_t *);
_ZFS_FLETCHER_H void fletcher_4_native(const void *, uint64_t, const void *,
zio_cksum_t *);
_ZFS_FLETCHER_H int fletcher_2_incremental_native(void *, size_t, void *);
_ZFS_FLETCHER_H int fletcher_2_incremental_byteswap(void *, size_t, void *);
_ZFS_FLETCHER_H void fletcher_4_native_varsize(const void *, uint64_t,
zio_cksum_t *);
_ZFS_FLETCHER_H void fletcher_4_byteswap(const void *, uint64_t, const void *,
zio_cksum_t *);
_ZFS_FLETCHER_H int fletcher_4_incremental_native(void *, size_t, void *);
_ZFS_FLETCHER_H int fletcher_4_incremental_byteswap(void *, size_t, void *);
_ZFS_FLETCHER_H int fletcher_4_impl_set(const char *selector);
_ZFS_FLETCHER_H void fletcher_4_init(void);
_ZFS_FLETCHER_H void fletcher_4_fini(void);
Rework of fletcher_4 module - Benchmark memory block is increased to 128kiB to reflect real block sizes more accurately. Measurements include all three stages needed for checksum generation, i.e. `init()/compute()/fini()`. The inner loop is repeated multiple times to offset overhead of time function. - Fastest implementation selects native and byteswap methods independently in benchmark. To support this new function pointers `init_byteswap()/fini_byteswap()` are introduced. - Implementation mutex lock is replaced by atomic variable. - To save time, benchmark is not executed in userspace. Instead, highest supported implementation is used for fastest. Default userspace selector is still 'cycle'. - `fletcher_4_native/byteswap()` methods use incremental methods to finish calculation if data size is not multiple of vector stride (currently 64B). - Added `fletcher_4_native_varsize()` special purpose method for use when buffer size is not known in advance. The method does not enforce 4B alignment on buffer size, and will ignore last (size % 4) bytes of the data buffer. - Benchmark `kstat` is changed to match the one of vdev_raidz. It now shows throughput for all supported implementations (in B/s), native and byteswap, as well as the code [fastest] is running. Example of `fletcher_4_bench` running on `Intel(R) Xeon(R) CPU E5-2660 v3 @ 2.60GHz`: implementation native byteswap scalar 4768120823 3426105750 sse2 7947841777 4318964249 ssse3 7951922722 6112191941 avx2 13269714358 11043200912 fastest avx2 avx2 Example of `fletcher_4_bench` running on `Intel(R) Xeon Phi(TM) CPU 7210 @ 1.30GHz`: implementation native byteswap scalar 1291115967 1031555336 sse2 2539571138 1280970926 ssse3 2537778746 1080016762 avx2 4950749767 1078493449 avx512f 9581379998 4010029046 fastest avx512f avx512f Signed-off-by: Gvozden Neskovic <neskovic@gmail.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4952
2016-07-12 18:50:54 +03:00
/* Internal fletcher ctx */
typedef struct zfs_fletcher_superscalar {
uint64_t v[4];
} zfs_fletcher_superscalar_t;
typedef struct zfs_fletcher_sse {
uint64_t v[2];
} zfs_fletcher_sse_t;
typedef struct zfs_fletcher_avx {
uint64_t v[4];
} zfs_fletcher_avx_t;
typedef struct zfs_fletcher_avx512 {
uint64_t v[8];
} zfs_fletcher_avx512_t;
typedef struct zfs_fletcher_aarch64_neon {
uint64_t v[2];
} zfs_fletcher_aarch64_neon_t;
typedef union fletcher_4_ctx {
zio_cksum_t scalar;
zfs_fletcher_superscalar_t superscalar[4];
#if defined(HAVE_SSE2) || (defined(HAVE_SSE2) && defined(HAVE_SSSE3))
zfs_fletcher_sse_t sse[4];
#endif
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
zfs_fletcher_avx_t avx[4];
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
zfs_fletcher_avx512_t avx512[4];
#endif
#if defined(__aarch64__)
zfs_fletcher_aarch64_neon_t aarch64_neon[4];
#endif
} fletcher_4_ctx_t;
/*
* fletcher checksum struct
*/
typedef void (*fletcher_4_init_f)(fletcher_4_ctx_t *);
typedef void (*fletcher_4_fini_f)(fletcher_4_ctx_t *, zio_cksum_t *);
typedef void (*fletcher_4_compute_f)(fletcher_4_ctx_t *,
const void *, uint64_t);
Rework of fletcher_4 module - Benchmark memory block is increased to 128kiB to reflect real block sizes more accurately. Measurements include all three stages needed for checksum generation, i.e. `init()/compute()/fini()`. The inner loop is repeated multiple times to offset overhead of time function. - Fastest implementation selects native and byteswap methods independently in benchmark. To support this new function pointers `init_byteswap()/fini_byteswap()` are introduced. - Implementation mutex lock is replaced by atomic variable. - To save time, benchmark is not executed in userspace. Instead, highest supported implementation is used for fastest. Default userspace selector is still 'cycle'. - `fletcher_4_native/byteswap()` methods use incremental methods to finish calculation if data size is not multiple of vector stride (currently 64B). - Added `fletcher_4_native_varsize()` special purpose method for use when buffer size is not known in advance. The method does not enforce 4B alignment on buffer size, and will ignore last (size % 4) bytes of the data buffer. - Benchmark `kstat` is changed to match the one of vdev_raidz. It now shows throughput for all supported implementations (in B/s), native and byteswap, as well as the code [fastest] is running. Example of `fletcher_4_bench` running on `Intel(R) Xeon(R) CPU E5-2660 v3 @ 2.60GHz`: implementation native byteswap scalar 4768120823 3426105750 sse2 7947841777 4318964249 ssse3 7951922722 6112191941 avx2 13269714358 11043200912 fastest avx2 avx2 Example of `fletcher_4_bench` running on `Intel(R) Xeon Phi(TM) CPU 7210 @ 1.30GHz`: implementation native byteswap scalar 1291115967 1031555336 sse2 2539571138 1280970926 ssse3 2537778746 1080016762 avx2 4950749767 1078493449 avx512f 9581379998 4010029046 fastest avx512f avx512f Signed-off-by: Gvozden Neskovic <neskovic@gmail.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4952
2016-07-12 18:50:54 +03:00
typedef struct fletcher_4_func {
Rework of fletcher_4 module - Benchmark memory block is increased to 128kiB to reflect real block sizes more accurately. Measurements include all three stages needed for checksum generation, i.e. `init()/compute()/fini()`. The inner loop is repeated multiple times to offset overhead of time function. - Fastest implementation selects native and byteswap methods independently in benchmark. To support this new function pointers `init_byteswap()/fini_byteswap()` are introduced. - Implementation mutex lock is replaced by atomic variable. - To save time, benchmark is not executed in userspace. Instead, highest supported implementation is used for fastest. Default userspace selector is still 'cycle'. - `fletcher_4_native/byteswap()` methods use incremental methods to finish calculation if data size is not multiple of vector stride (currently 64B). - Added `fletcher_4_native_varsize()` special purpose method for use when buffer size is not known in advance. The method does not enforce 4B alignment on buffer size, and will ignore last (size % 4) bytes of the data buffer. - Benchmark `kstat` is changed to match the one of vdev_raidz. It now shows throughput for all supported implementations (in B/s), native and byteswap, as well as the code [fastest] is running. Example of `fletcher_4_bench` running on `Intel(R) Xeon(R) CPU E5-2660 v3 @ 2.60GHz`: implementation native byteswap scalar 4768120823 3426105750 sse2 7947841777 4318964249 ssse3 7951922722 6112191941 avx2 13269714358 11043200912 fastest avx2 avx2 Example of `fletcher_4_bench` running on `Intel(R) Xeon Phi(TM) CPU 7210 @ 1.30GHz`: implementation native byteswap scalar 1291115967 1031555336 sse2 2539571138 1280970926 ssse3 2537778746 1080016762 avx2 4950749767 1078493449 avx512f 9581379998 4010029046 fastest avx512f avx512f Signed-off-by: Gvozden Neskovic <neskovic@gmail.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4952
2016-07-12 18:50:54 +03:00
fletcher_4_init_f init_native;
fletcher_4_fini_f fini_native;
fletcher_4_compute_f compute_native;
fletcher_4_init_f init_byteswap;
fletcher_4_fini_f fini_byteswap;
fletcher_4_compute_f compute_byteswap;
boolean_t (*valid)(void);
boolean_t uses_fpu;
const char *name;
} __attribute__((aligned(64))) fletcher_4_ops_t;
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar_ops;
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar4_ops;
#if defined(HAVE_SSE2)
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_sse2_ops;
#endif
#if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_ssse3_ops;
#endif
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_avx2_ops;
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_avx512f_ops;
#endif
#if defined(__x86_64) && defined(HAVE_AVX512BW)
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_avx512bw_ops;
#endif
#if defined(__aarch64__)
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_aarch64_neon_ops;
#endif
#ifdef __cplusplus
}
#endif
#endif /* _ZFS_FLETCHER_H */