Support for vectorized algorithms on x86

This is initial support for x86 vectorized implementations of ZFS parity
and checksum algorithms.

For the compilation phase, configure step checks if toolchain supports relevant
instruction sets. Each implementation must ensure that the code is not passed
to compiler if relevant instruction set is not supported. For this purpose,
following new defines are provided if instruction set is supported:
	- HAVE_SSE,
	- HAVE_SSE2,
	- HAVE_SSE3,
	- HAVE_SSSE3,
	- HAVE_SSE4_1,
	- HAVE_SSE4_2,
	- HAVE_AVX,
	- HAVE_AVX2.

For detecting if an instruction set can be used in runtime, following functions
are provided in (include/linux/simd_x86.h):
	- zfs_sse_available()
	- zfs_sse2_available()
	- zfs_sse3_available()
	- zfs_ssse3_available()
	- zfs_sse4_1_available()
	- zfs_sse4_2_available()
	- zfs_avx_available()
	- zfs_avx2_available()
	- zfs_bmi1_available()
	- zfs_bmi2_available()

These function should be called once, on module load, or initialization.
They are safe to use from user and kernel space.
If an implementation is using more than single instruction set, both compiler
and runtime support for all relevant instruction sets should be checked.

Kernel fpu methods:
	- kfpu_begin()
	- kfpu_end()

Use __get_cpuid_max and __cpuid_count from <cpuid.h>
Both gcc and clang have support for these. They also handle ebx register
in case it is used for PIC code.

Signed-off-by: Gvozden Neskovic <neskovic@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Chunwei Chen <tuxoko@gmail.com>
Closes #4381
This commit is contained in:
Gvozden Neskovic 2016-02-29 19:42:27 +01:00 committed by Brian Behlendorf
parent e853ba3519
commit fc0c72b167
6 changed files with 583 additions and 2 deletions

18
config/kernel-fpu.m4 Normal file
View File

@ -0,0 +1,18 @@
dnl #
dnl # 4.2 API change
dnl # asm/i387.h is replaced by asm/fpu/api.h
dnl #
AC_DEFUN([ZFS_AC_KERNEL_FPU], [
AC_MSG_CHECKING([whether asm/fpu/api.h exists])
ZFS_LINUX_TRY_COMPILE([
#include <linux/kernel.h>
#include <asm/fpu/api.h>
],[
__kernel_fpu_begin();
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_FPU_API_H, 1, [kernel has <asm/fpu/api.h> interface])
],[
AC_MSG_RESULT(no)
])
])

View File

@ -1,5 +1,5 @@
dnl #
dnl # Default ZFS kernel configuration
dnl # Default ZFS kernel configuration
dnl #
AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
ZFS_AC_KERNEL
@ -91,6 +91,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
ZFS_AC_KERNEL_FOLLOW_DOWN_ONE
ZFS_AC_KERNEL_MAKE_REQUEST_FN
ZFS_AC_KERNEL_GENERIC_IO_ACCT
ZFS_AC_KERNEL_FPU
AS_IF([test "$LINUX_OBJ" != "$LINUX"], [
KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ"

172
config/toolchain-simd.m4 Normal file
View File

@ -0,0 +1,172 @@
dnl #
dnl # Checks if host toolchain supports SIMD instructions
dnl #
AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD], [
case "$host_cpu" in
x86_64 | x86 | i686)
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE2
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE3
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSSE3
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_1
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_2
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX2
;;
esac
])
dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE
dnl #
AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE], [
AC_MSG_CHECKING([whether host toolchain supports SSE])
AC_LINK_IFELSE([AC_LANG_SOURCE([[
void main()
{
__asm__ __volatile__("xorps %xmm0, %xmm1");
}
]])], [
AC_DEFINE([HAVE_SSE], 1, [Define if host toolchain supports SSE])
AC_MSG_RESULT([yes])
], [
AC_MSG_RESULT([no])
])
])
dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE2
dnl #
AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE2], [
AC_MSG_CHECKING([whether host toolchain supports SSE2])
AC_LINK_IFELSE([AC_LANG_SOURCE([[
void main()
{
__asm__ __volatile__("pxor %xmm0, %xmm1");
}
]])], [
AC_DEFINE([HAVE_SSE2], 1, [Define if host toolchain supports SSE2])
AC_MSG_RESULT([yes])
], [
AC_MSG_RESULT([no])
])
])
dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE3
dnl #
AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE3], [
AC_MSG_CHECKING([whether host toolchain supports SSE3])
AC_LINK_IFELSE([AC_LANG_SOURCE([[
void main()
{
char v[16];
__asm__ __volatile__("lddqu %0,%%xmm0" :: "m"(v[0]));
}
]])], [
AC_DEFINE([HAVE_SSE3], 1, [Define if host toolchain supports SSE3])
AC_MSG_RESULT([yes])
], [
AC_MSG_RESULT([no])
])
])
dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSSE3
dnl #
AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSSE3], [
AC_MSG_CHECKING([whether host toolchain supports SSSE3])
AC_LINK_IFELSE([AC_LANG_SOURCE([[
void main()
{
__asm__ __volatile__("pshufb %xmm0,%xmm1");
}
]])], [
AC_DEFINE([HAVE_SSSE3], 1, [Define if host toolchain supports SSSE3])
AC_MSG_RESULT([yes])
], [
AC_MSG_RESULT([no])
])
])
dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_1
dnl #
AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_1], [
AC_MSG_CHECKING([whether host toolchain supports SSE4.1])
AC_LINK_IFELSE([AC_LANG_SOURCE([[
void main()
{
__asm__ __volatile__("pmaxsb %xmm0,%xmm1");
}
]])], [
AC_DEFINE([HAVE_SSE4_1], 1, [Define if host toolchain supports SSE4.1])
AC_MSG_RESULT([yes])
], [
AC_MSG_RESULT([no])
])
])
dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_2
dnl #
AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_2], [
AC_MSG_CHECKING([whether host toolchain supports SSE4.2])
AC_LINK_IFELSE([AC_LANG_SOURCE([[
void main()
{
__asm__ __volatile__("pcmpgtq %xmm0, %xmm1");
}
]])], [
AC_DEFINE([HAVE_SSE4_2], 1, [Define if host toolchain supports SSE4.2])
AC_MSG_RESULT([yes])
], [
AC_MSG_RESULT([no])
])
])
dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX
dnl #
AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX], [
AC_MSG_CHECKING([whether host toolchain supports AVX])
AC_LINK_IFELSE([AC_LANG_SOURCE([[
void main()
{
char v[32];
__asm__ __volatile__("vmovdqa %0,%%ymm0" :: "m"(v[0]));
}
]])], [
AC_MSG_RESULT([yes])
AC_DEFINE([HAVE_AVX], 1, [Define if host toolchain supports AVX])
], [
AC_MSG_RESULT([no])
])
])
dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX2
dnl #
AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX2], [
AC_MSG_CHECKING([whether host toolchain supports AVX2])
AC_LINK_IFELSE([AC_LANG_SOURCE([
[
void main()
{
__asm__ __volatile__("vpshufb %ymm0,%ymm1,%ymm2");
}
]])], [
AC_MSG_RESULT([yes])
AC_DEFINE([HAVE_AVX2], 1, [Define if host toolchain supports AVX2])
], [
AC_MSG_RESULT([no])
])
])

View File

@ -63,6 +63,7 @@ AC_DEFUN([ZFS_AC_DEBUG_DMU_TX], [
AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [
ZFS_AC_CONFIG_ALWAYS_NO_UNUSED_BUT_SET_VARIABLE
ZFS_AC_CONFIG_ALWAYS_NO_BOOL_COMPARE
ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD
])
AC_DEFUN([ZFS_AC_CONFIG], [

View File

@ -6,7 +6,8 @@ KERNEL_H = \
$(top_srcdir)/include/linux/vfs_compat.h \
$(top_srcdir)/include/linux/blkdev_compat.h \
$(top_srcdir)/include/linux/utsname_compat.h \
$(top_srcdir)/include/linux/kmap_compat.h
$(top_srcdir)/include/linux/kmap_compat.h \
$(top_srcdir)/include/linux/simd_x86.h
USER_H =

388
include/linux/simd_x86.h Normal file
View File

@ -0,0 +1,388 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (C) 2016 Gvozden Neskovic <neskovic@compeng.uni-frankfurt.de>.
*/
/*
* USER API:
*
* Kernel fpu methods:
* kfpu_begin()
* kfpu_end()
*
* SIMD support:
*
* Following functions should be called to determine whether CPU feature
* is supported. All functions are usable in kernel and user space.
* If a SIMD algorithm is using more than one instruction set
* all relevant feature test functions should be called.
*
* Supported features:
* zfs_sse_available()
* zfs_sse2_available()
* zfs_sse3_available()
* zfs_ssse3_available()
* zfs_sse4_1_available()
* zfs_sse4_2_available()
* zfs_avx_available()
* zfs_avx2_available()
* zfs_bmi1_available()
* zfs_bmi2_available()
*/
#ifndef _SIMD_X86_H
#define _SIMD_X86_H
#include <sys/isa_defs.h>
/* only for __x86 */
#if defined(__x86)
#include <sys/types.h>
#if defined(_KERNEL)
#include <asm/cpufeature.h>
#else
#include <cpuid.h>
#endif
#if defined(_KERNEL)
#if defined(HAVE_FPU_API_H)
#include <asm/fpu/api.h>
#include <asm/fpu/internal.h>
#define kfpu_begin() \
{ \
preempt_disable(); \
__kernel_fpu_begin(); \
}
#define kfpu_end() \
{ \
__kernel_fpu_end(); \
preempt_enable(); \
}
#else
#include <asm/i387.h>
#include <asm/xcr.h>
#define kfpu_begin() kernel_fpu_begin()
#define kfpu_end() kernel_fpu_end()
#endif /* defined(HAVE_FPU_API_H) */
#else
/*
* fpu dummy methods for userspace
*/
#define kfpu_begin() do {} while (0)
#define kfpu_end() do {} while (0)
#endif /* defined(_KERNEL) */
/*
* CPUID feature tests for user-space. Linux kernel provides an interface for
* CPU feature testing.
*/
#if !defined(_KERNEL)
/*
* x86 registers used implicitly by CPUID
*/
typedef enum cpuid_regs {
EAX = 0,
EBX,
ECX,
EDX,
CPUID_REG_CNT = 4
} cpuid_regs_t;
/*
* List of instruction sets identified by CPUID
*/
typedef enum cpuid_inst_sets {
SSE = 0,
SSE2,
SSE3,
SSSE3,
SSE4_1,
SSE4_2,
OSXSAVE,
AVX,
AVX2,
BMI1,
BMI2
} cpuid_inst_sets_t;
/*
* Instruction set descriptor.
*/
typedef struct cpuid_feature_desc {
uint32_t leaf; /* CPUID leaf */
uint32_t subleaf; /* CPUID subleaf */
uint32_t flag; /* bit mask of the feature */
cpuid_regs_t reg; /* which CPUID return register to test */
} cpuid_feature_desc_t;
/*
* Descriptions of supported instruction sets
*/
static const cpuid_feature_desc_t cpuid_features[] = {
[SSE] = {1U, 0U, 1U << 25, EDX },
[SSE2] = {1U, 0U, 1U << 26, EDX },
[SSE3] = {1U, 0U, 1U << 0, ECX },
[SSSE3] = {1U, 0U, 1U << 9, ECX },
[SSE4_1] = {1U, 0U, 1U << 19, ECX },
[SSE4_2] = {1U, 0U, 1U << 20, ECX },
[OSXSAVE] = {1U, 0U, 1U << 27, ECX },
[AVX] = {1U, 0U, 1U << 28, ECX },
[AVX2] = {7U, 0U, 1U << 5, EBX },
[BMI1] = {7U, 0U, 1U << 3, EBX },
[BMI2] = {7U, 0U, 1U << 8, EBX }
};
/*
* Check if OS supports AVX and AVX2 by checking XCR0
* Only call this function if CPUID indicates that AVX feature is
* supported by the CPU, otherwise it might be an illegal instruction.
*/
static inline uint64_t
xgetbv(uint32_t index)
{
uint32_t eax, edx;
/* xgetbv - instruction byte code */
__asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0"
: "=a" (eax), "=d" (edx)
: "c" (index));
return ((((uint64_t)edx)<<32) | (uint64_t)eax);
}
/*
* Check if CPU supports a feature
*/
static inline boolean_t
__cpuid_check_feature(const cpuid_feature_desc_t *desc)
{
uint32_t r[CPUID_REG_CNT];
if (__get_cpuid_max(0, NULL) >= desc->leaf) {
/*
* __cpuid_count is needed to properly check
* for AVX2. It is a macro, so return parameters
* are passed by value.
*/
__cpuid_count(desc->leaf, desc->subleaf,
r[EAX], r[EBX], r[ECX], r[EDX]);
return (!!(r[desc->reg] & desc->flag));
}
return (B_FALSE);
}
#define CPUID_FEATURE_CHECK(name, id) \
static inline boolean_t \
__cpuid_has_ ## name(void)\
{ \
return (__cpuid_check_feature(&cpuid_features[id])); \
}
/*
* Define functions for user-space CPUID features testing
*/
CPUID_FEATURE_CHECK(sse, SSE);
CPUID_FEATURE_CHECK(sse2, SSE2);
CPUID_FEATURE_CHECK(sse3, SSE3);
CPUID_FEATURE_CHECK(ssse3, SSSE3);
CPUID_FEATURE_CHECK(sse4_1, SSE4_1);
CPUID_FEATURE_CHECK(sse4_2, SSE4_2);
CPUID_FEATURE_CHECK(avx, AVX);
CPUID_FEATURE_CHECK(avx2, AVX2);
CPUID_FEATURE_CHECK(osxsave, OSXSAVE);
CPUID_FEATURE_CHECK(bmi1, BMI1);
CPUID_FEATURE_CHECK(bmi2, BMI2);
#endif /* !defined(_KERNEL) */
/*
* Detect ymm register set support
*/
static inline boolean_t
__ymm_enabled(void)
{
static const uint64_t XSTATE_SSE_AVX = 0x2 | 0x4;
boolean_t has_osxsave;
uint64_t xcr0;
#if defined(_KERNEL) && defined(X86_FEATURE_OSXSAVE)
has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE);
#elif defined(_KERNEL) && !defined(X86_FEATURE_OSXSAVE)
has_osxsave = B_FALSE;
#else
has_osxsave = __cpuid_has_osxsave();
#endif
if (!has_osxsave)
return (B_FALSE);
xcr0 = xgetbv(0);
return ((xcr0 & XSTATE_SSE_AVX) == XSTATE_SSE_AVX);
}
/*
* Check if SSE instruction set is available
*/
static inline boolean_t
zfs_sse_available(void)
{
#if defined(_KERNEL)
return (!!boot_cpu_has(X86_FEATURE_XMM));
#else
return (__cpuid_has_sse());
#endif
}
/*
* Check if SSE2 instruction set is available
*/
static inline boolean_t
zfs_sse2_available(void)
{
#if defined(_KERNEL)
return (!!boot_cpu_has(X86_FEATURE_XMM2));
#else
return (__cpuid_has_sse2());
#endif
}
/*
* Check if SSE3 instruction set is available
*/
static inline boolean_t
zfs_sse3_available(void)
{
#if defined(_KERNEL)
return (!!boot_cpu_has(X86_FEATURE_XMM3));
#else
return (__cpuid_has_sse3());
#endif
}
/*
* Check if SSSE3 instruction set is available
*/
static inline boolean_t
zfs_ssse3_available(void)
{
#if defined(_KERNEL)
return (!!boot_cpu_has(X86_FEATURE_SSSE3));
#else
return (__cpuid_has_ssse3());
#endif
}
/*
* Check if SSE4.1 instruction set is available
*/
static inline boolean_t
zfs_sse4_1_available(void)
{
#if defined(_KERNEL)
return (!!boot_cpu_has(X86_FEATURE_XMM4_1));
#else
return (__cpuid_has_sse4_1());
#endif
}
/*
* Check if SSE4.2 instruction set is available
*/
static inline boolean_t
zfs_sse4_2_available(void)
{
#if defined(_KERNEL)
return (!!boot_cpu_has(X86_FEATURE_XMM4_2));
#else
return (__cpuid_has_sse4_2());
#endif
}
/*
* Check if AVX instruction set is available
*/
static inline boolean_t
zfs_avx_available(void)
{
boolean_t has_avx;
#if defined(_KERNEL)
has_avx = !!boot_cpu_has(X86_FEATURE_AVX);
#else
has_avx = __cpuid_has_avx();
#endif
return (has_avx && __ymm_enabled());
}
/*
* Check if AVX2 instruction set is available
*/
static inline boolean_t
zfs_avx2_available(void)
{
boolean_t has_avx2;
#if defined(_KERNEL) && defined(X86_FEATURE_AVX2)
has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2);
#elif defined(_KERNEL) && !defined(X86_FEATURE_AVX2)
has_avx2 = B_FALSE;
#else
has_avx2 = __cpuid_has_avx2();
#endif
return (has_avx2 && __ymm_enabled());
}
/*
* Check if BMI1 instruction set is available
*/
static inline boolean_t
zfs_bmi1_available(void)
{
#if defined(_KERNEL) && defined(X86_FEATURE_BMI1)
return (!!boot_cpu_has(X86_FEATURE_BMI1));
#elif defined(_KERNEL) && !defined(X86_FEATURE_BMI1)
return (B_FALSE);
#else
return (__cpuid_has_bmi1());
#endif
}
/*
* Check if BMI2 instruction set is available
*/
static inline boolean_t
zfs_bmi2_available(void)
{
#if defined(_KERNEL) && defined(X86_FEATURE_BMI2)
return (!!boot_cpu_has(X86_FEATURE_BMI2));
#elif defined(_KERNEL) && !defined(X86_FEATURE_BMI2)
return (B_FALSE);
#else
return (__cpuid_has_bmi2());
#endif
}
#endif /* defined(__x86) */
#endif /* _SIMD_X86_H */