diff --git a/lib/arm/matchfinder_impl.h b/lib/arm/matchfinder_impl.h index 9e711cc..a4f2a50 100644 --- a/lib/arm/matchfinder_impl.h +++ b/lib/arm/matchfinder_impl.h @@ -25,9 +25,27 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#ifdef __ARM_NEON +#include "cpu_features.h" + +#undef DISPATCH_NEON +#if !defined(matchfinder_init_default) && \ + (defined(__ARM_NEON) || (ARM_CPU_FEATURES_ENABLED && \ + COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS)) +# ifdef __ARM_NEON +# define ATTRIBUTES +# define matchfinder_init_default matchfinder_init_neon +# define matchfinder_rebase_default matchfinder_rebase_neon +# else +# ifdef __arm__ +# define ATTRIBUTES __attribute__((target("fpu=neon"))) +# else +# define ATTRIBUTES __attribute__((target("+simd"))) +# endif +# define DISPATCH 1 +# define DISPATCH_NEON 1 +# endif # include -static forceinline void +static void ATTRIBUTES matchfinder_init_neon(mf_pos_t *data, size_t size) { int16x8_t *p = (int16x8_t *)data; @@ -50,9 +68,8 @@ matchfinder_init_neon(mf_pos_t *data, size_t size) size -= 4 * sizeof(*p); } while (size != 0); } -#define matchfinder_init matchfinder_init_neon -static forceinline void +static void ATTRIBUTES matchfinder_rebase_neon(mf_pos_t *data, size_t size) { int16x8_t *p = (int16x8_t *)data; @@ -76,6 +93,31 @@ matchfinder_rebase_neon(mf_pos_t *data, size_t size) size -= 4 * sizeof(*p); } while (size != 0); } -#define matchfinder_rebase matchfinder_rebase_neon +#undef ATTRIBUTES +#endif /* NEON implementation */ -#endif /* __ARM_NEON */ +#ifdef DISPATCH +static inline mf_init_func_t +arch_select_matchfinder_init_func(void) +{ + u32 features = get_cpu_features(); + +#ifdef DISPATCH_NEON + if (features & ARM_CPU_FEATURE_NEON) + return matchfinder_init_neon; +#endif + return NULL; +} + +static inline mf_rebase_func_t +arch_select_matchfinder_rebase_func(void) +{ + u32 features = get_cpu_features(); + +#ifdef DISPATCH_NEON + if (features & ARM_CPU_FEATURE_NEON) + return matchfinder_rebase_neon; +#endif + return NULL; +} +#endif /* DISPATCH */ diff --git a/lib/matchfinder_common.h b/lib/matchfinder_common.h index 49ff334..2a47c40 100644 --- a/lib/matchfinder_common.h +++ b/lib/matchfinder_common.h @@ -25,8 +25,10 @@ typedef s16 mf_pos_t; #define MATCHFINDER_MEM_ALIGNMENT 32 #define MATCHFINDER_SIZE_ALIGNMENT 128 -#undef matchfinder_init -#undef matchfinder_rebase +typedef void (*mf_init_func_t)(mf_pos_t *data, size_t size); +typedef void (*mf_rebase_func_t)(mf_pos_t *data, size_t size); + +#undef DISPATCH #ifdef _aligned_attribute # if defined(__arm__) || defined(__aarch64__) # include "arm/matchfinder_impl.h" @@ -35,6 +37,91 @@ typedef s16 mf_pos_t; # endif #endif +#ifndef matchfinder_init_default +static inline void +matchfinder_init_default(mf_pos_t *data, size_t size) +{ + size_t num_entries = size / sizeof(*data); + size_t i; + + for (i = 0; i < num_entries; i++) + data[i] = MATCHFINDER_INITVAL; +} + +static inline void +matchfinder_rebase_default(mf_pos_t *data, size_t size) +{ + size_t num_entries = size / sizeof(*data); + size_t i; + + if (MATCHFINDER_WINDOW_SIZE == 32768) { + /* + * Branchless version for 32768 byte windows. If the value was + * already negative, clear all bits except the sign bit; this + * changes the value to -32768. Otherwise, set the sign bit; + * this is equivalent to subtracting 32768. + */ + for (i = 0; i < num_entries; i++) { + u16 v = data[i]; + u16 sign_bit = v & 0x8000; + + v &= sign_bit - ((sign_bit >> 15) ^ 1); + v |= 0x8000; + data[i] = v; + } + return; + } + + for (i = 0; i < num_entries; i++) { + if (data[i] >= 0) + data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; + else + data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; + } +} +#endif /* !matchfinder_init_default */ + +#ifdef DISPATCH +static void dispatch_matchfinder_init(mf_pos_t *data, size_t size); +static void dispatch_matchfinder_rebase(mf_pos_t *data, size_t size); + +static volatile mf_init_func_t matchfinder_init_impl = + dispatch_matchfinder_init; +static volatile mf_rebase_func_t matchfinder_rebase_impl = + dispatch_matchfinder_rebase; + +/* Choose the fastest implementation at runtime */ +static void +dispatch_matchfinder_init(mf_pos_t *data, size_t size) +{ + mf_init_func_t f = arch_select_matchfinder_init_func(); + + if (f == NULL) + f = matchfinder_init_default; + + matchfinder_init_impl = f; + (*f)(data, size); +} + +/* Choose the fastest implementation at runtime */ +static void +dispatch_matchfinder_rebase(mf_pos_t *data, size_t size) +{ + mf_rebase_func_t f = arch_select_matchfinder_rebase_func(); + + if (f == NULL) + f = matchfinder_rebase_default; + + matchfinder_rebase_impl = f; + (*f)(data, size); +} +#else + /* only one implementation, use it */ +# define matchfinder_init_impl matchfinder_init_default +# define matchfinder_rebase_impl matchfinder_rebase_default +#endif +#undef DISPATCH + /* * Initialize the hash table portion of the matchfinder. * @@ -43,17 +130,11 @@ typedef s16 mf_pos_t; * 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and * 'size' must be a multiple of MATCHFINDER_SIZE_ALIGNMENT. */ -#ifndef matchfinder_init static forceinline void matchfinder_init(mf_pos_t *data, size_t size) { - size_t num_entries = size / sizeof(*data); - size_t i; - - for (i = 0; i < num_entries; i++) - data[i] = MATCHFINDER_INITVAL; + matchfinder_init_impl(data, size); } -#endif /* * Slide the matchfinder by WINDOW_SIZE bytes. @@ -79,36 +160,11 @@ matchfinder_init(mf_pos_t *data, size_t size) * 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and * 'size' must be a multiple of MATCHFINDER_SIZE_ALIGNMENT. */ -#ifndef matchfinder_rebase static forceinline void matchfinder_rebase(mf_pos_t *data, size_t size) { - size_t num_entries = size / sizeof(*data); - size_t i; - - if (MATCHFINDER_WINDOW_SIZE == 32768) { - /* Branchless version for 32768 byte windows. If the value was - * already negative, clear all bits except the sign bit; this - * changes the value to -32768. Otherwise, set the sign bit; - * this is equivalent to subtracting 32768. */ - for (i = 0; i < num_entries; i++) { - u16 v = data[i]; - u16 sign_bit = v & 0x8000; - v &= sign_bit - ((sign_bit >> 15) ^ 1); - v |= 0x8000; - data[i] = v; - } - return; - } - - for (i = 0; i < num_entries; i++) { - if (data[i] >= 0) - data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; - else - data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; - } + matchfinder_rebase_impl(data, size); } -#endif /* * The hash function: given a sequence prefix held in the low-order bits of a diff --git a/lib/x86/matchfinder_impl.h b/lib/x86/matchfinder_impl.h index 22a6c82..25dbc59 100644 --- a/lib/x86/matchfinder_impl.h +++ b/lib/x86/matchfinder_impl.h @@ -25,9 +25,23 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#ifdef __AVX2__ +#include "cpu_features.h" + +#undef DISPATCH_AVX2 +#if !defined(matchfinder_init_default) && \ + (defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED && \ + COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS)) +# ifdef __AVX2__ +# define ATTRIBUTES +# define matchfinder_init_default matchfinder_init_avx2 +# define matchfinder_rebase_default matchfinder_rebase_avx2 +# else +# define ATTRIBUTES __attribute__((target("avx2"))) +# define DISPATCH 1 +# define DISPATCH_AVX2 1 +# endif # include -static forceinline void +static void ATTRIBUTES matchfinder_init_avx2(mf_pos_t *data, size_t size) { __m256i *p = (__m256i *)data; @@ -46,9 +60,8 @@ matchfinder_init_avx2(mf_pos_t *data, size_t size) size -= 4 * sizeof(*p); } while (size != 0); } -#define matchfinder_init matchfinder_init_avx2 -static forceinline void +static void ATTRIBUTES matchfinder_rebase_avx2(mf_pos_t *data, size_t size) { __m256i *p = (__m256i *)data; @@ -68,11 +81,24 @@ matchfinder_rebase_avx2(mf_pos_t *data, size_t size) size -= 4 * sizeof(*p); } while (size != 0); } -#define matchfinder_rebase matchfinder_rebase_avx2 +#undef ATTRIBUTES +#endif /* AVX2 implementation */ -#elif defined(__SSE2__) +#undef DISPATCH_SSE2 +#if !defined(matchfinder_init_default) && \ + (defined(__SSE2__) || (X86_CPU_FEATURES_ENABLED && \ + COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS)) +# ifdef __SSE2__ +# define ATTRIBUTES +# define matchfinder_init_default matchfinder_init_sse2 +# define matchfinder_rebase_default matchfinder_rebase_sse2 +# else +# define ATTRIBUTES __attribute__((target("sse2"))) +# define DISPATCH 1 +# define DISPATCH_SSE2 1 +# endif # include -static forceinline void +static void ATTRIBUTES matchfinder_init_sse2(mf_pos_t *data, size_t size) { __m128i *p = (__m128i *)data; @@ -91,9 +117,8 @@ matchfinder_init_sse2(mf_pos_t *data, size_t size) size -= 4 * sizeof(*p); } while (size != 0); } -#define matchfinder_init matchfinder_init_sse2 -static forceinline void +static void ATTRIBUTES matchfinder_rebase_sse2(mf_pos_t *data, size_t size) { __m128i *p = (__m128i *)data; @@ -113,5 +138,39 @@ matchfinder_rebase_sse2(mf_pos_t *data, size_t size) size -= 4 * sizeof(*p); } while (size != 0); } -#define matchfinder_rebase matchfinder_rebase_sse2 -#endif /* __SSE2__ */ +#undef ATTRIBUTES +#endif /* SSE2 implementation */ + +#ifdef DISPATCH +static inline mf_init_func_t +arch_select_matchfinder_init_func(void) +{ + u32 features = get_cpu_features(); + +#ifdef DISPATCH_AVX2 + if (features & X86_CPU_FEATURE_AVX2) + return matchfinder_init_avx2; +#endif +#ifdef DISPATCH_SSE2 + if (features & X86_CPU_FEATURE_SSE2) + return matchfinder_init_sse2; +#endif + return NULL; +} + +static inline mf_rebase_func_t +arch_select_matchfinder_rebase_func(void) +{ + u32 features = get_cpu_features(); + +#ifdef DISPATCH_AVX2 + if (features & X86_CPU_FEATURE_AVX2) + return matchfinder_rebase_avx2; +#endif +#ifdef DISPATCH_SSE2 + if (features & X86_CPU_FEATURE_SSE2) + return matchfinder_rebase_sse2; +#endif + return NULL; +} +#endif /* DISPATCH */