diff --git a/common/common_defs.h b/common/common_defs.h index d56c5cf..6d8a3a7 100644 --- a/common/common_defs.h +++ b/common/common_defs.h @@ -137,6 +137,9 @@ typedef size_t machine_word_t; #ifndef COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS # define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 0 #endif +#ifndef COMPILER_SUPPORTS_SSSE3_TARGET_INTRINSICS +# define COMPILER_SUPPORTS_SSSE3_TARGET_INTRINSICS 0 +#endif #ifndef COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS # define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS 0 #endif diff --git a/common/compiler_gcc.h b/common/compiler_gcc.h index 8259946..9b3e168 100644 --- a/common/compiler_gcc.h +++ b/common/compiler_gcc.h @@ -84,6 +84,7 @@ */ # if GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000) # define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 1 +# define COMPILER_SUPPORTS_SSSE3_TARGET_INTRINSICS 1 # define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS \ COMPILER_SUPPORTS_PCLMUL_TARGET # define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS \ diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index 4b88c01..95934e8 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -216,6 +216,69 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) # include "../adler32_vec_template.h" #endif /* AVX2 implementation */ +/* SSSE3 implementation */ +#undef DISPATCH_SSSE3 +#if !defined(DEFAULT_IMPL) && \ + (defined(__SSSE3__) || (X86_CPU_FEATURES_ENABLED && \ + COMPILER_SUPPORTS_SSSE3_TARGET_INTRINSICS)) +# define FUNCNAME adler32_ssse3 +# define FUNCNAME_CHUNK adler32_ssse3_chunk +# define IMPL_ALIGNMENT 16 +# define IMPL_SEGMENT_SIZE 16 +# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE +# ifdef __SSSE3__ +# define ATTRIBUTES +# define DEFAULT_IMPL adler32_ssse3 +# else +# define ATTRIBUTES __attribute__((target("ssse3"))) +# define DISPATCH 1 +# define DISPATCH_SSSE3 1 +# endif +# include +static forceinline ATTRIBUTES void +adler32_ssse3_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) +{ + const __m128i zeroes = _mm_setzero_si128(); + const __v16qu multipliers = (__v16qu){ + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const __v8hu ones = (__v8hu)_mm_set1_epi16(1); + __v4su v_s1 = (__v4su)zeroes; + __v4su v_s1_sums = (__v4su)zeroes; + __v4su v_s2 = (__v4su)zeroes; + + do { + /* Load the next 16-byte segment */ + __m128i bytes = *p++; + + /* Multiply the bytes by 16...1 (the number of times they need + * to be added to s2) and add adjacent products */ + __v8hi sums = (__v8hi)_mm_maddubs_epi16( + bytes, (__m128i)multipliers); + + /* Keep sum of all previous s1 counters, for adding to s2 later. + * This allows delaying the multiplication by 16 to the end. */ + v_s1_sums += v_s1; + + /* Add the sum of each group of 8 bytes to the corresponding s1 + * counter */ + v_s1 += (__v4si)_mm_sad_epu8(bytes, zeroes); + + /* Add the sum of each group of 4 products of the bytes by + * 16...1 to the corresponding s2 counter */ + v_s2 += (__v4si)_mm_madd_epi16((__m128i)sums, (__m128i)ones); + } while (p != end); + + /* Finish the s2 counters by adding the sum of the s1 values at the + * beginning of each segment, multiplied by the segment size (16) */ + v_s2 += (__v4si)_mm_slli_epi32((__m128i)v_s1_sums, 4); + + /* Add the counters to the real s1 and s2 */ + ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2); +} +# include "../adler32_vec_template.h" +#endif /* SSSE3 implementation */ + /* SSE2 implementation */ #undef DISPATCH_SSE2 #if !defined(DEFAULT_IMPL) && \ @@ -323,6 +386,10 @@ arch_select_adler32_func(void) if (features & X86_CPU_FEATURE_AVX2) return adler32_avx2; #endif +#ifdef DISPATCH_SSSE3 + if (features & X86_CPU_FEATURE_SSSE3) + return adler32_ssse3; +#endif #ifdef DISPATCH_SSE2 if (features & X86_CPU_FEATURE_SSE2) return adler32_sse2; diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c index e3471d4..c68edd9 100644 --- a/lib/x86/cpu_features.c +++ b/lib/x86/cpu_features.c @@ -78,6 +78,7 @@ read_xcr(u32 index) static const struct cpu_feature x86_cpu_feature_table[] = { {X86_CPU_FEATURE_SSE2, "sse2"}, + {X86_CPU_FEATURE_SSSE3, "ssse3"}, {X86_CPU_FEATURE_PCLMUL, "pclmul"}, {X86_CPU_FEATURE_AVX, "avx"}, {X86_CPU_FEATURE_AVX2, "avx2"}, @@ -109,6 +110,9 @@ void setup_cpu_features(void) if (IS_SET(features_2, 1)) features |= X86_CPU_FEATURE_PCLMUL; + if (IS_SET(features_2, 9)) + features |= X86_CPU_FEATURE_SSSE3; + if (IS_SET(features_2, 27)) { /* OSXSAVE set? */ u64 xcr0 = read_xcr(0); diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index 4c02353..4e89c91 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -17,11 +17,12 @@ #if X86_CPU_FEATURES_ENABLED #define X86_CPU_FEATURE_SSE2 0x00000001 -#define X86_CPU_FEATURE_PCLMUL 0x00000002 -#define X86_CPU_FEATURE_AVX 0x00000004 -#define X86_CPU_FEATURE_AVX2 0x00000008 -#define X86_CPU_FEATURE_BMI2 0x00000010 -#define X86_CPU_FEATURE_AVX512BW 0x00000020 +#define X86_CPU_FEATURE_SSSE3 0x00000002 +#define X86_CPU_FEATURE_PCLMUL 0x00000004 +#define X86_CPU_FEATURE_AVX 0x00000008 +#define X86_CPU_FEATURE_AVX2 0x00000010 +#define X86_CPU_FEATURE_BMI2 0x00000020 +#define X86_CPU_FEATURE_AVX512BW 0x00000040 #define X86_CPU_FEATURES_KNOWN 0x80000000