[EXPERIMENTAL] lib/x86/adler32: add an SSSE3 optimized Adler32 implementation

This commit is contained in:
Eric Biggers 2018-12-26 15:37:57 -06:00
parent 166084acaa
commit cd92f3c704
5 changed files with 81 additions and 5 deletions

View File

@ -137,6 +137,9 @@ typedef size_t machine_word_t;
#ifndef COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_SSSE3_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_SSSE3_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS 0
#endif

View File

@ -84,6 +84,7 @@
*/
# if GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000)
# define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 1
# define COMPILER_SUPPORTS_SSSE3_TARGET_INTRINSICS 1
# define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS \
COMPILER_SUPPORTS_PCLMUL_TARGET
# define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS \

View File

@ -216,6 +216,69 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
# include "../adler32_vec_template.h"
#endif /* AVX2 implementation */
/* SSSE3 implementation */
#undef DISPATCH_SSSE3
#if !defined(DEFAULT_IMPL) && \
(defined(__SSSE3__) || (X86_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_SSSE3_TARGET_INTRINSICS))
# define FUNCNAME adler32_ssse3
# define FUNCNAME_CHUNK adler32_ssse3_chunk
# define IMPL_ALIGNMENT 16
# define IMPL_SEGMENT_SIZE 16
# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE
# ifdef __SSSE3__
# define ATTRIBUTES
# define DEFAULT_IMPL adler32_ssse3
# else
# define ATTRIBUTES __attribute__((target("ssse3")))
# define DISPATCH 1
# define DISPATCH_SSSE3 1
# endif
# include <tmmintrin.h>
static forceinline ATTRIBUTES void
adler32_ssse3_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
{
const __m128i zeroes = _mm_setzero_si128();
const __v16qu multipliers = (__v16qu){
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
};
const __v8hu ones = (__v8hu)_mm_set1_epi16(1);
__v4su v_s1 = (__v4su)zeroes;
__v4su v_s1_sums = (__v4su)zeroes;
__v4su v_s2 = (__v4su)zeroes;
do {
/* Load the next 16-byte segment */
__m128i bytes = *p++;
/* Multiply the bytes by 16...1 (the number of times they need
* to be added to s2) and add adjacent products */
__v8hi sums = (__v8hi)_mm_maddubs_epi16(
bytes, (__m128i)multipliers);
/* Keep sum of all previous s1 counters, for adding to s2 later.
* This allows delaying the multiplication by 16 to the end. */
v_s1_sums += v_s1;
/* Add the sum of each group of 8 bytes to the corresponding s1
* counter */
v_s1 += (__v4si)_mm_sad_epu8(bytes, zeroes);
/* Add the sum of each group of 4 products of the bytes by
* 16...1 to the corresponding s2 counter */
v_s2 += (__v4si)_mm_madd_epi16((__m128i)sums, (__m128i)ones);
} while (p != end);
/* Finish the s2 counters by adding the sum of the s1 values at the
* beginning of each segment, multiplied by the segment size (16) */
v_s2 += (__v4si)_mm_slli_epi32((__m128i)v_s1_sums, 4);
/* Add the counters to the real s1 and s2 */
ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2);
}
# include "../adler32_vec_template.h"
#endif /* SSSE3 implementation */
/* SSE2 implementation */
#undef DISPATCH_SSE2
#if !defined(DEFAULT_IMPL) && \
@ -323,6 +386,10 @@ arch_select_adler32_func(void)
if (features & X86_CPU_FEATURE_AVX2)
return adler32_avx2;
#endif
#ifdef DISPATCH_SSSE3
if (features & X86_CPU_FEATURE_SSSE3)
return adler32_ssse3;
#endif
#ifdef DISPATCH_SSE2
if (features & X86_CPU_FEATURE_SSE2)
return adler32_sse2;

View File

@ -78,6 +78,7 @@ read_xcr(u32 index)
static const struct cpu_feature x86_cpu_feature_table[] = {
{X86_CPU_FEATURE_SSE2, "sse2"},
{X86_CPU_FEATURE_SSSE3, "ssse3"},
{X86_CPU_FEATURE_PCLMUL, "pclmul"},
{X86_CPU_FEATURE_AVX, "avx"},
{X86_CPU_FEATURE_AVX2, "avx2"},
@ -109,6 +110,9 @@ void setup_cpu_features(void)
if (IS_SET(features_2, 1))
features |= X86_CPU_FEATURE_PCLMUL;
if (IS_SET(features_2, 9))
features |= X86_CPU_FEATURE_SSSE3;
if (IS_SET(features_2, 27)) { /* OSXSAVE set? */
u64 xcr0 = read_xcr(0);

View File

@ -17,11 +17,12 @@
#if X86_CPU_FEATURES_ENABLED
#define X86_CPU_FEATURE_SSE2 0x00000001
#define X86_CPU_FEATURE_PCLMUL 0x00000002
#define X86_CPU_FEATURE_AVX 0x00000004
#define X86_CPU_FEATURE_AVX2 0x00000008
#define X86_CPU_FEATURE_BMI2 0x00000010
#define X86_CPU_FEATURE_AVX512BW 0x00000020
#define X86_CPU_FEATURE_SSSE3 0x00000002
#define X86_CPU_FEATURE_PCLMUL 0x00000004
#define X86_CPU_FEATURE_AVX 0x00000008
#define X86_CPU_FEATURE_AVX2 0x00000010
#define X86_CPU_FEATURE_BMI2 0x00000020
#define X86_CPU_FEATURE_AVX512BW 0x00000040
#define X86_CPU_FEATURES_KNOWN 0x80000000