mirror of
https://github.com/cuberite/libdeflate.git
synced 2025-08-03 09:46:04 -04:00
[EXPERIMENTAL] lib/x86/adler32: add an SSSE3 optimized Adler32 implementation
This commit is contained in:
parent
166084acaa
commit
cd92f3c704
@ -137,6 +137,9 @@ typedef size_t machine_word_t;
|
||||
#ifndef COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS
|
||||
# define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 0
|
||||
#endif
|
||||
#ifndef COMPILER_SUPPORTS_SSSE3_TARGET_INTRINSICS
|
||||
# define COMPILER_SUPPORTS_SSSE3_TARGET_INTRINSICS 0
|
||||
#endif
|
||||
#ifndef COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS
|
||||
# define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS 0
|
||||
#endif
|
||||
|
@ -84,6 +84,7 @@
|
||||
*/
|
||||
# if GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000)
|
||||
# define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 1
|
||||
# define COMPILER_SUPPORTS_SSSE3_TARGET_INTRINSICS 1
|
||||
# define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS \
|
||||
COMPILER_SUPPORTS_PCLMUL_TARGET
|
||||
# define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS \
|
||||
|
@ -216,6 +216,69 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
|
||||
# include "../adler32_vec_template.h"
|
||||
#endif /* AVX2 implementation */
|
||||
|
||||
/* SSSE3 implementation */
|
||||
#undef DISPATCH_SSSE3
|
||||
#if !defined(DEFAULT_IMPL) && \
|
||||
(defined(__SSSE3__) || (X86_CPU_FEATURES_ENABLED && \
|
||||
COMPILER_SUPPORTS_SSSE3_TARGET_INTRINSICS))
|
||||
# define FUNCNAME adler32_ssse3
|
||||
# define FUNCNAME_CHUNK adler32_ssse3_chunk
|
||||
# define IMPL_ALIGNMENT 16
|
||||
# define IMPL_SEGMENT_SIZE 16
|
||||
# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE
|
||||
# ifdef __SSSE3__
|
||||
# define ATTRIBUTES
|
||||
# define DEFAULT_IMPL adler32_ssse3
|
||||
# else
|
||||
# define ATTRIBUTES __attribute__((target("ssse3")))
|
||||
# define DISPATCH 1
|
||||
# define DISPATCH_SSSE3 1
|
||||
# endif
|
||||
# include <tmmintrin.h>
|
||||
static forceinline ATTRIBUTES void
|
||||
adler32_ssse3_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
|
||||
{
|
||||
const __m128i zeroes = _mm_setzero_si128();
|
||||
const __v16qu multipliers = (__v16qu){
|
||||
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
|
||||
};
|
||||
const __v8hu ones = (__v8hu)_mm_set1_epi16(1);
|
||||
__v4su v_s1 = (__v4su)zeroes;
|
||||
__v4su v_s1_sums = (__v4su)zeroes;
|
||||
__v4su v_s2 = (__v4su)zeroes;
|
||||
|
||||
do {
|
||||
/* Load the next 16-byte segment */
|
||||
__m128i bytes = *p++;
|
||||
|
||||
/* Multiply the bytes by 16...1 (the number of times they need
|
||||
* to be added to s2) and add adjacent products */
|
||||
__v8hi sums = (__v8hi)_mm_maddubs_epi16(
|
||||
bytes, (__m128i)multipliers);
|
||||
|
||||
/* Keep sum of all previous s1 counters, for adding to s2 later.
|
||||
* This allows delaying the multiplication by 16 to the end. */
|
||||
v_s1_sums += v_s1;
|
||||
|
||||
/* Add the sum of each group of 8 bytes to the corresponding s1
|
||||
* counter */
|
||||
v_s1 += (__v4si)_mm_sad_epu8(bytes, zeroes);
|
||||
|
||||
/* Add the sum of each group of 4 products of the bytes by
|
||||
* 16...1 to the corresponding s2 counter */
|
||||
v_s2 += (__v4si)_mm_madd_epi16((__m128i)sums, (__m128i)ones);
|
||||
} while (p != end);
|
||||
|
||||
/* Finish the s2 counters by adding the sum of the s1 values at the
|
||||
* beginning of each segment, multiplied by the segment size (16) */
|
||||
v_s2 += (__v4si)_mm_slli_epi32((__m128i)v_s1_sums, 4);
|
||||
|
||||
/* Add the counters to the real s1 and s2 */
|
||||
ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2);
|
||||
}
|
||||
# include "../adler32_vec_template.h"
|
||||
#endif /* SSSE3 implementation */
|
||||
|
||||
/* SSE2 implementation */
|
||||
#undef DISPATCH_SSE2
|
||||
#if !defined(DEFAULT_IMPL) && \
|
||||
@ -323,6 +386,10 @@ arch_select_adler32_func(void)
|
||||
if (features & X86_CPU_FEATURE_AVX2)
|
||||
return adler32_avx2;
|
||||
#endif
|
||||
#ifdef DISPATCH_SSSE3
|
||||
if (features & X86_CPU_FEATURE_SSSE3)
|
||||
return adler32_ssse3;
|
||||
#endif
|
||||
#ifdef DISPATCH_SSE2
|
||||
if (features & X86_CPU_FEATURE_SSE2)
|
||||
return adler32_sse2;
|
||||
|
@ -78,6 +78,7 @@ read_xcr(u32 index)
|
||||
|
||||
static const struct cpu_feature x86_cpu_feature_table[] = {
|
||||
{X86_CPU_FEATURE_SSE2, "sse2"},
|
||||
{X86_CPU_FEATURE_SSSE3, "ssse3"},
|
||||
{X86_CPU_FEATURE_PCLMUL, "pclmul"},
|
||||
{X86_CPU_FEATURE_AVX, "avx"},
|
||||
{X86_CPU_FEATURE_AVX2, "avx2"},
|
||||
@ -109,6 +110,9 @@ void setup_cpu_features(void)
|
||||
if (IS_SET(features_2, 1))
|
||||
features |= X86_CPU_FEATURE_PCLMUL;
|
||||
|
||||
if (IS_SET(features_2, 9))
|
||||
features |= X86_CPU_FEATURE_SSSE3;
|
||||
|
||||
if (IS_SET(features_2, 27)) { /* OSXSAVE set? */
|
||||
u64 xcr0 = read_xcr(0);
|
||||
|
||||
|
@ -17,11 +17,12 @@
|
||||
#if X86_CPU_FEATURES_ENABLED
|
||||
|
||||
#define X86_CPU_FEATURE_SSE2 0x00000001
|
||||
#define X86_CPU_FEATURE_PCLMUL 0x00000002
|
||||
#define X86_CPU_FEATURE_AVX 0x00000004
|
||||
#define X86_CPU_FEATURE_AVX2 0x00000008
|
||||
#define X86_CPU_FEATURE_BMI2 0x00000010
|
||||
#define X86_CPU_FEATURE_AVX512BW 0x00000020
|
||||
#define X86_CPU_FEATURE_SSSE3 0x00000002
|
||||
#define X86_CPU_FEATURE_PCLMUL 0x00000004
|
||||
#define X86_CPU_FEATURE_AVX 0x00000008
|
||||
#define X86_CPU_FEATURE_AVX2 0x00000010
|
||||
#define X86_CPU_FEATURE_BMI2 0x00000020
|
||||
#define X86_CPU_FEATURE_AVX512BW 0x00000040
|
||||
|
||||
#define X86_CPU_FEATURES_KNOWN 0x80000000
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user