Mirror of https://github.com/cuberite/libdeflate.git (synced 2025-09-10 12:58:30 -04:00)
lib/x86/adler32: add an AVX-512BW optimized Adler32 implementation
commit 73017f08e5
parent 5c80decb26
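For reference, Adler-32 keeps two running counters: s1 is 1 plus the sum of all input bytes, and s2 is the sum of every intermediate s1 value, both reduced modulo 65521; the checksum is (s2 << 16) | s1. Below is a minimal scalar sketch of that definition, not part of this patch — the function name, constant name, and per-byte reduction are illustrative only (the real code reduces far less often):

#include <stddef.h>
#include <stdint.h>

#define ADLER32_DIVISOR 65521	/* largest prime below 2^16 */

/* Illustrative scalar Adler-32: defines the s1/s2 counters that the
 * SSE2/AVX2/AVX-512BW chunk functions in this patch accumulate in vector
 * form before folding them back into the real s1 and s2. */
static uint32_t
adler32_scalar_sketch(uint32_t adler, const uint8_t *p, size_t len)
{
	uint32_t s1 = adler & 0xFFFF;	/* low 16 bits of the checksum */
	uint32_t s2 = adler >> 16;	/* high 16 bits of the checksum */

	while (len--) {
		s1 = (s1 + *p++) % ADLER32_DIVISOR;
		s2 = (s2 + s1) % ADLER32_DIVISOR;
	}
	return (s2 << 16) | s1;
}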
@@ -27,7 +27,132 @@
 #include "cpu_features.h"
 
-/* AVX2 implementation */
+/*
+ * The following macros horizontally sum the s1 counters and add them to the
+ * real s1, and likewise for s2.  They do this via a series of reductions, each
+ * of which halves the vector length, until just one counter remains.
+ *
+ * The s1 reductions don't depend on the s2 reductions and vice versa, so for
+ * efficiency they are interleaved.  Also, every other s1 counter is 0 due to
+ * the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than
+ * 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits.
+ */
+
+#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2)		\
+{									\
+	__v4si s1_last = (v_s1), s2_last = (v_s2);			\
+									\
+	/* 128 => 32 bits */						\
+	s2_last += (__v4si)_mm_shuffle_epi32((__m128i)s2_last, 0x31);	\
+	s1_last += (__v4si)_mm_shuffle_epi32((__m128i)s1_last, 0x02);	\
+	s2_last += (__v4si)_mm_shuffle_epi32((__m128i)s2_last, 0x02);	\
+									\
+	*(s1) += (u32)_mm_cvtsi128_si32((__m128i)s1_last);		\
+	*(s2) += (u32)_mm_cvtsi128_si32((__m128i)s2_last);		\
+}
+
+#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2)		\
+{									\
+	__v4si s1_128bit, s2_128bit;					\
+									\
+	/* 256 => 128 bits */						\
+	s1_128bit = (__v4si)_mm256_extracti128_si256((__m256i)(v_s1), 0) +	\
+		    (__v4si)_mm256_extracti128_si256((__m256i)(v_s1), 1);	\
+	s2_128bit = (__v4si)_mm256_extracti128_si256((__m256i)(v_s2), 0) +	\
+		    (__v4si)_mm256_extracti128_si256((__m256i)(v_s2), 1);	\
+									\
+	ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit);	\
+}
+
+#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2)		\
+{									\
+	__v8si s1_256bit, s2_256bit;					\
+									\
+	/* 512 => 256 bits */						\
+	s1_256bit = (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s1), 0) +	\
+		    (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s1), 1);	\
+	s2_256bit = (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s2), 0) +	\
+		    (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s2), 1);	\
+									\
+	ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit);	\
+}
+
+/* AVX-512BW implementation: like the AVX2 one, but does 64 bytes at a time */
+#undef DISPATCH_AVX512BW
+#if !defined(DEFAULT_IMPL) &&	\
+	/*
+	 * clang before v3.9 is missing some AVX-512BW intrinsics including
+	 * _mm512_sad_epu8(), a.k.a. __builtin_ia32_psadbw512.  So just make using
+	 * AVX-512BW, even when __AVX512BW__ is defined, conditional on
+	 * COMPILER_SUPPORTS_AVX512BW_TARGET where we check for that builtin.
+	 */	\
+	COMPILER_SUPPORTS_AVX512BW_TARGET && \
+	(defined(__AVX512BW__) || (X86_CPU_FEATURES_ENABLED && \
+				   COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS))
+#  define FUNCNAME		adler32_avx512bw
+#  define FUNCNAME_CHUNK	adler32_avx512bw_chunk
+#  define IMPL_ALIGNMENT	64
+#  define IMPL_SEGMENT_SIZE	64
+#  define IMPL_MAX_CHUNK_SIZE	MAX_CHUNK_SIZE
+#  ifdef __AVX512BW__
+#    define ATTRIBUTES
+#    define DEFAULT_IMPL	adler32_avx512bw
+#  else
+#    define ATTRIBUTES		__attribute__((target("avx512bw")))
+#    define DISPATCH		1
+#    define DISPATCH_AVX512BW	1
+#  endif
+#  include <immintrin.h>
+static forceinline ATTRIBUTES void
+adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end,
+		       u32 *s1, u32 *s2)
+{
+	const __m512i zeroes = _mm512_setzero_si512();
+	const __v64qi multipliers = (__v64qi){
+		64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+		48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+		32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+		16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
+	};
+	const __v32hi ones = (__v32hi)_mm512_set1_epi16(1);
+	__v16si v_s1 = (__v16si)zeroes;
+	__v16si v_s1_sums = (__v16si)zeroes;
+	__v16si v_s2 = (__v16si)zeroes;
+
+	do {
+		/* Load the next 64-byte segment */
+		__m512i bytes = *p++;
+
+		/* Multiply the bytes by 64...1 (the number of times they need
+		 * to be added to s2) and add adjacent products */
+		__v32hi sums = (__v32hi)_mm512_maddubs_epi16(
+					bytes, (__m512i)multipliers);
+
+		/* Keep sum of all previous s1 counters, for adding to s2 later.
+		 * This allows delaying the multiplication by 64 to the end. */
+		v_s1_sums += v_s1;
+
+		/* Add the sum of each group of 8 bytes to the corresponding s1
+		 * counter */
+		v_s1 += (__v16si)_mm512_sad_epu8(bytes, zeroes);
+
+		/* Add the sum of each group of 4 products of the bytes by
+		 * 64...1 to the corresponding s2 counter */
+		v_s2 += (__v16si)_mm512_madd_epi16((__m512i)sums,
+						   (__m512i)ones);
+	} while (p != end);
+
+	/* Finish the s2 counters by adding the sum of the s1 values at the
+	 * beginning of each segment, multiplied by the segment size (64) */
+	v_s2 += (__v16si)_mm512_slli_epi32((__m512i)v_s1_sums, 6);
+
+	/* Add the counters to the real s1 and s2 */
+	ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2);
+}
+#  include "../adler32_vec_template.h"
+#endif /* AVX-512BW implementation */
+
+/* AVX2 implementation: like the AVX-512BW one, but does 32 bytes at a time */
 #undef DISPATCH_AVX2
 #if !defined(DEFAULT_IMPL) &&	\
 	(defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED && \
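The "delaying the multiplication by 64 to the end" comment in adler32_avx512bw_chunk rests on a simple identity: if a 64-byte segment starts with counters (s1, s2), then afterwards s1 has grown by the plain byte sum, while s2 has grown by 64 times the old s1 plus the bytes weighted by 64...1. Gathering the old s1 values in v_s1_sums and shifting left by 6 once at the end is therefore equivalent to multiplying by 64 inside the loop. A scalar sketch of one segment, assuming modular reduction happens later in the caller (the helper name is illustrative, not part of the patch):

#include <stdint.h>

/* One 64-byte segment of Adler-32, written out in scalar form.  The vector
 * code accumulates the same three pieces:
 *   - byte_sum      -> added to v_s1 via _mm512_sad_epu8()
 *   - weighted_sum  -> added to v_s2 via _mm512_maddubs_epi16()/_mm512_madd_epi16()
 *   - 64 * old s1   -> gathered in v_s1_sums and applied once at the end as
 *                      a left shift by 6 */
static void
adler32_segment_sketch(uint32_t *s1, uint32_t *s2, const uint8_t seg[64])
{
	uint32_t byte_sum = 0, weighted_sum = 0;

	for (int i = 0; i < 64; i++) {
		byte_sum += seg[i];
		weighted_sum += (uint32_t)(64 - i) * seg[i];
	}
	*s2 += (*s1 << 6) + weighted_sum;	/* uses the *old* s1 */
	*s1 += byte_sum;
}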
@@ -50,32 +175,43 @@ static forceinline ATTRIBUTES void
 adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
 {
 	const __m256i zeroes = _mm256_setzero_si256();
-	const __v32qi multipliers = (__v32qi) { 32, 31, 30, 29, 28, 27, 26, 25,
-						24, 23, 22, 21, 20, 19, 18, 17,
-						16, 15, 14, 13, 12, 11, 10, 9,
-						8, 7, 6, 5, 4, 3, 2, 1 };
+	const __v32qi multipliers = (__v32qi){
+		32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+		16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
+	};
 	const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);
 	__v8si v_s1 = (__v8si)zeroes;
 	__v8si v_s1_sums = (__v8si)zeroes;
 	__v8si v_s2 = (__v8si)zeroes;
 
 	do {
+		/* Load the next 32-byte segment */
 		__m256i bytes = *p++;
 
+		/* Multiply the bytes by 32...1 (the number of times they need
+		 * to be added to s2) and add adjacent products */
 		__v16hi sums = (__v16hi)_mm256_maddubs_epi16(
 					bytes, (__m256i)multipliers);
 
+		/* Keep sum of all previous s1 counters, for adding to s2 later.
+		 * This allows delaying the multiplication by 32 to the end. */
 		v_s1_sums += v_s1;
 
+		/* Add the sum of each group of 8 bytes to the corresponding s1
+		 * counter */
 		v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);
 
+		/* Add the sum of each group of 4 products of the bytes by
+		 * 32...1 to the corresponding s2 counter */
 		v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
 	} while (p != end);
 
-	v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
-	v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
-	*s1 += (u32)v_s1[0] + (u32)v_s1[4];
-
+	/* Finish the s2 counters by adding the sum of the s1 values at the
+	 * beginning of each segment, multiplied by the segment size (32) */
 	v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
-	v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
-	v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
-	*s2 += (u32)v_s2[0] + (u32)v_s2[4];
+
+	/* Add the counters to the real s1 and s2 */
+	ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2);
 }
 #  include "../adler32_vec_template.h"
 #endif /* AVX2 implementation */
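The finishing macros reduce in this asymmetric pattern because of how the counters were built: v_s1 only ever receives psadbw results, which leave every other 32-bit lane zero, so the s1 side skips one reduction step, while all of v_s2's lanes carry data. A scalar model of what ADLER32_FINISH_VEC_CHUNK_128 computes on four 32-bit lanes (the function name is illustrative, not part of the patch):

#include <stdint.h>

/* Scalar equivalent of ADLER32_FINISH_VEC_CHUNK_128: fold four 32-bit lanes
 * into the real s1 and s2. */
static void
finish_chunk_128_sketch(uint32_t *s1, uint32_t *s2,
			const uint32_t v_s1[4], const uint32_t v_s2[4])
{
	/* Lanes 1 and 3 of v_s1 are zero (psadbw yields 64-bit sums whose
	 * upper halves stay zero), so a single add finishes s1. */
	*s1 += v_s1[0] + v_s1[2];

	/* All four lanes of v_s2 carry data, so halve twice: 4 -> 2 -> 1. */
	*s2 += (v_s2[0] + v_s2[1]) + (v_s2[2] + v_s2[3]);
}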
@@ -167,14 +303,8 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
 	v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d,
 				       (__m128i)(__v8hi){ 8, 7, 6, 5, 4, 3, 2, 1 });
 
-	/* Now accumulate what we computed into the real s1 and s2 */
-	v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x31);
-	v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x02);
-	*s1 += _mm_cvtsi128_si32((__m128i)v_s1);
-
-	v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x31);
-	v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x02);
-	*s2 += _mm_cvtsi128_si32((__m128i)v_s2);
+	/* Add the counters to the real s1 and s2 */
+	ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2);
 }
 #  include "../adler32_vec_template.h"
 #endif /* SSE2 implementation */
@@ -185,6 +315,10 @@ arch_select_adler32_func(void)
 {
 	u32 features = get_cpu_features();
 
+#ifdef DISPATCH_AVX512BW
+	if (features & X86_CPU_FEATURE_AVX512BW)
+		return adler32_avx512bw;
+#endif
 #ifdef DISPATCH_AVX2
 	if (features & X86_CPU_FEATURE_AVX2)
 		return adler32_avx2;
@@ -126,6 +126,10 @@ echo
 {
 	case $ARCH in
 	i386|x86_64)
+		if have_cpu_feature avx512bw; then
+			do_benchmark "AVX-512BW"
+			disable_impl "AVX512BW" "-mno-avx512bw"
+		fi
 		if have_cpu_feature avx2; then
 			do_benchmark "AVX2"
 			disable_impl "AVX2" "-mno-avx2"