mirror of https://github.com/cuberite/libdeflate.git (synced 2025-09-11 13:32:14 -04:00)

Vectorized Adler-32 optimizations and cleanups

This commit is contained in:
parent a525da5ba7
commit 1bd972195e
adler32.c

@@ -121,48 +121,45 @@ static u32 adler32_generic(const void *buffer, size_t size)
 #define DEFAULT_IMPL	adler32_generic
 #endif /* NEED_GENERIC_IMPL */
 
+#define TARGET_SSE2	100
+#define TARGET_AVX2	200
+#define TARGET_NEON	300
+
 /* Define the SSE2 implementation if needed. */
 #if NEED_SSE2_IMPL
-#  define FUNCNAME adler32_sse2
-#  define ADLER32_TARGET_SSE2
-#  ifdef __SSE2__
-#    define ATTRIBUTES
-#    define DEFAULT_IMPL adler32_sse2
-#  else
-#    define ATTRIBUTES __attribute__((target("sse2")))
-#  endif
+#  define FUNCNAME		adler32_sse2
+#  define TARGET		TARGET_SSE2
+#  define ALIGNMENT_REQUIRED	16
+#  define BYTES_PER_ITERATION	32
+#  define ATTRIBUTES
+#  define DEFAULT_IMPL		adler32_sse2
 #  include "adler32_impl.h"
-#  undef FUNCNAME
-#  undef ADLER32_TARGET_SSE2
-#  undef ATTRIBUTES
 #endif
 
 /* Define the AVX2 implementation if needed. */
 #if NEED_AVX2_IMPL
-#  define FUNCNAME adler32_avx2
-#  define ADLER32_TARGET_AVX2
+#  define FUNCNAME		adler32_avx2
+#  define TARGET		TARGET_AVX2
+#  define ALIGNMENT_REQUIRED	32
+#  define BYTES_PER_ITERATION	32
 #  ifdef __AVX2__
 #    define ATTRIBUTES
-#    define DEFAULT_IMPL adler32_avx2
+#    define DEFAULT_IMPL	adler32_avx2
 #  else
-#    define ATTRIBUTES __attribute__((target("avx2")))
+#    define ATTRIBUTES		__attribute__((target("avx2")))
 #  endif
 #  include "adler32_impl.h"
-#  undef FUNCNAME
-#  undef ADLER32_TARGET_AVX2
-#  undef ATTRIBUTES
 #endif
 
 /* Define the NEON implementation if needed. */
 #if NEED_NEON_IMPL
-#  define FUNCNAME adler32_neon
-#  define ADLER32_TARGET_NEON
+#  define FUNCNAME		adler32_neon
+#  define TARGET		TARGET_NEON
+#  define ALIGNMENT_REQUIRED	16
+#  define BYTES_PER_ITERATION	32
 #  define ATTRIBUTES
-#  define DEFAULT_IMPL adler32_neon
+#  define DEFAULT_IMPL		adler32_neon
 #  include "adler32_impl.h"
-#  undef FUNCNAME
-#  undef ADLER32_TARGET_NEON
-#  undef ATTRIBUTES
 #endif
 
 typedef u32 (*adler32_func_t)(const void *, size_t);
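The block above parameterizes a single template header (adler32_impl.h) that is re-included once per implementation, with FUNCNAME, TARGET, ALIGNMENT_REQUIRED, BYTES_PER_ITERATION and ATTRIBUTES acting as the template parameters. Below is a rough, single-file sketch of the same pattern, not taken from libdeflate: the sum_* names are hypothetical, a GCC/Clang x86 compiler is assumed for the target attribute, and a macro stands in for the re-included header so the sketch stays self-contained.

#include <stdio.h>

/* One "template", stamped out once per (name, attributes) pair. */
#define DEFINE_SUM_FUNC(FUNCNAME, ATTRIBUTES)		\
static ATTRIBUTES unsigned				\
FUNCNAME(const unsigned char *p, unsigned n)		\
{							\
	unsigned s = 0;					\
	while (n--)					\
		s += *p++;				\
	return s;					\
}

DEFINE_SUM_FUNC(sum_generic, )	/* ATTRIBUTES left empty, like the __SSE2__ case above */
DEFINE_SUM_FUNC(sum_avx2, __attribute__((target("avx2"))))

int main(void)
{
	const unsigned char buf[4] = { 1, 2, 3, 4 };

	(void)sum_avx2;	/* referenced but not called: the host CPU may lack AVX2 */
	printf("%u\n", sum_generic(buf, 4));	/* prints 10 */
	return 0;
}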
adler32_impl.h

@@ -12,7 +12,40 @@
  * <http://creativecommons.org/publicdomain/zero/1.0/>.
  */
 
-/* Template for vectorized Adler-32 implementations */
+/*
+ * This file contains a template for vectorized Adler-32 implementations.
+ *
+ * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
+ * implementation looks something like this:
+ *
+ *	do {
+ *		s1 += *p;
+ *		s2 += s1;
+ *	} while (++p != chunk_end);
+ *
+ * For vectorized calculation of s1, we only need to sum the input bytes.  They
+ * can be accumulated into multiple counters which are eventually summed
+ * together.
+ *
+ * For vectorized calculation of s2, the basic idea is that for each iteration
+ * that processes N bytes, we can perform the following vectorizable
+ * calculation:
+ *
+ *	s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
+ *
+ * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
+ * separate counters, then do the multiplications by N...1 just once at the end
+ * rather than once per iteration.
+ *
+ * Also, we must account for how previous bytes will affect s2 by doing the
+ * following at the beginning of each iteration:
+ *
+ *	s2 += s1 * N
+ *
+ * Furthermore, like s1, "s2" can actually be multiple counters which are
+ * eventually summed together.
+ */
+
 static u32 ATTRIBUTES
 FUNCNAME(const void *buffer, size_t size)
 {
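The identity the comment above relies on can be checked with a short scalar program. This is only an illustrative sketch (plain C99, nothing from libdeflate): the block contents, block size N and starting s1/s2 values are arbitrary test inputs, and the modulo-65521 reductions are omitted since they distribute over the sums.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t block[32];
	const uint32_t N = sizeof(block);
	uint32_t s1 = 123, s2 = 456;		/* arbitrary starting state */
	uint32_t s1_ref = s1, s2_ref = s2;
	uint32_t byte_sum = 0, weighted_sum = 0;
	uint32_t i;

	for (i = 0; i < N; i++)
		block[i] = (uint8_t)(i * 37 + 11);	/* arbitrary data */

	/* Reference: the plain per-byte update. */
	for (i = 0; i < N; i++) {
		s1_ref += block[i];
		s2_ref += s1_ref;
	}

	/* Deferred form: s2 += s1*N plus a weighted byte sum, s1 += byte sum. */
	for (i = 0; i < N; i++) {
		byte_sum     += block[i];
		weighted_sum += (N - i) * block[i];	/* weights N..1 */
	}
	s2 += N * s1 + weighted_sum;
	s1 += byte_sum;

	assert(s1 == s1_ref && s2 == s2_ref);
	printf("ok: s1=%u s2=%u\n", (unsigned)s1, (unsigned)s2);
	return 0;
}

The vectorized code that follows accumulates byte_sum and weighted_sum in SIMD registers, with weighted_sum kept as per-position counters that are multiplied by 32...1 only once per chunk.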
@@ -20,41 +53,50 @@ FUNCNAME(const void *buffer, size_t size)
 	u32 s2 = 0;
 	const u8 *p = buffer;
 	const u8 * const end = p + size;
-	const uintptr_t endv = (uintptr_t)end & ~31;
+	const u8 *vend;
 
-	/* Proceed until the buffer is 32-byte aligned. */
-	if (p != end && ((uintptr_t)p & 31)) {
+	/* Process a byte at a time until the required alignment is reached. */
+	if (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED) {
 		do {
 			s1 += *p++;
 			s2 += s1;
-		} while (p != end && ((uintptr_t)p & 31));
+		} while (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED);
 		s1 %= DIVISOR;
 		s2 %= DIVISOR;
 	}
 
-	/* While there are still 32-byte aligned vectors remaining... */
-	while ((uintptr_t)p < endv) {
+	/*
+	 * Process "chunks" of bytes using vector instructions.  Chunk sizes are
+	 * limited to MAX_BYTES_PER_CHUNK, which guarantees that s1 and s2 never
+	 * overflow before being reduced modulo DIVISOR.  For vector processing,
+	 * chunk sizes are also made evenly divisible by BYTES_PER_ITERATION.
+	 */
+	STATIC_ASSERT(BYTES_PER_ITERATION % ALIGNMENT_REQUIRED == 0);
+	vend = end - ((end - p) % BYTES_PER_ITERATION);
+	while (p != vend) {
+		size_t chunk_size;
+		const u8 *chunk_end;
 
-		/* Process a chunk which is guaranteed not to overflow s2. */
-		size_t chunk_size = MIN(endv - (uintptr_t)p,
-					MAX_BYTES_PER_CHUNK & ~31);
-		const u8 *chunk_end = p + chunk_size;
+		chunk_size = MIN(vend - p, MAX_BYTES_PER_CHUNK);
+#if TARGET == TARGET_SSE2
+		/* SSE2: the 16-bit precision byte counters must not undergo
+		 * *signed* overflow, otherwise the signed multiplication at the
+		 * end will not behave as desired. */
+		chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0x7FFF / 0xFF));
+#elif TARGET == TARGET_NEON
+		/* NEON: the 16-bit precision counters must not undergo
+		 * *unsigned* overflow. */
+		chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0xFFFF / 0xFF));
+#endif
+		chunk_size -= chunk_size % BYTES_PER_ITERATION;
+
+		chunk_end = p + chunk_size;
 
 		s2 += s1 * chunk_size;
 		s2 %= DIVISOR;
 		{
-#ifdef ADLER32_TARGET_AVX2
-		/*
-		 * AVX2 implementation.  This works like the SSE2 implementation
-		 * below, except:
-		 *
-		 * - Vectors are 32 bytes instead of 16 bytes.
-		 *
-		 * - We use "Packed Multiply and Add Unsigned Byte to Signed
-		 *   Word" (_mm256_maddubs_epi16()) to directly do the
-		 *   multiplication of the bytes with their multipliers without
-		 *   having to expand the bytes to 16-bit integers first.
-		 */
+#if TARGET == TARGET_AVX2
+		/* AVX2 implementation */
 		const __m256i zeroes = _mm256_setzero_si256();
 		const __v32qi multipliers = (__v32qi) { 32, 31, 30, 29, 28, 27, 26, 25,
 							24, 23, 22, 21, 20, 19, 18, 17,
@@ -62,117 +104,132 @@ FUNCNAME(const void *buffer, size_t size)
 							 8,  7,  6,  5,  4,  3,  2,  1 };
 		const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);
 		__v8si v_s1 = (__v8si)zeroes;
+		__v8si v_s1_sums = (__v8si)zeroes;
 		__v8si v_s2 = (__v8si)zeroes;
+		STATIC_ASSERT(ALIGNMENT_REQUIRED == 32 && BYTES_PER_ITERATION == 32);
 		do {
-			__v32qi bytes = *(const __v32qi *)p;
+			__m256i bytes = *(const __m256i *)p;
 			__v16hi sums = (__v16hi)_mm256_maddubs_epi16(
-						(__m256i)bytes, (__m256i)multipliers);
-			v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1, 5);
-			v_s1 += (__v8si)_mm256_sad_epu8((__m256i)bytes, zeroes);
+						bytes, (__m256i)multipliers);
+			v_s1_sums += v_s1;
+			v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);
 			v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
-			p += 32;
-		} while (p != chunk_end);
+		} while ((p += BYTES_PER_ITERATION) != chunk_end);
 
 		v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
 		v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
 		s1 += v_s1[0] + v_s1[4];
 
+		v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
 		v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
 		v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
 		s2 += v_s2[0] + v_s2[4];
 
-#elif defined(ADLER32_TARGET_SSE2)
+#elif TARGET == TARGET_SSE2
 		/* SSE2 implementation */
 		const __m128i zeroes = _mm_setzero_si128();
-		const __v8hi multipliers_a = (__v8hi) { 32, 31, 30, 29, 28, 27, 26, 25 };
-		const __v8hi multipliers_b = (__v8hi) { 24, 23, 22, 21, 20, 19, 18, 17 };
-		const __v8hi multipliers_c = (__v8hi) { 16, 15, 14, 13, 12, 11, 10,  9 };
-		const __v8hi multipliers_d = (__v8hi) {  8,  7,  6,  5,  4,  3,  2,  1 };
+
+		/* s1 counters: 32-bit, sum of bytes */
 		__v4si v_s1 = (__v4si)zeroes;
+
+		/* s2 counters: 32-bit, sum of s1 values */
 		__v4si v_s2 = (__v4si)zeroes;
-		__v4si v_s2_a = (__v4si)zeroes;
-		__v4si v_s2_b = (__v4si)zeroes;
+
+		/*
+		 * Thirty-two 16-bit counters for byte sums.  Each accumulates
+		 * the bytes that eventually need to be multiplied by a number
+		 * 32...1 for addition into s2.
+		 */
+		__v8hi v_byte_sums_a = (__v8hi)zeroes;
+		__v8hi v_byte_sums_b = (__v8hi)zeroes;
+		__v8hi v_byte_sums_c = (__v8hi)zeroes;
+		__v8hi v_byte_sums_d = (__v8hi)zeroes;
+
+		STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
 		do {
 			/* Load the next 32 bytes. */
-			__v16qi bytes1 = *(const __v16qi *)p;
-			__v16qi bytes2 = *(const __v16qi *)(p + 16);
-
-			/* Expand the bytes into 16-bit integers. */
-			__v8hi bytes_a = (__v8hi)_mm_unpacklo_epi8((__m128i)bytes1, zeroes);
-			__v8hi bytes_b = (__v8hi)_mm_unpackhi_epi8((__m128i)bytes1, zeroes);
-			__v8hi bytes_c = (__v8hi)_mm_unpacklo_epi8((__m128i)bytes2, zeroes);
-			__v8hi bytes_d = (__v8hi)_mm_unpackhi_epi8((__m128i)bytes2, zeroes);
+			const __m128i bytes1 = *(const __m128i *)p;
+			const __m128i bytes2 = *(const __m128i *)(p + 16);
 
 			/*
-			 * s2 update (part 1): multiply the s1 counters by 32
-			 * and add them to the s2 counters.  This accounts for
-			 * all the additions of existing s1 values into s2 which
-			 * need to happen over the 32 bytes currently being
-			 * processed.
+			 * Accumulate the previous s1 counters into the s2
+			 * counters.  Logically, this really should be
+			 * v_s2 += v_s1 * BYTES_PER_ITERATION, but we can do the
+			 * multiplication (or left shift) later.
 			 */
-			v_s2_a += (__v4si)_mm_slli_epi32((__m128i)v_s1, 5);
+			v_s2 += v_s1;
 
 			/*
 			 * s1 update: use "Packed Sum of Absolute Differences"
 			 * to add the bytes horizontally with 8 bytes per sum.
 			 * Then add the sums to the s1 counters.
 			 */
-			v_s1 += (__v4si)_mm_sad_epu8((__m128i)bytes1, zeroes);
-			v_s1 += (__v4si)_mm_sad_epu8((__m128i)bytes2, zeroes);
+			v_s1 += (__v4si)_mm_sad_epu8(bytes1, zeroes);
+			v_s1 += (__v4si)_mm_sad_epu8(bytes2, zeroes);
 
 			/*
-			 * s2 update (part 2): use "Packed Multiply and Add Word
-			 * to Doubleword" to multiply the expanded bytes by the
-			 * multipliers and sum adjacent pairs into 32-bit sums.
-			 * Then add those sums to the s2 counters.
+			 * Also accumulate the bytes into 32 separate counters
+			 * that have 16-bit precision.
 			 */
-			v_s2_a += (__v4si)_mm_madd_epi16((__m128i)bytes_a, (__m128i)multipliers_a);
-			v_s2_a += (__v4si)_mm_madd_epi16((__m128i)bytes_b, (__m128i)multipliers_b);
-			v_s2_b += (__v4si)_mm_madd_epi16((__m128i)bytes_c, (__m128i)multipliers_c);
-			v_s2_b += (__v4si)_mm_madd_epi16((__m128i)bytes_d, (__m128i)multipliers_d);
+			v_byte_sums_a += (__v8hi)_mm_unpacklo_epi8(bytes1, zeroes);
+			v_byte_sums_b += (__v8hi)_mm_unpackhi_epi8(bytes1, zeroes);
+			v_byte_sums_c += (__v8hi)_mm_unpacklo_epi8(bytes2, zeroes);
+			v_byte_sums_d += (__v8hi)_mm_unpackhi_epi8(bytes2, zeroes);
 
-			p += 32;
-		} while (p != chunk_end);
+		} while ((p += BYTES_PER_ITERATION) != chunk_end);
 
-		v_s2 += v_s2_a + v_s2_b;
+		/* Finish calculating the s2 counters. */
+		v_s2 = (__v4si)_mm_slli_epi32((__m128i)v_s2, 5);
+		v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_a,
+					       (__m128i)(__v8hi){ 32, 31, 30, 29, 28, 27, 26, 25 });
+		v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_b,
+					       (__m128i)(__v8hi){ 24, 23, 22, 21, 20, 19, 18, 17 });
+		v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_c,
+					       (__m128i)(__v8hi){ 16, 15, 14, 13, 12, 11, 10,  9 });
+		v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d,
+					       (__m128i)(__v8hi){  8,  7,  6,  5,  4,  3,  2,  1 });
 
 		/* Now accumulate what we computed into the real s1 and s2. */
 		s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
 		s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
-#elif defined(ADLER32_TARGET_NEON)
+
+#elif TARGET == TARGET_NEON
 		/* ARM NEON (Advanced SIMD) implementation */
-		const uint8x8_t multipliers_a = (uint8x8_t) { 32, 31, 30, 29, 28, 27, 26, 25 };
-		const uint8x8_t multipliers_b = (uint8x8_t) { 24, 23, 22, 21, 20, 19, 18, 17 };
-		const uint8x8_t multipliers_c = (uint8x8_t) { 16, 15, 14, 13, 12, 11, 10,  9 };
-		const uint8x8_t multipliers_d = (uint8x8_t) {  8,  7,  6,  5,  4,  3,  2,  1 };
 		uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
 		uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 };
+		uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+		uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+		uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+		uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+		STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
 		do {
 			const uint8x16_t bytes1 = *(const uint8x16_t *)p;
 			const uint8x16_t bytes2 = *(const uint8x16_t *)(p + 16);
 			uint16x8_t tmp;
 
-			/* s2 update (part 1) */
-			v_s2 += vqshlq_n_u32(v_s1, 5);
+			v_s2 += v_s1;
 
 			/* s1 update */
 			tmp = vpaddlq_u8(bytes1);
 			tmp = vpadalq_u8(tmp, bytes2);
 			v_s1 = vpadalq_u16(v_s1, tmp);
 
-			/* s2 update (part 2) */
-			tmp = vmull_u8(vget_low_u8(bytes1), multipliers_a);
-			tmp = vmlal_u8(tmp, vget_high_u8(bytes1), multipliers_b);
-			tmp = vmlal_u8(tmp, vget_low_u8(bytes2), multipliers_c);
-			tmp = vmlal_u8(tmp, vget_high_u8(bytes2), multipliers_d);
-			v_s2 = vpadalq_u16(v_s2, tmp);
+			v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
+			v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
+			v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
+			v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));
 
-			p += 32;
-		} while (p != chunk_end);
+		} while ((p += BYTES_PER_ITERATION) != chunk_end);
 
+		v_s2 = vqshlq_n_u32(v_s2, 5);
+		v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_a), (uint16x4_t) { 32, 31, 30, 29 });
+		v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 });
+		v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_b), (uint16x4_t) { 24, 23, 22, 21 });
+		v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 });
+		v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_c), (uint16x4_t) { 16, 15, 14, 13 });
+		v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10,  9 });
+		v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_d), (uint16x4_t) {  8,  7,  6,  5 });
+		v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) {  4,  3,  2,  1 });
+
 		s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
 		s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
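The s1 updates above lean on the "Packed Sum of Absolute Differences" instruction: a SAD against an all-zero vector is simply a horizontal byte sum, split into two 64-bit lanes. Here is a standalone sketch of that behaviour, not part of the patch; it assumes an x86 compiler with SSE2 (e.g. gcc -msse2) and uses arbitrary test bytes.

#include <emmintrin.h>	/* SSE2 intrinsics */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t buf[16];
	uint32_t expected = 0;
	int i;

	for (i = 0; i < 16; i++) {
		buf[i] = (uint8_t)(200 + i);
		expected += buf[i];
	}

	__m128i bytes = _mm_loadu_si128((const __m128i *)buf);
	__m128i sums  = _mm_sad_epu8(bytes, _mm_setzero_si128());

	/* Low 64-bit lane: sum of bytes 0..7; high lane: sum of bytes 8..15. */
	uint32_t lo = (uint32_t)_mm_cvtsi128_si32(sums);
	uint32_t hi = (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(sums, 8));

	printf("sad = %u, scalar = %u\n", (unsigned)(lo + hi), (unsigned)expected);
	return lo + hi == expected ? 0 : 1;
}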
@@ -197,3 +254,9 @@ FUNCNAME(const void *buffer, size_t size)
 	return (s2 << 16) | s1;
 }
 
+#undef FUNCNAME
+#undef TARGET
+#undef ALIGNMENT_REQUIRED
+#undef BYTES_PER_ITERATION
+#undef ATTRIBUTES
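As a back-of-the-envelope check on the chunk caps earlier in the patch: each 16-bit byte-sum counter grows by at most 0xFF per 32-byte iteration, so capping a chunk at BYTES_PER_ITERATION * (0x7FFF / 0xFF) bytes keeps the SSE2 counters within signed 16-bit range before the final _mm_madd_epi16(), and 0xFFFF / 0xFF does the same for NEON's unsigned counters. The tiny standalone check below mirrors the constants from the diff; everything else is illustrative.

#include <assert.h>

int main(void)
{
	const unsigned bytes_per_iteration = 32;
	const unsigned sse2_iters = 0x7FFF / 0xFF;	/* 128 iterations */
	const unsigned neon_iters = 0xFFFF / 0xFF;	/* 257 iterations */

	assert(sse2_iters * 0xFF <= 0x7FFF);	/* no signed 16-bit overflow */
	assert(neon_iters * 0xFF <= 0xFFFF);	/* no unsigned 16-bit overflow */
	assert(bytes_per_iteration * sse2_iters == 4096);	/* SSE2 chunk cap in bytes */
	assert(bytes_per_iteration * neon_iters == 8224);	/* NEON chunk cap in bytes */
	return 0;
}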