mirror of
https://github.com/cuberite/libdeflate.git
synced 2025-09-09 20:29:26 -04:00
Use shuffle+add instead of vector subscripting in SSE2 Adler-32
Shuffle+add is more efficient and also works on GCC versions older than 4.6, which do not support subscripting of vector types.
This commit is contained in:
parent
7d3e2a997c
commit
36f8286db7
@ -204,8 +204,13 @@ FUNCNAME(const void *buffer, size_t size)
|
|||||||
(__m128i)(__v8hi){ 8, 7, 6, 5, 4, 3, 2, 1 });
|
(__m128i)(__v8hi){ 8, 7, 6, 5, 4, 3, 2, 1 });
|
||||||
|
|
||||||
/* Now accumulate what we computed into the real s1 and s2. */
|
/* Now accumulate what we computed into the real s1 and s2. */
|
||||||
s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
|
v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x31);
|
||||||
s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
|
v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x02);
|
||||||
|
s1 += _mm_cvtsi128_si32((__m128i)v_s1);
|
||||||
|
|
||||||
|
v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x31);
|
||||||
|
v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x02);
|
||||||
|
s2 += _mm_cvtsi128_si32((__m128i)v_s2);
|
||||||
|
|
||||||
#elif TARGET == TARGET_NEON
|
#elif TARGET == TARGET_NEON
|
||||||
/* ARM NEON (Advanced SIMD) implementation */
|
/* ARM NEON (Advanced SIMD) implementation */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user