From 1bd972195ed01314b0993ea86d98ee701b542972 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sat, 3 Sep 2016 21:23:34 -0700
Subject: [PATCH] Vectorized Adler-32 optimizations and cleanups

---
 lib/adler32.c      |  45 +++++----
 lib/adler32_impl.h | 221 +++++++++++++++++++++++++++++----------------
 2 files changed, 163 insertions(+), 103 deletions(-)

diff --git a/lib/adler32.c b/lib/adler32.c
index 4cfb994..09dc415 100644
--- a/lib/adler32.c
+++ b/lib/adler32.c
@@ -121,48 +121,45 @@ static u32 adler32_generic(const void *buffer, size_t size)
 #define DEFAULT_IMPL adler32_generic
 #endif /* NEED_GENERIC_IMPL */
 
+#define TARGET_SSE2 100
+#define TARGET_AVX2 200
+#define TARGET_NEON 300
+
 /* Define the SSE2 implementation if needed. */
 #if NEED_SSE2_IMPL
-# define FUNCNAME adler32_sse2
-# define ADLER32_TARGET_SSE2
-# ifdef __SSE2__
-# define ATTRIBUTES
-# define DEFAULT_IMPL adler32_sse2
-# else
-# define ATTRIBUTES __attribute__((target("sse2")))
-# endif
+# define FUNCNAME adler32_sse2
+# define TARGET TARGET_SSE2
+# define ALIGNMENT_REQUIRED 16
+# define BYTES_PER_ITERATION 32
+# define ATTRIBUTES
+# define DEFAULT_IMPL adler32_sse2
 # include "adler32_impl.h"
-# undef FUNCNAME
-# undef ADLER32_TARGET_SSE2
-# undef ATTRIBUTES
 #endif
 
 /* Define the AVX2 implementation if needed. */
 #if NEED_AVX2_IMPL
-# define FUNCNAME adler32_avx2
-# define ADLER32_TARGET_AVX2
+# define FUNCNAME adler32_avx2
+# define TARGET TARGET_AVX2
+# define ALIGNMENT_REQUIRED 32
+# define BYTES_PER_ITERATION 32
 # ifdef __AVX2__
 # define ATTRIBUTES
-# define DEFAULT_IMPL adler32_avx2
+# define DEFAULT_IMPL adler32_avx2
 # else
-# define ATTRIBUTES __attribute__((target("avx2")))
+# define ATTRIBUTES __attribute__((target("avx2")))
 # endif
 # include "adler32_impl.h"
-# undef FUNCNAME
-# undef ADLER32_TARGET_AVX2
-# undef ATTRIBUTES
 #endif
 
 /* Define the NEON implementation if needed. */
 #if NEED_NEON_IMPL
-# define FUNCNAME adler32_neon
-# define ADLER32_TARGET_NEON
+# define FUNCNAME adler32_neon
+# define TARGET TARGET_NEON
+# define ALIGNMENT_REQUIRED 16
+# define BYTES_PER_ITERATION 32
 # define ATTRIBUTES
-# define DEFAULT_IMPL adler32_neon
+# define DEFAULT_IMPL adler32_neon
 # include "adler32_impl.h"
-# undef FUNCNAME
-# undef ADLER32_TARGET_NEON
-# undef ATTRIBUTES
 #endif
 
 typedef u32 (*adler32_func_t)(const void *, size_t);
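The adler32.c side of the patch turns adler32_impl.h into a template: each block defines the parameter macros (FUNCNAME, TARGET, ALIGNMENT_REQUIRED, BYTES_PER_ITERATION, ATTRIBUTES), includes the header once to instantiate one implementation, and the header now cleans up its own parameters (see the #undef hunk at the end of the patch). For reference, the same technique reduced to a minimal two-file sketch; the file and function names (sum_impl.h, sum_scalar, sum_avx2) are made up for illustration and are not part of libdeflate:

    /* sum_impl.h -- the template.  It consumes its parameter macros and
     * #undefs them at the end so that it can be included more than once. */
    static int ATTRIBUTES
    FUNCNAME(const int *p, int n)
    {
        int sum = 0;

        while (n--)     /* a real template would also use TARGET, etc. here */
            sum += *p++;
        return sum;
    }
    #undef FUNCNAME
    #undef ATTRIBUTES

    /* sum.c -- each parameter block plus #include instantiates one function. */
    #define FUNCNAME    sum_scalar
    #define ATTRIBUTES
    #include "sum_impl.h"

    #define FUNCNAME    sum_avx2
    #define ATTRIBUTES  __attribute__((target("avx2")))
    #include "sum_impl.h"

This is what lets the SSE2, AVX2, and NEON blocks above share one function body while each copy is compiled with different target attributes.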
diff --git a/lib/adler32_impl.h b/lib/adler32_impl.h
index f0df11c..dce896c 100644
--- a/lib/adler32_impl.h
+++ b/lib/adler32_impl.h
@@ -12,7 +12,40 @@
  * .
  */
 
-/* Template for vectorized Adler-32 implementations */
+/*
+ * This file contains a template for vectorized Adler-32 implementations.
+ *
+ * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
+ * implementation looks something like this:
+ *
+ *	do {
+ *		s1 += *p;
+ *		s2 += s1;
+ *	} while (++p != chunk_end);
+ *
+ * For vectorized calculation of s1, we only need to sum the input bytes. They
+ * can be accumulated into multiple counters which are eventually summed
+ * together.
+ *
+ * For vectorized calculation of s2, the basic idea is that for each iteration
+ * that processes N bytes, we can perform the following vectorizable
+ * calculation:
+ *
+ *	s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
+ *
+ * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
+ * separate counters, then do the multiplications by N...1 just once at the end
+ * rather than once per iteration.
+ *
+ * Also, we must account for how previous bytes will affect s2 by doing the
+ * following at the beginning of each iteration:
+ *
+ *	s2 += s1 * N
+ *
+ * Furthermore, like s1, "s2" can actually be multiple counters which are
+ * eventually summed together.
+ */
+
 static u32 ATTRIBUTES
 FUNCNAME(const void *buffer, size_t size)
 {
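As a concrete reference, the decomposition described in the new comment above can be written out in plain C roughly as follows. This is only an illustrative sketch, not code from the patch; it assumes the chunk size is a multiple of N and small enough that the 32-bit sums cannot overflow before the final reductions:

    /* Scalar model of one vectorized chunk: equivalent to running the naive
     * do/while loop over chunk_size bytes starting from *s1 and *s2. */
    #include <stddef.h>
    #include <stdint.h>

    #define N 32                            /* bytes per iteration */

    static void adler32_chunk_model(uint32_t *s1, uint32_t *s2,
                                    const uint8_t *p, size_t chunk_size)
    {
        uint32_t v_s1 = 0;                  /* bytes summed so far in this chunk */
        uint32_t v_s1_sums = 0;             /* sum of v_s1 snapshots, one per iteration */
        uint32_t byte_sums[N] = { 0 };      /* byte_sums[j]: sum of byte j of each iteration */
        size_t i, j;

        *s2 += *s1 * chunk_size;            /* the starting s1 is added once per byte */

        for (i = 0; i < chunk_size; i += N) {
            v_s1_sums += v_s1;              /* bytes of earlier iterations, deferred */
            for (j = 0; j < N; j++) {
                v_s1 += p[i + j];
                byte_sums[j] += p[i + j];
            }
        }

        *s1 += v_s1;
        *s2 += N * v_s1_sums;
        for (j = 0; j < N; j++)             /* the multiplications by N...1, done once */
            *s2 += (N - j) * byte_sums[j];

        *s1 %= 65521;
        *s2 %= 65521;
    }

The vector implementations below perform exactly this calculation, keeping v_s1 and the byte_sums counters in SIMD registers and turning the multiplication by N into a left shift by 5.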
@@ -20,41 +53,50 @@ FUNCNAME(const void *buffer, size_t size)
 	u32 s2 = 0;
 	const u8 *p = buffer;
 	const u8 * const end = p + size;
-	const uintptr_t endv = (uintptr_t)end & ~31;
+	const u8 *vend;
 
-	/* Proceed until the buffer is 32-byte aligned. */
-	if (p != end && ((uintptr_t)p & 31)) {
+	/* Process a byte at a time until the required alignment is reached. */
+	if (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED) {
 		do {
 			s1 += *p++;
 			s2 += s1;
-		} while (p != end && ((uintptr_t)p & 31));
+		} while (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED);
 		s1 %= DIVISOR;
 		s2 %= DIVISOR;
 	}
 
-	/* While there are still 32-byte aligned vectors remaining... */
-	while ((uintptr_t)p < endv) {
+	/*
+	 * Process "chunks" of bytes using vector instructions. Chunk sizes are
+	 * limited to MAX_BYTES_PER_CHUNK, which guarantees that s1 and s2 never
+	 * overflow before being reduced modulo DIVISOR. For vector processing,
+	 * chunk sizes are also made evenly divisible by BYTES_PER_ITERATION.
+	 */
+	STATIC_ASSERT(BYTES_PER_ITERATION % ALIGNMENT_REQUIRED == 0);
+	vend = end - ((end - p) % BYTES_PER_ITERATION);
+	while (p != vend) {
+		size_t chunk_size;
+		const u8 *chunk_end;
 
-		/* Process a chunk which is guaranteed not to overflow s2. */
-		size_t chunk_size = MIN(endv - (uintptr_t)p,
-					MAX_BYTES_PER_CHUNK & ~31);
-		const u8 *chunk_end = p + chunk_size;
+		chunk_size = MIN(vend - p, MAX_BYTES_PER_CHUNK);
+	#if TARGET == TARGET_SSE2
+		/* SSE2: the 16-bit precision byte counters must not undergo
+		 * *signed* overflow, otherwise the signed multiplication at the
+		 * end will not behave as desired. */
+		chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0x7FFF / 0xFF));
+	#elif TARGET == TARGET_NEON
+		/* NEON: the 16-bit precision counters must not undergo
+		 * *unsigned* overflow. */
+		chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0xFFFF / 0xFF));
+	#endif
+		chunk_size -= chunk_size % BYTES_PER_ITERATION;
+
+		chunk_end = p + chunk_size;
 
 		s2 += s1 * chunk_size;
 		s2 %= DIVISOR;
 
 		{
-	#ifdef ADLER32_TARGET_AVX2
-		/*
-		 * AVX2 implementation. This works like the SSE2 implementation
-		 * below, except:
-		 *
-		 * - Vectors are 32 bytes instead of 16 bytes.
-		 *
-		 * - We use "Packed Multiply and Add Unsigned Byte to Signed
-		 *   Word" (_mm256_maddubs_epi16()) to directly do the
-		 *   multiplication of the bytes with their multipliers without
-		 *   having to expand the bytes to 16-bit integers first.
-		 */
+	#if TARGET == TARGET_AVX2
+		/* AVX2 implementation */
 		const __m256i zeroes = _mm256_setzero_si256();
 		const __v32qi multipliers = (__v32qi) { 32, 31, 30, 29, 28, 27, 26, 25,
 							24, 23, 22, 21, 20, 19, 18, 17,
@@ -62,117 +104,132 @@ FUNCNAME(const void *buffer, size_t size)
 							8, 7, 6, 5, 4, 3, 2, 1 };
 		const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);
 		__v8si v_s1 = (__v8si)zeroes;
+		__v8si v_s1_sums = (__v8si)zeroes;
 		__v8si v_s2 = (__v8si)zeroes;
 
+		STATIC_ASSERT(ALIGNMENT_REQUIRED == 32 && BYTES_PER_ITERATION == 32);
 		do {
-			__v32qi bytes = *(const __v32qi *)p;
+			__m256i bytes = *(const __m256i *)p;
 			__v16hi sums = (__v16hi)_mm256_maddubs_epi16(
-						(__m256i)bytes, (__m256i)multipliers);
-			v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1, 5);
-			v_s1 += (__v8si)_mm256_sad_epu8((__m256i)bytes, zeroes);
+						bytes, (__m256i)multipliers);
+			v_s1_sums += v_s1;
+			v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);
 			v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
-			p += 32;
-		} while (p != chunk_end);
+		} while ((p += BYTES_PER_ITERATION) != chunk_end);
 
 		v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
 		v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
 		s1 += v_s1[0] + v_s1[4];
 
+		v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
 		v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
 		v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
 		s2 += v_s2[0] + v_s2[4];
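A worked bound for the AVX2 loop above (illustration only, not part of the patch): _mm256_maddubs_epi16() multiplies the unsigned input bytes by the corresponding signed multipliers and adds adjacent products with signed saturation, so the trick is only safe if one pair of products cannot exceed the signed 16-bit maximum. With byte values of at most 0xFF and adjacent multipliers of at most 32 and 31, it cannot:

    #include <assert.h>

    /* Largest value one maddubs output lane can take in this loop. */
    static_assert(0xFF * 32 + 0xFF * 31 <= 0x7FFF,
                  "an adjacent multiplier pair cannot saturate _mm256_maddubs_epi16()");

The following _mm256_madd_epi16() with an all-ones vector then widens those 16-bit pair sums to 32 bits (adding adjacent pairs again) before they are accumulated into the 32-bit v_s2 lanes; MAX_BYTES_PER_CHUNK is what keeps those 32-bit sums from overflowing.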
-	#elif defined(ADLER32_TARGET_SSE2)
+	#elif TARGET == TARGET_SSE2
 		/* SSE2 implementation */
 		const __m128i zeroes = _mm_setzero_si128();
-		const __v8hi multipliers_a = (__v8hi) { 32, 31, 30, 29, 28, 27, 26, 25 };
-		const __v8hi multipliers_b = (__v8hi) { 24, 23, 22, 21, 20, 19, 18, 17 };
-		const __v8hi multipliers_c = (__v8hi) { 16, 15, 14, 13, 12, 11, 10, 9 };
-		const __v8hi multipliers_d = (__v8hi) { 8, 7, 6, 5, 4, 3, 2, 1 };
 
 		/* s1 counters: 32-bit, sum of bytes */
 		__v4si v_s1 = (__v4si)zeroes;
 
 		/* s2 counters: 32-bit, sum of s1 values */
 		__v4si v_s2 = (__v4si)zeroes;
-		__v4si v_s2_a = (__v4si)zeroes;
-		__v4si v_s2_b = (__v4si)zeroes;
+
+		/*
+		 * Thirty-two 16-bit counters for byte sums. Each accumulates
+		 * the bytes that eventually need to be multiplied by a number
+		 * 32...1 for addition into s2.
+		 */
+		__v8hi v_byte_sums_a = (__v8hi)zeroes;
+		__v8hi v_byte_sums_b = (__v8hi)zeroes;
+		__v8hi v_byte_sums_c = (__v8hi)zeroes;
+		__v8hi v_byte_sums_d = (__v8hi)zeroes;
+
+		STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
 		do {
 			/* Load the next 32 bytes. */
-			__v16qi bytes1 = *(const __v16qi *)p;
-			__v16qi bytes2 = *(const __v16qi *)(p + 16);
-
-			/* Expand the bytes into 16-bit integers. */
-			__v8hi bytes_a = (__v8hi)_mm_unpacklo_epi8((__m128i)bytes1, zeroes);
-			__v8hi bytes_b = (__v8hi)_mm_unpackhi_epi8((__m128i)bytes1, zeroes);
-			__v8hi bytes_c = (__v8hi)_mm_unpacklo_epi8((__m128i)bytes2, zeroes);
-			__v8hi bytes_d = (__v8hi)_mm_unpackhi_epi8((__m128i)bytes2, zeroes);
+			const __m128i bytes1 = *(const __m128i *)p;
+			const __m128i bytes2 = *(const __m128i *)(p + 16);
 
 			/*
-			 * s2 update (part 1): multiply the s1 counters by 32
-			 * and add them to the s2 counters. This accounts for
-			 * all the additions of existing s1 values into s2 which
-			 * need to happen over the 32 bytes currently being
-			 * processed.
+			 * Accumulate the previous s1 counters into the s2
+			 * counters. Logically, this really should be
+			 * v_s2 += v_s1 * BYTES_PER_ITERATION, but we can do the
+			 * multiplication (or left shift) later.
 			 */
-			v_s2_a += (__v4si)_mm_slli_epi32((__m128i)v_s1, 5);
+			v_s2 += v_s1;
 
 			/*
 			 * s1 update: use "Packed Sum of Absolute Differences"
 			 * to add the bytes horizontally with 8 bytes per sum.
 			 * Then add the sums to the s1 counters.
 			 */
-			v_s1 += (__v4si)_mm_sad_epu8((__m128i)bytes1, zeroes);
-			v_s1 += (__v4si)_mm_sad_epu8((__m128i)bytes2, zeroes);
+			v_s1 += (__v4si)_mm_sad_epu8(bytes1, zeroes);
+			v_s1 += (__v4si)_mm_sad_epu8(bytes2, zeroes);
 
 			/*
-			 * s2 update (part 2): use "Packed Multiply and Add Word
-			 * to Doubleword" to multiply the expanded bytes by the
-			 * multipliers and sum adjacent pairs into 32-bit sums.
-			 * Then add those sums to the s2 counters.
+			 * Also accumulate the bytes into 32 separate counters
+			 * that have 16-bit precision.
 			 */
-			v_s2_a += (__v4si)_mm_madd_epi16((__m128i)bytes_a, (__m128i)multipliers_a);
-			v_s2_a += (__v4si)_mm_madd_epi16((__m128i)bytes_b, (__m128i)multipliers_b);
-			v_s2_b += (__v4si)_mm_madd_epi16((__m128i)bytes_c, (__m128i)multipliers_c);
-			v_s2_b += (__v4si)_mm_madd_epi16((__m128i)bytes_d, (__m128i)multipliers_d);
+			v_byte_sums_a += (__v8hi)_mm_unpacklo_epi8(bytes1, zeroes);
+			v_byte_sums_b += (__v8hi)_mm_unpackhi_epi8(bytes1, zeroes);
+			v_byte_sums_c += (__v8hi)_mm_unpacklo_epi8(bytes2, zeroes);
+			v_byte_sums_d += (__v8hi)_mm_unpackhi_epi8(bytes2, zeroes);
 
-			p += 32;
-		} while (p != chunk_end);
+		} while ((p += BYTES_PER_ITERATION) != chunk_end);
 
-		v_s2 += v_s2_a + v_s2_b;
+		/* Finish calculating the s2 counters. */
+		v_s2 = (__v4si)_mm_slli_epi32((__m128i)v_s2, 5);
+		v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_a,
+				(__m128i)(__v8hi){ 32, 31, 30, 29, 28, 27, 26, 25 });
+		v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_b,
+				(__m128i)(__v8hi){ 24, 23, 22, 21, 20, 19, 18, 17 });
+		v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_c,
+				(__m128i)(__v8hi){ 16, 15, 14, 13, 12, 11, 10, 9 });
+		v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d,
+				(__m128i)(__v8hi){ 8, 7, 6, 5, 4, 3, 2, 1 });
+
+		/* Now accumulate what we computed into the real s1 and s2. */
 		s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
 		s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
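The chunk_size caps added earlier in the patch (BYTES_PER_ITERATION * (0x7FFF / 0xFF) for SSE2, BYTES_PER_ITERATION * (0xFFFF / 0xFF) for NEON) exist to protect these 16-bit v_byte_sums lanes: every lane gains at most one 0xFF byte per 32-byte iteration, and the final multiplication step treats the lanes as signed 16-bit values on SSE2 (_mm_madd_epi16) but unsigned on NEON (vmlal_u16). The resulting limits, worked out as a sketch (not part of the patch):

    #include <assert.h>

    #define BYTES_PER_ITERATION 32  /* redefined locally for this sketch */

    /* SSE2: lanes must stay within the signed 16-bit range, so at most
     * 0x7FFF / 0xFF = 128 iterations, i.e. 4096 bytes per chunk. */
    static_assert(BYTES_PER_ITERATION * (0x7FFF / 0xFF) == 4096, "");

    /* NEON: lanes only need to stay within the unsigned 16-bit range, so at
     * most 0xFFFF / 0xFF = 257 iterations, i.e. 8224 bytes per chunk. */
    static_assert(BYTES_PER_ITERATION * (0xFFFF / 0xFF) == 8224, "");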
-	#elif defined(ADLER32_TARGET_NEON)
+
+	#elif TARGET == TARGET_NEON
 		/* ARM NEON (Advanced SIMD) implementation */
-		const uint8x8_t multipliers_a = (uint8x8_t) { 32, 31, 30, 29, 28, 27, 26, 25 };
-		const uint8x8_t multipliers_b = (uint8x8_t) { 24, 23, 22, 21, 20, 19, 18, 17 };
-		const uint8x8_t multipliers_c = (uint8x8_t) { 16, 15, 14, 13, 12, 11, 10, 9 };
-		const uint8x8_t multipliers_d = (uint8x8_t) { 8, 7, 6, 5, 4, 3, 2, 1 };
 		uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
 		uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 };
+		uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+		uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+		uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+		uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
 
+		STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
 		do {
 			const uint8x16_t bytes1 = *(const uint8x16_t *)p;
 			const uint8x16_t bytes2 = *(const uint8x16_t *)(p + 16);
 			uint16x8_t tmp;
 
-			/* s2 update (part 1) */
-			v_s2 += vqshlq_n_u32(v_s1, 5);
+			v_s2 += v_s1;
 
-			/* s1 update */
 			tmp = vpaddlq_u8(bytes1);
 			tmp = vpadalq_u8(tmp, bytes2);
 			v_s1 = vpadalq_u16(v_s1, tmp);
 
-			/* s2 update (part 2) */
-			tmp = vmull_u8(vget_low_u8(bytes1), multipliers_a);
-			tmp = vmlal_u8(tmp, vget_high_u8(bytes1), multipliers_b);
-			tmp = vmlal_u8(tmp, vget_low_u8(bytes2), multipliers_c);
-			tmp = vmlal_u8(tmp, vget_high_u8(bytes2), multipliers_d);
-			v_s2 = vpadalq_u16(v_s2, tmp);
+			v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
+			v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
+			v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
+			v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));
 
-			p += 32;
-		} while (p != chunk_end);
+		} while ((p += BYTES_PER_ITERATION) != chunk_end);
+
+		v_s2 = vqshlq_n_u32(v_s2, 5);
+		v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), (uint16x4_t) { 32, 31, 30, 29 });
+		v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 });
+		v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), (uint16x4_t) { 24, 23, 22, 21 });
+		v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 });
+		v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), (uint16x4_t) { 16, 15, 14, 13 });
+		v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10, 9 });
+		v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d), (uint16x4_t) { 8, 7, 6, 5 });
+		v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) { 4, 3, 2, 1 });
 
 		s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
 		s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
@@ -197,3 +254,9 @@ FUNCNAME(const void *buffer, size_t size)
 
 	return (s2 << 16) | s1;
 }
+
+#undef FUNCNAME
+#undef TARGET
+#undef ALIGNMENT_REQUIRED
+#undef BYTES_PER_ITERATION
+#undef ATTRIBUTES
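Finally, a small self-check of the kind that is useful when touching this code. This is a sketch, not part of the patch: it assumes some adler32() entry point with the adler32_func_t signature is reachable from a test program, and compares it against an independently written byte-at-a-time reference. The odd step size makes sure the alignment prologue, the vectorized chunks, and the scalar tail all get exercised.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    extern uint32_t adler32(const void *buffer, size_t size);  /* assumed entry point */

    static uint32_t adler32_ref(const uint8_t *p, size_t n)
    {
        uint32_t s1 = 1, s2 = 0;

        while (n--) {
            s1 = (s1 + *p++) % 65521;
            s2 = (s2 + s1) % 65521;
        }
        return (s2 << 16) | s1;
    }

    int main(void)
    {
        static uint8_t buf[1000000];
        size_t size;

        srand(1);
        for (size = 0; size < sizeof(buf); size++)
            buf[size] = rand();

        for (size = 0; size < sizeof(buf); size += 999) {
            if (adler32(buf, size) != adler32_ref(buf, size)) {
                printf("mismatch at size %zu\n", size);
                return 1;
            }
        }
        printf("ok\n");
        return 0;
    }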