libdeflate/lib/adler32_vec_template.h

/*
 * adler32_vec_template.h - template for vectorized Adler-32 implementations
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
/*
 * This file contains a template for vectorized Adler-32 implementations.
 *
 * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
 * implementation looks something like this:
 *
 *	do {
 *		s1 += *p;
 *		s2 += s1;
 *	} while (++p != chunk_end);
 *
 * For vectorized calculation of s1, we only need to sum the input bytes.  They
 * can be accumulated into multiple counters which are eventually summed
 * together.
 *
 * For vectorized calculation of s2, the basic idea is that for each iteration
 * that processes N bytes, we can perform the following vectorizable
 * calculation:
 *
 *	s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
 *
 * Or, equivalently, we can sum byte_1...byte_N for each iteration into N
 * separate counters, then do the multiplications by N...1 just once at the
 * end rather than once per iteration.
 *
 * Also, we must account for how previous bytes will affect s2 by doing the
 * following at the beginning of each iteration:
 *
 *	s2 += s1 * N
 *
 * Furthermore, like s1, "s2" can actually be multiple counters which are
 * eventually summed together.
 */
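/*
 * As a concrete illustration (hypothetical names c[], i, and j; not part of
 * the template), a scalar model of the above scheme for one chunk with N = 4
 * looks like:
 *
 *	u32 c[4] = { 0, 0, 0, 0 };
 *	for (i = 0; i < chunk_size; i += 4) {
 *		s2 += s1 * 4;
 *		for (j = 0; j < 4; j++) {
 *			c[j] += p[i + j];
 *			s1 += p[i + j];
 *		}
 *	}
 *	s2 += 4*c[0] + 3*c[1] + 2*c[2] + 1*c[3];
 *
 * A vector implementation keeps counters like c[] in vector lanes and defers
 * the weighted sum to the end of each chunk.
 */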
static u32 ATTRIBUTES
FUNCNAME(u32 adler, const u8 *p, size_t size)
{
	u32 s1 = adler & 0xFFFF;
	u32 s2 = adler >> 16;
	const u8 * const end = p + size;
	const u8 *vend;
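	/*
	 * The largest chunk size usable below: the overflow-safe limit,
	 * rounded down to a whole number of IMPL_SEGMENT_SIZE segments.
	 */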
	const size_t max_chunk_size =
		MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) -
		(MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) %
		 IMPL_SEGMENT_SIZE);

	/* Process a byte at a time until the needed alignment is reached */
	if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) {
		do {
			s1 += *p++;
			s2 += s1;
		} while (p != end && (uintptr_t)p % IMPL_ALIGNMENT);
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	/*
	 * Process "chunks" of bytes using vector instructions.  Chunk sizes
	 * are limited to MAX_CHUNK_SIZE, which guarantees that s1 and s2 never
	 * overflow before being reduced modulo DIVISOR.  For vector
	 * processing, chunk sizes are also made evenly divisible by
	 * IMPL_SEGMENT_SIZE and may be further limited to IMPL_MAX_CHUNK_SIZE.
	 */
	STATIC_ASSERT(IMPL_SEGMENT_SIZE % IMPL_ALIGNMENT == 0);
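	/*
	 * vend marks the end of the last whole segment; any bytes after it
	 * are handled by the scalar loop at the end.
	 */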
	vend = end - ((size_t)(end - p) % IMPL_SEGMENT_SIZE);
	while (p != vend) {
		size_t chunk_size = MIN((size_t)(vend - p), max_chunk_size);
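
		/*
		 * The current s1 gets added into s2 once for each of the
		 * chunk_size bytes about to be processed, so account for all
		 * of those contributions up front.
		 */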
		s2 += s1 * chunk_size;

		FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_size),
			       &s1, &s2);

		p += chunk_size;
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	/* Process any remaining bytes */
	if (p != end) {
		do {
			s1 += *p++;
			s2 += s1;
		} while (p != end);
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	return (s2 << 16) | s1;
}
#undef FUNCNAME
#undef FUNCNAME_CHUNK
#undef ATTRIBUTES
#undef IMPL_ALIGNMENT
#undef IMPL_SEGMENT_SIZE
#undef IMPL_MAX_CHUNK_SIZE
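
/*
 * Usage sketch (hypothetical names; the real instantiations live in the
 * architecture-specific adler32_impl.h files): a file that includes this
 * template first defines the macros above and the chunk function, e.g.:
 *
 *	#define FUNCNAME		adler32_myarch
 *	#define FUNCNAME_CHUNK		adler32_myarch_chunk
 *	#define IMPL_ALIGNMENT		16
 *	#define IMPL_SEGMENT_SIZE	32
 *	#define IMPL_MAX_CHUNK_SIZE	MAX_CHUNK_SIZE
 *	#define ATTRIBUTES
 *	static void
 *	adler32_myarch_chunk(const void *p, const void *end, u32 *s1, u32 *s2)
 *	{
 *		... vectorized accumulation over [p, end) ...
 *	}
 *	#include "adler32_vec_template.h"
 */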