libdeflate/lib/crc32.c
Eric Biggers f2c3a5b4e9 Various reorganization and cleanups
* Bring in common headers and program code from xpack project
* Move program code to programs/
* Move library code to lib/
* GNU89 and MSVC2010 compatibility
* Other changes
2016-05-21 15:38:15 -05:00

261 lines
8.5 KiB
C

/*
* crc32.c - CRC-32 checksum algorithm for the gzip format
*
* Written in 2014-2015 by Eric Biggers <ebiggers3@gmail.com>
*
* To the extent possible under law, the author(s) have dedicated all copyright
* and related and neighboring rights to this software to the public domain
* worldwide. This software is distributed without any warranty.
*
* You should have received a copy of the CC0 Public Domain Dedication along
* with this software. If not, see
* <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
/*
* High-level description of CRC
* =============================
*
* Consider a bit sequence 'bits[1...len]'. Interpet 'bits' as the "message"
* polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
* where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute:
*
* R(x) = M(x)*x^n mod G(x)
*
* where G(x) is a selected "generator" polynomial of degree 'n'. The remainder
* R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x)
* interpeted as a bitstring of length 'n'.
*
* CRC used in gzip
* ================
*
* In the gzip format (RFC 1952):
*
* - The bitstring to checksum is formed from the bytes of the uncompressed
* data by concatenating the bits from the bytes in order, proceeding
* from the low-order bit to the high-order bit within each byte.
*
* - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
* x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
* Consequently, the CRC length is 32 bits ("CRC-32").
*
* - The highest order 32 coefficients of M(x)*x^n are inverted.
*
* - All 32 coefficients of R(x) are inverted.
*
* The two inversions cause added leading and trailing zero bits to affect the
* resulting CRC, whereas with a regular CRC such bits would have no effect on
* the CRC.
*
* Computation and optimizations
* =============================
*
* We can compute R(x) through "long division", maintaining only 32 bits of
* state at any given time. Multiplication by 'x' can be implemented as
* right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
* highest order bit represents the coefficient of x^0), and both addition and
* subtraction can be implemented as bitwise exclusive OR (since we are working
* in GF(2)). Here is an unoptimized implementation:
*
* static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
* {
* u32 remainder = 0;
* const u32 divisor = 0xEDB88320;
*
* for (size_t i = 0; i < nbytes * 8 + 32; i++) {
* int bit;
* u32 multiple;
*
* if (i < nbytes * 8)
* bit = (buffer[i / 8] >> (i % 8)) & 1;
* else
* bit = 0; // one of the 32 appended 0 bits
*
* if (i < 32) // the first 32 bits are inverted
* bit ^= 1;
*
* if (remainder & 1)
* multiple = divisor;
* else
* multiple = 0;
*
* remainder >>= 1;
* remainder |= (u32)bit << 31;
* remainder ^= multiple;
* }
*
* return ~remainder;
* }
*
* In this implementation, the 32-bit integer 'remainder' maintains the
* remainder of the currently processed portion of the message (with 32 zero
* bits appended) when divided by the generator polynomial. 'remainder' is the
* representation of R(x), and 'divisor' is the representation of G(x) excluding
* the x^32 coefficient. For each bit to process, we multiply R(x) by 'x^1',
* then add 'x^0' if the new bit is a 1. If this causes R(x) to gain a nonzero
* x^32 term, then we subtract G(x) from R(x).
*
* We can speed this up by taking advantage of the fact that XOR is commutative
* and associative, so the order in which we combine the inputs into 'remainder'
* is unimportant. And since each message bit we add doesn't affect the choice
* of 'multiple' until 32 bits later, we need not actually add each message bit
* until that point:
*
* static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
* {
* u32 remainder = ~0;
* const u32 divisor = 0xEDB88320;
*
* for (size_t i = 0; i < nbytes * 8; i++) {
* int bit;
* u32 multiple;
*
* bit = (buffer[i / 8] >> (i % 8)) & 1;
* remainder ^= bit;
* if (remainder & 1)
* multiple = divisor;
* else
* multiple = 0;
* remainder >>= 1;
* remainder ^= multiple;
* }
*
* return ~remainder;
* }
*
* With the above implementation we get the effect of 32 appended 0 bits for
* free; they never affect the choice of a divisor, nor would they change the
* value of 'remainder' if they were to be actually XOR'ed in. And by starting
* with a remainder of all 1 bits, we get the effect of complementing the first
* 32 message bits.
*
* The next optimization is to process the input in multi-bit units. Suppose
* that we insert the next 'n' message bits into the remainder. Then we get an
* intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
* bits is the amount by which the low 32 bits of the remainder will change as a
* result of cancelling out those 'n' bits. Taking n=8 (one byte) and
* precomputing a table containing the CRC of each possible byte, we get
* crc32_slice1() defined below.
*
* As a further optimization, we could increase the multi-bit unit size to 16.
* However, that is inefficient because the table size explodes from 256 entries
* (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
* fit in L1 cache on typical processors.
*
* However, we can actually process 4 bytes at a time using 4 different tables
* with 256 entries each. Logically, we form a 64-bit intermediate remainder
* and cancel out the high 32 bits in 8-bit chunks. Bits 32-39 are cancelled
* out by the CRC of those bits, whereas bits 40-47 are be cancelled out by the
* CRC of those bits with 8 zero bits appended, and so on. This method is
* implemented in crc32_slice4(), defined below.
*
* In crc32_slice8(), this method is extended to 8 bytes at a time. The
* intermediate remainder (which we never actually store explicitly) is 96 bits.
*/
#define CRC32_SLICE8
#include "crc32.h"
#include "crc32_table.h"
static forceinline u32
crc32_update_byte(u32 remainder, u8 next_byte)
{
return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte];
}
#ifdef CRC32_SLICE1
static u32
crc32_slice1(u32 remainder, const u8 *buffer, size_t nbytes)
{
size_t i;
STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100);
for (i = 0; i < nbytes; i++)
remainder = crc32_update_byte(remainder, buffer[i]);
return remainder;
}
#endif
#ifdef CRC32_SLICE4
static u32
crc32_slice4(u32 remainder, const u8 *buffer, size_t nbytes)
{
const u8 *p = buffer;
const u8 *end = buffer + nbytes;
const u8 *end32;
STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400);
for (; ((uintptr_t)p & 3) && p != end; p++)
remainder = crc32_update_byte(remainder, *p);
end32 = p + ((end - p) & ~3);
for (; p != end32; p += 4) {
u32 v = le32_bswap(*(const u32 *)p);
remainder =
crc32_table[0x300 + (u8)((remainder ^ v) >> 0)] ^
crc32_table[0x200 + (u8)((remainder ^ v) >> 8)] ^
crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^
crc32_table[0x000 + (u8)((remainder ^ v) >> 24)];
}
for (; p != end; p++)
remainder = crc32_update_byte(remainder, *p);
return remainder;
}
#endif
#ifdef CRC32_SLICE8
static u32
crc32_slice8(u32 remainder, const u8 *buffer, size_t nbytes)
{
const u8 *p = buffer;
const u8 *end = buffer + nbytes;
const u8 *end64;
STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800);
for (; ((uintptr_t)p & 7) && p != end; p++)
remainder = crc32_update_byte(remainder, *p);
end64 = p + ((end - p) & ~7);
for (; p != end64; p += 8) {
u32 v1 = le32_bswap(*(const u32 *)(p + 0));
u32 v2 = le32_bswap(*(const u32 *)(p + 4));
remainder =
crc32_table[0x700 + (u8)((remainder ^ v1) >> 0)] ^
crc32_table[0x600 + (u8)((remainder ^ v1) >> 8)] ^
crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^
crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^
crc32_table[0x300 + (u8)(v2 >> 0)] ^
crc32_table[0x200 + (u8)(v2 >> 8)] ^
crc32_table[0x100 + (u8)(v2 >> 16)] ^
crc32_table[0x000 + (u8)(v2 >> 24)];
}
for (; p != end; p++)
remainder = crc32_update_byte(remainder, *p);
return remainder;
}
#endif
u32
crc32_gzip(const void *buffer, size_t nbytes)
{
u32 remainder = ~0;
#if defined(CRC32_SLICE1)
remainder = crc32_slice1(remainder, buffer, nbytes);
#elif defined(CRC32_SLICE4)
remainder = crc32_slice4(remainder, buffer, nbytes);
#elif defined(CRC32_SLICE8)
remainder = crc32_slice8(remainder, buffer, nbytes);
#else
# error "don't know which CRC-32 implementation to use!"
#endif
return ~remainder;
}