From 6a982b7ac73f52be4832b18cc159c5405ab94a80 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 27 Dec 2014 23:11:49 -0600 Subject: [PATCH] initial commit --- CMakeLists.txt | 83 ++ README | 19 + examples/benchmark.c | 210 +++ libdeflate.h | 131 ++ src/adler32.c | 19 + src/adler32.h | 12 + src/bitops.h | 80 ++ src/bt_matchfinder.h | 279 ++++ src/compiler-gcc.h | 52 + src/compiler.h | 60 + src/crc32.c | 73 ++ src/crc32.h | 12 + src/deflate_compress.c | 2323 ++++++++++++++++++++++++++++++++++ src/deflate_compress.h | 9 + src/deflate_constants.h | 59 + src/deflate_decompress.c | 1455 +++++++++++++++++++++ src/endianness.h | 75 ++ src/gzip_compress.c | 64 + src/gzip_constants.h | 47 + src/gzip_decompress.c | 100 ++ src/hc_matchfinder.h | 235 ++++ src/lz_extend.h | 60 + src/lz_hash3.h | 49 + src/matchfinder_avx2.h | 64 + src/matchfinder_common.h | 163 +++ src/matchfinder_nonsliding.h | 47 + src/matchfinder_sliding.h | 30 + src/matchfinder_sse2.h | 64 + src/types.h | 38 + src/unaligned.h | 216 ++++ src/zlib_compress.c | 56 + src/zlib_constants.h | 20 + src/zlib_decompress.c | 56 + 33 files changed, 6260 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 README create mode 100644 examples/benchmark.c create mode 100644 libdeflate.h create mode 100644 src/adler32.c create mode 100644 src/adler32.h create mode 100644 src/bitops.h create mode 100644 src/bt_matchfinder.h create mode 100644 src/compiler-gcc.h create mode 100644 src/compiler.h create mode 100644 src/crc32.c create mode 100644 src/crc32.h create mode 100644 src/deflate_compress.c create mode 100644 src/deflate_compress.h create mode 100644 src/deflate_constants.h create mode 100644 src/deflate_decompress.c create mode 100644 src/endianness.h create mode 100644 src/gzip_compress.c create mode 100644 src/gzip_constants.h create mode 100644 src/gzip_decompress.c create mode 100644 src/hc_matchfinder.h create mode 100644 src/lz_extend.h create mode 100644 src/lz_hash3.h create mode 100644 src/matchfinder_avx2.h create mode 100644 src/matchfinder_common.h create mode 100644 src/matchfinder_nonsliding.h create mode 100644 src/matchfinder_sliding.h create mode 100644 src/matchfinder_sse2.h create mode 100644 src/types.h create mode 100644 src/unaligned.h create mode 100644 src/zlib_compress.c create mode 100644 src/zlib_constants.h create mode 100644 src/zlib_decompress.c diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..7dc9d49 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,83 @@ +cmake_minimum_required(VERSION 2.6) +project(libdeflate C) + +set(LIB_VERSION_MAJOR 0) +set(LIB_VERSION_MINOR 0) +set(LIB_VERSION_PATCH 0) + +set(LIB_VERSION_STRING "${LIB_VERSION_MAJOR}.${LIB_VERSION_MINOR}.${LIB_VERSION_PATCH}") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(C_FLAGS "-std=c11 -fvisibility=hidden") + +set(CMAKE_C_FLAGS_RELEASE "${C_FLAGS} -O2 -DNDEBUG") +set(CMAKE_C_FLAGS_DEBUG "${C_FLAGS} -O0 -g") + +include_directories(".") + +option(SUPPORT_COMPRESSION "Support DEFLATE compression" ON) +if(SUPPORT_COMPRESSION) + set(LIB_SOURCES ${LIB_SOURCES} src/deflate_compress.c) +endif() + +option(SUPPORT_DECOMPRESSION "Support DEFLATE decompression" ON) +if(SUPPORT_DECOMPRESSION) + set(LIB_SOURCES ${LIB_SOURCES} src/deflate_decompress.c) +endif() + +option(SUPPORT_ZLIB "Support zlib wrapper format" ON) +if(SUPPORT_ZLIB) + set(LIB_SOURCES ${LIB_SOURCES} src/adler32.c) + if(SUPPORT_COMPRESSION) + set(LIB_SOURCES ${LIB_SOURCES} src/zlib_compress.c) + endif() + 
if(SUPPORT_DECOMPRESSION) + set(LIB_SOURCES ${LIB_SOURCES} src/zlib_decompress.c) + endif() +endif() + +option(SUPPORT_GZIP "Support gzip wrapper format" ON) +if(SUPPORT_GZIP) + set(LIB_SOURCES ${LIB_SOURCES} src/crc32.c) + if(SUPPORT_COMPRESSION) + set(LIB_SOURCES ${LIB_SOURCES} src/gzip_compress.c) + endif() + if(SUPPORT_DECOMPRESSION) + set(LIB_SOURCES ${LIB_SOURCES} src/gzip_decompress.c) + endif() +endif() + +option(SUPPORT_NEAR_OPTIMAL_PARSING "Support near optimal parsing (high compression mode)" ON) +if(SUPPORT_NEAR_OPTIMAL_PARSING) + add_definitions(-DSUPPORT_NEAR_OPTIMAL_PARSING=1) +else() + add_definitions(-DSUPPORT_NEAR_OPTIMAL_PARSING=0) +endif() + +option(UNSAFE_DECOMPRESSION "Assume that all compressed data is valid (faster but insecure)" OFF) +if(UNSAFE_DECOMPRESSION) + add_definitions(-DUNSAFE_DECOMPRESSION=1) +else() + add_definitions(-DUNSAFE_DECOMPRESSION=0) +endif() + +option(BUILD_EXAMPLES "Build the example programs" OFF) +if(BUILD_EXAMPLES) + add_executable(benchmark examples/benchmark.c) + target_link_libraries(benchmark deflate) +endif() + +add_library(deflate SHARED ${LIB_SOURCES}) +add_library(deflatestatic STATIC ${LIB_SOURCES}) + +set_target_properties(deflate PROPERTIES VERSION ${LIB_VERSION_STRING}) +set_target_properties(deflate PROPERTIES SOVERSION ${LIB_VERSION_MAJOR}) + +install(TARGETS deflate deflatestatic + LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" + ARCHIVE DESTINATION "${CMAKE_INSTALL_PREFIX}/lib") + +install(FILES libdeflate.h DESTINATION "${CMAKE_INSTALL_PREFIX}/include") diff --git a/README b/README new file mode 100644 index 0000000..b0100eb --- /dev/null +++ b/README @@ -0,0 +1,19 @@ +This is libdeflate, a free (public domain) library for fast, whole-buffer +DEFLATE compression and decompression. + +The supported formats are: + + - DEFLATE (raw) + - zlib (DEFLATE with zlib header and footer) + - gzip (DEFLATE with gzip header and footer) + +libdeflate is heavily optimized. It is significantly faster than zlib, both for +compression and decompression. In addition, at compression levels 8 and above +it provides a compression ratio better than zlib's, while still being about the +same speed as zlib's level 9. + +libdeflate has a simple API that is not zlib-compatible. You can create +compressors and decompressors, and use them to compress or decompress buffers. +There is not yet any support for streaming. See libdeflate.h for details. + +libdeflate is public domain; the author claims no copyright on it. diff --git a/examples/benchmark.c b/examples/benchmark.c new file mode 100644 index 0000000..edc0482 --- /dev/null +++ b/examples/benchmark.c @@ -0,0 +1,210 @@ +/* + * benchmark.c - A compression testing and benchmark program. + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. 
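+ *
+ * Usage: benchmark FILE [CHUNK_SIZE [LEVEL]]
+ *
+ * The program reads FILE in chunks of CHUNK_SIZE bytes (default 32768),
+ * compresses each chunk with deflate_compress() at the given LEVEL (default
+ * 6), decompresses the result with deflate_decompress(), verifies that the
+ * round trip reproduces the original chunk, and prints the overall
+ * compression ratio and throughput.  A chunk only counts as compressed if it
+ * shrinks by at least one byte; otherwise its original size is charged.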
+ */ + +#include + +#ifdef __WIN32__ +# include +#else +# define _FILE_OFFSET_BITS 64 +# define O_BINARY 0 +# define _POSIX_C_SOURCE 199309L +# include +#endif + +#include +#include +#include +#include +#include +#include +#include + +static uint64_t +current_time(void) +{ +#ifdef __WIN32__ +# define TIME_UNIT_PER_MS 10000 + LARGE_INTEGER time; + QueryPerformanceCounter(&time); + return time.QuadPart; +#else +# define TIME_UNIT_PER_MS 1000000 + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (1000000000ULL * ts.tv_sec) + ts.tv_nsec; +#endif +} + +static int +do_benchmark(int fd, char *ubuf1, char *ubuf2, + char *cbuf, uint32_t max_chunk_size, + struct deflate_compressor *compressor, + struct deflate_decompressor *decompressor) +{ + uint64_t usize_total = 0; + uint64_t csize_total = 0; + uint64_t compress_time_total = 0; + uint64_t decompress_time_total = 0; + + for (;;) { + char *p = ubuf1; + ssize_t bytes_read; + size_t usize; + size_t csize; + bool ok; + uint64_t start_time; + + /* Read the next chunk of data. */ + do { + bytes_read = read(fd, p, ubuf1 + max_chunk_size - p); + if (bytes_read < 0) { + fprintf(stderr, "ERROR: Read error: %s\n", + strerror(errno)); + return 1; + } + p += bytes_read; + } while (bytes_read != 0 && p != ubuf1 + max_chunk_size); + + usize = p - ubuf1; + + if (usize == 0) /* End of file? */ + break; + + /* Compress the chunk of data. */ + usize_total += usize; + start_time = current_time(); + csize = deflate_compress(compressor, ubuf1, usize, + cbuf, usize - 1); + compress_time_total += current_time() - start_time; + + if (csize) { + /* Successfully compressed the chunk of data. */ + csize_total += csize; + + /* Decompress the data we just compressed and compare + * the result with the original. */ + start_time = current_time(); + ok = deflate_decompress(decompressor, cbuf, csize, + ubuf2, usize); + decompress_time_total += current_time() - start_time; + if (!ok) { + fprintf(stderr, "ERROR: Failed to " + "decompress data\n"); + return 1; + } + + if (memcmp(ubuf1, ubuf2, usize)) { + fprintf(stderr, "ERROR: Data did not " + "decompress to original\n"); + return 1; + } + } else { + /* Chunk of data did not compress to less than its + * original size. 
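+			 * Charge its original size to the compressed total,
+			 * since an application would have to store such a
+			 * chunk uncompressed.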
*/ + csize_total += usize; + } + } + + + if (usize_total == 0) { + printf("\tEmpty input.\n"); + return 0; + } + + if (compress_time_total == 0) + compress_time_total++; + if (decompress_time_total == 0) + decompress_time_total++; + + printf("\tCompressed %"PRIu64 " => %"PRIu64" bytes (%u.%u%%)\n", + usize_total, csize_total, + (unsigned int)(csize_total * 100 / usize_total), + (unsigned int)(csize_total * 100000 / usize_total % 1000)); + printf("\tCompression time: %"PRIu64" ms (%"PRIu64" MB/s)\n", + compress_time_total / TIME_UNIT_PER_MS, + 1000 * usize_total / compress_time_total); + printf("\tDecompression time: %"PRIu64" ms (%"PRIu64" MB/s)\n", + decompress_time_total / TIME_UNIT_PER_MS, + 1000 * usize_total / decompress_time_total); + return 0; +} + +int +main(int argc, char **argv) +{ + const char *filename; + uint32_t chunk_size = 32768; + unsigned int compression_level = 6; + char *ubuf1 = NULL; + char *ubuf2 = NULL; + char *cbuf = NULL; + struct deflate_compressor *compressor = NULL; + struct deflate_decompressor *decompressor = NULL; + int fd = -1; + int ret; + + if (argc < 2 || argc > 5) { + fprintf(stderr, "Usage: %s FILE [CHUNK_SIZE [LEVEL]]]\n", argv[0]); + ret = 2; + goto out; + } + + filename = argv[1]; + + if (argc >= 3) + chunk_size = strtoul(argv[2], NULL, 10); + + if (argc >= 4) + compression_level = strtoul(argv[3], NULL, 10); + + printf("DEFLATE compression with %"PRIu32" byte chunks (level %u)\n", + chunk_size, compression_level); + + compressor = deflate_alloc_compressor(compression_level); + if (!compressor) { + fprintf(stderr, "ERROR: Failed to create compressor\n"); + ret = 1; + goto out; + } + + decompressor = deflate_alloc_decompressor(); + if (!decompressor) { + fprintf(stderr, "ERROR: Failed to create decompressor\n"); + ret = 1; + goto out; + } + + ubuf1 = malloc(chunk_size); + ubuf2 = malloc(chunk_size); + cbuf = malloc(chunk_size - 1); + + if (!ubuf1 || !ubuf2 || !cbuf) { + fprintf(stderr, "ERROR: Insufficient memory\n"); + ret = 1; + goto out; + } + + fd = open(filename, O_RDONLY | O_BINARY); + if (fd < 0) { + fprintf(stderr, "ERROR: Can't open \"%s\" for reading: %s\n", + filename, strerror(errno)); + ret = 1; + goto out; + } + + ret = do_benchmark(fd, ubuf1, ubuf2, cbuf, chunk_size, + compressor, decompressor); +out: + close(fd); + free(cbuf); + free(ubuf2); + free(ubuf1); + deflate_free_decompressor(decompressor); + deflate_free_compressor(compressor); + return ret; +} diff --git a/libdeflate.h b/libdeflate.h new file mode 100644 index 0000000..1d42ed4 --- /dev/null +++ b/libdeflate.h @@ -0,0 +1,131 @@ +/* + * libdeflate.h + * + * Public header for the DEFLATE compression library. + */ + +#ifndef LIBDEFLATE_H +#define LIBDEFLATE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* ========================================================================== */ +/* Compression */ +/* ========================================================================== */ + +struct deflate_compressor; + +/* + * deflate_alloc_compressor() allocates a new DEFLATE compressor. + * 'compression_level' is the compression level on a zlib-like scale (1 = + * fastest, 6 = medium/default, 9 = slowest). The return value is a pointer to + * the new DEFLATE compressor, or NULL if out of memory. + * + * Note: the sliding window size is defined at compilation time (default 32768). + */ +extern struct deflate_compressor * +deflate_alloc_compressor(unsigned int compression_level); + +/* + * deflate_compress() performs DEFLATE compression on a buffer of data. 
The + * function attempts to compress 'in_nbytes' bytes of data located at 'in' and + * write the results to 'out', which has space for 'out_nbytes_avail' bytes. + * The return value is the compressed size in bytes, or 0 if the data could not + * be compressed to 'out_nbytes_avail' bytes or fewer. + */ +extern size_t +deflate_compress(struct deflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * Like deflate_compress(), but store the data in the zlib wrapper format. + */ +extern size_t +zlib_compress(struct deflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * Like deflate_compress(), but store the data in the gzip wrapper format. + */ +extern size_t +gzip_compress(struct deflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * deflate_free_compressor() frees a DEFLATE compressor that was allocated with + * deflate_alloc_compressor(). + */ +extern void +deflate_free_compressor(struct deflate_compressor *compressor); + +/* ========================================================================== */ +/* Decompression */ +/* ========================================================================== */ + +struct deflate_decompressor; + +/* + * deflate_alloc_decompressor() allocates a new DEFLATE decompressor. The + * return value is a pointer to the new DEFLATE decompressor, or NULL if out of + * memory. + * + * This function takes no parameters, and the returned decompressor is valid for + * decompressing data that was compressed at any compression level and with any + * sliding window size. + */ +extern struct deflate_decompressor * +deflate_alloc_decompressor(void); + +/* + * deflate_decompress() decompresses 'in_nbytes' bytes of DEFLATE-compressed + * data at 'in' and writes the uncompressed data, which had original size + * 'out_nbytes', to 'out'. The return value is true if decompression was + * successful, or false if the compressed data was invalid. + * + * To be clear: the uncompressed size must be known *exactly* and passed as + * 'out_nbytes'. + */ +extern bool +deflate_decompress(struct deflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes); + +/* + * Like deflate_decompress(), but assumes the zlib wrapper format instead of raw + * DEFLATE. + */ +extern bool +zlib_decompress(struct deflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes); + +/* + * Like deflate_decompress(), but assumes the gzip wrapper format instead of raw + * DEFLATE. + */ +extern bool +gzip_decompress(struct deflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes); + +/* + * deflate_free_decompressor() frees a DEFLATE decompressor that was allocated + * with deflate_alloc_decompressor(). + */ +extern void +deflate_free_decompressor(struct deflate_decompressor *decompressor); + + +#ifdef __cplusplus +} +#endif + +#endif /* LIBDEFLATE_H */ diff --git a/src/adler32.c b/src/adler32.c new file mode 100644 index 0000000..da5afc5 --- /dev/null +++ b/src/adler32.c @@ -0,0 +1,19 @@ +/* + * adler32.c + * + * Adler-32 checksum algorithm. 
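+ *
+ * As specified in RFC 1950: s1 is 1 plus the sum of all input bytes, and s2
+ * is the sum of the successive s1 values, both reduced modulo 65521 (the
+ * largest prime below 2^16).  The checksum is (s2 << 16) | s1, so the
+ * checksum of an empty buffer is 1.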
+ */ + +#include "adler32.h" + +u32 +adler32(const u8 *buffer, size_t size) +{ + u32 s1 = 1; + u32 s2 = 0; + for (size_t i = 0; i < size; i++) { + s1 = (s1 + buffer[i]) % 65521; + s2 = (s2 + s1) % 65521; + } + return (s2 << 16) | s1; +} diff --git a/src/adler32.h b/src/adler32.h new file mode 100644 index 0000000..78c2d02 --- /dev/null +++ b/src/adler32.h @@ -0,0 +1,12 @@ +/* + * adler32.h + * + * Adler-32 checksum algorithm. + */ + +#pragma once + +#include "types.h" + +extern u32 +adler32(const u8 *buffer, size_t size); diff --git a/src/bitops.h b/src/bitops.h new file mode 100644 index 0000000..1e6f68c --- /dev/null +++ b/src/bitops.h @@ -0,0 +1,80 @@ +/* + * bitops.h + * + * Inline functions for bit manipulation. + */ + +#pragma once + +#include "compiler.h" +#include "types.h" + +/* Find Last Set bit */ + +static inline unsigned fls32(u32 v) +{ +#ifdef compiler_fls32 + return compiler_fls32(v); +#else + unsigned bit = 0; + while ((v >>= 1) != 0) + bit++; + return bit; +#endif +} + +static inline unsigned fls64(u64 v) +{ +#ifdef compiler_fls64 + return compiler_fls64(v); +#else + unsigned bit = 0; + while ((v >>= 1) != 0) + bit++; + return bit; +#endif +} + +static inline unsigned flsw(machine_word_t v) +{ + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + if (WORDSIZE == 4) + return fls32(v); + else + return fls64(v); +} + +/* Find First Set bit */ + +static inline unsigned ffs32(u32 v) +{ +#ifdef compiler_ffs32 + return compiler_ffs32(v); +#else + unsigned bit; + for (bit = 0; !(v & 1); bit++, v >>= 1) + ; + return bit; +#endif +} + +static inline unsigned ffs64(u64 v) +{ +#ifdef compiler_ffs64 + return compiler_ffs64(v); +#else + unsigned bit; + for (bit = 0; !(v & 1); bit++, v >>= 1) + ; + return bit; +#endif +} + +static inline unsigned ffsw(machine_word_t v) +{ + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + if (WORDSIZE == 4) + return ffs32(v); + else + return ffs64(v); +} diff --git a/src/bt_matchfinder.h b/src/bt_matchfinder.h new file mode 100644 index 0000000..b827044 --- /dev/null +++ b/src/bt_matchfinder.h @@ -0,0 +1,279 @@ +/* + * bt_matchfinder.h + * + * This is a Binary Tree (bt) based matchfinder. + * + * The data structure is a hash table where each hash bucket contains a binary + * tree of sequences, referenced by position. The sequences in the binary tree + * are ordered such that a left child is lexicographically lesser than its + * parent, and a right child is lexicographically greater than its parent. + * + * For each sequence (position) in the input, the first 3 bytes are hashed and + * the the appropriate binary tree is re-rooted at that sequence (position). + * Since the sequences are inserted in order, each binary tree maintains the + * invariant that each child node has greater match offset than its parent. + * + * While inserting a sequence, we may search the binary tree for matches with + * that sequence. At each step, the length of the match is computed. The + * search ends when the sequences get too far away (outside of the sliding + * window), or when the binary tree ends (in the code this is the same check as + * "too far away"), or when 'max_search_depth' positions have been searched, or + * when a match of at least 'nice_len' bytes has been found. + * + * Notes: + * + * - Typically, we need to search more nodes to find a given match in a + * binary tree versus in a linked list. However, a binary tree has more + * overhead than a linked list: it needs to be kept sorted, and the inner + * search loop is more complicated. 
As a result, binary trees are best + * suited for compression modes where the potential matches are searched + * more thoroughly. + * + * - Since no attempt is made to keep the binary trees balanced, it's + * essential to have the 'max_search_depth' cutoff. Otherwise it could + * take quadratic time to run data through the matchfinder. + */ + +#pragma once + +#include "lz_extend.h" +#include "lz_hash3.h" +#include "matchfinder_common.h" + +#ifndef BT_MATCHFINDER_HASH_ORDER +# if MATCHFINDER_WINDOW_ORDER < 14 +# define BT_MATCHFINDER_HASH_ORDER 14 +# else +# define BT_MATCHFINDER_HASH_ORDER 15 +# endif +#endif + +#define BT_MATCHFINDER_HASH_LENGTH (1UL << BT_MATCHFINDER_HASH_ORDER) + +#define BT_MATCHFINDER_TOTAL_LENGTH \ + (BT_MATCHFINDER_HASH_LENGTH + (2UL * MATCHFINDER_WINDOW_SIZE)) + +struct bt_matchfinder { + union { + pos_t mf_data[BT_MATCHFINDER_TOTAL_LENGTH]; + struct { + pos_t hash_tab[BT_MATCHFINDER_HASH_LENGTH]; + pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE]; + }; + }; +} _aligned_attribute(MATCHFINDER_ALIGNMENT); + +static inline void +bt_matchfinder_init(struct bt_matchfinder *mf) +{ + matchfinder_init(mf->hash_tab, BT_MATCHFINDER_HASH_LENGTH); +} + +#if MATCHFINDER_IS_SLIDING +static inline void +bt_matchfinder_slide_window(struct bt_matchfinder *mf) +{ + matchfinder_rebase(mf->mf_data, BT_MATCHFINDER_TOTAL_LENGTH); +} +#endif + +/* + * Find matches with the current sequence. + * + * @mf + * The matchfinder structure. + * @in_base + * Pointer to the next byte in the input buffer to process _at the last + * time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_. + * @in_next + * Pointer to the next byte in the input buffer to process. This is the + * pointer to the bytes being matched against. + * @max_len + * Maximum match length to return. + * @nice_len + * Stop searching if a match of at least this length is found. + * @max_search_depth + * Limit on the number of potential matches to consider. + * @prev_hash + * TODO + * @matches + * Space to write the matches that are found. + * + * Returns the number of matches found, which may be anywhere from 0 to + * (nice_len - 3 + 1), inclusively. The matches are written to @matches in + * order of strictly increasing length and strictly increasing offset. The + * minimum match length is assumed to be 3. 
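+ *
+ * Note on @prev_hash (as implemented below): on entry it must contain the
+ * hash of the 3 bytes at @in_next, computed by the previous call; this
+ * function replaces it with the hash of the 3 bytes at @in_next + 1 and
+ * prefetches the corresponding hash bucket, so that the bucket is likely to
+ * be in cache by the time the next call reads it.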
+ */ +static inline unsigned +bt_matchfinder_get_matches(struct bt_matchfinder * const restrict mf, + const u8 * const in_base, + const u8 * const in_next, + const unsigned max_len, + const unsigned nice_len, + const unsigned max_search_depth, + unsigned long *prev_hash, + struct lz_match * const restrict matches) +{ + struct lz_match *lz_matchptr = matches; + unsigned depth_remaining = max_search_depth; + unsigned hash; + pos_t cur_match; + const u8 *matchptr; + unsigned best_len; + pos_t *pending_lt_ptr, *pending_gt_ptr; + unsigned best_lt_len, best_gt_len; + unsigned len; + pos_t *children; + + if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES + 1)) + return 0; + + hash = *prev_hash; + *prev_hash = lz_hash3(in_next + 1, BT_MATCHFINDER_HASH_ORDER); + prefetch(&mf->hash_tab[*prev_hash]); + cur_match = mf->hash_tab[hash]; + mf->hash_tab[hash] = in_next - in_base; + + best_len = 2; + pending_lt_ptr = &mf->child_tab[(in_next - in_base) << 1]; + pending_gt_ptr = &mf->child_tab[((in_next - in_base) << 1) + 1]; + best_lt_len = 0; + best_gt_len = 0; + for (;;) { + if (!matchfinder_match_in_window(cur_match, + in_base, in_next) || + !depth_remaining--) + { + *pending_lt_ptr = MATCHFINDER_INITVAL; + *pending_gt_ptr = MATCHFINDER_INITVAL; + return lz_matchptr - matches; + } + + matchptr = &in_base[cur_match]; + len = min(best_lt_len, best_gt_len); + + children = &mf->child_tab[(unsigned long) + matchfinder_slot_for_match(cur_match) << 1]; + + if (matchptr[len] == in_next[len]) { + + len = lz_extend(in_next, matchptr, len + 1, max_len); + + if (len > best_len) { + best_len = len; + + lz_matchptr->length = len; + lz_matchptr->offset = in_next - matchptr; + lz_matchptr++; + + if (len >= nice_len) { + *pending_lt_ptr = children[0]; + *pending_gt_ptr = children[1]; + return lz_matchptr - matches; + } + } + } + + if (matchptr[len] < in_next[len]) { + *pending_lt_ptr = cur_match; + pending_lt_ptr = &children[1]; + cur_match = *pending_lt_ptr; + best_lt_len = len; + } else { + *pending_gt_ptr = cur_match; + pending_gt_ptr = &children[0]; + cur_match = *pending_gt_ptr; + best_gt_len = len; + } + } +} + +/* + * Advance the match-finder, but don't search for matches. + * + * @mf + * The matchfinder structure. + * @in_base + * Pointer to the next byte in the input buffer to process _at the last + * time bc_matchfinder_init() or bc_matchfinder_slide_window() was called_. + * @in_next + * Pointer to the next byte in the input buffer to process. + * @in_end + * Pointer to the end of the input buffer. + * @nice_len + * Stop searching if a match of at least this length is found. + * @max_search_depth + * Limit on the number of potential matches to consider. 
+ * @prev_hash + * TODO + */ +static inline void +bt_matchfinder_skip_position(struct bt_matchfinder * const restrict mf, + const u8 * const in_base, + const u8 * const in_next, + const u8 * const in_end, + const unsigned nice_len, + const unsigned max_search_depth, + unsigned long *prev_hash) +{ + unsigned depth_remaining = max_search_depth; + unsigned hash; + pos_t cur_match; + const u8 *matchptr; + pos_t *pending_lt_ptr, *pending_gt_ptr; + unsigned best_lt_len, best_gt_len; + unsigned len; + pos_t *children; + + if (unlikely(in_end - in_next < LZ_HASH_REQUIRED_NBYTES + 1)) + return; + + hash = *prev_hash; + *prev_hash = lz_hash3(in_next + 1, BT_MATCHFINDER_HASH_ORDER); + prefetch(&mf->hash_tab[*prev_hash]); + cur_match = mf->hash_tab[hash]; + mf->hash_tab[hash] = in_next - in_base; + + depth_remaining = max_search_depth; + pending_lt_ptr = &mf->child_tab[(in_next - in_base) << 1]; + pending_gt_ptr = &mf->child_tab[((in_next - in_base) << 1) + 1]; + best_lt_len = 0; + best_gt_len = 0; + for (;;) { + if (!matchfinder_match_in_window(cur_match, + in_base, in_next) || + !depth_remaining--) + { + *pending_lt_ptr = MATCHFINDER_INITVAL; + *pending_gt_ptr = MATCHFINDER_INITVAL; + return; + } + + matchptr = &in_base[cur_match]; + len = min(best_lt_len, best_gt_len); + + children = &mf->child_tab[(unsigned long) + matchfinder_slot_for_match(cur_match) << 1]; + + if (matchptr[len] == in_next[len]) { + len = lz_extend(in_next, matchptr, len + 1, nice_len); + if (len == nice_len) { + *pending_lt_ptr = children[0]; + *pending_gt_ptr = children[1]; + return; + } + } + + if (matchptr[len] < in_next[len]) { + *pending_lt_ptr = cur_match; + pending_lt_ptr = &children[1]; + cur_match = *pending_lt_ptr; + best_lt_len = len; + } else { + *pending_gt_ptr = cur_match; + pending_gt_ptr = &children[0]; + cur_match = *pending_gt_ptr; + best_gt_len = len; + } + } +} diff --git a/src/compiler-gcc.h b/src/compiler-gcc.h new file mode 100644 index 0000000..b9e1869 --- /dev/null +++ b/src/compiler-gcc.h @@ -0,0 +1,52 @@ +/* + * compiler-gcc.h + * + * Compiler and platform-specific definitions for the GNU C compiler. + */ + +#pragma once + +#ifdef __WIN32__ +# define LIBEXPORT __declspec(dllexport) +#else +# define LIBEXPORT __attribute__((visibility("default"))) +#endif + +#define likely(expr) __builtin_expect(!!(expr), 1) +#define unlikely(expr) __builtin_expect(!!(expr), 0) +#define prefetch(addr) __builtin_prefetch(addr) +#define inline inline __attribute__((always_inline)) +#define _aligned_attribute(n) __attribute__((aligned(n))) +#define _packed_attribute __attribute__((packed)) + +#define CPU_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + +#if defined(__x86_64__) || defined(__i386__) +# define UNALIGNED_ACCESS_SPEED 3 +#elif defined(__ARM_FEATURE_UNALIGNED) && (__ARM_FEATURE_UNALIGNED == 1) +# define UNALIGNED_ACCESS_SPEED 2 +#else +# define UNALIGNED_ACCESS_SPEED 0 +#endif + +#define min(a, b) ({ __typeof__(a) _a = (a); __typeof__(b) _b = (b); \ + (_a < _b) ? _a : _b; }) + +#define max(a, b) ({ __typeof__(a) _a = (a); __typeof__(b) _b = (b); \ + (_a > _b) ? 
_a : _b; }) + +#define swap(a, b) ({ __typeof__(a) _a = a; (a) = (b); (b) = _a; }) + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) +# define compiler_bswap32 __builtin_bswap32 +# define compiler_bswap64 __builtin_bswap64 +#endif + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +# define compiler_bswap16 __builtin_bswap16 +#endif + +#define compiler_fls32(n) (31 - __builtin_clz(n)) +#define compiler_fls64(n) (63 - __builtin_clzll(n)) +#define compiler_ffs32(n) __builtin_ctz(n) +#define compiler_ffs64(n) __builtin_ctzll(n) diff --git a/src/compiler.h b/src/compiler.h new file mode 100644 index 0000000..95d2bc3 --- /dev/null +++ b/src/compiler.h @@ -0,0 +1,60 @@ +/* + * compiler.h + * + * Compiler and platform-specific definitions. + */ + +#pragma once + +#ifdef __GNUC__ +# include "compiler-gcc.h" +#else +# warning "Unrecognized compiler. Please add a header file for your compiler." +#endif + +#ifndef LIBEXPORT +# define LIBEXPORT +#endif + +#ifndef BUILD_BUG_ON +# define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#endif + +#ifndef likely +# define likely(expr) (expr) +#endif + +#ifndef unlikely +# define unlikely(expr) (expr) +#endif + +#ifndef prefetch +# define prefetch(addr) +#endif + +#ifndef _aligned_attribute +# error "missing required definition of _aligned_attribute" +#endif + +#ifndef _packed_attribute +# error "missing required definition of _packed_attribute" +#endif + +#ifndef CPU_IS_BIG_ENDIAN +# error "missing required endianness definition" +#endif + +#define CPU_IS_LITTLE_ENDIAN (!CPU_IS_BIG_ENDIAN) + +#ifndef UNALIGNED_ACCESS_SPEED +# warning "assuming unaligned accesses are not allowed" +# define UNALIGNED_ACCESS_SPEED 0 +#endif + +#define UNALIGNED_ACCESS_IS_ALLOWED (UNALIGNED_ACCESS_SPEED >= 1) +#define UNALIGNED_ACCESS_IS_FAST (UNALIGNED_ACCESS_SPEED >= 2) +#define UNALIGNED_ACCESS_IS_VERY_FAST (UNALIGNED_ACCESS_SPEED >= 3) + +#if !defined(min) || !defined(max) || !defined(swap) +# error "missing required definitions of min(), max(), and swap() macros" +#endif diff --git a/src/crc32.c b/src/crc32.c new file mode 100644 index 0000000..94dc83c --- /dev/null +++ b/src/crc32.c @@ -0,0 +1,73 @@ +/* + * crc32.c + * + * CRC-32 checksum algorithm. 
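+ *
+ * This is the standard CRC-32 used by gzip: the IEEE polynomial in reflected
+ * form (0xEDB88320), computed one byte at a time via a 256-entry lookup
+ * table, with the CRC register initialized to all ones and inverted again at
+ * the end.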
+ */ + +#include "crc32.h" + +static const u32 crc_table[256] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, + 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, + 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, + 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, + 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, + 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, + 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, + 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, + 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, + 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, + 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, + 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, + 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, + 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, + 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, + 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, + 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, + 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, + 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, + 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, + 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, + 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, + 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c, + 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, + 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, + 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, + 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, + 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, + 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, + 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278, + 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, + 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, + 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, + 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, + 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, + 0x2d02ef8d, +}; + +u32 +crc32(const u8 *buffer, size_t size) +{ + u32 crc = ~0; + + for (size_t i = 0; i < size; i++) + crc = crc_table[(u8)crc ^ buffer[i]] ^ (crc >> 8); + + return ~crc; +} diff --git a/src/crc32.h b/src/crc32.h new file mode 100644 index 0000000..c8ee66c --- /dev/null +++ b/src/crc32.h @@ -0,0 +1,12 @@ 
+/* + * crc32.h + * + * CRC-32 checksum algorithm. + */ + +#pragma once + +#include "types.h" + +extern u32 +crc32(const u8 *buffer, size_t size); diff --git a/src/deflate_compress.c b/src/deflate_compress.c new file mode 100644 index 0000000..e566625 --- /dev/null +++ b/src/deflate_compress.c @@ -0,0 +1,2323 @@ +/* + * deflate_compress.c + */ + +#include +#include +#include + +#include "libdeflate.h" + +#include "deflate_compress.h" +#include "deflate_constants.h" +#include "unaligned.h" + +/* + * Note: when compiling this file, SUPPORT_NEAR_OPTIMAL_PARSING should be + * defined to either 0 or 1. When defined to 1, the near-optimal parsing + * algorithm is enabled at compression level 80 and above. The near-optimal + * parsing algorithm produces a compression ratio significantly better than the + * greedy and lazy algorithms implemented here, and also the algorithm used by + * zlib at level 9. However, it is slow. + */ +#ifndef SUPPORT_NEAR_OPTIMAL_PARSING +# define SUPPORT_NEAR_OPTIMAL_PARSING 0 +#endif + +/* + * Define to 1 to maintain the full map from match offsets to offset slots. + * This slightly speeds up translations of match offsets to offset slots, but it + * uses 32768 bytes of memory rather than the 512 bytes used by the condensed + * map. The speedup provided by the larger map is most helpful when the + * near-optimal parsing algorithm is being used. + */ +#define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING + +/* + * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters + * appropriately. + */ +#define MATCHFINDER_WINDOW_ORDER 15 +#define MATCHFINDER_IS_SLIDING 1 + +#include "hc_matchfinder.h" +#if SUPPORT_NEAR_OPTIMAL_PARSING +# include "bt_matchfinder.h" +#endif + +/* + * Number of literals+matches to output before starting new Huffman codes. + * + * This is just a heuristic, as there is no efficient algorithm for computing + * optimal block splitting in general. + * + * Note: a lower value than defined here usually results in a slightly better + * compression ratio, but creates more overhead in compression and + * decompression. + * + * This value is not used by the near-optimal parsing algorithm, which uses + * OPTIM_BLOCK_LENGTH instead. + */ +#define MAX_ITEMS_PER_BLOCK 16384 + +#if SUPPORT_NEAR_OPTIMAL_PARSING +/* Constants specific to the near-optimal parsing algorithm. */ + +/* The preferred DEFLATE block length in bytes. */ +# define OPTIM_BLOCK_LENGTH 16384 + +/* The maximum number of matches the matchfinder can find at a single position. + * Since the matchfinder never finds more than one match for the same length, + * presuming one of each possible length is sufficient for an upper bound. + * (This says nothing about whether it is worthwhile to consider so many + * matches; this is just defining the worst case.) */ +# define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) + +/* The number of array spaces to reserve for a single block's matches. This + * value should be high enough so that virtually the time, all matches found in + * OPTIM_BLOCK_LENGTH consecutive positions can fit in this array. However, + * this is *not* the true upper bound on the number of matches that can possibly + * be found. Therefore, checks for overflow are still required. 
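+ *
+ * With OPTIM_BLOCK_LENGTH == 16384 and the standard DEFLATE match length
+ * limits of 3 and 258 (so MAX_MATCHES_PER_POS == 256), this reserves
+ * 16384 * 8 + 257 = 131329 entries: an average budget of 8 cached matches
+ * per input position, plus space for one final worst-case position.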
*/ +# define CACHE_LEN ((OPTIM_BLOCK_LENGTH * 8) + (MAX_MATCHES_PER_POS + 1)) + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0])) + +/* Table: length slot => length slot base value */ +static const unsigned deflate_length_slot_base[] = { + 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , + 11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 , + 35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 , + 131 , 163 , 195 , 227 , 258 , +}; + +/* Table: length slot => number of extra length bits */ +static const u8 deflate_extra_length_bits[] = { + 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , + 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , + 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , + 5 , 5 , 5 , 5 , 0 , +}; + +/* Table: offset slot => offset slot base value */ +static const unsigned deflate_offset_slot_base[] = { + 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 , + 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 , + 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 , + 4097 , 6145 , 8193 , 12289 , 16385 , 24577 , +}; + +/* Table: offset slot => number of extra offset bits */ +static const u8 deflate_extra_offset_bits[] = { + 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 , + 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , + 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 , + 11 , 11 , 12 , 12 , 13 , 13 , +}; + +/* Codewords for the DEFLATE Huffman codes. */ +struct deflate_codewords { + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* Codeword lengths (in bits) for the DEFLATE Huffman codes. + * A zero length means the corresponding symbol had zero frequency. */ +struct deflate_lens { + union { + u8 all[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS]; + struct { + u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u8 offset[DEFLATE_NUM_OFFSET_SYMS]; + }; + }; +}; + +/* Codewords and lengths for the DEFLATE Huffman codes. */ +struct deflate_codes { + struct deflate_codewords codewords; + struct deflate_lens lens; +}; + +/* Symbol frequency counters for the DEFLATE Huffman codes. */ +struct deflate_freqs { + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* Costs for the near-optimal parsing algorithm. */ +struct deflate_costs { + + /* The cost to output each possible literal. */ + u32 literal[DEFLATE_NUM_LITERALS]; + + /* The cost to output each possible match length. */ + u32 length[DEFLATE_MAX_MATCH_LEN + 1]; + + /* The cost to output a match offset of each possible offset slot. */ + u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* + * COST_SHIFT is a scaling factor that makes it possible to consider fractional + * bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT. + * + * Note: this is only useful as a statistical trick for when the true costs are + * unknown. In reality, each token in DEFLATE requires a whole number of bits + * to output. + */ +#define COST_SHIFT 3 + +/* + * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to + * be needed to output a symbol that was unused in the previous optimization + * pass. Assigning a default cost allows the symbol to be used in the next + * optimization pass. However, the cost should be relatively high because the + * symbol probably won't be used very many times (if at all). 
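+ *
+ * (For reference, DEFLATE codewords are limited to 15 bits, so a 13-bit
+ * default treats an unused literal or length symbol as nearly as expensive
+ * as a real codeword can be.)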
+ */ +#define LITERAL_NOSTAT_BITS 13 +#define LENGTH_NOSTAT_BITS 13 +#define OFFSET_NOSTAT_BITS 10 + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/* An intermediate representation of a DEFLATE match or literal */ +struct deflate_item { + /* + * Bits 0 - 8: Literal/length symbol + * Bits 9 - 13: Extra length bits + * Bits 14 - 18: Offset symbol + * Bits 19 - 31: Extra offset bits + * + * Unfortunately, gcc generates worse code if we use real bitfields here. + */ + u32 data; +}; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* + * This structure represents a byte position in the input data and a node in the + * graph of possible match/literal choices for the current block. + * + * Logically, each incoming edge to this node is labeled with a literal or a + * match that can be taken to reach this position from an earlier position; and + * each outgoing edge from this node is labeled with a literal or a match that + * can be taken to advance from this position to a later position. + * + * But these "edges" are actually stored elsewhere (in 'cached_matches'). + * Here we associate with each node just two pieces of information: + * + * 'cost_to_end' is the minimum cost to reach the end of the block from + * this position. + * + * 'item' represents the literal or match that must be chosen from here to + * reach the end of the block with the minimum cost. Equivalently, this + * can be interpreted as the label of the outgoing edge on the minimum-cost + * path to the "end of block" node from this node. + */ +struct deflate_optimum_node { + + u32 cost_to_end; + + /* + * Notes on the match/literal representation used here: + * + * The low bits of 'item' are the length: 1 if this is a literal, + * or the match length if this is a match. + * + * The high bits of 'item' are the actual literal byte if this is a + * literal, or the match offset if this is a match. + */ +#define OPTIMUM_OFFSET_SHIFT 9 +#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) + u32 item; + +} _aligned_attribute(8); + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/* The main DEFLATE compressor structure */ +struct deflate_compressor { + + /* Pointer to the compress() implementation chosen at allocation time */ + size_t (*impl)(struct deflate_compressor *, + const u8 *, size_t, u8 *, size_t); + + /* Frequency counters for the current block */ + struct deflate_freqs freqs; + + /* Dynamic Huffman codes for the current block */ + struct deflate_codes codes; + + /* Static Huffman codes set at allocation time */ + struct deflate_codes static_codes; + + /* A table for fast lookups of length slot by length. The length slot + * for the match length 'len' is 'length_slot_fast[len]'. */ + u8 length_slot_fast[DEFLATE_MAX_MATCH_LEN + 1]; + + /* A table for fast lookups of offset slot by match offset. + * + * If the full table is being used, it is a direct mapping from offset + * to offset slot. + * + * If the condensed table is being used, the first 256 entries map + * directly to the offset slots of offsets 1 through 256. The next 256 + * entries map to the offset slots for the remaining offsets, stepping + * through the offsets with a stride of 128. This relies on the fact + * that each of the remaining offset slots contains at least 128 offsets + * and has an offset base that is a multiple of 128. 
*/ +#if USE_FULL_OFFSET_SLOT_FAST + u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1]; +#else + u8 offset_slot_fast[512]; +#endif + + /* The "nice" match length: if a match of this length is found, choose + * it immediately without further consideration. */ + unsigned nice_match_length; + + /* The maximum search depth: consider at most this many potential + * matches at each position. */ + unsigned max_search_depth; + + /* The compression level with which this compressor was created. */ + unsigned compression_level; + + union { + /* Data for greedy or lazy parsing */ + struct { + /* Hash chain matchfinder */ + struct hc_matchfinder hc_mf; + + /* The match/literal sequence for the current block */ + struct deflate_item chosen_items[MAX_ITEMS_PER_BLOCK]; + + u8 nonoptimal_end[0]; + }; + + #if SUPPORT_NEAR_OPTIMAL_PARSING + /* Data for near-optimal parsing */ + struct { + + /* Binary tree matchfinder */ + struct bt_matchfinder bt_mf; + + /* Matches found using the matchfinder are cached in + * this array so that later optimization of the block + * has the matches easily available. The cached matches + * are cleared when a new block is started. */ + struct lz_match cached_matches[CACHE_LEN]; + + /* Array of structures, one per position, for running + * the minimum-cost path algorithm. */ + struct deflate_optimum_node optimum[OPTIM_BLOCK_LENGTH + + 1 + DEFLATE_MAX_MATCH_LEN]; + + /* The current cost model being used. */ + struct deflate_costs costs; + + unsigned num_optim_passes; + + u8 optimal_end[0]; + }; + #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + }; +}; + +/* + * The type for the bitbuffer variable, which temporarily holds bits that are + * being packed into bytes and written to the output buffer. For best + * performance, this should have size equal to a machine word. + */ +typedef machine_word_t bitbuf_t; +#define BITBUF_NBITS (8 * sizeof(bitbuf_t)) + +/* + * Structure to keep track of the current state of sending bits to the + * compressed output buffer. + */ +struct deflate_output_bitstream { + + /* Bits that haven't yet been written to the output buffer. */ + bitbuf_t bitbuf; + + /* Number of bits currently held in @bitbuf. */ + int bitcount; + + /* Pointer to the start of the output buffer. */ + u8 *start; + + /* Pointer to the position in the output buffer at which the next byte + * should be written. */ + u8 *next; + + /* Pointer just past the end of the output buffer. */ + u8 *end; +}; + +/* Initialize the output bitstream. */ +static void +deflate_init_output(struct deflate_output_bitstream *os, void *buffer, size_t size) +{ + os->bitbuf = 0; + os->bitcount = 0; + os->start = buffer; + os->next = os->start; + os->end = os->start + size; +} + +/* Write some bits to the output bitstream. */ +static inline void +deflate_write_bits(struct deflate_output_bitstream *os, + const bitbuf_t bits, const unsigned num_bits) +{ + /* We only flush 'bitbuf' when it completely fills up. + * This improves performance. */ + + os->bitbuf |= bits << os->bitcount; + os->bitcount += num_bits; + if (os->bitcount >= BITBUF_NBITS) { + if (os->end - os->next >= sizeof(bitbuf_t)) { + put_unaligned_word_le(os->bitbuf, os->next); + os->next += sizeof(bitbuf_t); + } else { + os->next = os->end; + } + os->bitcount -= BITBUF_NBITS; + os->bitbuf = bits >> (num_bits - os->bitcount); + } +} + +/* + * Flush any remaining bits to the output buffer if needed. Return the total + * number of bytes written to the output buffer, or 0 if an overflow occurred. 
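+ *
+ * Note that deflate_write_bits() never writes past the end of the buffer: on
+ * overflow it just sets 'next' to 'end', which is what this function detects
+ * and reports as a 0 return.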
+ */ +static u32 +deflate_flush_output(struct deflate_output_bitstream *os) +{ + while (os->bitcount > 0) { + if (os->next != os->end) + *os->next++ = os->bitbuf; + os->bitcount -= 8; + os->bitbuf >>= 8; + } + + if (os->next == os->end) /* overflow? */ + return 0; + + return os->next - os->start; +} + +/* Given the binary tree node A[subtree_idx] whose children already + * satisfy the maxheap property, swap the node with its greater child + * until it is greater than both its children, so that the maxheap + * property is satisfied in the subtree rooted at A[subtree_idx]. */ +static void +heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) +{ + unsigned parent_idx; + unsigned child_idx; + u32 v; + + v = A[subtree_idx]; + parent_idx = subtree_idx; + while ((child_idx = parent_idx * 2) <= length) { + if (child_idx < length && A[child_idx + 1] > A[child_idx]) + child_idx++; + if (v >= A[child_idx]) + break; + A[parent_idx] = A[child_idx]; + parent_idx = child_idx; + } + A[parent_idx] = v; +} + +/* Rearrange the array 'A' so that it satisfies the maxheap property. + * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. + */ +static void +heapify_array(u32 A[], unsigned length) +{ + for (unsigned subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) + heapify_subtree(A, length, subtree_idx); +} + +/* Sort the array 'A', which contains 'length' unsigned 32-bit integers. */ +static void +heapsort(u32 A[], unsigned length) +{ + A--; /* Use 1-based indices */ + + heapify_array(A, length); + + while (length >= 2) { + swap(A[1], A[length]); + length--; + heapify_subtree(A, length, 1); + } +} + +#define NUM_SYMBOL_BITS 10 +#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) + +/* + * Sort the symbols primarily by frequency and secondarily by symbol + * value. Discard symbols with zero frequency and fill in an array with + * the remaining symbols, along with their frequencies. The low + * NUM_SYMBOL_BITS bits of each array entry will contain the symbol + * value, and the remaining bits will contain the frequency. + * + * @num_syms + * Number of symbols in the alphabet. + * Can't be greater than (1 << NUM_SYMBOL_BITS). + * + * @freqs[num_syms] + * The frequency of each symbol. + * + * @lens[num_syms] + * An array that eventually will hold the length of each codeword. + * This function only fills in the codeword lengths for symbols that + * have zero frequency, which are not well defined per se but will + * be set to 0. + * + * @symout[num_syms] + * The output array, described above. + * + * Returns the number of entries in 'symout' that were filled. This is + * the number of symbols that have nonzero frequency. + */ +static unsigned +sort_symbols(unsigned num_syms, const u32 freqs[restrict], + u8 lens[restrict], u32 symout[restrict]) +{ + unsigned num_used_syms; + unsigned num_counters; + + /* We rely on heapsort, but with an added optimization. Since + * it's common for most symbol frequencies to be low, we first do + * a count sort using a limited number of counters. High + * frequencies will be counted in the last counter, and only they + * will be sorted with heapsort. + * + * Note: with more symbols, it is generally beneficial to have more + * counters. About 1 counter per 4 symbols seems fast. + * + * Note: I also tested radix sort, but even for large symbol + * counts (> 255) and frequencies bounded at 16 bits (enabling + * radix sort by just two base-256 digits), it didn't seem any + * faster than the method implemented here. 
+ * + * Note: I tested the optimized quicksort implementation from + * glibc (with indirection overhead removed), but it was only + * marginally faster than the simple heapsort implemented here. + * + * Tests were done with building the codes for LZX. Results may + * vary for different compression algorithms...! */ + + num_counters = ((num_syms + 3 / 4) + 3) & ~3; + + unsigned counters[num_counters]; + + memset(counters, 0, sizeof(counters)); + + /* Count the frequencies. */ + for (unsigned sym = 0; sym < num_syms; sym++) + counters[min(freqs[sym], num_counters - 1)]++; + + /* Make the counters cumulative, ignoring the zero-th, which + * counted symbols with zero frequency. As a side effect, this + * calculates the number of symbols with nonzero frequency. */ + num_used_syms = 0; + for (unsigned i = 1; i < num_counters; i++) { + unsigned count = counters[i]; + counters[i] = num_used_syms; + num_used_syms += count; + } + + /* Sort nonzero-frequency symbols using the counters. At the + * same time, set the codeword lengths of zero-frequency symbols + * to 0. */ + for (unsigned sym = 0; sym < num_syms; sym++) { + u32 freq = freqs[sym]; + if (freq != 0) { + symout[counters[min(freq, num_counters - 1)]++] = + sym | (freq << NUM_SYMBOL_BITS); + } else { + lens[sym] = 0; + } + } + + /* Sort the symbols counted in the last counter. */ + heapsort(symout + counters[num_counters - 2], + counters[num_counters - 1] - counters[num_counters - 2]); + + return num_used_syms; +} + +/* + * Build the Huffman tree. + * + * This is an optimized implementation that + * (a) takes advantage of the frequencies being already sorted; + * (b) only generates non-leaf nodes, since the non-leaf nodes of a + * Huffman tree are sufficient to generate a canonical code; + * (c) Only stores parent pointers, not child pointers; + * (d) Produces the nodes in the same memory used for input + * frequency information. + * + * Array 'A', which contains 'sym_count' entries, is used for both input + * and output. For this function, 'sym_count' must be at least 2. + * + * For input, the array must contain the frequencies of the symbols, + * sorted in increasing order. Specifically, each entry must contain a + * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low + * NUM_SYMBOL_BITS bits of the entries will be ignored by this function. + * Although these bits will, in fact, contain the symbols that correspond + * to the frequencies, this function is concerned with frequencies only + * and keeps the symbols as-is. + * + * For output, this function will produce the non-leaf nodes of the + * Huffman tree. These nodes will be stored in the first (sym_count - 1) + * entries of the array. Entry A[sym_count - 2] will represent the root + * node. Each other node will contain the zero-based index of its parent + * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low + * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again, + * note that although these low bits will, in fact, contain a symbol + * value, this symbol will have *no relationship* with the Huffman tree + * node that happens to occupy the same slot. This is because this + * implementation only generates the non-leaf nodes of the tree. + */ +static void +build_tree(u32 A[], unsigned sym_count) +{ + /* Index, in 'A', of next lowest frequency symbol that has not + * yet been processed. */ + unsigned i = 0; + + /* Index, in 'A', of next lowest frequency parentless non-leaf + * node; or, if equal to 'e', then no such node exists yet. 
*/ + unsigned b = 0; + + /* Index, in 'A', of next node to allocate as a non-leaf. */ + unsigned e = 0; + + do { + unsigned m, n; + u32 freq_shifted; + + /* Choose the two next lowest frequency entries. */ + + if (i != sym_count && + (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) + m = i++; + else + m = b++; + + if (i != sym_count && + (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) + n = i++; + else + n = b++; + + /* Allocate a non-leaf node and link the entries to it. + * + * If we link an entry that we're visiting for the first + * time (via index 'i'), then we're actually linking a + * leaf node and it will have no effect, since the leaf + * will be overwritten with a non-leaf when index 'e' + * catches up to it. But it's not any slower to + * unconditionally set the parent index. + * + * We also compute the frequency of the non-leaf node as + * the sum of its two children's frequencies. */ + + freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK); + + A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); + A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); + A[e] = (A[e] & SYMBOL_MASK) | freq_shifted; + e++; + } while (sym_count - e > 1); + /* When just one entry remains, it is a "leaf" that was + * linked to some other node. We ignore it, since the + * rest of the array contains the non-leaves which we + * need. (Note that we're assuming the cases with 0 or 1 + * symbols were handled separately.) */ +} + +/* + * Given the stripped-down Huffman tree constructed by build_tree(), + * determine the number of codewords that should be assigned each + * possible length, taking into account the length-limited constraint. + * + * @A + * The array produced by build_tree(), containing parent index + * information for the non-leaf nodes of the Huffman tree. Each + * entry in this array is a node; a node's parent always has a + * greater index than that node itself. This function will + * overwrite the parent index information in this array, so + * essentially it will destroy the tree. However, the data in the + * low NUM_SYMBOL_BITS of each entry will be preserved. + * + * @root_idx + * The 0-based index of the root node in 'A', and consequently one + * less than the number of tree node entries in 'A'. (Or, really 2 + * less than the actual length of 'A'.) + * + * @len_counts + * An array of length ('max_codeword_len' + 1) in which the number of + * codewords having each length <= max_codeword_len will be + * returned. + * + * @max_codeword_len + * The maximum permissible codeword length. + */ +static void +compute_length_counts(u32 A[restrict], unsigned root_idx, + unsigned len_counts[restrict], unsigned max_codeword_len) +{ + /* The key observations are: + * + * (1) We can traverse the non-leaf nodes of the tree, always + * visiting a parent before its children, by simply iterating + * through the array in reverse order. Consequently, we can + * compute the depth of each node in one pass, overwriting the + * parent indices with depths. + * + * (2) We can initially assume that in the real Huffman tree, + * both children of the root are leaves. This corresponds to two + * codewords of length 1. Then, whenever we visit a (non-leaf) + * node during the traversal, we modify this assumption to + * account for the current node *not* being a leaf, but rather + * its two children being leaves. This causes the loss of one + * codeword for the current depth and the addition of two + * codewords for the current depth plus one. 
+ * + * (3) We can handle the length-limited constraint fairly easily + * by simply using the largest length available when a depth + * exceeds max_codeword_len. + */ + + for (unsigned len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + len_counts[1] = 2; + + /* Set the root node's depth to 0. */ + A[root_idx] &= SYMBOL_MASK; + + for (int node = root_idx - 1; node >= 0; node--) { + + /* Calculate the depth of this node. */ + + unsigned parent = A[node] >> NUM_SYMBOL_BITS; + unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; + unsigned depth = parent_depth + 1; + unsigned len = depth; + + /* Set the depth of this node so that it is available + * when its children (if any) are processed. */ + + A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); + + /* If needed, decrease the length to meet the + * length-limited constraint. This is not the optimal + * method for generating length-limited Huffman codes! + * But it should be good enough. */ + if (len >= max_codeword_len) { + len = max_codeword_len; + do { + len--; + } while (len_counts[len] == 0); + } + + /* Account for the fact that we have a non-leaf node at + * the current depth. */ + len_counts[len]--; + len_counts[len + 1] += 2; + } +} + +/* + * Generate the codewords for a canonical Huffman code. + * + * @A + * The output array for codewords. In addition, initially this + * array must contain the symbols, sorted primarily by frequency and + * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of + * each entry. + * + * @len + * Output array for codeword lengths. + * + * @len_counts + * An array that provides the number of codewords that will have + * each possible length <= max_codeword_len. + * + * @max_codeword_len + * Maximum length, in bits, of each codeword. + * + * @num_syms + * Number of symbols in the alphabet, including symbols with zero + * frequency. This is the length of the 'A' and 'len' arrays. + */ +static void +gen_codewords(u32 A[restrict], u8 lens[restrict], + const unsigned len_counts[restrict], + unsigned max_codeword_len, unsigned num_syms) +{ + u32 next_codewords[max_codeword_len + 1]; + + /* Given the number of codewords that will have each length, + * assign codeword lengths to symbols. We do this by assigning + * the lengths in decreasing order to the symbols sorted + * primarily by increasing frequency and secondarily by + * increasing symbol value. */ + for (unsigned i = 0, len = max_codeword_len; len >= 1; len--) { + unsigned count = len_counts[len]; + while (count--) + lens[A[i++] & SYMBOL_MASK] = len; + } + + /* Generate the codewords themselves. We initialize the + * 'next_codewords' array to provide the lexicographically first + * codeword of each length, then assign codewords in symbol + * order. This produces a canonical code. */ + next_codewords[0] = 0; + next_codewords[1] = 0; + for (unsigned len = 2; len <= max_codeword_len; len++) + next_codewords[len] = + (next_codewords[len - 1] + len_counts[len - 1]) << 1; + + for (unsigned sym = 0; sym < num_syms; sym++) + A[sym] = next_codewords[lens[sym]]++; +} + +/* + * --------------------------------------------------------------------- + * make_canonical_huffman_code() + * --------------------------------------------------------------------- + * + * Given an alphabet and the frequency of each symbol in it, construct a + * length-limited canonical Huffman code. + * + * @num_syms + * The number of symbols in the alphabet. The symbols are the + * integers in the range [0, num_syms - 1]. 
This parameter must be + * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS). + * + * @max_codeword_len + * The maximum permissible codeword length. + * + * @freqs + * An array of @num_syms entries, each of which specifies the + * frequency of the corresponding symbol. It is valid for some, + * none, or all of the frequencies to be 0. + * + * @lens + * An array of @num_syms entries in which this function will return + * the length, in bits, of the codeword assigned to each symbol. + * Symbols with 0 frequency will not have codewords per se, but + * their entries in this array will be set to 0. No lengths greater + * than @max_codeword_len will be assigned. + * + * @codewords + * An array of @num_syms entries in which this function will return + * the codeword for each symbol, right-justified and padded on the + * left with zeroes. Codewords for symbols with 0 frequency will be + * undefined. + * + * --------------------------------------------------------------------- + * + * This function builds a length-limited canonical Huffman code. + * + * A length-limited Huffman code contains no codewords longer than some + * specified length, and has exactly (with some algorithms) or + * approximately (with the algorithm used here) the minimum weighted path + * length from the root, given this constraint. + * + * A canonical Huffman code satisfies the properties that a longer + * codeword never lexicographically precedes a shorter codeword, and the + * lexicographic ordering of codewords of the same length is the same as + * the lexicographic ordering of the corresponding symbols. A canonical + * Huffman code, or more generally a canonical prefix code, can be + * reconstructed from only a list containing the codeword length of each + * symbol. + * + * The classic algorithm to generate a Huffman code creates a node for + * each symbol, then inserts these nodes into a min-heap keyed by symbol + * frequency. Then, repeatedly, the two lowest-frequency nodes are + * removed from the min-heap and added as the children of a new node + * having frequency equal to the sum of its two children, which is then + * inserted into the min-heap. When only a single node remains in the + * min-heap, it is the root of the Huffman tree. The codeword for each + * symbol is determined by the path needed to reach the corresponding + * node from the root. Descending to the left child appends a 0 bit, + * whereas descending to the right child appends a 1 bit. + * + * The classic algorithm is relatively easy to understand, but it is + * subject to a number of inefficiencies. In practice, it is fastest to + * first sort the symbols by frequency. (This itself can be subject to + * an optimization based on the fact that most frequencies tend to be + * low.) At the same time, we sort secondarily by symbol value, which + * aids the process of generating a canonical code. Then, during tree + * construction, no heap is necessary because both the leaf nodes and the + * unparented non-leaf nodes can be easily maintained in sorted order. + * Consequently, there can never be more than two possibilities for the + * next-lowest-frequency node. + * + * In addition, because we're generating a canonical code, we actually + * don't need the leaf nodes of the tree at all, only the non-leaf nodes. + * This is because for canonical code generation we don't need to know + * where the symbols are in the tree. Rather, we only need to know how + * many leaf nodes have each depth (codeword length). 
And this + * information can, in fact, be quickly generated from the tree of + * non-leaves only. + * + * Furthermore, we can build this stripped-down Huffman tree directly in + * the array in which the codewords are to be generated, provided that + * these array slots are large enough to hold a symbol and frequency + * value. + * + * Still furthermore, we don't even need to maintain explicit child + * pointers. We only need the parent pointers, and even those can be + * overwritten in-place with depth information as part of the process of + * extracting codeword lengths from the tree. So in summary, we do NOT + * need a big structure like: + * + * struct huffman_tree_node { + * unsigned int symbol; + * unsigned int frequency; + * unsigned int depth; + * struct huffman_tree_node *left_child; + * struct huffman_tree_node *right_child; + * }; + * + * + * ... which often gets used in "naive" implementations of Huffman code + * generation. + * + * Many of these optimizations are based on the implementation in 7-Zip + * (source file: C/HuffEnc.c), which has been placed in the public domain + * by Igor Pavlov. + */ +static void +make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const u32 freqs[restrict], + u8 lens[restrict], u32 codewords[restrict]) +{ + u32 *A = codewords; + unsigned num_used_syms; + + /* Assumptions */ + assert(num_syms >= 2); + assert(num_syms <= (1 << NUM_SYMBOL_BITS)); + assert((1ULL << max_codeword_len) >= num_syms); + assert(max_codeword_len <= 32); + + /* We begin by sorting the symbols primarily by frequency and + * secondarily by symbol value. As an optimization, the array + * used for this purpose ('A') shares storage with the space in + * which we will eventually return the codewords. */ + + num_used_syms = sort_symbols(num_syms, freqs, lens, A); + + /* 'num_used_syms' is the number of symbols with nonzero + * frequency. This may be less than @num_syms. 'num_used_syms' + * is also the number of entries in 'A' that are valid. Each + * entry consists of a distinct symbol and a nonzero frequency + * packed into a 32-bit integer. */ + + /* Handle special cases where only 0 or 1 symbols were used (had + * nonzero frequency). */ + + if (unlikely(num_used_syms == 0)) { + /* Code is empty. sort_symbols() already set all lengths + * to 0, so there is nothing more to do. */ + return; + } + + if (unlikely(num_used_syms == 1)) { + /* Only one symbol was used, so we only need one + * codeword. But two codewords are needed to form the + * smallest complete Huffman code, which uses codewords 0 + * and 1. Therefore, we choose another symbol to which + * to assign a codeword. We use 0 (if the used symbol is + * not 0) or 1 (if the used symbol is 0). In either + * case, the lesser-valued symbol must be assigned + * codeword 0 so that the resulting code is canonical. */ + + unsigned sym = A[0] & SYMBOL_MASK; + unsigned nonzero_idx = sym ? sym : 1; + + codewords[0] = 0; + lens[0] = 1; + codewords[nonzero_idx] = 1; + lens[nonzero_idx] = 1; + return; + } + + /* Build a stripped-down version of the Huffman tree, sharing the + * array 'A' with the symbol values. Then extract length counts + * from the tree and use them to generate the final codewords. */ + + build_tree(A, num_used_syms); + + { + unsigned len_counts[max_codeword_len + 1]; + + compute_length_counts(A, num_used_syms - 2, + len_counts, max_codeword_len); + + gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); + } +} + +/* + * Clear the Huffman symbol frequency counters. 
+ * This must be called when starting a new DEFLATE block. + */ +static void +deflate_reset_symbol_frequencies(struct deflate_compressor *c) +{ + memset(&c->freqs, 0, sizeof(c->freqs)); +} + +/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */ +static u32 +deflate_reverse_codeword(u32 codeword, u8 len) +{ + u32 codeword_reversed = 0; + + for (int bit = (int)len - 1; bit >= 0; bit--) + codeword_reversed |= ((codeword >> bit) & 1) << (len - 1 - bit); + + return codeword_reversed; +} + +/* Make a canonical Huffman code with bit-reversed codewords. */ +static void +deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const u32 freqs[], u8 lens[], u32 codewords[]) +{ + make_canonical_huffman_code(num_syms, max_codeword_len, + freqs, lens, codewords); + + for (unsigned i = 0; i < num_syms; i++) + codewords[i] = deflate_reverse_codeword(codewords[i], lens[i]); +} + +/* + * Build the literal/length and offset Huffman codes for a DEFLATE block. + * + * This takes as input the frequency tables for each code and produces as output + * a set of tables that map symbols to codewords and codeword lengths. + */ +static void +deflate_make_huffman_codes(const struct deflate_freqs *freqs, + struct deflate_codes *codes) +{ + deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + freqs->litlen, + codes->lens.litlen, + codes->codewords.litlen); + + deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, + DEFLATE_MAX_OFFSET_CODEWORD_LEN, + freqs->offset, + codes->lens.offset, + codes->codewords.offset); +} + +/* Write the header fields common to all DEFLATE block types. */ +static void +deflate_write_block_header(struct deflate_output_bitstream *os, + bool is_final_block, unsigned block_type) +{ + deflate_write_bits(os, is_final_block, 1); + deflate_write_bits(os, block_type, 2); +} + +static unsigned +deflate_compute_precode_items(const u8 lens[restrict], + const unsigned num_lens, + u32 precode_freqs[restrict], + unsigned precode_items[restrict]) +{ + unsigned *itemptr; + unsigned run_start; + unsigned run_end; + unsigned extra_bits; + u8 len; + + itemptr = precode_items; + run_start = 0; + do { + /* Find the next run of codeword lengths. */ + + /* len = the length being repeated */ + len = lens[run_start]; + + /* Extend the run. */ + run_end = run_start; + do { + run_end++; + } while (run_end != num_lens && len == lens[run_end]); + + if (len == 0) { + /* Run of zeroes. */ + + /* Symbol 18: RLE 11 to 138 zeroes at a time. */ + while ((run_end - run_start) >= 11) { + extra_bits = min((run_end - run_start) - 11, 0x7F); + precode_freqs[18]++; + *itemptr++ = 18 | (extra_bits << 5); + run_start += 11 + extra_bits; + } + + /* Symbol 17: RLE 3 to 10 zeroes at a time. */ + if ((run_end - run_start) >= 3) { + extra_bits = min((run_end - run_start) - 3, 0x7); + precode_freqs[17]++; + *itemptr++ = 17 | (extra_bits << 5); + run_start += 3 + extra_bits; + } + } else { + + /* A run of nonzero lengths. */ + + /* Symbol 16: RLE 3 to 6 of the previous length. */ + if ((run_end - run_start) >= 4) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + do { + extra_bits = min((run_end - run_start) - 3, 0x3); + precode_freqs[16]++; + *itemptr++ = 16 | (extra_bits << 5); + run_start += 3 + extra_bits; + } while ((run_end - run_start) >= 3); + } + } + + /* Output any remaining lengths without RLE. 
*/ + while (run_start != run_end) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + } + } while (run_start != num_lens); + + return itemptr - precode_items; +} + +/* + * Output a list of Huffman codeword lengths in compressed form. + * + * The codeword lengths are compressed using a separate Huffman code, the + * "precode", which contains a symbol for each possible codeword length in the + * larger code as well as several special symbols to represent repeated codeword + * lengths (a form of run-length encoding). The precode is itself constructed + * in canonical form, and its codeword lengths are represented literally in 19 + * 3-bit fields that immediately precede the compressed codeword lengths of the + * larger code. + */ +static void +deflate_write_compressed_lens(struct deflate_output_bitstream *os, + const u8 lens[], unsigned num_lens) +{ + u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS]; + u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; + u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS]; + unsigned precode_items[num_lens]; + unsigned num_precode_items; + unsigned precode_item; + unsigned precode_sym; + unsigned num_explicit_lens; + unsigned i; + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + for (i = 0; i < DEFLATE_NUM_PRECODE_SYMS; i++) + precode_freqs[i] = 0; + + /* Compute the "items" (RLE / literal tokens and extra bits) with which + * the codeword lengths in the larger code will be output. */ + num_precode_items = deflate_compute_precode_items(lens, + num_lens, + precode_freqs, + precode_items); + + /* Build the precode. */ + deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, + DEFLATE_MAX_PRE_CODEWORD_LEN, + precode_freqs, precode_lens, + precode_codewords); + + /* Count how many precode lengths we actually need to output. */ + for (num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; + num_explicit_lens > 4; + num_explicit_lens--) + if (precode_lens[deflate_precode_lens_permutation[num_explicit_lens - 1]] != 0) + break; + + deflate_write_bits(os, num_explicit_lens - 4, 4); + + /* Output the lengths of the codewords in the precode. */ + for (i = 0; i < num_explicit_lens; i++) + deflate_write_bits(os, precode_lens[deflate_precode_lens_permutation[i]], 3); + + /* Output the encoded lengths of the codewords in the larger code. */ + for (i = 0; i < num_precode_items; i++) { + precode_item = precode_items[i]; + precode_sym = precode_item & 0x1F; + deflate_write_bits(os, precode_codewords[precode_sym], + precode_lens[precode_sym]); + if (precode_sym >= 16) { + if (precode_sym == 16) + deflate_write_bits(os, precode_item >> 5, 2); + else if (precode_sym == 17) + deflate_write_bits(os, precode_item >> 5, 3); + else + deflate_write_bits(os, precode_item >> 5, 7); + } + } +} + +/* + * Output the specified Huffman codes. + * This is used for dynamic Huffman blocks. + */ +static void +deflate_write_huffman_codes(struct deflate_output_bitstream *os, + struct deflate_codes *codes) +{ + unsigned num_litlen_syms; + unsigned num_offset_syms; + + /* We only need to output up to the highest-valued symbol actually used. 
*/ + + for (num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; + num_litlen_syms > 257; + num_litlen_syms--) + if (codes->lens.litlen[num_litlen_syms - 1] != 0) + break; + + for (num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; + num_offset_syms > 1; + num_offset_syms--) + if (codes->lens.offset[num_offset_syms - 1] != 0) + break; + + deflate_write_bits(os, num_litlen_syms - 257, 5); + deflate_write_bits(os, num_offset_syms - 1, 5); + + /* If we're not outputting the full set of literal/length codeword + * lengths, temporarily move the offset codeword lengths over so that + * the literal/length and offset codeword lengths are contiguous. */ + + BUILD_BUG_ON(offsetof(struct deflate_lens, offset) != + DEFLATE_NUM_LITLEN_SYMS); + + if (num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) + memmove(&codes->lens.all[num_litlen_syms], + &codes->lens.all[DEFLATE_NUM_LITLEN_SYMS], + num_offset_syms * sizeof(codes->lens.all[0])); + + /* Output the codeword lengths. */ + + deflate_write_compressed_lens(os, codes->lens.all, + num_litlen_syms + num_offset_syms); + + /* Restore the offset codeword lengths if needed. */ + if (num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) + memmove(&codes->lens.all[DEFLATE_NUM_LITLEN_SYMS], + &codes->lens.all[num_litlen_syms], + num_offset_syms * sizeof(codes->lens.all[0])); +} + +/* Output a literal or match, encoded using the specified Huffman codes. */ +static inline void +deflate_write_item(struct deflate_output_bitstream *os, struct deflate_item item, + const struct deflate_codes *codes) +{ + u32 data = item.data; + u32 litlen_symbol = data & 0x1FF; + u32 offset_symbol; + + /* Literal/length symbol */ + deflate_write_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + + if (data < 256) /* Literal? */ + return; + + /* Match length */ + deflate_write_bits(os, (data >> 9) & 0x1F, + deflate_extra_length_bits[litlen_symbol - 257]); + + /* Match offset */ + offset_symbol = (data >> 14) & 0x1F; + deflate_write_bits(os, + codes->codewords.offset[offset_symbol], + codes->lens.offset[offset_symbol]); + deflate_write_bits(os, (data >> 19), + deflate_extra_offset_bits[offset_symbol]); +} + +/* Output the literals and matches for a block. */ +static void +deflate_write_items(struct deflate_output_bitstream *os, + const struct deflate_item items[], unsigned num_items, + const struct deflate_codes *codes) +{ + for (unsigned i = 0; i < num_items; i++) + deflate_write_item(os, items[i], codes); +} + +/* Output the end-of-block symbol. */ +static void +deflate_write_end_of_block(struct deflate_output_bitstream *os, + const struct deflate_codes *codes) +{ + deflate_write_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK], + codes->lens.litlen[DEFLATE_END_OF_BLOCK]); +} + +/* + * Output a block containing the literal/match "items" stored in + * c->chosen_items...next_chosen_item. + */ +static void +deflate_write_block(struct deflate_compressor *c, + struct deflate_item *next_chosen_item, + struct deflate_output_bitstream *os, + bool is_final_block) +{ + unsigned num_chosen_items = next_chosen_item - c->chosen_items; + struct deflate_codes *codes; + + /* Note: we don't currently output any uncompressed blocks. */ + + /* Account for end-of-block symbol */ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + + if (num_chosen_items >= 100) { + /* Use custom ("dynamic") Huffman codes. 
*/ + deflate_write_block_header(os, is_final_block, + DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN); + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_write_huffman_codes(os, &c->codes); + codes = &c->codes; + } else { + /* This is a very short block. Just use the static codes. */ + deflate_write_block_header(os, is_final_block, + DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + codes = &c->static_codes; + } + deflate_write_items(os, c->chosen_items, num_chosen_items, codes); + deflate_write_end_of_block(os, codes); + + /* Reset symbol frequencies if this wasn't the final block. */ + if (!is_final_block) + deflate_reset_symbol_frequencies(c); +} + +/* Return the length slot for the specified match length. */ +static inline unsigned +deflate_get_length_slot(struct deflate_compressor *c, unsigned length) +{ + return c->length_slot_fast[length]; +} + +/* Return the offset slot for the specified match offset. */ +static inline unsigned +deflate_get_offset_slot(struct deflate_compressor *c, unsigned offset) +{ +#if USE_FULL_OFFSET_SLOT_FAST + return c->offset_slot_fast[offset]; +#else + if (offset <= 256) + return c->offset_slot_fast[offset - 1]; + else + return c->offset_slot_fast[256 + ((offset - 1) >> 7)]; +#endif +} + +/* Tally the Huffman symbol frequencies needed to output a literal, and return + * the literal in intermediate form. */ +static inline struct deflate_item +deflate_choose_literal(struct deflate_compressor *c, unsigned literal) +{ + c->freqs.litlen[literal]++; + return (struct deflate_item) { .data = literal }; +} + +/* Tally the Huffman symbol frequencies needed to output a match, and return the + * match in intermediate form. */ +static inline struct deflate_item +deflate_choose_match(struct deflate_compressor *c, unsigned length, unsigned offset) +{ + unsigned length_slot = deflate_get_length_slot(c, length); + unsigned offset_slot = deflate_get_offset_slot(c, offset); + + c->freqs.litlen[257 + length_slot]++; + c->freqs.offset[offset_slot]++; + + return (struct deflate_item) { + .data = (257 + length_slot) | + ((u32)(length - deflate_length_slot_base[length_slot]) << 9) | + ((u32)offset_slot << 14) | + ((u32)(offset - deflate_offset_slot_base[offset_slot]) << 19), + }; +} + +/* + * This is the "greedy" DEFLATE compressor. It always chooses the longest match. + */ +static size_t +deflate_compress_greedy(struct deflate_compressor * restrict c, + const u8 * restrict in, size_t in_nbytes, + u8 * restrict out, size_t out_nbytes_avail) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + struct deflate_output_bitstream os; + struct deflate_item *next_chosen_item; + + deflate_init_output(&os, out, out_nbytes_avail); + next_chosen_item = c->chosen_items; + deflate_reset_symbol_frequencies(c); + + /* The outer loop repeats every WINDOW_SIZE bytes and handles the + * sliding window. 
*/ + do { + const u8 *in_cur_base; + const u8 *in_cur_end; + + if (in == in_next) + hc_matchfinder_init(&c->hc_mf); + else + hc_matchfinder_slide_window(&c->hc_mf); + + in_cur_base = in_next; + in_cur_end = in_next + min(in_end - in_next, + MATCHFINDER_WINDOW_SIZE); + do { + unsigned max_len; + unsigned nice_len; + unsigned length; + unsigned offset; + + max_len = min(in_cur_end - in_next, DEFLATE_MAX_MATCH_LEN); + nice_len = min(max_len, c->nice_match_length); + + length = hc_matchfinder_longest_match(&c->hc_mf, + in_cur_base, + in_next, + DEFLATE_MIN_MATCH_LEN - 1, + max_len, + nice_len, + c->max_search_depth, + &offset); + in_next += 1; + + if (length >= DEFLATE_MIN_MATCH_LEN) { + /* Match found. */ + *next_chosen_item++ = + deflate_choose_match(c, length, offset); + hc_matchfinder_skip_positions(&c->hc_mf, + in_cur_base, + in_next, + in_end, + length - 1); + in_next += length - 1; + } else { + /* No match found. */ + *next_chosen_item++ = + deflate_choose_literal(c, *(in_next - 1)); + } + + /* Check if it's time to output another block. */ + if (next_chosen_item - c->chosen_items >= + MAX_ITEMS_PER_BLOCK) + { + deflate_write_block(c, next_chosen_item, + &os, in_next == in_end); + next_chosen_item = c->chosen_items; + } + + } while (in_next != in_cur_end); + + } while (in_next != in_end); + + /* Output the last block. */ + if (next_chosen_item != c->chosen_items) + deflate_write_block(c, next_chosen_item, &os, true); + + return deflate_flush_output(&os); +} + +/* + * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to + * see if there's a longer match at the next position. If yes, it outputs a + * literal and continues to the next position. If no, it outputs the match. + */ +static size_t +deflate_compress_lazy(struct deflate_compressor * restrict c, + const u8 * restrict in, size_t in_nbytes, + u8 * restrict out, size_t out_nbytes_avail) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + struct deflate_output_bitstream os; + struct deflate_item *next_chosen_item; + struct deflate_item *end_block = c->chosen_items + MAX_ITEMS_PER_BLOCK; + + deflate_init_output(&os, out, out_nbytes_avail); + next_chosen_item = c->chosen_items; + deflate_reset_symbol_frequencies(c); + + /* The outer loop repeats every WINDOW_SIZE bytes and handles the + * sliding window. */ + do { + const u8 *in_cur_base; + const u8 *in_cur_end; + unsigned max_len; + unsigned nice_len; + + if (in == in_next) + hc_matchfinder_init(&c->hc_mf); + else + hc_matchfinder_slide_window(&c->hc_mf); + + in_cur_base = in_next; + in_cur_end = in_next + min(in_end - in_next, + MATCHFINDER_WINDOW_SIZE); + max_len = DEFLATE_MAX_MATCH_LEN; + nice_len = min(c->nice_match_length, max_len); + do { + unsigned cur_len; + unsigned cur_offset; + unsigned next_len; + unsigned next_offset; + + if (unlikely(in_cur_end - in_next < DEFLATE_MAX_MATCH_LEN)) { + max_len = in_cur_end - in_next; + nice_len = min(max_len, nice_len); + } + + /* Find the longest match at the current position. */ + cur_len = hc_matchfinder_longest_match(&c->hc_mf, + in_cur_base, + in_next, + DEFLATE_MIN_MATCH_LEN - 1, + max_len, + nice_len, + c->max_search_depth, + &cur_offset); + in_next += 1; + + if (cur_len < DEFLATE_MIN_MATCH_LEN) { + /* No match found. Choose a literal. */ + *next_chosen_item++ = + deflate_choose_literal(c, *(in_next - 1)); + goto check_block_and_continue; + } + + have_cur_match: + /* We have a match at the current position. */ + + /* If the current match is very long, choose it + * immediately. 
*/ + if (cur_len >= nice_len) { + *next_chosen_item++ = + deflate_choose_match(c, cur_len, cur_offset); + + hc_matchfinder_skip_positions(&c->hc_mf, + in_cur_base, + in_next, + in_end, + cur_len - 1); + in_next += cur_len - 1; + goto check_block_and_continue; + } + + /* + * Try to find a match at the next position. + * + * Note: since we already have a match at the *current* + * position, we use only half the 'max_search_depth' + * when checking the *next* position. This is a useful + * trade-off because it's more worthwhile to use a + * greater search depth on the initial match. + * + * Note: it's possible to structure the code such that + * there's only one call to longest_match(), which + * handles both the "find the initial match" and "try to + * find a longer match" cases. However, it is faster to + * have two call sites, with longest_match() inlined at + * each. + */ + if (unlikely(in_cur_end - in_next < DEFLATE_MAX_MATCH_LEN)) { + max_len = in_cur_end - in_next; + nice_len = min(max_len, nice_len); + } + next_len = hc_matchfinder_longest_match(&c->hc_mf, + in_cur_base, + in_next, + cur_len, + max_len, + nice_len, + c->max_search_depth / 2, + &next_offset); + in_next += 1; + + if (next_len > cur_len) { + /* Found a longer match at the next position. + * Output a literal. Then the next match + * becomes the current match. */ + *next_chosen_item++ = + deflate_choose_literal(c, *(in_next - 2)); + if (next_chosen_item == end_block) { + deflate_write_block(c, next_chosen_item, + &os, in_next == in_end); + next_chosen_item = c->chosen_items; + } + cur_len = next_len; + cur_offset = next_offset; + goto have_cur_match; + } else { + /* No longer match at the next position. + * Output the current match. */ + *next_chosen_item++ = + deflate_choose_match(c, cur_len, cur_offset); + + hc_matchfinder_skip_positions(&c->hc_mf, + in_cur_base, + in_next, + in_end, + cur_len - 2); + in_next += cur_len - 2; + goto check_block_and_continue; + } + + check_block_and_continue: + /* Check if it's time to output another block. */ + if (next_chosen_item == end_block) { + deflate_write_block(c, next_chosen_item, + &os, in_next == in_end); + next_chosen_item = c->chosen_items; + } + } while (in_next != in_cur_end); + + } while (in_next != in_end); + + /* Output the last block. */ + if (next_chosen_item != c->chosen_items) + deflate_write_block(c, next_chosen_item, &os, true); + + return deflate_flush_output(&os); +} + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* + * Follow the minimum-cost path in the graph of possible match/literal choices + * for the current block and compute the frequencies of the Huffman symbols that + * are needed to output those matches and literals. + */ +static void +deflate_tally_item_list(struct deflate_compressor *c, + struct deflate_optimum_node *end_optimum_ptr) +{ + struct deflate_optimum_node *cur_optimum_ptr = c->optimum; + do { + unsigned length = cur_optimum_ptr->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_optimum_ptr->item >> OPTIMUM_OFFSET_SHIFT; + + if (length == 1) { + /* Literal */ + c->freqs.litlen[offset]++; + } else { + /* Match */ + c->freqs.litlen[257 + deflate_get_length_slot(c, length)]++; + c->freqs.offset[deflate_get_offset_slot(c, offset)]++; + } + cur_optimum_ptr += length; + } while (cur_optimum_ptr != end_optimum_ptr); +} + +/* + * Follow the minimum-cost path in the graph of possible match/literal choices + * for the current block and write out the matches/literals using the specified + * Huffman codes. 
+ * + * Note: this is slightly duplicated with deflate_write_item(), the reason being + * that we don't want to waste time translating between intermediate + * match/literal representations. + */ +static void +deflate_write_item_list(struct deflate_output_bitstream *os, + const struct deflate_codes *codes, + struct deflate_compressor *c, + struct deflate_optimum_node * const end_optimum_ptr) +{ + struct deflate_optimum_node *cur_optimum_ptr = c->optimum; + do { + unsigned length = cur_optimum_ptr->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_optimum_ptr->item >> OPTIMUM_OFFSET_SHIFT; + unsigned litlen_symbol; + unsigned length_slot; + unsigned offset_slot; + + if (length == 1) { + /* Literal */ + litlen_symbol = offset; + deflate_write_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + } else { + /* Match length */ + length_slot = deflate_get_length_slot(c, length); + litlen_symbol = 257 + length_slot; + deflate_write_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + + deflate_write_bits(os, length - deflate_length_slot_base[length_slot], + deflate_extra_length_bits[length_slot]); + + /* Match offset */ + offset_slot = deflate_get_offset_slot(c, offset); + deflate_write_bits(os, codes->codewords.offset[offset_slot], + codes->lens.offset[offset_slot]); + deflate_write_bits(os, offset - deflate_offset_slot_base[offset_slot], + deflate_extra_offset_bits[offset_slot]); + } + cur_optimum_ptr += length; + } while (cur_optimum_ptr != end_optimum_ptr); +} + +/* Set the current cost model from the codeword lengths specified in @lens. */ +static void +deflate_set_costs(struct deflate_compressor *c, const struct deflate_lens * lens) +{ + /* Literals */ + for (unsigned i = 0; i < DEFLATE_NUM_LITERALS; i++) { + u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS); + c->costs.literal[i] = bits << COST_SHIFT; + } + + /* Lengths */ + for (unsigned i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { + unsigned length_slot = deflate_get_length_slot(c, i); + unsigned litlen_sym = 257 + length_slot; + u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); + bits += deflate_extra_length_bits[length_slot]; + c->costs.length[i] = bits << COST_SHIFT; + } + + /* Offset slots */ + for (unsigned i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { + u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS); + bits += deflate_extra_offset_bits[i]; + c->costs.offset_slot[i] = bits << COST_SHIFT; + } +} + +static inline u32 +deflate_default_literal_cost(unsigned literal) +{ + BUILD_BUG_ON(COST_SHIFT != 3); + /* 66 is 8.25 bits/symbol */ + return 66; +} + +static inline u32 +deflate_default_length_slot_cost(unsigned length_slot) +{ + BUILD_BUG_ON(COST_SHIFT != 3); + /* 60 is 7.5 bits/symbol */ + return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT); +} + +static inline u32 +deflate_default_offset_slot_cost(unsigned offset_slot) +{ + BUILD_BUG_ON(COST_SHIFT != 3); + /* 39 is 4.875 bits/symbol */ + return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT); +} + +/* + * Set default Huffman symbol costs for the first optimization pass. + * + * It works well to assume that each Huffman symbol is equally probable. This + * results in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 << + * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding + * alphabet. 
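+ *
+ * As a rough check of those numbers: with COST_SHIFT == 3 (8 cost units per
+ * bit), the 288-symbol literal/length alphabet gives -log2(1.0/288) * 8 ~= 65,
+ * close to the default literal cost of 66 (8.25 bits) chosen above, and the
+ * 32-symbol offset alphabet gives 5 * 8 = 40, close to the default offset
+ * slot cost of 39 (4.875 bits).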
However, we intentionally bias the parse towards matches rather + * than literals by using a slightly lower default cost for length symbols than + * for literals. This often improves the compression ratio slightly. + */ +static void +deflate_set_default_costs(struct deflate_compressor *c) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + c->costs.literal[i] = deflate_default_literal_cost(i); + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + c->costs.length[i] = deflate_default_length_slot_cost( + deflate_get_length_slot(c, i)); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + c->costs.offset_slot[i] = deflate_default_offset_slot_cost(i); +} + +static inline void +deflate_adjust_cost(u32 *cost_p, u32 default_cost) +{ + *cost_p += ((s32)default_cost - (s32)*cost_p) >> 1; +} + +/* + * Adjust the costs when beginning a new block. + * + * Since the current costs have been optimized for the data, it's undesirable to + * throw them away and start over with the default costs. At the same time, we + * don't want to bias the parse by assuming that the next block will be similar + * to the current block. As a compromise, make the costs closer to the + * defaults, but don't simply set them to the defaults. + */ +static void +deflate_adjust_costs(struct deflate_compressor *c) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + deflate_adjust_cost(&c->costs.literal[i], + deflate_default_literal_cost(i)); + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + deflate_adjust_cost(&c->costs.length[i], + deflate_default_length_slot_cost( + deflate_get_length_slot(c, i))); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + deflate_adjust_cost(&c->costs.offset_slot[i], + deflate_default_offset_slot_cost(i)); +} + +static void +deflate_optimize_and_write_block(struct deflate_compressor *c, + struct deflate_output_bitstream *os, + const unsigned block_len, + struct lz_match *end_cache_ptr, + const bool is_final_block) +{ + struct deflate_optimum_node *end_optimum_ptr = c->optimum + block_len; + unsigned num_passes_remaining = c->num_optim_passes; + + do { + /* + * Beginning a new optimization pass and finding a new + * minimum-cost path through the graph of possible match/literal + * choices for this block. + * + * We find the minimum cost path from 'c->optimum', which + * represents the node at the beginning of the block, to + * 'end_optimum_ptr', which represents the node at the end of + * the block. Edge costs are evaluated using the cost model + * 'c->costs'. + * + * The algorithm works backward, starting at 'end_optimum_ptr' + * and proceeding backwards one position at a time. At each + * position, the minimum cost to reach 'end_optimum_ptr' is + * computed and the match/literal choice is saved. + */ + struct deflate_optimum_node *cur_optimum_ptr = end_optimum_ptr; + struct lz_match *cache_ptr = end_cache_ptr; + + cur_optimum_ptr->cost_to_end = 0; + do { + unsigned num_matches; + unsigned literal; + u32 best_cost_to_end; + u32 best_item; + + cur_optimum_ptr--; + cache_ptr--; + + num_matches = cache_ptr->length; + literal = cache_ptr->offset; + + /* It's always possible to choose a literal. */ + best_cost_to_end = c->costs.literal[literal] + + (cur_optimum_ptr + 1)->cost_to_end; + best_item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; + + /* Also consider matches if there are any. 
*/ + if (num_matches) { + struct lz_match *match; + unsigned len; + unsigned offset; + unsigned offset_slot; + u32 offset_cost; + u32 cost_to_end; + + /* + * Consider each length from the minimum + * (DEFLATE_MIN_MATCH_LEN) to the length of the + * longest match found at this position. For + * each length, we consider only the smallest + * offset for which that length is available. + * Although this is not guaranteed to be optimal + * due to the possibility of a larger offset + * costing less than a smaller offset to code, + * this is a very useful heuristic. + */ + match = cache_ptr - num_matches; + len = DEFLATE_MIN_MATCH_LEN; + do { + offset = match->offset; + offset_slot = deflate_get_offset_slot(c, offset); + offset_cost = c->costs.offset_slot[offset_slot]; + do { + cost_to_end = offset_cost + + c->costs.length[len] + + (cur_optimum_ptr + len)->cost_to_end; + if (cost_to_end < best_cost_to_end) { + best_cost_to_end = cost_to_end; + best_item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len; + } + } while (++len <= match->length); + } while (++match != cache_ptr); + cache_ptr -= num_matches; + } + cur_optimum_ptr->cost_to_end = best_cost_to_end; + cur_optimum_ptr->item = best_item; + } while (cur_optimum_ptr != c->optimum); + + /* Tally Huffman symbol frequencies. */ + deflate_tally_item_list(c, end_optimum_ptr); + + /* If this wasn't the last pass, update the cost model. */ + if (num_passes_remaining > 1) { + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_set_costs(c, &c->codes.lens); + deflate_reset_symbol_frequencies(c); + } + } while (--num_passes_remaining); + + /* All optimization passes are done. Output a block using the + * minimum-cost path computed on the last optimization pass. */ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_reset_symbol_frequencies(c); + deflate_write_block_header(os, is_final_block, DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN); + deflate_write_huffman_codes(os, &c->codes); + deflate_write_item_list(os, &c->codes, c, end_optimum_ptr); + deflate_write_end_of_block(os, &c->codes); +} + +/* + * This is the "near-optimal" DEFLATE compressor. It computes the optimal + * representation of each DEFLATE block using a minimum-cost path search over + * the graph of possible match/literal choices for that block, assuming a + * certain cost for each Huffman symbol. 
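+ *
+ * Roughly, the per-block search fills in a minimum "cost to end of block" for
+ * every position, working backward from the end of the block.  In sketch form
+ * (this is what deflate_optimize_and_write_block() above does; cost_literal()
+ * and cost_match() stand in for the lookups in 'c->costs'):
+ *
+ *	cost_to_end[block_len] = 0;
+ *	for (i = block_len - 1; i >= 0; i--) {
+ *		best = cost_literal(in[i]) + cost_to_end[i + 1];
+ *		for (each cached match (len, offset) at position i)
+ *			best = min(best, cost_match(len, offset) +
+ *					 cost_to_end[i + len]);
+ *		cost_to_end[i] = best;		and record the winning choice
+ *	}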
+ * + * For several reasons, the end result is not guaranteed to be optimal: + * + * - Nonoptimal choice of blocks + * - Heuristic limitations on which matches are actually considered + * - Symbol costs are unknown until the symbols have already been chosen + * (so iterative optimization must be used) + */ +static size_t +deflate_compress_near_optimal(struct deflate_compressor * restrict c, + const u8 * restrict in, size_t in_nbytes, + u8 * restrict out, size_t out_nbytes_avail) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + struct deflate_output_bitstream os; + const u8 *in_cur_base; + const u8 *in_next_slide; + unsigned max_len; + unsigned nice_len; + struct lz_match *cache_ptr; + struct lz_match *cache_end; + const u8 *in_block_begin; + const u8 *in_block_end; + unsigned num_matches; + unsigned best_len; + unsigned long prev_hash = 0; + + deflate_init_output(&os, out, out_nbytes_avail); + deflate_reset_symbol_frequencies(c); + + bt_matchfinder_init(&c->bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + min(in_end - in_next, MATCHFINDER_WINDOW_SIZE); + + max_len = DEFLATE_MAX_MATCH_LEN; + nice_len = min(c->nice_match_length, max_len); + + do { + /* Starting a new DEFLATE block. */ + + cache_ptr = c->cached_matches; + cache_end = &c->cached_matches[CACHE_LEN - (MAX_MATCHES_PER_POS + 1)]; + in_block_begin = in_next; + in_block_end = in_next + min(in_end - in_next, OPTIM_BLOCK_LENGTH); + + /* Set the initial cost model. */ + if (in_next == in) + deflate_set_default_costs(c); + else + deflate_adjust_costs(c); + + /* Find all match possibilities in this block. */ + do { + /* Decrease the maximum and nice match lengths if we're + * approaching the end of the input buffer. */ + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = min(max_len, nice_len); + } + + /* Force the block to end if the match cache may + * overflow. This case is very unlikely. */ + if (unlikely(cache_ptr > cache_end)) + break; + + /* Slide the window forward if needed. */ + if (in_next == in_next_slide) { + bt_matchfinder_slide_window(&c->bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + min(in_end - in_next, + MATCHFINDER_WINDOW_SIZE); + } + + /* + * Find matches with the current position using the + * binary tree matchfinder and save them in + * 'cached_matches'. + * + * Note: the binary tree matchfinder is more suited for + * optimal parsing than the hash chain matchfinder. The + * reasons for this include: + * + * - The binary tree matchfinder can find more matches + * in the same number of steps. + * - One of the major advantages of hash chains is that + * skipping positions (not searching for matches at + * them) is faster; however, with optimal parsing we + * search for matches at almost all positions, so this + * advantage of hash chains is negated. + */ + num_matches = + bt_matchfinder_get_matches(&c->bt_mf, + in_cur_base, + in_next, + max_len, + nice_len, + c->max_search_depth, + &prev_hash, + cache_ptr); + cache_ptr += num_matches; + cache_ptr->length = num_matches; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + + if (num_matches) { + best_len = cache_ptr[-2].length; + + /* + * If there was a very long match found, don't + * cache any matches for the bytes covered by + * that match. This avoids degenerate behavior + * when compressing highly redundant data, where + * the number of matches can be very large. + * + * This heuristic doesn't actually hurt the + * compression ratio very much. 
If there's a + * long match, then the data must be highly + * compressible, so it doesn't matter much what + * we do. + * + * We also trigger this same case when + * approaching the desired end of the block. + * This forces the block to reach a "stopping + * point" where there are no matches extending + * to later positions. (XXX: this behavior is + * non-optimal and should be improved.) + */ + if (best_len >= min(nice_len, in_block_end - in_next)) { + --best_len; + do { + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = min(max_len, nice_len); + } + if (in_next == in_next_slide) { + bt_matchfinder_slide_window(&c->bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + min(in_end - in_next, + MATCHFINDER_WINDOW_SIZE); + } + bt_matchfinder_skip_position(&c->bt_mf, + in_cur_base, + in_next, + in_end, + nice_len, + c->max_search_depth, + &prev_hash); + + cache_ptr->length = 0; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + } while (--best_len); + } + } + } while (in_next < in_block_end); + + /* All the matches for this block have been cached. Now compute + * a near-optimal sequence of literals and matches, and output + * the block. */ + + deflate_optimize_and_write_block(c, &os, in_next - in_block_begin, + cache_ptr, in_next == in_end); + } while (in_next != in_end); + + return deflate_flush_output(&os); +} + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/* Initialize c->length_slot_fast. */ +static void +deflate_init_length_slot_fast(struct deflate_compressor *c) +{ + unsigned length_slot; + unsigned length; + unsigned length_end; + + for (length_slot = 0; + length_slot < ARRAY_LEN(deflate_length_slot_base); + length_slot++) + { + length = deflate_length_slot_base[length_slot]; + length_end = length + (1 << deflate_extra_length_bits[length_slot]); + do { + c->length_slot_fast[length] = length_slot; + } while (++length != length_end); + } +} + +/* Initialize c->offset_slot_fast. */ +static void +deflate_init_offset_slot_fast(struct deflate_compressor *c) +{ + unsigned offset_slot; + unsigned offset; + unsigned offset_end; + + for (offset_slot = 0; + offset_slot < ARRAY_LEN(deflate_offset_slot_base); + offset_slot++) + { + offset = deflate_offset_slot_base[offset_slot]; + #if USE_FULL_OFFSET_SLOT_FAST + offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->offset_slot_fast[offset] = offset_slot; + } while (++offset != offset_end); + #else + if (offset <= 256) { + offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->offset_slot_fast[offset - 1] = offset_slot; + } while (++offset != offset_end); + } else { + offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot; + } while ((offset += (1 << 7)) != offset_end); + } + #endif + } +} + +/* Initialize c->static_codes. 
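+ *
+ * The static (BTYPE == 1) code defined by the DEFLATE format assigns 8-bit
+ * codewords to litlen symbols 0-143, 9 bits to symbols 144-255, 7 bits to
+ * symbols 256-279, 8 bits to symbols 280-287, and 5 bits to all 32 offset
+ * symbols.  Rather than hard-coding those codewords, we seed each symbol's
+ * frequency with 1 << (9 - len) (or 1 << (5 - len) for offsets); the
+ * frequencies then sum to exactly 1 << 9 and 1 << 5 respectively, so the
+ * canonical Huffman code builder reproduces exactly those codeword lengths
+ * and the normal codeword generation and bit-reversal can be reused.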
*/ +static void +deflate_init_static_codes(struct deflate_compressor *c) +{ + unsigned i; + + for (i = 0; i < 144; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + for (; i < 256; i++) + c->freqs.litlen[i] = 1 << (9 - 9); + for (; i < 280; i++) + c->freqs.litlen[i] = 1 << (9 - 7); + for (; i < 288; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + + for (i = 0; i < 32; i++) + c->freqs.offset[i] = 1 << (5 - 5); + + deflate_make_huffman_codes(&c->freqs, &c->static_codes); +} + +LIBEXPORT struct deflate_compressor * +deflate_alloc_compressor(unsigned int compression_level) +{ + struct deflate_compressor *c; + size_t size; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + if (compression_level >= 8) + size = offsetof(struct deflate_compressor, optimal_end); + else +#endif + size = offsetof(struct deflate_compressor, nonoptimal_end); + + c = aligned_alloc(MATCHFINDER_ALIGNMENT, size); + if (!c) + return NULL; + + c->compression_level = compression_level; + + switch (compression_level) { + case 0: + c->impl = deflate_compress_greedy; + c->max_search_depth = 1; + c->nice_match_length = 3; + break; + case 1: + c->impl = deflate_compress_greedy; + c->max_search_depth = 4; + c->nice_match_length = 9; + break; + case 2: + c->impl = deflate_compress_greedy; + c->max_search_depth = 12; + c->nice_match_length = 9; + break; + case 3: + c->impl = deflate_compress_greedy; + c->max_search_depth = 24; + c->nice_match_length = 18; + break; + case 4: + c->impl = deflate_compress_greedy; + c->max_search_depth = 36; + c->nice_match_length = 27; + break; + case 5: + c->impl = deflate_compress_lazy; + c->max_search_depth = 32; + c->nice_match_length = 24; + break; + case 6: + c->impl = deflate_compress_lazy; + c->max_search_depth = 48; + c->nice_match_length = 36; + break; + case 7: + c->impl = deflate_compress_lazy; + c->max_search_depth = 72; + c->nice_match_length = 54; + break; +#if SUPPORT_NEAR_OPTIMAL_PARSING + case 8: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 25; + c->nice_match_length = 40; + c->num_optim_passes = 1; + break; + case 9: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 50; + c->nice_match_length = 80; + c->num_optim_passes = 2; + break; + case 10: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 100; + c->nice_match_length = 120; + c->num_optim_passes = 3; + break; + default: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 250; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + c->num_optim_passes = 5; + break; +#else + case 8: + c->impl = deflate_compress_lazy; + c->max_search_depth = 108; + c->nice_match_length = 81; + break; + case 9: + c->impl = deflate_compress_lazy; + c->max_search_depth = 162; + c->nice_match_length = 122; + break; + case 10: + c->impl = deflate_compress_lazy; + c->max_search_depth = 243; + c->nice_match_length = 182; + break; + default: + c->impl = deflate_compress_lazy; + c->max_search_depth = 365; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + break; +#endif + } + + deflate_init_offset_slot_fast(c); + deflate_init_length_slot_fast(c); + deflate_init_static_codes(c); + + return c; +} + +LIBEXPORT size_t +deflate_compress(struct deflate_compressor *c, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) +{ + if (in_nbytes < 16) + return 0; + return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail); +} + +LIBEXPORT void +deflate_free_compressor(struct deflate_compressor *c) +{ + free(c); +} + +unsigned int +deflate_get_compression_level(struct deflate_compressor *c) +{ + return 
c->compression_level; +} diff --git a/src/deflate_compress.h b/src/deflate_compress.h new file mode 100644 index 0000000..ccf943e --- /dev/null +++ b/src/deflate_compress.h @@ -0,0 +1,9 @@ +#pragma once + +/* 'struct deflate_compressor' is private to deflate_compress.c, but zlib header + * generation needs to be able to query the compression level. */ + +struct deflate_compressor; + +extern unsigned int +deflate_get_compression_level(struct deflate_compressor *c); diff --git a/src/deflate_constants.h b/src/deflate_constants.h new file mode 100644 index 0000000..cb07c07 --- /dev/null +++ b/src/deflate_constants.h @@ -0,0 +1,59 @@ +/* + * deflate_constants.h + * + * Constants for the DEFLATE compression format. + */ + +#pragma once + +/* Valid block types */ +#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 +#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 +#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 + +/* Minimum and maximum supported match lengths (in bytes) */ +#define DEFLATE_MIN_MATCH_LEN 3 +#define DEFLATE_MAX_MATCH_LEN 258 + +/* Minimum and maximum supported match offsets (in bytes) */ +#define DEFLATE_MIN_MATCH_OFFSET 1 +#define DEFLATE_MAX_MATCH_OFFSET 32767 + +#define DEFLATE_MAX_WINDOW_SIZE 32768 + +/* Number of symbols in each Huffman code. Note: for the literal/length + * and offset codes, these are actually the maximum values; a given block + * might use fewer symbols. */ +#define DEFLATE_NUM_PRECODE_SYMS 19 +#define DEFLATE_NUM_LITLEN_SYMS 288 +#define DEFLATE_NUM_OFFSET_SYMS 32 + +/* Division of symbols in the literal/length code */ +#define DEFLATE_NUM_LITERALS 256 +#define DEFLATE_END_OF_BLOCK 256 +#define DEFLATE_NUM_LEN_SYMS 31 + +/* Maximum codeword length, in bits, within each Huffman code */ +#define DEFLATE_MAX_PRE_CODEWORD_LEN 7 +#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 +#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 + +/* Maximum possible overrun when decoding codeword lengths */ +#define DEFLATE_MAX_LENS_OVERRUN 137 + +/* + * Maximum number of extra bits that may be required to represent a match + * length or offset. + * + * TODO: are we going to have full DEFLATE64 support? If so, up to 16 + * length bits must be supported. + */ +#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 +#define DEFLATE_MAX_EXTRA_OFFSET_BITS 14 + +/* The maximum number of bits in which a match can be represented. This + * is the absolute worst case, which assumes the longest possible Huffman + * codewords and the maximum numbers of extra bits. */ +#define DEFLATE_MAX_MATCH_BITS \ + (DEFLATE_MAX_LITLEN_CODEWORD_LEN + DEFLATE_MAX_EXTRA_LENGTH_BITS + \ + DEFLATE_MAX_OFFSET_CODEWORD_LEN + DEFLATE_MAX_EXTRA_OFFSET_BITS) diff --git a/src/deflate_decompress.c b/src/deflate_decompress.c new file mode 100644 index 0000000..eacf6fd --- /dev/null +++ b/src/deflate_decompress.c @@ -0,0 +1,1455 @@ +/* + * deflate_decompress.c + * + * This is a highly optimized DEFLATE decompressor. On x86_64 it decompresses + * data in about 59% of the time of zlib. On other architectures it should + * still be significantly faster than zlib, but the difference may be smaller. + * + * This decompressor currently only supports raw DEFLATE (not zlib or gzip), and + * it only supports whole-buffer decompression (not streaming). 
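+ * Consequently, the caller must already know (or bound) the uncompressed size
+ * and provide an output buffer at least that large (for example, from a size
+ * stored alongside the compressed data, as gzip's ISIZE field provides modulo
+ * 2**32).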
+ * + * Why this is faster than zlib's implementation: + * + * - Word accesses rather than byte accesses when reading input + * - Word accesses rather than byte accesses when copying matches + * - Faster Huffman decoding combined with various DEFLATE-specific tricks + * - Larger bitbuffer variable that doesn't need to be filled as often + * - Other optimizations to remove unnecessary branches + * - Only full-buffer decompression is supported, so the code doesn't need to + * support stopping and resuming decompression. + */ + +#include +#include +#include + +#include "libdeflate.h" + +#include "deflate_constants.h" +#include "unaligned.h" + +#ifndef UNSAFE_DECOMPRESSION +# define UNSAFE_DECOMPRESSION 0 +#endif + +#if UNSAFE_DECOMPRESSION +# warning "unsafe decompression is enabled" +# define SAFETY_CHECK(expr) 0 +#else +# define SAFETY_CHECK(expr) unlikely(expr) +#endif + +/* + * Each of these values is the base 2 logarithm of the number of entries of the + * corresponding decode table. Each value should be large enough to ensure that + * for typical data, the vast majority of symbols can be decoded by a direct + * lookup of the next TABLEBITS bits of compressed data. However, this must be + * balanced against the fact that a larger table requires more memory and + * requires more time to fill. + */ +#define DEFLATE_PRECODE_TABLEBITS 7 +#define DEFLATE_LITLEN_TABLEBITS 10 +#define DEFLATE_OFFSET_TABLEBITS 9 + +/* + * Type for codeword lengths. + */ +typedef u8 len_t; + +/* + * The main DEFLATE decompressor structure. Since this implementation only + * supports full buffer decompression, this structure does not store the entire + * decompression state, but rather only some arrays that are too large to + * comfortably allocate on the stack. + */ +struct deflate_decompressor { + + /* + * The arrays aren't all needed at the same time. 'precode_lens' and + * 'precode_decode_table' are unneeded after 'lens' has been filled. + * Furthermore, 'lens' need not be retained after building the litlen + * and offset decode tables. In fact, 'lens' can be in union with + * 'litlen_decode_table' provided that 'offset_decode_table' is separate + * and is built first. + */ + + union { + len_t precode_lens[DEFLATE_NUM_PRECODE_SYMS]; + + struct { + len_t lens[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS + + DEFLATE_MAX_LENS_OVERRUN]; + + u32 precode_decode_table[(1 << DEFLATE_PRECODE_TABLEBITS) + + (2 * DEFLATE_NUM_PRECODE_SYMS)]; + }; + + u32 litlen_decode_table[(1 << DEFLATE_LITLEN_TABLEBITS) + + (2 * DEFLATE_NUM_LITLEN_SYMS)]; + }; + + u32 offset_decode_table[(1 << DEFLATE_OFFSET_TABLEBITS) + + (2 * DEFLATE_NUM_OFFSET_SYMS)]; +}; + +/***************************************************************************** + * Input bitstream * + *****************************************************************************/ + +/* + * The state of the "input bitstream" consists of the following variables: + * + * - in_next: pointer to the next unread byte in the input buffer + * + * - in_end: pointer just past the end of the input buffer + * + * - bitbuf: a word-sized variable containing bits that have been read from + * the input buffer. The buffered bits are right-aligned + * (they're the low-order bits). + * + * - bitsleft: number of bits in 'bitbuf' that are valid. + * + * To make it easier for the compiler to optimize the code by keeping variables + * in registers, these are declared as normal variables and manipulated using + * macros. 
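+ *
+ * In deflate_decompress() itself these are expected to be declared roughly as
+ * in the following sketch (the names 'in' and 'in_nbytes' are illustrative):
+ *
+ *	const u8 *in_next = in;
+ *	const u8 * const in_end = in_next + in_nbytes;
+ *	bitbuf_t bitbuf = 0;
+ *	unsigned bitsleft = 0;
+ *	unsigned overrun_count = 0;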
+ */
+
+/*
+ * The type for the bitbuffer variable ('bitbuf' described above).  For best
+ * performance, this should have size equal to a machine word.
+ */
+typedef machine_word_t bitbuf_t;
+
+/*
+ * Number of bits the bitbuffer variable can hold.
+ */
+#define BITBUF_NBITS (8 * sizeof(bitbuf_t))
+
+/*
+ * The maximum number of bits that can be requested to be in the bitbuffer
+ * variable.  This is the maximum value of 'n' that can be passed to
+ * ENSURE_BITS(n).
+ *
+ * This is not equal to BITBUF_NBITS because we never read less than one byte
+ * at a time.  If the bitbuffer variable contains more than (BITBUF_NBITS - 8)
+ * bits, then we can't read another byte without first consuming some bits.
+ * So the maximum count we can ensure is (BITBUF_NBITS - 7).
+ */
+#define MAX_ENSURE (BITBUF_NBITS - 7)
+
+/*
+ * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if
+ * 'n' is too large to be passed to ENSURE_BITS(n).  Note: if 'n' is a
+ * compile-time constant, then this expression will be a compile-time constant.
+ * Therefore, CAN_ENSURE() can be used to choose between alternative
+ * implementations at compile time.
+ */
+#define CAN_ENSURE(n) ((n) <= MAX_ENSURE)
+
+/*
+ * Fill the bitbuffer variable, reading one byte at a time.
+ *
+ * Note: if we would overrun the input buffer, we just don't read anything,
+ * leaving the bits as 0 but marking them as filled.  This makes the
+ * implementation simpler because this removes the need to distinguish between
+ * "real" overruns and overruns that occur because of our own lookahead during
+ * Huffman decoding.  The disadvantage is that a "real" overrun can go
+ * undetected, and deflate_decompress() may return a success status rather than
+ * the expected failure status if one occurs.  However, this is irrelevant
+ * because even if this specific case were to be handled "correctly", one could
+ * easily come up with a different case where the compressed data would be
+ * corrupted in such a way that fully retains its validity.  Users should run a
+ * checksum against the uncompressed data if they wish to detect corruptions.
+ */
+#define FILL_BITS_BYTEWISE() \
+({ \
+	assert(bitsleft <= BITBUF_NBITS - 8); \
+	do { \
+		if (likely(in_next != in_end)) \
+			bitbuf |= (bitbuf_t)*in_next++ << bitsleft; \
+		else \
+			overrun_count++; \
+		bitsleft += 8; \
+	} while (bitsleft <= BITBUF_NBITS - 8); \
+})
+
+/*
+ * Fill the bitbuffer variable by reading the next word from the input buffer.
+ * This can be significantly faster than FILL_BITS_BYTEWISE().  However, for
+ * this to work correctly, the word must be interpreted in little-endian format.
+ * In addition, the memory access may be unaligned.  Therefore, this method is
+ * most efficient on little-endian architectures that support fast unaligned
+ * access, such as x86 and x86_64.
+ */
+#define FILL_BITS_WORDWISE() \
+({ \
+	assert(bitsleft < BITBUF_NBITS); \
+	bitbuf |= get_unaligned_word_le(in_next) << bitsleft; \
+	in_next += (BITBUF_NBITS - bitsleft) >> 3; \
+	bitsleft += (BITBUF_NBITS - bitsleft) & ~7; \
+})
+
+/*
+ * Does the bitbuffer variable currently contain at least 'n' bits?
+ */
+#define HAVE_BITS(n) (bitsleft >= (n))
+
+/*
+ * Raw form of ENSURE_BITS(): the bitbuffer variable must not already contain
+ * the requested number of bits.
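+ *
+ * For illustration, using the ENSURE_BITS() and POP_BITS() wrappers defined
+ * below, reading the 3-bit DEFLATE block header might look like this (the
+ * variable names are illustrative):
+ *
+ *	ENSURE_BITS(1 + 2);
+ *	is_final_block = POP_BITS(1);
+ *	block_type = POP_BITS(2);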
+ */ +#define DO_ENSURE_BITS(n) \ +({ \ + assert(CAN_ENSURE(n)); \ + assert(!HAVE_BITS(n)); \ + if (CPU_IS_LITTLE_ENDIAN && \ + UNALIGNED_ACCESS_IS_FAST && \ + likely(in_end - in_next >= sizeof(bitbuf_t))) \ + FILL_BITS_WORDWISE(); \ + else \ + FILL_BITS_BYTEWISE(); \ + assert(HAVE_BITS(n)); \ +}) + +/* + * Load more bits from the input buffer until the specified number of bits is + * present in the bitbuffer variable. 'n' cannot be too large; see MAX_ENSURE + * and CAN_ENSURE(). + */ +#define ENSURE_BITS(n) \ +({ \ + assert(CAN_ENSURE(n)); \ + if (!HAVE_BITS(n)) \ + DO_ENSURE_BITS(n); \ + assert(HAVE_BITS(n)); \ +}) + +/* + * Return the next 'n' bits from the bitbuffer variable without removing them. + */ +#define BITS(n) \ +({ \ + assert(HAVE_BITS(n)); \ + bitbuf & (((bitbuf_t)1 << (n)) - 1); \ +}) + +/* + * Remove the next 'n' bits from the bitbuffer variable. + */ +#define REMOVE_BITS(n) \ +({ \ + assert(HAVE_BITS(n)); \ + bitbuf >>= (n); \ + bitsleft -= (n); \ +}) + +/* + * Remove and return the next 'n' bits from the bitbuffer variable. + */ +#define POP_BITS(n) \ +({ \ + bitbuf_t bits = BITS(n); \ + REMOVE_BITS(n); \ + bits; \ +}) + +/* + * Align the input to the next byte boundary, discarding any remaining bits in + * the current byte. + * + * Note that if the bitbuffer variable currently contains more than 8 bits, then + * we must rewind 'in_next', effectively putting those bits back. Only the bits + * in what would be the "current" byte if we were reading one byte at a time can + * be actually discarded. + */ +#define ALIGN_INPUT() \ +({ \ + in_next -= (bitsleft >> 3) - min(overrun_count, bitsleft >> 3); \ + bitbuf = 0; \ + bitsleft = 0; \ +}) + +/* + * Read a 16-bit value from the input. This must have been preceded by a call + * to ALIGN_INPUT(), and the caller must have already checked for overrun. + */ +#define READ_U16() \ +({ \ + u16 v; \ + \ + assert(bitsleft == 0); \ + assert(in_end - in_next >= 2); \ + \ + v = get_unaligned_u16_le(in_next); \ + in_next += 2; \ + v; \ +}) + +/***************************************************************************** + * Huffman decoding * + *****************************************************************************/ + +/* + * A decode table for order TABLEBITS contains (1 << TABLEBITS) entries, plus + * additional entries for non-root binary tree nodes. The number of non-root + * binary tree nodes is variable, but cannot possibly be more than twice the + * number of symbols in the alphabet for which the decode table is built. + * + * The decoding algorithm takes the next TABLEBITS bits of compressed data and + * uses them as an index into the decode table. The resulting entry is either a + * "direct entry", meaning that it contains the value desired, or a "tree root + * entry", meaning that it is the root of a binary tree that must be traversed + * using more bits of the compressed data (0 bit means go to the left child, 1 + * bit means go to the right child) until a leaf is reached. + * + * Each decode table is associated with a Huffman code. Logically, the result + * of a decode table lookup is a symbol from the alphabet from which the + * corresponding Huffman code was constructed. A symbol with codeword length n + * <= TABLEBITS is associated with 2**(TABLEBITS - n) direct entries in the + * table, whereas a symbol with codeword length n > TABLEBITS shares a binary + * tree with a number of other codewords. 
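+ *
+ * For example, with DEFLATE_LITLEN_TABLEBITS == 10, a litlen symbol whose
+ * codeword is 7 bits long is in effect replicated across 2**(10 - 7) = 8
+ * table entries: every index whose low 7 bits match the codeword, the
+ * remaining 3 index bits being "don't care" bits that stay in the
+ * bitbuffer.  A symbol with an 11-bit codeword instead shares a binary tree
+ * rooted at the entry indexed by its first 10 bits of compressed data.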
+ * + * On top of this basic design, we implement several optimizations: + * + * - We store the length of each codeword directly in each of its decode table + * entries. This allows the codeword length to be produced without indexing + * an additional table. + * + * - When beneficial, we don't store the Huffman symbol itself, but instead data + * generated from it. For example, when decoding an offset symbol in DEFLATE, + * it's more efficient if we can decode the offset base and number of extra + * offset bits directly rather than decoding the offset symbol and then + * looking up both of those values in an additional table or tables. + * + * - It can be possible to decode more than just a single Huffman symbol from + * the next TABLEBITS bits of the input. We take advantage of this when + * decoding match lengths. When possible, the decode table entry will provide + * the full match length. In this case, the stored "codeword length" will + * actually be the codeword length plus the number of extra length bits that + * are being consumed. + * + * The size of each decode table entry is 32 bits, which provides slightly + * better performance than 16-bit entries on 32 and 64 bit processers, provided + * that the table doesn't get so large that it takes up too much memory and + * starts generating cache misses. The bits of each decode table entry are + * defined as follows: + * + * - Bits 29 -- 31: flags (see below) + * - Bits 25 -- 28: codeword length + * - Bits 0 -- 24: decode result: a Huffman symbol or related data + */ + +/* + * Flags usage: + * + * The precode and offset tables only use these flags to distinguish nonleaf + * tree nodes from other entries. In nonleaf tree node entries, all flags are + * set and the recommended one to test is HUFFDEC_TREE_NONLEAF_FAST_FLAG. + * + * The literal/length decode table uses all the flags. During decoding, the + * flags are designed to be tested from high to low. If a flag is set, then all + * higher flags are also set. + */ + +/* + * This flag is set in all entries that do not represent a literal symbol, + * excluding tree leaves. This enables a very fast path for non-rare literals: + * just check if this bit is clear, and if so extract the literal from the low + * bits. + */ +#define HUFFDEC_NOT_LITERAL 0x80000000 + +/* + * This flag is set in all entries that represent neither a literal symbol nor a + * full match length, excluding tree leaves. + */ +#define HUFFDEC_NOT_FULL_LENGTH 0x40000000 + +/* + * This flag is set in all nonleaf tree entries (roots and internal nodes). + */ +#define HUFFDEC_TREE_NONLEAF 0x20000000 + +/* + * HUFFDEC_TREE_NONLEAF implies that the following flags are also set. + */ +#define HUFFDEC_TREE_NONLEAF_FLAGS 0xE0000000 + +/* + * For distinguishing between any direct entry and a tree root, or between an + * internal tree node and a leaf node, this bit should be checked in preference + * to any other in HUFFDEC_TREE_NONLEAF_FLAGS --- the reason being this is the + * sign bit, and some architectures have special instructions to handle it. + */ +#define HUFFDEC_TREE_NONLEAF_FAST_FLAG 0x80000000 + +/* + * Number of flag bits defined above. + */ +#define HUFFDEC_NUM_FLAG_BITS 3 + +/* + * Number of bits reserved for the codeword length in decode table entries, and + * the corresponding mask and limit. 4 bits provides a max length of 15, which + * is enough for any DEFLATE codeword. 
(And actually, we don't even need the + * full 15 because only lengths less than or equal to the appropriate TABLEBITS + * will ever be stored in this field.) + */ +#define HUFFDEC_LEN_BITS 4 +#define HUFFDEC_LEN_MASK (((u32)1 << HUFFDEC_LEN_BITS) - 1) +#define HUFFDEC_MAX_LEN HUFFDEC_LEN_MASK + +/* + * Value by which a decode table entry can be right-shifted to get the length + * field. Note: the result must be AND-ed with HUFFDEC_LEN_MASK unless it is + * guaranteed that no flag bits are set. + */ +#define HUFFDEC_LEN_SHIFT (32 - HUFFDEC_NUM_FLAG_BITS - HUFFDEC_LEN_BITS) + +/* + * Mask to get the "value" of a decode table entry. This is the decode result + * and contains data dependent on the table. + */ +#define HUFFDEC_VALUE_MASK (((u32)1 << HUFFDEC_LEN_SHIFT) - 1) + +/* + * Data needed to initialize the entries in the length/literal decode table. + */ +static const u32 deflate_litlen_symbol_data[DEFLATE_NUM_LITLEN_SYMS] = { + /* Literals */ + 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , + 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , + 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , + 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , + 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , + 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , + 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , + 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , + 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , + 72 , 73 , 74 , 75 , 76 , 77 , 78 , 79 , + 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , + 88 , 89 , 90 , 91 , 92 , 93 , 94 , 95 , + 96 , 97 , 98 , 99 , 100 , 101 , 102 , 103 , + 104 , 105 , 106 , 107 , 108 , 109 , 110 , 111 , + 112 , 113 , 114 , 115 , 116 , 117 , 118 , 119 , + 120 , 121 , 122 , 123 , 124 , 125 , 126 , 127 , + 128 , 129 , 130 , 131 , 132 , 133 , 134 , 135 , + 136 , 137 , 138 , 139 , 140 , 141 , 142 , 143 , + 144 , 145 , 146 , 147 , 148 , 149 , 150 , 151 , + 152 , 153 , 154 , 155 , 156 , 157 , 158 , 159 , + 160 , 161 , 162 , 163 , 164 , 165 , 166 , 167 , + 168 , 169 , 170 , 171 , 172 , 173 , 174 , 175 , + 176 , 177 , 178 , 179 , 180 , 181 , 182 , 183 , + 184 , 185 , 186 , 187 , 188 , 189 , 190 , 191 , + 192 , 193 , 194 , 195 , 196 , 197 , 198 , 199 , + 200 , 201 , 202 , 203 , 204 , 205 , 206 , 207 , + 208 , 209 , 210 , 211 , 212 , 213 , 214 , 215 , + 216 , 217 , 218 , 219 , 220 , 221 , 222 , 223 , + 224 , 225 , 226 , 227 , 228 , 229 , 230 , 231 , + 232 , 233 , 234 , 235 , 236 , 237 , 238 , 239 , + 240 , 241 , 242 , 243 , 244 , 245 , 246 , 247 , + 248 , 249 , 250 , 251 , 252 , 253 , 254 , 255 , + +#define HUFFDEC_NUM_BITS_FOR_EXTRA_LENGTH_BITS 3 +#define HUFFDEC_MAX_EXTRA_LENGTH_BITS (((u32)1 << HUFFDEC_NUM_BITS_FOR_EXTRA_LENGTH_BITS) - 1) +#define HUFFDEC_EXTRA_LENGTH_BITS_SHIFT (HUFFDEC_LEN_SHIFT - HUFFDEC_NUM_BITS_FOR_EXTRA_LENGTH_BITS) +#define HUFFDEC_LENGTH_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_LENGTH_BITS_SHIFT) - 1) +#define HUFFDEC_END_OF_BLOCK_LENGTH 0 + +#define ENTRY(length_base, num_extra_bits) \ + (256 + (length_base) + ((num_extra_bits) << HUFFDEC_EXTRA_LENGTH_BITS_SHIFT)) + + /* End of block */ + ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0), + + /* Match length data */ + ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), + ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), + ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), + ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), + ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), + ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), + ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), + ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , +#undef ENTRY 
+}; + +/* + * Data needed to initialize the entries in the offset decode table. + */ +static const u32 deflate_offset_symbol_data[DEFLATE_NUM_OFFSET_SYMS] = { + +#define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16 +#define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1) + +#define ENTRY(offset_base, num_extra_bits) \ + ((offset_base) | ((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT)) + ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , + ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , + ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , + ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , + ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , + ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , + ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , + ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) , +#undef ENTRY +}; + +/* Construct a direct decode table entry (not a tree node) */ +static inline u32 +make_direct_entry(u32 value, u32 length) +{ + return (length << HUFFDEC_LEN_SHIFT) | value; +} + +/* + * The following functions define the way entries are created for each decode + * table. Note that these will all be inlined into build_decode_table(), which + * will itself be inlined for each decode table. This is important for + * performance because the make_*_entry() functions get called from the inner + * loop of build_decode_table(). + */ + +static inline u32 +make_litlen_direct_entry(unsigned symbol, unsigned codeword_length, + unsigned *extra_mask_ret) +{ + u32 entry_value = deflate_litlen_symbol_data[symbol]; + u32 entry_length = codeword_length; + unsigned length_bits; + u32 length_base; + + BUILD_BUG_ON(DEFLATE_MAX_EXTRA_LENGTH_BITS > + HUFFDEC_MAX_EXTRA_LENGTH_BITS); + + if (symbol >= 256) { + /* Match, not a literal. (This can also be the special + * end-of-block symbol, which we handle identically.) */ + entry_value -= 256; + length_bits = entry_value >> HUFFDEC_EXTRA_LENGTH_BITS_SHIFT; + length_base = entry_value & HUFFDEC_LENGTH_BASE_MASK; + if (codeword_length + length_bits <= DEFLATE_LITLEN_TABLEBITS) { + /* TABLEBITS is enough to decode the length slot as well + * as all the extra length bits. So store the full + * length in the decode table entry. + * + * Note that a length slot may be used for multiple + * lengths, and multiple decode table entries may map to + * the same length; hence the need for the 'extra_mask', + * which allows build_decode_table() to cycle through + * the lengths that use this length slot. */ + entry_value = length_base; + entry_length += length_bits; + *extra_mask_ret = (1U << length_bits) - 1; + } else { + /* TABLEBITS isn't enough to decode all the extra length + * bits. The decoder will have to decode the extra bits + * separately. This is the less common case. 
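+			 *
+			 * For example, with a hypothetical TABLEBITS of 10:
+			 * litlen symbol 284 (length base 227, 5 extra bits)
+			 * reached by a 9-bit codeword needs 9 + 5 = 14 > 10
+			 * bits, so it takes this path and only the length
+			 * base and extra-bit count are stored.  By contrast,
+			 * symbol 257 (base 3, 0 extra bits) with a 7-bit
+			 * codeword takes the path above and stores the full
+			 * length 3 with extra_mask == 0.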
*/ + entry_value |= HUFFDEC_NOT_FULL_LENGTH; + } + entry_value |= HUFFDEC_NOT_LITERAL; + } + + return make_direct_entry(entry_value, entry_length); +} + +static inline u32 +make_litlen_leaf_entry(unsigned sym) +{ + return deflate_litlen_symbol_data[sym]; +} + +static inline u32 +make_offset_direct_entry(unsigned sym, unsigned codeword_len, unsigned *extra_mask_ret) +{ + return make_direct_entry(deflate_offset_symbol_data[sym], codeword_len); +} + +static inline u32 +make_offset_leaf_entry(unsigned sym) +{ + return deflate_offset_symbol_data[sym]; +} + +static inline u32 +make_pre_direct_entry(unsigned sym, unsigned codeword_len, unsigned *extra_mask_ret) +{ + return make_direct_entry(sym, codeword_len); +} + +static inline u32 +make_pre_leaf_entry(unsigned sym) +{ + return sym; +} + +/* + * Build a table for fast Huffman decoding, using bit-reversed codewords. + * + * The Huffman code is assumed to be in canonical form and is specified by its + * codeword lengths only. + * + * @decode_table + * A table with ((1 << table_bits) + (2 * num_syms)) entries. The format + * of the table has been described in previous comments. + * @lens + * Lengths of the Huffman codewords. 'lens[sym]' specifies the length, in + * bits, of the codeword for symbol 'sym'. If a symbol is not used in the + * code, its length must be specified as 0. It is valid for this parameter + * to alias @decode_table because nothing gets written to @decode_table + * until all information in @lens has been consumed. + * @num_syms + * Number of symbols in the code. + * @make_direct_entry + * Function to create a direct decode table entry, given the symbol and + * codeword length. + * @make_leaf_entry + * Function to create a tree decode table entry, at a tree leaf, given the + * symbol. + * @table_bits + * log base 2 of the size of the direct lookup portion of the decode table. + * @max_codeword_len + * Maximum allowed codeword length for this Huffman code. + * + * Returns %true if successful; %false if the codeword lengths do not form a + * valid Huffman code. + */ +static inline bool +build_decode_table(u32 decode_table[], + const len_t lens[], + const unsigned num_syms, + u32 (*make_direct_entry)(unsigned, unsigned, unsigned *), + u32 (*make_leaf_entry)(unsigned), + const unsigned table_bits, + const unsigned max_codeword_len) +{ + unsigned len_counts[max_codeword_len + 1]; + unsigned offsets[max_codeword_len + 1]; + unsigned sorted_syms[num_syms]; + unsigned sym; + unsigned len; + s32 remainder; + unsigned sym_idx; + unsigned codeword_reversed; + unsigned codeword_len; + unsigned loop_count; + + /* Preconditions */ + assert(table_bits > 0); + assert(table_bits <= max_codeword_len); + assert(max_codeword_len <= HUFFDEC_MAX_LEN); + for (sym = 0; sym < num_syms; sym++) + assert(lens[sym] <= max_codeword_len); + + /* Count how many symbols have each codeword length. */ + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + for (sym = 0; sym < num_syms; sym++) + len_counts[lens[sym]]++; + + /* We guarantee that all lengths are <= max_codeword_len, but we cannot + * assume they form a valid prefix code. A codeword of length n should + * require a proportion of the codespace equaling (1/2)^n. The code is + * valid if and only if the codespace is exactly filled by the lengths + * by this measure. 
*/ + remainder = 1; + for (len = 1; len <= max_codeword_len; len++) { + remainder <<= 1; + remainder -= len_counts[len]; + if (unlikely(remainder < 0)) { + /* The lengths overflow the codespace; that is, the code + * is over-subscribed. */ + return false; + } + } + + if (unlikely(remainder != 0)) { + /* The lengths do not fill the codespace; that is, they form an + * incomplete set. */ + if (remainder == (1U << max_codeword_len)) { + /* The code is completely empty. By definition, no + * symbols can be decoded with an empty code. + * Consequently, we technically don't even need to fill + * in the decode table. However, to avoid accessing + * uninitialized memory if the algorithm nevertheless + * attempts to decode symbols using such a code, we fill + * the decode table with default values. */ + unsigned dummy; + for (unsigned i = 0; i < (1U << table_bits); i++) + decode_table[i] = (*make_direct_entry)(0, 1, &dummy); + return true; + } + return false; + } + + /* Sort the symbols primarily by length and secondarily by symbol value. + */ + + /* Initialize 'offsets' so that offsets[len] is the number of codewords + * shorter than 'len' bits. */ + offsets[0] = 0; + for (len = 0; len < max_codeword_len; len++) + offsets[len + 1] = offsets[len] + len_counts[len]; + + /* Use the 'offsets' array to sort the symbols. */ + for (sym = 0; sym < num_syms; sym++) + sorted_syms[offsets[lens[sym]]++] = sym; + + /* Generate entries for codewords with length <= 'table_bits'. + * Start with codeword length 1 and proceed to longer codewords. */ + sym_idx = offsets[0]; + codeword_reversed = 0; + codeword_len = 1; + loop_count = (1U << (table_bits - codeword_len)); + for (; loop_count != 0; codeword_len++, loop_count >>= 1) { + + const unsigned end_sym_idx = sym_idx + len_counts[codeword_len]; + const unsigned increment = 1U << codeword_len; + + /* Iterate through the symbols that have codewords with length + * 'codeword_len'. Since the code is assumed to be canonical, + * we can generate the codewords by iterating in symbol order + * and incrementing the current codeword by 1 each time. */ + + for (; sym_idx < end_sym_idx; sym_idx++) { + unsigned sym; + u32 entry; + unsigned extra_mask; + unsigned extra; + unsigned i; + unsigned n; + unsigned bit; + + sym = sorted_syms[sym_idx]; + extra_mask = 0; + entry = (*make_direct_entry)(sym, codeword_len, &extra_mask); + extra = 0; + i = codeword_reversed; + n = loop_count; + do { + decode_table[i] = entry + extra; + extra = (extra + 1) & extra_mask; + i += increment; + } while (--n); + + /* Increment the codeword by 1. Since DEFLATE requires + * bit-reversed codewords, we must manipulate bits + * ourselves. */ + bit = 1U << (codeword_len - 1); + while (codeword_reversed & bit) + bit >>= 1; + codeword_reversed &= bit - 1; + codeword_reversed |= bit; + } + } + + /* If we've filled in the entire table, we are done. Otherwise, there + * are codewords longer than 'table_bits' for which we must generate + * binary trees. */ + if (max_codeword_len > table_bits && + offsets[table_bits] != offsets[max_codeword_len]) + { + unsigned i; + unsigned bit; + unsigned next_free_slot; + + /* First, zero out the remaining entries. This is necessary so + * that those entries appear as "unallocated" in the next part. + * Each of these entries will eventually be filled with the + * representation of the root node of a binary tree. 
*/ + + i = (1U << table_bits) - 1; /* All 1 bits */ + for (;;) { + decode_table[i] = 0; + + if (i == codeword_reversed) + break; + + /* Subtract 1 from the bit-reversed index. */ + bit = 1U << table_bits; + do { + bit >>= 1; + i ^= bit; + } while (i & bit); + } + + /* We allocate child nodes starting at the end of the direct + * lookup table. Note that there should be 2*num_syms extra + * entries for this purpose, although fewer than this may + * actually be needed. */ + next_free_slot = 1U << table_bits; + + for (; codeword_len <= max_codeword_len; codeword_len++) { + + const unsigned end_sym_idx = sym_idx + len_counts[codeword_len]; + + for (; sym_idx < end_sym_idx; sym_idx++) { + + unsigned shift = table_bits; + unsigned node_idx = codeword_reversed & ((1U << table_bits) - 1); + + /* Go through each bit of the current codeword + * beyond the prefix of length @table_bits and + * walk the appropriate binary tree, allocating + * any slots that have not yet been allocated. + * + * Note that the 'pointer' entry to the binary + * tree, which is stored in the direct lookup + * portion of the table, is represented + * identically to other internal (non-leaf) + * nodes of the binary tree; it can be thought + * of as simply the root of the tree. The + * representation of these internal nodes is + * simply the index of the left child combined + * with special flags to distingush the entry + * from direct mapping and leaf node entries. + */ + do { + + /* At least one bit remains in the + * codeword, but the current node is + * unallocated. Allocate it as an + * internal tree node. */ + if (decode_table[node_idx] == 0) { + decode_table[node_idx] = + next_free_slot | + HUFFDEC_TREE_NONLEAF_FLAGS; + decode_table[next_free_slot++] = 0; + decode_table[next_free_slot++] = 0; + } + + /* Go to the left child if the next bit + * in the codeword is 0; otherwise go to + * the right child. */ + node_idx = decode_table[node_idx] & + ~HUFFDEC_TREE_NONLEAF_FLAGS; + node_idx += (codeword_reversed >> shift) & 1; + shift += 1; + } while (shift != codeword_len); + + /* Generate the leaf node, which contains the + * real decode table entry. */ + decode_table[node_idx] = + (*make_leaf_entry)(sorted_syms[sym_idx]); + + /* Increment the codeword by 1. Since DEFLATE + * requires bit-reversed codewords, we must + * manipulate bits ourselves. */ + bit = 1U << (codeword_len - 1); + while (codeword_reversed & bit) + bit >>= 1; + codeword_reversed &= bit - 1; + codeword_reversed |= bit; + } + } + } + return true; +} + +/* Build the decode table for the precode. */ +static bool +build_precode_decode_table(struct deflate_decompressor *d) +{ + return build_decode_table(d->precode_decode_table, + d->precode_lens, + DEFLATE_NUM_PRECODE_SYMS, + make_pre_direct_entry, + make_pre_leaf_entry, + DEFLATE_PRECODE_TABLEBITS, + DEFLATE_MAX_PRE_CODEWORD_LEN); +} + +/* Build the decode table for the literal/length code. */ +static bool +build_litlen_decode_table(struct deflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + return build_decode_table(d->litlen_decode_table, + d->lens, + num_litlen_syms, + make_litlen_direct_entry, + make_litlen_leaf_entry, + DEFLATE_LITLEN_TABLEBITS, + DEFLATE_MAX_LITLEN_CODEWORD_LEN); +} + +/* Build the decode table for the offset code. 
*/ +static bool +build_offset_decode_table(struct deflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + return build_decode_table(d->offset_decode_table, + d->lens + num_litlen_syms, + num_offset_syms, + make_offset_direct_entry, + make_offset_leaf_entry, + DEFLATE_OFFSET_TABLEBITS, + DEFLATE_MAX_OFFSET_CODEWORD_LEN); +} + +static inline machine_word_t +repeat_byte(u8 b) +{ + machine_word_t v; + + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + + v = b; + v |= v << 8; + v |= v << 16; + v |= v << ((WORDSIZE == 8) ? 32 : 0); + return v; +} + +static inline void +copy_word_unaligned(const void *src, void *dst) +{ + store_word_unaligned(load_word_unaligned(src), dst); +} + +/* + * Copy an LZ77 match at (dst - offset) to dst. + * + * The length and offset must be already validated --- that is, (dst - offset) + * can't underrun the output buffer, and (dst + length) can't overrun the output + * buffer. Also, the length cannot be 0. + * + * @winend points to the byte past the end of the output buffer. + * This function won't write any data beyond this position. + */ +static inline void +lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend, u32 min_length) +{ + const u8 *src = dst - offset; + const u8 * const end = dst + length; + + /* + * Try to copy one machine word at a time. On i386 and x86_64 this is + * faster than copying one byte at a time, unless the data is + * near-random and all the matches have very short lengths. Note that + * since this requires unaligned memory accesses, it won't necessarily + * be faster on every architecture. + * + * Also note that we might copy more than the length of the match. For + * example, if a word is 8 bytes and the match is of length 5, then + * we'll simply copy 8 bytes. This is okay as long as we don't write + * beyond the end of the output buffer, hence the check for (winend - + * end >= WORDSIZE - 1). + */ + if (UNALIGNED_ACCESS_IS_VERY_FAST && + likely(winend - end >= WORDSIZE - 1)) + { + + if (offset >= WORDSIZE) { + /* The source and destination words don't overlap. */ + + /* To improve branch prediction, one iteration of this + * loop is unrolled. Most matches are short and will + * fail the first check. But if that check passes, then + * it becomes increasing likely that the match is long + * and we'll need to continue copying. */ + + copy_word_unaligned(src, dst); + src += WORDSIZE; + dst += WORDSIZE; + + if (dst < end) { + do { + copy_word_unaligned(src, dst); + src += WORDSIZE; + dst += WORDSIZE; + } while (dst < end); + } + return; + } else if (offset == 1) { + + /* Offset 1 matches are equivalent to run-length + * encoding of the previous byte. This case is common + * if the data contains many repeated bytes. */ + + machine_word_t v = repeat_byte(*(dst - 1)); + do { + store_word_unaligned(v, dst); + src += WORDSIZE; + dst += WORDSIZE; + } while (dst < end); + return; + } + /* + * We don't bother with special cases for other 'offset < + * WORDSIZE', which are usually rarer than 'offset == 1'. Extra + * checks will just slow things down. Actually, it's possible + * to handle all the 'offset < WORDSIZE' cases using the same + * code, but it still becomes more complicated doesn't seem any + * faster overall; it definitely slows down the more common + * 'offset == 1' case. + */ + } + + /* Fall back to a bytewise copy. 
*/ + + if (min_length >= 2) { + *dst++ = *src++; + length--; + } + if (min_length >= 3) { + *dst++ = *src++; + length--; + } + if (min_length >= 4) { + *dst++ = *src++; + length--; + } + do { + *dst++ = *src++; + } while (--length); +} + +/***************************************************************************** + * Main decompression routine + *****************************************************************************/ + +/* + * This is the main DEFLATE decompression routine. It decompresses 'in_nbytes' + * bytes of compressed data from the buffer 'in' and writes the uncompressed + * data to the buffer 'out'. The caller must know the exact length of the + * uncompressed data and pass it as 'out_nbytes'. The return value is %true if + * and only if decompression was successful. A return value of %false indicates + * that either the compressed data is invalid or it does not decompress to + * exactly 'out_nbytes' bytes of uncompressed data. + */ +LIBEXPORT bool +deflate_decompress(struct deflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes) +{ + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes; + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + bitbuf_t bitbuf = 0; + unsigned bitsleft = 0; + size_t overrun_count = 0; + unsigned i; + unsigned is_final_block; + unsigned block_type; + u16 len; + u16 nlen; + unsigned num_litlen_syms; + unsigned num_offset_syms; + +next_block: + /* Starting to read the next block. */ + + BUILD_BUG_ON(!CAN_ENSURE(1 + 2 + 5 + 5 + 4)); + ENSURE_BITS(1 + 2 + 5 + 5 + 4); + + /* BFINAL: 1 bit */ + is_final_block = POP_BITS(1); + + /* BTYPE: 2 bits */ + block_type = POP_BITS(2); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block. */ + + /* The order in which precode lengths are stored. */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + + /* Read the codeword length counts. */ + + BUILD_BUG_ON(DEFLATE_NUM_LITLEN_SYMS != ((1 << 5) - 1) + 257); + num_litlen_syms = POP_BITS(5) + 257; + + BUILD_BUG_ON(DEFLATE_NUM_OFFSET_SYMS != ((1 << 5) - 1) + 1); + num_offset_syms = POP_BITS(5) + 1; + + BUILD_BUG_ON(DEFLATE_NUM_PRECODE_SYMS != ((1 << 4) - 1) + 4); + num_explicit_precode_lens = POP_BITS(4) + 4; + + /* Read the precode codeword lengths. */ + BUILD_BUG_ON(DEFLATE_MAX_PRE_CODEWORD_LEN != (1 << 3) - 1); + if (CAN_ENSURE(DEFLATE_NUM_PRECODE_SYMS * 3)) { + + ENSURE_BITS(DEFLATE_NUM_PRECODE_SYMS * 3); + + for (i = 0; i < num_explicit_precode_lens; i++) + d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); + } else { + for (i = 0; i < num_explicit_precode_lens; i++) { + ENSURE_BITS(3); + d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); + } + } + + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. */ + if (!build_precode_decode_table(d)) + return false; + + /* Expand the literal/length and offset codeword lengths. */ + for (i = 0; i < num_litlen_syms + num_offset_syms; ) { + u32 entry; + unsigned presym; + u8 rep_val; + unsigned rep_count; + + ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7); + + /* (The code below assumes there are no binary trees in + * the decode table.) 
*/ + BUILD_BUG_ON(DEFLATE_PRECODE_TABLEBITS != DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Read the next precode symbol. */ + entry = d->precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + REMOVE_BITS(entry >> HUFFDEC_LEN_SHIFT); + presym = entry & HUFFDEC_VALUE_MASK; + + if (presym < 16) { + /* Explicit codeword length */ + d->lens[i++] = presym; + continue; + } + + /* Run-length encoded codeword lengths */ + + /* Note: we don't need verify that the repeat count + * doesn't overflow the number of elements, since we + * have enough extra spaces to allow for the worst-case + * overflow (138 zeroes when only 1 length was + * remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + BUILD_BUG_ON(DEFLATE_MAX_LENS_OVERRUN != 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times */ + if (SAFETY_CHECK(i == 0)) + return false; + rep_val = d->lens[i - 1]; + BUILD_BUG_ON(3 + ((1 << 2) - 1) != 6); + rep_count = 3 + POP_BITS(2); + d->lens[i + 0] = rep_val; + d->lens[i + 1] = rep_val; + d->lens[i + 2] = rep_val; + d->lens[i + 3] = rep_val; + d->lens[i + 4] = rep_val; + d->lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times */ + BUILD_BUG_ON(3 + ((1 << 3) - 1) != 10); + rep_count = 3 + POP_BITS(3); + d->lens[i + 0] = 0; + d->lens[i + 1] = 0; + d->lens[i + 2] = 0; + d->lens[i + 3] = 0; + d->lens[i + 4] = 0; + d->lens[i + 5] = 0; + d->lens[i + 6] = 0; + d->lens[i + 7] = 0; + d->lens[i + 8] = 0; + d->lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times */ + BUILD_BUG_ON(11 + ((1 << 7) - 1) != 138); + rep_count = 11 + POP_BITS(7); + memset(&d->lens[i], 0, rep_count * sizeof(d->lens[i])); + i += rep_count; + } + } + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + + /* Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. */ + + ALIGN_INPUT(); + + if (SAFETY_CHECK(in_end - in_next < 4)) + return false; + + len = READ_U16(); + nlen = READ_U16(); + + if (SAFETY_CHECK(len != (u16)~nlen)) + return false; + + if (SAFETY_CHECK(len > out_end - out_next)) + return false; + + if (SAFETY_CHECK(len > in_end - in_next)) + return false; + + memcpy(out_next, in_next, len); + in_next += len; + out_next += len; + + goto block_done; + + } else if (block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN) { + + /* Static Huffman block: set the static Huffman codeword + * lengths. Then the remainder is the same as decompressing a + * dynamic Huffman block. */ + + BUILD_BUG_ON(DEFLATE_NUM_LITLEN_SYMS != 288); + BUILD_BUG_ON(DEFLATE_NUM_OFFSET_SYMS != 32); + + for (i = 0; i < 144; i++) + d->lens[i] = 8; + for (; i < 256; i++) + d->lens[i] = 9; + for (; i < 280; i++) + d->lens[i] = 7; + for (; i < 288; i++) + d->lens[i] = 8; + + for (; i < 288 + 32; i++) + d->lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + + } else { + /* Reserved block type. 
*/ + return false; + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + if (!build_offset_decode_table(d, num_litlen_syms, num_offset_syms)) + return false; + + if (!build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)) + return false; + + /* The main DEFLATE decode loop */ + for (;;) { + u32 entry; + u32 length; + u32 offset; + + /* If our bitbuffer variable is large enough, we load new bits + * only once for each match or literal decoded. This is + * fastest. Otherwise, we may need to load new bits multiple + * times when decoding a match. */ + + BUILD_BUG_ON(!CAN_ENSURE(DEFLATE_MAX_LITLEN_CODEWORD_LEN)); + ENSURE_BITS(MAX_ENSURE); + + /* Read a literal or length. */ + + entry = d->litlen_decode_table[BITS(DEFLATE_LITLEN_TABLEBITS)]; + + if (CAN_ENSURE(DEFLATE_LITLEN_TABLEBITS * 2) && + likely(out_end - out_next >= MAX_ENSURE / DEFLATE_LITLEN_TABLEBITS)) + { + /* Fast path for decoding literals */ + + #define NUM_BITS_TO_ENSURE_AFTER_INLINE_LITERALS \ + ((MAX_ENSURE >= DEFLATE_MAX_MATCH_BITS) ? \ + DEFLATE_MAX_MATCH_BITS : \ + ((MAX_ENSURE >= DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS) ? \ + DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS : \ + DEFLATE_MAX_LITLEN_CODEWORD_LEN)) + + #define INLINE_LITERAL(seq) \ + if (CAN_ENSURE(DEFLATE_LITLEN_TABLEBITS * (seq))) { \ + entry = d->litlen_decode_table[ \ + BITS(DEFLATE_LITLEN_TABLEBITS)];\ + if (entry & HUFFDEC_NOT_LITERAL) { \ + if ((seq) != 1) \ + ENSURE_BITS(NUM_BITS_TO_ENSURE_AFTER_INLINE_LITERALS); \ + goto not_literal; \ + } \ + REMOVE_BITS(entry >> HUFFDEC_LEN_SHIFT); \ + *out_next++ = entry; \ + } + + INLINE_LITERAL(1); + INLINE_LITERAL(2); + INLINE_LITERAL(3); + INLINE_LITERAL(4); + INLINE_LITERAL(5); + INLINE_LITERAL(6); + INLINE_LITERAL(7); + INLINE_LITERAL(8); + continue; + } + + if (!(entry & HUFFDEC_NOT_LITERAL)) { + REMOVE_BITS(entry >> HUFFDEC_LEN_SHIFT); + if (SAFETY_CHECK(out_next == out_end)) + return false; + *out_next++ = entry; + continue; + } + not_literal: + if (likely(!(entry & HUFFDEC_NOT_FULL_LENGTH))) { + + /* The next TABLEBITS bits were enough to directly look + * up a litlen symbol, which was a length slot. In + * addition, the full match length, including the extra + * bits, fit into TABLEBITS. So the result of the + * lookup was the full match length. + * + * On typical data, most match lengths are short enough + * to fall into this category. */ + + REMOVE_BITS((entry >> HUFFDEC_LEN_SHIFT) & HUFFDEC_LEN_MASK); + length = entry & HUFFDEC_VALUE_MASK; + + } else if (!(entry & HUFFDEC_TREE_NONLEAF)) { + + /* The next TABLEBITS bits were enough to directly look + * up a litlen symbol, which was either a length slot or + * end-of-block. However, the full match length, + * including the extra bits (0 in the case of + * end-of-block), requires more than TABLEBITS bits to + * decode. So the result of the lookup was the length + * base and number of extra length bits. We will read + * this number of extra length bits and add them to the + * length base in order to construct the full length. + * + * On typical data, this case is rare. 
*/ + + REMOVE_BITS((entry >> HUFFDEC_LEN_SHIFT) & HUFFDEC_LEN_MASK); + entry &= HUFFDEC_VALUE_MASK; + + if (!CAN_ENSURE(DEFLATE_MAX_LITLEN_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_LENGTH_BITS)) + ENSURE_BITS(DEFLATE_MAX_EXTRA_LENGTH_BITS); + + length = (entry & HUFFDEC_LENGTH_BASE_MASK) + + POP_BITS(entry >> HUFFDEC_EXTRA_LENGTH_BITS_SHIFT); + } else { + + /* The next TABLEBITS bits were not enough to directly + * look up a litlen symbol. Therefore, we must walk the + * appropriate binary tree to decode the symbol, which + * may be a literal, length slot, or end-of-block. + * + * On typical data, this case is rare. */ + + REMOVE_BITS(DEFLATE_LITLEN_TABLEBITS); + do { + entry &= ~HUFFDEC_TREE_NONLEAF_FLAGS; + entry += POP_BITS(1); + entry = d->litlen_decode_table[entry]; + } while (entry & HUFFDEC_TREE_NONLEAF_FAST_FLAG); + if (entry < 256) { + if (SAFETY_CHECK(out_next == out_end)) + return false; + *out_next++ = entry; + continue; + } + entry -= 256; + + if (!CAN_ENSURE(DEFLATE_MAX_LITLEN_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_LENGTH_BITS)) + ENSURE_BITS(DEFLATE_MAX_EXTRA_LENGTH_BITS); + + length = (entry & HUFFDEC_LENGTH_BASE_MASK) + + POP_BITS(entry >> HUFFDEC_EXTRA_LENGTH_BITS_SHIFT); + } + + /* The match destination must not end after the end of the + * output buffer. */ + if (SAFETY_CHECK(length > out_end - out_next)) + return false; + + if (unlikely(length == HUFFDEC_END_OF_BLOCK_LENGTH)) + goto block_done; + + /* Read the match offset. */ + + if (!CAN_ENSURE(DEFLATE_MAX_MATCH_BITS)) { + if (CAN_ENSURE(DEFLATE_MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS)) + ENSURE_BITS(DEFLATE_MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS); + else + ENSURE_BITS(DEFLATE_MAX_OFFSET_CODEWORD_LEN); + } + + entry = d->offset_decode_table[BITS(DEFLATE_OFFSET_TABLEBITS)]; + if (likely(!(entry & HUFFDEC_TREE_NONLEAF_FAST_FLAG))) { + REMOVE_BITS(entry >> HUFFDEC_LEN_SHIFT); + entry &= HUFFDEC_VALUE_MASK; + } else { + REMOVE_BITS(DEFLATE_OFFSET_TABLEBITS); + do { + entry &= ~HUFFDEC_TREE_NONLEAF_FLAGS; + entry += POP_BITS(1); + entry = d->offset_decode_table[entry]; + } while (entry & HUFFDEC_TREE_NONLEAF_FAST_FLAG); + } + + /* The value we have here isn't the offset symbol itself, but + * rather the offset symbol indexed into + * deflate_offset_symbol_data[]. This gives us the offset base + * and number of extra offset bits without having to index + * additional tables in the main decode loop. */ + + if (!CAN_ENSURE(DEFLATE_MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS)) + ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS); + + offset = (entry & HUFFDEC_OFFSET_BASE_MASK) + + POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT); + + /* The match source must not begin before the beginning of the + * output buffer. */ + if (SAFETY_CHECK(offset > out_next - (const u8 *)out)) + return false; + + /* Copy the match: + * 'length' bytes at 'out_next - offset' to 'out_next'. */ + + lz_copy(out_next, length, offset, out_end, DEFLATE_MIN_MATCH_LEN); + + out_next += length; + } + +block_done: + /* Finished decoding a block. */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. Return %true if we got all the output we + * expected, otherwise %false. 
*/ + return (out_next == out_end); +} + +LIBEXPORT struct deflate_decompressor * +deflate_alloc_decompressor(void) +{ + return malloc(sizeof(struct deflate_decompressor)); +} + +LIBEXPORT void +deflate_free_decompressor(struct deflate_decompressor *d) +{ + free(d); +} diff --git a/src/endianness.h b/src/endianness.h new file mode 100644 index 0000000..41cfdf6 --- /dev/null +++ b/src/endianness.h @@ -0,0 +1,75 @@ +/* + * endianness.h + * + * Inline functions for endianness conversion. + */ + +#pragma once + +#include "compiler.h" +#include "types.h" + +static inline u16 bswap16(u16 n) +{ +#ifdef compiler_bswap16 + return compiler_bswap16(n); +#else + return (n << 8) | (n >> 8); +#endif +} + +static inline u32 bswap32(u32 n) +{ +#ifdef compiler_bswap32 + return compiler_bswap32(n); +#else + return (n << 24) | + ((n & 0xFF00) << 8) | + ((n & 0xFF0000) >> 8) | + (n >> 24); +#endif +} + +static inline u64 bswap64(u64 n) +{ +#ifdef compiler_bswap64 + return compiler_bswap64(n); +#else + return (n << 56) | + ((n & 0xFF00) << 40) | + ((n & 0xFF0000) << 24) | + ((n & 0xFF000000) << 8) | + ((n & 0xFF00000000) >> 8) | + ((n & 0xFF0000000000) >> 24) | + ((n & 0xFF000000000000) >> 40) | + (n >> 56); +#endif +} + +#if CPU_IS_BIG_ENDIAN +# define cpu_to_le16(n) bswap16(n) +# define cpu_to_le32(n) bswap32(n) +# define cpu_to_le64(n) bswap64(n) +# define le16_to_cpu(n) bswap16(n) +# define le32_to_cpu(n) bswap32(n) +# define le64_to_cpu(n) bswap64(n) +# define cpu_to_be16(n) (n) +# define cpu_to_be32(n) (n) +# define cpu_to_be64(n) (n) +# define be16_to_cpu(n) (n) +# define be32_to_cpu(n) (n) +# define be64_to_cpu(n) (n) +#else +# define cpu_to_le16(n) (n) +# define cpu_to_le32(n) (n) +# define cpu_to_le64(n) (n) +# define le16_to_cpu(n) (n) +# define le32_to_cpu(n) (n) +# define le64_to_cpu(n) (n) +# define cpu_to_be16(n) bswap16(n) +# define cpu_to_be32(n) bswap32(n) +# define cpu_to_be64(n) bswap64(n) +# define be16_to_cpu(n) bswap16(n) +# define be32_to_cpu(n) bswap32(n) +# define be64_to_cpu(n) bswap64(n) +#endif diff --git a/src/gzip_compress.c b/src/gzip_compress.c new file mode 100644 index 0000000..eea49f3 --- /dev/null +++ b/src/gzip_compress.c @@ -0,0 +1,64 @@ +/* + * gzip_compress.c + * + * Generate DEFLATE-compressed data in the gzip wrapper format. 
+ */ + +#include "libdeflate.h" + +#include "crc32.h" +#include "deflate_compress.h" +#include "gzip_constants.h" +#include "unaligned.h" + +LIBEXPORT size_t +gzip_compress(struct deflate_compressor *c, const void *in, size_t in_size, + void *out, size_t out_nbytes_avail) +{ + u8 *out_next = out; + unsigned compression_level; + u8 xfl; + size_t deflate_size; + + if (out_nbytes_avail <= GZIP_MIN_OVERHEAD) + return 0; + + /* ID1 */ + *out_next++ = GZIP_ID1; + /* ID2 */ + *out_next++ = GZIP_ID2; + /* CM */ + *out_next++ = GZIP_CM_DEFLATE; + /* FLG */ + *out_next++ = 0; + /* MTIME */ + put_unaligned_u32_be(GZIP_MTIME_UNAVAILABLE, out_next); + out_next += 4; + /* XFL */ + xfl = 0; + compression_level = deflate_get_compression_level(c); + if (compression_level < 2) + xfl |= GZIP_XFL_FASTEST_COMRESSION; + else if (compression_level >= 8) + xfl |= GZIP_XFL_SLOWEST_COMRESSION; + *out_next++ = xfl; + /* OS */ + *out_next++ = GZIP_OS_UNKNOWN; /* OS */ + + /* Compressed data */ + deflate_size = deflate_compress(c, in, in_size, out_next, + out_nbytes_avail - GZIP_MIN_OVERHEAD); + if (deflate_size == 0) + return 0; + out_next += deflate_size; + + /* CRC32 */ + put_unaligned_u32_be(crc32(in, in_size), out_next); + out_next += 4; + + /* ISIZE */ + put_unaligned_u32_be(in_size, out_next); + out_next += 4; + + return out_next - (u8 *)out; +} diff --git a/src/gzip_constants.h b/src/gzip_constants.h new file mode 100644 index 0000000..edd092a --- /dev/null +++ b/src/gzip_constants.h @@ -0,0 +1,47 @@ +/* + * gzip_constants.h + * + * Constants for the gzip wrapper format. + */ + +#pragma once + +#include "compiler.h" + +#define GZIP_MIN_HEADER_SIZE 10 +#define GZIP_FOOTER_SIZE 8 +#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) + +#define GZIP_ID1 0x1F +#define GZIP_ID2 0x8B + +#define GZIP_CM_DEFLATE 8 + +#define GZIP_FTEXT 0x01 +#define GZIP_FHCRC 0x02 +#define GZIP_FEXTRA 0x04 +#define GZIP_FNAME 0x08 +#define GZIP_FCOMMENT 0x10 +#define GZIP_FRESERVED 0xE0 + +#define GZIP_MTIME_UNAVAILABLE 0 + +#define GZIP_XFL_SLOWEST_COMRESSION 0x02 +#define GZIP_XFL_FASTEST_COMRESSION 0x04 +#define GZIP_XFL_RESERVED 0xF9 + +#define GZIP_OS_FAT 0 +#define GZIP_OS_AMIGA 1 +#define GZIP_OS_VMS 2 +#define GZIP_OS_UNIX 3 +#define GZIP_OS_VM_CMS 4 +#define GZIP_OS_ATARI_TOS 5 +#define GZIP_OS_HPFS 6 +#define GZIP_OS_MACINTOSH 7 +#define GZIP_OS_Z_SYSTEM 8 +#define GZIP_OS_CP_M 9 +#define GZIP_OS_TOPS_20 10 +#define GZIP_OS_NTFS 11 +#define GZIP_OS_QDOS 12 +#define GZIP_OS_RISCOS 13 +#define GZIP_OS_UNKNOWN 255 diff --git a/src/gzip_decompress.c b/src/gzip_decompress.c new file mode 100644 index 0000000..55fd6ed --- /dev/null +++ b/src/gzip_decompress.c @@ -0,0 +1,100 @@ +/* + * gzip_decompress.c + * + * Decompress DEFLATE-compressed data wrapped in the gzip format. 
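+ *
+ * The gzip member layout handled below is, in order: ID1, ID2, CM, FLG, a
+ * 4-byte MTIME, XFL, OS, then any optional FEXTRA / FNAME / FCOMMENT / FHCRC
+ * fields selected by FLG, the DEFLATE stream itself, and finally an 8-byte
+ * footer holding the CRC-32 and uncompressed size (ISIZE) of the data.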
+ */ + +#include "libdeflate.h" + +#include "crc32.h" +#include "gzip_constants.h" +#include "unaligned.h" + +LIBEXPORT bool +gzip_decompress(struct deflate_decompressor *d, + const void *in, size_t in_nbytes, void *out, size_t out_nbytes) +{ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + u8 flg; + + if (in_nbytes < GZIP_MIN_OVERHEAD) + return false; + + /* ID1 */ + if (*in_next++ != GZIP_ID1) + return false; + /* ID2 */ + if (*in_next++ != GZIP_ID2) + return false; + /* CM */ + if (*in_next++ != GZIP_CM_DEFLATE) + return false; + flg = *in_next++; + /* MTIME */ + in_next += 4; + /* XFL */ + if (*in_next++ & GZIP_XFL_RESERVED) + return false; + /* OS */ + in_next += 1; + + if (flg & GZIP_FRESERVED) + return false; + + /* Extra field */ + if (flg & GZIP_FEXTRA) { + u16 xlen = get_unaligned_u16_be(in_next); + in_next += 2; + + if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE) + return false; + + in_next += xlen; + } + + /* Original file name (zero terminated) */ + if (flg & GZIP_FNAME) { + while (*in_next != 0 && ++in_next != in_end) + ; + if (in_next != in_end) + in_next++; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return false; + } + + /* File comment (zero terminated) */ + if (flg & GZIP_FCOMMENT) { + while (*in_next != 0 && ++in_next != in_end) + ; + if (in_next != in_end) + in_next++; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return false; + } + + /* CRC16 for gzip header */ + if (flg & GZIP_FHCRC) { + in_next += 2; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return false; + } + + /* Compressed data */ + if (!deflate_decompress(d, in_next, in_end - GZIP_FOOTER_SIZE - in_next, + out, out_nbytes)) + return false; + + in_next = in_end - GZIP_FOOTER_SIZE; + + /* CRC32 */ + if (crc32(out, out_nbytes) != get_unaligned_u32_be(in_next)) + return false; + in_next += 4; + + /* ISIZE */ + if ((u32)out_nbytes != get_unaligned_u32_be(in_next)) + return false; + + return true; +} diff --git a/src/hc_matchfinder.h b/src/hc_matchfinder.h new file mode 100644 index 0000000..67cb746 --- /dev/null +++ b/src/hc_matchfinder.h @@ -0,0 +1,235 @@ +/* + * hc_matchfinder.h + * + * This is a Hash Chain (hc) based matchfinder. + * + * The data structure is a hash table where each hash bucket contains a linked + * list of sequences, referenced by position. + * + * For each sequence (position) in the input, the first 3 bytes are hashed and + * that sequence (position) is prepended to the appropriate linked list in the + * hash table. Since the sequences are inserted in order, each list is always + * sorted by increasing match offset. + * + * At the same time as inserting a sequence, we may search the linked list for + * matches with that sequence. At each step, the length of the match is + * computed. The search ends when the sequences get too far away (outside of + * the sliding window), or when the list ends (in the code this is the same + * check as "too far away"), or when 'max_search_depth' positions have been + * searched, or when a match of at least 'nice_len' bytes has been found. 
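+ *
+ * Concretely, hash_tab[] holds the most recent position inserted into each
+ * bucket, and next_tab[], indexed by position, links each position to the
+ * previous one that hashed to the same bucket; the search below simply
+ * follows these links.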
+ */ + +#pragma once + +#include "lz_extend.h" +#include "lz_hash3.h" +#include "matchfinder_common.h" +#include "unaligned.h" + +#ifndef HC_MATCHFINDER_HASH_ORDER +# if MATCHFINDER_WINDOW_ORDER < 14 +# define HC_MATCHFINDER_HASH_ORDER 14 +# else +# define HC_MATCHFINDER_HASH_ORDER 15 +# endif +#endif + +#define HC_MATCHFINDER_HASH_LENGTH (1UL << HC_MATCHFINDER_HASH_ORDER) + +#define HC_MATCHFINDER_TOTAL_LENGTH \ + (HC_MATCHFINDER_HASH_LENGTH + MATCHFINDER_WINDOW_SIZE) + +struct hc_matchfinder { + union { + pos_t mf_data[HC_MATCHFINDER_TOTAL_LENGTH]; + struct { + pos_t hash_tab[HC_MATCHFINDER_HASH_LENGTH]; + pos_t next_tab[MATCHFINDER_WINDOW_SIZE]; + }; + }; +} _aligned_attribute(MATCHFINDER_ALIGNMENT); + +static inline void +hc_matchfinder_init(struct hc_matchfinder *mf) +{ + matchfinder_init(mf->hash_tab, HC_MATCHFINDER_HASH_LENGTH); +} + +#if MATCHFINDER_IS_SLIDING +static inline void +hc_matchfinder_slide_window(struct hc_matchfinder *mf) +{ + matchfinder_rebase(mf->mf_data, HC_MATCHFINDER_TOTAL_LENGTH); +} +#endif + +/* + * Find the longest match longer than 'best_len'. + * + * @mf + * The matchfinder structure. + * @in_base + * Pointer to the next byte in the input buffer to process _at the last + * time hc_matchfinder_init() or hc_matchfinder_slide_window() was called_. + * @in_next + * Pointer to the next byte in the input buffer to process. This is the + * pointer to the bytes being matched against. + * @best_len + * Require a match at least this long. + * @max_len + * Maximum match length to return. + * @nice_len + * Stop searching if a match of at least this length is found. + * @max_search_depth + * Limit on the number of potential matches to consider. + * @offset_ret + * The match offset is returned here. + * + * Return the length of the match found, or 'best_len' if no match longer than + * 'best_len' was found. + */ +static inline unsigned +hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, + const u8 * const in_base, + const u8 * const in_next, + unsigned best_len, + const unsigned max_len, + const unsigned nice_len, + const unsigned max_search_depth, + unsigned *offset_ret) +{ + unsigned depth_remaining = max_search_depth; + const u8 *best_matchptr = best_matchptr; /* uninitialized */ + const u8 *matchptr; + unsigned len; + unsigned hash; + pos_t cur_match; + u32 first_3_bytes; + + /* Insert the current sequence into the appropriate hash chain. */ + if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES)) + goto out; + first_3_bytes = load_u24_unaligned(in_next); + hash = lz_hash3_u24(first_3_bytes, HC_MATCHFINDER_HASH_ORDER); + cur_match = mf->hash_tab[hash]; + mf->next_tab[in_next - in_base] = cur_match; + mf->hash_tab[hash] = in_next - in_base; + + if (unlikely(best_len >= max_len)) + goto out; + + /* Search the appropriate hash chain for matches. */ + + if (!(matchfinder_match_in_window(cur_match, in_base, in_next))) + goto out; + + if (best_len < 3) { + for (;;) { + /* No length 3 match found yet. + * Check the first 3 bytes. */ + matchptr = &in_base[cur_match]; + + if (load_u24_unaligned(matchptr) == first_3_bytes) + break; + + /* Not a match; keep trying. */ + cur_match = mf->next_tab[ + matchfinder_slot_for_match(cur_match)]; + if (!matchfinder_match_in_window(cur_match, + in_base, in_next)) + goto out; + if (!--depth_remaining) + goto out; + } + + /* Found a length 3 match. 
*/ + best_matchptr = matchptr; + best_len = lz_extend(in_next, best_matchptr, 3, max_len); + if (best_len >= nice_len) + goto out; + cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; + if (!matchfinder_match_in_window(cur_match, in_base, in_next)) + goto out; + if (!--depth_remaining) + goto out; + } + + for (;;) { + for (;;) { + matchptr = &in_base[cur_match]; + + /* Already found a length 3 match. Try for a longer match; + * start by checking the last 2 bytes and the first 4 bytes. */ + #if UNALIGNED_ACCESS_IS_FAST + if ((load_u32_unaligned(matchptr + best_len - 3) == + load_u32_unaligned(in_next + best_len - 3)) && + (load_u32_unaligned(matchptr) == + load_u32_unaligned(in_next))) + #else + if (matchptr[best_len] == in_next[best_len]) + #endif + break; + + cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; + if (!matchfinder_match_in_window(cur_match, in_base, in_next)) + goto out; + if (!--depth_remaining) + goto out; + } + + if (UNALIGNED_ACCESS_IS_FAST) + len = 4; + else + len = 0; + len = lz_extend(in_next, matchptr, len, max_len); + if (len > best_len) { + best_len = len; + best_matchptr = matchptr; + if (best_len >= nice_len) + goto out; + } + cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; + if (!matchfinder_match_in_window(cur_match, in_base, in_next)) + goto out; + if (!--depth_remaining) + goto out; + } +out: + *offset_ret = in_next - best_matchptr; + return best_len; +} + +/* + * Advance the match-finder, but don't search for matches. + * + * @mf + * The matchfinder structure. + * @in_base + * Pointer to the next byte in the input buffer to process _at the last + * time hc_matchfinder_init() or hc_matchfinder_slide_window() was called_. + * @in_next + * Pointer to the next byte in the input buffer to process. + * @in_end + * Pointer to the end of the input buffer. + * @count + * Number of bytes to skip; must be > 0. + */ +static inline void +hc_matchfinder_skip_positions(struct hc_matchfinder * restrict mf, + const u8 *in_base, + const u8 *in_next, + const u8 *in_end, + unsigned count) +{ + unsigned hash; + + if (unlikely(in_next + count >= in_end - LZ_HASH_REQUIRED_NBYTES)) + return; + + do { + hash = lz_hash3(in_next, HC_MATCHFINDER_HASH_ORDER); + mf->next_tab[in_next - in_base] = mf->hash_tab[hash]; + mf->hash_tab[hash] = in_next - in_base; + in_next++; + } while (--count); +} diff --git a/src/lz_extend.h b/src/lz_extend.h new file mode 100644 index 0000000..94b281a --- /dev/null +++ b/src/lz_extend.h @@ -0,0 +1,60 @@ +/* + * lz_extend.h + * + * Fast match extension for Lempel-Ziv matchfinding. + */ + +#pragma once + +#include "bitops.h" +#include "unaligned.h" + +/* + * Return the number of bytes at @matchptr that match the bytes at @strptr, up + * to a maximum of @max_len. Initially, @start_len bytes are matched. 
+ */ +static inline unsigned +lz_extend(const u8 * const strptr, const u8 * const matchptr, + const unsigned start_len, const unsigned max_len) +{ + unsigned len = start_len; + machine_word_t v_word; + + if (UNALIGNED_ACCESS_IS_FAST) { + + if (likely(max_len - len >= 4 * WORDSIZE)) { + + #define COMPARE_WORD_STEP \ + v_word = load_word_unaligned(&matchptr[len]) ^ \ + load_word_unaligned(&strptr[len]); \ + if (v_word != 0) \ + goto word_differs; \ + len += WORDSIZE; \ + + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + #undef COMPARE_WORD_STEP + } + + while (len + WORDSIZE <= max_len) { + v_word = load_word_unaligned(&matchptr[len]) ^ + load_word_unaligned(&strptr[len]); + if (v_word != 0) + goto word_differs; + len += WORDSIZE; + } + } + + while (len < max_len && matchptr[len] == strptr[len]) + len++; + return len; + +word_differs: + if (CPU_IS_LITTLE_ENDIAN) + len += (ffsw(v_word) >> 3); + else + len += (flsw(v_word) >> 3); + return len; +} diff --git a/src/lz_hash3.h b/src/lz_hash3.h new file mode 100644 index 0000000..ec322d9 --- /dev/null +++ b/src/lz_hash3.h @@ -0,0 +1,49 @@ +/* + * lz_hash3.h + * + * 3-byte hashing for Lempel-Ziv matchfinding. + */ + +#pragma once + +#include "unaligned.h" + +static inline u32 +loaded_u32_to_u24(u32 v) +{ + if (CPU_IS_LITTLE_ENDIAN) + return v & 0xFFFFFF; + else + return v >> 8; +} + +static inline u32 +load_u24_unaligned(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return loaded_u32_to_u24(load_u32_unaligned(p)); + else + return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); +} + +static inline u32 +lz_hash3_u24(u32 str, unsigned num_bits) +{ + return (u32)(str * 0x1E35A7BD) >> (32 - num_bits); +} + +/* + * Hash the next 3-byte sequence in the window, producing a hash of length + * 'num_bits' bits. At least LZ_HASH_REQUIRED_NBYTES must be available at 'p'; + * this might be 4 bytes rather than 3 because an unaligned load is faster on + * some architectures. + */ +static inline u32 +lz_hash3(const u8 *p, unsigned num_bits) +{ + return lz_hash3_u24(load_u24_unaligned(p), num_bits); +} + +/* Number of bytes the hash function actually requires be available, due to the + * possibility of an unaligned load. */ +#define LZ_HASH_REQUIRED_NBYTES (UNALIGNED_ACCESS_IS_FAST ? 4 : 3) diff --git a/src/matchfinder_avx2.h b/src/matchfinder_avx2.h new file mode 100644 index 0000000..fe98b63 --- /dev/null +++ b/src/matchfinder_avx2.h @@ -0,0 +1,64 @@ +/* + * matchfinder_avx2.h + * + * Matchfinding routines optimized for Intel AVX2 (Advanced Vector Extensions). 
+ */ + +#include + +static inline bool +matchfinder_init_avx2(pos_t *data, size_t size) +{ + __m256i v, *p; + size_t n; + + if (size % sizeof(__m256i) * 4) + return false; + + if (sizeof(pos_t) == 2) + v = _mm256_set1_epi16(MATCHFINDER_INITVAL); + else if (sizeof(pos_t) == 4) + v = _mm256_set1_epi32(MATCHFINDER_INITVAL); + else + return false; + + p = (__m256i *)data; + n = size / (sizeof(__m256i) * 4); + do { + p[0] = v; + p[1] = v; + p[2] = v; + p[3] = v; + p += 4; + } while (--n); + return true; +} + +static inline bool +matchfinder_rebase_avx2(pos_t *data, size_t size) +{ + __m256i v, *p; + size_t n; + + if ((size % sizeof(__m256i) * 4 != 0)) + return false; + + if (sizeof(pos_t) == 2) + v = _mm256_set1_epi16((pos_t)-MATCHFINDER_WINDOW_SIZE); + else if (sizeof(pos_t) == 4) + v = _mm256_set1_epi32((pos_t)-MATCHFINDER_WINDOW_SIZE); + else + return false; + + p = (__m256i *)data; + n = size / (sizeof(__m256i) * 4); + do { + /* PADDSW: Add Packed Signed Integers With Signed Saturation */ + p[0] = _mm256_adds_epi16(p[0], v); + p[1] = _mm256_adds_epi16(p[1], v); + p[2] = _mm256_adds_epi16(p[2], v); + p[3] = _mm256_adds_epi16(p[3], v); + p += 4; + } while (--n); + return true; +} diff --git a/src/matchfinder_common.h b/src/matchfinder_common.h new file mode 100644 index 0000000..2a01336 --- /dev/null +++ b/src/matchfinder_common.h @@ -0,0 +1,163 @@ +/* + * matchfinder_common.h + * + * Common code for Lempel-Ziv matchfinding. + */ + +#pragma once + +#include "types.h" + +#include + +#ifndef MATCHFINDER_WINDOW_ORDER +# error "MATCHFINDER_WINDOW_ORDER must be defined!" +#endif + +#ifndef MATCHFINDER_IS_SLIDING +# error "MATCHFINDER_IS_SLIDING must be defined!" +#endif + +#define MATCHFINDER_WINDOW_SIZE ((size_t)1 << MATCHFINDER_WINDOW_ORDER) + +#if MATCHFINDER_IS_SLIDING +# include "matchfinder_sliding.h" +#else +# include "matchfinder_nonsliding.h" +#endif + +#define MATCHFINDER_ALIGNMENT 8 + +#ifdef __AVX2__ +# include "matchfinder_avx2.h" +# if MATCHFINDER_ALIGNMENT < 32 +# undef MATCHFINDER_ALIGNMENT +# define MATCHFINDER_ALIGNMENT 32 +# endif +#endif + +#ifdef __SSE2__ +# include "matchfinder_sse2.h" +# if MATCHFINDER_ALIGNMENT < 16 +# undef MATCHFINDER_ALIGNMENT +# define MATCHFINDER_ALIGNMENT 16 +# endif +#endif + +/* + * Representation of a match. + */ +struct lz_match { + + /* The number of bytes matched. */ + pos_t length; + + /* The offset back from the current position that was matched. */ + pos_t offset; +}; + +static inline bool +matchfinder_memset_init_okay(void) +{ + /* All bytes must match in order to use memset. */ + const pos_t v = MATCHFINDER_INITVAL; + if (sizeof(pos_t) == 2) + return (u8)v == (u8)(v >> 8); + if (sizeof(pos_t) == 4) + return (u8)v == (u8)(v >> 8) && + (u8)v == (u8)(v >> 16) && + (u8)v == (u8)(v >> 24); + return false; +} + +/* + * Initialize the hash table portion of the matchfinder. + * + * Essentially, this is an optimized memset(). + * + * 'data' must be aligned to a MATCHFINDER_ALIGNMENT boundary. + */ +static inline void +matchfinder_init(pos_t *data, size_t num_entries) +{ + const size_t size = num_entries * sizeof(data[0]); + +#ifdef __AVX2__ + if (matchfinder_init_avx2(data, size)) + return; +#endif + +#ifdef __SSE2__ + if (matchfinder_init_sse2(data, size)) + return; +#endif + + if (matchfinder_memset_init_okay()) { + memset(data, (u8)MATCHFINDER_INITVAL, size); + return; + } + + for (size_t i = 0; i < num_entries; i++) + data[i] = MATCHFINDER_INITVAL; +} + +#if MATCHFINDER_IS_SLIDING +/* + * Slide the matchfinder by WINDOW_SIZE bytes. 
+ * + * This must be called just after each WINDOW_SIZE bytes have been run through + * the matchfinder. + * + * This will subtract WINDOW_SIZE bytes from each entry in the array specified. + * The effect is that all entries are updated to be relative to the current + * position, rather than the position WINDOW_SIZE bytes prior. + * + * Underflow is detected and replaced with signed saturation. This ensures that + * once the sliding window has passed over a position, that position forever + * remains out of bounds. + * + * The array passed in must contain all matchfinder data that is + * position-relative. Concretely, this will include the hash table as well as + * the table of positions that is used to link together the sequences in each + * hash bucket. Note that in the latter table, the links are 1-ary in the case + * of "hash chains", and 2-ary in the case of "binary trees". In either case, + * the links need to be rebased in the same way. + */ +static inline void +matchfinder_rebase(pos_t *data, size_t num_entries) +{ + const size_t size = num_entries * sizeof(data[0]); + +#ifdef __AVX2__ + if (matchfinder_rebase_avx2(data, size)) + return; +#endif + +#ifdef __SSE2__ + if (matchfinder_rebase_sse2(data, size)) + return; +#endif + + if (MATCHFINDER_WINDOW_SIZE == 32768) { + /* Branchless version for 32768 byte windows. If the value was + * already negative, clear all bits except the sign bit; this + * changes the value to -32768. Otherwise, set the sign bit; + * this is equivalent to subtracting 32768. */ + for (size_t i = 0; i < num_entries; i++) { + u16 v = data[i]; + u16 sign_bit = v & 0x8000; + v &= sign_bit - ((sign_bit >> 15) ^ 1); + v |= 0x8000; + data[i] = v; + } + return; + } + + for (size_t i = 0; i < num_entries; i++) { + if (data[i] >= 0) + data[i] -= (pos_t)-MATCHFINDER_WINDOW_SIZE; + else + data[i] = (pos_t)-MATCHFINDER_WINDOW_SIZE; + } +} +#endif /* MATCHFINDER_IS_SLIDING */ diff --git a/src/matchfinder_nonsliding.h b/src/matchfinder_nonsliding.h new file mode 100644 index 0000000..e08f461 --- /dev/null +++ b/src/matchfinder_nonsliding.h @@ -0,0 +1,47 @@ +/* + * matchfinder_nonsliding.h + * + * Definitions for nonsliding window matchfinders. + * + * "Nonsliding window" means that any prior sequence can be matched. + */ + +#if MATCHFINDER_WINDOW_ORDER <= 16 +typedef u16 pos_t; +#else +typedef u32 pos_t; +#endif + +#if MATCHFINDER_WINDOW_ORDER != 16 && MATCHFINDER_WINDOW_ORDER != 32 + +/* Not all the bits of the position type are needed, so the sign bit can be + * reserved to mean "out of bounds". */ +#define MATCHFINDER_INITVAL ((pos_t)-1) + +static inline bool +matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) +{ + return !(cur_match & ((pos_t)1 << (sizeof(pos_t) * 8 - 1))); +} + +#else + +/* All bits of the position type are needed, so use 0 to mean "out of bounds". + * This prevents the beginning of the buffer from matching anything; however, + * this doesn't matter much. */ + +#define MATCHFINDER_INITVAL ((pos_t)0) + +static inline bool +matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) +{ + return cur_match != 0; +} + +#endif + +static inline pos_t +matchfinder_slot_for_match(pos_t cur_match) +{ + return cur_match; +} diff --git a/src/matchfinder_sliding.h b/src/matchfinder_sliding.h new file mode 100644 index 0000000..4b8a515 --- /dev/null +++ b/src/matchfinder_sliding.h @@ -0,0 +1,30 @@ +/* + * matchfinder_sliding.h + * + * Definitions for sliding window matchfinders. 
diff --git a/src/matchfinder_sliding.h b/src/matchfinder_sliding.h
new file mode 100644
index 0000000..4b8a515
--- /dev/null
+++ b/src/matchfinder_sliding.h
@@ -0,0 +1,30 @@
+/*
+ * matchfinder_sliding.h
+ *
+ * Definitions for sliding window matchfinders.
+ *
+ * "Sliding window" means that only sequences beginning in the most recent
+ * MATCHFINDER_WINDOW_SIZE bytes can be matched.
+ */
+
+#if MATCHFINDER_WINDOW_ORDER <= 15
+typedef s16 pos_t;
+#else
+typedef s32 pos_t;
+#endif
+
+#define MATCHFINDER_INITVAL ((pos_t)-MATCHFINDER_WINDOW_SIZE)
+
+/* In the sliding window case, positions are stored relative to 'in_base'. */
+
+static inline bool
+matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next)
+{
+	return cur_match > (pos_t)((in_next - in_base) - MATCHFINDER_WINDOW_SIZE);
+}
+
+static inline pos_t
+matchfinder_slot_for_match(pos_t cur_match)
+{
+	return cur_match & (MATCHFINDER_WINDOW_SIZE - 1);
+}
diff --git a/src/matchfinder_sse2.h b/src/matchfinder_sse2.h
new file mode 100644
index 0000000..cc27600
--- /dev/null
+++ b/src/matchfinder_sse2.h
@@ -0,0 +1,64 @@
+/*
+ * matchfinder_sse2.h
+ *
+ * Matchfinding routines optimized for Intel SSE2 (Streaming SIMD Extensions 2).
+ */
+
+#include <emmintrin.h>
+
+static inline bool
+matchfinder_init_sse2(pos_t *data, size_t size)
+{
+	__m128i v, *p;
+	size_t n;
+
+	if (size % (sizeof(__m128i) * 4) != 0)
+		return false;
+
+	if (sizeof(pos_t) == 2)
+		v = _mm_set1_epi16(MATCHFINDER_INITVAL);
+	else if (sizeof(pos_t) == 4)
+		v = _mm_set1_epi32(MATCHFINDER_INITVAL);
+	else
+		return false;
+
+	p = (__m128i *)data;
+	n = size / (sizeof(__m128i) * 4);
+	do {
+		p[0] = v;
+		p[1] = v;
+		p[2] = v;
+		p[3] = v;
+		p += 4;
+	} while (--n);
+	return true;
+}
+
+static inline bool
+matchfinder_rebase_sse2(pos_t *data, size_t size)
+{
+	__m128i v, *p;
+	size_t n;
+
+	if (size % (sizeof(__m128i) * 4) != 0)
+		return false;
+
+	/* The saturating add below works on 16-bit lanes, so this path only
+	 * handles 16-bit positions; wider positions use the scalar fallback. */
+	if (sizeof(pos_t) != 2)
+		return false;
+
+	v = _mm_set1_epi16((pos_t)-MATCHFINDER_WINDOW_SIZE);
+
+	p = (__m128i *)data;
+	n = size / (sizeof(__m128i) * 4);
+	do {
+		/* PADDSW: Add Packed Signed Integers With Signed Saturation */
+		p[0] = _mm_adds_epi16(p[0], v);
+		p[1] = _mm_adds_epi16(p[1], v);
+		p[2] = _mm_adds_epi16(p[2], v);
+		p[3] = _mm_adds_epi16(p[3], v);
+		p += 4;
+	} while (--n);
+	return true;
+}
diff --git a/src/types.h b/src/types.h
new file mode 100644
index 0000000..205dbd3
--- /dev/null
+++ b/src/types.h
@@ -0,0 +1,38 @@
+/*
+ * types.h
+ *
+ * Definitions of fixed-width integers, 'bool', 'size_t', and 'machine_word_t'.
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+typedef int8_t s8;
+typedef int16_t s16;
+typedef int32_t s32;
+typedef int64_t s64;
+
+typedef uint16_t le16;
+typedef uint32_t le32;
+typedef uint64_t le64;
+
+typedef uint16_t be16;
+typedef uint32_t be32;
+typedef uint64_t be64;
+
+/*
+ * Type of a machine word. 'unsigned long' would be logical, but that is only
+ * 32 bits on x86_64 Windows. The same applies to 'uint_fast32_t'. So the best
+ * we can do without a bunch of #ifdefs appears to be 'size_t'.
+ */
+typedef size_t machine_word_t;
+
+#define WORDSIZE sizeof(machine_word_t)
diff --git a/src/unaligned.h b/src/unaligned.h
new file mode 100644
index 0000000..d5b0f95
--- /dev/null
+++ b/src/unaligned.h
@@ -0,0 +1,216 @@
+/*
+ * unaligned.h
+ *
+ * Inline functions for unaligned memory access.
+ */ + +#pragma once + +#include "compiler.h" +#include "endianness.h" +#include "types.h" + +#define DEFINE_UNALIGNED_TYPE(type) \ +struct type##_unaligned { \ + type v; \ +} _packed_attribute; \ + \ +static inline type \ +load_##type##_unaligned(const void *p) \ +{ \ + return ((const struct type##_unaligned *)p)->v; \ +} \ + \ +static inline void \ +store_##type##_unaligned(type val, void *p) \ +{ \ + ((struct type##_unaligned *)p)->v = val; \ +} + +DEFINE_UNALIGNED_TYPE(u16); +DEFINE_UNALIGNED_TYPE(u32); +DEFINE_UNALIGNED_TYPE(u64); +DEFINE_UNALIGNED_TYPE(machine_word_t); + +#define load_word_unaligned load_machine_word_t_unaligned +#define store_word_unaligned store_machine_word_t_unaligned + +static inline u16 +get_unaligned_u16_le(const void *p) +{ + u16 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = le16_to_cpu(load_u16_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u16)p8[0] << 0; + v |= (u16)p8[1] << 8; + } + return v; +} + +static inline u32 +get_unaligned_u32_le(const void *p) +{ + u32 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = le32_to_cpu(load_u32_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u32)p8[0] << 0; + v |= (u32)p8[1] << 8; + v |= (u32)p8[2] << 16; + v |= (u32)p8[3] << 24; + } + return v; +} + +static inline u64 +get_unaligned_u64_le(const void *p) +{ + u64 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = le64_to_cpu(load_u64_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u64)p8[0] << 0; + v |= (u64)p8[1] << 8; + v |= (u64)p8[2] << 16; + v |= (u64)p8[3] << 24; + v |= (u64)p8[4] << 32; + v |= (u64)p8[5] << 40; + v |= (u64)p8[6] << 48; + v |= (u64)p8[7] << 56; + } + return v; +} + +static inline machine_word_t +get_unaligned_word_le(const void *p) +{ + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + if (WORDSIZE == 4) + return get_unaligned_u32_le(p); + else + return get_unaligned_u64_le(p); +} + +static inline void +put_unaligned_u16_le(u16 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(cpu_to_le16(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 0) & 0xFF; + p8[1] = (v >> 8) & 0xFF; + } +} + +static inline void +put_unaligned_u32_le(u32 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(cpu_to_le32(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 0) & 0xFF; + p8[1] = (v >> 8) & 0xFF; + p8[2] = (v >> 16) & 0xFF; + p8[3] = (v >> 24) & 0xFF; + } +} + +static inline void +put_unaligned_u64_le(u64 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u64_unaligned(cpu_to_le64(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 0) & 0xFF; + p8[1] = (v >> 8) & 0xFF; + p8[2] = (v >> 16) & 0xFF; + p8[3] = (v >> 24) & 0xFF; + p8[4] = (v >> 32) & 0xFF; + p8[5] = (v >> 40) & 0xFF; + p8[6] = (v >> 48) & 0xFF; + p8[7] = (v >> 56) & 0xFF; + } +} + +static inline void +put_unaligned_word_le(machine_word_t v, void *p) +{ + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + if (WORDSIZE == 4) + put_unaligned_u32_le(v, p); + else + put_unaligned_u64_le(v, p); +} + +static inline u16 +get_unaligned_u16_be(const void *p) +{ + u16 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = be16_to_cpu(load_u16_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u16)p8[0] << 8; + v |= (u16)p8[1] << 0; + } + return v; +} + +static inline u32 +get_unaligned_u32_be(const void *p) +{ + u32 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = be32_to_cpu(load_u32_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u32)p8[0] << 24; + v |= (u32)p8[1] << 16; + v |= (u32)p8[2] << 8; + v |= (u32)p8[3] << 0; + } + 
return v; +} + +static inline void +put_unaligned_u16_be(u16 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(cpu_to_be16(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 8) & 0xFF; + p8[1] = (v >> 0) & 0xFF; + } +} + +static inline void +put_unaligned_u32_be(u32 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(cpu_to_be32(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 24) & 0xFF; + p8[1] = (v >> 16) & 0xFF; + p8[2] = (v >> 8) & 0xFF; + p8[3] = (v >> 0) & 0xFF; + } +} diff --git a/src/zlib_compress.c b/src/zlib_compress.c new file mode 100644 index 0000000..9fdb359 --- /dev/null +++ b/src/zlib_compress.c @@ -0,0 +1,56 @@ +/* + * zlib_compress.c + * + * Generate DEFLATE-compressed data in the zlib wrapper format. + */ + +#include "libdeflate.h" + +#include "adler32.h" +#include "deflate_compress.h" +#include "unaligned.h" +#include "zlib_constants.h" + +LIBEXPORT size_t +zlib_compress(struct deflate_compressor *c, const void *in, size_t in_size, + void *out, size_t out_nbytes_avail) +{ + u8 *out_next = out; + u16 hdr; + unsigned compression_level; + unsigned level_hint; + size_t deflate_size; + + if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD) + return 0; + + /* 2 byte header: CMF and FLG */ + hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12); + compression_level = deflate_get_compression_level(c); + if (compression_level < 2) + level_hint = ZLIB_FASTEST_COMPRESSION; + else if (compression_level < 6) + level_hint = ZLIB_FAST_COMPRESSION; + else if (compression_level < 8) + level_hint = ZLIB_DEFAULT_COMPRESSION; + else + level_hint = ZLIB_SLOWEST_COMPRESSION; + hdr |= level_hint << 6; + hdr |= 31 - (hdr % 31); + + put_unaligned_u16_be(hdr, out_next); + out_next += 2; + + /* Compressed data */ + deflate_size = deflate_compress(c, in, in_size, out_next, + out_nbytes_avail - ZLIB_MIN_OVERHEAD); + if (deflate_size == 0) + return 0; + out_next += deflate_size; + + /* ADLER32 */ + put_unaligned_u32_be(adler32(in, in_size), out_next); + out_next += 4; + + return out_next - (u8 *)out; +} diff --git a/src/zlib_constants.h b/src/zlib_constants.h new file mode 100644 index 0000000..faec03b --- /dev/null +++ b/src/zlib_constants.h @@ -0,0 +1,20 @@ +/* + * zlib_constants.h + * + * Constants for the zlib wrapper format. + */ + +#pragma once + +#define ZLIB_MIN_HEADER_SIZE 2 +#define ZLIB_FOOTER_SIZE 4 +#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE) + +#define ZLIB_CM_DEFLATE 8 + +#define ZLIB_CINFO_32K_WINDOW 7 + +#define ZLIB_FASTEST_COMPRESSION 0 +#define ZLIB_FAST_COMPRESSION 1 +#define ZLIB_DEFAULT_COMPRESSION 2 +#define ZLIB_SLOWEST_COMPRESSION 3 diff --git a/src/zlib_decompress.c b/src/zlib_decompress.c new file mode 100644 index 0000000..bd7fc6b --- /dev/null +++ b/src/zlib_decompress.c @@ -0,0 +1,56 @@ +/* + * zlib_decompress.c + * + * Decompress DEFLATE-compressed data wrapped in the zlib format. 
+ */ + +#include "libdeflate.h" + +#include "adler32.h" +#include "unaligned.h" +#include "zlib_constants.h" + +LIBEXPORT bool +zlib_decompress(struct deflate_decompressor *d, + const void *in, size_t in_nbytes, void *out, size_t out_nbytes) +{ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + u16 hdr; + + if (in_nbytes < ZLIB_MIN_OVERHEAD) + return false; + + /* 2 byte header: CMF and FLG */ + hdr = get_unaligned_u16_be(in_next); + in_next += 2; + + /* FCHECK */ + if ((hdr % 31) != 0) + return false; + + /* CM */ + if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE) + return false; + + /* CINFO */ + if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW) + return false; + + /* FDICT */ + if ((hdr >> 5) & 1) + return false; + + /* Compressed data */ + if (!deflate_decompress(d, in_next, in_end - ZLIB_FOOTER_SIZE - in_next, + out, out_nbytes)) + return false; + + in_next = in_end - ZLIB_FOOTER_SIZE; + + /* ADLER32 */ + if (adler32(out, out_nbytes) != get_unaligned_u32_be(in_next)) + return false; + + return true; +}
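As a sanity check on the CMF/FLG arithmetic used by zlib_compress() and verified by zlib_decompress(), here is a standalone snippet (not part of the patch). With ZLIB_DEFAULT_COMPRESSION as the level hint it reproduces the familiar 78 9C bytes seen at the start of most zlib streams, and it confirms that the FCHECK adjustment makes the 16-bit header value divisible by 31, which is exactly the check the decompressor performs.

#include <stdio.h>

int main(void)
{
	/* CM = 8 (DEFLATE), CINFO = 7 (32 KiB window), FLEVEL = 2 (default). */
	unsigned hdr = (8u << 8) | (7u << 12);
	hdr |= 2u << 6;
	hdr |= 31 - (hdr % 31);		/* FCHECK */

	printf("header bytes: %02X %02X\n", hdr >> 8, hdr & 0xFF);	/* 78 9C */
	printf("(CMF*256 + FLG) %% 31 = %u\n", hdr % 31);		/* 0 */
	return 0;
}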