From 6a982b7ac73f52be4832b18cc159c5405ab94a80 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 27 Dec 2014 23:11:49 -0600 Subject: [PATCH] initial commit --- CMakeLists.txt | 83 ++ README | 19 + examples/benchmark.c | 210 +++ libdeflate.h | 131 ++ src/adler32.c | 19 + src/adler32.h | 12 + src/bitops.h | 80 ++ src/bt_matchfinder.h | 279 ++++ src/compiler-gcc.h | 52 + src/compiler.h | 60 + src/crc32.c | 73 ++ src/crc32.h | 12 + src/deflate_compress.c | 2323 ++++++++++++++++++++++++++++++++++ src/deflate_compress.h | 9 + src/deflate_constants.h | 59 + src/deflate_decompress.c | 1455 +++++++++++++++++++++ src/endianness.h | 75 ++ src/gzip_compress.c | 64 + src/gzip_constants.h | 47 + src/gzip_decompress.c | 100 ++ src/hc_matchfinder.h | 235 ++++ src/lz_extend.h | 60 + src/lz_hash3.h | 49 + src/matchfinder_avx2.h | 64 + src/matchfinder_common.h | 163 +++ src/matchfinder_nonsliding.h | 47 + src/matchfinder_sliding.h | 30 + src/matchfinder_sse2.h | 64 + src/types.h | 38 + src/unaligned.h | 216 ++++ src/zlib_compress.c | 56 + src/zlib_constants.h | 20 + src/zlib_decompress.c | 56 + 33 files changed, 6260 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 README create mode 100644 examples/benchmark.c create mode 100644 libdeflate.h create mode 100644 src/adler32.c create mode 100644 src/adler32.h create mode 100644 src/bitops.h create mode 100644 src/bt_matchfinder.h create mode 100644 src/compiler-gcc.h create mode 100644 src/compiler.h create mode 100644 src/crc32.c create mode 100644 src/crc32.h create mode 100644 src/deflate_compress.c create mode 100644 src/deflate_compress.h create mode 100644 src/deflate_constants.h create mode 100644 src/deflate_decompress.c create mode 100644 src/endianness.h create mode 100644 src/gzip_compress.c create mode 100644 src/gzip_constants.h create mode 100644 src/gzip_decompress.c create mode 100644 src/hc_matchfinder.h create mode 100644 src/lz_extend.h create mode 100644 src/lz_hash3.h create mode 100644 src/matchfinder_avx2.h create mode 100644 src/matchfinder_common.h create mode 100644 src/matchfinder_nonsliding.h create mode 100644 src/matchfinder_sliding.h create mode 100644 src/matchfinder_sse2.h create mode 100644 src/types.h create mode 100644 src/unaligned.h create mode 100644 src/zlib_compress.c create mode 100644 src/zlib_constants.h create mode 100644 src/zlib_decompress.c diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..7dc9d49 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,83 @@ +cmake_minimum_required(VERSION 2.6) +project(libdeflate C) + +set(LIB_VERSION_MAJOR 0) +set(LIB_VERSION_MINOR 0) +set(LIB_VERSION_PATCH 0) + +set(LIB_VERSION_STRING "${LIB_VERSION_MAJOR}.${LIB_VERSION_MINOR}.${LIB_VERSION_PATCH}") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(C_FLAGS "-std=c11 -fvisibility=hidden") + +set(CMAKE_C_FLAGS_RELEASE "${C_FLAGS} -O2 -DNDEBUG") +set(CMAKE_C_FLAGS_DEBUG "${C_FLAGS} -O0 -g") + +include_directories(".") + +option(SUPPORT_COMPRESSION "Support DEFLATE compression" ON) +if(SUPPORT_COMPRESSION) + set(LIB_SOURCES ${LIB_SOURCES} src/deflate_compress.c) +endif() + +option(SUPPORT_DECOMPRESSION "Support DEFLATE decompression" ON) +if(SUPPORT_DECOMPRESSION) + set(LIB_SOURCES ${LIB_SOURCES} src/deflate_decompress.c) +endif() + +option(SUPPORT_ZLIB "Support zlib wrapper format" ON) +if(SUPPORT_ZLIB) + set(LIB_SOURCES ${LIB_SOURCES} src/adler32.c) + if(SUPPORT_COMPRESSION) + set(LIB_SOURCES ${LIB_SOURCES} src/zlib_compress.c) + endif() + 
if(SUPPORT_DECOMPRESSION) + set(LIB_SOURCES ${LIB_SOURCES} src/zlib_decompress.c) + endif() +endif() + +option(SUPPORT_GZIP "Support gzip wrapper format" ON) +if(SUPPORT_GZIP) + set(LIB_SOURCES ${LIB_SOURCES} src/crc32.c) + if(SUPPORT_COMPRESSION) + set(LIB_SOURCES ${LIB_SOURCES} src/gzip_compress.c) + endif() + if(SUPPORT_DECOMPRESSION) + set(LIB_SOURCES ${LIB_SOURCES} src/gzip_decompress.c) + endif() +endif() + +option(SUPPORT_NEAR_OPTIMAL_PARSING "Support near optimal parsing (high compression mode)" ON) +if(SUPPORT_NEAR_OPTIMAL_PARSING) + add_definitions(-DSUPPORT_NEAR_OPTIMAL_PARSING=1) +else() + add_definitions(-DSUPPORT_NEAR_OPTIMAL_PARSING=0) +endif() + +option(UNSAFE_DECOMPRESSION "Assume that all compressed data is valid (faster but insecure)" OFF) +if(UNSAFE_DECOMPRESSION) + add_definitions(-DUNSAFE_DECOMPRESSION=1) +else() + add_definitions(-DUNSAFE_DECOMPRESSION=0) +endif() + +option(BUILD_EXAMPLES "Build the example programs" OFF) +if(BUILD_EXAMPLES) + add_executable(benchmark examples/benchmark.c) + target_link_libraries(benchmark deflate) +endif() + +add_library(deflate SHARED ${LIB_SOURCES}) +add_library(deflatestatic STATIC ${LIB_SOURCES}) + +set_target_properties(deflate PROPERTIES VERSION ${LIB_VERSION_STRING}) +set_target_properties(deflate PROPERTIES SOVERSION ${LIB_VERSION_MAJOR}) + +install(TARGETS deflate deflatestatic + LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" + ARCHIVE DESTINATION "${CMAKE_INSTALL_PREFIX}/lib") + +install(FILES libdeflate.h DESTINATION "${CMAKE_INSTALL_PREFIX}/include") diff --git a/README b/README new file mode 100644 index 0000000..b0100eb --- /dev/null +++ b/README @@ -0,0 +1,19 @@ +This is libdeflate, a free (public domain) library for fast, whole-buffer +DEFLATE compression and decompression. + +The supported formats are: + + - DEFLATE (raw) + - zlib (DEFLATE with zlib header and footer) + - gzip (DEFLATE with gzip header and footer) + +libdeflate is heavily optimized. It is significantly faster than zlib, both for +compression and decompression. In addition, at compression levels 8 and above +it provides a compression ratio better than zlib's, while still being about the +same speed as zlib's level 9. + +libdeflate has a simple API that is not zlib-compatible. You can create +compressors and decompressors, and use them to compress or decompress buffers. +There is not yet any support for streaming. See libdeflate.h for details. + +libdeflate is public domain; the author claims no copyright on it. diff --git a/examples/benchmark.c b/examples/benchmark.c new file mode 100644 index 0000000..edc0482 --- /dev/null +++ b/examples/benchmark.c @@ -0,0 +1,210 @@ +/* + * benchmark.c - A compression testing and benchmark program. + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. 
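+ *
+ * Usage: benchmark FILE [CHUNK_SIZE [LEVEL]]
+ *
+ * The program reads FILE in chunks of CHUNK_SIZE bytes (default 32768),
+ * compresses each chunk with deflate_compress() at the given LEVEL (default
+ * 6), decompresses the result with deflate_decompress(), verifies that the
+ * round trip reproduces the original chunk, and prints the overall
+ * compression ratio and throughput.  A chunk only counts as compressed if it
+ * shrinks by at least one byte; otherwise its original size is charged.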
+ */ + +#include + +#ifdef __WIN32__ +# include +#else +# define _FILE_OFFSET_BITS 64 +# define O_BINARY 0 +# define _POSIX_C_SOURCE 199309L +# include +#endif + +#include +#include +#include +#include +#include +#include +#include + +static uint64_t +current_time(void) +{ +#ifdef __WIN32__ +# define TIME_UNIT_PER_MS 10000 + LARGE_INTEGER time; + QueryPerformanceCounter(&time); + return time.QuadPart; +#else +# define TIME_UNIT_PER_MS 1000000 + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (1000000000ULL * ts.tv_sec) + ts.tv_nsec; +#endif +} + +static int +do_benchmark(int fd, char *ubuf1, char *ubuf2, + char *cbuf, uint32_t max_chunk_size, + struct deflate_compressor *compressor, + struct deflate_decompressor *decompressor) +{ + uint64_t usize_total = 0; + uint64_t csize_total = 0; + uint64_t compress_time_total = 0; + uint64_t decompress_time_total = 0; + + for (;;) { + char *p = ubuf1; + ssize_t bytes_read; + size_t usize; + size_t csize; + bool ok; + uint64_t start_time; + + /* Read the next chunk of data. */ + do { + bytes_read = read(fd, p, ubuf1 + max_chunk_size - p); + if (bytes_read < 0) { + fprintf(stderr, "ERROR: Read error: %s\n", + strerror(errno)); + return 1; + } + p += bytes_read; + } while (bytes_read != 0 && p != ubuf1 + max_chunk_size); + + usize = p - ubuf1; + + if (usize == 0) /* End of file? */ + break; + + /* Compress the chunk of data. */ + usize_total += usize; + start_time = current_time(); + csize = deflate_compress(compressor, ubuf1, usize, + cbuf, usize - 1); + compress_time_total += current_time() - start_time; + + if (csize) { + /* Successfully compressed the chunk of data. */ + csize_total += csize; + + /* Decompress the data we just compressed and compare + * the result with the original. */ + start_time = current_time(); + ok = deflate_decompress(decompressor, cbuf, csize, + ubuf2, usize); + decompress_time_total += current_time() - start_time; + if (!ok) { + fprintf(stderr, "ERROR: Failed to " + "decompress data\n"); + return 1; + } + + if (memcmp(ubuf1, ubuf2, usize)) { + fprintf(stderr, "ERROR: Data did not " + "decompress to original\n"); + return 1; + } + } else { + /* Chunk of data did not compress to less than its + * original size. 
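+			 * Charge its original size to the compressed total,
+			 * since an application would have to store such a
+			 * chunk uncompressed.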
*/ + csize_total += usize; + } + } + + + if (usize_total == 0) { + printf("\tEmpty input.\n"); + return 0; + } + + if (compress_time_total == 0) + compress_time_total++; + if (decompress_time_total == 0) + decompress_time_total++; + + printf("\tCompressed %"PRIu64 " => %"PRIu64" bytes (%u.%u%%)\n", + usize_total, csize_total, + (unsigned int)(csize_total * 100 / usize_total), + (unsigned int)(csize_total * 100000 / usize_total % 1000)); + printf("\tCompression time: %"PRIu64" ms (%"PRIu64" MB/s)\n", + compress_time_total / TIME_UNIT_PER_MS, + 1000 * usize_total / compress_time_total); + printf("\tDecompression time: %"PRIu64" ms (%"PRIu64" MB/s)\n", + decompress_time_total / TIME_UNIT_PER_MS, + 1000 * usize_total / decompress_time_total); + return 0; +} + +int +main(int argc, char **argv) +{ + const char *filename; + uint32_t chunk_size = 32768; + unsigned int compression_level = 6; + char *ubuf1 = NULL; + char *ubuf2 = NULL; + char *cbuf = NULL; + struct deflate_compressor *compressor = NULL; + struct deflate_decompressor *decompressor = NULL; + int fd = -1; + int ret; + + if (argc < 2 || argc > 5) { + fprintf(stderr, "Usage: %s FILE [CHUNK_SIZE [LEVEL]]]\n", argv[0]); + ret = 2; + goto out; + } + + filename = argv[1]; + + if (argc >= 3) + chunk_size = strtoul(argv[2], NULL, 10); + + if (argc >= 4) + compression_level = strtoul(argv[3], NULL, 10); + + printf("DEFLATE compression with %"PRIu32" byte chunks (level %u)\n", + chunk_size, compression_level); + + compressor = deflate_alloc_compressor(compression_level); + if (!compressor) { + fprintf(stderr, "ERROR: Failed to create compressor\n"); + ret = 1; + goto out; + } + + decompressor = deflate_alloc_decompressor(); + if (!decompressor) { + fprintf(stderr, "ERROR: Failed to create decompressor\n"); + ret = 1; + goto out; + } + + ubuf1 = malloc(chunk_size); + ubuf2 = malloc(chunk_size); + cbuf = malloc(chunk_size - 1); + + if (!ubuf1 || !ubuf2 || !cbuf) { + fprintf(stderr, "ERROR: Insufficient memory\n"); + ret = 1; + goto out; + } + + fd = open(filename, O_RDONLY | O_BINARY); + if (fd < 0) { + fprintf(stderr, "ERROR: Can't open \"%s\" for reading: %s\n", + filename, strerror(errno)); + ret = 1; + goto out; + } + + ret = do_benchmark(fd, ubuf1, ubuf2, cbuf, chunk_size, + compressor, decompressor); +out: + close(fd); + free(cbuf); + free(ubuf2); + free(ubuf1); + deflate_free_decompressor(decompressor); + deflate_free_compressor(compressor); + return ret; +} diff --git a/libdeflate.h b/libdeflate.h new file mode 100644 index 0000000..1d42ed4 --- /dev/null +++ b/libdeflate.h @@ -0,0 +1,131 @@ +/* + * libdeflate.h + * + * Public header for the DEFLATE compression library. + */ + +#ifndef LIBDEFLATE_H +#define LIBDEFLATE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* ========================================================================== */ +/* Compression */ +/* ========================================================================== */ + +struct deflate_compressor; + +/* + * deflate_alloc_compressor() allocates a new DEFLATE compressor. + * 'compression_level' is the compression level on a zlib-like scale (1 = + * fastest, 6 = medium/default, 9 = slowest). The return value is a pointer to + * the new DEFLATE compressor, or NULL if out of memory. + * + * Note: the sliding window size is defined at compilation time (default 32768). + */ +extern struct deflate_compressor * +deflate_alloc_compressor(unsigned int compression_level); + +/* + * deflate_compress() performs DEFLATE compression on a buffer of data. 
The + * function attempts to compress 'in_nbytes' bytes of data located at 'in' and + * write the results to 'out', which has space for 'out_nbytes_avail' bytes. + * The return value is the compressed size in bytes, or 0 if the data could not + * be compressed to 'out_nbytes_avail' bytes or fewer. + */ +extern size_t +deflate_compress(struct deflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * Like deflate_compress(), but store the data in the zlib wrapper format. + */ +extern size_t +zlib_compress(struct deflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * Like deflate_compress(), but store the data in the gzip wrapper format. + */ +extern size_t +gzip_compress(struct deflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * deflate_free_compressor() frees a DEFLATE compressor that was allocated with + * deflate_alloc_compressor(). + */ +extern void +deflate_free_compressor(struct deflate_compressor *compressor); + +/* ========================================================================== */ +/* Decompression */ +/* ========================================================================== */ + +struct deflate_decompressor; + +/* + * deflate_alloc_decompressor() allocates a new DEFLATE decompressor. The + * return value is a pointer to the new DEFLATE decompressor, or NULL if out of + * memory. + * + * This function takes no parameters, and the returned decompressor is valid for + * decompressing data that was compressed at any compression level and with any + * sliding window size. + */ +extern struct deflate_decompressor * +deflate_alloc_decompressor(void); + +/* + * deflate_decompress() decompresses 'in_nbytes' bytes of DEFLATE-compressed + * data at 'in' and writes the uncompressed data, which had original size + * 'out_nbytes', to 'out'. The return value is true if decompression was + * successful, or false if the compressed data was invalid. + * + * To be clear: the uncompressed size must be known *exactly* and passed as + * 'out_nbytes'. + */ +extern bool +deflate_decompress(struct deflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes); + +/* + * Like deflate_decompress(), but assumes the zlib wrapper format instead of raw + * DEFLATE. + */ +extern bool +zlib_decompress(struct deflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes); + +/* + * Like deflate_decompress(), but assumes the gzip wrapper format instead of raw + * DEFLATE. + */ +extern bool +gzip_decompress(struct deflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes); + +/* + * deflate_free_decompressor() frees a DEFLATE decompressor that was allocated + * with deflate_alloc_decompressor(). + */ +extern void +deflate_free_decompressor(struct deflate_decompressor *decompressor); + + +#ifdef __cplusplus +} +#endif + +#endif /* LIBDEFLATE_H */ diff --git a/src/adler32.c b/src/adler32.c new file mode 100644 index 0000000..da5afc5 --- /dev/null +++ b/src/adler32.c @@ -0,0 +1,19 @@ +/* + * adler32.c + * + * Adler-32 checksum algorithm. 
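+ *
+ * As specified in RFC 1950: s1 is 1 plus the sum of all input bytes, and s2
+ * is the sum of the successive s1 values, both reduced modulo 65521 (the
+ * largest prime below 2^16).  The checksum is (s2 << 16) | s1, so the
+ * checksum of an empty buffer is 1.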
+ */ + +#include "adler32.h" + +u32 +adler32(const u8 *buffer, size_t size) +{ + u32 s1 = 1; + u32 s2 = 0; + for (size_t i = 0; i < size; i++) { + s1 = (s1 + buffer[i]) % 65521; + s2 = (s2 + s1) % 65521; + } + return (s2 << 16) | s1; +} diff --git a/src/adler32.h b/src/adler32.h new file mode 100644 index 0000000..78c2d02 --- /dev/null +++ b/src/adler32.h @@ -0,0 +1,12 @@ +/* + * adler32.h + * + * Adler-32 checksum algorithm. + */ + +#pragma once + +#include "types.h" + +extern u32 +adler32(const u8 *buffer, size_t size); diff --git a/src/bitops.h b/src/bitops.h new file mode 100644 index 0000000..1e6f68c --- /dev/null +++ b/src/bitops.h @@ -0,0 +1,80 @@ +/* + * bitops.h + * + * Inline functions for bit manipulation. + */ + +#pragma once + +#include "compiler.h" +#include "types.h" + +/* Find Last Set bit */ + +static inline unsigned fls32(u32 v) +{ +#ifdef compiler_fls32 + return compiler_fls32(v); +#else + unsigned bit = 0; + while ((v >>= 1) != 0) + bit++; + return bit; +#endif +} + +static inline unsigned fls64(u64 v) +{ +#ifdef compiler_fls64 + return compiler_fls64(v); +#else + unsigned bit = 0; + while ((v >>= 1) != 0) + bit++; + return bit; +#endif +} + +static inline unsigned flsw(machine_word_t v) +{ + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + if (WORDSIZE == 4) + return fls32(v); + else + return fls64(v); +} + +/* Find First Set bit */ + +static inline unsigned ffs32(u32 v) +{ +#ifdef compiler_ffs32 + return compiler_ffs32(v); +#else + unsigned bit; + for (bit = 0; !(v & 1); bit++, v >>= 1) + ; + return bit; +#endif +} + +static inline unsigned ffs64(u64 v) +{ +#ifdef compiler_ffs64 + return compiler_ffs64(v); +#else + unsigned bit; + for (bit = 0; !(v & 1); bit++, v >>= 1) + ; + return bit; +#endif +} + +static inline unsigned ffsw(machine_word_t v) +{ + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + if (WORDSIZE == 4) + return ffs32(v); + else + return ffs64(v); +} diff --git a/src/bt_matchfinder.h b/src/bt_matchfinder.h new file mode 100644 index 0000000..b827044 --- /dev/null +++ b/src/bt_matchfinder.h @@ -0,0 +1,279 @@ +/* + * bt_matchfinder.h + * + * This is a Binary Tree (bt) based matchfinder. + * + * The data structure is a hash table where each hash bucket contains a binary + * tree of sequences, referenced by position. The sequences in the binary tree + * are ordered such that a left child is lexicographically lesser than its + * parent, and a right child is lexicographically greater than its parent. + * + * For each sequence (position) in the input, the first 3 bytes are hashed and + * the the appropriate binary tree is re-rooted at that sequence (position). + * Since the sequences are inserted in order, each binary tree maintains the + * invariant that each child node has greater match offset than its parent. + * + * While inserting a sequence, we may search the binary tree for matches with + * that sequence. At each step, the length of the match is computed. The + * search ends when the sequences get too far away (outside of the sliding + * window), or when the binary tree ends (in the code this is the same check as + * "too far away"), or when 'max_search_depth' positions have been searched, or + * when a match of at least 'nice_len' bytes has been found. + * + * Notes: + * + * - Typically, we need to search more nodes to find a given match in a + * binary tree versus in a linked list. However, a binary tree has more + * overhead than a linked list: it needs to be kept sorted, and the inner + * search loop is more complicated. 
As a result, binary trees are best + * suited for compression modes where the potential matches are searched + * more thoroughly. + * + * - Since no attempt is made to keep the binary trees balanced, it's + * essential to have the 'max_search_depth' cutoff. Otherwise it could + * take quadratic time to run data through the matchfinder. + */ + +#pragma once + +#include "lz_extend.h" +#include "lz_hash3.h" +#include "matchfinder_common.h" + +#ifndef BT_MATCHFINDER_HASH_ORDER +# if MATCHFINDER_WINDOW_ORDER < 14 +# define BT_MATCHFINDER_HASH_ORDER 14 +# else +# define BT_MATCHFINDER_HASH_ORDER 15 +# endif +#endif + +#define BT_MATCHFINDER_HASH_LENGTH (1UL << BT_MATCHFINDER_HASH_ORDER) + +#define BT_MATCHFINDER_TOTAL_LENGTH \ + (BT_MATCHFINDER_HASH_LENGTH + (2UL * MATCHFINDER_WINDOW_SIZE)) + +struct bt_matchfinder { + union { + pos_t mf_data[BT_MATCHFINDER_TOTAL_LENGTH]; + struct { + pos_t hash_tab[BT_MATCHFINDER_HASH_LENGTH]; + pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE]; + }; + }; +} _aligned_attribute(MATCHFINDER_ALIGNMENT); + +static inline void +bt_matchfinder_init(struct bt_matchfinder *mf) +{ + matchfinder_init(mf->hash_tab, BT_MATCHFINDER_HASH_LENGTH); +} + +#if MATCHFINDER_IS_SLIDING +static inline void +bt_matchfinder_slide_window(struct bt_matchfinder *mf) +{ + matchfinder_rebase(mf->mf_data, BT_MATCHFINDER_TOTAL_LENGTH); +} +#endif + +/* + * Find matches with the current sequence. + * + * @mf + * The matchfinder structure. + * @in_base + * Pointer to the next byte in the input buffer to process _at the last + * time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_. + * @in_next + * Pointer to the next byte in the input buffer to process. This is the + * pointer to the bytes being matched against. + * @max_len + * Maximum match length to return. + * @nice_len + * Stop searching if a match of at least this length is found. + * @max_search_depth + * Limit on the number of potential matches to consider. + * @prev_hash + * TODO + * @matches + * Space to write the matches that are found. + * + * Returns the number of matches found, which may be anywhere from 0 to + * (nice_len - 3 + 1), inclusively. The matches are written to @matches in + * order of strictly increasing length and strictly increasing offset. The + * minimum match length is assumed to be 3. 
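+ *
+ * Note on @prev_hash (as implemented below): on entry it must contain the
+ * hash of the 3 bytes at @in_next, computed by the previous call; this
+ * function replaces it with the hash of the 3 bytes at @in_next + 1 and
+ * prefetches the corresponding hash bucket, so that the bucket is likely to
+ * be in cache by the time the next call reads it.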
+ */ +static inline unsigned +bt_matchfinder_get_matches(struct bt_matchfinder * const restrict mf, + const u8 * const in_base, + const u8 * const in_next, + const unsigned max_len, + const unsigned nice_len, + const unsigned max_search_depth, + unsigned long *prev_hash, + struct lz_match * const restrict matches) +{ + struct lz_match *lz_matchptr = matches; + unsigned depth_remaining = max_search_depth; + unsigned hash; + pos_t cur_match; + const u8 *matchptr; + unsigned best_len; + pos_t *pending_lt_ptr, *pending_gt_ptr; + unsigned best_lt_len, best_gt_len; + unsigned len; + pos_t *children; + + if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES + 1)) + return 0; + + hash = *prev_hash; + *prev_hash = lz_hash3(in_next + 1, BT_MATCHFINDER_HASH_ORDER); + prefetch(&mf->hash_tab[*prev_hash]); + cur_match = mf->hash_tab[hash]; + mf->hash_tab[hash] = in_next - in_base; + + best_len = 2; + pending_lt_ptr = &mf->child_tab[(in_next - in_base) << 1]; + pending_gt_ptr = &mf->child_tab[((in_next - in_base) << 1) + 1]; + best_lt_len = 0; + best_gt_len = 0; + for (;;) { + if (!matchfinder_match_in_window(cur_match, + in_base, in_next) || + !depth_remaining--) + { + *pending_lt_ptr = MATCHFINDER_INITVAL; + *pending_gt_ptr = MATCHFINDER_INITVAL; + return lz_matchptr - matches; + } + + matchptr = &in_base[cur_match]; + len = min(best_lt_len, best_gt_len); + + children = &mf->child_tab[(unsigned long) + matchfinder_slot_for_match(cur_match) << 1]; + + if (matchptr[len] == in_next[len]) { + + len = lz_extend(in_next, matchptr, len + 1, max_len); + + if (len > best_len) { + best_len = len; + + lz_matchptr->length = len; + lz_matchptr->offset = in_next - matchptr; + lz_matchptr++; + + if (len >= nice_len) { + *pending_lt_ptr = children[0]; + *pending_gt_ptr = children[1]; + return lz_matchptr - matches; + } + } + } + + if (matchptr[len] < in_next[len]) { + *pending_lt_ptr = cur_match; + pending_lt_ptr = &children[1]; + cur_match = *pending_lt_ptr; + best_lt_len = len; + } else { + *pending_gt_ptr = cur_match; + pending_gt_ptr = &children[0]; + cur_match = *pending_gt_ptr; + best_gt_len = len; + } + } +} + +/* + * Advance the match-finder, but don't search for matches. + * + * @mf + * The matchfinder structure. + * @in_base + * Pointer to the next byte in the input buffer to process _at the last + * time bc_matchfinder_init() or bc_matchfinder_slide_window() was called_. + * @in_next + * Pointer to the next byte in the input buffer to process. + * @in_end + * Pointer to the end of the input buffer. + * @nice_len + * Stop searching if a match of at least this length is found. + * @max_search_depth + * Limit on the number of potential matches to consider. 
+ * @prev_hash + * TODO + */ +static inline void +bt_matchfinder_skip_position(struct bt_matchfinder * const restrict mf, + const u8 * const in_base, + const u8 * const in_next, + const u8 * const in_end, + const unsigned nice_len, + const unsigned max_search_depth, + unsigned long *prev_hash) +{ + unsigned depth_remaining = max_search_depth; + unsigned hash; + pos_t cur_match; + const u8 *matchptr; + pos_t *pending_lt_ptr, *pending_gt_ptr; + unsigned best_lt_len, best_gt_len; + unsigned len; + pos_t *children; + + if (unlikely(in_end - in_next < LZ_HASH_REQUIRED_NBYTES + 1)) + return; + + hash = *prev_hash; + *prev_hash = lz_hash3(in_next + 1, BT_MATCHFINDER_HASH_ORDER); + prefetch(&mf->hash_tab[*prev_hash]); + cur_match = mf->hash_tab[hash]; + mf->hash_tab[hash] = in_next - in_base; + + depth_remaining = max_search_depth; + pending_lt_ptr = &mf->child_tab[(in_next - in_base) << 1]; + pending_gt_ptr = &mf->child_tab[((in_next - in_base) << 1) + 1]; + best_lt_len = 0; + best_gt_len = 0; + for (;;) { + if (!matchfinder_match_in_window(cur_match, + in_base, in_next) || + !depth_remaining--) + { + *pending_lt_ptr = MATCHFINDER_INITVAL; + *pending_gt_ptr = MATCHFINDER_INITVAL; + return; + } + + matchptr = &in_base[cur_match]; + len = min(best_lt_len, best_gt_len); + + children = &mf->child_tab[(unsigned long) + matchfinder_slot_for_match(cur_match) << 1]; + + if (matchptr[len] == in_next[len]) { + len = lz_extend(in_next, matchptr, len + 1, nice_len); + if (len == nice_len) { + *pending_lt_ptr = children[0]; + *pending_gt_ptr = children[1]; + return; + } + } + + if (matchptr[len] < in_next[len]) { + *pending_lt_ptr = cur_match; + pending_lt_ptr = &children[1]; + cur_match = *pending_lt_ptr; + best_lt_len = len; + } else { + *pending_gt_ptr = cur_match; + pending_gt_ptr = &children[0]; + cur_match = *pending_gt_ptr; + best_gt_len = len; + } + } +} diff --git a/src/compiler-gcc.h b/src/compiler-gcc.h new file mode 100644 index 0000000..b9e1869 --- /dev/null +++ b/src/compiler-gcc.h @@ -0,0 +1,52 @@ +/* + * compiler-gcc.h + * + * Compiler and platform-specific definitions for the GNU C compiler. + */ + +#pragma once + +#ifdef __WIN32__ +# define LIBEXPORT __declspec(dllexport) +#else +# define LIBEXPORT __attribute__((visibility("default"))) +#endif + +#define likely(expr) __builtin_expect(!!(expr), 1) +#define unlikely(expr) __builtin_expect(!!(expr), 0) +#define prefetch(addr) __builtin_prefetch(addr) +#define inline inline __attribute__((always_inline)) +#define _aligned_attribute(n) __attribute__((aligned(n))) +#define _packed_attribute __attribute__((packed)) + +#define CPU_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + +#if defined(__x86_64__) || defined(__i386__) +# define UNALIGNED_ACCESS_SPEED 3 +#elif defined(__ARM_FEATURE_UNALIGNED) && (__ARM_FEATURE_UNALIGNED == 1) +# define UNALIGNED_ACCESS_SPEED 2 +#else +# define UNALIGNED_ACCESS_SPEED 0 +#endif + +#define min(a, b) ({ __typeof__(a) _a = (a); __typeof__(b) _b = (b); \ + (_a < _b) ? _a : _b; }) + +#define max(a, b) ({ __typeof__(a) _a = (a); __typeof__(b) _b = (b); \ + (_a > _b) ? 
_a : _b; }) + +#define swap(a, b) ({ __typeof__(a) _a = a; (a) = (b); (b) = _a; }) + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) +# define compiler_bswap32 __builtin_bswap32 +# define compiler_bswap64 __builtin_bswap64 +#endif + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +# define compiler_bswap16 __builtin_bswap16 +#endif + +#define compiler_fls32(n) (31 - __builtin_clz(n)) +#define compiler_fls64(n) (63 - __builtin_clzll(n)) +#define compiler_ffs32(n) __builtin_ctz(n) +#define compiler_ffs64(n) __builtin_ctzll(n) diff --git a/src/compiler.h b/src/compiler.h new file mode 100644 index 0000000..95d2bc3 --- /dev/null +++ b/src/compiler.h @@ -0,0 +1,60 @@ +/* + * compiler.h + * + * Compiler and platform-specific definitions. + */ + +#pragma once + +#ifdef __GNUC__ +# include "compiler-gcc.h" +#else +# warning "Unrecognized compiler. Please add a header file for your compiler." +#endif + +#ifndef LIBEXPORT +# define LIBEXPORT +#endif + +#ifndef BUILD_BUG_ON +# define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#endif + +#ifndef likely +# define likely(expr) (expr) +#endif + +#ifndef unlikely +# define unlikely(expr) (expr) +#endif + +#ifndef prefetch +# define prefetch(addr) +#endif + +#ifndef _aligned_attribute +# error "missing required definition of _aligned_attribute" +#endif + +#ifndef _packed_attribute +# error "missing required definition of _packed_attribute" +#endif + +#ifndef CPU_IS_BIG_ENDIAN +# error "missing required endianness definition" +#endif + +#define CPU_IS_LITTLE_ENDIAN (!CPU_IS_BIG_ENDIAN) + +#ifndef UNALIGNED_ACCESS_SPEED +# warning "assuming unaligned accesses are not allowed" +# define UNALIGNED_ACCESS_SPEED 0 +#endif + +#define UNALIGNED_ACCESS_IS_ALLOWED (UNALIGNED_ACCESS_SPEED >= 1) +#define UNALIGNED_ACCESS_IS_FAST (UNALIGNED_ACCESS_SPEED >= 2) +#define UNALIGNED_ACCESS_IS_VERY_FAST (UNALIGNED_ACCESS_SPEED >= 3) + +#if !defined(min) || !defined(max) || !defined(swap) +# error "missing required definitions of min(), max(), and swap() macros" +#endif diff --git a/src/crc32.c b/src/crc32.c new file mode 100644 index 0000000..94dc83c --- /dev/null +++ b/src/crc32.c @@ -0,0 +1,73 @@ +/* + * crc32.c + * + * CRC-32 checksum algorithm. 
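+ *
+ * This is the standard CRC-32 used by gzip: the IEEE polynomial in reflected
+ * form (0xEDB88320), computed one byte at a time via a 256-entry lookup
+ * table, with the CRC register initialized to all ones and inverted again at
+ * the end.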
+ */ + +#include "crc32.h" + +static const u32 crc_table[256] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, + 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, + 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, + 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, + 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, + 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, + 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, + 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, + 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, + 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, + 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, + 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, + 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, + 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, + 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, + 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, + 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, + 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, + 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, + 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, + 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, + 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, + 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c, + 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, + 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, + 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, + 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, + 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, + 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, + 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278, + 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, + 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, + 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, + 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, + 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, + 0x2d02ef8d, +}; + +u32 +crc32(const u8 *buffer, size_t size) +{ + u32 crc = ~0; + + for (size_t i = 0; i < size; i++) + crc = crc_table[(u8)crc ^ buffer[i]] ^ (crc >> 8); + + return ~crc; +} diff --git a/src/crc32.h b/src/crc32.h new file mode 100644 index 0000000..c8ee66c --- /dev/null +++ b/src/crc32.h @@ -0,0 +1,12 @@ 
+/* + * crc32.h + * + * CRC-32 checksum algorithm. + */ + +#pragma once + +#include "types.h" + +extern u32 +crc32(const u8 *buffer, size_t size); diff --git a/src/deflate_compress.c b/src/deflate_compress.c new file mode 100644 index 0000000..e566625 --- /dev/null +++ b/src/deflate_compress.c @@ -0,0 +1,2323 @@ +/* + * deflate_compress.c + */ + +#include +#include +#include + +#include "libdeflate.h" + +#include "deflate_compress.h" +#include "deflate_constants.h" +#include "unaligned.h" + +/* + * Note: when compiling this file, SUPPORT_NEAR_OPTIMAL_PARSING should be + * defined to either 0 or 1. When defined to 1, the near-optimal parsing + * algorithm is enabled at compression level 80 and above. The near-optimal + * parsing algorithm produces a compression ratio significantly better than the + * greedy and lazy algorithms implemented here, and also the algorithm used by + * zlib at level 9. However, it is slow. + */ +#ifndef SUPPORT_NEAR_OPTIMAL_PARSING +# define SUPPORT_NEAR_OPTIMAL_PARSING 0 +#endif + +/* + * Define to 1 to maintain the full map from match offsets to offset slots. + * This slightly speeds up translations of match offsets to offset slots, but it + * uses 32768 bytes of memory rather than the 512 bytes used by the condensed + * map. The speedup provided by the larger map is most helpful when the + * near-optimal parsing algorithm is being used. + */ +#define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING + +/* + * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters + * appropriately. + */ +#define MATCHFINDER_WINDOW_ORDER 15 +#define MATCHFINDER_IS_SLIDING 1 + +#include "hc_matchfinder.h" +#if SUPPORT_NEAR_OPTIMAL_PARSING +# include "bt_matchfinder.h" +#endif + +/* + * Number of literals+matches to output before starting new Huffman codes. + * + * This is just a heuristic, as there is no efficient algorithm for computing + * optimal block splitting in general. + * + * Note: a lower value than defined here usually results in a slightly better + * compression ratio, but creates more overhead in compression and + * decompression. + * + * This value is not used by the near-optimal parsing algorithm, which uses + * OPTIM_BLOCK_LENGTH instead. + */ +#define MAX_ITEMS_PER_BLOCK 16384 + +#if SUPPORT_NEAR_OPTIMAL_PARSING +/* Constants specific to the near-optimal parsing algorithm. */ + +/* The preferred DEFLATE block length in bytes. */ +# define OPTIM_BLOCK_LENGTH 16384 + +/* The maximum number of matches the matchfinder can find at a single position. + * Since the matchfinder never finds more than one match for the same length, + * presuming one of each possible length is sufficient for an upper bound. + * (This says nothing about whether it is worthwhile to consider so many + * matches; this is just defining the worst case.) */ +# define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) + +/* The number of array spaces to reserve for a single block's matches. This + * value should be high enough so that virtually the time, all matches found in + * OPTIM_BLOCK_LENGTH consecutive positions can fit in this array. However, + * this is *not* the true upper bound on the number of matches that can possibly + * be found. Therefore, checks for overflow are still required. 
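+ *
+ * With OPTIM_BLOCK_LENGTH == 16384 and the standard DEFLATE match length
+ * limits of 3 and 258 (so MAX_MATCHES_PER_POS == 256), this reserves
+ * 16384 * 8 + 257 = 131329 entries: an average budget of 8 cached matches
+ * per input position, plus space for one final worst-case position.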
*/ +# define CACHE_LEN ((OPTIM_BLOCK_LENGTH * 8) + (MAX_MATCHES_PER_POS + 1)) + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0])) + +/* Table: length slot => length slot base value */ +static const unsigned deflate_length_slot_base[] = { + 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , + 11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 , + 35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 , + 131 , 163 , 195 , 227 , 258 , +}; + +/* Table: length slot => number of extra length bits */ +static const u8 deflate_extra_length_bits[] = { + 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , + 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , + 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , + 5 , 5 , 5 , 5 , 0 , +}; + +/* Table: offset slot => offset slot base value */ +static const unsigned deflate_offset_slot_base[] = { + 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 , + 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 , + 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 , + 4097 , 6145 , 8193 , 12289 , 16385 , 24577 , +}; + +/* Table: offset slot => number of extra offset bits */ +static const u8 deflate_extra_offset_bits[] = { + 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 , + 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , + 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 , + 11 , 11 , 12 , 12 , 13 , 13 , +}; + +/* Codewords for the DEFLATE Huffman codes. */ +struct deflate_codewords { + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* Codeword lengths (in bits) for the DEFLATE Huffman codes. + * A zero length means the corresponding symbol had zero frequency. */ +struct deflate_lens { + union { + u8 all[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS]; + struct { + u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u8 offset[DEFLATE_NUM_OFFSET_SYMS]; + }; + }; +}; + +/* Codewords and lengths for the DEFLATE Huffman codes. */ +struct deflate_codes { + struct deflate_codewords codewords; + struct deflate_lens lens; +}; + +/* Symbol frequency counters for the DEFLATE Huffman codes. */ +struct deflate_freqs { + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* Costs for the near-optimal parsing algorithm. */ +struct deflate_costs { + + /* The cost to output each possible literal. */ + u32 literal[DEFLATE_NUM_LITERALS]; + + /* The cost to output each possible match length. */ + u32 length[DEFLATE_MAX_MATCH_LEN + 1]; + + /* The cost to output a match offset of each possible offset slot. */ + u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* + * COST_SHIFT is a scaling factor that makes it possible to consider fractional + * bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT. + * + * Note: this is only useful as a statistical trick for when the true costs are + * unknown. In reality, each token in DEFLATE requires a whole number of bits + * to output. + */ +#define COST_SHIFT 3 + +/* + * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to + * be needed to output a symbol that was unused in the previous optimization + * pass. Assigning a default cost allows the symbol to be used in the next + * optimization pass. However, the cost should be relatively high because the + * symbol probably won't be used very many times (if at all). 
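+ *
+ * (For reference, DEFLATE codewords are limited to 15 bits, so a 13-bit
+ * default treats an unused literal or length symbol as nearly as expensive
+ * as a real codeword can be.)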
+ */ +#define LITERAL_NOSTAT_BITS 13 +#define LENGTH_NOSTAT_BITS 13 +#define OFFSET_NOSTAT_BITS 10 + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/* An intermediate representation of a DEFLATE match or literal */ +struct deflate_item { + /* + * Bits 0 - 8: Literal/length symbol + * Bits 9 - 13: Extra length bits + * Bits 14 - 18: Offset symbol + * Bits 19 - 31: Extra offset bits + * + * Unfortunately, gcc generates worse code if we use real bitfields here. + */ + u32 data; +}; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* + * This structure represents a byte position in the input data and a node in the + * graph of possible match/literal choices for the current block. + * + * Logically, each incoming edge to this node is labeled with a literal or a + * match that can be taken to reach this position from an earlier position; and + * each outgoing edge from this node is labeled with a literal or a match that + * can be taken to advance from this position to a later position. + * + * But these "edges" are actually stored elsewhere (in 'cached_matches'). + * Here we associate with each node just two pieces of information: + * + * 'cost_to_end' is the minimum cost to reach the end of the block from + * this position. + * + * 'item' represents the literal or match that must be chosen from here to + * reach the end of the block with the minimum cost. Equivalently, this + * can be interpreted as the label of the outgoing edge on the minimum-cost + * path to the "end of block" node from this node. + */ +struct deflate_optimum_node { + + u32 cost_to_end; + + /* + * Notes on the match/literal representation used here: + * + * The low bits of 'item' are the length: 1 if this is a literal, + * or the match length if this is a match. + * + * The high bits of 'item' are the actual literal byte if this is a + * literal, or the match offset if this is a match. + */ +#define OPTIMUM_OFFSET_SHIFT 9 +#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) + u32 item; + +} _aligned_attribute(8); + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/* The main DEFLATE compressor structure */ +struct deflate_compressor { + + /* Pointer to the compress() implementation chosen at allocation time */ + size_t (*impl)(struct deflate_compressor *, + const u8 *, size_t, u8 *, size_t); + + /* Frequency counters for the current block */ + struct deflate_freqs freqs; + + /* Dynamic Huffman codes for the current block */ + struct deflate_codes codes; + + /* Static Huffman codes set at allocation time */ + struct deflate_codes static_codes; + + /* A table for fast lookups of length slot by length. The length slot + * for the match length 'len' is 'length_slot_fast[len]'. */ + u8 length_slot_fast[DEFLATE_MAX_MATCH_LEN + 1]; + + /* A table for fast lookups of offset slot by match offset. + * + * If the full table is being used, it is a direct mapping from offset + * to offset slot. + * + * If the condensed table is being used, the first 256 entries map + * directly to the offset slots of offsets 1 through 256. The next 256 + * entries map to the offset slots for the remaining offsets, stepping + * through the offsets with a stride of 128. This relies on the fact + * that each of the remaining offset slots contains at least 128 offsets + * and has an offset base that is a multiple of 128. 
*/ +#if USE_FULL_OFFSET_SLOT_FAST + u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1]; +#else + u8 offset_slot_fast[512]; +#endif + + /* The "nice" match length: if a match of this length is found, choose + * it immediately without further consideration. */ + unsigned nice_match_length; + + /* The maximum search depth: consider at most this many potential + * matches at each position. */ + unsigned max_search_depth; + + /* The compression level with which this compressor was created. */ + unsigned compression_level; + + union { + /* Data for greedy or lazy parsing */ + struct { + /* Hash chain matchfinder */ + struct hc_matchfinder hc_mf; + + /* The match/literal sequence for the current block */ + struct deflate_item chosen_items[MAX_ITEMS_PER_BLOCK]; + + u8 nonoptimal_end[0]; + }; + + #if SUPPORT_NEAR_OPTIMAL_PARSING + /* Data for near-optimal parsing */ + struct { + + /* Binary tree matchfinder */ + struct bt_matchfinder bt_mf; + + /* Matches found using the matchfinder are cached in + * this array so that later optimization of the block + * has the matches easily available. The cached matches + * are cleared when a new block is started. */ + struct lz_match cached_matches[CACHE_LEN]; + + /* Array of structures, one per position, for running + * the minimum-cost path algorithm. */ + struct deflate_optimum_node optimum[OPTIM_BLOCK_LENGTH + + 1 + DEFLATE_MAX_MATCH_LEN]; + + /* The current cost model being used. */ + struct deflate_costs costs; + + unsigned num_optim_passes; + + u8 optimal_end[0]; + }; + #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + }; +}; + +/* + * The type for the bitbuffer variable, which temporarily holds bits that are + * being packed into bytes and written to the output buffer. For best + * performance, this should have size equal to a machine word. + */ +typedef machine_word_t bitbuf_t; +#define BITBUF_NBITS (8 * sizeof(bitbuf_t)) + +/* + * Structure to keep track of the current state of sending bits to the + * compressed output buffer. + */ +struct deflate_output_bitstream { + + /* Bits that haven't yet been written to the output buffer. */ + bitbuf_t bitbuf; + + /* Number of bits currently held in @bitbuf. */ + int bitcount; + + /* Pointer to the start of the output buffer. */ + u8 *start; + + /* Pointer to the position in the output buffer at which the next byte + * should be written. */ + u8 *next; + + /* Pointer just past the end of the output buffer. */ + u8 *end; +}; + +/* Initialize the output bitstream. */ +static void +deflate_init_output(struct deflate_output_bitstream *os, void *buffer, size_t size) +{ + os->bitbuf = 0; + os->bitcount = 0; + os->start = buffer; + os->next = os->start; + os->end = os->start + size; +} + +/* Write some bits to the output bitstream. */ +static inline void +deflate_write_bits(struct deflate_output_bitstream *os, + const bitbuf_t bits, const unsigned num_bits) +{ + /* We only flush 'bitbuf' when it completely fills up. + * This improves performance. */ + + os->bitbuf |= bits << os->bitcount; + os->bitcount += num_bits; + if (os->bitcount >= BITBUF_NBITS) { + if (os->end - os->next >= sizeof(bitbuf_t)) { + put_unaligned_word_le(os->bitbuf, os->next); + os->next += sizeof(bitbuf_t); + } else { + os->next = os->end; + } + os->bitcount -= BITBUF_NBITS; + os->bitbuf = bits >> (num_bits - os->bitcount); + } +} + +/* + * Flush any remaining bits to the output buffer if needed. Return the total + * number of bytes written to the output buffer, or 0 if an overflow occurred. 
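+ *
+ * Note that deflate_write_bits() never writes past the end of the buffer: on
+ * overflow it just sets 'next' to 'end', which is what this function detects
+ * and reports as a 0 return.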
+ */ +static u32 +deflate_flush_output(struct deflate_output_bitstream *os) +{ + while (os->bitcount > 0) { + if (os->next != os->end) + *os->next++ = os->bitbuf; + os->bitcount -= 8; + os->bitbuf >>= 8; + } + + if (os->next == os->end) /* overflow? */ + return 0; + + return os->next - os->start; +} + +/* Given the binary tree node A[subtree_idx] whose children already + * satisfy the maxheap property, swap the node with its greater child + * until it is greater than both its children, so that the maxheap + * property is satisfied in the subtree rooted at A[subtree_idx]. */ +static void +heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) +{ + unsigned parent_idx; + unsigned child_idx; + u32 v; + + v = A[subtree_idx]; + parent_idx = subtree_idx; + while ((child_idx = parent_idx * 2) <= length) { + if (child_idx < length && A[child_idx + 1] > A[child_idx]) + child_idx++; + if (v >= A[child_idx]) + break; + A[parent_idx] = A[child_idx]; + parent_idx = child_idx; + } + A[parent_idx] = v; +} + +/* Rearrange the array 'A' so that it satisfies the maxheap property. + * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. + */ +static void +heapify_array(u32 A[], unsigned length) +{ + for (unsigned subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) + heapify_subtree(A, length, subtree_idx); +} + +/* Sort the array 'A', which contains 'length' unsigned 32-bit integers. */ +static void +heapsort(u32 A[], unsigned length) +{ + A--; /* Use 1-based indices */ + + heapify_array(A, length); + + while (length >= 2) { + swap(A[1], A[length]); + length--; + heapify_subtree(A, length, 1); + } +} + +#define NUM_SYMBOL_BITS 10 +#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) + +/* + * Sort the symbols primarily by frequency and secondarily by symbol + * value. Discard symbols with zero frequency and fill in an array with + * the remaining symbols, along with their frequencies. The low + * NUM_SYMBOL_BITS bits of each array entry will contain the symbol + * value, and the remaining bits will contain the frequency. + * + * @num_syms + * Number of symbols in the alphabet. + * Can't be greater than (1 << NUM_SYMBOL_BITS). + * + * @freqs[num_syms] + * The frequency of each symbol. + * + * @lens[num_syms] + * An array that eventually will hold the length of each codeword. + * This function only fills in the codeword lengths for symbols that + * have zero frequency, which are not well defined per se but will + * be set to 0. + * + * @symout[num_syms] + * The output array, described above. + * + * Returns the number of entries in 'symout' that were filled. This is + * the number of symbols that have nonzero frequency. + */ +static unsigned +sort_symbols(unsigned num_syms, const u32 freqs[restrict], + u8 lens[restrict], u32 symout[restrict]) +{ + unsigned num_used_syms; + unsigned num_counters; + + /* We rely on heapsort, but with an added optimization. Since + * it's common for most symbol frequencies to be low, we first do + * a count sort using a limited number of counters. High + * frequencies will be counted in the last counter, and only they + * will be sorted with heapsort. + * + * Note: with more symbols, it is generally beneficial to have more + * counters. About 1 counter per 4 symbols seems fast. + * + * Note: I also tested radix sort, but even for large symbol + * counts (> 255) and frequencies bounded at 16 bits (enabling + * radix sort by just two base-256 digits), it didn't seem any + * faster than the method implemented here. 
+ * + * Note: I tested the optimized quicksort implementation from + * glibc (with indirection overhead removed), but it was only + * marginally faster than the simple heapsort implemented here. + * + * Tests were done with building the codes for LZX. Results may + * vary for different compression algorithms...! */ + + num_counters = ((num_syms + 3 / 4) + 3) & ~3; + + unsigned counters[num_counters]; + + memset(counters, 0, sizeof(counters)); + + /* Count the frequencies. */ + for (unsigned sym = 0; sym < num_syms; sym++) + counters[min(freqs[sym], num_counters - 1)]++; + + /* Make the counters cumulative, ignoring the zero-th, which + * counted symbols with zero frequency. As a side effect, this + * calculates the number of symbols with nonzero frequency. */ + num_used_syms = 0; + for (unsigned i = 1; i < num_counters; i++) { + unsigned count = counters[i]; + counters[i] = num_used_syms; + num_used_syms += count; + } + + /* Sort nonzero-frequency symbols using the counters. At the + * same time, set the codeword lengths of zero-frequency symbols + * to 0. */ + for (unsigned sym = 0; sym < num_syms; sym++) { + u32 freq = freqs[sym]; + if (freq != 0) { + symout[counters[min(freq, num_counters - 1)]++] = + sym | (freq << NUM_SYMBOL_BITS); + } else { + lens[sym] = 0; + } + } + + /* Sort the symbols counted in the last counter. */ + heapsort(symout + counters[num_counters - 2], + counters[num_counters - 1] - counters[num_counters - 2]); + + return num_used_syms; +} + +/* + * Build the Huffman tree. + * + * This is an optimized implementation that + * (a) takes advantage of the frequencies being already sorted; + * (b) only generates non-leaf nodes, since the non-leaf nodes of a + * Huffman tree are sufficient to generate a canonical code; + * (c) Only stores parent pointers, not child pointers; + * (d) Produces the nodes in the same memory used for input + * frequency information. + * + * Array 'A', which contains 'sym_count' entries, is used for both input + * and output. For this function, 'sym_count' must be at least 2. + * + * For input, the array must contain the frequencies of the symbols, + * sorted in increasing order. Specifically, each entry must contain a + * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low + * NUM_SYMBOL_BITS bits of the entries will be ignored by this function. + * Although these bits will, in fact, contain the symbols that correspond + * to the frequencies, this function is concerned with frequencies only + * and keeps the symbols as-is. + * + * For output, this function will produce the non-leaf nodes of the + * Huffman tree. These nodes will be stored in the first (sym_count - 1) + * entries of the array. Entry A[sym_count - 2] will represent the root + * node. Each other node will contain the zero-based index of its parent + * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low + * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again, + * note that although these low bits will, in fact, contain a symbol + * value, this symbol will have *no relationship* with the Huffman tree + * node that happens to occupy the same slot. This is because this + * implementation only generates the non-leaf nodes of the tree. + */ +static void +build_tree(u32 A[], unsigned sym_count) +{ + /* Index, in 'A', of next lowest frequency symbol that has not + * yet been processed. */ + unsigned i = 0; + + /* Index, in 'A', of next lowest frequency parentless non-leaf + * node; or, if equal to 'e', then no such node exists yet. 
*/ + unsigned b = 0; + + /* Index, in 'A', of next node to allocate as a non-leaf. */ + unsigned e = 0; + + do { + unsigned m, n; + u32 freq_shifted; + + /* Choose the two next lowest frequency entries. */ + + if (i != sym_count && + (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) + m = i++; + else + m = b++; + + if (i != sym_count && + (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) + n = i++; + else + n = b++; + + /* Allocate a non-leaf node and link the entries to it. + * + * If we link an entry that we're visiting for the first + * time (via index 'i'), then we're actually linking a + * leaf node and it will have no effect, since the leaf + * will be overwritten with a non-leaf when index 'e' + * catches up to it. But it's not any slower to + * unconditionally set the parent index. + * + * We also compute the frequency of the non-leaf node as + * the sum of its two children's frequencies. */ + + freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK); + + A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); + A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); + A[e] = (A[e] & SYMBOL_MASK) | freq_shifted; + e++; + } while (sym_count - e > 1); + /* When just one entry remains, it is a "leaf" that was + * linked to some other node. We ignore it, since the + * rest of the array contains the non-leaves which we + * need. (Note that we're assuming the cases with 0 or 1 + * symbols were handled separately.) */ +} + +/* + * Given the stripped-down Huffman tree constructed by build_tree(), + * determine the number of codewords that should be assigned each + * possible length, taking into account the length-limited constraint. + * + * @A + * The array produced by build_tree(), containing parent index + * information for the non-leaf nodes of the Huffman tree. Each + * entry in this array is a node; a node's parent always has a + * greater index than that node itself. This function will + * overwrite the parent index information in this array, so + * essentially it will destroy the tree. However, the data in the + * low NUM_SYMBOL_BITS of each entry will be preserved. + * + * @root_idx + * The 0-based index of the root node in 'A', and consequently one + * less than the number of tree node entries in 'A'. (Or, really 2 + * less than the actual length of 'A'.) + * + * @len_counts + * An array of length ('max_codeword_len' + 1) in which the number of + * codewords having each length <= max_codeword_len will be + * returned. + * + * @max_codeword_len + * The maximum permissible codeword length. + */ +static void +compute_length_counts(u32 A[restrict], unsigned root_idx, + unsigned len_counts[restrict], unsigned max_codeword_len) +{ + /* The key observations are: + * + * (1) We can traverse the non-leaf nodes of the tree, always + * visiting a parent before its children, by simply iterating + * through the array in reverse order. Consequently, we can + * compute the depth of each node in one pass, overwriting the + * parent indices with depths. + * + * (2) We can initially assume that in the real Huffman tree, + * both children of the root are leaves. This corresponds to two + * codewords of length 1. Then, whenever we visit a (non-leaf) + * node during the traversal, we modify this assumption to + * account for the current node *not* being a leaf, but rather + * its two children being leaves. This causes the loss of one + * codeword for the current depth and the addition of two + * codewords for the current depth plus one. 
+ * + * (3) We can handle the length-limited constraint fairly easily + * by simply using the largest length available when a depth + * exceeds max_codeword_len. + */ + + for (unsigned len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + len_counts[1] = 2; + + /* Set the root node's depth to 0. */ + A[root_idx] &= SYMBOL_MASK; + + for (int node = root_idx - 1; node >= 0; node--) { + + /* Calculate the depth of this node. */ + + unsigned parent = A[node] >> NUM_SYMBOL_BITS; + unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; + unsigned depth = parent_depth + 1; + unsigned len = depth; + + /* Set the depth of this node so that it is available + * when its children (if any) are processed. */ + + A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); + + /* If needed, decrease the length to meet the + * length-limited constraint. This is not the optimal + * method for generating length-limited Huffman codes! + * But it should be good enough. */ + if (len >= max_codeword_len) { + len = max_codeword_len; + do { + len--; + } while (len_counts[len] == 0); + } + + /* Account for the fact that we have a non-leaf node at + * the current depth. */ + len_counts[len]--; + len_counts[len + 1] += 2; + } +} + +/* + * Generate the codewords for a canonical Huffman code. + * + * @A + * The output array for codewords. In addition, initially this + * array must contain the symbols, sorted primarily by frequency and + * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of + * each entry. + * + * @len + * Output array for codeword lengths. + * + * @len_counts + * An array that provides the number of codewords that will have + * each possible length <= max_codeword_len. + * + * @max_codeword_len + * Maximum length, in bits, of each codeword. + * + * @num_syms + * Number of symbols in the alphabet, including symbols with zero + * frequency. This is the length of the 'A' and 'len' arrays. + */ +static void +gen_codewords(u32 A[restrict], u8 lens[restrict], + const unsigned len_counts[restrict], + unsigned max_codeword_len, unsigned num_syms) +{ + u32 next_codewords[max_codeword_len + 1]; + + /* Given the number of codewords that will have each length, + * assign codeword lengths to symbols. We do this by assigning + * the lengths in decreasing order to the symbols sorted + * primarily by increasing frequency and secondarily by + * increasing symbol value. */ + for (unsigned i = 0, len = max_codeword_len; len >= 1; len--) { + unsigned count = len_counts[len]; + while (count--) + lens[A[i++] & SYMBOL_MASK] = len; + } + + /* Generate the codewords themselves. We initialize the + * 'next_codewords' array to provide the lexicographically first + * codeword of each length, then assign codewords in symbol + * order. This produces a canonical code. */ + next_codewords[0] = 0; + next_codewords[1] = 0; + for (unsigned len = 2; len <= max_codeword_len; len++) + next_codewords[len] = + (next_codewords[len - 1] + len_counts[len - 1]) << 1; + + for (unsigned sym = 0; sym < num_syms; sym++) + A[sym] = next_codewords[lens[sym]]++; +} + +/* + * --------------------------------------------------------------------- + * make_canonical_huffman_code() + * --------------------------------------------------------------------- + * + * Given an alphabet and the frequency of each symbol in it, construct a + * length-limited canonical Huffman code. + * + * @num_syms + * The number of symbols in the alphabet. The symbols are the + * integers in the range [0, num_syms - 1]. 
This parameter must be + * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS). + * + * @max_codeword_len + * The maximum permissible codeword length. + * + * @freqs + * An array of @num_syms entries, each of which specifies the + * frequency of the corresponding symbol. It is valid for some, + * none, or all of the frequencies to be 0. + * + * @lens + * An array of @num_syms entries in which this function will return + * the length, in bits, of the codeword assigned to each symbol. + * Symbols with 0 frequency will not have codewords per se, but + * their entries in this array will be set to 0. No lengths greater + * than @max_codeword_len will be assigned. + * + * @codewords + * An array of @num_syms entries in which this function will return + * the codeword for each symbol, right-justified and padded on the + * left with zeroes. Codewords for symbols with 0 frequency will be + * undefined. + * + * --------------------------------------------------------------------- + * + * This function builds a length-limited canonical Huffman code. + * + * A length-limited Huffman code contains no codewords longer than some + * specified length, and has exactly (with some algorithms) or + * approximately (with the algorithm used here) the minimum weighted path + * length from the root, given this constraint. + * + * A canonical Huffman code satisfies the properties that a longer + * codeword never lexicographically precedes a shorter codeword, and the + * lexicographic ordering of codewords of the same length is the same as + * the lexicographic ordering of the corresponding symbols. A canonical + * Huffman code, or more generally a canonical prefix code, can be + * reconstructed from only a list containing the codeword length of each + * symbol. + * + * The classic algorithm to generate a Huffman code creates a node for + * each symbol, then inserts these nodes into a min-heap keyed by symbol + * frequency. Then, repeatedly, the two lowest-frequency nodes are + * removed from the min-heap and added as the children of a new node + * having frequency equal to the sum of its two children, which is then + * inserted into the min-heap. When only a single node remains in the + * min-heap, it is the root of the Huffman tree. The codeword for each + * symbol is determined by the path needed to reach the corresponding + * node from the root. Descending to the left child appends a 0 bit, + * whereas descending to the right child appends a 1 bit. + * + * The classic algorithm is relatively easy to understand, but it is + * subject to a number of inefficiencies. In practice, it is fastest to + * first sort the symbols by frequency. (This itself can be subject to + * an optimization based on the fact that most frequencies tend to be + * low.) At the same time, we sort secondarily by symbol value, which + * aids the process of generating a canonical code. Then, during tree + * construction, no heap is necessary because both the leaf nodes and the + * unparented non-leaf nodes can be easily maintained in sorted order. + * Consequently, there can never be more than two possibilities for the + * next-lowest-frequency node. + * + * In addition, because we're generating a canonical code, we actually + * don't need the leaf nodes of the tree at all, only the non-leaf nodes. + * This is because for canonical code generation we don't need to know + * where the symbols are in the tree. Rather, we only need to know how + * many leaf nodes have each depth (codeword length). 
And this + * information can, in fact, be quickly generated from the tree of + * non-leaves only. + * + * Furthermore, we can build this stripped-down Huffman tree directly in + * the array in which the codewords are to be generated, provided that + * these array slots are large enough to hold a symbol and frequency + * value. + * + * Still furthermore, we don't even need to maintain explicit child + * pointers. We only need the parent pointers, and even those can be + * overwritten in-place with depth information as part of the process of + * extracting codeword lengths from the tree. So in summary, we do NOT + * need a big structure like: + * + * struct huffman_tree_node { + * unsigned int symbol; + * unsigned int frequency; + * unsigned int depth; + * struct huffman_tree_node *left_child; + * struct huffman_tree_node *right_child; + * }; + * + * + * ... which often gets used in "naive" implementations of Huffman code + * generation. + * + * Many of these optimizations are based on the implementation in 7-Zip + * (source file: C/HuffEnc.c), which has been placed in the public domain + * by Igor Pavlov. + */ +static void +make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const u32 freqs[restrict], + u8 lens[restrict], u32 codewords[restrict]) +{ + u32 *A = codewords; + unsigned num_used_syms; + + /* Assumptions */ + assert(num_syms >= 2); + assert(num_syms <= (1 << NUM_SYMBOL_BITS)); + assert((1ULL << max_codeword_len) >= num_syms); + assert(max_codeword_len <= 32); + + /* We begin by sorting the symbols primarily by frequency and + * secondarily by symbol value. As an optimization, the array + * used for this purpose ('A') shares storage with the space in + * which we will eventually return the codewords. */ + + num_used_syms = sort_symbols(num_syms, freqs, lens, A); + + /* 'num_used_syms' is the number of symbols with nonzero + * frequency. This may be less than @num_syms. 'num_used_syms' + * is also the number of entries in 'A' that are valid. Each + * entry consists of a distinct symbol and a nonzero frequency + * packed into a 32-bit integer. */ + + /* Handle special cases where only 0 or 1 symbols were used (had + * nonzero frequency). */ + + if (unlikely(num_used_syms == 0)) { + /* Code is empty. sort_symbols() already set all lengths + * to 0, so there is nothing more to do. */ + return; + } + + if (unlikely(num_used_syms == 1)) { + /* Only one symbol was used, so we only need one + * codeword. But two codewords are needed to form the + * smallest complete Huffman code, which uses codewords 0 + * and 1. Therefore, we choose another symbol to which + * to assign a codeword. We use 0 (if the used symbol is + * not 0) or 1 (if the used symbol is 0). In either + * case, the lesser-valued symbol must be assigned + * codeword 0 so that the resulting code is canonical. */ + + unsigned sym = A[0] & SYMBOL_MASK; + unsigned nonzero_idx = sym ? sym : 1; + + codewords[0] = 0; + lens[0] = 1; + codewords[nonzero_idx] = 1; + lens[nonzero_idx] = 1; + return; + } + + /* Build a stripped-down version of the Huffman tree, sharing the + * array 'A' with the symbol values. Then extract length counts + * from the tree and use them to generate the final codewords. */ + + build_tree(A, num_used_syms); + + { + unsigned len_counts[max_codeword_len + 1]; + + compute_length_counts(A, num_used_syms - 2, + len_counts, max_codeword_len); + + gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); + } +} + +/* + * Clear the Huffman symbol frequency counters. 
+ * This must be called when starting a new DEFLATE block. + */ +static void +deflate_reset_symbol_frequencies(struct deflate_compressor *c) +{ + memset(&c->freqs, 0, sizeof(c->freqs)); +} + +/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */ +static u32 +deflate_reverse_codeword(u32 codeword, u8 len) +{ + u32 codeword_reversed = 0; + + for (int bit = (int)len - 1; bit >= 0; bit--) + codeword_reversed |= ((codeword >> bit) & 1) << (len - 1 - bit); + + return codeword_reversed; +} + +/* Make a canonical Huffman code with bit-reversed codewords. */ +static void +deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const u32 freqs[], u8 lens[], u32 codewords[]) +{ + make_canonical_huffman_code(num_syms, max_codeword_len, + freqs, lens, codewords); + + for (unsigned i = 0; i < num_syms; i++) + codewords[i] = deflate_reverse_codeword(codewords[i], lens[i]); +} + +/* + * Build the literal/length and offset Huffman codes for a DEFLATE block. + * + * This takes as input the frequency tables for each code and produces as output + * a set of tables that map symbols to codewords and codeword lengths. + */ +static void +deflate_make_huffman_codes(const struct deflate_freqs *freqs, + struct deflate_codes *codes) +{ + deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + freqs->litlen, + codes->lens.litlen, + codes->codewords.litlen); + + deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, + DEFLATE_MAX_OFFSET_CODEWORD_LEN, + freqs->offset, + codes->lens.offset, + codes->codewords.offset); +} + +/* Write the header fields common to all DEFLATE block types. */ +static void +deflate_write_block_header(struct deflate_output_bitstream *os, + bool is_final_block, unsigned block_type) +{ + deflate_write_bits(os, is_final_block, 1); + deflate_write_bits(os, block_type, 2); +} + +static unsigned +deflate_compute_precode_items(const u8 lens[restrict], + const unsigned num_lens, + u32 precode_freqs[restrict], + unsigned precode_items[restrict]) +{ + unsigned *itemptr; + unsigned run_start; + unsigned run_end; + unsigned extra_bits; + u8 len; + + itemptr = precode_items; + run_start = 0; + do { + /* Find the next run of codeword lengths. */ + + /* len = the length being repeated */ + len = lens[run_start]; + + /* Extend the run. */ + run_end = run_start; + do { + run_end++; + } while (run_end != num_lens && len == lens[run_end]); + + if (len == 0) { + /* Run of zeroes. */ + + /* Symbol 18: RLE 11 to 138 zeroes at a time. */ + while ((run_end - run_start) >= 11) { + extra_bits = min((run_end - run_start) - 11, 0x7F); + precode_freqs[18]++; + *itemptr++ = 18 | (extra_bits << 5); + run_start += 11 + extra_bits; + } + + /* Symbol 17: RLE 3 to 10 zeroes at a time. */ + if ((run_end - run_start) >= 3) { + extra_bits = min((run_end - run_start) - 3, 0x7); + precode_freqs[17]++; + *itemptr++ = 17 | (extra_bits << 5); + run_start += 3 + extra_bits; + } + } else { + + /* A run of nonzero lengths. */ + + /* Symbol 16: RLE 3 to 6 of the previous length. */ + if ((run_end - run_start) >= 4) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + do { + extra_bits = min((run_end - run_start) - 3, 0x3); + precode_freqs[16]++; + *itemptr++ = 16 | (extra_bits << 5); + run_start += 3 + extra_bits; + } while ((run_end - run_start) >= 3); + } + } + + /* Output any remaining lengths without RLE. 
*/ + while (run_start != run_end) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + } + } while (run_start != num_lens); + + return itemptr - precode_items; +} + +/* + * Output a list of Huffman codeword lengths in compressed form. + * + * The codeword lengths are compressed using a separate Huffman code, the + * "precode", which contains a symbol for each possible codeword length in the + * larger code as well as several special symbols to represent repeated codeword + * lengths (a form of run-length encoding). The precode is itself constructed + * in canonical form, and its codeword lengths are represented literally in 19 + * 3-bit fields that immediately precede the compressed codeword lengths of the + * larger code. + */ +static void +deflate_write_compressed_lens(struct deflate_output_bitstream *os, + const u8 lens[], unsigned num_lens) +{ + u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS]; + u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; + u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS]; + unsigned precode_items[num_lens]; + unsigned num_precode_items; + unsigned precode_item; + unsigned precode_sym; + unsigned num_explicit_lens; + unsigned i; + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + for (i = 0; i < DEFLATE_NUM_PRECODE_SYMS; i++) + precode_freqs[i] = 0; + + /* Compute the "items" (RLE / literal tokens and extra bits) with which + * the codeword lengths in the larger code will be output. */ + num_precode_items = deflate_compute_precode_items(lens, + num_lens, + precode_freqs, + precode_items); + + /* Build the precode. */ + deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, + DEFLATE_MAX_PRE_CODEWORD_LEN, + precode_freqs, precode_lens, + precode_codewords); + + /* Count how many precode lengths we actually need to output. */ + for (num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; + num_explicit_lens > 4; + num_explicit_lens--) + if (precode_lens[deflate_precode_lens_permutation[num_explicit_lens - 1]] != 0) + break; + + deflate_write_bits(os, num_explicit_lens - 4, 4); + + /* Output the lengths of the codewords in the precode. */ + for (i = 0; i < num_explicit_lens; i++) + deflate_write_bits(os, precode_lens[deflate_precode_lens_permutation[i]], 3); + + /* Output the encoded lengths of the codewords in the larger code. */ + for (i = 0; i < num_precode_items; i++) { + precode_item = precode_items[i]; + precode_sym = precode_item & 0x1F; + deflate_write_bits(os, precode_codewords[precode_sym], + precode_lens[precode_sym]); + if (precode_sym >= 16) { + if (precode_sym == 16) + deflate_write_bits(os, precode_item >> 5, 2); + else if (precode_sym == 17) + deflate_write_bits(os, precode_item >> 5, 3); + else + deflate_write_bits(os, precode_item >> 5, 7); + } + } +} + +/* + * Output the specified Huffman codes. + * This is used for dynamic Huffman blocks. + */ +static void +deflate_write_huffman_codes(struct deflate_output_bitstream *os, + struct deflate_codes *codes) +{ + unsigned num_litlen_syms; + unsigned num_offset_syms; + + /* We only need to output up to the highest-valued symbol actually used. 
*/ + + for (num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; + num_litlen_syms > 257; + num_litlen_syms--) + if (codes->lens.litlen[num_litlen_syms - 1] != 0) + break; + + for (num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; + num_offset_syms > 1; + num_offset_syms--) + if (codes->lens.offset[num_offset_syms - 1] != 0) + break; + + deflate_write_bits(os, num_litlen_syms - 257, 5); + deflate_write_bits(os, num_offset_syms - 1, 5); + + /* If we're not outputting the full set of literal/length codeword + * lengths, temporarily move the offset codeword lengths over so that + * the literal/length and offset codeword lengths are contiguous. */ + + BUILD_BUG_ON(offsetof(struct deflate_lens, offset) != + DEFLATE_NUM_LITLEN_SYMS); + + if (num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) + memmove(&codes->lens.all[num_litlen_syms], + &codes->lens.all[DEFLATE_NUM_LITLEN_SYMS], + num_offset_syms * sizeof(codes->lens.all[0])); + + /* Output the codeword lengths. */ + + deflate_write_compressed_lens(os, codes->lens.all, + num_litlen_syms + num_offset_syms); + + /* Restore the offset codeword lengths if needed. */ + if (num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) + memmove(&codes->lens.all[DEFLATE_NUM_LITLEN_SYMS], + &codes->lens.all[num_litlen_syms], + num_offset_syms * sizeof(codes->lens.all[0])); +} + +/* Output a literal or match, encoded using the specified Huffman codes. */ +static inline void +deflate_write_item(struct deflate_output_bitstream *os, struct deflate_item item, + const struct deflate_codes *codes) +{ + u32 data = item.data; + u32 litlen_symbol = data & 0x1FF; + u32 offset_symbol; + + /* Literal/length symbol */ + deflate_write_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + + if (data < 256) /* Literal? */ + return; + + /* Match length */ + deflate_write_bits(os, (data >> 9) & 0x1F, + deflate_extra_length_bits[litlen_symbol - 257]); + + /* Match offset */ + offset_symbol = (data >> 14) & 0x1F; + deflate_write_bits(os, + codes->codewords.offset[offset_symbol], + codes->lens.offset[offset_symbol]); + deflate_write_bits(os, (data >> 19), + deflate_extra_offset_bits[offset_symbol]); +} + +/* Output the literals and matches for a block. */ +static void +deflate_write_items(struct deflate_output_bitstream *os, + const struct deflate_item items[], unsigned num_items, + const struct deflate_codes *codes) +{ + for (unsigned i = 0; i < num_items; i++) + deflate_write_item(os, items[i], codes); +} + +/* Output the end-of-block symbol. */ +static void +deflate_write_end_of_block(struct deflate_output_bitstream *os, + const struct deflate_codes *codes) +{ + deflate_write_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK], + codes->lens.litlen[DEFLATE_END_OF_BLOCK]); +} + +/* + * Output a block containing the literal/match "items" stored in + * c->chosen_items...next_chosen_item. + */ +static void +deflate_write_block(struct deflate_compressor *c, + struct deflate_item *next_chosen_item, + struct deflate_output_bitstream *os, + bool is_final_block) +{ + unsigned num_chosen_items = next_chosen_item - c->chosen_items; + struct deflate_codes *codes; + + /* Note: we don't currently output any uncompressed blocks. */ + + /* Account for end-of-block symbol */ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + + if (num_chosen_items >= 100) { + /* Use custom ("dynamic") Huffman codes. 
*/ + deflate_write_block_header(os, is_final_block, + DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN); + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_write_huffman_codes(os, &c->codes); + codes = &c->codes; + } else { + /* This is a very short block. Just use the static codes. */ + deflate_write_block_header(os, is_final_block, + DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + codes = &c->static_codes; + } + deflate_write_items(os, c->chosen_items, num_chosen_items, codes); + deflate_write_end_of_block(os, codes); + + /* Reset symbol frequencies if this wasn't the final block. */ + if (!is_final_block) + deflate_reset_symbol_frequencies(c); +} + +/* Return the length slot for the specified match length. */ +static inline unsigned +deflate_get_length_slot(struct deflate_compressor *c, unsigned length) +{ + return c->length_slot_fast[length]; +} + +/* Return the offset slot for the specified match offset. */ +static inline unsigned +deflate_get_offset_slot(struct deflate_compressor *c, unsigned offset) +{ +#if USE_FULL_OFFSET_SLOT_FAST + return c->offset_slot_fast[offset]; +#else + if (offset <= 256) + return c->offset_slot_fast[offset - 1]; + else + return c->offset_slot_fast[256 + ((offset - 1) >> 7)]; +#endif +} + +/* Tally the Huffman symbol frequencies needed to output a literal, and return + * the literal in intermediate form. */ +static inline struct deflate_item +deflate_choose_literal(struct deflate_compressor *c, unsigned literal) +{ + c->freqs.litlen[literal]++; + return (struct deflate_item) { .data = literal }; +} + +/* Tally the Huffman symbol frequencies needed to output a match, and return the + * match in intermediate form. */ +static inline struct deflate_item +deflate_choose_match(struct deflate_compressor *c, unsigned length, unsigned offset) +{ + unsigned length_slot = deflate_get_length_slot(c, length); + unsigned offset_slot = deflate_get_offset_slot(c, offset); + + c->freqs.litlen[257 + length_slot]++; + c->freqs.offset[offset_slot]++; + + return (struct deflate_item) { + .data = (257 + length_slot) | + ((u32)(length - deflate_length_slot_base[length_slot]) << 9) | + ((u32)offset_slot << 14) | + ((u32)(offset - deflate_offset_slot_base[offset_slot]) << 19), + }; +} + +/* + * This is the "greedy" DEFLATE compressor. It always chooses the longest match. + */ +static size_t +deflate_compress_greedy(struct deflate_compressor * restrict c, + const u8 * restrict in, size_t in_nbytes, + u8 * restrict out, size_t out_nbytes_avail) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + struct deflate_output_bitstream os; + struct deflate_item *next_chosen_item; + + deflate_init_output(&os, out, out_nbytes_avail); + next_chosen_item = c->chosen_items; + deflate_reset_symbol_frequencies(c); + + /* The outer loop repeats every WINDOW_SIZE bytes and handles the + * sliding window. 
*/ + do { + const u8 *in_cur_base; + const u8 *in_cur_end; + + if (in == in_next) + hc_matchfinder_init(&c->hc_mf); + else + hc_matchfinder_slide_window(&c->hc_mf); + + in_cur_base = in_next; + in_cur_end = in_next + min(in_end - in_next, + MATCHFINDER_WINDOW_SIZE); + do { + unsigned max_len; + unsigned nice_len; + unsigned length; + unsigned offset; + + max_len = min(in_cur_end - in_next, DEFLATE_MAX_MATCH_LEN); + nice_len = min(max_len, c->nice_match_length); + + length = hc_matchfinder_longest_match(&c->hc_mf, + in_cur_base, + in_next, + DEFLATE_MIN_MATCH_LEN - 1, + max_len, + nice_len, + c->max_search_depth, + &offset); + in_next += 1; + + if (length >= DEFLATE_MIN_MATCH_LEN) { + /* Match found. */ + *next_chosen_item++ = + deflate_choose_match(c, length, offset); + hc_matchfinder_skip_positions(&c->hc_mf, + in_cur_base, + in_next, + in_end, + length - 1); + in_next += length - 1; + } else { + /* No match found. */ + *next_chosen_item++ = + deflate_choose_literal(c, *(in_next - 1)); + } + + /* Check if it's time to output another block. */ + if (next_chosen_item - c->chosen_items >= + MAX_ITEMS_PER_BLOCK) + { + deflate_write_block(c, next_chosen_item, + &os, in_next == in_end); + next_chosen_item = c->chosen_items; + } + + } while (in_next != in_cur_end); + + } while (in_next != in_end); + + /* Output the last block. */ + if (next_chosen_item != c->chosen_items) + deflate_write_block(c, next_chosen_item, &os, true); + + return deflate_flush_output(&os); +} + +/* + * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to + * see if there's a longer match at the next position. If yes, it outputs a + * literal and continues to the next position. If no, it outputs the match. + */ +static size_t +deflate_compress_lazy(struct deflate_compressor * restrict c, + const u8 * restrict in, size_t in_nbytes, + u8 * restrict out, size_t out_nbytes_avail) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + struct deflate_output_bitstream os; + struct deflate_item *next_chosen_item; + struct deflate_item *end_block = c->chosen_items + MAX_ITEMS_PER_BLOCK; + + deflate_init_output(&os, out, out_nbytes_avail); + next_chosen_item = c->chosen_items; + deflate_reset_symbol_frequencies(c); + + /* The outer loop repeats every WINDOW_SIZE bytes and handles the + * sliding window. */ + do { + const u8 *in_cur_base; + const u8 *in_cur_end; + unsigned max_len; + unsigned nice_len; + + if (in == in_next) + hc_matchfinder_init(&c->hc_mf); + else + hc_matchfinder_slide_window(&c->hc_mf); + + in_cur_base = in_next; + in_cur_end = in_next + min(in_end - in_next, + MATCHFINDER_WINDOW_SIZE); + max_len = DEFLATE_MAX_MATCH_LEN; + nice_len = min(c->nice_match_length, max_len); + do { + unsigned cur_len; + unsigned cur_offset; + unsigned next_len; + unsigned next_offset; + + if (unlikely(in_cur_end - in_next < DEFLATE_MAX_MATCH_LEN)) { + max_len = in_cur_end - in_next; + nice_len = min(max_len, nice_len); + } + + /* Find the longest match at the current position. */ + cur_len = hc_matchfinder_longest_match(&c->hc_mf, + in_cur_base, + in_next, + DEFLATE_MIN_MATCH_LEN - 1, + max_len, + nice_len, + c->max_search_depth, + &cur_offset); + in_next += 1; + + if (cur_len < DEFLATE_MIN_MATCH_LEN) { + /* No match found. Choose a literal. */ + *next_chosen_item++ = + deflate_choose_literal(c, *(in_next - 1)); + goto check_block_and_continue; + } + + have_cur_match: + /* We have a match at the current position. */ + + /* If the current match is very long, choose it + * immediately. 
*/ + if (cur_len >= nice_len) { + *next_chosen_item++ = + deflate_choose_match(c, cur_len, cur_offset); + + hc_matchfinder_skip_positions(&c->hc_mf, + in_cur_base, + in_next, + in_end, + cur_len - 1); + in_next += cur_len - 1; + goto check_block_and_continue; + } + + /* + * Try to find a match at the next position. + * + * Note: since we already have a match at the *current* + * position, we use only half the 'max_search_depth' + * when checking the *next* position. This is a useful + * trade-off because it's more worthwhile to use a + * greater search depth on the initial match. + * + * Note: it's possible to structure the code such that + * there's only one call to longest_match(), which + * handles both the "find the initial match" and "try to + * find a longer match" cases. However, it is faster to + * have two call sites, with longest_match() inlined at + * each. + */ + if (unlikely(in_cur_end - in_next < DEFLATE_MAX_MATCH_LEN)) { + max_len = in_cur_end - in_next; + nice_len = min(max_len, nice_len); + } + next_len = hc_matchfinder_longest_match(&c->hc_mf, + in_cur_base, + in_next, + cur_len, + max_len, + nice_len, + c->max_search_depth / 2, + &next_offset); + in_next += 1; + + if (next_len > cur_len) { + /* Found a longer match at the next position. + * Output a literal. Then the next match + * becomes the current match. */ + *next_chosen_item++ = + deflate_choose_literal(c, *(in_next - 2)); + if (next_chosen_item == end_block) { + deflate_write_block(c, next_chosen_item, + &os, in_next == in_end); + next_chosen_item = c->chosen_items; + } + cur_len = next_len; + cur_offset = next_offset; + goto have_cur_match; + } else { + /* No longer match at the next position. + * Output the current match. */ + *next_chosen_item++ = + deflate_choose_match(c, cur_len, cur_offset); + + hc_matchfinder_skip_positions(&c->hc_mf, + in_cur_base, + in_next, + in_end, + cur_len - 2); + in_next += cur_len - 2; + goto check_block_and_continue; + } + + check_block_and_continue: + /* Check if it's time to output another block. */ + if (next_chosen_item == end_block) { + deflate_write_block(c, next_chosen_item, + &os, in_next == in_end); + next_chosen_item = c->chosen_items; + } + } while (in_next != in_cur_end); + + } while (in_next != in_end); + + /* Output the last block. */ + if (next_chosen_item != c->chosen_items) + deflate_write_block(c, next_chosen_item, &os, true); + + return deflate_flush_output(&os); +} + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* + * Follow the minimum-cost path in the graph of possible match/literal choices + * for the current block and compute the frequencies of the Huffman symbols that + * are needed to output those matches and literals. + */ +static void +deflate_tally_item_list(struct deflate_compressor *c, + struct deflate_optimum_node *end_optimum_ptr) +{ + struct deflate_optimum_node *cur_optimum_ptr = c->optimum; + do { + unsigned length = cur_optimum_ptr->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_optimum_ptr->item >> OPTIMUM_OFFSET_SHIFT; + + if (length == 1) { + /* Literal */ + c->freqs.litlen[offset]++; + } else { + /* Match */ + c->freqs.litlen[257 + deflate_get_length_slot(c, length)]++; + c->freqs.offset[deflate_get_offset_slot(c, offset)]++; + } + cur_optimum_ptr += length; + } while (cur_optimum_ptr != end_optimum_ptr); +} + +/* + * Follow the minimum-cost path in the graph of possible match/literal choices + * for the current block and write out the matches/literals using the specified + * Huffman codes. 
+ * + * Note: this is slightly duplicated with deflate_write_item(), the reason being + * that we don't want to waste time translating between intermediate + * match/literal representations. + */ +static void +deflate_write_item_list(struct deflate_output_bitstream *os, + const struct deflate_codes *codes, + struct deflate_compressor *c, + struct deflate_optimum_node * const end_optimum_ptr) +{ + struct deflate_optimum_node *cur_optimum_ptr = c->optimum; + do { + unsigned length = cur_optimum_ptr->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_optimum_ptr->item >> OPTIMUM_OFFSET_SHIFT; + unsigned litlen_symbol; + unsigned length_slot; + unsigned offset_slot; + + if (length == 1) { + /* Literal */ + litlen_symbol = offset; + deflate_write_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + } else { + /* Match length */ + length_slot = deflate_get_length_slot(c, length); + litlen_symbol = 257 + length_slot; + deflate_write_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + + deflate_write_bits(os, length - deflate_length_slot_base[length_slot], + deflate_extra_length_bits[length_slot]); + + /* Match offset */ + offset_slot = deflate_get_offset_slot(c, offset); + deflate_write_bits(os, codes->codewords.offset[offset_slot], + codes->lens.offset[offset_slot]); + deflate_write_bits(os, offset - deflate_offset_slot_base[offset_slot], + deflate_extra_offset_bits[offset_slot]); + } + cur_optimum_ptr += length; + } while (cur_optimum_ptr != end_optimum_ptr); +} + +/* Set the current cost model from the codeword lengths specified in @lens. */ +static void +deflate_set_costs(struct deflate_compressor *c, const struct deflate_lens * lens) +{ + /* Literals */ + for (unsigned i = 0; i < DEFLATE_NUM_LITERALS; i++) { + u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS); + c->costs.literal[i] = bits << COST_SHIFT; + } + + /* Lengths */ + for (unsigned i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { + unsigned length_slot = deflate_get_length_slot(c, i); + unsigned litlen_sym = 257 + length_slot; + u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); + bits += deflate_extra_length_bits[length_slot]; + c->costs.length[i] = bits << COST_SHIFT; + } + + /* Offset slots */ + for (unsigned i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { + u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS); + bits += deflate_extra_offset_bits[i]; + c->costs.offset_slot[i] = bits << COST_SHIFT; + } +} + +static inline u32 +deflate_default_literal_cost(unsigned literal) +{ + BUILD_BUG_ON(COST_SHIFT != 3); + /* 66 is 8.25 bits/symbol */ + return 66; +} + +static inline u32 +deflate_default_length_slot_cost(unsigned length_slot) +{ + BUILD_BUG_ON(COST_SHIFT != 3); + /* 60 is 7.5 bits/symbol */ + return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT); +} + +static inline u32 +deflate_default_offset_slot_cost(unsigned offset_slot) +{ + BUILD_BUG_ON(COST_SHIFT != 3); + /* 39 is 4.875 bits/symbol */ + return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT); +} + +/* + * Set default Huffman symbol costs for the first optimization pass. + * + * It works well to assume that each Huffman symbol is equally probable. This + * results in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 << + * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding + * alphabet. 
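+ *
+ * As a rough check of those numbers: with COST_SHIFT == 3 (8 cost units per
+ * bit), the 288-symbol literal/length alphabet gives -log2(1.0/288) * 8 ~= 65,
+ * close to the default literal cost of 66 (8.25 bits) chosen above, and the
+ * 32-symbol offset alphabet gives 5 * 8 = 40, close to the default offset
+ * slot cost of 39 (4.875 bits).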
However, we intentionally bias the parse towards matches rather + * than literals by using a slightly lower default cost for length symbols than + * for literals. This often improves the compression ratio slightly. + */ +static void +deflate_set_default_costs(struct deflate_compressor *c) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + c->costs.literal[i] = deflate_default_literal_cost(i); + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + c->costs.length[i] = deflate_default_length_slot_cost( + deflate_get_length_slot(c, i)); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + c->costs.offset_slot[i] = deflate_default_offset_slot_cost(i); +} + +static inline void +deflate_adjust_cost(u32 *cost_p, u32 default_cost) +{ + *cost_p += ((s32)default_cost - (s32)*cost_p) >> 1; +} + +/* + * Adjust the costs when beginning a new block. + * + * Since the current costs have been optimized for the data, it's undesirable to + * throw them away and start over with the default costs. At the same time, we + * don't want to bias the parse by assuming that the next block will be similar + * to the current block. As a compromise, make the costs closer to the + * defaults, but don't simply set them to the defaults. + */ +static void +deflate_adjust_costs(struct deflate_compressor *c) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + deflate_adjust_cost(&c->costs.literal[i], + deflate_default_literal_cost(i)); + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + deflate_adjust_cost(&c->costs.length[i], + deflate_default_length_slot_cost( + deflate_get_length_slot(c, i))); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + deflate_adjust_cost(&c->costs.offset_slot[i], + deflate_default_offset_slot_cost(i)); +} + +static void +deflate_optimize_and_write_block(struct deflate_compressor *c, + struct deflate_output_bitstream *os, + const unsigned block_len, + struct lz_match *end_cache_ptr, + const bool is_final_block) +{ + struct deflate_optimum_node *end_optimum_ptr = c->optimum + block_len; + unsigned num_passes_remaining = c->num_optim_passes; + + do { + /* + * Beginning a new optimization pass and finding a new + * minimum-cost path through the graph of possible match/literal + * choices for this block. + * + * We find the minimum cost path from 'c->optimum', which + * represents the node at the beginning of the block, to + * 'end_optimum_ptr', which represents the node at the end of + * the block. Edge costs are evaluated using the cost model + * 'c->costs'. + * + * The algorithm works backward, starting at 'end_optimum_ptr' + * and proceeding backwards one position at a time. At each + * position, the minimum cost to reach 'end_optimum_ptr' is + * computed and the match/literal choice is saved. + */ + struct deflate_optimum_node *cur_optimum_ptr = end_optimum_ptr; + struct lz_match *cache_ptr = end_cache_ptr; + + cur_optimum_ptr->cost_to_end = 0; + do { + unsigned num_matches; + unsigned literal; + u32 best_cost_to_end; + u32 best_item; + + cur_optimum_ptr--; + cache_ptr--; + + num_matches = cache_ptr->length; + literal = cache_ptr->offset; + + /* It's always possible to choose a literal. */ + best_cost_to_end = c->costs.literal[literal] + + (cur_optimum_ptr + 1)->cost_to_end; + best_item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; + + /* Also consider matches if there are any. 
*/ + if (num_matches) { + struct lz_match *match; + unsigned len; + unsigned offset; + unsigned offset_slot; + u32 offset_cost; + u32 cost_to_end; + + /* + * Consider each length from the minimum + * (DEFLATE_MIN_MATCH_LEN) to the length of the + * longest match found at this position. For + * each length, we consider only the smallest + * offset for which that length is available. + * Although this is not guaranteed to be optimal + * due to the possibility of a larger offset + * costing less than a smaller offset to code, + * this is a very useful heuristic. + */ + match = cache_ptr - num_matches; + len = DEFLATE_MIN_MATCH_LEN; + do { + offset = match->offset; + offset_slot = deflate_get_offset_slot(c, offset); + offset_cost = c->costs.offset_slot[offset_slot]; + do { + cost_to_end = offset_cost + + c->costs.length[len] + + (cur_optimum_ptr + len)->cost_to_end; + if (cost_to_end < best_cost_to_end) { + best_cost_to_end = cost_to_end; + best_item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len; + } + } while (++len <= match->length); + } while (++match != cache_ptr); + cache_ptr -= num_matches; + } + cur_optimum_ptr->cost_to_end = best_cost_to_end; + cur_optimum_ptr->item = best_item; + } while (cur_optimum_ptr != c->optimum); + + /* Tally Huffman symbol frequencies. */ + deflate_tally_item_list(c, end_optimum_ptr); + + /* If this wasn't the last pass, update the cost model. */ + if (num_passes_remaining > 1) { + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_set_costs(c, &c->codes.lens); + deflate_reset_symbol_frequencies(c); + } + } while (--num_passes_remaining); + + /* All optimization passes are done. Output a block using the + * minimum-cost path computed on the last optimization pass. */ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_reset_symbol_frequencies(c); + deflate_write_block_header(os, is_final_block, DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN); + deflate_write_huffman_codes(os, &c->codes); + deflate_write_item_list(os, &c->codes, c, end_optimum_ptr); + deflate_write_end_of_block(os, &c->codes); +} + +/* + * This is the "near-optimal" DEFLATE compressor. It computes the optimal + * representation of each DEFLATE block using a minimum-cost path search over + * the graph of possible match/literal choices for that block, assuming a + * certain cost for each Huffman symbol. 
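+ *
+ * Roughly, the per-block search fills in a minimum "cost to end of block" for
+ * every position, working backward from the end of the block.  In sketch form
+ * (this is what deflate_optimize_and_write_block() above does; cost_literal()
+ * and cost_match() stand in for the lookups in 'c->costs'):
+ *
+ *	cost_to_end[block_len] = 0;
+ *	for (i = block_len - 1; i >= 0; i--) {
+ *		best = cost_literal(in[i]) + cost_to_end[i + 1];
+ *		for (each cached match (len, offset) at position i)
+ *			best = min(best, cost_match(len, offset) +
+ *					 cost_to_end[i + len]);
+ *		cost_to_end[i] = best;		and record the winning choice
+ *	}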
+ * + * For several reasons, the end result is not guaranteed to be optimal: + * + * - Nonoptimal choice of blocks + * - Heuristic limitations on which matches are actually considered + * - Symbol costs are unknown until the symbols have already been chosen + * (so iterative optimization must be used) + */ +static size_t +deflate_compress_near_optimal(struct deflate_compressor * restrict c, + const u8 * restrict in, size_t in_nbytes, + u8 * restrict out, size_t out_nbytes_avail) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + struct deflate_output_bitstream os; + const u8 *in_cur_base; + const u8 *in_next_slide; + unsigned max_len; + unsigned nice_len; + struct lz_match *cache_ptr; + struct lz_match *cache_end; + const u8 *in_block_begin; + const u8 *in_block_end; + unsigned num_matches; + unsigned best_len; + unsigned long prev_hash = 0; + + deflate_init_output(&os, out, out_nbytes_avail); + deflate_reset_symbol_frequencies(c); + + bt_matchfinder_init(&c->bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + min(in_end - in_next, MATCHFINDER_WINDOW_SIZE); + + max_len = DEFLATE_MAX_MATCH_LEN; + nice_len = min(c->nice_match_length, max_len); + + do { + /* Starting a new DEFLATE block. */ + + cache_ptr = c->cached_matches; + cache_end = &c->cached_matches[CACHE_LEN - (MAX_MATCHES_PER_POS + 1)]; + in_block_begin = in_next; + in_block_end = in_next + min(in_end - in_next, OPTIM_BLOCK_LENGTH); + + /* Set the initial cost model. */ + if (in_next == in) + deflate_set_default_costs(c); + else + deflate_adjust_costs(c); + + /* Find all match possibilities in this block. */ + do { + /* Decrease the maximum and nice match lengths if we're + * approaching the end of the input buffer. */ + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = min(max_len, nice_len); + } + + /* Force the block to end if the match cache may + * overflow. This case is very unlikely. */ + if (unlikely(cache_ptr > cache_end)) + break; + + /* Slide the window forward if needed. */ + if (in_next == in_next_slide) { + bt_matchfinder_slide_window(&c->bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + min(in_end - in_next, + MATCHFINDER_WINDOW_SIZE); + } + + /* + * Find matches with the current position using the + * binary tree matchfinder and save them in + * 'cached_matches'. + * + * Note: the binary tree matchfinder is more suited for + * optimal parsing than the hash chain matchfinder. The + * reasons for this include: + * + * - The binary tree matchfinder can find more matches + * in the same number of steps. + * - One of the major advantages of hash chains is that + * skipping positions (not searching for matches at + * them) is faster; however, with optimal parsing we + * search for matches at almost all positions, so this + * advantage of hash chains is negated. + */ + num_matches = + bt_matchfinder_get_matches(&c->bt_mf, + in_cur_base, + in_next, + max_len, + nice_len, + c->max_search_depth, + &prev_hash, + cache_ptr); + cache_ptr += num_matches; + cache_ptr->length = num_matches; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + + if (num_matches) { + best_len = cache_ptr[-2].length; + + /* + * If there was a very long match found, don't + * cache any matches for the bytes covered by + * that match. This avoids degenerate behavior + * when compressing highly redundant data, where + * the number of matches can be very large. + * + * This heuristic doesn't actually hurt the + * compression ratio very much. 
If there's a + * long match, then the data must be highly + * compressible, so it doesn't matter much what + * we do. + * + * We also trigger this same case when + * approaching the desired end of the block. + * This forces the block to reach a "stopping + * point" where there are no matches extending + * to later positions. (XXX: this behavior is + * non-optimal and should be improved.) + */ + if (best_len >= min(nice_len, in_block_end - in_next)) { + --best_len; + do { + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = min(max_len, nice_len); + } + if (in_next == in_next_slide) { + bt_matchfinder_slide_window(&c->bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + min(in_end - in_next, + MATCHFINDER_WINDOW_SIZE); + } + bt_matchfinder_skip_position(&c->bt_mf, + in_cur_base, + in_next, + in_end, + nice_len, + c->max_search_depth, + &prev_hash); + + cache_ptr->length = 0; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + } while (--best_len); + } + } + } while (in_next < in_block_end); + + /* All the matches for this block have been cached. Now compute + * a near-optimal sequence of literals and matches, and output + * the block. */ + + deflate_optimize_and_write_block(c, &os, in_next - in_block_begin, + cache_ptr, in_next == in_end); + } while (in_next != in_end); + + return deflate_flush_output(&os); +} + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/* Initialize c->length_slot_fast. */ +static void +deflate_init_length_slot_fast(struct deflate_compressor *c) +{ + unsigned length_slot; + unsigned length; + unsigned length_end; + + for (length_slot = 0; + length_slot < ARRAY_LEN(deflate_length_slot_base); + length_slot++) + { + length = deflate_length_slot_base[length_slot]; + length_end = length + (1 << deflate_extra_length_bits[length_slot]); + do { + c->length_slot_fast[length] = length_slot; + } while (++length != length_end); + } +} + +/* Initialize c->offset_slot_fast. */ +static void +deflate_init_offset_slot_fast(struct deflate_compressor *c) +{ + unsigned offset_slot; + unsigned offset; + unsigned offset_end; + + for (offset_slot = 0; + offset_slot < ARRAY_LEN(deflate_offset_slot_base); + offset_slot++) + { + offset = deflate_offset_slot_base[offset_slot]; + #if USE_FULL_OFFSET_SLOT_FAST + offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->offset_slot_fast[offset] = offset_slot; + } while (++offset != offset_end); + #else + if (offset <= 256) { + offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->offset_slot_fast[offset - 1] = offset_slot; + } while (++offset != offset_end); + } else { + offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot; + } while ((offset += (1 << 7)) != offset_end); + } + #endif + } +} + +/* Initialize c->static_codes. 
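+ *
+ * The static (BTYPE == 1) code defined by the DEFLATE format assigns 8-bit
+ * codewords to litlen symbols 0-143, 9 bits to symbols 144-255, 7 bits to
+ * symbols 256-279, 8 bits to symbols 280-287, and 5 bits to all 32 offset
+ * symbols.  Rather than hard-coding those codewords, we seed each symbol's
+ * frequency with 1 << (9 - len) (or 1 << (5 - len) for offsets); the
+ * frequencies then sum to exactly 1 << 9 and 1 << 5 respectively, so the
+ * canonical Huffman code builder reproduces exactly those codeword lengths
+ * and the normal codeword generation and bit-reversal can be reused.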
*/ +static void +deflate_init_static_codes(struct deflate_compressor *c) +{ + unsigned i; + + for (i = 0; i < 144; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + for (; i < 256; i++) + c->freqs.litlen[i] = 1 << (9 - 9); + for (; i < 280; i++) + c->freqs.litlen[i] = 1 << (9 - 7); + for (; i < 288; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + + for (i = 0; i < 32; i++) + c->freqs.offset[i] = 1 << (5 - 5); + + deflate_make_huffman_codes(&c->freqs, &c->static_codes); +} + +LIBEXPORT struct deflate_compressor * +deflate_alloc_compressor(unsigned int compression_level) +{ + struct deflate_compressor *c; + size_t size; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + if (compression_level >= 8) + size = offsetof(struct deflate_compressor, optimal_end); + else +#endif + size = offsetof(struct deflate_compressor, nonoptimal_end); + + c = aligned_alloc(MATCHFINDER_ALIGNMENT, size); + if (!c) + return NULL; + + c->compression_level = compression_level; + + switch (compression_level) { + case 0: + c->impl = deflate_compress_greedy; + c->max_search_depth = 1; + c->nice_match_length = 3; + break; + case 1: + c->impl = deflate_compress_greedy; + c->max_search_depth = 4; + c->nice_match_length = 9; + break; + case 2: + c->impl = deflate_compress_greedy; + c->max_search_depth = 12; + c->nice_match_length = 9; + break; + case 3: + c->impl = deflate_compress_greedy; + c->max_search_depth = 24; + c->nice_match_length = 18; + break; + case 4: + c->impl = deflate_compress_greedy; + c->max_search_depth = 36; + c->nice_match_length = 27; + break; + case 5: + c->impl = deflate_compress_lazy; + c->max_search_depth = 32; + c->nice_match_length = 24; + break; + case 6: + c->impl = deflate_compress_lazy; + c->max_search_depth = 48; + c->nice_match_length = 36; + break; + case 7: + c->impl = deflate_compress_lazy; + c->max_search_depth = 72; + c->nice_match_length = 54; + break; +#if SUPPORT_NEAR_OPTIMAL_PARSING + case 8: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 25; + c->nice_match_length = 40; + c->num_optim_passes = 1; + break; + case 9: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 50; + c->nice_match_length = 80; + c->num_optim_passes = 2; + break; + case 10: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 100; + c->nice_match_length = 120; + c->num_optim_passes = 3; + break; + default: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 250; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + c->num_optim_passes = 5; + break; +#else + case 8: + c->impl = deflate_compress_lazy; + c->max_search_depth = 108; + c->nice_match_length = 81; + break; + case 9: + c->impl = deflate_compress_lazy; + c->max_search_depth = 162; + c->nice_match_length = 122; + break; + case 10: + c->impl = deflate_compress_lazy; + c->max_search_depth = 243; + c->nice_match_length = 182; + break; + default: + c->impl = deflate_compress_lazy; + c->max_search_depth = 365; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + break; +#endif + } + + deflate_init_offset_slot_fast(c); + deflate_init_length_slot_fast(c); + deflate_init_static_codes(c); + + return c; +} + +LIBEXPORT size_t +deflate_compress(struct deflate_compressor *c, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) +{ + if (in_nbytes < 16) + return 0; + return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail); +} + +LIBEXPORT void +deflate_free_compressor(struct deflate_compressor *c) +{ + free(c); +} + +unsigned int +deflate_get_compression_level(struct deflate_compressor *c) +{ + return 
c->compression_level; +} diff --git a/src/deflate_compress.h b/src/deflate_compress.h new file mode 100644 index 0000000..ccf943e --- /dev/null +++ b/src/deflate_compress.h @@ -0,0 +1,9 @@ +#pragma once + +/* 'struct deflate_compressor' is private to deflate_compress.c, but zlib header + * generation needs to be able to query the compression level. */ + +struct deflate_compressor; + +extern unsigned int +deflate_get_compression_level(struct deflate_compressor *c); diff --git a/src/deflate_constants.h b/src/deflate_constants.h new file mode 100644 index 0000000..cb07c07 --- /dev/null +++ b/src/deflate_constants.h @@ -0,0 +1,59 @@ +/* + * deflate_constants.h + * + * Constants for the DEFLATE compression format. + */ + +#pragma once + +/* Valid block types */ +#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 +#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 +#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 + +/* Minimum and maximum supported match lengths (in bytes) */ +#define DEFLATE_MIN_MATCH_LEN 3 +#define DEFLATE_MAX_MATCH_LEN 258 + +/* Minimum and maximum supported match offsets (in bytes) */ +#define DEFLATE_MIN_MATCH_OFFSET 1 +#define DEFLATE_MAX_MATCH_OFFSET 32767 + +#define DEFLATE_MAX_WINDOW_SIZE 32768 + +/* Number of symbols in each Huffman code. Note: for the literal/length + * and offset codes, these are actually the maximum values; a given block + * might use fewer symbols. */ +#define DEFLATE_NUM_PRECODE_SYMS 19 +#define DEFLATE_NUM_LITLEN_SYMS 288 +#define DEFLATE_NUM_OFFSET_SYMS 32 + +/* Division of symbols in the literal/length code */ +#define DEFLATE_NUM_LITERALS 256 +#define DEFLATE_END_OF_BLOCK 256 +#define DEFLATE_NUM_LEN_SYMS 31 + +/* Maximum codeword length, in bits, within each Huffman code */ +#define DEFLATE_MAX_PRE_CODEWORD_LEN 7 +#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 +#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 + +/* Maximum possible overrun when decoding codeword lengths */ +#define DEFLATE_MAX_LENS_OVERRUN 137 + +/* + * Maximum number of extra bits that may be required to represent a match + * length or offset. + * + * TODO: are we going to have full DEFLATE64 support? If so, up to 16 + * length bits must be supported. + */ +#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 +#define DEFLATE_MAX_EXTRA_OFFSET_BITS 14 + +/* The maximum number of bits in which a match can be represented. This + * is the absolute worst case, which assumes the longest possible Huffman + * codewords and the maximum numbers of extra bits. */ +#define DEFLATE_MAX_MATCH_BITS \ + (DEFLATE_MAX_LITLEN_CODEWORD_LEN + DEFLATE_MAX_EXTRA_LENGTH_BITS + \ + DEFLATE_MAX_OFFSET_CODEWORD_LEN + DEFLATE_MAX_EXTRA_OFFSET_BITS) diff --git a/src/deflate_decompress.c b/src/deflate_decompress.c new file mode 100644 index 0000000..eacf6fd --- /dev/null +++ b/src/deflate_decompress.c @@ -0,0 +1,1455 @@ +/* + * deflate_decompress.c + * + * This is a highly optimized DEFLATE decompressor. On x86_64 it decompresses + * data in about 59% of the time of zlib. On other architectures it should + * still be significantly faster than zlib, but the difference may be smaller. + * + * This decompressor currently only supports raw DEFLATE (not zlib or gzip), and + * it only supports whole-buffer decompression (not streaming). 
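+ * Consequently, the caller must already know (or bound) the uncompressed size
+ * and provide an output buffer at least that large (for example, from a size
+ * stored alongside the compressed data, as gzip's ISIZE field provides modulo
+ * 2**32).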
+ * + * Why this is faster than zlib's implementation: + * + * - Word accesses rather than byte accesses when reading input + * - Word accesses rather than byte accesses when copying matches + * - Faster Huffman decoding combined with various DEFLATE-specific tricks + * - Larger bitbuffer variable that doesn't need to be filled as often + * - Other optimizations to remove unnecessary branches + * - Only full-buffer decompression is supported, so the code doesn't need to + * support stopping and resuming decompression. + */ + +#include +#include +#include + +#include "libdeflate.h" + +#include "deflate_constants.h" +#include "unaligned.h" + +#ifndef UNSAFE_DECOMPRESSION +# define UNSAFE_DECOMPRESSION 0 +#endif + +#if UNSAFE_DECOMPRESSION +# warning "unsafe decompression is enabled" +# define SAFETY_CHECK(expr) 0 +#else +# define SAFETY_CHECK(expr) unlikely(expr) +#endif + +/* + * Each of these values is the base 2 logarithm of the number of entries of the + * corresponding decode table. Each value should be large enough to ensure that + * for typical data, the vast majority of symbols can be decoded by a direct + * lookup of the next TABLEBITS bits of compressed data. However, this must be + * balanced against the fact that a larger table requires more memory and + * requires more time to fill. + */ +#define DEFLATE_PRECODE_TABLEBITS 7 +#define DEFLATE_LITLEN_TABLEBITS 10 +#define DEFLATE_OFFSET_TABLEBITS 9 + +/* + * Type for codeword lengths. + */ +typedef u8 len_t; + +/* + * The main DEFLATE decompressor structure. Since this implementation only + * supports full buffer decompression, this structure does not store the entire + * decompression state, but rather only some arrays that are too large to + * comfortably allocate on the stack. + */ +struct deflate_decompressor { + + /* + * The arrays aren't all needed at the same time. 'precode_lens' and + * 'precode_decode_table' are unneeded after 'lens' has been filled. + * Furthermore, 'lens' need not be retained after building the litlen + * and offset decode tables. In fact, 'lens' can be in union with + * 'litlen_decode_table' provided that 'offset_decode_table' is separate + * and is built first. + */ + + union { + len_t precode_lens[DEFLATE_NUM_PRECODE_SYMS]; + + struct { + len_t lens[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS + + DEFLATE_MAX_LENS_OVERRUN]; + + u32 precode_decode_table[(1 << DEFLATE_PRECODE_TABLEBITS) + + (2 * DEFLATE_NUM_PRECODE_SYMS)]; + }; + + u32 litlen_decode_table[(1 << DEFLATE_LITLEN_TABLEBITS) + + (2 * DEFLATE_NUM_LITLEN_SYMS)]; + }; + + u32 offset_decode_table[(1 << DEFLATE_OFFSET_TABLEBITS) + + (2 * DEFLATE_NUM_OFFSET_SYMS)]; +}; + +/***************************************************************************** + * Input bitstream * + *****************************************************************************/ + +/* + * The state of the "input bitstream" consists of the following variables: + * + * - in_next: pointer to the next unread byte in the input buffer + * + * - in_end: pointer just past the end of the input buffer + * + * - bitbuf: a word-sized variable containing bits that have been read from + * the input buffer. The buffered bits are right-aligned + * (they're the low-order bits). + * + * - bitsleft: number of bits in 'bitbuf' that are valid. + * + * To make it easier for the compiler to optimize the code by keeping variables + * in registers, these are declared as normal variables and manipulated using + * macros. 
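+ *
+ * In deflate_decompress() itself these are expected to be declared roughly as
+ * in the following sketch (the names 'in' and 'in_nbytes' are illustrative):
+ *
+ *	const u8 *in_next = in;
+ *	const u8 * const in_end = in_next + in_nbytes;
+ *	bitbuf_t bitbuf = 0;
+ *	unsigned bitsleft = 0;
+ *	unsigned overrun_count = 0;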
+ */
+
+/*
+ * The type for the bitbuffer variable ('bitbuf' described above).  For best
+ * performance, this should have size equal to a machine word.
+ */
+typedef machine_word_t bitbuf_t;
+
+/*
+ * Number of bits the bitbuffer variable can hold.
+ */
+#define BITBUF_NBITS (8 * sizeof(bitbuf_t))
+
+/*
+ * The maximum number of bits that can be requested to be in the bitbuffer
+ * variable.  This is the maximum value of 'n' that can be passed to
+ * ENSURE_BITS(n).
+ *
+ * This is not equal to BITBUF_NBITS because we never read less than one byte
+ * at a time.  If the bitbuffer variable contains more than (BITBUF_NBITS - 8)
+ * bits, then we can't read another byte without first consuming some bits.
+ * So the maximum count we can ensure is (BITBUF_NBITS - 7).
+ */
+#define MAX_ENSURE (BITBUF_NBITS - 7)
+
+/*
+ * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if
+ * 'n' is too large to be passed to ENSURE_BITS(n).  Note: if 'n' is a
+ * compile-time constant, then this expression will be a compile-time constant.
+ * Therefore, CAN_ENSURE() can be used to choose between alternative
+ * implementations at compile time.
+ */
+#define CAN_ENSURE(n) ((n) <= MAX_ENSURE)
+
+/*
+ * Fill the bitbuffer variable, reading one byte at a time.
+ *
+ * Note: if we would overrun the input buffer, we just don't read anything,
+ * leaving the bits as 0 but marking them as filled.  This makes the
+ * implementation simpler because this removes the need to distinguish between
+ * "real" overruns and overruns that occur because of our own lookahead during
+ * Huffman decoding.  The disadvantage is that a "real" overrun can go
+ * undetected, and deflate_decompress() may return a success status rather than
+ * the expected failure status if one occurs.  However, this is irrelevant
+ * because even if this specific case were to be handled "correctly", one could
+ * easily come up with a different case where the compressed data would be
+ * corrupted in such a way that fully retains its validity.  Users should run a
+ * checksum against the uncompressed data if they wish to detect corruptions.
+ */
+#define FILL_BITS_BYTEWISE() \
+({ \
+	assert(bitsleft <= BITBUF_NBITS - 8); \
+	do { \
+		if (likely(in_next != in_end)) \
+			bitbuf |= (bitbuf_t)*in_next++ << bitsleft; \
+		else \
+			overrun_count++; \
+		bitsleft += 8; \
+	} while (bitsleft <= BITBUF_NBITS - 8); \
+})
+
+/*
+ * Fill the bitbuffer variable by reading the next word from the input buffer.
+ * This can be significantly faster than FILL_BITS_BYTEWISE().  However, for
+ * this to work correctly, the word must be interpreted in little-endian format.
+ * In addition, the memory access may be unaligned.  Therefore, this method is
+ * most efficient on little-endian architectures that support fast unaligned
+ * access, such as x86 and x86_64.
+ */
+#define FILL_BITS_WORDWISE() \
+({ \
+	assert(bitsleft < BITBUF_NBITS); \
+	bitbuf |= get_unaligned_word_le(in_next) << bitsleft; \
+	in_next += (BITBUF_NBITS - bitsleft) >> 3; \
+	bitsleft += (BITBUF_NBITS - bitsleft) & ~7; \
+})
+
+/*
+ * Does the bitbuffer variable currently contain at least 'n' bits?
+ */
+#define HAVE_BITS(n) (bitsleft >= (n))
+
+/*
+ * Raw form of ENSURE_BITS(): the bitbuffer variable must not already contain
+ * the requested number of bits.
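+ *
+ * For illustration, using the ENSURE_BITS() and POP_BITS() wrappers defined
+ * below, reading the 3-bit DEFLATE block header might look like this (the
+ * variable names are illustrative):
+ *
+ *	ENSURE_BITS(1 + 2);
+ *	is_final_block = POP_BITS(1);
+ *	block_type = POP_BITS(2);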
+ */ +#define DO_ENSURE_BITS(n) \ +({ \ + assert(CAN_ENSURE(n)); \ + assert(!HAVE_BITS(n)); \ + if (CPU_IS_LITTLE_ENDIAN && \ + UNALIGNED_ACCESS_IS_FAST && \ + likely(in_end - in_next >= sizeof(bitbuf_t))) \ + FILL_BITS_WORDWISE(); \ + else \ + FILL_BITS_BYTEWISE(); \ + assert(HAVE_BITS(n)); \ +}) + +/* + * Load more bits from the input buffer until the specified number of bits is + * present in the bitbuffer variable. 'n' cannot be too large; see MAX_ENSURE + * and CAN_ENSURE(). + */ +#define ENSURE_BITS(n) \ +({ \ + assert(CAN_ENSURE(n)); \ + if (!HAVE_BITS(n)) \ + DO_ENSURE_BITS(n); \ + assert(HAVE_BITS(n)); \ +}) + +/* + * Return the next 'n' bits from the bitbuffer variable without removing them. + */ +#define BITS(n) \ +({ \ + assert(HAVE_BITS(n)); \ + bitbuf & (((bitbuf_t)1 << (n)) - 1); \ +}) + +/* + * Remove the next 'n' bits from the bitbuffer variable. + */ +#define REMOVE_BITS(n) \ +({ \ + assert(HAVE_BITS(n)); \ + bitbuf >>= (n); \ + bitsleft -= (n); \ +}) + +/* + * Remove and return the next 'n' bits from the bitbuffer variable. + */ +#define POP_BITS(n) \ +({ \ + bitbuf_t bits = BITS(n); \ + REMOVE_BITS(n); \ + bits; \ +}) + +/* + * Align the input to the next byte boundary, discarding any remaining bits in + * the current byte. + * + * Note that if the bitbuffer variable currently contains more than 8 bits, then + * we must rewind 'in_next', effectively putting those bits back. Only the bits + * in what would be the "current" byte if we were reading one byte at a time can + * be actually discarded. + */ +#define ALIGN_INPUT() \ +({ \ + in_next -= (bitsleft >> 3) - min(overrun_count, bitsleft >> 3); \ + bitbuf = 0; \ + bitsleft = 0; \ +}) + +/* + * Read a 16-bit value from the input. This must have been preceded by a call + * to ALIGN_INPUT(), and the caller must have already checked for overrun. + */ +#define READ_U16() \ +({ \ + u16 v; \ + \ + assert(bitsleft == 0); \ + assert(in_end - in_next >= 2); \ + \ + v = get_unaligned_u16_le(in_next); \ + in_next += 2; \ + v; \ +}) + +/***************************************************************************** + * Huffman decoding * + *****************************************************************************/ + +/* + * A decode table for order TABLEBITS contains (1 << TABLEBITS) entries, plus + * additional entries for non-root binary tree nodes. The number of non-root + * binary tree nodes is variable, but cannot possibly be more than twice the + * number of symbols in the alphabet for which the decode table is built. + * + * The decoding algorithm takes the next TABLEBITS bits of compressed data and + * uses them as an index into the decode table. The resulting entry is either a + * "direct entry", meaning that it contains the value desired, or a "tree root + * entry", meaning that it is the root of a binary tree that must be traversed + * using more bits of the compressed data (0 bit means go to the left child, 1 + * bit means go to the right child) until a leaf is reached. + * + * Each decode table is associated with a Huffman code. Logically, the result + * of a decode table lookup is a symbol from the alphabet from which the + * corresponding Huffman code was constructed. A symbol with codeword length n + * <= TABLEBITS is associated with 2**(TABLEBITS - n) direct entries in the + * table, whereas a symbol with codeword length n > TABLEBITS shares a binary + * tree with a number of other codewords. 
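+ *
+ * For example, with DEFLATE_LITLEN_TABLEBITS == 10, a litlen symbol whose
+ * codeword is 7 bits long is in effect replicated across 2**(10 - 7) = 8
+ * table entries: every index whose low 7 bits match the codeword, the
+ * remaining 3 index bits being "don't care" bits that stay in the
+ * bitbuffer.  A symbol with an 11-bit codeword instead shares a binary tree
+ * rooted at the entry indexed by its first 10 bits of compressed data.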
+ * + * On top of this basic design, we implement several optimizations: + * + * - We store the length of each codeword directly in each of its decode table + * entries. This allows the codeword length to be produced without indexing + * an additional table. + * + * - When beneficial, we don't store the Huffman symbol itself, but instead data + * generated from it. For example, when decoding an offset symbol in DEFLATE, + * it's more efficient if we can decode the offset base and number of extra + * offset bits directly rather than decoding the offset symbol and then + * looking up both of those values in an additional table or tables. + * + * - It can be possible to decode more than just a single Huffman symbol from + * the next TABLEBITS bits of the input. We take advantage of this when + * decoding match lengths. When possible, the decode table entry will provide + * the full match length. In this case, the stored "codeword length" will + * actually be the codeword length plus the number of extra length bits that + * are being consumed. + * + * The size of each decode table entry is 32 bits, which provides slightly + * better performance than 16-bit entries on 32 and 64 bit processers, provided + * that the table doesn't get so large that it takes up too much memory and + * starts generating cache misses. The bits of each decode table entry are + * defined as follows: + * + * - Bits 29 -- 31: flags (see below) + * - Bits 25 -- 28: codeword length + * - Bits 0 -- 24: decode result: a Huffman symbol or related data + */ + +/* + * Flags usage: + * + * The precode and offset tables only use these flags to distinguish nonleaf + * tree nodes from other entries. In nonleaf tree node entries, all flags are + * set and the recommended one to test is HUFFDEC_TREE_NONLEAF_FAST_FLAG. + * + * The literal/length decode table uses all the flags. During decoding, the + * flags are designed to be tested from high to low. If a flag is set, then all + * higher flags are also set. + */ + +/* + * This flag is set in all entries that do not represent a literal symbol, + * excluding tree leaves. This enables a very fast path for non-rare literals: + * just check if this bit is clear, and if so extract the literal from the low + * bits. + */ +#define HUFFDEC_NOT_LITERAL 0x80000000 + +/* + * This flag is set in all entries that represent neither a literal symbol nor a + * full match length, excluding tree leaves. + */ +#define HUFFDEC_NOT_FULL_LENGTH 0x40000000 + +/* + * This flag is set in all nonleaf tree entries (roots and internal nodes). + */ +#define HUFFDEC_TREE_NONLEAF 0x20000000 + +/* + * HUFFDEC_TREE_NONLEAF implies that the following flags are also set. + */ +#define HUFFDEC_TREE_NONLEAF_FLAGS 0xE0000000 + +/* + * For distinguishing between any direct entry and a tree root, or between an + * internal tree node and a leaf node, this bit should be checked in preference + * to any other in HUFFDEC_TREE_NONLEAF_FLAGS --- the reason being this is the + * sign bit, and some architectures have special instructions to handle it. + */ +#define HUFFDEC_TREE_NONLEAF_FAST_FLAG 0x80000000 + +/* + * Number of flag bits defined above. + */ +#define HUFFDEC_NUM_FLAG_BITS 3 + +/* + * Number of bits reserved for the codeword length in decode table entries, and + * the corresponding mask and limit. 4 bits provides a max length of 15, which + * is enough for any DEFLATE codeword. 
(And actually, we don't even need the + * full 15 because only lengths less than or equal to the appropriate TABLEBITS + * will ever be stored in this field.) + */ +#define HUFFDEC_LEN_BITS 4 +#define HUFFDEC_LEN_MASK (((u32)1 << HUFFDEC_LEN_BITS) - 1) +#define HUFFDEC_MAX_LEN HUFFDEC_LEN_MASK + +/* + * Value by which a decode table entry can be right-shifted to get the length + * field. Note: the result must be AND-ed with HUFFDEC_LEN_MASK unless it is + * guaranteed that no flag bits are set. + */ +#define HUFFDEC_LEN_SHIFT (32 - HUFFDEC_NUM_FLAG_BITS - HUFFDEC_LEN_BITS) + +/* + * Mask to get the "value" of a decode table entry. This is the decode result + * and contains data dependent on the table. + */ +#define HUFFDEC_VALUE_MASK (((u32)1 << HUFFDEC_LEN_SHIFT) - 1) + +/* + * Data needed to initialize the entries in the length/literal decode table. + */ +static const u32 deflate_litlen_symbol_data[DEFLATE_NUM_LITLEN_SYMS] = { + /* Literals */ + 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , + 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , + 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , + 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , + 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , + 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , + 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , + 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , + 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , + 72 , 73 , 74 , 75 , 76 , 77 , 78 , 79 , + 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , + 88 , 89 , 90 , 91 , 92 , 93 , 94 , 95 , + 96 , 97 , 98 , 99 , 100 , 101 , 102 , 103 , + 104 , 105 , 106 , 107 , 108 , 109 , 110 , 111 , + 112 , 113 , 114 , 115 , 116 , 117 , 118 , 119 , + 120 , 121 , 122 , 123 , 124 , 125 , 126 , 127 , + 128 , 129 , 130 , 131 , 132 , 133 , 134 , 135 , + 136 , 137 , 138 , 139 , 140 , 141 , 142 , 143 , + 144 , 145 , 146 , 147 , 148 , 149 , 150 , 151 , + 152 , 153 , 154 , 155 , 156 , 157 , 158 , 159 , + 160 , 161 , 162 , 163 , 164 , 165 , 166 , 167 , + 168 , 169 , 170 , 171 , 172 , 173 , 174 , 175 , + 176 , 177 , 178 , 179 , 180 , 181 , 182 , 183 , + 184 , 185 , 186 , 187 , 188 , 189 , 190 , 191 , + 192 , 193 , 194 , 195 , 196 , 197 , 198 , 199 , + 200 , 201 , 202 , 203 , 204 , 205 , 206 , 207 , + 208 , 209 , 210 , 211 , 212 , 213 , 214 , 215 , + 216 , 217 , 218 , 219 , 220 , 221 , 222 , 223 , + 224 , 225 , 226 , 227 , 228 , 229 , 230 , 231 , + 232 , 233 , 234 , 235 , 236 , 237 , 238 , 239 , + 240 , 241 , 242 , 243 , 244 , 245 , 246 , 247 , + 248 , 249 , 250 , 251 , 252 , 253 , 254 , 255 , + +#define HUFFDEC_NUM_BITS_FOR_EXTRA_LENGTH_BITS 3 +#define HUFFDEC_MAX_EXTRA_LENGTH_BITS (((u32)1 << HUFFDEC_NUM_BITS_FOR_EXTRA_LENGTH_BITS) - 1) +#define HUFFDEC_EXTRA_LENGTH_BITS_SHIFT (HUFFDEC_LEN_SHIFT - HUFFDEC_NUM_BITS_FOR_EXTRA_LENGTH_BITS) +#define HUFFDEC_LENGTH_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_LENGTH_BITS_SHIFT) - 1) +#define HUFFDEC_END_OF_BLOCK_LENGTH 0 + +#define ENTRY(length_base, num_extra_bits) \ + (256 + (length_base) + ((num_extra_bits) << HUFFDEC_EXTRA_LENGTH_BITS_SHIFT)) + + /* End of block */ + ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0), + + /* Match length data */ + ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), + ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), + ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), + ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), + ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), + ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), + ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), + ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , +#undef ENTRY 
+}; + +/* + * Data needed to initialize the entries in the offset decode table. + */ +static const u32 deflate_offset_symbol_data[DEFLATE_NUM_OFFSET_SYMS] = { + +#define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16 +#define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1) + +#define ENTRY(offset_base, num_extra_bits) \ + ((offset_base) | ((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT)) + ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , + ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , + ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , + ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , + ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , + ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , + ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , + ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) , +#undef ENTRY +}; + +/* Construct a direct decode table entry (not a tree node) */ +static inline u32 +make_direct_entry(u32 value, u32 length) +{ + return (length << HUFFDEC_LEN_SHIFT) | value; +} + +/* + * The following functions define the way entries are created for each decode + * table. Note that these will all be inlined into build_decode_table(), which + * will itself be inlined for each decode table. This is important for + * performance because the make_*_entry() functions get called from the inner + * loop of build_decode_table(). + */ + +static inline u32 +make_litlen_direct_entry(unsigned symbol, unsigned codeword_length, + unsigned *extra_mask_ret) +{ + u32 entry_value = deflate_litlen_symbol_data[symbol]; + u32 entry_length = codeword_length; + unsigned length_bits; + u32 length_base; + + BUILD_BUG_ON(DEFLATE_MAX_EXTRA_LENGTH_BITS > + HUFFDEC_MAX_EXTRA_LENGTH_BITS); + + if (symbol >= 256) { + /* Match, not a literal. (This can also be the special + * end-of-block symbol, which we handle identically.) */ + entry_value -= 256; + length_bits = entry_value >> HUFFDEC_EXTRA_LENGTH_BITS_SHIFT; + length_base = entry_value & HUFFDEC_LENGTH_BASE_MASK; + if (codeword_length + length_bits <= DEFLATE_LITLEN_TABLEBITS) { + /* TABLEBITS is enough to decode the length slot as well + * as all the extra length bits. So store the full + * length in the decode table entry. + * + * Note that a length slot may be used for multiple + * lengths, and multiple decode table entries may map to + * the same length; hence the need for the 'extra_mask', + * which allows build_decode_table() to cycle through + * the lengths that use this length slot. */ + entry_value = length_base; + entry_length += length_bits; + *extra_mask_ret = (1U << length_bits) - 1; + } else { + /* TABLEBITS isn't enough to decode all the extra length + * bits. The decoder will have to decode the extra bits + * separately. This is the less common case. 
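+			 *
+			 * For example, with a hypothetical TABLEBITS of 10:
+			 * litlen symbol 284 (length base 227, 5 extra bits)
+			 * reached by a 9-bit codeword needs 9 + 5 = 14 > 10
+			 * bits, so it takes this path and only the length
+			 * base and extra-bit count are stored.  By contrast,
+			 * symbol 257 (base 3, 0 extra bits) with a 7-bit
+			 * codeword takes the path above and stores the full
+			 * length 3 with extra_mask == 0.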
*/ + entry_value |= HUFFDEC_NOT_FULL_LENGTH; + } + entry_value |= HUFFDEC_NOT_LITERAL; + } + + return make_direct_entry(entry_value, entry_length); +} + +static inline u32 +make_litlen_leaf_entry(unsigned sym) +{ + return deflate_litlen_symbol_data[sym]; +} + +static inline u32 +make_offset_direct_entry(unsigned sym, unsigned codeword_len, unsigned *extra_mask_ret) +{ + return make_direct_entry(deflate_offset_symbol_data[sym], codeword_len); +} + +static inline u32 +make_offset_leaf_entry(unsigned sym) +{ + return deflate_offset_symbol_data[sym]; +} + +static inline u32 +make_pre_direct_entry(unsigned sym, unsigned codeword_len, unsigned *extra_mask_ret) +{ + return make_direct_entry(sym, codeword_len); +} + +static inline u32 +make_pre_leaf_entry(unsigned sym) +{ + return sym; +} + +/* + * Build a table for fast Huffman decoding, using bit-reversed codewords. + * + * The Huffman code is assumed to be in canonical form and is specified by its + * codeword lengths only. + * + * @decode_table + * A table with ((1 << table_bits) + (2 * num_syms)) entries. The format + * of the table has been described in previous comments. + * @lens + * Lengths of the Huffman codewords. 'lens[sym]' specifies the length, in + * bits, of the codeword for symbol 'sym'. If a symbol is not used in the + * code, its length must be specified as 0. It is valid for this parameter + * to alias @decode_table because nothing gets written to @decode_table + * until all information in @lens has been consumed. + * @num_syms + * Number of symbols in the code. + * @make_direct_entry + * Function to create a direct decode table entry, given the symbol and + * codeword length. + * @make_leaf_entry + * Function to create a tree decode table entry, at a tree leaf, given the + * symbol. + * @table_bits + * log base 2 of the size of the direct lookup portion of the decode table. + * @max_codeword_len + * Maximum allowed codeword length for this Huffman code. + * + * Returns %true if successful; %false if the codeword lengths do not form a + * valid Huffman code. + */ +static inline bool +build_decode_table(u32 decode_table[], + const len_t lens[], + const unsigned num_syms, + u32 (*make_direct_entry)(unsigned, unsigned, unsigned *), + u32 (*make_leaf_entry)(unsigned), + const unsigned table_bits, + const unsigned max_codeword_len) +{ + unsigned len_counts[max_codeword_len + 1]; + unsigned offsets[max_codeword_len + 1]; + unsigned sorted_syms[num_syms]; + unsigned sym; + unsigned len; + s32 remainder; + unsigned sym_idx; + unsigned codeword_reversed; + unsigned codeword_len; + unsigned loop_count; + + /* Preconditions */ + assert(table_bits > 0); + assert(table_bits <= max_codeword_len); + assert(max_codeword_len <= HUFFDEC_MAX_LEN); + for (sym = 0; sym < num_syms; sym++) + assert(lens[sym] <= max_codeword_len); + + /* Count how many symbols have each codeword length. */ + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + for (sym = 0; sym < num_syms; sym++) + len_counts[lens[sym]]++; + + /* We guarantee that all lengths are <= max_codeword_len, but we cannot + * assume they form a valid prefix code. A codeword of length n should + * require a proportion of the codespace equaling (1/2)^n. The code is + * valid if and only if the codespace is exactly filled by the lengths + * by this measure. 
*/ + remainder = 1; + for (len = 1; len <= max_codeword_len; len++) { + remainder <<= 1; + remainder -= len_counts[len]; + if (unlikely(remainder < 0)) { + /* The lengths overflow the codespace; that is, the code + * is over-subscribed. */ + return false; + } + } + + if (unlikely(remainder != 0)) { + /* The lengths do not fill the codespace; that is, they form an + * incomplete set. */ + if (remainder == (1U << max_codeword_len)) { + /* The code is completely empty. By definition, no + * symbols can be decoded with an empty code. + * Consequently, we technically don't even need to fill + * in the decode table. However, to avoid accessing + * uninitialized memory if the algorithm nevertheless + * attempts to decode symbols using such a code, we fill + * the decode table with default values. */ + unsigned dummy; + for (unsigned i = 0; i < (1U << table_bits); i++) + decode_table[i] = (*make_direct_entry)(0, 1, &dummy); + return true; + } + return false; + } + + /* Sort the symbols primarily by length and secondarily by symbol value. + */ + + /* Initialize 'offsets' so that offsets[len] is the number of codewords + * shorter than 'len' bits. */ + offsets[0] = 0; + for (len = 0; len < max_codeword_len; len++) + offsets[len + 1] = offsets[len] + len_counts[len]; + + /* Use the 'offsets' array to sort the symbols. */ + for (sym = 0; sym < num_syms; sym++) + sorted_syms[offsets[lens[sym]]++] = sym; + + /* Generate entries for codewords with length <= 'table_bits'. + * Start with codeword length 1 and proceed to longer codewords. */ + sym_idx = offsets[0]; + codeword_reversed = 0; + codeword_len = 1; + loop_count = (1U << (table_bits - codeword_len)); + for (; loop_count != 0; codeword_len++, loop_count >>= 1) { + + const unsigned end_sym_idx = sym_idx + len_counts[codeword_len]; + const unsigned increment = 1U << codeword_len; + + /* Iterate through the symbols that have codewords with length + * 'codeword_len'. Since the code is assumed to be canonical, + * we can generate the codewords by iterating in symbol order + * and incrementing the current codeword by 1 each time. */ + + for (; sym_idx < end_sym_idx; sym_idx++) { + unsigned sym; + u32 entry; + unsigned extra_mask; + unsigned extra; + unsigned i; + unsigned n; + unsigned bit; + + sym = sorted_syms[sym_idx]; + extra_mask = 0; + entry = (*make_direct_entry)(sym, codeword_len, &extra_mask); + extra = 0; + i = codeword_reversed; + n = loop_count; + do { + decode_table[i] = entry + extra; + extra = (extra + 1) & extra_mask; + i += increment; + } while (--n); + + /* Increment the codeword by 1. Since DEFLATE requires + * bit-reversed codewords, we must manipulate bits + * ourselves. */ + bit = 1U << (codeword_len - 1); + while (codeword_reversed & bit) + bit >>= 1; + codeword_reversed &= bit - 1; + codeword_reversed |= bit; + } + } + + /* If we've filled in the entire table, we are done. Otherwise, there + * are codewords longer than 'table_bits' for which we must generate + * binary trees. */ + if (max_codeword_len > table_bits && + offsets[table_bits] != offsets[max_codeword_len]) + { + unsigned i; + unsigned bit; + unsigned next_free_slot; + + /* First, zero out the remaining entries. This is necessary so + * that those entries appear as "unallocated" in the next part. + * Each of these entries will eventually be filled with the + * representation of the root node of a binary tree. 
*/ + + i = (1U << table_bits) - 1; /* All 1 bits */ + for (;;) { + decode_table[i] = 0; + + if (i == codeword_reversed) + break; + + /* Subtract 1 from the bit-reversed index. */ + bit = 1U << table_bits; + do { + bit >>= 1; + i ^= bit; + } while (i & bit); + } + + /* We allocate child nodes starting at the end of the direct + * lookup table. Note that there should be 2*num_syms extra + * entries for this purpose, although fewer than this may + * actually be needed. */ + next_free_slot = 1U << table_bits; + + for (; codeword_len <= max_codeword_len; codeword_len++) { + + const unsigned end_sym_idx = sym_idx + len_counts[codeword_len]; + + for (; sym_idx < end_sym_idx; sym_idx++) { + + unsigned shift = table_bits; + unsigned node_idx = codeword_reversed & ((1U << table_bits) - 1); + + /* Go through each bit of the current codeword + * beyond the prefix of length @table_bits and + * walk the appropriate binary tree, allocating + * any slots that have not yet been allocated. + * + * Note that the 'pointer' entry to the binary + * tree, which is stored in the direct lookup + * portion of the table, is represented + * identically to other internal (non-leaf) + * nodes of the binary tree; it can be thought + * of as simply the root of the tree. The + * representation of these internal nodes is + * simply the index of the left child combined + * with special flags to distingush the entry + * from direct mapping and leaf node entries. + */ + do { + + /* At least one bit remains in the + * codeword, but the current node is + * unallocated. Allocate it as an + * internal tree node. */ + if (decode_table[node_idx] == 0) { + decode_table[node_idx] = + next_free_slot | + HUFFDEC_TREE_NONLEAF_FLAGS; + decode_table[next_free_slot++] = 0; + decode_table[next_free_slot++] = 0; + } + + /* Go to the left child if the next bit + * in the codeword is 0; otherwise go to + * the right child. */ + node_idx = decode_table[node_idx] & + ~HUFFDEC_TREE_NONLEAF_FLAGS; + node_idx += (codeword_reversed >> shift) & 1; + shift += 1; + } while (shift != codeword_len); + + /* Generate the leaf node, which contains the + * real decode table entry. */ + decode_table[node_idx] = + (*make_leaf_entry)(sorted_syms[sym_idx]); + + /* Increment the codeword by 1. Since DEFLATE + * requires bit-reversed codewords, we must + * manipulate bits ourselves. */ + bit = 1U << (codeword_len - 1); + while (codeword_reversed & bit) + bit >>= 1; + codeword_reversed &= bit - 1; + codeword_reversed |= bit; + } + } + } + return true; +} + +/* Build the decode table for the precode. */ +static bool +build_precode_decode_table(struct deflate_decompressor *d) +{ + return build_decode_table(d->precode_decode_table, + d->precode_lens, + DEFLATE_NUM_PRECODE_SYMS, + make_pre_direct_entry, + make_pre_leaf_entry, + DEFLATE_PRECODE_TABLEBITS, + DEFLATE_MAX_PRE_CODEWORD_LEN); +} + +/* Build the decode table for the literal/length code. */ +static bool +build_litlen_decode_table(struct deflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + return build_decode_table(d->litlen_decode_table, + d->lens, + num_litlen_syms, + make_litlen_direct_entry, + make_litlen_leaf_entry, + DEFLATE_LITLEN_TABLEBITS, + DEFLATE_MAX_LITLEN_CODEWORD_LEN); +} + +/* Build the decode table for the offset code. 
*/ +static bool +build_offset_decode_table(struct deflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + return build_decode_table(d->offset_decode_table, + d->lens + num_litlen_syms, + num_offset_syms, + make_offset_direct_entry, + make_offset_leaf_entry, + DEFLATE_OFFSET_TABLEBITS, + DEFLATE_MAX_OFFSET_CODEWORD_LEN); +} + +static inline machine_word_t +repeat_byte(u8 b) +{ + machine_word_t v; + + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + + v = b; + v |= v << 8; + v |= v << 16; + v |= v << ((WORDSIZE == 8) ? 32 : 0); + return v; +} + +static inline void +copy_word_unaligned(const void *src, void *dst) +{ + store_word_unaligned(load_word_unaligned(src), dst); +} + +/* + * Copy an LZ77 match at (dst - offset) to dst. + * + * The length and offset must be already validated --- that is, (dst - offset) + * can't underrun the output buffer, and (dst + length) can't overrun the output + * buffer. Also, the length cannot be 0. + * + * @winend points to the byte past the end of the output buffer. + * This function won't write any data beyond this position. + */ +static inline void +lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend, u32 min_length) +{ + const u8 *src = dst - offset; + const u8 * const end = dst + length; + + /* + * Try to copy one machine word at a time. On i386 and x86_64 this is + * faster than copying one byte at a time, unless the data is + * near-random and all the matches have very short lengths. Note that + * since this requires unaligned memory accesses, it won't necessarily + * be faster on every architecture. + * + * Also note that we might copy more than the length of the match. For + * example, if a word is 8 bytes and the match is of length 5, then + * we'll simply copy 8 bytes. This is okay as long as we don't write + * beyond the end of the output buffer, hence the check for (winend - + * end >= WORDSIZE - 1). + */ + if (UNALIGNED_ACCESS_IS_VERY_FAST && + likely(winend - end >= WORDSIZE - 1)) + { + + if (offset >= WORDSIZE) { + /* The source and destination words don't overlap. */ + + /* To improve branch prediction, one iteration of this + * loop is unrolled. Most matches are short and will + * fail the first check. But if that check passes, then + * it becomes increasing likely that the match is long + * and we'll need to continue copying. */ + + copy_word_unaligned(src, dst); + src += WORDSIZE; + dst += WORDSIZE; + + if (dst < end) { + do { + copy_word_unaligned(src, dst); + src += WORDSIZE; + dst += WORDSIZE; + } while (dst < end); + } + return; + } else if (offset == 1) { + + /* Offset 1 matches are equivalent to run-length + * encoding of the previous byte. This case is common + * if the data contains many repeated bytes. */ + + machine_word_t v = repeat_byte(*(dst - 1)); + do { + store_word_unaligned(v, dst); + src += WORDSIZE; + dst += WORDSIZE; + } while (dst < end); + return; + } + /* + * We don't bother with special cases for other 'offset < + * WORDSIZE', which are usually rarer than 'offset == 1'. Extra + * checks will just slow things down. Actually, it's possible + * to handle all the 'offset < WORDSIZE' cases using the same + * code, but it still becomes more complicated doesn't seem any + * faster overall; it definitely slows down the more common + * 'offset == 1' case. + */ + } + + /* Fall back to a bytewise copy. 
*/ + + if (min_length >= 2) { + *dst++ = *src++; + length--; + } + if (min_length >= 3) { + *dst++ = *src++; + length--; + } + if (min_length >= 4) { + *dst++ = *src++; + length--; + } + do { + *dst++ = *src++; + } while (--length); +} + +/***************************************************************************** + * Main decompression routine + *****************************************************************************/ + +/* + * This is the main DEFLATE decompression routine. It decompresses 'in_nbytes' + * bytes of compressed data from the buffer 'in' and writes the uncompressed + * data to the buffer 'out'. The caller must know the exact length of the + * uncompressed data and pass it as 'out_nbytes'. The return value is %true if + * and only if decompression was successful. A return value of %false indicates + * that either the compressed data is invalid or it does not decompress to + * exactly 'out_nbytes' bytes of uncompressed data. + */ +LIBEXPORT bool +deflate_decompress(struct deflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes) +{ + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes; + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + bitbuf_t bitbuf = 0; + unsigned bitsleft = 0; + size_t overrun_count = 0; + unsigned i; + unsigned is_final_block; + unsigned block_type; + u16 len; + u16 nlen; + unsigned num_litlen_syms; + unsigned num_offset_syms; + +next_block: + /* Starting to read the next block. */ + + BUILD_BUG_ON(!CAN_ENSURE(1 + 2 + 5 + 5 + 4)); + ENSURE_BITS(1 + 2 + 5 + 5 + 4); + + /* BFINAL: 1 bit */ + is_final_block = POP_BITS(1); + + /* BTYPE: 2 bits */ + block_type = POP_BITS(2); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block. */ + + /* The order in which precode lengths are stored. */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + + /* Read the codeword length counts. */ + + BUILD_BUG_ON(DEFLATE_NUM_LITLEN_SYMS != ((1 << 5) - 1) + 257); + num_litlen_syms = POP_BITS(5) + 257; + + BUILD_BUG_ON(DEFLATE_NUM_OFFSET_SYMS != ((1 << 5) - 1) + 1); + num_offset_syms = POP_BITS(5) + 1; + + BUILD_BUG_ON(DEFLATE_NUM_PRECODE_SYMS != ((1 << 4) - 1) + 4); + num_explicit_precode_lens = POP_BITS(4) + 4; + + /* Read the precode codeword lengths. */ + BUILD_BUG_ON(DEFLATE_MAX_PRE_CODEWORD_LEN != (1 << 3) - 1); + if (CAN_ENSURE(DEFLATE_NUM_PRECODE_SYMS * 3)) { + + ENSURE_BITS(DEFLATE_NUM_PRECODE_SYMS * 3); + + for (i = 0; i < num_explicit_precode_lens; i++) + d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); + } else { + for (i = 0; i < num_explicit_precode_lens; i++) { + ENSURE_BITS(3); + d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); + } + } + + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. */ + if (!build_precode_decode_table(d)) + return false; + + /* Expand the literal/length and offset codeword lengths. */ + for (i = 0; i < num_litlen_syms + num_offset_syms; ) { + u32 entry; + unsigned presym; + u8 rep_val; + unsigned rep_count; + + ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7); + + /* (The code below assumes there are no binary trees in + * the decode table.) 
*/ + BUILD_BUG_ON(DEFLATE_PRECODE_TABLEBITS != DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Read the next precode symbol. */ + entry = d->precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + REMOVE_BITS(entry >> HUFFDEC_LEN_SHIFT); + presym = entry & HUFFDEC_VALUE_MASK; + + if (presym < 16) { + /* Explicit codeword length */ + d->lens[i++] = presym; + continue; + } + + /* Run-length encoded codeword lengths */ + + /* Note: we don't need verify that the repeat count + * doesn't overflow the number of elements, since we + * have enough extra spaces to allow for the worst-case + * overflow (138 zeroes when only 1 length was + * remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + BUILD_BUG_ON(DEFLATE_MAX_LENS_OVERRUN != 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times */ + if (SAFETY_CHECK(i == 0)) + return false; + rep_val = d->lens[i - 1]; + BUILD_BUG_ON(3 + ((1 << 2) - 1) != 6); + rep_count = 3 + POP_BITS(2); + d->lens[i + 0] = rep_val; + d->lens[i + 1] = rep_val; + d->lens[i + 2] = rep_val; + d->lens[i + 3] = rep_val; + d->lens[i + 4] = rep_val; + d->lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times */ + BUILD_BUG_ON(3 + ((1 << 3) - 1) != 10); + rep_count = 3 + POP_BITS(3); + d->lens[i + 0] = 0; + d->lens[i + 1] = 0; + d->lens[i + 2] = 0; + d->lens[i + 3] = 0; + d->lens[i + 4] = 0; + d->lens[i + 5] = 0; + d->lens[i + 6] = 0; + d->lens[i + 7] = 0; + d->lens[i + 8] = 0; + d->lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times */ + BUILD_BUG_ON(11 + ((1 << 7) - 1) != 138); + rep_count = 11 + POP_BITS(7); + memset(&d->lens[i], 0, rep_count * sizeof(d->lens[i])); + i += rep_count; + } + } + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + + /* Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. */ + + ALIGN_INPUT(); + + if (SAFETY_CHECK(in_end - in_next < 4)) + return false; + + len = READ_U16(); + nlen = READ_U16(); + + if (SAFETY_CHECK(len != (u16)~nlen)) + return false; + + if (SAFETY_CHECK(len > out_end - out_next)) + return false; + + if (SAFETY_CHECK(len > in_end - in_next)) + return false; + + memcpy(out_next, in_next, len); + in_next += len; + out_next += len; + + goto block_done; + + } else if (block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN) { + + /* Static Huffman block: set the static Huffman codeword + * lengths. Then the remainder is the same as decompressing a + * dynamic Huffman block. */ + + BUILD_BUG_ON(DEFLATE_NUM_LITLEN_SYMS != 288); + BUILD_BUG_ON(DEFLATE_NUM_OFFSET_SYMS != 32); + + for (i = 0; i < 144; i++) + d->lens[i] = 8; + for (; i < 256; i++) + d->lens[i] = 9; + for (; i < 280; i++) + d->lens[i] = 7; + for (; i < 288; i++) + d->lens[i] = 8; + + for (; i < 288 + 32; i++) + d->lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + + } else { + /* Reserved block type. 
*/ + return false; + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + if (!build_offset_decode_table(d, num_litlen_syms, num_offset_syms)) + return false; + + if (!build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)) + return false; + + /* The main DEFLATE decode loop */ + for (;;) { + u32 entry; + u32 length; + u32 offset; + + /* If our bitbuffer variable is large enough, we load new bits + * only once for each match or literal decoded. This is + * fastest. Otherwise, we may need to load new bits multiple + * times when decoding a match. */ + + BUILD_BUG_ON(!CAN_ENSURE(DEFLATE_MAX_LITLEN_CODEWORD_LEN)); + ENSURE_BITS(MAX_ENSURE); + + /* Read a literal or length. */ + + entry = d->litlen_decode_table[BITS(DEFLATE_LITLEN_TABLEBITS)]; + + if (CAN_ENSURE(DEFLATE_LITLEN_TABLEBITS * 2) && + likely(out_end - out_next >= MAX_ENSURE / DEFLATE_LITLEN_TABLEBITS)) + { + /* Fast path for decoding literals */ + + #define NUM_BITS_TO_ENSURE_AFTER_INLINE_LITERALS \ + ((MAX_ENSURE >= DEFLATE_MAX_MATCH_BITS) ? \ + DEFLATE_MAX_MATCH_BITS : \ + ((MAX_ENSURE >= DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS) ? \ + DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS : \ + DEFLATE_MAX_LITLEN_CODEWORD_LEN)) + + #define INLINE_LITERAL(seq) \ + if (CAN_ENSURE(DEFLATE_LITLEN_TABLEBITS * (seq))) { \ + entry = d->litlen_decode_table[ \ + BITS(DEFLATE_LITLEN_TABLEBITS)];\ + if (entry & HUFFDEC_NOT_LITERAL) { \ + if ((seq) != 1) \ + ENSURE_BITS(NUM_BITS_TO_ENSURE_AFTER_INLINE_LITERALS); \ + goto not_literal; \ + } \ + REMOVE_BITS(entry >> HUFFDEC_LEN_SHIFT); \ + *out_next++ = entry; \ + } + + INLINE_LITERAL(1); + INLINE_LITERAL(2); + INLINE_LITERAL(3); + INLINE_LITERAL(4); + INLINE_LITERAL(5); + INLINE_LITERAL(6); + INLINE_LITERAL(7); + INLINE_LITERAL(8); + continue; + } + + if (!(entry & HUFFDEC_NOT_LITERAL)) { + REMOVE_BITS(entry >> HUFFDEC_LEN_SHIFT); + if (SAFETY_CHECK(out_next == out_end)) + return false; + *out_next++ = entry; + continue; + } + not_literal: + if (likely(!(entry & HUFFDEC_NOT_FULL_LENGTH))) { + + /* The next TABLEBITS bits were enough to directly look + * up a litlen symbol, which was a length slot. In + * addition, the full match length, including the extra + * bits, fit into TABLEBITS. So the result of the + * lookup was the full match length. + * + * On typical data, most match lengths are short enough + * to fall into this category. */ + + REMOVE_BITS((entry >> HUFFDEC_LEN_SHIFT) & HUFFDEC_LEN_MASK); + length = entry & HUFFDEC_VALUE_MASK; + + } else if (!(entry & HUFFDEC_TREE_NONLEAF)) { + + /* The next TABLEBITS bits were enough to directly look + * up a litlen symbol, which was either a length slot or + * end-of-block. However, the full match length, + * including the extra bits (0 in the case of + * end-of-block), requires more than TABLEBITS bits to + * decode. So the result of the lookup was the length + * base and number of extra length bits. We will read + * this number of extra length bits and add them to the + * length base in order to construct the full length. + * + * On typical data, this case is rare. 
*/ + + REMOVE_BITS((entry >> HUFFDEC_LEN_SHIFT) & HUFFDEC_LEN_MASK); + entry &= HUFFDEC_VALUE_MASK; + + if (!CAN_ENSURE(DEFLATE_MAX_LITLEN_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_LENGTH_BITS)) + ENSURE_BITS(DEFLATE_MAX_EXTRA_LENGTH_BITS); + + length = (entry & HUFFDEC_LENGTH_BASE_MASK) + + POP_BITS(entry >> HUFFDEC_EXTRA_LENGTH_BITS_SHIFT); + } else { + + /* The next TABLEBITS bits were not enough to directly + * look up a litlen symbol. Therefore, we must walk the + * appropriate binary tree to decode the symbol, which + * may be a literal, length slot, or end-of-block. + * + * On typical data, this case is rare. */ + + REMOVE_BITS(DEFLATE_LITLEN_TABLEBITS); + do { + entry &= ~HUFFDEC_TREE_NONLEAF_FLAGS; + entry += POP_BITS(1); + entry = d->litlen_decode_table[entry]; + } while (entry & HUFFDEC_TREE_NONLEAF_FAST_FLAG); + if (entry < 256) { + if (SAFETY_CHECK(out_next == out_end)) + return false; + *out_next++ = entry; + continue; + } + entry -= 256; + + if (!CAN_ENSURE(DEFLATE_MAX_LITLEN_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_LENGTH_BITS)) + ENSURE_BITS(DEFLATE_MAX_EXTRA_LENGTH_BITS); + + length = (entry & HUFFDEC_LENGTH_BASE_MASK) + + POP_BITS(entry >> HUFFDEC_EXTRA_LENGTH_BITS_SHIFT); + } + + /* The match destination must not end after the end of the + * output buffer. */ + if (SAFETY_CHECK(length > out_end - out_next)) + return false; + + if (unlikely(length == HUFFDEC_END_OF_BLOCK_LENGTH)) + goto block_done; + + /* Read the match offset. */ + + if (!CAN_ENSURE(DEFLATE_MAX_MATCH_BITS)) { + if (CAN_ENSURE(DEFLATE_MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS)) + ENSURE_BITS(DEFLATE_MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS); + else + ENSURE_BITS(DEFLATE_MAX_OFFSET_CODEWORD_LEN); + } + + entry = d->offset_decode_table[BITS(DEFLATE_OFFSET_TABLEBITS)]; + if (likely(!(entry & HUFFDEC_TREE_NONLEAF_FAST_FLAG))) { + REMOVE_BITS(entry >> HUFFDEC_LEN_SHIFT); + entry &= HUFFDEC_VALUE_MASK; + } else { + REMOVE_BITS(DEFLATE_OFFSET_TABLEBITS); + do { + entry &= ~HUFFDEC_TREE_NONLEAF_FLAGS; + entry += POP_BITS(1); + entry = d->offset_decode_table[entry]; + } while (entry & HUFFDEC_TREE_NONLEAF_FAST_FLAG); + } + + /* The value we have here isn't the offset symbol itself, but + * rather the offset symbol indexed into + * deflate_offset_symbol_data[]. This gives us the offset base + * and number of extra offset bits without having to index + * additional tables in the main decode loop. */ + + if (!CAN_ENSURE(DEFLATE_MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS)) + ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS); + + offset = (entry & HUFFDEC_OFFSET_BASE_MASK) + + POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT); + + /* The match source must not begin before the beginning of the + * output buffer. */ + if (SAFETY_CHECK(offset > out_next - (const u8 *)out)) + return false; + + /* Copy the match: + * 'length' bytes at 'out_next - offset' to 'out_next'. */ + + lz_copy(out_next, length, offset, out_end, DEFLATE_MIN_MATCH_LEN); + + out_next += length; + } + +block_done: + /* Finished decoding a block. */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. Return %true if we got all the output we + * expected, otherwise %false. 
*/ + return (out_next == out_end); +} + +LIBEXPORT struct deflate_decompressor * +deflate_alloc_decompressor(void) +{ + return malloc(sizeof(struct deflate_decompressor)); +} + +LIBEXPORT void +deflate_free_decompressor(struct deflate_decompressor *d) +{ + free(d); +} diff --git a/src/endianness.h b/src/endianness.h new file mode 100644 index 0000000..41cfdf6 --- /dev/null +++ b/src/endianness.h @@ -0,0 +1,75 @@ +/* + * endianness.h + * + * Inline functions for endianness conversion. + */ + +#pragma once + +#include "compiler.h" +#include "types.h" + +static inline u16 bswap16(u16 n) +{ +#ifdef compiler_bswap16 + return compiler_bswap16(n); +#else + return (n << 8) | (n >> 8); +#endif +} + +static inline u32 bswap32(u32 n) +{ +#ifdef compiler_bswap32 + return compiler_bswap32(n); +#else + return (n << 24) | + ((n & 0xFF00) << 8) | + ((n & 0xFF0000) >> 8) | + (n >> 24); +#endif +} + +static inline u64 bswap64(u64 n) +{ +#ifdef compiler_bswap64 + return compiler_bswap64(n); +#else + return (n << 56) | + ((n & 0xFF00) << 40) | + ((n & 0xFF0000) << 24) | + ((n & 0xFF000000) << 8) | + ((n & 0xFF00000000) >> 8) | + ((n & 0xFF0000000000) >> 24) | + ((n & 0xFF000000000000) >> 40) | + (n >> 56); +#endif +} + +#if CPU_IS_BIG_ENDIAN +# define cpu_to_le16(n) bswap16(n) +# define cpu_to_le32(n) bswap32(n) +# define cpu_to_le64(n) bswap64(n) +# define le16_to_cpu(n) bswap16(n) +# define le32_to_cpu(n) bswap32(n) +# define le64_to_cpu(n) bswap64(n) +# define cpu_to_be16(n) (n) +# define cpu_to_be32(n) (n) +# define cpu_to_be64(n) (n) +# define be16_to_cpu(n) (n) +# define be32_to_cpu(n) (n) +# define be64_to_cpu(n) (n) +#else +# define cpu_to_le16(n) (n) +# define cpu_to_le32(n) (n) +# define cpu_to_le64(n) (n) +# define le16_to_cpu(n) (n) +# define le32_to_cpu(n) (n) +# define le64_to_cpu(n) (n) +# define cpu_to_be16(n) bswap16(n) +# define cpu_to_be32(n) bswap32(n) +# define cpu_to_be64(n) bswap64(n) +# define be16_to_cpu(n) bswap16(n) +# define be32_to_cpu(n) bswap32(n) +# define be64_to_cpu(n) bswap64(n) +#endif diff --git a/src/gzip_compress.c b/src/gzip_compress.c new file mode 100644 index 0000000..eea49f3 --- /dev/null +++ b/src/gzip_compress.c @@ -0,0 +1,64 @@ +/* + * gzip_compress.c + * + * Generate DEFLATE-compressed data in the gzip wrapper format. 
+ */ + +#include "libdeflate.h" + +#include "crc32.h" +#include "deflate_compress.h" +#include "gzip_constants.h" +#include "unaligned.h" + +LIBEXPORT size_t +gzip_compress(struct deflate_compressor *c, const void *in, size_t in_size, + void *out, size_t out_nbytes_avail) +{ + u8 *out_next = out; + unsigned compression_level; + u8 xfl; + size_t deflate_size; + + if (out_nbytes_avail <= GZIP_MIN_OVERHEAD) + return 0; + + /* ID1 */ + *out_next++ = GZIP_ID1; + /* ID2 */ + *out_next++ = GZIP_ID2; + /* CM */ + *out_next++ = GZIP_CM_DEFLATE; + /* FLG */ + *out_next++ = 0; + /* MTIME */ + put_unaligned_u32_be(GZIP_MTIME_UNAVAILABLE, out_next); + out_next += 4; + /* XFL */ + xfl = 0; + compression_level = deflate_get_compression_level(c); + if (compression_level < 2) + xfl |= GZIP_XFL_FASTEST_COMRESSION; + else if (compression_level >= 8) + xfl |= GZIP_XFL_SLOWEST_COMRESSION; + *out_next++ = xfl; + /* OS */ + *out_next++ = GZIP_OS_UNKNOWN; /* OS */ + + /* Compressed data */ + deflate_size = deflate_compress(c, in, in_size, out_next, + out_nbytes_avail - GZIP_MIN_OVERHEAD); + if (deflate_size == 0) + return 0; + out_next += deflate_size; + + /* CRC32 */ + put_unaligned_u32_be(crc32(in, in_size), out_next); + out_next += 4; + + /* ISIZE */ + put_unaligned_u32_be(in_size, out_next); + out_next += 4; + + return out_next - (u8 *)out; +} diff --git a/src/gzip_constants.h b/src/gzip_constants.h new file mode 100644 index 0000000..edd092a --- /dev/null +++ b/src/gzip_constants.h @@ -0,0 +1,47 @@ +/* + * gzip_constants.h + * + * Constants for the gzip wrapper format. + */ + +#pragma once + +#include "compiler.h" + +#define GZIP_MIN_HEADER_SIZE 10 +#define GZIP_FOOTER_SIZE 8 +#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) + +#define GZIP_ID1 0x1F +#define GZIP_ID2 0x8B + +#define GZIP_CM_DEFLATE 8 + +#define GZIP_FTEXT 0x01 +#define GZIP_FHCRC 0x02 +#define GZIP_FEXTRA 0x04 +#define GZIP_FNAME 0x08 +#define GZIP_FCOMMENT 0x10 +#define GZIP_FRESERVED 0xE0 + +#define GZIP_MTIME_UNAVAILABLE 0 + +#define GZIP_XFL_SLOWEST_COMRESSION 0x02 +#define GZIP_XFL_FASTEST_COMRESSION 0x04 +#define GZIP_XFL_RESERVED 0xF9 + +#define GZIP_OS_FAT 0 +#define GZIP_OS_AMIGA 1 +#define GZIP_OS_VMS 2 +#define GZIP_OS_UNIX 3 +#define GZIP_OS_VM_CMS 4 +#define GZIP_OS_ATARI_TOS 5 +#define GZIP_OS_HPFS 6 +#define GZIP_OS_MACINTOSH 7 +#define GZIP_OS_Z_SYSTEM 8 +#define GZIP_OS_CP_M 9 +#define GZIP_OS_TOPS_20 10 +#define GZIP_OS_NTFS 11 +#define GZIP_OS_QDOS 12 +#define GZIP_OS_RISCOS 13 +#define GZIP_OS_UNKNOWN 255 diff --git a/src/gzip_decompress.c b/src/gzip_decompress.c new file mode 100644 index 0000000..55fd6ed --- /dev/null +++ b/src/gzip_decompress.c @@ -0,0 +1,100 @@ +/* + * gzip_decompress.c + * + * Decompress DEFLATE-compressed data wrapped in the gzip format. 
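+ *
+ * The gzip member layout handled below is, in order: ID1, ID2, CM, FLG, a
+ * 4-byte MTIME, XFL, OS, then any optional FEXTRA / FNAME / FCOMMENT / FHCRC
+ * fields selected by FLG, the DEFLATE stream itself, and finally an 8-byte
+ * footer holding the CRC-32 and uncompressed size (ISIZE) of the data.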
+ */ + +#include "libdeflate.h" + +#include "crc32.h" +#include "gzip_constants.h" +#include "unaligned.h" + +LIBEXPORT bool +gzip_decompress(struct deflate_decompressor *d, + const void *in, size_t in_nbytes, void *out, size_t out_nbytes) +{ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + u8 flg; + + if (in_nbytes < GZIP_MIN_OVERHEAD) + return false; + + /* ID1 */ + if (*in_next++ != GZIP_ID1) + return false; + /* ID2 */ + if (*in_next++ != GZIP_ID2) + return false; + /* CM */ + if (*in_next++ != GZIP_CM_DEFLATE) + return false; + flg = *in_next++; + /* MTIME */ + in_next += 4; + /* XFL */ + if (*in_next++ & GZIP_XFL_RESERVED) + return false; + /* OS */ + in_next += 1; + + if (flg & GZIP_FRESERVED) + return false; + + /* Extra field */ + if (flg & GZIP_FEXTRA) { + u16 xlen = get_unaligned_u16_be(in_next); + in_next += 2; + + if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE) + return false; + + in_next += xlen; + } + + /* Original file name (zero terminated) */ + if (flg & GZIP_FNAME) { + while (*in_next != 0 && ++in_next != in_end) + ; + if (in_next != in_end) + in_next++; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return false; + } + + /* File comment (zero terminated) */ + if (flg & GZIP_FCOMMENT) { + while (*in_next != 0 && ++in_next != in_end) + ; + if (in_next != in_end) + in_next++; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return false; + } + + /* CRC16 for gzip header */ + if (flg & GZIP_FHCRC) { + in_next += 2; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return false; + } + + /* Compressed data */ + if (!deflate_decompress(d, in_next, in_end - GZIP_FOOTER_SIZE - in_next, + out, out_nbytes)) + return false; + + in_next = in_end - GZIP_FOOTER_SIZE; + + /* CRC32 */ + if (crc32(out, out_nbytes) != get_unaligned_u32_be(in_next)) + return false; + in_next += 4; + + /* ISIZE */ + if ((u32)out_nbytes != get_unaligned_u32_be(in_next)) + return false; + + return true; +} diff --git a/src/hc_matchfinder.h b/src/hc_matchfinder.h new file mode 100644 index 0000000..67cb746 --- /dev/null +++ b/src/hc_matchfinder.h @@ -0,0 +1,235 @@ +/* + * hc_matchfinder.h + * + * This is a Hash Chain (hc) based matchfinder. + * + * The data structure is a hash table where each hash bucket contains a linked + * list of sequences, referenced by position. + * + * For each sequence (position) in the input, the first 3 bytes are hashed and + * that sequence (position) is prepended to the appropriate linked list in the + * hash table. Since the sequences are inserted in order, each list is always + * sorted by increasing match offset. + * + * At the same time as inserting a sequence, we may search the linked list for + * matches with that sequence. At each step, the length of the match is + * computed. The search ends when the sequences get too far away (outside of + * the sliding window), or when the list ends (in the code this is the same + * check as "too far away"), or when 'max_search_depth' positions have been + * searched, or when a match of at least 'nice_len' bytes has been found. 
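+ *
+ * Concretely, hash_tab[] holds the most recent position inserted into each
+ * bucket, and next_tab[], indexed by position, links each position to the
+ * previous one that hashed to the same bucket; the search below simply
+ * follows these links.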
+ */ + +#pragma once + +#include "lz_extend.h" +#include "lz_hash3.h" +#include "matchfinder_common.h" +#include "unaligned.h" + +#ifndef HC_MATCHFINDER_HASH_ORDER +# if MATCHFINDER_WINDOW_ORDER < 14 +# define HC_MATCHFINDER_HASH_ORDER 14 +# else +# define HC_MATCHFINDER_HASH_ORDER 15 +# endif +#endif + +#define HC_MATCHFINDER_HASH_LENGTH (1UL << HC_MATCHFINDER_HASH_ORDER) + +#define HC_MATCHFINDER_TOTAL_LENGTH \ + (HC_MATCHFINDER_HASH_LENGTH + MATCHFINDER_WINDOW_SIZE) + +struct hc_matchfinder { + union { + pos_t mf_data[HC_MATCHFINDER_TOTAL_LENGTH]; + struct { + pos_t hash_tab[HC_MATCHFINDER_HASH_LENGTH]; + pos_t next_tab[MATCHFINDER_WINDOW_SIZE]; + }; + }; +} _aligned_attribute(MATCHFINDER_ALIGNMENT); + +static inline void +hc_matchfinder_init(struct hc_matchfinder *mf) +{ + matchfinder_init(mf->hash_tab, HC_MATCHFINDER_HASH_LENGTH); +} + +#if MATCHFINDER_IS_SLIDING +static inline void +hc_matchfinder_slide_window(struct hc_matchfinder *mf) +{ + matchfinder_rebase(mf->mf_data, HC_MATCHFINDER_TOTAL_LENGTH); +} +#endif + +/* + * Find the longest match longer than 'best_len'. + * + * @mf + * The matchfinder structure. + * @in_base + * Pointer to the next byte in the input buffer to process _at the last + * time hc_matchfinder_init() or hc_matchfinder_slide_window() was called_. + * @in_next + * Pointer to the next byte in the input buffer to process. This is the + * pointer to the bytes being matched against. + * @best_len + * Require a match at least this long. + * @max_len + * Maximum match length to return. + * @nice_len + * Stop searching if a match of at least this length is found. + * @max_search_depth + * Limit on the number of potential matches to consider. + * @offset_ret + * The match offset is returned here. + * + * Return the length of the match found, or 'best_len' if no match longer than + * 'best_len' was found. + */ +static inline unsigned +hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, + const u8 * const in_base, + const u8 * const in_next, + unsigned best_len, + const unsigned max_len, + const unsigned nice_len, + const unsigned max_search_depth, + unsigned *offset_ret) +{ + unsigned depth_remaining = max_search_depth; + const u8 *best_matchptr = best_matchptr; /* uninitialized */ + const u8 *matchptr; + unsigned len; + unsigned hash; + pos_t cur_match; + u32 first_3_bytes; + + /* Insert the current sequence into the appropriate hash chain. */ + if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES)) + goto out; + first_3_bytes = load_u24_unaligned(in_next); + hash = lz_hash3_u24(first_3_bytes, HC_MATCHFINDER_HASH_ORDER); + cur_match = mf->hash_tab[hash]; + mf->next_tab[in_next - in_base] = cur_match; + mf->hash_tab[hash] = in_next - in_base; + + if (unlikely(best_len >= max_len)) + goto out; + + /* Search the appropriate hash chain for matches. */ + + if (!(matchfinder_match_in_window(cur_match, in_base, in_next))) + goto out; + + if (best_len < 3) { + for (;;) { + /* No length 3 match found yet. + * Check the first 3 bytes. */ + matchptr = &in_base[cur_match]; + + if (load_u24_unaligned(matchptr) == first_3_bytes) + break; + + /* Not a match; keep trying. */ + cur_match = mf->next_tab[ + matchfinder_slot_for_match(cur_match)]; + if (!matchfinder_match_in_window(cur_match, + in_base, in_next)) + goto out; + if (!--depth_remaining) + goto out; + } + + /* Found a length 3 match. 
*/ + best_matchptr = matchptr; + best_len = lz_extend(in_next, best_matchptr, 3, max_len); + if (best_len >= nice_len) + goto out; + cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; + if (!matchfinder_match_in_window(cur_match, in_base, in_next)) + goto out; + if (!--depth_remaining) + goto out; + } + + for (;;) { + for (;;) { + matchptr = &in_base[cur_match]; + + /* Already found a length 3 match. Try for a longer match; + * start by checking the last 2 bytes and the first 4 bytes. */ + #if UNALIGNED_ACCESS_IS_FAST + if ((load_u32_unaligned(matchptr + best_len - 3) == + load_u32_unaligned(in_next + best_len - 3)) && + (load_u32_unaligned(matchptr) == + load_u32_unaligned(in_next))) + #else + if (matchptr[best_len] == in_next[best_len]) + #endif + break; + + cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; + if (!matchfinder_match_in_window(cur_match, in_base, in_next)) + goto out; + if (!--depth_remaining) + goto out; + } + + if (UNALIGNED_ACCESS_IS_FAST) + len = 4; + else + len = 0; + len = lz_extend(in_next, matchptr, len, max_len); + if (len > best_len) { + best_len = len; + best_matchptr = matchptr; + if (best_len >= nice_len) + goto out; + } + cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; + if (!matchfinder_match_in_window(cur_match, in_base, in_next)) + goto out; + if (!--depth_remaining) + goto out; + } +out: + *offset_ret = in_next - best_matchptr; + return best_len; +} + +/* + * Advance the match-finder, but don't search for matches. + * + * @mf + * The matchfinder structure. + * @in_base + * Pointer to the next byte in the input buffer to process _at the last + * time hc_matchfinder_init() or hc_matchfinder_slide_window() was called_. + * @in_next + * Pointer to the next byte in the input buffer to process. + * @in_end + * Pointer to the end of the input buffer. + * @count + * Number of bytes to skip; must be > 0. + */ +static inline void +hc_matchfinder_skip_positions(struct hc_matchfinder * restrict mf, + const u8 *in_base, + const u8 *in_next, + const u8 *in_end, + unsigned count) +{ + unsigned hash; + + if (unlikely(in_next + count >= in_end - LZ_HASH_REQUIRED_NBYTES)) + return; + + do { + hash = lz_hash3(in_next, HC_MATCHFINDER_HASH_ORDER); + mf->next_tab[in_next - in_base] = mf->hash_tab[hash]; + mf->hash_tab[hash] = in_next - in_base; + in_next++; + } while (--count); +} diff --git a/src/lz_extend.h b/src/lz_extend.h new file mode 100644 index 0000000..94b281a --- /dev/null +++ b/src/lz_extend.h @@ -0,0 +1,60 @@ +/* + * lz_extend.h + * + * Fast match extension for Lempel-Ziv matchfinding. + */ + +#pragma once + +#include "bitops.h" +#include "unaligned.h" + +/* + * Return the number of bytes at @matchptr that match the bytes at @strptr, up + * to a maximum of @max_len. Initially, @start_len bytes are matched. 
+ */ +static inline unsigned +lz_extend(const u8 * const strptr, const u8 * const matchptr, + const unsigned start_len, const unsigned max_len) +{ + unsigned len = start_len; + machine_word_t v_word; + + if (UNALIGNED_ACCESS_IS_FAST) { + + if (likely(max_len - len >= 4 * WORDSIZE)) { + + #define COMPARE_WORD_STEP \ + v_word = load_word_unaligned(&matchptr[len]) ^ \ + load_word_unaligned(&strptr[len]); \ + if (v_word != 0) \ + goto word_differs; \ + len += WORDSIZE; \ + + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + #undef COMPARE_WORD_STEP + } + + while (len + WORDSIZE <= max_len) { + v_word = load_word_unaligned(&matchptr[len]) ^ + load_word_unaligned(&strptr[len]); + if (v_word != 0) + goto word_differs; + len += WORDSIZE; + } + } + + while (len < max_len && matchptr[len] == strptr[len]) + len++; + return len; + +word_differs: + if (CPU_IS_LITTLE_ENDIAN) + len += (ffsw(v_word) >> 3); + else + len += (flsw(v_word) >> 3); + return len; +} diff --git a/src/lz_hash3.h b/src/lz_hash3.h new file mode 100644 index 0000000..ec322d9 --- /dev/null +++ b/src/lz_hash3.h @@ -0,0 +1,49 @@ +/* + * lz_hash3.h + * + * 3-byte hashing for Lempel-Ziv matchfinding. + */ + +#pragma once + +#include "unaligned.h" + +static inline u32 +loaded_u32_to_u24(u32 v) +{ + if (CPU_IS_LITTLE_ENDIAN) + return v & 0xFFFFFF; + else + return v >> 8; +} + +static inline u32 +load_u24_unaligned(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return loaded_u32_to_u24(load_u32_unaligned(p)); + else + return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); +} + +static inline u32 +lz_hash3_u24(u32 str, unsigned num_bits) +{ + return (u32)(str * 0x1E35A7BD) >> (32 - num_bits); +} + +/* + * Hash the next 3-byte sequence in the window, producing a hash of length + * 'num_bits' bits. At least LZ_HASH_REQUIRED_NBYTES must be available at 'p'; + * this might be 4 bytes rather than 3 because an unaligned load is faster on + * some architectures. + */ +static inline u32 +lz_hash3(const u8 *p, unsigned num_bits) +{ + return lz_hash3_u24(load_u24_unaligned(p), num_bits); +} + +/* Number of bytes the hash function actually requires be available, due to the + * possibility of an unaligned load. */ +#define LZ_HASH_REQUIRED_NBYTES (UNALIGNED_ACCESS_IS_FAST ? 4 : 3) diff --git a/src/matchfinder_avx2.h b/src/matchfinder_avx2.h new file mode 100644 index 0000000..fe98b63 --- /dev/null +++ b/src/matchfinder_avx2.h @@ -0,0 +1,64 @@ +/* + * matchfinder_avx2.h + * + * Matchfinding routines optimized for Intel AVX2 (Advanced Vector Extensions). 
+ */ + +#include + +static inline bool +matchfinder_init_avx2(pos_t *data, size_t size) +{ + __m256i v, *p; + size_t n; + + if (size % sizeof(__m256i) * 4) + return false; + + if (sizeof(pos_t) == 2) + v = _mm256_set1_epi16(MATCHFINDER_INITVAL); + else if (sizeof(pos_t) == 4) + v = _mm256_set1_epi32(MATCHFINDER_INITVAL); + else + return false; + + p = (__m256i *)data; + n = size / (sizeof(__m256i) * 4); + do { + p[0] = v; + p[1] = v; + p[2] = v; + p[3] = v; + p += 4; + } while (--n); + return true; +} + +static inline bool +matchfinder_rebase_avx2(pos_t *data, size_t size) +{ + __m256i v, *p; + size_t n; + + if ((size % sizeof(__m256i) * 4 != 0)) + return false; + + if (sizeof(pos_t) == 2) + v = _mm256_set1_epi16((pos_t)-MATCHFINDER_WINDOW_SIZE); + else if (sizeof(pos_t) == 4) + v = _mm256_set1_epi32((pos_t)-MATCHFINDER_WINDOW_SIZE); + else + return false; + + p = (__m256i *)data; + n = size / (sizeof(__m256i) * 4); + do { + /* PADDSW: Add Packed Signed Integers With Signed Saturation */ + p[0] = _mm256_adds_epi16(p[0], v); + p[1] = _mm256_adds_epi16(p[1], v); + p[2] = _mm256_adds_epi16(p[2], v); + p[3] = _mm256_adds_epi16(p[3], v); + p += 4; + } while (--n); + return true; +} diff --git a/src/matchfinder_common.h b/src/matchfinder_common.h new file mode 100644 index 0000000..2a01336 --- /dev/null +++ b/src/matchfinder_common.h @@ -0,0 +1,163 @@ +/* + * matchfinder_common.h + * + * Common code for Lempel-Ziv matchfinding. + */ + +#pragma once + +#include "types.h" + +#include + +#ifndef MATCHFINDER_WINDOW_ORDER +# error "MATCHFINDER_WINDOW_ORDER must be defined!" +#endif + +#ifndef MATCHFINDER_IS_SLIDING +# error "MATCHFINDER_IS_SLIDING must be defined!" +#endif + +#define MATCHFINDER_WINDOW_SIZE ((size_t)1 << MATCHFINDER_WINDOW_ORDER) + +#if MATCHFINDER_IS_SLIDING +# include "matchfinder_sliding.h" +#else +# include "matchfinder_nonsliding.h" +#endif + +#define MATCHFINDER_ALIGNMENT 8 + +#ifdef __AVX2__ +# include "matchfinder_avx2.h" +# if MATCHFINDER_ALIGNMENT < 32 +# undef MATCHFINDER_ALIGNMENT +# define MATCHFINDER_ALIGNMENT 32 +# endif +#endif + +#ifdef __SSE2__ +# include "matchfinder_sse2.h" +# if MATCHFINDER_ALIGNMENT < 16 +# undef MATCHFINDER_ALIGNMENT +# define MATCHFINDER_ALIGNMENT 16 +# endif +#endif + +/* + * Representation of a match. + */ +struct lz_match { + + /* The number of bytes matched. */ + pos_t length; + + /* The offset back from the current position that was matched. */ + pos_t offset; +}; + +static inline bool +matchfinder_memset_init_okay(void) +{ + /* All bytes must match in order to use memset. */ + const pos_t v = MATCHFINDER_INITVAL; + if (sizeof(pos_t) == 2) + return (u8)v == (u8)(v >> 8); + if (sizeof(pos_t) == 4) + return (u8)v == (u8)(v >> 8) && + (u8)v == (u8)(v >> 16) && + (u8)v == (u8)(v >> 24); + return false; +} + +/* + * Initialize the hash table portion of the matchfinder. + * + * Essentially, this is an optimized memset(). + * + * 'data' must be aligned to a MATCHFINDER_ALIGNMENT boundary. + */ +static inline void +matchfinder_init(pos_t *data, size_t num_entries) +{ + const size_t size = num_entries * sizeof(data[0]); + +#ifdef __AVX2__ + if (matchfinder_init_avx2(data, size)) + return; +#endif + +#ifdef __SSE2__ + if (matchfinder_init_sse2(data, size)) + return; +#endif + + if (matchfinder_memset_init_okay()) { + memset(data, (u8)MATCHFINDER_INITVAL, size); + return; + } + + for (size_t i = 0; i < num_entries; i++) + data[i] = MATCHFINDER_INITVAL; +} + +#if MATCHFINDER_IS_SLIDING +/* + * Slide the matchfinder by WINDOW_SIZE bytes. 
+ * + * This must be called just after each WINDOW_SIZE bytes have been run through + * the matchfinder. + * + * This will subtract WINDOW_SIZE bytes from each entry in the array specified. + * The effect is that all entries are updated to be relative to the current + * position, rather than the position WINDOW_SIZE bytes prior. + * + * Underflow is detected and replaced with signed saturation. This ensures that + * once the sliding window has passed over a position, that position forever + * remains out of bounds. + * + * The array passed in must contain all matchfinder data that is + * position-relative. Concretely, this will include the hash table as well as + * the table of positions that is used to link together the sequences in each + * hash bucket. Note that in the latter table, the links are 1-ary in the case + * of "hash chains", and 2-ary in the case of "binary trees". In either case, + * the links need to be rebased in the same way. + */ +static inline void +matchfinder_rebase(pos_t *data, size_t num_entries) +{ + const size_t size = num_entries * sizeof(data[0]); + +#ifdef __AVX2__ + if (matchfinder_rebase_avx2(data, size)) + return; +#endif + +#ifdef __SSE2__ + if (matchfinder_rebase_sse2(data, size)) + return; +#endif + + if (MATCHFINDER_WINDOW_SIZE == 32768) { + /* Branchless version for 32768 byte windows. If the value was + * already negative, clear all bits except the sign bit; this + * changes the value to -32768. Otherwise, set the sign bit; + * this is equivalent to subtracting 32768. */ + for (size_t i = 0; i < num_entries; i++) { + u16 v = data[i]; + u16 sign_bit = v & 0x8000; + v &= sign_bit - ((sign_bit >> 15) ^ 1); + v |= 0x8000; + data[i] = v; + } + return; + } + + for (size_t i = 0; i < num_entries; i++) { + if (data[i] >= 0) + data[i] -= (pos_t)-MATCHFINDER_WINDOW_SIZE; + else + data[i] = (pos_t)-MATCHFINDER_WINDOW_SIZE; + } +} +#endif /* MATCHFINDER_IS_SLIDING */ diff --git a/src/matchfinder_nonsliding.h b/src/matchfinder_nonsliding.h new file mode 100644 index 0000000..e08f461 --- /dev/null +++ b/src/matchfinder_nonsliding.h @@ -0,0 +1,47 @@ +/* + * matchfinder_nonsliding.h + * + * Definitions for nonsliding window matchfinders. + * + * "Nonsliding window" means that any prior sequence can be matched. + */ + +#if MATCHFINDER_WINDOW_ORDER <= 16 +typedef u16 pos_t; +#else +typedef u32 pos_t; +#endif + +#if MATCHFINDER_WINDOW_ORDER != 16 && MATCHFINDER_WINDOW_ORDER != 32 + +/* Not all the bits of the position type are needed, so the sign bit can be + * reserved to mean "out of bounds". */ +#define MATCHFINDER_INITVAL ((pos_t)-1) + +static inline bool +matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) +{ + return !(cur_match & ((pos_t)1 << (sizeof(pos_t) * 8 - 1))); +} + +#else + +/* All bits of the position type are needed, so use 0 to mean "out of bounds". + * This prevents the beginning of the buffer from matching anything; however, + * this doesn't matter much. */ + +#define MATCHFINDER_INITVAL ((pos_t)0) + +static inline bool +matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) +{ + return cur_match != 0; +} + +#endif + +static inline pos_t +matchfinder_slot_for_match(pos_t cur_match) +{ + return cur_match; +} diff --git a/src/matchfinder_sliding.h b/src/matchfinder_sliding.h new file mode 100644 index 0000000..4b8a515 --- /dev/null +++ b/src/matchfinder_sliding.h @@ -0,0 +1,30 @@ +/* + * matchfinder_sliding.h + * + * Definitions for sliding window matchfinders. 
diff --git a/src/matchfinder_sliding.h b/src/matchfinder_sliding.h
new file mode 100644
index 0000000..4b8a515
--- /dev/null
+++ b/src/matchfinder_sliding.h
@@ -0,0 +1,30 @@
+/*
+ * matchfinder_sliding.h
+ *
+ * Definitions for sliding window matchfinders.
+ *
+ * "Sliding window" means that only sequences beginning in the most recent
+ * MATCHFINDER_WINDOW_SIZE bytes can be matched.
+ */
+
+#if MATCHFINDER_WINDOW_ORDER <= 15
+typedef s16 pos_t;
+#else
+typedef s32 pos_t;
+#endif
+
+#define MATCHFINDER_INITVAL ((pos_t)-MATCHFINDER_WINDOW_SIZE)
+
+/* In the sliding window case, positions are stored relative to 'in_base'. */
+
+static inline bool
+matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next)
+{
+	return cur_match > (pos_t)((in_next - in_base) - MATCHFINDER_WINDOW_SIZE);
+}
+
+static inline pos_t
+matchfinder_slot_for_match(pos_t cur_match)
+{
+	return cur_match & (MATCHFINDER_WINDOW_SIZE - 1);
+}
diff --git a/src/matchfinder_sse2.h b/src/matchfinder_sse2.h
new file mode 100644
index 0000000..cc27600
--- /dev/null
+++ b/src/matchfinder_sse2.h
@@ -0,0 +1,64 @@
+/*
+ * matchfinder_sse2.h
+ *
+ * Matchfinding routines optimized for Intel SSE2 (Streaming SIMD Extensions 2).
+ */
+
+#include <emmintrin.h>
+
+static inline bool
+matchfinder_init_sse2(pos_t *data, size_t size)
+{
+	__m128i v, *p;
+	size_t n;
+
+	if (size % (sizeof(__m128i) * 4) != 0)
+		return false;
+
+	if (sizeof(pos_t) == 2)
+		v = _mm_set1_epi16(MATCHFINDER_INITVAL);
+	else if (sizeof(pos_t) == 4)
+		v = _mm_set1_epi32(MATCHFINDER_INITVAL);
+	else
+		return false;
+
+	p = (__m128i *)data;
+	n = size / (sizeof(__m128i) * 4);
+	do {
+		p[0] = v;
+		p[1] = v;
+		p[2] = v;
+		p[3] = v;
+		p += 4;
+	} while (--n);
+	return true;
+}
+
+static inline bool
+matchfinder_rebase_sse2(pos_t *data, size_t size)
+{
+	__m128i v, *p;
+	size_t n;
+
+	if (size % (sizeof(__m128i) * 4) != 0)
+		return false;
+
+	/* The saturating add below works on 16-bit lanes, so this path only
+	 * handles 16-bit positions; wider positions use the scalar fallback. */
+	if (sizeof(pos_t) != 2)
+		return false;
+
+	v = _mm_set1_epi16((pos_t)-MATCHFINDER_WINDOW_SIZE);
+
+	p = (__m128i *)data;
+	n = size / (sizeof(__m128i) * 4);
+	do {
+		/* PADDSW: Add Packed Signed Integers With Signed Saturation */
+		p[0] = _mm_adds_epi16(p[0], v);
+		p[1] = _mm_adds_epi16(p[1], v);
+		p[2] = _mm_adds_epi16(p[2], v);
+		p[3] = _mm_adds_epi16(p[3], v);
+		p += 4;
+	} while (--n);
+	return true;
+}
diff --git a/src/types.h b/src/types.h
new file mode 100644
index 0000000..205dbd3
--- /dev/null
+++ b/src/types.h
@@ -0,0 +1,38 @@
+/*
+ * types.h
+ *
+ * Definitions of fixed-width integers, 'bool', 'size_t', and 'machine_word_t'.
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+typedef int8_t s8;
+typedef int16_t s16;
+typedef int32_t s32;
+typedef int64_t s64;
+
+typedef uint16_t le16;
+typedef uint32_t le32;
+typedef uint64_t le64;
+
+typedef uint16_t be16;
+typedef uint32_t be32;
+typedef uint64_t be64;
+
+/*
+ * Type of a machine word. 'unsigned long' would be logical, but that is only
+ * 32 bits on x86_64 Windows. The same applies to 'uint_fast32_t'. So the best
+ * we can do without a bunch of #ifdefs appears to be 'size_t'.
+ */
+typedef size_t machine_word_t;
+
+#define WORDSIZE sizeof(machine_word_t)
diff --git a/src/unaligned.h b/src/unaligned.h
new file mode 100644
index 0000000..d5b0f95
--- /dev/null
+++ b/src/unaligned.h
@@ -0,0 +1,216 @@
+/*
+ * unaligned.h
+ *
+ * Inline functions for unaligned memory access.
+ */ + +#pragma once + +#include "compiler.h" +#include "endianness.h" +#include "types.h" + +#define DEFINE_UNALIGNED_TYPE(type) \ +struct type##_unaligned { \ + type v; \ +} _packed_attribute; \ + \ +static inline type \ +load_##type##_unaligned(const void *p) \ +{ \ + return ((const struct type##_unaligned *)p)->v; \ +} \ + \ +static inline void \ +store_##type##_unaligned(type val, void *p) \ +{ \ + ((struct type##_unaligned *)p)->v = val; \ +} + +DEFINE_UNALIGNED_TYPE(u16); +DEFINE_UNALIGNED_TYPE(u32); +DEFINE_UNALIGNED_TYPE(u64); +DEFINE_UNALIGNED_TYPE(machine_word_t); + +#define load_word_unaligned load_machine_word_t_unaligned +#define store_word_unaligned store_machine_word_t_unaligned + +static inline u16 +get_unaligned_u16_le(const void *p) +{ + u16 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = le16_to_cpu(load_u16_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u16)p8[0] << 0; + v |= (u16)p8[1] << 8; + } + return v; +} + +static inline u32 +get_unaligned_u32_le(const void *p) +{ + u32 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = le32_to_cpu(load_u32_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u32)p8[0] << 0; + v |= (u32)p8[1] << 8; + v |= (u32)p8[2] << 16; + v |= (u32)p8[3] << 24; + } + return v; +} + +static inline u64 +get_unaligned_u64_le(const void *p) +{ + u64 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = le64_to_cpu(load_u64_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u64)p8[0] << 0; + v |= (u64)p8[1] << 8; + v |= (u64)p8[2] << 16; + v |= (u64)p8[3] << 24; + v |= (u64)p8[4] << 32; + v |= (u64)p8[5] << 40; + v |= (u64)p8[6] << 48; + v |= (u64)p8[7] << 56; + } + return v; +} + +static inline machine_word_t +get_unaligned_word_le(const void *p) +{ + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + if (WORDSIZE == 4) + return get_unaligned_u32_le(p); + else + return get_unaligned_u64_le(p); +} + +static inline void +put_unaligned_u16_le(u16 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(cpu_to_le16(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 0) & 0xFF; + p8[1] = (v >> 8) & 0xFF; + } +} + +static inline void +put_unaligned_u32_le(u32 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(cpu_to_le32(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 0) & 0xFF; + p8[1] = (v >> 8) & 0xFF; + p8[2] = (v >> 16) & 0xFF; + p8[3] = (v >> 24) & 0xFF; + } +} + +static inline void +put_unaligned_u64_le(u64 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u64_unaligned(cpu_to_le64(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 0) & 0xFF; + p8[1] = (v >> 8) & 0xFF; + p8[2] = (v >> 16) & 0xFF; + p8[3] = (v >> 24) & 0xFF; + p8[4] = (v >> 32) & 0xFF; + p8[5] = (v >> 40) & 0xFF; + p8[6] = (v >> 48) & 0xFF; + p8[7] = (v >> 56) & 0xFF; + } +} + +static inline void +put_unaligned_word_le(machine_word_t v, void *p) +{ + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + if (WORDSIZE == 4) + put_unaligned_u32_le(v, p); + else + put_unaligned_u64_le(v, p); +} + +static inline u16 +get_unaligned_u16_be(const void *p) +{ + u16 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = be16_to_cpu(load_u16_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u16)p8[0] << 8; + v |= (u16)p8[1] << 0; + } + return v; +} + +static inline u32 +get_unaligned_u32_be(const void *p) +{ + u32 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = be32_to_cpu(load_u32_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u32)p8[0] << 24; + v |= (u32)p8[1] << 16; + v |= (u32)p8[2] << 8; + v |= (u32)p8[3] << 0; + } + 
return v; +} + +static inline void +put_unaligned_u16_be(u16 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(cpu_to_be16(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 8) & 0xFF; + p8[1] = (v >> 0) & 0xFF; + } +} + +static inline void +put_unaligned_u32_be(u32 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(cpu_to_be32(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 24) & 0xFF; + p8[1] = (v >> 16) & 0xFF; + p8[2] = (v >> 8) & 0xFF; + p8[3] = (v >> 0) & 0xFF; + } +} diff --git a/src/zlib_compress.c b/src/zlib_compress.c new file mode 100644 index 0000000..9fdb359 --- /dev/null +++ b/src/zlib_compress.c @@ -0,0 +1,56 @@ +/* + * zlib_compress.c + * + * Generate DEFLATE-compressed data in the zlib wrapper format. + */ + +#include "libdeflate.h" + +#include "adler32.h" +#include "deflate_compress.h" +#include "unaligned.h" +#include "zlib_constants.h" + +LIBEXPORT size_t +zlib_compress(struct deflate_compressor *c, const void *in, size_t in_size, + void *out, size_t out_nbytes_avail) +{ + u8 *out_next = out; + u16 hdr; + unsigned compression_level; + unsigned level_hint; + size_t deflate_size; + + if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD) + return 0; + + /* 2 byte header: CMF and FLG */ + hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12); + compression_level = deflate_get_compression_level(c); + if (compression_level < 2) + level_hint = ZLIB_FASTEST_COMPRESSION; + else if (compression_level < 6) + level_hint = ZLIB_FAST_COMPRESSION; + else if (compression_level < 8) + level_hint = ZLIB_DEFAULT_COMPRESSION; + else + level_hint = ZLIB_SLOWEST_COMPRESSION; + hdr |= level_hint << 6; + hdr |= 31 - (hdr % 31); + + put_unaligned_u16_be(hdr, out_next); + out_next += 2; + + /* Compressed data */ + deflate_size = deflate_compress(c, in, in_size, out_next, + out_nbytes_avail - ZLIB_MIN_OVERHEAD); + if (deflate_size == 0) + return 0; + out_next += deflate_size; + + /* ADLER32 */ + put_unaligned_u32_be(adler32(in, in_size), out_next); + out_next += 4; + + return out_next - (u8 *)out; +} diff --git a/src/zlib_constants.h b/src/zlib_constants.h new file mode 100644 index 0000000..faec03b --- /dev/null +++ b/src/zlib_constants.h @@ -0,0 +1,20 @@ +/* + * zlib_constants.h + * + * Constants for the zlib wrapper format. + */ + +#pragma once + +#define ZLIB_MIN_HEADER_SIZE 2 +#define ZLIB_FOOTER_SIZE 4 +#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE) + +#define ZLIB_CM_DEFLATE 8 + +#define ZLIB_CINFO_32K_WINDOW 7 + +#define ZLIB_FASTEST_COMPRESSION 0 +#define ZLIB_FAST_COMPRESSION 1 +#define ZLIB_DEFAULT_COMPRESSION 2 +#define ZLIB_SLOWEST_COMPRESSION 3 diff --git a/src/zlib_decompress.c b/src/zlib_decompress.c new file mode 100644 index 0000000..bd7fc6b --- /dev/null +++ b/src/zlib_decompress.c @@ -0,0 +1,56 @@ +/* + * zlib_decompress.c + * + * Decompress DEFLATE-compressed data wrapped in the zlib format. 
+ */ + +#include "libdeflate.h" + +#include "adler32.h" +#include "unaligned.h" +#include "zlib_constants.h" + +LIBEXPORT bool +zlib_decompress(struct deflate_decompressor *d, + const void *in, size_t in_nbytes, void *out, size_t out_nbytes) +{ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + u16 hdr; + + if (in_nbytes < ZLIB_MIN_OVERHEAD) + return false; + + /* 2 byte header: CMF and FLG */ + hdr = get_unaligned_u16_be(in_next); + in_next += 2; + + /* FCHECK */ + if ((hdr % 31) != 0) + return false; + + /* CM */ + if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE) + return false; + + /* CINFO */ + if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW) + return false; + + /* FDICT */ + if ((hdr >> 5) & 1) + return false; + + /* Compressed data */ + if (!deflate_decompress(d, in_next, in_end - ZLIB_FOOTER_SIZE - in_next, + out, out_nbytes)) + return false; + + in_next = in_end - ZLIB_FOOTER_SIZE; + + /* ADLER32 */ + if (adler32(out, out_nbytes) != get_unaligned_u32_be(in_next)) + return false; + + return true; +}
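As a sanity check on the CMF/FLG arithmetic used by zlib_compress() and verified by zlib_decompress(), here is a standalone snippet (not part of the patch). With ZLIB_DEFAULT_COMPRESSION as the level hint it reproduces the familiar 78 9C bytes seen at the start of most zlib streams, and it confirms that the FCHECK adjustment makes the 16-bit header value divisible by 31, which is exactly the check the decompressor performs.

#include <stdio.h>

int main(void)
{
	/* CM = 8 (DEFLATE), CINFO = 7 (32 KiB window), FLEVEL = 2 (default). */
	unsigned hdr = (8u << 8) | (7u << 12);
	hdr |= 2u << 6;
	hdr |= 31 - (hdr % 31);		/* FCHECK */

	printf("header bytes: %02X %02X\n", hdr >> 8, hdr & 0xFF);	/* 78 9C */
	printf("(CMF*256 + FLG) %% 31 = %u\n", hdr % 31);		/* 0 */
	return 0;
}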