Mirror of https://github.com/cuberite/libdeflate.git (synced 2025-09-10 12:58:30 -04:00)
Choose BMI2-optimized decompression routine at runtime
This commit is contained in: parent 16f3b420a0, commit e731f4b510

Makefile (11 lines changed)
@@ -29,6 +29,10 @@ SUPPORT_NEAR_OPTIMAL_PARSING := yes
 # This is faster but ***insecure***! Default to secure.
 UNSAFE_DECOMPRESSION := no
 
+# Will the decompressor detect CPU features at runtime in order to run more
+# optimized code? This only affects some platforms and architectures.
+RUNTIME_CPU_DETECTION := yes
+
 # The compiler and archiver
 CC := gcc
 AR := ar
@@ -62,12 +66,19 @@ ifeq ($(UNSAFE_DECOMPRESSION),yes)
 override CFLAGS += -DUNSAFE_DECOMPRESSION=1
 endif
 
+ifeq ($(RUNTIME_CPU_DETECTION),yes)
+override CFLAGS += -DRUNTIME_CPU_DETECTION=1
+endif
+
 SRC := src/aligned_malloc.c
 ifeq ($(SUPPORT_COMPRESSION),yes)
 SRC += src/deflate_compress.c
 endif
 ifeq ($(SUPPORT_DECOMPRESSION),yes)
 SRC += src/deflate_decompress.c
+ifeq ($(RUNTIME_CPU_DETECTION),yes)
+SRC += src/x86_cpu_features.c
+endif
 endif
 ifeq ($(SUPPORT_ZLIB),yes)
 ifeq ($(SUPPORT_COMPRESSION),yes)
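Like the other switches at the top of this Makefile, RUNTIME_CPU_DETECTION is an ordinary make variable tested with an `ifeq ($(VAR),yes)` guard, so it can presumably be overridden from the command line without editing the file:

    make RUNTIME_CPU_DETECTION=no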
src/decompress_impl.h (new file, 364 lines)
@@ -0,0 +1,364 @@
/*
 * decompress_impl.h
 *
 * The actual DEFLATE decompression routine, lifted out of deflate_decompress.c
 * so that it can be compiled multiple times with different target instruction
 * sets.
 */

static bool ATTRIBUTES
FUNCNAME(struct deflate_decompressor * restrict d,
	 const void * restrict in, size_t in_nbytes,
	 void * restrict out, size_t out_nbytes)
{
	u8 *out_next = out;
	u8 * const out_end = out_next + out_nbytes;
	const u8 *in_next = in;
	const u8 * const in_end = in_next + in_nbytes;
	bitbuf_t bitbuf = 0;
	unsigned bitsleft = 0;
	size_t overrun_count = 0;
	unsigned i;
	unsigned is_final_block;
	unsigned block_type;
	u16 len;
	u16 nlen;
	unsigned num_litlen_syms;
	unsigned num_offset_syms;

next_block:
	/* Starting to read the next block. */
	;

	STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
	ENSURE_BITS(1 + 2 + 5 + 5 + 4);

	/* BFINAL: 1 bit */
	is_final_block = POP_BITS(1);

	/* BTYPE: 2 bits */
	block_type = POP_BITS(2);

	if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {

		/* Dynamic Huffman block. */

		/* The order in which precode lengths are stored. */
		static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
			16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
		};

		unsigned num_explicit_precode_lens;

		/* Read the codeword length counts. */

		STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
		num_litlen_syms = POP_BITS(5) + 257;

		STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
		num_offset_syms = POP_BITS(5) + 1;

		STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
		num_explicit_precode_lens = POP_BITS(4) + 4;

		/* Read the precode codeword lengths. */
		STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
		if (CAN_ENSURE(DEFLATE_NUM_PRECODE_SYMS * 3)) {

			ENSURE_BITS(DEFLATE_NUM_PRECODE_SYMS * 3);

			for (i = 0; i < num_explicit_precode_lens; i++)
				d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
		} else {
			for (i = 0; i < num_explicit_precode_lens; i++) {
				ENSURE_BITS(3);
				d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
			}
		}

		for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
			d->precode_lens[deflate_precode_lens_permutation[i]] = 0;

		/* Build the decode table for the precode. */
		SAFETY_CHECK(build_precode_decode_table(d));

		/* Expand the literal/length and offset codeword lengths. */
		for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
			u32 entry;
			unsigned presym;
			u8 rep_val;
			unsigned rep_count;

			ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);

			/* (The code below assumes that the precode decode table
			 * does not have any subtables.) */
			STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);

			/* Read the next precode symbol. */
			entry = d->precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
			REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
			presym = entry >> HUFFDEC_RESULT_SHIFT;

			if (presym < 16) {
				/* Explicit codeword length */
				d->lens[i++] = presym;
				continue;
			}

			/* Run-length encoded codeword lengths */

			/* Note: we don't need to verify that the repeat count
			 * doesn't overflow the number of elements, since we
			 * have enough extra spaces to allow for the worst-case
			 * overflow (138 zeroes when only 1 length was
			 * remaining).
			 *
			 * In the case of the small repeat counts (presyms 16
			 * and 17), it is fastest to always write the maximum
			 * number of entries. That gets rid of branches that
			 * would otherwise be required.
			 *
			 * It is not just because of the numerical order that
			 * our checks go in the order 'presym < 16', 'presym ==
			 * 16', and 'presym == 17'. For typical data this is
			 * ordered from most frequent to least frequent case.
			 */
			STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);

			if (presym == 16) {
				/* Repeat the previous length 3 - 6 times */
				SAFETY_CHECK(i != 0);
				rep_val = d->lens[i - 1];
				STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
				rep_count = 3 + POP_BITS(2);
				d->lens[i + 0] = rep_val;
				d->lens[i + 1] = rep_val;
				d->lens[i + 2] = rep_val;
				d->lens[i + 3] = rep_val;
				d->lens[i + 4] = rep_val;
				d->lens[i + 5] = rep_val;
				i += rep_count;
			} else if (presym == 17) {
				/* Repeat zero 3 - 10 times */
				STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
				rep_count = 3 + POP_BITS(3);
				d->lens[i + 0] = 0;
				d->lens[i + 1] = 0;
				d->lens[i + 2] = 0;
				d->lens[i + 3] = 0;
				d->lens[i + 4] = 0;
				d->lens[i + 5] = 0;
				d->lens[i + 6] = 0;
				d->lens[i + 7] = 0;
				d->lens[i + 8] = 0;
				d->lens[i + 9] = 0;
				i += rep_count;
			} else {
				/* Repeat zero 11 - 138 times */
				STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
				rep_count = 11 + POP_BITS(7);
				memset(&d->lens[i], 0, rep_count * sizeof(d->lens[i]));
				i += rep_count;
			}
		}
	} else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {

		/* Uncompressed block: copy 'len' bytes literally from the input
		 * buffer to the output buffer. */

		ALIGN_INPUT();

		SAFETY_CHECK(in_end - in_next >= 4);

		len = READ_U16();
		nlen = READ_U16();

		SAFETY_CHECK(len == (u16)~nlen);
		SAFETY_CHECK(len <= out_end - out_next);
		SAFETY_CHECK(len <= in_end - in_next);

		memcpy(out_next, in_next, len);
		in_next += len;
		out_next += len;

		goto block_done;

	} else {
		SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);

		/* Static Huffman block: set the static Huffman codeword
		 * lengths. Then the remainder is the same as decompressing a
		 * dynamic Huffman block. */

		STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
		STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);

		for (i = 0; i < 144; i++)
			d->lens[i] = 8;
		for (; i < 256; i++)
			d->lens[i] = 9;
		for (; i < 280; i++)
			d->lens[i] = 7;
		for (; i < 288; i++)
			d->lens[i] = 8;

		for (; i < 288 + 32; i++)
			d->lens[i] = 5;

		num_litlen_syms = 288;
		num_offset_syms = 32;
	}

	/* Decompressing a Huffman block (either dynamic or static) */

	SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
	SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));

	/* The main DEFLATE decode loop */
	for (;;) {
		u32 entry;
		u32 length;
		u32 offset;

		/* Decode a litlen symbol. */
		ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
		entry = d->litlen_decode_table[BITS(LITLEN_TABLEBITS)];
		if (entry & HUFFDEC_SUBTABLE_POINTER) {
			/* Litlen subtable required (uncommon case) */
			REMOVE_BITS(LITLEN_TABLEBITS);
			entry = d->litlen_decode_table[
				((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
				BITS(entry & HUFFDEC_LENGTH_MASK)];
		}
		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
		if (entry & HUFFDEC_LITERAL) {
			/* Literal */
			SAFETY_CHECK(out_next < out_end);
			*out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
			continue;
		}

		/* Match or end-of-block */

		entry >>= HUFFDEC_RESULT_SHIFT;
		ENSURE_BITS(MAX_ENSURE);

		/* Pop the extra length bits and add them to the length base to
		 * produce the full length. */
		length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
			 POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);

		/* The match destination must not end after the end of the
		 * output buffer. For efficiency, combine this check with the
		 * end-of-block check. We're using 0 for the special
		 * end-of-block length, so subtract 1 and it will turn into
		 * SIZE_MAX. */
		STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
		if (unlikely((size_t)length - 1 > out_end - out_next)) {
			SAFETY_CHECK(length == HUFFDEC_END_OF_BLOCK_LENGTH);
			goto block_done;
		}

		/* Decode the match offset. */

		entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
		if (entry & HUFFDEC_SUBTABLE_POINTER) {
			/* Offset subtable required (uncommon case) */
			REMOVE_BITS(OFFSET_TABLEBITS);
			entry = d->offset_decode_table[
				((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
				BITS(entry & HUFFDEC_LENGTH_MASK)];
		}
		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
		entry >>= HUFFDEC_RESULT_SHIFT;

		STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
					 DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
			      CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
		if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
				DEFLATE_MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);

		/* Pop the extra offset bits and add them to the offset base to
		 * produce the full offset. */
		offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
			 POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);

		/* The match source must not begin before the beginning of the
		 * output buffer. */
		SAFETY_CHECK(offset <= out_next - (const u8 *)out);

		/* Copy the match: 'length' bytes at 'out_next - offset' to
		 * 'out_next'. */

		if (UNALIGNED_ACCESS_IS_FAST &&
		    length <= (3 * WORDSIZE) &&
		    offset >= WORDSIZE &&
		    length + (3 * WORDSIZE) <= out_end - out_next)
		{
			/* Fast case: short length, no overlaps if we copy one
			 * word at a time, and we aren't getting too close to
			 * the end of the output array. */
			copy_word_unaligned(out_next - offset + (0 * WORDSIZE),
					    out_next + (0 * WORDSIZE));
			copy_word_unaligned(out_next - offset + (1 * WORDSIZE),
					    out_next + (1 * WORDSIZE));
			copy_word_unaligned(out_next - offset + (2 * WORDSIZE),
					    out_next + (2 * WORDSIZE));
		} else {
			const u8 *src = out_next - offset;
			u8 *dst = out_next;
			u8 *end = out_next + length;

			if (UNALIGNED_ACCESS_IS_FAST &&
			    likely(out_end - end >= WORDSIZE - 1)) {
				if (offset >= WORDSIZE) {
					copy_word_unaligned(src, dst);
					src += WORDSIZE;
					dst += WORDSIZE;
					if (dst < end) {
						do {
							copy_word_unaligned(src, dst);
							src += WORDSIZE;
							dst += WORDSIZE;
						} while (dst < end);
					}
				} else if (offset == 1) {
					machine_word_t v = repeat_byte(*(dst - 1));
					do {
						store_word_unaligned(v, dst);
						src += WORDSIZE;
						dst += WORDSIZE;
					} while (dst < end);
				} else {
					*dst++ = *src++;
					*dst++ = *src++;
					do {
						*dst++ = *src++;
					} while (dst < end);
				}
			} else {
				*dst++ = *src++;
				*dst++ = *src++;
				do {
					*dst++ = *src++;
				} while (dst < end);
			}
		}

		out_next += length;
	}

block_done:
	/* Finished decoding a block. */

	if (!is_final_block)
		goto next_block;

	/* That was the last block. Return %true if we got all the output we
	 * expected, otherwise %false. */
	return (out_next == out_end);
}
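The routine above leans on bit-buffer macros (ENSURE_BITS, POP_BITS, BITS, REMOVE_BITS, CAN_ENSURE) that are defined in deflate_decompress.c and do not appear in this diff. As a rough sketch of the idiom they implement, assuming a 64-bit LSB-first bit buffer and ignoring the end-of-input handling that the real macros do via overrun_count, the core operations look something like this:

/* Sketch only; the real macros operate on the local bitbuf/bitsleft
 * variables directly, which presumably helps keep them in registers. */
#include <stdint.h>

typedef uint64_t bitbuf_t;

struct bitreader {
	const uint8_t *in_next;   /* next input byte */
	bitbuf_t bitbuf;          /* unconsumed bits, LSB first */
	unsigned bitsleft;        /* number of valid bits in bitbuf */
};

/* ENSURE_BITS(n): refill so that at least n bits are available (n <= 57). */
static void ensure_bits(struct bitreader *br, unsigned n)
{
	while (br->bitsleft < n) {
		br->bitbuf |= (bitbuf_t)*br->in_next++ << br->bitsleft;
		br->bitsleft += 8;
	}
}

/* POP_BITS(n): return the next n bits and remove them from the buffer. */
static uint32_t pop_bits(struct bitreader *br, unsigned n)
{
	uint32_t bits = (uint32_t)br->bitbuf & (((uint32_t)1 << n) - 1);

	br->bitbuf >>= n;
	br->bitsleft -= n;
	return bits;
}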
src/deflate_decompress.c
@@ -10,8 +10,9 @@
  * ---------------------------------------------------------------------------
  *
  * This is a highly optimized DEFLATE decompressor. On x86_64 it decompresses
- * data in about 52% of the time of zlib. On other architectures it should
- * still be significantly faster than zlib, but the difference may be smaller.
+ * data in about 52% of the time of zlib (48% if BMI2 instructions are
+ * available). On other architectures it should still be significantly faster
+ * than zlib, but the difference may be smaller.
  *
  * Why this is faster than zlib's implementation:
  *
@@ -22,6 +23,8 @@
  * - Other optimizations to remove unnecessary branches
  * - Only full-buffer decompression is supported, so the code doesn't need to
  *   support stopping and resuming decompression.
+ * - On x86_64, compile a version of the decompression routine using BMI2
+ *   instructions and use it automatically at runtime when supported.
  */
 
 #include <stdlib.h>
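The 48% figure is what motivates the rest of this commit: the decode loop is dominated by variable shifts and low-bit extractions on the bit buffer, which are exactly the operations BMI2 accelerates (SHLX/SHRX, BZHI, PEXT). Nothing in the source has to change; compiling the same code with the bmi2 target can let GCC pick those instructions. An illustrative fragment, not taken from the commit:

/* Illustration only: with the BMI2 target in effect, GCC can turn this
 * mask-and-extract (for n < 64) into a single BZHI instruction. */
__attribute__((target("bmi2")))
static unsigned long
low_bits(unsigned long bitbuf, unsigned n)
{
	return bitbuf & (((unsigned long)1 << n) - 1);
}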
@@ -31,6 +34,7 @@
 
 #include "deflate_constants.h"
 #include "unaligned.h"
+#include "x86_cpu_features.h"
 
 /* By default, if the expression passed to SAFETY_CHECK() evaluates to false,
  * then deflate_decompress() immediately returns false as the compressed data is
@@ -793,6 +797,50 @@ copy_word_unaligned(const void *src, void *dst)
  * Main decompression routine
  *****************************************************************************/
 
+#define FUNCNAME deflate_decompress_default
+#define ATTRIBUTES
+#include "decompress_impl.h"
+#undef FUNCNAME
+#undef ATTRIBUTES
+
+#if X86_CPU_FEATURES_ENABLED && !defined(__BMI2__)
+#  define FUNCNAME deflate_decompress_bmi2
+#  define ATTRIBUTES __attribute__((target("bmi2")))
+#  include "decompress_impl.h"
+#  undef FUNCNAME
+#  undef ATTRIBUTES
+#  define DISPATCH_ENABLED 1
+#endif
+
+#if DISPATCH_ENABLED
+
+static bool
+dispatch(struct deflate_decompressor * restrict d,
+	 const void * restrict in, size_t in_nbytes,
+	 void * restrict out, size_t out_nbytes);
+
+typedef bool (*decompress_func_t)(struct deflate_decompressor * restrict d,
+				  const void * restrict in, size_t in_nbytes,
+				  void * restrict out, size_t out_nbytes);
+
+static decompress_func_t decompress_impl = dispatch;
+
+static bool
+dispatch(struct deflate_decompressor * restrict d,
+	 const void * restrict in, size_t in_nbytes,
+	 void * restrict out, size_t out_nbytes)
+{
+	decompress_func_t f = deflate_decompress_default;
+#if X86_CPU_FEATURES_ENABLED
+	if (x86_have_cpu_feature(X86_CPU_FEATURE_BMI2))
+		f = deflate_decompress_bmi2;
+#endif
+	decompress_impl = f;
+	return (*f)(d, in, in_nbytes, out, out_nbytes);
+}
+#endif /* DISPATCH_ENABLED */
+
+
 /*
  * This is the main DEFLATE decompression routine. It decompresses 'in_nbytes'
  * bytes of compressed data from the buffer 'in' and writes the uncompressed
@@ -801,362 +849,20 @@ copy_word_unaligned(const void *src, void *dst)
  * and only if decompression was successful. A return value of %false indicates
  * that either the compressed data is invalid or it does not decompress to
  * exactly 'out_nbytes' bytes of uncompressed data.
+ *
+ * The real code is in decompress_impl.h. The part here just handles calling
+ * the appropriate implementation depending on the CPU features at runtime.
  */
 LIBEXPORT bool
 deflate_decompress(struct deflate_decompressor * restrict d,
 		   const void * restrict in, size_t in_nbytes,
 		   void * restrict out, size_t out_nbytes)
 {
-	u8 *out_next = out;
-	u8 * const out_end = out_next + out_nbytes;
-	const u8 *in_next = in;
-	const u8 * const in_end = in_next + in_nbytes;
-	bitbuf_t bitbuf = 0;
-	unsigned bitsleft = 0;
-	[... the remaining ~350 deleted lines are the old inline decompression
-	     routine, moved verbatim into src/decompress_impl.h, shown in full
-	     above ...]
+#if DISPATCH_ENABLED
+	return (*decompress_impl)(d, in, in_nbytes, out, out_nbytes);
+#else
+	return deflate_decompress_default(d, in, in_nbytes, out, out_nbytes);
+#endif
 }
 
 LIBEXPORT struct deflate_decompressor *
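The dispatch wiring above is the classic resolve-on-first-call function-pointer idiom (the same idea that glibc's IFUNC relocations automate): the exported entry point calls through a pointer that initially targets a resolver; the resolver probes the CPU once, rebinds the pointer, and forwards the call, so every later call pays only one indirect jump. A stripped-down sketch of the idiom, with purely illustrative names:

/* Generic resolve-on-first-call dispatch; not from the commit. */
typedef int (*impl_t)(int);

static int impl_generic(int x) { return x + 1; }

static int resolve(int x);            /* forward declaration */
static impl_t active_impl = resolve;  /* the first call lands here */

static int resolve(int x)
{
	active_impl = impl_generic;   /* pick the best implementation once */
	return active_impl(x);
}

int api_entry(int x) { return active_impl(x); }

If two threads make the first call concurrently, both run the resolver and store the same pointer value, which is why this pattern is normally considered a benign race.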
src/x86_cpu_features.c (new file, 145 lines)
@@ -0,0 +1,145 @@
/*
 * x86_cpu_features.c - feature detection for x86 processors
 *
 * Author:	Eric Biggers
 * Year:	2015
 *
 * The author dedicates this file to the public domain.
 * You can do whatever you want with this file.
 */

#include "x86_cpu_features.h"

#ifdef X86_CPU_FEATURES_ENABLED

#define DEBUG 0

#if DEBUG
#  include <stdio.h>
#endif

u32 _x86_cpu_features = 0;

/* With old GCC versions we have to manually save and restore the x86_32 PIC
 * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */
#if defined(__i386__) && defined(__PIC__)
#  define EBX_CONSTRAINT "=r"
#else
#  define EBX_CONSTRAINT "=b"
#endif

/* Execute the CPUID instruction. */
static inline void
cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
{
	__asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n"
		"cpuid\n"
		".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
		: "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
		: "a" (leaf), "c" (subleaf));
}

/* Read an extended control register. */
static inline u64
read_xcr(u32 index)
{
	u32 edx, eax;

	/* Execute the "xgetbv" instruction. Old versions of binutils do not
	 * recognize this instruction, so list the raw bytes instead. */
	__asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index));

	return ((u64)edx << 32) | eax;
}

#define IS_SET(reg, bit) ((reg) & ((u32)1 << (bit)))

/* Initialize _x86_cpu_features with bits for interesting processor features. */
void
x86_setup_cpu_features(void)
{
	u32 features = 0;
	u32 dummy1, dummy2, dummy3, dummy4;
	u32 max_function;
	u32 features_1, features_2, features_3, features_4;
	bool os_saves_ymm_regs = false;

	/* Get maximum supported function */
	cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4);
	if (max_function < 1)
		goto out;

	/* Standard feature flags */
	cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1);

	if (IS_SET(features_1, 25))
		features |= X86_CPU_FEATURE_SSE;

	if (IS_SET(features_1, 26))
		features |= X86_CPU_FEATURE_SSE2;

	if (IS_SET(features_2, 0))
		features |= X86_CPU_FEATURE_SSE3;

	if (IS_SET(features_2, 9))
		features |= X86_CPU_FEATURE_SSSE3;

	if (IS_SET(features_2, 19))
		features |= X86_CPU_FEATURE_SSE4_1;

	if (IS_SET(features_2, 20))
		features |= X86_CPU_FEATURE_SSE4_2;

	if (IS_SET(features_2, 27)) /* OSXSAVE set? */
		if ((read_xcr(0) & 0x6) == 0x6)
			os_saves_ymm_regs = true;

	if (os_saves_ymm_regs && IS_SET(features_2, 28))
		features |= X86_CPU_FEATURE_AVX;

	if (max_function < 7)
		goto out;

	/* Extended feature flags */
	cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4);

	if (IS_SET(features_3, 3))
		features |= X86_CPU_FEATURE_BMI;

	if (os_saves_ymm_regs && IS_SET(features_3, 5))
		features |= X86_CPU_FEATURE_AVX2;

	if (IS_SET(features_3, 8))
		features |= X86_CPU_FEATURE_BMI2;

out:

#if DEBUG
	printf("Detected x86 CPU features: ");
	if (features & X86_CPU_FEATURE_SSE)
		printf("SSE ");
	if (features & X86_CPU_FEATURE_SSE2)
		printf("SSE2 ");
	if (features & X86_CPU_FEATURE_SSE3)
		printf("SSE3 ");
	if (features & X86_CPU_FEATURE_SSSE3)
		printf("SSSE3 ");
	if (features & X86_CPU_FEATURE_SSE4_1)
		printf("SSE4.1 ");
	if (features & X86_CPU_FEATURE_SSE4_2)
		printf("SSE4.2 ");
	if (features & X86_CPU_FEATURE_BMI)
		printf("BMI ");
	if (features & X86_CPU_FEATURE_AVX)
		printf("AVX ");
	if (features & X86_CPU_FEATURE_BMI2)
		printf("BMI2 ");
	if (features & X86_CPU_FEATURE_AVX2)
		printf("AVX2 ");
	printf("\n");
#endif /* DEBUG */

	_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN;
}

#endif /* X86_CPU_FEATURES_ENABLED */
src/x86_cpu_features.h (new file, 43 lines)
@@ -0,0 +1,43 @@
/*
 * x86_cpu_features.h - feature detection for x86 processors
 */

#pragma once

#include "util.h"

#if RUNTIME_CPU_DETECTION && defined(__GNUC__) && defined(__x86_64__)
#  define X86_CPU_FEATURES_ENABLED 1
#endif


#if X86_CPU_FEATURES_ENABLED

#define X86_CPU_FEATURE_SSE	0x00000001
#define X86_CPU_FEATURE_SSE2	0x00000002
#define X86_CPU_FEATURE_SSE3	0x00000004
#define X86_CPU_FEATURE_SSSE3	0x00000008
#define X86_CPU_FEATURE_SSE4_1	0x00000010
#define X86_CPU_FEATURE_SSE4_2	0x00000020
#define X86_CPU_FEATURE_AVX	0x00000040
#define X86_CPU_FEATURE_BMI	0x00000080
#define X86_CPU_FEATURE_AVX2	0x00000100
#define X86_CPU_FEATURE_BMI2	0x00000200

#define X86_CPU_FEATURES_KNOWN	0x80000000

extern u32 _x86_cpu_features;

extern void
x86_setup_cpu_features(void);

/* Does the processor have the specified feature? */
static inline bool
x86_have_cpu_feature(u32 feature)
{
	if (_x86_cpu_features == 0)
		x86_setup_cpu_features();
	return _x86_cpu_features & feature;
}

#endif /* X86_CPU_FEATURES_ENABLED */
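For completeness, a caller's-eye view of the new header. This is a hypothetical standalone test, not part of the commit; it assumes it is compiled inside the source tree so that "util.h" (which supplies the u32 and bool types used here) is on the include path:

#include <stdio.h>
#include "x86_cpu_features.h"

int main(void)
{
#if X86_CPU_FEATURES_ENABLED
	/* First call runs CPUID via x86_setup_cpu_features(), then caches. */
	printf("BMI2 supported: %s\n",
	       x86_have_cpu_feature(X86_CPU_FEATURE_BMI2) ? "yes" : "no");
#else
	printf("runtime x86 feature detection not compiled in\n");
#endif
	return 0;
}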