Choose BMI2-optimized decompression routine at runtime

Eric Biggers 2016-01-22 23:54:33 -06:00
parent 16f3b420a0
commit e731f4b510
5 changed files with 621 additions and 352 deletions

Makefile

@@ -29,6 +29,10 @@ SUPPORT_NEAR_OPTIMAL_PARSING := yes
# This is faster but ***insecure***! Default to secure.
UNSAFE_DECOMPRESSION := no
# Will the decompressor detect CPU features at runtime in order to run more
# optimized code? This only affects some platforms and architectures.
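# (For example, running 'make RUNTIME_CPU_DETECTION=no' should disable it,
# since variables given on the make command line override this default.)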
RUNTIME_CPU_DETECTION := yes
# The compiler and archiver
CC := gcc
AR := ar
@@ -62,12 +66,19 @@ ifeq ($(UNSAFE_DECOMPRESSION),yes)
override CFLAGS += -DUNSAFE_DECOMPRESSION=1
endif
ifeq ($(RUNTIME_CPU_DETECTION),yes)
override CFLAGS += -DRUNTIME_CPU_DETECTION=1
endif
SRC := src/aligned_malloc.c
ifeq ($(SUPPORT_COMPRESSION),yes)
SRC += src/deflate_compress.c
endif
ifeq ($(SUPPORT_DECOMPRESSION),yes)
SRC += src/deflate_decompress.c
ifeq ($(RUNTIME_CPU_DETECTION),yes)
SRC += src/x86_cpu_features.c
endif
endif
ifeq ($(SUPPORT_ZLIB),yes)
ifeq ($(SUPPORT_COMPRESSION),yes)

src/decompress_impl.h (new file, 364 lines)

@@ -0,0 +1,364 @@
/*
* decompress_impl.h
*
* The actual DEFLATE decompression routine, lifted out of deflate_decompress.c
* so that it can be compiled multiple times with different target instruction
* sets.
*/
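/*
 * The includer defines FUNCNAME (the name to give the routine) and ATTRIBUTES
 * (any function attributes to apply) before each inclusion. For example,
 * deflate_decompress.c builds the BMI2 variant with:
 *
 *	#define FUNCNAME	deflate_decompress_bmi2
 *	#define ATTRIBUTES	__attribute__((target("bmi2")))
 *	#include "decompress_impl.h"
 */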
static bool ATTRIBUTES
FUNCNAME(struct deflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes)
{
u8 *out_next = out;
u8 * const out_end = out_next + out_nbytes;
const u8 *in_next = in;
const u8 * const in_end = in_next + in_nbytes;
bitbuf_t bitbuf = 0;
unsigned bitsleft = 0;
size_t overrun_count = 0;
unsigned i;
unsigned is_final_block;
unsigned block_type;
u16 len;
u16 nlen;
unsigned num_litlen_syms;
unsigned num_offset_syms;
next_block:
/* Starting to read the next block. */
;
STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
ENSURE_BITS(1 + 2 + 5 + 5 + 4);
/* BFINAL: 1 bit */
is_final_block = POP_BITS(1);
/* BTYPE: 2 bits */
block_type = POP_BITS(2);
if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
/* Dynamic Huffman block. */
/* The order in which precode lengths are stored. */
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};
unsigned num_explicit_precode_lens;
/* Read the codeword length counts. */
STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
num_litlen_syms = POP_BITS(5) + 257;
STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
num_offset_syms = POP_BITS(5) + 1;
STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
num_explicit_precode_lens = POP_BITS(4) + 4;
/* Read the precode codeword lengths. */
STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
if (CAN_ENSURE(DEFLATE_NUM_PRECODE_SYMS * 3)) {
ENSURE_BITS(DEFLATE_NUM_PRECODE_SYMS * 3);
for (i = 0; i < num_explicit_precode_lens; i++)
d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
} else {
for (i = 0; i < num_explicit_precode_lens; i++) {
ENSURE_BITS(3);
d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
}
}
for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
d->precode_lens[deflate_precode_lens_permutation[i]] = 0;
/* Build the decode table for the precode. */
SAFETY_CHECK(build_precode_decode_table(d));
/* Expand the literal/length and offset codeword lengths. */
for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
u32 entry;
unsigned presym;
u8 rep_val;
unsigned rep_count;
ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);
/* (The code below assumes that the precode decode table
* does not have any subtables.) */
STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
/* Read the next precode symbol. */
entry = d->precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
presym = entry >> HUFFDEC_RESULT_SHIFT;
if (presym < 16) {
/* Explicit codeword length */
d->lens[i++] = presym;
continue;
}
/* Run-length encoded codeword lengths */
/* Note: we don't need to verify that the repeat count
* doesn't overflow the number of elements, since we
* have enough extra spaces to allow for the worst-case
* overflow (138 zeroes when only 1 length was
* remaining).
*
* In the case of the small repeat counts (presyms 16
* and 17), it is fastest to always write the maximum
* number of entries. That gets rid of branches that
* would otherwise be required.
*
* It is not just because of the numerical order that
* our checks go in the order 'presym < 16', 'presym ==
* 16', and 'presym == 17'. For typical data this is
* ordered from most frequent to least frequent case.
*/
STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
if (presym == 16) {
/* Repeat the previous length 3 - 6 times */
SAFETY_CHECK(i != 0);
rep_val = d->lens[i - 1];
STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
rep_count = 3 + POP_BITS(2);
d->lens[i + 0] = rep_val;
d->lens[i + 1] = rep_val;
d->lens[i + 2] = rep_val;
d->lens[i + 3] = rep_val;
d->lens[i + 4] = rep_val;
d->lens[i + 5] = rep_val;
i += rep_count;
} else if (presym == 17) {
/* Repeat zero 3 - 10 times */
STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
rep_count = 3 + POP_BITS(3);
d->lens[i + 0] = 0;
d->lens[i + 1] = 0;
d->lens[i + 2] = 0;
d->lens[i + 3] = 0;
d->lens[i + 4] = 0;
d->lens[i + 5] = 0;
d->lens[i + 6] = 0;
d->lens[i + 7] = 0;
d->lens[i + 8] = 0;
d->lens[i + 9] = 0;
i += rep_count;
} else {
/* Repeat zero 11 - 138 times */
STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
rep_count = 11 + POP_BITS(7);
memset(&d->lens[i], 0, rep_count * sizeof(d->lens[i]));
i += rep_count;
}
}
} else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
/* Uncompressed block: copy 'len' bytes literally from the input
* buffer to the output buffer. */
ALIGN_INPUT();
SAFETY_CHECK(in_end - in_next >= 4);
len = READ_U16();
nlen = READ_U16();
SAFETY_CHECK(len == (u16)~nlen);
SAFETY_CHECK(len <= out_end - out_next);
SAFETY_CHECK(len <= in_end - in_next);
memcpy(out_next, in_next, len);
in_next += len;
out_next += len;
goto block_done;
} else {
SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
/* Static Huffman block: set the static Huffman codeword
* lengths. Then the remainder is the same as decompressing a
* dynamic Huffman block. */
STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
for (i = 0; i < 144; i++)
d->lens[i] = 8;
for (; i < 256; i++)
d->lens[i] = 9;
for (; i < 280; i++)
d->lens[i] = 7;
for (; i < 288; i++)
d->lens[i] = 8;
for (; i < 288 + 32; i++)
d->lens[i] = 5;
num_litlen_syms = 288;
num_offset_syms = 32;
}
/* Decompressing a Huffman block (either dynamic or static) */
SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
/* The main DEFLATE decode loop */
for (;;) {
u32 entry;
u32 length;
u32 offset;
/* Decode a litlen symbol. */
ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
entry = d->litlen_decode_table[BITS(LITLEN_TABLEBITS)];
if (entry & HUFFDEC_SUBTABLE_POINTER) {
/* Litlen subtable required (uncommon case) */
REMOVE_BITS(LITLEN_TABLEBITS);
entry = d->litlen_decode_table[
((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
BITS(entry & HUFFDEC_LENGTH_MASK)];
}
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
if (entry & HUFFDEC_LITERAL) {
/* Literal */
SAFETY_CHECK(out_next < out_end);
*out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
continue;
}
/* Match or end-of-block */
entry >>= HUFFDEC_RESULT_SHIFT;
ENSURE_BITS(MAX_ENSURE);
/* Pop the extra length bits and add them to the length base to
* produce the full length. */
length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
/* The match destination must not end after the end of the
* output buffer. For efficiency, combine this check with the
* end-of-block check. We're using 0 for the special
* end-of-block length, so subtract 1 to turn it into
* SIZE_MAX. */
STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
if (unlikely((size_t)length - 1 > out_end - out_next)) {
SAFETY_CHECK(length == HUFFDEC_END_OF_BLOCK_LENGTH);
goto block_done;
}
/* Decode the match offset. */
entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
if (entry & HUFFDEC_SUBTABLE_POINTER) {
/* Offset subtable required (uncommon case) */
REMOVE_BITS(OFFSET_TABLEBITS);
entry = d->offset_decode_table[
((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
BITS(entry & HUFFDEC_LENGTH_MASK)];
}
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
entry >>= HUFFDEC_RESULT_SHIFT;
STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
DEFLATE_MAX_OFFSET_CODEWORD_LEN +
DEFLATE_MAX_EXTRA_OFFSET_BITS))
ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);
/* Pop the extra offset bits and add them to the offset base to
* produce the full offset. */
offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
/* The match source must not begin before the beginning of the
* output buffer. */
SAFETY_CHECK(offset <= out_next - (const u8 *)out);
/* Copy the match: 'length' bytes at 'out_next - offset' to
* 'out_next'. */
if (UNALIGNED_ACCESS_IS_FAST &&
length <= (3 * WORDSIZE) &&
offset >= WORDSIZE &&
length + (3 * WORDSIZE) <= out_end - out_next)
{
/* Fast case: short length, no overlaps if we copy one
* word at a time, and we aren't getting too close to
* the end of the output array. */
copy_word_unaligned(out_next - offset + (0 * WORDSIZE),
out_next + (0 * WORDSIZE));
copy_word_unaligned(out_next - offset + (1 * WORDSIZE),
out_next + (1 * WORDSIZE));
copy_word_unaligned(out_next - offset + (2 * WORDSIZE),
out_next + (2 * WORDSIZE));
} else {
const u8 *src = out_next - offset;
u8 *dst = out_next;
u8 *end = out_next + length;
if (UNALIGNED_ACCESS_IS_FAST &&
likely(out_end - end >= WORDSIZE - 1)) {
if (offset >= WORDSIZE) {
copy_word_unaligned(src, dst);
src += WORDSIZE;
dst += WORDSIZE;
if (dst < end) {
do {
copy_word_unaligned(src, dst);
src += WORDSIZE;
dst += WORDSIZE;
} while (dst < end);
}
} else if (offset == 1) {
machine_word_t v = repeat_byte(*(dst - 1));
do {
store_word_unaligned(v, dst);
src += WORDSIZE;
dst += WORDSIZE;
} while (dst < end);
} else {
*dst++ = *src++;
*dst++ = *src++;
do {
*dst++ = *src++;
} while (dst < end);
}
} else {
*dst++ = *src++;
*dst++ = *src++;
do {
*dst++ = *src++;
} while (dst < end);
}
}
out_next += length;
}
block_done:
/* Finished decoding a block. */
if (!is_final_block)
goto next_block;
/* That was the last block. Return %true if we got all the output we
* expected, otherwise %false. */
return (out_next == out_end);
}

src/deflate_decompress.c

@@ -10,8 +10,9 @@
* ---------------------------------------------------------------------------
*
* This is a highly optimized DEFLATE decompressor. On x86_64 it decompresses
* data in about 52% of the time of zlib. On other architectures it should
* still be significantly faster than zlib, but the difference may be smaller.
* data in about 52% of the time of zlib (48% if BMI2 instructions are
* available). On other architectures it should still be significantly faster
* than zlib, but the difference may be smaller.
*
* Why this is faster than zlib's implementation:
*
@@ -22,6 +23,8 @@
* - Other optimizations to remove unnecessary branches
* - Only full-buffer decompression is supported, so the code doesn't need to
* support stopping and resuming decompression.
* - On x86_64, compile a version of the decompression routine using BMI2
* instructions and use it automatically at runtime when supported.
*/
#include <stdlib.h>
@@ -31,6 +34,7 @@
#include "deflate_constants.h"
#include "unaligned.h"
#include "x86_cpu_features.h"
/* By default, if the expression passed to SAFETY_CHECK() evaluates to false,
* then deflate_decompress() immediately returns false as the compressed data is
@@ -793,6 +797,50 @@ copy_word_unaligned(const void *src, void *dst)
* Main decompression routine
*****************************************************************************/
#define FUNCNAME deflate_decompress_default
#define ATTRIBUTES
#include "decompress_impl.h"
#undef FUNCNAME
#undef ATTRIBUTES
#if X86_CPU_FEATURES_ENABLED && !defined(__BMI2__)
# define FUNCNAME deflate_decompress_bmi2
# define ATTRIBUTES __attribute__((target("bmi2")))
# include "decompress_impl.h"
# undef FUNCNAME
# undef ATTRIBUTES
# define DISPATCH_ENABLED 1
#endif
#if DISPATCH_ENABLED
static bool
dispatch(struct deflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes);
typedef bool (*decompress_func_t)(struct deflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes);
static decompress_func_t decompress_impl = dispatch;
static bool
dispatch(struct deflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes)
{
decompress_func_t f = deflate_decompress_default;
#if X86_CPU_FEATURES_ENABLED
if (x86_have_cpu_feature(X86_CPU_FEATURE_BMI2))
f = deflate_decompress_bmi2;
#endif
decompress_impl = f;
return (*f)(d, in, in_nbytes, out, out_nbytes);
}
#endif /* DISPATCH_ENABLED */
/*
* This is the main DEFLATE decompression routine. It decompresses 'in_nbytes'
* bytes of compressed data from the buffer 'in' and writes the uncompressed
@@ -801,362 +849,20 @@ copy_word_unaligned(const void *src, void *dst)
* and only if decompression was successful. A return value of %false indicates
* that either the compressed data is invalid or it does not decompress to
* exactly 'out_nbytes' bytes of uncompressed data.
*
* The real code is in decompress_impl.h. The function here just calls the
* appropriate implementation for the CPU features detected at runtime.
*/
LIBEXPORT bool
deflate_decompress(struct deflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes)
{
u8 *out_next = out;
u8 * const out_end = out_next + out_nbytes;
const u8 *in_next = in;
const u8 * const in_end = in_next + in_nbytes;
bitbuf_t bitbuf = 0;
unsigned bitsleft = 0;
size_t overrun_count = 0;
unsigned i;
unsigned is_final_block;
unsigned block_type;
u16 len;
u16 nlen;
unsigned num_litlen_syms;
unsigned num_offset_syms;
next_block:
/* Starting to read the next block. */
;
STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
ENSURE_BITS(1 + 2 + 5 + 5 + 4);
/* BFINAL: 1 bit */
is_final_block = POP_BITS(1);
/* BTYPE: 2 bits */
block_type = POP_BITS(2);
if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
/* Dynamic Huffman block. */
/* The order in which precode lengths are stored. */
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};
unsigned num_explicit_precode_lens;
/* Read the codeword length counts. */
STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
num_litlen_syms = POP_BITS(5) + 257;
STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
num_offset_syms = POP_BITS(5) + 1;
STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
num_explicit_precode_lens = POP_BITS(4) + 4;
/* Read the precode codeword lengths. */
STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
if (CAN_ENSURE(DEFLATE_NUM_PRECODE_SYMS * 3)) {
ENSURE_BITS(DEFLATE_NUM_PRECODE_SYMS * 3);
for (i = 0; i < num_explicit_precode_lens; i++)
d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
} else {
for (i = 0; i < num_explicit_precode_lens; i++) {
ENSURE_BITS(3);
d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
}
}
for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
d->precode_lens[deflate_precode_lens_permutation[i]] = 0;
/* Build the decode table for the precode. */
SAFETY_CHECK(build_precode_decode_table(d));
/* Expand the literal/length and offset codeword lengths. */
for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
u32 entry;
unsigned presym;
u8 rep_val;
unsigned rep_count;
ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);
/* (The code below assumes that the precode decode table
* does not have any subtables.) */
STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
/* Read the next precode symbol. */
entry = d->precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
presym = entry >> HUFFDEC_RESULT_SHIFT;
if (presym < 16) {
/* Explicit codeword length */
d->lens[i++] = presym;
continue;
}
/* Run-length encoded codeword lengths */
/* Note: we don't need to verify that the repeat count
* doesn't overflow the number of elements, since we
* have enough extra spaces to allow for the worst-case
* overflow (138 zeroes when only 1 length was
* remaining).
*
* In the case of the small repeat counts (presyms 16
* and 17), it is fastest to always write the maximum
* number of entries. That gets rid of branches that
* would otherwise be required.
*
* It is not just because of the numerical order that
* our checks go in the order 'presym < 16', 'presym ==
* 16', and 'presym == 17'. For typical data this is
* ordered from most frequent to least frequent case.
*/
STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
if (presym == 16) {
/* Repeat the previous length 3 - 6 times */
SAFETY_CHECK(i != 0);
rep_val = d->lens[i - 1];
STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
rep_count = 3 + POP_BITS(2);
d->lens[i + 0] = rep_val;
d->lens[i + 1] = rep_val;
d->lens[i + 2] = rep_val;
d->lens[i + 3] = rep_val;
d->lens[i + 4] = rep_val;
d->lens[i + 5] = rep_val;
i += rep_count;
} else if (presym == 17) {
/* Repeat zero 3 - 10 times */
STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
rep_count = 3 + POP_BITS(3);
d->lens[i + 0] = 0;
d->lens[i + 1] = 0;
d->lens[i + 2] = 0;
d->lens[i + 3] = 0;
d->lens[i + 4] = 0;
d->lens[i + 5] = 0;
d->lens[i + 6] = 0;
d->lens[i + 7] = 0;
d->lens[i + 8] = 0;
d->lens[i + 9] = 0;
i += rep_count;
} else {
/* Repeat zero 11 - 138 times */
STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
rep_count = 11 + POP_BITS(7);
memset(&d->lens[i], 0, rep_count * sizeof(d->lens[i]));
i += rep_count;
}
}
} else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
/* Uncompressed block: copy 'len' bytes literally from the input
* buffer to the output buffer. */
ALIGN_INPUT();
SAFETY_CHECK(in_end - in_next >= 4);
len = READ_U16();
nlen = READ_U16();
SAFETY_CHECK(len == (u16)~nlen);
SAFETY_CHECK(len <= out_end - out_next);
SAFETY_CHECK(len <= in_end - in_next);
memcpy(out_next, in_next, len);
in_next += len;
out_next += len;
goto block_done;
} else {
SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
/* Static Huffman block: set the static Huffman codeword
* lengths. Then the remainder is the same as decompressing a
* dynamic Huffman block. */
STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
for (i = 0; i < 144; i++)
d->lens[i] = 8;
for (; i < 256; i++)
d->lens[i] = 9;
for (; i < 280; i++)
d->lens[i] = 7;
for (; i < 288; i++)
d->lens[i] = 8;
for (; i < 288 + 32; i++)
d->lens[i] = 5;
num_litlen_syms = 288;
num_offset_syms = 32;
}
/* Decompressing a Huffman block (either dynamic or static) */
SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
/* The main DEFLATE decode loop */
for (;;) {
u32 entry;
u32 length;
u32 offset;
/* Decode a litlen symbol. */
ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
entry = d->litlen_decode_table[BITS(LITLEN_TABLEBITS)];
if (entry & HUFFDEC_SUBTABLE_POINTER) {
/* Litlen subtable required (uncommon case) */
REMOVE_BITS(LITLEN_TABLEBITS);
entry = d->litlen_decode_table[
((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
BITS(entry & HUFFDEC_LENGTH_MASK)];
}
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
if (entry & HUFFDEC_LITERAL) {
/* Literal */
SAFETY_CHECK(out_next < out_end);
*out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
continue;
}
/* Match or end-of-block */
entry >>= HUFFDEC_RESULT_SHIFT;
ENSURE_BITS(MAX_ENSURE);
/* Pop the extra length bits and add them to the length base to
* produce the full length. */
length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
/* The match destination must not end after the end of the
* output buffer. For efficiency, combine this check with the
* end-of-block check. We're using 0 for the special
* end-of-block length, so subtract 1 to turn it into
* SIZE_MAX. */
STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
if (unlikely((size_t)length - 1 > out_end - out_next)) {
SAFETY_CHECK(length == HUFFDEC_END_OF_BLOCK_LENGTH);
goto block_done;
}
/* Decode the match offset. */
entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
if (entry & HUFFDEC_SUBTABLE_POINTER) {
/* Offset subtable required (uncommon case) */
REMOVE_BITS(OFFSET_TABLEBITS);
entry = d->offset_decode_table[
((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
BITS(entry & HUFFDEC_LENGTH_MASK)];
}
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
entry >>= HUFFDEC_RESULT_SHIFT;
STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
DEFLATE_MAX_OFFSET_CODEWORD_LEN +
DEFLATE_MAX_EXTRA_OFFSET_BITS))
ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);
/* Pop the extra offset bits and add them to the offset base to
* produce the full offset. */
offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
/* The match source must not begin before the beginning of the
* output buffer. */
SAFETY_CHECK(offset <= out_next - (const u8 *)out);
/* Copy the match: 'length' bytes at 'out_next - offset' to
* 'out_next'. */
if (UNALIGNED_ACCESS_IS_FAST &&
length <= (3 * WORDSIZE) &&
offset >= WORDSIZE &&
length + (3 * WORDSIZE) <= out_end - out_next)
{
/* Fast case: short length, no overlaps if we copy one
* word at a time, and we aren't getting too close to
* the end of the output array. */
copy_word_unaligned(out_next - offset + (0 * WORDSIZE),
out_next + (0 * WORDSIZE));
copy_word_unaligned(out_next - offset + (1 * WORDSIZE),
out_next + (1 * WORDSIZE));
copy_word_unaligned(out_next - offset + (2 * WORDSIZE),
out_next + (2 * WORDSIZE));
} else {
const u8 *src = out_next - offset;
u8 *dst = out_next;
u8 *end = out_next + length;
if (UNALIGNED_ACCESS_IS_FAST &&
likely(out_end - end >= WORDSIZE - 1)) {
if (offset >= WORDSIZE) {
copy_word_unaligned(src, dst);
src += WORDSIZE;
dst += WORDSIZE;
if (dst < end) {
do {
copy_word_unaligned(src, dst);
src += WORDSIZE;
dst += WORDSIZE;
} while (dst < end);
}
} else if (offset == 1) {
machine_word_t v = repeat_byte(*(dst - 1));
do {
store_word_unaligned(v, dst);
src += WORDSIZE;
dst += WORDSIZE;
} while (dst < end);
} else {
*dst++ = *src++;
*dst++ = *src++;
do {
*dst++ = *src++;
} while (dst < end);
}
} else {
*dst++ = *src++;
*dst++ = *src++;
do {
*dst++ = *src++;
} while (dst < end);
}
}
out_next += length;
}
block_done:
/* Finished decoding a block. */
if (!is_final_block)
goto next_block;
/* That was the last block. Return %true if we got all the output we
* expected, otherwise %false. */
return (out_next == out_end);
#if DISPATCH_ENABLED
return (*decompress_impl)(d, in, in_nbytes, out, out_nbytes);
#else
return deflate_decompress_default(d, in, in_nbytes, out, out_nbytes);
#endif
}
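/*
 * Usage sketch (assumes 'd' was obtained from the allocation routine defined
 * below, and that the caller knows the exact uncompressed size in advance):
 *
 *	if (!deflate_decompress(d, in_buf, in_size, out_buf, out_size))
 *		...the data was invalid or did not expand to exactly out_size...
 */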
LIBEXPORT struct deflate_decompressor *

src/x86_cpu_features.c (new file, 145 lines)

@@ -0,0 +1,145 @@
/*
* x86_cpu_features.c - feature detection for x86 processors
*
* Author: Eric Biggers
* Year: 2015
*
* The author dedicates this file to the public domain.
* You can do whatever you want with this file.
*/
#include "x86_cpu_features.h"
#ifdef X86_CPU_FEATURES_ENABLED
#define DEBUG 0
#if DEBUG
# include <stdio.h>
#endif
u32 _x86_cpu_features = 0;
/* With old GCC versions we have to manually save and restore the x86_32 PIC
* register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */
#if defined(__i386__) && defined(__PIC__)
# define EBX_CONSTRAINT "=r"
#else
# define EBX_CONSTRAINT "=b"
#endif
/* Execute the CPUID instruction. */
static inline void
cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
{
__asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n"
"cpuid \n"
".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
: "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
: "a" (leaf), "c" (subleaf));
}
/* Read an extended control register. */
static inline u64
read_xcr(u32 index)
{
u32 edx, eax;
/* Execute the "xgetbv" instruction. Old versions of binutils do not
* recognize this instruction, so list the raw bytes instead. */
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index));
return ((u64)edx << 32) | eax;
}
#define IS_SET(reg, bit) ((reg) & ((u32)1 << (bit)))
/* Initialize _x86_cpu_features with bits for interesting processor features. */
void
x86_setup_cpu_features(void)
{
u32 features = 0;
u32 dummy1, dummy2, dummy3, dummy4;
u32 max_function;
u32 features_1, features_2, features_3, features_4;
bool os_saves_ymm_regs = false;
/* Get maximum supported function */
cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4);
if (max_function < 1)
goto out;
/* Standard feature flags */
cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1);
if (IS_SET(features_1, 25))
features |= X86_CPU_FEATURE_SSE;
if (IS_SET(features_1, 26))
features |= X86_CPU_FEATURE_SSE2;
if (IS_SET(features_2, 0))
features |= X86_CPU_FEATURE_SSE3;
if (IS_SET(features_2, 9))
features |= X86_CPU_FEATURE_SSSE3;
if (IS_SET(features_2, 19))
features |= X86_CPU_FEATURE_SSE4_1;
if (IS_SET(features_2, 20))
features |= X86_CPU_FEATURE_SSE4_2;
if (IS_SET(features_2, 27)) /* OSXSAVE set? */
if ((read_xcr(0) & 0x6) == 0x6)
os_saves_ymm_regs = true;
if (os_saves_ymm_regs && IS_SET(features_2, 28))
features |= X86_CPU_FEATURE_AVX;
if (max_function < 7)
goto out;
/* Extended feature flags */
cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4);
if (IS_SET(features_3, 3))
features |= X86_CPU_FEATURE_BMI;
if (os_saves_ymm_regs && IS_SET(features_3, 5))
features |= X86_CPU_FEATURE_AVX2;
if (IS_SET(features_3, 8))
features |= X86_CPU_FEATURE_BMI2;
out:
#if DEBUG
printf("Detected x86 CPU features: ");
if (features & X86_CPU_FEATURE_SSE)
printf("SSE ");
if (features & X86_CPU_FEATURE_SSE2)
printf("SSE2 ");
if (features & X86_CPU_FEATURE_SSE3)
printf("SSE3 ");
if (features & X86_CPU_FEATURE_SSSE3)
printf("SSSE3 ");
if (features & X86_CPU_FEATURE_SSE4_1)
printf("SSE4.1 ");
if (features & X86_CPU_FEATURE_SSE4_2)
printf("SSE4.2 ");
if (features & X86_CPU_FEATURE_BMI)
printf("BMI ");
if (features & X86_CPU_FEATURE_AVX)
printf("AVX ");
if (features & X86_CPU_FEATURE_BMI2)
printf("BMI2 ");
if (features & X86_CPU_FEATURE_AVX2)
printf("AVX2 ");
printf("\n");
#endif /* DEBUG */
_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN;
}
#endif /* X86_CPU_FEATURES_ENABLED */

src/x86_cpu_features.h (new file, 43 lines)

@@ -0,0 +1,43 @@
/*
* x86_cpu_features.h - feature detection for x86 processors
*/
#pragma once
#include "util.h"
#if RUNTIME_CPU_DETECTION && defined(__GNUC__) && defined(__x86_64__)
# define X86_CPU_FEATURES_ENABLED 1
#endif
#if X86_CPU_FEATURES_ENABLED
#define X86_CPU_FEATURE_SSE 0x00000001
#define X86_CPU_FEATURE_SSE2 0x00000002
#define X86_CPU_FEATURE_SSE3 0x00000004
#define X86_CPU_FEATURE_SSSE3 0x00000008
#define X86_CPU_FEATURE_SSE4_1 0x00000010
#define X86_CPU_FEATURE_SSE4_2 0x00000020
#define X86_CPU_FEATURE_AVX 0x00000040
#define X86_CPU_FEATURE_BMI 0x00000080
#define X86_CPU_FEATURE_AVX2 0x00000100
#define X86_CPU_FEATURE_BMI2 0x00000200
#define X86_CPU_FEATURES_KNOWN 0x80000000
extern u32 _x86_cpu_features;
extern void
x86_setup_cpu_features(void);
/* Does the processor have the specified feature? */
static inline bool
x86_have_cpu_feature(u32 feature)
{
if (_x86_cpu_features == 0)
x86_setup_cpu_features();
return _x86_cpu_features & feature;
}
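/*
 * Example (sketch): pick a BMI2-targeted routine at runtime, as
 * deflate_decompress.c does in its dispatch function:
 *
 *	if (x86_have_cpu_feature(X86_CPU_FEATURE_BMI2))
 *		f = deflate_decompress_bmi2;
 */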
#endif /* X86_CPU_FEATURES_ENABLED */