diff --git a/libdeflate.h b/libdeflate.h
index d41609e..bae28ad 100644
--- a/libdeflate.h
+++ b/libdeflate.h
@@ -94,8 +94,10 @@ deflate_alloc_decompressor(void);
  * 'out_nbytes', to 'out'.  The return value is true if decompression was
  * successful, or false if the compressed data was invalid.
  *
- * To be clear: the uncompressed size must be known *exactly* and passed as
- * 'out_nbytes'.
+ * Note that the uncompressed size must be known *exactly* and passed as
+ * 'out_nbytes'.  This is because this API is designed for block-based
+ * compression where the uncompressed size should have already been stored
+ * elsewhere.
  */
 extern bool
 deflate_decompress(struct deflate_decompressor *decompressor,
diff --git a/src/deflate_decompress.c b/src/deflate_decompress.c
index 0eb40c9..9344b88 100644
--- a/src/deflate_decompress.c
+++ b/src/deflate_decompress.c
@@ -2,7 +2,7 @@
  * deflate_decompress.c - a decompressor for DEFLATE
  *
  * Author:	Eric Biggers
- * Year:	2014, 2015
+ * Year:	2014, 2015, 2016
  *
  * The author dedicates this file to the public domain.
  * You can do whatever you want with this file.
@@ -10,12 +10,9 @@
  * ---------------------------------------------------------------------------
  *
  * This is a highly optimized DEFLATE decompressor.  On x86_64 it decompresses
- * data in about 59% of the time of zlib.  On other architectures it should
+ * data in about 52% of the time of zlib.  On other architectures it should
  * still be significantly faster than zlib, but the difference may be smaller.
  *
- * This decompressor currently only supports raw DEFLATE (not zlib or gzip), and
- * it only supports whole-buffer decompression (not streaming).
- *
  * Why this is faster than zlib's implementation:
  *
  * - Word accesses rather than byte accesses when reading input
@@ -35,28 +32,48 @@
 #include "deflate_constants.h"
 #include "unaligned.h"
 
-#ifndef UNSAFE_DECOMPRESSION
-#  define UNSAFE_DECOMPRESSION 0
-#endif
-
+/* By default, if the expression passed to SAFETY_CHECK() evaluates to false,
+ * then deflate_decompress() immediately returns false as the compressed data is
+ * invalid.  But if unsafe decompression is enabled, then the value of the
+ * expression is ignored, allowing the compiler to optimize out some code.  */
 #if UNSAFE_DECOMPRESSION
-#  warning "unsafe decompression is enabled"
-#  define SAFETY_CHECK(expr) 0
+#  warning "UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!"
+#  define SAFETY_CHECK(expr)	(void)(expr)
 #else
-#  define SAFETY_CHECK(expr) unlikely(expr)
+#  define SAFETY_CHECK(expr)	if (unlikely(!(expr))) return false
 #endif
 
 /*
- * Each of these values is the base 2 logarithm of the number of entries of the
- * corresponding decode table.  Each value should be large enough to ensure that
- * for typical data, the vast majority of symbols can be decoded by a direct
- * lookup of the next TABLEBITS bits of compressed data.  However, this must be
- * balanced against the fact that a larger table requires more memory and
- * requires more time to fill.
+ * Each TABLEBITS number is the base-2 logarithm of the number of entries in the
+ * main portion of the corresponding decode table.  Each number should be large
+ * enough to ensure that for typical data, the vast majority of symbols can be
+ * decoded by a direct lookup of the next TABLEBITS bits of compressed data.
+ * However, this must be balanced against the fact that a larger table requires
+ * more memory and requires more time to fill.
+ *
+ * Note: you cannot change a TABLEBITS number without also changing the
+ * corresponding ENOUGH number!
  */
-#define DEFLATE_PRECODE_TABLEBITS	7
-#define DEFLATE_LITLEN_TABLEBITS	10
-#define DEFLATE_OFFSET_TABLEBITS	9
+#define PRECODE_TABLEBITS	7
+#define LITLEN_TABLEBITS	10
+#define OFFSET_TABLEBITS	8
+
+/*
+ * Each ENOUGH number is the maximum number of decode table entries that may be
+ * required for the corresponding Huffman code, including the main table and all
+ * subtables.  Each number depends on three parameters:
+ *
+ *	(1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
+ *	(2) the number of main table bits (the TABLEBITS numbers defined above)
+ *	(3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
+ *
+ * The ENOUGH numbers were computed using the utility program 'enough' from
+ * zlib.  This program enumerates all possible relevant Huffman codes to find
+ * the worst-case usage of decode table entries.
+ */
+#define PRECODE_ENOUGH		128	/* enough 19 7 7	*/
+#define LITLEN_ENOUGH		1334	/* enough 288 10 15	*/
+#define OFFSET_ENOUGH		402	/* enough 32 8 15	*/
 
 /*
  * Type for codeword lengths.
@@ -88,16 +105,13 @@ struct deflate_decompressor {
 				   DEFLATE_NUM_OFFSET_SYMS +
 				   DEFLATE_MAX_LENS_OVERRUN];
 
-			u32 precode_decode_table[(1 << DEFLATE_PRECODE_TABLEBITS) +
-						 (2 * DEFLATE_NUM_PRECODE_SYMS)];
+			u32 precode_decode_table[PRECODE_ENOUGH];
 		};
 
-		u32 litlen_decode_table[(1 << DEFLATE_LITLEN_TABLEBITS) +
-					(2 * DEFLATE_NUM_LITLEN_SYMS)];
+		u32 litlen_decode_table[LITLEN_ENOUGH];
 	};
 
-	u32 offset_decode_table[(1 << DEFLATE_OFFSET_TABLEBITS) +
-				(2 * DEFLATE_NUM_OFFSET_SYMS)];
+	u32 offset_decode_table[OFFSET_ENOUGH];
 
 	u16 working_space[2 * (DEFLATE_MAX_CODEWORD_LEN + 1) +
 			  DEFLATE_MAX_NUM_SYMS];
@@ -128,6 +142,9 @@ struct deflate_decompressor {
 /*
  * The type for the bitbuffer variable ('bitbuf' described above).  For best
  * performance, this should have size equal to a machine word.
+ *
+ * 64-bit platforms have a significant advantage: they get a bigger bitbuffer
+ * which they have to fill less often.
  */
 typedef machine_word_t bitbuf_t;
 
@@ -289,24 +306,22 @@ typedef machine_word_t bitbuf_t;
  *****************************************************************************/
 
 /*
- * A decode table for order TABLEBITS contains (1 << TABLEBITS) entries, plus
- * additional entries for non-root binary tree nodes.  The number of non-root
- * binary tree nodes is variable, but cannot possibly be more than twice the
- * number of symbols in the alphabet for which the decode table is built.
+ * A decode table for order TABLEBITS consists of a main table of (1 <<
+ * TABLEBITS) entries followed by a variable number of subtables.
  *
  * The decoding algorithm takes the next TABLEBITS bits of compressed data and
  * uses them as an index into the decode table.  The resulting entry is either a
- * "direct entry", meaning that it contains the value desired, or a "tree root
- * entry", meaning that it is the root of a binary tree that must be traversed
- * using more bits of the compressed data (0 bit means go to the left child, 1
- * bit means go to the right child) until a leaf is reached.
+ * "direct entry", meaning that it contains the value desired, or a "subtable
+ * pointer", meaning that the entry references a subtable that must be indexed
+ * using more bits of the compressed data to decode the symbol.
  *
- * Each decode table is associated with a Huffman code.  Logically, the result
- * of a decode table lookup is a symbol from the alphabet from which the
- * corresponding Huffman code was constructed.  A symbol with codeword length n
- * <= TABLEBITS is associated with 2**(TABLEBITS - n) direct entries in the
- * table, whereas a symbol with codeword length n > TABLEBITS shares a binary
- * tree with a number of other codewords.
+ * Each decode table (a main table along with its subtables, if any) is
+ * associated with a Huffman code.  Logically, the result of a decode table
+ * lookup is a symbol from the alphabet from which the corresponding Huffman
+ * code was constructed.  A symbol with codeword length n <= TABLEBITS is
+ * associated with 2**(TABLEBITS - n) direct entries in the table, whereas a
+ * symbol with codeword length n > TABLEBITS is associated with one or more
+ * subtable entries.
  *
  * On top of this basic design, we implement several optimizations:
  *
@@ -320,148 +335,125 @@ typedef machine_word_t bitbuf_t;
  *   offset bits directly rather than decoding the offset symbol and then
  *   looking up both of those values in an additional table or tables.
  *
- * - It can be possible to decode more than just a single Huffman symbol from
- *   the next TABLEBITS bits of the input.  We take advantage of this when
- *   decoding match lengths.  When possible, the decode table entry will provide
- *   the full match length.  In this case, the stored "codeword length" will
- *   actually be the codeword length plus the number of extra length bits that
- *   are being consumed.
- *
  * The size of each decode table entry is 32 bits, which provides slightly
  * better performance than 16-bit entries on 32 and 64 bit processers, provided
  * that the table doesn't get so large that it takes up too much memory and
  * starts generating cache misses.  The bits of each decode table entry are
  * defined as follows:
  *
- * - Bits 29 -- 31: flags (see below)
- * - Bits 25 -- 28: codeword length
- * - Bits 0 -- 24: decode result: a Huffman symbol or related data
+ * - Bits 30 -- 31: flags (see below)
+ * - Bits 8 -- 29: decode result: a Huffman symbol or related data
+ * - Bits 0 -- 7: codeword length
  */
 
 /*
- * Flags usage:
- *
- * The precode and offset tables only use these flags to distinguish nonleaf
- * tree nodes from other entries.  In nonleaf tree node entries, all flags are
- * set and the recommended one to test is HUFFDEC_TREE_NONLEAF_FAST_FLAG.
- *
- * The literal/length decode table uses all the flags.  During decoding, the
- * flags are designed to be tested from high to low.  If a flag is set, then all
- * higher flags are also set.
+ * This flag is set in all main decode table entries that represent subtable
+ * pointers.
  */
+#define HUFFDEC_SUBTABLE_POINTER	0x80000000
 
 /*
- * This flag is set in all entries that do not represent a literal symbol,
- * excluding tree leaves.  This enables a very fast path for non-rare literals:
- * just check if this bit is clear, and if so extract the literal from the low
- * bits.
+ * This flag is set in all entries in the litlen decode table that represent
+ * literals.
  */
-#define HUFFDEC_NOT_LITERAL		0x80000000
+#define HUFFDEC_LITERAL			0x40000000
 
-/*
- * This flag is set in all entries that represent neither a literal symbol nor a
- * full match length, excluding tree leaves.
- */
-#define HUFFDEC_NOT_FULL_LENGTH		0x40000000
+/* Mask for extracting the codeword length from a decode table entry.  */
+#define HUFFDEC_LENGTH_MASK		0xFF
 
-/*
- * This flag is set in all nonleaf tree entries (roots and internal nodes).
- */
-#define HUFFDEC_TREE_NONLEAF		0x20000000
+/* Shift to extract the decode result from a decode table entry.  */
+#define HUFFDEC_RESULT_SHIFT		8
 
-/*
- * HUFFDEC_TREE_NONLEAF implies that the following flags are also set.
- */
-#define HUFFDEC_TREE_NONLEAF_FLAGS	0xE0000000
+/* The decode result for each precode symbol.  There is no special optimization
+ * for the precode; the decode result is simply the symbol value.  */
+static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = {
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+};
 
-/*
- * For distinguishing between any direct entry and a tree root, or between an
- * internal tree node and a leaf node, this bit should be checked in preference
- * to any other in HUFFDEC_TREE_NONLEAF_FLAGS --- the reason being this is the
- * sign bit, and some architectures have special instructions to handle it.
- */
-#define HUFFDEC_TREE_NONLEAF_FAST_FLAG	0x80000000
+/* The decode result for each litlen symbol.  For literals, this is the literal
+ * value itself and the HUFFDEC_LITERAL flag.  For lengths, this is the length
+ * base and the number of extra length bits.  */
+static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = {
+#define ENTRY(literal)	((HUFFDEC_LITERAL >> HUFFDEC_RESULT_SHIFT) | (literal))
 
-/*
- * Number of flag bits defined above.
- */
-#define HUFFDEC_NUM_FLAG_BITS	3
-
-/*
- * Number of bits reserved for the codeword length in decode table entries, and
- * the corresponding mask and limit.  4 bits provides a max length of 15, which
- * is enough for any DEFLATE codeword.  (And actually, we don't even need the
- * full 15 because only lengths less than or equal to the appropriate TABLEBITS
- * will ever be stored in this field.)
- */
-#define HUFFDEC_LEN_BITS	4
-#define HUFFDEC_LEN_MASK	(((u32)1 << HUFFDEC_LEN_BITS) - 1)
-#define HUFFDEC_MAX_LEN		HUFFDEC_LEN_MASK
-
-/*
- * Value by which a decode table entry can be right-shifted to get the length
- * field.  Note: the result must be AND-ed with HUFFDEC_LEN_MASK unless it is
- * guaranteed that no flag bits are set.
- */
-#define HUFFDEC_LEN_SHIFT	(32 - HUFFDEC_NUM_FLAG_BITS - HUFFDEC_LEN_BITS)
-
-/*
- * Mask to get the "value" of a decode table entry.  This is the decode result
- * and contains data dependent on the table.
- */
-#define HUFFDEC_VALUE_MASK	(((u32)1 << HUFFDEC_LEN_SHIFT) - 1)
-
-/*
- * Data needed to initialize the entries in the length/literal decode table.
- */
-static const u32 deflate_litlen_symbol_data[DEFLATE_NUM_LITLEN_SYMS] = {
 	/* Literals  */
-	0   , 1   , 2   , 3   , 4   , 5   , 6   , 7   ,
-	8   , 9   , 10  , 11  , 12  , 13  , 14  , 15  ,
-	16  , 17  , 18  , 19  , 20  , 21  , 22  , 23  ,
-	24  , 25  , 26  , 27  , 28  , 29  , 30  , 31  ,
-	32  , 33  , 34  , 35  , 36  , 37  , 38  , 39  ,
-	40  , 41  , 42  , 43  , 44  , 45  , 46  , 47  ,
-	48  , 49  , 50  , 51  , 52  , 53  , 54  , 55  ,
-	56  , 57  , 58  , 59  , 60  , 61  , 62  , 63  ,
-	64  , 65  , 66  , 67  , 68  , 69  , 70  , 71  ,
-	72  , 73  , 74  , 75  , 76  , 77  , 78  , 79  ,
-	80  , 81  , 82  , 83  , 84  , 85  , 86  , 87  ,
-	88  , 89  , 90  , 91  , 92  , 93  , 94  , 95  ,
-	96  , 97  , 98  , 99  , 100 , 101 , 102 , 103 ,
-	104 , 105 , 106 , 107 , 108 , 109 , 110 , 111 ,
-	112 , 113 , 114 , 115 , 116 , 117 , 118 , 119 ,
-	120 , 121 , 122 , 123 , 124 , 125 , 126 , 127 ,
-	128 , 129 , 130 , 131 , 132 , 133 , 134 , 135 ,
-	136 , 137 , 138 , 139 , 140 , 141 , 142 , 143 ,
-	144 , 145 , 146 , 147 , 148 , 149 , 150 , 151 ,
-	152 , 153 , 154 , 155 , 156 , 157 , 158 , 159 ,
-	160 , 161 , 162 , 163 , 164 , 165 , 166 , 167 ,
-	168 , 169 , 170 , 171 , 172 , 173 , 174 , 175 ,
-	176 , 177 , 178 , 179 , 180 , 181 , 182 , 183 ,
-	184 , 185 , 186 , 187 , 188 , 189 , 190 , 191 ,
-	192 , 193 , 194 , 195 , 196 , 197 , 198 , 199 ,
-	200 , 201 , 202 , 203 , 204 , 205 , 206 , 207 ,
-	208 , 209 , 210 , 211 , 212 , 213 , 214 , 215 ,
-	216 , 217 , 218 , 219 , 220 , 221 , 222 , 223 ,
-	224 , 225 , 226 , 227 , 228 , 229 , 230 , 231 ,
-	232 , 233 , 234 , 235 , 236 , 237 , 238 , 239 ,
-	240 , 241 , 242 , 243 , 244 , 245 , 246 , 247 ,
-	248 , 249 , 250 , 251 , 252 , 253 , 254 , 255 ,
+	ENTRY(0)   , ENTRY(1)   , ENTRY(2)   , ENTRY(3)   ,
+	ENTRY(4)   , ENTRY(5)   , ENTRY(6)   , ENTRY(7)   ,
+	ENTRY(8)   , ENTRY(9)   , ENTRY(10)  , ENTRY(11)  ,
+	ENTRY(12)  , ENTRY(13)  , ENTRY(14)  , ENTRY(15)  ,
+	ENTRY(16)  , ENTRY(17)  , ENTRY(18)  , ENTRY(19)  ,
+	ENTRY(20)  , ENTRY(21)  , ENTRY(22)  , ENTRY(23)  ,
+	ENTRY(24)  , ENTRY(25)  , ENTRY(26)  , ENTRY(27)  ,
+	ENTRY(28)  , ENTRY(29)  , ENTRY(30)  , ENTRY(31)  ,
+	ENTRY(32)  , ENTRY(33)  , ENTRY(34)  , ENTRY(35)  ,
+	ENTRY(36)  , ENTRY(37)  , ENTRY(38)  , ENTRY(39)  ,
+	ENTRY(40)  , ENTRY(41)  , ENTRY(42)  , ENTRY(43)  ,
+	ENTRY(44)  , ENTRY(45)  , ENTRY(46)  , ENTRY(47)  ,
+	ENTRY(48)  , ENTRY(49)  , ENTRY(50)  , ENTRY(51)  ,
+	ENTRY(52)  , ENTRY(53)  , ENTRY(54)  , ENTRY(55)  ,
+	ENTRY(56)  , ENTRY(57)  , ENTRY(58)  , ENTRY(59)  ,
+	ENTRY(60)  , ENTRY(61)  , ENTRY(62)  , ENTRY(63)  ,
+	ENTRY(64)  , ENTRY(65)  , ENTRY(66)  , ENTRY(67)  ,
+	ENTRY(68)  , ENTRY(69)  , ENTRY(70)  , ENTRY(71)  ,
+	ENTRY(72)  , ENTRY(73)  , ENTRY(74)  , ENTRY(75)  ,
+	ENTRY(76)  , ENTRY(77)  , ENTRY(78)  , ENTRY(79)  ,
+	ENTRY(80)  , ENTRY(81)  , ENTRY(82)  , ENTRY(83)  ,
+	ENTRY(84)  , ENTRY(85)  , ENTRY(86)  , ENTRY(87)  ,
+	ENTRY(88)  , ENTRY(89)  , ENTRY(90)  , ENTRY(91)  ,
+	ENTRY(92)  , ENTRY(93)  , ENTRY(94)  , ENTRY(95)  ,
+	ENTRY(96)  , ENTRY(97)  , ENTRY(98)  , ENTRY(99)  ,
+	ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) ,
+	ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) ,
+	ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) ,
+	ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) ,
+	ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) ,
+	ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) ,
+	ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) ,
+	ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) ,
+	ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) ,
+	ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) ,
+	ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) ,
+	ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) ,
+	ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) ,
+	ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) ,
+	ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) ,
+	ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) ,
+	ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) ,
+	ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) ,
+	ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) ,
+	ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) ,
+	ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) ,
+	ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) ,
+	ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) ,
+	ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) ,
+	ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) ,
+	ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) ,
+	ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) ,
+	ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) ,
+	ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) ,
+	ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) ,
+	ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) ,
+	ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) ,
+	ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) ,
+	ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) ,
+	ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) ,
+	ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) ,
+	ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) ,
+	ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) ,
+	ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) ,
+#undef ENTRY
 
-#define HUFFDEC_NUM_BITS_FOR_EXTRA_LENGTH_BITS	3
-#define HUFFDEC_MAX_EXTRA_LENGTH_BITS	(((u32)1 << HUFFDEC_NUM_BITS_FOR_EXTRA_LENGTH_BITS) - 1)
-#define HUFFDEC_EXTRA_LENGTH_BITS_SHIFT (HUFFDEC_LEN_SHIFT - HUFFDEC_NUM_BITS_FOR_EXTRA_LENGTH_BITS)
-#define HUFFDEC_LENGTH_BASE_MASK	(((u32)1 << HUFFDEC_EXTRA_LENGTH_BITS_SHIFT) - 1)
+#define HUFFDEC_EXTRA_LENGTH_BITS_MASK	0xFF
+#define HUFFDEC_LENGTH_BASE_SHIFT	8
 #define HUFFDEC_END_OF_BLOCK_LENGTH	0
 
 #define ENTRY(length_base, num_extra_bits) \
-	(256 + (length_base) + ((num_extra_bits) << HUFFDEC_EXTRA_LENGTH_BITS_SHIFT))
+	(((u32)(length_base) << HUFFDEC_LENGTH_BASE_SHIFT) | (num_extra_bits))
 
 	/* End of block  */
 	ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0),
 
-	/* Match length data  */
+	/* Lengths  */
 	ENTRY(3  , 0) , ENTRY(4  , 0) , ENTRY(5  , 0) , ENTRY(6  , 0),
 	ENTRY(7  , 0) , ENTRY(8  , 0) , ENTRY(9  , 0) , ENTRY(10 , 0),
 	ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1),
@@ -473,10 +465,9 @@ static const u32 deflate_litlen_symbol_data[DEFLATE_NUM_LITLEN_SYMS] = {
 #undef ENTRY
 };
 
-/*
- * Data needed to initialize the entries in the offset decode table.
- */
-static const u32 deflate_offset_symbol_data[DEFLATE_NUM_OFFSET_SYMS] = {
+/* The decode result for each offset symbol.  This is the offset base and the
+ * number of extra offset bits.  */
+static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
 
 #define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16
 #define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1)
@@ -494,133 +485,55 @@ static const u32 deflate_offset_symbol_data[DEFLATE_NUM_OFFSET_SYMS] = {
 #undef ENTRY
 };
 
-/* Construct a direct decode table entry (not a tree node)  */
+/* Construct a decode table entry from a decode result and codeword length.  */
 static forceinline u32
-make_direct_entry(u32 value, u32 length)
+make_decode_table_entry(u32 result, u32 length)
 {
-	return (length << HUFFDEC_LEN_SHIFT) | value;
+	return (result << HUFFDEC_RESULT_SHIFT) | length;
 }
 
 /*
- * The following functions define the way entries are created for each decode
- * table.  Note that these will all be inlined into build_decode_table(), which
- * will itself be inlined for each decode table.  This is important for
- * performance because the make_*_entry() functions get called from the inner
- * loop of build_decode_table().
- */
-
-static forceinline u32
-make_litlen_direct_entry(unsigned symbol, unsigned codeword_length,
-			 unsigned *extra_mask_ret)
-{
-	u32 entry_value = deflate_litlen_symbol_data[symbol];
-	u32 entry_length = codeword_length;
-	unsigned length_bits;
-	u32 length_base;
-
-	STATIC_ASSERT(DEFLATE_MAX_EXTRA_LENGTH_BITS <=
-		      HUFFDEC_MAX_EXTRA_LENGTH_BITS);
-
-	if (symbol >= 256) {
-		/* Match, not a literal.  (This can also be the special
-		 * end-of-block symbol, which we handle identically.)  */
-		entry_value -= 256;
-		length_bits = entry_value >> HUFFDEC_EXTRA_LENGTH_BITS_SHIFT;
-		length_base = entry_value & HUFFDEC_LENGTH_BASE_MASK;
-		if (codeword_length + length_bits <= DEFLATE_LITLEN_TABLEBITS) {
-			/* TABLEBITS is enough to decode the length slot as well
-			 * as all the extra length bits.  So store the full
-			 * length in the decode table entry.
-			 *
-			 * Note that a length slot may be used for multiple
-			 * lengths, and multiple decode table entries may map to
-			 * the same length; hence the need for the 'extra_mask',
-			 * which allows build_decode_table() to cycle through
-			 * the lengths that use this length slot.  */
-			entry_value = length_base;
-			entry_length += length_bits;
-			*extra_mask_ret = (1U << length_bits) - 1;
-		} else {
-			/* TABLEBITS isn't enough to decode all the extra length
-			 * bits.  The decoder will have to decode the extra bits
-			 * separately.  This is the less common case.  */
-			entry_value |= HUFFDEC_NOT_FULL_LENGTH;
-		}
-		entry_value |= HUFFDEC_NOT_LITERAL;
-	}
-
-	return make_direct_entry(entry_value, entry_length);
-}
-
-static forceinline u32
-make_litlen_leaf_entry(unsigned sym)
-{
-	return deflate_litlen_symbol_data[sym];
-}
-
-static forceinline u32
-make_offset_direct_entry(unsigned sym, unsigned codeword_len, unsigned *extra_mask_ret)
-{
-	return make_direct_entry(deflate_offset_symbol_data[sym], codeword_len);
-}
-
-static forceinline u32
-make_offset_leaf_entry(unsigned sym)
-{
-	return deflate_offset_symbol_data[sym];
-}
-
-static forceinline u32
-make_pre_direct_entry(unsigned sym, unsigned codeword_len, unsigned *extra_mask_ret)
-{
-	return make_direct_entry(sym, codeword_len);
-}
-
-static forceinline u32
-make_pre_leaf_entry(unsigned sym)
-{
-	return sym;
-}
-
-/*
- * Build a table for fast Huffman decoding, using bit-reversed codewords.
- *
- * The Huffman code is assumed to be in canonical form and is specified by its
- * codeword lengths only.
+ * Build a table for fast decoding of symbols from a Huffman code.  As input,
+ * this function takes the codeword length of each symbol which may be used in
+ * the code.  As output, it produces a decode table for the canonical Huffman
+ * code described by the codeword lengths.  The decode table is built with the
+ * assumption that it will be indexed with "bit-reversed" codewords, where the
+ * low-order bit is the first bit of the codeword.  This format is used for all
+ * Huffman codes in DEFLATE.
  *
  * @decode_table
- *	A table with ((1 << table_bits) + (2 * num_syms)) entries.  The format
- *	of the table has been described in previous comments.
+ *	The array in which the decode table will be generated.  This array must
+ *	have sufficient length; see the definition of the ENOUGH numbers.
  * @lens
- *	Lengths of the Huffman codewords.  'lens[sym]' specifies the length, in
- *	bits, of the codeword for symbol 'sym'.  If a symbol is not used in the
- *	code, its length must be specified as 0.  It is valid for this parameter
- *	to alias @decode_table because nothing gets written to @decode_table
- *	until all information in @lens has been consumed.
+ *	An array which provides, for each symbol, the length of the
+ *	corresponding codeword in bits, or 0 if the symbol is unused.  This may
+ *	alias @decode_table, since nothing is written to @decode_table until all
+ *	@lens have been consumed.  All codeword lengths are assumed to be <=
+ *	@max_codeword_len but are otherwise considered untrusted.  If they do
+ *	not form a valid Huffman code, then the decode table is not built and
+ *	%false is returned.
  * @num_syms
- *	Number of symbols in the code.
- * @make_direct_entry
- *	Function to create a direct decode table entry, given the symbol and
- *	codeword length.
- * @make_leaf_entry
- *	Function to create a tree decode table entry, at a tree leaf, given the
- *	symbol.
+ *	The number of symbols in the code, including all unused symbols.
+ * @decode_results
+ *	An array which provides, for each symbol, the actual value to store into
+ *	the decode table.  This value will be directly produced as the result of
+ *	decoding that symbol, thereby moving the indirection out of the decode
+ *	loop and into the table initialization.
  * @table_bits
- *	log base 2 of the size of the direct lookup portion of the decode table.
+ *	The log base-2 of the number of main table entries to use.
  * @max_codeword_len
- *	Maximum allowed codeword length for this Huffman code.
+ *	The maximum allowed codeword length for this Huffman code.
  * @working_space
  *	A temporary array of length '2 * (@max_codeword_len + 1) + @num_syms'.
  *
  * Returns %true if successful; %false if the codeword lengths do not form a
  * valid Huffman code.
  */
-static forceinline bool
+static bool
 build_decode_table(u32 decode_table[],
 		   const len_t lens[],
 		   const unsigned num_syms,
-		   u32 (*make_direct_entry)(unsigned, unsigned, unsigned *),
-		   u32 (*make_leaf_entry)(unsigned),
+		   const u32 decode_results[],
 		   const unsigned table_bits,
 		   const unsigned max_codeword_len,
 		   u16 working_space[])
@@ -628,25 +541,29 @@ build_decode_table(u32 decode_table[],
 	u16 * const len_counts = &working_space[0];
 	u16 * const offsets = &working_space[1 * (max_codeword_len + 1)];
 	u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)];
-	unsigned sym;
 	unsigned len;
+	unsigned sym;
 	s32 remainder;
 	unsigned sym_idx;
-	unsigned codeword_reversed;
 	unsigned codeword_len;
-	unsigned loop_count;
+	unsigned codeword_reversed = 0;
+	unsigned cur_codeword_prefix = -1;
+	unsigned cur_table_start = 0;
+	unsigned cur_table_bits = table_bits;
+	unsigned num_dropped_bits = 0;
+	const unsigned table_mask = (1U << table_bits) - 1;
 
-	/* Count how many symbols have each codeword length.  */
+	/* Count how many symbols have each codeword length, including 0.  */
 	for (len = 0; len <= max_codeword_len; len++)
 		len_counts[len] = 0;
 	for (sym = 0; sym < num_syms; sym++)
 		len_counts[lens[sym]]++;
 
-	/* We guarantee that all lengths are <= max_codeword_len, but we cannot
-	 * assume they form a valid prefix code.  A codeword of length n should
-	 * require a proportion of the codespace equaling (1/2)^n.  The code is
-	 * valid if and only if the codespace is exactly filled by the lengths
-	 * by this measure.  */
+	/* It is already guaranteed that all lengths are <= max_codeword_len,
+	 * but it cannot be assumed they form a valid prefix code.  A codeword
+	 * of length n should require a proportion of the codespace equaling
+	 * (1/2)^n.  The code is valid if and only if, by this measure, the
+	 * codespace is exactly filled by the lengths.  */
 	remainder = 1;
 	for (len = 1; len <= max_codeword_len; len++) {
 		remainder <<= 1;
@@ -669,9 +586,11 @@ build_decode_table(u32 decode_table[],
 			 * uninitialized memory if the algorithm nevertheless
 			 * attempts to decode symbols using such a code, we fill
 			 * the decode table with default values.  */
-			unsigned dummy;
-			for (unsigned i = 0; i < (1U << table_bits); i++)
-				decode_table[i] = (*make_direct_entry)(0, 1, &dummy);
+			for (unsigned i = 0; i < (1U << table_bits); i++) {
+				decode_table[i] =
+					make_decode_table_entry(
+							decode_results[0], 1);
+			}
 			return true;
 		}
 		return false;
@@ -681,7 +600,7 @@ build_decode_table(u32 decode_table[],
 	 */
 
 	/* Initialize 'offsets' so that offsets[len] is the number of codewords
-	 * shorter than 'len' bits.  */
+	 * shorter than 'len' bits, including length 0.  */
 	offsets[0] = 0;
 	for (len = 0; len < max_codeword_len; len++)
 		offsets[len + 1] = offsets[len] + len_counts[len];
@@ -690,167 +609,128 @@ build_decode_table(u32 decode_table[],
 	for (sym = 0; sym < num_syms; sym++)
 		sorted_syms[offsets[lens[sym]]++] = sym;
 
-	/* Generate entries for codewords with length <= 'table_bits'.
-	 * Start with codeword length 1 and proceed to longer codewords.  */
+	/* Generate the decode table entries.  Since we process codewords from
+	 * shortest to longest, the main portion of the decode table is filled
+	 * first; then the subtables are filled.  Note that it's already been
+	 * verified that the codewords form a valid (complete) prefix code.  */
+
+	/* Start with the index of the first used symbol.  */
 	sym_idx = offsets[0];
-	codeword_reversed = 0;
+
+	/* Start with the smallest used codeword length.  */
 	codeword_len = 1;
-	loop_count = (1U << (table_bits - codeword_len));
-	for (; loop_count != 0; codeword_len++, loop_count >>= 1) {
+	while (len_counts[codeword_len] == 0)
+		codeword_len++;
 
-		const unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
-		const unsigned increment = 1U << codeword_len;
-
-		/* Iterate through the symbols that have codewords with length
-		 * 'codeword_len'.  Since the code is assumed to be canonical,
-		 * we can generate the codewords by iterating in symbol order
-		 * and incrementing the current codeword by 1 each time.  */
-
-		for (; sym_idx < end_sym_idx; sym_idx++) {
-			unsigned sym;
-			u32 entry;
-			unsigned extra_mask;
-			unsigned extra;
-			unsigned i;
-			unsigned n;
-			unsigned bit;
-
-			sym = sorted_syms[sym_idx];
-			extra_mask = 0;
-			entry = (*make_direct_entry)(sym, codeword_len, &extra_mask);
-			extra = 0;
-			i = codeword_reversed;
-			n = loop_count;
-			do {
-				decode_table[i] = entry + extra;
-				extra = (extra + 1) & extra_mask;
-				i += increment;
-			} while (--n);
-
-			/* Increment the codeword by 1.  Since DEFLATE requires
-			 * bit-reversed codewords, we must manipulate bits
-			 * ourselves.  */
-			bit = 1U << (codeword_len - 1);
-			while (codeword_reversed & bit)
-				bit >>= 1;
-			codeword_reversed &= bit - 1;
-			codeword_reversed |= bit;
-		}
-	}
-
-	/* If we've filled in the entire table, we are done.  Otherwise, there
-	 * are codewords longer than 'table_bits' for which we must generate
-	 * binary trees.  */
-	if (max_codeword_len > table_bits &&
-	    offsets[table_bits] != offsets[max_codeword_len])
-	{
+	for (;;) {  /* For each used symbol and its codeword...  */
+		unsigned sym;
+		u32 entry;
 		unsigned i;
+		unsigned end;
+		unsigned increment;
 		unsigned bit;
-		unsigned next_free_slot;
 
-		/* First, zero out the remaining entries.  This is necessary so
-		 * that those entries appear as "unallocated" in the next part.
-		 * Each of these entries will eventually be filled with the
-		 * representation of the root node of a binary tree.  */
+		/* Get the next symbol.  */
+		sym = sorted_syms[sym_idx];
 
-		i = (1U << table_bits) - 1; /* All 1 bits */
-		for (;;) {
-			decode_table[i] = 0;
+		/* Start a new subtable if the codeword is long enough to
+		 * require a subtable, *and* the first 'table_bits' bits of the
+		 * codeword don't match the prefix for the previous subtable if
+		 * any.  */
+		if (codeword_len > table_bits &&
+		    (codeword_reversed & table_mask) != cur_codeword_prefix) {
 
-			if (i == codeword_reversed)
-				break;
+			cur_codeword_prefix = (codeword_reversed & table_mask);
 
-			/* Subtract 1 from the bit-reversed index.  */
-			bit = 1U << table_bits;
-			do {
-				bit >>= 1;
-				i ^= bit;
-			} while (i & bit);
-		}
+			cur_table_start += 1U << cur_table_bits;
 
-		/* We allocate child nodes starting at the end of the direct
-		 * lookup table.  Note that there should be 2*num_syms extra
-		 * entries for this purpose, although fewer than this may
-		 * actually be needed.  */
-		next_free_slot = 1U << table_bits;
-
-		for (; codeword_len <= max_codeword_len; codeword_len++) {
-
-			const unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
-
-			for (; sym_idx < end_sym_idx; sym_idx++) {
-
-				unsigned shift = table_bits;
-				unsigned node_idx = codeword_reversed & ((1U << table_bits) - 1);
-
-				/* Go through each bit of the current codeword
-				 * beyond the prefix of length @table_bits and
-				 * walk the appropriate binary tree, allocating
-				 * any slots that have not yet been allocated.
-				 *
-				 * Note that the 'pointer' entry to the binary
-				 * tree, which is stored in the direct lookup
-				 * portion of the table, is represented
-				 * identically to other internal (non-leaf)
-				 * nodes of the binary tree; it can be thought
-				 * of as simply the root of the tree.  The
-				 * representation of these internal nodes is
-				 * simply the index of the left child combined
-				 * with special flags to distingush the entry
-				 * from direct mapping and leaf node entries.
-				 */
-				do {
-
-					/* At least one bit remains in the
-					 * codeword, but the current node is
-					 * unallocated.  Allocate it as an
-					 * internal tree node.  */
-					if (decode_table[node_idx] == 0) {
-						decode_table[node_idx] =
-							next_free_slot |
-							HUFFDEC_TREE_NONLEAF_FLAGS;
-						decode_table[next_free_slot++] = 0;
-						decode_table[next_free_slot++] = 0;
-					}
-
-					/* Go to the left child if the next bit
-					 * in the codeword is 0; otherwise go to
-					 * the right child.  */
-					node_idx = decode_table[node_idx] &
-						   ~HUFFDEC_TREE_NONLEAF_FLAGS;
-					node_idx += (codeword_reversed >> shift) & 1;
-					shift += 1;
-				} while (shift != codeword_len);
-
-				/* Generate the leaf node, which contains the
-				 * real decode table entry.  */
-				decode_table[node_idx] =
-					(*make_leaf_entry)(sorted_syms[sym_idx]);
-
-				/* Increment the codeword by 1.  Since DEFLATE
-				 * requires bit-reversed codewords, we must
-				 * manipulate bits ourselves.  */
-				bit = 1U << (codeword_len - 1);
-				while (codeword_reversed & bit)
-					bit >>= 1;
-				codeword_reversed &= bit - 1;
-				codeword_reversed |= bit;
+			/* Calculate the subtable length.  If the codeword
+			 * length exceeds 'table_bits' by n, the subtable needs
+			 * at least 2**n entries.  But it may need more; if
+			 * there are fewer than 2**n codewords of length
+			 * 'table_bits + n' remaining, then n will need to be
+			 * incremented to bring in longer codewords until the
+			 * subtable can be filled completely.  */
+			cur_table_bits = codeword_len - table_bits;
+			remainder = (s32)1 << cur_table_bits;
+			while (table_bits + cur_table_bits < max_codeword_len) {
+				remainder -= len_counts[table_bits +
+							cur_table_bits];
+				if (remainder <= 0)
+					break;
+				cur_table_bits++;
+				remainder <<= 1;
 			}
+
+			/* Create the entry that points from the main table to
+			 * the subtable.  This entry contains the index of the
+			 * start of the subtable and the number of bits with
+			 * which the subtable is indexed (the log base 2 of the
+			 * number of entries it contains).  */
+			decode_table[cur_codeword_prefix] =
+				HUFFDEC_SUBTABLE_POINTER |
+				make_decode_table_entry(cur_table_start,
+							cur_table_bits);
+
+			/* Now that we're filling a subtable, we need to drop
+			 * the first 'table_bits' bits of the codewords.  */
+			num_dropped_bits = table_bits;
 		}
+
+		/* Create the decode table entry, which packs the decode result
+		 * and the codeword length (minus 'table_bits' for subtables)
+		 * together.  */
+		entry = make_decode_table_entry(decode_results[sym],
+						codeword_len - num_dropped_bits);
+
+		/* Fill in as many copies of the decode table entry as are
+		 * needed.  The number of entries to fill is a power of 2 and
+		 * depends on the codeword length; it could be as few as 1 or as
+		 * large as half the size of the table.  Since the codewords are
+		 * bit-reversed, the indices to fill are those with the codeword
+		 * in its low bits; it's the high bits that vary.  */
+		i = cur_table_start + (codeword_reversed >> num_dropped_bits);
+		end = cur_table_start + (1U << cur_table_bits);
+		increment = 1U << (codeword_len - num_dropped_bits);
+		do {
+			decode_table[i] = entry;
+			i += increment;
+		} while (i < end);
+
+		/* Advance to the next codeword by incrementing it.  But since
+		 * our codewords are bit-reversed, we must manipulate the bits
+		 * ourselves rather than simply adding 1.  */
+		bit = 1U << (codeword_len - 1);
+		while (codeword_reversed & bit)
+			bit >>= 1;
+		codeword_reversed &= bit - 1;
+		codeword_reversed |= bit;
+
+		/* Advance to the next symbol.  This will either increase the
+		 * codeword length, or keep the same codeword length but
+		 * increase the symbol value.  Note: since we are using
+		 * bit-reversed codewords, we don't need to explicitly append
+		 * zeroes to the codeword when the codeword length increases. */
+		if (++sym_idx == num_syms)
+			return true;
+		len_counts[codeword_len]--;
+		while (len_counts[codeword_len] == 0)
+			codeword_len++;
 	}
-	return true;
 }
 
 /* Build the decode table for the precode.  */
 static bool
 build_precode_decode_table(struct deflate_decompressor *d)
 {
+	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+	STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128);
+
 	return build_decode_table(d->precode_decode_table,
 				  d->precode_lens,
 				  DEFLATE_NUM_PRECODE_SYMS,
-				  make_pre_direct_entry,
-				  make_pre_leaf_entry,
-				  DEFLATE_PRECODE_TABLEBITS,
+				  precode_decode_results,
+				  PRECODE_TABLEBITS,
 				  DEFLATE_MAX_PRE_CODEWORD_LEN,
 				  d->working_space);
 }
@@ -860,12 +740,14 @@ static bool
 build_litlen_decode_table(struct deflate_decompressor *d,
 			  unsigned num_litlen_syms, unsigned num_offset_syms)
 {
+	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+	STATIC_ASSERT(LITLEN_TABLEBITS == 10 && LITLEN_ENOUGH == 1334);
+
 	return build_decode_table(d->litlen_decode_table,
 				  d->lens,
 				  num_litlen_syms,
-				  make_litlen_direct_entry,
-				  make_litlen_leaf_entry,
-				  DEFLATE_LITLEN_TABLEBITS,
+				  litlen_decode_results,
+				  LITLEN_TABLEBITS,
 				  DEFLATE_MAX_LITLEN_CODEWORD_LEN,
 				  d->working_space);
 }
@@ -875,12 +757,14 @@ static bool
 build_offset_decode_table(struct deflate_decompressor *d,
 			  unsigned num_litlen_syms, unsigned num_offset_syms)
 {
+	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+	STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402);
+
 	return build_decode_table(d->offset_decode_table,
 				  d->lens + num_litlen_syms,
 				  num_offset_syms,
-				  make_offset_direct_entry,
-				  make_offset_leaf_entry,
-				  DEFLATE_OFFSET_TABLEBITS,
+				  offset_decode_results,
+				  OFFSET_TABLEBITS,
 				  DEFLATE_MAX_OFFSET_CODEWORD_LEN,
 				  d->working_space);
 }
@@ -905,93 +789,6 @@ copy_word_unaligned(const void *src, void *dst)
 	store_word_unaligned(load_word_unaligned(src), dst);
 }
 
-/*
- * Copy an LZ77 match at (dst - offset) to dst.
- *
- * The length and offset must be already validated --- that is, (dst - offset)
- * can't underrun the output buffer, and (dst + length) can't overrun the output
- * buffer.  Also, the length cannot be 0.
- *
- * @winend points to the byte past the end of the output buffer.
- * This function won't write any data beyond this position.
- */
-static forceinline void
-lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend)
-{
-	const u8 *src = dst - offset;
-	const u8 * const end = dst + length;
-
-	/*
-	 * Try to copy one machine word at a time.  On i386 and x86_64 this is
-	 * faster than copying one byte at a time, unless the data is
-	 * near-random and all the matches have very short lengths.  Note that
-	 * since this requires unaligned memory accesses, it won't necessarily
-	 * be faster on every architecture.
-	 *
-	 * Also note that we might copy more than the length of the match.  For
-	 * example, if a word is 8 bytes and the match is of length 5, then
-	 * we'll simply copy 8 bytes.  This is okay as long as we don't write
-	 * beyond the end of the output buffer, hence the check for (winend -
-	 * end >= WORDSIZE - 1).
-	 */
-	if (UNALIGNED_ACCESS_IS_FAST && likely(winend - end >= WORDSIZE - 1)) {
-
-		if (offset >= WORDSIZE) {
-			/* The source and destination words don't overlap.  */
-
-			/* To improve branch prediction, one iteration of this
-			 * loop is unrolled.  Most matches are short and will
-			 * fail the first check.  But if that check passes, then
-			 * it becomes increasing likely that the match is long
-			 * and we'll need to continue copying.  */
-
-			copy_word_unaligned(src, dst);
-			src += WORDSIZE;
-			dst += WORDSIZE;
-
-			if (dst < end) {
-				do {
-					copy_word_unaligned(src, dst);
-					src += WORDSIZE;
-					dst += WORDSIZE;
-				} while (dst < end);
-			}
-			return;
-		} else if (offset == 1) {
-
-			/* Offset 1 matches are equivalent to run-length
-			 * encoding of the previous byte.  This case is common
-			 * if the data contains many repeated bytes.  */
-
-			machine_word_t v = repeat_byte(*(dst - 1));
-			do {
-				store_word_unaligned(v, dst);
-				src += WORDSIZE;
-				dst += WORDSIZE;
-			} while (dst < end);
-			return;
-		}
-		/*
-		 * We don't bother with special cases for other 'offset <
-		 * WORDSIZE', which are usually rarer than 'offset == 1'.  Extra
-		 * checks will just slow things down.  Actually, it's possible
-		 * to handle all the 'offset < WORDSIZE' cases using the same
-		 * code, but it still becomes more complicated doesn't seem any
-		 * faster overall; it definitely slows down the more common
-		 * 'offset == 1' case.
-		 */
-	}
-
-	/* Fall back to a bytewise copy.  */
-	STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
-	*dst++ = *src++;
-	*dst++ = *src++;
-	length -= 2;
-	do {
-		*dst++ = *src++;
-	} while (--length);
-}
-
 /*****************************************************************************
  *                         Main decompression routine
  *****************************************************************************/
@@ -1079,8 +876,7 @@ next_block:
 			d->precode_lens[deflate_precode_lens_permutation[i]] = 0;
 
 		/* Build the decode table for the precode.  */
-		if (!build_precode_decode_table(d))
-			return false;
+		SAFETY_CHECK(build_precode_decode_table(d));
 
 		/* Expand the literal/length and offset codeword lengths.  */
 		for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
@@ -1091,14 +887,14 @@ next_block:
 
 			ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);
 
-			/* (The code below assumes there are no binary trees in
-			 * the decode table.)  */
-			STATIC_ASSERT(DEFLATE_PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
+			/* (The code below assumes that the precode decode table
+			 * does not have any subtables.)  */
+			STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
 
 			/* Read the next precode symbol.  */
 			entry = d->precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
-			REMOVE_BITS(entry >> HUFFDEC_LEN_SHIFT);
-			presym = entry & HUFFDEC_VALUE_MASK;
+			REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
+			presym = entry >> HUFFDEC_RESULT_SHIFT;
 
 			if (presym < 16) {
 				/* Explicit codeword length  */
@@ -1128,8 +924,7 @@ next_block:
 
 			if (presym == 16) {
 				/* Repeat the previous length 3 - 6 times  */
-				if (SAFETY_CHECK(i == 0))
-					return false;
+				SAFETY_CHECK(i != 0);
 				rep_val = d->lens[i - 1];
 				STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
 				rep_count = 3 + POP_BITS(2);
@@ -1170,20 +965,14 @@ next_block:
 
 		ALIGN_INPUT();
 
-		if (SAFETY_CHECK(in_end - in_next < 4))
-			return false;
+		SAFETY_CHECK(in_end - in_next >= 4);
 
 		len = READ_U16();
 		nlen = READ_U16();
 
-		if (SAFETY_CHECK(len != (u16)~nlen))
-			return false;
-
-		if (SAFETY_CHECK(len > out_end - out_next))
-			return false;
-
-		if (SAFETY_CHECK(len > in_end - in_next))
-			return false;
+		SAFETY_CHECK(len == (u16)~nlen);
+		SAFETY_CHECK(len <= out_end - out_next);
+		SAFETY_CHECK(len <= in_end - in_next);
 
 		memcpy(out_next, in_next, len);
 		in_next += len;
@@ -1191,7 +980,8 @@ next_block:
 
 		goto block_done;
 
-	} else if (block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN) {
+	} else {
+		SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
 
 		/* Static Huffman block: set the static Huffman codeword
 		 * lengths.  Then the remainder is the same as decompressing a
@@ -1215,18 +1005,12 @@ next_block:
 		num_litlen_syms = 288;
 		num_offset_syms = 32;
 
-	} else {
-		/* Reserved block type.  */
-		return false;
 	}
 
 	/* Decompressing a Huffman block (either dynamic or static)  */
 
-	if (!build_offset_decode_table(d, num_litlen_syms, num_offset_syms))
-		return false;
-
-	if (!build_litlen_decode_table(d, num_litlen_syms, num_offset_syms))
-		return false;
+	SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
+	SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
 
 	/* The main DEFLATE decode loop  */
 	for (;;) {
@@ -1234,186 +1018,132 @@ next_block:
 		u32 length;
 		u32 offset;
 
-		/* If our bitbuffer variable is large enough, we load new bits
-		 * only once for each match or literal decoded.  This is
-		 * fastest.  Otherwise, we may need to load new bits multiple
-		 * times when decoding a match.  */
+		/* Decode a litlen symbol.  */
+		ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
+		entry = d->litlen_decode_table[BITS(LITLEN_TABLEBITS)];
+		if (entry & HUFFDEC_SUBTABLE_POINTER) {
+			/* Litlen subtable required (uncommon case)  */
+			REMOVE_BITS(LITLEN_TABLEBITS);
+			entry = d->litlen_decode_table[
+				((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
+				BITS(entry & HUFFDEC_LENGTH_MASK)];
+		}
+		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
+		if (entry & HUFFDEC_LITERAL) {
+			/* Literal  */
+			SAFETY_CHECK(out_next < out_end);
+			*out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
+			continue;
+		}
 
-		STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_LITLEN_CODEWORD_LEN));
+		/* Match or end-of-block  */
+
+		entry >>= HUFFDEC_RESULT_SHIFT;
 		ENSURE_BITS(MAX_ENSURE);
 
-		/* Read a literal or length.  */
-
-		entry = d->litlen_decode_table[BITS(DEFLATE_LITLEN_TABLEBITS)];
-
-		if (CAN_ENSURE(DEFLATE_LITLEN_TABLEBITS * 2) &&
-		    likely(out_end - out_next >= MAX_ENSURE / DEFLATE_LITLEN_TABLEBITS))
-		{
-			/* Fast path for decoding literals  */
-
-		#define NUM_BITS_TO_ENSURE_AFTER_INLINE_LITERALS		\
-			((MAX_ENSURE >= DEFLATE_MAX_MATCH_BITS) ?		\
-			 DEFLATE_MAX_MATCH_BITS :				\
-			  ((MAX_ENSURE >= DEFLATE_MAX_LITLEN_CODEWORD_LEN +	\
-					  DEFLATE_MAX_EXTRA_LENGTH_BITS) ?	\
-				DEFLATE_MAX_LITLEN_CODEWORD_LEN +		\
-				DEFLATE_MAX_EXTRA_LENGTH_BITS :			\
-					DEFLATE_MAX_LITLEN_CODEWORD_LEN))
-
-		#define INLINE_LITERAL(seq)					\
-			if (CAN_ENSURE(DEFLATE_LITLEN_TABLEBITS * (seq))) {	\
-				entry = d->litlen_decode_table[			\
-						BITS(DEFLATE_LITLEN_TABLEBITS)];\
-				if (entry & HUFFDEC_NOT_LITERAL) {		\
-					if ((seq) != 1)				\
-						ENSURE_BITS(NUM_BITS_TO_ENSURE_AFTER_INLINE_LITERALS);	\
-					goto not_literal;			\
-				}						\
-				REMOVE_BITS(entry >> HUFFDEC_LEN_SHIFT);	\
-				*out_next++ = entry;				\
-			}
-
-			INLINE_LITERAL(1);
-			INLINE_LITERAL(2);
-			INLINE_LITERAL(3);
-			INLINE_LITERAL(4);
-			INLINE_LITERAL(5);
-			INLINE_LITERAL(6);
-			INLINE_LITERAL(7);
-			INLINE_LITERAL(8);
-			continue;
-		}
-
-		if (!(entry & HUFFDEC_NOT_LITERAL)) {
-			REMOVE_BITS(entry >> HUFFDEC_LEN_SHIFT);
-			if (SAFETY_CHECK(out_next == out_end))
-				return false;
-			*out_next++ = entry;
-			continue;
-		}
-	not_literal:
-		if (likely(!(entry & HUFFDEC_NOT_FULL_LENGTH))) {
-
-			/* The next TABLEBITS bits were enough to directly look
-			 * up a litlen symbol, which was a length slot.  In
-			 * addition, the full match length, including the extra
-			 * bits, fit into TABLEBITS.  So the result of the
-			 * lookup was the full match length.
-			 *
-			 * On typical data, most match lengths are short enough
-			 * to fall into this category.  */
-
-			REMOVE_BITS((entry >> HUFFDEC_LEN_SHIFT) & HUFFDEC_LEN_MASK);
-			length = entry & HUFFDEC_VALUE_MASK;
-
-		} else if (!(entry & HUFFDEC_TREE_NONLEAF)) {
-
-			/* The next TABLEBITS bits were enough to directly look
-			 * up a litlen symbol, which was either a length slot or
-			 * end-of-block.  However, the full match length,
-			 * including the extra bits (0 in the case of
-			 * end-of-block), requires more than TABLEBITS bits to
-			 * decode.  So the result of the lookup was the length
-			 * base and number of extra length bits.  We will read
-			 * this number of extra length bits and add them to the
-			 * length base in order to construct the full length.
-			 *
-			 * On typical data, this case is rare.  */
-
-			REMOVE_BITS((entry >> HUFFDEC_LEN_SHIFT) & HUFFDEC_LEN_MASK);
-			entry &= HUFFDEC_VALUE_MASK;
-
-			if (!CAN_ENSURE(DEFLATE_MAX_LITLEN_CODEWORD_LEN +
-					DEFLATE_MAX_EXTRA_LENGTH_BITS))
-				ENSURE_BITS(DEFLATE_MAX_EXTRA_LENGTH_BITS);
-
-			length = (entry & HUFFDEC_LENGTH_BASE_MASK) +
-				  POP_BITS(entry >> HUFFDEC_EXTRA_LENGTH_BITS_SHIFT);
-		} else {
-
-			/* The next TABLEBITS bits were not enough to directly
-			 * look up a litlen symbol.  Therefore, we must walk the
-			 * appropriate binary tree to decode the symbol, which
-			 * may be a literal, length slot, or end-of-block.
-			 *
-			 * On typical data, this case is rare.  */
-
-			REMOVE_BITS(DEFLATE_LITLEN_TABLEBITS);
-			do {
-				entry &= ~HUFFDEC_TREE_NONLEAF_FLAGS;
-				entry += POP_BITS(1);
-				entry = d->litlen_decode_table[entry];
-			} while (entry & HUFFDEC_TREE_NONLEAF_FAST_FLAG);
-			if (entry < 256) {
-				if (SAFETY_CHECK(out_next == out_end))
-					return false;
-				*out_next++ = entry;
-				continue;
-			}
-			entry -= 256;
-
-			if (!CAN_ENSURE(DEFLATE_MAX_LITLEN_CODEWORD_LEN +
-					DEFLATE_MAX_EXTRA_LENGTH_BITS))
-				ENSURE_BITS(DEFLATE_MAX_EXTRA_LENGTH_BITS);
-
-			length = (entry & HUFFDEC_LENGTH_BASE_MASK) +
-				  POP_BITS(entry >> HUFFDEC_EXTRA_LENGTH_BITS_SHIFT);
-		}
+		/* Pop the extra length bits and add them to the length base to
+		 * produce the full length.  */
+		length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
+			 POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
 
 		/* The match destination must not end after the end of the
-		 * output buffer.  */
-		if (SAFETY_CHECK(length > out_end - out_next))
-			return false;
-
-		if (unlikely(length == HUFFDEC_END_OF_BLOCK_LENGTH))
+		 * output buffer.  For efficiency, combine this check with the
+		 * end-of-block check: the end-of-block length is 0, so
+		 * 'length - 1' wraps to SIZE_MAX.  '>=' (not '>') is required
+		 * so a match ending 1 byte past the buffer is rejected.  */
+		STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
+		if (unlikely((size_t)length - 1 >= out_end - out_next)) {
+			SAFETY_CHECK(length == HUFFDEC_END_OF_BLOCK_LENGTH);
 			goto block_done;
-
-		/* Read the match offset.  */
-
-		if (!CAN_ENSURE(DEFLATE_MAX_MATCH_BITS)) {
-			if (CAN_ENSURE(DEFLATE_MAX_OFFSET_CODEWORD_LEN +
-				       DEFLATE_MAX_EXTRA_OFFSET_BITS))
-				ENSURE_BITS(DEFLATE_MAX_OFFSET_CODEWORD_LEN +
-					    DEFLATE_MAX_EXTRA_OFFSET_BITS);
-			else
-				ENSURE_BITS(DEFLATE_MAX_OFFSET_CODEWORD_LEN);
 		}
 
-		entry = d->offset_decode_table[BITS(DEFLATE_OFFSET_TABLEBITS)];
-		if (likely(!(entry & HUFFDEC_TREE_NONLEAF_FAST_FLAG))) {
-			REMOVE_BITS(entry >> HUFFDEC_LEN_SHIFT);
-			entry &= HUFFDEC_VALUE_MASK;
-		} else {
-			REMOVE_BITS(DEFLATE_OFFSET_TABLEBITS);
-			do {
-				entry &= ~HUFFDEC_TREE_NONLEAF_FLAGS;
-				entry += POP_BITS(1);
-				entry = d->offset_decode_table[entry];
-			} while (entry & HUFFDEC_TREE_NONLEAF_FAST_FLAG);
+		/* Decode the match offset.  */
+
+		entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
+		if (entry & HUFFDEC_SUBTABLE_POINTER) {
+			/* Offset subtable required (uncommon case)  */
+			REMOVE_BITS(OFFSET_TABLEBITS);
+			entry = d->offset_decode_table[
+				((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
+				BITS(entry & HUFFDEC_LENGTH_MASK)];
 		}
+		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
+		entry >>= HUFFDEC_RESULT_SHIFT;
 
-		/* The value we have here isn't the offset symbol itself, but
-		 * rather the offset symbol indexed into
-		 * deflate_offset_symbol_data[].  This gives us the offset base
-		 * and number of extra offset bits without having to index
-		 * additional tables in the main decode loop.  */
-
-		if (!CAN_ENSURE(DEFLATE_MAX_OFFSET_CODEWORD_LEN +
+		STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
+					 DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
+			      CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
+		if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
+				DEFLATE_MAX_OFFSET_CODEWORD_LEN +
 				DEFLATE_MAX_EXTRA_OFFSET_BITS))
 			ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);
 
+		/* Pop the extra offset bits and add them to the offset base to
+		 * produce the full offset.  */
 		offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
 			 POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
 
 		/* The match source must not begin before the beginning of the
 		 * output buffer.  */
-		if (SAFETY_CHECK(offset > out_next - (const u8 *)out))
-			return false;
+		SAFETY_CHECK(offset <= out_next - (const u8 *)out);
 
-		/* Copy the match:
-		 * 'length' bytes at 'out_next - offset' to 'out_next'.  */
+		/* Copy the match: 'length' bytes at 'out_next - offset' to
+		 * 'out_next'.  */
 
-		lz_copy(out_next, length, offset, out_end);
+		if (UNALIGNED_ACCESS_IS_FAST &&
+		    length <= (3 * WORDSIZE) &&
+		    offset >= WORDSIZE &&
+		    length + (3 * WORDSIZE) <= out_end - out_next)
+		{
+			/* Fast case: short length, no overlaps if we copy one
+			 * word at a time, and we aren't getting too close to
+			 * the end of the output array.  */
+			copy_word_unaligned(out_next - offset + (0 * WORDSIZE),
+					    out_next + (0 * WORDSIZE));
+			copy_word_unaligned(out_next - offset + (1 * WORDSIZE),
+					    out_next + (1 * WORDSIZE));
+			copy_word_unaligned(out_next - offset + (2 * WORDSIZE),
+					    out_next + (2 * WORDSIZE));
+		} else {
+			const u8 *src = out_next - offset;
+			u8 *dst = out_next;
+			u8 *end = out_next + length;
+
+			if (UNALIGNED_ACCESS_IS_FAST &&
+			    likely(out_end - end >= WORDSIZE - 1)) {
+				if (offset >= WORDSIZE) {
+					copy_word_unaligned(src, dst);
+					src += WORDSIZE;
+					dst += WORDSIZE;
+					if (dst < end) {
+						do {
+							copy_word_unaligned(src, dst);
+							src += WORDSIZE;
+							dst += WORDSIZE;
+						} while (dst < end);
+					}
+				} else if (offset == 1) {
+					machine_word_t v = repeat_byte(*(dst - 1));
+					do {
+						store_word_unaligned(v, dst);
+						src += WORDSIZE;
+						dst += WORDSIZE;
+					} while (dst < end);
+				} else {
+					*dst++ = *src++;
+					*dst++ = *src++;
+					do {
+						*dst++ = *src++;
+					} while (dst < end);
+				}
+			} else {
+				*dst++ = *src++;
+				*dst++ = *src++;
+				do {
+					*dst++ = *src++;
+				} while (dst < end);
+			}
+		}
 
 		out_next += length;
 	}