Mirror of https://github.com/cuberite/libdeflate.git
lib/deflate_decompress: optimize build_decode_table() via table doubling

Another build_decode_table() optimization: rather than filling all the
entries for each codeword using strided stores, fill just one entry
initially and fill in the rest via memcpy() as the table is incrementally
expanded.  Also make some other cleanups and small optimizations.

commit a64bd1e830 (parent 954b59041a)
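To make the new fill strategy concrete, here is a standalone sketch of table
doubling (an illustration written for this page, not libdeflate's code:
TABLE_BITS, highest_set_bit(), fill_by_doubling(), and the bare-symbol table
entries are inventions of this sketch; the real build_decode_table() packs a
decode result and the codeword length into each u32 entry and uses bsr32()
for the bit scan):

#include <stdio.h>
#include <string.h>

#define TABLE_BITS 3    /* toy size; libdeflate's main tables are larger */

/* Portable stand-in for the 'bsr' instruction: index of the highest 1 bit. */
static unsigned highest_set_bit(unsigned x)
{
        unsigned i = 0;

        while (x >>= 1)
                i++;
        return i;
}

/*
 * Fill a direct-lookup table for a complete canonical Huffman code whose
 * codeword lengths are all <= TABLE_BITS.  'len_counts[l]' is the number of
 * codewords of length 'l'; 'sorted_syms[]' lists the symbols in codeword
 * order.  Each codeword writes exactly one entry; all remaining copies are
 * made by doubling the table with memcpy() whenever the length increases.
 */
static void fill_by_doubling(unsigned table[1U << TABLE_BITS],
                             const unsigned len_counts[TABLE_BITS + 1],
                             const unsigned sorted_syms[])
{
        unsigned codeword = 0;  /* current codeword, bit-reversed */
        unsigned len = 1;       /* current codeword length */
        unsigned count;         /* codewords remaining with this length */
        unsigned cur_table_end;

        while ((count = len_counts[len]) == 0)
                len++;
        cur_table_end = 1U << len;
        for (;;) {
                do {
                        unsigned bit;

                        table[codeword] = *sorted_syms++; /* one store only */
                        if (codeword == cur_table_end - 1)
                                goto expand_to_full_size; /* all-1's codeword */
                        /* lexicographically next codeword, bit-reversed */
                        bit = 1U << highest_set_bit(codeword ^
                                                    (cur_table_end - 1));
                        codeword = (codeword & (bit - 1)) | bit;
                } while (--count);
                do {    /* advance to the next used length: double the table */
                        memcpy(&table[cur_table_end], table,
                               cur_table_end * sizeof(table[0]));
                        cur_table_end <<= 1;
                        len++;
                } while ((count = len_counts[len]) == 0);
        }
expand_to_full_size:
        while (cur_table_end < (1U << TABLE_BITS)) {
                memcpy(&table[cur_table_end], table,
                       cur_table_end * sizeof(table[0]));
                cur_table_end <<= 1;
        }
}

int main(void)
{
        /* Complete code: 'A' has length 1; 'B' and 'C' have length 2
         * (canonical codewords 0, 10, 11). */
        const unsigned len_counts[TABLE_BITS + 1] = { 0, 1, 2, 0 };
        const unsigned sorted_syms[] = { 'A', 'B', 'C' };
        /* Zero-initialized, like libdeflate's calloc()'ed decompressor, so
         * the doubling memcpy() never reads indeterminate bytes. */
        unsigned table[1U << TABLE_BITS] = { 0 };
        unsigned i;

        fill_by_doubling(table, len_counts, sorted_syms);
        for (i = 0; i < (1U << TABLE_BITS); i++)        /* A B A C A B A C */
                printf("table[%u] = %c\n", i, table[i]);
        return 0;
}

Each codeword costs one store, and each doubling is a single memcpy() of a
contiguous, cache-hot region, instead of the old approach's
2^(table_bits-len) stores per codeword at a stride of 2^len.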
@@ -44,6 +44,7 @@
  * instructions and use it automatically at runtime when supported.
  */
 
+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -515,39 +516,6 @@ static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
 #undef ENTRY
 };
 
-/* Advance to the next codeword in the canonical Huffman code */
-static forceinline void
-next_codeword(unsigned *codeword_p, unsigned *codeword_len_p,
-              unsigned *stride_p, unsigned len_counts[])
-{
-        unsigned codeword = *codeword_p;
-        unsigned codeword_len = *codeword_len_p;
-        unsigned bit;
-
-        /*
-         * Increment the codeword, bit-reversed: find the last (highest order) 0
-         * bit in the codeword, set it, and clear any later (higher order) bits.
-         */
-        bit = 1U << bsr32(~codeword & ((1U << codeword_len) - 1));
-        codeword &= bit - 1;
-        codeword |= bit;
-
-        /*
-         * If there are no more codewords of this length, proceed to the next
-         * lowest used length.  Increasing the length logically appends 0's to
-         * the codeword, but this is a no-op due to the codeword being
-         * represented in bit-reversed form.
-         */
-        len_counts[codeword_len]--;
-        while (len_counts[codeword_len] == 0) {
-                codeword_len++;
-                *stride_p <<= 1;
-        }
-
-        *codeword_p = codeword;
-        *codeword_len_p = codeword_len;
-}
-
 /*
  * Build a table for fast decoding of symbols from a Huffman code.  As input,
  * this function takes the codeword length of each symbol which may be used in
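The bit-reversed increment that next_codeword() performed survives inline in
the new build_decode_table().  Since it is the least obvious part of the
deleted helper, here is a self-contained check of the trick (hypothetical
names: bit_reverse() and rev_increment() are inventions of this sketch, and
the portable highest_set_bit() loop stands in for the real code's bsr32()):

#include <assert.h>
#include <stdio.h>

/* Index of the highest set bit (portable stand-in for bsr32()). */
static unsigned highest_set_bit(unsigned x)
{
        unsigned i = 0;

        while (x >>= 1)
                i++;
        return i;
}

/* Reverse the low 'len' bits of 'x'. */
static unsigned bit_reverse(unsigned x, unsigned len)
{
        unsigned r = 0;
        unsigned i;

        for (i = 0; i < len; i++)
                r = (r << 1) | ((x >> i) & 1);
        return r;
}

/*
 * Increment a 'len'-bit codeword that is stored bit-reversed: find the last
 * (highest order) 0 bit, set it, and clear all higher bits.  This is the
 * operation the deleted next_codeword() performed with
 * 'bsr32(~codeword & ((1U << len) - 1))'.
 */
static unsigned rev_increment(unsigned codeword, unsigned len)
{
        unsigned bit = 1U << highest_set_bit(~codeword & ((1U << len) - 1));

        return (codeword & (bit - 1)) | bit;
}

int main(void)
{
        unsigned len = 4;
        unsigned c;

        /* Verify against the plain increment for every 4-bit codeword
         * except the all-1's one (which has no successor). */
        for (c = 0; c < (1U << len) - 1; c++)
                assert(bit_reverse(rev_increment(bit_reverse(c, len), len),
                                   len) == c + 1);
        printf("bit-reversed increment OK for all %u-bit codewords\n", len);
        return 0;
}

The assertion passing for every codeword confirms that setting the
highest-order 0 bit and clearing the bits above it is exactly an increment
on the reversed codeword, so appending 0's for longer codewords really is a
no-op in this representation.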
@@ -593,189 +561,259 @@ build_decode_table(u32 decode_table[],
                    const u32 decode_results[],
                    const unsigned table_bits,
                    const unsigned max_codeword_len,
-                   u16 sorted_syms[])
+                   u16 *sorted_syms)
 {
-        const unsigned table_mask = (1U << table_bits) - 1;
         unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
         unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
-        unsigned len;
-        unsigned sym;
-        s32 remainder;
-        unsigned sym_idx;
-        unsigned codeword;
-        unsigned codeword_len;
-        unsigned stride;
-        unsigned cur_table_end = 1U << table_bits;
-        unsigned subtable_prefix;
-        unsigned subtable_start;
-        unsigned subtable_bits;
+        unsigned sym;             /* current symbol */
+        unsigned codeword;        /* current codeword, bit-reversed */
+        unsigned len;             /* current codeword length in bits */
+        unsigned count;           /* num codewords remaining with this length */
+        u32 codespace_used;       /* codespace used out of '2^max_codeword_len' */
+        unsigned cur_table_end;   /* end index of current table */
+        unsigned subtable_prefix; /* codeword prefix of current subtable */
+        unsigned subtable_start;  /* start index of current subtable */
+        unsigned subtable_bits;   /* log2 of current subtable length */
 
-        /* Count how many symbols have each codeword length, including 0. */
+        /* Count how many codewords have each length, including 0. */
         for (len = 0; len <= max_codeword_len; len++)
                 len_counts[len] = 0;
         for (sym = 0; sym < num_syms; sym++)
                 len_counts[lens[sym]]++;
 
-        /* Sort the symbols primarily by increasing codeword length and
-         * secondarily by increasing symbol value. */
+        /*
+         * Sort the symbols primarily by increasing codeword length and
+         * secondarily by increasing symbol value; or equivalently by their
+         * codewords in lexicographic order, since a canonical code is assumed.
+         *
+         * For efficiency, also compute 'codespace_used' in the same pass over
+         * 'len_counts[]' used to build 'offsets[]' for sorting.
+         */
 
-        /* Initialize 'offsets' so that offsets[len] is the number of codewords
-         * shorter than 'len' bits, including length 0. */
+        /* Ensure that 'codespace_used' cannot overflow. */
+        STATIC_ASSERT(sizeof(codespace_used) == 4);
+        STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >=
+                      DEFLATE_MAX_NUM_SYMS);
+
         offsets[0] = 0;
-        for (len = 0; len < max_codeword_len; len++)
+        offsets[1] = len_counts[0];
+        codespace_used = 0;
+        for (len = 1; len < max_codeword_len; len++) {
                 offsets[len + 1] = offsets[len] + len_counts[len];
+                codespace_used = (codespace_used << 1) + len_counts[len];
+        }
+        codespace_used = (codespace_used << 1) + len_counts[len];
 
-        /* Use the 'offsets' array to sort the symbols. */
         for (sym = 0; sym < num_syms; sym++)
                 sorted_syms[offsets[lens[sym]]++] = sym;
 
-        /* It is already guaranteed that all lengths are <= max_codeword_len,
-         * but it cannot be assumed they form a complete prefix code.  A
-         * codeword of length n should require a proportion of the codespace
-         * equaling (1/2)^n.  The code is complete if and only if, by this
-         * measure, the codespace is exactly filled by the lengths. */
-        remainder = 1;
-        for (len = 1; len <= max_codeword_len; len++) {
-                remainder <<= 1;
-                remainder -= len_counts[len];
-                if (unlikely(remainder < 0)) {
-                        /* The lengths overflow the codespace; that is, the code
-                         * is over-subscribed. */
-                        return false;
-                }
-        }
+        sorted_syms += offsets[0];      /* Skip unused symbols */
 
-        if (unlikely(remainder != 0)) {
-                /* The lengths do not fill the codespace; that is, they form an
-                 * incomplete code. */
+        /* lens[] is done being used, so we can write to decode_table[] now. */
 
-                /* Initialize the table entries to default values.  When
-                 * decompressing a well-formed stream, these default values will
-                 * never be used.  But since a malformed stream might contain
-                 * any bits at all, these entries need to be set anyway. */
-                u32 entry = decode_results[0] | 1;
-                for (sym = 0; sym < (1U << table_bits); sym++)
-                        decode_table[sym] = entry;
+        /*
+         * Check whether the lengths form a complete code (exactly fills the
+         * codespace), an incomplete code (doesn't fill the codespace), or an
+         * overfull code (overflows the codespace).  A codeword of length 'n'
+         * uses proportion '1/(2^n)' of the codespace.  An overfull code is
+         * nonsensical, so is considered invalid.  An incomplete code is
+         * considered valid only in two specific cases; see below.
+         */
 
-                /* A completely empty code is permitted. */
-                if (remainder == (1U << max_codeword_len))
-                        return true;
+        /* overfull code? */
+        if (unlikely(codespace_used > (1U << max_codeword_len)))
+                return false;
 
-                /* The code is nonempty and incomplete.  Proceed only if there
-                 * is a single used symbol and its codeword has length 1.  The
-                 * DEFLATE RFC is somewhat unclear regarding this case.  What
-                 * zlib's decompressor does is permit this case for
-                 * literal/length and offset codes and assume the codeword is 0
-                 * rather than 1.  We do the same except we allow this case for
-                 * precodes too. */
-                if (remainder != (1U << (max_codeword_len - 1)) ||
-                    len_counts[1] != 1)
-                        return false;
-        }
-
-        /* Generate the decode table entries.  Since we process codewords from
-         * shortest to longest, the main portion of the decode table is filled
-         * first; then the subtables are filled.  Note that it's already been
-         * verified that the code is nonempty and not over-subscribed. */
-
-        /* Start with the smallest codeword length and the smallest-valued
-         * symbol which has that codeword length. */
-        sym_idx = offsets[0];
-        sym = sorted_syms[sym_idx++];
-        codeword = 0;
-        codeword_len = 1;
-        while (len_counts[codeword_len] == 0)
-                codeword_len++;
-        stride = 1U << codeword_len;
-
-        /* For each symbol and its codeword in the main part of the table... */
-        do {
+        /* incomplete code? */
+        if (unlikely(codespace_used < (1U << max_codeword_len))) {
                 u32 entry;
                 unsigned i;
 
-                /* Fill in as many copies of the decode table entry as are
-                 * needed.  The number of entries to fill is a power of 2 and
-                 * depends on the codeword length; it could be as few as 1 or as
-                 * large as half the size of the table.  Since the codewords are
-                 * bit-reversed, the indices to fill are those with the codeword
-                 * in its low bits; it's the high bits that vary. */
-                entry = decode_results[sym] | codeword_len;
-                i = codeword;
-                do {
+                if (codespace_used == 0) {
+                        /*
+                         * An empty code is allowed.  This can happen for the
+                         * offset code in DEFLATE, since a dynamic Huffman block
+                         * need not contain any matches.
+                         */
+
+                        /* sym=0, len=1 (arbitrary) */
+                        entry = decode_results[0] | 1;
+                } else {
+                        /*
+                         * Allow codes with a single used symbol, with codeword
+                         * length 1.  The DEFLATE RFC is unclear regarding this
+                         * case.  What zlib's decompressor does is permit this
+                         * for the litlen and offset codes and assume the
+                         * codeword is '0' rather than '1'.  We do the same
+                         * except we allow this for precodes too, since there's
+                         * no convincing reason to treat the codes differently.
+                         * We also assign both codewords '0' and '1' to the
+                         * symbol to avoid having to handle '1' specially.
+                         */
+                        if (codespace_used != (1U << (max_codeword_len - 1)) ||
+                            len_counts[1] != 1)
+                                return false;
+                        entry = decode_results[*sorted_syms] | 1;
+                }
+                /*
+                 * Note: the decode table still must be fully initialized, in
+                 * case the stream is malformed and contains bits from the part
+                 * of the codespace the incomplete code doesn't use.
+                 */
+                for (i = 0; i < (1U << table_bits); i++)
                         decode_table[i] = entry;
-                        i += stride;    /* stride is 1U << codeword_len */
-                } while (i < cur_table_end);
+                return true;
+        }
 
-                /* Advance to the next symbol and codeword */
-                if (sym_idx == num_syms)
-                        return true;
-                sym = sorted_syms[sym_idx++];
-                next_codeword(&codeword, &codeword_len, &stride, len_counts);
-        } while (codeword_len <= table_bits);
-
-        /* The codeword length has exceeded table_bits, so we're done filling
-         * direct entries.  Start filling subtable pointers and subtables. */
-        stride >>= table_bits;
-        goto new_subtable;
-
+        /*
+         * The lengths form a complete code.  Now, enumerate the codewords in
+         * lexicographic order and fill the decode table entries for each one.
+         *
+         * First, process all codewords with len <= table_bits.  Each one gets
+         * '2^(table_bits-len)' direct entries in the table.
+         *
+         * Since DEFLATE uses bit-reversed codewords, these entries aren't
+         * consecutive but rather are spaced '2^len' entries apart.  This makes
+         * filling them naively somewhat awkward and inefficient, since strided
+         * stores are less cache-friendly and preclude the use of word or
+         * vector-at-a-time stores to fill multiple entries per instruction.
+         *
+         * To optimize this, we incrementally double the table size.  When
+         * processing codewords with length 'len', the table is treated as
+         * having only '2^len' entries, so each codeword uses just one entry.
+         * Then, each time 'len' is incremented, the table size is doubled and
+         * the first half is copied to the second half.  This significantly
+         * improves performance over naively doing strided stores.
+         *
+         * Note that some entries copied for each table doubling may not have
+         * been initialized yet, but it doesn't matter since they're guaranteed
+         * to be initialized later (because the Huffman code is complete).
+         */
+        codeword = 0;
+        len = 1;
+        while ((count = len_counts[len]) == 0)
+                len++;
+        cur_table_end = 1U << len;
+        while (len <= table_bits) {
+                /* Process all 'count' codewords with length 'len' bits. */
+                do {
+                        unsigned bit;
+
+                        /* Fill the first entry for the current codeword. */
+                        decode_table[codeword] =
+                                decode_results[*sorted_syms++] | len;
+
+                        if (codeword == cur_table_end - 1) {
+                                /* Last codeword (all 1's) */
+                                for (; len < table_bits; len++) {
+                                        memcpy(&decode_table[cur_table_end],
+                                               decode_table,
+                                               cur_table_end *
+                                               sizeof(decode_table[0]));
+                                        cur_table_end <<= 1;
+                                }
+                                return true;
+                        }
+                        /*
+                         * To advance to the lexicographically next codeword in
+                         * the canonical code, the codeword must be incremented,
+                         * then 0's must be appended to the codeword as needed
+                         * to match the next codeword's length.
+                         *
+                         * Since the codeword is bit-reversed, appending 0's is
+                         * a no-op.  However, incrementing it is nontrivial.  To
+                         * do so efficiently, use the 'bsr' instruction to find
+                         * the last (highest order) 0 bit in the codeword, set
+                         * it, and clear any later (higher order) 1 bits.  But
+                         * 'bsr' actually finds the highest order 1 bit, so to
+                         * use it first flip all bits in the codeword by XOR'ing
+                         * it with (1U << len) - 1 == cur_table_end - 1.
+                         */
+                        bit = 1U << bsr32(codeword ^ (cur_table_end - 1));
+                        codeword &= bit - 1;
+                        codeword |= bit;
+                } while (--count);
+
+                /* Advance to the next codeword length. */
+                do {
+                        if (++len <= table_bits) {
+                                memcpy(&decode_table[cur_table_end],
+                                       decode_table,
+                                       cur_table_end * sizeof(decode_table[0]));
+                                cur_table_end <<= 1;
+                        }
+                } while ((count = len_counts[len]) == 0);
+        }
+
+        /* Process codewords with len > table_bits.  These require subtables. */
+        cur_table_end = 1U << table_bits;
+        subtable_prefix = -1;
+        subtable_start = 0;
         for (;;) {
                 u32 entry;
                 unsigned i;
+                unsigned stride;
+                unsigned bit;
 
-                /* Start a new subtable if the first 'table_bits' bits of the
-                 * codeword don't match the prefix for the current subtable. */
-                if ((codeword & table_mask) != subtable_prefix) {
-new_subtable:
-                        subtable_prefix = (codeword & table_mask);
+                /*
+                 * Start a new subtable if the first 'table_bits' bits of the
+                 * codeword don't match the prefix of the current subtable.
+                 */
+                if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) {
+                        subtable_prefix = (codeword & ((1U << table_bits) - 1));
                         subtable_start = cur_table_end;
-                        /* Calculate the subtable length.  If the codeword
-                         * length exceeds 'table_bits' by n, the subtable needs
-                         * at least 2**n entries.  But it may need more; if
-                         * there are fewer than 2**n codewords of length
-                         * 'table_bits + n' remaining, then n will need to be
-                         * incremented to bring in longer codewords until the
-                         * subtable can be filled completely.  Note that it
-                         * always will, eventually, be possible to fill the
-                         * subtable, since the only case where we may have an
-                         * incomplete code is a single codeword of length 1,
-                         * and that never requires any subtables. */
-                        subtable_bits = codeword_len - table_bits;
-                        remainder = 1U << subtable_bits;
-                        for (;;) {
-                                remainder -= len_counts[table_bits +
-                                                        subtable_bits];
-                                if (remainder <= 0)
-                                        break;
+                        /*
+                         * Calculate the subtable length.  If the codeword has
+                         * length 'table_bits + n', then the subtable needs
+                         * '2^n' entries.  But it may need more; if fewer than
+                         * '2^n' codewords of length 'table_bits + n' remain,
+                         * then the length will need to be incremented to bring
+                         * in longer codewords until the subtable can be
+                         * completely filled.  Note that because the Huffman
+                         * code is complete, it will always be possible to fill
+                         * the subtable eventually.
+                         */
+                        subtable_bits = len - table_bits;
+                        codespace_used = count;
+                        while (codespace_used < (1U << subtable_bits)) {
                                 subtable_bits++;
-                                remainder <<= 1;
+                                codespace_used = (codespace_used << 1) +
+                                        len_counts[table_bits + subtable_bits];
                         }
                         cur_table_end = subtable_start + (1U << subtable_bits);
 
-                        /* Create the entry that points from the main table to
+                        /*
+                         * Create the entry that points from the main table to
                          * the subtable.  This entry contains the index of the
                          * start of the subtable and the number of bits with
                          * which the subtable is indexed (the log base 2 of the
-                         * number of entries it contains). */
+                         * number of entries it contains).
+                         */
                         decode_table[subtable_prefix] =
                                 HUFFDEC_SUBTABLE_POINTER |
                                 HUFFDEC_RESULT_ENTRY(subtable_start) |
                                 subtable_bits;
                 }
 
-                /* Fill the subtable entries */
-                entry = decode_results[sym] | (codeword_len - table_bits);
+                /* Fill the subtable entries for the current codeword. */
+                entry = decode_results[*sorted_syms++] | (len - table_bits);
                 i = subtable_start + (codeword >> table_bits);
+                stride = 1U << (len - table_bits);
                 do {
                         decode_table[i] = entry;
-                        /* stride is 1U << (codeword_len - table_bits) */
                         i += stride;
                 } while (i < cur_table_end);
 
-                /* Advance to the next symbol and codeword */
-                if (sym_idx == num_syms)
+                /* Advance to the next codeword. */
+                if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */
                         return true;
-                sym = sorted_syms[sym_idx++];
-                next_codeword(&codeword, &codeword_len, &stride, len_counts);
+                bit = 1U << bsr32(codeword ^ ((1U << len) - 1));
+                codeword &= bit - 1;
+                codeword |= bit;
+                if (--count == 0) {
+                        while ((count = len_counts[++len]) == 0)
+                                ;
+                }
         }
 }
 
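The old signed 'remainder' bookkeeping and the new 'codespace_used' total
express the same thing: the Kraft sum of the codeword lengths, scaled by
2^max_codeword_len so it stays an integer.  A standalone sketch of the
classification the new code performs up front (illustrative only; the
codespace_used() helper and test vectors are made up here, though the
complete example uses the static-block litlen counts from RFC 1951, namely
24 codewords of length 7, 152 of length 8, and 112 of length 9):

#include <stdint.h>
#include <stdio.h>

/*
 * Scaled Kraft sum: the sum over all codewords of 2^(max_len - len).  The
 * code is complete iff this equals 2^max_len, incomplete if less, overfull
 * if greater; the same classification build_decode_table() now performs.
 */
static uint32_t codespace_used(const unsigned len_counts[], unsigned max_len)
{
        uint32_t used = 0;
        unsigned len;

        for (len = 1; len <= max_len; len++)
                used = (used << 1) + len_counts[len];
        return used;
}

int main(void)
{
        /* Index = codeword length; value = number of codewords. */
        const unsigned complete[10]   = { 0, 0, 0, 0, 0, 0, 0, 24, 152, 112 };
        const unsigned incomplete[10] = { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
        const unsigned overfull[10]   = { 0, 3, 0, 0, 0, 0, 0, 0, 0, 0 };

        printf("complete:   %u (== %u)\n",
               codespace_used(complete, 9), 1U << 9);
        printf("incomplete: %u (<  %u)\n",
               codespace_used(incomplete, 9), 1U << 9);
        printf("overfull:   %u (>  %u)\n",
               codespace_used(overfull, 9), 1U << 9);
        return 0;
}

With this formulation the overfull check is one unsigned comparison against
1U << max_codeword_len, and the same 'codespace_used' variable is reused
later when sizing subtables.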
@@ -935,15 +973,22 @@ libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d,
 LIBDEFLATEAPI struct libdeflate_decompressor *
 libdeflate_alloc_decompressor(void)
 {
-        struct libdeflate_decompressor *d;
-
-        d = malloc(sizeof(*d));
-        if (d == NULL)
-                return NULL;
-
-        d->static_codes_loaded = false;
-
-        return d;
+        /*
+         * Note that only certain parts of the decompressor actually must be
+         * initialized here:
+         *
+         * - 'static_codes_loaded' must be initialized to false.
+         *
+         * - The first half of the main portion of each decode table must be
+         *   initialized to any value, to avoid reading from uninitialized
+         *   memory during table expansion in build_decode_table().  (Although,
+         *   this is really just to avoid warnings with dynamic tools like
+         *   valgrind, since build_decode_table() is guaranteed to initialize
+         *   all entries eventually anyway.)
+         *
+         * But for simplicity, we currently just zero the whole decompressor.
+         */
+        return calloc(1, sizeof(struct libdeflate_decompressor));
 }
 
 LIBDEFLATEAPI void
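The allocator change pairs with the table-doubling change: the doubling
memcpy() can read table entries that haven't been written yet, so the whole
decompressor is now zeroed up front with calloc().  A minimal sketch of the
pattern (toy struct; not libdeflate's real layout):

#include <stdlib.h>

/* Hypothetical stand-in for struct libdeflate_decompressor. */
struct toy_decompressor {
        int static_codes_loaded;        /* must start out false (zero) */
        unsigned litlen_decode_table[1024];
        unsigned offset_decode_table[128];
};

struct toy_decompressor *toy_alloc_decompressor(void)
{
        /*
         * calloc() replaces malloc() plus piecemeal field initialization:
         * the whole struct starts zeroed, so the flag is false and the
         * decode tables never expose uninitialized bytes to the doubling
         * memcpy()s (or to tools like valgrind).
         */
        return calloc(1, sizeof(struct toy_decompressor));
}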
@@ -463,8 +463,8 @@ tmain(int argc, tchar *argv[])
         tz = do_test_zlib("static huffman", in, sizeof(in), out, sizeof(out));
         /*
          * libdeflate is faster than zlib in this case, e.g.
-         * [static huffman, libdeflate]: 175243 KB/s
-         * [static huffman, zlib      ]:  71331 KB/s
+         * [static huffman, libdeflate]: 215861 KB/s
+         * [static huffman, zlib      ]:  73651 KB/s
          */
         putchar('\n');
         ASSERT(t < tz);

@@ -476,8 +476,8 @@ tmain(int argc, tchar *argv[])
         tz = do_test_zlib("dynamic huffman", in, sizeof(in), out, sizeof(out));
         /*
          * libdeflate is slower than zlib in this case, though not super bad.
-         * [dynamic huffman, libdeflate]:  5197 KB/s
-         * [dynamic huffman, zlib      ]: 10206 KB/s
+         * [dynamic huffman, libdeflate]:  6277 KB/s
+         * [dynamic huffman, zlib      ]: 10419 KB/s
          * FIXME: make it faster.
          */
         putchar('\n');