decompress: handle Huffman codes with one used symbol

This commit is contained in:
Eric Biggers 2016-01-27 00:17:56 -06:00
parent 4325101bb9
commit 446e2268b3

View File

@@ -565,10 +565,10 @@ build_decode_table(u32 decode_table[],
len_counts[lens[sym]]++; len_counts[lens[sym]]++;
/* It is already guaranteed that all lengths are <= max_codeword_len, /* It is already guaranteed that all lengths are <= max_codeword_len,
* but it cannot be assumed they form a valid prefix code. A codeword * but it cannot be assumed they form a complete prefix code. A
* of length n should require a proportion of the codespace equaling * codeword of length n should require a proportion of the codespace
* (1/2)^n. The code is valid if and only if, by this measure, the * equaling (1/2)^n. The code is complete if and only if, by this
* codespace is exactly filled by the lengths. */ * measure, the codespace is exactly filled by the lengths. */
remainder = 1; remainder = 1;
for (len = 1; len <= max_codeword_len; len++) { for (len = 1; len <= max_codeword_len; len++) {
remainder <<= 1; remainder <<= 1;
@@ -582,27 +582,34 @@ build_decode_table(u32 decode_table[],
if (unlikely(remainder != 0)) { if (unlikely(remainder != 0)) {
/* The lengths do not fill the codespace; that is, they form an /* The lengths do not fill the codespace; that is, they form an
* incomplete set. */ * incomplete code. */
if (remainder == (1U << max_codeword_len)) {
/* The code is completely empty. By definition, no /* Initialize the table entries to default values. When
* symbols can be decoded with an empty code. * decompressing a well-formed stream, these default values will
* Consequently, we technically don't even need to fill * never be used. But since a malformed stream might contain
* in the decode table. However, to avoid accessing * any bits at all, these entries need to be set anyway. */
* uninitialized memory if the algorithm nevertheless u32 entry = make_decode_table_entry(decode_results[0], 1);
* attempts to decode symbols using such a code, we fill for (unsigned i = 0; i < (1U << table_bits); i++)
* the decode table with default values. */ decode_table[i] = entry;
for (unsigned i = 0; i < (1U << table_bits); i++) {
decode_table[i] = /* A completely empty code is permitted. */
make_decode_table_entry( if (remainder == (1U << max_codeword_len))
decode_results[0], 1);
}
return true; return true;
}
return false; /* The code is nonempty and incomplete. Proceed only if there
* is a single used symbol and its codeword has length 1. The
* DEFLATE RFC is somewhat unclear regarding this case. What
* zlib's decompressor does is permit this case for
* literal/length and offset codes and assume the codeword is 0
* rather than 1. We do the same except we allow this case for
* precodes too. */
if (remainder != (1U << (max_codeword_len - 1)) ||
len_counts[1] != 1)
return false;
} }
/* Sort the symbols primarily by length and secondarily by symbol value. /* Sort the symbols primarily by increasing codeword length and
*/ * secondarily by increasing symbol value. */
/* Initialize 'offsets' so that offsets[len] is the number of codewords /* Initialize 'offsets' so that offsets[len] is the number of codewords
* shorter than 'len' bits, including length 0. */ * shorter than 'len' bits, including length 0. */
@@ -617,17 +624,16 @@ build_decode_table(u32 decode_table[],
/* Generate the decode table entries. Since we process codewords from /* Generate the decode table entries. Since we process codewords from
* shortest to longest, the main portion of the decode table is filled * shortest to longest, the main portion of the decode table is filled
* first; then the subtables are filled. Note that it's already been * first; then the subtables are filled. Note that it's already been
* verified that the codewords form a valid (complete) prefix code. */ * verified that the code is nonempty and not over-subscribed. */
/* Start with the index of the first used symbol. */ /* Start with the smallest codeword length and the smallest-valued
* symbol which has that codeword length. */
sym_idx = offsets[0]; sym_idx = offsets[0];
/* Start with the smallest used codeword length. */
codeword_len = 1; codeword_len = 1;
while (len_counts[codeword_len] == 0) while (len_counts[codeword_len] == 0)
codeword_len++; codeword_len++;
for (;;) { /* For used each symbol and its codeword... */ for (;;) { /* For each used symbol and its codeword... */
unsigned sym; unsigned sym;
u32 entry; u32 entry;
unsigned i; unsigned i;