mirror of
https://github.com/cuberite/libdeflate.git
synced 2025-09-13 06:15:51 -04:00
lib/deflate_decompress: build subtables separately
Further improve build_decode_table() performance by splitting the "fill direct entries" and "fill subtable pointers and subtables" steps into separate loops and making some other optimizations.
This commit is contained in:
parent
515b7ad15c
commit
bfc3f610e1
@ -515,6 +515,39 @@ static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
|
|||||||
#undef ENTRY
|
#undef ENTRY
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* Advance to the next codeword in the canonical Huffman code */
|
||||||
|
static forceinline void
|
||||||
|
next_codeword(unsigned *codeword_p, unsigned *codeword_len_p,
|
||||||
|
unsigned *stride_p, unsigned len_counts[])
|
||||||
|
{
|
||||||
|
unsigned codeword = *codeword_p;
|
||||||
|
unsigned codeword_len = *codeword_len_p;
|
||||||
|
unsigned bit;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Increment the codeword, bit-reversed: find the last (highest order) 0
|
||||||
|
* bit in the codeword, set it, and clear any later (higher order) bits.
|
||||||
|
*/
|
||||||
|
bit = 1U << bsr32(~codeword & ((1U << codeword_len) - 1));
|
||||||
|
codeword &= bit - 1;
|
||||||
|
codeword |= bit;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If there are no more codewords of this length, proceed to the next
|
||||||
|
* lowest used length. Increasing the length logically appends 0's to
|
||||||
|
* the codeword, but this is a no-op due to the codeword being
|
||||||
|
* represented in bit-reversed form.
|
||||||
|
*/
|
||||||
|
len_counts[codeword_len]--;
|
||||||
|
while (len_counts[codeword_len] == 0) {
|
||||||
|
codeword_len++;
|
||||||
|
*stride_p <<= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
*codeword_p = codeword;
|
||||||
|
*codeword_len_p = codeword_len;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Build a table for fast decoding of symbols from a Huffman code. As input,
|
* Build a table for fast decoding of symbols from a Huffman code. As input,
|
||||||
* this function takes the codeword length of each symbol which may be used in
|
* this function takes the codeword length of each symbol which may be used in
|
||||||
@ -562,19 +595,20 @@ build_decode_table(u32 decode_table[],
|
|||||||
const unsigned max_codeword_len,
|
const unsigned max_codeword_len,
|
||||||
u16 sorted_syms[])
|
u16 sorted_syms[])
|
||||||
{
|
{
|
||||||
|
const unsigned table_mask = (1U << table_bits) - 1;
|
||||||
unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
|
unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
|
||||||
unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
|
unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
|
||||||
unsigned len;
|
unsigned len;
|
||||||
unsigned sym;
|
unsigned sym;
|
||||||
s32 remainder;
|
s32 remainder;
|
||||||
unsigned sym_idx;
|
unsigned sym_idx;
|
||||||
|
unsigned codeword;
|
||||||
unsigned codeword_len;
|
unsigned codeword_len;
|
||||||
unsigned codeword_reversed = 0;
|
unsigned stride;
|
||||||
unsigned cur_codeword_prefix = -1;
|
unsigned cur_table_end = 1U << table_bits;
|
||||||
unsigned cur_table_start = 0;
|
unsigned subtable_prefix;
|
||||||
unsigned cur_table_bits = table_bits;
|
unsigned subtable_start;
|
||||||
unsigned num_dropped_bits = 0;
|
unsigned subtable_bits;
|
||||||
const unsigned table_mask = (1U << table_bits) - 1;
|
|
||||||
|
|
||||||
/* Count how many symbols have each codeword length, including 0. */
|
/* Count how many symbols have each codeword length, including 0. */
|
||||||
for (len = 0; len <= max_codeword_len; len++)
|
for (len = 0; len <= max_codeword_len; len++)
|
||||||
@ -647,32 +681,53 @@ build_decode_table(u32 decode_table[],
|
|||||||
/* Start with the smallest codeword length and the smallest-valued
|
/* Start with the smallest codeword length and the smallest-valued
|
||||||
* symbol which has that codeword length. */
|
* symbol which has that codeword length. */
|
||||||
sym_idx = offsets[0];
|
sym_idx = offsets[0];
|
||||||
|
sym = sorted_syms[sym_idx++];
|
||||||
|
codeword = 0;
|
||||||
codeword_len = 1;
|
codeword_len = 1;
|
||||||
while (len_counts[codeword_len] == 0)
|
while (len_counts[codeword_len] == 0)
|
||||||
codeword_len++;
|
codeword_len++;
|
||||||
|
stride = 1U << codeword_len;
|
||||||
|
|
||||||
for (;;) { /* For each used symbol and its codeword... */
|
/* For each symbol and its codeword in the main part of the table... */
|
||||||
unsigned sym;
|
do {
|
||||||
u32 entry;
|
u32 entry;
|
||||||
unsigned i;
|
unsigned i;
|
||||||
unsigned end;
|
|
||||||
unsigned increment;
|
|
||||||
unsigned bit;
|
|
||||||
|
|
||||||
/* Get the next symbol. */
|
/* Fill in as many copies of the decode table entry as are
|
||||||
sym = sorted_syms[sym_idx];
|
* needed. The number of entries to fill is a power of 2 and
|
||||||
|
* depends on the codeword length; it could be as few as 1 or as
|
||||||
|
* large as half the size of the table. Since the codewords are
|
||||||
|
* bit-reversed, the indices to fill are those with the codeword
|
||||||
|
* in its low bits; it's the high bits that vary. */
|
||||||
|
entry = decode_results[sym] | codeword_len;
|
||||||
|
i = codeword;
|
||||||
|
do {
|
||||||
|
decode_table[i] = entry;
|
||||||
|
i += stride; /* stride is 1U << codeword_len */
|
||||||
|
} while (i < cur_table_end);
|
||||||
|
|
||||||
/* Start a new subtable if the codeword is long enough to
|
/* Advance to the next symbol and codeword */
|
||||||
* require a subtable, *and* the first 'table_bits' bits of the
|
if (sym_idx == num_syms)
|
||||||
* codeword don't match the prefix for the previous subtable if
|
return true;
|
||||||
* any. */
|
sym = sorted_syms[sym_idx++];
|
||||||
if (codeword_len > table_bits &&
|
next_codeword(&codeword, &codeword_len, &stride, len_counts);
|
||||||
(codeword_reversed & table_mask) != cur_codeword_prefix) {
|
} while (codeword_len <= table_bits);
|
||||||
|
|
||||||
cur_codeword_prefix = (codeword_reversed & table_mask);
|
/* The codeword length has exceeded table_bits, so we're done filling
|
||||||
|
* direct entries. Start filling subtable pointers and subtables. */
|
||||||
|
stride >>= table_bits;
|
||||||
|
goto new_subtable;
|
||||||
|
|
||||||
cur_table_start += 1U << cur_table_bits;
|
for (;;) {
|
||||||
|
u32 entry;
|
||||||
|
unsigned i;
|
||||||
|
|
||||||
|
/* Start a new subtable if the first 'table_bits' bits of the
|
||||||
|
* codeword don't match the prefix for the current subtable. */
|
||||||
|
if ((codeword & table_mask) != subtable_prefix) {
|
||||||
|
new_subtable:
|
||||||
|
subtable_prefix = (codeword & table_mask);
|
||||||
|
subtable_start = cur_table_end;
|
||||||
/* Calculate the subtable length. If the codeword
|
/* Calculate the subtable length. If the codeword
|
||||||
* length exceeds 'table_bits' by n, the subtable needs
|
* length exceeds 'table_bits' by n, the subtable needs
|
||||||
* at least 2**n entries. But it may need more; if
|
* at least 2**n entries. But it may need more; if
|
||||||
@ -684,74 +739,43 @@ build_decode_table(u32 decode_table[],
|
|||||||
* subtable, since the only case where we may have an
|
* subtable, since the only case where we may have an
|
||||||
* incomplete code is a single codeword of length 1,
|
* incomplete code is a single codeword of length 1,
|
||||||
* and that never requires any subtables. */
|
* and that never requires any subtables. */
|
||||||
cur_table_bits = codeword_len - table_bits;
|
subtable_bits = codeword_len - table_bits;
|
||||||
remainder = (s32)1 << cur_table_bits;
|
remainder = 1U << subtable_bits;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
remainder -= len_counts[table_bits +
|
remainder -= len_counts[table_bits +
|
||||||
cur_table_bits];
|
subtable_bits];
|
||||||
if (remainder <= 0)
|
if (remainder <= 0)
|
||||||
break;
|
break;
|
||||||
cur_table_bits++;
|
subtable_bits++;
|
||||||
remainder <<= 1;
|
remainder <<= 1;
|
||||||
}
|
}
|
||||||
|
cur_table_end = subtable_start + (1U << subtable_bits);
|
||||||
|
|
||||||
/* Create the entry that points from the main table to
|
/* Create the entry that points from the main table to
|
||||||
* the subtable. This entry contains the index of the
|
* the subtable. This entry contains the index of the
|
||||||
* start of the subtable and the number of bits with
|
* start of the subtable and the number of bits with
|
||||||
* which the subtable is indexed (the log base 2 of the
|
* which the subtable is indexed (the log base 2 of the
|
||||||
* number of entries it contains). */
|
* number of entries it contains). */
|
||||||
decode_table[cur_codeword_prefix] =
|
decode_table[subtable_prefix] =
|
||||||
HUFFDEC_SUBTABLE_POINTER |
|
HUFFDEC_SUBTABLE_POINTER |
|
||||||
HUFFDEC_RESULT_ENTRY(cur_table_start) |
|
HUFFDEC_RESULT_ENTRY(subtable_start) |
|
||||||
cur_table_bits;
|
subtable_bits;
|
||||||
|
|
||||||
/* Now that we're filling a subtable, we need to drop
|
|
||||||
* the first 'table_bits' bits of the codewords. */
|
|
||||||
num_dropped_bits = table_bits;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Create the decode table entry, which packs the decode result
|
/* Fill the subtable entries */
|
||||||
* and the codeword length (minus 'table_bits' for subtables)
|
entry = decode_results[sym] | (codeword_len - table_bits);
|
||||||
* together. */
|
i = subtable_start + (codeword >> table_bits);
|
||||||
entry = decode_results[sym] | (codeword_len - num_dropped_bits);
|
|
||||||
|
|
||||||
/* Fill in as many copies of the decode table entry as are
|
|
||||||
* needed. The number of entries to fill is a power of 2 and
|
|
||||||
* depends on the codeword length; it could be as few as 1 or as
|
|
||||||
* large as half the size of the table. Since the codewords are
|
|
||||||
* bit-reversed, the indices to fill are those with the codeword
|
|
||||||
* in its low bits; it's the high bits that vary. */
|
|
||||||
i = cur_table_start + (codeword_reversed >> num_dropped_bits);
|
|
||||||
end = cur_table_start + (1U << cur_table_bits);
|
|
||||||
increment = 1U << (codeword_len - num_dropped_bits);
|
|
||||||
do {
|
do {
|
||||||
decode_table[i] = entry;
|
decode_table[i] = entry;
|
||||||
i += increment;
|
/* stride is 1U << (codeword_len - table_bits) */
|
||||||
} while (i < end);
|
i += stride;
|
||||||
|
} while (i < cur_table_end);
|
||||||
|
|
||||||
/* Advance to the next symbol and codeword */
|
/* Advance to the next symbol and codeword */
|
||||||
|
if (sym_idx == num_syms)
|
||||||
if (++sym_idx == num_syms)
|
|
||||||
return true;
|
return true;
|
||||||
/*
|
sym = sorted_syms[sym_idx++];
|
||||||
* Increment the codeword, bit-reversed: find the last (highest
|
next_codeword(&codeword, &codeword_len, &stride, len_counts);
|
||||||
* order) 0 bit in the codeword, set it, and clear any later
|
|
||||||
* (higher order) bits.
|
|
||||||
*/
|
|
||||||
bit = 1U << bsr32(~codeword_reversed &
|
|
||||||
((1U << codeword_len) - 1));
|
|
||||||
codeword_reversed &= bit - 1;
|
|
||||||
codeword_reversed |= bit;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If there are no more codewords of this length, proceed to the
|
|
||||||
* next lowest used length. Increasing the length logically
|
|
||||||
* appends 0's to the codeword, but this is a no-op due to the
|
|
||||||
* codeword being represented in bit-reversed form.
|
|
||||||
*/
|
|
||||||
len_counts[codeword_len]--;
|
|
||||||
while (len_counts[codeword_len] == 0)
|
|
||||||
codeword_len++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user