deflate_compress: only use full offset slot map when useful

Reduce the memory usage of compression levels 1-9 by using the condensed
offset slot map instead of the full one.
This commit is contained in:
Eric Biggers 2022-01-01 19:49:15 -06:00
parent 5e9226fff8
commit 88d45c7e1c
2 changed files with 119 additions and 56 deletions

View File

@ -50,14 +50,6 @@
*/
#define SUPPORT_NEAR_OPTIMAL_PARSING 1
/*
* If this parameter is defined to 1, then the compressor will maintain a full
* map from match offsets to offset slots, rather than a condensed map. This
* will usually improve performance, especially for the near-optimal parsing
* algorithm. However, it will use an additional 32257 bytes of memory.
*/
#define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING
/*
* This is the minimum block length, in uncompressed bytes, which the compressor
* will use. This should be a value below which using shorter blocks is very
@ -211,6 +203,49 @@ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
27, 27, 28,
};
/*
 * A condensed table which maps offset => offset slot as follows:
 *
 * offset <= 256: deflate_offset_slot[offset]
 * offset > 256: deflate_offset_slot[256 + ((offset - 1) >> 7)]
 *
 * The second form works because each offset slot for offsets > 256 covers at
 * least 128 offsets (7 or more extra offset bits) and begins at a base offset
 * 'base' where (base - 1) is a multiple of 128, so sampling the offsets with a
 * stride of 128 (after the -1 bias) still identifies the slot uniquely.
 *
 * Entries 0 and 257 are never indexed: match offsets start at 1, and offsets
 * greater than 256 map to indices >= 258 (offset 257 => 256 + (256 >> 7) =
 * 258).  Those two entries are just zero padding.
 *
 * This table was generated by scripts/gen_offset_slot_map.py.
 */
static const u8 deflate_offset_slot[512] = {
0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7,
7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9,
9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
};
/* The order in which precode codeword lengths are stored */
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
@ -359,23 +394,6 @@ struct libdeflate_compressor {
/* Block split statistics for the currently pending block */
struct block_split_stats split_stats;
/* A table for fast lookups of offset slot by match offset.
*
* If the full table is being used, it is a direct mapping from offset
* to offset slot.
*
* If the condensed table is being used, the first 256 entries map
* directly to the offset slots of offsets 1 through 256. The next 256
* entries map to the offset slots for the remaining offsets, stepping
* through the offsets with a stride of 128. This relies on the fact
* that each of the remaining offset slots contains at least 128 offsets
* and has an offset base that is a multiple of 128. */
#if USE_FULL_OFFSET_SLOT_FAST
u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
#else
u8 offset_slot_fast[512];
#endif
/* The "nice" match length: if a match of this length is found, choose
* it immediately without further consideration. */
unsigned nice_match_length;
@ -480,6 +498,18 @@ struct libdeflate_compressor {
/* The current cost model being used. */
struct deflate_costs costs;
/*
* A table that maps match offset to offset slot. This
* differs from deflate_offset_slot[] in that this is a
* full map, not a condensed one. The full map is more
* appropriate for the near-optimal parser, since the
* near-optimal parser does more offset => offset_slot
* translations, it doesn't intersperse them with
* matchfinding (so cache evictions are less of a
* concern), and it uses more memory anyway.
*/
u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1];
/* Literal/match statistics saved from previous block */
u32 prev_observations[NUM_OBSERVATION_TYPES];
u32 prev_num_observations;
@ -1290,17 +1320,21 @@ deflate_init_static_codes(struct libdeflate_compressor *c)
deflate_make_huffman_codes(&c->freqs, &c->static_codes);
}
/* Return the offset slot for the specified match offset. */
/* Return the offset slot for the given match offset, using the small map. */
static forceinline unsigned
deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
deflate_get_offset_slot(unsigned offset)
{
#if USE_FULL_OFFSET_SLOT_FAST
return c->offset_slot_fast[offset];
#else
#if 1
if (offset <= 256)
return c->offset_slot_fast[offset - 1];
return deflate_offset_slot[offset];
else
return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
return deflate_offset_slot[256 + ((offset - 1) >> 7)];
#else /* Branchless version */
u32 i1 = offset;
u32 i2 = 256 + ((offset - 1) >> 7);
u32 is_small = (s32)(offset - 257) >> 31;
return deflate_offset_slot[(i1 & is_small) ^ (i2 & ~is_small)];
#endif
}
@ -1670,7 +1704,7 @@ deflate_write_item_list(struct deflate_output_bitstream *os,
/* Match offset */
offset_slot = deflate_get_offset_slot(c, offset);
offset_slot = c->p.n.offset_slot_full[offset];
deflate_add_bits(os, codes->codewords.offset[offset_slot],
codes->lens.offset[offset_slot]);
@ -1878,7 +1912,7 @@ deflate_choose_match(struct libdeflate_compressor *c,
{
struct deflate_sequence *seq = *seq_p;
unsigned length_slot = deflate_length_slot[length];
unsigned offset_slot = deflate_get_offset_slot(c, offset);
unsigned offset_slot = deflate_get_offset_slot(offset);
c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++;
c->freqs.offset[offset_slot]++;
@ -2572,7 +2606,7 @@ deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
/* Match */
c->freqs.litlen[DEFLATE_FIRST_LEN_SYM +
deflate_length_slot[length]]++;
c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
c->freqs.offset[c->p.n.offset_slot_full[offset]]++;
}
cur_node += length;
} while (cur_node != end_node);
@ -3004,7 +3038,7 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c,
len = DEFLATE_MIN_MATCH_LEN;
do {
offset = match->offset;
offset_slot = deflate_get_offset_slot(c, offset);
offset_slot = c->p.n.offset_slot_full[offset];
offset_cost = c->p.n.costs.offset_slot[offset_slot];
do {
cost_to_end = offset_cost +
@ -3285,11 +3319,9 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
return deflate_flush_output(&os);
}
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
/* Initialize c->offset_slot_fast. */
/* Initialize c->p.n.offset_slot_full. */
static void
deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
deflate_init_offset_slot_full(struct libdeflate_compressor *c)
{
unsigned offset_slot;
unsigned offset;
@ -3300,27 +3332,15 @@ deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
offset_slot++)
{
offset = deflate_offset_slot_base[offset_slot];
#if USE_FULL_OFFSET_SLOT_FAST
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
do {
c->offset_slot_fast[offset] = offset_slot;
c->p.n.offset_slot_full[offset] = offset_slot;
} while (++offset != offset_end);
#else
if (offset <= 256) {
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
do {
c->offset_slot_fast[offset - 1] = offset_slot;
} while (++offset != offset_end);
} else {
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
do {
c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
} while ((offset += (1 << 7)) != offset_end);
}
#endif
}
}
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
libdeflate_alloc_compressor(int compression_level)
{
@ -3412,12 +3432,14 @@ libdeflate_alloc_compressor(int compression_level)
c->max_search_depth = 35;
c->nice_match_length = 75;
c->p.n.num_optim_passes = 2;
deflate_init_offset_slot_full(c);
break;
case 11:
c->impl = deflate_compress_near_optimal;
c->max_search_depth = 70;
c->nice_match_length = 150;
c->p.n.num_optim_passes = 3;
deflate_init_offset_slot_full(c);
break;
case 12:
default:
@ -3425,11 +3447,11 @@ libdeflate_alloc_compressor(int compression_level)
c->max_search_depth = 150;
c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
c->p.n.num_optim_passes = 4;
deflate_init_offset_slot_full(c);
break;
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
}
deflate_init_offset_slot_fast(c);
deflate_init_static_codes(c);
return c;

41
scripts/gen_offset_slot_map.py Executable file
View File

@ -0,0 +1,41 @@
#!/usr/bin/env python3
#
# This script generates the deflate_offset_slot[] array, which is a condensed
# map from offsets to offset slots.
#
# The condensed map is indexed as follows:
#
#     offset <= 256: deflate_offset_slot[offset]
#     offset > 256:  deflate_offset_slot[256 + ((offset - 1) >> 7)]

# Base (smallest) match offset of each of the 30 DEFLATE offset slots.
DEFLATE_OFFSET_SLOT_BASE = [
    1, 2, 3, 4, 5, 7, 9, 13,
    17, 25, 33, 49, 65, 97, 129, 193,
    257, 385, 513, 769, 1025, 1537, 2049, 3073,
    4097, 6145, 8193, 12289, 16385, 24577,
]

# Number of extra offset bits for each of the 30 DEFLATE offset slots.
DEFLATE_EXTRA_OFFSET_BITS = [
    0, 0, 0, 0, 1, 1, 2, 2,
    3, 3, 4, 4, 5, 5, 6, 6,
    7, 7, 8, 8, 9, 9, 10, 10,
    11, 11, 12, 12, 13, 13,
]


def gen_offset_slot_map():
    """Build and return the 512-entry condensed offset => offset slot map.

    Offsets <= 256 are mapped directly at index 'offset'; larger offsets are
    sampled with a stride of 128 at index 256 + ((offset - 1) >> 7).  Unused
    entries (indices 0 and 257) remain 0.
    """
    offset_slot_map = [0] * 512
    for offset_slot, offset_base in enumerate(DEFLATE_OFFSET_SLOT_BASE):
        num_extra_bits = DEFLATE_EXTRA_OFFSET_BITS[offset_slot]
        offset_end = offset_base + (1 << num_extra_bits)
        if offset_base <= 256:
            for offset in range(offset_base, offset_end):
                offset_slot_map[offset] = offset_slot
        else:
            # Every slot here spans >= 128 offsets and (offset_base - 1) is a
            # multiple of 128, so a stride of 128 visits each condensed entry.
            for offset in range(offset_base, offset_end, 128):
                offset_slot_map[256 + ((offset - 1) >> 7)] = offset_slot
    return offset_slot_map


offset_slot_map = gen_offset_slot_map()

# Print the map as a C array.  The printed name must match the array actually
# declared in lib/deflate_compress.c: deflate_offset_slot[].
print('static const u8 deflate_offset_slot[512] = {')
for i in range(0, len(offset_slot_map), 16):
    print('\t', end='')
    for j, v in enumerate(offset_slot_map[i:i+16]):
        print(f'{v},', end='')
        if j == 15:
            print('')
        else:
            print(' ', end='')
print('};')