mirror of
https://github.com/cuberite/libdeflate.git
synced 2025-08-03 09:46:04 -04:00
deflate_compress: only use full offset slot map when useful
Reduce the memory usage of compression levels 1-9 by using the condensed offset slot map instead of the full one.
This commit is contained in:
parent
5e9226fff8
commit
88d45c7e1c
@ -50,14 +50,6 @@
|
||||
*/
|
||||
#define SUPPORT_NEAR_OPTIMAL_PARSING 1
|
||||
|
||||
/*
|
||||
* If this parameter is defined to 1, then the compressor will maintain a full
|
||||
* map from match offsets to offset slots, rather than a condensed map. This
|
||||
* will usually improve performance, especially for the near-optimal parsing
|
||||
* algorithm. However, it will use an additional 32257 bytes of memory.
|
||||
*/
|
||||
#define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING
|
||||
|
||||
/*
|
||||
* This is the minimum block length, in uncompressed bytes, which the compressor
|
||||
* will use. This should be a value below which using shorter blocks is very
|
||||
@ -211,6 +203,49 @@ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
|
||||
27, 27, 28,
|
||||
};
|
||||
|
||||
/*
|
||||
* A condensed table which maps offset => offset slot as follows:
|
||||
*
|
||||
* offset <= 256: deflate_offset_slot[offset]
|
||||
* offset > 256: deflate_offset_slot[256 + ((offset - 1) >> 7)]
|
||||
*
|
||||
* This table was generated by scripts/gen_offset_slot_map.py.
|
||||
*/
|
||||
static const u8 deflate_offset_slot[512] = {
|
||||
0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7,
|
||||
7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
||||
11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
||||
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
||||
12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
||||
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
||||
13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||
14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
|
||||
22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
|
||||
26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
|
||||
26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
|
||||
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
|
||||
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
|
||||
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
|
||||
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
|
||||
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
|
||||
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
|
||||
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
|
||||
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
|
||||
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
|
||||
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
|
||||
};
|
||||
|
||||
/* The order in which precode codeword lengths are stored */
|
||||
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
|
||||
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
|
||||
@ -359,23 +394,6 @@ struct libdeflate_compressor {
|
||||
/* Block split statistics for the currently pending block */
|
||||
struct block_split_stats split_stats;
|
||||
|
||||
/* A table for fast lookups of offset slot by match offset.
|
||||
*
|
||||
* If the full table is being used, it is a direct mapping from offset
|
||||
* to offset slot.
|
||||
*
|
||||
* If the condensed table is being used, the first 256 entries map
|
||||
* directly to the offset slots of offsets 1 through 256. The next 256
|
||||
* entries map to the offset slots for the remaining offsets, stepping
|
||||
* through the offsets with a stride of 128. This relies on the fact
|
||||
* that each of the remaining offset slots contains at least 128 offsets
|
||||
* and has an offset base that is a multiple of 128. */
|
||||
#if USE_FULL_OFFSET_SLOT_FAST
|
||||
u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
|
||||
#else
|
||||
u8 offset_slot_fast[512];
|
||||
#endif
|
||||
|
||||
/* The "nice" match length: if a match of this length is found, choose
|
||||
* it immediately without further consideration. */
|
||||
unsigned nice_match_length;
|
||||
@ -480,6 +498,18 @@ struct libdeflate_compressor {
|
||||
/* The current cost model being used. */
|
||||
struct deflate_costs costs;
|
||||
|
||||
/*
|
||||
* A table that maps match offset to offset slot. This
|
||||
* differs from deflate_offset_slot[] in that this is a
|
||||
* full map, not a condensed one. The full map is more
|
||||
* appropriate for the near-optimal parser, since the
|
||||
* near-optimal parser does more offset => offset_slot
|
||||
* translations, it doesn't intersperse them with
|
||||
* matchfinding (so cache evictions are less of a
|
||||
* concern), and it uses more memory anyway.
|
||||
*/
|
||||
u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1];
|
||||
|
||||
/* Literal/match statistics saved from previous block */
|
||||
u32 prev_observations[NUM_OBSERVATION_TYPES];
|
||||
u32 prev_num_observations;
|
||||
@ -1290,17 +1320,21 @@ deflate_init_static_codes(struct libdeflate_compressor *c)
|
||||
deflate_make_huffman_codes(&c->freqs, &c->static_codes);
|
||||
}
|
||||
|
||||
/* Return the offset slot for the specified match offset. */
|
||||
/* Return the offset slot for the given match offset, using the small map. */
|
||||
static forceinline unsigned
|
||||
deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
|
||||
deflate_get_offset_slot(unsigned offset)
|
||||
{
|
||||
#if USE_FULL_OFFSET_SLOT_FAST
|
||||
return c->offset_slot_fast[offset];
|
||||
#else
|
||||
#if 1
|
||||
if (offset <= 256)
|
||||
return c->offset_slot_fast[offset - 1];
|
||||
return deflate_offset_slot[offset];
|
||||
else
|
||||
return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
|
||||
return deflate_offset_slot[256 + ((offset - 1) >> 7)];
|
||||
#else /* Branchless version */
|
||||
u32 i1 = offset;
|
||||
u32 i2 = 256 + ((offset - 1) >> 7);
|
||||
u32 is_small = (s32)(offset - 257) >> 31;
|
||||
|
||||
return deflate_offset_slot[(i1 & is_small) ^ (i2 & ~is_small)];
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -1670,7 +1704,7 @@ deflate_write_item_list(struct deflate_output_bitstream *os,
|
||||
|
||||
|
||||
/* Match offset */
|
||||
offset_slot = deflate_get_offset_slot(c, offset);
|
||||
offset_slot = c->p.n.offset_slot_full[offset];
|
||||
deflate_add_bits(os, codes->codewords.offset[offset_slot],
|
||||
codes->lens.offset[offset_slot]);
|
||||
|
||||
@ -1878,7 +1912,7 @@ deflate_choose_match(struct libdeflate_compressor *c,
|
||||
{
|
||||
struct deflate_sequence *seq = *seq_p;
|
||||
unsigned length_slot = deflate_length_slot[length];
|
||||
unsigned offset_slot = deflate_get_offset_slot(c, offset);
|
||||
unsigned offset_slot = deflate_get_offset_slot(offset);
|
||||
|
||||
c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++;
|
||||
c->freqs.offset[offset_slot]++;
|
||||
@ -2572,7 +2606,7 @@ deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
|
||||
/* Match */
|
||||
c->freqs.litlen[DEFLATE_FIRST_LEN_SYM +
|
||||
deflate_length_slot[length]]++;
|
||||
c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
|
||||
c->freqs.offset[c->p.n.offset_slot_full[offset]]++;
|
||||
}
|
||||
cur_node += length;
|
||||
} while (cur_node != end_node);
|
||||
@ -3004,7 +3038,7 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c,
|
||||
len = DEFLATE_MIN_MATCH_LEN;
|
||||
do {
|
||||
offset = match->offset;
|
||||
offset_slot = deflate_get_offset_slot(c, offset);
|
||||
offset_slot = c->p.n.offset_slot_full[offset];
|
||||
offset_cost = c->p.n.costs.offset_slot[offset_slot];
|
||||
do {
|
||||
cost_to_end = offset_cost +
|
||||
@ -3285,11 +3319,9 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
|
||||
return deflate_flush_output(&os);
|
||||
}
|
||||
|
||||
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
||||
|
||||
/* Initialize c->offset_slot_fast. */
|
||||
/* Initialize c->p.n.offset_slot_full. */
|
||||
static void
|
||||
deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
|
||||
deflate_init_offset_slot_full(struct libdeflate_compressor *c)
|
||||
{
|
||||
unsigned offset_slot;
|
||||
unsigned offset;
|
||||
@ -3300,27 +3332,15 @@ deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
|
||||
offset_slot++)
|
||||
{
|
||||
offset = deflate_offset_slot_base[offset_slot];
|
||||
#if USE_FULL_OFFSET_SLOT_FAST
|
||||
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
|
||||
do {
|
||||
c->offset_slot_fast[offset] = offset_slot;
|
||||
c->p.n.offset_slot_full[offset] = offset_slot;
|
||||
} while (++offset != offset_end);
|
||||
#else
|
||||
if (offset <= 256) {
|
||||
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
|
||||
do {
|
||||
c->offset_slot_fast[offset - 1] = offset_slot;
|
||||
} while (++offset != offset_end);
|
||||
} else {
|
||||
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
|
||||
do {
|
||||
c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
|
||||
} while ((offset += (1 << 7)) != offset_end);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
||||
|
||||
LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
|
||||
libdeflate_alloc_compressor(int compression_level)
|
||||
{
|
||||
@ -3412,12 +3432,14 @@ libdeflate_alloc_compressor(int compression_level)
|
||||
c->max_search_depth = 35;
|
||||
c->nice_match_length = 75;
|
||||
c->p.n.num_optim_passes = 2;
|
||||
deflate_init_offset_slot_full(c);
|
||||
break;
|
||||
case 11:
|
||||
c->impl = deflate_compress_near_optimal;
|
||||
c->max_search_depth = 70;
|
||||
c->nice_match_length = 150;
|
||||
c->p.n.num_optim_passes = 3;
|
||||
deflate_init_offset_slot_full(c);
|
||||
break;
|
||||
case 12:
|
||||
default:
|
||||
@ -3425,11 +3447,11 @@ libdeflate_alloc_compressor(int compression_level)
|
||||
c->max_search_depth = 150;
|
||||
c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
|
||||
c->p.n.num_optim_passes = 4;
|
||||
deflate_init_offset_slot_full(c);
|
||||
break;
|
||||
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
||||
}
|
||||
|
||||
deflate_init_offset_slot_fast(c);
|
||||
deflate_init_static_codes(c);
|
||||
|
||||
return c;
|
||||
|
41
scripts/gen_offset_slot_map.py
Executable file
41
scripts/gen_offset_slot_map.py
Executable file
@ -0,0 +1,41 @@
|
||||
#!/usr/bin/env python3
#
# This script generates the deflate_offset_slot[] array, which is a condensed
# map from offsets to offset slots.
#
# Layout of the generated table:
#
#   offset <= 256: deflate_offset_slot[offset]
#   offset > 256:  deflate_offset_slot[256 + ((offset - 1) >> 7)]
#
# Entries 0 and 257 are never selected by either formula for a valid offset
# (offsets start at 1), so they are left as 0.

# First match offset covered by each offset slot.
DEFLATE_OFFSET_SLOT_BASE = [
    1,    2,    3,    4,     5,     7,     9,    13,
    17,   25,   33,   49,    65,    97,    129,  193,
    257,  385,  513,  769,   1025,  1537,  2049, 3073,
    4097, 6145, 8193, 12289, 16385, 24577,
]

# Number of extra offset bits for each slot; a slot covers
# (1 << extra_bits) consecutive offsets starting at its base.
DEFLATE_EXTRA_OFFSET_BITS = [
    0,  0,  0,  0,  1,  1,  2,  2,
    3,  3,  4,  4,  5,  5,  6,  6,
    7,  7,  8,  8,  9,  9,  10, 10,
    11, 11, 12, 12, 13, 13,
]

# Total number of entries in the condensed map.
OFFSET_SLOT_MAP_SIZE = 512

# Number of table entries printed per output line.
ENTRIES_PER_LINE = 16


def build_offset_slot_map():
    """Return the condensed offset => offset slot map as a list of ints."""
    offset_slot_map = [0] * OFFSET_SLOT_MAP_SIZE

    for offset_slot, offset_base in enumerate(DEFLATE_OFFSET_SLOT_BASE):
        num_extra_bits = DEFLATE_EXTRA_OFFSET_BITS[offset_slot]
        offset_end = offset_base + (1 << num_extra_bits)
        if offset_base <= 256:
            # Small offsets are mapped one-to-one.
            for offset in range(offset_base, offset_end):
                offset_slot_map[offset] = offset_slot
        else:
            # Larger offsets are sampled with a stride of 128: one entry
            # per group of 128 consecutive offsets.
            for offset in range(offset_base, offset_end, 128):
                offset_slot_map[256 + ((offset - 1) >> 7)] = offset_slot

    return offset_slot_map


def format_table(offset_slot_map):
    """Return the C source text that defines deflate_offset_slot[].

    The array name must match the table declared in the C source so that the
    output can be pasted in unchanged.
    """
    lines = [f'static const u8 deflate_offset_slot[{len(offset_slot_map)}] = {{']
    for i in range(0, len(offset_slot_map), ENTRIES_PER_LINE):
        row = offset_slot_map[i:i + ENTRIES_PER_LINE]
        lines.append('\t' + ' '.join(f'{v},' for v in row))
    lines.append('};')
    return '\n'.join(lines)


def main():
    print(format_table(build_offset_slot_map()))


if __name__ == '__main__':
    main()
|
Loading…
x
Reference in New Issue
Block a user