mirror of
https://github.com/cuberite/libdeflate.git
synced 2025-08-05 10:46:34 -04:00
deflate_compress: only use full offset slot map when useful
Reduce the memory usage of compression levels 1-9 by using the condensed offset slot map instead of the full one.
This commit is contained in:
parent
5e9226fff8
commit
88d45c7e1c
@ -50,14 +50,6 @@
|
|||||||
*/
|
*/
|
||||||
#define SUPPORT_NEAR_OPTIMAL_PARSING 1
|
#define SUPPORT_NEAR_OPTIMAL_PARSING 1
|
||||||
|
|
||||||
/*
|
|
||||||
* If this parameter is defined to 1, then the compressor will maintain a full
|
|
||||||
* map from match offsets to offset slots, rather than a condensed map. This
|
|
||||||
* will usually improve performance, especially for the near-optimal parsing
|
|
||||||
* algorithm. However, it will use an additional 32257 bytes of memory.
|
|
||||||
*/
|
|
||||||
#define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This is the minimum block length, in uncompressed bytes, which the compressor
|
* This is the minimum block length, in uncompressed bytes, which the compressor
|
||||||
* will use. This should be a value below which using shorter blocks is very
|
* will use. This should be a value below which using shorter blocks is very
|
||||||
@ -211,6 +203,49 @@ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
|
|||||||
27, 27, 28,
|
27, 27, 28,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A condensed table which maps offset => offset slot as follows:
|
||||||
|
*
|
||||||
|
* offset <= 256: deflate_offset_slot[offset]
|
||||||
|
* offset > 256: deflate_offset_slot[256 + ((offset - 1) >> 7)]
|
||||||
|
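*
* Entries 0 and 257 are never indexed by a valid offset (offsets start at 1,
* and every offset > 256 maps to index 258 or higher), so they are left as 0.
*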
|
||||||
|
* This table was generated by scripts/gen_offset_slot_map.py.
|
||||||
|
*/
|
||||||
|
static const u8 deflate_offset_slot[512] = {
|
||||||
|
0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7,
|
||||||
|
7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9,
|
||||||
|
9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||||
|
10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
||||||
|
11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
||||||
|
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
||||||
|
12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
||||||
|
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
||||||
|
13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||||
|
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||||
|
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||||
|
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||||
|
14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||||
|
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||||
|
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||||
|
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||||
|
15, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
|
||||||
|
22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
|
||||||
|
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||||
|
25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
|
||||||
|
26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
|
||||||
|
26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
|
||||||
|
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
|
||||||
|
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
|
||||||
|
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
|
||||||
|
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
|
||||||
|
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
|
||||||
|
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
|
||||||
|
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
|
||||||
|
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
|
||||||
|
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
|
||||||
|
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
|
||||||
|
};
|
||||||
|
|
||||||
/* The order in which precode codeword lengths are stored */
|
/* The order in which precode codeword lengths are stored */
|
||||||
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
|
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
|
||||||
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
|
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
|
||||||
@ -359,23 +394,6 @@ struct libdeflate_compressor {
|
|||||||
/* Block split statistics for the currently pending block */
|
/* Block split statistics for the currently pending block */
|
||||||
struct block_split_stats split_stats;
|
struct block_split_stats split_stats;
|
||||||
|
|
||||||
/* A table for fast lookups of offset slot by match offset.
|
|
||||||
*
|
|
||||||
* If the full table is being used, it is a direct mapping from offset
|
|
||||||
* to offset slot.
|
|
||||||
*
|
|
||||||
* If the condensed table is being used, the first 256 entries map
|
|
||||||
* directly to the offset slots of offsets 1 through 256. The next 256
|
|
||||||
* entries map to the offset slots for the remaining offsets, stepping
|
|
||||||
* through the offsets with a stride of 128. This relies on the fact
|
|
||||||
* that each of the remaining offset slots contains at least 128 offsets
|
|
||||||
* and has an offset base that is a multiple of 128. */
|
|
||||||
#if USE_FULL_OFFSET_SLOT_FAST
|
|
||||||
u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
|
|
||||||
#else
|
|
||||||
u8 offset_slot_fast[512];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* The "nice" match length: if a match of this length is found, choose
|
/* The "nice" match length: if a match of this length is found, choose
|
||||||
* it immediately without further consideration. */
|
* it immediately without further consideration. */
|
||||||
unsigned nice_match_length;
|
unsigned nice_match_length;
|
||||||
@ -480,6 +498,18 @@ struct libdeflate_compressor {
|
|||||||
/* The current cost model being used. */
|
/* The current cost model being used. */
|
||||||
struct deflate_costs costs;
|
struct deflate_costs costs;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A table that maps match offset to offset slot. This
|
||||||
|
* differs from deflate_offset_slot[] in that this is a
|
||||||
|
* full map, not a condensed one. The full map is more
|
||||||
|
* appropriate for the near-optimal parser, since the
|
||||||
|
* near-optimal parser does more offset => offset_slot
|
||||||
|
* translations, it doesn't intersperse them with
|
||||||
|
* matchfinding (so cache evictions are less of a
|
||||||
|
* concern), and it uses more memory anyway.
|
||||||
|
*/
|
||||||
|
u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1];
|
||||||
|
|
||||||
/* Literal/match statistics saved from previous block */
|
/* Literal/match statistics saved from previous block */
|
||||||
u32 prev_observations[NUM_OBSERVATION_TYPES];
|
u32 prev_observations[NUM_OBSERVATION_TYPES];
|
||||||
u32 prev_num_observations;
|
u32 prev_num_observations;
|
||||||
@ -1290,17 +1320,21 @@ deflate_init_static_codes(struct libdeflate_compressor *c)
|
|||||||
deflate_make_huffman_codes(&c->freqs, &c->static_codes);
|
deflate_make_huffman_codes(&c->freqs, &c->static_codes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Return the offset slot for the specified match offset. */
|
/* Return the offset slot for the given match offset, using the condensed map. */
|
||||||
static forceinline unsigned
|
static forceinline unsigned
|
||||||
deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
|
deflate_get_offset_slot(unsigned offset)
|
||||||
{
|
{
|
||||||
#if USE_FULL_OFFSET_SLOT_FAST
|
#if 1
|
||||||
return c->offset_slot_fast[offset];
|
|
||||||
#else
|
|
||||||
if (offset <= 256)
|
if (offset <= 256)
|
||||||
return c->offset_slot_fast[offset - 1];
|
return deflate_offset_slot[offset];
|
||||||
else
|
else
|
||||||
return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
|
return deflate_offset_slot[256 + ((offset - 1) >> 7)];
|
||||||
|
#else /* Branchless version */
|
||||||
|
u32 i1 = offset;
|
||||||
|
u32 i2 = 256 + ((offset - 1) >> 7);
|
||||||
|
u32 is_small = (s32)(offset - 257) >> 31;
|
||||||
|
|
||||||
|
return deflate_offset_slot[(i1 & is_small) ^ (i2 & ~is_small)];
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1670,7 +1704,7 @@ deflate_write_item_list(struct deflate_output_bitstream *os,
|
|||||||
|
|
||||||
|
|
||||||
/* Match offset */
|
/* Match offset */
|
||||||
offset_slot = deflate_get_offset_slot(c, offset);
|
offset_slot = c->p.n.offset_slot_full[offset];
|
||||||
deflate_add_bits(os, codes->codewords.offset[offset_slot],
|
deflate_add_bits(os, codes->codewords.offset[offset_slot],
|
||||||
codes->lens.offset[offset_slot]);
|
codes->lens.offset[offset_slot]);
|
||||||
|
|
||||||
@ -1878,7 +1912,7 @@ deflate_choose_match(struct libdeflate_compressor *c,
|
|||||||
{
|
{
|
||||||
struct deflate_sequence *seq = *seq_p;
|
struct deflate_sequence *seq = *seq_p;
|
||||||
unsigned length_slot = deflate_length_slot[length];
|
unsigned length_slot = deflate_length_slot[length];
|
||||||
unsigned offset_slot = deflate_get_offset_slot(c, offset);
|
unsigned offset_slot = deflate_get_offset_slot(offset);
|
||||||
|
|
||||||
c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++;
|
c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++;
|
||||||
c->freqs.offset[offset_slot]++;
|
c->freqs.offset[offset_slot]++;
|
||||||
@ -2572,7 +2606,7 @@ deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
|
|||||||
/* Match */
|
/* Match */
|
||||||
c->freqs.litlen[DEFLATE_FIRST_LEN_SYM +
|
c->freqs.litlen[DEFLATE_FIRST_LEN_SYM +
|
||||||
deflate_length_slot[length]]++;
|
deflate_length_slot[length]]++;
|
||||||
c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
|
c->freqs.offset[c->p.n.offset_slot_full[offset]]++;
|
||||||
}
|
}
|
||||||
cur_node += length;
|
cur_node += length;
|
||||||
} while (cur_node != end_node);
|
} while (cur_node != end_node);
|
||||||
@ -3004,7 +3038,7 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c,
|
|||||||
len = DEFLATE_MIN_MATCH_LEN;
|
len = DEFLATE_MIN_MATCH_LEN;
|
||||||
do {
|
do {
|
||||||
offset = match->offset;
|
offset = match->offset;
|
||||||
offset_slot = deflate_get_offset_slot(c, offset);
|
offset_slot = c->p.n.offset_slot_full[offset];
|
||||||
offset_cost = c->p.n.costs.offset_slot[offset_slot];
|
offset_cost = c->p.n.costs.offset_slot[offset_slot];
|
||||||
do {
|
do {
|
||||||
cost_to_end = offset_cost +
|
cost_to_end = offset_cost +
|
||||||
@ -3285,11 +3319,9 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
|
|||||||
return deflate_flush_output(&os);
|
return deflate_flush_output(&os);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
/* Initialize c->p.n.offset_slot_full. */
|
||||||
|
|
||||||
/* Initialize c->offset_slot_fast. */
|
|
||||||
static void
|
static void
|
||||||
deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
|
deflate_init_offset_slot_full(struct libdeflate_compressor *c)
|
||||||
{
|
{
|
||||||
unsigned offset_slot;
|
unsigned offset_slot;
|
||||||
unsigned offset;
|
unsigned offset;
|
||||||
@ -3300,27 +3332,15 @@ deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
|
|||||||
offset_slot++)
|
offset_slot++)
|
||||||
{
|
{
|
||||||
offset = deflate_offset_slot_base[offset_slot];
|
offset = deflate_offset_slot_base[offset_slot];
|
||||||
#if USE_FULL_OFFSET_SLOT_FAST
|
|
||||||
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
|
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
|
||||||
do {
|
do {
|
||||||
c->offset_slot_fast[offset] = offset_slot;
|
c->p.n.offset_slot_full[offset] = offset_slot;
|
||||||
} while (++offset != offset_end);
|
} while (++offset != offset_end);
|
||||||
#else
|
|
||||||
if (offset <= 256) {
|
|
||||||
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
|
|
||||||
do {
|
|
||||||
c->offset_slot_fast[offset - 1] = offset_slot;
|
|
||||||
} while (++offset != offset_end);
|
|
||||||
} else {
|
|
||||||
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
|
|
||||||
do {
|
|
||||||
c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
|
|
||||||
} while ((offset += (1 << 7)) != offset_end);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
||||||
|
|
||||||
LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
|
LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
|
||||||
libdeflate_alloc_compressor(int compression_level)
|
libdeflate_alloc_compressor(int compression_level)
|
||||||
{
|
{
|
||||||
@ -3412,12 +3432,14 @@ libdeflate_alloc_compressor(int compression_level)
|
|||||||
c->max_search_depth = 35;
|
c->max_search_depth = 35;
|
||||||
c->nice_match_length = 75;
|
c->nice_match_length = 75;
|
||||||
c->p.n.num_optim_passes = 2;
|
c->p.n.num_optim_passes = 2;
|
||||||
|
deflate_init_offset_slot_full(c);
|
||||||
break;
|
break;
|
||||||
case 11:
|
case 11:
|
||||||
c->impl = deflate_compress_near_optimal;
|
c->impl = deflate_compress_near_optimal;
|
||||||
c->max_search_depth = 70;
|
c->max_search_depth = 70;
|
||||||
c->nice_match_length = 150;
|
c->nice_match_length = 150;
|
||||||
c->p.n.num_optim_passes = 3;
|
c->p.n.num_optim_passes = 3;
|
||||||
|
deflate_init_offset_slot_full(c);
|
||||||
break;
|
break;
|
||||||
case 12:
|
case 12:
|
||||||
default:
|
default:
|
||||||
@ -3425,11 +3447,11 @@ libdeflate_alloc_compressor(int compression_level)
|
|||||||
c->max_search_depth = 150;
|
c->max_search_depth = 150;
|
||||||
c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
|
c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
|
||||||
c->p.n.num_optim_passes = 4;
|
c->p.n.num_optim_passes = 4;
|
||||||
|
deflate_init_offset_slot_full(c);
|
||||||
break;
|
break;
|
||||||
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
||||||
}
|
}
|
||||||
|
|
||||||
deflate_init_offset_slot_fast(c);
|
|
||||||
deflate_init_static_codes(c);
|
deflate_init_static_codes(c);
|
||||||
|
|
||||||
return c;
|
return c;
|
||||||
|
41
scripts/gen_offset_slot_map.py
Executable file
41
scripts/gen_offset_slot_map.py
Executable file
@ -0,0 +1,41 @@
|
|||||||
|
#!/usr/bin/env python3
#
# This script generates the deflate_offset_slot[] array, which is a condensed
# map from offsets to offset slots.
#
# Lookup convention used by the generated table:
#
#	offset <= 256:	deflate_offset_slot[offset]
#	offset > 256:	deflate_offset_slot[256 + ((offset - 1) >> 7)]
#
# Entries 0 and 257 are never indexed by a valid offset and are left as 0.

# First match offset covered by each offset slot.
DEFLATE_OFFSET_SLOT_BASE = [
    1    , 2    , 3    , 4     , 5     , 7     , 9     , 13    ,
    17   , 25   , 33   , 49    , 65    , 97    , 129   , 193   ,
    257  , 385  , 513  , 769   , 1025  , 1537  , 2049  , 3073  ,
    4097 , 6145 , 8193 , 12289 , 16385 , 24577 ,
]

# Number of extra offset bits for each offset slot; slot i covers
# (1 << DEFLATE_EXTRA_OFFSET_BITS[i]) consecutive offsets.
DEFLATE_EXTRA_OFFSET_BITS = [
    0    , 0    , 0    , 0     , 1     , 1     , 2     , 2     ,
    3    , 3    , 4    , 4     , 5     , 5     , 6     , 6     ,
    7    , 7    , 8    , 8     , 9     , 9     , 10    , 10    ,
    11   , 11   , 12   , 12    , 13    , 13    ,
]


def build_offset_slot_map():
    """Return the 512-entry condensed offset => offset slot map as a list."""
    offset_slot_map = [0] * 512

    for offset_slot, offset_base in enumerate(DEFLATE_OFFSET_SLOT_BASE):
        num_extra_bits = DEFLATE_EXTRA_OFFSET_BITS[offset_slot]
        offset_end = offset_base + (1 << num_extra_bits)
        if offset_base <= 256:
            # Small offsets are mapped directly, one entry per offset.
            for offset in range(offset_base, offset_end):
                offset_slot_map[offset] = offset_slot
        else:
            # Larger offsets are mapped with a stride of 128: every slot
            # from here on starts at (multiple of 128) + 1 and covers at
            # least 128 offsets, so one entry per 128 offsets suffices.
            for offset in range(offset_base, offset_end, 128):
                offset_slot_map[256 + ((offset - 1) >> 7)] = offset_slot

    return offset_slot_map


def format_offset_slot_map(offset_slot_map):
    """Return the C source text of the deflate_offset_slot[] definition.

    The array name must match the identifier used in deflate_compress.c so
    that the output can be pasted in without further editing.
    """
    lines = [f'static const u8 deflate_offset_slot[{len(offset_slot_map)}] = {{']
    for i in range(0, len(offset_slot_map), 16):
        row = offset_slot_map[i:i+16]
        lines.append('\t' + ' '.join(f'{v},' for v in row))
    lines.append('};')
    return '\n'.join(lines)


if __name__ == '__main__':
    print(format_offset_slot_map(build_offset_slot_map()))
|
Loading…
x
Reference in New Issue
Block a user