mirror of
https://github.com/cuberite/libdeflate.git
synced 2025-08-03 17:56:17 -04:00
deflate_compress: automatically select minimum match length
In the greedy and lazy compressors, automatically increase the minimum match length from the default of 3 if the data doesn't contain many different literals. This greatly improves the compression ratio of levels 1-9 on certain types of data, such as DNA sequencing data, while not worsening the ratio on other types of data. The near-optimal compressor (used by compression levels 10-12) continues to use a minimum match length of 3, since it already did a better job at deciding when short matches are worthwhile. (The method for setting the initial costs needs improvement; later commits address that.) Resolves https://github.com/ebiggers/libdeflate/issues/57
This commit is contained in:
parent
3bc42e23d6
commit
69a7ca07fd
@ -1990,6 +1990,101 @@ adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Choose the minimum match length for the greedy and lazy parsers.
|
||||||
|
*
|
||||||
|
* By default the minimum match length is 3, which is the smallest length the
|
||||||
|
* DEFLATE format allows. However, with greedy and lazy parsing, some data
|
||||||
|
* (e.g. DNA sequencing data) benefits greatly from a longer minimum length.
|
||||||
|
* Typically, this is because literals are very cheap. In general, the
|
||||||
|
* near-optimal parser handles this case naturally, but the greedy and lazy
|
||||||
|
* parsers need a heuristic to decide when to use short matches.
|
||||||
|
*
|
||||||
|
* The heuristic we use is to make the minimum match length depend on the number
|
||||||
|
* of different literals that exist in the data. If there are many different
|
||||||
|
* literals, then literals will probably be expensive, so short matches will
|
||||||
|
* probably be worthwhile. Conversely, if not many literals are used, then
|
||||||
|
* probably literals will be cheap and short matches won't be worthwhile.
|
||||||
|
*/
|
||||||
|
static unsigned
|
||||||
|
choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth)
|
||||||
|
{
|
||||||
|
/* map from num_used_literals to min_len */
|
||||||
|
static const u8 min_lens[] = {
|
||||||
|
9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6,
|
||||||
|
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||||
|
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4,
|
||||||
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||||
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||||
|
/* the rest is implicitly 3 */
|
||||||
|
};
|
||||||
|
unsigned min_len;
|
||||||
|
|
||||||
|
STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3);
|
||||||
|
STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1);
|
||||||
|
|
||||||
|
if (num_used_literals >= ARRAY_LEN(min_lens))
|
||||||
|
return 3;
|
||||||
|
min_len = min_lens[num_used_literals];
|
||||||
|
/*
|
||||||
|
* With a low max_search_depth, it may be too hard to find long matches.
|
||||||
|
*/
|
||||||
|
if (max_search_depth < 16) {
|
||||||
|
if (max_search_depth < 5)
|
||||||
|
min_len = MIN(min_len, 4);
|
||||||
|
else if (max_search_depth < 10)
|
||||||
|
min_len = MIN(min_len, 5);
|
||||||
|
else
|
||||||
|
min_len = MIN(min_len, 7);
|
||||||
|
}
|
||||||
|
return min_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
static unsigned
|
||||||
|
calculate_min_match_len(const u8 *data, size_t data_len,
|
||||||
|
unsigned max_search_depth)
|
||||||
|
{
|
||||||
|
u8 used[256] = { 0 };
|
||||||
|
unsigned num_used_literals = 0;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For an initial approximation, scan the first 4 KiB of data.
|
||||||
|
* recalculate_min_match_len() will update the min_len later.
|
||||||
|
*/
|
||||||
|
data_len = MIN(data_len, 4096);
|
||||||
|
for (i = 0; i < data_len; i++)
|
||||||
|
used[data[i]] = 1;
|
||||||
|
for (i = 0; i < 256; i++)
|
||||||
|
num_used_literals += used[i];
|
||||||
|
return choose_min_match_len(num_used_literals, max_search_depth);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Recalculate the minimum match length for a block, now that we know the
|
||||||
|
* distribution of literals that are actually being used (freqs->litlen).
|
||||||
|
*/
|
||||||
|
static unsigned
|
||||||
|
recalculate_min_match_len(const struct deflate_freqs *freqs,
|
||||||
|
unsigned max_search_depth)
|
||||||
|
{
|
||||||
|
u32 literal_freq = 0;
|
||||||
|
u32 cutoff;
|
||||||
|
unsigned num_used_literals = 0;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
|
||||||
|
literal_freq += freqs->litlen[i];
|
||||||
|
|
||||||
|
cutoff = literal_freq >> 10; /* Ignore literals used very rarely */
|
||||||
|
|
||||||
|
for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
|
||||||
|
if (freqs->litlen[i] > cutoff)
|
||||||
|
num_used_literals++;
|
||||||
|
}
|
||||||
|
return choose_min_match_len(num_used_literals, max_search_depth);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This is the level 0 "compressor". It always outputs uncompressed blocks.
|
* This is the level 0 "compressor". It always outputs uncompressed blocks.
|
||||||
*/
|
*/
|
||||||
@ -2032,11 +2127,15 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
|
|||||||
const u8 * const in_block_begin = in_next;
|
const u8 * const in_block_begin = in_next;
|
||||||
const u8 * const in_max_block_end =
|
const u8 * const in_max_block_end =
|
||||||
in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
|
in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
|
||||||
|
unsigned min_len;
|
||||||
u32 litrunlen = 0;
|
u32 litrunlen = 0;
|
||||||
struct deflate_sequence *next_seq = c->p.g.sequences;
|
struct deflate_sequence *next_seq = c->p.g.sequences;
|
||||||
|
|
||||||
init_block_split_stats(&c->split_stats);
|
init_block_split_stats(&c->split_stats);
|
||||||
deflate_reset_symbol_frequencies(c);
|
deflate_reset_symbol_frequencies(c);
|
||||||
|
min_len = calculate_min_match_len(in_next,
|
||||||
|
in_max_block_end - in_next,
|
||||||
|
c->max_search_depth);
|
||||||
|
|
||||||
do {
|
do {
|
||||||
u32 length;
|
u32 length;
|
||||||
@ -2048,15 +2147,15 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
|
|||||||
&c->p.g.hc_mf,
|
&c->p.g.hc_mf,
|
||||||
&in_cur_base,
|
&in_cur_base,
|
||||||
in_next,
|
in_next,
|
||||||
DEFLATE_MIN_MATCH_LEN - 1,
|
min_len - 1,
|
||||||
max_len,
|
max_len,
|
||||||
nice_len,
|
nice_len,
|
||||||
c->max_search_depth,
|
c->max_search_depth,
|
||||||
next_hashes,
|
next_hashes,
|
||||||
&offset);
|
&offset);
|
||||||
|
|
||||||
if (length > DEFLATE_MIN_MATCH_LEN ||
|
if (length >= min_len &&
|
||||||
(length == DEFLATE_MIN_MATCH_LEN &&
|
(length > DEFLATE_MIN_MATCH_LEN ||
|
||||||
offset <= 4096)) {
|
offset <= 4096)) {
|
||||||
/* Match found. */
|
/* Match found. */
|
||||||
deflate_choose_match(c, length, offset,
|
deflate_choose_match(c, length, offset,
|
||||||
@ -2113,18 +2212,37 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
|
|||||||
const u8 * const in_block_begin = in_next;
|
const u8 * const in_block_begin = in_next;
|
||||||
const u8 * const in_max_block_end =
|
const u8 * const in_max_block_end =
|
||||||
in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
|
in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
|
||||||
|
const u8 *next_recalc_min_len =
|
||||||
|
in_next + MIN(in_end - in_next, 10000);
|
||||||
|
unsigned min_len = DEFLATE_MIN_MATCH_LEN;
|
||||||
u32 litrunlen = 0;
|
u32 litrunlen = 0;
|
||||||
struct deflate_sequence *next_seq = c->p.g.sequences;
|
struct deflate_sequence *next_seq = c->p.g.sequences;
|
||||||
|
|
||||||
init_block_split_stats(&c->split_stats);
|
init_block_split_stats(&c->split_stats);
|
||||||
deflate_reset_symbol_frequencies(c);
|
deflate_reset_symbol_frequencies(c);
|
||||||
|
|
||||||
|
min_len = calculate_min_match_len(in_next,
|
||||||
|
in_max_block_end - in_next,
|
||||||
|
c->max_search_depth);
|
||||||
do {
|
do {
|
||||||
unsigned cur_len;
|
unsigned cur_len;
|
||||||
unsigned cur_offset;
|
unsigned cur_offset;
|
||||||
unsigned next_len;
|
unsigned next_len;
|
||||||
unsigned next_offset;
|
unsigned next_offset;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Recalculate the minimum match length if it hasn't
|
||||||
|
* been done recently.
|
||||||
|
*/
|
||||||
|
if (in_next >= next_recalc_min_len) {
|
||||||
|
min_len = recalculate_min_match_len(
|
||||||
|
&c->freqs,
|
||||||
|
c->max_search_depth);
|
||||||
|
next_recalc_min_len +=
|
||||||
|
MIN(in_end - next_recalc_min_len,
|
||||||
|
in_next - in_block_begin);
|
||||||
|
}
|
||||||
|
|
||||||
/* Find the longest match at the current position. */
|
/* Find the longest match at the current position. */
|
||||||
adjust_max_and_nice_len(&max_len, &nice_len,
|
adjust_max_and_nice_len(&max_len, &nice_len,
|
||||||
in_end - in_next);
|
in_end - in_next);
|
||||||
@ -2132,13 +2250,13 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
|
|||||||
&c->p.g.hc_mf,
|
&c->p.g.hc_mf,
|
||||||
&in_cur_base,
|
&in_cur_base,
|
||||||
in_next,
|
in_next,
|
||||||
DEFLATE_MIN_MATCH_LEN - 1,
|
min_len - 1,
|
||||||
max_len,
|
max_len,
|
||||||
nice_len,
|
nice_len,
|
||||||
c->max_search_depth,
|
c->max_search_depth,
|
||||||
next_hashes,
|
next_hashes,
|
||||||
&cur_offset);
|
&cur_offset);
|
||||||
if (cur_len < DEFLATE_MIN_MATCH_LEN ||
|
if (cur_len < min_len ||
|
||||||
(cur_len == DEFLATE_MIN_MATCH_LEN &&
|
(cur_len == DEFLATE_MIN_MATCH_LEN &&
|
||||||
cur_offset > 8192)) {
|
cur_offset > 8192)) {
|
||||||
/* No match found. Choose a literal. */
|
/* No match found. Choose a literal. */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user