deflate_compress: automatically select minimum match length

In the greedy and lazy compressors, automatically increase the minimum
match length from the default of 3 if the data doesn't contain many
different literals.  This greatly improves the compression ratio of
levels 1-9 on certain types of data, such as DNA sequencing data, while
not worsening the ratio on other types of data.

The near-optimal compressor (used by compression levels 10-12) continues
to use a minimum match length of 3, since it already did a better job at
deciding when short matches are worthwhile.  (The method for setting the
initial costs needs improvement; later commits address that.)

Resolves https://github.com/ebiggers/libdeflate/issues/57
This commit is contained in:
Eric Biggers 2021-12-31 16:04:49 -06:00
parent 3bc42e23d6
commit 69a7ca07fd

View File

@ -1990,6 +1990,101 @@ adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining)
}
}
/*
 * Pick the minimum match length to be used by the greedy and lazy parsers.
 *
 * DEFLATE permits matches as short as 3 bytes, and that is the default.  With
 * greedy or lazy parsing, though, some inputs (DNA sequencing data is one
 * example) compress much better when short matches are rejected, typically
 * because literals are very cheap for such data.  The near-optimal parser
 * generally copes with this by itself; the greedy and lazy parsers need a
 * heuristic instead.
 *
 * The heuristic keys off the number of distinct literals in the data
 * (num_used_literals).  Many distinct literals suggests expensive literals, so
 * short matches are probably worth taking.  Few distinct literals suggests
 * cheap literals, so short matches probably are not worth taking.
 *
 * max_search_depth additionally caps the result: with a shallow search, long
 * matches may be too hard to find, so the minimum length is limited to 4, 5 or
 * 7 for depths below 5, 10 and 16 respectively.
 *
 * Returns the selected minimum match length, which is 3 whenever
 * num_used_literals is past the end of the lookup table.
 */
static unsigned
choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth)
{
	/* Indexed by num_used_literals; gives the uncapped minimum length. */
	static const u8 len_by_literal_count[] = {
		9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6,
		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4,
		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
		/* every count beyond this point maps to 3 */
	};
	unsigned chosen;
	unsigned depth_cap;

	STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3);
	STATIC_ASSERT(ARRAY_LEN(len_by_literal_count) <=
		      DEFLATE_NUM_LITERALS + 1);

	/* Counts not covered by the table get the implicit value of 3. */
	if (num_used_literals >= ARRAY_LEN(len_by_literal_count))
		return 3;

	chosen = len_by_literal_count[num_used_literals];

	/* A deep enough search needs no further restriction. */
	if (max_search_depth >= 16)
		return chosen;

	/*
	 * With a shallow search it may be too hard to find long matches, so
	 * bound the minimum length according to how shallow the search is.
	 */
	if (max_search_depth < 5)
		depth_cap = 4;
	else if (max_search_depth < 10)
		depth_cap = 5;
	else
		depth_cap = 7;

	return MIN(chosen, depth_cap);
}
/*
 * Make an initial estimate of the minimum match length for a block by counting
 * how many distinct byte values occur at the start of the input.
 *
 * data points to the input to examine and data_len is the number of bytes
 * available there; no more than the first 4096 bytes are inspected.
 * max_search_depth is passed through unchanged to choose_min_match_len().
 *
 * Returns whatever choose_min_match_len() selects for the distinct-byte count.
 */
static unsigned
calculate_min_match_len(const u8 *data, size_t data_len,
			unsigned max_search_depth)
{
	u8 used[256] = { 0 };
	unsigned num_used_literals = 0;
	size_t i;	/* same type as data_len: no signed/unsigned comparison */

	/*
	 * For an initial approximation, scan only the first 4 KiB of data.
	 * A caller may refine the result later with
	 * recalculate_min_match_len().
	 */
	data_len = MIN(data_len, 4096);

	/* Flag each byte value that appears in the scanned prefix. */
	for (i = 0; i < data_len; i++)
		used[data[i]] = 1;

	/* Each flag is 0 or 1, so summing them counts the distinct values. */
	for (i = 0; i < 256; i++)
		num_used_literals += used[i];

	return choose_min_match_len(num_used_literals, max_search_depth);
}
/*
 * Re-derive the minimum match length for a block from the literal frequencies
 * recorded in freqs->litlen, i.e. from the literals that are actually being
 * used rather than from a raw scan of the input.
 *
 * A literal is counted as "used" only if its frequency is strictly greater
 * than 1/1024 of the total literal frequency (rounded down), so literals that
 * are absent or occur only very rarely do not influence the result.
 *
 * Returns whatever choose_min_match_len() selects for that count and the given
 * max_search_depth.
 */
static unsigned
recalculate_min_match_len(const struct deflate_freqs *freqs,
			  unsigned max_search_depth)
{
	u32 total_literals = 0;
	u32 rare_threshold;
	unsigned num_common_literals = 0;
	int sym;

	/* First pass: total number of literal occurrences. */
	for (sym = 0; sym < DEFLATE_NUM_LITERALS; sym++)
		total_literals += freqs->litlen[sym];

	/* Literals at or below this frequency are treated as unused. */
	rare_threshold = total_literals >> 10;

	/* Second pass: count the literals that clear the threshold. */
	for (sym = 0; sym < DEFLATE_NUM_LITERALS; sym++)
		num_common_literals += (freqs->litlen[sym] > rare_threshold);

	return choose_min_match_len(num_common_literals, max_search_depth);
}
/*
* This is the level 0 "compressor". It always outputs uncompressed blocks.
*/
@ -2032,11 +2127,15 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
const u8 * const in_block_begin = in_next;
const u8 * const in_max_block_end =
in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
unsigned min_len;
u32 litrunlen = 0;
struct deflate_sequence *next_seq = c->p.g.sequences;
init_block_split_stats(&c->split_stats);
deflate_reset_symbol_frequencies(c);
min_len = calculate_min_match_len(in_next,
in_max_block_end - in_next,
c->max_search_depth);
do {
u32 length;
@ -2048,15 +2147,15 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
&c->p.g.hc_mf,
&in_cur_base,
in_next,
DEFLATE_MIN_MATCH_LEN - 1,
min_len - 1,
max_len,
nice_len,
c->max_search_depth,
next_hashes,
&offset);
if (length > DEFLATE_MIN_MATCH_LEN ||
(length == DEFLATE_MIN_MATCH_LEN &&
if (length >= min_len &&
(length > DEFLATE_MIN_MATCH_LEN ||
offset <= 4096)) {
/* Match found. */
deflate_choose_match(c, length, offset,
@ -2113,18 +2212,37 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
const u8 * const in_block_begin = in_next;
const u8 * const in_max_block_end =
in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
const u8 *next_recalc_min_len =
in_next + MIN(in_end - in_next, 10000);
unsigned min_len = DEFLATE_MIN_MATCH_LEN;
u32 litrunlen = 0;
struct deflate_sequence *next_seq = c->p.g.sequences;
init_block_split_stats(&c->split_stats);
deflate_reset_symbol_frequencies(c);
min_len = calculate_min_match_len(in_next,
in_max_block_end - in_next,
c->max_search_depth);
do {
unsigned cur_len;
unsigned cur_offset;
unsigned next_len;
unsigned next_offset;
/*
* Recalculate the minimum match length if it hasn't
* been done recently.
*/
if (in_next >= next_recalc_min_len) {
min_len = recalculate_min_match_len(
&c->freqs,
c->max_search_depth);
next_recalc_min_len +=
MIN(in_end - next_recalc_min_len,
in_next - in_block_begin);
}
/* Find the longest match at the current position. */
adjust_max_and_nice_len(&max_len, &nice_len,
in_end - in_next);
@ -2132,13 +2250,13 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
&c->p.g.hc_mf,
&in_cur_base,
in_next,
DEFLATE_MIN_MATCH_LEN - 1,
min_len - 1,
max_len,
nice_len,
c->max_search_depth,
next_hashes,
&cur_offset);
if (cur_len < DEFLATE_MIN_MATCH_LEN ||
if (cur_len < min_len ||
(cur_len == DEFLATE_MIN_MATCH_LEN &&
cur_offset > 8192)) {
/* No match found. Choose a literal. */