deflate_compress: automatically select minimum match length

In the greedy and lazy compressors, automatically increase the minimum match length from the default of 3 if the data doesn't contain many different literals. This greatly improves the compression ratio of levels 1-9 on certain types of data, such as DNA sequencing data, while not worsening the ratio on other types of data. The near-optimal compressor (used by compression levels 10-12) continues to use a minimum match length of 3, since it already did a better job at deciding when short matches are worthwhile. (The method for setting the initial costs needs improvement; later commits address that.) Resolves https://github.com/ebiggers/libdeflate/issues/57
2025-09-23 11:20:31 -04:00 · 2021-12-31 16:04:49 -06:00 · 2021-12-31 16:04:49 -06:00 · 69a7ca07fd
commit 69a7ca07fd
parent 3bc42e23d6
1 changed files with 123 additions and 5 deletions
--- a/lib/deflate_compress.c
+++ b/lib/deflate_compress.c
@ -1990,6 +1990,101 @@ adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining)
 	}
 }
 /*
 * Choose the minimum match length for the greedy and lazy parsers.
 *
 * By default the minimum match length is 3, which is the smallest length the
 * DEFLATE format allows.  However, with greedy and lazy parsing, some data
 * (e.g. DNA sequencing data) benefits greatly from a longer minimum length.
 * Typically, this is because literals are very cheap.  In general, the
 * near-optimal parser handles this case naturally, but the greedy and lazy
 * parsers need a heuristic to decide when to use short matches.
 *
 * The heuristic we use is to make the minimum match length depend on the number
 * of different literals that exist in the data.  If there are many different
 * literals, then literals will probably be expensive, so short matches will
 * probably be worthwhile.  Conversely, if not many literals are used, then
 * probably literals will be cheap and short matches won't be worthwhile.
 */
 static unsigned
 choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth)
 {
 	/*
 	 * Lookup table indexed by the number of distinct literals in use.
 	 * Fewer distinct literals map to a longer minimum match length.
 	 */
 	static const u8 min_lens[] = {
 		9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6,
 		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4,
 		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 		/* the rest is implicitly 3 */
 	};
 	unsigned table_len;
 	unsigned depth_cap;
 	STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3);
 	STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1);
 	/* Counts beyond the end of the table get the default length of 3. */
 	if (num_used_literals >= ARRAY_LEN(min_lens))
 		return 3;
 	table_len = min_lens[num_used_literals];
 	/* With a search depth of 16 or more, the table value is used as-is. */
 	if (max_search_depth >= 16)
 		return table_len;
 	/*
 	 * A shallow search may have trouble finding long matches, so cap the
 	 * minimum length; the shallower the search, the lower the cap.
 	 */
 	if (max_search_depth < 5)
 		depth_cap = 4;
 	else if (max_search_depth < 10)
 		depth_cap = 5;
 	else
 		depth_cap = 7;
 	return MIN(table_len, depth_cap);
 }
 /*
 * Make an initial choice of the minimum match length for a block, before any
 * symbol frequencies have been gathered, by counting how many distinct byte
 * values occur at the start of the data.
 *
 * @data: the input bytes to sample
 * @data_len: number of bytes available at @data; at most 4096 are examined
 * @max_search_depth: passed through unchanged to choose_min_match_len()
 *
 * Returns the value of choose_min_match_len() for the number of distinct byte
 * values seen in the sampled prefix.
 */
 static unsigned
 calculate_min_match_len(const u8 *data, size_t data_len,
 			unsigned max_search_depth)
 {
 	u8 used[256] = { 0 };
 	unsigned num_used_literals = 0;
 	size_t i; /* same type as data_len, so the loop bound compares cleanly */
 	/*
 	 * For an initial approximation, scan the first 4 KiB of data.
 	 * recalculate_min_match_len() will update the min_len later.
 	 */
 	data_len = MIN(data_len, 4096);
 	/* Flag each byte value that appears at least once in the sample. */
 	for (i = 0; i < data_len; i++)
 		used[data[i]] = 1;
 	/* Each flag is 0 or 1, so the sum is the count of distinct values. */
 	for (i = 0; i < 256; i++)
 		num_used_literals += used[i];
 	return choose_min_match_len(num_used_literals, max_search_depth);
 }
 /*
 * Recalculate the minimum match length for a block, now that we know the
 * distribution of literals that are actually being used (freqs->litlen).
 */
 static unsigned
 recalculate_min_match_len(const struct deflate_freqs *freqs,
 			  unsigned max_search_depth)
 {
 	u32 total = 0;
 	u32 threshold;
 	unsigned num_common = 0;
 	int sym;
 	/* Add up the frequencies of the first DEFLATE_NUM_LITERALS symbols. */
 	for (sym = 0; sym < DEFLATE_NUM_LITERALS; sym++)
 		total += freqs->litlen[sym];
 	/*
 	 * A literal is counted only if its frequency exceeds total >> 10, so
 	 * that literals used very rarely are ignored.
 	 */
 	threshold = total >> 10;
 	for (sym = 0; sym < DEFLATE_NUM_LITERALS; sym++)
 		num_common += (freqs->litlen[sym] > threshold);
 	return choose_min_match_len(num_common, max_search_depth);
 }
 /*
 * This is the level 0 "compressor".  It always outputs uncompressed blocks.
 */
@ -2032,11 +2127,15 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
 		const u8 * const in_block_begin = in_next;
 		const u8 * const in_max_block_end =
 			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
 		unsigned min_len;
 		u32 litrunlen = 0;
 		struct deflate_sequence *next_seq = c->p.g.sequences;
 		init_block_split_stats(&c->split_stats);
 		deflate_reset_symbol_frequencies(c);
 		min_len = calculate_min_match_len(in_next,
 						  in_max_block_end - in_next,
 						  c->max_search_depth);
 		do {
 			u32 length;
@ -2048,15 +2147,15 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
 						&c->p.g.hc_mf,
 						&in_cur_base,
 						in_next,
-						DEFLATE_MIN_MATCH_LEN - 1,
+						min_len - 1,
 						max_len,
 						nice_len,
 						c->max_search_depth,
 						next_hashes,
 						&offset);
-			if (length > DEFLATE_MIN_MATCH_LEN ||
+			if (length >= min_len &&
-			    (length == DEFLATE_MIN_MATCH_LEN &&
+			    (length > DEFLATE_MIN_MATCH_LEN ||
 			     offset <= 4096)) {
 				/* Match found. */
 				deflate_choose_match(c, length, offset,
@ -2113,18 +2212,37 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
 		const u8 * const in_block_begin = in_next;
 		const u8 * const in_max_block_end =
 			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
 		const u8 *next_recalc_min_len =
 			in_next + MIN(in_end - in_next, 10000);
 		unsigned min_len = DEFLATE_MIN_MATCH_LEN;
 		u32 litrunlen = 0;
 		struct deflate_sequence *next_seq = c->p.g.sequences;
 		init_block_split_stats(&c->split_stats);
 		deflate_reset_symbol_frequencies(c);
 		min_len = calculate_min_match_len(in_next,
 						  in_max_block_end - in_next,
 						  c->max_search_depth);
 		do {
 			unsigned cur_len;
 			unsigned cur_offset;
 			unsigned next_len;
 			unsigned next_offset;
 			/*
 			 * Recalculate the minimum match length if it hasn't
 			 * been done recently.
 			 */
 			if (in_next >= next_recalc_min_len) {
 				min_len = recalculate_min_match_len(
 						&c->freqs,
 						c->max_search_depth);
 				next_recalc_min_len +=
 					MIN(in_end - next_recalc_min_len,
 					    in_next - in_block_begin);
 			}
 			/* Find the longest match at the current position. */
 			adjust_max_and_nice_len(&max_len, &nice_len,
 						in_end - in_next);
@ -2132,13 +2250,13 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
 						&c->p.g.hc_mf,
 						&in_cur_base,
 						in_next,
-						DEFLATE_MIN_MATCH_LEN - 1,
+						min_len - 1,
 						max_len,
 						nice_len,
 						c->max_search_depth,
 						next_hashes,
 						&cur_offset);
-			if (cur_len < DEFLATE_MIN_MATCH_LEN ||
+			if (cur_len < min_len ||
 			    (cur_len == DEFLATE_MIN_MATCH_LEN &&
 			     cur_offset > 8192)) {
 				/* No match found.  Choose a literal. */