New block splitting algorithm

2025-09-13 14:26:02 -04:00 · 2016-05-21 10:33:59 -05:00 · 2016-05-21 10:33:59 -05:00 · ecdcfc600b
commit ecdcfc600b
parent c3f68e9ba7
2 changed files with 215 additions and 64 deletions
--- a/src/deflate_compress.c
+++ b/src/deflate_compress.c
@ -54,27 +54,14 @@
 #  include "bt_matchfinder.h"
 #endif

-/*
- * Number of literals+matches to output before starting new Huffman codes.
- *
- * This is just a heuristic, as there is no efficient algorithm for computing
- * optimal block splitting in general.
- *
- * Note: a lower value than defined here usually results in a slightly better
- * compression ratio, but creates more overhead in compression and
- * decompression.
- *
- * This value is not used by the near-optimal parsing algorithm, which uses
- * OPTIM_BLOCK_LENGTH instead.
- */
-#define MAX_ITEMS_PER_BLOCK	16384
+/* The minimum and maximum block lengths, in bytes of source data, which the
+ * parsing algorithms may choose.
+ *
+ * MIN_BLOCK_LENGTH is the block length below which the block splitting
+ * heuristic will not consider ending the current block; it is also used by
+ * deflate_compress_bound() to bound the number of blocks.  MAX_BLOCK_LENGTH
+ * caps how far a single block may extend and is used to size the per-block
+ * arrays.  */
+#define MIN_BLOCK_LENGTH	10000
+#define MAX_BLOCK_LENGTH	300000

 #if SUPPORT_NEAR_OPTIMAL_PARSING
 /* Constants specific to the near-optimal parsing algorithm.  */

-/* The preferred DEFLATE block length in bytes.  */
-#  define OPTIM_BLOCK_LENGTH	16384
-
 /* The maximum number of matches the matchfinder can find at a single position.
 * Since the matchfinder never finds more than one match for the same length,
 * presuming one of each possible length is sufficient for an upper bound.
@ -84,10 +71,10 @@

 /* The number of array spaces to reserve for a single block's matches.  This
 * value should be high enough so that virtually the time, all matches found in
- * OPTIM_BLOCK_LENGTH consecutive positions can fit in this array.  However,
- * this is *not* the true upper bound on the number of matches that can possibly
- * be found.  Therefore, checks for overflow are still required.  */
-#  define CACHE_LEN		((OPTIM_BLOCK_LENGTH * 8) + (MAX_MATCHES_PER_POS + 1))
+ * MAX_BLOCK_LENGTH consecutive positions can fit in this array.  However, this
+ * is *not* the true upper bound on the number of matches that can possibly be
+ * found.  Therefore, checks for overflow are still required.  */
+#  define CACHE_LEN		((MAX_BLOCK_LENGTH * 5) + (MAX_MATCHES_PER_POS + 1))

 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

@ -351,7 +338,14 @@ struct deflate_compressor {
 			/* Hash chain matchfinder  */
 			struct hc_matchfinder hc_mf;

-			struct deflate_sequence sequences[MAX_ITEMS_PER_BLOCK];
+			/* The matches and literals that the parser has chosen
+			 * for the current block.  The required length of this
+			 * array is limited by the maximum number of matches
+			 * that can ever be chosen for a single block, plus one
+			 * for the special entry at the end.  */
+			struct deflate_sequence sequences[
+				DIV_ROUND_UP(MAX_BLOCK_LENGTH,
+					     DEFLATE_MIN_MATCH_LEN) + 1];

 			u8 nonoptimal_end[0];
 		};
@ -371,8 +365,8 @@ struct deflate_compressor {

 			/* Array of structures, one per position, for running
 			 * the minimum-cost path algorithm.  */
-			struct deflate_optimum_node optimum[OPTIM_BLOCK_LENGTH +
-							    1 + DEFLATE_MAX_MATCH_LEN];
+			struct deflate_optimum_node optimum[MAX_BLOCK_LENGTH +
+							    3 * DEFLATE_MAX_MATCH_LEN];

 			/* The current cost model being used.  */
 			struct deflate_costs costs;
@ -1491,7 +1485,7 @@ deflate_write_end_of_block(struct deflate_output_bitstream *os,
 static void
 deflate_write_block(struct deflate_compressor * restrict c,
 		    struct deflate_output_bitstream * restrict os,
-		    const u8 * restrict block_begin, s32 items_remaining,
+		    const u8 * restrict block_begin, u32 block_length,
 		    bool is_final_block)
 {
 	struct deflate_codes *codes;
@ -1501,7 +1495,7 @@ deflate_write_block(struct deflate_compressor * restrict c,
 	/* Account for end-of-block symbol  */
 	c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;

-	if (items_remaining < MAX_ITEMS_PER_BLOCK - 100) {
+	if (block_length >= 1000) {
 		/* Use custom ("dynamic") Huffman codes.  */
 		deflate_write_block_header(os, is_final_block,
 					   DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN);
@ -1577,6 +1571,131 @@ deflate_finish_sequence(struct deflate_sequence *seq, unsigned litrunlen)
 	seq->length = 0;
 }

+/******************************************************************************/
+
+/*
+ * Block splitting algorithm.  The problem is to decide when it is worthwhile to
+ * start a new block with new Huffman codes.  There is a theoretically optimal
+ * solution: recursively consider every possible block split, considering the
+ * exact cost of each block, and choose the minimum cost approach.  But this is
+ * far too slow.  Instead, as an approximation, we can count symbols and after
+ * every N symbols, compare the expected distribution of symbols based on the
+ * previous data with the actual distribution.  If they differ "by enough", then
+ * start a new block.
+ *
+ * As an optimization and heuristic, we don't distinguish between every symbol
+ * but rather we combine many symbols into a single "observation type".  For
+ * literals we only look at the high bits and low bits, and for matches we only
+ * look at whether the match is long or not.  The assumption is that for typical
+ * "real" data, places that are good block boundaries will tend to be noticeable
+ * based only on changes in these aggregate frequencies, without looking for
+ * subtle differences in individual symbols.  For example, a change from ASCII
+ * bytes to non-ASCII bytes, or from few matches (generally less compressible)
+ * to many matches (generally more compressible), would be easily noticed based
+ * on the aggregates.
+ *
+ * For determining whether the frequency distributions are "different enough" to
+ * start a new block, the simple heuristic of splitting when the sum of absolute
+ * differences exceeds a constant seems to be good enough.  We also add a number
+ * proportional to the block size so that the algorithm is more likely to end
+ * large blocks than small blocks.  This reflects the general expectation that
+ * it will become increasingly beneficial to start a new block as the current
+ * block grows larger.
+ *
+ * Finally, for an approximation, it is not strictly necessary that the exact
+ * symbols being used are considered.  With "near-optimal parsing", for example,
+ * the actual symbols that will be used are unknown until after the block
+ * boundary is chosen and the block has been optimized.  Since the final choices
+ * cannot be used, we can use preliminary "greedy" choices instead.
+ */
+
+#define NUM_LITERAL_OBSERVATION_TYPES 8	/* literal types, see observe_literal() */
+#define NUM_MATCH_OBSERVATION_TYPES 2	/* "short match" and "long match" */
+#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
+struct block_split_stats {	/* observation counters for the current block */
+	u32 new_observations[NUM_OBSERVATION_TYPES];	/* per-type counts since the last split check */
+	u32 observations[NUM_OBSERVATION_TYPES];	/* per-type counts folded in by earlier split checks */
+	u32 num_new_observations;	/* sum of new_observations[] */
+	u32 num_observations;	/* sum of observations[] */
+};
+
+/* Initialize the block split statistics when starting a new block.  Every
+ * per-type counter and both totals in 'stats' are reset to zero.  */
+static void
+init_block_split_stats(struct block_split_stats *stats)
+{
+	for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+		stats->new_observations[i] = 0;
+		stats->observations[i] = 0;
+	}
+	stats->num_new_observations = 0;
+	stats->num_observations = 0;
+}
+
+/* Literal observation.  Heuristic: use the top 2 bits and low 1 bit of the
+ * literal, for 8 possible literal observation types.
+ *
+ * Bits 7 and 6 of 'lit' become bits 2 and 1 of the observation type, and bit 0
+ * of 'lit' becomes bit 0, so the type is always in the range
+ * [0, NUM_LITERAL_OBSERVATION_TYPES).  The observation is counted as "new",
+ * i.e. it is not part of the block's accumulated totals yet.  */
+static forceinline void
+observe_literal(struct block_split_stats *stats, u8 lit)
+{
+	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
+	stats->num_new_observations++;
+}
+
+/* Match observation.  Heuristic: use one observation type for "short match" and
+ * one observation type for "long match".
+ *
+ * A match is counted as "long" when 'length' is at least 9.  The two match
+ * types occupy the slots immediately after the literal types in the
+ * observation arrays.  As with literals, the observation is counted as
+ * "new".  */
+static forceinline void
+observe_match(struct block_split_stats *stats, unsigned length)
+{
+	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
+	stats->num_new_observations++;
+}
+
+static bool
+do_end_block_check(struct block_split_stats *stats, u32 block_size)
+{
+	if (stats->num_observations > 0) {
+
+		/* There are earlier observations in this block to compare
+		 * against.  For each observation type, measure how far the
+		 * count seen in the new observations is from the count that
+		 * the earlier observations would predict, and sum these
+		 * absolute differences over all types.
+		 *
+		 * Note: to avoid slow divisions, we do not divide by
+		 * 'num_observations', but rather do all math with the numbers
+		 * multiplied by 'num_observations'.  */
+		u32 total_delta = 0;
+		for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+			u32 expected = stats->observations[i] * stats->num_new_observations;
+			u32 actual = stats->new_observations[i] * stats->num_observations;
+			u32 delta = (actual > expected) ? actual - expected :
+							  expected - actual;
+			total_delta += delta;
+		}
+
+		/* Ready to end the block?  Return true, leaving the statistics
+		 * untouched, when the summed difference plus a bonus of
+		 * 'block_size >> 12' (which grows with the block length)
+		 * reaches a threshold of 200.  The bonus and the threshold are
+		 * multiplied by 'num_observations' to match the scaling of
+		 * 'total_delta' described above.  */
+		if (total_delta + (block_size >> 12) * stats->num_observations >=
+		    200 * stats->num_observations)
+			return true;
+	}
+	/* Not ending the block: fold the new observations into the totals. */
+	for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+		stats->num_observations += stats->new_observations[i];
+		stats->observations[i] += stats->new_observations[i];
+		stats->new_observations[i] = 0;
+	}
+	stats->num_new_observations = 0;
+	return false;
+}
+
+static forceinline bool
+should_end_block(struct block_split_stats *stats,
+		 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
+{
+	/* Ready to check block split statistics?  Not yet if fewer than 512
+	 * new observations have accumulated, if the block so far (from
+	 * 'in_block_begin' to 'in_next') is shorter than MIN_BLOCK_LENGTH
+	 * bytes, or if fewer than 16384 bytes of input remain between
+	 * 'in_next' and 'in_end'.  In those cases the block is not ended and
+	 * the statistics are left as they are.  Otherwise the decision is
+	 * made by do_end_block_check(), which is given the current block
+	 * length.  */
+	if (stats->num_new_observations < 512 ||
+	    in_next - in_block_begin < MIN_BLOCK_LENGTH ||
+	    in_end - in_next < 16384)
+		return false;
+
+	return do_end_block_check(stats, in_next - in_block_begin);
+}
+
+/******************************************************************************/
+
 /*
 * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
 */
@ -1601,9 +1720,12 @@ deflate_compress_greedy(struct deflate_compressor * restrict c,
 		/* Starting a new DEFLATE block.  */

 		const u8 * const in_block_begin = in_next;
+		const u8 * const in_max_block_end = in_next + MIN(in_end - in_next, MAX_BLOCK_LENGTH);
 		u32 litrunlen = 0;
 		struct deflate_sequence *next_seq = c->sequences;
-		s32 items_remaining = MAX_ITEMS_PER_BLOCK;
+		struct block_split_stats split_stats;
+
+		init_block_split_stats(&split_stats);

 		do {
 			u32 length;
@ -1630,6 +1752,7 @@ deflate_compress_greedy(struct deflate_compressor * restrict c,
 				/* Match found.  */
 				deflate_choose_match(c, length, offset,
 						     &litrunlen, &next_seq);
+				observe_match(&split_stats, length);
 				in_next = hc_matchfinder_skip_positions(&c->hc_mf,
 									&in_cur_base,
 									in_next + 1,
@ -1638,15 +1761,18 @@ deflate_compress_greedy(struct deflate_compressor * restrict c,
 									next_hashes);
 			} else {
 				/* No match found.  */
-				deflate_choose_literal(c, *in_next++, &litrunlen);
+				deflate_choose_literal(c, *in_next, &litrunlen);
+				observe_literal(&split_stats, *in_next);
+				in_next++;
 			}

 			/* Check if it's time to output another block.  */
-		} while (in_next != in_end && --items_remaining > 0);
+		} while (in_next < in_max_block_end &&
+			 !should_end_block(&split_stats, in_block_begin, in_next, in_end));

 		deflate_finish_sequence(next_seq, litrunlen);
 		deflate_write_block(c, &os, in_block_begin,
-				    items_remaining, in_next == in_end);
+				    in_next - in_block_begin, in_next == in_end);
 	} while (in_next != in_end);

 	return deflate_flush_output(&os);
@ -1678,9 +1804,12 @@ deflate_compress_lazy(struct deflate_compressor * restrict c,
 		/* Starting a new DEFLATE block.  */

 		const u8 * const in_block_begin = in_next;
+		const u8 * const in_max_block_end = in_next + MIN(in_end - in_next, MAX_BLOCK_LENGTH);
 		u32 litrunlen = 0;
 		struct deflate_sequence *next_seq = c->sequences;
-		s32 items_remaining = MAX_ITEMS_PER_BLOCK;
+		struct block_split_stats split_stats;
+
+		init_block_split_stats(&split_stats);

 		do {
 			unsigned cur_len;
@ -1708,10 +1837,13 @@ deflate_compress_lazy(struct deflate_compressor * restrict c,
 			if (cur_len < DEFLATE_MIN_MATCH_LEN) {
 				/* No match found.  Choose a literal.  */
 				deflate_choose_literal(c, *(in_next - 1), &litrunlen);
+				observe_literal(&split_stats, *(in_next - 1));
 				continue;
 			}

 		have_cur_match:
+			observe_match(&split_stats, cur_len);
+
 			/* We have a match at the current position.  */

 			/* If the current match is very long, choose it
@ -1764,7 +1896,6 @@ deflate_compress_lazy(struct deflate_compressor * restrict c,
 				 * Output a literal.  Then the next match
 				 * becomes the current match.  */
 				deflate_choose_literal(c, *(in_next - 2), &litrunlen);
-				items_remaining--;
 				cur_len = next_len;
 				cur_offset = next_offset;
 				goto have_cur_match;
@ -1782,11 +1913,12 @@ deflate_compress_lazy(struct deflate_compressor * restrict c,
 								next_hashes);

 			/* Check if it's time to output another block.  */
-		} while (in_next != in_end && --items_remaining > 0);
+		} while (in_next < in_max_block_end &&
+			 !should_end_block(&split_stats, in_block_begin, in_next, in_end));

 		deflate_finish_sequence(next_seq, litrunlen);
 		deflate_write_block(c, &os, in_block_begin,
-				    items_remaining, in_next == in_end);
+				    in_next - in_block_begin, in_next == in_end);

 	} while (in_next != in_end);

@ -2012,6 +2144,11 @@ deflate_optimize_and_write_block(struct deflate_compressor *c,
 	struct deflate_optimum_node *end_node = c->optimum + block_len;
 	unsigned num_passes_remaining = c->num_optim_passes;

+	/* Force the block to really end at 'end_node', even if some matches
+	 * extend beyond it.  */
+	for (int i = 1; i < DEFLATE_MAX_MATCH_LEN; i++)
+		end_node[i].cost_to_end = 0x80000000;
+
 	do {
 		/*
 		 * Beginning a new optimization pass and finding a new
@ -2151,18 +2288,24 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c,
 		struct lz_match *cache_ptr = c->cached_matches;
 		struct lz_match * const cache_end = &c->cached_matches[CACHE_LEN - (MAX_MATCHES_PER_POS + 1)];
 		const u8 * const in_block_begin = in_next;
-		const u8 * const in_block_end = in_next + MIN(in_end - in_next, OPTIM_BLOCK_LENGTH);
+		const u8 * const in_max_block_end = in_next + MIN(in_end - in_next, MAX_BLOCK_LENGTH);
+		struct block_split_stats split_stats;
+		const u8 *next_observation = in_next;

-		/* Find all match possibilities in this block.  */
+		init_block_split_stats(&split_stats);
+
+		/*
+		 * Find matches until we decide to end the block.  We end the
+		 * block if any of the following is true:
+		 *
+		 * (1) Maximum block size has been reached
+		 * (2) Match cache may overflow.
+		 * (3) Block split heuristic says to split now.
+		 */
 		do {
 			struct lz_match *matches;
 			unsigned best_len;

-			/* Force the block to end if the match cache may
-			 * overflow.  This case is very unlikely.  */
-			if (unlikely(cache_ptr > cache_end))
-				break;
-
 			/* Slide the window forward if needed.  */
 			if (in_next == in_next_slide) {
 				bt_matchfinder_slide_window(&c->bt_mf);
@ -2208,6 +2351,17 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c,
 								       &best_len,
 								       matches);
 			}
+
+			if (in_next >= next_observation) {
+				if (best_len >= 4) {
+					observe_match(&split_stats, best_len);
+					next_observation = in_next + best_len;
+				} else {
+					observe_literal(&split_stats, *in_next);
+					next_observation = in_next + 1;
+				}
+			}
+
 			cache_ptr->length = cache_ptr - matches;
 			cache_ptr->offset = *in_next;
 			in_next++;
@ -2224,15 +2378,8 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c,
 			 * ratio very much.  If there's a long match, then the
 			 * data must be highly compressible, so it doesn't
 			 * matter much what we do.
-			 *
-			 * We also trigger this same case when approaching the
-			 * desired end of the block.  This forces the block to
-			 * reach a "stopping point" where there are no matches
-			 * extending to later positions.  (XXX: this behavior is
-			 * non-optimal and should be improved.)
 			 */
-			if (best_len >= DEFLATE_MIN_MATCH_LEN &&
-			    best_len >= MIN(nice_len, in_block_end - in_next)) {
+			if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) {
 				--best_len;
 				do {
 					if (in_next == in_next_slide) {
@ -2260,7 +2407,9 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c,
 					cache_ptr++;
 				} while (--best_len);
 			}
-		} while (in_next < in_block_end);
+		} while (in_next < in_max_block_end &&
+			 cache_ptr <= cache_end &&
+			 !should_end_block(&split_stats, in_block_begin, in_next, in_end));

 		/* All the matches for this block have been cached.  Now compute
 		 * a near-optimal sequence of literals and matches, and output
@ -2354,17 +2503,17 @@ deflate_alloc_compressor(unsigned int compression_level)
 		break;
 	case 5:
 		c->impl = deflate_compress_lazy;
-		c->max_search_depth = 28;
-		c->nice_match_length = 28;
+		c->max_search_depth = 20;
+		c->nice_match_length = 30;
 		break;
 	case 6:
 		c->impl = deflate_compress_lazy;
-		c->max_search_depth = 70;
-		c->nice_match_length = 70;
+		c->max_search_depth = 40;
+		c->nice_match_length = 65;
 		break;
 	case 7:
 		c->impl = deflate_compress_lazy;
-		c->max_search_depth = 130;
+		c->max_search_depth = 100;
 		c->nice_match_length = 130;
 		break;
 #if SUPPORT_NEAR_OPTIMAL_PARSING
@ -2376,14 +2525,14 @@ deflate_alloc_compressor(unsigned int compression_level)
 		break;
 	case 9:
 		c->impl = deflate_compress_near_optimal;
-		c->max_search_depth = 18;
-		c->nice_match_length = 24;
+		c->max_search_depth = 16;
+		c->nice_match_length = 26;
 		c->num_optim_passes = 2;
 		break;
 	case 10:
 		c->impl = deflate_compress_near_optimal;
-		c->max_search_depth = 36;
-		c->nice_match_length = 48;
+		c->max_search_depth = 30;
+		c->nice_match_length = 50;
 		c->num_optim_passes = 2;
 		break;
 	case 11:
@ -2401,12 +2550,12 @@ deflate_alloc_compressor(unsigned int compression_level)
 #else
 	case 8:
 		c->impl = deflate_compress_lazy;
-		c->max_search_depth = 200;
+		c->max_search_depth = 150;
 		c->nice_match_length = 200;
 		break;
 	case 9:
 		c->impl = deflate_compress_lazy;
-		c->max_search_depth = 300;
+		c->max_search_depth = 200;
 		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
 		break;
 #endif
@ -2457,10 +2606,9 @@ deflate_get_compression_level(struct deflate_compressor *c)
 LIBEXPORT size_t
 deflate_compress_bound(struct deflate_compressor *c, size_t in_nbytes)
 {
-	size_t max_num_blocks =
-		(in_nbytes + MAX_ITEMS_PER_BLOCK - 1) / MAX_ITEMS_PER_BLOCK;
+	size_t max_num_blocks = DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH);
 	if (max_num_blocks == 0)
 		max_num_blocks++;
-	return MIN_OUTPUT_SIZE + (in_nbytes * 9 + 7) / 8 +
+	return MIN_OUTPUT_SIZE + DIV_ROUND_UP(in_nbytes * 9, 8) +
 		max_num_blocks * 200;
 }
--- a/src/util.h
+++ b/src/util.h
@ -48,3 +48,6 @@ typedef size_t machine_word_t;
 /* MAX() - calculate the maximum of two variables.  Arguments may be evaluted
 * multiple times.  */
 #define MAX(a, b)	((a) >= (b) ? (a) : (b))
+
+/* Calculate 'n / d', but round up instead of down.  Argument 'd' is evaluated
+ * twice.  The result is only a true round-up for non-negative integer 'n' and
+ * positive integer 'd' where 'n + d - 1' does not overflow.  */
+#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))