From f649a4b8db1df8b0e26242e92361376d4a729f42 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers3@gmail.com>
Date: Sat, 11 Jun 2016 15:33:27 -0500
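Subject: [PATCH] Compressor updates

bt_matchfinder:
- bt_matchfinder_advance_one_byte() now always passes max_len to lz_extend().
- bt_matchfinder_skip_position() drops its max_len parameter and passes
  nice_len as the maximum length instead.

deflate_compress:
- Rename MAX_BLOCK_LENGTH to SOFT_MAX_BLOCK_LENGTH, since a block may end
  slightly past this limit when a match extends beyond it.
- Add NUM_OBSERVATIONS_PER_BLOCK_CHECK in place of the hard-coded 512 in the
  block splitting heuristic.
- should_end_block() now refuses to split when fewer than MIN_BLOCK_LENGTH
  bytes of input remain (previously 16384).
- Split deflate_optimize_and_write_block() into deflate_find_min_cost_path()
  and deflate_optimize_block(); the caller now calls deflate_flush_block()
  itself.
- deflate_tally_item_list() takes the block length instead of the end node.
- Rename deflate_set_costs() to deflate_set_costs_from_codes().
- Say "block length" rather than "block size" in comments and identifiers.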
---
 lib/bt_matchfinder.h   |   6 +-
 lib/deflate_compress.c | 309 ++++++++++++++++++++++-------------------
 2 files changed, 167 insertions(+), 148 deletions(-)

diff --git a/lib/bt_matchfinder.h b/lib/bt_matchfinder.h
index 7c1217f..28ef228 100644
--- a/lib/bt_matchfinder.h
+++ b/lib/bt_matchfinder.h
@@ -211,8 +211,7 @@ bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
 		matchptr = &in_base[cur_node];
 
 		if (matchptr[len] == in_next[len]) {
-			len = lz_extend(in_next, matchptr, len + 1,
-					(record_matches ? max_len : nice_len));
+			len = lz_extend(in_next, matchptr, len + 1, max_len);
 			if (!record_matches || len > best_len) {
 				if (record_matches) {
 					best_len = len;
@@ -325,7 +324,6 @@ static forceinline void
 bt_matchfinder_skip_position(struct bt_matchfinder *mf,
 			     const u8 *in_base,
 			     ptrdiff_t cur_pos,
-			     u32 max_len,
 			     u32 nice_len,
 			     u32 max_search_depth,
 			     u32 next_hashes[2])
@@ -334,7 +332,7 @@ bt_matchfinder_skip_position(struct bt_matchfinder *mf,
 	bt_matchfinder_advance_one_byte(mf,
 					in_base,
 					cur_pos,
-					max_len,
+					nice_len,
 					nice_len,
 					max_search_depth,
 					next_hashes,
diff --git a/lib/deflate_compress.c b/lib/deflate_compress.c
index 3709167..fa28b0f 100644
--- a/lib/deflate_compress.c
+++ b/lib/deflate_compress.c
@@ -51,12 +51,24 @@
 #endif
 
 /*
- * The minimum and maximum block lengths, in bytes of source data, which the
- * parsing algorithms may choose.  Caveat: due to implementation details, the
- * actual maximum will be slightly higher than the number defined below.
+ * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
+ * except if the last block has to be shorter.
  */
 #define MIN_BLOCK_LENGTH	10000
-#define MAX_BLOCK_LENGTH	300000
+
+/*
+ * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
+ * the final length might be slightly longer due to matches extending beyond
+ * this limit.
+ */
+#define SOFT_MAX_BLOCK_LENGTH	300000
+
+/*
+ * The number of observed matches or literals that represents sufficient data to
+ * decide whether the current block should be terminated or not.
+ */
+#define NUM_OBSERVATIONS_PER_BLOCK_CHECK       512
+
 
 #if SUPPORT_NEAR_OPTIMAL_PARSING
 /* Constants specific to the near-optimal parsing algorithm */
@@ -77,7 +89,7 @@
  * However, fallback behavior (immediately terminating the block) on cache
  * overflow is still required.
  */
-#  define CACHE_LENGTH      (MAX_BLOCK_LENGTH * 5)
+#  define CACHE_LENGTH      (SOFT_MAX_BLOCK_LENGTH * 5)
 
 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
 
@@ -85,7 +97,7 @@
  * These are the compressor-side limits on the codeword lengths for each Huffman
  * code.  To make outputting bits slightly faster, some of these limits are
  * lower than the limits defined by the DEFLATE format.  This does not
- * significantly affect the compression ratio, at least for the block sizes we
+ * significantly affect the compression ratio, at least for the block lengths we
  * use.
  */
 #define MAX_LITLEN_CODEWORD_LEN		14
@@ -365,7 +377,7 @@ struct deflate_compressor {
 			 * that can ever be chosen for a single block, plus one
 			 * for the special entry at the end.  */
 			struct deflate_sequence sequences[
-				DIV_ROUND_UP(MAX_BLOCK_LENGTH,
+				DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
 					     DEFLATE_MIN_MATCH_LEN) + 1];
 		} g; /* (g)reedy */
 
@@ -411,11 +423,12 @@ struct deflate_compressor {
 			 * This array must be large enough to accommodate the
 			 * worst-case number of nodes, which occurs if we find a
 			 * match of length DEFLATE_MAX_MATCH_LEN at position
-			 * MAX_BLOCK_LENGTH - 1, producing a block of length
-			 * MAX_BLOCK_LENGTH - 1 + DEFLATE_MAX_MATCH_LEN.  Add
-			 * one for the end-of-block node.
+			 * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
+			 * length SOFT_MAX_BLOCK_LENGTH - 1 +
+			 * DEFLATE_MAX_MATCH_LEN.  Add one for the end-of-block
+			 * node.
 			 */
-			struct deflate_optimum_node optimum_nodes[MAX_BLOCK_LENGTH - 1 +
+			struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
 								  DEFLATE_MAX_MATCH_LEN + 1];
 
 			/* The current cost model being used.  */
@@ -1829,10 +1842,10 @@ deflate_finish_sequence(struct deflate_sequence *seq, unsigned litrunlen)
  * For determining whether the frequency distributions are "different enough" to
  * start a new block, the simply heuristic of splitting when the sum of absolute
  * differences exceeds a constant seems to be good enough.  We also add a number
- * proportional to the block size so that the algorithm is more likely to end
- * large blocks than small blocks.  This reflects the general expectation that
- * it will become increasingly beneficial to start a new block as the current
- * blocks grows larger.
+ * proportional to the block length so that the algorithm is more likely to end
+ * long blocks than short blocks.  This reflects the general expectation that it
+ * will become increasingly beneficial to start a new block as the current
+ * block grows longer.
  *
  * Finally, for an approximation, it is not strictly necessary that the exact
  * symbols being used are considered.  With "near-optimal parsing", for example,
@@ -1874,7 +1887,7 @@ observe_match(struct block_split_stats *stats, unsigned length)
 }
 
 static bool
-do_end_block_check(struct block_split_stats *stats, u32 block_size)
+do_end_block_check(struct block_split_stats *stats, u32 block_length)
 {
 	int i;
 
@@ -1893,8 +1906,8 @@ do_end_block_check(struct block_split_stats *stats, u32 block_size)
 		}
 
 		/* Ready to end the block? */
-		if (total_delta + (block_size >> 12) * stats->num_observations >=
-		    200 * stats->num_observations)
+		if (total_delta + (block_length / 4096) * stats->num_observations >=
+		    NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
 			return true;
 	}
 
@@ -1912,9 +1925,9 @@ should_end_block(struct block_split_stats *stats,
 		 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
 {
 	/* Ready to check block split statistics? */
-	if (stats->num_new_observations < 512 ||
+	if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
 	    in_next - in_block_begin < MIN_BLOCK_LENGTH ||
-	    in_end - in_next < 16384)
+	    in_end - in_next < MIN_BLOCK_LENGTH)
 		return false;
 
 	return do_end_block_check(stats, in_next - in_block_begin);
@@ -1945,7 +1958,8 @@ deflate_compress_greedy(struct deflate_compressor * restrict c,
 		/* Starting a new DEFLATE block.  */
 
 		const u8 * const in_block_begin = in_next;
-		const u8 * const in_max_block_end = in_next + MIN(in_end - in_next, MAX_BLOCK_LENGTH);
+		const u8 * const in_max_block_end =
+			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
 		u32 litrunlen = 0;
 		struct deflate_sequence *next_seq = c->p.g.sequences;
 
@@ -2029,7 +2043,8 @@ deflate_compress_lazy(struct deflate_compressor * restrict c,
 		/* Starting a new DEFLATE block.  */
 
 		const u8 * const in_block_begin = in_next;
-		const u8 * const in_max_block_end = in_next + MIN(in_end - in_next, MAX_BLOCK_LENGTH);
+		const u8 * const in_max_block_end =
+			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
 		u32 litrunlen = 0;
 		struct deflate_sequence *next_seq = c->p.g.sequences;
 
@@ -2155,13 +2170,13 @@ deflate_compress_lazy(struct deflate_compressor * restrict c,
 /*
  * Follow the minimum-cost path in the graph of possible match/literal choices
  * for the current block and compute the frequencies of the Huffman symbols that
- * are needed to output those matches and literals.
+ * would be needed to output those matches and literals.
  */
 static void
-deflate_tally_item_list(struct deflate_compressor *c,
-			struct deflate_optimum_node *end_node)
+deflate_tally_item_list(struct deflate_compressor *c, u32 block_length)
 {
 	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
+	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
 	do {
 		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
 		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
@@ -2180,7 +2195,8 @@ deflate_tally_item_list(struct deflate_compressor *c,
 
 /* Set the current cost model from the codeword lengths specified in @lens.  */
 static void
-deflate_set_costs(struct deflate_compressor *c, const struct deflate_lens *lens)
+deflate_set_costs_from_codes(struct deflate_compressor *c,
+			     const struct deflate_lens *lens)
 {
 	unsigned i;
 
@@ -2232,10 +2248,10 @@ deflate_default_offset_slot_cost(unsigned offset_slot)
 }
 
 /*
- * Set default Huffman symbol costs for the first optimization pass.
+ * Set default symbol costs for the first block's first optimization pass.
  *
- * It works well to assume that each Huffman symbol is equally probable.  This
- * results in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 <<
+ * It works well to assume that each symbol is equally probable.  This results
+ * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 <<
  * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding
  * alphabet.  However, we intentionally bias the parse towards matches rather
  * than literals by using a slightly lower default cost for length symbols than
@@ -2297,120 +2313,130 @@ deflate_adjust_costs(struct deflate_compressor *c)
 				    deflate_default_offset_slot_cost(i));
 }
 
+/*
+ * Find the minimum-cost path through the graph of possible match/literal
+ * choices for this block.
+ *
+ * The path runs from 'c->p.n.optimum_nodes[0]', which represents the node at
+ * the beginning of the block, to 'c->p.n.optimum_nodes[block_length]', which
+ * represents the node at the end of the block.  Edge costs are evaluated using
+ * the cost model 'c->p.n.costs'.  'cache_ptr' must point just past the last
+ * match cache entry belonging to the block.
+ *
+ * The algorithm works backwards from the end node, one node at a time.  At
+ * each node, the minimum cost to reach the end node is computed and the
+ * match/literal choice that begins that path is saved in the node's 'item'.
+ */
 static void
-deflate_optimize_and_write_block(struct deflate_compressor *c,
-				 struct deflate_output_bitstream *os,
-				 const u8 * const block_begin,
-				 const u32 block_length,
-				 const struct lz_match * const end_cache_ptr,
-				 const bool is_final_block)
+deflate_find_min_cost_path(struct deflate_compressor *c, const u32 block_length,
+			   const struct lz_match *cache_ptr)
+{
+	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
+	struct deflate_optimum_node *cur_node = end_node;
+
+	cur_node->cost_to_end = 0;
+	do {
+		unsigned num_matches;
+		unsigned literal;
+		u32 best_cost_to_end;
+
+		cur_node--;
+		cache_ptr--;
+
+		num_matches = cache_ptr->length;
+		literal = cache_ptr->offset;
+
+		/* It's always possible to choose a literal.  */
+		best_cost_to_end = c->p.n.costs.literal[literal] +
+				   (cur_node + 1)->cost_to_end;
+		cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
+
+		/* Also consider matches if there are any.  */
+		if (num_matches) {
+			const struct lz_match *match;
+			unsigned len;
+			unsigned offset;
+			unsigned offset_slot;
+			u32 offset_cost;
+			u32 cost_to_end;
+
+			/*
+			 * Consider each length from the minimum
+			 * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
+			 * match found at this position.  For each length, we
+			 * consider only the smallest offset for which that
+			 * length is available.  Although this is not guaranteed
+			 * to be optimal due to the possibility of a larger
+			 * offset costing less than a smaller offset to code,
+			 * this is a very useful heuristic.
+			 */
+			match = cache_ptr - num_matches;
+			len = DEFLATE_MIN_MATCH_LEN;
+			do {
+				offset = match->offset;
+				offset_slot = deflate_get_offset_slot(c, offset);
+				offset_cost = c->p.n.costs.offset_slot[offset_slot];
+				do {
+					cost_to_end = offset_cost +
+						      c->p.n.costs.length[len] +
+						      (cur_node + len)->cost_to_end;
+					if (cost_to_end < best_cost_to_end) {
+						best_cost_to_end = cost_to_end;
+						cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
+					}
+				} while (++len <= match->length);
+			} while (++match != cache_ptr);
+			cache_ptr -= num_matches;
+		}
+		cur_node->cost_to_end = best_cost_to_end;
+	} while (cur_node != &c->p.n.optimum_nodes[0]);
+}
+
+/*
+ * Choose the literal/match sequence to use for the current block.  The basic
+ * algorithm finds a minimum-cost path through the block's graph of
+ * literal/match choices, given a cost model.  However, the cost of each symbol
+ * is unknown until the Huffman codes have been built, but at the same time the
+ * Huffman codes depend on the frequencies of chosen symbols.  Consequently,
+ * multiple passes must be used to try to approximate an optimal solution.  The
+ * first pass uses default costs, mixed with the costs from the previous block
+ * if any.  Later passes use the Huffman codeword lengths from the previous pass
+ * as the costs.
+ */
+static void
+deflate_optimize_block(struct deflate_compressor *c, u32 block_length,
+		       const struct lz_match *cache_ptr, bool is_first_block)
 {
-	struct deflate_optimum_node * const end_node =
-		&c->p.n.optimum_nodes[block_length];
 	unsigned num_passes_remaining = c->p.n.num_optim_passes;
 	u32 i;
 
-	/* Force the block to really end at 'end_node', even if some matches
-	 * extend beyond it.  */
+	/* Force the block to really end at the desired length, even if some
+	 * matches extend beyond it. */
 	for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
 					ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
 		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
 
-	do {
-		/*
-		 * Beginning a new optimization pass and finding a new
-		 * minimum-cost path through the graph of possible match/literal
-		 * choices for this block.
-		 *
-		 * We find the minimum cost path from 'c->optimum_nodes[0]',
-		 * which represents the node at the beginning of the block, to
-		 * 'end_node', which represents the node at the end of the
-		 * block.  Edge costs are evaluated using the cost model
-		 * 'c->costs'.
-		 *
-		 * The algorithm works backward, starting at 'end_node' and
-		 * proceeding backwards one position at a time.  At each
-		 * position, the minimum cost to reach 'end_node' is computed
-		 * and the match/literal choice is saved.
-		 */
-		struct deflate_optimum_node *cur_node = end_node;
-		const struct lz_match *cache_ptr = end_cache_ptr;
+	/* Set the initial costs. */
+	if (is_first_block)
+		deflate_set_default_costs(c);
+	else
+		deflate_adjust_costs(c);
 
-		cur_node->cost_to_end = 0;
-		do {
-			unsigned num_matches;
-			unsigned literal;
-			u32 best_cost_to_end;
-			u32 best_item;
+	for (;;) {
+		/* Find the minimum-cost path for this pass. */
+		deflate_find_min_cost_path(c, block_length, cache_ptr);
 
-			cur_node--;
-			cache_ptr--;
+		/* Compute frequencies of the chosen symbols. */
+		deflate_reset_symbol_frequencies(c);
+		deflate_tally_item_list(c, block_length);
 
-			num_matches = cache_ptr->length;
-			literal = cache_ptr->offset;
+		if (--num_passes_remaining == 0)
+			break;
 
-			/* It's always possible to choose a literal.  */
-			best_cost_to_end = c->p.n.costs.literal[literal] +
-					   (cur_node + 1)->cost_to_end;
-			best_item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
-
-			/* Also consider matches if there are any.  */
-			if (num_matches) {
-				const struct lz_match *match;
-				unsigned len;
-				unsigned offset;
-				unsigned offset_slot;
-				u32 offset_cost;
-				u32 cost_to_end;
-
-				/*
-				 * Consider each length from the minimum
-				 * (DEFLATE_MIN_MATCH_LEN) to the length of the
-				 * longest match found at this position.  For
-				 * each length, we consider only the smallest
-				 * offset for which that length is available.
-				 * Although this is not guaranteed to be optimal
-				 * due to the possibility of a larger offset
-				 * costing less than a smaller offset to code,
-				 * this is a very useful heuristic.
-				 */
-				match = cache_ptr - num_matches;
-				len = DEFLATE_MIN_MATCH_LEN;
-				do {
-					offset = match->offset;
-					offset_slot = deflate_get_offset_slot(c, offset);
-					offset_cost = c->p.n.costs.offset_slot[offset_slot];
-					do {
-						cost_to_end = offset_cost +
-							      c->p.n.costs.length[len] +
-							      (cur_node + len)->cost_to_end;
-						if (cost_to_end < best_cost_to_end) {
-							best_cost_to_end = cost_to_end;
-							best_item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
-						}
-					} while (++len <= match->length);
-				} while (++match != cache_ptr);
-				cache_ptr -= num_matches;
-			}
-			cur_node->cost_to_end = best_cost_to_end;
-			cur_node->item = best_item;
-		} while (cur_node != &c->p.n.optimum_nodes[0]);
-
-		/* Tally Huffman symbol frequencies.  */
-		deflate_tally_item_list(c, end_node);
-
-		/* If this wasn't the last pass, update the cost model.  */
-		if (num_passes_remaining > 1) {
-			deflate_make_huffman_codes(&c->freqs, &c->codes);
-			deflate_set_costs(c, &c->codes.lens);
-			deflate_reset_symbol_frequencies(c);
-		}
-	} while (--num_passes_remaining);
-
-	/* All optimization passes are done.  Output a block using the
-	 * minimum-cost path computed on the last optimization pass.  */
-	deflate_flush_block(c, os, block_begin, block_length,
-			    is_final_block, true);
+		/* At least one optimization pass remains; update the costs. */
+		deflate_make_huffman_codes(&c->freqs, &c->codes);
+		deflate_set_costs_from_codes(c, &c->codes.lens);
+	}
 }
 
 /*
@@ -2448,17 +2474,17 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c,
 
 		struct lz_match *cache_ptr = c->p.n.match_cache;
 		const u8 * const in_block_begin = in_next;
-		const u8 * const in_max_block_end = in_next + MIN(in_end - in_next, MAX_BLOCK_LENGTH);
+		const u8 * const in_max_block_end =
+			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
 		const u8 *next_observation = in_next;
 
 		init_block_split_stats(&c->split_stats);
-		deflate_reset_symbol_frequencies(c);
 
 		/*
 		 * Find matches until we decide to end the block.  We end the
 		 * block if any of the following is true:
 		 *
-		 * (1) Maximum block size has been reached
+		 * (1) Maximum block length has been reached
 		 * (2) Match catch may overflow.
 		 * (3) Block split heuristic says to split now.
 		 */
@@ -2556,7 +2582,6 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c,
 						bt_matchfinder_skip_position(&c->p.n.bt_mf,
 									     in_cur_base,
 									     in_next - in_cur_base,
-									     max_len,
 									     nice_len,
 									     c->max_search_depth,
 									     next_hashes);
@@ -2571,16 +2596,12 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c,
 			 cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] &&
 			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
 
-		/* All the matches for this block have been cached.  Now compute
-		 * a near-optimal sequence of literals and matches, and output
-		 * the block.  */
-		if (in_block_begin == in)
-			deflate_set_default_costs(c);
-		else
-			deflate_adjust_costs(c);
-		deflate_optimize_and_write_block(c, &os, in_block_begin,
-						 in_next - in_block_begin,
-						 cache_ptr, in_next == in_end);
+		/* All the matches for this block have been cached.  Now choose
+		 * the sequence of items to output and flush the block.  */
+		deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
+				       in_block_begin == in);
+		deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin,
+				    in_next == in_end, true);
 	} while (in_next != in_end);
 
 	return deflate_flush_output(&os);