From 71db68b27ff6a237e97be89aaf6f3ae4f62622f2 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers3@gmail.com>
Date: Sun, 2 Jan 2022 11:54:50 -0600
Subject: [PATCH] deflate_compress: adjust block splitting conditions

For fastest, greedy, lazy, and lazy2: save memory by reducing the length
of the sequence store, and forcing a split if it is filled.

For fastest: increase the max block length, but use a relatively short
sequence store that will cause shorter blocks to be used often.

For all: allow the final block to exceed the soft maximum length if it
avoids having to create a block below the minimum length.
---
 lib/deflate_compress.c | 183 +++++++++++++++++++++++++++++------------
 1 file changed, 129 insertions(+), 54 deletions(-)

diff --git a/lib/deflate_compress.c b/lib/deflate_compress.c
index 4b3ee36..bc586c3 100644
--- a/lib/deflate_compress.c
+++ b/lib/deflate_compress.c
@@ -51,10 +51,11 @@
 #define SUPPORT_NEAR_OPTIMAL_PARSING	1
 
 /*
- * This is the minimum block length, in uncompressed bytes, which the compressor
- * will use.  This should be a value below which using shorter blocks is very
- * unlikely to be worthwhile, due to the per-block overhead.  This parameter
- * doesn't apply to the final block, which can be arbitrarily short.
+ * This is the minimum block length that the compressor will use, in
+ * uncompressed bytes.  It is also the amount by which the final block is
+ * allowed to grow past the soft maximum length in order to avoid using a very
+ * short block at the end.  This should be a value below which using shorter
+ * blocks is unlikely to be worthwhile, due to the per-block overhead.
  *
  * Defining a fixed minimum block length is needed in order to guarantee a
  * reasonable upper bound on the compressed size.  It's also needed because our
@@ -63,23 +64,46 @@
 #define MIN_BLOCK_LENGTH	10000
 
 /*
- * This is the soft maximum block length, in uncompressed bytes, which the
- * compressor will use.  This is a "soft" maximum, meaning that the compressor
- * will try to end blocks at this length, but it may go slightly past it if
- * there is a match that straddles this limit.  This parameter doesn't apply to
- * uncompressed blocks, which the DEFLATE format limits to 65535 bytes.
+ * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft
+ * maximum block length, in uncompressed bytes.  The compressor will try to end
+ * blocks at this length, but it may go slightly past it if there is a match
+ * that straddles this limit or if the input data ends soon after this limit.
+ * This parameter doesn't apply to uncompressed blocks, which the DEFLATE format
+ * limits to 65535 bytes.
  *
  * This should be a value above which it is very likely that splitting the block
- * would produce a better compression ratio.  Increasing/decreasing this
- * parameter will increase/decrease per-compressor memory usage linearly.
+ * would produce a better compression ratio.  For the near-optimal compressor,
+ * increasing/decreasing this parameter will increase/decrease per-compressor
+ * memory usage linearly.
  */
 #define SOFT_MAX_BLOCK_LENGTH	300000
 
 /*
- * Block length, in uncompressed bytes, used by deflate_compress_fastest().
- * deflate_compress_fastest() doesn't use the other block length settings.
+ * For the greedy, lazy, and lazy2 compressors: this is the length of the
+ * sequence store, which is an array where the compressor temporarily stores
+ * matches that it's going to use in the current block.  This value is 1 more
+ * than the number of matches that can be used in a block.  If the sequence
+ * store fills up, then the compressor will be forced to end the block early.
+ * This value should be large enough so that this rarely happens, due to the
+ * block being ended normally before then.  Increasing/decreasing this value
+ * will increase/decrease per-compressor memory usage linearly.
  */
-#define FAST_BLOCK_LENGTH	MIN(32768, SOFT_MAX_BLOCK_LENGTH)
+#define SEQ_STORE_LENGTH	50000
+
+/*
+ * For deflate_compress_fastest(): This is the soft maximum block length.
+ * deflate_compress_fastest() doesn't use the regular block splitting algorithm;
+ * it only ends blocks when they reach FAST_SOFT_MAX_BLOCK_LENGTH bytes or
+ * FAST_SEQ_STORE_LENGTH - 1 matches.  Therefore, this value should be lower
+ * than the regular SOFT_MAX_BLOCK_LENGTH.
+ */
+#define FAST_SOFT_MAX_BLOCK_LENGTH	65535
+
+/*
+ * For deflate_compress_fastest(): this is the length of the sequence store.
+ * This is like SEQ_STORE_LENGTH, but this should be a lower value.
+ */
+#define FAST_SEQ_STORE_LENGTH	8192
 
 /*
  * These are the maximum codeword lengths, in bits, the compressor will use for
@@ -97,13 +121,13 @@
 /* Parameters specific to the near-optimal parsing algorithm */
 
 /*
- * BIT_COST is a scaling factor that allows the compressor to consider
- * fractional bit costs when deciding which literal/match sequence to use.  This
- * is useful when the true symbol costs are unknown.  For example, if the
- * compressor thinks that a symbol has 6.5 bits of entropy, it can set its cost
- * to 6.5 bits rather than have to use 6 or 7 bits.  Although in the end each
- * symbol will use a whole number of bits due to the Huffman coding, considering
- * fractional bits can be helpful due to the limited information.
+ * BIT_COST is a scaling factor that allows the near-optimal compressor to
+ * consider fractional bit costs when deciding which literal/match sequence to
+ * use.  This is useful when the true symbol costs are unknown.  For example, if
+ * the compressor thinks that a symbol has 6.5 bits of entropy, it can set its
+ * cost to 6.5 bits rather than have to use 6 or 7 bits.  Although in the end
+ * each symbol will use a whole number of bits due to the Huffman coding,
+ * considering fractional bits can be helpful due to the limited information.
  *
  * BIT_COST should be a power of 2.  A value of 8 or 16 works well.  A higher
  * value isn't very useful since the calculations are approximate anyway.
@@ -122,12 +146,9 @@
 #define OFFSET_NOSTAT_BITS	10
 
 /*
- * This is (approximately) the maximum number of matches that the compressor
- * will cache per block.  If the match cache becomes full, then the compressor
- * will be forced to end the block early.  This value should be large enough so
- * that this rarely happens, due to the block being ended normally before the
- * cache fills up.  Increasing/decreasing this parameter will increase/decrease
- * per-compressor memory usage linearly.
+ * This is (slightly less than) the maximum number of matches that the
+ * near-optimal compressor will cache per block.  This behaves similarly to
+ * SEQ_STORE_LENGTH for the other compressors.
  */
 #define MATCH_CACHE_LENGTH	(SOFT_MAX_BLOCK_LENGTH * 5)
 
@@ -152,6 +173,47 @@
 	(DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
 #endif
 
+/* Build-time checks of the block-splitting and codeword-length parameters. */
+static forceinline void
+check_buildtime_parameters(void)
+{
+	/*
+	 * libdeflate_compress_bound() relies on MIN_BLOCK_LENGTH being honored,
+	 * so no block-ending limit may correspond to fewer bytes than that.
+	 */
+	STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH);
+	STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH);
+	STATIC_ASSERT((SEQ_STORE_LENGTH - 1) * DEFLATE_MIN_MATCH_LEN >=
+		      MIN_BLOCK_LENGTH);
+	STATIC_ASSERT((FAST_SEQ_STORE_LENGTH - 1) *
+		      HT_MATCHFINDER_MIN_MATCH_LEN >= MIN_BLOCK_LENGTH);
+
+	/* Reject sequence stores too large for even the longest block. */
+	STATIC_ASSERT((SEQ_STORE_LENGTH - 1) * DEFLATE_MIN_MATCH_LEN <=
+		      SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH);
+	STATIC_ASSERT((FAST_SEQ_STORE_LENGTH - 1) *
+		      HT_MATCHFINDER_MIN_MATCH_LEN <=
+		      FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH);
+
+	/* The codeword length limits may not exceed the DEFLATE maximums... */
+	STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <=
+		      DEFLATE_MAX_LITLEN_CODEWORD_LEN);
+	STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <=
+		      DEFLATE_MAX_OFFSET_CODEWORD_LEN);
+	STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <=
+		      DEFLATE_MAX_PRE_CODEWORD_LEN);
+
+	/* ...and 2^limit must be at least the number of symbols in the code. */
+	STATIC_ASSERT((1U << MAX_LITLEN_CODEWORD_LEN) >=
+		      DEFLATE_NUM_LITLEN_SYMS);
+	STATIC_ASSERT((1U << MAX_OFFSET_CODEWORD_LEN) >=
+		      DEFLATE_NUM_OFFSET_SYMS);
+	STATIC_ASSERT((1U << MAX_PRE_CODEWORD_LEN) >=
+		      DEFLATE_NUM_PRECODE_SYMS);
+}
+
+/******************************************************************************/
+
 /* Table: length slot => length slot base value  */
 static const unsigned deflate_length_slot_base[] = {
 	3   , 4   , 5   , 6   , 7   , 8   , 9   , 10  ,
@@ -424,14 +486,12 @@ struct libdeflate_compressor {
 			/* Hash chains matchfinder */
 			struct hc_matchfinder hc_mf;
 
-			/* The matches and literals that the parser has chosen
-			 * for the current block.  The required length of this
-			 * array is limited by the maximum number of matches
-			 * that can ever be chosen for a single block, plus one
-			 * for the special entry at the end.  */
-			struct deflate_sequence sequences[
-				DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
-					     DEFLATE_MIN_MATCH_LEN) + 1];
+			/*
+			 * The matches and literals that the parser has chosen
+			 * for the current block.
+			 */
+			struct deflate_sequence sequences[SEQ_STORE_LENGTH];
+
 		} g; /* (g)reedy */
 
 		/* Data for fastest parsing */
@@ -440,8 +500,8 @@ struct libdeflate_compressor {
 			struct ht_matchfinder ht_mf;
 
 			struct deflate_sequence sequences[
-				DIV_ROUND_UP(FAST_BLOCK_LENGTH,
-					     HT_MATCHFINDER_MIN_MATCH_LEN) + 1];
+						FAST_SEQ_STORE_LENGTH];
+
 		} f; /* (f)astest */
 
 	#if SUPPORT_NEAR_OPTIMAL_PARSING
@@ -485,15 +545,17 @@ struct libdeflate_compressor {
 			 * minimum-cost path algorithm.
 			 *
 			 * This array must be large enough to accommodate the
-			 * worst-case number of nodes, which occurs if we find a
-			 * match of length DEFLATE_MAX_MATCH_LEN at position
-			 * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
-			 * length SOFT_MAX_BLOCK_LENGTH - 1 +
-			 * DEFLATE_MAX_MATCH_LEN.  Add one for the end-of-block
-			 * node.
+			 * worst-case number of nodes, which occurs when the
+			 * final block is of length SOFT_MAX_BLOCK_LENGTH +
+			 * MIN_BLOCK_LENGTH, or when any block is of length
+			 * SOFT_MAX_BLOCK_LENGTH + DEFLATE_MAX_MATCH_LEN
+			 * - 1.  Add one for the end-of-block node.
 			 */
-			struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
-								  DEFLATE_MAX_MATCH_LEN + 1];
+			struct deflate_optimum_node optimum_nodes[
+						SOFT_MAX_BLOCK_LENGTH +
+						MAX(MIN_BLOCK_LENGTH,
+						    DEFLATE_MAX_MATCH_LEN - 1)
+						+ 1];
 
 			/* The current cost model being used.  */
 			struct deflate_costs costs;
@@ -2160,6 +2222,14 @@ recalculate_min_match_len(const struct deflate_freqs *freqs,
 	return choose_min_match_len(num_used_literals, max_search_depth);
 }
 
+/* End at soft_max_len, or at in_end if under MIN_BLOCK_LENGTH would remain. */
+static forceinline const u8 *
+choose_max_block_end(const u8 *in_next, const u8 *in_end, size_t soft_max_len)
+{
+	return in_end - in_next < soft_max_len + MIN_BLOCK_LENGTH ?
+		in_end : in_next + soft_max_len;
+}
+
 /*
  * This is the level 0 "compressor".  It always outputs uncompressed blocks.
  */
@@ -2203,8 +2273,8 @@ deflate_compress_fastest(struct libdeflate_compressor * restrict c,
 		/* Starting a new DEFLATE block. */
 
 		const u8 * const in_block_begin = in_next;
-		const u8 * const in_max_block_end =
-			in_next + MIN(in_end - in_next, FAST_BLOCK_LENGTH);
+		const u8 * const in_max_block_end = choose_max_block_end(
+				in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH);
 		struct deflate_sequence *seq = c->p.f.sequences;
 
 		deflate_begin_sequences(c, seq);
@@ -2248,7 +2318,8 @@ deflate_compress_fastest(struct libdeflate_compressor * restrict c,
 			}
 
 			/* Check if it's time to output another block. */
-		} while (in_next < in_max_block_end);
+		} while (in_next < in_max_block_end &&
+			 seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH - 1]);
 
 		deflate_flush_block(c, &os, in_block_begin,
 				    in_next - in_block_begin,
@@ -2281,8 +2352,8 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
 		/* Starting a new DEFLATE block. */
 
 		const u8 * const in_block_begin = in_next;
-		const u8 * const in_max_block_end =
-			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
+		const u8 * const in_max_block_end = choose_max_block_end(
+				in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
 		unsigned min_len;
 		struct deflate_sequence *seq = c->p.g.sequences;
 
@@ -2332,6 +2403,7 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
 
 			/* Check if it's time to output another block. */
 		} while (in_next < in_max_block_end &&
+			 seq < &c->p.g.sequences[SEQ_STORE_LENGTH - 1] &&
 			 !should_end_block(&c->split_stats,
 					   in_block_begin, in_next, in_end));
 
@@ -2364,8 +2436,8 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
 		/* Starting a new DEFLATE block. */
 
 		const u8 * const in_block_begin = in_next;
-		const u8 * const in_max_block_end =
-			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
+		const u8 * const in_max_block_end = choose_max_block_end(
+				in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
 		const u8 *next_recalc_min_len =
 			in_next + MIN(in_end - in_next, 10000);
 		unsigned min_len = DEFLATE_MIN_MATCH_LEN;
@@ -2544,6 +2616,7 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
 			}
 			/* Check if it's time to output another block. */
 		} while (in_next < in_max_block_end &&
+			 seq < &c->p.g.sequences[SEQ_STORE_LENGTH - 1] &&
 			 !should_end_block(&c->split_stats,
 					   in_block_begin, in_next, in_end));
 
@@ -3178,8 +3251,8 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 
 		struct lz_match *cache_ptr = c->p.n.match_cache;
 		const u8 * const in_block_begin = in_next;
-		const u8 * const in_max_block_end =
-			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
+		const u8 * const in_max_block_end = choose_max_block_end(
+				in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
 		const u8 *next_observation = in_next;
 
 		deflate_near_optimal_begin_block(c, in_block_begin == in);
@@ -3347,6 +3420,8 @@ libdeflate_alloc_compressor(int compression_level)
 	struct libdeflate_compressor *c;
 	size_t size = offsetof(struct libdeflate_compressor, p);
 
+	check_buildtime_parameters();
+
 	if (compression_level < 0 || compression_level > 12)
 		return NULL;