deflate_compress: improve costs for near-optimal parsing

Further improve the way the near-optimal parser estimates symbol costs:

- When setting a block's initial costs, weigh the default costs and previous block's costs differently, depending on how different the current block seems to be from the previous block.

- When determining the "default" costs, take into account how many literals appear in the block and how frequent matches seem to be.

- Increase BIT_COST from 8 to 16, to increase precision in calculations.
2025-09-23 03:17:30 -04:00 · 2021-12-31 16:04:49 -06:00 · 2021-12-31 16:04:49 -06:00 · 3dca7de4bd
commit 3dca7de4bd
parent bf3e032f71
2 changed files with 406 additions and 68 deletions
--- a/lib/deflate_compress.c
+++ b/lib/deflate_compress.c
@ -110,7 +110,7 @@
 * BIT_COST should be a power of 2.  A value of 8 or 16 works well.  A higher
 * value isn't very useful since the calculations are approximate anyway.
 */
-#define BIT_COST	8
+#define BIT_COST	16

 /*
 * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
@ -463,6 +463,17 @@ struct libdeflate_compressor {
 			/* The current cost model being used.  */
 			struct deflate_costs costs;

+			/* Literal/match statistics saved from previous block */
+			u32 prev_observations[NUM_OBSERVATION_TYPES];
+			u32 prev_num_observations;
+
+			/*
+			 * Approximate match length frequencies based on a
+			 * greedy parse, gathered during matchfinding.  This is
+			 * used for setting the initial symbol costs.
+			 */
+			u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
+
 			unsigned num_optim_passes;
 		} n; /* (n)ear-optimal */
 	#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
@ -1930,17 +1941,29 @@ observe_match(struct block_split_stats *stats, unsigned length)
 	stats->num_new_observations++;
 }

-static bool
-do_end_block_check(struct block_split_stats *stats, u32 block_length)
+static void
+merge_new_observations(struct block_split_stats *stats)
 {
 	int i;

-	if (stats->num_observations > 0) {
+	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+		stats->observations[i] += stats->new_observations[i];
+		stats->new_observations[i] = 0;
+	}
+	stats->num_observations += stats->num_new_observations;
+	stats->num_new_observations = 0;
+}

+static bool
+do_end_block_check(struct block_split_stats *stats, u32 block_length)
+{
+	if (stats->num_observations > 0) {
 		/* Note: to avoid slow divisions, we do not divide by
 		 * 'num_observations', but rather do all math with the numbers
 		 * multiplied by 'num_observations'.  */
 		u32 total_delta = 0;
+		int i;
+
 		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
 			u32 expected = stats->observations[i] * stats->num_new_observations;
 			u32 actual = stats->new_observations[i] * stats->num_observations;
@ -1954,13 +1977,7 @@ do_end_block_check(struct block_split_stats *stats, u32 block_length)
 		    NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
 			return true;
 	}
-
-	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
-		stats->observations[i] += stats->new_observations[i];
-		stats->new_observations[i] = 0;
-	}
-	stats->num_observations += stats->num_new_observations;
-	stats->num_new_observations = 0;
+	merge_new_observations(stats);
 	return false;
 }

@ -2495,63 +2512,287 @@ deflate_set_costs_from_codes(struct libdeflate_compressor *c,
 	}
 }

-static forceinline u32
-deflate_default_literal_cost(unsigned literal)
-{
-	STATIC_ASSERT(BIT_COST == 8);
-	/* 66 is 8.25 bits/symbol  */
-	return 66;
-}
-
-static forceinline u32
-deflate_default_length_slot_cost(unsigned length_slot)
-{
-	STATIC_ASSERT(BIT_COST == 8);
-	/* 60 is 7.5 bits/symbol  */
-	return 60 + ((u32)deflate_extra_length_bits[length_slot] * BIT_COST);
-}
-
-static forceinline u32
-deflate_default_offset_slot_cost(unsigned offset_slot)
-{
-	STATIC_ASSERT(BIT_COST == 8);
-	/* 39 is 4.875 bits/symbol  */
-	return 39 + ((u32)deflate_extra_offset_bits[offset_slot] * BIT_COST);
-}
+/*
+ * This lookup table gives the default cost of a literal symbol and of a length
+ * symbol, depending on the characteristics of the input data.  It was generated
+ * by scripts/gen_default_litlen_costs.py.
+ *
+ * This table is indexed first by the estimated match probability:
+ *
+ *	i=0: data doesn't contain many matches	[match_prob=0.25]
+ *	i=1: neutral				[match_prob=0.50]
+ *	i=2: data contains lots of matches	[match_prob=0.75]
+ *
+ * This lookup produces a subtable which maps the number of distinct used
+ * literals to the default cost of a literal symbol, i.e.:
+ *
+ *	int(-log2((1 - match_prob) / num_used_literals) * BIT_COST)
+ *
+ * ... for num_used_literals in [1, 256] (and 0, which is copied from 1).  This
+ * accounts for literals usually getting cheaper as the number of distinct
+ * literals decreases, and as the proportion of literals to matches increases.
+ *
+ * The lookup also produces the cost of a length symbol, which is:
+ *
+ *	int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST)
+ *
+ * Note: we don't currently assign different costs to different literal symbols,
+ * or to different length symbols, as this is hard to do.
+ */
+static const struct {
+	u8 used_lits_to_lit_cost[257];
+	u8 len_sym_cost;
+} default_litlen_costs[] = {
+	{ /* match_prob = 0.25 */
+		.used_lits_to_lit_cost = {
+			6, 6, 22, 32, 38, 43, 48, 51,
+			54, 57, 59, 61, 64, 65, 67, 69,
+			70, 72, 73, 74, 75, 76, 77, 79,
+			80, 80, 81, 82, 83, 84, 85, 85,
+			86, 87, 88, 88, 89, 89, 90, 91,
+			91, 92, 92, 93, 93, 94, 95, 95,
+			96, 96, 96, 97, 97, 98, 98, 99,
+			99, 99, 100, 100, 101, 101, 101, 102,
+			102, 102, 103, 103, 104, 104, 104, 105,
+			105, 105, 105, 106, 106, 106, 107, 107,
+			107, 108, 108, 108, 108, 109, 109, 109,
+			109, 110, 110, 110, 111, 111, 111, 111,
+			112, 112, 112, 112, 112, 113, 113, 113,
+			113, 114, 114, 114, 114, 114, 115, 115,
+			115, 115, 115, 116, 116, 116, 116, 116,
+			117, 117, 117, 117, 117, 118, 118, 118,
+			118, 118, 118, 119, 119, 119, 119, 119,
+			120, 120, 120, 120, 120, 120, 121, 121,
+			121, 121, 121, 121, 121, 122, 122, 122,
+			122, 122, 122, 123, 123, 123, 123, 123,
+			123, 123, 124, 124, 124, 124, 124, 124,
+			124, 125, 125, 125, 125, 125, 125, 125,
+			125, 126, 126, 126, 126, 126, 126, 126,
+			127, 127, 127, 127, 127, 127, 127, 127,
+			128, 128, 128, 128, 128, 128, 128, 128,
+			128, 129, 129, 129, 129, 129, 129, 129,
+			129, 129, 130, 130, 130, 130, 130, 130,
+			130, 130, 130, 131, 131, 131, 131, 131,
+			131, 131, 131, 131, 131, 132, 132, 132,
+			132, 132, 132, 132, 132, 132, 132, 133,
+			133, 133, 133, 133, 133, 133, 133, 133,
+			133, 134, 134, 134, 134, 134, 134, 134,
+			134,
+		},
+		.len_sym_cost = 109,
+	}, { /* match_prob = 0.5 */
+		.used_lits_to_lit_cost = {
+			16, 16, 32, 41, 48, 53, 57, 60,
+			64, 66, 69, 71, 73, 75, 76, 78,
+			80, 81, 82, 83, 85, 86, 87, 88,
+			89, 90, 91, 92, 92, 93, 94, 95,
+			96, 96, 97, 98, 98, 99, 99, 100,
+			101, 101, 102, 102, 103, 103, 104, 104,
+			105, 105, 106, 106, 107, 107, 108, 108,
+			108, 109, 109, 110, 110, 110, 111, 111,
+			112, 112, 112, 113, 113, 113, 114, 114,
+			114, 115, 115, 115, 115, 116, 116, 116,
+			117, 117, 117, 118, 118, 118, 118, 119,
+			119, 119, 119, 120, 120, 120, 120, 121,
+			121, 121, 121, 122, 122, 122, 122, 122,
+			123, 123, 123, 123, 124, 124, 124, 124,
+			124, 125, 125, 125, 125, 125, 126, 126,
+			126, 126, 126, 127, 127, 127, 127, 127,
+			128, 128, 128, 128, 128, 128, 129, 129,
+			129, 129, 129, 129, 130, 130, 130, 130,
+			130, 130, 131, 131, 131, 131, 131, 131,
+			131, 132, 132, 132, 132, 132, 132, 133,
+			133, 133, 133, 133, 133, 133, 134, 134,
+			134, 134, 134, 134, 134, 134, 135, 135,
+			135, 135, 135, 135, 135, 135, 136, 136,
+			136, 136, 136, 136, 136, 136, 137, 137,
+			137, 137, 137, 137, 137, 137, 138, 138,
+			138, 138, 138, 138, 138, 138, 138, 139,
+			139, 139, 139, 139, 139, 139, 139, 139,
+			140, 140, 140, 140, 140, 140, 140, 140,
+			140, 141, 141, 141, 141, 141, 141, 141,
+			141, 141, 141, 142, 142, 142, 142, 142,
+			142, 142, 142, 142, 142, 142, 143, 143,
+			143, 143, 143, 143, 143, 143, 143, 143,
+			144,
+		},
+		.len_sym_cost = 93,
+	}, { /* match_prob = 0.75 */
+		.used_lits_to_lit_cost = {
+			32, 32, 48, 57, 64, 69, 73, 76,
+			80, 82, 85, 87, 89, 91, 92, 94,
+			96, 97, 98, 99, 101, 102, 103, 104,
+			105, 106, 107, 108, 108, 109, 110, 111,
+			112, 112, 113, 114, 114, 115, 115, 116,
+			117, 117, 118, 118, 119, 119, 120, 120,
+			121, 121, 122, 122, 123, 123, 124, 124,
+			124, 125, 125, 126, 126, 126, 127, 127,
+			128, 128, 128, 129, 129, 129, 130, 130,
+			130, 131, 131, 131, 131, 132, 132, 132,
+			133, 133, 133, 134, 134, 134, 134, 135,
+			135, 135, 135, 136, 136, 136, 136, 137,
+			137, 137, 137, 138, 138, 138, 138, 138,
+			139, 139, 139, 139, 140, 140, 140, 140,
+			140, 141, 141, 141, 141, 141, 142, 142,
+			142, 142, 142, 143, 143, 143, 143, 143,
+			144, 144, 144, 144, 144, 144, 145, 145,
+			145, 145, 145, 145, 146, 146, 146, 146,
+			146, 146, 147, 147, 147, 147, 147, 147,
+			147, 148, 148, 148, 148, 148, 148, 149,
+			149, 149, 149, 149, 149, 149, 150, 150,
+			150, 150, 150, 150, 150, 150, 151, 151,
+			151, 151, 151, 151, 151, 151, 152, 152,
+			152, 152, 152, 152, 152, 152, 153, 153,
+			153, 153, 153, 153, 153, 153, 154, 154,
+			154, 154, 154, 154, 154, 154, 154, 155,
+			155, 155, 155, 155, 155, 155, 155, 155,
+			156, 156, 156, 156, 156, 156, 156, 156,
+			156, 157, 157, 157, 157, 157, 157, 157,
+			157, 157, 157, 158, 158, 158, 158, 158,
+			158, 158, 158, 158, 158, 158, 159, 159,
+			159, 159, 159, 159, 159, 159, 159, 159,
+			160,
+		},
+		.len_sym_cost = 84,
+	},
+};

 /*
- * Set default symbol costs for the first block's first optimization pass.
- *
- * It works well to assume that each symbol is equally probable.  This results
- * in each symbol being assigned a cost of (-log2(1.0/num_syms) * BIT_COST)
- * where 'num_syms' is the number of symbols in the corresponding alphabet.
- * However, we intentionally bias the parse towards matches rather than literals
- * by using a slightly lower default cost for length symbols than for literals.
- * This often improves the compression ratio slightly.
+ * Choose the default costs for literal and length symbols.  These symbols are
+ * both part of the litlen alphabet.
 */
 static void
-deflate_set_default_costs(struct libdeflate_compressor *c)
+deflate_choose_default_litlen_costs(struct libdeflate_compressor *c,
+				    u32 block_length,
+				    u32 *lit_cost, u32 *len_sym_cost)
+{
+	unsigned num_used_literals = 0;
+	u32 literal_freq = block_length;
+	u32 match_freq = 0;
+	u32 cutoff;
+	int i;
+
+	/* Calculate the number of distinct literals that exist in the data. */
+	cutoff = literal_freq >> 11; /* Ignore literals used very rarely */
+	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
+		if (c->freqs.litlen[i] > cutoff)
+			num_used_literals++;
+	}
+	if (num_used_literals == 0)
+		num_used_literals = 1;
+
+	/*
+	 * Estimate the relative frequency of literals and matches in the
+	 * optimal parsing solution.  We don't know the optimal solution, so
+	 * this can only be a very rough estimate.  Therefore, we basically use
+	 * the match frequency from a greedy parse.  We also apply the min_len
+	 * heuristic used by the greedy and lazy parsers, to avoid counting too
+	 * many matches when literals are cheaper than short matches.
+	 */
+	match_freq = 0;
+	i = choose_min_match_len(num_used_literals, c->max_search_depth);
+	for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
+		match_freq += c->p.n.match_len_freqs[i];
+		literal_freq -= i * c->p.n.match_len_freqs[i];
+	}
+	if ((s32)literal_freq < 0) /* shouldn't happen */
+		literal_freq = 0;
+
+	if (match_freq > literal_freq)
+		i = 2; /* many matches */
+	else if (match_freq * 4 > literal_freq)
+		i = 1; /* neutral */
+	else
+		i = 0; /* few matches */
+
+	*lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[
+							num_used_literals];
+	*len_sym_cost = default_litlen_costs[i].len_sym_cost;
+}
+
+static forceinline u32
+deflate_default_length_cost(unsigned len, u32 len_sym_cost)
+{
+	unsigned slot = deflate_length_slot[len];
+	u32 num_extra_bits = deflate_extra_length_bits[slot];
+
+	return len_sym_cost + (num_extra_bits * BIT_COST);
+}
+
+static forceinline u32
+deflate_default_offset_slot_cost(unsigned slot)
+{
+	u32 num_extra_bits = deflate_extra_offset_bits[slot];
+	/*
+	 * Assume that all offset symbols are equally probable.
+	 * The resulting cost is 'int(-log2(1/30) * BIT_COST)',
+	 * where 30 is the number of potentially-used offset symbols.
+	 */
+	u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000;
+
+	return offset_sym_cost + (num_extra_bits * BIT_COST);
+}
+
+/* Set default symbol costs for the first block's first optimization pass. */
+static void
+deflate_set_default_costs(struct libdeflate_compressor *c,
+			  u32 lit_cost, u32 len_sym_cost)
 {
 	unsigned i;

 	/* Literals  */
 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
-		c->p.n.costs.literal[i] = deflate_default_literal_cost(i);
+		c->p.n.costs.literal[i] = lit_cost;

 	/* Lengths  */
 	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
-		c->p.n.costs.length[i] = deflate_default_length_slot_cost(
-						deflate_length_slot[i]);
+		c->p.n.costs.length[i] =
+			deflate_default_length_cost(i, len_sym_cost);

 	/* Offset slots  */
 	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
-		c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i);
+		c->p.n.costs.offset_slot[i] =
+			deflate_default_offset_slot_cost(i);
 }

 static forceinline void
-deflate_adjust_cost(u32 *cost_p, u32 default_cost)
+deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount)
 {
-	*cost_p += ((s32)default_cost - (s32)*cost_p) >> 1;
+	if (change_amount == 0)
+		/* Block is very similar to previous; prefer previous costs. */
+		*cost_p = (default_cost + 3 * *cost_p) / 4;
+	else if (change_amount == 1)
+		*cost_p = (default_cost + *cost_p) / 2;
+	else if (change_amount == 2)
+		*cost_p = (5 * default_cost + 3 * *cost_p) / 8;
+	else
+		/* Block differs greatly from previous; prefer default costs. */
+		*cost_p = (3 * default_cost + *cost_p) / 4;
+}
+
+static forceinline void
+deflate_adjust_costs_impl(struct libdeflate_compressor *c,
+			  u32 lit_cost, u32 len_sym_cost, int change_amount)
+{
+	unsigned i;
+
+	/* Literals  */
+	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
+		deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost,
+				    change_amount);
+
+	/* Lengths  */
+	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
+		deflate_adjust_cost(&c->p.n.costs.length[i],
+				    deflate_default_length_cost(i,
+								len_sym_cost),
+				    change_amount);
+
+	/* Offset slots  */
+	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
+		deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
+				    deflate_default_offset_slot_cost(i),
+				    change_amount);
 }

 /*
@ -2564,25 +2805,42 @@ deflate_adjust_cost(u32 *cost_p, u32 default_cost)
 * defaults, but don't simply set them to the defaults.
 */
 static void
-deflate_adjust_costs(struct libdeflate_compressor *c)
+deflate_adjust_costs(struct libdeflate_compressor *c,
+		     u32 lit_cost, u32 len_sym_cost)
 {
-	unsigned i;
+	u64 total_delta = 0;
+	u64 cutoff;
+	int i;

-	/* Literals  */
-	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
-		deflate_adjust_cost(&c->p.n.costs.literal[i],
-				    deflate_default_literal_cost(i));
+	/*
+	 * Decide how different the current block is from the previous block,
+	 * using the block splitting statistics from the current and previous
+	 * blocks.  The more different the current block is, the more we prefer
+	 * the default costs rather than the previous block's costs.
+	 *
+	 * The algorithm here is similar to the end-of-block check one, but here
+	 * we compare two entire blocks rather than a partial block with a small
+	 * extra part, and therefore we need 64-bit numbers in some places.
+	 */
+	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+		u64 prev = (u64)c->p.n.prev_observations[i] *
+			    c->split_stats.num_observations;
+		u64 cur = (u64)c->split_stats.observations[i] *
+			  c->p.n.prev_num_observations;

-	/* Lengths  */
-	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
-		deflate_adjust_cost(&c->p.n.costs.length[i],
-				    deflate_default_length_slot_cost(
-						deflate_length_slot[i]));
+		total_delta += prev > cur ? prev - cur : cur - prev;
+	}
+	cutoff = ((u64)c->p.n.prev_num_observations *
+		  c->split_stats.num_observations * 200) / 512;

-	/* Offset slots  */
-	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
-		deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
-				    deflate_default_offset_slot_cost(i));
+	if (4 * total_delta > 9 * cutoff)
+		deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3);
+	else if (2 * total_delta > 3 * cutoff)
+		deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2);
+	else if (2 * total_delta > cutoff)
+		deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1);
+	else
+		deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0);
 }

 /*
@ -2682,6 +2940,7 @@ deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
 		       bool is_final_block)
 {
 	unsigned num_passes_remaining = c->p.n.num_optim_passes;
+	u32 lit_cost, len_sym_cost;
 	u32 i;

 	/* Force the block to really end at the desired length, even if some
@ -2690,11 +2949,16 @@ deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
 					ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
 		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;

+	/* Make sure the literal/match statistics are up to date. */
+	merge_new_observations(&c->split_stats);
+
 	/* Set the initial costs. */
+	deflate_choose_default_litlen_costs(c, block_length,
+					    &lit_cost, &len_sym_cost);
 	if (is_first_block)
-		deflate_set_default_costs(c);
+		deflate_set_default_costs(c, lit_cost, len_sym_cost);
 	else
-		deflate_adjust_costs(c);
+		deflate_adjust_costs(c, lit_cost, len_sym_cost);

 	do {
 		/* Find the minimum cost path for this pass. */
@ -2717,6 +2981,34 @@ deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
 	} while (num_passes_remaining);
 }

+static void deflate_near_optimal_begin_block(struct libdeflate_compressor *c,
+					     bool is_first_block)
+{
+	int i;
+
+	if (!is_first_block) {
+		/*
+		 * Save some literal/match statistics from the previous block so
+		 * that deflate_adjust_costs() will be able to decide how much
+		 * the current block differs from the previous one.
+		 */
+		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+			c->p.n.prev_observations[i] =
+				c->split_stats.observations[i];
+		}
+		c->p.n.prev_num_observations = c->split_stats.num_observations;
+	}
+	init_block_split_stats(&c->split_stats);
+
+	/*
+	 * During matchfinding, we keep track of approximate literal and match
+	 * length frequencies for the purpose of setting the initial costs.
+	 */
+	memset(c->freqs.litlen, 0,
+	       DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0]));
+	memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
+}
+
 /*
 * This is the "near-optimal" DEFLATE compressor.  It computes the optimal
 * representation of each DEFLATE block using a minimum-cost path search over
@ -2757,7 +3049,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
 		const u8 *next_observation = in_next;

-		init_block_split_stats(&c->split_stats);
+		deflate_near_optimal_begin_block(c, in_block_begin == in);

 		/*
 		 * Find matches until we decide to end the block.  We end the
@ -2812,12 +3104,13 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 						&best_len,
 						matches);
 			}
-
+			c->freqs.litlen[*in_next]++;
 			if (in_next >= next_observation) {
 				if (best_len >= 4) {
 					observe_match(&c->split_stats,
 						      best_len);
 					next_observation = in_next + best_len;
+					c->p.n.match_len_freqs[best_len]++;
 				} else {
 					observe_literal(&c->split_stats,
 							*in_next);
@ -2870,6 +3163,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 					}
 					cache_ptr->length = 0;
 					cache_ptr->offset = *in_next;
+					c->freqs.litlen[*in_next]++;
 					in_next++;
 					cache_ptr++;
 				} while (--best_len);
--- a/scripts/gen_default_litlen_costs.py
+++ b/scripts/gen_default_litlen_costs.py
@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+#
+# This script computes the default litlen symbol costs for the near-optimal
+# parser.
+
+from math import log2
+
+BIT_COST = 16 # Must match BIT_COST in deflate_compress.c
+NUM_LEN_SLOTS = 29
+
+print("""static const struct {
+	u8 used_lits_to_lit_cost[257];
+	u8 len_sym_cost;
+} default_litlen_costs[] = {""")
+MATCH_PROBS = [0.25, 0.50, 0.75]
+for i, match_prob in enumerate(MATCH_PROBS):
+    len_prob = match_prob / NUM_LEN_SLOTS
+    len_sym_cost = int(-log2(len_prob) * BIT_COST)
+    if i == 0:
+        print('\t{', end='')
+    print(f' /* match_prob = {match_prob} */')
+    print('\t\t.used_lits_to_lit_cost = {')
+
+    j = 0
+    for num_used_literals in range(0, 257):
+        if num_used_literals == 0:
+            num_used_literals = 1
+        lit_prob = (1 - match_prob) / num_used_literals
+        lit_cost = int(-log2(lit_prob) * BIT_COST)
+        if j == 0:
+            print('\t\t\t', end='')
+        if j == 7 or num_used_literals == 256:
+            print(f'{lit_cost},')
+            j = 0
+        else:
+            print(f'{lit_cost}, ', end='')
+            j += 1
+    print('\t\t},')
+    print(f'\t\t.len_sym_cost = {len_sym_cost},')
+    if i < len(MATCH_PROBS) - 1:
+        print('\t}, {', end='')
+    else:
+        print('\t},')
+print('};')