diff --git a/lib/deflate_compress.c b/lib/deflate_compress.c
index 12e76ff..dc0c956 100644
--- a/lib/deflate_compress.c
+++ b/lib/deflate_compress.c
@@ -592,6 +592,7 @@ struct libdeflate_compressor {
 		 * greedy parse, gathered during matchfinding. This is
 		 * used for setting the initial symbol costs.
 		 */
+		u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
 		u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
 
 		unsigned num_optim_passes;
@@ -2164,14 +2165,22 @@ do_end_block_check(struct block_split_stats *stats, u32 block_length)
 	return false;
 }
 
+static forceinline bool
+ready_to_check_block(const struct block_split_stats *stats,
+		     const u8 *in_block_begin, const u8 *in_next,
+		     const u8 *in_end)
+{
+	return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK
+		&& in_next - in_block_begin >= MIN_BLOCK_LENGTH
+		&& in_end - in_next >= MIN_BLOCK_LENGTH;
+}
+
 static forceinline bool
 should_end_block(struct block_split_stats *stats,
 		 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
 {
-	/* Ready to check block split statistics? */
-	if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
-	    in_next - in_block_begin < MIN_BLOCK_LENGTH ||
-	    in_end - in_next < MIN_BLOCK_LENGTH)
+	/* Ready to try to end the block (again)? */
+	if (!ready_to_check_block(stats, in_block_begin, in_next, in_end))
 		return false;
 
 	return do_end_block_check(stats, in_next - in_block_begin);
@@ -2330,11 +2339,12 @@ recalculate_min_match_len(const struct deflate_freqs *freqs,
 }
 
 static forceinline const u8 *
-choose_max_block_end(const u8 *in_next, const u8 *in_end, size_t soft_max_len)
+choose_max_block_end(const u8 *in_block_begin, const u8 *in_end,
+		     size_t soft_max_len)
 {
-	if (in_end - in_next < soft_max_len + MIN_BLOCK_LENGTH)
+	if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH)
 		return in_end;
-	return in_next + soft_max_len;
+	return in_block_begin + soft_max_len;
 }
 
 /*
@@ -2981,17 +2991,21 @@ static const struct {
  */
 static void
 deflate_choose_default_litlen_costs(struct libdeflate_compressor *c,
-				    u32 block_length,
+				    const u8 *block_begin, u32 block_length,
 				    u32 *lit_cost, u32 *len_sym_cost)
 {
 	unsigned num_used_literals = 0;
 	u32 literal_freq = block_length;
 	u32 match_freq = 0;
 	u32 cutoff;
-	unsigned i;
+	u32 i;
 	/* Calculate the number of distinct literals that exist in the data. */
+	memset(c->freqs.litlen, 0,
+	       DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0]));
 	cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */
+	for (i = 0; i < block_length; i++)
+		c->freqs.litlen[block_begin[i]]++;
 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
 		if (c->freqs.litlen[i] > cutoff)
 			num_used_literals++;
 	}
@@ -3258,7 +3272,8 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c,
 * as the costs.
 */
 static void
-deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
+deflate_optimize_block(struct libdeflate_compressor *c,
+		       const u8 *block_begin, u32 block_length,
 		       const struct lz_match *cache_ptr,
 		       bool is_first_block, bool is_final_block)
 {
@@ -3275,11 +3290,8 @@ deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
 		    ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
 		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
 
-	/* Make sure the literal/match statistics are up to date. */
-	merge_new_observations(&c->split_stats);
-
 	/* Set the initial costs. */
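With block boundaries no longer fixed at matchfinding time, deflate_choose_default_litlen_costs() now builds the literal histogram itself from the block's exact bytes instead of reusing counts gathered during matchfinding. In isolation, the heuristic reduces to the sketch below: histogram the block, then count how many distinct byte values occur more often than a small cutoff (block length / 2048). This is a minimal standalone sketch; count_used_literals is a hypothetical name, not a libdeflate function.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static unsigned
count_used_literals(const uint8_t *block, size_t len)
{
	uint32_t freqs[256];
	uint32_t cutoff = (uint32_t)len >> 11; /* ignore very rare bytes */
	unsigned num_used = 0;
	size_t i;

	memset(freqs, 0, sizeof(freqs));
	for (i = 0; i < len; i++)
		freqs[block[i]]++;
	for (i = 0; i < 256; i++) {
		if (freqs[i] > cutoff)
			num_used++;
	}
	return num_used;
}

Ignoring rare byte values keeps the default cost model from being skewed by literals that appear only incidentally in the block.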
-	deflate_choose_default_litlen_costs(c, block_length,
+	deflate_choose_default_litlen_costs(c, block_begin, block_length,
 					    &lit_cost, &len_sym_cost);
 	if (is_first_block)
 		deflate_set_default_costs(c, lit_cost, len_sym_cost);
@@ -3308,31 +3320,49 @@ deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
 }
 
 static void
-deflate_near_optimal_begin_block(struct libdeflate_compressor *c,
-				 bool is_first_block)
+deflate_near_optimal_init_stats(struct libdeflate_compressor *c)
+{
+	init_block_split_stats(&c->split_stats);
+	memset(c->p.n.new_match_len_freqs, 0,
+	       sizeof(c->p.n.new_match_len_freqs));
+	memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
+}
+
+static void
+deflate_near_optimal_merge_stats(struct libdeflate_compressor *c)
+{
+	unsigned i;
+
+	merge_new_observations(&c->split_stats);
+	for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
+		c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i];
+		c->p.n.new_match_len_freqs[i] = 0;
+	}
+}
+
+/*
+ * Save some literal/match statistics from the previous block so that
+ * deflate_adjust_costs() will be able to decide how much the current block
+ * differs from the previous one.
+ */
+static void
+deflate_near_optimal_save_stats(struct libdeflate_compressor *c)
 {
 	int i;
 
-	if (!is_first_block) {
-		/*
-		 * Save some literal/match statistics from the previous block so
-		 * that deflate_adjust_costs() will be able to decide how much
-		 * the current block differs from the previous one.
-		 */
-		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
-			c->p.n.prev_observations[i] =
-				c->split_stats.observations[i];
-		}
-		c->p.n.prev_num_observations = c->split_stats.num_observations;
-	}
-	init_block_split_stats(&c->split_stats);
+	for (i = 0; i < NUM_OBSERVATION_TYPES; i++)
+		c->p.n.prev_observations[i] = c->split_stats.observations[i];
+	c->p.n.prev_num_observations = c->split_stats.num_observations;
+}
 
-	/*
-	 * During matchfinding, we keep track of approximate literal and match
-	 * length frequencies for the purpose of setting the initial costs.
-	 */
-	memset(c->freqs.litlen, 0,
-	       DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0]));
+static void
+deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c)
+{
+	int i;
+
+	for (i = 0; i < NUM_OBSERVATION_TYPES; i++)
+		c->split_stats.observations[i] = 0;
+	c->split_stats.num_observations = 0;
 	memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
 }
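deflate_near_optimal_begin_block() is split into init/merge/save/clear helpers so that observations gathered since the last end-of-block check stay separate from those already accepted into the block. Reduced to its essentials, the two-bucket pattern looks like the following sketch; the struct and function names here are illustrative, not libdeflate's.

#include <string.h>

#define NUM_BUCKETS 8

struct two_bucket_stats {
	unsigned long merged[NUM_BUCKETS]; /* observations accepted so far */
	unsigned long fresh[NUM_BUCKETS];  /* observations since last check */
};

static void
stats_init(struct two_bucket_stats *s)
{
	memset(s, 0, sizeof(*s));
}

static void
stats_observe(struct two_bucket_stats *s, unsigned bucket)
{
	s->fresh[bucket % NUM_BUCKETS]++;
}

/* Fold the recent observations into the long-term totals. */
static void
stats_merge(struct two_bucket_stats *s)
{
	unsigned i;

	for (i = 0; i < NUM_BUCKETS; i++) {
		s->merged[i] += s->fresh[i];
		s->fresh[i] = 0;
	}
}

Merging happens only once a checkpoint is accepted; until then, the fresh counts can be discarded cheaply, which is exactly what the rewind path added later in this patch needs.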
@@ -3355,6 +3385,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 			      u8 * restrict out, size_t out_nbytes_avail)
 {
 	const u8 *in_next = in;
+	const u8 *in_block_begin = in_next;
 	const u8 *in_end = in_next + in_nbytes;
 	struct deflate_output_bitstream os;
 	const u8 *in_cur_base = in_next;
@@ -3362,23 +3393,29 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 		in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
 	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
 	unsigned nice_len = MIN(c->nice_match_length, max_len);
+	struct lz_match *cache_ptr = c->p.n.match_cache;
 	u32 next_hashes[2] = {0, 0};
 
 	deflate_init_output(&os, out, out_nbytes_avail);
 	bt_matchfinder_init(&c->p.n.bt_mf);
+	deflate_near_optimal_init_stats(c);
 
 	do {
 		/* Starting a new DEFLATE block */
-
-		struct lz_match *cache_ptr = c->p.n.match_cache;
-		const u8 * const in_block_begin = in_next;
 		const u8 * const in_max_block_end = choose_max_block_end(
-				in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
+				in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH);
+		const u8 *prev_end_block_check = NULL;
+		bool change_detected = false;
 		const u8 *next_observation = in_next;
 		unsigned min_len;
 
-		deflate_near_optimal_begin_block(c, in_block_begin == in);
-		min_len = calculate_min_match_len(in_next,
+		/*
+		 * Use the minimum match length heuristic to improve the
+		 * literal/match statistics gathered during matchfinding.
+		 * However, the actual near-optimal parse won't respect min_len,
+		 * as it can accurately assess the costs of different matches.
+		 */
+		min_len = calculate_min_match_len(in_block_begin,
 						  in_max_block_end - in_next,
 						  c->max_search_depth);
 
@@ -3390,7 +3427,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 		 * (2) Match cache may overflow.
 		 * (3) Block split heuristic says to split now.
 		 */
-		do {
+		for (;;) {
 			struct lz_match *matches;
 			unsigned best_len;
 			size_t remaining = in_end - in_next;
@@ -3436,13 +3473,12 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 				if (cache_ptr > matches)
 					best_len = cache_ptr[-1].length;
 			}
-			c->freqs.litlen[*in_next]++;
 			if (in_next >= next_observation) {
 				if (best_len >= min_len) {
 					observe_match(&c->split_stats,
 						      best_len);
 					next_observation = in_next + best_len;
-					c->p.n.match_len_freqs[best_len]++;
+					c->p.n.new_match_len_freqs[best_len]++;
 				} else {
 					observe_literal(&c->split_stats,
 							*in_next);
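The matchfinding loop's compound do/while condition becomes a for (;;) with one commented check per exit reason, because the caller now needs to know why the loop stopped: only a detected block split triggers the rewind path. A minimal sketch of this restructuring, with hypothetical names, assuming a callback decides the split:

#include <stdbool.h>
#include <stddef.h>

enum stop_reason { STOP_END_OF_INPUT, STOP_BUFFER_FULL, STOP_SPLIT };

static enum stop_reason
scan_block(const unsigned char *p, const unsigned char *end,
	   size_t buf_capacity, bool (*should_split)(unsigned char))
{
	size_t buffered = 0;

	for (;;) {
		/* End of input reached? */
		if (p >= end)
			return STOP_END_OF_INPUT;
		/* Output buffer may overflow? */
		if (buffered >= buf_capacity)
			return STOP_BUFFER_FULL;
		/* Heuristic says to split here? */
		if (should_split(*p))
			return STOP_SPLIT;
		p++;
		buffered++;
	}
}

Unlike the single combined condition, each break site can record state (here, the distinct return value; in the patch, the change_detected flag and prev_end_block_check pointer) before leaving the loop.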
@@ -3495,24 +3531,101 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 				}
 				cache_ptr->length = 0;
 				cache_ptr->offset = *in_next;
-				c->freqs.litlen[*in_next]++;
 				in_next++;
 				cache_ptr++;
 			} while (--best_len);
 		}
-		} while (in_next < in_max_block_end &&
-			 cache_ptr < &c->p.n.match_cache[MATCH_CACHE_LENGTH] &&
-			 !should_end_block(&c->split_stats,
-					   in_block_begin, in_next, in_end));
+			/* Maximum block length or end of input reached? */
+			if (in_next >= in_max_block_end)
+				break;
+			/* Match cache overflowed? */
+			if (cache_ptr >=
+			    &c->p.n.match_cache[MATCH_CACHE_LENGTH])
+				break;
+			/* Not ready to try to end the block (again)? */
+			if (!ready_to_check_block(&c->split_stats,
+						  in_block_begin, in_next,
+						  in_end))
+				continue;
+			/* Check if it would be worthwhile to end the block. */
+			if (do_end_block_check(&c->split_stats,
+					       in_next - in_block_begin)) {
+				change_detected = true;
+				break;
+			}
+			/* Ending the block doesn't seem worthwhile here. */
+			deflate_near_optimal_merge_stats(c);
+			prev_end_block_check = in_next;
+		}
 
 		/*
 		 * All the matches for this block have been cached. Now choose
-		 * the sequence of items to output and flush the block.
+		 * the precise end of the block and the sequence of items to
+		 * output to represent it, then flush the block.
 		 */
-		deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
-				       in_block_begin == in, in_next == in_end);
-		deflate_flush_block(c, &os, in_block_begin,
-				    in_next - in_block_begin,
-				    NULL, in_next == in_end);
+		if (change_detected && prev_end_block_check != NULL) {
+			/*
+			 * The block is being ended because a recent chunk of
+			 * data differs from the rest of the block. We could
+			 * end the block at 'in_next' like the greedy and lazy
+			 * compressors do, but that's not ideal since it would
+			 * include the differing chunk in the block. The
+			 * near-optimal compressor has time to do a better job.
+			 * Therefore, we rewind to just before the chunk, and
+			 * output a block that only goes up to there.
+			 *
+			 * We then set things up to correctly start the next
+			 * block, considering that some work has already been
+			 * done on it (some matches found and stats gathered).
+			 */
+			struct lz_match *orig_cache_ptr = cache_ptr;
+			const u8 *in_block_end = prev_end_block_check;
+			u32 block_length = in_block_end - in_block_begin;
+			bool is_first = (in_block_begin == in);
+			bool is_final = false;
+			u32 num_bytes_to_rewind = in_next - in_block_end;
+			size_t cache_len_rewound;
+
+			/* Rewind the match cache. */
+			do {
+				cache_ptr--;
+				cache_ptr -= cache_ptr->length;
+			} while (--num_bytes_to_rewind);
+			cache_len_rewound = orig_cache_ptr - cache_ptr;
+
+			deflate_optimize_block(c, in_block_begin, block_length,
+					       cache_ptr, is_first, is_final);
+			deflate_flush_block(c, &os, in_block_begin,
+					    block_length, NULL, is_final);
+			memmove(c->p.n.match_cache, cache_ptr,
+				cache_len_rewound * sizeof(*cache_ptr));
+			cache_ptr = &c->p.n.match_cache[cache_len_rewound];
+			deflate_near_optimal_save_stats(c);
+			/*
+			 * Clear the stats for the just-flushed block, leaving
+			 * just the stats for the beginning of the next block.
+			 */
+			deflate_near_optimal_clear_old_stats(c);
+			in_block_begin = in_block_end;
+		} else {
+			/*
+			 * The block is being ended for a reason other than a
+			 * differing data chunk being detected. Don't rewind at
+			 * all; just end the block at the current position.
+			 */
+			u32 block_length = in_next - in_block_begin;
+			bool is_first = (in_block_begin == in);
+			bool is_final = (in_next == in_end);
+
+			deflate_near_optimal_merge_stats(c);
+			deflate_optimize_block(c, in_block_begin, block_length,
+					       cache_ptr, is_first, is_final);
+			deflate_flush_block(c, &os, in_block_begin,
+					    block_length, NULL, is_final);
+			cache_ptr = &c->p.n.match_cache[0];
+			deflate_near_optimal_save_stats(c);
+			deflate_near_optimal_init_stats(c);
+			in_block_begin = in_next;
+		}
 	} while (in_next != in_end);
 	return deflate_flush_output(&os);
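The rewind depends on the match cache layout: each input position appends its matches followed by one trailer entry whose length field records how many matches were cached for that position, so stepping back one byte means skipping one trailer plus that many match entries. A simplified, self-contained sketch of the rewind-and-compact step, with stand-in types and names rather than libdeflate's:

#include <string.h>

struct cache_entry {
	unsigned length; /* trailer: number of matches for this position */
	unsigned offset;
};

/*
 * Rewind 'cache_ptr' by 'num_bytes' (>= 1) input positions and compact the
 * retained entries to the front of 'cache'; returns the new end pointer.
 */
static struct cache_entry *
rewind_cache(struct cache_entry *cache, struct cache_entry *cache_ptr,
	     unsigned num_bytes)
{
	struct cache_entry *orig = cache_ptr;
	size_t kept;

	do {
		cache_ptr--;			/* step over the trailer */
		cache_ptr -= cache_ptr->length;	/* step over the matches */
	} while (--num_bytes);
	kept = orig - cache_ptr;

	/* The rewound tail becomes the start of the next block's cache. */
	memmove(cache, cache_ptr, kept * sizeof(*cache_ptr));
	return cache + kept;
}

memmove() is used rather than memcpy() because the retained tail can overlap the front of the cache when only a short prefix was flushed.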