lib/deflate_decompress: further optimize match copying

2025-08-04 02:06:31 -04:00 · 2018-12-25 18:14:32 -06:00 · 2018-12-25 18:14:32 -06:00 · a25f3b86d7
commit a25f3b86d7
parent 6750e4f19d
1 changed file with 50 additions and 47 deletions
--- a/lib/decompress_template.h
+++ b/lib/decompress_template.h
@@ -256,6 +256,8 @@ have_decode_tables:
 		u32 entry;
 		u32 length;
 		u32 offset;
 		const u8 *src;
 		u8 *dst;
 		/* Decode a litlen symbol.  */
 		ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
@@ -328,67 +330,68 @@ have_decode_tables:
 		 * output buffer.  */
 		SAFETY_CHECK(offset <= out_next - (const u8 *)out);
-		/* Copy the match: 'length' bytes at 'out_next - offset' to
+		/*
-		 * 'out_next'.  */
+		 * Copy the match: 'length' bytes at 'out_next - offset' to
 		 * 'out_next', possibly overlapping.  If the match doesn't end
 		 * too close to the end of the buffer and offset >= WORDBYTES ||
 		 * offset == 1, take a fast path which copies a word at a time
 		 * -- potentially more than the length of the match, but that's
 		 * fine as long as we check for enough extra space.
 		 *
 		 * The remaining cases are not performance-critical so are
 		 * handled by a simple byte-by-byte copy.
 		 */
 		src = out_next - offset;
 		dst = out_next;
 		out_next += length;
 		if (UNALIGNED_ACCESS_IS_FAST &&
-		    length <= (3 * WORDBYTES) &&
+		    /* max overrun is writing 3 words for a min length match */
-		    offset >= WORDBYTES &&
+		    likely(out_end - out_next >=
-		    length + (3 * WORDBYTES) <= out_end - out_next)
+			   3 * WORDBYTES - DEFLATE_MIN_MATCH_LEN)) {
-		{
+			if (offset >= WORDBYTES) { /* words don't overlap? */
-			/* Fast case: short length, no overlaps if we copy one
+				copy_word_unaligned(src, dst);
-			 * word at a time, and we aren't getting too close to
+				src += WORDBYTES;
-			 * the end of the output array.  */
+				dst += WORDBYTES;
 			copy_word_unaligned(out_next - offset + (0 * WORDBYTES),
 					    out_next + (0 * WORDBYTES));
 			copy_word_unaligned(out_next - offset + (1 * WORDBYTES),
 					    out_next + (1 * WORDBYTES));
 			copy_word_unaligned(out_next - offset + (2 * WORDBYTES),
 					    out_next + (2 * WORDBYTES));
 		} else {
 			const u8 *src = out_next - offset;
 			u8 *dst = out_next;
 			u8 *end = out_next + length;
 			if (UNALIGNED_ACCESS_IS_FAST &&
 			    likely(out_end - end >= WORDBYTES - 1)) {
 				if (offset >= WORDBYTES) {
 				copy_word_unaligned(src, dst);
 				src += WORDBYTES;
 				dst += WORDBYTES;
 					if (dst < end) {
 				do {
 					copy_word_unaligned(src, dst);
 					src += WORDBYTES;
 					dst += WORDBYTES;
-						} while (dst < end);
+				} while (dst < out_next);
 					}
 			} else if (offset == 1) {
-					machine_word_t v = repeat_byte(*(dst - 1));
+				/* RLE encoding of previous byte, common if the
 				 * data contains many repeated bytes */
 				machine_word_t v = repeat_byte(*src);
 				store_word_unaligned(v, dst);
 				dst += WORDBYTES;
 				store_word_unaligned(v, dst);
 				dst += WORDBYTES;
 				do {
 					store_word_unaligned(v, dst);
 						src += WORDBYTES;
 					dst += WORDBYTES;
-					} while (dst < end);
+				} while (dst < out_next);
 			} else {
 				*dst++ = *src++;
 				*dst++ = *src++;
 				do {
 					*dst++ = *src++;
-					} while (dst < end);
+				} while (dst < out_next);
 			}
 		} else {
 			STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
 			*dst++ = *src++;
 			*dst++ = *src++;
 			do {
 				*dst++ = *src++;
-				} while (dst < end);
+			} while (dst < out_next);
 		}
 	}
 		out_next += length;
 	}
 block_done:
 	/* Finished decoding a block.  */