From a25f3b86d77c3ffd5ab0a87b5b9229e151ba5abb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 25 Dec 2018 18:14:32 -0600 Subject: [PATCH] lib/deflate_decompress: further optimize match copying --- lib/decompress_template.h | 97 ++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/lib/decompress_template.h b/lib/decompress_template.h index b91ac9a..c6bcf9f 100644 --- a/lib/decompress_template.h +++ b/lib/decompress_template.h @@ -256,6 +256,8 @@ have_decode_tables: u32 entry; u32 length; u32 offset; + const u8 *src; + u8 *dst; /* Decode a litlen symbol. */ ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN); @@ -328,65 +330,66 @@ have_decode_tables: * output buffer. */ SAFETY_CHECK(offset <= out_next - (const u8 *)out); - /* Copy the match: 'length' bytes at 'out_next - offset' to - * 'out_next'. */ + /* + * Copy the match: 'length' bytes at 'out_next - offset' to + * 'out_next', possibly overlapping. If the match doesn't end + * too close to the end of the buffer and offset >= WORDBYTES || + * offset == 1, take a fast path which copies a word at a time + * -- potentially more than the length of the match, but that's + * fine as long as we check for enough extra space. + * + * The remaining cases are not performance-critical so are + * handled by a simple byte-by-byte copy. + */ + + src = out_next - offset; + dst = out_next; + out_next += length; if (UNALIGNED_ACCESS_IS_FAST && - length <= (3 * WORDBYTES) && - offset >= WORDBYTES && - length + (3 * WORDBYTES) <= out_end - out_next) - { - /* Fast case: short length, no overlaps if we copy one - * word at a time, and we aren't getting too close to - * the end of the output array. */ - copy_word_unaligned(out_next - offset + (0 * WORDBYTES), - out_next + (0 * WORDBYTES)); - copy_word_unaligned(out_next - offset + (1 * WORDBYTES), - out_next + (1 * WORDBYTES)); - copy_word_unaligned(out_next - offset + (2 * WORDBYTES), - out_next + (2 * WORDBYTES)); - } else { - const u8 *src = out_next - offset; - u8 *dst = out_next; - u8 *end = out_next + length; - - if (UNALIGNED_ACCESS_IS_FAST && - likely(out_end - end >= WORDBYTES - 1)) { - if (offset >= WORDBYTES) { + /* max overrun is writing 3 words for a min length match */ + likely(out_end - out_next >= + 3 * WORDBYTES - DEFLATE_MIN_MATCH_LEN)) { + if (offset >= WORDBYTES) { /* words don't overlap? */ + copy_word_unaligned(src, dst); + src += WORDBYTES; + dst += WORDBYTES; + copy_word_unaligned(src, dst); + src += WORDBYTES; + dst += WORDBYTES; + do { copy_word_unaligned(src, dst); src += WORDBYTES; dst += WORDBYTES; - if (dst < end) { - do { - copy_word_unaligned(src, dst); - src += WORDBYTES; - dst += WORDBYTES; - } while (dst < end); - } - } else if (offset == 1) { - machine_word_t v = repeat_byte(*(dst - 1)); - do { - store_word_unaligned(v, dst); - src += WORDBYTES; - dst += WORDBYTES; - } while (dst < end); - } else { - *dst++ = *src++; - *dst++ = *src++; - do { - *dst++ = *src++; - } while (dst < end); - } + } while (dst < out_next); + } else if (offset == 1) { + /* RLE encoding of previous byte, common if the + * data contains many repeated bytes */ + machine_word_t v = repeat_byte(*src); + + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + do { + store_word_unaligned(v, dst); + dst += WORDBYTES; + } while (dst < out_next); } else { *dst++ = *src++; *dst++ = *src++; do { *dst++ = *src++; - } while (dst < end); + } while (dst < out_next); } + } else { + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); } - - out_next += length; } block_done: