diff --git a/src/deflate_compress.c b/src/deflate_compress.c index 45b091e..61e6eac 100644 --- a/src/deflate_compress.c +++ b/src/deflate_compress.c @@ -1491,7 +1491,7 @@ deflate_write_end_of_block(struct deflate_output_bitstream *os, static void deflate_write_block(struct deflate_compressor * restrict c, struct deflate_output_bitstream * restrict os, - const u8 * restrict block_begin, u32 items_remaining, + const u8 * restrict block_begin, s32 items_remaining, bool is_final_block) { struct deflate_codes *codes; @@ -1588,41 +1588,37 @@ deflate_compress_greedy(struct deflate_compressor * restrict c, const u8 *in_next = in; const u8 *in_end = in_next + in_nbytes; struct deflate_output_bitstream os; - const u8 *block_begin = in_next; - struct deflate_sequence *next_seq = c->sequences; - u32 litrunlen = 0; - u32 items_remaining = MAX_ITEMS_PER_BLOCK; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); u32 next_hashes[2] = {0, 0}; deflate_init_output(&os, out, out_nbytes_avail); deflate_reset_symbol_frequencies(c); + hc_matchfinder_init(&c->hc_mf); - /* The outer loop repeats every WINDOW_SIZE bytes and handles the - * sliding window. */ do { - const u8 *in_cur_base; - const u8 *in_cur_end; + /* Starting a new DEFLATE block. */ - if (in == in_next) - hc_matchfinder_init(&c->hc_mf); - else - hc_matchfinder_slide_window(&c->hc_mf); + const u8 * const in_block_begin = in_next; + u32 litrunlen = 0; + struct deflate_sequence *next_seq = c->sequences; + s32 items_remaining = MAX_ITEMS_PER_BLOCK; - in_cur_base = in_next; - in_cur_end = in_next + MIN(in_end - in_next, - MATCHFINDER_WINDOW_SIZE); do { - unsigned max_len; - unsigned nice_len; - unsigned length; - unsigned offset; + u32 length; + u32 offset; - max_len = MIN(in_cur_end - in_next, DEFLATE_MAX_MATCH_LEN); - nice_len = MIN(max_len, c->nice_match_length); + /* Decrease the maximum and nice match lengths if we're + * approaching the end of the input buffer. */ + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = MIN(nice_len, max_len); + } length = hc_matchfinder_longest_match(&c->hc_mf, - in_cur_base, - in_next - in_cur_base, + &in_cur_base, + in_next, DEFLATE_MIN_MATCH_LEN - 1, max_len, nice_len, @@ -1635,9 +1631,9 @@ deflate_compress_greedy(struct deflate_compressor * restrict c, deflate_choose_match(c, length, offset, &litrunlen, &next_seq); in_next = hc_matchfinder_skip_positions(&c->hc_mf, - in_cur_base, - in_next + 1 - in_cur_base, - in_end - in_cur_base, + &in_cur_base, + in_next + 1, + in_end, length - 1, next_hashes); } else { @@ -1646,28 +1642,12 @@ deflate_compress_greedy(struct deflate_compressor * restrict c, } /* Check if it's time to output another block. */ - if (--items_remaining == 0) { - deflate_finish_sequence(next_seq, litrunlen); - deflate_write_block(c, &os, block_begin, - items_remaining, - in_next == in_end); + } while (in_next != in_end && --items_remaining > 0); - block_begin = in_next; - next_seq = c->sequences; - litrunlen = 0; - items_remaining = MAX_ITEMS_PER_BLOCK; - } - - } while (in_next != in_cur_end); - - } while (in_next != in_end); - - /* Output the last block. */ - if (items_remaining != MAX_ITEMS_PER_BLOCK) { deflate_finish_sequence(next_seq, litrunlen); - deflate_write_block(c, &os, block_begin, - items_remaining, true); - } + deflate_write_block(c, &os, in_block_begin, + items_remaining, in_next == in_end); + } while (in_next != in_end); return deflate_flush_output(&os); } @@ -1685,48 +1665,38 @@ deflate_compress_lazy(struct deflate_compressor * restrict c, const u8 *in_next = in; const u8 *in_end = in_next + in_nbytes; struct deflate_output_bitstream os; - const u8 *block_begin = in_next; - struct deflate_sequence *next_seq = c->sequences; - u32 litrunlen = 0; - u32 items_remaining = MAX_ITEMS_PER_BLOCK; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); u32 next_hashes[2] = {0, 0}; deflate_init_output(&os, out, out_nbytes_avail); deflate_reset_symbol_frequencies(c); + hc_matchfinder_init(&c->hc_mf); - /* The outer loop repeats every WINDOW_SIZE bytes and handles the - * sliding window. */ do { - const u8 *in_cur_base; - const u8 *in_cur_end; - unsigned max_len; - unsigned nice_len; + /* Starting a new DEFLATE block. */ - if (in == in_next) - hc_matchfinder_init(&c->hc_mf); - else - hc_matchfinder_slide_window(&c->hc_mf); + const u8 * const in_block_begin = in_next; + u32 litrunlen = 0; + struct deflate_sequence *next_seq = c->sequences; + s32 items_remaining = MAX_ITEMS_PER_BLOCK; - in_cur_base = in_next; - in_cur_end = in_next + MIN(in_end - in_next, - MATCHFINDER_WINDOW_SIZE); - max_len = DEFLATE_MAX_MATCH_LEN; - nice_len = MIN(c->nice_match_length, max_len); do { unsigned cur_len; unsigned cur_offset; unsigned next_len; unsigned next_offset; - if (unlikely(in_cur_end - in_next < DEFLATE_MAX_MATCH_LEN)) { - max_len = in_cur_end - in_next; - nice_len = MIN(max_len, nice_len); + if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) { + max_len = in_end - in_next; + nice_len = MIN(nice_len, max_len); } /* Find the longest match at the current position. */ cur_len = hc_matchfinder_longest_match(&c->hc_mf, - in_cur_base, - in_next - in_cur_base, + &in_cur_base, + in_next, DEFLATE_MIN_MATCH_LEN - 1, max_len, nice_len, @@ -1738,7 +1708,7 @@ deflate_compress_lazy(struct deflate_compressor * restrict c, if (cur_len < DEFLATE_MIN_MATCH_LEN) { /* No match found. Choose a literal. */ deflate_choose_literal(c, *(in_next - 1), &litrunlen); - goto check_block_and_continue; + continue; } have_cur_match: @@ -1750,12 +1720,12 @@ deflate_compress_lazy(struct deflate_compressor * restrict c, deflate_choose_match(c, cur_len, cur_offset, &litrunlen, &next_seq); in_next = hc_matchfinder_skip_positions(&c->hc_mf, - in_cur_base, - in_next - in_cur_base, - in_end - in_cur_base, + &in_cur_base, + in_next, + in_end, cur_len - 1, next_hashes); - goto check_block_and_continue; + continue; } /* @@ -1774,13 +1744,13 @@ deflate_compress_lazy(struct deflate_compressor * restrict c, * have two call sites, with longest_match() inlined at * each. */ - if (unlikely(in_cur_end - in_next < DEFLATE_MAX_MATCH_LEN)) { - max_len = in_cur_end - in_next; - nice_len = MIN(max_len, nice_len); + if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) { + max_len = in_end - in_next; + nice_len = MIN(nice_len, max_len); } next_len = hc_matchfinder_longest_match(&c->hc_mf, - in_cur_base, - in_next - in_cur_base, + &in_cur_base, + in_next, cur_len, max_len, nice_len, @@ -1794,57 +1764,32 @@ deflate_compress_lazy(struct deflate_compressor * restrict c, * Output a literal. Then the next match * becomes the current match. */ deflate_choose_literal(c, *(in_next - 2), &litrunlen); - if (--items_remaining == 0) { - deflate_finish_sequence(next_seq, litrunlen); - deflate_write_block(c, &os, block_begin, - items_remaining, - in_next == in_end); - - block_begin = in_next - 1; - next_seq = c->sequences; - litrunlen = 0; - items_remaining = MAX_ITEMS_PER_BLOCK; - } + items_remaining--; cur_len = next_len; cur_offset = next_offset; goto have_cur_match; - } else { - /* No longer match at the next position. - * Output the current match. */ - deflate_choose_match(c, cur_len, cur_offset, - &litrunlen, &next_seq); - in_next = hc_matchfinder_skip_positions(&c->hc_mf, - in_cur_base, - in_next - in_cur_base, - in_end - in_cur_base, - cur_len - 2, - next_hashes); - goto check_block_and_continue; } - check_block_and_continue: + /* No longer match at the next position. + * Output the current match. */ + deflate_choose_match(c, cur_len, cur_offset, + &litrunlen, &next_seq); + in_next = hc_matchfinder_skip_positions(&c->hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 2, + next_hashes); + /* Check if it's time to output another block. */ - if (--items_remaining == 0) { - deflate_finish_sequence(next_seq, litrunlen); - deflate_write_block(c, &os, block_begin, - items_remaining, - in_next == in_end); + } while (in_next != in_end && --items_remaining > 0); - block_begin = in_next; - next_seq = c->sequences; - litrunlen = 0; - items_remaining = MAX_ITEMS_PER_BLOCK; - } - } while (in_next != in_cur_end); + deflate_finish_sequence(next_seq, litrunlen); + deflate_write_block(c, &os, in_block_begin, + items_remaining, in_next == in_end); } while (in_next != in_end); - /* Output the last block. */ - if (items_remaining != MAX_ITEMS_PER_BLOCK) { - deflate_finish_sequence(next_seq, litrunlen); - deflate_write_block(c, &os, block_begin, items_remaining, true); - } - return deflate_flush_output(&os); } diff --git a/src/hc_matchfinder.h b/src/hc_matchfinder.h index 3cb0960..8b27eaa 100644 --- a/src/hc_matchfinder.h +++ b/src/hc_matchfinder.h @@ -138,9 +138,10 @@ hc_matchfinder_slide_window(struct hc_matchfinder *mf) * * @mf * The matchfinder structure. - * @in_base - * Pointer to the next byte in the input buffer to process _at the last - * time hc_matchfinder_init() or hc_matchfinder_slide_window() was called_. + * @in_base_p + * Location of a pointer which points to the place in the input data the + * matchfinder currently stores positions relative to. This may be updated + * by this function. * @cur_pos * The current position in the input buffer relative to @in_base (the * position of the sequence being matched against). @@ -165,16 +166,23 @@ hc_matchfinder_slide_window(struct hc_matchfinder *mf) */ static forceinline u32 hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, - const u8 * const restrict in_base, - const ptrdiff_t cur_pos, + const u8 ** const restrict in_base_p, + const u8 * const restrict in_next, u32 best_len, const u32 max_len, const u32 nice_len, const u32 max_search_depth, - u32 next_hashes[restrict 2], + u32 * const restrict next_hashes, u32 * const restrict offset_ret) { - const u8 *in_next = in_base + cur_pos; + u32 cur_pos = in_next - *in_base_p; + if (cur_pos == MATCHFINDER_WINDOW_SIZE) { + hc_matchfinder_slide_window(mf); + *in_base_p += MATCHFINDER_WINDOW_SIZE; + cur_pos = 0; + } + + const u8 * const in_base = *in_base_p; u32 depth_remaining = max_search_depth; const u8 *best_matchptr = in_next; const mf_pos_t cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; @@ -317,9 +325,10 @@ out: * * @mf * The matchfinder structure. - * @in_base - * Pointer to the next byte in the input buffer to process _at the last - * time hc_matchfinder_init() or hc_matchfinder_slide_window() was called_. + * @in_base_p + * Location of a pointer which points to the place in the input data the + * matchfinder currently stores positions relative to. This may be updated + * by this function. * @cur_pos * The current position in the input buffer relative to @in_base. * @end_pos @@ -335,38 +344,44 @@ out: */ static forceinline const u8 * hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf, - const u8 * const restrict in_base, - const ptrdiff_t cur_pos, - const ptrdiff_t end_pos, + const u8 ** const restrict in_base_p, + const u8 *in_next, + const u8 * const in_end, const u32 count, - u32 next_hashes[restrict 2]) + u32 * const restrict next_hashes) { - const u8 *in_next = in_base + cur_pos; - const u8 * const stop_ptr = in_next + count; + u32 cur_pos; + u32 hash3, hash4; + u32 next_seq3, next_seq4; + u32 remaining = count; - if (likely(count + 5 <= end_pos - cur_pos)) { - u32 hash3, hash4; - u32 next_seq3, next_seq4; + if (unlikely(count + 5 > in_end - in_next)) + return &in_next[count]; - hash3 = next_hashes[0]; - hash4 = next_hashes[1]; - do { - mf->hash3_tab[hash3] = in_next - in_base; - mf->next_tab[in_next - in_base] = mf->hash4_tab[hash4]; - mf->hash4_tab[hash4] = in_next - in_base; + cur_pos = in_next - *in_base_p; + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + do { + if (cur_pos == MATCHFINDER_WINDOW_SIZE) { + hc_matchfinder_slide_window(mf); + *in_base_p += MATCHFINDER_WINDOW_SIZE; + cur_pos = 0; + } + mf->hash3_tab[hash3] = cur_pos; + mf->next_tab[cur_pos] = mf->hash4_tab[hash4]; + mf->hash4_tab[hash4] = cur_pos; - next_seq4 = load_u32_unaligned(++in_next); - next_seq3 = loaded_u32_to_u24(next_seq4); - hash3 = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER); - hash4 = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER); + next_seq4 = load_u32_unaligned(++in_next); + next_seq3 = loaded_u32_to_u24(next_seq4); + hash3 = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER); + hash4 = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER); + cur_pos++; + } while (--remaining); - } while (in_next != stop_ptr); + prefetchw(&mf->hash3_tab[hash3]); + prefetchw(&mf->hash4_tab[hash4]); + next_hashes[0] = hash3; + next_hashes[1] = hash4; - prefetchw(&mf->hash3_tab[hash3]); - prefetchw(&mf->hash4_tab[hash4]); - next_hashes[0] = hash3; - next_hashes[1] = hash4; - } - - return stop_ptr; + return in_next; }