diff --git a/lib/deflate_compress.c b/lib/deflate_compress.c index 23dfd28..4b3ee36 100644 --- a/lib/deflate_compress.c +++ b/lib/deflate_compress.c @@ -50,14 +50,6 @@ */ #define SUPPORT_NEAR_OPTIMAL_PARSING 1 -/* - * If this parameter is defined to 1, then the compressor will maintain a full - * map from match offsets to offset slots, rather than a condensed map. This - * will usually improve performance, especially for the near-optimal parsing - * algorithm. However, it will use an additional 32257 bytes of memory. - */ -#define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING - /* * This is the minimum block length, in uncompressed bytes, which the compressor * will use. This should be a value below which using shorter blocks is very @@ -211,6 +203,49 @@ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { 27, 27, 28, }; +/* + * A condensed table which maps offset => offset slot as follows: + * + * offset <= 256: deflate_offset_slot[offset] + * offset > 256: deflate_offset_slot[256 + ((offset - 1) >> 7)] + * + * This table was generated by scripts/gen_offset_slot_map.py. 
+ */ +static const u8 deflate_offset_slot[512] = { + 0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, + 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, + 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, +}; + /* The order in which precode codeword lengths are stored */ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 @@ -359,23 +394,6 @@ struct libdeflate_compressor { /* Block split statistics for the currently pending block */ struct block_split_stats split_stats; - /* A table for fast lookups of offset slot by match offset. - * - * If the full table is being used, it is a direct mapping from offset - * to offset slot. - * - * If the condensed table is being used, the first 256 entries map - * directly to the offset slots of offsets 1 through 256. The next 256 - * entries map to the offset slots for the remaining offsets, stepping - * through the offsets with a stride of 128. This relies on the fact - * that each of the remaining offset slots contains at least 128 offsets - * and has an offset base that is a multiple of 128. */ -#if USE_FULL_OFFSET_SLOT_FAST - u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1]; -#else - u8 offset_slot_fast[512]; -#endif - /* The "nice" match length: if a match of this length is found, choose * it immediately without further consideration. */ unsigned nice_match_length; @@ -480,6 +498,18 @@ struct libdeflate_compressor { /* The current cost model being used. */ struct deflate_costs costs; + /* + * A table that maps match offset to offset slot. This + * differs from deflate_offset_slot[] in that this is a + * full map, not a condensed one. The full map is more + * appropriate for the near-optimal parser, since the + * near-optimal parser does more offset => offset_slot + * translations, it doesn't intersperse them with + * matchfinding (so cache evictions are less of a + * concern), and it uses more memory anyway. 
+ */ + u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1]; + /* Literal/match statistics saved from previous block */ u32 prev_observations[NUM_OBSERVATION_TYPES]; u32 prev_num_observations; @@ -1290,17 +1320,21 @@ deflate_init_static_codes(struct libdeflate_compressor *c) deflate_make_huffman_codes(&c->freqs, &c->static_codes); } -/* Return the offset slot for the specified match offset. */ +/* Return the offset slot for the given match offset, using the small map. */ static forceinline unsigned -deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset) +deflate_get_offset_slot(unsigned offset) { -#if USE_FULL_OFFSET_SLOT_FAST - return c->offset_slot_fast[offset]; -#else +#if 1 if (offset <= 256) - return c->offset_slot_fast[offset - 1]; + return deflate_offset_slot[offset]; else - return c->offset_slot_fast[256 + ((offset - 1) >> 7)]; + return deflate_offset_slot[256 + ((offset - 1) >> 7)]; +#else /* Branchless version */ + u32 i1 = offset; + u32 i2 = 256 + ((offset - 1) >> 7); + u32 is_small = (s32)(offset - 257) >> 31; + + return deflate_offset_slot[(i1 & is_small) ^ (i2 & ~is_small)]; #endif } @@ -1670,7 +1704,7 @@ deflate_write_item_list(struct deflate_output_bitstream *os, /* Match offset */ - offset_slot = deflate_get_offset_slot(c, offset); + offset_slot = c->p.n.offset_slot_full[offset]; deflate_add_bits(os, codes->codewords.offset[offset_slot], codes->lens.offset[offset_slot]); @@ -1878,7 +1912,7 @@ deflate_choose_match(struct libdeflate_compressor *c, { struct deflate_sequence *seq = *seq_p; unsigned length_slot = deflate_length_slot[length]; - unsigned offset_slot = deflate_get_offset_slot(c, offset); + unsigned offset_slot = deflate_get_offset_slot(offset); c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++; c->freqs.offset[offset_slot]++; @@ -2572,7 +2606,7 @@ deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) /* Match */ c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + deflate_length_slot[length]]++; - 
c->freqs.offset[deflate_get_offset_slot(c, offset)]++; + c->freqs.offset[c->p.n.offset_slot_full[offset]]++; } cur_node += length; } while (cur_node != end_node); @@ -3004,7 +3038,7 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c, len = DEFLATE_MIN_MATCH_LEN; do { offset = match->offset; - offset_slot = deflate_get_offset_slot(c, offset); + offset_slot = c->p.n.offset_slot_full[offset]; offset_cost = c->p.n.costs.offset_slot[offset_slot]; do { cost_to_end = offset_cost + @@ -3285,11 +3319,9 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, return deflate_flush_output(&os); } -#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - -/* Initialize c->offset_slot_fast. */ +/* Initialize c->p.n.offset_slot_full. */ static void -deflate_init_offset_slot_fast(struct libdeflate_compressor *c) +deflate_init_offset_slot_full(struct libdeflate_compressor *c) { unsigned offset_slot; unsigned offset; @@ -3300,27 +3332,15 @@ deflate_init_offset_slot_fast(struct libdeflate_compressor *c) offset_slot++) { offset = deflate_offset_slot_base[offset_slot]; - #if USE_FULL_OFFSET_SLOT_FAST offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); do { - c->offset_slot_fast[offset] = offset_slot; + c->p.n.offset_slot_full[offset] = offset_slot; } while (++offset != offset_end); - #else - if (offset <= 256) { - offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); - do { - c->offset_slot_fast[offset - 1] = offset_slot; - } while (++offset != offset_end); - } else { - offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); - do { - c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot; - } while ((offset += (1 << 7)) != offset_end); - } - #endif } } +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI libdeflate_alloc_compressor(int compression_level) { @@ -3412,12 +3432,14 @@ libdeflate_alloc_compressor(int compression_level) c->max_search_depth = 35; 
c->nice_match_length = 75; c->p.n.num_optim_passes = 2; + deflate_init_offset_slot_full(c); break; case 11: c->impl = deflate_compress_near_optimal; c->max_search_depth = 70; c->nice_match_length = 150; c->p.n.num_optim_passes = 3; + deflate_init_offset_slot_full(c); break; case 12: default: c->impl = deflate_compress_near_optimal; c->max_search_depth = 150; c->nice_match_length = DEFLATE_MAX_MATCH_LEN; c->p.n.num_optim_passes = 4; + deflate_init_offset_slot_full(c); break; #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ } - deflate_init_offset_slot_fast(c); deflate_init_static_codes(c); return c; diff --git a/scripts/gen_offset_slot_map.py b/scripts/gen_offset_slot_map.py new file mode 100755 index 0000000..500332c --- /dev/null +++ b/scripts/gen_offset_slot_map.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +# +# This script generates the deflate_offset_slot[] array, which is a condensed +# map from offsets to offset slots. + +DEFLATE_OFFSET_SLOT_BASE = [ + 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 , + 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 , + 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 , + 4097 , 6145 , 8193 , 12289 , 16385 , 24577 , +] + +DEFLATE_EXTRA_OFFSET_BITS = [ + 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 , + 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , + 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 , + 11 , 11 , 12 , 12 , 13 , 13 , +] + +offset_slot_map = [0] * 512 + +for offset_slot, offset_base in enumerate(DEFLATE_OFFSET_SLOT_BASE): + num_extra_bits = DEFLATE_EXTRA_OFFSET_BITS[offset_slot] + offset_end = offset_base + (1 << num_extra_bits) + if offset_base <= 256: + for offset in range(offset_base, offset_end): + offset_slot_map[offset] = offset_slot + else: + for offset in range(offset_base, offset_end, 128): + offset_slot_map[256 + ((offset - 1) >> 7)] = offset_slot + +print('static const u8 deflate_offset_slot[512] = {') +for i in range(0, len(offset_slot_map), 16): + print('\t', end='') + for j, v in enumerate(offset_slot_map[i:i+16]): + print(f'{v},',
end='') + if j == 15: + print('') + else: + print(' ', end='') +print('};')