diff --git a/src/deflate_compress.c b/src/deflate_compress.c index d684121..92a1ca6 100644 --- a/src/deflate_compress.c +++ b/src/deflate_compress.c @@ -140,6 +140,11 @@ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { 27, 27, 28, }; +/* The order in which precode codeword lengths are stored */ +static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 +}; + /* Codewords for the DEFLATE Huffman codes. */ struct deflate_codewords { u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; @@ -295,7 +300,7 @@ struct deflate_compressor { /* Dynamic Huffman codes for the current block */ struct deflate_codes codes; - /* Static Huffman codes set just before first use */ + /* Static Huffman codes */ struct deflate_codes static_codes; /* A table for fast lookups of offset slot by match offset. @@ -326,11 +331,15 @@ struct deflate_compressor { /* The compression level with which this compressor was created. */ unsigned compression_level; - /* Temporary arrays for Huffman code output */ + /* Temporary space for Huffman code output */ u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS]; u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS]; unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS]; + unsigned num_litlen_syms; + unsigned num_offset_syms; + unsigned num_explicit_lens; + unsigned num_precode_items; union { /* Data for greedy or lazy parsing */ @@ -460,6 +469,14 @@ deflate_flush_bits(struct deflate_output_bitstream *os) } } +/* Align the bitstream on a byte boundary. */ +static forceinline void +deflate_align_bitstream(struct deflate_output_bitstream *os) +{ + os->bitcount += -os->bitcount & 7; + deflate_flush_bits(os); +} + /* * Flush any remaining bits to the output buffer if needed. Return the total * number of bytes written to the output buffer, or 0 if an overflow occurred. @@ -1135,6 +1152,20 @@ deflate_init_static_codes(struct deflate_compressor *c) deflate_make_huffman_codes(&c->freqs, &c->static_codes); } +/* Return the offset slot for the specified match offset. */ +static forceinline unsigned +deflate_get_offset_slot(struct deflate_compressor *c, unsigned offset) +{ +#if USE_FULL_OFFSET_SLOT_FAST + return c->offset_slot_fast[offset]; +#else + if (offset <= 256) + return c->offset_slot_fast[offset - 1]; + else + return c->offset_slot_fast[256 + ((offset - 1) >> 7)]; +#endif +} + /* Write the header fields common to all DEFLATE block types. */ static void deflate_write_block_header(struct deflate_output_bitstream *os, @@ -1157,6 +1188,9 @@ deflate_compute_precode_items(const u8 lens[restrict], unsigned extra_bits; u8 len; + memset(precode_freqs, 0, + DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); + itemptr = precode_items; run_start = 0; do { @@ -1219,67 +1253,102 @@ deflate_compute_precode_items(const u8 lens[restrict], } /* - * Output a list of Huffman codeword lengths in compressed form. - * - * The codeword lengths are compressed using a separate Huffman code, the - * "precode", which contains a symbol for each possible codeword length in the - * larger code as well as several special symbols to represent repeated codeword - * lengths (a form of run-length encoding). The precode is itself constructed - * in canonical form, and its codeword lengths are represented literally in 19 - * 3-bit fields that immediately precede the compressed codeword lengths of the - * larger code. + * Huffman codeword lengths for dynamic Huffman blocks are compressed using a + * separate Huffman code, the "precode", which contains a symbol for each + * possible codeword length in the larger code as well as several special + * symbols to represent repeated codeword lengths (a form of run-length + * encoding). The precode is itself constructed in canonical form, and its + * codeword lengths are represented literally in 19 3-bit fields that + * immediately precede the compressed codeword lengths of the larger code. */ -static void -deflate_write_compressed_lens(struct deflate_compressor *c, - struct deflate_output_bitstream *os, - const u8 lens[], unsigned num_lens) -{ - unsigned num_precode_items; - unsigned precode_item; - unsigned precode_sym; - unsigned num_explicit_lens; - unsigned i; - static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { - 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 - }; - for (i = 0; i < DEFLATE_NUM_PRECODE_SYMS; i++) - c->precode_freqs[i] = 0; +/* Precompute the information needed to output Huffman codes. */ +static void +deflate_precompute_huffman_header(struct deflate_compressor *c) +{ + /* Compute how many litlen and offset symbols are needed. */ + + for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; + c->num_litlen_syms > 257; + c->num_litlen_syms--) + if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0) + break; + + for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; + c->num_offset_syms > 1; + c->num_offset_syms--) + if (c->codes.lens.offset[c->num_offset_syms - 1] != 0) + break; + + /* If we're not using the full set of literal/length codeword lengths, + * then temporarily move the offset codeword lengths over so that the + * literal/length and offset codeword lengths are contiguous. */ + + STATIC_ASSERT(offsetof(struct deflate_lens, offset) == + DEFLATE_NUM_LITLEN_SYMS); + + if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { + memmove(&c->codes.lens.all[c->num_litlen_syms], + &c->codes.lens.all[DEFLATE_NUM_LITLEN_SYMS], + c->num_offset_syms * sizeof(c->codes.lens.all[0])); + } + /* Compute the "items" (RLE / literal tokens and extra bits) with which - * the codeword lengths in the larger code will be output. */ - num_precode_items = deflate_compute_precode_items(lens, - num_lens, - c->precode_freqs, - c->precode_items); + * the codeword lengths in the larger code will be output. */ + c->num_precode_items = + deflate_compute_precode_items(c->codes.lens.all, + c->num_litlen_syms + + c->num_offset_syms, + c->precode_freqs, + c->precode_items); - /* Build the precode. */ + /* Build the precode. */ STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, MAX_PRE_CODEWORD_LEN, c->precode_freqs, c->precode_lens, c->precode_codewords); - /* Count how many precode lengths we actually need to output. */ - for (num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; - num_explicit_lens > 4; - num_explicit_lens--) - if (c->precode_lens[deflate_precode_lens_permutation[num_explicit_lens - 1]] != 0) + /* Count how many precode lengths we actually need to output. */ + for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; + c->num_explicit_lens > 4; + c->num_explicit_lens--) + if (c->precode_lens[deflate_precode_lens_permutation[ + c->num_explicit_lens - 1]] != 0) break; - deflate_add_bits(os, num_explicit_lens - 4, 4); + /* Restore the offset codeword lengths if needed. */ + if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { + memmove(&c->codes.lens.all[DEFLATE_NUM_LITLEN_SYMS], + &c->codes.lens.all[c->num_litlen_syms], + c->num_offset_syms * sizeof(c->codes.lens.all[0])); + } +} + +/* Output the Huffman codes. */ +static void +deflate_write_huffman_header(struct deflate_compressor *c, + struct deflate_output_bitstream *os) +{ + unsigned i; + + deflate_add_bits(os, c->num_litlen_syms - 257, 5); + deflate_add_bits(os, c->num_offset_syms - 1, 5); + deflate_add_bits(os, c->num_explicit_lens - 4, 4); deflate_flush_bits(os); /* Output the lengths of the codewords in the precode. */ - for (i = 0; i < num_explicit_lens; i++) { - deflate_add_bits(os, c->precode_lens[deflate_precode_lens_permutation[i]], 3); + for (i = 0; i < c->num_explicit_lens; i++) { + deflate_add_bits(os, c->precode_lens[ + deflate_precode_lens_permutation[i]], 3); deflate_flush_bits(os); } /* Output the encoded lengths of the codewords in the larger code. */ - for (i = 0; i < num_precode_items; i++) { - precode_item = c->precode_items[i]; - precode_sym = precode_item & 0x1F; + for (i = 0; i < c->num_precode_items; i++) { + unsigned precode_item = c->precode_items[i]; + unsigned precode_sym = precode_item & 0x1F; deflate_add_bits(os, c->precode_codewords[precode_sym], c->precode_lens[precode_sym]); if (precode_sym >= 16) { @@ -1295,64 +1364,11 @@ deflate_write_compressed_lens(struct deflate_compressor *c, } } -/* - * Output the specified Huffman codes. - * This is used for dynamic Huffman blocks. - */ -static void -deflate_write_huffman_codes(struct deflate_compressor *c, - struct deflate_output_bitstream *os) -{ - unsigned num_litlen_syms; - unsigned num_offset_syms; - - /* We only need to output up to the highest-valued symbol actually used. */ - - for (num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; - num_litlen_syms > 257; - num_litlen_syms--) - if (c->codes.lens.litlen[num_litlen_syms - 1] != 0) - break; - - for (num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; - num_offset_syms > 1; - num_offset_syms--) - if (c->codes.lens.offset[num_offset_syms - 1] != 0) - break; - - deflate_add_bits(os, num_litlen_syms - 257, 5); - deflate_add_bits(os, num_offset_syms - 1, 5); - deflate_flush_bits(os); - - /* If we're not outputting the full set of literal/length codeword - * lengths, temporarily move the offset codeword lengths over so that - * the literal/length and offset codeword lengths are contiguous. */ - - STATIC_ASSERT(offsetof(struct deflate_lens, offset) == - DEFLATE_NUM_LITLEN_SYMS); - - if (num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) - memmove(&c->codes.lens.all[num_litlen_syms], - &c->codes.lens.all[DEFLATE_NUM_LITLEN_SYMS], - num_offset_syms * sizeof(c->codes.lens.all[0])); - - /* Output the codeword lengths. */ - - deflate_write_compressed_lens(c, os, c->codes.lens.all, - num_litlen_syms + num_offset_syms); - - /* Restore the offset codeword lengths if needed. */ - if (num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) - memmove(&c->codes.lens.all[DEFLATE_NUM_LITLEN_SYMS], - &c->codes.lens.all[num_litlen_syms], - num_offset_syms * sizeof(c->codes.lens.all[0])); -} - static void deflate_write_sequences(struct deflate_output_bitstream * restrict os, - const u8 * restrict in_next, + const struct deflate_codes * restrict codes, const struct deflate_sequence sequences[restrict], - const struct deflate_codes * restrict codes) + const u8 * restrict in_next) { const struct deflate_sequence *seq = sequences; @@ -1471,6 +1487,71 @@ deflate_write_sequences(struct deflate_output_bitstream * restrict os, } } +/* + * Follow the minimum-cost path in the graph of possible match/literal choices + * for the current block and write out the matches/literals using the specified + * Huffman codes. + * + * Note: this is slightly duplicated with deflate_write_sequences(), the reason + * being that we don't want to waste time translating between intermediate + * match/literal representations. + */ +static void +deflate_write_item_list(struct deflate_output_bitstream *os, + const struct deflate_codes *codes, + struct deflate_compressor *c, + u32 block_length) +{ + struct deflate_optimum_node *cur_node = c->optimum; + struct deflate_optimum_node * const end_node = cur_node + block_length; + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; + unsigned litlen_symbol; + unsigned length_slot; + unsigned offset_slot; + + if (length == 1) { + /* Literal */ + litlen_symbol = offset; + deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + deflate_flush_bits(os); + } else { + /* Match length */ + length_slot = deflate_length_slot[length]; + litlen_symbol = 257 + length_slot; + deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + + deflate_add_bits(os, length - deflate_length_slot_base[length_slot], + deflate_extra_length_bits[length_slot]); + + if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_LENGTH_BITS + + MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS)) + deflate_flush_bits(os); + + + /* Match offset */ + offset_slot = deflate_get_offset_slot(c, offset); + deflate_add_bits(os, codes->codewords.offset[offset_slot], + codes->lens.offset[offset_slot]); + + if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS)) + deflate_flush_bits(os); + + deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot], + deflate_extra_offset_bits[offset_slot]); + + deflate_flush_bits(os); + } + cur_node += length; + } while (cur_node != end_node); +} + /* Output the end-of-block symbol. */ static void deflate_write_end_of_block(struct deflate_output_bitstream *os, @@ -1481,56 +1562,153 @@ deflate_write_end_of_block(struct deflate_output_bitstream *os, deflate_flush_bits(os); } - static void -deflate_write_block(struct deflate_compressor * restrict c, - struct deflate_output_bitstream * restrict os, - const u8 * restrict block_begin, u32 block_length, - bool is_final_block) +deflate_write_uncompressed_block(struct deflate_output_bitstream *os, + const u8 *data, u16 len, + bool is_final_block) { - struct deflate_codes *codes; + deflate_write_block_header(os, is_final_block, + DEFLATE_BLOCKTYPE_UNCOMPRESSED); + deflate_align_bitstream(os); - /* Note: we don't currently output any uncompressed blocks. */ - - /* Account for end-of-block symbol */ - c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; - - if (block_length >= 1000) { - /* Use custom ("dynamic") Huffman codes. */ - deflate_write_block_header(os, is_final_block, - DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN); - deflate_make_huffman_codes(&c->freqs, &c->codes); - deflate_write_huffman_codes(c, os); - codes = &c->codes; - } else { - /* This is a very short block. Just use the static codes. */ - deflate_write_block_header(os, is_final_block, - DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); - codes = &c->static_codes; - if (codes->codewords.litlen[0] == 0xFFFFFFFF) - deflate_init_static_codes(c); + if (4 + (u32)len >= os->end - os->next) { + os->next = os->end; + return; } - deflate_write_sequences(os, block_begin, c->sequences, codes); - deflate_write_end_of_block(os, codes); - - /* Reset symbol frequencies if this wasn't the final block. */ - if (!is_final_block) - deflate_reset_symbol_frequencies(c); + put_unaligned_le16(len, os->next); + os->next += 2; + put_unaligned_le16(~len, os->next); + os->next += 2; + memcpy(os->next, data, len); + os->next += len; } -/* Return the offset slot for the specified match offset. */ -static forceinline unsigned -deflate_get_offset_slot(struct deflate_compressor *c, unsigned offset) +static void +deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os, + const u8 *data, u32 data_length, + bool is_final_block) { -#if USE_FULL_OFFSET_SLOT_FAST - return c->offset_slot_fast[offset]; -#else - if (offset <= 256) - return c->offset_slot_fast[offset - 1]; - else - return c->offset_slot_fast[256 + ((offset - 1) >> 7)]; -#endif + do { + u16 len = MIN(data_length, UINT16_MAX); + + deflate_write_uncompressed_block(os, data, len, + is_final_block && len == data_length); + data += len; + data_length -= len; + } while (data_length != 0); +} + +/* + * Choose the best type of block to use (dynamic Huffman, static Huffman, or + * uncompressed), then output it. + */ +static void +deflate_flush_block(struct deflate_compressor * restrict c, + struct deflate_output_bitstream * restrict os, + const u8 * restrict block_begin, u32 block_length, + bool is_final_block, bool use_item_list) +{ + static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7, + }; + + /* Costs are measured in bits */ + u32 dynamic_cost = 0; + u32 static_cost = 0; + u32 uncompressed_cost = 0; + struct deflate_codes *codes; + int block_type; + unsigned sym; + + /* Tally the end-of-block symbol. */ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + + /* Build dynamic Huffman codes. */ + deflate_make_huffman_codes(&c->freqs, &c->codes); + + /* Account for the cost of sending dynamic Huffman codes. */ + deflate_precompute_huffman_header(c); + dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens); + for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { + u32 extra = deflate_extra_precode_bits[sym]; + dynamic_cost += c->precode_freqs[sym] * + (extra + c->precode_lens[sym]); + } + + /* Account for the cost of encoding literals. */ + for (sym = 0; sym < 256; sym++) { + dynamic_cost += c->freqs.litlen[sym] * + c->codes.lens.litlen[sym]; + } + for (sym = 0; sym < 144; sym++) + static_cost += c->freqs.litlen[sym] * 8; + for (; sym < 256; sym++) + static_cost += c->freqs.litlen[sym] * 9; + + /* Account for the cost of encoding the end-of-block symbol. */ + dynamic_cost += c->codes.lens.litlen[256]; + static_cost += 7; + + /* Account for the cost of encoding lengths. */ + for (sym = 257; sym < DEFLATE_NUM_LITLEN_SYMS; sym++) { + u32 extra = deflate_extra_length_bits[sym - 257]; + dynamic_cost += c->freqs.litlen[sym] * + (extra + c->codes.lens.litlen[sym]); + static_cost += c->freqs.litlen[sym] * + (extra + c->static_codes.lens.litlen[sym]); + } + + /* Account for the cost of encoding offsets. */ + for (sym = 0; sym < DEFLATE_NUM_OFFSET_SYMS; sym++) { + u32 extra = deflate_extra_offset_bits[sym]; + dynamic_cost += c->freqs.offset[sym] * + (extra + c->codes.lens.offset[sym]); + static_cost += c->freqs.offset[sym] * (extra + 5); + } + + /* Compute the cost of using uncompressed blocks. */ + uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 + + (40 * (DIV_ROUND_UP(block_length, + UINT16_MAX) - 1)) + + (8 * block_length); + + /* Choose the cheapest block type. */ + if (dynamic_cost < MIN(static_cost, uncompressed_cost)) { + block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN; + codes = &c->codes; + } else if (static_cost < uncompressed_cost) { + block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN; + codes = &c->static_codes; + } else { + block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED; + } + + /* Now actually output the block. */ + + if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + /* Note: the length being flushed may exceed the maximum length + * of an uncompressed block (65535 bytes). Therefore, more than + * one uncompressed block might be needed. */ + deflate_write_uncompressed_blocks(os, block_begin, block_length, + is_final_block); + } else { + /* Output the block header. */ + deflate_write_block_header(os, is_final_block, block_type); + + /* Output the Huffman codes (dynamic Huffman blocks only). */ + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) + deflate_write_huffman_header(c, os); + + /* Output the literals, matches, and end-of-block symbol. */ + if (use_item_list) { + deflate_write_item_list(os, codes, c, block_length); + } else { + deflate_write_sequences(os, codes, c->sequences, + block_begin); + } + deflate_write_end_of_block(os, codes); + } } static forceinline void @@ -1713,7 +1891,6 @@ deflate_compress_greedy(struct deflate_compressor * restrict c, u32 next_hashes[2] = {0, 0}; deflate_init_output(&os, out, out_nbytes_avail); - deflate_reset_symbol_frequencies(c); hc_matchfinder_init(&c->hc_mf); do { @@ -1726,6 +1903,7 @@ deflate_compress_greedy(struct deflate_compressor * restrict c, struct block_split_stats split_stats; init_block_split_stats(&split_stats); + deflate_reset_symbol_frequencies(c); do { u32 length; @@ -1771,8 +1949,9 @@ deflate_compress_greedy(struct deflate_compressor * restrict c, !should_end_block(&split_stats, in_block_begin, in_next, in_end)); deflate_finish_sequence(next_seq, litrunlen); - deflate_write_block(c, &os, in_block_begin, - in_next - in_block_begin, in_next == in_end); + deflate_flush_block(c, &os, in_block_begin, + in_next - in_block_begin, + in_next == in_end, false); } while (in_next != in_end); return deflate_flush_output(&os); @@ -1797,7 +1976,6 @@ deflate_compress_lazy(struct deflate_compressor * restrict c, u32 next_hashes[2] = {0, 0}; deflate_init_output(&os, out, out_nbytes_avail); - deflate_reset_symbol_frequencies(c); hc_matchfinder_init(&c->hc_mf); do { @@ -1810,6 +1988,7 @@ deflate_compress_lazy(struct deflate_compressor * restrict c, struct block_split_stats split_stats; init_block_split_stats(&split_stats); + deflate_reset_symbol_frequencies(c); do { unsigned cur_len; @@ -1917,9 +2096,9 @@ deflate_compress_lazy(struct deflate_compressor * restrict c, !should_end_block(&split_stats, in_block_begin, in_next, in_end)); deflate_finish_sequence(next_seq, litrunlen); - deflate_write_block(c, &os, in_block_begin, - in_next - in_block_begin, in_next == in_end); - + deflate_flush_block(c, &os, in_block_begin, + in_next - in_block_begin, + in_next == in_end, false); } while (in_next != in_end); return deflate_flush_output(&os); @@ -1953,70 +2132,6 @@ deflate_tally_item_list(struct deflate_compressor *c, } while (cur_node != end_node); } -/* - * Follow the minimum-cost path in the graph of possible match/literal choices - * for the current block and write out the matches/literals using the specified - * Huffman codes. - * - * Note: this is slightly duplicated with deflate_write_sequences(), the reason - * being that we don't want to waste time translating between intermediate - * match/literal representations. - */ -static void -deflate_write_item_list(struct deflate_output_bitstream *os, - const struct deflate_codes *codes, - struct deflate_compressor *c, - struct deflate_optimum_node * const end_node) -{ - struct deflate_optimum_node *cur_node = c->optimum; - do { - unsigned length = cur_node->item & OPTIMUM_LEN_MASK; - unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; - unsigned litlen_symbol; - unsigned length_slot; - unsigned offset_slot; - - if (length == 1) { - /* Literal */ - litlen_symbol = offset; - deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], - codes->lens.litlen[litlen_symbol]); - deflate_flush_bits(os); - } else { - /* Match length */ - length_slot = deflate_length_slot[length]; - litlen_symbol = 257 + length_slot; - deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], - codes->lens.litlen[litlen_symbol]); - - deflate_add_bits(os, length - deflate_length_slot_base[length_slot], - deflate_extra_length_bits[length_slot]); - - if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_LENGTH_BITS + - MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - deflate_flush_bits(os); - - - /* Match offset */ - offset_slot = deflate_get_offset_slot(c, offset); - deflate_add_bits(os, codes->codewords.offset[offset_slot], - codes->lens.offset[offset_slot]); - - if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - deflate_flush_bits(os); - - deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot], - deflate_extra_offset_bits[offset_slot]); - - deflate_flush_bits(os); - } - cur_node += length; - } while (cur_node != end_node); -} - /* Set the current cost model from the codeword lengths specified in @lens. */ static void deflate_set_costs(struct deflate_compressor *c, const struct deflate_lens * lens) @@ -2137,11 +2252,12 @@ deflate_adjust_costs(struct deflate_compressor *c) static void deflate_optimize_and_write_block(struct deflate_compressor *c, struct deflate_output_bitstream *os, - const unsigned block_len, - struct lz_match *end_cache_ptr, + const u8 * const block_begin, + const u32 block_length, + struct lz_match * const end_cache_ptr, const bool is_final_block) { - struct deflate_optimum_node *end_node = c->optimum + block_len; + struct deflate_optimum_node * const end_node = c->optimum + block_length; unsigned num_passes_remaining = c->num_optim_passes; /* Force the block to really end at 'end_node', even if some matches @@ -2242,13 +2358,8 @@ deflate_optimize_and_write_block(struct deflate_compressor *c, /* All optimization passes are done. Output a block using the * minimum-cost path computed on the last optimization pass. */ - c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; - deflate_make_huffman_codes(&c->freqs, &c->codes); - deflate_reset_symbol_frequencies(c); - deflate_write_block_header(os, is_final_block, DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN); - deflate_write_huffman_codes(c, os); - deflate_write_item_list(os, &c->codes, c, end_node); - deflate_write_end_of_block(os, &c->codes); + deflate_flush_block(c, os, block_begin, block_length, + is_final_block, true); } /* @@ -2279,7 +2390,6 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c, u32 next_hashes[2] = {0, 0}; deflate_init_output(&os, out, out_nbytes_avail); - deflate_reset_symbol_frequencies(c); bt_matchfinder_init(&c->bt_mf); do { @@ -2293,6 +2403,7 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c, const u8 *next_observation = in_next; init_block_split_stats(&split_stats); + deflate_reset_symbol_frequencies(c); /* * Find matches until we decide to end the block. We end the @@ -2418,7 +2529,8 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c, deflate_set_default_costs(c); else deflate_adjust_costs(c); - deflate_optimize_and_write_block(c, &os, in_next - in_block_begin, + deflate_optimize_and_write_block(c, &os, in_block_begin, + in_next - in_block_begin, cache_ptr, in_next == in_end); } while (in_next != in_end); @@ -2565,7 +2677,7 @@ deflate_alloc_compressor(unsigned int compression_level) } deflate_init_offset_slot_fast(c); - c->static_codes.codewords.litlen[0] = 0xFFFFFFFF; + deflate_init_static_codes(c); return c; } @@ -2577,15 +2689,15 @@ deflate_compress(struct deflate_compressor *c, { if (unlikely(out_nbytes_avail < MIN_OUTPUT_SIZE)) return 0; - if (unlikely(in_nbytes == 0)) { - /* Empty input; output a single empty block. */ + + /* For extremely small inputs just use a single uncompressed block. */ + if (unlikely(in_nbytes < 16)) { struct deflate_output_bitstream os; deflate_init_output(&os, out, out_nbytes_avail); - deflate_reset_symbol_frequencies(c); - deflate_finish_sequence(c->sequences, 0); - deflate_write_block(c, &os, in, 0, true); + deflate_write_uncompressed_block(&os, in, in_nbytes, true); return deflate_flush_output(&os); } + return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail); } @@ -2601,14 +2713,15 @@ deflate_get_compression_level(struct deflate_compressor *c) return c->compression_level; } -/* Return an upper bound on the compressed size for compressing @in_nbytes bytes - * of data. This function needs some work to be more accurate. */ LIBEXPORT size_t deflate_compress_bound(struct deflate_compressor *c, size_t in_nbytes) { - size_t max_num_blocks = DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH); - if (max_num_blocks == 0) - max_num_blocks++; - return MIN_OUTPUT_SIZE + DIV_ROUND_UP(in_nbytes * 9, 8) + - max_num_blocks * 200; + /* + * The worst case is all uncompressed blocks where one block has length + * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH. + * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE, + * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN. + */ + size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); + return MAX((5 * max_num_blocks) + in_nbytes, MIN_OUTPUT_SIZE); }