diff --git a/lib/deflate_compress.c b/lib/deflate_compress.c index dccb9dd..377b51f 100644 --- a/lib/deflate_compress.c +++ b/lib/deflate_compress.c @@ -52,10 +52,10 @@ /* * This is the minimum block length that the compressor will use, in - * uncompressed bytes. It is also the amount by which the final block is - * allowed to grow past the soft maximum length in order to avoid using a very - * short block at the end. This should be a value below which using shorter - * blocks is unlikely to be worthwhile, due to the per-block overhead. + * uncompressed bytes. It is also approximately the amount by which the final + * block is allowed to grow past the soft maximum length in order to avoid using + * a very short block at the end. This should be a value below which using + * shorter blocks is unlikely to be worthwhile, due to the per-block overhead. * * Defining a fixed minimum block length is needed in order to guarantee a * reasonable upper bound on the compressed size. It's also needed because our @@ -94,8 +94,8 @@ * For deflate_compress_fastest(): This is the soft maximum block length. * deflate_compress_fastest() doesn't use the regular block splitting algorithm; * it only ends blocks when they reach FAST_SOFT_MAX_BLOCK_LENGTH bytes or - * FAST_SEQ_STORE_LENGTH - 1 matches. Therefore, this value should be lower - * than the regular SOFT_MAX_BLOCK_LENGTH. + * FAST_SEQ_STORE_LENGTH matches. Therefore, this value should be lower than + * the regular SOFT_MAX_BLOCK_LENGTH. */ #define FAST_SOFT_MAX_BLOCK_LENGTH 65535 @@ -490,7 +490,7 @@ struct libdeflate_compressor { /* Frequency counters for the current block */ struct deflate_freqs freqs; - /* Block split statistics for the currently pending block */ + /* Block split statistics for the current block */ struct block_split_stats split_stats; /* Dynamic Huffman codes for the current block */ @@ -648,7 +648,7 @@ struct deflate_output_bitstream { */ u8 *next; - /* Pointer just past the end of the output buffer */ + /* Pointer to just past the end of the output buffer */ u8 *end; }; @@ -664,8 +664,7 @@ struct deflate_output_bitstream { #define OUTPUT_END_PADDING 8 /* - * Initialize the output bitstream. 'size' is assumed to be at least - * OUTPUT_END_PADDING. + * Initialize the output bitstream. 'size' must be at least OUTPUT_END_PADDING. */ static void deflate_init_output(struct deflate_output_bitstream *os, @@ -680,7 +679,8 @@ deflate_init_output(struct deflate_output_bitstream *os, /* * Add some bits to the bitbuffer variable of the output bitstream. The caller - * must make sure there is enough room. + * must ensure that os->bitcount + num_bits <= BITBUF_NBITS, by calling + * deflate_flush_bits() frequently enough. */ static forceinline void deflate_add_bits(struct deflate_output_bitstream *os, @@ -734,7 +734,8 @@ deflate_align_bitstream(struct deflate_output_bitstream *os) /* * Flush any remaining bits to the output buffer if needed. Return the total - * number of bytes written to the output buffer, or 0 if an overflow occurred. + * number of bytes that have been written to the output buffer since + * deflate_init_output(), or 0 if an overflow occurred. */ static size_t deflate_flush_output(struct deflate_output_bitstream *os) @@ -755,7 +756,7 @@ deflate_flush_output(struct deflate_output_bitstream *os) * Given the binary tree node A[subtree_idx] whose children already satisfy the * maxheap property, swap the node with its greater child until it is greater * than or equal to both of its children, so that the maxheap property is - * satisfied in the subtree rooted at A[subtree_idx]. + * satisfied in the subtree rooted at A[subtree_idx]. 'A' uses 1-based indices. */ static void heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) @@ -908,7 +909,7 @@ sort_symbols(unsigned num_syms, const u32 freqs[restrict], } /* - * Build the Huffman tree. + * Build a Huffman tree. * * This is an optimized implementation that * (a) takes advantage of the frequencies being already sorted; @@ -1386,8 +1387,8 @@ deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) /* * Build the literal/length and offset Huffman codes for a DEFLATE block. * - * This takes as input the frequency tables for each code and produces as output - * a set of tables that map symbols to codewords and codeword lengths. + * This takes as input the frequency tables for each alphabet and produces as + * output a set of tables that map symbols to codewords and codeword lengths. */ static void deflate_make_huffman_codes(const struct deflate_freqs *freqs, @@ -2005,19 +2006,19 @@ deflate_flush_block(struct libdeflate_compressor * restrict c, * literals we only look at the high bits and low bits, and for matches we only * look at whether the match is long or not. The assumption is that for typical * "real" data, places that are good block boundaries will tend to be noticeable - * based only on changes in these aggregate frequencies, without looking for + * based only on changes in these aggregate probabilities, without looking for * subtle differences in individual symbols. For example, a change from ASCII * bytes to non-ASCII bytes, or from few matches (generally less compressible) * to many matches (generally more compressible), would be easily noticed based * on the aggregates. * - * For determining whether the frequency distributions are "different enough" to - * start a new block, the simply heuristic of splitting when the sum of absolute - * differences exceeds a constant seems to be good enough. We also add a number - * proportional to the block length so that the algorithm is more likely to end - * long blocks than short blocks. This reflects the general expectation that it - * will become increasingly beneficial to start a new block as the current - * block grows longer. + * For determining whether the probability distributions are "different enough" + * to start a new block, the simple heuristic of splitting when the sum of + * absolute differences exceeds a constant seems to be good enough. We also add + * a number proportional to the block length so that the algorithm is more + * likely to end long blocks than short blocks. This reflects the general + * expectation that it will become increasingly beneficial to start a new block + * as the current block grows longer. * * Finally, for an approximation, it is not strictly necessary that the exact * symbols being used are considered. With "near-optimal parsing", for example, @@ -2081,9 +2082,14 @@ do_end_block_check(struct block_split_stats *stats, u32 block_length) { if (stats->num_observations > 0) { /* - * Note: to avoid slow divisions, we do not divide by - * 'num_observations', but rather do all math with the numbers - * multiplied by 'num_observations'. + * Compute the sum of absolute differences of probabilities. To + * avoid needing to use floating point arithmetic or do slow + * divisions, we do all arithmetic with the probabilities + * multiplied by num_observations * num_new_observations. E.g., + * for the "old" observations the probabilities would be + * (double)observations[i] / num_observations, but since we + * multiply by both num_observations and num_new_observations we + * really do observations[i] * num_new_observations. */ u32 total_delta = 0; u32 num_items; @@ -2103,6 +2109,12 @@ do_end_block_check(struct block_split_stats *stats, u32 block_length) num_items = stats->num_observations + stats->num_new_observations; + /* + * Heuristic: the cutoff is when the sum of absolute differences + * of probabilities becomes at least 200/512. As above, the + * probability is multiplied by both num_new_observations and + * num_observations. Be careful to avoid integer overflow. + */ cutoff = stats->num_new_observations * 200 / 512 * stats->num_observations; /*