/*
 * deflate_compress.c - a compressor for DEFLATE
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "deflate_compress.h"
#include "deflate_constants.h"
#include "unaligned.h"

#include "libdeflate.h"

/******************************************************************************/

/*
 * The following parameters can be changed at build time to customize the
 * compression algorithms slightly:
 *
 * (Note, not all customizable parameters are here. Some others can be found in
 * libdeflate_alloc_compressor() and in *_matchfinder.h.)
 */

/*
 * If this parameter is defined to 1, then the near-optimal parsing algorithm
 * will be included, and compression levels 10-12 will use it. This algorithm
 * usually produces a compression ratio significantly better than the other
 * algorithms. However, it is slow. If this parameter is defined to 0, then
 * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm.
 */
#define SUPPORT_NEAR_OPTIMAL_PARSING 1

/*
 * If this parameter is defined to 1, then the compressor will maintain a full
 * map from match offsets to offset slots, rather than a condensed map. This
 * will usually improve performance, especially for the near-optimal parsing
 * algorithm. However, it will use an additional 32257 bytes of memory.
 */
#define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING

/*
 * This is the minimum block length, in uncompressed bytes, which the compressor
 * will use. This should be a value below which using shorter blocks is very
 * unlikely to be worthwhile, due to the per-block overhead. This parameter
 * doesn't apply to the final block, which can be arbitrarily short.
 *
 * Defining a fixed minimum block length is needed in order to guarantee a
 * reasonable upper bound on the compressed size. It's also needed because our
 * block splitting algorithm doesn't work well on very short blocks.
 */
#define MIN_BLOCK_LENGTH 10000

/*
 * This is the soft maximum block length, in uncompressed bytes, which the
 * compressor will use. This is a "soft" maximum, meaning that the compressor
 * will try to end blocks at this length, but it may go slightly past it if
 * there is a match that straddles this limit. This parameter doesn't apply to
 * uncompressed blocks, which the DEFLATE format limits to 65535 bytes.
 *
 * This should be a value above which it is very likely that splitting the block
 * would produce a better compression ratio. Increasing/decreasing this
 * parameter will increase/decrease per-compressor memory usage linearly.
 */
#define SOFT_MAX_BLOCK_LENGTH 300000

/*
 * Block length, in uncompressed bytes, used by deflate_compress_fastest().
 * deflate_compress_fastest() doesn't use the other block length settings.
 */
#define FAST_BLOCK_LENGTH MIN(32768, SOFT_MAX_BLOCK_LENGTH)

/*
 * These are the maximum codeword lengths, in bits, the compressor will use for
 * each Huffman code. The DEFLATE format defines limits for these. However,
 * further limiting litlen codewords to 14 bits is beneficial, since it has
 * negligible effect on compression ratio but allows some optimizations when
 * outputting bits. (It allows 4 literals to be written at once rather than 3.)
 */
#define MAX_LITLEN_CODEWORD_LEN 14
#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN
#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN

#if SUPPORT_NEAR_OPTIMAL_PARSING

/* Parameters specific to the near-optimal parsing algorithm */

/*
 * BIT_COST is a scaling factor that allows the compressor to consider
 * fractional bit costs when deciding which literal/match sequence to use. This
 * is useful when the true symbol costs are unknown. For example, if the
 * compressor thinks that a symbol has 6.5 bits of entropy, it can set its cost
 * to 6.5 bits rather than have to use 6 or 7 bits. Although in the end each
 * symbol will use a whole number of bits due to the Huffman coding, considering
 * fractional bits can be helpful due to the limited information.
 *
 * BIT_COST should be a power of 2. A value of 8 or 16 works well. A higher
 * value isn't very useful since the calculations are approximate anyway.
 */
#define BIT_COST 16
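
/*
 * For example, with BIT_COST == 16 as defined above, a symbol estimated to
 * take 6.5 bits can be assigned a scaled cost of 6.5 * 16 == 104, whereas a
 * whole-bit cost model could only choose between 6 * 16 == 96 and
 * 7 * 16 == 112.
 */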

/*
 * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
 * be needed to output a symbol that was unused in the previous optimization
 * pass. Assigning a default cost allows the symbol to be used in the next
 * optimization pass. However, the cost should be relatively high because the
 * symbol probably won't be used very many times (if at all).
 */
#define LITERAL_NOSTAT_BITS 13
#define LENGTH_NOSTAT_BITS 13
#define OFFSET_NOSTAT_BITS 10

/*
 * This is (approximately) the maximum number of matches that the compressor
 * will cache per block. If the match cache becomes full, then the compressor
 * will be forced to end the block early. This value should be large enough so
 * that this rarely happens, due to the block being ended normally before the
 * cache fills up. Increasing/decreasing this parameter will increase/decrease
 * per-compressor memory usage linearly.
 */
#define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5)

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/******************************************************************************/

/* Include the needed matchfinders. */
#define MATCHFINDER_WINDOW_ORDER DEFLATE_WINDOW_ORDER
#include "hc_matchfinder.h"
#include "ht_matchfinder.h"
#if SUPPORT_NEAR_OPTIMAL_PARSING
# include "bt_matchfinder.h"
/*
 * This is the maximum number of matches the binary trees matchfinder can find
 * at a single position. Since the matchfinder never finds more than one match
 * for the same length, presuming one of each possible length is sufficient for
 * an upper bound. (This says nothing about whether it is worthwhile to
 * consider so many matches; this is just defining the worst case.)
 */
#define MAX_MATCHES_PER_POS \
	(DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
#endif

/* Table: length slot => length slot base value */
static const unsigned deflate_length_slot_base[] = {
	3,    4,    5,    6,    7,    8,    9,    10,
	11,   13,   15,   17,   19,   23,   27,   31,
	35,   43,   51,   59,   67,   83,   99,   115,
	131,  163,  195,  227,  258,
};

/* Table: length slot => number of extra length bits */
static const u8 deflate_extra_length_bits[] = {
	0,    0,    0,    0,    0,    0,    0,    0,
	1,    1,    1,    1,    2,    2,    2,    2,
	3,    3,    3,    3,    4,    4,    4,    4,
	5,    5,    5,    5,    0,
};

/* Table: offset slot => offset slot base value */
static const unsigned deflate_offset_slot_base[] = {
	1,     2,     3,     4,     5,     7,     9,     13,
	17,    25,    33,    49,    65,    97,    129,   193,
	257,   385,   513,   769,   1025,  1537,  2049,  3073,
	4097,  6145,  8193,  12289, 16385, 24577,
};

/* Table: offset slot => number of extra offset bits */
static const u8 deflate_extra_offset_bits[] = {
	0,     0,     0,     0,     1,     1,     2,     2,
	3,     3,     4,     4,     5,     5,     6,     6,
	7,     7,     8,     8,     9,     9,     10,    10,
	11,    11,    12,    12,    13,    13,
};

/* Table: length => length slot */
static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
	12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
	16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
	18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
	20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
	21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
	22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
	23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 28,
};

/* The order in which precode codeword lengths are stored */
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};

/* Codewords for the DEFLATE Huffman codes. */
struct deflate_codewords {
	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
};

/* Codeword lengths (in bits) for the DEFLATE Huffman codes.
 * A zero length means the corresponding symbol had zero frequency. */
struct deflate_lens {
	u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
	u8 offset[DEFLATE_NUM_OFFSET_SYMS];
};

/* Codewords and lengths for the DEFLATE Huffman codes. */
struct deflate_codes {
	struct deflate_codewords codewords;
	struct deflate_lens lens;
};

/* Symbol frequency counters for the DEFLATE Huffman codes. */
struct deflate_freqs {
	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
};

/*
 * Represents a run of literals followed by a match or end-of-block. This
 * struct is needed to temporarily store items chosen by the parser, since items
 * cannot be written until all items for the block have been chosen and the
 * block's Huffman codes have been computed.
 */
struct deflate_sequence {

	/* Bits 0..22: the number of literals in this run. This may be 0 and
	 * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not
	 * stored explicitly in this structure; instead, they are read directly
	 * from the uncompressed data.
	 *
	 * Bits 23..31: the length of the match which follows the literals, or 0
	 * if this literal run was the last in the block, so there is no match
	 * which follows it. */
	u32 litrunlen_and_length;

	/* If 'length' doesn't indicate end-of-block, then this is the offset of
	 * the match which follows the literals. */
	u16 offset;

	/* If 'length' doesn't indicate end-of-block, then this is the offset
	 * symbol of the match which follows the literals. */
	u8 offset_symbol;

	/* If 'length' doesn't indicate end-of-block, then this is the length
	 * slot of the match which follows the literals. */
	u8 length_slot;
};
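
/*
 * For reference, a sequence is decoded with the same shift and mask that
 * deflate_write_sequences() uses below:
 *
 *	u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
 *	u32 length    = seq->litrunlen_and_length >> 23;
 *
 * where 0x7FFFFF masks the low 23 bits (the literal run length).
 */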

#if SUPPORT_NEAR_OPTIMAL_PARSING

/* Costs for the near-optimal parsing algorithm. */
struct deflate_costs {

	/* The cost to output each possible literal. */
	u32 literal[DEFLATE_NUM_LITERALS];

	/* The cost to output each possible match length. */
	u32 length[DEFLATE_MAX_MATCH_LEN + 1];

	/* The cost to output a match offset of each possible offset slot. */
	u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
};

/*
 * This structure represents a byte position in the input data and a node in the
 * graph of possible match/literal choices for the current block.
 *
 * Logically, each incoming edge to this node is labeled with a literal or a
 * match that can be taken to reach this position from an earlier position; and
 * each outgoing edge from this node is labeled with a literal or a match that
 * can be taken to advance from this position to a later position.
 *
 * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we
 * associate with each node just two pieces of information:
 *
 *	'cost_to_end' is the minimum cost to reach the end of the block from
 *	this position.
 *
 *	'item' represents the literal or match that must be chosen from here to
 *	reach the end of the block with the minimum cost. Equivalently, this
 *	can be interpreted as the label of the outgoing edge on the minimum-cost
 *	path to the "end of block" node from this node.
 */
struct deflate_optimum_node {

	u32 cost_to_end;

	/*
	 * Notes on the match/literal representation used here:
	 *
	 *	The low bits of 'item' are the length: 1 if this is a literal,
	 *	or the match length if this is a match.
	 *
	 *	The high bits of 'item' are the actual literal byte if this is a
	 *	literal, or the match offset if this is a match.
	 */
#define OPTIMUM_OFFSET_SHIFT 9
#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
	u32 item;

};
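
/*
 * So the minimum-cost choice at a node is recovered as:
 *
 *	u32 len               = node->item & OPTIMUM_LEN_MASK;
 *	u32 offset_or_literal = node->item >> OPTIMUM_OFFSET_SHIFT;
 *
 * with len == 1 meaning "emit the literal byte offset_or_literal" and
 * len > 1 meaning "emit a match of that length at that offset".
 */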

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/* Block split statistics. See "Block splitting algorithm" below. */
#define NUM_LITERAL_OBSERVATION_TYPES 8
#define NUM_MATCH_OBSERVATION_TYPES 2
#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
struct block_split_stats {
	u32 new_observations[NUM_OBSERVATION_TYPES];
	u32 observations[NUM_OBSERVATION_TYPES];
	u32 num_new_observations;
	u32 num_observations;
};

/* The main DEFLATE compressor structure */
struct libdeflate_compressor {

	/* Pointer to the compress() implementation chosen at allocation time */
	size_t (*impl)(struct libdeflate_compressor *,
		       const u8 *, size_t, u8 *, size_t);

	/* Frequency counters for the current block */
	struct deflate_freqs freqs;

	/* Dynamic Huffman codes for the current block */
	struct deflate_codes codes;

	/* Static Huffman codes */
	struct deflate_codes static_codes;

	/* Block split statistics for the currently pending block */
	struct block_split_stats split_stats;

	/* A table for fast lookups of offset slot by match offset.
	 *
	 * If the full table is being used, it is a direct mapping from offset
	 * to offset slot.
	 *
	 * If the condensed table is being used, the first 256 entries map
	 * directly to the offset slots of offsets 1 through 256. The next 256
	 * entries map to the offset slots for the remaining offsets, stepping
	 * through the offsets with a stride of 128. This relies on the fact
	 * that each of the remaining offset slots contains at least 128 offsets
	 * and has an offset base that is a multiple of 128. */
#if USE_FULL_OFFSET_SLOT_FAST
	u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
#else
	u8 offset_slot_fast[512];
#endif

	/* The "nice" match length: if a match of this length is found, choose
	 * it immediately without further consideration. */
	unsigned nice_match_length;

	/* The maximum search depth: consider at most this many potential
	 * matches at each position. */
	unsigned max_search_depth;

	/* The compression level with which this compressor was created. */
	unsigned compression_level;

	/* Anything smaller than this we won't bother trying to compress. */
	unsigned min_size_to_compress;

	/* Temporary space for Huffman code output */
	u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
	u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
	u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
	unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS];
	unsigned num_litlen_syms;
	unsigned num_offset_syms;
	unsigned num_explicit_lens;
	unsigned num_precode_items;

	union {
		/* Data for greedy or lazy parsing */
		struct {
			/* Hash chains matchfinder */
			struct hc_matchfinder hc_mf;

			/* The matches and literals that the parser has chosen
			 * for the current block. The required length of this
			 * array is limited by the maximum number of matches
			 * that can ever be chosen for a single block, plus one
			 * for the special entry at the end. */
			struct deflate_sequence sequences[
				DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     DEFLATE_MIN_MATCH_LEN) + 1];
		} g; /* (g)reedy */

		/* Data for fastest parsing */
		struct {
			/* Hash table matchfinder */
			struct ht_matchfinder ht_mf;

			struct deflate_sequence sequences[
				DIV_ROUND_UP(FAST_BLOCK_LENGTH,
					     HT_MATCHFINDER_MIN_MATCH_LEN) + 1];
		} f; /* (f)astest */

#if SUPPORT_NEAR_OPTIMAL_PARSING
		/* Data for near-optimal parsing */
		struct {

			/* Binary tree matchfinder */
			struct bt_matchfinder bt_mf;

			/*
			 * Cached matches for the current block. This array
			 * contains the matches that were found at each position
			 * in the block. Specifically, for each position, there
			 * is a list of matches found at that position, if any,
			 * sorted by strictly increasing length. In addition,
			 * following the matches for each position, there is a
			 * special 'struct lz_match' whose 'length' member
			 * contains the number of matches found at that
			 * position, and whose 'offset' member contains the
			 * literal at that position.
			 *
			 * Note: in rare cases, there will be a very high number
			 * of matches in the block and this array will overflow.
			 * If this happens, we force the end of the current
			 * block. MATCH_CACHE_LENGTH is the length at which we
			 * actually check for overflow. The extra slots beyond
			 * this are enough to absorb the worst case overflow,
			 * which occurs if starting at
			 * &match_cache[MATCH_CACHE_LENGTH - 1], we write
			 * MAX_MATCHES_PER_POS matches and a match count header,
			 * then skip searching for matches at
			 * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the
			 * match count header for each.
			 */
			struct lz_match match_cache[MATCH_CACHE_LENGTH +
						    MAX_MATCHES_PER_POS +
						    DEFLATE_MAX_MATCH_LEN - 1];

			/*
			 * Array of nodes, one per position, for running the
			 * minimum-cost path algorithm.
			 *
			 * This array must be large enough to accommodate the
			 * worst-case number of nodes, which occurs if we find a
			 * match of length DEFLATE_MAX_MATCH_LEN at position
			 * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
			 * length SOFT_MAX_BLOCK_LENGTH - 1 +
			 * DEFLATE_MAX_MATCH_LEN. Add one for the end-of-block
			 * node.
			 */
			struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
								  DEFLATE_MAX_MATCH_LEN + 1];

			/* The current cost model being used. */
			struct deflate_costs costs;

			/* Literal/match statistics saved from previous block */
			u32 prev_observations[NUM_OBSERVATION_TYPES];
			u32 prev_num_observations;

			/*
			 * Approximate match length frequencies based on a
			 * greedy parse, gathered during matchfinding. This is
			 * used for setting the initial symbol costs.
			 */
			u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];

			unsigned num_optim_passes;
		} n; /* (n)ear-optimal */
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

	} p; /* (p)arser */
};

/*
 * The type for the bitbuffer variable, which temporarily holds bits that are
 * being packed into bytes and written to the output buffer. For best
 * performance, this should have size equal to a machine word.
 */
typedef machine_word_t bitbuf_t;
#define BITBUF_NBITS (8 * sizeof(bitbuf_t))

/* Can the specified number of bits always be added to 'bitbuf' after any
 * pending bytes have been flushed? */
#define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7)
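
/*
 * For instance, on a 64-bit machine BITBUF_NBITS is 64, so CAN_BUFFER(n)
 * holds for n <= 57. In particular CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN),
 * i.e. CAN_BUFFER(56), is true, which is what lets deflate_write_sequences()
 * below emit four 14-bit literals between flushes.
 */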

/*
 * Structure to keep track of the current state of sending bits to the
 * compressed output buffer.
 */
struct deflate_output_bitstream {

	/* Bits that haven't yet been written to the output buffer. */
	bitbuf_t bitbuf;

	/* Number of bits currently held in @bitbuf. */
	unsigned bitcount;

	/* Pointer to the beginning of the output buffer. */
	u8 *begin;

	/* Pointer to the position in the output buffer at which the next byte
	 * should be written. */
	u8 *next;

	/* Pointer just past the end of the output buffer. */
	u8 *end;
};

/*
 * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
 * present following os->end, in order to not overrun the buffer when generating
 * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
 * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However,
 * to make the compression algorithm produce the same result on all CPU
 * architectures (which is sometimes desirable), we have to unconditionally use
 * the maximum for any CPU, which is sizeof(bitbuf_t) == 8.
 */
#define OUTPUT_END_PADDING 8
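
/*
 * A minimal usage sketch of the bitstream functions below, for a caller with
 * an output buffer 'out' of 'out_nbytes' bytes (out_nbytes must be at least
 * OUTPUT_END_PADDING):
 *
 *	struct deflate_output_bitstream os;
 *
 *	deflate_init_output(&os, out, out_nbytes);
 *	...deflate_add_bits() / deflate_flush_bits()...
 *	return deflate_flush_output(&os);  (returns 0 on overflow)
 */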

/* Initialize the output bitstream. 'size' is assumed to be at least
 * OUTPUT_END_PADDING. */
static void
deflate_init_output(struct deflate_output_bitstream *os,
		    void *buffer, size_t size)
{
	os->bitbuf = 0;
	os->bitcount = 0;
	os->begin = buffer;
	os->next = os->begin;
	os->end = os->begin + size - OUTPUT_END_PADDING;
}

/* Add some bits to the bitbuffer variable of the output bitstream. The caller
 * must make sure there is enough room. */
static forceinline void
deflate_add_bits(struct deflate_output_bitstream *os,
		 const bitbuf_t bits, const unsigned num_bits)
{
	os->bitbuf |= bits << os->bitcount;
	os->bitcount += num_bits;
}

/* Flush bits from the bitbuffer variable to the output buffer. */
static forceinline void
deflate_flush_bits(struct deflate_output_bitstream *os)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		/* Flush a whole word (branchlessly). */
		put_unaligned_leword(os->bitbuf, os->next);
		os->bitbuf >>= os->bitcount & ~7;
		os->next += MIN(os->end - os->next, os->bitcount >> 3);
		os->bitcount &= 7;
	} else {
		/* Flush a byte at a time. */
		while (os->bitcount >= 8) {
			*os->next = os->bitbuf;
			if (os->next != os->end)
				os->next++;
			os->bitcount -= 8;
			os->bitbuf >>= 8;
		}
	}
}

/* Align the bitstream on a byte boundary. */
static forceinline void
deflate_align_bitstream(struct deflate_output_bitstream *os)
{
	os->bitcount += -os->bitcount & 7;
	deflate_flush_bits(os);
}
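
/*
 * Here '-os->bitcount & 7' is the number of padding bits needed to round
 * bitcount up to a multiple of 8: e.g. a bitcount of 13 gains (-13 & 7) == 3
 * bits to reach 16, after which the flush empties the now-whole bytes.
 */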

/*
 * Flush any remaining bits to the output buffer if needed. Return the total
 * number of bytes written to the output buffer, or 0 if an overflow occurred.
 */
static size_t
deflate_flush_output(struct deflate_output_bitstream *os)
{
	if (os->next == os->end) /* overflow? */
		return 0;

	while ((int)os->bitcount > 0) {
		*os->next++ = os->bitbuf;
		os->bitcount -= 8;
		os->bitbuf >>= 8;
	}

	return os->next - os->begin;
}

/* Given the binary tree node A[subtree_idx] whose children already
 * satisfy the maxheap property, swap the node with its greater child
 * until it is greater than both its children, so that the maxheap
 * property is satisfied in the subtree rooted at A[subtree_idx]. */
static void
heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
{
	unsigned parent_idx;
	unsigned child_idx;
	u32 v;

	v = A[subtree_idx];
	parent_idx = subtree_idx;
	while ((child_idx = parent_idx * 2) <= length) {
		if (child_idx < length && A[child_idx + 1] > A[child_idx])
			child_idx++;
		if (v >= A[child_idx])
			break;
		A[parent_idx] = A[child_idx];
		parent_idx = child_idx;
	}
	A[parent_idx] = v;
}

/* Rearrange the array 'A' so that it satisfies the maxheap property.
 * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
 */
static void
heapify_array(u32 A[], unsigned length)
{
	unsigned subtree_idx;

	for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
		heapify_subtree(A, length, subtree_idx);
}

/*
 * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
 *
 * Note: name this function heap_sort() instead of heapsort() to avoid colliding
 * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
 * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
 */
static void
heap_sort(u32 A[], unsigned length)
{
	A--; /* Use 1-based indices */

	heapify_array(A, length);

	while (length >= 2) {
		u32 tmp = A[length];
		A[length] = A[1];
		A[1] = tmp;
		length--;
		heapify_subtree(A, length, 1);
	}
}

#define NUM_SYMBOL_BITS 10
#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
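
/*
 * The sorting and tree-building code below packs a symbol and its frequency
 * into a single u32:
 *
 *	entry = sym | (freq << NUM_SYMBOL_BITS);
 *	sym   = entry & SYMBOL_MASK;
 *	freq  = entry >> NUM_SYMBOL_BITS;
 *
 * This is exactly the layout sort_symbols() produces and build_tree() consumes.
 */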

#define GET_NUM_COUNTERS(num_syms) ((((num_syms) + 3 / 4) + 3) & ~3)

/*
 * Sort the symbols primarily by frequency and secondarily by symbol
 * value. Discard symbols with zero frequency and fill in an array with
 * the remaining symbols, along with their frequencies. The low
 * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
 * value, and the remaining bits will contain the frequency.
 *
 * @num_syms
 *	Number of symbols in the alphabet.
 *	Can't be greater than (1 << NUM_SYMBOL_BITS).
 *
 * @freqs[num_syms]
 *	The frequency of each symbol.
 *
 * @lens[num_syms]
 *	An array that eventually will hold the length of each codeword.
 *	This function only fills in the codeword lengths for symbols that
 *	have zero frequency, which are not well defined per se but will
 *	be set to 0.
 *
 * @symout[num_syms]
 *	The output array, described above.
 *
 * Returns the number of entries in 'symout' that were filled. This is
 * the number of symbols that have nonzero frequency.
 */
static unsigned
sort_symbols(unsigned num_syms, const u32 freqs[restrict],
	     u8 lens[restrict], u32 symout[restrict])
{
	unsigned sym;
	unsigned i;
	unsigned num_used_syms;
	unsigned num_counters;
	unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];

	/* We rely on heapsort, but with an added optimization. Since
	 * it's common for most symbol frequencies to be low, we first do
	 * a count sort using a limited number of counters. High
	 * frequencies will be counted in the last counter, and only they
	 * will be sorted with heapsort.
	 *
	 * Note: with more symbols, it is generally beneficial to have more
	 * counters. About 1 counter per 4 symbols seems fast.
	 *
	 * Note: I also tested radix sort, but even for large symbol
	 * counts (> 255) and frequencies bounded at 16 bits (enabling
	 * radix sort by just two base-256 digits), it didn't seem any
	 * faster than the method implemented here.
	 *
	 * Note: I tested the optimized quicksort implementation from
	 * glibc (with indirection overhead removed), but it was only
	 * marginally faster than the simple heapsort implemented here.
	 *
	 * Tests were done with building the codes for LZX. Results may
	 * vary for different compression algorithms...! */

	num_counters = GET_NUM_COUNTERS(num_syms);

	memset(counters, 0, num_counters * sizeof(counters[0]));

	/* Count the frequencies. */
	for (sym = 0; sym < num_syms; sym++)
		counters[MIN(freqs[sym], num_counters - 1)]++;

	/* Make the counters cumulative, ignoring the zero-th, which
	 * counted symbols with zero frequency. As a side effect, this
	 * calculates the number of symbols with nonzero frequency. */
	num_used_syms = 0;
	for (i = 1; i < num_counters; i++) {
		unsigned count = counters[i];
		counters[i] = num_used_syms;
		num_used_syms += count;
	}

	/* Sort nonzero-frequency symbols using the counters. At the
	 * same time, set the codeword lengths of zero-frequency symbols
	 * to 0. */
	for (sym = 0; sym < num_syms; sym++) {
		u32 freq = freqs[sym];
		if (freq != 0) {
			symout[counters[MIN(freq, num_counters - 1)]++] =
				sym | (freq << NUM_SYMBOL_BITS);
		} else {
			lens[sym] = 0;
		}
	}

	/* Sort the symbols counted in the last counter. */
	heap_sort(symout + counters[num_counters - 2],
		  counters[num_counters - 1] - counters[num_counters - 2]);

	return num_used_syms;
}

/*
 * Build the Huffman tree.
 *
 * This is an optimized implementation that
 *	(a) takes advantage of the frequencies being already sorted;
 *	(b) only generates non-leaf nodes, since the non-leaf nodes of a
 *	    Huffman tree are sufficient to generate a canonical code;
 *	(c) Only stores parent pointers, not child pointers;
 *	(d) Produces the nodes in the same memory used for input
 *	    frequency information.
 *
 * Array 'A', which contains 'sym_count' entries, is used for both input
 * and output. For this function, 'sym_count' must be at least 2.
 *
 * For input, the array must contain the frequencies of the symbols,
 * sorted in increasing order. Specifically, each entry must contain a
 * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low
 * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
 * Although these bits will, in fact, contain the symbols that correspond
 * to the frequencies, this function is concerned with frequencies only
 * and keeps the symbols as-is.
 *
 * For output, this function will produce the non-leaf nodes of the
 * Huffman tree. These nodes will be stored in the first (sym_count - 1)
 * entries of the array. Entry A[sym_count - 2] will represent the root
 * node. Each other node will contain the zero-based index of its parent
 * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low
 * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again,
 * note that although these low bits will, in fact, contain a symbol
 * value, this symbol will have *no relationship* with the Huffman tree
 * node that happens to occupy the same slot. This is because this
 * implementation only generates the non-leaf nodes of the tree.
 */
static void
build_tree(u32 A[], unsigned sym_count)
{
	/* Index, in 'A', of next lowest frequency symbol that has not
	 * yet been processed. */
	unsigned i = 0;

	/* Index, in 'A', of next lowest frequency parentless non-leaf
	 * node; or, if equal to 'e', then no such node exists yet. */
	unsigned b = 0;

	/* Index, in 'A', of next node to allocate as a non-leaf. */
	unsigned e = 0;

	do {
		unsigned m, n;
		u32 freq_shifted;

		/* Choose the two next lowest frequency entries. */

		if (i != sym_count &&
		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
			m = i++;
		else
			m = b++;

		if (i != sym_count &&
		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
			n = i++;
		else
			n = b++;

		/* Allocate a non-leaf node and link the entries to it.
		 *
		 * If we link an entry that we're visiting for the first
		 * time (via index 'i'), then we're actually linking a
		 * leaf node and it will have no effect, since the leaf
		 * will be overwritten with a non-leaf when index 'e'
		 * catches up to it. But it's not any slower to
		 * unconditionally set the parent index.
		 *
		 * We also compute the frequency of the non-leaf node as
		 * the sum of its two children's frequencies. */

		freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);

		A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
		A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
		A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
		e++;
	} while (sym_count - e > 1);
	/* When just one entry remains, it is a "leaf" that was
	 * linked to some other node. We ignore it, since the
	 * rest of the array contains the non-leaves which we
	 * need. (Note that we're assuming the cases with 0 or 1
	 * symbols were handled separately.) */
}

/*
 * Given the stripped-down Huffman tree constructed by build_tree(),
 * determine the number of codewords that should be assigned each
 * possible length, taking into account the length-limited constraint.
 *
 * @A
 *	The array produced by build_tree(), containing parent index
 *	information for the non-leaf nodes of the Huffman tree. Each
 *	entry in this array is a node; a node's parent always has a
 *	greater index than that node itself. This function will
 *	overwrite the parent index information in this array, so
 *	essentially it will destroy the tree. However, the data in the
 *	low NUM_SYMBOL_BITS of each entry will be preserved.
 *
 * @root_idx
 *	The 0-based index of the root node in 'A', and consequently one
 *	less than the number of tree node entries in 'A'. (Or, really 2
 *	less than the actual length of 'A'.)
 *
 * @len_counts
 *	An array of length ('max_codeword_len' + 1) in which the number of
 *	codewords having each length <= max_codeword_len will be
 *	returned.
 *
 * @max_codeword_len
 *	The maximum permissible codeword length.
 */
static void
compute_length_counts(u32 A[restrict], unsigned root_idx,
		      unsigned len_counts[restrict], unsigned max_codeword_len)
{
	unsigned len;
	int node;

	/* The key observations are:
	 *
	 * (1) We can traverse the non-leaf nodes of the tree, always
	 * visiting a parent before its children, by simply iterating
	 * through the array in reverse order. Consequently, we can
	 * compute the depth of each node in one pass, overwriting the
	 * parent indices with depths.
	 *
	 * (2) We can initially assume that in the real Huffman tree,
	 * both children of the root are leaves. This corresponds to two
	 * codewords of length 1. Then, whenever we visit a (non-leaf)
	 * node during the traversal, we modify this assumption to
	 * account for the current node *not* being a leaf, but rather
	 * its two children being leaves. This causes the loss of one
	 * codeword for the current depth and the addition of two
	 * codewords for the current depth plus one.
	 *
	 * (3) We can handle the length-limited constraint fairly easily
	 * by simply using the largest length available when a depth
	 * exceeds max_codeword_len.
	 */

	for (len = 0; len <= max_codeword_len; len++)
		len_counts[len] = 0;
	len_counts[1] = 2;

	/* Set the root node's depth to 0. */
	A[root_idx] &= SYMBOL_MASK;

	for (node = root_idx - 1; node >= 0; node--) {

		/* Calculate the depth of this node. */

		unsigned parent = A[node] >> NUM_SYMBOL_BITS;
		unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
		unsigned depth = parent_depth + 1;
		unsigned len = depth;

		/* Set the depth of this node so that it is available
		 * when its children (if any) are processed. */

		A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);

		/* If needed, decrease the length to meet the
		 * length-limited constraint. This is not the optimal
		 * method for generating length-limited Huffman codes!
		 * But it should be good enough. */
		if (len >= max_codeword_len) {
			len = max_codeword_len;
			do {
				len--;
			} while (len_counts[len] == 0);
		}

		/* Account for the fact that we have a non-leaf node at
		 * the current depth. */
		len_counts[len]--;
		len_counts[len + 1] += 2;
	}
}

/*
 * Generate the codewords for a canonical Huffman code.
 *
 * @A
 *	The output array for codewords. In addition, initially this
 *	array must contain the symbols, sorted primarily by frequency and
 *	secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
 *	each entry.
 *
 * @len
 *	Output array for codeword lengths.
 *
 * @len_counts
 *	An array that provides the number of codewords that will have
 *	each possible length <= max_codeword_len.
 *
 * @max_codeword_len
 *	Maximum length, in bits, of each codeword.
 *
 * @num_syms
 *	Number of symbols in the alphabet, including symbols with zero
 *	frequency. This is the length of the 'A' and 'len' arrays.
 */
static void
gen_codewords(u32 A[restrict], u8 lens[restrict],
	      const unsigned len_counts[restrict],
	      unsigned max_codeword_len, unsigned num_syms)
{
	u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
	unsigned i;
	unsigned len;
	unsigned sym;

	/* Given the number of codewords that will have each length,
	 * assign codeword lengths to symbols. We do this by assigning
	 * the lengths in decreasing order to the symbols sorted
	 * primarily by increasing frequency and secondarily by
	 * increasing symbol value. */
	for (i = 0, len = max_codeword_len; len >= 1; len--) {
		unsigned count = len_counts[len];
		while (count--)
			lens[A[i++] & SYMBOL_MASK] = len;
	}

	/* Generate the codewords themselves. We initialize the
	 * 'next_codewords' array to provide the lexicographically first
	 * codeword of each length, then assign codewords in symbol
	 * order. This produces a canonical code. */
	next_codewords[0] = 0;
	next_codewords[1] = 0;
	for (len = 2; len <= max_codeword_len; len++)
		next_codewords[len] =
			(next_codewords[len - 1] + len_counts[len - 1]) << 1;

	for (sym = 0; sym < num_syms; sym++)
		A[sym] = next_codewords[lens[sym]]++;
}
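
/*
 * Worked example: if len_counts[] says there is one codeword of length 1 and
 * two of length 2, then next_codewords[] is initialized to {0, 0, 2}, since
 * (0 + 1) << 1 == 2. Assigning in symbol order then yields the canonical
 * codewords 0 (length 1), then 10 and 11 (length 2).
 */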

/*
 * ---------------------------------------------------------------------
 *			make_canonical_huffman_code()
 * ---------------------------------------------------------------------
 *
 * Given an alphabet and the frequency of each symbol in it, construct a
 * length-limited canonical Huffman code.
 *
 * @num_syms
 *	The number of symbols in the alphabet. The symbols are the
 *	integers in the range [0, num_syms - 1]. This parameter must be
 *	at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
 *
 * @max_codeword_len
 *	The maximum permissible codeword length.
 *
 * @freqs
 *	An array of @num_syms entries, each of which specifies the
 *	frequency of the corresponding symbol. It is valid for some,
 *	none, or all of the frequencies to be 0.
 *
 * @lens
 *	An array of @num_syms entries in which this function will return
 *	the length, in bits, of the codeword assigned to each symbol.
 *	Symbols with 0 frequency will not have codewords per se, but
 *	their entries in this array will be set to 0. No lengths greater
 *	than @max_codeword_len will be assigned.
 *
 * @codewords
 *	An array of @num_syms entries in which this function will return
 *	the codeword for each symbol, right-justified and padded on the
 *	left with zeroes. Codewords for symbols with 0 frequency will be
 *	undefined.
 *
 * ---------------------------------------------------------------------
 *
 * This function builds a length-limited canonical Huffman code.
 *
 * A length-limited Huffman code contains no codewords longer than some
 * specified length, and has exactly (with some algorithms) or
 * approximately (with the algorithm used here) the minimum weighted path
 * length from the root, given this constraint.
 *
 * A canonical Huffman code satisfies the properties that a longer
 * codeword never lexicographically precedes a shorter codeword, and the
 * lexicographic ordering of codewords of the same length is the same as
 * the lexicographic ordering of the corresponding symbols. A canonical
 * Huffman code, or more generally a canonical prefix code, can be
 * reconstructed from only a list containing the codeword length of each
 * symbol.
 *
 * The classic algorithm to generate a Huffman code creates a node for
 * each symbol, then inserts these nodes into a min-heap keyed by symbol
 * frequency. Then, repeatedly, the two lowest-frequency nodes are
 * removed from the min-heap and added as the children of a new node
 * having frequency equal to the sum of its two children, which is then
 * inserted into the min-heap. When only a single node remains in the
 * min-heap, it is the root of the Huffman tree. The codeword for each
 * symbol is determined by the path needed to reach the corresponding
 * node from the root. Descending to the left child appends a 0 bit,
 * whereas descending to the right child appends a 1 bit.
 *
 * The classic algorithm is relatively easy to understand, but it is
 * subject to a number of inefficiencies. In practice, it is fastest to
 * first sort the symbols by frequency. (This itself can be subject to
 * an optimization based on the fact that most frequencies tend to be
 * low.) At the same time, we sort secondarily by symbol value, which
 * aids the process of generating a canonical code. Then, during tree
 * construction, no heap is necessary because both the leaf nodes and the
 * unparented non-leaf nodes can be easily maintained in sorted order.
 * Consequently, there can never be more than two possibilities for the
 * next-lowest-frequency node.
 *
 * In addition, because we're generating a canonical code, we actually
 * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
 * This is because for canonical code generation we don't need to know
 * where the symbols are in the tree. Rather, we only need to know how
 * many leaf nodes have each depth (codeword length). And this
 * information can, in fact, be quickly generated from the tree of
 * non-leaves only.
 *
 * Furthermore, we can build this stripped-down Huffman tree directly in
 * the array in which the codewords are to be generated, provided that
 * these array slots are large enough to hold a symbol and frequency
 * value.
 *
 * Still furthermore, we don't even need to maintain explicit child
 * pointers. We only need the parent pointers, and even those can be
 * overwritten in-place with depth information as part of the process of
 * extracting codeword lengths from the tree. So in summary, we do NOT
 * need a big structure like:
 *
 *	struct huffman_tree_node {
 *		unsigned int symbol;
 *		unsigned int frequency;
 *		unsigned int depth;
 *		struct huffman_tree_node *left_child;
 *		struct huffman_tree_node *right_child;
 *	};
 *
 * ... which often gets used in "naive" implementations of Huffman code
 * generation.
 *
 * Many of these optimizations are based on the implementation in 7-Zip
 * (source file: C/HuffEnc.c), which has been placed in the public domain
 * by Igor Pavlov.
 */
static void
make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
			    const u32 freqs[restrict],
			    u8 lens[restrict], u32 codewords[restrict])
{
	u32 *A = codewords;
	unsigned num_used_syms;

	STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);

	/* We begin by sorting the symbols primarily by frequency and
	 * secondarily by symbol value. As an optimization, the array
	 * used for this purpose ('A') shares storage with the space in
	 * which we will eventually return the codewords. */

	num_used_syms = sort_symbols(num_syms, freqs, lens, A);

	/* 'num_used_syms' is the number of symbols with nonzero
	 * frequency. This may be less than @num_syms. 'num_used_syms'
	 * is also the number of entries in 'A' that are valid. Each
	 * entry consists of a distinct symbol and a nonzero frequency
	 * packed into a 32-bit integer. */

	/* Handle special cases where only 0 or 1 symbols were used (had
	 * nonzero frequency). */

	if (unlikely(num_used_syms == 0)) {
		/* Code is empty. sort_symbols() already set all lengths
		 * to 0, so there is nothing more to do. */
		return;
	}

	if (unlikely(num_used_syms == 1)) {
		/* Only one symbol was used, so we only need one
		 * codeword. But two codewords are needed to form the
		 * smallest complete Huffman code, which uses codewords 0
		 * and 1. Therefore, we choose another symbol to which
		 * to assign a codeword. We use 0 (if the used symbol is
		 * not 0) or 1 (if the used symbol is 0). In either
		 * case, the lesser-valued symbol must be assigned
		 * codeword 0 so that the resulting code is canonical. */

		unsigned sym = A[0] & SYMBOL_MASK;
		unsigned nonzero_idx = sym ? sym : 1;

		codewords[0] = 0;
		lens[0] = 1;
		codewords[nonzero_idx] = 1;
		lens[nonzero_idx] = 1;
		return;
	}

	/* Build a stripped-down version of the Huffman tree, sharing the
	 * array 'A' with the symbol values. Then extract length counts
	 * from the tree and use them to generate the final codewords. */

	build_tree(A, num_used_syms);

	{
		unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];

		compute_length_counts(A, num_used_syms - 2,
				      len_counts, max_codeword_len);

		gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
	}
}

/*
 * Clear the Huffman symbol frequency counters.
 * This must be called when starting a new DEFLATE block.
 */
static void
deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
{
	memset(&c->freqs, 0, sizeof(c->freqs));
}

/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */
static u32
deflate_reverse_codeword(u32 codeword, u8 len)
{
	/* The following branchless algorithm is faster than going bit by bit.
	 * Note: since no codewords are longer than 16 bits, we only need to
	 * reverse the low 16 bits of the 'u32'. */
	STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);

	/* Flip adjacent 1-bit fields */
	codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);

	/* Flip adjacent 2-bit fields */
	codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);

	/* Flip adjacent 4-bit fields */
	codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);

	/* Flip adjacent 8-bit fields */
	codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);

	/* Return the high 'len' bits of the bit-reversed 16 bit value. */
	return codeword >> (16 - len);
}
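
/*
 * Example: reversing the 3-bit codeword 0b110 yields 0b011. The four flip
 * steps bit-reverse the full 16-bit value (0x0006 becomes 0x6000), and the
 * final shift by 16 - 3 == 13 keeps just the reversed 3 bits.
 */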

/* Make a canonical Huffman code with bit-reversed codewords. */
static void
deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
			  const u32 freqs[], u8 lens[], u32 codewords[])
{
	unsigned sym;

	make_canonical_huffman_code(num_syms, max_codeword_len,
				    freqs, lens, codewords);

	for (sym = 0; sym < num_syms; sym++)
		codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]);
}

/*
 * Build the literal/length and offset Huffman codes for a DEFLATE block.
 *
 * This takes as input the frequency tables for each code and produces as output
 * a set of tables that map symbols to codewords and codeword lengths.
 */
static void
deflate_make_huffman_codes(const struct deflate_freqs *freqs,
			   struct deflate_codes *codes)
{
	STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
	STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);

	deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
				  MAX_LITLEN_CODEWORD_LEN,
				  freqs->litlen,
				  codes->lens.litlen,
				  codes->codewords.litlen);

	deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
				  MAX_OFFSET_CODEWORD_LEN,
				  freqs->offset,
				  codes->lens.offset,
				  codes->codewords.offset);
}

/* Initialize c->static_codes. */
static void
deflate_init_static_codes(struct libdeflate_compressor *c)
{
	unsigned i;

	for (i = 0; i < 144; i++)
		c->freqs.litlen[i] = 1 << (9 - 8);
	for (; i < 256; i++)
		c->freqs.litlen[i] = 1 << (9 - 9);
	for (; i < 280; i++)
		c->freqs.litlen[i] = 1 << (9 - 7);
	for (; i < 288; i++)
		c->freqs.litlen[i] = 1 << (9 - 8);

	for (i = 0; i < 32; i++)
		c->freqs.offset[i] = 1 << (5 - 5);

	deflate_make_huffman_codes(&c->freqs, &c->static_codes);
}

/* Return the offset slot for the specified match offset. */
static forceinline unsigned
deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
{
#if USE_FULL_OFFSET_SLOT_FAST
	return c->offset_slot_fast[offset];
#else
	if (offset <= 256)
		return c->offset_slot_fast[offset - 1];
	else
		return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
#endif
}
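
/*
 * With the condensed table, for example, offset 300 is looked up at index
 * 256 + ((300 - 1) >> 7) == 258. This works because, as noted in the
 * offset_slot_fast comment above, every offset slot past offset 256 spans at
 * least 128 offsets, so a stride of 128 never skips over a slot boundary.
 */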

/* Write the header fields common to all DEFLATE block types. */
static void
deflate_write_block_header(struct deflate_output_bitstream *os,
			   bool is_final_block, unsigned block_type)
{
	deflate_add_bits(os, is_final_block, 1);
	deflate_add_bits(os, block_type, 2);
	deflate_flush_bits(os);
}

static unsigned
deflate_compute_precode_items(const u8 lens[restrict],
			      const unsigned num_lens,
			      u32 precode_freqs[restrict],
			      unsigned precode_items[restrict])
{
	unsigned *itemptr;
	unsigned run_start;
	unsigned run_end;
	unsigned extra_bits;
	u8 len;

	memset(precode_freqs, 0,
	       DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));

	itemptr = precode_items;
	run_start = 0;
	do {
		/* Find the next run of codeword lengths. */

		/* len = the length being repeated */
		len = lens[run_start];

		/* Extend the run. */
		run_end = run_start;
		do {
			run_end++;
		} while (run_end != num_lens && len == lens[run_end]);

		if (len == 0) {
			/* Run of zeroes. */

			/* Symbol 18: RLE 11 to 138 zeroes at a time. */
			while ((run_end - run_start) >= 11) {
				extra_bits = MIN((run_end - run_start) - 11, 0x7F);
				precode_freqs[18]++;
				*itemptr++ = 18 | (extra_bits << 5);
				run_start += 11 + extra_bits;
			}

			/* Symbol 17: RLE 3 to 10 zeroes at a time. */
			if ((run_end - run_start) >= 3) {
				extra_bits = MIN((run_end - run_start) - 3, 0x7);
				precode_freqs[17]++;
				*itemptr++ = 17 | (extra_bits << 5);
				run_start += 3 + extra_bits;
			}
		} else {

			/* A run of nonzero lengths. */

			/* Symbol 16: RLE 3 to 6 of the previous length. */
			if ((run_end - run_start) >= 4) {
				precode_freqs[len]++;
				*itemptr++ = len;
				run_start++;
				do {
					extra_bits = MIN((run_end - run_start) - 3, 0x3);
					precode_freqs[16]++;
					*itemptr++ = 16 | (extra_bits << 5);
					run_start += 3 + extra_bits;
				} while ((run_end - run_start) >= 3);
			}
		}

		/* Output any remaining lengths without RLE. */
		while (run_start != run_end) {
			precode_freqs[len]++;
			*itemptr++ = len;
			run_start++;
		}
	} while (run_start != num_lens);

	return itemptr - precode_items;
}
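
/*
 * For example, a run of 20 zero lengths is handled by the symbol 18 case:
 * extra_bits = MIN(20 - 11, 0x7F) == 9, so the single item 18 | (9 << 5) is
 * emitted, encoding "repeat zero 11 + 9 == 20 times".
 */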

/*
 * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
 * separate Huffman code, the "precode", which contains a symbol for each
 * possible codeword length in the larger code as well as several special
 * symbols to represent repeated codeword lengths (a form of run-length
 * encoding). The precode is itself constructed in canonical form, and its
 * codeword lengths are represented literally in 19 3-bit fields that
 * immediately precede the compressed codeword lengths of the larger code.
 */
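
/*
 * Concretely, deflate_write_huffman_header() below emits:
 *
 *	5 bits: num_litlen_syms - 257
 *	5 bits: num_offset_syms - 1
 *	4 bits: num_explicit_lens - 4
 *	3 bits per explicit precode codeword length, in the permuted order
 *	given by deflate_precode_lens_permutation[]
 *
 * followed by the precode-encoded lengths of the larger codes.
 */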
|
|
|
|
/* Precompute the information needed to output Huffman codes. */
|
|
static void
|
|
deflate_precompute_huffman_header(struct libdeflate_compressor *c)
|
|
{
|
|
/* Compute how many litlen and offset symbols are needed. */
|
|
|
|
for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
|
|
c->num_litlen_syms > 257;
|
|
c->num_litlen_syms--)
|
|
if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
|
|
break;
|
|
|
|
for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
|
|
c->num_offset_syms > 1;
|
|
c->num_offset_syms--)
|
|
if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
|
|
break;
|
|
|
|
/* If we're not using the full set of literal/length codeword lengths,
|
|
* then temporarily move the offset codeword lengths over so that the
|
|
* literal/length and offset codeword lengths are contiguous. */
|
|
|
|
STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
|
|
DEFLATE_NUM_LITLEN_SYMS);
|
|
|
|
if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
|
|
memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
|
|
(u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
|
|
c->num_offset_syms);
|
|
}
|
|
|
|
/* Compute the "items" (RLE / literal tokens and extra bits) with which
|
|
* the codeword lengths in the larger code will be output. */
|
|
c->num_precode_items =
|
|
deflate_compute_precode_items((u8 *)&c->codes.lens,
|
|
c->num_litlen_syms +
|
|
c->num_offset_syms,
|
|
c->precode_freqs,
|
|
c->precode_items);
|
|
|
|
/* Build the precode. */
|
|
STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
|
|
deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
|
|
MAX_PRE_CODEWORD_LEN,
|
|
c->precode_freqs, c->precode_lens,
|
|
c->precode_codewords);
|
|
|
|
/* Count how many precode lengths we actually need to output. */
|
|
for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
|
|
c->num_explicit_lens > 4;
|
|
c->num_explicit_lens--)
|
|
if (c->precode_lens[deflate_precode_lens_permutation[
|
|
c->num_explicit_lens - 1]] != 0)
|
|
break;
|
|
|
|
/* Restore the offset codeword lengths if needed. */
|
|
if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
|
|
memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
|
|
(u8 *)&c->codes.lens + c->num_litlen_syms,
|
|
c->num_offset_syms);
|
|
}
|
|
}
|
|
|
|
/* Output the Huffman codes. */
|
|
static void
|
|
deflate_write_huffman_header(struct libdeflate_compressor *c,
|
|
struct deflate_output_bitstream *os)
|
|
{
|
|
unsigned i;
|
|
|
|
deflate_add_bits(os, c->num_litlen_syms - 257, 5);
|
|
deflate_add_bits(os, c->num_offset_syms - 1, 5);
|
|
deflate_add_bits(os, c->num_explicit_lens - 4, 4);
|
|
deflate_flush_bits(os);
|
|
|
|
/* Output the lengths of the codewords in the precode. */
|
|
for (i = 0; i < c->num_explicit_lens; i++) {
|
|
deflate_add_bits(os, c->precode_lens[
|
|
deflate_precode_lens_permutation[i]], 3);
|
|
deflate_flush_bits(os);
|
|
}
|
|
|
|
/* Output the encoded lengths of the codewords in the larger code. */
|
|
for (i = 0; i < c->num_precode_items; i++) {
|
|
unsigned precode_item = c->precode_items[i];
|
|
unsigned precode_sym = precode_item & 0x1F;
|
|
deflate_add_bits(os, c->precode_codewords[precode_sym],
|
|
c->precode_lens[precode_sym]);
|
|
if (precode_sym >= 16) {
|
|
if (precode_sym == 16)
|
|
deflate_add_bits(os, precode_item >> 5, 2);
|
|
else if (precode_sym == 17)
|
|
deflate_add_bits(os, precode_item >> 5, 3);
|
|
else
|
|
deflate_add_bits(os, precode_item >> 5, 7);
|
|
}
|
|
STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7));
|
|
deflate_flush_bits(os);
|
|
}
|
|
}

static void
deflate_write_sequences(struct deflate_output_bitstream * restrict os,
			const struct deflate_codes * restrict codes,
			const struct deflate_sequence sequences[restrict],
			const u8 * restrict in_next)
{
	const struct deflate_sequence *seq = sequences;

	for (;;) {
		u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
		unsigned length = seq->litrunlen_and_length >> 23;
		unsigned length_slot;
		unsigned litlen_symbol;
		unsigned offset_symbol;

		if (litrunlen) {
#if 1
			while (litrunlen >= 4) {
				unsigned lit0 = in_next[0];
				unsigned lit1 = in_next[1];
				unsigned lit2 = in_next[2];
				unsigned lit3 = in_next[3];

				deflate_add_bits(os, codes->codewords.litlen[lit0],
						 codes->lens.litlen[lit0]);
				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit1],
						 codes->lens.litlen[lit1]);
				if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit2],
						 codes->lens.litlen[lit2]);
				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit3],
						 codes->lens.litlen[lit3]);
				deflate_flush_bits(os);
				in_next += 4;
				litrunlen -= 4;
			}
			if (litrunlen-- != 0) {
				deflate_add_bits(os, codes->codewords.litlen[*in_next],
						 codes->lens.litlen[*in_next]);
				if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);
				in_next++;
				if (litrunlen-- != 0) {
					deflate_add_bits(os, codes->codewords.litlen[*in_next],
							 codes->lens.litlen[*in_next]);
					if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
						deflate_flush_bits(os);
					in_next++;
					if (litrunlen-- != 0) {
						deflate_add_bits(os, codes->codewords.litlen[*in_next],
								 codes->lens.litlen[*in_next]);
						if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
							deflate_flush_bits(os);
						in_next++;
					}
				}
				if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);
			}
#else
			do {
				unsigned lit = *in_next++;
				deflate_add_bits(os, codes->codewords.litlen[lit],
						 codes->lens.litlen[lit]);
				deflate_flush_bits(os);
			} while (--litrunlen);
#endif
		}

		if (length == 0)
			return;

		in_next += length;

		length_slot = seq->length_slot;
		litlen_symbol = DEFLATE_FIRST_LEN_SYM + length_slot;

		/* Litlen symbol */
		deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
				 codes->lens.litlen[litlen_symbol]);

		/* Extra length bits */
		STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
					 DEFLATE_MAX_EXTRA_LENGTH_BITS));
		deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
				 deflate_extra_length_bits[length_slot]);

		if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_LENGTH_BITS +
				MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			deflate_flush_bits(os);

		/* Offset symbol */
		offset_symbol = seq->offset_symbol;
		deflate_add_bits(os, codes->codewords.offset[offset_symbol],
				 codes->lens.offset[offset_symbol]);

		if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			deflate_flush_bits(os);

		/* Extra offset bits */
		deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol],
				 deflate_extra_offset_bits[offset_symbol]);

		deflate_flush_bits(os);

		seq++;
	}
}

#if SUPPORT_NEAR_OPTIMAL_PARSING
/*
 * Follow the minimum-cost path in the graph of possible match/literal choices
 * for the current block and write out the matches/literals using the specified
 * Huffman codes.
 *
 * Note: this is slightly duplicated with deflate_write_sequences(), the reason
 * being that we don't want to waste time translating between intermediate
 * match/literal representations.
 */
static void
deflate_write_item_list(struct deflate_output_bitstream *os,
			const struct deflate_codes *codes,
			struct libdeflate_compressor *c,
			u32 block_length)
{
	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
	struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length];
	do {
		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
		unsigned litlen_symbol;
		unsigned length_slot;
		unsigned offset_slot;

		if (length == 1) {
			/* Literal */
			litlen_symbol = offset;
			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
					 codes->lens.litlen[litlen_symbol]);
			deflate_flush_bits(os);
		} else {
			/* Match length */
			length_slot = deflate_length_slot[length];
			litlen_symbol = DEFLATE_FIRST_LEN_SYM + length_slot;
			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
					 codes->lens.litlen[litlen_symbol]);

			deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
					 deflate_extra_length_bits[length_slot]);

			if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_LENGTH_BITS +
					MAX_OFFSET_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_OFFSET_BITS))
				deflate_flush_bits(os);

			/* Match offset */
			offset_slot = deflate_get_offset_slot(c, offset);
			deflate_add_bits(os, codes->codewords.offset[offset_slot],
					 codes->lens.offset[offset_slot]);

			if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_OFFSET_BITS))
				deflate_flush_bits(os);

			deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot],
					 deflate_extra_offset_bits[offset_slot]);

			deflate_flush_bits(os);
		}
		cur_node += length;
	} while (cur_node != end_node);
}
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/* Output the end-of-block symbol. */
static void
deflate_write_end_of_block(struct deflate_output_bitstream *os,
			   const struct deflate_codes *codes)
{
	deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
			 codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
	deflate_flush_bits(os);
}

static void
deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
				 const u8 *data, u16 len,
				 bool is_final_block)
{
	deflate_write_block_header(os, is_final_block,
				   DEFLATE_BLOCKTYPE_UNCOMPRESSED);
	deflate_align_bitstream(os);

	if (4 + (u32)len >= os->end - os->next) {
		os->next = os->end;
		return;
	}

	put_unaligned_le16(len, os->next);
	os->next += 2;
	put_unaligned_le16(~len, os->next);
	os->next += 2;
	memcpy(os->next, data, len);
	os->next += len;
}

static void
deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
				  const u8 *data, size_t data_length,
				  bool is_final_block)
{
	do {
		u16 len = MIN(data_length, UINT16_MAX);

		deflate_write_uncompressed_block(os, data, len,
					is_final_block && len == data_length);
		data += len;
		data_length -= len;
	} while (data_length != 0);
}
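
/*
 * Illustrative example of the stored-block framing above: a block of 300
 * bytes is framed as the little-endian LEN field 0x012C (bytes 2C 01)
 * followed by its one's complement NLEN = 0xFED3 (bytes D3 FE), then the 300
 * raw bytes.
 */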

/*
 * Choose the best type of block to use (dynamic Huffman, static Huffman, or
 * uncompressed), then output it.
 */
static void
deflate_flush_block(struct libdeflate_compressor * restrict c,
		    struct deflate_output_bitstream * restrict os,
		    const u8 * restrict block_begin, u32 block_length,
		    const struct deflate_sequence *sequences,
		    bool is_final_block)
{
	static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
	};

	/* Costs are measured in bits */
	u32 dynamic_cost = 0;
	u32 static_cost = 0;
	u32 uncompressed_cost = 0;
	struct deflate_codes *codes;
	int block_type;
	unsigned sym;

	if (sequences != NULL /* !near_optimal */ ||
	    !SUPPORT_NEAR_OPTIMAL_PARSING) {
		/* Tally the end-of-block symbol. */
		c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;

		/* Build dynamic Huffman codes. */
		deflate_make_huffman_codes(&c->freqs, &c->codes);
	} /* Else, this was already done */

	/* Account for the cost of sending dynamic Huffman codes. */
	deflate_precompute_huffman_header(c);
	dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
	for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
		u32 extra = deflate_extra_precode_bits[sym];
		dynamic_cost += c->precode_freqs[sym] *
				(extra + c->precode_lens[sym]);
	}

	/* Account for the cost of encoding literals. */
	for (sym = 0; sym < 256; sym++) {
		dynamic_cost += c->freqs.litlen[sym] *
				c->codes.lens.litlen[sym];
	}
	for (sym = 0; sym < 144; sym++)
		static_cost += c->freqs.litlen[sym] * 8;
	for (; sym < 256; sym++)
		static_cost += c->freqs.litlen[sym] * 9;

	/* Account for the cost of encoding the end-of-block symbol. */
	dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK];
	static_cost += 7;

	/* Account for the cost of encoding lengths. */
	for (sym = DEFLATE_FIRST_LEN_SYM;
	     sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits);
	     sym++) {
		u32 extra = deflate_extra_length_bits[
					sym - DEFLATE_FIRST_LEN_SYM];
		dynamic_cost += c->freqs.litlen[sym] *
				(extra + c->codes.lens.litlen[sym]);
		static_cost += c->freqs.litlen[sym] *
				(extra + c->static_codes.lens.litlen[sym]);
	}

	/* Account for the cost of encoding offsets. */
	for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
		u32 extra = deflate_extra_offset_bits[sym];
		dynamic_cost += c->freqs.offset[sym] *
				(extra + c->codes.lens.offset[sym]);
		static_cost += c->freqs.offset[sym] * (extra + 5);
	}

	/* Compute the cost of using uncompressed blocks. */
	uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
			     (40 * (DIV_ROUND_UP(block_length,
						 UINT16_MAX) - 1)) +
			     (8 * block_length);

	/* Choose the cheapest block type. */
	if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
		block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
		codes = &c->codes;
	} else if (static_cost < uncompressed_cost) {
		block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
		codes = &c->static_codes;
	} else {
		block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
	}

	/* Now actually output the block. */

	if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
		/* Note: the length being flushed may exceed the maximum length
		 * of an uncompressed block (65535 bytes). Therefore, more than
		 * one uncompressed block might be needed. */
		deflate_write_uncompressed_blocks(os, block_begin, block_length,
						  is_final_block);
	} else {
		/* Output the block header. */
		deflate_write_block_header(os, is_final_block, block_type);

		/* Output the Huffman codes (dynamic Huffman blocks only). */
		if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
			deflate_write_huffman_header(c, os);

		/* Output the literals, matches, and end-of-block symbol. */
#if SUPPORT_NEAR_OPTIMAL_PARSING
		if (sequences == NULL)
			deflate_write_item_list(os, codes, c, block_length);
		else
#endif
			deflate_write_sequences(os, codes, sequences,
						block_begin);
		deflate_write_end_of_block(os, codes);
	}
}

static void
deflate_begin_sequences(struct libdeflate_compressor *c,
			struct deflate_sequence *first_seq)
{
	deflate_reset_symbol_frequencies(c);
	first_seq->litrunlen_and_length = 0;
}

static forceinline void
deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
		       struct deflate_sequence *seq)
{
	c->freqs.litlen[literal]++;
	seq->litrunlen_and_length++;
}

static forceinline void
deflate_choose_match(struct libdeflate_compressor *c,
		     unsigned length, unsigned offset,
		     struct deflate_sequence **seq_p)
{
	struct deflate_sequence *seq = *seq_p;
	unsigned length_slot = deflate_length_slot[length];
	unsigned offset_slot = deflate_get_offset_slot(c, offset);

	c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++;
	c->freqs.offset[offset_slot]++;

	seq->litrunlen_and_length |= (u32)length << 23;
	seq->offset = offset;
	seq->length_slot = length_slot;
	seq->offset_symbol = offset_slot;

	seq++;
	seq->litrunlen_and_length = 0;
	*seq_p = seq;
}
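
/*
 * Illustrative example of the 'litrunlen_and_length' packing used above: a
 * run of 5 literals followed by a match of length 10 is stored as
 * (10 << 23) | 5, i.e. the literal run length occupies the low 23 bits and
 * the match length the high bits, which deflate_write_sequences() later
 * unpacks with '& 0x7FFFFF' and '>> 23'.
 */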

/******************************************************************************/

/*
 * Block splitting algorithm. The problem is to decide when it is worthwhile to
 * start a new block with new Huffman codes. There is a theoretically optimal
 * solution: recursively consider every possible block split, considering the
 * exact cost of each block, and choose the minimum cost approach. But this is
 * far too slow. Instead, as an approximation, we can count symbols and after
 * every N symbols, compare the expected distribution of symbols based on the
 * previous data with the actual distribution. If they differ "by enough", then
 * start a new block.
 *
 * As an optimization and heuristic, we don't distinguish between every symbol
 * but rather we combine many symbols into a single "observation type". For
 * literals we only look at the high bits and low bits, and for matches we only
 * look at whether the match is long or not. The assumption is that for typical
 * "real" data, places that are good block boundaries will tend to be noticeable
 * based only on changes in these aggregate frequencies, without looking for
 * subtle differences in individual symbols. For example, a change from ASCII
 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
 * to many matches (generally more compressible), would be easily noticed based
 * on the aggregates.
 *
 * For determining whether the frequency distributions are "different enough" to
 * start a new block, the simple heuristic of splitting when the sum of absolute
 * differences exceeds a constant seems to be good enough. We also add a number
 * proportional to the block length so that the algorithm is more likely to end
 * long blocks than short blocks. This reflects the general expectation that it
 * will become increasingly beneficial to start a new block as the current
 * block grows longer.
 *
 * Finally, for an approximation, it is not strictly necessary that the exact
 * symbols being used are considered. With "near-optimal parsing", for example,
 * the actual symbols that will be used are unknown until after the block
 * boundary is chosen and the block has been optimized. Since the final choices
 * cannot be used, we can use preliminary "greedy" choices instead.
 */

/* Initialize the block split statistics when starting a new block. */
static void
init_block_split_stats(struct block_split_stats *stats)
{
	int i;

	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->new_observations[i] = 0;
		stats->observations[i] = 0;
	}
	stats->num_new_observations = 0;
	stats->num_observations = 0;
}

/* Literal observation. Heuristic: use the top 2 bits and lowest bit of the
 * literal, for 8 possible literal observation types. */
static forceinline void
observe_literal(struct block_split_stats *stats, u8 lit)
{
	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
	stats->num_new_observations++;
}

/* Match observation. Heuristic: use one observation type for "short match" and
 * one observation type for "long match". */
static forceinline void
observe_match(struct block_split_stats *stats, unsigned length)
{
	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
	stats->num_new_observations++;
}

static void
merge_new_observations(struct block_split_stats *stats)
{
	int i;

	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->observations[i] += stats->new_observations[i];
		stats->new_observations[i] = 0;
	}
	stats->num_observations += stats->num_new_observations;
	stats->num_new_observations = 0;
}

static bool
do_end_block_check(struct block_split_stats *stats, u32 block_length)
{
	if (stats->num_observations > 0) {
		/* Note: to avoid slow divisions, we do not divide by
		 * 'num_observations', but rather do all math with the numbers
		 * multiplied by 'num_observations'. */
		u32 total_delta = 0;
		int i;

		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
			u32 expected = stats->observations[i] * stats->num_new_observations;
			u32 actual = stats->new_observations[i] * stats->num_observations;
			u32 delta = (actual > expected) ? actual - expected :
							  expected - actual;
			total_delta += delta;
		}

		/* Ready to end the block? */
		if (total_delta + (block_length / 4096) * stats->num_observations >=
		    NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
			return true;
	}
	merge_new_observations(stats);
	return false;
}
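
/*
 * Illustrative reading of the check above, assuming
 * NUM_OBSERVATIONS_PER_BLOCK_CHECK is 512 (its definition is elsewhere in
 * this file): the threshold reduces to 200 * num_observations, i.e. roughly
 * 200 of every 512 new observations (about 39%) must deviate from the scaled
 * expectation before the block ends, with the (block_length / 4096) term
 * gradually lowering that bar as the block grows.
 */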

static forceinline bool
should_end_block(struct block_split_stats *stats,
		 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
{
	/* Ready to check block split statistics? */
	if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
	    in_next - in_block_begin < MIN_BLOCK_LENGTH ||
	    in_end - in_next < MIN_BLOCK_LENGTH)
		return false;

	return do_end_block_check(stats, in_next - in_block_begin);
}

/******************************************************************************/

/*
 * Decrease the maximum and nice match lengths if we're approaching the end of
 * the input buffer.
 */
static forceinline void
adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining)
{
	if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) {
		*max_len = remaining;
		*nice_len = MIN(*nice_len, *max_len);
	}
}

/*
 * Choose the minimum match length for the greedy and lazy parsers.
 *
 * By default the minimum match length is 3, which is the smallest length the
 * DEFLATE format allows. However, with greedy and lazy parsing, some data
 * (e.g. DNA sequencing data) benefits greatly from a longer minimum length.
 * Typically, this is because literals are very cheap. In general, the
 * near-optimal parser handles this case naturally, but the greedy and lazy
 * parsers need a heuristic to decide when to use short matches.
 *
 * The heuristic we use is to make the minimum match length depend on the number
 * of different literals that exist in the data. If there are many different
 * literals, then literals will probably be expensive, so short matches will
 * probably be worthwhile. Conversely, if not many literals are used, then
 * probably literals will be cheap and short matches won't be worthwhile.
 */
static unsigned
choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth)
{
	/* map from num_used_literals to min_len */
	static const u8 min_lens[] = {
		9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6,
		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4,
		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
		/* the rest is implicitly 3 */
	};
	unsigned min_len;

	STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3);
	STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1);

	if (num_used_literals >= ARRAY_LEN(min_lens))
		return 3;
	min_len = min_lens[num_used_literals];
	/*
	 * With a low max_search_depth, it may be too hard to find long matches.
	 */
	if (max_search_depth < 16) {
		if (max_search_depth < 5)
			min_len = MIN(min_len, 4);
		else if (max_search_depth < 10)
			min_len = MIN(min_len, 5);
		else
			min_len = MIN(min_len, 7);
	}
	return min_len;
}
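
/*
 * Illustrative example: data that uses only 4 distinct byte values (e.g. the
 * bases of a DNA sequence) maps to min_lens[4] == 9, so matches shorter than
 * 9 bytes are passed over, whereas input using 80 or more distinct byte
 * values (typical text or binary data) falls past the end of the table and
 * keeps the DEFLATE minimum of 3.
 */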

static unsigned
calculate_min_match_len(const u8 *data, size_t data_len,
			unsigned max_search_depth)
{
	u8 used[256] = { 0 };
	unsigned num_used_literals = 0;
	int i;

	/*
	 * For an initial approximation, scan the first 4 KiB of data.
	 * recalculate_min_match_len() will update the min_len later.
	 */
	data_len = MIN(data_len, 4096);
	for (i = 0; i < data_len; i++)
		used[data[i]] = 1;
	for (i = 0; i < 256; i++)
		num_used_literals += used[i];
	return choose_min_match_len(num_used_literals, max_search_depth);
}

/*
 * Recalculate the minimum match length for a block, now that we know the
 * distribution of literals that are actually being used (freqs->litlen).
 */
static unsigned
recalculate_min_match_len(const struct deflate_freqs *freqs,
			  unsigned max_search_depth)
{
	u32 literal_freq = 0;
	u32 cutoff;
	unsigned num_used_literals = 0;
	int i;

	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
		literal_freq += freqs->litlen[i];

	cutoff = literal_freq >> 10; /* Ignore literals used very rarely */

	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
		if (freqs->litlen[i] > cutoff)
			num_used_literals++;
	}
	return choose_min_match_len(num_used_literals, max_search_depth);
}

/*
 * This is the level 0 "compressor". It always outputs uncompressed blocks.
 */
static size_t
deflate_compress_none(struct libdeflate_compressor * restrict c,
		      const u8 * restrict in, size_t in_nbytes,
		      u8 * restrict out, size_t out_nbytes_avail)
{
	struct deflate_output_bitstream os;

	deflate_init_output(&os, out, out_nbytes_avail);

	deflate_write_uncompressed_blocks(&os, in, in_nbytes, true);

	return deflate_flush_output(&os);
}

/*
 * This is a faster variant of deflate_compress_greedy(). It uses the
 * ht_matchfinder rather than the hc_matchfinder. It also skips the block
 * splitting algorithm and just uses fixed length blocks. c->max_search_depth
 * has no effect with this algorithm, as it is hardcoded in ht_matchfinder.h.
 */
static size_t
deflate_compress_fastest(struct libdeflate_compressor * restrict c,
			 const u8 * restrict in, size_t in_nbytes,
			 u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hash = 0;

	deflate_init_output(&os, out, out_nbytes_avail);
	ht_matchfinder_init(&c->p.f.ht_mf);

	do {
		/* Starting a new DEFLATE block. */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, FAST_BLOCK_LENGTH);
		struct deflate_sequence *seq = c->p.f.sequences;

		deflate_begin_sequences(c, seq);

		do {
			u32 length;
			u32 offset;
			size_t remaining = in_end - in_next;

			if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) {
				max_len = remaining;
				if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) {
					do {
						deflate_choose_literal(
							c, *in_next++, seq);
					} while (--max_len);
					break;
				}
				nice_len = MIN(nice_len, max_len);
			}
			length = ht_matchfinder_longest_match(&c->p.f.ht_mf,
							      &in_cur_base,
							      in_next,
							      max_len,
							      nice_len,
							      &next_hash,
							      &offset);
			if (length) {
				/* Match found. */
				deflate_choose_match(c, length, offset, &seq);
				ht_matchfinder_skip_bytes(&c->p.f.ht_mf,
							  &in_cur_base,
							  in_next + 1,
							  in_end,
							  length - 1,
							  &next_hash);
				in_next += length;
			} else {
				/* No match found. */
				deflate_choose_literal(c, *in_next++, seq);
			}

			/* Check if it's time to output another block. */
		} while (in_next < in_max_block_end);

		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    c->p.f.sequences, in_next == in_end);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}

/*
 * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
 */
static size_t
deflate_compress_greedy(struct libdeflate_compressor * restrict c,
			const u8 * restrict in, size_t in_nbytes,
			u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	hc_matchfinder_init(&c->p.g.hc_mf);

	do {
		/* Starting a new DEFLATE block. */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		unsigned min_len;
		struct deflate_sequence *seq = c->p.g.sequences;

		init_block_split_stats(&c->split_stats);
		deflate_begin_sequences(c, seq);

		min_len = calculate_min_match_len(in_next,
						  in_max_block_end - in_next,
						  c->max_search_depth);

		do {
			u32 length;
			u32 offset;

			adjust_max_and_nice_len(&max_len, &nice_len,
						in_end - in_next);
			length = hc_matchfinder_longest_match(
						&c->p.g.hc_mf,
						&in_cur_base,
						in_next,
						min_len - 1,
						max_len,
						nice_len,
						c->max_search_depth,
						next_hashes,
						&offset);

			if (length >= min_len &&
			    (length > DEFLATE_MIN_MATCH_LEN ||
			     offset <= 4096)) {
				/* Match found. */
				deflate_choose_match(c, length, offset, &seq);
				observe_match(&c->split_stats, length);
				hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
							  &in_cur_base,
							  in_next + 1,
							  in_end,
							  length - 1,
							  next_hashes);
				in_next += length;
			} else {
				/* No match found. */
				deflate_choose_literal(c, *in_next, seq);
				observe_literal(&c->split_stats, *in_next);
				in_next++;
			}

			/* Check if it's time to output another block. */
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats,
					   in_block_begin, in_next, in_end));

		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    c->p.g.sequences, in_next == in_end);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}

static forceinline size_t
deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
			      const u8 * restrict in, size_t in_nbytes,
			      u8 * restrict out, size_t out_nbytes_avail,
			      bool lazy2)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	hc_matchfinder_init(&c->p.g.hc_mf);

	do {
		/* Starting a new DEFLATE block. */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		const u8 *next_recalc_min_len =
			in_next + MIN(in_end - in_next, 10000);
		unsigned min_len = DEFLATE_MIN_MATCH_LEN;
		struct deflate_sequence *seq = c->p.g.sequences;

		init_block_split_stats(&c->split_stats);
		deflate_begin_sequences(c, seq);

		min_len = calculate_min_match_len(in_next,
						  in_max_block_end - in_next,
						  c->max_search_depth);
		do {
			unsigned cur_len;
			unsigned cur_offset;
			unsigned next_len;
			unsigned next_offset;

			/*
			 * Recalculate the minimum match length if it hasn't
			 * been done recently.
			 */
			if (in_next >= next_recalc_min_len) {
				min_len = recalculate_min_match_len(
						&c->freqs,
						c->max_search_depth);
				next_recalc_min_len +=
					MIN(in_end - next_recalc_min_len,
					    in_next - in_block_begin);
			}

			/* Find the longest match at the current position. */
			adjust_max_and_nice_len(&max_len, &nice_len,
						in_end - in_next);
			cur_len = hc_matchfinder_longest_match(
						&c->p.g.hc_mf,
						&in_cur_base,
						in_next,
						min_len - 1,
						max_len,
						nice_len,
						c->max_search_depth,
						next_hashes,
						&cur_offset);
			if (cur_len < min_len ||
			    (cur_len == DEFLATE_MIN_MATCH_LEN &&
			     cur_offset > 8192)) {
				/* No match found. Choose a literal. */
				deflate_choose_literal(c, *in_next, seq);
				observe_literal(&c->split_stats, *in_next);
				in_next++;
				continue;
			}
			in_next++;

have_cur_match:
			observe_match(&c->split_stats, cur_len);
			/*
			 * We have a match at the current position.
			 * If it's very long, choose it immediately.
			 */
			if (cur_len >= nice_len) {
				deflate_choose_match(c, cur_len, cur_offset,
						     &seq);
				hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
							  &in_cur_base,
							  in_next,
							  in_end,
							  cur_len - 1,
							  next_hashes);
				in_next += cur_len - 1;
				continue;
			}

			/*
			 * Try to find a better match at the next position.
			 *
			 * Note: since we already have a match at the *current*
			 * position, we use only half the 'max_search_depth'
			 * when checking the *next* position. This is a useful
			 * trade-off because it's more worthwhile to use a
			 * greater search depth on the initial match.
			 *
			 * Note: it's possible to structure the code such that
			 * there's only one call to longest_match(), which
			 * handles both the "find the initial match" and "try to
			 * find a better match" cases. However, it is faster to
			 * have two call sites, with longest_match() inlined at
			 * each.
			 */
			adjust_max_and_nice_len(&max_len, &nice_len,
						in_end - in_next);
			next_len = hc_matchfinder_longest_match(
						&c->p.g.hc_mf,
						&in_cur_base,
						in_next++,
						cur_len - 1,
						max_len,
						nice_len,
						c->max_search_depth >> 1,
						next_hashes,
						&next_offset);
			if (next_len >= cur_len &&
			    4 * (int)(next_len - cur_len) +
			    ((int)bsr32(cur_offset) -
			     (int)bsr32(next_offset)) > 2) {
				/*
				 * Found a better match at the next position.
				 * Output a literal. Then the next match
				 * becomes the current match.
				 */
				deflate_choose_literal(c, *(in_next - 2), seq);
				cur_len = next_len;
				cur_offset = next_offset;
				goto have_cur_match;
			}

			if (lazy2) {
				/* In lazy2 mode, look ahead another position */
				adjust_max_and_nice_len(&max_len, &nice_len,
							in_end - in_next);
				next_len = hc_matchfinder_longest_match(
						&c->p.g.hc_mf,
						&in_cur_base,
						in_next++,
						cur_len - 1,
						max_len,
						nice_len,
						c->max_search_depth >> 2,
						next_hashes,
						&next_offset);
				if (next_len >= cur_len &&
				    4 * (int)(next_len - cur_len) +
				    ((int)bsr32(cur_offset) -
				     (int)bsr32(next_offset)) > 6) {
					/*
					 * There's a much better match two
					 * positions ahead, so use two literals.
					 */
					deflate_choose_literal(
						c, *(in_next - 3), seq);
					deflate_choose_literal(
						c, *(in_next - 2), seq);
					cur_len = next_len;
					cur_offset = next_offset;
					goto have_cur_match;
				}
				/*
				 * No better match at either of the next 2
				 * positions. Output the current match.
				 */
				deflate_choose_match(c, cur_len, cur_offset,
						     &seq);
				if (cur_len > 3) {
					hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
								  &in_cur_base,
								  in_next,
								  in_end,
								  cur_len - 3,
								  next_hashes);
					in_next += cur_len - 3;
				}
			} else { /* !lazy2 */
				/*
				 * No better match at the next position. Output
				 * the current match.
				 */
				deflate_choose_match(c, cur_len, cur_offset,
						     &seq);
				hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
							  &in_cur_base,
							  in_next,
							  in_end,
							  cur_len - 2,
							  next_hashes);
				in_next += cur_len - 2;
			}
			/* Check if it's time to output another block. */
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats,
					   in_block_begin, in_next, in_end));

		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    c->p.g.sequences, in_next == in_end);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}
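
/*
 * Illustrative example of the lazy criterion above: with cur_len = 5 at
 * offset 1000 (bsr32 = 9) and next_len = 6 at offset 64 (bsr32 = 6), the
 * score is 4 * (6 - 5) + (9 - 6) = 7 > 2, so the parser emits one literal
 * and takes the longer, closer match at the next position instead.
 */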

/*
 * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to
 * see if there's a better match at the next position. If yes, it outputs a
 * literal and continues to the next position. If no, it outputs the match.
 */
static size_t
deflate_compress_lazy(struct libdeflate_compressor * restrict c,
		      const u8 * restrict in, size_t in_nbytes,
		      u8 * restrict out, size_t out_nbytes_avail)
{
	return deflate_compress_lazy_generic(c, in, in_nbytes, out,
					     out_nbytes_avail, false);
}

/*
 * The lazy2 compressor. This is similar to the regular lazy one, but it looks
 * for a better match at the next 2 positions rather than the next 1. This
 * makes it take slightly more time, but compresses some inputs slightly more.
 */
static size_t
deflate_compress_lazy2(struct libdeflate_compressor * restrict c,
		       const u8 * restrict in, size_t in_nbytes,
		       u8 * restrict out, size_t out_nbytes_avail)
{
	return deflate_compress_lazy_generic(c, in, in_nbytes, out,
					     out_nbytes_avail, true);
}

#if SUPPORT_NEAR_OPTIMAL_PARSING

/*
 * Follow the minimum-cost path in the graph of possible match/literal choices
 * for the current block and compute the frequencies of the Huffman symbols that
 * would be needed to output those matches and literals.
 */
static void
deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
{
	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
	do {
		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;

		if (length == 1) {
			/* Literal */
			c->freqs.litlen[offset]++;
		} else {
			/* Match */
			c->freqs.litlen[DEFLATE_FIRST_LEN_SYM +
					deflate_length_slot[length]]++;
			c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
		}
		cur_node += length;
	} while (cur_node != end_node);

	/* Tally the end-of-block symbol. */
	c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
}

/* Set the current cost model from the codeword lengths specified in @lens. */
static void
deflate_set_costs_from_codes(struct libdeflate_compressor *c,
			     const struct deflate_lens *lens)
{
	unsigned i;

	/* Literals */
	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
		u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
		c->p.n.costs.literal[i] = bits * BIT_COST;
	}

	/* Lengths */
	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
		unsigned length_slot = deflate_length_slot[i];
		unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot;
		u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
		bits += deflate_extra_length_bits[length_slot];
		c->p.n.costs.length[i] = bits * BIT_COST;
	}

	/* Offset slots */
	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
		u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
		bits += deflate_extra_offset_bits[i];
		c->p.n.costs.offset_slot[i] = bits * BIT_COST;
	}
}

/*
 * This lookup table gives the default cost of a literal symbol and of a length
 * symbol, depending on the characteristics of the input data. It was generated
 * by scripts/gen_default_litlen_costs.py.
 *
 * This table is indexed first by the estimated match probability:
 *
 *	i=0: data doesn't contain many matches	[match_prob=0.25]
 *	i=1: neutral				[match_prob=0.50]
 *	i=2: data contains lots of matches	[match_prob=0.75]
 *
 * This lookup produces a subtable which maps the number of distinct used
 * literals to the default cost of a literal symbol, i.e.:
 *
 *	int(-log2((1 - match_prob) / num_used_literals) * BIT_COST)
 *
 * ... for num_used_literals in [1, 256] (and 0, which is copied from 1). This
 * accounts for literals usually getting cheaper as the number of distinct
 * literals decreases, and as the proportion of literals to matches increases.
 *
 * The lookup also produces the cost of a length symbol, which is:
 *
 *	int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST)
 *
 * Note: we don't currently assign different costs to different literal symbols,
 * or to different length symbols, as this is hard to do.
 */
static const struct {
	u8 used_lits_to_lit_cost[257];
	u8 len_sym_cost;
} default_litlen_costs[] = {
	{ /* match_prob = 0.25 */
		.used_lits_to_lit_cost = {
			6, 6, 22, 32, 38, 43, 48, 51,
			54, 57, 59, 61, 64, 65, 67, 69,
			70, 72, 73, 74, 75, 76, 77, 79,
			80, 80, 81, 82, 83, 84, 85, 85,
			86, 87, 88, 88, 89, 89, 90, 91,
			91, 92, 92, 93, 93, 94, 95, 95,
			96, 96, 96, 97, 97, 98, 98, 99,
			99, 99, 100, 100, 101, 101, 101, 102,
			102, 102, 103, 103, 104, 104, 104, 105,
			105, 105, 105, 106, 106, 106, 107, 107,
			107, 108, 108, 108, 108, 109, 109, 109,
			109, 110, 110, 110, 111, 111, 111, 111,
			112, 112, 112, 112, 112, 113, 113, 113,
			113, 114, 114, 114, 114, 114, 115, 115,
			115, 115, 115, 116, 116, 116, 116, 116,
			117, 117, 117, 117, 117, 118, 118, 118,
			118, 118, 118, 119, 119, 119, 119, 119,
			120, 120, 120, 120, 120, 120, 121, 121,
			121, 121, 121, 121, 121, 122, 122, 122,
			122, 122, 122, 123, 123, 123, 123, 123,
			123, 123, 124, 124, 124, 124, 124, 124,
			124, 125, 125, 125, 125, 125, 125, 125,
			125, 126, 126, 126, 126, 126, 126, 126,
			127, 127, 127, 127, 127, 127, 127, 127,
			128, 128, 128, 128, 128, 128, 128, 128,
			128, 129, 129, 129, 129, 129, 129, 129,
			129, 129, 130, 130, 130, 130, 130, 130,
			130, 130, 130, 131, 131, 131, 131, 131,
			131, 131, 131, 131, 131, 132, 132, 132,
			132, 132, 132, 132, 132, 132, 132, 133,
			133, 133, 133, 133, 133, 133, 133, 133,
			133, 134, 134, 134, 134, 134, 134, 134,
			134,
		},
		.len_sym_cost = 109,
	}, { /* match_prob = 0.5 */
		.used_lits_to_lit_cost = {
			16, 16, 32, 41, 48, 53, 57, 60,
			64, 66, 69, 71, 73, 75, 76, 78,
			80, 81, 82, 83, 85, 86, 87, 88,
			89, 90, 91, 92, 92, 93, 94, 95,
			96, 96, 97, 98, 98, 99, 99, 100,
			101, 101, 102, 102, 103, 103, 104, 104,
			105, 105, 106, 106, 107, 107, 108, 108,
			108, 109, 109, 110, 110, 110, 111, 111,
			112, 112, 112, 113, 113, 113, 114, 114,
			114, 115, 115, 115, 115, 116, 116, 116,
			117, 117, 117, 118, 118, 118, 118, 119,
			119, 119, 119, 120, 120, 120, 120, 121,
			121, 121, 121, 122, 122, 122, 122, 122,
			123, 123, 123, 123, 124, 124, 124, 124,
			124, 125, 125, 125, 125, 125, 126, 126,
			126, 126, 126, 127, 127, 127, 127, 127,
			128, 128, 128, 128, 128, 128, 129, 129,
			129, 129, 129, 129, 130, 130, 130, 130,
			130, 130, 131, 131, 131, 131, 131, 131,
			131, 132, 132, 132, 132, 132, 132, 133,
			133, 133, 133, 133, 133, 133, 134, 134,
			134, 134, 134, 134, 134, 134, 135, 135,
			135, 135, 135, 135, 135, 135, 136, 136,
			136, 136, 136, 136, 136, 136, 137, 137,
			137, 137, 137, 137, 137, 137, 138, 138,
			138, 138, 138, 138, 138, 138, 138, 139,
			139, 139, 139, 139, 139, 139, 139, 139,
			140, 140, 140, 140, 140, 140, 140, 140,
			140, 141, 141, 141, 141, 141, 141, 141,
			141, 141, 141, 142, 142, 142, 142, 142,
			142, 142, 142, 142, 142, 142, 143, 143,
			143, 143, 143, 143, 143, 143, 143, 143,
			144,
		},
		.len_sym_cost = 93,
	}, { /* match_prob = 0.75 */
		.used_lits_to_lit_cost = {
			32, 32, 48, 57, 64, 69, 73, 76,
			80, 82, 85, 87, 89, 91, 92, 94,
			96, 97, 98, 99, 101, 102, 103, 104,
			105, 106, 107, 108, 108, 109, 110, 111,
			112, 112, 113, 114, 114, 115, 115, 116,
			117, 117, 118, 118, 119, 119, 120, 120,
			121, 121, 122, 122, 123, 123, 124, 124,
			124, 125, 125, 126, 126, 126, 127, 127,
			128, 128, 128, 129, 129, 129, 130, 130,
			130, 131, 131, 131, 131, 132, 132, 132,
			133, 133, 133, 134, 134, 134, 134, 135,
			135, 135, 135, 136, 136, 136, 136, 137,
			137, 137, 137, 138, 138, 138, 138, 138,
			139, 139, 139, 139, 140, 140, 140, 140,
			140, 141, 141, 141, 141, 141, 142, 142,
			142, 142, 142, 143, 143, 143, 143, 143,
			144, 144, 144, 144, 144, 144, 145, 145,
			145, 145, 145, 145, 146, 146, 146, 146,
			146, 146, 147, 147, 147, 147, 147, 147,
			147, 148, 148, 148, 148, 148, 148, 149,
			149, 149, 149, 149, 149, 149, 150, 150,
			150, 150, 150, 150, 150, 150, 151, 151,
			151, 151, 151, 151, 151, 151, 152, 152,
			152, 152, 152, 152, 152, 152, 153, 153,
			153, 153, 153, 153, 153, 153, 154, 154,
			154, 154, 154, 154, 154, 154, 154, 155,
			155, 155, 155, 155, 155, 155, 155, 155,
			156, 156, 156, 156, 156, 156, 156, 156,
			156, 157, 157, 157, 157, 157, 157, 157,
			157, 157, 157, 158, 158, 158, 158, 158,
			158, 158, 158, 158, 158, 158, 159, 159,
			159, 159, 159, 159, 159, 159, 159, 159,
			160,
		},
		.len_sym_cost = 84,
	},
};
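
/*
 * Worked example of the formula above, assuming BIT_COST == 16 (as defined
 * earlier in this file): for match_prob = 0.5 and num_used_literals = 64,
 * int(-log2(0.5 / 64) * 16) = int(7 * 16) = 112, which matches
 * default_litlen_costs[1].used_lits_to_lit_cost[64] above.
 */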

/*
 * Choose the default costs for literal and length symbols. These symbols are
 * both part of the litlen alphabet.
 */
static void
deflate_choose_default_litlen_costs(struct libdeflate_compressor *c,
				    u32 block_length,
				    u32 *lit_cost, u32 *len_sym_cost)
{
	unsigned num_used_literals = 0;
	u32 literal_freq = block_length;
	u32 match_freq = 0;
	u32 cutoff;
	int i;

	/* Calculate the number of distinct literals that exist in the data. */
	cutoff = literal_freq >> 11; /* Ignore literals used very rarely */
	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
		if (c->freqs.litlen[i] > cutoff)
			num_used_literals++;
	}
	if (num_used_literals == 0)
		num_used_literals = 1;

	/*
	 * Estimate the relative frequency of literals and matches in the
	 * optimal parsing solution. We don't know the optimal solution, so
	 * this can only be a very rough estimate. Therefore, we basically use
	 * the match frequency from a greedy parse. We also apply the min_len
	 * heuristic used by the greedy and lazy parsers, to avoid counting too
	 * many matches when literals are cheaper than short matches.
	 */
	match_freq = 0;
	i = choose_min_match_len(num_used_literals, c->max_search_depth);
	for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
		match_freq += c->p.n.match_len_freqs[i];
		literal_freq -= i * c->p.n.match_len_freqs[i];
	}
	if ((s32)literal_freq < 0) /* shouldn't happen */
		literal_freq = 0;

	if (match_freq > literal_freq)
		i = 2; /* many matches */
	else if (match_freq * 4 > literal_freq)
		i = 1; /* neutral */
	else
		i = 0; /* few matches */

	*lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[
							num_used_literals];
	*len_sym_cost = default_litlen_costs[i].len_sym_cost;
}

static forceinline u32
deflate_default_length_cost(unsigned len, u32 len_sym_cost)
{
	unsigned slot = deflate_length_slot[len];
	u32 num_extra_bits = deflate_extra_length_bits[slot];

	return len_sym_cost + (num_extra_bits * BIT_COST);
}

static forceinline u32
deflate_default_offset_slot_cost(unsigned slot)
{
	u32 num_extra_bits = deflate_extra_offset_bits[slot];
	/*
	 * Assume that all offset symbols are equally probable.
	 * The resulting cost is 'int(-log2(1/30) * BIT_COST)',
	 * where 30 is the number of potentially-used offset symbols.
	 */
	u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000;

	return offset_sym_cost + (num_extra_bits * BIT_COST);
}
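
/*
 * Sanity check of the constant above: log2(30) = 4.9069..., so the fixed-point
 * offset symbol cost is approximately 4.907 * BIT_COST, which the expression
 * '4*BIT_COST + (907*BIT_COST)/1000' computes without floating point.
 */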

/* Set default symbol costs for the first block's first optimization pass. */
static void
deflate_set_default_costs(struct libdeflate_compressor *c,
			  u32 lit_cost, u32 len_sym_cost)
{
	unsigned i;

	/* Literals */
	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
		c->p.n.costs.literal[i] = lit_cost;

	/* Lengths */
	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
		c->p.n.costs.length[i] =
			deflate_default_length_cost(i, len_sym_cost);

	/* Offset slots */
	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
		c->p.n.costs.offset_slot[i] =
			deflate_default_offset_slot_cost(i);
}

static forceinline void
deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount)
{
	if (change_amount == 0)
		/* Block is very similar to previous; prefer previous costs. */
		*cost_p = (default_cost + 3 * *cost_p) / 4;
	else if (change_amount == 1)
		*cost_p = (default_cost + *cost_p) / 2;
	else if (change_amount == 2)
		*cost_p = (5 * default_cost + 3 * *cost_p) / 8;
	else
		/* Block differs greatly from previous; prefer default costs. */
		*cost_p = (3 * default_cost + *cost_p) / 4;
}
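
/*
 * Illustrative example of the blending above: with a previous cost of 80 and
 * a default cost of 120, change_amount 0 through 3 yields (120 + 240)/4 = 90,
 * (120 + 80)/2 = 100, (600 + 240)/8 = 105, and (360 + 80)/4 = 110, moving
 * progressively closer to the default as the blocks differ more.
 */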

static forceinline void
deflate_adjust_costs_impl(struct libdeflate_compressor *c,
			  u32 lit_cost, u32 len_sym_cost, int change_amount)
{
	unsigned i;

	/* Literals */
	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
		deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost,
				    change_amount);

	/* Lengths */
	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
		deflate_adjust_cost(&c->p.n.costs.length[i],
				    deflate_default_length_cost(i,
								len_sym_cost),
				    change_amount);

	/* Offset slots */
	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
		deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
				    deflate_default_offset_slot_cost(i),
				    change_amount);
}

/*
 * Adjust the costs when beginning a new block.
 *
 * Since the current costs have been optimized for the data, it's undesirable to
 * throw them away and start over with the default costs. At the same time, we
 * don't want to bias the parse by assuming that the next block will be similar
 * to the current block. As a compromise, make the costs closer to the
 * defaults, but don't simply set them to the defaults.
 */
static void
deflate_adjust_costs(struct libdeflate_compressor *c,
		     u32 lit_cost, u32 len_sym_cost)
{
	u64 total_delta = 0;
	u64 cutoff;
	int i;

	/*
	 * Decide how different the current block is from the previous block,
	 * using the block splitting statistics from the current and previous
	 * blocks. The more different the current block is, the more we prefer
	 * the default costs rather than the previous block's costs.
	 *
	 * The algorithm here is similar to the end-of-block check one, but here
	 * we compare two entire blocks rather than a partial block with a small
	 * extra part, and therefore we need 64-bit numbers in some places.
	 */
	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		u64 prev = (u64)c->p.n.prev_observations[i] *
			   c->split_stats.num_observations;
		u64 cur = (u64)c->split_stats.observations[i] *
			  c->p.n.prev_num_observations;

		total_delta += prev > cur ? prev - cur : cur - prev;
	}
	cutoff = ((u64)c->p.n.prev_num_observations *
		  c->split_stats.num_observations * 200) / 512;

	if (4 * total_delta > 9 * cutoff)
		deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3);
	else if (2 * total_delta > 3 * cutoff)
		deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2);
	else if (2 * total_delta > cutoff)
		deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1);
	else
		deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0);
}

/*
 * Find the minimum-cost path through the graph of possible match/literal
 * choices for this block.
 *
 * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
 * represents the node at the beginning of the block, to
 * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
 * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'.
 *
 * The algorithm works backwards, starting at the end node and proceeding
 * backwards one node at a time. At each node, the minimum cost to reach the
 * end node is computed and the match/literal choice that begins that path is
 * saved.
 */
static void
deflate_find_min_cost_path(struct libdeflate_compressor *c,
			   const u32 block_length,
			   const struct lz_match *cache_ptr)
{
	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
	struct deflate_optimum_node *cur_node = end_node;

	cur_node->cost_to_end = 0;
	do {
		unsigned num_matches;
		unsigned literal;
		u32 best_cost_to_end;

		cur_node--;
		cache_ptr--;

		num_matches = cache_ptr->length;
		literal = cache_ptr->offset;

		/* It's always possible to choose a literal. */
		best_cost_to_end = c->p.n.costs.literal[literal] +
				   (cur_node + 1)->cost_to_end;
		cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;

		/* Also consider matches if there are any. */
		if (num_matches) {
			const struct lz_match *match;
			unsigned len;
			unsigned offset;
			unsigned offset_slot;
			u32 offset_cost;
			u32 cost_to_end;

			/*
			 * Consider each length from the minimum
			 * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
			 * match found at this position. For each length, we
			 * consider only the smallest offset for which that
			 * length is available. Although this is not guaranteed
			 * to be optimal due to the possibility of a larger
			 * offset costing less than a smaller offset to code,
			 * this is a very useful heuristic.
			 */
			match = cache_ptr - num_matches;
			len = DEFLATE_MIN_MATCH_LEN;
			do {
				offset = match->offset;
				offset_slot = deflate_get_offset_slot(c, offset);
				offset_cost = c->p.n.costs.offset_slot[offset_slot];
				do {
					cost_to_end = offset_cost +
						      c->p.n.costs.length[len] +
						      (cur_node + len)->cost_to_end;
					if (cost_to_end < best_cost_to_end) {
						best_cost_to_end = cost_to_end;
						cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
					}
				} while (++len <= match->length);
			} while (++match != cache_ptr);
			cache_ptr -= num_matches;
		}
		cur_node->cost_to_end = best_cost_to_end;
	} while (cur_node != &c->p.n.optimum_nodes[0]);
}

/*
 * Choose the literal/match sequence to use for the current block. The basic
 * algorithm finds a minimum-cost path through the block's graph of
 * literal/match choices, given a cost model. However, the cost of each symbol
 * is unknown until the Huffman codes have been built, but at the same time the
 * Huffman codes depend on the frequencies of chosen symbols. Consequently,
 * multiple passes must be used to try to approximate an optimal solution. The
 * first pass uses default costs, mixed with the costs from the previous block
 * if any. Later passes use the Huffman codeword lengths from the previous pass
 * as the costs.
 */
static void
deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
		       const struct lz_match *cache_ptr, bool is_first_block,
		       bool is_final_block)
{
	unsigned num_passes_remaining = c->p.n.num_optim_passes;
	u32 lit_cost, len_sym_cost;
	u32 i;

	/* Force the block to really end at the desired length, even if some
	 * matches extend beyond it. */
	for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
					ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;

	/* Make sure the literal/match statistics are up to date. */
	merge_new_observations(&c->split_stats);

	/* Set the initial costs. */
	deflate_choose_default_litlen_costs(c, block_length,
					    &lit_cost, &len_sym_cost);
	if (is_first_block)
		deflate_set_default_costs(c, lit_cost, len_sym_cost);
	else
		deflate_adjust_costs(c, lit_cost, len_sym_cost);

	do {
		/* Find the minimum cost path for this pass. */
		deflate_find_min_cost_path(c, block_length, cache_ptr);

		/* Compute frequencies of the chosen symbols. */
		deflate_reset_symbol_frequencies(c);
		deflate_tally_item_list(c, block_length);

		/* Make the Huffman codes. */
		deflate_make_huffman_codes(&c->freqs, &c->codes);

		/*
		 * Update the costs. After the last optimization pass, the
		 * final costs won't be needed for this block, but they will be
		 * used in determining the initial costs for the next block.
		 */
		if (--num_passes_remaining || !is_final_block)
			deflate_set_costs_from_codes(c, &c->codes.lens);
	} while (num_passes_remaining);
}

static void
deflate_near_optimal_begin_block(struct libdeflate_compressor *c,
				 bool is_first_block)
{
	int i;

	if (!is_first_block) {
		/*
		 * Save some literal/match statistics from the previous block so
		 * that deflate_adjust_costs() will be able to decide how much
		 * the current block differs from the previous one.
		 */
		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
			c->p.n.prev_observations[i] =
				c->split_stats.observations[i];
		}
		c->p.n.prev_num_observations = c->split_stats.num_observations;
	}
	init_block_split_stats(&c->split_stats);

	/*
	 * During matchfinding, we keep track of approximate literal and match
	 * length frequencies for the purpose of setting the initial costs.
	 */
	memset(c->freqs.litlen, 0,
	       DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0]));
	memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
}
|
|
|
|
/*
|
|
* This is the "near-optimal" DEFLATE compressor. It computes the optimal
|
|
* representation of each DEFLATE block using a minimum-cost path search over
|
|
* the graph of possible match/literal choices for that block, assuming a
|
|
* certain cost for each Huffman symbol.
|
|
*
|
|
* For several reasons, the end result is not guaranteed to be optimal:
|
|
*
|
|
* - Nonoptimal choice of blocks
|
|
* - Heuristic limitations on which matches are actually considered
|
|
* - Symbol costs are unknown until the symbols have already been chosen
|
|
* (so iterative optimization must be used)
|
|
*/
|
|
static size_t
deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
			      const u8 * restrict in, size_t in_nbytes,
			      u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	const u8 *in_next_slide =
		in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	bt_matchfinder_init(&c->p.n.bt_mf);

	do {
		/* Starting a new DEFLATE block. */

		struct lz_match *cache_ptr = c->p.n.match_cache;
		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		const u8 *next_observation = in_next;

		deflate_near_optimal_begin_block(c, in_block_begin == in);

		/*
		 * Find matches until we decide to end the block. We end the
		 * block if any of the following is true:
		 *
		 * (1) Maximum block length has been reached.
		 * (2) Match cache may overflow.
		 * (3) Block split heuristic says to split now.
		 */
		do {
			struct lz_match *matches;
			unsigned best_len;
			size_t remaining = in_end - in_next;

			/* Slide the window forward if needed. */
			if (in_next == in_next_slide) {
				bt_matchfinder_slide_window(&c->p.n.bt_mf);
				in_cur_base = in_next;
				in_next_slide = in_next +
					MIN(remaining, MATCHFINDER_WINDOW_SIZE);
			}

			/*
			 * Find matches with the current position using the
			 * binary tree matchfinder and save them in
			 * 'match_cache'.
			 *
			 * Note: the binary tree matchfinder is more suited for
			 * optimal parsing than the hash chain matchfinder. The
			 * reasons for this include:
			 *
			 * - The binary tree matchfinder can find more matches
			 *   in the same number of steps.
			 * - One of the major advantages of hash chains is that
			 *   skipping positions (not searching for matches at
			 *   them) is faster; however, with optimal parsing we
			 *   search for matches at almost all positions, so this
			 *   advantage of hash chains is negated.
			 */
			matches = cache_ptr;
			best_len = 0;
			adjust_max_and_nice_len(&max_len, &nice_len, remaining);
			if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
				cache_ptr = bt_matchfinder_get_matches(
						&c->p.n.bt_mf,
						in_cur_base,
						in_next - in_cur_base,
						max_len,
						nice_len,
						c->max_search_depth,
						next_hashes,
						&best_len,
						matches);
			}
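			/*
			 * Record statistics used for setting the initial
			 * symbol costs and for the block splitting heuristic.
			 * 'next_observation' skips positions already covered
			 * by an observed match, so each input byte contributes
			 * at most one observation.
			 */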
			c->freqs.litlen[*in_next]++;
			if (in_next >= next_observation) {
				if (best_len >= 4) {
					observe_match(&c->split_stats,
						      best_len);
					next_observation = in_next + best_len;
					c->p.n.match_len_freqs[best_len]++;
				} else {
					observe_literal(&c->split_stats,
							*in_next);
					next_observation = in_next + 1;
				}
			}

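			/*
			 * Terminate this position's cache entries with a
			 * sentinel whose 'length' holds the number of matches
			 * just cached and whose 'offset' holds the literal
			 * byte itself; the item-choosing pass reads the cache
			 * in this format instead of re-running the
			 * matchfinder.
			 */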
			cache_ptr->length = cache_ptr - matches;
			cache_ptr->offset = *in_next;
			in_next++;
			cache_ptr++;

			/*
			 * If there was a very long match found, don't cache any
			 * matches for the bytes covered by that match. This
			 * avoids degenerate behavior when compressing highly
			 * redundant data, where the number of matches can be
			 * very large.
			 *
			 * This heuristic doesn't actually hurt the compression
			 * ratio very much. If there's a long match, then the
			 * data must be highly compressible, so it doesn't
			 * matter much what we do.
			 */
			if (best_len >= DEFLATE_MIN_MATCH_LEN &&
			    best_len >= nice_len) {
				--best_len;
				do {
					remaining = in_end - in_next;
					if (in_next == in_next_slide) {
						bt_matchfinder_slide_window(
							&c->p.n.bt_mf);
						in_cur_base = in_next;
						in_next_slide = in_next +
							MIN(remaining,
							    MATCHFINDER_WINDOW_SIZE);
					}
					adjust_max_and_nice_len(&max_len,
								&nice_len,
								remaining);
					if (max_len >=
					    BT_MATCHFINDER_REQUIRED_NBYTES) {
						bt_matchfinder_skip_byte(
							&c->p.n.bt_mf,
							in_cur_base,
							in_next - in_cur_base,
							nice_len,
							c->max_search_depth,
							next_hashes);
					}
					cache_ptr->length = 0;
					cache_ptr->offset = *in_next;
					c->freqs.litlen[*in_next]++;
					in_next++;
					cache_ptr++;
				} while (--best_len);
			}
		} while (in_next < in_max_block_end &&
			 cache_ptr < &c->p.n.match_cache[MATCH_CACHE_LENGTH] &&
			 !should_end_block(&c->split_stats,
					   in_block_begin, in_next, in_end));

		/*
		 * All the matches for this block have been cached. Now choose
		 * the sequence of items to output and flush the block.
		 */
		deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
				       in_block_begin == in, in_next == in_end);
		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    NULL, in_next == in_end);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/* Initialize c->offset_slot_fast. */
static void
deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
{
	unsigned offset_slot;
	unsigned offset;
	unsigned offset_end;

	for (offset_slot = 0;
	     offset_slot < ARRAY_LEN(deflate_offset_slot_base);
	     offset_slot++)
	{
		offset = deflate_offset_slot_base[offset_slot];
#if USE_FULL_OFFSET_SLOT_FAST
		offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
		do {
			c->offset_slot_fast[offset] = offset_slot;
		} while (++offset != offset_end);
#else
		if (offset <= 256) {
			offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
			do {
				c->offset_slot_fast[offset - 1] = offset_slot;
			} while (++offset != offset_end);
		} else {
			offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
			do {
				c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
			} while ((offset += (1 << 7)) != offset_end);
		}
#endif
	}
}

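/*
 * For reference, the condensed map (USE_FULL_OFFSET_SLOT_FAST defined to 0)
 * is meant to be indexed as follows (a sketch consistent with the
 * initialization above, not necessarily the exact lookup helper used
 * elsewhere in this file):
 *
 *	if (offset <= 256)
 *		offset_slot = c->offset_slot_fast[offset - 1];
 *	else
 *		offset_slot = c->offset_slot_fast[256 + ((offset - 1) >> 7)];
 *
 * Grouping the larger offsets 128 at a time is valid because every DEFLATE
 * offset slot above offset 256 has at least 7 extra offset bits, so each such
 * slot covers a range of (offset - 1) values aligned to a multiple of 128.
 */
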
LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
libdeflate_alloc_compressor(int compression_level)
{
	struct libdeflate_compressor *c;
	size_t size = offsetof(struct libdeflate_compressor, p);

	if (compression_level < 0 || compression_level > 12)
		return NULL;

#if SUPPORT_NEAR_OPTIMAL_PARSING
	if (compression_level >= 10)
		size += sizeof(c->p.n);
	else
#endif
	{
		if (compression_level >= 2)
			size += sizeof(c->p.g);
		else if (compression_level == 1)
			size += sizeof(c->p.f);
	}

	c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size);
	if (!c)
		return NULL;

	c->compression_level = compression_level;

	/*
	 * The higher the compression level, the more we should bother trying
	 * to compress very small inputs.
	 */
	c->min_size_to_compress = 56 - (compression_level * 4);
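
	/*
	 * For example, following directly from the formula above: level 0
	 * gives a 56-byte threshold, level 6 gives 32 bytes, and level 12
	 * gives 8 bytes, so higher levels attempt compression on smaller
	 * inputs.
	 */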
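
	/*
	 * Level-to-algorithm summary (matching the cases below): level 0
	 * stores the data uncompressed; level 1 uses the fastest algorithm;
	 * levels 2-4 use greedy parsing; levels 5-7 lazy parsing; levels 8-9
	 * lazy2; and levels 10-12 (when SUPPORT_NEAR_OPTIMAL_PARSING is
	 * enabled) use the near-optimal parser with progressively deeper
	 * searches and more optimization passes.
	 */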
	switch (compression_level) {
	case 0:
		c->impl = deflate_compress_none;
		break;
	case 1:
		c->impl = deflate_compress_fastest;
		/* max_search_depth is unused */
		c->nice_match_length = 32;
		break;
	case 2:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 6;
		c->nice_match_length = 10;
		break;
	case 3:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 12;
		c->nice_match_length = 14;
		break;
	case 4:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 16;
		c->nice_match_length = 30;
		break;
	case 5:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 16;
		c->nice_match_length = 30;
		break;
	case 6:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 35;
		c->nice_match_length = 65;
		break;
	case 7:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 100;
		c->nice_match_length = 130;
		break;
	case 8:
		c->impl = deflate_compress_lazy2;
		c->max_search_depth = 300;
		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
		break;
	case 9:
#if !SUPPORT_NEAR_OPTIMAL_PARSING
	default:
#endif
		c->impl = deflate_compress_lazy2;
		c->max_search_depth = 600;
		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
		break;
#if SUPPORT_NEAR_OPTIMAL_PARSING
	case 10:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 35;
		c->nice_match_length = 75;
		c->p.n.num_optim_passes = 2;
		break;
	case 11:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 70;
		c->nice_match_length = 150;
		c->p.n.num_optim_passes = 3;
		break;
	case 12:
	default:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 150;
		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
		c->p.n.num_optim_passes = 4;
		break;
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
	}

	deflate_init_offset_slot_fast(c);
	deflate_init_static_codes(c);

	return c;
}

LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_deflate_compress(struct libdeflate_compressor *c,
			    const void *in, size_t in_nbytes,
			    void *out, size_t out_nbytes_avail)
{
	if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING))
		return 0;

	/* For extremely small inputs just use a single uncompressed block. */
	if (unlikely(in_nbytes < c->min_size_to_compress)) {
		struct deflate_output_bitstream os;
		deflate_init_output(&os, out, out_nbytes_avail);
		if (in_nbytes == 0)
			in = &os; /* Avoid passing NULL to memcpy() */
		deflate_write_uncompressed_block(&os, in, in_nbytes, true);
		return deflate_flush_output(&os);
	}

	return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
}

LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_free_compressor(struct libdeflate_compressor *c)
{
	libdeflate_aligned_free(c);
}

unsigned int
deflate_get_compression_level(struct libdeflate_compressor *c)
{
	return c->compression_level;
}

LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
				  size_t in_nbytes)
{
	/*
	 * The worst case is all uncompressed blocks where one block has length
	 * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
	 * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
	 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
	 */
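	/*
	 * Worked example, following directly from the formula below: if
	 * in_nbytes == 4 * MIN_BLOCK_LENGTH + 1, then max_num_blocks == 5,
	 * so the bound is 5*5 + in_nbytes + 1 + OUTPUT_END_PADDING bytes.
	 */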
	size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
	return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
}
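
/*
 * Example of the intended call sequence for this API (a sketch; allocation
 * failure handling is omitted, and malloc()/free() of the output buffer are
 * the caller's responsibility):
 *
 *	struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
 *	size_t bound = libdeflate_deflate_compress_bound(c, in_nbytes);
 *	void *out = malloc(bound);
 *	size_t out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes,
 *							out, bound);
 *
 * A return value of 0 would mean the output buffer was too small, which
 * cannot happen when 'bound' bytes are provided. Finish with
 * libdeflate_free_compressor(c).
 */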