/*
 * deflate_compress.c - a compressor for DEFLATE
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "deflate_compress.h"
#include "deflate_constants.h"
#include "unaligned.h"

#include "libdeflate.h"

/******************************************************************************/

/*
 * The following parameters can be changed at build time to customize the
 * compression algorithms slightly:
 *
 * (Note, not all customizable parameters are here. Some others can be found in
 * libdeflate_alloc_compressor() and in *_matchfinder.h.)
 */

/*
 * If this parameter is defined to 1, then the near-optimal parsing algorithm
 * will be included, and compression levels 10-12 will use it. This algorithm
 * usually produces a compression ratio significantly better than the other
 * algorithms. However, it is slow. If this parameter is defined to 0, then
 * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm.
 */
#define SUPPORT_NEAR_OPTIMAL_PARSING 1

/*
 * If this parameter is defined to 1, then the compressor will maintain a full
 * map from match offsets to offset slots, rather than a condensed map. This
 * will usually improve performance, especially for the near-optimal parsing
 * algorithm. However, it will use an additional 32257 bytes of memory.
 */
#define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING

/*
 * This is the minimum block length, in uncompressed bytes, which the compressor
 * will use. This should be a value below which using shorter blocks is very
 * unlikely to be worthwhile, due to the per-block overhead. This parameter
 * doesn't apply to the final block, which can be arbitrarily short.
 *
 * Defining a fixed minimum block length is needed in order to guarantee a
 * reasonable upper bound on the compressed size. It's also needed because our
 * block splitting algorithm doesn't work well on very short blocks.
 */
#define MIN_BLOCK_LENGTH 10000

/*
 * This is the soft maximum block length, in uncompressed bytes, which the
 * compressor will use. This is a "soft" maximum, meaning that the compressor
 * will try to end blocks at this length, but it may go slightly past it if
 * there is a match that straddles this limit. This parameter doesn't apply to
 * uncompressed blocks, which the DEFLATE format limits to 65535 bytes.
 *
 * This should be a value above which it is very likely that splitting the block
 * would produce a better compression ratio. Increasing/decreasing this
 * parameter will increase/decrease per-compressor memory usage linearly.
 */
#define SOFT_MAX_BLOCK_LENGTH 300000

/*
 * Block length, in uncompressed bytes, used by deflate_compress_fastest().
 * deflate_compress_fastest() doesn't use the other block length settings.
 */
#define FAST_BLOCK_LENGTH MIN(32768, SOFT_MAX_BLOCK_LENGTH)

/*
 * These are the maximum codeword lengths, in bits, the compressor will use for
 * each Huffman code. The DEFLATE format defines limits for these. However,
 * further limiting litlen codewords to 14 bits is beneficial, since it has
 * negligible effect on compression ratio but allows some optimizations when
 * outputting bits. (It allows 4 literals to be written at once rather than 3.)
 */
#define MAX_LITLEN_CODEWORD_LEN 14
#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN
#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN

#if SUPPORT_NEAR_OPTIMAL_PARSING

/* Parameters specific to the near-optimal parsing algorithm */

/*
 * BIT_COST is a scaling factor that allows the compressor to consider
 * fractional bit costs when deciding which literal/match sequence to use. This
 * is useful when the true symbol costs are unknown. For example, if the
 * compressor thinks that a symbol has 6.5 bits of entropy, it can set its cost
 * to 6.5 bits rather than have to use 6 or 7 bits. Although in the end each
 * symbol will use a whole number of bits due to the Huffman coding, considering
 * fractional bits can be helpful due to the limited information.
 *
 * BIT_COST should be a power of 2. A value of 8 or 16 works well. A higher
 * value isn't very useful since the calculations are approximate anyway.
 */
#define BIT_COST 16
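
/*
 * For example, with BIT_COST == 16 as defined above, a symbol estimated to
 * take 6.5 bits can be assigned a scaled cost of 6.5 * 16 == 104, whereas a
 * whole-bit cost model could only choose between 6 * 16 == 96 and
 * 7 * 16 == 112.
 */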

/*
 * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
 * be needed to output a symbol that was unused in the previous optimization
 * pass. Assigning a default cost allows the symbol to be used in the next
 * optimization pass. However, the cost should be relatively high because the
 * symbol probably won't be used very many times (if at all).
 */
#define LITERAL_NOSTAT_BITS 13
#define LENGTH_NOSTAT_BITS 13
#define OFFSET_NOSTAT_BITS 10

/*
 * This is (approximately) the maximum number of matches that the compressor
 * will cache per block. If the match cache becomes full, then the compressor
 * will be forced to end the block early. This value should be large enough so
 * that this rarely happens, due to the block being ended normally before the
 * cache fills up. Increasing/decreasing this parameter will increase/decrease
 * per-compressor memory usage linearly.
 */
#define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5)

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/******************************************************************************/

/* Include the needed matchfinders. */
#define MATCHFINDER_WINDOW_ORDER DEFLATE_WINDOW_ORDER
#include "hc_matchfinder.h"
#include "ht_matchfinder.h"
#if SUPPORT_NEAR_OPTIMAL_PARSING
# include "bt_matchfinder.h"
/*
 * This is the maximum number of matches the binary trees matchfinder can find
 * at a single position. Since the matchfinder never finds more than one match
 * for the same length, presuming one of each possible length is sufficient for
 * an upper bound. (This says nothing about whether it is worthwhile to
 * consider so many matches; this is just defining the worst case.)
 */
#define MAX_MATCHES_PER_POS \
	(DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
#endif

/* Table: length slot => length slot base value */
static const unsigned deflate_length_slot_base[] = {
	3,    4,    5,    6,    7,    8,    9,    10,
	11,   13,   15,   17,   19,   23,   27,   31,
	35,   43,   51,   59,   67,   83,   99,   115,
	131,  163,  195,  227,  258,
};

/* Table: length slot => number of extra length bits */
static const u8 deflate_extra_length_bits[] = {
	0,    0,    0,    0,    0,    0,    0,    0,
	1,    1,    1,    1,    2,    2,    2,    2,
	3,    3,    3,    3,    4,    4,    4,    4,
	5,    5,    5,    5,    0,
};

/* Table: offset slot => offset slot base value */
static const unsigned deflate_offset_slot_base[] = {
	1,     2,     3,     4,     5,     7,     9,     13,
	17,    25,    33,    49,    65,    97,    129,   193,
	257,   385,   513,   769,   1025,  1537,  2049,  3073,
	4097,  6145,  8193,  12289, 16385, 24577,
};

/* Table: offset slot => number of extra offset bits */
static const u8 deflate_extra_offset_bits[] = {
	0,     0,     0,     0,     1,     1,     2,     2,
	3,     3,     4,     4,     5,     5,     6,     6,
	7,     7,     8,     8,     9,     9,     10,    10,
	11,    11,    12,    12,    13,    13,
};

/* Table: length => length slot */
static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
	12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
	16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
	18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
	20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
	21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
	22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
	23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 28,
};

/* The order in which precode codeword lengths are stored */
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};

/* Codewords for the DEFLATE Huffman codes. */
struct deflate_codewords {
	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
};

/* Codeword lengths (in bits) for the DEFLATE Huffman codes.
 * A zero length means the corresponding symbol had zero frequency. */
struct deflate_lens {
	u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
	u8 offset[DEFLATE_NUM_OFFSET_SYMS];
};

/* Codewords and lengths for the DEFLATE Huffman codes. */
struct deflate_codes {
	struct deflate_codewords codewords;
	struct deflate_lens lens;
};

/* Symbol frequency counters for the DEFLATE Huffman codes. */
struct deflate_freqs {
	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
};

/*
 * Represents a run of literals followed by a match or end-of-block. This
 * struct is needed to temporarily store items chosen by the parser, since items
 * cannot be written until all items for the block have been chosen and the
 * block's Huffman codes have been computed.
 */
struct deflate_sequence {

	/* Bits 0..22: the number of literals in this run. This may be 0 and
	 * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not
	 * stored explicitly in this structure; instead, they are read directly
	 * from the uncompressed data.
	 *
	 * Bits 23..31: the length of the match which follows the literals, or 0
	 * if this literal run was the last in the block, so there is no match
	 * which follows it. */
	u32 litrunlen_and_length;

	/* If 'length' doesn't indicate end-of-block, then this is the offset of
	 * the match which follows the literals. */
	u16 offset;

	/* If 'length' doesn't indicate end-of-block, then this is the offset
	 * symbol of the match which follows the literals. */
	u8 offset_symbol;

	/* If 'length' doesn't indicate end-of-block, then this is the length
	 * slot of the match which follows the literals. */
	u8 length_slot;
};
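
/*
 * For reference, a sequence is decoded with the same shift and mask that
 * deflate_write_sequences() uses below:
 *
 *	u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
 *	u32 length    = seq->litrunlen_and_length >> 23;
 *
 * where 0x7FFFFF masks the low 23 bits (the literal run length).
 */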

#if SUPPORT_NEAR_OPTIMAL_PARSING

/* Costs for the near-optimal parsing algorithm. */
struct deflate_costs {

	/* The cost to output each possible literal. */
	u32 literal[DEFLATE_NUM_LITERALS];

	/* The cost to output each possible match length. */
	u32 length[DEFLATE_MAX_MATCH_LEN + 1];

	/* The cost to output a match offset of each possible offset slot. */
	u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
};

/*
 * This structure represents a byte position in the input data and a node in the
 * graph of possible match/literal choices for the current block.
 *
 * Logically, each incoming edge to this node is labeled with a literal or a
 * match that can be taken to reach this position from an earlier position; and
 * each outgoing edge from this node is labeled with a literal or a match that
 * can be taken to advance from this position to a later position.
 *
 * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we
 * associate with each node just two pieces of information:
 *
 *	'cost_to_end' is the minimum cost to reach the end of the block from
 *	this position.
 *
 *	'item' represents the literal or match that must be chosen from here to
 *	reach the end of the block with the minimum cost. Equivalently, this
 *	can be interpreted as the label of the outgoing edge on the minimum-cost
 *	path to the "end of block" node from this node.
 */
struct deflate_optimum_node {

	u32 cost_to_end;

	/*
	 * Notes on the match/literal representation used here:
	 *
	 *	The low bits of 'item' are the length: 1 if this is a literal,
	 *	or the match length if this is a match.
	 *
	 *	The high bits of 'item' are the actual literal byte if this is a
	 *	literal, or the match offset if this is a match.
	 */
#define OPTIMUM_OFFSET_SHIFT 9
#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
	u32 item;

};
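
/*
 * So the minimum-cost choice at a node is recovered as:
 *
 *	u32 len               = node->item & OPTIMUM_LEN_MASK;
 *	u32 offset_or_literal = node->item >> OPTIMUM_OFFSET_SHIFT;
 *
 * with len == 1 meaning "emit the literal byte offset_or_literal" and
 * len > 1 meaning "emit a match of that length at that offset".
 */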

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/* Block split statistics. See "Block splitting algorithm" below. */
#define NUM_LITERAL_OBSERVATION_TYPES 8
#define NUM_MATCH_OBSERVATION_TYPES 2
#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
struct block_split_stats {
	u32 new_observations[NUM_OBSERVATION_TYPES];
	u32 observations[NUM_OBSERVATION_TYPES];
	u32 num_new_observations;
	u32 num_observations;
};

/* The main DEFLATE compressor structure */
struct libdeflate_compressor {

	/* Pointer to the compress() implementation chosen at allocation time */
	size_t (*impl)(struct libdeflate_compressor *,
		       const u8 *, size_t, u8 *, size_t);

	/* Frequency counters for the current block */
	struct deflate_freqs freqs;

	/* Dynamic Huffman codes for the current block */
	struct deflate_codes codes;

	/* Static Huffman codes */
	struct deflate_codes static_codes;

	/* Block split statistics for the currently pending block */
	struct block_split_stats split_stats;

	/* A table for fast lookups of offset slot by match offset.
	 *
	 * If the full table is being used, it is a direct mapping from offset
	 * to offset slot.
	 *
	 * If the condensed table is being used, the first 256 entries map
	 * directly to the offset slots of offsets 1 through 256. The next 256
	 * entries map to the offset slots for the remaining offsets, stepping
	 * through the offsets with a stride of 128. This relies on the fact
	 * that each of the remaining offset slots contains at least 128 offsets
	 * and has an offset base that is a multiple of 128. */
#if USE_FULL_OFFSET_SLOT_FAST
	u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
#else
	u8 offset_slot_fast[512];
#endif

	/* The "nice" match length: if a match of this length is found, choose
	 * it immediately without further consideration. */
	unsigned nice_match_length;

	/* The maximum search depth: consider at most this many potential
	 * matches at each position. */
	unsigned max_search_depth;

	/* The compression level with which this compressor was created. */
	unsigned compression_level;

	/* Anything smaller than this we won't bother trying to compress. */
	unsigned min_size_to_compress;

	/* Temporary space for Huffman code output */
	u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
	u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
	u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
	unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS];
	unsigned num_litlen_syms;
	unsigned num_offset_syms;
	unsigned num_explicit_lens;
	unsigned num_precode_items;

	union {
		/* Data for greedy or lazy parsing */
		struct {
			/* Hash chains matchfinder */
			struct hc_matchfinder hc_mf;

			/* The matches and literals that the parser has chosen
			 * for the current block. The required length of this
			 * array is limited by the maximum number of matches
			 * that can ever be chosen for a single block, plus one
			 * for the special entry at the end. */
			struct deflate_sequence sequences[
				DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     DEFLATE_MIN_MATCH_LEN) + 1];
		} g; /* (g)reedy */

		/* Data for fastest parsing */
		struct {
			/* Hash table matchfinder */
			struct ht_matchfinder ht_mf;

			struct deflate_sequence sequences[
				DIV_ROUND_UP(FAST_BLOCK_LENGTH,
					     HT_MATCHFINDER_MIN_MATCH_LEN) + 1];
		} f; /* (f)astest */

#if SUPPORT_NEAR_OPTIMAL_PARSING
		/* Data for near-optimal parsing */
		struct {

			/* Binary tree matchfinder */
			struct bt_matchfinder bt_mf;

			/*
			 * Cached matches for the current block. This array
			 * contains the matches that were found at each position
			 * in the block. Specifically, for each position, there
			 * is a list of matches found at that position, if any,
			 * sorted by strictly increasing length. In addition,
			 * following the matches for each position, there is a
			 * special 'struct lz_match' whose 'length' member
			 * contains the number of matches found at that
			 * position, and whose 'offset' member contains the
			 * literal at that position.
			 *
			 * Note: in rare cases, there will be a very high number
			 * of matches in the block and this array will overflow.
			 * If this happens, we force the end of the current
			 * block. MATCH_CACHE_LENGTH is the length at which we
			 * actually check for overflow. The extra slots beyond
			 * this are enough to absorb the worst case overflow,
			 * which occurs if starting at
			 * &match_cache[MATCH_CACHE_LENGTH - 1], we write
			 * MAX_MATCHES_PER_POS matches and a match count header,
			 * then skip searching for matches at
			 * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the
			 * match count header for each.
			 */
			struct lz_match match_cache[MATCH_CACHE_LENGTH +
						    MAX_MATCHES_PER_POS +
						    DEFLATE_MAX_MATCH_LEN - 1];

			/*
			 * Array of nodes, one per position, for running the
			 * minimum-cost path algorithm.
			 *
			 * This array must be large enough to accommodate the
			 * worst-case number of nodes, which occurs if we find a
			 * match of length DEFLATE_MAX_MATCH_LEN at position
			 * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
			 * length SOFT_MAX_BLOCK_LENGTH - 1 +
			 * DEFLATE_MAX_MATCH_LEN. Add one for the end-of-block
			 * node.
			 */
			struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
								  DEFLATE_MAX_MATCH_LEN + 1];

			/* The current cost model being used. */
			struct deflate_costs costs;

			/* Literal/match statistics saved from previous block */
			u32 prev_observations[NUM_OBSERVATION_TYPES];
			u32 prev_num_observations;

			/*
			 * Approximate match length frequencies based on a
			 * greedy parse, gathered during matchfinding. This is
			 * used for setting the initial symbol costs.
			 */
			u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];

			unsigned num_optim_passes;
		} n; /* (n)ear-optimal */
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

	} p; /* (p)arser */
};

/*
 * The type for the bitbuffer variable, which temporarily holds bits that are
 * being packed into bytes and written to the output buffer. For best
 * performance, this should have size equal to a machine word.
 */
typedef machine_word_t bitbuf_t;
#define BITBUF_NBITS (8 * sizeof(bitbuf_t))

/* Can the specified number of bits always be added to 'bitbuf' after any
 * pending bytes have been flushed? */
#define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7)
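
/*
 * For instance, on a 64-bit machine BITBUF_NBITS is 64, so CAN_BUFFER(n)
 * holds for n <= 57. In particular CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN),
 * i.e. CAN_BUFFER(56), is true, which is what lets deflate_write_sequences()
 * below emit four 14-bit literals between flushes.
 */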

/*
 * Structure to keep track of the current state of sending bits to the
 * compressed output buffer.
 */
struct deflate_output_bitstream {

	/* Bits that haven't yet been written to the output buffer. */
	bitbuf_t bitbuf;

	/* Number of bits currently held in @bitbuf. */
	unsigned bitcount;

	/* Pointer to the beginning of the output buffer. */
	u8 *begin;

	/* Pointer to the position in the output buffer at which the next byte
	 * should be written. */
	u8 *next;

	/* Pointer just past the end of the output buffer. */
	u8 *end;
};

/*
 * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
 * present following os->end, in order to not overrun the buffer when generating
 * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
 * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However,
 * to make the compression algorithm produce the same result on all CPU
 * architectures (which is sometimes desirable), we have to unconditionally use
 * the maximum for any CPU, which is sizeof(bitbuf_t) == 8.
 */
#define OUTPUT_END_PADDING 8
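
/*
 * A minimal usage sketch of the bitstream functions below, for a caller with
 * an output buffer 'out' of 'out_nbytes' bytes (out_nbytes must be at least
 * OUTPUT_END_PADDING):
 *
 *	struct deflate_output_bitstream os;
 *
 *	deflate_init_output(&os, out, out_nbytes);
 *	...deflate_add_bits() / deflate_flush_bits()...
 *	return deflate_flush_output(&os);  (returns 0 on overflow)
 */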

/* Initialize the output bitstream. 'size' is assumed to be at least
 * OUTPUT_END_PADDING. */
static void
deflate_init_output(struct deflate_output_bitstream *os,
		    void *buffer, size_t size)
{
	os->bitbuf = 0;
	os->bitcount = 0;
	os->begin = buffer;
	os->next = os->begin;
	os->end = os->begin + size - OUTPUT_END_PADDING;
}

/* Add some bits to the bitbuffer variable of the output bitstream. The caller
 * must make sure there is enough room. */
static forceinline void
deflate_add_bits(struct deflate_output_bitstream *os,
		 const bitbuf_t bits, const unsigned num_bits)
{
	os->bitbuf |= bits << os->bitcount;
	os->bitcount += num_bits;
}

/* Flush bits from the bitbuffer variable to the output buffer. */
static forceinline void
deflate_flush_bits(struct deflate_output_bitstream *os)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		/* Flush a whole word (branchlessly). */
		put_unaligned_leword(os->bitbuf, os->next);
		os->bitbuf >>= os->bitcount & ~7;
		os->next += MIN(os->end - os->next, os->bitcount >> 3);
		os->bitcount &= 7;
	} else {
		/* Flush a byte at a time. */
		while (os->bitcount >= 8) {
			*os->next = os->bitbuf;
			if (os->next != os->end)
				os->next++;
			os->bitcount -= 8;
			os->bitbuf >>= 8;
		}
	}
}

/* Align the bitstream on a byte boundary. */
static forceinline void
deflate_align_bitstream(struct deflate_output_bitstream *os)
{
	os->bitcount += -os->bitcount & 7;
	deflate_flush_bits(os);
}
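
/*
 * Here '-os->bitcount & 7' is the number of padding bits needed to round
 * bitcount up to a multiple of 8: e.g. a bitcount of 13 gains (-13 & 7) == 3
 * bits to reach 16, after which the flush empties the now-whole bytes.
 */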

/*
 * Flush any remaining bits to the output buffer if needed. Return the total
 * number of bytes written to the output buffer, or 0 if an overflow occurred.
 */
static size_t
deflate_flush_output(struct deflate_output_bitstream *os)
{
	if (os->next == os->end) /* overflow? */
		return 0;

	while ((int)os->bitcount > 0) {
		*os->next++ = os->bitbuf;
		os->bitcount -= 8;
		os->bitbuf >>= 8;
	}

	return os->next - os->begin;
}

/* Given the binary tree node A[subtree_idx] whose children already
 * satisfy the maxheap property, swap the node with its greater child
 * until it is greater than both its children, so that the maxheap
 * property is satisfied in the subtree rooted at A[subtree_idx]. */
static void
heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
{
	unsigned parent_idx;
	unsigned child_idx;
	u32 v;

	v = A[subtree_idx];
	parent_idx = subtree_idx;
	while ((child_idx = parent_idx * 2) <= length) {
		if (child_idx < length && A[child_idx + 1] > A[child_idx])
			child_idx++;
		if (v >= A[child_idx])
			break;
		A[parent_idx] = A[child_idx];
		parent_idx = child_idx;
	}
	A[parent_idx] = v;
}

/* Rearrange the array 'A' so that it satisfies the maxheap property.
 * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
 */
static void
heapify_array(u32 A[], unsigned length)
{
	unsigned subtree_idx;

	for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
		heapify_subtree(A, length, subtree_idx);
}

/*
 * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
 *
 * Note: name this function heap_sort() instead of heapsort() to avoid colliding
 * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
 * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
 */
static void
heap_sort(u32 A[], unsigned length)
{
	A--; /* Use 1-based indices */

	heapify_array(A, length);

	while (length >= 2) {
		u32 tmp = A[length];
		A[length] = A[1];
		A[1] = tmp;
		length--;
		heapify_subtree(A, length, 1);
	}
}

#define NUM_SYMBOL_BITS 10
#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
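
/*
 * The sorting and tree-building code below packs a symbol and its frequency
 * into a single u32:
 *
 *	entry = sym | (freq << NUM_SYMBOL_BITS);
 *	sym   = entry & SYMBOL_MASK;
 *	freq  = entry >> NUM_SYMBOL_BITS;
 *
 * This is exactly the layout sort_symbols() produces and build_tree() consumes.
 */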

#define GET_NUM_COUNTERS(num_syms) ((((num_syms) + 3 / 4) + 3) & ~3)

/*
 * Sort the symbols primarily by frequency and secondarily by symbol
 * value. Discard symbols with zero frequency and fill in an array with
 * the remaining symbols, along with their frequencies. The low
 * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
 * value, and the remaining bits will contain the frequency.
 *
 * @num_syms
 *	Number of symbols in the alphabet.
 *	Can't be greater than (1 << NUM_SYMBOL_BITS).
 *
 * @freqs[num_syms]
 *	The frequency of each symbol.
 *
 * @lens[num_syms]
 *	An array that eventually will hold the length of each codeword.
 *	This function only fills in the codeword lengths for symbols that
 *	have zero frequency, which are not well defined per se but will
 *	be set to 0.
 *
 * @symout[num_syms]
 *	The output array, described above.
 *
 * Returns the number of entries in 'symout' that were filled. This is
 * the number of symbols that have nonzero frequency.
 */
static unsigned
sort_symbols(unsigned num_syms, const u32 freqs[restrict],
	     u8 lens[restrict], u32 symout[restrict])
{
	unsigned sym;
	unsigned i;
	unsigned num_used_syms;
	unsigned num_counters;
	unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];

	/* We rely on heapsort, but with an added optimization. Since
	 * it's common for most symbol frequencies to be low, we first do
	 * a count sort using a limited number of counters. High
	 * frequencies will be counted in the last counter, and only they
	 * will be sorted with heapsort.
	 *
	 * Note: with more symbols, it is generally beneficial to have more
	 * counters. About 1 counter per 4 symbols seems fast.
	 *
	 * Note: I also tested radix sort, but even for large symbol
	 * counts (> 255) and frequencies bounded at 16 bits (enabling
	 * radix sort by just two base-256 digits), it didn't seem any
	 * faster than the method implemented here.
	 *
	 * Note: I tested the optimized quicksort implementation from
	 * glibc (with indirection overhead removed), but it was only
	 * marginally faster than the simple heapsort implemented here.
	 *
	 * Tests were done with building the codes for LZX. Results may
	 * vary for different compression algorithms...! */

	num_counters = GET_NUM_COUNTERS(num_syms);

	memset(counters, 0, num_counters * sizeof(counters[0]));

	/* Count the frequencies. */
	for (sym = 0; sym < num_syms; sym++)
		counters[MIN(freqs[sym], num_counters - 1)]++;

	/* Make the counters cumulative, ignoring the zero-th, which
	 * counted symbols with zero frequency. As a side effect, this
	 * calculates the number of symbols with nonzero frequency. */
	num_used_syms = 0;
	for (i = 1; i < num_counters; i++) {
		unsigned count = counters[i];
		counters[i] = num_used_syms;
		num_used_syms += count;
	}

	/* Sort nonzero-frequency symbols using the counters. At the
	 * same time, set the codeword lengths of zero-frequency symbols
	 * to 0. */
	for (sym = 0; sym < num_syms; sym++) {
		u32 freq = freqs[sym];
		if (freq != 0) {
			symout[counters[MIN(freq, num_counters - 1)]++] =
				sym | (freq << NUM_SYMBOL_BITS);
		} else {
			lens[sym] = 0;
		}
	}

	/* Sort the symbols counted in the last counter. */
	heap_sort(symout + counters[num_counters - 2],
		  counters[num_counters - 1] - counters[num_counters - 2]);

	return num_used_syms;
}

/*
 * Build the Huffman tree.
 *
 * This is an optimized implementation that
 *	(a) takes advantage of the frequencies being already sorted;
 *	(b) only generates non-leaf nodes, since the non-leaf nodes of a
 *	    Huffman tree are sufficient to generate a canonical code;
 *	(c) Only stores parent pointers, not child pointers;
 *	(d) Produces the nodes in the same memory used for input
 *	    frequency information.
 *
 * Array 'A', which contains 'sym_count' entries, is used for both input
 * and output. For this function, 'sym_count' must be at least 2.
 *
 * For input, the array must contain the frequencies of the symbols,
 * sorted in increasing order. Specifically, each entry must contain a
 * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low
 * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
 * Although these bits will, in fact, contain the symbols that correspond
 * to the frequencies, this function is concerned with frequencies only
 * and keeps the symbols as-is.
 *
 * For output, this function will produce the non-leaf nodes of the
 * Huffman tree. These nodes will be stored in the first (sym_count - 1)
 * entries of the array. Entry A[sym_count - 2] will represent the root
 * node. Each other node will contain the zero-based index of its parent
 * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low
 * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again,
 * note that although these low bits will, in fact, contain a symbol
 * value, this symbol will have *no relationship* with the Huffman tree
 * node that happens to occupy the same slot. This is because this
 * implementation only generates the non-leaf nodes of the tree.
 */
static void
build_tree(u32 A[], unsigned sym_count)
{
	/* Index, in 'A', of next lowest frequency symbol that has not
	 * yet been processed. */
	unsigned i = 0;

	/* Index, in 'A', of next lowest frequency parentless non-leaf
	 * node; or, if equal to 'e', then no such node exists yet. */
	unsigned b = 0;

	/* Index, in 'A', of next node to allocate as a non-leaf. */
	unsigned e = 0;

	do {
		unsigned m, n;
		u32 freq_shifted;

		/* Choose the two next lowest frequency entries. */

		if (i != sym_count &&
		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
			m = i++;
		else
			m = b++;

		if (i != sym_count &&
		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
			n = i++;
		else
			n = b++;

		/* Allocate a non-leaf node and link the entries to it.
		 *
		 * If we link an entry that we're visiting for the first
		 * time (via index 'i'), then we're actually linking a
		 * leaf node and it will have no effect, since the leaf
		 * will be overwritten with a non-leaf when index 'e'
		 * catches up to it. But it's not any slower to
		 * unconditionally set the parent index.
		 *
		 * We also compute the frequency of the non-leaf node as
		 * the sum of its two children's frequencies. */

		freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);

		A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
		A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
		A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
		e++;
	} while (sym_count - e > 1);
	/* When just one entry remains, it is a "leaf" that was
	 * linked to some other node. We ignore it, since the
	 * rest of the array contains the non-leaves which we
	 * need. (Note that we're assuming the cases with 0 or 1
	 * symbols were handled separately.) */
}

/*
 * Given the stripped-down Huffman tree constructed by build_tree(),
 * determine the number of codewords that should be assigned each
 * possible length, taking into account the length-limited constraint.
 *
 * @A
 *	The array produced by build_tree(), containing parent index
 *	information for the non-leaf nodes of the Huffman tree. Each
 *	entry in this array is a node; a node's parent always has a
 *	greater index than that node itself. This function will
 *	overwrite the parent index information in this array, so
 *	essentially it will destroy the tree. However, the data in the
 *	low NUM_SYMBOL_BITS of each entry will be preserved.
 *
 * @root_idx
 *	The 0-based index of the root node in 'A', and consequently one
 *	less than the number of tree node entries in 'A'. (Or, really 2
 *	less than the actual length of 'A'.)
 *
 * @len_counts
 *	An array of length ('max_codeword_len' + 1) in which the number of
 *	codewords having each length <= max_codeword_len will be
 *	returned.
 *
 * @max_codeword_len
 *	The maximum permissible codeword length.
 */
static void
compute_length_counts(u32 A[restrict], unsigned root_idx,
		      unsigned len_counts[restrict], unsigned max_codeword_len)
{
	unsigned len;
	int node;

	/* The key observations are:
	 *
	 * (1) We can traverse the non-leaf nodes of the tree, always
	 * visiting a parent before its children, by simply iterating
	 * through the array in reverse order. Consequently, we can
	 * compute the depth of each node in one pass, overwriting the
	 * parent indices with depths.
	 *
	 * (2) We can initially assume that in the real Huffman tree,
	 * both children of the root are leaves. This corresponds to two
	 * codewords of length 1. Then, whenever we visit a (non-leaf)
	 * node during the traversal, we modify this assumption to
	 * account for the current node *not* being a leaf, but rather
	 * its two children being leaves. This causes the loss of one
	 * codeword for the current depth and the addition of two
	 * codewords for the current depth plus one.
	 *
	 * (3) We can handle the length-limited constraint fairly easily
	 * by simply using the largest length available when a depth
	 * exceeds max_codeword_len.
	 */

	for (len = 0; len <= max_codeword_len; len++)
		len_counts[len] = 0;
	len_counts[1] = 2;

	/* Set the root node's depth to 0. */
	A[root_idx] &= SYMBOL_MASK;

	for (node = root_idx - 1; node >= 0; node--) {

		/* Calculate the depth of this node. */

		unsigned parent = A[node] >> NUM_SYMBOL_BITS;
		unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
		unsigned depth = parent_depth + 1;
		unsigned len = depth;

		/* Set the depth of this node so that it is available
		 * when its children (if any) are processed. */

		A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);

		/* If needed, decrease the length to meet the
		 * length-limited constraint. This is not the optimal
		 * method for generating length-limited Huffman codes!
		 * But it should be good enough. */
		if (len >= max_codeword_len) {
			len = max_codeword_len;
			do {
				len--;
			} while (len_counts[len] == 0);
		}

		/* Account for the fact that we have a non-leaf node at
		 * the current depth. */
		len_counts[len]--;
		len_counts[len + 1] += 2;
	}
}

/*
 * Generate the codewords for a canonical Huffman code.
 *
 * @A
 *	The output array for codewords. In addition, initially this
 *	array must contain the symbols, sorted primarily by frequency and
 *	secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
 *	each entry.
 *
 * @len
 *	Output array for codeword lengths.
 *
 * @len_counts
 *	An array that provides the number of codewords that will have
 *	each possible length <= max_codeword_len.
 *
 * @max_codeword_len
 *	Maximum length, in bits, of each codeword.
 *
 * @num_syms
 *	Number of symbols in the alphabet, including symbols with zero
 *	frequency. This is the length of the 'A' and 'len' arrays.
 */
static void
gen_codewords(u32 A[restrict], u8 lens[restrict],
	      const unsigned len_counts[restrict],
	      unsigned max_codeword_len, unsigned num_syms)
{
	u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
	unsigned i;
	unsigned len;
	unsigned sym;

	/* Given the number of codewords that will have each length,
	 * assign codeword lengths to symbols. We do this by assigning
	 * the lengths in decreasing order to the symbols sorted
	 * primarily by increasing frequency and secondarily by
	 * increasing symbol value. */
	for (i = 0, len = max_codeword_len; len >= 1; len--) {
		unsigned count = len_counts[len];
		while (count--)
			lens[A[i++] & SYMBOL_MASK] = len;
	}

	/* Generate the codewords themselves. We initialize the
	 * 'next_codewords' array to provide the lexicographically first
	 * codeword of each length, then assign codewords in symbol
	 * order. This produces a canonical code. */
	next_codewords[0] = 0;
	next_codewords[1] = 0;
	for (len = 2; len <= max_codeword_len; len++)
		next_codewords[len] =
			(next_codewords[len - 1] + len_counts[len - 1]) << 1;

	for (sym = 0; sym < num_syms; sym++)
		A[sym] = next_codewords[lens[sym]]++;
}
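
/*
 * Worked example: if len_counts[] says there is one codeword of length 1 and
 * two of length 2, then next_codewords[] is initialized to {0, 0, 2}, since
 * (0 + 1) << 1 == 2. Assigning in symbol order then yields the canonical
 * codewords 0 (length 1), then 10 and 11 (length 2).
 */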

/*
 * ---------------------------------------------------------------------
 *			make_canonical_huffman_code()
 * ---------------------------------------------------------------------
 *
 * Given an alphabet and the frequency of each symbol in it, construct a
 * length-limited canonical Huffman code.
 *
 * @num_syms
 *	The number of symbols in the alphabet. The symbols are the
 *	integers in the range [0, num_syms - 1]. This parameter must be
 *	at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
 *
 * @max_codeword_len
 *	The maximum permissible codeword length.
 *
 * @freqs
 *	An array of @num_syms entries, each of which specifies the
 *	frequency of the corresponding symbol. It is valid for some,
 *	none, or all of the frequencies to be 0.
 *
 * @lens
 *	An array of @num_syms entries in which this function will return
 *	the length, in bits, of the codeword assigned to each symbol.
 *	Symbols with 0 frequency will not have codewords per se, but
 *	their entries in this array will be set to 0. No lengths greater
 *	than @max_codeword_len will be assigned.
 *
 * @codewords
 *	An array of @num_syms entries in which this function will return
 *	the codeword for each symbol, right-justified and padded on the
 *	left with zeroes. Codewords for symbols with 0 frequency will be
 *	undefined.
 *
 * ---------------------------------------------------------------------
 *
 * This function builds a length-limited canonical Huffman code.
 *
 * A length-limited Huffman code contains no codewords longer than some
 * specified length, and has exactly (with some algorithms) or
 * approximately (with the algorithm used here) the minimum weighted path
 * length from the root, given this constraint.
 *
 * A canonical Huffman code satisfies the properties that a longer
 * codeword never lexicographically precedes a shorter codeword, and the
 * lexicographic ordering of codewords of the same length is the same as
 * the lexicographic ordering of the corresponding symbols. A canonical
 * Huffman code, or more generally a canonical prefix code, can be
 * reconstructed from only a list containing the codeword length of each
 * symbol.
 *
 * The classic algorithm to generate a Huffman code creates a node for
 * each symbol, then inserts these nodes into a min-heap keyed by symbol
 * frequency. Then, repeatedly, the two lowest-frequency nodes are
 * removed from the min-heap and added as the children of a new node
 * having frequency equal to the sum of its two children, which is then
 * inserted into the min-heap. When only a single node remains in the
 * min-heap, it is the root of the Huffman tree. The codeword for each
 * symbol is determined by the path needed to reach the corresponding
 * node from the root. Descending to the left child appends a 0 bit,
 * whereas descending to the right child appends a 1 bit.
 *
 * The classic algorithm is relatively easy to understand, but it is
 * subject to a number of inefficiencies. In practice, it is fastest to
 * first sort the symbols by frequency. (This itself can be subject to
 * an optimization based on the fact that most frequencies tend to be
 * low.) At the same time, we sort secondarily by symbol value, which
 * aids the process of generating a canonical code. Then, during tree
 * construction, no heap is necessary because both the leaf nodes and the
 * unparented non-leaf nodes can be easily maintained in sorted order.
 * Consequently, there can never be more than two possibilities for the
 * next-lowest-frequency node.
 *
 * In addition, because we're generating a canonical code, we actually
 * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
 * This is because for canonical code generation we don't need to know
 * where the symbols are in the tree. Rather, we only need to know how
 * many leaf nodes have each depth (codeword length). And this
 * information can, in fact, be quickly generated from the tree of
 * non-leaves only.
 *
 * Furthermore, we can build this stripped-down Huffman tree directly in
 * the array in which the codewords are to be generated, provided that
 * these array slots are large enough to hold a symbol and frequency
 * value.
 *
 * Still furthermore, we don't even need to maintain explicit child
 * pointers. We only need the parent pointers, and even those can be
 * overwritten in-place with depth information as part of the process of
 * extracting codeword lengths from the tree. So in summary, we do NOT
 * need a big structure like:
 *
 *	struct huffman_tree_node {
 *		unsigned int symbol;
 *		unsigned int frequency;
 *		unsigned int depth;
 *		struct huffman_tree_node *left_child;
 *		struct huffman_tree_node *right_child;
 *	};
 *
 * ... which often gets used in "naive" implementations of Huffman code
 * generation.
 *
 * Many of these optimizations are based on the implementation in 7-Zip
 * (source file: C/HuffEnc.c), which has been placed in the public domain
 * by Igor Pavlov.
 */
static void
make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
			    const u32 freqs[restrict],
			    u8 lens[restrict], u32 codewords[restrict])
{
	u32 *A = codewords;
	unsigned num_used_syms;

	STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);

	/* We begin by sorting the symbols primarily by frequency and
	 * secondarily by symbol value. As an optimization, the array
	 * used for this purpose ('A') shares storage with the space in
	 * which we will eventually return the codewords. */

	num_used_syms = sort_symbols(num_syms, freqs, lens, A);

	/* 'num_used_syms' is the number of symbols with nonzero
	 * frequency. This may be less than @num_syms. 'num_used_syms'
	 * is also the number of entries in 'A' that are valid. Each
	 * entry consists of a distinct symbol and a nonzero frequency
	 * packed into a 32-bit integer. */

	/* Handle special cases where only 0 or 1 symbols were used (had
	 * nonzero frequency). */

	if (unlikely(num_used_syms == 0)) {
		/* Code is empty. sort_symbols() already set all lengths
		 * to 0, so there is nothing more to do. */
		return;
	}

	if (unlikely(num_used_syms == 1)) {
		/* Only one symbol was used, so we only need one
		 * codeword. But two codewords are needed to form the
		 * smallest complete Huffman code, which uses codewords 0
		 * and 1. Therefore, we choose another symbol to which
		 * to assign a codeword. We use 0 (if the used symbol is
		 * not 0) or 1 (if the used symbol is 0). In either
		 * case, the lesser-valued symbol must be assigned
		 * codeword 0 so that the resulting code is canonical. */

		unsigned sym = A[0] & SYMBOL_MASK;
		unsigned nonzero_idx = sym ? sym : 1;

		codewords[0] = 0;
		lens[0] = 1;
		codewords[nonzero_idx] = 1;
		lens[nonzero_idx] = 1;
		return;
	}

	/* Build a stripped-down version of the Huffman tree, sharing the
	 * array 'A' with the symbol values. Then extract length counts
	 * from the tree and use them to generate the final codewords. */

	build_tree(A, num_used_syms);

	{
		unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];

		compute_length_counts(A, num_used_syms - 2,
				      len_counts, max_codeword_len);

		gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
	}
}

/*
 * Clear the Huffman symbol frequency counters.
 * This must be called when starting a new DEFLATE block.
 */
static void
deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
{
	memset(&c->freqs, 0, sizeof(c->freqs));
}

/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */
static u32
deflate_reverse_codeword(u32 codeword, u8 len)
{
	/* The following branchless algorithm is faster than going bit by bit.
	 * Note: since no codewords are longer than 16 bits, we only need to
	 * reverse the low 16 bits of the 'u32'. */
	STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);

	/* Flip adjacent 1-bit fields */
	codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);

	/* Flip adjacent 2-bit fields */
	codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);

	/* Flip adjacent 4-bit fields */
	codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);

	/* Flip adjacent 8-bit fields */
	codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);

	/* Return the high 'len' bits of the bit-reversed 16 bit value. */
	return codeword >> (16 - len);
}
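
/*
 * Example: reversing the 3-bit codeword 0b110 yields 0b011. The four flip
 * steps bit-reverse the full 16-bit value (0x0006 becomes 0x6000), and the
 * final shift by 16 - 3 == 13 keeps just the reversed 3 bits.
 */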

/* Make a canonical Huffman code with bit-reversed codewords. */
static void
deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
			  const u32 freqs[], u8 lens[], u32 codewords[])
{
	unsigned sym;

	make_canonical_huffman_code(num_syms, max_codeword_len,
				    freqs, lens, codewords);

	for (sym = 0; sym < num_syms; sym++)
		codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]);
}

/*
 * Build the literal/length and offset Huffman codes for a DEFLATE block.
 *
 * This takes as input the frequency tables for each code and produces as output
 * a set of tables that map symbols to codewords and codeword lengths.
 */
static void
deflate_make_huffman_codes(const struct deflate_freqs *freqs,
			   struct deflate_codes *codes)
{
	STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
	STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);

	deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
				  MAX_LITLEN_CODEWORD_LEN,
				  freqs->litlen,
				  codes->lens.litlen,
				  codes->codewords.litlen);

	deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
				  MAX_OFFSET_CODEWORD_LEN,
				  freqs->offset,
				  codes->lens.offset,
				  codes->codewords.offset);
}

/* Initialize c->static_codes. */
static void
deflate_init_static_codes(struct libdeflate_compressor *c)
{
	unsigned i;

	for (i = 0; i < 144; i++)
		c->freqs.litlen[i] = 1 << (9 - 8);
	for (; i < 256; i++)
		c->freqs.litlen[i] = 1 << (9 - 9);
	for (; i < 280; i++)
		c->freqs.litlen[i] = 1 << (9 - 7);
	for (; i < 288; i++)
		c->freqs.litlen[i] = 1 << (9 - 8);

	for (i = 0; i < 32; i++)
		c->freqs.offset[i] = 1 << (5 - 5);

	deflate_make_huffman_codes(&c->freqs, &c->static_codes);
}

/* Return the offset slot for the specified match offset. */
static forceinline unsigned
deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
{
#if USE_FULL_OFFSET_SLOT_FAST
	return c->offset_slot_fast[offset];
#else
	if (offset <= 256)
		return c->offset_slot_fast[offset - 1];
	else
		return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
#endif
}
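
/*
 * With the condensed table, for example, offset 300 is looked up at index
 * 256 + ((300 - 1) >> 7) == 258. This works because, as noted in the
 * offset_slot_fast comment above, every offset slot past offset 256 spans at
 * least 128 offsets, so a stride of 128 never skips over a slot boundary.
 */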

/* Write the header fields common to all DEFLATE block types. */
static void
deflate_write_block_header(struct deflate_output_bitstream *os,
			   bool is_final_block, unsigned block_type)
{
	deflate_add_bits(os, is_final_block, 1);
	deflate_add_bits(os, block_type, 2);
	deflate_flush_bits(os);
}

static unsigned
deflate_compute_precode_items(const u8 lens[restrict],
			      const unsigned num_lens,
			      u32 precode_freqs[restrict],
			      unsigned precode_items[restrict])
{
	unsigned *itemptr;
	unsigned run_start;
	unsigned run_end;
	unsigned extra_bits;
	u8 len;

	memset(precode_freqs, 0,
	       DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));

	itemptr = precode_items;
	run_start = 0;
	do {
		/* Find the next run of codeword lengths. */

		/* len = the length being repeated */
		len = lens[run_start];

		/* Extend the run. */
		run_end = run_start;
		do {
			run_end++;
		} while (run_end != num_lens && len == lens[run_end]);

		if (len == 0) {
			/* Run of zeroes. */

			/* Symbol 18: RLE 11 to 138 zeroes at a time. */
			while ((run_end - run_start) >= 11) {
				extra_bits = MIN((run_end - run_start) - 11, 0x7F);
				precode_freqs[18]++;
				*itemptr++ = 18 | (extra_bits << 5);
				run_start += 11 + extra_bits;
			}

			/* Symbol 17: RLE 3 to 10 zeroes at a time. */
			if ((run_end - run_start) >= 3) {
				extra_bits = MIN((run_end - run_start) - 3, 0x7);
				precode_freqs[17]++;
				*itemptr++ = 17 | (extra_bits << 5);
				run_start += 3 + extra_bits;
			}
		} else {

			/* A run of nonzero lengths. */

			/* Symbol 16: RLE 3 to 6 of the previous length. */
			if ((run_end - run_start) >= 4) {
				precode_freqs[len]++;
				*itemptr++ = len;
				run_start++;
				do {
					extra_bits = MIN((run_end - run_start) - 3, 0x3);
					precode_freqs[16]++;
					*itemptr++ = 16 | (extra_bits << 5);
					run_start += 3 + extra_bits;
				} while ((run_end - run_start) >= 3);
			}
		}

		/* Output any remaining lengths without RLE. */
		while (run_start != run_end) {
			precode_freqs[len]++;
			*itemptr++ = len;
			run_start++;
		}
	} while (run_start != num_lens);

	return itemptr - precode_items;
}
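
/*
 * For example, a run of 20 zero lengths is handled by the symbol 18 case:
 * extra_bits = MIN(20 - 11, 0x7F) == 9, so the single item 18 | (9 << 5) is
 * emitted, encoding "repeat zero 11 + 9 == 20 times".
 */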

/*
 * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
 * separate Huffman code, the "precode", which contains a symbol for each
 * possible codeword length in the larger code as well as several special
 * symbols to represent repeated codeword lengths (a form of run-length
 * encoding). The precode is itself constructed in canonical form, and its
 * codeword lengths are represented literally in 19 3-bit fields that
 * immediately precede the compressed codeword lengths of the larger code.
 */
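
/*
 * Concretely, deflate_write_huffman_header() below emits:
 *
 *	5 bits: num_litlen_syms - 257
 *	5 bits: num_offset_syms - 1
 *	4 bits: num_explicit_lens - 4
 *	3 bits per explicit precode codeword length, in the permuted order
 *	given by deflate_precode_lens_permutation[]
 *
 * followed by the precode-encoded lengths of the larger codes.
 */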
|
|
|
|
/* Precompute the information needed to output Huffman codes. */
|
|
static void
|
|
deflate_precompute_huffman_header(struct libdeflate_compressor *c)
|
|
{
|
|
/* Compute how many litlen and offset symbols are needed. */
|
|
|
|
for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
|
|
c->num_litlen_syms > 257;
|
|
c->num_litlen_syms--)
|
|
if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
|
|
break;
|
|
|
|
for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
|
|
c->num_offset_syms > 1;
|
|
c->num_offset_syms--)
|
|
if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
|
|
break;
|
|
|
|
/* If we're not using the full set of literal/length codeword lengths,
|
|
* then temporarily move the offset codeword lengths over so that the
|
|
* literal/length and offset codeword lengths are contiguous. */
|
|
|
|
STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
|
|
DEFLATE_NUM_LITLEN_SYMS);
|
|
|
|
if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
|
|
memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
|
|
(u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
|
|
c->num_offset_syms);
|
|
}
|
|
|
|
/* Compute the "items" (RLE / literal tokens and extra bits) with which
|
|
* the codeword lengths in the larger code will be output. */
|
|
c->num_precode_items =
|
|
deflate_compute_precode_items((u8 *)&c->codes.lens,
|
|
c->num_litlen_syms +
|
|
c->num_offset_syms,
|
|
c->precode_freqs,
|
|
c->precode_items);
|
|
|
|
/* Build the precode. */
|
|
STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
|
|
deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
|
|
MAX_PRE_CODEWORD_LEN,
|
|
c->precode_freqs, c->precode_lens,
|
|
c->precode_codewords);
|
|
|
|
/* Count how many precode lengths we actually need to output. */
|
|
for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
|
|
c->num_explicit_lens > 4;
|
|
c->num_explicit_lens--)
|
|
if (c->precode_lens[deflate_precode_lens_permutation[
|
|
c->num_explicit_lens - 1]] != 0)
|
|
break;
|
|
|
|
/* Restore the offset codeword lengths if needed. */
|
|
if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
|
|
memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
|
|
(u8 *)&c->codes.lens + c->num_litlen_syms,
|
|
c->num_offset_syms);
|
|
}
|
|
}
|
|
|
|
/* Output the Huffman codes. */
|
|
static void
|
|
deflate_write_huffman_header(struct libdeflate_compressor *c,
|
|
struct deflate_output_bitstream *os)
|
|
{
|
|
unsigned i;
|
|
|
|
deflate_add_bits(os, c->num_litlen_syms - 257, 5);
|
|
deflate_add_bits(os, c->num_offset_syms - 1, 5);
|
|
deflate_add_bits(os, c->num_explicit_lens - 4, 4);
|
|
deflate_flush_bits(os);
|
|
|
|
/* Output the lengths of the codewords in the precode. */
|
|
for (i = 0; i < c->num_explicit_lens; i++) {
|
|
deflate_add_bits(os, c->precode_lens[
|
|
deflate_precode_lens_permutation[i]], 3);
|
|
deflate_flush_bits(os);
|
|
}
|
|
|
|
/* Output the encoded lengths of the codewords in the larger code. */
|
|
for (i = 0; i < c->num_precode_items; i++) {
|
|
unsigned precode_item = c->precode_items[i];
|
|
unsigned precode_sym = precode_item & 0x1F;
|
|
deflate_add_bits(os, c->precode_codewords[precode_sym],
|
|
c->precode_lens[precode_sym]);
|
|
if (precode_sym >= 16) {
|
|
if (precode_sym == 16)
|
|
deflate_add_bits(os, precode_item >> 5, 2);
|
|
else if (precode_sym == 17)
|
|
deflate_add_bits(os, precode_item >> 5, 3);
|
|
else
|
|
deflate_add_bits(os, precode_item >> 5, 7);
|
|
}
|
|
STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7));
|
|
deflate_flush_bits(os);
|
|
}
|
|
}

static void
deflate_write_sequences(struct deflate_output_bitstream * restrict os,
			const struct deflate_codes * restrict codes,
			const struct deflate_sequence sequences[restrict],
			const u8 * restrict in_next)
{
	const struct deflate_sequence *seq = sequences;

	for (;;) {
		u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
		unsigned length = seq->litrunlen_and_length >> 23;
		unsigned length_slot;
		unsigned litlen_symbol;
		unsigned offset_symbol;

		if (litrunlen) {
#if 1
			while (litrunlen >= 4) {
				unsigned lit0 = in_next[0];
				unsigned lit1 = in_next[1];
				unsigned lit2 = in_next[2];
				unsigned lit3 = in_next[3];

				deflate_add_bits(os, codes->codewords.litlen[lit0],
						 codes->lens.litlen[lit0]);
				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit1],
						 codes->lens.litlen[lit1]);
				if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit2],
						 codes->lens.litlen[lit2]);
				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit3],
						 codes->lens.litlen[lit3]);
				deflate_flush_bits(os);
				in_next += 4;
				litrunlen -= 4;
			}
			if (litrunlen-- != 0) {
				deflate_add_bits(os, codes->codewords.litlen[*in_next],
						 codes->lens.litlen[*in_next]);
				if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);
				in_next++;
				if (litrunlen-- != 0) {
					deflate_add_bits(os, codes->codewords.litlen[*in_next],
							 codes->lens.litlen[*in_next]);
					if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
						deflate_flush_bits(os);
					in_next++;
					if (litrunlen-- != 0) {
						deflate_add_bits(os, codes->codewords.litlen[*in_next],
								 codes->lens.litlen[*in_next]);
						if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
							deflate_flush_bits(os);
						in_next++;
					}
				}
				if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);
			}
#else
			do {
				unsigned lit = *in_next++;
				deflate_add_bits(os, codes->codewords.litlen[lit],
						 codes->lens.litlen[lit]);
				deflate_flush_bits(os);
			} while (--litrunlen);
#endif
		}

		if (length == 0)
			return;

		in_next += length;

		length_slot = seq->length_slot;
		litlen_symbol = DEFLATE_FIRST_LEN_SYM + length_slot;

		/* Litlen symbol */
		deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
				 codes->lens.litlen[litlen_symbol]);

		/* Extra length bits */
		STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
					 DEFLATE_MAX_EXTRA_LENGTH_BITS));
		deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
				 deflate_extra_length_bits[length_slot]);

		if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_LENGTH_BITS +
				MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			deflate_flush_bits(os);

		/* Offset symbol */
		offset_symbol = seq->offset_symbol;
		deflate_add_bits(os, codes->codewords.offset[offset_symbol],
				 codes->lens.offset[offset_symbol]);

		if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			deflate_flush_bits(os);

		/* Extra offset bits */
		deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol],
				 deflate_extra_offset_bits[offset_symbol]);

		deflate_flush_bits(os);

		seq++;
	}
}

#if SUPPORT_NEAR_OPTIMAL_PARSING
/*
 * Follow the minimum-cost path in the graph of possible match/literal choices
 * for the current block and write out the matches/literals using the specified
 * Huffman codes.
 *
 * Note: this is slightly duplicated with deflate_write_sequences(), the reason
 * being that we don't want to waste time translating between intermediate
 * match/literal representations.
 */
static void
deflate_write_item_list(struct deflate_output_bitstream *os,
			const struct deflate_codes *codes,
			struct libdeflate_compressor *c,
			u32 block_length)
{
	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
	struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length];
	do {
		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
		unsigned litlen_symbol;
		unsigned length_slot;
		unsigned offset_slot;

		if (length == 1) {
			/* Literal */
			litlen_symbol = offset;
			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
					 codes->lens.litlen[litlen_symbol]);
			deflate_flush_bits(os);
		} else {
			/* Match length */
			length_slot = deflate_length_slot[length];
			litlen_symbol = DEFLATE_FIRST_LEN_SYM + length_slot;
			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
					 codes->lens.litlen[litlen_symbol]);

			deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
					 deflate_extra_length_bits[length_slot]);

			if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_LENGTH_BITS +
					MAX_OFFSET_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_OFFSET_BITS))
				deflate_flush_bits(os);

			/* Match offset */
			offset_slot = deflate_get_offset_slot(c, offset);
			deflate_add_bits(os, codes->codewords.offset[offset_slot],
					 codes->lens.offset[offset_slot]);

			if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_OFFSET_BITS))
				deflate_flush_bits(os);

			deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot],
					 deflate_extra_offset_bits[offset_slot]);

			deflate_flush_bits(os);
		}
		cur_node += length;
	} while (cur_node != end_node);
}
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/* Output the end-of-block symbol. */
static void
deflate_write_end_of_block(struct deflate_output_bitstream *os,
			   const struct deflate_codes *codes)
{
	deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
			 codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
	deflate_flush_bits(os);
}

static void
deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
				 const u8 *data, u16 len,
				 bool is_final_block)
{
	deflate_write_block_header(os, is_final_block,
				   DEFLATE_BLOCKTYPE_UNCOMPRESSED);
	deflate_align_bitstream(os);

	if (4 + (u32)len >= os->end - os->next) {
		os->next = os->end;
		return;
	}

	put_unaligned_le16(len, os->next);
	os->next += 2;
	put_unaligned_le16(~len, os->next);
	os->next += 2;
	memcpy(os->next, data, len);
	os->next += len;
}

static void
deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
				  const u8 *data, size_t data_length,
				  bool is_final_block)
{
	do {
		u16 len = MIN(data_length, UINT16_MAX);

		deflate_write_uncompressed_block(os, data, len,
					is_final_block && len == data_length);
		data += len;
		data_length -= len;
	} while (data_length != 0);
}
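
/*
 * Illustrative example of the stored-block framing above: a block of 300
 * bytes is framed as the little-endian LEN field 0x012C (bytes 2C 01)
 * followed by its one's complement NLEN = 0xFED3 (bytes D3 FE), then the 300
 * raw bytes.
 */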

/*
 * Choose the best type of block to use (dynamic Huffman, static Huffman, or
 * uncompressed), then output it.
 */
static void
deflate_flush_block(struct libdeflate_compressor * restrict c,
		    struct deflate_output_bitstream * restrict os,
		    const u8 * restrict block_begin, u32 block_length,
		    const struct deflate_sequence *sequences,
		    bool is_final_block)
{
	static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
	};

	/* Costs are measured in bits */
	u32 dynamic_cost = 0;
	u32 static_cost = 0;
	u32 uncompressed_cost = 0;
	struct deflate_codes *codes;
	int block_type;
	unsigned sym;

	if (sequences != NULL /* !near_optimal */ ||
	    !SUPPORT_NEAR_OPTIMAL_PARSING) {
		/* Tally the end-of-block symbol. */
		c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;

		/* Build dynamic Huffman codes. */
		deflate_make_huffman_codes(&c->freqs, &c->codes);
	} /* Else, this was already done */

	/* Account for the cost of sending dynamic Huffman codes. */
	deflate_precompute_huffman_header(c);
	dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
	for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
		u32 extra = deflate_extra_precode_bits[sym];
		dynamic_cost += c->precode_freqs[sym] *
				(extra + c->precode_lens[sym]);
	}

	/* Account for the cost of encoding literals. */
	for (sym = 0; sym < 256; sym++) {
		dynamic_cost += c->freqs.litlen[sym] *
				c->codes.lens.litlen[sym];
	}
	for (sym = 0; sym < 144; sym++)
		static_cost += c->freqs.litlen[sym] * 8;
	for (; sym < 256; sym++)
		static_cost += c->freqs.litlen[sym] * 9;

	/* Account for the cost of encoding the end-of-block symbol. */
	dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK];
	static_cost += 7;

	/* Account for the cost of encoding lengths. */
	for (sym = DEFLATE_FIRST_LEN_SYM;
	     sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits);
	     sym++) {
		u32 extra = deflate_extra_length_bits[
					sym - DEFLATE_FIRST_LEN_SYM];
		dynamic_cost += c->freqs.litlen[sym] *
				(extra + c->codes.lens.litlen[sym]);
		static_cost += c->freqs.litlen[sym] *
				(extra + c->static_codes.lens.litlen[sym]);
	}

	/* Account for the cost of encoding offsets. */
	for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
		u32 extra = deflate_extra_offset_bits[sym];
		dynamic_cost += c->freqs.offset[sym] *
				(extra + c->codes.lens.offset[sym]);
		static_cost += c->freqs.offset[sym] * (extra + 5);
	}

	/* Compute the cost of using uncompressed blocks. */
	uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
			     (40 * (DIV_ROUND_UP(block_length,
						 UINT16_MAX) - 1)) +
			     (8 * block_length);

	/* Choose the cheapest block type. */
	if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
		block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
		codes = &c->codes;
	} else if (static_cost < uncompressed_cost) {
		block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
		codes = &c->static_codes;
	} else {
		block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
	}

	/* Now actually output the block. */

	if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
		/* Note: the length being flushed may exceed the maximum length
		 * of an uncompressed block (65535 bytes). Therefore, more than
		 * one uncompressed block might be needed. */
		deflate_write_uncompressed_blocks(os, block_begin, block_length,
						  is_final_block);
	} else {
		/* Output the block header. */
		deflate_write_block_header(os, is_final_block, block_type);

		/* Output the Huffman codes (dynamic Huffman blocks only). */
		if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
			deflate_write_huffman_header(c, os);

		/* Output the literals, matches, and end-of-block symbol. */
#if SUPPORT_NEAR_OPTIMAL_PARSING
		if (sequences == NULL)
			deflate_write_item_list(os, codes, c, block_length);
		else
#endif
			deflate_write_sequences(os, codes, sequences,
						block_begin);
		deflate_write_end_of_block(os, codes);
	}
}

static void
deflate_begin_sequences(struct libdeflate_compressor *c,
			struct deflate_sequence *first_seq)
{
	deflate_reset_symbol_frequencies(c);
	first_seq->litrunlen_and_length = 0;
}

static forceinline void
deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
		       struct deflate_sequence *seq)
{
	c->freqs.litlen[literal]++;
	seq->litrunlen_and_length++;
}

static forceinline void
deflate_choose_match(struct libdeflate_compressor *c,
		     unsigned length, unsigned offset,
		     struct deflate_sequence **seq_p)
{
	struct deflate_sequence *seq = *seq_p;
	unsigned length_slot = deflate_length_slot[length];
	unsigned offset_slot = deflate_get_offset_slot(c, offset);

	c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++;
	c->freqs.offset[offset_slot]++;

	seq->litrunlen_and_length |= (u32)length << 23;
	seq->offset = offset;
	seq->length_slot = length_slot;
	seq->offset_symbol = offset_slot;

	seq++;
	seq->litrunlen_and_length = 0;
	*seq_p = seq;
}
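
/*
 * Illustrative example of the 'litrunlen_and_length' packing used above: a
 * run of 5 literals followed by a match of length 10 is stored as
 * (10 << 23) | 5, i.e. the literal run length occupies the low 23 bits and
 * the match length the high bits, which deflate_write_sequences() later
 * unpacks with '& 0x7FFFFF' and '>> 23'.
 */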

/******************************************************************************/

/*
 * Block splitting algorithm. The problem is to decide when it is worthwhile to
 * start a new block with new Huffman codes. There is a theoretically optimal
 * solution: recursively consider every possible block split, considering the
 * exact cost of each block, and choose the minimum cost approach. But this is
 * far too slow. Instead, as an approximation, we can count symbols and after
 * every N symbols, compare the expected distribution of symbols based on the
 * previous data with the actual distribution. If they differ "by enough", then
 * start a new block.
 *
 * As an optimization and heuristic, we don't distinguish between every symbol
 * but rather we combine many symbols into a single "observation type". For
 * literals we only look at the high bits and low bits, and for matches we only
 * look at whether the match is long or not. The assumption is that for typical
 * "real" data, places that are good block boundaries will tend to be noticeable
 * based only on changes in these aggregate frequencies, without looking for
 * subtle differences in individual symbols. For example, a change from ASCII
 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
 * to many matches (generally more compressible), would be easily noticed based
 * on the aggregates.
 *
 * For determining whether the frequency distributions are "different enough" to
 * start a new block, the simple heuristic of splitting when the sum of absolute
 * differences exceeds a constant seems to be good enough. We also add a number
 * proportional to the block length so that the algorithm is more likely to end
 * long blocks than short blocks. This reflects the general expectation that it
 * will become increasingly beneficial to start a new block as the current
 * block grows longer.
 *
 * Finally, for an approximation, it is not strictly necessary that the exact
 * symbols being used are considered. With "near-optimal parsing", for example,
 * the actual symbols that will be used are unknown until after the block
 * boundary is chosen and the block has been optimized. Since the final choices
 * cannot be used, we can use preliminary "greedy" choices instead.
 */

/* Initialize the block split statistics when starting a new block. */
static void
init_block_split_stats(struct block_split_stats *stats)
{
	int i;

	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->new_observations[i] = 0;
		stats->observations[i] = 0;
	}
	stats->num_new_observations = 0;
	stats->num_observations = 0;
}

/* Literal observation. Heuristic: use the top 2 bits and lowest bit of the
 * literal, for 8 possible literal observation types. */
static forceinline void
observe_literal(struct block_split_stats *stats, u8 lit)
{
	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
	stats->num_new_observations++;
}

/* Match observation. Heuristic: use one observation type for "short match" and
 * one observation type for "long match". */
static forceinline void
observe_match(struct block_split_stats *stats, unsigned length)
{
	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
	stats->num_new_observations++;
}

static void
merge_new_observations(struct block_split_stats *stats)
{
	int i;

	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->observations[i] += stats->new_observations[i];
		stats->new_observations[i] = 0;
	}
	stats->num_observations += stats->num_new_observations;
	stats->num_new_observations = 0;
}

static bool
do_end_block_check(struct block_split_stats *stats, u32 block_length)
{
	if (stats->num_observations > 0) {
		/* Note: to avoid slow divisions, we do not divide by
		 * 'num_observations', but rather do all math with the numbers
		 * multiplied by 'num_observations'. */
		u32 total_delta = 0;
		int i;

		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
			u32 expected = stats->observations[i] * stats->num_new_observations;
			u32 actual = stats->new_observations[i] * stats->num_observations;
			u32 delta = (actual > expected) ? actual - expected :
							  expected - actual;
			total_delta += delta;
		}

		/* Ready to end the block? */
		if (total_delta + (block_length / 4096) * stats->num_observations >=
		    NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
			return true;
	}
	merge_new_observations(stats);
	return false;
}
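
/*
 * Illustrative reading of the check above, assuming
 * NUM_OBSERVATIONS_PER_BLOCK_CHECK is 512 (its definition is elsewhere in
 * this file): the threshold reduces to 200 * num_observations, i.e. roughly
 * 200 of every 512 new observations (about 39%) must deviate from the scaled
 * expectation before the block ends, with the (block_length / 4096) term
 * gradually lowering that bar as the block grows.
 */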

static forceinline bool
should_end_block(struct block_split_stats *stats,
		 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
{
	/* Ready to check block split statistics? */
	if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
	    in_next - in_block_begin < MIN_BLOCK_LENGTH ||
	    in_end - in_next < MIN_BLOCK_LENGTH)
		return false;

	return do_end_block_check(stats, in_next - in_block_begin);
}

/******************************************************************************/

/*
 * Decrease the maximum and nice match lengths if we're approaching the end of
 * the input buffer.
 */
static forceinline void
adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining)
{
	if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) {
		*max_len = remaining;
		*nice_len = MIN(*nice_len, *max_len);
	}
}

/*
 * Choose the minimum match length for the greedy and lazy parsers.
 *
 * By default the minimum match length is 3, which is the smallest length the
 * DEFLATE format allows. However, with greedy and lazy parsing, some data
 * (e.g. DNA sequencing data) benefits greatly from a longer minimum length.
 * Typically, this is because literals are very cheap. In general, the
 * near-optimal parser handles this case naturally, but the greedy and lazy
 * parsers need a heuristic to decide when to use short matches.
 *
 * The heuristic we use is to make the minimum match length depend on the number
 * of different literals that exist in the data. If there are many different
 * literals, then literals will probably be expensive, so short matches will
 * probably be worthwhile. Conversely, if not many literals are used, then
 * probably literals will be cheap and short matches won't be worthwhile.
 */
static unsigned
choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth)
{
	/* map from num_used_literals to min_len */
	static const u8 min_lens[] = {
		9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6,
		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4,
		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
		/* the rest is implicitly 3 */
	};
	unsigned min_len;

	STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3);
	STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1);

	if (num_used_literals >= ARRAY_LEN(min_lens))
		return 3;
	min_len = min_lens[num_used_literals];
	/*
	 * With a low max_search_depth, it may be too hard to find long matches.
	 */
	if (max_search_depth < 16) {
		if (max_search_depth < 5)
			min_len = MIN(min_len, 4);
		else if (max_search_depth < 10)
			min_len = MIN(min_len, 5);
		else
			min_len = MIN(min_len, 7);
	}
	return min_len;
}
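
/*
 * Illustrative example: data that uses only 4 distinct byte values (e.g. the
 * bases of a DNA sequence) maps to min_lens[4] == 9, so matches shorter than
 * 9 bytes are passed over, whereas input using 80 or more distinct byte
 * values (typical text or binary data) falls past the end of the table and
 * keeps the DEFLATE minimum of 3.
 */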

static unsigned
calculate_min_match_len(const u8 *data, size_t data_len,
			unsigned max_search_depth)
{
	u8 used[256] = { 0 };
	unsigned num_used_literals = 0;
	int i;

	/*
	 * For an initial approximation, scan the first 4 KiB of data.
	 * recalculate_min_match_len() will update the min_len later.
	 */
	data_len = MIN(data_len, 4096);
	for (i = 0; i < data_len; i++)
		used[data[i]] = 1;
	for (i = 0; i < 256; i++)
		num_used_literals += used[i];
	return choose_min_match_len(num_used_literals, max_search_depth);
}

/*
 * Recalculate the minimum match length for a block, now that we know the
 * distribution of literals that are actually being used (freqs->litlen).
 */
static unsigned
recalculate_min_match_len(const struct deflate_freqs *freqs,
			  unsigned max_search_depth)
{
	u32 literal_freq = 0;
	u32 cutoff;
	unsigned num_used_literals = 0;
	int i;

	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
		literal_freq += freqs->litlen[i];

	cutoff = literal_freq >> 10; /* Ignore literals used very rarely */

	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
		if (freqs->litlen[i] > cutoff)
			num_used_literals++;
	}
	return choose_min_match_len(num_used_literals, max_search_depth);
}

/*
 * This is the level 0 "compressor". It always outputs uncompressed blocks.
 */
static size_t
deflate_compress_none(struct libdeflate_compressor * restrict c,
		      const u8 * restrict in, size_t in_nbytes,
		      u8 * restrict out, size_t out_nbytes_avail)
{
	struct deflate_output_bitstream os;

	deflate_init_output(&os, out, out_nbytes_avail);

	deflate_write_uncompressed_blocks(&os, in, in_nbytes, true);

	return deflate_flush_output(&os);
}

/*
 * This is a faster variant of deflate_compress_greedy(). It uses the
 * ht_matchfinder rather than the hc_matchfinder. It also skips the block
 * splitting algorithm and just uses fixed length blocks. c->max_search_depth
 * has no effect with this algorithm, as it is hardcoded in ht_matchfinder.h.
 */
static size_t
deflate_compress_fastest(struct libdeflate_compressor * restrict c,
			 const u8 * restrict in, size_t in_nbytes,
			 u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hash = 0;

	deflate_init_output(&os, out, out_nbytes_avail);
	ht_matchfinder_init(&c->p.f.ht_mf);

	do {
		/* Starting a new DEFLATE block. */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, FAST_BLOCK_LENGTH);
		struct deflate_sequence *seq = c->p.f.sequences;

		deflate_begin_sequences(c, seq);

		do {
			u32 length;
			u32 offset;
			size_t remaining = in_end - in_next;

			if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) {
				max_len = remaining;
				if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) {
					do {
						deflate_choose_literal(
							c, *in_next++, seq);
					} while (--max_len);
					break;
				}
				nice_len = MIN(nice_len, max_len);
			}
			length = ht_matchfinder_longest_match(&c->p.f.ht_mf,
							      &in_cur_base,
							      in_next,
							      max_len,
							      nice_len,
							      &next_hash,
							      &offset);
			if (length) {
				/* Match found. */
				deflate_choose_match(c, length, offset, &seq);
				ht_matchfinder_skip_bytes(&c->p.f.ht_mf,
							  &in_cur_base,
							  in_next + 1,
							  in_end,
							  length - 1,
							  &next_hash);
				in_next += length;
			} else {
				/* No match found. */
				deflate_choose_literal(c, *in_next++, seq);
			}

			/* Check if it's time to output another block. */
		} while (in_next < in_max_block_end);

		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    c->p.f.sequences, in_next == in_end);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}

/*
 * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
 */
static size_t
deflate_compress_greedy(struct libdeflate_compressor * restrict c,
			const u8 * restrict in, size_t in_nbytes,
			u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	hc_matchfinder_init(&c->p.g.hc_mf);

	do {
		/* Starting a new DEFLATE block. */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		unsigned min_len;
		struct deflate_sequence *seq = c->p.g.sequences;

		init_block_split_stats(&c->split_stats);
		deflate_begin_sequences(c, seq);

		min_len = calculate_min_match_len(in_next,
						  in_max_block_end - in_next,
						  c->max_search_depth);

		do {
			u32 length;
			u32 offset;

			adjust_max_and_nice_len(&max_len, &nice_len,
						in_end - in_next);
			length = hc_matchfinder_longest_match(
						&c->p.g.hc_mf,
						&in_cur_base,
						in_next,
						min_len - 1,
						max_len,
						nice_len,
						c->max_search_depth,
						next_hashes,
						&offset);

			if (length >= min_len &&
			    (length > DEFLATE_MIN_MATCH_LEN ||
			     offset <= 4096)) {
				/* Match found. */
				deflate_choose_match(c, length, offset, &seq);
				observe_match(&c->split_stats, length);
				hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
							  &in_cur_base,
							  in_next + 1,
							  in_end,
							  length - 1,
							  next_hashes);
				in_next += length;
			} else {
				/* No match found. */
				deflate_choose_literal(c, *in_next, seq);
				observe_literal(&c->split_stats, *in_next);
				in_next++;
			}

			/* Check if it's time to output another block. */
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats,
					   in_block_begin, in_next, in_end));

		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    c->p.g.sequences, in_next == in_end);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}

static forceinline size_t
deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
			      const u8 * restrict in, size_t in_nbytes,
			      u8 * restrict out, size_t out_nbytes_avail,
			      bool lazy2)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	hc_matchfinder_init(&c->p.g.hc_mf);

	do {
		/* Starting a new DEFLATE block. */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		const u8 *next_recalc_min_len =
			in_next + MIN(in_end - in_next, 10000);
		unsigned min_len = DEFLATE_MIN_MATCH_LEN;
		struct deflate_sequence *seq = c->p.g.sequences;

		init_block_split_stats(&c->split_stats);
		deflate_begin_sequences(c, seq);

		min_len = calculate_min_match_len(in_next,
						  in_max_block_end - in_next,
						  c->max_search_depth);
		do {
			unsigned cur_len;
			unsigned cur_offset;
			unsigned next_len;
			unsigned next_offset;

			/*
			 * Recalculate the minimum match length if it hasn't
			 * been done recently.
			 */
			if (in_next >= next_recalc_min_len) {
				min_len = recalculate_min_match_len(
						&c->freqs,
						c->max_search_depth);
				next_recalc_min_len +=
					MIN(in_end - next_recalc_min_len,
					    in_next - in_block_begin);
			}

			/* Find the longest match at the current position. */
			adjust_max_and_nice_len(&max_len, &nice_len,
						in_end - in_next);
			cur_len = hc_matchfinder_longest_match(
						&c->p.g.hc_mf,
						&in_cur_base,
						in_next,
						min_len - 1,
						max_len,
						nice_len,
						c->max_search_depth,
						next_hashes,
						&cur_offset);
			if (cur_len < min_len ||
			    (cur_len == DEFLATE_MIN_MATCH_LEN &&
			     cur_offset > 8192)) {
				/* No match found. Choose a literal. */
				deflate_choose_literal(c, *in_next, seq);
				observe_literal(&c->split_stats, *in_next);
				in_next++;
				continue;
			}
			in_next++;

have_cur_match:
			observe_match(&c->split_stats, cur_len);
			/*
			 * We have a match at the current position.
			 * If it's very long, choose it immediately.
			 */
			if (cur_len >= nice_len) {
				deflate_choose_match(c, cur_len, cur_offset,
						     &seq);
				hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
							  &in_cur_base,
							  in_next,
							  in_end,
							  cur_len - 1,
							  next_hashes);
				in_next += cur_len - 1;
				continue;
			}

			/*
			 * Try to find a better match at the next position.
			 *
			 * Note: since we already have a match at the *current*
			 * position, we use only half the 'max_search_depth'
			 * when checking the *next* position. This is a useful
			 * trade-off because it's more worthwhile to use a
			 * greater search depth on the initial match.
			 *
			 * Note: it's possible to structure the code such that
			 * there's only one call to longest_match(), which
			 * handles both the "find the initial match" and "try to
			 * find a better match" cases. However, it is faster to
			 * have two call sites, with longest_match() inlined at
			 * each.
			 */
			adjust_max_and_nice_len(&max_len, &nice_len,
						in_end - in_next);
			next_len = hc_matchfinder_longest_match(
						&c->p.g.hc_mf,
						&in_cur_base,
						in_next++,
						cur_len - 1,
						max_len,
						nice_len,
						c->max_search_depth >> 1,
						next_hashes,
						&next_offset);
			if (next_len >= cur_len &&
			    4 * (int)(next_len - cur_len) +
			    ((int)bsr32(cur_offset) -
			     (int)bsr32(next_offset)) > 2) {
				/*
				 * Found a better match at the next position.
				 * Output a literal. Then the next match
				 * becomes the current match.
				 */
				deflate_choose_literal(c, *(in_next - 2), seq);
				cur_len = next_len;
				cur_offset = next_offset;
				goto have_cur_match;
			}

			if (lazy2) {
				/* In lazy2 mode, look ahead another position */
				adjust_max_and_nice_len(&max_len, &nice_len,
							in_end - in_next);
				next_len = hc_matchfinder_longest_match(
						&c->p.g.hc_mf,
						&in_cur_base,
						in_next++,
						cur_len - 1,
						max_len,
						nice_len,
						c->max_search_depth >> 2,
						next_hashes,
						&next_offset);
				if (next_len >= cur_len &&
				    4 * (int)(next_len - cur_len) +
				    ((int)bsr32(cur_offset) -
				     (int)bsr32(next_offset)) > 6) {
					/*
					 * There's a much better match two
					 * positions ahead, so use two literals.
					 */
					deflate_choose_literal(
						c, *(in_next - 3), seq);
					deflate_choose_literal(
						c, *(in_next - 2), seq);
					cur_len = next_len;
					cur_offset = next_offset;
					goto have_cur_match;
				}
				/*
				 * No better match at either of the next 2
				 * positions. Output the current match.
				 */
				deflate_choose_match(c, cur_len, cur_offset,
						     &seq);
				if (cur_len > 3) {
					hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
								  &in_cur_base,
								  in_next,
								  in_end,
								  cur_len - 3,
								  next_hashes);
					in_next += cur_len - 3;
				}
			} else { /* !lazy2 */
				/*
				 * No better match at the next position. Output
				 * the current match.
				 */
				deflate_choose_match(c, cur_len, cur_offset,
						     &seq);
				hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
							  &in_cur_base,
							  in_next,
							  in_end,
							  cur_len - 2,
							  next_hashes);
				in_next += cur_len - 2;
			}
			/* Check if it's time to output another block. */
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats,
					   in_block_begin, in_next, in_end));

		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    c->p.g.sequences, in_next == in_end);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}
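
/*
 * Illustrative example of the lazy criterion above: with cur_len = 5 at
 * offset 1000 (bsr32 = 9) and next_len = 6 at offset 64 (bsr32 = 6), the
 * score is 4 * (6 - 5) + (9 - 6) = 7 > 2, so the parser emits one literal
 * and takes the longer, closer match at the next position instead.
 */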

/*
 * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to
 * see if there's a better match at the next position. If yes, it outputs a
 * literal and continues to the next position. If no, it outputs the match.
 */
static size_t
deflate_compress_lazy(struct libdeflate_compressor * restrict c,
		      const u8 * restrict in, size_t in_nbytes,
		      u8 * restrict out, size_t out_nbytes_avail)
{
	return deflate_compress_lazy_generic(c, in, in_nbytes, out,
					     out_nbytes_avail, false);
}

/*
 * The lazy2 compressor. This is similar to the regular lazy one, but it looks
 * for a better match at the next 2 positions rather than the next 1. This
 * makes it take slightly more time, but compresses some inputs slightly more.
 */
static size_t
deflate_compress_lazy2(struct libdeflate_compressor * restrict c,
		       const u8 * restrict in, size_t in_nbytes,
		       u8 * restrict out, size_t out_nbytes_avail)
{
	return deflate_compress_lazy_generic(c, in, in_nbytes, out,
					     out_nbytes_avail, true);
}

#if SUPPORT_NEAR_OPTIMAL_PARSING

/*
 * Follow the minimum-cost path in the graph of possible match/literal choices
 * for the current block and compute the frequencies of the Huffman symbols that
 * would be needed to output those matches and literals.
 */
static void
deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
{
	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
	do {
		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;

		if (length == 1) {
			/* Literal */
			c->freqs.litlen[offset]++;
		} else {
			/* Match */
			c->freqs.litlen[DEFLATE_FIRST_LEN_SYM +
					deflate_length_slot[length]]++;
			c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
		}
		cur_node += length;
	} while (cur_node != end_node);

	/* Tally the end-of-block symbol. */
	c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
}

/* Set the current cost model from the codeword lengths specified in @lens. */
static void
deflate_set_costs_from_codes(struct libdeflate_compressor *c,
			     const struct deflate_lens *lens)
{
	unsigned i;

	/* Literals */
	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
		u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
		c->p.n.costs.literal[i] = bits * BIT_COST;
	}

	/* Lengths */
	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
		unsigned length_slot = deflate_length_slot[i];
		unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot;
		u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
		bits += deflate_extra_length_bits[length_slot];
		c->p.n.costs.length[i] = bits * BIT_COST;
	}

	/* Offset slots */
	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
		u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
		bits += deflate_extra_offset_bits[i];
		c->p.n.costs.offset_slot[i] = bits * BIT_COST;
	}
}

/*
 * This lookup table gives the default cost of a literal symbol and of a length
 * symbol, depending on the characteristics of the input data. It was generated
 * by scripts/gen_default_litlen_costs.py.
 *
 * This table is indexed first by the estimated match probability:
 *
 *	i=0: data doesn't contain many matches	[match_prob=0.25]
 *	i=1: neutral				[match_prob=0.50]
 *	i=2: data contains lots of matches	[match_prob=0.75]
 *
 * This lookup produces a subtable which maps the number of distinct used
 * literals to the default cost of a literal symbol, i.e.:
 *
 *	int(-log2((1 - match_prob) / num_used_literals) * BIT_COST)
 *
 * ... for num_used_literals in [1, 256] (and 0, which is copied from 1). This
 * accounts for literals usually getting cheaper as the number of distinct
 * literals decreases, and as the proportion of literals to matches increases.
 *
 * The lookup also produces the cost of a length symbol, which is:
 *
 *	int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST)
 *
 * Note: we don't currently assign different costs to different literal symbols,
 * or to different length symbols, as this is hard to do.
 */
static const struct {
	u8 used_lits_to_lit_cost[257];
	u8 len_sym_cost;
} default_litlen_costs[] = {
	{ /* match_prob = 0.25 */
		.used_lits_to_lit_cost = {
			6, 6, 22, 32, 38, 43, 48, 51,
			54, 57, 59, 61, 64, 65, 67, 69,
			70, 72, 73, 74, 75, 76, 77, 79,
			80, 80, 81, 82, 83, 84, 85, 85,
			86, 87, 88, 88, 89, 89, 90, 91,
			91, 92, 92, 93, 93, 94, 95, 95,
			96, 96, 96, 97, 97, 98, 98, 99,
			99, 99, 100, 100, 101, 101, 101, 102,
			102, 102, 103, 103, 104, 104, 104, 105,
			105, 105, 105, 106, 106, 106, 107, 107,
			107, 108, 108, 108, 108, 109, 109, 109,
			109, 110, 110, 110, 111, 111, 111, 111,
			112, 112, 112, 112, 112, 113, 113, 113,
			113, 114, 114, 114, 114, 114, 115, 115,
			115, 115, 115, 116, 116, 116, 116, 116,
			117, 117, 117, 117, 117, 118, 118, 118,
			118, 118, 118, 119, 119, 119, 119, 119,
			120, 120, 120, 120, 120, 120, 121, 121,
			121, 121, 121, 121, 121, 122, 122, 122,
			122, 122, 122, 123, 123, 123, 123, 123,
			123, 123, 124, 124, 124, 124, 124, 124,
			124, 125, 125, 125, 125, 125, 125, 125,
			125, 126, 126, 126, 126, 126, 126, 126,
			127, 127, 127, 127, 127, 127, 127, 127,
			128, 128, 128, 128, 128, 128, 128, 128,
			128, 129, 129, 129, 129, 129, 129, 129,
			129, 129, 130, 130, 130, 130, 130, 130,
			130, 130, 130, 131, 131, 131, 131, 131,
			131, 131, 131, 131, 131, 132, 132, 132,
			132, 132, 132, 132, 132, 132, 132, 133,
			133, 133, 133, 133, 133, 133, 133, 133,
			133, 134, 134, 134, 134, 134, 134, 134,
			134,
		},
		.len_sym_cost = 109,
	}, { /* match_prob = 0.5 */
		.used_lits_to_lit_cost = {
			16, 16, 32, 41, 48, 53, 57, 60,
			64, 66, 69, 71, 73, 75, 76, 78,
			80, 81, 82, 83, 85, 86, 87, 88,
			89, 90, 91, 92, 92, 93, 94, 95,
			96, 96, 97, 98, 98, 99, 99, 100,
			101, 101, 102, 102, 103, 103, 104, 104,
			105, 105, 106, 106, 107, 107, 108, 108,
			108, 109, 109, 110, 110, 110, 111, 111,
			112, 112, 112, 113, 113, 113, 114, 114,
			114, 115, 115, 115, 115, 116, 116, 116,
			117, 117, 117, 118, 118, 118, 118, 119,
			119, 119, 119, 120, 120, 120, 120, 121,
			121, 121, 121, 122, 122, 122, 122, 122,
			123, 123, 123, 123, 124, 124, 124, 124,
			124, 125, 125, 125, 125, 125, 126, 126,
			126, 126, 126, 127, 127, 127, 127, 127,
			128, 128, 128, 128, 128, 128, 129, 129,
			129, 129, 129, 129, 130, 130, 130, 130,
			130, 130, 131, 131, 131, 131, 131, 131,
			131, 132, 132, 132, 132, 132, 132, 133,
			133, 133, 133, 133, 133, 133, 134, 134,
			134, 134, 134, 134, 134, 134, 135, 135,
			135, 135, 135, 135, 135, 135, 136, 136,
			136, 136, 136, 136, 136, 136, 137, 137,
			137, 137, 137, 137, 137, 137, 138, 138,
			138, 138, 138, 138, 138, 138, 138, 139,
			139, 139, 139, 139, 139, 139, 139, 139,
			140, 140, 140, 140, 140, 140, 140, 140,
			140, 141, 141, 141, 141, 141, 141, 141,
			141, 141, 141, 142, 142, 142, 142, 142,
			142, 142, 142, 142, 142, 142, 143, 143,
			143, 143, 143, 143, 143, 143, 143, 143,
			144,
		},
		.len_sym_cost = 93,
	}, { /* match_prob = 0.75 */
		.used_lits_to_lit_cost = {
			32, 32, 48, 57, 64, 69, 73, 76,
			80, 82, 85, 87, 89, 91, 92, 94,
			96, 97, 98, 99, 101, 102, 103, 104,
			105, 106, 107, 108, 108, 109, 110, 111,
			112, 112, 113, 114, 114, 115, 115, 116,
			117, 117, 118, 118, 119, 119, 120, 120,
			121, 121, 122, 122, 123, 123, 124, 124,
			124, 125, 125, 126, 126, 126, 127, 127,
			128, 128, 128, 129, 129, 129, 130, 130,
			130, 131, 131, 131, 131, 132, 132, 132,
			133, 133, 133, 134, 134, 134, 134, 135,
			135, 135, 135, 136, 136, 136, 136, 137,
			137, 137, 137, 138, 138, 138, 138, 138,
			139, 139, 139, 139, 140, 140, 140, 140,
			140, 141, 141, 141, 141, 141, 142, 142,
			142, 142, 142, 143, 143, 143, 143, 143,
			144, 144, 144, 144, 144, 144, 145, 145,
			145, 145, 145, 145, 146, 146, 146, 146,
			146, 146, 147, 147, 147, 147, 147, 147,
			147, 148, 148, 148, 148, 148, 148, 149,
			149, 149, 149, 149, 149, 149, 150, 150,
			150, 150, 150, 150, 150, 150, 151, 151,
			151, 151, 151, 151, 151, 151, 152, 152,
			152, 152, 152, 152, 152, 152, 153, 153,
			153, 153, 153, 153, 153, 153, 154, 154,
			154, 154, 154, 154, 154, 154, 154, 155,
			155, 155, 155, 155, 155, 155, 155, 155,
			156, 156, 156, 156, 156, 156, 156, 156,
			156, 157, 157, 157, 157, 157, 157, 157,
			157, 157, 157, 158, 158, 158, 158, 158,
			158, 158, 158, 158, 158, 158, 159, 159,
			159, 159, 159, 159, 159, 159, 159, 159,
			160,
		},
		.len_sym_cost = 84,
	},
};
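
/*
 * Worked example of the formula above, assuming BIT_COST == 16 (as defined
 * earlier in this file): for match_prob = 0.5 and num_used_literals = 64,
 * int(-log2(0.5 / 64) * 16) = int(7 * 16) = 112, which matches
 * default_litlen_costs[1].used_lits_to_lit_cost[64] above.
 */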

/*
 * Choose the default costs for literal and length symbols. These symbols are
 * both part of the litlen alphabet.
 */
static void
deflate_choose_default_litlen_costs(struct libdeflate_compressor *c,
				    u32 block_length,
				    u32 *lit_cost, u32 *len_sym_cost)
{
	unsigned num_used_literals = 0;
	u32 literal_freq = block_length;
	u32 match_freq = 0;
	u32 cutoff;
	int i;

	/* Calculate the number of distinct literals that exist in the data. */
	cutoff = literal_freq >> 11; /* Ignore literals used very rarely */
	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
		if (c->freqs.litlen[i] > cutoff)
			num_used_literals++;
	}
	if (num_used_literals == 0)
		num_used_literals = 1;

	/*
	 * Estimate the relative frequency of literals and matches in the
	 * optimal parsing solution. We don't know the optimal solution, so
	 * this can only be a very rough estimate. Therefore, we basically use
	 * the match frequency from a greedy parse. We also apply the min_len
	 * heuristic used by the greedy and lazy parsers, to avoid counting too
	 * many matches when literals are cheaper than short matches.
	 */
	match_freq = 0;
	i = choose_min_match_len(num_used_literals, c->max_search_depth);
	for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
		match_freq += c->p.n.match_len_freqs[i];
		literal_freq -= i * c->p.n.match_len_freqs[i];
	}
	if ((s32)literal_freq < 0) /* shouldn't happen */
		literal_freq = 0;

	if (match_freq > literal_freq)
		i = 2; /* many matches */
	else if (match_freq * 4 > literal_freq)
		i = 1; /* neutral */
	else
		i = 0; /* few matches */

	*lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[
							num_used_literals];
	*len_sym_cost = default_litlen_costs[i].len_sym_cost;
}

static forceinline u32
deflate_default_length_cost(unsigned len, u32 len_sym_cost)
{
	unsigned slot = deflate_length_slot[len];
	u32 num_extra_bits = deflate_extra_length_bits[slot];

	return len_sym_cost + (num_extra_bits * BIT_COST);
}

static forceinline u32
deflate_default_offset_slot_cost(unsigned slot)
{
	u32 num_extra_bits = deflate_extra_offset_bits[slot];
	/*
	 * Assume that all offset symbols are equally probable.
	 * The resulting cost is 'int(-log2(1/30) * BIT_COST)',
	 * where 30 is the number of potentially-used offset symbols.
	 */
	u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000;

	return offset_sym_cost + (num_extra_bits * BIT_COST);
}
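
/*
 * Sanity check of the constant above: log2(30) = 4.9069..., so the fixed-point
 * offset symbol cost is approximately 4.907 * BIT_COST, which the expression
 * '4*BIT_COST + (907*BIT_COST)/1000' computes without floating point.
 */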

/* Set default symbol costs for the first block's first optimization pass. */
static void
deflate_set_default_costs(struct libdeflate_compressor *c,
			  u32 lit_cost, u32 len_sym_cost)
{
	unsigned i;

	/* Literals */
	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
		c->p.n.costs.literal[i] = lit_cost;

	/* Lengths */
	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
		c->p.n.costs.length[i] =
			deflate_default_length_cost(i, len_sym_cost);

	/* Offset slots */
	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
		c->p.n.costs.offset_slot[i] =
			deflate_default_offset_slot_cost(i);
}

static forceinline void
deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount)
{
	if (change_amount == 0)
		/* Block is very similar to previous; prefer previous costs. */
		*cost_p = (default_cost + 3 * *cost_p) / 4;
	else if (change_amount == 1)
		*cost_p = (default_cost + *cost_p) / 2;
	else if (change_amount == 2)
		*cost_p = (5 * default_cost + 3 * *cost_p) / 8;
	else
		/* Block differs greatly from previous; prefer default costs. */
		*cost_p = (3 * default_cost + *cost_p) / 4;
}
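
/*
 * Illustrative example of the blending above: with a previous cost of 80 and
 * a default cost of 120, change_amount 0 through 3 yields (120 + 240)/4 = 90,
 * (120 + 80)/2 = 100, (600 + 240)/8 = 105, and (360 + 80)/4 = 110, moving
 * progressively closer to the default as the blocks differ more.
 */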

static forceinline void
deflate_adjust_costs_impl(struct libdeflate_compressor *c,
			  u32 lit_cost, u32 len_sym_cost, int change_amount)
{
	unsigned i;

	/* Literals */
	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
		deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost,
				    change_amount);

	/* Lengths */
	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
		deflate_adjust_cost(&c->p.n.costs.length[i],
				    deflate_default_length_cost(i,
								len_sym_cost),
				    change_amount);

	/* Offset slots */
	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
		deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
				    deflate_default_offset_slot_cost(i),
				    change_amount);
}

/*
 * Adjust the costs when beginning a new block.
 *
 * Since the current costs have been optimized for the data, it's undesirable to
 * throw them away and start over with the default costs. At the same time, we
 * don't want to bias the parse by assuming that the next block will be similar
 * to the current block. As a compromise, make the costs closer to the
 * defaults, but don't simply set them to the defaults.
 */
static void
deflate_adjust_costs(struct libdeflate_compressor *c,
		     u32 lit_cost, u32 len_sym_cost)
{
	u64 total_delta = 0;
	u64 cutoff;
	int i;

	/*
	 * Decide how different the current block is from the previous block,
	 * using the block splitting statistics from the current and previous
	 * blocks. The more different the current block is, the more we prefer
	 * the default costs rather than the previous block's costs.
	 *
	 * The algorithm here is similar to the end-of-block check one, but here
	 * we compare two entire blocks rather than a partial block with a small
	 * extra part, and therefore we need 64-bit numbers in some places.
	 */
	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		u64 prev = (u64)c->p.n.prev_observations[i] *
			   c->split_stats.num_observations;
		u64 cur = (u64)c->split_stats.observations[i] *
			  c->p.n.prev_num_observations;

		total_delta += prev > cur ? prev - cur : cur - prev;
	}
	cutoff = ((u64)c->p.n.prev_num_observations *
		  c->split_stats.num_observations * 200) / 512;

	if (4 * total_delta > 9 * cutoff)
		deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3);
	else if (2 * total_delta > 3 * cutoff)
		deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2);
	else if (2 * total_delta > cutoff)
		deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1);
	else
		deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0);
}

/*
 * Find the minimum-cost path through the graph of possible match/literal
 * choices for this block.
 *
 * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
 * represents the node at the beginning of the block, to
 * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
 * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'.
 *
 * The algorithm works backwards, starting at the end node and proceeding
 * backwards one node at a time. At each node, the minimum cost to reach the
 * end node is computed and the match/literal choice that begins that path is
 * saved.
 */
static void
deflate_find_min_cost_path(struct libdeflate_compressor *c,
			   const u32 block_length,
			   const struct lz_match *cache_ptr)
{
	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
	struct deflate_optimum_node *cur_node = end_node;

	cur_node->cost_to_end = 0;
	do {
		unsigned num_matches;
		unsigned literal;
		u32 best_cost_to_end;

		cur_node--;
		cache_ptr--;

		num_matches = cache_ptr->length;
		literal = cache_ptr->offset;

		/* It's always possible to choose a literal. */
		best_cost_to_end = c->p.n.costs.literal[literal] +
				   (cur_node + 1)->cost_to_end;
		cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;

		/* Also consider matches if there are any. */
		if (num_matches) {
			const struct lz_match *match;
			unsigned len;
			unsigned offset;
			unsigned offset_slot;
			u32 offset_cost;
			u32 cost_to_end;

			/*
			 * Consider each length from the minimum
			 * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
			 * match found at this position. For each length, we
			 * consider only the smallest offset for which that
			 * length is available. Although this is not guaranteed
			 * to be optimal due to the possibility of a larger
			 * offset costing less than a smaller offset to code,
			 * this is a very useful heuristic.
			 */
			match = cache_ptr - num_matches;
			len = DEFLATE_MIN_MATCH_LEN;
			do {
				offset = match->offset;
				offset_slot = deflate_get_offset_slot(c, offset);
				offset_cost = c->p.n.costs.offset_slot[offset_slot];
				do {
					cost_to_end = offset_cost +
						      c->p.n.costs.length[len] +
						      (cur_node + len)->cost_to_end;
					if (cost_to_end < best_cost_to_end) {
						best_cost_to_end = cost_to_end;
						cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
					}
				} while (++len <= match->length);
			} while (++match != cache_ptr);
			cache_ptr -= num_matches;
		}
		cur_node->cost_to_end = best_cost_to_end;
	} while (cur_node != &c->p.n.optimum_nodes[0]);
}

/*
 * Choose the literal/match sequence to use for the current block. The basic
 * algorithm finds a minimum-cost path through the block's graph of
 * literal/match choices, given a cost model. However, the cost of each symbol
 * is unknown until the Huffman codes have been built, but at the same time the
 * Huffman codes depend on the frequencies of chosen symbols. Consequently,
 * multiple passes must be used to try to approximate an optimal solution. The
 * first pass uses default costs, mixed with the costs from the previous block
 * if any. Later passes use the Huffman codeword lengths from the previous pass
 * as the costs.
 */
static void
deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
		       const struct lz_match *cache_ptr, bool is_first_block,
		       bool is_final_block)
{
	unsigned num_passes_remaining = c->p.n.num_optim_passes;
	u32 lit_cost, len_sym_cost;
	u32 i;

	/* Force the block to really end at the desired length, even if some
	 * matches extend beyond it. */
	for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
					ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;

	/* Make sure the literal/match statistics are up to date. */
	merge_new_observations(&c->split_stats);

	/* Set the initial costs. */
	deflate_choose_default_litlen_costs(c, block_length,
					    &lit_cost, &len_sym_cost);
	if (is_first_block)
		deflate_set_default_costs(c, lit_cost, len_sym_cost);
	else
		deflate_adjust_costs(c, lit_cost, len_sym_cost);

	do {
		/* Find the minimum cost path for this pass. */
		deflate_find_min_cost_path(c, block_length, cache_ptr);

		/* Compute frequencies of the chosen symbols. */
		deflate_reset_symbol_frequencies(c);
		deflate_tally_item_list(c, block_length);

		/* Make the Huffman codes. */
		deflate_make_huffman_codes(&c->freqs, &c->codes);

		/*
		 * Update the costs. After the last optimization pass, the
		 * final costs won't be needed for this block, but they will be
		 * used in determining the initial costs for the next block.
		 */
		if (--num_passes_remaining || !is_final_block)
			deflate_set_costs_from_codes(c, &c->codes.lens);
	} while (num_passes_remaining);
}

static void
deflate_near_optimal_begin_block(struct libdeflate_compressor *c,
				 bool is_first_block)
{
	int i;

	if (!is_first_block) {
		/*
		 * Save some literal/match statistics from the previous block so
		 * that deflate_adjust_costs() will be able to decide how much
		 * the current block differs from the previous one.
		 */
		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
			c->p.n.prev_observations[i] =
				c->split_stats.observations[i];
		}
		c->p.n.prev_num_observations = c->split_stats.num_observations;
	}
	init_block_split_stats(&c->split_stats);

	/*
	 * During matchfinding, we keep track of approximate literal and match
	 * length frequencies for the purpose of setting the initial costs.
	 */
	memset(c->freqs.litlen, 0,
	       DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0]));
	memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
}
|
|
|
|
/*
|
|
* This is the "near-optimal" DEFLATE compressor. It computes the optimal
|
|
* representation of each DEFLATE block using a minimum-cost path search over
|
|
* the graph of possible match/literal choices for that block, assuming a
|
|
* certain cost for each Huffman symbol.
|
|
*
|
|
* For several reasons, the end result is not guaranteed to be optimal:
|
|
*
|
|
* - Nonoptimal choice of blocks
|
|
* - Heuristic limitations on which matches are actually considered
|
|
* - Symbol costs are unknown until the symbols have already been chosen
|
|
* (so iterative optimization must be used)
|
|
*/
|
|
static size_t
deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
			      const u8 * restrict in, size_t in_nbytes,
			      u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	const u8 *in_next_slide =
		in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	bt_matchfinder_init(&c->p.n.bt_mf);

	do {
		/* Starting a new DEFLATE block. */

		struct lz_match *cache_ptr = c->p.n.match_cache;
		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		const u8 *next_observation = in_next;

		deflate_near_optimal_begin_block(c, in_block_begin == in);

		/*
		 * Find matches until we decide to end the block. We end the
		 * block if any of the following is true:
		 *
		 * (1) Maximum block length has been reached.
		 * (2) Match cache may overflow.
		 * (3) Block split heuristic says to split now.
		 */
		do {
			struct lz_match *matches;
			unsigned best_len;
			size_t remaining = in_end - in_next;

			/* Slide the window forward if needed. */
			if (in_next == in_next_slide) {
				bt_matchfinder_slide_window(&c->p.n.bt_mf);
				in_cur_base = in_next;
				in_next_slide = in_next +
					MIN(remaining, MATCHFINDER_WINDOW_SIZE);
			}

			/*
			 * Find matches with the current position using the
			 * binary tree matchfinder and save them in
			 * 'match_cache'.
			 *
			 * Note: the binary tree matchfinder is more suited for
			 * optimal parsing than the hash chain matchfinder. The
			 * reasons for this include:
			 *
			 * - The binary tree matchfinder can find more matches
			 *   in the same number of steps.
			 * - One of the major advantages of hash chains is that
			 *   skipping positions (not searching for matches at
			 *   them) is faster; however, with optimal parsing we
			 *   search for matches at almost all positions, so this
			 *   advantage of hash chains is negated.
			 */
			matches = cache_ptr;
			best_len = 0;
			adjust_max_and_nice_len(&max_len, &nice_len, remaining);
			if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
				cache_ptr = bt_matchfinder_get_matches(
						&c->p.n.bt_mf,
						in_cur_base,
						in_next - in_cur_base,
						max_len,
						nice_len,
						c->max_search_depth,
						next_hashes,
						&best_len,
						matches);
			}
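			/*
			 * Record statistics used for setting the initial
			 * symbol costs and for the block splitting heuristic.
			 * 'next_observation' skips positions already covered
			 * by an observed match, so each input byte contributes
			 * at most one observation.
			 */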
			c->freqs.litlen[*in_next]++;
			if (in_next >= next_observation) {
				if (best_len >= 4) {
					observe_match(&c->split_stats,
						      best_len);
					next_observation = in_next + best_len;
					c->p.n.match_len_freqs[best_len]++;
				} else {
					observe_literal(&c->split_stats,
							*in_next);
					next_observation = in_next + 1;
				}
			}

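			/*
			 * Terminate this position's cache entries with a
			 * sentinel whose 'length' holds the number of matches
			 * just cached and whose 'offset' holds the literal
			 * byte itself; the item-choosing pass reads the cache
			 * in this format instead of re-running the
			 * matchfinder.
			 */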
			cache_ptr->length = cache_ptr - matches;
			cache_ptr->offset = *in_next;
			in_next++;
			cache_ptr++;

			/*
			 * If there was a very long match found, don't cache any
			 * matches for the bytes covered by that match. This
			 * avoids degenerate behavior when compressing highly
			 * redundant data, where the number of matches can be
			 * very large.
			 *
			 * This heuristic doesn't actually hurt the compression
			 * ratio very much. If there's a long match, then the
			 * data must be highly compressible, so it doesn't
			 * matter much what we do.
			 */
			if (best_len >= DEFLATE_MIN_MATCH_LEN &&
			    best_len >= nice_len) {
				--best_len;
				do {
					remaining = in_end - in_next;
					if (in_next == in_next_slide) {
						bt_matchfinder_slide_window(
							&c->p.n.bt_mf);
						in_cur_base = in_next;
						in_next_slide = in_next +
							MIN(remaining,
							    MATCHFINDER_WINDOW_SIZE);
					}
					adjust_max_and_nice_len(&max_len,
								&nice_len,
								remaining);
					if (max_len >=
					    BT_MATCHFINDER_REQUIRED_NBYTES) {
						bt_matchfinder_skip_byte(
							&c->p.n.bt_mf,
							in_cur_base,
							in_next - in_cur_base,
							nice_len,
							c->max_search_depth,
							next_hashes);
					}
					cache_ptr->length = 0;
					cache_ptr->offset = *in_next;
					c->freqs.litlen[*in_next]++;
					in_next++;
					cache_ptr++;
				} while (--best_len);
			}
		} while (in_next < in_max_block_end &&
			 cache_ptr < &c->p.n.match_cache[MATCH_CACHE_LENGTH] &&
			 !should_end_block(&c->split_stats,
					   in_block_begin, in_next, in_end));

		/*
		 * All the matches for this block have been cached. Now choose
		 * the sequence of items to output and flush the block.
		 */
		deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
				       in_block_begin == in, in_next == in_end);
		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    NULL, in_next == in_end);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/* Initialize c->offset_slot_fast. */
static void
deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
{
	unsigned offset_slot;
	unsigned offset;
	unsigned offset_end;

	for (offset_slot = 0;
	     offset_slot < ARRAY_LEN(deflate_offset_slot_base);
	     offset_slot++)
	{
		offset = deflate_offset_slot_base[offset_slot];
#if USE_FULL_OFFSET_SLOT_FAST
		offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
		do {
			c->offset_slot_fast[offset] = offset_slot;
		} while (++offset != offset_end);
#else
		if (offset <= 256) {
			offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
			do {
				c->offset_slot_fast[offset - 1] = offset_slot;
			} while (++offset != offset_end);
		} else {
			offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
			do {
				c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
			} while ((offset += (1 << 7)) != offset_end);
		}
#endif
	}
}

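/*
 * For reference, the condensed map (USE_FULL_OFFSET_SLOT_FAST defined to 0)
 * is meant to be indexed as follows (a sketch consistent with the
 * initialization above, not necessarily the exact lookup helper used
 * elsewhere in this file):
 *
 *	if (offset <= 256)
 *		offset_slot = c->offset_slot_fast[offset - 1];
 *	else
 *		offset_slot = c->offset_slot_fast[256 + ((offset - 1) >> 7)];
 *
 * Grouping the larger offsets 128 at a time is valid because every DEFLATE
 * offset slot above offset 256 has at least 7 extra offset bits, so each such
 * slot covers a range of (offset - 1) values aligned to a multiple of 128.
 */
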
LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
libdeflate_alloc_compressor(int compression_level)
{
	struct libdeflate_compressor *c;
	size_t size = offsetof(struct libdeflate_compressor, p);

	if (compression_level < 0 || compression_level > 12)
		return NULL;

#if SUPPORT_NEAR_OPTIMAL_PARSING
	if (compression_level >= 10)
		size += sizeof(c->p.n);
	else
#endif
	{
		if (compression_level >= 2)
			size += sizeof(c->p.g);
		else if (compression_level == 1)
			size += sizeof(c->p.f);
	}

	c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size);
	if (!c)
		return NULL;

	c->compression_level = compression_level;

	/*
	 * The higher the compression level, the more we should bother trying
	 * to compress very small inputs.
	 */
	c->min_size_to_compress = 56 - (compression_level * 4);
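
	/*
	 * For example, following directly from the formula above: level 0
	 * gives a 56-byte threshold, level 6 gives 32 bytes, and level 12
	 * gives 8 bytes, so higher levels attempt compression on smaller
	 * inputs.
	 */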
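
	/*
	 * Level-to-algorithm summary (matching the cases below): level 0
	 * stores the data uncompressed; level 1 uses the fastest algorithm;
	 * levels 2-4 use greedy parsing; levels 5-7 lazy parsing; levels 8-9
	 * lazy2; and levels 10-12 (when SUPPORT_NEAR_OPTIMAL_PARSING is
	 * enabled) use the near-optimal parser with progressively deeper
	 * searches and more optimization passes.
	 */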
	switch (compression_level) {
	case 0:
		c->impl = deflate_compress_none;
		break;
	case 1:
		c->impl = deflate_compress_fastest;
		/* max_search_depth is unused */
		c->nice_match_length = 32;
		break;
	case 2:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 6;
		c->nice_match_length = 10;
		break;
	case 3:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 12;
		c->nice_match_length = 14;
		break;
	case 4:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 16;
		c->nice_match_length = 30;
		break;
	case 5:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 16;
		c->nice_match_length = 30;
		break;
	case 6:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 35;
		c->nice_match_length = 65;
		break;
	case 7:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 100;
		c->nice_match_length = 130;
		break;
	case 8:
		c->impl = deflate_compress_lazy2;
		c->max_search_depth = 300;
		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
		break;
	case 9:
#if !SUPPORT_NEAR_OPTIMAL_PARSING
	default:
#endif
		c->impl = deflate_compress_lazy2;
		c->max_search_depth = 600;
		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
		break;
#if SUPPORT_NEAR_OPTIMAL_PARSING
	case 10:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 35;
		c->nice_match_length = 75;
		c->p.n.num_optim_passes = 2;
		break;
	case 11:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 70;
		c->nice_match_length = 150;
		c->p.n.num_optim_passes = 3;
		break;
	case 12:
	default:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 150;
		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
		c->p.n.num_optim_passes = 4;
		break;
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
	}

	deflate_init_offset_slot_fast(c);
	deflate_init_static_codes(c);

	return c;
}

LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_deflate_compress(struct libdeflate_compressor *c,
			    const void *in, size_t in_nbytes,
			    void *out, size_t out_nbytes_avail)
{
	if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING))
		return 0;

	/* For extremely small inputs just use a single uncompressed block. */
	if (unlikely(in_nbytes < c->min_size_to_compress)) {
		struct deflate_output_bitstream os;
		deflate_init_output(&os, out, out_nbytes_avail);
		if (in_nbytes == 0)
			in = &os; /* Avoid passing NULL to memcpy() */
		deflate_write_uncompressed_block(&os, in, in_nbytes, true);
		return deflate_flush_output(&os);
	}

	return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
}

LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_free_compressor(struct libdeflate_compressor *c)
{
	libdeflate_aligned_free(c);
}

unsigned int
deflate_get_compression_level(struct libdeflate_compressor *c)
{
	return c->compression_level;
}

LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
				  size_t in_nbytes)
{
	/*
	 * The worst case is all uncompressed blocks where one block has length
	 * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
	 * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
	 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
	 */
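	/*
	 * Worked example, following directly from the formula below: if
	 * in_nbytes == 4 * MIN_BLOCK_LENGTH + 1, then max_num_blocks == 5,
	 * so the bound is 5*5 + in_nbytes + 1 + OUTPUT_END_PADDING bytes.
	 */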
	size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
	return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
}
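
/*
 * Example of the intended call sequence for this API (a sketch; allocation
 * failure handling is omitted, and malloc()/free() of the output buffer are
 * the caller's responsibility):
 *
 *	struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
 *	size_t bound = libdeflate_deflate_compress_bound(c, in_nbytes);
 *	void *out = malloc(bound);
 *	size_t out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes,
 *							out, bound);
 *
 * A return value of 0 would mean the output buffer was too small, which
 * cannot happen when 'bound' bytes are provided. Finish with
 * libdeflate_free_compressor(c).
 */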