diff --git a/lib/deflate_compress.c b/lib/deflate_compress.c index 0feee06..621b1cb 100644 --- a/lib/deflate_compress.c +++ b/lib/deflate_compress.c @@ -212,39 +212,39 @@ check_buildtime_parameters(void) /******************************************************************************/ -/* Table: length slot => length slot base value */ +/* Table: length slot => length slot base value */ static const unsigned deflate_length_slot_base[] = { - 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , - 11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 , - 35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 , - 131 , 163 , 195 , 227 , 258 , + 3, 4, 5, 6, 7, 8, 9, 10, + 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, + 131, 163, 195, 227, 258, }; -/* Table: length slot => number of extra length bits */ +/* Table: length slot => number of extra length bits */ static const u8 deflate_extra_length_bits[] = { - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , - 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , - 5 , 5 , 5 , 5 , 0 , + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, + 5, 5, 5, 5, 0, }; -/* Table: offset slot => offset slot base value */ +/* Table: offset slot => offset slot base value */ static const unsigned deflate_offset_slot_base[] = { - 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 , - 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 , - 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 , - 4097 , 6145 , 8193 , 12289 , 16385 , 24577 , + 1, 2, 3, 4, 5, 7, 9, 13, + 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, + 4097, 6145, 8193, 12289, 16385, 24577, }; -/* Table: offset slot => number of extra offset bits */ +/* Table: offset slot => number of extra offset bits */ static const u8 deflate_extra_offset_bits[] = { - 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 , - 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , - 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 , - 11 , 11 , 12 , 12 , 13 , 13 , + 0, 0, 0, 0, 1, 1, 2, 2, + 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, + 11, 11, 12, 12, 13, 13, }; -/* Table: length => length slot */ +/* Table: length => length slot */ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, @@ -311,26 +311,28 @@ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 }; -/* Codewords for the DEFLATE Huffman codes. */ +/* Codewords for the DEFLATE Huffman codes */ struct deflate_codewords { u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; u32 offset[DEFLATE_NUM_OFFSET_SYMS]; }; -/* Codeword lengths (in bits) for the DEFLATE Huffman codes. - * A zero length means the corresponding symbol had zero frequency. */ +/* + * Codeword lengths (in bits) for the DEFLATE Huffman codes. + * A zero length means the corresponding symbol had zero frequency. + */ struct deflate_lens { u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; u8 offset[DEFLATE_NUM_OFFSET_SYMS]; }; -/* Codewords and lengths for the DEFLATE Huffman codes. */ +/* Codewords and lengths for the DEFLATE Huffman codes */ struct deflate_codes { struct deflate_codewords codewords; struct deflate_lens lens; }; -/* Symbol frequency counters for the DEFLATE Huffman codes. 
*/ +/* Symbol frequency counters for the DEFLATE Huffman codes */ struct deflate_freqs { u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; u32 offset[DEFLATE_NUM_OFFSET_SYMS]; @@ -344,41 +346,49 @@ struct deflate_freqs { */ struct deflate_sequence { - /* Bits 0..22: the number of literals in this run. This may be 0 and + /* + * Bits 0..22: the number of literals in this run. This may be 0 and * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not * stored explicitly in this structure; instead, they are read directly * from the uncompressed data. * * Bits 23..31: the length of the match which follows the literals, or 0 * if this literal run was the last in the block, so there is no match - * which follows it. */ + * which follows it. + */ u32 litrunlen_and_length; - /* If 'length' doesn't indicate end-of-block, then this is the offset of - * the match which follows the literals. */ + /* + * If 'length' doesn't indicate end-of-block, then this is the offset of + * the match which follows the literals. + */ u16 offset; - /* If 'length' doesn't indicate end-of-block, then this is the offset - * symbol of the match which follows the literals. */ + /* + * If 'length' doesn't indicate end-of-block, then this is the offset + * symbol of the match which follows the literals. + */ u8 offset_symbol; - /* If 'length' doesn't indicate end-of-block, then this is the length - * slot of the match which follows the literals. */ + /* + * If 'length' doesn't indicate end-of-block, then this is the length + * slot of the match which follows the literals. + */ u8 length_slot; }; #if SUPPORT_NEAR_OPTIMAL_PARSING -/* Costs for the near-optimal parsing algorithm. */ +/* Costs for the near-optimal parsing algorithm */ struct deflate_costs { - /* The cost to output each possible literal. */ + /* The cost to output each possible literal */ u32 literal[DEFLATE_NUM_LITERALS]; - /* The cost to output each possible match length. */ + /* The cost to output each possible match length */ u32 length[DEFLATE_MAX_MATCH_LEN + 1]; - /* The cost to output a match offset of each possible offset slot. */ + /* The cost to output a match offset of each possible offset slot */ u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; }; @@ -426,7 +436,8 @@ struct deflate_optimum_node { /* Block split statistics. See "Block splitting algorithm" below. 
*/ #define NUM_LITERAL_OBSERVATION_TYPES 8 #define NUM_MATCH_OBSERVATION_TYPES 2 -#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES) +#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \ + NUM_MATCH_OBSERVATION_TYPES) #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 struct block_split_stats { u32 new_observations[NUM_OBSERVATION_TYPES]; @@ -435,51 +446,56 @@ struct block_split_stats { u32 num_observations; }; -/* The main DEFLATE compressor structure */ +/* The main DEFLATE compressor structure */ struct libdeflate_compressor { /* Pointer to the compress() implementation chosen at allocation time */ - size_t (*impl)(struct libdeflate_compressor *, - const u8 *, size_t, u8 *, size_t); + size_t (*impl)(struct libdeflate_compressor *c, const u8 *in, + size_t in_nbytes, u8 *out, size_t out_nbytes_avail); - /* Frequency counters for the current block */ + /* Frequency counters for the current block */ struct deflate_freqs freqs; - /* Dynamic Huffman codes for the current block */ + /* Dynamic Huffman codes for the current block */ struct deflate_codes codes; - /* Static Huffman codes */ + /* The static Huffman codes defined by the DEFLATE format */ struct deflate_codes static_codes; /* Block split statistics for the currently pending block */ struct block_split_stats split_stats; - /* The "nice" match length: if a match of this length is found, choose - * it immediately without further consideration. */ + /* + * The "nice" match length: if a match of this length is found, choose + * it immediately without further consideration + */ unsigned nice_match_length; - /* The maximum search depth: consider at most this many potential - * matches at each position. */ + /* + * The maximum search depth: consider at most this many potential + * matches at each position + */ unsigned max_search_depth; - /* The compression level with which this compressor was created. */ + /* The compression level with which this compressor was created */ unsigned compression_level; - /* Anything smaller than this we won't bother trying to compress. */ + /* Anything smaller than this we won't bother trying to compress. */ unsigned min_size_to_compress; - /* Temporary space for Huffman code output */ + /* Temporary space for Huffman code output */ u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS]; u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS]; - unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS]; + unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS]; unsigned num_litlen_syms; unsigned num_offset_syms; unsigned num_explicit_lens; unsigned num_precode_items; union { - /* Data for greedy or lazy parsing */ + /* Data for greedy or lazy parsing */ struct { /* Hash chains matchfinder */ struct hc_matchfinder hc_mf; @@ -501,10 +517,10 @@ struct libdeflate_compressor { } f; /* (f)astest */ #if SUPPORT_NEAR_OPTIMAL_PARSING - /* Data for near-optimal parsing */ + /* Data for near-optimal parsing */ struct { - /* Binary tree matchfinder */ + /* Binary tree matchfinder */ struct bt_matchfinder bt_mf; /* @@ -552,7 +568,7 @@ struct libdeflate_compressor { SOFT_MAX_BLOCK_LENGTH - 1 + DEFLATE_MAX_MATCH_LEN) + 1]; - /* The current cost model being used. 
*/ + /* The current cost model being used */ struct deflate_costs costs; /* @@ -593,30 +609,34 @@ struct libdeflate_compressor { typedef machine_word_t bitbuf_t; #define BITBUF_NBITS (8 * sizeof(bitbuf_t)) -/* Can the specified number of bits always be added to 'bitbuf' after any - * pending bytes have been flushed? */ +/* + * Can the specified number of bits always be added to 'bitbuf' after any + * pending bytes have been flushed? + */ #define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7) /* * Structure to keep track of the current state of sending bits to the - * compressed output buffer. + * compressed output buffer */ struct deflate_output_bitstream { - /* Bits that haven't yet been written to the output buffer. */ + /* Bits that haven't yet been written to the output buffer */ bitbuf_t bitbuf; - /* Number of bits currently held in @bitbuf. */ + /* Number of bits currently held in @bitbuf */ unsigned bitcount; - /* Pointer to the beginning of the output buffer. */ + /* Pointer to the beginning of the output buffer */ u8 *begin; - /* Pointer to the position in the output buffer at which the next byte - * should be written. */ + /* + * Pointer to the position in the output buffer at which the next byte + * should be written + */ u8 *next; - /* Pointer just past the end of the output buffer. */ + /* Pointer just past the end of the output buffer */ u8 *end; }; @@ -631,8 +651,10 @@ struct deflate_output_bitstream { */ #define OUTPUT_END_PADDING 8 -/* Initialize the output bitstream. 'size' is assumed to be at least - * OUTPUT_END_PADDING. */ +/* + * Initialize the output bitstream. 'size' is assumed to be at least + * OUTPUT_END_PADDING. + */ static void deflate_init_output(struct deflate_output_bitstream *os, void *buffer, size_t size) @@ -644,8 +666,10 @@ deflate_init_output(struct deflate_output_bitstream *os, os->end = os->begin + size - OUTPUT_END_PADDING; } -/* Add some bits to the bitbuffer variable of the output bitstream. The caller - * must make sure there is enough room. */ +/* + * Add some bits to the bitbuffer variable of the output bitstream. The caller + * must make sure there is enough room. + */ static forceinline void deflate_add_bits(struct deflate_output_bitstream *os, const bitbuf_t bits, const unsigned num_bits) @@ -654,18 +678,18 @@ deflate_add_bits(struct deflate_output_bitstream *os, os->bitcount += num_bits; } -/* Flush bits from the bitbuffer variable to the output buffer. */ +/* Flush bits from the bitbuffer variable to the output buffer. */ static forceinline void deflate_flush_bits(struct deflate_output_bitstream *os) { if (UNALIGNED_ACCESS_IS_FAST) { - /* Flush a whole word (branchlessly). */ + /* Flush a whole word (branchlessly). */ put_unaligned_leword(os->bitbuf, os->next); os->bitbuf >>= os->bitcount & ~7; os->next += MIN(os->end - os->next, os->bitcount >> 3); os->bitcount &= 7; } else { - /* Flush a byte at a time. */ + /* Flush a byte at a time. */ while (os->bitcount >= 8) { *os->next = os->bitbuf; if (os->next != os->end) @@ -691,7 +715,7 @@ deflate_align_bitstream(struct deflate_output_bitstream *os) static size_t deflate_flush_output(struct deflate_output_bitstream *os) { - if (os->next == os->end) /* overflow? */ + if (os->next == os->end) /* overflow? 
*/ return 0; while ((int)os->bitcount > 0) { @@ -703,10 +727,12 @@ deflate_flush_output(struct deflate_output_bitstream *os) return os->next - os->begin; } -/* Given the binary tree node A[subtree_idx] whose children already - * satisfy the maxheap property, swap the node with its greater child - * until it is greater than both its children, so that the maxheap - * property is satisfied in the subtree rooted at A[subtree_idx]. */ +/* + * Given the binary tree node A[subtree_idx] whose children already satisfy the + * maxheap property, swap the node with its greater child until it is greater + * than both its children, so that the maxheap property is satisfied in the + * subtree rooted at A[subtree_idx]. + */ static void heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) { @@ -727,7 +753,8 @@ heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) A[parent_idx] = v; } -/* Rearrange the array 'A' so that it satisfies the maxheap property. +/* + * Rearrange the array 'A' so that it satisfies the maxheap property. * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. */ static void @@ -755,6 +782,7 @@ heap_sort(u32 A[], unsigned length) while (length >= 2) { u32 tmp = A[length]; + A[length] = A[1]; A[1] = tmp; length--; @@ -766,12 +794,13 @@ heap_sort(u32 A[], unsigned length) #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) #define GET_NUM_COUNTERS(num_syms) ((((num_syms) + 3 / 4) + 3) & ~3) + /* - * Sort the symbols primarily by frequency and secondarily by symbol - * value. Discard symbols with zero frequency and fill in an array with - * the remaining symbols, along with their frequencies. The low - * NUM_SYMBOL_BITS bits of each array entry will contain the symbol - * value, and the remaining bits will contain the frequency. + * Sort the symbols primarily by frequency and secondarily by symbol value. + * Discard symbols with zero frequency and fill in an array with the remaining + * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each + * array entry will contain the symbol value, and the remaining bits will + * contain the frequency. * * @num_syms * Number of symbols in the alphabet. @@ -781,16 +810,15 @@ heap_sort(u32 A[], unsigned length) * The frequency of each symbol. * * @lens[num_syms] - * An array that eventually will hold the length of each codeword. - * This function only fills in the codeword lengths for symbols that - * have zero frequency, which are not well defined per se but will - * be set to 0. + * An array that eventually will hold the length of each codeword. This + * function only fills in the codeword lengths for symbols that have zero + * frequency, which are not well defined per se but will be set to 0. * * @symout[num_syms] * The output array, described above. * - * Returns the number of entries in 'symout' that were filled. This is - * the number of symbols that have nonzero frequency. + * Returns the number of entries in 'symout' that were filled. This is the + * number of symbols that have nonzero frequency. */ static unsigned sort_symbols(unsigned num_syms, const u32 freqs[restrict], @@ -802,50 +830,57 @@ sort_symbols(unsigned num_syms, const u32 freqs[restrict], unsigned num_counters; unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; - /* We rely on heapsort, but with an added optimization. Since - * it's common for most symbol frequencies to be low, we first do - * a count sort using a limited number of counters. 
High - * frequencies will be counted in the last counter, and only they - * will be sorted with heapsort. + /* + * We rely on heapsort, but with an added optimization. Since it's + * common for most symbol frequencies to be low, we first do a count + * sort using a limited number of counters. High frequencies will be + * counted in the last counter, and only they will be sorted with + * heapsort. * * Note: with more symbols, it is generally beneficial to have more * counters. About 1 counter per 4 symbols seems fast. * - * Note: I also tested radix sort, but even for large symbol - * counts (> 255) and frequencies bounded at 16 bits (enabling - * radix sort by just two base-256 digits), it didn't seem any - * faster than the method implemented here. + * Note: I also tested radix sort, but even for large symbol counts (> + * 255) and frequencies bounded at 16 bits (enabling radix sort by just + * two base-256 digits), it didn't seem any faster than the method + * implemented here. * - * Note: I tested the optimized quicksort implementation from - * glibc (with indirection overhead removed), but it was only - * marginally faster than the simple heapsort implemented here. + * Note: I tested the optimized quicksort implementation from glibc + * (with indirection overhead removed), but it was only marginally + * faster than the simple heapsort implemented here. * - * Tests were done with building the codes for LZX. Results may - * vary for different compression algorithms...! */ + * Tests were done with building the codes for LZX. Results may vary + * for different compression algorithms...! + */ num_counters = GET_NUM_COUNTERS(num_syms); memset(counters, 0, num_counters * sizeof(counters[0])); - /* Count the frequencies. */ + /* Count the frequencies. */ for (sym = 0; sym < num_syms; sym++) counters[MIN(freqs[sym], num_counters - 1)]++; - /* Make the counters cumulative, ignoring the zero-th, which - * counted symbols with zero frequency. As a side effect, this - * calculates the number of symbols with nonzero frequency. */ + /* + * Make the counters cumulative, ignoring the zero-th, which counted + * symbols with zero frequency. As a side effect, this calculates the + * number of symbols with nonzero frequency. + */ num_used_syms = 0; for (i = 1; i < num_counters; i++) { unsigned count = counters[i]; + counters[i] = num_used_syms; num_used_syms += count; } - /* Sort nonzero-frequency symbols using the counters. At the - * same time, set the codeword lengths of zero-frequency symbols - * to 0. */ + /* + * Sort nonzero-frequency symbols using the counters. At the same time, + * set the codeword lengths of zero-frequency symbols to 0. + */ for (sym = 0; sym < num_syms; sym++) { u32 freq = freqs[sym]; + if (freq != 0) { symout[counters[MIN(freq, num_counters - 1)]++] = sym | (freq << NUM_SYMBOL_BITS); @@ -854,7 +889,7 @@ sort_symbols(unsigned num_syms, const u32 freqs[restrict], } } - /* Sort the symbols counted in the last counter. */ + /* Sort the symbols counted in the last counter. 
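+	 * These are the symbols whose frequency reached the final counter,
+	 * i.e. freq >= num_counters - 1, which the count sort grouped
+	 * together but left unsorted. After the placement loop above,
+	 * counters[num_counters - 2] is the start of that group and
+	 * counters[num_counters - 1] is its end.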
*/ heap_sort(symout + counters[num_counters - 2], counters[num_counters - 1] - counters[num_counters - 2]); @@ -866,77 +901,82 @@ sort_symbols(unsigned num_syms, const u32 freqs[restrict], * * This is an optimized implementation that * (a) takes advantage of the frequencies being already sorted; - * (b) only generates non-leaf nodes, since the non-leaf nodes of a - * Huffman tree are sufficient to generate a canonical code; + * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman + * tree are sufficient to generate a canonical code; * (c) Only stores parent pointers, not child pointers; - * (d) Produces the nodes in the same memory used for input - * frequency information. + * (d) Produces the nodes in the same memory used for input frequency + * information. * - * Array 'A', which contains 'sym_count' entries, is used for both input - * and output. For this function, 'sym_count' must be at least 2. + * Array 'A', which contains 'sym_count' entries, is used for both input and + * output. For this function, 'sym_count' must be at least 2. * - * For input, the array must contain the frequencies of the symbols, - * sorted in increasing order. Specifically, each entry must contain a - * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low - * NUM_SYMBOL_BITS bits of the entries will be ignored by this function. - * Although these bits will, in fact, contain the symbols that correspond - * to the frequencies, this function is concerned with frequencies only - * and keeps the symbols as-is. + * For input, the array must contain the frequencies of the symbols, sorted in + * increasing order. Specifically, each entry must contain a frequency left + * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of + * the entries will be ignored by this function. Although these bits will, in + * fact, contain the symbols that correspond to the frequencies, this function + * is concerned with frequencies only and keeps the symbols as-is. * - * For output, this function will produce the non-leaf nodes of the - * Huffman tree. These nodes will be stored in the first (sym_count - 1) - * entries of the array. Entry A[sym_count - 2] will represent the root - * node. Each other node will contain the zero-based index of its parent - * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low - * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again, - * note that although these low bits will, in fact, contain a symbol - * value, this symbol will have *no relationship* with the Huffman tree - * node that happens to occupy the same slot. This is because this + * For output, this function will produce the non-leaf nodes of the Huffman + * tree. These nodes will be stored in the first (sym_count - 1) entries of the + * array. Entry A[sym_count - 2] will represent the root node. Each other node + * will contain the zero-based index of its parent node in 'A', left shifted by + * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will + * be kept as-is. Again, note that although these low bits will, in fact, + * contain a symbol value, this symbol will have *no relationship* with the + * Huffman tree node that happens to occupy the same slot. This is because this * implementation only generates the non-leaf nodes of the tree. */ static void build_tree(u32 A[], unsigned sym_count) { - /* Index, in 'A', of next lowest frequency symbol that has not - * yet been processed. 
*/ + /* + * Index, in 'A', of next lowest frequency symbol that has not yet been + * processed. + */ unsigned i = 0; - /* Index, in 'A', of next lowest frequency parentless non-leaf - * node; or, if equal to 'e', then no such node exists yet. */ + /* + * Index, in 'A', of next lowest frequency parentless non-leaf node; or, + * if equal to 'e', then no such node exists yet. + */ unsigned b = 0; - /* Index, in 'A', of next node to allocate as a non-leaf. */ + /* Index, in 'A', of next node to allocate as a non-leaf. */ unsigned e = 0; do { unsigned m, n; u32 freq_shifted; - /* Choose the two next lowest frequency entries. */ + /* Choose the two next lowest frequency entries. */ if (i != sym_count && - (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) + (b == e || + (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) m = i++; else m = b++; if (i != sym_count && - (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) + (b == e || + (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) n = i++; else n = b++; - /* Allocate a non-leaf node and link the entries to it. + /* + * Allocate a non-leaf node and link the entries to it. * - * If we link an entry that we're visiting for the first - * time (via index 'i'), then we're actually linking a - * leaf node and it will have no effect, since the leaf - * will be overwritten with a non-leaf when index 'e' - * catches up to it. But it's not any slower to - * unconditionally set the parent index. + * If we link an entry that we're visiting for the first time + * (via index 'i'), then we're actually linking a leaf node and + * it will have no effect, since the leaf will be overwritten + * with a non-leaf when index 'e' catches up to it. But it's + * not any slower to unconditionally set the parent index. * - * We also compute the frequency of the non-leaf node as - * the sum of its two children's frequencies. */ + * We also compute the frequency of the non-leaf node as the sum + * of its two children's frequencies. + */ freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK); @@ -945,36 +985,36 @@ build_tree(u32 A[], unsigned sym_count) A[e] = (A[e] & SYMBOL_MASK) | freq_shifted; e++; } while (sym_count - e > 1); - /* When just one entry remains, it is a "leaf" that was - * linked to some other node. We ignore it, since the - * rest of the array contains the non-leaves which we - * need. (Note that we're assuming the cases with 0 or 1 - * symbols were handled separately.) */ + /* + * When just one entry remains, it is a "leaf" that was linked + * to some other node. We ignore it, since the rest of the + * array contains the non-leaves which we need. (Note that + * we're assuming the cases with 0 or 1 symbols were handled + * separately.) + */ } /* - * Given the stripped-down Huffman tree constructed by build_tree(), - * determine the number of codewords that should be assigned each - * possible length, taking into account the length-limited constraint. + * Given the stripped-down Huffman tree constructed by build_tree(), determine + * the number of codewords that should be assigned each possible length, taking + * into account the length-limited constraint. * * @A - * The array produced by build_tree(), containing parent index - * information for the non-leaf nodes of the Huffman tree. Each - * entry in this array is a node; a node's parent always has a - * greater index than that node itself. 
This function will - * overwrite the parent index information in this array, so - * essentially it will destroy the tree. However, the data in the - * low NUM_SYMBOL_BITS of each entry will be preserved. + * The array produced by build_tree(), containing parent index information + * for the non-leaf nodes of the Huffman tree. Each entry in this array is + * a node; a node's parent always has a greater index than that node + * itself. This function will overwrite the parent index information in + * this array, so essentially it will destroy the tree. However, the data + * in the low NUM_SYMBOL_BITS of each entry will be preserved. * * @root_idx - * The 0-based index of the root node in 'A', and consequently one - * less than the number of tree node entries in 'A'. (Or, really 2 - * less than the actual length of 'A'.) + * The 0-based index of the root node in 'A', and consequently one less + * than the number of tree node entries in 'A'. (Or, really 2 less than + * the actual length of 'A'.) * * @len_counts * An array of length ('max_codeword_len' + 1) in which the number of - * codewords having each length <= max_codeword_len will be - * returned. + * codewords having each length <= max_codeword_len will be returned. * * @max_codeword_len * The maximum permissible codeword length. @@ -986,53 +1026,55 @@ compute_length_counts(u32 A[restrict], unsigned root_idx, unsigned len; int node; - /* The key observations are: + /* + * The key observations are: * - * (1) We can traverse the non-leaf nodes of the tree, always - * visiting a parent before its children, by simply iterating - * through the array in reverse order. Consequently, we can - * compute the depth of each node in one pass, overwriting the - * parent indices with depths. + * (1) We can traverse the non-leaf nodes of the tree, always visiting a + * parent before its children, by simply iterating through the array + * in reverse order. Consequently, we can compute the depth of each + * node in one pass, overwriting the parent indices with depths. * - * (2) We can initially assume that in the real Huffman tree, - * both children of the root are leaves. This corresponds to two - * codewords of length 1. Then, whenever we visit a (non-leaf) - * node during the traversal, we modify this assumption to - * account for the current node *not* being a leaf, but rather - * its two children being leaves. This causes the loss of one - * codeword for the current depth and the addition of two - * codewords for the current depth plus one. + * (2) We can initially assume that in the real Huffman tree, both + * children of the root are leaves. This corresponds to two + * codewords of length 1. Then, whenever we visit a (non-leaf) node + * during the traversal, we modify this assumption to account for + * the current node *not* being a leaf, but rather its two children + * being leaves. This causes the loss of one codeword for the + * current depth and the addition of two codewords for the current + * depth plus one. * - * (3) We can handle the length-limited constraint fairly easily - * by simply using the largest length available when a depth - * exceeds max_codeword_len. + * (3) We can handle the length-limited constraint fairly easily by + * simply using the largest length available when a depth exceeds + * max_codeword_len. */ for (len = 0; len <= max_codeword_len; len++) len_counts[len] = 0; len_counts[1] = 2; - /* Set the root node's depth to 0. */ + /* Set the root node's depth to 0. 
*/ A[root_idx] &= SYMBOL_MASK; for (node = root_idx - 1; node >= 0; node--) { - /* Calculate the depth of this node. */ + /* Calculate the depth of this node. */ unsigned parent = A[node] >> NUM_SYMBOL_BITS; unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; unsigned depth = parent_depth + 1; unsigned len = depth; - /* Set the depth of this node so that it is available - * when its children (if any) are processed. */ - + /* + * Set the depth of this node so that it is available when its + * children (if any) are processed. + */ A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); - /* If needed, decrease the length to meet the - * length-limited constraint. This is not the optimal - * method for generating length-limited Huffman codes! - * But it should be good enough. */ + /* + * If needed, decrease the length to meet the length-limited + * constraint. This is not the optimal method for generating + * length-limited Huffman codes! But it should be good enough. + */ if (len >= max_codeword_len) { len = max_codeword_len; do { @@ -1040,8 +1082,10 @@ compute_length_counts(u32 A[restrict], unsigned root_idx, } while (len_counts[len] == 0); } - /* Account for the fact that we have a non-leaf node at - * the current depth. */ + /* + * Account for the fact that we have a non-leaf node at the + * current depth. + */ len_counts[len]--; len_counts[len + 1] += 2; } @@ -1080,21 +1124,25 @@ gen_codewords(u32 A[restrict], u8 lens[restrict], unsigned len; unsigned sym; - /* Given the number of codewords that will have each length, - * assign codeword lengths to symbols. We do this by assigning - * the lengths in decreasing order to the symbols sorted - * primarily by increasing frequency and secondarily by - * increasing symbol value. */ + /* + * Given the number of codewords that will have each length, assign + * codeword lengths to symbols. We do this by assigning the lengths in + * decreasing order to the symbols sorted primarily by increasing + * frequency and secondarily by increasing symbol value. + */ for (i = 0, len = max_codeword_len; len >= 1; len--) { unsigned count = len_counts[len]; + while (count--) lens[A[i++] & SYMBOL_MASK] = len; } - /* Generate the codewords themselves. We initialize the + /* + * Generate the codewords themselves. We initialize the * 'next_codewords' array to provide the lexicographically first - * codeword of each length, then assign codewords in symbol - * order. This produces a canonical code. */ + * codeword of each length, then assign codewords in symbol order. This + * produces a canonical code. + */ next_codewords[0] = 0; next_codewords[1] = 0; for (len = 2; len <= max_codeword_len; len++) @@ -1114,88 +1162,81 @@ gen_codewords(u32 A[restrict], u8 lens[restrict], * length-limited canonical Huffman code. * * @num_syms - * The number of symbols in the alphabet. The symbols are the - * integers in the range [0, num_syms - 1]. This parameter must be - * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS). + * The number of symbols in the alphabet. The symbols are the integers in + * the range [0, num_syms - 1]. This parameter must be at least 2 and + * can't be greater than (1 << NUM_SYMBOL_BITS). * * @max_codeword_len * The maximum permissible codeword length. * * @freqs - * An array of @num_syms entries, each of which specifies the - * frequency of the corresponding symbol. It is valid for some, - * none, or all of the frequencies to be 0. + * An array of @num_syms entries, each of which specifies the frequency of + * the corresponding symbol. 
It is valid for some, none, or all of the + * frequencies to be 0. * * @lens - * An array of @num_syms entries in which this function will return - * the length, in bits, of the codeword assigned to each symbol. - * Symbols with 0 frequency will not have codewords per se, but - * their entries in this array will be set to 0. No lengths greater - * than @max_codeword_len will be assigned. + * An array of @num_syms entries in which this function will return the + * length, in bits, of the codeword assigned to each symbol. Symbols with + * 0 frequency will not have codewords per se, but their entries in this + * array will be set to 0. No lengths greater than @max_codeword_len will + * be assigned. * * @codewords - * An array of @num_syms entries in which this function will return - * the codeword for each symbol, right-justified and padded on the - * left with zeroes. Codewords for symbols with 0 frequency will be - * undefined. + * An array of @num_syms entries in which this function will return the + * codeword for each symbol, right-justified and padded on the left with + * zeroes. Codewords for symbols with 0 frequency will be undefined. * * --------------------------------------------------------------------- * * This function builds a length-limited canonical Huffman code. * * A length-limited Huffman code contains no codewords longer than some - * specified length, and has exactly (with some algorithms) or - * approximately (with the algorithm used here) the minimum weighted path - * length from the root, given this constraint. + * specified length, and has exactly (with some algorithms) or approximately + * (with the algorithm used here) the minimum weighted path length from the + * root, given this constraint. * - * A canonical Huffman code satisfies the properties that a longer - * codeword never lexicographically precedes a shorter codeword, and the - * lexicographic ordering of codewords of the same length is the same as - * the lexicographic ordering of the corresponding symbols. A canonical - * Huffman code, or more generally a canonical prefix code, can be - * reconstructed from only a list containing the codeword length of each - * symbol. + * A canonical Huffman code satisfies the properties that a longer codeword + * never lexicographically precedes a shorter codeword, and the lexicographic + * ordering of codewords of the same length is the same as the lexicographic + * ordering of the corresponding symbols. A canonical Huffman code, or more + * generally a canonical prefix code, can be reconstructed from only a list + * containing the codeword length of each symbol. * - * The classic algorithm to generate a Huffman code creates a node for - * each symbol, then inserts these nodes into a min-heap keyed by symbol - * frequency. Then, repeatedly, the two lowest-frequency nodes are - * removed from the min-heap and added as the children of a new node - * having frequency equal to the sum of its two children, which is then - * inserted into the min-heap. When only a single node remains in the - * min-heap, it is the root of the Huffman tree. The codeword for each - * symbol is determined by the path needed to reach the corresponding - * node from the root. Descending to the left child appends a 0 bit, - * whereas descending to the right child appends a 1 bit. + * The classic algorithm to generate a Huffman code creates a node for each + * symbol, then inserts these nodes into a min-heap keyed by symbol frequency. 
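+ * In rough C form, the merging loop described next might look like this
+ * (heap_size(), heap_pop(), heap_push(), and new_node() are hypothetical
+ * helpers, not functions in this file):
+ *
+ *	while (heap_size(&heap) > 1) {
+ *		struct huffman_tree_node *a = heap_pop(&heap);
+ *		struct huffman_tree_node *b = heap_pop(&heap);
+ *
+ *		heap_push(&heap, new_node(a, b,
+ *					  a->frequency + b->frequency));
+ *	}
+ *	root = heap_pop(&heap);
+ *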
+ * Then, repeatedly, the two lowest-frequency nodes are removed from the + * min-heap and added as the children of a new node having frequency equal to + * the sum of its two children, which is then inserted into the min-heap. When + * only a single node remains in the min-heap, it is the root of the Huffman + * tree. The codeword for each symbol is determined by the path needed to reach + * the corresponding node from the root. Descending to the left child appends a + * 0 bit, whereas descending to the right child appends a 1 bit. * - * The classic algorithm is relatively easy to understand, but it is - * subject to a number of inefficiencies. In practice, it is fastest to - * first sort the symbols by frequency. (This itself can be subject to - * an optimization based on the fact that most frequencies tend to be - * low.) At the same time, we sort secondarily by symbol value, which - * aids the process of generating a canonical code. Then, during tree - * construction, no heap is necessary because both the leaf nodes and the - * unparented non-leaf nodes can be easily maintained in sorted order. - * Consequently, there can never be more than two possibilities for the - * next-lowest-frequency node. + * The classic algorithm is relatively easy to understand, but it is subject to + * a number of inefficiencies. In practice, it is fastest to first sort the + * symbols by frequency. (This itself can be subject to an optimization based + * on the fact that most frequencies tend to be low.) At the same time, we sort + * secondarily by symbol value, which aids the process of generating a canonical + * code. Then, during tree construction, no heap is necessary because both the + * leaf nodes and the unparented non-leaf nodes can be easily maintained in + * sorted order. Consequently, there can never be more than two possibilities + * for the next-lowest-frequency node. * - * In addition, because we're generating a canonical code, we actually - * don't need the leaf nodes of the tree at all, only the non-leaf nodes. - * This is because for canonical code generation we don't need to know - * where the symbols are in the tree. Rather, we only need to know how - * many leaf nodes have each depth (codeword length). And this - * information can, in fact, be quickly generated from the tree of - * non-leaves only. + * In addition, because we're generating a canonical code, we actually don't + * need the leaf nodes of the tree at all, only the non-leaf nodes. This is + * because for canonical code generation we don't need to know where the symbols + * are in the tree. Rather, we only need to know how many leaf nodes have each + * depth (codeword length). And this information can, in fact, be quickly + * generated from the tree of non-leaves only. * - * Furthermore, we can build this stripped-down Huffman tree directly in - * the array in which the codewords are to be generated, provided that - * these array slots are large enough to hold a symbol and frequency - * value. + * Furthermore, we can build this stripped-down Huffman tree directly in the + * array in which the codewords are to be generated, provided that these array + * slots are large enough to hold a symbol and frequency value. * - * Still furthermore, we don't even need to maintain explicit child - * pointers. We only need the parent pointers, and even those can be - * overwritten in-place with depth information as part of the process of - * extracting codeword lengths from the tree. 
So in summary, we do NOT - * need a big structure like: + * Still furthermore, we don't even need to maintain explicit child pointers. + * We only need the parent pointers, and even those can be overwritten in-place + * with depth information as part of the process of extracting codeword lengths + * from the tree. So in summary, we do NOT need a big structure like: * * struct huffman_tree_node { * unsigned int symbol; @@ -1206,12 +1247,11 @@ gen_codewords(u32 A[restrict], u8 lens[restrict], * }; * * - * ... which often gets used in "naive" implementations of Huffman code - * generation. + * ... which often gets used in "naive" implementations of Huffman code + * generation. * - * Many of these optimizations are based on the implementation in 7-Zip - * (source file: C/HuffEnc.c), which has been placed in the public domain - * by Igor Pavlov. + * Many of these optimizations are based on the implementation in 7-Zip (source + * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov. */ static void make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, @@ -1223,37 +1263,44 @@ make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); - /* We begin by sorting the symbols primarily by frequency and - * secondarily by symbol value. As an optimization, the array - * used for this purpose ('A') shares storage with the space in - * which we will eventually return the codewords. */ - + /* + * We begin by sorting the symbols primarily by frequency and + * secondarily by symbol value. As an optimization, the array used for + * this purpose ('A') shares storage with the space in which we will + * eventually return the codewords. + */ num_used_syms = sort_symbols(num_syms, freqs, lens, A); - /* 'num_used_syms' is the number of symbols with nonzero - * frequency. This may be less than @num_syms. 'num_used_syms' - * is also the number of entries in 'A' that are valid. Each - * entry consists of a distinct symbol and a nonzero frequency - * packed into a 32-bit integer. */ + /* + * 'num_used_syms' is the number of symbols with nonzero frequency. + * This may be less than @num_syms. 'num_used_syms' is also the number + * of entries in 'A' that are valid. Each entry consists of a distinct + * symbol and a nonzero frequency packed into a 32-bit integer. + */ - /* Handle special cases where only 0 or 1 symbols were used (had - * nonzero frequency). */ + /* + * Handle special cases where only 0 or 1 symbols were used (had nonzero + * frequency). + */ if (unlikely(num_used_syms == 0)) { - /* Code is empty. sort_symbols() already set all lengths - * to 0, so there is nothing more to do. */ + /* + * Code is empty. sort_symbols() already set all lengths to 0, + * so there is nothing more to do. + */ return; } if (unlikely(num_used_syms == 1)) { - /* Only one symbol was used, so we only need one - * codeword. But two codewords are needed to form the - * smallest complete Huffman code, which uses codewords 0 - * and 1. Therefore, we choose another symbol to which - * to assign a codeword. We use 0 (if the used symbol is - * not 0) or 1 (if the used symbol is 0). In either - * case, the lesser-valued symbol must be assigned - * codeword 0 so that the resulting code is canonical. */ + /* + * Only one symbol was used, so we only need one codeword. But + * two codewords are needed to form the smallest complete + * Huffman code, which uses codewords 0 and 1. 
Therefore, we + * choose another symbol to which to assign a codeword. We use + * 0 (if the used symbol is not 0) or 1 (if the used symbol is + * 0). In either case, the lesser-valued symbol must be + * assigned codeword 0 so that the resulting code is canonical. + */ unsigned sym = A[0] & SYMBOL_MASK; unsigned nonzero_idx = sym ? sym : 1; @@ -1265,9 +1312,11 @@ make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, return; } - /* Build a stripped-down version of the Huffman tree, sharing the - * array 'A' with the symbol values. Then extract length counts - * from the tree and use them to generate the final codewords. */ + /* + * Build a stripped-down version of the Huffman tree, sharing the array + * 'A' with the symbol values. Then extract length counts from the tree + * and use them to generate the final codewords. + */ build_tree(A, num_used_syms); @@ -1282,8 +1331,8 @@ make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, } /* - * Clear the Huffman symbol frequency counters. - * This must be called when starting a new DEFLATE block. + * Clear the Huffman symbol frequency counters. This must be called when + * starting a new DEFLATE block. */ static void deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) @@ -1291,32 +1340,34 @@ deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) memset(&c->freqs, 0, sizeof(c->freqs)); } -/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */ +/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */ static u32 deflate_reverse_codeword(u32 codeword, u8 len) { - /* The following branchless algorithm is faster than going bit by bit. + /* + * The following branchless algorithm is faster than going bit by bit. * Note: since no codewords are longer than 16 bits, we only need to - * reverse the low 16 bits of the 'u32'. */ + * reverse the low 16 bits of the 'u32'. + */ STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); - /* Flip adjacent 1-bit fields */ + /* Flip adjacent 1-bit fields. */ codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1); - /* Flip adjacent 2-bit fields */ + /* Flip adjacent 2-bit fields. */ codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2); - /* Flip adjacent 4-bit fields */ + /* Flip adjacent 4-bit fields. */ codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4); - /* Flip adjacent 8-bit fields */ + /* Flip adjacent 8-bit fields. */ codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8); - /* Return the high 'len' bits of the bit-reversed 16 bit value. */ + /* Return the high 'len' bits of the bit-reversed 16 bit value. */ return codeword >> (16 - len); } -/* Make a canonical Huffman code with bit-reversed codewords. */ +/* Make a canonical Huffman code with bit-reversed codewords. 
*/ static void deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, const u32 freqs[], u8 lens[], u32 codewords[]) @@ -1327,7 +1378,8 @@ deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, freqs, lens, codewords); for (sym = 0; sym < num_syms; sym++) - codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]); + codewords[sym] = deflate_reverse_codeword(codewords[sym], + lens[sym]); } /* @@ -1340,8 +1392,10 @@ static void deflate_make_huffman_codes(const struct deflate_freqs *freqs, struct deflate_codes *codes) { - STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); - STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); + STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= + DEFLATE_MAX_LITLEN_CODEWORD_LEN); + STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= + DEFLATE_MAX_OFFSET_CODEWORD_LEN); deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, MAX_LITLEN_CODEWORD_LEN, @@ -1356,7 +1410,7 @@ deflate_make_huffman_codes(const struct deflate_freqs *freqs, codes->codewords.offset); } -/* Initialize c->static_codes. */ +/* Initialize c->static_codes. */ static void deflate_init_static_codes(struct libdeflate_compressor *c) { @@ -1395,7 +1449,7 @@ deflate_get_offset_slot(unsigned offset) #endif } -/* Write the header fields common to all DEFLATE block types. */ +/* Write the header fields common to all DEFLATE block types. */ static void deflate_write_block_header(struct deflate_output_bitstream *os, bool is_final_block, unsigned block_type) @@ -1423,31 +1477,33 @@ deflate_compute_precode_items(const u8 lens[restrict], itemptr = precode_items; run_start = 0; do { - /* Find the next run of codeword lengths. */ + /* Find the next run of codeword lengths. */ - /* len = the length being repeated */ + /* len = the length being repeated */ len = lens[run_start]; - /* Extend the run. */ + /* Extend the run. */ run_end = run_start; do { run_end++; } while (run_end != num_lens && len == lens[run_end]); if (len == 0) { - /* Run of zeroes. */ + /* Run of zeroes. */ - /* Symbol 18: RLE 11 to 138 zeroes at a time. */ + /* Symbol 18: RLE 11 to 138 zeroes at a time. */ while ((run_end - run_start) >= 11) { - extra_bits = MIN((run_end - run_start) - 11, 0x7F); + extra_bits = MIN((run_end - run_start) - 11, + 0x7F); precode_freqs[18]++; *itemptr++ = 18 | (extra_bits << 5); run_start += 11 + extra_bits; } - /* Symbol 17: RLE 3 to 10 zeroes at a time. */ + /* Symbol 17: RLE 3 to 10 zeroes at a time. */ if ((run_end - run_start) >= 3) { - extra_bits = MIN((run_end - run_start) - 3, 0x7); + extra_bits = MIN((run_end - run_start) - 3, + 0x7); precode_freqs[17]++; *itemptr++ = 17 | (extra_bits << 5); run_start += 3 + extra_bits; @@ -1456,13 +1512,14 @@ deflate_compute_precode_items(const u8 lens[restrict], /* A run of nonzero lengths. */ - /* Symbol 16: RLE 3 to 6 of the previous length. */ + /* Symbol 16: RLE 3 to 6 of the previous length. */ if ((run_end - run_start) >= 4) { precode_freqs[len]++; *itemptr++ = len; run_start++; do { - extra_bits = MIN((run_end - run_start) - 3, 0x3); + extra_bits = MIN((run_end - run_start) - + 3, 0x3); precode_freqs[16]++; *itemptr++ = 16 | (extra_bits << 5); run_start += 3 + extra_bits; @@ -1470,7 +1527,7 @@ deflate_compute_precode_items(const u8 lens[restrict], } } - /* Output any remaining lengths without RLE. */ + /* Output any remaining lengths without RLE. 
*/ while (run_start != run_end) { precode_freqs[len]++; *itemptr++ = len; @@ -1509,21 +1566,23 @@ deflate_precompute_huffman_header(struct libdeflate_compressor *c) if (c->codes.lens.offset[c->num_offset_syms - 1] != 0) break; - /* If we're not using the full set of literal/length codeword lengths, + /* + * If we're not using the full set of literal/length codeword lengths, * then temporarily move the offset codeword lengths over so that the - * literal/length and offset codeword lengths are contiguous. */ - + * literal/length and offset codeword lengths are contiguous. + */ STATIC_ASSERT(offsetof(struct deflate_lens, offset) == DEFLATE_NUM_LITLEN_SYMS); - if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { memmove((u8 *)&c->codes.lens + c->num_litlen_syms, (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, c->num_offset_syms); } - /* Compute the "items" (RLE / literal tokens and extra bits) with which - * the codeword lengths in the larger code will be output. */ + /* + * Compute the "items" (RLE / literal tokens and extra bits) with which + * the codeword lengths in the larger code will be output. + */ c->num_precode_items = deflate_compute_precode_items((u8 *)&c->codes.lens, c->num_litlen_syms + @@ -1566,17 +1625,18 @@ deflate_write_huffman_header(struct libdeflate_compressor *c, deflate_add_bits(os, c->num_explicit_lens - 4, 4); deflate_flush_bits(os); - /* Output the lengths of the codewords in the precode. */ + /* Output the lengths of the codewords in the precode. */ for (i = 0; i < c->num_explicit_lens; i++) { deflate_add_bits(os, c->precode_lens[ deflate_precode_lens_permutation[i]], 3); deflate_flush_bits(os); } - /* Output the encoded lengths of the codewords in the larger code. */ + /* Output the encoded lengths of the codewords in the larger code. 
*/ for (i = 0; i < c->num_precode_items; i++) { unsigned precode_item = c->precode_items[i]; unsigned precode_sym = precode_item & 0x1F; + deflate_add_bits(os, c->precode_codewords[precode_sym], c->precode_lens[precode_sym]); if (precode_sym >= 16) { @@ -1689,14 +1749,15 @@ deflate_write_sequences(struct deflate_output_bitstream * restrict os, length_slot = seq->length_slot; litlen_symbol = DEFLATE_FIRST_LEN_SYM + length_slot; - /* Litlen symbol */ + /* Litlen symbol */ deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], codes->lens.litlen[litlen_symbol]); - /* Extra length bits */ + /* Extra length bits */ STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + DEFLATE_MAX_EXTRA_LENGTH_BITS)); - deflate_add_bits(os, length - deflate_length_slot_base[length_slot], + deflate_add_bits(os, + length - deflate_length_slot_base[length_slot], deflate_extra_length_bits[length_slot]); if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + @@ -1705,7 +1766,7 @@ deflate_write_sequences(struct deflate_output_bitstream * restrict os, DEFLATE_MAX_EXTRA_OFFSET_BITS)) deflate_flush_bits(os); - /* Offset symbol */ + /* Offset symbol */ offset_symbol = seq->offset_symbol; deflate_add_bits(os, codes->codewords.offset[offset_symbol], codes->lens.offset[offset_symbol]); @@ -1714,8 +1775,9 @@ deflate_write_sequences(struct deflate_output_bitstream * restrict os, DEFLATE_MAX_EXTRA_OFFSET_BITS)) deflate_flush_bits(os); - /* Extra offset bits */ - deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol], + /* Extra offset bits */ + deflate_add_bits(os, seq->offset - + deflate_offset_slot_base[offset_symbol], deflate_extra_offset_bits[offset_symbol]); deflate_flush_bits(os); @@ -1741,7 +1803,8 @@ deflate_write_item_list(struct deflate_output_bitstream *os, u32 block_length) { struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; - struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length]; + struct deflate_optimum_node * const end_node = + &c->p.n.optimum_nodes[block_length]; do { unsigned length = cur_node->item & OPTIMUM_LEN_MASK; unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; @@ -1750,20 +1813,23 @@ deflate_write_item_list(struct deflate_output_bitstream *os, unsigned offset_slot; if (length == 1) { - /* Literal */ + /* Literal */ litlen_symbol = offset; - deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], + deflate_add_bits(os, + codes->codewords.litlen[litlen_symbol], codes->lens.litlen[litlen_symbol]); deflate_flush_bits(os); } else { - /* Match length */ + /* Match length */ length_slot = deflate_length_slot[length]; litlen_symbol = DEFLATE_FIRST_LEN_SYM + length_slot; - deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], - codes->lens.litlen[litlen_symbol]); + deflate_add_bits(os, + codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); - deflate_add_bits(os, length - deflate_length_slot_base[length_slot], - deflate_extra_length_bits[length_slot]); + deflate_add_bits(os, + length - deflate_length_slot_base[length_slot], + deflate_extra_length_bits[length_slot]); if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + DEFLATE_MAX_EXTRA_LENGTH_BITS + @@ -1772,17 +1838,19 @@ deflate_write_item_list(struct deflate_output_bitstream *os, deflate_flush_bits(os); - /* Match offset */ + /* Match offset */ offset_slot = c->p.n.offset_slot_full[offset]; - deflate_add_bits(os, codes->codewords.offset[offset_slot], - codes->lens.offset[offset_slot]); + deflate_add_bits(os, + codes->codewords.offset[offset_slot], + 
codes->lens.offset[offset_slot]); if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + DEFLATE_MAX_EXTRA_OFFSET_BITS)) deflate_flush_bits(os); - deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot], - deflate_extra_offset_bits[offset_slot]); + deflate_add_bits(os, + offset - deflate_offset_slot_base[offset_slot], + deflate_extra_offset_bits[offset_slot]); deflate_flush_bits(os); } @@ -1791,7 +1859,7 @@ deflate_write_item_list(struct deflate_output_bitstream *os, } #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ -/* Output the end-of-block symbol. */ +/* Output the end-of-block symbol. */ static void deflate_write_end_of_block(struct deflate_output_bitstream *os, const struct deflate_codes *codes) @@ -1868,7 +1936,7 @@ deflate_flush_block(struct libdeflate_compressor * restrict c, /* Build dynamic Huffman codes. */ deflate_make_huffman_codes(&c->freqs, &c->codes); - } /* Else, this was already done */ + } /* Else, this was already done. */ /* Account for the cost of sending dynamic Huffman codes. */ deflate_precompute_huffman_header(c); @@ -1899,6 +1967,7 @@ deflate_flush_block(struct libdeflate_compressor * restrict c, sym++) { u32 extra = deflate_extra_length_bits[ sym - DEFLATE_FIRST_LEN_SYM]; + dynamic_cost += c->freqs.litlen[sym] * (extra + c->codes.lens.litlen[sym]); static_cost += c->freqs.litlen[sym] * @@ -1933,9 +2002,11 @@ deflate_flush_block(struct libdeflate_compressor * restrict c, /* Now actually output the block. */ if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { - /* Note: the length being flushed may exceed the maximum length + /* + * Note: the length being flushed may exceed the maximum length * of an uncompressed block (65535 bytes). Therefore, more than - * one uncompressed block might be needed. */ + * one uncompressed block might be needed. + */ deflate_write_uncompressed_blocks(os, block_begin, block_length, is_final_block); } else { @@ -2048,8 +2119,10 @@ init_block_split_stats(struct block_split_stats *stats) stats->num_observations = 0; } -/* Literal observation. Heuristic: use the top 2 bits and low 1 bits of the - * literal, for 8 possible literal observation types. */ +/* + * Literal observation. Heuristic: use the top 2 bits and low 1 bits of the + * literal, for 8 possible literal observation types. + */ static forceinline void observe_literal(struct block_split_stats *stats, u8 lit) { @@ -2057,12 +2130,15 @@ observe_literal(struct block_split_stats *stats, u8 lit) stats->num_new_observations++; } -/* Match observation. Heuristic: use one observation type for "short match" and - * one observation type for "long match". */ +/* + * Match observation. Heuristic: use one observation type for "short match" and + * one observation type for "long match". + */ static forceinline void observe_match(struct block_split_stats *stats, unsigned length) { - stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++; + stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + + (length >= 9)]++; stats->num_new_observations++; } @@ -2083,23 +2159,30 @@ static bool do_end_block_check(struct block_split_stats *stats, u32 block_length) { if (stats->num_observations > 0) { - /* Note: to avoid slow divisions, we do not divide by + /* + * Note: to avoid slow divisions, we do not divide by * 'num_observations', but rather do all math with the numbers - * multiplied by 'num_observations'. */ + * multiplied by 'num_observations'. 
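+		 * multiplied by 'num_observations'.
+		 *
+		 * Worked example (made-up counts): with num_observations =
+		 * 1024 and num_new_observations = 512, a type counted 200
+		 * times in the merged stats and 50 times in the new batch
+		 * gives expected = 200 * 512 = 102400 and actual =
+		 * 50 * 1024 = 51200, contributing |51200 - 102400| = 51200
+		 * to 'total_delta'. That is exactly the rate comparison
+		 * 50/512 vs. 200/1024, scaled by (512 * 1024).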
@@ -2083,23 +2159,30 @@ static bool
 do_end_block_check(struct block_split_stats *stats, u32 block_length)
 {
 	if (stats->num_observations > 0) {
-		/* Note: to avoid slow divisions, we do not divide by
+		/*
+		 * Note: to avoid slow divisions, we do not divide by
 		 * 'num_observations', but rather do all math with the numbers
-		 * multiplied by 'num_observations'. */
+		 * multiplied by 'num_observations'.
+		 */
 		u32 total_delta = 0;
 		int i;

 		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
-			u32 expected = stats->observations[i] * stats->num_new_observations;
-			u32 actual = stats->new_observations[i] * stats->num_observations;
+			u32 expected = stats->observations[i] *
+				       stats->num_new_observations;
+			u32 actual = stats->new_observations[i] *
+				     stats->num_observations;
 			u32 delta = (actual > expected) ? actual - expected :
 							  expected - actual;
+
 			total_delta += delta;
 		}

 		/* Ready to end the block? */
-		if (total_delta + (block_length / 4096) * stats->num_observations >=
-		    NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
+		if (total_delta +
+		    (block_length / 4096) * stats->num_observations >=
+		    NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 *
+		    stats->num_observations)
 			return true;
 	}
 	merge_new_observations(stats);
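/*
 * Editor's aside (illustration only, not part of the patch): why the scaled
 * comparison above is division-free. The test wants the summed difference
 * between the historical rate observations[i]/N and the recent rate
 * new_observations[i]/M; multiplying everything by N*M keeps the math in
 * integers. The toy values below are invented to show the equivalence.
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

int main(void)
{
	unsigned obs[4] = { 400, 300, 200, 100 };	/* historical, N = 1000 */
	unsigned new_obs[4] = { 10, 20, 30, 40 };	/* recent, M = 100 */
	unsigned N = 1000, M = 100;
	unsigned long long total_delta = 0;
	double unscaled = 0.0;
	int i;

	for (i = 0; i < 4; i++) {
		long long expected = (long long)obs[i] * M;	/* old rate * N*M */
		long long actual = (long long)new_obs[i] * N;	/* new rate * N*M */

		total_delta += (unsigned long long)llabs(actual - expected);
		unscaled += fabs((double)new_obs[i] / M - (double)obs[i] / N);
	}
	/* Prints 80000 0.800000 0.800000: the same quantity, division-free. */
	printf("%llu %f %f\n", total_delta, unscaled,
	       (double)total_delta / ((double)N * M));
	return 0;
}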
@@ -2160,7 +2243,7 @@ choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth)
 		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4,
 		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
-		/* the rest is implicitly 3 */
+		/* The rest is implicitly 3. */
 	};
 	unsigned min_len;

@@ -2220,7 +2303,7 @@ recalculate_min_match_len(const struct deflate_freqs *freqs,
 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
 		literal_freq += freqs->litlen[i];

-	cutoff = literal_freq >> 10; /* Ignore literals used very rarely */
+	cutoff = literal_freq >> 10; /* Ignore literals used very rarely. */

 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
 		if (freqs->litlen[i] > cutoff)
@@ -2277,7 +2360,7 @@ deflate_compress_fastest(struct libdeflate_compressor * restrict c,

 	ht_matchfinder_init(&c->p.f.ht_mf);

 	do {
-		/* Starting a new DEFLATE block. */
+		/* Starting a new DEFLATE block */
 		const u8 * const in_block_begin = in_next;
 		const u8 * const in_max_block_end = choose_max_block_end(
@@ -2310,7 +2393,7 @@ deflate_compress_fastest(struct libdeflate_compressor * restrict c,
 							  &next_hash,
 							  &offset);
 			if (length) {
-				/* Match found. */
+				/* Match found */
 				deflate_choose_match(c, length, offset, &seq);
 				ht_matchfinder_skip_bytes(&c->p.f.ht_mf,
 							  &in_cur_base,
@@ -2320,7 +2403,7 @@ deflate_compress_fastest(struct libdeflate_compressor * restrict c,
 							  &next_hash);
 				in_next += length;
 			} else {
-				/* No match found. */
+				/* No match found */
 				deflate_choose_literal(c, *in_next++, seq);
 			}

@@ -2356,7 +2439,7 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,

 	hc_matchfinder_init(&c->p.g.hc_mf);

 	do {
-		/* Starting a new DEFLATE block. */
+		/* Starting a new DEFLATE block */
 		const u8 * const in_block_begin = in_next;
 		const u8 * const in_max_block_end = choose_max_block_end(
@@ -2389,7 +2472,7 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
 			if (length >= min_len &&
 			    (length > DEFLATE_MIN_MATCH_LEN ||
 			     offset <= 4096)) {
-				/* Match found. */
+				/* Match found */
 				deflate_choose_match(c, length, offset, &seq);
 				observe_match(&c->split_stats, length);
 				hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
@@ -2400,7 +2483,7 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
 							  next_hashes);
 				in_next += length;
 			} else {
-				/* No match found. */
+				/* No match found */
 				deflate_choose_literal(c, *in_next, seq);
 				observe_literal(&c->split_stats, *in_next);
 				in_next++;
@@ -2438,7 +2521,7 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,

 	hc_matchfinder_init(&c->p.g.hc_mf);

 	do {
-		/* Starting a new DEFLATE block. */
+		/* Starting a new DEFLATE block */
 		const u8 * const in_block_begin = in_next;
 		const u8 * const in_max_block_end = choose_max_block_end(
@@ -2496,7 +2579,7 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
 			}
 			in_next++;

-	have_cur_match:
+have_cur_match:
 			observe_match(&c->split_stats, cur_len);
 			/*
 			 * We have a match at the current position.
@@ -2671,16 +2754,18 @@ static void
 deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
 {
 	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
-	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
+	struct deflate_optimum_node *end_node =
+		&c->p.n.optimum_nodes[block_length];
+
 	do {
 		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
 		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;

 		if (length == 1) {
-			/* Literal */
+			/* Literal */
 			c->freqs.litlen[offset]++;
 		} else {
-			/* Match */
+			/* Match */
 			c->freqs.litlen[DEFLATE_FIRST_LEN_SYM +
 					deflate_length_slot[length]]++;
 			c->freqs.offset[c->p.n.offset_slot_full[offset]]++;
@@ -2692,31 +2777,37 @@ deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
 	c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
 }

-/* Set the current cost model from the codeword lengths specified in @lens. */
+/* Set the current cost model from the codeword lengths specified in @lens. */
 static void
 deflate_set_costs_from_codes(struct libdeflate_compressor *c,
 			     const struct deflate_lens *lens)
 {
 	unsigned i;

-	/* Literals */
+	/* Literals */
 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
-		u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
+		u32 bits = (lens->litlen[i] ?
+			    lens->litlen[i] : LITERAL_NOSTAT_BITS);
+
 		c->p.n.costs.literal[i] = bits * BIT_COST;
 	}

-	/* Lengths */
+	/* Lengths */
 	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
 		unsigned length_slot = deflate_length_slot[i];
 		unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot;
-		u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
+		u32 bits = (lens->litlen[litlen_sym] ?
+			    lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
+
 		bits += deflate_extra_length_bits[length_slot];
 		c->p.n.costs.length[i] = bits * BIT_COST;
 	}

-	/* Offset slots */
+	/* Offset slots */
 	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
-		u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
+		u32 bits = (lens->offset[i] ?
+			    lens->offset[i] : OFFSET_NOSTAT_BITS);
+
 		bits += deflate_extra_offset_bits[i];
 		c->p.n.costs.offset_slot[i] = bits * BIT_COST;
 	}
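/*
 * Editor's aside (illustration only, not part of the patch): the idea behind
 * deflate_set_costs_from_codes() above, in miniature. A symbol's cost is its
 * codeword length -- or a fallback guess when the symbol had zero frequency,
 * so unseen symbols stay usable in the next pass -- plus any extra bits, all
 * scaled by a fixed-point BIT_COST so later refinement can be fractional.
 * The constants here are illustrative, not libdeflate's tuned values.
 */
#include <stdio.h>

#define BIT_COST	16	/* fixed-point unit: the cost of one bit */
#define NOSTAT_BITS	13	/* assumed length for zero-frequency symbols */

int main(void)
{
	/* Codeword lengths from some Huffman code; 0 = symbol unused. */
	unsigned char lens[4] = { 3, 0, 5, 9 };
	unsigned extra_bits[4] = { 0, 0, 1, 2 };  /* e.g. length-slot extras */
	unsigned i;

	for (i = 0; i < 4; i++) {
		unsigned bits = (lens[i] ? lens[i] : NOSTAT_BITS) +
				extra_bits[i];

		printf("symbol %u: %2u bits -> cost %3u\n",
		       i, bits, bits * BIT_COST);
	}
	return 0;
}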
@@ -2883,7 +2974,7 @@ deflate_choose_default_litlen_costs(struct libdeflate_compressor *c,
 	unsigned i;

 	/* Calculate the number of distinct literals that exist in the data. */
-	cutoff = literal_freq >> 11; /* Ignore literals used very rarely */
+	cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */
 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
 		if (c->freqs.litlen[i] > cutoff)
 			num_used_literals++;
@@ -2951,16 +3042,16 @@ deflate_set_default_costs(struct libdeflate_compressor *c,
 {
 	unsigned i;

-	/* Literals */
+	/* Literals */
 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
 		c->p.n.costs.literal[i] = lit_cost;

-	/* Lengths */
+	/* Lengths */
 	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
 		c->p.n.costs.length[i] =
 			deflate_default_length_cost(i, len_sym_cost);

-	/* Offset slots */
+	/* Offset slots */
 	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
 		c->p.n.costs.offset_slot[i] =
 			deflate_default_offset_slot_cost(i);
@@ -2987,19 +3078,19 @@ deflate_adjust_costs_impl(struct libdeflate_compressor *c,
 {
 	unsigned i;

-	/* Literals */
+	/* Literals */
 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
 		deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost,
 				    change_amount);

-	/* Lengths */
+	/* Lengths */
 	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
 		deflate_adjust_cost(&c->p.n.costs.length[i],
 				    deflate_default_length_cost(i, len_sym_cost),
 				    change_amount);

-	/* Offset slots */
+	/* Offset slots */
 	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
 		deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
 				    deflate_default_offset_slot_cost(i),
@@ -3073,7 +3164,8 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c,
 			   const u32 block_length,
 			   const struct lz_match *cache_ptr)
 {
-	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
+	struct deflate_optimum_node *end_node =
+		&c->p.n.optimum_nodes[block_length];
 	struct deflate_optimum_node *cur_node = end_node;

 	cur_node->cost_to_end = 0;
@@ -3088,12 +3180,12 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c,
 		num_matches = cache_ptr->length;
 		literal = cache_ptr->offset;

-		/* It's always possible to choose a literal. */
+		/* It's always possible to choose a literal. */
 		best_cost_to_end = c->p.n.costs.literal[literal] +
 				   (cur_node + 1)->cost_to_end;
 		cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;

-		/* Also consider matches if there are any. */
+		/* Also consider matches if there are any. */
 		if (num_matches) {
 			const struct lz_match *match;
 			unsigned len;
@@ -3117,14 +3209,17 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c,
 			do {
 				offset = match->offset;
 				offset_slot = c->p.n.offset_slot_full[offset];
-				offset_cost = c->p.n.costs.offset_slot[offset_slot];
+				offset_cost =
+					c->p.n.costs.offset_slot[offset_slot];
 				do {
 					cost_to_end = offset_cost +
-						c->p.n.costs.length[len] +
-						(cur_node + len)->cost_to_end;
+						      c->p.n.costs.length[len] +
+						      (cur_node + len)->cost_to_end;
 					if (cost_to_end < best_cost_to_end) {
 						best_cost_to_end = cost_to_end;
-						cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
+						cur_node->item = len |
+							((u32)offset <<
+							 OPTIMUM_OFFSET_SHIFT);
 					}
 				} while (++len <= match->length);
 			} while (++match != cache_ptr);
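/*
 * Editor's aside (illustration only, not part of the patch): the shape of
 * the backward dynamic program in deflate_find_min_cost_path() above, run on
 * a toy input. Node i holds the minimum cost to encode bytes i..n-1; walking
 * backward, each node picks the cheaper of "emit a literal" or "emit a
 * match" among the matches available there. The costs and the single
 * hard-coded match below are invented for the demonstration.
 */
#include <stdio.h>

#define N 8

int main(void)
{
	unsigned cost_to_end[N + 1];
	unsigned item[N];		/* chosen step: length 1 = literal */
	unsigned lit_cost = 9;		/* pretend every literal costs 9 bits */
	unsigned match_cost = 20;	/* one match: length 5 at position 2 */
	int i;

	cost_to_end[N] = 0;
	for (i = N - 1; i >= 0; i--) {
		unsigned best = lit_cost + cost_to_end[i + 1];
		unsigned len = 1;

		if (i == 2 && match_cost + cost_to_end[i + 5] < best) {
			best = match_cost + cost_to_end[i + 5];
			len = 5;
		}
		cost_to_end[i] = best;
		item[i] = len;
	}
	/*
	 * Walk forward along the chosen path, as deflate_write_item_list()
	 * does with the real optimum_nodes[].
	 */
	for (i = 0; i < N; i += item[i])
		printf("pos %d: %s (cost_to_end %u)\n", i,
		       item[i] == 1 ? "literal" : "match len 5",
		       cost_to_end[i]);
	return 0;
}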
@@ -3154,10 +3249,13 @@ deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
 	u32 lit_cost, len_sym_cost;
 	u32 i;

-	/* Force the block to really end at the desired length, even if some
-	 * matches extend beyond it. */
-	for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
-					ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
+	/*
+	 * Force the block to really end at the desired length, even if some
+	 * matches extend beyond it.
+	 */
+	for (i = block_length;
+	     i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
+		      ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
 		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;

 	/* Make sure the literal/match statistics are up to date. */
@@ -3192,8 +3290,9 @@ deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
 	} while (num_passes_remaining);
 }

-static void deflate_near_optimal_begin_block(struct libdeflate_compressor *c,
-					     bool is_first_block)
+static void
+deflate_near_optimal_begin_block(struct libdeflate_compressor *c,
+				 bool is_first_block)
 {
 	int i;

@@ -3212,7 +3311,7 @@ static void deflate_near_optimal_begin_block(struct libdeflate_compressor *c,
 	init_block_split_stats(&c->split_stats);

 	/*
-	 * During matchfinding, we keep track of appropximate literal and match
+	 * During matchfinding, we keep track of approximate literal and match
 	 * length frequencies for the purpose of setting the initial costs.
 	 */
 	memset(c->freqs.litlen, 0,
@@ -3252,7 +3351,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,

 	bt_matchfinder_init(&c->p.n.bt_mf);

 	do {
-		/* Starting a new DEFLATE block. */
+		/* Starting a new DEFLATE block */
 		struct lz_match *cache_ptr = c->p.n.match_cache;
 		const u8 * const in_block_begin = in_next;
@@ -3275,7 +3374,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 			unsigned best_len;
 			size_t remaining = in_end - in_next;

-			/* Slide the window forward if needed. */
+			/* Slide the window forward if needed. */
 			if (in_next == in_next_slide) {
 				bt_matchfinder_slide_window(&c->p.n.bt_mf);
 				in_cur_base = in_next;
@@ -3405,12 +3504,11 @@ deflate_init_offset_slot_full(struct libdeflate_compressor *c)
 	unsigned offset;
 	unsigned offset_end;

-	for (offset_slot = 0;
-	     offset_slot < ARRAY_LEN(deflate_offset_slot_base);
-	     offset_slot++)
-	{
+	for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base);
+	     offset_slot++) {
 		offset = deflate_offset_slot_base[offset_slot];
-		offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
+		offset_end = offset +
+			     (1 << deflate_extra_offset_bits[offset_slot]);
 		do {
 			c->p.n.offset_slot_full[offset] = offset_slot;
 		} while (++offset != offset_end);
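/*
 * Editor's aside (illustration only, not part of the patch): what
 * deflate_init_offset_slot_full() above precomputes -- a direct offset ->
 * slot lookup table, so the hot path replaces a search with a single load.
 * The base/extra-bit arithmetic below rederives RFC 1951's offset-code
 * layout; libdeflate reads the same ranges from its static tables instead.
 */
#include <stdio.h>

static unsigned char offset_slot_full[32769];	/* offsets 1..32768 */

int main(void)
{
	unsigned slot, offset = 1;

	for (slot = 0; slot < 30; slot++) {
		unsigned extra = (slot < 4) ? 0 : (slot >> 1) - 1;
		unsigned end = offset + (1u << extra);	/* one past the range */

		do {
			offset_slot_full[offset] = (unsigned char)slot;
		} while (++offset != end);
	}
	/* e.g. offset 3000 falls in slot 22 (base 2049, 10 extra bits). */
	printf("offset 1 -> slot %u\n", offset_slot_full[1]);
	printf("offset 3000 -> slot %u\n", offset_slot_full[3000]);
	printf("offset 32768 -> slot %u\n", offset_slot_full[32768]);
	return 0;
}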
@@ -3460,7 +3558,7 @@ libdeflate_alloc_compressor(int compression_level)
 		break;
 	case 1:
 		c->impl = deflate_compress_fastest;
-		/* max_search_depth is unused */
+		/* max_search_depth is unused. */
 		c->nice_match_length = 32;
 		break;
 	case 2:
@@ -3545,12 +3643,12 @@ libdeflate_deflate_compress(struct libdeflate_compressor *c,
 	if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING))
 		return 0;

-	/* For extremely small inputs just use a single uncompressed block. */
+	/* For extremely small inputs, just use a single uncompressed block. */
 	if (unlikely(in_nbytes < c->min_size_to_compress)) {
 		struct deflate_output_bitstream os;
 		deflate_init_output(&os, out, out_nbytes_avail);
 		if (in_nbytes == 0)
-			in = &os; /* Avoid passing NULL to memcpy() */
+			in = &os; /* Avoid passing NULL to memcpy(). */
 		deflate_write_uncompressed_block(&os, in, in_nbytes, true);
 		return deflate_flush_output(&os);
 	}
@@ -3580,6 +3678,8 @@ libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
 	 * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
 	 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
 	 */
-	size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
+	size_t max_num_blocks =
+		MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
+
 	return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
 }
diff --git a/lib/deflate_compress.h b/lib/deflate_compress.h
index d97d019..8bb6cb9 100644
--- a/lib/deflate_compress.h
+++ b/lib/deflate_compress.h
@@ -3,8 +3,10 @@

 #include "lib_common.h"

-/* DEFLATE compression is private to deflate_compress.c, but we do need to be
- * able to query the compression level for zlib and gzip header generation. */
+/*
+ * DEFLATE compression is private to deflate_compress.c, but we do need to be
+ * able to query the compression level for zlib and gzip header generation.
+ */
 struct libdeflate_compressor;
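/*
 * Editor's aside (illustration only, not part of the patch): how the bound
 * computed by libdeflate_deflate_compress_bound() is meant to be used --
 * size the output buffer from the bound, then compress. Per the comment
 * above, the bound is roughly in_nbytes plus 5 bytes of overhead for each
 * worst-case uncompressed block, plus padding, so compression into a buffer
 * of that size cannot fail.
 */
#include <stdio.h>
#include <stdlib.h>
#include <libdeflate.h>

int main(void)
{
	static const char in[] = "example input, compressible or not";
	struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
	size_t bound, out_nbytes;
	void *out;

	if (c == NULL)
		return 1;
	bound = libdeflate_deflate_compress_bound(c, sizeof(in));
	out = malloc(bound);
	if (out == NULL)
		return 1;
	/* Cannot return 0 here, since the buffer is at least 'bound' bytes. */
	out_nbytes = libdeflate_deflate_compress(c, in, sizeof(in), out, bound);
	printf("%zu bytes -> %zu bytes (bound was %zu)\n",
	       sizeof(in), out_nbytes, bound);
	free(out);
	libdeflate_free_compressor(c);
	return 0;
}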