From 5f3208e788f4f5497245b8b09b356ea74c4fb3b4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 22 Jan 2015 00:05:01 -0600 Subject: [PATCH] Cleanups and matchfinder updates --- CMakeLists.txt | 2 +- libdeflate.h | 22 ++- src/adler32.c | 2 +- src/adler32.h | 2 +- src/bitops.h | 18 +- src/bt_matchfinder.h | 310 +++++++++++++++++++++-------------- src/compiler-gcc.h | 10 +- src/compiler.h | 36 ++-- src/crc32.c | 8 +- src/deflate_compress.c | 110 ++++++------- src/endianness.h | 2 +- src/gzip_compress.c | 2 +- src/gzip_constants.h | 2 - src/gzip_decompress.c | 8 +- src/hc_matchfinder.h | 187 +++++++++++++-------- src/lz_extend.h | 12 +- src/lz_hash.h | 41 +++++ src/lz_hash3.h | 49 ------ src/matchfinder_avx2.h | 4 +- src/matchfinder_common.h | 6 +- src/matchfinder_nonsliding.h | 16 +- src/matchfinder_sliding.h | 10 +- src/matchfinder_sse2.h | 4 +- src/types.h | 2 +- src/unaligned.h | 35 +++- test/benchmark.c | 8 +- 26 files changed, 528 insertions(+), 380 deletions(-) create mode 100644 src/lz_hash.h delete mode 100644 src/lz_hash3.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e5757b..4ff2f1c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ install(FILES libdeflate.h DESTINATION "${CMAKE_INSTALL_PREFIX}/include") option(BUILD_BENCHMARK "Build benchmark program" OFF) add_executable(benchmark test/benchmark.c) -target_link_libraries(benchmark deflate -lz) +target_link_libraries(benchmark deflatestatic -lz) option(BUILD_GEN_CRC32_TABLE "Build CRC32 table generation program" OFF) add_executable(gen_crc32_table test/gen_crc32_table.c) diff --git a/libdeflate.h b/libdeflate.h index 1d42ed4..e5d6aa9 100644 --- a/libdeflate.h +++ b/libdeflate.h @@ -1,7 +1,9 @@ /* * libdeflate.h * - * Public header for the DEFLATE compression library. + * Public header for libdeflate. + * + * This file has no copyright assigned and is placed in the Public Domain. 
*/ #ifndef LIBDEFLATE_H @@ -26,7 +28,9 @@ struct deflate_compressor; * fastest, 6 = medium/default, 9 = slowest). The return value is a pointer to * the new DEFLATE compressor, or NULL if out of memory. * - * Note: the sliding window size is defined at compilation time (default 32768). + * Note: for compression, the sliding window size is defined at compilation time + * to 32768, the largest size permissible in the DEFLATE format. It cannot be + * changed at runtime. */ extern struct deflate_compressor * deflate_alloc_compressor(unsigned int compression_level); @@ -44,7 +48,7 @@ deflate_compress(struct deflate_compressor *compressor, void *out, size_t out_nbytes_avail); /* - * Like deflate_compress(), but store the data in the zlib wrapper format. + * Like deflate_compress(), but stores the data in the zlib wrapper format. */ extern size_t zlib_compress(struct deflate_compressor *compressor, @@ -52,7 +56,7 @@ zlib_compress(struct deflate_compressor *compressor, void *out, size_t out_nbytes_avail); /* - * Like deflate_compress(), but store the data in the gzip wrapper format. + * Like deflate_compress(), but stores the data in the gzip wrapper format. */ extern size_t gzip_compress(struct deflate_compressor *compressor, @@ -61,7 +65,8 @@ gzip_compress(struct deflate_compressor *compressor, /* * deflate_free_compressor() frees a DEFLATE compressor that was allocated with - * deflate_alloc_compressor(). + * deflate_alloc_compressor(). If a NULL pointer is passed in, no action is + * taken. */ extern void deflate_free_compressor(struct deflate_compressor *compressor); @@ -79,7 +84,9 @@ struct deflate_decompressor; * * This function takes no parameters, and the returned decompressor is valid for * decompressing data that was compressed at any compression level and with any - * sliding window size. + * sliding window size. It can also be used for any wrapper format (raw + * DEFLATE, zlib, or gzip); however, the appropriate decompression function must + * be called. 
*/ extern struct deflate_decompressor * deflate_alloc_decompressor(void); @@ -118,7 +125,8 @@ gzip_decompress(struct deflate_decompressor *decompressor, /* * deflate_free_decompressor() frees a DEFLATE decompressor that was allocated - * with deflate_alloc_decompressor(). + * with deflate_alloc_decompressor(). If a NULL pointer is passed in, no action + * is taken. */ extern void deflate_free_decompressor(struct deflate_decompressor *decompressor); diff --git a/src/adler32.c b/src/adler32.c index 13d996e..e5dc9a5 100644 --- a/src/adler32.c +++ b/src/adler32.c @@ -39,7 +39,7 @@ #define UNROLL_FACTOR 4 u32 -adler32(const u8 *buffer, size_t size) +adler32(const void *buffer, size_t size) { u32 s1 = 1; u32 s2 = 0; diff --git a/src/adler32.h b/src/adler32.h index 78c2d02..d1964bf 100644 --- a/src/adler32.h +++ b/src/adler32.h @@ -9,4 +9,4 @@ #include "types.h" extern u32 -adler32(const u8 *buffer, size_t size); +adler32(const void *buffer, size_t size); diff --git a/src/bitops.h b/src/bitops.h index 1e6f68c..97fa7a1 100644 --- a/src/bitops.h +++ b/src/bitops.h @@ -11,7 +11,8 @@ /* Find Last Set bit */ -static inline unsigned fls32(u32 v) +static inline unsigned +fls32(u32 v) { #ifdef compiler_fls32 return compiler_fls32(v); @@ -23,7 +24,8 @@ static inline unsigned fls32(u32 v) #endif } -static inline unsigned fls64(u64 v) +static inline unsigned +fls64(u64 v) { #ifdef compiler_fls64 return compiler_fls64(v); @@ -35,7 +37,8 @@ static inline unsigned fls64(u64 v) #endif } -static inline unsigned flsw(machine_word_t v) +static inline unsigned +flsw(machine_word_t v) { BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); if (WORDSIZE == 4) @@ -46,7 +49,8 @@ static inline unsigned flsw(machine_word_t v) /* Find First Set bit */ -static inline unsigned ffs32(u32 v) +static inline unsigned +ffs32(u32 v) { #ifdef compiler_ffs32 return compiler_ffs32(v); @@ -58,7 +62,8 @@ static inline unsigned ffs32(u32 v) #endif } -static inline unsigned ffs64(u64 v) +static inline unsigned 
+ffs64(u64 v) { #ifdef compiler_ffs64 return compiler_ffs64(v); @@ -70,7 +75,8 @@ static inline unsigned ffs64(u64 v) #endif } -static inline unsigned ffsw(machine_word_t v) +static inline unsigned +ffsw(machine_word_t v) { BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); if (WORDSIZE == 4) diff --git a/src/bt_matchfinder.h b/src/bt_matchfinder.h index b827044..57b8efa 100644 --- a/src/bt_matchfinder.h +++ b/src/bt_matchfinder.h @@ -1,51 +1,56 @@ /* * bt_matchfinder.h * - * This is a Binary Tree (bt) based matchfinder. + * ---------------------------------------------------------------------------- + * + * This is a Binary Trees (bt) based matchfinder. * * The data structure is a hash table where each hash bucket contains a binary - * tree of sequences, referenced by position. The sequences in the binary tree - * are ordered such that a left child is lexicographically lesser than its - * parent, and a right child is lexicographically greater than its parent. + * tree of sequences whose first 3 bytes share the same hash code. Each + * sequence is identified by its starting position in the input buffer. Each + * binary tree is always sorted such that each left child represents a sequence + * lexicographically lesser than its parent and each right child represents a + * sequence lexicographically greater than its parent. * - * For each sequence (position) in the input, the first 3 bytes are hashed and - * the the appropriate binary tree is re-rooted at that sequence (position). - * Since the sequences are inserted in order, each binary tree maintains the - * invariant that each child node has greater match offset than its parent. + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 3 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. 
Then, a new binary tree + * node is created to represent the current sequence. Then, in a single tree + * traversal, the hash bucket's binary tree is searched for matches and is + * re-rooted at the new node. * - * While inserting a sequence, we may search the binary tree for matches with - * that sequence. At each step, the length of the match is computed. The - * search ends when the sequences get too far away (outside of the sliding - * window), or when the binary tree ends (in the code this is the same check as - * "too far away"), or when 'max_search_depth' positions have been searched, or - * when a match of at least 'nice_len' bytes has been found. + * Compared to the simpler algorithm that uses linked lists instead of binary + * trees (see hc_matchfinder.h), the binary tree version gains more information + * at each node visitation. Ideally, the binary tree version will examine only + * 'log(n)' nodes to find the same matches that the linked list version will + * find by examining 'n' nodes. In addition, the binary tree version can + * examine fewer bytes at each node by taking advantage of the common prefixes + * that result from the sort order, whereas the linked list version may have to + * examine up to the full length of the match at each node. * - * Notes: + * However, it is not always best to use the binary tree version. It requires + * nearly twice as much memory as the linked list version, and it takes time to + * keep the binary trees sorted, even at positions where the compressor does not + * need matches. Generally, when doing fast compression on small buffers, + * binary trees are the wrong approach. They are best suited for thorough + * compression and/or large buffers. * - * - Typically, we need to search more nodes to find a given match in a - * binary tree versus in a linked list. However, a binary tree has more - * overhead than a linked list: it needs to be kept sorted, and the inner - * search loop is more complicated. 
As a result, binary trees are best - * suited for compression modes where the potential matches are searched - * more thoroughly. - * - * - Since no attempt is made to keep the binary trees balanced, it's - * essential to have the 'max_search_depth' cutoff. Otherwise it could - * take quadratic time to run data through the matchfinder. + * ---------------------------------------------------------------------------- */ #pragma once #include "lz_extend.h" -#include "lz_hash3.h" +#include "lz_hash.h" #include "matchfinder_common.h" -#ifndef BT_MATCHFINDER_HASH_ORDER -# if MATCHFINDER_WINDOW_ORDER < 14 -# define BT_MATCHFINDER_HASH_ORDER 14 -# else -# define BT_MATCHFINDER_HASH_ORDER 15 -# endif +#if MATCHFINDER_WINDOW_ORDER < 13 +# define BT_MATCHFINDER_HASH_ORDER 14 +#elif MATCHFINDER_WINDOW_ORDER < 15 +# define BT_MATCHFINDER_HASH_ORDER 15 +#else +# define BT_MATCHFINDER_HASH_ORDER 16 #endif #define BT_MATCHFINDER_HASH_LENGTH (1UL << BT_MATCHFINDER_HASH_ORDER) @@ -77,8 +82,37 @@ bt_matchfinder_slide_window(struct bt_matchfinder *mf) } #endif +static inline u32 +bt_matchfinder_hash_3_bytes(const u8 *in_next) +{ + return lz_hash_3_bytes(in_next, BT_MATCHFINDER_HASH_ORDER); +} + +static inline pos_t * +bt_child(struct bt_matchfinder *mf, pos_t node, int offset) +{ + if (MATCHFINDER_WINDOW_ORDER < sizeof(pos_t) * 8) { + /* no cast needed */ + return &mf->child_tab[(matchfinder_slot_for_match(node) << 1) + offset]; + } else { + return &mf->child_tab[((size_t)matchfinder_slot_for_match(node) << 1) + offset]; + } +} + +static inline pos_t * +bt_left_child(struct bt_matchfinder *mf, pos_t node) +{ + return bt_child(mf, node, 0); +} + +static inline pos_t * +bt_right_child(struct bt_matchfinder *mf, pos_t node) +{ + return bt_child(mf, node, 1); +} + /* - * Find matches with the current sequence. + * Retrieve a list of matches with the current position. * * @mf * The matchfinder structure. 
@@ -87,115 +121,131 @@ bt_matchfinder_slide_window(struct bt_matchfinder *mf) * time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_. * @in_next * Pointer to the next byte in the input buffer to process. This is the - * pointer to the bytes being matched against. + * pointer to the sequence being matched against. + * @min_len + * Only record matches that are at least this long. * @max_len - * Maximum match length to return. + * The maximum permissible match length at this position. * @nice_len * Stop searching if a match of at least this length is found. + * Must be <= @max_len. * @max_search_depth - * Limit on the number of potential matches to consider. - * @prev_hash - * TODO - * @matches - * Space to write the matches that are found. + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hash + * Pointer to the hash code for the current sequence, which was computed + * one position in advance so that the binary tree root could be + * prefetched. This is an input/output parameter. + * @best_len_ret + * The length of the longest match found is written here. (This is + * actually redundant with the 'struct lz_match' array, but this is easier + * for the compiler to optimize when inlined and the caller immediately + * does a check against 'best_len'.) + * @lz_matchptr + * An array in which this function will record the matches. The recorded + * matches will be sorted by strictly increasing length and strictly + * increasing offset. The maximum number of matches that may be found is + * 'min(nice_len, max_len) - 3 + 1'. * - * Returns the number of matches found, which may be anywhere from 0 to - * (nice_len - 3 + 1), inclusively. The matches are written to @matches in - * order of strictly increasing length and strictly increasing offset. The - * minimum match length is assumed to be 3. + * The return value is a pointer to the next available slot in the @lz_matchptr + * array. 
(If no matches were found, this will be the same as @lz_matchptr.) */ -static inline unsigned +static inline struct lz_match * bt_matchfinder_get_matches(struct bt_matchfinder * const restrict mf, const u8 * const in_base, const u8 * const in_next, + const unsigned min_len, const unsigned max_len, const unsigned nice_len, const unsigned max_search_depth, - unsigned long *prev_hash, - struct lz_match * const restrict matches) + u32 * restrict next_hash, + unsigned * restrict best_len_ret, + struct lz_match * restrict lz_matchptr) { - struct lz_match *lz_matchptr = matches; unsigned depth_remaining = max_search_depth; - unsigned hash; - pos_t cur_match; + u32 hash; + pos_t cur_node; const u8 *matchptr; - unsigned best_len; pos_t *pending_lt_ptr, *pending_gt_ptr; unsigned best_lt_len, best_gt_len; unsigned len; - pos_t *children; + unsigned best_len = min_len - 1; - if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES + 1)) - return 0; + if (unlikely(max_len < LZ_HASH3_REQUIRED_NBYTES + 1)) { + *best_len_ret = best_len; + return lz_matchptr; + } - hash = *prev_hash; - *prev_hash = lz_hash3(in_next + 1, BT_MATCHFINDER_HASH_ORDER); - prefetch(&mf->hash_tab[*prev_hash]); - cur_match = mf->hash_tab[hash]; + hash = *next_hash; + *next_hash = bt_matchfinder_hash_3_bytes(in_next + 1); + cur_node = mf->hash_tab[hash]; mf->hash_tab[hash] = in_next - in_base; + prefetch(&mf->hash_tab[*next_hash]); - best_len = 2; - pending_lt_ptr = &mf->child_tab[(in_next - in_base) << 1]; - pending_gt_ptr = &mf->child_tab[((in_next - in_base) << 1) + 1]; + pending_lt_ptr = bt_left_child(mf, in_next - in_base); + pending_gt_ptr = bt_right_child(mf, in_next - in_base); best_lt_len = 0; best_gt_len = 0; + len = 0; + + if (!matchfinder_node_valid(cur_node, in_base, in_next)) { + *pending_lt_ptr = MATCHFINDER_NULL; + *pending_gt_ptr = MATCHFINDER_NULL; + *best_len_ret = best_len; + return lz_matchptr; + } + for (;;) { - if (!matchfinder_match_in_window(cur_match, - in_base, in_next) || - 
!depth_remaining--) - { - *pending_lt_ptr = MATCHFINDER_INITVAL; - *pending_gt_ptr = MATCHFINDER_INITVAL; - return lz_matchptr - matches; - } - - matchptr = &in_base[cur_match]; - len = min(best_lt_len, best_gt_len); - - children = &mf->child_tab[(unsigned long) - matchfinder_slot_for_match(cur_match) << 1]; + matchptr = &in_base[cur_node]; if (matchptr[len] == in_next[len]) { - len = lz_extend(in_next, matchptr, len + 1, max_len); - if (len > best_len) { best_len = len; - lz_matchptr->length = len; lz_matchptr->offset = in_next - matchptr; lz_matchptr++; - if (len >= nice_len) { - *pending_lt_ptr = children[0]; - *pending_gt_ptr = children[1]; - return lz_matchptr - matches; + *pending_lt_ptr = *bt_left_child(mf, cur_node); + *pending_gt_ptr = *bt_right_child(mf, cur_node); + *best_len_ret = best_len; + return lz_matchptr; } } } if (matchptr[len] < in_next[len]) { - *pending_lt_ptr = cur_match; - pending_lt_ptr = &children[1]; - cur_match = *pending_lt_ptr; + *pending_lt_ptr = cur_node; + pending_lt_ptr = bt_right_child(mf, cur_node); + cur_node = *pending_lt_ptr; best_lt_len = len; + if (best_gt_len < len) + len = best_gt_len; } else { - *pending_gt_ptr = cur_match; - pending_gt_ptr = &children[0]; - cur_match = *pending_gt_ptr; + *pending_gt_ptr = cur_node; + pending_gt_ptr = bt_left_child(mf, cur_node); + cur_node = *pending_gt_ptr; best_gt_len = len; + if (best_lt_len < len) + len = best_lt_len; + } + + if (!matchfinder_node_valid(cur_node, in_base, in_next) || !--depth_remaining) { + *pending_lt_ptr = MATCHFINDER_NULL; + *pending_gt_ptr = MATCHFINDER_NULL; + *best_len_ret = best_len; + return lz_matchptr; } } } /* - * Advance the match-finder, but don't search for matches. + * Advance the matchfinder, but don't record any matches. * * @mf * The matchfinder structure. * @in_base * Pointer to the next byte in the input buffer to process _at the last - * time bc_matchfinder_init() or bc_matchfinder_slide_window() was called_. 
+ * time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_. * @in_next * Pointer to the next byte in the input buffer to process. * @in_end @@ -204,8 +254,14 @@ bt_matchfinder_get_matches(struct bt_matchfinder * const restrict mf, * Stop searching if a match of at least this length is found. * @max_search_depth * Limit on the number of potential matches to consider. - * @prev_hash - * TODO + * @next_hash + * Pointer to the hash code for the current sequence, which was computed + * one position in advance so that the binary tree root could be + * prefetched. This is an input/output parameter. + * + * Note: this is very similar to bt_matchfinder_get_matches() because both + * functions must do hashing and tree re-rooting. This version just doesn't + * actually record any matches. */ static inline void bt_matchfinder_skip_position(struct bt_matchfinder * const restrict mf, @@ -214,66 +270,70 @@ bt_matchfinder_skip_position(struct bt_matchfinder * const restrict mf, const u8 * const in_end, const unsigned nice_len, const unsigned max_search_depth, - unsigned long *prev_hash) + u32 * restrict next_hash) { unsigned depth_remaining = max_search_depth; - unsigned hash; - pos_t cur_match; + u32 hash; + pos_t cur_node; const u8 *matchptr; pos_t *pending_lt_ptr, *pending_gt_ptr; unsigned best_lt_len, best_gt_len; unsigned len; - pos_t *children; - if (unlikely(in_end - in_next < LZ_HASH_REQUIRED_NBYTES + 1)) + if (unlikely(in_end - in_next < LZ_HASH3_REQUIRED_NBYTES + 1)) return; - hash = *prev_hash; - *prev_hash = lz_hash3(in_next + 1, BT_MATCHFINDER_HASH_ORDER); - prefetch(&mf->hash_tab[*prev_hash]); - cur_match = mf->hash_tab[hash]; + hash = *next_hash; + *next_hash = bt_matchfinder_hash_3_bytes(in_next + 1); + cur_node = mf->hash_tab[hash]; mf->hash_tab[hash] = in_next - in_base; + prefetch(&mf->hash_tab[*next_hash]); depth_remaining = max_search_depth; - pending_lt_ptr = &mf->child_tab[(in_next - in_base) << 1]; - pending_gt_ptr = 
&mf->child_tab[((in_next - in_base) << 1) + 1]; + pending_lt_ptr = bt_left_child(mf, in_next - in_base); + pending_gt_ptr = bt_right_child(mf, in_next - in_base); best_lt_len = 0; best_gt_len = 0; + len = 0; + + if (!matchfinder_node_valid(cur_node, in_base, in_next)) { + *pending_lt_ptr = MATCHFINDER_NULL; + *pending_gt_ptr = MATCHFINDER_NULL; + return; + } + for (;;) { - if (!matchfinder_match_in_window(cur_match, - in_base, in_next) || - !depth_remaining--) - { - *pending_lt_ptr = MATCHFINDER_INITVAL; - *pending_gt_ptr = MATCHFINDER_INITVAL; - return; - } - - matchptr = &in_base[cur_match]; - len = min(best_lt_len, best_gt_len); - - children = &mf->child_tab[(unsigned long) - matchfinder_slot_for_match(cur_match) << 1]; + matchptr = &in_base[cur_node]; if (matchptr[len] == in_next[len]) { len = lz_extend(in_next, matchptr, len + 1, nice_len); if (len == nice_len) { - *pending_lt_ptr = children[0]; - *pending_gt_ptr = children[1]; + *pending_lt_ptr = *bt_left_child(mf, cur_node); + *pending_gt_ptr = *bt_right_child(mf, cur_node); return; } } if (matchptr[len] < in_next[len]) { - *pending_lt_ptr = cur_match; - pending_lt_ptr = &children[1]; - cur_match = *pending_lt_ptr; + *pending_lt_ptr = cur_node; + pending_lt_ptr = bt_right_child(mf, cur_node); + cur_node = *pending_lt_ptr; best_lt_len = len; + if (best_gt_len < len) + len = best_gt_len; } else { - *pending_gt_ptr = cur_match; - pending_gt_ptr = &children[0]; - cur_match = *pending_gt_ptr; + *pending_gt_ptr = cur_node; + pending_gt_ptr = bt_left_child(mf, cur_node); + cur_node = *pending_gt_ptr; best_gt_len = len; + if (best_lt_len < len) + len = best_lt_len; + } + + if (!matchfinder_node_valid(cur_node, in_base, in_next) || !--depth_remaining) { + *pending_lt_ptr = MATCHFINDER_NULL; + *pending_gt_ptr = MATCHFINDER_NULL; + return; } } } diff --git a/src/compiler-gcc.h b/src/compiler-gcc.h index b9e1869..e7bfef0 100644 --- a/src/compiler-gcc.h +++ b/src/compiler-gcc.h @@ -35,7 +35,7 @@ #define max(a, b) ({ 
__typeof__(a) _a = (a); __typeof__(b) _b = (b); \ (_a > _b) ? _a : _b; }) -#define swap(a, b) ({ __typeof__(a) _a = a; (a) = (b); (b) = _a; }) +#define swap(a, b) ({ __typeof__(a) _a = (a); (a) = (b); (b) = _a; }) #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) # define compiler_bswap32 __builtin_bswap32 @@ -46,7 +46,7 @@ # define compiler_bswap16 __builtin_bswap16 #endif -#define compiler_fls32(n) (31 - __builtin_clz(n)) -#define compiler_fls64(n) (63 - __builtin_clzll(n)) -#define compiler_ffs32(n) __builtin_ctz(n) -#define compiler_ffs64(n) __builtin_ctzll(n) +#define compiler_fls32(n) (31 - __builtin_clz(n)) +#define compiler_fls64(n) (63 - __builtin_clzll(n)) +#define compiler_ffs32(n) __builtin_ctz(n) +#define compiler_ffs64(n) __builtin_ctzll(n) diff --git a/src/compiler.h b/src/compiler.h index 95d2bc3..f48dbb5 100644 --- a/src/compiler.h +++ b/src/compiler.h @@ -9,37 +9,34 @@ #ifdef __GNUC__ # include "compiler-gcc.h" #else -# warning "Unrecognized compiler. Please add a header file for your compiler." +# error "Unrecognized compiler. Please add a header file for your compiler." 
#endif #ifndef LIBEXPORT # define LIBEXPORT #endif -#ifndef BUILD_BUG_ON -# define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) -#endif - -#ifndef likely -# define likely(expr) (expr) -#endif - -#ifndef unlikely -# define unlikely(expr) (expr) -#endif - -#ifndef prefetch -# define prefetch(addr) +#ifndef _packed_attribute +# error "missing required definition of _packed_attribute" #endif #ifndef _aligned_attribute # error "missing required definition of _aligned_attribute" #endif -#ifndef _packed_attribute -# error "missing required definition of _packed_attribute" +#ifndef likely +# define likely(expr) (expr) #endif +#ifndef unlikely +# define unlikely(expr) (expr) +#endif + +#ifndef prefetch +# define prefetch(addr) +#endif + + #ifndef CPU_IS_BIG_ENDIAN # error "missing required endianness definition" #endif @@ -47,7 +44,6 @@ #define CPU_IS_LITTLE_ENDIAN (!CPU_IS_BIG_ENDIAN) #ifndef UNALIGNED_ACCESS_SPEED -# warning "assuming unaligned accesses are not allowed" # define UNALIGNED_ACCESS_SPEED 0 #endif @@ -58,3 +54,7 @@ #if !defined(min) || !defined(max) || !defined(swap) # error "missing required definitions of min(), max(), and swap() macros" #endif + +#ifndef BUILD_BUG_ON +# define BUILD_BUG_ON(expr) ((void)sizeof(char[1 - 2*!!(expr)])) +#endif diff --git a/src/crc32.c b/src/crc32.c index 0db4b5e..ae03c79 100644 --- a/src/crc32.c +++ b/src/crc32.c @@ -71,9 +71,9 @@ * else * multiple = 0; * - * remainder >>= 1; - * remainder |= (u32)bit << 31; - * remainder ^= multiple; + * remainder >>= 1; + * remainder |= (u32)bit << 31; + * remainder ^= multiple; * } * * return ~remainder; @@ -108,7 +108,7 @@ * multiple = divisor; * else * multiple = 0; - * remainder >>= 1; + * remainder >>= 1; * remainder ^= multiple; * } * diff --git a/src/deflate_compress.c b/src/deflate_compress.c index e566625..ae0d176 100644 --- a/src/deflate_compress.c +++ b/src/deflate_compress.c @@ -1961,9 +1961,7 @@ deflate_compress_near_optimal(struct deflate_compressor * 
restrict c, struct lz_match *cache_end; const u8 *in_block_begin; const u8 *in_block_end; - unsigned num_matches; - unsigned best_len; - unsigned long prev_hash = 0; + u32 next_hash = 0; deflate_init_output(&os, out, out_nbytes_avail); deflate_reset_symbol_frequencies(c); @@ -1991,6 +1989,9 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c, /* Find all match possibilities in this block. */ do { + struct lz_match *matches; + unsigned best_len; + /* Decrease the maximum and nice match lengths if we're * approaching the end of the input buffer. */ if (unlikely(max_len > in_end - in_next)) { @@ -2028,71 +2029,68 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c, * search for matches at almost all positions, so this * advantage of hash chains is negated. */ - num_matches = + matches = cache_ptr; + cache_ptr = bt_matchfinder_get_matches(&c->bt_mf, in_cur_base, in_next, + DEFLATE_MIN_MATCH_LEN, max_len, nice_len, c->max_search_depth, - &prev_hash, + &next_hash, + &best_len, cache_ptr); - cache_ptr += num_matches; - cache_ptr->length = num_matches; + cache_ptr->length = cache_ptr - matches; cache_ptr->offset = *in_next; in_next++; cache_ptr++; - if (num_matches) { - best_len = cache_ptr[-2].length; + /* + * If there was a very long match found, don't cache any + * matches for the bytes covered by that match. This + * avoids degenerate behavior when compressing highly + * redundant data, where the number of matches can be + * very large. + * + * This heuristic doesn't actually hurt the compression + * ratio very much. If there's a long match, then the + * data must be highly compressible, so it doesn't + * matter much what we do. + * + * We also trigger this same case when approaching the + * desired end of the block. This forces the block to + * reach a "stopping point" where there are no matches + * extending to later positions. (XXX: this behavior is + * non-optimal and should be improved.) 
+ */ + if (best_len >= DEFLATE_MIN_MATCH_LEN && + best_len >= min(nice_len, in_block_end - in_next)) { + --best_len; + do { + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = min(max_len, nice_len); + } + if (in_next == in_next_slide) { + bt_matchfinder_slide_window(&c->bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + min(in_end - in_next, + MATCHFINDER_WINDOW_SIZE); + } + bt_matchfinder_skip_position(&c->bt_mf, + in_cur_base, + in_next, + in_end, + nice_len, + c->max_search_depth, + &next_hash); - /* - * If there was a very long match found, don't - * cache any matches for the bytes covered by - * that match. This avoids degenerate behavior - * when compressing highly redundant data, where - * the number of matches can be very large. - * - * This heuristic doesn't actually hurt the - * compression ratio very much. If there's a - * long match, then the data must be highly - * compressible, so it doesn't matter much what - * we do. - * - * We also trigger this same case when - * approaching the desired end of the block. - * This forces the block to reach a "stopping - * point" where there are no matches extending - * to later positions. (XXX: this behavior is - * non-optimal and should be improved.) 
- */ - if (best_len >= min(nice_len, in_block_end - in_next)) { - --best_len; - do { - if (unlikely(max_len > in_end - in_next)) { - max_len = in_end - in_next; - nice_len = min(max_len, nice_len); - } - if (in_next == in_next_slide) { - bt_matchfinder_slide_window(&c->bt_mf); - in_cur_base = in_next; - in_next_slide = in_next + min(in_end - in_next, - MATCHFINDER_WINDOW_SIZE); - } - bt_matchfinder_skip_position(&c->bt_mf, - in_cur_base, - in_next, - in_end, - nice_len, - c->max_search_depth, - &prev_hash); - - cache_ptr->length = 0; - cache_ptr->offset = *in_next; - in_next++; - cache_ptr++; - } while (--best_len); - } + cache_ptr->length = 0; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + } while (--best_len); } } while (in_next < in_block_end); diff --git a/src/endianness.h b/src/endianness.h index 41cfdf6..7f1e9a1 100644 --- a/src/endianness.h +++ b/src/endianness.h @@ -1,7 +1,7 @@ /* * endianness.h * - * Inline functions for endianness conversion. + * Macros and inline functions for endianness conversion. 
 */ #pragma once diff --git a/src/gzip_compress.c b/src/gzip_compress.c index c3c626d..acf462f 100644 --- a/src/gzip_compress.c +++ b/src/gzip_compress.c @@ -57,7 +57,7 @@ gzip_compress(struct deflate_compressor *c, const void *in, size_t in_size, out_next += 4; /* ISIZE */ - put_unaligned_u32_le(in_size, out_next); + put_unaligned_u32_le((u32)in_size, out_next); out_next += 4; return out_next - (u8 *)out; diff --git a/src/gzip_constants.h b/src/gzip_constants.h index 0041857..87df327 100644 --- a/src/gzip_constants.h +++ b/src/gzip_constants.h @@ -6,8 +6,6 @@ #pragma once -#include "compiler.h" - #define GZIP_MIN_HEADER_SIZE 10 #define GZIP_FOOTER_SIZE 8 #define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) diff --git a/src/gzip_decompress.c b/src/gzip_decompress.c index c9ced0c..ac4fcd3 100644 --- a/src/gzip_decompress.c +++ b/src/gzip_decompress.c @@ -54,20 +54,16 @@ gzip_decompress(struct deflate_decompressor *d, /* Original file name (zero terminated) */ if (flg & GZIP_FNAME) { - while (*in_next != 0 && ++in_next != in_end) + while (*in_next++ != 0 && in_next != in_end) ; - if (in_next != in_end) - in_next++; if (in_end - in_next < GZIP_FOOTER_SIZE) return false; } /* File comment (zero terminated) */ if (flg & GZIP_FCOMMENT) { - while (*in_next != 0 && ++in_next != in_end) + while (*in_next++ != 0 && in_next != in_end) ; - if (in_next != in_end) - in_next++; if (in_end - in_next < GZIP_FOOTER_SIZE) return false; } diff --git a/src/hc_matchfinder.h b/src/hc_matchfinder.h index 67cb746..7b1d8bd 100644 --- a/src/hc_matchfinder.h +++ b/src/hc_matchfinder.h @@ -1,37 +1,102 @@ /* * hc_matchfinder.h * - * This is a Hash Chain (hc) based matchfinder. + * --------------------------------------------------------------------------- + * + * Algorithm + * + * This is a Hash Chains (hc) based matchfinder. * * The data structure is a hash table where each hash bucket contains a linked 
+ * list (or "chain") of sequences whose first 3 bytes share the same hash code. + * Each sequence is identified by its starting position in the input buffer. * - * For each sequence (position) in the input, the first 3 bytes are hashed and - * that sequence (position) is prepended to the appropriate linked list in the - * hash table. Since the sequences are inserted in order, each list is always - * sorted by increasing match offset. + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 3 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, this hash + * bucket's linked list is searched for matches. Then, a new linked list node + * is created to represent the current sequence and is prepended to the list. * - * At the same time as inserting a sequence, we may search the linked list for - * matches with that sequence. At each step, the length of the match is - * computed. The search ends when the sequences get too far away (outside of - * the sliding window), or when the list ends (in the code this is the same - * check as "too far away"), or when 'max_search_depth' positions have been - * searched, or when a match of at least 'nice_len' bytes has been found. + * This algorithm has several useful properties: + * + * - It only finds true Lempel-Ziv matches; i.e., those where the matching + * sequence occurs prior to the sequence being matched against. + * + * - The sequences in each linked list are always sorted by decreasing starting + * position. Therefore, the closest (smallest offset) matches are found + * first, which in many compression formats tend to be the cheapest to encode. 
+ * + * - Although fast running time is not guaranteed due to the possibility of the + * lists getting very long, the worst degenerate behavior can be easily + * prevented by capping the number of nodes searched at each position. + * + * - If the compressor decides not to search for matches at a certain position, + * then that position can be quickly inserted without searching the list. + * + * - The algorithm is adaptable to sliding windows: just store the positions + * relative to a "base" value that is updated from time to time, and stop + * searching each list when the sequences get too far away. + * + * --------------------------------------------------------------------------- + * + * Notes on usage + * + * You must define MATCHFINDER_WINDOW_ORDER before including this header because + * that determines which integer type to use for positions. Since 16-bit + * integers are faster than 32-bit integers due to reduced memory usage (and + * therefore reduced cache pressure), the code only uses 32-bit integers if they + * are needed to represent all possible positions. + * + * In addition, you must allocate the 'struct hc_matchfinder' on a + * MATCHFINDER_ALIGNMENT-aligned boundary. + * + * ---------------------------------------------------------------------------- + * + * Optimizations + * + * The longest_match() and skip_positions() functions are inlined into the + * compressors that use them. This isn't just about saving the overhead of a + * function call. These functions are intended to be called from the inner + * loops of compressors, where giving the compiler more control over register + * allocation is very helpful. There is also significant benefit to be gained + * from allowing the CPU to predict branches independently at each call site. 
+ * For example, "lazy"-style compressors can be written with two calls to + * longest_match(), each of which starts with a different 'best_len' and + * therefore has significantly different performance characteristics. + * + * Although any hash function can be used, a multiplicative hash is fast and + * works well. + * + * On some processors, it is significantly faster to extend matches by whole + * words (32 or 64 bits) instead of by individual bytes. For this to be the + * case, the processor must implement unaligned memory accesses efficiently and + * must have either a fast "find first set bit" instruction or a fast "find last + * set bit" instruction, depending on the processor's endianness. + * + * The code uses one loop for finding the first match and one loop for finding a + * longer match. Each of these loops is tuned for its respective task and in + * combination are faster than a single generalized loop that handles both + * tasks. + * + * The code also uses a tight inner loop that only compares the last and first + * bytes of a potential match. It is only when these bytes match that a full + * match extension is attempted. + * + * ---------------------------------------------------------------------------- */ #pragma once #include "lz_extend.h" -#include "lz_hash3.h" +#include "lz_hash.h" #include "matchfinder_common.h" #include "unaligned.h" -#ifndef HC_MATCHFINDER_HASH_ORDER -# if MATCHFINDER_WINDOW_ORDER < 14 -# define HC_MATCHFINDER_HASH_ORDER 14 -# else -# define HC_MATCHFINDER_HASH_ORDER 15 -# endif +#if MATCHFINDER_WINDOW_ORDER < 14 +# define HC_MATCHFINDER_HASH_ORDER 14 +#else +# define HC_MATCHFINDER_HASH_ORDER 15 #endif #define HC_MATCHFINDER_HASH_LENGTH (1UL << HC_MATCHFINDER_HASH_ORDER) @@ -73,17 +138,18 @@ hc_matchfinder_slide_window(struct hc_matchfinder *mf) * time hc_matchfinder_init() or hc_matchfinder_slide_window() was called_. * @in_next * Pointer to the next byte in the input buffer to process. 
This is the - * pointer to the bytes being matched against. + * pointer to the sequence being matched against. * @best_len - * Require a match at least this long. + * Require a match longer than this length. * @max_len - * Maximum match length to return. + * The maximum permissible match length at this position. * @nice_len * Stop searching if a match of at least this length is found. + * Must be <= @max_len. * @max_search_depth - * Limit on the number of potential matches to consider. + * Limit on the number of potential matches to consider. Must be >= 1. * @offset_ret - * The match offset is returned here. + * If a match is found, its offset is returned in this location. * * Return the length of the match found, or 'best_len' if no match longer than * 'best_len' was found. @@ -102,61 +168,57 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, const u8 *best_matchptr = best_matchptr; /* uninitialized */ const u8 *matchptr; unsigned len; - unsigned hash; - pos_t cur_match; u32 first_3_bytes; + u32 hash; + pos_t cur_node; - /* Insert the current sequence into the appropriate hash chain. */ - if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES)) + /* Insert the current sequence into the appropriate linked list. */ + if (unlikely(max_len < LOAD_U24_REQUIRED_NBYTES)) goto out; first_3_bytes = load_u24_unaligned(in_next); - hash = lz_hash3_u24(first_3_bytes, HC_MATCHFINDER_HASH_ORDER); - cur_match = mf->hash_tab[hash]; - mf->next_tab[in_next - in_base] = cur_match; + hash = lz_hash(first_3_bytes, HC_MATCHFINDER_HASH_ORDER); + cur_node = mf->hash_tab[hash]; + mf->next_tab[in_next - in_base] = cur_node; mf->hash_tab[hash] = in_next - in_base; if (unlikely(best_len >= max_len)) goto out; - /* Search the appropriate hash chain for matches. */ + /* Search the appropriate linked list for matches. 
*/ - if (!(matchfinder_match_in_window(cur_match, in_base, in_next))) + if (!(matchfinder_node_valid(cur_node, in_base, in_next))) goto out; if (best_len < 3) { for (;;) { /* No length 3 match found yet. * Check the first 3 bytes. */ - matchptr = &in_base[cur_match]; + matchptr = &in_base[cur_node]; if (load_u24_unaligned(matchptr) == first_3_bytes) break; - /* Not a match; keep trying. */ - cur_match = mf->next_tab[ - matchfinder_slot_for_match(cur_match)]; - if (!matchfinder_match_in_window(cur_match, - in_base, in_next)) - goto out; - if (!--depth_remaining) + /* The first 3 bytes did not match. Keep trying. */ + cur_node = mf->next_tab[ + matchfinder_slot_for_match(cur_node)]; + if (!matchfinder_node_valid(cur_node, in_base, in_next) || + !--depth_remaining) goto out; } - /* Found a length 3 match. */ + /* Found a match of length >= 3. Extend it to its full length. */ best_matchptr = matchptr; best_len = lz_extend(in_next, best_matchptr, 3, max_len); if (best_len >= nice_len) goto out; - cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; - if (!matchfinder_match_in_window(cur_match, in_base, in_next)) - goto out; - if (!--depth_remaining) + cur_node = mf->next_tab[matchfinder_slot_for_match(cur_node)]; + if (!matchfinder_node_valid(cur_node, in_base, in_next) || !--depth_remaining) goto out; } for (;;) { for (;;) { - matchptr = &in_base[cur_match]; + matchptr = &in_base[cur_node]; /* Already found a length 3 match. Try for a longer match; * start by checking the last 2 bytes and the first 4 bytes. 
*/ @@ -170,17 +232,16 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, #endif break; - cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; - if (!matchfinder_match_in_window(cur_match, in_base, in_next)) - goto out; - if (!--depth_remaining) + cur_node = mf->next_tab[matchfinder_slot_for_match(cur_node)]; + if (!matchfinder_node_valid(cur_node, in_base, in_next) || !--depth_remaining) goto out; } - if (UNALIGNED_ACCESS_IS_FAST) - len = 4; - else - len = 0; + #if UNALIGNED_ACCESS_IS_FAST + len = 4; + #else + len = 0; + #endif len = lz_extend(in_next, matchptr, len, max_len); if (len > best_len) { best_len = len; @@ -188,10 +249,8 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, if (best_len >= nice_len) goto out; } - cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; - if (!matchfinder_match_in_window(cur_match, in_base, in_next)) - goto out; - if (!--depth_remaining) + cur_node = mf->next_tab[matchfinder_slot_for_match(cur_node)]; + if (!matchfinder_node_valid(cur_node, in_base, in_next) || !--depth_remaining) goto out; } out: @@ -200,7 +259,7 @@ out: } /* - * Advance the match-finder, but don't search for matches. + * Advance the matchfinder, but don't search for matches. * * @mf * The matchfinder structure. @@ -212,7 +271,7 @@ out: * @in_end * Pointer to the end of the input buffer. * @count - * Number of bytes to skip; must be > 0. + * The number of bytes to advance. Must be > 0. 
*/ static inline void hc_matchfinder_skip_positions(struct hc_matchfinder * restrict mf, @@ -221,13 +280,13 @@ hc_matchfinder_skip_positions(struct hc_matchfinder * restrict mf, const u8 *in_end, unsigned count) { - unsigned hash; + u32 hash; - if (unlikely(in_next + count >= in_end - LZ_HASH_REQUIRED_NBYTES)) + if (unlikely(in_next + count >= in_end - LZ_HASH3_REQUIRED_NBYTES)) return; do { - hash = lz_hash3(in_next, HC_MATCHFINDER_HASH_ORDER); + hash = lz_hash_3_bytes(in_next, HC_MATCHFINDER_HASH_ORDER); mf->next_tab[in_next - in_base] = mf->hash_tab[hash]; mf->hash_tab[hash] = in_next - in_base; in_next++; diff --git a/src/lz_extend.h b/src/lz_extend.h index 94b281a..be5a677 100644 --- a/src/lz_extend.h +++ b/src/lz_extend.h @@ -24,12 +24,12 @@ lz_extend(const u8 * const strptr, const u8 * const matchptr, if (likely(max_len - len >= 4 * WORDSIZE)) { - #define COMPARE_WORD_STEP \ - v_word = load_word_unaligned(&matchptr[len]) ^ \ - load_word_unaligned(&strptr[len]); \ - if (v_word != 0) \ - goto word_differs; \ - len += WORDSIZE; \ + #define COMPARE_WORD_STEP \ + v_word = load_word_unaligned(&matchptr[len]) ^ \ + load_word_unaligned(&strptr[len]); \ + if (v_word != 0) \ + goto word_differs; \ + len += WORDSIZE; \ COMPARE_WORD_STEP COMPARE_WORD_STEP diff --git a/src/lz_hash.h b/src/lz_hash.h new file mode 100644 index 0000000..419baa6 --- /dev/null +++ b/src/lz_hash.h @@ -0,0 +1,41 @@ +/* + * lz_hash.h + * + * Hashing for Lempel-Ziv matchfinding. + */ + +#ifndef _LZ_HASH_H +#define _LZ_HASH_H + +#include "unaligned.h" + +/* + * The hash function: given a sequence prefix held in the low-order bits of a + * 32-bit value, multiply by a carefully-chosen large constant. Discard any + * bits of the product that don't fit in a 32-bit value, but take the + * next-highest @num_bits bits of the product as the hash value, as those have + * the most randomness. 
+ */ +static inline u32 +lz_hash(u32 seq, unsigned num_bits) +{ + return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); +} + +/* + * Hash the 3-byte sequence beginning at @p, producing a hash of length + * @num_bits bits. At least LZ_HASH3_REQUIRED_NBYTES bytes of data must be + * available at @p; note that this may be more than 3. + */ +static inline u32 +lz_hash_3_bytes(const u8 *p, unsigned num_bits) +{ + u32 seq = load_u24_unaligned(p); + if (num_bits >= 24) + return seq; + return lz_hash(seq, num_bits); +} + +#define LZ_HASH3_REQUIRED_NBYTES LOAD_U24_REQUIRED_NBYTES + +#endif /* _LZ_HASH_H */ diff --git a/src/lz_hash3.h b/src/lz_hash3.h deleted file mode 100644 index ec322d9..0000000 --- a/src/lz_hash3.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * lz_hash3.h - * - * 3-byte hashing for Lempel-Ziv matchfinding. - */ - -#pragma once - -#include "unaligned.h" - -static inline u32 -loaded_u32_to_u24(u32 v) -{ - if (CPU_IS_LITTLE_ENDIAN) - return v & 0xFFFFFF; - else - return v >> 8; -} - -static inline u32 -load_u24_unaligned(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return loaded_u32_to_u24(load_u32_unaligned(p)); - else - return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); -} - -static inline u32 -lz_hash3_u24(u32 str, unsigned num_bits) -{ - return (u32)(str * 0x1E35A7BD) >> (32 - num_bits); -} - -/* - * Hash the next 3-byte sequence in the window, producing a hash of length - * 'num_bits' bits. At least LZ_HASH_REQUIRED_NBYTES must be available at 'p'; - * this might be 4 bytes rather than 3 because an unaligned load is faster on - * some architectures. - */ -static inline u32 -lz_hash3(const u8 *p, unsigned num_bits) -{ - return lz_hash3_u24(load_u24_unaligned(p), num_bits); -} - -/* Number of bytes the hash function actually requires be available, due to the - * possibility of an unaligned load. */ -#define LZ_HASH_REQUIRED_NBYTES (UNALIGNED_ACCESS_IS_FAST ? 
4 : 3) diff --git a/src/matchfinder_avx2.h b/src/matchfinder_avx2.h index fe98b63..3bdb1a9 100644 --- a/src/matchfinder_avx2.h +++ b/src/matchfinder_avx2.h @@ -16,9 +16,9 @@ matchfinder_init_avx2(pos_t *data, size_t size) return false; if (sizeof(pos_t) == 2) - v = _mm256_set1_epi16(MATCHFINDER_INITVAL); + v = _mm256_set1_epi16((u16)MATCHFINDER_NULL); else if (sizeof(pos_t) == 4) - v = _mm256_set1_epi32(MATCHFINDER_INITVAL); + v = _mm256_set1_epi32((u32)MATCHFINDER_NULL); else return false; diff --git a/src/matchfinder_common.h b/src/matchfinder_common.h index 2a01336..1ffbed9 100644 --- a/src/matchfinder_common.h +++ b/src/matchfinder_common.h @@ -60,7 +60,7 @@ static inline bool matchfinder_memset_init_okay(void) { /* All bytes must match in order to use memset. */ - const pos_t v = MATCHFINDER_INITVAL; + const pos_t v = MATCHFINDER_NULL; if (sizeof(pos_t) == 2) return (u8)v == (u8)(v >> 8); if (sizeof(pos_t) == 4) @@ -93,12 +93,12 @@ matchfinder_init(pos_t *data, size_t num_entries) #endif if (matchfinder_memset_init_okay()) { - memset(data, (u8)MATCHFINDER_INITVAL, size); + memset(data, (u8)MATCHFINDER_NULL, size); return; } for (size_t i = 0; i < num_entries; i++) - data[i] = MATCHFINDER_INITVAL; + data[i] = MATCHFINDER_NULL; } #if MATCHFINDER_IS_SLIDING diff --git a/src/matchfinder_nonsliding.h b/src/matchfinder_nonsliding.h index e08f461..4717214 100644 --- a/src/matchfinder_nonsliding.h +++ b/src/matchfinder_nonsliding.h @@ -16,12 +16,12 @@ typedef u32 pos_t; /* Not all the bits of the position type are needed, so the sign bit can be * reserved to mean "out of bounds". 
*/ -#define MATCHFINDER_INITVAL ((pos_t)-1) +#define MATCHFINDER_NULL ((pos_t)-1) static inline bool -matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) +matchfinder_node_valid(pos_t cur_node, const u8 *in_base, const u8 *in_next) { - return !(cur_match & ((pos_t)1 << (sizeof(pos_t) * 8 - 1))); + return !(cur_node & ((pos_t)1 << (sizeof(pos_t) * 8 - 1))); } #else @@ -30,18 +30,18 @@ matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_nex * This prevents the beginning of the buffer from matching anything; however, * this doesn't matter much. */ -#define MATCHFINDER_INITVAL ((pos_t)0) +#define MATCHFINDER_NULL ((pos_t)0) static inline bool -matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) +matchfinder_node_valid(pos_t cur_node, const u8 *in_base, const u8 *in_next) { - return cur_match != 0; + return cur_node != 0; } #endif static inline pos_t -matchfinder_slot_for_match(pos_t cur_match) +matchfinder_slot_for_match(pos_t cur_node) { - return cur_match; + return cur_node; } diff --git a/src/matchfinder_sliding.h b/src/matchfinder_sliding.h index 4b8a515..2fb715a 100644 --- a/src/matchfinder_sliding.h +++ b/src/matchfinder_sliding.h @@ -13,18 +13,18 @@ typedef s16 pos_t; typedef s32 pos_t; #endif -#define MATCHFINDER_INITVAL ((pos_t)-MATCHFINDER_WINDOW_SIZE) +#define MATCHFINDER_NULL ((pos_t)-MATCHFINDER_WINDOW_SIZE) /* In the sliding window case, positions are stored relative to 'in_base'. 
*/ static inline bool -matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) +matchfinder_node_valid(pos_t cur_node, const u8 *in_base, const u8 *in_next) { - return cur_match > (pos_t)((in_next - in_base) - MATCHFINDER_WINDOW_SIZE); + return cur_node > (pos_t)((in_next - in_base) - MATCHFINDER_WINDOW_SIZE); } static inline pos_t -matchfinder_slot_for_match(pos_t cur_match) +matchfinder_slot_for_match(pos_t cur_node) { - return cur_match & (MATCHFINDER_WINDOW_SIZE - 1); + return cur_node & (MATCHFINDER_WINDOW_SIZE - 1); } diff --git a/src/matchfinder_sse2.h b/src/matchfinder_sse2.h index cc27600..574f9b2 100644 --- a/src/matchfinder_sse2.h +++ b/src/matchfinder_sse2.h @@ -16,9 +16,9 @@ matchfinder_init_sse2(pos_t *data, size_t size) return false; if (sizeof(pos_t) == 2) - v = _mm_set1_epi16(MATCHFINDER_INITVAL); + v = _mm_set1_epi16((u16)MATCHFINDER_NULL); else if (sizeof(pos_t) == 4) - v = _mm_set1_epi32(MATCHFINDER_INITVAL); + v = _mm_set1_epi32((u32)MATCHFINDER_NULL); else return false; diff --git a/src/types.h b/src/types.h index 205dbd3..eff0404 100644 --- a/src/types.h +++ b/src/types.h @@ -6,9 +6,9 @@ #pragma once -#include #include #include +#include typedef uint8_t u8; typedef uint16_t u16; diff --git a/src/unaligned.h b/src/unaligned.h index d5b0f95..24a588f 100644 --- a/src/unaligned.h +++ b/src/unaligned.h @@ -1,7 +1,7 @@ /* * unaligned.h * - * Inline functions for unaligned memory access. + * Inline functions for unaligned memory accesses. */ #pragma once @@ -214,3 +214,36 @@ put_unaligned_u32_be(u32 v, void *p) p8[3] = (v >> 0) & 0xFF; } } + +/* + * Given a 32-bit value that was loaded with the platform's native endianness, + * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 + * bits contain the first 3 bytes, arranged in octets in a platform-dependent + * order, at the memory location from which the input 32-bit value was loaded. 
+ */ +static inline u32 +loaded_u32_to_u24(u32 v) +{ + if (CPU_IS_LITTLE_ENDIAN) + return v & 0xFFFFFF; + else + return v >> 8; +} + +/* + * Load the next 3 bytes from the memory location @p into the 24 low-order bits + * of a 32-bit value. The order in which the 3 bytes will be arranged as octets + * in the 24 bits is platform-dependent. At least LOAD_U24_REQUIRED_NBYTES + * bytes must be available at @p; note that this may be more than 3. + */ +static inline u32 +load_u24_unaligned(const u8 *p) +{ +#if UNALIGNED_ACCESS_IS_FAST +# define LOAD_U24_REQUIRED_NBYTES 4 + return loaded_u32_to_u24(load_u32_unaligned(p)); +#else +# define LOAD_U24_REQUIRED_NBYTES 3 + return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); +#endif +} diff --git a/test/benchmark.c b/test/benchmark.c index 205060e..7b020f9 100644 --- a/test/benchmark.c +++ b/test/benchmark.c @@ -1,11 +1,9 @@ /* * benchmark.c - A compression testing and benchmark program. * - * The author dedicates this file to the public domain. - * You can do whatever you want with this file. + * This file has no copyright assigned and is placed in the Public Domain. */ - #define _FILE_OFFSET_BITS 64 #define _GNU_SOURCE @@ -419,9 +417,9 @@ main(int argc, char **argv) wrapper == NO_WRAPPER ? "None" : wrapper == ZLIB_WRAPPER ? "zlib" : "gzip"); printf("\tCompression engine: %s\n", - compress_with_libz ? "zlib" : "libdeflate"); + compress_with_libz ? "libz" : "libdeflate"); printf("\tDecompression engine: %s\n", - decompress_with_libz ? "zlib" : "libdeflate"); + decompress_with_libz ? "libz" : "libdeflate"); ubuf1 = malloc(chunk_size); ubuf2 = malloc(chunk_size);