From 5f3208e788f4f5497245b8b09b356ea74c4fb3b4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 22 Jan 2015 00:05:01 -0600 Subject: [PATCH] Cleanups and matchfinder updates --- CMakeLists.txt | 2 +- libdeflate.h | 22 ++- src/adler32.c | 2 +- src/adler32.h | 2 +- src/bitops.h | 18 +- src/bt_matchfinder.h | 310 +++++++++++++++++++++-------------- src/compiler-gcc.h | 10 +- src/compiler.h | 36 ++-- src/crc32.c | 8 +- src/deflate_compress.c | 110 ++++++------- src/endianness.h | 2 +- src/gzip_compress.c | 2 +- src/gzip_constants.h | 2 - src/gzip_decompress.c | 8 +- src/hc_matchfinder.h | 187 +++++++++++++-------- src/lz_extend.h | 12 +- src/lz_hash.h | 41 +++++ src/lz_hash3.h | 49 ------ src/matchfinder_avx2.h | 4 +- src/matchfinder_common.h | 6 +- src/matchfinder_nonsliding.h | 16 +- src/matchfinder_sliding.h | 10 +- src/matchfinder_sse2.h | 4 +- src/types.h | 2 +- src/unaligned.h | 35 +++- test/benchmark.c | 8 +- 26 files changed, 528 insertions(+), 380 deletions(-) create mode 100644 src/lz_hash.h delete mode 100644 src/lz_hash3.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e5757b..4ff2f1c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ install(FILES libdeflate.h DESTINATION "${CMAKE_INSTALL_PREFIX}/include") option(BUILD_BENCHMARK "Build benchmark program" OFF) add_executable(benchmark test/benchmark.c) -target_link_libraries(benchmark deflate -lz) +target_link_libraries(benchmark deflatestatic -lz) option(BUILD_GEN_CRC32_TABLE "Build CRC32 table generation program" OFF) add_executable(gen_crc32_table test/gen_crc32_table.c) diff --git a/libdeflate.h b/libdeflate.h index 1d42ed4..e5d6aa9 100644 --- a/libdeflate.h +++ b/libdeflate.h @@ -1,7 +1,9 @@ /* * libdeflate.h * - * Public header for the DEFLATE compression library. + * Public header for libdeflate. + * + * This file has no copyright assigned and is placed in the Public Domain. 
*/ #ifndef LIBDEFLATE_H @@ -26,7 +28,9 @@ struct deflate_compressor; * fastest, 6 = medium/default, 9 = slowest). The return value is a pointer to * the new DEFLATE compressor, or NULL if out of memory. * - * Note: the sliding window size is defined at compilation time (default 32768). + * Note: for compression, the sliding window size is defined at compilation time + * to 32768, the largest size permissible in the DEFLATE format. It cannot be + * changed at runtime. */ extern struct deflate_compressor * deflate_alloc_compressor(unsigned int compression_level); @@ -44,7 +48,7 @@ deflate_compress(struct deflate_compressor *compressor, void *out, size_t out_nbytes_avail); /* - * Like deflate_compress(), but store the data in the zlib wrapper format. + * Like deflate_compress(), but stores the data in the zlib wrapper format. */ extern size_t zlib_compress(struct deflate_compressor *compressor, @@ -52,7 +56,7 @@ zlib_compress(struct deflate_compressor *compressor, void *out, size_t out_nbytes_avail); /* - * Like deflate_compress(), but store the data in the gzip wrapper format. + * Like deflate_compress(), but stores the data in the gzip wrapper format. */ extern size_t gzip_compress(struct deflate_compressor *compressor, @@ -61,7 +65,8 @@ gzip_compress(struct deflate_compressor *compressor, /* * deflate_free_compressor() frees a DEFLATE compressor that was allocated with - * deflate_alloc_compressor(). + * deflate_alloc_compressor(). If a NULL pointer is passed in, no action is + * taken. */ extern void deflate_free_compressor(struct deflate_compressor *compressor); @@ -79,7 +84,9 @@ struct deflate_decompressor; * * This function takes no parameters, and the returned decompressor is valid for * decompressing data that was compressed at any compression level and with any - * sliding window size. + * sliding window size. It can also be used for any wrapper format (raw + * DEFLATE, zlib, or gzip); however, the appropriate decompression function must + * be called. 
*/ extern struct deflate_decompressor * deflate_alloc_decompressor(void); @@ -118,7 +125,8 @@ gzip_decompress(struct deflate_decompressor *decompressor, /* * deflate_free_decompressor() frees a DEFLATE decompressor that was allocated - * with deflate_alloc_decompressor(). + * with deflate_alloc_decompressor(). If a NULL pointer is passed in, no action + * is taken. */ extern void deflate_free_decompressor(struct deflate_decompressor *decompressor); diff --git a/src/adler32.c b/src/adler32.c index 13d996e..e5dc9a5 100644 --- a/src/adler32.c +++ b/src/adler32.c @@ -39,7 +39,7 @@ #define UNROLL_FACTOR 4 u32 -adler32(const u8 *buffer, size_t size) +adler32(const void *buffer, size_t size) { u32 s1 = 1; u32 s2 = 0; diff --git a/src/adler32.h b/src/adler32.h index 78c2d02..d1964bf 100644 --- a/src/adler32.h +++ b/src/adler32.h @@ -9,4 +9,4 @@ #include "types.h" extern u32 -adler32(const u8 *buffer, size_t size); +adler32(const void *buffer, size_t size); diff --git a/src/bitops.h b/src/bitops.h index 1e6f68c..97fa7a1 100644 --- a/src/bitops.h +++ b/src/bitops.h @@ -11,7 +11,8 @@ /* Find Last Set bit */ -static inline unsigned fls32(u32 v) +static inline unsigned +fls32(u32 v) { #ifdef compiler_fls32 return compiler_fls32(v); @@ -23,7 +24,8 @@ static inline unsigned fls32(u32 v) #endif } -static inline unsigned fls64(u64 v) +static inline unsigned +fls64(u64 v) { #ifdef compiler_fls64 return compiler_fls64(v); @@ -35,7 +37,8 @@ static inline unsigned fls64(u64 v) #endif } -static inline unsigned flsw(machine_word_t v) +static inline unsigned +flsw(machine_word_t v) { BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); if (WORDSIZE == 4) @@ -46,7 +49,8 @@ static inline unsigned flsw(machine_word_t v) /* Find First Set bit */ -static inline unsigned ffs32(u32 v) +static inline unsigned +ffs32(u32 v) { #ifdef compiler_ffs32 return compiler_ffs32(v); @@ -58,7 +62,8 @@ static inline unsigned ffs32(u32 v) #endif } -static inline unsigned ffs64(u64 v) +static inline unsigned 
+ffs64(u64 v) { #ifdef compiler_ffs64 return compiler_ffs64(v); @@ -70,7 +75,8 @@ static inline unsigned ffs64(u64 v) #endif } -static inline unsigned ffsw(machine_word_t v) +static inline unsigned +ffsw(machine_word_t v) { BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); if (WORDSIZE == 4) diff --git a/src/bt_matchfinder.h b/src/bt_matchfinder.h index b827044..57b8efa 100644 --- a/src/bt_matchfinder.h +++ b/src/bt_matchfinder.h @@ -1,51 +1,56 @@ /* * bt_matchfinder.h * - * This is a Binary Tree (bt) based matchfinder. + * ---------------------------------------------------------------------------- + * + * This is a Binary Trees (bt) based matchfinder. * * The data structure is a hash table where each hash bucket contains a binary - * tree of sequences, referenced by position. The sequences in the binary tree - * are ordered such that a left child is lexicographically lesser than its - * parent, and a right child is lexicographically greater than its parent. + * tree of sequences whose first 3 bytes share the same hash code. Each + * sequence is identified by its starting position in the input buffer. Each + * binary tree is always sorted such that each left child represents a sequence + * lexicographically lesser than its parent and each right child represents a + * sequence lexicographically greater than its parent. * - * For each sequence (position) in the input, the first 3 bytes are hashed and - * the the appropriate binary tree is re-rooted at that sequence (position). - * Since the sequences are inserted in order, each binary tree maintains the - * invariant that each child node has greater match offset than its parent. + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 3 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. 
Then, a new binary tree + * node is created to represent the current sequence. Then, in a single tree + * traversal, the hash bucket's binary tree is searched for matches and is + * re-rooted at the new node. * - * While inserting a sequence, we may search the binary tree for matches with - * that sequence. At each step, the length of the match is computed. The - * search ends when the sequences get too far away (outside of the sliding - * window), or when the binary tree ends (in the code this is the same check as - * "too far away"), or when 'max_search_depth' positions have been searched, or - * when a match of at least 'nice_len' bytes has been found. + * Compared to the simpler algorithm that uses linked lists instead of binary + * trees (see hc_matchfinder.h), the binary tree version gains more information + * at each node visitation. Ideally, the binary tree version will examine only + * 'log(n)' nodes to find the same matches that the linked list version will + * find by examining 'n' nodes. In addition, the binary tree version can + * examine fewer bytes at each node by taking advantage of the common prefixes + * that result from the sort order, whereas the linked list version may have to + * examine up to the full length of the match at each node. * - * Notes: + * However, it is not always best to use the binary tree version. It requires + * nearly twice as much memory as the linked list version, and it takes time to + * keep the binary trees sorted, even at positions where the compressor does not + * need matches. Generally, when doing fast compression on small buffers, + * binary trees are the wrong approach. They are best suited for thorough + * compression and/or large buffers. * - * - Typically, we need to search more nodes to find a given match in a - * binary tree versus in a linked list. However, a binary tree has more - * overhead than a linked list: it needs to be kept sorted, and the inner - * search loop is more complicated. 
As a result, binary trees are best - * suited for compression modes where the potential matches are searched - * more thoroughly. - * - * - Since no attempt is made to keep the binary trees balanced, it's - * essential to have the 'max_search_depth' cutoff. Otherwise it could - * take quadratic time to run data through the matchfinder. + * ---------------------------------------------------------------------------- */ #pragma once #include "lz_extend.h" -#include "lz_hash3.h" +#include "lz_hash.h" #include "matchfinder_common.h" -#ifndef BT_MATCHFINDER_HASH_ORDER -# if MATCHFINDER_WINDOW_ORDER < 14 -# define BT_MATCHFINDER_HASH_ORDER 14 -# else -# define BT_MATCHFINDER_HASH_ORDER 15 -# endif +#if MATCHFINDER_WINDOW_ORDER < 13 +# define BT_MATCHFINDER_HASH_ORDER 14 +#elif MATCHFINDER_WINDOW_ORDER < 15 +# define BT_MATCHFINDER_HASH_ORDER 15 +#else +# define BT_MATCHFINDER_HASH_ORDER 16 #endif #define BT_MATCHFINDER_HASH_LENGTH (1UL << BT_MATCHFINDER_HASH_ORDER) @@ -77,8 +82,37 @@ bt_matchfinder_slide_window(struct bt_matchfinder *mf) } #endif +static inline u32 +bt_matchfinder_hash_3_bytes(const u8 *in_next) +{ + return lz_hash_3_bytes(in_next, BT_MATCHFINDER_HASH_ORDER); +} + +static inline pos_t * +bt_child(struct bt_matchfinder *mf, pos_t node, int offset) +{ + if (MATCHFINDER_WINDOW_ORDER < sizeof(pos_t) * 8) { + /* no cast needed */ + return &mf->child_tab[(matchfinder_slot_for_match(node) << 1) + offset]; + } else { + return &mf->child_tab[((size_t)matchfinder_slot_for_match(node) << 1) + offset]; + } +} + +static inline pos_t * +bt_left_child(struct bt_matchfinder *mf, pos_t node) +{ + return bt_child(mf, node, 0); +} + +static inline pos_t * +bt_right_child(struct bt_matchfinder *mf, pos_t node) +{ + return bt_child(mf, node, 1); +} + /* - * Find matches with the current sequence. + * Retrieve a list of matches with the current position. * * @mf * The matchfinder structure. 
@@ -87,115 +121,131 @@ bt_matchfinder_slide_window(struct bt_matchfinder *mf) * time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_. * @in_next * Pointer to the next byte in the input buffer to process. This is the - * pointer to the bytes being matched against. + * pointer to the sequence being matched against. + * @min_len + * Only record matches that are at least this long. * @max_len - * Maximum match length to return. + * The maximum permissible match length at this position. * @nice_len * Stop searching if a match of at least this length is found. + * Must be <= @max_len. * @max_search_depth - * Limit on the number of potential matches to consider. - * @prev_hash - * TODO - * @matches - * Space to write the matches that are found. + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hash + * Pointer to the hash code for the current sequence, which was computed + * one position in advance so that the binary tree root could be + * prefetched. This is an input/output parameter. + * @best_len_ret + * The length of the longest match found is written here. (This is + * actually redundant with the 'struct lz_match' array, but this is easier + * for the compiler to optimize when inlined and the caller immediately + * does a check against 'best_len'.) + * @lz_matchptr + * An array in which this function will record the matches. The recorded + * matches will be sorted by strictly increasing length and strictly + * increasing offset. The maximum number of matches that may be found is + * 'min(nice_len, max_len) - 3 + 1'. * - * Returns the number of matches found, which may be anywhere from 0 to - * (nice_len - 3 + 1), inclusively. The matches are written to @matches in - * order of strictly increasing length and strictly increasing offset. The - * minimum match length is assumed to be 3. + * The return value is a pointer to the next available slot in the @lz_matchptr + * array. 
(If no matches were found, this will be the same as @lz_matchptr.) */ -static inline unsigned +static inline struct lz_match * bt_matchfinder_get_matches(struct bt_matchfinder * const restrict mf, const u8 * const in_base, const u8 * const in_next, + const unsigned min_len, const unsigned max_len, const unsigned nice_len, const unsigned max_search_depth, - unsigned long *prev_hash, - struct lz_match * const restrict matches) + u32 * restrict next_hash, + unsigned * restrict best_len_ret, + struct lz_match * restrict lz_matchptr) { - struct lz_match *lz_matchptr = matches; unsigned depth_remaining = max_search_depth; - unsigned hash; - pos_t cur_match; + u32 hash; + pos_t cur_node; const u8 *matchptr; - unsigned best_len; pos_t *pending_lt_ptr, *pending_gt_ptr; unsigned best_lt_len, best_gt_len; unsigned len; - pos_t *children; + unsigned best_len = min_len - 1; - if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES + 1)) - return 0; + if (unlikely(max_len < LZ_HASH3_REQUIRED_NBYTES + 1)) { + *best_len_ret = best_len; + return lz_matchptr; + } - hash = *prev_hash; - *prev_hash = lz_hash3(in_next + 1, BT_MATCHFINDER_HASH_ORDER); - prefetch(&mf->hash_tab[*prev_hash]); - cur_match = mf->hash_tab[hash]; + hash = *next_hash; + *next_hash = bt_matchfinder_hash_3_bytes(in_next + 1); + cur_node = mf->hash_tab[hash]; mf->hash_tab[hash] = in_next - in_base; + prefetch(&mf->hash_tab[*next_hash]); - best_len = 2; - pending_lt_ptr = &mf->child_tab[(in_next - in_base) << 1]; - pending_gt_ptr = &mf->child_tab[((in_next - in_base) << 1) + 1]; + pending_lt_ptr = bt_left_child(mf, in_next - in_base); + pending_gt_ptr = bt_right_child(mf, in_next - in_base); best_lt_len = 0; best_gt_len = 0; + len = 0; + + if (!matchfinder_node_valid(cur_node, in_base, in_next)) { + *pending_lt_ptr = MATCHFINDER_NULL; + *pending_gt_ptr = MATCHFINDER_NULL; + *best_len_ret = best_len; + return lz_matchptr; + } + for (;;) { - if (!matchfinder_match_in_window(cur_match, - in_base, in_next) || - 
!depth_remaining--) - { - *pending_lt_ptr = MATCHFINDER_INITVAL; - *pending_gt_ptr = MATCHFINDER_INITVAL; - return lz_matchptr - matches; - } - - matchptr = &in_base[cur_match]; - len = min(best_lt_len, best_gt_len); - - children = &mf->child_tab[(unsigned long) - matchfinder_slot_for_match(cur_match) << 1]; + matchptr = &in_base[cur_node]; if (matchptr[len] == in_next[len]) { - len = lz_extend(in_next, matchptr, len + 1, max_len); - if (len > best_len) { best_len = len; - lz_matchptr->length = len; lz_matchptr->offset = in_next - matchptr; lz_matchptr++; - if (len >= nice_len) { - *pending_lt_ptr = children[0]; - *pending_gt_ptr = children[1]; - return lz_matchptr - matches; + *pending_lt_ptr = *bt_left_child(mf, cur_node); + *pending_gt_ptr = *bt_right_child(mf, cur_node); + *best_len_ret = best_len; + return lz_matchptr; } } } if (matchptr[len] < in_next[len]) { - *pending_lt_ptr = cur_match; - pending_lt_ptr = &children[1]; - cur_match = *pending_lt_ptr; + *pending_lt_ptr = cur_node; + pending_lt_ptr = bt_right_child(mf, cur_node); + cur_node = *pending_lt_ptr; best_lt_len = len; + if (best_gt_len < len) + len = best_gt_len; } else { - *pending_gt_ptr = cur_match; - pending_gt_ptr = &children[0]; - cur_match = *pending_gt_ptr; + *pending_gt_ptr = cur_node; + pending_gt_ptr = bt_left_child(mf, cur_node); + cur_node = *pending_gt_ptr; best_gt_len = len; + if (best_lt_len < len) + len = best_lt_len; + } + + if (!matchfinder_node_valid(cur_node, in_base, in_next) || !--depth_remaining) { + *pending_lt_ptr = MATCHFINDER_NULL; + *pending_gt_ptr = MATCHFINDER_NULL; + *best_len_ret = best_len; + return lz_matchptr; } } } /* - * Advance the match-finder, but don't search for matches. + * Advance the matchfinder, but don't record any matches. * * @mf * The matchfinder structure. * @in_base * Pointer to the next byte in the input buffer to process _at the last - * time bc_matchfinder_init() or bc_matchfinder_slide_window() was called_. 
+ * time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_. * @in_next * Pointer to the next byte in the input buffer to process. * @in_end @@ -204,8 +254,14 @@ bt_matchfinder_get_matches(struct bt_matchfinder * const restrict mf, * Stop searching if a match of at least this length is found. * @max_search_depth * Limit on the number of potential matches to consider. - * @prev_hash - * TODO + * @next_hash + * Pointer to the hash code for the current sequence, which was computed + * one position in advance so that the binary tree root could be + * prefetched. This is an input/output parameter. + * + * Note: this is very similar to bt_matchfinder_get_matches() because both + * functions must do hashing and tree re-rooting. This version just doesn't + * actually record any matches. */ static inline void bt_matchfinder_skip_position(struct bt_matchfinder * const restrict mf, @@ -214,66 +270,70 @@ bt_matchfinder_skip_position(struct bt_matchfinder * const restrict mf, const u8 * const in_end, const unsigned nice_len, const unsigned max_search_depth, - unsigned long *prev_hash) + u32 * restrict next_hash) { unsigned depth_remaining = max_search_depth; - unsigned hash; - pos_t cur_match; + u32 hash; + pos_t cur_node; const u8 *matchptr; pos_t *pending_lt_ptr, *pending_gt_ptr; unsigned best_lt_len, best_gt_len; unsigned len; - pos_t *children; - if (unlikely(in_end - in_next < LZ_HASH_REQUIRED_NBYTES + 1)) + if (unlikely(in_end - in_next < LZ_HASH3_REQUIRED_NBYTES + 1)) return; - hash = *prev_hash; - *prev_hash = lz_hash3(in_next + 1, BT_MATCHFINDER_HASH_ORDER); - prefetch(&mf->hash_tab[*prev_hash]); - cur_match = mf->hash_tab[hash]; + hash = *next_hash; + *next_hash = bt_matchfinder_hash_3_bytes(in_next + 1); + cur_node = mf->hash_tab[hash]; mf->hash_tab[hash] = in_next - in_base; + prefetch(&mf->hash_tab[*next_hash]); depth_remaining = max_search_depth; - pending_lt_ptr = &mf->child_tab[(in_next - in_base) << 1]; - pending_gt_ptr = 
&mf->child_tab[((in_next - in_base) << 1) + 1]; + pending_lt_ptr = bt_left_child(mf, in_next - in_base); + pending_gt_ptr = bt_right_child(mf, in_next - in_base); best_lt_len = 0; best_gt_len = 0; + len = 0; + + if (!matchfinder_node_valid(cur_node, in_base, in_next)) { + *pending_lt_ptr = MATCHFINDER_NULL; + *pending_gt_ptr = MATCHFINDER_NULL; + return; + } + for (;;) { - if (!matchfinder_match_in_window(cur_match, - in_base, in_next) || - !depth_remaining--) - { - *pending_lt_ptr = MATCHFINDER_INITVAL; - *pending_gt_ptr = MATCHFINDER_INITVAL; - return; - } - - matchptr = &in_base[cur_match]; - len = min(best_lt_len, best_gt_len); - - children = &mf->child_tab[(unsigned long) - matchfinder_slot_for_match(cur_match) << 1]; + matchptr = &in_base[cur_node]; if (matchptr[len] == in_next[len]) { len = lz_extend(in_next, matchptr, len + 1, nice_len); if (len == nice_len) { - *pending_lt_ptr = children[0]; - *pending_gt_ptr = children[1]; + *pending_lt_ptr = *bt_left_child(mf, cur_node); + *pending_gt_ptr = *bt_right_child(mf, cur_node); return; } } if (matchptr[len] < in_next[len]) { - *pending_lt_ptr = cur_match; - pending_lt_ptr = &children[1]; - cur_match = *pending_lt_ptr; + *pending_lt_ptr = cur_node; + pending_lt_ptr = bt_right_child(mf, cur_node); + cur_node = *pending_lt_ptr; best_lt_len = len; + if (best_gt_len < len) + len = best_gt_len; } else { - *pending_gt_ptr = cur_match; - pending_gt_ptr = &children[0]; - cur_match = *pending_gt_ptr; + *pending_gt_ptr = cur_node; + pending_gt_ptr = bt_left_child(mf, cur_node); + cur_node = *pending_gt_ptr; best_gt_len = len; + if (best_lt_len < len) + len = best_lt_len; + } + + if (!matchfinder_node_valid(cur_node, in_base, in_next) || !--depth_remaining) { + *pending_lt_ptr = MATCHFINDER_NULL; + *pending_gt_ptr = MATCHFINDER_NULL; + return; } } } diff --git a/src/compiler-gcc.h b/src/compiler-gcc.h index b9e1869..e7bfef0 100644 --- a/src/compiler-gcc.h +++ b/src/compiler-gcc.h @@ -35,7 +35,7 @@ #define max(a, b) ({ 
__typeof__(a) _a = (a); __typeof__(b) _b = (b); \ (_a > _b) ? _a : _b; }) -#define swap(a, b) ({ __typeof__(a) _a = a; (a) = (b); (b) = _a; }) +#define swap(a, b) ({ __typeof__(a) _a = (a); (a) = (b); (b) = _a; }) #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) # define compiler_bswap32 __builtin_bswap32 @@ -46,7 +46,7 @@ # define compiler_bswap16 __builtin_bswap16 #endif -#define compiler_fls32(n) (31 - __builtin_clz(n)) -#define compiler_fls64(n) (63 - __builtin_clzll(n)) -#define compiler_ffs32(n) __builtin_ctz(n) -#define compiler_ffs64(n) __builtin_ctzll(n) +#define compiler_fls32(n) (31 - __builtin_clz(n)) +#define compiler_fls64(n) (63 - __builtin_clzll(n)) +#define compiler_ffs32(n) __builtin_ctz(n) +#define compiler_ffs64(n) __builtin_ctzll(n) diff --git a/src/compiler.h b/src/compiler.h index 95d2bc3..f48dbb5 100644 --- a/src/compiler.h +++ b/src/compiler.h @@ -9,37 +9,34 @@ #ifdef __GNUC__ # include "compiler-gcc.h" #else -# warning "Unrecognized compiler. Please add a header file for your compiler." +# error "Unrecognized compiler. Please add a header file for your compiler." 
#endif #ifndef LIBEXPORT # define LIBEXPORT #endif -#ifndef BUILD_BUG_ON -# define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) -#endif - -#ifndef likely -# define likely(expr) (expr) -#endif - -#ifndef unlikely -# define unlikely(expr) (expr) -#endif - -#ifndef prefetch -# define prefetch(addr) +#ifndef _packed_attribute +# error "missing required definition of _packed_attribute" #endif #ifndef _aligned_attribute # error "missing required definition of _aligned_attribute" #endif -#ifndef _packed_attribute -# error "missing required definition of _packed_attribute" +#ifndef likely +# define likely(expr) (expr) #endif +#ifndef unlikely +# define unlikely(expr) (expr) +#endif + +#ifndef prefetch +# define prefetch(addr) +#endif + + #ifndef CPU_IS_BIG_ENDIAN # error "missing required endianness definition" #endif @@ -47,7 +44,6 @@ #define CPU_IS_LITTLE_ENDIAN (!CPU_IS_BIG_ENDIAN) #ifndef UNALIGNED_ACCESS_SPEED -# warning "assuming unaligned accesses are not allowed" # define UNALIGNED_ACCESS_SPEED 0 #endif @@ -58,3 +54,7 @@ #if !defined(min) || !defined(max) || !defined(swap) # error "missing required definitions of min(), max(), and swap() macros" #endif + +#ifndef BUILD_BUG_ON +# define BUILD_BUG_ON(expr) ((void)sizeof(char[1 - 2*!!(expr)])) +#endif diff --git a/src/crc32.c b/src/crc32.c index 0db4b5e..ae03c79 100644 --- a/src/crc32.c +++ b/src/crc32.c @@ -71,9 +71,9 @@ * else * multiple = 0; * - * remainder >>= 1; - * remainder |= (u32)bit << 31; - * remainder ^= multiple; + * remainder >>= 1; + * remainder |= (u32)bit << 31; + * remainder ^= multiple; * } * * return ~remainder; @@ -108,7 +108,7 @@ * multiple = divisor; * else * multiple = 0; - * remainder >>= 1; + * remainder >>= 1; * remainder ^= multiple; * } * diff --git a/src/deflate_compress.c b/src/deflate_compress.c index e566625..ae0d176 100644 --- a/src/deflate_compress.c +++ b/src/deflate_compress.c @@ -1961,9 +1961,7 @@ deflate_compress_near_optimal(struct deflate_compressor * 
restrict c, struct lz_match *cache_end; const u8 *in_block_begin; const u8 *in_block_end; - unsigned num_matches; - unsigned best_len; - unsigned long prev_hash = 0; + u32 next_hash = 0; deflate_init_output(&os, out, out_nbytes_avail); deflate_reset_symbol_frequencies(c); @@ -1991,6 +1989,9 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c, /* Find all match possibilities in this block. */ do { + struct lz_match *matches; + unsigned best_len; + /* Decrease the maximum and nice match lengths if we're * approaching the end of the input buffer. */ if (unlikely(max_len > in_end - in_next)) { @@ -2028,71 +2029,68 @@ deflate_compress_near_optimal(struct deflate_compressor * restrict c, * search for matches at almost all positions, so this * advantage of hash chains is negated. */ - num_matches = + matches = cache_ptr; + cache_ptr = bt_matchfinder_get_matches(&c->bt_mf, in_cur_base, in_next, + DEFLATE_MIN_MATCH_LEN, max_len, nice_len, c->max_search_depth, - &prev_hash, + &next_hash, + &best_len, cache_ptr); - cache_ptr += num_matches; - cache_ptr->length = num_matches; + cache_ptr->length = cache_ptr - matches; cache_ptr->offset = *in_next; in_next++; cache_ptr++; - if (num_matches) { - best_len = cache_ptr[-2].length; + /* + * If there was a very long match found, don't cache any + * matches for the bytes covered by that match. This + * avoids degenerate behavior when compressing highly + * redundant data, where the number of matches can be + * very large. + * + * This heuristic doesn't actually hurt the compression + * ratio very much. If there's a long match, then the + * data must be highly compressible, so it doesn't + * matter much what we do. + * + * We also trigger this same case when approaching the + * desired end of the block. This forces the block to + * reach a "stopping point" where there are no matches + * extending to later positions. (XXX: this behavior is + * non-optimal and should be improved.) 
+ */ + if (best_len >= DEFLATE_MIN_MATCH_LEN && + best_len >= min(nice_len, in_block_end - in_next)) { + --best_len; + do { + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = min(max_len, nice_len); + } + if (in_next == in_next_slide) { + bt_matchfinder_slide_window(&c->bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + min(in_end - in_next, + MATCHFINDER_WINDOW_SIZE); + } + bt_matchfinder_skip_position(&c->bt_mf, + in_cur_base, + in_next, + in_end, + nice_len, + c->max_search_depth, + &next_hash); - /* - * If there was a very long match found, don't - * cache any matches for the bytes covered by - * that match. This avoids degenerate behavior - * when compressing highly redundant data, where - * the number of matches can be very large. - * - * This heuristic doesn't actually hurt the - * compression ratio very much. If there's a - * long match, then the data must be highly - * compressible, so it doesn't matter much what - * we do. - * - * We also trigger this same case when - * approaching the desired end of the block. - * This forces the block to reach a "stopping - * point" where there are no matches extending - * to later positions. (XXX: this behavior is - * non-optimal and should be improved.) 
- */ - if (best_len >= min(nice_len, in_block_end - in_next)) { - --best_len; - do { - if (unlikely(max_len > in_end - in_next)) { - max_len = in_end - in_next; - nice_len = min(max_len, nice_len); - } - if (in_next == in_next_slide) { - bt_matchfinder_slide_window(&c->bt_mf); - in_cur_base = in_next; - in_next_slide = in_next + min(in_end - in_next, - MATCHFINDER_WINDOW_SIZE); - } - bt_matchfinder_skip_position(&c->bt_mf, - in_cur_base, - in_next, - in_end, - nice_len, - c->max_search_depth, - &prev_hash); - - cache_ptr->length = 0; - cache_ptr->offset = *in_next; - in_next++; - cache_ptr++; - } while (--best_len); - } + cache_ptr->length = 0; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + } while (--best_len); } } while (in_next < in_block_end); diff --git a/src/endianness.h b/src/endianness.h index 41cfdf6..7f1e9a1 100644 --- a/src/endianness.h +++ b/src/endianness.h @@ -1,7 +1,7 @@ /* * endianness.h * - * Inline functions for endianness conversion. + * Macros and inline functions for endianness conversion. 
 */ #pragma once diff --git a/src/gzip_compress.c b/src/gzip_compress.c index c3c626d..acf462f 100644 --- a/src/gzip_compress.c +++ b/src/gzip_compress.c @@ -57,7 +57,7 @@ gzip_compress(struct deflate_compressor *c, const void *in, size_t in_size, out_next += 4; /* ISIZE */ - put_unaligned_u32_le(in_size, out_next); + put_unaligned_u32_le((u32)in_size, out_next); out_next += 4; return out_next - (u8 *)out; diff --git a/src/gzip_constants.h b/src/gzip_constants.h index 0041857..87df327 100644 --- a/src/gzip_constants.h +++ b/src/gzip_constants.h @@ -6,8 +6,6 @@ #pragma once -#include "compiler.h" - #define GZIP_MIN_HEADER_SIZE 10 #define GZIP_FOOTER_SIZE 8 #define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) diff --git a/src/gzip_decompress.c b/src/gzip_decompress.c index c9ced0c..ac4fcd3 100644 --- a/src/gzip_decompress.c +++ b/src/gzip_decompress.c @@ -54,20 +54,16 @@ gzip_decompress(struct deflate_decompressor *d, /* Original file name (zero terminated) */ if (flg & GZIP_FNAME) { - while (*in_next != 0 && ++in_next != in_end) + while (*in_next++ != 0 && in_next != in_end) ; - if (in_next != in_end) - in_next++; if (in_end - in_next < GZIP_FOOTER_SIZE) return false; } /* File comment (zero terminated) */ if (flg & GZIP_FCOMMENT) { - while (*in_next != 0 && ++in_next != in_end) + while (*in_next++ != 0 && in_next != in_end) ; - if (in_next != in_end) - in_next++; if (in_end - in_next < GZIP_FOOTER_SIZE) return false; } diff --git a/src/hc_matchfinder.h b/src/hc_matchfinder.h index 67cb746..7b1d8bd 100644 --- a/src/hc_matchfinder.h +++ b/src/hc_matchfinder.h @@ -1,37 +1,102 @@ /* * hc_matchfinder.h * - * This is a Hash Chain (hc) based matchfinder. + * --------------------------------------------------------------------------- + * + * Algorithm + * + * This is a Hash Chains (hc) based matchfinder. * * The data structure is a hash table where each hash bucket contains a linked 
+ * list (or "chain") of sequences whose first 3 bytes share the same hash code. + * Each sequence is identified by its starting position in the input buffer. * - * For each sequence (position) in the input, the first 3 bytes are hashed and - * that sequence (position) is prepended to the appropriate linked list in the - * hash table. Since the sequences are inserted in order, each list is always - * sorted by increasing match offset. + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 3 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, this hash + * bucket's linked list is searched for matches. Then, a new linked list node + * is created to represent the current sequence and is prepended to the list. * - * At the same time as inserting a sequence, we may search the linked list for - * matches with that sequence. At each step, the length of the match is - * computed. The search ends when the sequences get too far away (outside of - * the sliding window), or when the list ends (in the code this is the same - * check as "too far away"), or when 'max_search_depth' positions have been - * searched, or when a match of at least 'nice_len' bytes has been found. + * This algorithm has several useful properties: + * + * - It only finds true Lempel-Ziv matches; i.e., those where the matching + * sequence occurs prior to the sequence being matched against. + * + * - The sequences in each linked list are always sorted by decreasing starting + * position. Therefore, the closest (smallest offset) matches are found + * first, which in many compression formats tend to be the cheapest to encode. 
+ * + * - Although fast running time is not guaranteed due to the possibility of the + * lists getting very long, the worst degenerate behavior can be easily + * prevented by capping the number of nodes searched at each position. + * + * - If the compressor decides not to search for matches at a certain position, + * then that position can be quickly inserted without searching the list. + * + * - The algorithm is adaptable to sliding windows: just store the positions + * relative to a "base" value that is updated from time to time, and stop + * searching each list when the sequences get too far away. + * + * --------------------------------------------------------------------------- + * + * Notes on usage + * + * You must define MATCHFINDER_WINDOW_ORDER before including this header because + * that determines which integer type to use for positions. Since 16-bit + * integers are faster than 32-bit integers due to reduced memory usage (and + * therefore reduced cache pressure), the code only uses 32-bit integers if they + * are needed to represent all possible positions. + * + * In addition, you must allocate the 'struct hc_matchfinder' on a + * MATCHFINDER_ALIGNMENT-aligned boundary. + * + * ---------------------------------------------------------------------------- + * + * Optimizations + * + * The longest_match() and skip_positions() functions are inlined into the + * compressors that use them. This isn't just about saving the overhead of a + * function call. These functions are intended to be called from the inner + * loops of compressors, where giving the compiler more control over register + * allocation is very helpful. There is also significant benefit to be gained + * from allowing the CPU to predict branches independently at each call site. 
+ * For example, "lazy"-style compressors can be written with two calls to + * longest_match(), each of which starts with a different 'best_len' and + * therefore has significantly different performance characteristics. + * + * Although any hash function can be used, a multiplicative hash is fast and + * works well. + * + * On some processors, it is significantly faster to extend matches by whole + * words (32 or 64 bits) instead of by individual bytes. For this to be the + * case, the processor must implement unaligned memory accesses efficiently and + * must have either a fast "find first set bit" instruction or a fast "find last + * set bit" instruction, depending on the processor's endianness. + * + * The code uses one loop for finding the first match and one loop for finding a + * longer match. Each of these loops is tuned for its respective task and in + * combination are faster than a single generalized loop that handles both + * tasks. + * + * The code also uses a tight inner loop that only compares the last and first + * bytes of a potential match. It is only when these bytes match that a full + * match extension is attempted. + * + * ---------------------------------------------------------------------------- */ #pragma once #include "lz_extend.h" -#include "lz_hash3.h" +#include "lz_hash.h" #include "matchfinder_common.h" #include "unaligned.h" -#ifndef HC_MATCHFINDER_HASH_ORDER -# if MATCHFINDER_WINDOW_ORDER < 14 -# define HC_MATCHFINDER_HASH_ORDER 14 -# else -# define HC_MATCHFINDER_HASH_ORDER 15 -# endif +#if MATCHFINDER_WINDOW_ORDER < 14 +# define HC_MATCHFINDER_HASH_ORDER 14 +#else +# define HC_MATCHFINDER_HASH_ORDER 15 #endif #define HC_MATCHFINDER_HASH_LENGTH (1UL << HC_MATCHFINDER_HASH_ORDER) @@ -73,17 +138,18 @@ hc_matchfinder_slide_window(struct hc_matchfinder *mf) * time hc_matchfinder_init() or hc_matchfinder_slide_window() was called_. * @in_next * Pointer to the next byte in the input buffer to process. 
This is the - * pointer to the bytes being matched against. + * pointer to the sequence being matched against. * @best_len - * Require a match at least this long. + * Require a match longer than this length. * @max_len - * Maximum match length to return. + * The maximum permissible match length at this position. * @nice_len * Stop searching if a match of at least this length is found. + * Must be <= @max_len. * @max_search_depth - * Limit on the number of potential matches to consider. + * Limit on the number of potential matches to consider. Must be >= 1. * @offset_ret - * The match offset is returned here. + * If a match is found, its offset is returned in this location. * * Return the length of the match found, or 'best_len' if no match longer than * 'best_len' was found. @@ -102,61 +168,57 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, const u8 *best_matchptr = best_matchptr; /* uninitialized */ const u8 *matchptr; unsigned len; - unsigned hash; - pos_t cur_match; u32 first_3_bytes; + u32 hash; + pos_t cur_node; - /* Insert the current sequence into the appropriate hash chain. */ - if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES)) + /* Insert the current sequence into the appropriate linked list. */ + if (unlikely(max_len < LOAD_U24_REQUIRED_NBYTES)) goto out; first_3_bytes = load_u24_unaligned(in_next); - hash = lz_hash3_u24(first_3_bytes, HC_MATCHFINDER_HASH_ORDER); - cur_match = mf->hash_tab[hash]; - mf->next_tab[in_next - in_base] = cur_match; + hash = lz_hash(first_3_bytes, HC_MATCHFINDER_HASH_ORDER); + cur_node = mf->hash_tab[hash]; + mf->next_tab[in_next - in_base] = cur_node; mf->hash_tab[hash] = in_next - in_base; if (unlikely(best_len >= max_len)) goto out; - /* Search the appropriate hash chain for matches. */ + /* Search the appropriate linked list for matches. 
*/ - if (!(matchfinder_match_in_window(cur_match, in_base, in_next))) + if (!(matchfinder_node_valid(cur_node, in_base, in_next))) goto out; if (best_len < 3) { for (;;) { /* No length 3 match found yet. * Check the first 3 bytes. */ - matchptr = &in_base[cur_match]; + matchptr = &in_base[cur_node]; if (load_u24_unaligned(matchptr) == first_3_bytes) break; - /* Not a match; keep trying. */ - cur_match = mf->next_tab[ - matchfinder_slot_for_match(cur_match)]; - if (!matchfinder_match_in_window(cur_match, - in_base, in_next)) - goto out; - if (!--depth_remaining) + /* The first 3 bytes did not match. Keep trying. */ + cur_node = mf->next_tab[ + matchfinder_slot_for_match(cur_node)]; + if (!matchfinder_node_valid(cur_node, in_base, in_next) || + !--depth_remaining) goto out; } - /* Found a length 3 match. */ + /* Found a match of length >= 3. Extend it to its full length. */ best_matchptr = matchptr; best_len = lz_extend(in_next, best_matchptr, 3, max_len); if (best_len >= nice_len) goto out; - cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; - if (!matchfinder_match_in_window(cur_match, in_base, in_next)) - goto out; - if (!--depth_remaining) + cur_node = mf->next_tab[matchfinder_slot_for_match(cur_node)]; + if (!matchfinder_node_valid(cur_node, in_base, in_next) || !--depth_remaining) goto out; } for (;;) { for (;;) { - matchptr = &in_base[cur_match]; + matchptr = &in_base[cur_node]; /* Already found a length 3 match. Try for a longer match; * start by checking the last 2 bytes and the first 4 bytes. 
*/ @@ -170,17 +232,16 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, #endif break; - cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; - if (!matchfinder_match_in_window(cur_match, in_base, in_next)) - goto out; - if (!--depth_remaining) + cur_node = mf->next_tab[matchfinder_slot_for_match(cur_node)]; + if (!matchfinder_node_valid(cur_node, in_base, in_next) || !--depth_remaining) goto out; } - if (UNALIGNED_ACCESS_IS_FAST) - len = 4; - else - len = 0; + #if UNALIGNED_ACCESS_IS_FAST + len = 4; + #else + len = 0; + #endif len = lz_extend(in_next, matchptr, len, max_len); if (len > best_len) { best_len = len; @@ -188,10 +249,8 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, if (best_len >= nice_len) goto out; } - cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; - if (!matchfinder_match_in_window(cur_match, in_base, in_next)) - goto out; - if (!--depth_remaining) + cur_node = mf->next_tab[matchfinder_slot_for_match(cur_node)]; + if (!matchfinder_node_valid(cur_node, in_base, in_next) || !--depth_remaining) goto out; } out: @@ -200,7 +259,7 @@ out: } /* - * Advance the match-finder, but don't search for matches. + * Advance the matchfinder, but don't search for matches. * * @mf * The matchfinder structure. @@ -212,7 +271,7 @@ out: * @in_end * Pointer to the end of the input buffer. * @count - * Number of bytes to skip; must be > 0. + * The number of bytes to advance. Must be > 0. 
*/ static inline void hc_matchfinder_skip_positions(struct hc_matchfinder * restrict mf, @@ -221,13 +280,13 @@ hc_matchfinder_skip_positions(struct hc_matchfinder * restrict mf, const u8 *in_end, unsigned count) { - unsigned hash; + u32 hash; - if (unlikely(in_next + count >= in_end - LZ_HASH_REQUIRED_NBYTES)) + if (unlikely(in_next + count >= in_end - LZ_HASH3_REQUIRED_NBYTES)) return; do { - hash = lz_hash3(in_next, HC_MATCHFINDER_HASH_ORDER); + hash = lz_hash_3_bytes(in_next, HC_MATCHFINDER_HASH_ORDER); mf->next_tab[in_next - in_base] = mf->hash_tab[hash]; mf->hash_tab[hash] = in_next - in_base; in_next++; diff --git a/src/lz_extend.h b/src/lz_extend.h index 94b281a..be5a677 100644 --- a/src/lz_extend.h +++ b/src/lz_extend.h @@ -24,12 +24,12 @@ lz_extend(const u8 * const strptr, const u8 * const matchptr, if (likely(max_len - len >= 4 * WORDSIZE)) { - #define COMPARE_WORD_STEP \ - v_word = load_word_unaligned(&matchptr[len]) ^ \ - load_word_unaligned(&strptr[len]); \ - if (v_word != 0) \ - goto word_differs; \ - len += WORDSIZE; \ + #define COMPARE_WORD_STEP \ + v_word = load_word_unaligned(&matchptr[len]) ^ \ + load_word_unaligned(&strptr[len]); \ + if (v_word != 0) \ + goto word_differs; \ + len += WORDSIZE; \ COMPARE_WORD_STEP COMPARE_WORD_STEP diff --git a/src/lz_hash.h b/src/lz_hash.h new file mode 100644 index 0000000..419baa6 --- /dev/null +++ b/src/lz_hash.h @@ -0,0 +1,41 @@ +/* + * lz_hash.h + * + * Hashing for Lempel-Ziv matchfinding. + */ + +#ifndef _LZ_HASH_H +#define _LZ_HASH_H + +#include "unaligned.h" + +/* + * The hash function: given a sequence prefix held in the low-order bits of a + * 32-bit value, multiply by a carefully-chosen large constant. Discard any + * bits of the product that don't fit in a 32-bit value, but take the + * next-highest @num_bits bits of the product as the hash value, as those have + * the most randomness. 
+ */ +static inline u32 +lz_hash(u32 seq, unsigned num_bits) +{ + return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); +} + +/* + * Hash the 3-byte sequence beginning at @p, producing a hash of length + * @num_bits bits. At least LZ_HASH3_REQUIRED_NBYTES bytes of data must be + * available at @p; note that this may be more than 3. + */ +static inline u32 +lz_hash_3_bytes(const u8 *p, unsigned num_bits) +{ + u32 seq = load_u24_unaligned(p); + if (num_bits >= 24) + return seq; + return lz_hash(seq, num_bits); +} + +#define LZ_HASH3_REQUIRED_NBYTES LOAD_U24_REQUIRED_NBYTES + +#endif /* _LZ_HASH_H */ diff --git a/src/lz_hash3.h b/src/lz_hash3.h deleted file mode 100644 index ec322d9..0000000 --- a/src/lz_hash3.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * lz_hash3.h - * - * 3-byte hashing for Lempel-Ziv matchfinding. - */ - -#pragma once - -#include "unaligned.h" - -static inline u32 -loaded_u32_to_u24(u32 v) -{ - if (CPU_IS_LITTLE_ENDIAN) - return v & 0xFFFFFF; - else - return v >> 8; -} - -static inline u32 -load_u24_unaligned(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return loaded_u32_to_u24(load_u32_unaligned(p)); - else - return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); -} - -static inline u32 -lz_hash3_u24(u32 str, unsigned num_bits) -{ - return (u32)(str * 0x1E35A7BD) >> (32 - num_bits); -} - -/* - * Hash the next 3-byte sequence in the window, producing a hash of length - * 'num_bits' bits. At least LZ_HASH_REQUIRED_NBYTES must be available at 'p'; - * this might be 4 bytes rather than 3 because an unaligned load is faster on - * some architectures. - */ -static inline u32 -lz_hash3(const u8 *p, unsigned num_bits) -{ - return lz_hash3_u24(load_u24_unaligned(p), num_bits); -} - -/* Number of bytes the hash function actually requires be available, due to the - * possibility of an unaligned load. */ -#define LZ_HASH_REQUIRED_NBYTES (UNALIGNED_ACCESS_IS_FAST ? 
4 : 3) diff --git a/src/matchfinder_avx2.h b/src/matchfinder_avx2.h index fe98b63..3bdb1a9 100644 --- a/src/matchfinder_avx2.h +++ b/src/matchfinder_avx2.h @@ -16,9 +16,9 @@ matchfinder_init_avx2(pos_t *data, size_t size) return false; if (sizeof(pos_t) == 2) - v = _mm256_set1_epi16(MATCHFINDER_INITVAL); + v = _mm256_set1_epi16((u16)MATCHFINDER_NULL); else if (sizeof(pos_t) == 4) - v = _mm256_set1_epi32(MATCHFINDER_INITVAL); + v = _mm256_set1_epi32((u32)MATCHFINDER_NULL); else return false; diff --git a/src/matchfinder_common.h b/src/matchfinder_common.h index 2a01336..1ffbed9 100644 --- a/src/matchfinder_common.h +++ b/src/matchfinder_common.h @@ -60,7 +60,7 @@ static inline bool matchfinder_memset_init_okay(void) { /* All bytes must match in order to use memset. */ - const pos_t v = MATCHFINDER_INITVAL; + const pos_t v = MATCHFINDER_NULL; if (sizeof(pos_t) == 2) return (u8)v == (u8)(v >> 8); if (sizeof(pos_t) == 4) @@ -93,12 +93,12 @@ matchfinder_init(pos_t *data, size_t num_entries) #endif if (matchfinder_memset_init_okay()) { - memset(data, (u8)MATCHFINDER_INITVAL, size); + memset(data, (u8)MATCHFINDER_NULL, size); return; } for (size_t i = 0; i < num_entries; i++) - data[i] = MATCHFINDER_INITVAL; + data[i] = MATCHFINDER_NULL; } #if MATCHFINDER_IS_SLIDING diff --git a/src/matchfinder_nonsliding.h b/src/matchfinder_nonsliding.h index e08f461..4717214 100644 --- a/src/matchfinder_nonsliding.h +++ b/src/matchfinder_nonsliding.h @@ -16,12 +16,12 @@ typedef u32 pos_t; /* Not all the bits of the position type are needed, so the sign bit can be * reserved to mean "out of bounds". 
*/ -#define MATCHFINDER_INITVAL ((pos_t)-1) +#define MATCHFINDER_NULL ((pos_t)-1) static inline bool -matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) +matchfinder_node_valid(pos_t cur_node, const u8 *in_base, const u8 *in_next) { - return !(cur_match & ((pos_t)1 << (sizeof(pos_t) * 8 - 1))); + return !(cur_node & ((pos_t)1 << (sizeof(pos_t) * 8 - 1))); } #else @@ -30,18 +30,18 @@ matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_nex * This prevents the beginning of the buffer from matching anything; however, * this doesn't matter much. */ -#define MATCHFINDER_INITVAL ((pos_t)0) +#define MATCHFINDER_NULL ((pos_t)0) static inline bool -matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) +matchfinder_node_valid(pos_t cur_node, const u8 *in_base, const u8 *in_next) { - return cur_match != 0; + return cur_node != 0; } #endif static inline pos_t -matchfinder_slot_for_match(pos_t cur_match) +matchfinder_slot_for_match(pos_t cur_node) { - return cur_match; + return cur_node; } diff --git a/src/matchfinder_sliding.h b/src/matchfinder_sliding.h index 4b8a515..2fb715a 100644 --- a/src/matchfinder_sliding.h +++ b/src/matchfinder_sliding.h @@ -13,18 +13,18 @@ typedef s16 pos_t; typedef s32 pos_t; #endif -#define MATCHFINDER_INITVAL ((pos_t)-MATCHFINDER_WINDOW_SIZE) +#define MATCHFINDER_NULL ((pos_t)-MATCHFINDER_WINDOW_SIZE) /* In the sliding window case, positions are stored relative to 'in_base'. 
*/ static inline bool -matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) +matchfinder_node_valid(pos_t cur_node, const u8 *in_base, const u8 *in_next) { - return cur_match > (pos_t)((in_next - in_base) - MATCHFINDER_WINDOW_SIZE); + return cur_node > (pos_t)((in_next - in_base) - MATCHFINDER_WINDOW_SIZE); } static inline pos_t -matchfinder_slot_for_match(pos_t cur_match) +matchfinder_slot_for_match(pos_t cur_node) { - return cur_match & (MATCHFINDER_WINDOW_SIZE - 1); + return cur_node & (MATCHFINDER_WINDOW_SIZE - 1); } diff --git a/src/matchfinder_sse2.h b/src/matchfinder_sse2.h index cc27600..574f9b2 100644 --- a/src/matchfinder_sse2.h +++ b/src/matchfinder_sse2.h @@ -16,9 +16,9 @@ matchfinder_init_sse2(pos_t *data, size_t size) return false; if (sizeof(pos_t) == 2) - v = _mm_set1_epi16(MATCHFINDER_INITVAL); + v = _mm_set1_epi16((u16)MATCHFINDER_NULL); else if (sizeof(pos_t) == 4) - v = _mm_set1_epi32(MATCHFINDER_INITVAL); + v = _mm_set1_epi32((u32)MATCHFINDER_NULL); else return false; diff --git a/src/types.h b/src/types.h index 205dbd3..eff0404 100644 --- a/src/types.h +++ b/src/types.h @@ -6,9 +6,9 @@ #pragma once -#include #include #include +#include typedef uint8_t u8; typedef uint16_t u16; diff --git a/src/unaligned.h b/src/unaligned.h index d5b0f95..24a588f 100644 --- a/src/unaligned.h +++ b/src/unaligned.h @@ -1,7 +1,7 @@ /* * unaligned.h * - * Inline functions for unaligned memory access. + * Inline functions for unaligned memory accesses. */ #pragma once @@ -214,3 +214,36 @@ put_unaligned_u32_be(u32 v, void *p) p8[3] = (v >> 0) & 0xFF; } } + +/* + * Given a 32-bit value that was loaded with the platform's native endianness, + * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 + * bits contain the first 3 bytes, arranged in octets in a platform-dependent + * order, at the memory location from which the input 32-bit value was loaded. 
+ */ +static inline u32 +loaded_u32_to_u24(u32 v) +{ + if (CPU_IS_LITTLE_ENDIAN) + return v & 0xFFFFFF; + else + return v >> 8; +} + +/* + * Load the next 3 bytes from the memory location @p into the 24 low-order bits + * of a 32-bit value. The order in which the 3 bytes will be arranged as octets + * in the 24 bits is platform-dependent. At least LOAD_U24_REQUIRED_NBYTES + * bytes must be available at @p; note that this may be more than 3. + */ +static inline u32 +load_u24_unaligned(const u8 *p) +{ +#if UNALIGNED_ACCESS_IS_FAST +# define LOAD_U24_REQUIRED_NBYTES 4 + return loaded_u32_to_u24(load_u32_unaligned(p)); +#else +# define LOAD_U24_REQUIRED_NBYTES 3 + return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); +#endif +} diff --git a/test/benchmark.c b/test/benchmark.c index 205060e..7b020f9 100644 --- a/test/benchmark.c +++ b/test/benchmark.c @@ -1,11 +1,9 @@ /* * benchmark.c - A compression testing and benchmark program. * - * The author dedicates this file to the public domain. - * You can do whatever you want with this file. + * This file has no copyright assigned and is placed in the Public Domain. */ - #define _FILE_OFFSET_BITS 64 #define _GNU_SOURCE @@ -419,9 +417,9 @@ main(int argc, char **argv) wrapper == NO_WRAPPER ? "None" : wrapper == ZLIB_WRAPPER ? "zlib" : "gzip"); printf("\tCompression engine: %s\n", - compress_with_libz ? "zlib" : "libdeflate"); + compress_with_libz ? "libz" : "libdeflate"); printf("\tDecompression engine: %s\n", - decompress_with_libz ? "zlib" : "libdeflate"); + decompress_with_libz ? "libz" : "libdeflate"); ubuf1 = malloc(chunk_size); ubuf2 = malloc(chunk_size);