diff --git a/lib/ht_matchfinder.h b/lib/ht_matchfinder.h
new file mode 100644
index 0000000..e8323c3
--- /dev/null
+++ b/lib/ht_matchfinder.h
@@ -0,0 +1,234 @@
+/*
+ * ht_matchfinder.h - Lempel-Ziv matchfinding with a hash table
+ *
+ * Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ---------------------------------------------------------------------------
+ *
+ * This is a Hash Table (ht) matchfinder.
+ *
+ * This is a variant of the Hash Chains (hc) matchfinder that is optimized for
+ * very fast compression. The ht_matchfinder stores the hash chains inline in
+ * the hash table, whereas the hc_matchfinder stores them in a separate array.
+ * Storing the hash chains inline is the faster method when max_search_depth
+ * (the maximum chain length) is very small. It is not appropriate when
+ * max_search_depth is larger, as then it uses too much memory.
+ *
+ * Due to its focus on speed, the ht_matchfinder doesn't support length 3
+ * matches. It also doesn't allow max_search_depth to vary at runtime; it is
+ * fixed at build time as HT_MATCHFINDER_BUCKET_SIZE.
+ *
+ * See hc_matchfinder.h for more information.
+ */
+
+#ifndef LIB_HT_MATCHFINDER_H
+#define LIB_HT_MATCHFINDER_H
+
+#include "matchfinder_common.h"
+
+#define HT_MATCHFINDER_HASH_ORDER	15	/* log2 of the number of buckets in hash_tab */
+#define HT_MATCHFINDER_BUCKET_SIZE	2	/* positions kept per bucket (fixed search depth) */
+
+#define HT_MATCHFINDER_MIN_MATCH_LEN	4	/* candidates must match on their first 4 bytes */
+/* Minimum value of max_len for ht_matchfinder_longest_match() */
+#define HT_MATCHFINDER_REQUIRED_NBYTES	5	/* that function hashes 4 bytes at in_next + 1 */
+
+struct ht_matchfinder {
+	mf_pos_t hash_tab[1UL << HT_MATCHFINDER_HASH_ORDER]	/* indexed by hash value */
+			 [HT_MATCHFINDER_BUCKET_SIZE];		/* slot 0 is the newest entry */
+} MATCHFINDER_ALIGNED;
+/* Initialize the matchfinder by passing the whole table to matchfinder_init(). */
+static forceinline void
+ht_matchfinder_init(struct ht_matchfinder *mf)
+{
+	STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0);
+
+	matchfinder_init((mf_pos_t *)mf, sizeof(*mf));	/* the struct holds nothing but hash_tab */
+}
+/* Rebase the table; both callers then advance the base pointer by MATCHFINDER_WINDOW_SIZE. */
+static forceinline void
+ht_matchfinder_slide_window(struct ht_matchfinder *mf)
+{
+	matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf));
+}
+/* Check the bucket selected for in_next for a match, and insert in_next into that bucket. */
+/* Note: max_len must be >= HT_MATCHFINDER_REQUIRED_NBYTES */
+static forceinline u32
+ht_matchfinder_longest_match(struct ht_matchfinder * const restrict mf,
+			     const u8 ** const restrict in_base_p, /* in/out: window base */
+			     const u8 * const restrict in_next,	/* current position */
+			     const u32 max_len,		/* limit given to lz_extend() */
+			     const u32 nice_len,	/* a match this long ends the search */
+			     u32 * const restrict next_hash, /* in: bucket for in_next; out: for in_next + 1 */
+			     u32 * const restrict offset_ret) /* out: in_next minus the match start */
+{
+	u32 best_len = 0;
+	const u8 *best_matchptr = in_next;	/* stays at in_next until a match is found */
+	u32 cur_pos = in_next - *in_base_p;	/* position relative to the window base */
+	const u8 *in_base;
+	mf_pos_t cutoff;	/* bucket entries <= cutoff are not considered */
+	u32 hash;		/* bucket index used for in_next */
+	u32 seq;		/* the 4 bytes loaded from in_next */
+	mf_pos_t cur_node;	/* candidate position read from the bucket */
+	const u8 *matchptr;
+#if HT_MATCHFINDER_BUCKET_SIZE > 1
+	mf_pos_t to_insert;	/* value moving into the next slot */
+	u32 len;
+#endif
+#if HT_MATCHFINDER_BUCKET_SIZE > 2
+	int i;
+#endif
+
+	/* This is assumed throughout this function. */
+	STATIC_ASSERT(HT_MATCHFINDER_MIN_MATCH_LEN == 4);
+
+	if (cur_pos == MATCHFINDER_WINDOW_SIZE) {	/* window boundary: rebase, restart at 0 */
+		ht_matchfinder_slide_window(mf);
+		*in_base_p += MATCHFINDER_WINDOW_SIZE;
+		cur_pos = 0;
+	}
+	in_base = *in_base_p;
+	cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
+
+	hash = *next_hash;
+	STATIC_ASSERT(HT_MATCHFINDER_REQUIRED_NBYTES == 5);	/* 1 + the 4-byte load below */
+	*next_hash = lz_hash(get_unaligned_le32(in_next + 1),
+			     HT_MATCHFINDER_HASH_ORDER);
+	seq = load_u32_unaligned(in_next);
+	prefetchw(&mf->hash_tab[*next_hash]);	/* bucket that in_next + 1 will use */
+#if HT_MATCHFINDER_BUCKET_SIZE == 1
+	/* Hand-unrolled version for BUCKET_SIZE == 1 */
+	cur_node = mf->hash_tab[hash][0];
+	mf->hash_tab[hash][0] = cur_pos;
+	if (cur_node <= cutoff)
+		goto out;
+	matchptr = &in_base[cur_node];
+	if (load_u32_unaligned(matchptr) == seq) {
+		best_len = lz_extend(in_next, matchptr, 4, max_len);
+		best_matchptr = matchptr;
+	}
+#elif HT_MATCHFINDER_BUCKET_SIZE == 2
+	/*
+	 * Hand-unrolled version for BUCKET_SIZE == 2. The logic here also
+	 * differs slightly in that it copies the first entry to the second even
+	 * if nice_len is reached on the first, as this can be slightly faster.
+	 */
+	cur_node = mf->hash_tab[hash][0];
+	mf->hash_tab[hash][0] = cur_pos;
+	if (cur_node <= cutoff)
+		goto out;
+	matchptr = &in_base[cur_node];
+
+	to_insert = cur_node;
+	cur_node = mf->hash_tab[hash][1];
+	mf->hash_tab[hash][1] = to_insert;
+
+	if (load_u32_unaligned(matchptr) == seq) {
+		best_len = lz_extend(in_next, matchptr, 4, max_len);
+		best_matchptr = matchptr;
+		if (cur_node <= cutoff || best_len >= nice_len)
+			goto out;
+		matchptr = &in_base[cur_node];
+		if (load_u32_unaligned(matchptr) == seq &&
+		    load_u32_unaligned(matchptr + best_len - 3) ==
+		    load_u32_unaligned(in_next + best_len - 3)) { /* else it can't beat best_len */
+			len = lz_extend(in_next, matchptr, 4, max_len);
+			if (len > best_len) {
+				best_len = len;
+				best_matchptr = matchptr;
+			}
+		}
+	} else {
+		if (cur_node <= cutoff)
+			goto out;
+		matchptr = &in_base[cur_node];
+		if (load_u32_unaligned(matchptr) == seq) {
+			best_len = lz_extend(in_next, matchptr, 4, max_len);
+			best_matchptr = matchptr;
+		}
+	}
+#else
+	/* Generic version for HT_MATCHFINDER_BUCKET_SIZE > 2 */
+	to_insert = cur_pos;
+	for (i = 0; i < HT_MATCHFINDER_BUCKET_SIZE; i++) {
+		cur_node = mf->hash_tab[hash][i];
+		mf->hash_tab[hash][i] = to_insert;
+		if (cur_node <= cutoff)
+			goto out;
+		matchptr = &in_base[cur_node];
+		if (load_u32_unaligned(matchptr) == seq) {
+			len = lz_extend(in_next, matchptr, 4, max_len);
+			if (len > best_len) {
+				best_len = len;
+				best_matchptr = matchptr;
+				if (best_len >= nice_len)
+					goto out;
+			}
+		}
+		to_insert = cur_node;	/* old entry moves into the next slot */
+	}
+#endif
+out:
+	*offset_ret = in_next - best_matchptr;	/* 0 if no match was found */
+	return best_len;
+}
+/* Insert 'count' positions, starting at in_next, without searching for matches. */
+static forceinline void
+ht_matchfinder_skip_bytes(struct ht_matchfinder * const restrict mf,
+			  const u8 ** const restrict in_base_p,	/* in/out: window base */
+			  const u8 *in_next,
+			  const u8 * const in_end,	/* bound for the remaining-bytes check */
+			  const u32 count,		/* must be >= 1, see the loop below */
+			  u32 * const restrict next_hash) /* in: bucket for in_next; out: for in_next + count */
+{
+	s32 cur_pos = in_next - *in_base_p;	/* signed: can go negative after the rebase below */
+	u32 hash;
+	u32 remaining = count;
+	int i;
+
+	if (unlikely(count + HT_MATCHFINDER_REQUIRED_NBYTES > in_end - in_next))
+		return;	/* nothing is inserted and next_hash is left untouched */
+
+	if (cur_pos + count - 1 >= MATCHFINDER_WINDOW_SIZE) {	/* last position reaches the boundary */
+		ht_matchfinder_slide_window(mf);
+		*in_base_p += MATCHFINDER_WINDOW_SIZE;
+		cur_pos -= MATCHFINDER_WINDOW_SIZE;
+	}
+
+	hash = *next_hash;
+	do {	/* runs at least once; count == 0 would make 'remaining' wrap around */
+		for (i = HT_MATCHFINDER_BUCKET_SIZE - 1; i > 0; i--)	/* shift older entries down */
+			mf->hash_tab[hash][i] = mf->hash_tab[hash][i - 1];
+		mf->hash_tab[hash][0] = cur_pos;
+
+		hash = lz_hash(get_unaligned_le32(++in_next),	/* bucket for the next position */
+			       HT_MATCHFINDER_HASH_ORDER);
+		cur_pos++;
+	} while (--remaining);
+
+	prefetchw(&mf->hash_tab[hash]);
+	*next_hash = hash;
+}
+
+#endif /* LIB_HT_MATCHFINDER_H */