diff --git a/lib/arm/matchfinder_impl.h b/lib/arm/matchfinder_impl.h index aa1a0c7..9e711cc 100644 --- a/lib/arm/matchfinder_impl.h +++ b/lib/arm/matchfinder_impl.h @@ -26,68 +26,56 @@ */ #ifdef __ARM_NEON -# if MATCHFINDER_ALIGNMENT < 16 -# undef MATCHFINDER_ALIGNMENT -# define MATCHFINDER_ALIGNMENT 16 -# endif # include -static forceinline bool +static forceinline void matchfinder_init_neon(mf_pos_t *data, size_t size) { - int16x8_t v, *p; - size_t n; - - if (size % (sizeof(int16x8_t) * 4) != 0) - return false; - - STATIC_ASSERT(sizeof(mf_pos_t) == 2); - v = (int16x8_t) { + int16x8_t *p = (int16x8_t *)data; + int16x8_t v = (int16x8_t) { MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, }; - p = (int16x8_t *)data; - n = size / (sizeof(int16x8_t) * 4); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + do { p[0] = v; p[1] = v; p[2] = v; p[3] = v; p += 4; - } while (--n); - return true; + size -= 4 * sizeof(*p); + } while (size != 0); } -#undef arch_matchfinder_init -#define arch_matchfinder_init matchfinder_init_neon +#define matchfinder_init matchfinder_init_neon -static forceinline bool +static forceinline void matchfinder_rebase_neon(mf_pos_t *data, size_t size) { - int16x8_t v, *p; - size_t n; - - if (size % (sizeof(int16x8_t) * 4) != 0) - return false; - - STATIC_ASSERT(sizeof(mf_pos_t) == 2); - v = (int16x8_t) { + int16x8_t *p = (int16x8_t *)data; + int16x8_t v = (int16x8_t) { (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, }; - p = (int16x8_t *)data; - n = size / (sizeof(int16x8_t) * 4); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + do { p[0] = vqaddq_s16(p[0], v); p[1] = vqaddq_s16(p[1], v); p[2] = vqaddq_s16(p[2], v); p[3] = vqaddq_s16(p[3], v); p += 4; - } while (--n); - return true; + size -= 4 * sizeof(*p); + } while (size != 0); } -#undef arch_matchfinder_rebase -#define arch_matchfinder_rebase matchfinder_rebase_neon +#define matchfinder_rebase matchfinder_rebase_neon #endif /* __ARM_NEON */ diff --git a/lib/bt_matchfinder.h b/lib/bt_matchfinder.h index 49fc0bf..546a19d 100644 --- a/lib/bt_matchfinder.h +++ b/lib/bt_matchfinder.h @@ -71,9 +71,9 @@ #define BT_MATCHFINDER_HASH3_WAYS 2 #define BT_MATCHFINDER_HASH4_ORDER 16 -#define BT_MATCHFINDER_TOTAL_HASH_LENGTH \ - ((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \ - (1UL << BT_MATCHFINDER_HASH4_ORDER)) +#define BT_MATCHFINDER_TOTAL_HASH_SIZE \ + (((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \ + (1UL << BT_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t)) /* Representation of a match found by the bt_matchfinder */ struct lz_match { @@ -101,7 +101,7 @@ struct bt_matchfinder { } #ifdef _aligned_attribute -_aligned_attribute(MATCHFINDER_ALIGNMENT) +_aligned_attribute(MATCHFINDER_MEM_ALIGNMENT) #endif ; @@ -109,14 +109,18 @@ _aligned_attribute(MATCHFINDER_ALIGNMENT) static forceinline void bt_matchfinder_init(struct bt_matchfinder *mf) { - matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_LENGTH); + STATIC_ASSERT(BT_MATCHFINDER_TOTAL_HASH_SIZE % + MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_SIZE); } static forceinline void bt_matchfinder_slide_window(struct bt_matchfinder *mf) { - matchfinder_rebase((mf_pos_t *)mf, - sizeof(struct bt_matchfinder) / sizeof(mf_pos_t)); + STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf)); } static forceinline mf_pos_t * diff --git a/lib/deflate_compress.c b/lib/deflate_compress.c index 36572f7..cf43798 100644 --- a/lib/deflate_compress.c +++ b/lib/deflate_compress.c @@ -2704,7 +2704,7 @@ libdeflate_alloc_compressor(int compression_level) size += sizeof(c->p.g); #endif - c = libdeflate_aligned_malloc(MATCHFINDER_ALIGNMENT, size); + c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size); if (!c) return NULL; diff --git a/lib/hc_matchfinder.h b/lib/hc_matchfinder.h index 8412a6f..becf844 100644 --- a/lib/hc_matchfinder.h +++ b/lib/hc_matchfinder.h @@ -111,9 +111,9 @@ #define HC_MATCHFINDER_HASH3_ORDER 15 #define HC_MATCHFINDER_HASH4_ORDER 16 -#define HC_MATCHFINDER_TOTAL_HASH_LENGTH \ - ((1UL << HC_MATCHFINDER_HASH3_ORDER) + \ - (1UL << HC_MATCHFINDER_HASH4_ORDER)) +#define HC_MATCHFINDER_TOTAL_HASH_SIZE \ + (((1UL << HC_MATCHFINDER_HASH3_ORDER) + \ + (1UL << HC_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t)) struct hc_matchfinder { @@ -130,7 +130,7 @@ struct hc_matchfinder { } #ifdef _aligned_attribute - _aligned_attribute(MATCHFINDER_ALIGNMENT) + _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT) #endif ; @@ -138,14 +138,18 @@ struct hc_matchfinder { static forceinline void hc_matchfinder_init(struct hc_matchfinder *mf) { - matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_LENGTH); + STATIC_ASSERT(HC_MATCHFINDER_TOTAL_HASH_SIZE % + MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_SIZE); } static forceinline void hc_matchfinder_slide_window(struct hc_matchfinder *mf) { - matchfinder_rebase((mf_pos_t *)mf, - sizeof(struct hc_matchfinder) / sizeof(mf_pos_t)); + STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf)); } /* diff --git a/lib/matchfinder_common.h b/lib/matchfinder_common.h index edd9fb7..1f18f8b 100644 --- a/lib/matchfinder_common.h +++ b/lib/matchfinder_common.h @@ -18,11 +18,15 @@ typedef s16 mf_pos_t; #define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE) -#define MATCHFINDER_ALIGNMENT 8 - -#define arch_matchfinder_init(data, size) false -#define arch_matchfinder_rebase(data, size) false +/* + * Required alignment of the matchfinder buffer pointer and size. The values + * here come from the AVX-2 implementation, which is the worst case. + */ +#define MATCHFINDER_MEM_ALIGNMENT 32 +#define MATCHFINDER_SIZE_ALIGNMENT 128 +#undef matchfinder_init +#undef matchfinder_rebase #ifdef _aligned_attribute # if defined(__arm__) || defined(__aarch64__) # include "arm/matchfinder_impl.h" @@ -36,19 +40,20 @@ typedef s16 mf_pos_t; * * Essentially, this is an optimized memset(). * - * 'data' must be aligned to a MATCHFINDER_ALIGNMENT boundary. + * 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and + * 'size' must be a multiple of MATCHFINDER_SIZE_ALIGNMENT. */ +#ifndef matchfinder_init static forceinline void -matchfinder_init(mf_pos_t *data, size_t num_entries) +matchfinder_init(mf_pos_t *data, size_t size) { + size_t num_entries = size / sizeof(*data); size_t i; - if (arch_matchfinder_init(data, num_entries * sizeof(data[0]))) - return; - for (i = 0; i < num_entries; i++) data[i] = MATCHFINDER_INITVAL; } +#endif /* * Slide the matchfinder by WINDOW_SIZE bytes. @@ -71,14 +76,13 @@ matchfinder_init(mf_pos_t *data, size_t num_entries) * of "hash chains", and 2-ary in the case of "binary trees". In either case, * the links need to be rebased in the same way. */ +#ifndef matchfinder_rebase static forceinline void -matchfinder_rebase(mf_pos_t *data, size_t num_entries) +matchfinder_rebase(mf_pos_t *data, size_t size) { + size_t num_entries = size / sizeof(*data); size_t i; - if (arch_matchfinder_rebase(data, num_entries * sizeof(data[0]))) - return; - if (MATCHFINDER_WINDOW_SIZE == 32768) { /* Branchless version for 32768 byte windows. If the value was * already negative, clear all bits except the sign bit; this @@ -101,6 +105,7 @@ matchfinder_rebase(mf_pos_t *data, size_t num_entries) data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; } } +#endif /* * The hash function: given a sequence prefix held in the low-order bits of a diff --git a/lib/x86/matchfinder_impl.h b/lib/x86/matchfinder_impl.h index 735bb48..22a6c82 100644 --- a/lib/x86/matchfinder_impl.h +++ b/lib/x86/matchfinder_impl.h @@ -26,47 +26,38 @@ */ #ifdef __AVX2__ -# if MATCHFINDER_ALIGNMENT < 32 -# undef MATCHFINDER_ALIGNMENT -# define MATCHFINDER_ALIGNMENT 32 -# endif # include -static forceinline bool +static forceinline void matchfinder_init_avx2(mf_pos_t *data, size_t size) { - __m256i v, *p; - size_t n; - - if (size % (sizeof(__m256i) * 4) != 0) - return false; + __m256i *p = (__m256i *)data; + __m256i v = _mm256_set1_epi16(MATCHFINDER_INITVAL); + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); STATIC_ASSERT(sizeof(mf_pos_t) == 2); - v = _mm256_set1_epi16(MATCHFINDER_INITVAL); - p = (__m256i *)data; - n = size / (sizeof(__m256i) * 4); + do { p[0] = v; p[1] = v; p[2] = v; p[3] = v; p += 4; - } while (--n); - return true; + size -= 4 * sizeof(*p); + } while (size != 0); } +#define matchfinder_init matchfinder_init_avx2 -static forceinline bool +static forceinline void matchfinder_rebase_avx2(mf_pos_t *data, size_t size) { - __m256i v, *p; - size_t n; - - if (size % (sizeof(__m256i) * 4) != 0) - return false; + __m256i *p = (__m256i *)data; + __m256i v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE); + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); STATIC_ASSERT(sizeof(mf_pos_t) == 2); - v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE); - p = (__m256i *)data; - n = size / (sizeof(__m256i) * 4); + do { /* PADDSW: Add Packed Signed Integers With Signed Saturation */ p[0] = _mm256_adds_epi16(p[0], v); @@ -74,53 +65,44 @@ matchfinder_rebase_avx2(mf_pos_t *data, size_t size) p[2] = _mm256_adds_epi16(p[2], v); p[3] = _mm256_adds_epi16(p[3], v); p += 4; - } while (--n); - return true; + size -= 4 * sizeof(*p); + } while (size != 0); } -#endif /* __AVX2__ */ +#define matchfinder_rebase matchfinder_rebase_avx2 -#ifdef __SSE2__ -# if MATCHFINDER_ALIGNMENT < 16 -# undef MATCHFINDER_ALIGNMENT -# define MATCHFINDER_ALIGNMENT 16 -# endif +#elif defined(__SSE2__) # include -static forceinline bool +static forceinline void matchfinder_init_sse2(mf_pos_t *data, size_t size) { - __m128i v, *p; - size_t n; - - if (size % (sizeof(__m128i) * 4) != 0) - return false; + __m128i *p = (__m128i *)data; + __m128i v = _mm_set1_epi16(MATCHFINDER_INITVAL); + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); STATIC_ASSERT(sizeof(mf_pos_t) == 2); - v = _mm_set1_epi16(MATCHFINDER_INITVAL); - p = (__m128i *)data; - n = size / (sizeof(__m128i) * 4); + do { p[0] = v; p[1] = v; p[2] = v; p[3] = v; p += 4; - } while (--n); - return true; + size -= 4 * sizeof(*p); + } while (size != 0); } +#define matchfinder_init matchfinder_init_sse2 -static forceinline bool +static forceinline void matchfinder_rebase_sse2(mf_pos_t *data, size_t size) { - __m128i v, *p; - size_t n; - - if (size % (sizeof(__m128i) * 4) != 0) - return false; + __m128i *p = (__m128i *)data; + __m128i v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE); + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); STATIC_ASSERT(sizeof(mf_pos_t) == 2); - v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE); - p = (__m128i *)data; - n = size / (sizeof(__m128i) * 4); + do { /* PADDSW: Add Packed Signed Integers With Signed Saturation */ p[0] = _mm_adds_epi16(p[0], v); @@ -128,37 +110,8 @@ matchfinder_rebase_sse2(mf_pos_t *data, size_t size) p[2] = _mm_adds_epi16(p[2], v); p[3] = _mm_adds_epi16(p[3], v); p += 4; - } while (--n); - return true; + size -= 4 * sizeof(*p); + } while (size != 0); } +#define matchfinder_rebase matchfinder_rebase_sse2 #endif /* __SSE2__ */ - -#undef arch_matchfinder_init -static forceinline bool -arch_matchfinder_init(mf_pos_t *data, size_t size) -{ -#ifdef __AVX2__ - if (matchfinder_init_avx2(data, size)) - return true; -#endif -#ifdef __SSE2__ - if (matchfinder_init_sse2(data, size)) - return true; -#endif - return false; -} - -#undef arch_matchfinder_rebase -static forceinline bool -arch_matchfinder_rebase(mf_pos_t *data, size_t size) -{ -#ifdef __AVX2__ - if (matchfinder_rebase_avx2(data, size)) - return true; -#endif -#ifdef __SSE2__ - if (matchfinder_rebase_sse2(data, size)) - return true; -#endif - return false; -}