diff --git a/Makefile b/Makefile index dd8ea3c..a8d300b 100644 --- a/Makefile +++ b/Makefile @@ -29,6 +29,10 @@ SUPPORT_NEAR_OPTIMAL_PARSING := yes # This is faster but ***insecure***! Default to secure. UNSAFE_DECOMPRESSION := no +# Will the decompressor detect CPU features at runtime in order to run more +# optimized code? This only affects some platforms and architectures. +RUNTIME_CPU_DETECTION := yes + # The compiler and archiver CC := gcc AR := ar @@ -62,12 +66,19 @@ ifeq ($(UNSAFE_DECOMPRESSION),yes) override CFLAGS += -DUNSAFE_DECOMPRESSION=1 endif +ifeq ($(RUNTIME_CPU_DETECTION),yes) + override CFLAGS += -DRUNTIME_CPU_DETECTION=1 +endif + SRC := src/aligned_malloc.c ifeq ($(SUPPORT_COMPRESSION),yes) SRC += src/deflate_compress.c endif ifeq ($(SUPPORT_DECOMPRESSION),yes) SRC += src/deflate_decompress.c + ifeq ($(RUNTIME_CPU_DETECTION),yes) + SRC += src/x86_cpu_features.c + endif endif ifeq ($(SUPPORT_ZLIB),yes) ifeq ($(SUPPORT_COMPRESSION),yes) diff --git a/src/decompress_impl.h b/src/decompress_impl.h new file mode 100644 index 0000000..ad51d2b --- /dev/null +++ b/src/decompress_impl.h @@ -0,0 +1,364 @@ +/* + * decompress_impl.h + * + * The actual DEFLATE decompression routine, lifted out of deflate_decompress.c + * so that it can be compiled multiple times with different target instruction + * sets. + */ + +static bool ATTRIBUTES +FUNCNAME(struct deflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes) +{ + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes; + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + bitbuf_t bitbuf = 0; + unsigned bitsleft = 0; + size_t overrun_count = 0; + unsigned i; + unsigned is_final_block; + unsigned block_type; + u16 len; + u16 nlen; + unsigned num_litlen_syms; + unsigned num_offset_syms; + +next_block: + /* Starting to read the next block. */ + ; + + STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4)); + ENSURE_BITS(1 + 2 + 5 + 5 + 4); + + /* BFINAL: 1 bit */ + is_final_block = POP_BITS(1); + + /* BTYPE: 2 bits */ + block_type = POP_BITS(2); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block. */ + + /* The order in which precode lengths are stored. */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + + /* Read the codeword length counts. */ + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257); + num_litlen_syms = POP_BITS(5) + 257; + + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1); + num_offset_syms = POP_BITS(5) + 1; + + STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4); + num_explicit_precode_lens = POP_BITS(4) + 4; + + /* Read the precode codeword lengths. */ + STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); + if (CAN_ENSURE(DEFLATE_NUM_PRECODE_SYMS * 3)) { + + ENSURE_BITS(DEFLATE_NUM_PRECODE_SYMS * 3); + + for (i = 0; i < num_explicit_precode_lens; i++) + d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); + } else { + for (i = 0; i < num_explicit_precode_lens; i++) { + ENSURE_BITS(3); + d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); + } + } + + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. 
*/ + SAFETY_CHECK(build_precode_decode_table(d)); + + /* Expand the literal/length and offset codeword lengths. */ + for (i = 0; i < num_litlen_syms + num_offset_syms; ) { + u32 entry; + unsigned presym; + u8 rep_val; + unsigned rep_count; + + ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7); + + /* (The code below assumes that the precode decode table + * does not have any subtables.) */ + STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Read the next precode symbol. */ + entry = d->precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); + presym = entry >> HUFFDEC_RESULT_SHIFT; + + if (presym < 16) { + /* Explicit codeword length */ + d->lens[i++] = presym; + continue; + } + + /* Run-length encoded codeword lengths */ + + /* Note: we don't need verify that the repeat count + * doesn't overflow the number of elements, since we + * have enough extra spaces to allow for the worst-case + * overflow (138 zeroes when only 1 length was + * remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times */ + SAFETY_CHECK(i != 0); + rep_val = d->lens[i - 1]; + STATIC_ASSERT(3 + ((1 << 2) - 1) == 6); + rep_count = 3 + POP_BITS(2); + d->lens[i + 0] = rep_val; + d->lens[i + 1] = rep_val; + d->lens[i + 2] = rep_val; + d->lens[i + 3] = rep_val; + d->lens[i + 4] = rep_val; + d->lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times */ + STATIC_ASSERT(3 + ((1 << 3) - 1) == 10); + rep_count = 3 + POP_BITS(3); + d->lens[i + 0] = 0; + d->lens[i + 1] = 0; + d->lens[i + 2] = 0; + d->lens[i + 3] = 0; + d->lens[i + 4] = 0; + d->lens[i + 5] = 0; + d->lens[i + 6] = 0; + d->lens[i + 7] = 0; + d->lens[i + 8] = 0; + d->lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times */ + STATIC_ASSERT(11 + ((1 << 7) - 1) == 138); + rep_count = 11 + POP_BITS(7); + memset(&d->lens[i], 0, rep_count * sizeof(d->lens[i])); + i += rep_count; + } + } + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + + /* Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. */ + + ALIGN_INPUT(); + + SAFETY_CHECK(in_end - in_next >= 4); + + len = READ_U16(); + nlen = READ_U16(); + + SAFETY_CHECK(len == (u16)~nlen); + SAFETY_CHECK(len <= out_end - out_next); + SAFETY_CHECK(len <= in_end - in_next); + + memcpy(out_next, in_next, len); + in_next += len; + out_next += len; + + goto block_done; + + } else { + SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + + /* Static Huffman block: set the static Huffman codeword + * lengths. Then the remainder is the same as decompressing a + * dynamic Huffman block. 
*/ + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); + + for (i = 0; i < 144; i++) + d->lens[i] = 8; + for (; i < 256; i++) + d->lens[i] = 9; + for (; i < 280; i++) + d->lens[i] = 7; + for (; i < 288; i++) + d->lens[i] = 8; + + for (; i < 288 + 32; i++) + d->lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); + SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); + + /* The main DEFLATE decode loop */ + for (;;) { + u32 entry; + u32 length; + u32 offset; + + /* Decode a litlen symbol. */ + ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN); + entry = d->litlen_decode_table[BITS(LITLEN_TABLEBITS)]; + if (entry & HUFFDEC_SUBTABLE_POINTER) { + /* Litlen subtable required (uncommon case) */ + REMOVE_BITS(LITLEN_TABLEBITS); + entry = d->litlen_decode_table[ + ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + + BITS(entry & HUFFDEC_LENGTH_MASK)]; + } + REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); + if (entry & HUFFDEC_LITERAL) { + /* Literal */ + SAFETY_CHECK(out_next < out_end); + *out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT); + continue; + } + + /* Match or end-of-block */ + + entry >>= HUFFDEC_RESULT_SHIFT; + ENSURE_BITS(MAX_ENSURE); + + /* Pop the extra length bits and add them to the length base to + * produce the full length. */ + length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) + + POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK); + + /* The match destination must not end after the end of the + * output buffer. For efficiency, combine this check with the + * end-of-block check. We're using 0 for the special + * end-of-block length, so subtract 1 and it turn it into + * SIZE_MAX. */ + STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0); + if (unlikely((size_t)length - 1 > out_end - out_next)) { + SAFETY_CHECK(length == HUFFDEC_END_OF_BLOCK_LENGTH); + goto block_done; + } + + /* Decode the match offset. */ + + entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)]; + if (entry & HUFFDEC_SUBTABLE_POINTER) { + /* Offset subtable required (uncommon case) */ + REMOVE_BITS(OFFSET_TABLEBITS); + entry = d->offset_decode_table[ + ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + + BITS(entry & HUFFDEC_LENGTH_MASK)]; + } + REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); + entry >>= HUFFDEC_RESULT_SHIFT; + + STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS + + DEFLATE_MAX_OFFSET_CODEWORD_LEN) && + CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS)); + if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS + + DEFLATE_MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS)) + ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS); + + /* Pop the extra offset bits and add them to the offset base to + * produce the full offset. */ + offset = (entry & HUFFDEC_OFFSET_BASE_MASK) + + POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT); + + /* The match source must not begin before the beginning of the + * output buffer. */ + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + + /* Copy the match: 'length' bytes at 'out_next - offset' to + * 'out_next'. */ + + if (UNALIGNED_ACCESS_IS_FAST && + length <= (3 * WORDSIZE) && + offset >= WORDSIZE && + length + (3 * WORDSIZE) <= out_end - out_next) + { + /* Fast case: short length, no overlaps if we copy one + * word at a time, and we aren't getting too close to + * the end of the output array. 
*/ + copy_word_unaligned(out_next - offset + (0 * WORDSIZE), + out_next + (0 * WORDSIZE)); + copy_word_unaligned(out_next - offset + (1 * WORDSIZE), + out_next + (1 * WORDSIZE)); + copy_word_unaligned(out_next - offset + (2 * WORDSIZE), + out_next + (2 * WORDSIZE)); + } else { + const u8 *src = out_next - offset; + u8 *dst = out_next; + u8 *end = out_next + length; + + if (UNALIGNED_ACCESS_IS_FAST && + likely(out_end - end >= WORDSIZE - 1)) { + if (offset >= WORDSIZE) { + copy_word_unaligned(src, dst); + src += WORDSIZE; + dst += WORDSIZE; + if (dst < end) { + do { + copy_word_unaligned(src, dst); + src += WORDSIZE; + dst += WORDSIZE; + } while (dst < end); + } + } else if (offset == 1) { + machine_word_t v = repeat_byte(*(dst - 1)); + do { + store_word_unaligned(v, dst); + src += WORDSIZE; + dst += WORDSIZE; + } while (dst < end); + } else { + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < end); + } + } else { + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < end); + } + } + + out_next += length; + } + +block_done: + /* Finished decoding a block. */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. Return %true if we got all the output we + * expected, otherwise %false. */ + return (out_next == out_end); +} diff --git a/src/deflate_decompress.c b/src/deflate_decompress.c index 9344b88..ee3451f 100644 --- a/src/deflate_decompress.c +++ b/src/deflate_decompress.c @@ -10,8 +10,9 @@ * --------------------------------------------------------------------------- * * This is a highly optimized DEFLATE decompressor. On x86_64 it decompresses - * data in about 52% of the time of zlib. On other architectures it should - * still be significantly faster than zlib, but the difference may be smaller. + * data in about 52% of the time of zlib (48% if BMI2 instructions are + * available). On other architectures it should still be significantly faster + * than zlib, but the difference may be smaller. * * Why this is faster than zlib's implementation: * @@ -22,6 +23,8 @@ * - Other optimizations to remove unnecessary branches * - Only full-buffer decompression is supported, so the code doesn't need to * support stopping and resuming decompression. + * - On x86_64, compile a version of the decompression routine using BMI2 + * instructions and use it automatically at runtime when supported. 
*/ #include @@ -31,6 +34,7 @@ #include "deflate_constants.h" #include "unaligned.h" +#include "x86_cpu_features.h" /* By default, if the expression passed to SAFETY_CHECK() evaluates to false, * then deflate_decompress() immediately returns false as the compressed data is @@ -793,6 +797,50 @@ copy_word_unaligned(const void *src, void *dst) * Main decompression routine *****************************************************************************/ +#define FUNCNAME deflate_decompress_default +#define ATTRIBUTES +#include "decompress_impl.h" +#undef FUNCNAME +#undef ATTRIBUTES + +#if X86_CPU_FEATURES_ENABLED && !defined(__BMI2__) +# define FUNCNAME deflate_decompress_bmi2 +# define ATTRIBUTES __attribute__((target("bmi2"))) +# include "decompress_impl.h" +# undef FUNCNAME +# undef ATTRIBUTES +# define DISPATCH_ENABLED 1 +#endif + +#if DISPATCH_ENABLED + +static bool +dispatch(struct deflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes); + +typedef bool (*decompress_func_t)(struct deflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes); + +static decompress_func_t decompress_impl = dispatch; + +static bool +dispatch(struct deflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes) +{ + decompress_func_t f = deflate_decompress_default; +#if X86_CPU_FEATURES_ENABLED + if (x86_have_cpu_feature(X86_CPU_FEATURE_BMI2)) + f = deflate_decompress_bmi2; +#endif + decompress_impl = f; + return (*f)(d, in, in_nbytes, out, out_nbytes); +} +#endif /* DISPATCH_ENABLED */ + + /* * This is the main DEFLATE decompression routine. It decompresses 'in_nbytes' * bytes of compressed data from the buffer 'in' and writes the uncompressed @@ -801,362 +849,20 @@ copy_word_unaligned(const void *src, void *dst) * and only if decompression was successful. A return value of %false indicates * that either the compressed data is invalid or it does not decompress to * exactly 'out_nbytes' bytes of uncompressed data. + * + * The real code is in decompress_impl.h. The part here just handles calling + * the appropriate implementation depending on the CPU features at runtime. */ LIBEXPORT bool deflate_decompress(struct deflate_decompressor * restrict d, const void * restrict in, size_t in_nbytes, void * restrict out, size_t out_nbytes) { - u8 *out_next = out; - u8 * const out_end = out_next + out_nbytes; - const u8 *in_next = in; - const u8 * const in_end = in_next + in_nbytes; - bitbuf_t bitbuf = 0; - unsigned bitsleft = 0; - size_t overrun_count = 0; - unsigned i; - unsigned is_final_block; - unsigned block_type; - u16 len; - u16 nlen; - unsigned num_litlen_syms; - unsigned num_offset_syms; - -next_block: - /* Starting to read the next block. */ - ; - - STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4)); - ENSURE_BITS(1 + 2 + 5 + 5 + 4); - - /* BFINAL: 1 bit */ - is_final_block = POP_BITS(1); - - /* BTYPE: 2 bits */ - block_type = POP_BITS(2); - - if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { - - /* Dynamic Huffman block. */ - - /* The order in which precode lengths are stored. */ - static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { - 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 - }; - - unsigned num_explicit_precode_lens; - - /* Read the codeword length counts. 
*/ - - STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257); - num_litlen_syms = POP_BITS(5) + 257; - - STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1); - num_offset_syms = POP_BITS(5) + 1; - - STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4); - num_explicit_precode_lens = POP_BITS(4) + 4; - - /* Read the precode codeword lengths. */ - STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); - if (CAN_ENSURE(DEFLATE_NUM_PRECODE_SYMS * 3)) { - - ENSURE_BITS(DEFLATE_NUM_PRECODE_SYMS * 3); - - for (i = 0; i < num_explicit_precode_lens; i++) - d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); - } else { - for (i = 0; i < num_explicit_precode_lens; i++) { - ENSURE_BITS(3); - d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); - } - } - - for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) - d->precode_lens[deflate_precode_lens_permutation[i]] = 0; - - /* Build the decode table for the precode. */ - SAFETY_CHECK(build_precode_decode_table(d)); - - /* Expand the literal/length and offset codeword lengths. */ - for (i = 0; i < num_litlen_syms + num_offset_syms; ) { - u32 entry; - unsigned presym; - u8 rep_val; - unsigned rep_count; - - ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7); - - /* (The code below assumes that the precode decode table - * does not have any subtables.) */ - STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); - - /* Read the next precode symbol. */ - entry = d->precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)]; - REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); - presym = entry >> HUFFDEC_RESULT_SHIFT; - - if (presym < 16) { - /* Explicit codeword length */ - d->lens[i++] = presym; - continue; - } - - /* Run-length encoded codeword lengths */ - - /* Note: we don't need verify that the repeat count - * doesn't overflow the number of elements, since we - * have enough extra spaces to allow for the worst-case - * overflow (138 zeroes when only 1 length was - * remaining). - * - * In the case of the small repeat counts (presyms 16 - * and 17), it is fastest to always write the maximum - * number of entries. That gets rid of branches that - * would otherwise be required. - * - * It is not just because of the numerical order that - * our checks go in the order 'presym < 16', 'presym == - * 16', and 'presym == 17'. For typical data this is - * ordered from most frequent to least frequent case. 
- */ - STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); - - if (presym == 16) { - /* Repeat the previous length 3 - 6 times */ - SAFETY_CHECK(i != 0); - rep_val = d->lens[i - 1]; - STATIC_ASSERT(3 + ((1 << 2) - 1) == 6); - rep_count = 3 + POP_BITS(2); - d->lens[i + 0] = rep_val; - d->lens[i + 1] = rep_val; - d->lens[i + 2] = rep_val; - d->lens[i + 3] = rep_val; - d->lens[i + 4] = rep_val; - d->lens[i + 5] = rep_val; - i += rep_count; - } else if (presym == 17) { - /* Repeat zero 3 - 10 times */ - STATIC_ASSERT(3 + ((1 << 3) - 1) == 10); - rep_count = 3 + POP_BITS(3); - d->lens[i + 0] = 0; - d->lens[i + 1] = 0; - d->lens[i + 2] = 0; - d->lens[i + 3] = 0; - d->lens[i + 4] = 0; - d->lens[i + 5] = 0; - d->lens[i + 6] = 0; - d->lens[i + 7] = 0; - d->lens[i + 8] = 0; - d->lens[i + 9] = 0; - i += rep_count; - } else { - /* Repeat zero 11 - 138 times */ - STATIC_ASSERT(11 + ((1 << 7) - 1) == 138); - rep_count = 11 + POP_BITS(7); - memset(&d->lens[i], 0, rep_count * sizeof(d->lens[i])); - i += rep_count; - } - } - } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { - - /* Uncompressed block: copy 'len' bytes literally from the input - * buffer to the output buffer. */ - - ALIGN_INPUT(); - - SAFETY_CHECK(in_end - in_next >= 4); - - len = READ_U16(); - nlen = READ_U16(); - - SAFETY_CHECK(len == (u16)~nlen); - SAFETY_CHECK(len <= out_end - out_next); - SAFETY_CHECK(len <= in_end - in_next); - - memcpy(out_next, in_next, len); - in_next += len; - out_next += len; - - goto block_done; - - } else { - SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); - - /* Static Huffman block: set the static Huffman codeword - * lengths. Then the remainder is the same as decompressing a - * dynamic Huffman block. */ - - STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); - STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); - - for (i = 0; i < 144; i++) - d->lens[i] = 8; - for (; i < 256; i++) - d->lens[i] = 9; - for (; i < 280; i++) - d->lens[i] = 7; - for (; i < 288; i++) - d->lens[i] = 8; - - for (; i < 288 + 32; i++) - d->lens[i] = 5; - - num_litlen_syms = 288; - num_offset_syms = 32; - - } - - /* Decompressing a Huffman block (either dynamic or static) */ - - SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); - SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); - - /* The main DEFLATE decode loop */ - for (;;) { - u32 entry; - u32 length; - u32 offset; - - /* Decode a litlen symbol. */ - ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN); - entry = d->litlen_decode_table[BITS(LITLEN_TABLEBITS)]; - if (entry & HUFFDEC_SUBTABLE_POINTER) { - /* Litlen subtable required (uncommon case) */ - REMOVE_BITS(LITLEN_TABLEBITS); - entry = d->litlen_decode_table[ - ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + - BITS(entry & HUFFDEC_LENGTH_MASK)]; - } - REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); - if (entry & HUFFDEC_LITERAL) { - /* Literal */ - SAFETY_CHECK(out_next < out_end); - *out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT); - continue; - } - - /* Match or end-of-block */ - - entry >>= HUFFDEC_RESULT_SHIFT; - ENSURE_BITS(MAX_ENSURE); - - /* Pop the extra length bits and add them to the length base to - * produce the full length. */ - length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) + - POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK); - - /* The match destination must not end after the end of the - * output buffer. For efficiency, combine this check with the - * end-of-block check. 
We're using 0 for the special - * end-of-block length, so subtract 1 and it turn it into - * SIZE_MAX. */ - STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0); - if (unlikely((size_t)length - 1 > out_end - out_next)) { - SAFETY_CHECK(length == HUFFDEC_END_OF_BLOCK_LENGTH); - goto block_done; - } - - /* Decode the match offset. */ - - entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)]; - if (entry & HUFFDEC_SUBTABLE_POINTER) { - /* Offset subtable required (uncommon case) */ - REMOVE_BITS(OFFSET_TABLEBITS); - entry = d->offset_decode_table[ - ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + - BITS(entry & HUFFDEC_LENGTH_MASK)]; - } - REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); - entry >>= HUFFDEC_RESULT_SHIFT; - - STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS + - DEFLATE_MAX_OFFSET_CODEWORD_LEN) && - CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS)); - if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS + - DEFLATE_MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS); - - /* Pop the extra offset bits and add them to the offset base to - * produce the full offset. */ - offset = (entry & HUFFDEC_OFFSET_BASE_MASK) + - POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT); - - /* The match source must not begin before the beginning of the - * output buffer. */ - SAFETY_CHECK(offset <= out_next - (const u8 *)out); - - /* Copy the match: 'length' bytes at 'out_next - offset' to - * 'out_next'. */ - - if (UNALIGNED_ACCESS_IS_FAST && - length <= (3 * WORDSIZE) && - offset >= WORDSIZE && - length + (3 * WORDSIZE) <= out_end - out_next) - { - /* Fast case: short length, no overlaps if we copy one - * word at a time, and we aren't getting too close to - * the end of the output array. */ - copy_word_unaligned(out_next - offset + (0 * WORDSIZE), - out_next + (0 * WORDSIZE)); - copy_word_unaligned(out_next - offset + (1 * WORDSIZE), - out_next + (1 * WORDSIZE)); - copy_word_unaligned(out_next - offset + (2 * WORDSIZE), - out_next + (2 * WORDSIZE)); - } else { - const u8 *src = out_next - offset; - u8 *dst = out_next; - u8 *end = out_next + length; - - if (UNALIGNED_ACCESS_IS_FAST && - likely(out_end - end >= WORDSIZE - 1)) { - if (offset >= WORDSIZE) { - copy_word_unaligned(src, dst); - src += WORDSIZE; - dst += WORDSIZE; - if (dst < end) { - do { - copy_word_unaligned(src, dst); - src += WORDSIZE; - dst += WORDSIZE; - } while (dst < end); - } - } else if (offset == 1) { - machine_word_t v = repeat_byte(*(dst - 1)); - do { - store_word_unaligned(v, dst); - src += WORDSIZE; - dst += WORDSIZE; - } while (dst < end); - } else { - *dst++ = *src++; - *dst++ = *src++; - do { - *dst++ = *src++; - } while (dst < end); - } - } else { - *dst++ = *src++; - *dst++ = *src++; - do { - *dst++ = *src++; - } while (dst < end); - } - } - - out_next += length; - } - -block_done: - /* Finished decoding a block. */ - - if (!is_final_block) - goto next_block; - - /* That was the last block. Return %true if we got all the output we - * expected, otherwise %false. 
*/ - return (out_next == out_end); +#if DISPATCH_ENABLED + return (*decompress_impl)(d, in, in_nbytes, out, out_nbytes); +#else + return deflate_decompress_default(d, in, in_nbytes, out, out_nbytes); +#endif } LIBEXPORT struct deflate_decompressor * diff --git a/src/x86_cpu_features.c b/src/x86_cpu_features.c new file mode 100644 index 0000000..b0014b5 --- /dev/null +++ b/src/x86_cpu_features.c @@ -0,0 +1,145 @@ +/* + * x86_cpu_features.c - feature detection for x86 processors + * + * Author: Eric Biggers + * Year: 2015 + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. + */ + +#include "x86_cpu_features.h" + +#ifdef X86_CPU_FEATURES_ENABLED + +#define DEBUG 0 + +#if DEBUG +# include +#endif + +u32 _x86_cpu_features = 0; + +/* With old GCC versions we have to manually save and restore the x86_32 PIC + * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */ +#if defined(__i386__) && defined(__PIC__) +# define EBX_CONSTRAINT "=r" +#else +# define EBX_CONSTRAINT "=b" +#endif + +/* Execute the CPUID instruction. */ +static inline void +cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) +{ + __asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" + "cpuid \n" + ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" + : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) + : "a" (leaf), "c" (subleaf)); +} + +/* Read an extended control register. */ +static inline u64 +read_xcr(u32 index) +{ + u32 edx, eax; + + /* Execute the "xgetbv" instruction. Old versions of binutils do not + * recognize this instruction, so list the raw bytes instead. */ + __asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index)); + + return ((u64)edx << 32) | eax; +} + +#define IS_SET(reg, bit) ((reg) & ((u32)1 << (bit))) + +/* Initialize _x86_cpu_features with bits for interesting processor features. */ +void +x86_setup_cpu_features(void) +{ + u32 features = 0; + u32 dummy1, dummy2, dummy3, dummy4; + u32 max_function; + u32 features_1, features_2, features_3, features_4; + bool os_saves_ymm_regs = false; + + /* Get maximum supported function */ + cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4); + if (max_function < 1) + goto out; + + /* Standard feature flags */ + cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1); + + if (IS_SET(features_1, 25)) + features |= X86_CPU_FEATURE_SSE; + + if (IS_SET(features_1, 26)) + features |= X86_CPU_FEATURE_SSE2; + + if (IS_SET(features_2, 0)) + features |= X86_CPU_FEATURE_SSE3; + + if (IS_SET(features_2, 9)) + features |= X86_CPU_FEATURE_SSSE3; + + if (IS_SET(features_2, 19)) + features |= X86_CPU_FEATURE_SSE4_1; + + if (IS_SET(features_2, 20)) + features |= X86_CPU_FEATURE_SSE4_2; + + if (IS_SET(features_2, 27)) /* OSXSAVE set? 
*/ + if ((read_xcr(0) & 0x6) == 0x6) + os_saves_ymm_regs = true; + + if (os_saves_ymm_regs && IS_SET(features_2, 28)) + features |= X86_CPU_FEATURE_AVX; + + if (max_function < 7) + goto out; + + /* Extended feature flags */ + cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4); + + if (IS_SET(features_3, 3)) + features |= X86_CPU_FEATURE_BMI; + + if (os_saves_ymm_regs && IS_SET(features_3, 5)) + features |= X86_CPU_FEATURE_AVX2; + + if (IS_SET(features_3, 8)) + features |= X86_CPU_FEATURE_BMI2; + +out: + +#if DEBUG + printf("Detected x86 CPU features: "); + if (features & X86_CPU_FEATURE_SSE) + printf("SSE "); + if (features & X86_CPU_FEATURE_SSE2) + printf("SSE2 "); + if (features & X86_CPU_FEATURE_SSE3) + printf("SSE3 "); + if (features & X86_CPU_FEATURE_SSSE3) + printf("SSSE3 "); + if (features & X86_CPU_FEATURE_SSE4_1) + printf("SSE4.1 "); + if (features & X86_CPU_FEATURE_SSE4_2) + printf("SSE4.2 "); + if (features & X86_CPU_FEATURE_BMI) + printf("BMI "); + if (features & X86_CPU_FEATURE_AVX) + printf("AVX "); + if (features & X86_CPU_FEATURE_BMI2) + printf("BMI2 "); + if (features & X86_CPU_FEATURE_AVX2) + printf("AVX2 "); + printf("\n"); +#endif /* DEBUG */ + + _x86_cpu_features = features | X86_CPU_FEATURES_KNOWN; +} + +#endif /* X86_CPU_FEATURES_ENABLED */ diff --git a/src/x86_cpu_features.h b/src/x86_cpu_features.h new file mode 100644 index 0000000..5f0c7e0 --- /dev/null +++ b/src/x86_cpu_features.h @@ -0,0 +1,43 @@ +/* + * x86_cpu_features.h - feature detection for x86 processors + */ + +#pragma once + +#include "util.h" + +#if RUNTIME_CPU_DETECTION && defined(__GNUC__) && defined(__x86_64__) +# define X86_CPU_FEATURES_ENABLED 1 +#endif + + +#if X86_CPU_FEATURES_ENABLED + +#define X86_CPU_FEATURE_SSE 0x00000001 +#define X86_CPU_FEATURE_SSE2 0x00000002 +#define X86_CPU_FEATURE_SSE3 0x00000004 +#define X86_CPU_FEATURE_SSSE3 0x00000008 +#define X86_CPU_FEATURE_SSE4_1 0x00000010 +#define X86_CPU_FEATURE_SSE4_2 0x00000020 +#define X86_CPU_FEATURE_AVX 0x00000040 +#define X86_CPU_FEATURE_BMI 0x00000080 +#define X86_CPU_FEATURE_AVX2 0x00000100 +#define X86_CPU_FEATURE_BMI2 0x00000200 + +#define X86_CPU_FEATURES_KNOWN 0x80000000 + +extern u32 _x86_cpu_features; + +extern void +x86_setup_cpu_features(void); + +/* Does the processor have the specified feature? */ +static inline bool +x86_have_cpu_feature(u32 feature) +{ + if (_x86_cpu_features == 0) + x86_setup_cpu_features(); + return _x86_cpu_features & feature; +} + +#endif /* X86_CPU_FEATURES_ENABLED */
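
The heart of the change is the self-patching function pointer added to deflate_decompress.c: decompress_impl.h is compiled once as deflate_decompress_default and once more, under __attribute__((target("bmi2"))), as deflate_decompress_bmi2, and the first call through decompress_impl selects between them. Below is a stripped-down, self-contained sketch of the same pattern; the popcount payload, the popcount_* names, and the use of GCC's __builtin_cpu_supports() in place of the library's x86_have_cpu_feature() are illustrative assumptions, not part of this patch.

/* Portable fallback, always available. */
static unsigned
popcount_default(unsigned long x)
{
	unsigned n = 0;

	while (x) {
		n += x & 1;
		x >>= 1;
	}
	return n;
}

#if defined(__GNUC__) && defined(__x86_64__)
/* Same operation compiled with POPCNT enabled; only called after the
 * runtime feature check below has succeeded. */
__attribute__((target("popcnt")))
static unsigned
popcount_popcnt(unsigned long x)
{
	return (unsigned)__builtin_popcountl(x);
}
#  define SKETCH_DISPATCH_ENABLED 1
#endif

typedef unsigned (*popcount_func_t)(unsigned long x);

#if SKETCH_DISPATCH_ENABLED

static unsigned popcount_dispatch(unsigned long x);

/* Initially points at the dispatcher; the first call overwrites it with the
 * best implementation, so every later call is a single indirect call. */
static popcount_func_t popcount_impl = popcount_dispatch;

static unsigned
popcount_dispatch(unsigned long x)
{
	popcount_func_t f = popcount_default;

	/* __builtin_cpu_supports() stands in for x86_have_cpu_feature(). */
	if (__builtin_cpu_supports("popcnt"))
		f = popcount_popcnt;
	popcount_impl = f;
	return (*f)(x);
}
#endif /* SKETCH_DISPATCH_ENABLED */

unsigned
popcount(unsigned long x)
{
#if SKETCH_DISPATCH_ENABLED
	return (*popcount_impl)(x);
#else
	return popcount_default(x);
#endif
}

The CPUID work is paid for once, on the first call, and the pointer update writes the same value no matter which caller gets there first, the same trade-off the patch itself makes. It also mirrors the guard in the patch: the BMI2 variant and the dispatcher are built only when X86_CPU_FEATURES_ENABLED is set and the translation unit is not already compiled with -mbmi2 (the !defined(__BMI2__) test).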
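
For the AVX and AVX2 bits, x86_cpu_features.c requires more than the CPUID feature flag: it first checks the OSXSAVE bit (ECX bit 27 of leaf 1) and then tests read_xcr(0) & 0x6. The mask 0x6 covers XCR0 bit 1 (SSE/XMM state) and bit 2 (AVX/YMM upper-half state); both must be enabled by the operating system before YMM registers can be used without corrupting other contexts. A hypothetical helper spelling out the same test, not part of the patch:

#include <stdbool.h>
#include <stdint.h>

/* XCR0 bit 1 = XMM state, bit 2 = YMM upper halves.  The OS must
 * save/restore both across context switches for AVX code to be usable;
 * this is the meaning of the (read_xcr(0) & 0x6) == 0x6 check above. */
static bool
os_saves_avx_state(uint64_t xcr0)
{
	const uint64_t XCR0_SSE_STATE = (uint64_t)1 << 1;
	const uint64_t XCR0_AVX_STATE = (uint64_t)1 << 2;

	return (xcr0 & (XCR0_SSE_STATE | XCR0_AVX_STATE)) ==
	       (XCR0_SSE_STATE | XCR0_AVX_STATE);
}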
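
One detail of the match-copy code in decompress_impl.h worth calling out: when offset == 1 the match is a run of a single repeated byte, so instead of copying byte by byte the loop broadcasts that byte across a machine word with repeat_byte() and stores whole words. A sketch of such a broadcast, assuming an unsigned long machine word; the library's real repeat_byte() is defined elsewhere and may be written differently:

/* Broadcast one byte to every byte of a machine word.  The 64-bit constant
 * truncates modulo the word size, so the same expression also works when
 * unsigned long is 32 bits. */
static inline unsigned long
repeat_byte_sketch(unsigned char b)
{
	return (unsigned long)b * (unsigned long)0x0101010101010101ULL;
}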