Mirror of https://github.com/cuberite/libdeflate.git (synced 2025-09-10 12:58:30 -04:00)
Choose BMI2-optimized decompression routine at runtime
This commit is contained in: parent 16f3b420a0, commit e731f4b510

Makefile (11 lines changed)
@@ -29,6 +29,10 @@ SUPPORT_NEAR_OPTIMAL_PARSING := yes
 # This is faster but ***insecure***! Default to secure.
 UNSAFE_DECOMPRESSION := no
 
+# Will the decompressor detect CPU features at runtime in order to run more
+# optimized code? This only affects some platforms and architectures.
+RUNTIME_CPU_DETECTION := yes
+
 # The compiler and archiver
 CC := gcc
 AR := ar
@@ -62,12 +66,19 @@ ifeq ($(UNSAFE_DECOMPRESSION),yes)
 override CFLAGS += -DUNSAFE_DECOMPRESSION=1
 endif
 
+ifeq ($(RUNTIME_CPU_DETECTION),yes)
+override CFLAGS += -DRUNTIME_CPU_DETECTION=1
+endif
+
 SRC := src/aligned_malloc.c
 ifeq ($(SUPPORT_COMPRESSION),yes)
 SRC += src/deflate_compress.c
 endif
 ifeq ($(SUPPORT_DECOMPRESSION),yes)
 SRC += src/deflate_decompress.c
+ifeq ($(RUNTIME_CPU_DETECTION),yes)
+SRC += src/x86_cpu_features.c
+endif
 endif
 ifeq ($(SUPPORT_ZLIB),yes)
 ifeq ($(SUPPORT_COMPRESSION),yes)
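Like the other switches at the top of this Makefile, RUNTIME_CPU_DETECTION is an ordinary make variable tested with an `ifeq ($(VAR),yes)` guard, so it can presumably be overridden from the command line without editing the file:

    make RUNTIME_CPU_DETECTION=no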
src/decompress_impl.h (new file, 364 lines)
@@ -0,0 +1,364 @@
/*
 * decompress_impl.h
 *
 * The actual DEFLATE decompression routine, lifted out of deflate_decompress.c
 * so that it can be compiled multiple times with different target instruction
 * sets.
 */

static bool ATTRIBUTES
FUNCNAME(struct deflate_decompressor * restrict d,
	 const void * restrict in, size_t in_nbytes,
	 void * restrict out, size_t out_nbytes)
{
	u8 *out_next = out;
	u8 * const out_end = out_next + out_nbytes;
	const u8 *in_next = in;
	const u8 * const in_end = in_next + in_nbytes;
	bitbuf_t bitbuf = 0;
	unsigned bitsleft = 0;
	size_t overrun_count = 0;
	unsigned i;
	unsigned is_final_block;
	unsigned block_type;
	u16 len;
	u16 nlen;
	unsigned num_litlen_syms;
	unsigned num_offset_syms;

next_block:
	/* Starting to read the next block. */
	;

	STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
	ENSURE_BITS(1 + 2 + 5 + 5 + 4);

	/* BFINAL: 1 bit */
	is_final_block = POP_BITS(1);

	/* BTYPE: 2 bits */
	block_type = POP_BITS(2);

	if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {

		/* Dynamic Huffman block. */

		/* The order in which precode lengths are stored. */
		static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
			16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
		};

		unsigned num_explicit_precode_lens;

		/* Read the codeword length counts. */

		STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
		num_litlen_syms = POP_BITS(5) + 257;

		STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
		num_offset_syms = POP_BITS(5) + 1;

		STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
		num_explicit_precode_lens = POP_BITS(4) + 4;

		/* Read the precode codeword lengths. */
		STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
		if (CAN_ENSURE(DEFLATE_NUM_PRECODE_SYMS * 3)) {

			ENSURE_BITS(DEFLATE_NUM_PRECODE_SYMS * 3);

			for (i = 0; i < num_explicit_precode_lens; i++)
				d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
		} else {
			for (i = 0; i < num_explicit_precode_lens; i++) {
				ENSURE_BITS(3);
				d->precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
			}
		}

		for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
			d->precode_lens[deflate_precode_lens_permutation[i]] = 0;

		/* Build the decode table for the precode. */
		SAFETY_CHECK(build_precode_decode_table(d));

		/* Expand the literal/length and offset codeword lengths. */
		for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
			u32 entry;
			unsigned presym;
			u8 rep_val;
			unsigned rep_count;

			ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);

			/* (The code below assumes that the precode decode table
			 * does not have any subtables.) */
			STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);

			/* Read the next precode symbol. */
			entry = d->precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
			REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
			presym = entry >> HUFFDEC_RESULT_SHIFT;

			if (presym < 16) {
				/* Explicit codeword length */
				d->lens[i++] = presym;
				continue;
			}

			/* Run-length encoded codeword lengths */

			/* Note: we don't need to verify that the repeat count
			 * doesn't overflow the number of elements, since we
			 * have enough extra spaces to allow for the worst-case
			 * overflow (138 zeroes when only 1 length was
			 * remaining).
			 *
			 * In the case of the small repeat counts (presyms 16
			 * and 17), it is fastest to always write the maximum
			 * number of entries. That gets rid of branches that
			 * would otherwise be required.
			 *
			 * It is not just because of the numerical order that
			 * our checks go in the order 'presym < 16', 'presym ==
			 * 16', and 'presym == 17'. For typical data this is
			 * ordered from most frequent to least frequent case.
			 */
			STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);

			if (presym == 16) {
				/* Repeat the previous length 3 - 6 times */
				SAFETY_CHECK(i != 0);
				rep_val = d->lens[i - 1];
				STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
				rep_count = 3 + POP_BITS(2);
				d->lens[i + 0] = rep_val;
				d->lens[i + 1] = rep_val;
				d->lens[i + 2] = rep_val;
				d->lens[i + 3] = rep_val;
				d->lens[i + 4] = rep_val;
				d->lens[i + 5] = rep_val;
				i += rep_count;
			} else if (presym == 17) {
				/* Repeat zero 3 - 10 times */
				STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
				rep_count = 3 + POP_BITS(3);
				d->lens[i + 0] = 0;
				d->lens[i + 1] = 0;
				d->lens[i + 2] = 0;
				d->lens[i + 3] = 0;
				d->lens[i + 4] = 0;
				d->lens[i + 5] = 0;
				d->lens[i + 6] = 0;
				d->lens[i + 7] = 0;
				d->lens[i + 8] = 0;
				d->lens[i + 9] = 0;
				i += rep_count;
			} else {
				/* Repeat zero 11 - 138 times */
				STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
				rep_count = 11 + POP_BITS(7);
				memset(&d->lens[i], 0, rep_count * sizeof(d->lens[i]));
				i += rep_count;
			}
		}
	} else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {

		/* Uncompressed block: copy 'len' bytes literally from the input
		 * buffer to the output buffer. */

		ALIGN_INPUT();

		SAFETY_CHECK(in_end - in_next >= 4);

		len = READ_U16();
		nlen = READ_U16();

		SAFETY_CHECK(len == (u16)~nlen);
		SAFETY_CHECK(len <= out_end - out_next);
		SAFETY_CHECK(len <= in_end - in_next);

		memcpy(out_next, in_next, len);
		in_next += len;
		out_next += len;

		goto block_done;

	} else {
		SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);

		/* Static Huffman block: set the static Huffman codeword
		 * lengths. Then the remainder is the same as decompressing a
		 * dynamic Huffman block. */

		STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
		STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);

		for (i = 0; i < 144; i++)
			d->lens[i] = 8;
		for (; i < 256; i++)
			d->lens[i] = 9;
		for (; i < 280; i++)
			d->lens[i] = 7;
		for (; i < 288; i++)
			d->lens[i] = 8;

		for (; i < 288 + 32; i++)
			d->lens[i] = 5;

		num_litlen_syms = 288;
		num_offset_syms = 32;
	}

	/* Decompressing a Huffman block (either dynamic or static) */

	SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
	SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));

	/* The main DEFLATE decode loop */
	for (;;) {
		u32 entry;
		u32 length;
		u32 offset;

		/* Decode a litlen symbol. */
		ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
		entry = d->litlen_decode_table[BITS(LITLEN_TABLEBITS)];
		if (entry & HUFFDEC_SUBTABLE_POINTER) {
			/* Litlen subtable required (uncommon case) */
			REMOVE_BITS(LITLEN_TABLEBITS);
			entry = d->litlen_decode_table[
				((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
				BITS(entry & HUFFDEC_LENGTH_MASK)];
		}
		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
		if (entry & HUFFDEC_LITERAL) {
			/* Literal */
			SAFETY_CHECK(out_next < out_end);
			*out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
			continue;
		}

		/* Match or end-of-block */

		entry >>= HUFFDEC_RESULT_SHIFT;
		ENSURE_BITS(MAX_ENSURE);

		/* Pop the extra length bits and add them to the length base to
		 * produce the full length. */
		length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
			 POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);

		/* The match destination must not end after the end of the
		 * output buffer. For efficiency, combine this check with the
		 * end-of-block check. We're using 0 for the special
		 * end-of-block length, so subtract 1 and it will turn into
		 * SIZE_MAX. */
		STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
		if (unlikely((size_t)length - 1 > out_end - out_next)) {
			SAFETY_CHECK(length == HUFFDEC_END_OF_BLOCK_LENGTH);
			goto block_done;
		}

		/* Decode the match offset. */

		entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
		if (entry & HUFFDEC_SUBTABLE_POINTER) {
			/* Offset subtable required (uncommon case) */
			REMOVE_BITS(OFFSET_TABLEBITS);
			entry = d->offset_decode_table[
				((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
				BITS(entry & HUFFDEC_LENGTH_MASK)];
		}
		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
		entry >>= HUFFDEC_RESULT_SHIFT;

		STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
					 DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
			      CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
		if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
				DEFLATE_MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);

		/* Pop the extra offset bits and add them to the offset base to
		 * produce the full offset. */
		offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
			 POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);

		/* The match source must not begin before the beginning of the
		 * output buffer. */
		SAFETY_CHECK(offset <= out_next - (const u8 *)out);

		/* Copy the match: 'length' bytes at 'out_next - offset' to
		 * 'out_next'. */

		if (UNALIGNED_ACCESS_IS_FAST &&
		    length <= (3 * WORDSIZE) &&
		    offset >= WORDSIZE &&
		    length + (3 * WORDSIZE) <= out_end - out_next)
		{
			/* Fast case: short length, no overlaps if we copy one
			 * word at a time, and we aren't getting too close to
			 * the end of the output array. */
			copy_word_unaligned(out_next - offset + (0 * WORDSIZE),
					    out_next + (0 * WORDSIZE));
			copy_word_unaligned(out_next - offset + (1 * WORDSIZE),
					    out_next + (1 * WORDSIZE));
			copy_word_unaligned(out_next - offset + (2 * WORDSIZE),
					    out_next + (2 * WORDSIZE));
		} else {
			const u8 *src = out_next - offset;
			u8 *dst = out_next;
			u8 *end = out_next + length;

			if (UNALIGNED_ACCESS_IS_FAST &&
			    likely(out_end - end >= WORDSIZE - 1)) {
				if (offset >= WORDSIZE) {
					copy_word_unaligned(src, dst);
					src += WORDSIZE;
					dst += WORDSIZE;
					if (dst < end) {
						do {
							copy_word_unaligned(src, dst);
							src += WORDSIZE;
							dst += WORDSIZE;
						} while (dst < end);
					}
				} else if (offset == 1) {
					machine_word_t v = repeat_byte(*(dst - 1));
					do {
						store_word_unaligned(v, dst);
						src += WORDSIZE;
						dst += WORDSIZE;
					} while (dst < end);
				} else {
					*dst++ = *src++;
					*dst++ = *src++;
					do {
						*dst++ = *src++;
					} while (dst < end);
				}
			} else {
				*dst++ = *src++;
				*dst++ = *src++;
				do {
					*dst++ = *src++;
				} while (dst < end);
			}
		}

		out_next += length;
	}

block_done:
	/* Finished decoding a block. */

	if (!is_final_block)
		goto next_block;

	/* That was the last block. Return %true if we got all the output we
	 * expected, otherwise %false. */
	return (out_next == out_end);
}
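The routine above leans on bit-buffer macros (ENSURE_BITS, POP_BITS, BITS, REMOVE_BITS, CAN_ENSURE) that are defined in deflate_decompress.c and do not appear in this diff. As a rough sketch of the idiom they implement, assuming a 64-bit LSB-first bit buffer and ignoring the end-of-input handling that the real macros do via overrun_count, the core operations look something like this:

/* Sketch only; the real macros operate on the local bitbuf/bitsleft
 * variables directly, which presumably helps keep them in registers. */
#include <stdint.h>

typedef uint64_t bitbuf_t;

struct bitreader {
	const uint8_t *in_next;   /* next input byte */
	bitbuf_t bitbuf;          /* unconsumed bits, LSB first */
	unsigned bitsleft;        /* number of valid bits in bitbuf */
};

/* ENSURE_BITS(n): refill so that at least n bits are available (n <= 57). */
static void ensure_bits(struct bitreader *br, unsigned n)
{
	while (br->bitsleft < n) {
		br->bitbuf |= (bitbuf_t)*br->in_next++ << br->bitsleft;
		br->bitsleft += 8;
	}
}

/* POP_BITS(n): return the next n bits and remove them from the buffer. */
static uint32_t pop_bits(struct bitreader *br, unsigned n)
{
	uint32_t bits = (uint32_t)br->bitbuf & (((uint32_t)1 << n) - 1);

	br->bitbuf >>= n;
	br->bitsleft -= n;
	return bits;
}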
src/deflate_decompress.c
@@ -10,8 +10,9 @@
  * ---------------------------------------------------------------------------
  *
  * This is a highly optimized DEFLATE decompressor. On x86_64 it decompresses
- * data in about 52% of the time of zlib. On other architectures it should
- * still be significantly faster than zlib, but the difference may be smaller.
+ * data in about 52% of the time of zlib (48% if BMI2 instructions are
+ * available). On other architectures it should still be significantly faster
+ * than zlib, but the difference may be smaller.
  *
  * Why this is faster than zlib's implementation:
  *
@@ -22,6 +23,8 @@
  * - Other optimizations to remove unnecessary branches
  * - Only full-buffer decompression is supported, so the code doesn't need to
  *   support stopping and resuming decompression.
+ * - On x86_64, compile a version of the decompression routine using BMI2
+ *   instructions and use it automatically at runtime when supported.
  */
 
 #include <stdlib.h>
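The 48% figure is what motivates the rest of this commit: the decode loop is dominated by variable shifts and low-bit extractions on the bit buffer, which are exactly the operations BMI2 accelerates (SHLX/SHRX, BZHI, PEXT). Nothing in the source has to change; compiling the same code with the bmi2 target can let GCC pick those instructions. An illustrative fragment, not taken from the commit:

/* Illustration only: with the BMI2 target in effect, GCC can turn this
 * mask-and-extract (for n < 64) into a single BZHI instruction. */
__attribute__((target("bmi2")))
static unsigned long
low_bits(unsigned long bitbuf, unsigned n)
{
	return bitbuf & (((unsigned long)1 << n) - 1);
}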
@@ -31,6 +34,7 @@
 
 #include "deflate_constants.h"
 #include "unaligned.h"
+#include "x86_cpu_features.h"
 
 /* By default, if the expression passed to SAFETY_CHECK() evaluates to false,
  * then deflate_decompress() immediately returns false as the compressed data is
@@ -793,6 +797,50 @@ copy_word_unaligned(const void *src, void *dst)
  * Main decompression routine
  *****************************************************************************/
 
+#define FUNCNAME deflate_decompress_default
+#define ATTRIBUTES
+#include "decompress_impl.h"
+#undef FUNCNAME
+#undef ATTRIBUTES
+
+#if X86_CPU_FEATURES_ENABLED && !defined(__BMI2__)
+#  define FUNCNAME deflate_decompress_bmi2
+#  define ATTRIBUTES __attribute__((target("bmi2")))
+#  include "decompress_impl.h"
+#  undef FUNCNAME
+#  undef ATTRIBUTES
+#  define DISPATCH_ENABLED 1
+#endif
+
+#if DISPATCH_ENABLED
+
+static bool
+dispatch(struct deflate_decompressor * restrict d,
+	 const void * restrict in, size_t in_nbytes,
+	 void * restrict out, size_t out_nbytes);
+
+typedef bool (*decompress_func_t)(struct deflate_decompressor * restrict d,
+				  const void * restrict in, size_t in_nbytes,
+				  void * restrict out, size_t out_nbytes);
+
+static decompress_func_t decompress_impl = dispatch;
+
+static bool
+dispatch(struct deflate_decompressor * restrict d,
+	 const void * restrict in, size_t in_nbytes,
+	 void * restrict out, size_t out_nbytes)
+{
+	decompress_func_t f = deflate_decompress_default;
+#if X86_CPU_FEATURES_ENABLED
+	if (x86_have_cpu_feature(X86_CPU_FEATURE_BMI2))
+		f = deflate_decompress_bmi2;
+#endif
+	decompress_impl = f;
+	return (*f)(d, in, in_nbytes, out, out_nbytes);
+}
+#endif /* DISPATCH_ENABLED */
+
+
 /*
  * This is the main DEFLATE decompression routine. It decompresses 'in_nbytes'
  * bytes of compressed data from the buffer 'in' and writes the uncompressed
@@ -801,362 +849,20 @@ copy_word_unaligned(const void *src, void *dst)
  * and only if decompression was successful. A return value of %false indicates
  * that either the compressed data is invalid or it does not decompress to
  * exactly 'out_nbytes' bytes of uncompressed data.
+ *
+ * The real code is in decompress_impl.h. The part here just handles calling
+ * the appropriate implementation depending on the CPU features at runtime.
  */
 LIBEXPORT bool
 deflate_decompress(struct deflate_decompressor * restrict d,
 		   const void * restrict in, size_t in_nbytes,
 		   void * restrict out, size_t out_nbytes)
 {
-	u8 *out_next = out;
-	u8 * const out_end = out_next + out_nbytes;
-	const u8 *in_next = in;
-	const u8 * const in_end = in_next + in_nbytes;
-	bitbuf_t bitbuf = 0;
-	unsigned bitsleft = 0;
-	[... the remaining ~350 deleted lines are the old inline decompression
-	     routine, moved verbatim into src/decompress_impl.h, shown in full
-	     above ...]
+#if DISPATCH_ENABLED
+	return (*decompress_impl)(d, in, in_nbytes, out, out_nbytes);
+#else
+	return deflate_decompress_default(d, in, in_nbytes, out, out_nbytes);
+#endif
 }
 
 LIBEXPORT struct deflate_decompressor *
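The dispatch wiring above is the classic resolve-on-first-call function-pointer idiom (the same idea that glibc's IFUNC relocations automate): the exported entry point calls through a pointer that initially targets a resolver; the resolver probes the CPU once, rebinds the pointer, and forwards the call, so every later call pays only one indirect jump. A stripped-down sketch of the idiom, with purely illustrative names:

/* Generic resolve-on-first-call dispatch; not from the commit. */
typedef int (*impl_t)(int);

static int impl_generic(int x) { return x + 1; }

static int resolve(int x);            /* forward declaration */
static impl_t active_impl = resolve;  /* the first call lands here */

static int resolve(int x)
{
	active_impl = impl_generic;   /* pick the best implementation once */
	return active_impl(x);
}

int api_entry(int x) { return active_impl(x); }

If two threads make the first call concurrently, both run the resolver and store the same pointer value, which is why this pattern is normally considered a benign race.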
src/x86_cpu_features.c (new file, 145 lines)
@@ -0,0 +1,145 @@
/*
 * x86_cpu_features.c - feature detection for x86 processors
 *
 * Author:	Eric Biggers
 * Year:	2015
 *
 * The author dedicates this file to the public domain.
 * You can do whatever you want with this file.
 */

#include "x86_cpu_features.h"

#ifdef X86_CPU_FEATURES_ENABLED

#define DEBUG 0

#if DEBUG
#  include <stdio.h>
#endif

u32 _x86_cpu_features = 0;

/* With old GCC versions we have to manually save and restore the x86_32 PIC
 * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */
#if defined(__i386__) && defined(__PIC__)
#  define EBX_CONSTRAINT "=r"
#else
#  define EBX_CONSTRAINT "=b"
#endif

/* Execute the CPUID instruction. */
static inline void
cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
{
	__asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n"
		"cpuid\n"
		".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
		: "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
		: "a" (leaf), "c" (subleaf));
}

/* Read an extended control register. */
static inline u64
read_xcr(u32 index)
{
	u32 edx, eax;

	/* Execute the "xgetbv" instruction. Old versions of binutils do not
	 * recognize this instruction, so list the raw bytes instead. */
	__asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index));

	return ((u64)edx << 32) | eax;
}

#define IS_SET(reg, bit) ((reg) & ((u32)1 << (bit)))

/* Initialize _x86_cpu_features with bits for interesting processor features. */
void
x86_setup_cpu_features(void)
{
	u32 features = 0;
	u32 dummy1, dummy2, dummy3, dummy4;
	u32 max_function;
	u32 features_1, features_2, features_3, features_4;
	bool os_saves_ymm_regs = false;

	/* Get maximum supported function */
	cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4);
	if (max_function < 1)
		goto out;

	/* Standard feature flags */
	cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1);

	if (IS_SET(features_1, 25))
		features |= X86_CPU_FEATURE_SSE;

	if (IS_SET(features_1, 26))
		features |= X86_CPU_FEATURE_SSE2;

	if (IS_SET(features_2, 0))
		features |= X86_CPU_FEATURE_SSE3;

	if (IS_SET(features_2, 9))
		features |= X86_CPU_FEATURE_SSSE3;

	if (IS_SET(features_2, 19))
		features |= X86_CPU_FEATURE_SSE4_1;

	if (IS_SET(features_2, 20))
		features |= X86_CPU_FEATURE_SSE4_2;

	if (IS_SET(features_2, 27)) /* OSXSAVE set? */
		if ((read_xcr(0) & 0x6) == 0x6)
			os_saves_ymm_regs = true;

	if (os_saves_ymm_regs && IS_SET(features_2, 28))
		features |= X86_CPU_FEATURE_AVX;

	if (max_function < 7)
		goto out;

	/* Extended feature flags */
	cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4);

	if (IS_SET(features_3, 3))
		features |= X86_CPU_FEATURE_BMI;

	if (os_saves_ymm_regs && IS_SET(features_3, 5))
		features |= X86_CPU_FEATURE_AVX2;

	if (IS_SET(features_3, 8))
		features |= X86_CPU_FEATURE_BMI2;

out:

#if DEBUG
	printf("Detected x86 CPU features: ");
	if (features & X86_CPU_FEATURE_SSE)
		printf("SSE ");
	if (features & X86_CPU_FEATURE_SSE2)
		printf("SSE2 ");
	if (features & X86_CPU_FEATURE_SSE3)
		printf("SSE3 ");
	if (features & X86_CPU_FEATURE_SSSE3)
		printf("SSSE3 ");
	if (features & X86_CPU_FEATURE_SSE4_1)
		printf("SSE4.1 ");
	if (features & X86_CPU_FEATURE_SSE4_2)
		printf("SSE4.2 ");
	if (features & X86_CPU_FEATURE_BMI)
		printf("BMI ");
	if (features & X86_CPU_FEATURE_AVX)
		printf("AVX ");
	if (features & X86_CPU_FEATURE_BMI2)
		printf("BMI2 ");
	if (features & X86_CPU_FEATURE_AVX2)
		printf("AVX2 ");
	printf("\n");
#endif /* DEBUG */

	_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN;
}

#endif /* X86_CPU_FEATURES_ENABLED */
src/x86_cpu_features.h (new file, 43 lines)
@@ -0,0 +1,43 @@
/*
 * x86_cpu_features.h - feature detection for x86 processors
 */

#pragma once

#include "util.h"

#if RUNTIME_CPU_DETECTION && defined(__GNUC__) && defined(__x86_64__)
#  define X86_CPU_FEATURES_ENABLED 1
#endif


#if X86_CPU_FEATURES_ENABLED

#define X86_CPU_FEATURE_SSE	0x00000001
#define X86_CPU_FEATURE_SSE2	0x00000002
#define X86_CPU_FEATURE_SSE3	0x00000004
#define X86_CPU_FEATURE_SSSE3	0x00000008
#define X86_CPU_FEATURE_SSE4_1	0x00000010
#define X86_CPU_FEATURE_SSE4_2	0x00000020
#define X86_CPU_FEATURE_AVX	0x00000040
#define X86_CPU_FEATURE_BMI	0x00000080
#define X86_CPU_FEATURE_AVX2	0x00000100
#define X86_CPU_FEATURE_BMI2	0x00000200

#define X86_CPU_FEATURES_KNOWN	0x80000000

extern u32 _x86_cpu_features;

extern void
x86_setup_cpu_features(void);

/* Does the processor have the specified feature? */
static inline bool
x86_have_cpu_feature(u32 feature)
{
	if (_x86_cpu_features == 0)
		x86_setup_cpu_features();
	return _x86_cpu_features & feature;
}

#endif /* X86_CPU_FEATURES_ENABLED */
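For completeness, a caller's-eye view of the new header. This is a hypothetical standalone test, not part of the commit; it assumes it is compiled inside the source tree so that "util.h" (which supplies the u32 and bool types used here) is on the include path:

#include <stdio.h>
#include "x86_cpu_features.h"

int main(void)
{
#if X86_CPU_FEATURES_ENABLED
	/* First call runs CPUID via x86_setup_cpu_features(), then caches. */
	printf("BMI2 supported: %s\n",
	       x86_have_cpu_feature(X86_CPU_FEATURE_BMI2) ? "yes" : "no");
#else
	printf("runtime x86 feature detection not compiled in\n");
#endif
	return 0;
}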