diff --git a/CMakeLists.txt b/CMakeLists.txt
index 112bee45..f3c11828 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -142,6 +142,7 @@ set(compiler_only
 add_subdirectory(folly EXCLUDE_FROM_ALL)
 add_subdirectory(fbthrift EXCLUDE_FROM_ALL)
+add_subdirectory(fsst EXCLUDE_FROM_ALL)
 if(NOT (ZSTD_FOUND AND PREFER_SYSTEM_ZSTD))
   add_subdirectory(zstd/build/cmake EXCLUDE_FROM_ALL)
 endif()
@@ -491,11 +492,14 @@ foreach(tgt dwarfs ${BINARY_TARGETS})
   endif()
 endforeach()
+target_include_directories(dwarfs PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/fsst)
+
 target_link_libraries(
   dwarfs
   metadata_thrift
   thrift_light
   folly
+  fsst
   ${Boost_LIBRARIES}
   PkgConfig::LIBARCHIVE
   PkgConfig::LIBLZ4
diff --git a/fsst/CMakeLists.txt b/fsst/CMakeLists.txt
new file mode 100644
index 00000000..0719d422
--- /dev/null
+++ b/fsst/CMakeLists.txt
@@ -0,0 +1,41 @@
+cmake_minimum_required(VERSION 3.0)
+
+project(fsst)
+
+find_package(Threads REQUIRED)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
+if(COMPILER_SUPPORTS_MARCH_NATIVE)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+endif()
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+if(NOT MSVC)
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -DNDEBUG")
+else()
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /O2 /DNDEBUG")
+endif()
+
+if(CMAKE_BUILD_TYPE STREQUAL Release)
+set_source_files_properties(
+  fsst_avx512.cpp
+  PROPERTIES
+  COMPILE_FLAGS -O1
+)
+endif()
+
+add_library(fsst libfsst.cpp fsst_avx512.cpp fsst_avx512_unroll1.inc fsst_avx512_unroll2.inc fsst_avx512_unroll3.inc fsst_avx512_unroll4.inc)
+add_executable(binary fsst.cpp)
+target_link_libraries (binary LINK_PUBLIC fsst)
+target_link_libraries (binary LINK_PUBLIC Threads::Threads)
+set_target_properties(binary PROPERTIES OUTPUT_NAME fsst)
diff --git a/fsst/LICENSE b/fsst/LICENSE
new file mode 100644
index 00000000..edb46d2d
--- /dev/null
+++ b/fsst/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018-2020, CWI, TU Munich, FSU Jena
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/fsst/Makefile.linux b/fsst/Makefile.linux
new file mode 100644
index 00000000..bce30110
--- /dev/null
+++ b/fsst/Makefile.linux
@@ -0,0 +1,20 @@
+# the old makefile (before switching to cmake)
+#
+# it still has some minor usefulness in that it can generate the fsst_avx512_unrollX.inc files, which we now just added to the repo
+SHELL := /bin/bash
+
+OPT=-O3 -march=native -DNDEBUG
+
+all: fsst
+clean:
+	-@rm -f libfsst.[oa] fsst_avx512.o fsst
+fsst: fsst.cpp libfsst.a
+	g++ -std=c++17 -W -Wall -ofsst $(OPT) -g fsst.cpp -L. -lfsst -lpthread
+libfsst.a: libfsst.cpp libfsst.hpp fsst.h fsst_avx512.o
+	g++ -std=c++17 -W -Wall -c $(OPT) -g libfsst.cpp
+	ar ru $@ libfsst.o fsst_avx512.o
+	ranlib $@
+fsst_avx512_unroll%.inc: fsst_avx512.inc
+	awk '{ if ($$0 != '//') for(i=1;i<='$*';i++) {s=$$0; gsub(/X/,i,s); print s}}' fsst_avx512.inc > fsst_avx512_unroll$*.inc;
+fsst_avx512.o: fsst_avx512.cpp fsst_avx512_unroll1.inc fsst_avx512_unroll2.inc fsst_avx512_unroll3.inc fsst_avx512_unroll4.inc
+	g++ -std=c++17 -W -Wall -g -O1 -march=native -c fsst_avx512.cpp # -O1: no constant propagation reduces register pressure and improves unrolling
diff --git a/fsst/README.md b/fsst/README.md
new file mode 100644
index 00000000..95062ed9
--- /dev/null
+++ b/fsst/README.md
@@ -0,0 +1,25 @@
+# FSST
+Fast Static Symbol Table (FSST): fast text compression that allows random access
+
+[![Watch the video](https://github.com/cwida/fsst/raw/master/fsst-presentation.png)](https://github.com/cwida/fsst/raw/master/fsst-presentation.mp4)
+
+Authors:
+- Peter Boncz (CWI)
+- Viktor Leis (FSU Jena)
+- Thomas Neumann (TU Munchen)
+
+You can contact the authors via the issues of this FSST source repository: https://github.com/cwida/fsst
+
+FSST: Fast Static Symbol Table compression
+see the PVLDB paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf
+
+FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e. where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. LZ4 (which is block-based), FSST achieves similar decompression speed and compression speed, and a better compression ratio.
+
+FSST encodes strings using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences) onto "codes" (single bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte sequences can be treated as strings again, so they fit into whatever your program uses to manipulate strings. An optional 0-terminated mode (like C-strings) is also supported.
+
+FSST ensures that strings that are equal are also equal in their compressed form. This means equality comparisons can be performed without decompressing the strings.
+
+FSST compression is quite useful in database systems and data file formats. It allows, for example, fine-grained decompression of values when selection predicates are pushed down into a scan operator. But very often FSST even allows postponing decompression of string data. This means hash tables (in joins and aggregations) become smaller, and network communication (in case of distributed query processing) is reduced. All of this without requiring many structural changes to existing systems: after all, FSST-compressed strings still remain strings.
+
+The implementation of FSST is quite portable: it uses CMake and has been verified to work on 64-bit x86 computers running Linux, MacOS and Windows.
+
diff --git a/fsst/fsst.cpp b/fsst/fsst.cpp
new file mode 100644
index 00000000..3ab8f6eb
--- /dev/null
+++ b/fsst/fsst.cpp
@@ -0,0 +1,192 @@
+// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
+//
+// Copyright 2018-2019, CWI, TU Munich, FSU Jena
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
+// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
+// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
+#include "fsst.h" // the official FSST API -- also usable by C mortals
+#include <algorithm>
+#include <condition_variable>
+#include <fstream>
+#include <iostream>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+using namespace std;
+
+// Utility to compress and decompress (-d) data with FSST (reading an input file and writing an output file).
+//
+// The utility has a poor-man's async I/O in that it uses double buffering for input and output,
+// and two background threads for reading and writing. The idea is to make the CPU overlap with I/O.
+//
+// The data format is quite simple. An FSST-compressed file is a sequence of blocks, each with format:
+// (1) 3-byte block length field (max blocksize is hence 16MB). This byte-length includes (1), (2) and (3).
+// (2) FSST dictionary as produced by fsst_export().
+// (3) the FSST compressed data.
+//
+// The natural strength of FSST is in fact not block-based compression, but rather the compression and
+// *individual* decompression of many small strings separately. Think of compressed databases and (column-store)
+// data formats. But this utility serves as an apples-to-apples comparison point with utilities like lz4.
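As a minimal sketch of how the fsst.h API calls used by this utility fit together (illustrative only, not part of the diff: the `roundTrip` helper is hypothetical, error handling is omitted, and the output buffer follows the "conservative space" rule of 7+2*inputlength documented in fsst.h):

```cpp
// Illustrative single-string round trip through the fsst.h C API.
// roundTrip() is a hypothetical helper, not part of this diff.
#include "fsst.h"
#include <string>
#include <vector>

std::string roundTrip(const std::string& s) {
  size_t lenIn = s.size();
  unsigned char* strIn = (unsigned char*) s.data();

  // build a symbol table from the (single-string) sample
  fsst_encoder_t* encoder = fsst_create(1, &lenIn, &strIn, 0);

  // serialize the symbol table; a block writer would store these hdr bytes
  unsigned char header[FSST_MAXHEADER];
  size_t hdr = fsst_export(encoder, header);

  // compress; "conservative space" guarantees the first string fits
  std::vector<unsigned char> out(7 + 2 * lenIn);
  size_t lenOut;
  unsigned char* strOut;
  fsst_compress(encoder, 1, &lenIn, &strIn, out.size(), out.data(), &lenOut, &strOut);
  fsst_destroy(encoder);

  // rebuild a decoder from the serialized header and decompress
  fsst_decoder_t decoder;
  fsst_import(&decoder, header);
  std::vector<unsigned char> dec(lenIn); // round trip: output size == input size
  size_t n = fsst_decompress(&decoder, lenOut, strOut, dec.size(), dec.data());
  (void) hdr; // in this utility, the hdr bytes sit between the 3-byte length and the data
  return std::string((char*) dec.data(), n);
}
```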
+
+namespace {
+
+class BinarySemaphore {
+   private:
+   mutex m;
+   condition_variable cv;
+   bool value;
+
+   public:
+   explicit BinarySemaphore(bool initialValue = false) : value(initialValue) {}
+   void wait() {
+      unique_lock<mutex> lock(m);
+      while (!value) cv.wait(lock);
+      value = false;
+   }
+   void post() {
+      { unique_lock<mutex> lock(m); value = true; }
+      cv.notify_one();
+   }
+};
+
+bool stopThreads = false;
+BinarySemaphore srcDoneIO[2], dstDoneIO[2], srcDoneCPU[2], dstDoneCPU[2];
+unsigned char *srcBuf[2] = { NULL, NULL };
+unsigned char *dstBuf[2] = { NULL, NULL };
+unsigned char *dstMem[2] = { NULL, NULL };
+size_t srcLen[2] = { 0, 0 };
+size_t dstLen[2] = { 0, 0 };
+
+#define FSST_MEMBUF (1ULL<<22)
+int decompress = 0;
+size_t blksz = FSST_MEMBUF-(1+FSST_MAXHEADER/2); // block size of compression (max compressed size must fit 3 bytes)
+
+#define DESERIALIZE(p) (((unsigned long long) (p)[0]) << 16) | (((unsigned long long) (p)[1]) << 8) | ((unsigned long long) (p)[2])
+#define SERIALIZE(l,p) { (p)[0] = ((l)>>16)&255; (p)[1] = ((l)>>8)&255; (p)[2] = (l)&255; }
+
+void reader(ifstream& src) {
+   for(int swap=0; true; swap = 1-swap) {
+      srcDoneCPU[swap].wait();
+      if (stopThreads) break;
+      src.read((char*) srcBuf[swap], blksz);
+      srcLen[swap] = (unsigned long) src.gcount();
+      if (decompress) {
+         if (blksz && srcLen[swap] == blksz) {
+            blksz = DESERIALIZE(srcBuf[swap]+blksz-3); // read size of next block
+            srcLen[swap] -= 3; // cut off size bytes
+         } else {
+            blksz = 0;
+         }
+      }
+      srcDoneIO[swap].post();
+   }
+}
+
+void writer(ofstream& dst) {
+   for(int swap=0; true; swap = 1-swap) {
+      dstDoneCPU[swap].wait();
+      if (!dstLen[swap]) break;
+      dst.write((char*) dstBuf[swap], dstLen[swap]);
+      dstDoneIO[swap].post();
+   }
+   for(int swap=0; swap<2; swap++)
+      dstDoneIO[swap].post();
+}
+
+}
+
+int main(int argc, char* argv[]) {
+   size_t srcTot = 0, dstTot = 0;
+   if (argc < 2 || argc > 4 || (argc == 4 && (argv[1][0] != '-' || argv[1][1] != 'd' || argv[1][2]))) {
+      cerr << "usage: " << argv[0] << " -d infile outfile" << endl;
+      cerr << "       " << argv[0] << " infile outfile" << endl;
+      cerr << "       " << argv[0] << " infile" << endl;
+      return -1;
+   }
+   decompress = (argc == 4);
+   string srcfile(argv[1+decompress]), dstfile;
+   if (argc == 2) {
+      dstfile = srcfile + ".fsst";
+   } else {
+      dstfile = argv[2+decompress];
+   }
+   ifstream src;
+   ofstream dst;
+   src.open(srcfile, ios::binary);
+   dst.open(dstfile, ios::binary);
+   dst.exceptions(ios_base::failbit | ios_base::badbit);
+   src.exceptions(ios_base::badbit);
+   if (decompress) {
+      unsigned char tmp[3];
+      src.read((char*) tmp, 3);
+      if (src.gcount() != 3) {
+         cerr << "failed to open input." << endl;
+         return -1;
+      }
+      blksz = DESERIALIZE(tmp); // read first block size
+   }
+   vector<unsigned char> buffer(FSST_MEMBUF*6);
+   srcBuf[0] = buffer.data();
+   srcBuf[1] = srcBuf[0] + (FSST_MEMBUF*(1ULL+decompress));
+   dstMem[0] = srcBuf[1] + (FSST_MEMBUF*(1ULL+decompress));
+   dstMem[1] = dstMem[0] + (FSST_MEMBUF*(2ULL-decompress));
+
+   for(int swap=0; swap<2; swap++) {
+      srcDoneCPU[swap].post(); // input buffer is not being processed initially
+      dstDoneIO[swap].post();  // output buffer is not being written initially
+   }
+   thread readerThread([&src]{ reader(src); });
+   thread writerThread([&dst]{ writer(dst); });
+
+   for(int swap=0; true; swap = 1-swap) {
+      srcDoneIO[swap].wait(); // wait until input buffer is available (i.e. done reading)
+      dstDoneIO[swap].wait(); // wait until output buffer is done being written, hence free for use
+      if (srcLen[swap] == 0) {
+         dstLen[swap] = 0;
+         break;
+      }
+      if (decompress) {
+         fsst_decoder_t decoder;
+         size_t hdr = fsst_import(&decoder, srcBuf[swap]);
+         dstLen[swap] = fsst_decompress(&decoder, srcLen[swap] - hdr, srcBuf[swap] + hdr, FSST_MEMBUF, dstBuf[swap] = dstMem[swap]);
+      } else {
+         unsigned char tmp[FSST_MAXHEADER];
+         fsst_encoder_t *encoder = fsst_create(1, &srcLen[swap], &srcBuf[swap], 0);
+         size_t hdr = fsst_export(encoder, tmp);
+         if (fsst_compress(encoder, 1, &srcLen[swap], &srcBuf[swap], FSST_MEMBUF*2, dstMem[swap]+FSST_MAXHEADER+3,
+                           &dstLen[swap], &dstBuf[swap]) < 1) return -1;
+         dstLen[swap] += 3 + hdr;
+         dstBuf[swap] -= 3 + hdr;
+         SERIALIZE(dstLen[swap],dstBuf[swap]); // block starts with size
+         copy(tmp, tmp+hdr, dstBuf[swap]+3);   // then the header (followed by the compressed bytes which are already there)
+         fsst_destroy(encoder);
+      }
+      srcTot += srcLen[swap];
+      dstTot += dstLen[swap];
+      srcDoneCPU[swap].post(); // input buffer may be re-used by the reader for the next block
+      dstDoneCPU[swap].post(); // output buffer is ready for writing out
+   }
+   cerr << (decompress?"Dec":"C") << "ompressed " << srcTot << " bytes into " << dstTot << " bytes ==> " << (int) ((100*dstTot)/srcTot) << "%" << endl;
+
+   // force wait until all background writes finished
+   stopThreads = true;
+   for(int swap=0; swap<2; swap++) {
+      srcDoneCPU[swap].post();
+      dstDoneCPU[swap].post();
+   }
+   dstDoneIO[0].wait();
+   dstDoneIO[1].wait();
+   readerThread.join();
+   writerThread.join();
+}
diff --git a/fsst/fsst.h b/fsst/fsst.h
new file mode 100644
index 00000000..8ffa2189
--- /dev/null
+++ b/fsst/fsst.h
@@ -0,0 +1,222 @@
+/*
+ * the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019
+ *
+ * ===================================================================================================================================
+ * this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
+ *
+ * Copyright 2018-2020, CWI, TU Munich, FSU Jena
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
+ * (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+ * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
+ * ===================================================================================================================================
+ *
+ * FSST: Fast Static Symbol Table compression
+ * see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf
+ *
+ * FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e.
+ * where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual
+ * strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is
+ * block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text.
+ *
+ * FSST also encodes strings using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences)
+ * onto "codes" (single bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression
+ * transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte sequences can
+ * be treated as strings again, so they fit into whatever your program uses to manipulate strings.
+ *
+ * useful property: FSST ensures that strings that are equal are also equal in their compressed form.
+ *
+ * In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of
+ * unsigned char* pointers to their starts. A separate length array (of unsigned int) denotes how many bytes each string consists of.
+ *
+ * This representation as unsigned char* pointers tries to assume as little as possible about the memory management of the program
+ * that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings).
+ *
+ * We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are
+ * also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length.
+ */
+#ifndef _FSST_H_
+#define _FSST_H_
+
+#ifdef _MSC_VER
+#define __restrict__
+#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+#define __ORDER_LITTLE_ENDIAN__ 2
+#include <intrin.h>
+static inline int __builtin_ctzl(unsigned long long x) {
+   unsigned long ret;
+   _BitScanForward64(&ret, x);
+   return (int)ret;
+}
+#endif
+
+#ifdef __cplusplus
+#define FSST_FALLTHROUGH [[fallthrough]]
+extern "C" {
+#else
+#define FSST_FALLTHROUGH
+#endif
+
+#include <stddef.h>
+
+/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */
+#define FSST_ESC 255
+
+/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. */
+typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */
+
+/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */
+typedef struct {
+   unsigned long long version;     /* version id */
+   unsigned char zeroTerminated;   /* terminator is a single-byte code that does not appear in longer symbols */
+   unsigned char len[255];         /* len[x] is the byte-length of the symbol x (1 < len[x] <= 8). */
+   unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). */
+} fsst_decoder_t;
+
+/* Calibrate an FSST symbol table from a batch of strings (it is best to provide at least 16KB of data). */
+fsst_encoder_t*
+fsst_create(
+   size_t n,                /* IN: number of strings in batch to sample from. */
+   size_t lenIn[],          /* IN: byte-lengths of the inputs */
+   unsigned char *strIn[],  /* IN: string start pointers. */
+   int zeroTerminated       /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */
+);
+
+/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */
+fsst_encoder_t*
+fsst_duplicate(
+   fsst_encoder_t *encoder  /* IN: the symbol table to duplicate. */
+);
+
+#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of serialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */
+
+/* Space-efficient symbol table serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */
+unsigned int                /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */
+fsst_export(
+   fsst_encoder_t *encoder, /* IN: the symbol table to dump. */
+   unsigned char *buf       /* OUT: byte-buffer into which this symbol table is serialized. */
+);
+
+/* Deallocate encoder. */
+void
+fsst_destroy(fsst_encoder_t*);
+
+/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */
+unsigned int                /* OUT: number of bytes consumed in buf (0 on failure). */
+fsst_import(
+   fsst_decoder_t *decoder, /* OUT: this symbol table will be overwritten. */
+   unsigned char *buf       /* IN: pointer to a byte-buffer where fsst_export() serialized this symbol table. */
+);
+
+/* Return a decoder structure from an encoder. */
+fsst_decoder_t
+fsst_decoder(
+   fsst_encoder_t *encoder
+);
+
+/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */
+/* The output buffer must be large: at least "conservative space" (7+2*inputlength) for the first string, for anything to happen. */
+size_t                      /* OUT: the number of compressed strings (<=n) that fit the output buffer. */
+fsst_compress(
+   fsst_encoder_t *encoder, /* IN: encoder obtained from fsst_create(). */
+   size_t nstrings,         /* IN: number of strings in batch to compress. */
+   size_t lenIn[],          /* IN: byte-lengths of the inputs */
+   unsigned char *strIn[],  /* IN: input string start pointers. */
+   size_t outsize,          /* IN: byte-length of output buffer. */
+   unsigned char *output,   /* OUT: memory buffer to put the compressed strings in (one after the other). */
+   size_t lenOut[],         /* OUT: byte-lengths of the compressed strings. */
+   unsigned char *strOut[]  /* OUT: output string start pointers. Will all point into [output,output+size). */
+);
+
+/* Decompress a single string, inlined for speed. */
+inline size_t               /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */
+fsst_decompress(
+   fsst_decoder_t *decoder, /* IN: use this symbol table for decompression. */
+   size_t lenIn,            /* IN: byte-length of compressed string. */
+   unsigned char *strIn,    /* IN: compressed string. */
+   size_t size,             /* IN: byte-length of output buffer. */
+   unsigned char *output    /* OUT: memory buffer to put the decompressed string in.
*/
+) {
+   unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len;
+   unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output;
+   unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol;
+   size_t code, posOut = 0, posIn = 0;
+#ifndef FSST_MUST_ALIGN_STORES /* define this if your platform does not allow unaligned memory access */
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+   while (posOut+32 <= size && posIn+4 <= lenIn) {
+      unsigned int nextBlock = *((unsigned int*) (strIn+posIn));
+      unsigned int escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u); /* sets bit 7 in each byte of nextBlock that equals FSST_ESC (255) */
+      if (escapeMask == 0) {
+         code = strIn[posIn++]; *(unsigned long long*) (strOut+posOut) = symbol[code]; posOut += len[code];
+         code = strIn[posIn++]; *(unsigned long long*) (strOut+posOut) = symbol[code]; posOut += len[code];
+         code = strIn[posIn++]; *(unsigned long long*) (strOut+posOut) = symbol[code]; posOut += len[code];
+         code = strIn[posIn++]; *(unsigned long long*) (strOut+posOut) = symbol[code]; posOut += len[code];
+      } else {
+         unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3;
+         switch(firstEscapePos) { /* Duff's device */
+         case 3: code = strIn[posIn++]; *(unsigned long long*) (strOut+posOut) = symbol[code]; posOut += len[code]; FSST_FALLTHROUGH;
+         case 2: code = strIn[posIn++]; *(unsigned long long*) (strOut+posOut) = symbol[code]; posOut += len[code]; FSST_FALLTHROUGH;
+         case 1: code = strIn[posIn++]; *(unsigned long long*) (strOut+posOut) = symbol[code]; posOut += len[code]; FSST_FALLTHROUGH;
+         case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */
+         }
+      }
+   }
+   if (posOut+24 <= size) { // handle the last (at most 3) input bytes without a loop
+      if (posIn+2 <= lenIn) {
+         strOut[posOut] = strIn[posIn+1];
+         if (strIn[posIn] != FSST_ESC) {
+            code = strIn[posIn++]; *(unsigned long long*) (strOut+posOut) = symbol[code]; posOut += len[code];
+            if (strIn[posIn] != FSST_ESC) {
+               code = strIn[posIn++]; *(unsigned long long*) (strOut+posOut) = symbol[code]; posOut += len[code];
+            } else {
+               posIn += 2; strOut[posOut++] = strIn[posIn-1];
+            }
+         } else {
+            posIn += 2; posOut++;
+         }
+      }
+      if (posIn < lenIn) { // last code cannot be an escape
+         code = strIn[posIn++]; *(unsigned long long*) (strOut+posOut) = symbol[code]; posOut += len[code];
+      }
+   }
+#else
+   while (posOut+8 <= size && posIn < lenIn)
+      if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */
+         *(unsigned long long*) (strOut+posOut) = symbol[code]; /* unaligned memory write */
+         posOut += len[code];
+      } else {
+         strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */
+         posIn++; posOut++;
+      }
+#endif
+#endif
+   while (posIn < lenIn)
+      if ((code = strIn[posIn++]) < FSST_ESC) {
+         size_t posWrite = posOut, endWrite = posOut + len[code];
+         unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite;
+         if ((posOut = endWrite) > size) endWrite = size;
+         for(; posWrite < endWrite; posWrite++) /* only write if there is room */
+            strOut[posWrite] = symbolPointer[posWrite];
+      } else {
+         if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */
+         posIn++; posOut++;
+      }
+   if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0;
+   return posOut; /* full size of the decompressed string (could be > size; then only the first size bytes were actually written) */
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FSST_H_ */
diff --git a/fsst/fsst_avx512.cpp b/fsst/fsst_avx512.cpp
new file mode 100644
index 00000000..a2b7b5e5
--- /dev/null
+++ b/fsst/fsst_avx512.cpp
@@ -0,0 +1,140 @@
+// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
+//
+// Copyright 2018-2020, CWI, TU Munich, FSU Jena
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
+// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
+// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
+#include "libfsst.hpp"
+
+#if defined(__x86_64__) || defined(_M_X64)
+#include <immintrin.h>
+
+#ifdef _WIN32
+bool fsst_hasAVX512() {
+   int info[4];
+   __cpuidex(info, 0x00000007, 0);
+   return (info[1]>>16)&1;
+}
+#else
+#include <cpuid.h>
+bool fsst_hasAVX512() {
+   int info[4];
+   __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]);
+   return (info[1]>>16)&1;
+}
+#endif
+#else
+bool fsst_hasAVX512() { return false; }
+#endif
+
+// BULK COMPRESSION OF STRINGS
+//
+// In one call of this function, we can compress 512 strings, each of maximum length 511 bytes.
+// strings can be shorter than 511 bytes, no problem, but if they are longer we need to cut them up.
+//
+// In each iteration of the while loop, we find one code in each of the unroll*8 strings, i.e. 8, 16, 24 or 32 codes for unroll=1,2,3,4 respectively.
+// unroll3 performs best on my hardware
+//
+// In the worst case, the 512 final encoded strings together occupy 512KB (512*1024 bytes; 1024 = 512 exceptions of 2 bytes each per string).
+// - hence codeBase is a buffer of 512KB (job offsets into it need 19 bits), symbolBase of 256KB (job offsets need 18 bits).
+//
+// 'jobX' controls the encoding of each string and is therefore a u64 with format [out:19][pos:9][end:18][cur:18] (low-to-high bits)
+// The field 'pos' tells which string we are processing (0..511). We need this info as strings will complete compressing out-of-order.
+//
+// Strings will have different lengths, and when a string is finished, we reload from the buffer of 512 input strings.
+// This continues until we have less than (8,16,24 or 32; depending on unroll) strings left to process.
+// - so 'processed' is the number of strings we started processing, and it is between [480,512].
+// Note that when we quit, there will still be some (<32) strings that we started to process but which are unfinished.
+// - so 'unfinished' is that number. These unfinished strings will be encoded further using the scalar method.
+//
+// Apart from the coded strings, we return in an output[] array of size 'processed' the job values of the 'finished' strings.
+// In the following 'unfinished' slots (processed=finished+unfinished) we output the 'job' values of the unfinished strings.
+//
+// For the finished strings, we need [out:19] to see the compressed size and [pos:9] to see which string we refer to.
+// For the unfinished strings, we need all fields of 'job' to continue the compression with scalar code (see SIMD code in compressBatch).
+//
+// THIS IS A SEPARATE CODE FILE NOT BECAUSE OF MY LOVE FOR MODULARIZED CODE BUT BECAUSE IT ALLOWS COMPILING IT WITH DIFFERENT FLAGS
+// in particular, unrolling is crucial for gather/scatter performance, but requires registers. The #define all_* expressions, however,
+// will be detected to be constants by g++ -O2 and will be precomputed and placed into AVX512 registers - spoiling 9 of them.
+// This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1, which skips this optimization.
+// Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores.
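Since all of the gather/scatter address math in the kernel below is done on these packed u64 jobs, a small illustrative sketch of the field layout may help; the helpers are plain C++ written for this description (hypothetical, not part of libfsst) and mirror the shift constants (19+9=28, 28+18=46) and the all_M19/all_M18 masks used in the SIMD code:

```cpp
// Illustrative sketch of the SIMDjob bit layout described above:
// [out:19][pos:9][end:18][cur:18], packed low-to-high in a 64-bit word.
// packJob()/unpackJob() are hypothetical helpers, not part of the library.
#include <cstdint>

struct JobFields { uint64_t out, pos, end, cur; };

inline uint64_t packJob(JobFields f) {
  return  (f.out & ((1ULL << 19) - 1))         // bits  0..18: output offset into codeBase (512KB)
       | ((f.pos & ((1ULL <<  9) - 1)) << 19)  // bits 19..27: which string (0..511)
       | ((f.end & ((1ULL << 18) - 1)) << 28)  // bits 28..45: end offset into symbolBase
       | ((f.cur & ((1ULL << 18) - 1)) << 46); // bits 46..63: current offset into symbolBase
}

inline JobFields unpackJob(uint64_t job) {
  return { job         & ((1ULL << 19) - 1),   // the all_M19 mask in the kernel
           (job >> 19) & ((1ULL <<  9) - 1),
           (job >> 28) & ((1ULL << 18) - 1),   // the all_M18 mask in the kernel
           (job >> 46) };                      // top field: no mask needed
}
```

A lane is finished when cur == end, which is exactly the comparison the kernel performs between _mm512_srli_epi64(jobX, 46) and _mm512_and_epi64(_mm512_srli_epi64(jobX, 28), all_M18).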
+ +size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob *input, SIMDjob *output, size_t n, size_t unroll) { + size_t processed = 0; + // define some constants (all_x means that all 8 lanes contain 64-bits value X) +#ifdef __AVX512F__ + //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c + __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); + __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); + __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); +#define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE) +#define all_ONE _mm512_srli_epi64(all_MASK, 63) +#define all_M19 _mm512_srli_epi64(all_MASK, 45) +#define all_M18 _mm512_srli_epi64(all_MASK, 46) +#define all_M28 _mm512_srli_epi64(all_MASK, 36) +#define all_FFFFFF _mm512_srli_epi64(all_MASK, 40) +#define all_FFFF _mm512_srli_epi64(all_MASK, 48) +#define all_FF _mm512_srli_epi64(all_MASK, 56) + + SIMDjob *inputEnd = input+n; + assert(n >= unroll*8 && n <= 512); // should be close to 512 + __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 + __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 2b loaded new strings bitmask per unroll + u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll + + if (unroll >= 4) { + while (input+delta1+delta2+delta3+delta4 < inputEnd) { + #include "fsst_avx512_unroll4.inc" + } + } else if (unroll == 3) { + while (input+delta1+delta2+delta3 < inputEnd) { + #include "fsst_avx512_unroll3.inc" + } + } else if (unroll == 2) { + while (input+delta1+delta2 < inputEnd) { + #include "fsst_avx512_unroll2.inc" + } + } else { + while (input+delta1 < inputEnd) { + #include "fsst_avx512_unroll1.inc" + } + } + + // flush the job states of the unfinished strings at the end of output[] + processed = n - (inputEnd - input); + u32 unfinished = 0; + if (unroll > 1) { + if (unroll > 2) { + if (unroll > 3) { + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); + unfinished += _mm_popcnt_u32((int) loadmask4); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); + unfinished += _mm_popcnt_u32((int) loadmask3); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); + unfinished += _mm_popcnt_u32((int) loadmask2); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); +#else + (void) symbolTable; + (void) codeBase; + (void) symbolBase; + (void) input; + (void) output; + (void) n; + (void) unroll; +#endif + return processed; +} diff --git a/fsst/fsst_avx512.inc b/fsst/fsst_avx512.inc new file mode 100644 index 00000000..0a74541d --- /dev/null +++ b/fsst/fsst_avx512.inc @@ -0,0 +1,57 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 
+// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmaskX=11111111, deltaX=8). + jobX = _mm512_mask_expandloadu_epi64(jobX, loadmaskX, input); input += deltaX; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i wordX = _mm512_i64gather_epi64(_mm512_srli_epi64(jobX, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // codeX: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i codeX = _mm512_i64gather_epi64(_mm512_and_epi64(wordX, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + __m512i posX = _mm512_mullo_epi64(_mm512_and_epi64(wordX, all_FFFFFF), all_PRIME); + // hash them into a random number: posX = posX*PRIME; posX ^= posX>>SHIFT + posX = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(posX,_mm512_srli_epi64(posX,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + __m512i iclX = _mm512_i64gather_epi64(posX, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the writeX register (in case it turns out to be an escaped byte). + __m512i writeX = _mm512_slli_epi64(_mm512_and_epi64(wordX, all_FF), 8); + // lookup just like the iclX above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symbX = _mm512_i64gather_epi64(posX, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + posX = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(iclX, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 matchX = _mm512_cmpeq_epi64_mask(symbX, _mm512_and_epi64(wordX, posX)) & _mm512_cmplt_epi64_mask(iclX, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + codeX = _mm512_mask_mov_epi64(codeX, matchX, _mm512_srli_epi64(iclX, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + writeX = _mm512_or_epi64(writeX, _mm512_and_epi64(codeX, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + codeX = _mm512_and_epi64(codeX, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(jobX, all_M19), writeX, 1); + // increase the jobX.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + jobX = _mm512_add_epi64(jobX, _mm512_slli_epi64(_mm512_srli_epi64(codeX, FSST_LEN_BITS), 46)); + // increase the jobX.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + jobX = _mm512_add_epi64(jobX, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(codeX, 8), all_ONE))); + // test which lanes are done now (jobX.cur==jobX.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the jobX register) + loadmaskX = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(jobX, 46), _mm512_and_epi64(_mm512_srli_epi64(jobX, 28), all_M18)); + // calculate the amount of lanes in jobX that are done + deltaX = _mm_popcnt_u32((int) loadmaskX); + // write out the job state for the lanes that are done (we need the final 'jobX.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmaskX, jobX); output += deltaX; diff --git a/fsst/fsst_avx512_unroll1.inc b/fsst/fsst_avx512_unroll1.inc new file mode 100644 index 00000000..f4b81c79 --- /dev/null +++ b/fsst/fsst_avx512_unroll1.inc @@ -0,0 +1,57 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; diff --git a/fsst/fsst_avx512_unroll2.inc b/fsst/fsst_avx512_unroll2.inc new file mode 100644 index 00000000..aa33cd7e --- /dev/null +++ b/fsst/fsst_avx512_unroll2.inc @@ -0,0 +1,114 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. 
This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; diff --git a/fsst/fsst_avx512_unroll3.inc b/fsst/fsst_avx512_unroll3.inc new file mode 100644 index 00000000..e2057032 --- /dev/null +++ b/fsst/fsst_avx512_unroll3.inc @@ -0,0 +1,171 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, 
including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). 
+ // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. + __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. 
This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. 
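// For orientation: per 64-bit lane, the gathers and compares above perform the same probe
// that the scalar compressor later in this patch (compressCount/compressBulk) does one
// position at a time. A minimal scalar sketch, assuming the SymbolTable layout from
// libfsst.hpp below (the helper name is illustrative, not part of FSST):
//
//    u16 probeOneLane(const SymbolTable &st, u64 word) {
//       size_t idx = FSST_HASH(word & 0xFFFFFF) & (st.hashTabSize-1); // 3-byte-prefix hash
//       Symbol s   = st.hashTab[idx];                                 // = the icl/symb gathers
//       u64 masked = word & (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);       // zero bytes beyond |s|
//       if (s.icl < FSST_ICL_FREE && s.val.num == masked)             // occupied slot + equal
//          return s.code();                                           // symbol of length >= 3
//       return st.shortCodes[word & 0xFFFF] & FSST_CODE_MASK;         // 2-/1-byte code or escape
//    }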
+ write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in 
job3 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; diff --git a/fsst/fsst_avx512_unroll4.inc b/fsst/fsst_avx512_unroll4.inc new file mode 100644 index 00000000..15cca7c9 --- /dev/null +++ b/fsst/fsst_avx512_unroll4.inc @@ -0,0 +1,228 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E4PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask4=11111111, delta4=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + job4 = _mm512_mask_expandloadu_epi64(job4, loadmask4, input); input += delta4; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). 
+ // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + __m512i word4 = _mm512_i64gather_epi64(_mm512_srli_epi64(job4, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code4: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code4 = _mm512_i64gather_epi64(_mm512_and_epi64(word4, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
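// The scalar form of the hash computed by the next statements (FSST_HASH, FSST_SHIFT and
// FSST_HASH_LOG2SIZE as defined in libfsst.hpp later in this patch; sketch only, assuming
// all_PRIME broadcasts FSST_HASH_PRIME and all_HASH holds hashTabSize-1):
//
//    u64 h = (word & 0xFFFFFF) * FSST_HASH_PRIME;      // 3-byte prefix times the prime
//    h ^= h >> FSST_SHIFT;                             // xor-shift mix
//    h = (h & ((1 << FSST_HASH_LOG2SIZE) - 1)) << 4;   // bucket * 16 (sizeof(Symbol))
//
// i.e. each lane ends up with a byte offset into symbolTable.hashTab, which the two
// gathers below read at +8 (icl) and +0 (the symbol bytes).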
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + __m512i pos4 = _mm512_mullo_epi64(_mm512_and_epi64(word4, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + // hash them into a random number: pos4 = pos4*PRIME; pos4 ^= pos4>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + pos4 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos4,_mm512_srli_epi64(pos4,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write4 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + __m512i write4 = _mm512_slli_epi64(_mm512_and_epi64(word4, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl4 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + pos4 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl4, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + __mmask8 match4 = _mm512_cmpeq_epi64_mask(symb4, _mm512_and_epi64(word4, pos4)) & _mm512_cmplt_epi64_mask(icl4, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + code4 = _mm512_mask_mov_epi64(code4, match4, _mm512_srli_epi64(icl4, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. 
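// Escape handling in scalar form (sketch; mirrors the predicated code in compressBulk
// later in this patch). The code word packs: length:4 at bit 12, escape flag at bit 8
// (FSST_CODE_BASE), code byte in the low 8 bits:
//
//    out[0] = (u8) code;                          // code byte, possibly the 255 escape marker
//    out[1] = (u8) word;                          // speculative original byte, kept on escape
//    out   += 1 + ((code & FSST_CODE_BASE) >> 8); // advance by 2 iff the escape bit is set
//    cur   += code >> FSST_LEN_BITS;              // consume as many input bytes as |symbol|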
+ write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + write4 = _mm512_or_epi64(write4, _mm512_and_epi64(code4, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + code4 = _mm512_and_epi64(code4, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job4, all_M19), write4, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job4.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + job4 = _mm512_add_epi64(job4, _mm512_slli_epi64(_mm512_srli_epi64(code4, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job4.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + job4 = _mm512_add_epi64(job4, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code4, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at 
bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + // test which lanes are done now (job4.cur==job4.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job4 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + loadmask4 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job4, 46), _mm512_and_epi64(_mm512_srli_epi64(job4, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + // calculate the amount of lanes in job4 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + delta4 = _mm_popcnt_u32((int) loadmask4); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job4.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; + _mm512_mask_compressstoreu_epi64(output, loadmask4, job4); output += delta4; diff --git a/fsst/libfsst.cpp b/fsst/libfsst.cpp new file mode 100644 index 00000000..36e75a27 --- /dev/null +++ b/fsst/libfsst.cpp @@ -0,0 +1,632 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
+#include "libfsst.hpp"
+
+Symbol concat(Symbol a, Symbol b) {
+   Symbol s;
+   u32 length = a.length()+b.length();
+   if (length > Symbol::maxLength) length = Symbol::maxLength;
+   s.set_code_len(FSST_CODE_MASK, length);
+   s.val.num = (b.val.num << (8*a.length())) | a.val.num;
+   return s;
+}
+
+namespace std {
+template <>
+class hash<QSymbol> {
+   public:
+   size_t operator()(const QSymbol& q) const {
+      uint64_t k = q.symbol.val.num;
+      const uint64_t m = 0xc6a4a7935bd1e995;
+      const int r = 47;
+      uint64_t h = 0x8445d61a4e774912 ^ (8*m);
+      k *= m;
+      k ^= k >> r;
+      k *= m;
+      h ^= k;
+      h *= m;
+      h ^= h >> r;
+      h *= m;
+      h ^= h >> r;
+      return h;
+   }
+};
+}
+
+bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; }
+
+std::ostream& operator<<(std::ostream& out, const Symbol& s) {
+   for (u32 i=0; i<s.length(); i++)
+      out << s.val.str[i];
+   return out;
+}
+
+SymbolTable *buildSymbolTable(Counters& counters, vector<u8*> line, size_t len[], bool zeroTerminated=false) {
+   SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable();
+   int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception)
+   size_t sampleFrac = 128;
+
+   // start by determining the terminator. We use the (lowest) most infrequent byte as terminator
+   st->zeroTerminated = zeroTerminated;
+   if (zeroTerminated) {
+      st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency
+   } else {
+      u16 byteHisto[256];
+      memset(byteHisto, 0, sizeof(byteHisto));
+      for(size_t i=0; i<line.size(); i++) {
+         u8* cur = line[i];
+         u8* end = cur + len[i];
+         while(cur < end) byteHisto[*cur++]++;
+      }
+      u32 minSize = FSST_SAMPLEMAXSZ, i = st->terminator = 256;
+      while(i-- > 0) {
+         if (byteHisto[i] > minSize) continue;
+         st->terminator = i;
+         minSize = byteHisto[i];
+      }
+   }
+   assert(st->terminator != 256);
+
+   // a random number between 0 and 128
+   auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); };
+
+   // compress sample, and compute (pair-)frequencies
+   auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain
+      int gain = 0;
+
+      for(size_t i=0; i<line.size(); i++) {
+         u8* cur = line[i];
+         u8* end = cur + len[i];
+
+         if (sampleFrac < 128) {
+            // in earlier rounds (sampleFrac < 128) we skip data in the sample (reduces overall work ~2x)
+            if (rnd128(i) > sampleFrac) continue;
+         }
+         if (cur < end) {
+            u16 pos2 = 255, pos1 = st->findLongestSymbol(cur, end);
+            cur += st->symbols[pos1].length();
+            gain += (int) (st->symbols[pos1].length()-(1+isEscapeCode(pos1)));
+            while (true) {
+               u8* old = cur;
+               counters.count1Inc(pos1);
+               // count single symbol (i.e. an option is not extending it)
+               if (st->symbols[pos1].length() != 1)
+                  counters.count1Inc(*cur);
+               if (cur<end-7) {
+                  u64 word = reinterpret_cast<u64*>(cur)[0];
+                  size_t pos = word & 0xFFFFFF;
+                  size_t idx = FSST_HASH(pos)&(st->hashTabSize-1);
+                  Symbol s = st->hashTab[idx];
+                  pos2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK;
+                  word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
+                  if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) {
+                     pos2 = s.code();
+                     cur += s.length();
+                  } else if (pos2 >= FSST_CODE_BASE) {
+                     cur += 2;
+                  } else {
+                     pos2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK;
+                     cur += 1;
+                  }
+               } else if (cur==end) {
+                  break;
+               } else {
+                  assert(cur<end);
+                  pos2 = st->findLongestSymbol(cur, end);
+                  cur += st->symbols[pos2].length();
+               }
+
+               // compute compressed output size
+               gain += ((int) (cur-old))-(1+isEscapeCode(pos2));
+
+               // now count the subsequent two symbols we encode as an extension possibility
+               if (sampleFrac < 128) { // no need to count pairs in final round
+                  counters.count2Inc(pos1, pos2);
+                  if ((cur-old) > 1) // do not count escaped bytes doubly
+                     counters.count2Inc(pos1, *old);
+               }
+               pos1 = pos2;
+            }
+         }
+      }
+      return gain;
+   };
+
+   auto makeTable = [&](SymbolTable *st, Counters &counters) {
+      // hashmap of candidates (needed because we can generate duplicate candidates)
+      unordered_set<QSymbol> cands;
+
+      // artificially make terminator the most frequent symbol so it gets included
+      u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator;
+      counters.count1Set(terminator,65535);
+
+      auto addOrInc = [&](unordered_set<QSymbol> &cands, Symbol s, u64 count) {
+         if (count < (5*sampleFrac)/128) return; // improves both compression speed (fewer candidates), but also quality!!
+         QSymbol q;
+         q.symbol = s;
+         q.gain = count * s.length();
+         auto it = cands.find(q);
+         if (it != cands.end()) {
+            q.gain += (*it).gain;
+            cands.erase(*it);
+         }
+         cands.insert(q);
+      };
+
+      // add candidate symbols based on counted frequency
+      for (u32 pos1=0; pos1<FSST_CODE_BASE+(size_t) st->nSymbols; pos1++) {
+         u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!!
+         if (!cnt1) continue;
+
+         // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed
+         Symbol s1 = st->symbols[pos1];
+         addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1);
+
+         if (sampleFrac >= 128 || // last round we do not create new (combined) symbols
+             s1.length() == Symbol::maxLength || // symbol cannot be extended
+             s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte
+            continue;
+         }
+         for (u32 pos2=0; pos2<FSST_CODE_BASE+(size_t) st->nSymbols; pos2++) {
+            u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!!
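// (worked example of the gain bookkeeping, with hypothetical counts: if the symbol "ht"
//  was counted 1000x and the pair counter saw it followed by the symbol "tp" 900x, then
//  the candidate concat("ht","tp") == "http" enters cands via addOrInc() with
//  gain = 900*4 = 3600, outranking "ht" itself (gain = 1000*2 = 2000) in the priority
//  queue below -- a longer symbol wins a table slot exactly when it pays off)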
+            if (!cnt2) continue;
+
+            // create a new symbol
+            Symbol s2 = st->symbols[pos2];
+            Symbol s3 = concat(s1, s2);
+            if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte
+               addOrInc(cands, s3, cnt2);
+         }
+      }
+
+      // insert candidates into priority queue (by gain)
+      auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); };
+      priority_queue<QSymbol,vector<QSymbol>,decltype(cmpGn)> pq(cmpGn);
+      for (auto& q : cands)
+         pq.push(q);
+
+      // Create new symbol map using best candidates
+      st->clear();
+      while (st->nSymbols < 255 && !pq.empty()) {
+         QSymbol q = pq.top();
+         pq.pop();
+         st->add(q.symbol);
+      }
+   };
+
+   u8 bestCounters[512*sizeof(u16)];
+#ifdef NONOPT_FSST
+   for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) {
+      sampleFrac = frac;
+#else
+   for(sampleFrac=8; true; sampleFrac += 30) {
+#endif
+      memset(&counters, 0, sizeof(Counters));
+      long gain = compressCount(st, counters);
+      if (gain >= bestGain) { // a new best solution!
+         counters.backup1(bestCounters);
+         *bestTable = *st; bestGain = gain;
+      }
+      if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128)
+      makeTable(st, counters);
+   }
+   delete st;
+   counters.restore1(bestCounters);
+   makeTable(bestTable, counters);
+   bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression
+   return bestTable;
+}
+
+static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, size_t len[], u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) {
+   size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size;
+   u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings
+   SIMDjob input[512];  // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer
+   SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this)
+   size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs)
+
+   while (curLine < nlines && outOff <= (1<<19)) {
+      size_t prevLine = curLine, chunk, curOff = 0;
+
+      // bail out if the output buffer cannot hold the compressed next string fully
+      if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7
+      else budget -= (len[curLine]-curOff)*2;
+
+      strOut[curLine] = (u8*) 0;
+      lenOut[curLine] = 0;
+
+      do {
+         do {
+            chunk = len[curLine] - curOff;
+            if (chunk > 511) {
+               chunk = 511; // large strings need to be chopped up into segments of 511 bytes
+            }
+            // create a job in this batch
+            SIMDjob job;
+            job.cur = inOff;
+            job.end = job.cur + chunk;
+            job.pos = batchPos;
+            job.out = outOff;
+
+            // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros)
+            outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) bytes.
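// Sizing sketch behind the "+7" (illustration only; the kJob* names are not in FSST):
// a 511-byte job can emit at most 2*511 code bytes (everything escaped), and the 8-byte
// scatter stores may overshoot the last code by up to 7 bytes. A full batch of such
// worst-case jobs slightly exceeds the 512KB output half of simdbuf:
//
//    constexpr size_t kJobMax   = 511;            // max input bytes per SIMD job
//    constexpr size_t kJobWorst = 2*kJobMax + 7;  // = 1029 staging bytes per job
//    static_assert(512*kJobWorst > (size_t(1) << 19),
//                  "a full worst-case batch exceeds 512KB, hence the outOff check below");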
+ if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk + + // register job in this batch + input[batchPos] = job; + jobLine[batchPos] = curLine; + + if (chunk == 0) { + empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out + } else { + // copy string chunk into temp buffer + memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); + inOff += chunk; + curOff += chunk; + symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded + } + if (++batchPos == 512) break; + } while(curOff < len[curLine]); + + if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines)) { // cannot accumulate more? + if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) + // radix-sort jobs on length (longest string first) + // -- this provides best load balancing and allows to skip empty jobs at the end + u16 sortpos[513]; + memset(sortpos, 0, sizeof(sortpos)); + + // calculate length histo + for(size_t i=0; i(cur)[0]; + u64 code = symbolTable.shortCodes[word & 0xFFFF]; + size_t pos = word & 0xFFFFFF; + size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-1); + Symbol s = symbolTable.hashTab[idx]; + out[1] = (u8) word; // speculatively write out escaped byte + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } + } + job.out = out - codeBase; + } + // postprocess job info + job.cur = 0; + job.end = job.out - input[job.pos].out; // misuse .end field as compressed size + job.out = input[job.pos].out; // reset offset to start of encoded string + input[job.pos] = job; + } + + // copy out the result data + for(size_t i=0; i(cur)[0]; + size_t code = symbolTable.shortCodes[word & 0xFFFF]; + if (noSuffixOpt && ((u8) code) < suffixLim) { + // 2 byte code without having to worry about longer matches + *out++ = (u8) code; cur += 2; + } else { + size_t pos = word & 0xFFFFFF; + size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-1); + Symbol s = symbolTable.hashTab[idx]; + out[1] = (u8) word; // speculatively write out escaped byte + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else if (avoidBranch) { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } else if ((u8) code < byteLim) { + // 2 byte code after checking there is no longer pattern + *out++ = (u8) code; cur += 2; + } else { + // 1 byte code or miss. 
+ *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse + cur++; + } + } + } + }; + + for(curLine=0; curLine 511) { + chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST + skipCopy = false; // need to put terminator, so no in place mem usage possible + } + if ((2*chunk+7) > (size_t) (lim-out)) { + return curLine; // out of memory + } + if (!skipCopy) { // only in case of short zero-terminated strings, we can avoid copying + memcpy(buf, cur, chunk); + cur = buf; + buf[chunk] = (u8) symbolTable.terminator; + } + end = cur + chunk; + // based on symboltable stats, choose a variant that is nice to the branch predictor + if (noSuffixOpt) { + compressVariant(true,false); + } else if (avoidBranch) { + compressVariant(false,true); + } else { + compressVariant(false, false); + } + } while((curOff += chunk) < lenIn[curLine]); + lenOut[curLine] = (size_t) (out - strOut[curLine]); + } + return curLine; +} + +#define FSST_SAMPLELINE ((size_t) 512) + +// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes +vector makeSample(u8* sampleBuf, u8* strIn[], size_t **lenRef, size_t nlines) { + size_t totSize = 0, *lenIn = *lenRef; + vector sample; + + for(size_t i=0; i sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample + Encoder *encoder = new Encoder(); + encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); + if (sampleLen != lenIn) delete[] sampleLen; + delete[] sampleBuf; + return (fsst_encoder_t*) encoder; +} + +/* create another encoder instance, necessary to do multi-threaded encoding using the same symbol table */ +extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) { + Encoder *e = new Encoder(); + e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr + return (fsst_encoder_t*) e; +} + +// export a symbol table in compact format. +extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { + Encoder *e = (Encoder*) encoder; + // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. + // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t + // (such functionality could be useful to append compressed data to an existing block). + // + // However, the hash function in the encoder hash table is endian-sensitive, and given its + // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. + // Doing a endian-conversion during hashing will be slow and self-defeating. + // + // Overall, we could support reconstructing an encoder for incremental compression, but + // should enforce equal-endianness. Bit of a bummer. Not going there now. 
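// A sketch of how a reader takes the 8-byte header word apart again (mirrors the
// construction just below and fsst_import() further down; variable names illustrative):
//
//    u64 v; memcpy(&v, buf, 8);           // like the export side: no unaligned access
//    u32 version    = (u32) (v >> 32);    // FSST_VERSION
//    u8  suffixLim  = (u8)  (v >> 24);
//    u8  terminator = (u8)  (v >> 16);
//    u8  nSymbols   = (u8)  (v >> 8);
//    u8  marker     = (u8)  v;            // FSST_ENDIAN_MARKER: nonzero least significant byte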
+   //
+   // The version field is now there just for future-proofness, but not used yet
+
+   // version allows keeping track of fsst versions, track endianness, and encoder reconstruction
+   u64 version = (FSST_VERSION << 32) |  // version is 24 bits, most significant byte is 0
+                 (((u64) e->symbolTable->suffixLim) << 24) |
+                 (((u64) e->symbolTable->terminator) << 16) |
+                 (((u64) e->symbolTable->nSymbols) << 8) |
+                 FSST_ENDIAN_MARKER; // least significant byte is nonzero
+
+   /* do not assume unaligned reads here */
+   memcpy(buf, &version, 8);
+   buf[8] = e->symbolTable->zeroTerminated;
+   for(u32 i=0; i<8; i++)
+      buf[9+i] = (u8) e->symbolTable->lenHisto[i];
+   u32 pos = 17;
+
+   // emit only the used bytes of the symbols
+   for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++)
+      for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++)
+         buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes
+
+   return pos; // length of what was serialized
+}
+
+#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */
+
+extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 *buf) {
+   u64 version = 0;
+   u32 code, pos = 17;
+   u8 lenHisto[8];
+
+   // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped)
+   memcpy(&version, buf, 8);
+   if ((version>>32) != FSST_VERSION) return 0;
+   decoder->zeroTerminated = buf[8]&1;
+   memcpy(lenHisto, buf+9, 8);
+
+   // in case of zero-terminated, first symbol is "" (zero always, may be overwritten)
+   decoder->len[0] = 1;
+   decoder->symbol[0] = 0;
+
+   // we use lenHisto[0] as 1-byte symbol run length (at the end)
+   code = decoder->zeroTerminated;
+   if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end
+
+   // now get all symbols from the buffer
+   for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */
+      for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) {
+         decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */
+         decoder->symbol[code] = 0;
+         for(u32 j=0; j<decoder->len[code]; j++)
+            ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols
+      }
+   }
+   if (decoder->zeroTerminated) lenHisto[0]++;
+
+   // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols).
+ while(code<255) { + decoder->symbol[code] = FSST_CORRUPT; + decoder->len[code++] = 8; + } + return pos; +} + +// runtime check for simd +inline size_t _compressImpl(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { +#ifndef NONOPT_FSST + if (simd && fsst_hasAVX512()) + return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +#endif + (void) simd; + return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch); +} +size_t compressImpl(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} + +// adaptive choosing of scalar compression method based on symbol length histogram +inline size_t _compressAuto(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + bool avoidBranch = false, noSuffixOpt = false; + if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) { + noSuffixOpt = true; + } else if ((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && + (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && + (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { + avoidBranch = true; + } + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} +size_t compressAuto(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +} + +// the main compression function (everything automatic) +extern "C" size_t fsst_compress(fsst_encoder_t *encoder, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) { + // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) + size_t totLen = accumulate(lenIn, lenIn+nlines, 0); + int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15); + return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); +} + +/* deallocate encoder */ +extern "C" void fsst_destroy(fsst_encoder_t* encoder) { + Encoder *e = (Encoder*) encoder; + delete e; +} + +/* very lazy implementation relying on export and import */ +extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) { + u8 buf[sizeof(fsst_decoder_t)]; + u32 cnt1 = fsst_export(encoder, buf); + fsst_decoder_t decoder; + u32 cnt2 = fsst_import(&decoder, buf); + assert(cnt1 == cnt2); (void) cnt1; (void) cnt2; + return decoder; +} diff --git a/fsst/libfsst.hpp b/fsst/libfsst.hpp new file mode 100644 index 00000000..4d06645f --- /dev/null +++ b/fsst/libfsst.hpp @@ -0,0 +1,454 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software 
without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include "fsst.h" // the official FSST API -- also usable by C mortals + +/* unsigned integers */ +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +#define FSST_ENDIAN_MARKER ((u64) 1) +#define FSST_VERSION_20190218 20190218 +#define FSST_VERSION ((u64) FSST_VERSION_20190218) + +// "symbols" are character sequences (up to 8 bytes) +// A symbol is compressed into a "code" of, in principle, one byte. But, we added an exception mechanism: +// byte 255 followed by byte X represents the single-byte symbol X. Its code is 256+X. + +// we represent codes in u16 (not u8). 12 bits code (of which 10 are used), 4 bits length +#define FSST_LEN_BITS 12 +#define FSST_CODE_BITS 9 +#define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ +#define FSST_CODE_MAX (1UL<=8) { + val.num = reinterpret_cast(input)[0]; + len = 8; + } else { +#ifdef MEMDEBUG + for(u32 i=0; i(input)&63)<=(64-8)) { + u64 eight = reinterpret_cast(input)[0]; + val.num = (eight<>ignoredBits; + } else { + val.num = reinterpret_cast(input+len-8)[0]>>ignoredBits; + } +#endif + } + set_code_len(FSST_CODE_MAX, len); + } + void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } + + u32 length() const { return (u32) (icl >> 28); } + u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } + u32 ignoredBits() const { return (u32) icl; } + + u8 first() const { assert( length() >= 1); return 0xFF & val.num; } + u16 first2() const { assert( length() >= 2); return 0xFFFF & val.num; } + +#define FSST_HASH_LOG2SIZE 10 +#define FSST_HASH_PRIME 2971215073LL +#define FSST_SHIFT 15 +#define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) + size_t hash() const { size_t v = 0xFFFFFF & val.num; return FSST_HASH(v); } // hash on the next 3 bytes +}; + +// Symbol that can be put in a queue, ordered on gain +struct QSymbol{ + Symbol symbol; + mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols + bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } +}; + +// we construct FSST symbol tables using a random sample of about 16KB (1<<14) +#define FSST_SAMPLETARGET (1<<14) +#define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET) + +// two phases of compression, 
before and after optimize(): +// +// (1) to encode values we probe (and maintain) three datastructures: +// - u16 byteCodes[65536] array at the position of the next byte (s.length==1) +// - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) +// - Symbol hashtable[1024] (keyed by the next three bytes, ie for s.length>2), +// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are +// pseudo codes representing a single byte these will become escapes) +// +// (2) when we finished looking for the best symbol table we call optimize() to reshape it: +// - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1) +// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes +// (allows shortcut during compression) +// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding +// to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one +// byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[] +// and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction +// is that adding a new code/symbol is expensive (you have to touch shortCodes[] in 256 places). This optimization was +// hence added to make symbolTable construction faster. +// +// this final layout allows for the fastest compression code, only currently present in compressBulk + +// in the hash table, the icl field contains (low-to-high) ignoredBits:16,code:12,length:4 +#define FSST_ICL_FREE ((15<<28)|(((u32)FSST_CODE_MASK)<<16)) // high bits of icl (len=8,code=FSST_CODE_MASK) indicates free bucket + +// ignoredBits is (8-length)*8, which is the amount of high bits to zero in the input word before comparing with the hashtable key +// ..it could of course be computed from len during lookup, but storing it precomputed in some loose bits is faster +// +// the gain field is only used in the symbol queue that sorts symbols on gain + +struct SymbolTable { + static const u32 hashTabSize = 1<> (u8) s.icl); + return true; + } + bool add(Symbol s) { + assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX); + u32 len = s.length(); + s.set_code_len(FSST_CODE_BASE + nSymbols, len); + if (len == 1) { + byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<> ((u8) hashTab[idx].icl)))) { + return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol + } + if (s.length() >= 2) { + u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; + if (code >= FSST_CODE_BASE) return code; + } + return byteCodes[s.first()] & FSST_CODE_MASK; + } + u16 findLongestSymbol(u8* cur, u8* end) const { + return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol + } + + // rationale for finalize: + // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable() + // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() + // (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) + // - we not only reduce the amount of codes to <255, but also 
+
+   // rationale for finalize:
+   // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable()
+   //   consequently we needed more than 8 bits during symbol table construction, but can simplify the codes to single bytes in finalize()
+   //   (this feature is in fact no longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass)
+   // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher compression performance.
+   //   we renumber codes so they are grouped by length, to allow optimized scalar string compression (the byteLim and suffixLim optimizations).
+   // - we make the use of byteCodes[] unnecessary by inserting single-byte codes in the free spots of shortCodes[].
+   //   Using shortCodes[] only makes compression faster. When creating the symbolTable, however, using shortCodes[] for the single-byte
+   //   symbols is slow, as each insert touches 256 positions in it. This optimization was added to speed up symbolTable construction.
+   //
+   // In all, we change the layout and coding as follows:
+   //
+   // before finalize():
+   // - the real symbols are symbols[256..256+nSymbols> (we may have nSymbols > 255 at this point)
+   // - the first 256 codes are pseudo symbols (all escaped bytes)
+   //
+   // after finalize():
+   // - the table layout is symbols[0..nSymbols>, with nSymbols < 256
+   // - real codes are [0,nSymbols>, with the 8th bit not set
+   // - escapes in shortCodes have the 8th bit set (value: 256+255=511); 255 because the code to be emitted is the escape byte 255
+   // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last);
+   //   the two-byte codes are split in two sections:
+   //   - the first section contains codes of symbols for which no longer symbol exists (no suffix); this allows an early-out during compression
+   //
+   // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is no longer on a critical path).
+   //
+   void finalize(u8 zeroTerminated) {
+      assert(nSymbols <= 255);
+      u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated);
+
+      // compute running sum of code lengths (starting offsets for each length)
+      rsum[0] = byteLim; // 1-byte codes are highest
+      rsum[1] = zeroTerminated;
+      for(u32 i=1; i<7; i++)
+         rsum[i+1] = rsum[i] + lenHisto[i];
+
+      // determine the new code for each symbol, ordered by length (and splitting 2-byte symbols into two classes around suffixLim)
+      suffixLim = rsum[1];
+      symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only)
+
+      for(u32 i=zeroTerminated, j=rsum[2]; i<nSymbols; i++) {
+         Symbol s1 = symbols[FSST_CODE_BASE+i];
+         u32 len = s1.length(), opt = (len == 2)*nSymbols;
+         if (opt) {
+            u16 first2 = s1.first2();
+            for(u32 k=0; k<opt; k++) {
+               Symbol s2 = symbols[FSST_CODE_BASE+k];
+               if (k != i && s2.length() > 1 && first2 == s2.first2()) // test if symbol k is a suffix of s
+                  opt = 0;
+            }
+            newCode[i] = opt?suffixLim++:--j; // symbols without a longer suffix get a code < suffixLim
+         } else
+            newCode[i] = rsum[len-1]++;
+         s1.set_code_len(newCode[i],len);
+         symbols[newCode[i]] = s1;
+      }
+      // renumber the codes in byteCodes[]
+      for(u32 i=0; i<256; i++)
+         if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE)
+            byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS);
+         else
+            byteCodes[i] = 511 + (1 << FSST_LEN_BITS);
+
+      // renumber the codes in shortCodes[]
+      for(u32 i=0; i<65536; i++)
+         if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE)
+            shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS));
+         else
+            shortCodes[i] = byteCodes[i&0xFF];
+
+      // replace the symbols in the hash table
+      for(u32 i=0; i<hashTabSize; i++)
+         if (hashTab[i].icl < FSST_ICL_FREE)
+            hashTab[i] = symbols[newCode[(u8) hashTab[i].code()]];
+   }
+};
+
+#ifdef NONOPT_FSST
+// straightforward (but memory-hungry) u16 counters, kept around for reference
+struct Counters {
+   u16 count1[FSST_CODE_MAX];                // frequency of single symbols as they occur in the sample
+   u16 count2[FSST_CODE_MAX][FSST_CODE_MAX]; // frequency of subsequent combinations of two symbols in the sample
+
+   void count1Set(u32 pos1, u16 val) { count1[pos1] = val; }
+   void count1Inc(u32 pos1) { count1[pos1]++; }
+   void count2Inc(u32 pos1, u32 pos2) { count2[pos1][pos2]++; }
+   u32 count1GetNext(u32 &pos1) { return count1[pos1]; }
+   u32 count2GetNext(u32 pos1, u32 &pos2) { return count2[pos1][pos2]; }
+   void backup1(u8 *buf) { memcpy(buf, count1, FSST_CODE_MAX*sizeof(u16)); }
+   void restore1(u8 *buf) { memcpy(count1, buf, FSST_CODE_MAX*sizeof(u16)); }
+};
+#else
+// space-efficient counters: each counter is split into a low byte and a high (half-)byte,
+// so that scanning for nonzero counters can skip many zero counters with a single 64-bits load
+struct Counters {
+   // high arrays come before low arrays, because our GetNext() methods may overrun their 64-bits reads a few bytes
+   u8 count1High[FSST_CODE_MAX];                  // frequency of single symbols (16-bits total)
+   u8 count1Low[FSST_CODE_MAX];                   // split into a low and high byte: cnt = count1High*256 + count1Low
+   u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX/2]; // frequency of symbol pairs (12-bits: 8-bits low, 4-bits high)
+   u8 count2Low[FSST_CODE_MAX][FSST_CODE_MAX];    // the high 4-bits counters are packed two per byte
+
+   void count1Set(u32 pos1, u16 val) {
+      count1Low[pos1] = val&255;
+      count1High[pos1] = val>>8;
+   }
+   void count1Inc(u32 pos1) {
+      if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
+         count1High[pos1]++;  // (0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)...
+   }
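+
+   // [editor's note: illustration of the early-high-increment trick above; not part of the fsst sources]
+   // After n calls of count1Inc(pos): low == n & 255 and high == (n>>8) + (low != 0), e.g.
+   //   n=1 -> (low,high) = (1,1), n=256 -> (0,1), n=257 -> (1,2).
+   // count1GetNext() undoes the early increment with "if (low) high--", so (high<<8)+low == n,
+   // while a nonzero high byte alone already signals a nonzero counter: eight high bytes can be
+   // tested against zero with one 64-bits load when skipping over unused codes.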
+   void count2Inc(u32 pos1, u32 pos2) {
+      if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
+         // inc the 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, respectively
+         count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K max value, on an 8K sample)
+   }
+   u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range
+      // read a 16-bits single-symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros
+      u64 high = *(u64*) &count1High[pos1]; // note: this reads 8 subsequent counters [pos1..pos1+7]
+
+      u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes
+      high = (high >> (zero << 3)) & 255; // advance to the nonzero counter
+      if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos1
+         return 0; // all zero
+
+      u32 low = count1Low[pos1];
+      if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
+      return (u32) ((high << 8) + low);
+   }
+   u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range
+      // read a 12-bits pairwise symbol counter, split into a low 8-bits and a high 4-bits number, while skipping over zeros
+      u64 high = *(u64*) &count2High[pos1][pos2>>1]; // note: this reads 16 subsequent counters [pos2..pos2+15]
+      high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters
+
+      u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters
+      high = (high >> (zero << 2)) & 15; // advance to the nonzero counter
+      if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2
+         return 0UL; // all zero
+
+      u32 low = count2Low[pos1][pos2];
+      if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
+      return (u32) ((high << 8) + low);
+   }
+   void backup1(u8 *buf) {
+      memcpy(buf, count1High, FSST_CODE_MAX);
+      memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX);
+   }
+   void restore1(u8 *buf) {
+      memcpy(count1High, buf, FSST_CODE_MAX);
+      memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX);
+   }
+};
+#endif
+
+
+#define FSST_BUFSZ (3<<19) // 768KB
+
+// an encoder is a symbol map plus some buffer space, needed during map construction as well as compression
+struct Encoder {
+   shared_ptr<SymbolTable> symbolTable; // symbols, plus metadata and data structures for quick compression (shortCodes, hashTab, etc)
+   union {
+      Counters counters;      // for counting symbol occurrences during map construction
+      u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in)
+   };
+};
+
+// job control integer representable in one 64-bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call)
+struct SIMDjob {
+   u64 out:19,pos:9,end:18,cur:18; // cur/end are input offsets (2^18=256KB), out is an output offset (2^19=512KB)
+};
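+
+// [editor's note: layout illustration, not part of the fsst sources]
+// The four bit-fields sum to exactly 19+9+18+18 = 64 bits, so one SIMDjob occupies a single
+// 64-bits SIMD lane: cur/end address the 256KB input half of simdbuf (2^18), out addresses
+// the 512KB output half (2^19, covering the 2x worst case where every input byte is escaped),
+// and pos identifies which of the up to 2^9 = 512 strings of the batch this job encodes, e.g.:
+//   static_assert(sizeof(SIMDjob) == 8, "one 64-bits lane per job");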
+
+extern bool
+fsst_hasAVX512(); // runtime check for AVX512 capability
+
+extern size_t
+fsst_compressAVX512(
+   SymbolTable &symbolTable,
+   u8* codeBase,    // IN: base address for codes, i.e. compression output (points to simdbuf+256KB)
+   u8* symbolBase,  // IN: base address for string bytes, i.e. compression input (points to simdbuf)
+   SIMDjob* input,  // IN: input array (size n) with job information: what to encode, where to store it
+   SIMDjob* output, // OUT: output array (size n) with job information: how much got encoded, end output pointer
+   size_t n,        // IN: size of arrays input and output (should be max 512)
+   size_t unroll);  // IN: degree of SIMD unrolling
+
+// C++ fsst-compress functions with some more control over how the compression happens (algorithm flavor, SIMD unroll degree)
+size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd);
+size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd);
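+
+// [editor's sketch: a minimal round-trip through the public C API declared in fsst.h, to show
+//  how the pieces above fit together; signatures follow the vendored fsst.h, and this block is
+//  an illustration, not part of the fsst sources]
+#ifdef FSST_ILLUSTRATION_ONLY
+static void api_example() {
+   const char *txt = "http://www.example.org";
+   size_t lenIn[1] = { strlen(txt) };
+   u8 *strIn[1] = { (u8*) txt };
+   fsst_encoder_t *enc = fsst_create(1, lenIn, strIn, 0 /* not zero-terminated */);
+
+   u8 buf[128], back[128], *strOut[1]; size_t lenOut[1];
+   if (fsst_compress(enc, 1, lenIn, strIn, sizeof(buf), buf, lenOut, strOut) == 1) {
+      fsst_decoder_t dec = fsst_decoder(enc); // decoders are plain structs, cheap to pass around
+      size_t n = fsst_decompress(&dec, lenOut[0], strOut[0], sizeof(back), back);
+      assert(n == lenIn[0] && memcmp(back, txt, n) == 0); // round-trips losslessly
+   }
+   fsst_destroy(enc);
+}
+#endif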