chore: update libfsst

This commit is contained in:
Marcus Holland-Moritz 2024-02-03 10:59:25 +01:00
parent 55fc77d58a
commit 52d68adbda
9 changed files with 961 additions and 20 deletions

View File

@ -39,3 +39,10 @@ add_executable(binary fsst.cpp)
target_link_libraries (binary LINK_PUBLIC fsst) target_link_libraries (binary LINK_PUBLIC fsst)
target_link_libraries (binary LINK_PUBLIC Threads::Threads) target_link_libraries (binary LINK_PUBLIC Threads::Threads)
set_target_properties(binary PROPERTIES OUTPUT_NAME fsst) set_target_properties(binary PROPERTIES OUTPUT_NAME fsst)
# 12-bit FSST variant: a separate library built from libfsst12.cpp, plus a
# second command-line tool built from the same fsst.cpp driver; the binary12
# target is compiled with -DFSST12 so fsst.cpp includes fsst12.h instead of fsst.h.
add_library(fsst12 libfsst12.cpp)
add_executable(binary12 fsst.cpp)
target_link_libraries (binary12 LINK_PUBLIC fsst12)
target_link_libraries (binary12 LINK_PUBLIC Threads::Threads)
set_target_properties(binary12 PROPERTIES OUTPUT_NAME fsst12)
set_target_properties(binary12 PROPERTIES COMPILE_FLAGS -DFSST12)

View File

@ -23,3 +23,13 @@ FSST compression is quite useful in database systems and data file formats. It e
The implementation of FSST is quite portable, using CMake and has been verified to work on 64-bits x86 computers running Linux, Windows and MacOS (the latter also using arm64). The implementation of FSST is quite portable, using CMake and has been verified to work on 64-bits x86 computers running Linux, Windows and MacOS (the latter also using arm64).
FSST12 is an alternative version of FSST that uses 12-bits symbols, and hence can encode up to 4096 symbols (of max 8 bytes long).
It does not need an escaping mechanism as the first 256 codes are single-byte symbols consisting of only that byte.
These symbols ensure that FSST12 can always find some symbol matching the next input, but a code is 1.5 bytes (12 bits) and those symbols are 1 byte, so there is still compression loss when that happens (though in FSST8 the penalty for an escape is heavier: 2x compression loss).
FSST12 lookup tables are 16x bigger than for 8-bits FSST (~8KB on average in storage, 32KB in memory), so a larger granularity of encoding volume is needed.
Generally speaking, FSST12 needs 1.5x longer symbols on average than FSST to achieve the same compression ratio.
This is also what happens, by and large, because its symbol table can hold 16x more symbols, so there is room for more symbols that are much less frequent (which longer symbols are) and thus would not make the "worthwhile" cut in FSST8.
FSST12 therefore can deal with data distributions that are less focused than natural (say, "english") text. For instance, JSON and XML compress better with it.
Decoding it does need a larger lookup table, and encoding it is slower due to the increased memory pressure needed for 4096x4096 counters (and the absence of AVX512 path - for x86).

View File

@ -15,7 +15,11 @@
// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. // IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
// //
// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst // You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
#ifdef FSST12
#include "fsst12.h" // the official FSST API -- also usable by C mortals
#else
#include "fsst.h" // the official FSST API -- also usable by C mortals #include "fsst.h" // the official FSST API -- also usable by C mortals
#endif
#include <condition_variable> #include <condition_variable>
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>

View File

@ -168,8 +168,11 @@ fsst_decompress(
unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3; unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3;
switch(firstEscapePos) { /* Duff's device */ switch(firstEscapePos) { /* Duff's device */
case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
// fall through
case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
// fall through
case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
// fall through
case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */ case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */
} }
} }

187
fsst/fsst12.h Normal file
View File

@ -0,0 +1,187 @@
/*
* the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019
*
* ===================================================================================================================================
* this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
*
* Copyright 2018-2020, CWI, TU Munich, FSU Jena
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
* IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
* ===================================================================================================================================
*
* FSST: Fast Static Symbol Table compression
* see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf
*
* FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e.
* where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual
* strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is
* block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text.
*
* FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences)
* onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression
* transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could
* be seen as strings again and fit in whatever your program is that manipulates strings.
*
* useful property: FSST ensures that strings that are equal, are also equal in their compressed form.
*
* In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of
* unsigned char* pointers to their starts. A separate length array (of unsigned long) denotes how many bytes each string consists of.
*
* This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program
* that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings).
*
* This is the 12-bits version of FSST: it uses a 4K dictionary (rather than the 256-entry dictionary and 8-bits codes).
* 12-bits FSST often does not work better than 8-bits, but it will outperform it on datasets that are more chaotic, such as JSON
* and widely diverse URLs.
*/
#ifndef FSST_INCLUDED_H
#define FSST_INCLUDED_H
#include "assert.h"
#ifdef __cplusplus
#include <cstring>
extern "C" {
#endif
/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. */
typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~3MB) C++ object */
/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */
typedef struct {
unsigned long long version; /* version id */
unsigned char len[4096]; /* len[x] is the byte-length of the symbol x (1 <= len[x] <= 8; the first 256 codes are single-byte symbols). */
unsigned long long symbol[4096]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 4096). */
} fsst_decoder_t;
/* Calibrate a FSST dictionary from a batch of strings (it is best to provide at least 16KB of data). */
fsst_encoder_t*
fsst_create(
unsigned long n, /* IN: number of strings in batch to sample from. */
unsigned long lenIn[], /* IN: byte-lengths of the inputs */
unsigned char *strIn[], /* IN: string start pointers. */
int dummy
);
/* Create another encoder instance, necessary to do multi-threaded encoding using the same dictionary. */
fsst_encoder_t*
fsst_duplicate(
fsst_encoder_t *encoder /* IN: the dictionary to duplicate. */
);
#define FSST_MAXHEADER (8+16+4096+32768) /* maxlen of deserialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */
/* Space-efficient dictionary serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */
unsigned int /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */
fsst_export(
fsst_encoder_t *encoder, /* IN: the dictionary to dump. */
unsigned char *buf /* OUT: pointer to a byte-buffer where to serialize this dictionary. */
);
/* Deallocate encoder. */
void
fsst_destroy(fsst_encoder_t*);
/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */
unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */
fsst_import(
fsst_decoder_t *decoder, /* IN: this dictionary will be overwritten. */
unsigned char *buf /* OUT: pointer to a byte-buffer where fsst_export() serialized this dictionary. */
);
/* Return a decoder structure from an encoder. */
fsst_decoder_t
fsst_decoder(
fsst_encoder_t *encoder
);
/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */
/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */
unsigned long /* OUT: the number of compressed strings (<=n) that fit the output buffer. */
fsst_compress(
fsst_encoder_t *encoder, /* IN: encoder obtained from fsst_create(). */
unsigned long nstrings, /* IN: number of strings in batch to compress. */
unsigned long lenIn[], /* IN: byte-lengths of the inputs */
unsigned char *strIn[], /* IN: input string start pointers. */
unsigned long outsize, /* IN: byte-length of output buffer. */
unsigned char *output, /* OUT: memory buffer to put the compressed strings in (one after the other). */
unsigned long lenOut[], /* OUT: byte-lengths of the compressed strings. */
unsigned char *strOut[] /* OUT: output string start pointers. Will all point into [output,output+size). */
);
/* Decompress a single string, inlined for speed. */
/* Input format: 12-bit codes packed two-per-3-bytes; a lone trailing code occupies 2 bytes. */
/* Output never overruns [output,output+size): once full, decoding is clamped, but the total
   decompressed length is still computed and returned. */
inline unsigned long /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */
fsst_decompress(
fsst_decoder_t *decoder, /* IN: use this dictionary for decompression. */
unsigned long lenIn, /* IN: byte-length of compressed string. */
unsigned char *strIn, /* IN: compressed string. */
unsigned long size, /* IN: byte-length of output buffer. */
unsigned char *output /* OUT: memory buffer to put the decompressed string in. */
) {
unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len;
unsigned long*__restrict__ symbol = (unsigned long* __restrict__) decoder->symbol;
unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output;
unsigned long posOut = 0, posIn = 0;
#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long))
#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */
/* fast path: each symbol is expanded with one unconditional 8-byte store, so we need
   16 bytes of output slack (two stores per iteration) and 4 bytes of input lookahead */
while (posOut+16 <= size && posIn+4 < lenIn) {
unsigned int code, code0, code1;
memcpy(&code, strIn+posIn, sizeof(unsigned int)); /* the low 24 bits of this 4-byte load hold two 12-bit codes */
code0 = code & 4095;
code1 = (code >> 12) & 4095;
posIn += 3;
FSST_UNALIGNED_STORE(strOut+posOut, symbol[code0]);
posOut += len[code0];
FSST_UNALIGNED_STORE(strOut+posOut, symbol[code1]);
posOut += len[code1];
}
/* fast-path tail: consume the remaining input as one 12-bit code.
   NOTE(review): if 3 bytes remain here (a full trailing code pair), only the low 12 bits
   are decoded -- presumably the format guarantees this cannot happen; confirm against the
   encoder (compressBulk in libfsst12.cpp). */
if (posOut+8 <= size && posIn < lenIn) {
unsigned short code;
memcpy(&code, strIn+posIn, sizeof(unsigned short));
code &= 4095;
posIn=lenIn;
FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]);
posOut += len[code];
}
#endif
/* scalar path: same decoding, but symbols are copied byte-by-byte with explicit clamping
   to 'size' (used near the end of the output buffer, or always under FSST_MUST_ALIGN) */
while (posIn+3 < lenIn) {
unsigned int code, code0, code1;
memcpy(&code, strIn+posIn, sizeof(unsigned int));
code0 = code & 4095;
code1 = (code >> 12) & 4095;
posIn += 3;
unsigned char *__restrict__ src, *__restrict__ lim, *__restrict__ dst = strOut+posOut;
for(lim=strOut+((posOut+len[code0])>size?size:posOut+len[code0]), src=(unsigned char*__restrict__) &symbol[code0]; dst < lim; dst++, src++) *dst = *src;
posOut += len[code0];
for(lim=strOut+((posOut+len[code1])>size?size:posOut+len[code1]), src=(unsigned char*__restrict__) &symbol[code1]; dst < lim; dst++, src++) *dst = *src;
posOut += len[code1];
}
if (posIn < lenIn) { /* last remaining code (2-byte encoding) */
unsigned short code;
memcpy(&code, strIn+posIn, sizeof(unsigned short));
code &= 4095;
posIn=lenIn;
unsigned char *__restrict__ src, *__restrict__ lim, *__restrict__ dst = strOut+posOut;
for(lim=strOut+((posOut+len[code])>size?size:posOut+len[code]), src=(unsigned char*__restrict__) &symbol[code]; dst < lim; dst++, src++) *dst = *src;
posOut += len[code];
}
return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */
}
#ifdef __cplusplus
}
#endif
#endif /* FSST_INCLUDED_H */

View File

@ -17,12 +17,6 @@
// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst // You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
#include "libfsst.hpp" #include "libfsst.hpp"
inline uint64_t fsst_unaligned_load(u8 const* V) {
uint64_t Ret;
memcpy(&Ret, V, sizeof(uint64_t)); // compiler will generate efficient code (unaligned load, where possible)
return Ret;
}
Symbol concat(Symbol a, Symbol b) { Symbol concat(Symbol a, Symbol b) {
Symbol s; Symbol s;
u32 length = a.length()+b.length(); u32 length = a.length()+b.length();
@ -97,7 +91,7 @@ SymbolTable *buildSymbolTable(Counters& counters, vector<u8*> line, size_t len[]
int gain = 0; int gain = 0;
for(size_t i=0; i<line.size(); i++) { for(size_t i=0; i<line.size(); i++) {
u8* cur = line[i]; u8* cur = line[i], *start = cur;
u8* end = cur + len[i]; u8* end = cur + len[i];
if (sampleFrac < 128) { if (sampleFrac < 128) {
@ -105,7 +99,6 @@ SymbolTable *buildSymbolTable(Counters& counters, vector<u8*> line, size_t len[]
if (rnd128(i) > sampleFrac) continue; if (rnd128(i) > sampleFrac) continue;
} }
if (cur < end) { if (cur < end) {
u8* start = cur;
u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); u16 code2 = 255, code1 = st->findLongestSymbol(cur, end);
cur += st->symbols[code1].length(); cur += st->symbols[code1].length();
gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1)));
@ -147,7 +140,6 @@ SymbolTable *buildSymbolTable(Counters& counters, vector<u8*> line, size_t len[]
// compute compressed output size // compute compressed output size
gain += ((int) (cur-start))-(1+isEscapeCode(code2)); gain += ((int) (cur-start))-(1+isEscapeCode(code2));
// now count the subsequent two symbols we encode as an extension codesibility
if (sampleFrac < 128) { // no need to count pairs in final round if (sampleFrac < 128) { // no need to count pairs in final round
// consider the symbol that is the concatenation of the two last symbols // consider the symbol that is the concatenation of the two last symbols
counters.count2Inc(code1, code2); counters.count2Inc(code1, code2);
@ -384,10 +376,12 @@ static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size
// optimized adaptive *scalar* compression method // optimized adaptive *scalar* compression method
static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, size_t lenIn[], u8* strIn[], size_t size, u8* out, size_t lenOut[], u8* strOut[], bool noSuffixOpt, bool avoidBranch) { static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, size_t lenIn[], u8* strIn[], size_t size, u8* out, size_t lenOut[], u8* strOut[], bool noSuffixOpt, bool avoidBranch) {
u8 buf[512], *cur = NULL, *end = NULL, *lim = out + size; u8 *cur = NULL, *end = NULL, *lim = out + size;
size_t curLine, suffixLim = symbolTable.suffixLim; size_t curLine, suffixLim = symbolTable.suffixLim;
u8 byteLim = symbolTable.nSymbols + symbolTable.zeroTerminated - symbolTable.lenHisto[0]; u8 byteLim = symbolTable.nSymbols + symbolTable.zeroTerminated - symbolTable.lenHisto[0];
u8 buf[512+8] = {}; /* +8 sentinel is to avoid 8-byte unaligned-loads going beyond 511 out-of-bounds */
// three variants are possible. dead code falls away since the bool arguments are constants // three variants are possible. dead code falls away since the bool arguments are constants
auto compressVariant = [&](bool noSuffixOpt, bool avoidBranch) { auto compressVariant = [&](bool noSuffixOpt, bool avoidBranch) {
while (cur < end) { while (cur < end) {
@ -427,22 +421,20 @@ static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, size_
size_t chunk, curOff = 0; size_t chunk, curOff = 0;
strOut[curLine] = out; strOut[curLine] = out;
do { do {
bool skipCopy = symbolTable.zeroTerminated;
cur = strIn[curLine] + curOff; cur = strIn[curLine] + curOff;
chunk = lenIn[curLine] - curOff; chunk = lenIn[curLine] - curOff;
if (chunk > 511) { if (chunk > 511) {
chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST
skipCopy = false; // need to put terminator, so no in place mem usage possible
} }
if ((2*chunk+7) > (size_t) (lim-out)) { if ((2*chunk+7) > (size_t) (lim-out)) {
return curLine; // out of memory return curLine; // out of memory
} }
if (!skipCopy) { // only in case of short zero-terminated strings, we can avoid copying // copy the string to the 511-byte buffer
memcpy(buf, cur, chunk); memcpy(buf, cur, chunk);
cur = buf; buf[chunk] = (u8) symbolTable.terminator;
buf[chunk] = (u8) symbolTable.terminator; cur = buf;
}
end = cur + chunk; end = cur + chunk;
// based on symboltable stats, choose a variant that is nice to the branch predictor // based on symboltable stats, choose a variant that is nice to the branch predictor
if (noSuffixOpt) { if (noSuffixOpt) {
compressVariant(true,false); compressVariant(true,false);

View File

@ -56,6 +56,12 @@ typedef uint64_t u64;
#define FSST_CODE_MAX (1UL<<FSST_CODE_BITS) /* all bits set: indicating a symbol that has not been assigned a code yet */ #define FSST_CODE_MAX (1UL<<FSST_CODE_BITS) /* all bits set: indicating a symbol that has not been assigned a code yet */
#define FSST_CODE_MASK (FSST_CODE_MAX-1UL) /* all bits set: indicating a symbol that has not been assigned a code yet */ #define FSST_CODE_MASK (FSST_CODE_MAX-1UL) /* all bits set: indicating a symbol that has not been assigned a code yet */
inline uint64_t fsst_unaligned_load(u8 const* V) {
uint64_t Ret;
memcpy(&Ret, V, sizeof(uint64_t)); // compiler will generate efficient code (unaligned load, where possible)
return Ret;
}
struct Symbol { struct Symbol {
static const unsigned maxLength = 8; static const unsigned maxLength = 8;
@ -110,7 +116,7 @@ struct QSymbol{
// two phases of compression, before and after optimize(): // two phases of compression, before and after optimize():
// //
// (1) to encode values we probe (and maintain) three datastructures: // (1) to encode values we probe (and maintain) three datastructures:
// - u16 byteCodes[65536] array at the position of the next byte (s.length==1) // - u16 byteCodes[256] array at the position of the next byte (s.length==1)
// - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) // - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2)
// - Symbol hashtable[1024] (keyed by the next three bytes, ie for s.length>2), // - Symbol hashtable[1024] (keyed by the next three bytes, ie for s.length>2),
// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are // this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are
@ -373,7 +379,7 @@ struct Counters {
} }
u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range
// read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros
u64 high = *(u64*) &count1High[pos1]; // note: this reads 8 subsequent counters [pos1..pos1+7] u64 high = fsst_unaligned_load(&count1High[pos1]); // note: this reads 8 subsequent counters [pos1..pos1+7]
u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes
high = (high >> (zero << 3)) & 255; // advance to nonzero counter high = (high >> (zero << 3)) & 255; // advance to nonzero counter
@ -386,7 +392,7 @@ struct Counters {
} }
u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range
// read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros
u64 high = *(u64*) &count2High[pos1][pos2>>1]; // note: this reads 16 subsequent counters [pos2..pos2+15] u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); // note: this reads 16 subsequent counters [pos2..pos2+15]
high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters
u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters

422
fsst/libfsst12.cpp Normal file
View File

@ -0,0 +1,422 @@
// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
//
// Copyright 2018-2019, CWI, TU Munich
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
#include "libfsst12.hpp"
#include <math.h>
#include <string.h>
// Concatenate two symbols into one candidate symbol, clamped to 8 bytes.
// The byte sequences combine little-endian: a's bytes first, then b's.
// The result carries FSST_CODE_MASK as its code, i.e. "no code assigned yet".
Symbol concat(Symbol a, Symbol b) {
Symbol s;
u32 length = a.length()+b.length();
if (length > 8) length = 8; // 12-bit FSST truncates over-long concatenations
s.set_code_len(FSST_CODE_MASK, length);
// memcpy-based loads/store instead of *(u64*) casts: avoids unaligned-access and
// strict-aliasing undefined behavior while compiling to the same moves.
u64 av, bv;
memcpy(&av, a.symbol, sizeof(u64));
memcpy(&bv, b.symbol, sizeof(u64));
// guard the shift: shifting a u64 by 64 (a.length()==8) is undefined behavior;
// in that case the result is just a's 8 bytes (b contributes nothing).
u64 sv = (a.length() < 8) ? ((bv << (8*a.length())) | av) : av;
memcpy(s.symbol, &sv, sizeof(u64));
return s;
}
namespace std {
template <>
class hash<Symbol> {
public:
size_t operator()(const Symbol& s) const {
uint64_t k = *(u64*) s.symbol;
const uint64_t m = 0xc6a4a7935bd1e995;
const int r = 47;
uint64_t h = 0x8445d61a4e774912 ^ (8*m);
k *= m;
k ^= k >> r;
k *= m;
h ^= k;
h *= m;
h ^= h >> r;
h *= m;
h ^= h >> r;
return h;
}
};
}
std::ostream& operator<<(std::ostream& out, const Symbol& s) {
for (u32 i=0; i<s.length(); i++)
out << s.symbol[i];
return out;
}
#define FSST_SAMPLETARGET (1<<17)
#define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET)
// Build the best 12-bit symbol table for a sample of the input.
// Runs several rounds: each round compresses the sample with the current table while
// counting single-symbol and symbol-pair frequencies, then rebuilds the table from the
// highest-gain candidates (existing symbols and concatenations of adjacent pairs).
// Returns the table with the best observed gain; caller owns the returned pointer.
// A negative sampleParam means: skip that many front bytes of the last sampled line.
SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& sample, ulong len[], u8* line[]) {
ulong sampleSize = max(sampleParam, FSST_SAMPLEMAXSZ); // if sampleParam is negative, we need to ignore part of the last line
SymbolMap *st = new SymbolMap(), *bestMap = new SymbolMap();
long bestGain = -sampleSize; // worst case (everything exception)
ulong sampleFrac = 128;
// NOTE(review): this loop only computes locals and has no side effects -- it looks like
// dead code (the last-line adjustment is re-done inside compressCount below); confirm.
for(ulong i=0; i<sample.size(); i++) {
u8* cur = line[sample[i]];
u8* end = cur + len[sample[i]];
if (sampleParam < 0 && i+1 == sample.size())
cur -= sampleSize; // use only last part of last line (which could be too long for an efficient sample)
}
u32 minSize = FSST_SAMPLEMAXSZ; // NOTE(review): appears unused in this function
// a random number between 0 and 128
auto rnd128 = [&](ulong i) { return 1 + (FSST_HASH((i+1)*sampleFrac)&127); };
// compress sample, and compute (pair-)frequencies
auto compressCount = [&](SymbolMap *st, Counters &counters) { // returns gain
long gain = 0;
for(ulong i=0; i<sample.size(); i++) {
u8* cur = line[sample[i]];
u8* end = cur + len[sample[i]];
if (sampleParam < 0 && i+1 == sample.size()) {
cur -= sampleParam; // use only last part of last line (which could be too long for an efficient sample)
if ((end-cur) > 500) end = cur + ((end-cur)*sampleFrac)/128; // shorten long lines to the sample fraction
} else if (sampleFrac < 128) {
// in earlier rounds (sampleFrac < 128) we skip data in the sample (reduces overall work ~2x)
if (rnd128(i) > sampleFrac) continue;
}
if (cur < end) {
// findExpansion() packs its result: upper 4 bits = matched length, low 12 bits = code
u16 pos2 = 0, pos1 = st->findExpansion(Symbol(cur, end));
cur += pos1 >> 12;
pos1 &= FSST_CODE_MASK;
while (true) {
u8 *old = cur;
counters.count1Inc(pos1);
if (cur<end-7) {
// fast path (8+ input bytes left): probe the hash table (keyed on the first
// 4 bytes) and the 2-byte shortCodes table; prefer a valid hash-table hit
ulong word = fsst_unaligned_load(cur);
ulong pos = (u32) word; // key is first 4 bytes!!
ulong idx = FSST_HASH(pos)&(st->hashTabSize-1);
Symbol s = st->hashTab[idx];
pos2 = st->shortCodes[word & 0xFFFF];
// mask the input down to the candidate's byte-length (shift count comes from gcl)
word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
if ((s.gcl < FSST_GCL_FREE) && (*(u64*) s.symbol == word)) {
pos2 = s.code(); cur += s.length();
} else {
cur += (pos2 >> 12);
pos2 &= FSST_CODE_MASK;
}
} else if (cur==end) {
break;
} else {
// tail (<8 bytes left): use the safe, bounds-checked lookup
assert(cur<end);
pos2 = st->findExpansion(Symbol(cur, end));
cur += pos2 >> 12;
pos2 &= FSST_CODE_MASK;
}
// compute compressed output size (later divide by 2)
gain += 2*(cur-old)-3; // gain in half-bytes: 2 per input byte covered, minus 3 per 12-bit code emitted
// now count the subsequent two symbols we encode as an extension possibility
if (sampleFrac < 128) { // no need to count pairs in final round
counters.count2Inc(pos1, pos2);
}
pos1 = pos2;
}
}
}
return gain;
};
// rebuild the table from the up-to-4096 candidates with the highest gain (length*count)
auto makeMap = [&](SymbolMap *st, Counters &counters) {
// hashmap of c (needed because we can generate duplicate candidates)
unordered_set<Symbol> cands;
// merge a candidate into the set, accumulating gain if it was seen before
auto addOrInc = [&](unordered_set<Symbol> &cands, Symbol s, u32 count) {
auto it = cands.find(s);
s.gain = s.length()*count;
if (it != cands.end()) {
s.gain += (*it).gain;
cands.erase(*it);
}
cands.insert(s);
};
// add candidate symbols based on counted frequency
for (u32 pos1=0; pos1<st->symbolCount; pos1++) {
u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!!
if (!cnt1) continue;
Symbol s1 = st->symbols[pos1];
if (s1.length() > 1) { // 1-byte symbols are always in the map
addOrInc(cands, s1, cnt1);
}
if (sampleFrac >= 128 || // last round we do not create new (combined) symbols
s1.length() == Symbol::maxLength) { // symbol cannot be extended
continue;
}
for (u32 pos2=0; pos2<st->symbolCount; pos2++) {
u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!!
if (!cnt2) continue;
// create a new symbol
Symbol s2 = st->symbols[pos2];
Symbol s3 = concat(s1, s2);
addOrInc(cands, s3, cnt2);
}
}
// insert candidates into priority queue (by gain)
auto cmpGn = [](const Symbol& q1, const Symbol& q2) { return q1.gain < q2.gain; };
priority_queue<Symbol,vector<Symbol>,decltype(cmpGn)> pq(cmpGn);
for (auto& q : cands)
pq.push(q);
// Create new symbol map using best candidates
st->clear();
while (st->symbolCount < 4096 && !pq.empty()) {
Symbol s = pq.top();
pq.pop();
st->add(s);
}
};
#ifdef NONOPT_FSST
for(ulong frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) {
sampleFrac = frac;
#else
for(sampleFrac=14; true; sampleFrac = sampleFrac + 38) {
#endif
memset(&counters, 0, sizeof(Counters)); // reset all (pair-)frequency counters for this round
long gain = compressCount(st, counters);
if (gain >= bestGain) { // a new best solution!
*bestMap = *st; bestGain = gain;
}
if (sampleFrac >= 128) break; // we do 4 rounds (sampleFrac=14,52,90,128)
makeMap(st, counters);
}
delete st;
return bestMap;
}
// optimized adaptive *scalar* compression method
//
// Encodes up to nlines input strings into 12-bit codes, two codes packed per
// 3 output bytes (a lone trailing code occupies 2 bytes). Fills lenOut[]/strOut[]
// and returns the number of lines that fully fit in the output buffer [out,out+size).
static inline ulong compressBulk(SymbolMap &symbolMap, ulong nlines, ulong lenIn[], u8* strIn[], ulong size, u8* out, ulong lenOut[], u8* strOut[]) {
u8 *lim = out + size;
ulong curLine;
for(curLine=0; curLine<nlines; curLine++) {
u8 *cur = strIn[curLine];
u8 *end = cur + lenIn[curLine];
strOut[curLine] = out;
// fast path: needs 16 bytes of input lookahead and 8 bytes of output slack,
// because each pair is written with a full 8-byte store but advances out by only 3
while (cur+16 <= end && (lim-out) >= 8) {
u64 word = fsst_unaligned_load(cur);
ulong code = symbolMap.shortCodes[word & 0xFFFF];
ulong pos = (u32) word; // key is first 4 bytes
ulong idx = FSST_HASH(pos)&(symbolMap.hashTabSize-1);
Symbol s = symbolMap.hashTab[idx];
// mask the input down to the candidate's byte-length (shift count comes from gcl)
// NOTE(review): assumes (u8)s.gcl < 64 even for free slots -- confirm in libfsst12.hpp
word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
if ((s.gcl < FSST_GCL_FREE) && fsst_unaligned_load(s.symbol) == word) { // memcpy-based load: no aliasing/alignment UB
code = s.gcl >> 16;
}
cur += (code >> 12); // upper 4 bits of a lookup result hold the symbol length
u64 res = code & FSST_CODE_MASK; // u64 (was u32): the sizeof(u64) memcpy below must not read past the object
word = fsst_unaligned_load(cur);
code = symbolMap.shortCodes[word & 0xFFFF];
pos = (u32) word; // key is first 4 bytes
idx = FSST_HASH(pos)&(symbolMap.hashTabSize-1);
s = symbolMap.hashTab[idx];
word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
if ((s.gcl < FSST_GCL_FREE) && fsst_unaligned_load(s.symbol) == word) {
code = s.gcl >> 16;
}
cur += (code >> 12);
res |= (code&FSST_CODE_MASK) << 12; // pack the second 12-bit code above the first
memcpy(out, &res, sizeof(u64)); // 8-byte store; only 3 bytes are kept
out += 3;
}
// slow path: final <16 bytes of the line, with exact input/output bounds checks
while (cur < end) {
ulong code = symbolMap.findExpansion(Symbol(cur, end));
u64 res = (code&FSST_CODE_MASK); // u64 for the same sizeof(u64) memcpy reason as above
if (out+8 > lim) {
return curLine; // u64 write would be out of bounds (out of output memory)
}
cur += code >> 12;
if (cur >= end) {
memcpy(out, &res, sizeof(u64)); // lone trailing code: only 2 bytes are kept
out += 2;
break;
}
code = symbolMap.findExpansion(Symbol(cur, end));
res |= (code&FSST_CODE_MASK) << 12;
cur += code >> 12;
memcpy(out, &res, sizeof(u64));
out += 3;
}
lenOut[curLine] = out - strOut[curLine];
}
return curLine;
}
// Select a pseudo-random subset of lines as the training sample, targeting
// FSST_SAMPLETARGET bytes in total. Appends the chosen line indices to 'sample'.
// Returns the sample size in bytes; a value <= 0 encodes an over-long sample
// (its magnitude is the number of front bytes of the last line to skip).
long makeSample(vector<ulong> &sample, ulong nlines, ulong len[]) {
ulong i, sampleRnd = 1, sampleProb = 256, sampleSize = 0, totSize = 0;
ulong sampleTarget = FSST_SAMPLETARGET;
for(i=0; i<nlines; i++)
totSize += len[i];
if (totSize > FSST_SAMPLETARGET) {
// if the batch is larger than the sampletarget, sample this fraction
sampleProb = max(((ulong) 4),(256*sampleTarget) / totSize);
} else {
// too little data. But ok, do not include lines multiple times, just use everything once
sampleTarget = totSize; // sampleProb will be 256/256 (aka 100%)
}
// repeat passes until sampleTarget bytes are collected; the early break sets
// i = nlines, so i becomes nlines+1 after the increment and the outer loop exits
do {
// if nlines is very large and strings are small (8, so we need 4K lines), we still expect 4K*256/4 iterations total worst case
for(i=0; i<nlines; i++) {
// cheaply draw a random number to select (or not) each line
sampleRnd = FSST_HASH(sampleRnd);
if ((sampleRnd&255) < sampleProb) {
sample.push_back(i);
sampleSize += len[i];
if (sampleSize >= sampleTarget) // enough?
i = nlines; // break out of both loops;
}
}
sampleProb *= 4; //accelerate the selection process at expense of front-bias (4,16,64,256: 4 passes max)
} while(i <= nlines); // basically continue until we have enough
// if the last line (only line?) is excessively long, return a negative samplesize (the amount of front bytes to skip)
long sampleLong = (long) sampleSize;
assert(sampleLong > 0);
return (sampleLong < FSST_SAMPLEMAXSZ)?sampleLong:FSST_SAMPLEMAXSZ-sampleLong;
}
// Build an encoder from (a sample of) the 'n' input strings in lenIn/strIn.
// 'dummy' is accepted but ignored (see the (void) cast below).
// Returns an opaque handle that must be released with fsst_destroy().
extern "C" fsst_encoder_t* fsst_create(ulong n, ulong lenIn[], u8 *strIn[], int dummy) {
   vector<ulong> sample;
   (void) dummy;
   // n?n:1 -- presumably guards the n==0 case by sampling "one line"; verify callers never pass n==0 with empty lenIn
   long sampleSize = makeSample(sample, n?n:1, lenIn); // careful handling of input to get a right-size and representative sample
   Encoder *encoder = new Encoder();
   encoder->symbolMap = shared_ptr<SymbolMap>(buildSymbolMap(encoder->counters, sampleSize, sample, lenIn, strIn));
   return (fsst_encoder_t*) encoder;
}
/* create another encoder instance, necessary to do multi-threaded encoding using the same dictionary */
extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) {
   Encoder *copy = new Encoder();
   // shared_ptr copy: both encoder instances reference the same symbol table
   copy->symbolMap = ((Encoder*)encoder)->symbolMap;
   return (fsst_encoder_t*) copy;
}
// export a dictionary in compact format.
// Layout: bytes [0..7] = version word (FSST_VERSION in the upper 32 bits, low byte is a
// nonzero endianness marker), bytes [8..23] = the 8 u16 entries of lenHisto, then per
// symbol one length byte followed by that many symbol bytes. Returns bytes written.
//
// In principle this serialization could also *reconstruct* an fsst_encoder_t (useful for
// appending to an existing compressed block). However, the encoder's 'lossy perfect' hash
// table is endian-sensitive and cannot hold other-endian-produced tables; converting
// endianness during hashing would be slow and self-defeating. So reconstruction would have
// to enforce equal endianness -- not done now; the version field is just future-proofing
// (version number, endianness, and potential encoder reconstruction).
extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) {
   Encoder *e = (Encoder*) encoder;
   u64 version = (FSST_VERSION << 32) | FSST_ENDIAN_MARKER; // least significant byte is nonzero
   memcpy(buf, &version, 8); // memcpy: make no unaligned-store assumptions about 'buf'
   memcpy(buf+8, e->symbolMap->lenHisto, 16); // serialize the lenHisto
   u32 pos = 24;
   // emit only the used bytes of each symbol
   for(u32 i = 0; i < e->symbolMap->symbolCount; i++) {
      Symbol &s = e->symbolMap->symbols[i];
      u32 used = s.length();
      buf[pos++] = (u8) used;
      for(u32 j = 0; j < used; j++)
         buf[pos++] = s.symbol[j]; // serialize used symbol bytes
   }
   return pos; // length of what was serialized
}
#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */
// Deserialize a decoder from a buffer written by fsst_export.
// Returns the number of bytes consumed, or 0 if the buffer is rejected.
//
// Hardening vs previous revision: the buffer is (potentially) untrusted, yet symbolCount
// (sum of 8 u16 values, up to 524280) and each per-symbol length byte (up to 255) were used
// unchecked to index/fill decoder->len[4096] and decoder->symbol[4096] (8 bytes each) --
// a heap/stack overflow on corrupted input. Both are now validated.
extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 *buf) {
   u64 version = 0, symbolCount = 0;
   u32 pos = 24;
   u16 lenHisto[8];
   // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped)
   memcpy(&version, buf, 8);
   if ((version>>32) != FSST_VERSION) return 0;
   memcpy(lenHisto, buf+8, 16);
   for(u32 i=0; i<8; i++)
      symbolCount += lenHisto[i];
   if (symbolCount > 4096) return 0; // corrupted input: more symbols than the decoder tables can hold
   for(u32 i = 0; i < symbolCount; i++) {
      u32 len = decoder->len[i] = buf[pos++];
      if (len > 8) return 0; // corrupted input: symbols are at most 8 bytes
      for(u32 j = 0; j < len; j++) {
         ((u8*) &decoder->symbol[i])[j] = buf[pos++];
      }
   }
   // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols).
   while(symbolCount<4096) {
      decoder->symbol[symbolCount] = FSST_CORRUPT;
      decoder->len[symbolCount++] = 8;
   }
   return pos;
}
// runtime check for simd
// Internal dispatch point: the 12-bits FSST has only the scalar compressBulk path, so the
// tuning flags are ignored -- presumably kept for signature parity with the 8-bits implementation.
inline ulong _compressImpl(Encoder *e, ulong nlines, ulong lenIn[], u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
   (void) noSuffixOpt; // unused in the scalar-only 12-bits variant
   (void) avoidBranch; // unused
   (void) simd;        // unused
   return compressBulk(*e->symbolMap, nlines, lenIn, strIn, size, output, lenOut, strOut);
}
// Out-of-line wrapper around _compressImpl (the externally declared C++ entry point).
ulong compressImpl(Encoder *e, ulong nlines, ulong lenIn[], u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
   return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
}
// adaptive choosing of scalar compression method based on symbol length histogram
// (in this 12-bits variant there is only one method, so this forwards with all flags off)
inline ulong _compressAuto(Encoder *e, ulong nlines, ulong lenIn[], u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[], int simd) {
   (void) simd; // unused: no simd path in the 12-bits implementation
   return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, false, false, false);
}
// Out-of-line wrapper around _compressAuto (the externally declared C++ entry point).
ulong compressAuto(Encoder *e, ulong nlines, ulong lenIn[], u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[], int simd) {
   return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd);
}
// the main compression function (everything automatic)
// Compresses 'nlines' strings into 'output' (capacity 'size'); returns the number of lines
// fully compressed, with per-line results in strOut/lenOut.
extern "C" ulong fsst_compress(fsst_encoder_t *encoder, ulong nlines, ulong lenIn[], u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[]) {
   // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB)
   // note: the accumulate init must be ulong -- an int literal 0 made the sum accumulate in
   // (signed) int, truncating/overflowing (UB) for batches larger than 2GB
   ulong totLen = accumulate(lenIn, lenIn+nlines, (ulong) 0);
   int simd = totLen > nlines*12 && (nlines > 64 || totLen > (ulong) 1<<15);
   return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd);
}
/* deallocate encoder */
extern "C" void fsst_destroy(fsst_encoder_t* encoder) {
   // The opaque handle was allocated with 'new Encoder()' in fsst_create/fsst_duplicate;
   // deleting it also drops this instance's reference to the shared symbol table.
   delete (Encoder*) encoder;
}
/* very lazy implementation relying on export and import */
extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) {
   // Round-trip the symbol table through its serialized form to materialize a decoder.
   u8 serialized[sizeof(fsst_decoder_t)]; // assumes the export always fits -- TODO confirm worst-case size
   fsst_decoder_t decoder;
   u32 bytesOut = fsst_export(encoder, serialized);
   u32 bytesIn = fsst_import(&decoder, serialized);
   assert(bytesOut == bytesIn); // import must consume exactly what export produced
   (void) bytesOut; (void) bytesIn;
   return decoder;
}

310
fsst/libfsst12.hpp Normal file
View File

@ -0,0 +1,310 @@
// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
//
// Copyright 2018-2019, CWI, TU Munich
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
#include <algorithm>
#include <cassert>
#include <cstring>
#include <fstream>
#include <iostream>
#include <numeric>
#include <memory>
#include <queue>
#include <string>
#include <unordered_set>
#include <vector>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
using namespace std;
#include "fsst12.h" // the official FSST API -- also usable by C mortals
/* workhorse type for string and buffer lengths: 64-bits on 64-bits platforms and 32-bits on 32-bits platforms */
typedef unsigned long ulong;
/* unsigned integers */
// NOTE(review): uint8_t..uint64_t rely on <cstdint> arriving transitively via the includes
// above -- consider including it explicitly
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
// least-significant byte of the serialized version word; reading it as zero would indicate foreign endianness
#define FSST_ENDIAN_MARKER ((u64) 1)
#define FSST_VERSION_20190218 20190218
#define FSST_VERSION ((u64) FSST_VERSION_20190218)
// "symbols" are character sequences (up to 8 bytes)
// A symbol is compressed into a "code" of 1.5 bytes (12 bits)
#define FSST_CODE_MAX 4096
#define FSST_CODE_MASK ((u16) (FSST_CODE_MAX-1))
// Read a 64-bits word from 'V', which need not be 8-byte aligned.
inline uint64_t fsst_unaligned_load(u8 const* V) {
   uint64_t word;
   // memcpy is the portable idiom; the compiler turns it into a single (unaligned) load where the ISA allows
   memcpy(&word, V, sizeof(word));
   return word;
}
// A symbol: a byte string of 1..8 bytes plus its assigned code, packed for one-load comparisons.
struct Symbol {
   static const unsigned maxLength = 8;
   // gcl = u32 garbageBits:16,code:12,length:4 -- but we avoid exposing this bit-field notation
   u32 gcl; // use a single u32 to be sure "code" is accessed with one load and can be compared with one comparison
   mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of Symbols
   // the byte sequence that this symbol stands for
   u8 symbol[maxLength];
   Symbol() : gcl(0) {}
   // single-char symbol. NOTE(review): the garbage field is set to 7 rather than the
   // (8-len)*8 = 56 that set_code_len would produce -- presumably harmless because 1-byte
   // symbols never enter the hash table (where garbageBits is used); confirm before relying on it.
   explicit Symbol(u8 c, u16 code) : gcl((1<<28)|(code<<16)|7) { *(u64*) symbol = c; } // single-char symbol
   // construct from a byte range; at most 8 bytes are retained, code is initialized to 0
   explicit Symbol(const char* input, u32 len) {
      if (len < 8) {
         *(u64*) symbol = 0; // zero-pad so whole-word compares see no stale bytes
         for(u32 i=0; i<len; i++) symbol[i] = input[i];
         set_code_len(0, len);
      } else {
         *(u64*) symbol = *(u64*) input;
         set_code_len(0, 8);
      }
   }
   explicit Symbol(const char* begin, const char* end) : Symbol(begin, end-begin) {}
   explicit Symbol(u8* begin, u8* end) : Symbol((const char*)begin, end-begin) {}
   // pack length (4 bits), code (12 bits) and garbageBits = (8-len)*8 into gcl
   void set_code_len(u32 code, u32 len) { gcl = (len<<28)|(code<<16)|((8-len)*8); }
   u8 length() const { return gcl >> 28; }
   u16 code() const { return (gcl >> 16) & FSST_CODE_MASK; }
   u8 garbageBits() const { return gcl; } // number of high input bits to zero before a hashtable compare
   u8 first() const { return 0xFF & *(u64*) symbol; } // first byte of the symbol
   u16 first2() const { assert(length() > 1); return (0xFFFF & *(u64*) symbol); } // first two bytes
#define FSST_HASH_LOG2SIZE 14
#define FSST_HASH_SHIFT 15
#define FSST_HASH_PRIME1 2971215073LL
#define FSST_HASH(w) (((w)*FSST_HASH_PRIME1)^(((w)*FSST_HASH_PRIME1)>>13))
   // hash on the first four bytes only (hash-table keys are 4-byte prefixes)
   ulong hash() const { uint v0 = 0xFFFFFFFF & *(ulong*) symbol; return FSST_HASH(v0); }
   bool operator==(const Symbol& other) const { return *(u64*) symbol == *(u64*) other.symbol && length() == other.length(); }
};
// during search for the best dictionary, we probe both (in this order, first wins):
// - Symbol hashtable[8192] (keyed by the next four bytes, for s.length>2 -- certain 4-byte sequences will map to the same 3-byte symbol),
// - u16 shortCodes[65536] array at the position of the next two-byte pattern (s.length==2) and
// this search will yield a u16 code, it points into Symbol symbols[4096].
// you always find a hit, because the lowest 256 codes are all single-byte symbols
// in the hash table, the gcl field contains (low-to-high) garbageBits:16,code:12,length:4
#define FSST_GCL_FREE ((8<<28)|(((u32)FSST_CODE_MASK)<<16)) // high bits of gcl (len=8,code=FSST_CODE_MASK) indicates free bucket
// garbageBits is (8-length)*8, which is the amount of high bits to zero in the input word before comparing with the hashtable key
// ..it could of course be computed from len during lookup, but storing it precomputed in some loose bits is faster
//
// the gain field is only used in the symbol queue that sorts symbols on gain
// The symbol table under construction/use: maps input byte sequences to 12-bits codes.
// Lookup values returned here pack length:4 (bits 12..15) above code:12 (bits 0..11).
struct SymbolMap {
   static const u32 hashTabSize = 1<<FSST_HASH_LOG2SIZE; // smallest size that incurs no precision loss
   // lookup table using the next two bytes (65536 codes), or just the next single byte
   u16 shortCodes[65536]; // shortCode[X] contains code for 2-byte symbol, contains 1-byte code X&255 if there is no 2-byte symbol
   // 'symbols' is the current symbol table symbol[code].symbol is the max 8-byte 'symbol' for single-byte 'code'
   Symbol symbols[4096];
   // replicate long symbols in hashTab (avoid indirection).
   Symbol hashTab[hashTabSize]; // used for all symbols of 3 and more bytes
   u32 symbolCount; // amount of symbols in the map (max 4096)
   bool zeroTerminated; // whether we are expecting zero-terminated strings (we then also produce zero-terminated compressed strings)
   u16 lenHisto[8]; // lenHisto[x] is the amount of symbols of byte-length (x+1) in this SymbolMap
   SymbolMap() : symbolCount(256), zeroTerminated(false) {
      // stuff done once at startup
      Symbol unused = Symbol(0,FSST_CODE_MASK); // single-char symbol, exception code
      for (u32 i=0; i<256; i++) {
         symbols[i] = Symbol((u8)i,i); // single-byte symbol
      }
      for (u32 i=256; i<4096; i++) {
         symbols[i] = unused; // all other symbols are unused.
      }
      // stuff done when re-using a symbolmap during the search for the best map
      clear(); // clears the arrays (shortCodes and hashTab) and histo
   }
   // reset to the initial state: only the 256 single-byte symbols are in use
   void clear() {
      Symbol s;
      s.gcl = FSST_GCL_FREE; //marks empty in hashtab
      s.gain = 0;
      for(u32 i=0; i<hashTabSize; i++)
         hashTab[i] = s;
      for(u32 i=0; i<65536; i++)
         shortCodes[i] = 4096 | (i & 255); // single-byte symbol: 4096 = length 1 in the top nibble, code = the byte itself
      memset(lenHisto, 0, sizeof(lenHisto)); // all unused
      lenHisto[0] = symbolCount = 256; // no need to clean symbols[] as no symbols are used
   }
   // number of occupied hash-table buckets
   u32 load() {
      u32 ret = 0;
      for(u32 i=0; i<hashTabSize; i++)
         ret += (hashTab[i].gcl < FSST_GCL_FREE);
      return ret;
   }
   // insert a long symbol (>=3 bytes); returns false on a bucket collision
   bool hashInsert(Symbol s) {
      u32 idx = s.hash() & (hashTabSize-1);
      bool taken = (hashTab[idx].gcl < FSST_GCL_FREE);
      if (taken) return false; // collision in hash table
      hashTab[idx].gcl = s.gcl;
      hashTab[idx].gain = 0;
      *(u64*) hashTab[idx].symbol = (*(u64*) s.symbol) & (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl); // store with garbage bytes zeroed
      return true;
   }
   // add a symbol (length >= 2) under the next free code; returns false if it cannot be placed
   bool add(Symbol s) {
      assert(symbolCount < 4096);
      u32 len = s.length();
      assert(len > 1);
      s.set_code_len(symbolCount, len);
      if (len == 2) {
         assert(shortCodes[s.first2()] == 4096 + s.first()); // cannot be in use
         shortCodes[s.first2()] = 8192 + symbolCount; // 8192 = (len == 2) << 12
      } else if (!hashInsert(s)) {
         return false;
      }
      symbols[symbolCount++] = s;
      lenHisto[len-1]++;
      return true;
   }
   /// Find symbol in hash table, return code (with its length nibble); 0 means not found
   u16 hashFind(Symbol s) const {
      ulong idx = s.hash() & (hashTabSize-1);
      if (hashTab[idx].gcl < FSST_GCL_FREE &&
         *(u64*) hashTab[idx].symbol == (*(u64*) s.symbol & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].gcl))))
         return (hashTab[idx].gcl>>16); // matched a long symbol
      return 0;
   }
   /// Find longest expansion, return code. Always hits: falls back to shortCodes,
   /// which always contains at least the single-byte code for the first byte.
   u16 findExpansion(Symbol s) const {
      if (s.length() == 1) {
         return 4096 + s.first(); // length-1 marker plus the byte value as code
      }
      u16 ret = hashFind(s);
      return ret?ret:shortCodes[s.first2()];
   }
};
#if 0 //def NONOPT_FSST
// Reference implementation of the counters: plain full-width arrays, no cache tricks.
struct Counters {
   u16 count1[FSST_CODE_MAX];   // array to count frequency of symbols as they occur in the sample
   u16 count2[FSST_CODE_MAX][FSST_CODE_MAX]; // array to count subsequent combinations of two symbols in the sample
   void count1Set(u32 pos1, u16 val) {
      count1[pos1] = val;
   }
   void count1Inc(u32 pos1) {
      count1[pos1]++;
   }
   void count2Inc(u32 pos1, u32 pos2) {
      count2[pos1][pos2]++;
   }
   u32 count1GetNext(u32 &pos1) {
      return count1[pos1];
   }
   u32 count2GetNext(u32 pos1, u32 &pos2) {
      return count2[pos1][pos2];
   }
   void backup1(u8 *buf) {
      memcpy(buf, count1, FSST_CODE_MAX*sizeof(u16));
   }
   void restore1(u8 *buf) {
      memcpy(count1, buf, FSST_CODE_MAX*sizeof(u16));
   }
};
#else
// we keep two counters count1[pos] and count2[pos1][pos2] of resp 16 and 12-bits. Both are split into two columns for performance reasons
// first reason is to make the column we update the most during symbolTable construction (the low bits) thinner, thus reducing CPU cache pressure.
// second reason is that when scanning the array, after seeing a 64-bits 0 in the high bits column, we can quickly skip over many codes (15 or 7)
struct Counters {
   // high arrays come before low arrays, because our GetNext() methods may overrun their 64-bits reads a few bytes
   u8 count1High[FSST_CODE_MAX];   // array to count frequency of symbols as they occur in the sample (16-bits)
   u8 count1Low[FSST_CODE_MAX];    // it is split in a low and high byte: cnt = count1High*256 + count1Low
   u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX/2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high)
   u8 count2Low[FSST_CODE_MAX][FSST_CODE_MAX];    // its value is (count2High*256+count2Low) -- but high is 4-bits (we put two numbers in one, hence /2)
   // 385KB -- but hot area likely just 10 + 30*4 = 130 cache lines (=8KB)
   // store a 16-bits count split into its low/high bytes
   void count1Set(u32 pos1, u16 val) {
      count1Low[pos1] = val&255;
      count1High[pos1] = val>>8;
   }
   void count1Inc(u32 pos1) {
      if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
         count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)...
   }
   void count2Inc(u32 pos1, u32 pos2) {
      if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
         // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, respectively
         count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample)
   }
   u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range
      // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros
      u64 high = *(u64*) &count1High[pos1]; // note: this reads 8 subsequent counters [pos1..pos1+7]
      u32 zero = high?(__builtin_ctzl(high)>>3):7; // number of zero bytes
      high = (high >> (zero << 3)) & 255; // advance to nonzero counter
      if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos1
         return 0; // all zero
      u64 low = count1Low[pos1];
      if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
      return (high << 8) + low;
   }
   u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range
      // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros
      u64 high = *(u64*) &count2High[pos1][pos2>>1]; // note: this reads 16 subsequent counters [pos2..pos2+15]
      high >>= (pos2&1) << 2; // odd pos2: ignore the lowest 4 bits & we see only 15 counters
      u32 zero = high?(__builtin_ctzl(high)>>2):(15-(pos2&1)); // number of zero 4-bits counters
      high = (high >> (zero << 2)) & 15; // advance to nonzero counter
      if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2
         return 0; // all zero
      u64 low = count2Low[pos1][pos2];
      if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
      return (high << 8) + low;
   }
   // snapshot / restore of the single-symbol counters (both byte columns)
   void backup1(u8 *buf) {
      memcpy(buf, count1High, FSST_CODE_MAX);
      memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX);
   }
   void restore1(u8 *buf) {
      memcpy(count1High, buf, FSST_CODE_MAX);
      memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX);
   }
};
#endif
// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression
struct Encoder {
   shared_ptr<SymbolMap> symbolMap; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc)
   union {
      // anonymous union -- presumably so this large member is left uninitialized by the
      // Encoder constructor (it is only live during map construction); confirm intent
      Counters counters; // for counting symbol occurences during map construction
   };
};
// C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree)
ulong compressImpl(Encoder *encoder, ulong n, ulong lenIn[], u8 *strIn[], ulong size, u8 * output, ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd);
// same entry point, but the flavor is chosen automatically
ulong compressAuto(Encoder *encoder, ulong n, ulong lenIn[], u8 *strIn[], ulong size, u8 * output, ulong *lenOut, u8 *strOut[], int simd);