Update to latest version of libfsst

This commit is contained in:
Marcus Holland-Moritz 2022-11-17 13:19:47 +01:00
parent 21a6d688b4
commit bd09004889

View File

@ -105,50 +105,58 @@ SymbolTable *buildSymbolTable(Counters& counters, vector<u8*> line, size_t len[]
if (rnd128(i) > sampleFrac) continue; if (rnd128(i) > sampleFrac) continue;
} }
if (cur < end) { if (cur < end) {
u16 pos2 = 255, pos1 = st->findLongestSymbol(cur, end); u8* start = cur;
cur += st->symbols[pos1].length(); u16 code2 = 255, code1 = st->findLongestSymbol(cur, end);
gain += (int) (st->symbols[pos1].length()-(1+isEscapeCode(pos1))); cur += st->symbols[code1].length();
gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1)));
while (true) { while (true) {
u8* old = cur; // count single symbol (i.e. an option is not extending it)
counters.count1Inc(pos1); counters.count1Inc(code1);
// as an alternative, consider just using the next byte..
if (st->symbols[code1].length() != 1) // .. but do not count single byte symbols doubly
counters.count1Inc(*start);
if (cur==end) { if (cur==end) {
break; break;
} }
// count single symbol (i.e. an option is not extending it)
if (st->symbols[pos1].length() != 1) // now match a new symbol
counters.count1Inc(*cur); start = cur;
if (cur<end-7) { if (cur<end-7) {
u64 word = fsst_unaligned_load(cur); u64 word = fsst_unaligned_load(cur);
size_t pos = word & 0xFFFFFF; size_t code = word & 0xFFFFFF;
size_t idx = FSST_HASH(pos)&(st->hashTabSize-1); size_t idx = FSST_HASH(code)&(st->hashTabSize-1);
Symbol s = st->hashTab[idx]; Symbol s = st->hashTab[idx];
pos2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK;
word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) { if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) {
pos2 = s.code(); code2 = s.code();
cur += s.length(); cur += s.length();
} else if (pos2 >= FSST_CODE_BASE) { } else if (code2 >= FSST_CODE_BASE) {
cur += 2; cur += 2;
} else { } else {
pos2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK;
cur += 1; cur += 1;
} }
} else { } else {
assert(cur<end); code2 = st->findLongestSymbol(cur, end);
pos2 = st->findLongestSymbol(cur, end); cur += st->symbols[code2].length();
cur += st->symbols[pos2].length();
} }
// compute compressed output size // compute compressed output size
gain += ((int) (cur-old))-(1+isEscapeCode(pos2)); gain += ((int) (cur-start))-(1+isEscapeCode(code2));
// now count the subsequent two symbols we encode as an extension possibility // now count the subsequent two symbols we encode as an extension possibility
if (sampleFrac < 128) { // no need to count pairs in final round if (sampleFrac < 128) { // no need to count pairs in final round
counters.count2Inc(pos1, pos2); // consider the symbol that is the concatenation of the two last symbols
if ((cur-old) > 1) // do not count escaped bytes doubly counters.count2Inc(code1, code2);
counters.count2Inc(pos1, *old);
// as an alternative, consider just extending with the next byte..
if ((cur-start) > 1) // ..but do not count single byte extensions doubly
counters.count2Inc(code1, *start);
} }
pos1 = pos2; code1 = code2;
} }
} }
} }