fsst: deterministic symbol tables (needed to fix #91)

This fixes what I believe is a bug in the fsst library that causes
symbol tables to be non-deterministic. There's an open issue/PR for
the library, so it's not yet clear whether this fix is correct or optimal.
This commit is contained in:
Marcus Holland-Moritz 2022-08-03 18:59:47 +02:00
parent 422146d7a2
commit 183a16d953

View File

@@ -111,6 +111,9 @@ SymbolTable *buildSymbolTable(Counters& counters, vector<u8*> line, size_t len[]
while (true) {
u8* old = cur;
counters.count1Inc(pos1);
if (cur==end) {
break;
}
// count single symbol (i.e. an option is not extending it)
if (st->symbols[pos1].length() != 1)
counters.count1Inc(*cur);
@@ -130,8 +133,6 @@ SymbolTable *buildSymbolTable(Counters& counters, vector<u8*> line, size_t len[]
pos2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK;
cur += 1;
}
} else if (cur==end) {
break;
} else {
assert(cur<end);
pos2 = st->findLongestSymbol(cur, end);