diff --git a/CMakeLists.txt b/CMakeLists.txt index a546028d..16525e51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -495,6 +495,7 @@ if(WITH_TESTS) test/filesystem_test.cpp test/filesystem_writer_test.cpp test/fragment_category_test.cpp + test/fsst_test.cpp test/glob_matcher_test.cpp test/global_metadata_test.cpp test/integral_value_parser_test.cpp diff --git a/cmake/libdwarfs.cmake b/cmake/libdwarfs.cmake index bd1678f1..581e9a89 100644 --- a/cmake/libdwarfs.cmake +++ b/cmake/libdwarfs.cmake @@ -51,6 +51,7 @@ add_library( src/internal/file_status_conv.cpp src/internal/fs_section.cpp src/internal/fs_section_checker.cpp + src/internal/fsst.cpp src/internal/glob_to_regex.cpp src/internal/malloc_buffer.cpp src/internal/metadata_utils.cpp diff --git a/include/dwarfs/internal/fsst.h b/include/dwarfs/internal/fsst.h new file mode 100644 index 00000000..fb588e88 --- /dev/null +++ b/include/dwarfs/internal/fsst.h @@ -0,0 +1,73 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the “Software”), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace dwarfs::internal { + +class fsst_encoder { + public: + struct bulk_compression_result { + std::string dictionary; + std::string buffer; + std::vector compressed_data; + }; + + static std::optional + compress(std::span data, bool force = false); + static std::optional + compress(std::span data, bool force = false); +}; + +class fsst_decoder { + public: + explicit fsst_decoder(std::string_view dictionary); + + std::string decompress(std::string_view data) const { + return impl_->decompress(data); + } + + class impl { + public: + virtual ~impl() = default; + + virtual std::string decompress(std::string_view data) const = 0; + }; + + private: + std::unique_ptr impl_; +}; + +} // namespace dwarfs::internal diff --git a/src/internal/fsst.cpp b/src/internal/fsst.cpp new file mode 100644 index 00000000..5517116a --- /dev/null +++ b/src/internal/fsst.cpp @@ -0,0 +1,174 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the “Software”), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include + +#include + +#include + +#include + +namespace dwarfs::internal { + +namespace { + +template +std::optional +fsst_compress_(std::span input, bool force) { + std::optional output; + + if (input.empty()) { + return output; + } + + auto const size = input.size(); + size_t total_input_size = 0; + std::vector len_vec; + std::vector ptr_vec; + + len_vec.reserve(size); + ptr_vec.reserve(size); + + for (auto const& s : input) { + ptr_vec.emplace_back(reinterpret_cast(s.data())); + len_vec.emplace_back(s.size()); + total_input_size += s.size(); + } + + std::unique_ptr<::fsst_encoder_t, decltype(&::fsst_destroy)> enc{ + ::fsst_create(size, len_vec.data(), ptr_vec.data(), 0), &::fsst_destroy}; + + std::string symtab; + + symtab.resize(sizeof(::fsst_decoder_t)); + + auto const symtab_size = + ::fsst_export(enc.get(), reinterpret_cast(symtab.data())); + symtab.resize(symtab_size); + + std::vector out_len_vec; + std::vector out_ptr_vec; + std::string buffer; + + out_len_vec.resize(size); + out_ptr_vec.resize(size); + + if (symtab_size >= total_input_size && !force) { + return output; + } + + buffer.resize(total_input_size); + + for (;;) { + auto const num_compressed = ::fsst_compress( + enc.get(), size, len_vec.data(), ptr_vec.data(), buffer.size(), + reinterpret_cast(buffer.data()), out_len_vec.data(), + out_ptr_vec.data()); + + if (num_compressed == size) { + break; + } else if (!force) { + return output; + } + + buffer.resize(2 * buffer.size()); + } + + size_t const compressed_size = + (out_ptr_vec.back() - out_ptr_vec.front()) + out_len_vec.back(); + + if (symtab_size + compressed_size >= total_input_size && !force) { + return output; + } + + assert(reinterpret_cast(out_ptr_vec.front()) == buffer.data()); + assert(compressed_size == std::accumulate(out_len_vec.begin(), + out_len_vec.end(), + static_cast(0))); + + buffer.resize(compressed_size); + + output.emplace(); + + output->dictionary = std::move(symtab); + output->buffer = std::move(buffer); + output->compressed_data.reserve(size); + + for (size_t i = 0; i < size; ++i) { + output->compressed_data.emplace_back(std::string_view( + reinterpret_cast(out_ptr_vec[i]), out_len_vec[i])); + } + + return output; +} + +class fsst_decoder_ : public fsst_decoder::impl { + public: + explicit fsst_decoder_(std::string_view dictionary) { + auto const read = ::fsst_import( + &decoder_, reinterpret_cast(dictionary.data())); + if (read != dictionary.size()) { + throw std::runtime_error(fmt::format( + "read {0} symtab bytes, expected {1}", read, dictionary.size())); + } + } + + std::string decompress(std::string_view data) const override { + thread_local std::string out; + auto const size = data.size(); + out.resize(8 * size); + auto outlen = ::fsst_decompress( + &decoder_, size, reinterpret_cast(data.data()), + out.size(), reinterpret_cast(out.data())); + out.resize(outlen); + return out; + } + + private: + ::fsst_decoder_t decoder_; +}; + +} // namespace + +auto fsst_encoder::compress(std::span data, bool force) + -> std::optional { + return fsst_compress_(data, force); +} + +auto fsst_encoder::compress(std::span data, bool force) + -> std::optional { + return fsst_compress_(data, force); +} + +fsst_decoder::fsst_decoder(std::string_view dictionary) + : impl_{std::make_unique(dictionary)} {} + +} // namespace dwarfs::internal diff --git a/src/internal/string_table.cpp b/src/internal/string_table.cpp index 295c5715..0f30a645 100644 --- a/src/internal/string_table.cpp +++ b/src/internal/string_table.cpp @@ -31,11 +31,10 @@ #include -#include - #include #include +#include #include namespace dwarfs::internal { @@ -78,16 +77,8 @@ class packed_string_table : public string_table::impl { auto st = v_.symtab(); DWARFS_CHECK(st, "symtab unexpectedly unset"); - dec_ = std::make_unique(); - auto read = fsst_import( - dec_.get(), reinterpret_cast(st->data())); - - if (read != st->size()) { - DWARFS_THROW(runtime_error, - fmt::format("read {0} symtab bytes, expected {1}", read, - st->size())); - } + dec_.emplace(st.value()); ti << "imported dictionary for " << name << " string table"; } @@ -118,14 +109,7 @@ class packed_string_table : public string_table::impl { } if constexpr (PackedData) { - thread_local std::string out; - size_t size = end - beg; - out.resize(8 * size); - auto outlen = fsst_decompress( - dec_.get(), size, reinterpret_cast(beg), - out.size(), reinterpret_cast(out.data())); - out.resize(outlen); - return out; + return dec_->decompress(std::string_view{beg, end}); } return {beg, end}; @@ -158,7 +142,7 @@ class packed_string_table : public string_table::impl { string_table::PackedTableView v_; char const* const buffer_; std::vector index_; - std::unique_ptr dec_; + std::optional dec_; }; string_table::string_table(LegacyTableView v) @@ -191,94 +175,28 @@ template thrift::metadata::string_table string_table::pack_generic(std::span input, pack_options const& options) { - auto size = input.size(); - bool pack_data = options.pack_data; - size_t total_input_size = 0; - std::string buffer; - std::string symtab; - std::vector out_len_vec; - std::vector out_ptr_vec; + auto const size = input.size(); + std::optional res; - if (input.empty()) { - pack_data = false; - } - - if (pack_data) { - std::vector len_vec; - std::vector ptr_vec; - - len_vec.reserve(size); - ptr_vec.reserve(size); - - for (auto const& s : input) { - ptr_vec.emplace_back(reinterpret_cast(s.data())); - len_vec.emplace_back(s.size()); - total_input_size += s.size(); - } - - std::unique_ptr<::fsst_encoder_t, decltype(&::fsst_destroy)> enc{ - ::fsst_create(size, len_vec.data(), ptr_vec.data(), 0), - &::fsst_destroy}; - - symtab.resize(sizeof(::fsst_decoder_t)); - - auto symtab_size = ::fsst_export( - enc.get(), reinterpret_cast(symtab.data())); - symtab.resize(symtab_size); - - if (symtab.size() < total_input_size or options.force_pack_data) { - out_len_vec.resize(size); - out_ptr_vec.resize(size); - - buffer.resize(options.force_pack_data ? total_input_size - : total_input_size - symtab.size()); - size_t num_compressed = 0; - - for (;;) { - num_compressed = ::fsst_compress( - enc.get(), size, len_vec.data(), ptr_vec.data(), buffer.size(), - reinterpret_cast(buffer.data()), out_len_vec.data(), - out_ptr_vec.data()); - - if (num_compressed == size || !options.force_pack_data) { - break; - } - - buffer.resize(2 * buffer.size()); - } - - pack_data = num_compressed == size; - } else { - pack_data = false; - } - } else { - for (auto const& s : input) { - total_input_size += s.size(); - } + if (options.pack_data) { + res = fsst_encoder::compress(input, options.force_pack_data); } thrift::metadata::string_table output; - if (pack_data) { + if (res.has_value()) { // store compressed - size_t compressed_size = - (out_ptr_vec.back() - out_ptr_vec.front()) + out_len_vec.back(); - - DWARFS_CHECK(reinterpret_cast(out_ptr_vec.front()) == buffer.data(), - "string table compression pointer mismatch"); - // TODO: only enable this in debug mode - DWARFS_CHECK(compressed_size == std::accumulate(out_len_vec.begin(), - out_len_vec.end(), - static_cast(0)), - "string table compression pointer mismatch"); - - buffer.resize(compressed_size); - output.buffer()->swap(buffer); - output.symtab() = std::move(symtab); + output.buffer() = std::move(res->buffer); + output.symtab() = std::move(res->dictionary); output.index()->resize(size); - std::ranges::copy(out_len_vec, output.index()->begin()); + for (size_t i = 0; i < size; ++i) { + output.index()[i] = res->compressed_data[i].size(); + } } else { // store uncompressed + auto const total_input_size = + std::accumulate(input.begin(), input.end(), size_t{0}, + [](size_t n, auto const& s) { return n + s.size(); }); output.buffer()->reserve(total_input_size); output.index()->reserve(size); for (auto const& s : input) { diff --git a/test/fsst_test.cpp b/test/fsst_test.cpp new file mode 100644 index 00000000..e05c8a5f --- /dev/null +++ b/test/fsst_test.cpp @@ -0,0 +1,1135 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + * + * SPDX-License-Identifier: GPL-3.0-only + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include + +using namespace std::string_view_literals; +using namespace dwarfs::internal; + +namespace { + +constexpr std::array test_strings{ + "aburabozu", + "abuzz", + "acacatechol", + "acclamator", + "accumulatively", + "acephalan", + "acetaldehydrase", + "acetone", + "ackman", + "acquaintant", + "acquisited", + "acraniate", + "acushla", + "adamantoma", + "addability", + "adipomatous", + "adjectively", + "administrative", + "Adramelech", + "aku", + "alaihi", + "alburnum", + "alcogene", + "alcoholdom", + "alcoholometric", + "alliciency", + "alloerotism", + "allowedly", + "alluring", + "alpenhorn", + "alphabetics", + "alternariose", + "Amasta", + "amberoid", + "ambidexterity", + "ambient", + "Ambystoma", + "anaerobia", + "analogically", + "anamnionic", + "Anaryan", + "Anastasia", + "aniseikonia", + "Anophelinae", + "antebridal", + "antiministerialist", + "antivenom", + "antivermicular", + "antoeci", + "aphorize", + "aphrizite", + "apsidal", + "aquopentamminecobaltic", + "ardeb", + "areometric", + "argumentatory", + "armpit", + "arteriostenosis", + "Arthurian", + "Aruac", + "asbestoidal", + "aspirata", + "assise", + "astigmatical", + "asynaptic", + "asystolic", + "atomician", + "attachment", + "attingence", + "aurothiosulphuric", + "autoanalytic", + "autoinduction", + "automata", + "autosymbolic", + "avenalin", + "axmanship", + "azine", + "babloh", + "babuina", + "babyishly", + "Bacchides", + "bacteriform", + "Baniva", + "baronetical", + "bathroomed", + "bauta", + "beaminess", + "beamwork", + "becircled", + "bedrop", + "bepearl", + "bereaven", + "besieging", + "betinge", + "Bharata", + "bibliothecal", + "bicuspid", + "binarium", + "birch", + "Birkeniidae", + "bishopling", + "blacklegs", + "blandness", + "Blankit", + "blenching", + "blockheadedly", + "Blumea", + "blunderful", + "bonnyvis", + "boobery", + "botanize", + "botryoid", + "bountifulness", + "brachystaphylic", + "Brachystomata", + "branchful", + "Branchiopoda", + "braunite", + "breeder", + "brideweed", + "broadpiece", + "bronchomucormycosis", + "Buddleia", + "buffer", + "bullionless", + "bump", + "burnt", + "burtonize", + "butlerage", + "cacodemon", + "calangay", + "calfling", + "canniness", + "Cantabrize", + "capocchia", + "carapacic", + "carnotite", + "carpentering", + "caryophyllous", + "cashcuttee", + "castlet", + "categorist", + "causticity", + "cavate", + "cavernous", + "cecidologist", + "centiliter", + "cephalopathy", + "Cercolabidae", + "cerebrosensorial", + "Cesare", + "Chaouia", + "chapatty", + "chargeman", + "chati", + "chatteration", + "cheecha", + "chest", + "cheve", + "chiastic", + "chiastoneury", + "chlorite", + "chronal", + "churnmilk", + "circuity", + "circumoral", + "clackety", + "clearstarch", + "Cleistothecopsis", + "clifflike", + "clitelline", + "clithe", + "coaration", + "codomestication", + "coferment", + "cog", + "Colchis", + "collocatory", + "colombier", + "colophonium", + "colpindach", + "concern", + "concessible", + "conjugal", + "conjury", + "consenting", + "constable", + "continentality", + "contraponend", + "Contraposaune", + "conventionally", + "coprecipitation", + "coprophilous", + "corespect", + "cossette", + "cotranslator", + "cottonbush", + "councilorship", + "coverchief", + "crampy", + "craniovertebral", + "Craterid", + "creamy", + "credulity", + "criticship", + "cubbishly", + "cunila", + "Cyanastraceae", + "cyanaurate", + "cyanochroia", + "cyanole", + "cylindrograph", + "Cypselid", + "dakir", + "dasturi", + "dealkylate", + "deaminize", + "decretum", + "dehors", + "demipike", + "Dendroidea", + "diaphony", + "dicing", + "diglyceride", + "Dioon", + "diphenylchloroarsine", + "Disamis", + "disassociation", + "discircumspection", + "discursiveness", + "disdiaclast", + "disembower", + "disfashion", + "dishonorably", + "disroot", + "distastefulness", + "distinctly", + "distortionist", + "disyllabic", + "divertibility", + "doored", + "dorsoventral", + "doughlike", + "downstairs", + "dragade", + "drugless", + "Dryope", + "duke", + "dusting", + "earthed", + "eatable", + "ebracteate", + "ectopic", + "eelworm", + "elect", + "electrion", + "electrocontractility", + "electromerism", + "electropotential", + "elusive", + "embolium", + "emissile", + "Emmental", + "Empidonax", + "emptor", + "enclitical", + "endotheca", + "endurer", + "enjoyably", + "epistemological", + "epizoan", + "equalist", + "equally", + "equerry", + "equiangular", + "equinoctially", + "equiparant", + "Eriophorum", + "erotic", + "Esopus", + "espadon", + "ethmovomerine", + "euphemist", + "Europasian", + "evagation", + "excusator", + "exemplarily", + "exhalatory", + "exiguity", + "expectance", + "expeditiousness", + "extemporally", + "fabiform", + "facultate", + "fallacy", + "fantoccini", + "fanwork", + "fastland", + "Faustian", + "fawnskin", + "fetch", + "ficklety", + "figurize", + "Filipiniana", + "fingery", + "finiteness", + "flayer", + "flindosy", + "flinger", + "flinthearted", + "flogging", + "folliful", + "foodstuff", + "foredeck", + "forewoman", + "fortieth", + "fortin", + "fraternally", + "freeholdership", + "freewill", + "Fremontia", + "friarly", + "Friulian", + "fuguist", + "fulgentness", + "furfuraceously", + "furiosa", + "galant", + "galany", + "gastrocnemius", + "gaywings", + "gazelle", + "geminiform", + "generic", + "geologize", + "geophilous", + "germal", + "gerontocrat", + "gien", + "glaucin", + "gleefully", + "Gliridae", + "glottogony", + "Glyconian", + "gnatty", + "gobiesocid", + "gonoplasm", + "granula", + "gudewife", + "Guisard", + "gumwood", + "gurgle", + "Gyges", + "haggardly", + "hammerwort", + "hammochrysos", + "hangingly", + "haptene", + "hardpan", + "harr", + "hashish", + "hauchecornite", + "helleborein", + "hemiplegy", + "hent", + "herborization", + "heroify", + "Hesperian", + "heteroerotism", + "histology", + "hoboism", + "honoree", + "hookheal", + "horned", + "houseminder", + "huantajayite", + "hubmaking", + "hunkerous", + "Huterian", + "hyaenodont", + "hybridization", + "hydroboracite", + "hymenopterologist", + "hypostilbite", + "ichthyopolist", + "idiomorphism", + "idoneous", + "immanity", + "immeritorious", + "impartiality", + "impersonate", + "improvisatorially", + "impuberal", + "inclinableness", + "inconsequential", + "incopresentable", + "incrustant", + "incurvation", + "indecorous", + "indictee", + "informant", + "infracentral", + "ingeldable", + "inherently", + "initially", + "initiation", + "inscribableness", + "insocially", + "intercreate", + "interisland", + "interzooecial", + "introsentient", + "inversed", + "investment", + "invigor", + "ironheartedly", + "isomerical", + "isospondylous", + "itatartrate", + "jadery", + "janitor", + "Jebusi", + "jimpness", + "jinny", + "Jo", + "jugulum", + "kale", + "kalymmocyte", + "kelyphite", + "kerbstone", + "kettle", + "khedive", + "Koelreuteria", + "Koreshan", + "kuttar", + "lairdess", + "Lappish", + "latch", + "Latinize", + "laudatorily", + "laumontite", + "lavaret", + "leaky", + "legislative", + "legislatorial", + "leoncito", + "leopard", + "lipoid", + "liroconite", + "livingness", + "loasaceous", + "loathness", + "logarithmetically", + "logorrhea", + "loquacious", + "lotto", + "lowerable", + "lycoperdaceous", + "maintainer", + "Malaclemys", + "mammalogist", + "maney", + "Margery", + "marron", + "mastoidohumeral", + "mauger", + "mazzard", + "meered", + "melicerous", + "meningomyelitis", + "merocrystalline", + "mesogyrate", + "mesolabe", + "mesothermal", + "metacresol", + "meteorical", + "metronomic", + "Michigander", + "microchemical", + "micropolariscope", + "microtomic", + "mildewer", + "misdo", + "misemphasis", + "misgovernance", + "misrender", + "monoid", + "mooncreeper", + "moratory", + "morbidity", + "mottramite", + "moundlet", + "muleman", + "multiplex", + "multitudinal", + "musquaw", + "myope", + "Myrcia", + "mythogonic", + "Nabalitic", + "nailproof", + "naipkin", + "nasociliary", + "Nearctica", + "neophilological", + "neuromyelitis", + "nickelic", + "nidology", + "niello", + "niggardize", + "nonacquittal", + "nonadult", + "noncoloring", + "nonconducive", + "noncreeping", + "noncurling", + "nondegeneration", + "nongraduated", + "nonheritor", + "nonoccupation", + "nonplanar", + "nonprevalence", + "nonretiring", + "nonrhyming", + "nonsecretory", + "nonspecial", + "nonsubstantiation", + "norbergite", + "Notus", + "nucleon", + "number", + "nuncupatively", + "nymphid", + "Observantist", + "odontonosology", + "offendant", + "Oklahoma", + "oligosite", + "omniparity", + "oncosis", + "ophthalmiatrics", + "ophthalmitis", + "opposure", + "orendite", + "Orientalia", + "ornithosaurian", + "orthosemidine", + "orthotactic", + "Oryza", + "oscheocele", + "osse", + "ostempyesis", + "ostreoid", + "Otariinae", + "outcropper", + "outsmart", + "outsuck", + "outwander", + "overcolor", + "overdeeming", + "overdrowsed", + "overjawed", + "overpitched", + "overpole", + "overremissness", + "overspring", + "oversqueak", + "oversystematic", + "overtrump", + "oxberry", + "oxyketone", + "palpiform", + "Panak", + "pancreatotomy", + "Panorpidae", + "Pantagruel", + "Pantagruelically", + "pantamorphic", + "pantochromism", + "pantophile", + "papaverous", + "Paradoxides", + "paranymphal", + "parasitotropic", + "parfilage", + "Parnassus", + "partisan", + "partitive", + "pathoanatomical", + "pauseful", + "pedagogism", + "Pedetidae", + "pejorate", + "pelican", + "pelmatogram", + "peltiferous", + "pendragon", + "pensive", + "pentaspherical", + "Percheron", + "periphyllum", + "peritomize", + "peritonsillitis", + "pervasively", + "Petiolata", + "phalarope", + "pharmacognosia", + "Phenalgin", + "philomystic", + "Pholadacea", + "phonophotography", + "photographize", + "photolysis", + "photometrograph", + "phraseologically", + "phrenic", + "Phyteus", + "phytomorphic", + "pietistic", + "pikle", + "pinacone", + "pinsons", + "plasterer", + "play", + "plenicorn", + "pleomastia", + "plessimeter", + "pleuroperitonaeal", + "plexiform", + "plumade", + "pluviometrical", + "pneumony", + "podder", + "podophthalmitic", + "pokable", + "Polistes", + "porcellanid", + "postspinous", + "potto", + "powwower", + "praesystolic", + "pram", + "preaccomplishment", + "preanterior", + "preboding", + "precordiality", + "predeficient", + "pregranite", + "prehistorics", + "preliability", + "premeditative", + "prepatriotic", + "presbytic", + "prespecialist", + "proceed", + "Proctotrypidae", + "proextension", + "profitlessness", + "projecture", + "promptbook", + "proreduction", + "prosodiac", + "protomorph", + "protosiphonaceous", + "provoker", + "proxenos", + "proximally", + "Prunella", + "prunelle", + "pseudocartilaginous", + "Pseudopeziza", + "pseudosocialistic", + "pseudosyllogism", + "psychoautomatic", + "Pteranodon", + "Ptolemaic", + "pulverization", + "pyrochlore", + "quibble", + "quinize", + "quintette", + "quintile", + "Rajah", + "Rastaban", + "rebato", + "Rebecca", + "rebolt", + "reburn", + "recarburization", + "receptionism", + "recession", + "recipient", + "redjacket", + "reflorescent", + "refusion", + "regimentalled", + "Reichslander", + "remilitarize", + "remindal", + "renomination", + "repersuade", + "repertorium", + "replenisher", + "representable", + "reprise", + "reserved", + "resmell", + "reticulovenose", + "retrace", + "retraxit", + "retrenchable", + "reventilate", + "rhabdomal", + "Rhaetian", + "rhubarb", + "Rhynchospora", + "Ribhus", + "ricksha", + "rimose", + "Russolatry", + "saccharomyces", + "saddlery", + "sagacious", + "samkara", + "sauntering", + "Sciarinae", + "scoon", + "scranning", + "scribblatory", + "scride", + "Scriptureless", + "scullionish", + "seamanship", + "seashore", + "sedentarily", + "selvaged", + "sematic", + "semiantique", + "semicollar", + "semigenuflection", + "semiorb", + "semiordinate", + "semioxidated", + "semiproof", + "semiquadrantly", + "semisociative", + "semitheological", + "semuncia", + "sensal", + "septarian", + "seriation", + "serpentina", + "serranoid", + "shaftman", + "Shakespeareana", + "shandrydan", + "sheepbiter", + "Shetlandic", + "shoddywards", + "showless", + "sifting", + "signifier", + "sinoauricular", + "siphonapterous", + "siphonosome", + "sittee", + "smellage", + "Smyrniot", + "sniffing", + "snubbishness", + "soapberry", + "sociologizer", + "softball", + "solemnize", + "solitudinize", + "somatical", + "somnolently", + "sooky", + "soonish", + "sparsely", + "spathed", + "speechmaking", + "spellword", + "Sphaerocarpus", + "sphindid", + "splanchnodynia", + "splenocyte", + "spondylexarthrosis", + "spongiolin", + "sporeling", + "spotted", + "squireless", + "stachys", + "Stalinism", + "stampweed", + "stannate", + "stanner", + "statesmanship", + "stauracin", + "stenosed", + "stereoscopically", + "stickwater", + "Stilophora", + "stimulability", + "stonify", + "storkish", + "stoutly", + "stove", + "strenuousness", + "strongbox", + "sturdiness", + "sufflation", + "sulfamethazine", + "sunshining", + "supercarbonate", + "superfluousness", + "superfortunate", + "superreliance", + "supramaxilla", + "surinamine", + "surprisable", + "surrebut", + "swapping", + "Swazi", + "swingable", + "Synchytrium", + "syndesmology", + "syntaxist", + "tabor", + "tairn", + "tangle", + "Tantony", + "tartaret", + "teammate", + "tearable", + "telecommunication", + "telford", + "tempre", + "tender", + "testicle", + "thegnly", + "theoretician", + "theosophism", + "Thiobacillus", + "throatroot", + "Thunnidae", + "tidewater", + "Timonian", + "Timuquan", + "tolerable", + "tonicobalsamic", + "tonsillectomize", + "toolstock", + "tournant", + "trabacolo", + "tragicomicality", + "tramway", + "translative", + "transmigrationist", + "trianthous", + "trichitis", + "tricoryphean", + "trimesitic", + "trionychoid", + "tristichous", + "trona", + "Tsuga", + "turbaned", + "turkeyberry", + "twangy", + "ultraconfident", + "ultraconservative", + "Ulvales", + "unalimentary", + "unamply", + "unauthentic", + "unbold", + "unceremented", + "uncially", + "uncompact", + "unconcernment", + "unconsoling", + "uncultured", + "undecreed", + "undefinedly", + "undeformedness", + "undenominated", + "undercharged", + "underpassion", + "undevelopable", + "unduncelike", + "unduty", + "unexcitable", + "unfanned", + "unfence", + "unfighting", + "unglorious", + "ungrow", + "unhaste", + "unifocal", + "unilabiated", + "unimperialistic", + "unimposedly", + "unincarnate", + "unliquid", + "unmechanize", + "unmellowed", + "unmistakable", + "unmuddle", + "unnagging", + "unnegotiableness", + "unobstruct", + "unobtrusiveness", + "unorganically", + "unperishably", + "unplacid", + "unpolled", + "unpossessed", + "unprivileged", + "unpronounced", + "unproportioned", + "unpurged", + "unreclined", + "unregretted", + "unremittingness", + "unrepresentedness", + "unruled", + "unsalutary", + "unsalvability", + "unsanctify", + "unsaponified", + "unseated", + "unseldom", + "unshavenly", + "unsolvable", + "unstopper", + "unsung", + "unsupplicated", + "untaintedness", + "untenty", + "unthaw", + "untrainedly", + "untranspassable", + "unvetoed", + "unvocalized", + "unwalled", + "unwarlike", + "unwhisked", + "unwrathful", + "uphoard", + "upprick", + "uptrain", + "upwork", + "upwreathe", + "urbanely", + "ureometry", + "ureterocystoscope", + "urethroscopy", + "urological", + "urticarial", + "usara", + "vacantry", + "vaccinogenous", + "valeramide", + "valonia", + "vanillinic", + "velate", + "viewless", + "visceroparietal", + "vituperative", + "vocably", + "volatilizable", + "voucher", + "Wagneriana", + "waketime", + "walleye", + "wappenschaw", + "waxweed", + "wear", + "weatherer", + "weave", + "werewolfism", + "wheem", + "whippletree", + "whistlewing", + "whom", + "wicking", + "widowership", + "windwayward", + "wisehearted", + "workbench", + "worldish", + "worsening", + "xenian", + "yachting", + "Yugoslavic", + "zebu", + "zimme", + "zoocurrent", + "zoopraxiscope", +}; + +constexpr auto total_string_length = std::accumulate( + test_strings.begin(), test_strings.end(), 0, + [](size_t sum, std::string_view str) { return sum + str.size(); }); + +} // namespace + +TEST(fsst_test, basic) { + auto const res = fsst_encoder::compress(test_strings); + + ASSERT_TRUE(res.has_value()); + EXPECT_EQ(test_strings.size(), res->compressed_data.size()); + EXPECT_GT(res->dictionary.size(), 550); + EXPECT_LT(res->dictionary.size(), 600); + EXPECT_LT(res->buffer.size(), 9 * total_string_length / 17); + + auto const decoder = fsst_decoder{res->dictionary}; + + for (size_t i = 0; i < test_strings.size(); ++i) { + auto const& str = test_strings[i]; + auto const& compressed_data = res->compressed_data[i]; + + auto const decompressed = decoder.decompress(compressed_data); + + EXPECT_EQ(str, decompressed); + } +} + +TEST(fsst_random_test, random_strings) { +#ifdef DWARFS_TEST_CROSS_COMPILE + static constexpr int num_random_tests = 100; +#else + static constexpr int num_random_tests = 1000; +#endif + + std::mt19937 rng{42}; + std::uniform_int_distribution sample_size_dist(0, 100); + std::vector sample_sizes; + + sample_sizes.reserve(num_random_tests); + sample_sizes.push_back(0); // Definitely include the empty set + std::ranges::generate_n(std::back_inserter(sample_sizes), + num_random_tests - 1, + [&]() { return sample_size_dist(rng); }); + + for (auto const sample_size : sample_sizes) { + std::vector input(sample_size); + std::ranges::sample(test_strings, input.begin(), input.size(), rng); + + auto const res = fsst_encoder::compress(input, true); + auto const res2 = fsst_encoder::compress(input); + + if (sample_size == 0) { + ASSERT_FALSE(res.has_value()); + ASSERT_FALSE(res2.has_value()); + } else { + ASSERT_TRUE(res.has_value()); + EXPECT_EQ(input.size(), res->compressed_data.size()); + + auto const total_input_length = + std::accumulate(input.begin(), input.end(), size_t{0}, + [](size_t n, auto const& s) { return n + s.size(); }); + + if (res->dictionary.size() + res->buffer.size() < total_input_length) { + EXPECT_TRUE(res2.has_value()); + } else { + EXPECT_FALSE(res2.has_value()); + } + + if (sample_size >= 500) { + EXPECT_LE(res->buffer.size(), 60 * total_input_length / 100); + } else if (sample_size >= 200) { + EXPECT_LE(res->buffer.size(), 70 * total_input_length / 100); + } else if (sample_size >= 100) { + EXPECT_LE(res->buffer.size(), 100 * total_input_length / 100); + } else if (sample_size >= 20) { + EXPECT_LE(res->buffer.size(), 120 * total_input_length / 100); + } else { + EXPECT_LE(res->buffer.size(), 200 * total_input_length / 100); + } + + auto const decoder = fsst_decoder{res->dictionary}; + + for (size_t i = 0; i < input.size(); ++i) { + auto const& str = input[i]; + auto const& compressed_data = res->compressed_data[i]; + + auto const decompressed = decoder.decompress(compressed_data); + + EXPECT_EQ(str, decompressed); + } + } + } +} diff --git a/test/tool_main_test.cpp b/test/tool_main_test.cpp index f5b9f52a..c882e917 100644 --- a/test/tool_main_test.cpp +++ b/test/tool_main_test.cpp @@ -2114,12 +2114,10 @@ TEST(mkdwarfs_test, pack_mode_all) { auto fs = t.fs_from_stdout(); auto info = fs.info_as_json({.features = reader::fsinfo_features::for_level(2)}); - std::set expected = {"packed_chunk_table", - "packed_directories", - "packed_names", - "packed_names_index", - "packed_shared_files_table", - "packed_symlinks_index"}; + std::set expected = { + "packed_chunk_table", "packed_directories", "packed_names", + "packed_names_index", "packed_shared_files_table", "packed_symlinks", + "packed_symlinks_index"}; std::set fsopt; for (auto const& opt : info["options"]) { fsopt.insert(opt.get());