refactor: factor out fsst components

This commit is contained in:
Marcus Holland-Moritz 2025-08-19 15:00:25 +02:00
parent 1e48783d6d
commit 8893513c8f
7 changed files with 1405 additions and 105 deletions

View File

@ -495,6 +495,7 @@ if(WITH_TESTS)
test/filesystem_test.cpp
test/filesystem_writer_test.cpp
test/fragment_category_test.cpp
test/fsst_test.cpp
test/glob_matcher_test.cpp
test/global_metadata_test.cpp
test/integral_value_parser_test.cpp

View File

@ -51,6 +51,7 @@ add_library(
src/internal/file_status_conv.cpp
src/internal/fs_section.cpp
src/internal/fs_section_checker.cpp
src/internal/fsst.cpp
src/internal/glob_to_regex.cpp
src/internal/malloc_buffer.cpp
src/internal/metadata_utils.cpp

View File

@ -0,0 +1,73 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the Software), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* SPDX-License-Identifier: MIT
*/
#pragma once
#include <memory>
#include <optional>
#include <span>
#include <string>
#include <string_view>
#include <vector>
namespace dwarfs::internal {
/**
 * Bulk string compression using FSST.
 *
 * Both `compress` overloads forward to one shared implementation; they only
 * differ in how the caller holds the input strings.
 */
class fsst_encoder {
public:
/**
 * Result of a successful bulk compression.
 *
 * NOTE(review): `compressed_data` holds non-owning views that are created
 * from pointers into the compression output buffer. Treat them as valid only
 * while `buffer` is alive and has not been modified, copied from, or
 * relocated -- confirm before storing or moving a result around.
 */
struct bulk_compression_result {
// Exported FSST symbol table needed to construct a matching fsst_decoder.
std::string dictionary;
// Concatenated compressed bytes of all input strings.
std::string buffer;
// One view per input string, in input order.
std::vector<std::string_view> compressed_data;
};
/**
 * Compress all strings in `data` with a single shared dictionary.
 *
 * \param data   Strings to compress.
 * \param force  When false, the implementation gives up and returns an empty
 *               optional if the dictionary plus compressed data would not be
 *               smaller than the input. When true, those size checks are
 *               skipped.
 * \return The compression result, or an empty optional. An empty `data` span
 *         always yields an empty optional, regardless of `force`.
 */
static std::optional<bulk_compression_result>
compress(std::span<std::string_view const> data, bool force = false);
/// Same as above, for inputs held as owning `std::string` objects.
static std::optional<bulk_compression_result>
compress(std::span<std::string const> data, bool force = false);
};
/**
 * Decompresses strings that were compressed with a given FSST dictionary.
 *
 * The actual work is delegated to a backend object implementing `impl`,
 * which is created by the constructor and owned by this class.
 */
class fsst_decoder {
 public:
  /// Backend interface; the concrete implementation lives in the source file.
  class impl {
   public:
    virtual ~impl() = default;

    /// Decompress a single compressed string and return the original bytes.
    virtual std::string decompress(std::string_view data) const = 0;
  };

  /// Create a decoder for the given exported dictionary.
  explicit fsst_decoder(std::string_view dictionary);

  /// Decompress `data` by forwarding to the backend.
  std::string decompress(std::string_view data) const {
    impl const& backend = *impl_;
    return backend.decompress(data);
  }

 private:
  std::unique_ptr<impl const> impl_;
};
} // namespace dwarfs::internal

174
src/internal/fsst.cpp Normal file
View File

@ -0,0 +1,174 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the Software), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* SPDX-License-Identifier: MIT
*/
#include <cassert>
#include <numeric>
#include <stdexcept>
#include <dwarfs/internal/fsst.h>
#include <fmt/format.h>
#include <fsst.h>
namespace dwarfs::internal {
namespace {
/**
 * Bulk-compress a set of strings with FSST using one shared symbol table.
 *
 * \tparam T     String-like element type providing `data()` and `size()`.
 * \param input  Strings to compress.
 * \param force  When false, return an empty optional as soon as it is clear
 *               that dictionary + compressed data would not be smaller than
 *               the input, or if the output does not fit into a buffer the
 *               size of the input. When true, the output buffer is grown
 *               until compression succeeds and no size checks are applied.
 * \return The dictionary, the compressed buffer and one view per input string
 *         into that buffer; an empty optional if `input` is empty or
 *         compression was abandoned.
 *
 * The views in `compressed_data` refer to the storage of the returned
 * `buffer` member. They are rebased onto that member after the buffer has
 * been moved into the result, so they do not depend on whether moving the
 * string transferred or copied its storage.
 */
template <typename T>
std::optional<fsst_encoder::bulk_compression_result>
fsst_compress_(std::span<T const> input, bool force) {
  // Lower bound used when growing an empty output buffer; doubling a size of
  // zero would otherwise never make progress.
  constexpr size_t min_retry_buffer_size = 64;

  std::optional<fsst_encoder::bulk_compression_result> output;

  if (input.empty()) {
    return output;
  }

  auto const size = input.size();
  size_t total_input_size = 0;
  std::vector<size_t> len_vec;
  std::vector<unsigned char const*> ptr_vec;

  len_vec.reserve(size);
  ptr_vec.reserve(size);

  for (auto const& s : input) {
    ptr_vec.emplace_back(reinterpret_cast<unsigned char const*>(s.data()));
    len_vec.emplace_back(s.size());
    total_input_size += s.size();
  }

  std::unique_ptr<::fsst_encoder_t, decltype(&::fsst_destroy)> enc{
      ::fsst_create(size, len_vec.data(), ptr_vec.data(), 0), &::fsst_destroy};

  // Export the symbol table; the buffer is sized for the largest possible
  // export and then trimmed to what was actually written.
  std::string symtab;
  symtab.resize(sizeof(::fsst_decoder_t));
  auto const symtab_size =
      ::fsst_export(enc.get(), reinterpret_cast<unsigned char*>(symtab.data()));
  symtab.resize(symtab_size);

  // Cheap early-out before allocating any output storage: the dictionary
  // alone already outweighs the input.
  if (symtab_size >= total_input_size && !force) {
    return output;
  }

  std::vector<size_t> out_len_vec;
  std::vector<unsigned char*> out_ptr_vec;
  std::string buffer;

  out_len_vec.resize(size);
  out_ptr_vec.resize(size);
  buffer.resize(total_input_size);

  for (;;) {
    auto const num_compressed = ::fsst_compress(
        enc.get(), size, len_vec.data(), ptr_vec.data(), buffer.size(),
        reinterpret_cast<unsigned char*>(buffer.data()), out_len_vec.data(),
        out_ptr_vec.data());

    if (num_compressed == size) {
      break;
    } else if (!force) {
      // Output did not fit into a buffer as large as the input, so the
      // result cannot be a win.
      return output;
    }

    // Forced mode: retry with a larger buffer. Guard against a zero-sized
    // buffer (all input strings empty), which doubling would never grow.
    auto const doubled = 2 * buffer.size();
    buffer.resize(doubled > 0 ? doubled : min_retry_buffer_size);
  }

  size_t const compressed_size =
      (out_ptr_vec.back() - out_ptr_vec.front()) + out_len_vec.back();

  if (symtab_size + compressed_size >= total_input_size && !force) {
    return output;
  }

  assert(reinterpret_cast<char*>(out_ptr_vec.front()) == buffer.data());
  assert(compressed_size == std::accumulate(out_len_vec.begin(),
                                            out_len_vec.end(),
                                            static_cast<size_t>(0)));

  buffer.resize(compressed_size);

  // Start of the compressed data in the storage fsst_compress wrote to; used
  // only to turn the returned pointers into offsets.
  unsigned char const* const old_base = out_ptr_vec.front();

  output.emplace();
  output->dictionary = std::move(symtab);
  output->buffer = std::move(buffer);
  output->compressed_data.reserve(size);

  // Build the views relative to the buffer that is actually returned. For
  // short buffers a string move may copy the bytes instead of transferring
  // the allocation, in which case the original pointers would dangle.
  char const* const new_base = output->buffer.data();

  for (size_t i = 0; i < size; ++i) {
    auto const offset = static_cast<size_t>(out_ptr_vec[i] - old_base);
    output->compressed_data.emplace_back(new_base + offset, out_len_vec[i]);
  }

  return output;
}
class fsst_decoder_ : public fsst_decoder::impl {
public:
explicit fsst_decoder_(std::string_view dictionary) {
auto const read = ::fsst_import(
&decoder_, reinterpret_cast<unsigned char const*>(dictionary.data()));
if (read != dictionary.size()) {
throw std::runtime_error(fmt::format(
"read {0} symtab bytes, expected {1}", read, dictionary.size()));
}
}
std::string decompress(std::string_view data) const override {
thread_local std::string out;
auto const size = data.size();
out.resize(8 * size);
auto outlen = ::fsst_decompress(
&decoder_, size, reinterpret_cast<unsigned char const*>(data.data()),
out.size(), reinterpret_cast<unsigned char*>(out.data()));
out.resize(outlen);
return out;
}
private:
::fsst_decoder_t decoder_;
};
} // namespace
auto fsst_encoder::compress(std::span<std::string_view const> data, bool force)
-> std::optional<bulk_compression_result> {
return fsst_compress_(data, force);
}
auto fsst_encoder::compress(std::span<std::string const> data, bool force)
-> std::optional<bulk_compression_result> {
return fsst_compress_(data, force);
}
// Creates the FSST-backed implementation for the given dictionary; any
// exception thrown while constructing it propagates to the caller.
fsst_decoder::fsst_decoder(std::string_view dictionary)
    : impl_(std::make_unique<fsst_decoder_>(dictionary)) {
}
} // namespace dwarfs::internal

View File

@ -31,11 +31,10 @@
#include <fmt/format.h>
#include <fsst.h>
#include <dwarfs/error.h>
#include <dwarfs/logger.h>
#include <dwarfs/internal/fsst.h>
#include <dwarfs/internal/string_table.h>
namespace dwarfs::internal {
@ -78,16 +77,8 @@ class packed_string_table : public string_table::impl {
auto st = v_.symtab();
DWARFS_CHECK(st, "symtab unexpectedly unset");
dec_ = std::make_unique<fsst_decoder_t>();
auto read = fsst_import(
dec_.get(), reinterpret_cast<unsigned char const*>(st->data()));
if (read != st->size()) {
DWARFS_THROW(runtime_error,
fmt::format("read {0} symtab bytes, expected {1}", read,
st->size()));
}
dec_.emplace(st.value());
ti << "imported dictionary for " << name << " string table";
}
@ -118,14 +109,7 @@ class packed_string_table : public string_table::impl {
}
if constexpr (PackedData) {
thread_local std::string out;
size_t size = end - beg;
out.resize(8 * size);
auto outlen = fsst_decompress(
dec_.get(), size, reinterpret_cast<unsigned char const*>(beg),
out.size(), reinterpret_cast<unsigned char*>(out.data()));
out.resize(outlen);
return out;
return dec_->decompress(std::string_view{beg, end});
}
return {beg, end};
@ -158,7 +142,7 @@ class packed_string_table : public string_table::impl {
string_table::PackedTableView v_;
char const* const buffer_;
std::vector<uint32_t> index_;
std::unique_ptr<fsst_decoder_t> dec_;
std::optional<fsst_decoder> dec_;
};
string_table::string_table(LegacyTableView v)
@ -191,94 +175,28 @@ template <typename T>
thrift::metadata::string_table
string_table::pack_generic(std::span<T const> input,
pack_options const& options) {
auto size = input.size();
bool pack_data = options.pack_data;
size_t total_input_size = 0;
std::string buffer;
std::string symtab;
std::vector<size_t> out_len_vec;
std::vector<unsigned char*> out_ptr_vec;
auto const size = input.size();
std::optional<fsst_encoder::bulk_compression_result> res;
if (input.empty()) {
pack_data = false;
}
if (pack_data) {
std::vector<size_t> len_vec;
std::vector<unsigned char const*> ptr_vec;
len_vec.reserve(size);
ptr_vec.reserve(size);
for (auto const& s : input) {
ptr_vec.emplace_back(reinterpret_cast<unsigned char const*>(s.data()));
len_vec.emplace_back(s.size());
total_input_size += s.size();
}
std::unique_ptr<::fsst_encoder_t, decltype(&::fsst_destroy)> enc{
::fsst_create(size, len_vec.data(), ptr_vec.data(), 0),
&::fsst_destroy};
symtab.resize(sizeof(::fsst_decoder_t));
auto symtab_size = ::fsst_export(
enc.get(), reinterpret_cast<unsigned char*>(symtab.data()));
symtab.resize(symtab_size);
if (symtab.size() < total_input_size or options.force_pack_data) {
out_len_vec.resize(size);
out_ptr_vec.resize(size);
buffer.resize(options.force_pack_data ? total_input_size
: total_input_size - symtab.size());
size_t num_compressed = 0;
for (;;) {
num_compressed = ::fsst_compress(
enc.get(), size, len_vec.data(), ptr_vec.data(), buffer.size(),
reinterpret_cast<unsigned char*>(buffer.data()), out_len_vec.data(),
out_ptr_vec.data());
if (num_compressed == size || !options.force_pack_data) {
break;
}
buffer.resize(2 * buffer.size());
}
pack_data = num_compressed == size;
} else {
pack_data = false;
}
} else {
for (auto const& s : input) {
total_input_size += s.size();
}
if (options.pack_data) {
res = fsst_encoder::compress(input, options.force_pack_data);
}
thrift::metadata::string_table output;
if (pack_data) {
if (res.has_value()) {
// store compressed
size_t compressed_size =
(out_ptr_vec.back() - out_ptr_vec.front()) + out_len_vec.back();
DWARFS_CHECK(reinterpret_cast<char*>(out_ptr_vec.front()) == buffer.data(),
"string table compression pointer mismatch");
// TODO: only enable this in debug mode
DWARFS_CHECK(compressed_size == std::accumulate(out_len_vec.begin(),
out_len_vec.end(),
static_cast<size_t>(0)),
"string table compression pointer mismatch");
buffer.resize(compressed_size);
output.buffer()->swap(buffer);
output.symtab() = std::move(symtab);
output.buffer() = std::move(res->buffer);
output.symtab() = std::move(res->dictionary);
output.index()->resize(size);
std::ranges::copy(out_len_vec, output.index()->begin());
for (size_t i = 0; i < size; ++i) {
output.index()[i] = res->compressed_data[i].size();
}
} else {
// store uncompressed
auto const total_input_size =
std::accumulate(input.begin(), input.end(), size_t{0},
[](size_t n, auto const& s) { return n + s.size(); });
output.buffer()->reserve(total_input_size);
output.index()->reserve(size);
for (auto const& s : input) {

1135
test/fsst_test.cpp Normal file

File diff suppressed because it is too large Load Diff

View File

@ -2114,11 +2114,9 @@ TEST(mkdwarfs_test, pack_mode_all) {
auto fs = t.fs_from_stdout();
auto info =
fs.info_as_json({.features = reader::fsinfo_features::for_level(2)});
std::set<std::string> expected = {"packed_chunk_table",
"packed_directories",
"packed_names",
"packed_names_index",
"packed_shared_files_table",
std::set<std::string> expected = {
"packed_chunk_table", "packed_directories", "packed_names",
"packed_names_index", "packed_shared_files_table", "packed_symlinks",
"packed_symlinks_index"};
std::set<std::string> fsopt;
for (auto const& opt : info["options"]) {