mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-09 04:19:10 -04:00
refactor: factor out fsst components
This commit is contained in:
parent
1e48783d6d
commit
8893513c8f
@ -495,6 +495,7 @@ if(WITH_TESTS)
|
|||||||
test/filesystem_test.cpp
|
test/filesystem_test.cpp
|
||||||
test/filesystem_writer_test.cpp
|
test/filesystem_writer_test.cpp
|
||||||
test/fragment_category_test.cpp
|
test/fragment_category_test.cpp
|
||||||
|
test/fsst_test.cpp
|
||||||
test/glob_matcher_test.cpp
|
test/glob_matcher_test.cpp
|
||||||
test/global_metadata_test.cpp
|
test/global_metadata_test.cpp
|
||||||
test/integral_value_parser_test.cpp
|
test/integral_value_parser_test.cpp
|
||||||
|
@ -51,6 +51,7 @@ add_library(
|
|||||||
src/internal/file_status_conv.cpp
|
src/internal/file_status_conv.cpp
|
||||||
src/internal/fs_section.cpp
|
src/internal/fs_section.cpp
|
||||||
src/internal/fs_section_checker.cpp
|
src/internal/fs_section_checker.cpp
|
||||||
|
src/internal/fsst.cpp
|
||||||
src/internal/glob_to_regex.cpp
|
src/internal/glob_to_regex.cpp
|
||||||
src/internal/malloc_buffer.cpp
|
src/internal/malloc_buffer.cpp
|
||||||
src/internal/metadata_utils.cpp
|
src/internal/metadata_utils.cpp
|
||||||
|
73
include/dwarfs/internal/fsst.h
Normal file
73
include/dwarfs/internal/fsst.h
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||||
|
/**
|
||||||
|
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||||
|
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||||
|
*
|
||||||
|
* This file is part of dwarfs.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the “Software”), to deal
|
||||||
|
* in the Software without restriction, including without limitation the rights
|
||||||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the Software is
|
||||||
|
* furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <optional>
|
||||||
|
#include <span>
|
||||||
|
#include <string>
|
||||||
|
#include <string_view>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace dwarfs::internal {
|
||||||
|
|
||||||
|
class fsst_encoder {
|
||||||
|
public:
|
||||||
|
struct bulk_compression_result {
|
||||||
|
std::string dictionary;
|
||||||
|
std::string buffer;
|
||||||
|
std::vector<std::string_view> compressed_data;
|
||||||
|
};
|
||||||
|
|
||||||
|
static std::optional<bulk_compression_result>
|
||||||
|
compress(std::span<std::string_view const> data, bool force = false);
|
||||||
|
static std::optional<bulk_compression_result>
|
||||||
|
compress(std::span<std::string const> data, bool force = false);
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Decoder for strings that were compressed with an FSST dictionary.
 *
 * All work is forwarded to an `impl` object that is created by the
 * constructor.
 */
class fsst_decoder {
 public:
  /// Interface of the object that performs the actual decompression.
  class impl {
   public:
    virtual ~impl() = default;

    /// Return the decompressed form of `data`.
    virtual auto decompress(std::string_view data) const -> std::string = 0;
  };

  /// Create a decoder from an exported dictionary (symbol table).
  explicit fsst_decoder(std::string_view dictionary);

  /// Decompress a single compressed string.
  auto decompress(std::string_view data) const -> std::string {
    return impl_->decompress(data);
  }

 private:
  std::unique_ptr<impl const> impl_;
};
|
||||||
|
|
||||||
|
} // namespace dwarfs::internal
|
174
src/internal/fsst.cpp
Normal file
174
src/internal/fsst.cpp
Normal file
@ -0,0 +1,174 @@
|
|||||||
|
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||||
|
/**
|
||||||
|
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||||
|
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||||
|
*
|
||||||
|
* This file is part of dwarfs.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the “Software”), to deal
|
||||||
|
* in the Software without restriction, including without limitation the rights
|
||||||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the Software is
|
||||||
|
* furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <numeric>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
#include <dwarfs/internal/fsst.h>
|
||||||
|
|
||||||
|
#include <fmt/format.h>
|
||||||
|
|
||||||
|
#include <fsst.h>
|
||||||
|
|
||||||
|
namespace dwarfs::internal {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
std::optional<fsst_encoder::bulk_compression_result>
|
||||||
|
fsst_compress_(std::span<T const> input, bool force) {
|
||||||
|
std::optional<fsst_encoder::bulk_compression_result> output;
|
||||||
|
|
||||||
|
if (input.empty()) {
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto const size = input.size();
|
||||||
|
size_t total_input_size = 0;
|
||||||
|
std::vector<size_t> len_vec;
|
||||||
|
std::vector<unsigned char const*> ptr_vec;
|
||||||
|
|
||||||
|
len_vec.reserve(size);
|
||||||
|
ptr_vec.reserve(size);
|
||||||
|
|
||||||
|
for (auto const& s : input) {
|
||||||
|
ptr_vec.emplace_back(reinterpret_cast<unsigned char const*>(s.data()));
|
||||||
|
len_vec.emplace_back(s.size());
|
||||||
|
total_input_size += s.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<::fsst_encoder_t, decltype(&::fsst_destroy)> enc{
|
||||||
|
::fsst_create(size, len_vec.data(), ptr_vec.data(), 0), &::fsst_destroy};
|
||||||
|
|
||||||
|
std::string symtab;
|
||||||
|
|
||||||
|
symtab.resize(sizeof(::fsst_decoder_t));
|
||||||
|
|
||||||
|
auto const symtab_size =
|
||||||
|
::fsst_export(enc.get(), reinterpret_cast<unsigned char*>(symtab.data()));
|
||||||
|
symtab.resize(symtab_size);
|
||||||
|
|
||||||
|
std::vector<size_t> out_len_vec;
|
||||||
|
std::vector<unsigned char*> out_ptr_vec;
|
||||||
|
std::string buffer;
|
||||||
|
|
||||||
|
out_len_vec.resize(size);
|
||||||
|
out_ptr_vec.resize(size);
|
||||||
|
|
||||||
|
if (symtab_size >= total_input_size && !force) {
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer.resize(total_input_size);
|
||||||
|
|
||||||
|
for (;;) {
|
||||||
|
auto const num_compressed = ::fsst_compress(
|
||||||
|
enc.get(), size, len_vec.data(), ptr_vec.data(), buffer.size(),
|
||||||
|
reinterpret_cast<unsigned char*>(buffer.data()), out_len_vec.data(),
|
||||||
|
out_ptr_vec.data());
|
||||||
|
|
||||||
|
if (num_compressed == size) {
|
||||||
|
break;
|
||||||
|
} else if (!force) {
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer.resize(2 * buffer.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t const compressed_size =
|
||||||
|
(out_ptr_vec.back() - out_ptr_vec.front()) + out_len_vec.back();
|
||||||
|
|
||||||
|
if (symtab_size + compressed_size >= total_input_size && !force) {
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(reinterpret_cast<char*>(out_ptr_vec.front()) == buffer.data());
|
||||||
|
assert(compressed_size == std::accumulate(out_len_vec.begin(),
|
||||||
|
out_len_vec.end(),
|
||||||
|
static_cast<size_t>(0)));
|
||||||
|
|
||||||
|
buffer.resize(compressed_size);
|
||||||
|
|
||||||
|
output.emplace();
|
||||||
|
|
||||||
|
output->dictionary = std::move(symtab);
|
||||||
|
output->buffer = std::move(buffer);
|
||||||
|
output->compressed_data.reserve(size);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < size; ++i) {
|
||||||
|
output->compressed_data.emplace_back(std::string_view(
|
||||||
|
reinterpret_cast<char*>(out_ptr_vec[i]), out_len_vec[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
class fsst_decoder_ : public fsst_decoder::impl {
|
||||||
|
public:
|
||||||
|
explicit fsst_decoder_(std::string_view dictionary) {
|
||||||
|
auto const read = ::fsst_import(
|
||||||
|
&decoder_, reinterpret_cast<unsigned char const*>(dictionary.data()));
|
||||||
|
if (read != dictionary.size()) {
|
||||||
|
throw std::runtime_error(fmt::format(
|
||||||
|
"read {0} symtab bytes, expected {1}", read, dictionary.size()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string decompress(std::string_view data) const override {
|
||||||
|
thread_local std::string out;
|
||||||
|
auto const size = data.size();
|
||||||
|
out.resize(8 * size);
|
||||||
|
auto outlen = ::fsst_decompress(
|
||||||
|
&decoder_, size, reinterpret_cast<unsigned char const*>(data.data()),
|
||||||
|
out.size(), reinterpret_cast<unsigned char*>(out.data()));
|
||||||
|
out.resize(outlen);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
::fsst_decoder_t decoder_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
auto fsst_encoder::compress(std::span<std::string_view const> data, bool force)
|
||||||
|
-> std::optional<bulk_compression_result> {
|
||||||
|
return fsst_compress_(data, force);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto fsst_encoder::compress(std::span<std::string const> data, bool force)
|
||||||
|
-> std::optional<bulk_compression_result> {
|
||||||
|
return fsst_compress_(data, force);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create the FSST-backed implementation from the given dictionary.
fsst_decoder::fsst_decoder(std::string_view dictionary)
    : impl_(std::make_unique<fsst_decoder_>(dictionary)) {
}
|
||||||
|
|
||||||
|
} // namespace dwarfs::internal
|
@ -31,11 +31,10 @@
|
|||||||
|
|
||||||
#include <fmt/format.h>
|
#include <fmt/format.h>
|
||||||
|
|
||||||
#include <fsst.h>
|
|
||||||
|
|
||||||
#include <dwarfs/error.h>
|
#include <dwarfs/error.h>
|
||||||
#include <dwarfs/logger.h>
|
#include <dwarfs/logger.h>
|
||||||
|
|
||||||
|
#include <dwarfs/internal/fsst.h>
|
||||||
#include <dwarfs/internal/string_table.h>
|
#include <dwarfs/internal/string_table.h>
|
||||||
|
|
||||||
namespace dwarfs::internal {
|
namespace dwarfs::internal {
|
||||||
@ -78,16 +77,8 @@ class packed_string_table : public string_table::impl {
|
|||||||
|
|
||||||
auto st = v_.symtab();
|
auto st = v_.symtab();
|
||||||
DWARFS_CHECK(st, "symtab unexpectedly unset");
|
DWARFS_CHECK(st, "symtab unexpectedly unset");
|
||||||
dec_ = std::make_unique<fsst_decoder_t>();
|
|
||||||
|
|
||||||
auto read = fsst_import(
|
dec_.emplace(st.value());
|
||||||
dec_.get(), reinterpret_cast<unsigned char const*>(st->data()));
|
|
||||||
|
|
||||||
if (read != st->size()) {
|
|
||||||
DWARFS_THROW(runtime_error,
|
|
||||||
fmt::format("read {0} symtab bytes, expected {1}", read,
|
|
||||||
st->size()));
|
|
||||||
}
|
|
||||||
|
|
||||||
ti << "imported dictionary for " << name << " string table";
|
ti << "imported dictionary for " << name << " string table";
|
||||||
}
|
}
|
||||||
@ -118,14 +109,7 @@ class packed_string_table : public string_table::impl {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (PackedData) {
|
if constexpr (PackedData) {
|
||||||
thread_local std::string out;
|
return dec_->decompress(std::string_view{beg, end});
|
||||||
size_t size = end - beg;
|
|
||||||
out.resize(8 * size);
|
|
||||||
auto outlen = fsst_decompress(
|
|
||||||
dec_.get(), size, reinterpret_cast<unsigned char const*>(beg),
|
|
||||||
out.size(), reinterpret_cast<unsigned char*>(out.data()));
|
|
||||||
out.resize(outlen);
|
|
||||||
return out;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return {beg, end};
|
return {beg, end};
|
||||||
@ -158,7 +142,7 @@ class packed_string_table : public string_table::impl {
|
|||||||
string_table::PackedTableView v_;
|
string_table::PackedTableView v_;
|
||||||
char const* const buffer_;
|
char const* const buffer_;
|
||||||
std::vector<uint32_t> index_;
|
std::vector<uint32_t> index_;
|
||||||
std::unique_ptr<fsst_decoder_t> dec_;
|
std::optional<fsst_decoder> dec_;
|
||||||
};
|
};
|
||||||
|
|
||||||
string_table::string_table(LegacyTableView v)
|
string_table::string_table(LegacyTableView v)
|
||||||
@ -191,94 +175,28 @@ template <typename T>
|
|||||||
thrift::metadata::string_table
|
thrift::metadata::string_table
|
||||||
string_table::pack_generic(std::span<T const> input,
|
string_table::pack_generic(std::span<T const> input,
|
||||||
pack_options const& options) {
|
pack_options const& options) {
|
||||||
auto size = input.size();
|
auto const size = input.size();
|
||||||
bool pack_data = options.pack_data;
|
std::optional<fsst_encoder::bulk_compression_result> res;
|
||||||
size_t total_input_size = 0;
|
|
||||||
std::string buffer;
|
|
||||||
std::string symtab;
|
|
||||||
std::vector<size_t> out_len_vec;
|
|
||||||
std::vector<unsigned char*> out_ptr_vec;
|
|
||||||
|
|
||||||
if (input.empty()) {
|
if (options.pack_data) {
|
||||||
pack_data = false;
|
res = fsst_encoder::compress(input, options.force_pack_data);
|
||||||
}
|
|
||||||
|
|
||||||
if (pack_data) {
|
|
||||||
std::vector<size_t> len_vec;
|
|
||||||
std::vector<unsigned char const*> ptr_vec;
|
|
||||||
|
|
||||||
len_vec.reserve(size);
|
|
||||||
ptr_vec.reserve(size);
|
|
||||||
|
|
||||||
for (auto const& s : input) {
|
|
||||||
ptr_vec.emplace_back(reinterpret_cast<unsigned char const*>(s.data()));
|
|
||||||
len_vec.emplace_back(s.size());
|
|
||||||
total_input_size += s.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
std::unique_ptr<::fsst_encoder_t, decltype(&::fsst_destroy)> enc{
|
|
||||||
::fsst_create(size, len_vec.data(), ptr_vec.data(), 0),
|
|
||||||
&::fsst_destroy};
|
|
||||||
|
|
||||||
symtab.resize(sizeof(::fsst_decoder_t));
|
|
||||||
|
|
||||||
auto symtab_size = ::fsst_export(
|
|
||||||
enc.get(), reinterpret_cast<unsigned char*>(symtab.data()));
|
|
||||||
symtab.resize(symtab_size);
|
|
||||||
|
|
||||||
if (symtab.size() < total_input_size or options.force_pack_data) {
|
|
||||||
out_len_vec.resize(size);
|
|
||||||
out_ptr_vec.resize(size);
|
|
||||||
|
|
||||||
buffer.resize(options.force_pack_data ? total_input_size
|
|
||||||
: total_input_size - symtab.size());
|
|
||||||
size_t num_compressed = 0;
|
|
||||||
|
|
||||||
for (;;) {
|
|
||||||
num_compressed = ::fsst_compress(
|
|
||||||
enc.get(), size, len_vec.data(), ptr_vec.data(), buffer.size(),
|
|
||||||
reinterpret_cast<unsigned char*>(buffer.data()), out_len_vec.data(),
|
|
||||||
out_ptr_vec.data());
|
|
||||||
|
|
||||||
if (num_compressed == size || !options.force_pack_data) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
buffer.resize(2 * buffer.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
pack_data = num_compressed == size;
|
|
||||||
} else {
|
|
||||||
pack_data = false;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (auto const& s : input) {
|
|
||||||
total_input_size += s.size();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
thrift::metadata::string_table output;
|
thrift::metadata::string_table output;
|
||||||
|
|
||||||
if (pack_data) {
|
if (res.has_value()) {
|
||||||
// store compressed
|
// store compressed
|
||||||
size_t compressed_size =
|
output.buffer() = std::move(res->buffer);
|
||||||
(out_ptr_vec.back() - out_ptr_vec.front()) + out_len_vec.back();
|
output.symtab() = std::move(res->dictionary);
|
||||||
|
|
||||||
DWARFS_CHECK(reinterpret_cast<char*>(out_ptr_vec.front()) == buffer.data(),
|
|
||||||
"string table compression pointer mismatch");
|
|
||||||
// TODO: only enable this in debug mode
|
|
||||||
DWARFS_CHECK(compressed_size == std::accumulate(out_len_vec.begin(),
|
|
||||||
out_len_vec.end(),
|
|
||||||
static_cast<size_t>(0)),
|
|
||||||
"string table compression pointer mismatch");
|
|
||||||
|
|
||||||
buffer.resize(compressed_size);
|
|
||||||
output.buffer()->swap(buffer);
|
|
||||||
output.symtab() = std::move(symtab);
|
|
||||||
output.index()->resize(size);
|
output.index()->resize(size);
|
||||||
std::ranges::copy(out_len_vec, output.index()->begin());
|
for (size_t i = 0; i < size; ++i) {
|
||||||
|
output.index()[i] = res->compressed_data[i].size();
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// store uncompressed
|
// store uncompressed
|
||||||
|
auto const total_input_size =
|
||||||
|
std::accumulate(input.begin(), input.end(), size_t{0},
|
||||||
|
[](size_t n, auto const& s) { return n + s.size(); });
|
||||||
output.buffer()->reserve(total_input_size);
|
output.buffer()->reserve(total_input_size);
|
||||||
output.index()->reserve(size);
|
output.index()->reserve(size);
|
||||||
for (auto const& s : input) {
|
for (auto const& s : input) {
|
||||||
|
1135
test/fsst_test.cpp
Normal file
1135
test/fsst_test.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -2114,12 +2114,10 @@ TEST(mkdwarfs_test, pack_mode_all) {
|
|||||||
auto fs = t.fs_from_stdout();
|
auto fs = t.fs_from_stdout();
|
||||||
auto info =
|
auto info =
|
||||||
fs.info_as_json({.features = reader::fsinfo_features::for_level(2)});
|
fs.info_as_json({.features = reader::fsinfo_features::for_level(2)});
|
||||||
std::set<std::string> expected = {"packed_chunk_table",
|
std::set<std::string> expected = {
|
||||||
"packed_directories",
|
"packed_chunk_table", "packed_directories", "packed_names",
|
||||||
"packed_names",
|
"packed_names_index", "packed_shared_files_table", "packed_symlinks",
|
||||||
"packed_names_index",
|
"packed_symlinks_index"};
|
||||||
"packed_shared_files_table",
|
|
||||||
"packed_symlinks_index"};
|
|
||||||
std::set<std::string> fsopt;
|
std::set<std::string> fsopt;
|
||||||
for (auto const& opt : info["options"]) {
|
for (auto const& opt : info["options"]) {
|
||||||
fsopt.insert(opt.get<std::string>());
|
fsopt.insert(opt.get<std::string>());
|
||||||
|
Loading…
x
Reference in New Issue
Block a user