feat: optionally add category metadata to file system metadata

This commit is contained in:
Marcus Holland-Moritz 2024-11-28 14:45:51 +01:00
parent 886a107a0e
commit ca2e5c9e35
11 changed files with 115 additions and 19 deletions

View File

@ -383,6 +383,22 @@ Most other options are concerned with compression tuning:
Don't add a creation timestamp. This is useful when bit-identical file
system images are required to be produced from the same input.
- `--no-category-names`:
Don't add category names to the file system. This is only relevant if
the file system uses categories at all. Adding category names allows
setting per-category options when rewriting a file system, getting
per-category info with `dwarfsck`, and seeing the categories that have
been identified in a file. This option implies `--no-category-metadata`.
- `--no-category-metadata`:
Don't add category metadata to the file system. Category metadata goes
beyond just the category names: it is the metadata identified during
categorization. This metadata is needed for metadata-dependent
compression algorithms (e.g. FLAC). So if you're planning to recompress
categorized blocks from a metadata-independent to a metadata-dependent
compression algorithm, this will be impossible if the metadata is not
available.
- `--file-hash=none`|*name*:
Select the hashing function to be used for file deduplication. If `none`
is chosen, file deduplication is disabled. By default, the built-in

View File

@ -38,16 +38,14 @@ class block_manager {
size_t get_logical_block() const;
void set_written_block(size_t logical_block, size_t written_block,
fragment_category::value_type category);
fragment_category category);
void map_logical_blocks(std::vector<chunk_type>& vec) const;
std::vector<fragment_category::value_type>
get_written_block_categories() const;
std::vector<fragment_category> get_written_block_categories() const;
private:
std::mutex mutable mx_;
size_t mutable num_blocks_{0};
std::vector<std::optional<std::pair<size_t, fragment_category::value_type>>>
block_map_;
std::vector<std::optional<std::pair<size_t, fragment_category>>> block_map_;
};
} // namespace dwarfs::writer::internal

View File

@ -87,6 +87,15 @@ class metadata_builder {
impl_->set_block_categories(std::move(block_categories));
}
// Hands the list of category metadata JSON strings to the implementation.
// The vector is taken by value and moved on, so callers can std::move
// their own copy in to avoid copying the strings.
void set_category_metadata_json(std::vector<std::string> metadata_json) {
impl_->set_category_metadata_json(std::move(metadata_json));
}
// Hands the block category metadata map to the implementation. The map is
// taken by value and moved on.
// NOTE(review): presumably maps block number -> index into the category
// metadata JSON list; confirm against the caller and the metadata schema.
void
set_block_category_metadata(std::map<uint32_t, uint32_t> block_metadata) {
impl_->set_block_category_metadata(std::move(block_metadata));
}
// Forwards a single symlink table entry (position `index`, value `entry`)
// to the implementation.
void add_symlink_table_entry(size_t index, uint32_t entry) {
impl_->add_symlink_table_entry(index, entry);
}
@ -121,6 +130,10 @@ class metadata_builder {
set_category_names(std::vector<std::string> category_names) = 0;
virtual void
set_block_categories(std::vector<uint32_t> block_categories) = 0;
virtual void
set_category_metadata_json(std::vector<std::string> metadata_json) = 0;
virtual void set_block_category_metadata(
std::map<uint32_t, uint32_t> block_metadata) = 0;
virtual void add_symlink_table_entry(size_t index, uint32_t entry) = 0;
virtual void gather_chunks(inode_manager const& im, block_manager const& bm,
size_t chunk_count) = 0;

View File

@ -53,6 +53,8 @@ struct metadata_options {
bool pack_symlinks_index{false};
bool force_pack_string_tables{false};
bool no_create_timestamp{false};
bool no_category_names{false};
bool no_category_metadata{false};
size_t inode_size_cache_min_chunk_count{128};
};

View File

@ -294,6 +294,8 @@ void analyze_frozen(std::ostream& os,
} \
} while (0)
// Optional map fields are reported exactly like optional list fields:
// this is a plain alias for META_OPT_LIST_SIZE.
#define META_OPT_MAP_SIZE(x) META_OPT_LIST_SIZE(x)
#define META_OPT_STRING_LIST_SIZE(x) \
do { \
if (auto list = meta.x()) { \
@ -402,6 +404,8 @@ void analyze_frozen(std::ostream& os,
META_OPT_STRING_LIST_SIZE(category_names);
META_OPT_LIST_SIZE(block_categories);
META_OPT_STRING_LIST_SIZE(category_metadata_json);
META_OPT_MAP_SIZE(block_category_metadata);
#undef META_LIST_SIZE
#undef META_OPT_STRING_SET_SIZE

View File

@ -38,7 +38,7 @@ size_t block_manager::get_logical_block() const {
void block_manager::set_written_block(size_t logical_block,
size_t written_block,
fragment_category::value_type category) {
fragment_category category) {
std::lock_guard lock{mx_};
assert(logical_block < num_blocks_);
if (block_map_.size() < num_blocks_) {
@ -56,9 +56,9 @@ void block_manager::map_logical_blocks(std::vector<chunk_type>& vec) const {
}
}
std::vector<fragment_category::value_type>
std::vector<fragment_category>
block_manager::get_written_block_categories() const {
std::vector<fragment_category::value_type> result;
std::vector<fragment_category> result;
{
std::lock_guard lock{mx_};

View File

@ -95,6 +95,16 @@ class metadata_builder_ final : public metadata_builder::impl {
md_.block_categories() = std::move(block_categories);
}
// Stores the category metadata JSON strings in the metadata being built
// (`md_.category_metadata_json()`), replacing any previous value. The
// vector is moved in rather than copied.
void
set_category_metadata_json(std::vector<std::string> metadata_json) override {
md_.category_metadata_json() = std::move(metadata_json);
}
// Stores the block category metadata map in the metadata being built
// (`md_.block_category_metadata()`), replacing any previous value. The
// map is moved in rather than copied.
void set_block_category_metadata(
std::map<uint32_t, uint32_t> block_metadata) override {
md_.block_category_metadata() = std::move(block_metadata);
}
// Writes `entry` into slot `index` of the symlink table. The slot is
// accessed with `at(index)`, so the table must already hold more than
// `index` elements.
// NOTE(review): DWARFS_NOTHROW presumably turns an out-of-range `at()`
// into a hard failure instead of an exception -- confirm in its definition.
void add_symlink_table_entry(size_t index, uint32_t entry) override {
DWARFS_NOTHROW(md_.symlink_table()->at(index)) = entry;
}
@ -314,6 +324,17 @@ thrift::metadata::metadata const& metadata_builder_<LoggerPolicy>::build() {
ti << "saving symlinks table...";
}
if (options_.no_category_names) {
md_.category_names().reset();
md_.block_categories().reset();
}
if (options_.no_category_names || options_.no_category_metadata) {
md_.category_metadata_json().reset();
md_.block_category_metadata().reset();
}
// TODO: don't overwrite all options when upgrading!
md_.options() = fsopts;
md_.features() = features_.get();
md_.dwarfs_version() = std::string("libdwarfs ") + DWARFS_GIT_ID;

View File

@ -43,6 +43,8 @@
#include <fmt/format.h>
#include <range/v3/view/enumerate.hpp>
#include <dwarfs/error.h>
#include <dwarfs/file_access.h>
#include <dwarfs/history.h>
@ -826,8 +828,7 @@ void scanner_<LoggerPolicy>::scan(
[blockmgr, logical_block_num,
category](auto physical_block_num) {
blockmgr->set_written_block(logical_block_num,
physical_block_num,
category.value());
physical_block_num, category);
},
meta);
});
@ -918,30 +919,51 @@ void scanner_<LoggerPolicy>::scan(
mdb.set_shared_files_table(std::move(ssfv.get_shared_files()));
if (auto catmgr = options_.inode.categorizer_mgr) {
std::unordered_map<fragment_category::value_type,
fragment_category::value_type>
std::unordered_map<fragment_category::value_type, uint32_t>
category_indices;
std::unordered_map<fragment_category, uint32_t> category_metadata_indices;
std::vector<std::string> category_names;
std::vector<std::string> category_metadata;
category_indices.reserve(frag_info.info.size());
category_names.reserve(frag_info.info.size());
for (auto const& ci : frag_info.info) {
auto [it, inserted] =
category_indices.emplace(ci.category, category_names.size());
if (inserted) {
if (category_indices.emplace(ci.category, category_names.size()).second) {
category_names.emplace_back(catmgr->category_name(ci.category));
}
}
for (auto const& cat : frag_info.categories) {
auto metadata = catmgr->category_metadata(cat);
if (!metadata.empty()) {
if (category_metadata_indices.emplace(cat, category_metadata.size())
.second) {
category_metadata.emplace_back(std::move(metadata));
}
}
}
auto written_categories = blockmgr->get_written_block_categories();
std::vector<uint32_t> block_categories(written_categories.size());
std::map<uint32_t, uint32_t> block_cat_metadata;
std::transform(written_categories.begin(), written_categories.end(),
written_categories.begin(),
[&](auto const& cat) { return category_indices.at(cat); });
block_categories.begin(), [&](auto const& cat) {
return category_indices.at(cat.value());
});
for (auto const& [i, cat] : ranges::views::enumerate(written_categories)) {
if (auto it = category_metadata_indices.find(cat);
it != category_metadata_indices.end()) {
block_cat_metadata.emplace(i, it->second);
}
}
mdb.set_category_names(std::move(category_names));
mdb.set_block_categories(std::move(written_categories));
mdb.set_block_categories(std::move(block_categories));
mdb.set_category_metadata_json(std::move(category_metadata));
mdb.set_block_category_metadata(std::move(block_cat_metadata));
}
mdb.set_block_size(segmenter_factory_.get_block_size());

View File

@ -154,7 +154,7 @@ void run_segmenter_benchmark(::benchmark::State& state, unsigned granularity,
auto logical_block_num) {
auto physical_block_num = written.size();
written.push_back(blk);
blkmgr->set_written_block(logical_block_num, physical_block_num, 0);
blkmgr->set_written_block(logical_block_num, physical_block_num, {});
});
// begin benchmarking code

View File

@ -422,4 +422,18 @@ struct metadata {
// Size cache for highly fragmented file inodes
30: optional inode_size_cache reg_file_size_cache
//==========================================================//
// fields added with dwarfs-0.13.0, file system version 2.5 //
//==========================================================//
// Unique block categorization metadata JSON strings. These
// can be used to compress a block with a metadata-dependent
// algorithm after having been compressed with a general
// purpose algorithm.
31: optional list<string> category_metadata_json
// The metadata associated with each block. Maps from block
// number to index into `category_metadata_json`.
32: optional map<UInt32, UInt32> block_category_metadata
}

View File

@ -663,6 +663,12 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
("time-resolution",
po::value<std::string>(&time_resolution)->default_value("sec"),
resolution_desc.c_str())
("no-category-names",
po::value<bool>(&options.metadata.no_category_names)->zero_tokens(),
"don't add category names to file system")
("no-category-metadata",
po::value<bool>(&options.metadata.no_category_metadata)->zero_tokens(),
"don't add category metadata to file system")
("pack-metadata,P",
po::value<std::string>(&pack_metadata)->default_value("auto"),
"pack certain metadata elements (auto, all, none, chunk_table, "