mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-08 20:12:56 -04:00
feat: optionally add category metadata to file system metadata
This commit is contained in:
parent
886a107a0e
commit
ca2e5c9e35
@ -383,6 +383,22 @@ Most other options are concerned with compression tuning:
|
||||
Don't add a creation timestamp. This is useful when bit-identical file
|
||||
system images are required to be produced from the same input.
|
||||
|
||||
- `--no-category-names`:
|
||||
Don't add category names to the file system. This is only relevant if
|
||||
the file system uses categories at all. Adding category names allows
|
||||
for setting per-category options when rewriting a file system, getting
|
||||
per-category info with `dwarfsck`, or seeing the categories that have
|
||||
been identified in a file. This option implies `--no-category-metadata`.
|
||||
|
||||
- `--no-category-metadata`:
|
||||
Don't add category metadata to the file system. Category metadata goes
|
||||
beyond just the category names: it is the information identified
|
||||
during categorization. This metadata is needed for metadata-dependent
|
||||
compression algorithms (e.g. FLAC). So if you're planning to recompress
|
||||
categorized blocks from a metadata-independent to a metadata-dependent
|
||||
compression algorithm, this is going to be impossible if the category
|
||||
metadata is not available.
|
||||
|
||||
- `--file-hash=none`|*name*:
|
||||
Select the hashing function to be used for file deduplication. If `none`
|
||||
is chosen, file deduplication is disabled. By default, the built-in
|
||||
|
@ -38,16 +38,14 @@ class block_manager {
|
||||
|
||||
size_t get_logical_block() const;
|
||||
void set_written_block(size_t logical_block, size_t written_block,
|
||||
fragment_category::value_type category);
|
||||
fragment_category category);
|
||||
void map_logical_blocks(std::vector<chunk_type>& vec) const;
|
||||
std::vector<fragment_category::value_type>
|
||||
get_written_block_categories() const;
|
||||
std::vector<fragment_category> get_written_block_categories() const;
|
||||
|
||||
private:
|
||||
std::mutex mutable mx_;
|
||||
size_t mutable num_blocks_{0};
|
||||
std::vector<std::optional<std::pair<size_t, fragment_category::value_type>>>
|
||||
block_map_;
|
||||
std::vector<std::optional<std::pair<size_t, fragment_category>>> block_map_;
|
||||
};
|
||||
|
||||
} // namespace dwarfs::writer::internal
|
||||
|
@ -87,6 +87,15 @@ class metadata_builder {
|
||||
impl_->set_block_categories(std::move(block_categories));
|
||||
}
|
||||
|
||||
void set_category_metadata_json(std::vector<std::string> metadata_json) {
|
||||
impl_->set_category_metadata_json(std::move(metadata_json));
|
||||
}
|
||||
|
||||
void
|
||||
set_block_category_metadata(std::map<uint32_t, uint32_t> block_metadata) {
|
||||
impl_->set_block_category_metadata(std::move(block_metadata));
|
||||
}
|
||||
|
||||
void add_symlink_table_entry(size_t index, uint32_t entry) {
|
||||
impl_->add_symlink_table_entry(index, entry);
|
||||
}
|
||||
@ -121,6 +130,10 @@ class metadata_builder {
|
||||
set_category_names(std::vector<std::string> category_names) = 0;
|
||||
virtual void
|
||||
set_block_categories(std::vector<uint32_t> block_categories) = 0;
|
||||
virtual void
|
||||
set_category_metadata_json(std::vector<std::string> metadata_json) = 0;
|
||||
virtual void set_block_category_metadata(
|
||||
std::map<uint32_t, uint32_t> block_metadata) = 0;
|
||||
virtual void add_symlink_table_entry(size_t index, uint32_t entry) = 0;
|
||||
virtual void gather_chunks(inode_manager const& im, block_manager const& bm,
|
||||
size_t chunk_count) = 0;
|
||||
|
@ -53,6 +53,8 @@ struct metadata_options {
|
||||
bool pack_symlinks_index{false};
|
||||
bool force_pack_string_tables{false};
|
||||
bool no_create_timestamp{false};
|
||||
bool no_category_names{false};
|
||||
bool no_category_metadata{false};
|
||||
size_t inode_size_cache_min_chunk_count{128};
|
||||
};
|
||||
|
||||
|
@ -294,6 +294,8 @@ void analyze_frozen(std::ostream& os,
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define META_OPT_MAP_SIZE(x) META_OPT_LIST_SIZE(x)
|
||||
|
||||
#define META_OPT_STRING_LIST_SIZE(x) \
|
||||
do { \
|
||||
if (auto list = meta.x()) { \
|
||||
@ -402,6 +404,8 @@ void analyze_frozen(std::ostream& os,
|
||||
|
||||
META_OPT_STRING_LIST_SIZE(category_names);
|
||||
META_OPT_LIST_SIZE(block_categories);
|
||||
META_OPT_STRING_LIST_SIZE(category_metadata_json);
|
||||
META_OPT_MAP_SIZE(block_category_metadata);
|
||||
|
||||
#undef META_LIST_SIZE
|
||||
#undef META_OPT_STRING_SET_SIZE
|
||||
|
@ -38,7 +38,7 @@ size_t block_manager::get_logical_block() const {
|
||||
|
||||
void block_manager::set_written_block(size_t logical_block,
|
||||
size_t written_block,
|
||||
fragment_category::value_type category) {
|
||||
fragment_category category) {
|
||||
std::lock_guard lock{mx_};
|
||||
assert(logical_block < num_blocks_);
|
||||
if (block_map_.size() < num_blocks_) {
|
||||
@ -56,9 +56,9 @@ void block_manager::map_logical_blocks(std::vector<chunk_type>& vec) const {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<fragment_category::value_type>
|
||||
std::vector<fragment_category>
|
||||
block_manager::get_written_block_categories() const {
|
||||
std::vector<fragment_category::value_type> result;
|
||||
std::vector<fragment_category> result;
|
||||
|
||||
{
|
||||
std::lock_guard lock{mx_};
|
||||
|
@ -95,6 +95,16 @@ class metadata_builder_ final : public metadata_builder::impl {
|
||||
md_.block_categories() = std::move(block_categories);
|
||||
}
|
||||
|
||||
void
|
||||
set_category_metadata_json(std::vector<std::string> metadata_json) override {
|
||||
md_.category_metadata_json() = std::move(metadata_json);
|
||||
}
|
||||
|
||||
void set_block_category_metadata(
|
||||
std::map<uint32_t, uint32_t> block_metadata) override {
|
||||
md_.block_category_metadata() = std::move(block_metadata);
|
||||
}
|
||||
|
||||
void add_symlink_table_entry(size_t index, uint32_t entry) override {
|
||||
DWARFS_NOTHROW(md_.symlink_table()->at(index)) = entry;
|
||||
}
|
||||
@ -314,6 +324,17 @@ thrift::metadata::metadata const& metadata_builder_<LoggerPolicy>::build() {
|
||||
ti << "saving symlinks table...";
|
||||
}
|
||||
|
||||
if (options_.no_category_names) {
|
||||
md_.category_names().reset();
|
||||
md_.block_categories().reset();
|
||||
}
|
||||
|
||||
if (options_.no_category_names || options_.no_category_metadata) {
|
||||
md_.category_metadata_json().reset();
|
||||
md_.block_category_metadata().reset();
|
||||
}
|
||||
|
||||
// TODO: don't overwrite all options when upgrading!
|
||||
md_.options() = fsopts;
|
||||
md_.features() = features_.get();
|
||||
md_.dwarfs_version() = std::string("libdwarfs ") + DWARFS_GIT_ID;
|
||||
|
@ -43,6 +43,8 @@
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <range/v3/view/enumerate.hpp>
|
||||
|
||||
#include <dwarfs/error.h>
|
||||
#include <dwarfs/file_access.h>
|
||||
#include <dwarfs/history.h>
|
||||
@ -826,8 +828,7 @@ void scanner_<LoggerPolicy>::scan(
|
||||
[blockmgr, logical_block_num,
|
||||
category](auto physical_block_num) {
|
||||
blockmgr->set_written_block(logical_block_num,
|
||||
physical_block_num,
|
||||
category.value());
|
||||
physical_block_num, category);
|
||||
},
|
||||
meta);
|
||||
});
|
||||
@ -918,30 +919,51 @@ void scanner_<LoggerPolicy>::scan(
|
||||
mdb.set_shared_files_table(std::move(ssfv.get_shared_files()));
|
||||
|
||||
if (auto catmgr = options_.inode.categorizer_mgr) {
|
||||
std::unordered_map<fragment_category::value_type,
|
||||
fragment_category::value_type>
|
||||
std::unordered_map<fragment_category::value_type, uint32_t>
|
||||
category_indices;
|
||||
std::unordered_map<fragment_category, uint32_t> category_metadata_indices;
|
||||
std::vector<std::string> category_names;
|
||||
std::vector<std::string> category_metadata;
|
||||
|
||||
category_indices.reserve(frag_info.info.size());
|
||||
category_names.reserve(frag_info.info.size());
|
||||
|
||||
for (auto const& ci : frag_info.info) {
|
||||
auto [it, inserted] =
|
||||
category_indices.emplace(ci.category, category_names.size());
|
||||
if (inserted) {
|
||||
if (category_indices.emplace(ci.category, category_names.size()).second) {
|
||||
category_names.emplace_back(catmgr->category_name(ci.category));
|
||||
}
|
||||
}
|
||||
|
||||
for (auto const& cat : frag_info.categories) {
|
||||
auto metadata = catmgr->category_metadata(cat);
|
||||
if (!metadata.empty()) {
|
||||
if (category_metadata_indices.emplace(cat, category_metadata.size())
|
||||
.second) {
|
||||
category_metadata.emplace_back(std::move(metadata));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto written_categories = blockmgr->get_written_block_categories();
|
||||
std::vector<uint32_t> block_categories(written_categories.size());
|
||||
std::map<uint32_t, uint32_t> block_cat_metadata;
|
||||
|
||||
std::transform(written_categories.begin(), written_categories.end(),
|
||||
written_categories.begin(),
|
||||
[&](auto const& cat) { return category_indices.at(cat); });
|
||||
block_categories.begin(), [&](auto const& cat) {
|
||||
return category_indices.at(cat.value());
|
||||
});
|
||||
|
||||
for (auto const& [i, cat] : ranges::views::enumerate(written_categories)) {
|
||||
if (auto it = category_metadata_indices.find(cat);
|
||||
it != category_metadata_indices.end()) {
|
||||
block_cat_metadata.emplace(i, it->second);
|
||||
}
|
||||
}
|
||||
|
||||
mdb.set_category_names(std::move(category_names));
|
||||
mdb.set_block_categories(std::move(written_categories));
|
||||
mdb.set_block_categories(std::move(block_categories));
|
||||
mdb.set_category_metadata_json(std::move(category_metadata));
|
||||
mdb.set_block_category_metadata(std::move(block_cat_metadata));
|
||||
}
|
||||
|
||||
mdb.set_block_size(segmenter_factory_.get_block_size());
|
||||
|
@ -154,7 +154,7 @@ void run_segmenter_benchmark(::benchmark::State& state, unsigned granularity,
|
||||
auto logical_block_num) {
|
||||
auto physical_block_num = written.size();
|
||||
written.push_back(blk);
|
||||
blkmgr->set_written_block(logical_block_num, physical_block_num, 0);
|
||||
blkmgr->set_written_block(logical_block_num, physical_block_num, {});
|
||||
});
|
||||
|
||||
// begin benchmarking code
|
||||
|
@ -422,4 +422,18 @@ struct metadata {
|
||||
|
||||
// Size cache for highly fragmented file inodes
|
||||
30: optional inode_size_cache reg_file_size_cache
|
||||
|
||||
//==========================================================//
|
||||
// fields added with dwarfs-0.13.0, file system version 2.5 //
|
||||
//==========================================================//
|
||||
|
||||
// Unique block categorization metadata JSON strings. These
|
||||
// can be used to compress a block with a metadata-dependent
|
||||
// algorithm after having been compressed with a general
|
||||
// purpose algorithm.
|
||||
31: optional list<string> category_metadata_json
|
||||
|
||||
// The metadata associated with each block. Maps from block
|
||||
// number to index into `category_metadata_json`.
|
||||
32: optional map<UInt32, UInt32> block_category_metadata
|
||||
}
|
||||
|
@ -663,6 +663,12 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
|
||||
("time-resolution",
|
||||
po::value<std::string>(&time_resolution)->default_value("sec"),
|
||||
resolution_desc.c_str())
|
||||
("no-category-names",
|
||||
po::value<bool>(&options.metadata.no_category_names)->zero_tokens(),
|
||||
"don't add category names to file system")
|
||||
("no-category-metadata",
|
||||
po::value<bool>(&options.metadata.no_category_metadata)->zero_tokens(),
|
||||
"don't add category metadata to file system")
|
||||
("pack-metadata,P",
|
||||
po::value<std::string>(&pack_metadata)->default_value("auto"),
|
||||
"pack certain metadata elements (auto, all, none, chunk_table, "
|
||||
|
Loading…
x
Reference in New Issue
Block a user