mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-09 12:28:13 -04:00
feat: optionally add category metadata to file system metadata
This commit is contained in:
parent
886a107a0e
commit
ca2e5c9e35
@ -383,6 +383,22 @@ Most other options are concerned with compression tuning:
|
|||||||
Don't add a creation timestamp. This is useful when bit-identical file
|
Don't add a creation timestamp. This is useful when bit-identical file
|
||||||
system images are required to be produced from the same input.
|
system images are required to be produced from the same input.
|
||||||
|
|
||||||
|
- `--no-category-names`:
|
||||||
|
Don't add category names to the file system. This is only relevant if
|
||||||
|
the file system uses categories at all. Adding category names allows
|
||||||
|
for setting per-category options when rewriting a file system, getting
|
||||||
|
per-category info with `dwarfsck`, or seeing the categories that have
|
||||||
|
been identified in a file. This option implies `--no-category-metadata`.
|
||||||
|
|
||||||
|
- `--no-category-metadata`:
|
||||||
|
Don't add category metadata to the file system. This goes beyond just
|
||||||
|
the category names and additionally stores the metadata identified
|
||||||
|
during categorization. This metadata is needed for metadata-dependent
|
||||||
|
compression algorithms (e.g. FLAC). So if you're planning to recompress
|
||||||
|
categorized blocks from a metadata-independent to a metadata-dependent
|
||||||
|
compression algorithm, this is going to be impossible if the metadata
|
||||||
|
information is not available.
|
||||||
|
|
||||||
- `--file-hash=none`|*name*:
|
- `--file-hash=none`|*name*:
|
||||||
Select the hashing function to be used for file deduplication. If `none`
|
Select the hashing function to be used for file deduplication. If `none`
|
||||||
is chosen, file deduplication is disabled. By default, the built-in
|
is chosen, file deduplication is disabled. By default, the built-in
|
||||||
|
@ -38,16 +38,14 @@ class block_manager {
|
|||||||
|
|
||||||
size_t get_logical_block() const;
|
size_t get_logical_block() const;
|
||||||
void set_written_block(size_t logical_block, size_t written_block,
|
void set_written_block(size_t logical_block, size_t written_block,
|
||||||
fragment_category::value_type category);
|
fragment_category category);
|
||||||
void map_logical_blocks(std::vector<chunk_type>& vec) const;
|
void map_logical_blocks(std::vector<chunk_type>& vec) const;
|
||||||
std::vector<fragment_category::value_type>
|
std::vector<fragment_category> get_written_block_categories() const;
|
||||||
get_written_block_categories() const;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::mutex mutable mx_;
|
std::mutex mutable mx_;
|
||||||
size_t mutable num_blocks_{0};
|
size_t mutable num_blocks_{0};
|
||||||
std::vector<std::optional<std::pair<size_t, fragment_category::value_type>>>
|
std::vector<std::optional<std::pair<size_t, fragment_category>>> block_map_;
|
||||||
block_map_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace dwarfs::writer::internal
|
} // namespace dwarfs::writer::internal
|
||||||
|
@ -87,6 +87,15 @@ class metadata_builder {
|
|||||||
impl_->set_block_categories(std::move(block_categories));
|
impl_->set_block_categories(std::move(block_categories));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void set_category_metadata_json(std::vector<std::string> metadata_json) {
|
||||||
|
impl_->set_category_metadata_json(std::move(metadata_json));
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
set_block_category_metadata(std::map<uint32_t, uint32_t> block_metadata) {
|
||||||
|
impl_->set_block_category_metadata(std::move(block_metadata));
|
||||||
|
}
|
||||||
|
|
||||||
void add_symlink_table_entry(size_t index, uint32_t entry) {
|
void add_symlink_table_entry(size_t index, uint32_t entry) {
|
||||||
impl_->add_symlink_table_entry(index, entry);
|
impl_->add_symlink_table_entry(index, entry);
|
||||||
}
|
}
|
||||||
@ -121,6 +130,10 @@ class metadata_builder {
|
|||||||
set_category_names(std::vector<std::string> category_names) = 0;
|
set_category_names(std::vector<std::string> category_names) = 0;
|
||||||
virtual void
|
virtual void
|
||||||
set_block_categories(std::vector<uint32_t> block_categories) = 0;
|
set_block_categories(std::vector<uint32_t> block_categories) = 0;
|
||||||
|
virtual void
|
||||||
|
set_category_metadata_json(std::vector<std::string> metadata_json) = 0;
|
||||||
|
virtual void set_block_category_metadata(
|
||||||
|
std::map<uint32_t, uint32_t> block_metadata) = 0;
|
||||||
virtual void add_symlink_table_entry(size_t index, uint32_t entry) = 0;
|
virtual void add_symlink_table_entry(size_t index, uint32_t entry) = 0;
|
||||||
virtual void gather_chunks(inode_manager const& im, block_manager const& bm,
|
virtual void gather_chunks(inode_manager const& im, block_manager const& bm,
|
||||||
size_t chunk_count) = 0;
|
size_t chunk_count) = 0;
|
||||||
|
@ -53,6 +53,8 @@ struct metadata_options {
|
|||||||
bool pack_symlinks_index{false};
|
bool pack_symlinks_index{false};
|
||||||
bool force_pack_string_tables{false};
|
bool force_pack_string_tables{false};
|
||||||
bool no_create_timestamp{false};
|
bool no_create_timestamp{false};
|
||||||
|
bool no_category_names{false};
|
||||||
|
bool no_category_metadata{false};
|
||||||
size_t inode_size_cache_min_chunk_count{128};
|
size_t inode_size_cache_min_chunk_count{128};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -294,6 +294,8 @@ void analyze_frozen(std::ostream& os,
|
|||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#define META_OPT_MAP_SIZE(x) META_OPT_LIST_SIZE(x)
|
||||||
|
|
||||||
#define META_OPT_STRING_LIST_SIZE(x) \
|
#define META_OPT_STRING_LIST_SIZE(x) \
|
||||||
do { \
|
do { \
|
||||||
if (auto list = meta.x()) { \
|
if (auto list = meta.x()) { \
|
||||||
@ -402,6 +404,8 @@ void analyze_frozen(std::ostream& os,
|
|||||||
|
|
||||||
META_OPT_STRING_LIST_SIZE(category_names);
|
META_OPT_STRING_LIST_SIZE(category_names);
|
||||||
META_OPT_LIST_SIZE(block_categories);
|
META_OPT_LIST_SIZE(block_categories);
|
||||||
|
META_OPT_STRING_LIST_SIZE(category_metadata_json);
|
||||||
|
META_OPT_MAP_SIZE(block_category_metadata);
|
||||||
|
|
||||||
#undef META_LIST_SIZE
|
#undef META_LIST_SIZE
|
||||||
#undef META_OPT_STRING_SET_SIZE
|
#undef META_OPT_STRING_SET_SIZE
|
||||||
|
@ -38,7 +38,7 @@ size_t block_manager::get_logical_block() const {
|
|||||||
|
|
||||||
void block_manager::set_written_block(size_t logical_block,
|
void block_manager::set_written_block(size_t logical_block,
|
||||||
size_t written_block,
|
size_t written_block,
|
||||||
fragment_category::value_type category) {
|
fragment_category category) {
|
||||||
std::lock_guard lock{mx_};
|
std::lock_guard lock{mx_};
|
||||||
assert(logical_block < num_blocks_);
|
assert(logical_block < num_blocks_);
|
||||||
if (block_map_.size() < num_blocks_) {
|
if (block_map_.size() < num_blocks_) {
|
||||||
@ -56,9 +56,9 @@ void block_manager::map_logical_blocks(std::vector<chunk_type>& vec) const {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<fragment_category::value_type>
|
std::vector<fragment_category>
|
||||||
block_manager::get_written_block_categories() const {
|
block_manager::get_written_block_categories() const {
|
||||||
std::vector<fragment_category::value_type> result;
|
std::vector<fragment_category> result;
|
||||||
|
|
||||||
{
|
{
|
||||||
std::lock_guard lock{mx_};
|
std::lock_guard lock{mx_};
|
||||||
|
@ -95,6 +95,16 @@ class metadata_builder_ final : public metadata_builder::impl {
|
|||||||
md_.block_categories() = std::move(block_categories);
|
md_.block_categories() = std::move(block_categories);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
set_category_metadata_json(std::vector<std::string> metadata_json) override {
|
||||||
|
md_.category_metadata_json() = std::move(metadata_json);
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_block_category_metadata(
|
||||||
|
std::map<uint32_t, uint32_t> block_metadata) override {
|
||||||
|
md_.block_category_metadata() = std::move(block_metadata);
|
||||||
|
}
|
||||||
|
|
||||||
void add_symlink_table_entry(size_t index, uint32_t entry) override {
|
void add_symlink_table_entry(size_t index, uint32_t entry) override {
|
||||||
DWARFS_NOTHROW(md_.symlink_table()->at(index)) = entry;
|
DWARFS_NOTHROW(md_.symlink_table()->at(index)) = entry;
|
||||||
}
|
}
|
||||||
@ -314,6 +324,17 @@ thrift::metadata::metadata const& metadata_builder_<LoggerPolicy>::build() {
|
|||||||
ti << "saving symlinks table...";
|
ti << "saving symlinks table...";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (options_.no_category_names) {
|
||||||
|
md_.category_names().reset();
|
||||||
|
md_.block_categories().reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options_.no_category_names || options_.no_category_metadata) {
|
||||||
|
md_.category_metadata_json().reset();
|
||||||
|
md_.block_category_metadata().reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: don't overwrite all options when upgrading!
|
||||||
md_.options() = fsopts;
|
md_.options() = fsopts;
|
||||||
md_.features() = features_.get();
|
md_.features() = features_.get();
|
||||||
md_.dwarfs_version() = std::string("libdwarfs ") + DWARFS_GIT_ID;
|
md_.dwarfs_version() = std::string("libdwarfs ") + DWARFS_GIT_ID;
|
||||||
|
@ -43,6 +43,8 @@
|
|||||||
|
|
||||||
#include <fmt/format.h>
|
#include <fmt/format.h>
|
||||||
|
|
||||||
|
#include <range/v3/view/enumerate.hpp>
|
||||||
|
|
||||||
#include <dwarfs/error.h>
|
#include <dwarfs/error.h>
|
||||||
#include <dwarfs/file_access.h>
|
#include <dwarfs/file_access.h>
|
||||||
#include <dwarfs/history.h>
|
#include <dwarfs/history.h>
|
||||||
@ -826,8 +828,7 @@ void scanner_<LoggerPolicy>::scan(
|
|||||||
[blockmgr, logical_block_num,
|
[blockmgr, logical_block_num,
|
||||||
category](auto physical_block_num) {
|
category](auto physical_block_num) {
|
||||||
blockmgr->set_written_block(logical_block_num,
|
blockmgr->set_written_block(logical_block_num,
|
||||||
physical_block_num,
|
physical_block_num, category);
|
||||||
category.value());
|
|
||||||
},
|
},
|
||||||
meta);
|
meta);
|
||||||
});
|
});
|
||||||
@ -918,30 +919,51 @@ void scanner_<LoggerPolicy>::scan(
|
|||||||
mdb.set_shared_files_table(std::move(ssfv.get_shared_files()));
|
mdb.set_shared_files_table(std::move(ssfv.get_shared_files()));
|
||||||
|
|
||||||
if (auto catmgr = options_.inode.categorizer_mgr) {
|
if (auto catmgr = options_.inode.categorizer_mgr) {
|
||||||
std::unordered_map<fragment_category::value_type,
|
std::unordered_map<fragment_category::value_type, uint32_t>
|
||||||
fragment_category::value_type>
|
|
||||||
category_indices;
|
category_indices;
|
||||||
|
std::unordered_map<fragment_category, uint32_t> category_metadata_indices;
|
||||||
std::vector<std::string> category_names;
|
std::vector<std::string> category_names;
|
||||||
|
std::vector<std::string> category_metadata;
|
||||||
|
|
||||||
category_indices.reserve(frag_info.info.size());
|
category_indices.reserve(frag_info.info.size());
|
||||||
category_names.reserve(frag_info.info.size());
|
category_names.reserve(frag_info.info.size());
|
||||||
|
|
||||||
for (auto const& ci : frag_info.info) {
|
for (auto const& ci : frag_info.info) {
|
||||||
auto [it, inserted] =
|
if (category_indices.emplace(ci.category, category_names.size()).second) {
|
||||||
category_indices.emplace(ci.category, category_names.size());
|
|
||||||
if (inserted) {
|
|
||||||
category_names.emplace_back(catmgr->category_name(ci.category));
|
category_names.emplace_back(catmgr->category_name(ci.category));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (auto const& cat : frag_info.categories) {
|
||||||
|
auto metadata = catmgr->category_metadata(cat);
|
||||||
|
if (!metadata.empty()) {
|
||||||
|
if (category_metadata_indices.emplace(cat, category_metadata.size())
|
||||||
|
.second) {
|
||||||
|
category_metadata.emplace_back(std::move(metadata));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
auto written_categories = blockmgr->get_written_block_categories();
|
auto written_categories = blockmgr->get_written_block_categories();
|
||||||
|
std::vector<uint32_t> block_categories(written_categories.size());
|
||||||
|
std::map<uint32_t, uint32_t> block_cat_metadata;
|
||||||
|
|
||||||
std::transform(written_categories.begin(), written_categories.end(),
|
std::transform(written_categories.begin(), written_categories.end(),
|
||||||
written_categories.begin(),
|
block_categories.begin(), [&](auto const& cat) {
|
||||||
[&](auto const& cat) { return category_indices.at(cat); });
|
return category_indices.at(cat.value());
|
||||||
|
});
|
||||||
|
|
||||||
|
for (auto const& [i, cat] : ranges::views::enumerate(written_categories)) {
|
||||||
|
if (auto it = category_metadata_indices.find(cat);
|
||||||
|
it != category_metadata_indices.end()) {
|
||||||
|
block_cat_metadata.emplace(i, it->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
mdb.set_category_names(std::move(category_names));
|
mdb.set_category_names(std::move(category_names));
|
||||||
mdb.set_block_categories(std::move(written_categories));
|
mdb.set_block_categories(std::move(block_categories));
|
||||||
|
mdb.set_category_metadata_json(std::move(category_metadata));
|
||||||
|
mdb.set_block_category_metadata(std::move(block_cat_metadata));
|
||||||
}
|
}
|
||||||
|
|
||||||
mdb.set_block_size(segmenter_factory_.get_block_size());
|
mdb.set_block_size(segmenter_factory_.get_block_size());
|
||||||
|
@ -154,7 +154,7 @@ void run_segmenter_benchmark(::benchmark::State& state, unsigned granularity,
|
|||||||
auto logical_block_num) {
|
auto logical_block_num) {
|
||||||
auto physical_block_num = written.size();
|
auto physical_block_num = written.size();
|
||||||
written.push_back(blk);
|
written.push_back(blk);
|
||||||
blkmgr->set_written_block(logical_block_num, physical_block_num, 0);
|
blkmgr->set_written_block(logical_block_num, physical_block_num, {});
|
||||||
});
|
});
|
||||||
|
|
||||||
// begin benchmarking code
|
// begin benchmarking code
|
||||||
|
@ -422,4 +422,18 @@ struct metadata {
|
|||||||
|
|
||||||
// Size cache for highly fragmented file inodes
|
// Size cache for highly fragmented file inodes
|
||||||
30: optional inode_size_cache reg_file_size_cache
|
30: optional inode_size_cache reg_file_size_cache
|
||||||
|
|
||||||
|
//==========================================================//
|
||||||
|
// fields added with dwarfs-0.13.0, file system version 2.5 //
|
||||||
|
//==========================================================//
|
||||||
|
|
||||||
|
// Unique block categorization metadata JSON strings. These
|
||||||
|
// can be used to compress a block with a metadata-dependent
|
||||||
|
// algorithm after it has been compressed with a general
|
||||||
|
// purpose algorithm.
|
||||||
|
31: optional list<string> category_metadata_json
|
||||||
|
|
||||||
|
// The metadata associated with each block. Maps from block
|
||||||
|
// number to index into `category_metadata_json`.
|
||||||
|
32: optional map<UInt32, UInt32> block_category_metadata
|
||||||
}
|
}
|
||||||
|
@ -663,6 +663,12 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
|
|||||||
("time-resolution",
|
("time-resolution",
|
||||||
po::value<std::string>(&time_resolution)->default_value("sec"),
|
po::value<std::string>(&time_resolution)->default_value("sec"),
|
||||||
resolution_desc.c_str())
|
resolution_desc.c_str())
|
||||||
|
("no-category-names",
|
||||||
|
po::value<bool>(&options.metadata.no_category_names)->zero_tokens(),
|
||||||
|
"don't add category names to file system")
|
||||||
|
("no-category-metadata",
|
||||||
|
po::value<bool>(&options.metadata.no_category_metadata)->zero_tokens(),
|
||||||
|
"don't add category metadata to file system")
|
||||||
("pack-metadata,P",
|
("pack-metadata,P",
|
||||||
po::value<std::string>(&pack_metadata)->default_value("auto"),
|
po::value<std::string>(&pack_metadata)->default_value("auto"),
|
||||||
"pack certain metadata elements (auto, all, none, chunk_table, "
|
"pack certain metadata elements (auto, all, none, chunk_table, "
|
||||||
|
Loading…
x
Reference in New Issue
Block a user