diff --git a/doc/mkdwarfs.md b/doc/mkdwarfs.md index 5e6378bc..4fd595e6 100644 --- a/doc/mkdwarfs.md +++ b/doc/mkdwarfs.md @@ -383,6 +383,22 @@ Most other options are concerned with compression tuning: Don't add a creation timestamp. This is useful when bit-identical file system images are required to be produced from the same input. +- `--no-category-names`: + Don't add category names to the file system. This is only relevant if + the file system uses categories at all. Adding category names allows + setting per-category options when rewriting a file system, getting + per-category info with `dwarfsck`, or seeing the categories that have + been identified in a file. This option implies `--no-category-metadata`. + +- `--no-category-metadata`: + Don't add category metadata to the file system. Category metadata goes + beyond just the category names: it is the additional metadata identified + during categorization. This metadata is needed for metadata-dependent + compression algorithms (e.g. FLAC). So if you're planning to recompress + categorized blocks from a metadata-independent to a metadata-dependent + compression algorithm, this is going to be impossible if the metadata + information is not available. + - `--file-hash=none`|*name*: Select the hashing function to be used for file deduplication. If `none` is chosen, file deduplication is disabled. 
By default, the built-in diff --git a/include/dwarfs/writer/internal/block_manager.h b/include/dwarfs/writer/internal/block_manager.h index 126bff38..f24c4735 100644 --- a/include/dwarfs/writer/internal/block_manager.h +++ b/include/dwarfs/writer/internal/block_manager.h @@ -38,16 +38,14 @@ class block_manager { size_t get_logical_block() const; void set_written_block(size_t logical_block, size_t written_block, - fragment_category::value_type category); + fragment_category category); void map_logical_blocks(std::vector& vec) const; - std::vector - get_written_block_categories() const; + std::vector get_written_block_categories() const; private: std::mutex mutable mx_; size_t mutable num_blocks_{0}; - std::vector>> - block_map_; + std::vector>> block_map_; }; } // namespace dwarfs::writer::internal diff --git a/include/dwarfs/writer/internal/metadata_builder.h b/include/dwarfs/writer/internal/metadata_builder.h index b0292839..09d089a8 100644 --- a/include/dwarfs/writer/internal/metadata_builder.h +++ b/include/dwarfs/writer/internal/metadata_builder.h @@ -87,6 +87,15 @@ class metadata_builder { impl_->set_block_categories(std::move(block_categories)); } + void set_category_metadata_json(std::vector metadata_json) { + impl_->set_category_metadata_json(std::move(metadata_json)); + } + + void + set_block_category_metadata(std::map block_metadata) { + impl_->set_block_category_metadata(std::move(block_metadata)); + } + void add_symlink_table_entry(size_t index, uint32_t entry) { impl_->add_symlink_table_entry(index, entry); } @@ -121,6 +130,10 @@ class metadata_builder { set_category_names(std::vector category_names) = 0; virtual void set_block_categories(std::vector block_categories) = 0; + virtual void + set_category_metadata_json(std::vector metadata_json) = 0; + virtual void set_block_category_metadata( + std::map block_metadata) = 0; virtual void add_symlink_table_entry(size_t index, uint32_t entry) = 0; virtual void gather_chunks(inode_manager const& im, 
block_manager const& bm, size_t chunk_count) = 0; diff --git a/include/dwarfs/writer/metadata_options.h b/include/dwarfs/writer/metadata_options.h index 837419dc..090149d6 100644 --- a/include/dwarfs/writer/metadata_options.h +++ b/include/dwarfs/writer/metadata_options.h @@ -53,6 +53,8 @@ struct metadata_options { bool pack_symlinks_index{false}; bool force_pack_string_tables{false}; bool no_create_timestamp{false}; + bool no_category_names{false}; + bool no_category_metadata{false}; size_t inode_size_cache_min_chunk_count{128}; }; diff --git a/src/reader/internal/metadata_v2.cpp b/src/reader/internal/metadata_v2.cpp index 88ec5426..6835f195 100644 --- a/src/reader/internal/metadata_v2.cpp +++ b/src/reader/internal/metadata_v2.cpp @@ -294,6 +294,8 @@ void analyze_frozen(std::ostream& os, } \ } while (0) +#define META_OPT_MAP_SIZE(x) META_OPT_LIST_SIZE(x) + #define META_OPT_STRING_LIST_SIZE(x) \ do { \ if (auto list = meta.x()) { \ @@ -402,6 +404,8 @@ void analyze_frozen(std::ostream& os, META_OPT_STRING_LIST_SIZE(category_names); META_OPT_LIST_SIZE(block_categories); + META_OPT_STRING_LIST_SIZE(category_metadata_json); + META_OPT_MAP_SIZE(block_category_metadata); #undef META_LIST_SIZE #undef META_OPT_STRING_SET_SIZE diff --git a/src/writer/internal/block_manager.cpp b/src/writer/internal/block_manager.cpp index 8737d5a3..272ebd2c 100644 --- a/src/writer/internal/block_manager.cpp +++ b/src/writer/internal/block_manager.cpp @@ -38,7 +38,7 @@ size_t block_manager::get_logical_block() const { void block_manager::set_written_block(size_t logical_block, size_t written_block, - fragment_category::value_type category) { + fragment_category category) { std::lock_guard lock{mx_}; assert(logical_block < num_blocks_); if (block_map_.size() < num_blocks_) { @@ -56,9 +56,9 @@ void block_manager::map_logical_blocks(std::vector& vec) const { } } -std::vector +std::vector block_manager::get_written_block_categories() const { - std::vector result; + std::vector result; { 
std::lock_guard lock{mx_}; diff --git a/src/writer/internal/metadata_builder.cpp b/src/writer/internal/metadata_builder.cpp index 733547dc..ed796234 100644 --- a/src/writer/internal/metadata_builder.cpp +++ b/src/writer/internal/metadata_builder.cpp @@ -95,6 +95,16 @@ class metadata_builder_ final : public metadata_builder::impl { md_.block_categories() = std::move(block_categories); } + void + set_category_metadata_json(std::vector metadata_json) override { + md_.category_metadata_json() = std::move(metadata_json); + } + + void set_block_category_metadata( + std::map block_metadata) override { + md_.block_category_metadata() = std::move(block_metadata); + } + void add_symlink_table_entry(size_t index, uint32_t entry) override { DWARFS_NOTHROW(md_.symlink_table()->at(index)) = entry; } @@ -314,6 +324,17 @@ thrift::metadata::metadata const& metadata_builder_::build() { ti << "saving symlinks table..."; } + if (options_.no_category_names) { + md_.category_names().reset(); + md_.block_categories().reset(); + } + + if (options_.no_category_names || options_.no_category_metadata) { + md_.category_metadata_json().reset(); + md_.block_category_metadata().reset(); + } + + // TODO: don't overwrite all options when upgrading! 
md_.options() = fsopts; md_.features() = features_.get(); md_.dwarfs_version() = std::string("libdwarfs ") + DWARFS_GIT_ID; diff --git a/src/writer/scanner.cpp b/src/writer/scanner.cpp index 0a58e5ae..b863f891 100644 --- a/src/writer/scanner.cpp +++ b/src/writer/scanner.cpp @@ -43,6 +43,8 @@ #include +#include + #include #include #include @@ -826,8 +828,7 @@ void scanner_::scan( [blockmgr, logical_block_num, category](auto physical_block_num) { blockmgr->set_written_block(logical_block_num, - physical_block_num, - category.value()); + physical_block_num, category); }, meta); }); @@ -918,30 +919,51 @@ void scanner_::scan( mdb.set_shared_files_table(std::move(ssfv.get_shared_files())); if (auto catmgr = options_.inode.categorizer_mgr) { - std::unordered_map + std::unordered_map category_indices; + std::unordered_map category_metadata_indices; std::vector category_names; + std::vector category_metadata; category_indices.reserve(frag_info.info.size()); category_names.reserve(frag_info.info.size()); for (auto const& ci : frag_info.info) { - auto [it, inserted] = - category_indices.emplace(ci.category, category_names.size()); - if (inserted) { + if (category_indices.emplace(ci.category, category_names.size()).second) { category_names.emplace_back(catmgr->category_name(ci.category)); } } + for (auto const& cat : frag_info.categories) { + auto metadata = catmgr->category_metadata(cat); + if (!metadata.empty()) { + if (category_metadata_indices.emplace(cat, category_metadata.size()) + .second) { + category_metadata.emplace_back(std::move(metadata)); + } + } + } + auto written_categories = blockmgr->get_written_block_categories(); + std::vector block_categories(written_categories.size()); + std::map block_cat_metadata; std::transform(written_categories.begin(), written_categories.end(), - written_categories.begin(), - [&](auto const& cat) { return category_indices.at(cat); }); + block_categories.begin(), [&](auto const& cat) { + return category_indices.at(cat.value()); + }); 
+ + for (auto const& [i, cat] : ranges::views::enumerate(written_categories)) { + if (auto it = category_metadata_indices.find(cat); + it != category_metadata_indices.end()) { + block_cat_metadata.emplace(i, it->second); + } + } mdb.set_category_names(std::move(category_names)); - mdb.set_block_categories(std::move(written_categories)); + mdb.set_block_categories(std::move(block_categories)); + mdb.set_category_metadata_json(std::move(category_metadata)); + mdb.set_block_category_metadata(std::move(block_cat_metadata)); } mdb.set_block_size(segmenter_factory_.get_block_size()); diff --git a/test/segmenter_benchmark.cpp b/test/segmenter_benchmark.cpp index 7c55627d..89162f45 100644 --- a/test/segmenter_benchmark.cpp +++ b/test/segmenter_benchmark.cpp @@ -154,7 +154,7 @@ void run_segmenter_benchmark(::benchmark::State& state, unsigned granularity, auto logical_block_num) { auto physical_block_num = written.size(); written.push_back(blk); - blkmgr->set_written_block(logical_block_num, physical_block_num, 0); + blkmgr->set_written_block(logical_block_num, physical_block_num, {}); }); // begin benchmarking code diff --git a/thrift/metadata.thrift b/thrift/metadata.thrift index a504eb50..e03e40a4 100644 --- a/thrift/metadata.thrift +++ b/thrift/metadata.thrift @@ -422,4 +422,18 @@ struct metadata { // Size cache for highly fragmented file inodes 30: optional inode_size_cache reg_file_size_cache + + //==========================================================// + // fields added with dwarfs-0.13.0, file system version 2.5 // + //==========================================================// + + // Unique block categorization metadata JSON strings. These + // can be used to compress a block with a metadata-dependent + // algorithm after having been compressed with a general + // purpose algorithm. + 31: optional list category_metadata_json + + // The metadata associated with each block. Maps from block + // number to index into `category_metadata_json`. 
+ 32: optional map block_category_metadata } diff --git a/tools/src/mkdwarfs_main.cpp b/tools/src/mkdwarfs_main.cpp index f64f1fbc..863e893b 100644 --- a/tools/src/mkdwarfs_main.cpp +++ b/tools/src/mkdwarfs_main.cpp @@ -663,6 +663,12 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) { ("time-resolution", po::value(&time_resolution)->default_value("sec"), resolution_desc.c_str()) + ("no-category-names", + po::value(&options.metadata.no_category_names)->zero_tokens(), + "don't add category names to file system") + ("no-category-metadata", + po::value(&options.metadata.no_category_metadata)->zero_tokens(), + "don't add category metadata to file system") ("pack-metadata,P", po::value(&pack_metadata)->default_value("auto"), "pack certain metadata elements (auto, all, none, chunk_table, "