diff --git a/include/dwarfs/options.h b/include/dwarfs/options.h index 483670e1..7c71609c 100644 --- a/include/dwarfs/options.h +++ b/include/dwarfs/options.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "dwarfs/categorized_option.h" @@ -136,6 +137,8 @@ struct scanner_options { struct rewrite_options { bool recompress_block{false}; bool recompress_metadata{false}; + std::unordered_set recompress_categories; + bool recompress_categories_exclude{false}; bool enable_history{true}; std::optional> command_line_arguments; history_config history; diff --git a/src/dwarfs/filesystem_v2.cpp b/src/dwarfs/filesystem_v2.cpp index ba72975d..ccf43a6e 100644 --- a/src/dwarfs/filesystem_v2.cpp +++ b/src/dwarfs/filesystem_v2.cpp @@ -48,6 +48,7 @@ #include "dwarfs/options.h" #include "dwarfs/performance_monitor.h" #include "dwarfs/progress.h" +#include "dwarfs/util.h" #include "dwarfs/worker_group.h" namespace dwarfs { @@ -581,51 +582,79 @@ void filesystem_::rewrite(progress& prog, size_t block_no{0}; - auto log_recompress = - [&](const auto& s, - std::optional const& cat = - std::nullopt) { + auto log_rewrite = + [&](bool compressing, const auto& s, + std::optional const& cat) { + auto prefix = compressing ? 
"recompressing" : "copying"; std::string catinfo; + std::string compinfo; if (cat) { catinfo = fmt::format(", {}", cat_resolver.category_name(*cat)); } - LOG_VERBOSE << "recompressing " << get_section_name(s->type()) << " (" - << get_compression_name(s->compression()) << catinfo - << ") using '" - << writer.get_compressor(s->type(), cat).describe() << "'"; + if (compressing) { + compinfo = fmt::format( + " using '{}'", writer.get_compressor(s->type(), cat).describe()); + } + LOG_VERBOSE << prefix << " " << size_with_unit(s->length()) << " " + << get_section_name(s->type()) << " (" + << get_compression_name(s->compression()) << catinfo << ")" + << compinfo; }; - auto copy_compressed = [&](const auto& s) { - LOG_VERBOSE << "copying " << get_section_name(s->type()) << " (" - << get_compression_name(s->compression()) << ")"; - writer.write_compressed_section(s->type(), s->compression(), s->data(*mm_)); - }; + auto log_recompress = + [&](const auto& s, + std::optional const& cat = + std::nullopt) { log_rewrite(true, s, cat); }; + + auto copy_compressed = + [&](const auto& s, + std::optional const& cat = + std::nullopt) { + log_rewrite(false, s, cat); + writer.write_compressed_section(s->type(), s->compression(), + s->data(*mm_)); + }; parser_.rewind(); while (auto s = parser_.next_section()) { switch (s->type()) { - case section_type::BLOCK: - if (opts.recompress_block) { - std::optional cat; + case section_type::BLOCK: { + std::optional cat; + bool recompress_block{opts.recompress_block}; /* start from the requested mode so blocks are copied verbatim unless block recompression was asked for; the category filter below can only narrow this */ - if (auto catstr = meta_.get_block_category(block_no)) { + if (opts.recompress_block) { + auto catstr = meta_.get_block_category(block_no); + + if (catstr) { cat = cat_resolver.category_value(catstr.value()); + if (!cat) { LOG_ERROR << "unknown category '" << catstr.value() << "' for block " << block_no; } - } + if (!opts.recompress_categories.empty()) { + bool is_in_set{opts.recompress_categories.count(catstr.value()) > + 0}; + + recompress_block = + opts.recompress_categories_exclude ? 
!is_in_set : is_in_set; + } + } + } + + if (recompress_block) { log_recompress(s, cat); writer.write_section(section_type::BLOCK, s->compression(), s->data(*mm_), cat); } else { - copy_compressed(s); + copy_compressed(s, cat); } + ++block_no; - break; + } break; case section_type::METADATA_V2_SCHEMA: case section_type::METADATA_V2: diff --git a/src/mkdwarfs_main.cpp b/src/mkdwarfs_main.cpp index 73ac8600..b1d5f209 100644 --- a/src/mkdwarfs_main.cpp +++ b/src/mkdwarfs_main.cpp @@ -281,7 +281,7 @@ int mkdwarfs_main(int argc, sys_char** argv) { metadata_compression, log_level_str, timestamp, time_resolution, progress_mode, recompress_opts, pack_metadata, file_hash_algo, debug_filter, max_similarity_size, input_list_str, chmod_str, - categorizer_list_str, history_compression; + categorizer_list_str, history_compression, recompress_categories; std::vector filter; std::vector order, max_lookback_blocks, window_size, window_step, bloom_filter_size, compression; @@ -376,6 +376,9 @@ int mkdwarfs_main(int argc, sys_char** argv) { ("recompress", po::value(&recompress_opts)->implicit_value("all"), "recompress an existing filesystem (none, block, metadata, all)") + ("recompress-categories", + po::value(&recompress_categories), + "only recompress blocks of these categories") ("categorize", po::value(&categorizer_list_str) ->implicit_value("pcmaudio,incompressible"), @@ -383,7 +386,7 @@ int mkdwarfs_main(int argc, sys_char** argv) { ("order", po::value>(&order) ->value_name("[cat::]arg")->multitoken()->composing(), - order_desc.c_str()) // TODO + order_desc.c_str()) ("max-similarity-size", po::value(&max_similarity_size), "maximum file size to compute similarity") @@ -714,6 +717,15 @@ int mkdwarfs_main(int argc, sys_char** argv) { std::cerr << "invalid recompress mode: " << recompress_opts << "\n"; return 1; } + + if (!recompress_categories.empty()) { + std::string_view input = recompress_categories; + if (input.front() == '!') { + rw_opts.recompress_categories_exclude = true; + 
input.remove_prefix(1); + } + boost::split(rw_opts.recompress_categories, input, boost::is_any_of(",")); + } } if (file_hash_algo == "none") { @@ -1018,6 +1030,13 @@ int mkdwarfs_main(int argc, sys_char** argv) { cat_resolver = std::make_shared( input_filesystem->get_all_block_categories()); + + for (auto const& cat : rw_opts.recompress_categories) { + if (!cat_resolver->category_value(cat)) { + std::cerr << "error: no category '" << cat << "' in input filesystem\n"; + return 1; + } + } } else { cat_resolver = options.inode.categorizer_mgr; }