feat(mkdwarfs): allow selection of categories to recompress

This commit is contained in:
Marcus Holland-Moritz 2023-12-17 10:49:19 +01:00
parent 2c91e80119
commit bbeffbfd98
3 changed files with 73 additions and 22 deletions

View File

@ -27,6 +27,7 @@
#include <iosfwd> #include <iosfwd>
#include <memory> #include <memory>
#include <optional> #include <optional>
#include <unordered_set>
#include <vector> #include <vector>
#include "dwarfs/categorized_option.h" #include "dwarfs/categorized_option.h"
@ -136,6 +137,8 @@ struct scanner_options {
struct rewrite_options { struct rewrite_options {
bool recompress_block{false}; bool recompress_block{false};
bool recompress_metadata{false}; bool recompress_metadata{false};
std::unordered_set<std::string> recompress_categories;
bool recompress_categories_exclude{false};
bool enable_history{true}; bool enable_history{true};
std::optional<std::vector<std::string>> command_line_arguments; std::optional<std::vector<std::string>> command_line_arguments;
history_config history; history_config history;

View File

@ -48,6 +48,7 @@
#include "dwarfs/options.h" #include "dwarfs/options.h"
#include "dwarfs/performance_monitor.h" #include "dwarfs/performance_monitor.h"
#include "dwarfs/progress.h" #include "dwarfs/progress.h"
#include "dwarfs/util.h"
#include "dwarfs/worker_group.h" #include "dwarfs/worker_group.h"
namespace dwarfs { namespace dwarfs {
@ -581,51 +582,79 @@ void filesystem_<LoggerPolicy>::rewrite(progress& prog,
size_t block_no{0}; size_t block_no{0};
auto log_recompress = auto log_rewrite =
[&](const auto& s, [&](bool compressing, const auto& s,
std::optional<fragment_category::value_type> const& cat = std::optional<fragment_category::value_type> const& cat) {
std::nullopt) { auto prefix = compressing ? "recompressing" : "copying";
std::string catinfo; std::string catinfo;
std::string compinfo;
if (cat) { if (cat) {
catinfo = fmt::format(", {}", cat_resolver.category_name(*cat)); catinfo = fmt::format(", {}", cat_resolver.category_name(*cat));
} }
LOG_VERBOSE << "recompressing " << get_section_name(s->type()) << " (" if (compressing) {
<< get_compression_name(s->compression()) << catinfo compinfo = fmt::format(
<< ") using '" " using '{}'", writer.get_compressor(s->type(), cat).describe());
<< writer.get_compressor(s->type(), cat).describe() << "'"; }
LOG_VERBOSE << prefix << " " << size_with_unit(s->length()) << " "
<< get_section_name(s->type()) << " ("
<< get_compression_name(s->compression()) << catinfo << ")"
<< compinfo;
}; };
auto copy_compressed = [&](const auto& s) { auto log_recompress =
LOG_VERBOSE << "copying " << get_section_name(s->type()) << " (" [&](const auto& s,
<< get_compression_name(s->compression()) << ")"; std::optional<fragment_category::value_type> const& cat =
writer.write_compressed_section(s->type(), s->compression(), s->data(*mm_)); std::nullopt) { log_rewrite(true, s, cat); };
auto copy_compressed =
[&](const auto& s,
std::optional<fragment_category::value_type> const& cat =
std::nullopt) {
log_rewrite(false, s, cat);
writer.write_compressed_section(s->type(), s->compression(),
s->data(*mm_));
}; };
parser_.rewind(); parser_.rewind();
while (auto s = parser_.next_section()) { while (auto s = parser_.next_section()) {
switch (s->type()) { switch (s->type()) {
case section_type::BLOCK: case section_type::BLOCK: {
if (opts.recompress_block) {
std::optional<fragment_category::value_type> cat; std::optional<fragment_category::value_type> cat;
bool recompress_block{true};
if (auto catstr = meta_.get_block_category(block_no)) { if (opts.recompress_block) {
auto catstr = meta_.get_block_category(block_no);
if (catstr) {
cat = cat_resolver.category_value(catstr.value()); cat = cat_resolver.category_value(catstr.value());
if (!cat) { if (!cat) {
LOG_ERROR << "unknown category '" << catstr.value() LOG_ERROR << "unknown category '" << catstr.value()
<< "' for block " << block_no; << "' for block " << block_no;
} }
if (!opts.recompress_categories.empty()) {
bool is_in_set{opts.recompress_categories.count(catstr.value()) >
0};
recompress_block =
opts.recompress_categories_exclude ? !is_in_set : is_in_set;
}
}
} }
if (recompress_block) {
log_recompress(s, cat); log_recompress(s, cat);
writer.write_section(section_type::BLOCK, s->compression(), writer.write_section(section_type::BLOCK, s->compression(),
s->data(*mm_), cat); s->data(*mm_), cat);
} else { } else {
copy_compressed(s); copy_compressed(s, cat);
} }
++block_no; ++block_no;
break; } break;
case section_type::METADATA_V2_SCHEMA: case section_type::METADATA_V2_SCHEMA:
case section_type::METADATA_V2: case section_type::METADATA_V2:

View File

@ -281,7 +281,7 @@ int mkdwarfs_main(int argc, sys_char** argv) {
metadata_compression, log_level_str, timestamp, time_resolution, metadata_compression, log_level_str, timestamp, time_resolution,
progress_mode, recompress_opts, pack_metadata, file_hash_algo, progress_mode, recompress_opts, pack_metadata, file_hash_algo,
debug_filter, max_similarity_size, input_list_str, chmod_str, debug_filter, max_similarity_size, input_list_str, chmod_str,
categorizer_list_str, history_compression; categorizer_list_str, history_compression, recompress_categories;
std::vector<sys_string> filter; std::vector<sys_string> filter;
std::vector<std::string> order, max_lookback_blocks, window_size, window_step, std::vector<std::string> order, max_lookback_blocks, window_size, window_step,
bloom_filter_size, compression; bloom_filter_size, compression;
@ -376,6 +376,9 @@ int mkdwarfs_main(int argc, sys_char** argv) {
("recompress", ("recompress",
po::value<std::string>(&recompress_opts)->implicit_value("all"), po::value<std::string>(&recompress_opts)->implicit_value("all"),
"recompress an existing filesystem (none, block, metadata, all)") "recompress an existing filesystem (none, block, metadata, all)")
("recompress-categories",
po::value<std::string>(&recompress_categories),
"only recompress blocks of these categories")
("categorize", ("categorize",
po::value<std::string>(&categorizer_list_str) po::value<std::string>(&categorizer_list_str)
->implicit_value("pcmaudio,incompressible"), ->implicit_value("pcmaudio,incompressible"),
@ -383,7 +386,7 @@ int mkdwarfs_main(int argc, sys_char** argv) {
("order", ("order",
po::value<std::vector<std::string>>(&order) po::value<std::vector<std::string>>(&order)
->value_name("[cat::]arg")->multitoken()->composing(), ->value_name("[cat::]arg")->multitoken()->composing(),
order_desc.c_str()) // TODO order_desc.c_str())
("max-similarity-size", ("max-similarity-size",
po::value<std::string>(&max_similarity_size), po::value<std::string>(&max_similarity_size),
"maximum file size to compute similarity") "maximum file size to compute similarity")
@ -714,6 +717,15 @@ int mkdwarfs_main(int argc, sys_char** argv) {
std::cerr << "invalid recompress mode: " << recompress_opts << "\n"; std::cerr << "invalid recompress mode: " << recompress_opts << "\n";
return 1; return 1;
} }
if (!recompress_categories.empty()) {
std::string_view input = recompress_categories;
if (input.front() == '!') {
rw_opts.recompress_categories_exclude = true;
input.remove_prefix(1);
}
boost::split(rw_opts.recompress_categories, input, boost::is_any_of(","));
}
} }
if (file_hash_algo == "none") { if (file_hash_algo == "none") {
@ -1018,6 +1030,13 @@ int mkdwarfs_main(int argc, sys_char** argv) {
cat_resolver = std::make_shared<filesystem_block_category_resolver>( cat_resolver = std::make_shared<filesystem_block_category_resolver>(
input_filesystem->get_all_block_categories()); input_filesystem->get_all_block_categories());
for (auto const& cat : rw_opts.recompress_categories) {
if (!cat_resolver->category_value(cat)) {
std::cerr << "error: no category '" << cat << "' in input filesystem\n";
return 1;
}
}
} else { } else {
cat_resolver = options.inode.categorizer_mgr; cat_resolver = options.inode.categorizer_mgr;
} }