feat(mkdwarfs): allow selection of categories to recompress

This commit is contained in:
Marcus Holland-Moritz 2023-12-17 10:49:19 +01:00
parent 2c91e80119
commit bbeffbfd98
3 changed files with 73 additions and 22 deletions

View File

@ -27,6 +27,7 @@
#include <iosfwd>
#include <memory>
#include <optional>
#include <unordered_set>
#include <vector>
#include "dwarfs/categorized_option.h"
@ -136,6 +137,8 @@ struct scanner_options {
struct rewrite_options {
bool recompress_block{false};
bool recompress_metadata{false};
// Category names used to filter which BLOCK sections are recompressed during
// a rewrite. An empty set means no per-category filtering is applied.
std::unordered_set<std::string> recompress_categories;
// Inverts the meaning of `recompress_categories`: when true, blocks whose
// category is in the set are NOT recompressed and all other categorized
// blocks are; when false, only blocks whose category is in the set are.
bool recompress_categories_exclude{false};
bool enable_history{true};
std::optional<std::vector<std::string>> command_line_arguments;
history_config history;

View File

@ -48,6 +48,7 @@
#include "dwarfs/options.h"
#include "dwarfs/performance_monitor.h"
#include "dwarfs/progress.h"
#include "dwarfs/util.h"
#include "dwarfs/worker_group.h"
namespace dwarfs {
@ -581,51 +582,79 @@ void filesystem_<LoggerPolicy>::rewrite(progress& prog,
size_t block_no{0};
auto log_recompress =
[&](const auto& s,
std::optional<fragment_category::value_type> const& cat =
std::nullopt) {
auto log_rewrite =
[&](bool compressing, const auto& s,
std::optional<fragment_category::value_type> const& cat) {
auto prefix = compressing ? "recompressing" : "copying";
std::string catinfo;
std::string compinfo;
if (cat) {
catinfo = fmt::format(", {}", cat_resolver.category_name(*cat));
}
LOG_VERBOSE << "recompressing " << get_section_name(s->type()) << " ("
<< get_compression_name(s->compression()) << catinfo
<< ") using '"
<< writer.get_compressor(s->type(), cat).describe() << "'";
if (compressing) {
compinfo = fmt::format(
" using '{}'", writer.get_compressor(s->type(), cat).describe());
}
LOG_VERBOSE << prefix << " " << size_with_unit(s->length()) << " "
<< get_section_name(s->type()) << " ("
<< get_compression_name(s->compression()) << catinfo << ")"
<< compinfo;
};
auto copy_compressed = [&](const auto& s) {
LOG_VERBOSE << "copying " << get_section_name(s->type()) << " ("
<< get_compression_name(s->compression()) << ")";
writer.write_compressed_section(s->type(), s->compression(), s->data(*mm_));
};
auto log_recompress =
[&](const auto& s,
std::optional<fragment_category::value_type> const& cat =
std::nullopt) { log_rewrite(true, s, cat); };
auto copy_compressed =
[&](const auto& s,
std::optional<fragment_category::value_type> const& cat =
std::nullopt) {
log_rewrite(false, s, cat);
writer.write_compressed_section(s->type(), s->compression(),
s->data(*mm_));
};
parser_.rewind();
while (auto s = parser_.next_section()) {
switch (s->type()) {
case section_type::BLOCK:
if (opts.recompress_block) {
std::optional<fragment_category::value_type> cat;
case section_type::BLOCK: {
std::optional<fragment_category::value_type> cat;
bool recompress_block{true};
if (auto catstr = meta_.get_block_category(block_no)) {
if (opts.recompress_block) {
auto catstr = meta_.get_block_category(block_no);
if (catstr) {
cat = cat_resolver.category_value(catstr.value());
if (!cat) {
LOG_ERROR << "unknown category '" << catstr.value()
<< "' for block " << block_no;
}
}
if (!opts.recompress_categories.empty()) {
bool is_in_set{opts.recompress_categories.count(catstr.value()) >
0};
recompress_block =
opts.recompress_categories_exclude ? !is_in_set : is_in_set;
}
}
}
if (recompress_block) {
log_recompress(s, cat);
writer.write_section(section_type::BLOCK, s->compression(),
s->data(*mm_), cat);
} else {
copy_compressed(s);
copy_compressed(s, cat);
}
++block_no;
break;
} break;
case section_type::METADATA_V2_SCHEMA:
case section_type::METADATA_V2:

View File

@ -281,7 +281,7 @@ int mkdwarfs_main(int argc, sys_char** argv) {
metadata_compression, log_level_str, timestamp, time_resolution,
progress_mode, recompress_opts, pack_metadata, file_hash_algo,
debug_filter, max_similarity_size, input_list_str, chmod_str,
categorizer_list_str, history_compression;
categorizer_list_str, history_compression, recompress_categories;
std::vector<sys_string> filter;
std::vector<std::string> order, max_lookback_blocks, window_size, window_step,
bloom_filter_size, compression;
@ -376,6 +376,9 @@ int mkdwarfs_main(int argc, sys_char** argv) {
("recompress",
po::value<std::string>(&recompress_opts)->implicit_value("all"),
"recompress an existing filesystem (none, block, metadata, all)")
("recompress-categories",
po::value<std::string>(&recompress_categories),
"only recompress blocks of these categories")
("categorize",
po::value<std::string>(&categorizer_list_str)
->implicit_value("pcmaudio,incompressible"),
@ -383,7 +386,7 @@ int mkdwarfs_main(int argc, sys_char** argv) {
("order",
po::value<std::vector<std::string>>(&order)
->value_name("[cat::]arg")->multitoken()->composing(),
order_desc.c_str()) // TODO
order_desc.c_str())
("max-similarity-size",
po::value<std::string>(&max_similarity_size),
"maximum file size to compute similarity")
@ -714,6 +717,15 @@ int mkdwarfs_main(int argc, sys_char** argv) {
std::cerr << "invalid recompress mode: " << recompress_opts << "\n";
return 1;
}
// Parse the --recompress-categories argument: a comma-separated list of
// category names, optionally prefixed with '!' to invert the selection.
if (!recompress_categories.empty()) {
std::string_view input = recompress_categories;
// front() is safe here because the enclosing check guarantees a non-empty
// string.
if (input.front() == '!') {
// A leading '!' switches the list from "only these" to "all but these".
rw_opts.recompress_categories_exclude = true;
input.remove_prefix(1);
}
// NOTE(review): if the argument is just "!", `input` is empty at this point;
// confirm what boost::split stores in the set for an empty input.
boost::split(rw_opts.recompress_categories, input, boost::is_any_of(","));
}
}
if (file_hash_algo == "none") {
@ -1018,6 +1030,13 @@ int mkdwarfs_main(int argc, sys_char** argv) {
cat_resolver = std::make_shared<filesystem_block_category_resolver>(
input_filesystem->get_all_block_categories());
// Validate every requested category name against the resolver built from the
// input filesystem's block categories; abort with exit code 1 on the first
// name the resolver cannot map to a category value.
for (auto const& cat : rw_opts.recompress_categories) {
if (!cat_resolver->category_value(cat)) {
std::cerr << "error: no category '" << cat << "' in input filesystem\n";
return 1;
}
}
} else {
cat_resolver = options.inode.categorizer_mgr;
}