dwarfs/tools/src/mkdwarfs_main.cpp
2025-04-07 21:32:26 +02:00

1446 lines
47 KiB
C++

/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include <array>
#include <cerrno>
#include <cstdio>
#include <ctime>
#include <filesystem>
#include <iostream>
#include <iterator>
#include <map>
#include <memory>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <thread>
#include <utility>
#include <variant>
#include <vector>
#ifdef _WIN32
#include <io.h>
#endif
#include <boost/algorithm/string/join.hpp>
#include <boost/program_options.hpp>
#include <fmt/format.h>
#if FMT_VERSION >= 110000
#include <fmt/ranges.h>
#endif
#include <range/v3/view/enumerate.hpp>
#include <dwarfs/block_compressor.h>
#include <dwarfs/block_compressor_parser.h>
#include <dwarfs/checksum.h>
#include <dwarfs/compressor_registry.h>
#include <dwarfs/config.h>
#include <dwarfs/conv.h>
#include <dwarfs/decompressor_registry.h>
#include <dwarfs/error.h>
#include <dwarfs/file_access.h>
#include <dwarfs/integral_value_parser.h>
#include <dwarfs/logger.h>
#include <dwarfs/match.h>
#include <dwarfs/mmap.h>
#include <dwarfs/os_access.h>
#include <dwarfs/reader/filesystem_options.h>
#include <dwarfs/reader/filesystem_v2.h>
#include <dwarfs/sorted_array_map.h>
#include <dwarfs/string.h>
#include <dwarfs/terminal.h>
#include <dwarfs/thread_pool.h>
#include <dwarfs/tool/iolayer.h>
#include <dwarfs/tool/program_options_helpers.h>
#include <dwarfs/tool/tool.h>
#include <dwarfs/util.h>
#include <dwarfs/utility/rewrite_filesystem.h>
#include <dwarfs/utility/rewrite_options.h>
#include <dwarfs/writer/categorizer.h>
#include <dwarfs/writer/category_parser.h>
#include <dwarfs/writer/chmod_entry_transformer.h>
#include <dwarfs/writer/console_writer.h>
#include <dwarfs/writer/entry_factory.h>
#include <dwarfs/writer/filesystem_block_category_resolver.h>
#include <dwarfs/writer/filesystem_writer.h>
#include <dwarfs/writer/filesystem_writer_options.h>
#include <dwarfs/writer/filter_debug.h>
#include <dwarfs/writer/fragment_order_parser.h>
#include <dwarfs/writer/rule_based_entry_filter.h>
#include <dwarfs/writer/scanner.h>
#include <dwarfs/writer/scanner_options.h>
#include <dwarfs/writer/segmenter_factory.h>
#include <dwarfs/writer/writer_progress.h>
#include <dwarfs_tool_main.h>
#include <dwarfs_tool_manpage.h>
namespace po = boost::program_options;
namespace dwarfs::tool {
namespace {
using namespace std::string_view_literals;
// Names accepted by --progress, mapped to the console writer's display mode.
// The keys are also joined into the option's help text.
constexpr sorted_array_map progress_modes{
std::pair{"none"sv, writer::console_writer::NONE},
std::pair{"simple"sv, writer::console_writer::SIMPLE},
std::pair{"ascii"sv, writer::console_writer::ASCII},
std::pair{"unicode"sv, writer::console_writer::UNICODE},
};
// Default value of --progress; must be one of the keys of progress_modes.
constexpr auto default_progress_mode = "unicode";
// Names accepted by --debug-filter, mapped to the filter debug mode that is
// passed on to debug_filter_output().
constexpr sorted_array_map debug_filter_modes{
std::pair{"included"sv, writer::debug_filter_mode::INCLUDED},
std::pair{"included-files"sv, writer::debug_filter_mode::INCLUDED_FILES},
std::pair{"excluded"sv, writer::debug_filter_mode::EXCLUDED},
std::pair{"excluded-files"sv, writer::debug_filter_mode::EXCLUDED_FILES},
std::pair{"files"sv, writer::debug_filter_mode::FILES},
std::pair{"all"sv, writer::debug_filter_mode::ALL},
};
// Symbolic names accepted by --time-resolution and their length in seconds.
constexpr sorted_array_map time_resolutions{
std::pair{"sec"sv, 1},
std::pair{"min"sv, 60},
std::pair{"hour"sv, 3600},
std::pair{"day"sv, 86400},
};
// Inclusive bounds for the block size exponent (--block-size-bits); values
// outside this range are rejected with an error.
constexpr size_t min_block_size_bits{10};
constexpr size_t max_block_size_bits{30};
// Default option values associated with one compression level. Instances are
// aggregate-initialized positionally in the `levels` table, so the field
// order below must stay in sync with that table.
struct level_defaults {
// Default for --block-size-bits.
unsigned block_size_bits;
// Default shown for --compression (block data).
std::string_view data_compression;
// Default for both --schema-compression and --history-compression.
std::string_view schema_history_compression;
// Default for --metadata-compression.
std::string_view metadata_compression;
// Default shown for --window-size.
unsigned window_size;
// Default shown for --window-step.
unsigned window_step;
// Default shown for --order.
std::string_view order;
};
// Compression specs for the `levels` table, chosen at build time depending on
// which compression libraries are available (DWARFS_HAVE_LIB*). Each chain
// below lists the libraries in order of preference and falls back to "null"
// when none of them is available.
//
// Block data, levels 1-3: prefer lz4, then zstd, then lzma.
#if defined(DWARFS_HAVE_LIBLZ4)
#define ALG_DATA_1 "lz4"
#define ALG_DATA_2 "lz4hc:level=9"
#define ALG_DATA_3 "lz4hc:level=9"
#elif defined(DWARFS_HAVE_LIBZSTD)
#define ALG_DATA_1 "zstd:level=1"
#define ALG_DATA_2 "zstd:level=4"
#define ALG_DATA_3 "zstd:level=7"
#elif defined(DWARFS_HAVE_LIBLZMA)
#define ALG_DATA_1 "lzma:level=1"
#define ALG_DATA_2 "lzma:level=2"
#define ALG_DATA_3 "lzma:level=3"
#else
#define ALG_DATA_1 "null"
#define ALG_DATA_2 "null"
#define ALG_DATA_3 "null"
#endif
// Block data, levels 4-7: prefer zstd, then lzma, then lz4.
#if defined(DWARFS_HAVE_LIBZSTD)
#define ALG_DATA_4 "zstd:level=11"
#define ALG_DATA_5 "zstd:level=19"
#define ALG_DATA_6 "zstd:level=22"
#define ALG_DATA_7 "zstd:level=22"
#elif defined(DWARFS_HAVE_LIBLZMA)
#define ALG_DATA_4 "lzma:level=3"
#define ALG_DATA_5 "lzma:level=4"
#define ALG_DATA_6 "lzma:level=5"
#define ALG_DATA_7 "lzma:level=8"
#elif defined(DWARFS_HAVE_LIBLZ4)
#define ALG_DATA_4 "lz4hc:level=9"
#define ALG_DATA_5 "lz4hc:level=9"
#define ALG_DATA_6 "lz4hc:level=9"
#define ALG_DATA_7 "lz4hc:level=9"
#else
#define ALG_DATA_4 "null"
#define ALG_DATA_5 "null"
#define ALG_DATA_6 "null"
#define ALG_DATA_7 "null"
#endif
// Block data, levels 8-9: prefer lzma, then zstd, then lz4.
#if defined(DWARFS_HAVE_LIBLZMA)
#define ALG_DATA_8 "lzma:level=9"
#define ALG_DATA_9 "lzma:level=9"
#elif defined(DWARFS_HAVE_LIBZSTD)
#define ALG_DATA_8 "zstd:level=22"
#define ALG_DATA_9 "zstd:level=22"
#elif defined(DWARFS_HAVE_LIBLZ4)
#define ALG_DATA_8 "lz4hc:level=9"
#define ALG_DATA_9 "lz4hc:level=9"
#else
#define ALG_DATA_8 "null"
#define ALG_DATA_9 "null"
#endif
// Schema/history sections (levels 1-9): prefer zstd, then lzma, then lz4.
#if defined(DWARFS_HAVE_LIBZSTD)
#define ALG_SCHEMA "zstd:level=16"
#elif defined(DWARFS_HAVE_LIBLZMA)
#define ALG_SCHEMA "lzma:level=4"
#elif defined(DWARFS_HAVE_LIBLZ4)
#define ALG_SCHEMA "lz4hc:level=9"
#else
#define ALG_SCHEMA "null"
#endif
// Metadata, level 7: prefer zstd, then lzma, then lz4.
#if defined(DWARFS_HAVE_LIBZSTD)
#define ALG_METADATA_7 "zstd:level=22"
#elif defined(DWARFS_HAVE_LIBLZMA)
#define ALG_METADATA_7 "lzma:level=9"
#elif defined(DWARFS_HAVE_LIBLZ4)
#define ALG_METADATA_7 "lz4hc:level=9"
#else
#define ALG_METADATA_7 "null"
#endif
// Metadata, levels 8 and 9: prefer lzma, then zstd, then lz4.
#if defined(DWARFS_HAVE_LIBLZMA)
#define ALG_METADATA_9 "lzma:level=9"
#elif defined(DWARFS_HAVE_LIBZSTD)
#define ALG_METADATA_9 "zstd:level=22"
#elif defined(DWARFS_HAVE_LIBLZ4)
#define ALG_METADATA_9 "lz4hc:level=9"
#else
#define ALG_METADATA_9 "null"
#endif
// Per-level defaults, indexed by the compression level (0-9) given with
// -l/--compress-level. Columns follow the field order of level_defaults:
// block size bits, data compression, schema/history compression, metadata
// compression, window size, window step, fragment order.
// Metadata is left uncompressed ("null") up to and including level 6.
constexpr std::array<level_defaults, 10> levels{{
// clang-format off
/* 0 */ {20, "null", "null" , "null", 0, 0, "none"},
/* 1 */ {20, ALG_DATA_1, ALG_SCHEMA, "null", 0, 0, "path"},
/* 2 */ {20, ALG_DATA_2, ALG_SCHEMA, "null", 0, 0, "path"},
/* 3 */ {21, ALG_DATA_3, ALG_SCHEMA, "null", 12, 1, "similarity"},
/* 4 */ {22, ALG_DATA_4, ALG_SCHEMA, "null", 12, 2, "similarity"},
/* 5 */ {23, ALG_DATA_5, ALG_SCHEMA, "null", 12, 2, "similarity"},
/* 6 */ {24, ALG_DATA_6, ALG_SCHEMA, "null", 12, 3, "nilsimsa"},
/* 7 */ {24, ALG_DATA_7, ALG_SCHEMA, ALG_METADATA_7, 12, 3, "nilsimsa"},
/* 8 */ {24, ALG_DATA_8, ALG_SCHEMA, ALG_METADATA_9, 12, 4, "nilsimsa"},
/* 9 */ {26, ALG_DATA_9, ALG_SCHEMA, ALG_METADATA_9, 12, 4, "nilsimsa"},
// clang-format on
}};
// Maps an option name (e.g. "--compression") to the list of fallback
// "category::value" specs that apply to that option.
using categorize_defaults_type =
    std::unordered_map<std::string, std::vector<std::string>>;

// Returns the fallback specs that are independent of the compression level.
// The table is built once on first use and shared by all callers.
categorize_defaults_type const& categorize_defaults_common() {
  static categorize_defaults_type const common_table = [] {
    return categorize_defaults_type{
        {"--compression", {"incompressible::null"}},
    };
  }();
  return common_table;
}
// Returns the level-dependent fallback specs for the given compression level.
// Levels 0-4 share the "fast" table, 5-7 the "medium" table and 8-9 the
// "slow" table. Throws std::out_of_range (from std::array::at) when `level`
// is 10 or greater.
categorize_defaults_type const& categorize_defaults_level(unsigned level) {
  // All three tables have the same shape. They only differ in the pcmaudio
  // window size, the codec level, and whether pcmaudio fragments get an
  // explicit "revpath" order.
  auto const make_tier = [](std::string const& pcm_window,
                            std::string const& codec_level,
                            bool pcm_revpath) {
    std::vector<std::string> order_specs;
    if (pcm_revpath) {
      order_specs.emplace_back("pcmaudio/waveform::revpath");
    }
    order_specs.emplace_back("fits/image::revpath");

    // Use the specialised codecs when they were built in, zstd otherwise.
#ifdef DWARFS_HAVE_FLAC
    std::string pcm_codec = "pcmaudio/waveform::flac:level=" + codec_level;
#else
    std::string pcm_codec = "pcmaudio/waveform::zstd:level=" + codec_level;
#endif
#ifdef DWARFS_HAVE_RICEPP
    std::string fits_codec = "fits/image::ricepp";
#else
    std::string fits_codec = "fits/image::zstd:level=" + codec_level;
#endif

    return categorize_defaults_type{
        {"--order", order_specs},
        {"--window-size",
         {"pcmaudio/waveform::" + pcm_window, "fits/image::0"}},
        {"--compression", {pcm_codec, fits_codec}},
    };
  };

  static categorize_defaults_type const tier_fast = make_tier("0", "3", true);
  static categorize_defaults_type const tier_medium =
      make_tier("20", "5", true);
  static categorize_defaults_type const tier_slow =
      make_tier("16", "8", false);

  static constexpr std::array<categorize_defaults_type const*, 10>
      tier_by_level{{
          // clang-format off
          &tier_fast,   &tier_fast,   &tier_fast,   &tier_fast, &tier_fast, // 0-4
          &tier_medium, &tier_medium, &tier_medium,                         // 5-7
          &tier_slow,   &tier_slow,                                         // 8-9
          // clang-format on
      }};

  return *tier_by_level.at(level);
}
// Compression level used when -l/--compress-level is not given. It also
// selects the row of `levels` whose values are shown as defaults in the help.
constexpr unsigned default_level = 7;
// Value type of the --categorize option. Besides the option string it
// remembers whether the string was supplied explicitly and carries a table of
// per-option fallback specs that are only applied in the implicit case.
class categorize_optval {
 public:
  categorize_optval() = default;

  // `expl` tells whether `val` was given explicitly (as opposed to being the
  // option's implicit value).
  explicit categorize_optval(std::string const& val, bool expl = false)
      : value_{val}
      , is_explicit_{expl} {}

  std::string const& value() const { return value_; }

  bool empty() const { return value_.empty(); }

  bool is_explicit() const { return is_explicit_; }

  // True when a value is present but was not given explicitly.
  bool is_implicit_default() const { return !(is_explicit_ || empty()); }

  // Feeds every fallback spec registered under `cop.name()` to
  // `cop.parse_fallback()`. Does nothing unless is_implicit_default().
  template <typename T>
  void add_implicit_defaults(T& cop) const {
    if (!is_implicit_default()) {
      return;
    }

    auto const found = defaults_.find(cop.name());
    if (found == defaults_.end()) {
      return;
    }

    for (auto const& fallback : found->second) {
      cop.parse_fallback(fallback);
    }
  }

  // Appends the specs in `defaults` to the ones already stored, key by key.
  void add_defaults(categorize_defaults_type const& defaults) {
    for (auto const& entry : defaults) {
      auto& merged = defaults_[entry.first];
      std::copy(entry.second.begin(), entry.second.end(),
                std::back_inserter(merged));
    }
  }

 private:
  categorize_defaults_type defaults_;
  std::string value_;
  bool is_explicit_{false};
};
// Streams the option string, followed by " (explicit)" when the value was
// given explicitly.
std::ostream& operator<<(std::ostream& os, categorize_optval const& optval) {
  auto const& text = optval.value();
  char const* const suffix = optval.is_explicit() ? " (explicit)" : "";
  os << text << suffix;
  return os;
}
// Boost.Program_options validator hook for categorize_optval: rejects a
// repeated occurrence of the option and stores the single string token as an
// explicitly given value.
void validate(boost::any& v, std::vector<std::string> const& values,
              categorize_optval*, int) {
  po::validators::check_first_occurrence(v);

  auto const& token = po::validators::get_single_string(values);
  categorize_optval parsed{token, true};
  v = std::move(parsed);
}
} // namespace
int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
using namespace std::chrono_literals;
size_t const num_cpu = std::max(hardware_concurrency(), 1U);
static constexpr size_t const kDefaultMaxActiveBlocks{1};
static constexpr size_t const kDefaultBloomFilterSize{4};
writer::segmenter_factory::config sf_config;
sys_string path_str, input_list_str, output_str, header_str;
std::string memory_limit, schema_compression, metadata_compression, timestamp,
time_resolution, progress_mode, recompress_opts, pack_metadata,
file_hash_algo, debug_filter, max_similarity_size, chmod_str,
history_compression, recompress_categories;
std::vector<sys_string> filter;
std::vector<std::string> order, max_lookback_blocks, window_size, window_step,
bloom_filter_size, compression;
size_t num_workers, num_scanner_workers, num_segmenter_workers;
bool no_progress = false, remove_header = false, no_section_index = false,
force_overwrite = false, no_history = false,
no_history_timestamps = false, no_history_command_line = false;
unsigned level;
int compress_niceness;
uint16_t uid, gid;
categorize_optval categorizer_list;
integral_value_parser<size_t> max_lookback_parser;
integral_value_parser<unsigned> window_size_parser(0, 24);
integral_value_parser<unsigned> window_step_parser(0, 8);
integral_value_parser<unsigned> bloom_filter_size_parser(0, 10);
writer::fragment_order_parser order_parser;
block_compressor_parser compressor_parser;
writer::scanner_options options;
logger_options logopts;
auto order_desc = "inode fragments order (" +
dwarfs::writer::fragment_order_parser::choices() + ")";
auto progress_desc =
fmt::format("progress mode ({})",
fmt::join(ranges::views::keys(progress_modes), ", "));
auto debug_filter_desc =
fmt::format("show effect of filter rules without producing an image ({})",
fmt::join(ranges::views::keys(debug_filter_modes), ", "));
auto resolution_desc =
fmt::format("time resolution in seconds or ({})",
fmt::join(ranges::views::keys(time_resolutions), ", "));
auto hash_list = checksum::available_algorithms();
auto file_hash_desc = fmt::format(
"choice of file hashing function (none, {})", fmt::join(hash_list, ", "));
auto& catreg = writer::categorizer_registry::instance();
auto categorize_desc =
fmt::format("enable categorizers in the given order ({})",
fmt::join(catreg.categorizer_names(), ", "));
auto lvl_def_val = [](auto opt) {
return fmt::format("arg (={})", levels[default_level].*opt);
};
auto dep_def_val = [](auto dep) { return fmt::format("arg (={})", dep); };
auto cat_def_val = [](auto def) {
return fmt::format("[cat::]arg (={})", def);
};
auto lvl_cat_def_val = [](auto opt) {
return fmt::format("[cat::]arg (={})", levels[default_level].*opt);
};
// clang-format off
po::options_description basic_opts("Options");
basic_opts.add_options()
("input,i",
po_sys_value<sys_string>(&path_str),
"path to root directory or source filesystem")
("input-list",
po_sys_value<sys_string>(&input_list_str),
"file containing list of file paths relative to root directory "
"or - for stdin")
("output,o",
po_sys_value<sys_string>(&output_str),
"filesystem output name or - for stdout")
("force,f",
po::value<bool>(&force_overwrite)->zero_tokens(),
"force overwrite of existing output image")
("compress-level,l",
po::value<unsigned>(&level)->default_value(default_level),
"compression level (0=fast, 9=best, please see man page for details)")
;
tool::add_common_options(basic_opts, logopts);
basic_opts.add_options()
("long-help,H",
"output full help message and exit")
;
po::options_description advanced_opts("Advanced options");
advanced_opts.add_options()
("block-size-bits,S",
po::value<unsigned>(&sf_config.block_size_bits)
->value_name(lvl_def_val(&level_defaults::block_size_bits)),
"block size bits (size = 2^arg bits)")
("num-workers,N",
po::value<size_t>(&num_workers)->default_value(num_cpu),
"number of writer (compression) worker threads")
("compress-niceness",
po::value<int>(&compress_niceness)->default_value(5),
"compression worker threads niceness")
("num-scanner-workers",
po::value<size_t>(&num_scanner_workers)
->value_name(dep_def_val("num-workers")),
"number of scanner (hasher/categorizer) worker threads")
("num-segmenter-workers",
po::value<size_t>(&num_segmenter_workers)
->value_name(dep_def_val("num-workers")),
"number of segmenter worker threads")
("memory-limit,L",
po::value<std::string>(&memory_limit)->default_value("1g"),
"block manager memory limit")
("recompress",
po::value<std::string>(&recompress_opts)->implicit_value("all"),
"recompress an existing filesystem (none, block, metadata, all)")
("recompress-categories",
po::value<std::string>(&recompress_categories),
"only recompress blocks of these categories")
("categorize",
po::value<categorize_optval>(&categorizer_list)
->implicit_value(categorize_optval("fits,pcmaudio,incompressible")),
categorize_desc.c_str())
("order",
po::value<std::vector<std::string>>(&order)
->value_name(lvl_cat_def_val(&level_defaults::order))
->multitoken()->composing(),
order_desc.c_str())
("max-similarity-size",
po::value<std::string>(&max_similarity_size),
"maximum file size to compute similarity")
("file-hash",
po::value<std::string>(&file_hash_algo)->default_value("xxh3-128"),
file_hash_desc.c_str())
("progress",
po::value<std::string>(&progress_mode)->default_value(default_progress_mode),
progress_desc.c_str())
("no-progress",
po::value<bool>(&no_progress)->zero_tokens(),
"don't show progress")
;
po::options_description filesystem_opts("File system options");
filesystem_opts.add_options()
("with-devices",
po::value<bool>(&options.with_devices)->zero_tokens(),
"include block and character devices")
("with-specials",
po::value<bool>(&options.with_specials)->zero_tokens(),
"include named fifo and sockets")
("header",
po_sys_value<sys_string>(&header_str),
"prepend output filesystem with contents of this file")
("remove-header",
po::value<bool>(&remove_header)->zero_tokens(),
"remove any header present before filesystem data"
" (use with --recompress)")
("no-section-index",
po::value<bool>(&no_section_index)->zero_tokens(),
"don't add section index to file system")
("no-history",
po::value<bool>(&no_history)->zero_tokens(),
"don't add history to file system")
("no-history-timestamps",
po::value<bool>(&no_history_timestamps)->zero_tokens(),
"don't add timestamps to file system history")
("no-history-command-line",
po::value<bool>(&no_history_command_line)->zero_tokens(),
"don't add command line to file system history")
;
po::options_description segmenter_opts("Segmenter options");
segmenter_opts.add_options()
("max-lookback-blocks,B",
po::value<std::vector<std::string>>(&max_lookback_blocks)
->value_name(cat_def_val(kDefaultMaxActiveBlocks))
->multitoken()->composing(),
"how many blocks to scan for segments")
("window-size,W",
po::value<std::vector<std::string>>(&window_size)
->value_name(lvl_cat_def_val(&level_defaults::window_size))
->multitoken()->composing(),
"window sizes for block hashing")
("window-step,w",
po::value<std::vector<std::string>>(&window_step)
->value_name(lvl_cat_def_val(&level_defaults::window_step))
->multitoken()->composing(),
"window step (as right shift of size)")
("bloom-filter-size",
po::value<std::vector<std::string>>(&bloom_filter_size)
->value_name(cat_def_val(kDefaultBloomFilterSize))
->multitoken()->composing(),
"bloom filter size (2^N*values bits)")
;
po::options_description compressor_opts("Compressor options");
compressor_opts.add_options()
("compression,C",
po::value<std::vector<std::string>>(&compression)
->value_name(lvl_cat_def_val(&level_defaults::data_compression))
->multitoken()->composing(),
"block compression algorithm")
("schema-compression",
po::value<std::string>(&schema_compression)
->value_name(lvl_def_val(&level_defaults::schema_history_compression)),
"metadata schema compression algorithm")
("metadata-compression",
po::value<std::string>(&metadata_compression)
->value_name(lvl_def_val(&level_defaults::metadata_compression)),
"metadata compression algorithm")
("history-compression",
po::value<std::string>(&history_compression)
->value_name(lvl_def_val(&level_defaults::schema_history_compression)),
"history compression algorithm")
;
po::options_description filter_opts("Filter options");
filter_opts.add_options()
("filter,F",
po_sys_value<std::vector<sys_string>>(&filter)
->multitoken()->composing(),
"add filter rule")
("debug-filter",
po::value<std::string>(&debug_filter)->implicit_value("all"),
debug_filter_desc.c_str())
("remove-empty-dirs",
po::value<bool>(&options.remove_empty_dirs)->zero_tokens(),
"remove empty directories in file system")
;
po::options_description metadata_opts("Metadata options");
metadata_opts.add_options()
("set-owner",
po::value<uint16_t>(&uid),
"set owner (uid) for whole file system")
("set-group",
po::value<uint16_t>(&gid),
"set group (gid) for whole file system")
("chmod",
po::value<std::string>(&chmod_str),
"recursively apply permission changes")
("no-create-timestamp",
po::value<bool>(&options.no_create_timestamp)->zero_tokens(),
"don't add create timestamp to file system")
("set-time",
po::value<std::string>(&timestamp),
"set timestamp for whole file system (unixtime or 'now')")
("keep-all-times",
po::value<bool>(&options.keep_all_times)->zero_tokens(),
"save atime and ctime in addition to mtime")
("time-resolution",
po::value<std::string>(&time_resolution)->default_value("sec"),
resolution_desc.c_str())
("pack-metadata,P",
po::value<std::string>(&pack_metadata)->default_value("auto"),
"pack certain metadata elements (auto, all, none, chunk_table, "
"directories, shared_files, names, names_index, symlinks, "
"symlinks_index, force, plain)")
;
// clang-format on
po::options_description opts;
opts.add(basic_opts)
.add(advanced_opts)
.add(filter_opts)
.add(segmenter_opts)
.add(compressor_opts)
.add(filesystem_opts)
.add(metadata_opts);
catreg.add_options(opts);
po::variables_map vm;
std::vector<std::string> command_line;
command_line.reserve(argc);
for (int i = 0; i < argc; ++i) {
command_line.emplace_back(sys_string_to_string(argv[i]));
}
try {
auto parsed = po::parse_command_line(argc, argv, opts);
po::store(parsed, vm);
po::notify(vm);
auto unrecognized =
po::collect_unrecognized(parsed.options, po::include_positional);
if (!unrecognized.empty()) {
iol.err << "error: unrecognized argument(s) '"
<< sys_string_to_string(boost::join(unrecognized, " ")) << "'\n";
return 1;
}
} catch (po::error const& e) {
iol.err << "error: " << e.what() << "\n";
return 1;
}
#ifdef DWARFS_BUILTIN_MANPAGE
if (vm.contains("man")) {
tool::show_manpage(tool::manpage::get_mkdwarfs_manpage(), iol);
return 0;
}
#endif
auto constexpr usage = "Usage: mkdwarfs [OPTIONS...]\n";
auto extra_deps = [](library_dependencies& deps) {
compressor_registry::instance().add_library_dependencies(deps);
decompressor_registry::instance().add_library_dependencies(deps);
};
if (vm.contains("long-help")) {
constexpr std::string_view block_data_hdr{"Block Data"};
constexpr std::string_view schema_history_hdr{"Schema/History"};
constexpr std::string_view metadata_hdr{"Metadata"};
size_t l_dc{block_data_hdr.size()}, l_sc{schema_history_hdr.size()},
l_mc{metadata_hdr.size()}, l_or{0};
for (auto const& l : levels) {
l_dc = std::max(l_dc, l.data_compression.size());
l_sc = std::max(l_sc, l.schema_history_compression.size());
l_mc = std::max(l_mc, l.metadata_compression.size());
l_or = std::max(l_or, l.order.size());
}
std::string sep(30 + l_dc + l_sc + l_mc + l_or, '-');
iol.out << tool::tool_header("mkdwarfs", extra_deps) << usage << opts
<< "\n"
<< "Compression level defaults:\n"
<< " " << sep << "\n"
<< fmt::format(" Level Block {:{}s} {:s} Inode\n",
"Compression Algorithm", 4 + l_dc + l_sc + l_mc,
"Window")
<< fmt::format(" Size {:{}s} {:{}s} {:{}s} {:6s}\n",
block_data_hdr, l_dc, schema_history_hdr, l_sc,
metadata_hdr, l_mc, "Size/Step Order")
<< " " << sep << "\n";
for (auto const& [i, l] : ranges::views::enumerate(levels)) {
iol.out << fmt::format(" {:1d} {:2d} {:{}s} {:{}s} {:{}s}"
" {:2d} / {:1d} {:{}s}",
i, l.block_size_bits, l.data_compression, l_dc,
l.schema_history_compression, l_sc,
l.metadata_compression, l_mc, l.window_size,
l.window_step, l.order, l_or)
<< "\n";
}
iol.out << " " << sep << "\n";
iol.out << "\nCompression algorithms:\n";
compressor_registry::instance().for_each_algorithm(
[&iol](compression_type, compressor_info const& info) {
iol.out << fmt::format(" {:9}{}\n", info.name(), info.description());
for (auto const& opt : info.options()) {
iol.out << fmt::format(" {}\n", opt);
}
});
iol.out << "\nCategories:\n";
for (auto const& name : catreg.categorizer_names()) {
stream_logger lgr(iol.term, iol.err);
auto categorizer = catreg.create(lgr, name, vm);
iol.out << " [" << name << "]\n";
for (auto cat : categorizer->categories()) {
iol.out << " " << cat << "\n";
}
}
iol.out << "\n";
return 0;
}
if (vm.contains("help") or
!(vm.contains("input") or vm.contains("input-list")) or
(!vm.contains("output") and !vm.contains("debug-filter"))) {
iol.out << tool::tool_header("mkdwarfs", extra_deps) << usage << "\n"
<< basic_opts << "\n";
return 0;
}
if (level >= levels.size()) {
iol.err << "error: invalid compression level\n";
return 1;
}
auto const& defaults = levels[level];
categorizer_list.add_defaults(categorize_defaults_common());
categorizer_list.add_defaults(categorize_defaults_level(level));
if (!vm.contains("block-size-bits")) {
sf_config.block_size_bits = defaults.block_size_bits;
}
if (!vm.contains("schema-compression")) {
schema_compression = defaults.schema_history_compression;
}
if (!vm.contains("history-compression")) {
history_compression = defaults.schema_history_compression;
}
if (!vm.contains("metadata-compression")) {
metadata_compression = defaults.metadata_compression;
}
if (sf_config.block_size_bits < min_block_size_bits ||
sf_config.block_size_bits > max_block_size_bits) {
iol.err << "error: block size must be between " << min_block_size_bits
<< " and " << max_block_size_bits << "\n";
return 1;
}
std::filesystem::path path(path_str);
std::optional<std::vector<std::filesystem::path>> input_list;
if (vm.contains("input-list")) {
if (vm.contains("filter")) {
iol.err << "error: cannot combine --input-list and --filter\n";
return 1;
}
// implicitly turn on
options.with_devices = true;
options.with_specials = true;
if (!vm.contains("input")) {
path = iol.os->current_path();
}
std::filesystem::path input_list_path(input_list_str);
std::unique_ptr<input_stream> ifs;
std::istream* is;
if (input_list_path == "-") {
is = &iol.in;
} else {
std::error_code ec;
ifs = iol.file->open_input(input_list_path, ec);
if (ec) {
iol.err << "cannot open input list file '" << input_list_path
<< "': " << ec.message() << "\n";
return 1;
}
is = &ifs->is();
}
std::string line;
input_list.emplace();
while (std::getline(*is, line)) {
std::filesystem::path p(line);
if (p.has_root_directory()) {
p = iol.os->canonical(p);
}
input_list->emplace_back(std::move(p));
}
}
path = iol.os->canonical(path);
bool recompress = vm.contains("recompress");
utility::rewrite_options rw_opts;
if (recompress) {
std::unordered_map<std::string, unsigned> const modes{
{"all", 3},
{"metadata", 2},
{"block", 1},
{"none", 0},
};
if (auto it = modes.find(recompress_opts); it != modes.end()) {
rw_opts.recompress_block = it->second & 1;
rw_opts.recompress_metadata = it->second & 2;
} else {
iol.err << "invalid recompress mode: " << recompress_opts << "\n";
return 1;
}
if (!recompress_categories.empty()) {
std::string_view input = recompress_categories;
if (input.front() == '!') {
rw_opts.recompress_categories_exclude = true;
input.remove_prefix(1);
}
rw_opts.recompress_categories =
split_to<std::unordered_set<std::string>>(input, ',');
}
}
if (file_hash_algo == "none") {
options.file_hash_algorithm.reset();
} else if (checksum::is_available(file_hash_algo)) {
options.file_hash_algorithm = file_hash_algo;
} else {
iol.err << "error: unknown file hash function '" << file_hash_algo << "'\n";
return 1;
}
if (vm.contains("max-similarity-size")) {
auto size = parse_size_with_unit(max_similarity_size);
if (size > 0) {
options.inode.max_similarity_scan_size = size;
}
}
size_t mem_limit = parse_size_with_unit(memory_limit);
if (!vm.contains("num-scanner-workers")) {
num_scanner_workers = num_workers;
}
if (!vm.contains("num-segmenter-workers")) {
num_segmenter_workers = num_workers;
}
options.num_segmenter_workers = num_segmenter_workers;
if (vm.contains("debug-filter")) {
if (auto it = debug_filter_modes.find(debug_filter);
it != debug_filter_modes.end()) {
options.debug_filter_function =
[&iol, mode = it->second](bool exclude,
writer::entry_interface const& ei) {
debug_filter_output(iol.out, exclude, ei, mode);
};
no_progress = true;
} else {
iol.err << "error: invalid filter debug mode '" << debug_filter << "'\n";
return 1;
}
}
if (!progress_modes.contains(progress_mode)) {
iol.err << "error: invalid progress mode '" << progress_mode << "'\n";
return 1;
}
if (no_progress) {
progress_mode = "none";
}
if (progress_mode != "none" && !iol.term->is_tty(iol.err)) {
progress_mode = "simple";
}
auto pg_mode = DWARFS_NOTHROW(progress_modes.at(progress_mode));
writer::console_writer lgr(iol.term, iol.err, pg_mode,
recompress ? writer::console_writer::REWRITE
: writer::console_writer::NORMAL,
logopts);
std::unique_ptr<writer::rule_based_entry_filter> rule_filter;
if (!filter.empty()) {
rule_filter =
std::make_unique<writer::rule_based_entry_filter>(lgr, iol.file);
rule_filter->set_root_path(path);
for (auto const& rule : filter) {
auto srule = sys_string_to_string(rule);
try {
rule_filter->add_rule(srule);
} catch (std::exception const& e) {
iol.err << "error: could not parse filter rule '" << srule
<< "': " << e.what() << "\n";
return 1;
}
}
}
std::vector<std::unique_ptr<writer::entry_transformer>> transformers;
if (vm.contains("chmod")) {
if (chmod_str == "norm") {
chmod_str = "ug-st,=Xr";
}
auto chmod_exprs = split_to<std::vector<std::string_view>>(chmod_str, ',');
auto mask = get_current_umask();
for (auto expr : chmod_exprs) {
transformers.push_back(
writer::create_chmod_entry_transformer(expr, mask));
}
}
if (vm.contains("set-owner")) {
options.uid = uid;
}
if (vm.contains("set-group")) {
options.gid = gid;
}
if (vm.contains("set-time")) {
if (timestamp == "now") {
options.timestamp = std::time(nullptr);
} else if (auto val = try_to<uint64_t>(timestamp)) {
options.timestamp = val;
} else {
try {
auto tp = parse_time_point(timestamp);
options.timestamp = std::chrono::duration_cast<std::chrono::seconds>(
tp.time_since_epoch())
.count();
} catch (std::exception const& e) {
iol.err << "error: " << e.what() << "\n";
return 1;
}
}
}
if (auto it = time_resolutions.find(time_resolution);
it != time_resolutions.end()) {
options.time_resolution_sec = it->second;
} else if (auto val = try_to<uint32_t>(time_resolution)) {
options.time_resolution_sec = *val;
if (options.time_resolution_sec == 0) {
iol.err << "error: the argument to '--time-resolution' must be nonzero\n";
return 1;
}
} else {
iol.err << "error: the argument ('" << time_resolution
<< "') to '--time-resolution' is invalid\n";
return 1;
}
if (!pack_metadata.empty() and pack_metadata != "none") {
if (pack_metadata == "auto") {
options.force_pack_string_tables = false;
options.pack_chunk_table = false;
options.pack_directories = false;
options.pack_shared_files_table = false;
options.pack_names = true;
options.pack_names_index = false;
options.pack_symlinks = true;
options.pack_symlinks_index = false;
} else {
auto pack_opts =
split_to<std::vector<std::string_view>>(pack_metadata, ',');
for (auto const& opt : pack_opts) {
if (opt == "chunk_table") {
options.pack_chunk_table = true;
} else if (opt == "directories") {
options.pack_directories = true;
} else if (opt == "shared_files") {
options.pack_shared_files_table = true;
} else if (opt == "names") {
options.pack_names = true;
} else if (opt == "names_index") {
options.pack_names_index = true;
} else if (opt == "symlinks") {
options.pack_symlinks = true;
} else if (opt == "symlinks_index") {
options.pack_symlinks_index = true;
} else if (opt == "force") {
options.force_pack_string_tables = true;
} else if (opt == "plain") {
options.plain_names_table = true;
options.plain_symlinks_table = true;
} else if (opt == "all") {
options.pack_chunk_table = true;
options.pack_directories = true;
options.pack_shared_files_table = true;
options.pack_names = true;
options.pack_names_index = true;
options.pack_symlinks = true;
options.pack_symlinks_index = true;
} else {
iol.err << "error: the argument ('" << opt
<< "') to '--pack-metadata' is invalid\n";
return 1;
}
}
}
}
auto interval = pg_mode == writer::console_writer::NONE ||
pg_mode == writer::console_writer::SIMPLE
? 2000ms
: 200ms;
writer::filesystem_writer_options fswopts;
fswopts.max_queue_size = mem_limit;
fswopts.worst_case_block_size = UINT64_C(1) << sf_config.block_size_bits;
fswopts.remove_header = remove_header;
fswopts.no_section_index = no_section_index;
std::unique_ptr<input_stream> header_ifs;
if (!header_str.empty()) {
std::filesystem::path header(header_str);
std::error_code ec;
header_ifs = iol.file->open_input_binary(header, ec);
if (ec) {
iol.err << "error: cannot open header file '" << header
<< "': " << ec.message() << "\n";
return 1;
}
}
LOG_PROXY(debug_logger_policy, lgr);
writer::writer_progress::update_function_type updater;
if (options.debug_filter_function) {
updater = [](writer::writer_progress&, bool) {};
} else {
updater = [&](writer::writer_progress& p, bool last) {
lgr.update(p, last);
};
}
writer::writer_progress prog(std::move(updater), interval);
// No more streaming to iol.err after this point as this would
// cause a race with the progress thread.
auto min_memory_req =
num_workers * (UINT64_C(1) << sf_config.block_size_bits);
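// TODO: decide whether to skip this warning when compression is "null" (see the disabled check in the condition below)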
if (mem_limit < min_memory_req /* && compression != "null" */) {
LOG_WARN << "low memory limit (" << size_with_unit(mem_limit) << "), need "
<< size_with_unit(min_memory_req) << " to efficiently compress "
<< size_with_unit(UINT64_C(1) << sf_config.block_size_bits)
<< " blocks with " << num_workers << " threads";
}
std::filesystem::path output(output_str);
std::variant<std::monostate, std::unique_ptr<output_stream>,
std::ostringstream>
os;
if (!options.debug_filter_function) {
if (output != "-") {
if (iol.file->exists(output) && !force_overwrite) {
LOG_ERROR << "output file already exists, use --force to overwrite";
return 1;
}
std::error_code ec;
auto stream = iol.file->open_output_binary(output, ec);
if (ec) {
LOG_ERROR << "cannot open output file '" << output
<< "': " << ec.message();
return 1;
}
assert(stream);
os.emplace<std::unique_ptr<output_stream>>(std::move(stream));
} else {
ensure_binary_mode(iol.out);
}
} else {
os.emplace<std::ostringstream>();
}
options.enable_history = !no_history;
rw_opts.enable_history = !no_history;
if (options.enable_history) {
options.history.with_timestamps = !no_history_timestamps;
rw_opts.history.with_timestamps = !no_history_timestamps;
if (!no_history_command_line) {
options.command_line_arguments = command_line;
rw_opts.command_line_arguments = command_line;
}
}
if (!categorizer_list.empty()) {
auto categorizers =
split_to<std::vector<std::string>>(categorizer_list.value(), ',');
options.inode.categorizer_mgr =
std::make_shared<writer::categorizer_manager>(lgr);
for (auto const& name : categorizers) {
options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm));
}
}
std::optional<reader::filesystem_v2> input_filesystem;
std::shared_ptr<writer::category_resolver> cat_resolver;
if (recompress) {
input_filesystem.emplace(
lgr, *iol.os, path,
reader::filesystem_options{
.image_offset = reader::filesystem_options::IMAGE_OFFSET_AUTO});
LOG_INFO << "checking input filesystem...";
{
auto tv = LOG_TIMED_VERBOSE;
if (auto num_errors =
input_filesystem->check(reader::filesystem_check_level::CHECKSUM);
num_errors != 0) {
LOG_ERROR << "input filesystem is corrupt: detected " << num_errors
<< " error(s)";
return 1;
}
tv << "checked input filesystem";
}
cat_resolver = std::make_shared<writer::filesystem_block_category_resolver>(
input_filesystem->get_all_block_categories());
for (auto const& cat : rw_opts.recompress_categories) {
if (!cat_resolver->category_value(cat)) {
LOG_ERROR << "no category '" << cat << "' in input filesystem";
return 1;
}
}
} else {
cat_resolver = options.inode.categorizer_mgr;
}
writer::category_parser cp(cat_resolver);
try {
{
writer::contextual_option_parser cop(
"--order", options.inode.fragment_order, cp, order_parser);
cop.parse(defaults.order);
cop.parse(order);
categorizer_list.add_implicit_defaults(cop);
LOG_VERBOSE << cop.as_string();
}
{
writer::contextual_option_parser cop("--max-lookback-blocks",
sf_config.max_active_blocks, cp,
max_lookback_parser);
sf_config.max_active_blocks.set_default(kDefaultMaxActiveBlocks);
cop.parse(max_lookback_blocks);
categorizer_list.add_implicit_defaults(cop);
LOG_VERBOSE << cop.as_string();
}
{
writer::contextual_option_parser cop("--window-size",
sf_config.blockhash_window_size, cp,
window_size_parser);
sf_config.blockhash_window_size.set_default(defaults.window_size);
cop.parse(window_size);
categorizer_list.add_implicit_defaults(cop);
LOG_VERBOSE << cop.as_string();
}
{
writer::contextual_option_parser cop("--window-step",
sf_config.window_increment_shift, cp,
window_step_parser);
sf_config.window_increment_shift.set_default(defaults.window_step);
cop.parse(window_step);
categorizer_list.add_implicit_defaults(cop);
LOG_VERBOSE << cop.as_string();
}
{
writer::contextual_option_parser cop("--bloom-filter-size",
sf_config.bloom_filter_size, cp,
bloom_filter_size_parser);
sf_config.bloom_filter_size.set_default(kDefaultBloomFilterSize);
cop.parse(bloom_filter_size);
categorizer_list.add_implicit_defaults(cop);
LOG_VERBOSE << cop.as_string();
}
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
block_compressor schema_bc(schema_compression);
block_compressor metadata_bc(metadata_compression);
block_compressor history_bc(history_compression);
thread_pool compress_pool(lgr, *iol.os, "compress", num_workers,
std::numeric_limits<size_t>::max(),
compress_niceness);
std::optional<writer::filesystem_writer> fsw;
try {
std::ostream& fsw_os =
os |
match{[&](std::monostate) -> std::ostream& { return iol.out; },
[&](std::unique_ptr<output_stream>& os) -> std::ostream& {
return os->os();
},
[&](std::ostringstream& oss) -> std::ostream& { return oss; }};
fsw.emplace(fsw_os, lgr, compress_pool, prog, fswopts,
header_ifs ? &header_ifs->is() : nullptr);
fsw->add_section_compressor(section_type::METADATA_V2_SCHEMA, schema_bc);
fsw->add_section_compressor(section_type::METADATA_V2, metadata_bc);
fsw->add_section_compressor(section_type::HISTORY, history_bc);
writer::categorized_option<block_compressor> compression_opt;
writer::contextual_option_parser cop("--compression", compression_opt, cp,
compressor_parser);
compression_opt.set_default(
block_compressor(std::string(defaults.data_compression)));
cop.parse(compression);
categorizer_list.add_implicit_defaults(cop);
LOG_VERBOSE << cop.as_string();
{
auto bc = compression_opt.get();
if (!bc.metadata_requirements().empty()) {
throw std::runtime_error(
fmt::format("compression '{}' cannot be used without a category: "
"metadata requirements not met",
bc.describe()));
}
fsw->add_default_compressor(std::move(bc));
}
if (recompress) {
compression_opt.visit_contextual(
[&fsw](auto cat, block_compressor const& bc) {
fsw->add_category_compressor(cat, bc);
});
} else {
compression_opt.visit_contextual([catmgr = options.inode.categorizer_mgr,
&fsw](auto cat,
block_compressor const& bc) {
try {
catmgr->set_metadata_requirements(cat, bc.metadata_requirements());
fsw->add_category_compressor(cat, bc);
} catch (std::exception const& e) {
throw std::runtime_error(
fmt::format("compression '{}' cannot be used for category '{}': "
"metadata requirements not met ({})",
bc.describe(), catmgr->category_name(cat), e.what()));
}
});
}
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
auto ti = LOG_TIMED_INFO;
try {
if (recompress) {
utility::rewrite_filesystem(lgr, *input_filesystem, *fsw, *cat_resolver,
rw_opts);
} else {
writer::segmenter_factory sf(lgr, prog, options.inode.categorizer_mgr,
sf_config);
writer::entry_factory ef;
thread_pool scanner_pool(lgr, *iol.os, "scanner", num_scanner_workers);
writer::scanner s(lgr, scanner_pool, sf, ef, *iol.os, options);
if (rule_filter) {
s.add_filter(std::move(rule_filter));
}
for (auto& t : transformers) {
s.add_transformer(std::move(t));
}
s.scan(*fsw, path, prog, input_list, iol.file);
options.inode.categorizer_mgr.reset();
}
} catch (std::exception const& e) {
LOG_ERROR << exception_str(e);
return 1;
}
if (!options.debug_filter_function) {
std::error_code ec;
auto cpu_time = compress_pool.get_cpu_time(ec);
if (ec) {
LOG_WARN << "could not measure CPU time: " << ec.message();
} else {
LOG_INFO << "compression CPU time: " << time_with_unit(cpu_time);
}
}
{
auto ec = os | match{[](std::monostate) -> int { return 0; },
[&](std::unique_ptr<output_stream>& os) -> int {
std::error_code ec;
os->close(ec);
if (ec) {
LOG_ERROR << "failed to close output file '"
<< output << "': " << ec.message();
return 1;
}
os.reset();
return 0;
},
[](std::ostringstream& oss [[maybe_unused]]) -> int {
assert(oss.str().empty());
return 0;
}};
if (ec != 0) {
return ec;
}
}
auto errors = prog.errors();
if (!options.debug_filter_function) {
std::ostringstream err;
if (errors) {
err << "with " << errors << " error";
if (errors > 1) {
err << "s";
}
} else {
err << "without errors";
}
ti << "filesystem " << (recompress ? "rewritten " : "created ")
<< err.str();
}
return errors > 0 ? 2 : 0;
}
} // namespace dwarfs::tool