dwarfs/src/mkdwarfs_main.cpp
2023-12-30 12:22:35 +01:00

1339 lines
43 KiB
C++

/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include <array>
#include <cerrno>
#include <cstdio>
#include <ctime>
#include <filesystem>
#include <iostream>
#include <iterator>
#include <map>
#include <memory>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <thread>
#include <utility>
#include <variant>
#include <vector>
#ifdef _WIN32
#include <io.h>
#endif
#include <boost/algorithm/string.hpp>
#include <boost/program_options.hpp>
#include <folly/Conv.h>
#include <folly/FileUtil.h>
#include <folly/gen/String.h>
#include <folly/portability/SysStat.h>
#include <folly/system/HardwareConcurrency.h>
#include <fmt/format.h>
#include "dwarfs/block_compressor.h"
#include "dwarfs/block_compressor_parser.h"
#include "dwarfs/builtin_script.h"
#include "dwarfs/categorizer.h"
#include "dwarfs/category_parser.h"
#include "dwarfs/chmod_entry_transformer.h"
#include "dwarfs/console_writer.h"
#include "dwarfs/entry.h"
#include "dwarfs/error.h"
#include "dwarfs/file_access.h"
#include "dwarfs/filesystem_block_category_resolver.h"
#include "dwarfs/filesystem_v2.h"
#include "dwarfs/filesystem_writer.h"
#include "dwarfs/filter_debug.h"
#include "dwarfs/fragment_order_parser.h"
#include "dwarfs/integral_value_parser.h"
#include "dwarfs/iolayer.h"
#include "dwarfs/logger.h"
#include "dwarfs/mmap.h"
#include "dwarfs/options.h"
#include "dwarfs/options_interface.h"
#include "dwarfs/os_access.h"
#include "dwarfs/overloaded.h"
#include "dwarfs/program_options_helpers.h"
#include "dwarfs/progress.h"
#include "dwarfs/scanner.h"
#include "dwarfs/script.h"
#include "dwarfs/segmenter_factory.h"
#include "dwarfs/terminal.h"
#include "dwarfs/tool.h"
#include "dwarfs/util.h"
#include "dwarfs_tool_main.h"
namespace po = boost::program_options;
namespace dwarfs {
namespace {
// Maps the names accepted by the `--progress` option to the matching
// console_writer progress mode. The keys are also joined into the option's
// help text and are used to validate the requested mode.
const std::map<std::string, console_writer::progress_mode> progress_modes{
{"none", console_writer::NONE},
{"simple", console_writer::SIMPLE},
{"ascii", console_writer::ASCII},
#ifndef _WIN32
// The "unicode" mode is not offered when building for Windows.
{"unicode", console_writer::UNICODE},
#endif
};
// Default argument of the `--progress` option: "ascii" when building for
// Windows, "unicode" everywhere else.
#ifdef _WIN32
const std::string default_progress_mode{"ascii"};
#else
const std::string default_progress_mode{"unicode"};
#endif
// Maps the names accepted by the `--debug-filter` option to the matching
// debug_filter_mode. The keys are also joined into the option's help text.
const std::map<std::string, debug_filter_mode> debug_filter_modes{
{"included", debug_filter_mode::INCLUDED},
{"included-files", debug_filter_mode::INCLUDED_FILES},
{"excluded", debug_filter_mode::EXCLUDED},
{"excluded-files", debug_filter_mode::EXCLUDED_FILES},
{"files", debug_filter_mode::FILES},
{"all", debug_filter_mode::ALL},
};
// Symbolic names accepted by `--time-resolution`, each mapped to the
// equivalent number of seconds. Arguments that are not listed here are
// parsed as a plain number of seconds by the caller.
const std::map<std::string, uint32_t> time_resolutions = {
    {"sec", 1},
    {"min", 60},
    {"hour", 60 * 60},
    {"day", 24 * 60 * 60},
};
// Inclusive bounds for the block size exponent (`--block-size-bits`); values
// outside of this range are rejected with an error.
constexpr size_t min_block_size_bits = 10;
constexpr size_t max_block_size_bits = 30;
// Default settings tied to a single compression level; one instance per row
// of the `levels` table. The member order must match the column order used
// by the initializers of that table.
struct level_defaults {
// Used for `--block-size-bits` when the option is not given.
unsigned block_size_bits;
// Block data compression; listed in the "Block Data" column of the
// `--long-help` output.
std::string_view data_compression;
// Used for both `--schema-compression` and `--history-compression` when the
// respective option is not given.
std::string_view schema_history_compression;
// Used for `--metadata-compression` when the option is not given.
std::string_view metadata_compression;
// Listed in the "Window Size/Step" columns of the `--long-help` output.
// NOTE(review): presumably the defaults for `--window-size` and
// `--window-step`; they are applied outside of this view -- confirm.
unsigned window_size;
unsigned window_step;
// Inode fragment order; parsed for `--order` before any user-supplied
// `--order` arguments.
std::string_view order;
};
// Block data compression for levels 1-3. The first library whose
// DWARFS_HAVE_* macro is defined wins, in the order lz4, zstd, lzma; "null"
// is used if none of them is defined.
#if defined(DWARFS_HAVE_LIBLZ4)
#define ALG_DATA_1 "lz4"
#define ALG_DATA_2 "lz4hc:level=9"
#define ALG_DATA_3 "lz4hc:level=9"
#elif defined(DWARFS_HAVE_LIBZSTD)
#define ALG_DATA_1 "zstd:level=1"
#define ALG_DATA_2 "zstd:level=4"
#define ALG_DATA_3 "zstd:level=7"
#elif defined(DWARFS_HAVE_LIBLZMA)
#define ALG_DATA_1 "lzma:level=1"
#define ALG_DATA_2 "lzma:level=2"
#define ALG_DATA_3 "lzma:level=3"
#else
#define ALG_DATA_1 "null"
#define ALG_DATA_2 "null"
#define ALG_DATA_3 "null"
#endif
// Block data compression for levels 4-7. The first library whose
// DWARFS_HAVE_* macro is defined wins, in the order zstd, lzma, lz4; "null"
// is used if none of them is defined.
#if defined(DWARFS_HAVE_LIBZSTD)
#define ALG_DATA_4 "zstd:level=11"
#define ALG_DATA_5 "zstd:level=19"
#define ALG_DATA_6 "zstd:level=22"
#define ALG_DATA_7 "zstd:level=22"
#elif defined(DWARFS_HAVE_LIBLZMA)
#define ALG_DATA_4 "lzma:level=3"
#define ALG_DATA_5 "lzma:level=4"
#define ALG_DATA_6 "lzma:level=5"
#define ALG_DATA_7 "lzma:level=8"
#elif defined(DWARFS_HAVE_LIBLZ4)
#define ALG_DATA_4 "lz4hc:level=9"
#define ALG_DATA_5 "lz4hc:level=9"
#define ALG_DATA_6 "lz4hc:level=9"
#define ALG_DATA_7 "lz4hc:level=9"
#else
#define ALG_DATA_4 "null"
#define ALG_DATA_5 "null"
#define ALG_DATA_6 "null"
#define ALG_DATA_7 "null"
#endif
// Block data compression for levels 8-9. The first library whose
// DWARFS_HAVE_* macro is defined wins, in the order lzma, zstd, lz4; "null"
// is used if none of them is defined.
#if defined(DWARFS_HAVE_LIBLZMA)
#define ALG_DATA_8 "lzma:level=9"
#define ALG_DATA_9 "lzma:level=9"
#elif defined(DWARFS_HAVE_LIBZSTD)
#define ALG_DATA_8 "zstd:level=22"
#define ALG_DATA_9 "zstd:level=22"
#elif defined(DWARFS_HAVE_LIBLZ4)
#define ALG_DATA_8 "lz4hc:level=9"
#define ALG_DATA_9 "lz4hc:level=9"
#else
#define ALG_DATA_8 "null"
#define ALG_DATA_9 "null"
#endif
// Schema/history compression shared by levels 1-9 of the `levels` table.
// Preference order: zstd, lzma, lz4; "null" if none of the DWARFS_HAVE_*
// macros is defined.
#if defined(DWARFS_HAVE_LIBZSTD)
#define ALG_SCHEMA "zstd:level=16"
#elif defined(DWARFS_HAVE_LIBLZMA)
#define ALG_SCHEMA "lzma:level=4"
#elif defined(DWARFS_HAVE_LIBLZ4)
#define ALG_SCHEMA "lz4hc:level=9"
#else
#define ALG_SCHEMA "null"
#endif
// Metadata compression for level 7 of the `levels` table. Preference order:
// zstd, lzma, lz4; "null" if none of the DWARFS_HAVE_* macros is defined.
#if defined(DWARFS_HAVE_LIBZSTD)
#define ALG_METADATA_7 "zstd:level=22"
#elif defined(DWARFS_HAVE_LIBLZMA)
#define ALG_METADATA_7 "lzma:level=9"
#elif defined(DWARFS_HAVE_LIBLZ4)
#define ALG_METADATA_7 "lz4hc:level=9"
#else
#define ALG_METADATA_7 "null"
#endif
// Metadata compression for levels 8 and 9 of the `levels` table. Preference
// order: lzma, zstd, lz4; "null" if none of the DWARFS_HAVE_* macros is
// defined.
#if defined(DWARFS_HAVE_LIBLZMA)
#define ALG_METADATA_9 "lzma:level=9"
#elif defined(DWARFS_HAVE_LIBZSTD)
#define ALG_METADATA_9 "zstd:level=22"
#elif defined(DWARFS_HAVE_LIBLZ4)
#define ALG_METADATA_9 "lz4hc:level=9"
#else
#define ALG_METADATA_9 "null"
#endif
// Per-level defaults, indexed by the compression level (0-9) selected with
// `--compress-level`. Columns follow the member order of `level_defaults`:
//   block size bits, data compression, schema/history compression,
//   metadata compression, window size, window step, fragment order
// The table is also printed by `--long-help`.
constexpr std::array<level_defaults, 10> levels{{
// clang-format off
/* 0 */ {20, "null", "null" , "null", 0, 0, "none"},
/* 1 */ {20, ALG_DATA_1, ALG_SCHEMA, "null", 0, 0, "path"},
/* 2 */ {20, ALG_DATA_2, ALG_SCHEMA, "null", 0, 0, "path"},
/* 3 */ {21, ALG_DATA_3, ALG_SCHEMA, "null", 12, 1, "similarity"},
/* 4 */ {22, ALG_DATA_4, ALG_SCHEMA, "null", 12, 2, "similarity"},
/* 5 */ {23, ALG_DATA_5, ALG_SCHEMA, "null", 12, 2, "similarity"},
/* 6 */ {24, ALG_DATA_6, ALG_SCHEMA, "null", 12, 3, "nilsimsa"},
/* 7 */ {24, ALG_DATA_7, ALG_SCHEMA, ALG_METADATA_7, 12, 3, "nilsimsa"},
/* 8 */ {24, ALG_DATA_8, ALG_SCHEMA, ALG_METADATA_9, 12, 4, "nilsimsa"},
/* 9 */ {26, ALG_DATA_9, ALG_SCHEMA, ALG_METADATA_9, 12, 4, "nilsimsa"},
// clang-format on
}};
// Fallback option arguments that are registered for every compression level.
// Keys are option names, values are `category::argument` strings. They only
// take effect when the categorizer list was enabled implicitly.
const std::unordered_map<std::string, std::vector<std::string>>
    categorize_defaults_common = {
        {"--compression", {"incompressible::null"}},
};
// Categorizer fallback arguments for compression levels 0-4 (see
// `categorize_defaults_level`). Keys are option names, values are
// `category::argument` strings that are handed to the option parser as
// fallbacks when the categorizer list was enabled implicitly.
// NOTE(review): the non-FLAC branch names zstd without checking
// DWARFS_HAVE_LIBZSTD, unlike the ALG_* macro chains -- confirm that zstd is
// always available in supported builds.
const std::unordered_map<std::string, std::vector<std::string>>
categorize_defaults_fast{
// clang-format off
{"--order", {"pcmaudio/waveform::revpath"}},
{"--window-size", {"pcmaudio/waveform::0"}},
#ifdef DWARFS_HAVE_FLAC
{"--compression", {"pcmaudio/waveform::flac:level=3"}},
#else
{"--compression", {"pcmaudio/waveform::zstd:level=3"}},
#endif
// clang-format on
};
// Categorizer fallback arguments for compression levels 5-7 (see
// `categorize_defaults_level`). Keys are option names, values are
// `category::argument` strings that are handed to the option parser as
// fallbacks when the categorizer list was enabled implicitly.
// NOTE(review): the non-FLAC branch names zstd without checking
// DWARFS_HAVE_LIBZSTD, unlike the ALG_* macro chains -- confirm that zstd is
// always available in supported builds.
const std::unordered_map<std::string, std::vector<std::string>>
categorize_defaults_medium{
// clang-format off
{"--order", {"pcmaudio/waveform::revpath"}},
{"--window-size", {"pcmaudio/waveform::20"}},
#ifdef DWARFS_HAVE_FLAC
{"--compression", {"pcmaudio/waveform::flac:level=5"}},
#else
{"--compression", {"pcmaudio/waveform::zstd:level=5"}},
#endif
// clang-format on
};
// Categorizer fallback arguments for compression levels 8-9 (see
// `categorize_defaults_level`). Keys are option names, values are
// `category::argument` strings that are handed to the option parser as
// fallbacks when the categorizer list was enabled implicitly. Unlike the
// fast/medium tables, this one has no "--order" entry.
// NOTE(review): the non-FLAC branch names zstd without checking
// DWARFS_HAVE_LIBZSTD, unlike the ALG_* macro chains -- confirm that zstd is
// always available in supported builds.
const std::unordered_map<std::string, std::vector<std::string>>
categorize_defaults_slow{
// clang-format off
{"--window-size", {"pcmaudio/waveform::16"}},
#ifdef DWARFS_HAVE_FLAC
{"--compression", {"pcmaudio/waveform::flac:level=8"}},
#else
{"--compression", {"pcmaudio/waveform::zstd:level=8"}},
#endif
// clang-format on
};
// Selects the categorizer fallback table for each compression level:
// levels 0-4 use the "fast" table, 5-7 "medium" and 8-9 "slow".
// This array is indexed with the same level value as `levels`, after that
// value has been range-checked against `levels.size()` only, so both arrays
// must keep the same number of entries.
constexpr std::array<
std::unordered_map<std::string, std::vector<std::string>> const*, 10>
categorize_defaults_level{{
// clang-format off
/* 0 */ &categorize_defaults_fast,
/* 1 */ &categorize_defaults_fast,
/* 2 */ &categorize_defaults_fast,
/* 3 */ &categorize_defaults_fast,
/* 4 */ &categorize_defaults_fast,
/* 5 */ &categorize_defaults_medium,
/* 6 */ &categorize_defaults_medium,
/* 7 */ &categorize_defaults_medium,
/* 8 */ &categorize_defaults_slow,
/* 9 */ &categorize_defaults_slow,
// clang-format on
}};
// Compression level that applies when `--compress-level` is not given.
constexpr unsigned default_level{7};
/**
 * Value type of the `--categorize` command line option.
 *
 * Besides the option text, an instance records whether that text was given
 * explicitly and stores per-option fallback arguments. The fallbacks are
 * only handed to an option parser when the categorizer list is non-empty
 * and was *not* given explicitly.
 */
class categorize_optval {
 public:
  /// Comma-separated list of categorizer names; empty if none are enabled.
  std::string value;
  /// True if `value` was spelled out explicitly rather than defaulted.
  bool is_explicit{false};

  categorize_optval() = default;

  /**
   * \param val   comma-separated categorizer list
   * \param expl  whether the list was given explicitly
   *
   * `val` is a sink parameter: it is taken by value and moved into place so
   * that rvalue arguments are not copied.
   */
  explicit categorize_optval(std::string val, bool expl = false)
      : value{std::move(val)}
      , is_explicit{expl} {}

  /// True if a categorizer list is set, but not by an explicit argument.
  bool is_implicit_default() const { return !value.empty() && !is_explicit; }

  /**
   * Feed the fallback arguments registered for `cop.name()` to
   * `cop.parse_fallback()`, in registration order.
   *
   * Does nothing unless is_implicit_default() holds and fallbacks were
   * registered for that option name.
   */
  template <typename T>
  void add_implicit_defaults(T& cop) const {
    if (!is_implicit_default()) {
      return;
    }

    auto const it = defaults_.find(cop.name());

    if (it == defaults_.end()) {
      return;
    }

    // Deliberately not named `value`: that would shadow the data member.
    for (auto const& fallback : it->second) {
      cop.parse_fallback(fallback);
    }
  }

  /**
   * Register additional fallback arguments.
   *
   * \param defaults  map from option name to fallback arguments; arguments
   *                  are appended to any already registered for that name
   */
  void
  add_defaults(std::unordered_map<std::string, std::vector<std::string>> const&
                   defaults) {
    for (auto const& [key, values] : defaults) {
      auto& vs = defaults_[key];
      vs.insert(vs.end(), values.begin(), values.end());
    }
  }

 private:
  /// Fallback arguments, keyed by option name (e.g. "--compression").
  std::unordered_map<std::string, std::vector<std::string>> defaults_;
};
// Writes the option text, followed by " (explicit)" if the value is flagged
// as explicitly given.
std::ostream& operator<<(std::ostream& os, categorize_optval const& optval) {
  char const* const suffix = optval.is_explicit ? " (explicit)" : "";
  os << optval.value;
  os << suffix;
  return os;
}
// boost::program_options validator overload for options of type
// categorize_optval. A repeated occurrence of the option is rejected, exactly
// one string token is required, and the stored value is flagged as explicit.
void validate(boost::any& v, std::vector<std::string> const& values,
              categorize_optval*, int) {
  po::validators::check_first_occurrence(v);

  auto const& text = po::validators::get_single_string(values);

  v = categorize_optval{text, true};
}
} // namespace
int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
using namespace folly::gen;
const size_t num_cpu = std::max(folly::hardware_concurrency(), 1u);
segmenter_factory::config sf_config;
sys_string path_str, input_list_str, output_str, header_str;
std::string memory_limit, script_arg, schema_compression,
metadata_compression, log_level_str, timestamp, time_resolution,
progress_mode, recompress_opts, pack_metadata, file_hash_algo,
debug_filter, max_similarity_size, chmod_str, history_compression,
recompress_categories;
std::vector<sys_string> filter;
std::vector<std::string> order, max_lookback_blocks, window_size, window_step,
bloom_filter_size, compression;
size_t num_workers, num_scanner_workers, num_segmenter_workers;
bool no_progress = false, remove_header = false, no_section_index = false,
force_overwrite = false, no_history = false,
no_history_timestamps = false, no_history_command_line = false;
unsigned level;
int compress_niceness;
uint16_t uid, gid;
categorize_optval categorizer_list;
integral_value_parser<size_t> max_lookback_parser;
integral_value_parser<unsigned> window_size_parser(0, 24);
integral_value_parser<unsigned> window_step_parser(0, 8);
integral_value_parser<unsigned> bloom_filter_size_parser(0, 10);
fragment_order_parser order_parser;
block_compressor_parser compressor_parser;
scanner_options options;
auto order_desc = "inode fragments order (" + order_parser.choices() + ")";
auto progress_desc = "progress mode (" +
(from(progress_modes) | get<0>() | unsplit(", ")) + ")";
auto debug_filter_desc =
"show effect of filter rules without producing an image (" +
(from(debug_filter_modes) | get<0>() | unsplit(", ")) + ")";
auto resolution_desc = "time resolution in seconds or (" +
(from(time_resolutions) | get<0>() | unsplit(", ")) +
")";
auto hash_list = checksum::available_algorithms();
auto file_hash_desc = "choice of file hashing function (none, " +
(from(hash_list) | unsplit(", ")) + ")";
auto& catreg = categorizer_registry::instance();
auto categorize_desc = "enable categorizers in the given order (" +
(from(catreg.categorizer_names()) | unsplit(", ")) +
")";
// clang-format off
po::options_description basic_opts("Options");
basic_opts.add_options()
("input,i",
po_sys_value<sys_string>(&path_str),
"path to root directory or source filesystem")
("input-list",
po_sys_value<sys_string>(&input_list_str),
"file containing list of paths relative to root directory")
("output,o",
po_sys_value<sys_string>(&output_str),
"filesystem output name")
("force,f",
po::value<bool>(&force_overwrite)->zero_tokens(),
"force overwrite of existing output image")
("compress-level,l",
po::value<unsigned>(&level)->default_value(default_level),
"compression level (0=fast, 9=best, please see man page for details)")
;
add_common_options(basic_opts, log_level_str);
basic_opts.add_options()
("long-help,H",
"output full help message and exit")
;
po::options_description advanced_opts("Advanced options");
advanced_opts.add_options()
("block-size-bits,S",
po::value<unsigned>(&sf_config.block_size_bits),
"block size bits (size = 2^arg bits)")
("num-workers,N",
po::value<size_t>(&num_workers)->default_value(num_cpu),
"number of writer (compression) worker threads")
("compress-niceness",
po::value<int>(&compress_niceness)->default_value(5),
"compression worker threads niceness")
("num-scanner-workers",
po::value<size_t>(&num_scanner_workers),
"number of scanner (hashing/categorizing) worker threads")
("num-segmenter-workers",
po::value<size_t>(&num_segmenter_workers),
"number of segmenter worker threads")
("memory-limit,L",
po::value<std::string>(&memory_limit)->default_value("1g"),
"block manager memory limit")
("recompress",
po::value<std::string>(&recompress_opts)->implicit_value("all"),
"recompress an existing filesystem (none, block, metadata, all)")
("recompress-categories",
po::value<std::string>(&recompress_categories),
"only recompress blocks of these categories")
("categorize",
po::value<categorize_optval>(&categorizer_list)
->implicit_value(categorize_optval("pcmaudio,incompressible")),
categorize_desc.c_str())
("order",
po::value<std::vector<std::string>>(&order)
->value_name("[cat::]arg")->multitoken()->composing(),
order_desc.c_str())
("max-similarity-size",
po::value<std::string>(&max_similarity_size),
"maximum file size to compute similarity")
("file-hash",
po::value<std::string>(&file_hash_algo)->default_value("xxh3-128"),
file_hash_desc.c_str())
("progress",
po::value<std::string>(&progress_mode)->default_value(default_progress_mode),
progress_desc.c_str())
("no-progress",
po::value<bool>(&no_progress)->zero_tokens(),
"don't show progress")
;
po::options_description filesystem_opts("File system options");
filesystem_opts.add_options()
("with-devices",
po::value<bool>(&options.with_devices)->zero_tokens(),
"include block and character devices")
("with-specials",
po::value<bool>(&options.with_specials)->zero_tokens(),
"include named fifo and sockets")
("header",
po_sys_value<sys_string>(&header_str),
"prepend output filesystem with contents of this file")
("remove-header",
po::value<bool>(&remove_header)->zero_tokens(),
"remove any header present before filesystem data"
" (use with --recompress)")
("no-section-index",
po::value<bool>(&no_section_index)->zero_tokens(),
"don't add section index to file system")
("no-history",
po::value<bool>(&no_history)->zero_tokens(),
"don't add history to file system")
("no-history-timestamps",
po::value<bool>(&no_history_timestamps)->zero_tokens(),
"don't add timestamps to file system history")
("no-history-command-line",
po::value<bool>(&no_history_command_line)->zero_tokens(),
"don't add command line to file system history")
;
po::options_description segmenter_opts("Segmenter options");
segmenter_opts.add_options()
("max-lookback-blocks,B",
po::value<std::vector<std::string>>(&max_lookback_blocks)
->value_name("[cat::]arg")->multitoken()->composing(),
"how many blocks to scan for segments")
("window-size,W",
po::value<std::vector<std::string>>(&window_size)
->value_name("[cat::]arg")->multitoken()->composing(),
"window sizes for block hashing")
("window-step,w",
po::value<std::vector<std::string>>(&window_step)
->value_name("[cat::]arg")->multitoken()->composing(),
"window step (as right shift of size)")
("bloom-filter-size",
po::value<std::vector<std::string>>(&bloom_filter_size)
->value_name("[cat::]arg")->multitoken()->composing(),
"bloom filter size (2^N*values bits)")
;
po::options_description compressor_opts("Compressor options");
compressor_opts.add_options()
("compression,C",
po::value<std::vector<std::string>>(&compression)
->value_name("[cat::]arg")->multitoken()->composing(),
"block compression algorithm")
("schema-compression",
po::value<std::string>(&schema_compression),
"metadata schema compression algorithm")
("metadata-compression",
po::value<std::string>(&metadata_compression),
"metadata compression algorithm")
("history-compression",
po::value<std::string>(&history_compression),
"history compression algorithm")
;
po::options_description filter_opts("Filter options");
filter_opts.add_options()
("filter,F",
po_sys_value<std::vector<sys_string>>(&filter)
->multitoken()->composing(),
"add filter rule")
("debug-filter",
po::value<std::string>(&debug_filter)->implicit_value("all"),
debug_filter_desc.c_str())
("remove-empty-dirs",
po::value<bool>(&options.remove_empty_dirs)->zero_tokens(),
"remove empty directories in file system")
;
po::options_description metadata_opts("Metadata options");
metadata_opts.add_options()
("set-owner",
po::value<uint16_t>(&uid),
"set owner (uid) for whole file system")
("set-group",
po::value<uint16_t>(&gid),
"set group (gid) for whole file system")
("chmod",
po::value<std::string>(&chmod_str),
"recursively apply permission changes")
("no-create-timestamp",
po::value<bool>(&options.no_create_timestamp)->zero_tokens(),
"don't add create timestamp to file system")
("set-time",
po::value<std::string>(&timestamp),
"set timestamp for whole file system (unixtime or 'now')")
("keep-all-times",
po::value<bool>(&options.keep_all_times)->zero_tokens(),
"save atime and ctime in addition to mtime")
("time-resolution",
po::value<std::string>(&time_resolution)->default_value("sec"),
resolution_desc.c_str())
("pack-metadata,P",
po::value<std::string>(&pack_metadata)->default_value("auto"),
"pack certain metadata elements (auto, all, none, chunk_table, "
"directories, shared_files, names, names_index, symlinks, "
"symlinks_index, force, plain)")
;
// clang-format on
po::options_description opts;
opts.add(basic_opts)
.add(advanced_opts)
.add(filter_opts)
.add(segmenter_opts)
.add(compressor_opts)
.add(filesystem_opts)
.add(metadata_opts);
catreg.add_options(opts);
po::variables_map vm;
auto& sys_err_out = SYS_CERR;
std::vector<std::string> command_line;
command_line.reserve(argc);
for (int i = 0; i < argc; ++i) {
command_line.emplace_back(sys_string_to_string(argv[i]));
}
try {
auto parsed = po::parse_command_line(argc, argv, opts);
po::store(parsed, vm);
po::notify(vm);
auto unrecognized =
po::collect_unrecognized(parsed.options, po::include_positional);
if (!unrecognized.empty()) {
sys_err_out << "error: unrecognized argument(s) '"
<< boost::join(unrecognized, " ") << "'\n";
return 1;
}
} catch (po::error const& e) {
iol.err << "error: " << e.what() << "\n";
return 1;
}
auto constexpr usage = "Usage: mkdwarfs [OPTIONS...]\n";
if (vm.count("long-help")) {
std::string_view constexpr block_data_hdr{"Block Data"};
std::string_view constexpr schema_history_hdr{"Schema/History"};
std::string_view constexpr metadata_hdr{"Metadata"};
size_t l_dc{block_data_hdr.size()}, l_sc{schema_history_hdr.size()},
l_mc{metadata_hdr.size()}, l_or{0};
for (auto const& l : levels) {
l_dc = std::max(l_dc, l.data_compression.size());
l_sc = std::max(l_sc, l.schema_history_compression.size());
l_mc = std::max(l_mc, l.metadata_compression.size());
l_or = std::max(l_or, l.order.size());
}
std::string sep(30 + l_dc + l_sc + l_mc + l_or, '-');
iol.out << tool_header("mkdwarfs") << usage << opts << "\n"
<< "Compression level defaults:\n"
<< " " << sep << "\n"
<< fmt::format(" Level Block {:{}s} {:s} Inode\n",
"Compression Algorithm", 4 + l_dc + l_sc + l_mc,
"Window")
<< fmt::format(" Size {:{}s} {:{}s} {:{}s} {:6s}\n",
block_data_hdr, l_dc, schema_history_hdr, l_sc,
metadata_hdr, l_mc, "Size/Step Order")
<< " " << sep << "\n";
int level = 0;
for (auto const& l : levels) {
iol.out << fmt::format(" {:1d} {:2d} {:{}s} {:{}s} {:{}s}"
" {:2d} / {:1d} {:{}s}",
level, l.block_size_bits, l.data_compression, l_dc,
l.schema_history_compression, l_sc,
l.metadata_compression, l_mc, l.window_size,
l.window_step, l.order, l_or)
<< "\n";
++level;
}
iol.out << " " << sep << "\n";
iol.out << "\nCompression algorithms:\n";
compression_registry::instance().for_each_algorithm(
[&iol](compression_type, compression_info const& info) {
iol.out << fmt::format(" {:9}{}\n", info.name(), info.description());
for (auto const& opt : info.options()) {
iol.out << fmt::format(" {}\n", opt);
}
});
iol.out << "\nCategories:\n";
for (auto const& name : catreg.categorizer_names()) {
stream_logger lgr(iol.term, iol.err);
auto categorizer = catreg.create(lgr, name, vm);
iol.out << " [" << name << "]\n";
for (auto cat : categorizer->categories()) {
iol.out << " " << cat << "\n";
}
}
iol.out << "\n";
return 0;
}
if (vm.count("help") or !(vm.count("input") or vm.count("input-list")) or
(!vm.count("output") and !vm.count("debug-filter"))) {
iol.out << tool_header("mkdwarfs") << usage << "\n" << basic_opts << "\n";
return 0;
}
if (level >= levels.size()) {
iol.err << "error: invalid compression level\n";
return 1;
}
auto const& defaults = levels[level];
categorizer_list.add_defaults(categorize_defaults_common);
categorizer_list.add_defaults(*categorize_defaults_level[level]);
if (!vm.count("block-size-bits")) {
sf_config.block_size_bits = defaults.block_size_bits;
}
if (!vm.count("schema-compression")) {
schema_compression = defaults.schema_history_compression;
}
if (!vm.count("history-compression")) {
history_compression = defaults.schema_history_compression;
}
if (!vm.count("metadata-compression")) {
metadata_compression = defaults.metadata_compression;
}
if (sf_config.block_size_bits < min_block_size_bits ||
sf_config.block_size_bits > max_block_size_bits) {
iol.err << "error: block size must be between " << min_block_size_bits
<< " and " << max_block_size_bits << "\n";
return 1;
}
std::filesystem::path path(path_str);
std::optional<std::vector<std::filesystem::path>> input_list;
if (vm.count("input-list")) {
if (vm.count("filter")) {
iol.err << "error: cannot use --input-list and --filter\n";
return 1;
}
// implicitly turn on
options.with_devices = true;
options.with_specials = true;
if (!vm.count("input")) {
path = iol.os->current_path();
}
std::filesystem::path input_list_path(input_list_str);
std::unique_ptr<input_stream> ifs;
std::istream* is;
if (input_list_path == "-") {
is = &iol.in;
} else {
std::error_code ec;
ifs = iol.file->open_input(input_list_path, ec);
if (ec) {
iol.err << "error opening file '" << input_list_path
<< "': " << ec.message() << "\n";
return 1;
}
is = &ifs->is();
}
std::string line;
input_list.emplace();
while (std::getline(*is, line)) {
input_list->emplace_back(line);
}
}
path = iol.os->canonical(path);
bool recompress = vm.count("recompress");
rewrite_options rw_opts;
if (recompress) {
std::unordered_map<std::string, unsigned> const modes{
{"all", 3},
{"metadata", 2},
{"block", 1},
{"none", 0},
};
if (auto it = modes.find(recompress_opts); it != modes.end()) {
rw_opts.recompress_block = it->second & 1;
rw_opts.recompress_metadata = it->second & 2;
} else {
iol.err << "invalid recompress mode: " << recompress_opts << "\n";
return 1;
}
if (!recompress_categories.empty()) {
std::string_view input = recompress_categories;
if (input.front() == '!') {
rw_opts.recompress_categories_exclude = true;
input.remove_prefix(1);
}
boost::split(rw_opts.recompress_categories, input, boost::is_any_of(","));
}
}
if (file_hash_algo == "none") {
options.file_hash_algorithm.reset();
} else if (checksum::is_available(file_hash_algo)) {
options.file_hash_algorithm = file_hash_algo;
} else {
iol.err << "error: unknown file hash function '" << file_hash_algo << "'\n";
return 1;
}
if (vm.count("max-similarity-size")) {
auto size = parse_size_with_unit(max_similarity_size);
if (size > 0) {
options.inode.max_similarity_scan_size = size;
}
}
size_t mem_limit = parse_size_with_unit(memory_limit);
if (!vm.count("num-scanner-workers")) {
num_scanner_workers = num_workers;
}
if (!vm.count("num-segmenter-workers")) {
num_segmenter_workers = num_workers;
}
worker_group wg_compress("compress", num_workers,
std::numeric_limits<size_t>::max(),
compress_niceness);
worker_group wg_scanner("scanner", num_scanner_workers);
options.num_segmenter_workers = num_segmenter_workers;
if (vm.count("debug-filter")) {
if (auto it = debug_filter_modes.find(debug_filter);
it != debug_filter_modes.end()) {
options.debug_filter_function =
[&iol, mode = it->second](bool exclude, entry const* pe) {
debug_filter_output(iol.out, exclude, pe, mode);
};
no_progress = true;
} else {
iol.err << "error: invalid filter debug mode '" << debug_filter << "'\n";
return 1;
}
}
if (no_progress) {
progress_mode = "none";
}
if (progress_mode != "none" && !iol.term->is_fancy(iol.err)) {
progress_mode = "simple";
}
if (!progress_modes.count(progress_mode)) {
iol.err << "error: invalid progress mode '" << progress_mode << "'\n";
return 1;
}
auto pg_mode = DWARFS_NOTHROW(progress_modes.at(progress_mode));
auto log_level = logger::parse_level(log_level_str);
console_writer lgr(iol.term, iol.err, pg_mode, log_level,
recompress ? console_writer::REWRITE
: console_writer::NORMAL,
log_level >= logger::VERBOSE);
std::shared_ptr<script> script;
if (!filter.empty() or vm.count("chmod")) {
auto bs = std::make_shared<builtin_script>(lgr);
if (!filter.empty()) {
bs->set_root_path(path);
for (auto const& rule : filter) {
auto srule = sys_string_to_string(rule);
try {
bs->add_filter_rule(srule);
} catch (std::exception const& e) {
iol.err << "error: could not parse filter rule '" << srule
<< "': " << e.what() << "\n";
return 1;
}
}
}
if (vm.count("chmod")) {
if (chmod_str == "norm") {
chmod_str = "ug-st,=Xr";
}
std::vector<std::string_view> chmod_exprs;
boost::split(chmod_exprs, chmod_str, boost::is_any_of(","));
// I'm pretty certain these warnings by Flawfinder are false positives.
// After all, we're just doing a no-op by re-setting the original value
// in order to read it.
auto mask = ::umask(0077); /* Flawfinder: ignore */
::umask(mask); /* Flawfinder: ignore */
for (auto expr : chmod_exprs) {
bs->add_transformer(create_chmod_entry_transformer(expr, mask));
}
}
script = bs;
}
if (vm.count("set-owner")) {
options.uid = uid;
}
if (vm.count("set-group")) {
options.gid = gid;
}
if (vm.count("set-time")) {
if (timestamp == "now") {
options.timestamp = std::time(nullptr);
} else if (auto val = folly::tryTo<uint64_t>(timestamp)) {
options.timestamp = *val;
} else {
iol.err << "error: argument for option '--set-time' must be numeric or "
"`now`\n";
return 1;
}
}
if (auto it = time_resolutions.find(time_resolution);
it != time_resolutions.end()) {
options.time_resolution_sec = it->second;
} else if (auto val = folly::tryTo<uint32_t>(time_resolution)) {
options.time_resolution_sec = *val;
if (options.time_resolution_sec == 0) {
iol.err << "error: the argument to '--time-resolution' must be nonzero\n";
return 1;
}
} else {
iol.err << "error: the argument ('" << time_resolution
<< "') to '--time-resolution' is invalid\n";
return 1;
}
if (!pack_metadata.empty() and pack_metadata != "none") {
if (pack_metadata == "auto") {
options.force_pack_string_tables = false;
options.pack_chunk_table = false;
options.pack_directories = false;
options.pack_shared_files_table = false;
options.pack_names = true;
options.pack_names_index = false;
options.pack_symlinks = true;
options.pack_symlinks_index = false;
} else {
std::vector<std::string> pack_opts;
boost::split(pack_opts, pack_metadata, boost::is_any_of(","));
for (auto const& opt : pack_opts) {
if (opt == "chunk_table") {
options.pack_chunk_table = true;
} else if (opt == "directories") {
options.pack_directories = true;
} else if (opt == "shared_files") {
options.pack_shared_files_table = true;
} else if (opt == "names") {
options.pack_names = true;
} else if (opt == "names_index") {
options.pack_names_index = true;
} else if (opt == "symlinks") {
options.pack_symlinks = true;
} else if (opt == "symlinks_index") {
options.pack_symlinks_index = true;
} else if (opt == "force") {
options.force_pack_string_tables = true;
} else if (opt == "plain") {
options.plain_names_table = true;
options.plain_symlinks_table = true;
} else if (opt == "all") {
options.pack_chunk_table = true;
options.pack_directories = true;
options.pack_shared_files_table = true;
options.pack_names = true;
options.pack_names_index = true;
options.pack_symlinks = true;
options.pack_symlinks_index = true;
} else {
iol.err << "error: the argument ('" << opt
<< "') to '--pack-metadata' is invalid\n";
return 1;
}
}
}
}
unsigned interval_ms =
pg_mode == console_writer::NONE || pg_mode == console_writer::SIMPLE
? 2000
: 200;
filesystem_writer_options fswopts;
fswopts.max_queue_size = mem_limit;
fswopts.worst_case_block_size = UINT64_C(1) << sf_config.block_size_bits;
fswopts.remove_header = remove_header;
fswopts.no_section_index = no_section_index;
std::unique_ptr<input_stream> header_ifs;
if (!header_str.empty()) {
std::filesystem::path header(header_str);
std::error_code ec;
header_ifs = iol.file->open_input_binary(header, ec);
if (ec) {
iol.err << "error: cannot open header file '" << header
<< "': " << ec.message() << "\n";
return 1;
}
}
LOG_PROXY(debug_logger_policy, lgr);
folly::Function<void(progress&, bool)> updater;
if (options.debug_filter_function) {
updater = [](progress&, bool) {};
} else {
updater = [&](progress& p, bool last) { lgr.update(p, last); };
}
progress prog(std::move(updater), interval_ms);
auto min_memory_req =
num_workers * (UINT64_C(1) << sf_config.block_size_bits);
// TODO:
if (mem_limit < min_memory_req /* && compression != "null" */) {
LOG_WARN << "low memory limit (" << size_with_unit(mem_limit) << "), need "
<< size_with_unit(min_memory_req) << " to efficiently compress "
<< size_with_unit(UINT64_C(1) << sf_config.block_size_bits)
<< " blocks with " << num_workers << " threads";
}
std::filesystem::path output(output_str);
std::variant<std::monostate, std::unique_ptr<output_stream>,
std::ostringstream>
os;
if (!options.debug_filter_function) {
if (output != "-") {
if (iol.file->exists(output) && !force_overwrite) {
iol.err
<< "error: output file already exists, use --force to overwrite\n";
return 1;
}
std::error_code ec;
auto stream = iol.file->open_output_binary(output, ec);
if (ec) {
iol.err << "error: cannot open output file '" << output
<< "': " << ec.message() << "\n";
return 1;
}
assert(stream);
os.emplace<std::unique_ptr<output_stream>>(std::move(stream));
} else {
#ifdef _WIN32
::_setmode(::_fileno(stdout), _O_BINARY);
#endif
}
} else {
os.emplace<std::ostringstream>();
}
options.enable_history = !no_history;
rw_opts.enable_history = !no_history;
if (options.enable_history) {
options.history.with_timestamps = !no_history_timestamps;
rw_opts.history.with_timestamps = !no_history_timestamps;
if (!no_history_command_line) {
options.command_line_arguments = command_line;
rw_opts.command_line_arguments = command_line;
}
}
if (!categorizer_list.value.empty()) {
std::vector<std::string> categorizers;
boost::split(categorizers, categorizer_list.value, boost::is_any_of(","));
options.inode.categorizer_mgr = std::make_shared<categorizer_manager>(lgr);
for (auto const& name : categorizers) {
options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm));
}
}
std::unique_ptr<filesystem_v2> input_filesystem;
std::shared_ptr<category_resolver> cat_resolver;
if (recompress) {
filesystem_options fsopts;
fsopts.image_offset = filesystem_options::IMAGE_OFFSET_AUTO;
input_filesystem = std::make_unique<filesystem_v2>(
lgr, std::make_shared<dwarfs::mmap>(path), fsopts);
cat_resolver = std::make_shared<filesystem_block_category_resolver>(
input_filesystem->get_all_block_categories());
for (auto const& cat : rw_opts.recompress_categories) {
if (!cat_resolver->category_value(cat)) {
iol.err << "error: no category '" << cat << "' in input filesystem\n";
return 1;
}
}
} else {
cat_resolver = options.inode.categorizer_mgr;
}
category_parser cp(cat_resolver);
try {
{
contextual_option_parser cop("--order", options.inode.fragment_order, cp,
order_parser);
cop.parse(defaults.order);
cop.parse(order);
categorizer_list.add_implicit_defaults(cop);
LOG_VERBOSE << cop.as_string();
}
{
contextual_option_parser cop("--max-lookback-blocks",
sf_config.max_active_blocks, cp,
max_lookback_parser);
sf_config.max_active_blocks.set_default(1);
cop.parse(max_lookback_blocks);
categorizer_list.add_implicit_defaults(cop);
LOG_VERBOSE << cop.as_string();
}
{
contextual_option_parser cop("--window-size",
sf_config.blockhash_window_size, cp,
window_size_parser);
sf_config.blockhash_window_size.set_default(defaults.window_size);
cop.parse(window_size);
categorizer_list.add_implicit_defaults(cop);
LOG_VERBOSE << cop.as_string();
}
{
contextual_option_parser cop("--window-step",
sf_config.window_increment_shift, cp,
window_step_parser);
sf_config.window_increment_shift.set_default(defaults.window_step);
cop.parse(window_step);
categorizer_list.add_implicit_defaults(cop);
LOG_VERBOSE << cop.as_string();
}
{
contextual_option_parser cop("--bloom-filter-size",
sf_config.bloom_filter_size, cp,
bloom_filter_size_parser);
sf_config.bloom_filter_size.set_default(4);
cop.parse(bloom_filter_size);
categorizer_list.add_implicit_defaults(cop);
LOG_VERBOSE << cop.as_string();
}
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
block_compressor schema_bc(schema_compression);
block_compressor metadata_bc(metadata_compression);
block_compressor history_bc(history_compression);
std::unique_ptr<filesystem_writer> fsw;
try {
std::ostream& fsw_os = std::visit(
overloaded(
[&](std::monostate) -> std::ostream& { return iol.out; },
[&](std::unique_ptr<output_stream>& os) -> std::ostream& {
return os->os();
},
[&](std::ostringstream& oss) -> std::ostream& { return oss; }),
os);
fsw = std::make_unique<filesystem_writer>(
fsw_os, lgr, wg_compress, prog, schema_bc, metadata_bc, history_bc,
fswopts, header_ifs ? &header_ifs->is() : nullptr);
categorized_option<block_compressor> compression_opt;
contextual_option_parser cop("--compression", compression_opt, cp,
compressor_parser);
compression_opt.set_default(
block_compressor(std::string(defaults.data_compression)));
cop.parse(compression);
categorizer_list.add_implicit_defaults(cop);
LOG_VERBOSE << cop.as_string();
fsw->add_default_compressor(compression_opt.get());
if (recompress) {
compression_opt.visit_contextual(
[catres = cat_resolver, &fsw](auto cat, block_compressor const& bc) {
fsw->add_category_compressor(cat, bc);
});
} else {
compression_opt.visit_contextual([catmgr = options.inode.categorizer_mgr,
&fsw](auto cat,
block_compressor const& bc) {
try {
catmgr->set_metadata_requirements(cat, bc.metadata_requirements());
fsw->add_category_compressor(cat, bc);
} catch (std::exception const& e) {
throw std::runtime_error(
fmt::format("compression '{}' cannot be used for category '{}': "
"metadata requirements not met ({})",
bc.describe(), catmgr->category_name(cat), e.what()));
}
});
}
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
auto ti = LOG_TIMED_INFO;
try {
if (recompress) {
input_filesystem->rewrite(prog, *fsw, *cat_resolver, rw_opts);
wg_compress.wait();
} else {
auto sf = std::make_shared<segmenter_factory>(
lgr, prog, options.inode.categorizer_mgr, sf_config);
scanner s(lgr, wg_scanner, std::move(sf), entry_factory::create(), iol.os,
std::move(script), options);
if (input_list) {
s.scan(*fsw, path, prog, *input_list);
} else {
s.scan(*fsw, path, prog);
}
options.inode.categorizer_mgr.reset();
}
} catch (runtime_error const& e) {
LOG_ERROR << e.what();
return 1;
} catch (system_error const& e) {
LOG_ERROR << e.what();
return 1;
}
if (!options.debug_filter_function) {
LOG_INFO << "compression CPU time: "
<< time_with_unit(wg_compress.get_cpu_time());
}
{
auto ec = std::visit(
overloaded([](std::monostate) -> int { return 0; },
[&](std::unique_ptr<output_stream>& os) -> int {
std::error_code ec;
os->close(ec);
if (ec) {
LOG_ERROR << "failed to close output file '" << output
<< "': " << ec.message();
return 1;
}
os.reset();
return 0;
},
[](std::ostringstream& oss [[maybe_unused]]) -> int {
assert(oss.str().empty());
return 0;
}),
os);
if (ec != 0) {
return ec;
}
}
if (!options.debug_filter_function) {
std::ostringstream err;
if (prog.errors) {
err << "with " << prog.errors << " error";
if (prog.errors > 1) {
err << "s";
}
} else {
err << "without errors";
}
ti << "filesystem " << (recompress ? "rewritten " : "created ")
<< err.str();
}
return prog.errors > 0;
}
/**
 * Run mkdwarfs with the system default I/O layer.
 *
 * \param argc  number of entries in \p argv
 * \param argv  command line arguments
 * \return the result of the overload that takes an explicit I/O layer
 */
int mkdwarfs_main(int argc, sys_char** argv) {
  // Binding to a const reference works whether `system_default()` hands
  // back a reference or a temporary.
  auto const& default_iol = iolayer::system_default();
  return mkdwarfs_main(argc, argv, default_iol);
}
/**
 * Run mkdwarfs with arguments given as a span of `std::string`.
 *
 * Hands \p args, \p iol and the `mkdwarfs_main` overload set to
 * `call_sys_main_iolayer` and returns its result.
 *
 * NOTE(review): presumably `call_sys_main_iolayer` converts \p args into
 * the `argc`/`argv` form before invoking the `sys_char`-based overload;
 * confirm against its definition.
 *
 * \param args  command line arguments
 * \param iol   I/O layer to run with
 * \return the value returned by `call_sys_main_iolayer`
 */
int mkdwarfs_main(std::span<std::string> args, iolayer const& iol) {
  int const exit_code = call_sys_main_iolayer(args, iol, mkdwarfs_main);
  return exit_code;
}
/**
 * Run mkdwarfs with arguments given as a span of `std::string_view`.
 *
 * Hands \p args, \p iol and the `mkdwarfs_main` overload set to
 * `call_sys_main_iolayer` and returns its result.
 *
 * NOTE(review): presumably `call_sys_main_iolayer` converts \p args into
 * the `argc`/`argv` form before invoking the `sys_char`-based overload;
 * confirm against its definition.
 *
 * \param args  command line arguments
 * \param iol   I/O layer to run with
 * \return the value returned by `call_sys_main_iolayer`
 */
int mkdwarfs_main(std::span<std::string_view> args, iolayer const& iol) {
  int const exit_code = call_sys_main_iolayer(args, iol, mkdwarfs_main);
  return exit_code;
}
} // namespace dwarfs