mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-07 19:41:54 -04:00
Support for simple filter rules (potential fix for github #6)
This commit is contained in:
parent
c2f00d78c3
commit
1215a30f78
@ -298,6 +298,7 @@ list(
|
||||
src/dwarfs/block_cache.cpp
|
||||
src/dwarfs/block_compressor.cpp
|
||||
src/dwarfs/block_manager.cpp
|
||||
src/dwarfs/builtin_script.cpp
|
||||
src/dwarfs/checksum.cpp
|
||||
src/dwarfs/console_writer.cpp
|
||||
src/dwarfs/entry.cpp
|
||||
|
@ -245,6 +245,18 @@ Most other options are concerned with compression tuning:
|
||||
Last but not least, if scripting support is built into `mkdwarfs`, you can
|
||||
choose `script` to let the script determine the order.
|
||||
|
||||
- `-F`, `--filter=`*rule*:
|
||||
Add a filter rule. This option can be specified multiple times.
|
||||
See [FILTER RULES](#filter-rules) for more details.
|
||||
|
||||
- `--debug-filter`[`=all`|`=excluded`|`=excluded-files`|`=files`|`=included`|`=included-files`]:
|
||||
Show the effect of the filter rules without creating a file system.
|
||||
If no argument is passed to the option, all included/excluded files and
|
||||
directories are shown (same as with `all`). `files` will omit all
|
||||
directories. `included` and `excluded` will only show the corresponding
|
||||
set of files/directories. `included-files` and `excluded-files` work
|
||||
as before, but again omit all directories.
|
||||
|
||||
- `--remove-empty-dirs`:
|
||||
Removes all empty directories from the output file system, recursively.
|
||||
This is particularly useful when using scripts that filter out a lot of
|
||||
@ -445,6 +457,67 @@ further compress the block. So if you're really desperately trying
|
||||
to reduce the image size, enabling `all` packing would be an option
|
||||
at the cost of using a lot more memory when using the filesystem.
|
||||
|
||||
## FILTER RULES
|
||||
|
||||
The filter rules have been inspired by the `rsync` utility. They
|
||||
look very similar, but there are differences. These rules are quite
|
||||
powerful, yet they're somewhat hard to get used to.
|
||||
|
||||
There are only 3 different kinds of rules:
|
||||
|
||||
- `+ `pattern
|
||||
An "include" rule.
|
||||
|
||||
- `- `pattern
|
||||
An "exclude" rule.
|
||||
|
||||
- `. `file
|
||||
A merge file rule. Rules are read (recursively) from the
|
||||
specified file. Lines starting with `#` and lines that are empty or contain only spaces/tabs are ignored.
|
||||
|
||||
Ultimately, only include and exclude rules remain in the rule set
|
||||
as file rules are merged in at the place where they occur.
|
||||
|
||||
The most important rule to remember when building a rule set is that
|
||||
all rules are applied strictly in order and processing stops at the
|
||||
first matching rule. If no rules match, the default is to include the
|
||||
entry.
|
||||
|
||||
Patterns can be anchored or floating. Anchored patterns are patterns
|
||||
that start with a `/`. These patterns match relative to the file
|
||||
system root (i.e. the `--input` path). Floating patterns match in
|
||||
any directory in the hierarchy.
|
||||
|
||||
Patterns ending with a `/` only match directories. All other patterns
|
||||
only match non-directories.
|
||||
|
||||
Patterns support `?` and `*` wildcards matching a single character
|
||||
and any number of characters, respectively. These patterns don't match
|
||||
across directory separators (`/`).
|
||||
|
||||
Patterns also support the `**` wildcard, which matches across directory
|
||||
separators.
|
||||
|
||||
Patterns also support character classes, such as `[EM]` in the example below.
|
||||
|
||||
Here's an example rule set:
|
||||
```
|
||||
+ File/Spec/[EM]*.pm
|
||||
- unicore/**.pl
|
||||
+ *.pl
|
||||
- *
|
||||
```
|
||||
This set of rules will include all files matching `File/Spec/[EM]*.pm`
|
||||
anywhere in the hierarchy. It will also include all `*.pl` files, except
|
||||
for those anywhere below a `unicore` directory. The last rule excludes
|
||||
all other files.
|
||||
|
||||
This will likely leave a lot of empty directories around, but these can
|
||||
be removed using `--remove-empty-dirs`.
|
||||
|
||||
You can use the `--debug-filter` option to show the sets of included
|
||||
and excluded files without building an actual file system.
|
||||
|
||||
## INTERNAL OPERATION
|
||||
|
||||
Internally, `mkdwarfs` runs in two completely separate phases. The first
|
||||
|
68
include/dwarfs/builtin_script.h
Normal file
68
include/dwarfs/builtin_script.h
Normal file
@ -0,0 +1,68 @@
|
||||
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||
/**
|
||||
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||
*
|
||||
* This file is part of dwarfs.
|
||||
*
|
||||
* dwarfs is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* dwarfs is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "dwarfs/inode.h"
|
||||
#include "dwarfs/script.h"
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
class logger;
|
||||
|
||||
class builtin_script : public script {
|
||||
public:
|
||||
builtin_script(logger& lgr);
|
||||
~builtin_script();
|
||||
|
||||
void set_root_path(std::string const& path) { impl_->set_root_path(path); }
|
||||
void add_filter_rule(std::string const& rule) {
|
||||
impl_->add_filter_rule(rule);
|
||||
};
|
||||
|
||||
bool has_configure() const override;
|
||||
bool has_filter() const override;
|
||||
bool has_transform() const override;
|
||||
bool has_order() const override;
|
||||
|
||||
void configure(options_interface const& oi) override;
|
||||
bool filter(entry_interface const& ei) override;
|
||||
void transform(entry_interface& ei) override;
|
||||
void order(inode_vector& iv) override;
|
||||
|
||||
class impl {
|
||||
public:
|
||||
virtual ~impl() = default;
|
||||
|
||||
virtual void set_root_path(std::string const& path) = 0;
|
||||
virtual void add_filter_rule(std::string const& rule) = 0;
|
||||
virtual bool filter(entry_interface const& ei) = 0;
|
||||
virtual bool has_filter() const = 0;
|
||||
};
|
||||
|
||||
private:
|
||||
std::unique_ptr<impl> impl_;
|
||||
};
|
||||
|
||||
} // namespace dwarfs
|
@ -75,6 +75,7 @@ class entry : public entry_interface {
|
||||
std::shared_ptr<entry> parent() const;
|
||||
void set_name(const std::string& name);
|
||||
std::string path() const override;
|
||||
std::string dpath() const override;
|
||||
const std::string& name() const override { return name_; }
|
||||
size_t size() const override { return stat_.st_size; }
|
||||
virtual type_t type() const = 0;
|
||||
|
@ -30,6 +30,7 @@ namespace dwarfs {
|
||||
class entry_interface : public object {
|
||||
public:
|
||||
virtual std::string path() const = 0;
|
||||
virtual std::string dpath() const = 0;
|
||||
virtual std::string const& name() const = 0;
|
||||
virtual std::string type_string() const = 0;
|
||||
virtual size_t size() const = 0;
|
||||
|
@ -23,6 +23,7 @@
|
||||
|
||||
#include <chrono>
|
||||
#include <cstddef>
|
||||
#include <functional>
|
||||
#include <iosfwd>
|
||||
#include <optional>
|
||||
|
||||
@ -30,6 +31,8 @@
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
class entry;
|
||||
|
||||
enum class mlock_mode { NONE, TRY, MUST };
|
||||
|
||||
enum class cache_tidy_strategy { NONE, EXPIRY_TIME, BLOCK_SWAPPED_OUT };
|
||||
@ -108,6 +111,7 @@ struct scanner_options {
|
||||
bool pack_symlinks_index{false};
|
||||
bool force_pack_string_tables{false};
|
||||
bool no_create_timestamp{true};
|
||||
std::optional<std::function<void(bool, entry const*)>> debug_filter_function;
|
||||
};
|
||||
|
||||
struct rewrite_options {
|
||||
|
266
src/dwarfs/builtin_script.cpp
Normal file
266
src/dwarfs/builtin_script.cpp
Normal file
@ -0,0 +1,266 @@
|
||||
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||
/**
|
||||
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||
*
|
||||
* This file is part of dwarfs.
|
||||
*
|
||||
* dwarfs is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* dwarfs is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <cassert>
|
||||
#include <fstream>
|
||||
#include <regex>
|
||||
#include <unordered_set>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include "dwarfs/builtin_script.h"
|
||||
#include "dwarfs/entry_interface.h"
|
||||
#include "dwarfs/logger.h"
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
/**
 * One compiled include/exclude rule.
 *
 * Holds the rule's kind, whether its pattern is floating, the regular
 * expression compiled from the pattern, and the original rule text.
 */
struct filter_rule {
  /// What happens to an entry that matches the rule.
  enum class rule_type {
    include,
    exclude,
  };

  /**
   * \param kind         include or exclude
   * \param is_floating  true if the pattern is floating
   * \param pattern      regular expression source; compiled on construction
   * \param source       the rule text this object was built from
   */
  filter_rule(rule_type kind, bool is_floating, std::string const& pattern,
              std::string const& source)
      : type(kind)
      , floating(is_floating)
      , re(pattern)
      , rule(source) {}

  rule_type type;
  bool floating;
  std::regex re;
  std::string rule;
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
class builtin_script_ : public builtin_script::impl {
|
||||
public:
|
||||
builtin_script_(logger& lgr);
|
||||
|
||||
void set_root_path(std::string const& path) override;
|
||||
void add_filter_rule(std::string const& rule) override;
|
||||
|
||||
bool filter(entry_interface const& ei) override;
|
||||
|
||||
bool has_filter() const override { return !filter_.empty(); }
|
||||
|
||||
private:
|
||||
void add_filter_rule(std::unordered_set<std::string>& seen_files,
|
||||
std::string const& rule);
|
||||
|
||||
filter_rule compile_filter_rule(std::string const& rule);
|
||||
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
std::string root_path_;
|
||||
std::vector<filter_rule> filter_;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
auto builtin_script_<LoggerPolicy>::compile_filter_rule(std::string const& rule)
|
||||
-> filter_rule {
|
||||
std::string r;
|
||||
filter_rule::rule_type type;
|
||||
|
||||
auto* p = rule.c_str();
|
||||
|
||||
switch (*p) {
|
||||
case '+':
|
||||
type = filter_rule::rule_type::include;
|
||||
break;
|
||||
case '-':
|
||||
type = filter_rule::rule_type::exclude;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("rules must start with + or -");
|
||||
}
|
||||
|
||||
while (*++p == ' ')
|
||||
;
|
||||
|
||||
// If the start of the pattern is not explicitly anchored, make it floating.
|
||||
bool floating = *p && *p != '/';
|
||||
|
||||
if (floating) {
|
||||
r += ".*/";
|
||||
}
|
||||
|
||||
while (*p) {
|
||||
switch (*p) {
|
||||
case '\\':
|
||||
r += *p++;
|
||||
if (p) {
|
||||
r += *p++;
|
||||
}
|
||||
continue;
|
||||
|
||||
case '*': {
|
||||
int nstar = 1;
|
||||
while (*++p == '*') {
|
||||
++nstar;
|
||||
}
|
||||
switch (nstar) {
|
||||
case 1:
|
||||
if (r.ends_with('/') and (*p == '/' or *p == '\0')) {
|
||||
r += "[^/]+";
|
||||
} else {
|
||||
r += "[^/]*";
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
r += ".*";
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("too many *s");
|
||||
}
|
||||
}
|
||||
continue;
|
||||
|
||||
case '?':
|
||||
r += "[^/]";
|
||||
break;
|
||||
|
||||
case '.':
|
||||
case '+':
|
||||
case '^':
|
||||
case '$':
|
||||
case '(':
|
||||
case ')':
|
||||
case '{':
|
||||
case '}':
|
||||
case '|':
|
||||
r += '\\';
|
||||
r += *p;
|
||||
break;
|
||||
|
||||
default:
|
||||
r += *p;
|
||||
break;
|
||||
}
|
||||
|
||||
++p;
|
||||
}
|
||||
|
||||
LOG_DEBUG << "'" << rule << "' -> '" << r << "' [floating=" << floating
|
||||
<< "]";
|
||||
|
||||
return filter_rule(type, floating, r, rule);
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
builtin_script_<LoggerPolicy>::builtin_script_(logger& lgr)
|
||||
: log_(lgr) {}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void builtin_script_<LoggerPolicy>::set_root_path(std::string const& path) {
|
||||
root_path_ = path;
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void builtin_script_<LoggerPolicy>::add_filter_rule(std::string const& rule) {
|
||||
std::unordered_set<std::string> seen_files;
|
||||
add_filter_rule(seen_files, rule);
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void builtin_script_<LoggerPolicy>::add_filter_rule(
|
||||
std::unordered_set<std::string>& seen_files, std::string const& rule) {
|
||||
if (rule.starts_with('.')) {
|
||||
auto file = std::regex_replace(rule, std::regex("^. +"), "");
|
||||
|
||||
if (!seen_files.emplace(file).second) {
|
||||
throw std::runtime_error(
|
||||
fmt::format("recursion detected while opening file: {}", file));
|
||||
}
|
||||
|
||||
std::ifstream ifs(file);
|
||||
|
||||
if (!ifs.is_open()) {
|
||||
throw std::runtime_error(fmt::format("error opening file: {}", file));
|
||||
}
|
||||
|
||||
std::string line;
|
||||
|
||||
while (std::getline(ifs, line)) {
|
||||
if (line.starts_with('#')) {
|
||||
continue;
|
||||
}
|
||||
if (line.find_first_not_of(" \t") == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
add_filter_rule(seen_files, line);
|
||||
}
|
||||
|
||||
seen_files.erase(file);
|
||||
} else {
|
||||
filter_.push_back(compile_filter_rule(rule));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
bool builtin_script_<LoggerPolicy>::filter(entry_interface const& ei) {
|
||||
std::string path = ei.dpath();
|
||||
std::string relpath = path;
|
||||
|
||||
if (relpath.size() >= root_path_.size()) {
|
||||
assert(relpath.substr(0, root_path_.size()) == root_path_);
|
||||
relpath.erase(0, root_path_.size());
|
||||
}
|
||||
|
||||
for (const auto& r : filter_) {
|
||||
if (std::regex_match(r.floating ? path : relpath, r.re)) {
|
||||
LOG_TRACE << path << " matched rule '" << r.rule << "'";
|
||||
switch (r.type) {
|
||||
case filter_rule::rule_type::include:
|
||||
return true;
|
||||
|
||||
case filter_rule::rule_type::exclude:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TRACE << path << " matched no rule";
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
builtin_script::builtin_script(logger& lgr)
|
||||
: impl_(make_unique_logging_object<impl, builtin_script_, logger_policies>(
|
||||
lgr)) {}
|
||||
|
||||
builtin_script::~builtin_script() = default;
|
||||
|
||||
bool builtin_script::has_configure() const { return false; }
|
||||
bool builtin_script::has_filter() const { return impl_->has_filter(); }
|
||||
bool builtin_script::has_transform() const { return false; }
|
||||
bool builtin_script::has_order() const { return false; }
|
||||
|
||||
void builtin_script::configure(options_interface const&) { assert(false); }
|
||||
|
||||
bool builtin_script::filter(entry_interface const& ei) {
|
||||
return impl_->filter(ei);
|
||||
}
|
||||
|
||||
void builtin_script::transform(entry_interface&) { assert(false); }
|
||||
void builtin_script::order(inode_vector&) { assert(false); }
|
||||
|
||||
} // namespace dwarfs
|
@ -66,6 +66,14 @@ std::string entry::path() const {
|
||||
return name_;
|
||||
}
|
||||
|
||||
std::string entry::dpath() const {
|
||||
auto p = path();
|
||||
if (type() == E_DIR) {
|
||||
p += '/';
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
std::string entry::type_string() const {
|
||||
auto mode = stat_.st_mode;
|
||||
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <deque>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <mutex>
|
||||
#include <numeric>
|
||||
@ -676,6 +677,7 @@ std::shared_ptr<entry>
|
||||
scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
|
||||
file_scanner& fs) {
|
||||
auto root = entry_->create(*os_, path);
|
||||
bool const debug_filter = options_.debug_filter_function.has_value();
|
||||
|
||||
if (root->type() != entry::E_DIR) {
|
||||
DWARFS_THROW(runtime_error, fmt::format("'{}' must be a directory", path));
|
||||
@ -704,18 +706,28 @@ scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
|
||||
|
||||
try {
|
||||
auto pe = entry_->create(*os_, name, parent);
|
||||
bool exclude = false;
|
||||
|
||||
if (script_) {
|
||||
if (script_->has_filter() && !script_->filter(*pe)) {
|
||||
LOG_DEBUG << "skipping " << pe->path();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (script_->has_transform()) {
|
||||
exclude = true;
|
||||
} else if (script_->has_transform()) {
|
||||
script_->transform(*pe);
|
||||
}
|
||||
}
|
||||
|
||||
if (debug_filter) {
|
||||
(*options_.debug_filter_function)(exclude, pe.get());
|
||||
}
|
||||
|
||||
if (exclude) {
|
||||
if (!debug_filter) {
|
||||
LOG_DEBUG << "excluding " << pe->dpath();
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (pe) {
|
||||
switch (pe->type()) {
|
||||
case entry::E_FILE:
|
||||
@ -748,25 +760,33 @@ scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
|
||||
case entry::E_DIR:
|
||||
// prog.current.store(pe.get());
|
||||
prog.dirs_found++;
|
||||
pe->scan(*os_, prog);
|
||||
if (!debug_filter) {
|
||||
pe->scan(*os_, prog);
|
||||
}
|
||||
subdirs.push_back(pe);
|
||||
break;
|
||||
|
||||
case entry::E_FILE:
|
||||
prog.files_found++;
|
||||
fs.scan(dynamic_cast<file*>(pe.get()));
|
||||
if (!debug_filter) {
|
||||
fs.scan(dynamic_cast<file*>(pe.get()));
|
||||
}
|
||||
break;
|
||||
|
||||
case entry::E_LINK:
|
||||
prog.symlinks_found++;
|
||||
pe->scan(*os_, prog);
|
||||
if (!debug_filter) {
|
||||
pe->scan(*os_, prog);
|
||||
}
|
||||
prog.symlinks_scanned++;
|
||||
break;
|
||||
|
||||
case entry::E_DEVICE:
|
||||
case entry::E_OTHER:
|
||||
prog.specials_found++;
|
||||
pe->scan(*os_, prog);
|
||||
if (!debug_filter) {
|
||||
pe->scan(*os_, prog);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
@ -796,7 +816,9 @@ scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
|
||||
template <typename LoggerPolicy>
|
||||
void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
const std::string& path, progress& prog) {
|
||||
LOG_INFO << "scanning " << path;
|
||||
if (!options_.debug_filter_function) {
|
||||
LOG_INFO << "scanning " << path;
|
||||
}
|
||||
|
||||
prog.set_status_function(status_string);
|
||||
|
||||
@ -806,6 +828,10 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
|
||||
auto root = scan_tree(path, prog, fs);
|
||||
|
||||
if (options_.debug_filter_function) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (options_.remove_empty_dirs) {
|
||||
LOG_INFO << "removing empty directories...";
|
||||
auto d = dynamic_cast<dir*>(root.get());
|
||||
|
192
src/mkdwarfs.cpp
192
src/mkdwarfs.cpp
@ -57,6 +57,7 @@
|
||||
|
||||
#include "dwarfs/block_compressor.h"
|
||||
#include "dwarfs/block_manager.h"
|
||||
#include "dwarfs/builtin_script.h"
|
||||
#include "dwarfs/console_writer.h"
|
||||
#include "dwarfs/entry.h"
|
||||
#include "dwarfs/error.h"
|
||||
@ -93,6 +94,16 @@ namespace {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Selects what debug_filter_output() prints.
enum class debug_filter_mode {
  OFF,
  INCLUDED,       // included entries only
  INCLUDED_FILES, // included entries only, without directories
  EXCLUDED,       // excluded entries only
  EXCLUDED_FILES, // excluded entries only, without directories
  FILES,          // all entries except directories, with a +/- prefix
  ALL             // all entries, with a +/- prefix
};
|
||||
|
||||
const std::map<std::string, file_order_mode> order_choices{
|
||||
{"none", file_order_mode::NONE},
|
||||
{"path", file_order_mode::PATH},
|
||||
@ -110,6 +121,15 @@ const std::map<std::string, console_writer::progress_mode> progress_modes{
|
||||
{"unicode", console_writer::UNICODE},
|
||||
};
|
||||
|
||||
const std::map<std::string, debug_filter_mode> debug_filter_modes{
|
||||
{"included", debug_filter_mode::INCLUDED},
|
||||
{"included-files", debug_filter_mode::INCLUDED_FILES},
|
||||
{"excluded", debug_filter_mode::EXCLUDED},
|
||||
{"excluded-files", debug_filter_mode::EXCLUDED_FILES},
|
||||
{"files", debug_filter_mode::FILES},
|
||||
{"all", debug_filter_mode::ALL},
|
||||
};
|
||||
|
||||
const std::map<std::string, uint32_t> time_resolutions{
|
||||
{"sec", 1},
|
||||
{"min", 60},
|
||||
@ -120,6 +140,32 @@ const std::map<std::string, uint32_t> time_resolutions{
|
||||
constexpr size_t min_block_size_bits{10};
|
||||
constexpr size_t max_block_size_bits{30};
|
||||
|
||||
void debug_filter_output(std::ostream& os, bool exclude, entry const* pe,
|
||||
debug_filter_mode mode) {
|
||||
if (exclude ? mode == debug_filter_mode::INCLUDED or
|
||||
mode == debug_filter_mode::INCLUDED_FILES
|
||||
: mode == debug_filter_mode::EXCLUDED or
|
||||
mode == debug_filter_mode::EXCLUDED_FILES) {
|
||||
return;
|
||||
}
|
||||
|
||||
bool const files_only = mode == debug_filter_mode::FILES or
|
||||
mode == debug_filter_mode::INCLUDED_FILES or
|
||||
mode == debug_filter_mode::EXCLUDED_FILES;
|
||||
|
||||
if (files_only and pe->type() == entry::E_DIR) {
|
||||
return;
|
||||
}
|
||||
|
||||
char const* prefix = "";
|
||||
|
||||
if (mode == debug_filter_mode::FILES or mode == debug_filter_mode::ALL) {
|
||||
prefix = exclude ? "- " : "+ ";
|
||||
}
|
||||
|
||||
os << prefix << pe->dpath() << "\n";
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace dwarfs {
|
||||
@ -339,7 +385,8 @@ int mkdwarfs(int argc, char** argv) {
|
||||
std::string path, output, memory_limit, script_arg, compression, header,
|
||||
schema_compression, metadata_compression, log_level_str, timestamp,
|
||||
time_resolution, order, progress_mode, recompress_opts, pack_metadata,
|
||||
file_hash_algo;
|
||||
file_hash_algo, debug_filter;
|
||||
std::vector<std::string> filter;
|
||||
size_t num_workers;
|
||||
bool no_progress = false, remove_header = false, no_section_index = false,
|
||||
force_overwrite = false;
|
||||
@ -354,6 +401,10 @@ int mkdwarfs(int argc, char** argv) {
|
||||
auto progress_desc = "progress mode (" +
|
||||
(from(progress_modes) | get<0>() | unsplit(", ")) + ")";
|
||||
|
||||
auto debug_filter_desc =
|
||||
"show effect of filter rules without producing an image (" +
|
||||
(from(debug_filter_modes) | get<0>() | unsplit(", ")) + ")";
|
||||
|
||||
auto resolution_desc = "time resolution in seconds or (" +
|
||||
(from(time_resolutions) | get<0>() | unsplit(", ")) +
|
||||
")";
|
||||
@ -439,6 +490,12 @@ int mkdwarfs(int argc, char** argv) {
|
||||
po::value<std::string>(&script_arg),
|
||||
"Python script for customization")
|
||||
#endif
|
||||
("filter,F",
|
||||
po::value<std::vector<std::string>>(&filter)->multitoken(),
|
||||
"add filter rule")
|
||||
("debug-filter",
|
||||
po::value<std::string>(&debug_filter)->implicit_value("all"),
|
||||
debug_filter_desc.c_str())
|
||||
("remove-empty-dirs",
|
||||
po::value<bool>(&options.remove_empty_dirs)->zero_tokens(),
|
||||
"remove empty directories in file system")
|
||||
@ -498,7 +555,8 @@ int mkdwarfs(int argc, char** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (vm.count("help") or !vm.count("input") or !vm.count("output")) {
|
||||
if (vm.count("help") or !vm.count("input") or
|
||||
(!vm.count("output") and !vm.count("debug-filter"))) {
|
||||
size_t l_dc = 0, l_sc = 0, l_mc = 0, l_or = 0;
|
||||
for (auto const& l : levels) {
|
||||
l_dc = std::max(l_dc, l.data_compression.size());
|
||||
@ -683,6 +741,21 @@ int mkdwarfs(int argc, char** argv) {
|
||||
worker_group wg_compress("compress", num_workers);
|
||||
worker_group wg_scanner("scanner", num_workers);
|
||||
|
||||
if (vm.count("debug-filter")) {
|
||||
if (auto it = debug_filter_modes.find(debug_filter);
|
||||
it != debug_filter_modes.end()) {
|
||||
options.debug_filter_function = [mode = it->second](bool exclude,
|
||||
entry const* pe) {
|
||||
debug_filter_output(std::cout, exclude, pe, mode);
|
||||
};
|
||||
no_progress = true;
|
||||
} else {
|
||||
std::cerr << "error: invalid filter debug mode '" << debug_filter
|
||||
<< "'\n";
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (no_progress) {
|
||||
progress_mode = "none";
|
||||
}
|
||||
@ -728,6 +801,30 @@ int mkdwarfs(int argc, char** argv) {
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!filter.empty()) {
|
||||
if (script) {
|
||||
std::cerr
|
||||
<< "error: scripts and filters are not simultaneously supported\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto bs = std::make_shared<builtin_script>(lgr);
|
||||
|
||||
bs->set_root_path(path);
|
||||
|
||||
for (auto const& rule : filter) {
|
||||
try {
|
||||
bs->add_filter_rule(rule);
|
||||
} catch (std::exception const& e) {
|
||||
std::cerr << "error: could not parse filter rule '" << rule
|
||||
<< "': " << e.what() << "\n";
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
script = bs;
|
||||
}
|
||||
|
||||
bool force_similarity = false;
|
||||
|
||||
if (script && script->has_configure()) {
|
||||
@ -853,8 +950,15 @@ int mkdwarfs(int argc, char** argv) {
|
||||
|
||||
LOG_PROXY(debug_logger_policy, lgr);
|
||||
|
||||
progress prog([&](const progress& p, bool last) { lgr.update(p, last); },
|
||||
interval_ms);
|
||||
folly::Function<void(const progress&, bool)> updater;
|
||||
|
||||
if (options.debug_filter_function) {
|
||||
updater = [](const progress&, bool) {};
|
||||
} else {
|
||||
updater = [&](const progress& p, bool last) { lgr.update(p, last); };
|
||||
}
|
||||
|
||||
progress prog(std::move(updater), interval_ms);
|
||||
|
||||
block_compressor bc(compression);
|
||||
block_compressor schema_bc(schema_compression);
|
||||
@ -869,21 +973,30 @@ int mkdwarfs(int argc, char** argv) {
|
||||
<< " blocks with " << num_workers << " threads";
|
||||
}
|
||||
|
||||
if (std::filesystem::exists(output) && !force_overwrite) {
|
||||
std::cerr << "error: output file already exists, use --force to overwrite"
|
||||
<< std::endl;
|
||||
return 1;
|
||||
std::unique_ptr<std::ostream> os;
|
||||
|
||||
if (!options.debug_filter_function) {
|
||||
if (std::filesystem::exists(output) && !force_overwrite) {
|
||||
std::cerr << "error: output file already exists, use --force to overwrite"
|
||||
<< std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto ofs = std::make_unique<std::ofstream>(output, std::ios::binary |
|
||||
std::ios::trunc);
|
||||
|
||||
if (ofs->bad() || !ofs->is_open()) {
|
||||
std::cerr << "error: cannot open output file '" << output
|
||||
<< "': " << strerror(errno) << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
os = std::move(ofs);
|
||||
} else {
|
||||
os = std::make_unique<std::ostringstream>();
|
||||
}
|
||||
|
||||
std::ofstream ofs(output, std::ios::binary | std::ios::trunc);
|
||||
|
||||
if (ofs.bad() || !ofs.is_open()) {
|
||||
std::cerr << "error: cannot open output file '" << output
|
||||
<< "': " << strerror(errno) << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
filesystem_writer fsw(ofs, lgr, wg_compress, prog, bc, schema_bc, metadata_bc,
|
||||
filesystem_writer fsw(*os, lgr, wg_compress, prog, bc, schema_bc, metadata_bc,
|
||||
fswopts, header_ifs.get());
|
||||
|
||||
auto ti = LOG_TIMED_INFO;
|
||||
@ -914,29 +1027,42 @@ int mkdwarfs(int argc, char** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG_INFO << "compression CPU time: "
|
||||
<< time_with_unit(wg_compress.get_cpu_time());
|
||||
|
||||
ofs.close();
|
||||
|
||||
if (ofs.bad()) {
|
||||
LOG_ERROR << "failed to close output file '" << output
|
||||
<< "': " << strerror(errno);
|
||||
return 1;
|
||||
if (!options.debug_filter_function) {
|
||||
LOG_INFO << "compression CPU time: "
|
||||
<< time_with_unit(wg_compress.get_cpu_time());
|
||||
}
|
||||
|
||||
std::ostringstream err;
|
||||
if (auto ofs = dynamic_cast<std::ofstream*>(os.get())) {
|
||||
ofs->close();
|
||||
|
||||
if (prog.errors) {
|
||||
err << "with " << prog.errors << " error";
|
||||
if (prog.errors > 1) {
|
||||
err << "s";
|
||||
if (ofs->bad()) {
|
||||
LOG_ERROR << "failed to close output file '" << output
|
||||
<< "': " << strerror(errno);
|
||||
return 1;
|
||||
}
|
||||
} else if (auto oss = dynamic_cast<std::ostringstream*>(os.get())) {
|
||||
assert(oss->str().empty());
|
||||
} else {
|
||||
err << "without errors";
|
||||
assert(false);
|
||||
}
|
||||
|
||||
ti << "filesystem " << (recompress ? "rewritten " : "created ") << err.str();
|
||||
os.reset();
|
||||
|
||||
if (!options.debug_filter_function) {
|
||||
std::ostringstream err;
|
||||
|
||||
if (prog.errors) {
|
||||
err << "with " << prog.errors << " error";
|
||||
if (prog.errors > 1) {
|
||||
err << "s";
|
||||
}
|
||||
} else {
|
||||
err << "without errors";
|
||||
}
|
||||
|
||||
ti << "filesystem " << (recompress ? "rewritten " : "created ")
|
||||
<< err.str();
|
||||
}
|
||||
|
||||
return prog.errors > 0;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user