Add --input-list option to pass in a list of files

This commit is contained in:
Marcus Holland-Moritz 2022-11-06 21:03:01 +01:00
parent ff5f99f3d9
commit eb8803d6df
6 changed files with 315 additions and 106 deletions

View File

@ -27,9 +27,17 @@ There two mandatory options for specifying the input and output:
- `-i`, `--input=`*path*|*file*:
Path to the root directory containing the files from which you want to
build a filesystem. If the `--recompress` option is given, this argument
build a file system. If the `--recompress` option is given, this argument
is the source filesystem.
- `--input-list=`*file*|`-`:
Read list of paths to add to the file system from this file or from stdin.
The pathnames will be interpreted relative to the path given with `--input`.
If `--input` is omitted, the pathnames will be interpreted relative to the
current directory. If you want files to be stored in the exact same order
as read from this list (because, for example, you have already sorted them
by similarity or access frequency), you must also pass `--order=none`.
- `-o`, `--output=`*file*:
File name of the output filesystem.

View File

@ -174,11 +174,17 @@ class dir : public entry {
return inode_num_;
}
// Look up a direct child of this directory by name.
// Returns the matching entry, or nullptr if no child has that name.
std::shared_ptr<entry> find(std::string_view name);
private:
using entry_ptr = std::shared_ptr<entry>;
// Name -> child index used by find().
// NOTE(review): the keys are string_views built from entry::name(); this is
// only safe if name() returns a reference to storage that lives as long as
// the entry - confirm against the entry class.
using lookup_table = std::unordered_map<std::string_view, entry_ptr>;
// Builds lookup_ from the current contents of entries_; must only be called
// while lookup_ is unset.
void populate_lookup_table();
std::vector<std::shared_ptr<entry>> entries_;
std::optional<uint32_t> inode_num_;
// Created on demand by find() once the directory holds 16 or more entries,
// kept in sync by add(), and dropped again by remove_empty_dirs().
std::unique_ptr<lookup_table> lookup_;
};
class link : public entry {

View File

@ -21,7 +21,10 @@
#pragma once
#include <filesystem>
#include <memory>
#include <optional>
#include <span>
#include <string>
#include "dwarfs/block_manager.h"
@ -44,8 +47,10 @@ class scanner {
std::shared_ptr<entry_factory> ef, std::shared_ptr<os_access> os,
std::shared_ptr<script> scr, const scanner_options& options);
void scan(filesystem_writer& fsw, const std::string& path, progress& prog) {
impl_->scan(fsw, path, prog);
// Scan the input rooted at `path` and forward the work to the implementation.
//
// `list` is optional and defaults to std::nullopt. When it is set, the
// implementation in this change builds the tree from exactly those paths
// (scan_list) instead of walking the directory tree (scan_tree).
// The span is a non-owning view, so the caller's storage must stay alive for
// the duration of the call.
void scan(filesystem_writer& fsw, const std::string& path, progress& prog,
std::optional<std::span<std::filesystem::path const>> list =
std::nullopt) {
impl_->scan(fsw, path, prog, list);
}
class impl {
@ -53,7 +58,8 @@ class scanner {
virtual ~impl() = default;
virtual void
scan(filesystem_writer& fsw, const std::string& path, progress& prog) = 0;
scan(filesystem_writer& fsw, const std::string& path, progress& prog,
std::optional<std::span<std::filesystem::path const>> list) = 0;
};
private:

View File

@ -235,7 +235,13 @@ void file::hardlink(file* other, progress& prog) {
entry::type_t dir::type() const { return E_DIR; }
void dir::add(std::shared_ptr<entry> e) { entries_.emplace_back(std::move(e)); }
// Append `e` to this directory's children. If the name index has already
// been built, register the new child there as well so find() stays correct.
void dir::add(std::shared_ptr<entry> e) {
  if (lookup_) {
    [[maybe_unused]] bool const inserted =
        lookup_->emplace(e->name(), e).second;
    // Two children with the same name would make the index ambiguous.
    assert(inserted);
  }

  entries_.emplace_back(std::move(e));
}
void dir::walk(std::function<void(entry*)> const& f) {
f(this);
@ -323,6 +329,40 @@ void dir::remove_empty_dirs(progress& prog) {
prog.dirs_found -= num;
entries_.erase(last, entries_.end());
}
lookup_.reset();
}
// Return the direct child called `name`, or nullptr if there is none.
//
// Directories with fewer than 16 children are searched linearly; larger ones
// get a hash index that is built on first use and reused afterwards.
std::shared_ptr<entry> dir::find(std::string_view name) {
  if (!lookup_) {
    if (entries_.size() < 16) {
      // No index and too few children to justify one: plain linear scan.
      for (auto& child : entries_) {
        if (child->name() == name) {
          return child;
        }
      }
      return nullptr;
    }

    populate_lookup_table();
  }

  auto const pos = lookup_->find(name);
  if (pos == lookup_->end()) {
    return nullptr;
  }
  return pos->second;
}
// Build the name -> child index from the current contents of entries_.
//
// Must only be called while no index exists. The table is assembled in a
// local object and published to lookup_ only once it is complete, so an
// exception thrown while filling it (e.g. on allocation failure) cannot leave
// a partially populated index behind for later find() calls to trust.
void dir::populate_lookup_table() {
  assert(!lookup_);

  auto table = std::make_unique<lookup_table>();
  table->reserve(entries_.size());

  for (auto const& e : entries_) {
    auto r [[maybe_unused]] = table->emplace(e->name(), e);
    // Child names are expected to be unique within a directory.
    assert(r.second);
  }

  lookup_ = std::move(table);
}
entry::type_t link::type() const { return E_LINK; }

View File

@ -284,13 +284,23 @@ class scanner_ final : public scanner::impl {
std::shared_ptr<entry_factory> ef, std::shared_ptr<os_access> os,
std::shared_ptr<script> scr, const scanner_options& options);
void scan(filesystem_writer& fsw, const std::string& path,
progress& prog) override;
void
scan(filesystem_writer& fsw, const std::string& path, progress& prog,
std::optional<std::span<std::filesystem::path const>> list) override;
private:
std::shared_ptr<entry>
scan_tree(const std::string& path, progress& prog, detail::file_scanner& fs);
// Build the entry tree below `path` from an explicit list of paths instead of
// walking the directory. Returns the root entry.
std::shared_ptr<entry> scan_list(const std::string& path,
std::span<std::filesystem::path const> list,
progress& prog, detail::file_scanner& fs);
// Create the entry `name` below `parent`, run the optional script
// filter/transform on it and, unless it is rejected, add it to `parent` and
// start scanning it. Returns the new entry, or nullptr if nothing was added.
// With `debug_filter` set, options_.debug_filter_function is invoked and the
// actual scanning of the entry is skipped.
std::shared_ptr<entry>
add_entry(std::string const& name, std::shared_ptr<dir> parent,
progress& prog, detail::file_scanner& fs,
bool debug_filter = false);
const block_manager::config& cfg_;
const scanner_options& options_;
std::shared_ptr<entry_factory> entry_;
@ -317,6 +327,111 @@ scanner_<LoggerPolicy>::scanner_(logger& lgr, worker_group& wg,
, lgr_(lgr)
, LOG_PROXY_INIT(lgr_) {}
// Create the entry `name` below `parent` and, unless it is rejected, append
// it to `parent` and start scanning it.
//
// Processing order:
//   1. the entry is created through the entry factory;
//   2. if a script is set, its filter may exclude the entry, otherwise its
//      transform (if any) is applied;
//   3. with `debug_filter` set, options_.debug_filter_function is told
//      whether the entry was excluded;
//   4. unreadable regular files, and devices / other special files that are
//      not enabled in the options, are dropped;
//   5. the entry is added to `parent`, the matching progress counters are
//      updated and (unless `debug_filter` is set) the entry is scanned.
//
// Returns the new entry, or nullptr when nothing was added (the factory
// produced no entry, the entry was excluded or dropped, or a
// std::system_error was caught, which is logged and counted as an error).
//
// The null check on the factory result happens right after creation so that
// neither the script hooks nor the debug filter callback are ever handed a
// null entry.
template <typename LoggerPolicy>
std::shared_ptr<entry>
scanner_<LoggerPolicy>::add_entry(std::string const& name,
                                  std::shared_ptr<dir> parent, progress& prog,
                                  detail::file_scanner& fs, bool debug_filter) {
  try {
    auto pe = entry_->create(*os_, name, parent);

    if (!pe) {
      // The factory did not produce an entry; there is nothing to filter,
      // report or add.
      return nullptr;
    }

    bool exclude = false;

    if (script_) {
      if (script_->has_filter() && !script_->filter(*pe)) {
        exclude = true;
      } else if (script_->has_transform()) {
        script_->transform(*pe);
      }
    }

    if (debug_filter) {
      (*options_.debug_filter_function)(exclude, pe.get());
    }

    if (exclude) {
      if (!debug_filter) {
        LOG_DEBUG << "excluding " << pe->dpath();
      }

      return nullptr;
    }

    // Decide whether this entry is allowed into the tree at all.
    switch (pe->type()) {
    case entry::E_FILE:
      // A non-zero result from access() is treated as "not readable".
      if (os_->access(pe->path(), R_OK)) {
        LOG_ERROR << "cannot access: " << pe->path();
        prog.errors++;
        return nullptr;
      }
      break;

    case entry::E_DEVICE:
      if (!options_.with_devices) {
        return nullptr;
      }
      break;

    case entry::E_OTHER:
      if (!options_.with_specials) {
        return nullptr;
      }
      break;

    default:
      break;
    }

    parent->add(pe);

    // Update statistics and scan the entry according to its type.
    switch (pe->type()) {
    case entry::E_DIR:
      // prog.current.store(pe.get());
      prog.dirs_found++;
      if (!debug_filter) {
        pe->scan(*os_, prog);
      }
      break;

    case entry::E_FILE:
      prog.files_found++;
      if (!debug_filter) {
        fs.scan(dynamic_cast<file*>(pe.get()));
      }
      break;

    case entry::E_LINK:
      prog.symlinks_found++;
      if (!debug_filter) {
        pe->scan(*os_, prog);
      }
      prog.symlinks_scanned++;
      break;

    case entry::E_DEVICE:
    case entry::E_OTHER:
      prog.specials_found++;
      if (!debug_filter) {
        pe->scan(*os_, prog);
      }
      break;

    default:
      LOG_ERROR << "unsupported entry type: " << int(pe->type());
      prog.errors++;
      break;
    }

    return pe;
  } catch (const std::system_error& e) {
    LOG_ERROR << "error reading entry: " << e.what();
    prog.errors++;
  }

  return nullptr;
}
template <typename LoggerPolicy>
std::shared_ptr<entry>
scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
@ -349,100 +464,10 @@ scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
continue;
}
try {
auto pe = entry_->create(*os_, name, parent);
bool exclude = false;
if (script_) {
if (script_->has_filter() && !script_->filter(*pe)) {
exclude = true;
} else if (script_->has_transform()) {
script_->transform(*pe);
}
if (auto pe = add_entry(name, parent, prog, fs, debug_filter)) {
if (pe->type() == entry::E_DIR) {
subdirs.push_back(pe);
}
if (debug_filter) {
(*options_.debug_filter_function)(exclude, pe.get());
}
if (exclude) {
if (!debug_filter) {
LOG_DEBUG << "excluding " << pe->dpath();
}
continue;
}
if (pe) {
switch (pe->type()) {
case entry::E_FILE:
if (os_->access(pe->path(), R_OK)) {
LOG_ERROR << "cannot access: " << pe->path();
prog.errors++;
continue;
}
break;
case entry::E_DEVICE:
if (!options_.with_devices) {
continue;
}
break;
case entry::E_OTHER:
if (!options_.with_specials) {
continue;
}
break;
default:
break;
}
parent->add(pe);
switch (pe->type()) {
case entry::E_DIR:
// prog.current.store(pe.get());
prog.dirs_found++;
if (!debug_filter) {
pe->scan(*os_, prog);
}
subdirs.push_back(pe);
break;
case entry::E_FILE:
prog.files_found++;
if (!debug_filter) {
fs.scan(dynamic_cast<file*>(pe.get()));
}
break;
case entry::E_LINK:
prog.symlinks_found++;
if (!debug_filter) {
pe->scan(*os_, prog);
}
prog.symlinks_scanned++;
break;
case entry::E_DEVICE:
case entry::E_OTHER:
prog.specials_found++;
if (!debug_filter) {
pe->scan(*os_, prog);
}
break;
default:
LOG_ERROR << "unsupported entry type: " << int(pe->type());
prog.errors++;
break;
}
}
} catch (const std::system_error& e) {
LOG_ERROR << "error reading entry: " << e.what();
prog.errors++;
}
}
@ -459,8 +484,88 @@ scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
}
template <typename LoggerPolicy>
void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
const std::string& path, progress& prog) {
// Build the entry tree below `path` from an explicit list of paths instead
// of walking the directory tree.
//
// Every element of `list` is split into its parent path and file name. The
// parent directories are created on demand (one add_entry() call per missing
// component) and remembered in a cache keyed by the parent path string, then
// the final component is added unless the parent already has a child of that
// name.
//
// Throws runtime_error if a script filter is active, if `path` is not a
// directory, or if a parent component of a list element cannot be resolved
// to a directory. Returns the root entry.
//
// NOTE(review): an empty path in `list` resolves to the root as parent and
// an empty file name, which is then passed to add_entry() - confirm that
// callers never pass empty paths (e.g. blank lines from an input file).
std::shared_ptr<entry>
scanner_<LoggerPolicy>::scan_list(const std::string& path,
                                  std::span<std::filesystem::path const> list,
                                  progress& prog, detail::file_scanner& fs) {
  if (script_ && script_->has_filter()) {
    DWARFS_THROW(runtime_error, "cannot use filters with file lists");
  }

  auto ti = LOG_TIMED_INFO;

  auto root = entry_->create(*os_, path);

  // add_entry() treats the factory result as nullable, so guard against a
  // null root here instead of dereferencing it unconditionally.
  if (!root || root->type() != entry::E_DIR) {
    DWARFS_THROW(runtime_error, fmt::format("'{}' must be a directory", path));
  }

  // Walk `rel` component by component starting at `cur`, creating missing
  // directories along the way, and return the entry of the last component.
  auto ensure_path = [this, &prog, &fs](std::filesystem::path const& rel,
                                        std::shared_ptr<entry> cur) {
    for (auto const& comp : rel) {
      if (auto d = std::dynamic_pointer_cast<dir>(cur)) {
        auto const comp_name = comp.string();
        if (auto e = d->find(comp_name)) {
          cur = e;
        } else {
          cur = add_entry(comp_name, d, prog, fs);
          if (cur && cur->type() == entry::E_DIR) {
            prog.dirs_scanned++;
          } else {
            DWARFS_THROW(runtime_error,
                         fmt::format("invalid path '{}'", rel.string()));
          }
        }
      } else {
        // An intermediate component exists but is not a directory.
        DWARFS_THROW(runtime_error,
                     fmt::format("invalid path '{}'", rel.string()));
      }
    }

    return cur;
  };

  // Parent path string -> directory entry, so consecutive list elements in
  // the same directory do not have to re-walk the tree.
  std::unordered_map<std::string, std::shared_ptr<dir>> dir_cache;

  for (auto const& p : list) {
    auto pp = p.parent_path();
    auto const pp_key = pp.string();
    std::shared_ptr<dir> pd;

    if (auto it = dir_cache.find(pp_key); it != dir_cache.end()) {
      pd = it->second;
    } else {
      pd = std::dynamic_pointer_cast<dir>(ensure_path(pp, root));

      if (pd) {
        dir_cache.emplace(pp_key, pd);
      } else {
        DWARFS_THROW(runtime_error,
                     fmt::format("invalid path '{}'", p.string()));
      }
    }

    auto const fname = p.filename().string();

    // Already present (listed twice, or created earlier as a parent).
    if (pd->find(fname)) {
      continue;
    }

    if (auto pe = add_entry(fname, pd, prog, fs)) {
      if (pe->type() == entry::E_DIR) {
        prog.dirs_scanned++;
      }
    }
  }

  ti << "scanned input list";

  return root;
}
template <typename LoggerPolicy>
void scanner_<LoggerPolicy>::scan(
filesystem_writer& fsw, const std::string& path, progress& prog,
std::optional<std::span<std::filesystem::path const>> list) {
if (!options_.debug_filter_function) {
LOG_INFO << "scanning " << path;
}
@ -471,7 +576,8 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
detail::file_scanner fs(wg_, *os_, im, options_.inode,
options_.file_hash_algorithm, prog);
auto root = scan_tree(path, prog, fs);
auto root =
list ? scan_list(path, *list, prog, fs) : scan_tree(path, prog, fs);
if (options_.debug_filter_function) {
return;

View File

@ -372,7 +372,7 @@ int mkdwarfs(int argc, char** argv) {
std::string path, output, memory_limit, script_arg, compression, header,
schema_compression, metadata_compression, log_level_str, timestamp,
time_resolution, order, progress_mode, recompress_opts, pack_metadata,
file_hash_algo, debug_filter, max_similarity_size;
file_hash_algo, debug_filter, max_similarity_size, input_list_str;
std::vector<std::string> filter;
size_t num_workers, num_scanner_workers;
bool no_progress = false, remove_header = false, no_section_index = false,
@ -413,6 +413,9 @@ int mkdwarfs(int argc, char** argv) {
("force,f",
po::value<bool>(&force_overwrite)->zero_tokens(),
"force overwrite of existing output image")
("input-list",
po::value<std::string>(&input_list_str),
"file containing list of paths relative to root directory")
("compress-level,l",
po::value<unsigned>(&level)->default_value(default_level),
"compression level (0=fast, 9=best, please see man page for details)")
@ -548,7 +551,7 @@ int mkdwarfs(int argc, char** argv) {
return 1;
}
if (vm.count("help") or !vm.count("input") or
if (vm.count("help") or !(vm.count("input") or vm.count("input-list")) or
(!vm.count("output") and !vm.count("debug-filter"))) {
size_t l_dc = 0, l_sc = 0, l_mc = 0, l_or = 0;
for (auto const& l : levels) {
@ -643,6 +646,42 @@ int mkdwarfs(int argc, char** argv) {
return 1;
}
std::optional<std::vector<std::filesystem::path>> input_list;
if (vm.count("input-list")) {
if (vm.count("filter")) {
std::cerr << "error: cannot use --input-list and --filter\n";
return 1;
}
if (!vm.count("input")) {
path = std::filesystem::current_path().string();
}
std::unique_ptr<std::ifstream> ifs;
std::istream* is;
if (input_list_str == "-") {
is = &std::cin;
} else {
ifs = std::make_unique<std::ifstream>(input_list_str);
if (!ifs->is_open()) {
throw std::runtime_error(
fmt::format("error opening file: {}", input_list_str));
}
is = ifs.get();
}
std::string line;
input_list.emplace();
while (std::getline(*is, line)) {
input_list->emplace_back(line);
}
}
bool recompress = vm.count("recompress");
rewrite_options rw_opts;
if (recompress) {
@ -1011,7 +1050,11 @@ int mkdwarfs(int argc, char** argv) {
std::make_shared<os_access_posix>(), std::move(script),
options);
s.scan(fsw, path, prog);
if (input_list) {
s.scan(fsw, path, prog, *input_list);
} else {
s.scan(fsw, path, prog);
}
}
} catch (runtime_error const& e) {
LOG_ERROR << e.what();