mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-09 12:28:13 -04:00
Add --input-list option to pass in a list of files
This commit is contained in:
parent
ff5f99f3d9
commit
eb8803d6df
@ -27,9 +27,17 @@ There are two mandatory options for specifying the input and output:
|
||||
|
||||
- `-i`, `--input=`*path*|*file*:
|
||||
Path to the root directory containing the files from which you want to
|
||||
build a filesystem. If the `--recompress` option is given, this argument
|
||||
build a file system. If the `--recompress` option is given, this argument
|
||||
is the source filesystem.
|
||||
|
||||
- `--input-list=`*file*|`-`:
|
||||
Read list of paths to add to the file system from this file or from stdin.
|
||||
The pathnames will be interpreted relative to the path given with `--input`.
|
||||
If `--input` is omitted, the pathnames will be interpreted relative to the
|
||||
current directory. If you want files to be stored in the exact same order
|
||||
as read from this list (because, for example, you have already sorted them
|
||||
by similarity or access frequency), you must also pass `--order=none`.
|
||||
|
||||
- `-o`, `--output=`*file*:
|
||||
File name of the output filesystem.
|
||||
|
||||
|
@ -174,11 +174,17 @@ class dir : public entry {
|
||||
return inode_num_;
|
||||
}
|
||||
|
||||
std::shared_ptr<entry> find(std::string_view name);
|
||||
|
||||
private:
|
||||
using entry_ptr = std::shared_ptr<entry>;
|
||||
using lookup_table = std::unordered_map<std::string_view, entry_ptr>;
|
||||
|
||||
void populate_lookup_table();
|
||||
|
||||
std::vector<std::shared_ptr<entry>> entries_;
|
||||
std::optional<uint32_t> inode_num_;
|
||||
std::unique_ptr<lookup_table> lookup_;
|
||||
};
|
||||
|
||||
class link : public entry {
|
||||
|
@ -21,7 +21,10 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <filesystem>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <span>
|
||||
#include <string>
|
||||
|
||||
#include "dwarfs/block_manager.h"
|
||||
@ -44,8 +47,10 @@ class scanner {
|
||||
std::shared_ptr<entry_factory> ef, std::shared_ptr<os_access> os,
|
||||
std::shared_ptr<script> scr, const scanner_options& options);
|
||||
|
||||
void scan(filesystem_writer& fsw, const std::string& path, progress& prog) {
|
||||
impl_->scan(fsw, path, prog);
|
||||
void scan(filesystem_writer& fsw, const std::string& path, progress& prog,
|
||||
std::optional<std::span<std::filesystem::path const>> list =
|
||||
std::nullopt) {
|
||||
impl_->scan(fsw, path, prog, list);
|
||||
}
|
||||
|
||||
class impl {
|
||||
@ -53,7 +58,8 @@ class scanner {
|
||||
virtual ~impl() = default;
|
||||
|
||||
virtual void
|
||||
scan(filesystem_writer& fsw, const std::string& path, progress& prog) = 0;
|
||||
scan(filesystem_writer& fsw, const std::string& path, progress& prog,
|
||||
std::optional<std::span<std::filesystem::path const>> list) = 0;
|
||||
};
|
||||
|
||||
private:
|
||||
|
@ -235,7 +235,13 @@ void file::hardlink(file* other, progress& prog) {
|
||||
|
||||
entry::type_t dir::type() const { return E_DIR; }
|
||||
|
||||
void dir::add(std::shared_ptr<entry> e) { entries_.emplace_back(std::move(e)); }
|
||||
void dir::add(std::shared_ptr<entry> e) {
|
||||
if (lookup_) {
|
||||
auto r [[maybe_unused]] = lookup_->emplace(e->name(), e);
|
||||
assert(r.second);
|
||||
}
|
||||
entries_.emplace_back(std::move(e));
|
||||
}
|
||||
|
||||
void dir::walk(std::function<void(entry*)> const& f) {
|
||||
f(this);
|
||||
@ -323,6 +329,40 @@ void dir::remove_empty_dirs(progress& prog) {
|
||||
prog.dirs_found -= num;
|
||||
entries_.erase(last, entries_.end());
|
||||
}
|
||||
|
||||
lookup_.reset();
|
||||
}
|
||||
|
||||
std::shared_ptr<entry> dir::find(std::string_view name) {
|
||||
if (!lookup_ && entries_.size() >= 16) {
|
||||
populate_lookup_table();
|
||||
}
|
||||
|
||||
if (lookup_) {
|
||||
if (auto it = lookup_->find(name); it != lookup_->end()) {
|
||||
return it->second;
|
||||
}
|
||||
} else {
|
||||
auto it = std::find_if(entries_.begin(), entries_.end(),
|
||||
[name](auto& e) { return e->name() == name; });
|
||||
if (it != entries_.end()) {
|
||||
return *it;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Builds the name -> entry hash table from the current list of entries.
//
// Must only be called while no table exists. Every entry name is expected to
// be unique within the directory; both conditions are checked by assertions.
void dir::populate_lookup_table() {
  assert(!lookup_);

  lookup_ = std::make_unique<lookup_table>();

  auto& table = *lookup_;
  // Size the table up front so that filling it does not trigger rehashing.
  table.reserve(entries_.size());

  for (auto const& child : entries_) {
    [[maybe_unused]] bool const inserted =
        table.emplace(child->name(), child).second;
    assert(inserted);
  }
}
|
||||
|
||||
entry::type_t link::type() const { return E_LINK; }
|
||||
|
@ -284,13 +284,23 @@ class scanner_ final : public scanner::impl {
|
||||
std::shared_ptr<entry_factory> ef, std::shared_ptr<os_access> os,
|
||||
std::shared_ptr<script> scr, const scanner_options& options);
|
||||
|
||||
void scan(filesystem_writer& fsw, const std::string& path,
|
||||
progress& prog) override;
|
||||
void
|
||||
scan(filesystem_writer& fsw, const std::string& path, progress& prog,
|
||||
std::optional<std::span<std::filesystem::path const>> list) override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<entry>
|
||||
scan_tree(const std::string& path, progress& prog, detail::file_scanner& fs);
|
||||
|
||||
std::shared_ptr<entry> scan_list(const std::string& path,
|
||||
std::span<std::filesystem::path const> list,
|
||||
progress& prog, detail::file_scanner& fs);
|
||||
|
||||
std::shared_ptr<entry>
|
||||
add_entry(std::string const& name, std::shared_ptr<dir> parent,
|
||||
progress& prog, detail::file_scanner& fs,
|
||||
bool debug_filter = false);
|
||||
|
||||
const block_manager::config& cfg_;
|
||||
const scanner_options& options_;
|
||||
std::shared_ptr<entry_factory> entry_;
|
||||
@ -317,6 +327,111 @@ scanner_<LoggerPolicy>::scanner_(logger& lgr, worker_group& wg,
|
||||
, lgr_(lgr)
|
||||
, LOG_PROXY_INIT(lgr_) {}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
std::shared_ptr<entry>
|
||||
scanner_<LoggerPolicy>::add_entry(std::string const& name,
|
||||
std::shared_ptr<dir> parent, progress& prog,
|
||||
detail::file_scanner& fs, bool debug_filter) {
|
||||
try {
|
||||
auto pe = entry_->create(*os_, name, parent);
|
||||
bool exclude = false;
|
||||
|
||||
if (script_) {
|
||||
if (script_->has_filter() && !script_->filter(*pe)) {
|
||||
exclude = true;
|
||||
} else if (script_->has_transform()) {
|
||||
script_->transform(*pe);
|
||||
}
|
||||
}
|
||||
|
||||
if (debug_filter) {
|
||||
(*options_.debug_filter_function)(exclude, pe.get());
|
||||
}
|
||||
|
||||
if (exclude) {
|
||||
if (!debug_filter) {
|
||||
LOG_DEBUG << "excluding " << pe->dpath();
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (pe) {
|
||||
switch (pe->type()) {
|
||||
case entry::E_FILE:
|
||||
if (os_->access(pe->path(), R_OK)) {
|
||||
LOG_ERROR << "cannot access: " << pe->path();
|
||||
prog.errors++;
|
||||
return nullptr;
|
||||
}
|
||||
break;
|
||||
|
||||
case entry::E_DEVICE:
|
||||
if (!options_.with_devices) {
|
||||
return nullptr;
|
||||
}
|
||||
break;
|
||||
|
||||
case entry::E_OTHER:
|
||||
if (!options_.with_specials) {
|
||||
return nullptr;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
parent->add(pe);
|
||||
|
||||
switch (pe->type()) {
|
||||
case entry::E_DIR:
|
||||
// prog.current.store(pe.get());
|
||||
prog.dirs_found++;
|
||||
if (!debug_filter) {
|
||||
pe->scan(*os_, prog);
|
||||
}
|
||||
break;
|
||||
|
||||
case entry::E_FILE:
|
||||
prog.files_found++;
|
||||
if (!debug_filter) {
|
||||
fs.scan(dynamic_cast<file*>(pe.get()));
|
||||
}
|
||||
break;
|
||||
|
||||
case entry::E_LINK:
|
||||
prog.symlinks_found++;
|
||||
if (!debug_filter) {
|
||||
pe->scan(*os_, prog);
|
||||
}
|
||||
prog.symlinks_scanned++;
|
||||
break;
|
||||
|
||||
case entry::E_DEVICE:
|
||||
case entry::E_OTHER:
|
||||
prog.specials_found++;
|
||||
if (!debug_filter) {
|
||||
pe->scan(*os_, prog);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
LOG_ERROR << "unsupported entry type: " << int(pe->type());
|
||||
prog.errors++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return pe;
|
||||
} catch (const std::system_error& e) {
|
||||
LOG_ERROR << "error reading entry: " << e.what();
|
||||
prog.errors++;
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
std::shared_ptr<entry>
|
||||
scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
|
||||
@ -349,100 +464,10 @@ scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
auto pe = entry_->create(*os_, name, parent);
|
||||
bool exclude = false;
|
||||
|
||||
if (script_) {
|
||||
if (script_->has_filter() && !script_->filter(*pe)) {
|
||||
exclude = true;
|
||||
} else if (script_->has_transform()) {
|
||||
script_->transform(*pe);
|
||||
}
|
||||
if (auto pe = add_entry(name, parent, prog, fs, debug_filter)) {
|
||||
if (pe->type() == entry::E_DIR) {
|
||||
subdirs.push_back(pe);
|
||||
}
|
||||
|
||||
if (debug_filter) {
|
||||
(*options_.debug_filter_function)(exclude, pe.get());
|
||||
}
|
||||
|
||||
if (exclude) {
|
||||
if (!debug_filter) {
|
||||
LOG_DEBUG << "excluding " << pe->dpath();
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (pe) {
|
||||
switch (pe->type()) {
|
||||
case entry::E_FILE:
|
||||
if (os_->access(pe->path(), R_OK)) {
|
||||
LOG_ERROR << "cannot access: " << pe->path();
|
||||
prog.errors++;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
|
||||
case entry::E_DEVICE:
|
||||
if (!options_.with_devices) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
|
||||
case entry::E_OTHER:
|
||||
if (!options_.with_specials) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
parent->add(pe);
|
||||
|
||||
switch (pe->type()) {
|
||||
case entry::E_DIR:
|
||||
// prog.current.store(pe.get());
|
||||
prog.dirs_found++;
|
||||
if (!debug_filter) {
|
||||
pe->scan(*os_, prog);
|
||||
}
|
||||
subdirs.push_back(pe);
|
||||
break;
|
||||
|
||||
case entry::E_FILE:
|
||||
prog.files_found++;
|
||||
if (!debug_filter) {
|
||||
fs.scan(dynamic_cast<file*>(pe.get()));
|
||||
}
|
||||
break;
|
||||
|
||||
case entry::E_LINK:
|
||||
prog.symlinks_found++;
|
||||
if (!debug_filter) {
|
||||
pe->scan(*os_, prog);
|
||||
}
|
||||
prog.symlinks_scanned++;
|
||||
break;
|
||||
|
||||
case entry::E_DEVICE:
|
||||
case entry::E_OTHER:
|
||||
prog.specials_found++;
|
||||
if (!debug_filter) {
|
||||
pe->scan(*os_, prog);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
LOG_ERROR << "unsupported entry type: " << int(pe->type());
|
||||
prog.errors++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (const std::system_error& e) {
|
||||
LOG_ERROR << "error reading entry: " << e.what();
|
||||
prog.errors++;
|
||||
}
|
||||
}
|
||||
|
||||
@ -459,8 +484,88 @@ scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
const std::string& path, progress& prog) {
|
||||
std::shared_ptr<entry>
|
||||
scanner_<LoggerPolicy>::scan_list(const std::string& path,
|
||||
std::span<std::filesystem::path const> list,
|
||||
progress& prog, detail::file_scanner& fs) {
|
||||
if (script_ && script_->has_filter()) {
|
||||
DWARFS_THROW(runtime_error, "cannot use filters with file lists");
|
||||
}
|
||||
|
||||
auto ti = LOG_TIMED_INFO;
|
||||
|
||||
auto root = entry_->create(*os_, path);
|
||||
auto root_path = std::filesystem::path(path);
|
||||
|
||||
if (root->type() != entry::E_DIR) {
|
||||
DWARFS_THROW(runtime_error, fmt::format("'{}' must be a directory", path));
|
||||
}
|
||||
|
||||
auto ensure_path = [this, &prog, &fs](std::filesystem::path const& path,
|
||||
std::shared_ptr<entry> root) {
|
||||
for (auto const& p : path) {
|
||||
if (auto d = std::dynamic_pointer_cast<dir>(root)) {
|
||||
if (auto e = d->find(p.string())) {
|
||||
root = e;
|
||||
} else {
|
||||
root = add_entry(p.string(), d, prog, fs);
|
||||
if (root && root->type() == entry::E_DIR) {
|
||||
prog.dirs_scanned++;
|
||||
} else {
|
||||
DWARFS_THROW(runtime_error,
|
||||
fmt::format("invalid path '{}'", path.string()));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
DWARFS_THROW(runtime_error,
|
||||
fmt::format("invalid path '{}'", path.string()));
|
||||
}
|
||||
}
|
||||
|
||||
return root;
|
||||
};
|
||||
|
||||
std::unordered_map<std::string, std::shared_ptr<dir>> dir_cache;
|
||||
|
||||
for (auto const& p : list) {
|
||||
auto pp = p.parent_path();
|
||||
std::shared_ptr<dir> pd;
|
||||
|
||||
if (auto it = dir_cache.find(pp.string()); it != dir_cache.end()) {
|
||||
pd = it->second;
|
||||
} else {
|
||||
pd = std::dynamic_pointer_cast<dir>(ensure_path(pp, root));
|
||||
|
||||
if (pd) {
|
||||
dir_cache.emplace(pp.string(), pd);
|
||||
} else {
|
||||
DWARFS_THROW(runtime_error,
|
||||
fmt::format("invalid path '{}'", p.string()));
|
||||
}
|
||||
}
|
||||
|
||||
auto const& fname = p.filename().string();
|
||||
|
||||
if (auto pe = pd->find(fname)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (auto pe = add_entry(fname, pd, prog, fs)) {
|
||||
if (pe->type() == entry::E_DIR) {
|
||||
prog.dirs_scanned++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ti << "scanned input list";
|
||||
|
||||
return root;
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void scanner_<LoggerPolicy>::scan(
|
||||
filesystem_writer& fsw, const std::string& path, progress& prog,
|
||||
std::optional<std::span<std::filesystem::path const>> list) {
|
||||
if (!options_.debug_filter_function) {
|
||||
LOG_INFO << "scanning " << path;
|
||||
}
|
||||
@ -471,7 +576,8 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
detail::file_scanner fs(wg_, *os_, im, options_.inode,
|
||||
options_.file_hash_algorithm, prog);
|
||||
|
||||
auto root = scan_tree(path, prog, fs);
|
||||
auto root =
|
||||
list ? scan_list(path, *list, prog, fs) : scan_tree(path, prog, fs);
|
||||
|
||||
if (options_.debug_filter_function) {
|
||||
return;
|
||||
|
@ -372,7 +372,7 @@ int mkdwarfs(int argc, char** argv) {
|
||||
std::string path, output, memory_limit, script_arg, compression, header,
|
||||
schema_compression, metadata_compression, log_level_str, timestamp,
|
||||
time_resolution, order, progress_mode, recompress_opts, pack_metadata,
|
||||
file_hash_algo, debug_filter, max_similarity_size;
|
||||
file_hash_algo, debug_filter, max_similarity_size, input_list_str;
|
||||
std::vector<std::string> filter;
|
||||
size_t num_workers, num_scanner_workers;
|
||||
bool no_progress = false, remove_header = false, no_section_index = false,
|
||||
@ -413,6 +413,9 @@ int mkdwarfs(int argc, char** argv) {
|
||||
("force,f",
|
||||
po::value<bool>(&force_overwrite)->zero_tokens(),
|
||||
"force overwrite of existing output image")
|
||||
("input-list",
|
||||
po::value<std::string>(&input_list_str),
|
||||
"file containing list of paths relative to root directory")
|
||||
("compress-level,l",
|
||||
po::value<unsigned>(&level)->default_value(default_level),
|
||||
"compression level (0=fast, 9=best, please see man page for details)")
|
||||
@ -548,7 +551,7 @@ int mkdwarfs(int argc, char** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (vm.count("help") or !vm.count("input") or
|
||||
if (vm.count("help") or !(vm.count("input") or vm.count("input-list")) or
|
||||
(!vm.count("output") and !vm.count("debug-filter"))) {
|
||||
size_t l_dc = 0, l_sc = 0, l_mc = 0, l_or = 0;
|
||||
for (auto const& l : levels) {
|
||||
@ -643,6 +646,42 @@ int mkdwarfs(int argc, char** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::optional<std::vector<std::filesystem::path>> input_list;
|
||||
|
||||
if (vm.count("input-list")) {
|
||||
if (vm.count("filter")) {
|
||||
std::cerr << "error: cannot use --input-list and --filter\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!vm.count("input")) {
|
||||
path = std::filesystem::current_path().string();
|
||||
}
|
||||
|
||||
std::unique_ptr<std::ifstream> ifs;
|
||||
std::istream* is;
|
||||
|
||||
if (input_list_str == "-") {
|
||||
is = &std::cin;
|
||||
} else {
|
||||
ifs = std::make_unique<std::ifstream>(input_list_str);
|
||||
|
||||
if (!ifs->is_open()) {
|
||||
throw std::runtime_error(
|
||||
fmt::format("error opening file: {}", input_list_str));
|
||||
}
|
||||
|
||||
is = ifs.get();
|
||||
}
|
||||
|
||||
std::string line;
|
||||
input_list.emplace();
|
||||
|
||||
while (std::getline(*is, line)) {
|
||||
input_list->emplace_back(line);
|
||||
}
|
||||
}
|
||||
|
||||
bool recompress = vm.count("recompress");
|
||||
rewrite_options rw_opts;
|
||||
if (recompress) {
|
||||
@ -1011,7 +1050,11 @@ int mkdwarfs(int argc, char** argv) {
|
||||
std::make_shared<os_access_posix>(), std::move(script),
|
||||
options);
|
||||
|
||||
s.scan(fsw, path, prog);
|
||||
if (input_list) {
|
||||
s.scan(fsw, path, prog, *input_list);
|
||||
} else {
|
||||
s.scan(fsw, path, prog);
|
||||
}
|
||||
}
|
||||
} catch (runtime_error const& e) {
|
||||
LOG_ERROR << e.what();
|
||||
|
Loading…
x
Reference in New Issue
Block a user