mirror of
https://github.com/mhx/dwarfs.git
synced 2025-08-03 09:47:01 -04:00
feat: support extracting only files matching glob patterns (fixes gh #243)
This commit is contained in:
parent
853d8ca966
commit
2ff7602192
@ -44,6 +44,17 @@ to disk:
|
||||
case the default is to extract the files to the current directory, or
|
||||
to write the archive data to stdout.
|
||||
|
||||
- `--pattern=`*glob-pattern*:
|
||||
If specified, only extract entries matching the pattern. Can be specified
|
||||
multiple times, in which case all files matching one or more patterns will
|
||||
be extracted. Patterns can also be given as positional arguments, i.e. without `--pattern`, for simplicity.
|
||||
If prefixed with `i:`, the glob pattern match will be case-insensitive,
|
||||
e.g. `i:**/*.txt` would match all `.txt`, `.TXT` and `.tXt` files. If you
|
||||
want the *actual* pattern to start with `i:`, you have to prefix the
|
||||
pattern with `:`, i.e. `:i:...`. The glob patterns support the wildcards
|
||||
`*` and `?`, character classes (`[avt]`), ranges (`[a-h]`), complementation
|
||||
(`[!a-h]`) and globstar (`**`).
|
||||
|
||||
- `-O`, `--image-offset=`*value*|`auto`:
|
||||
Specify the byte offset at which the filesystem is located in the image.
|
||||
Use `auto` to detect the offset automatically. This is also the default.
|
||||
|
@ -30,6 +30,7 @@
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
class glob_matcher;
|
||||
class library_dependencies;
|
||||
class logger;
|
||||
class os_access;
|
||||
@ -72,7 +73,13 @@ class filesystem_extractor {
|
||||
// Extract all entries of `fs` without any pattern filtering.
//
// Convenience overload: forwards to the implementation with a null matcher.
//
// fs    filesystem to extract from
// opts  extraction options; default-constructed when omitted
//
// Returns whatever the implementation's extract() returns.
bool extract(reader::filesystem_v2 const& fs,
             filesystem_extractor_options const& opts =
                 filesystem_extractor_options()) {
  // impl::extract takes (fs, matcher, opts); the old two-argument call no
  // longer matches that signature, so pass an explicit null matcher.
  return impl_->extract(fs, nullptr, opts);
}
|
||||
|
||||
// Extract entries of `fs`, optionally restricted by a glob matcher.
//
// fs       filesystem to extract from
// matcher  non-owning pointer to the pattern matcher; may be null, in which
//          case the call is forwarded with a null matcher as well
// opts     extraction options; default-constructed when omitted
//
// Returns whatever the implementation's extract() returns.
bool extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher,
             filesystem_extractor_options const& opts =
                 filesystem_extractor_options()) {
  auto& backend = *impl_;
  return backend.extract(fs, matcher, opts);
}
|
||||
|
||||
class impl {
|
||||
@ -84,8 +91,9 @@ class filesystem_extractor {
|
||||
virtual void open_stream(std::ostream& os, std::string const& format) = 0;
|
||||
virtual void open_disk(std::filesystem::path const& output) = 0;
|
||||
virtual void close() = 0;
|
||||
virtual bool extract(reader::filesystem_v2 const& fs,
|
||||
filesystem_extractor_options const& opts) = 0;
|
||||
virtual bool
|
||||
extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher,
|
||||
filesystem_extractor_options const& opts) = 0;
|
||||
};
|
||||
|
||||
private:
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <unordered_set>
|
||||
|
||||
// This is required to avoid Windows.h being pulled in by libarchive
|
||||
// and polluting our environment with its macros and other global definitions.
|
||||
@ -41,6 +42,7 @@
|
||||
|
||||
#include <dwarfs/file_stat.h>
|
||||
#include <dwarfs/fstypes.h>
|
||||
#include <dwarfs/glob_matcher.h>
|
||||
#include <dwarfs/library_dependencies.h>
|
||||
#include <dwarfs/logger.h>
|
||||
#include <dwarfs/os_access.h>
|
||||
@ -189,7 +191,7 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {
|
||||
}
|
||||
}
|
||||
|
||||
bool extract(reader::filesystem_v2 const& fs,
|
||||
bool extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher,
|
||||
filesystem_extractor_options const& opts) override;
|
||||
|
||||
private:
|
||||
@ -249,7 +251,8 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
bool filesystem_extractor_<LoggerPolicy>::extract(
|
||||
reader::filesystem_v2 const& fs, filesystem_extractor_options const& opts) {
|
||||
reader::filesystem_v2 const& fs, glob_matcher const* matcher,
|
||||
filesystem_extractor_options const& opts) {
|
||||
DWARFS_CHECK(a_, "filesystem not opened");
|
||||
|
||||
auto lr = ::archive_entry_linkresolver_new();
|
||||
@ -351,6 +354,23 @@ bool filesystem_extractor_<LoggerPolicy>::extract(
|
||||
}
|
||||
};
|
||||
|
||||
std::unordered_set<std::filesystem::path> matched_dirs;
|
||||
|
||||
if (matcher) {
|
||||
fs.walk([&](auto entry) {
|
||||
if (!entry.inode().is_directory()) {
|
||||
if (matcher->match(entry.unix_path())) {
|
||||
while (auto parent = entry.parent()) {
|
||||
if (!matched_dirs.insert(parent->fs_path()).second) {
|
||||
break;
|
||||
}
|
||||
entry = *parent;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
fs.walk_data_order([&](auto entry) {
|
||||
// TODO: we can surely early abort walk() somehow
|
||||
if (entry.is_root() || hard_error) {
|
||||
@ -359,6 +379,23 @@ bool filesystem_extractor_<LoggerPolicy>::extract(
|
||||
|
||||
auto inode = entry.inode();
|
||||
|
||||
if (matcher) {
|
||||
LOG_TRACE << "checking " << entry.unix_path();
|
||||
if (inode.is_directory()) {
|
||||
if (!matched_dirs.contains(entry.fs_path())) {
|
||||
LOG_TRACE << "skipping directory " << entry.fs_path();
|
||||
// no need to extract this directory
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
if (!matcher->match(entry.unix_path())) {
|
||||
LOG_TRACE << "skipping " << entry.fs_path();
|
||||
// no match, skip this entry
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto ae = ::archive_entry_new();
|
||||
auto stbuf = fs.getattr(inode);
|
||||
|
||||
|
@ -2000,6 +2000,37 @@ TEST(dwarfsextract_test, mtree) {
|
||||
EXPECT_THAT(out, ::testing::HasSubstr("type=file"));
|
||||
}
|
||||
|
||||
// Checks that dwarfsextract, when given glob patterns as positional
// arguments, only emits the matching entries together with the directories
// that lead to them.
TEST(dwarfsextract_test, patterns) {
  // Build an image from the test file tree. `--with-devices` is passed and
  // ./dev/tty37 is among the entries expected below.
  auto mkdt = mkdwarfs_tester::create_empty();
  mkdt.add_test_file_tree();
  // Compare the exit code itself (not `code != 0`) so that a failure reports
  // the actual code, consistent with the dwarfsextract assertion below.
  ASSERT_EQ(0, mkdt.run({"-i", "/", "-o", "-", "--with-devices"}))
      << mkdt.err();

  // Two positional patterns: one using globstar, one combining brace
  // alternatives with a character class built from two ranges.
  auto t = dwarfsextract_tester::create_with_image(mkdt.out());
  ASSERT_EQ(0, t.run({"-i", "image.dwarfs", "-f", "mtree", "**/*.enc",
                      "{dev,etc,lib,var}/[m-ot-z]*"}))
      << t.err();

  auto out = t.out();
  EXPECT_TRUE(out.starts_with("#mtree")) << out;

  // Matched files plus each of their ancestor directories, in the order in
  // which parse_mtree yields them.
  std::vector<std::string> const expected{
      "./dev",
      "./dev/tty37",
      "./etc",
      "./etc/netconfig",
      "./usr",
      "./usr/lib64",
      "./usr/lib64/tcl8.6",
      "./usr/lib64/tcl8.6/encoding",
      "./usr/lib64/tcl8.6/encoding/cp950.enc",
      "./usr/lib64/tcl8.6/encoding/iso8859-8.enc",
  };

  auto mtree = test::parse_mtree(out);
  std::vector<std::string> actual;
  for (auto const& entry : mtree) {
    actual.push_back(entry.first);
  }
  EXPECT_EQ(expected, actual);
}
|
||||
|
||||
TEST(dwarfsextract_test, stdout_progress_error) {
|
||||
auto t = dwarfsextract_tester::create_with_image();
|
||||
EXPECT_NE(0,
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include <dwarfs/config.h>
|
||||
#include <dwarfs/glob_matcher.h>
|
||||
#include <dwarfs/library_dependencies.h>
|
||||
#include <dwarfs/logger.h>
|
||||
#include <dwarfs/mmap.h>
|
||||
@ -77,6 +78,9 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
|
||||
("output,o",
|
||||
po_sys_value<sys_string>(&output),
|
||||
"output file or directory")
|
||||
("pattern",
|
||||
po::value<std::vector<std::string>>(),
|
||||
"only extract files matching these patterns")
|
||||
("image-offset,O",
|
||||
po::value<std::string>(&image_offset)->default_value("auto"),
|
||||
"filesystem image offset in bytes")
|
||||
@ -111,10 +115,17 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
|
||||
|
||||
tool::add_common_options(opts, logopts);
|
||||
|
||||
po::positional_options_description pos;
|
||||
pos.add("pattern", -1);
|
||||
|
||||
po::variables_map vm;
|
||||
|
||||
try {
|
||||
po::store(po::parse_command_line(argc, argv, opts), vm);
|
||||
po::store(po::basic_command_line_parser<sys_char>(argc, argv)
|
||||
.options(opts)
|
||||
.positional(pos)
|
||||
.run(),
|
||||
vm);
|
||||
po::notify(vm);
|
||||
} catch (po::error const& e) {
|
||||
iol.err << "error: " << e.what() << "\n";
|
||||
@ -141,6 +152,13 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::unique_ptr<glob_matcher> matcher;
|
||||
|
||||
if (vm.count("pattern")) {
|
||||
matcher = std::make_unique<glob_matcher>(
|
||||
vm["pattern"].as<std::vector<std::string>>());
|
||||
}
|
||||
|
||||
int rv = 0;
|
||||
|
||||
try {
|
||||
@ -214,7 +232,7 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
|
||||
};
|
||||
}
|
||||
|
||||
rv = fsx.extract(fs, fsx_opts) ? 0 : 2;
|
||||
rv = fsx.extract(fs, matcher.get(), fsx_opts) ? 0 : 2;
|
||||
|
||||
fsx.close();
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user