feat: support extracting only files matching glob patterns (fixes gh #243)

This commit is contained in:
Marcus Holland-Moritz 2024-11-16 20:14:04 +01:00
parent 853d8ca966
commit 2ff7602192
5 changed files with 112 additions and 7 deletions

View File

@ -44,6 +44,17 @@ to disk:
case the default is to extract the files to the current directory, or case the default is to extract the files to the current directory, or
to write the archive data to stdout. to write the archive data to stdout.
- `--pattern=`*glob-pattern*:
If specified, only extract entries matching the pattern. Can be specified
multiple times, in which case all files matching one or more patterns will
be extracted. Patterns can also be given as positional arguments, i.e. without `--pattern`, for simplicity.
If prefixed with `i:`, the glob pattern match will be case-insensitive,
e.g. `i:**/*.txt` would match all `.txt`, `.TXT` and `.tXt` files. If you
want the *actual* pattern to start with `i:`, you have to prefix the
pattern with `:`, i.e. `:i:...`. The glob patterns support the wildcards
`*` and `?`, character classes (`[avt]`), ranges (`[a-h]`), complementation
(`[!a-h]`), globstar (`**`) and brace alternation (`{dev,etc}`).
- `-O`, `--image-offset=`*value*|`auto`: - `-O`, `--image-offset=`*value*|`auto`:
Specify the byte offset at which the filesystem is located in the image. Specify the byte offset at which the filesystem is located in the image.
Use `auto` to detect the offset automatically. This is also the default. Use `auto` to detect the offset automatically. This is also the default.

View File

@ -30,6 +30,7 @@
namespace dwarfs { namespace dwarfs {
class glob_matcher;
class library_dependencies; class library_dependencies;
class logger; class logger;
class os_access; class os_access;
@ -72,7 +73,13 @@ class filesystem_extractor {
bool extract(reader::filesystem_v2 const& fs, bool extract(reader::filesystem_v2 const& fs,
filesystem_extractor_options const& opts = filesystem_extractor_options const& opts =
filesystem_extractor_options()) { filesystem_extractor_options()) {
return impl_->extract(fs, opts); return impl_->extract(fs, nullptr, opts);
}
bool extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher,
filesystem_extractor_options const& opts =
filesystem_extractor_options()) {
return impl_->extract(fs, matcher, opts);
} }
class impl { class impl {
@ -84,8 +91,9 @@ class filesystem_extractor {
virtual void open_stream(std::ostream& os, std::string const& format) = 0; virtual void open_stream(std::ostream& os, std::string const& format) = 0;
virtual void open_disk(std::filesystem::path const& output) = 0; virtual void open_disk(std::filesystem::path const& output) = 0;
virtual void close() = 0; virtual void close() = 0;
virtual bool extract(reader::filesystem_v2 const& fs, virtual bool
filesystem_extractor_options const& opts) = 0; extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher,
filesystem_extractor_options const& opts) = 0;
}; };
private: private:

View File

@ -24,6 +24,7 @@
#include <memory> #include <memory>
#include <mutex> #include <mutex>
#include <thread> #include <thread>
#include <unordered_set>
// This is required to avoid Windows.h being pulled in by libarchive // This is required to avoid Windows.h being pulled in by libarchive
// and polluting our environment with all sorts of shit. // and polluting our environment with all sorts of shit.
@ -41,6 +42,7 @@
#include <dwarfs/file_stat.h> #include <dwarfs/file_stat.h>
#include <dwarfs/fstypes.h> #include <dwarfs/fstypes.h>
#include <dwarfs/glob_matcher.h>
#include <dwarfs/library_dependencies.h> #include <dwarfs/library_dependencies.h>
#include <dwarfs/logger.h> #include <dwarfs/logger.h>
#include <dwarfs/os_access.h> #include <dwarfs/os_access.h>
@ -189,7 +191,7 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {
} }
} }
bool extract(reader::filesystem_v2 const& fs, bool extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher,
filesystem_extractor_options const& opts) override; filesystem_extractor_options const& opts) override;
private: private:
@ -249,7 +251,8 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {
template <typename LoggerPolicy> template <typename LoggerPolicy>
bool filesystem_extractor_<LoggerPolicy>::extract( bool filesystem_extractor_<LoggerPolicy>::extract(
reader::filesystem_v2 const& fs, filesystem_extractor_options const& opts) { reader::filesystem_v2 const& fs, glob_matcher const* matcher,
filesystem_extractor_options const& opts) {
DWARFS_CHECK(a_, "filesystem not opened"); DWARFS_CHECK(a_, "filesystem not opened");
auto lr = ::archive_entry_linkresolver_new(); auto lr = ::archive_entry_linkresolver_new();
@ -351,6 +354,23 @@ bool filesystem_extractor_<LoggerPolicy>::extract(
} }
}; };
// Paths of all directories that contain, directly or further down, at least
// one entry accepted by `matcher`. Stays empty when no matcher is given.
std::unordered_set<std::filesystem::path> matched_dirs;

if (matcher) {
  fs.walk([&](auto entry) {
    // The matcher is only ever applied to non-directory entries; directories
    // are recorded purely as ancestors of a matching entry.
    if (entry.inode().is_directory() || !matcher->match(entry.unix_path())) {
      return;
    }

    // Climb towards the root, recording each ancestor on the way. Stop when
    // there is no parent left or when an ancestor has already been recorded
    // by an earlier match (its own ancestors were recorded back then).
    for (;;) {
      auto parent = entry.parent();
      if (!parent || !matched_dirs.insert(parent->fs_path()).second) {
        break;
      }
      entry = *parent;
    }
  });
}
fs.walk_data_order([&](auto entry) { fs.walk_data_order([&](auto entry) {
// TODO: we can surely early abort walk() somehow // TODO: we can surely early abort walk() somehow
if (entry.is_root() || hard_error) { if (entry.is_root() || hard_error) {
@ -359,6 +379,23 @@ bool filesystem_extractor_<LoggerPolicy>::extract(
auto inode = entry.inode(); auto inode = entry.inode();
// With a matcher present, decide whether this entry is extracted at all:
// directories must be listed in `matched_dirs`, everything else must match
// one of the glob patterns itself.
if (matcher) {
  LOG_TRACE << "checking " << entry.unix_path();

  auto const is_dir = inode.is_directory();

  if (is_dir && !matched_dirs.contains(entry.fs_path())) {
    // no need to extract this directory
    LOG_TRACE << "skipping directory " << entry.fs_path();
    return;
  }

  if (!is_dir && !matcher->match(entry.unix_path())) {
    // no match, skip this entry
    LOG_TRACE << "skipping " << entry.fs_path();
    return;
  }
}
auto ae = ::archive_entry_new(); auto ae = ::archive_entry_new();
auto stbuf = fs.getattr(inode); auto stbuf = fs.getattr(inode);

View File

@ -2000,6 +2000,37 @@ TEST(dwarfsextract_test, mtree) {
EXPECT_THAT(out, ::testing::HasSubstr("type=file")); EXPECT_THAT(out, ::testing::HasSubstr("type=file"));
} }
// Checks pattern-based extraction: an image is built from the test file tree,
// then extracted as mtree with two glob patterns passed as positional
// arguments. Only the matching entries and the directories leading to them
// are expected in the output.
TEST(dwarfsextract_test, patterns) {
  auto mkdt = mkdwarfs_tester::create_empty();
  mkdt.add_test_file_tree();

  // Compare the exit code itself (not `exit_code != 0`) so that a failure
  // reports the real value, consistent with the extractor check below.
  // NOTE(review): `--with-devices` is presumably needed so that `dev/tty37`
  // ends up in the image -- confirm against the test file tree.
  ASSERT_EQ(0, mkdt.run({"-i", "/", "-o", "-", "--with-devices"}))
      << mkdt.err();

  auto t = dwarfsextract_tester::create_with_image(mkdt.out());
  ASSERT_EQ(0, t.run({"-i", "image.dwarfs", "-f", "mtree", "**/*.enc",
                      "{dev,etc,lib,var}/[m-ot-z]*"}))
      << t.err();

  auto out = t.out();
  EXPECT_TRUE(out.starts_with("#mtree")) << out;

  // Matching entries plus every directory on the path to them.
  std::vector<std::string> const expected{
      "./dev",
      "./dev/tty37",
      "./etc",
      "./etc/netconfig",
      "./usr",
      "./usr/lib64",
      "./usr/lib64/tcl8.6",
      "./usr/lib64/tcl8.6/encoding",
      "./usr/lib64/tcl8.6/encoding/cp950.enc",
      "./usr/lib64/tcl8.6/encoding/iso8859-8.enc",
  };

  // Only the entry paths are compared; the remaining mtree attributes are
  // ignored here.
  auto mtree = test::parse_mtree(out);
  std::vector<std::string> actual;
  for (auto const& entry : mtree) {
    actual.push_back(entry.first);
  }

  EXPECT_EQ(expected, actual);
}
TEST(dwarfsextract_test, stdout_progress_error) { TEST(dwarfsextract_test, stdout_progress_error) {
auto t = dwarfsextract_tester::create_with_image(); auto t = dwarfsextract_tester::create_with_image();
EXPECT_NE(0, EXPECT_NE(0,

View File

@ -27,6 +27,7 @@
#include <boost/program_options.hpp> #include <boost/program_options.hpp>
#include <dwarfs/config.h> #include <dwarfs/config.h>
#include <dwarfs/glob_matcher.h>
#include <dwarfs/library_dependencies.h> #include <dwarfs/library_dependencies.h>
#include <dwarfs/logger.h> #include <dwarfs/logger.h>
#include <dwarfs/mmap.h> #include <dwarfs/mmap.h>
@ -77,6 +78,9 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
("output,o", ("output,o",
po_sys_value<sys_string>(&output), po_sys_value<sys_string>(&output),
"output file or directory") "output file or directory")
("pattern",
po::value<std::vector<std::string>>(),
"only extract files matching these patterns")
("image-offset,O", ("image-offset,O",
po::value<std::string>(&image_offset)->default_value("auto"), po::value<std::string>(&image_offset)->default_value("auto"),
"filesystem image offset in bytes") "filesystem image offset in bytes")
@ -111,10 +115,17 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
tool::add_common_options(opts, logopts); tool::add_common_options(opts, logopts);
po::positional_options_description pos;
pos.add("pattern", -1);
po::variables_map vm; po::variables_map vm;
try { try {
po::store(po::parse_command_line(argc, argv, opts), vm); po::store(po::basic_command_line_parser<sys_char>(argc, argv)
.options(opts)
.positional(pos)
.run(),
vm);
po::notify(vm); po::notify(vm);
} catch (po::error const& e) { } catch (po::error const& e) {
iol.err << "error: " << e.what() << "\n"; iol.err << "error: " << e.what() << "\n";
@ -141,6 +152,13 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
return 0; return 0;
} }
// A matcher is only created when at least one pattern was supplied (through
// `--pattern` or as a positional argument); otherwise the pointer stays null.
std::unique_ptr<glob_matcher> matcher;

if (vm.count("pattern") > 0) {
  auto const& patterns = vm["pattern"].as<std::vector<std::string>>();
  matcher = std::make_unique<glob_matcher>(patterns);
}
int rv = 0; int rv = 0;
try { try {
@ -214,7 +232,7 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
}; };
} }
rv = fsx.extract(fs, fsx_opts) ? 0 : 2; rv = fsx.extract(fs, matcher.get(), fsx_opts) ? 0 : 2;
fsx.close(); fsx.close();