Support extracting corrupted file systems (fixes github #51)

This adds two new options `--continue-on-error` and
`--disable-integrity-check` to `dwarfsextract` that allow
extracting data from corrupted images.
This commit is contained in:
Marcus Holland-Moritz 2022-11-08 11:56:10 +01:00
parent 8550c47873
commit a326f533d5
7 changed files with 89 additions and 26 deletions

View File

@ -56,6 +56,18 @@ to disk:
if no output directory is specified). For a full list of supported formats,
see libarchive-formats(5).
- `--continue-on-error`:
Try to continue with extraction even when errors are encountered. This
only applies to errors when reading from the file system image. Errors
when writing the extracted files will still be fatal. If any errors were
skipped this way, `dwarfsextract` exits with status 2 after the extraction
has finished.
- `--disable-integrity-check`:
This option disables all block integrity checks on the file system data.
There is a non-zero chance that this allows further data to be read from
corrupted file systems. However, there's also a non-zero chance that it
will completely crash the program. So please don't use this unless you
know what you're doing.
- `-n`, `--num-workers=`*value*:
Number of worker threads used for extracting the filesystem.

View File

@ -30,6 +30,11 @@ namespace dwarfs {
class filesystem_v2;
class logger;
/// Settings accepted by filesystem_extractor::extract().
struct filesystem_extractor_options {
  /// Byte budget for data queued between reading and archiving. The
  /// extractor seeds its semaphore with this value and never requests a
  /// single read chunk larger than it.
  size_t max_queued_bytes = 4096;

  /// When set, failures other than archive (write-side) errors are logged
  /// as warnings and counted instead of aborting the extraction; extract()
  /// then returns false if any such failure was counted.
  bool continue_on_error = false;
};
class filesystem_extractor {
public:
filesystem_extractor(logger& lgr);
@ -46,8 +51,10 @@ class filesystem_extractor {
void close() { return impl_->close(); }
void extract(filesystem_v2 const& fs, size_t max_queued_bytes) {
return impl_->extract(fs, max_queued_bytes);
bool
extract(filesystem_v2 const& fs, filesystem_extractor_options const& opts =
filesystem_extractor_options()) {
return impl_->extract(fs, opts);
}
class impl {
@ -59,7 +66,8 @@ class filesystem_extractor {
virtual void open_stream(std::ostream& os, std::string const& format) = 0;
virtual void open_disk(std::string const& output) = 0;
virtual void close() = 0;
virtual void extract(filesystem_v2 const& fs, size_t max_queued_bytes) = 0;
virtual bool extract(filesystem_v2 const& fs,
filesystem_extractor_options const& opts) = 0;
};
private:

View File

@ -43,6 +43,7 @@ struct block_cache_options {
double decompress_ratio{1.0};
bool mm_release{true};
bool init_workers{true};
bool disable_block_integrity_check{false};
};
struct cache_tidy_config {

View File

@ -56,7 +56,7 @@ namespace dwarfs {
class cached_block {
public:
cached_block(logger& lgr, fs_section const& b, std::shared_ptr<mmif> mm,
bool release)
bool release, bool disable_integrity_check)
: decompressor_(std::make_unique<block_decompressor>(
b.compression(), mm->as<uint8_t>(b.start()), b.length(), data_))
, mm_(std::move(mm))
@ -64,7 +64,7 @@ class cached_block {
, LOG_PROXY_INIT(lgr)
, release_(release)
, uncompressed_size_{decompressor_->uncompressed_size()} {
if (!section_.check_fast(*mm_)) {
if (!disable_integrity_check && !section_.check_fast(*mm_)) {
DWARFS_THROW(runtime_error, "block data integrity check failed");
}
}
@ -483,7 +483,7 @@ class block_cache_ final : public block_cache::impl {
auto block = std::make_shared<cached_block>(
LOG_GET_LOGGER, DWARFS_NOTHROW(block_.at(block_no)), mm_,
options_.mm_release);
options_.mm_release, options_.disable_block_integrity_check);
++blocks_created_;
// Make a new set for the block

View File

@ -72,6 +72,11 @@ class cache_semaphore {
int64_t size_{0};
};
// Exception type for failures reported by libarchive (thrown when a call
// yields ARCHIVE_RETRY or ARCHIVE_FATAL). It is a distinct type so that the
// extraction jobs can catch it separately and always treat it as a hard
// error, independent of the continue_on_error option.
struct archive_error : std::runtime_error {
  // Reuse the message-taking constructors of std::runtime_error.
  using std::runtime_error::runtime_error;
};
} // namespace
template <typename LoggerPolicy>
@ -146,7 +151,8 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {
closefd(pipefd_[0]);
}
void extract(filesystem_v2 const& fs, size_t max_queued_bytes) override;
bool extract(filesystem_v2 const& fs,
filesystem_extractor_options const& opts) override;
private:
void closefd(int& fd) {
@ -189,7 +195,7 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {
break;
case ARCHIVE_RETRY:
case ARCHIVE_FATAL:
DWARFS_THROW(runtime_error, std::string(archive_error_string(a_)));
throw archive_error(std::string(archive_error_string(a_)));
}
}
@ -200,8 +206,8 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {
};
template <typename LoggerPolicy>
void filesystem_extractor_<LoggerPolicy>::extract(filesystem_v2 const& fs,
size_t max_queued_bytes) {
bool filesystem_extractor_<LoggerPolicy>::extract(
filesystem_v2 const& fs, filesystem_extractor_options const& opts) {
DWARFS_CHECK(a_, "filesystem not opened");
auto lr = ::archive_entry_linkresolver_new();
@ -217,11 +223,13 @@ void filesystem_extractor_<LoggerPolicy>::extract(filesystem_v2 const& fs,
worker_group archiver("archiver", 1);
cache_semaphore sem;
LOG_DEBUG << "extractor semaphore size: " << max_queued_bytes << " bytes";
LOG_DEBUG << "extractor semaphore size: " << opts.max_queued_bytes
<< " bytes";
sem.post(max_queued_bytes);
sem.post(opts.max_queued_bytes);
std::atomic<bool> abort{false};
std::atomic<size_t> hard_error{0};
std::atomic<size_t> soft_error{0};
auto do_archive = [&](::archive_entry* ae,
inode_view entry) { // TODO: inode vs. entry
@ -232,14 +240,16 @@ void filesystem_extractor_<LoggerPolicy>::extract(filesystem_v2 const& fs,
size_t pos = 0;
size_t remain = size;
while (remain > 0 && !abort) {
size_t bs = remain < max_queued_bytes ? remain : max_queued_bytes;
while (remain > 0 && hard_error == 0) {
size_t bs =
remain < opts.max_queued_bytes ? remain : opts.max_queued_bytes;
sem.wait(bs);
if (auto ranges = fs.readv(fd, bs, pos)) {
archiver.add_job([this, &sem, &abort, ranges = std::move(*ranges), ae,
pos, remain, bs, size]() mutable {
archiver.add_job([this, &sem, &hard_error, &soft_error, &opts,
ranges = std::move(*ranges), ae, pos, remain, bs,
size]() mutable {
try {
if (pos == 0) {
LOG_DEBUG << "extracting " << ::archive_entry_pathname(ae)
@ -256,9 +266,17 @@ void filesystem_extractor_<LoggerPolicy>::extract(filesystem_v2 const& fs,
archive_entry_free(ae);
}
sem.post(bs);
} catch (archive_error const& e) {
LOG_ERROR << folly::exceptionStr(e);
++hard_error;
} catch (...) {
LOG_ERROR << folly::exceptionStr(std::current_exception());
abort = true;
if (opts.continue_on_error) {
LOG_WARN << folly::exceptionStr(std::current_exception());
++soft_error;
} else {
LOG_ERROR << folly::exceptionStr(std::current_exception());
++hard_error;
}
archive_entry_free(ae);
}
});
@ -273,13 +291,13 @@ void filesystem_extractor_<LoggerPolicy>::extract(filesystem_v2 const& fs,
remain -= bs;
}
} else {
archiver.add_job([this, ae, &abort] {
archiver.add_job([this, ae, &hard_error] {
SCOPE_EXIT { ::archive_entry_free(ae); };
try {
check_result(::archive_write_header(a_, ae));
} catch (...) {
LOG_ERROR << folly::exceptionStr(std::current_exception());
abort = true;
hard_error = true;
}
});
}
@ -287,7 +305,7 @@ void filesystem_extractor_<LoggerPolicy>::extract(filesystem_v2 const& fs,
fs.walk_data_order([&](auto entry) {
// TODO: we can surely early abort walk() somehow
if (entry.is_root() || abort) {
if (entry.is_root() || hard_error) {
return;
}
@ -329,7 +347,7 @@ void filesystem_extractor_<LoggerPolicy>::extract(filesystem_v2 const& fs,
archiver.wait();
if (abort) {
if (hard_error) {
DWARFS_THROW(runtime_error, "extraction aborted");
}
@ -340,6 +358,15 @@ void filesystem_extractor_<LoggerPolicy>::extract(filesystem_v2 const& fs,
if (ae) {
DWARFS_THROW(runtime_error, "unexpected deferred entry");
}
if (soft_error > 0) {
LOG_ERROR << "extraction finished with " << soft_error << " error(s)";
return false;
}
LOG_INFO << "extraction finished without errors";
return true;
}
filesystem_extractor::filesystem_extractor(logger& lgr)

View File

@ -45,6 +45,7 @@ int dwarfsextract(int argc, char** argv) {
std::string filesystem, output, format, cache_size_str, log_level,
image_offset;
size_t num_workers;
bool continue_on_error{false}, disable_integrity_check{false};
// clang-format off
po::options_description opts("Command line options");
@ -61,6 +62,12 @@ int dwarfsextract(int argc, char** argv) {
("format,f",
po::value<std::string>(&format),
"output format")
("continue-on-error",
po::value<bool>(&continue_on_error)->zero_tokens(),
"continue if errors are encountered")
("disable-integrity-check",
po::value<bool>(&disable_integrity_check)->zero_tokens(),
"disable file system image block integrity check (dangerous)")
("num-workers,n",
po::value<size_t>(&num_workers)->default_value(4),
"number of worker threads")
@ -89,6 +96,8 @@ int dwarfsextract(int argc, char** argv) {
return 0;
}
int rv = 0;
try {
auto level = logger::parse_level(log_level);
stream_logger lgr(std::cerr, level, level >= logger::DEBUG);
@ -103,6 +112,7 @@ int dwarfsextract(int argc, char** argv) {
fsopts.block_cache.max_bytes = parse_size_with_unit(cache_size_str);
fsopts.block_cache.num_workers = num_workers;
fsopts.block_cache.disable_block_integrity_check = disable_integrity_check;
fsopts.metadata.enable_nlink = true;
filesystem_v2 fs(lgr, std::make_shared<mmap>(filesystem), fsopts);
@ -117,7 +127,12 @@ int dwarfsextract(int argc, char** argv) {
fsx.open_archive(output, format);
}
fsx.extract(fs, fsopts.block_cache.max_bytes);
filesystem_extractor_options fsx_opts;
fsx_opts.max_queued_bytes = fsopts.block_cache.max_bytes;
fsx_opts.continue_on_error = continue_on_error;
rv = fsx.extract(fs, fsx_opts) ? 0 : 2;
fsx.close();
} catch (runtime_error const& e) {
@ -131,7 +146,7 @@ int dwarfsextract(int argc, char** argv) {
return 1;
}
return 0;
return rv;
}
} // namespace

View File

@ -972,7 +972,7 @@ void check_compat(logger& lgr, filesystem_v2 const& fs,
std::ostringstream oss;
EXPECT_NO_THROW(ext.open_stream(oss, "mtree"));
EXPECT_NO_THROW(ext.extract(fs, 4096));
EXPECT_NO_THROW(ext.extract(fs));
EXPECT_NO_THROW(ext.close());
std::istringstream iss(oss.str());