From a326f533d53b397cd00f9b064fc7ec9a7537327d Mon Sep 17 00:00:00 2001
From: Marcus Holland-Moritz
Date: Tue, 8 Nov 2022 11:56:10 +0100
Subject: [PATCH] Support extracting corrupted file systems (fixes github #51)

This adds two new options `--continue-on-error` and
`--disable-integrity-check` to `dwarfsextract` that allow extracting data
from corrupted images.
---
Review notes (ignored when the patch is applied):
 * A data-chunk archiver job that throws used to skip `sem.post(bs)`. With
   `--continue-on-error` the reader keeps calling `sem.wait(bs)`, so every
   failed chunk drained the semaphore and extraction could block forever.
   Both catch handlers now hand the credit back.
 * `hard_error` is a counter; the header-only job now increments it instead
   of assigning `true`.

 doc/dwarfsextract.md                  | 12 ++++++
 include/dwarfs/filesystem_extractor.h | 14 ++++--
 include/dwarfs/options.h              |  1 +
 src/dwarfs/block_cache.cpp            |  6 +--
 src/dwarfs/filesystem_extractor.cpp   | 63 +++++++++++++++++++--------
 src/dwarfsextract.cpp                 | 19 ++++++++-
 test/dwarfs_compat.cpp                |  2 +-
 7 files changed, 91 insertions(+), 26 deletions(-)

diff --git a/doc/dwarfsextract.md b/doc/dwarfsextract.md
index d1fe458b..65907a01 100644
--- a/doc/dwarfsextract.md
+++ b/doc/dwarfsextract.md
@@ -56,6 +56,18 @@ to disk:
   if no output directory is specified). For a full list of supported formats,
   see libarchive-formats(5).
 
+- `--continue-on-error`:
+  Try to continue with extraction even when errors are encountered. This
+  only applies to errors when reading from the file system image. Errors
+  when writing the extracted files will still be fatal.
+
+- `--disable-integrity-check`:
+  This option disables all block integrity checks on the file system data.
+  There is a non-zero chance that this allows further data to be read from
+  corrupted file systems. However, there's also a non-zero chance that it
+  will completely crash the program. So please don't use this unless you
+  know what you're doing.
+
 - `-n`, `--num-workers=`*value*:
   Number of worker threads used for extracting the filesystem.
 
diff --git a/include/dwarfs/filesystem_extractor.h b/include/dwarfs/filesystem_extractor.h
index 198e25e2..327bb8f6 100644
--- a/include/dwarfs/filesystem_extractor.h
+++ b/include/dwarfs/filesystem_extractor.h
@@ -30,6 +30,11 @@ namespace dwarfs {
 class filesystem_v2;
 class logger;
 
+struct filesystem_extractor_options {
+  size_t max_queued_bytes{4096};
+  bool continue_on_error{false};
+};
+
 class filesystem_extractor {
  public:
   filesystem_extractor(logger& lgr);
@@ -46,8 +51,10 @@ class filesystem_extractor {
 
   void close() { return impl_->close(); }
 
-  void extract(filesystem_v2 const& fs, size_t max_queued_bytes) {
-    return impl_->extract(fs, max_queued_bytes);
+  bool
+  extract(filesystem_v2 const& fs, filesystem_extractor_options const& opts =
+                                       filesystem_extractor_options()) {
+    return impl_->extract(fs, opts);
   }
 
   class impl {
@@ -59,7 +66,8 @@ class filesystem_extractor {
     virtual void open_stream(std::ostream& os, std::string const& format) = 0;
     virtual void open_disk(std::string const& output) = 0;
     virtual void close() = 0;
-    virtual void extract(filesystem_v2 const& fs, size_t max_queued_bytes) = 0;
+    virtual bool extract(filesystem_v2 const& fs,
+                         filesystem_extractor_options const& opts) = 0;
   };
 
  private:
diff --git a/include/dwarfs/options.h b/include/dwarfs/options.h
index eb3631b3..5ec860b7 100644
--- a/include/dwarfs/options.h
+++ b/include/dwarfs/options.h
@@ -43,6 +43,7 @@ struct block_cache_options {
   double decompress_ratio{1.0};
   bool mm_release{true};
   bool init_workers{true};
+  bool disable_block_integrity_check{false};
 };
 
 struct cache_tidy_config {
diff --git a/src/dwarfs/block_cache.cpp b/src/dwarfs/block_cache.cpp
index 947cfe6e..f13cc55b 100644
--- a/src/dwarfs/block_cache.cpp
+++ b/src/dwarfs/block_cache.cpp
@@ -56,7 +56,7 @@ namespace dwarfs {
 class cached_block {
  public:
   cached_block(logger& lgr, fs_section const& b, std::shared_ptr mm,
-               bool release)
+               bool release, bool disable_integrity_check)
       : decompressor_(std::make_unique(
             b.compression(), mm->as(b.start()), b.length(), data_))
       , mm_(std::move(mm))
@@ -64,7 +64,7 @@ class cached_block {
       , LOG_PROXY_INIT(lgr)
       , release_(release)
       , uncompressed_size_{decompressor_->uncompressed_size()} {
-    if (!section_.check_fast(*mm_)) {
+    if (!disable_integrity_check && !section_.check_fast(*mm_)) {
       DWARFS_THROW(runtime_error, "block data integrity check failed");
     }
   }
@@ -483,7 +483,7 @@ class block_cache_ final : public block_cache::impl {
 
       auto block = std::make_shared(
           LOG_GET_LOGGER, DWARFS_NOTHROW(block_.at(block_no)), mm_,
-          options_.mm_release);
+          options_.mm_release, options_.disable_block_integrity_check);
       ++blocks_created_;
 
       // Make a new set for the block
diff --git a/src/dwarfs/filesystem_extractor.cpp b/src/dwarfs/filesystem_extractor.cpp
index 0c2172d6..4109e838 100644
--- a/src/dwarfs/filesystem_extractor.cpp
+++ b/src/dwarfs/filesystem_extractor.cpp
@@ -72,6 +72,11 @@ class cache_semaphore {
   int64_t size_{0};
 };
 
+class archive_error : public std::runtime_error {
+ public:
+  using std::runtime_error::runtime_error;
+};
+
 } // namespace
 
 template
@@ -146,7 +151,8 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {
     closefd(pipefd_[0]);
   }
 
-  void extract(filesystem_v2 const& fs, size_t max_queued_bytes) override;
+  bool extract(filesystem_v2 const& fs,
+               filesystem_extractor_options const& opts) override;
 
  private:
   void closefd(int& fd) {
@@ -189,7 +195,7 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {
       break;
     case ARCHIVE_RETRY:
     case ARCHIVE_FATAL:
-      DWARFS_THROW(runtime_error, std::string(archive_error_string(a_)));
+      throw archive_error(std::string(archive_error_string(a_)));
     }
   }
 
@@ -200,8 +206,8 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {
 };
 
 template
-void filesystem_extractor_::extract(filesystem_v2 const& fs,
-                                    size_t max_queued_bytes) {
+bool filesystem_extractor_::extract(
+    filesystem_v2 const& fs, filesystem_extractor_options const& opts) {
   DWARFS_CHECK(a_, "filesystem not opened");
 
   auto lr = ::archive_entry_linkresolver_new();
@@ -217,11 +223,13 @@ void filesystem_extractor_::extract(filesystem_v2 const& fs,
   worker_group archiver("archiver", 1);
   cache_semaphore sem;
 
-  LOG_DEBUG << "extractor semaphore size: " << max_queued_bytes << " bytes";
+  LOG_DEBUG << "extractor semaphore size: " << opts.max_queued_bytes
+            << " bytes";
 
-  sem.post(max_queued_bytes);
+  sem.post(opts.max_queued_bytes);
 
-  std::atomic abort{false};
+  std::atomic hard_error{0};
+  std::atomic soft_error{0};
 
   auto do_archive = [&](::archive_entry* ae,
                         inode_view entry) { // TODO: inode vs. entry
@@ -232,14 +240,16 @@ void filesystem_extractor_::extract(filesystem_v2 const& fs,
       size_t pos = 0;
       size_t remain = size;
 
-      while (remain > 0 && !abort) {
-        size_t bs = remain < max_queued_bytes ? remain : max_queued_bytes;
+      while (remain > 0 && hard_error == 0) {
+        size_t bs =
+            remain < opts.max_queued_bytes ? remain : opts.max_queued_bytes;
 
         sem.wait(bs);
 
         if (auto ranges = fs.readv(fd, bs, pos)) {
-          archiver.add_job([this, &sem, &abort, ranges = std::move(*ranges), ae,
-                            pos, remain, bs, size]() mutable {
+          archiver.add_job([this, &sem, &hard_error, &soft_error, &opts,
+                            ranges = std::move(*ranges), ae, pos, remain, bs,
+                            size]() mutable {
             try {
               if (pos == 0) {
                 LOG_DEBUG << "extracting " << ::archive_entry_pathname(ae)
@@ -256,9 +266,19 @@ void filesystem_extractor_::extract(filesystem_v2 const& fs,
                 archive_entry_free(ae);
               }
               sem.post(bs);
+            } catch (archive_error const& e) {
+              LOG_ERROR << folly::exceptionStr(e);
+              ++hard_error;
+              sem.post(bs); // return credit so sem.wait() can't stall
             } catch (...) {
-              LOG_ERROR << folly::exceptionStr(std::current_exception());
-              abort = true;
+              if (opts.continue_on_error) {
+                LOG_WARN << folly::exceptionStr(std::current_exception());
+                ++soft_error;
+              } else {
+                LOG_ERROR << folly::exceptionStr(std::current_exception());
+                ++hard_error;
+              }
+              sem.post(bs); // failed chunks must release their credit too
               archive_entry_free(ae);
             }
           });
@@ -273,13 +293,13 @@ void filesystem_extractor_::extract(filesystem_v2 const& fs,
         remain -= bs;
       }
     } else {
-      archiver.add_job([this, ae, &abort] {
+      archiver.add_job([this, ae, &hard_error] {
         SCOPE_EXIT { ::archive_entry_free(ae); };
         try {
           check_result(::archive_write_header(a_, ae));
         } catch (...) {
           LOG_ERROR << folly::exceptionStr(std::current_exception());
-          abort = true;
+          ++hard_error;
         }
       });
     }
@@ -287,7 +307,7 @@ void filesystem_extractor_::extract(filesystem_v2 const& fs,
 
   fs.walk_data_order([&](auto entry) {
     // TODO: we can surely early abort walk() somehow
-    if (entry.is_root() || abort) {
+    if (entry.is_root() || hard_error) {
       return;
     }
 
@@ -329,7 +349,7 @@ void filesystem_extractor_::extract(filesystem_v2 const& fs,
 
   archiver.wait();
 
-  if (abort) {
+  if (hard_error) {
     DWARFS_THROW(runtime_error, "extraction aborted");
   }
 
@@ -340,6 +360,15 @@ void filesystem_extractor_::extract(filesystem_v2 const& fs,
   if (ae) {
     DWARFS_THROW(runtime_error, "unexpected deferred entry");
   }
+
+  if (soft_error > 0) {
+    LOG_ERROR << "extraction finished with " << soft_error << " error(s)";
+    return false;
+  }
+
+  LOG_INFO << "extraction finished without errors";
+
+  return true;
 }
 
 filesystem_extractor::filesystem_extractor(logger& lgr)
diff --git a/src/dwarfsextract.cpp b/src/dwarfsextract.cpp
index 58b94a5b..4917cc5f 100644
--- a/src/dwarfsextract.cpp
+++ b/src/dwarfsextract.cpp
@@ -45,6 +45,7 @@ int dwarfsextract(int argc, char** argv) {
   std::string filesystem, output, format, cache_size_str, log_level,
       image_offset;
   size_t num_workers;
+  bool continue_on_error{false}, disable_integrity_check{false};
 
   // clang-format off
   po::options_description opts("Command line options");
@@ -61,6 +62,12 @@ int dwarfsextract(int argc, char** argv) {
     ("format,f",
         po::value(&format),
         "output format")
+    ("continue-on-error",
+        po::value(&continue_on_error)->zero_tokens(),
+        "continue if errors are encountered")
+    ("disable-integrity-check",
+        po::value(&disable_integrity_check)->zero_tokens(),
+        "disable file system image block integrity check (dangerous)")
     ("num-workers,n",
         po::value(&num_workers)->default_value(4),
         "number of worker threads")
@@ -89,6 +96,8 @@ int dwarfsextract(int argc, char** argv) {
     return 0;
   }
 
+  int rv = 0;
+
   try {
     auto level = logger::parse_level(log_level);
     stream_logger lgr(std::cerr, level, level >= logger::DEBUG);
@@ -103,6 +112,7 @@ int dwarfsextract(int argc, char** argv) {
 
     fsopts.block_cache.max_bytes = parse_size_with_unit(cache_size_str);
     fsopts.block_cache.num_workers = num_workers;
+    fsopts.block_cache.disable_block_integrity_check = disable_integrity_check;
     fsopts.metadata.enable_nlink = true;
 
     filesystem_v2 fs(lgr, std::make_shared(filesystem), fsopts);
@@ -117,7 +127,12 @@ int dwarfsextract(int argc, char** argv) {
       fsx.open_archive(output, format);
     }
 
-    fsx.extract(fs, fsopts.block_cache.max_bytes);
+    filesystem_extractor_options fsx_opts;
+
+    fsx_opts.max_queued_bytes = fsopts.block_cache.max_bytes;
+    fsx_opts.continue_on_error = continue_on_error;
+
+    rv = fsx.extract(fs, fsx_opts) ? 0 : 2;
 
     fsx.close();
   } catch (runtime_error const& e) {
@@ -131,7 +146,7 @@ int dwarfsextract(int argc, char** argv) {
     return 1;
   }
 
-  return 0;
+  return rv;
 }
 
 } // namespace
diff --git a/test/dwarfs_compat.cpp b/test/dwarfs_compat.cpp
index 523edc4e..7dd74d2a 100644
--- a/test/dwarfs_compat.cpp
+++ b/test/dwarfs_compat.cpp
@@ -972,7 +972,7 @@ void check_compat(logger& lgr, filesystem_v2 const& fs,
   std::ostringstream oss;
 
   EXPECT_NO_THROW(ext.open_stream(oss, "mtree"));
-  EXPECT_NO_THROW(ext.extract(fs, 4096));
+  EXPECT_NO_THROW(ext.extract(fs));
   EXPECT_NO_THROW(ext.close());
 
   std::istringstream iss(oss.str());