Mirror of https://github.com/mhx/dwarfs.git, synced 2025-09-12 05:49:56 -04:00
Fix #104: read large files in chunks rather than fully
This changes the way data is sent to libarchive. For files larger than `max_queued_bytes`, instead of reading the whole file into memory and handing it to libarchive in one piece, the code now reads chunks of at most `max_queued_bytes` and sends each chunk to libarchive independently. Small files are handled as before. When extracting large files, the chunked method is actually a lot faster, as it puts less strain on the memory allocator.
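As a standalone illustration of the chunking scheme (not the extractor code itself), here is a minimal sketch; `read_range()` and `write_chunk()` are hypothetical stand-ins for `fs.readv()` and `::archive_write_data()`:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for the filesystem read; in dwarfs this role is
// played by filesystem_v2::readv(), which yields ranges of file data.
std::vector<char> read_range(std::size_t offset, std::size_t count) {
  (void)offset;
  return std::vector<char>(count, 'x');
}

// Hypothetical stand-in for the libarchive sink (archive_write_data()).
void write_chunk(std::vector<char> const& buf) {
  std::cout << "writing " << buf.size() << " bytes\n";
}

int main() {
  constexpr std::size_t max_queued_bytes = 1 << 20;  // 1 MiB per chunk
  constexpr std::size_t file_size = (5 << 20) + 123; // example file size

  std::size_t pos = 0;
  std::size_t remain = file_size;

  // Forward at most max_queued_bytes per iteration; the full file is
  // never materialized in memory at once. Files smaller than the limit
  // go through the loop exactly once, i.e. they behave as before.
  while (remain > 0) {
    std::size_t bs = std::min(remain, max_queued_bytes);
    write_chunk(read_range(pos, bs));
    pos += bs;
    remain -= bs;
  }
}
```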
parent dc8490f583
commit 186eb763a3
@@ -217,6 +217,8 @@ void filesystem_extractor_<LoggerPolicy>::extract(filesystem_v2 const& fs,
   worker_group archiver("archiver", 1);
   cache_semaphore sem;
 
+  LOG_DEBUG << "extractor semaphore size: " << max_queued_bytes << " bytes";
+
   sem.post(max_queued_bytes);
 
   std::atomic<bool> abort{false};
@@ -227,29 +229,48 @@ void filesystem_extractor_<LoggerPolicy>::extract(filesystem_v2 const& fs,
           S_ISREG(entry.mode()) && size > 0) {
         auto fd = fs.open(entry);
 
-        sem.wait(size);
+        size_t pos = 0;
+        size_t remain = size;
 
-        if (auto ranges = fs.readv(fd, size, 0)) {
-          archiver.add_job([this, &sem, &abort, ranges = std::move(*ranges), ae,
-                            size]() mutable {
-            SCOPE_EXIT { ::archive_entry_free(ae); };
-            try {
-              LOG_TRACE << "archiving " << ::archive_entry_pathname(ae);
-              check_result(::archive_write_header(a_, ae));
-              for (auto& r : ranges) {
-                auto br = r.get();
-                LOG_TRACE << "writing " << br.size() << " bytes";
-                check_result(::archive_write_data(a_, br.data(), br.size()));
-              }
-              sem.post(size);
-            } catch (...) {
-              LOG_ERROR << folly::exceptionStr(std::current_exception());
-              abort = true;
-            }
-          });
-        } else {
-          LOG_ERROR << "error reading inode [" << fd
-                    << "]: " << ::strerror(-ranges.error());
-        }
+        while (remain > 0 && !abort) {
+          size_t bs = remain < max_queued_bytes ? remain : max_queued_bytes;
+
+          sem.wait(bs);
+
+          if (auto ranges = fs.readv(fd, bs, pos)) {
+            archiver.add_job([this, &sem, &abort, ranges = std::move(*ranges), ae,
+                              pos, remain, bs, size]() mutable {
+              try {
+                if (pos == 0) {
+                  LOG_DEBUG << "extracting " << ::archive_entry_pathname(ae)
+                            << " (" << size << " bytes)";
+                  check_result(::archive_write_header(a_, ae));
+                }
+                for (auto& r : ranges) {
+                  auto br = r.get();
+                  LOG_TRACE << "[" << pos << "] writing " << br.size()
+                            << " bytes for " << ::archive_entry_pathname(ae);
+                  check_result(::archive_write_data(a_, br.data(), br.size()));
+                }
+                if (bs == remain) {
+                  archive_entry_free(ae);
+                }
+                sem.post(bs);
+              } catch (...) {
+                LOG_ERROR << folly::exceptionStr(std::current_exception());
+                abort = true;
+                archive_entry_free(ae);
+              }
+            });
+          } else {
+            LOG_ERROR << "error reading " << bs << " bytes at offset " << pos
+                      << " from inode [" << fd
+                      << "]: " << ::strerror(-ranges.error());
+            break;
+          }
+
+          pos += bs;
+          remain -= bs;
+        }
       } else {
         archiver.add_job([this, ae, &abort] {
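The backpressure in the diff comes from the byte-counting semaphore: the extractor posts `max_queued_bytes` once up front, takes `bs` units before each read, and returns them only after the archiver job has written the chunk, so at most `max_queued_bytes` of file data is in flight at any time. `cache_semaphore` is internal to dwarfs; as an assumption about its `wait()`/`post()` interface rather than its actual implementation, a minimal byte-counting semaphore could look like this:

```cpp
#include <condition_variable>
#include <cstddef>
#include <mutex>

// Minimal sketch of a byte-counting semaphore with the wait()/post()
// interface used in the diff above. This is an assumption about how
// dwarfs's cache_semaphore behaves, not its actual code.
class byte_semaphore {
 public:
  // Block until at least `bytes` units are available, then take them.
  void wait(std::size_t bytes) {
    std::unique_lock lock(mx_);
    cv_.wait(lock, [&] { return avail_ >= bytes; });
    avail_ -= bytes;
  }

  // Return `bytes` units and wake any blocked producer.
  void post(std::size_t bytes) {
    {
      std::lock_guard lock(mx_);
      avail_ += bytes;
    }
    cv_.notify_all();
  }

 private:
  std::mutex mx_;
  std::condition_variable cv_;
  std::size_t avail_{0};
};
```

Note also how entry ownership is handed across jobs: instead of the per-job `SCOPE_EXIT` of the old code, the job that writes the final chunk (`bs == remain`) frees the `archive_entry`, and the error path frees it as well.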