Mirror of https://github.com/mhx/dwarfs.git, synced 2025-09-09 12:28:13 -04:00
fix(file_scanner): unexpected "inode has no file" (fixes gh #217)
When an optimization was introduced to skip hashing of large files that already differ in their first 4 KiB, only `unique_size_` was updated to be keyed on (size, start_hash). The same change is necessary for `first_file_hashed_`: otherwise, collisions can occur when files with the same size but different start hashes are processed concurrently.
parent 407b75b28c
commit 7bdc90223a
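
To make the collision concrete, here is a minimal standalone sketch (hypothetical hash values and a placeholder barrier type, not the actual dwarfs code): keyed on size alone, a second group of same-sized files finds the first group's barrier; keyed on (size, start_hash), each group gets its own slot.

#include <cassert>
#include <cstdint>
#include <map>
#include <memory>
#include <utility>

// Placeholder for the scanner's condition_barrier; only identity matters here.
struct condition_barrier {};

int main() {
  using barrier_ptr = std::shared_ptr<condition_barrier>;

  // Two 1 MiB files whose first 4 KiB differ: same size, different start hash.
  constexpr uint64_t size = 1 << 20;
  constexpr uint64_t h0 = 0xaaaa; // start hash of the first group
  constexpr uint64_t h1 = 0xbbbb; // start hash of the second group

  // Buggy keying: size only. The second group's lookup finds the barrier
  // registered by the first group -- a collision.
  std::map<uint64_t, barrier_ptr> by_size;
  by_size.emplace(size, std::make_shared<condition_barrier>());
  assert(by_size.find(size) != by_size.end());

  // Fixed keying: (size, start_hash). The second group's lookup finds
  // nothing, so it cannot wait on (or erase) the wrong barrier.
  std::map<std::pair<uint64_t, uint64_t>, barrier_ptr> by_size_and_hash;
  by_size_and_hash.emplace(std::make_pair(size, h0),
                           std::make_shared<condition_barrier>());
  assert(by_size_and_hash.find({size, h1}) == by_size_and_hash.end());

  return 0;
}

The same reasoning covers the erase in the hash job below: erasing by p->size() could drop another group's barrier, which is why the lambda now captures unique_key.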
@@ -154,7 +154,8 @@ class file_scanner_ final : public file_scanner::impl {
   // We need this lookup table to later find the unique_size_ entry
   // given just a file pointer.
   folly::F14FastMap<file const*, uint64_t> file_start_hash_;
-  folly::F14FastMap<uint64_t, std::shared_ptr<condition_barrier>>
+  folly::F14FastMap<std::pair<uint64_t, uint64_t>,
+                    std::shared_ptr<condition_barrier>>
       first_file_hashed_;
   folly::F14FastMap<uint64_t, inode::files_vector> by_raw_inode_;
   folly::F14FastMap<std::string_view, inode::files_vector> by_hash_;
@@ -310,8 +311,9 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
     file_start_hash_.emplace(p, start_hash);
   }
 
-  auto [it, is_new] = unique_size_.emplace(std::make_pair(size, start_hash),
-                                           inode::files_vector());
+  auto const unique_key = std::make_pair(size, start_hash);
+
+  auto [it, is_new] = unique_size_.emplace(unique_key, inode::files_vector());
 
   if (is_new) {
     // A file (size, start_hash) that has never been seen before. We can safely
@@ -332,7 +334,7 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
     // This is any file of this (size, start_hash) after the second file
     std::lock_guard lock(mx_);
 
-    if (auto ffi = first_file_hashed_.find(size);
+    if (auto ffi = first_file_hashed_.find(unique_key);
         ffi != first_file_hashed_.end()) {
       cv = ffi->second;
     }
@@ -347,12 +349,12 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
     {
       std::lock_guard lock(mx_);
       DWARFS_CHECK(
-          first_file_hashed_.emplace(size, cv).second,
+          first_file_hashed_.emplace(unique_key, cv).second,
          "internal error: first file condition barrier already exists");
     }
 
     // Add a job for the first file
-    wg_.add_job([this, p = it->second.front(), cv] {
+    wg_.add_job([this, p = it->second.front(), cv, unique_key] {
      hash_file(p);
 
      {
@@ -371,7 +373,7 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
 
        cv->set();
 
-        first_file_hashed_.erase(p->size());
+        first_file_hashed_.erase(unique_key);
      }
 
      cv->notify();
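
The condition_barrier above is the latch that the test below calls l0/l1: the first file of a (size, start_hash) group is hashed on a worker thread while later files of the same group wait for its result. Its implementation is not part of this diff; the following is a hypothetical sketch that matches the set()/notify() calls visible above, with a small usage example.

#include <condition_variable>
#include <mutex>
#include <thread>

// Hypothetical one-shot barrier: set() flips a flag (while holding the
// scanner's mutex), notify() wakes all waiters, wait() blocks until set.
class condition_barrier {
 public:
  void set() { ready_ = true; }

  void notify() { cv_.notify_all(); }

  void wait(std::unique_lock<std::mutex>& lock) {
    cv_.wait(lock, [this] { return ready_; });
  }

 private:
  std::condition_variable cv_;
  bool ready_{false};
};

int main() {
  std::mutex mx;
  condition_barrier cv;

  // The hash job: mirrors cv->set() under the lock and cv->notify() after
  // releasing it, as in the diff above.
  std::thread hasher([&] {
    {
      std::lock_guard lock(mx);
      cv.set();
    }
    cv.notify();
  });

  // A later file of the same group blocks until the first file is hashed.
  {
    std::unique_lock lock(mx);
    cv.wait(lock);
  }

  hasher.join();
  return 0;
}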
@@ -2701,3 +2701,89 @@ TEST(block_cache, sequential_access_detector) {
   }
  }
 }
+
+TEST(file_scanner, large_file_handling) {
+  using namespace std::chrono_literals;
+
+  /*
+    We have 5 files, each 1 MiB in size. Files 0, 3 and 4 are identical, as
+    are files 1 and 2. In order to reproduce the bug from github #217, we
+    must ensure the following order of events. Note that this description is
+    only accurate for the old, buggy code.
+
+    [10ms]  `f0` is discovered; the first 4K are hashed; unique_size_ is
+            updated with (s, h0) -> f0; inode i0 is created
+
+    [20ms]  `f1` is discovered; the first 4K are hashed; unique_size_ is
+            updated with (s, h1) -> f1; inode i1 is created
+
+    [30ms]  `f2` is discovered; the first 4K are hashed; (s, h2) == (s, h1)
+            is found in unique_size_; latch l0 is created in slot s; a hash
+            job is started for f1; unique_size_[(s, h2)] -> []; a hash job
+            is started for f2
+
+    [40ms]  `f3` is discovered; the first 4K are hashed; (s, h3) == (s, h0)
+            is found in unique_size_; latch l1 is created but cannot be
+            stored in slot s because it's occupied by l0; a hash job is
+            started for f0; unique_size_[(s, h3)] -> []; a hash job is
+            started for f3
+
+    [50ms]  `f4` is discovered; the first 4K are hashed; (s, h4) == (s, h0)
+            is found in unique_size_; latch l0 is found in slot s [where we
+            would have rather expected l1]; a hash job is started for f4
+
+    [60ms]  the hash job for f1 completes; latch l0 is released; f1 (i1) is
+            added to `by_hash_`; latch l0 is removed from slot s
+
+    [70ms]  the hash job for f4 completes; latch l0 has already been
+            released; the hash for f4 is not in `by_hash_`; a new inode i2
+            is created; f4 (i2) is added to `by_hash_` [THIS IS THE BUG]
+
+    [80ms]  the hash job for f0 completes; latch l1 is released; the hash
+            for f0 is already in `by_hash_` [per f4, which shouldn't be
+            there yet]; f0 (i0) is added to `by_hash_`; an attempt is made
+            to remove latch l1 from slot s [but it's not there, which isn't
+            checked]
+
+    [90ms]  the hash job for f2 completes; latch l0 has already been
+            released; the hash for f2 == f1 is already in `by_hash_`; f2
+            (i1) is added [this is irrelevant]
+
+    [100ms] the hash job for f3 completes; latch l1 has already been
+            released; the hash for f3 == f0 is already in `by_hash_`; f3
+            (i0) is added [this is irrelevant]
+  */
+
+  std::vector<std::string> data(5, test::loremipsum(1 << 20));
+  std::vector<std::chrono::milliseconds> delays{40ms, 30ms, 60ms, 60ms, 20ms};
+
+  data[1][100] ^= 0x01;
+  data[2][100] ^= 0x01;
+
+  auto t = mkdwarfs_tester::create_empty();
+  t.add_root_dir();
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    auto file = fmt::format("f{}", i);
+    t.os->add_file(file, data[i]);
+    t.os->set_map_file_delay(fs::path{"/"} / file, delays[i]);
+  }
+
+  t.os->set_map_file_delay_min_size(10'000);
+  t.os->set_dir_reader_delay(10ms);
+
+  ASSERT_EQ(0, t.run("-i / -o - -l1")) << t.err();
+
+  auto fs = t.fs_from_stdout();
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    auto iv = fs.find(fmt::format("f{}", i).c_str());
+    ASSERT_TRUE(iv) << i;
+    file_stat st;
+    ASSERT_EQ(0, fs.getattr(*iv, &st)) << i;
+    std::string buffer;
+    buffer.resize(st.size);
+    auto nread = fs.read(iv->inode_num(), buffer.data(), st.size);
+    EXPECT_EQ(data[i].size(), nread) << i;
+    EXPECT_EQ(data[i], buffer) << i;
+  }
+}