From 7bdc90223a8fce463087f0730bdf99e73c4d63c0 Mon Sep 17 00:00:00 2001
From: Marcus Holland-Moritz
Date: Mon, 29 Apr 2024 09:31:57 +0200
Subject: [PATCH] fix(file_scanner): unexpected "inode has no file" (fixes gh #217)

When introducing an optimization to skip hashing of large files if they
already differ in the first 4 KiB, only `unique_size_` was updated to be
keyed on (size, start_hash). However, the same change is necessary for
`first_file_hashed_`, as there can otherwise be collisions if files with
the same size but different start hashes are processed at the same time.
---
 src/dwarfs/file_scanner.cpp | 16 ++++---
 test/tool_main_test.cpp     | 86 +++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 7 deletions(-)

diff --git a/src/dwarfs/file_scanner.cpp b/src/dwarfs/file_scanner.cpp
index e7b645ad..340863e4 100644
--- a/src/dwarfs/file_scanner.cpp
+++ b/src/dwarfs/file_scanner.cpp
@@ -154,7 +154,8 @@ class file_scanner_ final : public file_scanner::impl {
   // We need this lookup table to later find the unique_size_ entry
   // given just a file pointer.
   folly::F14FastMap<file const*, uint64_t> file_start_hash_;
-  folly::F14FastMap<uint64_t, std::shared_ptr<condition_barrier>>
+  folly::F14FastMap<std::pair<uint64_t, uint64_t>,
+                    std::shared_ptr<condition_barrier>>
       first_file_hashed_;
   folly::F14FastMap<uint64_t, inode::files_vector> by_raw_inode_;
   folly::F14FastMap<std::string_view, inode::files_vector> by_hash_;
@@ -310,8 +311,9 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
     file_start_hash_.emplace(p, start_hash);
   }
 
-  auto [it, is_new] = unique_size_.emplace(std::make_pair(size, start_hash),
-                                           inode::files_vector());
+  auto const unique_key = std::make_pair(size, start_hash);
+
+  auto [it, is_new] = unique_size_.emplace(unique_key, inode::files_vector());
 
   if (is_new) {
     // A file (size, start_hash) that has never been seen before. We can safely
@@ -332,7 +334,7 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
     // This is any file of this (size, start_hash) after the second file
     std::lock_guard lock(mx_);
 
-    if (auto ffi = first_file_hashed_.find(size);
+    if (auto ffi = first_file_hashed_.find(unique_key);
         ffi != first_file_hashed_.end()) {
       cv = ffi->second;
     }
@@ -347,12 +349,12 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
     {
       std::lock_guard lock(mx_);
       DWARFS_CHECK(
-          first_file_hashed_.emplace(size, cv).second,
+          first_file_hashed_.emplace(unique_key, cv).second,
           "internal error: first file condition barrier already exists");
     }
 
     // Add a job for the first file
-    wg_.add_job([this, p = it->second.front(), cv] {
+    wg_.add_job([this, p = it->second.front(), cv, unique_key] {
       hash_file(p);
 
       {
@@ -371,7 +373,7 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
 
         cv->set();
 
-        first_file_hashed_.erase(p->size());
+        first_file_hashed_.erase(unique_key);
       }
 
       cv->notify();
diff --git a/test/tool_main_test.cpp b/test/tool_main_test.cpp
index 5f2540ff..2c14803a 100644
--- a/test/tool_main_test.cpp
+++ b/test/tool_main_test.cpp
@@ -2701,3 +2701,89 @@ TEST(block_cache, sequential_access_detector) {
     }
   }
 }
+
+TEST(file_scanner, large_file_handling) {
+  using namespace std::chrono_literals;
+
+  /*
+    We have 5 files, each 1 MiB in size. Files 0 and 3 are identical, as are
+    files 1, 2 and 4. In order to reproduce the bug from GitHub #217, we must
+    ensure the following order of events. Note that this description is only
+    accurate for the old, buggy code.
+
+    [10ms]  `f0` is discovered; the first 4 KiB are hashed; unique_size_
+            is updated with (s, h0) -> f0; inode i0 is created
+
+    [20ms]  `f1` is discovered; the first 4 KiB are hashed; unique_size_
+            is updated with (s, h1) -> f1; inode i1 is created
+
+    [30ms]  `f2` is discovered; the first 4 KiB are hashed; (s, h2) ==
+            (s, h1) is found in unique_size_; latch l0 is created in
+            slot s; a hash job is started for f1; unique_size_[(s, h2)]
+            -> []; a hash job is started for f2
+
+    [40ms]  `f3` is discovered; the first 4 KiB are hashed; (s, h3) ==
+            (s, h0) is found in unique_size_; latch l1 is created but
+            cannot be stored in slot s because it's occupied by l0; a
+            hash job is started for f0; unique_size_[(s, h3)] -> []; a
+            hash job is started for f3
+
+    [50ms]  `f4` is discovered; the first 4 KiB are hashed; (s, h4) ==
+            (s, h0) is found in unique_size_; latch l0 is found in slot
+            s [where we would rather have expected l1]; a hash job is
+            started for f4
+
+    [60ms]  the hash job for f1 completes; latch l0 is released; f1 (i1)
+            is added to `by_hash_`; latch l0 is removed from slot s
+
+    [70ms]  the hash job for f4 completes; latch l0 has already been
+            released; the hash for f4 is not in `by_hash_`; a new inode
+            i2 is created; f4 (i2) is added to `by_hash_` [THIS IS THE
+            BUG]
+
+    [80ms]  the hash job for f0 completes; latch l1 is released; the
+            hash for f0 is already in `by_hash_` [per f4, which
+            shouldn't be there yet]; f0 (i0) is added to `by_hash_`; an
+            attempt is made to remove latch l1 from slot s [but it's
+            not there, which isn't checked]
+
+    [90ms]  the hash job for f2 completes; latch l0 has already been
+            released; the hash for f2 == f1 is already in `by_hash_`;
+            f2 (i1) is added [this is irrelevant]
+
+    [100ms] the hash job for f3 completes; latch l1 has already been
+            released; the hash for f3 == f0 is already in `by_hash_`;
+            f3 (i0) is added [this is irrelevant]
+  */
+
+  std::vector<std::string> data(5, test::loremipsum(1 << 20));
+  std::vector<std::chrono::milliseconds> delays{40ms, 30ms, 60ms, 60ms, 20ms};
+
+  data[1][100] ^= 0x01;
+  data[2][100] ^= 0x01;
+
+  auto t = mkdwarfs_tester::create_empty();
+  t.add_root_dir();
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    auto file = fmt::format("f{}", i);
+    t.os->add_file(file, data[i]);
+    t.os->set_map_file_delay(fs::path{"/"} / file, delays[i]);
+  }
+
+  t.os->set_map_file_delay_min_size(10'000);
+  t.os->set_dir_reader_delay(10ms);
+
+  ASSERT_EQ(0, t.run("-i / -o - -l1")) << t.err();
+
+  auto fs = t.fs_from_stdout();
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    auto iv = fs.find(fmt::format("f{}", i).c_str());
+    ASSERT_TRUE(iv) << i;
+    file_stat st;
+    ASSERT_EQ(0, fs.getattr(*iv, &st)) << i;
+    std::string buffer;
+    buffer.resize(st.size);
+    auto nread = fs.read(iv->inode_num(), buffer.data(), st.size);
+    EXPECT_EQ(data[i].size(), nread) << i;
+    EXPECT_EQ(data[i], buffer) << i;
+  }
+}
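
Note (not part of the patch): as a minimal illustration of the collision the
commit message describes, the sketch below keys a map of condition barriers
once on size alone and once on (size, start_hash). It is a standalone model
under simplifying assumptions: condition_barrier is an empty stand-in for
the real dwarfs type, std::map replaces folly::F14FastMap, and none of the
names appear in the repository; it models only the keying, not the
synchronization.

// Standalone sketch, not part of the patch: condition_barrier is an empty
// stand-in for the real dwarfs type, and std::map replaces folly::F14FastMap.
#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <utility>

struct condition_barrier {}; // the real type wraps a condition variable

int main() {
  using key_type = std::pair<uint64_t, uint64_t>; // (size, start_hash)

  std::map<uint64_t, std::shared_ptr<condition_barrier>> keyed_on_size;
  std::map<key_type, std::shared_ptr<condition_barrier>> keyed_on_key;

  uint64_t const size = 1 << 20; // both file groups are 1 MiB
  uint64_t const h0 = 0x1111;    // start hash of the f0/f3 group
  uint64_t const h1 = 0x2222;    // start hash of the f1/f2/f4 group

  auto cv0 = std::make_shared<condition_barrier>();
  auto cv1 = std::make_shared<condition_barrier>();

  // Keyed on size alone, two groups with equal size but different start
  // hashes fight over a single slot: the second emplace fails, so a later
  // file of that group finds (and waits on) the wrong barrier.
  std::cout << keyed_on_size.emplace(size, cv1).second << '\n'; // prints 1
  std::cout << keyed_on_size.emplace(size, cv0).second << '\n'; // prints 0

  // Keyed on (size, start_hash), each group owns its own slot, which is
  // the invariant the patch's DWARFS_CHECK on first_file_hashed_ asserts.
  std::cout << keyed_on_key.emplace(key_type{size, h1}, cv1).second << '\n'; // 1
  std::cout << keyed_on_key.emplace(key_type{size, h0}, cv0).second << '\n'; // 1
}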