Mirror of https://github.com/mhx/dwarfs.git, synced 2025-09-09 12:28:13 -04:00
fix(file_scanner): unexpected "inode has no file" (fixes gh #217)
When an optimization was introduced to skip hashing of large files that already differ in their first 4 KiB, only `unique_size_` was updated to be keyed on (size, start_hash). The same change is necessary for `first_file_hashed_`: otherwise, collisions can occur when files with the same size but different start hashes are processed concurrently.
parent 407b75b28c
commit 7bdc90223a
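
To make the collision concrete, here is a minimal standalone sketch (hypothetical hash values and a placeholder barrier type, not the actual dwarfs code): keyed on size alone, a second group of same-sized files finds the first group's barrier; keyed on (size, start_hash), each group gets its own slot.

#include <cassert>
#include <cstdint>
#include <map>
#include <memory>
#include <utility>

// Placeholder for the scanner's condition_barrier; only identity matters here.
struct condition_barrier {};

int main() {
  using barrier_ptr = std::shared_ptr<condition_barrier>;

  // Two 1 MiB files whose first 4 KiB differ: same size, different start hash.
  constexpr uint64_t size = 1 << 20;
  constexpr uint64_t h0 = 0xaaaa; // start hash of the first group
  constexpr uint64_t h1 = 0xbbbb; // start hash of the second group

  // Buggy keying: size only. The second group's lookup finds the barrier
  // registered by the first group -- a collision.
  std::map<uint64_t, barrier_ptr> by_size;
  by_size.emplace(size, std::make_shared<condition_barrier>());
  assert(by_size.find(size) != by_size.end());

  // Fixed keying: (size, start_hash). The second group's lookup finds
  // nothing, so it cannot wait on (or erase) the wrong barrier.
  std::map<std::pair<uint64_t, uint64_t>, barrier_ptr> by_size_and_hash;
  by_size_and_hash.emplace(std::make_pair(size, h0),
                           std::make_shared<condition_barrier>());
  assert(by_size_and_hash.find({size, h1}) == by_size_and_hash.end());

  return 0;
}

The same reasoning covers the erase in the hash job below: erasing by p->size() could drop another group's barrier, which is why the lambda now captures unique_key.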
@@ -154,7 +154,8 @@ class file_scanner_ final : public file_scanner::impl {
   // We need this lookup table to later find the unique_size_ entry
   // given just a file pointer.
   folly::F14FastMap<file const*, uint64_t> file_start_hash_;
-  folly::F14FastMap<uint64_t, std::shared_ptr<condition_barrier>>
+  folly::F14FastMap<std::pair<uint64_t, uint64_t>,
+                    std::shared_ptr<condition_barrier>>
       first_file_hashed_;
   folly::F14FastMap<uint64_t, inode::files_vector> by_raw_inode_;
   folly::F14FastMap<std::string_view, inode::files_vector> by_hash_;
@@ -310,8 +311,9 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
     file_start_hash_.emplace(p, start_hash);
   }
 
-  auto [it, is_new] = unique_size_.emplace(std::make_pair(size, start_hash),
-                                           inode::files_vector());
+  auto const unique_key = std::make_pair(size, start_hash);
+
+  auto [it, is_new] = unique_size_.emplace(unique_key, inode::files_vector());
 
   if (is_new) {
     // A file (size, start_hash) that has never been seen before. We can safely
@@ -332,7 +334,7 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
     // This is any file of this (size, start_hash) after the second file
     std::lock_guard lock(mx_);
 
-    if (auto ffi = first_file_hashed_.find(size);
+    if (auto ffi = first_file_hashed_.find(unique_key);
         ffi != first_file_hashed_.end()) {
       cv = ffi->second;
     }
@@ -347,12 +349,12 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
     {
       std::lock_guard lock(mx_);
       DWARFS_CHECK(
-          first_file_hashed_.emplace(size, cv).second,
+          first_file_hashed_.emplace(unique_key, cv).second,
          "internal error: first file condition barrier already exists");
     }
 
     // Add a job for the first file
-    wg_.add_job([this, p = it->second.front(), cv] {
+    wg_.add_job([this, p = it->second.front(), cv, unique_key] {
      hash_file(p);
 
      {
@@ -371,7 +373,7 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
 
        cv->set();
 
-        first_file_hashed_.erase(p->size());
+        first_file_hashed_.erase(unique_key);
      }
 
      cv->notify();
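
The condition_barrier above is the latch that the test below calls l0/l1: the first file of a (size, start_hash) group is hashed on a worker thread while later files of the same group wait for its result. Its implementation is not part of this diff; the following is a hypothetical sketch that matches the set()/notify() calls visible above, with a small usage example.

#include <condition_variable>
#include <mutex>
#include <thread>

// Hypothetical one-shot barrier: set() flips a flag (while holding the
// scanner's mutex), notify() wakes all waiters, wait() blocks until set.
class condition_barrier {
 public:
  void set() { ready_ = true; }

  void notify() { cv_.notify_all(); }

  void wait(std::unique_lock<std::mutex>& lock) {
    cv_.wait(lock, [this] { return ready_; });
  }

 private:
  std::condition_variable cv_;
  bool ready_{false};
};

int main() {
  std::mutex mx;
  condition_barrier cv;

  // The hash job: mirrors cv->set() under the lock and cv->notify() after
  // releasing it, as in the diff above.
  std::thread hasher([&] {
    {
      std::lock_guard lock(mx);
      cv.set();
    }
    cv.notify();
  });

  // A later file of the same group blocks until the first file is hashed.
  {
    std::unique_lock lock(mx);
    cv.wait(lock);
  }

  hasher.join();
  return 0;
}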
@@ -2701,3 +2701,89 @@ TEST(block_cache, sequential_access_detector) {
   }
  }
 }
+
+TEST(file_scanner, large_file_handling) {
+  using namespace std::chrono_literals;
+
+  /*
+    We have 5 files, each 1 MiB in size. Files 0, 3 and 4 are identical, as
+    are files 1 and 2. In order to reproduce the bug from github #217, we
+    must ensure the following order of events. Note that this description is
+    only accurate for the old, buggy code.
+
+    [10ms]  `f0` is discovered; the first 4K are hashed; unique_size_ is
+            updated with (s, h0) -> f0; inode i0 is created
+
+    [20ms]  `f1` is discovered; the first 4K are hashed; unique_size_ is
+            updated with (s, h1) -> f1; inode i1 is created
+
+    [30ms]  `f2` is discovered; the first 4K are hashed; (s, h2) == (s, h1)
+            is found in unique_size_; latch l0 is created in slot s; a hash
+            job is started for f1; unique_size_[(s, h2)] -> []; a hash job
+            is started for f2
+
+    [40ms]  `f3` is discovered; the first 4K are hashed; (s, h3) == (s, h0)
+            is found in unique_size_; latch l1 is created but cannot be
+            stored in slot s because it's occupied by l0; a hash job is
+            started for f0; unique_size_[(s, h3)] -> []; a hash job is
+            started for f3
+
+    [50ms]  `f4` is discovered; the first 4K are hashed; (s, h4) == (s, h0)
+            is found in unique_size_; latch l0 is found in slot s [where we
+            would have rather expected l1]; a hash job is started for f4
+
+    [60ms]  the hash job for f1 completes; latch l0 is released; f1 (i1) is
+            added to `by_hash_`; latch l0 is removed from slot s
+
+    [70ms]  the hash job for f4 completes; latch l0 has already been
+            released; the hash for f4 is not in `by_hash_`; a new inode i2
+            is created; f4 (i2) is added to `by_hash_` [THIS IS THE BUG]
+
+    [80ms]  the hash job for f0 completes; latch l1 is released; the hash
+            for f0 is already in `by_hash_` [per f4, which shouldn't be
+            there yet]; f0 (i0) is added to `by_hash_`; an attempt is made
+            to remove latch l1 from slot s [but it's not there, which isn't
+            checked]
+
+    [90ms]  the hash job for f2 completes; latch l0 has already been
+            released; the hash for f2 == f1 is already in `by_hash_`; f2
+            (i1) is added [this is irrelevant]
+
+    [100ms] the hash job for f3 completes; latch l1 has already been
+            released; the hash for f3 == f0 is already in `by_hash_`; f3
+            (i0) is added [this is irrelevant]
+  */
+
+  std::vector<std::string> data(5, test::loremipsum(1 << 20));
+  std::vector<std::chrono::milliseconds> delays{40ms, 30ms, 60ms, 60ms, 20ms};
+
+  data[1][100] ^= 0x01;
+  data[2][100] ^= 0x01;
+
+  auto t = mkdwarfs_tester::create_empty();
+  t.add_root_dir();
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    auto file = fmt::format("f{}", i);
+    t.os->add_file(file, data[i]);
+    t.os->set_map_file_delay(fs::path{"/"} / file, delays[i]);
+  }
+
+  t.os->set_map_file_delay_min_size(10'000);
+  t.os->set_dir_reader_delay(10ms);
+
+  ASSERT_EQ(0, t.run("-i / -o - -l1")) << t.err();
+
+  auto fs = t.fs_from_stdout();
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    auto iv = fs.find(fmt::format("f{}", i).c_str());
+    ASSERT_TRUE(iv) << i;
+    file_stat st;
+    ASSERT_EQ(0, fs.getattr(*iv, &st)) << i;
+    std::string buffer;
+    buffer.resize(st.size);
+    auto nread = fs.read(iv->inode_num(), buffer.data(), st.size);
+    EXPECT_EQ(data[i].size(), nread) << i;
+    EXPECT_EQ(data[i], buffer) << i;
+  }
+}