From a0377f74e15b99ba8be5d4bebe3ccf66f727edb2 Mon Sep 17 00:00:00 2001
From: Marcus Holland-Moritz
Date: Tue, 30 Jan 2024 19:36:37 +0100
Subject: [PATCH] feat(file_scanner): prevent unnecessary hashing of large
 files

---
 src/dwarfs/file_scanner.cpp | 76 +++++++++++++++++++++++++++++--------
 1 file changed, 61 insertions(+), 15 deletions(-)

diff --git a/src/dwarfs/file_scanner.cpp b/src/dwarfs/file_scanner.cpp
index 6b37280f..f732dcb6 100644
--- a/src/dwarfs/file_scanner.cpp
+++ b/src/dwarfs/file_scanner.cpp
@@ -26,6 +26,9 @@
 #include
 #include
 
+#include
+
+#include "dwarfs/checksum.h"
 #include "dwarfs/entry.h"
 #include "dwarfs/file_scanner.h"
 #include "dwarfs/inode.h"
@@ -41,6 +44,9 @@ namespace dwarfs::detail {
 
 namespace {
 
+constexpr size_t const kLargeFileThreshold = 1024 * 1024;
+constexpr size_t const kLargeFileStartHashSize = 4096;
+
 template <typename LoggerPolicy>
 class file_scanner_ final : public file_scanner::impl {
  public:
@@ -94,7 +100,14 @@ class file_scanner_ final : public file_scanner::impl {
   uint32_t num_unique_{0};
   folly::F14FastMap<uint64_t, inode::files_vector> hardlinks_;
   std::mutex mx_;
-  folly::F14FastMap<uint64_t, inode::files_vector> unique_size_;
+  // The pair stores the file size and optionally a hash of the first
+  // 4 KiB of the file. If there's a collision, the worst that can
+  // happen is that we unnecessarily hash a file that is not a duplicate.
+  folly::F14FastMap<std::pair<uint64_t, uint64_t>, inode::files_vector>
+      unique_size_;
+  // We need this lookup table to later find the unique_size_ entry
+  // given just a file pointer.
+  folly::F14FastMap<file const*, uint64_t> file_start_hash_;
   folly::F14FastMap<uint64_t, std::shared_ptr<condition_barrier>>
       first_file_hashed_;
   folly::F14FastMap<uint64_t, inode::files_vector> by_raw_inode_;
@@ -102,18 +115,19 @@ class file_scanner_ final : public file_scanner::impl {
 };
 
 // The `unique_size_` table holds an entry for each file size we
-// discover:
+// discover, and optionally - for large files - an XXH3 hash of the
+// first 4 KiB of the file.
 //
-// - When we first discover a new file size, we know for sure that
-//   this file is *not* a duplicate of a file we've seen before.
-//   Thus, we can immediately create a new inode, and we can
+// - When we first discover a new file size (+hash), we know for
+//   sure that this file is *not* a duplicate of a file we've seen
+//   before. Thus, we can immediately create a new inode, and we can
 //   immediately start similarity scanning for this inode.
 //
-// - When we discover the second file of particular size, we must
-//   hash both files to see if they're identical. We already have
-//   an inode for the first file, so we must delay the creation of
-//   a new inode until we know that the second file is not a
-//   duplicate.
+// - When we discover the second file of a particular size (+hash),
+//   we must fully hash both files (using the user-provided algorithm)
+//   to see if they're identical. We already have an inode for the
+//   first file, so we must delay the creation of a new inode until
+//   we know that the second file is not a duplicate.
 //
 // - Exactly the same applies for subsequent files.
 //
@@ -129,6 +143,15 @@
 // stored. As long as the first file's hash has not been stored,
 // it is still present in `unique_size_`. It will be removed
 // from `unique_size_` after its hash has been stored.
+//
+// - The optional hash value of the first 4 KiB of a large file is
+//   useful if there are a lot of large files with the same size.
+//   One potential scenario is uncompressed images which are very
+//   likely to have the same size, but very unlikely to have the
+//   same contents. The choice of 4 KiB is arbitrary, as is the
+//   threshold of 1 MiB for "large files". The 4 KiB hash is computed
+//   synchronously, so this could be a potential bottleneck; however,
+//   it should happen rarely enough to not be a problem.
 
 template <typename LoggerPolicy>
 file_scanner_<LoggerPolicy>::file_scanner_(
@@ -143,6 +166,8 @@ file_scanner_<LoggerPolicy>::file_scanner_(
 
 template <typename LoggerPolicy>
 void file_scanner_<LoggerPolicy>::scan(file* p) {
+  // This method is supposed to be called from a single thread only.
+
   if (p->num_hard_links() > 1) {
     auto& vec = hardlinks_[p->raw_inode_num()];
     vec.push_back(p);
@@ -178,11 +203,12 @@ void file_scanner_<LoggerPolicy>::finalize(uint32_t& inode_num) {
 
   if (hash_algo_) {
     finalize_hardlinks([this](file const* p) -> inode::files_vector& {
-      auto it = by_hash_.find(p->hash());
-      if (it != by_hash_.end()) {
+      if (auto it = by_hash_.find(p->hash()); it != by_hash_.end()) {
         return it->second;
       }
-      return unique_size_.at(p->size());
+      auto it = file_start_hash_.find(p);
+      uint64_t hash = it != file_start_hash_.end() ? it->second : 0;
+      return unique_size_.at({p->size(), hash});
     });
     finalize_files(unique_size_, inode_num, obj_num);
     finalize_files(by_raw_inode_, inode_num, obj_num);
@@ -199,8 +225,26 @@ template <typename LoggerPolicy>
 void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
   // We need no lock yet, as `unique_size_` is only manipulated from
   // this thread.
-  auto size = p->size();
-  auto [it, is_new] = unique_size_.emplace(size, inode::files_vector());
+  uint64_t size = p->size();
+  uint64_t start_hash{0};
+
+  if (size >= kLargeFileThreshold && !p->is_invalid()) {
+    try {
+      auto mm = os_.map_file(p->fs_path(), kLargeFileStartHashSize);
+      checksum cs(checksum::algorithm::XXH3_64);
+      cs.update(mm->addr(), kLargeFileStartHashSize);
+      cs.finalize(&start_hash);
+    } catch (...) {
+      LOG_ERROR << "failed to map file " << p->path_as_string() << ": "
+                << folly::exceptionStr(std::current_exception())
+                << ", creating empty file";
+      ++prog_.errors;
+      p->set_invalid();
+    }
+  }
+
+  auto [it, is_new] = unique_size_.emplace(std::make_pair(size, start_hash),
+                                           inode::files_vector());
 
   if (is_new) {
     // A file size that has never been seen before. We can safely
@@ -263,6 +307,8 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
       cv->notify();
     });
 
+    // Clear files vector, but don't delete the hash table entry,
+    // to indicate that files of this size *must* be hashed.
     it->second.clear();
   }
 
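
Side note (illustration, not part of the patch): the (size, start-hash)
grouping above is easy to experiment with outside of dwarfs. The sketch
below shows the same idea as a stand-alone program: files are grouped by
(size, XXH3 hash of the first 4 KiB), and only groups with more than one
member need a full content hash. It assumes libxxhash (XXH3_64bits());
read_start_hash() and the std::map are illustrative stand-ins, not dwarfs
APIs, and error handling is reduced to a stderr message instead of the
patch's set_invalid() logic.

  // Stand-alone sketch: group candidate duplicates by (size, start hash).
  #include <cstdint>
  #include <filesystem>
  #include <fstream>
  #include <iostream>
  #include <map>
  #include <system_error>
  #include <utility>
  #include <vector>

  #include <xxhash.h> // libxxhash; provides XXH3_64bits()

  namespace fs = std::filesystem;

  constexpr std::uintmax_t kLargeFileThreshold = 1024 * 1024; // 1 MiB
  constexpr std::size_t kLargeFileStartHashSize = 4096;       // 4 KiB

  // Hash the first (up to) 4 KiB of a large file; small files keep 0,
  // so they are effectively grouped by size alone, as in the patch.
  std::uint64_t read_start_hash(fs::path const& p, std::uintmax_t size) {
    if (size < kLargeFileThreshold) {
      return 0;
    }
    char buf[kLargeFileStartHashSize];
    std::ifstream ifs(p, std::ios::binary);
    ifs.read(buf, sizeof(buf));
    return XXH3_64bits(buf, static_cast<std::size_t>(ifs.gcount()));
  }

  int main(int argc, char** argv) {
    // Key: (file size, start hash). Only keys with more than one file
    // are duplicate candidates that need a full content hash.
    std::map<std::pair<std::uint64_t, std::uint64_t>,
             std::vector<fs::path>>
        groups;

    for (int i = 1; i < argc; ++i) {
      fs::path p{argv[i]};
      std::error_code ec;
      auto size = fs::file_size(p, ec);
      if (ec) {
        std::cerr << "failed to stat " << p << ": " << ec.message() << "\n";
        continue;
      }
      groups[{size, read_start_hash(p, size)}].push_back(p);
    }

    for (auto const& [key, files] : groups) {
      std::cout << "size=" << key.first << " start_hash=" << key.second
                << " -> " << files.size()
                << (files.size() > 1 ? " file(s), full hash needed\n"
                                     : " file(s), unique\n");
    }
    return 0;
  }

Build with e.g. "g++ -std=c++17 -o dedupe_sketch dedupe_sketch.cpp
-lxxhash". As in the patch, equal keys only mark candidates: a full
content hash is still required to confirm actual duplicates, and a rare
start-hash collision merely causes one unnecessary full hash.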