From 4168649c2f61dd8b30a9f0d6e1e81e3e8bff16be Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Mon, 14 Apr 2025 09:07:56 +0200 Subject: [PATCH] refactor(file_scanner): phmap is at least as fast as F14 (and smaller) --- src/writer/internal/file_scanner.cpp | 40 +++++++++++++++------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/src/writer/internal/file_scanner.cpp b/src/writer/internal/file_scanner.cpp index 30a8a676..6ccd6e82 100644 --- a/src/writer/internal/file_scanner.cpp +++ b/src/writer/internal/file_scanner.cpp @@ -28,10 +28,11 @@ #include #include -#include #include +#include + #include #include @@ -74,6 +75,9 @@ class file_scanner_ final : public file_scanner::impl { void dump(std::ostream& os) const override; private: + template + using fast_map_type = phmap::flat_hash_map; + void scan_dedupe(file* p); void hash_file(file* p); void add_inode(file* p, int lineno); @@ -82,7 +86,7 @@ class file_scanner_ final : public file_scanner::impl { void finalize_hardlinks(Lookup const& lookup); template - void finalize_files(folly::F14FastMap& fmap, + void finalize_files(fast_map_type& fmap, uint32_t& inode_num, uint32_t& obj_num); template @@ -128,20 +132,20 @@ class file_scanner_ final : public file_scanner::impl { progress& prog_; file_scanner::options const opts_; uint32_t num_unique_{0}; - folly::F14FastMap hardlinks_; + fast_map_type hardlinks_; std::mutex mutable mx_; // The pair stores the file size and optionally a hash of the first // 4 KiB of the file. If there's a collision, the worst that can // happen is that we unnecessary hash a file that is not a duplicate. - folly::F14FastMap, inode::files_vector> + fast_map_type, inode::files_vector> unique_size_; // We need this lookup table to later find the unique_size_ entry // given just a file pointer.
- folly::F14FastMap file_start_hash_; - folly::F14FastMap, std::shared_ptr> + fast_map_type file_start_hash_; + fast_map_type, std::shared_ptr> first_file_hashed_; - folly::F14FastMap by_raw_inode_; - folly::F14FastMap by_hash_; + fast_map_type by_raw_inode_; + fast_map_type by_hash_; struct inode_create_info { inode const* i; @@ -468,22 +472,22 @@ void file_scanner_::finalize_hardlinks(Lookup const& lookup) { template template void file_scanner_::finalize_files( - folly::F14FastMap& fmap, uint32_t& inode_num, + fast_map_type& fmap, uint32_t& inode_num, uint32_t& obj_num) { std::vector> ent; auto tv = LOG_TIMED_VERBOSE; ent.reserve(fmap.size()); - fmap.eraseInto( - fmap.begin(), fmap.end(), [&ent](KeyType&& k, inode::files_vector&& fv) { - if (!fv.empty()) { - if constexpr (UniqueOnly) { - DWARFS_CHECK(fv.size() == fv.front()->refcount(), "internal error"); - } - ent.emplace_back(std::move(k), std::move(fv)); - } - }); + for (auto& [k, fv] : fmap) { + if (!fv.empty()) { + if constexpr (UniqueOnly) { + DWARFS_CHECK(fv.size() == fv.front()->refcount(), "internal error"); + } + ent.emplace_back(std::move(k), std::move(fv)); + } + } + fmap.clear(); std::sort(ent.begin(), ent.end(), [](auto& left, auto& right) { return left.first < right.first; });