Mirror of https://github.com/mhx/dwarfs.git, synced 2025-09-09 04:19:10 -04:00

Factor out file_scanner

This commit is contained in:
parent b41a400e32
commit c2e3cdfecb
CMakeLists.txt
@@ -302,6 +302,7 @@ list(
  src/dwarfs/console_writer.cpp
  src/dwarfs/entry.cpp
  src/dwarfs/error.cpp
+ src/dwarfs/file_scanner.cpp
  src/dwarfs/filesystem_extractor.cpp
  src/dwarfs/filesystem_v2.cpp
  src/dwarfs/filesystem_writer.cpp
include/dwarfs/file_scanner.h  (new file, 64 lines)
@@ -0,0 +1,64 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
 * \author Marcus Holland-Moritz (github@mhxnet.de)
 * \copyright Copyright (c) Marcus Holland-Moritz
 *
 * This file is part of dwarfs.
 *
 * dwarfs is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * dwarfs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
 */

#pragma once

#include <cstdint>
#include <memory>
#include <optional>
#include <string>

namespace dwarfs {

class file;
class inode_manager;
class os_access;
class progress;
class worker_group;

struct inode_options;

namespace detail {

class file_scanner {
 public:
  file_scanner(worker_group& wg, os_access& os, inode_manager& im,
               inode_options const& ino_opts,
               std::optional<std::string> const& hash_algo, progress& prog);

  void scan(file* p) { impl_->scan(p); }
  void finalize(uint32_t& inode_num) { impl_->finalize(inode_num); }
  uint32_t num_unique() const { return impl_->num_unique(); }

  class impl {
   public:
    virtual ~impl() = default;

    virtual void scan(file* p) = 0;
    virtual void finalize(uint32_t& inode_num) = 0;
    virtual uint32_t num_unique() const = 0;
  };

 private:
  std::unique_ptr<impl> impl_;
};

} // namespace detail
} // namespace dwarfs
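The new header is a thin pimpl facade: the three public member functions forward to `file_scanner::impl`, so the folly containers and synchronization machinery stay out of the header. As a rough sketch of how a caller drives it, modeled on the `scanner_` changes at the bottom of this diff (the `scan_all` wrapper, the pre-collected `files` vector, and the `"xxh3-128"` algorithm name are illustrative assumptions, not part of the commit):

#include <cstdint>
#include <optional>
#include <string>
#include <vector>

#include "dwarfs/file_scanner.h"

namespace dwarfs {

// Hypothetical driver; wg/os/im/opts/prog come from the application, and
// `files` stands in for the entries produced by a directory walk.
void scan_all(worker_group& wg, os_access& os, inode_manager& im,
              inode_options const& opts, progress& prog,
              std::vector<file*> const& files) {
  detail::file_scanner fs(wg, os, im, opts,
                          std::optional<std::string>{"xxh3-128"}, prog);

  for (auto* f : files) {
    fs.scan(f); // may enqueue hashing/similarity jobs on the worker group
  }

  uint32_t inode_num = 0;
  fs.finalize(inode_num); // assigns inode and object numbers
}

} // namespace dwarfs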
src/dwarfs/file_scanner.cpp  (new file, 426 lines)
@@ -0,0 +1,426 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
 * \author Marcus Holland-Moritz (github@mhxnet.de)
 * \copyright Copyright (c) Marcus Holland-Moritz
 *
 * This file is part of dwarfs.
 *
 * dwarfs is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * dwarfs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
 */

#include <algorithm>
#include <cassert>
#include <condition_variable>
#include <mutex>
#include <string_view>
#include <vector>

#include <folly/container/F14Map.h>

#include "dwarfs/entry.h"
#include "dwarfs/file_scanner.h"
#include "dwarfs/inode.h"
#include "dwarfs/inode_manager.h"
#include "dwarfs/logger.h"
#include "dwarfs/options.h"
#include "dwarfs/os_access.h"
#include "dwarfs/progress.h"
#include "dwarfs/worker_group.h"

namespace dwarfs::detail {

namespace {

class file_scanner_ : public file_scanner::impl {
 public:
  file_scanner_(worker_group& wg, os_access& os, inode_manager& im,
                inode_options const& ino_opts,
                std::optional<std::string> const& hash_algo, progress& prog);

  void scan(file* p) override;
  void finalize(uint32_t& inode_num) override;

  uint32_t num_unique() const override { return num_unique_; }

 private:
  class condition_barrier {
   public:
    void set() { ready_ = true; }

    void notify() { cv_.notify_all(); }

    void wait(std::unique_lock<std::mutex>& lock) {
      cv_.wait(lock, [this] { return ready_; });
    }

   private:
    std::condition_variable cv_;
    bool ready_{false};
  };

  void scan_dedupe(file* p);
  void hash_file(file* p);
  void add_inode(file* p);

  template <typename Lookup>
  void finalize_hardlinks(Lookup&& lookup);

  template <bool UniqueOnly = false, typename KeyType>
  void finalize_files(folly::F14FastMap<KeyType, inode::files_vector>& fmap,
                      uint32_t& inode_num, uint32_t& obj_num);

  template <bool Unique, typename KeyType>
  void
  finalize_inodes(std::vector<std::pair<KeyType, inode::files_vector>>& ent,
                  uint32_t& inode_num, uint32_t& obj_num);

  worker_group& wg_;
  os_access& os_;
  inode_manager& im_;
  inode_options const& ino_opts_;
  std::optional<std::string> const hash_algo_;
  progress& prog_;
  uint32_t num_unique_{0};
  folly::F14FastMap<uint64_t, inode::files_vector> hardlinks_;
  std::mutex mx_;
  folly::F14FastMap<uint64_t, inode::files_vector> unique_size_;
  folly::F14FastMap<uint64_t, std::shared_ptr<condition_barrier>>
      first_file_hashed_;
  folly::F14FastMap<uint64_t, inode::files_vector> by_raw_inode_;
  folly::F14FastMap<std::string_view, inode::files_vector> by_hash_;
};
// The `unique_size_` table holds an entry for each file size we
// discover:
//
// - When we first discover a new file size, we know for sure that
//   this file is *not* a duplicate of a file we've seen before.
//   Thus, we can immediately create a new inode, and we can
//   immediately start similarity scanning for this inode.
//
// - When we discover the second file of a particular size, we must
//   hash both files to see if they're identical. We already have
//   an inode for the first file, so we must delay the creation of
//   a new inode until we know that the second file is not a
//   duplicate.
//
// - Exactly the same applies for subsequent files.
//
// - We must ensure that the presence of a hash is checked in
//   `by_hash_` for subsequent files only if the first file's
//   hash has been computed and stored. Otherwise, if a subsequent
//   file's hash computation finishes before the first file, we
//   assume (potentially wrongly) that the subsequent file is not
//   a duplicate.
//
// - So subsequent files must wait for the first file unless we
//   know up front that the first file's hash has already been
//   stored. As long as the first file's hash has not been stored,
//   it is still present in `unique_size_`. It will be removed
//   from `unique_size_` after its hash has been stored.
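The ordering constraint described above boils down to a one-shot barrier: the first hasher publishes its result under the mutex, then every waiter proceeds. A minimal, self-contained sketch of that pattern (not dwarfs code; all names invented for illustration):

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

// Same shape as condition_barrier below: set() is called while holding the
// mutex, notify() after releasing it, and wait() uses a predicate so that a
// spurious wakeup or a waiter arriving late cannot slip past.
struct one_shot_barrier {
  void set() { ready_ = true; }       // call while holding the mutex
  void notify() { cv_.notify_all(); } // call after releasing the mutex
  void wait(std::unique_lock<std::mutex>& lock) {
    cv_.wait(lock, [this] { return ready_; });
  }
  std::condition_variable cv_;
  bool ready_{false};
};

int main() {
  std::mutex mx;
  one_shot_barrier barrier;
  int first_hash = 0; // stands in for the first file's `by_hash_` entry

  std::thread first([&] {
    {
      std::lock_guard lock(mx);
      first_hash = 42; // "store the first file's hash"
      barrier.set();
    }
    barrier.notify();
  });

  std::thread second([&] {
    std::unique_lock lock(mx);
    barrier.wait(lock); // blocks until the first hash is visible
    std::cout << "second file compares against " << first_hash << '\n';
  });

  first.join();
  second.join();
}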
file_scanner_::file_scanner_(worker_group& wg, os_access& os, inode_manager& im,
                             inode_options const& ino_opts,
                             std::optional<std::string> const& hash_algo,
                             progress& prog)
    : wg_(wg)
    , os_(os)
    , im_(im)
    , ino_opts_(ino_opts)
    , hash_algo_{hash_algo}
    , prog_(prog) {}

void file_scanner_::scan(file* p) {
  if (p->num_hard_links() > 1) {
    auto& vec = hardlinks_[p->raw_inode_num()];
    vec.push_back(p);

    if (vec.size() > 1) {
      p->hardlink(vec[0], prog_);
      ++prog_.files_scanned;
      return;
    }
  }

  p->create_data();

  prog_.original_size += p->size();

  if (hash_algo_) {
    scan_dedupe(p);
  } else {
    prog_.current.store(p);
    p->scan(nullptr, prog_, hash_algo_); // TODO

    by_raw_inode_[p->raw_inode_num()].push_back(p);

    add_inode(p);
  }
}

void file_scanner_::finalize(uint32_t& inode_num) {
  uint32_t obj_num = 0;

  assert(first_file_hashed_.empty());

  if (hash_algo_) {
    finalize_hardlinks([this](file const* p) -> inode::files_vector& {
      auto it = by_hash_.find(p->hash());
      if (it != by_hash_.end()) {
        return it->second;
      }
      return unique_size_.at(p->size());
    });
    finalize_files<true>(unique_size_, inode_num, obj_num);
    finalize_files(by_hash_, inode_num, obj_num);
  } else {
    finalize_hardlinks([this](file const* p) -> inode::files_vector& {
      return by_raw_inode_.at(p->raw_inode_num());
    });
    finalize_files(by_raw_inode_, inode_num, obj_num);
  }
}

void file_scanner_::scan_dedupe(file* p) {
  // We need no lock yet, as `unique_size_` is only manipulated from
  // this thread.
  auto size = p->size();
  auto [it, is_new] = unique_size_.emplace(size, inode::files_vector());

  if (is_new) {
    // A file size that has never been seen before. We can safely
    // create a new inode and we'll keep track of the file.
    it->second.push_back(p);

    {
      std::lock_guard lock(mx_);
      add_inode(p);
    }
  } else {
    // This file size has been seen before, so this is potentially
    // a duplicate.

    std::shared_ptr<condition_barrier> cv;

    if (it->second.empty()) {
      // This is the third or any later file of this size.
      std::lock_guard lock(mx_);

      if (auto ffi = first_file_hashed_.find(size);
          ffi != first_file_hashed_.end()) {
        cv = ffi->second;
      }
    } else {
      // This is the second file of this size. We now need to hash
      // both the first and second file and ensure that the first
      // file's hash is stored to `by_hash_` first. We set up a
      // condition variable to synchronize insertion into `by_hash_`.

      cv = std::make_shared<condition_barrier>();

      {
        std::lock_guard lock(mx_);
        first_file_hashed_.emplace(size, cv);
      }

      // Add a job for the first file
      wg_.add_job([this, p = it->second.front(), cv] {
        hash_file(p);

        {
          std::lock_guard lock(mx_);

          auto& ref = by_hash_[p->hash()];

          assert(ref.empty());
          assert(p->get_inode());

          ref.push_back(p);

          cv->set();

          first_file_hashed_.erase(p->size());
        }

        cv->notify();
      });

      it->second.clear();
    }

    // Add a job for any subsequent files
    wg_.add_job([this, p, cv] {
      hash_file(p);

      {
        std::unique_lock lock(mx_);

        if (cv) {
          // Wait until the first file of this size has been added to
          // `by_hash_`.
          cv->wait(lock);
        }

        auto& ref = by_hash_[p->hash()];

        if (ref.empty()) {
          // This is *not* a duplicate. We must allocate a new inode.
          add_inode(p);
        } else {
          auto inode = ref.front()->get_inode();
          assert(inode);
          p->set_inode(inode);
          ++prog_.files_scanned;
          ++prog_.duplicate_files;
          prog_.saved_by_deduplication += p->size();
        }

        ref.push_back(p);
      }
    });
  }
}

void file_scanner_::hash_file(file* p) {
  auto const size = p->size();
  std::shared_ptr<mmif> mm;

  if (size > 0) {
    mm = os_.map_file(p->path(), size);
  }

  prog_.current.store(p);
  p->scan(mm, prog_, hash_algo_);
}

void file_scanner_::add_inode(file* p) {
  assert(!p->get_inode());

  auto inode = im_.create_inode();

  p->set_inode(inode);

  if (ino_opts_.needs_scan()) {
    wg_.add_job([this, p, inode = std::move(inode)] {
      std::shared_ptr<mmif> mm;
      auto const size = p->size();
      if (size > 0) {
        mm = os_.map_file(p->path(), size);
      }
      inode->scan(mm, ino_opts_);
      ++prog_.similarity_scans;
      prog_.similarity_bytes += size;
      ++prog_.inodes_scanned;
      ++prog_.files_scanned;
    });
  } else {
    ++prog_.inodes_scanned;
    ++prog_.files_scanned;
  }
}

template <typename Lookup>
void file_scanner_::finalize_hardlinks(Lookup&& lookup) {
  for (auto& kv : hardlinks_) {
    auto& hlv = kv.second;
    if (hlv.size() > 1) {
      auto& fv = lookup(hlv.front());
      // TODO: for (auto p : hlv | std::views::drop(1))
      std::for_each(hlv.begin() + 1, hlv.end(), [&fv](auto p) {
        p->set_inode(fv.front()->get_inode());
        fv.push_back(p);
      });
    }
  }

  hardlinks_.clear();
}
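The TODO above points at the C++20 ranges form of the same loop. A standalone sketch of that idiom, with plain ints standing in for the file pointers (assumes a compiler with <ranges> support):

#include <ranges>
#include <vector>

int main() {
  std::vector<int> hlv{1, 2, 3, 4};    // "hardlink group"
  std::vector<int> fv{hlv.front()};    // "files vector" seeded with the first

  // Equivalent of std::for_each(hlv.begin() + 1, hlv.end(), ...):
  // visit every element except the first.
  for (auto p : hlv | std::views::drop(1)) {
    fv.push_back(p);
  }
  // fv now holds {1, 2, 3, 4}
}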
template <bool UniqueOnly, typename KeyType>
void file_scanner_::finalize_files(
    folly::F14FastMap<KeyType, inode::files_vector>& fmap, uint32_t& inode_num,
    uint32_t& obj_num) {
  std::vector<std::pair<KeyType, inode::files_vector>> ent;
  ent.reserve(fmap.size());
  fmap.eraseInto(
      fmap.begin(), fmap.end(), [&ent](KeyType&& k, inode::files_vector&& fv) {
        if (!fv.empty()) {
          if constexpr (UniqueOnly) {
            DWARFS_CHECK(fv.size() == fv.front()->refcount(), "internal error");
          }
          ent.emplace_back(std::move(k), std::move(fv));
        }
      });
  std::sort(ent.begin(), ent.end(),
            [](auto& left, auto& right) { return left.first < right.first; });

  DWARFS_CHECK(fmap.empty(), "expected file map to be empty");

  finalize_inodes<true>(ent, inode_num, obj_num);
  if constexpr (!UniqueOnly) {
    finalize_inodes<false>(ent, inode_num, obj_num);
  }
}
template <bool Unique, typename KeyType>
void file_scanner_::finalize_inodes(
    std::vector<std::pair<KeyType, inode::files_vector>>& ent,
    uint32_t& inode_num, uint32_t& obj_num) {
  for (auto& p : ent) {
    auto& files = p.second;

    if constexpr (Unique) {
      // this is true regardless of how the files are ordered
      if (files.size() > files.front()->refcount()) {
        continue;
      }

      ++num_unique_;
    } else {
      if (files.empty()) {
        continue;
      }

      DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file");
    }

    // this isn't strictly necessary, but helps metadata compression
    std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
      return a->path() < b->path();
    });

    for (auto fp : files) {
      // need to check because hardlinks share the same number
      if (!fp->inode_num()) {
        fp->set_inode_num(inode_num);
        ++inode_num;
      }
    }

    auto fp = files.front();
    auto inode = fp->get_inode();
    assert(inode);
    inode->set_num(obj_num);
    inode->set_files(std::move(files));

    ++obj_num;
  }
}

} // namespace

file_scanner::file_scanner(worker_group& wg, os_access& os, inode_manager& im,
                           inode_options const& ino_opts,
                           std::optional<std::string> const& hash_algo,
                           progress& prog)
    : impl_{std::make_unique<file_scanner_>(wg, os, im, ino_opts, hash_algo,
                                            prog)} {}

} // namespace dwarfs::detail
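Taken together, finalize_files drains each hash map into a key-sorted vector (so numbering no longer depends on hash-map iteration order) and then numbers unique inodes before duplicate groups via the two finalize_inodes passes. A toy model of that two-pass numbering, with invented types standing in for the file groups (not dwarfs code):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // key -> group of identical files; a group of size 1 is "unique"
  std::map<uint64_t, std::vector<std::string>> groups{
      {100, {"a"}}, {200, {"b", "c"}}, {300, {"d"}}};

  uint32_t obj_num = 0;
  // pass 1: unique groups get the low object numbers
  for (auto& [key, g] : groups)
    if (g.size() == 1) std::cout << key << " -> " << obj_num++ << '\n';
  // pass 2: duplicate groups follow
  for (auto& [key, g] : groups)
    if (g.size() > 1) std::cout << key << " -> " << obj_num++ << '\n';
  // prints: 100 -> 0, 300 -> 1, 200 -> 2
}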
src/dwarfs/scanner.cpp
@@ -24,13 +24,10 @@
#include <cstring>
#include <ctime>
#include <deque>
#include <iostream>
#include <iterator>
#include <mutex>
#include <numeric>
#include <stdexcept>
#include <string>
#include <string_view>
#include <system_error>
#include <utility>
#include <vector>
@@ -38,13 +35,13 @@
#include <unistd.h>

#include <folly/ExceptionString.h>
#include <folly/container/F14Map.h>

#include <fmt/format.h>

#include "dwarfs/block_data.h"
#include "dwarfs/entry.h"
#include "dwarfs/error.h"
+#include "dwarfs/file_scanner.h"
#include "dwarfs/filesystem_writer.h"
#include "dwarfs/global_entry_data.h"
#include "dwarfs/inode.h"
@@ -75,358 +72,6 @@ class visitor_base : public entry_visitor {
   void visit(device*) override {}
 };

-class file_scanner {
- public:
-  file_scanner(worker_group& wg, os_access& os, inode_manager& im,
-               inode_options const& ino_opts,
-               std::optional<std::string> const& hash_algo, progress& prog)
-      : wg_(wg)
-      , os_(os)
-      , im_(im)
-      , ino_opts_(ino_opts)
-      , hash_algo_{hash_algo}
-      , prog_(prog) {}
-
-  void scan(file* p) {
-    if (p->num_hard_links() > 1) {
-      auto& vec = hardlinks_[p->raw_inode_num()];
-      vec.push_back(p);
-
-      if (vec.size() > 1) {
-        p->hardlink(vec[0], prog_);
-        ++prog_.files_scanned;
-        return;
-      }
-    }
-
-    p->create_data();
-
-    prog_.original_size += p->size();
-
-    if (hash_algo_) {
-      scan_dedupe(p);
-    } else {
-      prog_.current.store(p);
-      p->scan(nullptr, prog_, hash_algo_); // TODO
-
-      by_raw_inode_[p->raw_inode_num()].push_back(p);
-
-      add_inode(p);
-    }
-  }
-
-  void finalize(uint32_t& inode_num) {
-    uint32_t obj_num = 0;
-
-    assert(first_file_hashed_.empty());
-
-    if (hash_algo_) {
-      finalize_hardlinks([this](file const* p) -> inode::files_vector& {
-        auto it = by_hash_.find(p->hash());
-        if (it != by_hash_.end()) {
-          return it->second;
-        }
-        return unique_size_.at(p->size());
-      });
-      finalize_files<true>(unique_size_, inode_num, obj_num);
-      finalize_files(by_hash_, inode_num, obj_num);
-    } else {
-      finalize_hardlinks([this](file const* p) -> inode::files_vector& {
-        return by_raw_inode_.at(p->raw_inode_num());
-      });
-      finalize_files(by_raw_inode_, inode_num, obj_num);
-    }
-  }
-
-  uint32_t num_unique() const { return num_unique_; }
-
- private:
-  class condition_barrier {
-   public:
-    void set() { ready_ = true; }
-
-    void notify() { cv_.notify_all(); }
-
-    void wait(std::unique_lock<std::mutex>& lock) {
-      cv_.wait(lock, [this] { return ready_; });
-    }
-
-   private:
-    std::condition_variable cv_;
-    bool ready_{false};
-  };
-
-  void scan_dedupe(file* p) {
-    // The `unique_size_` table holds an entry for each file size we
-    // discover:
-    //
-    // - When we first discover a new file size, we know for sure that
-    //   this file is *not* a duplicate of a file we've seen before.
-    //   Thus, we can immediately create a new inode, and we can
-    //   immediately start similarity scanning for this inode.
-    //
-    // - When we discover the second file of a particular size, we must
-    //   hash both files to see if they're identical. We already have
-    //   an inode for the first file, so we must delay the creation of
-    //   a new inode until we know that the second file is not a
-    //   duplicate.
-    //
-    // - Exactly the same applies for subsequent files.
-    //
-    // - We must ensure that the presence of a hash is checked in
-    //   `by_hash_` for subsequent files only if the first file's
-    //   hash has been computed and stored. Otherwise, if a subsequent
-    //   file's hash computation finishes before the first file, we
-    //   assume (potentially wrongly) that the subsequent file is not
-    //   a duplicate.
-    //
-    // - So subsequent files must wait for the first file unless we
-    //   know up front that the first file's hash has already been
-    //   stored. As long as the first file's hash has not been stored,
-    //   it is still present in `unique_size_`. It will be removed
-    //   from `unique_size_` after its hash has been stored.
-
-    // We need no lock yet, as `unique_size_` is only manipulated from
-    // this thread.
-    auto size = p->size();
-    auto [it, is_new] = unique_size_.emplace(size, inode::files_vector());
-
-    if (is_new) {
-      // A file size that has never been seen before. We can safely
-      // create a new inode and we'll keep track of the file.
-      it->second.push_back(p);
-
-      {
-        std::lock_guard lock(mx_);
-        add_inode(p);
-      }
-    } else {
-      // This file size has been seen before, so this is potentially
-      // a duplicate.
-
-      std::shared_ptr<condition_barrier> cv;
-
-      if (it->second.empty()) {
-        // This is the third or any later file of this size.
-        std::lock_guard lock(mx_);
-
-        if (auto ffi = first_file_hashed_.find(size);
-            ffi != first_file_hashed_.end()) {
-          cv = ffi->second;
-        }
-      } else {
-        // This is the second file of this size. We now need to hash
-        // both the first and second file and ensure that the first
-        // file's hash is stored to `by_hash_` first. We set up a
-        // condition variable to synchronize insertion into `by_hash_`.
-
-        cv = std::make_shared<condition_barrier>();
-
-        {
-          std::lock_guard lock(mx_);
-          first_file_hashed_.emplace(size, cv);
-        }
-
-        // Add a job for the first file
-        wg_.add_job([this, p = it->second.front(), cv] {
-          hash_file(p);
-
-          {
-            std::lock_guard lock(mx_);
-
-            auto& ref = by_hash_[p->hash()];
-
-            assert(ref.empty());
-            assert(p->get_inode());
-
-            ref.push_back(p);
-
-            cv->set();
-
-            first_file_hashed_.erase(p->size());
-          }
-
-          cv->notify();
-        });
-
-        it->second.clear();
-      }
-
-      // Add a job for any subsequent files
-      wg_.add_job([this, p, cv] {
-        hash_file(p);
-
-        {
-          std::unique_lock lock(mx_);
-
-          if (cv) {
-            // Wait until the first file of this size has been added to
-            // `by_hash_`.
-            cv->wait(lock);
-          }
-
-          auto& ref = by_hash_[p->hash()];
-
-          if (ref.empty()) {
-            // This is *not* a duplicate. We must allocate a new inode.
-            add_inode(p);
-          } else {
-            auto inode = ref.front()->get_inode();
-            assert(inode);
-            p->set_inode(inode);
-            ++prog_.files_scanned;
-            ++prog_.duplicate_files;
-            prog_.saved_by_deduplication += p->size();
-          }
-
-          ref.push_back(p);
-        }
-      });
-    }
-  }
-
-  void hash_file(file* p) {
-    auto const size = p->size();
-    std::shared_ptr<mmif> mm;
-
-    if (size > 0) {
-      mm = os_.map_file(p->path(), size);
-    }
-
-    prog_.current.store(p);
-    p->scan(mm, prog_, hash_algo_);
-  }
-
-  void add_inode(file* p) {
-    assert(!p->get_inode());
-
-    auto inode = im_.create_inode();
-
-    p->set_inode(inode);
-
-    if (ino_opts_.needs_scan()) {
-      wg_.add_job([this, p, inode = std::move(inode)] {
-        std::shared_ptr<mmif> mm;
-        auto const size = p->size();
-        if (size > 0) {
-          mm = os_.map_file(p->path(), size);
-        }
-        inode->scan(mm, ino_opts_);
-        ++prog_.similarity_scans;
-        prog_.similarity_bytes += size;
-        ++prog_.inodes_scanned;
-        ++prog_.files_scanned;
-      });
-    } else {
-      ++prog_.inodes_scanned;
-      ++prog_.files_scanned;
-    }
-  }
-
-  template <typename Lookup>
-  void finalize_hardlinks(Lookup&& lookup) {
-    for (auto& kv : hardlinks_) {
-      auto& hlv = kv.second;
-      if (hlv.size() > 1) {
-        auto& fv = lookup(hlv.front());
-        // TODO: for (auto p : hlv | std::views::drop(1))
-        std::for_each(hlv.begin() + 1, hlv.end(), [&fv](auto p) {
-          p->set_inode(fv.front()->get_inode());
-          fv.push_back(p);
-        });
-      }
-    }
-
-    hardlinks_.clear();
-  }
-
-  template <bool UniqueOnly = false, typename KeyType>
-  void finalize_files(folly::F14FastMap<KeyType, inode::files_vector>& fmap,
-                      uint32_t& inode_num, uint32_t& obj_num) {
-    std::vector<std::pair<KeyType, inode::files_vector>> ent;
-    ent.reserve(fmap.size());
-    fmap.eraseInto(fmap.begin(), fmap.end(),
-                   [&ent](KeyType&& k, inode::files_vector&& fv) {
-                     if (!fv.empty()) {
-                       if constexpr (UniqueOnly) {
-                         DWARFS_CHECK(fv.size() == fv.front()->refcount(),
-                                      "internal error");
-                       }
-                       ent.emplace_back(std::move(k), std::move(fv));
-                     }
-                   });
-    std::sort(ent.begin(), ent.end(),
-              [](auto& left, auto& right) { return left.first < right.first; });
-
-    DWARFS_CHECK(fmap.empty(), "expected file map to be empty");
-
-    finalize_inodes<true>(ent, inode_num, obj_num);
-    if constexpr (!UniqueOnly) {
-      finalize_inodes<false>(ent, inode_num, obj_num);
-    }
-  }
-
-  template <bool Unique, typename KeyType>
-  void
-  finalize_inodes(std::vector<std::pair<KeyType, inode::files_vector>>& ent,
-                  uint32_t& inode_num, uint32_t& obj_num) {
-    for (auto& p : ent) {
-      auto& files = p.second;
-
-      if constexpr (Unique) {
-        // this is true regardless of how the files are ordered
-        if (files.size() > files.front()->refcount()) {
-          continue;
-        }
-
-        ++num_unique_;
-      } else {
-        if (files.empty()) {
-          continue;
-        }
-
-        DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file");
-      }
-
-      // this isn't strictly necessary, but helps metadata compression
-      std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
-        return a->path() < b->path();
-      });
-
-      for (auto fp : files) {
-        // need to check because hardlinks share the same number
-        if (!fp->inode_num()) {
-          fp->set_inode_num(inode_num);
-          ++inode_num;
-        }
-      }
-
-      auto fp = files.front();
-      auto inode = fp->get_inode();
-      assert(inode);
-      inode->set_num(obj_num);
-      inode->set_files(std::move(files));
-
-      ++obj_num;
-    }
-  }
-
-  worker_group& wg_;
-  os_access& os_;
-  inode_manager& im_;
-  inode_options const& ino_opts_;
-  std::optional<std::string> const hash_algo_;
-  progress& prog_;
-  uint32_t num_unique_{0};
-  folly::F14FastMap<uint64_t, inode::files_vector> hardlinks_;
-  std::mutex mx_;
-  folly::F14FastMap<uint64_t, inode::files_vector> unique_size_;
-  folly::F14FastMap<uint64_t, std::shared_ptr<condition_barrier>>
-      first_file_hashed_;
-  folly::F14FastMap<uint64_t, inode::files_vector> by_raw_inode_;
-  folly::F14FastMap<std::string_view, inode::files_vector> by_hash_;
-};
-
 class dir_set_inode_visitor : public visitor_base {
  public:
   explicit dir_set_inode_visitor(uint32_t& inode_num)
@@ -644,7 +289,7 @@ class scanner_ final : public scanner::impl {

 private:
  std::shared_ptr<entry>
-  scan_tree(const std::string& path, progress& prog, file_scanner& fs);
+  scan_tree(const std::string& path, progress& prog, detail::file_scanner& fs);

  const block_manager::config& cfg_;
  const scanner_options& options_;
@@ -675,7 +320,7 @@ scanner_<LoggerPolicy>::scanner_(logger& lgr, worker_group& wg,
template <typename LoggerPolicy>
std::shared_ptr<entry>
scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
-                                  file_scanner& fs) {
+                                  detail::file_scanner& fs) {
  auto root = entry_->create(*os_, path);
  bool const debug_filter = options_.debug_filter_function.has_value();
@@ -823,8 +468,8 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
  prog.set_status_function(status_string);

  inode_manager im(lgr_, prog);
-  file_scanner fs(wg_, *os_, im, options_.inode, options_.file_hash_algorithm,
-                  prog);
+  detail::file_scanner fs(wg_, *os_, im, options_.inode,
+                          options_.file_hash_algorithm, prog);

  auto root = scan_tree(path, prog, fs);