Mirror of https://github.com/mhx/dwarfs.git, synced 2025-09-09 04:19:10 -04:00

Factor out file_scanner

This commit is contained in:
parent b41a400e32
commit c2e3cdfecb
CMakeLists.txt
@@ -302,6 +302,7 @@ list(
  src/dwarfs/console_writer.cpp
  src/dwarfs/entry.cpp
  src/dwarfs/error.cpp
+ src/dwarfs/file_scanner.cpp
  src/dwarfs/filesystem_extractor.cpp
  src/dwarfs/filesystem_v2.cpp
  src/dwarfs/filesystem_writer.cpp
include/dwarfs/file_scanner.h  (new file, 64 lines)
@@ -0,0 +1,64 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
 * \author Marcus Holland-Moritz (github@mhxnet.de)
 * \copyright Copyright (c) Marcus Holland-Moritz
 *
 * This file is part of dwarfs.
 *
 * dwarfs is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * dwarfs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
 */

#pragma once

#include <cstdint>
#include <memory>
#include <optional>
#include <string>

namespace dwarfs {

class file;
class inode_manager;
class os_access;
class progress;
class worker_group;

struct inode_options;

namespace detail {

class file_scanner {
 public:
  file_scanner(worker_group& wg, os_access& os, inode_manager& im,
               inode_options const& ino_opts,
               std::optional<std::string> const& hash_algo, progress& prog);

  void scan(file* p) { impl_->scan(p); }
  void finalize(uint32_t& inode_num) { impl_->finalize(inode_num); }
  uint32_t num_unique() const { return impl_->num_unique(); }

  class impl {
   public:
    virtual ~impl() = default;

    virtual void scan(file* p) = 0;
    virtual void finalize(uint32_t& inode_num) = 0;
    virtual uint32_t num_unique() const = 0;
  };

 private:
  std::unique_ptr<impl> impl_;
};

} // namespace detail
} // namespace dwarfs
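The new header is a thin pimpl facade: the three public member functions forward to `file_scanner::impl`, so the folly containers and synchronization machinery stay out of the header. As a rough sketch of how a caller drives it, modeled on the `scanner_` changes at the bottom of this diff (the `scan_all` wrapper, the pre-collected `files` vector, and the `"xxh3-128"` algorithm name are illustrative assumptions, not part of the commit):

#include <cstdint>
#include <optional>
#include <string>
#include <vector>

#include "dwarfs/file_scanner.h"

namespace dwarfs {

// Hypothetical driver; wg/os/im/opts/prog come from the application, and
// `files` stands in for the entries produced by a directory walk.
void scan_all(worker_group& wg, os_access& os, inode_manager& im,
              inode_options const& opts, progress& prog,
              std::vector<file*> const& files) {
  detail::file_scanner fs(wg, os, im, opts,
                          std::optional<std::string>{"xxh3-128"}, prog);

  for (auto* f : files) {
    fs.scan(f); // may enqueue hashing/similarity jobs on the worker group
  }

  uint32_t inode_num = 0;
  fs.finalize(inode_num); // assigns inode and object numbers
}

} // namespace dwarfs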
src/dwarfs/file_scanner.cpp  (new file, 426 lines)
@@ -0,0 +1,426 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
 * \author Marcus Holland-Moritz (github@mhxnet.de)
 * \copyright Copyright (c) Marcus Holland-Moritz
 *
 * This file is part of dwarfs.
 *
 * dwarfs is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * dwarfs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
 */

#include <algorithm>
#include <cassert>
#include <condition_variable>
#include <mutex>
#include <string_view>
#include <vector>

#include <folly/container/F14Map.h>

#include "dwarfs/entry.h"
#include "dwarfs/file_scanner.h"
#include "dwarfs/inode.h"
#include "dwarfs/inode_manager.h"
#include "dwarfs/logger.h"
#include "dwarfs/options.h"
#include "dwarfs/os_access.h"
#include "dwarfs/progress.h"
#include "dwarfs/worker_group.h"

namespace dwarfs::detail {

namespace {

class file_scanner_ : public file_scanner::impl {
 public:
  file_scanner_(worker_group& wg, os_access& os, inode_manager& im,
                inode_options const& ino_opts,
                std::optional<std::string> const& hash_algo, progress& prog);

  void scan(file* p) override;
  void finalize(uint32_t& inode_num) override;

  uint32_t num_unique() const override { return num_unique_; }

 private:
  class condition_barrier {
   public:
    void set() { ready_ = true; }

    void notify() { cv_.notify_all(); }

    void wait(std::unique_lock<std::mutex>& lock) {
      cv_.wait(lock, [this] { return ready_; });
    }

   private:
    std::condition_variable cv_;
    bool ready_{false};
  };

  void scan_dedupe(file* p);
  void hash_file(file* p);
  void add_inode(file* p);

  template <typename Lookup>
  void finalize_hardlinks(Lookup&& lookup);

  template <bool UniqueOnly = false, typename KeyType>
  void finalize_files(folly::F14FastMap<KeyType, inode::files_vector>& fmap,
                      uint32_t& inode_num, uint32_t& obj_num);

  template <bool Unique, typename KeyType>
  void
  finalize_inodes(std::vector<std::pair<KeyType, inode::files_vector>>& ent,
                  uint32_t& inode_num, uint32_t& obj_num);

  worker_group& wg_;
  os_access& os_;
  inode_manager& im_;
  inode_options const& ino_opts_;
  std::optional<std::string> const hash_algo_;
  progress& prog_;
  uint32_t num_unique_{0};
  folly::F14FastMap<uint64_t, inode::files_vector> hardlinks_;
  std::mutex mx_;
  folly::F14FastMap<uint64_t, inode::files_vector> unique_size_;
  folly::F14FastMap<uint64_t, std::shared_ptr<condition_barrier>>
      first_file_hashed_;
  folly::F14FastMap<uint64_t, inode::files_vector> by_raw_inode_;
  folly::F14FastMap<std::string_view, inode::files_vector> by_hash_;
};
// The `unique_size_` table holds an entry for each file size we
// discover:
//
// - When we first discover a new file size, we know for sure that
//   this file is *not* a duplicate of a file we've seen before.
//   Thus, we can immediately create a new inode, and we can
//   immediately start similarity scanning for this inode.
//
// - When we discover the second file of a particular size, we must
//   hash both files to see if they're identical. We already have
//   an inode for the first file, so we must delay the creation of
//   a new inode until we know that the second file is not a
//   duplicate.
//
// - Exactly the same applies for subsequent files.
//
// - We must ensure that the presence of a hash is checked in
//   `by_hash_` for subsequent files only if the first file's
//   hash has been computed and stored. Otherwise, if a subsequent
//   file's hash computation finishes before the first file, we
//   assume (potentially wrongly) that the subsequent file is not
//   a duplicate.
//
// - So subsequent files must wait for the first file unless we
//   know up front that the first file's hash has already been
//   stored. As long as the first file's hash has not been stored,
//   it is still present in `unique_size_`. It will be removed
//   from `unique_size_` after its hash has been stored.
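The ordering constraint described above boils down to a one-shot barrier: the first hasher publishes its result under the mutex, then every waiter proceeds. A minimal, self-contained sketch of that pattern (not dwarfs code; all names invented for illustration):

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

// Same shape as condition_barrier below: set() is called while holding the
// mutex, notify() after releasing it, and wait() uses a predicate so that a
// spurious wakeup or a waiter arriving late cannot slip past.
struct one_shot_barrier {
  void set() { ready_ = true; }       // call while holding the mutex
  void notify() { cv_.notify_all(); } // call after releasing the mutex
  void wait(std::unique_lock<std::mutex>& lock) {
    cv_.wait(lock, [this] { return ready_; });
  }
  std::condition_variable cv_;
  bool ready_{false};
};

int main() {
  std::mutex mx;
  one_shot_barrier barrier;
  int first_hash = 0; // stands in for the first file's `by_hash_` entry

  std::thread first([&] {
    {
      std::lock_guard lock(mx);
      first_hash = 42; // "store the first file's hash"
      barrier.set();
    }
    barrier.notify();
  });

  std::thread second([&] {
    std::unique_lock lock(mx);
    barrier.wait(lock); // blocks until the first hash is visible
    std::cout << "second file compares against " << first_hash << '\n';
  });

  first.join();
  second.join();
}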
file_scanner_::file_scanner_(worker_group& wg, os_access& os, inode_manager& im,
                             inode_options const& ino_opts,
                             std::optional<std::string> const& hash_algo,
                             progress& prog)
    : wg_(wg)
    , os_(os)
    , im_(im)
    , ino_opts_(ino_opts)
    , hash_algo_{hash_algo}
    , prog_(prog) {}

void file_scanner_::scan(file* p) {
  if (p->num_hard_links() > 1) {
    auto& vec = hardlinks_[p->raw_inode_num()];
    vec.push_back(p);

    if (vec.size() > 1) {
      p->hardlink(vec[0], prog_);
      ++prog_.files_scanned;
      return;
    }
  }

  p->create_data();

  prog_.original_size += p->size();

  if (hash_algo_) {
    scan_dedupe(p);
  } else {
    prog_.current.store(p);
    p->scan(nullptr, prog_, hash_algo_); // TODO

    by_raw_inode_[p->raw_inode_num()].push_back(p);

    add_inode(p);
  }
}

void file_scanner_::finalize(uint32_t& inode_num) {
  uint32_t obj_num = 0;

  assert(first_file_hashed_.empty());

  if (hash_algo_) {
    finalize_hardlinks([this](file const* p) -> inode::files_vector& {
      auto it = by_hash_.find(p->hash());
      if (it != by_hash_.end()) {
        return it->second;
      }
      return unique_size_.at(p->size());
    });
    finalize_files<true>(unique_size_, inode_num, obj_num);
    finalize_files(by_hash_, inode_num, obj_num);
  } else {
    finalize_hardlinks([this](file const* p) -> inode::files_vector& {
      return by_raw_inode_.at(p->raw_inode_num());
    });
    finalize_files(by_raw_inode_, inode_num, obj_num);
  }
}

void file_scanner_::scan_dedupe(file* p) {
  // We need no lock yet, as `unique_size_` is only manipulated from
  // this thread.
  auto size = p->size();
  auto [it, is_new] = unique_size_.emplace(size, inode::files_vector());

  if (is_new) {
    // A file size that has never been seen before. We can safely
    // create a new inode and we'll keep track of the file.
    it->second.push_back(p);

    {
      std::lock_guard lock(mx_);
      add_inode(p);
    }
  } else {
    // This file size has been seen before, so this is potentially
    // a duplicate.

    std::shared_ptr<condition_barrier> cv;

    if (it->second.empty()) {
      // This is the third or any later file of this size.
      std::lock_guard lock(mx_);

      if (auto ffi = first_file_hashed_.find(size);
          ffi != first_file_hashed_.end()) {
        cv = ffi->second;
      }
    } else {
      // This is the second file of this size. We now need to hash
      // both the first and second file and ensure that the first
      // file's hash is stored to `by_hash_` first. We set up a
      // condition variable to synchronize insertion into `by_hash_`.

      cv = std::make_shared<condition_barrier>();

      {
        std::lock_guard lock(mx_);
        first_file_hashed_.emplace(size, cv);
      }

      // Add a job for the first file
      wg_.add_job([this, p = it->second.front(), cv] {
        hash_file(p);

        {
          std::lock_guard lock(mx_);

          auto& ref = by_hash_[p->hash()];

          assert(ref.empty());
          assert(p->get_inode());

          ref.push_back(p);

          cv->set();

          first_file_hashed_.erase(p->size());
        }

        cv->notify();
      });

      it->second.clear();
    }

    // Add a job for any subsequent files
    wg_.add_job([this, p, cv] {
      hash_file(p);

      {
        std::unique_lock lock(mx_);

        if (cv) {
          // Wait until the first file of this size has been added to
          // `by_hash_`.
          cv->wait(lock);
        }

        auto& ref = by_hash_[p->hash()];

        if (ref.empty()) {
          // This is *not* a duplicate. We must allocate a new inode.
          add_inode(p);
        } else {
          auto inode = ref.front()->get_inode();
          assert(inode);
          p->set_inode(inode);
          ++prog_.files_scanned;
          ++prog_.duplicate_files;
          prog_.saved_by_deduplication += p->size();
        }

        ref.push_back(p);
      }
    });
  }
}

void file_scanner_::hash_file(file* p) {
  auto const size = p->size();
  std::shared_ptr<mmif> mm;

  if (size > 0) {
    mm = os_.map_file(p->path(), size);
  }

  prog_.current.store(p);
  p->scan(mm, prog_, hash_algo_);
}

void file_scanner_::add_inode(file* p) {
  assert(!p->get_inode());

  auto inode = im_.create_inode();

  p->set_inode(inode);

  if (ino_opts_.needs_scan()) {
    wg_.add_job([this, p, inode = std::move(inode)] {
      std::shared_ptr<mmif> mm;
      auto const size = p->size();
      if (size > 0) {
        mm = os_.map_file(p->path(), size);
      }
      inode->scan(mm, ino_opts_);
      ++prog_.similarity_scans;
      prog_.similarity_bytes += size;
      ++prog_.inodes_scanned;
      ++prog_.files_scanned;
    });
  } else {
    ++prog_.inodes_scanned;
    ++prog_.files_scanned;
  }
}

template <typename Lookup>
void file_scanner_::finalize_hardlinks(Lookup&& lookup) {
  for (auto& kv : hardlinks_) {
    auto& hlv = kv.second;
    if (hlv.size() > 1) {
      auto& fv = lookup(hlv.front());
      // TODO: for (auto p : hlv | std::views::drop(1))
      std::for_each(hlv.begin() + 1, hlv.end(), [&fv](auto p) {
        p->set_inode(fv.front()->get_inode());
        fv.push_back(p);
      });
    }
  }

  hardlinks_.clear();
}
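The TODO above points at the C++20 ranges form of the same loop. A standalone sketch of that idiom, with plain ints standing in for the file pointers (assumes a compiler with <ranges> support):

#include <ranges>
#include <vector>

int main() {
  std::vector<int> hlv{1, 2, 3, 4};    // "hardlink group"
  std::vector<int> fv{hlv.front()};    // "files vector" seeded with the first

  // Equivalent of std::for_each(hlv.begin() + 1, hlv.end(), ...):
  // visit every element except the first.
  for (auto p : hlv | std::views::drop(1)) {
    fv.push_back(p);
  }
  // fv now holds {1, 2, 3, 4}
}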
template <bool UniqueOnly, typename KeyType>
void file_scanner_::finalize_files(
    folly::F14FastMap<KeyType, inode::files_vector>& fmap, uint32_t& inode_num,
    uint32_t& obj_num) {
  std::vector<std::pair<KeyType, inode::files_vector>> ent;
  ent.reserve(fmap.size());
  fmap.eraseInto(
      fmap.begin(), fmap.end(), [&ent](KeyType&& k, inode::files_vector&& fv) {
        if (!fv.empty()) {
          if constexpr (UniqueOnly) {
            DWARFS_CHECK(fv.size() == fv.front()->refcount(), "internal error");
          }
          ent.emplace_back(std::move(k), std::move(fv));
        }
      });
  std::sort(ent.begin(), ent.end(),
            [](auto& left, auto& right) { return left.first < right.first; });

  DWARFS_CHECK(fmap.empty(), "expected file map to be empty");

  finalize_inodes<true>(ent, inode_num, obj_num);
  if constexpr (!UniqueOnly) {
    finalize_inodes<false>(ent, inode_num, obj_num);
  }
}
template <bool Unique, typename KeyType>
void file_scanner_::finalize_inodes(
    std::vector<std::pair<KeyType, inode::files_vector>>& ent,
    uint32_t& inode_num, uint32_t& obj_num) {
  for (auto& p : ent) {
    auto& files = p.second;

    if constexpr (Unique) {
      // this is true regardless of how the files are ordered
      if (files.size() > files.front()->refcount()) {
        continue;
      }

      ++num_unique_;
    } else {
      if (files.empty()) {
        continue;
      }

      DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file");
    }

    // this isn't strictly necessary, but helps metadata compression
    std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
      return a->path() < b->path();
    });

    for (auto fp : files) {
      // need to check because hardlinks share the same number
      if (!fp->inode_num()) {
        fp->set_inode_num(inode_num);
        ++inode_num;
      }
    }

    auto fp = files.front();
    auto inode = fp->get_inode();
    assert(inode);
    inode->set_num(obj_num);
    inode->set_files(std::move(files));

    ++obj_num;
  }
}

} // namespace

file_scanner::file_scanner(worker_group& wg, os_access& os, inode_manager& im,
                           inode_options const& ino_opts,
                           std::optional<std::string> const& hash_algo,
                           progress& prog)
    : impl_{std::make_unique<file_scanner_>(wg, os, im, ino_opts, hash_algo,
                                            prog)} {}

} // namespace dwarfs::detail
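Taken together, finalize_files drains each hash map into a key-sorted vector (so numbering no longer depends on hash-map iteration order) and then numbers unique inodes before duplicate groups via the two finalize_inodes passes. A toy model of that two-pass numbering, with invented types standing in for the file groups (not dwarfs code):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // key -> group of identical files; a group of size 1 is "unique"
  std::map<uint64_t, std::vector<std::string>> groups{
      {100, {"a"}}, {200, {"b", "c"}}, {300, {"d"}}};

  uint32_t obj_num = 0;
  // pass 1: unique groups get the low object numbers
  for (auto& [key, g] : groups)
    if (g.size() == 1) std::cout << key << " -> " << obj_num++ << '\n';
  // pass 2: duplicate groups follow
  for (auto& [key, g] : groups)
    if (g.size() > 1) std::cout << key << " -> " << obj_num++ << '\n';
  // prints: 100 -> 0, 300 -> 1, 200 -> 2
}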
src/dwarfs/scanner.cpp
@@ -24,13 +24,10 @@
#include <cstring>
#include <ctime>
#include <deque>
#include <iostream>
#include <iterator>
#include <mutex>
#include <numeric>
#include <stdexcept>
#include <string>
#include <string_view>
#include <system_error>
#include <utility>
#include <vector>
@@ -38,13 +35,13 @@
#include <unistd.h>

#include <folly/ExceptionString.h>
#include <folly/container/F14Map.h>

#include <fmt/format.h>

#include "dwarfs/block_data.h"
#include "dwarfs/entry.h"
#include "dwarfs/error.h"
+#include "dwarfs/file_scanner.h"
#include "dwarfs/filesystem_writer.h"
#include "dwarfs/global_entry_data.h"
#include "dwarfs/inode.h"
@@ -75,358 +72,6 @@ class visitor_base : public entry_visitor {
   void visit(device*) override {}
 };

-class file_scanner {
- public:
-  file_scanner(worker_group& wg, os_access& os, inode_manager& im,
-               inode_options const& ino_opts,
-               std::optional<std::string> const& hash_algo, progress& prog)
-      : wg_(wg)
-      , os_(os)
-      , im_(im)
-      , ino_opts_(ino_opts)
-      , hash_algo_{hash_algo}
-      , prog_(prog) {}
-
-  void scan(file* p) {
-    if (p->num_hard_links() > 1) {
-      auto& vec = hardlinks_[p->raw_inode_num()];
-      vec.push_back(p);
-
-      if (vec.size() > 1) {
-        p->hardlink(vec[0], prog_);
-        ++prog_.files_scanned;
-        return;
-      }
-    }
-
-    p->create_data();
-
-    prog_.original_size += p->size();
-
-    if (hash_algo_) {
-      scan_dedupe(p);
-    } else {
-      prog_.current.store(p);
-      p->scan(nullptr, prog_, hash_algo_); // TODO
-
-      by_raw_inode_[p->raw_inode_num()].push_back(p);
-
-      add_inode(p);
-    }
-  }
-
-  void finalize(uint32_t& inode_num) {
-    uint32_t obj_num = 0;
-
-    assert(first_file_hashed_.empty());
-
-    if (hash_algo_) {
-      finalize_hardlinks([this](file const* p) -> inode::files_vector& {
-        auto it = by_hash_.find(p->hash());
-        if (it != by_hash_.end()) {
-          return it->second;
-        }
-        return unique_size_.at(p->size());
-      });
-      finalize_files<true>(unique_size_, inode_num, obj_num);
-      finalize_files(by_hash_, inode_num, obj_num);
-    } else {
-      finalize_hardlinks([this](file const* p) -> inode::files_vector& {
-        return by_raw_inode_.at(p->raw_inode_num());
-      });
-      finalize_files(by_raw_inode_, inode_num, obj_num);
-    }
-  }
-
-  uint32_t num_unique() const { return num_unique_; }
-
- private:
-  class condition_barrier {
-   public:
-    void set() { ready_ = true; }
-
-    void notify() { cv_.notify_all(); }
-
-    void wait(std::unique_lock<std::mutex>& lock) {
-      cv_.wait(lock, [this] { return ready_; });
-    }
-
-   private:
-    std::condition_variable cv_;
-    bool ready_{false};
-  };
-
-  void scan_dedupe(file* p) {
-    // The `unique_size_` table holds an entry for each file size we
-    // discover:
-    //
-    // - When we first discover a new file size, we know for sure that
-    //   this file is *not* a duplicate of a file we've seen before.
-    //   Thus, we can immediately create a new inode, and we can
-    //   immediately start similarity scanning for this inode.
-    //
-    // - When we discover the second file of a particular size, we must
-    //   hash both files to see if they're identical. We already have
-    //   an inode for the first file, so we must delay the creation of
-    //   a new inode until we know that the second file is not a
-    //   duplicate.
-    //
-    // - Exactly the same applies for subsequent files.
-    //
-    // - We must ensure that the presence of a hash is checked in
-    //   `by_hash_` for subsequent files only if the first file's
-    //   hash has been computed and stored. Otherwise, if a subsequent
-    //   file's hash computation finishes before the first file, we
-    //   assume (potentially wrongly) that the subsequent file is not
-    //   a duplicate.
-    //
-    // - So subsequent files must wait for the first file unless we
-    //   know up front that the first file's hash has already been
-    //   stored. As long as the first file's hash has not been stored,
-    //   it is still present in `unique_size_`. It will be removed
-    //   from `unique_size_` after its hash has been stored.
-
-    // We need no lock yet, as `unique_size_` is only manipulated from
-    // this thread.
-    auto size = p->size();
-    auto [it, is_new] = unique_size_.emplace(size, inode::files_vector());
-
-    if (is_new) {
-      // A file size that has never been seen before. We can safely
-      // create a new inode and we'll keep track of the file.
-      it->second.push_back(p);
-
-      {
-        std::lock_guard lock(mx_);
-        add_inode(p);
-      }
-    } else {
-      // This file size has been seen before, so this is potentially
-      // a duplicate.
-
-      std::shared_ptr<condition_barrier> cv;
-
-      if (it->second.empty()) {
-        // This is the third or any later file of this size.
-        std::lock_guard lock(mx_);
-
-        if (auto ffi = first_file_hashed_.find(size);
-            ffi != first_file_hashed_.end()) {
-          cv = ffi->second;
-        }
-      } else {
-        // This is the second file of this size. We now need to hash
-        // both the first and second file and ensure that the first
-        // file's hash is stored to `by_hash_` first. We set up a
-        // condition variable to synchronize insertion into `by_hash_`.
-
-        cv = std::make_shared<condition_barrier>();
-
-        {
-          std::lock_guard lock(mx_);
-          first_file_hashed_.emplace(size, cv);
-        }
-
-        // Add a job for the first file
-        wg_.add_job([this, p = it->second.front(), cv] {
-          hash_file(p);
-
-          {
-            std::lock_guard lock(mx_);
-
-            auto& ref = by_hash_[p->hash()];
-
-            assert(ref.empty());
-            assert(p->get_inode());
-
-            ref.push_back(p);
-
-            cv->set();
-
-            first_file_hashed_.erase(p->size());
-          }
-
-          cv->notify();
-        });
-
-        it->second.clear();
-      }
-
-      // Add a job for any subsequent files
-      wg_.add_job([this, p, cv] {
-        hash_file(p);
-
-        {
-          std::unique_lock lock(mx_);
-
-          if (cv) {
-            // Wait until the first file of this size has been added to
-            // `by_hash_`.
-            cv->wait(lock);
-          }
-
-          auto& ref = by_hash_[p->hash()];
-
-          if (ref.empty()) {
-            // This is *not* a duplicate. We must allocate a new inode.
-            add_inode(p);
-          } else {
-            auto inode = ref.front()->get_inode();
-            assert(inode);
-            p->set_inode(inode);
-            ++prog_.files_scanned;
-            ++prog_.duplicate_files;
-            prog_.saved_by_deduplication += p->size();
-          }
-
-          ref.push_back(p);
-        }
-      });
-    }
-  }
-
-  void hash_file(file* p) {
-    auto const size = p->size();
-    std::shared_ptr<mmif> mm;
-
-    if (size > 0) {
-      mm = os_.map_file(p->path(), size);
-    }
-
-    prog_.current.store(p);
-    p->scan(mm, prog_, hash_algo_);
-  }
-
-  void add_inode(file* p) {
-    assert(!p->get_inode());
-
-    auto inode = im_.create_inode();
-
-    p->set_inode(inode);
-
-    if (ino_opts_.needs_scan()) {
-      wg_.add_job([this, p, inode = std::move(inode)] {
-        std::shared_ptr<mmif> mm;
-        auto const size = p->size();
-        if (size > 0) {
-          mm = os_.map_file(p->path(), size);
-        }
-        inode->scan(mm, ino_opts_);
-        ++prog_.similarity_scans;
-        prog_.similarity_bytes += size;
-        ++prog_.inodes_scanned;
-        ++prog_.files_scanned;
-      });
-    } else {
-      ++prog_.inodes_scanned;
-      ++prog_.files_scanned;
-    }
-  }
-
-  template <typename Lookup>
-  void finalize_hardlinks(Lookup&& lookup) {
-    for (auto& kv : hardlinks_) {
-      auto& hlv = kv.second;
-      if (hlv.size() > 1) {
-        auto& fv = lookup(hlv.front());
-        // TODO: for (auto p : hlv | std::views::drop(1))
-        std::for_each(hlv.begin() + 1, hlv.end(), [&fv](auto p) {
-          p->set_inode(fv.front()->get_inode());
-          fv.push_back(p);
-        });
-      }
-    }
-
-    hardlinks_.clear();
-  }
-
-  template <bool UniqueOnly = false, typename KeyType>
-  void finalize_files(folly::F14FastMap<KeyType, inode::files_vector>& fmap,
-                      uint32_t& inode_num, uint32_t& obj_num) {
-    std::vector<std::pair<KeyType, inode::files_vector>> ent;
-    ent.reserve(fmap.size());
-    fmap.eraseInto(fmap.begin(), fmap.end(),
-                   [&ent](KeyType&& k, inode::files_vector&& fv) {
-                     if (!fv.empty()) {
-                       if constexpr (UniqueOnly) {
-                         DWARFS_CHECK(fv.size() == fv.front()->refcount(),
-                                      "internal error");
-                       }
-                       ent.emplace_back(std::move(k), std::move(fv));
-                     }
-                   });
-    std::sort(ent.begin(), ent.end(),
-              [](auto& left, auto& right) { return left.first < right.first; });
-
-    DWARFS_CHECK(fmap.empty(), "expected file map to be empty");
-
-    finalize_inodes<true>(ent, inode_num, obj_num);
-    if constexpr (!UniqueOnly) {
-      finalize_inodes<false>(ent, inode_num, obj_num);
-    }
-  }
-
-  template <bool Unique, typename KeyType>
-  void
-  finalize_inodes(std::vector<std::pair<KeyType, inode::files_vector>>& ent,
-                  uint32_t& inode_num, uint32_t& obj_num) {
-    for (auto& p : ent) {
-      auto& files = p.second;
-
-      if constexpr (Unique) {
-        // this is true regardless of how the files are ordered
-        if (files.size() > files.front()->refcount()) {
-          continue;
-        }
-
-        ++num_unique_;
-      } else {
-        if (files.empty()) {
-          continue;
-        }
-
-        DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file");
-      }
-
-      // this isn't strictly necessary, but helps metadata compression
-      std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
-        return a->path() < b->path();
-      });
-
-      for (auto fp : files) {
-        // need to check because hardlinks share the same number
-        if (!fp->inode_num()) {
-          fp->set_inode_num(inode_num);
-          ++inode_num;
-        }
-      }
-
-      auto fp = files.front();
-      auto inode = fp->get_inode();
-      assert(inode);
-      inode->set_num(obj_num);
-      inode->set_files(std::move(files));
-
-      ++obj_num;
-    }
-  }
-
-  worker_group& wg_;
-  os_access& os_;
-  inode_manager& im_;
-  inode_options const& ino_opts_;
-  std::optional<std::string> const hash_algo_;
-  progress& prog_;
-  uint32_t num_unique_{0};
-  folly::F14FastMap<uint64_t, inode::files_vector> hardlinks_;
-  std::mutex mx_;
-  folly::F14FastMap<uint64_t, inode::files_vector> unique_size_;
-  folly::F14FastMap<uint64_t, std::shared_ptr<condition_barrier>>
-      first_file_hashed_;
-  folly::F14FastMap<uint64_t, inode::files_vector> by_raw_inode_;
-  folly::F14FastMap<std::string_view, inode::files_vector> by_hash_;
-};
-
 class dir_set_inode_visitor : public visitor_base {
  public:
   explicit dir_set_inode_visitor(uint32_t& inode_num)
@@ -644,7 +289,7 @@ class scanner_ final : public scanner::impl {

 private:
  std::shared_ptr<entry>
-  scan_tree(const std::string& path, progress& prog, file_scanner& fs);
+  scan_tree(const std::string& path, progress& prog, detail::file_scanner& fs);

  const block_manager::config& cfg_;
  const scanner_options& options_;
@@ -675,7 +320,7 @@ scanner_<LoggerPolicy>::scanner_(logger& lgr, worker_group& wg,
template <typename LoggerPolicy>
std::shared_ptr<entry>
scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
-                                  file_scanner& fs) {
+                                  detail::file_scanner& fs) {
  auto root = entry_->create(*os_, path);
  bool const debug_filter = options_.debug_filter_function.has_value();
@@ -823,8 +468,8 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
  prog.set_status_function(status_string);

  inode_manager im(lgr_, prog);
-  file_scanner fs(wg_, *os_, im, options_.inode, options_.file_hash_algorithm,
-                  prog);
+  detail::file_scanner fs(wg_, *os_, im, options_.inode,
+                          options_.file_hash_algorithm, prog);

  auto root = scan_tree(path, prog, fs);