mirror of https://github.com/mhx/dwarfs.git
synced 2025-09-09 04:19:10 -04:00
Factor out file_scanner
This commit is contained in:
parent b41a400e32
commit c2e3cdfecb
@@ -302,6 +302,7 @@ list(
     src/dwarfs/console_writer.cpp
     src/dwarfs/entry.cpp
     src/dwarfs/error.cpp
+    src/dwarfs/file_scanner.cpp
     src/dwarfs/filesystem_extractor.cpp
     src/dwarfs/filesystem_v2.cpp
     src/dwarfs/filesystem_writer.cpp

include/dwarfs/file_scanner.h (new file, 64 lines)
@@ -0,0 +1,64 @@
+/* vim:set ts=2 sw=2 sts=2 et: */
+/**
+ * \author     Marcus Holland-Moritz (github@mhxnet.de)
+ * \copyright  Copyright (c) Marcus Holland-Moritz
+ *
+ * This file is part of dwarfs.
+ *
+ * dwarfs is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * dwarfs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <string>
+
+namespace dwarfs {
+
+class file;
+class inode_manager;
+class os_access;
+class progress;
+class worker_group;
+
+struct inode_options;
+
+namespace detail {
+
+class file_scanner {
+ public:
+  file_scanner(worker_group& wg, os_access& os, inode_manager& im,
+               inode_options const& ino_opts,
+               std::optional<std::string> const& hash_algo, progress& prog);
+
+  void scan(file* p) { impl_->scan(p); }
+  void finalize(uint32_t& inode_num) { impl_->finalize(inode_num); }
+  uint32_t num_unique() const { return impl_->num_unique(); }
+
+  class impl {
+   public:
+    virtual ~impl() = default;
+
+    virtual void scan(file* p) = 0;
+    virtual void finalize(uint32_t& inode_num) = 0;
+    virtual uint32_t num_unique() const = 0;
+  };
+
+ private:
+  std::unique_ptr<impl> impl_;
+};
+
+} // namespace detail
+
+} // namespace dwarfs
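
A note on the shape of the header above: file_scanner is wired up as a pimpl, with the public class owning a std::unique_ptr to an abstract impl and forwarding to it inline, which keeps folly and the hashing machinery out of the header. A minimal, self-contained sketch of the same forwarding pattern (illustrative only, not part of the commit; the widget/frob names are made up):

#include <iostream>
#include <memory>

class widget {
 public:
  widget();
  // inline forwarding, like file_scanner::scan()/finalize()
  void frob() { impl_->frob(); }

  class impl {
   public:
    virtual ~impl() = default;
    virtual void frob() = 0;
  };

 private:
  std::unique_ptr<impl> impl_;
};

// This part would normally live in the .cpp file, so the concrete
// implementation (and its dependencies) never appear in the header.
namespace {
class widget_ : public widget::impl {
 public:
  void frob() override { std::cout << "frobbed\n"; }
};
} // namespace

widget::widget() : impl_{std::make_unique<widget_>()} {}

int main() { widget{}.frob(); }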

src/dwarfs/file_scanner.cpp (new file, 426 lines)
@@ -0,0 +1,426 @@
+/* vim:set ts=2 sw=2 sts=2 et: */
+/**
+ * \author     Marcus Holland-Moritz (github@mhxnet.de)
+ * \copyright  Copyright (c) Marcus Holland-Moritz
+ *
+ * This file is part of dwarfs.
+ *
+ * dwarfs is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * dwarfs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <mutex>
+#include <string_view>
+#include <vector>
+
+#include <folly/container/F14Map.h>
+
+#include "dwarfs/entry.h"
+#include "dwarfs/file_scanner.h"
+#include "dwarfs/inode.h"
+#include "dwarfs/inode_manager.h"
+#include "dwarfs/logger.h"
+#include "dwarfs/options.h"
+#include "dwarfs/os_access.h"
+#include "dwarfs/progress.h"
+#include "dwarfs/worker_group.h"
+
+namespace dwarfs::detail {
+
+namespace {
+
+class file_scanner_ : public file_scanner::impl {
+ public:
+  file_scanner_(worker_group& wg, os_access& os, inode_manager& im,
+                inode_options const& ino_opts,
+                std::optional<std::string> const& hash_algo, progress& prog);
+
+  void scan(file* p) override;
+  void finalize(uint32_t& inode_num) override;
+
+  uint32_t num_unique() const override { return num_unique_; }
+
+ private:
+  class condition_barrier {
+   public:
+    void set() { ready_ = true; }
+
+    void notify() { cv_.notify_all(); }
+
+    void wait(std::unique_lock<std::mutex>& lock) {
+      cv_.wait(lock, [this] { return ready_; });
+    }
+
+   private:
+    std::condition_variable cv_;
+    bool ready_{false};
+  };
+
+  void scan_dedupe(file* p);
+  void hash_file(file* p);
+  void add_inode(file* p);
+
+  template <typename Lookup>
+  void finalize_hardlinks(Lookup&& lookup);
+
+  template <bool UniqueOnly = false, typename KeyType>
+  void finalize_files(folly::F14FastMap<KeyType, inode::files_vector>& fmap,
+                      uint32_t& inode_num, uint32_t& obj_num);
+
+  template <bool Unique, typename KeyType>
+  void
+  finalize_inodes(std::vector<std::pair<KeyType, inode::files_vector>>& ent,
+                  uint32_t& inode_num, uint32_t& obj_num);
+
+  worker_group& wg_;
+  os_access& os_;
+  inode_manager& im_;
+  inode_options const& ino_opts_;
+  std::optional<std::string> const hash_algo_;
+  progress& prog_;
+  uint32_t num_unique_{0};
+  folly::F14FastMap<uint64_t, inode::files_vector> hardlinks_;
+  std::mutex mx_;
+  folly::F14FastMap<uint64_t, inode::files_vector> unique_size_;
+  folly::F14FastMap<uint64_t, std::shared_ptr<condition_barrier>>
+      first_file_hashed_;
+  folly::F14FastMap<uint64_t, inode::files_vector> by_raw_inode_;
+  folly::F14FastMap<std::string_view, inode::files_vector> by_hash_;
+};
+
+// The `unique_size_` table holds an entry for each file size we
+// discover:
+//
+// - When we first discover a new file size, we know for sure that
+//   this file is *not* a duplicate of a file we've seen before.
+//   Thus, we can immediately create a new inode, and we can
+//   immediately start similarity scanning for this inode.
+//
+// - When we discover the second file of a particular size, we must
+//   hash both files to see if they're identical. We already have
+//   an inode for the first file, so we must delay the creation of
+//   a new inode until we know that the second file is not a
+//   duplicate.
+//
+// - Exactly the same applies for subsequent files.
+//
+// - We must ensure that the presence of a hash is checked in
+//   `by_hash_` for subsequent files only if the first file's
+//   hash has been computed and stored. Otherwise, if a subsequent
+//   file's hash computation finishes before the first file, we
+//   assume (potentially wrongly) that the subsequent file is not
+//   a duplicate.
+//
+// - So subsequent files must wait for the first file unless we
+//   know up front that the first file's hash has already been
+//   stored. As long as the first file's hash has not been stored,
+//   it is still present in `unique_size_`. It will be removed
+//   from `unique_size_` after its hash has been stored.
+
+file_scanner_::file_scanner_(worker_group& wg, os_access& os, inode_manager& im,
+                             inode_options const& ino_opts,
+                             std::optional<std::string> const& hash_algo,
+                             progress& prog)
+    : wg_(wg)
+    , os_(os)
+    , im_(im)
+    , ino_opts_(ino_opts)
+    , hash_algo_{hash_algo}
+    , prog_(prog) {}
+
+void file_scanner_::scan(file* p) {
+  if (p->num_hard_links() > 1) {
+    auto& vec = hardlinks_[p->raw_inode_num()];
+    vec.push_back(p);
+
+    if (vec.size() > 1) {
+      p->hardlink(vec[0], prog_);
+      ++prog_.files_scanned;
+      return;
+    }
+  }
+
+  p->create_data();
+
+  prog_.original_size += p->size();
+
+  if (hash_algo_) {
+    scan_dedupe(p);
+  } else {
+    prog_.current.store(p);
+    p->scan(nullptr, prog_, hash_algo_); // TODO
+
+    by_raw_inode_[p->raw_inode_num()].push_back(p);
+
+    add_inode(p);
+  }
+}
+
+void file_scanner_::finalize(uint32_t& inode_num) {
+  uint32_t obj_num = 0;
+
+  assert(first_file_hashed_.empty());
+
+  if (hash_algo_) {
+    finalize_hardlinks([this](file const* p) -> inode::files_vector& {
+      auto it = by_hash_.find(p->hash());
+      if (it != by_hash_.end()) {
+        return it->second;
+      }
+      return unique_size_.at(p->size());
+    });
+    finalize_files<true>(unique_size_, inode_num, obj_num);
+    finalize_files(by_hash_, inode_num, obj_num);
+  } else {
+    finalize_hardlinks([this](file const* p) -> inode::files_vector& {
+      return by_raw_inode_.at(p->raw_inode_num());
+    });
+    finalize_files(by_raw_inode_, inode_num, obj_num);
+  }
+}
+
+void file_scanner_::scan_dedupe(file* p) {
+  // We need no lock yet, as `unique_size_` is only manipulated from
+  // this thread.
+  auto size = p->size();
+  auto [it, is_new] = unique_size_.emplace(size, inode::files_vector());
+
+  if (is_new) {
+    // A file size that has never been seen before. We can safely
+    // create a new inode and we'll keep track of the file.
+    it->second.push_back(p);
+
+    {
+      std::lock_guard lock(mx_);
+      add_inode(p);
+    }
+  } else {
+    // This file size has been seen before, so this is potentially
+    // a duplicate.
+
+    std::shared_ptr<condition_barrier> cv;
+
+    if (it->second.empty()) {
+      // This is any file of this size after the second file
+      std::lock_guard lock(mx_);
+
+      if (auto ffi = first_file_hashed_.find(size);
+          ffi != first_file_hashed_.end()) {
+        cv = ffi->second;
+      }
+    } else {
+      // This is the second file of this size. We now need to hash
+      // both the first and second file and ensure that the first
+      // file's hash is stored to `by_hash_` first. We set up a
+      // condition variable to synchronize insertion into `by_hash_`.
+
+      cv = std::make_shared<condition_barrier>();
+
+      {
+        std::lock_guard lock(mx_);
+        first_file_hashed_.emplace(size, cv);
+      }
+
+      // Add a job for the first file
+      wg_.add_job([this, p = it->second.front(), cv] {
+        hash_file(p);
+
+        {
+          std::lock_guard lock(mx_);
+
+          auto& ref = by_hash_[p->hash()];
+
+          assert(ref.empty());
+          assert(p->get_inode());
+
+          ref.push_back(p);
+
+          cv->set();
+
+          first_file_hashed_.erase(p->size());
+        }
+
+        cv->notify();
+      });
+
+      it->second.clear();
+    }
+
+    // Add a job for any subsequent files
+    wg_.add_job([this, p, cv] {
+      hash_file(p);
+
+      {
+        std::unique_lock lock(mx_);
+
+        if (cv) {
+          // Wait until the first file of this size has been added to
+          // `by_hash_`.
+          cv->wait(lock);
+        }
+
+        auto& ref = by_hash_[p->hash()];
+
+        if (ref.empty()) {
+          // This is *not* a duplicate. We must allocate a new inode.
+          add_inode(p);
+        } else {
+          auto inode = ref.front()->get_inode();
+          assert(inode);
+          p->set_inode(inode);
+          ++prog_.files_scanned;
+          ++prog_.duplicate_files;
+          prog_.saved_by_deduplication += p->size();
+        }
+
+        ref.push_back(p);
+      }
+    });
+  }
+}
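
Stripped of the worker pool and locking, scan_dedupe() is a two-level grouping: bucket by file size first, and compute content hashes for a size bucket only once it holds more than one file, at which point the bucket's first file is hashed retroactively. A single-threaded, self-contained sketch of just that logic follows; illustrative only, with std::hash standing in for the commit's configurable hash algorithm and name/contents pairs standing in for file objects.

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
  // name -> contents; a stand-in for the files being scanned
  std::vector<std::pair<std::string, std::string>> files{
      {"a", "xyz"}, {"b", "hello"}, {"c", "xyz"}, {"d", "abc"}};

  auto hash = [](std::string const& s) { return std::hash<std::string>{}(s); };

  // size -> the first file of that size, while it is still unhashed
  std::unordered_map<std::size_t, std::pair<std::string, std::string>>
      unique_size;
  // content hash -> names of files with identical contents
  std::unordered_map<std::size_t, std::vector<std::string>> by_hash;

  for (auto const& [name, data] : files) {
    auto [it, is_new] =
        unique_size.emplace(data.size(), std::pair{name, data});
    if (is_new) {
      continue; // first file of this size: provably unique so far, no hashing
    }
    if (!it->second.first.empty()) {
      // second file of this size: the first one must now be hashed, too
      by_hash[hash(it->second.second)].push_back(it->second.first);
      it->second.first.clear(); // mark the first file as already hashed
    }
    by_hash[hash(data)].push_back(name);
  }

  for (auto const& [h, names] : by_hash) {
    if (names.size() > 1) {
      std::cout << names.front() << " has " << names.size() - 1
                << " duplicate(s)\n";
    }
  }
}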
+
+void file_scanner_::hash_file(file* p) {
+  auto const size = p->size();
+  std::shared_ptr<mmif> mm;
+
+  if (size > 0) {
+    mm = os_.map_file(p->path(), size);
+  }
+
+  prog_.current.store(p);
+  p->scan(mm, prog_, hash_algo_);
+}
+
+void file_scanner_::add_inode(file* p) {
+  assert(!p->get_inode());
+
+  auto inode = im_.create_inode();
+
+  p->set_inode(inode);
+
+  if (ino_opts_.needs_scan()) {
+    wg_.add_job([this, p, inode = std::move(inode)] {
+      std::shared_ptr<mmif> mm;
+      auto const size = p->size();
+      if (size > 0) {
+        mm = os_.map_file(p->path(), size);
+      }
+      inode->scan(mm, ino_opts_);
+      ++prog_.similarity_scans;
+      prog_.similarity_bytes += size;
+      ++prog_.inodes_scanned;
+      ++prog_.files_scanned;
+    });
+  } else {
+    ++prog_.inodes_scanned;
+    ++prog_.files_scanned;
+  }
+}
+
+template <typename Lookup>
+void file_scanner_::finalize_hardlinks(Lookup&& lookup) {
+  for (auto& kv : hardlinks_) {
+    auto& hlv = kv.second;
+    if (hlv.size() > 1) {
+      auto& fv = lookup(hlv.front());
+      // TODO: for (auto p : hlv | std::views::drop(1))
+      std::for_each(hlv.begin() + 1, hlv.end(), [&fv](auto p) {
+        p->set_inode(fv.front()->get_inode());
+        fv.push_back(p);
+      });
+    }
+  }
+
+  hardlinks_.clear();
+}
+
+template <bool UniqueOnly, typename KeyType>
+void file_scanner_::finalize_files(
+    folly::F14FastMap<KeyType, inode::files_vector>& fmap, uint32_t& inode_num,
+    uint32_t& obj_num) {
+  std::vector<std::pair<KeyType, inode::files_vector>> ent;
+  ent.reserve(fmap.size());
+  fmap.eraseInto(
+      fmap.begin(), fmap.end(), [&ent](KeyType&& k, inode::files_vector&& fv) {
+        if (!fv.empty()) {
+          if constexpr (UniqueOnly) {
+            DWARFS_CHECK(fv.size() == fv.front()->refcount(), "internal error");
+          }
+          ent.emplace_back(std::move(k), std::move(fv));
+        }
+      });
+  std::sort(ent.begin(), ent.end(),
+            [](auto& left, auto& right) { return left.first < right.first; });
+
+  DWARFS_CHECK(fmap.empty(), "expected file map to be empty");
+
+  finalize_inodes<true>(ent, inode_num, obj_num);
+  if constexpr (!UniqueOnly) {
+    finalize_inodes<false>(ent, inode_num, obj_num);
+  }
+}
+
+template <bool Unique, typename KeyType>
+void file_scanner_::finalize_inodes(
+    std::vector<std::pair<KeyType, inode::files_vector>>& ent,
+    uint32_t& inode_num, uint32_t& obj_num) {
+  for (auto& p : ent) {
+    auto& files = p.second;
+
+    if constexpr (Unique) {
+      // this is true regardless of how the files are ordered
+      if (files.size() > files.front()->refcount()) {
+        continue;
+      }
+
+      ++num_unique_;
+    } else {
+      if (files.empty()) {
+        continue;
+      }
+
+      DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file");
+    }
+
+    // this isn't strictly necessary, but helps metadata compression
+    std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
+      return a->path() < b->path();
+    });
+
+    for (auto fp : files) {
+      // need to check because hardlinks share the same number
+      if (!fp->inode_num()) {
+        fp->set_inode_num(inode_num);
+        ++inode_num;
+      }
+    }
+
+    auto fp = files.front();
+    auto inode = fp->get_inode();
+    assert(inode);
+    inode->set_num(obj_num);
+    inode->set_files(std::move(files));
+
+    ++obj_num;
+  }
+}
+
+} // namespace
+
+file_scanner::file_scanner(worker_group& wg, os_access& os, inode_manager& im,
+                           inode_options const& ino_opts,
+                           std::optional<std::string> const& hash_algo,
+                           progress& prog)
+    : impl_{std::make_unique<file_scanner_>(wg, os, im, ino_opts, hash_algo,
+                                            prog)} {}
+
+} // namespace dwarfs::detail
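
One detail worth noting in finalize_files() above: it drains each hash map into a vector and sorts by key before numbering anything, so the assigned inode numbers do not depend on unspecified hash-map iteration order. A self-contained sketch of that drain, sort, and number step; illustrative only, with std::unordered_map standing in for folly::F14FastMap and file names standing in for file objects.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
  // hash key -> group of duplicate files
  std::unordered_map<std::string, std::vector<std::string>> groups{
      {"h2", {"c"}}, {"h1", {"a", "b"}}};

  // Drain into a vector and sort by key: a hash map's iteration order is
  // unspecified, so this step is what makes the numbering reproducible.
  std::vector<std::pair<std::string, std::vector<std::string>>> ent(
      std::make_move_iterator(groups.begin()),
      std::make_move_iterator(groups.end()));
  groups.clear();
  std::sort(ent.begin(), ent.end(),
            [](auto const& l, auto const& r) { return l.first < r.first; });

  // Numbering now always visits h1 before h2, regardless of insertion order.
  uint32_t inode_num = 0;
  for (auto& [key, files] : ent) {
    std::cout << key << " -> inode " << inode_num++ << '\n';
  }
}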

src/dwarfs/scanner.cpp
@@ -24,13 +24,10 @@
 #include <cstring>
 #include <ctime>
 #include <deque>
-#include <iostream>
 #include <iterator>
-#include <mutex>
 #include <numeric>
 #include <stdexcept>
 #include <string>
-#include <string_view>
 #include <system_error>
 #include <utility>
 #include <vector>
@@ -38,13 +35,13 @@
 #include <unistd.h>

 #include <folly/ExceptionString.h>
-#include <folly/container/F14Map.h>

 #include <fmt/format.h>

 #include "dwarfs/block_data.h"
 #include "dwarfs/entry.h"
 #include "dwarfs/error.h"
+#include "dwarfs/file_scanner.h"
 #include "dwarfs/filesystem_writer.h"
 #include "dwarfs/global_entry_data.h"
 #include "dwarfs/inode.h"
@@ -75,358 +72,6 @@ class visitor_base : public entry_visitor {
   void visit(device*) override {}
 };
-
-class file_scanner {
- public:
-  file_scanner(worker_group& wg, os_access& os, inode_manager& im,
-               inode_options const& ino_opts,
-               std::optional<std::string> const& hash_algo, progress& prog)
-      : wg_(wg)
-      , os_(os)
-      , im_(im)
-      , ino_opts_(ino_opts)
-      , hash_algo_{hash_algo}
-      , prog_(prog) {}
-
-  void scan(file* p) {
-    if (p->num_hard_links() > 1) {
-      auto& vec = hardlinks_[p->raw_inode_num()];
-      vec.push_back(p);
-
-      if (vec.size() > 1) {
-        p->hardlink(vec[0], prog_);
-        ++prog_.files_scanned;
-        return;
-      }
-    }
-
-    p->create_data();
-
-    prog_.original_size += p->size();
-
-    if (hash_algo_) {
-      scan_dedupe(p);
-    } else {
-      prog_.current.store(p);
-      p->scan(nullptr, prog_, hash_algo_); // TODO
-
-      by_raw_inode_[p->raw_inode_num()].push_back(p);
-
-      add_inode(p);
-    }
-  }
-
-  void finalize(uint32_t& inode_num) {
-    uint32_t obj_num = 0;
-
-    assert(first_file_hashed_.empty());
-
-    if (hash_algo_) {
-      finalize_hardlinks([this](file const* p) -> inode::files_vector& {
-        auto it = by_hash_.find(p->hash());
-        if (it != by_hash_.end()) {
-          return it->second;
-        }
-        return unique_size_.at(p->size());
-      });
-      finalize_files<true>(unique_size_, inode_num, obj_num);
-      finalize_files(by_hash_, inode_num, obj_num);
-    } else {
-      finalize_hardlinks([this](file const* p) -> inode::files_vector& {
-        return by_raw_inode_.at(p->raw_inode_num());
-      });
-      finalize_files(by_raw_inode_, inode_num, obj_num);
-    }
-  }
-
-  uint32_t num_unique() const { return num_unique_; }
-
- private:
-  class condition_barrier {
-   public:
-    void set() { ready_ = true; }
-
-    void notify() { cv_.notify_all(); }
-
-    void wait(std::unique_lock<std::mutex>& lock) {
-      cv_.wait(lock, [this] { return ready_; });
-    }
-
-   private:
-    std::condition_variable cv_;
-    bool ready_{false};
-  };
-
-  void scan_dedupe(file* p) {
-    // The `unique_size_` table holds an entry for each file size we
-    // discover:
-    //
-    // - When we first discover a new file size, we know for sure that
-    //   this file is *not* a duplicate of a file we've seen before.
-    //   Thus, we can immediately create a new inode, and we can
-    //   immediately start similarity scanning for this inode.
-    //
-    // - When we discover the second file of a particular size, we must
-    //   hash both files to see if they're identical. We already have
-    //   an inode for the first file, so we must delay the creation of
-    //   a new inode until we know that the second file is not a
-    //   duplicate.
-    //
-    // - Exactly the same applies for subsequent files.
-    //
-    // - We must ensure that the presence of a hash is checked in
-    //   `by_hash_` for subsequent files only if the first file's
-    //   hash has been computed and stored. Otherwise, if a subsequent
-    //   file's hash computation finishes before the first file, we
-    //   assume (potentially wrongly) that the subsequent file is not
-    //   a duplicate.
-    //
-    // - So subsequent files must wait for the first file unless we
-    //   know up front that the first file's hash has already been
-    //   stored. As long as the first file's hash has not been stored,
-    //   it is still present in `unique_size_`. It will be removed
-    //   from `unique_size_` after its hash has been stored.
-
-    // We need no lock yet, as `unique_size_` is only manipulated from
-    // this thread.
-    auto size = p->size();
-    auto [it, is_new] = unique_size_.emplace(size, inode::files_vector());
-
-    if (is_new) {
-      // A file size that has never been seen before. We can safely
-      // create a new inode and we'll keep track of the file.
-      it->second.push_back(p);
-
-      {
-        std::lock_guard lock(mx_);
-        add_inode(p);
-      }
-    } else {
-      // This file size has been seen before, so this is potentially
-      // a duplicate.
-
-      std::shared_ptr<condition_barrier> cv;
-
-      if (it->second.empty()) {
-        // This is any file of this size after the second file
-        std::lock_guard lock(mx_);
-
-        if (auto ffi = first_file_hashed_.find(size);
-            ffi != first_file_hashed_.end()) {
-          cv = ffi->second;
-        }
-      } else {
-        // This is the second file of this size. We now need to hash
-        // both the first and second file and ensure that the first
-        // file's hash is stored to `by_hash_` first. We set up a
-        // condition variable to synchronize insertion into `by_hash_`.
-
-        cv = std::make_shared<condition_barrier>();
-
-        {
-          std::lock_guard lock(mx_);
-          first_file_hashed_.emplace(size, cv);
-        }
-
-        // Add a job for the first file
-        wg_.add_job([this, p = it->second.front(), cv] {
-          hash_file(p);
-
-          {
-            std::lock_guard lock(mx_);
-
-            auto& ref = by_hash_[p->hash()];
-
-            assert(ref.empty());
-            assert(p->get_inode());
-
-            ref.push_back(p);
-
-            cv->set();
-
-            first_file_hashed_.erase(p->size());
-          }
-
-          cv->notify();
-        });
-
-        it->second.clear();
-      }
-
-      // Add a job for any subsequent files
-      wg_.add_job([this, p, cv] {
-        hash_file(p);
-
-        {
-          std::unique_lock lock(mx_);
-
-          if (cv) {
-            // Wait until the first file of this size has been added to
-            // `by_hash_`.
-            cv->wait(lock);
-          }
-
-          auto& ref = by_hash_[p->hash()];
-
-          if (ref.empty()) {
-            // This is *not* a duplicate. We must allocate a new inode.
-            add_inode(p);
-          } else {
-            auto inode = ref.front()->get_inode();
-            assert(inode);
-            p->set_inode(inode);
-            ++prog_.files_scanned;
-            ++prog_.duplicate_files;
-            prog_.saved_by_deduplication += p->size();
-          }
-
-          ref.push_back(p);
-        }
-      });
-    }
-  }
-
-  void hash_file(file* p) {
-    auto const size = p->size();
-    std::shared_ptr<mmif> mm;
-
-    if (size > 0) {
-      mm = os_.map_file(p->path(), size);
-    }
-
-    prog_.current.store(p);
-    p->scan(mm, prog_, hash_algo_);
-  }
-
-  void add_inode(file* p) {
-    assert(!p->get_inode());
-
-    auto inode = im_.create_inode();
-
-    p->set_inode(inode);
-
-    if (ino_opts_.needs_scan()) {
-      wg_.add_job([this, p, inode = std::move(inode)] {
-        std::shared_ptr<mmif> mm;
-        auto const size = p->size();
-        if (size > 0) {
-          mm = os_.map_file(p->path(), size);
-        }
-        inode->scan(mm, ino_opts_);
-        ++prog_.similarity_scans;
-        prog_.similarity_bytes += size;
-        ++prog_.inodes_scanned;
-        ++prog_.files_scanned;
-      });
-    } else {
-      ++prog_.inodes_scanned;
-      ++prog_.files_scanned;
-    }
-  }
-
-  template <typename Lookup>
-  void finalize_hardlinks(Lookup&& lookup) {
-    for (auto& kv : hardlinks_) {
-      auto& hlv = kv.second;
-      if (hlv.size() > 1) {
-        auto& fv = lookup(hlv.front());
-        // TODO: for (auto p : hlv | std::views::drop(1))
-        std::for_each(hlv.begin() + 1, hlv.end(), [&fv](auto p) {
-          p->set_inode(fv.front()->get_inode());
-          fv.push_back(p);
-        });
-      }
-    }
-
-    hardlinks_.clear();
-  }
-
-  template <bool UniqueOnly = false, typename KeyType>
-  void finalize_files(folly::F14FastMap<KeyType, inode::files_vector>& fmap,
-                      uint32_t& inode_num, uint32_t& obj_num) {
-    std::vector<std::pair<KeyType, inode::files_vector>> ent;
-    ent.reserve(fmap.size());
-    fmap.eraseInto(fmap.begin(), fmap.end(),
-                   [&ent](KeyType&& k, inode::files_vector&& fv) {
-                     if (!fv.empty()) {
-                       if constexpr (UniqueOnly) {
-                         DWARFS_CHECK(fv.size() == fv.front()->refcount(),
-                                      "internal error");
-                       }
-                       ent.emplace_back(std::move(k), std::move(fv));
-                     }
-                   });
-    std::sort(ent.begin(), ent.end(),
-              [](auto& left, auto& right) { return left.first < right.first; });
-
-    DWARFS_CHECK(fmap.empty(), "expected file map to be empty");
-
-    finalize_inodes<true>(ent, inode_num, obj_num);
-    if constexpr (!UniqueOnly) {
-      finalize_inodes<false>(ent, inode_num, obj_num);
-    }
-  }
-
-  template <bool Unique, typename KeyType>
-  void
-  finalize_inodes(std::vector<std::pair<KeyType, inode::files_vector>>& ent,
-                  uint32_t& inode_num, uint32_t& obj_num) {
-    for (auto& p : ent) {
-      auto& files = p.second;
-
-      if constexpr (Unique) {
-        // this is true regardless of how the files are ordered
-        if (files.size() > files.front()->refcount()) {
-          continue;
-        }
-
-        ++num_unique_;
-      } else {
-        if (files.empty()) {
-          continue;
-        }
-
-        DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file");
-      }
-
-      // this isn't strictly necessary, but helps metadata compression
-      std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
-        return a->path() < b->path();
-      });
-
-      for (auto fp : files) {
-        // need to check because hardlinks share the same number
-        if (!fp->inode_num()) {
-          fp->set_inode_num(inode_num);
-          ++inode_num;
-        }
-      }
-
-      auto fp = files.front();
-      auto inode = fp->get_inode();
-      assert(inode);
-      inode->set_num(obj_num);
-      inode->set_files(std::move(files));
-
-      ++obj_num;
-    }
-  }
-
-  worker_group& wg_;
-  os_access& os_;
-  inode_manager& im_;
-  inode_options const& ino_opts_;
-  std::optional<std::string> const hash_algo_;
-  progress& prog_;
-  uint32_t num_unique_{0};
-  folly::F14FastMap<uint64_t, inode::files_vector> hardlinks_;
-  std::mutex mx_;
-  folly::F14FastMap<uint64_t, inode::files_vector> unique_size_;
-  folly::F14FastMap<uint64_t, std::shared_ptr<condition_barrier>>
-      first_file_hashed_;
-  folly::F14FastMap<uint64_t, inode::files_vector> by_raw_inode_;
-  folly::F14FastMap<std::string_view, inode::files_vector> by_hash_;
-};
-
 class dir_set_inode_visitor : public visitor_base {
  public:
   explicit dir_set_inode_visitor(uint32_t& inode_num)
@@ -644,7 +289,7 @@ class scanner_ final : public scanner::impl {

  private:
   std::shared_ptr<entry>
-  scan_tree(const std::string& path, progress& prog, file_scanner& fs);
+  scan_tree(const std::string& path, progress& prog, detail::file_scanner& fs);

   const block_manager::config& cfg_;
   const scanner_options& options_;
@@ -675,7 +320,7 @@ scanner_<LoggerPolicy>::scanner_(logger& lgr, worker_group& wg,
 template <typename LoggerPolicy>
 std::shared_ptr<entry>
 scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog,
-                                  file_scanner& fs) {
+                                  detail::file_scanner& fs) {
   auto root = entry_->create(*os_, path);
   bool const debug_filter = options_.debug_filter_function.has_value();

@@ -823,8 +468,8 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
   prog.set_status_function(status_string);

   inode_manager im(lgr_, prog);
-  file_scanner fs(wg_, *os_, im, options_.inode, options_.file_hash_algorithm,
-                  prog);
+  detail::file_scanner fs(wg_, *os_, im, options_.inode,
+                          options_.file_hash_algorithm, prog);

   auto root = scan_tree(path, prog, fs);
