Detect hard links and optimize code to avoid duplicate scanning

This commit is contained in:
Marcus Holland-Moritz 2020-12-08 16:39:23 +01:00
parent 01107e0d39
commit c4a8fd7969
6 changed files with 53 additions and 8 deletions

View File

@ -119,11 +119,18 @@ class file : public entry {
void accept(entry_visitor& v, bool preorder) override;
uint32_t inode_num() const override;
void scan(os_access& os, progress& prog) override;
void create_data();
void hardlink(file* other, progress& prog);
uint64_t raw_inode_num() const;
unsigned num_hard_links() const;
private:
using hash_type = std::array<char, 20>;
// Holder for a file's content digest. `file` keeps it behind a
// std::shared_ptr so the same digest storage can be referenced by more
// than one `file` object.
struct data {
  // Raw digest bytes (20 of them), stored as chars.
  using hash_type = std::array<char, 20>;

  // Digest storage, zero-filled until a digest is written into it.
  hash_type hash{};
};
hash_type hash_{0};
std::shared_ptr<data> data_;
std::shared_ptr<inode> inode_;
};

View File

@ -62,12 +62,14 @@ class progress {
std::atomic<size_t> links_scanned{0};
std::atomic<size_t> specials_found{0};
std::atomic<size_t> duplicate_files{0};
std::atomic<size_t> hardlinks{0};
std::atomic<size_t> block_count{0};
std::atomic<size_t> chunk_count{0};
std::atomic<size_t> inodes_written{0};
std::atomic<size_t> blocks_written{0};
std::atomic<size_t> errors{0};
std::atomic<uint64_t> original_size{0};
std::atomic<uint64_t> hardlink_size{0};
std::atomic<uint64_t> saved_by_deduplication{0};
std::atomic<uint64_t> saved_by_segmentation{0};
std::atomic<uint64_t> filesystem_size{0};

View File

@ -121,7 +121,8 @@ void console_writer::update(const progress& p, bool last) {
<< "scanned/found: " << p.dirs_scanned << "/" << p.dirs_found
<< " dirs, " << p.links_scanned << "/" << p.links_found << " links, "
<< p.files_scanned << "/" << p.files_found << " files" << newline
<< p.files_scanned << "/" << p.files_found << "(" << p.hardlinks
<< ") files" << newline
<< "original size: " << size_with_unit(p.original_size)
<< ", dedupe: " << size_with_unit(p.saved_by_deduplication) << " ("

View File

@ -144,7 +144,8 @@ uint64_t entry::get_ctime() const { return stat_.st_ctime; }
// Sets the status-change time stored in this entry's stat record.
//
// ctime: new value for `stat_.st_ctime`.
//
// Fix: this setter used to assign `stat_.st_atime`, which overwrote the
// access time and left the change time untouched, so a later get_ctime()
// would not return the value passed in here.
void entry::set_ctime(uint64_t ctime) {
  stat_.st_ctime = ctime;
}
// Returns a non-owning view over the raw SHA-1 digest bytes of this file.
//
// The digest is read from the shared `data_` object, which is what both
// scan() (computes the digest) and hardlink() (adopts another file's data)
// populate. The per-object `hash_` member is never filled in for hard-linked
// files, so returning it would yield an all-zero digest for them; the
// previous early return on `hash_` also left the `data_` lookup unreachable.
//
// The view stays valid only while the `data` object it points into is alive.
std::string_view file::hash() const {
  // create_data() or hardlink() must have run before the digest is queried.
  assert(data_);
  const auto& digest = data_->hash;
  return std::string_view(digest.data(), digest.size());
}
void file::set_inode(std::shared_ptr<inode> ino) {
@ -162,18 +163,37 @@ uint32_t file::inode_num() const { return inode_->num(); }
// Visitor entry point for files: hands this object to the visitor.
// The boolean (preorder) argument is not used here.
void file::accept(entry_visitor& v, bool /*preorder*/) {
  v.visit(this);
}
// Computes the SHA-1 digest of this file's contents.
//
// os:   used to memory-map the file at path() when it is non-empty.
// prog: progress counters; `original_size` is increased by the file size.
//
// The digest is written to the shared `data_` object, so `data_` must have
// been created before this is called. Empty files get the SHA-1 of the empty
// input.
//
// The contents are hashed once. The previous version ran SHA-1 twice over the
// same mapping (once per destination buffer), doubling the work per file.
void file::scan(os_access& os, progress& prog) {
  static_assert(SHA_DIGEST_LENGTH == sizeof(data::hash_type));
  static_assert(sizeof(hash_type) == sizeof(data::hash_type));

  auto* digest = reinterpret_cast<unsigned char*>(data_->hash.data());

  if (size_t s = size(); s > 0) {
    prog.original_size += s;
    auto mm = os.map_file(path(), s);
    ::SHA1(mm->as<unsigned char>(), s, digest);
  } else {
    ::SHA1(nullptr, 0, digest);
  }

  // Keep the per-object copy identical to the shared digest so that any code
  // still reading `hash_` observes the same bytes as before.
  hash_ = data_->hash;
}
// Returns the `st_ino` field of this file's status record.
uint64_t file::raw_inode_num() const {
  const auto& st = status();
  return st.st_ino;
}
// Returns the `st_nlink` field of this file's status record.
unsigned file::num_hard_links() const {
  const auto& st = status();
  return st.st_nlink;
}
// Allocates a fresh, default-constructed `data` object for this file.
// Must only be called while the file has no data object yet.
void file::create_data() {
  assert(!data_);

  auto fresh = std::make_shared<data>();
  data_ = std::move(fresh);
}
// Makes this file share the `data` object of `other` instead of owning one.
//
// other: file whose data object is adopted; it must already have one.
// prog:  progress counters; for a non-empty file its size is added to both
//        `original_size` and `hardlink_size`.
//
// Must only be called while this file has no data object yet.
void file::hardlink(file* other, progress& prog) {
  assert(!data_);
  assert(other->data_);

  const size_t bytes = size();

  if (bytes > 0) {
    prog.original_size += bytes;
    prog.hardlink_size += bytes;
  }

  data_ = other->data_;
}
// Reports the entry kind for directories: always E_DIR.
entry::type_t dir::type() const {
  return E_DIR;
}
// Appends `e` to this directory's list of entries, taking over the
// caller's shared_ptr by moving it into the container.
void dir::add(std::shared_ptr<entry> e) {
  entries_.emplace_back(std::move(e));
}

View File

@ -76,8 +76,21 @@ class scan_files_visitor : public visitor_base {
, os_(os)
, prog_(prog) {}
// Files with more than one hard link are tracked by inode number so that
// each inode's contents are scanned only once.
void visit(file* p) override {
if (p->num_hard_links() > 1) {
auto ino = p->raw_inode_num();
auto [it, is_new] = cache_.emplace(ino, p);
if (!is_new) {
p->hardlink(it->second, prog_);
prog_.files_scanned++;
prog_.hardlinks++;
return;
}
}
p->create_data();
wg_.add_job([=] {
prog_.current.store(p);
p->scan(os_, prog_);
@ -89,6 +102,7 @@ class scan_files_visitor : public visitor_base {
worker_group& wg_;
os_access& os_;
progress& prog_;
std::unordered_map<uint64_t, file*> cache_;
};
class file_deduplication_visitor : public visitor_base {

View File

@ -136,6 +136,7 @@ class os_access_mock : public os_access {
st->st_mtime = 234;
st->st_ctime = 345;
st->st_rdev = sst.st_rdev;
st->st_nlink = 1;
}
std::string readlink(const std::string& path, size_t size) const override {