mirror of https://github.com/mhx/dwarfs.git
Detect hard links and optimize code to avoid duplicate scanning
parent 01107e0d39
commit c4a8fd7969
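The core of the change: while scanning, regular files with a link count greater than one are grouped by their raw inode number, only the first occurrence of an inode is actually hashed, and every later occurrence just shares the first one's data record. A minimal, self-contained sketch of that grouping idea (plain POSIX calls and hypothetical names, not the dwarfs classes themselves):

// Sketch only: group regular files by st_ino so each hard-linked inode is
// scanned (hashed) once. Names here are hypothetical, not dwarfs API.
#include <sys/stat.h>

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

int main(int argc, char** argv) {
  std::unordered_map<uint64_t, std::string> first_seen; // st_ino -> first path
  for (int i = 1; i < argc; ++i) {
    struct stat st;
    if (::lstat(argv[i], &st) != 0 || !S_ISREG(st.st_mode)) {
      continue; // skip unreadable entries and non-regular files
    }
    if (st.st_nlink > 1) {
      auto [it, is_new] = first_seen.emplace(st.st_ino, argv[i]);
      if (!is_new) {
        // Same inode already seen: record it as a hard link, don't rescan.
        std::cout << argv[i] << " -> hard link of " << it->second << "\n";
        continue;
      }
    }
    std::cout << "scan: " << argv[i] << "\n";
  }
  return 0;
}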
@@ -119,11 +119,18 @@ class file : public entry {
   void accept(entry_visitor& v, bool preorder) override;
   uint32_t inode_num() const override;
   void scan(os_access& os, progress& prog) override;
+  void create_data();
+  void hardlink(file* other, progress& prog);
+  uint64_t raw_inode_num() const;
+  unsigned num_hard_links() const;
 
  private:
-  using hash_type = std::array<char, 20>;
+  struct data {
+    using hash_type = std::array<char, 20>;
+    hash_type hash{0};
+  };
 
-  hash_type hash_{0};
+  std::shared_ptr<data> data_;
   std::shared_ptr<inode> inode_;
 };
 
@@ -62,12 +62,14 @@ class progress {
   std::atomic<size_t> links_scanned{0};
   std::atomic<size_t> specials_found{0};
   std::atomic<size_t> duplicate_files{0};
+  std::atomic<size_t> hardlinks{0};
   std::atomic<size_t> block_count{0};
   std::atomic<size_t> chunk_count{0};
   std::atomic<size_t> inodes_written{0};
   std::atomic<size_t> blocks_written{0};
   std::atomic<size_t> errors{0};
   std::atomic<uint64_t> original_size{0};
+  std::atomic<uint64_t> hardlink_size{0};
   std::atomic<uint64_t> saved_by_deduplication{0};
   std::atomic<uint64_t> saved_by_segmentation{0};
   std::atomic<uint64_t> filesystem_size{0};
@@ -121,7 +121,8 @@ void console_writer::update(const progress& p, bool last) {
 
       << "scanned/found: " << p.dirs_scanned << "/" << p.dirs_found
       << " dirs, " << p.links_scanned << "/" << p.links_found << " links, "
-      << p.files_scanned << "/" << p.files_found << " files" << newline
+      << p.files_scanned << "/" << p.files_found << "(" << p.hardlinks
+      << ") files" << newline
 
       << "original size: " << size_with_unit(p.original_size)
      << ", dedupe: " << size_with_unit(p.saved_by_deduplication) << " ("
@@ -144,7 +144,8 @@ uint64_t entry::get_ctime() const { return stat_.st_ctime; }
 void entry::set_ctime(uint64_t ctime) { stat_.st_atime = ctime; }
 
 std::string_view file::hash() const {
-  return std::string_view(&hash_[0], hash_.size());
+  auto& h = data_->hash;
+  return std::string_view(&h[0], h.size());
 }
 
 void file::set_inode(std::shared_ptr<inode> ino) {
@@ -162,18 +163,37 @@ uint32_t file::inode_num() const { return inode_->num(); }
 void file::accept(entry_visitor& v, bool) { v.visit(this); }
 
 void file::scan(os_access& os, progress& prog) {
-  static_assert(SHA_DIGEST_LENGTH == sizeof(hash_type));
+  static_assert(SHA_DIGEST_LENGTH == sizeof(data::hash_type));
 
   if (size_t s = size(); s > 0) {
     prog.original_size += s;
     auto mm = os.map_file(path(), s);
     ::SHA1(mm->as<unsigned char>(), s,
-           reinterpret_cast<unsigned char*>(&hash_[0]));
+           reinterpret_cast<unsigned char*>(&data_->hash[0]));
   } else {
-    ::SHA1(nullptr, 0, reinterpret_cast<unsigned char*>(&hash_[0]));
+    ::SHA1(nullptr, 0, reinterpret_cast<unsigned char*>(&data_->hash[0]));
   }
 }
 
+uint64_t file::raw_inode_num() const { return status().st_ino; }
+
+unsigned file::num_hard_links() const { return status().st_nlink; }
+
+void file::create_data() {
+  assert(!data_);
+  data_ = std::make_shared<data>();
+}
+
+void file::hardlink(file* other, progress& prog) {
+  assert(!data_);
+  assert(other->data_);
+  if (size_t s = size(); s > 0) {
+    prog.original_size += s;
+    prog.hardlink_size += s;
+  }
+  data_ = other->data_;
+}
+
 entry::type_t dir::type() const { return E_DIR; }
 
 void dir::add(std::shared_ptr<entry> e) { entries_.emplace_back(std::move(e)); }
@@ -76,8 +76,21 @@ class scan_files_visitor : public visitor_base {
       , os_(os)
       , prog_(prog) {}
 
-  // TODO: avoid scanning hardlinks multiple times
   void visit(file* p) override {
+    if (p->num_hard_links() > 1) {
+      auto ino = p->raw_inode_num();
+      auto [it, is_new] = cache_.emplace(ino, p);
+
+      if (!is_new) {
+        p->hardlink(it->second, prog_);
+        prog_.files_scanned++;
+        prog_.hardlinks++;
+        return;
+      }
+    }
+
+    p->create_data();
+
     wg_.add_job([=] {
       prog_.current.store(p);
       p->scan(os_, prog_);
@@ -89,6 +102,7 @@ class scan_files_visitor : public visitor_base {
   worker_group& wg_;
   os_access& os_;
   progress& prog_;
+  std::unordered_map<uint64_t, file*> cache_;
 };
 
 class file_deduplication_visitor : public visitor_base {
@@ -136,6 +136,7 @@ class os_access_mock : public os_access {
     st->st_mtime = 234;
     st->st_ctime = 345;
     st->st_rdev = sst.st_rdev;
+    st->st_nlink = 1;
   }
 
   std::string readlink(const std::string& path, size_t size) const override {