mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-09 20:41:04 -04:00
Detect hard links and optimize code to avoid duplicate scanning
This commit is contained in:
parent
01107e0d39
commit
c4a8fd7969
@ -119,11 +119,18 @@ class file : public entry {
|
|||||||
void accept(entry_visitor& v, bool preorder) override;
|
void accept(entry_visitor& v, bool preorder) override;
|
||||||
uint32_t inode_num() const override;
|
uint32_t inode_num() const override;
|
||||||
void scan(os_access& os, progress& prog) override;
|
void scan(os_access& os, progress& prog) override;
|
||||||
|
void create_data();
|
||||||
|
void hardlink(file* other, progress& prog);
|
||||||
|
uint64_t raw_inode_num() const;
|
||||||
|
unsigned num_hard_links() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
using hash_type = std::array<char, 20>;
|
struct data {
|
||||||
|
using hash_type = std::array<char, 20>;
|
||||||
|
hash_type hash{0};
|
||||||
|
};
|
||||||
|
|
||||||
hash_type hash_{0};
|
std::shared_ptr<data> data_;
|
||||||
std::shared_ptr<inode> inode_;
|
std::shared_ptr<inode> inode_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -62,12 +62,14 @@ class progress {
|
|||||||
std::atomic<size_t> links_scanned{0};
|
std::atomic<size_t> links_scanned{0};
|
||||||
std::atomic<size_t> specials_found{0};
|
std::atomic<size_t> specials_found{0};
|
||||||
std::atomic<size_t> duplicate_files{0};
|
std::atomic<size_t> duplicate_files{0};
|
||||||
|
std::atomic<size_t> hardlinks{0};
|
||||||
std::atomic<size_t> block_count{0};
|
std::atomic<size_t> block_count{0};
|
||||||
std::atomic<size_t> chunk_count{0};
|
std::atomic<size_t> chunk_count{0};
|
||||||
std::atomic<size_t> inodes_written{0};
|
std::atomic<size_t> inodes_written{0};
|
||||||
std::atomic<size_t> blocks_written{0};
|
std::atomic<size_t> blocks_written{0};
|
||||||
std::atomic<size_t> errors{0};
|
std::atomic<size_t> errors{0};
|
||||||
std::atomic<uint64_t> original_size{0};
|
std::atomic<uint64_t> original_size{0};
|
||||||
|
std::atomic<uint64_t> hardlink_size{0};
|
||||||
std::atomic<uint64_t> saved_by_deduplication{0};
|
std::atomic<uint64_t> saved_by_deduplication{0};
|
||||||
std::atomic<uint64_t> saved_by_segmentation{0};
|
std::atomic<uint64_t> saved_by_segmentation{0};
|
||||||
std::atomic<uint64_t> filesystem_size{0};
|
std::atomic<uint64_t> filesystem_size{0};
|
||||||
|
@ -121,7 +121,8 @@ void console_writer::update(const progress& p, bool last) {
|
|||||||
|
|
||||||
<< "scanned/found: " << p.dirs_scanned << "/" << p.dirs_found
|
<< "scanned/found: " << p.dirs_scanned << "/" << p.dirs_found
|
||||||
<< " dirs, " << p.links_scanned << "/" << p.links_found << " links, "
|
<< " dirs, " << p.links_scanned << "/" << p.links_found << " links, "
|
||||||
<< p.files_scanned << "/" << p.files_found << " files" << newline
|
<< p.files_scanned << "/" << p.files_found << "(" << p.hardlinks
|
||||||
|
<< ") files" << newline
|
||||||
|
|
||||||
<< "original size: " << size_with_unit(p.original_size)
|
<< "original size: " << size_with_unit(p.original_size)
|
||||||
<< ", dedupe: " << size_with_unit(p.saved_by_deduplication) << " ("
|
<< ", dedupe: " << size_with_unit(p.saved_by_deduplication) << " ("
|
||||||
|
@ -144,7 +144,8 @@ uint64_t entry::get_ctime() const { return stat_.st_ctime; }
|
|||||||
void entry::set_ctime(uint64_t ctime) { stat_.st_atime = ctime; }
|
// Overrides the status-change time stored in this entry's stat record.
// Writes st_ctime, the same field get_ctime() reads back; the previous code
// assigned st_atime, which clobbered the access time and left ctime unchanged.
void entry::set_ctime(uint64_t ctime) { stat_.st_ctime = ctime; }
|
||||||
|
|
||||||
std::string_view file::hash() const {
|
std::string_view file::hash() const {
|
||||||
return std::string_view(&hash_[0], hash_.size());
|
auto& h = data_->hash;
|
||||||
|
return std::string_view(&h[0], h.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
void file::set_inode(std::shared_ptr<inode> ino) {
|
void file::set_inode(std::shared_ptr<inode> ino) {
|
||||||
@ -162,18 +163,37 @@ uint32_t file::inode_num() const { return inode_->num(); }
|
|||||||
void file::accept(entry_visitor& v, bool) { v.visit(this); }
|
// Visitor hook for regular files: passes this object to the visitor.
// The second (preorder) argument is not used here.
void file::accept(entry_visitor& v, bool /*preorder*/) {
  v.visit(this);
}
|
||||||
|
|
||||||
void file::scan(os_access& os, progress& prog) {
|
void file::scan(os_access& os, progress& prog) {
|
||||||
static_assert(SHA_DIGEST_LENGTH == sizeof(hash_type));
|
static_assert(SHA_DIGEST_LENGTH == sizeof(data::hash_type));
|
||||||
|
|
||||||
if (size_t s = size(); s > 0) {
|
if (size_t s = size(); s > 0) {
|
||||||
prog.original_size += s;
|
prog.original_size += s;
|
||||||
auto mm = os.map_file(path(), s);
|
auto mm = os.map_file(path(), s);
|
||||||
::SHA1(mm->as<unsigned char>(), s,
|
::SHA1(mm->as<unsigned char>(), s,
|
||||||
reinterpret_cast<unsigned char*>(&hash_[0]));
|
reinterpret_cast<unsigned char*>(&data_->hash[0]));
|
||||||
} else {
|
} else {
|
||||||
::SHA1(nullptr, 0, reinterpret_cast<unsigned char*>(&hash_[0]));
|
::SHA1(nullptr, 0, reinterpret_cast<unsigned char*>(&data_->hash[0]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the st_ino value from this file's status() record, widened to
// 64 bits.
uint64_t file::raw_inode_num() const {
  const auto& st = status();
  return st.st_ino;
}
|
||||||
|
|
||||||
|
// Returns the st_nlink value from this file's status() record, i.e. the
// link count reported for the file.
unsigned file::num_hard_links() const {
  const auto& st = status();
  return st.st_nlink;
}
|
||||||
|
|
||||||
|
// Allocates a fresh `data` record (the holder of the content hash) for this
// file. Expects that no record is attached yet.
void file::create_data() {
|
||||||
|
// Debug-build check: a record must not already be attached.
assert(!data_);
|
||||||
|
data_ = std::make_shared<data>();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Makes this file share the `data` record of `other` instead of getting its
// own, and accounts for its size in the progress counters.
//   other: file whose data record is adopted; it must already have one.
//   prog:  progress counters; original_size and hardlink_size are updated.
void file::hardlink(file* other, progress& prog) {
|
||||||
|
// Debug-build checks: this file has no record yet, `other` has one.
assert(!data_);
|
||||||
|
assert(other->data_);
|
||||||
|
// Only non-empty files contribute to the size counters.
if (size_t s = size(); s > 0) {
|
||||||
|
prog.original_size += s;
|
||||||
|
prog.hardlink_size += s;
|
||||||
|
}
|
||||||
|
// Share the record: both files now refer to the same hash storage.
data_ = other->data_;
|
||||||
|
}
|
||||||
|
|
||||||
entry::type_t dir::type() const { return E_DIR; }
|
// Reports the entry kind of a directory object: always E_DIR.
entry::type_t dir::type() const {
  return E_DIR;
}
|
||||||
|
|
||||||
void dir::add(std::shared_ptr<entry> e) { entries_.emplace_back(std::move(e)); }
|
// Appends `e` to this directory's list of entries. The pointer is taken by
// value and moved into the container.
void dir::add(std::shared_ptr<entry> e) {
  entries_.emplace_back(std::move(e));
}
|
||||||
|
@ -76,8 +76,21 @@ class scan_files_visitor : public visitor_base {
|
|||||||
, os_(os)
|
, os_(os)
|
||||||
, prog_(prog) {}
|
, prog_(prog) {}
|
||||||
|
|
||||||
// TODO: avoid scanning hardlinks multiple times
|
|
||||||
void visit(file* p) override {
|
void visit(file* p) override {
|
||||||
|
if (p->num_hard_links() > 1) {
|
||||||
|
auto ino = p->raw_inode_num();
|
||||||
|
auto [it, is_new] = cache_.emplace(ino, p);
|
||||||
|
|
||||||
|
if (!is_new) {
|
||||||
|
p->hardlink(it->second, prog_);
|
||||||
|
prog_.files_scanned++;
|
||||||
|
prog_.hardlinks++;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
p->create_data();
|
||||||
|
|
||||||
wg_.add_job([=] {
|
wg_.add_job([=] {
|
||||||
prog_.current.store(p);
|
prog_.current.store(p);
|
||||||
p->scan(os_, prog_);
|
p->scan(os_, prog_);
|
||||||
@ -89,6 +102,7 @@ class scan_files_visitor : public visitor_base {
|
|||||||
worker_group& wg_;
|
worker_group& wg_;
|
||||||
os_access& os_;
|
os_access& os_;
|
||||||
progress& prog_;
|
progress& prog_;
|
||||||
|
std::unordered_map<uint64_t, file*> cache_;
|
||||||
};
|
};
|
||||||
|
|
||||||
class file_deduplication_visitor : public visitor_base {
|
class file_deduplication_visitor : public visitor_base {
|
||||||
|
@ -136,6 +136,7 @@ class os_access_mock : public os_access {
|
|||||||
st->st_mtime = 234;
|
st->st_mtime = 234;
|
||||||
st->st_ctime = 345;
|
st->st_ctime = 345;
|
||||||
st->st_rdev = sst.st_rdev;
|
st->st_rdev = sst.st_rdev;
|
||||||
|
st->st_nlink = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string readlink(const std::string& path, size_t size) const override {
|
std::string readlink(const std::string& path, size_t size) const override {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user