From c4a8fd79697966367a7f1fde201e326444c9b6f3 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 8 Dec 2020 16:39:23 +0100 Subject: [PATCH] Detect hard links and optimize code to avoid duplicate scanning --- include/dwarfs/entry.h | 11 +++++++++-- include/dwarfs/progress.h | 2 ++ src/dwarfs/console_writer.cpp | 3 ++- src/dwarfs/entry.cpp | 28 ++++++++++++++++++++++++---- src/dwarfs/scanner.cpp | 16 +++++++++++++++- test/dwarfs.cpp | 1 + 6 files changed, 53 insertions(+), 8 deletions(-) diff --git a/include/dwarfs/entry.h b/include/dwarfs/entry.h index ffd77935..08ebfc4c 100644 --- a/include/dwarfs/entry.h +++ b/include/dwarfs/entry.h @@ -119,11 +119,18 @@ class file : public entry { void accept(entry_visitor& v, bool preorder) override; uint32_t inode_num() const override; void scan(os_access& os, progress& prog) override; + void create_data(); + void hardlink(file* other, progress& prog); + uint64_t raw_inode_num() const; + unsigned num_hard_links() const; private: - using hash_type = std::array; + struct data { + using hash_type = std::array; + hash_type hash{0}; + }; - hash_type hash_{0}; + std::shared_ptr data_; std::shared_ptr inode_; }; diff --git a/include/dwarfs/progress.h b/include/dwarfs/progress.h index 9c900bf9..c38fbbe1 100644 --- a/include/dwarfs/progress.h +++ b/include/dwarfs/progress.h @@ -62,12 +62,14 @@ class progress { std::atomic links_scanned{0}; std::atomic specials_found{0}; std::atomic duplicate_files{0}; + std::atomic hardlinks{0}; std::atomic block_count{0}; std::atomic chunk_count{0}; std::atomic inodes_written{0}; std::atomic blocks_written{0}; std::atomic errors{0}; std::atomic original_size{0}; + std::atomic hardlink_size{0}; std::atomic saved_by_deduplication{0}; std::atomic saved_by_segmentation{0}; std::atomic filesystem_size{0}; diff --git a/src/dwarfs/console_writer.cpp b/src/dwarfs/console_writer.cpp index ffb92187..828c36dd 100644 --- a/src/dwarfs/console_writer.cpp +++ b/src/dwarfs/console_writer.cpp @@ -121,7 +121,8 @@ void console_writer::update(const progress& p, bool last) { << "scanned/found: " << p.dirs_scanned << "/" << p.dirs_found << " dirs, " << p.links_scanned << "/" << p.links_found << " links, " - << p.files_scanned << "/" << p.files_found << " files" << newline + << p.files_scanned << "/" << p.files_found << "(" << p.hardlinks + << ") files" << newline << "original size: " << size_with_unit(p.original_size) << ", dedupe: " << size_with_unit(p.saved_by_deduplication) << " (" diff --git a/src/dwarfs/entry.cpp b/src/dwarfs/entry.cpp index cedff45f..844a70f7 100644 --- a/src/dwarfs/entry.cpp +++ b/src/dwarfs/entry.cpp @@ -144,7 +144,8 @@ uint64_t entry::get_ctime() const { return stat_.st_ctime; } void entry::set_ctime(uint64_t ctime) { stat_.st_atime = ctime; } std::string_view file::hash() const { - return std::string_view(&hash_[0], hash_.size()); + auto& h = data_->hash; + return std::string_view(&h[0], h.size()); } void file::set_inode(std::shared_ptr ino) { @@ -162,18 +163,37 @@ uint32_t file::inode_num() const { return inode_->num(); } void file::accept(entry_visitor& v, bool) { v.visit(this); } void file::scan(os_access& os, progress& prog) { - static_assert(SHA_DIGEST_LENGTH == sizeof(hash_type)); + static_assert(SHA_DIGEST_LENGTH == sizeof(data::hash_type)); if (size_t s = size(); s > 0) { prog.original_size += s; auto mm = os.map_file(path(), s); ::SHA1(mm->as(), s, - reinterpret_cast(&hash_[0])); + reinterpret_cast(&data_->hash[0])); } else { - ::SHA1(nullptr, 0, reinterpret_cast(&hash_[0])); + ::SHA1(nullptr, 0, reinterpret_cast(&data_->hash[0])); } } +uint64_t file::raw_inode_num() const { return status().st_ino; } + +unsigned file::num_hard_links() const { return status().st_nlink; } + +void file::create_data() { + assert(!data_); + data_ = std::make_shared(); +} + +void file::hardlink(file* other, progress& prog) { + assert(!data_); + assert(other->data_); + if (size_t s = size(); s > 0) { + prog.original_size += s; + prog.hardlink_size += s; + } + data_ = other->data_; +} + entry::type_t dir::type() const { return E_DIR; } void dir::add(std::shared_ptr e) { entries_.emplace_back(std::move(e)); } diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index a6c2fdd2..b770bfa7 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -76,8 +76,21 @@ class scan_files_visitor : public visitor_base { , os_(os) , prog_(prog) {} - // TODO: avoid scanning hardlinks multiple times void visit(file* p) override { + if (p->num_hard_links() > 1) { + auto ino = p->raw_inode_num(); + auto [it, is_new] = cache_.emplace(ino, p); + + if (!is_new) { + p->hardlink(it->second, prog_); + prog_.files_scanned++; + prog_.hardlinks++; + return; + } + } + + p->create_data(); + wg_.add_job([=] { prog_.current.store(p); p->scan(os_, prog_); @@ -89,6 +102,7 @@ class scan_files_visitor : public visitor_base { worker_group& wg_; os_access& os_; progress& prog_; + std::unordered_map cache_; }; class file_deduplication_visitor : public visitor_base { diff --git a/test/dwarfs.cpp b/test/dwarfs.cpp index eb54b962..26bfb04c 100644 --- a/test/dwarfs.cpp +++ b/test/dwarfs.cpp @@ -136,6 +136,7 @@ class os_access_mock : public os_access { st->st_mtime = 234; st->st_ctime = 345; st->st_rdev = sst.st_rdev; + st->st_nlink = 1; } std::string readlink(const std::string& path, size_t size) const override {