From 29acde87b5ca6b6f53b0db6eb8ce1743146336b0 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Sun, 21 Mar 2021 12:43:35 +0100 Subject: [PATCH] Optimize scanning This parallelizes scanning so that files only have to be touched twice (scanning + segmenting) instead of three times (scanning, scanning again, segmenting). --- include/dwarfs/entry.h | 2 + include/dwarfs/inode.h | 7 +- src/dwarfs/entry.cpp | 13 ++- src/dwarfs/inode_manager.cpp | 73 ++++++------ src/dwarfs/scanner.cpp | 209 +++++++++++++++++++---------------- 5 files changed, 164 insertions(+), 140 deletions(-) diff --git a/include/dwarfs/entry.h b/include/dwarfs/entry.h index 54aa08ad..8a38de92 100644 --- a/include/dwarfs/entry.h +++ b/include/dwarfs/entry.h @@ -48,6 +48,7 @@ class link; class dir; class device; class inode; +class mmif; class os_access; class progress; class global_entry_data; @@ -124,6 +125,7 @@ class file : public entry { std::shared_ptr get_inode() const; void accept(entry_visitor& v, bool preorder) override; void scan(os_access& os, progress& prog) override; + void scan(std::shared_ptr const& mm, progress& prog); void create_data(); void hardlink(file* other, progress& prog); uint32_t unique_file_id() const; diff --git a/include/dwarfs/inode.h b/include/dwarfs/inode.h index d699c1be..45266929 100644 --- a/include/dwarfs/inode.h +++ b/include/dwarfs/inode.h @@ -22,6 +22,7 @@ #pragma once #include +#include #include #include @@ -35,7 +36,7 @@ class chunk; } class file; -class os_access; +class mmif; struct inode_options; @@ -44,7 +45,9 @@ class inode : public object { using files_vector = folly::small_vector; virtual void set_files(files_vector&& fv) = 0; - virtual void scan(os_access& os, inode_options const& options) = 0; + virtual void + scan(std::shared_ptr const& mm, inode_options const& options) = 0; + virtual void set_num(uint32_t num) = 0; virtual uint32_t num() const = 0; virtual uint32_t similarity_hash() const = 0; virtual std::vector const& nilsimsa_similarity_hash() const = 0; diff --git a/src/dwarfs/entry.cpp b/src/dwarfs/entry.cpp index 62dbc4e8..6988f519 100644 --- a/src/dwarfs/entry.cpp +++ b/src/dwarfs/entry.cpp @@ -158,13 +158,22 @@ std::shared_ptr file::get_inode() const { return inode_; } void file::accept(entry_visitor& v, bool) { v.visit(this); } void file::scan(os_access& os, progress& prog) { + std::shared_ptr mm; + + if (size_t s = size(); s > 0) { + mm = os.map_file(path(), s); + } + + scan(mm, prog); +} + +void file::scan(std::shared_ptr const& mm, progress& prog) { constexpr auto alg = checksum::algorithm::SHA1; static_assert(checksum::digest_size(alg) == sizeof(data::hash_type)); if (size_t s = size(); s > 0) { - constexpr size_t chunk_size = 16 << 20; + constexpr size_t chunk_size = 32 << 20; prog.original_size += s; - auto mm = os.map_file(path(), s); checksum cs(alg); size_t offset = 0; diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index 0b67d1b9..d628d4e8 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -37,7 +37,6 @@ #include "dwarfs/mmif.h" #include "dwarfs/nilsimsa.h" #include "dwarfs/options.h" -#include "dwarfs/os_access.h" #include "dwarfs/progress.h" #include "dwarfs/script.h" #include "dwarfs/similarity.h" @@ -90,10 +89,12 @@ class inode_ : public inode { public: using chunk_type = thrift::metadata::chunk; - inode_(uint32_t n) - : num_{n} {} + void set_num(uint32_t num) override { + DWARFS_CHECK(!num_, "attempt to set inode number multiple times"); + num_ = num; + } - uint32_t num() const override { return num_; } + uint32_t num() const override { return num_.value(); } uint32_t similarity_hash() const override { if (files_.empty()) { @@ -117,45 +118,41 @@ class inode_ : public inode { files_ = std::move(fv); } - void scan(os_access& os, inode_options const& opts) override { + void + scan(std::shared_ptr const& mm, inode_options const& opts) override { if (opts.needs_scan()) { - auto file = files_.front(); - auto size = file->size(); - - if (size > 0) { - similarity sc; - nilsimsa nc; - - auto update_hashes = [&](uint8_t const* data, size_t size) { - if (opts.with_similarity) { - sc.update(data, size); - } - - if (opts.with_nilsimsa) { - nc.update(data, size); - } - }; - - constexpr size_t chunk_size = 16 << 20; - auto mm = os.map_file(file->path(), size); - size_t offset = 0; - - while (size >= chunk_size) { - update_hashes(mm->as(offset), chunk_size); - mm->release_until(offset); - offset += chunk_size; - size -= chunk_size; - } - - update_hashes(mm->as(offset), size); + similarity sc; + nilsimsa nc; + auto update_hashes = [&](uint8_t const* data, size_t size) { if (opts.with_similarity) { - similarity_hash_ = sc.finalize(); + sc.update(data, size); } if (opts.with_nilsimsa) { - nilsimsa_similarity_hash_ = nc.finalize(); + nc.update(data, size); } + }; + + constexpr size_t chunk_size = 32 << 20; + size_t offset = 0; + size_t size = mm->size(); + + while (size >= chunk_size) { + update_hashes(mm->as(offset), chunk_size); + mm->release_until(offset); + offset += chunk_size; + size -= chunk_size; + } + + update_hashes(mm->as(offset), size); + + if (opts.with_similarity) { + similarity_hash_ = sc.finalize(); + } + + if (opts.with_nilsimsa) { + nilsimsa_similarity_hash_ = nc.finalize(); } } } @@ -184,7 +181,7 @@ class inode_ : public inode { } private: - uint32_t const num_; + std::optional num_; uint32_t similarity_hash_{0}; files_vector files_; std::vector chunks_; @@ -201,7 +198,7 @@ class inode_manager_ final : public inode_manager::impl { , prog_(prog) {} std::shared_ptr create_inode() override { - auto ino = std::make_shared(inodes_.size()); + auto ino = std::make_shared(); inodes_.push_back(ino); return ino; } diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index d64d5240..46870b31 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -76,119 +77,141 @@ class visitor_base : public entry_visitor { class scan_files_visitor : public visitor_base { public: - scan_files_visitor(worker_group& wg, os_access& os, progress& prog, - uint32_t& inode_num) + scan_files_visitor(worker_group& wg, os_access& os, inode_manager& im, + inode_options const& ino_opts, progress& prog) : wg_(wg) , os_(os) - , prog_(prog) - , inode_num_(inode_num) {} + , im_(im) + , ino_opts_(ino_opts) + , prog_(prog) {} void visit(file* p) override { if (p->num_hard_links() > 1) { auto ino = p->raw_inode_num(); - auto [it, is_new] = cache_.emplace(ino, p); + auto [it, is_new] = hardlink_cache_.emplace(ino, p); if (!is_new) { p->hardlink(it->second, prog_); ++prog_.files_scanned; + hardlinked_.push_back(p); return; } } p->create_data(); - ++inode_num_; wg_.add_job([=] { + auto const size = p->size(); + std::shared_ptr mm; + + if (size > 0) { + mm = os_.map_file(p->path(), size); + } + prog_.current.store(p); - p->scan(os_, prog_); + p->scan(mm, prog_); ++prog_.files_scanned; + std::shared_ptr inode; + + { + std::lock_guard lock(mx_); + auto& ref = hash_[p->hash()]; + if (ref.empty()) { + inode = im_.create_inode(); + p->set_inode(inode); + } else { + p->set_inode(ref.front()->get_inode()); + } + ref.push_back(p); + } + + if (inode) { + if (ino_opts_.needs_scan()) { + if (mm) { + inode->scan(mm, ino_opts_); + } + ++prog_.inodes_scanned; + } + } else { + ++prog_.duplicate_files; + prog_.saved_by_deduplication += size; + } }); } - private: - worker_group& wg_; - os_access& os_; - progress& prog_; - folly::F14FastMap cache_; - uint32_t& inode_num_; -}; + void finalize(uint32_t& inode_num) { + hardlink_cache_.clear(); -class file_deduplication_visitor : public visitor_base { - public: - file_deduplication_visitor(uint32_t first_file_inode) - : inode_num_{first_file_inode} {} - - void visit(file* p) override { hash_[p->hash()].push_back(p); } - - void deduplicate_files(worker_group& wg, os_access& os, inode_manager& im, - inode_options const& ino_opts, progress& prog) { - auto check_scan = [&](auto inode) { - if (ino_opts.needs_scan()) { - wg.add_job([&, inode = std::move(inode)] { - prog.current = inode->any(); - inode->scan(os, ino_opts); - ++prog.inodes_scanned; - }); - } - }; - - for (auto& p : hash_) { - if (p.second.size() > p.second.front()->refcount()) { - continue; - } - - auto fp = p.second.front(); - auto inode = im.create_inode(); - - ++num_unique_; - - fp->set_inode_num(inode_num_++); - fp->set_inode(inode); - - inode->set_files(std::move(p.second)); - - check_scan(std::move(inode)); + for (auto p : hardlinked_) { + auto& fv = hash_[p->hash()]; + p->set_inode(fv.front()->get_inode()); + fv.push_back(p); } - for (auto& p : hash_) { - auto& files = p.second; + hardlinked_.clear(); - if (files.empty()) { - continue; - } + uint32_t obj_num = 0; - DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file"); + finalize_inodes(inode_num, obj_num); + finalize_inodes(inode_num, obj_num); - std::sort(files.begin(), files.end(), [](file const* a, file const* b) { - return a->path() < b->path(); - }); - - auto inode = im.create_inode(); - - for (auto fp : files) { - if (!fp->inode_num()) { - fp->set_inode_num(inode_num_++); - } - fp->set_inode(inode); - } - - auto dupes = files.size() - 1; - prog.duplicate_files += dupes; - prog.saved_by_deduplication += dupes * files.front()->size(); - - inode->set_files(std::move(files)); - - check_scan(std::move(inode)); - } + hash_.clear(); } - uint32_t inode_num_end() const { return inode_num_; } uint32_t num_unique() const { return num_unique_; } private: - folly::F14FastMap hash_; - uint32_t inode_num_; + template + void finalize_inodes(uint32_t& inode_num, uint32_t& obj_num) { + for (auto& p : hash_) { + auto& files = p.second; + + if constexpr (Unique) { + std::sort(files.begin(), files.end(), [](file const* a, file const* b) { + return a->path() < b->path(); + }); + + // this is true regardless of how the files are ordered + if (files.size() > files.front()->refcount()) { + continue; + } + + ++num_unique_; + } else { + if (files.empty()) { + continue; + } + + DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file"); + } + + for (auto fp : files) { + // need to check because hardlinks share the same number + if (!fp->inode_num()) { + fp->set_inode_num(inode_num); + ++inode_num; + } + } + + auto fp = files.front(); + auto inode = fp->get_inode(); + inode->set_num(obj_num); + inode->set_files(std::move(files)); + + ++obj_num; + } + } + + worker_group& wg_; + os_access& os_; + inode_manager& im_; + inode_options const& ino_opts_; + progress& prog_; uint32_t num_unique_{0}; + std::vector hardlinked_; + folly::F14FastMap hardlink_cache_; + std::mutex mx_; + folly::F14FastMap hash_; }; class dir_set_inode_visitor : public visitor_base { @@ -298,6 +321,8 @@ class save_directories_visitor : public visitor_base { dummy.parent_entry = 0; dummy.first_entry = mv2.dir_entries_ref()->size(); mv2.directories.push_back(dummy); + + directories_.clear(); } private: @@ -580,36 +605,24 @@ void scanner_::scan(filesystem_writer& fsw, link_set_inode_visitor lsiv(first_file_inode); root->accept(lsiv, true); + inode_manager im(lgr_, prog); + // now scan all files - uint32_t first_device_inode = first_file_inode; - scan_files_visitor sfv(wg_, *os_, prog, first_device_inode); + scan_files_visitor sfv(wg_, *os_, im, options_.inode, prog); root->accept(sfv); LOG_INFO << "waiting for background scanners..."; wg_.wait(); - LOG_INFO << "finding duplicate files..."; - - inode_manager im(lgr_, prog); - - file_deduplication_visitor fdv(first_file_inode); - root->accept(fdv); - - fdv.deduplicate_files(wg_, *os_, im, options_.inode, prog); - - DWARFS_CHECK(fdv.inode_num_end() == first_device_inode, - "inconsistent inode numbers"); + LOG_INFO << "finalizing file inodes..."; + uint32_t first_device_inode = first_file_inode; + sfv.finalize(first_device_inode); LOG_INFO << "saved " << size_with_unit(prog.saved_by_deduplication) << " / " << size_with_unit(prog.original_size) << " in " << prog.duplicate_files << "/" << prog.files_found << " duplicate files"; - if (options_.inode.needs_scan()) { - LOG_INFO << "waiting for inode scanners..."; - wg_.wait(); - } - global_entry_data ge_data(options_); thrift::metadata::metadata mv2; @@ -725,7 +738,7 @@ void scanner_::scan(filesystem_writer& fsw, LOG_INFO << "saving shared files table..."; save_shared_files_visitor ssfv(first_file_inode, first_device_inode, - fdv.num_unique()); + sfv.num_unique()); root->accept(ssfv); if (options_.pack_shared_files_table) { ssfv.pack_shared_files();