mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-13 14:27:30 -04:00
Optimize scanning
This parallelizes scanning so that each file only has to be touched twice (scanning + segmenting) instead of three times (file scanning, inode scanning, segmenting).
This commit is contained in:
parent
592c7376ce
commit
29acde87b5
@ -48,6 +48,7 @@ class link;
|
||||
class dir;
|
||||
class device;
|
||||
class inode;
|
||||
class mmif;
|
||||
class os_access;
|
||||
class progress;
|
||||
class global_entry_data;
|
||||
@ -124,6 +125,7 @@ class file : public entry {
|
||||
std::shared_ptr<inode> get_inode() const;
|
||||
void accept(entry_visitor& v, bool preorder) override;
|
||||
void scan(os_access& os, progress& prog) override;
|
||||
void scan(std::shared_ptr<mmif> const& mm, progress& prog);
|
||||
void create_data();
|
||||
void hardlink(file* other, progress& prog);
|
||||
uint32_t unique_file_id() const;
|
||||
|
@ -22,6 +22,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include <folly/small_vector.h>
|
||||
@ -35,7 +36,7 @@ class chunk;
|
||||
}
|
||||
|
||||
class file;
|
||||
class os_access;
|
||||
class mmif;
|
||||
|
||||
struct inode_options;
|
||||
|
||||
@ -44,7 +45,9 @@ class inode : public object {
|
||||
using files_vector = folly::small_vector<file*, 1>;
|
||||
|
||||
virtual void set_files(files_vector&& fv) = 0;
|
||||
virtual void scan(os_access& os, inode_options const& options) = 0;
|
||||
virtual void
|
||||
scan(std::shared_ptr<mmif> const& mm, inode_options const& options) = 0;
|
||||
virtual void set_num(uint32_t num) = 0;
|
||||
virtual uint32_t num() const = 0;
|
||||
virtual uint32_t similarity_hash() const = 0;
|
||||
virtual std::vector<uint64_t> const& nilsimsa_similarity_hash() const = 0;
|
||||
|
@ -158,13 +158,22 @@ std::shared_ptr<inode> file::get_inode() const { return inode_; }
|
||||
void file::accept(entry_visitor& v, bool) { v.visit(this); }
|
||||
|
||||
// Convenience overload: maps the file via `os` and forwards to
// scan(std::shared_ptr<mmif> const&, progress&).
// For a zero-sized file nothing is mapped and a null `mm` is forwarded.
void file::scan(os_access& os, progress& prog) {
|
||||
std::shared_ptr<mmif> mm;
|
||||
|
||||
// only map when there is at least one byte of content
if (size_t s = size(); s > 0) {
|
||||
mm = os.map_file(path(), s);
|
||||
}
|
||||
|
||||
scan(mm, prog);
|
||||
}
|
||||
|
||||
void file::scan(std::shared_ptr<mmif> const& mm, progress& prog) {
|
||||
constexpr auto alg = checksum::algorithm::SHA1;
|
||||
static_assert(checksum::digest_size(alg) == sizeof(data::hash_type));
|
||||
|
||||
if (size_t s = size(); s > 0) {
|
||||
constexpr size_t chunk_size = 16 << 20;
|
||||
constexpr size_t chunk_size = 32 << 20;
|
||||
prog.original_size += s;
|
||||
auto mm = os.map_file(path(), s);
|
||||
checksum cs(alg);
|
||||
size_t offset = 0;
|
||||
|
||||
|
@ -37,7 +37,6 @@
|
||||
#include "dwarfs/mmif.h"
|
||||
#include "dwarfs/nilsimsa.h"
|
||||
#include "dwarfs/options.h"
|
||||
#include "dwarfs/os_access.h"
|
||||
#include "dwarfs/progress.h"
|
||||
#include "dwarfs/script.h"
|
||||
#include "dwarfs/similarity.h"
|
||||
@ -90,10 +89,12 @@ class inode_ : public inode {
|
||||
public:
|
||||
using chunk_type = thrift::metadata::chunk;
|
||||
|
||||
inode_(uint32_t n)
|
||||
: num_{n} {}
|
||||
// Stores the inode number. DWARFS_CHECK is given `!num_`, so the call is
// expected to happen only while `num_` is still unset (presumably at most
// once per inode -- what the macro does on failure is not visible here).
void set_num(uint32_t num) override {
|
||||
DWARFS_CHECK(!num_, "attempt to set inode number multiple times");
|
||||
num_ = num;
|
||||
}
|
||||
|
||||
uint32_t num() const override { return num_; }
|
||||
uint32_t num() const override { return num_.value(); }
|
||||
|
||||
uint32_t similarity_hash() const override {
|
||||
if (files_.empty()) {
|
||||
@ -117,12 +118,9 @@ class inode_ : public inode {
|
||||
files_ = std::move(fv);
|
||||
}
|
||||
|
||||
void scan(os_access& os, inode_options const& opts) override {
|
||||
void
|
||||
scan(std::shared_ptr<mmif> const& mm, inode_options const& opts) override {
|
||||
if (opts.needs_scan()) {
|
||||
auto file = files_.front();
|
||||
auto size = file->size();
|
||||
|
||||
if (size > 0) {
|
||||
similarity sc;
|
||||
nilsimsa nc;
|
||||
|
||||
@ -136,9 +134,9 @@ class inode_ : public inode {
|
||||
}
|
||||
};
|
||||
|
||||
constexpr size_t chunk_size = 16 << 20;
|
||||
auto mm = os.map_file(file->path(), size);
|
||||
constexpr size_t chunk_size = 32 << 20;
|
||||
size_t offset = 0;
|
||||
size_t size = mm->size();
|
||||
|
||||
while (size >= chunk_size) {
|
||||
update_hashes(mm->as<uint8_t>(offset), chunk_size);
|
||||
@ -158,7 +156,6 @@ class inode_ : public inode {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void add_chunk(size_t block, size_t offset, size_t size) override {
|
||||
chunk_type c;
|
||||
@ -184,7 +181,7 @@ class inode_ : public inode {
|
||||
}
|
||||
|
||||
private:
|
||||
uint32_t const num_;
|
||||
std::optional<uint32_t> num_;
|
||||
uint32_t similarity_hash_{0};
|
||||
files_vector files_;
|
||||
std::vector<chunk_type> chunks_;
|
||||
@ -201,7 +198,7 @@ class inode_manager_ final : public inode_manager::impl {
|
||||
, prog_(prog) {}
|
||||
|
||||
std::shared_ptr<inode> create_inode() override {
|
||||
auto ino = std::make_shared<inode_>(inodes_.size());
|
||||
auto ino = std::make_shared<inode_>();
|
||||
inodes_.push_back(ino);
|
||||
return ino;
|
||||
}
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include <ctime>
|
||||
#include <deque>
|
||||
#include <iterator>
|
||||
#include <mutex>
|
||||
#include <numeric>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
@ -76,119 +77,141 @@ class visitor_base : public entry_visitor {
|
||||
|
||||
class scan_files_visitor : public visitor_base {
|
||||
public:
|
||||
scan_files_visitor(worker_group& wg, os_access& os, progress& prog,
|
||||
uint32_t& inode_num)
|
||||
scan_files_visitor(worker_group& wg, os_access& os, inode_manager& im,
|
||||
inode_options const& ino_opts, progress& prog)
|
||||
: wg_(wg)
|
||||
, os_(os)
|
||||
, prog_(prog)
|
||||
, inode_num_(inode_num) {}
|
||||
, im_(im)
|
||||
, ino_opts_(ino_opts)
|
||||
, prog_(prog) {}
|
||||
|
||||
void visit(file* p) override {
|
||||
if (p->num_hard_links() > 1) {
|
||||
auto ino = p->raw_inode_num();
|
||||
auto [it, is_new] = cache_.emplace(ino, p);
|
||||
auto [it, is_new] = hardlink_cache_.emplace(ino, p);
|
||||
|
||||
if (!is_new) {
|
||||
p->hardlink(it->second, prog_);
|
||||
++prog_.files_scanned;
|
||||
hardlinked_.push_back(p);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
p->create_data();
|
||||
++inode_num_;
|
||||
|
||||
wg_.add_job([=] {
|
||||
auto const size = p->size();
|
||||
std::shared_ptr<mmif> mm;
|
||||
|
||||
if (size > 0) {
|
||||
mm = os_.map_file(p->path(), size);
|
||||
}
|
||||
|
||||
prog_.current.store(p);
|
||||
p->scan(os_, prog_);
|
||||
p->scan(mm, prog_);
|
||||
++prog_.files_scanned;
|
||||
std::shared_ptr<inode> inode;
|
||||
|
||||
{
|
||||
std::lock_guard lock(mx_);
|
||||
auto& ref = hash_[p->hash()];
|
||||
if (ref.empty()) {
|
||||
inode = im_.create_inode();
|
||||
p->set_inode(inode);
|
||||
} else {
|
||||
p->set_inode(ref.front()->get_inode());
|
||||
}
|
||||
ref.push_back(p);
|
||||
}
|
||||
|
||||
if (inode) {
|
||||
if (ino_opts_.needs_scan()) {
|
||||
if (mm) {
|
||||
inode->scan(mm, ino_opts_);
|
||||
}
|
||||
++prog_.inodes_scanned;
|
||||
}
|
||||
} else {
|
||||
++prog_.duplicate_files;
|
||||
prog_.saved_by_deduplication += size;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Completes the file scan: attaches the deferred hardlinks to their inodes
// and then numbers all inodes.
// `inode_num` is passed by reference and forwarded to finalize_inodes(), which
// advances it for every file that receives an inode number.
// In this diff the caller invokes it after wg_.wait(), i.e. presumably once
// all scan jobs queued by visit() have finished.
void finalize(uint32_t& inode_num) {
|
||||
hardlink_cache_.clear();
|
||||
|
||||
// files collected in hardlinked_ by visit() join the file list of their hash
// and take over the inode of the first file in that list
for (auto p : hardlinked_) {
|
||||
// NOTE(review): operator[] presumably default-inserts an empty vector when
// the hash is missing, in which case fv.front() below would be invalid --
// confirm that the hardlink target has always been hashed by this point.
auto& fv = hash_[p->hash()];
|
||||
p->set_inode(fv.front()->get_inode());
|
||||
fv.push_back(p);
|
||||
}
|
||||
|
||||
hardlinked_.clear();
|
||||
|
||||
// running object number shared by both passes below
uint32_t obj_num = 0;
|
||||
|
||||
// two passes over hash_: first with Unique = true, then with Unique = false
finalize_inodes<true>(inode_num, obj_num);
|
||||
finalize_inodes<false>(inode_num, obj_num);
|
||||
|
||||
hash_.clear();
|
||||
}
|
||||
|
||||
uint32_t num_unique() const { return num_unique_; }
|
||||
|
||||
private:
|
||||
worker_group& wg_;
|
||||
os_access& os_;
|
||||
progress& prog_;
|
||||
folly::F14FastMap<uint64_t, file*> cache_;
|
||||
uint32_t& inode_num_;
|
||||
};
|
||||
|
||||
class file_deduplication_visitor : public visitor_base {
|
||||
public:
|
||||
file_deduplication_visitor(uint32_t first_file_inode)
|
||||
: inode_num_{first_file_inode} {}
|
||||
|
||||
void visit(file* p) override { hash_[p->hash()].push_back(p); }
|
||||
|
||||
void deduplicate_files(worker_group& wg, os_access& os, inode_manager& im,
|
||||
inode_options const& ino_opts, progress& prog) {
|
||||
auto check_scan = [&](auto inode) {
|
||||
if (ino_opts.needs_scan()) {
|
||||
wg.add_job([&, inode = std::move(inode)] {
|
||||
prog.current = inode->any();
|
||||
inode->scan(os, ino_opts);
|
||||
++prog.inodes_scanned;
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
for (auto& p : hash_) {
|
||||
if (p.second.size() > p.second.front()->refcount()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto fp = p.second.front();
|
||||
auto inode = im.create_inode();
|
||||
|
||||
++num_unique_;
|
||||
|
||||
fp->set_inode_num(inode_num_++);
|
||||
fp->set_inode(inode);
|
||||
|
||||
inode->set_files(std::move(p.second));
|
||||
|
||||
check_scan(std::move(inode));
|
||||
}
|
||||
|
||||
template <bool Unique>
|
||||
void finalize_inodes(uint32_t& inode_num, uint32_t& obj_num) {
|
||||
for (auto& p : hash_) {
|
||||
auto& files = p.second;
|
||||
|
||||
if constexpr (Unique) {
|
||||
std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
|
||||
return a->path() < b->path();
|
||||
});
|
||||
|
||||
// this is true regardless of how the files are ordered
|
||||
if (files.size() > files.front()->refcount()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
++num_unique_;
|
||||
} else {
|
||||
if (files.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file");
|
||||
|
||||
std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
|
||||
return a->path() < b->path();
|
||||
});
|
||||
|
||||
auto inode = im.create_inode();
|
||||
}
|
||||
|
||||
for (auto fp : files) {
|
||||
// need to check because hardlinks share the same number
|
||||
if (!fp->inode_num()) {
|
||||
fp->set_inode_num(inode_num_++);
|
||||
fp->set_inode_num(inode_num);
|
||||
++inode_num;
|
||||
}
|
||||
fp->set_inode(inode);
|
||||
}
|
||||
|
||||
auto dupes = files.size() - 1;
|
||||
prog.duplicate_files += dupes;
|
||||
prog.saved_by_deduplication += dupes * files.front()->size();
|
||||
|
||||
auto fp = files.front();
|
||||
auto inode = fp->get_inode();
|
||||
inode->set_num(obj_num);
|
||||
inode->set_files(std::move(files));
|
||||
|
||||
check_scan(std::move(inode));
|
||||
++obj_num;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t inode_num_end() const { return inode_num_; }
|
||||
uint32_t num_unique() const { return num_unique_; }
|
||||
|
||||
private:
|
||||
folly::F14FastMap<std::string_view, inode::files_vector> hash_;
|
||||
uint32_t inode_num_;
|
||||
worker_group& wg_;
|
||||
os_access& os_;
|
||||
inode_manager& im_;
|
||||
inode_options const& ino_opts_;
|
||||
progress& prog_;
|
||||
uint32_t num_unique_{0};
|
||||
std::vector<file*> hardlinked_;
|
||||
folly::F14FastMap<uint64_t, file*> hardlink_cache_;
|
||||
std::mutex mx_;
|
||||
folly::F14FastMap<std::string_view, inode::files_vector> hash_;
|
||||
};
|
||||
|
||||
class dir_set_inode_visitor : public visitor_base {
|
||||
@ -298,6 +321,8 @@ class save_directories_visitor : public visitor_base {
|
||||
dummy.parent_entry = 0;
|
||||
dummy.first_entry = mv2.dir_entries_ref()->size();
|
||||
mv2.directories.push_back(dummy);
|
||||
|
||||
directories_.clear();
|
||||
}
|
||||
|
||||
private:
|
||||
@ -580,36 +605,24 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
link_set_inode_visitor lsiv(first_file_inode);
|
||||
root->accept(lsiv, true);
|
||||
|
||||
inode_manager im(lgr_, prog);
|
||||
|
||||
// now scan all files
|
||||
uint32_t first_device_inode = first_file_inode;
|
||||
scan_files_visitor sfv(wg_, *os_, prog, first_device_inode);
|
||||
scan_files_visitor sfv(wg_, *os_, im, options_.inode, prog);
|
||||
root->accept(sfv);
|
||||
|
||||
LOG_INFO << "waiting for background scanners...";
|
||||
wg_.wait();
|
||||
|
||||
LOG_INFO << "finding duplicate files...";
|
||||
|
||||
inode_manager im(lgr_, prog);
|
||||
|
||||
file_deduplication_visitor fdv(first_file_inode);
|
||||
root->accept(fdv);
|
||||
|
||||
fdv.deduplicate_files(wg_, *os_, im, options_.inode, prog);
|
||||
|
||||
DWARFS_CHECK(fdv.inode_num_end() == first_device_inode,
|
||||
"inconsistent inode numbers");
|
||||
LOG_INFO << "finalizing file inodes...";
|
||||
uint32_t first_device_inode = first_file_inode;
|
||||
sfv.finalize(first_device_inode);
|
||||
|
||||
LOG_INFO << "saved " << size_with_unit(prog.saved_by_deduplication) << " / "
|
||||
<< size_with_unit(prog.original_size) << " in "
|
||||
<< prog.duplicate_files << "/" << prog.files_found
|
||||
<< " duplicate files";
|
||||
|
||||
if (options_.inode.needs_scan()) {
|
||||
LOG_INFO << "waiting for inode scanners...";
|
||||
wg_.wait();
|
||||
}
|
||||
|
||||
global_entry_data ge_data(options_);
|
||||
thrift::metadata::metadata mv2;
|
||||
|
||||
@ -725,7 +738,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
|
||||
LOG_INFO << "saving shared files table...";
|
||||
save_shared_files_visitor ssfv(first_file_inode, first_device_inode,
|
||||
fdv.num_unique());
|
||||
sfv.num_unique());
|
||||
root->accept(ssfv);
|
||||
if (options_.pack_shared_files_table) {
|
||||
ssfv.pack_shared_files();
|
||||
|
Loading…
x
Reference in New Issue
Block a user