Optimize files table

This commit is contained in:
Marcus Holland-Moritz 2021-03-17 11:13:51 +01:00
parent ad51853c82
commit daecc748db
7 changed files with 136 additions and 64 deletions

View File

@ -82,12 +82,14 @@ class entry : public entry_interface {
global_entry_data const& data) const;
void update(global_entry_data& data) const;
virtual void accept(entry_visitor& v, bool preorder = false) = 0;
void set_inode_num(uint32_t inode_num) { inode_num_ = inode_num; }
uint32_t inode_num() const { return inode_num_; }
virtual void scan(os_access& os, progress& prog) = 0;
const struct ::stat& status() const { return stat_; }
void set_entry_index(uint32_t index) { entry_index_ = index; }
std::optional<uint32_t> entry_index() const { return entry_index_; }
std::optional<uint32_t> const& entry_index() const { return entry_index_; }
uint64_t raw_inode_num() const { return stat_.st_ino; }
uint64_t num_hard_links() const { return stat_.st_nlink; }
virtual void set_inode_num(uint32_t ino) = 0;
virtual std::optional<uint32_t> const& inode_num() const = 0;
// more methods from entry_interface
uint16_t get_permissions() const override;
@ -107,7 +109,6 @@ class entry : public entry_interface {
std::string name_;
std::weak_ptr<entry> parent_;
struct ::stat stat_;
uint32_t inode_num_{0};
std::optional<uint32_t> entry_index_;
};
@ -125,14 +126,17 @@ class file : public entry {
void scan(os_access& os, progress& prog) override;
void create_data();
void hardlink(file* other, progress& prog);
uint64_t raw_inode_num() const;
unsigned num_hard_links() const;
uint32_t unique_file_id() const;
void set_inode_num(uint32_t ino) override;
std::optional<uint32_t> const& inode_num() const override;
private:
// Per-file state held through std::shared_ptr<data>. file::hardlink() makes a
// hard-linked file point at the other file's data object and bumps refcount,
// so every member here is shared between all hard links of one file.
struct data {
  // Fixed-size, 20-byte hash buffer.
  using hash_type = std::array<char, 20>;

  // Hash bytes; all zero until filled in.
  hash_type hash{};
  // Number of file objects referring to this data; starts at one for the
  // file that created it.
  uint32_t refcount = 1;
  // Inode number shared by all hard links; empty until assigned.
  std::optional<uint32_t> inode_num = std::nullopt;
};
std::shared_ptr<data> data_;
@ -157,10 +161,16 @@ class dir : public entry {
bool empty() const { return entries_.empty(); }
void remove_empty_dirs(progress& prog);
// Stores the directory's inode number. Unlike file::set_inode_num(), this
// performs no "already set" check and simply overwrites any previous value.
void set_inode_num(uint32_t ino) override { inode_num_ = ino; }
// Returns the directory's inode number by reference; the optional is empty
// until set_inode_num() has been called.
std::optional<uint32_t> const& inode_num() const override {
return inode_num_;
}
private:
using entry_ptr = std::shared_ptr<entry>;
std::vector<std::shared_ptr<entry>> entries_;
std::optional<uint32_t> inode_num_;
};
class link : public entry {
@ -172,8 +182,14 @@ class link : public entry {
void accept(entry_visitor& v, bool preorder) override;
void scan(os_access& os, progress& prog) override;
// Stores the link's inode number. Unlike file::set_inode_num(), this
// performs no "already set" check and simply overwrites any previous value.
void set_inode_num(uint32_t ino) override { inode_num_ = ino; }
// Returns the link's inode number by reference; the optional is empty until
// set_inode_num() has been called.
std::optional<uint32_t> const& inode_num() const override {
return inode_num_;
}
private:
std::string link_;
std::optional<uint32_t> inode_num_;
};
/**
@ -188,6 +204,14 @@ class device : public entry {
void accept(entry_visitor& v, bool preorder) override;
void scan(os_access& os, progress& prog) override;
uint64_t device_id() const;
// Stores the device's inode number. Unlike file::set_inode_num(), this
// performs no "already set" check and simply overwrites any previous value.
void set_inode_num(uint32_t ino) override { inode_num_ = ino; }
// Returns the device's inode number by reference; the optional is empty
// until set_inode_num() has been called.
std::optional<uint32_t> const& inode_num() const override {
return inode_num_;
}
private:
std::optional<uint32_t> inode_num_;
};
class entry_factory {

View File

@ -45,7 +45,6 @@ class inode : public object {
virtual void set_files(files_vector&& fv) = 0;
virtual void scan(os_access& os, inode_options const& options) = 0;
virtual void set_num(uint32_t num) = 0;
virtual uint32_t num() const = 0;
virtual uint32_t similarity_hash() const = 0;
virtual std::vector<uint64_t> const& nilsimsa_similarity_hash() const = 0;

View File

@ -46,12 +46,13 @@ class inode_manager {
// Returns the inode count reported by the implementation object.
size_t count() const { return impl_->count(); }
void order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order, uint32_t first_inode,
order_cb const& fn) {
impl_->order_inodes(std::move(scr), file_order, first_inode, fn);
file_order_options const& file_order, order_cb const& fn) {
impl_->order_inodes(std::move(scr), file_order, fn);
}
void for_each_inode(inode_cb const& fn) const { impl_->for_each_inode(fn); }
// Invokes `fn` once per inode by forwarding to the implementation object.
// The inode_manager_ implementation visits inodes in ascending num() order,
// independent of how order_inodes() has rearranged them.
void for_each_inode_in_order(inode_cb const& fn) const {
impl_->for_each_inode_in_order(fn);
}
class impl {
public:
@ -59,10 +60,10 @@ class inode_manager {
virtual std::shared_ptr<inode> create_inode() = 0;
virtual size_t count() const = 0;
virtual void order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order,
uint32_t first_inode, order_cb const& fn) = 0;
virtual void for_each_inode(
virtual void
order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order, order_cb const& fn) = 0;
virtual void for_each_inode_in_order(
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
};

View File

@ -186,9 +186,16 @@ void file::scan(os_access& os, progress& prog) {
// Returns the number of the inode object this file is attached to.
// NOTE(review): inode_ is dereferenced without a check here; presumably
// callers only use this after set_inode() has run — confirm.
uint32_t file::unique_file_id() const { return inode_->num(); }
uint64_t file::raw_inode_num() const { return status().st_ino; }
// Assigns the file's inode number. The number lives in the data_ object, not
// in the file itself, so files that share data_ (see file::hardlink()) all
// observe the same number.
//
// Both preconditions are enforced with DWARFS_CHECK: data_ must already
// exist, and no inode number may have been stored in it before.
void file::set_inode_num(uint32_t inode_num) {
DWARFS_CHECK(data_, "file data unset");
// Write-once: a second assignment through any file sharing data_ is an error.
DWARFS_CHECK(!data_->inode_num, "attempt to set inode number more than once");
data_->inode_num = inode_num;
}
unsigned file::num_hard_links() const { return status().st_nlink; }
// Returns the inode number stored in the data_ object by reference; the
// optional is empty while no number has been assigned. DWARFS_CHECK enforces
// that data_ exists before it is dereferenced.
std::optional<uint32_t> const& file::inode_num() const {
DWARFS_CHECK(data_, "file data unset");
return data_->inode_num;
}
void file::create_data() {
assert(!data_);
@ -201,6 +208,7 @@ void file::hardlink(file* other, progress& prog) {
prog.hardlink_size += size();
++prog.hardlinks;
data_ = other->data_;
++data_->refcount;
}
entry::type_t dir::type() const { return E_DIR; }
@ -250,8 +258,8 @@ void dir::pack_entry(thrift::metadata::metadata& mv2,
global_entry_data const& data) const {
auto& de = mv2.dir_entries_ref()->emplace_back();
de.name_index = has_parent() ? data.get_name_index(name()) : 0;
de.inode_num = inode_num();
entry::pack(DWARFS_NOTHROW(mv2.entries.at(inode_num())), data);
de.inode_num = DWARFS_NOTHROW(inode_num().value());
entry::pack(DWARFS_NOTHROW(mv2.entries.at(de.inode_num)), data);
}
void dir::pack(thrift::metadata::metadata& mv2,
@ -272,8 +280,8 @@ void dir::pack(thrift::metadata::metadata& mv2,
e->set_entry_index(mv2.dir_entries_ref()->size());
auto& de = mv2.dir_entries_ref()->emplace_back();
de.name_index = data.get_name_index(e->name());
de.inode_num = e->inode_num();
e->pack(DWARFS_NOTHROW(mv2.entries.at(e->inode_num())), data);
de.inode_num = DWARFS_NOTHROW(e->inode_num().value());
e->pack(DWARFS_NOTHROW(mv2.entries.at(de.inode_num)), data);
}
}

View File

@ -90,7 +90,8 @@ class inode_ : public inode {
public:
using chunk_type = thrift::metadata::chunk;
void set_num(uint32_t num) override { num_ = num; }
inode_(uint32_t n)
: num_{n} {}
// Returns the number stored in num_.
uint32_t num() const override { return num_; }
@ -183,7 +184,7 @@ class inode_ : public inode {
}
private:
uint32_t num_{std::numeric_limits<uint32_t>::max()};
uint32_t const num_;
uint32_t similarity_hash_{0};
files_vector files_;
std::vector<chunk_type> chunks_;
@ -200,7 +201,7 @@ class inode_manager_ final : public inode_manager::impl {
, prog_(prog) {}
std::shared_ptr<inode> create_inode() override {
auto ino = std::make_shared<inode_>();
auto ino = std::make_shared<inode_>(inodes_.size());
inodes_.push_back(ino);
return ino;
}
@ -208,14 +209,20 @@ class inode_manager_ final : public inode_manager::impl {
size_t count() const override { return inodes_.size(); }
void order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order, uint32_t first_inode,
file_order_options const& file_order,
inode_manager::order_cb const& fn) override;
void
for_each_inode(std::function<void(std::shared_ptr<inode> const&)> const& fn)
void for_each_inode_in_order(
std::function<void(std::shared_ptr<inode> const&)> const& fn)
const override {
for (const auto& ino : inodes_) {
fn(ino);
std::vector<uint32_t> index;
index.resize(inodes_.size());
std::iota(index.begin(), index.end(), size_t(0));
std::sort(index.begin(), index.end(), [this](size_t a, size_t b) {
return inodes_[a]->num() < inodes_[b]->num();
});
for (auto i : index) {
fn(inodes_[i]);
}
}
@ -261,16 +268,9 @@ class inode_manager_ final : public inode_manager::impl {
void presort_index(std::vector<std::shared_ptr<inode>>& inodes,
std::vector<uint32_t>& index);
void
order_inodes_by_nilsimsa(inode_manager::order_cb const& fn, uint32_t inode_no,
void order_inodes_by_nilsimsa(inode_manager::order_cb const& fn,
file_order_options const& file_order);
void number_inodes(size_t first_no) {
for (auto& i : inodes_) {
i->set_num(first_no++);
}
}
std::vector<std::shared_ptr<inode>> inodes_;
LOG_PROXY_DECL(LoggerPolicy);
progress& prog_;
@ -279,7 +279,7 @@ class inode_manager_ final : public inode_manager::impl {
template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes(
std::shared_ptr<script> scr, file_order_options const& file_order,
uint32_t first_inode, inode_manager::order_cb const& fn) {
inode_manager::order_cb const& fn) {
switch (file_order.mode) {
case file_order_mode::NONE:
LOG_INFO << "keeping inode order";
@ -316,14 +316,13 @@ void inode_manager_<LoggerPolicy>::order_inodes(
LOG_INFO << "ordering " << count()
<< " inodes using nilsimsa similarity...";
auto ti = LOG_TIMED_INFO;
order_inodes_by_nilsimsa(fn, first_inode, file_order);
order_inodes_by_nilsimsa(fn, file_order);
ti << count() << " inodes ordered";
return;
}
}
LOG_INFO << "assigning file inodes...";
number_inodes(first_inode);
for (const auto& ino : inodes_) {
fn(ino);
}
@ -372,8 +371,7 @@ void inode_manager_<LoggerPolicy>::presort_index(
template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
inode_manager::order_cb const& fn, uint32_t inode_no,
file_order_options const& file_order) {
inode_manager::order_cb const& fn, file_order_options const& file_order) {
auto count = inodes_.size();
std::vector<std::shared_ptr<inode>> inodes;
@ -389,7 +387,6 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
auto finalize_inode = [&]() {
inodes_.push_back(std::move(inodes[index.back()]));
index.pop_back();
inodes_.back()->set_num(inode_no++);
return fn(inodes_.back());
};

View File

@ -571,6 +571,13 @@ void metadata_<LoggerPolicy>::dump(
}
if (auto uf = meta_.unique_files_table()) {
os << "unique_files_table: " << uf->size() << std::endl;
std::vector<uint32_t> uni;
uni.resize(meta_.chunk_table().size());
for (auto f : *uf) {
++uni.at(f);
}
os << "unique files: " << std::count(uni.begin(), uni.end(), 1)
<< std::endl;
}
os << "symlink_table_offset: " << symlink_table_offset_ << std::endl;
os << "file_index_offset: " << file_index_offset_ << std::endl;
@ -732,6 +739,7 @@ void metadata_<LoggerPolicy>::walk_data_order_impl(
});
if (auto dep = meta_.dir_entries()) {
// TODO: this is *even more complicated* now :-)
auto ufp = meta_.unique_files_table();
auto mid =
std::stable_partition(entries.begin(), entries.end(),

View File

@ -86,14 +86,13 @@ class scan_files_visitor : public visitor_base {
if (!is_new) {
p->hardlink(it->second, prog_);
p->set_inode_num(it->second->inode_num());
++prog_.files_scanned;
return;
}
}
p->create_data();
p->set_inode_num(inode_num_++);
++inode_num_;
wg_.add_job([=] {
prog_.current.store(p);
@ -112,22 +111,58 @@ class scan_files_visitor : public visitor_base {
class file_deduplication_visitor : public visitor_base {
public:
file_deduplication_visitor(uint32_t first_file_inode)
: inode_num_{first_file_inode} {}
// Appends `p` to the list of files recorded under its hash, creating the
// list on first use; files with equal hashes end up in the same list.
// NOTE(review): hash_ is keyed by std::string_view, so the storage behind
// p->hash() presumably has to outlive hash_ — confirm.
void visit(file* p) override { hash_[p->hash()].push_back(p); }
void deduplicate_files(worker_group& wg, os_access& os, inode_manager& im,
inode_options const& ino_opts, progress& prog) {
auto check_scan = [&](auto inode) {
if (ino_opts.needs_scan()) {
wg.add_job([&, inode = std::move(inode)] {
prog.current = inode->any();
inode->scan(os, ino_opts);
++prog.inodes_scanned;
});
}
};
for (auto& p : hash_) {
if (p.second.size() > 1) {
continue;
}
auto fp = p.second.front();
auto inode = im.create_inode();
fp->set_inode_num(inode_num_++);
fp->set_inode(inode);
inode->set_files(std::move(p.second));
check_scan(std::move(inode));
}
for (auto& p : hash_) {
auto& files = p.second;
if (files.size() > 1) {
if (files.empty()) {
continue;
}
DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file");
std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
return a->path() < b->path();
});
}
auto inode = im.create_inode();
for (auto fp : files) {
if (!fp->inode_num()) {
fp->set_inode_num(inode_num_++);
}
fp->set_inode(inode);
}
@ -138,18 +173,15 @@ class file_deduplication_visitor : public visitor_base {
inode->set_files(std::move(files));
if (ino_opts.needs_scan()) {
wg.add_job([&, inode] {
prog.current = inode->any();
inode->scan(os, ino_opts);
++prog.inodes_scanned;
});
}
check_scan(std::move(inode));
}
}
// Returns the current value of the inode number counter, i.e. the next
// number that would be handed out (one past the last number assigned so far).
uint32_t inode_num_end() const { return inode_num_; }
private:
folly::F14FastMap<std::string_view, inode::files_vector> hash_;
uint32_t inode_num_;
};
class dir_set_inode_visitor : public visitor_base {
@ -243,7 +275,7 @@ class save_directories_visitor : public visitor_base {
directories_.resize(num_directories);
}
void visit(dir* p) override { directories_[p->inode_num()] = p; }
void visit(dir* p) override { directories_.at(p->inode_num().value()) = p; }
void pack(thrift::metadata::metadata& mv2, global_entry_data& ge_data) {
for (auto p : directories_) {
@ -273,7 +305,8 @@ class save_unique_files_visitor : public visitor_base {
}
void visit(file* p) override {
unique_files_.at(p->inode_num() - inode_begin_) = p->unique_file_id();
unique_files_.at(p->inode_num().value() - inode_begin_) =
p->unique_file_id();
}
std::vector<uint32_t>& get_unique_files() { return unique_files_; }
@ -514,11 +547,14 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
inode_manager im(lgr_, prog);
file_deduplication_visitor fdv;
file_deduplication_visitor fdv(first_file_inode);
root->accept(fdv);
fdv.deduplicate_files(wg_, *os_, im, options_.inode, prog);
DWARFS_CHECK(fdv.inode_num_end() == first_device_inode,
"inconsistent inode numbers");
LOG_INFO << "saved " << size_with_unit(prog.saved_by_deduplication) << " / "
<< size_with_unit(prog.original_size) << " in "
<< prog.duplicate_files << "/" << prog.files_found
@ -559,7 +595,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
ep->update(ge_data);
if (auto lp = dynamic_cast<link*>(ep)) {
DWARFS_NOTHROW(
mv2.symlink_table.at(ep->inode_num() - first_link_inode)) =
mv2.symlink_table.at(ep->inode_num().value() - first_link_inode)) =
ge_data.get_symlink_table_entry(lp->linkname());
}
});
@ -570,7 +606,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
worker_group blockify("blockify", 1, 1 << 20);
im.order_inodes(script_, options_.file_order, 0,
im.order_inodes(script_, options_.file_order,
[&](std::shared_ptr<inode> const& ino) {
blockify.add_job([&] {
prog.current.store(ino.get());
@ -605,8 +641,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
// TODO: we should be able to start this once all blocks have been
// submitted for compression
im.for_each_inode([&](std::shared_ptr<inode> const& ino) {
// TODO: no need for this offset stuff here...
im.for_each_inode_in_order([&](std::shared_ptr<inode> const& ino) {
DWARFS_NOTHROW(mv2.chunk_table.at(ino->num())) = mv2.chunks.size();
ino->append_chunks_to(mv2.chunks);
});