mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-13 06:16:55 -04:00
Optimize files table
This commit is contained in:
parent
ad51853c82
commit
daecc748db
@ -82,12 +82,14 @@ class entry : public entry_interface {
|
||||
global_entry_data const& data) const;
|
||||
void update(global_entry_data& data) const;
|
||||
virtual void accept(entry_visitor& v, bool preorder = false) = 0;
|
||||
void set_inode_num(uint32_t inode_num) { inode_num_ = inode_num; }
|
||||
uint32_t inode_num() const { return inode_num_; }
|
||||
virtual void scan(os_access& os, progress& prog) = 0;
|
||||
const struct ::stat& status() const { return stat_; }
|
||||
void set_entry_index(uint32_t index) { entry_index_ = index; }
|
||||
std::optional<uint32_t> entry_index() const { return entry_index_; }
|
||||
std::optional<uint32_t> const& entry_index() const { return entry_index_; }
|
||||
uint64_t raw_inode_num() const { return stat_.st_ino; }
|
||||
uint64_t num_hard_links() const { return stat_.st_nlink; }
|
||||
virtual void set_inode_num(uint32_t ino) = 0;
|
||||
virtual std::optional<uint32_t> const& inode_num() const = 0;
|
||||
|
||||
// more methods from entry_interface
|
||||
uint16_t get_permissions() const override;
|
||||
@ -107,7 +109,6 @@ class entry : public entry_interface {
|
||||
std::string name_;
|
||||
std::weak_ptr<entry> parent_;
|
||||
struct ::stat stat_;
|
||||
uint32_t inode_num_{0};
|
||||
std::optional<uint32_t> entry_index_;
|
||||
};
|
||||
|
||||
@ -125,14 +126,17 @@ class file : public entry {
|
||||
void scan(os_access& os, progress& prog) override;
|
||||
void create_data();
|
||||
void hardlink(file* other, progress& prog);
|
||||
uint64_t raw_inode_num() const;
|
||||
unsigned num_hard_links() const;
|
||||
uint32_t unique_file_id() const;
|
||||
|
||||
void set_inode_num(uint32_t ino) override;
|
||||
std::optional<uint32_t> const& inode_num() const override;
|
||||
|
||||
private:
|
||||
// Per-file record held through std::shared_ptr<data> (see data_ below).
// file::hardlink() copies the other file's data_ pointer and bumps refcount,
// so hard-linked files share one record — and therefore one inode number.
struct data {
|
||||
// Fixed-size 20-byte hash buffer.
// NOTE(review): 20 bytes presumably matches a SHA-1 digest — confirm.
using hash_type = std::array<char, 20>;
|
||||
hash_type hash{0};
|
||||
// Starts at 1 for the file that creates the record.
uint32_t refcount{1};
|
||||
// Empty until file::set_inode_num() assigns a number.
std::optional<uint32_t> inode_num;
|
||||
};
|
||||
|
||||
std::shared_ptr<data> data_;
|
||||
@ -157,10 +161,16 @@ class dir : public entry {
|
||||
bool empty() const { return entries_.empty(); }
|
||||
void remove_empty_dirs(progress& prog);
|
||||
|
||||
// Stores `ino` in this directory's own inode_num_ member (overrides the
// virtual declared in the base class). Unlike file::set_inode_num, no check
// is made here for a number that was already assigned.
void set_inode_num(uint32_t ino) override { inode_num_ = ino; }
|
||||
// Returns a reference to this directory's inode number; the optional is
// empty until set_inode_num() has been called.
std::optional<uint32_t> const& inode_num() const override {
|
||||
return inode_num_;
|
||||
}
|
||||
|
||||
private:
|
||||
using entry_ptr = std::shared_ptr<entry>;
|
||||
|
||||
std::vector<std::shared_ptr<entry>> entries_;
|
||||
std::optional<uint32_t> inode_num_;
|
||||
};
|
||||
|
||||
class link : public entry {
|
||||
@ -172,8 +182,14 @@ class link : public entry {
|
||||
void accept(entry_visitor& v, bool preorder) override;
|
||||
void scan(os_access& os, progress& prog) override;
|
||||
|
||||
// Stores `ino` in this symlink entry's own inode_num_ member (overrides the
// virtual declared in the base class); an earlier value is simply replaced.
void set_inode_num(uint32_t ino) override { inode_num_ = ino; }
|
||||
// Returns a reference to this link's inode number; the optional is empty
// until set_inode_num() has been called.
std::optional<uint32_t> const& inode_num() const override {
|
||||
return inode_num_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::string link_;
|
||||
std::optional<uint32_t> inode_num_;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -188,6 +204,14 @@ class device : public entry {
|
||||
void accept(entry_visitor& v, bool preorder) override;
|
||||
void scan(os_access& os, progress& prog) override;
|
||||
uint64_t device_id() const;
|
||||
|
||||
// Stores `ino` in this device entry's own inode_num_ member (overrides the
// virtual declared in the base class); an earlier value is simply replaced.
void set_inode_num(uint32_t ino) override { inode_num_ = ino; }
|
||||
// Returns a reference to this device entry's inode number; the optional is
// empty until set_inode_num() has been called.
std::optional<uint32_t> const& inode_num() const override {
|
||||
return inode_num_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::optional<uint32_t> inode_num_;
|
||||
};
|
||||
|
||||
class entry_factory {
|
||||
|
@ -45,7 +45,6 @@ class inode : public object {
|
||||
|
||||
virtual void set_files(files_vector&& fv) = 0;
|
||||
virtual void scan(os_access& os, inode_options const& options) = 0;
|
||||
virtual void set_num(uint32_t num) = 0;
|
||||
virtual uint32_t num() const = 0;
|
||||
virtual uint32_t similarity_hash() const = 0;
|
||||
virtual std::vector<uint64_t> const& nilsimsa_similarity_hash() const = 0;
|
||||
|
@ -46,12 +46,13 @@ class inode_manager {
|
||||
size_t count() const { return impl_->count(); }
|
||||
|
||||
void order_inodes(std::shared_ptr<script> scr,
|
||||
file_order_options const& file_order, uint32_t first_inode,
|
||||
order_cb const& fn) {
|
||||
impl_->order_inodes(std::move(scr), file_order, first_inode, fn);
|
||||
file_order_options const& file_order, order_cb const& fn) {
|
||||
impl_->order_inodes(std::move(scr), file_order, fn);
|
||||
}
|
||||
|
||||
void for_each_inode(inode_cb const& fn) const { impl_->for_each_inode(fn); }
|
||||
// Thin forwarding wrapper: hands `fn` to the implementation object's
// for_each_inode_in_order(). The iteration order is defined by the impl.
void for_each_inode_in_order(inode_cb const& fn) const {
|
||||
impl_->for_each_inode_in_order(fn);
|
||||
}
|
||||
|
||||
class impl {
|
||||
public:
|
||||
@ -59,10 +60,10 @@ class inode_manager {
|
||||
|
||||
virtual std::shared_ptr<inode> create_inode() = 0;
|
||||
virtual size_t count() const = 0;
|
||||
virtual void order_inodes(std::shared_ptr<script> scr,
|
||||
file_order_options const& file_order,
|
||||
uint32_t first_inode, order_cb const& fn) = 0;
|
||||
virtual void for_each_inode(
|
||||
virtual void
|
||||
order_inodes(std::shared_ptr<script> scr,
|
||||
file_order_options const& file_order, order_cb const& fn) = 0;
|
||||
virtual void for_each_inode_in_order(
|
||||
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
|
||||
};
|
||||
|
||||
|
@ -186,9 +186,16 @@ void file::scan(os_access& os, progress& prog) {
|
||||
|
||||
// Returns num() of the inode object referenced by inode_.
// NOTE(review): inode_ is dereferenced without a null check — presumably
// callers only use this after set_inode() has run; confirm.
uint32_t file::unique_file_id() const { return inode_->num(); }
|
||||
|
||||
uint64_t file::raw_inode_num() const { return status().st_ino; }
|
||||
// Assigns the inode number for this file by writing it into the shared
// data record (data_), so every file sharing that record sees the same value.
// Preconditions enforced via DWARFS_CHECK (macro defined elsewhere;
// presumably reports a fatal error when the condition is false):
//   - data_ has been created, and
//   - no inode number has been stored in the record yet.
void file::set_inode_num(uint32_t inode_num) {
|
||||
DWARFS_CHECK(data_, "file data unset");
|
||||
DWARFS_CHECK(!data_->inode_num, "attempt to set inode number more than once");
|
||||
data_->inode_num = inode_num;
|
||||
}
|
||||
|
||||
unsigned file::num_hard_links() const { return status().st_nlink; }
|
||||
// Returns a reference to the inode number kept in the shared data record.
// The optional is empty until set_inode_num() has stored a value.
// Requires data_ to be set (checked with DWARFS_CHECK); the returned
// reference points into *data_, so it is only valid while that record lives.
std::optional<uint32_t> const& file::inode_num() const {
|
||||
DWARFS_CHECK(data_, "file data unset");
|
||||
return data_->inode_num;
|
||||
}
|
||||
|
||||
void file::create_data() {
|
||||
assert(!data_);
|
||||
@ -201,6 +208,7 @@ void file::hardlink(file* other, progress& prog) {
|
||||
prog.hardlink_size += size();
|
||||
++prog.hardlinks;
|
||||
data_ = other->data_;
|
||||
++data_->refcount;
|
||||
}
|
||||
|
||||
entry::type_t dir::type() const { return E_DIR; }
|
||||
@ -250,8 +258,8 @@ void dir::pack_entry(thrift::metadata::metadata& mv2,
|
||||
global_entry_data const& data) const {
|
||||
auto& de = mv2.dir_entries_ref()->emplace_back();
|
||||
de.name_index = has_parent() ? data.get_name_index(name()) : 0;
|
||||
de.inode_num = inode_num();
|
||||
entry::pack(DWARFS_NOTHROW(mv2.entries.at(inode_num())), data);
|
||||
de.inode_num = DWARFS_NOTHROW(inode_num().value());
|
||||
entry::pack(DWARFS_NOTHROW(mv2.entries.at(de.inode_num)), data);
|
||||
}
|
||||
|
||||
void dir::pack(thrift::metadata::metadata& mv2,
|
||||
@ -272,8 +280,8 @@ void dir::pack(thrift::metadata::metadata& mv2,
|
||||
e->set_entry_index(mv2.dir_entries_ref()->size());
|
||||
auto& de = mv2.dir_entries_ref()->emplace_back();
|
||||
de.name_index = data.get_name_index(e->name());
|
||||
de.inode_num = e->inode_num();
|
||||
e->pack(DWARFS_NOTHROW(mv2.entries.at(e->inode_num())), data);
|
||||
de.inode_num = DWARFS_NOTHROW(e->inode_num().value());
|
||||
e->pack(DWARFS_NOTHROW(mv2.entries.at(de.inode_num)), data);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -90,7 +90,8 @@ class inode_ : public inode {
|
||||
public:
|
||||
using chunk_type = thrift::metadata::chunk;
|
||||
|
||||
void set_num(uint32_t num) override { num_ = num; }
|
||||
// Constructs an inode whose number is fixed to `n` at creation time.
// NOTE(review): single-argument constructor is not marked `explicit`, so it
// also permits implicit conversion from uint32_t — confirm that is intended.
inode_(uint32_t n)
|
||||
: num_{n} {}
|
||||
|
||||
// Returns the number stored in num_.
uint32_t num() const override { return num_; }
|
||||
|
||||
@ -183,7 +184,7 @@ class inode_ : public inode {
|
||||
}
|
||||
|
||||
private:
|
||||
uint32_t num_{std::numeric_limits<uint32_t>::max()};
|
||||
uint32_t const num_;
|
||||
uint32_t similarity_hash_{0};
|
||||
files_vector files_;
|
||||
std::vector<chunk_type> chunks_;
|
||||
@ -200,7 +201,7 @@ class inode_manager_ final : public inode_manager::impl {
|
||||
, prog_(prog) {}
|
||||
|
||||
std::shared_ptr<inode> create_inode() override {
|
||||
auto ino = std::make_shared<inode_>();
|
||||
auto ino = std::make_shared<inode_>(inodes_.size());
|
||||
inodes_.push_back(ino);
|
||||
return ino;
|
||||
}
|
||||
@ -208,14 +209,20 @@ class inode_manager_ final : public inode_manager::impl {
|
||||
size_t count() const override { return inodes_.size(); }
|
||||
|
||||
void order_inodes(std::shared_ptr<script> scr,
|
||||
file_order_options const& file_order, uint32_t first_inode,
|
||||
file_order_options const& file_order,
|
||||
inode_manager::order_cb const& fn) override;
|
||||
|
||||
void
|
||||
for_each_inode(std::function<void(std::shared_ptr<inode> const&)> const& fn)
|
||||
void for_each_inode_in_order(
|
||||
std::function<void(std::shared_ptr<inode> const&)> const& fn)
|
||||
const override {
|
||||
for (const auto& ino : inodes_) {
|
||||
fn(ino);
|
||||
std::vector<uint32_t> index;
|
||||
index.resize(inodes_.size());
|
||||
std::iota(index.begin(), index.end(), size_t(0));
|
||||
std::sort(index.begin(), index.end(), [this](size_t a, size_t b) {
|
||||
return inodes_[a]->num() < inodes_[b]->num();
|
||||
});
|
||||
for (auto i : index) {
|
||||
fn(inodes_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -261,15 +268,8 @@ class inode_manager_ final : public inode_manager::impl {
|
||||
void presort_index(std::vector<std::shared_ptr<inode>>& inodes,
|
||||
std::vector<uint32_t>& index);
|
||||
|
||||
void
|
||||
order_inodes_by_nilsimsa(inode_manager::order_cb const& fn, uint32_t inode_no,
|
||||
file_order_options const& file_order);
|
||||
|
||||
// Numbers every inode in inodes_ consecutively, in container order, starting
// at `first_no`, by calling set_num() on each.
// NOTE(review): `first_no` is size_t while inode::set_num() takes uint32_t,
// so the value is implicitly narrowed on each call.
void number_inodes(size_t first_no) {
|
||||
for (auto& i : inodes_) {
|
||||
i->set_num(first_no++);
|
||||
}
|
||||
}
|
||||
void order_inodes_by_nilsimsa(inode_manager::order_cb const& fn,
|
||||
file_order_options const& file_order);
|
||||
|
||||
std::vector<std::shared_ptr<inode>> inodes_;
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
@ -279,7 +279,7 @@ class inode_manager_ final : public inode_manager::impl {
|
||||
template <typename LoggerPolicy>
|
||||
void inode_manager_<LoggerPolicy>::order_inodes(
|
||||
std::shared_ptr<script> scr, file_order_options const& file_order,
|
||||
uint32_t first_inode, inode_manager::order_cb const& fn) {
|
||||
inode_manager::order_cb const& fn) {
|
||||
switch (file_order.mode) {
|
||||
case file_order_mode::NONE:
|
||||
LOG_INFO << "keeping inode order";
|
||||
@ -316,14 +316,13 @@ void inode_manager_<LoggerPolicy>::order_inodes(
|
||||
LOG_INFO << "ordering " << count()
|
||||
<< " inodes using nilsimsa similarity...";
|
||||
auto ti = LOG_TIMED_INFO;
|
||||
order_inodes_by_nilsimsa(fn, first_inode, file_order);
|
||||
order_inodes_by_nilsimsa(fn, file_order);
|
||||
ti << count() << " inodes ordered";
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_INFO << "assigning file inodes...";
|
||||
number_inodes(first_inode);
|
||||
for (const auto& ino : inodes_) {
|
||||
fn(ino);
|
||||
}
|
||||
@ -372,8 +371,7 @@ void inode_manager_<LoggerPolicy>::presort_index(
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||
inode_manager::order_cb const& fn, uint32_t inode_no,
|
||||
file_order_options const& file_order) {
|
||||
inode_manager::order_cb const& fn, file_order_options const& file_order) {
|
||||
auto count = inodes_.size();
|
||||
|
||||
std::vector<std::shared_ptr<inode>> inodes;
|
||||
@ -389,7 +387,6 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||
auto finalize_inode = [&]() {
|
||||
inodes_.push_back(std::move(inodes[index.back()]));
|
||||
index.pop_back();
|
||||
inodes_.back()->set_num(inode_no++);
|
||||
return fn(inodes_.back());
|
||||
};
|
||||
|
||||
|
@ -571,6 +571,13 @@ void metadata_<LoggerPolicy>::dump(
|
||||
}
|
||||
if (auto uf = meta_.unique_files_table()) {
|
||||
os << "unique_files_table: " << uf->size() << std::endl;
|
||||
std::vector<uint32_t> uni;
|
||||
uni.resize(meta_.chunk_table().size());
|
||||
for (auto f : *uf) {
|
||||
++uni.at(f);
|
||||
}
|
||||
os << "unique files: " << std::count(uni.begin(), uni.end(), 1)
|
||||
<< std::endl;
|
||||
}
|
||||
os << "symlink_table_offset: " << symlink_table_offset_ << std::endl;
|
||||
os << "file_index_offset: " << file_index_offset_ << std::endl;
|
||||
@ -732,6 +739,7 @@ void metadata_<LoggerPolicy>::walk_data_order_impl(
|
||||
});
|
||||
|
||||
if (auto dep = meta_.dir_entries()) {
|
||||
// TODO: this is *even more complicated* now :-)
|
||||
auto ufp = meta_.unique_files_table();
|
||||
auto mid =
|
||||
std::stable_partition(entries.begin(), entries.end(),
|
||||
|
@ -86,14 +86,13 @@ class scan_files_visitor : public visitor_base {
|
||||
|
||||
if (!is_new) {
|
||||
p->hardlink(it->second, prog_);
|
||||
p->set_inode_num(it->second->inode_num());
|
||||
++prog_.files_scanned;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
p->create_data();
|
||||
p->set_inode_num(inode_num_++);
|
||||
++inode_num_;
|
||||
|
||||
wg_.add_job([=] {
|
||||
prog_.current.store(p);
|
||||
@ -112,22 +111,58 @@ class scan_files_visitor : public visitor_base {
|
||||
|
||||
class file_deduplication_visitor : public visitor_base {
|
||||
public:
|
||||
// Seeds the running inode counter (inode_num_) with the first inode number
// to hand out to files.
// NOTE(review): single-argument constructor is not marked `explicit`.
file_deduplication_visitor(uint32_t first_file_inode)
|
||||
: inode_num_{first_file_inode} {}
|
||||
|
||||
// Groups visited files by content hash: appends `p` to the hash_ bucket keyed
// by p->hash(). operator[] is expected to create an empty bucket for a new
// key, as with std::unordered_map.
// NOTE(review): the map key is a std::string_view, so the bytes returned by
// p->hash() must outlive hash_ — confirm where that storage lives.
void visit(file* p) override { hash_[p->hash()].push_back(p); }
|
||||
|
||||
void deduplicate_files(worker_group& wg, os_access& os, inode_manager& im,
|
||||
inode_options const& ino_opts, progress& prog) {
|
||||
auto check_scan = [&](auto inode) {
|
||||
if (ino_opts.needs_scan()) {
|
||||
wg.add_job([&, inode = std::move(inode)] {
|
||||
prog.current = inode->any();
|
||||
inode->scan(os, ino_opts);
|
||||
++prog.inodes_scanned;
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
for (auto& p : hash_) {
|
||||
if (p.second.size() > 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto fp = p.second.front();
|
||||
auto inode = im.create_inode();
|
||||
|
||||
fp->set_inode_num(inode_num_++);
|
||||
fp->set_inode(inode);
|
||||
|
||||
inode->set_files(std::move(p.second));
|
||||
|
||||
check_scan(std::move(inode));
|
||||
}
|
||||
|
||||
for (auto& p : hash_) {
|
||||
auto& files = p.second;
|
||||
|
||||
if (files.size() > 1) {
|
||||
std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
|
||||
return a->path() < b->path();
|
||||
});
|
||||
if (files.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file");
|
||||
|
||||
std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
|
||||
return a->path() < b->path();
|
||||
});
|
||||
|
||||
auto inode = im.create_inode();
|
||||
|
||||
for (auto fp : files) {
|
||||
if (!fp->inode_num()) {
|
||||
fp->set_inode_num(inode_num_++);
|
||||
}
|
||||
fp->set_inode(inode);
|
||||
}
|
||||
|
||||
@ -138,18 +173,15 @@ class file_deduplication_visitor : public visitor_base {
|
||||
|
||||
inode->set_files(std::move(files));
|
||||
|
||||
if (ino_opts.needs_scan()) {
|
||||
wg.add_job([&, inode] {
|
||||
prog.current = inode->any();
|
||||
inode->scan(os, ino_opts);
|
||||
++prog.inodes_scanned;
|
||||
});
|
||||
}
|
||||
check_scan(std::move(inode));
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the current value of the running inode counter. Because the
// counter is post-incremented on every assignment in deduplicate_files(),
// this is one past the last inode number handed out by this visitor.
uint32_t inode_num_end() const { return inode_num_; }
|
||||
|
||||
private:
|
||||
folly::F14FastMap<std::string_view, inode::files_vector> hash_;
|
||||
uint32_t inode_num_;
|
||||
};
|
||||
|
||||
class dir_set_inode_visitor : public visitor_base {
|
||||
@ -243,7 +275,7 @@ class save_directories_visitor : public visitor_base {
|
||||
directories_.resize(num_directories);
|
||||
}
|
||||
|
||||
void visit(dir* p) override { directories_[p->inode_num()] = p; }
|
||||
// Records `p` in directories_ at the slot given by its inode number.
// inode_num() returns a std::optional, so .value() throws
// std::bad_optional_access if the directory has no inode number yet.
// NOTE(review): .at() is presumably the bounds-checked accessor of a
// std::vector sized in the constructor — confirm the container type.
void visit(dir* p) override { directories_.at(p->inode_num().value()) = p; }
|
||||
|
||||
void pack(thrift::metadata::metadata& mv2, global_entry_data& ge_data) {
|
||||
for (auto p : directories_) {
|
||||
@ -273,7 +305,8 @@ class save_unique_files_visitor : public visitor_base {
|
||||
}
|
||||
|
||||
void visit(file* p) override {
|
||||
unique_files_.at(p->inode_num() - inode_begin_) = p->unique_file_id();
|
||||
unique_files_.at(p->inode_num().value() - inode_begin_) =
|
||||
p->unique_file_id();
|
||||
}
|
||||
|
||||
std::vector<uint32_t>& get_unique_files() { return unique_files_; }
|
||||
@ -514,11 +547,14 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
|
||||
inode_manager im(lgr_, prog);
|
||||
|
||||
file_deduplication_visitor fdv;
|
||||
file_deduplication_visitor fdv(first_file_inode);
|
||||
root->accept(fdv);
|
||||
|
||||
fdv.deduplicate_files(wg_, *os_, im, options_.inode, prog);
|
||||
|
||||
DWARFS_CHECK(fdv.inode_num_end() == first_device_inode,
|
||||
"inconsistent inode numbers");
|
||||
|
||||
LOG_INFO << "saved " << size_with_unit(prog.saved_by_deduplication) << " / "
|
||||
<< size_with_unit(prog.original_size) << " in "
|
||||
<< prog.duplicate_files << "/" << prog.files_found
|
||||
@ -559,7 +595,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
ep->update(ge_data);
|
||||
if (auto lp = dynamic_cast<link*>(ep)) {
|
||||
DWARFS_NOTHROW(
|
||||
mv2.symlink_table.at(ep->inode_num() - first_link_inode)) =
|
||||
mv2.symlink_table.at(ep->inode_num().value() - first_link_inode)) =
|
||||
ge_data.get_symlink_table_entry(lp->linkname());
|
||||
}
|
||||
});
|
||||
@ -570,7 +606,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
|
||||
worker_group blockify("blockify", 1, 1 << 20);
|
||||
|
||||
im.order_inodes(script_, options_.file_order, 0,
|
||||
im.order_inodes(script_, options_.file_order,
|
||||
[&](std::shared_ptr<inode> const& ino) {
|
||||
blockify.add_job([&] {
|
||||
prog.current.store(ino.get());
|
||||
@ -605,8 +641,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
|
||||
// TODO: we should be able to start this once all blocks have been
|
||||
// submitted for compression
|
||||
im.for_each_inode([&](std::shared_ptr<inode> const& ino) {
|
||||
// TODO: no need for this offset stuff here...
|
||||
im.for_each_inode_in_order([&](std::shared_ptr<inode> const& ino) {
|
||||
DWARFS_NOTHROW(mv2.chunk_table.at(ino->num())) = mv2.chunks.size();
|
||||
ino->append_chunks_to(mv2.chunks);
|
||||
});
|
||||
|
Loading…
x
Reference in New Issue
Block a user