Clean up inode manager

This commit is contained in:
Marcus Holland-Moritz 2020-12-07 22:20:17 +01:00
parent 63c9e9a3c3
commit e3bbeac5d6

View File

@ -161,56 +161,17 @@ class inode_manager_ : public inode_manager::impl {
void order_inodes(std::shared_ptr<script> scr, file_order_mode file_order,
uint32_t first_inode,
inode_manager::inode_cb const& fn) override {
switch (file_order) {
case file_order_mode::NONE:
log_.info() << "keeping inode order";
break;
inode_manager::inode_cb const& fn) override;
case file_order_mode::PATH: {
log_.info() << "ordering " << count() << " inodes by path name...";
auto ti = log_.timed_info();
order_inodes_by_path();
ti << count() << " inodes ordered";
break;
}
case file_order_mode::SCRIPT: {
if (!scr->has_order()) {
throw std::runtime_error("script cannot order inodes");
}
log_.info() << "ordering " << count() << " inodes using script...";
auto ti = log_.timed_info();
scr->order(inodes_);
ti << count() << " inodes ordered";
break;
}
case file_order_mode::SIMILARITY: {
log_.info() << "ordering " << count() << " inodes by similarity...";
auto ti = log_.timed_info();
order_inodes_by_similarity();
ti << count() << " inodes ordered";
break;
}
case file_order_mode::NILSIMSA: {
log_.info() << "ordering " << count()
<< " inodes using nilsimsa similarity...";
auto ti = log_.timed_info();
order_inodes_by_nilsimsa(fn, first_inode);
ti << count() << " inodes ordered";
break;
}
}
if (file_order != file_order_mode::NILSIMSA) {
log_.info() << "assigning file inodes...";
number_inodes(first_inode);
for_each_inode(fn);
void
for_each_inode(std::function<void(std::shared_ptr<inode> const&)> const& fn)
const override {
for (const auto& ino : inodes_) {
fn(ino);
}
}
private:
void order_inodes_by_path() {
std::vector<std::string> paths;
std::vector<size_t> index(inodes_.size());
@ -250,93 +211,178 @@ class inode_manager_ : public inode_manager::impl {
}
void order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn,
uint32_t inode_no) {
auto finalize_inode = [&](auto& ino) {
ino->set_num(inode_no++);
fn(ino);
uint32_t inode_no);
void number_inodes(size_t first_no) {
for (auto& i : inodes_) {
i->set_num(first_no++);
}
}
std::vector<std::shared_ptr<inode>> inodes_;
log_proxy<LoggerPolicy> log_;
};
template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes(
std::shared_ptr<script> scr, file_order_mode file_order,
uint32_t first_inode, inode_manager::inode_cb const& fn) {
switch (file_order) {
case file_order_mode::NONE:
log_.info() << "keeping inode order";
break;
case file_order_mode::PATH: {
log_.info() << "ordering " << count() << " inodes by path name...";
auto ti = log_.timed_info();
order_inodes_by_path();
ti << count() << " inodes ordered";
break;
}
case file_order_mode::SCRIPT: {
if (!scr->has_order()) {
throw std::runtime_error("script cannot order inodes");
}
log_.info() << "ordering " << count() << " inodes using script...";
auto ti = log_.timed_info();
scr->order(inodes_);
ti << count() << " inodes ordered";
break;
}
case file_order_mode::SIMILARITY: {
log_.info() << "ordering " << count() << " inodes by similarity...";
auto ti = log_.timed_info();
order_inodes_by_similarity();
ti << count() << " inodes ordered";
break;
}
case file_order_mode::NILSIMSA: {
log_.info() << "ordering " << count()
<< " inodes using nilsimsa similarity...";
auto ti = log_.timed_info();
order_inodes_by_nilsimsa(fn, first_inode);
ti << count() << " inodes ordered";
break;
}
}
if (file_order != file_order_mode::NILSIMSA) {
log_.info() << "assigning file inodes...";
number_inodes(first_inode);
for_each_inode(fn);
}
}
template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
inode_manager::inode_cb const& fn, uint32_t inode_no) {
auto finalize_inode = [&](auto& ino) {
ino->set_num(inode_no++);
fn(ino);
};
auto count = inodes_.size();
// skip all empty inodes (this is at most one)
auto beg = std::partition(inodes_.begin(), inodes_.end(),
[](auto const& p) { return p->size() == 0; });
for (auto it = inodes_.begin(); it != beg; ++it) {
finalize_inode(*it);
}
// find the largest inode
std::nth_element(beg, beg, inodes_.end(), [](auto const& a, auto const& b) {
return (a->size() > b->size() ||
(a->size() == b->size() && a->any()->path() < b->any()->path()));
});
finalize_inode(*beg);
// build a cache for the remaining inodes
std::vector<nilsimsa_cache_entry> cache;
std::deque<uint32_t> index;
index.resize(std::distance(beg + 1, inodes_.end()));
std::iota(index.begin(), index.end(), 0);
cache.reserve(index.size());
for (auto it = beg + 1; it != inodes_.end(); ++it) {
cache.emplace_back(std::move(*it));
}
assert(index.size() == cache.size());
// and temporarily remove from the original array
inodes_.erase(beg + 1, inodes_.end());
while (!index.empty()) {
// compare reference inode with all remaining inodes
auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
for (auto& d : cache) {
d.similarity = dwarfs::nilsimsa_similarity(ref_hash, d.hash);
}
auto cmp = [&cache](uint32_t a, uint32_t b) {
auto& da = cache[a];
auto& db = cache[b];
return da.similarity > db.similarity ||
(da.similarity == db.similarity &&
(da.size > db.size || (da.size == db.size && da.path < db.path)));
};
auto count = inodes_.size();
size_t depth = 0;
size_t depth_thresh;
const int sim_thresh_depth = 16;
const int sim_thresh = 0;
const size_t max_depth = 2000;
const size_t depth_step = 500;
// skip all empty inodes (this is at most one)
auto beg = std::partition(inodes_.begin(), inodes_.end(),
[](auto const& p) { return p->size() == 0; });
for (auto it = inodes_.begin(); it != beg; ++it) {
finalize_inode(*it);
}
// find the largest inode
std::nth_element(beg, beg, inodes_.end(), [](auto const& a, auto const& b) {
return (a->size() > b->size() ||
(a->size() == b->size() && a->any()->path() < b->any()->path()));
});
finalize_inode(*beg);
// build a cache for the remaining inodes
std::vector<nilsimsa_cache_entry> cache;
std::deque<uint32_t> index;
index.resize(std::distance(beg + 1, inodes_.end()));
std::iota(index.begin(), index.end(), 0);
cache.reserve(index.size());
for (auto it = beg + 1; it != inodes_.end(); ++it) {
cache.emplace_back(std::move(*it));
}
assert(index.size() == cache.size());
// and temporarily remove from the original array
inodes_.erase(beg + 1, inodes_.end());
while (!index.empty()) {
// compare reference inode with all remaining inodes
auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
for (auto& d : cache) {
d.similarity = dwarfs::nilsimsa_similarity(ref_hash, d.hash);
}
auto cmp = [&cache](uint32_t a, uint32_t b) {
auto& da = cache[a];
auto& db = cache[b];
return da.similarity > db.similarity ||
(da.similarity == db.similarity &&
(da.size > db.size ||
(da.size == db.size && da.path < db.path)));
};
size_t depth = 0;
size_t depth_thresh;
const int sim_thresh_depth = 16;
const int sim_thresh = 0;
const size_t max_depth = 2000;
const size_t depth_step = 500;
if (index.size() > max_depth) {
while (depth < max_depth && depth + depth_step < index.size()) {
std::partial_sort(index.begin() + depth,
index.begin() + depth + depth_step, index.end(),
cmp);
depth += depth_step;
if (cache[index[0]].similarity - cache[index[depth - 1]].similarity >
sim_thresh_depth) {
do {
--depth;
} while (cache[index[0]].similarity -
cache[index[depth - 1]].similarity >
sim_thresh_depth);
break;
}
if (index.size() > max_depth) {
while (depth < max_depth && depth + depth_step < index.size()) {
std::partial_sort(index.begin() + depth,
index.begin() + depth + depth_step, index.end(), cmp);
depth += depth_step;
if (cache[index[0]].similarity - cache[index[depth - 1]].similarity >
sim_thresh_depth) {
do {
--depth;
} while (cache[index[0]].similarity -
cache[index[depth - 1]].similarity >
sim_thresh_depth);
break;
}
depth_thresh = depth / 2;
} else {
std::sort(index.begin(), index.end(), cmp);
depth = index.size();
depth_thresh = 0;
}
depth_thresh = depth / 2;
} else {
std::sort(index.begin(), index.end(), cmp);
depth = index.size();
depth_thresh = 0;
}
auto sim = cache[index.front()].similarity;
while (!index.empty() && depth > depth_thresh &&
sim - cache[index.front()].similarity <= sim_thresh) {
inodes_.push_back(std::move(cache[index.front()].ino));
finalize_inode(inodes_.back());
index.pop_front();
--depth;
}
while (depth > depth_thresh) {
ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
for (size_t i = 0; i < depth; ++i) {
cache[index[i]].similarity =
dwarfs::nilsimsa_similarity(ref_hash, cache[index[i]].hash);
}
auto sim = cache[index.front()].similarity;
std::partial_sort(index.begin(), index.begin() + (depth - depth_thresh),
index.begin() + depth, cmp);
sim = cache[index.front()].similarity;
while (!index.empty() && depth > depth_thresh &&
sim - cache[index.front()].similarity <= sim_thresh) {
@ -345,52 +391,13 @@ class inode_manager_ : public inode_manager::impl {
index.pop_front();
--depth;
}
while (depth > depth_thresh) {
ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
for (size_t i = 0; i < depth; ++i) {
cache[index[i]].similarity =
dwarfs::nilsimsa_similarity(ref_hash, cache[index[i]].hash);
}
std::partial_sort(index.begin(), index.begin() + (depth - depth_thresh),
index.begin() + depth, cmp);
sim = cache[index.front()].similarity;
while (!index.empty() && depth > depth_thresh &&
sim - cache[index.front()].similarity <= sim_thresh) {
inodes_.push_back(std::move(cache[index.front()].ino));
finalize_inode(inodes_.back());
index.pop_front();
--depth;
}
}
}
if (count != inodes_.size()) {
throw std::runtime_error("internal error: nilsimsa ordering failed");
}
}
void number_inodes(size_t first_no) {
for (auto& i : inodes_) {
i->set_num(first_no++);
}
if (count != inodes_.size()) {
throw std::runtime_error("internal error: nilsimsa ordering failed");
}
void
for_each_inode(std::function<void(std::shared_ptr<inode> const&)> const& fn)
const override {
for (const auto& ino : inodes_) {
fn(ino);
}
}
private:
std::vector<std::shared_ptr<inode>> inodes_;
log_proxy<LoggerPolicy> log_;
};
}
inode_manager::inode_manager(logger& lgr)
: impl_(make_unique_logging_object<impl, inode_manager_, logger_policies>(