mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-14 14:59:52 -04:00
Clean up inode manager
This commit is contained in:
parent
63c9e9a3c3
commit
e3bbeac5d6
@ -161,56 +161,17 @@ class inode_manager_ : public inode_manager::impl {
|
||||
|
||||
void order_inodes(std::shared_ptr<script> scr, file_order_mode file_order,
|
||||
uint32_t first_inode,
|
||||
inode_manager::inode_cb const& fn) override {
|
||||
switch (file_order) {
|
||||
case file_order_mode::NONE:
|
||||
log_.info() << "keeping inode order";
|
||||
break;
|
||||
inode_manager::inode_cb const& fn) override;
|
||||
|
||||
case file_order_mode::PATH: {
|
||||
log_.info() << "ordering " << count() << " inodes by path name...";
|
||||
auto ti = log_.timed_info();
|
||||
order_inodes_by_path();
|
||||
ti << count() << " inodes ordered";
|
||||
break;
|
||||
}
|
||||
|
||||
case file_order_mode::SCRIPT: {
|
||||
if (!scr->has_order()) {
|
||||
throw std::runtime_error("script cannot order inodes");
|
||||
}
|
||||
log_.info() << "ordering " << count() << " inodes using script...";
|
||||
auto ti = log_.timed_info();
|
||||
scr->order(inodes_);
|
||||
ti << count() << " inodes ordered";
|
||||
break;
|
||||
}
|
||||
|
||||
case file_order_mode::SIMILARITY: {
|
||||
log_.info() << "ordering " << count() << " inodes by similarity...";
|
||||
auto ti = log_.timed_info();
|
||||
order_inodes_by_similarity();
|
||||
ti << count() << " inodes ordered";
|
||||
break;
|
||||
}
|
||||
|
||||
case file_order_mode::NILSIMSA: {
|
||||
log_.info() << "ordering " << count()
|
||||
<< " inodes using nilsimsa similarity...";
|
||||
auto ti = log_.timed_info();
|
||||
order_inodes_by_nilsimsa(fn, first_inode);
|
||||
ti << count() << " inodes ordered";
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (file_order != file_order_mode::NILSIMSA) {
|
||||
log_.info() << "assigning file inodes...";
|
||||
number_inodes(first_inode);
|
||||
for_each_inode(fn);
|
||||
void
|
||||
for_each_inode(std::function<void(std::shared_ptr<inode> const&)> const& fn)
|
||||
const override {
|
||||
for (const auto& ino : inodes_) {
|
||||
fn(ino);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void order_inodes_by_path() {
|
||||
std::vector<std::string> paths;
|
||||
std::vector<size_t> index(inodes_.size());
|
||||
@ -250,93 +211,178 @@ class inode_manager_ : public inode_manager::impl {
|
||||
}
|
||||
|
||||
void order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn,
|
||||
uint32_t inode_no) {
|
||||
auto finalize_inode = [&](auto& ino) {
|
||||
ino->set_num(inode_no++);
|
||||
fn(ino);
|
||||
uint32_t inode_no);
|
||||
|
||||
void number_inodes(size_t first_no) {
|
||||
for (auto& i : inodes_) {
|
||||
i->set_num(first_no++);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::shared_ptr<inode>> inodes_;
|
||||
log_proxy<LoggerPolicy> log_;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void inode_manager_<LoggerPolicy>::order_inodes(
|
||||
std::shared_ptr<script> scr, file_order_mode file_order,
|
||||
uint32_t first_inode, inode_manager::inode_cb const& fn) {
|
||||
switch (file_order) {
|
||||
case file_order_mode::NONE:
|
||||
log_.info() << "keeping inode order";
|
||||
break;
|
||||
|
||||
case file_order_mode::PATH: {
|
||||
log_.info() << "ordering " << count() << " inodes by path name...";
|
||||
auto ti = log_.timed_info();
|
||||
order_inodes_by_path();
|
||||
ti << count() << " inodes ordered";
|
||||
break;
|
||||
}
|
||||
|
||||
case file_order_mode::SCRIPT: {
|
||||
if (!scr->has_order()) {
|
||||
throw std::runtime_error("script cannot order inodes");
|
||||
}
|
||||
log_.info() << "ordering " << count() << " inodes using script...";
|
||||
auto ti = log_.timed_info();
|
||||
scr->order(inodes_);
|
||||
ti << count() << " inodes ordered";
|
||||
break;
|
||||
}
|
||||
|
||||
case file_order_mode::SIMILARITY: {
|
||||
log_.info() << "ordering " << count() << " inodes by similarity...";
|
||||
auto ti = log_.timed_info();
|
||||
order_inodes_by_similarity();
|
||||
ti << count() << " inodes ordered";
|
||||
break;
|
||||
}
|
||||
|
||||
case file_order_mode::NILSIMSA: {
|
||||
log_.info() << "ordering " << count()
|
||||
<< " inodes using nilsimsa similarity...";
|
||||
auto ti = log_.timed_info();
|
||||
order_inodes_by_nilsimsa(fn, first_inode);
|
||||
ti << count() << " inodes ordered";
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (file_order != file_order_mode::NILSIMSA) {
|
||||
log_.info() << "assigning file inodes...";
|
||||
number_inodes(first_inode);
|
||||
for_each_inode(fn);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||
inode_manager::inode_cb const& fn, uint32_t inode_no) {
|
||||
auto finalize_inode = [&](auto& ino) {
|
||||
ino->set_num(inode_no++);
|
||||
fn(ino);
|
||||
};
|
||||
|
||||
auto count = inodes_.size();
|
||||
|
||||
// skip all empty inodes (this is at most one)
|
||||
auto beg = std::partition(inodes_.begin(), inodes_.end(),
|
||||
[](auto const& p) { return p->size() == 0; });
|
||||
|
||||
for (auto it = inodes_.begin(); it != beg; ++it) {
|
||||
finalize_inode(*it);
|
||||
}
|
||||
|
||||
// find the largest inode
|
||||
std::nth_element(beg, beg, inodes_.end(), [](auto const& a, auto const& b) {
|
||||
return (a->size() > b->size() ||
|
||||
(a->size() == b->size() && a->any()->path() < b->any()->path()));
|
||||
});
|
||||
|
||||
finalize_inode(*beg);
|
||||
|
||||
// build a cache for the remaining inodes
|
||||
std::vector<nilsimsa_cache_entry> cache;
|
||||
std::deque<uint32_t> index;
|
||||
index.resize(std::distance(beg + 1, inodes_.end()));
|
||||
std::iota(index.begin(), index.end(), 0);
|
||||
cache.reserve(index.size());
|
||||
|
||||
for (auto it = beg + 1; it != inodes_.end(); ++it) {
|
||||
cache.emplace_back(std::move(*it));
|
||||
}
|
||||
|
||||
assert(index.size() == cache.size());
|
||||
|
||||
// and temporarily remove from the original array
|
||||
inodes_.erase(beg + 1, inodes_.end());
|
||||
|
||||
while (!index.empty()) {
|
||||
// compare reference inode with all remaining inodes
|
||||
auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
|
||||
for (auto& d : cache) {
|
||||
d.similarity = dwarfs::nilsimsa_similarity(ref_hash, d.hash);
|
||||
}
|
||||
|
||||
auto cmp = [&cache](uint32_t a, uint32_t b) {
|
||||
auto& da = cache[a];
|
||||
auto& db = cache[b];
|
||||
return da.similarity > db.similarity ||
|
||||
(da.similarity == db.similarity &&
|
||||
(da.size > db.size || (da.size == db.size && da.path < db.path)));
|
||||
};
|
||||
|
||||
auto count = inodes_.size();
|
||||
size_t depth = 0;
|
||||
size_t depth_thresh;
|
||||
const int sim_thresh_depth = 16;
|
||||
const int sim_thresh = 0;
|
||||
const size_t max_depth = 2000;
|
||||
const size_t depth_step = 500;
|
||||
|
||||
// skip all empty inodes (this is at most one)
|
||||
auto beg = std::partition(inodes_.begin(), inodes_.end(),
|
||||
[](auto const& p) { return p->size() == 0; });
|
||||
|
||||
for (auto it = inodes_.begin(); it != beg; ++it) {
|
||||
finalize_inode(*it);
|
||||
}
|
||||
|
||||
// find the largest inode
|
||||
std::nth_element(beg, beg, inodes_.end(), [](auto const& a, auto const& b) {
|
||||
return (a->size() > b->size() ||
|
||||
(a->size() == b->size() && a->any()->path() < b->any()->path()));
|
||||
});
|
||||
|
||||
finalize_inode(*beg);
|
||||
|
||||
// build a cache for the remaining inodes
|
||||
std::vector<nilsimsa_cache_entry> cache;
|
||||
std::deque<uint32_t> index;
|
||||
index.resize(std::distance(beg + 1, inodes_.end()));
|
||||
std::iota(index.begin(), index.end(), 0);
|
||||
cache.reserve(index.size());
|
||||
|
||||
for (auto it = beg + 1; it != inodes_.end(); ++it) {
|
||||
cache.emplace_back(std::move(*it));
|
||||
}
|
||||
|
||||
assert(index.size() == cache.size());
|
||||
|
||||
// and temporarily remove from the original array
|
||||
inodes_.erase(beg + 1, inodes_.end());
|
||||
|
||||
while (!index.empty()) {
|
||||
// compare reference inode with all remaining inodes
|
||||
auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
|
||||
for (auto& d : cache) {
|
||||
d.similarity = dwarfs::nilsimsa_similarity(ref_hash, d.hash);
|
||||
}
|
||||
|
||||
auto cmp = [&cache](uint32_t a, uint32_t b) {
|
||||
auto& da = cache[a];
|
||||
auto& db = cache[b];
|
||||
return da.similarity > db.similarity ||
|
||||
(da.similarity == db.similarity &&
|
||||
(da.size > db.size ||
|
||||
(da.size == db.size && da.path < db.path)));
|
||||
};
|
||||
|
||||
size_t depth = 0;
|
||||
size_t depth_thresh;
|
||||
const int sim_thresh_depth = 16;
|
||||
const int sim_thresh = 0;
|
||||
const size_t max_depth = 2000;
|
||||
const size_t depth_step = 500;
|
||||
|
||||
if (index.size() > max_depth) {
|
||||
while (depth < max_depth && depth + depth_step < index.size()) {
|
||||
std::partial_sort(index.begin() + depth,
|
||||
index.begin() + depth + depth_step, index.end(),
|
||||
cmp);
|
||||
depth += depth_step;
|
||||
if (cache[index[0]].similarity - cache[index[depth - 1]].similarity >
|
||||
sim_thresh_depth) {
|
||||
do {
|
||||
--depth;
|
||||
} while (cache[index[0]].similarity -
|
||||
cache[index[depth - 1]].similarity >
|
||||
sim_thresh_depth);
|
||||
break;
|
||||
}
|
||||
if (index.size() > max_depth) {
|
||||
while (depth < max_depth && depth + depth_step < index.size()) {
|
||||
std::partial_sort(index.begin() + depth,
|
||||
index.begin() + depth + depth_step, index.end(), cmp);
|
||||
depth += depth_step;
|
||||
if (cache[index[0]].similarity - cache[index[depth - 1]].similarity >
|
||||
sim_thresh_depth) {
|
||||
do {
|
||||
--depth;
|
||||
} while (cache[index[0]].similarity -
|
||||
cache[index[depth - 1]].similarity >
|
||||
sim_thresh_depth);
|
||||
break;
|
||||
}
|
||||
depth_thresh = depth / 2;
|
||||
} else {
|
||||
std::sort(index.begin(), index.end(), cmp);
|
||||
depth = index.size();
|
||||
depth_thresh = 0;
|
||||
}
|
||||
depth_thresh = depth / 2;
|
||||
} else {
|
||||
std::sort(index.begin(), index.end(), cmp);
|
||||
depth = index.size();
|
||||
depth_thresh = 0;
|
||||
}
|
||||
|
||||
auto sim = cache[index.front()].similarity;
|
||||
|
||||
while (!index.empty() && depth > depth_thresh &&
|
||||
sim - cache[index.front()].similarity <= sim_thresh) {
|
||||
inodes_.push_back(std::move(cache[index.front()].ino));
|
||||
finalize_inode(inodes_.back());
|
||||
index.pop_front();
|
||||
--depth;
|
||||
}
|
||||
|
||||
while (depth > depth_thresh) {
|
||||
ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
|
||||
for (size_t i = 0; i < depth; ++i) {
|
||||
cache[index[i]].similarity =
|
||||
dwarfs::nilsimsa_similarity(ref_hash, cache[index[i]].hash);
|
||||
}
|
||||
|
||||
auto sim = cache[index.front()].similarity;
|
||||
std::partial_sort(index.begin(), index.begin() + (depth - depth_thresh),
|
||||
index.begin() + depth, cmp);
|
||||
|
||||
sim = cache[index.front()].similarity;
|
||||
|
||||
while (!index.empty() && depth > depth_thresh &&
|
||||
sim - cache[index.front()].similarity <= sim_thresh) {
|
||||
@ -345,52 +391,13 @@ class inode_manager_ : public inode_manager::impl {
|
||||
index.pop_front();
|
||||
--depth;
|
||||
}
|
||||
|
||||
while (depth > depth_thresh) {
|
||||
ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
|
||||
for (size_t i = 0; i < depth; ++i) {
|
||||
cache[index[i]].similarity =
|
||||
dwarfs::nilsimsa_similarity(ref_hash, cache[index[i]].hash);
|
||||
}
|
||||
|
||||
std::partial_sort(index.begin(), index.begin() + (depth - depth_thresh),
|
||||
index.begin() + depth, cmp);
|
||||
|
||||
sim = cache[index.front()].similarity;
|
||||
|
||||
while (!index.empty() && depth > depth_thresh &&
|
||||
sim - cache[index.front()].similarity <= sim_thresh) {
|
||||
inodes_.push_back(std::move(cache[index.front()].ino));
|
||||
finalize_inode(inodes_.back());
|
||||
index.pop_front();
|
||||
--depth;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (count != inodes_.size()) {
|
||||
throw std::runtime_error("internal error: nilsimsa ordering failed");
|
||||
}
|
||||
}
|
||||
|
||||
void number_inodes(size_t first_no) {
|
||||
for (auto& i : inodes_) {
|
||||
i->set_num(first_no++);
|
||||
}
|
||||
if (count != inodes_.size()) {
|
||||
throw std::runtime_error("internal error: nilsimsa ordering failed");
|
||||
}
|
||||
|
||||
void
|
||||
for_each_inode(std::function<void(std::shared_ptr<inode> const&)> const& fn)
|
||||
const override {
|
||||
for (const auto& ino : inodes_) {
|
||||
fn(ino);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<std::shared_ptr<inode>> inodes_;
|
||||
log_proxy<LoggerPolicy> log_;
|
||||
};
|
||||
}
|
||||
|
||||
inode_manager::inode_manager(logger& lgr)
|
||||
: impl_(make_unique_logging_object<impl, inode_manager_, logger_policies>(
|
||||
|
Loading…
x
Reference in New Issue
Block a user