Remove old nilsimsa code

This commit is contained in:
Marcus Holland-Moritz 2020-12-09 17:28:43 +01:00
parent 9f734adc3a
commit 2efc231393
3 changed files with 5 additions and 143 deletions

View File

@ -59,8 +59,7 @@ enum class file_order_mode {
PATH,
SCRIPT,
SIMILARITY,
NILSIMSA,
NILSIMSA2
NILSIMSA
};
struct file_order_options {

View File

@ -212,9 +212,6 @@ class inode_manager_ : public inode_manager::impl {
}
void order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn,
uint32_t inode_no);
void order_inodes_by_nilsimsa2(inode_manager::inode_cb const& fn,
uint32_t inode_no,
file_order_options const& file_order);
@ -268,16 +265,7 @@ void inode_manager_<LoggerPolicy>::order_inodes(
log_.info() << "ordering " << count()
<< " inodes using nilsimsa similarity...";
auto ti = log_.timed_info();
order_inodes_by_nilsimsa(fn, first_inode);
ti << count() << " inodes ordered";
return;
}
case file_order_mode::NILSIMSA2: {
log_.info() << "ordering " << count()
<< " inodes using nilsimsa2 similarity...";
auto ti = log_.timed_info();
order_inodes_by_nilsimsa2(fn, first_inode, file_order);
order_inodes_by_nilsimsa(fn, first_inode, file_order);
ti << count() << " inodes ordered";
return;
}
@ -290,129 +278,6 @@ void inode_manager_<LoggerPolicy>::order_inodes(
// Numbers all inodes starting at `inode_no` and hands each one to `fn`, in an
// order driven by nilsimsa similarity.
//
// The emission order is:
//   1. all inodes of size zero,
//   2. the largest remaining inode (ties broken by the path of `any()`),
//   3. the remaining inodes, picked greedily by similarity to the inode that
//      was emitted last (ties broken by larger size, then smaller path).
//
// `fn` is invoked once per inode, right after its number has been set.
// `inode_no` is taken by value and incremented for every emitted inode.
//
// Throws std::runtime_error if the number of inodes in `inodes_` after
// reordering differs from the number before.
template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
    inode_manager::inode_cb const& fn, uint32_t inode_no) {
  auto finalize_inode = [&](auto& ino) {
    ino->set_num(inode_no++);
    fn(ino);
  };

  // remembered for the consistency check at the end
  auto count = inodes_.size();

  // skip all empty inodes (this is at most one)
  auto beg = std::partition(inodes_.begin(), inodes_.end(),
                            [](auto const& p) { return p->size() == 0; });

  for (auto it = inodes_.begin(); it != beg; ++it) {
    finalize_inode(*it);
  }

  // Nothing left to order if there are no non-empty inodes. Without this
  // guard, `*beg` and `beg + 1` below would go past the end of `inodes_`.
  if (beg == inodes_.end()) {
    return;
  }

  // find the largest inode
  std::nth_element(beg, beg, inodes_.end(), [](auto const& a, auto const& b) {
    return (a->size() > b->size() ||
            (a->size() == b->size() && a->any()->path() < b->any()->path()));
  });

  finalize_inode(*beg);

  // build a cache for the remaining inodes; `index` holds positions into
  // `cache` and is what actually gets sorted and consumed
  std::vector<nilsimsa_cache_entry> cache;
  std::deque<uint32_t> index;
  index.resize(std::distance(beg + 1, inodes_.end()));
  std::iota(index.begin(), index.end(), 0);
  cache.reserve(index.size());

  for (auto it = beg + 1; it != inodes_.end(); ++it) {
    cache.emplace_back(std::move(*it));
  }

  assert(index.size() == cache.size());

  // and temporarily remove from the original array
  inodes_.erase(beg + 1, inodes_.end());

  while (!index.empty()) {
    // compare reference inode with all remaining inodes
    auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
    for (auto& d : cache) {
      d.similarity = dwarfs::nilsimsa_similarity(ref_hash, d.hash);
    }

    // higher similarity first, then larger size, then smaller path
    auto cmp = [&cache](uint32_t a, uint32_t b) {
      auto& da = cache[a];
      auto& db = cache[b];
      return da.similarity > db.similarity ||
             (da.similarity == db.similarity &&
              (da.size > db.size || (da.size == db.size && da.path < db.path)));
    };

    // `depth` is the length of the sorted prefix of `index` that is being
    // worked on; work on it stops once it shrinks to `depth_thresh`
    size_t depth = 0;
    size_t depth_thresh;

    const int sim_thresh_depth = 16;
    const int sim_thresh = 0;
    const size_t max_depth = 2000;
    const size_t depth_step = 500;

    if (index.size() > max_depth) {
      // too many candidates for a full sort: sort the best candidates in
      // steps of `depth_step` until `max_depth` is reached or the sorted
      // prefix spans more than `sim_thresh_depth` in similarity
      while (depth < max_depth && depth + depth_step < index.size()) {
        std::partial_sort(index.begin() + depth,
                          index.begin() + depth + depth_step, index.end(), cmp);
        depth += depth_step;
        if (cache[index[0]].similarity - cache[index[depth - 1]].similarity >
            sim_thresh_depth) {
          // shrink the prefix until it is back within the similarity span
          do {
            --depth;
          } while (cache[index[0]].similarity -
                       cache[index[depth - 1]].similarity >
                   sim_thresh_depth);
          break;
        }
      }
      depth_thresh = depth / 2;
    } else {
      std::sort(index.begin(), index.end(), cmp);
      depth = index.size();
      depth_thresh = 0;
    }

    // emit every front candidate whose similarity is within `sim_thresh` of
    // the best one
    auto sim = cache[index.front()].similarity;

    while (!index.empty() && depth > depth_thresh &&
           sim - cache[index.front()].similarity <= sim_thresh) {
      inodes_.push_back(std::move(cache[index.front()].ino));
      finalize_inode(inodes_.back());
      index.pop_front();
      --depth;
    }

    while (depth > depth_thresh) {
      // re-rank only the current prefix against the inode emitted last
      ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
      for (size_t i = 0; i < depth; ++i) {
        cache[index[i]].similarity =
            dwarfs::nilsimsa_similarity(ref_hash, cache[index[i]].hash);
      }

      std::partial_sort(index.begin(), index.begin() + (depth - depth_thresh),
                        index.begin() + depth, cmp);

      sim = cache[index.front()].similarity;

      while (!index.empty() && depth > depth_thresh &&
             sim - cache[index.front()].similarity <= sim_thresh) {
        inodes_.push_back(std::move(cache[index.front()].ino));
        finalize_inode(inodes_.back());
        index.pop_front();
        --depth;
      }
    }
  }

  // every inode moved into the cache must have been moved back
  if (count != inodes_.size()) {
    throw std::runtime_error("internal error: nilsimsa ordering failed");
  }
}
template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa2(
inode_manager::inode_cb const& fn, uint32_t inode_no,
file_order_options const& file_order) {
auto count = inodes_.size();

View File

@ -95,8 +95,7 @@ const std::map<std::string, file_order_mode> order_choices{
{"script", file_order_mode::SCRIPT},
#endif
{"similarity", file_order_mode::SIMILARITY},
{"nilsimsa", file_order_mode::NILSIMSA},
{"nilsimsa2", file_order_mode::NILSIMSA2}};
{"nilsimsa", file_order_mode::NILSIMSA}};
const std::map<std::string, uint32_t> time_resolutions{
{"sec", 1},
@ -482,7 +481,7 @@ int mkdwarfs(int argc, char** argv) {
it != order_choices.end()) {
options.file_order.mode = it->second;
if (order_opts.size() > 1) {
if (options.file_order.mode != file_order_mode::NILSIMSA2) {
if (options.file_order.mode != file_order_mode::NILSIMSA) {
throw std::runtime_error(
fmt::format("file order mode '{}' does not support options",
order_opts.front()));
@ -614,8 +613,7 @@ int mkdwarfs(int argc, char** argv) {
force_similarity ||
options.file_order.mode == file_order_mode::SIMILARITY;
options.inode.with_nilsimsa =
options.file_order.mode == file_order_mode::NILSIMSA ||
options.file_order.mode == file_order_mode::NILSIMSA2;
options.file_order.mode == file_order_mode::NILSIMSA;
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
std::make_shared<os_access_posix>(), std::move(script), options);