mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-09 04:19:10 -04:00
Remove old nilsimsa code
This commit is contained in:
parent
9f734adc3a
commit
2efc231393
@ -59,8 +59,7 @@ enum class file_order_mode {
|
||||
PATH,
|
||||
SCRIPT,
|
||||
SIMILARITY,
|
||||
NILSIMSA,
|
||||
NILSIMSA2
|
||||
NILSIMSA
|
||||
};
|
||||
|
||||
struct file_order_options {
|
||||
|
@ -212,9 +212,6 @@ class inode_manager_ : public inode_manager::impl {
|
||||
}
|
||||
|
||||
void order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn,
|
||||
uint32_t inode_no);
|
||||
|
||||
void order_inodes_by_nilsimsa2(inode_manager::inode_cb const& fn,
|
||||
uint32_t inode_no,
|
||||
file_order_options const& file_order);
|
||||
|
||||
@ -268,16 +265,7 @@ void inode_manager_<LoggerPolicy>::order_inodes(
|
||||
log_.info() << "ordering " << count()
|
||||
<< " inodes using nilsimsa similarity...";
|
||||
auto ti = log_.timed_info();
|
||||
order_inodes_by_nilsimsa(fn, first_inode);
|
||||
ti << count() << " inodes ordered";
|
||||
return;
|
||||
}
|
||||
|
||||
case file_order_mode::NILSIMSA2: {
|
||||
log_.info() << "ordering " << count()
|
||||
<< " inodes using nilsimsa2 similarity...";
|
||||
auto ti = log_.timed_info();
|
||||
order_inodes_by_nilsimsa2(fn, first_inode, file_order);
|
||||
order_inodes_by_nilsimsa(fn, first_inode, file_order);
|
||||
ti << count() << " inodes ordered";
|
||||
return;
|
||||
}
|
||||
@ -290,129 +278,6 @@ void inode_manager_<LoggerPolicy>::order_inodes(
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||
inode_manager::inode_cb const& fn, uint32_t inode_no) {
|
||||
auto finalize_inode = [&](auto& ino) {
|
||||
ino->set_num(inode_no++);
|
||||
fn(ino);
|
||||
};
|
||||
|
||||
auto count = inodes_.size();
|
||||
|
||||
// skip all empty inodes (this is at most one)
|
||||
auto beg = std::partition(inodes_.begin(), inodes_.end(),
|
||||
[](auto const& p) { return p->size() == 0; });
|
||||
|
||||
for (auto it = inodes_.begin(); it != beg; ++it) {
|
||||
finalize_inode(*it);
|
||||
}
|
||||
|
||||
// find the largest inode
|
||||
std::nth_element(beg, beg, inodes_.end(), [](auto const& a, auto const& b) {
|
||||
return (a->size() > b->size() ||
|
||||
(a->size() == b->size() && a->any()->path() < b->any()->path()));
|
||||
});
|
||||
|
||||
finalize_inode(*beg);
|
||||
|
||||
// build a cache for the remaining inodes
|
||||
std::vector<nilsimsa_cache_entry> cache;
|
||||
std::deque<uint32_t> index;
|
||||
index.resize(std::distance(beg + 1, inodes_.end()));
|
||||
std::iota(index.begin(), index.end(), 0);
|
||||
cache.reserve(index.size());
|
||||
|
||||
for (auto it = beg + 1; it != inodes_.end(); ++it) {
|
||||
cache.emplace_back(std::move(*it));
|
||||
}
|
||||
|
||||
assert(index.size() == cache.size());
|
||||
|
||||
// and temporarily remove from the original array
|
||||
inodes_.erase(beg + 1, inodes_.end());
|
||||
|
||||
while (!index.empty()) {
|
||||
// compare reference inode with all remaining inodes
|
||||
auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
|
||||
for (auto& d : cache) {
|
||||
d.similarity = dwarfs::nilsimsa_similarity(ref_hash, d.hash);
|
||||
}
|
||||
|
||||
auto cmp = [&cache](uint32_t a, uint32_t b) {
|
||||
auto& da = cache[a];
|
||||
auto& db = cache[b];
|
||||
return da.similarity > db.similarity ||
|
||||
(da.similarity == db.similarity &&
|
||||
(da.size > db.size || (da.size == db.size && da.path < db.path)));
|
||||
};
|
||||
|
||||
size_t depth = 0;
|
||||
size_t depth_thresh;
|
||||
const int sim_thresh_depth = 16;
|
||||
const int sim_thresh = 0;
|
||||
const size_t max_depth = 2000;
|
||||
const size_t depth_step = 500;
|
||||
|
||||
if (index.size() > max_depth) {
|
||||
while (depth < max_depth && depth + depth_step < index.size()) {
|
||||
std::partial_sort(index.begin() + depth,
|
||||
index.begin() + depth + depth_step, index.end(), cmp);
|
||||
depth += depth_step;
|
||||
if (cache[index[0]].similarity - cache[index[depth - 1]].similarity >
|
||||
sim_thresh_depth) {
|
||||
do {
|
||||
--depth;
|
||||
} while (cache[index[0]].similarity -
|
||||
cache[index[depth - 1]].similarity >
|
||||
sim_thresh_depth);
|
||||
break;
|
||||
}
|
||||
}
|
||||
depth_thresh = depth / 2;
|
||||
} else {
|
||||
std::sort(index.begin(), index.end(), cmp);
|
||||
depth = index.size();
|
||||
depth_thresh = 0;
|
||||
}
|
||||
|
||||
auto sim = cache[index.front()].similarity;
|
||||
|
||||
while (!index.empty() && depth > depth_thresh &&
|
||||
sim - cache[index.front()].similarity <= sim_thresh) {
|
||||
inodes_.push_back(std::move(cache[index.front()].ino));
|
||||
finalize_inode(inodes_.back());
|
||||
index.pop_front();
|
||||
--depth;
|
||||
}
|
||||
|
||||
while (depth > depth_thresh) {
|
||||
ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
|
||||
for (size_t i = 0; i < depth; ++i) {
|
||||
cache[index[i]].similarity =
|
||||
dwarfs::nilsimsa_similarity(ref_hash, cache[index[i]].hash);
|
||||
}
|
||||
|
||||
std::partial_sort(index.begin(), index.begin() + (depth - depth_thresh),
|
||||
index.begin() + depth, cmp);
|
||||
|
||||
sim = cache[index.front()].similarity;
|
||||
|
||||
while (!index.empty() && depth > depth_thresh &&
|
||||
sim - cache[index.front()].similarity <= sim_thresh) {
|
||||
inodes_.push_back(std::move(cache[index.front()].ino));
|
||||
finalize_inode(inodes_.back());
|
||||
index.pop_front();
|
||||
--depth;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (count != inodes_.size()) {
|
||||
throw std::runtime_error("internal error: nilsimsa ordering failed");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa2(
|
||||
inode_manager::inode_cb const& fn, uint32_t inode_no,
|
||||
file_order_options const& file_order) {
|
||||
auto count = inodes_.size();
|
||||
|
@ -95,8 +95,7 @@ const std::map<std::string, file_order_mode> order_choices{
|
||||
{"script", file_order_mode::SCRIPT},
|
||||
#endif
|
||||
{"similarity", file_order_mode::SIMILARITY},
|
||||
{"nilsimsa", file_order_mode::NILSIMSA},
|
||||
{"nilsimsa2", file_order_mode::NILSIMSA2}};
|
||||
{"nilsimsa", file_order_mode::NILSIMSA}};
|
||||
|
||||
const std::map<std::string, uint32_t> time_resolutions{
|
||||
{"sec", 1},
|
||||
@ -482,7 +481,7 @@ int mkdwarfs(int argc, char** argv) {
|
||||
it != order_choices.end()) {
|
||||
options.file_order.mode = it->second;
|
||||
if (order_opts.size() > 1) {
|
||||
if (options.file_order.mode != file_order_mode::NILSIMSA2) {
|
||||
if (options.file_order.mode != file_order_mode::NILSIMSA) {
|
||||
throw std::runtime_error(
|
||||
fmt::format("file order mode '{}' does not support options",
|
||||
order_opts.front()));
|
||||
@ -614,8 +613,7 @@ int mkdwarfs(int argc, char** argv) {
|
||||
force_similarity ||
|
||||
options.file_order.mode == file_order_mode::SIMILARITY;
|
||||
options.inode.with_nilsimsa =
|
||||
options.file_order.mode == file_order_mode::NILSIMSA ||
|
||||
options.file_order.mode == file_order_mode::NILSIMSA2;
|
||||
options.file_order.mode == file_order_mode::NILSIMSA;
|
||||
|
||||
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
|
||||
std::make_shared<os_access_posix>(), std::move(script), options);
|
||||
|
Loading…
x
Reference in New Issue
Block a user