mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-10 13:04:15 -04:00
Remove old nilsimsa code
This commit is contained in:
parent
9f734adc3a
commit
2efc231393
@ -59,8 +59,7 @@ enum class file_order_mode {
|
|||||||
PATH,
|
PATH,
|
||||||
SCRIPT,
|
SCRIPT,
|
||||||
SIMILARITY,
|
SIMILARITY,
|
||||||
NILSIMSA,
|
NILSIMSA
|
||||||
NILSIMSA2
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct file_order_options {
|
struct file_order_options {
|
||||||
|
@ -212,9 +212,6 @@ class inode_manager_ : public inode_manager::impl {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn,
|
void order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn,
|
||||||
uint32_t inode_no);
|
|
||||||
|
|
||||||
void order_inodes_by_nilsimsa2(inode_manager::inode_cb const& fn,
|
|
||||||
uint32_t inode_no,
|
uint32_t inode_no,
|
||||||
file_order_options const& file_order);
|
file_order_options const& file_order);
|
||||||
|
|
||||||
@ -268,16 +265,7 @@ void inode_manager_<LoggerPolicy>::order_inodes(
|
|||||||
log_.info() << "ordering " << count()
|
log_.info() << "ordering " << count()
|
||||||
<< " inodes using nilsimsa similarity...";
|
<< " inodes using nilsimsa similarity...";
|
||||||
auto ti = log_.timed_info();
|
auto ti = log_.timed_info();
|
||||||
order_inodes_by_nilsimsa(fn, first_inode);
|
order_inodes_by_nilsimsa(fn, first_inode, file_order);
|
||||||
ti << count() << " inodes ordered";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
case file_order_mode::NILSIMSA2: {
|
|
||||||
log_.info() << "ordering " << count()
|
|
||||||
<< " inodes using nilsimsa2 similarity...";
|
|
||||||
auto ti = log_.timed_info();
|
|
||||||
order_inodes_by_nilsimsa2(fn, first_inode, file_order);
|
|
||||||
ti << count() << " inodes ordered";
|
ti << count() << " inodes ordered";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -290,129 +278,6 @@ void inode_manager_<LoggerPolicy>::order_inodes(
|
|||||||
|
|
||||||
template <typename LoggerPolicy>
|
template <typename LoggerPolicy>
|
||||||
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||||
inode_manager::inode_cb const& fn, uint32_t inode_no) {
|
|
||||||
auto finalize_inode = [&](auto& ino) {
|
|
||||||
ino->set_num(inode_no++);
|
|
||||||
fn(ino);
|
|
||||||
};
|
|
||||||
|
|
||||||
auto count = inodes_.size();
|
|
||||||
|
|
||||||
// skip all empty inodes (this is at most one)
|
|
||||||
auto beg = std::partition(inodes_.begin(), inodes_.end(),
|
|
||||||
[](auto const& p) { return p->size() == 0; });
|
|
||||||
|
|
||||||
for (auto it = inodes_.begin(); it != beg; ++it) {
|
|
||||||
finalize_inode(*it);
|
|
||||||
}
|
|
||||||
|
|
||||||
// find the largest inode
|
|
||||||
std::nth_element(beg, beg, inodes_.end(), [](auto const& a, auto const& b) {
|
|
||||||
return (a->size() > b->size() ||
|
|
||||||
(a->size() == b->size() && a->any()->path() < b->any()->path()));
|
|
||||||
});
|
|
||||||
|
|
||||||
finalize_inode(*beg);
|
|
||||||
|
|
||||||
// build a cache for the remaining inodes
|
|
||||||
std::vector<nilsimsa_cache_entry> cache;
|
|
||||||
std::deque<uint32_t> index;
|
|
||||||
index.resize(std::distance(beg + 1, inodes_.end()));
|
|
||||||
std::iota(index.begin(), index.end(), 0);
|
|
||||||
cache.reserve(index.size());
|
|
||||||
|
|
||||||
for (auto it = beg + 1; it != inodes_.end(); ++it) {
|
|
||||||
cache.emplace_back(std::move(*it));
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(index.size() == cache.size());
|
|
||||||
|
|
||||||
// and temporarily remove from the original array
|
|
||||||
inodes_.erase(beg + 1, inodes_.end());
|
|
||||||
|
|
||||||
while (!index.empty()) {
|
|
||||||
// compare reference inode with all remaining inodes
|
|
||||||
auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
|
|
||||||
for (auto& d : cache) {
|
|
||||||
d.similarity = dwarfs::nilsimsa_similarity(ref_hash, d.hash);
|
|
||||||
}
|
|
||||||
|
|
||||||
auto cmp = [&cache](uint32_t a, uint32_t b) {
|
|
||||||
auto& da = cache[a];
|
|
||||||
auto& db = cache[b];
|
|
||||||
return da.similarity > db.similarity ||
|
|
||||||
(da.similarity == db.similarity &&
|
|
||||||
(da.size > db.size || (da.size == db.size && da.path < db.path)));
|
|
||||||
};
|
|
||||||
|
|
||||||
size_t depth = 0;
|
|
||||||
size_t depth_thresh;
|
|
||||||
const int sim_thresh_depth = 16;
|
|
||||||
const int sim_thresh = 0;
|
|
||||||
const size_t max_depth = 2000;
|
|
||||||
const size_t depth_step = 500;
|
|
||||||
|
|
||||||
if (index.size() > max_depth) {
|
|
||||||
while (depth < max_depth && depth + depth_step < index.size()) {
|
|
||||||
std::partial_sort(index.begin() + depth,
|
|
||||||
index.begin() + depth + depth_step, index.end(), cmp);
|
|
||||||
depth += depth_step;
|
|
||||||
if (cache[index[0]].similarity - cache[index[depth - 1]].similarity >
|
|
||||||
sim_thresh_depth) {
|
|
||||||
do {
|
|
||||||
--depth;
|
|
||||||
} while (cache[index[0]].similarity -
|
|
||||||
cache[index[depth - 1]].similarity >
|
|
||||||
sim_thresh_depth);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
depth_thresh = depth / 2;
|
|
||||||
} else {
|
|
||||||
std::sort(index.begin(), index.end(), cmp);
|
|
||||||
depth = index.size();
|
|
||||||
depth_thresh = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto sim = cache[index.front()].similarity;
|
|
||||||
|
|
||||||
while (!index.empty() && depth > depth_thresh &&
|
|
||||||
sim - cache[index.front()].similarity <= sim_thresh) {
|
|
||||||
inodes_.push_back(std::move(cache[index.front()].ino));
|
|
||||||
finalize_inode(inodes_.back());
|
|
||||||
index.pop_front();
|
|
||||||
--depth;
|
|
||||||
}
|
|
||||||
|
|
||||||
while (depth > depth_thresh) {
|
|
||||||
ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
|
|
||||||
for (size_t i = 0; i < depth; ++i) {
|
|
||||||
cache[index[i]].similarity =
|
|
||||||
dwarfs::nilsimsa_similarity(ref_hash, cache[index[i]].hash);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::partial_sort(index.begin(), index.begin() + (depth - depth_thresh),
|
|
||||||
index.begin() + depth, cmp);
|
|
||||||
|
|
||||||
sim = cache[index.front()].similarity;
|
|
||||||
|
|
||||||
while (!index.empty() && depth > depth_thresh &&
|
|
||||||
sim - cache[index.front()].similarity <= sim_thresh) {
|
|
||||||
inodes_.push_back(std::move(cache[index.front()].ino));
|
|
||||||
finalize_inode(inodes_.back());
|
|
||||||
index.pop_front();
|
|
||||||
--depth;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (count != inodes_.size()) {
|
|
||||||
throw std::runtime_error("internal error: nilsimsa ordering failed");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename LoggerPolicy>
|
|
||||||
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa2(
|
|
||||||
inode_manager::inode_cb const& fn, uint32_t inode_no,
|
inode_manager::inode_cb const& fn, uint32_t inode_no,
|
||||||
file_order_options const& file_order) {
|
file_order_options const& file_order) {
|
||||||
auto count = inodes_.size();
|
auto count = inodes_.size();
|
||||||
|
@ -95,8 +95,7 @@ const std::map<std::string, file_order_mode> order_choices{
|
|||||||
{"script", file_order_mode::SCRIPT},
|
{"script", file_order_mode::SCRIPT},
|
||||||
#endif
|
#endif
|
||||||
{"similarity", file_order_mode::SIMILARITY},
|
{"similarity", file_order_mode::SIMILARITY},
|
||||||
{"nilsimsa", file_order_mode::NILSIMSA},
|
{"nilsimsa", file_order_mode::NILSIMSA}};
|
||||||
{"nilsimsa2", file_order_mode::NILSIMSA2}};
|
|
||||||
|
|
||||||
const std::map<std::string, uint32_t> time_resolutions{
|
const std::map<std::string, uint32_t> time_resolutions{
|
||||||
{"sec", 1},
|
{"sec", 1},
|
||||||
@ -482,7 +481,7 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
it != order_choices.end()) {
|
it != order_choices.end()) {
|
||||||
options.file_order.mode = it->second;
|
options.file_order.mode = it->second;
|
||||||
if (order_opts.size() > 1) {
|
if (order_opts.size() > 1) {
|
||||||
if (options.file_order.mode != file_order_mode::NILSIMSA2) {
|
if (options.file_order.mode != file_order_mode::NILSIMSA) {
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
fmt::format("file order mode '{}' does not support options",
|
fmt::format("file order mode '{}' does not support options",
|
||||||
order_opts.front()));
|
order_opts.front()));
|
||||||
@ -614,8 +613,7 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
force_similarity ||
|
force_similarity ||
|
||||||
options.file_order.mode == file_order_mode::SIMILARITY;
|
options.file_order.mode == file_order_mode::SIMILARITY;
|
||||||
options.inode.with_nilsimsa =
|
options.inode.with_nilsimsa =
|
||||||
options.file_order.mode == file_order_mode::NILSIMSA ||
|
options.file_order.mode == file_order_mode::NILSIMSA;
|
||||||
options.file_order.mode == file_order_mode::NILSIMSA2;
|
|
||||||
|
|
||||||
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
|
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
|
||||||
std::make_shared<os_access_posix>(), std::move(script), options);
|
std::make_shared<os_access_posix>(), std::move(script), options);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user