diff --git a/include/dwarfs/options.h b/include/dwarfs/options.h index 976a71a5..8df504b6 100644 --- a/include/dwarfs/options.h +++ b/include/dwarfs/options.h @@ -59,8 +59,7 @@ enum class file_order_mode { PATH, SCRIPT, SIMILARITY, - NILSIMSA, - NILSIMSA2 + NILSIMSA }; struct file_order_options { diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index 3cfe69df..27c12be6 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -212,9 +212,6 @@ class inode_manager_ : public inode_manager::impl { } void order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn, - uint32_t inode_no); - - void order_inodes_by_nilsimsa2(inode_manager::inode_cb const& fn, uint32_t inode_no, file_order_options const& file_order); @@ -268,16 +265,7 @@ void inode_manager_::order_inodes( log_.info() << "ordering " << count() << " inodes using nilsimsa similarity..."; auto ti = log_.timed_info(); - order_inodes_by_nilsimsa(fn, first_inode); - ti << count() << " inodes ordered"; - return; - } - - case file_order_mode::NILSIMSA2: { - log_.info() << "ordering " << count() - << " inodes using nilsimsa2 similarity..."; - auto ti = log_.timed_info(); - order_inodes_by_nilsimsa2(fn, first_inode, file_order); + order_inodes_by_nilsimsa(fn, first_inode, file_order); ti << count() << " inodes ordered"; return; } @@ -290,129 +278,6 @@ void inode_manager_::order_inodes( template void inode_manager_::order_inodes_by_nilsimsa( - inode_manager::inode_cb const& fn, uint32_t inode_no) { - auto finalize_inode = [&](auto& ino) { - ino->set_num(inode_no++); - fn(ino); - }; - - auto count = inodes_.size(); - - // skip all empty inodes (this is at most one) - auto beg = std::partition(inodes_.begin(), inodes_.end(), - [](auto const& p) { return p->size() == 0; }); - - for (auto it = inodes_.begin(); it != beg; ++it) { - finalize_inode(*it); - } - - // find the largest inode - std::nth_element(beg, beg, inodes_.end(), [](auto const& a, auto const& b) { - return (a->size() > b->size() || - (a->size() == b->size() && a->any()->path() < b->any()->path())); - }); - - finalize_inode(*beg); - - // build a cache for the remaining inodes - std::vector cache; - std::deque index; - index.resize(std::distance(beg + 1, inodes_.end())); - std::iota(index.begin(), index.end(), 0); - cache.reserve(index.size()); - - for (auto it = beg + 1; it != inodes_.end(); ++it) { - cache.emplace_back(std::move(*it)); - } - - assert(index.size() == cache.size()); - - // and temporarily remove from the original array - inodes_.erase(beg + 1, inodes_.end()); - - while (!index.empty()) { - // compare reference inode with all remaining inodes - auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data(); - for (auto& d : cache) { - d.similarity = dwarfs::nilsimsa_similarity(ref_hash, d.hash); - } - - auto cmp = [&cache](uint32_t a, uint32_t b) { - auto& da = cache[a]; - auto& db = cache[b]; - return da.similarity > db.similarity || - (da.similarity == db.similarity && - (da.size > db.size || (da.size == db.size && da.path < db.path))); - }; - - size_t depth = 0; - size_t depth_thresh; - const int sim_thresh_depth = 16; - const int sim_thresh = 0; - const size_t max_depth = 2000; - const size_t depth_step = 500; - - if (index.size() > max_depth) { - while (depth < max_depth && depth + depth_step < index.size()) { - std::partial_sort(index.begin() + depth, - index.begin() + depth + depth_step, index.end(), cmp); - depth += depth_step; - if (cache[index[0]].similarity - cache[index[depth - 1]].similarity > - sim_thresh_depth) { - do { - --depth; - } while (cache[index[0]].similarity - - cache[index[depth - 1]].similarity > - sim_thresh_depth); - break; - } - } - depth_thresh = depth / 2; - } else { - std::sort(index.begin(), index.end(), cmp); - depth = index.size(); - depth_thresh = 0; - } - - auto sim = cache[index.front()].similarity; - - while (!index.empty() && depth > depth_thresh && - sim - cache[index.front()].similarity <= sim_thresh) { - inodes_.push_back(std::move(cache[index.front()].ino)); - finalize_inode(inodes_.back()); - index.pop_front(); - --depth; - } - - while (depth > depth_thresh) { - ref_hash = inodes_.back()->nilsimsa_similarity_hash().data(); - for (size_t i = 0; i < depth; ++i) { - cache[index[i]].similarity = - dwarfs::nilsimsa_similarity(ref_hash, cache[index[i]].hash); - } - - std::partial_sort(index.begin(), index.begin() + (depth - depth_thresh), - index.begin() + depth, cmp); - - sim = cache[index.front()].similarity; - - while (!index.empty() && depth > depth_thresh && - sim - cache[index.front()].similarity <= sim_thresh) { - inodes_.push_back(std::move(cache[index.front()].ino)); - finalize_inode(inodes_.back()); - index.pop_front(); - --depth; - } - } - } - - if (count != inodes_.size()) { - throw std::runtime_error("internal error: nilsimsa ordering failed"); - } -} - -template -void inode_manager_::order_inodes_by_nilsimsa2( inode_manager::inode_cb const& fn, uint32_t inode_no, file_order_options const& file_order) { auto count = inodes_.size(); diff --git a/src/mkdwarfs.cpp b/src/mkdwarfs.cpp index 9f23086d..9df559ca 100644 --- a/src/mkdwarfs.cpp +++ b/src/mkdwarfs.cpp @@ -95,8 +95,7 @@ const std::map order_choices{ {"script", file_order_mode::SCRIPT}, #endif {"similarity", file_order_mode::SIMILARITY}, - {"nilsimsa", file_order_mode::NILSIMSA}, - {"nilsimsa2", file_order_mode::NILSIMSA2}}; + {"nilsimsa", file_order_mode::NILSIMSA}}; const std::map time_resolutions{ {"sec", 1}, @@ -482,7 +481,7 @@ int mkdwarfs(int argc, char** argv) { it != order_choices.end()) { options.file_order.mode = it->second; if (order_opts.size() > 1) { - if (options.file_order.mode != file_order_mode::NILSIMSA2) { + if (options.file_order.mode != file_order_mode::NILSIMSA) { throw std::runtime_error( fmt::format("file order mode '{}' does not support options", order_opts.front())); @@ -614,8 +613,7 @@ int mkdwarfs(int argc, char** argv) { force_similarity || options.file_order.mode == file_order_mode::SIMILARITY; options.inode.with_nilsimsa = - options.file_order.mode == file_order_mode::NILSIMSA || - options.file_order.mode == file_order_mode::NILSIMSA2; + options.file_order.mode == file_order_mode::NILSIMSA; scanner s(lgr, wg_scanner, cfg, entry_factory::create(), std::make_shared(), std::move(script), options);