diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3154642f..96d6b5f5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -395,6 +395,7 @@ list(
src/dwarfs/scanner.cpp
src/dwarfs/segmenter.cpp
src/dwarfs/similarity.cpp
+ src/dwarfs/similarity_ordering.cpp
src/dwarfs/string_table.cpp
src/dwarfs/terminal.cpp
src/dwarfs/util.cpp
diff --git a/include/dwarfs/similarity_ordering.h b/include/dwarfs/similarity_ordering.h
new file mode 100644
index 00000000..71659c7f
--- /dev/null
+++ b/include/dwarfs/similarity_ordering.h
@@ -0,0 +1,87 @@
+/* vim:set ts=2 sw=2 sts=2 et: */
+/**
+ * \author Marcus Holland-Moritz (github@mhxnet.de)
+ * \copyright Copyright (c) Marcus Holland-Moritz
+ *
+ * This file is part of dwarfs.
+ *
+ * dwarfs is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * dwarfs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with dwarfs. If not, see .
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace dwarfs {
+
+class logger;
+class progress;
+class worker_group;
+
+class similarity_element_view {
+ public:
+ ~similarity_element_view() = default;
+
+ virtual bool exists(size_t i) const = 0;
+ virtual size_t size() const = 0;
+ virtual size_t weight(size_t i) const = 0;
+ virtual bool bitvec_less(size_t a, size_t b) const = 0;
+ virtual bool order_less(size_t a, size_t b) const = 0;
+ virtual bool bits_equal(size_t a, size_t b) const = 0;
+ virtual std::string description(size_t i) const = 0;
+};
+
+template
+class basic_array_similarity_element_view : public similarity_element_view {
+ public:
+ static_assert(Bits % (8 * sizeof(BitsType)) == 0);
+ static constexpr size_t const bitvec_size = Bits / (8 * sizeof(BitsType));
+ using bitvec_type = std::array;
+
+ virtual bitvec_type const& get_bits(size_t i) const = 0;
+};
+
+struct similarity_ordering_options {
+ size_t max_children{256};
+ size_t max_cluster_size{256};
+};
+
+class similarity_ordering {
+ public:
+ using index_value_type = uint32_t;
+
+ similarity_ordering(logger& lgr, progress& prog, worker_group& wg,
+ similarity_ordering_options const& opts);
+
+ std::future> order_nilsimsa(
+ basic_array_similarity_element_view<256, uint64_t> const& ev) const {
+ return impl_->order_nilsimsa(ev);
+ }
+
+ class impl {
+ public:
+ virtual ~impl() = default;
+
+ virtual std::future> order_nilsimsa(
+ basic_array_similarity_element_view<256, uint64_t> const& ev) const = 0;
+ };
+
+ private:
+ std::unique_ptr impl_;
+};
+
+} // namespace dwarfs
diff --git a/src/dwarfs/similarity_ordering.cpp b/src/dwarfs/similarity_ordering.cpp
new file mode 100644
index 00000000..ec4469c4
--- /dev/null
+++ b/src/dwarfs/similarity_ordering.cpp
@@ -0,0 +1,631 @@
+/* vim:set ts=2 sw=2 sts=2 et: */
+/**
+ * \author Marcus Holland-Moritz (github@mhxnet.de)
+ * \copyright Copyright (c) Marcus Holland-Moritz
+ *
+ * This file is part of dwarfs.
+ *
+ * dwarfs is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * dwarfs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with dwarfs. If not, see .
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "dwarfs/logger.h"
+#include "dwarfs/progress.h"
+#include "dwarfs/similarity_ordering.h"
+#include "dwarfs/worker_group.h"
+
+namespace dwarfs {
+
+namespace {
+
+// TODO: move out of here
+class job_tracker {
+ public:
+ explicit job_tracker(folly::Function&& on_jobs_done)
+ : on_jobs_done_{std::move(on_jobs_done)} {}
+
+ void start_job() {
+ std::lock_guard lock(mx_);
+ ++active_;
+ }
+
+ void finish_job() {
+ bool all_done = false;
+ {
+ std::lock_guard lock(mx_);
+ assert(active_ > 0);
+ --active_;
+ all_done = active_ == 0;
+ }
+ if (all_done) {
+ on_jobs_done_();
+ }
+ }
+
+ private:
+ std::mutex mx_;
+ size_t active_{0};
+ folly::Function on_jobs_done_;
+};
+
+template
+int distance(std::array const& a, std::array const& b) {
+ int d = 0;
+ for (size_t i = 0; i < N; ++i) {
+ d += folly::popcount(a[i] ^ b[i]);
+ }
+ return d;
+}
+
+template
+class basic_centroid {
+ public:
+ static_assert(Bits % (8 * sizeof(BitsType)) == 0);
+ static constexpr size_t const array_size = Bits / (8 * sizeof(BitsType));
+ using value_type = std::array;
+ using bits_type = folly::Bits;
+
+ basic_centroid() {
+ std::fill(centroid_.begin(), centroid_.end(), 0);
+ std::fill(bitcounts_.begin(), bitcounts_.end(), 0);
+ }
+
+ value_type const& value() const { return centroid_; };
+
+ void add(value_type const& vec) {
+ ++veccount_;
+ for (size_t bit = 0; bit < Bits; ++bit) {
+ bitcounts_[bit] += bits_type::test(vec.data(), bit) ? 1 : 0;
+ if (bitcounts_[bit] > veccount_ / 2) {
+ bits_type::set(centroid_.data(), bit);
+ } else {
+ bits_type::clear(centroid_.data(), bit);
+ }
+ }
+ }
+
+ auto distance_to(value_type const& vec) const {
+ return distance(centroid_, vec);
+ }
+
+ private:
+ value_type centroid_;
+ std::array bitcounts_;
+ CountsType veccount_;
+};
+
+template
+struct basic_cluster {
+ using centroid_type = basic_centroid;
+ using index_value_type = IndexValueType;
+ using index_type = std::vector;
+
+ basic_cluster() = default;
+ explicit basic_cluster(index_type&& index)
+ : index{std::move(index)} {}
+
+ centroid_type centroid;
+ index_type index;
+};
+
+template
+struct basic_cluster_tree_node {
+ using cluster_type = ClusterType;
+ using index_type = typename cluster_type::index_type;
+ using index_value_type = typename cluster_type::index_value_type;
+ using cluster_pointer = std::unique_ptr;
+ using children_vector = std::vector>;
+
+ basic_cluster_tree_node()
+ : v{std::make_unique()} {}
+ basic_cluster_tree_node(index_type&& index)
+ : v{std::make_unique(std::move(index))} {}
+
+ children_vector const& children() const {
+ return std::get(v);
+ }
+ children_vector& children() { return std::get(v); }
+
+ cluster_type const& cluster() const { return *std::get(v); }
+ cluster_type& cluster() { return *std::get(v); }
+
+ bool is_leaf() const { return std::holds_alternative(v); }
+
+ std::string description() const {
+ if (is_leaf()) {
+ return fmt::format("{} items", cluster().index.size());
+ } else {
+ return fmt::format("{} children", children().size());
+ }
+ }
+
+ index_value_type first_index() const {
+ if (is_leaf()) {
+ return cluster().index.front();
+ }
+ return children().front().first_index();
+ }
+
+ index_value_type last_index() const {
+ if (is_leaf()) {
+ return cluster().index.back();
+ }
+ return children().back().last_index();
+ }
+
+ std::variant v;
+};
+
+} // namespace
+
+template
+class similarity_ordering_ final : public similarity_ordering::impl {
+ public:
+ using index_value_type = similarity_ordering::index_value_type;
+ using index_type = std::vector;
+ using duplicates_map =
+ std::unordered_map>;
+ using nilsimsa_element_view =
+ basic_array_similarity_element_view<256, uint64_t>;
+ using nilsimsa_cluster =
+ basic_cluster<256, uint64_t, uint32_t, index_value_type>;
+ using nilsimsa_cluster_tree_node = basic_cluster_tree_node;
+
+ similarity_ordering_(logger& lgr, progress& prog, worker_group& wg,
+ similarity_ordering_options const& opts)
+ : LOG_PROXY_INIT(lgr)
+ , prog_{prog}
+ , wg_{wg}
+ , opts_{opts} {}
+
+ std::future
+ order_nilsimsa(nilsimsa_element_view const& ev) const override;
+
+ private:
+ index_type build_index(similarity_element_view const& ev) const;
+ duplicates_map
+ find_duplicates(similarity_element_view const& ev, index_type& index) const;
+
+ template
+ size_t
+ total_distance(basic_array_similarity_element_view const& ev,
+ index_type const& index) const;
+
+ template
+ void
+ order_cluster(basic_array_similarity_element_view const& ev,
+ index_type& index) const;
+
+ template
+ size_t order_tree_rec(
+ basic_cluster_tree_node<
+ basic_cluster>& node,
+ basic_array_similarity_element_view const& ev) const;
+
+ template
+ void cluster_by_distance(
+ basic_cluster_tree_node<
+ basic_cluster>& node,
+ basic_array_similarity_element_view const& ev,
+ int max_distance) const;
+
+ template
+ void cluster_rec(
+ basic_cluster_tree_node<
+ basic_cluster>& node,
+ basic_array_similarity_element_view const& ev,
+ std::shared_ptr jt, int max_distance) const;
+
+ template
+ void cluster(basic_cluster_tree_node>& root,
+ basic_array_similarity_element_view const& ev,
+ std::shared_ptr jt) const;
+
+ template
+ void collect_rec(
+ basic_cluster_tree_node<
+ basic_cluster>& node,
+ basic_array_similarity_element_view const& ev,
+ duplicates_map& dup, index_type& ordered, std::string indent) const;
+
+ template
+ void order_impl(
+ std::promise&& promise,
+ basic_array_similarity_element_view const& ev) const;
+
+ LOG_PROXY_DECL(LoggerPolicy);
+ progress& prog_;
+ worker_group& wg_;
+ similarity_ordering_options const opts_;
+};
+
+template
+auto similarity_ordering_::build_index(
+ similarity_element_view const& ev) const -> index_type {
+ index_type index;
+
+ {
+ auto tt = LOG_TIMED_TRACE;
+
+ index.reserve(ev.size());
+ for (index_value_type i = 0; i < ev.size(); ++i) {
+ if (ev.exists(i)) {
+ index.push_back(i);
+ }
+ }
+ index.shrink_to_fit();
+
+ tt << "build index: " << ev.size() << " -> " << index.size();
+ }
+
+ return index;
+}
+
+template
+auto similarity_ordering_::find_duplicates(
+ similarity_element_view const& ev, index_type& index) const
+ -> duplicates_map {
+ duplicates_map dm;
+
+ {
+ auto tt = LOG_TIMED_TRACE;
+
+ std::sort(index.begin(), index.end(),
+ [&ev](auto a, auto b) { return ev.bitvec_less(a, b); });
+
+ tt << "sort index of " << index.size() << " elements";
+ }
+
+ {
+ auto tt = LOG_TIMED_TRACE;
+
+ if (!index.empty()) {
+ auto src = index.begin();
+ auto dst = src;
+
+ while (++src != index.end()) {
+ if (ev.bits_equal(*dst, *src)) {
+ dm[*dst].push_back(*src);
+ } else if (++dst != src) {
+ *dst = std::move(*src);
+ }
+ }
+
+ index.erase(++dst, index.end());
+ }
+
+ tt << "find duplicates: " << index.size() << " unique / " << dm.size()
+ << " groups";
+ }
+
+ return dm;
+}
+
+template
+template
+size_t similarity_ordering_::total_distance(
+ basic_array_similarity_element_view const& ev,
+ index_type const& index) const {
+ size_t td = 0;
+
+ if (!index.empty()) {
+ auto* prev = &ev.get_bits(index[0]);
+
+ for (size_t i = 1; i < index.size(); ++i) {
+ auto& curr = ev.get_bits(index[i]);
+ td += distance(*prev, curr);
+ prev = &curr;
+ }
+ }
+
+ return td;
+}
+
+template
+template
+void similarity_ordering_::order_cluster(
+ basic_array_similarity_element_view const& ev,
+ index_type& index) const {
+ if (!index.empty()) {
+ // TODO: try simulated annealing again? reproducibly?
+
+ std::sort(index.begin(), index.end(),
+ [&ev](auto a, auto b) { return ev.order_less(a, b); });
+
+ // TODO: maybe it's worth caching bits pointers beforehand?
+ for (size_t i = 0; i < index.size() - 1; ++i) {
+ auto& bi = ev.get_bits(index[i]);
+ int best_distance = std::numeric_limits::max();
+ size_t best_index = 0;
+
+ for (size_t k = i + 1; k < index.size(); ++k) {
+ auto& bk = ev.get_bits(index[k]);
+ auto d = distance(bi, bk);
+ if (d < best_distance) {
+ best_distance = d;
+ best_index = k;
+ if (best_distance <= 1) {
+ break;
+ }
+ }
+ }
+
+ if (best_index > 0 && i + 1 != best_index) {
+ std::swap(index[i + 1], index[best_index]);
+ }
+ }
+ }
+}
+
+template
+template
+size_t similarity_ordering_::order_tree_rec(
+ basic_cluster_tree_node<
+ basic_cluster>& node,
+ basic_array_similarity_element_view const& ev) const {
+ using node_type = std::decay_t;
+ using bitvec_type =
+ typename basic_array_similarity_element_view::bitvec_type;
+
+ if (node.is_leaf()) {
+ auto& cluster = node.cluster();
+ return std::accumulate(
+ cluster.index.begin(), cluster.index.end(), size_t(0),
+ [&ev](size_t acc, size_t i) { return acc + ev.weight(i); });
+ }
+
+ auto& children = node.children();
+ std::vector<
+ std::tuple>
+ bits;
+ bits.reserve(children.size());
+ size_t total_weight = 0;
+
+ for (auto& cn : children) {
+ auto weight = order_tree_rec(cn, ev);
+ bits.emplace_back(&ev.get_bits(cn.first_index()),
+ &ev.get_bits(cn.last_index()), &cn, weight);
+ total_weight += weight;
+ }
+
+ // all children of this node are ordered now
+
+ std::stable_sort(bits.begin(), bits.end(), [](auto const& a, auto const& b) {
+ return std::get<3>(a) > std::get<3>(b);
+ });
+
+ for (size_t i = 0; i < bits.size() - 1; ++i) {
+ auto bi = std::get<1>(bits[i]);
+ int best_distance = std::numeric_limits::max();
+ size_t best_index = 0;
+
+ for (size_t k = i + 1; k < bits.size(); ++k) {
+ auto bk = std::get<0>(bits[k]);
+ auto d = distance(*bi, *bk);
+ if (d < best_distance) {
+ best_distance = d;
+ best_index = k;
+ if (best_distance <= 1) {
+ break;
+ }
+ }
+ }
+
+ if (best_index > 0 && i + 1 != best_index) {
+ std::swap(bits[i + 1], bits[best_index]);
+ }
+ }
+
+ std::vector ordered_children;
+ ordered_children.reserve(children.size());
+
+ for (auto& b : bits) {
+ ordered_children.emplace_back(std::move(*std::get<2>(b)));
+ }
+
+ children.swap(ordered_children);
+
+ return total_weight;
+}
+
+template
+template
+void similarity_ordering_::cluster_by_distance(
+ basic_cluster_tree_node<
+ basic_cluster>& node,
+ basic_array_similarity_element_view const& ev,
+ int max_distance) const {
+ using node_type = std::decay_t;
+ using cluster_type = typename node_type::cluster_type;
+ typename node_type::children_vector children;
+
+ auto td = LOG_TIMED_DEBUG;
+
+ for (auto i : node.cluster().index) {
+ auto const& vec = ev.get_bits(i);
+ cluster_type* match = nullptr;
+ int best_distance = std::numeric_limits::max();
+ cluster_type* best_match = nullptr;
+
+ for (auto& c : children) {
+ auto& cluster = c.cluster();
+ auto d = cluster.centroid.distance_to(vec);
+
+ if (d <= max_distance) {
+ match = &cluster;
+ break;
+ } else if (d < best_distance) {
+ best_distance = d;
+ best_match = &cluster;
+ }
+ }
+
+ if (!match) {
+ if (children.size() < opts_.max_children) {
+ auto& nn = children.emplace_back();
+ match = &nn.cluster();
+ } else {
+ match = best_match;
+ }
+ }
+
+ match->centroid.add(vec);
+ match->index.push_back(i);
+ }
+
+ td << "cluster_by_distance: " << node.cluster().index.size() << " -> "
+ << children.size() << ")";
+
+ node.v = std::move(children);
+}
+
+template
+template
+void similarity_ordering_::cluster_rec(
+ basic_cluster_tree_node<
+ basic_cluster>& node,
+ basic_array_similarity_element_view const& ev,
+ std::shared_ptr jt, int max_distance) const {
+ cluster_by_distance(node, ev, max_distance);
+
+ for (auto& cn : node.children()) {
+ if (max_distance > 1 &&
+ cn.cluster().index.size() > opts_.max_cluster_size) {
+ jt->start_job();
+ wg_.add_job([this, &cn, &ev, jt, md = max_distance / 2] {
+ cluster_rec(cn, ev, jt, md);
+ jt->finish_job();
+ });
+ } else if (cn.cluster().index.size() > 1) {
+ jt->start_job();
+ wg_.add_job([this, &index = cn.cluster().index, &ev, jt] {
+ order_cluster(ev, index);
+ jt->finish_job();
+ });
+ }
+ }
+}
+
+template
+template
+void similarity_ordering_::cluster(
+ basic_cluster_tree_node<
+ basic_cluster>& root,
+ basic_array_similarity_element_view const& ev,
+ std::shared_ptr jt) const {
+ jt->start_job();
+ wg_.add_job([this, &root, &ev, jt] {
+ cluster_rec(root, ev, jt, Bits / 2);
+ jt->finish_job();
+ });
+}
+
+template
+template
+void similarity_ordering_::collect_rec(
+ basic_cluster_tree_node<
+ basic_cluster>& node,
+ basic_array_similarity_element_view const& ev,
+ duplicates_map& dup, index_type& ordered, std::string indent) const {
+ if (node.is_leaf()) {
+ for (auto e : node.cluster().index) {
+ LOG_TRACE << indent << " " << ev.description(e) << " -> "
+ << node.cluster().centroid.distance_to(ev.get_bits(e));
+
+ ordered.push_back(e);
+
+ if (auto it = dup.find(e); it != dup.end()) {
+ auto& dupvec = it->second;
+
+ std::sort(dupvec.begin(), dupvec.end(),
+ [&ev](auto a, auto b) { return ev.order_less(a, b); });
+
+ for (auto i : dupvec) {
+ LOG_TRACE << indent << " + " << ev.description(i) << " -> "
+ << node.cluster().centroid.distance_to(ev.get_bits(i));
+ ordered.push_back(i);
+ }
+ }
+ }
+ } else {
+ // TODO: order children, probably do this as a separate (parallel)
+ // step before collecting
+
+ for (auto const& [i, cn] : folly::enumerate(node.children())) {
+ LOG_TRACE << indent << "[" << i << "] " << cn.description();
+ collect_rec(cn, ev, dup, ordered, indent + " ");
+ }
+ }
+}
+
+template
+template
+void similarity_ordering_::order_impl(
+ std::promise&& promise,
+ basic_array_similarity_element_view const& ev) const {
+ auto index = build_index(ev);
+
+ LOG_INFO << "total distance before ordering: " << total_distance(ev, index);
+
+ size_t size_hint = index.size();
+ auto duplicates = find_duplicates(ev, index);
+ auto root = std::make_shared(std::move(index));
+
+ auto jt = std::make_shared(
+ [this, size_hint, &ev, p = std::move(promise), root,
+ dup = std::move(duplicates)]() mutable {
+ {
+ auto ti = LOG_TIMED_INFO;
+ order_tree_rec(*root, ev);
+ ti << "order_tree_rec";
+ }
+ index_type rv;
+ rv.reserve(size_hint);
+ collect_rec(*root, ev, dup, rv, "");
+ LOG_INFO << "total distance after ordering: " << total_distance(ev, rv);
+ p.set_value(std::move(rv));
+ });
+
+ cluster(*root, ev, jt);
+}
+
+template
+auto similarity_ordering_::order_nilsimsa(
+ nilsimsa_element_view const& ev) const -> std::future {
+ std::promise prom;
+ auto future = prom.get_future();
+ wg_.add_job([this, prom = std::move(prom), &ev]() mutable {
+ order_impl(std::move(prom), ev);
+ });
+ return future;
+}
+
+similarity_ordering::similarity_ordering(
+ logger& lgr, progress& prog, worker_group& wg,
+ similarity_ordering_options const& opts)
+ : impl_(make_unique_logging_object(lgr, prog, wg, opts)) {}
+
+} // namespace dwarfs