diff --git a/CMakeLists.txt b/CMakeLists.txt index 3154642f..96d6b5f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -395,6 +395,7 @@ list( src/dwarfs/scanner.cpp src/dwarfs/segmenter.cpp src/dwarfs/similarity.cpp + src/dwarfs/similarity_ordering.cpp src/dwarfs/string_table.cpp src/dwarfs/terminal.cpp src/dwarfs/util.cpp diff --git a/include/dwarfs/similarity_ordering.h b/include/dwarfs/similarity_ordering.h new file mode 100644 index 00000000..71659c7f --- /dev/null +++ b/include/dwarfs/similarity_ordering.h @@ -0,0 +1,87 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <future>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace dwarfs {
+
+class logger;
+class progress;
+class worker_group;
+
+/**
+ * Abstract, index-based view of the elements that are to be ordered.
+ *
+ * Elements are addressed by an index in the range [0, size()). Only
+ * indices for which exists() returns true take part in the ordering.
+ */
+class similarity_element_view {
+ public:
+  // Virtual, so that a derived view can safely be destroyed through a
+  // pointer to this base class.
+  virtual ~similarity_element_view() = default;
+
+  /// Returns true if element `i` should be part of the ordering.
+  virtual bool exists(size_t i) const = 0;
+  /// Returns the number of index slots; valid indices are [0, size()).
+  virtual size_t size() const = 0;
+  /// Returns the weight of element `i`; weights are summed per subtree and
+  /// heavier subtrees are considered first when ordering tree nodes.
+  virtual size_t weight(size_t i) const = 0;
+  /// Strict ordering on the bit vectors of `a` and `b`; used to sort the
+  /// index so that elements with identical bits become adjacent.
+  virtual bool bitvec_less(size_t a, size_t b) const = 0;
+  /// Strict ordering used to sort the elements of a cluster and the
+  /// duplicates of an element before they are emitted.
+  virtual bool order_less(size_t a, size_t b) const = 0;
+  /// Returns true if `a` and `b` have identical bit vectors.
+  virtual bool bits_equal(size_t a, size_t b) const = 0;
+  /// Returns a human readable description of element `i` (trace logging).
+  virtual std::string description(size_t i) const = 0;
+};
+
+/**
+ * Element view whose elements each expose a fixed-size bit vector.
+ *
+ * \tparam Bits      Number of bits per vector; must be a multiple of the
+ *                   bit width of `BitsType`.
+ * \tparam BitsType  Unsigned word type the bit vector is stored in.
+ */
+template <size_t Bits, typename BitsType>
+class basic_array_similarity_element_view : public similarity_element_view {
+ public:
+  static_assert(Bits % (8 * sizeof(BitsType)) == 0);
+  static constexpr size_t const bitvec_size = Bits / (8 * sizeof(BitsType));
+  using bitvec_type = std::array<BitsType, bitvec_size>;
+
+  /// Returns the bit vector of element `i`. The ordering code keeps
+  /// pointers to the returned object, so the reference must stay valid
+  /// for as long as the view is in use.
+  virtual bitvec_type const& get_bits(size_t i) const = 0;
+};
+
+/// Tuning knobs for the clustering performed by `similarity_ordering`.
+struct similarity_ordering_options {
+  /// Upper bound for the number of child clusters created per tree node.
+  size_t max_children{256};
+  /// Clusters with more elements than this are split up further.
+  size_t max_cluster_size{256};
+};
+
+/**
+ * Orders elements such that elements with similar bit vectors end up
+ * close to each other. This class only forwards to a logger-policy
+ * specific implementation created in the constructor.
+ */
+class similarity_ordering {
+ public:
+  using index_value_type = uint32_t;
+
+  similarity_ordering(logger& lgr, progress& prog, worker_group& wg,
+                      similarity_ordering_options const& opts);
+
+  /**
+   * Starts ordering the elements of `ev` on the worker group.
+   *
+   * \param ev  View of the elements; it is referenced from background
+   *            jobs and must stay alive until the future is ready.
+   * \return    Future for the ordered list of element indices.
+   */
+  std::future<std::vector<index_value_type>> order_nilsimsa(
+      basic_array_similarity_element_view<256, uint64_t> const& ev) const {
+    return impl_->order_nilsimsa(ev);
+  }
+
+  /// Interface of the actual implementation.
+  class impl {
+   public:
+    virtual ~impl() = default;
+
+    virtual std::future<std::vector<index_value_type>> order_nilsimsa(
+        basic_array_similarity_element_view<256, uint64_t> const& ev) const = 0;
+  };
+
+ private:
+  std::unique_ptr<impl> impl_;
+};
+
+} // namespace dwarfs
diff --git a/src/dwarfs/similarity_ordering.cpp b/src/dwarfs/similarity_ordering.cpp
new file mode 100644
index 00000000..ec4469c4
--- /dev/null
+++ b/src/dwarfs/similarity_ordering.cpp
@@ -0,0 +1,631 @@
+/* vim:set ts=2 sw=2 sts=2 et: */
+/**
+ * \author Marcus Holland-Moritz (github@mhxnet.de)
+ * \copyright Copyright (c) Marcus Holland-Moritz
+ *
+ * This file is part of dwarfs.
+ *
+ * dwarfs is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * dwarfs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <future>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <numeric>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#include <folly/container/Enumerate.h>
+#include <folly/experimental/Bits.h>
+
+#include "dwarfs/logger.h"
+#include "dwarfs/progress.h"
+#include "dwarfs/similarity_ordering.h"
+#include "dwarfs/worker_group.h"
+
+namespace dwarfs {
+
+namespace {
+
+// TODO: move out of here
+/**
+ * Counts outstanding jobs and invokes a callback when the count drops
+ * to zero.
+ *
+ * The callback is invoked from finish_job() every time the counter reaches
+ * zero, so a job that spawns follow-up jobs must call start_job() for them
+ * before calling finish_job() for itself.
+ */
+class job_tracker {
+ public:
+  explicit job_tracker(folly::Function<void()>&& on_jobs_done)
+      : on_jobs_done_{std::move(on_jobs_done)} {}
+
+  /// Registers one more outstanding job.
+  void start_job() {
+    std::lock_guard lock(mx_);
+    ++active_;
+  }
+
+  /// Marks one job as done and runs the callback if it was the last one.
+  void finish_job() {
+    bool all_done = false;
+    {
+      std::lock_guard lock(mx_);
+      assert(active_ > 0);
+      --active_;
+      all_done = active_ == 0;
+    }
+    // run the callback without holding the mutex
+    if (all_done) {
+      on_jobs_done_();
+    }
+  }
+
+ private:
+  std::mutex mx_;
+  size_t active_{0};
+  folly::Function<void()> on_jobs_done_;
+};
+
+/// Returns the number of bits in which `a` and `b` differ.
+template <typename T, size_t N>
+int distance(std::array<T, N> const& a, std::array<T, N> const& b) {
+  int d = 0;
+  for (size_t i = 0; i < N; ++i) {
+    d += folly::popcount(a[i] ^ b[i]);
+  }
+  return d;
+}
+
+/**
+ * Bitwise majority vote over all bit vectors added so far.
+ *
+ * \tparam Bits        Number of bits per vector.
+ * \tparam BitsType    Word type the bit vectors are stored in.
+ * \tparam CountsType  Integer type of the per-bit and vector counters.
+ */
+template <size_t Bits, typename BitsType, typename CountsType>
+class basic_centroid {
+ public:
+  static_assert(Bits % (8 * sizeof(BitsType)) == 0);
+  static constexpr size_t const array_size = Bits / (8 * sizeof(BitsType));
+  using value_type = std::array<BitsType, array_size>;
+  using bits_type = folly::Bits<BitsType>;
+
+  basic_centroid() {
+    std::fill(centroid_.begin(), centroid_.end(), 0);
+    std::fill(bitcounts_.begin(), bitcounts_.end(), 0);
+  }
+
+  /// Returns the current centroid bit vector.
+  value_type const& value() const { return centroid_; }
+
+  /// Adds `vec` to the centroid; afterwards a centroid bit is set iff it
+  /// is set in more than `veccount_ / 2` of the added vectors.
+  void add(value_type const& vec) {
+    ++veccount_;
+    for (size_t bit = 0; bit < Bits; ++bit) {
+      bitcounts_[bit] += bits_type::test(vec.data(), bit) ? 1 : 0;
+      if (bitcounts_[bit] > veccount_ / 2) {
+        bits_type::set(centroid_.data(), bit);
+      } else {
+        bits_type::clear(centroid_.data(), bit);
+      }
+    }
+  }
+
+  /// Returns the number of bits in which `vec` differs from the centroid.
+  auto distance_to(value_type const& vec) const {
+    return distance(centroid_, vec);
+  }
+
+ private:
+  value_type centroid_;
+  std::array<CountsType, Bits> bitcounts_;
+  // Explicitly zeroed: the constructor only clears the arrays, and a
+  // default-initialized centroid would otherwise start counting from an
+  // indeterminate value.
+  CountsType veccount_{0};
+};
+
+/// A set of element indices along with the centroid of their bit vectors.
+template <size_t Bits, typename BitsType, typename CountsType,
+          typename IndexValueType>
+struct basic_cluster {
+  using centroid_type = basic_centroid<Bits, BitsType, CountsType>;
+  using index_value_type = IndexValueType;
+  using index_type = std::vector<index_value_type>;
+
+  basic_cluster() = default;
+  explicit basic_cluster(index_type&& index)
+      : index{std::move(index)} {}
+
+  centroid_type centroid;
+  index_type index;
+};
+
+/**
+ * Node of the cluster tree. A node is either a leaf owning a cluster or
+ * an inner node owning a vector of child nodes.
+ */
+template <typename ClusterType>
+struct basic_cluster_tree_node {
+  using cluster_type = ClusterType;
+  using index_type = typename cluster_type::index_type;
+  using index_value_type = typename cluster_type::index_value_type;
+  using cluster_pointer = std::unique_ptr<cluster_type>;
+  using children_vector = std::vector<basic_cluster_tree_node<cluster_type>>;
+
+  /// Creates a leaf with an empty cluster.
+  basic_cluster_tree_node()
+      : v{std::make_unique<cluster_type>()} {}
+  /// Creates a leaf whose cluster takes over `index`.
+  explicit basic_cluster_tree_node(index_type&& index)
+      : v{std::make_unique<cluster_type>(std::move(index))} {}
+
+  // The accessors below throw std::bad_variant_access if the node does
+  // not hold the requested alternative.
+  children_vector const& children() const {
+    return std::get<children_vector>(v);
+  }
+  children_vector& children() { return std::get<children_vector>(v); }
+
+  cluster_type const& cluster() const { return *std::get<cluster_pointer>(v); }
+  cluster_type& cluster() { return *std::get<cluster_pointer>(v); }
+
+  bool is_leaf() const { return std::holds_alternative<cluster_pointer>(v); }
+
+  std::string description() const {
+    if (is_leaf()) {
+      return fmt::format("{} items", cluster().index.size());
+    } else {
+      return fmt::format("{} children", children().size());
+    }
+  }
+
+  /// First element index of the subtree; the subtree must not be empty.
+  index_value_type first_index() const {
+    if (is_leaf()) {
+      return cluster().index.front();
+    }
+    return children().front().first_index();
+  }
+
+  /// Last element index of the subtree; the subtree must not be empty.
+  index_value_type last_index() const {
+    if (is_leaf()) {
+      return cluster().index.back();
+    }
+    return children().back().last_index();
+  }
+
+  std::variant<cluster_pointer, children_vector> v;
+};
+
+} // namespace
+
+/**
+ * Implementation of `similarity_ordering` for a given logger policy.
+ *
+ * The elements are recursively clustered by bit vector distance on the
+ * worker group; once all jobs are done, the tree is ordered and flattened
+ * into the resulting index.
+ */
+template <typename LoggerPolicy>
+class similarity_ordering_ final : public similarity_ordering::impl {
+ public:
+  using index_value_type = similarity_ordering::index_value_type;
+  using index_type = std::vector<index_value_type>;
+  using duplicates_map =
+      std::unordered_map<index_value_type, std::vector<index_value_type>>;
+  using nilsimsa_element_view =
+      basic_array_similarity_element_view<256, uint64_t>;
+  using nilsimsa_cluster =
+      basic_cluster<256, uint64_t, uint32_t, index_value_type>;
+  using nilsimsa_cluster_tree_node = basic_cluster_tree_node<nilsimsa_cluster>;
+
+  similarity_ordering_(logger& lgr, progress& prog, worker_group& wg,
+                       similarity_ordering_options const& opts)
+      : LOG_PROXY_INIT(lgr)
+      , prog_{prog}
+      , wg_{wg}
+      , opts_{opts} {}
+
+  std::future<index_type>
+  order_nilsimsa(nilsimsa_element_view const& ev) const override;
+
+ private:
+  index_type build_index(similarity_element_view const& ev) const;
+  duplicates_map
+  find_duplicates(similarity_element_view const& ev, index_type& index) const;
+
+  template <size_t Bits, typename BitsType>
+  size_t
+  total_distance(basic_array_similarity_element_view<Bits, BitsType> const& ev,
+                 index_type const& index) const;
+
+  template <size_t Bits, typename BitsType>
+  void
+  order_cluster(basic_array_similarity_element_view<Bits, BitsType> const& ev,
+                index_type& index) const;
+
+  template <size_t Bits, typename BitsType, typename CountsType>
+  size_t order_tree_rec(
+      basic_cluster_tree_node<
+          basic_cluster<Bits, BitsType, CountsType, index_value_type>>& node,
+      basic_array_similarity_element_view<Bits, BitsType> const& ev) const;
+
+  template <size_t Bits, typename BitsType, typename CountsType>
+  void cluster_by_distance(
+      basic_cluster_tree_node<
+          basic_cluster<Bits, BitsType, CountsType, index_value_type>>& node,
+      basic_array_similarity_element_view<Bits, BitsType> const& ev,
+      int max_distance) const;
+
+  template <size_t Bits, typename BitsType, typename CountsType>
+  void cluster_rec(
+      basic_cluster_tree_node<
+          basic_cluster<Bits, BitsType, CountsType, index_value_type>>& node,
+      basic_array_similarity_element_view<Bits, BitsType> const& ev,
+      std::shared_ptr<job_tracker> jt, int max_distance) const;
+
+  template <size_t Bits, typename BitsType, typename CountsType>
+  void cluster(
+      basic_cluster_tree_node<
+          basic_cluster<Bits, BitsType, CountsType, index_value_type>>& root,
+      basic_array_similarity_element_view<Bits, BitsType> const& ev,
+      std::shared_ptr<job_tracker> jt) const;
+
+  template <size_t Bits, typename BitsType, typename CountsType>
+  void collect_rec(
+      basic_cluster_tree_node<
+          basic_cluster<Bits, BitsType, CountsType, index_value_type>>& node,
+      basic_array_similarity_element_view<Bits, BitsType> const& ev,
+      duplicates_map& dup, index_type& ordered, std::string indent) const;
+
+  template <size_t Bits, typename BitsType>
+  void order_impl(
+      std::promise<std::vector<index_value_type>>&& promise,
+      basic_array_similarity_element_view<Bits, BitsType> const& ev) const;
+
+  LOG_PROXY_DECL(LoggerPolicy);
+  progress& prog_;
+  worker_group& wg_;
+  similarity_ordering_options const opts_;
+};
+
+/// Returns the indices of all elements of `ev` for which exists() is true,
+/// in ascending order.
+template <typename LoggerPolicy>
+auto similarity_ordering_<LoggerPolicy>::build_index(
+    similarity_element_view const& ev) const -> index_type {
+  index_type index;
+
+  {
+    auto tt = LOG_TIMED_TRACE;
+
+    auto const size = ev.size();
+
+    // Every element index must be representable as index_value_type. The
+    // loop counter is a size_t so that it cannot wrap around before
+    // reaching `size`.
+    assert(size <= std::numeric_limits<index_value_type>::max());
+
+    index.reserve(size);
+    for (size_t i = 0; i < size; ++i) {
+      if (ev.exists(i)) {
+        index.push_back(static_cast<index_value_type>(i));
+      }
+    }
+    index.shrink_to_fit();
+
+    tt << "build index: " << size << " -> " << index.size();
+  }
+
+  return index;
+}
+
+/**
+ * Removes elements with duplicate bit vectors from `index`.
+ *
+ * `index` is sorted using bitvec_less() and then reduced to the first
+ * element of each run of elements with equal bits.
+ *
+ * \return Map from each kept element that has duplicates to the list of
+ *         its removed duplicates.
+ */
+template <typename LoggerPolicy>
+auto similarity_ordering_<LoggerPolicy>::find_duplicates(
+    similarity_element_view const& ev, index_type& index) const
+    -> duplicates_map {
+  duplicates_map dm;
+
+  {
+    auto tt = LOG_TIMED_TRACE;
+
+    std::sort(index.begin(), index.end(),
+              [&ev](auto a, auto b) { return ev.bitvec_less(a, b); });
+
+    tt << "sort index of " << index.size() << " elements";
+  }
+
+  {
+    auto tt = LOG_TIMED_TRACE;
+
+    if (!index.empty()) {
+      // `dst` is the last unique element kept so far, `src` scans ahead
+      auto src = index.begin();
+      auto dst = src;
+
+      while (++src != index.end()) {
+        if (ev.bits_equal(*dst, *src)) {
+          dm[*dst].push_back(*src);
+        } else if (++dst != src) {
+          *dst = std::move(*src);
+        }
+      }
+
+      index.erase(++dst, index.end());
+    }
+
+    tt << "find duplicates: " << index.size() << " unique / " << dm.size()
+       << " groups";
+  }
+
+  return dm;
+}
+
+/// Returns the sum of the bit distances between all pairs of neighbouring
+/// elements in `index`.
+template <typename LoggerPolicy>
+template <size_t Bits, typename BitsType>
+size_t similarity_ordering_<LoggerPolicy>::total_distance(
+    basic_array_similarity_element_view<Bits, BitsType> const& ev,
+    index_type const& index) const {
+  size_t td = 0;
+
+  if (!index.empty()) {
+    auto* prev = &ev.get_bits(index[0]);
+
+    for (size_t i = 1; i < index.size(); ++i) {
+      auto& curr = ev.get_bits(index[i]);
+      td += distance(*prev, curr);
+      prev = &curr;
+    }
+  }
+
+  return td;
+}
+
+/**
+ * Orders the elements of a single cluster in place.
+ *
+ * The index is first sorted using order_less(); then, for each position,
+ * the closest of the remaining elements is swapped into the next position.
+ */
+template <typename LoggerPolicy>
+template <size_t Bits, typename BitsType>
+void similarity_ordering_<LoggerPolicy>::order_cluster(
+    basic_array_similarity_element_view<Bits, BitsType> const& ev,
+    index_type& index) const {
+  if (!index.empty()) {
+    // TODO: try simulated annealing again? reproducibly?
+
+    std::sort(index.begin(), index.end(),
+              [&ev](auto a, auto b) { return ev.order_less(a, b); });
+
+    // TODO: maybe it's worth caching bits pointers beforehand?
+    for (size_t i = 0; i + 1 < index.size(); ++i) {
+      auto& bi = ev.get_bits(index[i]);
+      int best_distance = std::numeric_limits<int>::max();
+      size_t best_index = 0;
+
+      for (size_t k = i + 1; k < index.size(); ++k) {
+        auto& bk = ev.get_bits(index[k]);
+        auto d = distance(bi, bk);
+        if (d < best_distance) {
+          best_distance = d;
+          best_index = k;
+          // stop searching once a (nearly) identical element was found
+          if (best_distance <= 1) {
+            break;
+          }
+        }
+      }
+
+      if (best_index > 0 && i + 1 != best_index) {
+        std::swap(index[i + 1], index[best_index]);
+      }
+    }
+  }
+}
+
+/**
+ * Recursively orders the children of all inner nodes below `node`.
+ *
+ * Children are sorted by descending weight; then, for each child, the
+ * remaining child whose first element is closest to the last element of
+ * the current child is swapped into the next position.
+ *
+ * \return Sum of the weights of all elements below `node`.
+ */
+template <typename LoggerPolicy>
+template <size_t Bits, typename BitsType, typename CountsType>
+size_t similarity_ordering_<LoggerPolicy>::order_tree_rec(
+    basic_cluster_tree_node<
+        basic_cluster<Bits, BitsType, CountsType, index_value_type>>& node,
+    basic_array_similarity_element_view<Bits, BitsType> const& ev) const {
+  using node_type = std::decay_t<decltype(node)>;
+  using bitvec_type =
+      typename basic_array_similarity_element_view<Bits, BitsType>::bitvec_type;
+
+  if (node.is_leaf()) {
+    auto& cluster = node.cluster();
+    return std::accumulate(
+        cluster.index.begin(), cluster.index.end(), size_t(0),
+        [&ev](size_t acc, size_t i) { return acc + ev.weight(i); });
+  }
+
+  auto& children = node.children();
+  // (bits of first element, bits of last element, child node, weight)
+  std::vector<
+      std::tuple<bitvec_type const*, bitvec_type const*, node_type*, size_t>>
+      bits;
+  bits.reserve(children.size());
+  size_t total_weight = 0;
+
+  for (auto& cn : children) {
+    auto weight = order_tree_rec(cn, ev);
+    bits.emplace_back(&ev.get_bits(cn.first_index()),
+                      &ev.get_bits(cn.last_index()), &cn, weight);
+    total_weight += weight;
+  }
+
+  // all children of this node are ordered now
+
+  std::stable_sort(bits.begin(), bits.end(), [](auto const& a, auto const& b) {
+    return std::get<3>(a) > std::get<3>(b);
+  });
+
+  // `i + 1 < size()` instead of `i < size() - 1`: the root node has no
+  // children at all if there are no elements to order, and the unsigned
+  // subtraction would wrap around in that case.
+  for (size_t i = 0; i + 1 < bits.size(); ++i) {
+    auto bi = std::get<1>(bits[i]);
+    int best_distance = std::numeric_limits<int>::max();
+    size_t best_index = 0;
+
+    for (size_t k = i + 1; k < bits.size(); ++k) {
+      auto bk = std::get<0>(bits[k]);
+      auto d = distance(*bi, *bk);
+      if (d < best_distance) {
+        best_distance = d;
+        best_index = k;
+        if (best_distance <= 1) {
+          break;
+        }
+      }
+    }
+
+    if (best_index > 0 && i + 1 != best_index) {
+      std::swap(bits[i + 1], bits[best_index]);
+    }
+  }
+
+  std::vector<node_type> ordered_children;
+  ordered_children.reserve(children.size());
+
+  for (auto& b : bits) {
+    ordered_children.emplace_back(std::move(*std::get<2>(b)));
+  }
+
+  children.swap(ordered_children);
+
+  return total_weight;
+}
+
+/**
+ * Turns the leaf `node` into an inner node by distributing its elements
+ * to child clusters.
+ *
+ * Each element goes to the first child whose centroid is at most
+ * `max_distance` bits away. If there is none, a new child is created
+ * until `max_children` is reached; after that the element goes to the
+ * closest existing child.
+ */
+template <typename LoggerPolicy>
+template <size_t Bits, typename BitsType, typename CountsType>
+void similarity_ordering_<LoggerPolicy>::cluster_by_distance(
+    basic_cluster_tree_node<
+        basic_cluster<Bits, BitsType, CountsType, index_value_type>>& node,
+    basic_array_similarity_element_view<Bits, BitsType> const& ev,
+    int max_distance) const {
+  using node_type = std::decay_t<decltype(node)>;
+  using cluster_type = typename node_type::cluster_type;
+  typename node_type::children_vector children;
+
+  auto td = LOG_TIMED_DEBUG;
+
+  for (auto i : node.cluster().index) {
+    auto const& vec = ev.get_bits(i);
+    cluster_type* match = nullptr;
+    int best_distance = std::numeric_limits<int>::max();
+    cluster_type* best_match = nullptr;
+
+    for (auto& c : children) {
+      auto& cluster = c.cluster();
+      auto d = cluster.centroid.distance_to(vec);
+
+      if (d <= max_distance) {
+        match = &cluster;
+        break;
+      } else if (d < best_distance) {
+        best_distance = d;
+        best_match = &cluster;
+      }
+    }
+
+    if (!match) {
+      // Always create the first child, even if max_children is zero;
+      // there would be no best match to fall back to otherwise.
+      if (children.empty() || children.size() < opts_.max_children) {
+        auto& nn = children.emplace_back();
+        match = &nn.cluster();
+      } else {
+        match = best_match;
+      }
+    }
+
+    // clusters are heap allocated, so `match` remains valid even if
+    // `children` had to reallocate
+    match->centroid.add(vec);
+    match->index.push_back(i);
+  }
+
+  td << "cluster_by_distance: " << node.cluster().index.size() << " -> "
+     << children.size();
+
+  node.v = std::move(children);
+}
+
+/**
+ * Clusters `node` and schedules follow-up jobs for its children.
+ *
+ * Children with more than `max_cluster_size` elements are clustered again
+ * with half the maximum distance (as long as `max_distance` is greater
+ * than 1); all other children with more than one element are ordered.
+ */
+template <typename LoggerPolicy>
+template <size_t Bits, typename BitsType, typename CountsType>
+void similarity_ordering_<LoggerPolicy>::cluster_rec(
+    basic_cluster_tree_node<
+        basic_cluster<Bits, BitsType, CountsType, index_value_type>>& node,
+    basic_array_similarity_element_view<Bits, BitsType> const& ev,
+    std::shared_ptr<job_tracker> jt, int max_distance) const {
+  cluster_by_distance(node, ev, max_distance);
+
+  for (auto& cn : node.children()) {
+    if (max_distance > 1 &&
+        cn.cluster().index.size() > opts_.max_cluster_size) {
+      // register the job before queueing it, so that the tracker cannot
+      // reach zero while there is still work to be done
+      jt->start_job();
+      wg_.add_job([this, &cn, &ev, jt, md = max_distance / 2] {
+        cluster_rec(cn, ev, jt, md);
+        jt->finish_job();
+      });
+    } else if (cn.cluster().index.size() > 1) {
+      jt->start_job();
+      wg_.add_job([this, &index = cn.cluster().index, &ev, jt] {
+        order_cluster(ev, index);
+        jt->finish_job();
+      });
+    }
+  }
+}
+
+/// Schedules clustering of the whole tree below `root`, starting out with
+/// a maximum distance of half the number of bits.
+template <typename LoggerPolicy>
+template <size_t Bits, typename BitsType, typename CountsType>
+void similarity_ordering_<LoggerPolicy>::cluster(
+    basic_cluster_tree_node<
+        basic_cluster<Bits, BitsType, CountsType, index_value_type>>& root,
+    basic_array_similarity_element_view<Bits, BitsType> const& ev,
+    std::shared_ptr<job_tracker> jt) const {
+  jt->start_job();
+  wg_.add_job([this, &root, &ev, jt] {
+    cluster_rec(root, ev, jt, Bits / 2);
+    jt->finish_job();
+  });
+}
+
+/**
+ * Appends all elements below `node` to `ordered` in tree order. Each
+ * element is directly followed by its duplicates from `dup`, which are
+ * sorted using order_less().
+ */
+template <typename LoggerPolicy>
+template <size_t Bits, typename BitsType, typename CountsType>
+void similarity_ordering_<LoggerPolicy>::collect_rec(
+    basic_cluster_tree_node<
+        basic_cluster<Bits, BitsType, CountsType, index_value_type>>& node,
+    basic_array_similarity_element_view<Bits, BitsType> const& ev,
+    duplicates_map& dup, index_type& ordered, std::string indent) const {
+  if (node.is_leaf()) {
+    for (auto e : node.cluster().index) {
+      LOG_TRACE << indent << " " << ev.description(e) << " -> "
+                << node.cluster().centroid.distance_to(ev.get_bits(e));
+
+      ordered.push_back(e);
+
+      if (auto it = dup.find(e); it != dup.end()) {
+        auto& dupvec = it->second;
+
+        std::sort(dupvec.begin(), dupvec.end(),
+                  [&ev](auto a, auto b) { return ev.order_less(a, b); });
+
+        for (auto i : dupvec) {
+          LOG_TRACE << indent << " + " << ev.description(i) << " -> "
+                    << node.cluster().centroid.distance_to(ev.get_bits(i));
+          ordered.push_back(i);
+        }
+      }
+    }
+  } else {
+    // TODO: order children, probably do this as a separate (parallel)
+    //       step before collecting
+
+    for (auto const& [i, cn] : folly::enumerate(node.children())) {
+      LOG_TRACE << indent << "[" << i << "] " << cn.description();
+      collect_rec(cn, ev, dup, ordered, indent + " ");
+    }
+  }
+}
+
+/**
+ * Builds the index, removes duplicates and starts clustering.
+ *
+ * Once the last clustering/ordering job has finished, the job tracker
+ * callback orders the tree, collects the result and fulfils `promise`.
+ * `ev` is referenced by that callback and by all jobs.
+ */
+template <typename LoggerPolicy>
+template <size_t Bits, typename BitsType>
+void similarity_ordering_<LoggerPolicy>::order_impl(
+    std::promise<std::vector<index_value_type>>&& promise,
+    basic_array_similarity_element_view<Bits, BitsType> const& ev) const {
+  auto index = build_index(ev);
+
+  LOG_INFO << "total distance before ordering: " << total_distance(ev, index);
+
+  // taken before duplicates are removed, as they are part of the result
+  size_t size_hint = index.size();
+  auto duplicates = find_duplicates(ev, index);
+  auto root = std::make_shared<nilsimsa_cluster_tree_node>(std::move(index));
+
+  auto jt = std::make_shared<job_tracker>(
+      [this, size_hint, &ev, p = std::move(promise), root,
+       dup = std::move(duplicates)]() mutable {
+        {
+          auto ti = LOG_TIMED_INFO;
+          order_tree_rec(*root, ev);
+          ti << "order_tree_rec";
+        }
+        index_type rv;
+        rv.reserve(size_hint);
+        collect_rec(*root, ev, dup, rv, "");
+        LOG_INFO << "total distance after ordering: " << total_distance(ev, rv);
+        p.set_value(std::move(rv));
+      });
+
+  cluster(*root, ev, jt);
+}
+
+/// Queues the ordering of `ev` on the worker group. `ev` is captured by
+/// reference and must stay alive until the returned future is ready.
+template <typename LoggerPolicy>
+auto similarity_ordering_<LoggerPolicy>::order_nilsimsa(
+    nilsimsa_element_view const& ev) const -> std::future<index_type> {
+  std::promise<index_type> prom;
+  auto future = prom.get_future();
+  wg_.add_job([this, prom = std::move(prom), &ev]() mutable {
+    order_impl(std::move(prom), ev);
+  });
+  return future;
+}
+
+// NOTE(review): the template arguments of make_unique_logging_object were
+// lost in this copy of the patch and have been reconstructed; confirm
+// them against dwarfs/logger.h.
+similarity_ordering::similarity_ordering(
+    logger& lgr, progress& prog, worker_group& wg,
+    similarity_ordering_options const& opts)
+    : impl_(make_unique_logging_object<impl, similarity_ordering_,
+                                       logger_policies>(lgr, prog, wg, opts)) {}
+
+} // namespace dwarfs