From cac5b778e898029f02d607b4d8ed4dd55ee135cf Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Fri, 21 Mar 2025 22:58:03 +0100 Subject: [PATCH] refactor: replace multiversioning --- cmake/compile.cmake | 2 +- include/dwarfs/compiler.h | 7 +- src/writer/internal/nilsimsa.cpp | 3 - src/writer/internal/similarity_ordering.cpp | 120 +++++++++++++++----- test/multiversioning_benchmark.cpp | 101 +++++++++++++++- 5 files changed, 194 insertions(+), 39 deletions(-) diff --git a/cmake/compile.cmake b/cmake/compile.cmake index 859a30aa..6f99cc74 100644 --- a/cmake/compile.cmake +++ b/cmake/compile.cmake @@ -62,7 +62,7 @@ if(NOT WIN32) if(LDD_EXE) execute_process(COMMAND ${LDD_EXE} --version ERROR_VARIABLE LDD_VERSION) if(LDD_VERSION MATCHES "musl libc") - add_compile_definitions(DWARFS_MUSL=1 _LARGEFILE64_SOURCE) + add_compile_definitions(_LARGEFILE64_SOURCE) endif() endif() diff --git a/include/dwarfs/compiler.h b/include/dwarfs/compiler.h index b07f3c7c..7933fc90 100644 --- a/include/dwarfs/compiler.h +++ b/include/dwarfs/compiler.h @@ -33,9 +33,10 @@ #endif #endif -#if !defined(DWARFS_SANITIZE_THREAD) && !defined(DWARFS_MUSL) && \ - defined(__x86_64__) && __has_attribute(target_clones) -#define DWARFS_MULTIVERSIONING 1 +#if defined(__has_builtin) +#if __has_builtin(__builtin_cpu_supports) && __has_attribute(target) +#define DWARFS_USE_CPU_FEATURES 1 +#endif #endif #if defined(__GNUC__) || defined(__clang__) diff --git a/src/writer/internal/nilsimsa.cpp b/src/writer/internal/nilsimsa.cpp index cf67c7ea..22b3d03f 100644 --- a/src/writer/internal/nilsimsa.cpp +++ b/src/writer/internal/nilsimsa.cpp @@ -140,9 +140,6 @@ class nilsimsa::impl { size_ += size; } -#ifdef DWARFS_MULTIVERSIONING - // __attribute__((target_clones("arch=tigerlake", "default"))) -#endif void update_fast(uint8_t const* data, size_t size) { uint8_t w1 = w_[0]; uint8_t w2 = w_[1]; diff --git a/src/writer/internal/similarity_ordering.cpp b/src/writer/internal/similarity_ordering.cpp index 17ee2a4f..61d6b6f5 100644 --- a/src/writer/internal/similarity_ordering.cpp +++ b/src/writer/internal/similarity_ordering.cpp @@ -20,6 +20,7 @@ */ #include +#include #include #include #include @@ -78,46 +79,111 @@ template int distance(std::array const& a, std::array const& b) { int d = 0; for (size_t i = 0; i < N; ++i) { - d += folly::popcount(a[i] ^ b[i]); + d += std::popcount(a[i] ^ b[i]); } return d; } -#ifdef DWARFS_MULTIVERSIONING -#ifdef __clang__ -__attribute__((target_clones("avx512vpopcntdq", "popcnt", "default"))) +#if defined(DWARFS_USE_CPU_FEATURES) && defined(__x86_64__) +#define DWARFS_USE_POPCNT +#endif + +enum class cpu_feature { + none, + popcnt, +}; + +cpu_feature detect_cpu_feature() { +#ifdef DWARFS_USE_POPCNT + static cpu_feature const feature = [] { + if (__builtin_cpu_supports("popcnt")) { + return cpu_feature::popcnt; + } + return cpu_feature::none; + }(); + return feature; #else -__attribute__((target_clones("popcnt", "default"))) + return cpu_feature::none; #endif +} + +template +decltype(auto) cpu_dispatch(Args&&... args) { +#ifdef DWARFS_USE_POPCNT + auto feature = detect_cpu_feature(); + switch (feature) { + case cpu_feature::popcnt: + return Fn::template call(std::forward(args)...); + default: + break; + } #endif -int distance(std::array const& a, std::array const& b) { + return Fn::template call(std::forward(args)...); +} + +int distance_default(std::array const& a, + std::array const& b) { return distance(a, b); } +#ifdef DWARFS_USE_POPCNT +__attribute__((__target__("popcnt"))) int +distance_popcnt(std::array const& a, + std::array const& b) { + return distance(a, b); +} +#endif + +struct distance_cpu { + template + static int + call(std::array const& a, std::array const& b) { +#ifdef DWARFS_USE_POPCNT + if constexpr (CpuFeature == cpu_feature::popcnt) { + return distance_popcnt(a, b); + } +#endif + return distance_default(a, b); + } +}; + +struct order_by_shortest_path_cpu { + template + static void + call(size_t count, GetI const& geti, GetK const& getk, Swap const& swapper) { + for (size_t i = 0; i < count - 1; ++i) { + auto bi = geti(i); + int best_distance = std::numeric_limits::max(); + size_t best_index = 0; + + for (size_t k = i + 1; k < count; ++k) { + auto bk = getk(k); + auto d = distance_cpu::template call(*bi, *bk); + if (d < best_distance) { + best_distance = d; + best_index = k; + if (best_distance <= 1) { + break; + } + } + } + + if (best_index > 0 && i + 1 != best_index) { + swapper(i + 1, best_index); + } + } + } +}; + +int distance(std::array const& a, + std::array const& b) { + return cpu_dispatch(a, b); +} + template void order_by_shortest_path(size_t count, GetI const& geti, GetK const& getk, Swap const& swapper) { - for (size_t i = 0; i < count - 1; ++i) { - auto bi = geti(i); - int best_distance = std::numeric_limits::max(); - size_t best_index = 0; - - for (size_t k = i + 1; k < count; ++k) { - auto bk = getk(k); - auto d = distance(*bi, *bk); - if (d < best_distance) { - best_distance = d; - best_index = k; - if (best_distance <= 1) { - break; - } - } - } - - if (best_index > 0 && i + 1 != best_index) { - swapper(i + 1, best_index); - } - } + cpu_dispatch(count, geti, getk, swapper); } template const& a, std::array const& b) { return d; } -#ifdef DWARFS_MULTIVERSIONING -#ifdef __clang__ -__attribute__((target_clones("avx512vpopcntdq", "popcnt", "default"))) +#if defined(DWARFS_USE_CPU_FEATURES) && defined(__x86_64__) +#define DWARFS_USE_POPCNT +#endif + +enum class cpu_feature { + none, + popcnt, +}; + +cpu_feature detect_cpu_feature() { +#ifdef DWARFS_USE_POPCNT + static cpu_feature const feature = [] { + if (__builtin_cpu_supports("popcnt")) { + return cpu_feature::popcnt; + } + return cpu_feature::none; + }(); + return feature; #else -__attribute__((target_clones("popcnt", "default"))) + return cpu_feature::none; #endif +} + +template +decltype(auto) cpu_dispatch(Args&&... args) { +#ifdef DWARFS_USE_POPCNT + auto feature = detect_cpu_feature(); + switch (feature) { + case cpu_feature::popcnt: + return Fn::template call(std::forward(args)...); + default: + break; + } #endif -int distance(std::array const& a, std::array const& b) { + return Fn::template call(std::forward(args)...); +} + +int distance_default(std::array const& a, + std::array const& b) { return distance(a, b); } +#ifdef DWARFS_USE_POPCNT +__attribute__((__target__("popcnt"))) int +distance_popcnt(std::array const& a, + std::array const& b) { + return distance(a, b); +} +#endif + +struct distance_cpu { + template + static int + call(std::array const& a, std::array const& b) { +#ifdef DWARFS_USE_POPCNT + if constexpr (CpuFeature == cpu_feature::popcnt) { + return distance_popcnt(a, b); + } +#endif + return distance_default(a, b); + } +}; + +int distance(std::array const& a, + std::array const& b) { + return cpu_dispatch(a, b); +} void nilsimsa_distance(::benchmark::State& state) { std::independent_bits_engine::digits, uint64_t> @@ -74,6 +130,38 @@ void nilsimsa_distance(::benchmark::State& state) { } } +#ifdef DWARFS_USE_POPCNT +void nilsimsa_distance_cpu(::benchmark::State& state) { + std::independent_bits_engine::digits, uint64_t> + rng; + static constexpr unsigned const kNumData{1024}; + std::vector> data(kNumData); + for (auto& a : data) { + std::generate(begin(a), end(a), std::ref(rng)); + } + unsigned i{0}, k{1}; + int d; + + switch (detect_cpu_feature()) { + case cpu_feature::popcnt: + for (auto _ : state) { + ::benchmark::DoNotOptimize( + d = distance_cpu::template call( + data[i++ % kNumData], data[k++ % kNumData])); + } + break; + default: + for (auto _ : state) { + ::benchmark::DoNotOptimize( + d = distance_cpu::template call( + data[i++ % kNumData], data[k++ % kNumData])); + } + break; + } +} +#endif + void nilsimsa_update(::benchmark::State& state) { std::independent_bits_engine::digits, uint16_t> @@ -92,6 +180,9 @@ void nilsimsa_update(::benchmark::State& state) { } // namespace BENCHMARK(nilsimsa_distance); +#ifdef DWARFS_USE_POPCNT +BENCHMARK(nilsimsa_distance_cpu); +#endif BENCHMARK(nilsimsa_update); BENCHMARK_MAIN();