refactor: replace multiversioning

This commit is contained in:
Marcus Holland-Moritz 2025-03-21 22:58:03 +01:00
parent 514ca07e64
commit cac5b778e8
5 changed files with 194 additions and 39 deletions

View File

@ -62,7 +62,7 @@ if(NOT WIN32)
if(LDD_EXE) if(LDD_EXE)
execute_process(COMMAND ${LDD_EXE} --version ERROR_VARIABLE LDD_VERSION) execute_process(COMMAND ${LDD_EXE} --version ERROR_VARIABLE LDD_VERSION)
if(LDD_VERSION MATCHES "musl libc") if(LDD_VERSION MATCHES "musl libc")
add_compile_definitions(DWARFS_MUSL=1 _LARGEFILE64_SOURCE) add_compile_definitions(_LARGEFILE64_SOURCE)
endif() endif()
endif() endif()

View File

@ -33,9 +33,10 @@
#endif #endif
#endif #endif
#if !defined(DWARFS_SANITIZE_THREAD) && !defined(DWARFS_MUSL) && \ #if defined(__has_builtin)
defined(__x86_64__) && __has_attribute(target_clones) #if __has_builtin(__builtin_cpu_supports) && __has_attribute(target)
#define DWARFS_MULTIVERSIONING 1 #define DWARFS_USE_CPU_FEATURES 1
#endif
#endif #endif
#if defined(__GNUC__) || defined(__clang__) #if defined(__GNUC__) || defined(__clang__)

View File

@ -140,9 +140,6 @@ class nilsimsa::impl {
size_ += size; size_ += size;
} }
#ifdef DWARFS_MULTIVERSIONING
// __attribute__((target_clones("arch=tigerlake", "default")))
#endif
void update_fast(uint8_t const* data, size_t size) { void update_fast(uint8_t const* data, size_t size) {
uint8_t w1 = w_[0]; uint8_t w1 = w_[0];
uint8_t w2 = w_[1]; uint8_t w2 = w_[1];

View File

@ -20,6 +20,7 @@
*/ */
#include <algorithm> #include <algorithm>
#include <bit>
#include <limits> #include <limits>
#include <mutex> #include <mutex>
#include <numeric> #include <numeric>
@ -78,46 +79,111 @@ template <typename T, size_t N>
int distance(std::array<T, N> const& a, std::array<T, N> const& b) { int distance(std::array<T, N> const& a, std::array<T, N> const& b) {
int d = 0; int d = 0;
for (size_t i = 0; i < N; ++i) { for (size_t i = 0; i < N; ++i) {
d += folly::popcount(a[i] ^ b[i]); d += std::popcount(a[i] ^ b[i]);
} }
return d; return d;
} }
#ifdef DWARFS_MULTIVERSIONING #if defined(DWARFS_USE_CPU_FEATURES) && defined(__x86_64__)
#ifdef __clang__ #define DWARFS_USE_POPCNT
__attribute__((target_clones("avx512vpopcntdq", "popcnt", "default"))) #endif
// Instruction-set capabilities that the dispatch helpers can select an
// implementation for.
enum class cpu_feature {
  // No special instruction set; selects the generic code path.
  none = 0,
  // The `popcnt` feature as reported by the CPU feature probe.
  popcnt = 1,
};
// Reports which cpu_feature the dispatch code should use.
//
// When DWARFS_USE_POPCNT is defined, the CPU is probed with
// __builtin_cpu_supports("popcnt") and the answer is kept in a function-local
// static, so the probe itself runs only on the first call. Without
// DWARFS_USE_POPCNT the result is always cpu_feature::none.
cpu_feature detect_cpu_feature() {
#ifdef DWARFS_USE_POPCNT
static cpu_feature const feature = [] {
if (__builtin_cpu_supports("popcnt")) {
return cpu_feature::popcnt;
}
return cpu_feature::none;
}();
return feature;
#else
__attribute__((target_clones("popcnt", "default"))) return cpu_feature::none;
#endif
}
// Invokes Fn::call<Feature>(args...) with Feature chosen at runtime.
//
// Fn must provide a static member template `call` that takes a cpu_feature as
// its first template argument. When DWARFS_USE_POPCNT is defined and
// detect_cpu_feature() reports cpu_feature::popcnt, the popcnt instantiation is
// called; in every other case the cpu_feature::none instantiation is called.
// Arguments are perfectly forwarded and the callee's result is returned as-is
// (decltype(auto)).
template <typename Fn, typename... Args>
decltype(auto) cpu_dispatch(Args&&... args) {
#ifdef DWARFS_USE_POPCNT
auto feature = detect_cpu_feature();
switch (feature) {
case cpu_feature::popcnt:
return Fn::template call<cpu_feature::popcnt>(std::forward<Args>(args)...);
default:
break;
}
#endif #endif
int distance(std::array<uint64_t, 4> const& a, std::array<uint64_t, 4> const& b) { return Fn::template call<cpu_feature::none>(std::forward<Args>(args)...);
}
// Generic (no special CPU feature) distance between two 4 x 64-bit arrays.
// Simply forwards to the distance<uint64_t, 4> template.
int distance_default(std::array<uint64_t, 4> const& a,
std::array<uint64_t, 4> const& b) {
return distance<uint64_t, 4>(a, b); return distance<uint64_t, 4>(a, b);
} }
#ifdef DWARFS_USE_POPCNT
// Variant of the 4 x 64-bit distance that is compiled with the "popcnt"
// target attribute. It evaluates the very same distance<uint64_t, 4>
// template as the generic variant; only the code generation target differs.
// Must only be called after the CPU has been confirmed to support popcnt.
__attribute__((__target__("popcnt"))) int
distance_popcnt(std::array<uint64_t, 4> const& a,
                std::array<uint64_t, 4> const& b) {
  int const bits = distance<uint64_t, 4>(a, b);
  return bits;
}
#endif
// Dispatch target (for cpu_dispatch) that maps a cpu_feature onto the matching
// distance implementation.
struct distance_cpu {
  // Returns the distance between `a` and `b` using the implementation selected
  // by `CpuFeature`: distance_popcnt for cpu_feature::popcnt (only when
  // DWARFS_USE_POPCNT is defined), distance_default for everything else.
  template <cpu_feature CpuFeature>
  static int
  call(std::array<uint64_t, 4> const& a, std::array<uint64_t, 4> const& b) {
#ifdef DWARFS_USE_POPCNT
    if constexpr (CpuFeature == cpu_feature::popcnt) {
      return distance_popcnt(a, b);
    } else {
      return distance_default(a, b);
    }
#else
    return distance_default(a, b);
#endif
  }
};
// Dispatch target (for cpu_dispatch) implementing a greedy "nearest successor"
// ordering, with the distance function fixed at compile time via CpuFeature.
struct order_by_shortest_path_cpu {
  // For every position i, finds the element k in (i, count) with the smallest
  // distance to element i and, unless it already sits at i + 1, asks `swapper`
  // to act on positions (i + 1, k).
  //
  // count    number of elements; 0 and 1 are no-ops.
  // geti     geti(i) must return something dereferenceable to a
  //          std::array<uint64_t, 4> for the current anchor element.
  // getk     getk(k) must return the same kind of object for a candidate.
  // swapper  called as swapper(i + 1, best_index); presumably exchanges the
  //          two elements -- confirm against the caller.
  template <cpu_feature CpuFeature, typename GetI, typename GetK, typename Swap>
  static void
  call(size_t count, GetI const& geti, GetK const& getk, Swap const& swapper) {
    // `i + 1 < count` instead of `i < count - 1`: count is unsigned, so
    // `count - 1` wraps around to SIZE_MAX for count == 0 and the loop would
    // call geti(0) on an empty input.
    for (size_t i = 0; i + 1 < count; ++i) {
      auto bi = geti(i);
      int best_distance = std::numeric_limits<int>::max();
      // 0 doubles as "nothing found": candidates start at i + 1 >= 1.
      size_t best_index = 0;

      for (size_t k = i + 1; k < count; ++k) {
        auto bk = getk(k);
        auto d = distance_cpu::template call<CpuFeature>(*bi, *bk);
        if (d < best_distance) {
          best_distance = d;
          best_index = k;
          // A distance of 0 or 1 is accepted immediately; stop scanning.
          if (best_distance <= 1) {
            break;
          }
        }
      }

      if (best_index > 0 && i + 1 != best_index) {
        swapper(i + 1, best_index);
      }
    }
  }
};
// Distance between two 4 x 64-bit arrays, routed through cpu_dispatch so the
// implementation matching the detected CPU feature is used.
int distance(std::array<uint64_t, 4> const& a,
             std::array<uint64_t, 4> const& b) {
  int const result = cpu_dispatch<distance_cpu>(a, b);
  return result;
}
// Orders `count` elements so that each element is followed by a close
// neighbour.
//
// The loop shown here walks i over the elements, scans every k after i for the
// smallest distance(*geti(i), *getk(k)), stops scanning once a distance <= 1 is
// found, and calls swapper(i + 1, best_index) when the best candidate is not
// already at position i + 1. The same span also shows the call that forwards
// all four arguments to cpu_dispatch<order_by_shortest_path_cpu>.
template <typename GetI, typename GetK, typename Swap> template <typename GetI, typename GetK, typename Swap>
void order_by_shortest_path(size_t count, GetI const& geti, GetK const& getk, void order_by_shortest_path(size_t count, GetI const& geti, GetK const& getk,
Swap const& swapper) { Swap const& swapper) {
for (size_t i = 0; i < count - 1; ++i) { cpu_dispatch<order_by_shortest_path_cpu>(count, geti, getk, swapper);
auto bi = geti(i);
int best_distance = std::numeric_limits<int>::max();
size_t best_index = 0;
for (size_t k = i + 1; k < count; ++k) {
auto bk = getk(k);
auto d = distance(*bi, *bk);
if (d < best_distance) {
best_distance = d;
best_index = k;
if (best_distance <= 1) {
break;
}
}
}
if (best_index > 0 && i + 1 != best_index) {
swapper(i + 1, best_index);
}
}
} }
template <size_t Bits, typename BitsType = uint64_t, template <size_t Bits, typename BitsType = uint64_t,

View File

@ -45,17 +45,73 @@ int distance(std::array<T, N> const& a, std::array<T, N> const& b) {
return d; return d;
} }
#ifdef DWARFS_MULTIVERSIONING #if defined(DWARFS_USE_CPU_FEATURES) && defined(__x86_64__)
#ifdef __clang__ #define DWARFS_USE_POPCNT
__attribute__((target_clones("avx512vpopcntdq", "popcnt", "default"))) #endif
// Instruction-set capabilities the benchmark can select a distance
// implementation for.
enum class cpu_feature {
  // No special instruction set; selects the generic code path.
  none = 0,
  // The `popcnt` feature as reported by the CPU feature probe.
  popcnt = 1,
};
// Reports which cpu_feature should be used on the running CPU.
//
// With DWARFS_USE_POPCNT defined, __builtin_cpu_supports("popcnt") is queried
// once and the result is cached in a function-local static; otherwise the
// function always returns cpu_feature::none.
cpu_feature detect_cpu_feature() {
#ifdef DWARFS_USE_POPCNT
static cpu_feature const feature = [] {
if (__builtin_cpu_supports("popcnt")) {
return cpu_feature::popcnt;
}
return cpu_feature::none;
}();
return feature;
#else
__attribute__((target_clones("popcnt", "default"))) return cpu_feature::none;
#endif
}
// Calls Fn::call<Feature>(args...) where Feature is picked at runtime:
// cpu_feature::popcnt when DWARFS_USE_POPCNT is defined and
// detect_cpu_feature() reports it, cpu_feature::none otherwise. Arguments are
// perfectly forwarded; the callee's result is returned unchanged.
template <typename Fn, typename... Args>
decltype(auto) cpu_dispatch(Args&&... args) {
#ifdef DWARFS_USE_POPCNT
auto feature = detect_cpu_feature();
switch (feature) {
case cpu_feature::popcnt:
return Fn::template call<cpu_feature::popcnt>(std::forward<Args>(args)...);
default:
break;
}
#endif #endif
int distance(std::array<uint64_t, 4> const& a, std::array<uint64_t, 4> const& b) { return Fn::template call<cpu_feature::none>(std::forward<Args>(args)...);
}
// Generic distance between two 4 x 64-bit arrays; forwards to the
// distance<uint64_t, 4> template without requesting any CPU feature.
int distance_default(std::array<uint64_t, 4> const& a,
std::array<uint64_t, 4> const& b) {
return distance<uint64_t, 4>(a, b); return distance<uint64_t, 4>(a, b);
} }
#ifdef DWARFS_USE_POPCNT
// Same computation as the generic variant (distance<uint64_t, 4>), but this
// function carries the "popcnt" target attribute. Call it only once the CPU
// has been confirmed to support popcnt.
__attribute__((__target__("popcnt"))) int
distance_popcnt(std::array<uint64_t, 4> const& a,
                std::array<uint64_t, 4> const& b) {
  int const bits = distance<uint64_t, 4>(a, b);
  return bits;
}
#endif
// Maps a compile-time cpu_feature onto the matching distance implementation.
struct distance_cpu {
  // Uses distance_popcnt when `CpuFeature` is cpu_feature::popcnt and
  // DWARFS_USE_POPCNT is defined; falls back to distance_default otherwise.
  template <cpu_feature CpuFeature>
  static int
  call(std::array<uint64_t, 4> const& a, std::array<uint64_t, 4> const& b) {
#ifdef DWARFS_USE_POPCNT
    if constexpr (CpuFeature == cpu_feature::popcnt) {
      return distance_popcnt(a, b);
    } else {
      return distance_default(a, b);
    }
#else
    return distance_default(a, b);
#endif
  }
};
// Distance between two 4 x 64-bit arrays; the implementation is chosen per
// call by cpu_dispatch based on the detected CPU feature.
int distance(std::array<uint64_t, 4> const& a,
             std::array<uint64_t, 4> const& b) {
  int const result = cpu_dispatch<distance_cpu>(a, b);
  return result;
}
void nilsimsa_distance(::benchmark::State& state) { void nilsimsa_distance(::benchmark::State& state) {
std::independent_bits_engine<std::mt19937_64, std::independent_bits_engine<std::mt19937_64,
std::numeric_limits<uint64_t>::digits, uint64_t> std::numeric_limits<uint64_t>::digits, uint64_t>
@ -74,6 +130,38 @@ void nilsimsa_distance(::benchmark::State& state) {
} }
} }
#ifdef DWARFS_USE_POPCNT
// Benchmarks distance_cpu::call with the CPU feature resolved once, outside
// the timed loop, so the loop body contains no per-call feature lookup.
void nilsimsa_distance_cpu(::benchmark::State& state) {
  static constexpr unsigned kNumData = 1024;

  std::independent_bits_engine<std::mt19937_64,
                               std::numeric_limits<uint64_t>::digits, uint64_t>
      rng;

  // Fill every array with values drawn from the default-seeded engine.
  std::vector<std::array<uint64_t, 4>> data(kNumData);
  for (auto& entry : data) {
    std::generate(entry.begin(), entry.end(), std::ref(rng));
  }

  // Two cursors, one element apart, cycling through the data set.
  unsigned i = 0;
  unsigned k = 1;
  int d;

  if (detect_cpu_feature() == cpu_feature::popcnt) {
    for (auto _ : state) {
      ::benchmark::DoNotOptimize(
          d = distance_cpu::call<cpu_feature::popcnt>(data[i++ % kNumData],
                                                      data[k++ % kNumData]));
    }
  } else {
    for (auto _ : state) {
      ::benchmark::DoNotOptimize(
          d = distance_cpu::call<cpu_feature::none>(data[i++ % kNumData],
                                                    data[k++ % kNumData]));
    }
  }
}
#endif
void nilsimsa_update(::benchmark::State& state) { void nilsimsa_update(::benchmark::State& state) {
std::independent_bits_engine<std::mt19937_64, std::independent_bits_engine<std::mt19937_64,
std::numeric_limits<uint8_t>::digits, uint16_t> std::numeric_limits<uint8_t>::digits, uint16_t>
@ -92,6 +180,9 @@ void nilsimsa_update(::benchmark::State& state) {
} // namespace } // namespace
BENCHMARK(nilsimsa_distance); BENCHMARK(nilsimsa_distance);
#ifdef DWARFS_USE_POPCNT
BENCHMARK(nilsimsa_distance_cpu);
#endif
BENCHMARK(nilsimsa_update); BENCHMARK(nilsimsa_update);
BENCHMARK_MAIN(); BENCHMARK_MAIN();