diff --git a/ricepp/CMakeLists.txt b/ricepp/CMakeLists.txt index 659d02e4..305c41c4 100644 --- a/ricepp/CMakeLists.txt +++ b/ricepp/CMakeLists.txt @@ -39,11 +39,45 @@ if(WIN32) add_compile_definitions(_WIN32_WINNT=0x0601 WINVER=0x0601) endif() -add_library(ricepp ricepp.cpp) +add_library(ricepp_fallback OBJECT ricepp_cpuspecific.cpp) +target_compile_definitions(ricepp_fallback PRIVATE RICEPP_CPU_VARIANT=fallback) +list(APPEND RICEPP_LIBS_CPUSPECIFIC ricepp_fallback) -target_include_directories(ricepp PUBLIC include) +if(NOT (WIN32 OR CMAKE_CXX_FLAGS MATCHES "-march=")) + CHECK_CXX_COMPILER_FLAG(-mbmi2 COMPILER_SUPPORTS_MBMI2) + CHECK_CXX_COMPILER_FLAG(-mavx512vl COMPILER_SUPPORTS_MAVX512VL) + CHECK_CXX_COMPILER_FLAG(-mavx512vbmi COMPILER_SUPPORTS_MAVX512VBMI) + + if(COMPILER_SUPPORTS_MBMI2) + add_library(ricepp_bmi2 OBJECT ricepp_cpuspecific.cpp) + target_compile_options(ricepp_bmi2 PRIVATE -mbmi2) + target_compile_definitions(ricepp_bmi2 PRIVATE RICEPP_CPU_VARIANT=has_bmi2) + list(APPEND RICEPP_LIBS_CPUSPECIFIC ricepp_bmi2) + list(APPEND RICEPP_CPU_SUPPORT RICEPP_CPU_BMI2) + + if(COMPILER_SUPPORTS_MAVX512VL AND COMPILER_SUPPORTS_MAVX512VBMI) + add_library(ricepp_bmi2_avx512 OBJECT ricepp_cpuspecific.cpp) + target_compile_options(ricepp_bmi2_avx512 PRIVATE -mbmi2 -mavx512vl -mavx512vbmi) + target_compile_definitions(ricepp_bmi2_avx512 PRIVATE RICEPP_CPU_VARIANT=has_bmi2_avx512) + list(APPEND RICEPP_LIBS_CPUSPECIFIC ricepp_bmi2_avx512) + list(APPEND RICEPP_CPU_SUPPORT RICEPP_CPU_BMI2_AVX512) + endif() + endif() +endif() + +foreach(target ${RICEPP_LIBS_CPUSPECIFIC}) + message(STATUS "[ricepp] adding CPU target: ${target}") + target_include_directories(${target} PUBLIC include) + target_link_libraries(${target} PUBLIC range-v3) + target_compile_features(${target} PUBLIC cxx_std_20) + list(APPEND RICEPP_OBJECTS_CPUSPECIFIC $) +endforeach() + +add_library(ricepp ricepp.cpp ${RICEPP_OBJECTS_CPUSPECIFIC}) target_link_libraries(ricepp PUBLIC range-v3) +target_include_directories(ricepp PUBLIC include) target_compile_features(ricepp PUBLIC cxx_std_20) +target_compile_definitions(ricepp PRIVATE ${RICEPP_CPU_SUPPORT}) # # TODO: remove/rework # add_executable(ricepp_demo ricepp_demo.cpp) diff --git a/ricepp/ricepp.cpp b/ricepp/ricepp.cpp index 293e50ac..d19a0370 100644 --- a/ricepp/ricepp.cpp +++ b/ricepp/ricepp.cpp @@ -19,251 +19,60 @@ * along with ricepp. If not, see . */ -#include #include +#include +#include +#include +#include -#include -#include -#include -#include #include +#include "ricepp_cpuspecific.h" + namespace ricepp { namespace { -template -class dynamic_pixel_traits { - public: - using value_type = ValueType; - static constexpr size_t const kBitCount = - std::numeric_limits::digits; - static constexpr value_type const kAllOnes = - std::numeric_limits::max(); +detail::cpu_variant get_cpu_variant_init() { +#ifndef _WIN32 +#if defined(__has_builtin) +#if __has_builtin(__builtin_cpu_supports) + __builtin_cpu_init(); - dynamic_pixel_traits(std::endian byteorder, - unsigned unused_lsb_count) noexcept - : unused_lsb_count_{unused_lsb_count} - , byteorder_{byteorder} -#ifndef NDEBUG - , lsb_mask_{static_cast(~(kAllOnes << unused_lsb_count))} - , msb_mask_{static_cast(~(kAllOnes >> unused_lsb_count))} + bool const has_avx512vl = __builtin_cpu_supports("avx512vl"); + bool const has_avx512vbmi = __builtin_cpu_supports("avx512vbmi"); + bool const has_bmi2 = __builtin_cpu_supports("bmi2"); + + if (has_avx512vl && has_avx512vbmi && has_bmi2) { + return detail::cpu_variant::has_bmi2_avx512; + } + + if (has_bmi2) { + return detail::cpu_variant::has_bmi2; + } +#endif #endif - { - assert(unused_lsb_count < kBitCount); - } - - [[nodiscard]] value_type read(value_type value) const noexcept { - value_type tmp = byteswap(value, byteorder_); - assert((tmp & lsb_mask_) == 0); - return tmp >> unused_lsb_count_; - } - - [[nodiscard]] value_type write(value_type value) const noexcept { - assert((value & msb_mask_) == 0); - return byteswap(static_cast(value << unused_lsb_count_), - byteorder_); - } - - private: - unsigned const unused_lsb_count_; - std::endian const byteorder_; -#ifndef NDEBUG - value_type const lsb_mask_; - value_type const msb_mask_; #endif -}; -template -class static_pixel_traits { - public: - using value_type = ValueType; - static constexpr size_t const kBitCount = - std::numeric_limits::digits; - static constexpr value_type const kAllOnes = - std::numeric_limits::max(); - static constexpr std::endian const kByteOrder = ByteOrder; - static constexpr unsigned const kUnusedLsbCount = UnusedLsbCount; - static constexpr value_type const kLsbMask = - static_cast(~(kAllOnes << kUnusedLsbCount)); - static constexpr value_type const kMsbMask = - static_cast(~(kAllOnes >> kUnusedLsbCount)); - static_assert(kUnusedLsbCount < kBitCount); - - [[nodiscard]] static value_type read(value_type value) noexcept { - value_type tmp = byteswap(value); - assert((tmp & kLsbMask) == 0); - return tmp >> kUnusedLsbCount; - } - - [[nodiscard]] static value_type write(value_type value) noexcept { - assert((value & kMsbMask) == 0); - return byteswap( - static_cast(value << kUnusedLsbCount)); - } -}; - -template -class codec_impl final - : public codec_interface, - public PixelTraits { - public: - using pixel_type = typename PixelTraits::value_type; - using codec_type = codec; - - codec_impl(PixelTraits const& traits, size_t block_size) - : PixelTraits{traits} - , block_size_{block_size} {} - - std::vector - encode(std::span input) const override { - return encode_impl(input.data(), input.size()); - } - - size_t worst_case_encoded_bytes(size_t pixel_count) const override { - codec_type codec{block_size_, *this}; - return worst_case_encoded_bytes_impl(codec, pixel_count); - } - - size_t - worst_case_encoded_bytes(std::span input) const override { - return worst_case_encoded_bytes(input.size()); - } - - std::span encode(std::span output, - std::span input) const override { - return encode_impl(output.data(), output.size(), input.data(), - input.size()); - } - - void decode(std::span output, - std::span input) const override { - decode_impl(output.data(), output.size(), input.data(), input.size()); - } - - private: - size_t worst_case_encoded_bytes_impl(codec_type& codec, size_t size) const { - return (codec.worst_case_bit_count(size) + 8 - 1) / 8; - } - - std::vector - encode_impl(pixel_type const* __restrict input, size_t size) const { - return encode_impl(std::span{input, size}); - } - - std::span - encode_impl(uint8_t* __restrict output, size_t output_size, - pixel_type const* __restrict input, size_t input_size) const { - return encode_impl(std::span{output, output_size}, - std::span{input, input_size}); - } - - void decode_impl(pixel_type* __restrict output, size_t output_size, - uint8_t const* __restrict input, size_t input_size) const { - return decode_impl(std::span{output, output_size}, - std::span{input, input_size}); - } - - std::vector encode_impl(std::span input) const { - std::vector output; - codec_type codec{block_size_, *this}; - output.resize(worst_case_encoded_bytes_impl(codec, input.size())); - bitstream_writer writer{output.begin()}; - codec.encode(input, writer); - output.resize(std::distance(output.begin(), writer.iterator())); - return output; - } - - std::span encode_impl(std::span output, - std::span input) const { - codec_type codec{block_size_, *this}; - assert(output.size() >= worst_case_encoded_bytes_impl(codec, input.size())); - bitstream_writer writer{output.begin()}; - codec.encode(input, writer); - return std::span{output.begin(), writer.iterator()}; - } - - void decode_impl(std::span output, - std::span input) const { - bitstream_reader reader{input.begin(), input.end()}; - codec_type codec{block_size_, *this}; - codec.decode(output, reader); - } - - private: - size_t const block_size_; -}; - -template -std::unique_ptr> -create_codec_(size_t block_size, PixelTraits const& traits) { - if (block_size <= 512) { - return std::make_unique>( - traits, block_size); - } - - return nullptr; + return detail::cpu_variant::fallback; } -template -std::unique_ptr> -create_codec_(size_t block_size, size_t component_stream_count, - PixelTraits const& traits) { - switch (component_stream_count) { - case 1: - return create_codec_<1, PixelTraits>(block_size, traits); - - case 2: - return create_codec_<2, PixelTraits>(block_size, traits); - - default: - break; - } - - return nullptr; +detail::cpu_variant get_cpu_variant() { + static detail::cpu_variant const variant = get_cpu_variant_init(); + return variant; } -template -std::unique_ptr> -create_codec_(size_t block_size, size_t component_stream_count) { - using pixel_traits = - static_pixel_traits; - - if (auto codec = create_codec_( - block_size, component_stream_count, pixel_traits{})) { - return codec; +void show_cpu_variant(std::string_view variant) { + if (std::getenv("RICEPP_SHOW_CPU_VARIANT")) { + std::cerr << "ricepp: using " << variant << " CPU variant\n"; } - - return nullptr; } -template -std::unique_ptr> -create_codec_(codec_config const& config) { - if (config.byteorder == std::endian::big) { - switch (config.unused_lsb_count) { - case 0: - return create_codec_( - config.block_size, config.component_stream_count); - - case 2: - return create_codec_( - config.block_size, config.component_stream_count); - - case 4: - return create_codec_( - config.block_size, config.component_stream_count); - } - } - - using pixel_traits = dynamic_pixel_traits; - - return create_codec_( - config.block_size, config.component_stream_count, - pixel_traits{config.byteorder, config.unused_lsb_count}); +void show_cpu_variant_once(std::string_view variant) { + static auto const _ = [&variant]() { + show_cpu_variant(variant); + return true; + }(); } } // namespace @@ -271,11 +80,30 @@ create_codec_(codec_config const& config) { template <> std::unique_ptr> create_codec(codec_config const& config) { - if (auto codec = create_codec_(config)) { - return codec; + switch (get_cpu_variant()) { +#ifdef RICEPP_CPU_BMI2_AVX512 + case detail::cpu_variant::has_bmi2_avx512: + show_cpu_variant_once("BMI2+AVX512"); + return detail::create_codec_cpuspecific_< + uint16_t, detail::cpu_variant::has_bmi2_avx512>(config); +#endif + +#ifdef RICEPP_CPU_BMI2 + case detail::cpu_variant::has_bmi2: + show_cpu_variant_once("BMI2"); + return detail::create_codec_cpuspecific_( + config); +#endif + + default: + show_cpu_variant_once("fallback"); + return detail::create_codec_cpuspecific_( + config); } - throw std::runtime_error("Unsupported configuration"); + throw std::runtime_error("internal error: unknown CPU variant"); } } // namespace ricepp diff --git a/ricepp/ricepp_cpuspecific.cpp b/ricepp/ricepp_cpuspecific.cpp new file mode 100644 index 00000000..d4a73a9d --- /dev/null +++ b/ricepp/ricepp_cpuspecific.cpp @@ -0,0 +1,289 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of ricepp. + * + * ricepp is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ricepp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with ricepp. If not, see . + */ + +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include "ricepp_cpuspecific.h" + +namespace ricepp { + +namespace { + +template +class dynamic_pixel_traits { + public: + using value_type = ValueType; + static constexpr size_t const kBitCount = + std::numeric_limits::digits; + static constexpr value_type const kAllOnes = + std::numeric_limits::max(); + + dynamic_pixel_traits(std::endian byteorder, + unsigned unused_lsb_count) noexcept + : unused_lsb_count_{unused_lsb_count} + , byteorder_{byteorder} +#ifndef NDEBUG + , lsb_mask_{static_cast(~(kAllOnes << unused_lsb_count))} + , msb_mask_{static_cast(~(kAllOnes >> unused_lsb_count))} +#endif + { + assert(unused_lsb_count < kBitCount); + } + + [[nodiscard]] value_type read(value_type value) const noexcept { + value_type tmp = byteswap(value, byteorder_); + assert((tmp & lsb_mask_) == 0); + return tmp >> unused_lsb_count_; + } + + [[nodiscard]] value_type write(value_type value) const noexcept { + assert((value & msb_mask_) == 0); + return byteswap(static_cast(value << unused_lsb_count_), + byteorder_); + } + + private: + unsigned const unused_lsb_count_; + std::endian const byteorder_; +#ifndef NDEBUG + value_type const lsb_mask_; + value_type const msb_mask_; +#endif +}; + +template +class static_pixel_traits { + public: + using value_type = ValueType; + static constexpr size_t const kBitCount = + std::numeric_limits::digits; + static constexpr value_type const kAllOnes = + std::numeric_limits::max(); + static constexpr std::endian const kByteOrder = ByteOrder; + static constexpr unsigned const kUnusedLsbCount = UnusedLsbCount; + static constexpr value_type const kLsbMask = + static_cast(~(kAllOnes << kUnusedLsbCount)); + static constexpr value_type const kMsbMask = + static_cast(~(kAllOnes >> kUnusedLsbCount)); + static_assert(kUnusedLsbCount < kBitCount); + + [[nodiscard]] static value_type read(value_type value) noexcept { + value_type tmp = byteswap(value); + assert((tmp & kLsbMask) == 0); + return tmp >> kUnusedLsbCount; + } + + [[nodiscard]] static value_type write(value_type value) noexcept { + assert((value & kMsbMask) == 0); + return byteswap( + static_cast(value << kUnusedLsbCount)); + } +}; + +template +class codec_impl final + : public codec_interface, + public PixelTraits { + public: + using pixel_type = typename PixelTraits::value_type; + using codec_type = codec; + + codec_impl(PixelTraits const& traits, size_t block_size) + : PixelTraits{traits} + , block_size_{block_size} {} + + std::vector + encode(std::span input) const override { + return encode_impl(input.data(), input.size()); + } + + size_t worst_case_encoded_bytes(size_t pixel_count) const override { + codec_type codec{block_size_, *this}; + return worst_case_encoded_bytes_impl(codec, pixel_count); + } + + size_t + worst_case_encoded_bytes(std::span input) const override { + return worst_case_encoded_bytes(input.size()); + } + + std::span encode(std::span output, + std::span input) const override { + return encode_impl(output.data(), output.size(), input.data(), + input.size()); + } + + void decode(std::span output, + std::span input) const override { + decode_impl(output.data(), output.size(), input.data(), input.size()); + } + + private: + size_t worst_case_encoded_bytes_impl(codec_type& codec, size_t size) const { + return (codec.worst_case_bit_count(size) + 8 - 1) / 8; + } + + std::vector + encode_impl(pixel_type const* __restrict input, size_t size) const { + return encode_impl(std::span{input, size}); + } + + std::span + encode_impl(uint8_t* __restrict output, size_t output_size, + pixel_type const* __restrict input, size_t input_size) const { + return encode_impl(std::span{output, output_size}, + std::span{input, input_size}); + } + + void decode_impl(pixel_type* __restrict output, size_t output_size, + uint8_t const* __restrict input, size_t input_size) const { + return decode_impl(std::span{output, output_size}, + std::span{input, input_size}); + } + + std::vector encode_impl(std::span input) const { + std::vector output; + codec_type codec{block_size_, *this}; + output.resize(worst_case_encoded_bytes_impl(codec, input.size())); + bitstream_writer writer{output.begin()}; + codec.encode(input, writer); + output.resize(std::distance(output.begin(), writer.iterator())); + return output; + } + + std::span encode_impl(std::span output, + std::span input) const { + codec_type codec{block_size_, *this}; + assert(output.size() >= worst_case_encoded_bytes_impl(codec, input.size())); + bitstream_writer writer{output.begin()}; + codec.encode(input, writer); + return std::span{output.begin(), writer.iterator()}; + } + + void decode_impl(std::span output, + std::span input) const { + bitstream_reader reader{input.begin(), input.end()}; + codec_type codec{block_size_, *this}; + codec.decode(output, reader); + } + + private: + size_t const block_size_; +}; + +template +std::unique_ptr> +create_codec_(size_t block_size, PixelTraits const& traits) { + if (block_size <= 512) { + return std::make_unique>( + traits, block_size); + } + + return nullptr; +} + +template +std::unique_ptr> +create_codec_(size_t block_size, size_t component_stream_count, + PixelTraits const& traits) { + switch (component_stream_count) { + case 1: + return create_codec_<1, PixelTraits>(block_size, traits); + + case 2: + return create_codec_<2, PixelTraits>(block_size, traits); + + default: + break; + } + + return nullptr; +} + +template +std::unique_ptr> +create_codec_(size_t block_size, size_t component_stream_count) { + using pixel_traits = + static_pixel_traits; + + if (auto codec = create_codec_( + block_size, component_stream_count, pixel_traits{})) { + return codec; + } + + return nullptr; +} + +template +std::unique_ptr> +create_codec_(codec_config const& config) { + if (config.byteorder == std::endian::big) { + switch (config.unused_lsb_count) { + case 0: + return create_codec_( + config.block_size, config.component_stream_count); + + case 2: + return create_codec_( + config.block_size, config.component_stream_count); + + case 4: + return create_codec_( + config.block_size, config.component_stream_count); + } + } + + using pixel_traits = dynamic_pixel_traits; + + return create_codec_( + config.block_size, config.component_stream_count, + pixel_traits{config.byteorder, config.unused_lsb_count}); +} + +} // namespace + +namespace detail { + +template <> +std::unique_ptr> +create_codec_cpuspecific_( + codec_config const& config) { + if (auto codec = create_codec_(config)) { + return codec; + } + + throw std::runtime_error("Unsupported configuration"); +} + +} // namespace detail +} // namespace ricepp diff --git a/ricepp/ricepp_cpuspecific.h b/ricepp/ricepp_cpuspecific.h new file mode 100644 index 00000000..6827c233 --- /dev/null +++ b/ricepp/ricepp_cpuspecific.h @@ -0,0 +1,46 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of ricepp. + * + * ricepp is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ricepp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with ricepp. If not, see . + */ + +#pragma once + +#include +#include + +#include + +namespace ricepp { + +struct codec_config; + +namespace detail { + +enum class cpu_variant { + fallback, + has_bmi2, + has_bmi2_avx512, +}; + +template +std::unique_ptr> +create_codec_cpuspecific_(codec_config const& config); + +} // namespace detail +} // namespace ricepp