chore(ricepp): more perf tweaks

This commit is contained in:
Marcus Holland-Moritz 2024-02-24 21:13:36 +01:00
parent 766b6632af
commit 0ca8349b8b
4 changed files with 53 additions and 43 deletions

View File

@ -54,7 +54,7 @@ class bitstream_reader final {
assert(num_bits <= std::numeric_limits<T>::digits);
T bits = 0;
uint16_t pos = 0;
if (num_bits > 0) {
if (num_bits > 0) [[likely]] {
for (;;) {
size_t const remain = kBitsTypeBits - bit_pos_;
if (num_bits <= remain) {
@ -91,11 +91,11 @@ class bitstream_reader final {
if (bits != bits_type{}) [[likely]] {
size_t const ffs = std::countr_zero(bits);
assert(ffs < kBitsTypeBits);
if (ffs + 1 == kBitsTypeBits) [[unlikely]] {
bit_pos_ = 0;
} else {
if (ffs + 1 != kBitsTypeBits) {
data_ = bits;
bit_pos_ = ffs + 1;
} else {
bit_pos_ = 0;
}
return zeros + ffs;
}
@ -112,8 +112,7 @@ class bitstream_reader final {
RICEPP_FORCE_INLINE void skip_bits(size_t num_bits) {
assert(bit_pos_ + num_bits <= kBitsTypeBits);
bit_pos_ += num_bits;
bit_pos_ &= kBitsTypeBits - 1;
bit_pos_ = (bit_pos_ + num_bits) & (kBitsTypeBits - 1);
}
RICEPP_FORCE_INLINE bool peek_bit() {
@ -123,16 +122,17 @@ class bitstream_reader final {
RICEPP_FORCE_INLINE bits_type peek_bits(size_t num_bits) {
assert(bit_pos_ + num_bits <= kBitsTypeBits);
if (bit_pos_ == 0) [[unlikely]] {
auto const bp = bit_pos_;
if (bp == 0) {
data_ = read_packet();
}
// The remainder of this function is equivalent to:
//
// return _bextr_u64(data_, bit_pos_, num_bits);
// return _bextr_u64(data_, bp, num_bits);
//
// However, in practice, at least clang generates code that is as fast
// as the intrinsic, so we use the following code for portability.
bits_type bits = data_ >> bit_pos_;
bits_type bits = data_ >> bp;
if (num_bits < kBitsTypeBits) [[likely]] {
bits &= (static_cast<bits_type>(1) << num_bits) - 1;
}

View File

@ -76,7 +76,7 @@ class bitstream_writer final {
write_packet(bits);
repeat -= kBitsTypeBits;
}
if (repeat > 0) {
if (repeat > 0) [[likely]] {
write_bits_impl(bits, repeat);
}
}
@ -86,11 +86,17 @@ class bitstream_writer final {
static constexpr size_t kArgBits{std::numeric_limits<T>::digits};
assert(bit_pos_ < kBitsTypeBits);
assert(num_bits <= kArgBits);
while (num_bits > 0) {
size_t const bits_to_write = std::min(num_bits, kBitsTypeBits - bit_pos_);
write_bits_impl(bits, bits_to_write);
bits >>= bits_to_write;
num_bits -= bits_to_write;
if (num_bits > 0) [[likely]] {
for (;;) {
size_t const bits_to_write =
std::min(num_bits, kBitsTypeBits - bit_pos_);
write_bits_impl(bits, bits_to_write);
bits >>= bits_to_write;
if (num_bits == bits_to_write) [[likely]] {
break;
}
num_bits -= bits_to_write;
}
}
}
@ -111,7 +117,7 @@ class bitstream_writer final {
private:
RICEPP_FORCE_INLINE void write_bits_impl(bits_type bits, size_t num_bits) {
assert(bit_pos_ + num_bits <= kBitsTypeBits);
if (num_bits < kBitsTypeBits) {
if (num_bits < kBitsTypeBits) [[likely]] {
bits &= (static_cast<bits_type>(1) << num_bits) - 1;
}
data_ |= bits << bit_pos_;

View File

@ -50,22 +50,26 @@ void decode_block(V block, BitstreamReader& reader, PixelTraits const& traits,
auto const fsp1 = reader.template read_bits<value_type>(kFsBits);
if (fsp1 == 0) [[unlikely]] {
std::fill(block.begin(), block.end(), traits.write(last));
} else if (fsp1 > kFsMax) [[unlikely]] {
for (auto& b : block) {
b = reader.template read_bits<value_type>(kPixelBits);
if (fsp1 > 0) {
if (fsp1 <= kFsMax) {
auto const fs = fsp1 - 1;
for (auto& b : block) {
value_type diff = reader.find_first_set() << fs;
diff |= reader.template read_bits<value_type>(fs);
last += static_cast<std::make_signed_t<value_type>>(
((diff & 1) * value_type(-1)) ^ (diff >> 1));
// last += static_cast<std::make_signed_t<value_type>>(
// (diff & 1) ? ~(diff >> 1) : (diff >> 1));
b = traits.write(last);
}
} else {
for (auto& b : block) {
b = reader.template read_bits<value_type>(kPixelBits);
}
last = traits.read(block.back());
}
last = traits.read(block.back());
} else {
auto const fs = fsp1 - 1;
for (auto& b : block) {
value_type diff = reader.find_first_set() << fs;
diff |= reader.template read_bits<value_type>(fs);
last += static_cast<std::make_signed_t<value_type>>(
(diff & 1) ? ~(diff >> 1) : (diff >> 1));
b = traits.write(last);
}
std::fill(block.begin(), block.end(), traits.write(last));
}
last_value = last;

View File

@ -117,23 +117,12 @@ void encode_block(V block, BitstreamWriter& writer, PixelTraits const& traits,
last_value = last;
if (sum == 0) [[unlikely]] {
// All differences are zero, so just write a zero fs and we're done.
writer.write_bits(0U, kFsBits);
} else {
if (sum > 0) [[likely]] {
// Find the best bit position to split the difference values.
auto const [fs, bits_used] =
compute_best_split<kFsMax>(delta, block.size(), sum);
if (fs >= kFsMax || bits_used >= kPixelBits * block.size()) [[unlikely]] {
// Difference values are too large for entropy coding. Just plain copy
// the input pixel data. This is really unlikely, so reading the input
// pixels again is fine.
writer.write_bits(kFsMax + 1, kFsBits);
for (auto& b : block) {
writer.write_bits(b, kPixelBits);
}
} else {
if (fs < kFsMax && bits_used < kPixelBits * block.size()) [[likely]] {
// Encode the difference values using Rice entropy coding.
writer.write_bits(fs + 1, kFsBits);
for (size_t i = 0; i < block.size(); ++i) {
@ -145,7 +134,18 @@ void encode_block(V block, BitstreamWriter& writer, PixelTraits const& traits,
writer.write_bit(1);
writer.write_bits(diff, fs);
}
} else {
// Difference values are too large for entropy coding. Just plain copy
// the input pixel data. This is really unlikely, so reading the input
// pixels again is fine.
writer.write_bits(kFsMax + 1, kFsBits);
for (auto& b : block) {
writer.write_bits(b, kPixelBits);
}
}
} else {
// All differences are zero, so just write a zero fs and we're done.
writer.write_bits(0U, kFsBits);
}
}