mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-09 12:28:13 -04:00
chore(ricepp): more perf tweaks
This commit is contained in:
parent
766b6632af
commit
0ca8349b8b
@ -54,7 +54,7 @@ class bitstream_reader final {
|
||||
assert(num_bits <= std::numeric_limits<T>::digits);
|
||||
T bits = 0;
|
||||
uint16_t pos = 0;
|
||||
if (num_bits > 0) {
|
||||
if (num_bits > 0) [[likely]] {
|
||||
for (;;) {
|
||||
size_t const remain = kBitsTypeBits - bit_pos_;
|
||||
if (num_bits <= remain) {
|
||||
@ -91,11 +91,11 @@ class bitstream_reader final {
|
||||
if (bits != bits_type{}) [[likely]] {
|
||||
size_t const ffs = std::countr_zero(bits);
|
||||
assert(ffs < kBitsTypeBits);
|
||||
if (ffs + 1 == kBitsTypeBits) [[unlikely]] {
|
||||
bit_pos_ = 0;
|
||||
} else {
|
||||
if (ffs + 1 != kBitsTypeBits) {
|
||||
data_ = bits;
|
||||
bit_pos_ = ffs + 1;
|
||||
} else {
|
||||
bit_pos_ = 0;
|
||||
}
|
||||
return zeros + ffs;
|
||||
}
|
||||
@ -112,8 +112,7 @@ class bitstream_reader final {
|
||||
|
||||
RICEPP_FORCE_INLINE void skip_bits(size_t num_bits) {
|
||||
assert(bit_pos_ + num_bits <= kBitsTypeBits);
|
||||
bit_pos_ += num_bits;
|
||||
bit_pos_ &= kBitsTypeBits - 1;
|
||||
bit_pos_ = (bit_pos_ + num_bits) & (kBitsTypeBits - 1);
|
||||
}
|
||||
|
||||
RICEPP_FORCE_INLINE bool peek_bit() {
|
||||
@ -123,16 +122,17 @@ class bitstream_reader final {
|
||||
|
||||
RICEPP_FORCE_INLINE bits_type peek_bits(size_t num_bits) {
|
||||
assert(bit_pos_ + num_bits <= kBitsTypeBits);
|
||||
if (bit_pos_ == 0) [[unlikely]] {
|
||||
auto const bp = bit_pos_;
|
||||
if (bp == 0) {
|
||||
data_ = read_packet();
|
||||
}
|
||||
// The remainder of this function is equivalent to:
|
||||
//
|
||||
// return _bextr_u64(data_, bit_pos_, num_bits);
|
||||
// return _bextr_u64(data_, bp, num_bits);
|
||||
//
|
||||
// However, in practice, at least clang generates code that is as fast
|
||||
// as the intrinsic, so we use the following code for portability.
|
||||
bits_type bits = data_ >> bit_pos_;
|
||||
bits_type bits = data_ >> bp;
|
||||
if (num_bits < kBitsTypeBits) [[likely]] {
|
||||
bits &= (static_cast<bits_type>(1) << num_bits) - 1;
|
||||
}
|
||||
|
@ -76,7 +76,7 @@ class bitstream_writer final {
|
||||
write_packet(bits);
|
||||
repeat -= kBitsTypeBits;
|
||||
}
|
||||
if (repeat > 0) {
|
||||
if (repeat > 0) [[likely]] {
|
||||
write_bits_impl(bits, repeat);
|
||||
}
|
||||
}
|
||||
@ -86,11 +86,17 @@ class bitstream_writer final {
|
||||
static constexpr size_t kArgBits{std::numeric_limits<T>::digits};
|
||||
assert(bit_pos_ < kBitsTypeBits);
|
||||
assert(num_bits <= kArgBits);
|
||||
while (num_bits > 0) {
|
||||
size_t const bits_to_write = std::min(num_bits, kBitsTypeBits - bit_pos_);
|
||||
write_bits_impl(bits, bits_to_write);
|
||||
bits >>= bits_to_write;
|
||||
num_bits -= bits_to_write;
|
||||
if (num_bits > 0) [[likely]] {
|
||||
for (;;) {
|
||||
size_t const bits_to_write =
|
||||
std::min(num_bits, kBitsTypeBits - bit_pos_);
|
||||
write_bits_impl(bits, bits_to_write);
|
||||
bits >>= bits_to_write;
|
||||
if (num_bits == bits_to_write) [[likely]] {
|
||||
break;
|
||||
}
|
||||
num_bits -= bits_to_write;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -111,7 +117,7 @@ class bitstream_writer final {
|
||||
private:
|
||||
RICEPP_FORCE_INLINE void write_bits_impl(bits_type bits, size_t num_bits) {
|
||||
assert(bit_pos_ + num_bits <= kBitsTypeBits);
|
||||
if (num_bits < kBitsTypeBits) {
|
||||
if (num_bits < kBitsTypeBits) [[likely]] {
|
||||
bits &= (static_cast<bits_type>(1) << num_bits) - 1;
|
||||
}
|
||||
data_ |= bits << bit_pos_;
|
||||
|
@ -50,22 +50,26 @@ void decode_block(V block, BitstreamReader& reader, PixelTraits const& traits,
|
||||
|
||||
auto const fsp1 = reader.template read_bits<value_type>(kFsBits);
|
||||
|
||||
if (fsp1 == 0) [[unlikely]] {
|
||||
std::fill(block.begin(), block.end(), traits.write(last));
|
||||
} else if (fsp1 > kFsMax) [[unlikely]] {
|
||||
for (auto& b : block) {
|
||||
b = reader.template read_bits<value_type>(kPixelBits);
|
||||
if (fsp1 > 0) {
|
||||
if (fsp1 <= kFsMax) {
|
||||
auto const fs = fsp1 - 1;
|
||||
for (auto& b : block) {
|
||||
value_type diff = reader.find_first_set() << fs;
|
||||
diff |= reader.template read_bits<value_type>(fs);
|
||||
last += static_cast<std::make_signed_t<value_type>>(
|
||||
((diff & 1) * value_type(-1)) ^ (diff >> 1));
|
||||
// last += static_cast<std::make_signed_t<value_type>>(
|
||||
// (diff & 1) ? ~(diff >> 1) : (diff >> 1));
|
||||
b = traits.write(last);
|
||||
}
|
||||
} else {
|
||||
for (auto& b : block) {
|
||||
b = reader.template read_bits<value_type>(kPixelBits);
|
||||
}
|
||||
last = traits.read(block.back());
|
||||
}
|
||||
last = traits.read(block.back());
|
||||
} else {
|
||||
auto const fs = fsp1 - 1;
|
||||
for (auto& b : block) {
|
||||
value_type diff = reader.find_first_set() << fs;
|
||||
diff |= reader.template read_bits<value_type>(fs);
|
||||
last += static_cast<std::make_signed_t<value_type>>(
|
||||
(diff & 1) ? ~(diff >> 1) : (diff >> 1));
|
||||
b = traits.write(last);
|
||||
}
|
||||
std::fill(block.begin(), block.end(), traits.write(last));
|
||||
}
|
||||
|
||||
last_value = last;
|
||||
|
@ -117,23 +117,12 @@ void encode_block(V block, BitstreamWriter& writer, PixelTraits const& traits,
|
||||
|
||||
last_value = last;
|
||||
|
||||
if (sum == 0) [[unlikely]] {
|
||||
// All differences are zero, so just write a zero fs and we're done.
|
||||
writer.write_bits(0U, kFsBits);
|
||||
} else {
|
||||
if (sum > 0) [[likely]] {
|
||||
// Find the best bit position to split the difference values.
|
||||
auto const [fs, bits_used] =
|
||||
compute_best_split<kFsMax>(delta, block.size(), sum);
|
||||
|
||||
if (fs >= kFsMax || bits_used >= kPixelBits * block.size()) [[unlikely]] {
|
||||
// Difference values are too large for entropy coding. Just plain copy
|
||||
// the input pixel data. This is really unlikely, so reading the input
|
||||
// pixels again is fine.
|
||||
writer.write_bits(kFsMax + 1, kFsBits);
|
||||
for (auto& b : block) {
|
||||
writer.write_bits(b, kPixelBits);
|
||||
}
|
||||
} else {
|
||||
if (fs < kFsMax && bits_used < kPixelBits * block.size()) [[likely]] {
|
||||
// Encode the difference values using Rice entropy coding.
|
||||
writer.write_bits(fs + 1, kFsBits);
|
||||
for (size_t i = 0; i < block.size(); ++i) {
|
||||
@ -145,7 +134,18 @@ void encode_block(V block, BitstreamWriter& writer, PixelTraits const& traits,
|
||||
writer.write_bit(1);
|
||||
writer.write_bits(diff, fs);
|
||||
}
|
||||
} else {
|
||||
// Difference values are too large for entropy coding. Just plain copy
|
||||
// the input pixel data. This is really unlikely, so reading the input
|
||||
// pixels again is fine.
|
||||
writer.write_bits(kFsMax + 1, kFsBits);
|
||||
for (auto& b : block) {
|
||||
writer.write_bits(b, kPixelBits);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// All differences are zero, so just write a zero fs and we're done.
|
||||
writer.write_bits(0U, kFsBits);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user