/*
 * unaligned.h - inline functions for unaligned memory accesses
 */

#ifndef LIB_UNALIGNED_H
#define LIB_UNALIGNED_H

#include "lib_common.h"

/***** Unaligned loads and stores without endianness conversion *****/

/*
 * memcpy() is portable, and it usually gets optimized appropriately by modern
 * compilers.  I.e., each memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled
 * to a load or store instruction, not to an actual function call.
 *
 * We no longer use the "packed struct" approach, as that is nonstandard, has
 * unclear semantics, and doesn't receive enough testing
 * (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994).
 *
 * arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception
 * where memcpy() generates inefficient code
 * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366).  However, we no longer
 * consider that one case important enough to maintain different code for.
 * If you run into it, please just use a newer version of gcc (or use clang).
 */

#define DEFINE_UNALIGNED_TYPE(type)				\
static forceinline type						\
load_##type##_unaligned(const void *p)				\
{								\
	type v;							\
	memcpy(&v, p, sizeof(v));				\
	return v;						\
}								\
								\
static forceinline void						\
store_##type##_unaligned(type v, void *p)			\
{								\
	memcpy(p, &v, sizeof(v));				\
}

DEFINE_UNALIGNED_TYPE(u16)
DEFINE_UNALIGNED_TYPE(u32)
DEFINE_UNALIGNED_TYPE(u64)
DEFINE_UNALIGNED_TYPE(machine_word_t)

#define load_word_unaligned	load_machine_word_t_unaligned
#define store_word_unaligned	store_machine_word_t_unaligned
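
/*
 * Illustrative usage sketch (not part of this header; 'src' and 'dst' are
 * arbitrary byte pointers):
 *
 *	store_word_unaligned(load_word_unaligned(src), dst);
 *
 * copies one machine word, which modern compilers typically lower to a single
 * unaligned load and a single unaligned store rather than memcpy() calls.
 */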

/***** Unaligned loads with endianness conversion *****/

static forceinline u16
get_unaligned_le16(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return le16_bswap(load_u16_unaligned(p));
	else
		return ((u16)p[1] << 8) | p[0];
}

static forceinline u16
get_unaligned_be16(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return be16_bswap(load_u16_unaligned(p));
	else
		return ((u16)p[0] << 8) | p[1];
}

static forceinline u32
get_unaligned_le32(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return le32_bswap(load_u32_unaligned(p));
	else
		return ((u32)p[3] << 24) | ((u32)p[2] << 16) |
		       ((u32)p[1] << 8) | p[0];
}

static forceinline u32
get_unaligned_be32(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return be32_bswap(load_u32_unaligned(p));
	else
		return ((u32)p[0] << 24) | ((u32)p[1] << 16) |
		       ((u32)p[2] << 8) | p[3];
}
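
/*
 * Worked example (illustrative only): if p points at the bytes
 * {0x01, 0x02, 0x03, 0x04}, then get_unaligned_le32(p) returns 0x04030201
 * and get_unaligned_be32(p) returns 0x01020304, on both the fast path and
 * the byte-by-byte path.
 */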

static forceinline u64
get_unaligned_le64(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return le64_bswap(load_u64_unaligned(p));
	else
		return ((u64)p[7] << 56) | ((u64)p[6] << 48) |
		       ((u64)p[5] << 40) | ((u64)p[4] << 32) |
		       ((u64)p[3] << 24) | ((u64)p[2] << 16) |
		       ((u64)p[1] << 8) | p[0];
}

static forceinline machine_word_t
get_unaligned_leword(const u8 *p)
{
	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
	if (WORDBITS == 32)
		return get_unaligned_le32(p);
	else
		return get_unaligned_le64(p);
}

/***** Unaligned stores with endianness conversion *****/

static forceinline void
put_unaligned_le16(u16 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u16_unaligned(le16_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 0);
		p[1] = (u8)(v >> 8);
	}
}

static forceinline void
put_unaligned_be16(u16 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u16_unaligned(be16_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 8);
		p[1] = (u8)(v >> 0);
	}
}

static forceinline void
put_unaligned_le32(u32 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u32_unaligned(le32_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 0);
		p[1] = (u8)(v >> 8);
		p[2] = (u8)(v >> 16);
		p[3] = (u8)(v >> 24);
	}
}

static forceinline void
put_unaligned_be32(u32 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u32_unaligned(be32_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 24);
		p[1] = (u8)(v >> 16);
		p[2] = (u8)(v >> 8);
		p[3] = (u8)(v >> 0);
	}
}
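
/*
 * Worked example (illustrative only): put_unaligned_le32(0x04030201, p)
 * stores the bytes {0x01, 0x02, 0x03, 0x04} at p, while
 * put_unaligned_be32(0x04030201, p) stores {0x04, 0x03, 0x02, 0x01}; i.e.
 * these are the inverses of get_unaligned_le32() and get_unaligned_be32().
 */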

static forceinline void
put_unaligned_le64(u64 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u64_unaligned(le64_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 0);
		p[1] = (u8)(v >> 8);
		p[2] = (u8)(v >> 16);
		p[3] = (u8)(v >> 24);
		p[4] = (u8)(v >> 32);
		p[5] = (u8)(v >> 40);
		p[6] = (u8)(v >> 48);
		p[7] = (u8)(v >> 56);
	}
}

static forceinline void
put_unaligned_leword(machine_word_t v, u8 *p)
{
	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
	if (WORDBITS == 32)
		put_unaligned_le32(v, p);
	else
		put_unaligned_le64(v, p);
}

/***** 24-bit loads *****/

/*
 * Given a 32-bit value that was loaded with the platform's native endianness,
 * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24
 * bits contain the first 3 bytes of the memory location from which the input
 * 32-bit value was loaded, arranged in a platform-dependent byte order.
 */
static forceinline u32
loaded_u32_to_u24(u32 v)
{
	if (CPU_IS_LITTLE_ENDIAN())
		return v & 0xFFFFFF;
	else
		return v >> 8;
}

/*
 * Load the next 3 bytes from the memory location @p into the 24 low-order bits
 * of a 32-bit value.  The order in which the 3 bytes will be arranged as octets
 * in the 24 bits is platform-dependent.  At least LOAD_U24_REQUIRED_NBYTES
 * bytes must be available at @p; note that this may be more than 3.
 */
static forceinline u32
load_u24_unaligned(const u8 *p)
{
#if UNALIGNED_ACCESS_IS_FAST
#  define LOAD_U24_REQUIRED_NBYTES 4
	return loaded_u32_to_u24(load_u32_unaligned(p));
#else
#  define LOAD_U24_REQUIRED_NBYTES 3
	if (CPU_IS_LITTLE_ENDIAN())
		return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16);
	else
		return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16);
#endif
}
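
/*
 * Worked example (illustrative only): given the bytes {0xAA, 0xBB, 0xCC},
 * load_u24_unaligned() returns 0x00CCBBAA on a little-endian platform and
 * 0x00AABBCC on a big-endian one; the fast path and the byte-by-byte path
 * produce the same platform-dependent result.
 */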

#endif /* LIB_UNALIGNED_H */