libdeflate/lib/unaligned.h
Commit 0f5238f0ad by Eric Biggers (2020-05-08): lib: remove the "packed struct" approach to unaligned memory access
gcc 10 is miscompiling libdeflate on x86_64 at -O3 due to a regression
in how packed structs are handled
(https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994).

Work around this by just always using memcpy() for unaligned accesses.
It's unclear that the "packed struct" approach is worthwhile to maintain
anymore.  Currently I'm only aware that it's useful with old versions of gcc
on arm32.  Hopefully, compilers are good enough now that we can simply use
memcpy() everywhere.

Update https://github.com/ebiggers/libdeflate/issues/64
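
For reference, the removed idiom and its replacement look roughly like the
sketch below (a generic illustration of the two techniques, assuming a u32
typedef and <string.h>; not the exact libdeflate code):

	/* Old approach: read through a pointer to a packed single-member
	 * struct, relying on a compiler extension to allow the misaligned
	 * access. */
	struct u32_unaligned { u32 v; } __attribute__((packed));

	static inline u32
	old_load_u32_unaligned(const void *p)
	{
		return ((const struct u32_unaligned *)p)->v;
	}

	/* New approach: a fixed-size memcpy(), which modern compilers
	 * optimize into a single (possibly unaligned) load instruction. */
	static inline u32
	new_load_u32_unaligned(const void *p)
	{
		u32 v;

		memcpy(&v, p, sizeof(v));
		return v;
	}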

/*
 * unaligned.h - inline functions for unaligned memory accesses
 */

#ifndef LIB_UNALIGNED_H
#define LIB_UNALIGNED_H

#include "lib_common.h"

/***** Unaligned loads and stores without endianness conversion *****/

/*
 * memcpy() is portable, and it usually gets optimized appropriately by modern
 * compilers.  I.e., each memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled
 * to a load or store instruction, not to an actual function call.
 *
 * We no longer use the "packed struct" approach, as that is nonstandard, has
 * unclear semantics, and doesn't receive enough testing
 * (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994).
 *
 * arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception
 * where memcpy() generates inefficient code
 * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366).  However, we no longer
 * consider that one case important enough to maintain different code for.
 * If you run into it, please just use a newer version of gcc (or use clang).
 */

#define DEFINE_UNALIGNED_TYPE(type) \
static forceinline type \
load_##type##_unaligned(const void *p) \
{ \
	type v; \
	memcpy(&v, p, sizeof(v)); \
	return v; \
} \
\
static forceinline void \
store_##type##_unaligned(type v, void *p) \
{ \
	memcpy(p, &v, sizeof(v)); \
}

DEFINE_UNALIGNED_TYPE(u16)
DEFINE_UNALIGNED_TYPE(u32)
DEFINE_UNALIGNED_TYPE(u64)
DEFINE_UNALIGNED_TYPE(machine_word_t)

#define load_word_unaligned	load_machine_word_t_unaligned
#define store_word_unaligned	store_machine_word_t_unaligned
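
/*
 * Illustrative sketch (hypothetical helper, not part of upstream libdeflate):
 * word-sized accessors like the ones above are typically combined to move
 * data one machine word at a time, e.g. as follows.  The caller must
 * guarantee that both buffers may be over-read and over-written by up to
 * WORDBYTES - 1 bytes past 'count'.
 */
#if 0
static forceinline void
copy_words_unaligned_example(u8 *dst, const u8 *src, size_t count)
{
	u8 *end = dst + count;

	while (dst < end) {
		store_word_unaligned(load_word_unaligned(src), dst);
		src += WORDBYTES;
		dst += WORDBYTES;
	}
}
#endif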

/***** Unaligned loads with endianness conversion *****/

static forceinline u16
get_unaligned_le16(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return le16_bswap(load_u16_unaligned(p));
	else
		return ((u16)p[1] << 8) | p[0];
}

static forceinline u16
get_unaligned_be16(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return be16_bswap(load_u16_unaligned(p));
	else
		return ((u16)p[0] << 8) | p[1];
}

static forceinline u32
get_unaligned_le32(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return le32_bswap(load_u32_unaligned(p));
	else
		return ((u32)p[3] << 24) | ((u32)p[2] << 16) |
		       ((u32)p[1] << 8) | p[0];
}

static forceinline u32
get_unaligned_be32(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return be32_bswap(load_u32_unaligned(p));
	else
		return ((u32)p[0] << 24) | ((u32)p[1] << 16) |
		       ((u32)p[2] << 8) | p[3];
}

static forceinline u64
get_unaligned_le64(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return le64_bswap(load_u64_unaligned(p));
	else
		return ((u64)p[7] << 56) | ((u64)p[6] << 48) |
		       ((u64)p[5] << 40) | ((u64)p[4] << 32) |
		       ((u64)p[3] << 24) | ((u64)p[2] << 16) |
		       ((u64)p[1] << 8) | p[0];
}

static forceinline machine_word_t
get_unaligned_leword(const u8 *p)
{
	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
	if (WORDBITS == 32)
		return get_unaligned_le32(p);
	else
		return get_unaligned_le64(p);
}
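
/*
 * Example: if the bytes at p are {0x78, 0x56, 0x34, 0x12}, then
 * get_unaligned_le32(p) == 0x12345678 and get_unaligned_be32(p) == 0x78563412,
 * regardless of the endianness of the host.
 */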

/***** Unaligned stores with endianness conversion *****/

static forceinline void
put_unaligned_le16(u16 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u16_unaligned(le16_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 0);
		p[1] = (u8)(v >> 8);
	}
}

static forceinline void
put_unaligned_be16(u16 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u16_unaligned(be16_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 8);
		p[1] = (u8)(v >> 0);
	}
}

static forceinline void
put_unaligned_le32(u32 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u32_unaligned(le32_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 0);
		p[1] = (u8)(v >> 8);
		p[2] = (u8)(v >> 16);
		p[3] = (u8)(v >> 24);
	}
}

static forceinline void
put_unaligned_be32(u32 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u32_unaligned(be32_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 24);
		p[1] = (u8)(v >> 16);
		p[2] = (u8)(v >> 8);
		p[3] = (u8)(v >> 0);
	}
}

static forceinline void
put_unaligned_le64(u64 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u64_unaligned(le64_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 0);
		p[1] = (u8)(v >> 8);
		p[2] = (u8)(v >> 16);
		p[3] = (u8)(v >> 24);
		p[4] = (u8)(v >> 32);
		p[5] = (u8)(v >> 40);
		p[6] = (u8)(v >> 48);
		p[7] = (u8)(v >> 56);
	}
}

static forceinline void
put_unaligned_leword(machine_word_t v, u8 *p)
{
	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
	if (WORDBITS == 32)
		put_unaligned_le32(v, p);
	else
		put_unaligned_le64(v, p);
}
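
/*
 * Example: put_unaligned_le32(0x12345678, p) sets the bytes at p to
 * {0x78, 0x56, 0x34, 0x12}, whereas put_unaligned_be32(0x12345678, p) sets
 * them to {0x12, 0x34, 0x56, 0x78}, on either kind of host.
 */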

/***** 24-bit loads *****/

/*
 * Given a 32-bit value that was loaded with the platform's native endianness,
 * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24
 * bits contain the first 3 bytes, arranged in octets in a platform-dependent
 * order, at the memory location from which the input 32-bit value was loaded.
 */
static forceinline u32
loaded_u32_to_u24(u32 v)
{
	if (CPU_IS_LITTLE_ENDIAN())
		return v & 0xFFFFFF;
	else
		return v >> 8;
}
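
/*
 * Example: if the bytes at p are {0xAA, 0xBB, 0xCC, 0xDD}, then
 * loaded_u32_to_u24(load_u32_unaligned(p)) is 0xCCBBAA on a little-endian
 * host (v == 0xDDCCBBAA, masked) and 0xAABBCC on a big-endian host
 * (v == 0xAABBCCDD, shifted right).  Either way, the result depends only on
 * the first 3 bytes p[0..2].
 */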

/*
 * Load the next 3 bytes from the memory location @p into the 24 low-order bits
 * of a 32-bit value.  The order in which the 3 bytes will be arranged as octets
 * in the 24 bits is platform-dependent.  At least LOAD_U24_REQUIRED_NBYTES
 * bytes must be available at @p; note that this may be more than 3.
 */
static forceinline u32
load_u24_unaligned(const u8 *p)
{
#if UNALIGNED_ACCESS_IS_FAST
#  define LOAD_U24_REQUIRED_NBYTES 4
	return loaded_u32_to_u24(load_u32_unaligned(p));
#else
#  define LOAD_U24_REQUIRED_NBYTES 3
	if (CPU_IS_LITTLE_ENDIAN())
		return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16);
	else
		return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16);
#endif
}
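
/*
 * Illustrative sketch (hypothetical caller, not part of upstream libdeflate):
 * since LOAD_U24_REQUIRED_NBYTES may be 4, a bounds check must cover the full
 * required span, not just the 3 bytes actually consumed:
 */
#if 0
static forceinline u32
load_u24_checked_example(const u8 *p, const u8 *end, u32 fallback)
{
	if (end - p >= LOAD_U24_REQUIRED_NBYTES)
		return load_u24_unaligned(p);
	return fallback;
}
#endif
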
#endif /* LIB_UNALIGNED_H */