lib/arm/crc32: add support for ARM CRC32 instructions

Add a CRC32 implementation that uses the ARM CRC32 instructions.

This is simpler and faster than the PMULL implementation.  On AWS
Graviton2, the performance improvement is about 70%.  On Hikey960, the
performance improvement is about 30% for the Cortex-A53 cores or about
5% for the Cortex-A73 cores.

Based on work by Greg V <greg@unrelenting.technology>
(https://github.com/ebiggers/libdeflate/pull/45)
and Andrew Steinborn <git@steinborn.me>
(https://github.com/ebiggers/libdeflate/pull/76).
This commit is contained in:
Eric Biggers 2020-10-10 23:02:50 -07:00
parent 2eeaa9282e
commit ea88fa822f
2 changed files with 83 additions and 1 deletions

View File

@ -28,6 +28,80 @@
#include "cpu_features.h"
/* Implementation using ARM CRC32 instructions */
/* Make sure DISPATCH_ARM is undefined unless it gets defined below. */
#undef DISPATCH_ARM
/*
 * Build crc32_arm() only if no DEFAULT_IMPL has been chosen yet and either
 * __ARM_FEATURE_CRC32 is already defined by the compiler, or both
 * ARM_CPU_FEATURES_ENABLED and COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS are
 * nonzero (judging by their names: runtime CPU feature detection is enabled
 * and the compiler can use the CRC32 intrinsics via a target attribute).
 */
#if !defined(DEFAULT_IMPL) && \
(defined(__ARM_FEATURE_CRC32) || \
(ARM_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS))
/*
 * Case 1: __ARM_FEATURE_CRC32 is defined.  No per-function target attribute is
 * needed, and crc32_arm becomes the default implementation.
 */
# ifdef __ARM_FEATURE_CRC32
# define ATTRIBUTES
# define DEFAULT_IMPL crc32_arm
/*
 * Case 2: __ARM_FEATURE_CRC32 is not defined.  crc32_arm() is compiled with a
 * function-level target attribute, and DISPATCH / DISPATCH_ARM are defined so
 * that it is only selected at runtime when the CPU reports CRC32 support.
 * The spelling of the target string differs between __arm__ and non-__arm__
 * builds, and between clang and other compilers (presumably gcc).
 */
# else
# ifdef __arm__
# ifdef __clang__
# define ATTRIBUTES __attribute__((target("armv8-a,crc")))
# else
# define ATTRIBUTES __attribute__((target("arch=armv8-a+crc")))
# endif
# else /* !__arm__ (presumably arm64) */
# ifdef __clang__
# define ATTRIBUTES __attribute__((target("crc")))
# else
# define ATTRIBUTES __attribute__((target("+crc")))
# endif
# endif
# define DISPATCH 1
# define DISPATCH_ARM 1
# endif
/*
* gcc's (as of 10.1) version of arm_acle.h for arm32, and clang's (as of
* 10.0.1) version of arm_acle.h for both arm32 and arm64, have a bug where they
* only define the CRC32 functions like __crc32b() when __ARM_FEATURE_CRC32 is
* defined. That prevents them from being used via __attribute__((target)) when
* the main target doesn't have CRC32 support enabled. The actual built-ins
* like __builtin_arm_crc32b() are available and work, however; it's just the
* wrappers in arm_acle.h like __crc32b() that erroneously don't get defined.
* Work around this by manually defining __ARM_FEATURE_CRC32.
*/
#ifndef __ARM_FEATURE_CRC32
# define __ARM_FEATURE_CRC32 1
#endif
#include <arm_acle.h>
/*
 * Update the CRC-32 'remainder' with the 'size' bytes starting at 'p', using
 * the ARM CRC32 intrinsics __crc32b() (one byte) and __crc32d() (eight bytes).
 *
 * The input is consumed in four phases:
 *   1. single bytes until 'p' is 8-byte aligned (or the input is exhausted),
 *   2. 32 bytes (four 64-bit words) per iteration,
 *   3. 8 bytes (one 64-bit word) per iteration,
 *   4. any remaining bytes one at a time.
 *
 * Each 64-bit word is passed through le64_bswap() before being fed to
 * __crc32d() (presumably a no-op on little-endian targets; confirm against its
 * definition).
 *
 * Returns the updated remainder.  A 'size' of 0 returns 'remainder' unchanged.
 */
static u32 ATTRIBUTES
crc32_arm(u32 remainder, const u8 *p, size_t size)
{
	/* Phase 1: byte-at-a-time until p is 8-byte aligned. */
	while (size != 0 && ((uintptr_t)p & 7) != 0) {
		remainder = __crc32b(remainder, *p++);
		size--;
	}

	/* Phase 2: p is now 8-byte aligned; process four words per iteration. */
	while (size >= 32) {
		/* Keep the const qualifier when viewing the bytes as words. */
		const u64 *words = (const u64 *)p;

		remainder = __crc32d(remainder, le64_bswap(words[0]));
		remainder = __crc32d(remainder, le64_bswap(words[1]));
		remainder = __crc32d(remainder, le64_bswap(words[2]));
		remainder = __crc32d(remainder, le64_bswap(words[3]));
		p += 32;
		size -= 32;
	}

	/* Phase 3: remaining whole 64-bit words, one per iteration. */
	while (size >= 8) {
		remainder = __crc32d(remainder, le64_bswap(*(const u64 *)p));
		p += 8;
		size -= 8;
	}

	/* Phase 4: fewer than 8 bytes remain; finish byte-at-a-time. */
	while (size != 0) {
		remainder = __crc32b(remainder, *p++);
		size--;
	}

	return remainder;
}
#undef ATTRIBUTES
#endif /* Implementation using ARM CRC32 instructions */
/*
* CRC-32 folding with ARM Crypto extension-PMULL
*
@ -155,6 +229,10 @@ arch_select_crc32_func(void)
{
u32 features = get_cpu_features();
#ifdef DISPATCH_ARM
if (features & ARM_CPU_FEATURE_CRC32)
return crc32_arm;
#endif
#ifdef DISPATCH_PMULL
if (features & ARM_CPU_FEATURE_PMULL)
return crc32_pmull;

View File

@ -112,9 +112,13 @@ i386|x86_64)
fi
;;
arm*|aarch*)
if have_cpu_feature crc32; then
do_benchmark "ARM"
disable_cpu_feature "crc32" "-march=armv8-a+nocrc"
fi
if have_cpu_feature pmull; then
do_benchmark "PMULL"
disable_cpu_feature "pmull" "-march=armv8-a+nocrypto"
disable_cpu_feature "pmull" "-march=armv8-a+nocrc+nocrypto"
fi
;;
esac