From cc1871e674c2508f88dd77106c9f0ba0dbee2120 Mon Sep 17 00:00:00 2001
From: Ko-
Date: Thu, 16 Aug 2018 02:01:57 -0700
Subject: [PATCH] Add optimized bignum multiplication for Aarch64.

x0-x3 are skipped such that function parameters do not have to be moved.

MULADDC_INIT and MULADDC_STOP are mostly empty because it is more
efficient to keep everything in registers (and that should easily be
possible).

I considered a MULADDC_HUIT implementation, but could not think of
something that would be more efficient than basically 8 consecutive
MULADDC_CORE. You could combine the loads and stores, but it's probably
more efficient to interleave them with arithmetic, depending on the
specific microarchitecture.

NEON allows to do a 64x64->128 bit multiplication (and optional
accumulation) in one instruction, but is not great at handling carries.
---
 include/mbedtls/bn_mul.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/include/mbedtls/bn_mul.h b/include/mbedtls/bn_mul.h
index f7cb07252..4200ad43a 100644
--- a/include/mbedtls/bn_mul.h
+++ b/include/mbedtls/bn_mul.h
@@ -198,6 +198,44 @@
 
 #endif /* AMD64 */
 
+#if defined(__aarch64__)
+
+/*
+ * MULADDC_INIT opens one asm statement, every MULADDC_CORE contributes the
+ * instructions for one limb (*d += *s * b + c; the new carry ends up in c
+ * and s and d each advance by 8 bytes), and MULADDC_STOP supplies the
+ * operand lists and closes the statement.
+ *
+ * Operands: %0 = c, %1 = d, %2 = s (read-write), %3 = b (read-only).
+ * c, d and s have to be read-write operands: each core reads and updates
+ * them, so a separate input-only copy would still hold the initial value
+ * in the second and later cores. They are early-clobber (&) so that b,
+ * which later cores read after c, d and s were written, never shares a
+ * register with them. The "memory" clobber tells the compiler that the
+ * asm reads and writes memory through s and d.
+ */
+#define MULADDC_INIT \
+    asm(
+
+#define MULADDC_CORE \
+    "ldr x4, [%2], #8 \n\t" \
+    "ldr x5, [%1] \n\t" \
+    "mul x6, x4, %3 \n\t" \
+    "umulh x7, x4, %3 \n\t" \
+    "adds x5, x5, x6 \n\t" \
+    "adc x7, x7, xzr \n\t" \
+    "adds x5, x5, %0 \n\t" \
+    "adc %0, x7, xzr \n\t" \
+    "str x5, [%1], #8 \n\t"
+
+#define MULADDC_STOP \
+    : "+&r" (c), "+&r" (d), "+&r" (s) \
+    : "r" (b) \
+    : "x4", "x5", "x6", "x7", "cc", "memory" \
+    );
+
+#endif /* Aarch64 */
+
 #if defined(__mc68020__) || defined(__mcpu32__)
 
 #define MULADDC_INIT \