These wire Andy Polyakov's implementations up to the kernel. We make a few small changes to the assembly: - Entries and exits use the proper kernel convention macro. - CPU feature checking is done in C by the glue code, so that has been removed from the assembly. - The function names have been renamed to fit kernel conventions. - Labels have been renamed to fit kernel conventions. - The neon code can jump to the scalar code when it makes sense to do so. The NEON code uses base 2^26, while the scalar code uses base 2^64 on 64-bit and base 2^32 on 32-bit. If we hit the unfortunate situation of using NEON and then having to go back to scalar -- because the user is silly and has called the update function from two separate contexts -- then we need to convert back to the original base before proceeding. It is possible to reason that the initial reduction below is sufficient given the implementation invariants. However, for an avoidance of doubt and because this is not performance critical, we do the full reduction anyway. This conversion is found in the glue code, and a proof of correctness may be easily obtained from Z3: <https://xn--4db.cc/ltPtHCKN/py>. Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx> Cc: Russell King <linux@xxxxxxxxxxxxxxx> Cc: linux-arm-kernel@xxxxxxxxxxxxxxxxxxx Cc: Samuel Neves <sneves@xxxxxxxxx> Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@xxxxxxxxx> Cc: Andy Lutomirski <luto@xxxxxxxxxx> Cc: Greg KH <gregkh@xxxxxxxxxxxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Cc: kernel-hardening@xxxxxxxxxxxxxxxxxx Cc: linux-crypto@xxxxxxxxxxxxxxx --- lib/zinc/Makefile | 2 + lib/zinc/poly1305/poly1305-arm-glue.c | 140 +++++++++++++++++ ...ly1305-arm-cryptogams.S => poly1305-arm.S} | 147 ++++++------------ ...05-arm64-cryptogams.S => poly1305-arm64.S} | 127 +++++---------- lib/zinc/poly1305/poly1305.c | 2 + 5 files changed, 231 insertions(+), 187 deletions(-) create mode 100644 lib/zinc/poly1305/poly1305-arm-glue.c rename lib/zinc/poly1305/{poly1305-arm-cryptogams.S => poly1305-arm.S} (91%) rename lib/zinc/poly1305/{poly1305-arm64-cryptogams.S => poly1305-arm64.S} (89%) diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile index a8943d960b6a..c09fd3de60f9 100644 --- a/lib/zinc/Makefile +++ b/lib/zinc/Makefile @@ -12,4 +12,6 @@ obj-$(CONFIG_ZINC_CHACHA20) += zinc_chacha20.o zinc_poly1305-y := poly1305/poly1305.o zinc_poly1305-$(CONFIG_ZINC_ARCH_X86_64) += poly1305/poly1305-x86_64.o +zinc_poly1305-$(CONFIG_ZINC_ARCH_ARM) += poly1305/poly1305-arm.o +zinc_poly1305-$(CONFIG_ZINC_ARCH_ARM64) += poly1305/poly1305-arm64.o obj-$(CONFIG_ZINC_POLY1305) += zinc_poly1305.o diff --git a/lib/zinc/poly1305/poly1305-arm-glue.c b/lib/zinc/poly1305/poly1305-arm-glue.c new file mode 100644 index 000000000000..f4f08ecffbf6 --- /dev/null +++ b/lib/zinc/poly1305/poly1305-arm-glue.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved. + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> + +asmlinkage void poly1305_init_arm(void *ctx, const u8 key[16]); +asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_emit_arm(void *ctx, u8 mac[16], const u32 nonce[4]); +asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_emit_neon(void *ctx, u8 mac[16], const u32 nonce[4]); + +static bool poly1305_use_neon __ro_after_init; +static bool *const poly1305_nobs[] __initconst = { &poly1305_use_neon }; + +static void __init poly1305_fpu_init(void) +{ +#if defined(CONFIG_ZINC_ARCH_ARM64) + poly1305_use_neon = elf_hwcap & HWCAP_ASIMD; +#elif defined(CONFIG_ZINC_ARCH_ARM) + poly1305_use_neon = elf_hwcap & HWCAP_NEON; +#endif +} + +#if defined(CONFIG_ZINC_ARCH_ARM64) +struct poly1305_arch_internal { + union { + u32 h[5]; + struct { + u64 h0, h1, h2; + }; + }; + u64 is_base2_26; + u64 r[2]; +}; +#elif defined(CONFIG_ZINC_ARCH_ARM) +struct poly1305_arch_internal { + union { + u32 h[5]; + struct { + u64 h0, h1; + u32 h2; + } __packed; + }; + u32 r[4]; + u32 is_base2_26; +}; +#endif + +/* The NEON code uses base 2^26, while the scalar code uses base 2^64 on 64-bit + * and base 2^32 on 32-bit. If we hit the unfortunate situation of using NEON + * and then having to go back to scalar -- because the user is silly and has + * called the update function from two separate contexts -- then we need to + * convert back to the original base before proceeding. The below function is + * written for 64-bit integers, and so we have to swap words at the end on + * big-endian 32-bit. It is possible to reason that the initial reduction below + * is sufficient given the implementation invariants. However, for an avoidance + * of doubt and because this is not performance critical, we do the full + * reduction anyway. + */ +static void convert_to_base2_64(void *ctx) +{ + struct poly1305_arch_internal *state = ctx; + u32 cy; + + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !state->is_base2_26) + return; + + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; + state->h0 = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; + state->h1 = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); + state->h2 = state->h[4] >> 24; + if (IS_ENABLED(CONFIG_ZINC_ARCH_ARM) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) { + state->h0 = rol64(state->h0, 32); + state->h1 = rol64(state->h1, 32); + } +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) + cy = (state->h2 >> 2) + (state->h2 & ~3ULL); + state->h2 &= 3; + state->h0 += cy; + state->h1 += (cy = ULT(state->h0, cy)); + state->h2 += ULT(state->h1, cy); +#undef ULT + state->is_base2_26 = 0; +} + +static inline bool poly1305_init_arch(void *ctx, + const u8 key[POLY1305_KEY_SIZE]) +{ + poly1305_init_arm(ctx, key); + return true; +} + +static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp, + size_t len, const u32 padbit, + simd_context_t *simd_context) +{ + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || + PAGE_SIZE % POLY1305_BLOCK_SIZE); + + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !poly1305_use_neon || + !simd_use(simd_context)) { + convert_to_base2_64(ctx); + poly1305_blocks_arm(ctx, inp, len, padbit); + return true; + } + + for (;;) { + const size_t bytes = min_t(size_t, len, PAGE_SIZE); + + poly1305_blocks_neon(ctx, inp, bytes, padbit); + len -= bytes; + if (!len) + break; + inp += bytes; + simd_relax(simd_context); + } + return true; +} + +static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE], + const u32 nonce[4], + simd_context_t *simd_context) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !poly1305_use_neon || + !simd_use(simd_context)) { + convert_to_base2_64(ctx); + poly1305_emit_arm(ctx, mac, nonce); + } else + poly1305_emit_neon(ctx, mac, nonce); + return true; +} diff --git a/lib/zinc/poly1305/poly1305-arm-cryptogams.S b/lib/zinc/poly1305/poly1305-arm.S similarity index 91% rename from lib/zinc/poly1305/poly1305-arm-cryptogams.S rename to lib/zinc/poly1305/poly1305-arm.S index 884b465030e4..4a0e9d451119 100644 --- a/lib/zinc/poly1305/poly1305-arm-cryptogams.S +++ b/lib/zinc/poly1305/poly1305-arm.S @@ -1,9 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ /* + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved. * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@xxxxxxxxxxx>. All Rights Reserved. + * + * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS. */ -#include "arm_arch.h" +#include <linux/linkage.h> .text #if defined(__thumb2__) @@ -13,13 +16,8 @@ .code 32 #endif -.globl poly1305_emit -.globl poly1305_blocks -.globl poly1305_init -.type poly1305_init,%function .align 5 -poly1305_init: -.Lpoly1305_init: +ENTRY(poly1305_init_arm) stmdb sp!,{r4-r11} eor r3,r3,r3 @@ -38,10 +36,6 @@ poly1305_init: moveq r0,#0 beq .Lno_key -#if __ARM_MAX_ARCH__>=7 - adr r11,.Lpoly1305_init - ldr r12,.LOPENSSL_armcap -#endif ldrb r4,[r1,#0] mov r10,#0x0fffffff ldrb r5,[r1,#1] @@ -56,12 +50,6 @@ poly1305_init: ldrb r7,[r1,#6] and r4,r4,r10 -#if __ARM_MAX_ARCH__>=7 - ldr r12,[r11,r12] @ OPENSSL_armcap_P -# ifdef __APPLE__ - ldr r12,[r12] -# endif -#endif ldrb r8,[r1,#7] orr r5,r5,r6,lsl#8 ldrb r6,[r1,#8] @@ -71,35 +59,6 @@ poly1305_init: ldrb r8,[r1,#10] and r5,r5,r3 -#if __ARM_MAX_ARCH__>=7 - tst r12,#ARMV7_NEON @ check for NEON -# ifdef __APPLE__ - adr r9,poly1305_blocks_neon - adr r11,poly1305_blocks -# ifdef __thumb2__ - it ne -# endif - movne r11,r9 - adr r12,poly1305_emit - adr r10,poly1305_emit_neon -# ifdef __thumb2__ - it ne -# endif - movne r12,r10 -# else -# ifdef __thumb2__ - itete eq -# endif - addeq r12,r11,#(poly1305_emit-.Lpoly1305_init) - addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init) - addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init) - addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init) -# endif -# ifdef __thumb2__ - orr r12,r12,#1 @ thumb-ify address - orr r11,r11,#1 -# endif -#endif ldrb r9,[r1,#11] orr r6,r6,r7,lsl#8 ldrb r7,[r1,#12] @@ -118,26 +77,20 @@ poly1305_init: str r6,[r0,#8] and r7,r7,r3 str r7,[r0,#12] -#if __ARM_MAX_ARCH__>=7 - stmia r2,{r11,r12} @ fill functions table - mov r0,#1 -#else - mov r0,#0 -#endif .Lno_key: ldmia sp!,{r4-r11} -#if __ARM_ARCH__>=5 +#if __LINUX_ARM_ARCH__ >= 5 bx lr @ bx lr #else tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif -.size poly1305_init,.-poly1305_init -.type poly1305_blocks,%function +ENDPROC(poly1305_init_arm) + .align 5 -poly1305_blocks: -.Lpoly1305_blocks: +ENTRY(poly1305_blocks_arm) +.Lpoly1305_blocks_arm: stmdb sp!,{r3-r11,lr} ands r2,r2,#-16 @@ -158,11 +111,11 @@ poly1305_blocks: b .Loop .Loop: -#if __ARM_ARCH__<7 +#if __LINUX_ARM_ARCH__ < 7 ldrb r0,[lr],#16 @ load input -# ifdef __thumb2__ +#ifdef __thumb2__ it hi -# endif +#endif addhi r8,r8,#1 @ 1<<128 ldrb r1,[lr,#-15] ldrb r2,[lr,#-14] @@ -201,19 +154,19 @@ poly1305_blocks: orr r3,r2,r3,lsl#24 #else ldr r0,[lr],#16 @ load input -# ifdef __thumb2__ +#ifdef __thumb2__ it hi -# endif +#endif addhi r8,r8,#1 @ padbit ldr r1,[lr,#-12] ldr r2,[lr,#-8] ldr r3,[lr,#-4] -# ifdef __ARMEB__ +#ifdef __ARMEB__ rev r0,r0 rev r1,r1 rev r2,r2 rev r3,r3 -# endif +#endif adds r4,r4,r0 @ accumulate input str lr,[sp,#8] @ offload input pointer adcs r5,r5,r1 @@ -283,7 +236,7 @@ poly1305_blocks: stmia r0,{r4-r8} @ store the result .Lno_data: -#if __ARM_ARCH__>=5 +#if __LINUX_ARM_ARCH__ >= 5 ldmia sp!,{r3-r11,pc} #else ldmia sp!,{r3-r11,lr} @@ -291,13 +244,12 @@ poly1305_blocks: moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif -.size poly1305_blocks,.-poly1305_blocks -.type poly1305_emit,%function +ENDPROC(poly1305_blocks_arm) + .align 5 -poly1305_emit: +ENTRY(poly1305_emit_arm) stmdb sp!,{r4-r11} .Lpoly1305_emit_enter: - ldmia r0,{r3-r7} adds r8,r3,#5 @ compare to modulus adcs r9,r4,#0 @@ -332,13 +284,13 @@ poly1305_emit: adcs r5,r5,r10 adc r6,r6,r11 -#if __ARM_ARCH__>=7 -# ifdef __ARMEB__ +#if __LINUX_ARM_ARCH__ >= 7 +#ifdef __ARMEB__ rev r3,r3 rev r4,r4 rev r5,r5 rev r6,r6 -# endif +#endif str r3,[r1,#0] str r4,[r1,#4] str r5,[r1,#8] @@ -377,20 +329,22 @@ poly1305_emit: strb r6,[r1,#15] #endif ldmia sp!,{r4-r11} -#if __ARM_ARCH__>=5 +#if __LINUX_ARM_ARCH__ >= 5 bx lr @ bx lr #else tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif -.size poly1305_emit,.-poly1305_emit -#if __ARM_MAX_ARCH__>=7 +ENDPROC(poly1305_emit_arm) + + +#ifdef CONFIG_KERNEL_MODE_NEON .fpu neon -.type poly1305_init_neon,%function .align 5 -poly1305_init_neon: +ENTRY(poly1305_init_neon) +.Lpoly1305_init_neon: ldr r4,[r0,#20] @ load key base 2^32 ldr r5,[r0,#24] ldr r6,[r0,#28] @@ -600,11 +554,10 @@ poly1305_init_neon: vst1.32 {d8[1]},[r7] bx lr @ bx lr -.size poly1305_init_neon,.-poly1305_init_neon +ENDPROC(poly1305_init_neon) -.type poly1305_blocks_neon,%function .align 5 -poly1305_blocks_neon: +ENTRY(poly1305_blocks_neon) ldr ip,[r0,#36] @ is_base2_26 ands r2,r2,#-16 beq .Lno_data_neon @@ -612,7 +565,7 @@ poly1305_blocks_neon: cmp r2,#64 bhs .Lenter_neon tst ip,ip @ is_base2_26? - beq .Lpoly1305_blocks + beq .Lpoly1305_blocks_arm .Lenter_neon: stmdb sp!,{r4-r7} @@ -622,7 +575,7 @@ poly1305_blocks_neon: bne .Lbase2_26_neon stmdb sp!,{r1-r3,lr} - bl poly1305_init_neon + bl .Lpoly1305_init_neon ldr r4,[r0,#0] @ load hash value base 2^32 ldr r5,[r0,#4] @@ -686,12 +639,12 @@ poly1305_blocks_neon: sub r2,r2,#16 add r4,r1,#32 -# ifdef __ARMEB__ +#ifdef __ARMEB__ vrev32.8 q10,q10 vrev32.8 q13,q13 vrev32.8 q11,q11 vrev32.8 q12,q12 -# endif +#endif vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26 vshl.u32 d26,d26,#18 @@ -735,12 +688,12 @@ poly1305_blocks_neon: addhi r7,r0,#(48+1*9*4) addhi r6,r0,#(48+3*9*4) -# ifdef __ARMEB__ +#ifdef __ARMEB__ vrev32.8 q10,q10 vrev32.8 q13,q13 vrev32.8 q11,q11 vrev32.8 q12,q12 -# endif +#endif vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26 vshl.u32 q13,q13,#18 @@ -866,12 +819,12 @@ poly1305_blocks_neon: vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1] add r1,r1,#64 -# ifdef __ARMEB__ +#ifdef __ARMEB__ vrev32.8 q10,q10 vrev32.8 q11,q11 vrev32.8 q12,q12 vrev32.8 q13,q13 -# endif +#endif @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @ lazy reduction interleaved with base 2^32 -> base 2^26 of @@ -1086,11 +1039,10 @@ poly1305_blocks_neon: ldmia sp!,{r4-r7} .Lno_data_neon: bx lr @ bx lr -.size poly1305_blocks_neon,.-poly1305_blocks_neon +ENDPROC(poly1305_blocks_neon) -.type poly1305_emit_neon,%function .align 5 -poly1305_emit_neon: +ENTRY(poly1305_emit_neon) ldr ip,[r0,#36] @ is_base2_26 stmdb sp!,{r4-r11} @@ -1144,12 +1096,12 @@ poly1305_emit_neon: adcs r5,r5,r10 adc r6,r6,r11 -# ifdef __ARMEB__ +#ifdef __ARMEB__ rev r3,r3 rev r4,r4 rev r5,r5 rev r6,r6 -# endif +#endif str r3,[r1,#0] @ store the result str r4,[r1,#4] str r5,[r1,#8] @@ -1157,16 +1109,9 @@ poly1305_emit_neon: ldmia sp!,{r4-r11} bx lr @ bx lr -.size poly1305_emit_neon,.-poly1305_emit_neon +ENDPROC(poly1305_emit_neon) .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-.Lpoly1305_init -#endif -.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro@xxxxxxxxxxx>" -.align 2 -#if __ARM_MAX_ARCH__>=7 -.comm OPENSSL_armcap_P,4,4 #endif diff --git a/lib/zinc/poly1305/poly1305-arm64-cryptogams.S b/lib/zinc/poly1305/poly1305-arm64.S similarity index 89% rename from lib/zinc/poly1305/poly1305-arm64-cryptogams.S rename to lib/zinc/poly1305/poly1305-arm64.S index 0ecb50a83ec0..5f4e7fb0a836 100644 --- a/lib/zinc/poly1305/poly1305-arm64-cryptogams.S +++ b/lib/zinc/poly1305/poly1305-arm64.S @@ -1,21 +1,16 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ /* + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved. * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@xxxxxxxxxxx>. All Rights Reserved. + * + * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS. */ -#include "arm_arch.h" - +#include <linux/linkage.h> .text -// forward "declarations" are required for Apple - -.globl poly1305_blocks -.globl poly1305_emit - -.globl poly1305_init -.type poly1305_init,%function .align 5 -poly1305_init: +ENTRY(poly1305_init_arm) cmp x1,xzr stp xzr,xzr,[x0] // zero hash value stp xzr,xzr,[x0,#16] // [along with is_base2_26] @@ -23,18 +18,10 @@ poly1305_init: csel x0,xzr,x0,eq b.eq .Lno_key -#ifdef __ILP32__ - ldrsw x11,.LOPENSSL_armcap_P -#else - ldr x11,.LOPENSSL_armcap_P -#endif - adr x10,.LOPENSSL_armcap_P - ldp x7,x8,[x1] // load key mov x9,#0xfffffffc0fffffff movk x9,#0x0fff,lsl#48 - ldr w17,[x10,x11] -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x7,x7 // flip bytes rev x8,x8 #endif @@ -43,30 +30,12 @@ poly1305_init: and x8,x8,x9 // &=0ffffffc0ffffffc stp x7,x8,[x0,#32] // save key value - tst w17,#ARMV7_NEON - - adr x12,poly1305_blocks - adr x7,poly1305_blocks_neon - adr x13,poly1305_emit - adr x8,poly1305_emit_neon - - csel x12,x12,x7,eq - csel x13,x13,x8,eq - -#ifdef __ILP32__ - stp w12,w13,[x2] -#else - stp x12,x13,[x2] -#endif - - mov x0,#1 .Lno_key: ret -.size poly1305_init,.-poly1305_init +ENDPROC(poly1305_init_arm) -.type poly1305_blocks,%function .align 5 -poly1305_blocks: +ENTRY(poly1305_blocks_arm) ands x2,x2,#-16 b.eq .Lno_data @@ -80,7 +49,7 @@ poly1305_blocks: .Loop: ldp x10,x11,[x1],#16 // load input sub x2,x2,#16 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x10,x10 rev x11,x11 #endif @@ -126,11 +95,10 @@ poly1305_blocks: .Lno_data: ret -.size poly1305_blocks,.-poly1305_blocks +ENDPROC(poly1305_blocks_arm) -.type poly1305_emit,%function .align 5 -poly1305_emit: +ENTRY(poly1305_emit_arm) ldp x4,x5,[x0] // load hash base 2^64 ldr x6,[x0,#16] ldp x10,x11,[x2] // load nonce @@ -144,23 +112,23 @@ poly1305_emit: csel x4,x4,x12,eq csel x5,x5,x13,eq -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x10,x10,#32 // flip nonce words ror x11,x11,#32 #endif adds x4,x4,x10 // accumulate nonce adc x5,x5,x11 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x4,x4 // flip output bytes rev x5,x5 #endif stp x4,x5,[x1] // write result ret -.size poly1305_emit,.-poly1305_emit -.type poly1305_mult,%function +ENDPROC(poly1305_emit_arm) + .align 5 -poly1305_mult: +__poly1305_mult: mul x12,x4,x7 // h0*r0 umulh x13,x4,x7 @@ -193,11 +161,8 @@ poly1305_mult: adc x6,x6,xzr ret -.size poly1305_mult,.-poly1305_mult -.type poly1305_splat,%function -.align 5 -poly1305_splat: +__poly1305_splat: and x12,x4,#0x03ffffff // base 2^64 -> base 2^26 ubfx x13,x4,#26,#26 extr x14,x5,x4,#52 @@ -220,15 +185,14 @@ poly1305_splat: str w15,[x0,#16*8] // s4 ret -.size poly1305_splat,.-poly1305_splat -.type poly1305_blocks_neon,%function +#ifdef CONFIG_KERNEL_MODE_NEON .align 5 -poly1305_blocks_neon: +ENTRY(poly1305_blocks_neon) ldr x17,[x0,#24] cmp x2,#128 b.hs .Lblocks_neon - cbz x17,poly1305_blocks + cbz x17,poly1305_blocks_arm .Lblocks_neon: stp x29,x30,[sp,#-80]! @@ -268,7 +232,7 @@ poly1305_blocks_neon: adcs x5,x5,xzr adc x6,x6,xzr -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x12,x12 rev x13,x13 #endif @@ -276,7 +240,7 @@ poly1305_blocks_neon: adcs x5,x5,x13 adc x6,x6,x3 - bl poly1305_mult + bl __poly1305_mult ldr x30,[sp,#8] cbz x3,.Lstore_base2_64_neon @@ -314,7 +278,7 @@ poly1305_blocks_neon: ldp x12,x13,[x1],#16 // load input sub x2,x2,#16 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x12,x12 rev x13,x13 #endif @@ -322,7 +286,7 @@ poly1305_blocks_neon: adcs x5,x5,x13 adc x6,x6,x3 - bl poly1305_mult + bl __poly1305_mult .Linit_neon: and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 @@ -349,19 +313,19 @@ poly1305_blocks_neon: mov x5,x8 mov x6,xzr add x0,x0,#48+12 - bl poly1305_splat + bl __poly1305_splat - bl poly1305_mult // r^2 + bl __poly1305_mult // r^2 sub x0,x0,#4 - bl poly1305_splat + bl __poly1305_splat - bl poly1305_mult // r^3 + bl __poly1305_mult // r^3 sub x0,x0,#4 - bl poly1305_splat + bl __poly1305_splat - bl poly1305_mult // r^4 + bl __poly1305_mult // r^4 sub x0,x0,#4 - bl poly1305_splat + bl __poly1305_splat ldr x30,[sp,#8] add x16,x1,#32 @@ -399,7 +363,7 @@ poly1305_blocks_neon: lsl x3,x3,#24 add x15,x0,#48 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 @@ -435,7 +399,7 @@ poly1305_blocks_neon: ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64 ld1 {v8.4s},[x15] -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 @@ -496,7 +460,7 @@ poly1305_blocks_neon: umull v20.2d,v14.2s,v1.s[2] ldp x9,x13,[x16],#48 umull v19.2d,v14.2s,v0.s[2] -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 @@ -561,7 +525,7 @@ poly1305_blocks_neon: umlal v23.2d,v11.2s,v3.s[0] umlal v20.2d,v11.2s,v8.s[0] umlal v21.2d,v11.2s,v0.s[0] -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 @@ -801,13 +765,12 @@ poly1305_blocks_neon: .Lno_data_neon: ldr x29,[sp],#80 ret -.size poly1305_blocks_neon,.-poly1305_blocks_neon +ENDPROC(poly1305_blocks_neon) -.type poly1305_emit_neon,%function .align 5 -poly1305_emit_neon: +ENTRY(poly1305_emit_neon) ldr x17,[x0,#24] - cbz x17,poly1305_emit + cbz x17,poly1305_emit_arm ldp w10,w11,[x0] // load hash value base 2^26 ldp w12,w13,[x0,#8] @@ -840,30 +803,22 @@ poly1305_emit_neon: csel x4,x4,x12,eq csel x5,x5,x13,eq -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x10,x10,#32 // flip nonce words ror x11,x11,#32 #endif adds x4,x4,x10 // accumulate nonce adc x5,x5,x11 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x4,x4 // flip output bytes rev x5,x5 #endif stp x4,x5,[x1] // write result ret -.size poly1305_emit_neon,.-poly1305_emit_neon +ENDPROC(poly1305_emit_neon) .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. #endif -.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 diff --git a/lib/zinc/poly1305/poly1305.c b/lib/zinc/poly1305/poly1305.c index 51af7045cac8..9dc85f62e806 100644 --- a/lib/zinc/poly1305/poly1305.c +++ b/lib/zinc/poly1305/poly1305.c @@ -18,6 +18,8 @@ #if defined(CONFIG_ZINC_ARCH_X86_64) #include "poly1305-x86_64-glue.c" +#elif defined(CONFIG_ZINC_ARCH_ARM) || defined(CONFIG_ZINC_ARCH_ARM64) +#include "poly1305-arm-glue.c" #else static inline bool poly1305_init_arch(void *ctx, const u8 key[POLY1305_KEY_SIZE]) -- 2.19.0