On 25 September 2018 at 16:56, Jason A. Donenfeld <Jason@xxxxxxxxx> wrote:
> These NEON and non-NEON implementations come from Andy Polyakov's
> implementation, and are included here in raw form without modification,
> so that subsequent commits that fix these up for the kernel can see how
> it has changed. This awkward commit splitting has been requested for the
> ARM[64] implementations in particular.
>
> While this is CRYPTOGAMS code, the originating code for this happens to
> be the same as OpenSSL's commit 87cc649f30aaf69b351701875b9dac07c29ce8a2
>
> Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx>
> Based-on-code-from: Andy Polyakov <appro@xxxxxxxxxxx>
> Cc: Samuel Neves <sneves@xxxxxxxxx>
> Cc: Andy Lutomirski <luto@xxxxxxxxxx>
> Cc: Greg KH <gregkh@xxxxxxxxxxxxxxxxxxx>
> Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@xxxxxxxxx>
> Cc: Andy Polyakov <appro@xxxxxxxxxxx>
> Cc: Russell King <linux@xxxxxxxxxxxxxxx>
> Cc: linux-arm-kernel@xxxxxxxxxxxxxxxxxxx

As I mentioned before, I'd prefer this to be based on the original .pl,
but if I am the only one objecting to this, I guess I can live with it.

> ---
> lib/zinc/chacha20/chacha20-arm-cryptogams.S | 1440 ++++++++++++
> lib/zinc/chacha20/chacha20-arm64-cryptogams.S | 1973 +++++++++++++++++
> 2 files changed, 3413 insertions(+)
> create mode 100644 lib/zinc/chacha20/chacha20-arm-cryptogams.S
> create mode 100644 lib/zinc/chacha20/chacha20-arm64-cryptogams.S
>
> diff --git a/lib/zinc/chacha20/chacha20-arm-cryptogams.S b/lib/zinc/chacha20/chacha20-arm-cryptogams.S
> new file mode 100644
> index 000000000000..05a3a9e6e93f
> --- /dev/null
> +++ b/lib/zinc/chacha20/chacha20-arm-cryptogams.S
> @@ -0,0 +1,1440 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
> +/*
> + * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@xxxxxxxxxxx>. All Rights Reserved.
> + */
> +
> +#include "arm_arch.h"
> +
> +.text
> +#if defined(__thumb2__) || defined(__clang__)
> +.syntax unified
> +#endif
> +#if defined(__thumb2__)
> +.thumb
> +#else
> +.code 32
> +#endif
> +
> +#if defined(__thumb2__) || defined(__clang__)
> +#define ldrhsb ldrbhs
> +#endif
> +
> +.align 5
> +.Lsigma:
> +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
> +.Lone:
> +.long 1,0,0,0
> +.Lrot8:
> +.long 0x02010003,0x06050407
> +#if __ARM_MAX_ARCH__>=7
> +.LOPENSSL_armcap:
> +.word OPENSSL_armcap_P-.LChaCha20_ctr32
> +#else
> +.word -1
> +#endif
> +
> +.globl ChaCha20_ctr32
> +.type ChaCha20_ctr32,%function
> +.align 5
> +ChaCha20_ctr32:
> +.LChaCha20_ctr32:
> + ldr r12,[sp,#0] @ pull pointer to counter and nonce
> + stmdb sp!,{r0-r2,r4-r11,lr}
> +#if __ARM_ARCH__<7 && !defined(__thumb2__)
> + sub r14,pc,#16 @ ChaCha20_ctr32
> +#else
> + adr r14,.LChaCha20_ctr32
> +#endif
> + cmp r2,#0 @ len==0?
> +#ifdef __thumb2__ > + itt eq > +#endif > + addeq sp,sp,#4*3 > + beq .Lno_data > +#if __ARM_MAX_ARCH__>=7 > + cmp r2,#192 @ test len > + bls .Lshort > + ldr r4,[r14,#-24] > + ldr r4,[r14,r4] > +# ifdef __APPLE__ > + ldr r4,[r4] > +# endif > + tst r4,#ARMV7_NEON > + bne .LChaCha20_neon > +.Lshort: > +#endif > + ldmia r12,{r4-r7} @ load counter and nonce > + sub sp,sp,#4*(16) @ off-load area > + sub r14,r14,#64 @ .Lsigma > + stmdb sp!,{r4-r7} @ copy counter and nonce > + ldmia r3,{r4-r11} @ load key > + ldmia r14,{r0-r3} @ load sigma > + stmdb sp!,{r4-r11} @ copy key > + stmdb sp!,{r0-r3} @ copy sigma > + str r10,[sp,#4*(16+10)] @ off-load "rx" > + str r11,[sp,#4*(16+11)] @ off-load "rx" > + b .Loop_outer_enter > + > +.align 4 > +.Loop_outer: > + ldmia sp,{r0-r9} @ load key material > + str r11,[sp,#4*(32+2)] @ save len > + str r12, [sp,#4*(32+1)] @ save inp > + str r14, [sp,#4*(32+0)] @ save out > +.Loop_outer_enter: > + ldr r11, [sp,#4*(15)] > + mov r4,r4,ror#19 @ twist b[0..3] > + ldr r12,[sp,#4*(12)] @ modulo-scheduled load > + mov r5,r5,ror#19 > + ldr r10, [sp,#4*(13)] > + mov r6,r6,ror#19 > + ldr r14,[sp,#4*(14)] > + mov r7,r7,ror#19 > + mov r11,r11,ror#8 @ twist d[0..3] > + mov r12,r12,ror#8 > + mov r10,r10,ror#8 > + mov r14,r14,ror#8 > + str r11, [sp,#4*(16+15)] > + mov r11,#10 > + b .Loop > + > +.align 4 > +.Loop: > + subs r11,r11,#1 > + add r0,r0,r4,ror#13 > + add r1,r1,r5,ror#13 > + eor r12,r0,r12,ror#24 > + eor r10,r1,r10,ror#24 > + add r8,r8,r12,ror#16 > + add r9,r9,r10,ror#16 > + eor r4,r8,r4,ror#13 > + eor r5,r9,r5,ror#13 > + add r0,r0,r4,ror#20 > + add r1,r1,r5,ror#20 > + eor r12,r0,r12,ror#16 > + eor r10,r1,r10,ror#16 > + add r8,r8,r12,ror#24 > + str r10,[sp,#4*(16+13)] > + add r9,r9,r10,ror#24 > + ldr r10,[sp,#4*(16+15)] > + str r8,[sp,#4*(16+8)] > + eor r4,r4,r8,ror#12 > + str r9,[sp,#4*(16+9)] > + eor r5,r5,r9,ror#12 > + ldr r8,[sp,#4*(16+10)] > + add r2,r2,r6,ror#13 > + ldr r9,[sp,#4*(16+11)] > + add r3,r3,r7,ror#13 > + eor r14,r2,r14,ror#24 > + eor r10,r3,r10,ror#24 > + add r8,r8,r14,ror#16 > + add r9,r9,r10,ror#16 > + eor r6,r8,r6,ror#13 > + eor r7,r9,r7,ror#13 > + add r2,r2,r6,ror#20 > + add r3,r3,r7,ror#20 > + eor r14,r2,r14,ror#16 > + eor r10,r3,r10,ror#16 > + add r8,r8,r14,ror#24 > + add r9,r9,r10,ror#24 > + eor r6,r6,r8,ror#12 > + eor r7,r7,r9,ror#12 > + add r0,r0,r5,ror#13 > + add r1,r1,r6,ror#13 > + eor r10,r0,r10,ror#24 > + eor r12,r1,r12,ror#24 > + add r8,r8,r10,ror#16 > + add r9,r9,r12,ror#16 > + eor r5,r8,r5,ror#13 > + eor r6,r9,r6,ror#13 > + add r0,r0,r5,ror#20 > + add r1,r1,r6,ror#20 > + eor r10,r0,r10,ror#16 > + eor r12,r1,r12,ror#16 > + str r10,[sp,#4*(16+15)] > + add r8,r8,r10,ror#24 > + ldr r10,[sp,#4*(16+13)] > + add r9,r9,r12,ror#24 > + str r8,[sp,#4*(16+10)] > + eor r5,r5,r8,ror#12 > + str r9,[sp,#4*(16+11)] > + eor r6,r6,r9,ror#12 > + ldr r8,[sp,#4*(16+8)] > + add r2,r2,r7,ror#13 > + ldr r9,[sp,#4*(16+9)] > + add r3,r3,r4,ror#13 > + eor r10,r2,r10,ror#24 > + eor r14,r3,r14,ror#24 > + add r8,r8,r10,ror#16 > + add r9,r9,r14,ror#16 > + eor r7,r8,r7,ror#13 > + eor r4,r9,r4,ror#13 > + add r2,r2,r7,ror#20 > + add r3,r3,r4,ror#20 > + eor r10,r2,r10,ror#16 > + eor r14,r3,r14,ror#16 > + add r8,r8,r10,ror#24 > + add r9,r9,r14,ror#24 > + eor r7,r7,r8,ror#12 > + eor r4,r4,r9,ror#12 > + bne .Loop > + > + ldr r11,[sp,#4*(32+2)] @ load len > + > + str r8, [sp,#4*(16+8)] @ modulo-scheduled store > + str r9, [sp,#4*(16+9)] > + str r12,[sp,#4*(16+12)] > + str r10, [sp,#4*(16+13)] > + str r14,[sp,#4*(16+14)] > + > + @ at this point we have first half of 512-bit 
result in > + @ rx and second half at sp+4*(16+8) > + > + cmp r11,#64 @ done yet? > +#ifdef __thumb2__ > + itete lo > +#endif > + addlo r12,sp,#4*(0) @ shortcut or ... > + ldrhs r12,[sp,#4*(32+1)] @ ... load inp > + addlo r14,sp,#4*(0) @ shortcut or ... > + ldrhs r14,[sp,#4*(32+0)] @ ... load out > + > + ldr r8,[sp,#4*(0)] @ load key material > + ldr r9,[sp,#4*(1)] > + > +#if __ARM_ARCH__>=6 || !defined(__ARMEB__) > +# if __ARM_ARCH__<7 > + orr r10,r12,r14 > + tst r10,#3 @ are input and output aligned? > + ldr r10,[sp,#4*(2)] > + bne .Lunaligned > + cmp r11,#64 @ restore flags > +# else > + ldr r10,[sp,#4*(2)] > +# endif > + ldr r11,[sp,#4*(3)] > + > + add r0,r0,r8 @ accumulate key material > + add r1,r1,r9 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhs r8,[r12],#16 @ load input > + ldrhs r9,[r12,#-12] > + > + add r2,r2,r10 > + add r3,r3,r11 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhs r10,[r12,#-8] > + ldrhs r11,[r12,#-4] > +# if __ARM_ARCH__>=6 && defined(__ARMEB__) > + rev r0,r0 > + rev r1,r1 > + rev r2,r2 > + rev r3,r3 > +# endif > +# ifdef __thumb2__ > + itt hs > +# endif > + eorhs r0,r0,r8 @ xor with input > + eorhs r1,r1,r9 > + add r8,sp,#4*(4) > + str r0,[r14],#16 @ store output > +# ifdef __thumb2__ > + itt hs > +# endif > + eorhs r2,r2,r10 > + eorhs r3,r3,r11 > + ldmia r8,{r8-r11} @ load key material > + str r1,[r14,#-12] > + str r2,[r14,#-8] > + str r3,[r14,#-4] > + > + add r4,r8,r4,ror#13 @ accumulate key material > + add r5,r9,r5,ror#13 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhs r8,[r12],#16 @ load input > + ldrhs r9,[r12,#-12] > + add r6,r10,r6,ror#13 > + add r7,r11,r7,ror#13 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhs r10,[r12,#-8] > + ldrhs r11,[r12,#-4] > +# if __ARM_ARCH__>=6 && defined(__ARMEB__) > + rev r4,r4 > + rev r5,r5 > + rev r6,r6 > + rev r7,r7 > +# endif > +# ifdef __thumb2__ > + itt hs > +# endif > + eorhs r4,r4,r8 > + eorhs r5,r5,r9 > + add r8,sp,#4*(8) > + str r4,[r14],#16 @ store output > +# ifdef __thumb2__ > + itt hs > +# endif > + eorhs r6,r6,r10 > + eorhs r7,r7,r11 > + str r5,[r14,#-12] > + ldmia r8,{r8-r11} @ load key material > + str r6,[r14,#-8] > + add r0,sp,#4*(16+8) > + str r7,[r14,#-4] > + > + ldmia r0,{r0-r7} @ load second half > + > + add r0,r0,r8 @ accumulate key material > + add r1,r1,r9 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhs r8,[r12],#16 @ load input > + ldrhs r9,[r12,#-12] > +# ifdef __thumb2__ > + itt hi > +# endif > + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it > + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it > + add r2,r2,r10 > + add r3,r3,r11 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhs r10,[r12,#-8] > + ldrhs r11,[r12,#-4] > +# if __ARM_ARCH__>=6 && defined(__ARMEB__) > + rev r0,r0 > + rev r1,r1 > + rev r2,r2 > + rev r3,r3 > +# endif > +# ifdef __thumb2__ > + itt hs > +# endif > + eorhs r0,r0,r8 > + eorhs r1,r1,r9 > + add r8,sp,#4*(12) > + str r0,[r14],#16 @ store output > +# ifdef __thumb2__ > + itt hs > +# endif > + eorhs r2,r2,r10 > + eorhs r3,r3,r11 > + str r1,[r14,#-12] > + ldmia r8,{r8-r11} @ load key material > + str r2,[r14,#-8] > + str r3,[r14,#-4] > + > + add r4,r8,r4,ror#24 @ accumulate key material > + add r5,r9,r5,ror#24 > +# ifdef __thumb2__ > + itt hi > +# endif > + addhi r8,r8,#1 @ next counter value > + strhi r8,[sp,#4*(12)] @ save next counter value > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhs r8,[r12],#16 @ load input > + ldrhs r9,[r12,#-12] > + add r6,r10,r6,ror#24 > + add r7,r11,r7,ror#24 > +# ifdef __thumb2__ > + itt hs > +# endif > + 
ldrhs r10,[r12,#-8] > + ldrhs r11,[r12,#-4] > +# if __ARM_ARCH__>=6 && defined(__ARMEB__) > + rev r4,r4 > + rev r5,r5 > + rev r6,r6 > + rev r7,r7 > +# endif > +# ifdef __thumb2__ > + itt hs > +# endif > + eorhs r4,r4,r8 > + eorhs r5,r5,r9 > +# ifdef __thumb2__ > + it ne > +# endif > + ldrne r8,[sp,#4*(32+2)] @ re-load len > +# ifdef __thumb2__ > + itt hs > +# endif > + eorhs r6,r6,r10 > + eorhs r7,r7,r11 > + str r4,[r14],#16 @ store output > + str r5,[r14,#-12] > +# ifdef __thumb2__ > + it hs > +# endif > + subhs r11,r8,#64 @ len-=64 > + str r6,[r14,#-8] > + str r7,[r14,#-4] > + bhi .Loop_outer > + > + beq .Ldone > +# if __ARM_ARCH__<7 > + b .Ltail > + > +.align 4 > +.Lunaligned: @ unaligned endian-neutral path > + cmp r11,#64 @ restore flags > +# endif > +#endif > +#if __ARM_ARCH__<7 > + ldr r11,[sp,#4*(3)] > + add r0,r8,r0 @ accumulate key material > + add r1,r9,r1 > + add r2,r10,r2 > +# ifdef __thumb2__ > + itete lo > +# endif > + eorlo r8,r8,r8 @ zero or ... > + ldrhsb r8,[r12],#16 @ ... load input > + eorlo r9,r9,r9 > + ldrhsb r9,[r12,#-12] > + > + add r3,r11,r3 > +# ifdef __thumb2__ > + itete lo > +# endif > + eorlo r10,r10,r10 > + ldrhsb r10,[r12,#-8] > + eorlo r11,r11,r11 > + ldrhsb r11,[r12,#-4] > + > + eor r0,r8,r0 @ xor with input (or zero) > + eor r1,r9,r1 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r8,[r12,#-15] @ load more input > + ldrhsb r9,[r12,#-11] > + eor r2,r10,r2 > + strb r0,[r14],#16 @ store output > + eor r3,r11,r3 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r10,[r12,#-7] > + ldrhsb r11,[r12,#-3] > + strb r1,[r14,#-12] > + eor r0,r8,r0,lsr#8 > + strb r2,[r14,#-8] > + eor r1,r9,r1,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r8,[r12,#-14] @ load more input > + ldrhsb r9,[r12,#-10] > + strb r3,[r14,#-4] > + eor r2,r10,r2,lsr#8 > + strb r0,[r14,#-15] > + eor r3,r11,r3,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r10,[r12,#-6] > + ldrhsb r11,[r12,#-2] > + strb r1,[r14,#-11] > + eor r0,r8,r0,lsr#8 > + strb r2,[r14,#-7] > + eor r1,r9,r1,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r8,[r12,#-13] @ load more input > + ldrhsb r9,[r12,#-9] > + strb r3,[r14,#-3] > + eor r2,r10,r2,lsr#8 > + strb r0,[r14,#-14] > + eor r3,r11,r3,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r10,[r12,#-5] > + ldrhsb r11,[r12,#-1] > + strb r1,[r14,#-10] > + strb r2,[r14,#-6] > + eor r0,r8,r0,lsr#8 > + strb r3,[r14,#-2] > + eor r1,r9,r1,lsr#8 > + strb r0,[r14,#-13] > + eor r2,r10,r2,lsr#8 > + strb r1,[r14,#-9] > + eor r3,r11,r3,lsr#8 > + strb r2,[r14,#-5] > + strb r3,[r14,#-1] > + add r8,sp,#4*(4+0) > + ldmia r8,{r8-r11} @ load key material > + add r0,sp,#4*(16+8) > + add r4,r8,r4,ror#13 @ accumulate key material > + add r5,r9,r5,ror#13 > + add r6,r10,r6,ror#13 > +# ifdef __thumb2__ > + itete lo > +# endif > + eorlo r8,r8,r8 @ zero or ... > + ldrhsb r8,[r12],#16 @ ... 
load input > + eorlo r9,r9,r9 > + ldrhsb r9,[r12,#-12] > + > + add r7,r11,r7,ror#13 > +# ifdef __thumb2__ > + itete lo > +# endif > + eorlo r10,r10,r10 > + ldrhsb r10,[r12,#-8] > + eorlo r11,r11,r11 > + ldrhsb r11,[r12,#-4] > + > + eor r4,r8,r4 @ xor with input (or zero) > + eor r5,r9,r5 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r8,[r12,#-15] @ load more input > + ldrhsb r9,[r12,#-11] > + eor r6,r10,r6 > + strb r4,[r14],#16 @ store output > + eor r7,r11,r7 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r10,[r12,#-7] > + ldrhsb r11,[r12,#-3] > + strb r5,[r14,#-12] > + eor r4,r8,r4,lsr#8 > + strb r6,[r14,#-8] > + eor r5,r9,r5,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r8,[r12,#-14] @ load more input > + ldrhsb r9,[r12,#-10] > + strb r7,[r14,#-4] > + eor r6,r10,r6,lsr#8 > + strb r4,[r14,#-15] > + eor r7,r11,r7,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r10,[r12,#-6] > + ldrhsb r11,[r12,#-2] > + strb r5,[r14,#-11] > + eor r4,r8,r4,lsr#8 > + strb r6,[r14,#-7] > + eor r5,r9,r5,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r8,[r12,#-13] @ load more input > + ldrhsb r9,[r12,#-9] > + strb r7,[r14,#-3] > + eor r6,r10,r6,lsr#8 > + strb r4,[r14,#-14] > + eor r7,r11,r7,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r10,[r12,#-5] > + ldrhsb r11,[r12,#-1] > + strb r5,[r14,#-10] > + strb r6,[r14,#-6] > + eor r4,r8,r4,lsr#8 > + strb r7,[r14,#-2] > + eor r5,r9,r5,lsr#8 > + strb r4,[r14,#-13] > + eor r6,r10,r6,lsr#8 > + strb r5,[r14,#-9] > + eor r7,r11,r7,lsr#8 > + strb r6,[r14,#-5] > + strb r7,[r14,#-1] > + add r8,sp,#4*(4+4) > + ldmia r8,{r8-r11} @ load key material > + ldmia r0,{r0-r7} @ load second half > +# ifdef __thumb2__ > + itt hi > +# endif > + strhi r10,[sp,#4*(16+10)] @ copy "rx" > + strhi r11,[sp,#4*(16+11)] @ copy "rx" > + add r0,r8,r0 @ accumulate key material > + add r1,r9,r1 > + add r2,r10,r2 > +# ifdef __thumb2__ > + itete lo > +# endif > + eorlo r8,r8,r8 @ zero or ... > + ldrhsb r8,[r12],#16 @ ... 
load input > + eorlo r9,r9,r9 > + ldrhsb r9,[r12,#-12] > + > + add r3,r11,r3 > +# ifdef __thumb2__ > + itete lo > +# endif > + eorlo r10,r10,r10 > + ldrhsb r10,[r12,#-8] > + eorlo r11,r11,r11 > + ldrhsb r11,[r12,#-4] > + > + eor r0,r8,r0 @ xor with input (or zero) > + eor r1,r9,r1 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r8,[r12,#-15] @ load more input > + ldrhsb r9,[r12,#-11] > + eor r2,r10,r2 > + strb r0,[r14],#16 @ store output > + eor r3,r11,r3 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r10,[r12,#-7] > + ldrhsb r11,[r12,#-3] > + strb r1,[r14,#-12] > + eor r0,r8,r0,lsr#8 > + strb r2,[r14,#-8] > + eor r1,r9,r1,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r8,[r12,#-14] @ load more input > + ldrhsb r9,[r12,#-10] > + strb r3,[r14,#-4] > + eor r2,r10,r2,lsr#8 > + strb r0,[r14,#-15] > + eor r3,r11,r3,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r10,[r12,#-6] > + ldrhsb r11,[r12,#-2] > + strb r1,[r14,#-11] > + eor r0,r8,r0,lsr#8 > + strb r2,[r14,#-7] > + eor r1,r9,r1,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r8,[r12,#-13] @ load more input > + ldrhsb r9,[r12,#-9] > + strb r3,[r14,#-3] > + eor r2,r10,r2,lsr#8 > + strb r0,[r14,#-14] > + eor r3,r11,r3,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r10,[r12,#-5] > + ldrhsb r11,[r12,#-1] > + strb r1,[r14,#-10] > + strb r2,[r14,#-6] > + eor r0,r8,r0,lsr#8 > + strb r3,[r14,#-2] > + eor r1,r9,r1,lsr#8 > + strb r0,[r14,#-13] > + eor r2,r10,r2,lsr#8 > + strb r1,[r14,#-9] > + eor r3,r11,r3,lsr#8 > + strb r2,[r14,#-5] > + strb r3,[r14,#-1] > + add r8,sp,#4*(4+8) > + ldmia r8,{r8-r11} @ load key material > + add r4,r8,r4,ror#24 @ accumulate key material > +# ifdef __thumb2__ > + itt hi > +# endif > + addhi r8,r8,#1 @ next counter value > + strhi r8,[sp,#4*(12)] @ save next counter value > + add r5,r9,r5,ror#24 > + add r6,r10,r6,ror#24 > +# ifdef __thumb2__ > + itete lo > +# endif > + eorlo r8,r8,r8 @ zero or ... > + ldrhsb r8,[r12],#16 @ ... 
load input > + eorlo r9,r9,r9 > + ldrhsb r9,[r12,#-12] > + > + add r7,r11,r7,ror#24 > +# ifdef __thumb2__ > + itete lo > +# endif > + eorlo r10,r10,r10 > + ldrhsb r10,[r12,#-8] > + eorlo r11,r11,r11 > + ldrhsb r11,[r12,#-4] > + > + eor r4,r8,r4 @ xor with input (or zero) > + eor r5,r9,r5 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r8,[r12,#-15] @ load more input > + ldrhsb r9,[r12,#-11] > + eor r6,r10,r6 > + strb r4,[r14],#16 @ store output > + eor r7,r11,r7 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r10,[r12,#-7] > + ldrhsb r11,[r12,#-3] > + strb r5,[r14,#-12] > + eor r4,r8,r4,lsr#8 > + strb r6,[r14,#-8] > + eor r5,r9,r5,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r8,[r12,#-14] @ load more input > + ldrhsb r9,[r12,#-10] > + strb r7,[r14,#-4] > + eor r6,r10,r6,lsr#8 > + strb r4,[r14,#-15] > + eor r7,r11,r7,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r10,[r12,#-6] > + ldrhsb r11,[r12,#-2] > + strb r5,[r14,#-11] > + eor r4,r8,r4,lsr#8 > + strb r6,[r14,#-7] > + eor r5,r9,r5,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r8,[r12,#-13] @ load more input > + ldrhsb r9,[r12,#-9] > + strb r7,[r14,#-3] > + eor r6,r10,r6,lsr#8 > + strb r4,[r14,#-14] > + eor r7,r11,r7,lsr#8 > +# ifdef __thumb2__ > + itt hs > +# endif > + ldrhsb r10,[r12,#-5] > + ldrhsb r11,[r12,#-1] > + strb r5,[r14,#-10] > + strb r6,[r14,#-6] > + eor r4,r8,r4,lsr#8 > + strb r7,[r14,#-2] > + eor r5,r9,r5,lsr#8 > + strb r4,[r14,#-13] > + eor r6,r10,r6,lsr#8 > + strb r5,[r14,#-9] > + eor r7,r11,r7,lsr#8 > + strb r6,[r14,#-5] > + strb r7,[r14,#-1] > +# ifdef __thumb2__ > + it ne > +# endif > + ldrne r8,[sp,#4*(32+2)] @ re-load len > +# ifdef __thumb2__ > + it hs > +# endif > + subhs r11,r8,#64 @ len-=64 > + bhi .Loop_outer > + > + beq .Ldone > +#endif > + > +.Ltail: > + ldr r12,[sp,#4*(32+1)] @ load inp > + add r9,sp,#4*(0) > + ldr r14,[sp,#4*(32+0)] @ load out > + > +.Loop_tail: > + ldrb r10,[r9],#1 @ read buffer on stack > + ldrb r11,[r12],#1 @ read input > + subs r8,r8,#1 > + eor r11,r11,r10 > + strb r11,[r14],#1 @ store output > + bne .Loop_tail > + > +.Ldone: > + add sp,sp,#4*(32+3) > +.Lno_data: > + ldmia sp!,{r4-r11,pc} > +.size ChaCha20_ctr32,.-ChaCha20_ctr32 > +#if __ARM_MAX_ARCH__>=7 > +.arch armv7-a > +.fpu neon > + > +.type ChaCha20_neon,%function > +.align 5 > +ChaCha20_neon: > + ldr r12,[sp,#0] @ pull pointer to counter and nonce > + stmdb sp!,{r0-r2,r4-r11,lr} > +.LChaCha20_neon: > + adr r14,.Lsigma > + vstmdb sp!,{d8-d15} @ ABI spec says so > + stmdb sp!,{r0-r3} > + > + vld1.32 {q1-q2},[r3] @ load key > + ldmia r3,{r4-r11} @ load key > + > + sub sp,sp,#4*(16+16) > + vld1.32 {q3},[r12] @ load counter and nonce > + add r12,sp,#4*8 > + ldmia r14,{r0-r3} @ load sigma > + vld1.32 {q0},[r14]! @ load sigma > + vld1.32 {q12},[r14]! 
@ one > + @ vld1.32 {d30},[r14] @ rot8 > + vst1.32 {q2-q3},[r12] @ copy 1/2key|counter|nonce > + vst1.32 {q0-q1},[sp] @ copy sigma|1/2key > + > + str r10,[sp,#4*(16+10)] @ off-load "rx" > + str r11,[sp,#4*(16+11)] @ off-load "rx" > + vshl.i32 d26,d24,#1 @ two > + vstr d24,[sp,#4*(16+0)] > + vshl.i32 d28,d24,#2 @ four > + vstr d26,[sp,#4*(16+2)] > + vmov q4,q0 > + vstr d28,[sp,#4*(16+4)] > + vmov q8,q0 > + @ vstr d30,[sp,#4*(16+6)] > + vmov q5,q1 > + vmov q9,q1 > + b .Loop_neon_enter > + > +.align 4 > +.Loop_neon_outer: > + ldmia sp,{r0-r9} @ load key material > + cmp r11,#64*2 @ if len<=64*2 > + bls .Lbreak_neon @ switch to integer-only > + @ vldr d30,[sp,#4*(16+6)] @ rot8 > + vmov q4,q0 > + str r11,[sp,#4*(32+2)] @ save len > + vmov q8,q0 > + str r12, [sp,#4*(32+1)] @ save inp > + vmov q5,q1 > + str r14, [sp,#4*(32+0)] @ save out > + vmov q9,q1 > +.Loop_neon_enter: > + ldr r11, [sp,#4*(15)] > + mov r4,r4,ror#19 @ twist b[0..3] > + vadd.i32 q7,q3,q12 @ counter+1 > + ldr r12,[sp,#4*(12)] @ modulo-scheduled load > + mov r5,r5,ror#19 > + vmov q6,q2 > + ldr r10, [sp,#4*(13)] > + mov r6,r6,ror#19 > + vmov q10,q2 > + ldr r14,[sp,#4*(14)] > + mov r7,r7,ror#19 > + vadd.i32 q11,q7,q12 @ counter+2 > + add r12,r12,#3 @ counter+3 > + mov r11,r11,ror#8 @ twist d[0..3] > + mov r12,r12,ror#8 > + mov r10,r10,ror#8 > + mov r14,r14,ror#8 > + str r11, [sp,#4*(16+15)] > + mov r11,#10 > + b .Loop_neon > + > +.align 4 > +.Loop_neon: > + subs r11,r11,#1 > + vadd.i32 q0,q0,q1 > + add r0,r0,r4,ror#13 > + vadd.i32 q4,q4,q5 > + add r1,r1,r5,ror#13 > + vadd.i32 q8,q8,q9 > + eor r12,r0,r12,ror#24 > + veor q3,q3,q0 > + eor r10,r1,r10,ror#24 > + veor q7,q7,q4 > + add r8,r8,r12,ror#16 > + veor q11,q11,q8 > + add r9,r9,r10,ror#16 > + vrev32.16 q3,q3 > + eor r4,r8,r4,ror#13 > + vrev32.16 q7,q7 > + eor r5,r9,r5,ror#13 > + vrev32.16 q11,q11 > + add r0,r0,r4,ror#20 > + vadd.i32 q2,q2,q3 > + add r1,r1,r5,ror#20 > + vadd.i32 q6,q6,q7 > + eor r12,r0,r12,ror#16 > + vadd.i32 q10,q10,q11 > + eor r10,r1,r10,ror#16 > + veor q12,q1,q2 > + add r8,r8,r12,ror#24 > + veor q13,q5,q6 > + str r10,[sp,#4*(16+13)] > + veor q14,q9,q10 > + add r9,r9,r10,ror#24 > + vshr.u32 q1,q12,#20 > + ldr r10,[sp,#4*(16+15)] > + vshr.u32 q5,q13,#20 > + str r8,[sp,#4*(16+8)] > + vshr.u32 q9,q14,#20 > + eor r4,r4,r8,ror#12 > + vsli.32 q1,q12,#12 > + str r9,[sp,#4*(16+9)] > + vsli.32 q5,q13,#12 > + eor r5,r5,r9,ror#12 > + vsli.32 q9,q14,#12 > + ldr r8,[sp,#4*(16+10)] > + vadd.i32 q0,q0,q1 > + add r2,r2,r6,ror#13 > + vadd.i32 q4,q4,q5 > + ldr r9,[sp,#4*(16+11)] > + vadd.i32 q8,q8,q9 > + add r3,r3,r7,ror#13 > + veor q12,q3,q0 > + eor r14,r2,r14,ror#24 > + veor q13,q7,q4 > + eor r10,r3,r10,ror#24 > + veor q14,q11,q8 > + add r8,r8,r14,ror#16 > + vshr.u32 q3,q12,#24 > + add r9,r9,r10,ror#16 > + vshr.u32 q7,q13,#24 > + eor r6,r8,r6,ror#13 > + vshr.u32 q11,q14,#24 > + eor r7,r9,r7,ror#13 > + vsli.32 q3,q12,#8 > + add r2,r2,r6,ror#20 > + vsli.32 q7,q13,#8 > + add r3,r3,r7,ror#20 > + vsli.32 q11,q14,#8 > + eor r14,r2,r14,ror#16 > + vadd.i32 q2,q2,q3 > + eor r10,r3,r10,ror#16 > + vadd.i32 q6,q6,q7 > + add r8,r8,r14,ror#24 > + vadd.i32 q10,q10,q11 > + add r9,r9,r10,ror#24 > + veor q12,q1,q2 > + eor r6,r6,r8,ror#12 > + veor q13,q5,q6 > + eor r7,r7,r9,ror#12 > + veor q14,q9,q10 > + vshr.u32 q1,q12,#25 > + vshr.u32 q5,q13,#25 > + vshr.u32 q9,q14,#25 > + vsli.32 q1,q12,#7 > + vsli.32 q5,q13,#7 > + vsli.32 q9,q14,#7 > + vext.8 q2,q2,q2,#8 > + vext.8 q6,q6,q6,#8 > + vext.8 q10,q10,q10,#8 > + vext.8 q1,q1,q1,#4 > + vext.8 q5,q5,q5,#4 > + vext.8 q9,q9,q9,#4 > + vext.8 q3,q3,q3,#12 
> + vext.8 q7,q7,q7,#12 > + vext.8 q11,q11,q11,#12 > + vadd.i32 q0,q0,q1 > + add r0,r0,r5,ror#13 > + vadd.i32 q4,q4,q5 > + add r1,r1,r6,ror#13 > + vadd.i32 q8,q8,q9 > + eor r10,r0,r10,ror#24 > + veor q3,q3,q0 > + eor r12,r1,r12,ror#24 > + veor q7,q7,q4 > + add r8,r8,r10,ror#16 > + veor q11,q11,q8 > + add r9,r9,r12,ror#16 > + vrev32.16 q3,q3 > + eor r5,r8,r5,ror#13 > + vrev32.16 q7,q7 > + eor r6,r9,r6,ror#13 > + vrev32.16 q11,q11 > + add r0,r0,r5,ror#20 > + vadd.i32 q2,q2,q3 > + add r1,r1,r6,ror#20 > + vadd.i32 q6,q6,q7 > + eor r10,r0,r10,ror#16 > + vadd.i32 q10,q10,q11 > + eor r12,r1,r12,ror#16 > + veor q12,q1,q2 > + str r10,[sp,#4*(16+15)] > + veor q13,q5,q6 > + add r8,r8,r10,ror#24 > + veor q14,q9,q10 > + ldr r10,[sp,#4*(16+13)] > + vshr.u32 q1,q12,#20 > + add r9,r9,r12,ror#24 > + vshr.u32 q5,q13,#20 > + str r8,[sp,#4*(16+10)] > + vshr.u32 q9,q14,#20 > + eor r5,r5,r8,ror#12 > + vsli.32 q1,q12,#12 > + str r9,[sp,#4*(16+11)] > + vsli.32 q5,q13,#12 > + eor r6,r6,r9,ror#12 > + vsli.32 q9,q14,#12 > + ldr r8,[sp,#4*(16+8)] > + vadd.i32 q0,q0,q1 > + add r2,r2,r7,ror#13 > + vadd.i32 q4,q4,q5 > + ldr r9,[sp,#4*(16+9)] > + vadd.i32 q8,q8,q9 > + add r3,r3,r4,ror#13 > + veor q12,q3,q0 > + eor r10,r2,r10,ror#24 > + veor q13,q7,q4 > + eor r14,r3,r14,ror#24 > + veor q14,q11,q8 > + add r8,r8,r10,ror#16 > + vshr.u32 q3,q12,#24 > + add r9,r9,r14,ror#16 > + vshr.u32 q7,q13,#24 > + eor r7,r8,r7,ror#13 > + vshr.u32 q11,q14,#24 > + eor r4,r9,r4,ror#13 > + vsli.32 q3,q12,#8 > + add r2,r2,r7,ror#20 > + vsli.32 q7,q13,#8 > + add r3,r3,r4,ror#20 > + vsli.32 q11,q14,#8 > + eor r10,r2,r10,ror#16 > + vadd.i32 q2,q2,q3 > + eor r14,r3,r14,ror#16 > + vadd.i32 q6,q6,q7 > + add r8,r8,r10,ror#24 > + vadd.i32 q10,q10,q11 > + add r9,r9,r14,ror#24 > + veor q12,q1,q2 > + eor r7,r7,r8,ror#12 > + veor q13,q5,q6 > + eor r4,r4,r9,ror#12 > + veor q14,q9,q10 > + vshr.u32 q1,q12,#25 > + vshr.u32 q5,q13,#25 > + vshr.u32 q9,q14,#25 > + vsli.32 q1,q12,#7 > + vsli.32 q5,q13,#7 > + vsli.32 q9,q14,#7 > + vext.8 q2,q2,q2,#8 > + vext.8 q6,q6,q6,#8 > + vext.8 q10,q10,q10,#8 > + vext.8 q1,q1,q1,#12 > + vext.8 q5,q5,q5,#12 > + vext.8 q9,q9,q9,#12 > + vext.8 q3,q3,q3,#4 > + vext.8 q7,q7,q7,#4 > + vext.8 q11,q11,q11,#4 > + bne .Loop_neon > + > + add r11,sp,#32 > + vld1.32 {q12-q13},[sp] @ load key material > + vld1.32 {q14-q15},[r11] > + > + ldr r11,[sp,#4*(32+2)] @ load len > + > + str r8, [sp,#4*(16+8)] @ modulo-scheduled store > + str r9, [sp,#4*(16+9)] > + str r12,[sp,#4*(16+12)] > + str r10, [sp,#4*(16+13)] > + str r14,[sp,#4*(16+14)] > + > + @ at this point we have first half of 512-bit result in > + @ rx and second half at sp+4*(16+8) > + > + ldr r12,[sp,#4*(32+1)] @ load inp > + ldr r14,[sp,#4*(32+0)] @ load out > + > + vadd.i32 q0,q0,q12 @ accumulate key material > + vadd.i32 q4,q4,q12 > + vadd.i32 q8,q8,q12 > + vldr d24,[sp,#4*(16+0)] @ one > + > + vadd.i32 q1,q1,q13 > + vadd.i32 q5,q5,q13 > + vadd.i32 q9,q9,q13 > + vldr d26,[sp,#4*(16+2)] @ two > + > + vadd.i32 q2,q2,q14 > + vadd.i32 q6,q6,q14 > + vadd.i32 q10,q10,q14 > + vadd.i32 d14,d14,d24 @ counter+1 > + vadd.i32 d22,d22,d26 @ counter+2 > + > + vadd.i32 q3,q3,q15 > + vadd.i32 q7,q7,q15 > + vadd.i32 q11,q11,q15 > + > + cmp r11,#64*4 > + blo .Ltail_neon > + > + vld1.8 {q12-q13},[r12]! @ load input > + mov r11,sp > + vld1.8 {q14-q15},[r12]! > + veor q0,q0,q12 @ xor with input > + veor q1,q1,q13 > + vld1.8 {q12-q13},[r12]! > + veor q2,q2,q14 > + veor q3,q3,q15 > + vld1.8 {q14-q15},[r12]! > + > + veor q4,q4,q12 > + vst1.8 {q0-q1},[r14]! 
@ store output > + veor q5,q5,q13 > + vld1.8 {q12-q13},[r12]! > + veor q6,q6,q14 > + vst1.8 {q2-q3},[r14]! > + veor q7,q7,q15 > + vld1.8 {q14-q15},[r12]! > + > + veor q8,q8,q12 > + vld1.32 {q0-q1},[r11]! @ load for next iteration > + veor d25,d25,d25 > + vldr d24,[sp,#4*(16+4)] @ four > + veor q9,q9,q13 > + vld1.32 {q2-q3},[r11] > + veor q10,q10,q14 > + vst1.8 {q4-q5},[r14]! > + veor q11,q11,q15 > + vst1.8 {q6-q7},[r14]! > + > + vadd.i32 d6,d6,d24 @ next counter value > + vldr d24,[sp,#4*(16+0)] @ one > + > + ldmia sp,{r8-r11} @ load key material > + add r0,r0,r8 @ accumulate key material > + ldr r8,[r12],#16 @ load input > + vst1.8 {q8-q9},[r14]! > + add r1,r1,r9 > + ldr r9,[r12,#-12] > + vst1.8 {q10-q11},[r14]! > + add r2,r2,r10 > + ldr r10,[r12,#-8] > + add r3,r3,r11 > + ldr r11,[r12,#-4] > +# ifdef __ARMEB__ > + rev r0,r0 > + rev r1,r1 > + rev r2,r2 > + rev r3,r3 > +# endif > + eor r0,r0,r8 @ xor with input > + add r8,sp,#4*(4) > + eor r1,r1,r9 > + str r0,[r14],#16 @ store output > + eor r2,r2,r10 > + str r1,[r14,#-12] > + eor r3,r3,r11 > + ldmia r8,{r8-r11} @ load key material > + str r2,[r14,#-8] > + str r3,[r14,#-4] > + > + add r4,r8,r4,ror#13 @ accumulate key material > + ldr r8,[r12],#16 @ load input > + add r5,r9,r5,ror#13 > + ldr r9,[r12,#-12] > + add r6,r10,r6,ror#13 > + ldr r10,[r12,#-8] > + add r7,r11,r7,ror#13 > + ldr r11,[r12,#-4] > +# ifdef __ARMEB__ > + rev r4,r4 > + rev r5,r5 > + rev r6,r6 > + rev r7,r7 > +# endif > + eor r4,r4,r8 > + add r8,sp,#4*(8) > + eor r5,r5,r9 > + str r4,[r14],#16 @ store output > + eor r6,r6,r10 > + str r5,[r14,#-12] > + eor r7,r7,r11 > + ldmia r8,{r8-r11} @ load key material > + str r6,[r14,#-8] > + add r0,sp,#4*(16+8) > + str r7,[r14,#-4] > + > + ldmia r0,{r0-r7} @ load second half > + > + add r0,r0,r8 @ accumulate key material > + ldr r8,[r12],#16 @ load input > + add r1,r1,r9 > + ldr r9,[r12,#-12] > +# ifdef __thumb2__ > + it hi > +# endif > + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it > + add r2,r2,r10 > + ldr r10,[r12,#-8] > +# ifdef __thumb2__ > + it hi > +# endif > + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it > + add r3,r3,r11 > + ldr r11,[r12,#-4] > +# ifdef __ARMEB__ > + rev r0,r0 > + rev r1,r1 > + rev r2,r2 > + rev r3,r3 > +# endif > + eor r0,r0,r8 > + add r8,sp,#4*(12) > + eor r1,r1,r9 > + str r0,[r14],#16 @ store output > + eor r2,r2,r10 > + str r1,[r14,#-12] > + eor r3,r3,r11 > + ldmia r8,{r8-r11} @ load key material > + str r2,[r14,#-8] > + str r3,[r14,#-4] > + > + add r4,r8,r4,ror#24 @ accumulate key material > + add r8,r8,#4 @ next counter value > + add r5,r9,r5,ror#24 > + str r8,[sp,#4*(12)] @ save next counter value > + ldr r8,[r12],#16 @ load input > + add r6,r10,r6,ror#24 > + add r4,r4,#3 @ counter+3 > + ldr r9,[r12,#-12] > + add r7,r11,r7,ror#24 > + ldr r10,[r12,#-8] > + ldr r11,[r12,#-4] > +# ifdef __ARMEB__ > + rev r4,r4 > + rev r5,r5 > + rev r6,r6 > + rev r7,r7 > +# endif > + eor r4,r4,r8 > +# ifdef __thumb2__ > + it hi > +# endif > + ldrhi r8,[sp,#4*(32+2)] @ re-load len > + eor r5,r5,r9 > + eor r6,r6,r10 > + str r4,[r14],#16 @ store output > + eor r7,r7,r11 > + str r5,[r14,#-12] > + sub r11,r8,#64*4 @ len-=64*4 > + str r6,[r14,#-8] > + str r7,[r14,#-4] > + bhi .Loop_neon_outer > + > + b .Ldone_neon > + > +.align 4 > +.Lbreak_neon: > + @ harmonize NEON and integer-only stack frames: load data > + @ from NEON frame, but save to integer-only one; distance > + @ between the two is 4*(32+4+16-32)=4*(20). 
> + > + str r11, [sp,#4*(20+32+2)] @ save len > + add r11,sp,#4*(32+4) > + str r12, [sp,#4*(20+32+1)] @ save inp > + str r14, [sp,#4*(20+32+0)] @ save out > + > + ldr r12,[sp,#4*(16+10)] > + ldr r14,[sp,#4*(16+11)] > + vldmia r11,{d8-d15} @ fulfill ABI requirement > + str r12,[sp,#4*(20+16+10)] @ copy "rx" > + str r14,[sp,#4*(20+16+11)] @ copy "rx" > + > + ldr r11, [sp,#4*(15)] > + mov r4,r4,ror#19 @ twist b[0..3] > + ldr r12,[sp,#4*(12)] @ modulo-scheduled load > + mov r5,r5,ror#19 > + ldr r10, [sp,#4*(13)] > + mov r6,r6,ror#19 > + ldr r14,[sp,#4*(14)] > + mov r7,r7,ror#19 > + mov r11,r11,ror#8 @ twist d[0..3] > + mov r12,r12,ror#8 > + mov r10,r10,ror#8 > + mov r14,r14,ror#8 > + str r11, [sp,#4*(20+16+15)] > + add r11,sp,#4*(20) > + vst1.32 {q0-q1},[r11]! @ copy key > + add sp,sp,#4*(20) @ switch frame > + vst1.32 {q2-q3},[r11] > + mov r11,#10 > + b .Loop @ go integer-only > + > +.align 4 > +.Ltail_neon: > + cmp r11,#64*3 > + bhs .L192_or_more_neon > + cmp r11,#64*2 > + bhs .L128_or_more_neon > + cmp r11,#64*1 > + bhs .L64_or_more_neon > + > + add r8,sp,#4*(8) > + vst1.8 {q0-q1},[sp] > + add r10,sp,#4*(0) > + vst1.8 {q2-q3},[r8] > + b .Loop_tail_neon > + > +.align 4 > +.L64_or_more_neon: > + vld1.8 {q12-q13},[r12]! > + vld1.8 {q14-q15},[r12]! > + veor q0,q0,q12 > + veor q1,q1,q13 > + veor q2,q2,q14 > + veor q3,q3,q15 > + vst1.8 {q0-q1},[r14]! > + vst1.8 {q2-q3},[r14]! > + > + beq .Ldone_neon > + > + add r8,sp,#4*(8) > + vst1.8 {q4-q5},[sp] > + add r10,sp,#4*(0) > + vst1.8 {q6-q7},[r8] > + sub r11,r11,#64*1 @ len-=64*1 > + b .Loop_tail_neon > + > +.align 4 > +.L128_or_more_neon: > + vld1.8 {q12-q13},[r12]! > + vld1.8 {q14-q15},[r12]! > + veor q0,q0,q12 > + veor q1,q1,q13 > + vld1.8 {q12-q13},[r12]! > + veor q2,q2,q14 > + veor q3,q3,q15 > + vld1.8 {q14-q15},[r12]! > + > + veor q4,q4,q12 > + veor q5,q5,q13 > + vst1.8 {q0-q1},[r14]! > + veor q6,q6,q14 > + vst1.8 {q2-q3},[r14]! > + veor q7,q7,q15 > + vst1.8 {q4-q5},[r14]! > + vst1.8 {q6-q7},[r14]! > + > + beq .Ldone_neon > + > + add r8,sp,#4*(8) > + vst1.8 {q8-q9},[sp] > + add r10,sp,#4*(0) > + vst1.8 {q10-q11},[r8] > + sub r11,r11,#64*2 @ len-=64*2 > + b .Loop_tail_neon > + > +.align 4 > +.L192_or_more_neon: > + vld1.8 {q12-q13},[r12]! > + vld1.8 {q14-q15},[r12]! > + veor q0,q0,q12 > + veor q1,q1,q13 > + vld1.8 {q12-q13},[r12]! > + veor q2,q2,q14 > + veor q3,q3,q15 > + vld1.8 {q14-q15},[r12]! > + > + veor q4,q4,q12 > + veor q5,q5,q13 > + vld1.8 {q12-q13},[r12]! > + veor q6,q6,q14 > + vst1.8 {q0-q1},[r14]! > + veor q7,q7,q15 > + vld1.8 {q14-q15},[r12]! > + > + veor q8,q8,q12 > + vst1.8 {q2-q3},[r14]! > + veor q9,q9,q13 > + vst1.8 {q4-q5},[r14]! > + veor q10,q10,q14 > + vst1.8 {q6-q7},[r14]! > + veor q11,q11,q15 > + vst1.8 {q8-q9},[r14]! > + vst1.8 {q10-q11},[r14]! 
> + > + beq .Ldone_neon > + > + ldmia sp,{r8-r11} @ load key material > + add r0,r0,r8 @ accumulate key material > + add r8,sp,#4*(4) > + add r1,r1,r9 > + add r2,r2,r10 > + add r3,r3,r11 > + ldmia r8,{r8-r11} @ load key material > + > + add r4,r8,r4,ror#13 @ accumulate key material > + add r8,sp,#4*(8) > + add r5,r9,r5,ror#13 > + add r6,r10,r6,ror#13 > + add r7,r11,r7,ror#13 > + ldmia r8,{r8-r11} @ load key material > +# ifdef __ARMEB__ > + rev r0,r0 > + rev r1,r1 > + rev r2,r2 > + rev r3,r3 > + rev r4,r4 > + rev r5,r5 > + rev r6,r6 > + rev r7,r7 > +# endif > + stmia sp,{r0-r7} > + add r0,sp,#4*(16+8) > + > + ldmia r0,{r0-r7} @ load second half > + > + add r0,r0,r8 @ accumulate key material > + add r8,sp,#4*(12) > + add r1,r1,r9 > + add r2,r2,r10 > + add r3,r3,r11 > + ldmia r8,{r8-r11} @ load key material > + > + add r4,r8,r4,ror#24 @ accumulate key material > + add r8,sp,#4*(8) > + add r5,r9,r5,ror#24 > + add r4,r4,#3 @ counter+3 > + add r6,r10,r6,ror#24 > + add r7,r11,r7,ror#24 > + ldr r11,[sp,#4*(32+2)] @ re-load len > +# ifdef __ARMEB__ > + rev r0,r0 > + rev r1,r1 > + rev r2,r2 > + rev r3,r3 > + rev r4,r4 > + rev r5,r5 > + rev r6,r6 > + rev r7,r7 > +# endif > + stmia r8,{r0-r7} > + add r10,sp,#4*(0) > + sub r11,r11,#64*3 @ len-=64*3 > + > +.Loop_tail_neon: > + ldrb r8,[r10],#1 @ read buffer on stack > + ldrb r9,[r12],#1 @ read input > + subs r11,r11,#1 > + eor r8,r8,r9 > + strb r8,[r14],#1 @ store output > + bne .Loop_tail_neon > + > +.Ldone_neon: > + add sp,sp,#4*(32+4) > + vldmia sp,{d8-d15} > + add sp,sp,#4*(16+3) > + ldmia sp!,{r4-r11,pc} > +.size ChaCha20_neon,.-ChaCha20_neon > +.comm OPENSSL_armcap_P,4,4 > +#endif > diff --git a/lib/zinc/chacha20/chacha20-arm64-cryptogams.S b/lib/zinc/chacha20/chacha20-arm64-cryptogams.S > new file mode 100644 > index 000000000000..4d029bfdad3a > --- /dev/null > +++ b/lib/zinc/chacha20/chacha20-arm64-cryptogams.S > @@ -0,0 +1,1973 @@ > +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ > +/* > + * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@xxxxxxxxxxx>. All Rights Reserved. > + */ > + > +#include "arm_arch.h" > + > +.text > + > + > + > +.align 5 > +.Lsigma: > +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral > +.Lone: > +.long 1,0,0,0 > +.LOPENSSL_armcap_P: > +#ifdef __ILP32__ > +.long OPENSSL_armcap_P-. > +#else > +.quad OPENSSL_armcap_P-. > +#endif > +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 > +.align 2 > + > +.globl ChaCha20_ctr32 > +.type ChaCha20_ctr32,%function > +.align 5 > +ChaCha20_ctr32: > + cbz x2,.Labort > + adr x5,.LOPENSSL_armcap_P > + cmp x2,#192 > + b.lo .Lshort > +#ifdef __ILP32__ > + ldrsw x6,[x5] > +#else > + ldr x6,[x5] > +#endif > + ldr w17,[x6,x5] > + tst w17,#ARMV7_NEON > + b.ne ChaCha20_neon > + > +.Lshort: > + stp x29,x30,[sp,#-96]! 
> + add x29,sp,#0 > + > + adr x5,.Lsigma > + stp x19,x20,[sp,#16] > + stp x21,x22,[sp,#32] > + stp x23,x24,[sp,#48] > + stp x25,x26,[sp,#64] > + stp x27,x28,[sp,#80] > + sub sp,sp,#64 > + > + ldp x22,x23,[x5] // load sigma > + ldp x24,x25,[x3] // load key > + ldp x26,x27,[x3,#16] > + ldp x28,x30,[x4] // load counter > +#ifdef __ARMEB__ > + ror x24,x24,#32 > + ror x25,x25,#32 > + ror x26,x26,#32 > + ror x27,x27,#32 > + ror x28,x28,#32 > + ror x30,x30,#32 > +#endif > + > +.Loop_outer: > + mov w5,w22 // unpack key block > + lsr x6,x22,#32 > + mov w7,w23 > + lsr x8,x23,#32 > + mov w9,w24 > + lsr x10,x24,#32 > + mov w11,w25 > + lsr x12,x25,#32 > + mov w13,w26 > + lsr x14,x26,#32 > + mov w15,w27 > + lsr x16,x27,#32 > + mov w17,w28 > + lsr x19,x28,#32 > + mov w20,w30 > + lsr x21,x30,#32 > + > + mov x4,#10 > + subs x2,x2,#64 > +.Loop: > + sub x4,x4,#1 > + add w5,w5,w9 > + add w6,w6,w10 > + add w7,w7,w11 > + add w8,w8,w12 > + eor w17,w17,w5 > + eor w19,w19,w6 > + eor w20,w20,w7 > + eor w21,w21,w8 > + ror w17,w17,#16 > + ror w19,w19,#16 > + ror w20,w20,#16 > + ror w21,w21,#16 > + add w13,w13,w17 > + add w14,w14,w19 > + add w15,w15,w20 > + add w16,w16,w21 > + eor w9,w9,w13 > + eor w10,w10,w14 > + eor w11,w11,w15 > + eor w12,w12,w16 > + ror w9,w9,#20 > + ror w10,w10,#20 > + ror w11,w11,#20 > + ror w12,w12,#20 > + add w5,w5,w9 > + add w6,w6,w10 > + add w7,w7,w11 > + add w8,w8,w12 > + eor w17,w17,w5 > + eor w19,w19,w6 > + eor w20,w20,w7 > + eor w21,w21,w8 > + ror w17,w17,#24 > + ror w19,w19,#24 > + ror w20,w20,#24 > + ror w21,w21,#24 > + add w13,w13,w17 > + add w14,w14,w19 > + add w15,w15,w20 > + add w16,w16,w21 > + eor w9,w9,w13 > + eor w10,w10,w14 > + eor w11,w11,w15 > + eor w12,w12,w16 > + ror w9,w9,#25 > + ror w10,w10,#25 > + ror w11,w11,#25 > + ror w12,w12,#25 > + add w5,w5,w10 > + add w6,w6,w11 > + add w7,w7,w12 > + add w8,w8,w9 > + eor w21,w21,w5 > + eor w17,w17,w6 > + eor w19,w19,w7 > + eor w20,w20,w8 > + ror w21,w21,#16 > + ror w17,w17,#16 > + ror w19,w19,#16 > + ror w20,w20,#16 > + add w15,w15,w21 > + add w16,w16,w17 > + add w13,w13,w19 > + add w14,w14,w20 > + eor w10,w10,w15 > + eor w11,w11,w16 > + eor w12,w12,w13 > + eor w9,w9,w14 > + ror w10,w10,#20 > + ror w11,w11,#20 > + ror w12,w12,#20 > + ror w9,w9,#20 > + add w5,w5,w10 > + add w6,w6,w11 > + add w7,w7,w12 > + add w8,w8,w9 > + eor w21,w21,w5 > + eor w17,w17,w6 > + eor w19,w19,w7 > + eor w20,w20,w8 > + ror w21,w21,#24 > + ror w17,w17,#24 > + ror w19,w19,#24 > + ror w20,w20,#24 > + add w15,w15,w21 > + add w16,w16,w17 > + add w13,w13,w19 > + add w14,w14,w20 > + eor w10,w10,w15 > + eor w11,w11,w16 > + eor w12,w12,w13 > + eor w9,w9,w14 > + ror w10,w10,#25 > + ror w11,w11,#25 > + ror w12,w12,#25 > + ror w9,w9,#25 > + cbnz x4,.Loop > + > + add w5,w5,w22 // accumulate key block > + add x6,x6,x22,lsr#32 > + add w7,w7,w23 > + add x8,x8,x23,lsr#32 > + add w9,w9,w24 > + add x10,x10,x24,lsr#32 > + add w11,w11,w25 > + add x12,x12,x25,lsr#32 > + add w13,w13,w26 > + add x14,x14,x26,lsr#32 > + add w15,w15,w27 > + add x16,x16,x27,lsr#32 > + add w17,w17,w28 > + add x19,x19,x28,lsr#32 > + add w20,w20,w30 > + add x21,x21,x30,lsr#32 > + > + b.lo .Ltail > + > + add x5,x5,x6,lsl#32 // pack > + add x7,x7,x8,lsl#32 > + ldp x6,x8,[x1,#0] // load input > + add x9,x9,x10,lsl#32 > + add x11,x11,x12,lsl#32 > + ldp x10,x12,[x1,#16] > + add x13,x13,x14,lsl#32 > + add x15,x15,x16,lsl#32 > + ldp x14,x16,[x1,#32] > + add x17,x17,x19,lsl#32 > + add x20,x20,x21,lsl#32 > + ldp x19,x21,[x1,#48] > + add x1,x1,#64 > +#ifdef __ARMEB__ > + rev x5,x5 > + rev x7,x7 > + rev x9,x9 > + 
rev x11,x11 > + rev x13,x13 > + rev x15,x15 > + rev x17,x17 > + rev x20,x20 > +#endif > + eor x5,x5,x6 > + eor x7,x7,x8 > + eor x9,x9,x10 > + eor x11,x11,x12 > + eor x13,x13,x14 > + eor x15,x15,x16 > + eor x17,x17,x19 > + eor x20,x20,x21 > + > + stp x5,x7,[x0,#0] // store output > + add x28,x28,#1 // increment counter > + stp x9,x11,[x0,#16] > + stp x13,x15,[x0,#32] > + stp x17,x20,[x0,#48] > + add x0,x0,#64 > + > + b.hi .Loop_outer > + > + ldp x19,x20,[x29,#16] > + add sp,sp,#64 > + ldp x21,x22,[x29,#32] > + ldp x23,x24,[x29,#48] > + ldp x25,x26,[x29,#64] > + ldp x27,x28,[x29,#80] > + ldp x29,x30,[sp],#96 > +.Labort: > + ret > + > +.align 4 > +.Ltail: > + add x2,x2,#64 > +.Less_than_64: > + sub x0,x0,#1 > + add x1,x1,x2 > + add x0,x0,x2 > + add x4,sp,x2 > + neg x2,x2 > + > + add x5,x5,x6,lsl#32 // pack > + add x7,x7,x8,lsl#32 > + add x9,x9,x10,lsl#32 > + add x11,x11,x12,lsl#32 > + add x13,x13,x14,lsl#32 > + add x15,x15,x16,lsl#32 > + add x17,x17,x19,lsl#32 > + add x20,x20,x21,lsl#32 > +#ifdef __ARMEB__ > + rev x5,x5 > + rev x7,x7 > + rev x9,x9 > + rev x11,x11 > + rev x13,x13 > + rev x15,x15 > + rev x17,x17 > + rev x20,x20 > +#endif > + stp x5,x7,[sp,#0] > + stp x9,x11,[sp,#16] > + stp x13,x15,[sp,#32] > + stp x17,x20,[sp,#48] > + > +.Loop_tail: > + ldrb w10,[x1,x2] > + ldrb w11,[x4,x2] > + add x2,x2,#1 > + eor w10,w10,w11 > + strb w10,[x0,x2] > + cbnz x2,.Loop_tail > + > + stp xzr,xzr,[sp,#0] > + stp xzr,xzr,[sp,#16] > + stp xzr,xzr,[sp,#32] > + stp xzr,xzr,[sp,#48] > + > + ldp x19,x20,[x29,#16] > + add sp,sp,#64 > + ldp x21,x22,[x29,#32] > + ldp x23,x24,[x29,#48] > + ldp x25,x26,[x29,#64] > + ldp x27,x28,[x29,#80] > + ldp x29,x30,[sp],#96 > + ret > +.size ChaCha20_ctr32,.-ChaCha20_ctr32 > + > +.type ChaCha20_neon,%function > +.align 5 > +ChaCha20_neon: > + stp x29,x30,[sp,#-96]! 
> + add x29,sp,#0 > + > + adr x5,.Lsigma > + stp x19,x20,[sp,#16] > + stp x21,x22,[sp,#32] > + stp x23,x24,[sp,#48] > + stp x25,x26,[sp,#64] > + stp x27,x28,[sp,#80] > + cmp x2,#512 > + b.hs .L512_or_more_neon > + > + sub sp,sp,#64 > + > + ldp x22,x23,[x5] // load sigma > + ld1 {v24.4s},[x5],#16 > + ldp x24,x25,[x3] // load key > + ldp x26,x27,[x3,#16] > + ld1 {v25.4s,v26.4s},[x3] > + ldp x28,x30,[x4] // load counter > + ld1 {v27.4s},[x4] > + ld1 {v31.4s},[x5] > +#ifdef __ARMEB__ > + rev64 v24.4s,v24.4s > + ror x24,x24,#32 > + ror x25,x25,#32 > + ror x26,x26,#32 > + ror x27,x27,#32 > + ror x28,x28,#32 > + ror x30,x30,#32 > +#endif > + add v27.4s,v27.4s,v31.4s // += 1 > + add v28.4s,v27.4s,v31.4s > + add v29.4s,v28.4s,v31.4s > + shl v31.4s,v31.4s,#2 // 1 -> 4 > + > +.Loop_outer_neon: > + mov w5,w22 // unpack key block > + lsr x6,x22,#32 > + mov v0.16b,v24.16b > + mov w7,w23 > + lsr x8,x23,#32 > + mov v4.16b,v24.16b > + mov w9,w24 > + lsr x10,x24,#32 > + mov v16.16b,v24.16b > + mov w11,w25 > + mov v1.16b,v25.16b > + lsr x12,x25,#32 > + mov v5.16b,v25.16b > + mov w13,w26 > + mov v17.16b,v25.16b > + lsr x14,x26,#32 > + mov v3.16b,v27.16b > + mov w15,w27 > + mov v7.16b,v28.16b > + lsr x16,x27,#32 > + mov v19.16b,v29.16b > + mov w17,w28 > + mov v2.16b,v26.16b > + lsr x19,x28,#32 > + mov v6.16b,v26.16b > + mov w20,w30 > + mov v18.16b,v26.16b > + lsr x21,x30,#32 > + > + mov x4,#10 > + subs x2,x2,#256 > +.Loop_neon: > + sub x4,x4,#1 > + add v0.4s,v0.4s,v1.4s > + add w5,w5,w9 > + add v4.4s,v4.4s,v5.4s > + add w6,w6,w10 > + add v16.4s,v16.4s,v17.4s > + add w7,w7,w11 > + eor v3.16b,v3.16b,v0.16b > + add w8,w8,w12 > + eor v7.16b,v7.16b,v4.16b > + eor w17,w17,w5 > + eor v19.16b,v19.16b,v16.16b > + eor w19,w19,w6 > + rev32 v3.8h,v3.8h > + eor w20,w20,w7 > + rev32 v7.8h,v7.8h > + eor w21,w21,w8 > + rev32 v19.8h,v19.8h > + ror w17,w17,#16 > + add v2.4s,v2.4s,v3.4s > + ror w19,w19,#16 > + add v6.4s,v6.4s,v7.4s > + ror w20,w20,#16 > + add v18.4s,v18.4s,v19.4s > + ror w21,w21,#16 > + eor v20.16b,v1.16b,v2.16b > + add w13,w13,w17 > + eor v21.16b,v5.16b,v6.16b > + add w14,w14,w19 > + eor v22.16b,v17.16b,v18.16b > + add w15,w15,w20 > + ushr v1.4s,v20.4s,#20 > + add w16,w16,w21 > + ushr v5.4s,v21.4s,#20 > + eor w9,w9,w13 > + ushr v17.4s,v22.4s,#20 > + eor w10,w10,w14 > + sli v1.4s,v20.4s,#12 > + eor w11,w11,w15 > + sli v5.4s,v21.4s,#12 > + eor w12,w12,w16 > + sli v17.4s,v22.4s,#12 > + ror w9,w9,#20 > + add v0.4s,v0.4s,v1.4s > + ror w10,w10,#20 > + add v4.4s,v4.4s,v5.4s > + ror w11,w11,#20 > + add v16.4s,v16.4s,v17.4s > + ror w12,w12,#20 > + eor v20.16b,v3.16b,v0.16b > + add w5,w5,w9 > + eor v21.16b,v7.16b,v4.16b > + add w6,w6,w10 > + eor v22.16b,v19.16b,v16.16b > + add w7,w7,w11 > + ushr v3.4s,v20.4s,#24 > + add w8,w8,w12 > + ushr v7.4s,v21.4s,#24 > + eor w17,w17,w5 > + ushr v19.4s,v22.4s,#24 > + eor w19,w19,w6 > + sli v3.4s,v20.4s,#8 > + eor w20,w20,w7 > + sli v7.4s,v21.4s,#8 > + eor w21,w21,w8 > + sli v19.4s,v22.4s,#8 > + ror w17,w17,#24 > + add v2.4s,v2.4s,v3.4s > + ror w19,w19,#24 > + add v6.4s,v6.4s,v7.4s > + ror w20,w20,#24 > + add v18.4s,v18.4s,v19.4s > + ror w21,w21,#24 > + eor v20.16b,v1.16b,v2.16b > + add w13,w13,w17 > + eor v21.16b,v5.16b,v6.16b > + add w14,w14,w19 > + eor v22.16b,v17.16b,v18.16b > + add w15,w15,w20 > + ushr v1.4s,v20.4s,#25 > + add w16,w16,w21 > + ushr v5.4s,v21.4s,#25 > + eor w9,w9,w13 > + ushr v17.4s,v22.4s,#25 > + eor w10,w10,w14 > + sli v1.4s,v20.4s,#7 > + eor w11,w11,w15 > + sli v5.4s,v21.4s,#7 > + eor w12,w12,w16 > + sli v17.4s,v22.4s,#7 > + ror w9,w9,#25 > + ext 
v2.16b,v2.16b,v2.16b,#8 > + ror w10,w10,#25 > + ext v6.16b,v6.16b,v6.16b,#8 > + ror w11,w11,#25 > + ext v18.16b,v18.16b,v18.16b,#8 > + ror w12,w12,#25 > + ext v3.16b,v3.16b,v3.16b,#12 > + ext v7.16b,v7.16b,v7.16b,#12 > + ext v19.16b,v19.16b,v19.16b,#12 > + ext v1.16b,v1.16b,v1.16b,#4 > + ext v5.16b,v5.16b,v5.16b,#4 > + ext v17.16b,v17.16b,v17.16b,#4 > + add v0.4s,v0.4s,v1.4s > + add w5,w5,w10 > + add v4.4s,v4.4s,v5.4s > + add w6,w6,w11 > + add v16.4s,v16.4s,v17.4s > + add w7,w7,w12 > + eor v3.16b,v3.16b,v0.16b > + add w8,w8,w9 > + eor v7.16b,v7.16b,v4.16b > + eor w21,w21,w5 > + eor v19.16b,v19.16b,v16.16b > + eor w17,w17,w6 > + rev32 v3.8h,v3.8h > + eor w19,w19,w7 > + rev32 v7.8h,v7.8h > + eor w20,w20,w8 > + rev32 v19.8h,v19.8h > + ror w21,w21,#16 > + add v2.4s,v2.4s,v3.4s > + ror w17,w17,#16 > + add v6.4s,v6.4s,v7.4s > + ror w19,w19,#16 > + add v18.4s,v18.4s,v19.4s > + ror w20,w20,#16 > + eor v20.16b,v1.16b,v2.16b > + add w15,w15,w21 > + eor v21.16b,v5.16b,v6.16b > + add w16,w16,w17 > + eor v22.16b,v17.16b,v18.16b > + add w13,w13,w19 > + ushr v1.4s,v20.4s,#20 > + add w14,w14,w20 > + ushr v5.4s,v21.4s,#20 > + eor w10,w10,w15 > + ushr v17.4s,v22.4s,#20 > + eor w11,w11,w16 > + sli v1.4s,v20.4s,#12 > + eor w12,w12,w13 > + sli v5.4s,v21.4s,#12 > + eor w9,w9,w14 > + sli v17.4s,v22.4s,#12 > + ror w10,w10,#20 > + add v0.4s,v0.4s,v1.4s > + ror w11,w11,#20 > + add v4.4s,v4.4s,v5.4s > + ror w12,w12,#20 > + add v16.4s,v16.4s,v17.4s > + ror w9,w9,#20 > + eor v20.16b,v3.16b,v0.16b > + add w5,w5,w10 > + eor v21.16b,v7.16b,v4.16b > + add w6,w6,w11 > + eor v22.16b,v19.16b,v16.16b > + add w7,w7,w12 > + ushr v3.4s,v20.4s,#24 > + add w8,w8,w9 > + ushr v7.4s,v21.4s,#24 > + eor w21,w21,w5 > + ushr v19.4s,v22.4s,#24 > + eor w17,w17,w6 > + sli v3.4s,v20.4s,#8 > + eor w19,w19,w7 > + sli v7.4s,v21.4s,#8 > + eor w20,w20,w8 > + sli v19.4s,v22.4s,#8 > + ror w21,w21,#24 > + add v2.4s,v2.4s,v3.4s > + ror w17,w17,#24 > + add v6.4s,v6.4s,v7.4s > + ror w19,w19,#24 > + add v18.4s,v18.4s,v19.4s > + ror w20,w20,#24 > + eor v20.16b,v1.16b,v2.16b > + add w15,w15,w21 > + eor v21.16b,v5.16b,v6.16b > + add w16,w16,w17 > + eor v22.16b,v17.16b,v18.16b > + add w13,w13,w19 > + ushr v1.4s,v20.4s,#25 > + add w14,w14,w20 > + ushr v5.4s,v21.4s,#25 > + eor w10,w10,w15 > + ushr v17.4s,v22.4s,#25 > + eor w11,w11,w16 > + sli v1.4s,v20.4s,#7 > + eor w12,w12,w13 > + sli v5.4s,v21.4s,#7 > + eor w9,w9,w14 > + sli v17.4s,v22.4s,#7 > + ror w10,w10,#25 > + ext v2.16b,v2.16b,v2.16b,#8 > + ror w11,w11,#25 > + ext v6.16b,v6.16b,v6.16b,#8 > + ror w12,w12,#25 > + ext v18.16b,v18.16b,v18.16b,#8 > + ror w9,w9,#25 > + ext v3.16b,v3.16b,v3.16b,#4 > + ext v7.16b,v7.16b,v7.16b,#4 > + ext v19.16b,v19.16b,v19.16b,#4 > + ext v1.16b,v1.16b,v1.16b,#12 > + ext v5.16b,v5.16b,v5.16b,#12 > + ext v17.16b,v17.16b,v17.16b,#12 > + cbnz x4,.Loop_neon > + > + add w5,w5,w22 // accumulate key block > + add v0.4s,v0.4s,v24.4s > + add x6,x6,x22,lsr#32 > + add v4.4s,v4.4s,v24.4s > + add w7,w7,w23 > + add v16.4s,v16.4s,v24.4s > + add x8,x8,x23,lsr#32 > + add v2.4s,v2.4s,v26.4s > + add w9,w9,w24 > + add v6.4s,v6.4s,v26.4s > + add x10,x10,x24,lsr#32 > + add v18.4s,v18.4s,v26.4s > + add w11,w11,w25 > + add v3.4s,v3.4s,v27.4s > + add x12,x12,x25,lsr#32 > + add w13,w13,w26 > + add v7.4s,v7.4s,v28.4s > + add x14,x14,x26,lsr#32 > + add w15,w15,w27 > + add v19.4s,v19.4s,v29.4s > + add x16,x16,x27,lsr#32 > + add w17,w17,w28 > + add v1.4s,v1.4s,v25.4s > + add x19,x19,x28,lsr#32 > + add w20,w20,w30 > + add v5.4s,v5.4s,v25.4s > + add x21,x21,x30,lsr#32 > + add v17.4s,v17.4s,v25.4s > + > + 
b.lo .Ltail_neon > + > + add x5,x5,x6,lsl#32 // pack > + add x7,x7,x8,lsl#32 > + ldp x6,x8,[x1,#0] // load input > + add x9,x9,x10,lsl#32 > + add x11,x11,x12,lsl#32 > + ldp x10,x12,[x1,#16] > + add x13,x13,x14,lsl#32 > + add x15,x15,x16,lsl#32 > + ldp x14,x16,[x1,#32] > + add x17,x17,x19,lsl#32 > + add x20,x20,x21,lsl#32 > + ldp x19,x21,[x1,#48] > + add x1,x1,#64 > +#ifdef __ARMEB__ > + rev x5,x5 > + rev x7,x7 > + rev x9,x9 > + rev x11,x11 > + rev x13,x13 > + rev x15,x15 > + rev x17,x17 > + rev x20,x20 > +#endif > + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 > + eor x5,x5,x6 > + eor x7,x7,x8 > + eor x9,x9,x10 > + eor x11,x11,x12 > + eor x13,x13,x14 > + eor v0.16b,v0.16b,v20.16b > + eor x15,x15,x16 > + eor v1.16b,v1.16b,v21.16b > + eor x17,x17,x19 > + eor v2.16b,v2.16b,v22.16b > + eor x20,x20,x21 > + eor v3.16b,v3.16b,v23.16b > + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 > + > + stp x5,x7,[x0,#0] // store output > + add x28,x28,#4 // increment counter > + stp x9,x11,[x0,#16] > + add v27.4s,v27.4s,v31.4s // += 4 > + stp x13,x15,[x0,#32] > + add v28.4s,v28.4s,v31.4s > + stp x17,x20,[x0,#48] > + add v29.4s,v29.4s,v31.4s > + add x0,x0,#64 > + > + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 > + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 > + > + eor v4.16b,v4.16b,v20.16b > + eor v5.16b,v5.16b,v21.16b > + eor v6.16b,v6.16b,v22.16b > + eor v7.16b,v7.16b,v23.16b > + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 > + > + eor v16.16b,v16.16b,v0.16b > + eor v17.16b,v17.16b,v1.16b > + eor v18.16b,v18.16b,v2.16b > + eor v19.16b,v19.16b,v3.16b > + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 > + > + b.hi .Loop_outer_neon > + > + ldp x19,x20,[x29,#16] > + add sp,sp,#64 > + ldp x21,x22,[x29,#32] > + ldp x23,x24,[x29,#48] > + ldp x25,x26,[x29,#64] > + ldp x27,x28,[x29,#80] > + ldp x29,x30,[sp],#96 > + ret > + > +.Ltail_neon: > + add x2,x2,#256 > + cmp x2,#64 > + b.lo .Less_than_64 > + > + add x5,x5,x6,lsl#32 // pack > + add x7,x7,x8,lsl#32 > + ldp x6,x8,[x1,#0] // load input > + add x9,x9,x10,lsl#32 > + add x11,x11,x12,lsl#32 > + ldp x10,x12,[x1,#16] > + add x13,x13,x14,lsl#32 > + add x15,x15,x16,lsl#32 > + ldp x14,x16,[x1,#32] > + add x17,x17,x19,lsl#32 > + add x20,x20,x21,lsl#32 > + ldp x19,x21,[x1,#48] > + add x1,x1,#64 > +#ifdef __ARMEB__ > + rev x5,x5 > + rev x7,x7 > + rev x9,x9 > + rev x11,x11 > + rev x13,x13 > + rev x15,x15 > + rev x17,x17 > + rev x20,x20 > +#endif > + eor x5,x5,x6 > + eor x7,x7,x8 > + eor x9,x9,x10 > + eor x11,x11,x12 > + eor x13,x13,x14 > + eor x15,x15,x16 > + eor x17,x17,x19 > + eor x20,x20,x21 > + > + stp x5,x7,[x0,#0] // store output > + add x28,x28,#4 // increment counter > + stp x9,x11,[x0,#16] > + stp x13,x15,[x0,#32] > + stp x17,x20,[x0,#48] > + add x0,x0,#64 > + b.eq .Ldone_neon > + sub x2,x2,#64 > + cmp x2,#64 > + b.lo .Less_than_128 > + > + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 > + eor v0.16b,v0.16b,v20.16b > + eor v1.16b,v1.16b,v21.16b > + eor v2.16b,v2.16b,v22.16b > + eor v3.16b,v3.16b,v23.16b > + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 > + b.eq .Ldone_neon > + sub x2,x2,#64 > + cmp x2,#64 > + b.lo .Less_than_192 > + > + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 > + eor v4.16b,v4.16b,v20.16b > + eor v5.16b,v5.16b,v21.16b > + eor v6.16b,v6.16b,v22.16b > + eor v7.16b,v7.16b,v23.16b > + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 > + b.eq .Ldone_neon > + sub x2,x2,#64 > + > + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] > + b .Last_neon > + > +.Less_than_128: > + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] > + b .Last_neon > +.Less_than_192: > + st1 
{v4.16b,v5.16b,v6.16b,v7.16b},[sp] > + b .Last_neon > + > +.align 4 > +.Last_neon: > + sub x0,x0,#1 > + add x1,x1,x2 > + add x0,x0,x2 > + add x4,sp,x2 > + neg x2,x2 > + > +.Loop_tail_neon: > + ldrb w10,[x1,x2] > + ldrb w11,[x4,x2] > + add x2,x2,#1 > + eor w10,w10,w11 > + strb w10,[x0,x2] > + cbnz x2,.Loop_tail_neon > + > + stp xzr,xzr,[sp,#0] > + stp xzr,xzr,[sp,#16] > + stp xzr,xzr,[sp,#32] > + stp xzr,xzr,[sp,#48] > + > +.Ldone_neon: > + ldp x19,x20,[x29,#16] > + add sp,sp,#64 > + ldp x21,x22,[x29,#32] > + ldp x23,x24,[x29,#48] > + ldp x25,x26,[x29,#64] > + ldp x27,x28,[x29,#80] > + ldp x29,x30,[sp],#96 > + ret > +.size ChaCha20_neon,.-ChaCha20_neon > +.type ChaCha20_512_neon,%function > +.align 5 > +ChaCha20_512_neon: > + stp x29,x30,[sp,#-96]! > + add x29,sp,#0 > + > + adr x5,.Lsigma > + stp x19,x20,[sp,#16] > + stp x21,x22,[sp,#32] > + stp x23,x24,[sp,#48] > + stp x25,x26,[sp,#64] > + stp x27,x28,[sp,#80] > + > +.L512_or_more_neon: > + sub sp,sp,#128+64 > + > + ldp x22,x23,[x5] // load sigma > + ld1 {v24.4s},[x5],#16 > + ldp x24,x25,[x3] // load key > + ldp x26,x27,[x3,#16] > + ld1 {v25.4s,v26.4s},[x3] > + ldp x28,x30,[x4] // load counter > + ld1 {v27.4s},[x4] > + ld1 {v31.4s},[x5] > +#ifdef __ARMEB__ > + rev64 v24.4s,v24.4s > + ror x24,x24,#32 > + ror x25,x25,#32 > + ror x26,x26,#32 > + ror x27,x27,#32 > + ror x28,x28,#32 > + ror x30,x30,#32 > +#endif > + add v27.4s,v27.4s,v31.4s // += 1 > + stp q24,q25,[sp,#0] // off-load key block, invariant part > + add v27.4s,v27.4s,v31.4s // not typo > + str q26,[sp,#32] > + add v28.4s,v27.4s,v31.4s > + add v29.4s,v28.4s,v31.4s > + add v30.4s,v29.4s,v31.4s > + shl v31.4s,v31.4s,#2 // 1 -> 4 > + > + stp d8,d9,[sp,#128+0] // meet ABI requirements > + stp d10,d11,[sp,#128+16] > + stp d12,d13,[sp,#128+32] > + stp d14,d15,[sp,#128+48] > + > + sub x2,x2,#512 // not typo > + > +.Loop_outer_512_neon: > + mov v0.16b,v24.16b > + mov v4.16b,v24.16b > + mov v8.16b,v24.16b > + mov v12.16b,v24.16b > + mov v16.16b,v24.16b > + mov v20.16b,v24.16b > + mov v1.16b,v25.16b > + mov w5,w22 // unpack key block > + mov v5.16b,v25.16b > + lsr x6,x22,#32 > + mov v9.16b,v25.16b > + mov w7,w23 > + mov v13.16b,v25.16b > + lsr x8,x23,#32 > + mov v17.16b,v25.16b > + mov w9,w24 > + mov v21.16b,v25.16b > + lsr x10,x24,#32 > + mov v3.16b,v27.16b > + mov w11,w25 > + mov v7.16b,v28.16b > + lsr x12,x25,#32 > + mov v11.16b,v29.16b > + mov w13,w26 > + mov v15.16b,v30.16b > + lsr x14,x26,#32 > + mov v2.16b,v26.16b > + mov w15,w27 > + mov v6.16b,v26.16b > + lsr x16,x27,#32 > + add v19.4s,v3.4s,v31.4s // +4 > + mov w17,w28 > + add v23.4s,v7.4s,v31.4s // +4 > + lsr x19,x28,#32 > + mov v10.16b,v26.16b > + mov w20,w30 > + mov v14.16b,v26.16b > + lsr x21,x30,#32 > + mov v18.16b,v26.16b > + stp q27,q28,[sp,#48] // off-load key block, variable part > + mov v22.16b,v26.16b > + str q29,[sp,#80] > + > + mov x4,#5 > + subs x2,x2,#512 > +.Loop_upper_neon: > + sub x4,x4,#1 > + add v0.4s,v0.4s,v1.4s > + add w5,w5,w9 > + add v4.4s,v4.4s,v5.4s > + add w6,w6,w10 > + add v8.4s,v8.4s,v9.4s > + add w7,w7,w11 > + add v12.4s,v12.4s,v13.4s > + add w8,w8,w12 > + add v16.4s,v16.4s,v17.4s > + eor w17,w17,w5 > + add v20.4s,v20.4s,v21.4s > + eor w19,w19,w6 > + eor v3.16b,v3.16b,v0.16b > + eor w20,w20,w7 > + eor v7.16b,v7.16b,v4.16b > + eor w21,w21,w8 > + eor v11.16b,v11.16b,v8.16b > + ror w17,w17,#16 > + eor v15.16b,v15.16b,v12.16b > + ror w19,w19,#16 > + eor v19.16b,v19.16b,v16.16b > + ror w20,w20,#16 > + eor v23.16b,v23.16b,v20.16b > + ror w21,w21,#16 > + rev32 v3.8h,v3.8h > + add w13,w13,w17 > + rev32 
v7.8h,v7.8h > + add w14,w14,w19 > + rev32 v11.8h,v11.8h > + add w15,w15,w20 > + rev32 v15.8h,v15.8h > + add w16,w16,w21 > + rev32 v19.8h,v19.8h > + eor w9,w9,w13 > + rev32 v23.8h,v23.8h > + eor w10,w10,w14 > + add v2.4s,v2.4s,v3.4s > + eor w11,w11,w15 > + add v6.4s,v6.4s,v7.4s > + eor w12,w12,w16 > + add v10.4s,v10.4s,v11.4s > + ror w9,w9,#20 > + add v14.4s,v14.4s,v15.4s > + ror w10,w10,#20 > + add v18.4s,v18.4s,v19.4s > + ror w11,w11,#20 > + add v22.4s,v22.4s,v23.4s > + ror w12,w12,#20 > + eor v24.16b,v1.16b,v2.16b > + add w5,w5,w9 > + eor v25.16b,v5.16b,v6.16b > + add w6,w6,w10 > + eor v26.16b,v9.16b,v10.16b > + add w7,w7,w11 > + eor v27.16b,v13.16b,v14.16b > + add w8,w8,w12 > + eor v28.16b,v17.16b,v18.16b > + eor w17,w17,w5 > + eor v29.16b,v21.16b,v22.16b > + eor w19,w19,w6 > + ushr v1.4s,v24.4s,#20 > + eor w20,w20,w7 > + ushr v5.4s,v25.4s,#20 > + eor w21,w21,w8 > + ushr v9.4s,v26.4s,#20 > + ror w17,w17,#24 > + ushr v13.4s,v27.4s,#20 > + ror w19,w19,#24 > + ushr v17.4s,v28.4s,#20 > + ror w20,w20,#24 > + ushr v21.4s,v29.4s,#20 > + ror w21,w21,#24 > + sli v1.4s,v24.4s,#12 > + add w13,w13,w17 > + sli v5.4s,v25.4s,#12 > + add w14,w14,w19 > + sli v9.4s,v26.4s,#12 > + add w15,w15,w20 > + sli v13.4s,v27.4s,#12 > + add w16,w16,w21 > + sli v17.4s,v28.4s,#12 > + eor w9,w9,w13 > + sli v21.4s,v29.4s,#12 > + eor w10,w10,w14 > + add v0.4s,v0.4s,v1.4s > + eor w11,w11,w15 > + add v4.4s,v4.4s,v5.4s > + eor w12,w12,w16 > + add v8.4s,v8.4s,v9.4s > + ror w9,w9,#25 > + add v12.4s,v12.4s,v13.4s > + ror w10,w10,#25 > + add v16.4s,v16.4s,v17.4s > + ror w11,w11,#25 > + add v20.4s,v20.4s,v21.4s > + ror w12,w12,#25 > + eor v24.16b,v3.16b,v0.16b > + add w5,w5,w10 > + eor v25.16b,v7.16b,v4.16b > + add w6,w6,w11 > + eor v26.16b,v11.16b,v8.16b > + add w7,w7,w12 > + eor v27.16b,v15.16b,v12.16b > + add w8,w8,w9 > + eor v28.16b,v19.16b,v16.16b > + eor w21,w21,w5 > + eor v29.16b,v23.16b,v20.16b > + eor w17,w17,w6 > + ushr v3.4s,v24.4s,#24 > + eor w19,w19,w7 > + ushr v7.4s,v25.4s,#24 > + eor w20,w20,w8 > + ushr v11.4s,v26.4s,#24 > + ror w21,w21,#16 > + ushr v15.4s,v27.4s,#24 > + ror w17,w17,#16 > + ushr v19.4s,v28.4s,#24 > + ror w19,w19,#16 > + ushr v23.4s,v29.4s,#24 > + ror w20,w20,#16 > + sli v3.4s,v24.4s,#8 > + add w15,w15,w21 > + sli v7.4s,v25.4s,#8 > + add w16,w16,w17 > + sli v11.4s,v26.4s,#8 > + add w13,w13,w19 > + sli v15.4s,v27.4s,#8 > + add w14,w14,w20 > + sli v19.4s,v28.4s,#8 > + eor w10,w10,w15 > + sli v23.4s,v29.4s,#8 > + eor w11,w11,w16 > + add v2.4s,v2.4s,v3.4s > + eor w12,w12,w13 > + add v6.4s,v6.4s,v7.4s > + eor w9,w9,w14 > + add v10.4s,v10.4s,v11.4s > + ror w10,w10,#20 > + add v14.4s,v14.4s,v15.4s > + ror w11,w11,#20 > + add v18.4s,v18.4s,v19.4s > + ror w12,w12,#20 > + add v22.4s,v22.4s,v23.4s > + ror w9,w9,#20 > + eor v24.16b,v1.16b,v2.16b > + add w5,w5,w10 > + eor v25.16b,v5.16b,v6.16b > + add w6,w6,w11 > + eor v26.16b,v9.16b,v10.16b > + add w7,w7,w12 > + eor v27.16b,v13.16b,v14.16b > + add w8,w8,w9 > + eor v28.16b,v17.16b,v18.16b > + eor w21,w21,w5 > + eor v29.16b,v21.16b,v22.16b > + eor w17,w17,w6 > + ushr v1.4s,v24.4s,#25 > + eor w19,w19,w7 > + ushr v5.4s,v25.4s,#25 > + eor w20,w20,w8 > + ushr v9.4s,v26.4s,#25 > + ror w21,w21,#24 > + ushr v13.4s,v27.4s,#25 > + ror w17,w17,#24 > + ushr v17.4s,v28.4s,#25 > + ror w19,w19,#24 > + ushr v21.4s,v29.4s,#25 > + ror w20,w20,#24 > + sli v1.4s,v24.4s,#7 > + add w15,w15,w21 > + sli v5.4s,v25.4s,#7 > + add w16,w16,w17 > + sli v9.4s,v26.4s,#7 > + add w13,w13,w19 > + sli v13.4s,v27.4s,#7 > + add w14,w14,w20 > + sli v17.4s,v28.4s,#7 > + eor w10,w10,w15 > + sli 
v21.4s,v29.4s,#7 > + eor w11,w11,w16 > + ext v2.16b,v2.16b,v2.16b,#8 > + eor w12,w12,w13 > + ext v6.16b,v6.16b,v6.16b,#8 > + eor w9,w9,w14 > + ext v10.16b,v10.16b,v10.16b,#8 > + ror w10,w10,#25 > + ext v14.16b,v14.16b,v14.16b,#8 > + ror w11,w11,#25 > + ext v18.16b,v18.16b,v18.16b,#8 > + ror w12,w12,#25 > + ext v22.16b,v22.16b,v22.16b,#8 > + ror w9,w9,#25 > + ext v3.16b,v3.16b,v3.16b,#12 > + ext v7.16b,v7.16b,v7.16b,#12 > + ext v11.16b,v11.16b,v11.16b,#12 > + ext v15.16b,v15.16b,v15.16b,#12 > + ext v19.16b,v19.16b,v19.16b,#12 > + ext v23.16b,v23.16b,v23.16b,#12 > + ext v1.16b,v1.16b,v1.16b,#4 > + ext v5.16b,v5.16b,v5.16b,#4 > + ext v9.16b,v9.16b,v9.16b,#4 > + ext v13.16b,v13.16b,v13.16b,#4 > + ext v17.16b,v17.16b,v17.16b,#4 > + ext v21.16b,v21.16b,v21.16b,#4 > + add v0.4s,v0.4s,v1.4s > + add w5,w5,w9 > + add v4.4s,v4.4s,v5.4s > + add w6,w6,w10 > + add v8.4s,v8.4s,v9.4s > + add w7,w7,w11 > + add v12.4s,v12.4s,v13.4s > + add w8,w8,w12 > + add v16.4s,v16.4s,v17.4s > + eor w17,w17,w5 > + add v20.4s,v20.4s,v21.4s > + eor w19,w19,w6 > + eor v3.16b,v3.16b,v0.16b > + eor w20,w20,w7 > + eor v7.16b,v7.16b,v4.16b > + eor w21,w21,w8 > + eor v11.16b,v11.16b,v8.16b > + ror w17,w17,#16 > + eor v15.16b,v15.16b,v12.16b > + ror w19,w19,#16 > + eor v19.16b,v19.16b,v16.16b > + ror w20,w20,#16 > + eor v23.16b,v23.16b,v20.16b > + ror w21,w21,#16 > + rev32 v3.8h,v3.8h > + add w13,w13,w17 > + rev32 v7.8h,v7.8h > + add w14,w14,w19 > + rev32 v11.8h,v11.8h > + add w15,w15,w20 > + rev32 v15.8h,v15.8h > + add w16,w16,w21 > + rev32 v19.8h,v19.8h > + eor w9,w9,w13 > + rev32 v23.8h,v23.8h > + eor w10,w10,w14 > + add v2.4s,v2.4s,v3.4s > + eor w11,w11,w15 > + add v6.4s,v6.4s,v7.4s > + eor w12,w12,w16 > + add v10.4s,v10.4s,v11.4s > + ror w9,w9,#20 > + add v14.4s,v14.4s,v15.4s > + ror w10,w10,#20 > + add v18.4s,v18.4s,v19.4s > + ror w11,w11,#20 > + add v22.4s,v22.4s,v23.4s > + ror w12,w12,#20 > + eor v24.16b,v1.16b,v2.16b > + add w5,w5,w9 > + eor v25.16b,v5.16b,v6.16b > + add w6,w6,w10 > + eor v26.16b,v9.16b,v10.16b > + add w7,w7,w11 > + eor v27.16b,v13.16b,v14.16b > + add w8,w8,w12 > + eor v28.16b,v17.16b,v18.16b > + eor w17,w17,w5 > + eor v29.16b,v21.16b,v22.16b > + eor w19,w19,w6 > + ushr v1.4s,v24.4s,#20 > + eor w20,w20,w7 > + ushr v5.4s,v25.4s,#20 > + eor w21,w21,w8 > + ushr v9.4s,v26.4s,#20 > + ror w17,w17,#24 > + ushr v13.4s,v27.4s,#20 > + ror w19,w19,#24 > + ushr v17.4s,v28.4s,#20 > + ror w20,w20,#24 > + ushr v21.4s,v29.4s,#20 > + ror w21,w21,#24 > + sli v1.4s,v24.4s,#12 > + add w13,w13,w17 > + sli v5.4s,v25.4s,#12 > + add w14,w14,w19 > + sli v9.4s,v26.4s,#12 > + add w15,w15,w20 > + sli v13.4s,v27.4s,#12 > + add w16,w16,w21 > + sli v17.4s,v28.4s,#12 > + eor w9,w9,w13 > + sli v21.4s,v29.4s,#12 > + eor w10,w10,w14 > + add v0.4s,v0.4s,v1.4s > + eor w11,w11,w15 > + add v4.4s,v4.4s,v5.4s > + eor w12,w12,w16 > + add v8.4s,v8.4s,v9.4s > + ror w9,w9,#25 > + add v12.4s,v12.4s,v13.4s > + ror w10,w10,#25 > + add v16.4s,v16.4s,v17.4s > + ror w11,w11,#25 > + add v20.4s,v20.4s,v21.4s > + ror w12,w12,#25 > + eor v24.16b,v3.16b,v0.16b > + add w5,w5,w10 > + eor v25.16b,v7.16b,v4.16b > + add w6,w6,w11 > + eor v26.16b,v11.16b,v8.16b > + add w7,w7,w12 > + eor v27.16b,v15.16b,v12.16b > + add w8,w8,w9 > + eor v28.16b,v19.16b,v16.16b > + eor w21,w21,w5 > + eor v29.16b,v23.16b,v20.16b > + eor w17,w17,w6 > + ushr v3.4s,v24.4s,#24 > + eor w19,w19,w7 > + ushr v7.4s,v25.4s,#24 > + eor w20,w20,w8 > + ushr v11.4s,v26.4s,#24 > + ror w21,w21,#16 > + ushr v15.4s,v27.4s,#24 > + ror w17,w17,#16 > + ushr v19.4s,v28.4s,#24 > + ror w19,w19,#16 > + ushr 
v23.4s,v29.4s,#24 > + ror w20,w20,#16 > + sli v3.4s,v24.4s,#8 > + add w15,w15,w21 > + sli v7.4s,v25.4s,#8 > + add w16,w16,w17 > + sli v11.4s,v26.4s,#8 > + add w13,w13,w19 > + sli v15.4s,v27.4s,#8 > + add w14,w14,w20 > + sli v19.4s,v28.4s,#8 > + eor w10,w10,w15 > + sli v23.4s,v29.4s,#8 > + eor w11,w11,w16 > + add v2.4s,v2.4s,v3.4s > + eor w12,w12,w13 > + add v6.4s,v6.4s,v7.4s > + eor w9,w9,w14 > + add v10.4s,v10.4s,v11.4s > + ror w10,w10,#20 > + add v14.4s,v14.4s,v15.4s > + ror w11,w11,#20 > + add v18.4s,v18.4s,v19.4s > + ror w12,w12,#20 > + add v22.4s,v22.4s,v23.4s > + ror w9,w9,#20 > + eor v24.16b,v1.16b,v2.16b > + add w5,w5,w10 > + eor v25.16b,v5.16b,v6.16b > + add w6,w6,w11 > + eor v26.16b,v9.16b,v10.16b > + add w7,w7,w12 > + eor v27.16b,v13.16b,v14.16b > + add w8,w8,w9 > + eor v28.16b,v17.16b,v18.16b > + eor w21,w21,w5 > + eor v29.16b,v21.16b,v22.16b > + eor w17,w17,w6 > + ushr v1.4s,v24.4s,#25 > + eor w19,w19,w7 > + ushr v5.4s,v25.4s,#25 > + eor w20,w20,w8 > + ushr v9.4s,v26.4s,#25 > + ror w21,w21,#24 > + ushr v13.4s,v27.4s,#25 > + ror w17,w17,#24 > + ushr v17.4s,v28.4s,#25 > + ror w19,w19,#24 > + ushr v21.4s,v29.4s,#25 > + ror w20,w20,#24 > + sli v1.4s,v24.4s,#7 > + add w15,w15,w21 > + sli v5.4s,v25.4s,#7 > + add w16,w16,w17 > + sli v9.4s,v26.4s,#7 > + add w13,w13,w19 > + sli v13.4s,v27.4s,#7 > + add w14,w14,w20 > + sli v17.4s,v28.4s,#7 > + eor w10,w10,w15 > + sli v21.4s,v29.4s,#7 > + eor w11,w11,w16 > + ext v2.16b,v2.16b,v2.16b,#8 > + eor w12,w12,w13 > + ext v6.16b,v6.16b,v6.16b,#8 > + eor w9,w9,w14 > + ext v10.16b,v10.16b,v10.16b,#8 > + ror w10,w10,#25 > + ext v14.16b,v14.16b,v14.16b,#8 > + ror w11,w11,#25 > + ext v18.16b,v18.16b,v18.16b,#8 > + ror w12,w12,#25 > + ext v22.16b,v22.16b,v22.16b,#8 > + ror w9,w9,#25 > + ext v3.16b,v3.16b,v3.16b,#4 > + ext v7.16b,v7.16b,v7.16b,#4 > + ext v11.16b,v11.16b,v11.16b,#4 > + ext v15.16b,v15.16b,v15.16b,#4 > + ext v19.16b,v19.16b,v19.16b,#4 > + ext v23.16b,v23.16b,v23.16b,#4 > + ext v1.16b,v1.16b,v1.16b,#12 > + ext v5.16b,v5.16b,v5.16b,#12 > + ext v9.16b,v9.16b,v9.16b,#12 > + ext v13.16b,v13.16b,v13.16b,#12 > + ext v17.16b,v17.16b,v17.16b,#12 > + ext v21.16b,v21.16b,v21.16b,#12 > + cbnz x4,.Loop_upper_neon > + > + add w5,w5,w22 // accumulate key block > + add x6,x6,x22,lsr#32 > + add w7,w7,w23 > + add x8,x8,x23,lsr#32 > + add w9,w9,w24 > + add x10,x10,x24,lsr#32 > + add w11,w11,w25 > + add x12,x12,x25,lsr#32 > + add w13,w13,w26 > + add x14,x14,x26,lsr#32 > + add w15,w15,w27 > + add x16,x16,x27,lsr#32 > + add w17,w17,w28 > + add x19,x19,x28,lsr#32 > + add w20,w20,w30 > + add x21,x21,x30,lsr#32 > + > + add x5,x5,x6,lsl#32 // pack > + add x7,x7,x8,lsl#32 > + ldp x6,x8,[x1,#0] // load input > + add x9,x9,x10,lsl#32 > + add x11,x11,x12,lsl#32 > + ldp x10,x12,[x1,#16] > + add x13,x13,x14,lsl#32 > + add x15,x15,x16,lsl#32 > + ldp x14,x16,[x1,#32] > + add x17,x17,x19,lsl#32 > + add x20,x20,x21,lsl#32 > + ldp x19,x21,[x1,#48] > + add x1,x1,#64 > +#ifdef __ARMEB__ > + rev x5,x5 > + rev x7,x7 > + rev x9,x9 > + rev x11,x11 > + rev x13,x13 > + rev x15,x15 > + rev x17,x17 > + rev x20,x20 > +#endif > + eor x5,x5,x6 > + eor x7,x7,x8 > + eor x9,x9,x10 > + eor x11,x11,x12 > + eor x13,x13,x14 > + eor x15,x15,x16 > + eor x17,x17,x19 > + eor x20,x20,x21 > + > + stp x5,x7,[x0,#0] // store output > + add x28,x28,#1 // increment counter > + mov w5,w22 // unpack key block > + lsr x6,x22,#32 > + stp x9,x11,[x0,#16] > + mov w7,w23 > + lsr x8,x23,#32 > + stp x13,x15,[x0,#32] > + mov w9,w24 > + lsr x10,x24,#32 > + stp x17,x20,[x0,#48] > + add x0,x0,#64 > + mov w11,w25 > + 
lsr x12,x25,#32 > + mov w13,w26 > + lsr x14,x26,#32 > + mov w15,w27 > + lsr x16,x27,#32 > + mov w17,w28 > + lsr x19,x28,#32 > + mov w20,w30 > + lsr x21,x30,#32 > + > + mov x4,#5 > +.Loop_lower_neon: > + sub x4,x4,#1 > + add v0.4s,v0.4s,v1.4s > + add w5,w5,w9 > + add v4.4s,v4.4s,v5.4s > + add w6,w6,w10 > + add v8.4s,v8.4s,v9.4s > + add w7,w7,w11 > + add v12.4s,v12.4s,v13.4s > + add w8,w8,w12 > + add v16.4s,v16.4s,v17.4s > + eor w17,w17,w5 > + add v20.4s,v20.4s,v21.4s > + eor w19,w19,w6 > + eor v3.16b,v3.16b,v0.16b > + eor w20,w20,w7 > + eor v7.16b,v7.16b,v4.16b > + eor w21,w21,w8 > + eor v11.16b,v11.16b,v8.16b > + ror w17,w17,#16 > + eor v15.16b,v15.16b,v12.16b > + ror w19,w19,#16 > + eor v19.16b,v19.16b,v16.16b > + ror w20,w20,#16 > + eor v23.16b,v23.16b,v20.16b > + ror w21,w21,#16 > + rev32 v3.8h,v3.8h > + add w13,w13,w17 > + rev32 v7.8h,v7.8h > + add w14,w14,w19 > + rev32 v11.8h,v11.8h > + add w15,w15,w20 > + rev32 v15.8h,v15.8h > + add w16,w16,w21 > + rev32 v19.8h,v19.8h > + eor w9,w9,w13 > + rev32 v23.8h,v23.8h > + eor w10,w10,w14 > + add v2.4s,v2.4s,v3.4s > + eor w11,w11,w15 > + add v6.4s,v6.4s,v7.4s > + eor w12,w12,w16 > + add v10.4s,v10.4s,v11.4s > + ror w9,w9,#20 > + add v14.4s,v14.4s,v15.4s > + ror w10,w10,#20 > + add v18.4s,v18.4s,v19.4s > + ror w11,w11,#20 > + add v22.4s,v22.4s,v23.4s > + ror w12,w12,#20 > + eor v24.16b,v1.16b,v2.16b > + add w5,w5,w9 > + eor v25.16b,v5.16b,v6.16b > + add w6,w6,w10 > + eor v26.16b,v9.16b,v10.16b > + add w7,w7,w11 > + eor v27.16b,v13.16b,v14.16b > + add w8,w8,w12 > + eor v28.16b,v17.16b,v18.16b > + eor w17,w17,w5 > + eor v29.16b,v21.16b,v22.16b > + eor w19,w19,w6 > + ushr v1.4s,v24.4s,#20 > + eor w20,w20,w7 > + ushr v5.4s,v25.4s,#20 > + eor w21,w21,w8 > + ushr v9.4s,v26.4s,#20 > + ror w17,w17,#24 > + ushr v13.4s,v27.4s,#20 > + ror w19,w19,#24 > + ushr v17.4s,v28.4s,#20 > + ror w20,w20,#24 > + ushr v21.4s,v29.4s,#20 > + ror w21,w21,#24 > + sli v1.4s,v24.4s,#12 > + add w13,w13,w17 > + sli v5.4s,v25.4s,#12 > + add w14,w14,w19 > + sli v9.4s,v26.4s,#12 > + add w15,w15,w20 > + sli v13.4s,v27.4s,#12 > + add w16,w16,w21 > + sli v17.4s,v28.4s,#12 > + eor w9,w9,w13 > + sli v21.4s,v29.4s,#12 > + eor w10,w10,w14 > + add v0.4s,v0.4s,v1.4s > + eor w11,w11,w15 > + add v4.4s,v4.4s,v5.4s > + eor w12,w12,w16 > + add v8.4s,v8.4s,v9.4s > + ror w9,w9,#25 > + add v12.4s,v12.4s,v13.4s > + ror w10,w10,#25 > + add v16.4s,v16.4s,v17.4s > + ror w11,w11,#25 > + add v20.4s,v20.4s,v21.4s > + ror w12,w12,#25 > + eor v24.16b,v3.16b,v0.16b > + add w5,w5,w10 > + eor v25.16b,v7.16b,v4.16b > + add w6,w6,w11 > + eor v26.16b,v11.16b,v8.16b > + add w7,w7,w12 > + eor v27.16b,v15.16b,v12.16b > + add w8,w8,w9 > + eor v28.16b,v19.16b,v16.16b > + eor w21,w21,w5 > + eor v29.16b,v23.16b,v20.16b > + eor w17,w17,w6 > + ushr v3.4s,v24.4s,#24 > + eor w19,w19,w7 > + ushr v7.4s,v25.4s,#24 > + eor w20,w20,w8 > + ushr v11.4s,v26.4s,#24 > + ror w21,w21,#16 > + ushr v15.4s,v27.4s,#24 > + ror w17,w17,#16 > + ushr v19.4s,v28.4s,#24 > + ror w19,w19,#16 > + ushr v23.4s,v29.4s,#24 > + ror w20,w20,#16 > + sli v3.4s,v24.4s,#8 > + add w15,w15,w21 > + sli v7.4s,v25.4s,#8 > + add w16,w16,w17 > + sli v11.4s,v26.4s,#8 > + add w13,w13,w19 > + sli v15.4s,v27.4s,#8 > + add w14,w14,w20 > + sli v19.4s,v28.4s,#8 > + eor w10,w10,w15 > + sli v23.4s,v29.4s,#8 > + eor w11,w11,w16 > + add v2.4s,v2.4s,v3.4s > + eor w12,w12,w13 > + add v6.4s,v6.4s,v7.4s > + eor w9,w9,w14 > + add v10.4s,v10.4s,v11.4s > + ror w10,w10,#20 > + add v14.4s,v14.4s,v15.4s > + ror w11,w11,#20 > + add v18.4s,v18.4s,v19.4s > + ror w12,w12,#20 > + add 
v22.4s,v22.4s,v23.4s > + ror w9,w9,#20 > + eor v24.16b,v1.16b,v2.16b > + add w5,w5,w10 > + eor v25.16b,v5.16b,v6.16b > + add w6,w6,w11 > + eor v26.16b,v9.16b,v10.16b > + add w7,w7,w12 > + eor v27.16b,v13.16b,v14.16b > + add w8,w8,w9 > + eor v28.16b,v17.16b,v18.16b > + eor w21,w21,w5 > + eor v29.16b,v21.16b,v22.16b > + eor w17,w17,w6 > + ushr v1.4s,v24.4s,#25 > + eor w19,w19,w7 > + ushr v5.4s,v25.4s,#25 > + eor w20,w20,w8 > + ushr v9.4s,v26.4s,#25 > + ror w21,w21,#24 > + ushr v13.4s,v27.4s,#25 > + ror w17,w17,#24 > + ushr v17.4s,v28.4s,#25 > + ror w19,w19,#24 > + ushr v21.4s,v29.4s,#25 > + ror w20,w20,#24 > + sli v1.4s,v24.4s,#7 > + add w15,w15,w21 > + sli v5.4s,v25.4s,#7 > + add w16,w16,w17 > + sli v9.4s,v26.4s,#7 > + add w13,w13,w19 > + sli v13.4s,v27.4s,#7 > + add w14,w14,w20 > + sli v17.4s,v28.4s,#7 > + eor w10,w10,w15 > + sli v21.4s,v29.4s,#7 > + eor w11,w11,w16 > + ext v2.16b,v2.16b,v2.16b,#8 > + eor w12,w12,w13 > + ext v6.16b,v6.16b,v6.16b,#8 > + eor w9,w9,w14 > + ext v10.16b,v10.16b,v10.16b,#8 > + ror w10,w10,#25 > + ext v14.16b,v14.16b,v14.16b,#8 > + ror w11,w11,#25 > + ext v18.16b,v18.16b,v18.16b,#8 > + ror w12,w12,#25 > + ext v22.16b,v22.16b,v22.16b,#8 > + ror w9,w9,#25 > + ext v3.16b,v3.16b,v3.16b,#12 > + ext v7.16b,v7.16b,v7.16b,#12 > + ext v11.16b,v11.16b,v11.16b,#12 > + ext v15.16b,v15.16b,v15.16b,#12 > + ext v19.16b,v19.16b,v19.16b,#12 > + ext v23.16b,v23.16b,v23.16b,#12 > + ext v1.16b,v1.16b,v1.16b,#4 > + ext v5.16b,v5.16b,v5.16b,#4 > + ext v9.16b,v9.16b,v9.16b,#4 > + ext v13.16b,v13.16b,v13.16b,#4 > + ext v17.16b,v17.16b,v17.16b,#4 > + ext v21.16b,v21.16b,v21.16b,#4 > + add v0.4s,v0.4s,v1.4s > + add w5,w5,w9 > + add v4.4s,v4.4s,v5.4s > + add w6,w6,w10 > + add v8.4s,v8.4s,v9.4s > + add w7,w7,w11 > + add v12.4s,v12.4s,v13.4s > + add w8,w8,w12 > + add v16.4s,v16.4s,v17.4s > + eor w17,w17,w5 > + add v20.4s,v20.4s,v21.4s > + eor w19,w19,w6 > + eor v3.16b,v3.16b,v0.16b > + eor w20,w20,w7 > + eor v7.16b,v7.16b,v4.16b > + eor w21,w21,w8 > + eor v11.16b,v11.16b,v8.16b > + ror w17,w17,#16 > + eor v15.16b,v15.16b,v12.16b > + ror w19,w19,#16 > + eor v19.16b,v19.16b,v16.16b > + ror w20,w20,#16 > + eor v23.16b,v23.16b,v20.16b > + ror w21,w21,#16 > + rev32 v3.8h,v3.8h > + add w13,w13,w17 > + rev32 v7.8h,v7.8h > + add w14,w14,w19 > + rev32 v11.8h,v11.8h > + add w15,w15,w20 > + rev32 v15.8h,v15.8h > + add w16,w16,w21 > + rev32 v19.8h,v19.8h > + eor w9,w9,w13 > + rev32 v23.8h,v23.8h > + eor w10,w10,w14 > + add v2.4s,v2.4s,v3.4s > + eor w11,w11,w15 > + add v6.4s,v6.4s,v7.4s > + eor w12,w12,w16 > + add v10.4s,v10.4s,v11.4s > + ror w9,w9,#20 > + add v14.4s,v14.4s,v15.4s > + ror w10,w10,#20 > + add v18.4s,v18.4s,v19.4s > + ror w11,w11,#20 > + add v22.4s,v22.4s,v23.4s > + ror w12,w12,#20 > + eor v24.16b,v1.16b,v2.16b > + add w5,w5,w9 > + eor v25.16b,v5.16b,v6.16b > + add w6,w6,w10 > + eor v26.16b,v9.16b,v10.16b > + add w7,w7,w11 > + eor v27.16b,v13.16b,v14.16b > + add w8,w8,w12 > + eor v28.16b,v17.16b,v18.16b > + eor w17,w17,w5 > + eor v29.16b,v21.16b,v22.16b > + eor w19,w19,w6 > + ushr v1.4s,v24.4s,#20 > + eor w20,w20,w7 > + ushr v5.4s,v25.4s,#20 > + eor w21,w21,w8 > + ushr v9.4s,v26.4s,#20 > + ror w17,w17,#24 > + ushr v13.4s,v27.4s,#20 > + ror w19,w19,#24 > + ushr v17.4s,v28.4s,#20 > + ror w20,w20,#24 > + ushr v21.4s,v29.4s,#20 > + ror w21,w21,#24 > + sli v1.4s,v24.4s,#12 > + add w13,w13,w17 > + sli v5.4s,v25.4s,#12 > + add w14,w14,w19 > + sli v9.4s,v26.4s,#12 > + add w15,w15,w20 > + sli v13.4s,v27.4s,#12 > + add w16,w16,w21 > + sli v17.4s,v28.4s,#12 > + eor w9,w9,w13 > + sli v21.4s,v29.4s,#12 
> + eor w10,w10,w14 > + add v0.4s,v0.4s,v1.4s > + eor w11,w11,w15 > + add v4.4s,v4.4s,v5.4s > + eor w12,w12,w16 > + add v8.4s,v8.4s,v9.4s > + ror w9,w9,#25 > + add v12.4s,v12.4s,v13.4s > + ror w10,w10,#25 > + add v16.4s,v16.4s,v17.4s > + ror w11,w11,#25 > + add v20.4s,v20.4s,v21.4s > + ror w12,w12,#25 > + eor v24.16b,v3.16b,v0.16b > + add w5,w5,w10 > + eor v25.16b,v7.16b,v4.16b > + add w6,w6,w11 > + eor v26.16b,v11.16b,v8.16b > + add w7,w7,w12 > + eor v27.16b,v15.16b,v12.16b > + add w8,w8,w9 > + eor v28.16b,v19.16b,v16.16b > + eor w21,w21,w5 > + eor v29.16b,v23.16b,v20.16b > + eor w17,w17,w6 > + ushr v3.4s,v24.4s,#24 > + eor w19,w19,w7 > + ushr v7.4s,v25.4s,#24 > + eor w20,w20,w8 > + ushr v11.4s,v26.4s,#24 > + ror w21,w21,#16 > + ushr v15.4s,v27.4s,#24 > + ror w17,w17,#16 > + ushr v19.4s,v28.4s,#24 > + ror w19,w19,#16 > + ushr v23.4s,v29.4s,#24 > + ror w20,w20,#16 > + sli v3.4s,v24.4s,#8 > + add w15,w15,w21 > + sli v7.4s,v25.4s,#8 > + add w16,w16,w17 > + sli v11.4s,v26.4s,#8 > + add w13,w13,w19 > + sli v15.4s,v27.4s,#8 > + add w14,w14,w20 > + sli v19.4s,v28.4s,#8 > + eor w10,w10,w15 > + sli v23.4s,v29.4s,#8 > + eor w11,w11,w16 > + add v2.4s,v2.4s,v3.4s > + eor w12,w12,w13 > + add v6.4s,v6.4s,v7.4s > + eor w9,w9,w14 > + add v10.4s,v10.4s,v11.4s > + ror w10,w10,#20 > + add v14.4s,v14.4s,v15.4s > + ror w11,w11,#20 > + add v18.4s,v18.4s,v19.4s > + ror w12,w12,#20 > + add v22.4s,v22.4s,v23.4s > + ror w9,w9,#20 > + eor v24.16b,v1.16b,v2.16b > + add w5,w5,w10 > + eor v25.16b,v5.16b,v6.16b > + add w6,w6,w11 > + eor v26.16b,v9.16b,v10.16b > + add w7,w7,w12 > + eor v27.16b,v13.16b,v14.16b > + add w8,w8,w9 > + eor v28.16b,v17.16b,v18.16b > + eor w21,w21,w5 > + eor v29.16b,v21.16b,v22.16b > + eor w17,w17,w6 > + ushr v1.4s,v24.4s,#25 > + eor w19,w19,w7 > + ushr v5.4s,v25.4s,#25 > + eor w20,w20,w8 > + ushr v9.4s,v26.4s,#25 > + ror w21,w21,#24 > + ushr v13.4s,v27.4s,#25 > + ror w17,w17,#24 > + ushr v17.4s,v28.4s,#25 > + ror w19,w19,#24 > + ushr v21.4s,v29.4s,#25 > + ror w20,w20,#24 > + sli v1.4s,v24.4s,#7 > + add w15,w15,w21 > + sli v5.4s,v25.4s,#7 > + add w16,w16,w17 > + sli v9.4s,v26.4s,#7 > + add w13,w13,w19 > + sli v13.4s,v27.4s,#7 > + add w14,w14,w20 > + sli v17.4s,v28.4s,#7 > + eor w10,w10,w15 > + sli v21.4s,v29.4s,#7 > + eor w11,w11,w16 > + ext v2.16b,v2.16b,v2.16b,#8 > + eor w12,w12,w13 > + ext v6.16b,v6.16b,v6.16b,#8 > + eor w9,w9,w14 > + ext v10.16b,v10.16b,v10.16b,#8 > + ror w10,w10,#25 > + ext v14.16b,v14.16b,v14.16b,#8 > + ror w11,w11,#25 > + ext v18.16b,v18.16b,v18.16b,#8 > + ror w12,w12,#25 > + ext v22.16b,v22.16b,v22.16b,#8 > + ror w9,w9,#25 > + ext v3.16b,v3.16b,v3.16b,#4 > + ext v7.16b,v7.16b,v7.16b,#4 > + ext v11.16b,v11.16b,v11.16b,#4 > + ext v15.16b,v15.16b,v15.16b,#4 > + ext v19.16b,v19.16b,v19.16b,#4 > + ext v23.16b,v23.16b,v23.16b,#4 > + ext v1.16b,v1.16b,v1.16b,#12 > + ext v5.16b,v5.16b,v5.16b,#12 > + ext v9.16b,v9.16b,v9.16b,#12 > + ext v13.16b,v13.16b,v13.16b,#12 > + ext v17.16b,v17.16b,v17.16b,#12 > + ext v21.16b,v21.16b,v21.16b,#12 > + cbnz x4,.Loop_lower_neon > + > + add w5,w5,w22 // accumulate key block > + ldp q24,q25,[sp,#0] > + add x6,x6,x22,lsr#32 > + ldp q26,q27,[sp,#32] > + add w7,w7,w23 > + ldp q28,q29,[sp,#64] > + add x8,x8,x23,lsr#32 > + add v0.4s,v0.4s,v24.4s > + add w9,w9,w24 > + add v4.4s,v4.4s,v24.4s > + add x10,x10,x24,lsr#32 > + add v8.4s,v8.4s,v24.4s > + add w11,w11,w25 > + add v12.4s,v12.4s,v24.4s > + add x12,x12,x25,lsr#32 > + add v16.4s,v16.4s,v24.4s > + add w13,w13,w26 > + add v20.4s,v20.4s,v24.4s > + add x14,x14,x26,lsr#32 > + add v2.4s,v2.4s,v26.4s > + 
add w15,w15,w27 > + add v6.4s,v6.4s,v26.4s > + add x16,x16,x27,lsr#32 > + add v10.4s,v10.4s,v26.4s > + add w17,w17,w28 > + add v14.4s,v14.4s,v26.4s > + add x19,x19,x28,lsr#32 > + add v18.4s,v18.4s,v26.4s > + add w20,w20,w30 > + add v22.4s,v22.4s,v26.4s > + add x21,x21,x30,lsr#32 > + add v19.4s,v19.4s,v31.4s // +4 > + add x5,x5,x6,lsl#32 // pack > + add v23.4s,v23.4s,v31.4s // +4 > + add x7,x7,x8,lsl#32 > + add v3.4s,v3.4s,v27.4s > + ldp x6,x8,[x1,#0] // load input > + add v7.4s,v7.4s,v28.4s > + add x9,x9,x10,lsl#32 > + add v11.4s,v11.4s,v29.4s > + add x11,x11,x12,lsl#32 > + add v15.4s,v15.4s,v30.4s > + ldp x10,x12,[x1,#16] > + add v19.4s,v19.4s,v27.4s > + add x13,x13,x14,lsl#32 > + add v23.4s,v23.4s,v28.4s > + add x15,x15,x16,lsl#32 > + add v1.4s,v1.4s,v25.4s > + ldp x14,x16,[x1,#32] > + add v5.4s,v5.4s,v25.4s > + add x17,x17,x19,lsl#32 > + add v9.4s,v9.4s,v25.4s > + add x20,x20,x21,lsl#32 > + add v13.4s,v13.4s,v25.4s > + ldp x19,x21,[x1,#48] > + add v17.4s,v17.4s,v25.4s > + add x1,x1,#64 > + add v21.4s,v21.4s,v25.4s > + > +#ifdef __ARMEB__ > + rev x5,x5 > + rev x7,x7 > + rev x9,x9 > + rev x11,x11 > + rev x13,x13 > + rev x15,x15 > + rev x17,x17 > + rev x20,x20 > +#endif > + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 > + eor x5,x5,x6 > + eor x7,x7,x8 > + eor x9,x9,x10 > + eor x11,x11,x12 > + eor x13,x13,x14 > + eor v0.16b,v0.16b,v24.16b > + eor x15,x15,x16 > + eor v1.16b,v1.16b,v25.16b > + eor x17,x17,x19 > + eor v2.16b,v2.16b,v26.16b > + eor x20,x20,x21 > + eor v3.16b,v3.16b,v27.16b > + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 > + > + stp x5,x7,[x0,#0] // store output > + add x28,x28,#7 // increment counter > + stp x9,x11,[x0,#16] > + stp x13,x15,[x0,#32] > + stp x17,x20,[x0,#48] > + add x0,x0,#64 > + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 > + > + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 > + eor v4.16b,v4.16b,v24.16b > + eor v5.16b,v5.16b,v25.16b > + eor v6.16b,v6.16b,v26.16b > + eor v7.16b,v7.16b,v27.16b > + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 > + > + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 > + eor v8.16b,v8.16b,v0.16b > + ldp q24,q25,[sp,#0] > + eor v9.16b,v9.16b,v1.16b > + ldp q26,q27,[sp,#32] > + eor v10.16b,v10.16b,v2.16b > + eor v11.16b,v11.16b,v3.16b > + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 > + > + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 > + eor v12.16b,v12.16b,v4.16b > + eor v13.16b,v13.16b,v5.16b > + eor v14.16b,v14.16b,v6.16b > + eor v15.16b,v15.16b,v7.16b > + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 > + > + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 > + eor v16.16b,v16.16b,v8.16b > + eor v17.16b,v17.16b,v9.16b > + eor v18.16b,v18.16b,v10.16b > + eor v19.16b,v19.16b,v11.16b > + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 > + > + shl v0.4s,v31.4s,#1 // 4 -> 8 > + eor v20.16b,v20.16b,v12.16b > + eor v21.16b,v21.16b,v13.16b > + eor v22.16b,v22.16b,v14.16b > + eor v23.16b,v23.16b,v15.16b > + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 > + > + add v27.4s,v27.4s,v0.4s // += 8 > + add v28.4s,v28.4s,v0.4s > + add v29.4s,v29.4s,v0.4s > + add v30.4s,v30.4s,v0.4s > + > + b.hs .Loop_outer_512_neon > + > + adds x2,x2,#512 > + ushr v0.4s,v31.4s,#2 // 4 -> 1 > + > + ldp d8,d9,[sp,#128+0] // meet ABI requirements > + ldp d10,d11,[sp,#128+16] > + ldp d12,d13,[sp,#128+32] > + ldp d14,d15,[sp,#128+48] > + > + stp q24,q31,[sp,#0] // wipe off-load area > + stp q24,q31,[sp,#32] > + stp q24,q31,[sp,#64] > + > + b.eq .Ldone_512_neon > + > + cmp x2,#192 > + sub v27.4s,v27.4s,v0.4s // -= 1 > + sub v28.4s,v28.4s,v0.4s > + sub v29.4s,v29.4s,v0.4s > + 
add sp,sp,#128
> +	b.hs .Loop_outer_neon
> +
> +	eor v25.16b,v25.16b,v25.16b
> +	eor v26.16b,v26.16b,v26.16b
> +	eor v27.16b,v27.16b,v27.16b
> +	eor v28.16b,v28.16b,v28.16b
> +	eor v29.16b,v29.16b,v29.16b
> +	eor v30.16b,v30.16b,v30.16b
> +	b .Loop_outer
> +
> +.Ldone_512_neon:
> +	ldp x19,x20,[x29,#16]
> +	add sp,sp,#128+64
> +	ldp x21,x22,[x29,#32]
> +	ldp x23,x24,[x29,#48]
> +	ldp x25,x26,[x29,#64]
> +	ldp x27,x28,[x29,#80]
> +	ldp x29,x30,[sp],#96
> +	ret
> +.size ChaCha20_512_neon,.-ChaCha20_512_neon
> --
> 2.19.0
>