On Mon, Oct 07, 2019 at 06:45:48PM +0200, Ard Biesheuvel wrote:
> diff --git a/arch/arm/crypto/chacha-scalar-core.S b/arch/arm/crypto/chacha-scalar-core.S
> index 2140319b64a0..0970ae107590 100644
> --- a/arch/arm/crypto/chacha-scalar-core.S
> +++ b/arch/arm/crypto/chacha-scalar-core.S
> @@ -41,14 +41,6 @@
>  	X14 .req r12
>  	X15 .req r14
>
> -.Lexpand_32byte_k:
> -	// "expand 32-byte k"
> -	.word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
> -
> -#ifdef __thumb2__
> -# define adrl adr
> -#endif
> -
>  .macro __rev out, in, t0, t1, t2
>  .if __LINUX_ARM_ARCH__ >= 6
>  	rev \out, \in
> @@ -391,61 +383,65 @@
>  .endm // _chacha
>
>  /*
> - * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
> - *                   const u32 iv[4]);
> + * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
> + *                   u32 *state, int nrounds);
>   */
> -ENTRY(chacha20_arm)
> +ENTRY(chacha_doarm)
>  	cmp r2, #0 // len == 0?
>  	reteq lr
>
> +	ldr ip, [sp]
> +	cmp ip, #12
> +
>  	push {r0-r2,r4-r11,lr}
>
>  	// Push state x0-x15 onto stack.
>  	// Also store an extra copy of x10-x11 just before the state.
>
> -	ldr r4, [sp, #48] // iv
> -	mov r0, sp
> -	sub sp, #80
> -
> -	// iv: x12-x15
> -	ldm r4, {X12,X13,X14,X15}
> -	stmdb r0!, {X12,X13,X14,X15}
> +	add X12, r3, #48
> +	ldm X12, {X12,X13,X14,X15}
> +	push {X12,X13,X14,X15}
> +	sub sp, sp, #64
>
> -	// key: x4-x11
> -	__ldrd X8_X10, X9_X11, r3, 24
> +	__ldrd X8_X10, X9_X11, r3, 40
>  	__strd X8_X10, X9_X11, sp, 8
> -	stmdb r0!, {X8_X10, X9_X11}
> -	ldm r3, {X4-X9_X11}
> -	stmdb r0!, {X4-X9_X11}
> -
> -	// constants: x0-x3
> -	adrl X3, .Lexpand_32byte_k
> -	ldm X3, {X0-X3}
> +	__strd X8_X10, X9_X11, sp, 56
> +	ldm r3, {X0-X9_X11}
>  	__strd X0, X1, sp, 16
>  	__strd X2, X3, sp, 24
> +	__strd X4, X5, sp, 32
> +	__strd X6, X7, sp, 40
> +	__strd X8_X10, X9_X11, sp, 48
>
> +	beq 1f
>  	_chacha 20
>
> -	add sp, #76
> +0:	add sp, #76
>  	pop {r4-r11, pc}
> -ENDPROC(chacha20_arm)
> +
> +1:	_chacha 12
> +	b 0b
> +ENDPROC(chacha_doarm)
>
>  /*
> - * void hchacha20_arm(const u32 state[16], u32 out[8]);
> + * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
>   */
> -ENTRY(hchacha20_arm)
> +ENTRY(hchacha_block_arm)
>  	push {r1,r4-r11,lr}
>
> +	cmp r2, #12 // ChaCha12 ?
> +
>  	mov r14, r0
>  	ldmia r14!, {r0-r11} // load x0-x11
>  	push {r10-r11} // store x10-x11 to stack
>  	ldm r14, {r10-r12,r14} // load x12-x15
>  	sub sp, #8
>
> +	beq 1f
>  	_chacha_permute 20
>
>  	// Skip over (unused0-unused1, x10-x11)
> -	add sp, #16
> +0:	add sp, #16
>
>  	// Fix up rotations of x12-x15
>  	ror X12, X12, #drot
> @@ -458,4 +454,7 @@ ENTRY(hchacha20_arm)
>  	stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
>
>  	pop {r4-r11,pc}
> -ENDPROC(hchacha20_arm)
> +
> +1:	_chacha_permute 12
> +	b 0b
> +ENDPROC(hchacha_block_arm)
> --

FYI, I've also had a version of this code supporting both the 12- and 20-round
variants sitting around here:

https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git/commit/?h=chacha-arm-scalar&id=fc51d8012742f591da3204b327a865f6109d472a

I'll take a closer look at this later, but you might want to take a quick look
at what I did, just in case I happened to do anything in a better way.

- Eric