This MIPS32r2 implementation comes from René van Dorst and me and results in a nice speedup on the usual OpenWRT targets. Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx> Signed-off-by: René van Dorst <opensource@xxxxxxxxxx> Cc: Samuel Neves <sneves@xxxxxxxxx> Cc: Andy Lutomirski <luto@xxxxxxxxxx> Cc: Greg KH <gregkh@xxxxxxxxxxxxxxxxxxx> Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@xxxxxxxxx> Cc: Ralf Baechle <ralf@xxxxxxxxxxxxxx> Cc: Paul Burton <paul.burton@xxxxxxxx> Cc: James Hogan <jhogan@xxxxxxxxxx> Cc: linux-mips@xxxxxxxxxxxxxx --- lib/zinc/Makefile | 4 + lib/zinc/chacha20/chacha20-mips-glue.h | 28 ++ lib/zinc/chacha20/chacha20-mips.S | 474 +++++++++++++++++++++++++ 3 files changed, 506 insertions(+) create mode 100644 lib/zinc/chacha20/chacha20-mips-glue.h create mode 100644 lib/zinc/chacha20/chacha20-mips.S diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile index 32e4bd94ea0b..9f6a5e65d729 100644 --- a/lib/zinc/Makefile +++ b/lib/zinc/Makefile @@ -17,6 +17,10 @@ ifeq ($(CONFIG_ZINC_ARCH_ARM64),y) zinc-y += chacha20/chacha20-arm64.o CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h endif +ifeq ($(CONFIG_ZINC_ARCH_MIPS)$(CONFIG_CPU_MIPS32_R2),yy) +zinc-y += chacha20/chacha20-mips.o +CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-mips-glue.h +endif endif zinc-y += main.o diff --git a/lib/zinc/chacha20/chacha20-mips-glue.h b/lib/zinc/chacha20/chacha20-mips-glue.h new file mode 100644 index 000000000000..5b2c8cec36c8 --- /dev/null +++ b/lib/zinc/chacha20/chacha20-mips-glue.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved. + */ + +#include <zinc/chacha20.h> + +asmlinkage void chacha20_mips(u8 *out, const u8 *in, const size_t len, + const u32 key[8], const u32 counter[4]); +void __init chacha20_fpu_init(void) +{ +} + +static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len, + const u32 key[8], const u32 counter[4], + simd_context_t simd_context) +{ + chacha20_mips(dst, src, len, key, counter); + return true; +} + +static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce, + const u8 *key, simd_context_t simd_context) +{ + return false; +} + +#define HAVE_CHACHA20_ARCH_IMPLEMENTATION diff --git a/lib/zinc/chacha20/chacha20-mips.S b/lib/zinc/chacha20/chacha20-mips.S new file mode 100644 index 000000000000..77da2c2fb240 --- /dev/null +++ b/lib/zinc/chacha20/chacha20-mips.S @@ -0,0 +1,474 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2016-2018 René van Dorst <opensource@xxxxxxxxxx>. All Rights Reserved. + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved. + */ + +#define MASK_U32 0x3c +#define MASK_BYTES 0x03 +#define CHACHA20_BLOCK_SIZE 64 +#define STACK_SIZE 4*16 + +#define X0 $t0 +#define X1 $t1 +#define X2 $t2 +#define X3 $t3 +#define X4 $t4 +#define X5 $t5 +#define X6 $t6 +#define X7 $t7 +#define X8 $v1 +#define X9 $fp +#define X10 $s7 +#define X11 $s6 +#define X12 $s5 +#define X13 $s4 +#define X14 $s3 +#define X15 $s2 +/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ +#define T0 $s1 +#define T1 $s0 +#define T(n) T ## n +#define X(n) X ## n + +/* Input arguments */ +#define OUT $a0 +#define IN $a1 +#define BYTES $a2 +/* KEY and NONCE argument must be u32 aligned */ +#define KEY $a3 +/* NONCE pointer is given via stack */ +#define NONCE $t9 + +/* Output argument */ +/* NONCE[0] is kept in a register and not in memory. + * We don't want to touch original value in memory. + * Must be incremented every loop iteration. + */ +#define NONCE_0 $v0 + +/* SAVED_X and SAVED_CA are set in the jump table. + * Use regs which are overwritten on exit else we don't leak clear data. + * They are used to handling the last bytes which are not multiple of 4. + */ +#define SAVED_X X15 +#define SAVED_CA $ra + +#define PTR_LAST_ROUND $t8 + +/* ChaCha20 constants and stack location */ +#define CONSTANT_OFS_SP 48 +#define UNALIGNED_OFS_SP 40 + +#define CONSTANT_1 0x61707865 +#define CONSTANT_2 0x3320646e +#define CONSTANT_3 0x79622d32 +#define CONSTANT_4 0x6b206574 + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define MSB 0 +#define LSB 3 +#define ROTx rotl +#define ROTR(n) rotr n, 24 +#define CPU_TO_LE32(n) \ + wsbh n; \ + rotr n, 16; +#else +#define MSB 3 +#define LSB 0 +#define ROTx rotr +#define CPU_TO_LE32(n) +#define ROTR(n) +#endif + +#define STORE_UNALIGNED(x, a, s, o) \ +.Lchacha20_mips_xor_unaligned_ ## x ## _b: ; \ + .if ((s != NONCE) || (o != 0)); \ + lw T0, o(s); \ + .endif; \ + lwl T1, x-4+MSB ## (IN); \ + lwr T1, x-4+LSB ## (IN); \ + .if ((s == NONCE) && (o == 0)); \ + addu X ## a, NONCE_0; \ + .else; \ + addu X ## a, T0; \ + .endif; \ + CPU_TO_LE32(X ## a); \ + xor X ## a, T1; \ + swl X ## a, x-4+MSB ## (OUT); \ + swr X ## a, x-4+LSB ## (OUT); + +#define STORE_ALIGNED(x, a, s, o) \ +.Lchacha20_mips_xor_aligned_ ## x ## _b: ; \ + .if ((s != NONCE) || (o != 0)); \ + lw T0, o(s); \ + .endif; \ + lw T1, x-4 ## (IN); \ + .if ((s == NONCE) && (o == 0)); \ + addu X ## a, NONCE_0; \ + .else; \ + addu X ## a, T0; \ + .endif; \ + CPU_TO_LE32(X ## a); \ + xor X ## a, T1; \ + sw X ## a, x-4 ## (OUT); + +/* Jump table macro. + * Used for setup and handling the last bytes, which are not multiple of 4. + * X15 is free to store Xn + * Every jumptable entry must be equal in size. + */ +#define JMPTBL_ALIGNED(x, a, s, o) \ +.Lchacha20_mips_jmptbl_aligned_ ## a: ; \ + .if ((s == NONCE) && (o == 0)); \ + move SAVED_CA, NONCE_0; \ + .else; \ + lw SAVED_CA, o(s);\ + .endif; \ + b .Lchacha20_mips_xor_aligned_ ## x ## _b; \ + move SAVED_X, X ## a; + +#define JMPTBL_UNALIGNED(x, a, s, o) \ +.Lchacha20_mips_jmptbl_unaligned_ ## a: ; \ + .if ((s == NONCE) && (o == 0)); \ + move SAVED_CA, NONCE_0; \ + .else; \ + lw SAVED_CA, o(s);\ + .endif; \ + b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \ + move SAVED_X, X ## a; + +#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ + addu X(A), X(K); \ + addu X(B), X(L); \ + addu X(C), X(M); \ + addu X(D), X(N); \ + xor X(V), X(A); \ + xor X(W), X(B); \ + xor X(Y), X(C); \ + xor X(Z), X(D); \ + rotl X(V), S; \ + rotl X(W), S; \ + rotl X(Y), S; \ + rotl X(Z), S; + +.text +.set reorder +.set noat +.globl chacha20_mips +.ent chacha20_mips +chacha20_mips: + .frame $sp, STACK_SIZE, $ra + /* This is in the fifth argument */ + lw NONCE, 16($sp) + + /* Return bytes = 0. */ + .set noreorder + beqz BYTES, .Lchacha20_mips_end + addiu $sp, -STACK_SIZE + .set reorder + + /* Calculate PTR_LAST_ROUND */ + addiu PTR_LAST_ROUND, BYTES, -1 + ins PTR_LAST_ROUND, $zero, 0, 6 + addu PTR_LAST_ROUND, OUT + + /* Save s0-s7, fp, ra. */ + sw $ra, 0($sp) + sw $fp, 4($sp) + sw $s0, 8($sp) + sw $s1, 12($sp) + sw $s2, 16($sp) + sw $s3, 20($sp) + sw $s4, 24($sp) + sw $s5, 28($sp) + sw $s6, 32($sp) + sw $s7, 36($sp) + + lw NONCE_0, 0(NONCE) + /* Test IN or OUT is unaligned. + * UNALIGNED (T1) = ( IN | OUT ) & 0x00000003 + */ + or T1, IN, OUT + andi T1, 0x3 + + /* Load constant */ + lui X0, %hi(CONSTANT_1) + lui X1, %hi(CONSTANT_2) + lui X2, %hi(CONSTANT_3) + lui X3, %hi(CONSTANT_4) + ori X0, %lo(CONSTANT_1) + ori X1, %lo(CONSTANT_2) + ori X2, %lo(CONSTANT_3) + ori X3, %lo(CONSTANT_4) + + /* Store constant on stack. */ + sw X0, 0+CONSTANT_OFS_SP($sp) + sw X1, 4+CONSTANT_OFS_SP($sp) + sw X2, 8+CONSTANT_OFS_SP($sp) + sw X3, 12+CONSTANT_OFS_SP($sp) + + sw T1, UNALIGNED_OFS_SP($sp) + + .set noreorder + b .Lchacha20_rounds_start + andi BYTES, (CHACHA20_BLOCK_SIZE-1) + .set reorder + +.align 4 +.Loop_chacha20_rounds: + addiu IN, CHACHA20_BLOCK_SIZE + addiu OUT, CHACHA20_BLOCK_SIZE + addiu NONCE_0, 1 + + lw X0, 0+CONSTANT_OFS_SP($sp) + lw X1, 4+CONSTANT_OFS_SP($sp) + lw X2, 8+CONSTANT_OFS_SP($sp) + lw X3, 12+CONSTANT_OFS_SP($sp) + lw T1, UNALIGNED_OFS_SP($sp) + +.Lchacha20_rounds_start: + lw X4, 0(KEY) + lw X5, 4(KEY) + lw X6, 8(KEY) + lw X7, 12(KEY) + lw X8, 16(KEY) + lw X9, 20(KEY) + lw X10, 24(KEY) + lw X11, 28(KEY) + + move X12, NONCE_0 + lw X13, 4(NONCE) + lw X14, 8(NONCE) + lw X15, 12(NONCE) + + li $at, 9 +.Loop_chacha20_xor_rounds: + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + .set noreorder + bnez $at, .Loop_chacha20_xor_rounds + addiu $at, -1 + + /* Unaligned? Jump */ + bnez T1, .Loop_chacha20_unaligned + andi $at, BYTES, MASK_U32 + + /* Last round? No jump */ + bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_aligned_64_b + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0) + + /* Full block? Jump */ + beqz BYTES, .Lchacha20_mips_xor_aligned_64_b + /* Calculate lower half jump table addr and offset */ + ins T0, $at, 2, 6 + + subu T0, $at + addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0) + + jr T0 + /* Delay slot */ + nop + + .set reorder + +.Loop_chacha20_unaligned: + .set noreorder + + /* Last round? no jump */ + bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_unaligned_64_b + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0) + + /* Full block? Jump */ + beqz BYTES, .Lchacha20_mips_xor_unaligned_64_b + + /* Calculate lower half jump table addr and offset */ + ins T0, $at, 2, 6 + subu T0, $at + addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0) + + jr T0 + /* Delay slot */ + nop + + .set reorder + +/* Aligned code path + */ +.align 4 + STORE_ALIGNED(64, 15, NONCE,12) + STORE_ALIGNED(60, 14, NONCE, 8) + STORE_ALIGNED(56, 13, NONCE, 4) + STORE_ALIGNED(52, 12, NONCE, 0) + STORE_ALIGNED(48, 11, KEY, 28) + STORE_ALIGNED(44, 10, KEY, 24) + STORE_ALIGNED(40, 9, KEY, 20) + STORE_ALIGNED(36, 8, KEY, 16) + STORE_ALIGNED(32, 7, KEY, 12) + STORE_ALIGNED(28, 6, KEY, 8) + STORE_ALIGNED(24, 5, KEY, 4) + STORE_ALIGNED(20, 4, KEY, 0) + STORE_ALIGNED(16, 3, $sp, 12+CONSTANT_OFS_SP) + STORE_ALIGNED(12, 2, $sp, 8+CONSTANT_OFS_SP) + STORE_ALIGNED( 8, 1, $sp, 4+CONSTANT_OFS_SP) +.Lchacha20_mips_xor_aligned_4_b: + /* STORE_ALIGNED( 4, 0, $sp, 0+CONSTANT_OFS_SP) */ + lw T0, 0+CONSTANT_OFS_SP($sp) + lw T1, 0(IN) + addu X0, T0 + CPU_TO_LE32(X0) + xor X0, T1 + .set noreorder + bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds + sw X0, 0(OUT) + .set reorder + + .set noreorder + bne $at, BYTES, .Lchacha20_mips_xor_bytes + /* Empty delayslot, Increase NONCE_0, return NONCE_0 value */ + addiu NONCE_0, 1 + .set noreorder + +.Lchacha20_mips_xor_done: + /* Restore used registers */ + lw $ra, 0($sp) + lw $fp, 4($sp) + lw $s0, 8($sp) + lw $s1, 12($sp) + lw $s2, 16($sp) + lw $s3, 20($sp) + lw $s4, 24($sp) + lw $s5, 28($sp) + lw $s6, 32($sp) + lw $s7, 36($sp) +.Lchacha20_mips_end: + .set noreorder + jr $ra + addiu $sp, STACK_SIZE + .set reorder + + .set noreorder + /* Start jump table */ + JMPTBL_ALIGNED( 0, 0, $sp, 0+CONSTANT_OFS_SP) + JMPTBL_ALIGNED( 4, 1, $sp, 4+CONSTANT_OFS_SP) + JMPTBL_ALIGNED( 8, 2, $sp, 8+CONSTANT_OFS_SP) + JMPTBL_ALIGNED(12, 3, $sp, 12+CONSTANT_OFS_SP) + JMPTBL_ALIGNED(16, 4, KEY, 0) + JMPTBL_ALIGNED(20, 5, KEY, 4) + JMPTBL_ALIGNED(24, 6, KEY, 8) + JMPTBL_ALIGNED(28, 7, KEY, 12) + JMPTBL_ALIGNED(32, 8, KEY, 16) + JMPTBL_ALIGNED(36, 9, KEY, 20) + JMPTBL_ALIGNED(40, 10, KEY, 24) + JMPTBL_ALIGNED(44, 11, KEY, 28) + JMPTBL_ALIGNED(48, 12, NONCE, 0) + JMPTBL_ALIGNED(52, 13, NONCE, 4) + JMPTBL_ALIGNED(56, 14, NONCE, 8) + JMPTBL_ALIGNED(60, 15, NONCE,12) + /* End jump table */ + .set reorder + +/* Unaligned code path + */ + STORE_UNALIGNED(64, 15, NONCE,12) + STORE_UNALIGNED(60, 14, NONCE, 8) + STORE_UNALIGNED(56, 13, NONCE, 4) + STORE_UNALIGNED(52, 12, NONCE, 0) + STORE_UNALIGNED(48, 11, KEY, 28) + STORE_UNALIGNED(44, 10, KEY, 24) + STORE_UNALIGNED(40, 9, KEY, 20) + STORE_UNALIGNED(36, 8, KEY, 16) + STORE_UNALIGNED(32, 7, KEY, 12) + STORE_UNALIGNED(28, 6, KEY, 8) + STORE_UNALIGNED(24, 5, KEY, 4) + STORE_UNALIGNED(20, 4, KEY, 0) + STORE_UNALIGNED(16, 3, $sp, 12+CONSTANT_OFS_SP) + STORE_UNALIGNED(12, 2, $sp, 8+CONSTANT_OFS_SP) + STORE_UNALIGNED( 8, 1, $sp, 4+CONSTANT_OFS_SP) +.Lchacha20_mips_xor_unaligned_4_b: + /* STORE_UNALIGNED( 4, 0, $sp, 0+CONSTANT_OFS_SP) */ + lw T0, 0+CONSTANT_OFS_SP($sp) + lwl T1, 0+MSB(IN) + lwr T1, 0+LSB(IN) + addu X0, T0 + CPU_TO_LE32(X0) + xor X0, T1 + swl X0, 0+MSB(OUT) + .set noreorder + bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds + swr X0, 0+LSB(OUT) + .set reorder + + /* Fall through to byte handling */ + .set noreorder + beq $at, BYTES, .Lchacha20_mips_xor_done + /* Empty delayslot, increase NONCE_0, return NONCE_0 value */ +.Lchacha20_mips_xor_unaligned_0_b: +.Lchacha20_mips_xor_aligned_0_b: + addiu NONCE_0, 1 + .set reorder + +.Lchacha20_mips_xor_bytes: + addu OUT, $at + addu IN, $at + addu SAVED_X, SAVED_CA + /* First byte */ + lbu T1, 0(IN) + andi $at, BYTES, 2 + CPU_TO_LE32(SAVED_X) + ROTR(SAVED_X) + xor T1, SAVED_X + .set noreorder + beqz $at, .Lchacha20_mips_xor_done + sb T1, 0(OUT) + .set reorder + /* Second byte */ + lbu T1, 1(IN) + andi $at, BYTES, 1 + ROTx SAVED_X, 8 + xor T1, SAVED_X + .set noreorder + beqz $at, .Lchacha20_mips_xor_done + sb T1, 1(OUT) + .set reorder + /* Third byte */ + lbu T1, 2(IN) + ROTx SAVED_X, 8 + xor T1, SAVED_X + .set noreorder + b .Lchacha20_mips_xor_done + sb T1, 2(OUT) + .set reorder +.set noreorder + +.Lchacha20_mips_jmptbl_unaligned: + /* Start jump table */ + JMPTBL_UNALIGNED( 0, 0, $sp, 0+CONSTANT_OFS_SP) + JMPTBL_UNALIGNED( 4, 1, $sp, 4+CONSTANT_OFS_SP) + JMPTBL_UNALIGNED( 8, 2, $sp, 8+CONSTANT_OFS_SP) + JMPTBL_UNALIGNED(12, 3, $sp, 12+CONSTANT_OFS_SP) + JMPTBL_UNALIGNED(16, 4, KEY, 0) + JMPTBL_UNALIGNED(20, 5, KEY, 4) + JMPTBL_UNALIGNED(24, 6, KEY, 8) + JMPTBL_UNALIGNED(28, 7, KEY, 12) + JMPTBL_UNALIGNED(32, 8, KEY, 16) + JMPTBL_UNALIGNED(36, 9, KEY, 20) + JMPTBL_UNALIGNED(40, 10, KEY, 24) + JMPTBL_UNALIGNED(44, 11, KEY, 28) + JMPTBL_UNALIGNED(48, 12, NONCE, 0) + JMPTBL_UNALIGNED(52, 13, NONCE, 4) + JMPTBL_UNALIGNED(56, 14, NONCE, 8) + JMPTBL_UNALIGNED(60, 15, NONCE,12) + /* End jump table */ +.set reorder + +.end chacha20_mips +.set at -- 2.19.0