The LSX implementation is 7% faster than the generic LoongArch
implementation in the vdso_test_getrandom bench-single test, and 21%
faster in the bench-multi test.

Signed-off-by: Xi Ruoyao <xry111@xxxxxxxxxxx>
---
 arch/loongarch/vdso/Makefile                |   4 +
 arch/loongarch/vdso/vgetrandom-chacha-lsx.S | 162 ++++++++++++++++++++
 arch/loongarch/vdso/vgetrandom-chacha.S     |  13 ++
 3 files changed, 179 insertions(+)
 create mode 100644 arch/loongarch/vdso/vgetrandom-chacha-lsx.S

diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile
index c8c5d9a7c80c..cab92c3a70a4 100644
--- a/arch/loongarch/vdso/Makefile
+++ b/arch/loongarch/vdso/Makefile
@@ -8,6 +8,10 @@ obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o sigreturn.o
 
 obj-vdso-$(CONFIG_VDSO_GETRANDOM) += vgetrandom.o vgetrandom-chacha.o memset.o
 
+ifdef CONFIG_CPU_HAS_LSX
+obj-vdso-$(CONFIG_VDSO_GETRANDOM) += vgetrandom-chacha-lsx.o
+endif
+
 # Common compiler flags between ABIs.
 ccflags-vdso := \
 	$(filter -I%,$(KBUILD_CFLAGS)) \
diff --git a/arch/loongarch/vdso/vgetrandom-chacha-lsx.S b/arch/loongarch/vdso/vgetrandom-chacha-lsx.S
new file mode 100644
index 000000000000..6d8c886d78c8
--- /dev/null
+++ b/arch/loongarch/vdso/vgetrandom-chacha-lsx.S
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Xi Ruoyao <xry111@xxxxxxxxxxx>. All Rights Reserved.
+ *
+ * Based on arch/x86/entry/vdso/vgetrandom-chacha.S:
+ *
+ * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights
+ * Reserved.
+ */
+
+#include <asm/asm.h>
+#include <asm/regdef.h>
+#include <linux/linkage.h>
+
+.section .rodata
+.align 4
+CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
+
+.text
+
+/*
+ * Loongson SIMD eXtension implementation of ChaCha20. Produces a given
+ * positive number of blocks of output with a nonce of 0, taking an input
+ * key and 8-byte counter. Importantly does not spill to the stack. Its
+ * arguments are:
+ *
+ * a0: output bytes
+ * a1: 32-byte key input
+ * a2: 8-byte counter input/output
+ * a3: number of 64-byte blocks to write to output
+ */
+SYM_FUNC_START(__arch_chacha20_blocks_nostack_lsx)
+#define output	a0
+#define key	a1
+#define counter	a2
+#define nblocks	a3
+#define i	t0
+/* LSX registers vr0-vr23 are caller-save.
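+ * Using only caller-save vector registers means nothing has to be
+ * saved and restored, and together with the scalar temporaries this
+ * keeps the function free of stack spills, as required above.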
+ */
+#define state0	$vr0
+#define state1	$vr1
+#define state2	$vr2
+#define state3	$vr3
+#define copy0	$vr4
+#define copy1	$vr5
+#define copy2	$vr6
+#define copy3	$vr7
+#define one	$vr8
+
+	/* copy0 = "expand 32-byte k" */
+	la.pcrel	t1, CONSTANTS
+	vld		copy0, t1, 0
+	/* copy1, copy2 = key */
+	vld		copy1, key, 0
+	vld		copy2, key, 0x10
+	/* copy3 = counter || zero nonce */
+	vldrepl.d	copy3, counter, 0
+	vinsgr2vr.d	copy3, zero, 1
+	/* one = 1 || 0 */
+	vldi		one, 0b0110000000001
+	vinsgr2vr.d	one, zero, 1
+
+.Lblock:
+	/* state = copy */
+	vori.b		state0, copy0, 0
+	vori.b		state1, copy1, 0
+	vori.b		state2, copy2, 0
+	vori.b		state3, copy3, 0
+
+	li.w		i, 10
+.Lpermute:
+	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+	vadd.w		state0, state0, state1
+	vxor.v		state3, state3, state0
+	vrotri.w	state3, state3, 16
+
+	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+	vadd.w		state2, state2, state3
+	vxor.v		state1, state1, state2
+	vrotri.w	state1, state1, 20
+
+	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+	vadd.w		state0, state0, state1
+	vxor.v		state3, state3, state0
+	vrotri.w	state3, state3, 24
+
+	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+	vadd.w		state2, state2, state3
+	vxor.v		state1, state1, state2
+	vrotri.w	state1, state1, 25
+
+	/* state1[0,1,2,3] = state1[1,2,3,0] */
+	vshuf4i.w	state1, state1, 0b00111001
+	/* state2[0,1,2,3] = state2[2,3,0,1] */
+	vshuf4i.w	state2, state2, 0b01001110
+	/* state3[0,1,2,3] = state3[3,0,1,2] */
+	vshuf4i.w	state3, state3, 0b10010011
+
+	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+	vadd.w		state0, state0, state1
+	vxor.v		state3, state3, state0
+	vrotri.w	state3, state3, 16
+
+	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+	vadd.w		state2, state2, state3
+	vxor.v		state1, state1, state2
+	vrotri.w	state1, state1, 20
+
+	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+	vadd.w		state0, state0, state1
+	vxor.v		state3, state3, state0
+	vrotri.w	state3, state3, 24
+
+	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+	vadd.w		state2, state2, state3
+	vxor.v		state1, state1, state2
+	vrotri.w	state1, state1, 25
+
+	/* state1[0,1,2,3] = state1[3,0,1,2] */
+	vshuf4i.w	state1, state1, 0b10010011
+	/* state2[0,1,2,3] = state2[2,3,0,1] */
+	vshuf4i.w	state2, state2, 0b01001110
+	/* state3[0,1,2,3] = state3[1,2,3,0] */
+	vshuf4i.w	state3, state3, 0b00111001
+
+	addi.w		i, i, -1
+	bnez		i, .Lpermute
+
+	/* output0 = state0 + copy0 */
+	vadd.w		state0, state0, copy0
+	vst		state0, output, 0
+	/* output1 = state1 + copy1 */
+	vadd.w		state1, state1, copy1
+	vst		state1, output, 0x10
+	/* output2 = state2 + copy2 */
+	vadd.w		state2, state2, copy2
+	vst		state2, output, 0x20
+	/* output3 = state3 + copy3 */
+	vadd.w		state3, state3, copy3
+	vst		state3, output, 0x30
+
+	/* ++copy3.counter */
+	vadd.d		copy3, copy3, one
+
+	/* output += 64 */
+	PTR_ADDI	output, output, 64
+	/* --nblocks */
+	PTR_ADDI	nblocks, nblocks, -1
+	bnez		nblocks, .Lblock
+
+	/* counter = copy3.counter */
+	vstelm.d	copy3, counter, 0, 0
+
+	/* Zero out the potentially sensitive regs, in case nothing uses these again.
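+	 * copy0 is the public "expand 32-byte k" constant and copy3 holds
+	 * only the block counter, which was just written back to the caller,
+	 * so neither of those needs clearing.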
+	 */
+	vldi		state0, 0
+	vldi		state1, 0
+	vldi		state2, 0
+	vldi		state3, 0
+	vldi		copy1, 0
+	vldi		copy2, 0
+
+	jr		ra
+SYM_FUNC_END(__arch_chacha20_blocks_nostack_lsx)
diff --git a/arch/loongarch/vdso/vgetrandom-chacha.S b/arch/loongarch/vdso/vgetrandom-chacha.S
index 2e42198f2faf..1931119e12a6 100644
--- a/arch/loongarch/vdso/vgetrandom-chacha.S
+++ b/arch/loongarch/vdso/vgetrandom-chacha.S
@@ -7,6 +7,11 @@
 #include <asm/regdef.h>
 #include <linux/linkage.h>
 
+#ifdef CONFIG_CPU_HAS_LSX
+# include <asm/alternative-asm.h>
+# include <asm/cpu.h>
+#endif
+
 .text
 
 /* Salsa20 quarter-round */
@@ -78,8 +83,16 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
 	 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
 	 * This does not violate the stack-less requirement: no sensitive data
 	 * is spilled onto the stack.
+	 *
+	 * Rewrite the very first instruction to jump to the LSX implementation
+	 * if LSX is available.
 	 */
+#ifdef CONFIG_CPU_HAS_LSX
+	ALTERNATIVE	__stringify(PTR_ADDI sp, sp, (-SZREG * 10) & STACK_ALIGN), \
+			"b __arch_chacha20_blocks_nostack_lsx", CPU_FEATURE_LSX
+#else
 	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
+#endif
 	REG_S		s0, sp, 0
 	REG_S		s1, sp, SZREG
 	REG_S		s2, sp, SZREG * 2
-- 
2.46.0
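
For anyone cross-checking the vector code against the reference
algorithm, here is a minimal C sketch (not part of the patch) of the
block-function contract implemented above. The function name, the QR
helper and the types are illustrative only:

#include <stdint.h>
#include <string.h>

#define ROTL32(v, n)	(((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha quarter-round; the LSX code runs four of these at once. */
#define QR(a, b, c, d) do {			\
	a += b; d = ROTL32(d ^ a, 16);		\
	c += d; b = ROTL32(b ^ c, 12);		\
	a += b; d = ROTL32(d ^ a, 8);		\
	c += d; b = ROTL32(b ^ c, 7);		\
} while (0)

static void chacha20_blocks_nostack_ref(uint8_t *output,
					const uint32_t key[8],
					uint64_t *counter, size_t nblocks)
{
	/* "expand 32-byte k", as in the CONSTANTS .octa above */
	static const uint32_t c[4] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};
	uint32_t x[16], copy[16];
	int i;

	memcpy(copy, c, sizeof(c));		/* row 0: constant (copy0) */
	memcpy(copy + 4, key, 32);		/* rows 1-2: key (copy1/copy2) */
	copy[12] = (uint32_t)*counter;		/* row 3: counter || 0 (copy3) */
	copy[13] = (uint32_t)(*counter >> 32);
	copy[14] = copy[15] = 0;		/* zero nonce */

	while (nblocks--) {
		memcpy(x, copy, sizeof(x));
		for (i = 0; i < 10; i++) {	/* 20 rounds = 10 double rounds */
			/* column round */
			QR(x[0], x[4], x[8],  x[12]);
			QR(x[1], x[5], x[9],  x[13]);
			QR(x[2], x[6], x[10], x[14]);
			QR(x[3], x[7], x[11], x[15]);
			/* diagonal round; the LSX code gets the same effect
			 * by rotating the rows with vshuf4i.w instead */
			QR(x[0], x[5], x[10], x[15]);
			QR(x[1], x[6], x[11], x[12]);
			QR(x[2], x[7], x[8],  x[13]);
			QR(x[3], x[4], x[9],  x[14]);
		}
		for (i = 0; i < 16; i++)
			x[i] += copy[i];
		memcpy(output, x, 64);	/* LoongArch is little-endian */
		output += 64;
		/* 64-bit counter increment, as vadd.d does on copy3 */
		if (!++copy[12])
			copy[13]++;
	}
	*counter = (uint64_t)copy[12] | ((uint64_t)copy[13] << 32);
}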