Hi Xi, Le 01/09/2024 à 08:13, Xi Ruoyao a écrit :
Hook up the generic vDSO implementation to the LoongArch vDSO data page by providing the required __arch_chacha20_blocks_nostack, __arch_get_k_vdso_rng_data, and getrandom_syscall implementations. Signed-off-by: Xi Ruoyao <xry111@xxxxxxxxxxx> ---
...
diff --git a/arch/loongarch/vdso/vgetrandom-chacha.S b/arch/loongarch/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..7e86a50f6e85 --- /dev/null +++ b/arch/loongarch/vdso/vgetrandom-chacha.S @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Xi Ruoyao <xry111@xxxxxxxxxxx>. All Rights Reserved. + */ + +#include <asm/asm.h> +#include <asm/regdef.h> +#include <linux/linkage.h> + +.text + +/* Salsa20 quarter-round */ +.macro QR a b c d + add.w \a, \a, \b + xor \d, \d, \a + rotri.w \d, \d, 16 + + add.w \c, \c, \d + xor \b, \b, \c + rotri.w \b, \b, 20 + + add.w \a, \a, \b + xor \d, \d, \a + rotri.w \d, \d, 24 + + add.w \c, \c, \d + xor \b, \b, \c + rotri.w \b, \b, 25 +.endm +
I know nothing about LoongArch assembly or its execution performance, but I see that GCC groups the operations by 4 when building reference_chacha20_blocks() from vdso_test_chacha; see the disassembly below.
Shouldn't you do the same and group the quarter-rounds by 4 (interleaving the same step of four independent quarter-rounds), just like I did on powerpc? (https://github.com/torvalds/linux/blob/master/arch/powerpc/kernel/vdso/vgetrandom-chacha.S)
0000000000000134 <.L3>: 134: 001061d8 add.w $s1, $t2, $s1 138: 0015c312 xor $t6, $s1, $t4 13c: 26000070 ldptr.d $t4, $sp, 0 140: 001036d6 add.w $fp, $fp, $t1 144: 001065f9 add.w $s2, $t3, $s2 148: 0010335a add.w $s3, $s3, $t0 14c: 00159ad3 xor $t7, $fp, $a2 150: 0015c344 xor $a0, $s3, $t4 154: 0015c731 xor $t5, $s2, $t5 158: 004cc273 rotri.w $t7, $t7, 0x10 15c: 004cc252 rotri.w $t6, $t6, 0x10 160: 004cc231 rotri.w $t5, $t5, 0x10 164: 004cc084 rotri.w $a0, $a0, 0x10 168: 00104766 add.w $a2, $s4, $t5 16c: 00102088 add.w $a4, $a0, $a4 170: 00102669 add.w $a5, $t7, $a5 174: 001048e7 add.w $a3, $a3, $t6 178: 0015b530 xor $t4, $a5, $t1 17c: 0015b10c xor $t0, $a4, $t0 180: 0015b8ee xor $t2, $a3, $t2 184: 0015bccf xor $t3, $a2, $t3 188: 004cd18d rotri.w $t1, $t0, 0x14 18c: 004cd210 rotri.w $t4, $t4, 0x14 190: 004cd1ce rotri.w $t2, $t2, 0x14 194: 004cd1ef rotri.w $t3, $t3, 0x14 198: 001042d6 add.w $fp, $fp, $t4 19c: 00103b18 add.w $s1, $s1, $t2 1a0: 00103f39 add.w $s2, $s2, $t3 1a4: 0010375a add.w $s3, $s3, $t1 1a8: 0015ced3 xor $t7, $fp, $t7 1ac: 0015cb12 xor $t6, $s1, $t6 1b0: 0015c731 xor $t5, $s2, $t5 1b4: 00159344 xor $a0, $s3, $a0 1b8: 004ce274 rotri.w $t8, $t7, 0x18 1bc: 004ce084 rotri.w $a0, $a0, 0x18 1c0: 004ce253 rotri.w $t7, $t6, 0x18 1c4: 004ce232 rotri.w $t6, $t5, 0x18 1c8: 00105129 add.w $a5, $a5, $t8 1cc: 00101111 add.w $t5, $a4, $a0 1d0: 00104ce7 add.w $a3, $a3, $t7 1d4: 001048c6 add.w $a2, $a2, $t6 1d8: 0015c130 xor $t4, $a5, $t4 1dc: 0015b8ee xor $t2, $a3, $t2 1e0: 0015bccf xor $t3, $a2, $t3 1e4: 0015b62d xor $t1, $t5, $t1 1e8: 004ce610 rotri.w $t4, $t4, 0x19 1ec: 004ce5ce rotri.w $t2, $t2, 0x19 1f0: 004ce5ef rotri.w $t3, $t3, 0x19 1f4: 004ce5ad rotri.w $t1, $t1, 0x19 1f8: 00103ad6 add.w $fp, $fp, $t2 1fc: 00103f18 add.w $s1, $s1, $t3 200: 00103739 add.w $s2, $s2, $t1 204: 0010435a add.w $s3, $s3, $t4 208: 001592c4 xor $a0, $fp, $a0 20c: 0015d314 xor $t8, $s1, $t8 210: 0015cf33 xor $t7, $s2, $t7 214: 0015cb52 xor $t6, $s3, $t6 218: 004cc084 rotri.w $a0, 
$a0, 0x10 21c: 004cc294 rotri.w $t8, $t8, 0x10 220: 004cc273 rotri.w $t7, $t7, 0x10 224: 004cc252 rotri.w $t6, $t6, 0x10 228: 001010dc add.w $s5, $a2, $a0 22c: 0010523d add.w $s6, $t5, $t8 230: 00104d3e add.w $s7, $a5, $t7 234: 001048ff add.w $s8, $a3, $t6 238: 0015c3ec xor $t0, $s8, $t4 23c: 0015bb8e xor $t2, $s5, $t2 240: 0015bfaf xor $t3, $s6, $t3 244: 0015b7cd xor $t1, $s7, $t1 248: 004cd1ad rotri.w $t1, $t1, 0x14 24c: 004cd18c rotri.w $t0, $t0, 0x14 250: 004cd1ce rotri.w $t2, $t2, 0x14 254: 004cd1ef rotri.w $t3, $t3, 0x14 258: 00103ad7 add.w $s0, $fp, $t2 25c: 00103f0a add.w $a6, $s1, $t3 260: 0010372b add.w $a7, $s2, $t1 264: 00103341 add.w $ra, $s3, $t0 268: 001592e4 xor $a0, $s0, $a0 26c: 0015d154 xor $t8, $a6, $t8 270: 0015cd73 xor $t7, $a7, $t7 274: 0015c832 xor $t6, $ra, $t6 278: 004ce084 rotri.w $a0, $a0, 0x18 27c: 004ce294 rotri.w $t8, $t8, 0x18 280: 004ce273 rotri.w $t7, $t7, 0x18 284: 004ce252 rotri.w $t6, $t6, 0x18 288: 0010139c add.w $s5, $s5, $a0 28c: 001053bd add.w $s6, $s6, $t8 290: 00104fde add.w $s7, $s7, $t7 294: 00104bff add.w $s8, $s8, $t6 298: 0015b7d1 xor $t5, $s7, $t1 29c: 0015bb8e xor $t2, $s5, $t2 2a0: 0015b3ed xor $t1, $s8, $t0 2a4: 0015bfaf xor $t3, $s6, $t3 2a8: 0040808c slli.w $t0, $a0, 0x0 2ac: 004ce631 rotri.w $t5, $t5, 0x19 2b0: 004ce5ce rotri.w $t2, $t2, 0x19 2b4: 004ce5ef rotri.w $t3, $t3, 0x19 2b8: 004ce5ad rotri.w $t1, $t1, 0x19 2bc: 2700006c stptr.d $t0, $sp, 0 2c0: 02bffca5 addi.w $a1, $a1, -1(0xfff) 2c4: 0040822c slli.w $t0, $t5, 0x0 2c8: 004082f6 slli.w $fp, $s0, 0x0 2cc: 0040839b slli.w $s4, $s5, 0x0 2d0: 004081ce slli.w $t2, $t2, 0x0 2d4: 00408158 slli.w $s1, $a6, 0x0 2d8: 00408286 slli.w $a2, $t8, 0x0 2dc: 004083a8 slli.w $a4, $s6, 0x0 2e0: 004081ef slli.w $t3, $t3, 0x0 2e4: 00408179 slli.w $s2, $a7, 0x0 2e8: 00408270 slli.w $t4, $t7, 0x0 2ec: 004083c9 slli.w $a5, $s7, 0x0 2f0: 0040803a slli.w $s3, $ra, 0x0 2f4: 00408251 slli.w $t5, $t6, 0x0 2f8: 004083e7 slli.w $a3, $s8, 0x0 2fc: 004081ad slli.w $t1, $t1, 0x0 300: 
47fe34bf bnez $a1, -460(0x7ffe34) # 134 <.L3> Christophe