Hi Adhemerval,

I have just a couple more points below, on the BE handling in the asm.

On Mon, 2 Sept 2024 at 18:19, Adhemerval Zanella
<adhemerval.zanella@xxxxxxxxxx> wrote:
>
> Hook up the generic vDSO implementation to the aarch64 vDSO data page.
> The data required by _vdso_rng_data is placed within the _vdso_data vvar
> page, by using an offset larger than the vdso_data.
>
> The vDSO function requires a ChaCha20 implementation that does not write
> to the stack, and that can do an entire ChaCha20 permutation. The one
> provided uses NEON on the permute operation, with a fallback to the
> syscall for chips that do not support AdvSIMD.
>
> This also passes the vdso_test_chacha test along with
> vdso_test_getrandom. The vdso_test_getrandom bench-single result on
> Neoverse-N1 shows:
>
>    vdso: 25000000 times in 0.783884250 seconds
>    libc: 25000000 times in 8.780275399 seconds
> syscall: 25000000 times in 8.786581518 seconds
>
> A small fixup to arch/arm64/include/asm/mman.h was required to avoid
> pulling kernel code into the vDSO, similar to what's already done in
> arch/arm64/include/asm/rwonce.h.
>
> Signed-off-by: Adhemerval Zanella <adhemerval.zanella@xxxxxxxxxx>
> ---
>  arch/arm64/Kconfig                         |   1 +
>  arch/arm64/include/asm/mman.h              |   6 +-
>  arch/arm64/include/asm/vdso.h              |   6 +
>  arch/arm64/include/asm/vdso/getrandom.h    |  50 ++++++
>  arch/arm64/include/asm/vdso/vsyscall.h     |  10 ++
>  arch/arm64/kernel/vdso.c                   |   6 -
>  arch/arm64/kernel/vdso/Makefile            |  25 ++-
>  arch/arm64/kernel/vdso/vdso                |   1 +
>  arch/arm64/kernel/vdso/vdso.lds.S          |   4 +
>  arch/arm64/kernel/vdso/vgetrandom-chacha.S | 178 +++++++++++++++++++++
>  arch/arm64/kernel/vdso/vgetrandom.c        |  15 ++
>  tools/arch/arm64/vdso                      |   1 +
>  tools/include/linux/compiler.h             |   4 +
>  tools/testing/selftests/vDSO/Makefile      |   3 +-
>  14 files changed, 294 insertions(+), 16 deletions(-)
>  create mode 100644 arch/arm64/include/asm/vdso/getrandom.h
>  create mode 120000 arch/arm64/kernel/vdso/vdso
>  create mode 100644 arch/arm64/kernel/vdso/vgetrandom-chacha.S
>  create mode 100644 arch/arm64/kernel/vdso/vgetrandom.c
>  create mode 120000 tools/arch/arm64/vdso
> ...
> diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> new file mode 100644
> index 000000000000..4e5f9c349522
> --- /dev/null
> +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> @@ -0,0 +1,178 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/linkage.h>
> +#include <asm/cache.h>
> +#include <asm/assembler.h>
> +
> +	.text
> +
> +#define state0		v0
> +#define state1		v1
> +#define state2		v2
> +#define state3		v3
> +#define copy0		v4
> +#define copy0_q		q4
> +#define copy1		v5
> +#define copy2		v6
> +#define copy3		v7
> +#define copy3_d		d7
> +#define one_d		d16
> +#define one_q		q16
> +#define one_v		v16
> +#define tmp		v17
> +#define rot8		v18
> +
> +/*
> + * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
> + * number of blocks of output with nonce 0, taking an input key and an
> + * 8-byte counter. Importantly, it does not spill to the stack.
> + *
> + * This implementation avoids d8-d15 because they are callee-save in user
> + * space.
> + *
> + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
> + *				       const uint8_t *key,
> + *				       uint32_t *counter,
> + *				       size_t nblocks)
> + *
> + *	x0: output bytes
> + *	x1: 32-byte key input
> + *	x2: 8-byte counter input/output
> + *	x3: number of 64-byte blocks to write to output
> + */
> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> +
> +	/* copy0 = "expand 32-byte k" */
> +	mov_q		x8, 0x3320646e61707865
> +	mov_q		x9, 0x6b20657479622d32
> +	mov		copy0.d[0], x8
> +	mov		copy0.d[1], x9
> +
> +	/* copy1,copy2 = key */
> +	ld1		{ copy1.4s, copy2.4s }, [x1]
> +	/* copy3 = counter || zero nonce */
> +	ldr		copy3_d, [x2]
> +CPU_BE(	rev64		copy3.4s, copy3.4s)
> +

This loads 2 u32s as a single u64, and then swaps them if we are
running on BE. So better to just use ld1 {copy3.2s}, [x2] here, and
drop the CPU_BE() special case.
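To spell out the byte-order reasoning in plain C (just a userspace
sketch for illustration, not something for the patch; the counter/merged
names are made up):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/*
	 * Two native-endian counter words, as seen through the
	 * uint32_t *counter argument.
	 */
	uint32_t counter[2] = { 0x00000001, 0x00000002 };
	uint64_t merged;

	/*
	 * Pulling the pair in as one 64-bit value (roughly what the ldr
	 * of copy3_d does): on LE counter[0] lands in the low half, on
	 * BE it lands in the high half, hence the rev64 fixup.
	 */
	memcpy(&merged, counter, sizeof(merged));
	printf("64-bit view: %016llx\n", (unsigned long long)merged);

	/*
	 * Element-wise 32-bit accesses (roughly what ld1 {v.2s} models):
	 * counter[0] is lane 0 regardless of endianness, so no fixup is
	 * needed.
	 */
	printf("lane 0: %08x, lane 1: %08x\n", counter[0], counter[1]);
	return 0;
}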
> +	movi		one_v.2s, #1
> +	uzp1		one_v.4s, one_v.4s, one_v.4s
> +
> +.Lblock:
> +	/* copy state to auxiliary vectors for the final add after the permute. */
> +	mov		state0.16b, copy0.16b
> +	mov		state1.16b, copy1.16b
> +	mov		state2.16b, copy2.16b
> +	mov		state3.16b, copy3.16b
> +
> +	mov		w4, 20
> +.Lpermute:
> +	/*
> +	 * Permute one 64-byte block where the state matrix is stored in the
> +	 * four NEON registers state0-state3. It performs matrix operations on
> +	 * four words in parallel, but requires shuffling to rearrange the words
> +	 * after each round.
> +	 */
> +
> +.Ldoubleround:
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> +	add		state0.4s, state0.4s, state1.4s
> +	eor		state3.16b, state3.16b, state0.16b
> +	rev32		state3.8h, state3.8h
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> +	add		state2.4s, state2.4s, state3.4s
> +	eor		tmp.16b, state1.16b, state2.16b
> +	shl		state1.4s, tmp.4s, #12
> +	sri		state1.4s, tmp.4s, #20
> +
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> +	add		state0.4s, state0.4s, state1.4s
> +	eor		tmp.16b, state3.16b, state0.16b
> +	shl		state3.4s, tmp.4s, #8
> +	sri		state3.4s, tmp.4s, #24
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> +	add		state2.4s, state2.4s, state3.4s
> +	eor		tmp.16b, state1.16b, state2.16b
> +	shl		state1.4s, tmp.4s, #7
> +	sri		state1.4s, tmp.4s, #25
> +
> +	/* state1[0,1,2,3] = state1[1,2,3,0] */
> +	ext		state1.16b, state1.16b, state1.16b, #4
> +	/* state2[0,1,2,3] = state2[2,3,0,1] */
> +	ext		state2.16b, state2.16b, state2.16b, #8
> +	/* state3[0,1,2,3] = state3[3,0,1,2] */
> +	ext		state3.16b, state3.16b, state3.16b, #12
> +
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> +	add		state0.4s, state0.4s, state1.4s
> +	eor		state3.16b, state3.16b, state0.16b
> +	rev32		state3.8h, state3.8h
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> +	add		state2.4s, state2.4s, state3.4s
> +	eor		tmp.16b, state1.16b, state2.16b
> +	shl		state1.4s, tmp.4s, #12
> +	sri		state1.4s, tmp.4s, #20
> +
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> +	add		state0.4s, state0.4s, state1.4s
> +	eor		tmp.16b, state3.16b, state0.16b
> +	shl		state3.4s, tmp.4s, #8
> +	sri		state3.4s, tmp.4s, #24
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> +	add		state2.4s, state2.4s, state3.4s
> +	eor		tmp.16b, state1.16b, state2.16b
> +	shl		state1.4s, tmp.4s, #7
> +	sri		state1.4s, tmp.4s, #25
> +
> +	/* state1[0,1,2,3] = state1[3,0,1,2] */
> +	ext		state1.16b, state1.16b, state1.16b, #12
> +	/* state2[0,1,2,3] = state2[2,3,0,1] */
> +	ext		state2.16b, state2.16b, state2.16b, #8
> +	/* state3[0,1,2,3] = state3[1,2,3,0] */
> +	ext		state3.16b, state3.16b, state3.16b, #4
> +
> +	subs		w4, w4, #2
> +	b.ne		.Ldoubleround
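(Aside, for anyone reading along: each commented group above is one
ChaCha quarter-round, and the NEON code runs four of them in parallel,
first on the columns and then, after the ext-based lane rotations, on
the diagonals. In scalar C a quarter-round is roughly the following;
the input words in main() are arbitrary and only exercise the helper:)

#include <stdint.h>
#include <stdio.h>

/* Rotate a 32-bit word left by n bits (0 < n < 32). */
static uint32_t rol32(uint32_t x, unsigned int n)
{
	return (x << n) | (x >> (32 - n));
}

/* One ChaCha quarter-round, matching the add/eor/rotate comments above. */
static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = rol32(*d, 16);
	*c += *d; *b ^= *c; *b = rol32(*b, 12);
	*a += *b; *d ^= *a; *d = rol32(*d, 8);
	*c += *d; *b ^= *c; *b = rol32(*b, 7);
}

int main(void)
{
	/* Arbitrary example words, only here to show the helper running. */
	uint32_t x[4] = { 0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567 };

	quarter_round(&x[0], &x[1], &x[2], &x[3]);
	printf("%08x %08x %08x %08x\n", x[0], x[1], x[2], x[3]);
	return 0;
}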
> +	/* output0 = state0 + copy0 */
> +	add		state0.4s, state0.4s, copy0.4s
> +CPU_BE(	rev32		state0.16b, state0.16b)
> +	/* output1 = state1 + copy1 */
> +	add		state1.4s, state1.4s, copy1.4s
> +CPU_BE(	rev32		state1.16b, state1.16b)
> +	/* output2 = state2 + copy2 */
> +	add		state2.4s, state2.4s, copy2.4s
> +CPU_BE(	rev32		state2.16b, state2.16b)
> +	/* output3 = state3 + copy3 */
> +	add		state3.4s, state3.4s, copy3.4s
> +CPU_BE(	rev32		state3.16b, state3.16b)
> +	st1		{ state0.4s - state3.4s }, [x0]
> +

If the u32s shouldn't be swabbed for BE, you should simply be able to
do st1 {state0.16b - state3.16b}, [x0] here, and drop the CPU_BE(*).

> +	/*
> +	 * ++copy3.counter, the 'add' clears the upper half of the SIMD
> +	 * register, which is the expected behaviour here.
> +	 */
> +	add		copy3_d, copy3_d, one_d
> +
> +	/* output += 64, --nblocks */
> +	add		x0, x0, 64
> +	subs		x3, x3, #1
> +	b.ne		.Lblock
> +
> +	/* counter = copy3.counter */
> +CPU_BE(	rev64		copy3.4s, copy3.4s)
> +	str		copy3_d, [x2]
> +

... and this could be st1 {copy3.2s}, [x2]

> +	/* Zero out the potentially sensitive regs, in case nothing uses these again. */
> +	movi		state0.16b, #0
> +	movi		state1.16b, #0
> +	movi		state2.16b, #0
> +	movi		state3.16b, #0
> +	movi		copy1.16b, #0
> +	movi		copy2.16b, #0
> +	ret
> +SYM_FUNC_END(__arch_chacha20_blocks_nostack)
> +
> +emit_aarch64_feature_1_and
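As a closing note, this is how I read the calling convention from the
header comment (a hypothetical caller with made-up key/counter/out
names, not the actual vdso_test_chacha code, and obviously it won't
link on its own; it is only meant to spell out the contract):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Prototype as documented in the block comment at the top of the file. */
void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, const uint8_t *key,
				    uint32_t *counter, size_t nblocks);

int main(void)
{
	uint8_t key[32] = { 0 };	/* 32-byte key, passed via x1 */
	uint32_t counter[2] = { 0, 0 };	/* 8-byte counter via x2, nonce fixed to 0 */
	uint8_t out[4 * 64];		/* room for x3 = 4 blocks of 64 bytes via x0 */

	__arch_chacha20_blocks_nostack(out, key, counter, 4);

	/*
	 * Each generated block bumps the counter once, so it should read
	 * back as 4 here (barring a carry into counter[1]).
	 */
	printf("counter: %u %u\n", counter[0], counter[1]);
	return 0;
}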