On 02/09/24 17:57, Ard Biesheuvel wrote:
> Hi Adhemerval,
>
> I have just a couple more points below, on the BE handling in the asm.
>
> On Mon, 2 Sept 2024 at 18:19, Adhemerval Zanella
> <adhemerval.zanella@xxxxxxxxxx> wrote:
>>
>> Hook up the generic vDSO implementation to the aarch64 vDSO data page.
>> The data required by _vdso_rng_data is placed within the _vdso_data vvar
>> page, at an offset beyond the vdso_data itself.
>>
>> The vDSO function requires a ChaCha20 implementation that does not write
>> to the stack and that can do an entire ChaCha20 permutation. The one
>> provided uses NEON for the permute operation, with a fallback to the
>> syscall for chips that do not support AdvSIMD.
>>
>> This also passes the vdso_test_chacha test along with
>> vdso_test_getrandom. The vdso_test_getrandom bench-single result on
>> Neoverse-N1 shows:
>>
>>   vdso: 25000000 times in 0.783884250 seconds
>>   libc: 25000000 times in 8.780275399 seconds
>>   syscall: 25000000 times in 8.786581518 seconds
>>
>> A small fixup to arch/arm64/include/asm/mman.h was required to avoid
>> pulling kernel code into the vDSO, similar to what's already done in
>> arch/arm64/include/asm/rwonce.h.
>>
>> Signed-off-by: Adhemerval Zanella <adhemerval.zanella@xxxxxxxxxx>
>> ---
>>  arch/arm64/Kconfig                         |   1 +
>>  arch/arm64/include/asm/mman.h              |   6 +-
>>  arch/arm64/include/asm/vdso.h              |   6 +
>>  arch/arm64/include/asm/vdso/getrandom.h    |  50 ++++++
>>  arch/arm64/include/asm/vdso/vsyscall.h     |  10 ++
>>  arch/arm64/kernel/vdso.c                   |   6 -
>>  arch/arm64/kernel/vdso/Makefile            |  25 ++-
>>  arch/arm64/kernel/vdso/vdso                |   1 +
>>  arch/arm64/kernel/vdso/vdso.lds.S          |   4 +
>>  arch/arm64/kernel/vdso/vgetrandom-chacha.S | 178 +++++++++++++++++++++
>>  arch/arm64/kernel/vdso/vgetrandom.c        |  15 ++
>>  tools/arch/arm64/vdso                      |   1 +
>>  tools/include/linux/compiler.h             |   4 +
>>  tools/testing/selftests/vDSO/Makefile      |   3 +-
>>  14 files changed, 294 insertions(+), 16 deletions(-)
>>  create mode 100644 arch/arm64/include/asm/vdso/getrandom.h
>>  create mode 120000 arch/arm64/kernel/vdso/vdso
>>  create mode 100644 arch/arm64/kernel/vdso/vgetrandom-chacha.S
>>  create mode 100644 arch/arm64/kernel/vdso/vgetrandom.c
>>  create mode 120000 tools/arch/arm64/vdso
>>
> ...
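
As an aside for anyone following the assembly below: what it has to compute
is the standard ChaCha20 block function with a zero nonce and a 64-bit block
counter, matching the prototype documented in the new file. The C sketch here
is purely illustrative and not part of the patch (the name chacha20_blocks_ref
is made up); it only spells out the word layout, the double round, and the
counter handling that the NEON code mirrors row by row:

#include <stdint.h>
#include <string.h>

static inline uint32_t rol32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One ChaCha quarter round over four 32-bit state words. */
#define QR(a, b, c, d) do {				\
	a += b; d ^= a; d = rol32(d, 16);		\
	c += d; b ^= c; b = rol32(b, 12);		\
	a += b; d ^= a; d = rol32(d, 8);		\
	c += d; b ^= c; b = rol32(b, 7);		\
} while (0)

/* Illustrative reference with the same contract as the vDSO helper. */
static void chacha20_blocks_ref(uint8_t *dst, const uint8_t *key,
				uint32_t *counter, size_t nblocks)
{
	/* Row 0: "expand 32-byte k", the constants the asm builds with mov_q. */
	uint32_t s[16] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
	uint32_t x[16];
	int i;

	memcpy(&s[4], key, 32);		/* rows 1-2: the 32-byte key (LE words) */
	s[12] = counter[0];		/* row 3: 64-bit block counter ... */
	s[13] = counter[1];
	s[14] = s[15] = 0;		/* ... and the always-zero nonce */

	while (nblocks--) {
		memcpy(x, s, sizeof(x));
		for (i = 0; i < 20; i += 2) {		/* 10 double rounds */
			QR(x[0], x[4], x[8],  x[12]);	/* column rounds */
			QR(x[1], x[5], x[9],  x[13]);
			QR(x[2], x[6], x[10], x[14]);
			QR(x[3], x[7], x[11], x[15]);
			QR(x[0], x[5], x[10], x[15]);	/* diagonal rounds */
			QR(x[1], x[6], x[11], x[12]);
			QR(x[2], x[7], x[8],  x[13]);
			QR(x[3], x[4], x[9],  x[14]);
		}
		for (i = 0; i < 16; i++)		/* feed-forward add */
			x[i] += s[i];
		memcpy(dst, x, 64);			/* LE output assumed */
		dst += 64;
		if (++s[12] == 0)			/* 64-bit counter++ */
			s[13]++;
	}
	counter[0] = s[12];				/* counter write-back */
	counter[1] = s[13];
}

The NEON version below does the same work with each 4x4 matrix row held in one
vector register, using ext to rotate rows between the column and diagonal
rounds instead of indexing the diagonals directly.
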
>> diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
>> new file mode 100644
>> index 000000000000..4e5f9c349522
>> --- /dev/null
>> +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
>> @@ -0,0 +1,178 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/cache.h>
>> +#include <asm/assembler.h>
>> +
>> +	.text
>> +
>> +#define state0		v0
>> +#define state1		v1
>> +#define state2		v2
>> +#define state3		v3
>> +#define copy0		v4
>> +#define copy0_q		q4
>> +#define copy1		v5
>> +#define copy2		v6
>> +#define copy3		v7
>> +#define copy3_d		d7
>> +#define one_d		d16
>> +#define one_q		q16
>> +#define one_v		v16
>> +#define tmp		v17
>> +#define rot8		v18
>> +
>> +/*
>> + * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
>> + * number of blocks of output with nonce 0, taking an input key and an
>> + * 8-byte counter. Importantly, it does not spill to the stack.
>> + *
>> + * This implementation avoids d8-d15 because they are callee-saved in user
>> + * space.
>> + *
>> + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
>> + *				       const uint8_t *key,
>> + *				       uint32_t *counter,
>> + *				       size_t nblocks)
>> + *
>> + * x0: output bytes
>> + * x1: 32-byte key input
>> + * x2: 8-byte counter input/output
>> + * x3: number of 64-byte blocks to write to output
>> + */
>> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)
>> +
>> +	/* copy0 = "expand 32-byte k" */
>> +	mov_q	x8, 0x3320646e61707865
>> +	mov_q	x9, 0x6b20657479622d32
>> +	mov	copy0.d[0], x8
>> +	mov	copy0.d[1], x9
>> +
>> +	/* copy1,copy2 = key */
>> +	ld1	{ copy1.4s, copy2.4s }, [x1]
>> +	/* copy3 = counter || zero nonce */
>> +	ldr	copy3_d, [x2]
>> +CPU_BE(	rev64	copy3.4s, copy3.4s)
>> +
>
> This loads 2 u32s as a single u64, and then swaps them if we are running on BE.
> So better to just use
>
>   ld1 {copy3.2s}, [x2]
>
> here, and drop the CPU_BE() special case.

Ack.
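
For context, the BE wrinkle is the usual one when two u32s are read back as a
single 64-bit quantity: the first counter word ends up in the low half on a
little-endian CPU but in the high half on a big-endian one, which is what the
CPU_BE() swap was compensating for, while an element-wise ld1 of two .s lanes
keeps the words in array order either way. A tiny stand-alone C illustration
of the effect (not from the patch, just for context):

#include <stdint.h>
#include <string.h>

int main(void)
{
	uint32_t ctr[2] = { 1, 2 };
	uint64_t v;

	/* Read the two counter words back as one 64-bit value. */
	memcpy(&v, ctr, sizeof(v));

	/*
	 * LE: v == 0x0000000200000001, so ctr[0] sits in the low half.
	 * BE: v == 0x0000000100000002, so ctr[0] sits in the high half,
	 *     which is the word swap the CPU_BE() rev64 had to undo.
	 */
	return (uint32_t)v == ctr[0];	/* 1 on LE, 0 on BE */
}
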
>
>> +	movi	one_v.2s, #1
>> +	uzp1	one_v.4s, one_v.4s, one_v.4s
>> +
>> +.Lblock:
>> +	/* copy state to auxiliary vectors for the final add after the permute. */
>> +	mov	state0.16b, copy0.16b
>> +	mov	state1.16b, copy1.16b
>> +	mov	state2.16b, copy2.16b
>> +	mov	state3.16b, copy3.16b
>> +
>> +	mov	w4, 20
>> +.Lpermute:
>> +	/*
>> +	 * Permute one 64-byte block where the state matrix is stored in the
>> +	 * four NEON registers state0-state3. It performs matrix operations on
>> +	 * four words in parallel, but requires shuffling to rearrange the
>> +	 * words after each round.
>> +	 */
>> +
>> +.Ldoubleround:
>> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
>> +	add	state0.4s, state0.4s, state1.4s
>> +	eor	state3.16b, state3.16b, state0.16b
>> +	rev32	state3.8h, state3.8h
>> +
>> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
>> +	add	state2.4s, state2.4s, state3.4s
>> +	eor	tmp.16b, state1.16b, state2.16b
>> +	shl	state1.4s, tmp.4s, #12
>> +	sri	state1.4s, tmp.4s, #20
>> +
>> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
>> +	add	state0.4s, state0.4s, state1.4s
>> +	eor	tmp.16b, state3.16b, state0.16b
>> +	shl	state3.4s, tmp.4s, #8
>> +	sri	state3.4s, tmp.4s, #24
>> +
>> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
>> +	add	state2.4s, state2.4s, state3.4s
>> +	eor	tmp.16b, state1.16b, state2.16b
>> +	shl	state1.4s, tmp.4s, #7
>> +	sri	state1.4s, tmp.4s, #25
>> +
>> +	/* state1[0,1,2,3] = state1[1,2,3,0] */
>> +	ext	state1.16b, state1.16b, state1.16b, #4
>> +	/* state2[0,1,2,3] = state2[2,3,0,1] */
>> +	ext	state2.16b, state2.16b, state2.16b, #8
>> +	/* state3[0,1,2,3] = state3[3,0,1,2] */
>> +	ext	state3.16b, state3.16b, state3.16b, #12
>> +
>> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
>> +	add	state0.4s, state0.4s, state1.4s
>> +	eor	state3.16b, state3.16b, state0.16b
>> +	rev32	state3.8h, state3.8h
>> +
>> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
>> +	add	state2.4s, state2.4s, state3.4s
>> +	eor	tmp.16b, state1.16b, state2.16b
>> +	shl	state1.4s, tmp.4s, #12
>> +	sri	state1.4s, tmp.4s, #20
>> +
>> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
>> +	add	state0.4s, state0.4s, state1.4s
>> +	eor	tmp.16b, state3.16b, state0.16b
>> +	shl	state3.4s, tmp.4s, #8
>> +	sri	state3.4s, tmp.4s, #24
>> +
>> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
>> +	add	state2.4s, state2.4s, state3.4s
>> +	eor	tmp.16b, state1.16b, state2.16b
>> +	shl	state1.4s, tmp.4s, #7
>> +	sri	state1.4s, tmp.4s, #25
>> +
>> +	/* state1[0,1,2,3] = state1[3,0,1,2] */
>> +	ext	state1.16b, state1.16b, state1.16b, #12
>> +	/* state2[0,1,2,3] = state2[2,3,0,1] */
>> +	ext	state2.16b, state2.16b, state2.16b, #8
>> +	/* state3[0,1,2,3] = state3[1,2,3,0] */
>> +	ext	state3.16b, state3.16b, state3.16b, #4
>> +
>> +	subs	w4, w4, #2
>> +	b.ne	.Ldoubleround
>> +
>> +	/* output0 = state0 + copy0 */
>> +	add	state0.4s, state0.4s, copy0.4s
>> +CPU_BE(	rev32	state0.16b, state0.16b)
>> +	/* output1 = state1 + copy1 */
>> +	add	state1.4s, state1.4s, copy1.4s
>> +CPU_BE(	rev32	state1.16b, state1.16b)
>> +	/* output2 = state2 + copy2 */
>> +	add	state2.4s, state2.4s, copy2.4s
>> +CPU_BE(	rev32	state2.16b, state2.16b)
>> +	/* output3 = state3 + copy3 */
>> +	add	state3.4s, state3.4s, copy3.4s
>> +CPU_BE(	rev32	state3.16b, state3.16b)
>> +	st1	{ state0.4s - state3.4s }, [x0]
>> +
>
> If the u32s shouldn't be swabbed for BE, you should simply be able to do
>
>   st1 {state0.16b - state3.16b}, [x0]
>
> here, and drop the CPU_BE(*).

Ack.

>
>> +	/*
>> +	 * ++copy3.counter, the 'add' clears the upper half of the SIMD register
>> +	 * which is the expected behaviour here.
>> +	 */
>> +	add	copy3_d, copy3_d, one_d
>> +
>> +	/* output += 64, --nblocks */
>> +	add	x0, x0, 64
>> +	subs	x3, x3, #1
>> +	b.ne	.Lblock
>> +
>> +	/* counter = copy3.counter */
>> +CPU_BE(	rev64	copy3.4s, copy3.4s)
>> +	str	copy3_d, [x2]
>> +
>
> ... and this could be
>
>   st1 {copy3.2s}, [x2]

Ack. I just sent a v5 with your suggestions, thanks!

>
>> +	/* Zero out the potentially sensitive regs, in case nothing uses these again. */
>> +	movi	state0.16b, #0
>> +	movi	state1.16b, #0
>> +	movi	state2.16b, #0
>> +	movi	state3.16b, #0
>> +	movi	copy1.16b, #0
>> +	movi	copy2.16b, #0
>> +	ret
>> +SYM_FUNC_END(__arch_chacha20_blocks_nostack)
>> +
>> +emit_aarch64_feature_1_and
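
For completeness, this is roughly how the exported helper is driven from C,
following the prototype documented at the top of the file. The harness below
is illustrative only, not part of the patch; the real coverage is the
vdso_test_chacha selftest mentioned in the commit message:

#include <stddef.h>
#include <stdint.h>

/* Prototype as documented in vgetrandom-chacha.S above. */
void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, const uint8_t *key,
				    uint32_t *counter, size_t nblocks);

int main(void)
{
	uint8_t key[32] = { 0 };	/* fixed test key */
	uint32_t counter[2] = { 0, 0 };	/* 8-byte block counter, starts at 0 */
	uint8_t out[2 * 64];		/* room for two 64-byte blocks */

	__arch_chacha20_blocks_nostack(out, key, counter, 2);

	/*
	 * out[] now holds two keystream blocks and counter[] has been
	 * advanced by one per block, ready for the next call.
	 */
	return 0;
}

The counter write-back performed by the final str (or st1, per the suggestion
above) is what lets the caller chain successive invocations without repeating
keystream blocks.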