Re: [PATCH v3 1/3] LoongArch: vDSO: Wire up getrandom() vDSO implementation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi, Ruoyao,

On Fri, Aug 16, 2024 at 7:07 PM Xi Ruoyao <xry111@xxxxxxxxxxx> wrote:
>
> Hook up the generic vDSO implementation to the LoongArch vDSO data page:
> embed struct vdso_rng_data into struct loongarch_vdso_data, and use
> assembler hack to resolve the symbol name "_vdso_rng_data" (which is
> expected by the generic vDSO implementation) to the rng_data field in
> loongarch_vdso_data.
>
> The compiler (GCC 14.2) calls memset() for initializing a "large" struct
> in a cold path of the generic vDSO getrandom() code.  There seems to be no
> way to prevent it from calling memset(), and since it's a cold path the
> performance does not matter, so just provide a naive memset()
> implementation for vDSO.
Why doesn't x86 need to provide a naive memset()?

>
> Signed-off-by: Xi Ruoyao <xry111@xxxxxxxxxxx>
> ---
>  arch/loongarch/Kconfig                      |   1 +
>  arch/loongarch/include/asm/vdso/getrandom.h |  47 ++++
>  arch/loongarch/include/asm/vdso/vdso.h      |   8 +
>  arch/loongarch/kernel/asm-offsets.c         |  10 +
>  arch/loongarch/kernel/vdso.c                |   6 +
>  arch/loongarch/vdso/Makefile                |   2 +
>  arch/loongarch/vdso/memset.S                |  24 ++
>  arch/loongarch/vdso/vdso.lds.S              |   1 +
>  arch/loongarch/vdso/vgetrandom-chacha.S     | 239 ++++++++++++++++++++
>  arch/loongarch/vdso/vgetrandom.c            |  19 ++
>  10 files changed, 357 insertions(+)
>  create mode 100644 arch/loongarch/include/asm/vdso/getrandom.h
>  create mode 100644 arch/loongarch/vdso/memset.S
>  create mode 100644 arch/loongarch/vdso/vgetrandom-chacha.S
>  create mode 100644 arch/loongarch/vdso/vgetrandom.c
>
> diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
> index 70f169210b52..14821c2aba5b 100644
> --- a/arch/loongarch/Kconfig
> +++ b/arch/loongarch/Kconfig
> @@ -190,6 +190,7 @@ config LOONGARCH
>         select TRACE_IRQFLAGS_SUPPORT
>         select USE_PERCPU_NUMA_NODE_ID
>         select USER_STACKTRACE_SUPPORT
> +       select VDSO_GETRANDOM
>         select ZONE_DMA32
>
>  config 32BIT
> diff --git a/arch/loongarch/include/asm/vdso/getrandom.h b/arch/loongarch/include/asm/vdso/getrandom.h
> new file mode 100644
> index 000000000000..a369588a4ebf
> --- /dev/null
> +++ b/arch/loongarch/include/asm/vdso/getrandom.h
> @@ -0,0 +1,47 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2024 Xi Ruoyao <xry111@xxxxxxxxxxx>. All Rights Reserved.
> + */
> +#ifndef __ASM_VDSO_GETRANDOM_H
> +#define __ASM_VDSO_GETRANDOM_H
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <asm/unistd.h>
> +#include <asm/vdso/vdso.h>
> +
> +static __always_inline ssize_t getrandom_syscall(void *_buffer,
> +                                                size_t _len,
> +                                                unsigned int _flags)
> +{
> +       register long ret asm("a0");
> +       register long int nr asm("a7") = __NR_getrandom;
> +       register void *buffer asm("a0") = _buffer;
> +       register size_t len asm("a1") = _len;
> +       register unsigned int flags asm("a2") = _flags;
> +
> +       asm volatile(
> +       "      syscall 0\n"
> +       : "+r" (ret)
> +       : "r" (nr), "r" (buffer), "r" (len), "r" (flags)
> +       : "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8",
> +         "memory");
> +
> +       return ret;
> +}
> +
> +static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(
> +       void)
There is no need for a line break here.

> +{
> +       return (const struct vdso_rng_data *)(
> +               get_vdso_data() +
> +               VVAR_LOONGARCH_PAGES_START * PAGE_SIZE +
> +               offsetof(struct loongarch_vdso_data, rng_data));
> +}
> +
> +extern void __arch_chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key,
> +                                          u32 *counter, size_t nblocks);
> +
> +#endif /* !__ASSEMBLY__ */
> +
> +#endif /* __ASM_VDSO_GETRANDOM_H */
> diff --git a/arch/loongarch/include/asm/vdso/vdso.h b/arch/loongarch/include/asm/vdso/vdso.h
> index 5a12309d9fb5..a2e24c3007e2 100644
> --- a/arch/loongarch/include/asm/vdso/vdso.h
> +++ b/arch/loongarch/include/asm/vdso/vdso.h
> @@ -4,6 +4,9 @@
>   * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
>   */
>
> +#ifndef _ASM_VDSO_VDSO_H
> +#define _ASM_VDSO_VDSO_H
> +
>  #ifndef __ASSEMBLY__
>
>  #include <asm/asm.h>
> @@ -16,6 +19,9 @@ struct vdso_pcpu_data {
>
>  struct loongarch_vdso_data {
>         struct vdso_pcpu_data pdata[NR_CPUS];
> +#ifdef CONFIG_VDSO_GETRANDOM
You select VDSO_GETRANDOM unconditionally, so this #ifdef is unnecessary.

> +       struct vdso_rng_data rng_data;
> +#endif
>  };
>
>  /*
> @@ -63,3 +69,5 @@ static inline unsigned long get_vdso_data(void)
>  }
>
>  #endif /* __ASSEMBLY__ */
> +
> +#endif
> diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c
> index bee9f7a3108f..86f6d8a6dc23 100644
> --- a/arch/loongarch/kernel/asm-offsets.c
> +++ b/arch/loongarch/kernel/asm-offsets.c
> @@ -14,6 +14,7 @@
>  #include <asm/ptrace.h>
>  #include <asm/processor.h>
>  #include <asm/ftrace.h>
> +#include <asm/vdso/vdso.h>
>
>  static void __used output_ptreg_defines(void)
>  {
> @@ -321,3 +322,12 @@ static void __used output_kvm_defines(void)
>         OFFSET(KVM_GPGD, kvm, arch.pgd);
>         BLANK();
>  }
> +
> +#ifdef CONFIG_VDSO_GETRANDOM
The same here: VDSO_GETRANDOM is selected unconditionally, so this #ifdef is unnecessary.

> +static void __used output_vdso_rng_defines(void)
> +{
> +       COMMENT("LoongArch VDSO getrandom offsets.");
> +       OFFSET(VDSO_RNG_DATA, loongarch_vdso_data, rng_data);
> +       BLANK();
> +}
> +#endif
> diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c
> index 90dfccb41c14..15b65d8e2fdc 100644
> --- a/arch/loongarch/kernel/vdso.c
> +++ b/arch/loongarch/kernel/vdso.c
> @@ -22,6 +22,7 @@
>  #include <vdso/helpers.h>
>  #include <vdso/vsyscall.h>
>  #include <vdso/datapage.h>
> +#include <generated/asm-offsets.h>
>  #include <generated/vdso-offsets.h>
>
>  extern char vdso_start[], vdso_end[];
> @@ -34,6 +35,11 @@ static union {
>         struct loongarch_vdso_data vdata;
>  } loongarch_vdso_data __page_aligned_data;
>
> +#ifdef CONFIG_VDSO_GETRANDOM
The same here: VDSO_GETRANDOM is selected unconditionally, so this #ifdef is unnecessary.

> +asm(".globl _vdso_rng_data\n"
> +    ".set _vdso_rng_data, loongarch_vdso_data + " __stringify(VDSO_RNG_DATA));
> +#endif
> +
>  static struct page *vdso_pages[] = { NULL };
>  struct vdso_data *vdso_data = generic_vdso_data.data;
>  struct vdso_pcpu_data *vdso_pdata = loongarch_vdso_data.vdata.pdata;
> diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile
> index 2ddf0480e710..c8c5d9a7c80c 100644
> --- a/arch/loongarch/vdso/Makefile
> +++ b/arch/loongarch/vdso/Makefile
> @@ -6,6 +6,8 @@ include $(srctree)/lib/vdso/Makefile
>
>  obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o sigreturn.o
>
> +obj-vdso-$(CONFIG_VDSO_GETRANDOM) += vgetrandom.o vgetrandom-chacha.o memset.o
> +
>  # Common compiler flags between ABIs.
>  ccflags-vdso := \
>         $(filter -I%,$(KBUILD_CFLAGS)) \
> diff --git a/arch/loongarch/vdso/memset.S b/arch/loongarch/vdso/memset.S
> new file mode 100644
> index 000000000000..ec1531683936
> --- /dev/null
> +++ b/arch/loongarch/vdso/memset.S
> @@ -0,0 +1,24 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * A copy of __memset_generic from arch/loongarch/lib/memset.S for vDSO.
> + *
> + * Copyright (C) 2020-2024 Loongson Technology Corporation Limited
> + */
> +
> +#include <asm/regdef.h>
> +#include <linux/linkage.h>
> +
> +SYM_FUNC_START(memset)
> +       move    a3, a0
> +       beqz    a2, 2f
> +
> +1:     st.b    a1, a0, 0
> +       addi.d  a0, a0, 1
> +       addi.d  a2, a2, -1
> +       bgt     a2, zero, 1b
> +
> +2:     move    a0, a3
> +       jr      ra
> +SYM_FUNC_END(memset)
> +
> +.hidden memset
> diff --git a/arch/loongarch/vdso/vdso.lds.S b/arch/loongarch/vdso/vdso.lds.S
> index 56ad855896de..2c965a597d9e 100644
> --- a/arch/loongarch/vdso/vdso.lds.S
> +++ b/arch/loongarch/vdso/vdso.lds.S
> @@ -63,6 +63,7 @@ VERSION
>                 __vdso_clock_gettime;
>                 __vdso_gettimeofday;
>                 __vdso_rt_sigreturn;
> +               __vdso_getrandom;
In my opinion, __vdso_rt_sigreturn is different from the others, so I
would prefer to keep it last.


Huacai

>         local: *;
>         };
>  }
> diff --git a/arch/loongarch/vdso/vgetrandom-chacha.S b/arch/loongarch/vdso/vgetrandom-chacha.S
> new file mode 100644
> index 000000000000..2e42198f2faf
> --- /dev/null
> +++ b/arch/loongarch/vdso/vgetrandom-chacha.S
> @@ -0,0 +1,239 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2024 Xi Ruoyao <xry111@xxxxxxxxxxx>. All Rights Reserved.
> + */
> +
> +#include <asm/asm.h>
> +#include <asm/regdef.h>
> +#include <linux/linkage.h>
> +
> +.text
> +
> +/* Salsa20 quarter-round */
> +.macro QR      a b c d
> +       add.w           \a, \a, \b
> +       xor             \d, \d, \a
> +       rotri.w         \d, \d, 16
> +
> +       add.w           \c, \c, \d
> +       xor             \b, \b, \c
> +       rotri.w         \b, \b, 20
> +
> +       add.w           \a, \a, \b
> +       xor             \d, \d, \a
> +       rotri.w         \d, \d, 24
> +
> +       add.w           \c, \c, \d
> +       xor             \b, \b, \c
> +       rotri.w         \b, \b, 25
> +.endm
> +
> +/*
> + * Very basic LoongArch implementation of ChaCha20. Produces a given positive
> + * number of blocks of output with a nonce of 0, taking an input key and
> + * 8-byte counter. Importantly does not spill to the stack. Its arguments
> + * are:
> + *
> + *     a0: output bytes
> + *     a1: 32-byte key input
> + *     a2: 8-byte counter input/output
> + *     a3: number of 64-byte blocks to write to output
> + */
> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> +
> +/* We don't need a frame pointer */
> +#define s9             fp
> +
> +#define output         a0
> +#define key            a1
> +#define counter                a2
> +#define nblocks                a3
> +#define i              a4
> +#define state0         s0
> +#define state1         s1
> +#define state2         s2
> +#define state3         s3
> +#define state4         s4
> +#define state5         s5
> +#define state6         s6
> +#define state7         s7
> +#define state8         s8
> +#define state9         s9
> +#define state10                a5
> +#define state11                a6
> +#define state12                a7
> +#define state13                t0
> +#define state14                t1
> +#define state15                t2
> +#define cnt_lo         t3
> +#define cnt_hi         t4
> +#define copy0          t5
> +#define copy1          t6
> +#define copy2          t7
> +
> +/* Reuse i as copy3 */
> +#define copy3          i
> +
> +       /*
> +        * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
> +        * This does not violate the stack-less requirement: no sensitive data
> +        * is spilled onto the stack.
> +        */
> +       PTR_ADDI        sp, sp, (-SZREG * 10) & STACK_ALIGN
> +       REG_S           s0, sp, 0
> +       REG_S           s1, sp, SZREG
> +       REG_S           s2, sp, SZREG * 2
> +       REG_S           s3, sp, SZREG * 3
> +       REG_S           s4, sp, SZREG * 4
> +       REG_S           s5, sp, SZREG * 5
> +       REG_S           s6, sp, SZREG * 6
> +       REG_S           s7, sp, SZREG * 7
> +       REG_S           s8, sp, SZREG * 8
> +       REG_S           s9, sp, SZREG * 9
> +
> +       li.w            copy0, 0x61707865
> +       li.w            copy1, 0x3320646e
> +       li.w            copy2, 0x79622d32
> +
> +       ld.w            cnt_lo, counter, 0
> +       ld.w            cnt_hi, counter, 4
> +
> +.Lblock:
> +       /* state[0,1,2,3] = "expand 32-byte k" */
> +       move            state0, copy0
> +       move            state1, copy1
> +       move            state2, copy2
> +       li.w            state3, 0x6b206574
> +
> +       /* state[4,5,..,11] = key */
> +       ld.w            state4, key, 0
> +       ld.w            state5, key, 4
> +       ld.w            state6, key, 8
> +       ld.w            state7, key, 12
> +       ld.w            state8, key, 16
> +       ld.w            state9, key, 20
> +       ld.w            state10, key, 24
> +       ld.w            state11, key, 28
> +
> +       /* state[12,13] = counter */
> +       move            state12, cnt_lo
> +       move            state13, cnt_hi
> +
> +       /* state[14,15] = 0 */
> +       move            state14, zero
> +       move            state15, zero
> +
> +       li.w            i, 10
> +.Lpermute:
> +       /* odd round */
> +       QR              state0, state4, state8, state12
> +       QR              state1, state5, state9, state13
> +       QR              state2, state6, state10, state14
> +       QR              state3, state7, state11, state15
> +
> +       /* even round */
> +       QR              state0, state5, state10, state15
> +       QR              state1, state6, state11, state12
> +       QR              state2, state7, state8, state13
> +       QR              state3, state4, state9, state14
> +
> +       addi.w          i, i, -1
> +       bnez            i, .Lpermute
> +
> +       /* copy[3] = "te k" */
> +       li.w            copy3, 0x6b206574
> +
> +       /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
> +       add.w           state0, state0, copy0
> +       add.w           state1, state1, copy1
> +       add.w           state2, state2, copy2
> +       add.w           state3, state3, copy3
> +       st.w            state0, output, 0
> +       st.w            state1, output, 4
> +       st.w            state2, output, 8
> +       st.w            state3, output, 12
> +
> +       /* from now on state[0,1,2,3] are scratch registers  */
> +
> +       /* state[0,1,2,3] = lo32(key) */
> +       ld.w            state0, key, 0
> +       ld.w            state1, key, 4
> +       ld.w            state2, key, 8
> +       ld.w            state3, key, 12
> +
> +       /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
> +       add.w           state4, state4, state0
> +       add.w           state5, state5, state1
> +       add.w           state6, state6, state2
> +       add.w           state7, state7, state3
> +       st.w            state4, output, 16
> +       st.w            state5, output, 20
> +       st.w            state6, output, 24
> +       st.w            state7, output, 28
> +
> +       /* state[0,1,2,3] = hi32(key) */
> +       ld.w            state0, key, 16
> +       ld.w            state1, key, 20
> +       ld.w            state2, key, 24
> +       ld.w            state3, key, 28
> +
> +       /* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
> +       add.w           state8, state8, state0
> +       add.w           state9, state9, state1
> +       add.w           state10, state10, state2
> +       add.w           state11, state11, state3
> +       st.w            state8, output, 32
> +       st.w            state9, output, 36
> +       st.w            state10, output, 40
> +       st.w            state11, output, 44
> +
> +       /* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
> +       add.w           state12, state12, cnt_lo
> +       add.w           state13, state13, cnt_hi
> +       st.w            state12, output, 48
> +       st.w            state13, output, 52
> +       st.w            state14, output, 56
> +       st.w            state15, output, 60
> +
> +       /* ++counter  */
> +       addi.w          cnt_lo, cnt_lo, 1
> +       sltui           state0, cnt_lo, 1
> +       add.w           cnt_hi, cnt_hi, state0
> +
> +       /* output += 64 */
> +       PTR_ADDI        output, output, 64
> +       /* --nblocks */
> +       PTR_ADDI        nblocks, nblocks, -1
> +       bnez            nblocks, .Lblock
> +
> +       /* counter = [cnt_lo, cnt_hi] */
> +       st.w            cnt_lo, counter, 0
> +       st.w            cnt_hi, counter, 4
> +
> +       /*
> +        * Zero out the potentially sensitive regs, in case nothing uses these
> +        * again. At this point copy[0,1,2,3] just contain "expand 32-byte k" and
> +        * state[0,...,9] are s0-s9, which we'll restore in the epilogue, so we
> +        * only need to zero state[10,...,15].
> +        */
> +       move            state10, zero
> +       move            state11, zero
> +       move            state12, zero
> +       move            state13, zero
> +       move            state14, zero
> +       move            state15, zero
> +
> +       REG_L           s0, sp, 0
> +       REG_L           s1, sp, SZREG
> +       REG_L           s2, sp, SZREG * 2
> +       REG_L           s3, sp, SZREG * 3
> +       REG_L           s4, sp, SZREG * 4
> +       REG_L           s5, sp, SZREG * 5
> +       REG_L           s6, sp, SZREG * 6
> +       REG_L           s7, sp, SZREG * 7
> +       REG_L           s8, sp, SZREG * 8
> +       REG_L           s9, sp, SZREG * 9
> +       PTR_ADDI        sp, sp, -((-SZREG * 10) & STACK_ALIGN)
> +
> +       jr              ra
> +SYM_FUNC_END(__arch_chacha20_blocks_nostack)
> diff --git a/arch/loongarch/vdso/vgetrandom.c b/arch/loongarch/vdso/vgetrandom.c
> new file mode 100644
> index 000000000000..0b3b30ecd68a
> --- /dev/null
> +++ b/arch/loongarch/vdso/vgetrandom.c
> @@ -0,0 +1,19 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2024 Xi Ruoyao <xry111@xxxxxxxxxxx>. All Rights Reserved.
> + */
> +#include <linux/types.h>
> +
> +#include "../../../../lib/vdso/getrandom.c"
> +
> +typeof(__cvdso_getrandom) __vdso_getrandom;
> +
> +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags,
> +                        void *opaque_state, size_t opaque_len)
> +{
> +       return __cvdso_getrandom(buffer, len, flags, opaque_state,
> +                                opaque_len);
> +}
> +
> +typeof(__cvdso_getrandom) getrandom
> +       __attribute__((weak, alias("__vdso_getrandom")));
> --
> 2.46.0
>





[Index of Archives]     [Kernel]     [Gnu Classpath]     [Gnu Crypto]     [DM Crypt]     [Netfilter]     [Bugtraq]
  Powered by Linux