Re: [PATCH v2] aarch64: vdso: Wire up getrandom() vDSO implementation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Aug 29, 2024 at 08:17:14PM +0000, Adhemerval Zanella wrote:
> Hook up the generic vDSO implementation to the aarch64 vDSO data page.
> The _vdso_rng_data required data is placed within the _vdso_data vvar
> page, by using an offset larger than the vdso_data.
> 
> The vDSO function requires a ChaCha20 implementation that does not
> write to the stack, and that can do an entire ChaCha20 permutation.
> The one provided is based on the current chacha-neon-core.S and uses NEON
> on the permute operation. The fallback for chips that do not support
> NEON issues the syscall.
> 
> This also passes the vdso_test_chacha test along with
> vdso_test_getrandom. The vdso_test_getrandom bench-single result on
> Neoverse-N1 shows:
> 
>    vdso: 25000000 times in 0.746506464 seconds
>    libc: 25000000 times in 8.849179444 seconds
> syscall: 25000000 times in 8.818726425 seconds
> 
> Changes from v1:
> - Fixed style issues and typos.
> - Added fallback for systems without NEON support.
> - Avoid use of non-volatile vector registers in neon chacha20.
> - Use c-getrandom-y for vgetrandom.c.
> - Fixed TIMENS vdso_rnd_data access.
> 
> Signed-off-by: Adhemerval Zanella <adhemerval.zanella@xxxxxxxxxx>
> ---
>  arch/arm64/Kconfig                         |   1 +
>  arch/arm64/include/asm/vdso.h              |   6 +
>  arch/arm64/include/asm/vdso/getrandom.h    |  49 ++++++
>  arch/arm64/include/asm/vdso/vsyscall.h     |  10 ++
>  arch/arm64/kernel/vdso.c                   |   6 -
>  arch/arm64/kernel/vdso/Makefile            |  11 +-
>  arch/arm64/kernel/vdso/vdso                |   1 +
>  arch/arm64/kernel/vdso/vdso.lds.S          |   4 +
>  arch/arm64/kernel/vdso/vgetrandom-chacha.S | 168 +++++++++++++++++++++
>  arch/arm64/kernel/vdso/vgetrandom.c        |  15 ++
>  lib/vdso/getrandom.c                       |   1 +
>  tools/arch/arm64/vdso                      |   1 +
>  tools/include/linux/compiler.h             |   4 +
>  tools/testing/selftests/vDSO/Makefile      |   5 +-

Please can you split the tools/ changes into a separate patch?

>  14 files changed, 273 insertions(+), 9 deletions(-)
>  create mode 100644 arch/arm64/include/asm/vdso/getrandom.h
>  create mode 120000 arch/arm64/kernel/vdso/vdso
>  create mode 100644 arch/arm64/kernel/vdso/vgetrandom-chacha.S
>  create mode 100644 arch/arm64/kernel/vdso/vgetrandom.c
>  create mode 120000 tools/arch/arm64/vdso
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index a2f8ff354ca6..7f7424d1b3b8 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -262,6 +262,7 @@ config ARM64
>  	select TRACE_IRQFLAGS_NMI_SUPPORT
>  	select HAVE_SOFTIRQ_ON_OWN_STACK
>  	select USER_STACKTRACE_SUPPORT
> +	select VDSO_GETRANDOM
>  	help
>  	  ARM 64-bit (AArch64) Linux support.
>  
> diff --git a/arch/arm64/include/asm/vdso.h b/arch/arm64/include/asm/vdso.h
> index 4305995c8f82..18407b757c95 100644
> --- a/arch/arm64/include/asm/vdso.h
> +++ b/arch/arm64/include/asm/vdso.h
> @@ -16,6 +16,12 @@
>  
>  #ifndef __ASSEMBLY__
>  
> +enum vvar_pages {
> +	VVAR_DATA_PAGE_OFFSET,
> +	VVAR_TIMENS_PAGE_OFFSET,
> +	VVAR_NR_PAGES,
> +};
> +
>  #include <generated/vdso-offsets.h>
>  
>  #define VDSO_SYMBOL(base, name)						   \
> diff --git a/arch/arm64/include/asm/vdso/getrandom.h b/arch/arm64/include/asm/vdso/getrandom.h
> new file mode 100644
> index 000000000000..fca66ba49d4c
> --- /dev/null
> +++ b/arch/arm64/include/asm/vdso/getrandom.h
> @@ -0,0 +1,49 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __ASM_VDSO_GETRANDOM_H
> +#define __ASM_VDSO_GETRANDOM_H
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <asm/vdso.h>
> +#include <asm/unistd.h>
> +#include <vdso/datapage.h>
> +
> +/**
> + * getrandom_syscall - Invoke the getrandom() syscall.
> + * @buffer:	Destination buffer to fill with random bytes.
> + * @len:	Size of @buffer in bytes.
> + * @flags:	Zero or more GRND_* flags.
> + * Returns:	The number of random bytes written to @buffer, or a negative value indicating an error.
> + */
> +static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags)
> +{
> +	register void *buffer asm ("x0") = _buffer;
> +	register size_t len asm ("x1") = _len;
> +	register unsigned int flags asm ("x2") = _flags;
> +	register long ret asm ("x0");
> +	register long nr asm ("x8") = __NR_getrandom;
> +
> +	asm volatile(
> +	"       svc #0\n"
> +	: "=r" (ret)
> +	: "r" (buffer), "r" (len), "r" (flags), "r" (nr)
> +	: "memory");
> +
> +	return ret;
> +}
> +
> +static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
> +{
> +	/*
> +	 * If a task belongs to a time namespace then a namespace the real
> +	 * VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET.
> +	 */

This comment doesn't make sense.

> +	if (IS_ENABLED(CONFIG_TIME_NS) && _vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
> +		return (void*)&_vdso_rng_data + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE;
> +	return &_vdso_rng_data;
> +}
> +
> +#endif /* !__ASSEMBLY__ */
> +
> +#endif /* __ASM_VDSO_GETRANDOM_H */
> diff --git a/arch/arm64/include/asm/vdso/vsyscall.h b/arch/arm64/include/asm/vdso/vsyscall.h
> index f94b1457c117..2a87f0e1b144 100644
> --- a/arch/arm64/include/asm/vdso/vsyscall.h
> +++ b/arch/arm64/include/asm/vdso/vsyscall.h
> @@ -2,8 +2,11 @@
>  #ifndef __ASM_VDSO_VSYSCALL_H
>  #define __ASM_VDSO_VSYSCALL_H
>  
> +#define __VDSO_RND_DATA_OFFSET  480

Why 480?

> +
>  #ifndef __ASSEMBLY__
>  
> +#include <asm/vdso.h>
>  #include <linux/timekeeper_internal.h>
>  #include <vdso/datapage.h>
>  
> @@ -21,6 +24,13 @@ struct vdso_data *__arm64_get_k_vdso_data(void)
>  }
>  #define __arch_get_k_vdso_data __arm64_get_k_vdso_data
>  
> +static __always_inline
> +struct vdso_rng_data *__arm64_get_k_vdso_rnd_data(void)
> +{
> +	return (void*)vdso_data + __VDSO_RND_DATA_OFFSET;
> +}
> +#define __arch_get_k_vdso_rng_data __arm64_get_k_vdso_rnd_data
> +
>  static __always_inline
>  void __arm64_update_vsyscall(struct vdso_data *vdata, struct timekeeper *tk)
>  {
> diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
> index 89b6e7840002..706c9c3a7a50 100644
> --- a/arch/arm64/kernel/vdso.c
> +++ b/arch/arm64/kernel/vdso.c
> @@ -34,12 +34,6 @@ enum vdso_abi {
>  	VDSO_ABI_AA32,
>  };
>  
> -enum vvar_pages {
> -	VVAR_DATA_PAGE_OFFSET,
> -	VVAR_TIMENS_PAGE_OFFSET,
> -	VVAR_NR_PAGES,
> -};
> -
>  struct vdso_abi_info {
>  	const char *name;
>  	const char *vdso_code_start;
> diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
> index d11da6461278..50246a38d6bd 100644
> --- a/arch/arm64/kernel/vdso/Makefile
> +++ b/arch/arm64/kernel/vdso/Makefile
> @@ -9,7 +9,7 @@
>  # Include the generic Makefile to check the built vdso.
>  include $(srctree)/lib/vdso/Makefile
>  
> -obj-vdso := vgettimeofday.o note.o sigreturn.o
> +obj-vdso := vgettimeofday.o note.o sigreturn.o vgetrandom.o vgetrandom-chacha.o
>  
>  # Build rules
>  targets := $(obj-vdso) vdso.so vdso.so.dbg
> @@ -40,13 +40,22 @@ CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \
>  				$(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \
>  				$(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \
>  				-Wmissing-prototypes -Wmissing-declarations
> +CFLAGS_REMOVE_vgetrandom.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \
> +			     $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \
> +			     $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \
> +			     -Wmissing-prototypes -Wmissing-declarations
>  
>  CFLAGS_vgettimeofday.o = -O2 -mcmodel=tiny -fasynchronous-unwind-tables
> +CFLAGS_vgetrandom.o = -O2 -mcmodel=tiny -fasynchronous-unwind-tables

You're using identical CFLAGS_ and CFLAGS_REMOVE_ definitions for
vgettimeofday.o and vgetrandom.o. Please refactor this so that they use
common definitions.

> diff --git a/arch/arm64/kernel/vdso/vdso b/arch/arm64/kernel/vdso/vdso
> new file mode 120000
> index 000000000000..233c7a26f6e5
> --- /dev/null
> +++ b/arch/arm64/kernel/vdso/vdso
> @@ -0,0 +1 @@
> +../../../arch/arm64/kernel/vdso
> \ No newline at end of file
> diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S
> index 45354f2ddf70..f204a9ddc833 100644
> --- a/arch/arm64/kernel/vdso/vdso.lds.S
> +++ b/arch/arm64/kernel/vdso/vdso.lds.S
> @@ -11,7 +11,9 @@
>  #include <linux/const.h>
>  #include <asm/page.h>
>  #include <asm/vdso.h>
> +#include <asm/vdso/vsyscall.h>
>  #include <asm-generic/vmlinux.lds.h>
> +#include <vdso/datapage.h>
>  
>  OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64")
>  OUTPUT_ARCH(aarch64)
> @@ -19,6 +21,7 @@ OUTPUT_ARCH(aarch64)
>  SECTIONS
>  {
>  	PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
> +	PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET);
>  #ifdef CONFIG_TIME_NS
>  	PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
>  #endif
> @@ -102,6 +105,7 @@ VERSION
>  		__kernel_gettimeofday;
>  		__kernel_clock_gettime;
>  		__kernel_clock_getres;
> +		__kernel_getrandom;
>  	local: *;
>  	};
>  }
> diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> new file mode 100644
> index 000000000000..9ebf12a09c65
> --- /dev/null
> +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> @@ -0,0 +1,168 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/linkage.h>
> +#include <asm/cache.h>
> +#include <asm/assembler.h>
> +
> +	.text
> +
> +#define state0		v0
> +#define state1		v1
> +#define state2		v2
> +#define state3		v3
> +#define copy0		v4
> +#define copy1		v5
> +#define copy2		v6
> +#define copy3		v7
> +#define copy3_d		d7
> +#define one_d		d16
> +#define one_q		q16
> +#define tmp		v17
> +#define rot8		v18
> +
> +/*
> + * ARM64 ChaCha20 implementation meant for vDSO.  Produces a given positive
> + * number of blocks of output with nonce 0, taking an input key and 8-byte
> + * counter.  Importantly does not spill to the stack.
> + *
> + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
> + *				       const uint8_t *key,
> + * 				       uint32_t *counter,
> + *				       size_t nblocks)
> + *
> + * 	x0: output bytes
> + *	x1: 32-byte key input
> + *	x2: 8-byte counter input/output
> + *	x3: number of 64-byte blocks to write to output
> + */
> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)

Is there any way we can reuse the existing code in
crypto/chacha-neon-core.S for this? It looks to my untrained eye like
this is an arbitrarily different implementation to what we already have.

> +	/* copy0 = "expand 32-byte k" */
> +	adr_l		x8, CTES
> +	ld1		{copy0.4s}, [x8]
> +	/* copy1,copy2 = key */
> +	ld1		{ copy1.4s, copy2.4s }, [x1]
> +	/* copy3 = counter || zero nonce  */
> +	ldr		copy3_d, [x2]
> +
> +	adr_l		x8, ONE
> +	ldr		one_q, [x8]
> +
> +	adr_l		x10, ROT8
> +	ld1		{rot8.4s}, [x10]
> +.Lblock:
> +	/* copy state to auxiliary vectors for the final add after the permute.  */
> +	mov		state0.16b, copy0.16b
> +	mov		state1.16b, copy1.16b
> +	mov		state2.16b, copy2.16b
> +	mov		state3.16b, copy3.16b
> +
> +	mov		w4, 20
> +.Lpermute:
> +	/*
> +	 * Permute one 64-byte block where the state matrix is stored in the four NEON
> +	 * registers state0-state3.  It performs matrix operations on four words in parallel,
> +	 * but requires shuffling to rearrange the words after each round.
> +	 */
> +
> +.Ldoubleround:
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> +	add		state0.4s, state0.4s, state1.4s
> +	eor		state3.16b, state3.16b, state0.16b
> +	rev32		state3.8h, state3.8h
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> +	add		state2.4s, state2.4s, state3.4s
> +	eor		tmp.16b, state1.16b, state2.16b
> +	shl		state1.4s, tmp.4s, #12
> +	sri		state1.4s, tmp.4s, #20
> +
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> +	add		state0.4s, state0.4s, state1.4s
> +	eor		state3.16b, state3.16b, state0.16b
> +	tbl		state3.16b, {state3.16b}, rot8.16b
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> +	add		state2.4s, state2.4s, state3.4s
> +	eor		tmp.16b, state1.16b, state2.16b
> +	shl		state1.4s, tmp.4s, #7
> +	sri		state1.4s, tmp.4s, #25
> +
> +	/* state1[0,1,2,3] = state1[1,2,3,0] */
> +	ext		state1.16b, state1.16b, state1.16b, #4
> +	/* state2[0,1,2,3] = state2[2,3,0,1] */
> +	ext		state2.16b, state2.16b, state2.16b, #8
> +	/* state3[0,1,2,3] = state3[3,0,1,2] */
> +	ext		state3.16b, state3.16b, state3.16b, #12
> +
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> +	add		state0.4s, state0.4s, state1.4s
> +	eor		state3.16b, state3.16b, state0.16b
> +	rev32		state3.8h, state3.8h
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> +	add		state2.4s, state2.4s, state3.4s
> +	eor		tmp.16b, state1.16b, state2.16b
> +	shl		state1.4s, tmp.4s, #12
> +	sri		state1.4s, tmp.4s, #20
> +
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> +	add		state0.4s, state0.4s, state1.4s
> +	eor		state3.16b, state3.16b, state0.16b
> +	tbl		state3.16b, {state3.16b}, rot8.16b
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> +	add		state2.4s, state2.4s, state3.4s
> +	eor		tmp.16b, state1.16b, state2.16b
> +	shl		state1.4s, tmp.4s, #7
> +	sri		state1.4s, tmp.4s, #25
> +
> +	/* state1[0,1,2,3] = state1[3,0,1,2] */
> +	ext		state1.16b, state1.16b, state1.16b, #12
> +	/* state2[0,1,2,3] = state2[2,3,0,1] */
> +	ext		state2.16b, state2.16b, state2.16b, #8
> +	/* state3[0,1,2,3] = state3[1,2,3,0] */
> +	ext		state3.16b, state3.16b, state3.16b, #4
> +
> +	subs		w4, w4, #2
> +	b.ne		.Ldoubleround
> +
> +	/* output0 = state0 + copy0 */
> +	add		state0.4s, state0.4s, copy0.4s
> +	/* output1 = state1 + copy1 */
> +	add		state1.4s, state1.4s, copy1.4s
> +	/* output2 = state2 + copy2 */
> +	add		state2.4s, state2.4s, copy2.4s
> +	/* output3 = state3 + copy3 */
> +	add		state3.4s, state3.4s, copy3.4s
> +	st1		{ state0.4s - state3.4s }, [x0]
> +
> +	/* ++copy3.counter */
> +	add		copy3_d, copy3_d, one_d
> +
> +	/* output += 64, --nblocks */
> +	add		x0, x0, 64
> +	subs		x3, x3, #1
> +	b.ne		.Lblock
> +
> +	/* counter = copy3.counter */
> +	str		copy3_d, [x2]
> +
> +	/* Zero out the potentially sensitive regs, in case nothing uses these again. */
> +	eor		state0.16b, state0.16b, state0.16b
> +	eor		state1.16b, state1.16b, state1.16b
> +	eor		state2.16b, state2.16b, state2.16b
> +	eor		state3.16b, state3.16b, state3.16b
> +	eor		copy1.16b, copy1.16b, copy1.16b
> +	eor		copy2.16b, copy2.16b, copy2.16b
> +	ret
> +SYM_FUNC_END(__arch_chacha20_blocks_nostack)
> +
> +        .section        ".rodata", "a", %progbits
> +        .align          L1_CACHE_SHIFT
> +
> +CTES:	.word		1634760805, 857760878, 	2036477234, 1797285236
> +ONE:    .xword		1, 0
> +ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
> +
> +emit_aarch64_feature_1_and
> diff --git a/arch/arm64/kernel/vdso/vgetrandom.c b/arch/arm64/kernel/vdso/vgetrandom.c
> new file mode 100644
> index 000000000000..0833d25f3121
> --- /dev/null
> +++ b/arch/arm64/kernel/vdso/vgetrandom.c
> @@ -0,0 +1,15 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +typeof(__cvdso_getrandom) __kernel_getrandom;
> +
> +ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
> +{
> +	asm goto (
> +	ALTERNATIVE("b %[fallback]", "nop", RM64_HAS_FPSIMD) : : : : fallback);

"RM64_HAS_FPSIMD". Are you sure you've tested this?

> +	return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len);
> +
> +fallback:
> +	if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags))
> +		return -ENOSYS;
> +	return getrandom_syscall(buffer, len, flags);
> +}
> diff --git a/lib/vdso/getrandom.c b/lib/vdso/getrandom.c
> index 938ca539aaa6..7c9711248d9b 100644
> --- a/lib/vdso/getrandom.c
> +++ b/lib/vdso/getrandom.c
> @@ -5,6 +5,7 @@
>  
>  #include <linux/array_size.h>
>  #include <linux/minmax.h>
> +#include <linux/mm.h>
>  #include <vdso/datapage.h>
>  #include <vdso/getrandom.h>
>  #include <vdso/unaligned.h>

Looks like this should be a separate change?

Will




[Index of Archives]     [Kernel]     [Gnu Classpath]     [Gnu Crypto]     [DM Crypt]     [Netfilter]     [Bugtraq]
  Powered by Linux