Re: [PATCH v14 7/7] x86: vdso: Wire up getrandom() vDSO implementation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




Le 01/01/2023 à 17:29, Jason A. Donenfeld a écrit :
> Hook up the generic vDSO implementation to the x86 vDSO data page. Since
> the existing vDSO infrastructure is heavily based on the timekeeping
> functionality, which works over arrays of bases, a new macro is
> introduced for vvars that are not arrays.
> 
> The vDSO function requires a ChaCha20 implementation that does not write
> to the stack, yet can still do an entire ChaCha20 permutation, so
> provide this using SSE2, since this is userland code that must work on
> all x86-64 processors. There's a simple test for this code as well.

As far as I understand the test is not dependent on the architecture, 
can it be a separate patch ?

Also, as the chacha implementation is standalone and can be tested by 
the above mentionned simple test, can it be a separate patch as well ?

Then the last patch only has the glue to wire-up getrandom VDSO to the 
architecture, and can be used as a reference for other architectures.

> 
> Reviewed-by: Samuel Neves <sneves@xxxxxxxxx> # for vgetrandom-chacha.S
> Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx>
> ---
>   arch/x86/Kconfig                              |   1 +
>   arch/x86/entry/vdso/Makefile                  |   3 +-
>   arch/x86/entry/vdso/vdso.lds.S                |   2 +
>   arch/x86/entry/vdso/vgetrandom-chacha.S       | 178 ++++++++++++++++++
>   arch/x86/entry/vdso/vgetrandom.c              |  17 ++
>   arch/x86/include/asm/vdso/getrandom.h         |  55 ++++++
>   arch/x86/include/asm/vdso/vsyscall.h          |   2 +
>   arch/x86/include/asm/vvar.h                   |  16 ++
>   tools/testing/selftests/vDSO/.gitignore       |   1 +
>   tools/testing/selftests/vDSO/Makefile         |   9 +
>   .../testing/selftests/vDSO/vdso_test_chacha.c |  43 +++++
>   11 files changed, 326 insertions(+), 1 deletion(-)
>   create mode 100644 arch/x86/entry/vdso/vgetrandom-chacha.S
>   create mode 100644 arch/x86/entry/vdso/vgetrandom.c
>   create mode 100644 arch/x86/include/asm/vdso/getrandom.h
>   create mode 100644 tools/testing/selftests/vDSO/vdso_test_chacha.c
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 3604074a878b..ed689d831362 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -272,6 +272,7 @@ config X86
>   	select HAVE_UNSTABLE_SCHED_CLOCK
>   	select HAVE_USER_RETURN_NOTIFIER
>   	select HAVE_GENERIC_VDSO
> +	select VDSO_GETRANDOM			if X86_64
>   	select HOTPLUG_SMT			if SMP
>   	select IRQ_FORCED_THREADING
>   	select NEED_PER_CPU_EMBED_FIRST_CHUNK
> diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
> index 838613ac15b8..3979bb4a61ae 100644
> --- a/arch/x86/entry/vdso/Makefile
> +++ b/arch/x86/entry/vdso/Makefile
> @@ -27,7 +27,7 @@ VDSO32-$(CONFIG_X86_32)		:= y
>   VDSO32-$(CONFIG_IA32_EMULATION)	:= y
>   
>   # files to link into the vdso
> -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
> +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o
>   vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
>   vobjs32-y += vdso32/vclock_gettime.o
>   vobjs-$(CONFIG_X86_SGX)	+= vsgx.o
> @@ -105,6 +105,7 @@ CFLAGS_REMOVE_vclock_gettime.o = -pg
>   CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
>   CFLAGS_REMOVE_vgetcpu.o = -pg
>   CFLAGS_REMOVE_vsgx.o = -pg
> +CFLAGS_REMOVE_vgetrandom.o = -pg
>   
>   #
>   # X32 processes use x32 vDSO to access 64bit kernel data.
> diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
> index e8c60ae7a7c8..0bab5f4af6d1 100644
> --- a/arch/x86/entry/vdso/vdso.lds.S
> +++ b/arch/x86/entry/vdso/vdso.lds.S
> @@ -30,6 +30,8 @@ VERSION {
>   #ifdef CONFIG_X86_SGX
>   		__vdso_sgx_enter_enclave;
>   #endif
> +		getrandom;
> +		__vdso_getrandom;
>   	local: *;
>   	};
>   }
> diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S
> new file mode 100644
> index 000000000000..d79e2bd97598
> --- /dev/null
> +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S
> @@ -0,0 +1,178 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/frame.h>
> +
> +.section	.rodata, "a"
> +.align 16
> +CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
> +.text
> +
> +/*
> + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
> + * of blocks of output with a nonce of 0, taking an input key and 8-byte
> + * counter. Importantly does not spill to the stack. Its arguments are:
> + *
> + *	rdi: output bytes
> + *	rsi: 32-byte key input
> + *	rdx: 8-byte counter input/output

Why a 8-byte counter ? According to RFC 7539, chacha20 takes:

    The inputs to ChaCha20 are:

    o  A 256-bit key, treated as a concatenation of eight 32-bit little-
       endian integers.

    o  A 96-bit nonce, treated as a concatenation of three 32-bit little-
       endian integers.

    o  A 32-bit block count parameter, treated as a 32-bit little-endian
       integer.


Are you mixing up the upper part of the counter with the nonce ? In that 
case you can't say you use a 0 nonce, can you ?


> + *	rcx: number of 64-byte blocks to write to output
> + */
> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> +
> +.set	output,		%rdi
> +.set	key,		%rsi
> +.set	counter,	%rdx
> +.set	nblocks,	%rcx
> +.set	i,		%al
> +/* xmm registers are *not* callee-save. */
> +.set	state0,		%xmm0
> +.set	state1,		%xmm1
> +.set	state2,		%xmm2
> +.set	state3,		%xmm3
> +.set	copy0,		%xmm4
> +.set	copy1,		%xmm5
> +.set	copy2,		%xmm6
> +.set	copy3,		%xmm7
> +.set	temp,		%xmm8
> +.set	one,		%xmm9
> +
> +	/* copy0 = "expand 32-byte k" */
> +	movaps		CONSTANTS(%rip),copy0
> +	/* copy1,copy2 = key */
> +	movups		0x00(key),copy1
> +	movups		0x10(key),copy2
> +	/* copy3 = counter || zero nonce */
> +	movq		0x00(counter),copy3
> +	/* one = 1 || 0 */
> +	movq		$1,%rax
> +	movq		%rax,one
> +
> +.Lblock:
> +	/* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
> +	movdqa		copy0,state0
> +	movdqa		copy1,state1
> +	movdqa		copy2,state2
> +	movdqa		copy3,state3
> +
> +	movb		$10,i
> +.Lpermute:
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> +	paddd		state1,state0
> +	pxor		state0,state3
> +	movdqa		state3,temp
> +	pslld		$16,temp
> +	psrld		$16,state3
> +	por		temp,state3

There is a lot of similarities between all the blocks, can you use GAS 
macros to avoid repetitions ?


> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> +	paddd		state3,state2
> +	pxor		state2,state1
> +	movdqa		state1,temp
> +	pslld		$12,temp
> +	psrld		$20,state1
> +	por		temp,state1
> +
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> +	paddd		state1,state0
> +	pxor		state0,state3
> +	movdqa		state3,temp
> +	pslld		$8,temp
> +	psrld		$24,state3
> +	por		temp,state3
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> +	paddd		state3,state2
> +	pxor		state2,state1
> +	movdqa		state1,temp
> +	pslld		$7,temp
> +	psrld		$25,state1
> +	por		temp,state1
> +
> +	/* state1[0,1,2,3] = state1[1,2,3,0] */
> +	pshufd		$0x39,state1,state1
> +	/* state2[0,1,2,3] = state2[2,3,0,1] */
> +	pshufd		$0x4e,state2,state2
> +	/* state3[0,1,2,3] = state3[3,0,1,2] */
> +	pshufd		$0x93,state3,state3
> +
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> +	paddd		state1,state0
> +	pxor		state0,state3
> +	movdqa		state3,temp
> +	pslld		$16,temp
> +	psrld		$16,state3
> +	por		temp,state3
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> +	paddd		state3,state2
> +	pxor		state2,state1
> +	movdqa		state1,temp
> +	pslld		$12,temp
> +	psrld		$20,state1
> +	por		temp,state1
> +
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> +	paddd		state1,state0
> +	pxor		state0,state3
> +	movdqa		state3,temp
> +	pslld		$8,temp
> +	psrld		$24,state3
> +	por		temp,state3
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> +	paddd		state3,state2
> +	pxor		state2,state1
> +	movdqa		state1,temp
> +	pslld		$7,temp
> +	psrld		$25,state1
> +	por		temp,state1
> +
> +	/* state1[0,1,2,3] = state1[3,0,1,2] */
> +	pshufd		$0x93,state1,state1
> +	/* state2[0,1,2,3] = state2[2,3,0,1] */
> +	pshufd		$0x4e,state2,state2
> +	/* state3[0,1,2,3] = state3[1,2,3,0] */
> +	pshufd		$0x39,state3,state3
> +
> +	decb		i
> +	jnz		.Lpermute
> +
> +	/* output0 = state0 + copy0 */
> +	paddd		copy0,state0
> +	movups		state0,0x00(output)
> +	/* output1 = state1 + copy1 */
> +	paddd		copy1,state1
> +	movups		state1,0x10(output)
> +	/* output2 = state2 + copy2 */
> +	paddd		copy2,state2
> +	movups		state2,0x20(output)
> +	/* output3 = state3 + copy3 */
> +	paddd		copy3,state3
> +	movups		state3,0x30(output)
> +
> +	/* ++copy3.counter */
> +	paddq		one,copy3
> +
> +	/* output += 64, --nblocks */
> +	addq		$64,output
> +	decq		nblocks
> +	jnz		.Lblock
> +
> +	/* counter = copy3.counter */
> +	movq		copy3,0x00(counter)
> +
> +	/* Zero out the potentially sensitive regs, in case nothing uses these again. */
> +	pxor		state0,state0
> +	pxor		state1,state1
> +	pxor		state2,state2
> +	pxor		state3,state3
> +	pxor		copy1,copy1
> +	pxor		copy2,copy2
> +	pxor		temp,temp
> +
> +	ret
> +SYM_FUNC_END(__arch_chacha20_blocks_nostack)
> diff --git a/arch/x86/entry/vdso/vgetrandom.c b/arch/x86/entry/vdso/vgetrandom.c
> new file mode 100644
> index 000000000000..6045ded5da90
> --- /dev/null
> +++ b/arch/x86/entry/vdso/vgetrandom.c
> @@ -0,0 +1,17 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> + */
> +#include <linux/types.h>
> +
> +#include "../../../../lib/vdso/getrandom.c"
> +
> +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state);
> +
> +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state)
> +{
> +	return __cvdso_getrandom(buffer, len, flags, state);
> +}
> +
> +ssize_t getrandom(void *, size_t, unsigned int, void *)
> +	__attribute__((weak, alias("__vdso_getrandom")));
> diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h
> new file mode 100644
> index 000000000000..46f99d735ae6
> --- /dev/null
> +++ b/arch/x86/include/asm/vdso/getrandom.h
> @@ -0,0 +1,55 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> + */
> +#ifndef __ASM_VDSO_GETRANDOM_H
> +#define __ASM_VDSO_GETRANDOM_H
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <asm/unistd.h>
> +#include <asm/vvar.h>
> +
> +/**
> + * getrandom_syscall - Invoke the getrandom() syscall.
> + * @buffer:	Destination buffer to fill with random bytes.
> + * @len:	Size of @buffer in bytes.
> + * @flags:	Zero or more GRND_* flags.
> + * Returns the number of random bytes written to @buffer, or a negative value indicating an error.
> + */
> +static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags)
> +{
> +	long ret;
> +
> +	asm ("syscall" : "=a" (ret) :
> +	     "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) :
> +	     "rcx", "r11", "memory");
> +
> +	return ret;
> +}
> +
> +#define __vdso_rng_data (VVAR(_vdso_rng_data))
> +
> +static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
> +{
> +	if (__vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
> +		return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data);
> +	return &__vdso_rng_data;
> +}
> +
> +/**
> + * __arch_chacha20_blocks_nostack - Generate ChaCha20 stream without using the stack.
> + * @dst_bytes:	Destination buffer to hold @nblocks * 64 bytes of output.
> + * @key:	32-byte input key.
> + * @counter:	8-byte counter, read on input and updated on return.
> + * @nblocks:	Number of blocks to generate.
> + *
> + * Generates a given positive number of blocks of ChaCha20 output with nonce=0, and does not write
> + * to any stack or memory outside of the parameters passed to it, in order to mitigate stack data
> + * leaking into forked child processes.
> + */
> +extern void __arch_chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks);
> +
> +#endif /* !__ASSEMBLY__ */
> +
> +#endif /* __ASM_VDSO_GETRANDOM_H */
> diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h
> index be199a9b2676..71c56586a22f 100644
> --- a/arch/x86/include/asm/vdso/vsyscall.h
> +++ b/arch/x86/include/asm/vdso/vsyscall.h
> @@ -11,6 +11,8 @@
>   #include <asm/vvar.h>
>   
>   DEFINE_VVAR(struct vdso_data, _vdso_data);
> +DEFINE_VVAR_SINGLE(struct vdso_rng_data, _vdso_rng_data);
> +
>   /*
>    * Update the vDSO data page to keep in sync with kernel timekeeping.
>    */
> diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
> index 183e98e49ab9..9d9af37f7cab 100644
> --- a/arch/x86/include/asm/vvar.h
> +++ b/arch/x86/include/asm/vvar.h
> @@ -26,6 +26,8 @@
>    */
>   #define DECLARE_VVAR(offset, type, name) \
>   	EMIT_VVAR(name, offset)
> +#define DECLARE_VVAR_SINGLE(offset, type, name) \
> +	EMIT_VVAR(name, offset)
>   
>   #else
>   
> @@ -37,6 +39,10 @@ extern char __vvar_page;
>   	extern type timens_ ## name[CS_BASES]				\
>   	__attribute__((visibility("hidden")));				\
>   
> +#define DECLARE_VVAR_SINGLE(offset, type, name)				\
> +	extern type vvar_ ## name					\
> +	__attribute__((visibility("hidden")));				\
> +
>   #define VVAR(name) (vvar_ ## name)
>   #define TIMENS(name) (timens_ ## name)
>   
> @@ -44,12 +50,22 @@ extern char __vvar_page;
>   	type name[CS_BASES]						\
>   	__attribute__((section(".vvar_" #name), aligned(16))) __visible
>   
> +#define DEFINE_VVAR_SINGLE(type, name)					\
> +	type name							\
> +	__attribute__((section(".vvar_" #name), aligned(16))) __visible
> +
>   #endif
>   
>   /* DECLARE_VVAR(offset, type, name) */
>   
>   DECLARE_VVAR(128, struct vdso_data, _vdso_data)
>   
> +#if !defined(_SINGLE_DATA)
> +#define _SINGLE_DATA
> +DECLARE_VVAR_SINGLE(640, struct vdso_rng_data, _vdso_rng_data)
> +#endif
> +
>   #undef DECLARE_VVAR
> +#undef DECLARE_VVAR_SINGLE
>   
>   #endif
> diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore
> index 7dbfdec53f3d..30d5c8f0e5c7 100644
> --- a/tools/testing/selftests/vDSO/.gitignore
> +++ b/tools/testing/selftests/vDSO/.gitignore
> @@ -7,3 +7,4 @@ vdso_test_gettimeofday
>   vdso_test_getcpu
>   vdso_standalone_test_x86
>   vdso_test_getrandom
> +vdso_test_chacha
> diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile
> index a33b4d200a32..54a015afe60c 100644
> --- a/tools/testing/selftests/vDSO/Makefile
> +++ b/tools/testing/selftests/vDSO/Makefile
> @@ -3,6 +3,7 @@ include ../lib.mk
>   
>   uname_M := $(shell uname -m 2>/dev/null || echo not)
>   ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
> +SODIUM := $(shell pkg-config --libs libsodium 2>/dev/null)
>   
>   TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday $(OUTPUT)/vdso_test_getcpu
>   TEST_GEN_PROGS += $(OUTPUT)/vdso_test_abi
> @@ -12,9 +13,15 @@ TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86
>   endif
>   TEST_GEN_PROGS += $(OUTPUT)/vdso_test_correctness
>   TEST_GEN_PROGS += $(OUTPUT)/vdso_test_getrandom
> +ifeq ($(uname_M),x86_64)
> +ifneq ($(SODIUM),)
> +TEST_GEN_PROGS += $(OUTPUT)/vdso_test_chacha
> +endif
> +endif
>   
>   CFLAGS := -std=gnu99
>   CFLAGS_vdso_standalone_test_x86 := -nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector
> +CFLAGS_vdso_test_chacha := $(SODIUM) -idirafter $(top_srcdir)/include -idirafter $(top_srcdir)/arch/$(ARCH)/include -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 -Wa,--noexecstack
>   LDFLAGS_vdso_test_correctness := -ldl
>   ifeq ($(CONFIG_X86_32),y)
>   LDLIBS += -lgcc_s
> @@ -35,3 +42,5 @@ $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c
>   		-o $@ \
>   		$(LDFLAGS_vdso_test_correctness)
>   $(OUTPUT)/vdso_test_getrandom: parse_vdso.c
> +$(OUTPUT)/vdso_test_chacha: CFLAGS += $(CFLAGS_vdso_test_chacha)
> +$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/arch/$(ARCH)/entry/vdso/vgetrandom-chacha.S
> diff --git a/tools/testing/selftests/vDSO/vdso_test_chacha.c b/tools/testing/selftests/vDSO/vdso_test_chacha.c
> new file mode 100644
> index 000000000000..bce7a7752b11
> --- /dev/null
> +++ b/tools/testing/selftests/vDSO/vdso_test_chacha.c
> @@ -0,0 +1,43 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> + */
> +
> +#include <sodium/crypto_stream_chacha20.h>

Is that standard ? Every distribution has sodium ?

> +#include <sys/random.h>
> +#include <string.h>
> +#include <stdint.h>
> +#include "../kselftest.h"
> +
> +extern void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, const uint8_t *key, uint32_t *counter, size_t nblocks);
> +
> +int main(int argc, char *argv[])
> +{
> +	enum { TRIALS = 1000, BLOCKS = 128, BLOCK_SIZE = 64 };
> +	static const uint8_t nonce[8] = { 0 };
> +	uint32_t counter[2];
> +	uint8_t key[32];
> +	uint8_t output1[BLOCK_SIZE * BLOCKS], output2[BLOCK_SIZE * BLOCKS];
> +
> +	ksft_print_header();
> +	ksft_set_plan(1);
> +
> +	for (unsigned int trial = 0; trial < TRIALS; ++trial) {
> +		if (getrandom(key, sizeof(key), 0) != sizeof(key)) {
> +			printf("getrandom() failed!\n");
> +			return KSFT_SKIP;
> +		}
> +		crypto_stream_chacha20(output1, sizeof(output1), nonce, key);
> +		for (unsigned int split = 0; split < BLOCKS; ++split) {
> +			memset(output2, 'X', sizeof(output2));
> +			memset(counter, 0, sizeof(counter));
> +			if (split)
> +				__arch_chacha20_blocks_nostack(output2, key, counter, split);
> +			__arch_chacha20_blocks_nostack(output2 + split * BLOCK_SIZE, key, counter, BLOCKS - split);
> +			if (memcmp(output1, output2, sizeof(output1)))
> +				return KSFT_FAIL;
> +		}
> +	}
> +	ksft_test_result_pass("chacha: PASS\n");
> +	return KSFT_PASS;
> +}

Christophe




[Index of Archives]     [Kernel]     [Gnu Classpath]     [Gnu Crypto]     [DM Crypt]     [Netfilter]     [Bugtraq]
  Powered by Linux