Le 01/01/2023 à 17:29, Jason A. Donenfeld a écrit : > Hook up the generic vDSO implementation to the x86 vDSO data page. Since > the existing vDSO infrastructure is heavily based on the timekeeping > functionality, which works over arrays of bases, a new macro is > introduced for vvars that are not arrays. > > The vDSO function requires a ChaCha20 implementation that does not write > to the stack, yet can still do an entire ChaCha20 permutation, so > provide this using SSE2, since this is userland code that must work on > all x86-64 processors. There's a simple test for this code as well. As far as I understand, the test is not dependent on the architecture; can it be a separate patch? Also, as the chacha implementation is standalone and can be tested by the above-mentioned simple test, can it be a separate patch as well? Then the last patch only has the glue to wire up the getrandom vDSO to the architecture, and can be used as a reference for other architectures. > > Reviewed-by: Samuel Neves <sneves@xxxxxxxxx> # for vgetrandom-chacha.S > Signed-off-by: Jason A. 
Donenfeld <Jason@xxxxxxxxx> > --- > arch/x86/Kconfig | 1 + > arch/x86/entry/vdso/Makefile | 3 +- > arch/x86/entry/vdso/vdso.lds.S | 2 + > arch/x86/entry/vdso/vgetrandom-chacha.S | 178 ++++++++++++++++++ > arch/x86/entry/vdso/vgetrandom.c | 17 ++ > arch/x86/include/asm/vdso/getrandom.h | 55 ++++++ > arch/x86/include/asm/vdso/vsyscall.h | 2 + > arch/x86/include/asm/vvar.h | 16 ++ > tools/testing/selftests/vDSO/.gitignore | 1 + > tools/testing/selftests/vDSO/Makefile | 9 + > .../testing/selftests/vDSO/vdso_test_chacha.c | 43 +++++ > 11 files changed, 326 insertions(+), 1 deletion(-) > create mode 100644 arch/x86/entry/vdso/vgetrandom-chacha.S > create mode 100644 arch/x86/entry/vdso/vgetrandom.c > create mode 100644 arch/x86/include/asm/vdso/getrandom.h > create mode 100644 tools/testing/selftests/vDSO/vdso_test_chacha.c > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index 3604074a878b..ed689d831362 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -272,6 +272,7 @@ config X86 > select HAVE_UNSTABLE_SCHED_CLOCK > select HAVE_USER_RETURN_NOTIFIER > select HAVE_GENERIC_VDSO > + select VDSO_GETRANDOM if X86_64 > select HOTPLUG_SMT if SMP > select IRQ_FORCED_THREADING > select NEED_PER_CPU_EMBED_FIRST_CHUNK > diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile > index 838613ac15b8..3979bb4a61ae 100644 > --- a/arch/x86/entry/vdso/Makefile > +++ b/arch/x86/entry/vdso/Makefile > @@ -27,7 +27,7 @@ VDSO32-$(CONFIG_X86_32) := y > VDSO32-$(CONFIG_IA32_EMULATION) := y > > # files to link into the vdso > -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o > +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o > vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o > vobjs32-y += vdso32/vclock_gettime.o > vobjs-$(CONFIG_X86_SGX) += vsgx.o > @@ -105,6 +105,7 @@ CFLAGS_REMOVE_vclock_gettime.o = -pg > CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg > CFLAGS_REMOVE_vgetcpu.o = -pg > CFLAGS_REMOVE_vsgx.o = 
-pg > +CFLAGS_REMOVE_vgetrandom.o = -pg > > # > # X32 processes use x32 vDSO to access 64bit kernel data. > diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S > index e8c60ae7a7c8..0bab5f4af6d1 100644 > --- a/arch/x86/entry/vdso/vdso.lds.S > +++ b/arch/x86/entry/vdso/vdso.lds.S > @@ -30,6 +30,8 @@ VERSION { > #ifdef CONFIG_X86_SGX > __vdso_sgx_enter_enclave; > #endif > + getrandom; > + __vdso_getrandom; > local: *; > }; > } > diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S > new file mode 100644 > index 000000000000..d79e2bd97598 > --- /dev/null > +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S > @@ -0,0 +1,178 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved. > + */ > + > +#include <linux/linkage.h> > +#include <asm/frame.h> > + > +.section .rodata, "a" > +.align 16 > +CONSTANTS: .octa 0x6b20657479622d323320646e61707865 > +.text > + > +/* > + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number > + * of blocks of output with a nonce of 0, taking an input key and 8-byte > + * counter. Importantly does not spill to the stack. Its arguments are: > + * > + * rdi: output bytes > + * rsi: 32-byte key input > + * rdx: 8-byte counter input/output Why an 8-byte counter? According to RFC 7539, ChaCha20 takes: The inputs to ChaCha20 are: o A 256-bit key, treated as a concatenation of eight 32-bit little-endian integers. o A 96-bit nonce, treated as a concatenation of three 32-bit little-endian integers. o A 32-bit block count parameter, treated as a 32-bit little-endian integer. Are you mixing up the upper part of the counter with the nonce? In that case you can't say you use a 0 nonce, can you? 
> + * rcx: number of 64-byte blocks to write to output > + */ > +SYM_FUNC_START(__arch_chacha20_blocks_nostack) > + > +.set output, %rdi > +.set key, %rsi > +.set counter, %rdx > +.set nblocks, %rcx > +.set i, %al > +/* xmm registers are *not* callee-save. */ > +.set state0, %xmm0 > +.set state1, %xmm1 > +.set state2, %xmm2 > +.set state3, %xmm3 > +.set copy0, %xmm4 > +.set copy1, %xmm5 > +.set copy2, %xmm6 > +.set copy3, %xmm7 > +.set temp, %xmm8 > +.set one, %xmm9 > + > + /* copy0 = "expand 32-byte k" */ > + movaps CONSTANTS(%rip),copy0 > + /* copy1,copy2 = key */ > + movups 0x00(key),copy1 > + movups 0x10(key),copy2 > + /* copy3 = counter || zero nonce */ > + movq 0x00(counter),copy3 > + /* one = 1 || 0 */ > + movq $1,%rax > + movq %rax,one > + > +.Lblock: > + /* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */ > + movdqa copy0,state0 > + movdqa copy1,state1 > + movdqa copy2,state2 > + movdqa copy3,state3 > + > + movb $10,i > +.Lpermute: > + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ > + paddd state1,state0 > + pxor state0,state3 > + movdqa state3,temp > + pslld $16,temp > + psrld $16,state3 > + por temp,state3 There are a lot of similarities between all the blocks; can you use GAS macros to avoid the repetition? 
> + > + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ > + paddd state3,state2 > + pxor state2,state1 > + movdqa state1,temp > + pslld $12,temp > + psrld $20,state1 > + por temp,state1 > + > + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ > + paddd state1,state0 > + pxor state0,state3 > + movdqa state3,temp > + pslld $8,temp > + psrld $24,state3 > + por temp,state3 > + > + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ > + paddd state3,state2 > + pxor state2,state1 > + movdqa state1,temp > + pslld $7,temp > + psrld $25,state1 > + por temp,state1 > + > + /* state1[0,1,2,3] = state1[1,2,3,0] */ > + pshufd $0x39,state1,state1 > + /* state2[0,1,2,3] = state2[2,3,0,1] */ > + pshufd $0x4e,state2,state2 > + /* state3[0,1,2,3] = state3[3,0,1,2] */ > + pshufd $0x93,state3,state3 > + > + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ > + paddd state1,state0 > + pxor state0,state3 > + movdqa state3,temp > + pslld $16,temp > + psrld $16,state3 > + por temp,state3 > + > + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ > + paddd state3,state2 > + pxor state2,state1 > + movdqa state1,temp > + pslld $12,temp > + psrld $20,state1 > + por temp,state1 > + > + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ > + paddd state1,state0 > + pxor state0,state3 > + movdqa state3,temp > + pslld $8,temp > + psrld $24,state3 > + por temp,state3 > + > + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ > + paddd state3,state2 > + pxor state2,state1 > + movdqa state1,temp > + pslld $7,temp > + psrld $25,state1 > + por temp,state1 > + > + /* state1[0,1,2,3] = state1[3,0,1,2] */ > + pshufd $0x93,state1,state1 > + /* state2[0,1,2,3] = state2[2,3,0,1] */ > + pshufd $0x4e,state2,state2 > + /* state3[0,1,2,3] = state3[1,2,3,0] */ > + pshufd $0x39,state3,state3 > + > + decb i > + jnz .Lpermute > + > + /* output0 = state0 + copy0 */ > + paddd copy0,state0 > + movups state0,0x00(output) > + /* output1 = state1 + 
copy1 */ > + paddd copy1,state1 > + movups state1,0x10(output) > + /* output2 = state2 + copy2 */ > + paddd copy2,state2 > + movups state2,0x20(output) > + /* output3 = state3 + copy3 */ > + paddd copy3,state3 > + movups state3,0x30(output) > + > + /* ++copy3.counter */ > + paddq one,copy3 > + > + /* output += 64, --nblocks */ > + addq $64,output > + decq nblocks > + jnz .Lblock > + > + /* counter = copy3.counter */ > + movq copy3,0x00(counter) > + > + /* Zero out the potentially sensitive regs, in case nothing uses these again. */ > + pxor state0,state0 > + pxor state1,state1 > + pxor state2,state2 > + pxor state3,state3 > + pxor copy1,copy1 > + pxor copy2,copy2 > + pxor temp,temp > + > + ret > +SYM_FUNC_END(__arch_chacha20_blocks_nostack) > diff --git a/arch/x86/entry/vdso/vgetrandom.c b/arch/x86/entry/vdso/vgetrandom.c > new file mode 100644 > index 000000000000..6045ded5da90 > --- /dev/null > +++ b/arch/x86/entry/vdso/vgetrandom.c > @@ -0,0 +1,17 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved. > + */ > +#include <linux/types.h> > + > +#include "../../../../lib/vdso/getrandom.c" > + > +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state); > + > +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state) > +{ > + return __cvdso_getrandom(buffer, len, flags, state); > +} > + > +ssize_t getrandom(void *, size_t, unsigned int, void *) > + __attribute__((weak, alias("__vdso_getrandom"))); > diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h > new file mode 100644 > index 000000000000..46f99d735ae6 > --- /dev/null > +++ b/arch/x86/include/asm/vdso/getrandom.h > @@ -0,0 +1,55 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved. 
> + */ > +#ifndef __ASM_VDSO_GETRANDOM_H > +#define __ASM_VDSO_GETRANDOM_H > + > +#ifndef __ASSEMBLY__ > + > +#include <asm/unistd.h> > +#include <asm/vvar.h> > + > +/** > + * getrandom_syscall - Invoke the getrandom() syscall. > + * @buffer: Destination buffer to fill with random bytes. > + * @len: Size of @buffer in bytes. > + * @flags: Zero or more GRND_* flags. > + * Returns the number of random bytes written to @buffer, or a negative value indicating an error. > + */ > +static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags) > +{ > + long ret; > + > + asm ("syscall" : "=a" (ret) : > + "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) : > + "rcx", "r11", "memory"); > + > + return ret; > +} > + > +#define __vdso_rng_data (VVAR(_vdso_rng_data)) > + > +static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) > +{ > + if (__vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) > + return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data); > + return &__vdso_rng_data; > +} > + > +/** > + * __arch_chacha20_blocks_nostack - Generate ChaCha20 stream without using the stack. > + * @dst_bytes: Destination buffer to hold @nblocks * 64 bytes of output. > + * @key: 32-byte input key. > + * @counter: 8-byte counter, read on input and updated on return. > + * @nblocks: Number of blocks to generate. > + * > + * Generates a given positive number of blocks of ChaCha20 output with nonce=0, and does not write > + * to any stack or memory outside of the parameters passed to it, in order to mitigate stack data > + * leaking into forked child processes. 
> + */ > +extern void __arch_chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks); > + > +#endif /* !__ASSEMBLY__ */ > + > +#endif /* __ASM_VDSO_GETRANDOM_H */ > diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h > index be199a9b2676..71c56586a22f 100644 > --- a/arch/x86/include/asm/vdso/vsyscall.h > +++ b/arch/x86/include/asm/vdso/vsyscall.h > @@ -11,6 +11,8 @@ > #include <asm/vvar.h> > > DEFINE_VVAR(struct vdso_data, _vdso_data); > +DEFINE_VVAR_SINGLE(struct vdso_rng_data, _vdso_rng_data); > + > /* > * Update the vDSO data page to keep in sync with kernel timekeeping. > */ > diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h > index 183e98e49ab9..9d9af37f7cab 100644 > --- a/arch/x86/include/asm/vvar.h > +++ b/arch/x86/include/asm/vvar.h > @@ -26,6 +26,8 @@ > */ > #define DECLARE_VVAR(offset, type, name) \ > EMIT_VVAR(name, offset) > +#define DECLARE_VVAR_SINGLE(offset, type, name) \ > + EMIT_VVAR(name, offset) > > #else > > @@ -37,6 +39,10 @@ extern char __vvar_page; > extern type timens_ ## name[CS_BASES] \ > __attribute__((visibility("hidden"))); \ > > +#define DECLARE_VVAR_SINGLE(offset, type, name) \ > + extern type vvar_ ## name \ > + __attribute__((visibility("hidden"))); \ > + > #define VVAR(name) (vvar_ ## name) > #define TIMENS(name) (timens_ ## name) > > @@ -44,12 +50,22 @@ extern char __vvar_page; > type name[CS_BASES] \ > __attribute__((section(".vvar_" #name), aligned(16))) __visible > > +#define DEFINE_VVAR_SINGLE(type, name) \ > + type name \ > + __attribute__((section(".vvar_" #name), aligned(16))) __visible > + > #endif > > /* DECLARE_VVAR(offset, type, name) */ > > DECLARE_VVAR(128, struct vdso_data, _vdso_data) > > +#if !defined(_SINGLE_DATA) > +#define _SINGLE_DATA > +DECLARE_VVAR_SINGLE(640, struct vdso_rng_data, _vdso_rng_data) > +#endif > + > #undef DECLARE_VVAR > +#undef DECLARE_VVAR_SINGLE > > #endif > diff --git 
a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore > index 7dbfdec53f3d..30d5c8f0e5c7 100644 > --- a/tools/testing/selftests/vDSO/.gitignore > +++ b/tools/testing/selftests/vDSO/.gitignore > @@ -7,3 +7,4 @@ vdso_test_gettimeofday > vdso_test_getcpu > vdso_standalone_test_x86 > vdso_test_getrandom > +vdso_test_chacha > diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile > index a33b4d200a32..54a015afe60c 100644 > --- a/tools/testing/selftests/vDSO/Makefile > +++ b/tools/testing/selftests/vDSO/Makefile > @@ -3,6 +3,7 @@ include ../lib.mk > > uname_M := $(shell uname -m 2>/dev/null || echo not) > ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) > +SODIUM := $(shell pkg-config --libs libsodium 2>/dev/null) > > TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday $(OUTPUT)/vdso_test_getcpu > TEST_GEN_PROGS += $(OUTPUT)/vdso_test_abi > @@ -12,9 +13,15 @@ TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86 > endif > TEST_GEN_PROGS += $(OUTPUT)/vdso_test_correctness > TEST_GEN_PROGS += $(OUTPUT)/vdso_test_getrandom > +ifeq ($(uname_M),x86_64) > +ifneq ($(SODIUM),) > +TEST_GEN_PROGS += $(OUTPUT)/vdso_test_chacha > +endif > +endif > > CFLAGS := -std=gnu99 > CFLAGS_vdso_standalone_test_x86 := -nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector > +CFLAGS_vdso_test_chacha := $(SODIUM) -idirafter $(top_srcdir)/include -idirafter $(top_srcdir)/arch/$(ARCH)/include -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 -Wa,--noexecstack > LDFLAGS_vdso_test_correctness := -ldl > ifeq ($(CONFIG_X86_32),y) > LDLIBS += -lgcc_s > @@ -35,3 +42,5 @@ $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c > -o $@ \ > $(LDFLAGS_vdso_test_correctness) > $(OUTPUT)/vdso_test_getrandom: parse_vdso.c > +$(OUTPUT)/vdso_test_chacha: CFLAGS += $(CFLAGS_vdso_test_chacha) > +$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/arch/$(ARCH)/entry/vdso/vgetrandom-chacha.S > diff --git 
a/tools/testing/selftests/vDSO/vdso_test_chacha.c b/tools/testing/selftests/vDSO/vdso_test_chacha.c > new file mode 100644 > index 000000000000..bce7a7752b11 > --- /dev/null > +++ b/tools/testing/selftests/vDSO/vdso_test_chacha.c > @@ -0,0 +1,43 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved. > + */ > + > +#include <sodium/crypto_stream_chacha20.h> Is that standard? Does every distribution have libsodium? > +#include <sys/random.h> > +#include <string.h> > +#include <stdint.h> > +#include "../kselftest.h" > + > +extern void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, const uint8_t *key, uint32_t *counter, size_t nblocks); > + > +int main(int argc, char *argv[]) > +{ > + enum { TRIALS = 1000, BLOCKS = 128, BLOCK_SIZE = 64 }; > + static const uint8_t nonce[8] = { 0 }; > + uint32_t counter[2]; > + uint8_t key[32]; > + uint8_t output1[BLOCK_SIZE * BLOCKS], output2[BLOCK_SIZE * BLOCKS]; > + > + ksft_print_header(); > + ksft_set_plan(1); > + > + for (unsigned int trial = 0; trial < TRIALS; ++trial) { > + if (getrandom(key, sizeof(key), 0) != sizeof(key)) { > + printf("getrandom() failed!\n"); > + return KSFT_SKIP; > + } > + crypto_stream_chacha20(output1, sizeof(output1), nonce, key); > + for (unsigned int split = 0; split < BLOCKS; ++split) { > + memset(output2, 'X', sizeof(output2)); > + memset(counter, 0, sizeof(counter)); > + if (split) > + __arch_chacha20_blocks_nostack(output2, key, counter, split); > + __arch_chacha20_blocks_nostack(output2 + split * BLOCK_SIZE, key, counter, BLOCKS - split); > + if (memcmp(output1, output2, sizeof(output1))) > + return KSFT_FAIL; > + } > + } > + ksft_test_result_pass("chacha: PASS\n"); > + return KSFT_PASS; > +} Christophe