Re: [PATCH v13 7/7] x86: vdso: Wire up getrandom() vDSO implementation

Eric Biggers <ebiggers@xxxxxxxxxx> · Wed, 21 Dec 2022 15:27:04 -0800

On Wed, Dec 21, 2022 at 03:23:27PM +0100, Jason A. Donenfeld wrote:
> diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S
> new file mode 100644
> index 000000000000..91fbb7ac7af4
> --- /dev/null
> +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S
> @@ -0,0 +1,177 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/frame.h>
> +
> +.section	.rodata.cst16.CONSTANTS, "aM", @progbits, 16
> +.align 16
> +CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
> +.text

For simplicity, maybe drop the mergeable-section attributes (the ".cst16" name
and the '"aM", @progbits, 16' flags) and just have a plain ".section .rodata"?

> +/*
> + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
> + * of blocks of output with a nonce of 0, taking an input key and 8-byte
> + * counter. Importantly does not spill to the stack. Its arguments are:
> + *
> + *	rdi: output bytes
> + *	rsi: 32-byte key input
> + *	rdx: 8-byte counter input/output
> + *	rcx: number of 64-byte blocks to write to output
> + */
> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> +
> +#define output  %rdi
> +#define key     %rsi
> +#define counter %rdx
> +#define nblocks %rcx
> +#define i       %al
> +#define state0  %xmm0
> +#define state1  %xmm1
> +#define state2  %xmm2
> +#define state3  %xmm3
> +#define copy0   %xmm4
> +#define copy1   %xmm5
> +#define copy2   %xmm6
> +#define copy3   %xmm7
> +#define temp    %xmm8
> +#define one     %xmm9

It would be worth mentioning in the function comment that none of the xmm
registers are callee-saved on x86_64, so the function is free to clobber them.
That was not obvious to me.  I know that on arm64, *kernel* code doesn't need to
save/restore NEON registers, so it's not something that arch/arm64/crypto/ does.
But it *is* needed in arm64 userspace code.  So I was worried that something
similar would apply to x86_64, but it seems not.

> +	/* state1[0,1,2,3] = state1[0,3,2,1] */
> +	pshufd		$0x39,state1,state1
> +	/* state2[0,1,2,3] = state2[1,0,3,2] */
> +	pshufd		$0x4e,state2,state2
> +	/* state3[0,1,2,3] = state3[2,1,0,3] */
> +	pshufd		$0x93,state3,state3

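The comments don't match the pshufd constants; the code is correct, but the
comments are not.  (Each 2-bit field of the immediate, starting from the low
bits, selects the source element for destination elements 0, 1, 2, 3 in turn,
so e.g. 0x39 = 0b00111001 selects elements 1, 2, 3, 0.)  They should be: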

	/* state1[0,1,2,3] = state1[1,2,3,0] */
	pshufd		$0x39,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	pshufd		$0x93,state3,state3

> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> +	paddd		state1,state0
> +	pxor		state0,state3
> +	movdqa		state3,temp
> +	pslld		$16,temp
> +	psrld		$16,state3
> +	por		temp,state3
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> +	paddd		state3,state2
> +	pxor		state2,state1
> +	movdqa		state1,temp
> +	pslld		$12,temp
> +	psrld		$20,state1
> +	por		temp,state1
> +
> +	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> +	paddd		state1,state0
> +	pxor		state0,state3
> +	movdqa		state3,temp
> +	pslld		$8,temp
> +	psrld		$24,state3
> +	por		temp,state3
> +
> +	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> +	paddd		state3,state2
> +	pxor		state2,state1
> +	movdqa		state1,temp
> +	pslld		$7,temp
> +	psrld		$25,state1
> +	por		temp,state1

The above sequence of 24 instructions appears twice in the function, so maybe it
should be factored out into a macro (".chacha_round"?).

> +	/* state1[0,1,2,3] = state1[2,1,0,3] */
> +	pshufd		$0x93,state1,state1
> +	/* state2[0,1,2,3] = state2[1,0,3,2] */
> +	pshufd		$0x4e,state2,state2
> +	/* state3[0,1,2,3] = state3[0,3,2,1] */
> +	pshufd		$0x39,state3,state3

Similarly, the above comments are wrong.  They should be:

	/* state1[0,1,2,3] = state1[3,0,1,2] */
	pshufd		$0x93,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	pshufd		$0x39,state3,state3

- Eric