On Wed, Dec 21, 2022 at 03:23:27PM +0100, Jason A. Donenfeld wrote:
> diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S
> new file mode 100644
> index 000000000000..91fbb7ac7af4
> --- /dev/null
> +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S
> @@ -0,0 +1,177 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/frame.h>
> +
> +.section .rodata.cst16.CONSTANTS, "aM", @progbits, 16
> +.align 16
> +CONSTANTS: .octa 0x6b20657479622d323320646e61707865
> +.text

For simplicity, maybe leave off the section mergeability stuff and just have
plain ".section .rodata"?

> +/*
> + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
> + * of blocks of output with a nonce of 0, taking an input key and 8-byte
> + * counter. Importantly does not spill to the stack. Its arguments are:
> + *
> + * rdi: output bytes
> + * rsi: 32-byte key input
> + * rdx: 8-byte counter input/output
> + * rcx: number of 64-byte blocks to write to output
> + */
> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> +
> +#define output %rdi
> +#define key %rsi
> +#define counter %rdx
> +#define nblocks %rcx
> +#define i %al
> +#define state0 %xmm0
> +#define state1 %xmm1
> +#define state2 %xmm2
> +#define state3 %xmm3
> +#define copy0 %xmm4
> +#define copy1 %xmm5
> +#define copy2 %xmm6
> +#define copy3 %xmm7
> +#define temp %xmm8
> +#define one %xmm9

It would be worth mentioning in the function comment that none of the xmm
registers are callee-save. That was not obvious to me.

I know that on arm64, *kernel* code doesn't need to save/restore NEON
registers, so it's not something that arch/arm64/crypto/ does. But, it *is*
needed in arm64 userspace code. So I was worried that something similar would
apply to x86_64, but it seems not.
> + /* state1[0,1,2,3] = state1[0,3,2,1] */
> + pshufd $0x39,state1,state1
> + /* state2[0,1,2,3] = state2[1,0,3,2] */
> + pshufd $0x4e,state2,state2
> + /* state3[0,1,2,3] = state3[2,1,0,3] */
> + pshufd $0x93,state3,state3

The comments don't match the pshufd constants. The code is correct but the
comments are not. They should be:

    /* state1[0,1,2,3] = state1[1,2,3,0] */
    pshufd $0x39,state1,state1
    /* state2[0,1,2,3] = state2[2,3,0,1] */
    pshufd $0x4e,state2,state2
    /* state3[0,1,2,3] = state3[3,0,1,2] */
    pshufd $0x93,state3,state3

> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> + paddd state1,state0
> + pxor state0,state3
> + movdqa state3,temp
> + pslld $16,temp
> + psrld $16,state3
> + por temp,state3
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> + paddd state3,state2
> + pxor state2,state1
> + movdqa state1,temp
> + pslld $12,temp
> + psrld $20,state1
> + por temp,state1
> +
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> + paddd state1,state0
> + pxor state0,state3
> + movdqa state3,temp
> + pslld $8,temp
> + psrld $24,state3
> + por temp,state3
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> + paddd state3,state2
> + pxor state2,state1
> + movdqa state1,temp
> + pslld $7,temp
> + psrld $25,state1
> + por temp,state1

The above sequence of 24 instructions is repeated twice, so maybe it should be
a macro (".chacha_round"?).

> + /* state1[0,1,2,3] = state1[2,1,0,3] */
> + pshufd $0x93,state1,state1
> + /* state2[0,1,2,3] = state2[1,0,3,2] */
> + pshufd $0x4e,state2,state2
> + /* state3[0,1,2,3] = state3[0,3,2,1] */
> + pshufd $0x39,state3,state3

Similarly, the above comments are wrong. They should be:

    /* state1[0,1,2,3] = state1[3,0,1,2] */
    pshufd $0x93,state1,state1
    /* state2[0,1,2,3] = state2[2,3,0,1] */
    pshufd $0x4e,state2,state2
    /* state3[0,1,2,3] = state3[1,2,3,0] */
    pshufd $0x39,state3,state3

- Eric