On Thu, Oct 17, 2019 at 09:09:22PM +0200, Ard Biesheuvel wrote:
> From: "Jason A. Donenfeld" <Jason@xxxxxxxxx>
>
> These implementations from Samuel Neves support AVX and AVX-512VL.
> Originally this used AVX-512F, but Skylake thermal throttling made
> AVX-512VL more attractive and possible to do with negligable difference.
>
> Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx>
> Signed-off-by: Samuel Neves <sneves@xxxxxxxxx>
> Co-developed-by: Samuel Neves <sneves@xxxxxxxxx>
> [ardb: move to arch/x86/crypto, wire into lib/crypto framework]
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
> ---
>  arch/x86/crypto/Makefile       |   2 +
>  arch/x86/crypto/blake2s-core.S | 685 ++++++++++++++++++++
>  arch/x86/crypto/blake2s-glue.c | 235 +++++++
>  crypto/Kconfig                 |   6 +
>  4 files changed, 928 insertions(+)
>
> diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
> index 759b1a927826..922c8ecfa00f 100644
> --- a/arch/x86/crypto/Makefile
> +++ b/arch/x86/crypto/Makefile
> @@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes)
>  	obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
>  	obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
>  	obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
> +	obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
>  endif
>
>  # These modules require assembler to support AVX2.
> @@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
>  aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
>
>  nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
> +blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
>
>  ifeq ($(avx_supported),yes)
>  	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
> diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S
> new file mode 100644
> index 000000000000..675288fa4cca
> --- /dev/null
> +++ b/arch/x86/crypto/blake2s-core.S
> @@ -0,0 +1,685 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> +/*
> + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> + * Copyright (C) 2017 Samuel Neves <sneves@xxxxxxxxx>. All Rights Reserved.
> + */
> +
> +#include <linux/linkage.h>
> +
> +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
> +.align 32
> +IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
> +	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
> +.section .rodata.cst16.ROT16, "aM", @progbits, 16
> +.align 16
> +ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
> +.section .rodata.cst16.ROR328, "aM", @progbits, 16
> +.align 16
> +ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
> +#ifdef CONFIG_AS_AVX512
> +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 640
> +.align 64
> +SIGMA:
> +.long 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15
> +.long 11, 2, 12, 14, 9, 8, 15, 3, 4, 0, 13, 6, 10, 1, 7, 5
> +.long 10, 12, 11, 6, 5, 9, 13, 3, 4, 15, 14, 2, 0, 7, 8, 1
> +.long 10, 9, 7, 0, 11, 14, 1, 12, 6, 2, 15, 3, 13, 8, 5, 4
> +.long 4, 9, 8, 13, 14, 0, 10, 11, 7, 3, 12, 1, 5, 6, 15, 2
> +.long 2, 10, 4, 14, 13, 3, 9, 11, 6, 5, 7, 12, 15, 1, 8, 0
> +.long 4, 11, 14, 8, 13, 10, 12, 5, 2, 1, 15, 3, 9, 7, 0, 6
> +.long 6, 12, 0, 13, 15, 2, 1, 10, 4, 5, 11, 14, 8, 3, 9, 7
> +.long 14, 5, 4, 12, 9, 7, 3, 10, 2, 0, 6, 15, 11, 1, 13, 8
> +.long 11, 7, 13, 10, 12, 14, 0, 15, 4, 5, 6, 9, 2, 1, 8, 3
> +#endif /* CONFIG_AS_AVX512 */
> +
> +.text
> +#ifdef CONFIG_AS_AVX
> +ENTRY(blake2s_compress_avx)
> +	movl		%ecx, %ecx
> +	testq		%rdx, %rdx
> +	je		.Lendofloop
> +	.align 32
> +.Lbeginofloop:
> +	addq		%rcx, 32(%rdi)
> +	vmovdqu		IV+16(%rip), %xmm1
> +	vmovdqu		(%rsi), %xmm4
> +	vpxor		32(%rdi), %xmm1, %xmm1
> +	vmovdqu		16(%rsi), %xmm3
> +	vshufps		$136, %xmm3, %xmm4, %xmm6
> +	vmovdqa		ROT16(%rip), %xmm7
> +	vpaddd		(%rdi), %xmm6, %xmm6
> +	vpaddd		16(%rdi), %xmm6, %xmm6
> +	vpxor		%xmm6, %xmm1, %xmm1
> +	vmovdqu		IV(%rip), %xmm8
> +	vpshufb		%xmm7, %xmm1, %xmm1
> +	vmovdqu		48(%rsi), %xmm5
> +	vpaddd		%xmm1, %xmm8, %xmm8
> +	vpxor		16(%rdi), %xmm8, %xmm9
> +	vmovdqu		32(%rsi), %xmm2
> +	vpblendw	$12, %xmm3, %xmm5, %xmm13
> +	vshufps		$221, %xmm5, %xmm2, %xmm12
> +	vpunpckhqdq	%xmm2, %xmm4, %xmm14
> +	vpslld		$20, %xmm9, %xmm0
> +	vpsrld		$12, %xmm9, %xmm9
> +	vpxor		%xmm0, %xmm9, %xmm0
> +	vshufps		$221, %xmm3, %xmm4, %xmm9
> +	vpaddd		%xmm9, %xmm6, %xmm9
> +	vpaddd		%xmm0, %xmm9, %xmm9
> +	vpxor		%xmm9, %xmm1, %xmm1
> +	vmovdqa		ROR328(%rip), %xmm6
> +	vpshufb		%xmm6, %xmm1, %xmm1
> +	vpaddd		%xmm1, %xmm8, %xmm8
> +	vpxor		%xmm8, %xmm0, %xmm0
> +	vpshufd		$147, %xmm1, %xmm1
> +	vpshufd		$78, %xmm8, %xmm8
> +	vpslld		$25, %xmm0, %xmm10
> +	vpsrld		$7, %xmm0, %xmm0
> +	vpxor		%xmm10, %xmm0, %xmm0
> +	vshufps		$136, %xmm5, %xmm2, %xmm10
> +	vpshufd		$57, %xmm0, %xmm0
> +	vpaddd		%xmm10, %xmm9, %xmm9
> +	vpaddd		%xmm0, %xmm9, %xmm9
> +	vpxor		%xmm9, %xmm1, %xmm1
> +	vpaddd		%xmm12, %xmm9, %xmm9
> +	vpblendw	$12, %xmm2, %xmm3, %xmm12
> +	vpshufb		%xmm7, %xmm1, %xmm1
> +	vpaddd		%xmm1, %xmm8, %xmm8
> +	vpxor		%xmm8, %xmm0, %xmm10
> +	vpslld		$20, %xmm10, %xmm0
> +	vpsrld		$12, %xmm10, %xmm10
> +	vpxor		%xmm0, %xmm10, %xmm0
> +	vpaddd		%xmm0, %xmm9, %xmm9
> +	vpxor		%xmm9, %xmm1, %xmm1
> +	vpshufb		%xmm6, %xmm1, %xmm1
> +	vpaddd		%xmm1, %xmm8, %xmm8
[...]

There are no comments in this 685-line assembly language file.  Is this
the original version, or is it a generated/stripped version?

- Eric