On Thu, Oct 17, 2019 at 09:09:22PM +0200, Ard Biesheuvel wrote:
> From: "Jason A. Donenfeld" <Jason@xxxxxxxxx>
>
> These implementations from Samuel Neves support AVX and AVX-512VL.
> Originally this used AVX-512F, but Skylake thermal throttling made
> AVX-512VL more attractive and possible to do with negligable difference.
>
> Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx>
> Signed-off-by: Samuel Neves <sneves@xxxxxxxxx>
> Co-developed-by: Samuel Neves <sneves@xxxxxxxxx>
> [ardb: move to arch/x86/crypto, wire into lib/crypto framework]
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
> ---
>  arch/x86/crypto/Makefile       |   2 +
>  arch/x86/crypto/blake2s-core.S | 685 ++++++++++++++++++++
>  arch/x86/crypto/blake2s-glue.c | 235 +++++++
>  crypto/Kconfig                 |   6 +
>  4 files changed, 928 insertions(+)
>
> diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
> index 759b1a927826..922c8ecfa00f 100644
> --- a/arch/x86/crypto/Makefile
> +++ b/arch/x86/crypto/Makefile
> @@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes)
>  	obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
>  	obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
>  	obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
> +	obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
>  endif
>
>  # These modules require assembler to support AVX2.
> @@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
>  aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
>
>  nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
> +blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
>
>  ifeq ($(avx_supported),yes)
>  	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
> diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S
> new file mode 100644
> index 000000000000..675288fa4cca
> --- /dev/null
> +++ b/arch/x86/crypto/blake2s-core.S
> @@ -0,0 +1,685 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> +/*
> + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> + * Copyright (C) 2017 Samuel Neves <sneves@xxxxxxxxx>. All Rights Reserved.
> + */
> +
> +#include <linux/linkage.h>
> +
> +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
> +.align 32
> +IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
> +	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
> +.section .rodata.cst16.ROT16, "aM", @progbits, 16
> +.align 16
> +ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
> +.section .rodata.cst16.ROR328, "aM", @progbits, 16
> +.align 16
> +ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
> +#ifdef CONFIG_AS_AVX512
> +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 640
> +.align 64
> +SIGMA:
> +.long 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15
> +.long 11, 2, 12, 14, 9, 8, 15, 3, 4, 0, 13, 6, 10, 1, 7, 5
> +.long 10, 12, 11, 6, 5, 9, 13, 3, 4, 15, 14, 2, 0, 7, 8, 1
> +.long 10, 9, 7, 0, 11, 14, 1, 12, 6, 2, 15, 3, 13, 8, 5, 4
> +.long 4, 9, 8, 13, 14, 0, 10, 11, 7, 3, 12, 1, 5, 6, 15, 2
> +.long 2, 10, 4, 14, 13, 3, 9, 11, 6, 5, 7, 12, 15, 1, 8, 0
> +.long 4, 11, 14, 8, 13, 10, 12, 5, 2, 1, 15, 3, 9, 7, 0, 6
> +.long 6, 12, 0, 13, 15, 2, 1, 10, 4, 5, 11, 14, 8, 3, 9, 7
> +.long 14, 5, 4, 12, 9, 7, 3, 10, 2, 0, 6, 15, 11, 1, 13, 8
> +.long 11, 7, 13, 10, 12, 14, 0, 15, 4, 5, 6, 9, 2, 1, 8, 3
> +#endif /* CONFIG_AS_AVX512 */
> +
> +.text
> +#ifdef CONFIG_AS_AVX
> +ENTRY(blake2s_compress_avx)
> +	movl		%ecx, %ecx
> +	testq		%rdx, %rdx
> +	je		.Lendofloop
> +	.align 32
> +.Lbeginofloop:
> +	addq		%rcx, 32(%rdi)
> +	vmovdqu		IV+16(%rip), %xmm1
> +	vmovdqu		(%rsi), %xmm4
> +	vpxor		32(%rdi), %xmm1, %xmm1
> +	vmovdqu		16(%rsi), %xmm3
> +	vshufps		$136, %xmm3, %xmm4, %xmm6
> +	vmovdqa		ROT16(%rip), %xmm7
> +	vpaddd		(%rdi), %xmm6, %xmm6
> +	vpaddd		16(%rdi), %xmm6, %xmm6
> +	vpxor		%xmm6, %xmm1, %xmm1
> +	vmovdqu		IV(%rip), %xmm8
> +	vpshufb		%xmm7, %xmm1, %xmm1
> +	vmovdqu		48(%rsi), %xmm5
> +	vpaddd		%xmm1, %xmm8, %xmm8
> +	vpxor		16(%rdi), %xmm8, %xmm9
> +	vmovdqu		32(%rsi), %xmm2
> +	vpblendw	$12, %xmm3, %xmm5, %xmm13
> +	vshufps		$221, %xmm5, %xmm2, %xmm12
> +	vpunpckhqdq	%xmm2, %xmm4, %xmm14
> +	vpslld		$20, %xmm9, %xmm0
> +	vpsrld		$12, %xmm9, %xmm9
> +	vpxor		%xmm0, %xmm9, %xmm0
> +	vshufps		$221, %xmm3, %xmm4, %xmm9
> +	vpaddd		%xmm9, %xmm6, %xmm9
> +	vpaddd		%xmm0, %xmm9, %xmm9
> +	vpxor		%xmm9, %xmm1, %xmm1
> +	vmovdqa		ROR328(%rip), %xmm6
> +	vpshufb		%xmm6, %xmm1, %xmm1
> +	vpaddd		%xmm1, %xmm8, %xmm8
> +	vpxor		%xmm8, %xmm0, %xmm0
> +	vpshufd		$147, %xmm1, %xmm1
> +	vpshufd		$78, %xmm8, %xmm8
> +	vpslld		$25, %xmm0, %xmm10
> +	vpsrld		$7, %xmm0, %xmm0
> +	vpxor		%xmm10, %xmm0, %xmm0
> +	vshufps		$136, %xmm5, %xmm2, %xmm10
> +	vpshufd		$57, %xmm0, %xmm0
> +	vpaddd		%xmm10, %xmm9, %xmm9
> +	vpaddd		%xmm0, %xmm9, %xmm9
> +	vpxor		%xmm9, %xmm1, %xmm1
> +	vpaddd		%xmm12, %xmm9, %xmm9
> +	vpblendw	$12, %xmm2, %xmm3, %xmm12
> +	vpshufb		%xmm7, %xmm1, %xmm1
> +	vpaddd		%xmm1, %xmm8, %xmm8
> +	vpxor		%xmm8, %xmm0, %xmm10
> +	vpslld		$20, %xmm10, %xmm0
> +	vpsrld		$12, %xmm10, %xmm10
> +	vpxor		%xmm0, %xmm10, %xmm0
> +	vpaddd		%xmm0, %xmm9, %xmm9
> +	vpxor		%xmm9, %xmm1, %xmm1
> +	vpshufb		%xmm6, %xmm1, %xmm1
> +	vpaddd		%xmm1, %xmm8, %xmm8
[...]

There are no comments in this 685-line assembly language file.  Is this
the original version, or is it a generated/stripped version?

- Eric