This provides AVX, AVX-2, and AVX-512F implementations for Poly1305. The AVX-512F implementation is disabled on Skylake, due to throttling. These come from Andy Polyakov's implementation, with some heavy modifications from Samuel Neves and me. Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx> Cc: Andy Lutomirski <luto@xxxxxxxxxx> Cc: Greg KH <gregkh@xxxxxxxxxxxxxxxxxxx> Cc: Samuel Neves <sneves@xxxxxxxxx> Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@xxxxxxxxx> Cc: Andy Polyakov <appro@xxxxxxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: x86@xxxxxxxxxx Cc: linux-crypto@xxxxxxxxxxxxxxx --- lib/zinc/Makefile | 4 + lib/zinc/poly1305/poly1305-x86_64-glue.h | 109 + lib/zinc/poly1305/poly1305-x86_64.S | 2792 ++++++++++++++++++++++ 3 files changed, 2905 insertions(+) create mode 100644 lib/zinc/poly1305/poly1305-x86_64-glue.h create mode 100644 lib/zinc/poly1305/poly1305-x86_64.S diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile index 041e0649e6fb..66b2f43e4592 100644 --- a/lib/zinc/Makefile +++ b/lib/zinc/Makefile @@ -25,6 +25,10 @@ endif ifeq ($(CONFIG_ZINC_POLY1305),y) zinc-y += poly1305/poly1305.o +ifeq ($(CONFIG_X86_64)$(CONFIG_UML),y) +zinc-y += poly1305/poly1305-x86_64.o +CFLAGS_poly1305.o += -include $(srctree)/$(src)/poly1305/poly1305-x86_64-glue.h +endif ifeq ($(CONFIG_ARM),y) zinc-y += poly1305/poly1305-arm.o CFLAGS_poly1305.o += -include $(srctree)/$(src)/poly1305/poly1305-arm-glue.h diff --git a/lib/zinc/poly1305/poly1305-x86_64-glue.h b/lib/zinc/poly1305/poly1305-x86_64-glue.h new file mode 100644 index 000000000000..4ae028101e7c --- /dev/null +++ b/lib/zinc/poly1305/poly1305-x86_64-glue.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved. + */ + +#include <zinc/poly1305.h> +#include <asm/cpufeature.h> +#include <asm/processor.h> +#include <asm/intel-family.h> + +asmlinkage void poly1305_init_x86_64(void *ctx, + const u8 key[POLY1305_KEY_SIZE]); +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, + const size_t len, const u32 padbit); +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_MAC_SIZE], + const u32 nonce[4]); +#ifdef CONFIG_AS_AVX +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_MAC_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +#endif +#ifdef CONFIG_AS_AVX2 +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +#endif +#ifdef CONFIG_AS_AVX512 +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, + const size_t len, const u32 padbit); +#endif + +static bool poly1305_use_avx __ro_after_init; +static bool poly1305_use_avx2 __ro_after_init; +static bool poly1305_use_avx512 __ro_after_init; + +void __init poly1305_fpu_init(void) +{ + poly1305_use_avx = + boot_cpu_has(X86_FEATURE_AVX) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); + poly1305_use_avx2 = + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); + poly1305_use_avx512 = + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL) && + /* Skylake downclocks unacceptably much when using zmm. */ + boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X; +} + +static inline bool poly1305_init_arch(void *ctx, + const u8 key[POLY1305_KEY_SIZE], + simd_context_t simd_context) +{ + poly1305_init_x86_64(ctx, key); + return true; +} + +static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp, + const size_t len, const u32 padbit, + simd_context_t simd_context) +{ +#ifdef CONFIG_AS_AVX512 + if (poly1305_use_avx512 && simd_context == HAVE_FULL_SIMD) + poly1305_blocks_avx512(ctx, inp, len, padbit); + else +#endif +#ifdef CONFIG_AS_AVX2 + if (poly1305_use_avx2 && simd_context == HAVE_FULL_SIMD) + poly1305_blocks_avx2(ctx, inp, len, padbit); + else +#endif +#ifdef CONFIG_AS_AVX + if (poly1305_use_avx && simd_context == HAVE_FULL_SIMD) + poly1305_blocks_avx(ctx, inp, len, padbit); + else +#endif + poly1305_blocks_x86_64(ctx, inp, len, padbit); + return true; +} + +static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE], + const u32 nonce[4], + simd_context_t simd_context) +{ +#ifdef CONFIG_AS_AVX512 + if (poly1305_use_avx512 && simd_context == HAVE_FULL_SIMD) + poly1305_emit_avx(ctx, mac, nonce); + else +#endif +#ifdef CONFIG_AS_AVX2 + if (poly1305_use_avx2 && simd_context == HAVE_FULL_SIMD) + poly1305_emit_avx(ctx, mac, nonce); + else +#endif +#ifdef CONFIG_AS_AVX + if (poly1305_use_avx && simd_context == HAVE_FULL_SIMD) + poly1305_emit_avx(ctx, mac, nonce); + else +#endif + poly1305_emit_x86_64(ctx, mac, nonce); + return true; +} + +#define HAVE_POLY1305_ARCH_IMPLEMENTATION diff --git a/lib/zinc/poly1305/poly1305-x86_64.S b/lib/zinc/poly1305/poly1305-x86_64.S new file mode 100644 index 000000000000..26c852e3c769 --- /dev/null +++ b/lib/zinc/poly1305/poly1305-x86_64.S @@ -0,0 +1,2792 @@ +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 + * + * Copyright (C) 2017 Samuel Neves <sneves@xxxxxxxxx>. All Rights Reserved. + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved. + * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@xxxxxxxxxxx>. All Rights Reserved. + * + * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS. + */ + +#include <linux/linkage.h> + +.section .rodata.cst192.Lconst, "aM", @progbits, 192 +.align 64 +.Lconst: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.long 16777216,0,16777216,0,16777216,0,16777216,0 +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.long 2,2,2,3,2,0,2,1 +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.text + +.align 32 +ENTRY(poly1305_init_x86_64) + xorq %rax,%rax + movq %rax,0(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + + cmpq $0,%rsi + je .Lno_key + + movq $0x0ffffffc0fffffff,%rax + movq $0x0ffffffc0ffffffc,%rcx + andq 0(%rsi),%rax + andq 8(%rsi),%rcx + movq %rax,24(%rdi) + movq %rcx,32(%rdi) + movl $1,%eax +.Lno_key: + ret +ENDPROC(poly1305_init_x86_64) + +.align 32 +ENTRY(poly1305_blocks_x86_64) +.Lblocks: + shrq $4,%rdx + jz .Lno_data + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lblocks_body: + + movq %rdx,%r15 + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + movq 0(%rdi),%r14 + movq 8(%rdi),%rbx + movq 16(%rdi),%r10 + + movq %r13,%r12 + shrq $2,%r13 + movq %r12,%rax + addq %r12,%r13 + jmp .Loop + +.align 32 +.Loop: + + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + mulq %r14 + movq %rax,%r9 + movq %r11,%rax + movq %rdx,%rdi + + mulq %r14 + movq %rax,%r14 + movq %r11,%rax + movq %rdx,%r8 + + mulq %rbx + addq %rax,%r9 + movq %r13,%rax + adcq %rdx,%rdi + + mulq %rbx + movq %r10,%rbx + addq %rax,%r14 + adcq %rdx,%r8 + + imulq %r13,%rbx + addq %rbx,%r9 + movq %r8,%rbx + adcq $0,%rdi + + imulq %r11,%r10 + addq %r9,%rbx + movq $-4,%rax + adcq %r10,%rdi + + andq %rdi,%rax + movq %rdi,%r10 + shrq $2,%rdi + andq $3,%r10 + addq %rdi,%rax + addq %rax,%r14 + adcq $0,%rbx + adcq $0,%r10 + + movq %r12,%rax + decq %r15 + jnz .Loop + + movq 0(%rsp),%rdi + + movq %r14,0(%rdi) + movq %rbx,8(%rdi) + movq %r10,16(%rdi) + + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rsp +.Lno_data: +.Lblocks_epilogue: + ret +ENDPROC(poly1305_blocks_x86_64) + +.align 32 +ENTRY(poly1305_emit_x86_64) +.Lemit: + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + + movq %r8,%rax + addq $5,%r8 + movq %r9,%rcx + adcq $0,%r9 + adcq $0,%r10 + shrq $2,%r10 + cmovnzq %r8,%rax + cmovnzq %r9,%rcx + + addq 0(%rdx),%rax + adcq 8(%rdx),%rcx + movq %rax,0(%rsi) + movq %rcx,8(%rsi) + + ret +ENDPROC(poly1305_emit_x86_64) + +.macro __poly1305_block + mulq %r14 + movq %rax,%r9 + movq %r11,%rax + movq %rdx,%rdi + + mulq %r14 + movq %rax,%r14 + movq %r11,%rax + movq %rdx,%r8 + + mulq %rbx + addq %rax,%r9 + movq %r13,%rax + adcq %rdx,%rdi + + mulq %rbx + movq %r10,%rbx + addq %rax,%r14 + adcq %rdx,%r8 + + imulq %r13,%rbx + addq %rbx,%r9 + movq %r8,%rbx + adcq $0,%rdi + + imulq %r11,%r10 + addq %r9,%rbx + movq $-4,%rax + adcq %r10,%rdi + + andq %rdi,%rax + movq %rdi,%r10 + shrq $2,%rdi + andq $3,%r10 + addq %rdi,%rax + addq %rax,%r14 + adcq $0,%rbx + adcq $0,%r10 +.endm + +.macro __poly1305_init_avx + movq %r11,%r14 + movq %r12,%rbx + xorq %r10,%r10 + + leaq 48+64(%rdi),%rdi + + movq %r12,%rax + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + + movl $0x3ffffff,%eax + movl $0x3ffffff,%edx + movq %r14,%r8 + andl %r14d,%eax + movq %r11,%r9 + andl %r11d,%edx + movl %eax,-64(%rdi) + shrq $26,%r8 + movl %edx,-60(%rdi) + shrq $26,%r9 + + movl $0x3ffffff,%eax + movl $0x3ffffff,%edx + andl %r8d,%eax + andl %r9d,%edx + movl %eax,-48(%rdi) + leal (%rax,%rax,4),%eax + movl %edx,-44(%rdi) + leal (%rdx,%rdx,4),%edx + movl %eax,-32(%rdi) + shrq $26,%r8 + movl %edx,-28(%rdi) + shrq $26,%r9 + + movq %rbx,%rax + movq %r12,%rdx + shlq $12,%rax + shlq $12,%rdx + orq %r8,%rax + orq %r9,%rdx + andl $0x3ffffff,%eax + andl $0x3ffffff,%edx + movl %eax,-16(%rdi) + leal (%rax,%rax,4),%eax + movl %edx,-12(%rdi) + leal (%rdx,%rdx,4),%edx + movl %eax,0(%rdi) + movq %rbx,%r8 + movl %edx,4(%rdi) + movq %r12,%r9 + + movl $0x3ffffff,%eax + movl $0x3ffffff,%edx + shrq $14,%r8 + shrq $14,%r9 + andl %r8d,%eax + andl %r9d,%edx + movl %eax,16(%rdi) + leal (%rax,%rax,4),%eax + movl %edx,20(%rdi) + leal (%rdx,%rdx,4),%edx + movl %eax,32(%rdi) + shrq $26,%r8 + movl %edx,36(%rdi) + shrq $26,%r9 + + movq %r10,%rax + shlq $24,%rax + orq %rax,%r8 + movl %r8d,48(%rdi) + leaq (%r8,%r8,4),%r8 + movl %r9d,52(%rdi) + leaq (%r9,%r9,4),%r9 + movl %r8d,64(%rdi) + movl %r9d,68(%rdi) + + movq %r12,%rax + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + + movl $0x3ffffff,%eax + movq %r14,%r8 + andl %r14d,%eax + shrq $26,%r8 + movl %eax,-52(%rdi) + + movl $0x3ffffff,%edx + andl %r8d,%edx + movl %edx,-36(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,-20(%rdi) + + movq %rbx,%rax + shlq $12,%rax + orq %r8,%rax + andl $0x3ffffff,%eax + movl %eax,-4(%rdi) + leal (%rax,%rax,4),%eax + movq %rbx,%r8 + movl %eax,12(%rdi) + + movl $0x3ffffff,%edx + shrq $14,%r8 + andl %r8d,%edx + movl %edx,28(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,44(%rdi) + + movq %r10,%rax + shlq $24,%rax + orq %rax,%r8 + movl %r8d,60(%rdi) + leaq (%r8,%r8,4),%r8 + movl %r8d,76(%rdi) + + movq %r12,%rax + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + + movl $0x3ffffff,%eax + movq %r14,%r8 + andl %r14d,%eax + shrq $26,%r8 + movl %eax,-56(%rdi) + + movl $0x3ffffff,%edx + andl %r8d,%edx + movl %edx,-40(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,-24(%rdi) + + movq %rbx,%rax + shlq $12,%rax + orq %r8,%rax + andl $0x3ffffff,%eax + movl %eax,-8(%rdi) + leal (%rax,%rax,4),%eax + movq %rbx,%r8 + movl %eax,8(%rdi) + + movl $0x3ffffff,%edx + shrq $14,%r8 + andl %r8d,%edx + movl %edx,24(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,40(%rdi) + + movq %r10,%rax + shlq $24,%rax + orq %rax,%r8 + movl %r8d,56(%rdi) + leaq (%r8,%r8,4),%r8 + movl %r8d,72(%rdi) + + leaq -48-64(%rdi),%rdi +.endm + +#ifdef CONFIG_AS_AVX +.align 32 +ENTRY(poly1305_blocks_avx) + + movl 20(%rdi),%r8d + cmpq $128,%rdx + jae .Lblocks_avx + testl %r8d,%r8d + jz .Lblocks + +.Lblocks_avx: + andq $-16,%rdx + jz .Lno_data_avx + + vzeroupper + + testl %r8d,%r8d + jz .Lbase2_64_avx + + testq $31,%rdx + jz .Leven_avx + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lblocks_avx_body: + + movq %rdx,%r15 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movl 16(%rdi),%r10d + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + + movl %r8d,%r14d + andq $-2147483648,%r8 + movq %r9,%r12 + movl %r9d,%ebx + andq $-2147483648,%r9 + + shrq $6,%r8 + shlq $52,%r12 + addq %r8,%r14 + shrq $12,%rbx + shrq $18,%r9 + addq %r12,%r14 + adcq %r9,%rbx + + movq %r10,%r8 + shlq $40,%r8 + shrq $24,%r10 + addq %r8,%rbx + adcq $0,%r10 + + movq $-4,%r9 + movq %r10,%r8 + andq %r10,%r9 + shrq $2,%r8 + andq $3,%r10 + addq %r9,%r8 + addq %r8,%r14 + adcq $0,%rbx + adcq $0,%r10 + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + + testq %rcx,%rcx + jz .Lstore_base2_64_avx + + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r11 + movq %rbx,%r12 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r11 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r11,%r14 + shlq $24,%r10 + andq $0x3ffffff,%r14 + shrq $40,%r12 + andq $0x3ffffff,%rbx + orq %r12,%r10 + + subq $16,%r15 + jz .Lstore_base2_26_avx + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %r10d,%xmm4 + jmp .Lproceed_avx + +.align 32 +.Lstore_base2_64_avx: + movq %r14,0(%rdi) + movq %rbx,8(%rdi) + movq %r10,16(%rdi) + jmp .Ldone_avx + +.align 16 +.Lstore_base2_26_avx: + movl %eax,0(%rdi) + movl %edx,4(%rdi) + movl %r14d,8(%rdi) + movl %ebx,12(%rdi) + movl %r10d,16(%rdi) +.align 16 +.Ldone_avx: + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rsp + +.Lno_data_avx: +.Lblocks_avx_epilogue: + ret + +.align 32 +.Lbase2_64_avx: + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lbase2_64_avx_body: + + movq %rdx,%r15 + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + movq 0(%rdi),%r14 + movq 8(%rdi),%rbx + movl 16(%rdi),%r10d + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + testq $31,%rdx + jz .Linit_avx + + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + subq $16,%r15 + + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + +.Linit_avx: + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r8 + movq %rbx,%r9 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r8 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r8,%r14 + shlq $24,%r10 + andq $0x3ffffff,%r14 + shrq $40,%r9 + andq $0x3ffffff,%rbx + orq %r9,%r10 + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %r10d,%xmm4 + movl $1,20(%rdi) + + __poly1305_init_avx + +.Lproceed_avx: + movq %r15,%rdx + + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rax + leaq 48(%rsp),%rsp + +.Lbase2_64_avx_epilogue: + jmp .Ldo_avx + + +.align 32 +.Leven_avx: + vmovd 0(%rdi),%xmm0 + vmovd 4(%rdi),%xmm1 + vmovd 8(%rdi),%xmm2 + vmovd 12(%rdi),%xmm3 + vmovd 16(%rdi),%xmm4 + +.Ldo_avx: + leaq 8(%rsp),%r10 + andq $-32,%rsp + subq $8,%rsp + leaq -88(%rsp),%r11 + subq $0x178,%rsp + subq $64,%rdx + leaq -32(%rsi),%rax + cmovcq %rax,%rsi + + vmovdqu 48(%rdi),%xmm14 + leaq 112(%rdi),%rdi + leaq .Lconst(%rip),%rcx + + vmovdqu 32(%rsi),%xmm5 + vmovdqu 48(%rsi),%xmm6 + vmovdqa 64(%rcx),%xmm15 + + vpsrldq $6,%xmm5,%xmm7 + vpsrldq $6,%xmm6,%xmm8 + vpunpckhqdq %xmm6,%xmm5,%xmm9 + vpunpcklqdq %xmm6,%xmm5,%xmm5 + vpunpcklqdq %xmm8,%xmm7,%xmm8 + + vpsrlq $40,%xmm9,%xmm9 + vpsrlq $26,%xmm5,%xmm6 + vpand %xmm15,%xmm5,%xmm5 + vpsrlq $4,%xmm8,%xmm7 + vpand %xmm15,%xmm6,%xmm6 + vpsrlq $30,%xmm8,%xmm8 + vpand %xmm15,%xmm7,%xmm7 + vpand %xmm15,%xmm8,%xmm8 + vpor 32(%rcx),%xmm9,%xmm9 + + jbe .Lskip_loop_avx + + + vmovdqu -48(%rdi),%xmm11 + vmovdqu -32(%rdi),%xmm12 + vpshufd $0xEE,%xmm14,%xmm13 + vpshufd $0x44,%xmm14,%xmm10 + vmovdqa %xmm13,-144(%r11) + vmovdqa %xmm10,0(%rsp) + vpshufd $0xEE,%xmm11,%xmm14 + vmovdqu -16(%rdi),%xmm10 + vpshufd $0x44,%xmm11,%xmm11 + vmovdqa %xmm14,-128(%r11) + vmovdqa %xmm11,16(%rsp) + vpshufd $0xEE,%xmm12,%xmm13 + vmovdqu 0(%rdi),%xmm11 + vpshufd $0x44,%xmm12,%xmm12 + vmovdqa %xmm13,-112(%r11) + vmovdqa %xmm12,32(%rsp) + vpshufd $0xEE,%xmm10,%xmm14 + vmovdqu 16(%rdi),%xmm12 + vpshufd $0x44,%xmm10,%xmm10 + vmovdqa %xmm14,-96(%r11) + vmovdqa %xmm10,48(%rsp) + vpshufd $0xEE,%xmm11,%xmm13 + vmovdqu 32(%rdi),%xmm10 + vpshufd $0x44,%xmm11,%xmm11 + vmovdqa %xmm13,-80(%r11) + vmovdqa %xmm11,64(%rsp) + vpshufd $0xEE,%xmm12,%xmm14 + vmovdqu 48(%rdi),%xmm11 + vpshufd $0x44,%xmm12,%xmm12 + vmovdqa %xmm14,-64(%r11) + vmovdqa %xmm12,80(%rsp) + vpshufd $0xEE,%xmm10,%xmm13 + vmovdqu 64(%rdi),%xmm12 + vpshufd $0x44,%xmm10,%xmm10 + vmovdqa %xmm13,-48(%r11) + vmovdqa %xmm10,96(%rsp) + vpshufd $0xEE,%xmm11,%xmm14 + vpshufd $0x44,%xmm11,%xmm11 + vmovdqa %xmm14,-32(%r11) + vmovdqa %xmm11,112(%rsp) + vpshufd $0xEE,%xmm12,%xmm13 + vmovdqa 0(%rsp),%xmm14 + vpshufd $0x44,%xmm12,%xmm12 + vmovdqa %xmm13,-16(%r11) + vmovdqa %xmm12,128(%rsp) + + jmp .Loop_avx + +.align 32 +.Loop_avx: + + vpmuludq %xmm5,%xmm14,%xmm10 + vpmuludq %xmm6,%xmm14,%xmm11 + vmovdqa %xmm2,32(%r11) + vpmuludq %xmm7,%xmm14,%xmm12 + vmovdqa 16(%rsp),%xmm2 + vpmuludq %xmm8,%xmm14,%xmm13 + vpmuludq %xmm9,%xmm14,%xmm14 + + vmovdqa %xmm0,0(%r11) + vpmuludq 32(%rsp),%xmm9,%xmm0 + vmovdqa %xmm1,16(%r11) + vpmuludq %xmm8,%xmm2,%xmm1 + vpaddq %xmm0,%xmm10,%xmm10 + vpaddq %xmm1,%xmm14,%xmm14 + vmovdqa %xmm3,48(%r11) + vpmuludq %xmm7,%xmm2,%xmm0 + vpmuludq %xmm6,%xmm2,%xmm1 + vpaddq %xmm0,%xmm13,%xmm13 + vmovdqa 48(%rsp),%xmm3 + vpaddq %xmm1,%xmm12,%xmm12 + vmovdqa %xmm4,64(%r11) + vpmuludq %xmm5,%xmm2,%xmm2 + vpmuludq %xmm7,%xmm3,%xmm0 + vpaddq %xmm2,%xmm11,%xmm11 + + vmovdqa 64(%rsp),%xmm4 + vpaddq %xmm0,%xmm14,%xmm14 + vpmuludq %xmm6,%xmm3,%xmm1 + vpmuludq %xmm5,%xmm3,%xmm3 + vpaddq %xmm1,%xmm13,%xmm13 + vmovdqa 80(%rsp),%xmm2 + vpaddq %xmm3,%xmm12,%xmm12 + vpmuludq %xmm9,%xmm4,%xmm0 + vpmuludq %xmm8,%xmm4,%xmm4 + vpaddq %xmm0,%xmm11,%xmm11 + vmovdqa 96(%rsp),%xmm3 + vpaddq %xmm4,%xmm10,%xmm10 + + vmovdqa 128(%rsp),%xmm4 + vpmuludq %xmm6,%xmm2,%xmm1 + vpmuludq %xmm5,%xmm2,%xmm2 + vpaddq %xmm1,%xmm14,%xmm14 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq %xmm9,%xmm3,%xmm0 + vpmuludq %xmm8,%xmm3,%xmm1 + vpaddq %xmm0,%xmm12,%xmm12 + vmovdqu 0(%rsi),%xmm0 + vpaddq %xmm1,%xmm11,%xmm11 + vpmuludq %xmm7,%xmm3,%xmm3 + vpmuludq %xmm7,%xmm4,%xmm7 + vpaddq %xmm3,%xmm10,%xmm10 + + vmovdqu 16(%rsi),%xmm1 + vpaddq %xmm7,%xmm11,%xmm11 + vpmuludq %xmm8,%xmm4,%xmm8 + vpmuludq %xmm9,%xmm4,%xmm9 + vpsrldq $6,%xmm0,%xmm2 + vpaddq %xmm8,%xmm12,%xmm12 + vpaddq %xmm9,%xmm13,%xmm13 + vpsrldq $6,%xmm1,%xmm3 + vpmuludq 112(%rsp),%xmm5,%xmm9 + vpmuludq %xmm6,%xmm4,%xmm5 + vpunpckhqdq %xmm1,%xmm0,%xmm4 + vpaddq %xmm9,%xmm14,%xmm14 + vmovdqa -144(%r11),%xmm9 + vpaddq %xmm5,%xmm10,%xmm10 + + vpunpcklqdq %xmm1,%xmm0,%xmm0 + vpunpcklqdq %xmm3,%xmm2,%xmm3 + + + vpsrldq $5,%xmm4,%xmm4 + vpsrlq $26,%xmm0,%xmm1 + vpand %xmm15,%xmm0,%xmm0 + vpsrlq $4,%xmm3,%xmm2 + vpand %xmm15,%xmm1,%xmm1 + vpand 0(%rcx),%xmm4,%xmm4 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm15,%xmm2,%xmm2 + vpand %xmm15,%xmm3,%xmm3 + vpor 32(%rcx),%xmm4,%xmm4 + + vpaddq 0(%r11),%xmm0,%xmm0 + vpaddq 16(%r11),%xmm1,%xmm1 + vpaddq 32(%r11),%xmm2,%xmm2 + vpaddq 48(%r11),%xmm3,%xmm3 + vpaddq 64(%r11),%xmm4,%xmm4 + + leaq 32(%rsi),%rax + leaq 64(%rsi),%rsi + subq $64,%rdx + cmovcq %rax,%rsi + + vpmuludq %xmm0,%xmm9,%xmm5 + vpmuludq %xmm1,%xmm9,%xmm6 + vpaddq %xmm5,%xmm10,%xmm10 + vpaddq %xmm6,%xmm11,%xmm11 + vmovdqa -128(%r11),%xmm7 + vpmuludq %xmm2,%xmm9,%xmm5 + vpmuludq %xmm3,%xmm9,%xmm6 + vpaddq %xmm5,%xmm12,%xmm12 + vpaddq %xmm6,%xmm13,%xmm13 + vpmuludq %xmm4,%xmm9,%xmm9 + vpmuludq -112(%r11),%xmm4,%xmm5 + vpaddq %xmm9,%xmm14,%xmm14 + + vpaddq %xmm5,%xmm10,%xmm10 + vpmuludq %xmm2,%xmm7,%xmm6 + vpmuludq %xmm3,%xmm7,%xmm5 + vpaddq %xmm6,%xmm13,%xmm13 + vmovdqa -96(%r11),%xmm8 + vpaddq %xmm5,%xmm14,%xmm14 + vpmuludq %xmm1,%xmm7,%xmm6 + vpmuludq %xmm0,%xmm7,%xmm7 + vpaddq %xmm6,%xmm12,%xmm12 + vpaddq %xmm7,%xmm11,%xmm11 + + vmovdqa -80(%r11),%xmm9 + vpmuludq %xmm2,%xmm8,%xmm5 + vpmuludq %xmm1,%xmm8,%xmm6 + vpaddq %xmm5,%xmm14,%xmm14 + vpaddq %xmm6,%xmm13,%xmm13 + vmovdqa -64(%r11),%xmm7 + vpmuludq %xmm0,%xmm8,%xmm8 + vpmuludq %xmm4,%xmm9,%xmm5 + vpaddq %xmm8,%xmm12,%xmm12 + vpaddq %xmm5,%xmm11,%xmm11 + vmovdqa -48(%r11),%xmm8 + vpmuludq %xmm3,%xmm9,%xmm9 + vpmuludq %xmm1,%xmm7,%xmm6 + vpaddq %xmm9,%xmm10,%xmm10 + + vmovdqa -16(%r11),%xmm9 + vpaddq %xmm6,%xmm14,%xmm14 + vpmuludq %xmm0,%xmm7,%xmm7 + vpmuludq %xmm4,%xmm8,%xmm5 + vpaddq %xmm7,%xmm13,%xmm13 + vpaddq %xmm5,%xmm12,%xmm12 + vmovdqu 32(%rsi),%xmm5 + vpmuludq %xmm3,%xmm8,%xmm7 + vpmuludq %xmm2,%xmm8,%xmm8 + vpaddq %xmm7,%xmm11,%xmm11 + vmovdqu 48(%rsi),%xmm6 + vpaddq %xmm8,%xmm10,%xmm10 + + vpmuludq %xmm2,%xmm9,%xmm2 + vpmuludq %xmm3,%xmm9,%xmm3 + vpsrldq $6,%xmm5,%xmm7 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq %xmm4,%xmm9,%xmm4 + vpsrldq $6,%xmm6,%xmm8 + vpaddq %xmm3,%xmm12,%xmm2 + vpaddq %xmm4,%xmm13,%xmm3 + vpmuludq -32(%r11),%xmm0,%xmm4 + vpmuludq %xmm1,%xmm9,%xmm0 + vpunpckhqdq %xmm6,%xmm5,%xmm9 + vpaddq %xmm4,%xmm14,%xmm4 + vpaddq %xmm0,%xmm10,%xmm0 + + vpunpcklqdq %xmm6,%xmm5,%xmm5 + vpunpcklqdq %xmm8,%xmm7,%xmm8 + + + vpsrldq $5,%xmm9,%xmm9 + vpsrlq $26,%xmm5,%xmm6 + vmovdqa 0(%rsp),%xmm14 + vpand %xmm15,%xmm5,%xmm5 + vpsrlq $4,%xmm8,%xmm7 + vpand %xmm15,%xmm6,%xmm6 + vpand 0(%rcx),%xmm9,%xmm9 + vpsrlq $30,%xmm8,%xmm8 + vpand %xmm15,%xmm7,%xmm7 + vpand %xmm15,%xmm8,%xmm8 + vpor 32(%rcx),%xmm9,%xmm9 + + vpsrlq $26,%xmm3,%xmm13 + vpand %xmm15,%xmm3,%xmm3 + vpaddq %xmm13,%xmm4,%xmm4 + + vpsrlq $26,%xmm0,%xmm10 + vpand %xmm15,%xmm0,%xmm0 + vpaddq %xmm10,%xmm11,%xmm1 + + vpsrlq $26,%xmm4,%xmm10 + vpand %xmm15,%xmm4,%xmm4 + + vpsrlq $26,%xmm1,%xmm11 + vpand %xmm15,%xmm1,%xmm1 + vpaddq %xmm11,%xmm2,%xmm2 + + vpaddq %xmm10,%xmm0,%xmm0 + vpsllq $2,%xmm10,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + + vpsrlq $26,%xmm2,%xmm12 + vpand %xmm15,%xmm2,%xmm2 + vpaddq %xmm12,%xmm3,%xmm3 + + vpsrlq $26,%xmm0,%xmm10 + vpand %xmm15,%xmm0,%xmm0 + vpaddq %xmm10,%xmm1,%xmm1 + + vpsrlq $26,%xmm3,%xmm13 + vpand %xmm15,%xmm3,%xmm3 + vpaddq %xmm13,%xmm4,%xmm4 + + ja .Loop_avx + +.Lskip_loop_avx: + vpshufd $0x10,%xmm14,%xmm14 + addq $32,%rdx + jnz .Long_tail_avx + + vpaddq %xmm2,%xmm7,%xmm7 + vpaddq %xmm0,%xmm5,%xmm5 + vpaddq %xmm1,%xmm6,%xmm6 + vpaddq %xmm3,%xmm8,%xmm8 + vpaddq %xmm4,%xmm9,%xmm9 + +.Long_tail_avx: + vmovdqa %xmm2,32(%r11) + vmovdqa %xmm0,0(%r11) + vmovdqa %xmm1,16(%r11) + vmovdqa %xmm3,48(%r11) + vmovdqa %xmm4,64(%r11) + + vpmuludq %xmm7,%xmm14,%xmm12 + vpmuludq %xmm5,%xmm14,%xmm10 + vpshufd $0x10,-48(%rdi),%xmm2 + vpmuludq %xmm6,%xmm14,%xmm11 + vpmuludq %xmm8,%xmm14,%xmm13 + vpmuludq %xmm9,%xmm14,%xmm14 + + vpmuludq %xmm8,%xmm2,%xmm0 + vpaddq %xmm0,%xmm14,%xmm14 + vpshufd $0x10,-32(%rdi),%xmm3 + vpmuludq %xmm7,%xmm2,%xmm1 + vpaddq %xmm1,%xmm13,%xmm13 + vpshufd $0x10,-16(%rdi),%xmm4 + vpmuludq %xmm6,%xmm2,%xmm0 + vpaddq %xmm0,%xmm12,%xmm12 + vpmuludq %xmm5,%xmm2,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq %xmm9,%xmm3,%xmm3 + vpaddq %xmm3,%xmm10,%xmm10 + + vpshufd $0x10,0(%rdi),%xmm2 + vpmuludq %xmm7,%xmm4,%xmm1 + vpaddq %xmm1,%xmm14,%xmm14 + vpmuludq %xmm6,%xmm4,%xmm0 + vpaddq %xmm0,%xmm13,%xmm13 + vpshufd $0x10,16(%rdi),%xmm3 + vpmuludq %xmm5,%xmm4,%xmm4 + vpaddq %xmm4,%xmm12,%xmm12 + vpmuludq %xmm9,%xmm2,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vpshufd $0x10,32(%rdi),%xmm4 + vpmuludq %xmm8,%xmm2,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + + vpmuludq %xmm6,%xmm3,%xmm0 + vpaddq %xmm0,%xmm14,%xmm14 + vpmuludq %xmm5,%xmm3,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpshufd $0x10,48(%rdi),%xmm2 + vpmuludq %xmm9,%xmm4,%xmm1 + vpaddq %xmm1,%xmm12,%xmm12 + vpshufd $0x10,64(%rdi),%xmm3 + vpmuludq %xmm8,%xmm4,%xmm0 + vpaddq %xmm0,%xmm11,%xmm11 + vpmuludq %xmm7,%xmm4,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + + vpmuludq %xmm5,%xmm2,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpmuludq %xmm9,%xmm3,%xmm1 + vpaddq %xmm1,%xmm13,%xmm13 + vpmuludq %xmm8,%xmm3,%xmm0 + vpaddq %xmm0,%xmm12,%xmm12 + vpmuludq %xmm7,%xmm3,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vpmuludq %xmm6,%xmm3,%xmm3 + vpaddq %xmm3,%xmm10,%xmm10 + + jz .Lshort_tail_avx + + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + + vpsrldq $6,%xmm0,%xmm2 + vpsrldq $6,%xmm1,%xmm3 + vpunpckhqdq %xmm1,%xmm0,%xmm4 + vpunpcklqdq %xmm1,%xmm0,%xmm0 + vpunpcklqdq %xmm3,%xmm2,%xmm3 + + vpsrlq $40,%xmm4,%xmm4 + vpsrlq $26,%xmm0,%xmm1 + vpand %xmm15,%xmm0,%xmm0 + vpsrlq $4,%xmm3,%xmm2 + vpand %xmm15,%xmm1,%xmm1 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm15,%xmm2,%xmm2 + vpand %xmm15,%xmm3,%xmm3 + vpor 32(%rcx),%xmm4,%xmm4 + + vpshufd $0x32,-64(%rdi),%xmm9 + vpaddq 0(%r11),%xmm0,%xmm0 + vpaddq 16(%r11),%xmm1,%xmm1 + vpaddq 32(%r11),%xmm2,%xmm2 + vpaddq 48(%r11),%xmm3,%xmm3 + vpaddq 64(%r11),%xmm4,%xmm4 + + vpmuludq %xmm0,%xmm9,%xmm5 + vpaddq %xmm5,%xmm10,%xmm10 + vpmuludq %xmm1,%xmm9,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpmuludq %xmm2,%xmm9,%xmm5 + vpaddq %xmm5,%xmm12,%xmm12 + vpshufd $0x32,-48(%rdi),%xmm7 + vpmuludq %xmm3,%xmm9,%xmm6 + vpaddq %xmm6,%xmm13,%xmm13 + vpmuludq %xmm4,%xmm9,%xmm9 + vpaddq %xmm9,%xmm14,%xmm14 + + vpmuludq %xmm3,%xmm7,%xmm5 + vpaddq %xmm5,%xmm14,%xmm14 + vpshufd $0x32,-32(%rdi),%xmm8 + vpmuludq %xmm2,%xmm7,%xmm6 + vpaddq %xmm6,%xmm13,%xmm13 + vpshufd $0x32,-16(%rdi),%xmm9 + vpmuludq %xmm1,%xmm7,%xmm5 + vpaddq %xmm5,%xmm12,%xmm12 + vpmuludq %xmm0,%xmm7,%xmm7 + vpaddq %xmm7,%xmm11,%xmm11 + vpmuludq %xmm4,%xmm8,%xmm8 + vpaddq %xmm8,%xmm10,%xmm10 + + vpshufd $0x32,0(%rdi),%xmm7 + vpmuludq %xmm2,%xmm9,%xmm6 + vpaddq %xmm6,%xmm14,%xmm14 + vpmuludq %xmm1,%xmm9,%xmm5 + vpaddq %xmm5,%xmm13,%xmm13 + vpshufd $0x32,16(%rdi),%xmm8 + vpmuludq %xmm0,%xmm9,%xmm9 + vpaddq %xmm9,%xmm12,%xmm12 + vpmuludq %xmm4,%xmm7,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpshufd $0x32,32(%rdi),%xmm9 + vpmuludq %xmm3,%xmm7,%xmm7 + vpaddq %xmm7,%xmm10,%xmm10 + + vpmuludq %xmm1,%xmm8,%xmm5 + vpaddq %xmm5,%xmm14,%xmm14 + vpmuludq %xmm0,%xmm8,%xmm8 + vpaddq %xmm8,%xmm13,%xmm13 + vpshufd $0x32,48(%rdi),%xmm7 + vpmuludq %xmm4,%xmm9,%xmm6 + vpaddq %xmm6,%xmm12,%xmm12 + vpshufd $0x32,64(%rdi),%xmm8 + vpmuludq %xmm3,%xmm9,%xmm5 + vpaddq %xmm5,%xmm11,%xmm11 + vpmuludq %xmm2,%xmm9,%xmm9 + vpaddq %xmm9,%xmm10,%xmm10 + + vpmuludq %xmm0,%xmm7,%xmm7 + vpaddq %xmm7,%xmm14,%xmm14 + vpmuludq %xmm4,%xmm8,%xmm6 + vpaddq %xmm6,%xmm13,%xmm13 + vpmuludq %xmm3,%xmm8,%xmm5 + vpaddq %xmm5,%xmm12,%xmm12 + vpmuludq %xmm2,%xmm8,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpmuludq %xmm1,%xmm8,%xmm8 + vpaddq %xmm8,%xmm10,%xmm10 + +.Lshort_tail_avx: + + vpsrldq $8,%xmm14,%xmm9 + vpsrldq $8,%xmm13,%xmm8 + vpsrldq $8,%xmm11,%xmm6 + vpsrldq $8,%xmm10,%xmm5 + vpsrldq $8,%xmm12,%xmm7 + vpaddq %xmm8,%xmm13,%xmm13 + vpaddq %xmm9,%xmm14,%xmm14 + vpaddq %xmm5,%xmm10,%xmm10 + vpaddq %xmm6,%xmm11,%xmm11 + vpaddq %xmm7,%xmm12,%xmm12 + + vpsrlq $26,%xmm13,%xmm3 + vpand %xmm15,%xmm13,%xmm13 + vpaddq %xmm3,%xmm14,%xmm14 + + vpsrlq $26,%xmm10,%xmm0 + vpand %xmm15,%xmm10,%xmm10 + vpaddq %xmm0,%xmm11,%xmm11 + + vpsrlq $26,%xmm14,%xmm4 + vpand %xmm15,%xmm14,%xmm14 + + vpsrlq $26,%xmm11,%xmm1 + vpand %xmm15,%xmm11,%xmm11 + vpaddq %xmm1,%xmm12,%xmm12 + + vpaddq %xmm4,%xmm10,%xmm10 + vpsllq $2,%xmm4,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + + vpsrlq $26,%xmm12,%xmm2 + vpand %xmm15,%xmm12,%xmm12 + vpaddq %xmm2,%xmm13,%xmm13 + + vpsrlq $26,%xmm10,%xmm0 + vpand %xmm15,%xmm10,%xmm10 + vpaddq %xmm0,%xmm11,%xmm11 + + vpsrlq $26,%xmm13,%xmm3 + vpand %xmm15,%xmm13,%xmm13 + vpaddq %xmm3,%xmm14,%xmm14 + + vmovd %xmm10,-112(%rdi) + vmovd %xmm11,-108(%rdi) + vmovd %xmm12,-104(%rdi) + vmovd %xmm13,-100(%rdi) + vmovd %xmm14,-96(%rdi) + leaq -8(%r10),%rsp + + vzeroupper + ret +ENDPROC(poly1305_blocks_avx) + +.align 32 +ENTRY(poly1305_emit_avx) + cmpl $0,20(%rdi) + je .Lemit + + movl 0(%rdi),%eax + movl 4(%rdi),%ecx + movl 8(%rdi),%r8d + movl 12(%rdi),%r11d + movl 16(%rdi),%r10d + + shlq $26,%rcx + movq %r8,%r9 + shlq $52,%r8 + addq %rcx,%rax + shrq $12,%r9 + addq %rax,%r8 + adcq $0,%r9 + + shlq $14,%r11 + movq %r10,%rax + shrq $24,%r10 + addq %r11,%r9 + shlq $40,%rax + addq %rax,%r9 + adcq $0,%r10 + + movq %r10,%rax + movq %r10,%rcx + andq $3,%r10 + shrq $2,%rax + andq $-4,%rcx + addq %rcx,%rax + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + + movq %r8,%rax + addq $5,%r8 + movq %r9,%rcx + adcq $0,%r9 + adcq $0,%r10 + shrq $2,%r10 + cmovnzq %r8,%rax + cmovnzq %r9,%rcx + + addq 0(%rdx),%rax + adcq 8(%rdx),%rcx + movq %rax,0(%rsi) + movq %rcx,8(%rsi) + + ret +ENDPROC(poly1305_emit_avx) +#endif /* CONFIG_AS_AVX */ + +#ifdef CONFIG_AS_AVX2 +.align 32 +ENTRY(poly1305_blocks_avx2) + + movl 20(%rdi),%r8d + cmpq $128,%rdx + jae .Lblocks_avx2 + testl %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2: + andq $-16,%rdx + jz .Lno_data_avx2 + + vzeroupper + + testl %r8d,%r8d + jz .Lbase2_64_avx2 + + testq $63,%rdx + jz .Leven_avx2 + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lblocks_avx2_body: + + movq %rdx,%r15 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movl 16(%rdi),%r10d + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + + movl %r8d,%r14d + andq $-2147483648,%r8 + movq %r9,%r12 + movl %r9d,%ebx + andq $-2147483648,%r9 + + shrq $6,%r8 + shlq $52,%r12 + addq %r8,%r14 + shrq $12,%rbx + shrq $18,%r9 + addq %r12,%r14 + adcq %r9,%rbx + + movq %r10,%r8 + shlq $40,%r8 + shrq $24,%r10 + addq %r8,%rbx + adcq $0,%r10 + + movq $-4,%r9 + movq %r10,%r8 + andq %r10,%r9 + shrq $2,%r8 + andq $3,%r10 + addq %r9,%r8 + addq %r8,%r14 + adcq $0,%rbx + adcq $0,%r10 + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + +.Lbase2_26_pre_avx2: + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + subq $16,%r15 + + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + movq %r12,%rax + + testq $63,%r15 + jnz .Lbase2_26_pre_avx2 + + testq %rcx,%rcx + jz .Lstore_base2_64_avx2 + + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r11 + movq %rbx,%r12 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r11 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r11,%r14 + shlq $24,%r10 + andq $0x3ffffff,%r14 + shrq $40,%r12 + andq $0x3ffffff,%rbx + orq %r12,%r10 + + testq %r15,%r15 + jz .Lstore_base2_26_avx2 + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %r10d,%xmm4 + jmp .Lproceed_avx2 + +.align 32 +.Lstore_base2_64_avx2: + movq %r14,0(%rdi) + movq %rbx,8(%rdi) + movq %r10,16(%rdi) + jmp .Ldone_avx2 + +.align 16 +.Lstore_base2_26_avx2: + movl %eax,0(%rdi) + movl %edx,4(%rdi) + movl %r14d,8(%rdi) + movl %ebx,12(%rdi) + movl %r10d,16(%rdi) +.align 16 +.Ldone_avx2: + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rsp + +.Lno_data_avx2: +.Lblocks_avx2_epilogue: + ret + + +.align 32 +.Lbase2_64_avx2: + + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lbase2_64_avx2_body: + + movq %rdx,%r15 + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + movq 0(%rdi),%r14 + movq 8(%rdi),%rbx + movl 16(%rdi),%r10d + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + testq $63,%rdx + jz .Linit_avx2 + +.Lbase2_64_pre_avx2: + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + subq $16,%r15 + + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + movq %r12,%rax + + testq $63,%r15 + jnz .Lbase2_64_pre_avx2 + +.Linit_avx2: + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r8 + movq %rbx,%r9 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r8 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r8,%r14 + shlq $24,%r10 + andq $0x3ffffff,%r14 + shrq $40,%r9 + andq $0x3ffffff,%rbx + orq %r9,%r10 + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %r10d,%xmm4 + movl $1,20(%rdi) + + __poly1305_init_avx + +.Lproceed_avx2: + movq %r15,%rdx + + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rax + leaq 48(%rsp),%rsp + +.Lbase2_64_avx2_epilogue: + jmp .Ldo_avx2 + + +.align 32 +.Leven_avx2: + + vmovd 0(%rdi),%xmm0 + vmovd 4(%rdi),%xmm1 + vmovd 8(%rdi),%xmm2 + vmovd 12(%rdi),%xmm3 + vmovd 16(%rdi),%xmm4 + +.Ldo_avx2: + leaq 8(%rsp),%r10 + subq $0x128,%rsp + leaq .Lconst(%rip),%rcx + leaq 48+64(%rdi),%rdi + vmovdqa 96(%rcx),%ymm7 + + + vmovdqu -64(%rdi),%xmm9 + andq $-512,%rsp + vmovdqu -48(%rdi),%xmm10 + vmovdqu -32(%rdi),%xmm6 + vmovdqu -16(%rdi),%xmm11 + vmovdqu 0(%rdi),%xmm12 + vmovdqu 16(%rdi),%xmm13 + leaq 144(%rsp),%rax + vmovdqu 32(%rdi),%xmm14 + vpermd %ymm9,%ymm7,%ymm9 + vmovdqu 48(%rdi),%xmm15 + vpermd %ymm10,%ymm7,%ymm10 + vmovdqu 64(%rdi),%xmm5 + vpermd %ymm6,%ymm7,%ymm6 + vmovdqa %ymm9,0(%rsp) + vpermd %ymm11,%ymm7,%ymm11 + vmovdqa %ymm10,32-144(%rax) + vpermd %ymm12,%ymm7,%ymm12 + vmovdqa %ymm6,64-144(%rax) + vpermd %ymm13,%ymm7,%ymm13 + vmovdqa %ymm11,96-144(%rax) + vpermd %ymm14,%ymm7,%ymm14 + vmovdqa %ymm12,128-144(%rax) + vpermd %ymm15,%ymm7,%ymm15 + vmovdqa %ymm13,160-144(%rax) + vpermd %ymm5,%ymm7,%ymm5 + vmovdqa %ymm14,192-144(%rax) + vmovdqa %ymm15,224-144(%rax) + vmovdqa %ymm5,256-144(%rax) + vmovdqa 64(%rcx),%ymm5 + + + + vmovdqu 0(%rsi),%xmm7 + vmovdqu 16(%rsi),%xmm8 + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + leaq 64(%rsi),%rsi + + vpsrldq $6,%ymm7,%ymm9 + vpsrldq $6,%ymm8,%ymm10 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + vpunpcklqdq %ymm10,%ymm9,%ymm9 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + + vpsrlq $30,%ymm9,%ymm10 + vpsrlq $4,%ymm9,%ymm9 + vpsrlq $26,%ymm7,%ymm8 + vpsrlq $40,%ymm6,%ymm6 + vpand %ymm5,%ymm9,%ymm9 + vpand %ymm5,%ymm7,%ymm7 + vpand %ymm5,%ymm8,%ymm8 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + + vpaddq %ymm2,%ymm9,%ymm2 + subq $64,%rdx + jz .Ltail_avx2 + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + + vpaddq %ymm0,%ymm7,%ymm0 + vmovdqa 0(%rsp),%ymm7 + vpaddq %ymm1,%ymm8,%ymm1 + vmovdqa 32(%rsp),%ymm8 + vpaddq %ymm3,%ymm10,%ymm3 + vmovdqa 96(%rsp),%ymm9 + vpaddq %ymm4,%ymm6,%ymm4 + vmovdqa 48(%rax),%ymm10 + vmovdqa 112(%rax),%ymm5 + + vpmuludq %ymm2,%ymm7,%ymm13 + vpmuludq %ymm2,%ymm8,%ymm14 + vpmuludq %ymm2,%ymm9,%ymm15 + vpmuludq %ymm2,%ymm10,%ymm11 + vpmuludq %ymm2,%ymm5,%ymm12 + + vpmuludq %ymm0,%ymm8,%ymm6 + vpmuludq %ymm1,%ymm8,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq 64(%rsp),%ymm4,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm11,%ymm11 + vmovdqa -16(%rax),%ymm8 + + vpmuludq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm1,%ymm7,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vpmuludq %ymm3,%ymm7,%ymm6 + vpmuludq %ymm4,%ymm7,%ymm2 + vmovdqu 0(%rsi),%xmm7 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq %ymm4,%ymm8,%ymm2 + vmovdqu 16(%rsi),%xmm8 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vmovdqa 16(%rax),%ymm2 + vpmuludq %ymm1,%ymm9,%ymm6 + vpmuludq %ymm0,%ymm9,%ymm9 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm9,%ymm13,%ymm13 + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + leaq 64(%rsi),%rsi + + vpmuludq %ymm1,%ymm2,%ymm6 + vpmuludq %ymm0,%ymm2,%ymm2 + vpsrldq $6,%ymm7,%ymm9 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm14,%ymm14 + vpmuludq %ymm3,%ymm10,%ymm6 + vpmuludq %ymm4,%ymm10,%ymm2 + vpsrldq $6,%ymm8,%ymm10 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + + vpmuludq %ymm3,%ymm5,%ymm3 + vpmuludq %ymm4,%ymm5,%ymm4 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + vpaddq %ymm3,%ymm13,%ymm2 + vpaddq %ymm4,%ymm14,%ymm3 + vpunpcklqdq %ymm10,%ymm9,%ymm10 + vpmuludq 80(%rax),%ymm0,%ymm4 + vpmuludq %ymm1,%ymm5,%ymm0 + vmovdqa 64(%rcx),%ymm5 + vpaddq %ymm4,%ymm15,%ymm4 + vpaddq %ymm0,%ymm11,%ymm0 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm12,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $4,%ymm10,%ymm9 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpaddq %ymm15,%ymm0,%ymm0 + + vpand %ymm5,%ymm9,%ymm9 + vpsrlq $26,%ymm7,%ymm8 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpaddq %ymm13,%ymm3,%ymm3 + + vpaddq %ymm9,%ymm2,%ymm2 + vpsrlq $30,%ymm10,%ymm10 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $40,%ymm6,%ymm6 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpand %ymm5,%ymm7,%ymm7 + vpand %ymm5,%ymm8,%ymm8 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + + subq $64,%rdx + jnz .Loop_avx2 + +.byte 0x66,0x90 +.Ltail_avx2: + + vpaddq %ymm0,%ymm7,%ymm0 + vmovdqu 4(%rsp),%ymm7 + vpaddq %ymm1,%ymm8,%ymm1 + vmovdqu 36(%rsp),%ymm8 + vpaddq %ymm3,%ymm10,%ymm3 + vmovdqu 100(%rsp),%ymm9 + vpaddq %ymm4,%ymm6,%ymm4 + vmovdqu 52(%rax),%ymm10 + vmovdqu 116(%rax),%ymm5 + + vpmuludq %ymm2,%ymm7,%ymm13 + vpmuludq %ymm2,%ymm8,%ymm14 + vpmuludq %ymm2,%ymm9,%ymm15 + vpmuludq %ymm2,%ymm10,%ymm11 + vpmuludq %ymm2,%ymm5,%ymm12 + + vpmuludq %ymm0,%ymm8,%ymm6 + vpmuludq %ymm1,%ymm8,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq 68(%rsp),%ymm4,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm11,%ymm11 + + vpmuludq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm1,%ymm7,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vmovdqu -12(%rax),%ymm8 + vpaddq %ymm2,%ymm12,%ymm12 + vpmuludq %ymm3,%ymm7,%ymm6 + vpmuludq %ymm4,%ymm7,%ymm2 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq %ymm4,%ymm8,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vmovdqu 20(%rax),%ymm2 + vpmuludq %ymm1,%ymm9,%ymm6 + vpmuludq %ymm0,%ymm9,%ymm9 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm9,%ymm13,%ymm13 + + vpmuludq %ymm1,%ymm2,%ymm6 + vpmuludq %ymm0,%ymm2,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm14,%ymm14 + vpmuludq %ymm3,%ymm10,%ymm6 + vpmuludq %ymm4,%ymm10,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + + vpmuludq %ymm3,%ymm5,%ymm3 + vpmuludq %ymm4,%ymm5,%ymm4 + vpaddq %ymm3,%ymm13,%ymm2 + vpaddq %ymm4,%ymm14,%ymm3 + vpmuludq 84(%rax),%ymm0,%ymm4 + vpmuludq %ymm1,%ymm5,%ymm0 + vmovdqa 64(%rcx),%ymm5 + vpaddq %ymm4,%ymm15,%ymm4 + vpaddq %ymm0,%ymm11,%ymm0 + + vpsrldq $8,%ymm12,%ymm8 + vpsrldq $8,%ymm2,%ymm9 + vpsrldq $8,%ymm3,%ymm10 + vpsrldq $8,%ymm4,%ymm6 + vpsrldq $8,%ymm0,%ymm7 + vpaddq %ymm8,%ymm12,%ymm12 + vpaddq %ymm9,%ymm2,%ymm2 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm6,%ymm4,%ymm4 + vpaddq %ymm7,%ymm0,%ymm0 + + vpermq $0x2,%ymm3,%ymm10 + vpermq $0x2,%ymm4,%ymm6 + vpermq $0x2,%ymm0,%ymm7 + vpermq $0x2,%ymm12,%ymm8 + vpermq $0x2,%ymm2,%ymm9 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm6,%ymm4,%ymm4 + vpaddq %ymm7,%ymm0,%ymm0 + vpaddq %ymm8,%ymm12,%ymm12 + vpaddq %ymm9,%ymm2,%ymm2 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm12,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpaddq %ymm15,%ymm0,%ymm0 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpaddq %ymm13,%ymm3,%ymm3 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vmovd %xmm0,-112(%rdi) + vmovd %xmm1,-108(%rdi) + vmovd %xmm2,-104(%rdi) + vmovd %xmm3,-100(%rdi) + vmovd %xmm4,-96(%rdi) + leaq -8(%r10),%rsp + + vzeroupper + ret + +ENDPROC(poly1305_blocks_avx2) +#endif /* CONFIG_AS_AVX2 */ + +#ifdef CONFIG_AS_AVX512 +.align 32 +ENTRY(poly1305_blocks_avx512) + + movl 20(%rdi),%r8d + cmpq $128,%rdx + jae .Lblocks_avx2_512 + testl %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2_512: + andq $-16,%rdx + jz .Lno_data_avx2_512 + + vzeroupper + + testl %r8d,%r8d + jz .Lbase2_64_avx2_512 + + testq $63,%rdx + jz .Leven_avx2_512 + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lblocks_avx2_body_512: + + movq %rdx,%r15 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movl 16(%rdi),%r10d + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + + movl %r8d,%r14d + andq $-2147483648,%r8 + movq %r9,%r12 + movl %r9d,%ebx + andq $-2147483648,%r9 + + shrq $6,%r8 + shlq $52,%r12 + addq %r8,%r14 + shrq $12,%rbx + shrq $18,%r9 + addq %r12,%r14 + adcq %r9,%rbx + + movq %r10,%r8 + shlq $40,%r8 + shrq $24,%r10 + addq %r8,%rbx + adcq $0,%r10 + + movq $-4,%r9 + movq %r10,%r8 + andq %r10,%r9 + shrq $2,%r8 + andq $3,%r10 + addq %r9,%r8 + addq %r8,%r14 + adcq $0,%rbx + adcq $0,%r10 + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + +.Lbase2_26_pre_avx2_512: + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + subq $16,%r15 + + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + movq %r12,%rax + + testq $63,%r15 + jnz .Lbase2_26_pre_avx2_512 + + testq %rcx,%rcx + jz .Lstore_base2_64_avx2_512 + + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r11 + movq %rbx,%r12 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r11 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r11,%r14 + shlq $24,%r10 + andq $0x3ffffff,%r14 + shrq $40,%r12 + andq $0x3ffffff,%rbx + orq %r12,%r10 + + testq %r15,%r15 + jz .Lstore_base2_26_avx2_512 + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %r10d,%xmm4 + jmp .Lproceed_avx2_512 + +.align 32 +.Lstore_base2_64_avx2_512: + movq %r14,0(%rdi) + movq %rbx,8(%rdi) + movq %r10,16(%rdi) + jmp .Ldone_avx2_512 + +.align 16 +.Lstore_base2_26_avx2_512: + movl %eax,0(%rdi) + movl %edx,4(%rdi) + movl %r14d,8(%rdi) + movl %ebx,12(%rdi) + movl %r10d,16(%rdi) +.align 16 +.Ldone_avx2_512: + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rsp + +.Lno_data_avx2_512: +.Lblocks_avx2_epilogue_512: + ret + + +.align 32 +.Lbase2_64_avx2_512: + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lbase2_64_avx2_body_512: + + movq %rdx,%r15 + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + movq 0(%rdi),%r14 + movq 8(%rdi),%rbx + movl 16(%rdi),%r10d + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + testq $63,%rdx + jz .Linit_avx2_512 + +.Lbase2_64_pre_avx2_512: + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + subq $16,%r15 + + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + movq %r12,%rax + + testq $63,%r15 + jnz .Lbase2_64_pre_avx2_512 + +.Linit_avx2_512: + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r8 + movq %rbx,%r9 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r8 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r8,%r14 + shlq $24,%r10 + andq $0x3ffffff,%r14 + shrq $40,%r9 + andq $0x3ffffff,%rbx + orq %r9,%r10 + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %r10d,%xmm4 + movl $1,20(%rdi) + + __poly1305_init_avx + +.Lproceed_avx2_512: + movq %r15,%rdx + + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rax + leaq 48(%rsp),%rsp + +.Lbase2_64_avx2_epilogue_512: + jmp .Ldo_avx2_512 + + +.align 32 +.Leven_avx2_512: + + vmovd 0(%rdi),%xmm0 + vmovd 4(%rdi),%xmm1 + vmovd 8(%rdi),%xmm2 + vmovd 12(%rdi),%xmm3 + vmovd 16(%rdi),%xmm4 + +.Ldo_avx2_512: + cmpq $512,%rdx + jae .Lblocks_avx512 +.Lskip_avx512: + leaq 8(%rsp),%r10 + + subq $0x128,%rsp + leaq .Lconst(%rip),%rcx + leaq 48+64(%rdi),%rdi + vmovdqa 96(%rcx),%ymm7 + + + vmovdqu -64(%rdi),%xmm9 + andq $-512,%rsp + vmovdqu -48(%rdi),%xmm10 + vmovdqu -32(%rdi),%xmm6 + vmovdqu -16(%rdi),%xmm11 + vmovdqu 0(%rdi),%xmm12 + vmovdqu 16(%rdi),%xmm13 + leaq 144(%rsp),%rax + vmovdqu 32(%rdi),%xmm14 + vpermd %ymm9,%ymm7,%ymm9 + vmovdqu 48(%rdi),%xmm15 + vpermd %ymm10,%ymm7,%ymm10 + vmovdqu 64(%rdi),%xmm5 + vpermd %ymm6,%ymm7,%ymm6 + vmovdqa %ymm9,0(%rsp) + vpermd %ymm11,%ymm7,%ymm11 + vmovdqa %ymm10,32-144(%rax) + vpermd %ymm12,%ymm7,%ymm12 + vmovdqa %ymm6,64-144(%rax) + vpermd %ymm13,%ymm7,%ymm13 + vmovdqa %ymm11,96-144(%rax) + vpermd %ymm14,%ymm7,%ymm14 + vmovdqa %ymm12,128-144(%rax) + vpermd %ymm15,%ymm7,%ymm15 + vmovdqa %ymm13,160-144(%rax) + vpermd %ymm5,%ymm7,%ymm5 + vmovdqa %ymm14,192-144(%rax) + vmovdqa %ymm15,224-144(%rax) + vmovdqa %ymm5,256-144(%rax) + vmovdqa 64(%rcx),%ymm5 + + + + vmovdqu 0(%rsi),%xmm7 + vmovdqu 16(%rsi),%xmm8 + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + leaq 64(%rsi),%rsi + + vpsrldq $6,%ymm7,%ymm9 + vpsrldq $6,%ymm8,%ymm10 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + vpunpcklqdq %ymm10,%ymm9,%ymm9 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + + vpsrlq $30,%ymm9,%ymm10 + vpsrlq $4,%ymm9,%ymm9 + vpsrlq $26,%ymm7,%ymm8 + vpsrlq $40,%ymm6,%ymm6 + vpand %ymm5,%ymm9,%ymm9 + vpand %ymm5,%ymm7,%ymm7 + vpand %ymm5,%ymm8,%ymm8 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + + vpaddq %ymm2,%ymm9,%ymm2 + subq $64,%rdx + jz .Ltail_avx2_512 + jmp .Loop_avx2_512 + +.align 32 +.Loop_avx2_512: + + vpaddq %ymm0,%ymm7,%ymm0 + vmovdqa 0(%rsp),%ymm7 + vpaddq %ymm1,%ymm8,%ymm1 + vmovdqa 32(%rsp),%ymm8 + vpaddq %ymm3,%ymm10,%ymm3 + vmovdqa 96(%rsp),%ymm9 + vpaddq %ymm4,%ymm6,%ymm4 + vmovdqa 48(%rax),%ymm10 + vmovdqa 112(%rax),%ymm5 + + vpmuludq %ymm2,%ymm7,%ymm13 + vpmuludq %ymm2,%ymm8,%ymm14 + vpmuludq %ymm2,%ymm9,%ymm15 + vpmuludq %ymm2,%ymm10,%ymm11 + vpmuludq %ymm2,%ymm5,%ymm12 + + vpmuludq %ymm0,%ymm8,%ymm6 + vpmuludq %ymm1,%ymm8,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq 64(%rsp),%ymm4,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm11,%ymm11 + vmovdqa -16(%rax),%ymm8 + + vpmuludq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm1,%ymm7,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vpmuludq %ymm3,%ymm7,%ymm6 + vpmuludq %ymm4,%ymm7,%ymm2 + vmovdqu 0(%rsi),%xmm7 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq %ymm4,%ymm8,%ymm2 + vmovdqu 16(%rsi),%xmm8 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vmovdqa 16(%rax),%ymm2 + vpmuludq %ymm1,%ymm9,%ymm6 + vpmuludq %ymm0,%ymm9,%ymm9 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm9,%ymm13,%ymm13 + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + leaq 64(%rsi),%rsi + + vpmuludq %ymm1,%ymm2,%ymm6 + vpmuludq %ymm0,%ymm2,%ymm2 + vpsrldq $6,%ymm7,%ymm9 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm14,%ymm14 + vpmuludq %ymm3,%ymm10,%ymm6 + vpmuludq %ymm4,%ymm10,%ymm2 + vpsrldq $6,%ymm8,%ymm10 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + + vpmuludq %ymm3,%ymm5,%ymm3 + vpmuludq %ymm4,%ymm5,%ymm4 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + vpaddq %ymm3,%ymm13,%ymm2 + vpaddq %ymm4,%ymm14,%ymm3 + vpunpcklqdq %ymm10,%ymm9,%ymm10 + vpmuludq 80(%rax),%ymm0,%ymm4 + vpmuludq %ymm1,%ymm5,%ymm0 + vmovdqa 64(%rcx),%ymm5 + vpaddq %ymm4,%ymm15,%ymm4 + vpaddq %ymm0,%ymm11,%ymm0 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm12,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $4,%ymm10,%ymm9 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpaddq %ymm15,%ymm0,%ymm0 + + vpand %ymm5,%ymm9,%ymm9 + vpsrlq $26,%ymm7,%ymm8 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpaddq %ymm13,%ymm3,%ymm3 + + vpaddq %ymm9,%ymm2,%ymm2 + vpsrlq $30,%ymm10,%ymm10 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $40,%ymm6,%ymm6 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpand %ymm5,%ymm7,%ymm7 + vpand %ymm5,%ymm8,%ymm8 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + + subq $64,%rdx + jnz .Loop_avx2_512 + +.byte 0x66,0x90 +.Ltail_avx2_512: + + vpaddq %ymm0,%ymm7,%ymm0 + vmovdqu 4(%rsp),%ymm7 + vpaddq %ymm1,%ymm8,%ymm1 + vmovdqu 36(%rsp),%ymm8 + vpaddq %ymm3,%ymm10,%ymm3 + vmovdqu 100(%rsp),%ymm9 + vpaddq %ymm4,%ymm6,%ymm4 + vmovdqu 52(%rax),%ymm10 + vmovdqu 116(%rax),%ymm5 + + vpmuludq %ymm2,%ymm7,%ymm13 + vpmuludq %ymm2,%ymm8,%ymm14 + vpmuludq %ymm2,%ymm9,%ymm15 + vpmuludq %ymm2,%ymm10,%ymm11 + vpmuludq %ymm2,%ymm5,%ymm12 + + vpmuludq %ymm0,%ymm8,%ymm6 + vpmuludq %ymm1,%ymm8,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq 68(%rsp),%ymm4,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm11,%ymm11 + + vpmuludq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm1,%ymm7,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vmovdqu -12(%rax),%ymm8 + vpaddq %ymm2,%ymm12,%ymm12 + vpmuludq %ymm3,%ymm7,%ymm6 + vpmuludq %ymm4,%ymm7,%ymm2 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq %ymm4,%ymm8,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vmovdqu 20(%rax),%ymm2 + vpmuludq %ymm1,%ymm9,%ymm6 + vpmuludq %ymm0,%ymm9,%ymm9 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm9,%ymm13,%ymm13 + + vpmuludq %ymm1,%ymm2,%ymm6 + vpmuludq %ymm0,%ymm2,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm14,%ymm14 + vpmuludq %ymm3,%ymm10,%ymm6 + vpmuludq %ymm4,%ymm10,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + + vpmuludq %ymm3,%ymm5,%ymm3 + vpmuludq %ymm4,%ymm5,%ymm4 + vpaddq %ymm3,%ymm13,%ymm2 + vpaddq %ymm4,%ymm14,%ymm3 + vpmuludq 84(%rax),%ymm0,%ymm4 + vpmuludq %ymm1,%ymm5,%ymm0 + vmovdqa 64(%rcx),%ymm5 + vpaddq %ymm4,%ymm15,%ymm4 + vpaddq %ymm0,%ymm11,%ymm0 + + vpsrldq $8,%ymm12,%ymm8 + vpsrldq $8,%ymm2,%ymm9 + vpsrldq $8,%ymm3,%ymm10 + vpsrldq $8,%ymm4,%ymm6 + vpsrldq $8,%ymm0,%ymm7 + vpaddq %ymm8,%ymm12,%ymm12 + vpaddq %ymm9,%ymm2,%ymm2 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm6,%ymm4,%ymm4 + vpaddq %ymm7,%ymm0,%ymm0 + + vpermq $0x2,%ymm3,%ymm10 + vpermq $0x2,%ymm4,%ymm6 + vpermq $0x2,%ymm0,%ymm7 + vpermq $0x2,%ymm12,%ymm8 + vpermq $0x2,%ymm2,%ymm9 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm6,%ymm4,%ymm4 + vpaddq %ymm7,%ymm0,%ymm0 + vpaddq %ymm8,%ymm12,%ymm12 + vpaddq %ymm9,%ymm2,%ymm2 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm12,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpaddq %ymm15,%ymm0,%ymm0 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpaddq %ymm13,%ymm3,%ymm3 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vmovd %xmm0,-112(%rdi) + vmovd %xmm1,-108(%rdi) + vmovd %xmm2,-104(%rdi) + vmovd %xmm3,-100(%rdi) + vmovd %xmm4,-96(%rdi) + leaq -8(%r10),%rsp + + vzeroupper + ret + +.Lblocks_avx512: + + movl $15,%eax + kmovw %eax,%k2 + leaq 8(%rsp),%r10 + + subq $0x128,%rsp + leaq .Lconst(%rip),%rcx + leaq 48+64(%rdi),%rdi + vmovdqa 96(%rcx),%ymm9 + + vmovdqu32 -64(%rdi),%zmm16{%k2}{z} + andq $-512,%rsp + vmovdqu32 -48(%rdi),%zmm17{%k2}{z} + movq $0x20,%rax + vmovdqu32 -32(%rdi),%zmm21{%k2}{z} + vmovdqu32 -16(%rdi),%zmm18{%k2}{z} + vmovdqu32 0(%rdi),%zmm22{%k2}{z} + vmovdqu32 16(%rdi),%zmm19{%k2}{z} + vmovdqu32 32(%rdi),%zmm23{%k2}{z} + vmovdqu32 48(%rdi),%zmm20{%k2}{z} + vmovdqu32 64(%rdi),%zmm24{%k2}{z} + vpermd %zmm16,%zmm9,%zmm16 + vpbroadcastq 64(%rcx),%zmm5 + vpermd %zmm17,%zmm9,%zmm17 + vpermd %zmm21,%zmm9,%zmm21 + vpermd %zmm18,%zmm9,%zmm18 + vmovdqa64 %zmm16,0(%rsp){%k2} + vpsrlq $32,%zmm16,%zmm7 + vpermd %zmm22,%zmm9,%zmm22 + vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2} + vpsrlq $32,%zmm17,%zmm8 + vpermd %zmm19,%zmm9,%zmm19 + vmovdqa64 %zmm21,64(%rsp){%k2} + vpermd %zmm23,%zmm9,%zmm23 + vpermd %zmm20,%zmm9,%zmm20 + vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2} + vpermd %zmm24,%zmm9,%zmm24 + vmovdqa64 %zmm22,128(%rsp){%k2} + vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2} + vmovdqa64 %zmm23,192(%rsp){%k2} + vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2} + vmovdqa64 %zmm24,256(%rsp){%k2} + + vpmuludq %zmm7,%zmm16,%zmm11 + vpmuludq %zmm7,%zmm17,%zmm12 + vpmuludq %zmm7,%zmm18,%zmm13 + vpmuludq %zmm7,%zmm19,%zmm14 + vpmuludq %zmm7,%zmm20,%zmm15 + vpsrlq $32,%zmm18,%zmm9 + + vpmuludq %zmm8,%zmm24,%zmm25 + vpmuludq %zmm8,%zmm16,%zmm26 + vpmuludq %zmm8,%zmm17,%zmm27 + vpmuludq %zmm8,%zmm18,%zmm28 + vpmuludq %zmm8,%zmm19,%zmm29 + vpsrlq $32,%zmm19,%zmm10 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + + vpmuludq %zmm9,%zmm23,%zmm25 + vpmuludq %zmm9,%zmm24,%zmm26 + vpmuludq %zmm9,%zmm17,%zmm28 + vpmuludq %zmm9,%zmm18,%zmm29 + vpmuludq %zmm9,%zmm16,%zmm27 + vpsrlq $32,%zmm20,%zmm6 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm10,%zmm22,%zmm25 + vpmuludq %zmm10,%zmm16,%zmm28 + vpmuludq %zmm10,%zmm17,%zmm29 + vpmuludq %zmm10,%zmm23,%zmm26 + vpmuludq %zmm10,%zmm24,%zmm27 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm6,%zmm24,%zmm28 + vpmuludq %zmm6,%zmm16,%zmm29 + vpmuludq %zmm6,%zmm21,%zmm25 + vpmuludq %zmm6,%zmm22,%zmm26 + vpmuludq %zmm6,%zmm23,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vmovdqu64 0(%rsi),%zmm10 + vmovdqu64 64(%rsi),%zmm6 + leaq 128(%rsi),%rsi + + vpsrlq $26,%zmm14,%zmm28 + vpandq %zmm5,%zmm14,%zmm14 + vpaddq %zmm28,%zmm15,%zmm15 + + vpsrlq $26,%zmm11,%zmm25 + vpandq %zmm5,%zmm11,%zmm11 + vpaddq %zmm25,%zmm12,%zmm12 + + vpsrlq $26,%zmm15,%zmm29 + vpandq %zmm5,%zmm15,%zmm15 + + vpsrlq $26,%zmm12,%zmm26 + vpandq %zmm5,%zmm12,%zmm12 + vpaddq %zmm26,%zmm13,%zmm13 + + vpaddq %zmm29,%zmm11,%zmm11 + vpsllq $2,%zmm29,%zmm29 + vpaddq %zmm29,%zmm11,%zmm11 + + vpsrlq $26,%zmm13,%zmm27 + vpandq %zmm5,%zmm13,%zmm13 + vpaddq %zmm27,%zmm14,%zmm14 + + vpsrlq $26,%zmm11,%zmm25 + vpandq %zmm5,%zmm11,%zmm11 + vpaddq %zmm25,%zmm12,%zmm12 + + vpsrlq $26,%zmm14,%zmm28 + vpandq %zmm5,%zmm14,%zmm14 + vpaddq %zmm28,%zmm15,%zmm15 + + vpunpcklqdq %zmm6,%zmm10,%zmm7 + vpunpckhqdq %zmm6,%zmm10,%zmm6 + + vmovdqa32 128(%rcx),%zmm25 + movl $0x7777,%eax + kmovw %eax,%k1 + + vpermd %zmm16,%zmm25,%zmm16 + vpermd %zmm17,%zmm25,%zmm17 + vpermd %zmm18,%zmm25,%zmm18 + vpermd %zmm19,%zmm25,%zmm19 + vpermd %zmm20,%zmm25,%zmm20 + + vpermd %zmm11,%zmm25,%zmm16{%k1} + vpermd %zmm12,%zmm25,%zmm17{%k1} + vpermd %zmm13,%zmm25,%zmm18{%k1} + vpermd %zmm14,%zmm25,%zmm19{%k1} + vpermd %zmm15,%zmm25,%zmm20{%k1} + + vpslld $2,%zmm17,%zmm21 + vpslld $2,%zmm18,%zmm22 + vpslld $2,%zmm19,%zmm23 + vpslld $2,%zmm20,%zmm24 + vpaddd %zmm17,%zmm21,%zmm21 + vpaddd %zmm18,%zmm22,%zmm22 + vpaddd %zmm19,%zmm23,%zmm23 + vpaddd %zmm20,%zmm24,%zmm24 + + vpbroadcastq 32(%rcx),%zmm30 + + vpsrlq $52,%zmm7,%zmm9 + vpsllq $12,%zmm6,%zmm10 + vporq %zmm10,%zmm9,%zmm9 + vpsrlq $26,%zmm7,%zmm8 + vpsrlq $14,%zmm6,%zmm10 + vpsrlq $40,%zmm6,%zmm6 + vpandq %zmm5,%zmm9,%zmm9 + vpandq %zmm5,%zmm7,%zmm7 + + vpaddq %zmm2,%zmm9,%zmm2 + subq $192,%rdx + jbe .Ltail_avx512 + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + + vpmuludq %zmm2,%zmm17,%zmm14 + vpaddq %zmm0,%zmm7,%zmm0 + vpmuludq %zmm2,%zmm18,%zmm15 + vpandq %zmm5,%zmm8,%zmm8 + vpmuludq %zmm2,%zmm23,%zmm11 + vpandq %zmm5,%zmm10,%zmm10 + vpmuludq %zmm2,%zmm24,%zmm12 + vporq %zmm30,%zmm6,%zmm6 + vpmuludq %zmm2,%zmm16,%zmm13 + vpaddq %zmm1,%zmm8,%zmm1 + vpaddq %zmm3,%zmm10,%zmm3 + vpaddq %zmm4,%zmm6,%zmm4 + + vmovdqu64 0(%rsi),%zmm10 + vmovdqu64 64(%rsi),%zmm6 + leaq 128(%rsi),%rsi + vpmuludq %zmm0,%zmm19,%zmm28 + vpmuludq %zmm0,%zmm20,%zmm29 + vpmuludq %zmm0,%zmm16,%zmm25 + vpmuludq %zmm0,%zmm17,%zmm26 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + + vpmuludq %zmm1,%zmm18,%zmm28 + vpmuludq %zmm1,%zmm19,%zmm29 + vpmuludq %zmm1,%zmm24,%zmm25 + vpmuludq %zmm0,%zmm18,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm27,%zmm13,%zmm13 + + vpunpcklqdq %zmm6,%zmm10,%zmm7 + vpunpckhqdq %zmm6,%zmm10,%zmm6 + + vpmuludq %zmm3,%zmm16,%zmm28 + vpmuludq %zmm3,%zmm17,%zmm29 + vpmuludq %zmm1,%zmm16,%zmm26 + vpmuludq %zmm1,%zmm17,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm4,%zmm24,%zmm28 + vpmuludq %zmm4,%zmm16,%zmm29 + vpmuludq %zmm3,%zmm22,%zmm25 + vpmuludq %zmm3,%zmm23,%zmm26 + vpaddq %zmm28,%zmm14,%zmm14 + vpmuludq %zmm3,%zmm24,%zmm27 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm4,%zmm21,%zmm25 + vpmuludq %zmm4,%zmm22,%zmm26 + vpmuludq %zmm4,%zmm23,%zmm27 + vpaddq %zmm25,%zmm11,%zmm0 + vpaddq %zmm26,%zmm12,%zmm1 + vpaddq %zmm27,%zmm13,%zmm2 + + vpsrlq $52,%zmm7,%zmm9 + vpsllq $12,%zmm6,%zmm10 + + vpsrlq $26,%zmm14,%zmm3 + vpandq %zmm5,%zmm14,%zmm14 + vpaddq %zmm3,%zmm15,%zmm4 + + vporq %zmm10,%zmm9,%zmm9 + + vpsrlq $26,%zmm0,%zmm11 + vpandq %zmm5,%zmm0,%zmm0 + vpaddq %zmm11,%zmm1,%zmm1 + + vpandq %zmm5,%zmm9,%zmm9 + + vpsrlq $26,%zmm4,%zmm15 + vpandq %zmm5,%zmm4,%zmm4 + + vpsrlq $26,%zmm1,%zmm12 + vpandq %zmm5,%zmm1,%zmm1 + vpaddq %zmm12,%zmm2,%zmm2 + + vpaddq %zmm15,%zmm0,%zmm0 + vpsllq $2,%zmm15,%zmm15 + vpaddq %zmm15,%zmm0,%zmm0 + + vpaddq %zmm9,%zmm2,%zmm2 + vpsrlq $26,%zmm7,%zmm8 + + vpsrlq $26,%zmm2,%zmm13 + vpandq %zmm5,%zmm2,%zmm2 + vpaddq %zmm13,%zmm14,%zmm3 + + vpsrlq $14,%zmm6,%zmm10 + + vpsrlq $26,%zmm0,%zmm11 + vpandq %zmm5,%zmm0,%zmm0 + vpaddq %zmm11,%zmm1,%zmm1 + + vpsrlq $40,%zmm6,%zmm6 + + vpsrlq $26,%zmm3,%zmm14 + vpandq %zmm5,%zmm3,%zmm3 + vpaddq %zmm14,%zmm4,%zmm4 + + vpandq %zmm5,%zmm7,%zmm7 + + subq $128,%rdx + ja .Loop_avx512 + +.Ltail_avx512: + + vpsrlq $32,%zmm16,%zmm16 + vpsrlq $32,%zmm17,%zmm17 + vpsrlq $32,%zmm18,%zmm18 + vpsrlq $32,%zmm23,%zmm23 + vpsrlq $32,%zmm24,%zmm24 + vpsrlq $32,%zmm19,%zmm19 + vpsrlq $32,%zmm20,%zmm20 + vpsrlq $32,%zmm21,%zmm21 + vpsrlq $32,%zmm22,%zmm22 + + leaq (%rsi,%rdx,1),%rsi + + vpaddq %zmm0,%zmm7,%zmm0 + + vpmuludq %zmm2,%zmm17,%zmm14 + vpmuludq %zmm2,%zmm18,%zmm15 + vpmuludq %zmm2,%zmm23,%zmm11 + vpandq %zmm5,%zmm8,%zmm8 + vpmuludq %zmm2,%zmm24,%zmm12 + vpandq %zmm5,%zmm10,%zmm10 + vpmuludq %zmm2,%zmm16,%zmm13 + vporq %zmm30,%zmm6,%zmm6 + vpaddq %zmm1,%zmm8,%zmm1 + vpaddq %zmm3,%zmm10,%zmm3 + vpaddq %zmm4,%zmm6,%zmm4 + + vmovdqu 0(%rsi),%xmm7 + vpmuludq %zmm0,%zmm19,%zmm28 + vpmuludq %zmm0,%zmm20,%zmm29 + vpmuludq %zmm0,%zmm16,%zmm25 + vpmuludq %zmm0,%zmm17,%zmm26 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + + vmovdqu 16(%rsi),%xmm8 + vpmuludq %zmm1,%zmm18,%zmm28 + vpmuludq %zmm1,%zmm19,%zmm29 + vpmuludq %zmm1,%zmm24,%zmm25 + vpmuludq %zmm0,%zmm18,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm27,%zmm13,%zmm13 + + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + vpmuludq %zmm3,%zmm16,%zmm28 + vpmuludq %zmm3,%zmm17,%zmm29 + vpmuludq %zmm1,%zmm16,%zmm26 + vpmuludq %zmm1,%zmm17,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + vpmuludq %zmm4,%zmm24,%zmm28 + vpmuludq %zmm4,%zmm16,%zmm29 + vpmuludq %zmm3,%zmm22,%zmm25 + vpmuludq %zmm3,%zmm23,%zmm26 + vpmuludq %zmm3,%zmm24,%zmm27 + vpaddq %zmm28,%zmm14,%zmm3 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm4,%zmm21,%zmm25 + vpmuludq %zmm4,%zmm22,%zmm26 + vpmuludq %zmm4,%zmm23,%zmm27 + vpaddq %zmm25,%zmm11,%zmm0 + vpaddq %zmm26,%zmm12,%zmm1 + vpaddq %zmm27,%zmm13,%zmm2 + + movl $1,%eax + vpermq $0xb1,%zmm3,%zmm14 + vpermq $0xb1,%zmm15,%zmm4 + vpermq $0xb1,%zmm0,%zmm11 + vpermq $0xb1,%zmm1,%zmm12 + vpermq $0xb1,%zmm2,%zmm13 + vpaddq %zmm14,%zmm3,%zmm3 + vpaddq %zmm15,%zmm4,%zmm4 + vpaddq %zmm11,%zmm0,%zmm0 + vpaddq %zmm12,%zmm1,%zmm1 + vpaddq %zmm13,%zmm2,%zmm2 + + kmovw %eax,%k3 + vpermq $0x2,%zmm3,%zmm14 + vpermq $0x2,%zmm4,%zmm15 + vpermq $0x2,%zmm0,%zmm11 + vpermq $0x2,%zmm1,%zmm12 + vpermq $0x2,%zmm2,%zmm13 + vpaddq %zmm14,%zmm3,%zmm3 + vpaddq %zmm15,%zmm4,%zmm4 + vpaddq %zmm11,%zmm0,%zmm0 + vpaddq %zmm12,%zmm1,%zmm1 + vpaddq %zmm13,%zmm2,%zmm2 + + vextracti64x4 $0x1,%zmm3,%ymm14 + vextracti64x4 $0x1,%zmm4,%ymm15 + vextracti64x4 $0x1,%zmm0,%ymm11 + vextracti64x4 $0x1,%zmm1,%ymm12 + vextracti64x4 $0x1,%zmm2,%ymm13 + vpaddq %zmm14,%zmm3,%zmm3{%k3}{z} + vpaddq %zmm15,%zmm4,%zmm4{%k3}{z} + vpaddq %zmm11,%zmm0,%zmm0{%k3}{z} + vpaddq %zmm12,%zmm1,%zmm1{%k3}{z} + vpaddq %zmm13,%zmm2,%zmm2{%k3}{z} + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpsrldq $6,%ymm7,%ymm9 + vpsrldq $6,%ymm8,%ymm10 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpunpcklqdq %ymm10,%ymm9,%ymm9 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpsrlq $30,%ymm9,%ymm10 + vpsrlq $4,%ymm9,%ymm9 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpsrlq $26,%ymm7,%ymm8 + vpsrlq $40,%ymm6,%ymm6 + vpaddq %ymm15,%ymm0,%ymm0 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpand %ymm5,%ymm9,%ymm9 + vpand %ymm5,%ymm7,%ymm7 + vpaddq %ymm13,%ymm3,%ymm3 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm2,%ymm9,%ymm2 + vpand %ymm5,%ymm8,%ymm8 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + vpaddq %ymm14,%ymm4,%ymm4 + + leaq 144(%rsp),%rax + addq $64,%rdx + jnz .Ltail_avx2_512 + + vpsubq %ymm9,%ymm2,%ymm2 + vmovd %xmm0,-112(%rdi) + vmovd %xmm1,-108(%rdi) + vmovd %xmm2,-104(%rdi) + vmovd %xmm3,-100(%rdi) + vmovd %xmm4,-96(%rdi) + vzeroall + leaq -8(%r10),%rsp + + ret + +ENDPROC(poly1305_blocks_avx512) +#endif /* CONFIG_AS_AVX512 */ -- 2.18.0