On Fri, Aug 2, 2019 at 8:15 AM Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx> wrote: > > Provide a version of the core AES transform to the aegis128 SIMD > code that does not rely on the special AES instructions, but uses > plain NEON instructions instead. This allows the SIMD version of > the aegis128 driver to be used on arm64 systems that do not > implement those instructions (which are not mandatory in the > architecture), such as the Raspberry Pi 3. > > Cc: Nick Desaulniers <ndesaulniers@xxxxxxxxxx> Thanks for the heads up, thoughts below: > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx> > --- > crypto/Makefile | 5 ++ > crypto/aegis128-neon-inner.c | 53 ++++++++++++++++++++ > crypto/aegis128-neon.c | 16 +++++- > 3 files changed, 73 insertions(+), 1 deletion(-) > > diff --git a/crypto/Makefile b/crypto/Makefile > index 99a9fa9087d1..c3760c7616ac 100644 > --- a/crypto/Makefile > +++ b/crypto/Makefile > @@ -99,6 +99,11 @@ aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o > endif > ifeq ($(ARCH),arm64) > CFLAGS_aegis128-neon-inner.o += -ffreestanding -mcpu=generic+crypto > +CFLAGS_aegis128-neon-inner.o += -ffixed-q14 -ffixed-q15 > +CFLAGS_aegis128-neon-inner.o += -ffixed-q16 -ffixed-q17 -ffixed-q18 -ffixed-q19 > +CFLAGS_aegis128-neon-inner.o += -ffixed-q20 -ffixed-q21 -ffixed-q22 -ffixed-q23 > +CFLAGS_aegis128-neon-inner.o += -ffixed-q24 -ffixed-q25 -ffixed-q26 -ffixed-q27 > +CFLAGS_aegis128-neon-inner.o += -ffixed-q28 -ffixed-q29 -ffixed-q30 -ffixed-q31 So Tri implemented support for -ffixed-x*, but Clang currently lacks support for -ffixed-q*. Petr recently made this slightly more generic: https://reviews.llvm.org/D56305 but Clang still doesn't allow specifying any register number + width for each supported arch. The arm64 support for x registers was manually added. I'm guessing that for arm64, if: * w* is 32b registers * x* is 64b registers then: * q* is 128b NEON registers? 
I'm curious as to why we need to reserve these registers when calling functions in this TU? I assume this has to do with the calling convention for uint8x16_t? Can you point me to documentation about that (that way I can reference in any patch to Clang/LLVM)? > CFLAGS_REMOVE_aegis128-neon-inner.o += -mgeneral-regs-only > aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o > endif > diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c > index 6aca2f425b6d..7aa4cef3c2de 100644 > --- a/crypto/aegis128-neon-inner.c > +++ b/crypto/aegis128-neon-inner.c > @@ -17,6 +17,8 @@ > > #include <stddef.h> > > +extern int aegis128_have_aes_insn; > + > void *memcpy(void *dest, const void *src, size_t n); > void *memset(void *s, int c, size_t n); > > @@ -49,6 +51,32 @@ uint8x16_t aegis_aes_round(uint8x16_t w) > { > uint8x16_t z = {}; > > +#ifdef CONFIG_ARM64 > + if (!__builtin_expect(aegis128_have_aes_insn, 1)) { Can we use a likely/unlikely here? It always takes me a minute to decode these. > + uint8x16_t v; > + > + // shift rows > + asm("tbl %0.16b, {%0.16b}, v14.16b" : "+w"(w)); > + > + // sub bytes > + asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w)); > + w -= 0x40; > + asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w)); > + w -= 0x40; > + asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w)); > + w -= 0x40; > + asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w)); > + > + // mix columns > + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b); What does it mean to right shift an int8x16_t? Is that elementwise right shift or do the bits shift from one element to another? > + w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v); > + asm("tbl %0.16b, {%1.16b}, v15.16b" : "=w"(v) : "w"(v ^ w)); > + w ^= v; > + > + return w; > + } > +#endif > + > /* > * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics > * to force the compiler to issue the aese/aesmc instructions in pairs. 
> @@ -149,3 +177,28 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src, > > aegis128_save_state_neon(st, state); > } > + > +#ifdef CONFIG_ARM64 > +void crypto_aegis128_init_neon(void) > +{ > + u64 tmp; > + > + asm volatile( > + "adrp %0, crypto_aes_sbox \n\t" > + "add %0, %0, :lo12:crypto_aes_sbox \n\t" > + "mov v14.16b, %1.16b \n\t" > + "mov v15.16b, %2.16b \n\t" > + "ld1 {v16.16b-v19.16b}, [%0], #64 \n\t" > + "ld1 {v20.16b-v23.16b}, [%0], #64 \n\t" > + "ld1 {v24.16b-v27.16b}, [%0], #64 \n\t" > + "ld1 {v28.16b-v31.16b}, [%0] \n\t" > + : "=&r"(tmp) > + : "w"((uint8x16_t){ // shift rows permutation vector > + 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, > + 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, }), > + "w"((uint8x16_t){ // ror32 permutation vector > + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, > + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, }) > + ); > +} > +#endif > diff --git a/crypto/aegis128-neon.c b/crypto/aegis128-neon.c > index c1c0a1686f67..72f9d48e4963 100644 > --- a/crypto/aegis128-neon.c > +++ b/crypto/aegis128-neon.c > @@ -14,14 +14,24 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src, > void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src, > unsigned int size); > > +void crypto_aegis128_init_neon(void); > + > +int aegis128_have_aes_insn __ro_after_init; > + > bool crypto_aegis128_have_simd(void) > { > - return cpu_have_feature(cpu_feature(AES)); > + if (cpu_have_feature(cpu_feature(AES))) { > + aegis128_have_aes_insn = 1; > + return true; This could just fall through, right? (if you removed the return statement, I assume IS_ENABLED doesn't have runtime overhead but is just a preprocessor check?) 
> + } > + return IS_ENABLED(CONFIG_ARM64); > } > > void crypto_aegis128_update_simd(union aegis_block *state, const void *msg) > { > kernel_neon_begin(); > + if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn) > + crypto_aegis128_init_neon(); > crypto_aegis128_update_neon(state, msg); > kernel_neon_end(); > } > @@ -30,6 +40,8 @@ void crypto_aegis128_encrypt_chunk_simd(union aegis_block *state, u8 *dst, > const u8 *src, unsigned int size) > { > kernel_neon_begin(); > + if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn) > + crypto_aegis128_init_neon(); > crypto_aegis128_encrypt_chunk_neon(state, dst, src, size); > kernel_neon_end(); > } > @@ -38,6 +50,8 @@ void crypto_aegis128_decrypt_chunk_simd(union aegis_block *state, u8 *dst, > const u8 *src, unsigned int size) > { > kernel_neon_begin(); > + if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn) > + crypto_aegis128_init_neon(); > crypto_aegis128_decrypt_chunk_neon(state, dst, src, size); > kernel_neon_end(); > } > -- > 2.17.1 > -- Thanks, ~Nick Desaulniers