On Fri, Aug 2, 2019 at 8:15 AM Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx> wrote: > > Provide a version of the core AES transform to the aegis128 SIMD > code that does not rely on the special AES instructions, but uses > plain NEON instructions instead. This allows the SIMD version of > the aegis128 driver to be used on arm64 systems that do not > implement those instructions (which are not mandatory in the > architecture), such as the Raspberry Pi 3. > > Cc: Nick Desaulniers <ndesaulniers@xxxxxxxxxx> Thanks for the heads up, thoughts below: > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx> > --- > crypto/Makefile | 5 ++ > crypto/aegis128-neon-inner.c | 53 ++++++++++++++++++++ > crypto/aegis128-neon.c | 16 +++++- > 3 files changed, 73 insertions(+), 1 deletion(-) > > diff --git a/crypto/Makefile b/crypto/Makefile > index 99a9fa9087d1..c3760c7616ac 100644 > --- a/crypto/Makefile > +++ b/crypto/Makefile > @@ -99,6 +99,11 @@ aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o > endif > ifeq ($(ARCH),arm64) > CFLAGS_aegis128-neon-inner.o += -ffreestanding -mcpu=generic+crypto > +CFLAGS_aegis128-neon-inner.o += -ffixed-q14 -ffixed-q15 > +CFLAGS_aegis128-neon-inner.o += -ffixed-q16 -ffixed-q17 -ffixed-q18 -ffixed-q19 > +CFLAGS_aegis128-neon-inner.o += -ffixed-q20 -ffixed-q21 -ffixed-q22 -ffixed-q23 > +CFLAGS_aegis128-neon-inner.o += -ffixed-q24 -ffixed-q25 -ffixed-q26 -ffixed-q27 > +CFLAGS_aegis128-neon-inner.o += -ffixed-q28 -ffixed-q29 -ffixed-q30 -ffixed-q31 So Tri implemented support for -ffixed-x*, but Clang currently lacks support for -ffixed-q*. Petr recently made this slightly more generic: https://reviews.llvm.org/D56305 but Clang still doesn't allow specifying any register number + width for each supported arch. The arm64 support for x registers was manually added. I'm guessing that for arm64, if: * w* is 32b registers * x* is 64b registers then: * q* is 128b NEON registers? 
I'm curious as to why we need to reserve these registers when calling functions in this TU? I assume this has to do with the calling convention for uint8x16_t? Can you point me to documentation about that (that way I can reference in any patch to Clang/LLVM)? > CFLAGS_REMOVE_aegis128-neon-inner.o += -mgeneral-regs-only > aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o > endif > diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c > index 6aca2f425b6d..7aa4cef3c2de 100644 > --- a/crypto/aegis128-neon-inner.c > +++ b/crypto/aegis128-neon-inner.c > @@ -17,6 +17,8 @@ > > #include <stddef.h> > > +extern int aegis128_have_aes_insn; > + > void *memcpy(void *dest, const void *src, size_t n); > void *memset(void *s, int c, size_t n); > > @@ -49,6 +51,32 @@ uint8x16_t aegis_aes_round(uint8x16_t w) > { > uint8x16_t z = {}; > > +#ifdef CONFIG_ARM64 > + if (!__builtin_expect(aegis128_have_aes_insn, 1)) { Can we use a likely/unlikely here? It always takes me a minute to decode these. > + uint8x16_t v; > + > + // shift rows > + asm("tbl %0.16b, {%0.16b}, v14.16b" : "+w"(w)); > + > + // sub bytes > + asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w)); > + w -= 0x40; > + asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w)); > + w -= 0x40; > + asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w)); > + w -= 0x40; > + asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w)); > + > + // mix columns > + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b); What does it mean to right shift an int8x16_t? Is that elementwise right shift or do the bits shift from one element to another? > + w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v); > + asm("tbl %0.16b, {%1.16b}, v15.16b" : "=w"(v) : "w"(v ^ w)); > + w ^= v; > + > + return w; > + } > +#endif > + > /* > * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics > * to force the compiler to issue the aese/aesmc instructions in pairs. 
> @@ -149,3 +177,28 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src, > > aegis128_save_state_neon(st, state); > } > + > +#ifdef CONFIG_ARM64 > +void crypto_aegis128_init_neon(void) > +{ > + u64 tmp; > + > + asm volatile( > + "adrp %0, crypto_aes_sbox \n\t" > + "add %0, %0, :lo12:crypto_aes_sbox \n\t" > + "mov v14.16b, %1.16b \n\t" > + "mov v15.16b, %2.16b \n\t" > + "ld1 {v16.16b-v19.16b}, [%0], #64 \n\t" > + "ld1 {v20.16b-v23.16b}, [%0], #64 \n\t" > + "ld1 {v24.16b-v27.16b}, [%0], #64 \n\t" > + "ld1 {v28.16b-v31.16b}, [%0] \n\t" > + : "=&r"(tmp) > + : "w"((uint8x16_t){ // shift rows permutation vector > + 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, > + 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, }), > + "w"((uint8x16_t){ // ror32 permutation vector > + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, > + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, }) > + ); > +} > +#endif > diff --git a/crypto/aegis128-neon.c b/crypto/aegis128-neon.c > index c1c0a1686f67..72f9d48e4963 100644 > --- a/crypto/aegis128-neon.c > +++ b/crypto/aegis128-neon.c > @@ -14,14 +14,24 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src, > void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src, > unsigned int size); > > +void crypto_aegis128_init_neon(void); > + > +int aegis128_have_aes_insn __ro_after_init; > + > bool crypto_aegis128_have_simd(void) > { > - return cpu_have_feature(cpu_feature(AES)); > + if (cpu_have_feature(cpu_feature(AES))) { > + aegis128_have_aes_insn = 1; > + return true; This could just fall through, right? (if you removed the return statement, I assume IS_ENABLED doesn't have runtime overhead but is just a preprocessor check?) 
> + } > + return IS_ENABLED(CONFIG_ARM64); > } > > void crypto_aegis128_update_simd(union aegis_block *state, const void *msg) > { > kernel_neon_begin(); > + if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn) > + crypto_aegis128_init_neon(); > crypto_aegis128_update_neon(state, msg); > kernel_neon_end(); > } > @@ -30,6 +40,8 @@ void crypto_aegis128_encrypt_chunk_simd(union aegis_block *state, u8 *dst, > const u8 *src, unsigned int size) > { > kernel_neon_begin(); > + if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn) > + crypto_aegis128_init_neon(); > crypto_aegis128_encrypt_chunk_neon(state, dst, src, size); > kernel_neon_end(); > } > @@ -38,6 +50,8 @@ void crypto_aegis128_decrypt_chunk_simd(union aegis_block *state, u8 *dst, > const u8 *src, unsigned int size) > { > kernel_neon_begin(); > + if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn) > + crypto_aegis128_init_neon(); > crypto_aegis128_decrypt_chunk_neon(state, dst, src, size); > kernel_neon_end(); > } > -- > 2.17.1 > -- Thanks, ~Nick Desaulniers