Provide a version of the core AES transform to the aegis128 SIMD code
that does not rely on the special AES instructions, but uses plain NEON
instructions instead. This allows the SIMD version of the aegis128 driver
to be used on arm64 systems that do not implement those instructions
(which are not mandatory in the architecture), such as the Raspberry Pi 3.

Cc: Nick Desaulniers <ndesaulniers@xxxxxxxxxx>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
---
 crypto/Makefile              |  5 ++
 crypto/aegis128-neon-inner.c | 53 ++++++++++++++++++++
 crypto/aegis128-neon.c       | 16 +++++-
 3 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/crypto/Makefile b/crypto/Makefile
index 99a9fa9087d1..c3760c7616ac 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -99,6 +99,11 @@ aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
 endif
 ifeq ($(ARCH),arm64)
 CFLAGS_aegis128-neon-inner.o += -ffreestanding -mcpu=generic+crypto
+CFLAGS_aegis128-neon-inner.o += -ffixed-q14 -ffixed-q15
+CFLAGS_aegis128-neon-inner.o += -ffixed-q16 -ffixed-q17 -ffixed-q18 -ffixed-q19
+CFLAGS_aegis128-neon-inner.o += -ffixed-q20 -ffixed-q21 -ffixed-q22 -ffixed-q23
+CFLAGS_aegis128-neon-inner.o += -ffixed-q24 -ffixed-q25 -ffixed-q26 -ffixed-q27
+CFLAGS_aegis128-neon-inner.o += -ffixed-q28 -ffixed-q29 -ffixed-q30 -ffixed-q31
 CFLAGS_REMOVE_aegis128-neon-inner.o += -mgeneral-regs-only
 aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
 endif
diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c
index 6aca2f425b6d..7aa4cef3c2de 100644
--- a/crypto/aegis128-neon-inner.c
+++ b/crypto/aegis128-neon-inner.c
@@ -17,6 +17,8 @@
 
 #include <stddef.h>
 
+extern int aegis128_have_aes_insn;
+
 void *memcpy(void *dest, const void *src, size_t n);
 void *memset(void *s, int c, size_t n);
 
@@ -49,6 +51,32 @@ uint8x16_t aegis_aes_round(uint8x16_t w)
 {
 	uint8x16_t z = {};
 
+#ifdef CONFIG_ARM64
+	if (!__builtin_expect(aegis128_have_aes_insn, 1)) {
+		uint8x16_t v;
+
+		// shift rows
+		asm("tbl %0.16b, {%0.16b}, v14.16b" : "+w"(w));
+
+		// sub bytes
+		asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
+		w -= 0x40;
+		asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
+		w -= 0x40;
+		asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
+		w -= 0x40;
+		asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
+
+		// mix columns
+		w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
+		w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
+		asm("tbl %0.16b, {%1.16b}, v15.16b" : "=w"(v) : "w"(v ^ w));
+		w ^= v;
+
+		return w;
+	}
+#endif
+
 	/*
 	 * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
 	 * to force the compiler to issue the aese/aesmc instructions in pairs.
@@ -149,3 +177,28 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
 
 	aegis128_save_state_neon(st, state);
 }
+
+#ifdef CONFIG_ARM64
+void crypto_aegis128_init_neon(void)
+{
+	u64 tmp;
+
+	asm volatile(
+		"adrp %0, crypto_aes_sbox \n\t"
+		"add %0, %0, :lo12:crypto_aes_sbox \n\t"
+		"mov v14.16b, %1.16b \n\t"
+		"mov v15.16b, %2.16b \n\t"
+		"ld1 {v16.16b-v19.16b}, [%0], #64 \n\t"
+		"ld1 {v20.16b-v23.16b}, [%0], #64 \n\t"
+		"ld1 {v24.16b-v27.16b}, [%0], #64 \n\t"
+		"ld1 {v28.16b-v31.16b}, [%0] \n\t"
+		: "=&r"(tmp)
+		: "w"((uint8x16_t){ // shift rows permutation vector
+			0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+			0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, }),
+		  "w"((uint8x16_t){ // ror32 permutation vector
+			0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+			0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, })
+	);
+}
+#endif
diff --git a/crypto/aegis128-neon.c b/crypto/aegis128-neon.c
index c1c0a1686f67..72f9d48e4963 100644
--- a/crypto/aegis128-neon.c
+++ b/crypto/aegis128-neon.c
@@ -14,14 +14,24 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
 void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
 					unsigned int size);
 
+void crypto_aegis128_init_neon(void);
+
+int aegis128_have_aes_insn __ro_after_init;
+
 bool crypto_aegis128_have_simd(void)
 {
-	return cpu_have_feature(cpu_feature(AES));
+	if (cpu_have_feature(cpu_feature(AES))) {
+		aegis128_have_aes_insn = 1;
+		return true;
+	}
+	return IS_ENABLED(CONFIG_ARM64);
 }
 
 void crypto_aegis128_update_simd(union aegis_block *state, const void *msg)
 {
 	kernel_neon_begin();
+	if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn)
+		crypto_aegis128_init_neon();
 	crypto_aegis128_update_neon(state, msg);
 	kernel_neon_end();
 }
@@ -30,6 +40,8 @@ void crypto_aegis128_encrypt_chunk_simd(union aegis_block *state, u8 *dst,
 					const u8 *src, unsigned int size)
 {
 	kernel_neon_begin();
+	if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn)
+		crypto_aegis128_init_neon();
 	crypto_aegis128_encrypt_chunk_neon(state, dst, src, size);
 	kernel_neon_end();
 }
@@ -38,6 +50,8 @@ void crypto_aegis128_decrypt_chunk_simd(union aegis_block *state, u8 *dst,
 					const u8 *src, unsigned int size)
 {
 	kernel_neon_begin();
+	if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn)
+		crypto_aegis128_init_neon();
 	crypto_aegis128_decrypt_chunk_neon(state, dst, src, size);
 	kernel_neon_end();
 }
-- 
2.17.1