Avoid copying the tail block via a stack buffer if the total size exceeds a single AEGIS block. In this case, we can use overlapping loads and stores and NEON permutation instructions instead, which leads to a modest performance improvement on some cores (< 5%), and is slightly cleaner. Note that we still need to use a stack buffer if the entire input is smaller than 16 bytes, given that we cannot use 16 byte NEON loads and stores safely in this case. Signed-off-by: Ard Biesheuvel <ardb@xxxxxxxxxx> --- crypto/aegis128-neon-inner.c | 89 +++++++++++++++++--- 1 file changed, 75 insertions(+), 14 deletions(-) diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c index 2a660ac1bc3a..cd1b3ad1d1f3 100644 --- a/crypto/aegis128-neon-inner.c +++ b/crypto/aegis128-neon-inner.c @@ -20,7 +20,6 @@ extern int aegis128_have_aes_insn; void *memcpy(void *dest, const void *src, size_t n); -void *memset(void *s, int c, size_t n); struct aegis128_state { uint8x16_t v[5]; @@ -173,10 +172,46 @@ void crypto_aegis128_update_neon(void *state, const void *msg) aegis128_save_state_neon(st, state); } +#ifdef CONFIG_ARM +/* + * AArch32 does not provide these intrinsics natively because it does not + * implement the underlying instructions. AArch32 only provides 64-bit + * wide vtbl.8/vtbx.8 instruction, so use those instead. + */ +static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b) +{ + union { + uint8x16_t val; + uint8x8x2_t pair; + } __a = { a }; + + return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)), + vtbl2_u8(__a.pair, vget_high_u8(b))); +} + +static uint8x16_t vqtbx1q_u8(uint8x16_t v, uint8x16_t a, uint8x16_t b) +{ + union { + uint8x16_t val; + uint8x8x2_t pair; + } __a = { a }; + + return vcombine_u8(vtbx2_u8(vget_low_u8(v), __a.pair, vget_low_u8(b)), + vtbx2_u8(vget_high_u8(v), __a.pair, vget_high_u8(b))); +} +#endif + +static const uint8_t permute[] __aligned(64) = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +}; + void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src, unsigned int size) { struct aegis128_state st = aegis128_load_state_neon(state); + const int short_input = size < AEGIS_BLOCK_SIZE; uint8x16_t msg; preload_sbox(); @@ -186,7 +221,8 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src, msg = vld1q_u8(src); st = aegis128_update_neon(st, msg); - vst1q_u8(dst, msg ^ s); + msg ^= s; + vst1q_u8(dst, msg); size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; @@ -195,13 +231,26 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src, if (size > 0) { uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; - uint8_t buf[AEGIS_BLOCK_SIZE] = {}; + uint8_t buf[AEGIS_BLOCK_SIZE]; + const void *in = src; + void *out = dst; + uint8x16_t m; - memcpy(buf, src, size); - msg = vld1q_u8(buf); - st = aegis128_update_neon(st, msg); - vst1q_u8(buf, msg ^ s); - memcpy(dst, buf, size); + if (__builtin_expect(short_input, 0)) + in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size); + + m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE), + vld1q_u8(permute + 32 - size)); + + st = aegis128_update_neon(st, m); + + vst1q_u8(out + size - AEGIS_BLOCK_SIZE, + vqtbl1q_u8(m ^ s, vld1q_u8(permute + size))); + + if (__builtin_expect(short_input, 0)) + memcpy(dst, out, size); + else + vst1q_u8(out - AEGIS_BLOCK_SIZE, msg); } aegis128_save_state_neon(st, state); @@ -211,6 +260,7 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src, unsigned int size) { struct aegis128_state st = aegis128_load_state_neon(state); + const int short_input = size < AEGIS_BLOCK_SIZE; uint8x16_t msg; preload_sbox(); @@ -228,14 +278,25 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src, if (size > 0) { uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; uint8_t buf[AEGIS_BLOCK_SIZE]; + const void *in = src; + void *out = dst; + uint8x16_t m; - vst1q_u8(buf, s); - memcpy(buf, src, size); - msg = vld1q_u8(buf) ^ s; - vst1q_u8(buf, msg); - memcpy(dst, buf, size); + if (__builtin_expect(short_input, 0)) + in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size); - st = aegis128_update_neon(st, msg); + m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE), + vld1q_u8(permute + 32 - size)); + + st = aegis128_update_neon(st, m); + + vst1q_u8(out + size - AEGIS_BLOCK_SIZE, + vqtbl1q_u8(m, vld1q_u8(permute + size))); + + if (__builtin_expect(short_input, 0)) + memcpy(dst, out, size); + else + vst1q_u8(out - AEGIS_BLOCK_SIZE, msg); } aegis128_save_state_neon(st, state); -- 2.17.1