From: Ard Biesheuvel <ardb@xxxxxxxxxx>

The encryption and decryption code paths are mostly identical, except
for a small difference where the plaintext input into the MAC is taken
from either the input or the output block.

We can factor this in quite easily using a vector bit select, and a few
additional XORs, without the need for branches. This way, we can use
the same asm helper on the encrypt and decrypt code paths.

Signed-off-by: Ard Biesheuvel <ardb@xxxxxxxxxx>
---
 arch/arm64/crypto/aes-ce-ccm-core.S | 41 +++++++++-----------
 1 file changed, 18 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index 0ec59fc4ef3e..75be3157bae1 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -60,7 +60,7 @@ SYM_FUNC_START(ce_aes_ccm_final)
 	ret
 SYM_FUNC_END(ce_aes_ccm_final)
 
-	.macro	aes_ccm_do_crypt,enc
+SYM_FUNC_START_LOCAL(aes_ccm_do_crypt)
 	load_round_keys	x3, w4, x10
 
 	cbz	x2, 5f
@@ -76,28 +76,24 @@ CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
 	aes_encrypt	v0, v1, w4
+	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
+	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
 
 	subs	w2, w2, #16
 	bmi	6f				/* partial block? */
 	ld1	{v2.16b}, [x1], #16		/* load next input block */
-	.if	\enc == 1
-	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
-	eor	v6.16b, v1.16b, v2.16b		/* xor with crypted ctr */
-	.else
-	eor	v2.16b, v2.16b, v1.16b		/* xor with crypted ctr */
-	eor	v6.16b, v2.16b, v5.16b		/* final round enc */
-	.endif
-	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
+	eor	v6.16b, v2.16b, v1.16b		/* en/decrypt input block */
+	mov	v23.16b, v22.16b
+	bsl	v23.16b, v2.16b, v6.16b		/* select plaintext */
 	st1	{v6.16b}, [x0], #16		/* write output block */
+	eor	v0.16b, v0.16b, v23.16b		/* fold plaintext into mac */
+
 	bne	0b
 CPU_LE(	rev	x8, x8			)
 	st1	{v0.16b}, [x5]			/* store mac */
 	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
 5:	ret
 
-6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
-	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
-
-	add	x1, x1, w2, sxtw		/* rewind the input pointer (w2 < 0) */
+6:	add	x1, x1, w2, sxtw		/* rewind the input pointer (w2 < 0) */
 	add	x0, x0, w2, sxtw		/* rewind the output pointer */
 
 	adr_l	x8, .Lpermute			/* load permute vectors */
@@ -108,20 +104,17 @@ CPU_LE(	rev	x8, x8			)
 	ld1	{v2.16b}, [x1]			/* load a full block of input */
 	tbl	v1.16b, {v1.16b}, v7.16b	/* move keystream to end of register */
-	.if	\enc == 1
-	tbl	v7.16b, {v2.16b}, v9.16b	/* copy plaintext to start of v7 */
+	tbl	v7.16b, {v2.16b}, v9.16b	/* copy input block to start of v7 */
 	eor	v2.16b, v2.16b, v1.16b		/* encrypt partial input block */
-	.else
-	eor	v2.16b, v2.16b, v1.16b		/* decrypt partial input block */
-	tbl	v7.16b, {v2.16b}, v9.16b	/* copy plaintext to start of v7 */
-	.endif
-	eor	v0.16b, v0.16b, v7.16b		/* fold plaintext into mac */
+	tbl	v9.16b, {v2.16b}, v9.16b	/* copy output block to start of v9 */
+	bsl	v22.16b, v7.16b, v9.16b		/* select plaintext */
+	eor	v0.16b, v0.16b, v22.16b		/* fold plaintext into mac */
 	tbx	v2.16b, {v6.16b}, v8.16b	/* insert output from previous iteration */
 	st1	{v0.16b}, [x5]			/* store mac */
 	st1	{v2.16b}, [x0]			/* store output block */
 	ret
-	.endm
+SYM_FUNC_END(aes_ccm_do_crypt)
 
 	/*
 	 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
@@ -132,11 +125,13 @@ CPU_LE(	rev	x8, x8			)
 	 *			   u8 ctr[]);
 	 */
 SYM_FUNC_START(ce_aes_ccm_encrypt)
-	aes_ccm_do_crypt	1
+	movi	v22.16b, #255
+	b	aes_ccm_do_crypt
 SYM_FUNC_END(ce_aes_ccm_encrypt)
 
 SYM_FUNC_START(ce_aes_ccm_decrypt)
-	aes_ccm_do_crypt	0
+	movi	v22.16b, #0
+	b	aes_ccm_do_crypt
 SYM_FUNC_END(ce_aes_ccm_decrypt)
 
 	.section ".rodata", "a"
-- 
2.43.0.275.g3460e3d667-goog
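
(Editorial aside, not part of the patch: the BSL instruction used above is a
branch-free bitwise select, taking each bit from one of two source registers
according to a mask held in the destination. A minimal C sketch of the same
idea follows; the helper name is hypothetical and used purely for
illustration:

	/*
	 * mask is all-ones on the encrypt path and all-zeroes on the decrypt
	 * path, mirroring the movi v22.16b, #255 / movi v22.16b, #0 setup in
	 * ce_aes_ccm_encrypt/ce_aes_ccm_decrypt.
	 */
	static inline unsigned char select_plaintext(unsigned char mask,
						     unsigned char in,
						     unsigned char out)
	{
		/* take bits from 'in' where mask is set, else from 'out' */
		return (mask & in) | (~mask & out);
	}

On encryption the plaintext fed into the MAC is the input block, so an
all-ones mask selects it; on decryption the plaintext is the just-decrypted
output block, so an all-zeroes mask selects that instead. This is what allows
the single aes_ccm_do_crypt helper to serve both entry points without
branching.)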