From: Ard Biesheuvel <ardb@xxxxxxxxxx> The CCM code as originally written attempted to use as few NEON registers as possible, to avoid having to eagerly preserve/restore the entire NEON register file at every call to kernel_neon_begin/end. At that time, this API took a number of NEON registers as a parameter, and only preserved that many registers. Today, the NEON register file is restored lazily, and the old API is long gone. This means we can use as many NEON registers as we can make meaningful use of, which means in the AES case that we can keep all round keys in registers rather than reloading each of them for each AES block processed. On Cortex-A53, this results in a speedup of more than 50%. (From 4 cycles per byte to 2.6 cycles per byte) Signed-off-by: Ard Biesheuvel <ardb@xxxxxxxxxx> --- arch/arm64/crypto/aes-ce-ccm-core.S | 95 ++++++++------------ 1 file changed, 38 insertions(+), 57 deletions(-) diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S index 0132872bd780..0ec59fc4ef3e 100644 --- a/arch/arm64/crypto/aes-ce-ccm-core.S +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -14,40 +14,46 @@ .text .arch armv8-a+crypto + .macro load_round_keys, rk, nr, tmp + sub w\tmp, \nr, #10 + add \tmp, \rk, w\tmp, sxtw #4 + ld1 {v10.4s-v13.4s}, [\rk] + ld1 {v14.4s-v17.4s}, [\tmp], #64 + ld1 {v18.4s-v21.4s}, [\tmp], #64 + ld1 {v3.4s-v5.4s}, [\tmp] + .endm + + .macro dround, va, vb, vk + aese \va\().16b, \vk\().16b + aesmc \va\().16b, \va\().16b + aese \vb\().16b, \vk\().16b + aesmc \vb\().16b, \vb\().16b + .endm + + .macro aes_encrypt, va, vb, nr + tbz \nr, #2, .L\@ + dround \va, \vb, v10 + dround \va, \vb, v11 + tbz \nr, #1, .L\@ + dround \va, \vb, v12 + dround \va, \vb, v13 +.L\@: .irp v, v14, v15, v16, v17, v18, v19, v20, v21, v3 + dround \va, \vb, \v + .endr + aese \va\().16b, v4.16b + aese \vb\().16b, v4.16b + .endm + /* * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], * u32 rounds); */ SYM_FUNC_START(ce_aes_ccm_final) - ld1 {v3.4s}, [x2], #16 /* load first round key */ ld1 {v0.16b}, [x0] /* load mac */ - cmp w3, #12 /* which key size? */ - sub w3, w3, #2 /* modified # of rounds */ ld1 {v1.16b}, [x1] /* load 1st ctriv */ - bmi 0f - bne 3f - mov v5.16b, v3.16b - b 2f -0: mov v4.16b, v3.16b -1: ld1 {v5.4s}, [x2], #16 /* load next round key */ - aese v0.16b, v4.16b - aesmc v0.16b, v0.16b - aese v1.16b, v4.16b - aesmc v1.16b, v1.16b -2: ld1 {v3.4s}, [x2], #16 /* load next round key */ - aese v0.16b, v5.16b - aesmc v0.16b, v0.16b - aese v1.16b, v5.16b - aesmc v1.16b, v1.16b -3: ld1 {v4.4s}, [x2], #16 /* load next round key */ - subs w3, w3, #3 - aese v0.16b, v3.16b - aesmc v0.16b, v0.16b - aese v1.16b, v3.16b - aesmc v1.16b, v1.16b - bpl 1b - aese v0.16b, v4.16b - aese v1.16b, v4.16b + + aes_encrypt v0, v1, w3 + /* final round key cancels out */ eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ st1 {v0.16b}, [x0] /* store result */ @@ -55,6 +61,8 @@ SYM_FUNC_START(ce_aes_ccm_final) SYM_FUNC_END(ce_aes_ccm_final) .macro aes_ccm_do_crypt,enc + load_round_keys x3, w4, x10 + cbz x2, 5f ldr x8, [x6, #8] /* load lower ctr */ ld1 {v0.16b}, [x5] /* load mac */ @@ -64,37 +72,10 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ prfm pldl1strm, [x1] add x8, x8, #1 rev x9, x8 - cmp w4, #12 /* which key size? */ - sub w7, w4, #2 /* get modified # of rounds */ ins v1.d[1], x9 /* no carry in lower ctr */ - ld1 {v3.4s}, [x3] /* load first round key */ - add x10, x3, #16 - bmi 1f - bne 4f - mov v5.16b, v3.16b - b 3f -1: mov v4.16b, v3.16b - ld1 {v5.4s}, [x10], #16 /* load 2nd round key */ -2: /* inner loop: 3 rounds, 2x interleaved */ - aese v0.16b, v4.16b - aesmc v0.16b, v0.16b - aese v1.16b, v4.16b - aesmc v1.16b, v1.16b -3: ld1 {v3.4s}, [x10], #16 /* load next round key */ - aese v0.16b, v5.16b - aesmc v0.16b, v0.16b - aese v1.16b, v5.16b - aesmc v1.16b, v1.16b -4: ld1 {v4.4s}, [x10], #16 /* load next round key */ - subs w7, w7, #3 - aese v0.16b, v3.16b - aesmc v0.16b, v0.16b - aese v1.16b, v3.16b - aesmc v1.16b, v1.16b - ld1 {v5.4s}, [x10], #16 /* load next round key */ - bpl 2b - aese v0.16b, v4.16b - aese v1.16b, v4.16b + + aes_encrypt v0, v1, w4 + subs w2, w2, #16 bmi 6f /* partial block? */ ld1 {v2.16b}, [x1], #16 /* load next input block */ -- 2.43.0.275.g3460e3d667-goog