As reported by Vakul, checking the TIF_NEED_RESCHED flag after every iteration of the GHASH and AES-GCM core routines is having a considerable performance impact on cores such as the Cortex-A53 with Crypto Extensions implemented. GHASH performance is down by 22% for large block sizes, and AES-GCM is down by 16% for large block sizes and 128 bit keys. This appears to be a result of the high performance of the crypto instructions on the one hand (2.0 cycles per byte for GHASH, 3.0 cpb for AES-GCM), combined with the relatively poor load/store performance of this simple core. So let's reduce this performance impact by only doing the yield check once every 32 blocks for GHASH (or 4 when using the version based on 8-bit polynomial multiplication), and once every 16 blocks for AES-GCM. This way, we recover most of the performance while still limiting the duration of scheduling blackouts due to disabling preemption to ~1000 cycles. Cc: Vakul Garg <vakul.garg@xxxxxxx> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx> --- arch/arm64/crypto/ghash-ce-core.S | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index dcffb9e77589..9c14beaabeee 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -212,7 +212,7 @@ ushr XL.2d, XL.2d, #1 .endm - .macro __pmull_ghash, pn + .macro __pmull_ghash, pn, yield_count frame_push 5 mov x19, x0 @@ -259,6 +259,9 @@ CPU_LE( rev64 T1.16b, T1.16b ) eor T2.16b, T2.16b, XH.16b eor XL.16b, XL.16b, T2.16b + tst w19, #(\yield_count - 1) + b.ne 1b + cbz w19, 3f if_will_cond_yield_neon @@ -279,11 +282,11 @@ CPU_LE( rev64 T1.16b, T1.16b ) * struct ghash_key const *k, const char *head) */ ENTRY(pmull_ghash_update_p64) - __pmull_ghash p64 + __pmull_ghash p64, 32 ENDPROC(pmull_ghash_update_p64) ENTRY(pmull_ghash_update_p8) - __pmull_ghash p8 + __pmull_ghash p8, 4 ENDPROC(pmull_ghash_update_p8) KS .req v8 @@ -428,6 +431,9 @@ CPU_LE( rev x28, x28 ) st1 {INP.16b}, [x21], #16 .endif + tst w19, #0xf // do yield check only + b.ne 1b // once every 16 blocks + cbz w19, 3f if_will_cond_yield_neon -- 2.11.0