Only perform the NEON yield check once every 4 blocks of input, to
prevent taking a considerable performance hit on cores with very fast
crypto instructions and comparatively slow memory accesses, such as the
Cortex-A53.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
---
 arch/arm64/crypto/sha1-ce-core.S | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
index 78eb35fb5056..f592c55218d0 100644
--- a/arch/arm64/crypto/sha1-ce-core.S
+++ b/arch/arm64/crypto/sha1-ce-core.S
@@ -129,6 +129,9 @@ CPU_LE(	rev32		v11.16b, v11.16b	)
 	add		dgbv.2s, dgbv.2s, dg1v.2s
 	add		dgav.4s, dgav.4s, dg0v.4s
 
+	tst		w21, #0x3			// yield only every 4 blocks
+	b.ne		1b
+
 	cbz		w21, 3f
 
 	if_will_cond_yield_neon
-- 
2.11.0