Only perform the NEON yield check for every 4 blocks of input, to
prevent taking a considerable performance hit on cores with very
fast crypto instructions and comparatively slow memory accesses,
such as the Cortex-A53.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
---
 arch/arm64/crypto/sha2-ce-core.S | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
index cd8b36412469..201a33ff6830 100644
--- a/arch/arm64/crypto/sha2-ce-core.S
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -136,6 +136,9 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	add		dgav.4s, dgav.4s, dg0v.4s
 	add		dgbv.4s, dgbv.4s, dg1v.4s
 
+	tst		w21, #0x3		// yield only every 4 blocks
+	b.ne		1b
+
 	/* handled all input blocks? */
 	cbz		w21, 3f
 
-- 
2.11.0