This implementation keeps the 64 bytes of workspace in registers rather than
on the stack, eliminating most of the loads and stores, and reducing the
instruction count by about 25%.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
---
Hello all,

Unfortunately, I am not at liberty to share performance numbers, so if anyone
else (with access to actual, representative hardware) would care to have a go,
I would be very grateful. This can be done by building the tcrypt.ko module
(CONFIG_CRYPTO_TEST=m) and inserting it with 'mode=303' as a parameter (note
that the insmod always fails, but the test output is written to the kernel
log).

Also note that the sha_transform() function will be part of the kernel proper,
so just rebuilding the sha1_generic module is not sufficient.

(For reviewers: a rough C sketch of the round logic that the macros below
implement is appended after the patch.)

Cheers,

 arch/arm64/kernel/arm64ksyms.c |   3 +
 arch/arm64/lib/Makefile        |   2 +-
 arch/arm64/lib/sha1.S          | 256 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 260 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/sha1.S

diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568cd8ae..1f5693fb5d93 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -56,3 +56,6 @@ EXPORT_SYMBOL(clear_bit);
 EXPORT_SYMBOL(test_and_clear_bit);
 EXPORT_SYMBOL(change_bit);
 EXPORT_SYMBOL(test_and_change_bit);
+
+	/* SHA-1 implementation under lib/ */
+EXPORT_SYMBOL(sha_transform);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..ea093ebb9a9a 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
 lib-y		:= bitops.o clear_user.o delay.o copy_from_user.o	\
 		   copy_to_user.o copy_in_user.o copy_page.o		\
 		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
-		   strchr.o strrchr.o
+		   strchr.o strrchr.o sha1.o
diff --git a/arch/arm64/lib/sha1.S b/arch/arm64/lib/sha1.S
new file mode 100644
index 000000000000..877b8d70e992
--- /dev/null
+++ b/arch/arm64/lib/sha1.S
@@ -0,0 +1,256 @@
+/*
+ * linux/arch/arm64/lib/sha1.S
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@xxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+
+	k	.req	w1
+
+	res	.req	w2
+	xres	.req	x2
+
+	wA	.req	w3
+	wB	.req	w4
+	wC	.req	w5
+	wD	.req	w6
+	wE	.req	w7
+
+	tmp	.req	w16
+	xtmp	.req	x16
+
+	.macro	sha1_choose, out, b, c, d
+	eor	\out, \c, \d
+	and	\out, \out, \b
+	eor	\out, \out, \d
+	.endm
+
+	.macro	sha1_parity, out, b, c, d
+	eor	\out, \b, \c
+	eor	\out, \out, \d
+	.endm
+
+	.macro	sha1_majority, out, b, c, d
+	eor	tmp, \b, \c
+	and	\out, \b, \c
+	and	tmp, tmp, \d
+	add	\out, \out, tmp
+	.endm
+
+	.macro	mix_state, st0, st1, st4, st6, st7
+	extr	xtmp, \st7, \st6, #32
+	eor	\st0, \st0, \st1
+	eor	xtmp, xtmp, \st4
+	eor	xtmp, xtmp, \st0
+	ror	res, tmp, #(32 - 1)
+	lsr	xtmp, xtmp, #32
+	ror	tmp, tmp, #(32 - 1)
+	orr	\st0, xres, xtmp, lsl #32
+	.endm
+
+	.macro	sha1_round, func, r, h, a, b, c, d, e
+	sha1_\func	res, \b, \c, \d
+	add	res, res, \e
+	ror	\e, \a, #(32 - 5)
+	.ifc	\h, h
+	add	xres, xres, x\r, lsr #32
+	.else
+	add	res, res, w\r
+	.endif
+	add	\e, \e, k
+	ror	\b, \b, #2
+	add	\e, \e, res
+	.endm
+
+	/*
+	 * void sha_transform(__u32 *digest, const char *data, __u32 *array)
+	 */
+ENTRY(sha_transform)
+	/* load input into state array */
+	ldp	x8, x9, [x1]
+	ldp	x10, x11, [x1, #16]
+	ldp	x12, x13, [x1, #32]
+	ldp	x14, x15, [x1, #48]
+
+	/* load digest input */
+	ldr	wA, [x0]
+	ldp	wB, wC, [x0, #4]
+	ldp	wD, wE, [x0, #12]
+
+	/* endian-reverse the input on LE builds */
+CPU_LE(	rev32	x8, x8		)
+CPU_LE(	rev32	x9, x9		)
+CPU_LE(	rev32	x10, x10	)
+CPU_LE(	rev32	x11, x11	)
+CPU_LE(	rev32	x12, x12	)
+CPU_LE(	rev32	x13, x13	)
+CPU_LE(	rev32	x14, x14	)
+CPU_LE(	rev32	x15, x15	)
+
+	/* round 1 */
+	ldr	k, =0x5a827999
+	sha1_round	choose, 8, l, wA, wB, wC, wD, wE
+	sha1_round	choose, 8, h, wE, wA, wB, wC, wD
+	sha1_round	choose, 9, l, wD, wE, wA, wB, wC
+	sha1_round	choose, 9, h, wC, wD, wE, wA, wB
+	sha1_round	choose, 10, l, wB, wC, wD, wE, wA
+	sha1_round	choose, 10, h, wA, wB, wC, wD, wE
+	sha1_round	choose, 11, l, wE, wA, wB, wC, wD
+	sha1_round	choose, 11, h, wD, wE, wA, wB, wC
+	sha1_round	choose, 12, l, wC, wD, wE, wA, wB
+	sha1_round	choose, 12, h, wB, wC, wD, wE, wA
+	sha1_round	choose, 13, l, wA, wB, wC, wD, wE
+	sha1_round	choose, 13, h, wE, wA, wB, wC, wD
+	sha1_round	choose, 14, l, wD, wE, wA, wB, wC
+	sha1_round	choose, 14, h, wC, wD, wE, wA, wB
+	sha1_round	choose, 15, l, wB, wC, wD, wE, wA
+	sha1_round	choose, 15, h, wA, wB, wC, wD, wE
+
+	mix_state	x8, x9, x12, x14, x15
+	sha1_round	choose, 8, l, wE, wA, wB, wC, wD
+	sha1_round	choose, 8, h, wD, wE, wA, wB, wC
+	mix_state	x9, x10, x13, x15, x8
+	sha1_round	choose, 9, l, wC, wD, wE, wA, wB
+	sha1_round	choose, 9, h, wB, wC, wD, wE, wA
+
+	/* round 2 */
+	ldr	k, =0x6ed9eba1
+	mix_state	x10, x11, x14, x8, x9
+	sha1_round	parity, 10, l, wA, wB, wC, wD, wE
+	sha1_round	parity, 10, h, wE, wA, wB, wC, wD
+	mix_state	x11, x12, x15, x9, x10
+	sha1_round	parity, 11, l, wD, wE, wA, wB, wC
+	sha1_round	parity, 11, h, wC, wD, wE, wA, wB
+	mix_state	x12, x13, x8, x10, x11
+	sha1_round	parity, 12, l, wB, wC, wD, wE, wA
+	sha1_round	parity, 12, h, wA, wB, wC, wD, wE
+	mix_state	x13, x14, x9, x11, x12
+	sha1_round	parity, 13, l, wE, wA, wB, wC, wD
+	sha1_round	parity, 13, h, wD, wE, wA, wB, wC
+	mix_state	x14, x15, x10, x12, x13
+	sha1_round	parity, 14, l, wC, wD, wE, wA, wB
+	sha1_round	parity, 14, h, wB, wC, wD, wE, wA
+	mix_state	x15, x8, x11, x13, x14
+	sha1_round	parity, 15, l, wA, wB, wC, wD, wE
+	sha1_round	parity, 15, h, wE, wA, wB, wC, wD
+	mix_state	x8, x9, x12, x14, x15
+	sha1_round	parity, 8, l, wD, wE, wA, wB, wC
+	sha1_round	parity, 8, h, wC, wD, wE, wA, wB
+	mix_state	x9, x10, x13, x15, x8
+	sha1_round	parity, 9, l, wB, wC, wD, wE, wA
+	sha1_round	parity, 9, h, wA, wB, wC, wD, wE
+	mix_state	x10, x11, x14, x8, x9
+	sha1_round	parity, 10, l, wE, wA, wB, wC, wD
+	sha1_round	parity, 10, h, wD, wE, wA, wB, wC
+	mix_state	x11, x12, x15, x9, x10
+	sha1_round	parity, 11, l, wC, wD, wE, wA, wB
+	sha1_round	parity, 11, h, wB, wC, wD, wE, wA
+
+	/* round 3 */
+	ldr	k, =0x8f1bbcdc
+	mix_state	x12, x13, x8, x10, x11
+	sha1_round	majority, 12, l, wA, wB, wC, wD, wE
+	sha1_round	majority, 12, h, wE, wA, wB, wC, wD
+	mix_state	x13, x14, x9, x11, x12
+	sha1_round	majority, 13, l, wD, wE, wA, wB, wC
+	sha1_round	majority, 13, h, wC, wD, wE, wA, wB
+	mix_state	x14, x15, x10, x12, x13
+	sha1_round	majority, 14, l, wB, wC, wD, wE, wA
+	sha1_round	majority, 14, h, wA, wB, wC, wD, wE
+	mix_state	x15, x8, x11, x13, x14
+	sha1_round	majority, 15, l, wE, wA, wB, wC, wD
+	sha1_round	majority, 15, h, wD, wE, wA, wB, wC
+	mix_state	x8, x9, x12, x14, x15
+	sha1_round	majority, 8, l, wC, wD, wE, wA, wB
+	sha1_round	majority, 8, h, wB, wC, wD, wE, wA
+	mix_state	x9, x10, x13, x15, x8
+	sha1_round	majority, 9, l, wA, wB, wC, wD, wE
+	sha1_round	majority, 9, h, wE, wA, wB, wC, wD
+	mix_state	x10, x11, x14, x8, x9
+	sha1_round	majority, 10, l, wD, wE, wA, wB, wC
+	sha1_round	majority, 10, h, wC, wD, wE, wA, wB
+	mix_state	x11, x12, x15, x9, x10
+	sha1_round	majority, 11, l, wB, wC, wD, wE, wA
+	sha1_round	majority, 11, h, wA, wB, wC, wD, wE
+	mix_state	x12, x13, x8, x10, x11
+	sha1_round	majority, 12, l, wE, wA, wB, wC, wD
+	sha1_round	majority, 12, h, wD, wE, wA, wB, wC
+	mix_state	x13, x14, x9, x11, x12
+	sha1_round	majority, 13, l, wC, wD, wE, wA, wB
+	sha1_round	majority, 13, h, wB, wC, wD, wE, wA
+
+	/* round 4 */
+	ldr	k, =0xca62c1d6
+	mix_state	x14, x15, x10, x12, x13
+	sha1_round	parity, 14, l, wA, wB, wC, wD, wE
+	sha1_round	parity, 14, h, wE, wA, wB, wC, wD
+	mix_state	x15, x8, x11, x13, x14
+	sha1_round	parity, 15, l, wD, wE, wA, wB, wC
+	sha1_round	parity, 15, h, wC, wD, wE, wA, wB
+	mix_state	x8, x9, x12, x14, x15
+	sha1_round	parity, 8, l, wB, wC, wD, wE, wA
+	sha1_round	parity, 8, h, wA, wB, wC, wD, wE
+	mix_state	x9, x10, x13, x15, x8
+	sha1_round	parity, 9, l, wE, wA, wB, wC, wD
+	sha1_round	parity, 9, h, wD, wE, wA, wB, wC
+	mix_state	x10, x11, x14, x8, x9
+	sha1_round	parity, 10, l, wC, wD, wE, wA, wB
+	sha1_round	parity, 10, h, wB, wC, wD, wE, wA
+	mix_state	x11, x12, x15, x9, x10
+	sha1_round	parity, 11, l, wA, wB, wC, wD, wE
+	sha1_round	parity, 11, h, wE, wA, wB, wC, wD
+	mix_state	x12, x13, x8, x10, x11
+	sha1_round	parity, 12, l, wD, wE, wA, wB, wC
+	sha1_round	parity, 12, h, wC, wD, wE, wA, wB
+	mix_state	x13, x14, x9, x11, x12
+	sha1_round	parity, 13, l, wB, wC, wD, wE, wA
+	sha1_round	parity, 13, h, wA, wB, wC, wD, wE
+	mix_state	x14, x15, x10, x12, x13
+	sha1_round	parity, 14, l, wE, wA, wB, wC, wD
+	sha1_round	parity, 14, h, wD, wE, wA, wB, wC
+	mix_state	x15, x8, x11, x13, x14
+
+	/* reload digest input */
+	ldr	w8, [x0]
+	ldp	w9, w10, [x0, #4]
+	ldp	w11, w12, [x0, #12]
+
+	sha1_round	parity, 15, l, wC, wD, wE, wA, wB
+	sha1_round	parity, 15, h, wB, wC, wD, wE, wA
+
+	/* add this round's output to digest */
+	add	wA, wA, w8
+	add	wB, wB, w9
+	add	wC, wC, w10
+	add	wD, wD, w11
+	add	wE, wE, w12
+
+	/* store digest */
+	str	wA, [x0]
+	stp	wB, wC, [x0, #4]
+	stp	wD, wE, [x0, #12]
+	ret
+ENDPROC(sha_transform)
+
+	/*
+	 * void sha_init(__u32 *buf)
+	 */
+ENTRY(sha_init)
+	ldr	w1, =0x67452301
+	ldr	w2, =0xefcdab89
+	ldr	w3, =0x98badcfe
+	ldr	w4, =0x10325476
+	ldr	w5, =0xc3d2e1f0
+	str	w1, [x0]
+	stp	w2, w3, [x0, #4]
+	stp	w4, w5, [x0, #12]
+	ret
+ENDPROC(sha_init)
--
1.8.3.2
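
P.S. As mentioned above, here is a rough, illustrative C sketch of the round
logic that the sha1_choose/sha1_parity/sha1_majority and mix_state macros
implement, for anyone who wants to cross-check them against the spec. It is
not part of the patch, and the helper names below are made up for this mail
rather than taken from lib/sha1.c.

/* Illustrative reference only -- not part of the patch. */
#include <stdint.h>

static inline uint32_t rol32(uint32_t x, unsigned int n)
{
	return (x << n) | (x >> (32 - n));
}

/* rounds 0-19: select c or d bit by bit, controlled by b (sha1_choose) */
static inline uint32_t f_choose(uint32_t b, uint32_t c, uint32_t d)
{
	return ((c ^ d) & b) ^ d;
}

/* rounds 20-39 and 60-79 (sha1_parity) */
static inline uint32_t f_parity(uint32_t b, uint32_t c, uint32_t d)
{
	return b ^ c ^ d;
}

/* rounds 40-59: majority of b, c, d (sha1_majority) */
static inline uint32_t f_majority(uint32_t b, uint32_t c, uint32_t d)
{
	return (b & c) | ((b ^ c) & d);
}

/*
 * Message schedule for rounds 16-79, kept as a 16-word circular buffer:
 * W[t] = rol32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1).
 * mix_state produces two such words per invocation, one per 32-bit half
 * of a 64-bit state register.
 */
static inline uint32_t w_mix(uint32_t w[16], unsigned int t)
{
	uint32_t x = w[(t + 13) & 15] ^ w[(t + 8) & 15] ^
		     w[(t + 2) & 15] ^ w[t & 15];

	w[t & 15] = rol32(x, 1);
	return w[t & 15];
}

/*
 * One round; the asm version renames the working variables from round
 * to round instead of shuffling them like this.
 */
static inline void do_round(uint32_t *a, uint32_t *b, uint32_t *c,
			    uint32_t *d, uint32_t *e,
			    uint32_t f, uint32_t k, uint32_t wt)
{
	uint32_t t = rol32(*a, 5) + f + *e + k + wt;

	*e = *d;
	*d = *c;
	*c = rol32(*b, 30);
	*b = *a;
	*a = t;
}

Note that sha1_majority uses an add where the reference uses an or; the two
are equivalent here because (b & c) and ((b ^ c) & d) can never have set bits
in common.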