Re: [PATCH v3 10/14] crypto: arm/blake2s - add ARM scalar optimized BLAKE2s

Ard Biesheuvel <ardb@xxxxxxxxxx> · Wed, 23 Dec 2020 10:08:50 +0100



On Wed, 23 Dec 2020 at 09:12, Eric Biggers <ebiggers@xxxxxxxxxx> wrote:
>
> From: Eric Biggers <ebiggers@xxxxxxxxxx>
>
> Add an ARM scalar optimized implementation of BLAKE2s.
>
> NEON isn't very useful for BLAKE2s because the BLAKE2s block size is too
> small for NEON to help.  Each NEON instruction would depend on the
> previous one, resulting in poor performance.
>
> With scalar instructions, on the other hand, we can take advantage of
> ARM's "free" rotations (like I did in chacha-scalar-core.S) to get an
> implementation get runs much faster than the C implementation.
>
> Performance results on Cortex-A7 in cycles per byte using the shash API:
>
>         4096-byte messages:
>                 blake2s-256-arm:     18.8
>                 blake2s-256-generic: 26.0
>
>         500-byte messages:
>                 blake2s-256-arm:     20.3
>                 blake2s-256-generic: 27.9
>
>         100-byte messages:
>                 blake2s-256-arm:     29.7
>                 blake2s-256-generic: 39.2
>
>         32-byte messages:
>                 blake2s-256-arm:     50.6
>                 blake2s-256-generic: 66.2
>
> Except on very short messages, this is still slower than the NEON
> implementation of BLAKE2b which I've written; that is 14.0, 16.4, 25.8,
> and 76.1 cpb on 4096, 500, 100, and 32-byte messages, respectively.
> However, optimized BLAKE2s is useful for cases where BLAKE2s is used
> instead of BLAKE2b, such as WireGuard.
>
> This new implementation is added in the form of a new module
> blake2s-arm.ko, which is analogous to blake2s-x86_64.ko in that it
> provides blake2s_compress_arch() for use by the library API as well as
> optionally register the algorithms with the shash API.
>
> Acked-by: Ard Biesheuvel <ardb@xxxxxxxxxx>
> Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx>

Tested-by: Ard Biesheuvel <ardb@xxxxxxxxxx>

> ---
>  arch/arm/crypto/Kconfig        |   9 ++
>  arch/arm/crypto/Makefile       |   2 +
>  arch/arm/crypto/blake2s-core.S | 285 +++++++++++++++++++++++++++++++++
>  arch/arm/crypto/blake2s-glue.c |  78 +++++++++
>  4 files changed, 374 insertions(+)
>  create mode 100644 arch/arm/crypto/blake2s-core.S
>  create mode 100644 arch/arm/crypto/blake2s-glue.c
>
> diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
> index c9bf2df85cb90..281c829c12d0b 100644
> --- a/arch/arm/crypto/Kconfig
> +++ b/arch/arm/crypto/Kconfig
> @@ -62,6 +62,15 @@ config CRYPTO_SHA512_ARM
>           SHA-512 secure hash standard (DFIPS 180-2) implemented
>           using optimized ARM assembler and NEON, when available.
>
> +config CRYPTO_BLAKE2S_ARM
> +       tristate "BLAKE2s digest algorithm (ARM)"
> +       select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
> +       help
> +         BLAKE2s digest algorithm optimized with ARM scalar instructions.  This
> +         is faster than the generic implementations of BLAKE2s and BLAKE2b, but
> +         slower than the NEON implementation of BLAKE2b.  (There is no NEON
> +         implementation of BLAKE2s, since NEON doesn't really help with it.)
> +
>  config CRYPTO_AES_ARM
>         tristate "Scalar AES cipher for ARM"
>         select CRYPTO_ALGAPI
> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
> index b745c17d356fe..5ad1e985a718b 100644
> --- a/arch/arm/crypto/Makefile
> +++ b/arch/arm/crypto/Makefile
> @@ -9,6 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
>  obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
>  obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
> +obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o
>  obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
>  obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
>  obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
> @@ -29,6 +30,7 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
>  sha256-arm-y   := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
>  sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
>  sha512-arm-y   := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
> +blake2s-arm-y   := blake2s-core.o blake2s-glue.o
>  sha1-arm-ce-y  := sha1-ce-core.o sha1-ce-glue.o
>  sha2-arm-ce-y  := sha2-ce-core.o sha2-ce-glue.o
>  aes-arm-ce-y   := aes-ce-core.o aes-ce-glue.o
> diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S
> new file mode 100644
> index 0000000000000..bed897e9a181a
> --- /dev/null
> +++ b/arch/arm/crypto/blake2s-core.S
> @@ -0,0 +1,285 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * BLAKE2s digest algorithm, ARM scalar implementation
> + *
> + * Copyright 2020 Google LLC
> + *
> + * Author: Eric Biggers <ebiggers@xxxxxxxxxx>
> + */
> +
> +#include <linux/linkage.h>
> +
> +       // Registers used to hold message words temporarily.  There aren't
> +       // enough ARM registers to hold the whole message block, so we have to
> +       // load the words on-demand.
> +       M_0             .req    r12
> +       M_1             .req    r14
> +
> +// The BLAKE2s initialization vector
> +.Lblake2s_IV:
> +       .word   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
> +       .word   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
> +
> +.macro __ldrd          a, b, src, offset
> +#if __LINUX_ARM_ARCH__ >= 6
> +       ldrd            \a, \b, [\src, #\offset]
> +#else
> +       ldr             \a, [\src, #\offset]
> +       ldr             \b, [\src, #\offset + 4]
> +#endif
> +.endm
> +
> +.macro __strd          a, b, dst, offset
> +#if __LINUX_ARM_ARCH__ >= 6
> +       strd            \a, \b, [\dst, #\offset]
> +#else
> +       str             \a, [\dst, #\offset]
> +       str             \b, [\dst, #\offset + 4]
> +#endif
> +.endm
> +
> +// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
> +// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
> +// columns/diagonals.  s0-s1 are the word offsets to the message words the first
> +// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
> +// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
> +//
> +// Note that to save instructions, the rotations don't happen when the
> +// pseudocode says they should, but rather they are delayed until the values are
> +// used.  See the comment above _blake2s_round().
> +.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3
> +
> +       ldr             M_0, [sp, #32 + 4 * \s0]
> +       ldr             M_1, [sp, #32 + 4 * \s2]
> +
> +       // a += b + m[blake2s_sigma[r][2*i + 0]];
> +       add             \a0, \a0, \b0, ror #brot
> +       add             \a1, \a1, \b1, ror #brot
> +       add             \a0, \a0, M_0
> +       add             \a1, \a1, M_1
> +
> +       // d = ror32(d ^ a, 16);
> +       eor             \d0, \a0, \d0, ror #drot
> +       eor             \d1, \a1, \d1, ror #drot
> +
> +       // c += d;
> +       add             \c0, \c0, \d0, ror #16
> +       add             \c1, \c1, \d1, ror #16
> +
> +       // b = ror32(b ^ c, 12);
> +       eor             \b0, \c0, \b0, ror #brot
> +       eor             \b1, \c1, \b1, ror #brot
> +
> +       ldr             M_0, [sp, #32 + 4 * \s1]
> +       ldr             M_1, [sp, #32 + 4 * \s3]
> +
> +       // a += b + m[blake2s_sigma[r][2*i + 1]];
> +       add             \a0, \a0, \b0, ror #12
> +       add             \a1, \a1, \b1, ror #12
> +       add             \a0, \a0, M_0
> +       add             \a1, \a1, M_1
> +
> +       // d = ror32(d ^ a, 8);
> +       eor             \d0, \a0, \d0, ror#16
> +       eor             \d1, \a1, \d1, ror#16
> +
> +       // c += d;
> +       add             \c0, \c0, \d0, ror#8
> +       add             \c1, \c1, \d1, ror#8
> +
> +       // b = ror32(b ^ c, 7);
> +       eor             \b0, \c0, \b0, ror#12
> +       eor             \b1, \c1, \b1, ror#12
> +.endm
> +
> +// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
> +// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
> +// spilling v[8..9], then to v[9..15], then to the message block.  r10-r12 and
> +// r14 are free to use.  The macro arguments s0-s15 give the order in which the
> +// message words are used in this round.
> +//
> +// All rotates are performed using the implicit rotate operand accepted by the
> +// 'add' and 'eor' instructions.  This is faster than using explicit rotate
> +// instructions.  To make this work, we allow the values in the second and last
> +// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
> +// wrong rotation amount.  The rotation amount is then fixed up just in time
> +// when the values are used.  'brot' is the number of bits the values in row 'b'
> +// need to be rotated right to arrive at the correct values, and 'drot'
> +// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
> +// that they end up as (7, 8) after every round.
> +.macro _blake2s_round  s0, s1, s2, s3, s4, s5, s6, s7, \
> +                       s8, s9, s10, s11, s12, s13, s14, s15
> +
> +       // Mix first two columns:
> +       // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
> +       __ldrd          r10, r11, sp, 16        // load v[12] and v[13]
> +       _blake2s_quarterround   r0, r4, r8, r10,  r1, r5, r9, r11, \
> +                               \s0, \s1, \s2, \s3
> +       __strd          r8, r9, sp, 0
> +       __strd          r10, r11, sp, 16
> +
> +       // Mix second two columns:
> +       // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
> +       __ldrd          r8, r9, sp, 8           // load v[10] and v[11]
> +       __ldrd          r10, r11, sp, 24        // load v[14] and v[15]
> +       _blake2s_quarterround   r2, r6, r8, r10,  r3, r7, r9, r11, \
> +                               \s4, \s5, \s6, \s7
> +       str             r10, [sp, #24]          // store v[14]
> +       // v[10], v[11], and v[15] are used below, so no need to store them yet.
> +
> +       .set brot, 7
> +       .set drot, 8
> +
> +       // Mix first two diagonals:
> +       // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
> +       ldr             r10, [sp, #16]          // load v[12]
> +       _blake2s_quarterround   r0, r5, r8, r11,  r1, r6, r9, r10, \
> +                               \s8, \s9, \s10, \s11
> +       __strd          r8, r9, sp, 8
> +       str             r11, [sp, #28]
> +       str             r10, [sp, #16]
> +
> +       // Mix second two diagonals:
> +       // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
> +       __ldrd          r8, r9, sp, 0           // load v[8] and v[9]
> +       __ldrd          r10, r11, sp, 20        // load v[13] and v[14]
> +       _blake2s_quarterround   r2, r7, r8, r10,  r3, r4, r9, r11, \
> +                               \s12, \s13, \s14, \s15
> +       __strd          r10, r11, sp, 20
> +.endm
> +
> +//
> +// void blake2s_compress_arch(struct blake2s_state *state,
> +//                           const u8 *block, size_t nblocks, u32 inc);
> +//
> +// Only the first three fields of struct blake2s_state are used:
> +//     u32 h[8];       (inout)
> +//     u32 t[2];       (inout)
> +//     u32 f[2];       (in)
> +//
> +       .align          5
> +ENTRY(blake2s_compress_arch)
> +       push            {r0-r2,r4-r11,lr}       // keep this an even number
> +
> +.Lnext_block:
> +       // r0 is 'state'
> +       // r1 is 'block'
> +       // r3 is 'inc'
> +
> +       // Load and increment the counter t[0..1].
> +       __ldrd          r10, r11, r0, 32
> +       adds            r10, r10, r3
> +       adc             r11, r11, #0
> +       __strd          r10, r11, r0, 32
> +
> +       // _blake2s_round is very short on registers, so copy the message block
> +       // to the stack to save a register during the rounds.  This also has the
> +       // advantage that misalignment only needs to be dealt with in one place.
> +       sub             sp, sp, #64
> +       mov             r12, sp
> +       tst             r1, #3
> +       bne             .Lcopy_block_misaligned
> +       ldmia           r1!, {r2-r9}
> +       stmia           r12!, {r2-r9}
> +       ldmia           r1!, {r2-r9}
> +       stmia           r12, {r2-r9}
> +.Lcopy_block_done:
> +       str             r1, [sp, #68]           // Update message pointer
> +
> +       // Calculate v[8..15].  Push v[9..15] onto the stack, and leave space
> +       // for spilling v[8..9].  Leave v[8..9] in r8-r9.
> +       mov             r14, r0                 // r14 = state
> +       adr             r12, .Lblake2s_IV
> +       ldmia           r12!, {r8-r9}           // load IV[0..1]
> +       __ldrd          r0, r1, r14, 40         // load f[0..1]
> +       ldm             r12, {r2-r7}            // load IV[3..7]
> +       eor             r4, r4, r10             // v[12] = IV[4] ^ t[0]
> +       eor             r5, r5, r11             // v[13] = IV[5] ^ t[1]
> +       eor             r6, r6, r0              // v[14] = IV[6] ^ f[0]
> +       eor             r7, r7, r1              // v[15] = IV[7] ^ f[1]
> +       push            {r2-r7}                 // push v[9..15]
> +       sub             sp, sp, #8              // leave space for v[8..9]
> +
> +       // Load h[0..7] == v[0..7].
> +       ldm             r14, {r0-r7}
> +
> +       // Execute the rounds.  Each round is provided the order in which it
> +       // needs to use the message words.
> +       .set brot, 0
> +       .set drot, 0
> +       _blake2s_round  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
> +       _blake2s_round  14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
> +       _blake2s_round  11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
> +       _blake2s_round  7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
> +       _blake2s_round  9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
> +       _blake2s_round  2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
> +       _blake2s_round  12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
> +       _blake2s_round  13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
> +       _blake2s_round  6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
> +       _blake2s_round  10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
> +
> +       // Fold the final state matrix into the hash chaining value:
> +       //
> +       //      for (i = 0; i < 8; i++)
> +       //              h[i] ^= v[i] ^ v[i + 8];
> +       //
> +       ldr             r14, [sp, #96]          // r14 = &h[0]
> +       add             sp, sp, #8              // v[8..9] are already loaded.
> +       pop             {r10-r11}               // load v[10..11]
> +       eor             r0, r0, r8
> +       eor             r1, r1, r9
> +       eor             r2, r2, r10
> +       eor             r3, r3, r11
> +       ldm             r14, {r8-r11}           // load h[0..3]
> +       eor             r0, r0, r8
> +       eor             r1, r1, r9
> +       eor             r2, r2, r10
> +       eor             r3, r3, r11
> +       stmia           r14!, {r0-r3}           // store new h[0..3]
> +       ldm             r14, {r0-r3}            // load old h[4..7]
> +       pop             {r8-r11}                // load v[12..15]
> +       eor             r0, r0, r4, ror #brot
> +       eor             r1, r1, r5, ror #brot
> +       eor             r2, r2, r6, ror #brot
> +       eor             r3, r3, r7, ror #brot
> +       eor             r0, r0, r8, ror #drot
> +       eor             r1, r1, r9, ror #drot
> +       eor             r2, r2, r10, ror #drot
> +       eor             r3, r3, r11, ror #drot
> +         add           sp, sp, #64             // skip copy of message block
> +       stm             r14, {r0-r3}            // store new h[4..7]
> +
> +       // Advance to the next block, if there is one.  Note that if there are
> +       // multiple blocks, then 'inc' (the counter increment amount) must be
> +       // 64.  So we can simply set it to 64 without re-loading it.
> +       ldm             sp, {r0, r1, r2}        // load (state, block, nblocks)
> +       mov             r3, #64                 // set 'inc'
> +       subs            r2, r2, #1              // nblocks--
> +       str             r2, [sp, #8]
> +       bne             .Lnext_block            // nblocks != 0?
> +
> +       pop             {r0-r2,r4-r11,pc}
> +
> +       // The next message block (pointed to by r1) isn't 4-byte aligned, so it
> +       // can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
> +       // by r12) using an alternative method.  r2-r9 are free to use.
> +.Lcopy_block_misaligned:
> +       mov             r2, #64
> +1:
> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> +       ldr             r3, [r1], #4
> +#else
> +       ldrb            r3, [r1, #0]
> +       ldrb            r4, [r1, #1]
> +       ldrb            r5, [r1, #2]
> +       ldrb            r6, [r1, #3]
> +       add             r1, r1, #4
> +       orr             r3, r3, r4, lsl #8
> +       orr             r3, r3, r5, lsl #16
> +       orr             r3, r3, r6, lsl #24
> +#endif
> +       subs            r2, r2, #4
> +       str             r3, [r12], #4
> +       bne             1b
> +       b               .Lcopy_block_done
> +ENDPROC(blake2s_compress_arch)
> diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c
> new file mode 100644
> index 0000000000000..f2cc1e5fc9ec1
> --- /dev/null
> +++ b/arch/arm/crypto/blake2s-glue.c
> @@ -0,0 +1,78 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * BLAKE2s digest algorithm, ARM scalar implementation
> + *
> + * Copyright 2020 Google LLC
> + */
> +
> +#include <crypto/internal/blake2s.h>
> +#include <crypto/internal/hash.h>
> +
> +#include <linux/module.h>
> +
> +/* defined in blake2s-core.S */
> +EXPORT_SYMBOL(blake2s_compress_arch);
> +
> +static int crypto_blake2s_update_arm(struct shash_desc *desc,
> +                                    const u8 *in, unsigned int inlen)
> +{
> +       return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
> +}
> +
> +static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
> +{
> +       return crypto_blake2s_final(desc, out, blake2s_compress_arch);
> +}
> +
> +#define BLAKE2S_ALG(name, driver_name, digest_size)                    \
> +       {                                                               \
> +               .base.cra_name          = name,                         \
> +               .base.cra_driver_name   = driver_name,                  \
> +               .base.cra_priority      = 200,                          \
> +               .base.cra_flags         = CRYPTO_ALG_OPTIONAL_KEY,      \
> +               .base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,           \
> +               .base.cra_ctxsize       = sizeof(struct blake2s_tfm_ctx), \
> +               .base.cra_module        = THIS_MODULE,                  \
> +               .digestsize             = digest_size,                  \
> +               .setkey                 = crypto_blake2s_setkey,        \
> +               .init                   = crypto_blake2s_init,          \
> +               .update                 = crypto_blake2s_update_arm,    \
> +               .final                  = crypto_blake2s_final_arm,     \
> +               .descsize               = sizeof(struct blake2s_state), \
> +       }
> +
> +static struct shash_alg blake2s_arm_algs[] = {
> +       BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE),
> +       BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE),
> +       BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE),
> +       BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE),
> +};
> +
> +static int __init blake2s_arm_mod_init(void)
> +{
> +       return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
> +               crypto_register_shashes(blake2s_arm_algs,
> +                                       ARRAY_SIZE(blake2s_arm_algs)) : 0;
> +}
> +
> +static void __exit blake2s_arm_mod_exit(void)
> +{
> +       if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
> +               crypto_unregister_shashes(blake2s_arm_algs,
> +                                         ARRAY_SIZE(blake2s_arm_algs));
> +}
> +
> +module_init(blake2s_arm_mod_init);
> +module_exit(blake2s_arm_mod_exit);
> +
> +MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation");
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Eric Biggers <ebiggers@xxxxxxxxxx>");
> +MODULE_ALIAS_CRYPTO("blake2s-128");
> +MODULE_ALIAS_CRYPTO("blake2s-128-arm");
> +MODULE_ALIAS_CRYPTO("blake2s-160");
> +MODULE_ALIAS_CRYPTO("blake2s-160-arm");
> +MODULE_ALIAS_CRYPTO("blake2s-224");
> +MODULE_ALIAS_CRYPTO("blake2s-224-arm");
> +MODULE_ALIAS_CRYPTO("blake2s-256");
> +MODULE_ALIAS_CRYPTO("blake2s-256-arm");
> --
> 2.29.2
>