On 20 February 2018 at 08:34, <gregkh@xxxxxxxxxxxxxxxxxxx> wrote:
>
> The patch below does not apply to the 4.14-stable tree.
> If someone wants it applied there, or to any other stable or longterm
> tree, then please email the backport, including the original git commit
> id to <stable@xxxxxxxxxxxxxxx>.
>

Hi Greg,

This shouldn't have been picked up for -stable: the optimized SHA3 C
code was only merged recently, so there are no stable trees to which
this fix applies.

--
Ard.


> ------------------ original commit in Linus's tree ------------------
>
> From 4767b9ad7d762876a5865a06465e13e139a01b6b Mon Sep 17 00:00:00 2001
> From: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
> Date: Sat, 27 Jan 2018 09:18:32 +0000
> Subject: [PATCH] crypto: sha3-generic - deal with oversize stack frames
>
> As reported by kbuild test robot, the optimized SHA3 C implementation
> compiles to mn10300 code that uses a disproportionate amount of stack
> space, i.e.,
>
> crypto/sha3_generic.c: In function 'keccakf':
> crypto/sha3_generic.c:147:1: warning: the frame size of 1232 bytes is larger than 1024 bytes [-Wframe-larger-than=]
>
> As kindly diagnosed by Arnd, this does not only occur when building for
> the mn10300 architecture (which is what the report was about) but also
> for h8300, and builds for other 32-bit architectures show an increase in
> stack space utilization as well.
>
> Given that SHA3 operates on 64-bit quantities, and keeps a state matrix
> of 25 64-bit words, it is not surprising that 32-bit architectures with
> few general purpose registers are impacted the most by this, and it is
> therefore reasonable to implement a workaround that distinguishes between
> 32-bit and 64-bit architectures.
>
> Arnd figured out that taking the round calculation out of the loop, and
> inlining it explicitly but only on 64-bit architectures preserves most
> of the performance gain achieved by the rewrite, and also gets rid of
> the excessive use of stack space.
>
> Reported-by: kbuild test robot <fengguang.wu@xxxxxxxxx>
> Suggested-by: Arnd Bergmann <arnd@xxxxxxxx>
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
> Signed-off-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
>
> diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
> index a965b9d80559..951c4eb70262 100644
> --- a/crypto/sha3_generic.c
> +++ b/crypto/sha3_generic.c
> @@ -20,6 +20,20 @@
>  #include <crypto/sha3.h>
>  #include <asm/unaligned.h>
>
> +/*
> + * On some 32-bit architectures (mn10300 and h8300), GCC ends up using
> + * over 1 KB of stack if we inline the round calculation into the loop
> + * in keccakf(). On the other hand, on 64-bit architectures with plenty
> + * of [64-bit wide] general purpose registers, not inlining it severely
> + * hurts performance. So let's use 64-bitness as a heuristic to decide
> + * whether to inline or not.
> + */
> +#ifdef CONFIG_64BIT
> +#define SHA3_INLINE inline
> +#else
> +#define SHA3_INLINE noinline
> +#endif
> +
>  #define KECCAK_ROUNDS 24
>
>  static const u64 keccakf_rndc[24] = {
> @@ -35,111 +49,115 @@ static const u64 keccakf_rndc[24] = {
>
>  /* update the state with given number of rounds */
>
> -static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
> +static SHA3_INLINE void keccakf_round(u64 st[25])
>  {
>         u64 t[5], tt, bc[5];
> -       int round;
>
> -       for (round = 0; round < KECCAK_ROUNDS; round++) {
> +       /* Theta */
> +       bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
> +       bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
> +       bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
> +       bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
> +       bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
> +
> +       t[0] = bc[4] ^ rol64(bc[1], 1);
> +       t[1] = bc[0] ^ rol64(bc[2], 1);
> +       t[2] = bc[1] ^ rol64(bc[3], 1);
> +       t[3] = bc[2] ^ rol64(bc[4], 1);
> +       t[4] = bc[3] ^ rol64(bc[0], 1);
> +
> +       st[0] ^= t[0];
> +
> +       /* Rho Pi */
> +       tt = st[1];
> +       st[ 1] = rol64(st[ 6] ^ t[1], 44);
> +       st[ 6] = rol64(st[ 9] ^ t[4], 20);
> +       st[ 9] = rol64(st[22] ^ t[2], 61);
> +       st[22] = rol64(st[14] ^ t[4], 39);
> +       st[14] = rol64(st[20] ^ t[0], 18);
> +       st[20] = rol64(st[ 2] ^ t[2], 62);
> +       st[ 2] = rol64(st[12] ^ t[2], 43);
> +       st[12] = rol64(st[13] ^ t[3], 25);
> +       st[13] = rol64(st[19] ^ t[4], 8);
> +       st[19] = rol64(st[23] ^ t[3], 56);
> +       st[23] = rol64(st[15] ^ t[0], 41);
> +       st[15] = rol64(st[ 4] ^ t[4], 27);
> +       st[ 4] = rol64(st[24] ^ t[4], 14);
> +       st[24] = rol64(st[21] ^ t[1], 2);
> +       st[21] = rol64(st[ 8] ^ t[3], 55);
> +       st[ 8] = rol64(st[16] ^ t[1], 45);
> +       st[16] = rol64(st[ 5] ^ t[0], 36);
> +       st[ 5] = rol64(st[ 3] ^ t[3], 28);
> +       st[ 3] = rol64(st[18] ^ t[3], 21);
> +       st[18] = rol64(st[17] ^ t[2], 15);
> +       st[17] = rol64(st[11] ^ t[1], 10);
> +       st[11] = rol64(st[ 7] ^ t[2], 6);
> +       st[ 7] = rol64(st[10] ^ t[0], 3);
> +       st[10] = rol64( tt ^ t[1], 1);
> +
> +       /* Chi */
> +       bc[ 0] = ~st[ 1] & st[ 2];
> +       bc[ 1] = ~st[ 2] & st[ 3];
> +       bc[ 2] = ~st[ 3] & st[ 4];
> +       bc[ 3] = ~st[ 4] & st[ 0];
> +       bc[ 4] = ~st[ 0] & st[ 1];
> +       st[ 0] ^= bc[ 0];
> +       st[ 1] ^= bc[ 1];
> +       st[ 2] ^= bc[ 2];
> +       st[ 3] ^= bc[ 3];
> +       st[ 4] ^= bc[ 4];
> +
> +       bc[ 0] = ~st[ 6] & st[ 7];
> +       bc[ 1] = ~st[ 7] & st[ 8];
> +       bc[ 2] = ~st[ 8] & st[ 9];
> +       bc[ 3] = ~st[ 9] & st[ 5];
> +       bc[ 4] = ~st[ 5] & st[ 6];
> +       st[ 5] ^= bc[ 0];
> +       st[ 6] ^= bc[ 1];
> +       st[ 7] ^= bc[ 2];
> +       st[ 8] ^= bc[ 3];
> +       st[ 9] ^= bc[ 4];
> +
> +       bc[ 0] = ~st[11] & st[12];
> +       bc[ 1] = ~st[12] & st[13];
> +       bc[ 2] = ~st[13] & st[14];
> +       bc[ 3] = ~st[14] & st[10];
> +       bc[ 4] = ~st[10] & st[11];
> +       st[10] ^= bc[ 0];
> +       st[11] ^= bc[ 1];
> +       st[12] ^= bc[ 2];
> +       st[13] ^= bc[ 3];
> +       st[14] ^= bc[ 4];
> +
> +       bc[ 0] = ~st[16] & st[17];
> +       bc[ 1] = ~st[17] & st[18];
> +       bc[ 2] = ~st[18] & st[19];
> +       bc[ 3] = ~st[19] & st[15];
> +       bc[ 4] = ~st[15] & st[16];
> +       st[15] ^= bc[ 0];
> +       st[16] ^= bc[ 1];
> +       st[17] ^= bc[ 2];
> +       st[18] ^= bc[ 3];
> +       st[19] ^= bc[ 4];
> +
> +       bc[ 0] = ~st[21] & st[22];
> +       bc[ 1] = ~st[22] & st[23];
> +       bc[ 2] = ~st[23] & st[24];
> +       bc[ 3] = ~st[24] & st[20];
> +       bc[ 4] = ~st[20] & st[21];
> +       st[20] ^= bc[ 0];
> +       st[21] ^= bc[ 1];
> +       st[22] ^= bc[ 2];
> +       st[23] ^= bc[ 3];
> +       st[24] ^= bc[ 4];
> +}
>
> -               /* Theta */
> -               bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
> -               bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
> -               bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
> -               bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
> -               bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
> -
> -               t[0] = bc[4] ^ rol64(bc[1], 1);
> -               t[1] = bc[0] ^ rol64(bc[2], 1);
> -               t[2] = bc[1] ^ rol64(bc[3], 1);
> -               t[3] = bc[2] ^ rol64(bc[4], 1);
> -               t[4] = bc[3] ^ rol64(bc[0], 1);
> -
> -               st[0] ^= t[0];
> -
> -               /* Rho Pi */
> -               tt = st[1];
> -               st[ 1] = rol64(st[ 6] ^ t[1], 44);
> -               st[ 6] = rol64(st[ 9] ^ t[4], 20);
> -               st[ 9] = rol64(st[22] ^ t[2], 61);
> -               st[22] = rol64(st[14] ^ t[4], 39);
> -               st[14] = rol64(st[20] ^ t[0], 18);
> -               st[20] = rol64(st[ 2] ^ t[2], 62);
> -               st[ 2] = rol64(st[12] ^ t[2], 43);
> -               st[12] = rol64(st[13] ^ t[3], 25);
> -               st[13] = rol64(st[19] ^ t[4], 8);
> -               st[19] = rol64(st[23] ^ t[3], 56);
> -               st[23] = rol64(st[15] ^ t[0], 41);
> -               st[15] = rol64(st[ 4] ^ t[4], 27);
> -               st[ 4] = rol64(st[24] ^ t[4], 14);
> -               st[24] = rol64(st[21] ^ t[1], 2);
> -               st[21] = rol64(st[ 8] ^ t[3], 55);
> -               st[ 8] = rol64(st[16] ^ t[1], 45);
> -               st[16] = rol64(st[ 5] ^ t[0], 36);
> -               st[ 5] = rol64(st[ 3] ^ t[3], 28);
> -               st[ 3] = rol64(st[18] ^ t[3], 21);
> -               st[18] = rol64(st[17] ^ t[2], 15);
> -               st[17] = rol64(st[11] ^ t[1], 10);
> -               st[11] = rol64(st[ 7] ^ t[2], 6);
> -               st[ 7] = rol64(st[10] ^ t[0], 3);
> -               st[10] = rol64( tt ^ t[1], 1);
> -
> -               /* Chi */
> -               bc[ 0] = ~st[ 1] & st[ 2];
> -               bc[ 1] = ~st[ 2] & st[ 3];
> -               bc[ 2] = ~st[ 3] & st[ 4];
> -               bc[ 3] = ~st[ 4] & st[ 0];
> -               bc[ 4] = ~st[ 0] & st[ 1];
> -               st[ 0] ^= bc[ 0];
> -               st[ 1] ^= bc[ 1];
> -               st[ 2] ^= bc[ 2];
> -               st[ 3] ^= bc[ 3];
> -               st[ 4] ^= bc[ 4];
> -
> -               bc[ 0] = ~st[ 6] & st[ 7];
> -               bc[ 1] = ~st[ 7] & st[ 8];
> -               bc[ 2] = ~st[ 8] & st[ 9];
> -               bc[ 3] = ~st[ 9] & st[ 5];
> -               bc[ 4] = ~st[ 5] & st[ 6];
> -               st[ 5] ^= bc[ 0];
> -               st[ 6] ^= bc[ 1];
> -               st[ 7] ^= bc[ 2];
> -               st[ 8] ^= bc[ 3];
> -               st[ 9] ^= bc[ 4];
> -
> -               bc[ 0] = ~st[11] & st[12];
> -               bc[ 1] = ~st[12] & st[13];
> -               bc[ 2] = ~st[13] & st[14];
> -               bc[ 3] = ~st[14] & st[10];
> -               bc[ 4] = ~st[10] & st[11];
> -               st[10] ^= bc[ 0];
> -               st[11] ^= bc[ 1];
> -               st[12] ^= bc[ 2];
> -               st[13] ^= bc[ 3];
> -               st[14] ^= bc[ 4];
> -
> -               bc[ 0] = ~st[16] & st[17];
> -               bc[ 1] = ~st[17] & st[18];
> -               bc[ 2] = ~st[18] & st[19];
> -               bc[ 3] = ~st[19] & st[15];
> -               bc[ 4] = ~st[15] & st[16];
> -               st[15] ^= bc[ 0];
> -               st[16] ^= bc[ 1];
> -               st[17] ^= bc[ 2];
> -               st[18] ^= bc[ 3];
> -               st[19] ^= bc[ 4];
> -
> -               bc[ 0] = ~st[21] & st[22];
> -               bc[ 1] = ~st[22] & st[23];
> -               bc[ 2] = ~st[23] & st[24];
> -               bc[ 3] = ~st[24] & st[20];
> -               bc[ 4] = ~st[20] & st[21];
> -               st[20] ^= bc[ 0];
> -               st[21] ^= bc[ 1];
> -               st[22] ^= bc[ 2];
> -               st[23] ^= bc[ 3];
> -               st[24] ^= bc[ 4];
> +static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
> +{
> +       int round;
>
> +       for (round = 0; round < KECCAK_ROUNDS; round++) {
> +               keccakf_round(st);
>                 /* Iota */
>                 st[0] ^= keccakf_rndc[round];
>         }
>
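
For anyone who wants to try the trick outside the kernel tree, below is a
minimal freestanding sketch of the same conditional-inlining pattern. The
names in it are hypothetical: ROUND_INLINE stands in for SHA3_INLINE,
round_fn for keccakf_round, and the UINTPTR_MAX test stands in for
CONFIG_64BIT; the kernel's own inline/noinline macros already expand to
compiler attributes along these lines.

/*
 * Standalone sketch (hypothetical names, not the kernel code):
 * pick inlining of the round body based on the target word size.
 */
#include <stdint.h>

#if UINTPTR_MAX > 0xffffffffu
/* 64-bit: plenty of wide registers, so force the round into the loop */
#define ROUND_INLINE static inline __attribute__((always_inline))
#else
/* 32-bit: keep the round out of line to bound the caller's stack frame */
#define ROUND_INLINE static __attribute__((noinline))
#endif

ROUND_INLINE void round_fn(uint64_t st[25])
{
        /* stand-in for the real Theta/Rho/Pi/Chi computation */
        st[0] ^= st[1];
}

void permute(uint64_t st[25])
{
        int round;

        /* 24 rounds, as with KECCAK_ROUNDS; Iota omitted in this sketch */
        for (round = 0; round < 24; round++)
                round_fn(st);
}

On a 32-bit target the u64 temporaries then live in round_fn's single
frame instead of being spilled across the unrolled loop body, which is
what drove keccakf()'s frame past the 1024-byte warning threshold.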