Re: [PATCH] LoongArch: vDSO: Tune the chacha20 implementation

Huacai Chen <chenhuacai@xxxxxxxxxx> · Mon, 23 Sep 2024 15:15:25 +0800

Hi, Ruoyao,

On Thu, Sep 19, 2024 at 5:15 PM Xi Ruoyao <xry111@xxxxxxxxxxx> wrote:
>
> As Christophe pointed out, tuning the chacha20 implementation by
> scheduling the instructions like what GCC does can improve the
> performance.
>
> The tuning does not introduce too much complexity (basically it's just
> reordering some instructions).  And the tuning does not hurt readibility
> too much: actually the tuned code looks even more similar to a
> textbook-style implementation based on 128-bit vectors.  So overall it's
> a good deal to me.
>
> Tested with vdso_test_getchacha and benched with vdso_test_getrandom.
> On a LA664 the speedup is 5%, and I expect a larger speedup on LA[2-4]64
> with a lower issue rate.
>
> Suggested-by: Christophe Leroy <christophe.leroy@xxxxxxxxxx>
> Link: https://lore.kernel.org/all/77655d9e-fc05-4300-8f0d-7b2ad840d091@xxxxxxxxxx/
> Signed-off-by: Xi Ruoyao <xry111@xxxxxxxxxxx>
> ---
>  arch/loongarch/vdso/vgetrandom-chacha.S | 92 +++++++++++++++----------
>  1 file changed, 55 insertions(+), 37 deletions(-)
>
> diff --git a/arch/loongarch/vdso/vgetrandom-chacha.S b/arch/loongarch/vdso/vgetrandom-chacha.S
> index 7e86a50f6e85..0c5f1183c480 100644
> --- a/arch/loongarch/vdso/vgetrandom-chacha.S
> +++ b/arch/loongarch/vdso/vgetrandom-chacha.S
> @@ -9,23 +9,11 @@
>
>  .text
>
> -/* Salsa20 quarter-round */
> -.macro QR      a b c d
> -       add.w           \a, \a, \b
> -       xor             \d, \d, \a
> -       rotri.w         \d, \d, 16
> -
> -       add.w           \c, \c, \d
> -       xor             \b, \b, \c
> -       rotri.w         \b, \b, 20
> -
> -       add.w           \a, \a, \b
> -       xor             \d, \d, \a
> -       rotri.w         \d, \d, 24
> -
> -       add.w           \c, \c, \d
> -       xor             \b, \b, \c
> -       rotri.w         \b, \b, 25
> +.macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3
> +       \op     \d0, \d0, \s0
> +       \op     \d1, \d1, \s1
> +       \op     \d2, \d2, \s2
> +       \op     \d3, \d3, \s3
>  .endm
>
>  /*
> @@ -74,6 +62,23 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
>  /* Reuse i as copy3 */
>  #define copy3          i
>
> +/* Packs to be used with OP_4REG */
> +#define line0          state0, state1, state2, state3
> +#define line1          state4, state5, state6, state7
> +#define line2          state8, state9, state10, state11
> +#define line3          state12, state13, state14, state15
> +
> +#define        line1_perm      state5, state6, state7, state4
> +#define        line2_perm      state10, state11, state8, state9
> +#define        line3_perm      state15, state12, state13, state14
> +
> +#define        copy            copy0, copy1, copy2, copy3
The indentation here is strange, it seems some of them are spaces and
some of them are tabs.

Huacai

> +
> +#define _16            16, 16, 16, 16
> +#define _20            20, 20, 20, 20
> +#define _24            24, 24, 24, 24
> +#define _25            25, 25, 25, 25
> +
>         /*
>          * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
>          * This does not violate the stack-less requirement: no sensitive data
> @@ -126,16 +131,38 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
>         li.w            i, 10
>  .Lpermute:
>         /* odd round */
> -       QR              state0, state4, state8, state12
> -       QR              state1, state5, state9, state13
> -       QR              state2, state6, state10, state14
> -       QR              state3, state7, state11, state15
> +       OP_4REG add.w   line0, line1
> +       OP_4REG xor     line3, line0
> +       OP_4REG rotri.w line3, _16
> +
> +       OP_4REG add.w   line2, line3
> +       OP_4REG xor     line1, line2
> +       OP_4REG rotri.w line1, _20
> +
> +       OP_4REG add.w   line0, line1
> +       OP_4REG xor     line3, line0
> +       OP_4REG rotri.w line3, _24
> +
> +       OP_4REG add.w   line2, line3
> +       OP_4REG xor     line1, line2
> +       OP_4REG rotri.w line1, _25
>
>         /* even round */
> -       QR              state0, state5, state10, state15
> -       QR              state1, state6, state11, state12
> -       QR              state2, state7, state8, state13
> -       QR              state3, state4, state9, state14
> +       OP_4REG add.w   line0, line1_perm
> +       OP_4REG xor     line3_perm, line0
> +       OP_4REG rotri.w line3_perm, _16
> +
> +       OP_4REG add.w   line2_perm, line3_perm
> +       OP_4REG xor     line1_perm, line2_perm
> +       OP_4REG rotri.w line1_perm, _20
> +
> +       OP_4REG add.w   line0, line1_perm
> +       OP_4REG xor     line3_perm, line0
> +       OP_4REG rotri.w line3_perm, _24
> +
> +       OP_4REG add.w   line2_perm, line3_perm
> +       OP_4REG xor     line1_perm, line2_perm
> +       OP_4REG rotri.w line1_perm, _25
>
>         addi.w          i, i, -1
>         bnez            i, .Lpermute
> @@ -147,10 +174,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
>         li.w            copy3, 0x6b206574
>
>         /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
> -       add.w           state0, state0, copy0
> -       add.w           state1, state1, copy1
> -       add.w           state2, state2, copy2
> -       add.w           state3, state3, copy3
> +       OP_4REG add.w   line0, copy
>         st.w            state0, output, 0
>         st.w            state1, output, 4
>         st.w            state2, output, 8
> @@ -165,10 +189,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
>         ld.w            state3, key, 12
>
>         /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
> -       add.w           state4, state4, state0
> -       add.w           state5, state5, state1
> -       add.w           state6, state6, state2
> -       add.w           state7, state7, state3
> +       OP_4REG add.w   line1, line0
>         st.w            state4, output, 16
>         st.w            state5, output, 20
>         st.w            state6, output, 24
> @@ -181,10 +202,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
>         ld.w            state3, key, 28
>
>         /* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
> -       add.w           state8, state8, state0
> -       add.w           state9, state9, state1
> -       add.w           state10, state10, state2
> -       add.w           state11, state11, state3
> +       OP_4REG add.w   line2, line0
>         st.w            state8, output, 32
>         st.w            state9, output, 36
>         st.w            state10, output, 40
> --
> 2.46.1
>
>