Hi Jussi,
Thank you so much for this work!

On 9/2/22 04:51, Jussi Kivilinna wrote:
> Hello,
>
> On 26.8.2022 8.31, Taehee Yoo wrote:
>> +#define aria_sbox_8way(x0, x1, x2, x3, \
>> +                       x4, x5, x6, x7, \
>> +                       t0, t1, t2, t3, \
>> +                       t4, t5, t6, t7) \
>> +	vpxor t0, t0, t0; \
>> +	vaesenclast t0, x0, x0; \
>> +	vaesenclast t0, x4, x4; \
>> +	vaesenclast t0, x1, x1; \
>> +	vaesenclast t0, x5, x5; \
>> +	vaesdeclast t0, x2, x2; \
>> +	vaesdeclast t0, x6, x6; \
>> +	\
>> +	/* AES inverse shift rows */ \
>> +	vmovdqa .Linv_shift_row, t0; \
>> +	vmovdqa .Lshift_row, t1; \
>> +	vpshufb t0, x0, x0; \
>> +	vpshufb t0, x4, x4; \
>> +	vpshufb t0, x1, x1; \
>> +	vpshufb t0, x5, x5; \
>> +	vpshufb t0, x3, x3; \
>> +	vpshufb t0, x7, x7; \
>> +	vpshufb t1, x2, x2; \
>> +	vpshufb t1, x6, x6; \
>> +	\
>> +	vmovdqa .Linv_lo, t0; \
>> +	vmovdqa .Linv_hi, t1; \
>> +	vmovdqa .Ltf_lo_s2, t2; \
>> +	vmovdqa .Ltf_hi_s2, t3; \
>> +	vmovdqa .Ltf_lo_x2, t4; \
>> +	vmovdqa .Ltf_hi_x2, t5; \
>> +	vbroadcastss .L0f0f0f0f, t6; \
>> +	\
>> +	/* extract multiplicative inverse */ \
>> +	filter_8bit(x1, t0, t1, t6, t7); \
>> +	/* affine transformation for S2 */ \
>> +	filter_8bit(x1, t2, t3, t6, t7); \
>
> Here's room for improvement. These two affine transformations
> could be combined into a single filter_8bit...
>
>> +	/* extract multiplicative inverse */ \
>> +	filter_8bit(x5, t0, t1, t6, t7); \
>> +	/* affine transformation for S2 */ \
>> +	filter_8bit(x5, t2, t3, t6, t7); \
>> +	\
>> +	/* affine transformation for X2 */ \
>> +	filter_8bit(x3, t4, t5, t6, t7); \
>> +	vpxor t7, t7, t7; \
>> +	vaesenclast t7, x3, x3; \
>> +	/* extract multiplicative inverse */ \
>> +	filter_8bit(x3, t0, t1, t6, t7); \
>> +	/* affine transformation for X2 */ \
>> +	filter_8bit(x7, t4, t5, t6, t7); \
>> +	vpxor t7, t7, t7; \
>> +	vaesenclast t7, x7, x7; \
>> +	/* extract multiplicative inverse */ \
>> +	filter_8bit(x7, t0, t1, t6, t7);
>
> ... as well as these two filter_8bit calls could be replaced with
> one operation if 'vaesenclast' is changed to 'vaesdeclast'.
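
(Side note for anyone following the math here: each filter_8bit
applies an 8-bit affine map f(x) = M*x ^ c over GF(2), and two affine
maps always compose into one: g(f(x)) = (M2*M1)*x ^ (M2*c1 ^ c2),
which is why the back-to-back calls can collapse. Below is a minimal
userspace sketch of that composition; the struct and function names
are mine, not from the patch or the kernel tree, and it assumes
GCC/Clang for __builtin_parity.)

/////////////////////////////////////////////////////////
#include <stdint.h>

struct gf2mat {
	uint8_t row[8];	/* row[i] bit j = coefficient of input bit j */
};

/* y = M*x over GF(2): bit i of y is parity of (row i AND x) */
static uint8_t gf2_mat_vec(const struct gf2mat *m, uint8_t x)
{
	uint8_t y = 0;
	int i;

	for (i = 0; i < 8; i++)
		y |= (uint8_t)(__builtin_parity(m->row[i] & x) << i);
	return y;
}

/*
 * Compose f(x) = M1*x ^ c1 with g(x) = M2*x ^ c2 into
 * g(f(x)) = (M2*M1)*x ^ (M2*c1 ^ c2), all over GF(2).
 */
static void gf2_affine_compose(const struct gf2mat *m2, uint8_t c2,
			       const struct gf2mat *m1, uint8_t c1,
			       struct gf2mat *m_out, uint8_t *c_out)
{
	int i, j, k;

	for (i = 0; i < 8; i++) {
		uint8_t res = 0;

		for (j = 0; j < 8; j++) {
			/* gather column j of M1 into a byte */
			uint8_t col = 0;

			for (k = 0; k < 8; k++)
				col |= (uint8_t)(((m1->row[k] >> j) & 1) << k);
			/* (M2*M1)[i][j] = <row i of M2, column j of M1> */
			res |= (uint8_t)(__builtin_parity(m2->row[i] & col) << j);
		}
		m_out->row[i] = res;
	}
	*c_out = (uint8_t)(gf2_mat_vec(m2, c1) ^ c2);
}
/////////////////////////////////////////////////////////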
>
> With these optimizations, 'aria_sbox_8way' would look like this:
>
> /////////////////////////////////////////////////////////
> #define aria_sbox_8way(x0, x1, x2, x3, \
>                        x4, x5, x6, x7, \
>                        t0, t1, t2, t3, \
>                        t4, t5, t6, t7) \
> 	vpxor t7, t7, t7; \
> 	vmovdqa .Linv_shift_row, t0; \
> 	vmovdqa .Lshift_row, t1; \
> 	vpbroadcastd .L0f0f0f0f, t6; \
> 	vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
> 	vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
> 	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
> 	vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
> 	\
> 	vaesenclast t7, x0, x0; \
> 	vaesenclast t7, x4, x4; \
> 	vaesenclast t7, x1, x1; \
> 	vaesenclast t7, x5, x5; \
> 	vaesdeclast t7, x2, x2; \
> 	vaesdeclast t7, x6, x6; \
> 	\
> 	/* AES inverse shift rows */ \
> 	vpshufb t0, x0, x0; \
> 	vpshufb t0, x4, x4; \
> 	vpshufb t0, x1, x1; \
> 	vpshufb t0, x5, x5; \
> 	vpshufb t1, x3, x3; \
> 	vpshufb t1, x7, x7; \
> 	vpshufb t1, x2, x2; \
> 	vpshufb t1, x6, x6; \
> 	\
> 	/* affine transformation for S2 */ \
> 	filter_8bit(x1, t2, t3, t6, t0); \
> 	/* affine transformation for S2 */ \
> 	filter_8bit(x5, t2, t3, t6, t0); \
> 	\
> 	/* affine transformation for X2 */ \
> 	filter_8bit(x3, t4, t5, t6, t0); \
> 	/* affine transformation for X2 */ \
> 	filter_8bit(x7, t4, t5, t6, t0); \
> 	vaesdeclast t7, x3, x3; \
> 	vaesdeclast t7, x7, x7;
>
> /* AES inverse affine and S2 combined:
>  * 1 1 0 0 0 0 0 1     x0     0
>  * 0 1 0 0 1 0 0 0     x1     0
>  * 1 1 0 0 1 1 1 1     x2     0
>  * 0 1 1 0 1 0 0 1     x3     1
>  * 0 1 0 0 1 1 0 0  *  x4  +  0
>  * 0 1 0 1 1 0 0 0     x5     0
>  * 0 0 0 0 0 1 0 1     x6     0
>  * 1 1 1 0 0 1 1 1     x7     1
>  */
> .Ltf_lo__inv_aff__and__s2:
> 	.octa 0x92172DA81A9FA520B2370D883ABF8500
> .Ltf_hi__inv_aff__and__s2:
> 	.octa 0x2B15FFC1AF917B45E6D8320C625CB688
>
> /* X2 and AES forward affine combined:
>  * 1 0 1 1 0 0 0 1     x0     0
>  * 0 1 1 1 1 0 1 1     x1     0
>  * 0 0 0 1 1 0 1 0     x2     1
>  * 0 1 0 0 0 1 0 0     x3     0
>  * 0 0 1 1 1 0 1 1  *  x4  +  0
>  * 0 1 0 0 1 0 0 0     x5     0
>  * 1 1 0 1 0 0 1 1     x6     0
>  * 0 1 0 0 1 0 1 0     x7     0
>  */
> .Ltf_lo__x2__and__fwd_aff:
> 	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
> .Ltf_hi__x2__and__fwd_aff:
> 	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
> /////////////////////////////////////////////////////////
>
> I tested the above quickly in userspace against aria-generic
> and your original aria-avx implementation, and the output matches
> both references. In a quick-and-dirty benchmark, function
> execution time was ~30% faster on AMD Zen3 and ~20% faster
> on Intel Tiger Lake.

I tested your implementation. It works very well and, as you
mentioned, it improves performance significantly!

Before:
  128bit, 4096 bytes: 14758 cycles
After:
  128bit, 4096 bytes: 11972 cycles

That is roughly a 19% reduction in cycles in my test.

I will apply your implementation in the v3 patch!
Thank you so much!

Taehee Yoo
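
P.S. For anyone who wants to regenerate or verify the new .octa
constants: if I read filter_8bit right, it splits each byte into
nibbles and looks both up with vpshufb, computing
out = lo_tbl[x & 0x0f] ^ hi_tbl[x >> 4]. Assuming that, the two
16-entry tables for an affine map f(x) = M*x ^ c can be built as in
the userspace sketch below (gf2_mat_vec and struct gf2mat are the
same hypothetical helpers as in my earlier sketch; folding the
constant c into the high-nibble table appears consistent with the
constants above, e.g. the low byte 0x88 of .Ltf_hi__inv_aff__and__s2
matches the constant column of that matrix, bits 3 and 7 set).

/////////////////////////////////////////////////////////
#include <stdint.h>

struct gf2mat {
	uint8_t row[8];	/* row[i] bit j = coefficient of input bit j */
};

/* y = M*x over GF(2), as in the earlier sketch */
static uint8_t gf2_mat_vec(const struct gf2mat *m, uint8_t x)
{
	uint8_t y = 0;
	int i;

	for (i = 0; i < 8; i++)
		y |= (uint8_t)(__builtin_parity(m->row[i] & x) << i);
	return y;
}

/*
 * Build the two vpshufb tables consumed by filter_8bit for
 * f(x) = M*x ^ c.  Byte n of lo_tbl/hi_tbl corresponds to byte n of
 * the matching .octa constant, counting from the least significant
 * byte (the tables are loaded as little-endian 128-bit values).
 */
static void make_filter_8bit_tables(const struct gf2mat *m, uint8_t c,
				    uint8_t lo_tbl[16], uint8_t hi_tbl[16])
{
	int n;

	for (n = 0; n < 16; n++) {
		lo_tbl[n] = gf2_mat_vec(m, (uint8_t)n);
		hi_tbl[n] = (uint8_t)(gf2_mat_vec(m, (uint8_t)(n << 4)) ^ c);
	}
}
/////////////////////////////////////////////////////////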