Hello,

On 26.8.2022 8.31, Taehee Yoo wrote:
+#define aria_sbox_8way(x0, x1, x2, x3,			\
+			x4, x5, x6, x7,			\
+			t0, t1, t2, t3,			\
+			t4, t5, t6, t7)			\
+	vpxor t0, t0, t0;				\
+	vaesenclast t0, x0, x0;				\
+	vaesenclast t0, x4, x4;				\
+	vaesenclast t0, x1, x1;				\
+	vaesenclast t0, x5, x5;				\
+	vaesdeclast t0, x2, x2;				\
+	vaesdeclast t0, x6, x6;				\
+							\
+	/* AES inverse shift rows */			\
+	vmovdqa .Linv_shift_row, t0;			\
+	vmovdqa .Lshift_row, t1;			\
+	vpshufb t0, x0, x0;				\
+	vpshufb t0, x4, x4;				\
+	vpshufb t0, x1, x1;				\
+	vpshufb t0, x5, x5;				\
+	vpshufb t0, x3, x3;				\
+	vpshufb t0, x7, x7;				\
+	vpshufb t1, x2, x2;				\
+	vpshufb t1, x6, x6;				\
+							\
+	vmovdqa .Linv_lo, t0;				\
+	vmovdqa .Linv_hi, t1;				\
+	vmovdqa .Ltf_lo_s2, t2;				\
+	vmovdqa .Ltf_hi_s2, t3;				\
+	vmovdqa .Ltf_lo_x2, t4;				\
+	vmovdqa .Ltf_hi_x2, t5;				\
+	vbroadcastss .L0f0f0f0f, t6;			\
+							\
+	/* extract multiplicative inverse */		\
+	filter_8bit(x1, t0, t1, t6, t7);		\
+	/* affine transformation for S2 */		\
+	filter_8bit(x1, t2, t3, t6, t7);		\
There's room for improvement here. These two affine transformations could be combined into a single filter_8bit (see the derivation sketch at the end of this mail)...
+	/* extract multiplicative inverse */		\
+	filter_8bit(x5, t0, t1, t6, t7);		\
+	/* affine transformation for S2 */		\
+	filter_8bit(x5, t2, t3, t6, t7);		\
+							\
+	/* affine transformation for X2 */		\
+	filter_8bit(x3, t4, t5, t6, t7);		\
+	vpxor t7, t7, t7;				\
+	vaesenclast t7, x3, x3;				\
+	/* extract multiplicative inverse */		\
+	filter_8bit(x3, t0, t1, t6, t7);		\
+	/* affine transformation for X2 */		\
+	filter_8bit(x7, t4, t5, t6, t7);		\
+	vpxor t7, t7, t7;				\
+	vaesenclast t7, x7, x7;				\
+	/* extract multiplicative inverse */		\
+	filter_8bit(x7, t0, t1, t6, t7);
... and likewise these two filter_8bit calls could be replaced with one operation if 'vaesenclast' were changed to 'vaesdeclast'.

With these optimizations, 'aria_sbox_8way' would look like this:

/////////////////////////////////////////////////////////
#define aria_sbox_8way(x0, x1, x2, x3,			\
			x4, x5, x6, x7,			\
			t0, t1, t2, t3,			\
			t4, t5, t6, t7)			\
	vpxor t7, t7, t7;				\
	vmovdqa .Linv_shift_row, t0;			\
	vmovdqa .Lshift_row, t1;			\
	vpbroadcastd .L0f0f0f0f, t6;			\
	vmovdqa .Ltf_lo__inv_aff__and__s2, t2;		\
	vmovdqa .Ltf_hi__inv_aff__and__s2, t3;		\
	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4;		\
	vmovdqa .Ltf_hi__x2__and__fwd_aff, t5;		\
							\
	vaesenclast t7, x0, x0;				\
	vaesenclast t7, x4, x4;				\
	vaesenclast t7, x1, x1;				\
	vaesenclast t7, x5, x5;				\
	vaesdeclast t7, x2, x2;				\
	vaesdeclast t7, x6, x6;				\
							\
	/* AES inverse shift rows */			\
	vpshufb t0, x0, x0;				\
	vpshufb t0, x4, x4;				\
	vpshufb t0, x1, x1;				\
	vpshufb t0, x5, x5;				\
	vpshufb t1, x3, x3;				\
	vpshufb t1, x7, x7;				\
	vpshufb t1, x2, x2;				\
	vpshufb t1, x6, x6;				\
							\
	/* affine transformation for S2 */		\
	filter_8bit(x1, t2, t3, t6, t0);		\
	/* affine transformation for S2 */		\
	filter_8bit(x5, t2, t3, t6, t0);		\
							\
	/* affine transformation for X2 */		\
	filter_8bit(x3, t4, t5, t6, t0);		\
	/* affine transformation for X2 */		\
	filter_8bit(x7, t4, t5, t6, t0);		\
	vaesdeclast t7, x3, x3;				\
	vaesdeclast t7, x7, x7;

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
/////////////////////////////////////////////////////////

I tested the above quickly in userspace against aria-generic and your original aria-avx implementation, and the output matches these references.

In a quick-and-dirty benchmark, function execution time was ~30% faster on AMD Zen3 and ~20% faster on Intel Tiger Lake.

-Jussi
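PS. For what it's worth, below is a rough Python sketch of how combined constants like the ones above can be derived: two affine maps over GF(2), f1(x) = M1*x ^ c1 and f2(x) = M2*x ^ c2, compose into the single map f2(f1(x)) = (M2*M1)*x ^ (M2*c1 ^ c2), which is then split into the two 16-byte nibble tables that filter_8bit feeds to vpshufb. The helper names (apply_affine, compose, nibble_tables, octa) are only for illustration, and the bit/byte ordering assumed (row 0 / column 0 of the matrix comments = least significant bit, additive constant folded into the high-nibble table, .octa emitted lowest byte first) is the convention that, as far as I can tell, reproduces the .Ltf_lo/hi__inv_aff__and__s2 bytes above.

/////////////////////////////////////////////////////////
#!/usr/bin/env python3
# Affine maps over GF(2) are handled as (M, c): M is a list of 8 rows,
# row i is an 8-bit mask of the input bits feeding output bit i, and c
# is the additive constant.  Bit 0 = LSB throughout.

def apply_affine(M, c, x):
    """y = M*x ^ c over GF(2)."""
    y = c
    for i, row in enumerate(M):
        if bin(row & x).count('1') & 1:
            y ^= 1 << i
    return y

def compose(M2, c2, M1, c1):
    """Return (M, c) such that M*x ^ c == f2(f1(x)) for all x."""
    M = [0] * 8
    for j in range(8):
        # Image of input basis bit j under f1 then f2 (constants dropped).
        col = apply_affine(M2, 0, apply_affine(M1, 0, 1 << j))
        for i in range(8):
            if col >> i & 1:
                M[i] |= 1 << j
    return M, apply_affine(M2, c2, c1)

def nibble_tables(M, c):
    """Split f(x) = f_lo(x & 15) ^ f_hi(x >> 4) into two vpshufb tables."""
    lo = [apply_affine(M, 0, n) for n in range(16)]
    hi = [apply_affine(M, c, n << 4) for n in range(16)]
    return lo, hi

def octa(tbl):
    """Format a 16-byte table as a .octa literal (index 0 = lowest byte)."""
    return '0x' + ''.join('%02X' % b for b in reversed(tbl))

# The "AES inverse affine and S2 combined" matrix from the comment above,
# rows encoded as masks (leftmost matrix column = mask bit 0).
M_comb = [0x83, 0x12, 0xF3, 0x96, 0x32, 0x1A, 0xA0, 0xE7]
c_comb = 0x88

lo, hi = nibble_tables(M_comb, c_comb)
print('.Ltf_lo__inv_aff__and__s2:\n\t.octa ' + octa(lo))
print('.Ltf_hi__inv_aff__and__s2:\n\t.octa ' + octa(hi))
/////////////////////////////////////////////////////////

compose() is not exercised by the printout, but it is the step that merges, e.g., the AES inverse affine with the S2 affine into a single (M, c) pair, so the combined matrices do not have to be multiplied out by hand.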