Hello,

On 26.8.2022 8.31, Taehee Yoo wrote:
+#define aria_sbox_8way(x0, x1, x2, x3,			\
+			x4, x5, x6, x7,			\
+			t0, t1, t2, t3,			\
+			t4, t5, t6, t7)			\
+	vpxor t0, t0, t0;				\
+	vaesenclast t0, x0, x0;				\
+	vaesenclast t0, x4, x4;				\
+	vaesenclast t0, x1, x1;				\
+	vaesenclast t0, x5, x5;				\
+	vaesdeclast t0, x2, x2;				\
+	vaesdeclast t0, x6, x6;				\
+							\
+	/* AES inverse shift rows */			\
+	vmovdqa .Linv_shift_row, t0;			\
+	vmovdqa .Lshift_row, t1;			\
+	vpshufb t0, x0, x0;				\
+	vpshufb t0, x4, x4;				\
+	vpshufb t0, x1, x1;				\
+	vpshufb t0, x5, x5;				\
+	vpshufb t0, x3, x3;				\
+	vpshufb t0, x7, x7;				\
+	vpshufb t1, x2, x2;				\
+	vpshufb t1, x6, x6;				\
+							\
+	vmovdqa .Linv_lo, t0;				\
+	vmovdqa .Linv_hi, t1;				\
+	vmovdqa .Ltf_lo_s2, t2;				\
+	vmovdqa .Ltf_hi_s2, t3;				\
+	vmovdqa .Ltf_lo_x2, t4;				\
+	vmovdqa .Ltf_hi_x2, t5;				\
+	vbroadcastss .L0f0f0f0f, t6;			\
+							\
+	/* extract multiplicative inverse */		\
+	filter_8bit(x1, t0, t1, t6, t7);		\
+	/* affine transformation for S2 */		\
+	filter_8bit(x1, t2, t3, t6, t7);		\
There's room for improvement here. These two affine transformations could be combined into a single filter_8bit (see the derivation sketch at the end of this mail)...
+	/* extract multiplicative inverse */		\
+	filter_8bit(x5, t0, t1, t6, t7);		\
+	/* affine transformation for S2 */		\
+	filter_8bit(x5, t2, t3, t6, t7);		\
+							\
+	/* affine transformation for X2 */		\
+	filter_8bit(x3, t4, t5, t6, t7);		\
+	vpxor t7, t7, t7;				\
+	vaesenclast t7, x3, x3;				\
+	/* extract multiplicative inverse */		\
+	filter_8bit(x3, t0, t1, t6, t7);		\
+	/* affine transformation for X2 */		\
+	filter_8bit(x7, t4, t5, t6, t7);		\
+	vpxor t7, t7, t7;				\
+	vaesenclast t7, x7, x7;				\
+	/* extract multiplicative inverse */		\
+	filter_8bit(x7, t0, t1, t6, t7);
... and likewise these two filter_8bit calls could be replaced with one operation if 'vaesenclast' were changed to 'vaesdeclast'.

With these optimizations, 'aria_sbox_8way' would look like this:

/////////////////////////////////////////////////////////
#define aria_sbox_8way(x0, x1, x2, x3,			\
			x4, x5, x6, x7,			\
			t0, t1, t2, t3,			\
			t4, t5, t6, t7)			\
	vpxor t7, t7, t7;				\
	vmovdqa .Linv_shift_row, t0;			\
	vmovdqa .Lshift_row, t1;			\
	vpbroadcastd .L0f0f0f0f, t6;			\
	vmovdqa .Ltf_lo__inv_aff__and__s2, t2;		\
	vmovdqa .Ltf_hi__inv_aff__and__s2, t3;		\
	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4;		\
	vmovdqa .Ltf_hi__x2__and__fwd_aff, t5;		\
							\
	vaesenclast t7, x0, x0;				\
	vaesenclast t7, x4, x4;				\
	vaesenclast t7, x1, x1;				\
	vaesenclast t7, x5, x5;				\
	vaesdeclast t7, x2, x2;				\
	vaesdeclast t7, x6, x6;				\
							\
	/* AES inverse shift rows */			\
	vpshufb t0, x0, x0;				\
	vpshufb t0, x4, x4;				\
	vpshufb t0, x1, x1;				\
	vpshufb t0, x5, x5;				\
	vpshufb t1, x3, x3;				\
	vpshufb t1, x7, x7;				\
	vpshufb t1, x2, x2;				\
	vpshufb t1, x6, x6;				\
							\
	/* affine transformation for S2 */		\
	filter_8bit(x1, t2, t3, t6, t0);		\
	/* affine transformation for S2 */		\
	filter_8bit(x5, t2, t3, t6, t0);		\
							\
	/* affine transformation for X2 */		\
	filter_8bit(x3, t4, t5, t6, t0);		\
	/* affine transformation for X2 */		\
	filter_8bit(x7, t4, t5, t6, t0);		\
	vaesdeclast t7, x3, x3;				\
	vaesdeclast t7, x7, x7;

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
/////////////////////////////////////////////////////////

I tested the above quickly in userspace against aria-generic and your original aria-avx implementation, and the output matches these references.

In a quick-and-dirty benchmark, function execution time was ~30% faster on AMD Zen3 and ~20% faster on Intel Tiger Lake.

-Jussi
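PS. For what it's worth, below is a rough Python sketch of how combined constants like the ones above can be derived: two affine maps over GF(2), f1(x) = M1*x ^ c1 and f2(x) = M2*x ^ c2, compose into the single map f2(f1(x)) = (M2*M1)*x ^ (M2*c1 ^ c2), which is then split into the two 16-byte nibble tables that filter_8bit feeds to vpshufb. The helper names (apply_affine, compose, nibble_tables, octa) are only for illustration, and the bit/byte ordering assumed (row 0 / column 0 of the matrix comments = least significant bit, additive constant folded into the high-nibble table, .octa emitted lowest byte first) is the convention that, as far as I can tell, reproduces the .Ltf_lo/hi__inv_aff__and__s2 bytes above.

/////////////////////////////////////////////////////////
#!/usr/bin/env python3
# Affine maps over GF(2) are handled as (M, c): M is a list of 8 rows,
# row i is an 8-bit mask of the input bits feeding output bit i, and c
# is the additive constant.  Bit 0 = LSB throughout.

def apply_affine(M, c, x):
    """y = M*x ^ c over GF(2)."""
    y = c
    for i, row in enumerate(M):
        if bin(row & x).count('1') & 1:
            y ^= 1 << i
    return y

def compose(M2, c2, M1, c1):
    """Return (M, c) such that M*x ^ c == f2(f1(x)) for all x."""
    M = [0] * 8
    for j in range(8):
        # Image of input basis bit j under f1 then f2 (constants dropped).
        col = apply_affine(M2, 0, apply_affine(M1, 0, 1 << j))
        for i in range(8):
            if col >> i & 1:
                M[i] |= 1 << j
    return M, apply_affine(M2, c2, c1)

def nibble_tables(M, c):
    """Split f(x) = f_lo(x & 15) ^ f_hi(x >> 4) into two vpshufb tables."""
    lo = [apply_affine(M, 0, n) for n in range(16)]
    hi = [apply_affine(M, c, n << 4) for n in range(16)]
    return lo, hi

def octa(tbl):
    """Format a 16-byte table as a .octa literal (index 0 = lowest byte)."""
    return '0x' + ''.join('%02X' % b for b in reversed(tbl))

# The "AES inverse affine and S2 combined" matrix from the comment above,
# rows encoded as masks (leftmost matrix column = mask bit 0).
M_comb = [0x83, 0x12, 0xF3, 0x96, 0x32, 0x1A, 0xA0, 0xE7]
c_comb = 0x88

lo, hi = nibble_tables(M_comb, c_comb)
print('.Ltf_lo__inv_aff__and__s2:\n\t.octa ' + octa(lo))
print('.Ltf_hi__inv_aff__and__s2:\n\t.octa ' + octa(hi))
/////////////////////////////////////////////////////////

compose() is not exercised by the printout, but it is the step that merges, e.g., the AES inverse affine with the S2 affine into a single (M, c) pair, so the combined matrices do not have to be multiplied out by hand.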