From: Ard Biesheuvel <ardb@xxxxxxxxxx>

The encryption and decryption code paths are mostly identical, except
for a small difference where the plaintext input into the MAC is taken
from either the input or the output block.

We can factor this in quite easily using a vector bit select, and a few
additional XORs, without the need for branches. This way, we can use
the same asm helper on the encrypt and decrypt code paths.

Signed-off-by: Ard Biesheuvel <ardb@xxxxxxxxxx>
---
 arch/arm64/crypto/aes-ce-ccm-core.S | 41 +++++++++-----------
 1 file changed, 18 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index 0ec59fc4ef3e..75be3157bae1 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -60,7 +60,7 @@ SYM_FUNC_START(ce_aes_ccm_final)
 	ret
 SYM_FUNC_END(ce_aes_ccm_final)
 
-	.macro	aes_ccm_do_crypt,enc
+SYM_FUNC_START_LOCAL(aes_ccm_do_crypt)
 	load_round_keys	x3, w4, x10
 
 	cbz	x2, 5f
@@ -76,28 +76,24 @@ CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
 	aes_encrypt	v0, v1, w4
+	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
+	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
 
 	subs	w2, w2, #16
 	bmi	6f				/* partial block? */
 	ld1	{v2.16b}, [x1], #16		/* load next input block */
-	.if	\enc == 1
-	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
-	eor	v6.16b, v1.16b, v2.16b		/* xor with crypted ctr */
-	.else
-	eor	v2.16b, v2.16b, v1.16b		/* xor with crypted ctr */
-	eor	v6.16b, v2.16b, v5.16b		/* final round enc */
-	.endif
-	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
+	eor	v6.16b, v2.16b, v1.16b		/* en/decrypt input block */
+	mov	v23.16b, v22.16b
+	bsl	v23.16b, v2.16b, v6.16b		/* select plaintext */
 	st1	{v6.16b}, [x0], #16		/* write output block */
+	eor	v0.16b, v0.16b, v23.16b		/* fold plaintext into mac */
+
 	bne	0b
 CPU_LE(	rev	x8, x8			)
 	st1	{v0.16b}, [x5]			/* store mac */
 	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
 5:	ret
 
-6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
-	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
-
-	add	x1, x1, w2, sxtw		/* rewind the input pointer (w2 < 0) */
+6:	add	x1, x1, w2, sxtw		/* rewind the input pointer (w2 < 0) */
 	add	x0, x0, w2, sxtw		/* rewind the output pointer */
 
 	adr_l	x8, .Lpermute			/* load permute vectors */
@@ -108,20 +104,17 @@ CPU_LE(	rev	x8, x8			)
 	ld1	{v2.16b}, [x1]			/* load a full block of input */
 	tbl	v1.16b, {v1.16b}, v7.16b	/* move keystream to end of register */
-	.if	\enc == 1
-	tbl	v7.16b, {v2.16b}, v9.16b	/* copy plaintext to start of v7 */
+	tbl	v7.16b, {v2.16b}, v9.16b	/* copy input block to start of v7 */
 	eor	v2.16b, v2.16b, v1.16b		/* encrypt partial input block */
-	.else
-	eor	v2.16b, v2.16b, v1.16b		/* decrypt partial input block */
-	tbl	v7.16b, {v2.16b}, v9.16b	/* copy plaintext to start of v7 */
-	.endif
-	eor	v0.16b, v0.16b, v7.16b		/* fold plaintext into mac */
+	tbl	v9.16b, {v2.16b}, v9.16b	/* copy output block to start of v9 */
+	bsl	v22.16b, v7.16b, v9.16b		/* select plaintext */
+	eor	v0.16b, v0.16b, v22.16b		/* fold plaintext into mac */
 	tbx	v2.16b, {v6.16b}, v8.16b	/* insert output from previous iteration */
 	st1	{v0.16b}, [x5]			/* store mac */
 	st1	{v2.16b}, [x0]			/* store output block */
 	ret
-	.endm
+SYM_FUNC_END(aes_ccm_do_crypt)
 
 	/*
 	 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
@@ -132,11 +125,13 @@ CPU_LE(	rev	x8, x8			)
 	 *			   u8 ctr[]);
 	 */
 SYM_FUNC_START(ce_aes_ccm_encrypt)
-	aes_ccm_do_crypt	1
+	movi	v22.16b, #255
+	b	aes_ccm_do_crypt
 SYM_FUNC_END(ce_aes_ccm_encrypt)
 
 SYM_FUNC_START(ce_aes_ccm_decrypt)
-	aes_ccm_do_crypt	0
+	movi	v22.16b, #0
+	b	aes_ccm_do_crypt
 SYM_FUNC_END(ce_aes_ccm_decrypt)
 
 	.section ".rodata", "a"
-- 
2.43.0.275.g3460e3d667-goog
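
(Editorial aside, not part of the patch: the BSL instruction used above is a
branch-free bitwise select, taking each bit from one of two source registers
according to a mask held in the destination. A minimal C sketch of the same
idea follows; the helper name is hypothetical and used purely for
illustration:

	/*
	 * mask is all-ones on the encrypt path and all-zeroes on the decrypt
	 * path, mirroring the movi v22.16b, #255 / movi v22.16b, #0 setup in
	 * ce_aes_ccm_encrypt/ce_aes_ccm_decrypt.
	 */
	static inline unsigned char select_plaintext(unsigned char mask,
						     unsigned char in,
						     unsigned char out)
	{
		/* take bits from 'in' where mask is set, else from 'out' */
		return (mask & in) | (~mask & out);
	}

On encryption the plaintext fed into the MAC is the input block, so an
all-ones mask selects it; on decryption the plaintext is the just-decrypted
output block, so an all-zeroes mask selects that instead. This is what allows
the single aes_ccm_do_crypt helper to serve both entry points without
branching.)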