On Tue, 9 Apr 2024 at 02:02, Eric Biggers <ebiggers@xxxxxxxxxx> wrote:
>
> From: Eric Biggers <ebiggers@xxxxxxxxxx>
>
> Access the AES round keys using offsets -7*16 through 7*16, instead of
> 0*16 through 14*16. This allows VEX-encoded instructions to address all
> round keys using 1-byte offsets, whereas before some needed 4-byte
> offsets. This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.
>
> Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx>

Nice optimization! Do you think we might be able to macrofy this a bit
so we can use zero-based indexing for the round keys, and hide the
arithmetic? Something along the lines of the rough sketch at the bottom
of this mail, perhaps.

> ---
>  arch/x86/crypto/aes-xts-avx-x86_64.S | 81 +++++++++++++++-------------
>  1 file changed, 44 insertions(+), 37 deletions(-)
>
> diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
> index fcaf64a2f8c6..95e412e7601d 100644
> --- a/arch/x86/crypto/aes-xts-avx-x86_64.S
> +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
> @@ -80,11 +80,11 @@
>  .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
>  .text
>
>  // Function parameters
>  .set KEY, %rdi // Initially points to crypto_aes_ctx, then is
> - // advanced to point directly to the round keys
> + // advanced to point directly to 7th round key
>  .set SRC, %rsi // Pointer to next source data
>  .set DST, %rdx // Pointer to next destination data
>  .set LEN, %rcx // Remaining length in bytes
>  .set TWEAK, %r8 // Pointer to next tweak
>
> @@ -406,28 +406,28 @@
>  .endif
>  .endm
>
>  // Load the round keys: just the first one if !USE_AVX10, otherwise all of them.
>  .macro _load_round_keys
> - _vbroadcast128 0*16(KEY), KEY0
> + _vbroadcast128 -7*16(KEY), KEY0
>  .if USE_AVX10
> - _vbroadcast128 1*16(KEY), KEY1
> - _vbroadcast128 2*16(KEY), KEY2
> - _vbroadcast128 3*16(KEY), KEY3
> - _vbroadcast128 4*16(KEY), KEY4
> - _vbroadcast128 5*16(KEY), KEY5
> - _vbroadcast128 6*16(KEY), KEY6
> - _vbroadcast128 7*16(KEY), KEY7
> - _vbroadcast128 8*16(KEY), KEY8
> - _vbroadcast128 9*16(KEY), KEY9
> - _vbroadcast128 10*16(KEY), KEY10
> + _vbroadcast128 -6*16(KEY), KEY1
> + _vbroadcast128 -5*16(KEY), KEY2
> + _vbroadcast128 -4*16(KEY), KEY3
> + _vbroadcast128 -3*16(KEY), KEY4
> + _vbroadcast128 -2*16(KEY), KEY5
> + _vbroadcast128 -1*16(KEY), KEY6
> + _vbroadcast128 0*16(KEY), KEY7
> + _vbroadcast128 1*16(KEY), KEY8
> + _vbroadcast128 2*16(KEY), KEY9
> + _vbroadcast128 3*16(KEY), KEY10
>  // Note: if it's AES-128 or AES-192, the last several round keys won't
>  // be used. We do the loads anyway to save a conditional jump.
> - _vbroadcast128 11*16(KEY), KEY11
> - _vbroadcast128 12*16(KEY), KEY12
> - _vbroadcast128 13*16(KEY), KEY13
> - _vbroadcast128 14*16(KEY), KEY14
> + _vbroadcast128 4*16(KEY), KEY11
> + _vbroadcast128 5*16(KEY), KEY12
> + _vbroadcast128 6*16(KEY), KEY13
> + _vbroadcast128 7*16(KEY), KEY14
>  .endif
>  .endm
>
>  // Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
>  // on the block(s) in \data using the round key(s) in \key. The register length
> @@ -454,13 +454,13 @@
>  .macro _vaes_1x enc, last, i, xmm_suffix, data
>  .if USE_AVX10
>  _vaes \enc, \last, KEY\i\xmm_suffix, \data
>  .else
>  .ifnb \xmm_suffix
> - _vaes \enc, \last, \i*16(KEY), \data
> + _vaes \enc, \last, (\i-7)*16(KEY), \data
>  .else
> - _vbroadcast128 \i*16(KEY), V4
> + _vbroadcast128 (\i-7)*16(KEY), V4
>  _vaes \enc, \last, V4, \data
>  .endif
>  .endif
>  .endm
>
> @@ -475,11 +475,11 @@
>  _vaes \enc, \last, KEY\i, V1
>  _tweak_step (2*(\i-1) + 1)
>  _vaes \enc, \last, KEY\i, V2
>  _vaes \enc, \last, KEY\i, V3
>  .else
> - _vbroadcast128 \i*16(KEY), V4
> + _vbroadcast128 (\i-7)*16(KEY), V4
>  _tweak_step (2*(\i-1))
>  _vaes \enc, \last, V4, V0
>  _vaes \enc, \last, V4, V1
>  _tweak_step (2*(\i-1) + 1)
>  _vaes \enc, \last, V4, V2
> @@ -526,13 +526,19 @@
>  _define_aliases
>
>  // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
>  movl 480(KEY), KEYLEN
>
> - // If decrypting, advance KEY to the decryption round keys.
> -.if !\enc
> - add $240, KEY
> + // Advance KEY to point to the 7th encryption round key (if encrypting)
> + // or the 7th decryption round key (if decrypting). This makes the
> + // offset to any round key be in the range [-112, 112], fitting in a
> + // signed byte. This shortens VEX-encoded instructions that access the
> + // 8th and later round keys which otherwise would need 4-byte offsets.
> +.if \enc
> + add $7*16, KEY
> +.else
> + add $(15+7)*16, KEY
>  .endif
>
>  // Check whether the data length is a multiple of the AES block length.
>  test $15, LEN
>  jnz .Lneed_cts\@
> @@ -751,40 +757,41 @@
>
>  // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
>  // u8 iv[AES_BLOCK_SIZE]);
>  SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
>  vmovdqu (%rsi), %xmm0
> - vpxor 0*16(%rdi), %xmm0, %xmm0
> + add $7*16, %rdi
> + vpxor -7*16(%rdi), %xmm0, %xmm0
> + vaesenc -6*16(%rdi), %xmm0, %xmm0
> + vaesenc -5*16(%rdi), %xmm0, %xmm0
> + vaesenc -4*16(%rdi), %xmm0, %xmm0
> + vaesenc -3*16(%rdi), %xmm0, %xmm0
> + vaesenc -2*16(%rdi), %xmm0, %xmm0
> + vaesenc -1*16(%rdi), %xmm0, %xmm0
> + vaesenc 0*16(%rdi), %xmm0, %xmm0
>  vaesenc 1*16(%rdi), %xmm0, %xmm0
>  vaesenc 2*16(%rdi), %xmm0, %xmm0
> + cmpl $24, 480-(7*16)(%rdi)
> + jle .Lencrypt_iv_aes_128_or_192
>  vaesenc 3*16(%rdi), %xmm0, %xmm0
>  vaesenc 4*16(%rdi), %xmm0, %xmm0
>  vaesenc 5*16(%rdi), %xmm0, %xmm0
>  vaesenc 6*16(%rdi), %xmm0, %xmm0
> - vaesenc 7*16(%rdi), %xmm0, %xmm0
> - vaesenc 8*16(%rdi), %xmm0, %xmm0
> - vaesenc 9*16(%rdi), %xmm0, %xmm0
> - cmpl $24, 480(%rdi)
> - jle .Lencrypt_iv_aes_128_or_192
> - vaesenc 10*16(%rdi), %xmm0, %xmm0
> - vaesenc 11*16(%rdi), %xmm0, %xmm0
> - vaesenc 12*16(%rdi), %xmm0, %xmm0
> - vaesenc 13*16(%rdi), %xmm0, %xmm0
> - vaesenclast 14*16(%rdi), %xmm0, %xmm0
> + vaesenclast 7*16(%rdi), %xmm0, %xmm0
>  .Lencrypt_iv_done:
>  vmovdqu %xmm0, (%rsi)
>  RET
>
>  // Out-of-line handling of AES-128 and AES-192
>  .Lencrypt_iv_aes_128_or_192:
>  jz .Lencrypt_iv_aes_192
> - vaesenclast 10*16(%rdi), %xmm0, %xmm0
> + vaesenclast 3*16(%rdi), %xmm0, %xmm0
>  jmp .Lencrypt_iv_done
>  .Lencrypt_iv_aes_192:
> - vaesenc 10*16(%rdi), %xmm0, %xmm0
> - vaesenc 11*16(%rdi), %xmm0, %xmm0
> - vaesenclast 12*16(%rdi), %xmm0, %xmm0
> + vaesenc 3*16(%rdi), %xmm0, %xmm0
> + vaesenc 4*16(%rdi), %xmm0, %xmm0
> + vaesenclast 5*16(%rdi), %xmm0, %xmm0
>  jmp .Lencrypt_iv_done
>  SYM_FUNC_END(aes_xts_encrypt_iv)
>
>  // Below are the actual AES-XTS encryption and decryption functions,
>  // instantiated from the above macro. They all have the following prototype:
>
> base-commit: 4ad27a8be9dbefd4820da0f60da879d512b2f659
> prerequisite-patch-id: 8d09ed747039f5e718ac7267e2a15e22504aa7f3
> --
> 2.44.0
>
>
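To illustrate what I mean by macrofying: this is only a rough, untested
sketch, and the KEY_BIAS / _load_key_1x / _vaes_1x_mem names are made up
for illustration, not taken from your patch. It assumes KEY has already
been advanced by 7*16 (or (15+7)*16 when decrypting) exactly as your
patch does, so call sites keep the zero-based round number and the bias
lives in one place:

	// Number of round keys that KEY has been advanced past.
	.set	KEY_BIAS, 7

	// Broadcast the zero-based round key \i into \dst.
	.macro	_load_key_1x	i, dst
		_vbroadcast128	(\i-KEY_BIAS)*16(KEY), \dst
	.endm

	// Memory-operand form of _vaes using the zero-based round key \i.
	.macro	_vaes_1x_mem	enc, last, i, data
		_vaes	\enc, \last, (\i-KEY_BIAS)*16(KEY), \data
	.endm

Then _load_round_keys would just do "_load_key_1x 0, KEY0" through
"_load_key_1x 14, KEY14", and the (\i-7)*16 arithmetic wouldn't be
repeated across the other macros. Only a sketch, of course; I haven't
checked how cleanly it fits the USE_AVX10 paths.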