From: Eric Biggers <ebiggers@xxxxxxxxxx>

Add an AES-XTS implementation "xts-aes-vaes-avx10_256" for x86_64 CPUs with the VAES, VPCLMULQDQ, and either AVX10/256 or AVX512BW + AVX512VL extensions. This implementation avoids using zmm registers, instead using ymm registers to operate on two AES blocks at a time. The assembly code is instantiated using a macro so that most of the source code is shared with other implementations.

This is the optimal implementation on CPUs that support VAES and AVX512 but where the zmm registers should not be used due to downclocking effects, for example Intel's Ice Lake. It should also be the optimal implementation on future CPUs that support AVX10/256 but not AVX10/512.

The performance is slightly better than that of xts-aes-vaes-avx2, which uses the same vector length, due to factors such as being able to use ymm16-ymm31 to cache the AES round keys. For example, on Ice Lake, the throughput of decrypting 4096-byte messages with AES-256-XTS is 5.8% higher with xts-aes-vaes-avx10_256 than with xts-aes-vaes-avx2.

Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx> --- arch/x86/crypto/aes-xts-avx-x86_64.S | 9 +++++++++ arch/x86/crypto/aesni-intel_glue.c | 16 ++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 87ae2139b7ca..c868b9af443b 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -773,6 +773,15 @@ SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2) aes_xts_crypt 1 SYM_FUNC_END(aes_xts_encrypt_vaes_avx2) SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2) aes_xts_crypt 0 SYM_FUNC_END(aes_xts_decrypt_vaes_avx2) + +.set VL, 32 +.set USE_AVX10, 1 +SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256) + aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256) +SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256) + aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256) #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index d958aa073c14..ac45e0b952b7 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1295,10 +1295,11 @@ static struct skcipher_alg aes_xts_alg_##suffix = { \ static struct simd_skcipher_alg *aes_xts_simdalg_##suffix DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500); #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600); +DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700); #endif static int __init register_xts_algs(void) { int err; @@ -1318,10 +1319,22 @@ static int __init register_xts_algs(void) return 0; err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx2, 1, &aes_xts_simdalg_vaes_avx2); if (err) return err; + + if (!boot_cpu_has(X86_FEATURE_AVX512BW) || + !boot_cpu_has(X86_FEATURE_AVX512VL) || + !boot_cpu_has(X86_FEATURE_BMI2) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, 
NULL)) + return 0; + + err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_256, 1, + &aes_xts_simdalg_vaes_avx10_256); + if (err) + return err; #endif return 0; } static void unregister_xts_algs(void) @@ -1330,10 +1343,13 @@ static void unregister_xts_algs(void) simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1, &aes_xts_simdalg_aesni_avx); if (aes_xts_simdalg_vaes_avx2) simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1, &aes_xts_simdalg_vaes_avx2); + if (aes_xts_simdalg_vaes_avx10_256) + simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1, + &aes_xts_simdalg_vaes_avx10_256); } #else static int __init register_xts_algs(void) { return 0; -- 2.44.0