Introduce the "by16" implementation of AES CTR mode using AVX512
optimizations. "by16" means that 16 independent blocks (each block
being 128 bits) can be ciphered simultaneously, as opposed to the 8
blocks of the existing "by8" implementation. With VAES, a single
vaesenc on a 512-bit ZMM register processes four 128-bit blocks, so
four ZMM registers cover 16 blocks per round.
The glue code in the AESNI module overrides the existing "by8" CTR
mode encryption/decryption routines with the "by16" ones when the
following criteria are met (a sketch of the gating logic follows the
list):
At compile time:
1. CONFIG_CRYPTO_AVX512 is enabled
2. the toolchain (assembler) supports VAES instructions
At runtime:
1. the VAES and AVX512VL features are supported by the platform
   (currently Ice Lake only)
2. the aesni_intel.use_avx512 module parameter is set at boot time.
   For this algorithm, switching away from the AVX512-optimized
   version is not possible once it is set at boot time, because of how
   the code is structured today (this can be changed later if
   required).
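
For illustration only, the override could be wired up roughly as in
the minimal sketch below. The names aesni_select_ctr_impl,
aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm and aesni_ctr_enc_avx512_tfm
are assumptions for this sketch, not necessarily the patch's actual
code:

	/*
	 * Hypothetical sketch of the "by16" override; the function and
	 * function-pointer names here are illustrative.
	 */
	static void aesni_select_ctr_impl(void)
	{
	#ifdef CONFIG_CRYPTO_AES_CTR_AVX512
		/* Compile time: Kconfig enabled, assembler supports VAES. */
		if (use_avx512 &&			/* boot-time module parameter */
		    boot_cpu_has(X86_FEATURE_VAES) &&	/* runtime CPU features */
		    boot_cpu_has(X86_FEATURE_AVX512VL)) {
			/* Override the "by8" routine with the "by16" one. */
			aesni_ctr_enc_tfm = aesni_ctr_enc_avx512_tfm;
			return;
		}
	#endif
		aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm; /* existing "by8" path */
	}
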
The functions aes_ctr_enc_128_avx512_by16(), aes_ctr_enc_192_avx512_by16()
and aes_ctr_enc_256_avx512_by16() are adapted from the Intel Optimized
IPSEC Cryptographic Library. A sketch of how the glue code could
dispatch to them per key size follows.
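
The per-key-size dispatch could look roughly like the sketch below.
Note the argument order of the "by16" routines (key schedule first,
IV last), matching the asmlinkage declarations in the diff; the
wrapper name and the exact keys argument are assumptions:

	/*
	 * Hypothetical dispatch wrapper; passing ctx->key_enc as the
	 * keys argument is an assumption for this sketch.
	 */
	static void aesni_ctr_enc_avx512_tfm(struct crypto_aes_ctx *ctx, u8 *out,
					     const u8 *in, unsigned int len, u8 *iv)
	{
		/* Pick the "by16" routine matching the expanded key length. */
		if (ctx->key_length == AES_KEYSIZE_128)
			aes_ctr_enc_128_avx512_by16(ctx->key_enc, out, in, len, iv);
		else if (ctx->key_length == AES_KEYSIZE_192)
			aes_ctr_enc_192_avx512_by16(ctx->key_enc, out, in, len, iv);
		else
			aes_ctr_enc_256_avx512_by16(ctx->key_enc, out, in, len, iv);
	}
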
On an Ice Lake desktop, with turbo disabled and all CPUs running at
maximum frequency, the "by16" CTR mode optimization shows better
performance across key sizes for all but the smallest data sizes, as
measured by tcrypt.
The average performance improvement of the "by16" version over the
"by8" version is as follows (percentages are computed as
(by8 - by16) / by8 * 100, so positive values are gains):
For all key sizes (128/192/256 bits):
  data sizes < 128 bytes/block: negligible difference (~3% average loss)
  data sizes > 128 bytes/block: an average improvement of 48% for both
  encryption and decryption
A typical run of tcrypt with AES CTR mode encryption/decryption of the
"by8" and "by16" optimizations on an Ice Lake desktop shows the
following results (an example tcrypt invocation is given after the
table):
--------------------------------------------------------------
| key | bytes | cycles/op (lower is better)| percentage |
| length | per | encryption | decryption | loss/gain |
| (bits) | block |-------------------------------------------|
| | | by8 | by16 | by8 | by16 | enc | dec |
|------------------------------------------------------------|
| 128 | 16 | 156 | 168 | 164 | 168 | -7.7 | -2.5 |
| 128 | 64 | 180 | 190 | 157 | 146 | -5.6 | 7.1 |
| 128 | 256 | 248 | 158 | 251 | 161 | 36.3 | 35.9 |
| 128 | 1024 | 633 | 316 | 642 | 319 | 50.1 | 50.4 |
| 128 | 1472 | 853 | 411 | 877 | 407 | 51.9 | 53.6 |
| 128 | 8192 | 4463 | 1959 | 4447 | 1940 | 56.2 | 56.4 |
| 192 | 16 | 136 | 145 | 149 | 166 | -6.7 | -11.5 |
| 192 | 64 | 159 | 154 | 157 | 160 | 3.2 | -2 |
| 192 | 256 | 268 | 172 | 274 | 177 | 35.9 | 35.5 |
| 192 | 1024 | 710 | 358 | 720 | 355 | 49.6 | 50.7 |
| 192 | 1472 | 989 | 468 | 983 | 469 | 52.7 | 52.3 |
| 192 | 8192 | 6326 | 3551 | 6301 | 3567 | 43.9 | 43.4 |
| 256 | 16 | 153 | 165 | 139 | 156 | -7.9 | -12.3 |
| 256 | 64 | 158 | 152 | 174 | 161 | 3.8 | 7.5 |
| 256 | 256 | 283 | 176 | 287 | 202 | 37.9 | 29.7 |
| 256 | 1024 | 797 | 393 | 807 | 395 | 50.7 | 51.1 |
| 256 | 1472 | 1108 | 534 | 1107 | 527 | 51.9 | 52.4 |
| 256 | 8192 | 5763 | 2616 | 5773 | 2617 | 54.7 | 54.7 |
--------------------------------------------------------------
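
The numbers above come from the tcrypt test module. An invocation
along the lines of the following exercises the AES cipher speed tests
(mode and sec are tcrypt's standard module parameters; the exact mode
number used for this run is an assumption):

	modprobe tcrypt mode=200 sec=1
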
This work was inspired by the AES CTR mode optimization published in
the Intel Optimized IPSEC Cryptographic Library:
https://github.com/intel/intel-ipsec-mb/blob/master/lib/avx512/cntr_vaes_avx512.asm
Co-developed-by: Tomasz Kantecki <tomasz.kantecki@xxxxxxxxx>
Signed-off-by: Tomasz Kantecki <tomasz.kantecki@xxxxxxxxx>
Signed-off-by: Megha Dey <megha.dey@xxxxxxxxx>
---
arch/x86/crypto/Makefile | 1 +
arch/x86/crypto/aes_ctrby16_avx512-x86_64.S | 856 ++++++++++++++++++++++++++++
arch/x86/crypto/aesni-intel_glue.c | 57 +-
arch/x86/crypto/avx512_vaes_common.S | 422 ++++++++++++++
arch/x86/include/asm/disabled-features.h | 8 +-
crypto/Kconfig | 12 +
6 files changed, 1354 insertions(+), 2 deletions(-)
create mode 100644 arch/x86/crypto/aes_ctrby16_avx512-x86_64.S
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index ad8a718..f45059e 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -46,6 +46,10 @@
#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
+static bool use_avx512;
+module_param(use_avx512, bool, 0644);
+MODULE_PARM_DESC(use_avx512, "Use AVX512 optimized algorithm, if available");
+
/* This data is stored at the end of the crypto_tfm struct.
* It's a type of per "session" data storage location.
* This needs to be 16 byte aligned.
@@ -191,6 +195,35 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
+
+#ifdef CONFIG_CRYPTO_AES_CTR_AVX512
+asmlinkage void aes_ctr_enc_128_avx512_by16(void *keys, u8 *out,
+ const u8 *in,
+ unsigned int num_bytes,
+ u8 *iv);
+asmlinkage void aes_ctr_enc_192_avx512_by16(void *keys, u8 *out,
+ const u8 *in,
+ unsigned int num_bytes,
+ u8 *iv);
+asmlinkage void aes_ctr_enc_256_avx512_by16(void *keys, u8 *out,
+ const u8 *in,
+ unsigned int num_bytes,
+ u8 *iv);
+#else
+static inline void aes_ctr_enc_128_avx512_by16(void *keys, u8 *out,
+ const u8 *in,
+ unsigned int num_bytes,
+ u8 *iv) {}
+static inline void aes_ctr_enc_192_avx512_by16(void *keys, u8 *out,
+ const u8 *in,
+ unsigned int num_bytes,
+ u8 *iv) {}
+static inline void aes_ctr_enc_256_avx512_by16(void *keys, u8 *out,
+ const u8 *in,
+ unsigned int num_bytes,
+ u8 *iv) {}
+#endif
+