On Fri, 18 Dec 2020 at 22:08, Megha Dey <megha.dey@xxxxxxxxx> wrote:
>
> Introduce the "by16" implementation of the AES CTR mode using AVX512
> optimizations. "by16" means that 16 independent blocks (each block
> being 128 bits) can be ciphered simultaneously, as opposed to the
> current 8 blocks.
>
> The glue code in the AESNI module overrides the existing "by8" CTR mode
> encryption/decryption routines with the "by16" ones when the following
> criteria are met:
>
> At compile time:
> 1. CONFIG_CRYPTO_AVX512 is enabled
> 2. The toolchain (assembler) supports VAES instructions
>
> At runtime:
> 1. The VAES and AVX512VL features are supported on the platform
>    (currently only Icelake)
> 2. The aesni_intel.use_avx512 module parameter is set at boot time. For
>    this algorithm, switching away from the AVX512 optimized version is
>    not possible once it is set at boot time, because of how the code is
>    structured today. (This can be changed later if required.)
>
> The functions aes_ctr_enc_128_avx512_by16(), aes_ctr_enc_192_avx512_by16()
> and aes_ctr_enc_256_avx512_by16() are adapted from the Intel Optimized
> IPSEC Cryptographic library.
>
> On an Icelake desktop, with turbo disabled and all CPUs running at
> maximum frequency, the "by16" CTR mode optimization shows better
> performance across data and key sizes as measured by tcrypt.
>
> The average performance improvement of the "by16" version over the
> "by8" version is as follows:
> For all key sizes (128/192/256 bits):
>  - data sizes < 128 bytes/block: negligible change (~3% loss)
>  - data sizes > 128 bytes/block: an average improvement of 48% for
>    both encryption and decryption
>
> A typical run of tcrypt with AES CTR mode encryption/decryption of the
> "by8" and "by16" optimizations on an Icelake desktop shows the
> following results:
>
> ---------------------------------------------------------------
> | key    | bytes | cycles/op (lower is better)| percentage    |
> | length | per   | encryption   | decryption  | loss/gain     |
> | (bits) | block |--------------------------------------------|
> |        |       | by8  | by16  | by8  | by16 | enc   | dec   |
> |-------------------------------------------------------------|
> | 128    | 16    | 156  | 168   | 164  | 168  | -7.7  | -2.5  |
> | 128    | 64    | 180  | 190   | 157  | 146  | -5.6  | 7.1   |
> | 128    | 256   | 248  | 158   | 251  | 161  | 36.3  | 35.9  |
> | 128    | 1024  | 633  | 316   | 642  | 319  | 50.1  | 50.4  |
> | 128    | 1472  | 853  | 411   | 877  | 407  | 51.9  | 53.6  |
> | 128    | 8192  | 4463 | 1959  | 4447 | 1940 | 56.2  | 56.4  |
> | 192    | 16    | 136  | 145   | 149  | 166  | -6.7  | -11.5 |
> | 192    | 64    | 159  | 154   | 157  | 160  | 3.2   | -2    |
> | 192    | 256   | 268  | 172   | 274  | 177  | 35.9  | 35.5  |
> | 192    | 1024  | 710  | 358   | 720  | 355  | 49.6  | 50.7  |
> | 192    | 1472  | 989  | 468   | 983  | 469  | 52.7  | 52.3  |
> | 192    | 8192  | 6326 | 3551  | 6301 | 3567 | 43.9  | 43.4  |
> | 256    | 16    | 153  | 165   | 139  | 156  | -7.9  | -12.3 |
> | 256    | 64    | 158  | 152   | 174  | 161  | 3.8   | 7.5   |
> | 256    | 256   | 283  | 176   | 287  | 202  | 37.9  | 29.7  |
> | 256    | 1024  | 797  | 393   | 807  | 395  | 50.7  | 51.1  |
> | 256    | 1472  | 1108 | 534   | 1107 | 527  | 51.9  | 52.4  |
> | 256    | 8192  | 5763 | 2616  | 5773 | 2617 | 54.7  | 54.7  |
> ---------------------------------------------------------------
>
> This work was inspired by the AES CTR mode optimization published
> in the Intel Optimized IPSEC Cryptographic library:
> https://github.com/intel/intel-ipsec-mb/blob/master/lib/avx512/cntr_vaes_avx512.asm
>
> Co-developed-by: Tomasz Kantecki <tomasz.kantecki@xxxxxxxxx>
> Signed-off-by: Tomasz Kantecki <tomasz.kantecki@xxxxxxxxx>
> Signed-off-by: Megha Dey <megha.dey@xxxxxxxxx>
> ---
>  arch/x86/crypto/Makefile                    |   1 +
>  arch/x86/crypto/aes_ctrby16_avx512-x86_64.S | 856 ++++++++++++++++++++++++++++
>  arch/x86/crypto/aesni-intel_glue.c          |  57 +-
>  arch/x86/crypto/avx512_vaes_common.S        | 422 ++++++++++++++
>  arch/x86/include/asm/disabled-features.h    |   8 +-
>  crypto/Kconfig                              |  12 +
>  6 files changed, 1354 insertions(+), 2 deletions(-)
>  create mode 100644 arch/x86/crypto/aes_ctrby16_avx512-x86_64.S
> ...
> diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
> index ad8a718..f45059e 100644
> --- a/arch/x86/crypto/aesni-intel_glue.c
> +++ b/arch/x86/crypto/aesni-intel_glue.c
> @@ -46,6 +46,10 @@
>  #define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
>  #define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
>
> +static bool use_avx512;
> +module_param(use_avx512, bool, 0644);
> +MODULE_PARM_DESC(use_avx512, "Use AVX512 optimized algorithm, if available");
> +
>  /* This data is stored at the end of the crypto_tfm struct.
>   * It's a type of per "session" data storage location.
>   * This needs to be 16 byte aligned.
> @@ -191,6 +195,35 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
>  		void *keys, u8 *out, unsigned int num_bytes);
>  asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
>  		void *keys, u8 *out, unsigned int num_bytes);
> +
> +#ifdef CONFIG_CRYPTO_AES_CTR_AVX512
> +asmlinkage void aes_ctr_enc_128_avx512_by16(void *keys, u8 *out,
> +					    const u8 *in,
> +					    unsigned int num_bytes,
> +					    u8 *iv);
> +asmlinkage void aes_ctr_enc_192_avx512_by16(void *keys, u8 *out,
> +					    const u8 *in,
> +					    unsigned int num_bytes,
> +					    u8 *iv);
> +asmlinkage void aes_ctr_enc_256_avx512_by16(void *keys, u8 *out,
> +					    const u8 *in,
> +					    unsigned int num_bytes,
> +					    u8 *iv);
> +#else
> +static inline void aes_ctr_enc_128_avx512_by16(void *keys, u8 *out,
> +					       const u8 *in,
> +					       unsigned int num_bytes,
> +					       u8 *iv) {}
> +static inline void aes_ctr_enc_192_avx512_by16(void *keys, u8 *out,
> +					       const u8 *in,
> +					       unsigned int num_bytes,
> +					       u8 *iv) {}
> +static inline void aes_ctr_enc_256_avx512_by16(void *keys, u8 *out,
> +					       const u8 *in,
> +					       unsigned int num_bytes,
> +					       u8 *iv) {}
> +#endif
> +

Please drop these alternatives.

>  /*
>   * asmlinkage void aesni_gcm_init_avx_gen2()
>   * gcm_data *my_ctx_data, context data
> @@ -487,6 +520,23 @@ static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
>  		aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
>  }
>
> +static void aesni_ctr_enc_avx512_tfm(struct crypto_aes_ctx *ctx, u8 *out,
> +				     const u8 *in, unsigned int len, u8 *iv)
> +{
> +	/*
> +	 * based on key length, override with the by16 version
> +	 * of ctr mode encryption/decryption for improved performance.
> +	 * aes_set_key_common() ensures that key length is one of
> +	 * {128,192,256}
> +	 */
> +	if (ctx->key_length == AES_KEYSIZE_128)
> +		aes_ctr_enc_128_avx512_by16((void *)ctx, out, in, len, iv);
> +	else if (ctx->key_length == AES_KEYSIZE_192)
> +		aes_ctr_enc_192_avx512_by16((void *)ctx, out, in, len, iv);
> +	else
> +		aes_ctr_enc_256_avx512_by16((void *)ctx, out, in, len, iv);
> +}
> +
>  static int ctr_crypt(struct skcipher_request *req)
>  {
>  	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
> @@ -1076,7 +1126,12 @@ static int __init aesni_init(void)
>  		aesni_gcm_tfm = &aesni_gcm_tfm_sse;
>  	}
>  	aesni_ctr_enc_tfm = aesni_ctr_enc;
> -	if (boot_cpu_has(X86_FEATURE_AVX)) {
> +	if (use_avx512 && IS_ENABLED(CONFIG_CRYPTO_AES_CTR_AVX512) &&
> +	    cpu_feature_enabled(X86_FEATURE_VAES)) {
> +		/* Ctr mode performance optimization using AVX512 */
> +		aesni_ctr_enc_tfm = aesni_ctr_enc_avx512_tfm;
> +		pr_info("AES CTR mode by16 optimization enabled\n");

This will need to be changed to a static_call_update() once my
outstanding patch is merged (see the sketch at the end of this mail).

> +	} else if (boot_cpu_has(X86_FEATURE_AVX)) {
>  		/* optimize performance of ctr mode encryption transform */
>  		aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
>  		pr_info("AES CTR mode by8 optimization enabled\n");
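
To illustrate the static_call_update() remark: using the generic
helpers from <linux/static_call.h>, the selection could look roughly
like the sketch below. This is illustrative only; the exact shape will
depend on the pending conversion patch, and the call-site arguments are
abbreviated.

  #include <linux/static_call.h>

  /* Default to the plain AES-NI routine; updated once during init. */
  DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc);

  static int __init aesni_init(void)
  {
          ...
          if (use_avx512 && IS_ENABLED(CONFIG_CRYPTO_AES_CTR_AVX512) &&
              cpu_feature_enabled(X86_FEATURE_VAES))
                  static_call_update(aesni_ctr_enc_tfm,
                                     aesni_ctr_enc_avx512_tfm);
          else if (boot_cpu_has(X86_FEATURE_AVX))
                  static_call_update(aesni_ctr_enc_tfm,
                                     aesni_ctr_enc_avx_tfm);
          ...
  }

The indirect call through the aesni_ctr_enc_tfm function pointer then
becomes a patched direct call at the call site:

          static_call(aesni_ctr_enc_tfm)(ctx, dst, src, nbytes, iv);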