Re: [RFC V1 7/7] crypto: aesni - AVX512 version of AESNI-GCM using VPCLMULQDQ

Ard Biesheuvel <ardb@xxxxxxxxxx> · Sat, 16 Jan 2021 18:16:17 +0100

On Fri, 18 Dec 2020 at 22:08, Megha Dey <megha.dey@xxxxxxxxx> wrote:
>
> Introduce the AVX512 implementation that optimizes the AESNI-GCM encode
> and decode routines using VPCLMULQDQ.
>
> The glue code in AESNI module overrides the existing AVX2 GCM mode
> encryption/decryption routines with the AX512 AES GCM mode ones when the
> following criteria are met:
> At compile time:
> 1. CONFIG_CRYPTO_AVX512 is enabled
> 2. toolchain(assembler) supports VPCLMULQDQ instructions
> At runtime:
> 1. VPCLMULQDQ and AVX512VL features are supported on a platform
>    (currently only Icelake)
> 2. aesni_intel.use_avx512 module parameter is set at boot time. For this
>    algorithm, switching from AVX512 optimized version is not possible
>    once set at boot time because of how the code is structured today.(Can
>    be changed later if required)
>
> The functions aesni_gcm_init_avx_512, aesni_gcm_enc_update_avx_512,
> aesni_gcm_dec_update_avx_512 and aesni_gcm_finalize_avx_512 are adapted
> from the Intel Optimized IPSEC Cryptographic library.
>
> On a Icelake desktop, with turbo disabled and all CPUs running at
> maximum frequency, the AVX512 GCM mode optimization shows better
> performance across data & key sizes as measured by tcrypt.
>
> The average performance improvement of the AVX512 version over the AVX2
> version is as follows:
> For all key sizes(128/192/256 bits),
>         data sizes < 128 bytes/block, negligible improvement (~7.5%)
>         data sizes > 128 bytes/block, there is an average improvement of
>         40% for both encryption and decryption.
>
> A typical run of tcrypt with AES GCM mode encryption/decryption of the
> AVX2 and AVX512 optimization on a Icelake desktop shows the following
> results:
>
>   ----------------------------------------------------------------------
>   |   key  | bytes | cycles/op (lower is better)   | Percentage gain/  |
>   | length |   per |   encryption  |  decryption   |      loss         |
>   | (bits) | block |-------------------------------|-------------------|
>   |        |       | avx2 | avx512 | avx2 | avx512 | Encrypt | Decrypt |
>   |---------------------------------------------------------------------
>   |  128   | 16    | 689  |  701   | 689  |  707   |  -1.7   |  -2.61  |
>   |  128   | 64    | 731  |  660   | 771  |  649   |   9.7   |  15.82  |
>   |  128   | 256   | 911  |  750   | 900  |  721   |  17.67  |  19.88  |
>   |  128   | 512   | 1181 |  814   | 1161 |  782   |  31.07  |  32.64  |
>   |  128   | 1024  | 1676 |  1052  | 1685 |  1030  |  37.23  |  38.87  |
>   |  128   | 2048  | 2475 |  1447  | 2456 |  1419  |  41.53  |  42.22  |
>   |  128   | 4096  | 3806 |  2154  | 3820 |  2119  |  43.41  |  44.53  |
>   |  128   | 8192  | 9169 |  3806  | 6997 |  3718  |  58.49  |  46.86  |
>   |  192   | 16    | 754  |  683   | 737  |  672   |   9.42  |   8.82  |
>   |  192   | 64    | 735  |  686   | 715  |  640   |   6.66  |  10.49  |
>   |  192   | 256   | 949  |  738   | 2435 |  729   |  22.23  |  70     |
>   |  192   | 512   | 1235 |  854   | 1200 |  833   |  30.85  |  30.58  |
>   |  192   | 1024  | 1777 |  1084  | 1763 |  1051  |  38.99  |  40.39  |
>   |  192   | 2048  | 2574 |  1497  | 2592 |  1459  |  41.84  |  43.71  |
>   |  192   | 4096  | 4086 |  2317  | 4091 |  2244  |  43.29  |  45.14  |
>   |  192   | 8192  | 7481 |  4054  | 7505 |  3953  |  45.81  |  47.32  |
>   |  256   | 16    | 755  |  682   | 720  |  683   |   9.68  |   5.14  |
>   |  256   | 64    | 744  |  677   | 719  |  658   |   9     |   8.48  |
>   |  256   | 256   | 962  |  758   | 948  |  749   |  21.21  |  21     |
>   |  256   | 512   | 1297 |  862   | 1276 |  836   |  33.54  |  34.48  |
>   |  256   | 1024  | 1831 |  1114  | 1819 |  1095  |  39.16  |  39.8   |
>   |  256   | 2048  | 2767 |  1566  | 2715 |  1524  |  43.4   |  43.87  |
>   |  256   | 4096  | 4378 |  2382  | 4368 |  2354  |  45.6   |  46.11  |
>   |  256   | 8192  | 8075 |  4262  | 8080 |  4186  |  47.22  |  48.19  |
>   ----------------------------------------------------------------------
>
> This work was inspired by the AES GCM mode optimization published in
> Intel Optimized IPSEC Cryptographic library.
> https://github.com/intel/intel-ipsec-mb/blob/master/lib/avx512/gcm_vaes_avx512.asm
>
> Co-developed-by: Tomasz Kantecki <tomasz.kantecki@xxxxxxxxx>
> Signed-off-by: Tomasz Kantecki <tomasz.kantecki@xxxxxxxxx>
> Signed-off-by: Megha Dey <megha.dey@xxxxxxxxx>
> ---
>  arch/x86/crypto/Makefile                    |    1 +
>  arch/x86/crypto/aesni-intel_avx512-x86_64.S | 1788 +++++++++++++++++++++++++++
>  arch/x86/crypto/aesni-intel_glue.c          |   62 +-
>  crypto/Kconfig                              |   12 +
>  4 files changed, 1858 insertions(+), 5 deletions(-)
>  create mode 100644 arch/x86/crypto/aesni-intel_avx512-x86_64.S
>
...
> diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
> index 9e56cdf..8fc5bac 100644
> --- a/arch/x86/crypto/aesni-intel_glue.c
> +++ b/arch/x86/crypto/aesni-intel_glue.c
> @@ -55,13 +55,16 @@ MODULE_PARM_DESC(use_avx512, "Use AVX512 optimized algorithm, if available");
>   * This needs to be 16 byte aligned.
>   */
>  struct aesni_rfc4106_gcm_ctx {
> -       u8 hash_subkey[16] AESNI_ALIGN_ATTR;
> +       /* AVX512 optimized algorithms use 48 hash keys to conduct
> +        * multiple PCLMULQDQ operations in parallel
> +        */
> +       u8 hash_subkey[16 * 48] AESNI_ALIGN_ATTR;
>         struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
>         u8 nonce[4];
>  };
>
>  struct generic_gcmaes_ctx {
> -       u8 hash_subkey[16] AESNI_ALIGN_ATTR;
> +       u8 hash_subkey[16 * 48] AESNI_ALIGN_ATTR;
>         struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
>  };
>
> @@ -82,7 +85,7 @@ struct gcm_context_data {
>         u8 current_counter[GCM_BLOCK_LEN];
>         u64 partial_block_len;
>         u64 unused;
> -       u8 hash_keys[GCM_BLOCK_LEN * 16];
> +       u8 hash_keys[48 * 16];
>  };
>

This structure gets allocated on the stack, and gets inflated
significantly by this change, even though the code is not enabled by
default, and not even supported for most users.

Is it really necessary for this to be per-request data? If these are
precomputed powers of H, they can be moved into the TFM context
structure instead, which lives on the heap (and can be shared by all
concurrent users of the TFM)

>  asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
> @@ -266,6 +269,47 @@ static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
>         .finalize = &aesni_gcm_finalize_avx_gen2,
>  };
>
> +#ifdef CONFIG_CRYPTO_AES_GCM_AVX512
> +/*
> + * asmlinkage void aesni_gcm_init_avx_512()
> + * gcm_data *my_ctx_data, context data
> + * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
> + */
> +asmlinkage void aesni_gcm_init_avx_512(void *my_ctx_data,
> +                                      struct gcm_context_data *gdata,
> +                                      u8 *iv,
> +                                      u8 *hash_subkey,
> +                                      const u8 *aad,
> +                                      unsigned long aad_len);
> +asmlinkage void aesni_gcm_enc_update_avx_512(void *ctx,
> +                                            struct gcm_context_data *gdata,
> +                                            u8 *out,
> +                                            const u8 *in,
> +                                            unsigned long plaintext_len);
> +asmlinkage void aesni_gcm_dec_update_avx_512(void *ctx,
> +                                            struct gcm_context_data *gdata,
> +                                            u8 *out,
> +                                            const u8 *in,
> +                                            unsigned long ciphertext_len);
> +asmlinkage void aesni_gcm_finalize_avx_512(void *ctx,
> +                                          struct gcm_context_data *gdata,
> +                                          u8 *auth_tag,
> +                                          unsigned long auth_tag_len);
> +
> +asmlinkage void aes_gcm_precomp_avx_512(struct crypto_aes_ctx *ctx, u8 *hash_subkey);
> +
> +static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_512 = {
> +       .init = &aesni_gcm_init_avx_512,
> +       .enc_update = &aesni_gcm_enc_update_avx_512,
> +       .dec_update = &aesni_gcm_dec_update_avx_512,
> +       .finalize = &aesni_gcm_finalize_avx_512,
> +};
> +#else
> +static void aes_gcm_precomp_avx_512(struct crypto_aes_ctx *ctx, u8 *hash_subkey)
> +{}
> +static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_512 = {};
> +#endif
> +

Please drop the alternative dummy definitions.

>  /*
>   * asmlinkage void aesni_gcm_init_avx_gen4()
>   * gcm_data *my_ctx_data, context data
> @@ -669,7 +713,11 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
>         /* We want to cipher all zeros to create the hash sub key. */
>         memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
>
> -       aes_encrypt(&ctx, hash_subkey, hash_subkey);
> +       if (IS_ENABLED(CONFIG_CRYPTO_AES_GCM_AVX512) && use_avx512 &&
> +           cpu_feature_enabled(X86_FEATURE_VPCLMULQDQ))
> +               aes_gcm_precomp_avx_512(&ctx, hash_subkey);
> +       else
> +               aes_encrypt(&ctx, hash_subkey, hash_subkey);
>

I suppose this answers my question about the subkeys. Please find a
way to move these out of struct gcm_context_data so they don't need to
be copied to the stack for each request.

>         memzero_explicit(&ctx, sizeof(ctx));
>         return 0;
> @@ -1114,7 +1162,11 @@ static int __init aesni_init(void)
>         if (!x86_match_cpu(aesni_cpu_id))
>                 return -ENODEV;
>  #ifdef CONFIG_X86_64
> -       if (boot_cpu_has(X86_FEATURE_AVX2)) {
> +       if (use_avx512 && IS_ENABLED(CONFIG_CRYPTO_AES_GCM_AVX512) &&
> +           cpu_feature_enabled(X86_FEATURE_VPCLMULQDQ)) {
> +               pr_info("AVX512 version of gcm_enc/dec engaged.\n");
> +               aesni_gcm_tfm = &aesni_gcm_tfm_avx_512;

This was changed in the cryptodev tree to use static keys.

> +       } else if (boot_cpu_has(X86_FEATURE_AVX2)) {
>                 pr_info("AVX2 version of gcm_enc/dec engaged.\n");
>                 aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4;
>         } else if (boot_cpu_has(X86_FEATURE_AVX)) {
> diff --git a/crypto/Kconfig b/crypto/Kconfig
> index 3043849..8c8a68d 100644
> --- a/crypto/Kconfig
> +++ b/crypto/Kconfig
> @@ -661,6 +661,18 @@ config CRYPTO_AES_CTR_AVX512
>         depends on CRYPTO_AES_NI_INTEL
>         depends on AS_VAES_AVX512
>
> +# We default CRYPTO_AES_GCM_AVX512 to Y but depend on CRYPTO_AVX512 in
> +# order to have a singular option (CRYPTO_AVX512) select multiple algorithms
> +# when supported. Specifically, if the platform and/or toolset does not
> +# support VPLMULQDQ. Then this algorithm should not be supported as part of
> +# the set that CRYPTO_AVX512 selects.
> +config CRYPTO_AES_GCM_AVX512
> +       bool
> +       default y
> +       depends on CRYPTO_AVX512
> +       depends on CRYPTO_AES_NI_INTEL
> +       depends on AS_VPCLMULQDQ
> +
>  config CRYPTO_CRC32C_SPARC64
>         tristate "CRC32c CRC algorithm (SPARC64)"
>         depends on SPARC64
> --
> 2.7.4
>