Use a static const unsigned int for the limit of the number of bytes processed between kernel_fpu_begin() and kernel_fpu_end() rather than using the SZ_4K macro (which is a signed value), or a magic value of 4096U embedded in the C code. Use unsigned int rather than size_t for some of the arguments to avoid typecasting for the min() macro. Signed-off-by: Robert Elliott <elliott@xxxxxxx> --- v3 use static int rather than macro, change to while loops rather than do/while loops --- arch/x86/crypto/nhpoly1305-avx2-glue.c | 11 +++++--- arch/x86/crypto/nhpoly1305-sse2-glue.c | 11 +++++--- arch/x86/crypto/poly1305_glue.c | 37 +++++++++++++++++--------- arch/x86/crypto/polyval-clmulni_glue.c | 8 ++++-- 4 files changed, 46 insertions(+), 21 deletions(-) diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c index 8ea5ab0f1ca7..f7dc9c563bb5 100644 --- a/arch/x86/crypto/nhpoly1305-avx2-glue.c +++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c @@ -13,6 +13,9 @@ #include <linux/sizes.h> #include <asm/simd.h> +/* avoid kernel_fpu_begin/end scheduler/rcu stalls */ +static const unsigned int bytes_per_fpu = 337 * 1024; + asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len, u8 hash[NH_HASH_BYTES]); @@ -26,18 +29,20 @@ static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len, static int nhpoly1305_avx2_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { + BUILD_BUG_ON(bytes_per_fpu == 0); + if (srclen < 64 || !crypto_simd_usable()) return crypto_nhpoly1305_update(desc, src, srclen); - do { - unsigned int n = min_t(unsigned int, srclen, SZ_4K); + while (srclen) { + unsigned int n = min(srclen, bytes_per_fpu); kernel_fpu_begin(); crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2); kernel_fpu_end(); src += n; srclen -= n; - } while (srclen); + } return 0; } diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c index 2b353d42ed13..daffcc7019ad 100644 --- a/arch/x86/crypto/nhpoly1305-sse2-glue.c +++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c @@ -13,6 +13,9 @@ #include <linux/sizes.h> #include <asm/simd.h> +/* avoid kernel_fpu_begin/end scheduler/rcu stalls */ +static const unsigned int bytes_per_fpu = 199 * 1024; + asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len, u8 hash[NH_HASH_BYTES]); @@ -26,18 +29,20 @@ static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len, static int nhpoly1305_sse2_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { + BUILD_BUG_ON(bytes_per_fpu == 0); + if (srclen < 64 || !crypto_simd_usable()) return crypto_nhpoly1305_update(desc, src, srclen); - do { - unsigned int n = min_t(unsigned int, srclen, SZ_4K); + while (srclen) { + unsigned int n = min(srclen, bytes_per_fpu); kernel_fpu_begin(); crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2); kernel_fpu_end(); src += n; srclen -= n; - } while (srclen); + } return 0; } diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 1dfb8af48a3c..16831c036d71 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -15,20 +15,27 @@ #include <asm/intel-family.h> #include <asm/simd.h> +#define POLY1305_BLOCK_SIZE_MASK (~(POLY1305_BLOCK_SIZE - 1)) + +/* avoid kernel_fpu_begin/end scheduler/rcu stalls */ +static const unsigned int bytes_per_fpu = 217 * 1024; + asmlinkage void poly1305_init_x86_64(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]); asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, - const size_t len, const u32 padbit); + const unsigned int len, + const u32 padbit); asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]); asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]); -asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, - const u32 padbit); +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, + const unsigned int len, const u32 padbit); +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, + const unsigned int len, const u32 padbit); asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, - const size_t len, const u32 padbit); + const unsigned int len, + const u32 padbit); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); @@ -86,14 +93,12 @@ static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]) poly1305_init_x86_64(ctx, key); } -static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, +static void poly1305_simd_blocks(void *ctx, const u8 *inp, unsigned int len, const u32 padbit) { struct poly1305_arch_internal *state = ctx; - /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || - SZ_4K % POLY1305_BLOCK_SIZE); + BUILD_BUG_ON(bytes_per_fpu < POLY1305_BLOCK_SIZE); if (!static_branch_likely(&poly1305_use_avx) || (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || @@ -103,8 +108,14 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, return; } - do { - const size_t bytes = min_t(size_t, len, SZ_4K); + while (len) { + unsigned int bytes; + + if (len < POLY1305_BLOCK_SIZE) + bytes = len; + else + bytes = min(len, + bytes_per_fpu & POLY1305_BLOCK_SIZE_MASK); kernel_fpu_begin(); if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) @@ -117,7 +128,7 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, len -= bytes; inp += bytes; - } while (len); + } } static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c index b7664d018851..de1c908f7412 100644 --- a/arch/x86/crypto/polyval-clmulni_glue.c +++ b/arch/x86/crypto/polyval-clmulni_glue.c @@ -29,6 +29,9 @@ #define NUM_KEY_POWERS 8 +/* avoid kernel_fpu_begin/end scheduler/rcu stalls */ +static const unsigned int bytes_per_fpu = 393 * 1024; + struct polyval_tfm_ctx { /* * These powers must be in the order h^8, ..., h^1. @@ -107,6 +110,8 @@ static int polyval_x86_update(struct shash_desc *desc, unsigned int nblocks; unsigned int n; + BUILD_BUG_ON(bytes_per_fpu < POLYVAL_BLOCK_SIZE); + if (dctx->bytes) { n = min(srclen, dctx->bytes); pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; @@ -123,8 +128,7 @@ static int polyval_x86_update(struct shash_desc *desc, } while (srclen >= POLYVAL_BLOCK_SIZE) { - /* Allow rescheduling every 4K bytes. */ - nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; + nblocks = min(srclen, bytes_per_fpu) / POLYVAL_BLOCK_SIZE; internal_polyval_update(tctx, src, nblocks, dctx->buffer); srclen -= nblocks * POLYVAL_BLOCK_SIZE; src += nblocks * POLYVAL_BLOCK_SIZE; -- 2.38.1