As done by the ECB and CBC helpers in arch/x86/crypto/ecb_cbc_helpers.h,
limit the number of bytes processed between kernel_fpu_begin() and
kernel_fpu_end() calls.

Those functions call preempt_disable() and preempt_enable(), so the CPU
core is unavailable for scheduling while the SIMD code runs, leading to
RCU stalls like:

    rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
    12-... } 22 jiffies s: 277 root: 0x1/.

Fixes: 78c37d191dd6 ("crypto: crc32 - add crc32 pclmulqdq implementation and wrappers for table implementation")
Fixes: 6a8ce1ef3940 ("crypto: crc32c - Optimize CRC32C calculation with PCLMULQDQ instruction")
Fixes: 0b95a7f85718 ("crypto: crct10dif - Glue code to cast accelerated CRCT10DIF assembly as a crypto transform")
Suggested-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
Signed-off-by: Robert Elliott <elliott@xxxxxxx>
---
 arch/x86/crypto/crc32-pclmul_glue.c     | 18 ++++++++++----
 arch/x86/crypto/crc32c-intel_glue.c     | 32 ++++++++++++++++++++-----
 arch/x86/crypto/crct10dif-pclmul_glue.c | 32 ++++++++++++++++++++-----
 3 files changed, 66 insertions(+), 16 deletions(-)

diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 288200fe7b4e..7cf65dc726c4 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -49,6 +49,8 @@
 #define SCALE_F			16L	/* size of xmm register */
 #define SCALE_F_MASK		(SCALE_F - 1)
 
+#define FPU_BYTES	4096U	/* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
 u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);
 
 static u32 __attribute__((pure))
@@ -57,6 +59,7 @@ static u32 __attribute__((pure))
 	unsigned int iquotient;
 	unsigned int iremainder;
 	unsigned int prealign;
+	unsigned int chunk;
 
 	if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !crypto_simd_usable())
 		return crc32_le(crc, p, len);
@@ -73,12 +76,19 @@ static u32 __attribute__((pure))
 	iquotient = len & (~SCALE_F_MASK);
 	iremainder = len & SCALE_F_MASK;
 
-	kernel_fpu_begin();
-	crc = crc32_pclmul_le_16(p, iquotient, crc);
-	kernel_fpu_end();
+	do {
+		chunk = min(iquotient, FPU_BYTES);
+		iquotient -= chunk;
+
+		kernel_fpu_begin();
+		crc = crc32_pclmul_le_16(p, chunk, crc);
+		kernel_fpu_end();
+
+		p += chunk;
+	} while (iquotient);
 
 	if (iremainder)
-		crc = crc32_le(crc, p + iquotient, iremainder);
+		crc = crc32_le(crc, p, iremainder);
 
 	return crc;
 }
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index c5c965b694c6..b277c215f0fb 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -44,6 +44,8 @@
  */
 #define CRC32C_PCL_BREAKEVEN	512
 
+#define FPU_BYTES	4096U	/* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
 asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
 				unsigned int crc_init);
 #endif /* CONFIG_X86_64 */
@@ -155,15 +157,23 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
 			       unsigned int len)
 {
 	u32 *crcp = shash_desc_ctx(desc);
+	unsigned int chunk;
 
 	/*
 	 * use faster PCL version if datasize is large enough to
 	 * overcome kernel fpu state save/restore overhead
 	 */
 	if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		*crcp = crc_pcl(data, len, *crcp);
-		kernel_fpu_end();
+		do {
+			chunk = min(len, FPU_BYTES);
+			len -= chunk;
+
+			kernel_fpu_begin();
+			*crcp = crc_pcl(data, chunk, *crcp);
+			kernel_fpu_end();
+
+			data += chunk;
+		} while (len);
 	} else
 		*crcp = crc32c_intel_le_hw(*crcp, data, len);
 	return 0;
@@ -172,10 +182,20 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
 static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
 				u8 *out)
 {
+	unsigned int chunk;
+
 	if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
-		kernel_fpu_end();
+		do {
+			chunk = min(len, FPU_BYTES);
+			len -= chunk;
+
+			kernel_fpu_begin();
+			*crcp = crc_pcl(data, chunk, *crcp);
+			kernel_fpu_end();
+
+			data += chunk;
+		} while (len);
+		*(__le32 *)out = ~cpu_to_le32(*crcp);
 	} else
 		*(__le32 *)out =
 			~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index 7c5a32282d51..bcd362df6b62 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -36,6 +36,8 @@
 #include <asm/cpu_device_id.h>
 #include <asm/simd.h>
 
+#define FPU_BYTES	4096U	/* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
 asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
 
 struct chksum_desc_ctx {
@@ -55,11 +57,19 @@ static int chksum_update(struct shash_desc *desc, const u8 *data,
 			 unsigned int length)
 {
 	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+	unsigned int chunk;
 
 	if (length >= 16 && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
-		kernel_fpu_end();
+		do {
+			chunk = min(length, FPU_BYTES);
+			length -= chunk;
+
+			kernel_fpu_begin();
+			ctx->crc = crc_t10dif_pcl(ctx->crc, data, chunk);
+			kernel_fpu_end();
+
+			data += chunk;
+		} while (length);
 	} else
 		ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
 	return 0;
@@ -75,10 +85,20 @@ static int chksum_final(struct shash_desc *desc, u8 *out)
 
 static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
 {
+	unsigned int chunk;
+
 	if (len >= 16 && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		*(__u16 *)out = crc_t10dif_pcl(crc, data, len);
-		kernel_fpu_end();
+		do {
+			chunk = min(len, FPU_BYTES);
+			len -= chunk;
+
+			kernel_fpu_begin();
+			crc = crc_t10dif_pcl(crc, data, chunk);
+			kernel_fpu_end();
+
+			data += chunk;
+		} while (len);
+		*(__u16 *)out = crc;
 	} else
 		*(__u16 *)out = crc_t10dif_generic(crc, data, len);
 	return 0;
-- 
2.37.3
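
P.S. For anyone who wants to sanity-check the chunking arithmetic outside
the kernel, below is a minimal userspace model of the loop shape added
above. It is only an illustration: kernel_fpu_begin()/kernel_fpu_end()
are empty stubs, fake_crc() (an order-dependent byte accumulator) stands
in for the real PCLMULQDQ routines, and MIN() replaces the kernel's
min(); everything except FPU_BYTES and the loop structure is made up for
the test.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define FPU_BYTES 4096U	/* max bytes per kernel_fpu_begin/end section */

/* Userspace stand-in for the kernel macro; the patch uses min(). */
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Empty stubs for the FPU-context helpers being rate-limited. */
static void kernel_fpu_begin(void) { }
static void kernel_fpu_end(void) { }

/*
 * Stand-in for the PCLMULQDQ CRC routines. Any order-dependent
 * accumulator works for checking that the chunked walk visits every
 * byte exactly once, in order.
 */
static unsigned int fake_crc(const unsigned char *p, unsigned int len,
			     unsigned int acc)
{
	while (len--)
		acc = acc * 31 + *p++;
	return acc;
}

/* Same shape as the loops added in the patch; assumes len > 0. */
static unsigned int chunked_update(const unsigned char *data,
				   unsigned int len, unsigned int acc)
{
	unsigned int chunk;

	do {
		chunk = MIN(len, FPU_BYTES);
		len -= chunk;

		kernel_fpu_begin();
		acc = fake_crc(data, chunk, acc);
		kernel_fpu_end();

		data += chunk;
	} while (len);
	return acc;
}

int main(void)
{
	static unsigned char buf[3 * FPU_BYTES + 123];
	unsigned int i;

	for (i = 0; i < sizeof(buf); i++)
		buf[i] = (unsigned char)i;

	/* The chunked walk must match one pass over the whole buffer. */
	assert(chunked_update(buf, sizeof(buf), 0) ==
	       fake_crc(buf, sizeof(buf), 0));
	printf("chunked walk matches single pass\n");
	return 0;
}

The do/while mirrors the patch: these paths are only entered when the
length is nonzero (guarded by the PCLMUL_MIN_LEN, CRC32C_PCL_BREAKEVEN,
and >= 16 checks), so no zero-length guard is needed.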