From: Eric Biggers <ebiggers@xxxxxxxxxx>

crc32_be was previously unoptimized on x86.  Optimize it using the new
template.  This improves performance by over 25x in some cases.

Benchmark results on AMD Ryzen 9 9950X (Zen 5) using crc_kunit:

	Length     Before        After
	------     ------        -----
	     1     389 MB/s      325 MB/s
	    16    2845 MB/s     2911 MB/s
	    64    3012 MB/s     6513 MB/s
	   127    2567 MB/s     9057 MB/s
	   128    3048 MB/s    11589 MB/s
	   200    3070 MB/s    14042 MB/s
	   256    3067 MB/s    20454 MB/s
	   511    2938 MB/s    26245 MB/s
	   512    3081 MB/s    36926 MB/s
	  1024    3090 MB/s    61914 MB/s
	  3173    3065 MB/s    76201 MB/s
	  4096    3084 MB/s    82547 MB/s
	 16384    3084 MB/s    89333 MB/s

Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx>
---
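[Reviewer note, not part of the commit: the msb-first constants added
below can be sanity-checked without running gen-crc-consts.py.  The
following standalone Python sketch (hypothetical, written for this
note; the script in scripts/crc/ remains the authoritative generator)
redoes the underlying math by plain polynomial long division over
GF(2): each fold constant is x^N mod G(x), and the Barrett constant is
floor(x^96 / G(x)) with the leading x^64 term dropped, exactly as the
comments in the struct state.

G = 0x104c11db7  # G(x) for msb-first CRC-32, including the x^32 term

def xpow_mod_g(n):
    """Return x^n mod G(x) in GF(2)[x], as a 32-bit value."""
    r = 1  # r = x^0
    for _ in range(n):
        r <<= 1            # multiply by x
        if r & (1 << 32):  # degree hit 32, so
            r ^= G         #   reduce mod G(x)
    return r

def xpow_div_g(n):
    """Return floor(x^n / G(x)) in GF(2)[x] (polynomial long division)."""
    q, r = 0, 1
    for _ in range(n):
        q <<= 1
        r <<= 1
        if r & (1 << 32):
            q |= 1
            r ^= G
    return q

for dist in (2048, 1024, 512, 256, 128):
    print(f'x^({dist}+0)  mod G(x) = {xpow_mod_g(dist):#x}')
    print(f'x^({dist}+64) mod G(x) = {xpow_mod_g(dist + 64):#x}')
print(f'floor(x^96 / G(x)) - x^64 = {xpow_div_g(96) ^ (1 << 64):#x}')

The printed values should reproduce the comments in
crc32_msb_0x04c11db7_consts below, e.g. 0xe8a45605 for
x^(128+0) mod G(x) and 0x04d101df481b4e5a for the Barrett constant.]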
 arch/x86/lib/crc-pclmul-consts.h | 49 +++++++++++++++++++++++++++++++-
 arch/x86/lib/crc32-glue.c        |  4 +++
 arch/x86/lib/crc32-pclmul.S      |  1 +
 3 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h
index c3ca689eae3b8..f8af6e9278c83 100644
--- a/arch/x86/lib/crc-pclmul-consts.h
+++ b/arch/x86/lib/crc-pclmul-consts.h
@@ -1,10 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * CRC constants generated by:
  *
- *	./scripts/crc/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320
+ *	./scripts/crc/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_msb_0x04c11db7
  *
  * Do not edit manually.
  */
 
 /*
@@ -97,5 +97,52 @@ static const struct {
 		0xb4e5b025f7011641,	/* floor(x^95 / G(x)) */
 		0x1db710641,	/* G(x) */
 	},
 	.extract_crc_mask = {0, 0xffffffff},
 };
+
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ *        x^5 + x^4 + x^2 + x + 1
+ */
+static const struct {
+	u8 bswap_mask[16];
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+} crc32_msb_0x04c11db7_consts __cacheline_aligned __maybe_unused = {
+	.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+	.fold_across_2048_bits_consts = {
+		0x88fe2237,	/* x^(2048+0) mod G(x) */
+		0xcbcf3bcb,	/* x^(2048+64) mod G(x) */
+	},
+	.fold_across_1024_bits_consts = {
+		0x567fddeb,	/* x^(1024+0) mod G(x) */
+		0x10bd4d7c,	/* x^(1024+64) mod G(x) */
+	},
+	.fold_across_512_bits_consts = {
+		0xe6228b11,	/* x^(512+0) mod G(x) */
+		0x8833794c,	/* x^(512+64) mod G(x) */
+	},
+	.fold_across_256_bits_consts = {
+		0x75be46b7,	/* x^(256+0) mod G(x) */
+		0x569700e5,	/* x^(256+64) mod G(x) */
+	},
+	.fold_across_128_bits_consts = {
+		0xe8a45605,	/* x^(128+0) mod G(x) */
+		0xc5b9cd4c,	/* x^(128+64) mod G(x) */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0x04d101df481b4e5a,	/* floor(x^96 / G(x)) - x^64 */
+		0x104c11db7,	/* G(x) */
+	},
+};
diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
index afcdeee429664..326261e503b42 100644
--- a/arch/x86/lib/crc32-glue.c
+++ b/arch/x86/lib/crc32-glue.c
@@ -18,10 +18,11 @@ static DEFINE_STATIC_KEY_FALSE(have_crc32);
 static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 
 DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
+DECLARE_CRC_PCLMUL_FUNCS(crc32_msb, u32);
 
 u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 {
 	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
 		   have_pclmulqdq, IS_ENABLED(CONFIG_CRC32_SLICEBY8));
@@ -69,10 +70,12 @@ u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len)
 }
 EXPORT_SYMBOL(crc32c_le_arch);
 
 u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
 {
+	CRC_PCLMUL(crc, p, len, crc32_msb, crc32_msb_0x04c11db7_consts,
+		   have_pclmulqdq, IS_ENABLED(CONFIG_CRC32_SLICEBY8));
 	return crc32_be_base(crc, p, len);
 }
 EXPORT_SYMBOL(crc32_be_arch);
 
 static int __init crc32_x86_init(void)
@@ -80,10 +83,11 @@ static int __init crc32_x86_init(void)
 	if (boot_cpu_has(X86_FEATURE_XMM4_2))
 		static_branch_enable(&have_crc32);
 	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
 		static_branch_enable(&have_pclmulqdq);
 		INIT_CRC_PCLMUL(crc32_lsb);
+		INIT_CRC_PCLMUL(crc32_msb);
 	}
 	return 0;
 }
 arch_initcall(crc32_x86_init);
diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S
index cf07d571ae864..d562944211d4d 100644
--- a/arch/x86/lib/crc32-pclmul.S
+++ b/arch/x86/lib/crc32-pclmul.S
@@ -2,5 +2,6 @@
 // Copyright 2024 Google LLC
 
 #include "crc-pclmul-template.S"
 
 DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
+DEFINE_CRC_PCLMUL_FUNCS(crc32_msb, /* bits= */ 32, /* lsb= */ 0)
-- 
2.47.0