The crc64 table lookup method is inefficient, using a significant number of CPU cycles in the block stack per IO. If available on x86, use a PCLMULQDQ implementation to accelerate the calculation. The assembly from this patch was mostly generated by gcc from a C program using library functions provided by x86 intrinsics, and measures ~20x faster than the table lookup. Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx> --- arch/x86/crypto/Makefile | 3 + arch/x86/crypto/crc64-rocksoft-pcl-asm.S | 215 ++++++++++++++++++++++ arch/x86/crypto/crc64-rocksoft-pcl_glue.c | 117 ++++++++++++ crypto/Kconfig | 11 ++ 4 files changed, 346 insertions(+) create mode 100644 arch/x86/crypto/crc64-rocksoft-pcl-asm.S create mode 100644 arch/x86/crypto/crc64-rocksoft-pcl_glue.c diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index c3af959648e6..036520c59f0e 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -79,6 +79,9 @@ crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o +obj-$(CONFIG_CRYPTO_CRC64_ROCKSOFT_PCLMUL) += crc64-rocksoft-pclmul.o +crc64-rocksoft-pclmul-y := crc64-rocksoft-pcl-asm.o crc64-rocksoft-pcl_glue.o + obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o targets += poly1305-x86_64-cryptogams.S diff --git a/arch/x86/crypto/crc64-rocksoft-pcl-asm.S b/arch/x86/crypto/crc64-rocksoft-pcl-asm.S new file mode 100644 index 000000000000..e3b633a776a9 --- /dev/null +++ b/arch/x86/crypto/crc64-rocksoft-pcl-asm.S @@ -0,0 +1,215 @@ +######################################################################## +# Implement fast Rocksoft CRC-64 computation with SSE and PCLMULQDQ instructions +# + +#include <linux/linkage.h> + +SYM_FUNC_START(crc_rocksoft_pcl) + leaq (%rsi,%rdx), %rcx + movq %rsi, %r10 + andl $15, %esi + movq %rdi, %xmm3 + leaq 
15(%rcx), %rax + andq $-16, %r10 + pxor %xmm1, %xmm1 + andq $-16, %rax + movdqa %xmm1, %xmm5 + movq %rax, %r8 + subq %r10, %rax + subq %rcx, %r8 + movl $16, %ecx + movq %rax, %r11 + movq %rcx, %r9 + sarq $4, %r11 + subq %rsi, %r9 + movdqu shuffleMasks(%r9), %xmm4 + movdqa %xmm4, %xmm0 + pblendvb %xmm0, (%r10), %xmm5 + cmpq $16, %rax + je .L12 + movdqa 16(%r10), %xmm2 + cmpq $2, %r11 + je .L13 + pcmpeqd %xmm1, %xmm1 + leaq -16(%rsi,%rdx), %rdi + leaq 16(%r10), %r9 + pxor %xmm1, %xmm4 + movdqa %xmm3, %xmm1 + pshufb %xmm0, %xmm3 + pshufb %xmm4, %xmm1 + movdqa %xmm3, %xmm0 + movdqa .LC0(%rip), %xmm3 + pxor %xmm5, %xmm1 + movdqa %xmm1, %xmm4 + pclmulqdq $0, %xmm3, %xmm1 + pclmulqdq $17, %xmm3, %xmm4 + pxor %xmm4, %xmm1 + pxor %xmm1, %xmm0 + cmpq $31, %rdi + jbe .L6 + leaq -32(%rdi), %rax + movq %rax, %rsi + andq $-16, %rax + leaq 32(%r10,%rax), %rcx + shrq $4, %rsi + movq %r9, %rax + .p2align 4,,10 + .p2align 3 +.L7: + pxor %xmm2, %xmm0 + movq %rax, %rdx + addq $16, %rax + movdqa %xmm0, %xmm1 + pclmulqdq $0, %xmm3, %xmm0 + movdqa 16(%rdx), %xmm2 + pclmulqdq $17, %xmm3, %xmm1 + pxor %xmm1, %xmm0 + cmpq %rcx, %rax + jne .L7 + movq %rsi, %rax + addq $1, %rsi + negq %rax + salq $4, %rsi + salq $4, %rax + addq %rsi, %r9 + leaq -16(%rdi,%rax), %rdi +.L6: + pxor %xmm2, %xmm0 + cmpq $16, %rdi + je .L9 + movl $16, %eax + pcmpeqd %xmm2, %xmm2 + movdqa %xmm0, %xmm7 + subq %r8, %rax + movdqu shuffleMasks(%rax), %xmm4 + pxor %xmm4, %xmm2 + pshufb %xmm4, %xmm0 + movdqa 16(%r9), %xmm4 + pshufb %xmm2, %xmm7 + pshufb %xmm2, %xmm4 + movdqa %xmm7, %xmm1 + movdqa %xmm4, %xmm2 + movdqa %xmm7, %xmm4 + pclmulqdq $0, %xmm3, %xmm1 + pclmulqdq $17, %xmm3, %xmm4 + por %xmm2, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm1, %xmm0 +.L9: + movdqa %xmm0, %xmm2 + pclmulqdq $16, %xmm3, %xmm0 + psrldq $8, %xmm2 + pxor %xmm2, %xmm0 +.L3: + movdqa .LC1(%rip), %xmm2 + movdqa %xmm0, %xmm1 + pclmulqdq $0, %xmm2, %xmm1 + movdqa %xmm1, %xmm3 + pclmulqdq $16, %xmm2, %xmm1 + pslldq $8, %xmm3 + pxor %xmm3, %xmm1 + pxor 
%xmm1, %xmm0 + pextrd $3, %xmm0, %eax + salq $32, %rax + movq %rax, %rdx + pextrd $2, %xmm0, %eax + orq %rdx, %rax + notq %rax + ret + .p2align 4,,10 + .p2align 3 +.L13: + subq %r8, %rcx + pcmpeqd %xmm1, %xmm1 + movdqu shuffleMasks(%rcx), %xmm7 + movdqa %xmm7, %xmm6 + pxor %xmm1, %xmm6 + cmpq $7, %rdx + ja .L5 + movdqa %xmm1, %xmm4 + pshufb %xmm7, %xmm5 + movdqa %xmm3, %xmm1 + movdqu shuffleMasks(%rdx), %xmm8 + pshufb %xmm6, %xmm2 + pxor %xmm8, %xmm4 + pxor %xmm5, %xmm2 + pshufb %xmm8, %xmm3 + pshufb %xmm4, %xmm1 + movdqa %xmm3, %xmm0 + pxor %xmm1, %xmm2 + pslldq $8, %xmm0 + movdqa %xmm2, %xmm3 + pclmulqdq $16, .LC0(%rip), %xmm2 + psrldq $8, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + jmp .L3 + .p2align 4,,10 + .p2align 3 +.L12: + movdqu shuffleMasks(%rdx), %xmm2 + subq %r8, %rcx + movdqa %xmm3, %xmm6 + pcmpeqd %xmm4, %xmm4 + movdqa %xmm2, %xmm0 + pshufb %xmm2, %xmm3 + movdqu shuffleMasks(%rcx), %xmm2 + pxor %xmm4, %xmm0 + pslldq $8, %xmm3 + pxor %xmm4, %xmm2 + pshufb %xmm0, %xmm6 + pshufb %xmm2, %xmm5 + movdqa %xmm5, %xmm1 + pxor %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + pclmulqdq $16, .LC0(%rip), %xmm1 + psrldq $8, %xmm0 + pxor %xmm3, %xmm0 + pxor %xmm1, %xmm0 + jmp .L3 + .p2align 4,,10 + .p2align 3 +.L5: + pxor %xmm1, %xmm4 + movdqa %xmm3, %xmm1 + pshufb %xmm0, %xmm3 + pshufb %xmm4, %xmm1 + pxor %xmm3, %xmm2 + movdqa .LC0(%rip), %xmm3 + pxor %xmm5, %xmm1 + pshufb %xmm6, %xmm2 + movdqa %xmm1, %xmm5 + pshufb %xmm7, %xmm1 + pshufb %xmm6, %xmm5 + pxor %xmm2, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm5, %xmm0 + pclmulqdq $17, %xmm3, %xmm0 + pclmulqdq $0, %xmm3, %xmm4 + pxor %xmm0, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm1, %xmm0 + pclmulqdq $16, %xmm3, %xmm1 + psrldq $8, %xmm0 + pxor %xmm1, %xmm0 + jmp .L3 +SYM_FUNC_END(crc_rocksoft_pcl) + +.section .rodata +.align 32 +.type shuffleMasks, @object +.size shuffleMasks, 32 +shuffleMasks: + .string "" + .ascii "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\217\216\215" + .ascii 
"\214\213\212\211\210\207\206\205\204\203\202\201\200" + +.section .rodata.cst16,"aM",@progbits,16 +.align 16 +.LC0: + .quad -1523270018343381984 + .quad 2443614144669557164 + .align 16 +.LC1: + .quad 2876949357237608311 + .quad 3808117099328934763 diff --git a/arch/x86/crypto/crc64-rocksoft-pcl_glue.c b/arch/x86/crypto/crc64-rocksoft-pcl_glue.c new file mode 100644 index 000000000000..996780aa3d93 --- /dev/null +++ b/arch/x86/crypto/crc64-rocksoft-pcl_glue.c @@ -0,0 +1,117 @@ +#include <linux/types.h> +#include <linux/module.h> +#include <linux/crc64.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <asm/cpufeatures.h> +#include <asm/cpu_device_id.h> +#include <asm/simd.h> + +asmlinkage u64 crc_rocksoft_pcl(u64 init_crc, const u8 *buf, size_t len); + +struct chksum_desc_ctx { + u64 crc; +}; + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = 0; + + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + if (length >= 16 && crypto_simd_usable()) { + kernel_fpu_begin(); + ctx->crc = crc_rocksoft_pcl(ctx->crc, data, length); + kernel_fpu_end(); + } else + ctx->crc = crc64_rocksoft_generic(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + *(u64 *)out = ctx->crc; + return 0; +} + +static int __chksum_finup(u64 crc, const u8 *data, unsigned int len, u8 *out) +{ + if (len >= 16 && crypto_simd_usable()) { + kernel_fpu_begin(); + *(u64 *)out = crc_rocksoft_pcl(crc, data, len); + kernel_fpu_end(); + } else + *(u64 *)out = crc64_rocksoft_generic(crc, data, len); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + 
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(ctx->crc, data, len, out); + } + + static int chksum_digest(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) + { + return __chksum_finup(0, data, length, out); + } + + static struct shash_alg alg = { + .digestsize = 8, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = CRC64_ROCKSOFT_STRING, + .cra_driver_name = "crc64-rocksoft-pclmul", + .cra_priority = 200, + .cra_blocksize = 1, + .cra_module = THIS_MODULE, + } + }; + + static const struct x86_cpu_id crc64_rocksoft_cpu_id[] = { + X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), + {} + }; + MODULE_DEVICE_TABLE(x86cpu, crc64_rocksoft_cpu_id); + + static int __init crc64_rocksoft_x86_mod_init(void) + { + if (!x86_match_cpu(crc64_rocksoft_cpu_id)) + return -ENODEV; + + return crypto_register_shash(&alg); + } + + static void __exit crc64_rocksoft_x86_mod_fini(void) + { + crypto_unregister_shash(&alg); + } + + module_init(crc64_rocksoft_x86_mod_init); + module_exit(crc64_rocksoft_x86_mod_fini); + + MODULE_AUTHOR("Keith Busch <kbusch@xxxxxxxxxx>"); + MODULE_DESCRIPTION("Rocksoft CRC64 calculation accelerated with PCLMULQDQ."); + MODULE_LICENSE("GPL"); + MODULE_ALIAS_CRYPTO("crc64-rocksoft-pclmul"); diff --git a/crypto/Kconfig b/crypto/Kconfig index e343147b9f8f..d8861138f117 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -744,6 +744,17 @@ config CRYPTO_CRC64_ROCKSOFT transform. This allows for faster crc64 transforms to be used if they are available. +config CRYPTO_CRC64_ROCKSOFT_PCLMUL + tristate "Rocksoft model CRC64 PCLMULQDQ hardware acceleration" + depends on X86 && 64BIT && CRC64 + select CRYPTO_HASH + help + For x86_64 processors with SSE4.2 and PCLMULQDQ supported, + CRC64 computation can be hardware accelerated using the PCLMULQDQ + instruction. 
This option will create the 'crc64-rocksoft-pclmul' + module, which computes the crc64 checksum faster than + the generic table implementation. + config CRYPTO_VPMSUM_TESTER tristate "Powerpc64 vpmsum hardware acceleration tester" depends on CRYPTO_CRCT10DIF_VPMSUM && CRYPTO_CRC32C_VPMSUM -- 2.25.4