It was <2023-01-02 Mon 15:03>, when Eric Biggers wrote:
> On Mon, Jan 02, 2023 at 11:44:35PM +0100, Lukasz Stelmach wrote:
>> I am researching the possibility of using xor_blocks() in crypto_xor()
>> and crypto_xor_cpy(). What I've found so far is that the different
>> architecture-dependent xor functions work on blocks between 16 and 512
>> (Intel AVX) bytes long. There is a hint in the comment for
>> async_xor_offs() that src_cnt (as passed to do_sync_xor_offs()) counts
>> pages. Thus, it is assumed that the smallest chunk xor_blocks() gets is
>> a single page. Am I right?
>>
>> Do you think adding a block_len field to struct xor_block_template (and
>> maybe some information about required alignment) and using it to call
>> do_2 from crypto_xor() may work? I am thinking especially about disk
>> encryption, where sectors of 512-4096 bytes are handled.
>>
>
> Taking a step back, it sounds like you think the word-at-a-time XOR in
> crypto_xor() is not performant enough, so you want to use a SIMD (e.g.
> NEON, SSE, or AVX) implementation instead.

Yes.

> Have you tested that this would actually give a benefit on the input
> sizes in question,

--8<---------------cut here---------------start------------->8---
[    0.938006] xor: measuring software checksum speed
[    0.947383]    crypto    : 1052 MB/sec
[    0.953299]    arm4regs  : 1689 MB/sec
[    0.960674]    8regs     : 1352 MB/sec
[    0.968033]    32regs    : 1352 MB/sec
[    0.972078]    neon      : 2448 MB/sec
--8<---------------cut here---------------end--------------->8---

(Linux 6.2.0-rc1 running on an Odroid XU3 board with an Arm Cortex-A15)

The patch below copies, adapts, and plugs in __crypto_xor() as
xor_block_crypto.do_2. You can see its results labeled "crypto" above.
Disk encryption is comparable to RAID5 checksumming, so the results
above should be representative.

> especially considering that SIMD can only be used in the kernel if
> kernel_fpu_begin() is executed first?

That depends on the architecture. As far as I can tell, this applies to
Intel only.

> It also would be worth considering just optimizing crypto_xor() by
> unrolling the word-at-a-time loop to 4x or so.

If I understand correctly, the generic 8regs and 32regs implementations
in include/asm-generic/xor.h are what you mean. Using xor_blocks() in
crypto_xor() could enable them for free on architectures lacking SIMD
or vector instructions.

---
diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h
index 934b549905f5..ffb67b3cbcfc 100644
--- a/arch/arm/include/asm/xor.h
+++ b/arch/arm/include/asm/xor.h
@@ -142,6 +142,7 @@ static struct xor_block_template xor_block_arm4regs = {
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES                       \
         do {                                    \
+                xor_speed(&xor_block_crypto);   \
                 xor_speed(&xor_block_arm4regs); \
                 xor_speed(&xor_block_8regs);    \
                 xor_speed(&xor_block_32regs);   \
diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h
index 44509d48fca2..a04f27607c65 100644
--- a/include/asm-generic/xor.h
+++ b/include/asm-generic/xor.h
@@ -6,6 +6,85 @@
  */
 
 #include <linux/prefetch.h>
+#include <asm-generic/unaligned.h>
+
+/* A.K.A. __crypto_xor() */
+static void
+xor_crypto_2(unsigned long bytes, unsigned long * __restrict p1,
+             const unsigned long * __restrict p2)
+{
+        u8 *dst = (u8 *)p1;
+        u8 *src1 = (u8 *)p1;
+        u8 *src2 = (u8 *)p2;
+        unsigned int len = bytes;
+        int relalign = 0;
+
+        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+                int size = sizeof(unsigned long);
+                int d = (((unsigned long)dst ^ (unsigned long)src1) |
+                         ((unsigned long)dst ^ (unsigned long)src2)) &
+                        (size - 1);
+
+                relalign = d ? 1 << __ffs(d) : size;
+
+                /*
+                 * If we care about alignment, process as many bytes as
+                 * needed to advance dst and src to values whose alignments
+                 * equal their relative alignment. This will allow us to
+                 * process the remainder of the input using optimal strides.
+                 */
+                while (((unsigned long)dst & (relalign - 1)) && len > 0) {
+                        *dst++ = *src1++ ^ *src2++;
+                        len--;
+                }
+        }
+
+        while (IS_ENABLED(CONFIG_64BIT) && len >= 8 && !(relalign & 7)) {
+                if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+                        u64 l = get_unaligned((u64 *)src1) ^
+                                get_unaligned((u64 *)src2);
+                        put_unaligned(l, (u64 *)dst);
+                } else {
+                        *(u64 *)dst = *(u64 *)src1 ^ *(u64 *)src2;
+                }
+                dst += 8;
+                src1 += 8;
+                src2 += 8;
+                len -= 8;
+        }
+
+        while (len >= 4 && !(relalign & 3)) {
+                if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+                        u32 l = get_unaligned((u32 *)src1) ^
+                                get_unaligned((u32 *)src2);
+                        put_unaligned(l, (u32 *)dst);
+                } else {
+                        *(u32 *)dst = *(u32 *)src1 ^ *(u32 *)src2;
+                }
+                dst += 4;
+                src1 += 4;
+                src2 += 4;
+                len -= 4;
+        }
+
+        while (len >= 2 && !(relalign & 1)) {
+                if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+                        u16 l = get_unaligned((u16 *)src1) ^
+                                get_unaligned((u16 *)src2);
+                        put_unaligned(l, (u16 *)dst);
+                } else {
+                        *(u16 *)dst = *(u16 *)src1 ^ *(u16 *)src2;
+                }
+                dst += 2;
+                src1 += 2;
+                src2 += 2;
+                len -= 2;
+        }
+
+        while (len--)
+                *dst++ = *src1++ ^ *src2++;
+}
+
 static void
 xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1,
@@ -697,6 +776,14 @@ xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
         goto once_more;
 }
 
+static struct xor_block_template xor_block_crypto = {
+        .name = "crypto",
+        .do_2 = xor_crypto_2,
+        .do_3 = NULL,
+        .do_4 = NULL,
+        .do_5 = NULL,
+};
+
 static struct xor_block_template xor_block_8regs = {
         .name = "8regs",
         .do_2 = xor_8regs_2,
--
Łukasz Stelmach
Samsung R&D Institute Poland
Samsung Electronics
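
PS. To make the "call xor_blocks() from crypto_xor()" idea more
concrete, below is a rough sketch of the dispatch I have in mind. It is
not part of the patch above: the CRYPTO_XOR_BLOCKS_MIN threshold and the
crypto_xor_sketch() name are made up, and the fast path assumes the
length is a multiple of the selected template's working block, which is
exactly the information the proposed block_len field would have to
expose.

/*
 * Rough sketch only.  Hand large, word-aligned requests to
 * xor_blocks() so the template selected at boot (arm4regs, neon,
 * AVX, ...) does the work, and keep the existing generic path as the
 * fallback for everything else.  Assumes the xor templates have been
 * calibrated (crypto/xor.c).
 */
#include <linux/kernel.h>
#include <linux/raid/xor.h>     /* xor_blocks() */
#include <crypto/algapi.h>      /* __crypto_xor() */

#define CRYPTO_XOR_BLOCKS_MIN   512     /* hypothetical cut-over point */

static inline void crypto_xor_sketch(u8 *dst, const u8 *src,
                                     unsigned int size)
{
        if (size >= CRYPTO_XOR_BLOCKS_MIN &&
            IS_ALIGNED((unsigned long)dst | (unsigned long)src,
                       sizeof(unsigned long))) {
                void *srcs[1] = { (void *)src };

                /* dst ^= src for `size' bytes via the fastest template */
                xor_blocks(1, size, dst, srcs);
                return;
        }

        /* small or unaligned buffers: current word/byte-at-a-time loop */
        __crypto_xor(dst, dst, src, size);
}

For dm-crypt style workloads (512-4096 byte sectors) the fast path
would be taken almost always, while callers that XOR a single cipher
block at a time keep the current behaviour.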