Re: xor_blocks() assumptions

It was <2023-01-02 Mon 15:03> when Eric Biggers wrote:
> On Mon, Jan 02, 2023 at 11:44:35PM +0100, Lukasz Stelmach wrote:
>> I am researching the possibility of using xor_blocks() in crypto_xor()
>> and crypto_xor_cpy(). What I've found so far is that the different
>> architecture-dependent xor functions work on blocks between 16 and
>> 512 (Intel AVX) bytes long. There is a hint in the comment for
>> async_xor_offs() that src_cnt (as passed to do_sync_xor_offs()) counts
>> pages. Thus it is assumed that the smallest chunk xor_blocks() gets is
>> a single page. Am I right?
>> 
>> Do you think adding a block_len field to struct xor_block_template (and
>> maybe some information about the required alignment) and using it to
>> call do_2 from crypto_xor() might work? I am thinking especially about
>> disk encryption, where sectors of 512-4096 bytes are handled.
>> 
>
> Taking a step back, it sounds like you think the word-at-a-time XOR in
> crypto_xor() is not performant enough, so you want to use a SIMD (e.g. NEON,
> SSE, or AVX) implementation instead.

Yes.
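
To make the block_len idea from my earlier mail concrete, I am thinking
of something like this (block_len and align are hypothetical additions;
the remaining fields follow include/linux/raid/xor.h):

struct xor_block_template {
	struct xor_block_template *next;
	const char *name;
	int speed;
	unsigned int block_len;	/* hypothetical: smallest chunk do_N accepts */
	unsigned int align;	/* hypothetical: required buffer alignment */
	void (*do_2)(unsigned long, unsigned long * __restrict,
		     const unsigned long * __restrict);
	/* do_3 .. do_5 unchanged */
};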

> Have you tested that this would actually give a benefit on the input
> sizes in question,

--8<---------------cut here---------------start------------->8---
[    0.938006] xor: measuring software checksum speed
[    0.947383]    crypto          :  1052 MB/sec
[    0.953299]    arm4regs        :  1689 MB/sec
[    0.960674]    8regs           :  1352 MB/sec
[    0.968033]    32regs          :  1352 MB/sec
[    0.972078]    neon            :  2448 MB/sec
--8<---------------cut here---------------end--------------->8---

(Linux 6.2.0-rc1 running on Odroid XU3 board with Arm Cortex-A15)

The patch below copies __crypto_xor(), adapts it, and plugs it in as
xor_block_crypto.do_2; its results are labeled "crypto" above. The XOR
workload in disk encryption is comparable to RAID5 checksumming, so the
results above should be representative.
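
On top of that, the eventual crypto_xor() fast path could look roughly
like this (a sketch only: active_template is currently static in
crypto/xor.c and would have to be made visible somehow, and
block_len/align are the hypothetical fields from above):

void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
{
	struct xor_block_template *t = active_template;	/* not exported today */

	/* Use the calibrated template for large, suitably aligned buffers. */
	if (t && size >= t->block_len &&
	    IS_ALIGNED((unsigned long)dst | (unsigned long)src, t->align)) {
		unsigned int n = round_down(size, t->block_len);

		t->do_2(n, (unsigned long *)dst, (const unsigned long *)src);
		dst += n;
		src += n;
		size -= n;
	}

	/* Tail and small buffers keep using the existing code. */
	__crypto_xor(dst, dst, src, size);
}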

> especially considering that SIMD can only be used in the kernel if
> kernel_fpu_begin() is executed first?

That depends on the architecture. As far as I can tell,
kernel_fpu_begin() applies to Intel only.
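
On Arm, for instance, the NEON template handles the equivalent
(kernel_neon_begin()) internally and falls back to the integer routine
in interrupt context; from memory it looks roughly like this (a sketch,
not a verbatim quote of arch/arm/include/asm/xor.h):

static void
xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
	   const unsigned long * __restrict p2)
{
	if (in_interrupt()) {
		/* NEON is not usable here; use the plain ARM routine. */
		xor_arm4regs_2(bytes, p1, p2);
	} else {
		kernel_neon_begin();
		xor_block_neon_inner.do_2(bytes, p1, p2);
		kernel_neon_end();
	}
}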

> It also would be worth considering just optimizing crypto_xor() by
> unrolling the word-at-a-time loop to 4x or so.

If I understand correctly, the generic 8regs and 32regs implementations
in include/asm-generic/xor.h are what you mean. Using xor_blocks() in
crypto_xor() could enable them for free on architectures lacking SIMD or
vector instructions.
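
For comparison, I understand the unrolling suggestion as something like
the following (a hypothetical, untested sketch of the aligned word loop
only; the head/tail handling of the current __crypto_xor() would stay as
it is):

static void xor_words_unrolled4(unsigned long *dst,
				const unsigned long *src1,
				const unsigned long *src2,
				unsigned int words)
{
	/* 4x-unrolled word-at-a-time XOR; callers guarantee alignment. */
	while (words >= 4) {
		dst[0] = src1[0] ^ src2[0];
		dst[1] = src1[1] ^ src2[1];
		dst[2] = src1[2] ^ src2[2];
		dst[3] = src1[3] ^ src2[3];
		dst += 4;
		src1 += 4;
		src2 += 4;
		words -= 4;
	}
	while (words--)
		*dst++ = *src1++ ^ *src2++;
}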

---
diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h
index 934b549905f5..ffb67b3cbcfc 100644
--- a/arch/arm/include/asm/xor.h
+++ b/arch/arm/include/asm/xor.h
@@ -142,6 +142,7 @@ static struct xor_block_template xor_block_arm4regs = {
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES			\
 	do {					\
+		xor_speed(&xor_block_crypto);	\
 		xor_speed(&xor_block_arm4regs);	\
 		xor_speed(&xor_block_8regs);	\
 		xor_speed(&xor_block_32regs);	\
diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h
index 44509d48fca2..a04f27607c65 100644
--- a/include/asm-generic/xor.h
+++ b/include/asm-generic/xor.h
@@ -6,6 +6,85 @@
  */
 
 #include <linux/prefetch.h>
+#include <asm-generic/unaligned.h>
+
+/* Adapted from __crypto_xor() in crypto/algapi.c. */
+static void
+xor_crypto_2(unsigned long bytes, unsigned long * __restrict p1,
+	     const unsigned long * __restrict p2)
+{
+	u8 *dst = (u8 *)p1;
+	const u8 *src1 = (const u8 *)p1;
+	const u8 *src2 = (const u8 *)p2;
+	unsigned int len = bytes;
+	int relalign = 0;
+
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+		int size = sizeof(unsigned long);
+		int d = (((unsigned long)dst ^ (unsigned long)src1) |
+			 ((unsigned long)dst ^ (unsigned long)src2)) &
+			(size - 1);
+
+		relalign = d ? 1 << __ffs(d) : size;
+
+		/*
+		 * If we care about alignment, process as many bytes as
+		 * needed to advance dst and src to values whose alignments
+		 * equal their relative alignment. This will allow us to
+		 * process the remainder of the input using optimal strides.
+		 */
+		while (((unsigned long)dst & (relalign - 1)) && len > 0) {
+			*dst++ = *src1++ ^ *src2++;
+			len--;
+		}
+	}
+
+	while (IS_ENABLED(CONFIG_64BIT) && len >= 8 && !(relalign & 7)) {
+		if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+			u64 l = get_unaligned((u64 *)src1) ^
+				get_unaligned((u64 *)src2);
+			put_unaligned(l, (u64 *)dst);
+		} else {
+			*(u64 *)dst = *(u64 *)src1 ^ *(u64 *)src2;
+		}
+		dst += 8;
+		src1 += 8;
+		src2 += 8;
+		len -= 8;
+	}
+
+	while (len >= 4 && !(relalign & 3)) {
+		if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+			u32 l = get_unaligned((u32 *)src1) ^
+				get_unaligned((u32 *)src2);
+			put_unaligned(l, (u32 *)dst);
+		} else {
+			*(u32 *)dst = *(u32 *)src1 ^ *(u32 *)src2;
+		}
+		dst += 4;
+		src1 += 4;
+		src2 += 4;
+		len -= 4;
+	}
+
+	while (len >= 2 && !(relalign & 1)) {
+		if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+			u16 l = get_unaligned((u16 *)src1) ^
+				get_unaligned((u16 *)src2);
+			put_unaligned(l, (u16 *)dst);
+		} else {
+			*(u16 *)dst = *(u16 *)src1 ^ *(u16 *)src2;
+		}
+		dst += 2;
+		src1 += 2;
+		src2 += 2;
+		len -= 2;
+	}
+
+	while (len--)
+		*dst++ = *src1++ ^ *src2++;
+}
+
 
 static void
 xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1,
@@ -697,6 +776,14 @@ xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
 		goto once_more;
 }
 
+static struct xor_block_template xor_block_crypto = {
+	.name = "crypto",
+	.do_2 = xor_crypto_2,
+	.do_3 = NULL,	/* multi-source variants not implemented */
+	.do_4 = NULL,
+	.do_5 = NULL,
+};
+
 static struct xor_block_template xor_block_8regs = {
 	.name = "8regs",
 	.do_2 = xor_8regs_2,
-- 
Łukasz Stelmach
Samsung R&D Institute Poland
Samsung Electronics
