On Wed, Jul 21, 2021 at 11:45 AM Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> I can do the mutual alignment too, but I'd actually prefer to do it as
> a separate patch, for when there are numbers for that.
>
> And I wouldn't do it as a byte-by-byte case, because that's just stupid.

Here's that "try to align one of the pointers in order to avoid the
lots-of-unaligned case" patch.

It's not quite as simple, and the generated assembly isn't quite as
obvious. But it still generates code that looks good, it's just that
the code to align the first pointer ends up being a bit harder to read.

And since it's a bit less obvious, the "this is probably buggy because
I didn't actually _test_ it" warning holds even more. But you can see
how much simpler the code still is than the horrendous glibc mess.

And I removed the "CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS" checking,
because now it should be at least *somewhat* reasonable even on
machines that have a complicated "get_unaligned()".

But maybe I should have kept it. Only testing will tell.

Again: UNTESTED GARBAGE ATTACHED.

Be careful. But it might work, and it generates something that looks
superficially reasonable.

Gcc again:

  memcmp:
        cmpq    $7, %rdx
        jbe     .L56
        movq    (%rsi), %rax
        cmpq    %rax, (%rdi)
        je      .L61
  .L55:
        xorl    %ecx, %ecx
        jmp     .L58
  .L62:
        addq    $1, %rcx
        cmpq    %rcx, %rdx
        je      .L51
  .L58:
        movzbl  (%rdi,%rcx), %eax
        movzbl  (%rsi,%rcx), %r8d
        subl    %r8d, %eax
        je      .L62
  .L51:
        ret
  .L56:
        testq   %rdx, %rdx
        jne     .L55
        xorl    %eax, %eax
        ret
  .L61:
        movq    %rdi, %rcx
        movl    $8, %eax
        andl    $7, %ecx
        subq    %rcx, %rax
        leaq    -8(%rcx,%rdx), %rdx
        addq    %rax, %rdi
        addq    %rax, %rsi
        cmpq    $7, %rdx
        ja      .L57
        jmp     .L56
  .L63:
        subq    $8, %rdx
        addq    $8, %rdi
        addq    $8, %rsi
        cmpq    $7, %rdx
        jbe     .L56
  .L57:
        movq    (%rsi), %rax
        cmpq    %rax, (%rdi)
        je      .L63
        jmp     .L55

but clang is similar (except clang isn't as eager to move basic blocks
around, so it's visually very different).

Note no spills, no odd shifts for unaligned accesses, no garbage.

Again: untested, so consider this a starting point rather than
anything good and proper.

                Linus
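[A minimal sketch of just the alignment arithmetic used in the patch
below, since that is the one non-obvious line. It assumes a 64-bit
machine where sizeof(unsigned long) is 8; the concrete addresses are
made up purely for illustration.]

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
          /*
           * offset = sizeof(long) - (addr & (sizeof(long)-1)) is the
           * distance from 'addr' up to the next 8-byte boundary, or a
           * full 8 when 'addr' is already aligned.  Either way the
           * skipped bytes were already covered by the initial
           * get_unaligned() comparison of the first word.
           */
          uintptr_t addr;

          addr = 0x1003;          /* misaligned by 3 */
          assert(sizeof(unsigned long) - ((sizeof(unsigned long)-1) & addr) == 5);

          addr = 0x1008;          /* already aligned */
          assert(sizeof(unsigned long) - ((sizeof(unsigned long)-1) & addr) == 8);

          return 0;
  }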
 lib/string.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/lib/string.c b/lib/string.c
index 77bd0b1d3296..3eb390fc4f73 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -29,6 +29,7 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 
+#include <asm/unaligned.h>
 #include <asm/byteorder.h>
 #include <asm/word-at-a-time.h>
 #include <asm/page.h>
@@ -935,6 +936,31 @@ __visible int memcmp(const void *cs, const void *ct, size_t count)
 	const unsigned char *su1, *su2;
 	int res = 0;
 
+	if (count >= sizeof(unsigned long)) {
+		const unsigned long *u1 = cs;
+		const unsigned long *u2 = ct;
+		unsigned long offset;
+
+		if (get_unaligned(u1) != get_unaligned(u2))
+			goto bytewise;
+
+		/* Align 'u1' up */
+		offset = sizeof(*u1) - ((sizeof(*u1)-1) & (unsigned long)(u1));
+		u1 = cs + offset;
+		u2 = ct + offset;
+		count -= offset;
+
+		while (count >= sizeof(unsigned long)) {
+			if (*u1 != get_unaligned(u2))
+				break;
+			u1++;
+			u2++;
+			count -= sizeof(unsigned long);
+		}
+		cs = u1;
+		ct = u2;
+	}
+bytewise:
 	for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--)
 		if ((res = *su1 - *su2) != 0)
 			break;
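[Since the patch is explicitly untested, a rough userspace harness
along the following lines could be used to sanity-check the logic.
It transplants the patched memcmp() into a standalone test_memcmp(),
with a memcpy-based load standing in for the kernel's get_unaligned(),
and compares its sign against the libc memcmp() over a range of
misalignments and lengths. It relies on gcc-style void-pointer
arithmetic and should be built with -fno-strict-aliasing like the
kernel; this is a sketch of how one might test the patch, not
something that has actually been run against it.]

  #include <stdio.h>
  #include <string.h>

  /* memcpy-based stand-in for the kernel's get_unaligned() */
  static unsigned long get_ua(const void *p)
  {
          unsigned long v;
          memcpy(&v, p, sizeof(v));
          return v;
  }

  /* The patched memcmp(), transplanted to userspace. */
  static int test_memcmp(const void *cs, const void *ct, size_t count)
  {
          const unsigned char *su1, *su2;
          int res = 0;

          if (count >= sizeof(unsigned long)) {
                  const unsigned long *u1 = cs;
                  const unsigned long *u2 = ct;
                  unsigned long offset;

                  if (get_ua(u1) != get_ua(u2))
                          goto bytewise;

                  /* Align 'u1' up */
                  offset = sizeof(*u1) - ((sizeof(*u1)-1) & (unsigned long)(u1));
                  u1 = cs + offset;
                  u2 = ct + offset;
                  count -= offset;

                  while (count >= sizeof(unsigned long)) {
                          if (*u1 != get_ua(u2))
                                  break;
                          u1++;
                          u2++;
                          count -= sizeof(unsigned long);
                  }
                  cs = u1;
                  ct = u2;
          }
  bytewise:
          for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--)
                  if ((res = *su1 - *su2) != 0)
                          break;
          return res;
  }

  static int sign(int x) { return (x > 0) - (x < 0); }

  int main(void)
  {
          static unsigned char a[256], b[256];
          size_t o1, o2, len;
          int bad = 0;

          memset(a, 0x55, sizeof(a));
          memset(b, 0x55, sizeof(b));

          /*
           * Walk all small misalignments and a range of lengths, both
           * with identical buffers and with a mismatch planted in the
           * last byte, and compare the sign against libc memcmp().
           */
          for (o1 = 0; o1 < 8; o1++)
                  for (o2 = 0; o2 < 8; o2++)
                          for (len = 0; len < 128; len++) {
                                  if (sign(test_memcmp(a+o1, b+o2, len)) !=
                                      sign(memcmp(a+o1, b+o2, len)))
                                          bad++;
                                  if (!len)
                                          continue;
                                  b[o2+len-1] ^= 0x80;
                                  if (sign(test_memcmp(a+o1, b+o2, len)) !=
                                      sign(memcmp(a+o1, b+o2, len)))
                                          bad++;
                                  b[o2+len-1] ^= 0x80;
                          }
          printf("%d mismatches\n", bad);
          return bad != 0;
  }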