On Sat, May 28, 2022 at 10:38 PM Sedat Dilek <sedat.dilek@xxxxxxxxx> wrote: > > On Sat, May 28, 2022 at 9:57 PM Ingo Molnar <mingo@xxxxxxxxxx> wrote: > > > > > > * Ingo Molnar <mingo@xxxxxxxxxx> wrote: > > > > > > > > * Jason A. Donenfeld <Jason@xxxxxxxxx> wrote: > > > > > > > On Mon, May 23, 2022 at 10:03:45AM -0600, Jens Axboe wrote: > > > > > clear_user() > > > > > 32 ~96MB/sec > > > > > 64 195MB/sec > > > > > 128 386MB/sec > > > > > 1k 2.7GB/sec > > > > > 4k 7.8GB/sec > > > > > 16k 14.8GB/sec > > > > > > > > > > copy_from_zero_page() > > > > > 32 ~96MB/sec > > > > > 64 193MB/sec > > > > > 128 383MB/sec > > > > > 1k 2.9GB/sec > > > > > 4k 9.8GB/sec > > > > > 16k 21.8GB/sec > > > > > > > > Just FYI, on x86, Samuel Neves proposed some nice clear_user() > > > > performance improvements that were forgotten about: > > > > > > > > https://lore.kernel.org/lkml/20210523180423.108087-1-sneves@xxxxxxxxx/ > > > > https://lore.kernel.org/lkml/Yk9yBcj78mpXOOLL@xxxxxxxxx/ > > > > > > > > Hoping somebody picks this up at some point... > > > > > > Those ~2x speedup numbers are indeed looking very nice: > > > > > > | After this patch, on a Skylake CPU, these are the > > > | before/after figures: > > > | > > > | $ dd if=/dev/zero of=/dev/null bs=1024k status=progress > > > | 94402248704 bytes (94 GB, 88 GiB) copied, 6 s, 15.7 GB/s > > > | > > > | $ dd if=/dev/zero of=/dev/null bs=1024k status=progress > > > | 446476320768 bytes (446 GB, 416 GiB) copied, 15 s, 29.8 GB/s > > > > > > Patch fell through the cracks & it doesn't apply anymore: > > > > > > checking file arch/x86/lib/usercopy_64.c > > > Hunk #2 FAILED at 17. > > > 1 out of 2 hunks FAILED > > > > > > Would be nice to re-send it. > > > > Turns out Boris just sent a competing optimization to clear_user() 3 days ago: > > > > https://lore.kernel.org/r/YozQZMyQ0NDdD8cH@xxxxxxx > > > > Thanks, > > > > [ CC Hugh ] > > I hope I adapted both patches from Hugh and Samuel against Linux v5.18 > correctly. 
> > As I have no "modern CPU" (I only have an Intel Sandy Bridge), the patch from > Hugh was not well suited to my machine (see the numbers below). > > Samuel's patch gave me a 15% speedup when running Hugh's dd test case > (I cannot say whether this is a meaningful benchmark). > > Patches and latest linux-config attached. > > *** Without patch > > root# cat /proc/version > Linux version 5.18.0-3-amd64-clang14-lto (sedat.dilek@xxxxxxxxx@iniza) > (dileks clang version 14.0.4 (https://github.com/llvm/llvm-project.git > 29f1039a7285a5c3a9c353d05 > 4140bf2556d4c4d), LLD 14.0.4) #3~bookworm+dileks1 SMP PREEMPT_DYNAMIC 2022-05-27 > > root# dd if=/dev/zero of=/dev/null bs=1M count=1M > 1048576+0 Datensätze ein > 1048576+0 Datensätze aus > 1099511627776 Bytes (1,1 TB, 1,0 TiB) kopiert, 97,18 s, 11,3 GB/s > > *** With hughd patch > > Patch: 0001-x86-usercopy-Use-alternatives-for-clear_user.patch > Link: https://lore.kernel.org/lkml/2f5ca5e4-e250-a41c-11fb-a7f4ebc7e1c9@xxxxxxxxxx/ > > root# cat /proc/version > Linux version 5.18.0-4-amd64-clang14-lto (sedat.dilek@xxxxxxxxx@iniza) > (dileks clang version 14.0.4 (https://github.com/llvm/llvm-project.git > 29f1039a7285a5c3a9c35> > > root# dd if=/dev/zero of=/dev/null bs=1M count=1M > 1048576+0 Datensätze ein > 1048576+0 Datensätze aus > 1099511627776 Bytes (1,1 TB, 1,0 TiB) kopiert, 588,053 s, 1,9 GB/s > > root# cat /proc/version > Linux version 5.18.0-4-amd64-clang14-lto (sedat.dilek@xxxxxxxxx@iniza) > (dileks clang version 14.0.4 (https://github.com/llvm/llvm-project.git > 29f1039a7285a5c3a9c353d05 > 4140bf2556d4c4d), LLD 14.0.4) #4~bookworm+dileks1 SMP PREEMPT_DYNAMIC 2022-05-28 > > *** With sneves patch > > Patch: 0001-x86-usercopy-speed-up-64-bit-__clear_user-with-stos-.patch > Link: https://lore.kernel.org/lkml/20210523180423.108087-1-sneves@xxxxxxxxx/ > > root# cat /proc/version > Linux version 5.18.0-5-amd64-clang14-lto (sedat.dilek@xxxxxxxxx@iniza) > (dileks clang version 14.0.4 (https://github.com/llvm/llvm-project.git > 
29f1039a7285a5c3a9c353d05 > 4140bf2556d4c4d), LLD 14.0.4) #5~bookworm+dileks1 SMP PREEMPT_DYNAMIC 2022-05-28 > > root# dd if=/dev/zero of=/dev/null bs=1M count=1M > 1048576+0 Datensätze ein > 1048576+0 Datensätze aus > 1099511627776 Bytes (1,1 TB, 1,0 TiB) kopiert, 82,697 s, 13,3 GB/s > > > -dileks // 28-May-2022 Now with attachments. -sed@-
From 98169b8804af661b485bbc5699825af8be22a125 Mon Sep 17 00:00:00 2001 From: Sedat Dilek <sedat.dilek@xxxxxxxxx> Date: Sat, 28 May 2022 12:42:00 +0200 Subject: [PATCH] x86/usercopy: Use alternatives for clear_user() From: Hugh Dickins <hughd@xxxxxxxxxx> Link: https://lore.kernel.org/lkml/2f5ca5e4-e250-a41c-11fb-a7f4ebc7e1c9@xxxxxxxxxx/ --- arch/x86/lib/copy_user_64.S | 26 ++++++++++++++++++++++++++ arch/x86/lib/usercopy_64.c | 34 +--------------------------------- 2 files changed, 27 insertions(+), 33 deletions(-) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 9dec1b38a98f..0194b6981c58 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -408,3 +408,29 @@ SYM_FUNC_START(__copy_user_nocache) _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy) SYM_FUNC_END(__copy_user_nocache) EXPORT_SYMBOL(__copy_user_nocache) + +/* + * Recent CPUs have added enhanced REP MOVSB/STOSB instructions. + * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled. + * Assume that's best for __clear_user(), until alternatives are provided + * (though would be better to avoid REP STOSB for short clears, if no FSRM). + * + * Input: + * rdi destination + * rsi count + * + * Output: + * rax uncopied bytes or 0 if successful. 
+ */ +SYM_FUNC_START(__clear_user) + ASM_STAC + movl %esi,%ecx + xorq %rax,%rax +1: rep stosb +2: movl %ecx,%eax + ASM_CLAC + ret + + _ASM_EXTABLE_UA(1b, 2b) +SYM_FUNC_END(__clear_user) +EXPORT_SYMBOL(__clear_user) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 0ae6cf804197..63a4329a7ddd 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -13,41 +13,9 @@ /* * Zero Userspace */ - -unsigned long __clear_user(void __user *addr, unsigned long size) -{ - long __d0; - might_fault(); - /* no memory constraint because it doesn't change any memory gcc knows - about */ - stac(); - asm volatile( - " testq %[size8],%[size8]\n" - " jz 4f\n" - " .align 16\n" - "0: movq $0,(%[dst])\n" - " addq $8,%[dst]\n" - " decl %%ecx ; jnz 0b\n" - "4: movq %[size1],%%rcx\n" - " testl %%ecx,%%ecx\n" - " jz 2f\n" - "1: movb $0,(%[dst])\n" - " incq %[dst]\n" - " decl %%ecx ; jnz 1b\n" - "2:\n" - - _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1]) - _ASM_EXTABLE_UA(1b, 2b) - - : [size8] "=&c"(size), [dst] "=&D" (__d0) - : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr)); - clac(); - return size; -} -EXPORT_SYMBOL(__clear_user); - unsigned long clear_user(void __user *to, unsigned long n) { + might_fault(); if (access_ok(to, n)) return __clear_user(to, n); return n; -- 2.36.1
From f9ce989826f283efc429be41caded7c885b8950c Mon Sep 17 00:00:00 2001 From: Sedat Dilek <sedat.dilek@xxxxxxxxx> Date: Sat, 28 May 2022 17:54:40 +0200 Subject: [PATCH] x86/usercopy: speed up 64-bit __clear_user() with stos{b,q} From: Samuel Neves <sneves@xxxxxxxxx> Link: https://lore.kernel.org/lkml/20210523180423.108087-1-sneves@xxxxxxxxx/ --- arch/x86/lib/usercopy_64.c | 61 +++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 0ae6cf804197..799db5536d25 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -9,6 +9,7 @@ #include <linux/export.h> #include <linux/uaccess.h> #include <linux/highmem.h> +#include <asm/alternative.h> /* * Zero Userspace @@ -16,31 +17,51 @@ unsigned long __clear_user(void __user *addr, unsigned long size) { - long __d0; + long __d0, __d1; might_fault(); /* no memory constraint because it doesn't change any memory gcc knows about */ stac(); asm volatile( - " testq %[size8],%[size8]\n" - " jz 4f\n" - " .align 16\n" - "0: movq $0,(%[dst])\n" - " addq $8,%[dst]\n" - " decl %%ecx ; jnz 0b\n" - "4: movq %[size1],%%rcx\n" - " testl %%ecx,%%ecx\n" - " jz 2f\n" - "1: movb $0,(%[dst])\n" - " incq %[dst]\n" - " decl %%ecx ; jnz 1b\n" - "2:\n" - - _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1]) - _ASM_EXTABLE_UA(1b, 2b) - - : [size8] "=&c"(size), [dst] "=&D" (__d0) - : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr)); + " cmp $256, %[size]\n" + " jae 3f\n" /* size >= 256 */ + " mov %k[size], %k[aux]\n" + " and $7, %k[aux]\n" + " shr $3, %[size]\n" + " jz 1f\n" /* size < 8 */ + ".align 16\n" + "0: movq %%rax,(%[dst])\n" + " add $8,%[dst]\n" + " dec %[size]; jnz 0b\n" + "1: mov %k[aux],%k[size]\n" + " test %k[aux], %k[aux]\n" + " jz 6f\n" + "2: movb %%al,(%[dst])\n" + " inc %[dst]\n" + " dec %k[size]; jnz 2b\n" + " jmp 6f\n" + "3: \n" + ALTERNATIVE( + "mov %k[size], %k[aux]\n" + "shr $3, %[size]\n" 
+ "and $7, %k[aux]\n" + "4: rep stosq\n" + "mov %k[aux], %k[size]\n", + "", + X86_FEATURE_ERMS + ) + "5: rep stosb\n" + "6: \n" + ".section .fixup,\"ax\"\n" + "7: lea 0(%[aux],%[size],8),%[size]\n" + " jmp 6b\n" + ".previous\n" + _ASM_EXTABLE_UA(0b, 7b) + _ASM_EXTABLE_UA(2b, 6b) + _ASM_EXTABLE_UA(4b, 7b) + _ASM_EXTABLE_UA(5b, 6b) + : [size] "=&c"(size), [dst] "=&D" (__d0), [aux] "=&r"(__d1) + : "[size]" (size), "[dst]"(addr), "a"(0)); clac(); return size; } -- 2.36.1
Attachment:
config-5.18.0-5-amd64-clang14-lto
Description: Binary data