On Sat, 16 Apr 2022, Borislav Petkov wrote:

> On Fri, Apr 15, 2022 at 03:10:51PM -0700, Linus Torvalds wrote:
> > Adding PeterZ and Borislav (who seem to be the last ones to have
> > worked on the copy and clear_page stuff respectively) and the x86
> > maintainers in case somebody gets the urge to just fix this.
>
> I guess if enough people ask and keep asking, some people at least try
> to move...
>
> > Because memory clearing should be faster than copying, and the thing
> > that makes copying fast is that FSRM and ERMS logic (the whole
> > "manually unrolled copy" is hopefully mostly a thing of the past and
> > we can consider it legacy)
>
> So I did give it a look and it seems to me, if we want to do the
> alternatives thing here, it will have to look something like
> arch/x86/lib/copy_user_64.S.
>
> I.e., the current __clear_user() will have to become the "handle_tail"
> thing there which deals with uncopied rest-bytes at the end and the new
> fsrm/erms/rep_good variants will then be alternative_call_2 or _3.
>
> The fsrm thing will have only the handle_tail thing at the end when size
> != 0.
>
> The others - erms and rep_good - will have to check for sizes smaller
> than, say a cacheline, and for those call the handle_tail thing directly
> instead of going into a REP loop. The current __clear_user() is still a
> lot better than that copy_user_generic_unrolled() abomination. And it's
> not like old CPUs would get any perf penalty - they'll simply use the
> same code.
>
> And then you need the labels for _ASM_EXTABLE_UA() exception handling.
>
> Anyway, something along those lines.
>
> And then we'll need to benchmark this on a bunch of current machines to
> make sure there's no funny surprises, perf-wise.
>
> I can get cracking on this but I would advise people not to hold their
> breaths. :)
>
> Unless someone has a better idea or is itching to get hands dirty
> her-/himself.

I've done a skeleton implementation of an alternative __clear_user()
based on CPU features.

It has three versions of __clear_user():
  o __clear_user_original() - similar to the 'standard' __clear_user()
  o __clear_user_rep_good() - uses 'rep stos{q,b}' when the CPU has 'rep_good'
  o __clear_user_erms()     - uses 'rep stosb' when the CPU has 'erms'

Not claiming the implementation is ideal, but it might be a useful
starting point for someone.
Patch is against 5.18.0-rc2.
Only basic sanity testing done.

Simple performance testing done for large sizes, on a system (Intel E8400)
which has rep_good but not erms;
  # dd if=/dev/zero of=/dev/null bs=16384 count=10000
  o *_original() - ~14.2GB/s.  Same as the 'standard' __clear_user().
  o *_rep_good() - same throughput as *_original().
  o *_erms()     - ~12.2GB/s (expected, as this system lacks erms).

No performance testing done for zeroing small sizes.
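
For anyone following along, here is a rough user-space C sketch of the
8-byte/1-byte split that both the "handle_tail" idea above and the
*_original()/*_rep_good() variants rely on.  It is illustration only, not
kernel code (the name zero_qwords_then_tail() is made up), and it leaves
out STAC/CLAC and the exception-table handling:

  /*
   * Zero 'len' bytes at 'dst': bulk in 8-byte stores (cf. 'shrq $3,%rcx'
   * plus 'movq $0,(%rdi)' or 'rep stosq'), then finish the remaining
   * 0-7 "tail" bytes one at a time.
   */
  static unsigned long zero_qwords_then_tail(unsigned char *dst, unsigned long len)
  {
          unsigned long qwords = len >> 3;        /* number of 8-byte stores */
          unsigned long tail   = len & 7;         /* 0..7 left-over bytes */

          while (qwords--) {
                  *(unsigned long *)dst = 0;
                  dst += 8;
          }
          while (tail--)
                  *dst++ = 0;

          return 0;                               /* bytes not zeroed */
  }

The erms variant skips the split entirely and lets 'rep stosb' cover the
whole range.
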
Cheers,
   Mark

Signed-off-by: Mark Hemment <markhemm@xxxxxxxxxxxxxx>
---
 arch/x86/include/asm/asm.h        |  39 +++++++++++++++
 arch/x86/include/asm/uaccess_64.h |  36 ++++++++++++++
 arch/x86/lib/clear_page_64.S      | 100 ++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/usercopy_64.c        |  32 ------------
 4 files changed, 175 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index fbcfec4dc4cc..373ed6be7a8d 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -132,6 +132,35 @@
 
 /* Exception table entry */
 #ifdef __ASSEMBLY__
+# define UNDEFINE_EXTABLE_TYPE_REG				\
+	.purgem extable_type_reg ;
+
+# define DEFINE_EXTABLE_TYPE_REG				\
+	.macro extable_type_reg type:req reg:req ;		\
+	.set .Lfound, 0 ;					\
+	.set .Lregnr, 0 ;					\
+	.irp rs,rax,rcx,rdx,rbx,rsp,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13,	\
+	    r14,r15 ;						\
+	.ifc \reg, %\rs ;					\
+	.set .Lfound, .Lfound+1 ;				\
+	.long \type + (.Lregnr << 8) ;				\
+	.endif ;						\
+	.set .Lregnr, .Lregnr+1 ;				\
+	.endr ;							\
+	.set .Lregnr, 0 ;					\
+	.irp rs,eax,ecx,edx,ebx,esp,ebp,esi,edi,r8d,r9d,r10d,r11d,r12d,	\
+	    r13d,r14d,r15d ;					\
+	.ifc \reg, %\rs ;					\
+	.set .Lfound, .Lfound+1 ;				\
+	.long \type + (.Lregnr << 8) ;				\
+	.endif ;						\
+	.set .Lregnr, .Lregnr+1 ;				\
+	.endr ;							\
+	.if (.Lfound != 1) ;					\
+	.error "extable_type_reg: bad register argument" ;	\
+	.endif ;						\
+	.endm ;
+
 # define _ASM_EXTABLE_TYPE(from, to, type)			\
 	.pushsection "__ex_table","a" ;				\
 	.balign 4 ;						\
@@ -140,6 +169,16 @@
 	.long type ;						\
 	.popsection
 
+# define _ASM_EXTABLE_TYPE_REG(from, to, type1, reg1)		\
+	.pushsection "__ex_table","a" ;				\
+	.balign 4 ;						\
+	.long (from) - . ;					\
+	.long (to) - . ;					\
+	DEFINE_EXTABLE_TYPE_REG					\
+	extable_type_reg reg=reg1, type=type1 ;			\
+	UNDEFINE_EXTABLE_TYPE_REG				\
+	.popsection
+
 # ifdef CONFIG_KPROBES
 #  define _ASM_NOKPROBE(entry)					\
 	.pushsection "_kprobe_blacklist","aw" ;			\
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 45697e04d771..6a4995e4cfae 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -79,4 +79,40 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
 	kasan_check_write(dst, size);
 	return __copy_user_flushcache(dst, src, size);
 }
+
+/*
+ * Zero Userspace.
+ */
+
+__must_check unsigned long
+clear_user_original(void __user *addr, unsigned long len);
+__must_check unsigned long
+clear_user_rep_good(void __user *addr, unsigned long len);
+__must_check unsigned long
+clear_user_erms(void __user *addr, unsigned long len);
+
+static __always_inline __must_check unsigned long
+___clear_user(void __user *addr, unsigned long len)
+{
+	unsigned long	ret;
+
+	/*
+	 * No memory constraint because it doesn't change any memory gcc
+	 * knows about.
+	 */
+
+	might_fault();
+	alternative_call_2(
+		clear_user_original,
+		clear_user_rep_good,
+		X86_FEATURE_REP_GOOD,
+		clear_user_erms,
+		X86_FEATURE_ERMS,
+		ASM_OUTPUT2("=a" (ret), "=D" (addr), "=c" (len)),
+		"1" (addr), "2" (len)
+		: "%rdx", "cc");
+	return ret;
+}
+
+#define __clear_user(d, n)	___clear_user(d, n)
 #endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index fe59b8ac4fcc..abe1f44ea422 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 #include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
 #include <asm/export.h>
 
 /*
@@ -50,3 +52,101 @@ SYM_FUNC_START(clear_page_erms)
 	RET
 SYM_FUNC_END(clear_page_erms)
 EXPORT_SYMBOL_GPL(clear_page_erms)
+
+/*
+ * Default clear user-space.
+ * Input:
+ * rdi destination
+ * rcx count
+ *
+ * Output:
+ * rax uncopied bytes or 0 if successful.
+ */
+
+SYM_FUNC_START(clear_user_original)
+	ASM_STAC
+	movq %rcx,%rax
+	shrq $3,%rcx
+	andq $7,%rax
+	testq %rcx,%rcx
+	jz 1f
+
+	.p2align 4
+0:	movq $0,(%rdi)
+	leaq 8(%rdi),%rdi
+	decq %rcx
+	jnz 0b
+
+1:	movq %rax,%rcx
+	testq %rcx,%rcx
+	jz 3f
+
+2:	movb $0,(%rdi)
+	incq %rdi
+	decl %ecx
+	jnz 2b
+
+3:	ASM_CLAC
+	movq %rcx,%rax
+	RET
+
+	_ASM_EXTABLE_TYPE_REG(0b, 3b, EX_TYPE_UCOPY_LEN8, %rax)
+	_ASM_EXTABLE_UA(2b, 3b)
+SYM_FUNC_END(clear_user_original)
+EXPORT_SYMBOL(clear_user_original)
+
+/*
+ * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
+ * present.
+ * Input:
+ * rdi destination
+ * rcx count
+ *
+ * Output:
+ * rax uncopied bytes or 0 if successful.
+ */
+
+SYM_FUNC_START(clear_user_rep_good)
+	ASM_STAC
+	movq %rcx,%rdx
+	xorq %rax,%rax
+	shrq $3,%rcx
+	andq $7,%rdx
+
+0:	rep stosq
+	movq %rdx,%rcx
+
+1:	rep stosb
+
+3:	ASM_CLAC
+	movq %rcx,%rax
+	RET
+
+	_ASM_EXTABLE_TYPE_REG(0b, 3b, EX_TYPE_UCOPY_LEN8, %rdx)
+	_ASM_EXTABLE_UA(1b, 3b)
+SYM_FUNC_END(clear_user_rep_good)
+EXPORT_SYMBOL(clear_user_rep_good)
+
+/*
+ * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present.
+ * Input:
+ * rdi destination
+ * rcx count
+ *
+ * Output:
+ * rax uncopied bytes or 0 if successful.
+ */
+
+SYM_FUNC_START(clear_user_erms)
+	xorq %rax,%rax
+	ASM_STAC
+
+0:	rep stosb
+
+3:	ASM_CLAC
+	movq %rcx,%rax
+	RET
+
+	_ASM_EXTABLE_UA(0b, 3b)
+SYM_FUNC_END(clear_user_erms)
+EXPORT_SYMBOL(clear_user_erms)
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0402a749f3a0..3a2872c9c4a9 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -14,38 +14,6 @@
  * Zero Userspace
  */
 
-unsigned long __clear_user(void __user *addr, unsigned long size)
-{
-	long __d0;
-	might_fault();
-	/* no memory constraint because it doesn't change any memory gcc knows
-	   about */
-	stac();
-	asm volatile(
-		"	testq  %[size8],%[size8]\n"
-		"	jz     4f\n"
-		"	.align 16\n"
-		"0:	movq $0,(%[dst])\n"
-		"	addq   $8,%[dst]\n"
-		"	decl %%ecx ; jnz   0b\n"
-		"4:	movq  %[size1],%%rcx\n"
-		"	testl %%ecx,%%ecx\n"
-		"	jz     2f\n"
-		"1:	movb   $0,(%[dst])\n"
-		"	incq   %[dst]\n"
-		"	decl %%ecx ; jnz  1b\n"
-		"2:\n"
-
-		_ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1])
-		_ASM_EXTABLE_UA(1b, 2b)
-
-		: [size8] "=&c"(size), [dst] "=&D" (__d0)
-		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
-	clac();
-	return size;
-}
-EXPORT_SYMBOL(__clear_user);
-
 unsigned long clear_user(void __user *to, unsigned long n)
 {
 	if (access_ok(to, n))
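
(Not part of the patch, but as a reminder of the return convention the
variants above preserve: clear_user()/__clear_user() return the number of
bytes that could not be zeroed, 0 on success.  A hypothetical caller -
example_read() is made up for illustration - would look like this:)

  #include <linux/fs.h>
  #include <linux/uaccess.h>

  /* Zero-fill a user buffer; fail with -EFAULT if any of it is unwritable. */
  static ssize_t example_read(struct file *file, char __user *buf,
                              size_t count, loff_t *ppos)
  {
          if (clear_user(buf, count))
                  return -EFAULT;
          return count;
  }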