On Sat, Apr 16, 2022 at 03:07:47PM +0100, Mark Hemment wrote: > I've done a skeleton implementation of alternative __clear_user() based on > CPU features. Cool! Just a couple of quick notes - more indepth look next week. > It has three versions of __clear_user(); > o __clear_user_original() - similar to the 'standard' __clear_user() > o __clear_user_rep_good() - using resp stos{qb} when CPU has 'rep_good' > o __clear_user_erms() - using 'resp stosb' when CPU has 'erms' you also need a _fsrm() one which checks X86_FEATURE_FSRM. That one should simply do rep; stosb regardless of the size. For that you can define an alternative_call_3 similar to how the _2 variant is defined. > Not claiming the implementation is ideal, but might be a useful starting > point for someone. > Patch is against 5.18.0-rc2. > Only basic sanity testing done. > > Simple performance testing done for large sizes, on a system (Intel E8400) > which has rep_good but not erms; > # dd if=/dev/zero of=/dev/null bs=16384 count=10000 > o *_original() - ~14.2GB/s. Same as the 'standard' __clear_user(). > o *_rep_good() - same throughput as *_original(). > o *_erms() - ~12.2GB/s (expected on a system without erms). Right. I have a couple of boxes too - I can run the benchmarks on them too so we should have enough perf data points eventually. > diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h > index fbcfec4dc4cc..373ed6be7a8d 100644 > --- a/arch/x86/include/asm/asm.h > +++ b/arch/x86/include/asm/asm.h > @@ -132,6 +132,35 @@ > /* Exception table entry */ > #ifdef __ASSEMBLY__ > > +# define UNDEFINE_EXTABLE_TYPE_REG \ > + .purgem extable_type_reg ; > + > +# define DEFINE_EXTABLE_TYPE_REG \ > + .macro extable_type_reg type:req reg:req ; \ I love that macro PeterZ!</sarcasm> > + .set .Lfound, 0 ; \ > + .set .Lregnr, 0 ; \ > + .irp rs,rax,rcx,rdx,rbx,rsp,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13, \ > + r14,r15 ; \ > + .ifc \reg, %\rs ; \ > + .set .Lfound, .Lfound+1 ; \ > + .long \type + (.Lregnr << 8) ; \ > + .endif ; \ > + .set .Lregnr, .Lregnr+1 ; \ > + .endr ; \ > + .set .Lregnr, 0 ; \ > + .irp rs,eax,ecx,edx,ebx,esp,ebp,esi,edi,r8d,r9d,r10d,r11d,r12d, \ > + r13d,r14d,r15d ; \ > + .ifc \reg, %\rs ; \ > + .set .Lfound, .Lfound+1 ; \ > + .long \type + (.Lregnr << 8) ; \ > + .endif ; \ > + .set .Lregnr, .Lregnr+1 ; \ > + .endr ; \ > + .if (.Lfound != 1) ; \ > + .error "extable_type_reg: bad register argument" ; \ > + .endif ; \ > + .endm ; > + > # define _ASM_EXTABLE_TYPE(from, to, type) \ > .pushsection "__ex_table","a" ; \ > .balign 4 ; \ > @@ -140,6 +169,16 @@ > .long type ; \ > .popsection > > +# define _ASM_EXTABLE_TYPE_REG(from, to, type1, reg1) \ > + .pushsection "__ex_table","a" ; \ > + .balign 4 ; \ > + .long (from) - . ; \ > + .long (to) - . ; \ > + DEFINE_EXTABLE_TYPE_REG \ > + extable_type_reg reg=reg1, type=type1 ; \ > + UNDEFINE_EXTABLE_TYPE_REG \ > + .popsection > + > # ifdef CONFIG_KPROBES > # define _ASM_NOKPROBE(entry) \ > .pushsection "_kprobe_blacklist","aw" ; \ > diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h > index 45697e04d771..6a4995e4cfae 100644 > --- a/arch/x86/include/asm/uaccess_64.h > +++ b/arch/x86/include/asm/uaccess_64.h > @@ -79,4 +79,40 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) > kasan_check_write(dst, size); > return __copy_user_flushcache(dst, src, size); > } > + > +/* > + * Zero Userspace. > + */ > + > +__must_check unsigned long > +clear_user_original(void __user *addr, unsigned long len); > +__must_check unsigned long > +clear_user_rep_good(void __user *addr, unsigned long len); > +__must_check unsigned long > +clear_user_erms(void __user *addr, unsigned long len); > + > +static __always_inline __must_check unsigned long > +___clear_user(void __user *addr, unsigned long len) > +{ > + unsigned long ret; > + > + /* > + * No memory constraint because it doesn't change any memory gcc > + * knows about. > + */ > + > + might_fault(); I think you could do the stac(); clac() sandwich around the alternative_call_2 here and remove the respective calls from the asm. > + alternative_call_2( > + clear_user_original, > + clear_user_rep_good, > + X86_FEATURE_REP_GOOD, > + clear_user_erms, > + X86_FEATURE_ERMS, > + ASM_OUTPUT2("=a" (ret), "=D" (addr), "=c" (len)), You can do here: : "+&c" (size), "+&D" (addr) :: "eax"); because size and addr need to be earlyclobbers: e0a96129db57 ("x86: use early clobbers in usercopy*.c") and then simply return size; The asm modifies it anyway - no need for ret. In any case, thanks for doing that! -- Regards/Gruss, Boris. https://people.kernel.org/tglx/notes-about-netiquette