Re: [PATCH] x86/clear_user: Make it faster

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, 24 May 2022 at 13:32, Borislav Petkov <bp@xxxxxxxxx> wrote:
>
> Ok,
>
> finally a somewhat final version, lightly tested.
>
> I still need to run it on production Icelake and that is kinda being
> delayed due to server room cooling issues (don't ask ;-\).
>
> ---
> From: Borislav Petkov <bp@xxxxxxx>
>
> Based on a patch by Mark Hemment <markhemm@xxxxxxxxxxxxxx> and
> incorporating very sane suggestions from Linus.
>
> The point here is to have the default case with FSRM - which is supposed
> to be the majority of x86 hw out there - if not now then soon - be
> directly inlined into the instruction stream so that no function call
> overhead is taking place.
>
> The benchmarks I ran would show very small improvements and a PF
> benchmark would even show weird things like slowdowns with higher core
> counts.
>
> So for a ~6m running the git test suite, the function gets called under
> 700K times, all from padzero():
>
>   <...>-2536    [006] .....   261.208801: padzero: to: 0x55b0663ed214, size: 3564, cycles: 21900
>   <...>-2536    [006] .....   261.208819: padzero: to: 0x7f061adca078, size: 3976, cycles: 17160
>   <...>-2537    [008] .....   261.211027: padzero: to: 0x5572d019e240, size: 3520, cycles: 23850
>   <...>-2537    [008] .....   261.211049: padzero: to: 0x7f1288dc9078, size: 3976, cycles: 15900
>    ...
>
> which is around 1%-ish of the total time and which is consistent with
> the benchmark numbers.
>
> So Mel gave me the idea to simply measure how fast the function becomes.
> I.e.:
>
>   start = rdtsc_ordered();
>   ret = __clear_user(to, n);
>   end = rdtsc_ordered();
>
> Computing the mean average of all the samples collected during the test
> suite run then shows some improvement:
>
>   clear_user_original:
>   Amean: 9219.71 (Sum: 6340154910, samples: 687674)
>
>   fsrm:
>   Amean: 8030.63 (Sum: 5522277720, samples: 687652)
>
> That's on Zen3.
>
> Signed-off-by: Borislav Petkov <bp@xxxxxxx>
> Link: https://lore.kernel.org/r/CAHk-=wh=Mu_EYhtOmPn6AxoQZyEh-4fo2Zx3G7rBv1g7vwoKiw@xxxxxxxxxxxxxx
> ---
>  arch/x86/include/asm/uaccess.h    |   5 +-
>  arch/x86/include/asm/uaccess_64.h |  45 ++++++++++
>  arch/x86/lib/clear_page_64.S      | 142 ++++++++++++++++++++++++++++++
>  arch/x86/lib/usercopy_64.c        |  40 ---------
>  tools/objtool/check.c             |   3 +
>  5 files changed, 192 insertions(+), 43 deletions(-)

[...snip...]
> diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
> index fe59b8ac4fcc..6dd6c7fd1eb8 100644
...
> +/*
> + * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
> + * present.
> + * Input:
> + * rdi destination
> + * rcx count
> + *
> + * Output:
> + * rcx: uncleared bytes or 0 if successful.
> + */
> +SYM_FUNC_START(clear_user_rep_good)
> +       # call the original thing for less than a cacheline
> +       cmp $64, %rcx
> +       ja  .Lprep
> +       call clear_user_original
> +       RET
> +
> +.Lprep:
> +       # copy lower 32-bits for rest bytes
> +       mov %ecx, %edx
> +       shr $3, %rcx
> +       jz .Lrep_good_rest_bytes

A slight doubt here: the comment says "less than a cacheline", but the
code uses 'ja' (jump if above) - so it calls 'clear_user_original' for a
'len' less than or equal to 64 bytes.
Not sure of the intended behaviour for exactly 64 bytes here, but
'copy_user_enhanced_fast_string' uses the slow method for lengths less
than 64.  So, should this be coded as:
    cmp $64,%rcx
    jb clear_user_original
?

'clear_user_erms' has similar logic which might also need reworking.

> +
> +.Lrep_good_qwords:
> +       rep stosq
> +
> +.Lrep_good_rest_bytes:
> +       and $7, %edx
> +       jz .Lrep_good_exit
> +
> +.Lrep_good_bytes:
> +       mov %edx, %ecx
> +       rep stosb
> +
> +.Lrep_good_exit:
> +       # see .Lexit comment above
> +       xor %eax, %eax
> +       RET
> +
> +.Lrep_good_qwords_exception:
> +       # convert remaining qwords back into bytes to return to caller
> +       shl $3, %rcx
> +       and $7, %edx
> +       add %rdx, %rcx
> +       jmp .Lrep_good_exit
> +
> +       _ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception)
> +       _ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit)
> +SYM_FUNC_END(clear_user_rep_good)
> +EXPORT_SYMBOL(clear_user_rep_good)
> +
> +/*
> + * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present.
> + * Input:
> + * rdi destination
> + * rcx count
> + *
> + * Output:
> + * rcx: uncleared bytes or 0 if successful.
> + *
> + */
> +SYM_FUNC_START(clear_user_erms)
> +       # call the original thing for less than a cacheline
> +       cmp $64, %rcx
> +       ja  .Lerms_bytes
> +       call clear_user_original
> +       RET
> +
> +.Lerms_bytes:
> +       rep stosb
> +
> +.Lerms_exit:
> +       xorl %eax,%eax
> +       RET
> +
> +       _ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit)
> +SYM_FUNC_END(clear_user_erms)
> +EXPORT_SYMBOL(clear_user_erms)
> diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
> index 0ae6cf804197..6c1f8ac5e721 100644
> --- a/arch/x86/lib/usercopy_64.c
> +++ b/arch/x86/lib/usercopy_64.c
> @@ -14,46 +14,6 @@
>   * Zero Userspace
>   */
>
> -unsigned long __clear_user(void __user *addr, unsigned long size)
> -{
> -       long __d0;
> -       might_fault();
> -       /* no memory constraint because it doesn't change any memory gcc knows
> -          about */
> -       stac();
> -       asm volatile(
> -               "       testq  %[size8],%[size8]\n"
> -               "       jz     4f\n"
> -               "       .align 16\n"
> -               "0:     movq $0,(%[dst])\n"
> -               "       addq   $8,%[dst]\n"
> -               "       decl %%ecx ; jnz   0b\n"
> -               "4:     movq  %[size1],%%rcx\n"
> -               "       testl %%ecx,%%ecx\n"
> -               "       jz     2f\n"
> -               "1:     movb   $0,(%[dst])\n"
> -               "       incq   %[dst]\n"
> -               "       decl %%ecx ; jnz  1b\n"
> -               "2:\n"
> -
> -               _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1])
> -               _ASM_EXTABLE_UA(1b, 2b)
> -
> -               : [size8] "=&c"(size), [dst] "=&D" (__d0)
> -               : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
> -       clac();
> -       return size;
> -}
> -EXPORT_SYMBOL(__clear_user);
> -
> -unsigned long clear_user(void __user *to, unsigned long n)
> -{
> -       if (access_ok(to, n))
> -               return __clear_user(to, n);
> -       return n;
> -}
> -EXPORT_SYMBOL(clear_user);
> -
>  #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
>  /**
>   * clean_cache_range - write back a cache range with CLWB
> diff --git a/tools/objtool/check.c b/tools/objtool/check.c
> index ca5b74603008..e460aa004b88 100644
> --- a/tools/objtool/check.c
> +++ b/tools/objtool/check.c
> @@ -1020,6 +1020,9 @@ static const char *uaccess_safe_builtin[] = {
>         "copy_mc_fragile_handle_tail",
>         "copy_mc_enhanced_fast_string",
>         "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
> +       "clear_user_erms",
> +       "clear_user_rep_good",
> +       "clear_user_original",
>         NULL
>  };
>
> --
> 2.35.1
>
>
> --
> Regards/Gruss,
>     Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette

Cheers,
Mark




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux