[ add Linus because he had comments the last time memcpy_mcsafe() was reworked [1] ]

On Fri, Apr 10, 2020 at 11:06 AM Dan Williams <dan.j.williams@xxxxxxxxx> wrote:
>
> The original memcpy_mcsafe() implementation satisfied two primary
> concerns. It provided a copy routine that avoided known unrecoverable
> scenarios (poison consumption via fast-string instructions / accesses
> across cacheline boundaries), and it provided a fallback to fast plain
> memcpy if the platform did not indicate recovery capability.
>
> However, the platform indication for recovery capability is not
> architectural so it was always going to require an ever growing list of
> opt-in quirks. Also, ongoing hardware improvements to recoverability
> mean that the cross-cacheline-boundary and fast-string poison
> consumption scenarios are no longer concerns on newer platforms. The
> introduction of memcpy_mcsafe_fast(), and resulting reworks, recovers
> performance for these newer CPUs, but without the maintenance burden of
> an opt-in whitelist.
>
> With memcpy_mcsafe_fast() the existing opt-in becomes a blacklist for
> CPUs that benefit from a more careful slower implementation. Every other
> CPU gets a fast, but optionally recoverable implementation.
>
> With this in place memcpy_mcsafe() never falls back to plain memcpy().
> It can be used in any scenario where the caller needs guarantees that
> source or destination access faults / exceptions will be handled.
> Systems without recovery support continue to default to a fast
> implementation while newer systems both default to fast, and support
> recovery, without needing an explicit quirk.
>
> Cc: x86@xxxxxxxxxx
> Cc: <stable@xxxxxxxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Borislav Petkov <bp@xxxxxxxxx>
> Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Acked-by: Tony Luck <tony.luck@xxxxxxxxx>
> Reported-by: Erwin Tsaur <erwin.tsaur@xxxxxxxxx>
> Tested-by: Erwin Tsaur <erwin.tsaur@xxxxxxxxx>
> Fixes: 92b0729c34ca ("x86/mm, x86/mce: Add memcpy_mcsafe()")
> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
> ---
> Note that I marked this for stable and included a Fixes tag because it
> is arguably a regression that old kernels stop recovering from poison
> consumption on new hardware.

This still applies cleanly to tip/master as of yesterday. Any concerns?
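
For reference, the caller-visible contract after this rework: memcpy_mcsafe()
returns 0 on success, or the number of bytes not copied when a source /
destination fault or exception was handled, and it never silently degrades to
plain memcpy(). A hypothetical caller sketch (not part of the patch, the
function name is made up for illustration):

	#include <linux/errno.h>
	#include <linux/string.h>

	/*
	 * Hypothetical caller sketch: a non-zero return from
	 * memcpy_mcsafe() is the count of trailing bytes that were not
	 * copied because a fault / exception (e.g. poison consumption)
	 * was fixed up instead of taking the machine down.
	 */
	static int pmem_copy_sketch(void *dst, const void *pmem_src, size_t len)
	{
		unsigned long rem = memcpy_mcsafe(dst, pmem_src, len);

		if (rem)
			return -EIO;	/* the trailing 'rem' bytes were not copied */
		return 0;
	}
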
[1]: https://lore.kernel.org/lkml/CA+55aFwJ=DvvcrM725KOYAF_c=uKiQcuHz4taPjbKveOVPycKA@xxxxxxxxxxxxxx/

>
>  arch/x86/include/asm/string_64.h         | 24 ++++++-------
>  arch/x86/include/asm/uaccess_64.h        |  7 +---
>  arch/x86/kernel/cpu/mce/core.c           |  6 ++-
>  arch/x86/kernel/quirks.c                 |  4 +-
>  arch/x86/lib/memcpy_64.S                 | 57 +++++++++++++++++++++++++++---
>  arch/x86/lib/usercopy_64.c               |  6 +--
>  tools/objtool/check.c                    |  3 +-
>  tools/perf/bench/mem-memcpy-x86-64-lib.c |  8 +---
>  tools/testing/nvdimm/test/nfit.c         |  2 +
>  9 files changed, 75 insertions(+), 42 deletions(-)
>
> diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
> index 75314c3dbe47..07840fa3582a 100644
> --- a/arch/x86/include/asm/string_64.h
> +++ b/arch/x86/include/asm/string_64.h
> @@ -83,21 +83,23 @@ int strcmp(const char *cs, const char *ct);
>  #endif
>
>  #define __HAVE_ARCH_MEMCPY_MCSAFE 1
> -__must_check unsigned long __memcpy_mcsafe(void *dst, const void *src,
> -		size_t cnt);
> -DECLARE_STATIC_KEY_FALSE(mcsafe_key);
> +__must_check unsigned long memcpy_mcsafe_slow(void *dst, const void *src,
> +		size_t cnt);
> +__must_check unsigned long memcpy_mcsafe_fast(void *dst, const void *src,
> +		size_t cnt);
> +DECLARE_STATIC_KEY_FALSE(mcsafe_slow_key);
>
>  /**
> - * memcpy_mcsafe - copy memory with indication if a machine check happened
> + * memcpy_mcsafe - copy memory with indication if an exception / fault happened
>   *
>   * @dst:	destination address
>   * @src:	source address
>   * @cnt:	number of bytes to copy
>   *
> - * Low level memory copy function that catches machine checks
> - * We only call into the "safe" function on systems that can
> - * actually do machine check recovery. Everyone else can just
> - * use memcpy().
> + * The slow version is opted into by platform quirks. The fast version
> + * is equivalent to memcpy() regardless of CPU machine-check-recovery
> + * capability, but may still fall back to the slow version if the CPU
> + * lacks fast-string instruction support.
>   *
>   * Return 0 for success, or number of bytes not copied if there was an
>   * exception.
> @@ -106,12 +108,10 @@ static __always_inline __must_check unsigned long
>  memcpy_mcsafe(void *dst, const void *src, size_t cnt)
>  {
>  #ifdef CONFIG_X86_MCE
> -	if (static_branch_unlikely(&mcsafe_key))
> -		return __memcpy_mcsafe(dst, src, cnt);
> -	else
> +	if (static_branch_unlikely(&mcsafe_slow_key))
> +		return memcpy_mcsafe_slow(dst, src, cnt);
>  #endif
> -	memcpy(dst, src, cnt);
> -	return 0;
> +	return memcpy_mcsafe_fast(dst, src, cnt);
>  }
>
>  #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
> diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
> index 5cd1caa8bc65..f8c0d38c3f45 100644
> --- a/arch/x86/include/asm/uaccess_64.h
> +++ b/arch/x86/include/asm/uaccess_64.h
> @@ -52,12 +52,7 @@ copy_to_user_mcsafe(void *to, const void *from, unsigned len)
>  	unsigned long ret;
>
>  	__uaccess_begin();
> -	/*
> -	 * Note, __memcpy_mcsafe() is explicitly used since it can
> -	 * handle exceptions / faults. memcpy_mcsafe() may fall back to
> -	 * memcpy() which lacks this handling.
> -	 */
> -	ret = __memcpy_mcsafe(to, from, len);
> +	ret = memcpy_mcsafe(to, from, len);
>  	__uaccess_end();
>  	return ret;
>  }
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index 2c4f949611e4..6bf94d39dc7f 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -2579,13 +2579,13 @@ static void __init mcheck_debugfs_init(void)
>  static void __init mcheck_debugfs_init(void) { }
>  #endif
>
> -DEFINE_STATIC_KEY_FALSE(mcsafe_key);
> -EXPORT_SYMBOL_GPL(mcsafe_key);
> +DEFINE_STATIC_KEY_FALSE(mcsafe_slow_key);
> +EXPORT_SYMBOL_GPL(mcsafe_slow_key);
>
>  static int __init mcheck_late_init(void)
>  {
>  	if (mca_cfg.recovery)
> -		static_branch_inc(&mcsafe_key);
> +		static_branch_inc(&mcsafe_slow_key);
>
>  	mcheck_debugfs_init();
>  	cec_init();
> diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
> index 896d74cb5081..89c88d9de5c4 100644
> --- a/arch/x86/kernel/quirks.c
> +++ b/arch/x86/kernel/quirks.c
> @@ -636,7 +636,7 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
>  	pci_read_config_dword(pdev, 0x84, &capid0);
>
>  	if (capid0 & 0x10)
> -		static_branch_inc(&mcsafe_key);
> +		static_branch_inc(&mcsafe_slow_key);
>  }
>
>  /* Skylake */
> @@ -653,7 +653,7 @@ static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
>  	 * enabled, so memory machine check recovery is also enabled.
>  	 */
>  	if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0))
> -		static_branch_inc(&mcsafe_key);
> +		static_branch_inc(&mcsafe_slow_key);
>
>  }
>  DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index 56b243b14c3a..b5e4fc1cf99d 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -189,11 +189,18 @@ SYM_FUNC_END(memcpy_orig)
>  MCSAFE_TEST_CTL
>
>  /*
> - * __memcpy_mcsafe - memory copy with machine check exception handling
> - * Note that we only catch machine checks when reading the source addresses.
> - * Writes to target are posted and don't generate machine checks.
> + * memcpy_mcsafe_slow() - memory copy with exception handling
> + *
> + * In contrast to memcpy_mcsafe_fast() this version is careful to
> + * never perform a read across a cacheline boundary, and not use
> + * fast-string instruction sequences which are known to be unrecoverable
> + * on CPUs identified by mcsafe_slow_key.
> + *
> + * Note that this only catches machine check exceptions when reading the
> + * source addresses. Writes to target are posted and don't generate
> + * machine checks. However this does handle protection faults on writes.
>   */
> -SYM_FUNC_START(__memcpy_mcsafe)
> +SYM_FUNC_START(memcpy_mcsafe_slow)
>  	cmpl $8, %edx
>  	/* Less than 8 bytes? Go to byte copy loop */
>  	jb .L_no_whole_words
> @@ -260,8 +267,8 @@ SYM_FUNC_START(__memcpy_mcsafe)
>  	xorl %eax, %eax
>  .L_done:
>  	ret
> -SYM_FUNC_END(__memcpy_mcsafe)
> -EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
> +SYM_FUNC_END(memcpy_mcsafe_slow)
> +EXPORT_SYMBOL_GPL(memcpy_mcsafe_slow)
>
>  	.section .fixup, "ax"
>  	/*
> @@ -296,4 +303,42 @@ EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
>  	_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
>  	_ASM_EXTABLE(.L_write_words, .E_write_words)
>  	_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
> +
> +/*
> + * memcpy_mcsafe_fast - memory copy with exception handling
> + *
> + * Fast string copy + exception handling. If the CPU does support
> + * machine check exception recovery, but does not support recovering
> + * from fast-string exceptions then this CPU needs to be added to the
> + * mcsafe_slow_key set of quirks. Otherwise, absent any machine check
> + * recovery support this version should be no slower than standard
> + * memcpy.
> + */
> +SYM_FUNC_START(memcpy_mcsafe_fast)
> +	ALTERNATIVE "jmp memcpy_mcsafe_slow", "", X86_FEATURE_ERMS
> +	movq %rdi, %rax
> +	movq %rdx, %rcx
> +.L_copy:
> +	rep movsb
> +	/* Copy successful. Return zero */
> +	xorl %eax, %eax
> +	ret
> +SYM_FUNC_END(memcpy_mcsafe_fast)
> +EXPORT_SYMBOL_GPL(memcpy_mcsafe_fast)
> +
> +	.section .fixup, "ax"
> +.E_copy:
> +	/*
> +	 * On fault %rcx is updated such that the copy instruction could
> +	 * optionally be restarted at the fault position, i.e. it
> +	 * contains 'bytes remaining'. A non-zero return indicates error
> +	 * to memcpy_mcsafe() users, or indicate short transfers to
> +	 * user-copy routines.
> +	 */
> +	movq %rcx, %rax
> +	ret
> +
> +	.previous
> +
> +	_ASM_EXTABLE_FAULT(.L_copy, .E_copy)
>  #endif
> diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
> index fff28c6f73a2..348c9331748e 100644
> --- a/arch/x86/lib/usercopy_64.c
> +++ b/arch/x86/lib/usercopy_64.c
> @@ -64,11 +64,7 @@ __visible notrace unsigned long
>  mcsafe_handle_tail(char *to, char *from, unsigned len)
>  {
>  	for (; len; --len, to++, from++) {
> -		/*
> -		 * Call the assembly routine back directly since
> -		 * memcpy_mcsafe() may silently fallback to memcpy.
> -		 */
> -		unsigned long rem = __memcpy_mcsafe(to, from, 1);
> +		unsigned long rem = memcpy_mcsafe(to, from, 1);
>
>  		if (rem)
>  			break;
> diff --git a/tools/objtool/check.c b/tools/objtool/check.c
> index 4768d91c6d68..bae1f77aae90 100644
> --- a/tools/objtool/check.c
> +++ b/tools/objtool/check.c
> @@ -485,7 +485,8 @@ static const char *uaccess_safe_builtin[] = {
>  	"__ubsan_handle_shift_out_of_bounds",
>  	/* misc */
>  	"csum_partial_copy_generic",
> -	"__memcpy_mcsafe",
> +	"memcpy_mcsafe_slow",
> +	"memcpy_mcsafe_fast",
>  	"mcsafe_handle_tail",
>  	"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
>  	NULL
> diff --git a/tools/perf/bench/mem-memcpy-x86-64-lib.c b/tools/perf/bench/mem-memcpy-x86-64-lib.c
> index 4130734dde84..23e1747b3a67 100644
> --- a/tools/perf/bench/mem-memcpy-x86-64-lib.c
> +++ b/tools/perf/bench/mem-memcpy-x86-64-lib.c
> @@ -5,17 +5,13 @@
>   */
>  #include <linux/types.h>
>
> -unsigned long __memcpy_mcsafe(void *dst, const void *src, size_t cnt);
> +unsigned long memcpy_mcsafe(void *dst, const void *src, size_t cnt);
>  unsigned long mcsafe_handle_tail(char *to, char *from, unsigned len);
>
>  unsigned long mcsafe_handle_tail(char *to, char *from, unsigned len)
>  {
>  	for (; len; --len, to++, from++) {
> -		/*
> -		 * Call the assembly routine back directly since
> -		 * memcpy_mcsafe() may silently fallback to memcpy.
> -		 */
> -		unsigned long rem = __memcpy_mcsafe(to, from, 1);
> +		unsigned long rem = memcpy_mcsafe(to, from, 1);
>
>  		if (rem)
>  			break;
> diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
> index bf6422a6af7f..282722d96f8e 100644
> --- a/tools/testing/nvdimm/test/nfit.c
> +++ b/tools/testing/nvdimm/test/nfit.c
> @@ -3136,7 +3136,7 @@ void mcsafe_test(void)
>  		}
>
>  		mcsafe_test_init(dst, src, 512);
> -		rem = __memcpy_mcsafe(dst, src, 512);
> +		rem = memcpy_mcsafe_slow(dst, src, 512);
>  		valid = mcsafe_test_validate(dst, src, 512, expect);
>  		if (rem == expect && valid)
>  			continue;
>
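
For readers less familiar with ALTERNATIVE patching, the dispatch at the top
of memcpy_mcsafe_fast() above is roughly the following when expressed as C
(illustrative sketch only; rep_movsb_with_extable() is a made-up stand-in for
the "rep movsb" body covered by _ASM_EXTABLE_FAULT(), and in reality the jmp
is patched out at boot on ERMS-capable CPUs rather than tested per call):

	#include <asm/cpufeature.h>
	#include <linux/string.h>

	/*
	 * Illustrative C model of the ALTERNATIVE in memcpy_mcsafe_fast().
	 * rep_movsb_with_extable() is hypothetical; it stands in for the
	 * "rep movsb" covered by the exception table, returning 0 on
	 * success or the bytes remaining if a fault was fixed up.
	 */
	static unsigned long memcpy_mcsafe_fast_model(void *dst, const void *src,
			size_t cnt)
	{
		if (!static_cpu_has(X86_FEATURE_ERMS))
			return memcpy_mcsafe_slow(dst, src, cnt);

		return rep_movsb_with_extable(dst, src, cnt);
	}

Either way the net effect matches the commit message: CPUs without
fast-string support, or CPUs blacklisted via mcsafe_slow_key, take the
careful cacheline-aware path, while everything else gets "rep movsb"
performance plus exception handling.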