Kernel TLB invalidation IPIs are a common source of interference on NOHZ_FULL CPUs. Given NOHZ_FULL CPUs executing in userspace are not accessing any kernel addresses, these invalidations do not need to happen immediately, and can be deferred until the next user->kernel transition. Rather than make __flush_tlb_all() noinstr, add a minimal noinstr variant that doesn't try to leverage INVPCID. FIXME: not fully noinstr compliant XXX: same issue as with ins patching, when do we access data that should be invalidated? Signed-off-by: Valentin Schneider <vschneid@xxxxxxxxxx> --- arch/x86/include/asm/context_tracking_work.h | 4 ++++ arch/x86/include/asm/tlbflush.h | 1 + arch/x86/mm/tlb.c | 17 +++++++++++++++++ include/linux/context_tracking_work.h | 2 ++ 4 files changed, 24 insertions(+) diff --git a/arch/x86/include/asm/context_tracking_work.h b/arch/x86/include/asm/context_tracking_work.h index 2c66687ce00e2..9d4f021b5a45b 100644 --- a/arch/x86/include/asm/context_tracking_work.h +++ b/arch/x86/include/asm/context_tracking_work.h @@ -3,6 +3,7 @@ #define _ASM_X86_CONTEXT_TRACKING_WORK_H #include <asm/sync_core.h> +#include <asm/tlbflush.h> static __always_inline void arch_context_tracking_work(int work) { @@ -10,6 +11,9 @@ static __always_inline void arch_context_tracking_work(int work) case CONTEXT_WORK_SYNC: sync_core(); break; + case CONTEXT_WORK_TLBI: + __flush_tlb_all_noinstr(); + break; } } diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 75bfaa4210303..9064aa0027d05 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -15,6 +15,7 @@ #include <asm/pgtable.h> void __flush_tlb_all(void); +void noinstr __flush_tlb_all_noinstr(void); #define TLB_FLUSH_ALL -1UL #define TLB_GENERATION_INVALID 0 diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 267acf27480af..631df9189ded4 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1237,6 +1237,23 @@ void __flush_tlb_all(void) } EXPORT_SYMBOL_GPL(__flush_tlb_all); +void noinstr __flush_tlb_all_noinstr(void) +{ + /* + * This is for invocation in early entry code that cannot be + * instrumented. A RMW to CR4 works for most cases, but relies on + * being able to flip either of the PGE or PCIDE bits. Flipping CR4.PCID + * would require also resetting CR3.PCID, so just try with CR4.PGE, else + * do the CR3 write. + * + * TODO: paravirt + */ + if (cpu_feature_enabled(X86_FEATURE_PGE)) + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); + else + flush_tlb_local(); +} + void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { struct flush_tlb_info *info; diff --git a/include/linux/context_tracking_work.h b/include/linux/context_tracking_work.h index b0c7463048b60..c084a60d2fec5 100644 --- a/include/linux/context_tracking_work.h +++ b/include/linux/context_tracking_work.h @@ -7,12 +7,14 @@ enum { CONTEXT_WORK_DISABLED_OFFSET, CONTEXT_WORK_SYNC_OFFSET, + CONTEXT_WORK_TLBI_OFFSET, CONTEXT_WORK_MAX_OFFSET }; enum ct_work { CONTEXT_WORK_DISABLED = BIT(CONTEXT_WORK_DISABLED_OFFSET), CONTEXT_WORK_SYNC = BIT(CONTEXT_WORK_SYNC_OFFSET), + CONTEXT_WORK_TLBI = BIT(CONTEXT_WORK_TLBI_OFFSET), CONTEXT_WORK_MAX = BIT(CONTEXT_WORK_MAX_OFFSET) }; -- 2.31.1