Kernel TLB invalidation IPIs are a common source of interference on
NOHZ_FULL CPUs. Since a NOHZ_FULL CPU executing in userspace is not
accessing any kernel addresses, these invalidations do not need to happen
immediately and can be deferred until the next user->kernel transition.

Add a minimal, noinstr-compliant variant of __flush_tlb_all() that doesn't
try to leverage INVPCID. To achieve this:

o Add a noinstr variant of native_write_cr4(), keeping native_write_cr4()
  as "only" __no_profile.
o Add a noinstr variant of native_flush_tlb_local().
o Make invalidate_user_asid() __always_inline.

XXX: what about paravirt? Should these instead be new operations made
available through pv_ops.mmu.*? x86_64_start_kernel() uses
__native_tlb_flush_global() regardless of paravirt, so I'm thinking the
paravirt variants are more of an optimization than a hard requirement?

Signed-off-by: Valentin Schneider <vschneid@xxxxxxxxxx>
---
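Note for reviewers, not part of this patch: a rough sketch of how a flush
initiator could then avoid IPI'ing a CPU that is executing in userspace,
assuming the ct_set_cpu_work() helper introduced earlier in this series
(its name, signature and return semantics are assumptions here, not
something this patch defines):

/*
 * Sketch only: defer the kernel TLB flush for a CPU currently executing
 * in userspace instead of sending it an invalidation IPI. The (assumed)
 * ct_set_cpu_work() is expected to return false when the deferred work
 * could not be queued, e.g. because the CPU is in kernel context, in
 * which case the CPU stays in the mask for the regular IPI path.
 */
static void flush_tlb_kernel_or_defer(unsigned int cpu, struct cpumask *flushmask)
{
	if (!ct_set_cpu_work(cpu, CONTEXT_WORK_TLBI))
		cpumask_set_cpu(cpu, flushmask);
}

CPUs left in the mask would get the usual flush IPI, while the others pick
up the flush via __flush_tlb_all_noinstr() on their next kernel entry, as
wired up in arch_context_tracking_work() below.
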
 arch/x86/include/asm/context_tracking_work.h |  4 ++++
 arch/x86/include/asm/special_insns.h         |  1 +
 arch/x86/include/asm/tlbflush.h              | 16 ++++++++++++++--
 arch/x86/kernel/cpu/common.c                 |  6 +++++-
 arch/x86/mm/tlb.c                            | 26 ++++++++++++++++++++++++--
 include/linux/context_tracking_work.h        |  2 ++
 6 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/context_tracking_work.h b/arch/x86/include/asm/context_tracking_work.h
index 2c66687ce00e2..9d4f021b5a45b 100644
--- a/arch/x86/include/asm/context_tracking_work.h
+++ b/arch/x86/include/asm/context_tracking_work.h
@@ -3,6 +3,7 @@
 #define _ASM_X86_CONTEXT_TRACKING_WORK_H
 
 #include <asm/sync_core.h>
+#include <asm/tlbflush.h>
 
 static __always_inline void arch_context_tracking_work(int work)
 {
@@ -10,6 +11,9 @@ static __always_inline void arch_context_tracking_work(int work)
 	case CONTEXT_WORK_SYNC:
 		sync_core();
 		break;
+	case CONTEXT_WORK_TLBI:
+		__flush_tlb_all_noinstr();
+		break;
 	}
 }
 
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index aec6e2d3aa1d5..b97157a69d48e 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -74,6 +74,7 @@ static inline unsigned long native_read_cr4(void)
 	return val;
 }
 
+void native_write_cr4_noinstr(unsigned long val);
 void native_write_cr4(unsigned long val);
 
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 69e79fff41b80..a653b5f47f0e6 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -18,6 +18,7 @@
 DECLARE_PER_CPU(u64, tlbstate_untag_mask);
 
 void __flush_tlb_all(void);
+void noinstr __flush_tlb_all_noinstr(void);
 
 #define TLB_FLUSH_ALL			-1UL
 #define TLB_GENERATION_INVALID		0
@@ -418,9 +419,20 @@ static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
 #endif
 #endif /* !MODULE */
 
+#define __NATIVE_TLB_FLUSH_GLOBAL(suffix, cr4)		\
+	native_write_cr4##suffix(cr4 ^ X86_CR4_PGE);	\
+	native_write_cr4##suffix(cr4)
+#define NATIVE_TLB_FLUSH_GLOBAL(cr4)		__NATIVE_TLB_FLUSH_GLOBAL(, cr4)
+#define NATIVE_TLB_FLUSH_GLOBAL_NOINSTR(cr4)	__NATIVE_TLB_FLUSH_GLOBAL(_noinstr, cr4)
+
 static inline void __native_tlb_flush_global(unsigned long cr4)
 {
-	native_write_cr4(cr4 ^ X86_CR4_PGE);
-	native_write_cr4(cr4);
+	NATIVE_TLB_FLUSH_GLOBAL(cr4);
 }
+
+static inline void __native_tlb_flush_global_noinstr(unsigned long cr4)
+{
+	NATIVE_TLB_FLUSH_GLOBAL_NOINSTR(cr4);
+}
+
 #endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a5f221ea56888..a84bb8511650b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -424,7 +424,7 @@ void native_write_cr0(unsigned long val)
 }
 EXPORT_SYMBOL(native_write_cr0);
 
-void __no_profile native_write_cr4(unsigned long val)
+noinstr void native_write_cr4_noinstr(unsigned long val)
 {
 	unsigned long bits_changed = 0;
 
@@ -442,6 +442,10 @@ void __no_profile native_write_cr4(unsigned long val)
 			  bits_changed);
 	}
 }
+void native_write_cr4(unsigned long val)
+{
+	native_write_cr4_noinstr(val);
+}
 #if IS_MODULE(CONFIG_LKDTM)
 EXPORT_SYMBOL_GPL(native_write_cr4);
 #endif
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 86593d1b787d8..973a4ab3f53b3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -256,7 +256,7 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
  *
  * See SWITCH_TO_USER_CR3.
  */
-static inline void invalidate_user_asid(u16 asid)
+static __always_inline void invalidate_user_asid(u16 asid)
 {
 	/* There is no user ASID if address space separation is off */
 	if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
@@ -1198,7 +1198,7 @@ STATIC_NOPV void native_flush_tlb_global(void)
 /*
  * Flush the entire current user mapping
  */
-STATIC_NOPV void native_flush_tlb_local(void)
+static noinstr void native_flush_tlb_local_noinstr(void)
 {
 	/*
 	 * Preemption or interrupts must be disabled to protect the access
@@ -1213,6 +1213,11 @@ STATIC_NOPV void native_flush_tlb_local(void)
 	native_write_cr3(__native_read_cr3());
 }
 
+STATIC_NOPV void native_flush_tlb_local(void)
+{
+	native_flush_tlb_local_noinstr();
+}
+
 void flush_tlb_local(void)
 {
 	__flush_tlb_local();
@@ -1240,6 +1245,23 @@ void __flush_tlb_all(void)
 }
 EXPORT_SYMBOL_GPL(__flush_tlb_all);
 
+void noinstr __flush_tlb_all_noinstr(void)
+{
+	/*
+	 * This is for invocation in early entry code that cannot be
+	 * instrumented. A RMW to CR4 works for most cases, but relies on
+	 * being able to flip either of the PGE or PCIDE bits. Flipping CR4.PCID
+	 * would require also resetting CR3.PCID, so just try with CR4.PGE, else
+	 * do the CR3 write.
+	 *
+	 * XXX: this gives paravirt the finger.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_PGE))
+		__native_tlb_flush_global_noinstr(this_cpu_read(cpu_tlbstate.cr4));
+	else
+		native_flush_tlb_local_noinstr();
+}
+
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
 	struct flush_tlb_info *info;
diff --git a/include/linux/context_tracking_work.h b/include/linux/context_tracking_work.h
index 13fc97b395030..47d5ced39a43a 100644
--- a/include/linux/context_tracking_work.h
+++ b/include/linux/context_tracking_work.h
@@ -6,11 +6,13 @@
 
 enum {
 	CONTEXT_WORK_SYNC_OFFSET,
+	CONTEXT_WORK_TLBI_OFFSET,
 	CONTEXT_WORK_MAX_OFFSET
 };
 
 enum ct_work {
 	CONTEXT_WORK_SYNC	= BIT(CONTEXT_WORK_SYNC_OFFSET),
+	CONTEXT_WORK_TLBI	= BIT(CONTEXT_WORK_TLBI_OFFSET),
 	CONTEXT_WORK_MAX	= BIT(CONTEXT_WORK_MAX_OFFSET)
 };
 
-- 
2.43.0