On Wed, Apr 03, 2019 at 11:34:13AM -0600, Khalid Aziz wrote:

> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 999d6d8f0bef..cc806a01a0eb 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -37,6 +37,20 @@
>   */
>  #define LAST_USER_MM_IBPB	0x1UL
>
> +/*
> + * A TLB flush may be needed to flush stale TLB entries
> + * for pages that have been mapped into userspace and unmapped
> + * from kernel space. This TLB flush needs to be propagated to
> + * all CPUs. Asynchronous flush requests to all CPUs can cause
> + * significant performance impact. Queue a pending flush for
> + * a CPU instead. Multiple of these requests can then be handled
> + * by a CPU at a less disruptive time, like context switch, in
> + * one go and reduce performance impact significantly. Following
> + * data structure is used to keep track of CPUs with pending full
> + * TLB flush forced by xpfo.
> + */
> +static cpumask_t pending_xpfo_flush;
> +
>  /*
>   * We get here when we do something requiring a TLB invalidation
>   * but could not go invalidate all of the contexts. We do the
> @@ -321,6 +335,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>  		__flush_tlb_all();
>  	}
>  #endif
> +
> +	/*
> +	 * If there is a pending TLB flush for this CPU due to XPFO
> +	 * flush, do it now.
> +	 */
> +	if (cpumask_test_and_clear_cpu(cpu, &pending_xpfo_flush)) {
> +		count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
> +		__flush_tlb_all();
> +	}

That really should be:

	if (cpumask_test_cpu(cpu, &pending_xpfo_flush)) {
		cpumask_clear_cpu(cpu, &pending_xpfo_flush);
		count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
		__flush_tlb_all();
	}

test_and_clear is an unconditional RmW and can cause cacheline
contention between adjacent CPUs even if none of the bits are set.
(A minimal sketch of this check-before-RmW pattern is at the end of
this mail.)

> +
>  	this_cpu_write(cpu_tlbstate.is_lazy, false);
>
>  	/*
> @@ -803,6 +827,34 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>  	}
>  }
>
> +void xpfo_flush_tlb_kernel_range(unsigned long start, unsigned long end)
> +{
> +	struct cpumask tmp_mask;
> +
> +	/*
> +	 * Balance as user space task's flush, a bit conservative.
> +	 * Do a local flush immediately and post a pending flush on all
> +	 * other CPUs. Local flush can be a range flush or full flush
> +	 * depending upon the number of entries to be flushed. Remote
> +	 * flushes will be done by individual processors at the time of
> +	 * context switch and this allows multiple flush requests from
> +	 * other CPUs to be batched together.
> +	 */
> +	if (end == TLB_FLUSH_ALL ||
> +	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
> +		do_flush_tlb_all(NULL);
> +	} else {
> +		struct flush_tlb_info info;
> +
> +		info.start = start;
> +		info.end = end;
> +		do_kernel_range_flush(&info);
> +	}
> +	cpumask_setall(&tmp_mask);
> +	__cpumask_clear_cpu(smp_processor_id(), &tmp_mask);
> +	cpumask_or(&pending_xpfo_flush, &pending_xpfo_flush, &tmp_mask);
> +}
> +
>  void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>  {
>  	struct flush_tlb_info info = {
> diff --git a/arch/x86/mm/xpfo.c b/arch/x86/mm/xpfo.c
> index b42513347865..638eee5b1f09 100644
> --- a/arch/x86/mm/xpfo.c
> +++ b/arch/x86/mm/xpfo.c
> @@ -118,7 +118,7 @@ inline void xpfo_flush_kernel_tlb(struct page *page, int order)
>  		return;
>  	}
>
> -	flush_tlb_kernel_range(kaddr, kaddr + (1 << order) * size);
> +	xpfo_flush_tlb_kernel_range(kaddr, kaddr + (1 << order) * size);
>  }
>  EXPORT_SYMBOL_GPL(xpfo_flush_kernel_tlb);

So this patch is the one that makes it 'work', but I'm with Andy on
hating it something fierce.

Up until this point x86_64 is completely buggered in this series; after
this it sorta works, but *urgh*, what crap.

All in all, your changelog is complete and utter garbage: this is _NOT_
a performance issue. It is very much a correctness issue.

Also, I distinctly dislike the inconsistent TLB states this generates.
It makes it very hard to argue for its correctness.
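
FWIW, below is a minimal userspace sketch of the scheme being discussed:
post a pending flush for all other CPUs, and have each CPU consume its
bit with a plain test before the atomic RmW so the common no-work path
never dirties the cacheline. The names (pending_mask, post_pending_flush,
consume_pending) and the C11 atomics are purely illustrative; this is not
the kernel code, just the pattern.

	/* Compile with: cc -std=c11 sketch.c */
	#include <stdatomic.h>
	#include <stdbool.h>

	/* Stand-in for pending_xpfo_flush; one bit per CPU. */
	static _Atomic unsigned long pending_mask;

	/*
	 * The "xpfo_flush_tlb_kernel_range" side: flush locally (elided
	 * here), then mark every other CPU as having a pending full flush.
	 */
	static void post_pending_flush(int self_cpu, int nr_cpus)
	{
		unsigned long mask = 0;
		int cpu;

		for (cpu = 0; cpu < nr_cpus; cpu++)
			if (cpu != self_cpu)
				mask |= 1UL << cpu;
		atomic_fetch_or_explicit(&pending_mask, mask,
					 memory_order_release);
	}

	/*
	 * The "switch_mm_irqs_off" side: plain load first, so the line
	 * stays shared when no flush is pending; only the rare set-bit
	 * path does the atomic RmW (and would then __flush_tlb_all()).
	 */
	static bool consume_pending(int cpu)
	{
		unsigned long bit = 1UL << cpu;

		if (!(atomic_load_explicit(&pending_mask,
					   memory_order_relaxed) & bit))
			return false;

		atomic_fetch_and_explicit(&pending_mask, ~bit,
					  memory_order_acq_rel);
		return true;
	}

	int main(void)
	{
		post_pending_flush(0, 4);		/* CPU 0 flushed locally */
		return consume_pending(3) ? 0 : 1;	/* CPU 3, at "context switch" */
	}

Of course the sketch has the same property complained about above: a
CPU's stale TLB entries survive until it happens to context switch,
which is exactly the window that makes the real thing hard to reason
about.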