On 2/4/20 5:17 AM, Andrea Arcangeli wrote:
> With multiple NUMA nodes and multiple sockets, the tlbi broadcast
> shall be delivered through the interconnects in turn increasing the
> interconnect traffic and the latency of the tlbi broadcast instruction.
>
> Even within a single NUMA node the latency of the tlbi broadcast
> instruction increases almost linearly with the number of CPUs trying to
> send tlbi broadcasts at the same time.
>
> When the process is single threaded however we can achieve full SMP
> scalability by skipping the tlbi broadcasting. Other arches already
> deploy this optimization.
>
> After the local TLB flush this however means the ASID context goes out
> of sync in all CPUs except the local one. This can be tracked in the
> mm_cpumask(mm): if the bit is set it means the asid context is stale
> for that CPU. This results in an extra local ASID TLB flush only if a
> single threaded process is migrated to a different CPU and only after a
> TLB flush. No extra local TLB flush is needed for the common case of
> single threaded processes context scheduling within the same CPU and for
> multithreaded processes.
>
> Skipping the tlbi instruction broadcasting is already implemented in
> local_flush_tlb_all(), this patch only extends it to flush_tlb_mm(),
> flush_tlb_range() and flush_tlb_page() too.
>
> Here's the result of 32 CPUs (ARMv8 Ampere) running mprotect at the same
> time from 32 single threaded processes before the patch:
>
>  Performance counter stats for './loop' (3 runs):
>
>                  0      dummy
>
>           2.121353 +- 0.000387 seconds time elapsed  ( +-  0.02% )
>
> and with the patch applied:
>
>  Performance counter stats for './loop' (3 runs):
>
>                  0      dummy
>
>          0.1197750 +- 0.0000827 seconds time elapsed  ( +-  0.07% )

Hi,

I have tested this patch on thunderX2 with the Himeno benchmark[1] with
LARGE calculation size. Here are the results.

w/o patch: MFLOPS : 1149.480174
w/  patch: MFLOPS : 1110.653003

In order to validate the effectiveness of the patch, I ran a single-threaded
program, which calls mprotect() in a loop to issue the tlbi broadcast
instruction on a CPU core (see the sketch below). At the same time, I ran
the Himeno benchmark on another CPU core. The results are:

w/o patch: MFLOPS : 860.238792
w/  patch: MFLOPS : 1110.449666
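For reference, a stressor of the kind described above can be as simple as
the following sketch. It is illustrative only, not the exact program used
for the numbers above; the CPU-number argument and the read/read-write
toggling pattern are assumptions about the setup.

/*
 * Pin to one CPU and toggle page protections in a loop. Each downgrade
 * to read-only forces a TLB invalidation for the page: a tlbi broadcast
 * without the patch, a local flush with it.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        long page = sysconf(_SC_PAGESIZE);
        int cpu = argc > 1 ? atoi(argv[1]) : 0; /* CPU to pin to */
        cpu_set_t set;
        char *buf;

        CPU_ZERO(&set);
        CPU_SET(cpu, &set);
        if (sched_setaffinity(0, sizeof(set), &set))
                return 1;

        buf = mmap(NULL, page, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return 1;
        memset(buf, 0, page);

        for (;;) {
                /* the downgrade to read-only flushes the TLB entry */
                mprotect(buf, page, PROT_READ);
                mprotect(buf, page, PROT_READ | PROT_WRITE);
        }
}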
Though the Himeno benchmark is a microbenchmark, I hope it helps.

[1] http://accc.riken.jp/en/supercom/documents/himenobmt/

Thanks,
QI Fuli

>
> Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> ---
>  arch/arm64/include/asm/efi.h         |  2 +-
>  arch/arm64/include/asm/mmu.h         |  3 +-
>  arch/arm64/include/asm/mmu_context.h | 10 +--
>  arch/arm64/include/asm/tlbflush.h    | 91 +++++++++++++++++++++++++++-
>  arch/arm64/mm/context.c              | 13 +++-
>  5 files changed, 109 insertions(+), 10 deletions(-)
>
> diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
> index 44531a69d32b..5d9a1433d918 100644
> --- a/arch/arm64/include/asm/efi.h
> +++ b/arch/arm64/include/asm/efi.h
> @@ -131,7 +131,7 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
>
>  static inline void efi_set_pgd(struct mm_struct *mm)
>  {
> -        __switch_mm(mm);
> +        __switch_mm(mm, smp_processor_id());
>
>          if (system_uses_ttbr0_pan()) {
>                  if (mm != current->active_mm) {
> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> index e4d862420bb4..1f84289d3e44 100644
> --- a/arch/arm64/include/asm/mmu.h
> +++ b/arch/arm64/include/asm/mmu.h
> @@ -27,7 +27,8 @@ typedef struct {
>           * ASID change and therefore doesn't need to reload the counter using
>           * atomic64_read.
>           */
> -#define ASID(mm)        ((mm)->context.id.counter & 0xffff)
> +#define __ASID(asid)        ((asid) & 0xffff)
> +#define ASID(mm)        __ASID((mm)->context.id.counter)
>
>  extern bool arm64_use_ng_mappings;
>
> diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> index 3827ff4040a3..5ec264297968 100644
> --- a/arch/arm64/include/asm/mmu_context.h
> +++ b/arch/arm64/include/asm/mmu_context.h
> @@ -210,10 +210,8 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
>          update_saved_ttbr0(tsk, &init_mm);
>  }
>
> -static inline void __switch_mm(struct mm_struct *next)
> +static inline void __switch_mm(struct mm_struct *next, unsigned int cpu)
>  {
> -        unsigned int cpu = smp_processor_id();
> -
>          /*
>           * init_mm.pgd does not contain any user mappings and it is always
>           * active for kernel addresses in TTBR1. Just set the reserved TTBR0.
> @@ -230,8 +228,12 @@ static inline void
>  switch_mm(struct mm_struct *prev, struct mm_struct *next,
>            struct task_struct *tsk)
>  {
> +        unsigned int cpu = smp_processor_id();
> +
>          if (prev != next)
> -                __switch_mm(next);
> +                __switch_mm(next, cpu);
> +        else if (cpumask_test_and_clear_cpu(cpu, mm_cpumask(next)))
> +                local_flush_tlb_asid(atomic64_read(&next->context.id));
>
>          /*
>           * Update the saved TTBR0_EL1 of the scheduled-in task as the previous
> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index bc3949064725..283f97af3fc5 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -136,6 +136,15 @@ static inline void local_flush_tlb_all(void)
>          isb();
>  }
>
> +static inline void local_flush_tlb_asid(unsigned long asid)
> +{
> +        asid = __TLBI_VADDR(0, __ASID(asid));
> +        dsb(nshst);
> +        __tlbi(aside1, asid);
> +        __tlbi_user(aside1, asid);
> +        dsb(nsh);
> +}
> +
>  static inline void flush_tlb_all(void)
>  {
>          dsb(ishst);
> @@ -148,6 +157,27 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
>  {
>          unsigned long asid = __TLBI_VADDR(0, ASID(mm));
>
> +        /* avoid TLB-i broadcast to remote NUMA nodes if it's a local flush */
> +        if (current->mm == mm && atomic_read(&mm->mm_users) <= 1) {
> +                int cpu = get_cpu();
> +
> +                cpumask_setall(mm_cpumask(mm));
> +                cpumask_clear_cpu(cpu, mm_cpumask(mm));
> +
> +                smp_mb();
> +
> +                if (atomic_read(&mm->mm_users) <= 1) {
> +                        dsb(nshst);
> +                        __tlbi(aside1, asid);
> +                        __tlbi_user(aside1, asid);
> +                        dsb(nsh);
> +
> +                        put_cpu();
> +                        return;
> +                }
> +                put_cpu();
> +        }
> +
>          dsb(ishst);
>          __tlbi(aside1is, asid);
>          __tlbi_user(aside1is, asid);
> @@ -167,7 +197,33 @@ static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
>  static inline void flush_tlb_page(struct vm_area_struct *vma,
>                                    unsigned long uaddr)
>  {
> -        flush_tlb_page_nosync(vma, uaddr);
> +        struct mm_struct *mm = vma->vm_mm;
> +        unsigned long addr = __TLBI_VADDR(uaddr, ASID(mm));
> +
> +        /* avoid TLB-i broadcast to remote NUMA nodes if it's a local flush */
> +        if (current->mm == mm && atomic_read(&mm->mm_users) <= 1) {
> +                int cpu = get_cpu();
> +
> +                cpumask_setall(mm_cpumask(mm));
> +                cpumask_clear_cpu(cpu, mm_cpumask(mm));
> +
> +                smp_mb();
> +
> +                if (atomic_read(&mm->mm_users) <= 1) {
> +                        dsb(nshst);
> +                        __tlbi(vale1, addr);
> +                        __tlbi_user(vale1, addr);
> +                        dsb(nsh);
> +
> +                        put_cpu();
> +                        return;
> +                }
> +                put_cpu();
> +        }
> +
> +        dsb(ishst);
> +        __tlbi(vale1is, addr);
> +        __tlbi_user(vale1is, addr);
>          dsb(ish);
>  }
>
> @@ -181,14 +237,15 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
>                                       unsigned long start, unsigned long end,
>                                       unsigned long stride, bool last_level)
>  {
> -        unsigned long asid = ASID(vma->vm_mm);
> +        struct mm_struct *mm = vma->vm_mm;
> +        unsigned long asid = ASID(mm);
>          unsigned long addr;
>
>          start = round_down(start, stride);
>          end = round_up(end, stride);
>
>          if ((end - start) >= (MAX_TLBI_OPS * stride)) {
> -                flush_tlb_mm(vma->vm_mm);
> +                flush_tlb_mm(mm);
>                  return;
>          }
>
> @@ -198,6 +255,34 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
>          start = __TLBI_VADDR(start, asid);
>          end = __TLBI_VADDR(end, asid);
>
> +        /* avoid TLB-i broadcast to remote NUMA nodes if it's a local flush */
> +        if (current->mm == mm && atomic_read(&mm->mm_users) <= 1) {
> +                int cpu = get_cpu();
> +
> +                cpumask_setall(mm_cpumask(mm));
> +                cpumask_clear_cpu(cpu, mm_cpumask(mm));
> +
> +                smp_mb();
> +
> +                if (atomic_read(&mm->mm_users) <= 1) {
> +                        dsb(nshst);
> +                        for (addr = start; addr < end; addr += stride) {
> +                                if (last_level) {
> +                                        __tlbi(vale1, addr);
> +                                        __tlbi_user(vale1, addr);
> +                                } else {
> +                                        __tlbi(vae1, addr);
> +                                        __tlbi_user(vae1, addr);
> +                                }
> +                        }
> +                        dsb(nsh);
> +
> +                        put_cpu();
> +                        return;
> +                }
> +                put_cpu();
> +        }
> +
>          dsb(ishst);
>          for (addr = start; addr < end; addr += stride) {
>                  if (last_level) {
> diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
> index 8ef73e89d514..b44459d64dff 100644
> --- a/arch/arm64/mm/context.c
> +++ b/arch/arm64/mm/context.c
> @@ -198,6 +198,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>  {
>          unsigned long flags;
>          u64 asid, old_active_asid;
> +        int need_flush = 0;
>
>          if (system_supports_cnp())
>                  cpu_set_reserved_ttbr0();
> @@ -233,7 +234,8 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>                  atomic64_set(&mm->context.id, asid);
>          }
>
> -        if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
> +        need_flush = cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending);
> +        if (need_flush)
>                  local_flush_tlb_all();
>
>          atomic64_set(&per_cpu(active_asids, cpu), asid);
> @@ -241,6 +243,15 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>
>  switch_mm_fastpath:
>
> +        /*
> +         * Enforce CPU ordering between the mmget() in use_mm() and
> +         * the below cpumask_test_and_clear_cpu().
> +         */
> +        smp_mb();
> +        if (cpumask_test_and_clear_cpu(cpu, mm_cpumask(mm)) &&
> +            likely(!need_flush))
> +                local_flush_tlb_asid(asid);
> +
>          arm64_apply_bp_hardening();
>
>          /*
>
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@xxxxxxxxxxxxxxxxxxx
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>