On Sun, Feb 23, 2020 at 02:25:20PM -0500, Andrea Arcangeli wrote: > With multiple NUMA nodes and multiple sockets, the tlbi broadcast > shall be delivered through the interconnects in turn increasing the > CPU interconnect traffic and the latency of the tlbi broadcast > instruction. To avoid the synchronous delivery of the tlbi broadcast > before the tlbi instruction can be retired, the hardware would need to > implement a replicated mm_cpumask bitflag for each ASID and every CPU > would need to tell every other CPU which ASID is being loaded. Exactly > what x86 does with mm_cpumask in software. > > Even within a single NUMA node the latency of the tlbi broadcast > instruction increases almost linearly with the number of CPUs trying > to send tlbi broadcasts at the same time. > > If a single thread of the process is running and it's also running in > the CPU issuing the TLB flush, or if no thread of the process are > running, we can achieve full SMP scalability in the arm64 TLB flushng > by skipping the tlbi broadcasting. > > After the local TLB flush this means the ASID context goes out of sync > in all CPUs except the local one. This can be tracked on the per-mm > cpumask: if the bit is set it means the ASID context is stale for that > CPU. This results in an extra local ASID TLB flush only when threads > are running in new CPUs after a TLB flush. > > Skipping the tlbi instruction broadcasting is already implemented in > local_flush_tlb_all(), this patch only extends it to flush_tlb_mm(), > flush_tlb_range() and flush_tlb_page() too. > > The below benchmarks are measured on a non-NUMA 32 CPUs system (ARMv8 > Ampere), so it should be far from a worst case scenario: the > enterprise kernel config allows multiple NUMA nodes with NR_CPUS set > by default to 4096. > > === stock === > > # cat for-each-cpu.sh > #!/bin/bash > > for i in $(seq `nproc`); do > "$@" &>/dev/null & > done > wait > # perf stat -r 10 -e dummy ./for-each-cpu.sh ./mprotect-threaded 10000 > [..] > 2.1696 +- 0.0122 seconds time elapsed ( +- 0.56% ) > > # perf stat -r 10 -e dummy ./for-each-cpu.sh ./gperftools/tcmalloc_large_heap_fragmentation_unittest > [..] > 0.99018 +- 0.00360 seconds time elapsed ( +- 0.36% ) > > # cat sort-compute > #!/bin/bash > > for x in `seq 256`; do > for i in `seq 32`; do /usr/bin/sort </usr/bin/sort >/dev/null; done & > done > wait > # perf stat -r 10 -e dummy ./sort-compute > [..] > 1.8094 +- 0.0139 seconds time elapsed ( +- 0.77% ) > > === patch applied === > > # perf stat -r 10 -e dummy ./for-each-cpu.sh ./mprotect-threaded 10000 > [..] > 0.13941 +- 0.00449 seconds time elapsed ( +- 3.22% ) > > # perf stat -r 10 -e dummy ./for-each-cpu.sh ./gperftools/tcmalloc_large_heap_fragmentation_unittest > [..] > 0.90510 +- 0.00262 seconds time elapsed ( +- 0.29% ) > > # perf stat -r 10 -e dummy ./sort-compute > [..] > 1.64025 +- 0.00618 seconds time elapsed ( +- 0.38% ) > > Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx> > --- > arch/arm64/include/asm/efi.h | 2 +- > arch/arm64/include/asm/mmu.h | 4 +- > arch/arm64/include/asm/mmu_context.h | 33 ++++++++-- > arch/arm64/include/asm/tlbflush.h | 95 +++++++++++++++++++++++++++- > arch/arm64/mm/context.c | 54 ++++++++++++++++ > 5 files changed, 177 insertions(+), 11 deletions(-) > > diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h > index 44531a69d32b..5d9a1433d918 100644 > --- a/arch/arm64/include/asm/efi.h > +++ b/arch/arm64/include/asm/efi.h > @@ -131,7 +131,7 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) > > static inline void efi_set_pgd(struct mm_struct *mm) > { > - __switch_mm(mm); > + __switch_mm(mm, smp_processor_id()); > > if (system_uses_ttbr0_pan()) { > if (mm != current->active_mm) { > diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h > index e4d862420bb4..9072fd7bc5f8 100644 > --- a/arch/arm64/include/asm/mmu.h > +++ b/arch/arm64/include/asm/mmu.h > @@ -20,6 +20,7 @@ typedef struct { > atomic64_t id; > void *vdso; > unsigned long flags; > + atomic_t nr_active_mm; > } mm_context_t; > > /* > @@ -27,7 +28,8 @@ typedef struct { > * ASID change and therefore doesn't need to reload the counter using > * atomic64_read. > */ > -#define ASID(mm) ((mm)->context.id.counter & 0xffff) > +#define __ASID(asid) ((asid) & 0xffff) > +#define ASID(mm) __ASID((mm)->context.id.counter) > > extern bool arm64_use_ng_mappings; > > diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h > index 3827ff4040a3..9c66fe317e2f 100644 > --- a/arch/arm64/include/asm/mmu_context.h > +++ b/arch/arm64/include/asm/mmu_context.h > @@ -175,7 +175,10 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp) > #define destroy_context(mm) do { } while(0) > void check_and_switch_context(struct mm_struct *mm, unsigned int cpu); > > -#define init_new_context(tsk,mm) ({ atomic64_set(&(mm)->context.id, 0); 0; }) > +#define init_new_context(tsk,mm) \ > + ({ atomic64_set(&(mm)->context.id, 0); \ > + atomic_set(&(mm)->context.nr_active_mm, 0); \ > + 0; }) > > #ifdef CONFIG_ARM64_SW_TTBR0_PAN > static inline void update_saved_ttbr0(struct task_struct *tsk, > @@ -203,6 +206,15 @@ static inline void update_saved_ttbr0(struct task_struct *tsk, > static inline void > enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) > { > + unsigned int cpu = smp_processor_id(); > + if (per_cpu(cpu_not_lazy_tlb, cpu) && > + is_idle_task(tsk)) { > + per_cpu(cpu_not_lazy_tlb, cpu) = false; > + if (!system_uses_ttbr0_pan()) > + cpu_set_reserved_ttbr0(); > + atomic_dec(&mm->context.nr_active_mm); > + } > + VM_WARN_ON(atomic_read(&mm->context.nr_active_mm) < 0); > /* > * We don't actually care about the ttbr0 mapping, so point it at the > * zero page. > @@ -210,10 +222,8 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) > update_saved_ttbr0(tsk, &init_mm); > } > > -static inline void __switch_mm(struct mm_struct *next) > +static inline void __switch_mm(struct mm_struct *next, unsigned int cpu) > { > - unsigned int cpu = smp_processor_id(); > - > /* > * init_mm.pgd does not contain any user mappings and it is always > * active for kernel addresses in TTBR1. Just set the reserved TTBR0. > @@ -230,8 +240,19 @@ static inline void > switch_mm(struct mm_struct *prev, struct mm_struct *next, > struct task_struct *tsk) > { > - if (prev != next) > - __switch_mm(next); > + unsigned int cpu = smp_processor_id(); > + > + if (!per_cpu(cpu_not_lazy_tlb, cpu)) { > + per_cpu(cpu_not_lazy_tlb, cpu) = true; > + atomic_inc(&next->context.nr_active_mm); > + __switch_mm(next, cpu); > + } else if (prev != next) { > + atomic_inc(&next->context.nr_active_mm); > + __switch_mm(next, cpu); > + atomic_dec(&prev->context.nr_active_mm); > + } > + VM_WARN_ON(!atomic_read(&next->context.nr_active_mm)); > + VM_WARN_ON(atomic_read(&prev->context.nr_active_mm) < 0); > > /* > * Update the saved TTBR0_EL1 of the scheduled-in task as the previous > diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h > index bc3949064725..0bd987ff9cbd 100644 > --- a/arch/arm64/include/asm/tlbflush.h > +++ b/arch/arm64/include/asm/tlbflush.h > @@ -136,6 +136,15 @@ static inline void local_flush_tlb_all(void) > isb(); > } > > +static inline void local_flush_tlb_asid(unsigned long asid) > +{ > + asid = __TLBI_VADDR(0, __ASID(asid)); > + dsb(nshst); > + __tlbi(aside1, asid); > + __tlbi_user(aside1, asid); > + dsb(nsh); > +} > + > static inline void flush_tlb_all(void) > { > dsb(ishst); > @@ -144,9 +153,38 @@ static inline void flush_tlb_all(void) > isb(); > } > > +DECLARE_PER_CPU(bool, cpu_not_lazy_tlb); > + > +enum tlb_flush_types { > + TLB_FLUSH_NO, > + TLB_FLUSH_LOCAL, > + TLB_FLUSH_BROADCAST, > +}; > +extern enum tlb_flush_types tlb_flush_check(struct mm_struct *mm, > + unsigned int cpu); > + > static inline void flush_tlb_mm(struct mm_struct *mm) > { > unsigned long asid = __TLBI_VADDR(0, ASID(mm)); > + enum tlb_flush_types flush; > + > + flush = tlb_flush_check(mm, get_cpu()); > + switch (flush) { > + case TLB_FLUSH_LOCAL: > + > + dsb(nshst); > + __tlbi(aside1, asid); > + __tlbi_user(aside1, asid); > + dsb(nsh); > + > + /* fall through */ > + case TLB_FLUSH_NO: > + put_cpu(); > + return; > + case TLB_FLUSH_BROADCAST: > + break; > + } > + put_cpu(); > > dsb(ishst); > __tlbi(aside1is, asid); > @@ -167,7 +205,31 @@ static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, > static inline void flush_tlb_page(struct vm_area_struct *vma, > unsigned long uaddr) > { > - flush_tlb_page_nosync(vma, uaddr); > + struct mm_struct *mm = vma->vm_mm; > + unsigned long addr = __TLBI_VADDR(uaddr, ASID(mm)); > + enum tlb_flush_types flush; > + > + flush = tlb_flush_check(mm, get_cpu()); > + switch (flush) { > + case TLB_FLUSH_LOCAL: > + > + dsb(nshst); > + __tlbi(vale1, addr); > + __tlbi_user(vale1, addr); > + dsb(nsh); > + > + /* fall through */ > + case TLB_FLUSH_NO: > + put_cpu(); > + return; > + case TLB_FLUSH_BROADCAST: > + break; > + } > + put_cpu(); > + > + dsb(ishst); > + __tlbi(vale1is, addr); > + __tlbi_user(vale1is, addr); > dsb(ish); > } > > @@ -181,14 +243,16 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, > unsigned long start, unsigned long end, > unsigned long stride, bool last_level) > { > - unsigned long asid = ASID(vma->vm_mm); > + struct mm_struct *mm = vma->vm_mm; > + unsigned long asid = ASID(mm); > unsigned long addr; > + enum tlb_flush_types flush; > > start = round_down(start, stride); > end = round_up(end, stride); > > if ((end - start) >= (MAX_TLBI_OPS * stride)) { > - flush_tlb_mm(vma->vm_mm); > + flush_tlb_mm(mm); > return; > } > > @@ -198,6 +262,31 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, > start = __TLBI_VADDR(start, asid); > end = __TLBI_VADDR(end, asid); > > + flush = tlb_flush_check(mm, get_cpu()); > + switch (flush) { > + case TLB_FLUSH_LOCAL: > + > + dsb(nshst); > + for (addr = start; addr < end; addr += stride) { > + if (last_level) { > + __tlbi(vale1, addr); > + __tlbi_user(vale1, addr); > + } else { > + __tlbi(vae1, addr); > + __tlbi_user(vae1, addr); > + } > + } > + dsb(nsh); > + > + /* fall through */ > + case TLB_FLUSH_NO: > + put_cpu(); > + return; > + case TLB_FLUSH_BROADCAST: > + break; > + } > + put_cpu(); > + > dsb(ishst); > for (addr = start; addr < end; addr += stride) { > if (last_level) { > diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c > index 8ef73e89d514..3152b7f7da12 100644 > --- a/arch/arm64/mm/context.c > +++ b/arch/arm64/mm/context.c > @@ -25,6 +25,7 @@ static unsigned long *asid_map; > static DEFINE_PER_CPU(atomic64_t, active_asids); > static DEFINE_PER_CPU(u64, reserved_asids); > static cpumask_t tlb_flush_pending; > +DEFINE_PER_CPU(bool, cpu_not_lazy_tlb); > > #define ASID_MASK (~GENMASK(asid_bits - 1, 0)) > #define ASID_FIRST_VERSION (1UL << asid_bits) > @@ -191,6 +192,12 @@ static u64 new_context(struct mm_struct *mm) > set_asid: > __set_bit(asid, asid_map); > cur_idx = asid; > + /* > + * check_and_switch_context() will change the ASID of this mm > + * so no need of extra ASID local TLB flushes: the new ASID > + * isn't stale anymore after the tlb_flush_pending was set. > + */ > + cpumask_clear(mm_cpumask(mm)); > return idx2asid(asid) | generation; > } > > @@ -240,6 +247,15 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) > raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); > > switch_mm_fastpath: > + /* > + * Enforce CPU ordering between the atomic_inc(nr_active_mm) > + * in switch_mm() and the below cpumask_test_cpu(mm_cpumask). > + */ > + smp_mb(); > + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) { > + cpumask_clear_cpu(cpu, mm_cpumask(mm)); > + local_flush_tlb_asid(asid); > + } > > arm64_apply_bp_hardening(); > > @@ -251,6 +267,44 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) > cpu_switch_mm(mm->pgd, mm); > } > > +enum tlb_flush_types tlb_flush_check(struct mm_struct *mm, unsigned int cpu) > +{ > + if (atomic_read(&mm->context.nr_active_mm) <= 1) { > + bool is_local = current->active_mm == mm && > + per_cpu(cpu_not_lazy_tlb, cpu); > + cpumask_t *stale_cpumask = mm_cpumask(mm); > + unsigned int next_zero = cpumask_next_zero(-1, stale_cpumask); > + bool local_is_clear = false; > + if (next_zero < nr_cpu_ids && > + (is_local && next_zero == cpu)) { > + next_zero = cpumask_next_zero(next_zero, stale_cpumask); > + local_is_clear = true; > + } > + if (next_zero < nr_cpu_ids) { > + cpumask_setall(stale_cpumask); > + local_is_clear = false; > + } > + > + /* > + * Enforce CPU ordering between the above > + * cpumask_setall(mm_cpumask) and the below > + * atomic_read(nr_active_mm). > + */ > + smp_mb(); > + > + if (likely(atomic_read(&mm->context.nr_active_mm)) <= 1) { > + if (is_local) { > + if (!local_is_clear) > + cpumask_clear_cpu(cpu, stale_cpumask); > + return TLB_FLUSH_LOCAL; > + } > + if (atomic_read(&mm->context.nr_active_mm) == 0) > + return TLB_FLUSH_NO; > + } > + } > + return TLB_FLUSH_BROADCAST; > +} > + > /* Errata workaround post TTBRx_EL1 update. */ > asmlinkage void post_ttbr_update_workaround(void) > { May I suggest the following (cosmetic) changes to this patch? I'm testing these changes against RHEL integration + regression tests, and I'll also run them against a specially crafted test to measure the impact on task-switching, if any. (I'll report back, soon) Cheers! Rafael -- diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index bb73f02bef28..14eceba302bc 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -182,25 +182,21 @@ static inline void flush_tlb_mm(struct mm_struct *mm) flush = tlb_flush_check(mm, get_cpu()); switch (flush) { case TLB_FLUSH_LOCAL: - dsb(nshst); __tlbi(aside1, asid); __tlbi_user(aside1, asid); dsb(nsh); - /* fall through */ case TLB_FLUSH_NO: - put_cpu(); - return; + break; case TLB_FLUSH_BROADCAST: + dsb(ishst); + __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); + dsb(ish); break; } put_cpu(); - - dsb(ishst); - __tlbi(aside1is, asid); - __tlbi_user(aside1is, asid); - dsb(ish); } static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, @@ -223,25 +219,21 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, flush = tlb_flush_check(mm, get_cpu()); switch (flush) { case TLB_FLUSH_LOCAL: - dsb(nshst); __tlbi(vale1, addr); __tlbi_user(vale1, addr); dsb(nsh); - /* fall through */ case TLB_FLUSH_NO: - put_cpu(); - return; + break; case TLB_FLUSH_BROADCAST: + dsb(ishst); + __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); + dsb(ish); break; } put_cpu(); - - dsb(ishst); - __tlbi(vale1is, addr); - __tlbi_user(vale1is, addr); - dsb(ish); } /* @@ -276,7 +268,6 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, flush = tlb_flush_check(mm, get_cpu()); switch (flush) { case TLB_FLUSH_LOCAL: - dsb(nshst); for (addr = start; addr < end; addr += stride) { if (last_level) { @@ -288,27 +279,24 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, } } dsb(nsh); - /* fall through */ case TLB_FLUSH_NO: - put_cpu(); - return; + break; case TLB_FLUSH_BROADCAST: + dsb(ishst); + for (addr = start; addr < end; addr += stride) { + if (last_level) { + __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); + } else { + __tlbi(vae1is, addr); + __tlbi_user(vae1is, addr); + } + } + dsb(ish); break; } put_cpu(); - - dsb(ishst); - for (addr = start; addr < end; addr += stride) { - if (last_level) { - __tlbi(vale1is, addr); - __tlbi_user(vale1is, addr); - } else { - __tlbi(vae1is, addr); - __tlbi_user(vae1is, addr); - } - } - dsb(ish); } static inline void flush_tlb_range(struct vm_area_struct *vma,