On 2/4/20 5:17 AM, Andrea Arcangeli wrote:
> With multiple NUMA nodes and multiple sockets, the tlbi broadcast
> shall be delivered through the interconnects in turn increasing the
> interconnect traffic and the latency of the tlbi broadcast instruction.
>
> Even within a single NUMA node the latency of the tlbi broadcast
> instruction increases almost linearly with the number of CPUs trying to
> send tlbi broadcasts at the same time.
>
> When the process is single threaded however we can achieve full SMP
> scalability by skipping the tlbi broadcasting. Other arches already
> deploy this optimization.
>
> After the local TLB flush this however means the ASID context goes out
> of sync in all CPUs except the local one. This can be tracked in the
> mm_cpumask(mm): if the bit is set it means the asid context is stale
> for that CPU. This results in an extra local ASID TLB flush only if a
> single threaded process is migrated to a different CPU and only after a
> TLB flush. No extra local TLB flush is needed for the common case of
> single threaded processes context scheduling within the same CPU and for
> multithreaded processes.
>
> Skipping the tlbi instruction broadcasting is already implemented in
> local_flush_tlb_all(), this patch only extends it to flush_tlb_mm(),
> flush_tlb_range() and flush_tlb_page() too.
>
> Here's the result of 32 CPUs (ARMv8 Ampere) running mprotect at the same
> time from 32 single threaded processes before the patch:
>
>  Performance counter stats for './loop' (3 runs):
>
>                  0      dummy
>
>           2.121353 +- 0.000387 seconds time elapsed  ( +-  0.02% )
>
> and with the patch applied:
>
>  Performance counter stats for './loop' (3 runs):
>
>                  0      dummy
>
>          0.1197750 +- 0.0000827 seconds time elapsed  ( +-  0.07% )

Hi,

I have tested this patch on thunderX2 with the Himeno benchmark[1] with
LARGE calculation size. Here are the results.

w/o patch: MFLOPS : 1149.480174
w/  patch: MFLOPS : 1110.653003

In order to validate the effectiveness of the patch, I ran a single-threaded
program, which calls mprotect() in a loop to issue the tlbi broadcast
instruction on a CPU core (see the sketch below). At the same time, I ran
the Himeno benchmark on another CPU core. The results are:

w/o patch: MFLOPS : 860.238792
w/  patch: MFLOPS : 1110.449666
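For reference, a stressor of the kind described above can be as simple as
the following sketch. It is illustrative only, not the exact program used
for the numbers above; the CPU-number argument and the read/read-write
toggling pattern are assumptions about the setup.

/*
 * Pin to one CPU and toggle page protections in a loop. Each downgrade
 * to read-only forces a TLB invalidation for the page: a tlbi broadcast
 * without the patch, a local flush with it.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        long page = sysconf(_SC_PAGESIZE);
        int cpu = argc > 1 ? atoi(argv[1]) : 0; /* CPU to pin to */
        cpu_set_t set;
        char *buf;

        CPU_ZERO(&set);
        CPU_SET(cpu, &set);
        if (sched_setaffinity(0, sizeof(set), &set))
                return 1;

        buf = mmap(NULL, page, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return 1;
        memset(buf, 0, page);

        for (;;) {
                /* the downgrade to read-only flushes the TLB entry */
                mprotect(buf, page, PROT_READ);
                mprotect(buf, page, PROT_READ | PROT_WRITE);
        }
}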
Though the Himeno benchmark is a microbenchmark, I hope it helps.

[1] http://accc.riken.jp/en/supercom/documents/himenobmt/

Thanks,
QI Fuli

>
> Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> ---
>  arch/arm64/include/asm/efi.h         |  2 +-
>  arch/arm64/include/asm/mmu.h         |  3 +-
>  arch/arm64/include/asm/mmu_context.h | 10 +--
>  arch/arm64/include/asm/tlbflush.h    | 91 +++++++++++++++++++++++++++-
>  arch/arm64/mm/context.c              | 13 +++-
>  5 files changed, 109 insertions(+), 10 deletions(-)
>
> diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
> index 44531a69d32b..5d9a1433d918 100644
> --- a/arch/arm64/include/asm/efi.h
> +++ b/arch/arm64/include/asm/efi.h
> @@ -131,7 +131,7 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
>
>  static inline void efi_set_pgd(struct mm_struct *mm)
>  {
> -        __switch_mm(mm);
> +        __switch_mm(mm, smp_processor_id());
>
>          if (system_uses_ttbr0_pan()) {
>                  if (mm != current->active_mm) {
> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> index e4d862420bb4..1f84289d3e44 100644
> --- a/arch/arm64/include/asm/mmu.h
> +++ b/arch/arm64/include/asm/mmu.h
> @@ -27,7 +27,8 @@ typedef struct {
>           * ASID change and therefore doesn't need to reload the counter using
>           * atomic64_read.
>           */
> -#define ASID(mm)        ((mm)->context.id.counter & 0xffff)
> +#define __ASID(asid)        ((asid) & 0xffff)
> +#define ASID(mm)        __ASID((mm)->context.id.counter)
>
>  extern bool arm64_use_ng_mappings;
>
> diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> index 3827ff4040a3..5ec264297968 100644
> --- a/arch/arm64/include/asm/mmu_context.h
> +++ b/arch/arm64/include/asm/mmu_context.h
> @@ -210,10 +210,8 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
>          update_saved_ttbr0(tsk, &init_mm);
>  }
>
> -static inline void __switch_mm(struct mm_struct *next)
> +static inline void __switch_mm(struct mm_struct *next, unsigned int cpu)
>  {
> -        unsigned int cpu = smp_processor_id();
> -
>          /*
>           * init_mm.pgd does not contain any user mappings and it is always
>           * active for kernel addresses in TTBR1. Just set the reserved TTBR0.
> @@ -230,8 +228,12 @@ static inline void
>  switch_mm(struct mm_struct *prev, struct mm_struct *next,
>            struct task_struct *tsk)
>  {
> +        unsigned int cpu = smp_processor_id();
> +
>          if (prev != next)
> -                __switch_mm(next);
> +                __switch_mm(next, cpu);
> +        else if (cpumask_test_and_clear_cpu(cpu, mm_cpumask(next)))
> +                local_flush_tlb_asid(atomic64_read(&next->context.id));
>
>          /*
>           * Update the saved TTBR0_EL1 of the scheduled-in task as the previous
> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index bc3949064725..283f97af3fc5 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -136,6 +136,15 @@ static inline void local_flush_tlb_all(void)
>          isb();
>  }
>
> +static inline void local_flush_tlb_asid(unsigned long asid)
> +{
> +        asid = __TLBI_VADDR(0, __ASID(asid));
> +        dsb(nshst);
> +        __tlbi(aside1, asid);
> +        __tlbi_user(aside1, asid);
> +        dsb(nsh);
> +}
> +
>  static inline void flush_tlb_all(void)
>  {
>          dsb(ishst);
> @@ -148,6 +157,27 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
>  {
>          unsigned long asid = __TLBI_VADDR(0, ASID(mm));
>
> +        /* avoid TLB-i broadcast to remote NUMA nodes if it's a local flush */
> +        if (current->mm == mm && atomic_read(&mm->mm_users) <= 1) {
> +                int cpu = get_cpu();
> +
> +                cpumask_setall(mm_cpumask(mm));
> +                cpumask_clear_cpu(cpu, mm_cpumask(mm));
> +
> +                smp_mb();
> +
> +                if (atomic_read(&mm->mm_users) <= 1) {
> +                        dsb(nshst);
> +                        __tlbi(aside1, asid);
> +                        __tlbi_user(aside1, asid);
> +                        dsb(nsh);
> +
> +                        put_cpu();
> +                        return;
> +                }
> +                put_cpu();
> +        }
> +
>          dsb(ishst);
>          __tlbi(aside1is, asid);
>          __tlbi_user(aside1is, asid);
> @@ -167,7 +197,33 @@ static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
>  static inline void flush_tlb_page(struct vm_area_struct *vma,
>                                    unsigned long uaddr)
>  {
> -        flush_tlb_page_nosync(vma, uaddr);
> +        struct mm_struct *mm = vma->vm_mm;
> +        unsigned long addr = __TLBI_VADDR(uaddr, ASID(mm));
> +
> +        /* avoid TLB-i broadcast to remote NUMA nodes if it's a local flush */
> +        if (current->mm == mm && atomic_read(&mm->mm_users) <= 1) {
> +                int cpu = get_cpu();
> +
> +                cpumask_setall(mm_cpumask(mm));
> +                cpumask_clear_cpu(cpu, mm_cpumask(mm));
> +
> +                smp_mb();
> +
> +                if (atomic_read(&mm->mm_users) <= 1) {
> +                        dsb(nshst);
> +                        __tlbi(vale1, addr);
> +                        __tlbi_user(vale1, addr);
> +                        dsb(nsh);
> +
> +                        put_cpu();
> +                        return;
> +                }
> +                put_cpu();
> +        }
> +
> +        dsb(ishst);
> +        __tlbi(vale1is, addr);
> +        __tlbi_user(vale1is, addr);
>          dsb(ish);
>  }
>
> @@ -181,14 +237,15 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
>                                       unsigned long start, unsigned long end,
>                                       unsigned long stride, bool last_level)
>  {
> -        unsigned long asid = ASID(vma->vm_mm);
> +        struct mm_struct *mm = vma->vm_mm;
> +        unsigned long asid = ASID(mm);
>          unsigned long addr;
>
>          start = round_down(start, stride);
>          end = round_up(end, stride);
>
>          if ((end - start) >= (MAX_TLBI_OPS * stride)) {
> -                flush_tlb_mm(vma->vm_mm);
> +                flush_tlb_mm(mm);
>                  return;
>          }
>
> @@ -198,6 +255,34 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
>          start = __TLBI_VADDR(start, asid);
>          end = __TLBI_VADDR(end, asid);
>
> +        /* avoid TLB-i broadcast to remote NUMA nodes if it's a local flush */
> +        if (current->mm == mm && atomic_read(&mm->mm_users) <= 1) {
> +                int cpu = get_cpu();
> +
> +                cpumask_setall(mm_cpumask(mm));
> +                cpumask_clear_cpu(cpu, mm_cpumask(mm));
> +
> +                smp_mb();
> +
> +                if (atomic_read(&mm->mm_users) <= 1) {
> +                        dsb(nshst);
> +                        for (addr = start; addr < end; addr += stride) {
> +                                if (last_level) {
> +                                        __tlbi(vale1, addr);
> +                                        __tlbi_user(vale1, addr);
> +                                } else {
> +                                        __tlbi(vae1, addr);
> +                                        __tlbi_user(vae1, addr);
> +                                }
> +                        }
> +                        dsb(nsh);
> +
> +                        put_cpu();
> +                        return;
> +                }
> +                put_cpu();
> +        }
> +
>          dsb(ishst);
>          for (addr = start; addr < end; addr += stride) {
>                  if (last_level) {
> diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
> index 8ef73e89d514..b44459d64dff 100644
> --- a/arch/arm64/mm/context.c
> +++ b/arch/arm64/mm/context.c
> @@ -198,6 +198,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>  {
>          unsigned long flags;
>          u64 asid, old_active_asid;
> +        int need_flush = 0;
>
>          if (system_supports_cnp())
>                  cpu_set_reserved_ttbr0();
> @@ -233,7 +234,8 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>                  atomic64_set(&mm->context.id, asid);
>          }
>
> -        if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
> +        need_flush = cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending);
> +        if (need_flush)
>                  local_flush_tlb_all();
>
>          atomic64_set(&per_cpu(active_asids, cpu), asid);
> @@ -241,6 +243,15 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>
>  switch_mm_fastpath:
>
> +        /*
> +         * Enforce CPU ordering between the mmget() in use_mm() and
> +         * the below cpumask_test_and_clear_cpu().
> +         */
> +        smp_mb();
> +        if (cpumask_test_and_clear_cpu(cpu, mm_cpumask(mm)) &&
> +            likely(!need_flush))
> +                local_flush_tlb_asid(asid);
> +
>          arm64_apply_bp_hardening();
>
>          /*
>
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@xxxxxxxxxxxxxxxxxxx
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>