>
> Hello,
>
> This introduces an nr_active_mm counter that allows the tlbi broadcast to
> be optimized away for multithreaded processes too; it no longer relies on
> mm_users <= 1.
>
> This also optimizes away all TLB flushes (including local ones) when the
> process is not running on any CPU (including during exit_mmap, with lazy
> tlb state).
>
> This optimization is generally only observable when there are parallel
> TLB flushes from different processes on multiple CPUs. One possible use
> case is userland malloc libraries freeing small objects with
> MADV_DONTNEED, causing frequent tiny TLB flushes, as demonstrated by the
> tcmalloc testsuite.
>
> Memory-intensive apps dealing with a multitude of frequently freed small
> objects tend to opt out of glibc and opt in to jemalloc or tcmalloc, so
> this should facilitate the SMP/NUMA scalability of long-lived apps with
> small objects running in different containers, if they issue frequent
> MADV_DONTNEED TLB flushes while the other threads of the process are not
> running.
>
> It was suggested that I implement the mm_cpumask the standard way in
> order to optimize multithreaded apps too and to avoid restricting the
> optimization to mm_users <= 1. So initially I had two bitmasks, allocated
> as shown at the bottom of this cover letter by setting
> ARCH_NR_MM_CPUMASK to 2 with the patch below applied... however, I
> figured a single atomic counter per mm achieves the exact same runtime
> behavior as the extra bitmap, so I dropped the extra bitmap and replaced
> it with nr_active_mm as an optimization.
>
> If the atomic ops in the switch_mm fast path are a concern (they're still
> faster than cpumask_set_cpu/clear_cpu with fewer than 256-512 CPUs), it's
> worth mentioning that it would be possible to remove all atomic ops from
> the switch_mm fast path by restricting this optimization to
> single-threaded processes, checking mm_users <= 1 and < 1 instead of
> nr_active_mm <= 1 and < 1, similarly to what the earlier version of this
> patchset did.
>
> Thanks,
> Andrea
>
> Andrea Arcangeli (3):
>   mm: use_mm: fix for arches checking mm_users to optimize TLB flushes
>   arm64: select CPUMASK_OFFSTACK if NUMA
>   arm64: tlb: skip tlbi broadcast
>
>  arch/arm64/Kconfig                   |  1 +
>  arch/arm64/include/asm/efi.h         |  2 +-
>  arch/arm64/include/asm/mmu.h         |  4 +-
>  arch/arm64/include/asm/mmu_context.h | 33 ++++++++--
>  arch/arm64/include/asm/tlbflush.h    | 95 +++++++++++++++++++++++++++-
>  arch/arm64/mm/context.c              | 54 ++++++++++++++++
>  mm/mmu_context.c                     |  2 +
>  7 files changed, 180 insertions(+), 11 deletions(-)
>
> Early attempt with the standard mm_cpumask follows:
>
> From: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Subject: mm: allow per-arch mm_cpumasks based on ARCH_NR_MM_CPUMASK
>
> Allow archs to allocate multiple mm_cpumasks in the mm_struct per-arch by
> defining ARCH_NR_MM_CPUMASK > 1 (to be included before
> "linux/mm_types.h").
>
> Those extra per-mm cpumasks can be referenced with __mm_cpumask(N, mm),
> where N == 0 points to the mm_cpumask() known by the common code and
> N > 0 points to the per-arch private ones.
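[A minimal sketch of the intended arch-side usage, not part of the posted
patch: the helper name mm_stale_cpumask() is hypothetical, but the N == 1
index matches the tlb_flush_check() code quoted further below.]

/* In an arch header included before "linux/mm_types.h":
 * ask for a second, arch-private bitmap next to mm_cpumask(). */
#define ARCH_NR_MM_CPUMASK 2

/* Index 0 is the ordinary mm_cpumask(); index 1 is private to the
 * arch, e.g. to track CPUs with potentially stale TLB entries
 * (hypothetical helper name). */
static inline struct cpumask *mm_stale_cpumask(struct mm_struct *mm)
{
        return __mm_cpumask(1, mm);
}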
> ---
>  drivers/firmware/efi/efi.c |  3 ++-
>  include/linux/mm_types.h   | 17 +++++++++++++++--
>  kernel/fork.c              |  3 ++-
>  mm/init-mm.c               |  2 +-
>  4 files changed, 20 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
> index 5da0232ae33f..608c9bf181e5 100644
> --- a/drivers/firmware/efi/efi.c
> +++ b/drivers/firmware/efi/efi.c
> @@ -86,7 +86,8 @@ struct mm_struct efi_mm = {
>  	.mmap_sem		= __RWSEM_INITIALIZER(efi_mm.mmap_sem),
>  	.page_table_lock	= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
>  	.mmlist			= LIST_HEAD_INIT(efi_mm.mmlist),
> -	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS)] = 0},
> +	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS) *
> +				     ARCH_NR_MM_CPUMASK] = 0},
>  };
>
>  struct workqueue_struct *efi_rts_wq;
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index f29bba20bba1..b53d5622b3b2 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -531,6 +531,9 @@ struct mm_struct {
>  	RH_KABI_RESERVE(7)
>  	RH_KABI_RESERVE(8)
>
> +#ifndef ARCH_NR_MM_CPUMASK
> +#define ARCH_NR_MM_CPUMASK 1
> +#endif
>  	/*
>  	 * The mm_cpumask needs to be at the end of mm_struct, because it
>  	 * is dynamically sized based on nr_cpu_ids.
> @@ -544,15 +547,25 @@ extern struct mm_struct init_mm;
>  static inline void mm_init_cpumask(struct mm_struct *mm)
>  {
>  	unsigned long cpu_bitmap = (unsigned long)mm;
> +	int i;
>
>  	cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
> -	cpumask_clear((struct cpumask *)cpu_bitmap);
> +	for (i = 0; i < ARCH_NR_MM_CPUMASK; i++) {
> +		cpumask_clear((struct cpumask *)cpu_bitmap);
> +		cpu_bitmap += cpumask_size();
> +	}
>  }
>
>  /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
> +static inline cpumask_t *__mm_cpumask(int index, struct mm_struct *mm)
> +{
> +	return (struct cpumask *)((unsigned long)&mm->cpu_bitmap +
> +				  cpumask_size() * index);
> +}
> +
>  static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
>  {
> -	return (struct cpumask *)&mm->cpu_bitmap;
> +	return __mm_cpumask(0, mm);
>  }
>
>  struct mmu_gather;
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 1dad2f91fac3..a6cbbc1b6008 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2418,7 +2418,8 @@ void __init proc_caches_init(void)
>  	 * dynamically sized based on the maximum CPU number this system
>  	 * can have, taking hotplug into account (nr_cpu_ids).
>  	 */
> -	mm_size = sizeof(struct mm_struct) + cpumask_size();
> +	mm_size = sizeof(struct mm_struct) + cpumask_size() * \
> +		  ARCH_NR_MM_CPUMASK;
>
>  	mm_cachep = kmem_cache_create_usercopy("mm_struct",
>  			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
>
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index a787a319211e..d975f8ce270e 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -35,6 +35,6 @@ struct mm_struct init_mm = {
>  	.arg_lock	= __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
>  	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
>  	.user_ns	= &init_user_ns,
> -	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
> +	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS) * ARCH_NR_MM_CPUMASK] = 0},
>  	INIT_MM_CONTEXT(init_mm)
>  };
>
> [bitmap version depending on the above follows]
>
> @@ -248,6 +260,42 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>  	cpu_switch_mm(mm->pgd, mm);
>  }
>
> +enum tlb_flush_types tlb_flush_check(struct mm_struct *mm, unsigned int cpu)
> +{
> +	if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
> +		bool is_local = cpumask_test_cpu(cpu, mm_cpumask(mm));
> +		cpumask_t *stale_cpumask = __mm_cpumask(1, mm);
> +		int next_zero = cpumask_next_zero(-1, stale_cpumask);
> +		bool local_is_clear = false;
> +		if (next_zero < nr_cpu_ids &&
> +		    (is_local && next_zero == cpu)) {
> +			next_zero = cpumask_next_zero(next_zero, stale_cpumask);
> +			local_is_clear = true;
> +		}
> +		if (next_zero < nr_cpu_ids) {
> +			cpumask_setall(stale_cpumask);
> +			local_is_clear = false;
> +		}
> +
> +		/*
> +		 * Enforce CPU ordering between the
> +		 * cpumask_setall() and cpumask_any_but().
> +		 */
> +		smp_mb();
> +
> +		if (likely(cpumask_any_but(mm_cpumask(mm),
> +					   cpu) >= nr_cpu_ids)) {
> +			if (is_local) {
> +				if (!local_is_clear)
> +					cpumask_clear_cpu(cpu, stale_cpumask);
> +				return TLB_FLUSH_LOCAL;
> +			}
> +			return TLB_FLUSH_NO;
> +		}
> +	}
> +	return TLB_FLUSH_BROADCAST;
> +}
> +
>  /* Errata workaround post TTBRx_EL1 update. */
>  asmlinkage void post_ttbr_update_workaround(void)
>  {
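[A minimal, hedged sketch of how the three tlb_flush_check() return values
could be consumed on the arm64 flush side. The helpers
local_flush_tlb_asid() and broadcast_flush_tlb_asid() are hypothetical
stand-ins; the real consumer lives in the "arm64: tlb: skip tlbi
broadcast" patch, which is not quoted here.]

/* Hypothetical consumer of tlb_flush_check(); names are assumptions. */
static inline void flush_tlb_mm(struct mm_struct *mm)
{
        switch (tlb_flush_check(mm, get_cpu())) {
        case TLB_FLUSH_LOCAL:
                /* mm only ever ran here: a local ASID flush suffices */
                local_flush_tlb_asid(mm);
                break;
        case TLB_FLUSH_NO:
                /* mm is not running anywhere and this CPU is already
                 * marked stale: no flush needed at all */
                break;
        case TLB_FLUSH_BROADCAST:
                /* other CPUs may hold entries: fall back to the
                 * tlbi broadcast */
                broadcast_flush_tlb_asid(mm);
                break;
        }
        put_cpu();
}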
Hi Andrea,

Thank you very much for your patch.
I also tested this v2 patch with the Himeno benchmark[1] on ThunderX2
with the v5.5.7 kernel. In order to confirm the effect of the patch, I
used the mprotect-attacker-threaded.c[2] program to issue tlbi broadcast
instructions as noise, and made it run on a single core or on multiple
cores via the taskset command. The following is the result.

[w/o patch, w/o noise program]
MFLOPS : 1614.438709 +- 2.640061

[w/o patch, w/ noise program running on multiple cores]
MFLOPS : 1152.862612 +- 0.7933615

[w/o patch, w/ noise program running on a single core]
MFLOPS : 1152.996692 +- 1.6517165

[w/ patch, w/o noise program]
MFLOPS : 1613.904908 +- 0.606810

[w/ patch, w/ noise program running on multiple cores]
MFLOPS : 1614.586227 +- 0.3268545

[w/ patch, w/ noise program running on a single core]
MFLOPS : 1614.910829 +- 0.694644

[1] http://accc.riken.jp/en/supercom/documents/himenobmt/
[2] $ cat mprotect-attacker-threaded.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/mman.h>

/* Background thread that just sleeps, so the process stays
 * multithreaded while the main thread issues TLB flushes. */
void *sleeper(void *arg)
{
        pause();
        return NULL;
}

int main(int argc, char *argv[])
{
        int i;
        char *buf;
        long size = 4 * 1024 * 1024;
        long loop = 10;
        pthread_t pthread;

        if (pthread_create(&pthread, NULL, sleeper, NULL)) {
                perror("pthread_create");
                exit(1);
        }

        if (argc == 2)
                loop = atoi(argv[1]);

        buf = mmap(NULL, size, PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
        if (buf == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }
        memset(buf, 1, size);

        /* Each mprotect() changes permissions on the whole 4 MiB
         * mapping and therefore triggers a TLB flush. */
        for (i = 0; i < loop; i++) {
                mprotect(buf, size, PROT_READ);
                mprotect(buf, size, PROT_WRITE);
        }
        munmap(buf, size);
        return 0;
}

Sincerely,
QI Fuli
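[A hedged sketch of building and pinning the noise program as described
above; the compiler flags, loop count, and CPU numbers are assumptions,
not taken from the original report.]

$ gcc -O2 -o mprotect-attacker-threaded mprotect-attacker-threaded.c -lpthread
# noise pinned to a single core:
$ taskset -c 0 ./mprotect-attacker-threaded 100000000
# noise spread over multiple cores:
$ for c in 0 1 2 3; do taskset -c "$c" ./mprotect-attacker-threaded 100000000 & done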