>
> Hello,
>
> This introduces an nr_active_mm counter that allows the tlbi broadcast to
> be optimized away for multithreaded processes too; it no longer relies on
> mm_users <= 1.
>
> This also optimizes away all TLB flushes (including local ones) when the
> process is not running on any CPU (including during exit_mmap, with lazy
> tlb state).
>
> This optimization is generally only observable when there are parallel
> TLB flushes from different processes on multiple CPUs. One possible use
> case is userland malloc libraries freeing small objects with
> MADV_DONTNEED, causing frequent tiny TLB flushes, as demonstrated by the
> tcmalloc testsuite.
>
> Memory-intensive apps dealing with a multitude of frequently freed small
> objects tend to opt out of glibc and opt in to jemalloc or tcmalloc, so
> this should facilitate the SMP/NUMA scalability of long-lived apps with
> small objects running in different containers, if they issue frequent
> MADV_DONTNEED TLB flushes while the other threads of the process are not
> running.
>
> It was suggested that I implement the mm_cpumask the standard way in
> order to optimize multithreaded apps too and to avoid restricting the
> optimization to mm_users <= 1. So initially I had two bitmasks, allocated
> as shown at the bottom of this cover letter by setting
> ARCH_NR_MM_CPUMASK to 2 with the patch below applied... however, I
> figured a single atomic counter per mm achieves the exact same runtime
> behavior as the extra bitmap, so I dropped the extra bitmap and replaced
> it with nr_active_mm as an optimization.
>
> If the atomic ops in the switch_mm fast path are a concern (they're still
> faster than cpumask_set_cpu/clear_cpu with fewer than 256-512 CPUs), it's
> worth mentioning that it would be possible to remove all atomic ops from
> the switch_mm fast path by restricting this optimization to
> single-threaded processes, checking mm_users <= 1 and < 1 instead of
> nr_active_mm <= 1 and < 1, similarly to what the earlier version of this
> patchset did.
>
> Thanks,
> Andrea
>
> Andrea Arcangeli (3):
>   mm: use_mm: fix for arches checking mm_users to optimize TLB flushes
>   arm64: select CPUMASK_OFFSTACK if NUMA
>   arm64: tlb: skip tlbi broadcast
>
>  arch/arm64/Kconfig                   |  1 +
>  arch/arm64/include/asm/efi.h         |  2 +-
>  arch/arm64/include/asm/mmu.h         |  4 +-
>  arch/arm64/include/asm/mmu_context.h | 33 ++++++++--
>  arch/arm64/include/asm/tlbflush.h    | 95 +++++++++++++++++++++++++++-
>  arch/arm64/mm/context.c              | 54 ++++++++++++++++
>  mm/mmu_context.c                     |  2 +
>  7 files changed, 180 insertions(+), 11 deletions(-)
>
> Early attempt with the standard mm_cpumask follows:
>
> From: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Subject: mm: allow per-arch mm_cpumasks based on ARCH_NR_MM_CPUMASK
>
> Allow archs to allocate multiple mm_cpumasks in the mm_struct per-arch by
> defining ARCH_NR_MM_CPUMASK > 1 (to be included before
> "linux/mm_types.h").
>
> Those extra per-mm cpumasks can be referenced with __mm_cpumask(N, mm),
> where N == 0 points to the mm_cpumask() known by the common code and
> N > 0 points to the per-arch private ones.
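[A minimal sketch of the intended arch-side usage, not part of the posted
patch: the helper name mm_stale_cpumask() is hypothetical, but the N == 1
index matches the tlb_flush_check() code quoted further below.]

/* In an arch header included before "linux/mm_types.h":
 * ask for a second, arch-private bitmap next to mm_cpumask(). */
#define ARCH_NR_MM_CPUMASK 2

/* Index 0 is the ordinary mm_cpumask(); index 1 is private to the
 * arch, e.g. to track CPUs with potentially stale TLB entries
 * (hypothetical helper name). */
static inline struct cpumask *mm_stale_cpumask(struct mm_struct *mm)
{
        return __mm_cpumask(1, mm);
}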
> ---
>  drivers/firmware/efi/efi.c |  3 ++-
>  include/linux/mm_types.h   | 17 +++++++++++++++--
>  kernel/fork.c              |  3 ++-
>  mm/init-mm.c               |  2 +-
>  4 files changed, 20 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
> index 5da0232ae33f..608c9bf181e5 100644
> --- a/drivers/firmware/efi/efi.c
> +++ b/drivers/firmware/efi/efi.c
> @@ -86,7 +86,8 @@ struct mm_struct efi_mm = {
>  	.mmap_sem		= __RWSEM_INITIALIZER(efi_mm.mmap_sem),
>  	.page_table_lock	= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
>  	.mmlist			= LIST_HEAD_INIT(efi_mm.mmlist),
> -	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS)] = 0},
> +	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS) *
> +				     ARCH_NR_MM_CPUMASK] = 0},
>  };
>
>  struct workqueue_struct *efi_rts_wq;
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index f29bba20bba1..b53d5622b3b2 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -531,6 +531,9 @@ struct mm_struct {
>  	RH_KABI_RESERVE(7)
>  	RH_KABI_RESERVE(8)
>
> +#ifndef ARCH_NR_MM_CPUMASK
> +#define ARCH_NR_MM_CPUMASK 1
> +#endif
>  	/*
>  	 * The mm_cpumask needs to be at the end of mm_struct, because it
>  	 * is dynamically sized based on nr_cpu_ids.
> @@ -544,15 +547,25 @@ extern struct mm_struct init_mm;
>  static inline void mm_init_cpumask(struct mm_struct *mm)
>  {
>  	unsigned long cpu_bitmap = (unsigned long)mm;
> +	int i;
>
>  	cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
> -	cpumask_clear((struct cpumask *)cpu_bitmap);
> +	for (i = 0; i < ARCH_NR_MM_CPUMASK; i++) {
> +		cpumask_clear((struct cpumask *)cpu_bitmap);
> +		cpu_bitmap += cpumask_size();
> +	}
>  }
>
>  /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
> +static inline cpumask_t *__mm_cpumask(int index, struct mm_struct *mm)
> +{
> +	return (struct cpumask *)((unsigned long)&mm->cpu_bitmap +
> +				  cpumask_size() * index);
> +}
> +
>  static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
>  {
> -	return (struct cpumask *)&mm->cpu_bitmap;
> +	return __mm_cpumask(0, mm);
>  }
>
>  struct mmu_gather;
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 1dad2f91fac3..a6cbbc1b6008 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2418,7 +2418,8 @@ void __init proc_caches_init(void)
>  	 * dynamically sized based on the maximum CPU number this system
>  	 * can have, taking hotplug into account (nr_cpu_ids).
>  	 */
> -	mm_size = sizeof(struct mm_struct) + cpumask_size();
> +	mm_size = sizeof(struct mm_struct) + cpumask_size() * \
> +		  ARCH_NR_MM_CPUMASK;
>
>  	mm_cachep = kmem_cache_create_usercopy("mm_struct",
>  			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
>
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index a787a319211e..d975f8ce270e 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -35,6 +35,6 @@ struct mm_struct init_mm = {
>  	.arg_lock	= __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
>  	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
>  	.user_ns	= &init_user_ns,
> -	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
> +	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS) * ARCH_NR_MM_CPUMASK] = 0},
>  	INIT_MM_CONTEXT(init_mm)
>  };
>
> [bitmap version depending on the above follows]
>
> @@ -248,6 +260,42 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>  	cpu_switch_mm(mm->pgd, mm);
>  }
>
> +enum tlb_flush_types tlb_flush_check(struct mm_struct *mm, unsigned int cpu)
> +{
> +	if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
> +		bool is_local = cpumask_test_cpu(cpu, mm_cpumask(mm));
> +		cpumask_t *stale_cpumask = __mm_cpumask(1, mm);
> +		int next_zero = cpumask_next_zero(-1, stale_cpumask);
> +		bool local_is_clear = false;
> +		if (next_zero < nr_cpu_ids &&
> +		    (is_local && next_zero == cpu)) {
> +			next_zero = cpumask_next_zero(next_zero, stale_cpumask);
> +			local_is_clear = true;
> +		}
> +		if (next_zero < nr_cpu_ids) {
> +			cpumask_setall(stale_cpumask);
> +			local_is_clear = false;
> +		}
> +
> +		/*
> +		 * Enforce CPU ordering between the
> +		 * cpumask_setall() and cpumask_any_but().
> +		 */
> +		smp_mb();
> +
> +		if (likely(cpumask_any_but(mm_cpumask(mm),
> +					   cpu) >= nr_cpu_ids)) {
> +			if (is_local) {
> +				if (!local_is_clear)
> +					cpumask_clear_cpu(cpu, stale_cpumask);
> +				return TLB_FLUSH_LOCAL;
> +			}
> +			return TLB_FLUSH_NO;
> +		}
> +	}
> +	return TLB_FLUSH_BROADCAST;
> +}
> +
>  /* Errata workaround post TTBRx_EL1 update. */
>  asmlinkage void post_ttbr_update_workaround(void)
>  {
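[A minimal, hedged sketch of how the three tlb_flush_check() return values
could be consumed on the arm64 flush side. The helpers
local_flush_tlb_asid() and broadcast_flush_tlb_asid() are hypothetical
stand-ins; the real consumer lives in the "arm64: tlb: skip tlbi
broadcast" patch, which is not quoted here.]

/* Hypothetical consumer of tlb_flush_check(); names are assumptions. */
static inline void flush_tlb_mm(struct mm_struct *mm)
{
        switch (tlb_flush_check(mm, get_cpu())) {
        case TLB_FLUSH_LOCAL:
                /* mm only ever ran here: a local ASID flush suffices */
                local_flush_tlb_asid(mm);
                break;
        case TLB_FLUSH_NO:
                /* mm is not running anywhere and this CPU is already
                 * marked stale: no flush needed at all */
                break;
        case TLB_FLUSH_BROADCAST:
                /* other CPUs may hold entries: fall back to the
                 * tlbi broadcast */
                broadcast_flush_tlb_asid(mm);
                break;
        }
        put_cpu();
}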
Hi Andrea,

Thank you very much for your patch.
I also tested this v2 patch with the Himeno benchmark[1] on ThunderX2
with the v5.5.7 kernel. In order to confirm the effect of the patch, I
used the mprotect-attacker-threaded.c[2] program to issue tlbi broadcast
instructions as noise, and made it run on a single core or on multiple
cores via the taskset command. The following is the result.

[w/o patch, w/o noise program]
MFLOPS : 1614.438709 +- 2.640061

[w/o patch, w/ noise program running on multiple cores]
MFLOPS : 1152.862612 +- 0.7933615

[w/o patch, w/ noise program running on a single core]
MFLOPS : 1152.996692 +- 1.6517165

[w/ patch, w/o noise program]
MFLOPS : 1613.904908 +- 0.606810

[w/ patch, w/ noise program running on multiple cores]
MFLOPS : 1614.586227 +- 0.3268545

[w/ patch, w/ noise program running on a single core]
MFLOPS : 1614.910829 +- 0.694644

[1] http://accc.riken.jp/en/supercom/documents/himenobmt/
[2] $ cat mprotect-attacker-threaded.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/mman.h>

/* Background thread that just sleeps, so the process stays
 * multithreaded while the main thread issues TLB flushes. */
void *sleeper(void *arg)
{
        pause();
        return NULL;
}

int main(int argc, char *argv[])
{
        int i;
        char *buf;
        long size = 4 * 1024 * 1024;
        long loop = 10;
        pthread_t pthread;

        if (pthread_create(&pthread, NULL, sleeper, NULL)) {
                perror("pthread_create");
                exit(1);
        }

        if (argc == 2)
                loop = atoi(argv[1]);

        buf = mmap(NULL, size, PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
        if (buf == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }
        memset(buf, 1, size);

        /* Each mprotect() changes permissions on the whole 4 MiB
         * mapping and therefore triggers a TLB flush. */
        for (i = 0; i < loop; i++) {
                mprotect(buf, size, PROT_READ);
                mprotect(buf, size, PROT_WRITE);
        }
        munmap(buf, size);
        return 0;
}

Sincerely,
QI Fuli
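[A hedged sketch of building and pinning the noise program as described
above; the compiler flags, loop count, and CPU numbers are assumptions,
not taken from the original report.]

$ gcc -O2 -o mprotect-attacker-threaded mprotect-attacker-threaded.c -lpthread
# noise pinned to a single core:
$ taskset -c 0 ./mprotect-attacker-threaded 100000000
# noise spread over multiple cores:
$ for c in 0 1 2 3; do taskset -c "$c" ./mprotect-attacker-threaded 100000000 & done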