Re: [RFC PATCH 3/3] mm: increase scalability of global memory commitment accounting

Konstantin Khlebnikov <koct9i@xxxxxxxxx> · Wed, 10 Feb 2016 20:46:33 +0300

On Wed, Feb 10, 2016 at 5:52 PM, Andrey Ryabinin
<aryabinin@xxxxxxxxxxxxx> wrote:
> Currently we use percpu_counter for accounting committed memory. Change
> of committed memory on more than vm_committed_as_batch pages leads to
> grab of counter's spinlock. The batch size is quite small - from 32 pages
> up to 0.4% of the memory/cpu (usually several MBs even on large machines).
>
> So map/munmap of several MBs anonymous memory in multiple processes leads
> to high contention on that spinlock.
>
> Instead of percpu_counter we could use ordinary per-cpu variables.
> Dump test case (8-proccesses running map/munmap of 4MB,
> vm_committed_as_batch = 2MB on test setup) showed 2.5x performance
> improvement.
>
> The downside of this approach is slowdown of vm_memory_committed().
> However, it doesn't matter much since it usually is not in a hot path.
> The only exception is __vm_enough_memory() with overcommit set to
> OVERCOMMIT_NEVER. In that case brk1 test from will-it-scale benchmark
> shows 1.1x - 1.3x performance regression.
>
> So I think it's a good tradeoff. We've got significantly increased
> scalability for the price of some overhead in vm_memory_committed().

I think that's a no-go: a 30% regression on your not-so-big machine.
On 4096 cores the regression will be enormous. Link: https://xkcd.com/619/

There are three per-cpu page counters: in memcg, in vmstat, and this one.
Maybe more. And there is no universal fast resource counter with a quota
and a per-cpu fast path.

>
> Signed-off-by: Andrey Ryabinin <aryabinin@xxxxxxxxxxxxx>
> Cc: Andi Kleen <ak@xxxxxxxxxxxxxxx>
> Cc: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> Cc: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
> Cc: Konstantin Khlebnikov <koct9i@xxxxxxxxx>
> ---
>  fs/proc/meminfo.c    |  2 +-
>  include/linux/mm.h   |  4 ++++
>  include/linux/mman.h | 13 +++----------
>  mm/mm_init.c         | 45 ---------------------------------------------
>  mm/mmap.c            | 11 -----------
>  mm/nommu.c           |  4 ----
>  mm/util.c            | 20 ++++++++------------
>  7 files changed, 16 insertions(+), 83 deletions(-)
>
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index df4661a..f30e387 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -41,7 +41,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
>  #define K(x) ((x) << (PAGE_SHIFT - 10))
>         si_meminfo(&i);
>         si_swapinfo(&i);
> -       committed = percpu_counter_read_positive(&vm_committed_as);
> +       committed = vm_memory_committed();
>
>         cached = global_page_state(NR_FILE_PAGES) -
>                         total_swapcache_pages() - i.bufferram;
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 979bc83..82dac6e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1881,7 +1881,11 @@ extern void memmap_init_zone(unsigned long, int, unsigned long,
>  extern void setup_per_zone_wmarks(void);
>  extern int __meminit init_per_zone_wmark_min(void);
>  extern void mem_init(void);
> +#ifdef CONFIG_MMU
> +static inline void mmap_init(void) {}
> +#else
>  extern void __init mmap_init(void);
> +#endif
>  extern void show_mem(unsigned int flags);
>  extern void si_meminfo(struct sysinfo * val);
>  extern void si_meminfo_node(struct sysinfo *val, int nid);
> diff --git a/include/linux/mman.h b/include/linux/mman.h
> index 16373c8..436ab11 100644
> --- a/include/linux/mman.h
> +++ b/include/linux/mman.h
> @@ -2,7 +2,7 @@
>  #define _LINUX_MMAN_H
>
>  #include <linux/mm.h>
> -#include <linux/percpu_counter.h>
> +#include <linux/percpu.h>
>
>  #include <linux/atomic.h>
>  #include <uapi/linux/mman.h>
> @@ -10,19 +10,12 @@
>  extern int sysctl_overcommit_memory;
>  extern int sysctl_overcommit_ratio;
>  extern unsigned long sysctl_overcommit_kbytes;
> -extern struct percpu_counter vm_committed_as;
> -
> -#ifdef CONFIG_SMP
> -extern s32 vm_committed_as_batch;
> -#else
> -#define vm_committed_as_batch 0
> -#endif
> -
>  unsigned long vm_memory_committed(void);
> +DECLARE_PER_CPU(int, vm_committed_as);
>
>  static inline void vm_acct_memory(long pages)
>  {
> -       __percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
> +       this_cpu_add(vm_committed_as, pages);
>  }
>
>  static inline void vm_unacct_memory(long pages)
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index fdadf91..d96c71f 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -142,51 +142,6 @@ early_param("mminit_loglevel", set_mminit_loglevel);
>  struct kobject *mm_kobj;
>  EXPORT_SYMBOL_GPL(mm_kobj);
>
> -#ifdef CONFIG_SMP
> -s32 vm_committed_as_batch = 32;
> -
> -static void __meminit mm_compute_batch(void)
> -{
> -       u64 memsized_batch;
> -       s32 nr = num_present_cpus();
> -       s32 batch = max_t(s32, nr*2, 32);
> -
> -       /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
> -       memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
> -
> -       vm_committed_as_batch = max_t(s32, memsized_batch, batch);
> -}
> -
> -static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
> -                                       unsigned long action, void *arg)
> -{
> -       switch (action) {
> -       case MEM_ONLINE:
> -       case MEM_OFFLINE:
> -               mm_compute_batch();
> -       default:
> -               break;
> -       }
> -       return NOTIFY_OK;
> -}
> -
> -static struct notifier_block compute_batch_nb __meminitdata = {
> -       .notifier_call = mm_compute_batch_notifier,
> -       .priority = IPC_CALLBACK_PRI, /* use lowest priority */
> -};
> -
> -static int __init mm_compute_batch_init(void)
> -{
> -       mm_compute_batch();
> -       register_hotmemory_notifier(&compute_batch_nb);
> -
> -       return 0;
> -}
> -
> -__initcall(mm_compute_batch_init);
> -
> -#endif
> -
>  static int __init mm_sysfs_init(void)
>  {
>         mm_kobj = kobject_create_and_add("mm", kernel_kobj);
> diff --git a/mm/mmap.c b/mm/mmap.c
> index f088c60..c796d73 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -3184,17 +3184,6 @@ void mm_drop_all_locks(struct mm_struct *mm)
>  }
>
>  /*
> - * initialise the VMA slab
> - */
> -void __init mmap_init(void)
> -{
> -       int ret;
> -
> -       ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
> -       VM_BUG_ON(ret);
> -}
> -
> -/*
>   * Initialise sysctl_user_reserve_kbytes.
>   *
>   * This is intended to prevent a user from starting a single memory hogging
> diff --git a/mm/nommu.c b/mm/nommu.c
> index 6402f27..2d52dbc 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -533,10 +533,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
>   */
>  void __init mmap_init(void)
>  {
> -       int ret;
> -
> -       ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
> -       VM_BUG_ON(ret);
>         vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
>  }
>
> diff --git a/mm/util.c b/mm/util.c
> index 47a57e5..418e68f 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -402,6 +402,7 @@ unsigned long sysctl_overcommit_kbytes __read_mostly;
>  int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
>  unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
>  unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
> +DEFINE_PER_CPU(int, vm_committed_as);
>
>  int overcommit_ratio_handler(struct ctl_table *table, int write,
>                              void __user *buffer, size_t *lenp,
> @@ -445,12 +446,6 @@ unsigned long vm_commit_limit(void)
>  }
>
>  /*
> - * Make sure vm_committed_as in one cacheline and not cacheline shared with
> - * other variables. It can be updated by several CPUs frequently.
> - */
> -struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
> -
> -/*
>   * The global memory commitment made in the system can be a metric
>   * that can be used to drive ballooning decisions when Linux is hosted
>   * as a guest. On Hyper-V, the host implements a policy engine for dynamically
> @@ -460,7 +455,12 @@ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
>   */
>  unsigned long vm_memory_committed(void)
>  {
> -       return percpu_counter_read_positive(&vm_committed_as);
> +       int cpu, sum = 0;
> +
> +       for_each_possible_cpu(cpu)
> +               sum += *per_cpu_ptr(&vm_committed_as, cpu);
> +
> +       return sum < 0 ? 0 : sum;
>  }
>  EXPORT_SYMBOL_GPL(vm_memory_committed);
>
> @@ -484,10 +484,6 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
>  {
>         long free, allowed, reserve;
>
> -       VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
> -                       -(s64)vm_committed_as_batch * num_online_cpus(),
> -                       "memory commitment underflow");
> -
>         vm_acct_memory(pages);
>
>         /*
> @@ -553,7 +549,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
>                 allowed -= min_t(long, mm->total_vm / 32, reserve);
>         }
>
> -       if (percpu_counter_read_positive(&vm_committed_as) < allowed)
> +       if (vm_memory_committed() < allowed)
>                 return 0;
>  error:
>         vm_unacct_memory(pages);
> --
> 2.4.10
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@xxxxxxxxx">email@xxxxxxxxx</a>