The patch titled Subject: mm: update NUMA counter threshold size has been added to the -mm tree. Its filename is mm-update-numa-counter-threshold-size.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/mm-update-numa-counter-threshold-size.patch and later at http://ozlabs.org/~akpm/mmotm/broken-out/mm-update-numa-counter-threshold-size.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/SubmitChecklist when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Kemi Wang <kemi.wang@xxxxxxxxx> Subject: mm: update NUMA counter threshold size There is significant overhead from cache bouncing caused by zone counters (NUMA associated counters) being updated in parallel during multi-threaded page allocation (suggested by Dave Hansen). This patch updates the NUMA counter threshold to a fixed size of U16_MAX - 2, as a small threshold greatly increases the update frequency of the global counter from the local per-cpu counter (suggested by Ying Huang). The rationale is that these statistics counters don't affect the kernel's decisions, unlike other VM counters, so it's not a problem to use a large threshold. With this patchset, we see a 31.3% drop in CPU cycles (537-->369) per single page allocation and reclaim on Jesper's page_bench03 benchmark. 
Benchmark provided by Jesper Dangaard Brouer (loop count increased to 10000000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench Threshold CPU cycles Throughput (88 threads) 32 799 241760478 64 640 301628829 125 537 358906028 <==> system by default (base) 256 468 412397590 512 428 450550704 4096 399 482520943 20000 394 489009617 30000 395 488017817 65533 369(-31.3%) 521661345(+45.3%) <==> with this patchset N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics Link: http://lkml.kernel.org/r/1503568801-21305-3-git-send-email-kemi.wang@xxxxxxxxx Signed-off-by: Kemi Wang <kemi.wang@xxxxxxxxx> Reported-by: Jesper Dangaard Brouer <brouer@xxxxxxxxxx> Suggested-by: Dave Hansen <dave.hansen@xxxxxxxxx> Suggested-by: Ying Huang <ying.huang@xxxxxxxxx> Acked-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> Cc: Aaron Lu <aaron.lu@xxxxxxxxx> Cc: Andi Kleen <andi.kleen@xxxxxxxxx> Cc: Christopher Lameter <cl@xxxxxxxxx> Cc: Johannes Weiner <hannes@xxxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: Tim Chen <tim.c.chen@xxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- include/linux/mmzone.h | 3 +-- mm/vmstat.c | 28 ++++++++++------------------ 2 files changed, 11 insertions(+), 20 deletions(-) diff -puN include/linux/mmzone.h~mm-update-numa-counter-threshold-size include/linux/mmzone.h --- a/include/linux/mmzone.h~mm-update-numa-counter-threshold-size +++ a/include/linux/mmzone.h @@ -282,8 +282,7 @@ struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; - s8 numa_stat_threshold; - s8 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; + u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; diff -puN mm/vmstat.c~mm-update-numa-counter-threshold-size mm/vmstat.c --- a/mm/vmstat.c~mm-update-numa-counter-threshold-size +++ a/mm/vmstat.c @@ -30,6 +30,8 @@ #include "internal.h" +#define NUMA_STATS_THRESHOLD (U16_MAX - 2) + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct 
vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); @@ -194,10 +196,7 @@ void refresh_zone_stat_thresholds(void) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; -#ifdef CONFIG_NUMA - per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; -#endif + /* Base nodestat threshold on the largest populated zone. */ pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold @@ -231,14 +230,9 @@ void set_pgdat_percpu_threshold(pg_data_ continue; threshold = (*calculate_pressure)(zone); - for_each_online_cpu(cpu) { + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; -#ifdef CONFIG_NUMA - per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; -#endif - } } } @@ -872,16 +866,14 @@ void __inc_numa_state(struct zone *zone, enum numa_stat_item item) { struct per_cpu_pageset __percpu *pcp = zone->pageset; - s8 __percpu *p = pcp->vm_numa_stat_diff + item; - s8 v, t; + u16 __percpu *p = pcp->vm_numa_stat_diff + item; + u16 v; v = __this_cpu_inc_return(*p); - t = __this_cpu_read(pcp->numa_stat_threshold); - if (unlikely(v > t)) { - s8 overstep = t >> 1; - zone_numa_state_add(v + overstep, zone, item); - __this_cpu_write(*p, -overstep); + if (unlikely(v > NUMA_STATS_THRESHOLD)) { + zone_numa_state_add(v, zone, item); + __this_cpu_write(*p, 0); } } @@ -1796,7 +1788,7 @@ static bool need_update(int cpu) BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); #ifdef CONFIG_NUMA - BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 1); + BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); #endif /* * The fast way of checking if there are any vmstat diffs. 
_ Patches currently in -mm which might be from kemi.wang@xxxxxxxxx are mm-change-the-call-sites-of-numa-statistics-items.patch mm-update-numa-counter-threshold-size.patch mm-consider-the-number-in-local-cpus-when-reads-numa-stats.patch -- To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html