Each page allocation updates a set of per-zone statistics with a call to
zone_statistics(). As discussed at the 2017 MM Summit, these are a
substantial source of overhead in the page allocator and are very rarely
consumed.

Link to the MM Summit slides:
http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017-JesperBrouer.pdf

Therefore, with the different vmstat modes, the NUMA counter updates can
operate differently so that everybody can benefit:

If vmstat_mode = auto (automatic detection of NUMA statistics), NUMA
counter updates are skipped unless the counters have been read by users
at least once, e.g. via cat /proc/zoneinfo.

If vmstat_mode = strict, the NUMA counters are updated on each page
allocation.

If vmstat_mode = coarse, NUMA counter updates are skipped entirely.
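For illustration, a rough usage sketch from userspace (this assumes the
sysctl_vmstat_mode_handler() below is wired up as a "vmstat_mode" entry
under /proc/sys/vm/, which this patch itself does not show):

  # Stop NUMA counter updates entirely:
  echo coarse > /proc/sys/vm/vmstat_mode

  # Return to automatic detection; updates stay off until the counters
  # are read again:
  echo auto > /proc/sys/vm/vmstat_mode

  # In auto mode, reads like these re-enable the counter updates:
  cat /proc/zoneinfo > /dev/null
  cat /sys/devices/system/node/node0/numastat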
We see about a *4.8%* (185 -> 176) drop in CPU cycles per single page
allocation and reclaim on Jesper's page_bench01 (single thread), and an
*8.1%* (343 -> 315) drop in CPU cycles per single page allocation and
reclaim on Jesper's page_bench03 (88 threads), running on a 2-socket
Broadwell-based server (88 threads, 126G memory).

Benchmark provided by Jesper Dangaard Brouer (with the loop count
increased to 10000000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench
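A rough sketch of how the numbers above can be reproduced (the actual
build and run steps are documented in the repository above; the module
name follows the page_bench01 sources there, and results are reported
via the kernel log):

  # Load one of the page allocator micro-benchmark modules and read
  # its per-allocation cycle counts from dmesg:
  sudo insmod page_bench01.ko
  dmesg | tail
  sudo rmmod page_bench01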
Reported-by: Jesper Dangaard Brouer <brouer@xxxxxxxxxx>
Suggested-by: Dave Hansen <dave.hansen@xxxxxxxxx>
Suggested-by: Ying Huang <ying.huang@xxxxxxxxx>
Signed-off-by: Kemi Wang <kemi.wang@xxxxxxxxx>
---
 drivers/base/node.c    |  2 ++
 include/linux/vmstat.h |  6 ++++++
 mm/page_alloc.c        | 13 +++++++++++++
 mm/vmstat.c            | 60 +++++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 3855902..033c0c3 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -153,6 +153,7 @@ static DEVICE_ATTR(meminfo, S_IRUGO, node_read_meminfo, NULL);
 static ssize_t node_read_numastat(struct device *dev,
 			struct device_attribute *attr, char *buf)
 {
+	disable_zone_statistics = false;
 	return sprintf(buf,
 		       "numa_hit %lu\n"
 		       "numa_miss %lu\n"
@@ -194,6 +195,7 @@ static ssize_t node_read_vmstat(struct device *dev,
 			     NR_VM_NUMA_STAT_ITEMS],
 			     node_page_state(pgdat, i));
 
+	disable_zone_statistics = false;
 	return n;
 }
 static DEVICE_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index c3634c7..ca9854c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -9,6 +9,7 @@
 
 extern int sysctl_stat_interval;
 
+extern bool disable_zone_statistics;
 /*
  * vmstat_mode:
  * 0 = auto mode of vmstat, automatic detection of VM statistics.
@@ -19,6 +20,7 @@ extern int sysctl_stat_interval;
 #define VMSTAT_STRICT_MODE	1
 #define VMSTAT_COARSE_MODE	2
 #define VMSTAT_MODE_LEN	16
+extern int vmstat_mode;
 extern char sysctl_vmstat_mode[];
 extern int sysctl_vmstat_mode_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length, loff_t *ppos);
@@ -243,6 +245,10 @@ extern unsigned long sum_zone_node_page_state(int node,
 extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
 extern unsigned long node_page_state(struct pglist_data *pgdat,
 				enum node_stat_item item);
+extern void zero_zone_numa_counters(struct zone *zone);
+extern void zero_zones_numa_counters(void);
+extern void zero_global_numa_counters(void);
+extern void invalid_numa_statistics(void);
 #else
 #define sum_zone_node_page_state(node, item) global_zone_page_state(item)
 #define node_page_state(node, item) global_node_page_state(item)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c841af8..010a620 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -83,6 +83,8 @@ DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
 #endif
 
+bool disable_zone_statistics = true;
+
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
 /*
  * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -2743,6 +2745,17 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 #ifdef CONFIG_NUMA
 	enum numa_stat_item local_stat = NUMA_LOCAL;
 
+	/*
+	 * Skip zone_statistics() if vmstat is in coarse mode, or if zone
+	 * statistics are inactive in auto vmstat mode.
+	 */
+
+	if (vmstat_mode) {
+		if (vmstat_mode == VMSTAT_COARSE_MODE)
+			return;
+	} else if (disable_zone_statistics)
+		return;
+
 	if (z->node != numa_node_id())
 		local_stat = NUMA_OTHER;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e675ad2..bcaef62 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -85,15 +85,31 @@ int sysctl_vmstat_mode_handler(struct ctl_table *table, int write,
 		/* no change */
 		mutex_unlock(&vmstat_mode_lock);
 		return 0;
-	} else if (vmstat_mode == VMSTAT_AUTO_MODE)
+	} else if (vmstat_mode == VMSTAT_AUTO_MODE) {
 		pr_info("vmstat mode changes from %s to auto mode\n",
 				vmstat_mode_name[oldval]);
-	else if (vmstat_mode == VMSTAT_STRICT_MODE)
+		/*
+		 * Restore the default NUMA stats action when the vmstat
+		 * mode changes from coarse back to auto.
+		 */
+		if (oldval == VMSTAT_COARSE_MODE)
+			disable_zone_statistics = true;
+	} else if (vmstat_mode == VMSTAT_STRICT_MODE)
 		pr_info("vmstat mode changes from %s to strict mode\n",
 				vmstat_mode_name[oldval]);
-	else if (vmstat_mode == VMSTAT_COARSE_MODE)
+	else if (vmstat_mode == VMSTAT_COARSE_MODE) {
 		pr_info("vmstat mode changes from %s to coarse mode\n",
 				vmstat_mode_name[oldval]);
+#ifdef CONFIG_NUMA
+		/*
+		 * Invalidate the NUMA counters when the vmstat mode is set
+		 * to coarse mode, because users can't tell the difference
+		 * between a dead counter and quiet allocator activity once
+		 * zone_statistics() is turned off.
+		 */
+		invalid_numa_statistics();
+#endif
+	} else
 		pr_warn("invalid vmstat_mode:%d\n", vmstat_mode);
 	}
 
@@ -984,6 +1000,42 @@ unsigned long sum_zone_numa_state(int node,
 	return count;
 }
 
+/* Zero the NUMA counters within a zone. */
+void zero_zone_numa_counters(struct zone *zone)
+{
+	int item, cpu;
+
+	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
+		atomic_long_set(&zone->vm_numa_stat[item], 0);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item] = 0;
+	}
+}
+
+/* Zero the NUMA counters of all the populated zones. */
+void zero_zones_numa_counters(void)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone)
+		zero_zone_numa_counters(zone);
+}
+
+/* Zero the global NUMA counters. */
+void zero_global_numa_counters(void)
+{
+	int item;
+
+	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
+		atomic_long_set(&vm_numa_stat[item], 0);
+}
+
+void invalid_numa_statistics(void)
+{
+	zero_zones_numa_counters();
+	zero_global_numa_counters();
+}
+
 /*
  * Determine the per node value of a stat item.
  */
@@ -1652,6 +1704,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 {
 	pg_data_t *pgdat = (pg_data_t *)arg;
 	walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
+	disable_zone_statistics = false;
 	return 0;
 }
 
@@ -1748,6 +1801,7 @@ static int vmstat_show(struct seq_file *m, void *arg)
 
 static void vmstat_stop(struct seq_file *m, void *arg)
 {
+	disable_zone_statistics = false;
 	kfree(m->private);
 	m->private = NULL;
 }
-- 
2.7.4