On Tue, Aug 15, 2017 at 04:45:35PM +0800, Kemi Wang wrote: > In this patch, NUMA statistics is separated from zone statistics > framework, all the call sites of NUMA stats are changed to use > numa-stats-specific functions, it does not have any functionality change > except that the value of NUMA stats is shown behind zone page stats, and > the threshold size of NUMA stats is shown behind pcp threshold when users > *read* the zone info. > > E.g. cat /proc/zoneinfo > ***Base*** ***With this patch*** > nr_free_pages 3976 nr_free_pages 3976 > nr_zone_inactive_anon 0 nr_zone_inactive_anon 0 > nr_zone_active_anon 0 nr_zone_active_anon 0 > nr_zone_inactive_file 0 nr_zone_inactive_file 0 > nr_zone_active_file 0 nr_zone_active_file 0 > nr_zone_unevictable 0 nr_zone_unevictable 0 > nr_zone_write_pending 0 nr_zone_write_pending 0 > nr_mlock 0 nr_mlock 0 > nr_page_table_pages 0 nr_page_table_pages 0 > nr_kernel_stack 0 nr_kernel_stack 0 > nr_bounce 0 nr_bounce 0 > nr_zspages 0 nr_zspages 0 > numa_hit 0 *nr_free_cma 0* > numa_miss 0 numa_hit 0 > numa_foreign 0 numa_miss 0 > numa_interleave 0 numa_foreign 0 > numa_local 0 numa_interleave 0 > numa_other 0 numa_local 0 > *nr_free_cma 0* numa_other 0 > ... ... > vm stats threshold: 10 vm stats threshold: 10 > ... *vm numa stats threshold: 10* > ... > > The next patch updates the numa stats counter size and threshold. 
> > Signed-off-by: Kemi Wang <kemi.wang@xxxxxxxxx> > --- > drivers/base/node.c | 22 ++++--- > include/linux/mmzone.h | 25 +++++--- > include/linux/vmstat.h | 29 +++++++++ > mm/page_alloc.c | 10 +-- > mm/vmstat.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++-- > 5 files changed, 227 insertions(+), 26 deletions(-) > > diff --git a/drivers/base/node.c b/drivers/base/node.c > index d8dc830..12080c6 100644 > --- a/drivers/base/node.c > +++ b/drivers/base/node.c > @@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev, > "interleave_hit %lu\n" > "local_node %lu\n" > "other_node %lu\n", > - sum_zone_node_page_state(dev->id, NUMA_HIT), > - sum_zone_node_page_state(dev->id, NUMA_MISS), > - sum_zone_node_page_state(dev->id, NUMA_FOREIGN), > - sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT), > - sum_zone_node_page_state(dev->id, NUMA_LOCAL), > - sum_zone_node_page_state(dev->id, NUMA_OTHER)); > + sum_zone_node_numa_state(dev->id, NUMA_HIT), > + sum_zone_node_numa_state(dev->id, NUMA_MISS), > + sum_zone_node_numa_state(dev->id, NUMA_FOREIGN), > + sum_zone_node_numa_state(dev->id, NUMA_INTERLEAVE_HIT), > + sum_zone_node_numa_state(dev->id, NUMA_LOCAL), > + sum_zone_node_numa_state(dev->id, NUMA_OTHER)); > } The names are very similar and it would be preferred if the names were visually different like sum_zone_numa_stat() which is hard to confuse with the zone stat fields. > static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); > > @@ -181,9 +181,17 @@ static ssize_t node_read_vmstat(struct device *dev, > n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], > sum_zone_node_page_state(nid, i)); > > - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) > +#ifdef CONFIG_NUMA > + for (i = 0; i < NR_VM_ZONE_NUMA_STAT_ITEMS; i++) > n += sprintf(buf+n, "%s %lu\n", > vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], > + sum_zone_node_numa_state(nid, i)); > +#endif Similar with NR_VM_ZONE_NUMA_STAT_ITEMS, it's too similar to NR_VM_NODE_STAT_ITEMS. 
> + > + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) > + n += sprintf(buf+n, "%s %lu\n", > + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + > + NR_VM_ZONE_NUMA_STAT_ITEMS], > node_page_state(pgdat, i)); > > return n; > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index fc14b8b..0b11ba7 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -114,6 +114,20 @@ struct zone_padding { > #define ZONE_PADDING(name) > #endif > > +#ifdef CONFIG_NUMA > +enum zone_numa_stat_item { > + NUMA_HIT, /* allocated in intended node */ > + NUMA_MISS, /* allocated in non intended node */ > + NUMA_FOREIGN, /* was intended here, hit elsewhere */ > + NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ > + NUMA_LOCAL, /* allocation from local node */ > + NUMA_OTHER, /* allocation from other node */ > + NR_VM_ZONE_NUMA_STAT_ITEMS > +}; > +#else > +#define NR_VM_ZONE_NUMA_STAT_ITEMS 0 > +#endif > + > enum zone_stat_item { > /* First 128 byte cacheline (assuming 64 bit words) */ > NR_FREE_PAGES, > @@ -132,14 +146,6 @@ enum zone_stat_item { > #if IS_ENABLED(CONFIG_ZSMALLOC) > NR_ZSPAGES, /* allocated in zsmalloc */ > #endif > -#ifdef CONFIG_NUMA > - NUMA_HIT, /* allocated in intended node */ > - NUMA_MISS, /* allocated in non intended node */ > - NUMA_FOREIGN, /* was intended here, hit elsewhere */ > - NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ > - NUMA_LOCAL, /* allocation from local node */ > - NUMA_OTHER, /* allocation from other node */ > -#endif > NR_FREE_CMA_PAGES, > NR_VM_ZONE_STAT_ITEMS }; > > @@ -276,6 +282,8 @@ struct per_cpu_pageset { > struct per_cpu_pages pcp; > #ifdef CONFIG_NUMA > s8 expire; > + s8 numa_stat_threshold; > + s8 vm_numa_stat_diff[NR_VM_ZONE_NUMA_STAT_ITEMS]; > #endif > #ifdef CONFIG_SMP > s8 stat_threshold; Ok. this slightly increases the size of the per_cpu_pageset due to numa_stat_threshold. 
The structure occupies 2 cache lines and still occupies 2 cache lines afterwards, so that is ok, but consider hard-coding the threshold value. The locality stats are never used as part of a decision made by the kernel and they get summed unconditionally when reading proc. There is little benefit to tuning that threshold at all and there should be a very small performance gain if it's removed because it'll be a compile-time constant. The rest of the patch is mostly mechanical. -- Mel Gorman SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>