On 2017年12月19日 20:28, Michal Hocko wrote: > On Tue 19-12-17 14:39:22, Kemi Wang wrote: >> There is not really any use to get NUMA stats separated by zone, and >> current per-zone NUMA stats is only consumed in /proc/zoneinfo. For code >> cleanup purpose, we move NUMA stats from per-zone to per-node and reuse the >> existed per-cpu infrastructure. > > Let's hope that nobody really depends on the per-zone numbers. It would > be really strange as those counters are inherently per-node and that is > what users should care about but who knows... > > Anyway, I hoped we could get rid of NR_VM_NUMA_STAT_ITEMS but your patch > keeps it and follow up patches even use it further. I will comment on > those separately but this still makes these few counters really special > which I think is wrong. > Well, that's the best I can think of to keep a balance between performance and simplification. If you have a better idea, please post it and I will certainly follow it. >> Suggested-by: Andi Kleen <ak@xxxxxxxxxxxxxxx> >> Suggested-by: Michal Hocko <mhocko@xxxxxxxxxx> >> Signed-off-by: Kemi Wang <kemi.wang@xxxxxxxxx> > > I have to fully grasp the rest of the series before I'll give my Ack, > but I _really_ like the simplification this adds to the code. I believe > it can be even simpler. 
> >> --- >> drivers/base/node.c | 23 +++---- >> include/linux/mmzone.h | 27 ++++---- >> include/linux/vmstat.h | 31 --------- >> mm/mempolicy.c | 2 +- >> mm/page_alloc.c | 16 +++-- >> mm/vmstat.c | 177 +++++-------------------------------------------- >> 6 files changed, 46 insertions(+), 230 deletions(-) >> >> diff --git a/drivers/base/node.c b/drivers/base/node.c >> index ee090ab..a045ea1 100644 >> --- a/drivers/base/node.c >> +++ b/drivers/base/node.c >> @@ -169,13 +169,14 @@ static ssize_t node_read_numastat(struct device *dev, >> "interleave_hit %lu\n" >> "local_node %lu\n" >> "other_node %lu\n", >> - sum_zone_numa_state(dev->id, NUMA_HIT), >> - sum_zone_numa_state(dev->id, NUMA_MISS), >> - sum_zone_numa_state(dev->id, NUMA_FOREIGN), >> - sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT), >> - sum_zone_numa_state(dev->id, NUMA_LOCAL), >> - sum_zone_numa_state(dev->id, NUMA_OTHER)); >> + node_page_state(NODE_DATA(dev->id), NUMA_HIT), >> + node_page_state(NODE_DATA(dev->id), NUMA_MISS), >> + node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN), >> + node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT), >> + node_page_state(NODE_DATA(dev->id), NUMA_LOCAL), >> + node_page_state(NODE_DATA(dev->id), NUMA_OTHER)); >> } >> + >> static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); >> >> static ssize_t node_read_vmstat(struct device *dev, >> @@ -190,17 +191,9 @@ static ssize_t node_read_vmstat(struct device *dev, >> n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], >> sum_zone_node_page_state(nid, i)); >> >> -#ifdef CONFIG_NUMA >> - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) >> - n += sprintf(buf+n, "%s %lu\n", >> - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], >> - sum_zone_numa_state(nid, i)); >> -#endif >> - >> for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) >> n += sprintf(buf+n, "%s %lu\n", >> - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + >> - NR_VM_NUMA_STAT_ITEMS], >> + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], >> node_page_state(pgdat, i)); >> >> return n; >> diff 
--git a/include/linux/mmzone.h b/include/linux/mmzone.h >> index 67f2e3c..c06d880 100644 >> --- a/include/linux/mmzone.h >> +++ b/include/linux/mmzone.h >> @@ -115,20 +115,6 @@ struct zone_padding { >> #define ZONE_PADDING(name) >> #endif >> >> -#ifdef CONFIG_NUMA >> -enum numa_stat_item { >> - NUMA_HIT, /* allocated in intended node */ >> - NUMA_MISS, /* allocated in non intended node */ >> - NUMA_FOREIGN, /* was intended here, hit elsewhere */ >> - NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ >> - NUMA_LOCAL, /* allocation from local node */ >> - NUMA_OTHER, /* allocation from other node */ >> - NR_VM_NUMA_STAT_ITEMS >> -}; >> -#else >> -#define NR_VM_NUMA_STAT_ITEMS 0 >> -#endif >> - >> enum zone_stat_item { >> /* First 128 byte cacheline (assuming 64 bit words) */ >> NR_FREE_PAGES, >> @@ -151,7 +137,18 @@ enum zone_stat_item { >> NR_VM_ZONE_STAT_ITEMS }; >> >> enum node_stat_item { >> - NR_LRU_BASE, >> +#ifdef CONFIG_NUMA >> + NUMA_HIT, /* allocated in intended node */ >> + NUMA_MISS, /* allocated in non intended node */ >> + NUMA_FOREIGN, /* was intended here, hit elsewhere */ >> + NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ >> + NUMA_LOCAL, /* allocation from local node */ >> + NUMA_OTHER, /* allocation from other node */ >> + NR_VM_NUMA_STAT_ITEMS, >> +#else >> +#define NR_VM_NUMA_STAT_ITEMS 0 >> +#endif >> + NR_LRU_BASE = NR_VM_NUMA_STAT_ITEMS, >> NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ >> NR_ACTIVE_ANON, /* " " " " " */ >> NR_INACTIVE_FILE, /* " " " " " */ >> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h >> index 1779c98..80bf290 100644 >> --- a/include/linux/vmstat.h >> +++ b/include/linux/vmstat.h >> @@ -118,37 +118,8 @@ static inline void vm_events_fold_cpu(int cpu) >> * Zone and node-based page accounting with per cpu differentials. 
>> */ >> extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; >> -extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; >> extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; >> >> -#ifdef CONFIG_NUMA >> -static inline void zone_numa_state_add(long x, struct zone *zone, >> - enum numa_stat_item item) >> -{ >> - atomic_long_add(x, &zone->vm_numa_stat[item]); >> - atomic_long_add(x, &vm_numa_stat[item]); >> -} >> - >> -static inline unsigned long global_numa_state(enum numa_stat_item item) >> -{ >> - long x = atomic_long_read(&vm_numa_stat[item]); >> - >> - return x; >> -} >> - >> -static inline unsigned long zone_numa_state_snapshot(struct zone *zone, >> - enum numa_stat_item item) >> -{ >> - long x = atomic_long_read(&zone->vm_numa_stat[item]); >> - int cpu; >> - >> - for_each_online_cpu(cpu) >> - x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; >> - >> - return x; >> -} >> -#endif /* CONFIG_NUMA */ >> - >> static inline void zone_page_state_add(long x, struct zone *zone, >> enum zone_stat_item item) >> { >> @@ -234,10 +205,8 @@ static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, >> >> >> #ifdef CONFIG_NUMA >> -extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); >> extern unsigned long sum_zone_node_page_state(int node, >> enum zone_stat_item item); >> -extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item); >> extern unsigned long node_page_state(struct pglist_data *pgdat, >> enum node_stat_item item); >> #else >> diff --git a/mm/mempolicy.c b/mm/mempolicy.c >> index 4ce44d3..b2293e3 100644 >> --- a/mm/mempolicy.c >> +++ b/mm/mempolicy.c >> @@ -1920,7 +1920,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, >> return page; >> if (page && page_to_nid(page) == nid) { >> preempt_disable(); >> - __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); >> + inc_node_state(page_pgdat(page), NUMA_INTERLEAVE_HIT); >> preempt_enable(); >> } >> return page; >> 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c >> index 7e5e775..81e8d8f 100644 >> --- a/mm/page_alloc.c >> +++ b/mm/page_alloc.c >> @@ -2793,22 +2793,24 @@ int __isolate_free_page(struct page *page, unsigned int order) >> static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) >> { >> #ifdef CONFIG_NUMA >> - enum numa_stat_item local_stat = NUMA_LOCAL; >> + int preferred_nid = preferred_zone->node; >> + int nid = z->node; >> + enum node_stat_item local_stat = NUMA_LOCAL; >> >> /* skip numa counters update if numa stats is disabled */ >> if (!static_branch_likely(&vm_numa_stat_key)) >> return; >> >> - if (z->node != numa_node_id()) >> + if (nid != numa_node_id()) >> local_stat = NUMA_OTHER; >> >> - if (z->node == preferred_zone->node) >> - __inc_numa_state(z, NUMA_HIT); >> + if (nid == preferred_nid) >> + inc_node_state(NODE_DATA(nid), NUMA_HIT); >> else { >> - __inc_numa_state(z, NUMA_MISS); >> - __inc_numa_state(preferred_zone, NUMA_FOREIGN); >> + inc_node_state(NODE_DATA(nid), NUMA_MISS); >> + inc_node_state(NODE_DATA(preferred_nid), NUMA_FOREIGN); >> } >> - __inc_numa_state(z, local_stat); >> + inc_node_state(NODE_DATA(nid), local_stat); >> #endif >> } >> >> diff --git a/mm/vmstat.c b/mm/vmstat.c >> index 40b2db6..1dd12ae 100644 >> --- a/mm/vmstat.c >> +++ b/mm/vmstat.c >> @@ -30,46 +30,44 @@ >> >> #include "internal.h" >> >> -#define NUMA_STATS_THRESHOLD (U16_MAX - 2) >> - >> #ifdef CONFIG_NUMA >> int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; >> >> -/* zero numa counters within a zone */ >> -static void zero_zone_numa_counters(struct zone *zone) >> +/* zero numa stats within a node */ >> +static void zero_node_numa_stats(int node) >> { >> int item, cpu; >> >> for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) { >> - atomic_long_set(&zone->vm_numa_stat[item], 0); >> + atomic_long_set(&(NODE_DATA(node)->vm_stat[item]), 0); >> for_each_online_cpu(cpu) >> - per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item] >> - = 0; >> + 
per_cpu_ptr(NODE_DATA(node)->per_cpu_nodestats, >> + cpu)->vm_node_stat_diff[item] = 0; >> } >> } >> >> -/* zero numa counters of all the populated zones */ >> -static void zero_zones_numa_counters(void) >> +/* zero numa stats of all the online nodes */ >> +static void zero_nodes_numa_stats(void) >> { >> - struct zone *zone; >> + int node; >> >> - for_each_populated_zone(zone) >> - zero_zone_numa_counters(zone); >> + for_each_online_node(node) >> + zero_node_numa_stats(node); >> } >> >> -/* zero global numa counters */ >> -static void zero_global_numa_counters(void) >> +/* zero global numa stats */ >> +static void zero_global_numa_stats(void) >> { >> int item; >> >> for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) >> - atomic_long_set(&vm_numa_stat[item], 0); >> + atomic_long_set(&vm_node_stat[item], 0); >> } >> >> static void invalid_numa_statistics(void) >> { >> - zero_zones_numa_counters(); >> - zero_global_numa_counters(); >> + zero_nodes_numa_stats(); >> + zero_global_numa_stats(); >> } >> >> static DEFINE_MUTEX(vm_numa_stat_lock); >> @@ -160,10 +158,8 @@ void vm_events_fold_cpu(int cpu) >> * vm_stat contains the global counters >> */ >> atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; >> -atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp; >> atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; >> EXPORT_SYMBOL(vm_zone_stat); >> -EXPORT_SYMBOL(vm_numa_stat); >> EXPORT_SYMBOL(vm_node_stat); >> >> #ifdef CONFIG_SMP >> @@ -679,32 +675,6 @@ EXPORT_SYMBOL(dec_node_page_state); >> * Fold a differential into the global counters. >> * Returns the number of counters updated. 
>> */ >> -#ifdef CONFIG_NUMA >> -static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) >> -{ >> - int i; >> - int changes = 0; >> - >> - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) >> - if (zone_diff[i]) { >> - atomic_long_add(zone_diff[i], &vm_zone_stat[i]); >> - changes++; >> - } >> - >> - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) >> - if (numa_diff[i]) { >> - atomic_long_add(numa_diff[i], &vm_numa_stat[i]); >> - changes++; >> - } >> - >> - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) >> - if (node_diff[i]) { >> - atomic_long_add(node_diff[i], &vm_node_stat[i]); >> - changes++; >> - } >> - return changes; >> -} >> -#else >> static int fold_diff(int *zone_diff, int *node_diff) >> { >> int i; >> @@ -723,7 +693,6 @@ static int fold_diff(int *zone_diff, int *node_diff) >> } >> return changes; >> } >> -#endif /* CONFIG_NUMA */ >> >> /* >> * Update the zone counters for the current cpu. >> @@ -747,9 +716,6 @@ static int refresh_cpu_vm_stats(bool do_pagesets) >> struct zone *zone; >> int i; >> int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; >> -#ifdef CONFIG_NUMA >> - int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; >> -#endif >> int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; >> int changes = 0; >> >> @@ -771,18 +737,6 @@ static int refresh_cpu_vm_stats(bool do_pagesets) >> } >> } >> #ifdef CONFIG_NUMA >> - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { >> - int v; >> - >> - v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0); >> - if (v) { >> - >> - atomic_long_add(v, &zone->vm_numa_stat[i]); >> - global_numa_diff[i] += v; >> - __this_cpu_write(p->expire, 3); >> - } >> - } >> - >> if (do_pagesets) { >> cond_resched(); >> /* >> @@ -829,12 +783,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets) >> } >> } >> >> -#ifdef CONFIG_NUMA >> - changes += fold_diff(global_zone_diff, global_numa_diff, >> - global_node_diff); >> -#else >> changes += fold_diff(global_zone_diff, global_node_diff); >> -#endif >> return changes; >> } >> >> @@ -849,9 +798,6 
@@ void cpu_vm_stats_fold(int cpu) >> struct zone *zone; >> int i; >> int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; >> -#ifdef CONFIG_NUMA >> - int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; >> -#endif >> int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; >> >> for_each_populated_zone(zone) { >> @@ -868,18 +814,6 @@ void cpu_vm_stats_fold(int cpu) >> atomic_long_add(v, &zone->vm_stat[i]); >> global_zone_diff[i] += v; >> } >> - >> -#ifdef CONFIG_NUMA >> - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) >> - if (p->vm_numa_stat_diff[i]) { >> - int v; >> - >> - v = p->vm_numa_stat_diff[i]; >> - p->vm_numa_stat_diff[i] = 0; >> - atomic_long_add(v, &zone->vm_numa_stat[i]); >> - global_numa_diff[i] += v; >> - } >> -#endif >> } >> >> for_each_online_pgdat(pgdat) { >> @@ -898,11 +832,7 @@ void cpu_vm_stats_fold(int cpu) >> } >> } >> >> -#ifdef CONFIG_NUMA >> - fold_diff(global_zone_diff, global_numa_diff, global_node_diff); >> -#else >> fold_diff(global_zone_diff, global_node_diff); >> -#endif >> } >> >> /* >> @@ -920,36 +850,10 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) >> atomic_long_add(v, &zone->vm_stat[i]); >> atomic_long_add(v, &vm_zone_stat[i]); >> } >> - >> -#ifdef CONFIG_NUMA >> - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) >> - if (pset->vm_numa_stat_diff[i]) { >> - int v = pset->vm_numa_stat_diff[i]; >> - >> - pset->vm_numa_stat_diff[i] = 0; >> - atomic_long_add(v, &zone->vm_numa_stat[i]); >> - atomic_long_add(v, &vm_numa_stat[i]); >> - } >> -#endif >> } >> #endif >> >> #ifdef CONFIG_NUMA >> -void __inc_numa_state(struct zone *zone, >> - enum numa_stat_item item) >> -{ >> - struct per_cpu_pageset __percpu *pcp = zone->pageset; >> - u16 __percpu *p = pcp->vm_numa_stat_diff + item; >> - u16 v; >> - >> - v = __this_cpu_inc_return(*p); >> - >> - if (unlikely(v > NUMA_STATS_THRESHOLD)) { >> - zone_numa_state_add(v, zone, item); >> - __this_cpu_write(*p, 0); >> - } >> -} >> - >> /* >> * Determine the per node value of a stat 
item. This function >> * is called frequently in a NUMA machine, so try to be as >> @@ -969,23 +873,6 @@ unsigned long sum_zone_node_page_state(int node, >> } >> >> /* >> - * Determine the per node value of a numa stat item. To avoid deviation, >> - * the per cpu stat number in vm_numa_stat_diff[] is also included. >> - */ >> -unsigned long sum_zone_numa_state(int node, >> - enum numa_stat_item item) >> -{ >> - struct zone *zones = NODE_DATA(node)->node_zones; >> - int i; >> - unsigned long count = 0; >> - >> - for (i = 0; i < MAX_NR_ZONES; i++) >> - count += zone_numa_state_snapshot(zones + i, item); >> - >> - return count; >> -} >> - >> -/* >> * Determine the per node value of a stat item. >> */ >> unsigned long node_page_state(struct pglist_data *pgdat, >> @@ -1569,8 +1456,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, >> seq_printf(m, "\n per-node stats"); >> for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { >> seq_printf(m, "\n %-12s %lu", >> - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + >> - NR_VM_NUMA_STAT_ITEMS], >> + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], >> node_page_state(pgdat, i)); >> } >> } >> @@ -1607,13 +1493,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, >> seq_printf(m, "\n %-12s %lu", vmstat_text[i], >> zone_page_state(zone, i)); >> >> -#ifdef CONFIG_NUMA >> - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) >> - seq_printf(m, "\n %-12s %lu", >> - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], >> - zone_numa_state_snapshot(zone, i)); >> -#endif >> - >> seq_printf(m, "\n pagesets"); >> for_each_online_cpu(i) { >> struct per_cpu_pageset *pageset; >> @@ -1688,7 +1567,6 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) >> if (*pos >= ARRAY_SIZE(vmstat_text)) >> return NULL; >> stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + >> - NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) + >> NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) + >> NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); >> 
>> @@ -1704,12 +1582,6 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) >> v[i] = global_zone_page_state(i); >> v += NR_VM_ZONE_STAT_ITEMS; >> >> -#ifdef CONFIG_NUMA >> - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) >> - v[i] = global_numa_state(i); >> - v += NR_VM_NUMA_STAT_ITEMS; >> -#endif >> - >> for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) >> v[i] = global_node_page_state(i); >> v += NR_VM_NODE_STAT_ITEMS; >> @@ -1811,16 +1683,6 @@ int vmstat_refresh(struct ctl_table *table, int write, >> err = -EINVAL; >> } >> } >> -#ifdef CONFIG_NUMA >> - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { >> - val = atomic_long_read(&vm_numa_stat[i]); >> - if (val < 0) { >> - pr_warn("%s: %s %ld\n", >> - __func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val); >> - err = -EINVAL; >> - } >> - } >> -#endif >> if (err) >> return err; >> if (write) >> @@ -1862,9 +1724,6 @@ static bool need_update(int cpu) >> struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); >> >> BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); >> -#ifdef CONFIG_NUMA >> - BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); >> -#endif >> >> /* >> * The fast way of checking if there are any vmstat diffs. >> @@ -1872,10 +1731,6 @@ static bool need_update(int cpu) >> */ >> if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) >> return true; >> -#ifdef CONFIG_NUMA >> - if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS)) >> - return true; >> -#endif >> } >> return false; >> } >> -- >> 2.7.4 >> > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>