On 4/22/21 1:14 PM, Mel Gorman wrote: > NUMA statistics are maintained on the zone level for hits, misses, foreign > etc but nothing relies on them being perfectly accurate for functional > correctness. The counters are used by userspace to get a general overview > of a workloads NUMA behaviour but the page allocator incurs a high cost to > maintain perfect accuracy similar to what is required for a vmstat like > NR_FREE_PAGES. There even is a sysctl vm.numa_stat to allow userspace to > turn off the collection of NUMA statistics like NUMA_HIT. > > This patch converts NUMA_HIT and friends to be NUMA events with similar > accuracy to VM events. There is a possibility that slight errors will be > introduced but the overall trend as seen by userspace will be similar. > The counters are no longer updated from vmstat_refresh context as it is > unnecessary overhead for counters that may never be read by userspace. > Note that counters could be maintained at the node level to save space > but it would have a user-visible impact due to /proc/zoneinfo. > > [lkp@xxxxxxxxx: Fix misplaced closing brace for !CONFIG_NUMA] > Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> ... > @@ -731,26 +722,34 @@ static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) > } > return changes; > } > -#else > -static int fold_diff(int *zone_diff, int *node_diff) > + > +#ifdef CONFIG_NUMA > +static void fold_vm_zone_numa_events(struct zone *zone) > { > - int i; > - int changes = 0; > + int zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, }; Should this array be long? The per-cpu pzstats counters are long, and so are the global counters, so it seems odd to use int as the intermediate sum type here. 
> + int cpu; > + enum numa_stat_item item; > > - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) > - if (zone_diff[i]) { > - atomic_long_add(zone_diff[i], &vm_zone_stat[i]); > - changes++; > - } > + for_each_online_cpu(cpu) { > + struct per_cpu_zonestat *pzstats; > > - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) > - if (node_diff[i]) { > - atomic_long_add(node_diff[i], &vm_node_stat[i]); > - changes++; > + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); > + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) > + zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0); > } > - return changes; > + > + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) > + zone_numa_event_add(zone_numa_events[item], zone, item); > } > -#endif /* CONFIG_NUMA */ > + > +void fold_vm_numa_events(void) > +{ > + struct zone *zone; > + > + for_each_populated_zone(zone) > + fold_vm_zone_numa_events(zone); > +} > +#endif > > /* > * Update the zone counters for the current cpu. > @@ -774,15 +773,14 @@ static int refresh_cpu_vm_stats(bool do_pagesets) > struct zone *zone; > int i; > int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; > -#ifdef CONFIG_NUMA > - int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; > -#endif > int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; > int changes = 0; > > for_each_populated_zone(zone) { > struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; > +#ifdef CONFIG_NUMA > struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset; > +#endif > > for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { > int v; > @@ -799,17 +797,6 @@ static int refresh_cpu_vm_stats(bool do_pagesets) > } > } > #ifdef CONFIG_NUMA > - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { > - int v; > - > - v = this_cpu_xchg(pzstats->vm_numa_stat_diff[i], 0); > - if (v) { > - > - atomic_long_add(v, &zone->vm_numa_stat[i]); > - global_numa_diff[i] += v; > - __this_cpu_write(pcp->expire, 3); > - } > - } > > if (do_pagesets) { > cond_resched(); > @@ -857,12 +844,7 @@ 
static int refresh_cpu_vm_stats(bool do_pagesets) > } > } > > -#ifdef CONFIG_NUMA > - changes += fold_diff(global_zone_diff, global_numa_diff, > - global_node_diff); > -#else > changes += fold_diff(global_zone_diff, global_node_diff); > -#endif > return changes; > } > > @@ -877,9 +859,6 @@ void cpu_vm_stats_fold(int cpu) > struct zone *zone; > int i; > int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; > -#ifdef CONFIG_NUMA > - int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; > -#endif > int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; > > for_each_populated_zone(zone) { > @@ -887,7 +866,7 @@ void cpu_vm_stats_fold(int cpu) > > pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); > > - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) > + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { > if (pzstats->vm_stat_diff[i]) { > int v; > > @@ -896,17 +875,17 @@ void cpu_vm_stats_fold(int cpu) > atomic_long_add(v, &zone->vm_stat[i]); > global_zone_diff[i] += v; > } > - > + } > #ifdef CONFIG_NUMA > - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) > - if (pzstats->vm_numa_stat_diff[i]) { > + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) { > + if (pzstats->vm_numa_event[i]) { > int v; Should this also be long? > > - v = pzstats->vm_numa_stat_diff[i]; > - pzstats->vm_numa_stat_diff[i] = 0; > - atomic_long_add(v, &zone->vm_numa_stat[i]); > - global_numa_diff[i] += v; > + v = pzstats->vm_numa_event[i]; > + pzstats->vm_numa_event[i] = 0; > + zone_numa_event_add(v, zone, i); > } > + } > #endif > } > > @@ -926,11 +905,7 @@ void cpu_vm_stats_fold(int cpu) > } > } > > -#ifdef CONFIG_NUMA > - fold_diff(global_zone_diff, global_numa_diff, global_node_diff); > -#else > fold_diff(global_zone_diff, global_node_diff); > -#endif > } > > /* > @@ -939,43 +914,36 @@ void cpu_vm_stats_fold(int cpu) > */ > void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) > { > - int i; > + int i, v; And the same question for 'v' here. Maybe keep 'v' local to each loop below, as before, and make it long for the NUMA one? 
> - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) > + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { > if (pzstats->vm_stat_diff[i]) { > - int v = pzstats->vm_stat_diff[i]; > + v = pzstats->vm_stat_diff[i]; > pzstats->vm_stat_diff[i] = 0; > - atomic_long_add(v, &zone->vm_stat[i]); > - atomic_long_add(v, &vm_zone_stat[i]); > + zone_page_state_add(v, zone, i); > } > + } > > #ifdef CONFIG_NUMA > - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) > - if (pzstats->vm_numa_stat_diff[i]) { > - int v = pzstats->vm_numa_stat_diff[i]; > - > - pzstats->vm_numa_stat_diff[i] = 0; > - atomic_long_add(v, &zone->vm_numa_stat[i]); > - atomic_long_add(v, &vm_numa_stat[i]); > + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) { > + if (pzstats->vm_numa_event[i]) { > + v = pzstats->vm_numa_event[i]; > + pzstats->vm_numa_event[i] = 0; > + zone_numa_event_add(v, zone, i); > } > + } > #endif > } > #endif > > #ifdef CONFIG_NUMA