The patch titled Subject: vmstat: add pcp remote node draining via cpu_vm_stats_fold has been added to the -mm mm-unstable branch. Its filename is vmstat-add-pcp-remote-node-draining-via-cpu_vm_stats_fold.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/vmstat-add-pcp-remote-node-draining-via-cpu_vm_stats_fold.patch This patch will later appear in the mm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Marcelo Tosatti <mtosatti@xxxxxxxxxx> Subject: vmstat: add pcp remote node draining via cpu_vm_stats_fold Date: Mon, 13 Mar 2023 13:25:19 -0300 Large NUMA systems might have significant portions of system memory trapped in pcp queues. The number of pcps is determined by the number of processors and nodes in a system. A system with 4 processors and 2 nodes has 8 pcps, which is okay. But a system with 1024 processors and 512 nodes has 512k pcps with a high potential for large amounts of memory being caught in them. Enable remote node draining for the CONFIG_HAVE_CMPXCHG_LOCAL case, where vmstat_shepherd will perform the aging and draining via cpu_vm_stats_fold. 
Link: https://lkml.kernel.org/r/20230313162634.561005765@xxxxxxxxxx Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx> Suggested-by: Vlastimil Babka <vbabka@xxxxxxx> Cc: Aaron Tomlin <atomlin@xxxxxxxxxxx> Cc: Christian König <christian.koenig@xxxxxxx> Cc: Christoph Lameter <cl@xxxxxxxxx> Cc: Dan Williams <dan.j.williams@xxxxxxxxx> Cc: Frederic Weisbecker <frederic@xxxxxxxxxx> Cc: Heiko Carstens <hca@xxxxxxxxxxxxx> Cc: Huacai Chen <chenhuacai@xxxxxxxxxx> Cc: Jason Gunthorpe <jgg@xxxxxxxx> Cc: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> Cc: Lorenzo Stoakes <lstoakes@xxxxxxxxx> Cc: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: Peter Xu <peterx@xxxxxxxxxx> Cc: "Russell King (Oracle)" <linux@xxxxxxxxxxxxxxx> Cc: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- --- a/include/linux/vmstat.h~vmstat-add-pcp-remote-node-draining-via-cpu_vm_stats_fold +++ a/include/linux/vmstat.h @@ -297,7 +297,7 @@ extern void __dec_zone_state(struct zone extern void __dec_node_state(struct pglist_data *, enum node_stat_item); void quiet_vmstat(void); -void cpu_vm_stats_fold(int cpu); +void cpu_vm_stats_fold(int cpu, bool do_pagesets); void refresh_zone_stat_thresholds(void); struct ctl_table; --- a/mm/page_alloc.c~vmstat-add-pcp-remote-node-draining-via-cpu_vm_stats_fold +++ a/mm/page_alloc.c @@ -8540,7 +8540,7 @@ static int page_alloc_cpu_dead(unsigned * Zero the differential counters of the dead processor * so that the vm statistics are consistent. */ - cpu_vm_stats_fold(cpu); + cpu_vm_stats_fold(cpu, false); for_each_populated_zone(zone) zone_pcp_update(zone, 0); --- a/mm/vmstat.c~vmstat-add-pcp-remote-node-draining-via-cpu_vm_stats_fold +++ a/mm/vmstat.c @@ -928,7 +928,7 @@ static int refresh_cpu_vm_stats(bool do_ * There cannot be any access by the offline cpu and therefore * synchronization is simplified. 
*/ -void cpu_vm_stats_fold(int cpu) +void cpu_vm_stats_fold(int cpu, bool do_pagesets) { struct pglist_data *pgdat; struct zone *zone; @@ -938,6 +938,9 @@ void cpu_vm_stats_fold(int cpu) for_each_populated_zone(zone) { struct per_cpu_zonestat *pzstats; +#ifdef CONFIG_NUMA + struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); +#endif pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); @@ -948,6 +951,11 @@ void cpu_vm_stats_fold(int cpu) v = xchg(&pzstats->vm_stat_diff[i], 0); atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; +#ifdef CONFIG_NUMA + /* 3 seconds idle till flush */ + if (do_pagesets) + pcp->expire = 3; +#endif } } #ifdef CONFIG_NUMA @@ -959,6 +967,38 @@ void cpu_vm_stats_fold(int cpu) zone_numa_event_add(v, zone, i); } } + + if (do_pagesets) { + cond_resched(); + /* + * Deal with draining the remote pageset of a + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!pcp->expire || !pcp->count) + continue; + + /* + * We never drain zones local to this processor. + */ + if (zone_to_nid(zone) == cpu_to_node(cpu)) { + pcp->expire = 0; + continue; + } + + WARN_ON(pcp->expire < 0); + /* + * pcp->expire is only accessed from vmstat_shepherd context, + * therefore no locking is required. 
+ */ + if (--pcp->expire) + continue; + + if (pcp->count) + drain_zone_pages(zone, pcp); + } #endif } @@ -2066,7 +2106,7 @@ static int refresh_all_vm_stats(void) cpus_read_lock(); for_each_online_cpu(cpu) { - cpu_vm_stats_fold(cpu); + cpu_vm_stats_fold(cpu, true); cond_resched(); } cpus_read_unlock(); _ Patches currently in -mm which might be from mtosatti@xxxxxxxxxx are this_cpu_cmpxchg-arm64-switch-this_cpu_cmpxchg-to-locked-add-_local-function.patch this_cpu_cmpxchg-loongarch-switch-this_cpu_cmpxchg-to-locked-add-_local-function.patch this_cpu_cmpxchg-s390-switch-this_cpu_cmpxchg-to-locked-add-_local-function.patch this_cpu_cmpxchg-x86-switch-this_cpu_cmpxchg-to-locked-add-_local-function.patch add-this_cpu_cmpxchg_local-and-asm-generic-definitions.patch convert-this_cpu_cmpxchg-users-to-this_cpu_cmpxchg_local.patch mm-vmstat-switch-counter-modification-to-cmpxchg.patch vmstat-switch-per-cpu-vmstat-counters-to-32-bits.patch mm-vmstat-use-xchg-in-cpu_vm_stats_fold.patch mm-vmstat-switch-vmstat-shepherd-to-flush-per-cpu-counters-remotely.patch mm-vmstat-refresh-stats-remotely-instead-of-via-work-item.patch vmstat-add-pcp-remote-node-draining-via-cpu_vm_stats_fold.patch