Yafang Shao <laoar.shao@xxxxxxxxx> writes: > The configuration parameter PCP_BATCH_SCALE_MAX poses challenges for > quickly experimenting with specific workloads in a production environment, > particularly when monitoring latency spikes caused by contention on the > zone->lock. To address this, a new sysctl parameter vm.pcp_batch_scale_max > is introduced as a more practical alternative. In general, I'm neutral about the change. I can understand that a kernel configuration option isn't as flexible as a sysctl knob. But a sysctl knob is ABI too. > To ultimately mitigate the zone->lock contention issue, several suggestions > have been proposed. One approach involves dividing large zones into multi > smaller zones, as suggested by Matthew[0], while another entails splitting > the zone->lock using a mechanism similar to memory arenas and shifting away > from relying solely on zone_id to identify the range of free lists a > particular page belongs to[1]. However, implementing these solutions is > likely to necessitate a more extended development effort. Per my understanding, the change will hurt rather than improve zone->lock contention: lowering the batch scale means more lock acquisitions for the same work. What it will improve is page allocation/freeing latency, by bounding the time spent holding the lock in each batch. 
> Link: https://lore.kernel.org/linux-mm/ZnTrZ9mcAIRodnjx@xxxxxxxxxxxxxxxxxxxx/ [0] > Link: https://lore.kernel.org/linux-mm/20240705130943.htsyhhhzbcptnkcu@xxxxxxxxxxxxxxxxxxx/ [1] > Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx> > Cc: "Huang, Ying" <ying.huang@xxxxxxxxx> > Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> > Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> > Cc: David Rientjes <rientjes@xxxxxxxxxx> > --- > Documentation/admin-guide/sysctl/vm.rst | 15 +++++++++++++++ > include/linux/sysctl.h | 1 + > kernel/sysctl.c | 2 +- > mm/Kconfig | 11 ----------- > mm/page_alloc.c | 22 ++++++++++++++++------ > 5 files changed, 33 insertions(+), 18 deletions(-) > > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst > index e86c968a7a0e..eb9e5216eefe 100644 > --- a/Documentation/admin-guide/sysctl/vm.rst > +++ b/Documentation/admin-guide/sysctl/vm.rst > @@ -66,6 +66,7 @@ Currently, these files are in /proc/sys/vm: > - page_lock_unfairness > - panic_on_oom > - percpu_pagelist_high_fraction > +- pcp_batch_scale_max > - stat_interval > - stat_refresh > - numa_stat > @@ -864,6 +865,20 @@ mark based on the low watermark for the zone and the number of local > online CPUs. If the user writes '0' to this sysctl, it will revert to > this default behavior. > > +pcp_batch_scale_max > +=================== > + > +In page allocator, PCP (Per-CPU pageset) is refilled and drained in > +batches. The batch number is scaled automatically to improve page > +allocation/free throughput. But too large scale factor may hurt > +latency. This option sets the upper limit of scale factor to limit > +the maximum latency. > + > +The range for this parameter spans from 0 to 6, with a default value of 5. > +The value assigned to 'N' signifies that during each refilling or draining > +process, a maximum of (batch << N) pages will be involved, where "batch" > +represents the default batch size automatically computed by the kernel for > +each zone. 
> > stat_interval > ============= > diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h > index 09db2f2e6488..fb797f1c0ef7 100644 > --- a/include/linux/sysctl.h > +++ b/include/linux/sysctl.h > @@ -52,6 +52,7 @@ struct ctl_dir; > /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ > #define SYSCTL_MAXOLDUID ((void *)&sysctl_vals[10]) > #define SYSCTL_NEG_ONE ((void *)&sysctl_vals[11]) > +#define SYSCTL_SIX ((void *)&sysctl_vals[12]) > > extern const int sysctl_vals[]; > > diff --git a/kernel/sysctl.c b/kernel/sysctl.c > index e0b917328cf9..430ac4f58eb7 100644 > --- a/kernel/sysctl.c > +++ b/kernel/sysctl.c > @@ -82,7 +82,7 @@ > #endif > > /* shared constants to be used in various sysctls */ > -const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; > +const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1, 6 }; > EXPORT_SYMBOL(sysctl_vals); > > const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; > diff --git a/mm/Kconfig b/mm/Kconfig > index b4cb45255a54..41fe4c13b7ac 100644 > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -663,17 +663,6 @@ config HUGETLB_PAGE_SIZE_VARIABLE > config CONTIG_ALLOC > def_bool (MEMORY_ISOLATION && COMPACTION) || CMA > > -config PCP_BATCH_SCALE_MAX > - int "Maximum scale factor of PCP (Per-CPU pageset) batch allocate/free" > - default 5 > - range 0 6 > - help > - In page allocator, PCP (Per-CPU pageset) is refilled and drained in > - batches. The batch number is scaled automatically to improve page > - allocation/free throughput. But too large scale factor may hurt > - latency. This option sets the upper limit of scale factor to limit > - the maximum latency. 
> - > config PHYS_ADDR_T_64BIT > def_bool 64BIT > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 2b76754a48e0..703eec22a997 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -273,6 +273,7 @@ int min_free_kbytes = 1024; > int user_min_free_kbytes = -1; > static int watermark_boost_factor __read_mostly = 15000; > static int watermark_scale_factor = 10; > +static int pcp_batch_scale_max = 5; > > /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ > int movable_zone; > @@ -2310,7 +2311,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) > int count = READ_ONCE(pcp->count); > > while (count) { > - int to_drain = min(count, pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX); > + int to_drain = min(count, pcp->batch << pcp_batch_scale_max); > count -= to_drain; > > spin_lock(&pcp->lock); > @@ -2438,7 +2439,7 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free > > /* Free as much as possible if batch freeing high-order pages. 
*/ > if (unlikely(free_high)) > - return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX); > + return min(pcp->count, batch << pcp_batch_scale_max); > > /* Check for PCP disabled or boot pageset */ > if (unlikely(high < batch)) > @@ -2470,7 +2471,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, > return 0; > > if (unlikely(free_high)) { > - pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX), > + pcp->high = max(high - (batch << pcp_batch_scale_max), > high_min); > return 0; > } > @@ -2540,9 +2541,9 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, > } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { > pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; > } > - if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX)) > + if (pcp->free_count < (batch << pcp_batch_scale_max)) > pcp->free_count = min(pcp->free_count + (1 << order), > - batch << CONFIG_PCP_BATCH_SCALE_MAX); > + batch << pcp_batch_scale_max); > high = nr_pcp_high(pcp, zone, batch, free_high); > if (pcp->count >= high) { > free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), > @@ -2884,7 +2885,7 @@ static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order) > * subsequent allocation of order-0 pages without any freeing. 
> */ > if (batch <= max_nr_alloc && > - pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX) > + pcp->alloc_factor < pcp_batch_scale_max) > pcp->alloc_factor++; > batch = min(batch, max_nr_alloc); > } > @@ -6251,6 +6252,15 @@ static struct ctl_table page_alloc_sysctl_table[] = { > .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, > .extra1 = SYSCTL_ZERO, > }, > + { > + .procname = "pcp_batch_scale_max", > + .data = &pcp_batch_scale_max, > + .maxlen = sizeof(pcp_batch_scale_max), > + .mode = 0644, > + .proc_handler = proc_dointvec_minmax, > + .extra1 = SYSCTL_ZERO, > + .extra2 = SYSCTL_SIX, > + }, > { > .procname = "lowmem_reserve_ratio", > .data = &sysctl_lowmem_reserve_ratio, -- Best Regards, Huang, Ying