Yafang Shao <laoar.shao@xxxxxxxxx> writes: > The configuration parameter PCP_BATCH_SCALE_MAX poses challenges for > quickly experimenting with specific workloads in a production environment, > particularly when monitoring latency spikes caused by contention on the > zone->lock. To address this, a new sysctl parameter vm.pcp_batch_scale_max > is introduced as a more practical alternative. In general, I'm neutral about the change. I can understand that a kernel configuration option isn't as flexible as a sysctl knob. But a sysctl knob is ABI too. > To ultimately mitigate the zone->lock contention issue, several suggestions > have been proposed. One approach involves dividing large zones into multi > smaller zones, as suggested by Matthew[0], while another entails splitting > the zone->lock using a mechanism similar to memory arenas and shifting away > from relying solely on zone_id to identify the range of free lists a > particular page belongs to[1]. However, implementing these solutions is > likely to necessitate a more extended development effort. Per my understanding, the change will hurt rather than improve zone->lock contention: lowering the batch scale means more lock acquisitions for the same work. What it will improve is page allocation/freeing latency, by bounding the time spent holding the lock in each batch. 
> Link: https://lore.kernel.org/linux-mm/ZnTrZ9mcAIRodnjx@xxxxxxxxxxxxxxxxxxxx/ [0] > Link: https://lore.kernel.org/linux-mm/20240705130943.htsyhhhzbcptnkcu@xxxxxxxxxxxxxxxxxxx/ [1] > Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx> > Cc: "Huang, Ying" <ying.huang@xxxxxxxxx> > Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> > Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> > Cc: David Rientjes <rientjes@xxxxxxxxxx> > --- > Documentation/admin-guide/sysctl/vm.rst | 15 +++++++++++++++ > include/linux/sysctl.h | 1 + > kernel/sysctl.c | 2 +- > mm/Kconfig | 11 ----------- > mm/page_alloc.c | 22 ++++++++++++++++------ > 5 files changed, 33 insertions(+), 18 deletions(-) > > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst > index e86c968a7a0e..eb9e5216eefe 100644 > --- a/Documentation/admin-guide/sysctl/vm.rst > +++ b/Documentation/admin-guide/sysctl/vm.rst > @@ -66,6 +66,7 @@ Currently, these files are in /proc/sys/vm: > - page_lock_unfairness > - panic_on_oom > - percpu_pagelist_high_fraction > +- pcp_batch_scale_max > - stat_interval > - stat_refresh > - numa_stat > @@ -864,6 +865,20 @@ mark based on the low watermark for the zone and the number of local > online CPUs. If the user writes '0' to this sysctl, it will revert to > this default behavior. > > +pcp_batch_scale_max > +=================== > + > +In page allocator, PCP (Per-CPU pageset) is refilled and drained in > +batches. The batch number is scaled automatically to improve page > +allocation/free throughput. But too large scale factor may hurt > +latency. This option sets the upper limit of scale factor to limit > +the maximum latency. > + > +The range for this parameter spans from 0 to 6, with a default value of 5. > +The value assigned to 'N' signifies that during each refilling or draining > +process, a maximum of (batch << N) pages will be involved, where "batch" > +represents the default batch size automatically computed by the kernel for > +each zone. 
> > stat_interval > ============= > diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h > index 09db2f2e6488..fb797f1c0ef7 100644 > --- a/include/linux/sysctl.h > +++ b/include/linux/sysctl.h > @@ -52,6 +52,7 @@ struct ctl_dir; > /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ > #define SYSCTL_MAXOLDUID ((void *)&sysctl_vals[10]) > #define SYSCTL_NEG_ONE ((void *)&sysctl_vals[11]) > +#define SYSCTL_SIX ((void *)&sysctl_vals[12]) > > extern const int sysctl_vals[]; > > diff --git a/kernel/sysctl.c b/kernel/sysctl.c > index e0b917328cf9..430ac4f58eb7 100644 > --- a/kernel/sysctl.c > +++ b/kernel/sysctl.c > @@ -82,7 +82,7 @@ > #endif > > /* shared constants to be used in various sysctls */ > -const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; > +const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1, 6 }; > EXPORT_SYMBOL(sysctl_vals); > > const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; > diff --git a/mm/Kconfig b/mm/Kconfig > index b4cb45255a54..41fe4c13b7ac 100644 > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -663,17 +663,6 @@ config HUGETLB_PAGE_SIZE_VARIABLE > config CONTIG_ALLOC > def_bool (MEMORY_ISOLATION && COMPACTION) || CMA > > -config PCP_BATCH_SCALE_MAX > - int "Maximum scale factor of PCP (Per-CPU pageset) batch allocate/free" > - default 5 > - range 0 6 > - help > - In page allocator, PCP (Per-CPU pageset) is refilled and drained in > - batches. The batch number is scaled automatically to improve page > - allocation/free throughput. But too large scale factor may hurt > - latency. This option sets the upper limit of scale factor to limit > - the maximum latency. 
> - > config PHYS_ADDR_T_64BIT > def_bool 64BIT > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 2b76754a48e0..703eec22a997 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -273,6 +273,7 @@ int min_free_kbytes = 1024; > int user_min_free_kbytes = -1; > static int watermark_boost_factor __read_mostly = 15000; > static int watermark_scale_factor = 10; > +static int pcp_batch_scale_max = 5; > > /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ > int movable_zone; > @@ -2310,7 +2311,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) > int count = READ_ONCE(pcp->count); > > while (count) { > - int to_drain = min(count, pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX); > + int to_drain = min(count, pcp->batch << pcp_batch_scale_max); > count -= to_drain; > > spin_lock(&pcp->lock); > @@ -2438,7 +2439,7 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free > > /* Free as much as possible if batch freeing high-order pages. 
*/ > if (unlikely(free_high)) > - return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX); > + return min(pcp->count, batch << pcp_batch_scale_max); > > /* Check for PCP disabled or boot pageset */ > if (unlikely(high < batch)) > @@ -2470,7 +2471,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, > return 0; > > if (unlikely(free_high)) { > - pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX), > + pcp->high = max(high - (batch << pcp_batch_scale_max), > high_min); > return 0; > } > @@ -2540,9 +2541,9 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, > } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { > pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; > } > - if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX)) > + if (pcp->free_count < (batch << pcp_batch_scale_max)) > pcp->free_count = min(pcp->free_count + (1 << order), > - batch << CONFIG_PCP_BATCH_SCALE_MAX); > + batch << pcp_batch_scale_max); > high = nr_pcp_high(pcp, zone, batch, free_high); > if (pcp->count >= high) { > free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), > @@ -2884,7 +2885,7 @@ static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order) > * subsequent allocation of order-0 pages without any freeing. 
> */ > if (batch <= max_nr_alloc && > - pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX) > + pcp->alloc_factor < pcp_batch_scale_max) > pcp->alloc_factor++; > batch = min(batch, max_nr_alloc); > } > @@ -6251,6 +6252,15 @@ static struct ctl_table page_alloc_sysctl_table[] = { > .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, > .extra1 = SYSCTL_ZERO, > }, > + { > + .procname = "pcp_batch_scale_max", > + .data = &pcp_batch_scale_max, > + .maxlen = sizeof(pcp_batch_scale_max), > + .mode = 0644, > + .proc_handler = proc_dointvec_minmax, > + .extra1 = SYSCTL_ZERO, > + .extra2 = SYSCTL_SIX, > + }, > { > .procname = "lowmem_reserve_ratio", > .data = &sysctl_lowmem_reserve_ratio, -- Best Regards, Huang, Ying