Re: [RFC PATCH 20/26] mm: vmscan: use compaction_suitable() check in kswapd

"Huang, Ying" <ying.huang@xxxxxxxxx> · Tue, 25 Apr 2023 11:12:28 +0800

Johannes Weiner <hannes@xxxxxxxxxxx> writes:

> Kswapd currently bails on higher-order allocations with an open-coded
> check for whether it's reclaimed the compaction gap.
>
> compaction_suitable() is the customary interface to coordinate reclaim
> with compaction.
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
>  mm/vmscan.c | 67 ++++++++++++++++++-----------------------------------
>  1 file changed, 23 insertions(+), 44 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ee8c8ca2e7b5..723705b9e4d9 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -6872,12 +6872,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
>  		if (!managed_zone(zone))
>  			continue;
>  
> +		/* Allocation can succeed in any zone, done */
>  		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
>  			mark = wmark_pages(zone, WMARK_PROMO);
>  		else
>  			mark = high_wmark_pages(zone);
>  		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
>  			return true;
> +
> +		/* Allocation can't succeed, but enough order-0 to compact */
> +		if (compaction_suitable(zone, order,
> +					highest_zoneidx) == COMPACT_CONTINUE)
> +			return true;

Should we check the following first, before calling compaction_suitable()?

        order > 0 && zone_watermark_ok_safe(zone, 0, mark, highest_zoneidx)

Best Regards,
Huang, Ying

>  	}
>  
>  	/*
> @@ -6968,16 +6974,6 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
>  	 */
>  	shrink_node(pgdat, sc);
>  
> -	/*
> -	 * Fragmentation may mean that the system cannot be rebalanced for
> -	 * high-order allocations. If twice the allocation size has been
> -	 * reclaimed then recheck watermarks only at order-0 to prevent
> -	 * excessive reclaim. Assume that a process requested a high-order
> -	 * can direct reclaim/compact.
> -	 */
> -	if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
> -		sc->order = 0;
> -
>  	return sc->nr_scanned >= sc->nr_to_reclaim;
>  }
>  
> @@ -7018,15 +7014,13 @@ clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
>   * that are eligible for use by the caller until at least one zone is
>   * balanced.
>   *
> - * Returns the order kswapd finished reclaiming at.
> - *
>   * kswapd scans the zones in the highmem->normal->dma direction.  It skips
>   * zones which have free_pages > high_wmark_pages(zone), but once a zone is
>   * found to have free_pages <= high_wmark_pages(zone), any page in that zone
>   * or lower is eligible for reclaim until at least one usable zone is
>   * balanced.
>   */
> -static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
> +static void balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
>  {
>  	int i;
>  	unsigned long nr_soft_reclaimed;
> @@ -7226,14 +7220,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
>  	__fs_reclaim_release(_THIS_IP_);
>  	psi_memstall_leave(&pflags);
>  	set_task_reclaim_state(current, NULL);
> -
> -	/*
> -	 * Return the order kswapd stopped reclaiming at as
> -	 * prepare_kswapd_sleep() takes it into account. If another caller
> -	 * entered the allocator slow path while kswapd was awake, order will
> -	 * remain at the higher level.
> -	 */
> -	return sc.order;
>  }
>  
>  /*
> @@ -7251,7 +7237,7 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
>  	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
>  }
>  
> -static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
> +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
>  				unsigned int highest_zoneidx)
>  {
>  	long remaining = 0;
> @@ -7269,7 +7255,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>  	 * eligible zone balanced that it's also unlikely that compaction will
>  	 * succeed.
>  	 */
> -	if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
> +	if (prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) {
>  		/*
>  		 * Compaction records what page blocks it recently failed to
>  		 * isolate pages from and skips them in the future scanning.
> @@ -7282,7 +7268,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>  		 * We have freed the memory, now we should compact it to make
>  		 * allocation of the requested order possible.
>  		 */
> -		wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
> +		wakeup_kcompactd(pgdat, order, highest_zoneidx);
>  
>  		remaining = schedule_timeout(HZ/10);
>  
> @@ -7296,8 +7282,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>  					kswapd_highest_zoneidx(pgdat,
>  							highest_zoneidx));
>  
> -			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
> -				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
> +			if (READ_ONCE(pgdat->kswapd_order) < order)
> +				WRITE_ONCE(pgdat->kswapd_order, order);
>  		}
>  
>  		finish_wait(&pgdat->kswapd_wait, &wait);
> @@ -7308,8 +7294,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>  	 * After a short sleep, check if it was a premature sleep. If not, then
>  	 * go fully to sleep until explicitly woken up.
>  	 */
> -	if (!remaining &&
> -	    prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
> +	if (!remaining && prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) {
>  		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
>  
>  		/*
> @@ -7350,8 +7335,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>   */
>  static int kswapd(void *p)
>  {
> -	unsigned int alloc_order, reclaim_order;
> -	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
> +	unsigned int order, highest_zoneidx;
>  	pg_data_t *pgdat = (pg_data_t *)p;
>  	struct task_struct *tsk = current;
>  	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> @@ -7374,22 +7358,20 @@ static int kswapd(void *p)
>  	tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
>  	set_freezable();
>  
> -	WRITE_ONCE(pgdat->kswapd_order, 0);
> +	order = 0;
> +	highest_zoneidx = MAX_NR_ZONES - 1;
> +	WRITE_ONCE(pgdat->kswapd_order, order);
>  	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
> +
>  	atomic_set(&pgdat->nr_writeback_throttled, 0);
> +
>  	for ( ; ; ) {
>  		bool ret;
>  
> -		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
> -		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
> -							highest_zoneidx);
> -
> -kswapd_try_sleep:
> -		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
> -					highest_zoneidx);
> +		kswapd_try_to_sleep(pgdat, order, highest_zoneidx);
>  
>  		/* Read the new order and highest_zoneidx */
> -		alloc_order = READ_ONCE(pgdat->kswapd_order);
> +		order = READ_ONCE(pgdat->kswapd_order);
>  		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
>  							highest_zoneidx);
>  		WRITE_ONCE(pgdat->kswapd_order, 0);
> @@ -7415,11 +7397,8 @@ static int kswapd(void *p)
>  		 * request (alloc_order).
>  		 */
>  		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
> -						alloc_order);
> -		reclaim_order = balance_pgdat(pgdat, alloc_order,
> -						highest_zoneidx);
> -		if (reclaim_order < alloc_order)
> -			goto kswapd_try_sleep;
> +					    order);
> +		balance_pgdat(pgdat, order, highest_zoneidx);
>  	}
>  
>  	tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);