Re: [PATCH 31/31] mm, vmstat: Remove zone and node double accounting by approximating retries

"Hillf Danton" <hillf.zj@xxxxxxxxxxxxxxx> · Tue, 05 Jul 2016 16:07:23 +0800



> 
> The number of LRU pages, dirty pages and writeback pages must be accounted
> for on both zones and nodes because of the reclaim retry logic, compaction
> retry logic and highmem calculations all depending on per-zone stats.
> 
> The retry logic is only critical for allocations that can use any zones.
> Hence this patch will not retry reclaim or compaction for such allocations.
> This should not be a problem for reclaim as zone-constrained allocations
> are immune from OOM kill. For retries, a very rough approximation is made
> whether to retry or not. While it is possible this will make the wrong
> decision on occasion, it will not loop infinitely as the number of reclaim
> attempts is capped by MAX_RECLAIM_RETRIES.
> 
> The highmem calculations only care about the global count of file pages
> in highmem. Hence, a global counter is used instead of per-zone stats.
> With this, the per-zone double accounting disappears.
> 
> Suggested-by: Michal Hocko <mhocko@xxxxxxxxxx>
> Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> ---
Acked-by: Hillf Danton <hillf.zj@xxxxxxxxxxxxxxx>

>  include/linux/mm_inline.h | 20 +++++++++++--
>  include/linux/mmzone.h    |  4 ---
>  include/linux/swap.h      |  1 -
>  mm/compaction.c           | 22 ++++++++++++++-
>  mm/migrate.c              |  2 --
>  mm/page-writeback.c       | 13 ++++-----
>  mm/page_alloc.c           | 71 ++++++++++++++++++++++++++++++++---------------
>  mm/vmscan.c               | 16 -----------
>  mm/vmstat.c               |  3 --
>  9 files changed, 92 insertions(+), 60 deletions(-)
> 
[...]
> @@ -3445,6 +3445,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
>  {
>  	struct zone *zone;
>  	struct zoneref *z;
> +	pg_data_t *current_pgdat = NULL;
> 
>  	/*
>  	 * Make sure we converge to OOM if we cannot make any progress
> @@ -3454,6 +3455,14 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
>  		return false;
> 
>  	/*
> +	 * Blindly retry allocation requests that cannot use all zones. We do
> +	 * not have a reliable and fast means of calculating reclaimable, dirty
> +	 * and writeback pages in eligible zones.
> +	 */
> +	if (IS_ENABLED(CONFIG_HIGHMEM) && !is_highmem_idx(gfp_zone(gfp_mask)))
> +		goto out;
> +
> +	/*
>  	 * Keep reclaiming pages while there is a chance this will lead somewhere.
>  	 * If none of the target zones can satisfy our allocation request even
>  	 * if all reclaimable pages are considered then we are screwed and have
> @@ -3463,36 +3472,54 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
>  					ac->nodemask) {
>  		unsigned long available;
>  		unsigned long reclaimable;
> +		unsigned long write_pending = 0;
> +		int zid;
> +
> +		if (current_pgdat == zone->zone_pgdat)
> +			continue;
> 
> -		available = reclaimable = zone_reclaimable_pages(zone);
> +		current_pgdat = zone->zone_pgdat;
> +		available = reclaimable = pgdat_reclaimable_pages(current_pgdat);
>  		available -= DIV_ROUND_UP(no_progress_loops * available,
>  					MAX_RECLAIM_RETRIES);
> -		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
> +		write_pending = node_page_state(current_pgdat, NR_WRITEBACK) +
> +					node_page_state(current_pgdat, NR_FILE_DIRTY);
> 
> -		/*
> -		 * Would the allocation succeed if we reclaimed the whole
> -		 * available?
> -		 */
> -		if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
> -				ac_classzone_idx(ac), alloc_flags, available)) {
> -			/*
> -			 * If we didn't make any progress and have a lot of
> -			 * dirty + writeback pages then we should wait for
> -			 * an IO to complete to slow down the reclaim and
> -			 * prevent from pre mature OOM
> -			 */
> -			if (!did_some_progress) {
> -				unsigned long write_pending;
> +		/* Account for all free pages on eligible zones */
> +		for (zid = 0; zid <= zone_idx(zone); zid++) {
> +			struct zone *acct_zone = &current_pgdat->node_zones[zid];
> 
> -				write_pending = zone_page_state_snapshot(zone,
> -							NR_ZONE_WRITE_PENDING);
> +			available += zone_page_state_snapshot(acct_zone, NR_FREE_PAGES);
> +		}
> 
> -				if (2 * write_pending > reclaimable) {
> -					congestion_wait(BLK_RW_ASYNC, HZ/10);
> -					return true;
> -				}
> +		/*
> +		 * If we didn't make any progress and have a lot of
> +		 * dirty + writeback pages then we should wait for an IO to
> +		 * complete to slow down the reclaim and prevent from premature
> +		 * OOM.
> +		 */
> +		if (!did_some_progress) {
> +			if (2 * write_pending > reclaimable) {
> +				congestion_wait(BLK_RW_ASYNC, HZ/10);
> +				return true;
>  			}
> +		}
> 
> +		/*
> +		 * Would the allocation succeed if we reclaimed the whole
> +		 * available? This is approximate because there is no
> +		 * accurate count of reclaimable pages per zone.
> +		 */
> +		for (zid = 0; zid <= zone_idx(zone); zid++) {
> +			struct zone *check_zone = &current_pgdat->node_zones[zid];
> +			unsigned long estimate;
> +
> +			estimate = min(check_zone->managed_pages, available);
> +			if (__zone_watermark_ok(check_zone, order,
> +					min_wmark_pages(check_zone), ac_classzone_idx(ac),
> +					alloc_flags, available)) {
> +			}
Stray indent?

> +out:
>  			/*
>  			 * Memory allocation/reclaim might be called from a WQ
>  			 * context and the current implementation of the WQ
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 151c30dd27e2..c538a8cab43b 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@xxxxxxxxx">email@xxxxxxxxx</a>