Re: + mm-thp-avoid-unnecessary-swapin-in-khugepaged.patch added to -mm tree

On Tue, May 17, 2016 at 11:02:54AM +0200, Michal Hocko wrote:
> On Tue 17-05-16 09:58:15, Michal Hocko wrote:
> > On Thu 28-04-16 17:19:21, Michal Hocko wrote:
> > > On Wed 27-04-16 14:17:20, Andrew Morton wrote:
> > > [...]
> > > > @@ -2484,7 +2485,14 @@ static void collapse_huge_page(struct mm
> > > >  		goto out;
> > > >  	}
> > > >  
> > > > -	__collapse_huge_page_swapin(mm, vma, address, pmd);
> > > > +	swap = get_mm_counter(mm, MM_SWAPENTS);
> > > > +	curr_allocstall = sum_vm_event(ALLOCSTALL);
> > > > +	/*
> > > > +	 * When the system is under memory pressure, skip swapin
> > > > +	 * readahead to avoid unnecessary resource consumption.
> > > > +	 */
> > > > +	if (allocstall == curr_allocstall && swap != 0)
> > > > +		__collapse_huge_page_swapin(mm, vma, address, pmd);
> > > >  
> > > >  	anon_vma_lock_write(vma->anon_vma);
> > > >  
> > > 
> > > I have mentioned this before already, but this seems like a rather
> > > weak heuristic. Shouldn't we rather teach __collapse_huge_page_swapin
> > > (resp. do_swap_page) to do optimistic GFP_NOWAIT allocations and
> > > back off under memory pressure?
> > 
> > I gave it a try and it doesn't seem really bad. Untested and I might
> > have missed something really obvious, but what do you think about this
> > approach rather than relying on ALLOCSTALL, which is a really weak
> > heuristic:

I like this approach better than playing with the allocstall vmevent diff,
which can be disabled in some configurations and is not a good indicator
of the current memory pressure situation.

However, I agree with Rik's requirement that we should not churn the page
cache just to collapse a THP page via swapin. Your suggestion cannot
prevent that, because khugepaged can keep consuming memory through this
swapin operation while kswapd ages the LRU lists in parallel; IOW, free
memory just fluctuates between the HIGH and LOW watermarks.

So how about using waitqueue_active(&pgdat->kswapd_wait) to detect
current memory pressure? If kswapd is active, we could avoid swapin
for THP collapsing.

> 
> Ups forgot to add mm/internal.h to the git index
> ---
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 87f09dc986ab..1a4d4c807d92 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2389,7 +2389,8 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
>  		swapped_in++;
>  		ret = do_swap_page(mm, vma, _address, pte, pmd,
>  				   FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
> -				   pteval);
> +				   pteval,
> +				   GFP_HIGHUSER_MOVABLE & ~__GFP_DIRECT_RECLAIM);
>  		if (ret & VM_FAULT_ERROR) {
>  			trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
>  			return;
> diff --git a/mm/internal.h b/mm/internal.h
> index b6ead95a0184..0b3cc643eced 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -37,7 +37,7 @@
>  
>  extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  			unsigned long address, pte_t *page_table, pmd_t *pmd,
> -			unsigned int flags, pte_t orig_pte);
> +			unsigned int flags, pte_t orig_pte, gfp_t gfp_mask);
>  
>  void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
>  		unsigned long floor, unsigned long ceiling);
> diff --git a/mm/memory.c b/mm/memory.c
> index d79c6db41502..f897ec89bd79 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2490,7 +2490,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
>   */
>  int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  		unsigned long address, pte_t *page_table, pmd_t *pmd,
> -		unsigned int flags, pte_t orig_pte)
> +		unsigned int flags, pte_t orig_pte, gfp_t gfp_mask)
>  {
>  	spinlock_t *ptl;
>  	struct page *page, *swapcache;
> @@ -2519,8 +2519,7 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
>  	page = lookup_swap_cache(entry);
>  	if (!page) {
> -		page = swapin_readahead(entry,
> -					GFP_HIGHUSER_MOVABLE, vma, address);
> +		page = swapin_readahead(entry, gfp_mask, vma, address);
>  		if (!page) {
>  			/*
>  			 * Back out if somebody else faulted in this pte
> @@ -2573,7 +2572,7 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  		goto out_page;
>  	}
>  
> -	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
> +	if (mem_cgroup_try_charge(page, mm, gfp_mask, &memcg, false)) {
>  		ret = VM_FAULT_OOM;
>  		goto out_page;
>  	}
> @@ -3349,7 +3348,7 @@ static int handle_pte_fault(struct mm_struct *mm,
>  						flags, entry);
>  		}
>  		return do_swap_page(mm, vma, address,
> -					pte, pmd, flags, entry);
> +					pte, pmd, flags, entry, GFP_HIGHUSER_MOVABLE);
>  	}
>  
>  	if (pte_protnone(entry))
> -- 
> Michal Hocko
> SUSE Labs
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>
