Re: [PATCHv9-rebased2 01/37] mm, thp: make swapin readahead under down_read of mmap_sem

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



> 
> From: Ebru Akagunduz <ebru.akagunduz@xxxxxxxxx>
> 
> Currently khugepaged makes swapin readahead under down_write.  This patch
> supplies to make swapin readahead under down_read instead of down_write.
> 
> The patch was tested with a test program that allocates 800MB of memory,
> writes to it, and then sleeps.  The system was forced to swap out all.
> Afterwards, the test program touches the area by writing, it skips a page
> in each 20 pages of the area.
> 
> Link: http://lkml.kernel.org/r/1464335964-6510-4-git-send-email-ebru.akagunduz@xxxxxxxxx
> Signed-off-by: Ebru Akagunduz <ebru.akagunduz@xxxxxxxxx>
> Cc: Hugh Dickins <hughd@xxxxxxxxxx>
> Cc: Rik van Riel <riel@xxxxxxxxxx>
> Cc: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx>
> Cc: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Cc: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx>
> Cc: Cyrill Gorcunov <gorcunov@xxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxx>
> Cc: David Rientjes <rientjes@xxxxxxxxxx>
> Cc: Vlastimil Babka <vbabka@xxxxxxx>
> Cc: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
> Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxx>
> Cc: Minchan Kim <minchan.kim@xxxxxxxxx>
> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> ---
>  mm/huge_memory.c | 92 ++++++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 63 insertions(+), 29 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f2bc57c45d2f..96dfe3f09bf6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2378,6 +2378,35 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
>  }
> 
>  /*
> + * If mmap_sem temporarily dropped, revalidate vma
> + * before taking mmap_sem.

See below

> + * Return 0 if succeeds, otherwise return none-zero
> + * value (scan code).
> + */
> +
> +static int hugepage_vma_revalidate(struct mm_struct *mm,
> +				   struct vm_area_struct *vma,
> +				   unsigned long address)
> +{
> +	unsigned long hstart, hend;
> +
> +	if (unlikely(khugepaged_test_exit(mm)))
> +		return SCAN_ANY_PROCESS;
> +
> +	vma = find_vma(mm, address);
> +	if (!vma)
> +		return SCAN_VMA_NULL;
> +
> +	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
> +	hend = vma->vm_end & HPAGE_PMD_MASK;
> +	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
> +		return SCAN_ADDRESS_RANGE;
> +	if (!hugepage_vma_check(vma))
> +		return SCAN_VMA_CHECK;
> +	return 0;
> +}
> +
> +/*
>   * Bring missing pages in from swap, to complete THP collapse.
>   * Only done if khugepaged_scan_pmd believes it is worthwhile.
>   *
> @@ -2385,7 +2414,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
>   * but with mmap_sem held to protect against vma changes.
>   */
> 
> -static void __collapse_huge_page_swapin(struct mm_struct *mm,
> +static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>  					struct vm_area_struct *vma,
>  					unsigned long address, pmd_t *pmd)
>  {
> @@ -2401,11 +2430,18 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
>  			continue;
>  		swapped_in++;
>  		ret = do_swap_page(mm, vma, _address, pte, pmd,
> -				   FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
> +				   FAULT_FLAG_ALLOW_RETRY,

Add a description in change log for it please.

>  				   pteval);
> +		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
> +		if (ret & VM_FAULT_RETRY) {
> +			down_read(&mm->mmap_sem);
> +			/* vma is no longer available, don't continue to swapin */
> +			if (hugepage_vma_revalidate(mm, vma, address))
> +				return false;

Revalidate vma _after_ acquiring mmap_sem, but the above comment says _before_.

> +		}
>  		if (ret & VM_FAULT_ERROR) {
>  			trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
> -			return;
> +			return false;
>  		}
>  		/* pte is unmapped now, we need to map it */
>  		pte = pte_offset_map(pmd, _address);
> @@ -2413,6 +2449,7 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
>  	pte--;
>  	pte_unmap(pte);
>  	trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
> +	return true;
>  }
> 
>  static void collapse_huge_page(struct mm_struct *mm,
> @@ -2427,7 +2464,6 @@ static void collapse_huge_page(struct mm_struct *mm,
>  	struct page *new_page;
>  	spinlock_t *pmd_ptl, *pte_ptl;
>  	int isolated = 0, result = 0;
> -	unsigned long hstart, hend;
>  	struct mem_cgroup *memcg;
>  	unsigned long mmun_start;	/* For mmu_notifiers */
>  	unsigned long mmun_end;		/* For mmu_notifiers */
> @@ -2450,39 +2486,37 @@ static void collapse_huge_page(struct mm_struct *mm,
>  		goto out_nolock;
>  	}
> 
> -	/*
> -	 * Prevent all access to pagetables with the exception of
> -	 * gup_fast later hanlded by the ptep_clear_flush and the VM
> -	 * handled by the anon_vma lock + PG_lock.
> -	 */
> -	down_write(&mm->mmap_sem);
> -	if (unlikely(khugepaged_test_exit(mm))) {
> -		result = SCAN_ANY_PROCESS;
> +	down_read(&mm->mmap_sem);
> +	result = hugepage_vma_revalidate(mm, vma, address);
> +	if (result)
>  		goto out;
> -	}
> 
> -	vma = find_vma(mm, address);
> -	if (!vma) {
> -		result = SCAN_VMA_NULL;
> -		goto out;
> -	}
> -	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
> -	hend = vma->vm_end & HPAGE_PMD_MASK;
> -	if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
> -		result = SCAN_ADDRESS_RANGE;
> -		goto out;
> -	}
> -	if (!hugepage_vma_check(vma)) {
> -		result = SCAN_VMA_CHECK;
> -		goto out;
> -	}
>  	pmd = mm_find_pmd(mm, address);
>  	if (!pmd) {
>  		result = SCAN_PMD_NULL;
>  		goto out;
>  	}
> 
> -	__collapse_huge_page_swapin(mm, vma, address, pmd);
> +	/*
> +	 * __collapse_huge_page_swapin always returns with mmap_sem
> +	 * locked. If it fails, release mmap_sem and jump directly
> +	 * label out. Continuing to collapse causes inconsistency.
> +	 */
> +	if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) {
> +		up_read(&mm->mmap_sem);
> +		goto out;

Jump out with mmap_sem released, 

> +	}
> +
> +	up_read(&mm->mmap_sem);
> +	/*
> +	 * Prevent all access to pagetables with the exception of
> +	 * gup_fast later handled by the ptep_clear_flush and the VM
> +	 * handled by the anon_vma lock + PG_lock.
> +	 */
> +	down_write(&mm->mmap_sem);
> +	result = hugepage_vma_revalidate(mm, vma, address);
> +	if (result)
> +		goto out;

but jump out again with mmap_sem held.

They are cleaned up in subsequent darns?

> 
>  	anon_vma_lock_write(vma->anon_vma);
> 
> --
> 2.8.1


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>



[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]