Re: [PATCH -mm v2 02/11] madvise: cleanup swapin_walk_pmd_entry()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, 12 Jun 2014, Naoya Horiguchi wrote:

> With the recent update on page table walker, we can use common code for
> the walking more. Unlike many other users, this swapin_walk expects to
> handle swap entries. As a result we should be careful about ptl locking.
> Swapin operation, read_swap_cache_async(), could cause page reclaim, so
> we can't keep holding ptl throughout this pte loop.
> In order to properly handle ptl in pte_entry(), this patch adds two new
> members on struct mm_walk.
> 
> This cleanup is necessary to get to the final form of page table walker,
> where we should do all caller's specific work on leaf entries (IOW, all
> pmd_entry() should be used for trans_pmd.)
> 
> Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
> Cc: Hugh Dickins <hughd@xxxxxxxxxx>

Sorry, I believe this (and probably several other of your conversions)
is badly flawed.

You have a pattern of doing pte_offset_map_lock() inside the page walker,
then dropping and regetting map and lock inside your pte handler.

But on, say, x86_32 with CONFIG_HIGHMEM, CONFIG_SMP and CONFIG_PREEMPT,
you may be preempted then run on a different cpu while atomic kmap and
lock are dropped: so that the pte pointer then used on return to
walk_pte_range() will no longer correspond to the right mapping.

Presumably that can be fixed by keeping the pte pointer in the mm_walk
structure; but I'm not at all sure that's the right thing to do.

I am not nearly so keen as you to reduce all these to per-pte callouts,
which seem inefficient to me.  It can be argued both ways on the less
important functions (like this madvise one); but I hope you don't try
to make this kind of conversion to fast paths like those in memory.c.

Hugh

> ---
>  include/linux/mm.h |  4 ++++
>  mm/madvise.c       | 54 +++++++++++++++++++++++-------------------------------
>  mm/pagewalk.c      | 11 +++++------
>  3 files changed, 32 insertions(+), 37 deletions(-)
> 
> diff --git mmotm-2014-05-21-16-57.orig/include/linux/mm.h mmotm-2014-05-21-16-57/include/linux/mm.h
> index b4aa6579f2b1..aa832161a1ff 100644
> --- mmotm-2014-05-21-16-57.orig/include/linux/mm.h
> +++ mmotm-2014-05-21-16-57/include/linux/mm.h
> @@ -1108,6 +1108,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
>   * @vma:       vma currently walked
>   * @skip:      internal control flag which is set when we skip the lower
>   *             level entries.
> + * @pmd:       current pmd entry
> + * @ptl:       page table lock associated with current entry
>   * @private:   private data for callbacks' use
>   *
>   * (see the comment on walk_page_range() for more details)
> @@ -1126,6 +1128,8 @@ struct mm_walk {
>  	struct mm_struct *mm;
>  	struct vm_area_struct *vma;
>  	int skip;
> +	pmd_t *pmd;
> +	spinlock_t *ptl;
>  	void *private;
>  };
>  
> diff --git mmotm-2014-05-21-16-57.orig/mm/madvise.c mmotm-2014-05-21-16-57/mm/madvise.c
> index a402f8fdc68e..06b390a6fbbd 100644
> --- mmotm-2014-05-21-16-57.orig/mm/madvise.c
> +++ mmotm-2014-05-21-16-57/mm/madvise.c
> @@ -135,38 +135,31 @@ static long madvise_behavior(struct vm_area_struct *vma,
>  }
>  
>  #ifdef CONFIG_SWAP
> -static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
> +/*
> + * Assuming that page table walker holds page table lock.
> + */
> +static int swapin_walk_pte_entry(pte_t *pte, unsigned long start,
>  	unsigned long end, struct mm_walk *walk)
>  {
> -	pte_t *orig_pte;
> -	struct vm_area_struct *vma = walk->private;
> -	unsigned long index;
> -
> -	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
> -		return 0;
> -
> -	for (index = start; index != end; index += PAGE_SIZE) {
> -		pte_t pte;
> -		swp_entry_t entry;
> -		struct page *page;
> -		spinlock_t *ptl;
> -
> -		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
> -		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
> -		pte_unmap_unlock(orig_pte, ptl);
> -
> -		if (pte_present(pte) || pte_none(pte) || pte_file(pte))
> -			continue;
> -		entry = pte_to_swp_entry(pte);
> -		if (unlikely(non_swap_entry(entry)))
> -			continue;
> -
> -		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
> -								vma, index);
> -		if (page)
> -			page_cache_release(page);
> -	}
> +	pte_t ptent;
> +	pte_t *orig_pte = pte - ((start & (PMD_SIZE - 1)) >> PAGE_SHIFT);
> +	swp_entry_t entry;
> +	struct page *page;
>  
> +	ptent = *pte;
> +	pte_unmap_unlock(orig_pte, walk->ptl);
> +	if (pte_present(ptent) || pte_none(ptent) || pte_file(ptent))
> +		goto lock;
> +	entry = pte_to_swp_entry(ptent);
> +	if (unlikely(non_swap_entry(entry)))
> +		goto lock;
> +	page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
> +				     walk->vma, start);
> +	if (page)
> +		page_cache_release(page);
> +lock:
> +	pte_offset_map(walk->pmd, start & PMD_MASK);
> +	spin_lock(walk->ptl);
>  	return 0;
>  }
>  
> @@ -175,8 +168,7 @@ static void force_swapin_readahead(struct vm_area_struct *vma,
>  {
>  	struct mm_walk walk = {
>  		.mm = vma->vm_mm,
> -		.pmd_entry = swapin_walk_pmd_entry,
> -		.private = vma,
> +		.pte_entry = swapin_walk_pte_entry,
>  	};
>  
>  	walk_page_range(start, end, &walk);
> diff --git mmotm-2014-05-21-16-57.orig/mm/pagewalk.c mmotm-2014-05-21-16-57/mm/pagewalk.c
> index e734f63276c2..24311d6f5c20 100644
> --- mmotm-2014-05-21-16-57.orig/mm/pagewalk.c
> +++ mmotm-2014-05-21-16-57/mm/pagewalk.c
> @@ -27,10 +27,10 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr,
>  	struct mm_struct *mm = walk->mm;
>  	pte_t *pte;
>  	pte_t *orig_pte;
> -	spinlock_t *ptl;
>  	int err = 0;
>  
> -	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> +	walk->pmd = pmd;
> +	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &walk->ptl);
>  	do {
>  		if (pte_none(*pte)) {
>  			if (walk->pte_hole)
> @@ -48,7 +48,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr,
>  		if (err)
>  		       break;
>  	} while (pte++, addr += PAGE_SIZE, addr < end);
> -	pte_unmap_unlock(orig_pte, ptl);
> +	pte_unmap_unlock(orig_pte, walk->ptl);
>  	cond_resched();
>  	return addr == end ? 0 : err;
>  }
> @@ -172,7 +172,6 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
>  	unsigned long hmask = huge_page_mask(h);
>  	pte_t *pte;
>  	int err = 0;
> -	spinlock_t *ptl;
>  
>  	do {
>  		next = hugetlb_entry_end(h, addr, end);
> @@ -186,14 +185,14 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
>  				break;
>  			continue;
>  		}
> -		ptl = huge_pte_lock(h, mm, pte);
> +		walk->ptl = huge_pte_lock(h, mm, pte);
>  		/*
>  		 * Callers should have their own way to handle swap entries
>  		 * in walk->hugetlb_entry().
>  		 */
>  		if (walk->hugetlb_entry)
>  			err = walk->hugetlb_entry(pte, addr, next, walk);
> -		spin_unlock(ptl);
> +		spin_unlock(walk->ptl);
>  		if (err)
>  			break;
>  	} while (addr = next, addr != end);
> -- 
> 1.9.3
> 
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]