With the recent updates to the page table walker, more of the walk can be
handled by common code. Unlike many other users, swapin_walk needs to handle
swap entries, so we have to be careful about ptl locking: the swapin
operation, read_swap_cache_async(), can trigger page reclaim, so we cannot
keep holding the ptl throughout the pte loop. To let pte_entry() handle the
ptl properly, this patch adds two new members to struct mm_walk.

This cleanup is necessary to reach the final form of the page table walker,
where all caller-specific work is done on leaf entries (IOW, pmd_entry()
should be used only for trans_pmd).

Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
---
 include/linux/mm.h |  4 ++++
 mm/madvise.c       | 54 +++++++++++++++++++++++-------------------------------
 mm/pagewalk.c      | 11 +++++------
 3 files changed, 32 insertions(+), 37 deletions(-)

diff --git mmotm-2014-05-21-16-57.orig/include/linux/mm.h mmotm-2014-05-21-16-57/include/linux/mm.h
index b4aa6579f2b1..aa832161a1ff 100644
--- mmotm-2014-05-21-16-57.orig/include/linux/mm.h
+++ mmotm-2014-05-21-16-57/include/linux/mm.h
@@ -1108,6 +1108,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
  * @vma:       vma currently walked
  * @skip:      internal control flag which is set when we skip the lower
  *             level entries.
+ * @pmd:       current pmd entry
+ * @ptl:       page table lock associated with current entry
  * @private:   private data for callbacks' use
  *
  * (see the comment on walk_page_range() for more details)
@@ -1126,6 +1128,8 @@ struct mm_walk {
         struct mm_struct *mm;
         struct vm_area_struct *vma;
         int skip;
+        pmd_t *pmd;
+        spinlock_t *ptl;
         void *private;
 };
 
diff --git mmotm-2014-05-21-16-57.orig/mm/madvise.c mmotm-2014-05-21-16-57/mm/madvise.c
index a402f8fdc68e..06b390a6fbbd 100644
--- mmotm-2014-05-21-16-57.orig/mm/madvise.c
+++ mmotm-2014-05-21-16-57/mm/madvise.c
@@ -135,38 +135,31 @@ static long madvise_behavior(struct vm_area_struct *vma,
 }
 
 #ifdef CONFIG_SWAP
-static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
+/*
+ * Assumes that the page table walker holds the page table lock.
+ */
+static int swapin_walk_pte_entry(pte_t *pte, unsigned long start,
         unsigned long end, struct mm_walk *walk)
 {
-        pte_t *orig_pte;
-        struct vm_area_struct *vma = walk->private;
-        unsigned long index;
-
-        if (pmd_none_or_trans_huge_or_clear_bad(pmd))
-                return 0;
-
-        for (index = start; index != end; index += PAGE_SIZE) {
-                pte_t pte;
-                swp_entry_t entry;
-                struct page *page;
-                spinlock_t *ptl;
-
-                orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
-                pte = *(orig_pte + ((index - start) / PAGE_SIZE));
-                pte_unmap_unlock(orig_pte, ptl);
-
-                if (pte_present(pte) || pte_none(pte) || pte_file(pte))
-                        continue;
-                entry = pte_to_swp_entry(pte);
-                if (unlikely(non_swap_entry(entry)))
-                        continue;
-
-                page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
-                                             vma, index);
-                if (page)
-                        page_cache_release(page);
-        }
+        pte_t ptent;
+        pte_t *orig_pte = pte - ((start & (PMD_SIZE - 1)) >> PAGE_SHIFT);
+        swp_entry_t entry;
+        struct page *page;
 
+        ptent = *pte;
+        pte_unmap_unlock(orig_pte, walk->ptl);
+        if (pte_present(ptent) || pte_none(ptent) || pte_file(ptent))
+                goto lock;
+        entry = pte_to_swp_entry(ptent);
+        if (unlikely(non_swap_entry(entry)))
+                goto lock;
+        page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+                                     walk->vma, start);
+        if (page)
+                page_cache_release(page);
+lock:
+        pte_offset_map(walk->pmd, start & PMD_MASK);
+        spin_lock(walk->ptl);
         return 0;
 }
 
@@ -175,8 +168,7 @@ static void force_swapin_readahead(struct vm_area_struct *vma,
 {
         struct mm_walk walk = {
                 .mm = vma->vm_mm,
-                .pmd_entry = swapin_walk_pmd_entry,
-                .private = vma,
+                .pte_entry = swapin_walk_pte_entry,
         };
 
         walk_page_range(start, end, &walk);
diff --git mmotm-2014-05-21-16-57.orig/mm/pagewalk.c mmotm-2014-05-21-16-57/mm/pagewalk.c
index e734f63276c2..24311d6f5c20 100644
--- mmotm-2014-05-21-16-57.orig/mm/pagewalk.c
+++ mmotm-2014-05-21-16-57/mm/pagewalk.c
@@ -27,10 +27,10 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr,
         struct mm_struct *mm = walk->mm;
         pte_t *pte;
         pte_t *orig_pte;
-        spinlock_t *ptl;
         int err = 0;
 
-        orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+        walk->pmd = pmd;
+        orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &walk->ptl);
         do {
                 if (pte_none(*pte)) {
                         if (walk->pte_hole)
@@ -48,7 +48,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr,
                 if (err)
                         break;
         } while (pte++, addr += PAGE_SIZE, addr < end);
-        pte_unmap_unlock(orig_pte, ptl);
+        pte_unmap_unlock(orig_pte, walk->ptl);
         cond_resched();
         return addr == end ? 0 : err;
 }
@@ -172,7 +172,6 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
         unsigned long hmask = huge_page_mask(h);
         pte_t *pte;
         int err = 0;
-        spinlock_t *ptl;
 
         do {
                 next = hugetlb_entry_end(h, addr, end);
@@ -186,14 +185,14 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                                 break;
                         continue;
                 }
-                ptl = huge_pte_lock(h, mm, pte);
+                walk->ptl = huge_pte_lock(h, mm, pte);
                 /*
                  * Callers should have their own way to handle swap entries
                  * in walk->hugetlb_entry().
                  */
                 if (walk->hugetlb_entry)
                         err = walk->hugetlb_entry(pte, addr, next, walk);
-                spin_unlock(ptl);
+                spin_unlock(walk->ptl);
                 if (err)
                         break;
         } while (addr = next, addr != end);
-- 
1.9.3
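
For reference, a pte_entry() callback that needs to sleep is expected to
follow the pattern below with the new walk->pmd and walk->ptl members. This
is only a sketch mirroring swapin_walk_pte_entry() above; sample_pte_entry()
is an illustrative name and is not added by this patch:

static int sample_pte_entry(pte_t *pte, unsigned long addr,
                            unsigned long end, struct mm_walk *walk)
{
        /* walk_pte_range() has mapped the pte page and taken walk->ptl */
        pte_t ptent = *pte;     /* read the entry while still locked */
        /* base of the mapped pte page, needed by pte_unmap_unlock() */
        pte_t *orig_pte = pte - ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);

        if (pte_none(ptent))
                return 0;       /* nothing to do, lock still held */

        pte_unmap_unlock(orig_pte, walk->ptl);

        /* ... work that may sleep or trigger reclaim goes here ... */

        /* remap the pte page and retake the lock before returning */
        pte_offset_map(walk->pmd, addr & PMD_MASK);
        spin_lock(walk->ptl);
        return 0;
}

The callback must leave the pte page mapped and walk->ptl held on return,
because walk_pte_range() keeps iterating over the ptes and finally calls
pte_unmap_unlock(orig_pte, walk->ptl) itself.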