The patch titled Subject: mempolicy: apply page table walker on queue_pages_range() has been added to the -mm tree. Its filename is mempolicy-apply-page-table-walker-on-queue_pages_range.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/mempolicy-apply-page-table-walker-on-queue_pages_range.patch and later at http://ozlabs.org/~akpm/mmotm/broken-out/mempolicy-apply-page-table-walker-on-queue_pages_range.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/SubmitChecklist when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx> Subject: mempolicy: apply page table walker on queue_pages_range() queue_pages_range() does page table walking in its own way now, but there is some code duplication. This patch applies the page table walker to reduce the lines of code. queue_pages_range() has to do some prechecks to determine whether we really walk over the vma or just skip it. Now we have the test_walk() callback in mm_walk for this purpose, so we can do this replacement cleanly. queue_pages_test_walk() depends on not only the current vma but also the previous one, so queue_pages->prev is introduced to remember it. Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx> Cc: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> Cc: Cyrill Gorcunov <gorcunov@xxxxxxxxxx> Cc: Dave Hansen <dave.hansen@xxxxxxxxx> Cc: Kirill A. 
Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> Cc: Pavel Emelyanov <xemul@xxxxxxxxxxxxx> Cc: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- mm/mempolicy.c | 224 ++++++++++++++++++----------------------------- 1 file changed, 90 insertions(+), 134 deletions(-) diff -puN mm/mempolicy.c~mempolicy-apply-page-table-walker-on-queue_pages_range mm/mempolicy.c --- a/mm/mempolicy.c~mempolicy-apply-page-table-walker-on-queue_pages_range +++ a/mm/mempolicy.c @@ -471,24 +471,34 @@ static const struct mempolicy_operations static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); +struct queue_pages { + struct list_head *pagelist; + unsigned long flags; + nodemask_t *nmask; + struct vm_area_struct *prev; +}; + /* * Scan through pages checking if pages follow certain conditions, * and move them to the pagelist if they do. */ -static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) +static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) { - pte_t *orig_pte; + struct vm_area_struct *vma = walk->vma; + struct page *page; + struct queue_pages *qp = walk->private; + unsigned long flags = qp->flags; + int nid; pte_t *pte; spinlock_t *ptl; - orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - do { - struct page *page; - int nid; + split_huge_page_pmd(vma, addr, pmd); + if (pmd_trans_unstable(pmd)) + return 0; + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); @@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct if (PageReserved(page)) continue; nid = page_to_nid(page); - if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + if (node_isset(nid, *qp->nmask) 
== !!(flags & MPOL_MF_INVERT)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - migrate_page_add(page, private, flags); - else - break; - } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(orig_pte, ptl); - return addr != end; + migrate_page_add(page, qp->pagelist, flags); + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; } -static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, - pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, - void *private) +static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE + struct queue_pages *qp = walk->private; + unsigned long flags = qp->flags; int nid; struct page *page; spinlock_t *ptl; pte_t entry; - ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); - entry = huge_ptep_get((pte_t *)pmd); + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + entry = huge_ptep_get(pte); if (!pte_present(entry)) goto unlock; page = pte_page(entry); nid = page_to_nid(page); - if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) goto unlock; /* With MPOL_MF_MOVE, we migrate only unshared hugepage. 
*/ if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) - isolate_huge_page(page, private); + isolate_huge_page(page, qp->pagelist); unlock: spin_unlock(ptl); #else BUG(); #endif -} - -static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (!pmd_present(*pmd)) - continue; - if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { - queue_pages_hugetlb_pmd_range(vma, pmd, nodes, - flags, private); - continue; - } - split_huge_page_pmd(vma, addr, pmd); - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - continue; - if (queue_pages_pte_range(vma, pmd, addr, next, nodes, - flags, private)) - return -EIO; - } while (pmd++, addr = next, addr != end); - return 0; -} - -static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) -{ - pud_t *pud; - unsigned long next; - - pud = pud_offset(pgd, addr); - do { - next = pud_addr_end(addr, end); - if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) - continue; - if (pud_none_or_clear_bad(pud)) - continue; - if (queue_pages_pmd_range(vma, pud, addr, next, nodes, - flags, private)) - return -EIO; - } while (pud++, addr = next, addr != end); - return 0; -} - -static inline int queue_pages_pgd_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) -{ - pgd_t *pgd; - unsigned long next; - - pgd = pgd_offset(vma->vm_mm, addr); - do { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - if (queue_pages_pud_range(vma, pgd, addr, next, nodes, - flags, private)) - return -EIO; - } while (pgd++, addr = next, addr != end); return 0; } @@ 
-641,6 +583,46 @@ static unsigned long change_prot_numa(st } #endif /* CONFIG_NUMA_BALANCING */ +static int queue_pages_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct queue_pages *qp = walk->private; + unsigned long endvma = vma->vm_end; + unsigned long flags = qp->flags; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; + + if (!(flags & MPOL_MF_DISCONTIG_OK)) { + if (!vma->vm_next && vma->vm_end < end) + return -EFAULT; + if (qp->prev && qp->prev->vm_end < vma->vm_start) + return -EFAULT; + } + + qp->prev = vma; + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + if (flags & MPOL_MF_LAZY) { + /* Similar to task_numa_work, skip inaccessible VMAs */ + if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + change_prot_numa(vma, start, endvma); + return 1; + } + + if ((flags & MPOL_MF_STRICT) || + ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && + vma_migratable(vma))) + /* queue pages from current vma */ + return 0; + return 1; +} + /* * Walk through page tables and collect pages to be migrated. 
* @@ -650,50 +632,24 @@ static unsigned long change_prot_numa(st */ static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, - const nodemask_t *nodes, unsigned long flags, void *private) + nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { - int err = 0; - struct vm_area_struct *vma, *prev; + struct queue_pages qp = { + .pagelist = pagelist, + .flags = flags, + .nmask = nodes, + .prev = NULL, + }; + struct mm_walk queue_pages_walk = { + .hugetlb_entry = queue_pages_hugetlb, + .pmd_entry = queue_pages_pte_range, + .test_walk = queue_pages_test_walk, + .mm = mm, + .private = &qp, + }; - vma = find_vma(mm, start); - if (!vma) - return -EFAULT; - prev = NULL; - for (; vma && vma->vm_start < end; vma = vma->vm_next) { - unsigned long endvma = vma->vm_end; - - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; - - if (!(flags & MPOL_MF_DISCONTIG_OK)) { - if (!vma->vm_next && vma->vm_end < end) - return -EFAULT; - if (prev && prev->vm_end < vma->vm_start) - return -EFAULT; - } - - if (flags & MPOL_MF_LAZY) { - /* Similar to task_numa_work, skip inaccessible VMAs */ - if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) - change_prot_numa(vma, start, endvma); - goto next; - } - - if ((flags & MPOL_MF_STRICT) || - ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma))) { - - err = queue_pages_pgd_range(vma, start, endvma, nodes, - flags, private); - if (err) - break; - } -next: - prev = vma; - } - return err; + return walk_page_range(start, end, &queue_pages_walk); } /* _ Patches currently in -mm which might be from n-horiguchi@xxxxxxxxxxxxx are mm-pagewalk-call-pte_hole-for-vm_pfnmap-during-walk_page_range.patch mm-add-kpf_zero_page-flag-for-proc-kpageflags.patch mm-hugetlb-reduce-arch-dependent-code-around-follow_huge_.patch mm-hugetlb-pmd_huge-returns-true-for-non-present-hugepage.patch mm-hugetlb-take-page-table-lock-in-follow_huge_pmd.patch 
mm-hugetlb-fix-getting-refcount-0-page-in-hugetlb_fault.patch mm-hugetlb-add-migration-hwpoisoned-entry-check-in-hugetlb_change_protection.patch mm-hugetlb-add-migration-entry-check-in-__unmap_hugepage_range.patch mm-hugetlb-fix-suboptimal-migration-hwpoisoned-entry-check.patch mm-hugetlb-cleanup-and-rename-is_hugetlb_entry_migrationhwpoisoned.patch mm-pagewalk-remove-pgd_entry-and-pud_entry.patch revert-mm-pagewalk-call-pte_hole-for-vm_pfnmap-during-walk_page_range.patch pagewalk-improve-vma-handling.patch pagewalk-add-walk_page_vma.patch smaps-remove-mem_size_stats-vma-and-use-walk_page_vma.patch clear_refs-remove-clear_refs_private-vma-and-introduce-clear_refs_test_walk.patch pagemap-use-walk-vma-instead-of-calling-find_vma.patch numa_maps-fix-typo-in-gather_hugetbl_stats.patch numa_maps-remove-numa_maps-vma.patch memcg-cleanup-preparation-for-page-table-walk.patch arch-powerpc-mm-subpage-protc-use-walk-vma-and-walk_page_vma.patch mempolicy-apply-page-table-walker-on-queue_pages_range.patch mm-proc-pid-clear_refs-avoid-split_huge_page.patch mincore-apply-page-table-walker-on-do_mincore.patch do_shared_fault-check-that-mmap_sem-is-held.patch -- To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html