Commit-ID:  e9fe72334fb0a369a2d14799d0365514313da31c
Gitweb:     http://git.kernel.org/tip/e9fe72334fb0a369a2d14799d0365514313da31c
Author:     Andrea Arcangeli <aarcange@xxxxxxxxxx>
AuthorDate: Fri, 5 Oct 2012 21:36:27 +0200
Committer:  Ingo Molnar <mingo@xxxxxxxxxx>
CommitDate: Sun, 21 Oct 2012 14:40:38 +0200

numa, mm: Fix NUMA hinting page faults from gup/gup_fast

Introduce FOLL_NUMA to tell follow_page to check pte/pmd_numa.
get_user_pages must use FOLL_NUMA, and it's safe to do so because it
always invokes handle_mm_fault and retries the follow_page later.

KVM secondary MMU page faults will trigger the NUMA hinting page
faults through gup_fast -> get_user_pages -> follow_page ->
handle_mm_fault.

Other follow_page callers like KSM should not use FOLL_NUMA, or they
would fail to get the pages if they use follow_page instead of
get_user_pages.

[ This patch was picked up from the AutoNUMA tree. ]

Originally-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
[ ported to this tree.
] Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx> --- include/linux/mm.h | 1 + mm/memory.c | 55 ++++++++++++++++++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 64ccf29..54b3094 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1608,6 +1608,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_MLOCK 0x40 /* mark page as mlocked */ #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ +#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/memory.c b/mm/memory.c index 1ea7e5b..b609354 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1471,6 +1471,25 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); +static bool pte_prot_none(struct vm_area_struct *vma, pte_t pte) +{ + /* + * If we have the normal vma->vm_page_prot protections we're not a + * 'special' PROT_NONE page. + * + * This means we cannot get 'special' PROT_NONE faults from genuine + * PROT_NONE maps, nor from PROT_WRITE file maps that do dirty + * tracking. + * + * Neither case is really interesting for our current use though so we + * don't care. 
+ */ + if (pte_same(pte, pte_modify(pte, vma->vm_page_prot))) + return false; + + return pte_same(pte, pte_modify(pte, vma_prot_none(vma))); +} + /** * follow_page - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address @@ -1524,6 +1543,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } + if ((flags & FOLL_NUMA) && pmd_prot_none(vma, *pmd)) + goto no_page_table; if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { split_huge_page_pmd(mm, pmd); @@ -1553,6 +1574,8 @@ split_fallthrough: pte = *ptep; if (!pte_present(pte)) goto no_page; + if ((flags & FOLL_NUMA) && pte_prot_none(vma, pte)) + goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; @@ -1704,6 +1727,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (gup_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + + /* + * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault + * would be called on PROT_NONE ranges. We must never invoke + * handle_mm_fault on PROT_NONE ranges or the NUMA hinting + * page faults would unprotect the PROT_NONE ranges if + * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd + * bitflag. So to avoid that, don't set FOLL_NUMA if + * FOLL_FORCE is set. + */ + if (!(gup_flags & FOLL_FORCE)) + gup_flags |= FOLL_NUMA; + i = 0; do { @@ -3440,25 +3476,6 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -static bool pte_prot_none(struct vm_area_struct *vma, pte_t pte) -{ - /* - * If we have the normal vma->vm_page_prot protections we're not a - * 'special' PROT_NONE page. 
- * - * This means we cannot get 'special' PROT_NONE faults from genuine - * PROT_NONE maps, nor from PROT_WRITE file maps that do dirty - * tracking. - * - * Neither case is really interesting for our current use though so we - * don't care. - */ - if (pte_same(pte, pte_modify(pte, vma->vm_page_prot))) - return false; - - return pte_same(pte, pte_modify(pte, vma_prot_none(vma))); -} - static int do_prot_none(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pmd_t *pmd, unsigned int flags, pte_t entry) -- To unsubscribe from this list: send the line "unsubscribe linux-tip-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html