From: Li Xinhai <lixinhai.lxh@xxxxxxxxx>

mbind() is required to report EFAULT if the range specified by addr and
len contains unmapped holes.  In the current implementation, the
following rules are applied for this check:

1. Unmapped holes at any part of the specified range should be reported
   as EFAULT if mbind() is called for non-MPOL_DEFAULT cases;

2. Unmapped holes at any part of the specified range should be ignored
   (EFAULT is not reported) if mbind() is called for the MPOL_DEFAULT
   case;

3. A range lying entirely within an unmapped hole should be reported as
   EFAULT;

Note that rule 2 does not fulfill the mbind() API definition, but since
that behavior has existed for a long time (see the usage of the internal
flag MPOL_MF_DISCONTIG_OK), this patch does not plan to change it.

The cases which do not follow those rules, and which are fixed by this
patch, are:

case_1: unmapped hole at the tail side of the specified range, when
mbind() is called for non-MPOL_DEFAULT cases; EFAULT is not reported
(conflicts with rule 1).

  [  hole  ][  vma  ][  hole  ]
               [  range  ]

case_2: unmapped hole at the head side of the specified range, when
mbind() is called for the MPOL_DEFAULT case; EFAULT is reported
(conflicts with rule 2).

  [  hole  ][  vma  ][  hole  ]
     [  range  ]
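For illustration only (not part of the original patch): a minimal
userspace sketch of case_1, assuming libnuma's <numaif.h> wrapper for
mbind() (link with -lnuma) and that NUMA node 0 is online.  It maps two
pages, unmaps the second to punch a hole at the tail side, then calls
mbind() across both pages; per rule 1 the expected result is EFAULT,
while an unfixed kernel returns success.

#include <errno.h>
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	unsigned long nodemask = 1;	/* bind to node 0; assumed online */
	char *p;

	p = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}
	/* punch a hole at the tail side: [ vma ][ hole ] */
	if (munmap(p + page, page)) {
		perror("munmap");
		return EXIT_FAILURE;
	}
	/* range spans the vma and the trailing hole (case_1) */
	if (mbind(p, 2 * page, MPOL_BIND, &nodemask,
		  8 * sizeof(nodemask), 0) == -1 && errno == EFAULT)
		printf("EFAULT reported as expected (rule 1)\n");
	else
		printf("hole at tail side not reported (case_1 bug)\n");
	return EXIT_SUCCESS;
}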
Fixes: 9d8cebd4bcd7 ("mm: fix mbind vma merge problem")
Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()")
Fixes: 48684a65b4e3 ("mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP)")
Signed-off-by: Li Xinhai <lixinhai.lxh@xxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: Vlastimil Babka <vbabka@xxxxxxx>
Cc: linux-man <linux-man@xxxxxxxxxxxxxxx>
---
Changes v3->v4:
- fix my email address;

Changes v2->v3:
- Add more details in change log;
- Check holes in .test_walk() and after the call to walk_page_range();

 mm/mempolicy.c | 52 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ae967b..b2e10bf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -410,7 +410,9 @@ struct queue_pages {
 	struct list_head *pagelist;
 	unsigned long flags;
 	nodemask_t *nmask;
-	struct vm_area_struct *prev;
+	unsigned long start;
+	struct vm_area_struct *first;
+	struct vm_area_struct *last;
 };
 
 /*
@@ -618,6 +620,21 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 	unsigned long endvma = vma->vm_end;
 	unsigned long flags = qp->flags;
 
+	/* range check first */
+	VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end));
+
+	if (!qp->first) {
+		qp->first = vma;
+		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+			(qp->start < vma->vm_start))
+			/* hole at head side of range */
+			return -EFAULT;
+	} else if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+			(vma->vm_prev->vm_end < vma->vm_start))
+		/* hole at middle of range */
+		return -EFAULT;
+	qp->last = vma;
+
 	/*
 	 * Need check MPOL_MF_STRICT to return -EIO if possible
 	 * regardless of vma_migratable
@@ -628,17 +645,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 
 	if (endvma > end)
 		endvma = end;
-	if (vma->vm_start > start)
-		start = vma->vm_start;
-
-	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return -EFAULT;
-		if (qp->prev && qp->prev->vm_end < vma->vm_start)
-			return -EFAULT;
-	}
-
-	qp->prev = vma;
 
 	if (flags & MPOL_MF_LAZY) {
 		/* Similar to task_numa_work, skip inaccessible VMAs */
@@ -679,14 +685,29 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 		nodemask_t *nodes, unsigned long flags,
 		struct list_head *pagelist)
 {
+	int err;
 	struct queue_pages qp = {
 		.pagelist = pagelist,
 		.flags = flags,
 		.nmask = nodes,
-		.prev = NULL,
+		.start = start,
+		.first = NULL,
+		.last = NULL,
 	};
 
-	return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+
+	if (err != -EFAULT) {
+		if (!qp.first)
+			/* whole range in hole */
+			err = -EFAULT;
+		else if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+				(qp.last->vm_end < end))
+			/* hole at tail side of range */
+			err = -EFAULT;
+	}
+
+	return err;
 }
 
 /*
@@ -738,8 +759,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 	unsigned long vmend;
 
 	vma = find_vma(mm, start);
-	if (!vma || vma->vm_start > start)
-		return -EFAULT;
+	VM_BUG_ON(!vma);
 
 	prev = vma->vm_prev;
 	if (start > vma->vm_start)
-- 
1.8.3.1