On 11/04/2016 12:36 PM, Andrea Arcangeli wrote: > On Thu, Nov 03, 2016 at 12:14:15PM -0700, Mike Kravetz wrote: >> + /* lookup dst_addr as we may have copied some pages */ >> + dst_vma = find_vma(dst_mm, dst_addr); > > I put back dst_start here. > >> + if (dst_addr < dst_vma->vm_start || >> + dst_addr + len - (copied * vma_hpagesize) > dst_vma->vm_end) >> + goto out_unlock; > > Actually this introduces a bug: copied * vma_hpagesize in the new > patch is wrong, copied is already in byte units. I rolled back this > one because of the dst_start commented above anyway. > >> + /* >> + * Validate alignment based on huge page size >> + */ >> + if (dst_addr & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) >> + goto out_unlock; > > If the vma changes under us we an as well fail. So I moved the > alignment checks on dst_start/len before the retry loop and I added a > further WARN_ON check inside the loop on dst_addr/len-copied just in > case but that cannot trigger as we abort if the vma_hpagesize changed > (hence WARN_ON). > > If we need to relax this later and handle a change of vma_hpagesize, > it'll be backwards compatible change. I don't think it's needed and > this is more strict behavior. > >> + while (src_addr < src_start + len) { >> + pte_t dst_pteval; >> + >> + BUG_ON(dst_addr >= dst_start + len); >> + dst_addr &= huge_page_mask(h); > > The additional mask is superflous here, it was already enforced by the > alignment checks so I turned it into a bugcheck. Thanks, I had made similar hugetlb changes and was testing. I'll perform hugetlb testing with the full diff patch below. -- Mike Kravetz > > This is the current status, I'm sending a full diff against the > previous submit for review of the latest updates. It's easier to > review incrementally I think. > > Please test it, I updated the aa.git tree userfault branch in sync > with this. > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > index 063ccc7..8a0ee3ba 100644 > --- a/fs/userfaultfd.c > +++ b/fs/userfaultfd.c > @@ -628,11 +628,11 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, > } > } > > -void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx vm_ctx, > +void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, > unsigned long from, unsigned long to, > unsigned long len) > { > - struct userfaultfd_ctx *ctx = vm_ctx.ctx; > + struct userfaultfd_ctx *ctx = vm_ctx->ctx; > struct userfaultfd_wait_queue ewq; > > if (!ctx) > @@ -657,6 +657,7 @@ void madvise_userfault_dontneed(struct vm_area_struct *vma, > struct vm_area_struct **prev, > unsigned long start, unsigned long end) > { > + struct mm_struct *mm = vma->vm_mm; > struct userfaultfd_ctx *ctx; > struct userfaultfd_wait_queue ewq; > > @@ -665,8 +666,9 @@ void madvise_userfault_dontneed(struct vm_area_struct *vma, > return; > > userfaultfd_ctx_get(ctx); > + up_read(&mm->mmap_sem); > + > *prev = NULL; /* We wait for ACK w/o the mmap semaphore */ > - up_read(&vma->vm_mm->mmap_sem); > > msg_init(&ewq.msg); > > @@ -676,7 +678,7 @@ void madvise_userfault_dontneed(struct vm_area_struct *vma, > > userfaultfd_event_wait_completion(ctx, &ewq); > > - down_read(&vma->vm_mm->mmap_sem); > + down_read(&mm->mmap_sem); > } > > static int userfaultfd_release(struct inode *inode, struct file *file) > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > index 5caf97f..01a4e98 100644 > --- a/include/linux/userfaultfd_k.h > +++ b/include/linux/userfaultfd_k.h > @@ -77,7 +77,7 @@ extern void dup_userfaultfd_complete(struct list_head *); > > extern void mremap_userfaultfd_prep(struct vm_area_struct *, > struct vm_userfaultfd_ctx *); > -extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx, > +extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, > unsigned long from, unsigned long to, > unsigned long len); > > @@ -143,7 +143,7 @@ static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, > { > } > > -static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx ctx, > +static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, > unsigned long from, > unsigned long to, > unsigned long len) > diff --git a/mm/mremap.c b/mm/mremap.c > index 450e811..cef4967 100644 > --- a/mm/mremap.c > +++ b/mm/mremap.c > @@ -592,6 +592,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, > up_write(¤t->mm->mmap_sem); > if (locked && new_len > old_len) > mm_populate(new_addr + old_len, new_len - old_len); > - mremap_userfaultfd_complete(uf, addr, new_addr, old_len); > + mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); > return ret; > } > diff --git a/mm/shmem.c b/mm/shmem.c > index 578622e..5d3e8bf 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -1609,7 +1609,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, > if (fault_type) { > *fault_type |= VM_FAULT_MAJOR; > count_vm_event(PGMAJFAULT); > - mem_cgroup_count_vm_event(vma->vm_mm, > + mem_cgroup_count_vm_event(charge_mm, > PGMAJFAULT); > } > /* Here we actually start the io */ > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > index d47b743..e8d7a89 100644 > --- a/mm/userfaultfd.c > +++ b/mm/userfaultfd.c > @@ -172,8 +172,10 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > * by THP. Since we can not reliably insert a zero page, this > * feature is not supported. > */ > - if (zeropage) > + if (zeropage) { > + up_read(&dst_mm->mmap_sem); > return -EINVAL; > + } > > src_addr = src_start; > dst_addr = dst_start; > @@ -181,6 +183,12 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > page = NULL; > vma_hpagesize = vma_kernel_pagesize(dst_vma); > > + /* > + * Validate alignment based on huge page size > + */ > + if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) > + goto out_unlock; > + > retry: > /* > * On routine entry dst_vma is set. If we had to drop mmap_sem and > @@ -189,11 +197,15 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > err = -EINVAL; > if (!dst_vma) { > dst_vma = find_vma(dst_mm, dst_start); > - vma_hpagesize = vma_kernel_pagesize(dst_vma); > + if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) > + goto out_unlock; > + > + if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) > + goto out_unlock; > > /* > - * Make sure the vma is not shared, that the dst range is > - * both valid and fully within a single existing vma. > + * Make sure the vma is not shared, that the remaining dst > + * range is both valid and fully within a single existing vma. > */ > if (dst_vma->vm_flags & VM_SHARED) > goto out_unlock; > @@ -202,10 +214,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > goto out_unlock; > } > > - /* > - * Validate alignment based on huge page size > - */ > - if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) > + if (WARN_ON(dst_addr & (vma_hpagesize - 1) || > + (len - copied) & (vma_hpagesize - 1))) > goto out_unlock; > > /* > @@ -227,7 +237,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > pte_t dst_pteval; > > BUG_ON(dst_addr >= dst_start + len); > - dst_addr &= huge_page_mask(h); > + VM_BUG_ON(dst_addr & ~huge_page_mask(h)); > > /* > * Serialize via hugetlb_fault_mutex > @@ -300,17 +310,13 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > return copied ? copied : err; > } > #else /* !CONFIG_HUGETLB_PAGE */ > -static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > - struct vm_area_struct *dst_vma, > - unsigned long dst_start, > - unsigned long src_start, > - unsigned long len, > - bool zeropage) > -{ > - up_read(&dst_mm->mmap_sem); /* HUGETLB not configured */ > - BUG(); > - return -EINVAL; > -} > +/* fail at build time if gcc attempts to use this */ > +extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > + struct vm_area_struct *dst_vma, > + unsigned long dst_start, > + unsigned long src_start, > + unsigned long len, > + bool zeropage); > #endif /* CONFIG_HUGETLB_PAGE */ > > static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, > @@ -360,9 +366,9 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, > /* > * If this is a HUGETLB vma, pass off to appropriate routine > */ > - if (dst_vma->vm_flags & VM_HUGETLB) > + if (is_vm_hugetlb_page(dst_vma)) > return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, > - src_start, len, false); > + src_start, len, zeropage); > > /* > * Be strict and only allow __mcopy_atomic on userfaultfd > @@ -431,8 +437,11 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, > err = mfill_zeropage_pte(dst_mm, dst_pmd, > dst_vma, dst_addr); > } else { > - err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, > - dst_addr, src_addr, &page); > + err = -EINVAL; /* if zeropage is true return -EINVAL */ > + if (likely(!zeropage)) > + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, > + dst_vma, dst_addr, > + src_addr, &page); > } > > cond_resched(); > diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c > index fed2119..5a840a6 100644 > --- a/tools/testing/selftests/vm/userfaultfd.c > +++ b/tools/testing/selftests/vm/userfaultfd.c > @@ -625,6 +625,86 @@ static int faulting_process(void) > return 0; > } > > +static int uffdio_zeropage(int ufd, unsigned long offset) > +{ > + struct uffdio_zeropage uffdio_zeropage; > + int ret; > + unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE); > + > + if (offset >= nr_pages * page_size) > + fprintf(stderr, "unexpected offset %lu\n", > + offset), exit(1); > + uffdio_zeropage.range.start = (unsigned long) area_dst + offset; > + uffdio_zeropage.range.len = page_size; > + uffdio_zeropage.mode = 0; > + ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); > + if (ret) { > + /* real retval in ufdio_zeropage.zeropage */ > + if (has_zeropage) { > + if (uffdio_zeropage.zeropage == -EEXIST) > + fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n"), > + exit(1); > + else > + fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n", > + uffdio_zeropage.zeropage), exit(1); > + } else { > + if (uffdio_zeropage.zeropage != -EINVAL) > + fprintf(stderr, > + "UFFDIO_ZEROPAGE not -EINVAL %Ld\n", > + uffdio_zeropage.zeropage), exit(1); > + } > + } else if (has_zeropage) { > + if (uffdio_zeropage.zeropage != page_size) { > + fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n", > + uffdio_zeropage.zeropage), exit(1); > + } else > + return 1; > + } else { > + fprintf(stderr, > + "UFFDIO_ZEROPAGE succeeded %Ld\n", > + uffdio_zeropage.zeropage), exit(1); > + } > + > + return 0; > +} > + > +/* exercise UFFDIO_ZEROPAGE */ > +static int userfaultfd_zeropage_test(void) > +{ > + struct uffdio_register uffdio_register; > + unsigned long expected_ioctls; > + > + printf("testing UFFDIO_ZEROPAGE: "); > + fflush(stdout); > + > + if (release_pages(area_dst)) > + return 1; > + > + if (userfaultfd_open(0) < 0) > + return 1; > + uffdio_register.range.start = (unsigned long) area_dst; > + uffdio_register.range.len = nr_pages * page_size; > + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; > + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) > + fprintf(stderr, "register failure\n"), exit(1); > + > + expected_ioctls = EXPECTED_IOCTLS; > + if ((uffdio_register.ioctls & expected_ioctls) != > + expected_ioctls) > + fprintf(stderr, > + "unexpected missing ioctl for anon memory\n"), > + exit(1); > + > + if (uffdio_zeropage(uffd, 0)) { > + if (my_bcmp(area_dst, zeropage, page_size)) > + fprintf(stderr, "zeropage is not zero\n"), exit(1); > + } > + > + close(uffd); > + printf("done.\n"); > + return 0; > +} > + > static int userfaultfd_events_test(void) > { > struct uffdio_register uffdio_register; > @@ -679,6 +759,7 @@ static int userfaultfd_events_test(void) > if (pthread_join(uffd_mon, (void **)&userfaults)) > return 1; > > + close(uffd); > printf("userfaults: %ld\n", userfaults); > > return userfaults != nr_pages; > @@ -852,7 +933,7 @@ static int userfaultfd_stress(void) > return err; > > close(uffd); > - return userfaultfd_events_test(); > + return userfaultfd_zeropage_test() || userfaultfd_events_test(); > } > > #ifndef HUGETLB_TEST > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>