From: Andrea Arcangeli <aarcange@xxxxxxxxxx> Subject: userfaultfd: non-cooperative: userfaultfd_remove revalidate vma in MADV_DONTNEED userfaultfd_remove() has to be execute before zapping the pagetables or UFFDIO_COPY could keep filling pages after zap_page_range returned, which would result in non zero data after a MADV_DONTNEED. However userfaultfd_remove() may have to release the mmap_sem. This was handled correctly in MADV_REMOVE, but MADV_DONTNEED accessed a potentially stale vma (the very vma passed to zap_page_range(vma, ...)). The fix consists in revalidating the vma in case userfaultfd_remove() had to release the mmap_sem. This also optimizes away an unnecessary down_read/up_read in the MADV_REMOVE case if UFFD_EVENT_FORK had to be delivered. It all remains zero runtime cost in case CONFIG_USERFAULTFD=n as userfaultfd_remove() will be defined as "true" at build time. Link: http://lkml.kernel.org/r/20170302173738.18994-3-aarcange@xxxxxxxxxx Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx> Acked-by: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx> Cc: "Dr. David Alan Gilbert" <dgilbert@xxxxxxxxxx> Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx> Cc: Pavel Emelyanov <xemul@xxxxxxxxxxxxx> Cc: Hillf Danton <hillf.zj@xxxxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- fs/userfaultfd.c | 9 ++---- include/linux/userfaultfd_k.h | 7 ++--- mm/madvise.c | 44 +++++++++++++++++++++++++++++--- 3 files changed, 47 insertions(+), 13 deletions(-) diff -puN fs/userfaultfd.c~userfaultfd-non-cooperative-userfaultfd_remove-revalidate-vma-in-madv_dontneed fs/userfaultfd.c --- a/fs/userfaultfd.c~userfaultfd-non-cooperative-userfaultfd_remove-revalidate-vma-in-madv_dontneed +++ a/fs/userfaultfd.c @@ -695,8 +695,7 @@ void mremap_userfaultfd_complete(struct userfaultfd_event_wait_completion(ctx, &ewq); } -void userfaultfd_remove(struct vm_area_struct *vma, - struct vm_area_struct **prev, +bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; @@ -705,13 +704,11 @@ void userfaultfd_remove(struct vm_area_s ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) - return; + return true; userfaultfd_ctx_get(ctx); up_read(&mm->mmap_sem); - *prev = NULL; /* We wait for ACK w/o the mmap semaphore */ - msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_REMOVE; @@ -720,7 +717,7 @@ void userfaultfd_remove(struct vm_area_s userfaultfd_event_wait_completion(ctx, &ewq); - down_read(&mm->mmap_sem); + return false; } static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, diff -puN include/linux/userfaultfd_k.h~userfaultfd-non-cooperative-userfaultfd_remove-revalidate-vma-in-madv_dontneed include/linux/userfaultfd_k.h --- a/include/linux/userfaultfd_k.h~userfaultfd-non-cooperative-userfaultfd_remove-revalidate-vma-in-madv_dontneed +++ a/include/linux/userfaultfd_k.h @@ -61,8 +61,7 @@ extern void mremap_userfaultfd_complete( unsigned long from, unsigned long to, unsigned long len); -extern void userfaultfd_remove(struct vm_area_struct *vma, - struct vm_area_struct **prev, +extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -118,11 +117,11 @@ static inline void mremap_userfaultfd_co { } -static inline void userfaultfd_remove(struct vm_area_struct *vma, - struct vm_area_struct **prev, +static inline bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { + return true; } static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, diff -puN mm/madvise.c~userfaultfd-non-cooperative-userfaultfd_remove-revalidate-vma-in-madv_dontneed mm/madvise.c --- a/mm/madvise.c~userfaultfd-non-cooperative-userfaultfd_remove-revalidate-vma-in-madv_dontneed +++ a/mm/madvise.c @@ -513,7 +513,43 @@ static long madvise_dontneed(struct vm_a if (!can_madv_dontneed_vma(vma)) return -EINVAL; - userfaultfd_remove(vma, prev, start, end); + if (!userfaultfd_remove(vma, start, end)) { + *prev = NULL; /* mmap_sem has been dropped, prev is stale */ + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, start); + if (!vma) + return -ENOMEM; + if (start < vma->vm_start) { + /* + * This "vma" under revalidation is the one + * with the lowest vma->vm_start where start + * is also < vma->vm_end. If start < + * vma->vm_start it means an hole materialized + * in the user address space within the + * virtual range passed to MADV_DONTNEED. + */ + return -ENOMEM; + } + if (!can_madv_dontneed_vma(vma)) + return -EINVAL; + if (end > vma->vm_end) { + /* + * Don't fail if end > vma->vm_end. If the old + * vma was splitted while the mmap_sem was + * released the effect of the concurrent + * operation may not cause MADV_DONTNEED to + * have an undefined result. There may be an + * adjacent next vma that we'll walk + * next. userfaultfd_remove() will generate an + * UFFD_EVENT_REMOVE repetition on the + * end-vma->vm_end range, but the manager can + * handle a repetition fine. + */ + end = vma->vm_end; + } + VM_WARN_ON(start >= end); + } zap_page_range(vma, start, end - start); return 0; } @@ -554,8 +590,10 @@ static long madvise_remove(struct vm_are * mmap_sem. */ get_file(f); - userfaultfd_remove(vma, prev, start, end); - up_read(¤t->mm->mmap_sem); + if (userfaultfd_remove(vma, start, end)) { + /* mmap_sem was not released by userfaultfd_remove() */ + up_read(¤t->mm->mmap_sem); + } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); _ -- To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html