The patch titled
     Subject: userfaultfd: prevent non-cooperative events vs mcopy_atomic races
has been added to the -mm tree.  Its filename is
     userfaultfd-prevent-non-cooperative-events-vs-mcopy_atomic-races.patch

This patch should soon appear at
    http://ozlabs.org/~akpm/mmots/broken-out/userfaultfd-prevent-non-cooperative-events-vs-mcopy_atomic-races.patch
and later at
    http://ozlabs.org/~akpm/mmotm/broken-out/userfaultfd-prevent-non-cooperative-events-vs-mcopy_atomic-races.patch

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next and is updated
there every 3-4 working days

------------------------------------------------------
From: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx>
Subject: userfaultfd: prevent non-cooperative events vs mcopy_atomic races

If a process monitored with userfaultfd changes its memory mappings or
forks() at the same time as the uffd monitor fills the process memory
with UFFDIO_COPY, the actual creation of page table entries and copying
of the data in mcopy_atomic may happen either before or after the memory
mapping modifications, and there is no way for the uffd monitor to
maintain a consistent view of the process memory layout.

For instance, let's consider fork() running in parallel with
userfaultfd_copy():

        process                  |        uffd monitor
---------------------------------+------------------------------
fork()                           | userfaultfd_copy()
...                              | ...
    dup_mmap()                   |     down_read(mmap_sem)
    down_write(mmap_sem)         |     /* create PTEs, copy data */
        dup_uffd()               |     up_read(mmap_sem)
        copy_page_range()        |
        up_write(mmap_sem)       |
    dup_uffd_complete()          |
        /* notify monitor */     |

If userfaultfd_copy() takes the mmap_sem first, the new page(s) will be
present by the time copy_page_range() is called and they will appear in
the child's memory mappings.  However, if fork() is the first to take
the mmap_sem, the new pages won't be mapped in the child's address
space.

If the pages are not present and the child tries to access them, the
monitor will get a page fault notification and everything is fine.
However, if the pages *are present*, the child can access them without
uffd noticing, and if we copy them into the child it will see the wrong
data.  Since we are talking about a background copy, we would need to
decide whether the pages should be copied or not regardless of #PF
notifications.

Since the userfaultfd monitor has no way to determine the order of the
two operations, let's disallow userfaultfd_copy in parallel with the
non-cooperative events.  In that case we return -EAGAIN and the uffd
monitor can understand that userfaultfd_copy() clashed with a
non-cooperative event and take an appropriate action.
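The retry contract this introduces is visible from the monitor's side: a
failed UFFDIO_COPY with errno set to EAGAIN means "a non-cooperative
event is pending, process it and try again".  Below is a minimal sketch
of such a loop, not part of the patch itself: copy_one_page() and
handle_uffd_events() are hypothetical names, and copying a single page
at a time sidesteps partial-copy bookkeeping.

/*
 * Illustrative sketch only: retry UFFDIO_COPY when it clashes with a
 * non-cooperative event.  handle_uffd_events() is an assumed helper
 * that drains and processes the pending uffd event queue.
 */
#include <errno.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static void handle_uffd_events(int uffd);	/* assumed helper */

static int copy_one_page(int uffd, unsigned long dst, unsigned long src,
			 unsigned long page_size)
{
	struct uffdio_copy copy;

	memset(&copy, 0, sizeof(copy));
	copy.dst = dst;
	copy.src = src;
	copy.len = page_size;	/* one page: no partial copy to resume */
	copy.mode = 0;

	while (ioctl(uffd, UFFDIO_COPY, &copy) == -1) {
		if (errno != EAGAIN)
			return -1;	/* a real error */
		/*
		 * A non-cooperative event (fork, mremap, munmap, ...)
		 * raced with the copy: re-sync the view of the address
		 * space from the event queue, then retry.
		 */
		handle_uffd_events(uffd);
	}
	return 0;
}

A monitor batching larger ranges would additionally need to consult the
copy field of struct uffdio_copy to resume after a partial copy.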
Link: http://lkml.kernel.org/r/1527061324-19949-1-git-send-email-rppt@xxxxxxxxxxxxxxxxxx
Signed-off-by: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx>
Acked-by: Pavel Emelyanov <xemul@xxxxxxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
Cc: Andrei Vagin <avagin@xxxxxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 fs/userfaultfd.c              |   22 ++++++++++++++++++++--
 include/linux/userfaultfd_k.h |    6 ++++--
 mm/userfaultfd.c              |   22 +++++++++++++++++-----
 3 files changed, 41 insertions(+), 9 deletions(-)

diff -puN fs/userfaultfd.c~userfaultfd-prevent-non-cooperative-events-vs-mcopy_atomic-races fs/userfaultfd.c
--- a/fs/userfaultfd.c~userfaultfd-prevent-non-cooperative-events-vs-mcopy_atomic-races
+++ a/fs/userfaultfd.c
@@ -62,6 +62,8 @@ struct userfaultfd_ctx {
 	enum userfaultfd_state state;
 	/* released */
 	bool released;
+	/* memory mappings are changing because of non-cooperative event */
+	bool mmap_changing;
 	/* mm with one ore more vmas attached to this userfaultfd_ctx */
 	struct mm_struct *mm;
 };
@@ -641,6 +643,7 @@ static void userfaultfd_event_wait_compl
 	 * already released.
 	 */
 out:
+	WRITE_ONCE(ctx->mmap_changing, false);
 	userfaultfd_ctx_put(ctx);
 }
 
@@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struc
 		ctx->state = UFFD_STATE_RUNNING;
 		ctx->features = octx->features;
 		ctx->released = false;
+		ctx->mmap_changing = false;
 		ctx->mm = vma->vm_mm;
 		mmgrab(ctx->mm);
 
 		userfaultfd_ctx_get(octx);
+		WRITE_ONCE(octx->mmap_changing, true);
 		fctx->orig = octx;
 		fctx->new = ctx;
 		list_add_tail(&fctx->list, fcs);
@@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_a
 	if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
 		vm_ctx->ctx = ctx;
 		userfaultfd_ctx_get(ctx);
+		WRITE_ONCE(ctx->mmap_changing, true);
 	}
 }
 
@@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_s
 		return true;
 
 	userfaultfd_ctx_get(ctx);
+	WRITE_ONCE(ctx->mmap_changing, true);
 	up_read(&mm->mmap_sem);
 
 	msg_init(&ewq.msg);
@@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_are
 			return -ENOMEM;
 
 		userfaultfd_ctx_get(ctx);
+		WRITE_ONCE(ctx->mmap_changing, true);
 		unmap_ctx->ctx = ctx;
 		unmap_ctx->start = start;
 		unmap_ctx->end = end;
@@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userf
 
 	user_uffdio_copy = (struct uffdio_copy __user *) arg;
 
+	ret = -EAGAIN;
+	if (READ_ONCE(ctx->mmap_changing))
+		goto out;
+
 	ret = -EFAULT;
 	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
 			   /* don't copy "copy" last field */
@@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userf
 		goto out;
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-				   uffdio_copy.len);
+				   uffdio_copy.len, &ctx->mmap_changing);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct u
 
 	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
 
+	ret = -EAGAIN;
+	if (READ_ONCE(ctx->mmap_changing))
+		goto out;
+
 	ret = -EFAULT;
 	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
 			   /* don't copy "zeropage" last field */
@@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct u
 
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
-				     uffdio_zeropage.range.len);
+				     uffdio_zeropage.range.len,
+				     &ctx->mmap_changing);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
 	ctx->features = 0;
 	ctx->state = UFFD_STATE_WAIT_API;
 	ctx->released = false;
+	ctx->mmap_changing = false;
 	ctx->mm = current->mm;
 	/* prevent the mm struct to be freed */
 	mmgrab(ctx->mm);
diff -puN include/linux/userfaultfd_k.h~userfaultfd-prevent-non-cooperative-events-vs-mcopy_atomic-races include/linux/userfaultfd_k.h
--- a/include/linux/userfaultfd_k.h~userfaultfd-prevent-non-cooperative-events-vs-mcopy_atomic-races
+++ a/include/linux/userfaultfd_k.h
@@ -31,10 +31,12 @@
 extern int handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
-			    unsigned long src_start, unsigned long len);
+			    unsigned long src_start, unsigned long len,
+			    bool *mmap_changing);
 extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
 			      unsigned long dst_start,
-			      unsigned long len);
+			      unsigned long len,
+			      bool *mmap_changing);
 
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
diff -puN mm/userfaultfd.c~userfaultfd-prevent-non-cooperative-events-vs-mcopy_atomic-races mm/userfaultfd.c
--- a/mm/userfaultfd.c~userfaultfd-prevent-non-cooperative-events-vs-mcopy_atomic-races
+++ a/mm/userfaultfd.c
@@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_a
 			      unsigned long dst_start,
 			      unsigned long src_start,
 			      unsigned long len,
-			      bool zeropage)
+			      bool zeropage,
+			      bool *mmap_changing)
 {
 	struct vm_area_struct *dst_vma;
 	ssize_t err;
@@ -431,6 +432,15 @@ retry:
 	down_read(&dst_mm->mmap_sem);
 
 	/*
+	 * If memory mappings are changing because of non-cooperative
+	 * operation (e.g. mremap) running in parallel, bail out and
+	 * request the user to retry later
+	 */
+	err = -EAGAIN;
+	if (mmap_changing && READ_ONCE(*mmap_changing))
+		goto out_unlock;
+
+	/*
 	 * Make sure the vma is not shared, that the dst range is
 	 * both valid and fully within a single existing vma.
 	 */
@@ -563,13 +573,15 @@ out:
 }
 
 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
-		     unsigned long src_start, unsigned long len)
+		     unsigned long src_start, unsigned long len,
+		     bool *mmap_changing)
 {
-	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
+	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
+			      mmap_changing);
 }
 
 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
-		       unsigned long len)
+		       unsigned long len, bool *mmap_changing)
 {
-	return __mcopy_atomic(dst_mm, start, 0, len, true);
+	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
 }
_

Patches currently in -mm which might be from rppt@xxxxxxxxxxxxxxxxxx are

mm-ksm-remove-unused-page_referenced_ksm-declaration.patch
mm-ksm-move-page_stable_node-from-ksmh-to-ksmc.patch
userfaultfd-prevent-non-cooperative-events-vs-mcopy_atomic-races.patch

--
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html