[RFC PATCH] userfaultfd: add UFFDIO_TRY_COW

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

This is an initial attempt to implement COW with userfaultfd.
It's not yet complete, but I'd like to get early feedback to check that I'm
not talking complete nonsense.

It would have been possible to extend UFFDIO_COPY with UFFDIO_COPY_MODE_COW,
but I preferred to add the COW'ing of pages as a new ioctl because otherwise
I would need to extend the uffdio_copy structure to hold an additional
parameter.

--
Sincerely yours,
Mike.

>From b97ef7f7d31918e4651c68493bc4b6ea86dd0038 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx>
Date: Wed, 28 Mar 2018 11:33:35 +0300
Subject: [RFC PATCH] userfaultfd: add UFFDIO_TRY_COW

If userfaultfd is used to manage memory of a process tree it is
impossible to create copy-on-write mappings for the pages that can be COW
shared between two or more processes.

When we restore a process tree using the pre-copy approach, it is possible to
recreate the COW mappings. However, with lazy-memory restore that uses
userfaultfd, all the pages that could have been COW shared are copied to
each address space with UFFDIO_COPY and a physical page is allocated for
each process. This increases memory usage of the restored processes
relative to their memory usage before the checkpoint.

The new UFFDIO_TRY_COW ioctl() allows creating COW mappings for the pages
that were COW-shared in the process tree at dump time. It operates on three
address spaces: the calling process, the address space managed with
userfaultfd, and the address space of the process that contains potential
COW pages (cow_mm). Like UFFDIO_COPY, it receives an address in the calling
process that contains the page data, an address in the managed process where
the data should be put, and the length of the range. For every page in the
range, UFFDIO_TRY_COW checks if a page with the same contents as the source
page exists in the cow_mm and, if yes, it creates a COW mapping in the
destination address space. Otherwise the page contents are copied from the
source.

Signed-off-by: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx>
---
 fs/userfaultfd.c                 |  65 ++++++++++++++++++
 include/linux/userfaultfd_k.h    |   5 ++
 include/uapi/linux/userfaultfd.h |  15 ++++-
 mm/userfaultfd.c                 | 140 +++++++++++++++++++++++++++++++++++++--
 4 files changed, 219 insertions(+), 6 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index cec550c8468f..29a505a1cf0f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1750,6 +1750,68 @@ static inline unsigned int uffd_ctx_features(__u64 user_features)
 	return (unsigned int)user_features;
 }
 
+/*
+ * UFFDIO_TRY_COW: fill a range of ctx->mm, sharing pages with the mm of
+ * uffdio_try_cow.pid where contents match and copying them otherwise.
+ */
+static int userfaultfd_try_cow(struct userfaultfd_ctx *ctx,
+			       unsigned long arg)
+{
+	__s64 ret = -EFAULT;
+	struct uffdio_try_cow uffdio_try_cow;
+	struct uffdio_try_cow __user *user_uffdio_try_cow =
+		(struct uffdio_try_cow __user *) arg;
+	struct userfaultfd_wake_range range;
+	struct task_struct *cow_src_task;
+	struct mm_struct *cow_src_mm;
+
+	if (copy_from_user(&uffdio_try_cow, user_uffdio_try_cow,
+			   /* don't copy "result" last field */
+			   sizeof(uffdio_try_cow)-sizeof(__s64)))
+		goto out;
+
+	ret = validate_range(ctx->mm, uffdio_try_cow.dst_start,
+			     uffdio_try_cow.len);
+	if (ret)
+		goto out;
+
+	/* no mode flags yet; src wraparound would trip BUG_ON() in the fill */
+	ret = -EINVAL;
+	if (uffdio_try_cow.mode || uffdio_try_cow.src_start +
+	    uffdio_try_cow.len <= uffdio_try_cow.src_start)
+		goto out;
+
+	/* NOTE(review): no permission check on the target pid - confirm */
+	ret = -ESRCH;
+	cow_src_task = find_get_task_by_vpid(uffdio_try_cow.pid);
+	if (!cow_src_task)
+		goto out;
+	cow_src_mm = get_task_mm(cow_src_task);
+	put_task_struct(cow_src_task);
+	if (!cow_src_mm)
+		goto out;
+	if (!mmget_not_zero(ctx->mm)) {
+		mmput(cow_src_mm);
+		goto out;
+	}
+	ret = mfill_cow(ctx->mm, cow_src_mm, uffdio_try_cow.dst_start,
+			uffdio_try_cow.src_start, uffdio_try_cow.len);
+	mmput(ctx->mm);
+	mmput(cow_src_mm);
+	if (unlikely(put_user(ret, &user_uffdio_try_cow->result)))
+		return -EFAULT;
+	if (ret < 0)
+		goto out;
+	/* len == 0 would wake all */
+	BUG_ON(!ret);
+	range.len = ret;
+	range.start = uffdio_try_cow.dst_start;
+	wake_userfault(ctx, &range);
+	ret = range.len == uffdio_try_cow.len ? 0 : -EAGAIN;
+out:
+	return ret;
+}
+
 /*
  * userland asks for a certain API version and we return which bits
  * and ioctl commands are implemented in this kernel for such API
@@ -1819,6 +1881,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 	case UFFDIO_ZEROPAGE:
 		ret = userfaultfd_zeropage(ctx, arg);
 		break;
+	case UFFDIO_TRY_COW:
+		ret = userfaultfd_try_cow(ctx, arg);
+		break;
 	}
 	return ret;
 }
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index f2f3b68ba910..d8f7e3bd6921 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -35,6 +35,11 @@ extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
 extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
 			      unsigned long dst_start,
 			      unsigned long len);
+extern ssize_t mfill_cow(struct mm_struct *dst_mm,
+			 struct mm_struct *cow_src_mm,
+			 unsigned long dst_start,
+			 unsigned long copy_src_start,
+			 unsigned long len);
 
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 48f1a7c2f1f0..627b24ed5422 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -34,7 +34,8 @@
 #define UFFD_API_RANGE_IOCTLS			\
 	((__u64)1 << _UFFDIO_WAKE |		\
 	 (__u64)1 << _UFFDIO_COPY |		\
-	 (__u64)1 << _UFFDIO_ZEROPAGE)
+	 (__u64)1 << _UFFDIO_ZEROPAGE |		\
+	 (__u64)1 << _UFFDIO_TRY_COW)
 #define UFFD_API_RANGE_IOCTLS_BASIC		\
 	((__u64)1 << _UFFDIO_WAKE |		\
 	 (__u64)1 << _UFFDIO_COPY)
@@ -52,6 +53,7 @@
 #define _UFFDIO_WAKE			(0x02)
 #define _UFFDIO_COPY			(0x03)
 #define _UFFDIO_ZEROPAGE		(0x04)
+#define _UFFDIO_TRY_COW			(0x05)
 #define _UFFDIO_API			(0x3F)
 
 /* userfaultfd ioctl ids */
@@ -68,6 +70,8 @@
 				      struct uffdio_copy)
 #define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
 				      struct uffdio_zeropage)
+#define UFFDIO_TRY_COW		_IOWR(UFFDIO, _UFFDIO_TRY_COW,	\
+				      struct uffdio_try_cow)
 
 /* read() structure */
 struct uffd_msg {
@@ -231,4 +235,13 @@ struct uffdio_zeropage {
 	__s64 zeropage;
 };
 
+struct uffdio_try_cow {
+	__u32 pid;		/* COW source task; NOTE(review): padding follows */
+	__u64 dst_start;	/* start of the range in the managed mm */
+	__u64 src_start;	/* page data in the calling process */
+	__u64 len;		/* length of the range */
+	__u64 mode;		/* no mode flags are defined by this patch */
+	__s64 result;		/* out: value returned by mfill_cow() */
+};
+
 #endif /* _LINUX_USERFAULTFD_H */
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 39791b81ede7..7597b6e40963 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -124,6 +124,108 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
 	return ret;
 }
 
+/*
+ * Try to map the page at dst_addr in cow_mm into dst_mm write-protected,
+ * provided its contents match the page at src_addr of the caller.
+ * Returns 0 on success, -EINVAL otherwise. dst_vma and pagep are unused.
+ */
+static int mfill_cow_pte(struct mm_struct *dst_mm,
+			 struct mm_struct *cow_mm,
+			 pmd_t *dst_pmd,
+			 struct vm_area_struct *dst_vma,
+			 unsigned long dst_addr,
+			 unsigned long src_addr,
+			 struct page **pagep)
+{
+	struct vm_area_struct *cow_vma;
+	struct page *src_page, *cow_page;
+	pmd_t *cow_pmd;
+	pte_t *cow_pte, *dst_pte, _cow_pte, _dst_pte;
+	spinlock_t *cow_ptl, *dst_ptl;
+	void *src_page_kaddr, *cow_page_kaddr;
+	int err = -EINVAL;
+	int ret;
+
+	down_read_nested(&cow_mm->mmap_sem, SINGLE_DEPTH_NESTING);
+
+	/* We are trying to COW iff the page is mapped at the same address */
+	cow_vma = find_vma(cow_mm, dst_addr);
+	if (!cow_vma || dst_addr < cow_vma->vm_start ||
+	    dst_addr + PAGE_SIZE > cow_vma->vm_end)
+		goto unlock_cow_mm;
+
+	/* FIXME: validate VMAs compatibility */
+
+	ret = __get_user_pages_fast(src_addr, 1, 0, &src_page);
+	if (ret != 1)
+		/* FIXME: maybe fallback to full blown gup */
+		goto unlock_cow_mm;
+
+	cow_pmd = mm_find_pmd(cow_mm, dst_addr);
+	if (!cow_pmd || !pmd_present(*cow_pmd) || pmd_trans_huge(*cow_pmd))
+		goto put_src_page;
+
+	cow_pte = pte_offset_map_lock(cow_mm, cow_pmd, dst_addr, &cow_ptl);
+	if (!pte_present(*cow_pte))
+		goto unlock_cow_pte;
+
+	cow_page = vm_normal_page(cow_vma, dst_addr, *cow_pte);
+	if (!cow_page || !PageAnon(cow_page))
+		goto unlock_cow_pte;
+
+	get_page(cow_page);
+
+	/*
+	 * Write-protect the candidate before comparing.
+	 * NOTE(review): no TLB flush follows, so a stale writable TLB entry
+	 * may still allow writes from cow_mm - confirm whether one is needed.
+	 */
+	_cow_pte = *cow_pte;
+	if (pte_write(_cow_pte))
+		ptep_set_wrprotect(cow_mm, dst_addr, cow_pte);
+
+	src_page_kaddr = kmap_atomic(src_page);
+	cow_page_kaddr = kmap_atomic(cow_page);
+
+	ret = memcmp(src_page_kaddr, cow_page_kaddr, PAGE_SIZE);
+
+	kunmap_atomic(cow_page_kaddr);
+	kunmap_atomic(src_page_kaddr);
+
+	if (ret)
+		goto unlock_put_cow_page;
+
+	dst_pte = pte_offset_map(dst_pmd, dst_addr);
+	dst_ptl = pte_lockptr(dst_mm, dst_pmd);
+	spin_lock_nested(dst_ptl, SINGLE_DEPTH_NESTING);
+	if (pte_none(*dst_pte)) {
+		/* the reference taken above is kept for the new mapping */
+		page_dup_rmap(cow_page, false);
+		_dst_pte = pte_wrprotect(pte_mkold(*cow_pte));
+		inc_mm_counter(dst_mm, MM_ANONPAGES);
+		set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+		err = 0;
+	}
+	/* drop the destination lock whether or not the PTE was installed */
+	pte_unmap_unlock(dst_pte, dst_ptl);
+unlock_put_cow_page:
+	if (err) {
+		/* nothing was shared: undo the temporary write protection */
+		if (pte_write(_cow_pte))
+			set_pte_at(cow_mm, dst_addr, cow_pte, _cow_pte);
+		put_page(cow_page);
+	}
+unlock_cow_pte:
+	pte_unmap_unlock(cow_pte, cow_ptl);
+put_src_page:
+	put_page(src_page);
+unlock_cow_mm:
+	up_read(&cow_mm->mmap_sem);
+
+	return err;
+}
+
+
 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
 {
 	pgd_t *pgd;
@@ -401,6 +503,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 }
 
 static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
+					      struct mm_struct *_cow_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
@@ -412,6 +515,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	unsigned long src_addr, dst_addr;
 	long copied;
 	struct page *page;
+	struct mm_struct *cow_mm = NULL;
 
 	/*
 	 * Sanitize the command parameters:
@@ -423,6 +527,10 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	BUG_ON(src_start + len <= src_start);
 	BUG_ON(dst_start + len <= dst_start);
 
+	/* don't try to COW if cow_mm is the same as dst_mm */
+	if (_cow_mm != dst_mm)
+		cow_mm = _cow_mm;
+
 	src_addr = src_start;
 	dst_addr = dst_start;
 	copied = 0;
@@ -466,13 +574,18 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	/*
 	 * If this is a HUGETLB vma, pass off to appropriate routine
 	 */
-	if (is_vm_hugetlb_page(dst_vma))
+	if (is_vm_hugetlb_page(dst_vma)) {
+		WARN_ON(cow_mm);
 		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
 						src_start, len, zeropage);
+	}
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
 
+	if (!vma_is_anonymous(dst_vma) && cow_mm)
+		goto out_unlock;
+
 	/*
 	 * Ensure the dst_vma has a anon_vma or this page
 	 * would get a NULL anon_vma when moved in the
@@ -516,8 +629,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 		BUG_ON(pmd_none(*dst_pmd));
 		BUG_ON(pmd_trans_huge(*dst_pmd));
 
-		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-				       src_addr, &page, zeropage);
+		err = -EINVAL;
+		if (cow_mm)
+			err = mfill_cow_pte(dst_mm, cow_mm, dst_pmd,
+					    dst_vma, dst_addr, src_addr,
+					    &page);
+		if (err)
+			err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+					       dst_addr, src_addr, &page,
+					       zeropage);
 		cond_resched();
 
 		if (unlikely(err == -EFAULT)) {
@@ -565,11 +685,21 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
 		     unsigned long src_start, unsigned long len)
 {
-	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
+	return __mcopy_atomic(dst_mm, NULL, dst_start, src_start, len, false);
 }
 
 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
 		       unsigned long len)
 {
-	return __mcopy_atomic(dst_mm, start, 0, len, true);
+	return __mcopy_atomic(dst_mm, NULL, start, 0, len, true);
+}
+
+/* Like mcopy_atomic(), but try to COW-share pages with cow_src_mm first. */
+ssize_t mfill_cow(struct mm_struct *dst_mm, struct mm_struct *cow_src_mm,
+		  unsigned long dst_start, unsigned long copy_src_start,
+		  unsigned long len)
+{
+	/* zeropage == false: page contents come from copy_src_start */
+	return __mcopy_atomic(dst_mm, cow_src_mm, dst_start, copy_src_start,
+			      len, false);
 }
-- 
2.7.4


-- 
Sincerely yours,
Mike.

--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux