Hi, This is an initial attempt to implement COW with userfaultfd. It's not yet complete, but I'd like to get some early feedback to make sure I'm not talking complete nonsense. It would have been possible to extend UFFDIO_COPY with UFFDIO_COPY_MODE_COW, but I preferred to add the COW'ing of the pages as a new ioctl because otherwise I would need to extend the uffdio_copy structure to hold an additional parameter. -- Sincerely yours, Mike. From b97ef7f7d31918e4651c68493bc4b6ea86dd0038 Mon Sep 17 00:00:00 2001 From: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx> Date: Wed, 28 Mar 2018 11:33:35 +0300 Subject: [RFC PATCH] userfaultfd: add UFFDIO_TRY_COW If userfaultfd is used to manage the memory of a process tree, it is impossible to create copy-on-write mappings for the pages that can be COW-shared between two or more processes. When we restore a process tree using the pre-copy approach, it is possible to recreate the COW mappings. However, with lazy-memory restore that uses userfaultfd, all the pages that could have been COW-shared are copied to each address space with UFFDIO_COPY and a physical page is allocated for each process. This increases the memory usage of the restored processes relative to their memory usage before the checkpoint. The new UFFDIO_TRY_COW ioctl() allows creating COW mappings for the pages that were COW-shared within the process tree at dump time. It operates on three address spaces: the calling process, the address space managed with userfaultfd, and the address space of the process that contains potential COW pages (cow_mm). Like UFFDIO_COPY, it receives an address in the calling process that contains the page data, an address in the managed process where the data should be put, and the length of the range. For every page in the range, UFFDIO_TRY_COW checks if a page with the same contents as the source page exists in the cow_mm and, if so, it creates a COW mapping in the destination address space. Otherwise, the page contents are copied from the source. 
Signed-off-by: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx> --- fs/userfaultfd.c | 65 ++++++++++++++++++ include/linux/userfaultfd_k.h | 5 ++ include/uapi/linux/userfaultfd.h | 15 ++++- mm/userfaultfd.c | 140 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 219 insertions(+), 6 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cec550c8468f..29a505a1cf0f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1750,6 +1750,68 @@ static inline unsigned int uffd_ctx_features(__u64 user_features) return (unsigned int)user_features; } +static int userfaultfd_try_cow(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_try_cow uffdio_try_cow; + struct uffdio_try_cow __user *user_uffdio_try_cow; + struct userfaultfd_wake_range range; + struct task_struct *cow_src_task; + struct mm_struct *cow_src_mm; + + user_uffdio_try_cow = (struct uffdio_try_cow __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_try_cow, user_uffdio_try_cow, + /* don't copy "result" last field */ + sizeof(uffdio_try_cow)-sizeof(__s64))) + goto out; + + ret = validate_range(ctx->mm, uffdio_try_cow.dst_start, + uffdio_try_cow.len); + if (ret) + goto out; + + ret = -ESRCH; + cow_src_task = find_get_task_by_vpid(uffdio_try_cow.pid); + if (!cow_src_task) + goto out; + + cow_src_mm = get_task_mm(cow_src_task); + if (!cow_src_mm) { + put_task_struct(cow_src_task); + goto out; + } + + if (mmget_not_zero(ctx->mm)) { + ret = mfill_cow(ctx->mm, cow_src_mm, + uffdio_try_cow.dst_start, + uffdio_try_cow.src_start, + uffdio_try_cow.len); + mmput(ctx->mm); + mmput(cow_src_mm); + put_task_struct(cow_src_task); + } else { + mmput(cow_src_mm); + put_task_struct(cow_src_task); + return -ESRCH; + } + if (unlikely(put_user(ret, &user_uffdio_try_cow->result))) + return -EFAULT; + if (ret < 0) + goto out; + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + range.start = uffdio_try_cow.dst_start; + wake_userfault(ctx, &range); + + ret = range.len == 
uffdio_try_cow.len ? 0 : -EAGAIN; +out: + return ret; +} + /* * userland asks for a certain API version and we return which bits * and ioctl commands are implemented in this kernel for such API @@ -1819,6 +1881,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_ZEROPAGE: ret = userfaultfd_zeropage(ctx, arg); break; + case UFFDIO_TRY_COW: + ret = userfaultfd_try_cow(ctx, arg); + break; } return ret; } diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f2f3b68ba910..d8f7e3bd6921 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -35,6 +35,11 @@ extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len); +extern ssize_t mfill_cow(struct mm_struct *dst_mm, + struct mm_struct *cow_src_mm, + unsigned long dst_start, + unsigned long copy_src_start, + unsigned long len); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 48f1a7c2f1f0..627b24ed5422 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -34,7 +34,8 @@ #define UFFD_API_RANGE_IOCTLS \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ - (__u64)1 << _UFFDIO_ZEROPAGE) + (__u64)1 << _UFFDIO_ZEROPAGE | \ + (__u64)1 << _UFFDIO_TRY_COW) #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY) @@ -52,6 +53,7 @@ #define _UFFDIO_WAKE (0x02) #define _UFFDIO_COPY (0x03) #define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_TRY_COW (0x05) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ @@ -68,6 +70,8 @@ struct uffdio_copy) #define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ struct uffdio_zeropage) +#define UFFDIO_TRY_COW _IOWR(UFFDIO, _UFFDIO_TRY_COW, \ + struct uffdio_try_cow) /* read() structure */ 
struct uffd_msg { @@ -231,4 +235,13 @@ struct uffdio_zeropage { __s64 zeropage; }; +struct uffdio_try_cow { + __u32 pid; + __u64 dst_start; + __u64 src_start; + __u64 len; + __u64 mode; + __s64 result; +}; + #endif /* _LINUX_USERFAULTFD_H */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 39791b81ede7..7597b6e40963 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -124,6 +124,108 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm, return ret; } +static int mfill_cow_pte(struct mm_struct *dst_mm, + struct mm_struct *cow_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + struct vm_area_struct *cow_vma; + struct page *src_page, *cow_page; + pmd_t *cow_pmd; + pte_t *cow_pte, *dst_pte, _cow_pte, _dst_pte; + spinlock_t *cow_ptl, *dst_ptl; + void *src_page_kaddr, *cow_page_kaddr; + int err = -EINVAL; + int ret; + + down_read_nested(&cow_mm->mmap_sem, SINGLE_DEPTH_NESTING); + + /* We are trying to COW iff the page is mapped at the same address */ + cow_vma = find_vma(cow_mm, dst_addr); + if (dst_addr < dst_vma->vm_start || + dst_addr + PAGE_SIZE > dst_vma->vm_end) + goto unlock_cow_mm; + +#if 0 + /* FIXME: validate VMAs compatibility */ + err = validate_vmas(cow_vma, dst_vma); + if (err) + goto unlock_cow_mm; +#endif + ret = __get_user_pages_fast(src_addr, 1, 0, &src_page); + if (ret != 1) + /* FIXME: maybe fallback to full blown gup */ + goto unlock_cow_mm; + + cow_pmd = mm_find_pmd(cow_mm, dst_addr); + if (!cow_pmd || !pmd_present(*cow_pmd) || pmd_trans_huge(*cow_pmd)) + goto put_src_page; + + cow_pte = pte_offset_map_lock(cow_mm, cow_pmd, dst_addr, &cow_ptl); + if (!pte_present(*cow_pte)) + goto unlock_cow_pte; + + cow_page = vm_normal_page(cow_vma, dst_addr, *cow_pte); + if (!cow_page || !PageAnon(cow_page)) + goto unlock_cow_pte; + + get_page(cow_page); + + _cow_pte = *cow_pte; + if (pte_write(_cow_pte)) + ptep_set_wrprotect(cow_mm, dst_addr, cow_pte); + + 
src_page_kaddr = kmap_atomic(src_page); + cow_page_kaddr = kmap_atomic(cow_page); + + ret = memcmp(src_page_kaddr, cow_page_kaddr, PAGE_SIZE); + + kunmap_atomic(cow_page_kaddr); + kunmap_atomic(src_page_kaddr); + + if (ret) + goto unlock_put_cow_page; + + dst_pte = pte_offset_map(dst_pmd, dst_addr); + dst_ptl = pte_lockptr(dst_mm, dst_pmd); + spin_lock_nested(dst_ptl, SINGLE_DEPTH_NESTING); + if (!pte_none(*dst_pte)) + goto unlock_put_cow_page; + + page_dup_rmap(cow_page, false); + + _dst_pte = *cow_pte; + _dst_pte = pte_mkold(_dst_pte); + _dst_pte = pte_wrprotect(_dst_pte); + + ptep_set_wrprotect(cow_mm, dst_addr, cow_pte); + + inc_mm_counter(dst_mm, MM_ANONPAGES); + + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + err = 0; + + pte_unmap_unlock(dst_pte, dst_ptl); +unlock_put_cow_page: + if (pte_write(_cow_pte)) + set_pte_at(cow_mm, dst_addr, cow_pte, _cow_pte); + if (err) + put_page(cow_page); +unlock_cow_pte: + pte_unmap_unlock(cow_pte, cow_ptl); +put_src_page: + put_page(src_page); +unlock_cow_mm: + up_read(&cow_mm->mmap_sem); + + return err; +} + + static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; @@ -401,6 +503,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, } static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, + struct mm_struct *_cow_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, @@ -412,6 +515,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long src_addr, dst_addr; long copied; struct page *page; + struct mm_struct *cow_mm = NULL; /* * Sanitize the command parameters: @@ -423,6 +527,10 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, BUG_ON(src_start + len <= src_start); BUG_ON(dst_start + len <= dst_start); + /* don't try to COW if cow_mm is the same as dst_mm */ + if (_cow_mm != dst_mm) + cow_mm = _cow_mm; + src_addr = src_start; dst_addr = dst_start; copied = 0; @@ -466,13 
+574,18 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, /* * If this is a HUGETLB vma, pass off to appropriate routine */ - if (is_vm_hugetlb_page(dst_vma)) + if (is_vm_hugetlb_page(dst_vma)) { + WARN_ON(cow_mm); return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, src_start, len, zeropage); + } if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; + if (!vma_is_anonymous(dst_vma) && cow_mm) + goto out_unlock; + /* * Ensure the dst_vma has a anon_vma or this page * would get a NULL anon_vma when moved in the @@ -516,8 +629,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd)); - err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - src_addr, &page, zeropage); + err = -EINVAL; + if (cow_mm) + err = mfill_cow_pte(dst_mm, cow_mm, dst_pmd, + dst_vma, dst_addr, src_addr, + &page); + if (err) + err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, &page, + zeropage); cond_resched(); if (unlikely(err == -EFAULT)) { @@ -565,11 +685,21 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len) { - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); + return __mcopy_atomic(dst_mm, NULL, dst_start, src_start, len, false); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, unsigned long len) { - return __mcopy_atomic(dst_mm, start, 0, len, true); + return __mcopy_atomic(dst_mm, NULL, start, 0, len, true); +} + +ssize_t mfill_cow(struct mm_struct *dst_mm, + struct mm_struct *cow_src_mm, + unsigned long dst_start, + unsigned long copy_src_start, + unsigned long len) +{ + return __mcopy_atomic(dst_mm, cow_src_mm, dst_start, copy_src_start, + len, false); } -- 2.7.4 -- Sincerely yours, Mike.