From: Andrea Arcangeli <aarcange@xxxxxxxxxx> Subject: userfaultfd: wp: add UFFDIO_COPY_MODE_WP This allows UFFDIO_COPY to map pages write-protected. [peterx@xxxxxxxxxx: switch to VM_WARN_ON_ONCE in mfill_atomic_pte; add brackets around "dst_vma->vm_flags & VM_WRITE"; fix wordings in comments and commit messages] Link: http://lkml.kernel.org/r/20200220163112.11409-6-peterx@xxxxxxxxxx Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx> Signed-off-by: Peter Xu <peterx@xxxxxxxxxx> Reviewed-by: Jerome Glisse <jglisse@xxxxxxxxxx> Reviewed-by: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx> Cc: Bobby Powers <bobbypowers@xxxxxxxxx> Cc: Brian Geffon <bgeffon@xxxxxxxxxx> Cc: David Hildenbrand <david@xxxxxxxxxx> Cc: Denis Plotnikov <dplotnikov@xxxxxxxxxxxxx> Cc: "Dr . David Alan Gilbert" <dgilbert@xxxxxxxxxx> Cc: Hugh Dickins <hughd@xxxxxxxxxx> Cc: Johannes Weiner <hannes@xxxxxxxxxxx> Cc: "Kirill A . Shutemov" <kirill@xxxxxxxxxxxxx> Cc: Martin Cracauer <cracauer@xxxxxxxx> Cc: Marty McFadden <mcfadden8@xxxxxxxx> Cc: Maya Gokhale <gokhale2@xxxxxxxx> Cc: Mel Gorman <mgorman@xxxxxxx> Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx> Cc: Pavel Emelyanov <xemul@xxxxxxxxxx> Cc: Rik van Riel <riel@xxxxxxxxxx> Cc: Shaohua Li <shli@xxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- fs/userfaultfd.c | 5 ++-- include/linux/userfaultfd_k.h | 2 - include/uapi/linux/userfaultfd.h | 11 ++++---- mm/userfaultfd.c | 36 ++++++++++++++++++++--------- 4 files changed, 35 insertions(+), 19 deletions(-) --- a/fs/userfaultfd.c~userfaultfd-wp-add-uffdio_copy_mode_wp +++ a/fs/userfaultfd.c @@ -1724,11 +1724,12 @@ static int userfaultfd_copy(struct userf ret = -EINVAL; if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) goto out; - if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) + if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing); + uffdio_copy.len, &ctx->mmap_changing, + uffdio_copy.mode); mmput(ctx->mm); } else { return -ESRCH; --- a/include/linux/userfaultfd_k.h~userfaultfd-wp-add-uffdio_copy_mode_wp +++ a/include/linux/userfaultfd_k.h @@ -36,7 +36,7 @@ extern vm_fault_t handle_userfault(struc extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing); + bool *mmap_changing, __u64 mode); extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, --- a/include/uapi/linux/userfaultfd.h~userfaultfd-wp-add-uffdio_copy_mode_wp +++ a/include/uapi/linux/userfaultfd.h @@ -203,13 +203,14 @@ struct uffdio_copy { __u64 dst; __u64 src; __u64 len; +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) /* - * There will be a wrprotection flag later that allows to map - * pages wrprotected on the fly. And such a flag will be - * available if the wrprotection ioctl are implemented for the - * range according to the uffdio_register.ioctls. + * UFFDIO_COPY_MODE_WP will map the page write protected on + * the fly. UFFDIO_COPY_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. */ -#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_COPY_MODE_WP ((__u64)1<<1) __u64 mode; /* --- a/mm/userfaultfd.c~userfaultfd-wp-add-uffdio_copy_mode_wp +++ a/mm/userfaultfd.c @@ -53,7 +53,8 @@ static int mcopy_atomic_pte(struct mm_st struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - struct page **pagep) + struct page **pagep, + bool wp_copy) { struct mem_cgroup *memcg; pte_t _dst_pte, *dst_pte; @@ -99,9 +100,9 @@ static int mcopy_atomic_pte(struct mm_st if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) goto out_release; - _dst_pte = mk_pte(page, dst_vma->vm_page_prot); - if (dst_vma->vm_flags & VM_WRITE) - _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); + if ((dst_vma->vm_flags & VM_WRITE) && !wp_copy) + _dst_pte = pte_mkwrite(_dst_pte); dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); if (dst_vma->vm_file) { @@ -415,7 +416,8 @@ static __always_inline ssize_t mfill_ato unsigned long dst_addr, unsigned long src_addr, struct page **page, - bool zeropage) + bool zeropage, + bool wp_copy) { ssize_t err; @@ -432,11 +434,13 @@ static __always_inline ssize_t mfill_ato if (!(dst_vma->vm_flags & VM_SHARED)) { if (!zeropage) err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, page); + dst_addr, src_addr, page, + wp_copy); else err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr); } else { + VM_WARN_ON_ONCE(wp_copy); if (!zeropage) err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, @@ -454,7 +458,8 @@ static __always_inline ssize_t __mcopy_a unsigned long src_start, unsigned long len, bool zeropage, - bool *mmap_changing) + bool *mmap_changing, + __u64 mode) { struct vm_area_struct *dst_vma; ssize_t err; @@ -462,6 +467,7 @@ static __always_inline ssize_t __mcopy_a unsigned long src_addr, dst_addr; long copied; struct page *page; + bool wp_copy; /* * Sanitize the command parameters: @@ -508,6 +514,14 @@ retry: goto out_unlock; /* + * validate 'mode' now that we know the dst_vma: don't allow + * a wrprotect copy if the userfaultfd didn't register as WP. + */ + wp_copy = mode & UFFDIO_COPY_MODE_WP; + if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP)) + goto out_unlock; + + /* * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) @@ -562,7 +576,7 @@ retry: BUG_ON(pmd_trans_huge(*dst_pmd)); err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - src_addr, &page, zeropage); + src_addr, &page, zeropage, wp_copy); cond_resched(); if (unlikely(err == -ENOENT)) { @@ -609,14 +623,14 @@ out: ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing) + bool *mmap_changing, __u64 mode) { return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, - mmap_changing); + mmap_changing, mode); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); + return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0); } _