On Tue, Feb 12, 2019 at 10:56:27AM +0800, Peter Xu wrote: > From: Andrea Arcangeli <aarcange@xxxxxxxxxx> > > v1: From: Shaohua Li <shli@xxxxxx> > > v2: cleanups, remove a branch. > > [peterx writes up the commit message, as below...] > > This patch introduces the new uffd-wp APIs for userspace. > > Firstly, we'll allow to do UFFDIO_REGISTER with write protection > tracking using the new UFFDIO_REGISTER_MODE_WP flag. Note that this > flag can co-exist with the existing UFFDIO_REGISTER_MODE_MISSING, in > which case the userspace program can not only resolve missing page > faults, and at the same time tracking page data changes along the way. > > Secondly, we introduced the new UFFDIO_WRITEPROTECT API to do page > level write protection tracking. Note that we will need to register > the memory region with UFFDIO_REGISTER_MODE_WP before that. > > Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx> > [peterx: remove useless block, write commit message, check against > VM_MAYWRITE rather than VM_WRITE when register] > Signed-off-by: Peter Xu <peterx@xxxxxxxxxx> > --- > fs/userfaultfd.c | 82 +++++++++++++++++++++++++------- > include/uapi/linux/userfaultfd.h | 11 +++++ > 2 files changed, 77 insertions(+), 16 deletions(-) > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > index 3092885c9d2c..81962d62520c 100644 > --- a/fs/userfaultfd.c > +++ b/fs/userfaultfd.c > @@ -304,8 +304,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, > if (!pmd_present(_pmd)) > goto out; > > - if (pmd_trans_huge(_pmd)) > + if (pmd_trans_huge(_pmd)) { > + if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) > + ret = true; > goto out; > + } > > /* > * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it > @@ -318,6 +321,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, > */ > if (pte_none(*pte)) > ret = true; > + if (!pte_write(*pte) && (reason & VM_UFFD_WP)) > + ret = true; > pte_unmap(pte); > > out: > @@ -1251,10 +1256,13 @@ static __always_inline int validate_range(struct mm_struct *mm, > return 0; > } > > -static inline bool vma_can_userfault(struct vm_area_struct *vma) > +static inline bool vma_can_userfault(struct vm_area_struct *vma, > + unsigned long vm_flags) > { > - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || > - vma_is_shmem(vma); > + /* FIXME: add WP support to hugetlbfs and shmem */ > + return vma_is_anonymous(vma) || > + ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) && > + !(vm_flags & VM_UFFD_WP)); > } > > static int userfaultfd_register(struct userfaultfd_ctx *ctx, > @@ -1286,15 +1294,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, > vm_flags = 0; > if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) > vm_flags |= VM_UFFD_MISSING; > - if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { > + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) > vm_flags |= VM_UFFD_WP; > - /* > - * FIXME: remove the below error constraint by > - * implementing the wprotect tracking mode. > - */ > - ret = -EINVAL; > - goto out; > - } > > ret = validate_range(mm, uffdio_register.range.start, > uffdio_register.range.len); > @@ -1342,7 +1343,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, > > /* check not compatible vmas */ > ret = -EINVAL; > - if (!vma_can_userfault(cur)) > + if (!vma_can_userfault(cur, vm_flags)) > goto out_unlock; > > /* > @@ -1370,6 +1371,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, > if (end & (vma_hpagesize - 1)) > goto out_unlock; > } > + if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) > + goto out_unlock; > > /* > * Check that this vma isn't already owned by a > @@ -1399,7 +1402,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, > do { > cond_resched(); > > - BUG_ON(!vma_can_userfault(vma)); > + BUG_ON(!vma_can_userfault(vma, vm_flags)); > BUG_ON(vma->vm_userfaultfd_ctx.ctx && > vma->vm_userfaultfd_ctx.ctx != ctx); > WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); > @@ -1534,7 +1537,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, > * provides for more strict behavior to notice > * unregistration errors. > */ > - if (!vma_can_userfault(cur)) > + if (!vma_can_userfault(cur, cur->vm_flags)) > goto out_unlock; > > found = true; > @@ -1548,7 +1551,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, > do { > cond_resched(); > > - BUG_ON(!vma_can_userfault(vma)); > + BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); > > /* > * Nothing to do: this vma is already registered into this > @@ -1761,6 +1764,50 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, > return ret; > } > > +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, > + unsigned long arg) > +{ > + int ret; > + struct uffdio_writeprotect uffdio_wp; > + struct uffdio_writeprotect __user *user_uffdio_wp; > + struct userfaultfd_wake_range range; > + > + if (READ_ONCE(ctx->mmap_changing)) > + return -EAGAIN; > + > + user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; > + > + if (copy_from_user(&uffdio_wp, user_uffdio_wp, > + sizeof(struct uffdio_writeprotect))) > + return -EFAULT; > + > + ret = validate_range(ctx->mm, uffdio_wp.range.start, > + uffdio_wp.range.len); > + if (ret) > + return ret; > + > + if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | > + UFFDIO_WRITEPROTECT_MODE_WP)) > + return -EINVAL; > + if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) && > + (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) > + return -EINVAL; Why _DONTWAKE cannot be used when setting write-protection? I can imagine a use-case when you'd want to freeze an application, write-protect several regions and then let the application continue. > + > + ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, > + uffdio_wp.range.len, uffdio_wp.mode & > + UFFDIO_WRITEPROTECT_MODE_WP, > + &ctx->mmap_changing); > + if (ret) > + return ret; > + > + if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) { > + range.start = uffdio_wp.range.start; > + range.len = uffdio_wp.range.len; > + wake_userfault(ctx, &range); > + } > + return ret; > +} > + > static inline unsigned int uffd_ctx_features(__u64 user_features) > { > /* > @@ -1838,6 +1885,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, > case UFFDIO_ZEROPAGE: > ret = userfaultfd_zeropage(ctx, arg); > break; > + case UFFDIO_WRITEPROTECT: > + ret = userfaultfd_writeprotect(ctx, arg); > + break; > } > return ret; > } > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h > index 297cb044c03f..1b977a7a4435 100644 > --- a/include/uapi/linux/userfaultfd.h > +++ b/include/uapi/linux/userfaultfd.h > @@ -52,6 +52,7 @@ > #define _UFFDIO_WAKE (0x02) > #define _UFFDIO_COPY (0x03) > #define _UFFDIO_ZEROPAGE (0x04) > +#define _UFFDIO_WRITEPROTECT (0x06) > #define _UFFDIO_API (0x3F) > > /* userfaultfd ioctl ids */ > @@ -68,6 +69,8 @@ > struct uffdio_copy) > #define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ > struct uffdio_zeropage) > +#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ > + struct uffdio_writeprotect) > > /* read() structure */ > struct uffd_msg { > @@ -232,4 +235,12 @@ struct uffdio_zeropage { > __s64 zeropage; > }; > > +struct uffdio_writeprotect { > + struct uffdio_range range; > + /* !WP means undo writeprotect. DONTWAKE is valid only with !WP */ > +#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0) > +#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1) > + __u64 mode; > +}; > + > #endif /* _LINUX_USERFAULTFD_H */ > -- > 2.17.1 > -- Sincerely yours, Mike.