On Sun, Jun 19, 2022 at 04:34:48PM -0700, Nadav Amit wrote: > From: Nadav Amit <namit@xxxxxxxxxx> > > When userfaultfd provides a zeropage in response to ioctl, it provides a > readonly alias to the zero page. If the page is later written (which is > the likely scenario), page-fault occurs and the page-fault allocator > allocates a page and rewires the page-tables. > > This is an expensive flow for cases in which a page is likely be written > to. Users can use the copy ioctl to initialize zero page (by copying > zeros), but this is also wasteful. > > Allow userfaultfd users to efficiently map initialized zero-pages that > are writable. Introduce UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY, which, when > provided would map a clear page instead of an alias to the zero page. > > For consistency, introduce also UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY. > > Suggested-by: David Hildenbrand <david@xxxxxxxxxx> > Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx> > Cc: Hugh Dickins <hughd@xxxxxxxxxx> > Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> > Cc: Axel Rasmussen <axelrasmussen@xxxxxxxxxx> > Cc: Peter Xu <peterx@xxxxxxxxxx> > Cc: Mike Rapoport <rppt@xxxxxxxxxxxxx> > Signed-off-by: Nadav Amit <namit@xxxxxxxxxx> > --- > fs/userfaultfd.c | 14 +++++++++++-- > include/uapi/linux/userfaultfd.h | 2 ++ > mm/userfaultfd.c | 36 ++++++++++++++++++++++++++++++++ > 3 files changed, 50 insertions(+), 2 deletions(-) > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > index a56983b594d5..ff073de78ea8 100644 > --- a/fs/userfaultfd.c > +++ b/fs/userfaultfd.c > @@ -1770,6 +1770,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, > struct uffdio_zeropage uffdio_zeropage; > struct uffdio_zeropage __user *user_uffdio_zeropage; > struct userfaultfd_wake_range range; > + bool mode_dontwake, mode_access_likely, mode_write_likely; > + uffd_flags_t uffd_flags; > > user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; > > @@ -1788,8 +1790,16 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, > if (ret) > goto out; > ret = -EINVAL; > - if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) > - goto out; > + > + mode_dontwake = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE; > + mode_access_likely = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY; > + mode_write_likely = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY; > + > + if (mode_dontwake) > + return -EINVAL; Hmm.. Why? Note that the above uffdio_zeropage.mode check was for invalid mode flags only, and I think that should be kept, but still I don't see why we want to fail UFFDIO_ZEROPAGE_MODE_DONTWAKE users. > + > + uffd_flags = (mode_access_likely ? UFFD_FLAGS_ACCESS_LIKELY : 0) | > + (mode_write_likely ? UFFD_FLAGS_WRITE_LIKELY : 0); > > if (mmget_not_zero(ctx->mm)) { > ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h > index 6ad93a13282e..b586b7c1e265 100644 > --- a/include/uapi/linux/userfaultfd.h > +++ b/include/uapi/linux/userfaultfd.h > @@ -286,6 +286,8 @@ struct uffdio_copy { > struct uffdio_zeropage { > struct uffdio_range range; > #define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) > +#define UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY ((__u64)1<<2) > +#define UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY ((__u64)1<<3) > __u64 mode; > > /* > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > index 3172158d8faa..5dfbb1e80369 100644 > --- a/mm/userfaultfd.c > +++ b/mm/userfaultfd.c > @@ -249,6 +249,38 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm, > return ret; > } > > +static int mfill_clearpage_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, > + struct vm_area_struct *dst_vma, > + unsigned long dst_addr, > + uffd_flags_t uffd_flags) > +{ > + struct page *page; > + int ret; > + > + ret = -ENOMEM; > + page = alloc_zeroed_user_highpage_movable(dst_vma, dst_addr); > + if (!page) > + goto out; > + > + /* The PTE is not marked as dirty unconditionally */ > + SetPageDirty(page); > + __SetPageUptodate(page); > + > + ret = -ENOMEM; Nit: can drop this since ret will always be -ENOMEM here.. Thanks, > + if (mem_cgroup_charge(page_folio(page), dst_vma->vm_mm, GFP_KERNEL)) > + goto out_release; > + > + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, > + page, true, uffd_flags); > + if (ret) > + goto out_release; > +out: > + return ret; > +out_release: > + put_page(page); > + goto out; > +} > + > /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ > static int mcontinue_atomic_pte(struct mm_struct *dst_mm, > pmd_t *dst_pmd, > @@ -511,6 +543,10 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, > err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, > dst_addr, src_addr, page, > uffd_flags); > + else if (!(uffd_flags & UFFD_FLAGS_WP) && > + (uffd_flags & UFFD_FLAGS_WRITE_LIKELY)) > + err = mfill_clearpage_pte(dst_mm, dst_pmd, dst_vma, > + dst_addr, uffd_flags); > else > err = mfill_zeropage_pte(dst_mm, dst_pmd, > dst_vma, dst_addr, uffd_flags); > -- > 2.25.1 > -- Peter Xu