On Tue, Apr 6, 2021 at 4:49 PM Peter Xu <peterx@xxxxxxxxxx> wrote:
>
> On Mon, Apr 05, 2021 at 10:19:17AM -0700, Axel Rasmussen wrote:
> > Previously, the continue implementation in shmem_mcopy_atomic_pte was
> > incorrect for two main reasons:
> >
> > - It didn't correctly skip some sections of code which make sense for
> >   newly allocated pages, but absolutely don't make sense for
> >   pre-existing page cache pages.
> >
> > - Because shmem_mcopy_continue_pte is called only if VM_SHARED is
> >   detected in mm/userfaultfd.c, we were incorrectly not supporting the
> >   case where a tmpfs file had been mmap()-ed with MAP_PRIVATE.
> >
> > So, this patch does the following:
> >
> > In mm/userfaultfd.c, break the logic to install PTEs out of
> > mcopy_atomic_pte, into a separate helper function.
> >
> > In mfill_atomic_pte, for the MCOPY_ATOMIC_CONTINUE case, simply look
> > up the existing page in the page cache, and then use the PTE
> > installation helper to set up the mapping. This addresses the two issues
> > noted above.
> >
> > The previous code's bugs manifested clearly in the error handling path.
> > So, to detect this kind of issue in the future, modify the selftest to
> > exercise the error handling path as well.
> >
> > Note that this patch is based on linux-next/akpm; the "Fixes" line
> > refers to a SHA1 in that branch.
> >
> > Changes since v4:
> > - Added back the userfaultfd.c selftest changes from v3; this file was
> >   mistakenly reverted in v4.
> >
> > Changes since v3:
> > - Significantly refactored the patch. Continue handling now happens in
> >   mm/userfaultfd.c, via a PTE installation helper. Most of the
> >   mm/shmem.c changes from the patch being fixed [1] are reverted.
> >
> > Changes since v2:
> > - Drop the ClearPageDirty() entirely, instead of trying to remember the
> >   old value.
> > - Modify both pgoff / max_off checks to use pgoff. It's equivalent to
> >   offset, but offset wasn't initialized until the first check (which
> >   we're skipping).
> > - Keep the second pgoff / max_off check in the continue case.
> >
> > Changes since v1:
> > - Refactor to skip ahead with goto, instead of adding several more
> >   "if (!is_continue)".
> > - Fix unconditional ClearPageDirty().
> > - Don't pte_mkwrite() when is_continue && !VM_SHARED.
> >
> > [1] https://lore.kernel.org/patchwork/patch/1392464/
> >
> > Fixes: 00da60b9d0a0 ("userfaultfd: support minor fault handling for shmem")
> > Signed-off-by: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
> > ---
> >  mm/shmem.c                               |  56 +++----
> >  mm/userfaultfd.c                         | 183 ++++++++++++++++-------
> >  tools/testing/selftests/vm/userfaultfd.c |  12 ++
> >  3 files changed, 168 insertions(+), 83 deletions(-)
> >
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 5cfd2fb6e52b..9d9a9f254f33 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -2366,7 +2366,6 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> >                             unsigned long dst_addr, unsigned long src_addr,
> >                             enum mcopy_atomic_mode mode, struct page **pagep)
> >  {
> > -        bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
> >          struct inode *inode = file_inode(dst_vma->vm_file);
> >          struct shmem_inode_info *info = SHMEM_I(inode);
> >          struct address_space *mapping = inode->i_mapping;
> > @@ -2377,18 +2376,17 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> >          struct page *page;
> >          pte_t _dst_pte, *dst_pte;
> >          int ret;
> > -        pgoff_t offset, max_off;
> > +        pgoff_t max_off;
> > +
> > +        /* Handled by mcontinue_atomic_pte instead. */
> > +        if (WARN_ON_ONCE(mode == MCOPY_ATOMIC_CONTINUE))
> > +                return -EINVAL;
>
> (It would be ideal if this patch could be squashed into the original one, since
> ideally shmem_mcopy_atomic_pte could be left mostly untouched to support
> uffd-minor and we could avoid a lot of code churn; it's just that we noticed it
> too late. Then this WARN_ON_ONCE would not be needed either, since previously
> it was a bool.)
>
> >
> >          ret = -ENOMEM;
> >          if (!shmem_inode_acct_block(inode, 1))
> >                  goto out;
> >
> > -        if (is_continue) {
> > -                ret = -EFAULT;
> > -                page = find_lock_page(mapping, pgoff);
> > -                if (!page)
> > -                        goto out_unacct_blocks;
> > -        } else if (!*pagep) {
> > +        if (!*pagep) {
> >                  page = shmem_alloc_page(gfp, info, pgoff);
> >                  if (!page)
> >                          goto out_unacct_blocks;
> > @@ -2415,27 +2413,21 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> >                  *pagep = NULL;
> >          }
> >
> > -        if (!is_continue) {
> > -                VM_BUG_ON(PageSwapBacked(page));
> > -                VM_BUG_ON(PageLocked(page));
> > -                __SetPageLocked(page);
> > -                __SetPageSwapBacked(page);
> > -                __SetPageUptodate(page);
> > -        }
> > +        VM_BUG_ON(PageSwapBacked(page));
> > +        VM_BUG_ON(PageLocked(page));
> > +        __SetPageLocked(page);
> > +        __SetPageSwapBacked(page);
> > +        __SetPageUptodate(page);
> >
> >          ret = -EFAULT;
> > -        offset = linear_page_index(dst_vma, dst_addr);
> >          max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> > -        if (unlikely(offset >= max_off))
> > +        if (unlikely(pgoff >= max_off))
> >                  goto out_release;
> >
> > -        /* If page wasn't already in the page cache, add it. */
> > -        if (!is_continue) {
> > -                ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
> > -                                              gfp & GFP_RECLAIM_MASK, dst_mm);
> > -                if (ret)
> > -                        goto out_release;
> > -        }
> > +        ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
> > +                                      gfp & GFP_RECLAIM_MASK, dst_mm);
> > +        if (ret)
> > +                goto out_release;
> >
> >          _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
> >          if (dst_vma->vm_flags & VM_WRITE)
> > @@ -2455,22 +2447,20 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> >
> >          ret = -EFAULT;
> >          max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> > -        if (unlikely(offset >= max_off))
> > +        if (unlikely(pgoff >= max_off))
> >                  goto out_release_unlock;
> >
> >          ret = -EEXIST;
> >          if (!pte_none(*dst_pte))
> >                  goto out_release_unlock;
> >
> > -        if (!is_continue) {
> > -                lru_cache_add(page);
> > +        lru_cache_add(page);
> >
> > -                spin_lock_irq(&info->lock);
> > -                info->alloced++;
> > -                inode->i_blocks += BLOCKS_PER_PAGE;
> > -                shmem_recalc_inode(inode);
> > -                spin_unlock_irq(&info->lock);
> > -        }
> > +        spin_lock_irq(&info->lock);
> > +        info->alloced++;
> > +        inode->i_blocks += BLOCKS_PER_PAGE;
> > +        shmem_recalc_inode(inode);
> > +        spin_unlock_irq(&info->lock);
> >
> >          inc_mm_counter(dst_mm, mm_counter_file(page));
> >          page_add_file_rmap(page, false);
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index cbb7c8d79a4d..286d0657fbe2 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -48,21 +48,103 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> >          return dst_vma;
> >  }
> >
> > +/*
> > + * Install PTEs, to map dst_addr (within dst_vma) to page.
> > + *
> > + * This function handles MCOPY_ATOMIC_CONTINUE (which is always file-backed),
> > + * whether or not dst_vma is VM_SHARED. It also handles the more general
> > + * MCOPY_ATOMIC_NORMAL case, when dst_vma is *not* VM_SHARED (it may be file
> > + * backed, or not).
> > + *
> > + * Note that MCOPY_ATOMIC_NORMAL for a VM_SHARED dst_vma is handled by
> > + * shmem_mcopy_atomic_pte instead.
> > + */
> > +static int mcopy_atomic_install_ptes(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> > +                                     struct vm_area_struct *dst_vma,
> > +                                     unsigned long dst_addr, struct page *page,
> > +                                     enum mcopy_atomic_mode mode, bool wp_copy)
> > +{
> > +        int ret;
> > +        pte_t _dst_pte, *dst_pte;
> > +        bool is_continue = mode == MCOPY_ATOMIC_CONTINUE;
>
> Nit: brackets?
>
> > +        int writable;
> > +        bool vm_shared = dst_vma->vm_flags & VM_SHARED;
> > +        bool is_file_backed = dst_vma->vm_file;
>
> Nit: maybe "!vma_is_anonymous(dst_vma)" is better?
>
> > +        spinlock_t *ptl;
> > +        struct inode *inode;
> > +        pgoff_t offset, max_off;
> > +
> > +        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
> > +        writable = dst_vma->vm_flags & VM_WRITE;
> > +        /* For CONTINUE on a non-shared VMA, don't pte_mkwrite for CoW. */
> > +        if (is_continue && !vm_shared)
>
> I'm curious whether we can drop "mode" in this function.
>
> For this one, can it be replaced with "!vma_is_anonymous() && !vm_shared"?
>
> > +                writable = 0;
> > +
> > +        if (writable) {
> > +                _dst_pte = pte_mkdirty(_dst_pte);
>
> This was unconditional previously, at least for anonymous memory; see [1]
> below. I think we need to keep that behavior.
>
> You probably saw that the shmem code sets the dirty bit conditionally; however,
> I'm thinking it may be easier to always set it (I'll do that in the uffd-wp
> shmem series anyway), and then we can even drop the set_page_dirty() below,
> AFAICT.
>
> > +                if (wp_copy)
> > +                        _dst_pte = pte_mkuffd_wp(_dst_pte);
> > +                else
> > +                        _dst_pte = pte_mkwrite(_dst_pte);
> > +        } else if (vm_shared) {
> > +                /*
> > +                 * Since we didn't pte_mkdirty(), mark the page dirty or it
> > +                 * could be freed from under us. We could do this
> > +                 * unconditionally, but doing it only if !writable is faster.
> > +                 */
> > +                set_page_dirty(page);
> > +        }
> > +
> > +        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
> > +
> > +        if (is_file_backed) {
> > +                /* The shmem MAP_PRIVATE case requires checking the i_size */
> > +                inode = dst_vma->vm_file->f_inode;
> > +                offset = linear_page_index(dst_vma, dst_addr);
> > +                max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> > +                ret = -EFAULT;
> > +                if (unlikely(offset >= max_off))
> > +                        goto out_unlock;
>
> Frankly, I don't really know why this must be done under the pgtable lock. If
> it's not required, it could be moved into the UFFDIO_COPY path, since CONTINUE
> doesn't need it IIUC. I'm just raising it as a pure question.

It's not clear to me either. shmem_getpage_gfp() does check this twice, kind
of like we're doing, but it doesn't ever touch the PTL. What it seems to be
worried about is: what happens if a concurrent FALLOC_FL_PUNCH_HOLE happens
somewhere in the middle of whatever manipulation we're doing? From looking at
shmem_fallocate(), I think the basic point is that truncation happens while
"inode_lock(inode)" is held, but neither shmem_mcopy_atomic_pte() nor the new
mcopy_atomic_install_ptes() takes that lock.

I'm a bit hesitant to just remove it, run some tests, and then declare
victory, because it seems plausible it's there to catch some hard-to-induce
race. I'm not sure how to prove it *isn't* needed, so my inclination is to
just keep it. I'll send a series addressing the feedback so far this
afternoon, and I'll leave this alone for now - at least, it doesn't seem to
hurt anything. Maybe Hugh or someone else has more advice about it. If so,
I'm happy to remove it in a follow-up.
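To make the invariant concrete, here is a rough userspace sketch (not the
kernel path; the memfd name and the sizes are arbitrary, chosen just for
illustration) of what the offset >= max_off check is guarding: once the file
has been shrunk, touching a MAP_PRIVATE mapping of it beyond the new EOF must
raise SIGBUS rather than have a page installed:

#define _GNU_SOURCE
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static sigjmp_buf env;

static void on_sigbus(int sig)
{
        siglongjmp(env, 1);
}

int main(void)
{
        long page = sysconf(_SC_PAGE_SIZE);
        int fd = memfd_create("sketch", 0);     /* memfd is shmem-backed */
        char *map;

        ftruncate(fd, 4 * page);
        map = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        signal(SIGBUS, on_sigbus);

        /* Shrink i_size: offsets past the new EOF are now out of bounds. */
        ftruncate(fd, 1 * page);

        if (sigsetjmp(env, 1) == 0) {
                map[2 * page] = 1;      /* touch a page beyond the truncated EOF */
                puts("no SIGBUS (unexpected)");
        } else {
                puts("got SIGBUS past the truncated EOF, as expected");
        }
        return 0;
}

The in-kernel check enforces the same invariant for PTEs installed via
userfaultfd, which is part of why I'd rather keep it until someone can argue
positively that it's unnecessary.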
>
> > +        }
> > +
> > +        ret = -EEXIST;
> > +        if (!pte_none(*dst_pte))
> > +                goto out_unlock;
> > +
> > +        inc_mm_counter(dst_mm, mm_counter(page));
> > +        if (is_file_backed)
> > +                page_add_file_rmap(page, false);
> > +        else
> > +                page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
> > +
> > +        if (!is_continue)
> > +                lru_cache_add_inactive_or_unevictable(page, dst_vma);
> > +
> > +        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> > +
> > +        /* No need to invalidate - it was non-present before */
> > +        update_mmu_cache(dst_vma, dst_addr, dst_pte);
> > +        pte_unmap_unlock(dst_pte, ptl);
> > +        ret = 0;
> > +out:
>
> This label doesn't seem to be needed any more; we can "return 0" here and
> "return ret" below, so as to drop the last reference.
>
> > +        return ret;
> > +out_unlock:
> > +        pte_unmap_unlock(dst_pte, ptl);
> > +        goto out;
> > +}
> > +
> >  static int mcopy_atomic_pte(struct mm_struct *dst_mm,
> >                              pmd_t *dst_pmd,
> >                              struct vm_area_struct *dst_vma,
> >                              unsigned long dst_addr,
> >                              unsigned long src_addr,
> >                              struct page **pagep,
> > +                            enum mcopy_atomic_mode mode,
>
> When we reach this function, mode must be MCOPY_ATOMIC_NORMAL, right? Then I
> think we can drop it and use MCOPY_ATOMIC_NORMAL directly below. Moreover, if
> you read the above and agree that "mode" can be dropped in
> mcopy_atomic_install_ptes(), it's even cleaner, since we drop all of the
> "mode" parameters.
>
> >                              bool wp_copy)
> >  {
> > -        pte_t _dst_pte, *dst_pte;
> > -        spinlock_t *ptl;
> >          void *page_kaddr;
> >          int ret;
> >          struct page *page;
> > -        pgoff_t offset, max_off;
> > -        struct inode *inode;
> >
> >          if (!*pagep) {
> >                  ret = -ENOMEM;
> > @@ -99,43 +181,12 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
> >          if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
> >                  goto out_release;
> >
> > -        _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
>
> [1]
>
> > -        if (dst_vma->vm_flags & VM_WRITE) {
> > -                if (wp_copy)
> > -                        _dst_pte = pte_mkuffd_wp(_dst_pte);
> > -                else
> > -                        _dst_pte = pte_mkwrite(_dst_pte);
> > -        }
> > -
> > -        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
> > -        if (dst_vma->vm_file) {
> > -                /* the shmem MAP_PRIVATE case requires checking the i_size */
> > -                inode = dst_vma->vm_file->f_inode;
> > -                offset = linear_page_index(dst_vma, dst_addr);
> > -                max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> > -                ret = -EFAULT;
> > -                if (unlikely(offset >= max_off))
> > -                        goto out_release_uncharge_unlock;
> > -        }
> > -        ret = -EEXIST;
> > -        if (!pte_none(*dst_pte))
> > -                goto out_release_uncharge_unlock;
> > -
> > -        inc_mm_counter(dst_mm, MM_ANONPAGES);
> > -        page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
> > -        lru_cache_add_inactive_or_unevictable(page, dst_vma);
> > -
> > -        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> > -
> > -        /* No need to invalidate - it was non-present before */
> > -        update_mmu_cache(dst_vma, dst_addr, dst_pte);
> > -
> > -        pte_unmap_unlock(dst_pte, ptl);
> > -        ret = 0;
> > +        ret = mcopy_atomic_install_ptes(dst_mm, dst_pmd, dst_vma, dst_addr,
> > +                                        page, mode, wp_copy);
> > +        if (ret)
> > +                goto out_release;
> >  out:
> >          return ret;
> > -out_release_uncharge_unlock:
> > -        pte_unmap_unlock(dst_pte, ptl);
> >  out_release:
> >          put_page(page);
> >          goto out;
> > @@ -176,6 +227,38 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
> >          return ret;
> >  }
> >
> > +static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
> > +                                pmd_t *dst_pmd,
> > +                                struct vm_area_struct *dst_vma,
> > +                                unsigned long dst_addr,
> > +                                bool wp_copy)
> > +{
> > +        struct inode *inode = file_inode(dst_vma->vm_file);
> > +        struct address_space *mapping = inode->i_mapping;
> > +        pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
> > +        struct page *page;
> > +        int ret;
> > +
> > +        ret = -EFAULT;
> > +        page = find_lock_page(mapping, pgoff);
> > +        if (!page)
> > +                goto out;
> > +
> > +        ret = mcopy_atomic_install_ptes(dst_mm, dst_pmd, dst_vma, dst_addr,
> > +                                        page, MCOPY_ATOMIC_CONTINUE, wp_copy);
> > +        if (ret)
> > +                goto out_release;
> > +
> > +        unlock_page(page);
> > +        ret = 0;
> > +out:
> > +        return ret;
> > +out_release:
> > +        unlock_page(page);
> > +        put_page(page);
> > +        goto out;
> > +}
> > +
> >  static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> >  {
> >          pgd_t *pgd;
> > @@ -418,7 +501,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> >                                                  enum mcopy_atomic_mode mode,
> >                                                  bool wp_copy)
> >  {
> > -        ssize_t err;
> > +        ssize_t err = 0;
> > +
> > +        if (mode == MCOPY_ATOMIC_CONTINUE) {
> > +                err = mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> > +                                           wp_copy);
> > +                goto out;
>
> Why not return directly? :)
>
> Thanks,
>
> > +        }
> >
> >          /*
> >           * The normal page fault path for a shmem will invoke the
> > @@ -431,26 +520,20 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> >           * and not in the radix tree.
> >           */
> >          if (!(dst_vma->vm_flags & VM_SHARED)) {
> > -                switch (mode) {
> > -                case MCOPY_ATOMIC_NORMAL:
> > +                if (mode == MCOPY_ATOMIC_NORMAL)
> >                          err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
> >                                                 dst_addr, src_addr, page,
> > -                                               wp_copy);
> > -                        break;
> > -                case MCOPY_ATOMIC_ZEROPAGE:
> > +                                               mode, wp_copy);
> > +                else if (mode == MCOPY_ATOMIC_ZEROPAGE)
> >                          err = mfill_zeropage_pte(dst_mm, dst_pmd,
> >                                                   dst_vma, dst_addr);
> > -                        break;
> > -                case MCOPY_ATOMIC_CONTINUE:
> > -                        err = -EINVAL;
> > -                        break;
> > -                }
> >          } else {
> >                  VM_WARN_ON_ONCE(wp_copy);
> >                  err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> >                                               src_addr, mode, page);
> >          }
> >
> > +out:
> >          return err;
> >  }
> >
> > diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
> > index f6c86b036d0f..d8541a59dae5 100644
> > --- a/tools/testing/selftests/vm/userfaultfd.c
> > +++ b/tools/testing/selftests/vm/userfaultfd.c
> > @@ -485,6 +485,7 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
> >  static void continue_range(int ufd, __u64 start, __u64 len)
> >  {
> >          struct uffdio_continue req;
> > +        int ret;
> >
> >          req.range.start = start;
> >          req.range.len = len;
> > @@ -493,6 +494,17 @@ static void continue_range(int ufd, __u64 start, __u64 len)
> >          if (ioctl(ufd, UFFDIO_CONTINUE, &req))
> >                  err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
> >                      (uint64_t)start);
> > +
> > +        /*
> > +         * Error handling within the kernel for continue is subtly different
> > +         * from copy or zeropage, so it may be a source of bugs. Trigger an
> > +         * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
> > +         */
> > +        req.mapped = 0;
> > +        ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
> > +        if (ret >= 0 || req.mapped != -EEXIST)
> > +                err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
> > +                    ret, req.mapped);
> >  }
> >
> >  static void *locking_thread(void *arg)
> > --
> > 2.31.0.208.g409f899ff0-goog
> >
>
> --
> Peter Xu
>
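A side note on the selftest hunk above: UFFDIO_CONTINUE reports its per-range
result in uffdio_continue.mapped, and -EEXIST there means the PTE was already
present (for example, because another thread resolved the fault first). Below
is a minimal sketch of how a minor-fault handler might treat that case; it
assumes a userfaultfd that has already been opened and registered in minor
mode over the range, and resolve_minor_fault is a made-up helper name, not
anything from this patch:

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* Resolve one minor fault over [addr, addr + len). */
static int resolve_minor_fault(int uffd, unsigned long addr, unsigned long len)
{
        struct uffdio_continue req = {
                .range = { .start = addr, .len = len },
                .mode = 0,
        };

        if (ioctl(uffd, UFFDIO_CONTINUE, &req) == 0)
                return 0;

        /* On failure, the kernel reports the per-range result in req.mapped. */
        if (req.mapped == -EEXIST)
                return 0;       /* PTE already installed; treat as resolved */

        fprintf(stderr, "UFFDIO_CONTINUE failed: mapped=%lld errno=%d\n",
                (long long)req.mapped, errno);
        return -1;
}

The selftest instead asserts that the second, deliberately conflicting
UFFDIO_CONTINUE really does fail with -EEXIST, which is what exercises the
kernel's error handling path.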