On Tue, Apr 6, 2021 at 4:49 PM Peter Xu <peterx@xxxxxxxxxx> wrote:
>
> On Mon, Apr 05, 2021 at 10:19:17AM -0700, Axel Rasmussen wrote:
> > Previously, the continue implementation in shmem_mcopy_atomic_pte was
> > incorrect for two main reasons:
> >
> > - It didn't correctly skip some sections of code which make sense for
> >   newly allocated pages, but absolutely don't make sense for
> >   pre-existing page cache pages.
> >
> > - Because shmem_mcopy_continue_pte is called only if VM_SHARED is
> >   detected in mm/userfaultfd.c, we were incorrectly not supporting the
> >   case where a tmpfs file had been mmap()-ed with MAP_PRIVATE.
> >
> > So, this patch does the following:
> >
> > In mm/userfaultfd.c, break the logic to install PTEs out of
> > mcopy_atomic_pte, into a separate helper function.
> >
> > In mfill_atomic_pte, for the MCOPY_ATOMIC_CONTINUE case, simply look
> > up the existing page in the page cache, and then use the PTE
> > installation helper to set up the mapping. This addresses the two issues
> > noted above.
> >
> > The previous code's bugs manifested clearly in the error handling path.
> > So, to detect this kind of issue in the future, modify the selftest to
> > exercise the error handling path as well.
> >
> > Note that this patch is based on linux-next/akpm; the "Fixes" line
> > refers to a SHA1 in that branch.
> >
> > Changes since v4:
> > - Added back the userfaultfd.c selftest changes from v3; this file was
> >   mistakenly reverted in v4.
> >
> > Changes since v3:
> > - Significantly refactored the patch. Continue handling now happens in
> >   mm/userfaultfd.c, via a PTE installation helper. Most of the
> >   mm/shmem.c changes from the patch being fixed [1] are reverted.
> >
> > Changes since v2:
> > - Drop the ClearPageDirty() entirely, instead of trying to remember the
> >   old value.
> > - Modify both pgoff / max_off checks to use pgoff. It's equivalent to
> >   offset, but offset wasn't initialized until the first check (which
> >   we're skipping).
> > - Keep the second pgoff / max_off check in the continue case.
> >
> > Changes since v1:
> > - Refactor to skip ahead with goto, instead of adding several more
> >   "if (!is_continue)".
> > - Fix unconditional ClearPageDirty().
> > - Don't pte_mkwrite() when is_continue && !VM_SHARED.
> >
> > [1] https://lore.kernel.org/patchwork/patch/1392464/
> >
> > Fixes: 00da60b9d0a0 ("userfaultfd: support minor fault handling for shmem")
> > Signed-off-by: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
> > ---
> >  mm/shmem.c                               |  56 +++----
> >  mm/userfaultfd.c                         | 183 ++++++++++++++++-------
> >  tools/testing/selftests/vm/userfaultfd.c |  12 ++
> >  3 files changed, 168 insertions(+), 83 deletions(-)
> >
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 5cfd2fb6e52b..9d9a9f254f33 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -2366,7 +2366,6 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> >                             unsigned long dst_addr, unsigned long src_addr,
> >                             enum mcopy_atomic_mode mode, struct page **pagep)
> >  {
> > -        bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
> >          struct inode *inode = file_inode(dst_vma->vm_file);
> >          struct shmem_inode_info *info = SHMEM_I(inode);
> >          struct address_space *mapping = inode->i_mapping;
> > @@ -2377,18 +2376,17 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> >          struct page *page;
> >          pte_t _dst_pte, *dst_pte;
> >          int ret;
> > -        pgoff_t offset, max_off;
> > +        pgoff_t max_off;
> > +
> > +        /* Handled by mcontinue_atomic_pte instead. */
> > +        if (WARN_ON_ONCE(mode == MCOPY_ATOMIC_CONTINUE))
> > +                return -EINVAL;
>
> (It would be ideal if this patch could be squashed into the original one, since
> ideally shmem_mcopy_atomic_pte could be left mostly untouched to support
> uffd-minor and we could avoid a lot of code churn; it's just that we noticed it
> too late. Then this WARN_ON_ONCE would not be needed either, since previously
> it was a bool.)
>
> >
> >          ret = -ENOMEM;
> >          if (!shmem_inode_acct_block(inode, 1))
> >                  goto out;
> >
> > -        if (is_continue) {
> > -                ret = -EFAULT;
> > -                page = find_lock_page(mapping, pgoff);
> > -                if (!page)
> > -                        goto out_unacct_blocks;
> > -        } else if (!*pagep) {
> > +        if (!*pagep) {
> >                  page = shmem_alloc_page(gfp, info, pgoff);
> >                  if (!page)
> >                          goto out_unacct_blocks;
> > @@ -2415,27 +2413,21 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> >                  *pagep = NULL;
> >          }
> >
> > -        if (!is_continue) {
> > -                VM_BUG_ON(PageSwapBacked(page));
> > -                VM_BUG_ON(PageLocked(page));
> > -                __SetPageLocked(page);
> > -                __SetPageSwapBacked(page);
> > -                __SetPageUptodate(page);
> > -        }
> > +        VM_BUG_ON(PageSwapBacked(page));
> > +        VM_BUG_ON(PageLocked(page));
> > +        __SetPageLocked(page);
> > +        __SetPageSwapBacked(page);
> > +        __SetPageUptodate(page);
> >
> >          ret = -EFAULT;
> > -        offset = linear_page_index(dst_vma, dst_addr);
> >          max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> > -        if (unlikely(offset >= max_off))
> > +        if (unlikely(pgoff >= max_off))
> >                  goto out_release;
> >
> > -        /* If page wasn't already in the page cache, add it. */
> > -        if (!is_continue) {
> > -                ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
> > -                                              gfp & GFP_RECLAIM_MASK, dst_mm);
> > -                if (ret)
> > -                        goto out_release;
> > -        }
> > +        ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
> > +                                      gfp & GFP_RECLAIM_MASK, dst_mm);
> > +        if (ret)
> > +                goto out_release;
> >
> >          _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
> >          if (dst_vma->vm_flags & VM_WRITE)
> > @@ -2455,22 +2447,20 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> >
> >          ret = -EFAULT;
> >          max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> > -        if (unlikely(offset >= max_off))
> > +        if (unlikely(pgoff >= max_off))
> >                  goto out_release_unlock;
> >
> >          ret = -EEXIST;
> >          if (!pte_none(*dst_pte))
> >                  goto out_release_unlock;
> >
> > -        if (!is_continue) {
> > -                lru_cache_add(page);
> > +        lru_cache_add(page);
> >
> > -                spin_lock_irq(&info->lock);
> > -                info->alloced++;
> > -                inode->i_blocks += BLOCKS_PER_PAGE;
> > -                shmem_recalc_inode(inode);
> > -                spin_unlock_irq(&info->lock);
> > -        }
> > +        spin_lock_irq(&info->lock);
> > +        info->alloced++;
> > +        inode->i_blocks += BLOCKS_PER_PAGE;
> > +        shmem_recalc_inode(inode);
> > +        spin_unlock_irq(&info->lock);
> >
> >          inc_mm_counter(dst_mm, mm_counter_file(page));
> >          page_add_file_rmap(page, false);
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index cbb7c8d79a4d..286d0657fbe2 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -48,21 +48,103 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> >          return dst_vma;
> >  }
> >
> > +/*
> > + * Install PTEs, to map dst_addr (within dst_vma) to page.
> > + *
> > + * This function handles MCOPY_ATOMIC_CONTINUE (which is always file-backed),
> > + * whether or not dst_vma is VM_SHARED. It also handles the more general
> > + * MCOPY_ATOMIC_NORMAL case, when dst_vma is *not* VM_SHARED (it may be file
> > + * backed, or not).
> > + *
> > + * Note that MCOPY_ATOMIC_NORMAL for a VM_SHARED dst_vma is handled by
> > + * shmem_mcopy_atomic_pte instead.
> > + */
> > +static int mcopy_atomic_install_ptes(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> > +                                     struct vm_area_struct *dst_vma,
> > +                                     unsigned long dst_addr, struct page *page,
> > +                                     enum mcopy_atomic_mode mode, bool wp_copy)
> > +{
> > +        int ret;
> > +        pte_t _dst_pte, *dst_pte;
> > +        bool is_continue = mode == MCOPY_ATOMIC_CONTINUE;
>
> Nit: brackets?
>
> > +        int writable;
> > +        bool vm_shared = dst_vma->vm_flags & VM_SHARED;
> > +        bool is_file_backed = dst_vma->vm_file;
>
> Nit: maybe "!vma_is_anonymous(dst_vma)" is better?
>
> > +        spinlock_t *ptl;
> > +        struct inode *inode;
> > +        pgoff_t offset, max_off;
> > +
> > +        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
> > +        writable = dst_vma->vm_flags & VM_WRITE;
> > +        /* For CONTINUE on a non-shared VMA, don't pte_mkwrite for CoW. */
> > +        if (is_continue && !vm_shared)
>
> I'm curious whether we can drop "mode" in this function.
>
> For this one, can it be replaced with "!vma_is_anonymous() && !vm_shared"?
>
> > +                writable = 0;
> > +
> > +        if (writable) {
> > +                _dst_pte = pte_mkdirty(_dst_pte);
>
> This was unconditional previously, at least for anonymous memory; see [1]
> below. I think we need to keep that behavior.
>
> You probably saw that the shmem code sets the dirty bit conditionally; however,
> I'm thinking it may be easier to always set it (I'll do that in the uffd-wp
> shmem series anyway), and then we can even drop the set_page_dirty() below,
> AFAICT.
>
> > +                if (wp_copy)
> > +                        _dst_pte = pte_mkuffd_wp(_dst_pte);
> > +                else
> > +                        _dst_pte = pte_mkwrite(_dst_pte);
> > +        } else if (vm_shared) {
> > +                /*
> > +                 * Since we didn't pte_mkdirty(), mark the page dirty or it
> > +                 * could be freed from under us. We could do this
> > +                 * unconditionally, but doing it only if !writable is faster.
> > +                 */
> > +                set_page_dirty(page);
> > +        }
> > +
> > +        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
> > +
> > +        if (is_file_backed) {
> > +                /* The shmem MAP_PRIVATE case requires checking the i_size */
> > +                inode = dst_vma->vm_file->f_inode;
> > +                offset = linear_page_index(dst_vma, dst_addr);
> > +                max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> > +                ret = -EFAULT;
> > +                if (unlikely(offset >= max_off))
> > +                        goto out_unlock;
>
> Frankly, I don't really know why this must be done under the pgtable lock. If
> it's not required, it could be moved into the UFFDIO_COPY path, since CONTINUE
> doesn't need it IIUC. I'm just raising it as a pure question.

It's not clear to me either. shmem_getpage_gfp() does check this twice, kind
of like we're doing, but it doesn't ever touch the PTL. What it seems to be
worried about is: what happens if a concurrent FALLOC_FL_PUNCH_HOLE happens
somewhere in the middle of whatever manipulation we're doing? From looking at
shmem_fallocate(), I think the basic point is that truncation happens while
"inode_lock(inode)" is held, but neither shmem_mcopy_atomic_pte() nor the new
mcopy_atomic_install_ptes() takes that lock.

I'm a bit hesitant to just remove it, run some tests, and then declare
victory, because it seems plausible it's there to catch some hard-to-induce
race. I'm not sure how to prove it *isn't* needed, so my inclination is to
just keep it. I'll send a series addressing the feedback so far this
afternoon, and I'll leave this alone for now - at least, it doesn't seem to
hurt anything. Maybe Hugh or someone else has more advice about it. If so,
I'm happy to remove it in a follow-up.
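To make the invariant concrete, here is a rough userspace sketch (not the
kernel path; the memfd name and the sizes are arbitrary, chosen just for
illustration) of what the offset >= max_off check is guarding: once the file
has been shrunk, touching a MAP_PRIVATE mapping of it beyond the new EOF must
raise SIGBUS rather than have a page installed:

#define _GNU_SOURCE
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static sigjmp_buf env;

static void on_sigbus(int sig)
{
        siglongjmp(env, 1);
}

int main(void)
{
        long page = sysconf(_SC_PAGE_SIZE);
        int fd = memfd_create("sketch", 0);     /* memfd is shmem-backed */
        char *map;

        ftruncate(fd, 4 * page);
        map = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        signal(SIGBUS, on_sigbus);

        /* Shrink i_size: offsets past the new EOF are now out of bounds. */
        ftruncate(fd, 1 * page);

        if (sigsetjmp(env, 1) == 0) {
                map[2 * page] = 1;      /* touch a page beyond the truncated EOF */
                puts("no SIGBUS (unexpected)");
        } else {
                puts("got SIGBUS past the truncated EOF, as expected");
        }
        return 0;
}

The in-kernel check enforces the same invariant for PTEs installed via
userfaultfd, which is part of why I'd rather keep it until someone can argue
positively that it's unnecessary.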
>
> > +        }
> > +
> > +        ret = -EEXIST;
> > +        if (!pte_none(*dst_pte))
> > +                goto out_unlock;
> > +
> > +        inc_mm_counter(dst_mm, mm_counter(page));
> > +        if (is_file_backed)
> > +                page_add_file_rmap(page, false);
> > +        else
> > +                page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
> > +
> > +        if (!is_continue)
> > +                lru_cache_add_inactive_or_unevictable(page, dst_vma);
> > +
> > +        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> > +
> > +        /* No need to invalidate - it was non-present before */
> > +        update_mmu_cache(dst_vma, dst_addr, dst_pte);
> > +        pte_unmap_unlock(dst_pte, ptl);
> > +        ret = 0;
> > +out:
>
> This label doesn't seem to be needed any more; we can "return 0" here and
> "return ret" below, so as to drop the last reference.
>
> > +        return ret;
> > +out_unlock:
> > +        pte_unmap_unlock(dst_pte, ptl);
> > +        goto out;
> > +}
> > +
> >  static int mcopy_atomic_pte(struct mm_struct *dst_mm,
> >                              pmd_t *dst_pmd,
> >                              struct vm_area_struct *dst_vma,
> >                              unsigned long dst_addr,
> >                              unsigned long src_addr,
> >                              struct page **pagep,
> > +                            enum mcopy_atomic_mode mode,
>
> When we reach this function, mode must be MCOPY_ATOMIC_NORMAL, right? Then I
> think we can drop it and use MCOPY_ATOMIC_NORMAL directly below. Moreover, if
> you read the above and agree that "mode" can be dropped in
> mcopy_atomic_install_ptes(), it's even cleaner, since we drop all of the
> "mode" parameters.
>
> >                              bool wp_copy)
> >  {
> > -        pte_t _dst_pte, *dst_pte;
> > -        spinlock_t *ptl;
> >          void *page_kaddr;
> >          int ret;
> >          struct page *page;
> > -        pgoff_t offset, max_off;
> > -        struct inode *inode;
> >
> >          if (!*pagep) {
> >                  ret = -ENOMEM;
> > @@ -99,43 +181,12 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
> >          if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
> >                  goto out_release;
> >
> > -        _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
>
> [1]
>
> > -        if (dst_vma->vm_flags & VM_WRITE) {
> > -                if (wp_copy)
> > -                        _dst_pte = pte_mkuffd_wp(_dst_pte);
> > -                else
> > -                        _dst_pte = pte_mkwrite(_dst_pte);
> > -        }
> > -
> > -        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
> > -        if (dst_vma->vm_file) {
> > -                /* the shmem MAP_PRIVATE case requires checking the i_size */
> > -                inode = dst_vma->vm_file->f_inode;
> > -                offset = linear_page_index(dst_vma, dst_addr);
> > -                max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> > -                ret = -EFAULT;
> > -                if (unlikely(offset >= max_off))
> > -                        goto out_release_uncharge_unlock;
> > -        }
> > -        ret = -EEXIST;
> > -        if (!pte_none(*dst_pte))
> > -                goto out_release_uncharge_unlock;
> > -
> > -        inc_mm_counter(dst_mm, MM_ANONPAGES);
> > -        page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
> > -        lru_cache_add_inactive_or_unevictable(page, dst_vma);
> > -
> > -        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> > -
> > -        /* No need to invalidate - it was non-present before */
> > -        update_mmu_cache(dst_vma, dst_addr, dst_pte);
> > -
> > -        pte_unmap_unlock(dst_pte, ptl);
> > -        ret = 0;
> > +        ret = mcopy_atomic_install_ptes(dst_mm, dst_pmd, dst_vma, dst_addr,
> > +                                        page, mode, wp_copy);
> > +        if (ret)
> > +                goto out_release;
> >  out:
> >          return ret;
> > -out_release_uncharge_unlock:
> > -        pte_unmap_unlock(dst_pte, ptl);
> >  out_release:
> >          put_page(page);
> >          goto out;
> > @@ -176,6 +227,38 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
> >          return ret;
> >  }
> >
> > +static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
> > +                                pmd_t *dst_pmd,
> > +                                struct vm_area_struct *dst_vma,
> > +                                unsigned long dst_addr,
> > +                                bool wp_copy)
> > +{
> > +        struct inode *inode = file_inode(dst_vma->vm_file);
> > +        struct address_space *mapping = inode->i_mapping;
> > +        pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
> > +        struct page *page;
> > +        int ret;
> > +
> > +        ret = -EFAULT;
> > +        page = find_lock_page(mapping, pgoff);
> > +        if (!page)
> > +                goto out;
> > +
> > +        ret = mcopy_atomic_install_ptes(dst_mm, dst_pmd, dst_vma, dst_addr,
> > +                                        page, MCOPY_ATOMIC_CONTINUE, wp_copy);
> > +        if (ret)
> > +                goto out_release;
> > +
> > +        unlock_page(page);
> > +        ret = 0;
> > +out:
> > +        return ret;
> > +out_release:
> > +        unlock_page(page);
> > +        put_page(page);
> > +        goto out;
> > +}
> > +
> >  static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> >  {
> >          pgd_t *pgd;
> > @@ -418,7 +501,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> >                                                  enum mcopy_atomic_mode mode,
> >                                                  bool wp_copy)
> >  {
> > -        ssize_t err;
> > +        ssize_t err = 0;
> > +
> > +        if (mode == MCOPY_ATOMIC_CONTINUE) {
> > +                err = mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> > +                                           wp_copy);
> > +                goto out;
>
> Why not return directly? :)
>
> Thanks,
>
> > +        }
> >
> >          /*
> >           * The normal page fault path for a shmem will invoke the
> > @@ -431,26 +520,20 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> >           * and not in the radix tree.
> >           */
> >          if (!(dst_vma->vm_flags & VM_SHARED)) {
> > -                switch (mode) {
> > -                case MCOPY_ATOMIC_NORMAL:
> > +                if (mode == MCOPY_ATOMIC_NORMAL)
> >                          err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
> >                                                 dst_addr, src_addr, page,
> > -                                               wp_copy);
> > -                        break;
> > -                case MCOPY_ATOMIC_ZEROPAGE:
> > +                                               mode, wp_copy);
> > +                else if (mode == MCOPY_ATOMIC_ZEROPAGE)
> >                          err = mfill_zeropage_pte(dst_mm, dst_pmd,
> >                                                   dst_vma, dst_addr);
> > -                        break;
> > -                case MCOPY_ATOMIC_CONTINUE:
> > -                        err = -EINVAL;
> > -                        break;
> > -                }
> >          } else {
> >                  VM_WARN_ON_ONCE(wp_copy);
> >                  err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> >                                               src_addr, mode, page);
> >          }
> >
> > +out:
> >          return err;
> >  }
> >
> > diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
> > index f6c86b036d0f..d8541a59dae5 100644
> > --- a/tools/testing/selftests/vm/userfaultfd.c
> > +++ b/tools/testing/selftests/vm/userfaultfd.c
> > @@ -485,6 +485,7 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
> >  static void continue_range(int ufd, __u64 start, __u64 len)
> >  {
> >          struct uffdio_continue req;
> > +        int ret;
> >
> >          req.range.start = start;
> >          req.range.len = len;
> > @@ -493,6 +494,17 @@ static void continue_range(int ufd, __u64 start, __u64 len)
> >          if (ioctl(ufd, UFFDIO_CONTINUE, &req))
> >                  err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
> >                      (uint64_t)start);
> > +
> > +        /*
> > +         * Error handling within the kernel for continue is subtly different
> > +         * from copy or zeropage, so it may be a source of bugs. Trigger an
> > +         * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
> > +         */
> > +        req.mapped = 0;
> > +        ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
> > +        if (ret >= 0 || req.mapped != -EEXIST)
> > +                err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
> > +                    ret, req.mapped);
> >  }
> >
> >  static void *locking_thread(void *arg)
> > --
> > 2.31.0.208.g409f899ff0-goog
> >
>
> --
> Peter Xu
>
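A side note on the selftest hunk above: UFFDIO_CONTINUE reports its per-range
result in uffdio_continue.mapped, and -EEXIST there means the PTE was already
present (for example, because another thread resolved the fault first). Below
is a minimal sketch of how a minor-fault handler might treat that case; it
assumes a userfaultfd that has already been opened and registered in minor
mode over the range, and resolve_minor_fault is a made-up helper name, not
anything from this patch:

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* Resolve one minor fault over [addr, addr + len). */
static int resolve_minor_fault(int uffd, unsigned long addr, unsigned long len)
{
        struct uffdio_continue req = {
                .range = { .start = addr, .len = len },
                .mode = 0,
        };

        if (ioctl(uffd, UFFDIO_CONTINUE, &req) == 0)
                return 0;

        /* On failure, the kernel reports the per-range result in req.mapped. */
        if (req.mapped == -EEXIST)
                return 0;       /* PTE already installed; treat as resolved */

        fprintf(stderr, "UFFDIO_CONTINUE failed: mapped=%lld errno=%d\n",
                (long long)req.mapped, errno);
        return -1;
}

The selftest instead asserts that the second, deliberately conflicting
UFFDIO_CONTINUE really does fail with -EEXIST, which is what exercises the
kernel's error handling path.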