On Wed, Sep 27, 2023 at 5:47 AM Jann Horn <jannh@xxxxxxxxxx> wrote: > > On Sat, Sep 23, 2023 at 3:31 AM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote: > > From: Andrea Arcangeli <aarcange@xxxxxxxxxx> > > > > This implements the uABI of UFFDIO_REMAP. > > > > Notably one mode bitflag is also forwarded (and in turn known) by the > > lowlevel remap_pages method. > > > > Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx> > > Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx> > [...] > > +int remap_pages_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, > > + pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, > > + struct vm_area_struct *dst_vma, > > + struct vm_area_struct *src_vma, > > + unsigned long dst_addr, unsigned long src_addr) > > +{ > > + pmd_t _dst_pmd, src_pmdval; > > + struct page *src_page; > > + struct folio *src_folio; > > + struct anon_vma *src_anon_vma, *dst_anon_vma; > > + spinlock_t *src_ptl, *dst_ptl; > > + pgtable_t src_pgtable, dst_pgtable; > > + struct mmu_notifier_range range; > > + int err = 0; > > + > > + src_pmdval = *src_pmd; > > + src_ptl = pmd_lockptr(src_mm, src_pmd); > > + > > + BUG_ON(!spin_is_locked(src_ptl)); > > + mmap_assert_locked(src_mm); > > + mmap_assert_locked(dst_mm); > > + > > + BUG_ON(!pmd_trans_huge(src_pmdval)); > > + BUG_ON(!pmd_none(dst_pmdval)); > > + BUG_ON(src_addr & ~HPAGE_PMD_MASK); > > + BUG_ON(dst_addr & ~HPAGE_PMD_MASK); > > + > > + src_page = pmd_page(src_pmdval); > > + if (unlikely(!PageAnonExclusive(src_page))) { > > + spin_unlock(src_ptl); > > + return -EBUSY; > > + } > > + > > + src_folio = page_folio(src_page); > > + folio_get(src_folio); > > + spin_unlock(src_ptl); > > + > > + /* preallocate dst_pgtable if needed */ > > + if (dst_mm != src_mm) { > > + dst_pgtable = pte_alloc_one(dst_mm); > > + if (unlikely(!dst_pgtable)) { > > + err = -ENOMEM; > > + goto put_folio; > > + } > > + } else { > > + dst_pgtable = NULL; > > + } > > + > > + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_mm, src_addr, > > + src_addr + HPAGE_PMD_SIZE); > > + mmu_notifier_invalidate_range_start(&range); > > + > > + /* block all concurrent rmap walks */ > > + folio_lock(src_folio); > > + > > + /* > > + * split_huge_page walks the anon_vma chain without the page > > + * lock. Serialize against it with the anon_vma lock, the page > > + * lock is not enough. > > + */ > > + src_anon_vma = folio_get_anon_vma(src_folio); > > + if (!src_anon_vma) { > > + err = -EAGAIN; > > + goto unlock_folio; > > + } > > + anon_vma_lock_write(src_anon_vma); > > + > > + dst_ptl = pmd_lockptr(dst_mm, dst_pmd); > > + double_pt_lock(src_ptl, dst_ptl); > > + if (unlikely(!pmd_same(*src_pmd, src_pmdval) || > > + !pmd_same(*dst_pmd, dst_pmdval) || > > + folio_mapcount(src_folio) != 1)) { > > I think this is also supposed to be PageAnonExclusive()? Yes. Will fix. 
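For the fix here I'm thinking of roughly the following (untested sketch, the final form may differ): keep the pmd_same() rechecks and additionally require the source page to still be AnonExclusive under the double PT lock:

	double_pt_lock(src_ptl, dst_ptl);
	if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
		     !pmd_same(*dst_pmd, dst_pmdval) ||
		     folio_mapcount(src_folio) != 1 ||
		     !PageAnonExclusive(src_page))) {
		double_pt_unlock(src_ptl, dst_ptl);
		err = -EAGAIN;
		goto put_anon_vma;
	}

(whether the folio_mapcount() test stays alongside the PageAnonExclusive() one is still open).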
> > > + double_pt_unlock(src_ptl, dst_ptl); > > + err = -EAGAIN; > > + goto put_anon_vma; > > + } > > + > > + BUG_ON(!folio_test_head(src_folio)); > > + BUG_ON(!folio_test_anon(src_folio)); > > + > > + dst_anon_vma = (void *)dst_vma->anon_vma + PAGE_MAPPING_ANON; > > + WRITE_ONCE(src_folio->mapping, (struct address_space *) dst_anon_vma); > > + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); > > + > > + src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); > > + _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); > > + _dst_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); > > + set_pmd_at(dst_mm, dst_addr, dst_pmd, _dst_pmd); > > + > > + src_pgtable = pgtable_trans_huge_withdraw(src_mm, src_pmd); > > + if (dst_pgtable) { > > + pgtable_trans_huge_deposit(dst_mm, dst_pmd, dst_pgtable); > > + pte_free(src_mm, src_pgtable); > > + dst_pgtable = NULL; > > + > > + mm_inc_nr_ptes(dst_mm); > > + mm_dec_nr_ptes(src_mm); > > + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); > > + add_mm_counter(src_mm, MM_ANONPAGES, -HPAGE_PMD_NR); > > + } else { > > + pgtable_trans_huge_deposit(dst_mm, dst_pmd, src_pgtable); > > + } > > + double_pt_unlock(src_ptl, dst_ptl); > > + > > +put_anon_vma: > > + anon_vma_unlock_write(src_anon_vma); > > + put_anon_vma(src_anon_vma); > > +unlock_folio: > > + /* unblock rmap walks */ > > + folio_unlock(src_folio); > > + mmu_notifier_invalidate_range_end(&range); > > + if (dst_pgtable) > > + pte_free(dst_mm, dst_pgtable); > > +put_folio: > > + folio_put(src_folio); > > + > > + return err; > > +} > > +#endif /* CONFIG_USERFAULTFD */ > [...] > > +static int remap_anon_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, > > + struct vm_area_struct *dst_vma, > > + struct vm_area_struct *src_vma, > > + unsigned long dst_addr, unsigned long src_addr, > > + pte_t *dst_pte, pte_t *src_pte, > > + pte_t orig_dst_pte, pte_t orig_src_pte, > > + spinlock_t *dst_ptl, spinlock_t *src_ptl, > > + struct folio *src_folio) > > +{ > > + struct anon_vma *dst_anon_vma; > > + > > + double_pt_lock(dst_ptl, src_ptl); > > + > > + if (!pte_same(*src_pte, orig_src_pte) || > > + !pte_same(*dst_pte, orig_dst_pte) || > > + folio_test_large(src_folio) || > > + folio_estimated_sharers(src_folio) != 1) { > > + double_pt_unlock(dst_ptl, src_ptl); > > + return -EAGAIN; > > + } > > + > > + BUG_ON(!folio_test_anon(src_folio)); > > + > > + dst_anon_vma = (void *)dst_vma->anon_vma + PAGE_MAPPING_ANON; > > + WRITE_ONCE(src_folio->mapping, > > + (struct address_space *) dst_anon_vma); > > + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, > > + dst_addr)); > > + > > + orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte); > > + orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); > > + orig_dst_pte = maybe_mkwrite(pte_mkdirty(orig_dst_pte), > > + dst_vma); > > I think there's still a theoretical issue here that you could fix by > checking for the AnonExclusive flag, similar to the huge page case. > > Consider the following scenario: > > 1. process P1 does a write fault in a private anonymous VMA, creating > and mapping a new anonymous page A1 > 2. process P1 forks and creates two children P2 and P3. afterwards, A1 > is mapped in P1, P2 and P3 as a COW page, with mapcount 3. > 3. process P1 removes its mapping of A1, dropping its mapcount to 2. > 4. process P2 uses vmsplice() to grab a reference to A1 with get_user_pages() > 5. process P2 removes its mapping of A1, dropping its mapcount to 1. 
> > If at this point P3 does a write fault on its mapping of A1, it will > still trigger copy-on-write thanks to the AnonExclusive mechanism; and > this is necessary to avoid P3 mapping A1 as writable and writing data > into it that will become visible to P2, if P2 and P3 are in different > security contexts. > > But if P3 instead moves its mapping of A1 to another address with > remap_anon_pte() which only does a page mapcount check, the > maybe_mkwrite() will directly make the mapping writable, circumventing > the AnonExclusive mechanism. I see. Thanks for the detailed explanation! I will add PageAnonExclusive() check in this path to prevent this scenario. > > > + set_pte_at(dst_mm, dst_addr, dst_pte, orig_dst_pte); > > + > > + if (dst_mm != src_mm) { > > + inc_mm_counter(dst_mm, MM_ANONPAGES); > > + dec_mm_counter(src_mm, MM_ANONPAGES); > > + } > > + > > + double_pt_unlock(dst_ptl, src_ptl); > > + > > + return 0; > > +} > > + > > +static int remap_swap_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, > > + unsigned long dst_addr, unsigned long src_addr, > > + pte_t *dst_pte, pte_t *src_pte, > > + pte_t orig_dst_pte, pte_t orig_src_pte, > > + spinlock_t *dst_ptl, spinlock_t *src_ptl) > > +{ > > + if (!pte_swp_exclusive(orig_src_pte)) > > + return -EBUSY; > > + > > + double_pt_lock(dst_ptl, src_ptl); > > + > > + if (!pte_same(*src_pte, orig_src_pte) || > > + !pte_same(*dst_pte, orig_dst_pte)) { > > + double_pt_unlock(dst_ptl, src_ptl); > > + return -EAGAIN; > > + } > > + > > + orig_src_pte = ptep_get_and_clear(src_mm, src_addr, src_pte); > > + set_pte_at(dst_mm, dst_addr, dst_pte, orig_src_pte); > > + > > + if (dst_mm != src_mm) { > > + inc_mm_counter(dst_mm, MM_ANONPAGES); > > + dec_mm_counter(src_mm, MM_ANONPAGES); > > I think this is the wrong counter. Looking at zap_pte_range(), in the > "Genuine swap entry" case, we modify the MM_SWAPENTS counter, not > MM_ANONPAGES. Oops, my bad. Will fix. > > > + } > > + > > + double_pt_unlock(dst_ptl, src_ptl); > > + > > + return 0; > > +} > > + > > +/* > > + * The mmap_lock for reading is held by the caller. Just move the page > > + * from src_pmd to dst_pmd if possible, and return true if succeeded > > + * in moving the page. > > + */ > > +static int remap_pages_pte(struct mm_struct *dst_mm, > > + struct mm_struct *src_mm, > > + pmd_t *dst_pmd, > > + pmd_t *src_pmd, > > + struct vm_area_struct *dst_vma, > > + struct vm_area_struct *src_vma, > > + unsigned long dst_addr, > > + unsigned long src_addr, > > + __u64 mode) > > +{ > > + swp_entry_t entry; > > + pte_t orig_src_pte, orig_dst_pte; > > + spinlock_t *src_ptl, *dst_ptl; > > + pte_t *src_pte = NULL; > > + pte_t *dst_pte = NULL; > > + > > + struct folio *src_folio = NULL; > > + struct anon_vma *src_anon_vma = NULL; > > + struct mmu_notifier_range range; > > + int err = 0; > > + > > + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_mm, > > + src_addr, src_addr + PAGE_SIZE); > > + mmu_notifier_invalidate_range_start(&range); > > +retry: > > This retry looks a bit dodgy. On this retry label, we restart almost > the entire operation, including re-reading the source PTE; the only > variables that carry state forward from the previous retry loop > iteration are src_folio and src_anon_vma. > > > + dst_pte = pte_offset_map_nolock(dst_mm, dst_pmd, dst_addr, &dst_ptl); > > + > > + /* If an huge pmd materialized from under us fail */ > > + if (unlikely(!dst_pte)) { > > + err = -EFAULT; > > + goto out; > > + } > [...] 
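(Coming back to the two fixes acked above, the plan is roughly the following, untested: add the same exclusivity test to the pte-level recheck in remap_anon_pte(), and switch remap_swap_pte() to the swap-entry counter.)

	/* remap_anon_pte(): recheck under double_pt_lock() */
	if (!pte_same(*src_pte, orig_src_pte) ||
	    !pte_same(*dst_pte, orig_dst_pte) ||
	    folio_test_large(src_folio) ||
	    folio_estimated_sharers(src_folio) != 1 ||
	    !PageAnonExclusive(&src_folio->page)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}

	/* remap_swap_pte(): a moved swap entry adjusts MM_SWAPENTS, as in zap_pte_range() */
	if (dst_mm != src_mm) {
		inc_mm_counter(dst_mm, MM_SWAPENTS);
		dec_mm_counter(src_mm, MM_SWAPENTS);
	}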
> > + spin_lock(dst_ptl); > > + orig_dst_pte = *dst_pte; > > + spin_unlock(dst_ptl); > > + if (!pte_none(orig_dst_pte)) { > > + err = -EEXIST; > > + goto out; > > + } > > + > > + spin_lock(src_ptl); > > + orig_src_pte = *src_pte; > > Here we read an entirely new orig_src_pte value. Something like a > concurrent MADV_DONTNEED+pagefault could have made the PTE point to a > different page between loop iterations. > > > + spin_unlock(src_ptl); > > I think you have to insert something like the following here: > > if (src_folio && (orig_dst_pte != previous_src_pte)) { > err = -EAGAIN; > goto out; > } > previous_src_pte = orig_dst_pte; Yes, this definitely needs to be rechecked. Will fix. > > Otherwise: > > > + if (pte_none(orig_src_pte)) { > > + if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) > > + err = -ENOENT; > > + else /* nothing to do to remap a hole */ > > + err = 0; > > + goto out; > > + } > > + > > + if (pte_present(orig_src_pte)) { > > + /* > > + * Pin and lock both source folio and anon_vma. Since we are in > > + * RCU read section, we can't block, so on contention have to > > + * unmap the ptes, obtain the lock and retry. > > + */ > > + if (!src_folio) { > > If we already found a src_folio in the previous iteration (but the > trylock failed), we keep using the same src_folio without rechecking > if the current PTE still points to the same folio. > > > + struct folio *folio; > > + > > + /* > > + * Pin the page while holding the lock to be sure the > > + * page isn't freed under us > > + */ > > + spin_lock(src_ptl); > > + if (!pte_same(orig_src_pte, *src_pte)) { > > + spin_unlock(src_ptl); > > + err = -EAGAIN; > > + goto out; > > + } > > + > > + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); > > + if (!folio || !folio_test_anon(folio) || > > + folio_test_large(folio) || > > + folio_estimated_sharers(folio) != 1) { > > + spin_unlock(src_ptl); > > + err = -EBUSY; > > + goto out; > > + } > > + > > + folio_get(folio); > > + src_folio = folio; > > + spin_unlock(src_ptl); > > + > > + /* block all concurrent rmap walks */ > > + if (!folio_trylock(src_folio)) { > > + pte_unmap(&orig_src_pte); > > + pte_unmap(&orig_dst_pte); > > + src_pte = dst_pte = NULL; > > + /* now we can block and wait */ > > + folio_lock(src_folio); > > + goto retry; > > + } > > + } > > + > > + if (!src_anon_vma) { > > (And here, if we already saw a src_anon_vma but the trylock failed, > we'll keep using that src_anon_vma.) Ack. The check for previous_src_pte should handle that as well. > > > + /* > > + * folio_referenced walks the anon_vma chain > > + * without the folio lock. Serialize against it with > > + * the anon_vma lock, the folio lock is not enough. 
> > + */ > > + src_anon_vma = folio_get_anon_vma(src_folio); > > + if (!src_anon_vma) { > > + /* page was unmapped from under us */ > > + err = -EAGAIN; > > + goto out; > > + } > > + if (!anon_vma_trylock_write(src_anon_vma)) { > > + pte_unmap(&orig_src_pte); > > + pte_unmap(&orig_dst_pte); > > + src_pte = dst_pte = NULL; > > + /* now we can block and wait */ > > + anon_vma_lock_write(src_anon_vma); > > + goto retry; > > + } > > + } > > So at this point we have: > > - the current src_pte > - some referenced+locked src_folio that used to be mapped exclusively > at src_addr > - (the anon_vma associated with the src_folio) > > > + err = remap_anon_pte(dst_mm, src_mm, dst_vma, src_vma, > > + dst_addr, src_addr, dst_pte, src_pte, > > + orig_dst_pte, orig_src_pte, > > + dst_ptl, src_ptl, src_folio); > > And then this will, without touching folio mapcounts/refcounts, delete > the current PTE at src_addr, and create a PTE at dst_addr pointing to > the old src_folio, leading to incorrect refcounts/mapcounts? I assume this still points to the missing previous_src_pte check discussed in the previous comments. Is that correct or is there yet another issue? > > > + } else { > [...] > > + } > > + > > +out: > > + if (src_anon_vma) { > > + anon_vma_unlock_write(src_anon_vma); > > + put_anon_vma(src_anon_vma); > > + } > > + if (src_folio) { > > + folio_unlock(src_folio); > > + folio_put(src_folio); > > + } > > + if (dst_pte) > > + pte_unmap(dst_pte); > > + if (src_pte) > > + pte_unmap(src_pte); > > + mmu_notifier_invalidate_range_end(&range); > > + > > + return err; > > +} > [...] > > +ssize_t remap_pages(struct mm_struct *dst_mm, struct mm_struct *src_mm, > > + unsigned long dst_start, unsigned long src_start, > > + unsigned long len, __u64 mode) > > +{ > > + struct vm_area_struct *src_vma, *dst_vma; > > + unsigned long src_addr, dst_addr; > > + pmd_t *src_pmd, *dst_pmd; > > + long err = -EINVAL; > > + ssize_t moved = 0; > > + > > + /* > > + * Sanitize the command parameters: > > + */ > > + BUG_ON(src_start & ~PAGE_MASK); > > + BUG_ON(dst_start & ~PAGE_MASK); > > + BUG_ON(len & ~PAGE_MASK); > > + > > + /* Does the address range wrap, or is the span zero-sized? */ > > + BUG_ON(src_start + len <= src_start); > > + BUG_ON(dst_start + len <= dst_start); > > + > > + /* > > + * Because these are read sempahores there's no risk of lock > > + * inversion. > > + */ > > + mmap_read_lock(dst_mm); > > + if (dst_mm != src_mm) > > + mmap_read_lock(src_mm); > > + > > + /* > > + * Make sure the vma is not shared, that the src and dst remap > > + * ranges are both valid and fully within a single existing > > + * vma. > > + */ > > + src_vma = find_vma(src_mm, src_start); > > + if (!src_vma || (src_vma->vm_flags & VM_SHARED)) > > + goto out; > > + if (src_start < src_vma->vm_start || > > + src_start + len > src_vma->vm_end) > > + goto out; > > + > > + dst_vma = find_vma(dst_mm, dst_start); > > + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) > > + goto out; > > + if (dst_start < dst_vma->vm_start || > > + dst_start + len > dst_vma->vm_end) > > + goto out; > > + > > + err = validate_remap_areas(src_vma, dst_vma); > > + if (err) > > + goto out; > > + > > + for (src_addr = src_start, dst_addr = dst_start; > > + src_addr < src_start + len;) { > > + spinlock_t *ptl; > > + pmd_t dst_pmdval; > > + unsigned long step_size; > > + > > + BUG_ON(dst_addr >= dst_start + len); > > + /* > > + * Below works because anonymous area would not have a > > + * transparent huge PUD. 
If file-backed support is added, > > + * that case would need to be handled here. > > + */ > > + src_pmd = mm_find_pmd(src_mm, src_addr); > > + if (unlikely(!src_pmd)) { > > + if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) { > > + err = -ENOENT; > > + break; > > + } > > + src_pmd = mm_alloc_pmd(src_mm, src_addr); > > + if (unlikely(!src_pmd)) { > > + err = -ENOMEM; > > + break; > > + } > > + } > > + dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); > > + if (unlikely(!dst_pmd)) { > > + err = -ENOMEM; > > + break; > > + } > > + > > + dst_pmdval = pmdp_get_lockless(dst_pmd); > > + /* > > + * If the dst_pmd is mapped as THP don't override it and just > > + * be strict. If dst_pmd changes into TPH after this check, the > > + * remap_pages_huge_pmd() will detect the change and retry > > + * while remap_pages_pte() will detect the change and fail. > > + */ > > + if (unlikely(pmd_trans_huge(dst_pmdval))) { > > + err = -EEXIST; > > + break; > > + } > > + > > + ptl = pmd_trans_huge_lock(src_pmd, src_vma); > > + if (ptl && !pmd_trans_huge(*src_pmd)) { > > + spin_unlock(ptl); > > + ptl = NULL; > > + } > > This still looks wrong - we do still have to split_huge_pmd() > somewhere so that remap_pages_pte() works. Hmm, I guess this extra check is not even needed... > > > + if (ptl) { > > + /* > > + * Check if we can move the pmd without > > + * splitting it. First check the address > > + * alignment to be the same in src/dst. These > > + * checks don't actually need the PT lock but > > + * it's good to do it here to optimize this > > + * block away at build time if > > + * CONFIG_TRANSPARENT_HUGEPAGE is not set. > > + */ > > + if ((src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) || > > + src_start + len - src_addr < HPAGE_PMD_SIZE || !pmd_none(dst_pmdval)) { > > + spin_unlock(ptl); > > + split_huge_pmd(src_vma, src_pmd, src_addr); > > + continue; > > + } > > + > > + err = remap_pages_huge_pmd(dst_mm, src_mm, > > + dst_pmd, src_pmd, > > + dst_pmdval, > > + dst_vma, src_vma, > > + dst_addr, src_addr); > > + step_size = HPAGE_PMD_SIZE; > > + } else { > > + if (pmd_none(*src_pmd)) { > > + if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) { > > + err = -ENOENT; > > + break; > > + } > > + if (unlikely(__pte_alloc(src_mm, src_pmd))) { > > + err = -ENOMEM; > > + break; > > + } > > + } > > + > > + if (unlikely(pte_alloc(dst_mm, dst_pmd))) { > > + err = -ENOMEM; > > + break; > > + } > > + > > + err = remap_pages_pte(dst_mm, src_mm, > > + dst_pmd, src_pmd, > > + dst_vma, src_vma, > > + dst_addr, src_addr, > > + mode); > > + step_size = PAGE_SIZE; > > + } > > + > > + cond_resched(); > > + > > + if (!err) { > > + dst_addr += step_size; > > + src_addr += step_size; > > + moved += step_size; > > + } > > + > > + if ((!err || err == -EAGAIN) && > > + fatal_signal_pending(current)) > > + err = -EINTR; > > + > > + if (err && err != -EAGAIN) > > + break; > > + } > > + > > +out: > > + mmap_read_unlock(dst_mm); > > + if (dst_mm != src_mm) > > + mmap_read_unlock(src_mm); > > + BUG_ON(moved < 0); > > + BUG_ON(err > 0); > > + BUG_ON(!moved && !err); > > + return moved ? moved : err; > > +} > > -- > > 2.42.0.515.g380fc7ccd1-goog > >
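To summarize the main fix for the retry path discussed above: remap_pages_pte() will carry the source PTE value observed when src_folio/src_anon_vma were cached and bail out with -EAGAIN if the freshly re-read source PTE (orig_src_pte) changed while the page tables were unmapped. Untested sketch; previous_src_pte is a new local, and pte_same() is used rather than a raw != since pte_t is not directly comparable on all configs:

	pte_t previous_src_pte = __pte(0);	/* only meaningful once src_folio is set */
	...
retry:
	...
	spin_lock(src_ptl);
	orig_src_pte = *src_pte;
	spin_unlock(src_ptl);

	/*
	 * If we are retrying with a cached src_folio/src_anon_vma, the
	 * source PTE must not have changed (e.g. MADV_DONTNEED plus a
	 * refault) while the PTEs were unmapped, otherwise the cached
	 * state belongs to a different page.
	 */
	if (src_folio && !pte_same(orig_src_pte, previous_src_pte)) {
		err = -EAGAIN;
		goto out;
	}
	previous_src_pte = orig_src_pte;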