On Thu, Sep 14, 2023 at 12:28 PM Jann Horn <jannh@xxxxxxxxxx> wrote: > > On Thu, Sep 14, 2023 at 5:26 PM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote: > > > > From: Andrea Arcangeli <aarcange@xxxxxxxxxx> > > > > This implements the uABI of UFFDIO_REMAP. > > > > Notably one mode bitflag is also forwarded (and in turn known) by the > > lowlevel remap_pages method. > > > > Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx> > > Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx> > > --- > > fs/userfaultfd.c | 49 +++ > > include/linux/rmap.h | 5 + > > include/linux/userfaultfd_k.h | 17 + > > include/uapi/linux/userfaultfd.h | 22 ++ > > mm/huge_memory.c | 118 +++++++ > > mm/khugepaged.c | 3 + > > mm/userfaultfd.c | 586 +++++++++++++++++++++++++++++++ > > 7 files changed, 800 insertions(+) > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > > index 56eaae9dac1a..7bf64e7541c1 100644 > > --- a/fs/userfaultfd.c > > +++ b/fs/userfaultfd.c > > @@ -2027,6 +2027,52 @@ static inline unsigned int uffd_ctx_features(__u64 user_features) > > return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; > > } > > > > +static int userfaultfd_remap(struct userfaultfd_ctx *ctx, > > + unsigned long arg) > > +{ > > + __s64 ret; > > + struct uffdio_remap uffdio_remap; > > + struct uffdio_remap __user *user_uffdio_remap; > > + struct userfaultfd_wake_range range; > > + > > + user_uffdio_remap = (struct uffdio_remap __user *) arg; > > + > > + ret = -EFAULT; > > + if (copy_from_user(&uffdio_remap, user_uffdio_remap, > > + /* don't copy "remap" last field */ > > + sizeof(uffdio_remap)-sizeof(__s64))) > > + goto out; > > + > > + ret = validate_range(ctx->mm, uffdio_remap.dst, uffdio_remap.len); > > + if (ret) > > + goto out; > > + ret = validate_range(current->mm, uffdio_remap.src, uffdio_remap.len); > > + if (ret) > > + goto out; > > + ret = -EINVAL; > > + if (uffdio_remap.mode & ~(UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES| > > + UFFDIO_REMAP_MODE_DONTWAKE)) > > + goto out; > > Do you not need mmget_not_zero(ctx->mm) to make sure the MM can't be > concurrently torn down while remap_pages() is running, similar to what > the other userfaultfd ioctl handlers do? > > > + ret = remap_pages(ctx->mm, current->mm, > > + uffdio_remap.dst, uffdio_remap.src, > > + uffdio_remap.len, uffdio_remap.mode); > > + if (unlikely(put_user(ret, &user_uffdio_remap->remap))) > > + return -EFAULT; > > + if (ret < 0) > > + goto out; > > + /* len == 0 would wake all */ > > + BUG_ON(!ret); > > + range.len = ret; > > + if (!(uffdio_remap.mode & UFFDIO_REMAP_MODE_DONTWAKE)) { > > + range.start = uffdio_remap.dst; > > + wake_userfault(ctx, &range); > > + } > > + ret = range.len == uffdio_remap.len ? 0 : -EAGAIN; > > +out: > > + return ret; > > +} > [...] > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > > index 064fbd90822b..c7a9880a1f6a 100644 > > --- a/mm/huge_memory.c > > +++ b/mm/huge_memory.c > > @@ -1932,6 +1932,124 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, > > return ret; > > } > > > > +#ifdef CONFIG_USERFAULTFD > > +/* > > + * The PT lock for src_pmd and the mmap_lock for reading are held by > > + * the caller, but it must return after releasing the > > + * page_table_lock. We're guaranteed the src_pmd is a pmd_trans_huge > > + * until the PT lock of the src_pmd is released. Just move the page > > + * from src_pmd to dst_pmd if possible. Return zero if succeeded in > > + * moving the page, -EAGAIN if it needs to be repeated by the caller, > > + * or other errors in case of failure. > > + */ > > +int remap_pages_huge_pmd(struct mm_struct *dst_mm, > > + struct mm_struct *src_mm, > > + pmd_t *dst_pmd, pmd_t *src_pmd, > > + pmd_t dst_pmdval, > > + struct vm_area_struct *dst_vma, > > + struct vm_area_struct *src_vma, > > + unsigned long dst_addr, > > + unsigned long src_addr) > > +{ > > + pmd_t _dst_pmd, src_pmdval; > > + struct page *src_page; > > + struct anon_vma *src_anon_vma, *dst_anon_vma; > > + spinlock_t *src_ptl, *dst_ptl; > > + pgtable_t pgtable; > > + struct mmu_notifier_range range; > > + > > + src_pmdval = *src_pmd; > > + src_ptl = pmd_lockptr(src_mm, src_pmd); > > + > > + BUG_ON(!pmd_trans_huge(src_pmdval)); > > + BUG_ON(!pmd_none(dst_pmdval)); > > Why can we assert that pmd_none(dst_pmdval) is true here? Can we not > have concurrent faults (or userfaultfd operations) populating that > PMD? > > > + BUG_ON(!spin_is_locked(src_ptl)); > > + mmap_assert_locked(src_mm); > > + mmap_assert_locked(dst_mm); > > + BUG_ON(src_addr & ~HPAGE_PMD_MASK); > > + BUG_ON(dst_addr & ~HPAGE_PMD_MASK); > > + > > + src_page = pmd_page(src_pmdval); > > + BUG_ON(!PageHead(src_page)); > > + BUG_ON(!PageAnon(src_page)); > > + if (unlikely(page_mapcount(src_page) != 1)) { > > + spin_unlock(src_ptl); > > + return -EBUSY; > > + } > > + > > + get_page(src_page); > > + spin_unlock(src_ptl); > > + > > + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_mm, src_addr, > > + src_addr + HPAGE_PMD_SIZE); > > + mmu_notifier_invalidate_range_start(&range); > > + > > + /* block all concurrent rmap walks */ > > + lock_page(src_page); > > + > > + /* > > + * split_huge_page walks the anon_vma chain without the page > > + * lock. Serialize against it with the anon_vma lock, the page > > + * lock is not enough. > > + */ > > + src_anon_vma = folio_get_anon_vma(page_folio(src_page)); > > + if (!src_anon_vma) { > > + unlock_page(src_page); > > + put_page(src_page); > > + mmu_notifier_invalidate_range_end(&range); > > + return -EAGAIN; > > + } > > + anon_vma_lock_write(src_anon_vma); > > + > > + dst_ptl = pmd_lockptr(dst_mm, dst_pmd); > > + double_pt_lock(src_ptl, dst_ptl); > > + if (unlikely(!pmd_same(*src_pmd, src_pmdval) || > > + !pmd_same(*dst_pmd, dst_pmdval) || > > + page_mapcount(src_page) != 1)) { > > + double_pt_unlock(src_ptl, dst_ptl); > > + anon_vma_unlock_write(src_anon_vma); > > + put_anon_vma(src_anon_vma); > > + unlock_page(src_page); > > + put_page(src_page); > > + mmu_notifier_invalidate_range_end(&range); > > + return -EAGAIN; > > + } > > + > > + BUG_ON(!PageHead(src_page)); > > + BUG_ON(!PageAnon(src_page)); > > + /* the PT lock is enough to keep the page pinned now */ > > + put_page(src_page); > > + > > + dst_anon_vma = (void *) dst_vma->anon_vma + PAGE_MAPPING_ANON; > > + WRITE_ONCE(src_page->mapping, (struct address_space *) dst_anon_vma); > > + WRITE_ONCE(src_page->index, linear_page_index(dst_vma, dst_addr)); > > + > > + if (!pmd_same(pmdp_huge_clear_flush(src_vma, src_addr, src_pmd), > > + src_pmdval)) > > + BUG_ON(1); > > I'm not sure we can assert that the PMDs are exactly equal; the CPU > might have changed the A/D bits under us? > > > + _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot); > > + _dst_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); > > + set_pmd_at(dst_mm, dst_addr, dst_pmd, _dst_pmd); > > + > > + pgtable = pgtable_trans_huge_withdraw(src_mm, src_pmd); > > + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); > > Are we allowed to move page tables between mm_structs on all > architectures? The first example I found that looks a bit dodgy, > looking through various architectures' pte_alloc_one(), is s390's > page_table_alloc() which looks like page tables are tied to per-MM > lists sometimes. > If that's not allowed, we might have to allocate a new deposit table > and free the old one or something like that. > > > + if (dst_mm != src_mm) { > > + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); > > + add_mm_counter(src_mm, MM_ANONPAGES, -HPAGE_PMD_NR); > > + } > > + double_pt_unlock(src_ptl, dst_ptl); > > + > > + anon_vma_unlock_write(src_anon_vma); > > + put_anon_vma(src_anon_vma); > > + > > + /* unblock rmap walks */ > > + unlock_page(src_page); > > + > > + mmu_notifier_invalidate_range_end(&range); > > + return 0; > > +} > > +#endif /* CONFIG_USERFAULTFD */ > > + > > /* > > * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. > > * > [...] > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > > index 96d9eae5c7cc..0cca60dfa8f8 100644 > > --- a/mm/userfaultfd.c > > +++ b/mm/userfaultfd.c > [...] > > +ssize_t remap_pages(struct mm_struct *dst_mm, struct mm_struct *src_mm, > > + unsigned long dst_start, unsigned long src_start, > > + unsigned long len, __u64 mode) > > +{ > [...] > > + > > + if (pgprot_val(src_vma->vm_page_prot) != > > + pgprot_val(dst_vma->vm_page_prot)) > > + goto out; > > Does this check intentionally allow moving pages from a > PROT_READ|PROT_WRITE anonymous private VMA into a PROT_READ anonymous > private VMA (on architectures like x86 and arm64 where CoW memory has > the same protection flags as read-only memory), but forbid moving them > from a PROT_READ|PROT_EXEC VMA into a PROT_READ VMA? I think this > check needs at least a comment to explain what's going on here. > > > + /* only allow remapping if both are mlocked or both aren't */ > > + if ((src_vma->vm_flags & VM_LOCKED) ^ (dst_vma->vm_flags & VM_LOCKED)) > > + goto out; > > + > > + /* > > + * Be strict and only allow remap_pages if either the src or > > + * dst range is registered in the userfaultfd to prevent > > + * userland errors going unnoticed. As far as the VM > > + * consistency is concerned, it would be perfectly safe to > > + * remove this check, but there's no useful usage for > > + * remap_pages ouside of userfaultfd registered ranges. This > > + * is after all why it is an ioctl belonging to the > > + * userfaultfd and not a syscall. > > + * > > + * Allow both vmas to be registered in the userfaultfd, just > > + * in case somebody finds a way to make such a case useful. > > + * Normally only one of the two vmas would be registered in > > + * the userfaultfd. > > + */ > > + if (!dst_vma->vm_userfaultfd_ctx.ctx && > > + !src_vma->vm_userfaultfd_ctx.ctx) > > + goto out; > > + > > + /* > > + * FIXME: only allow remapping across anonymous vmas, > > + * tmpfs should be added. > > + */ > > + if (src_vma->vm_ops || dst_vma->vm_ops) > > + goto out; > > I don't think it's okay to check for anonymous VMAs by checking > ->vm_ops. There are some weird drivers whose ->mmap helpers don't set > ->vm_ops and instead just shove all the necessary PTEs into the VMA > right on ->mmap, so I think they end up with ->vm_ops==NULL. For > example, kcov_mmap() looks that way. I'm not sure how common this is. > > Though, uuuuuh, I guess if that's true, the existing > vma_is_anonymous() is broken, since that also just checks ->vm_ops? > I'm not sure what the consequences of that would be... Either way, > vma_is_anonymous() might be the better way to check for anonymous VMAs > here, and someone should figure out whether vma_is_anonymous() needs > to be fixed. > > > + /* > > + * Ensure the dst_vma has a anon_vma or this page > > + * would get a NULL anon_vma when moved in the > > + * dst_vma. > > + */ > > + err = -ENOMEM; > > + if (unlikely(anon_vma_prepare(dst_vma))) > > + goto out; > > + > > + for (src_addr = src_start, dst_addr = dst_start; > > + src_addr < src_start + len;) { > > + spinlock_t *ptl; > > + pmd_t dst_pmdval; > > + > > + BUG_ON(dst_addr >= dst_start + len); > > + src_pmd = mm_find_pmd(src_mm, src_addr); > > (this would blow up pretty badly if we could have transparent huge PUD > in the region but I think that's limited to file VMAs so it's fine as > it currently is) > > > + if (unlikely(!src_pmd)) { > > + if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) { > > + err = -ENOENT; > > + break; > > + } > > + src_pmd = mm_alloc_pmd(src_mm, src_addr); > > + if (unlikely(!src_pmd)) { > > + err = -ENOMEM; > > + break; > > + } > > + } > > + dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); > > + if (unlikely(!dst_pmd)) { > > + err = -ENOMEM; > > + break; > > + } > > + > > + dst_pmdval = pmdp_get_lockless(dst_pmd); > > + /* > > + * If the dst_pmd is mapped as THP don't > > + * override it and just be strict. > > + */ > > + if (unlikely(pmd_trans_huge(dst_pmdval))) { > > + err = -EEXIST; > > + break; > > + } > > This check is racy because the dst_pmd can still change at this point, > from previously pointing to a zeroed PMD to now pointing to a > hugepage, right? And we rely on remap_pages_pte() and > remap_pages_huge_pmd() to recheck for that? > If yes, maybe add a comment noting this and explaining why we want this check. > > > + ptl = pmd_trans_huge_lock(src_pmd, src_vma); > > + if (ptl) { > > + /* > > + * Check if we can move the pmd without > > + * splitting it. First check the address > > + * alignment to be the same in src/dst. These > > + * checks don't actually need the PT lock but > > + * it's good to do it here to optimize this > > + * block away at build time if > > + * CONFIG_TRANSPARENT_HUGEPAGE is not set. > > + */ > > + if (thp_aligned == -1) > > + thp_aligned = ((src_addr & ~HPAGE_PMD_MASK) == > > + (dst_addr & ~HPAGE_PMD_MASK)); > > + if (!thp_aligned || (src_addr & ~HPAGE_PMD_MASK) || > > This seems overly complicated, the only case when you can move a huge > PMD is if both addresses are hugepage-aligned and you have enough > length for one hugepage: > > (src_addr & ~HPAGE_PMD_MASK) == 0 && (dst_addr & ~HPAGE_PMD_MASK) == 0 > && (src_start + len - src_addr >= HPAGE_PMD_SIZE). > > > + !pmd_none(dst_pmdval) || > > + src_start + len - src_addr < HPAGE_PMD_SIZE) { > > + spin_unlock(ptl); > > + /* Fall through */ > > + split_huge_pmd(src_vma, src_pmd, src_addr); > > + } else { > > + err = remap_pages_huge_pmd(dst_mm, > > + src_mm, > > + dst_pmd, > > + src_pmd, > > + dst_pmdval, > > + dst_vma, > > + src_vma, > > + dst_addr, > > + src_addr); > > + cond_resched(); > > + > > + if (!err) { > > + dst_addr += HPAGE_PMD_SIZE; > > + src_addr += HPAGE_PMD_SIZE; > > + moved += HPAGE_PMD_SIZE; > > + } > > + > > + if ((!err || err == -EAGAIN) && > > + fatal_signal_pending(current)) > > + err = -EINTR; > > + > > + if (err && err != -EAGAIN) > > + break; > > + > > + continue; > > + } > > + } > > + > > + if (pmd_none(*src_pmd)) { > > + if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) { > > + err = -ENOENT; > > + break; > > + } > > + if (unlikely(__pte_alloc(src_mm, src_pmd))) { > > + err = -ENOMEM; > > + break; > > + } > > + } > > + > > + if (unlikely(pmd_none(dst_pmdval)) && > > + unlikely(__pte_alloc(dst_mm, dst_pmd))) { > > Maybe just use pte_alloc() here? > > > + err = -ENOMEM; > > + break; > > + } > > + > > + err = remap_pages_pte(dst_mm, src_mm, > > + dst_pmd, src_pmd, > > + dst_vma, src_vma, > > + dst_addr, src_addr, > > + mode); > > + > > + cond_resched(); > > + > > + if (!err) { > > + dst_addr += PAGE_SIZE; > > + src_addr += PAGE_SIZE; > > + moved += PAGE_SIZE; > > + } > > + > > + if ((!err || err == -EAGAIN) && > > + fatal_signal_pending(current)) > > + err = -EINTR; > > + > > + if (err && err != -EAGAIN) > > + break; > > + } > > + > > +out: > > + mmap_read_unlock(dst_mm); > > + if (dst_mm != src_mm) > > + mmap_read_unlock(src_mm); > > + BUG_ON(moved < 0); > > + BUG_ON(err > 0); > > + BUG_ON(!moved && !err); > > + return moved ? moved : err; > > +} > > Maybe you could try whether this function would look simpler with a > shape roughly like: > > for (src_addr = ...; src_addr < ...;) { > unsigned long step_size; > > if (hugepage case) { > if (have to split) { > split it; > continue; > } > step_size = HPAGE_PMD_SIZE; > ... > } else { > ... 4k case ... > step_size = PAGE_SIZE; > } > ... > cond_resched(); > if (!err) { > dst_addr += step_size; > src_addr += step_size; > moved += step_size; > } > ... > } I'll need some time to gather the answers to all your questions and will reply once I have them ready. Thanks for reviewing, Jann!