On Wed, Feb 17, 2021 at 05:46:30PM -0800, Mike Kravetz wrote:
> On 2/17/21 12:46 PM, Peter Xu wrote:
> > Huge pmd sharing for hugetlbfs is racy with userfaultfd-wp because
> > userfaultfd-wp is always based on pgtable entries, so they cannot be shared.
> > 
> > Walk the hugetlb range and unshare all such mappings if there is, right before
> > UFFDIO_REGISTER will succeed and return to userspace.
> > 
> > This will pair with want_pmd_share() in hugetlb code so that huge pmd sharing
> > is completely disabled for userfaultfd-wp registered range.
> > 
> > Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
> > ---
> >  fs/userfaultfd.c        |  4 ++++
> >  include/linux/hugetlb.h |  1 +
> >  mm/hugetlb.c            | 51 +++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 56 insertions(+)
> > 
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 894cc28142e7..e259318fcae1 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -15,6 +15,7 @@
> >  #include <linux/sched/signal.h>
> >  #include <linux/sched/mm.h>
> >  #include <linux/mm.h>
> > +#include <linux/mmu_notifier.h>
> >  #include <linux/poll.h>
> >  #include <linux/slab.h>
> >  #include <linux/seq_file.h>
> > @@ -1448,6 +1449,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  		vma->vm_flags = new_flags;
> >  		vma->vm_userfaultfd_ctx.ctx = ctx;
> >  
> > +		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
> > +			hugetlb_unshare_all_pmds(vma);
> > +
> >  	skip:
> >  		prev = vma;
> >  		start = vma->vm_end;
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 3b4104021dd3..97ecfd4c20b2 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -188,6 +188,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
> >  		unsigned long address, unsigned long end, pgprot_t newprot);
> >  
> >  bool is_hugetlb_entry_migration(pte_t pte);
> > +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
> >  
> >  #else /* !CONFIG_HUGETLB_PAGE */
> >  
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index f53a0b852ed8..83c006ea3ff9 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -5723,4 +5723,55 @@ void __init hugetlb_cma_check(void)
> >  		pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
> >  }
> >  
> > +/*
> > + * This function will unconditionally remove all the shared pmd pgtable entries
> > + * within the specific vma for a hugetlbfs memory range.
> > + */
> 
> Thanks for updating this!
> 
> > +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
> > +{
> > +	struct hstate *h = hstate_vma(vma);
> > +	unsigned long sz = huge_page_size(h);
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	struct mmu_notifier_range range;
> > +	unsigned long address, start, end;
> > +	spinlock_t *ptl;
> > +	pte_t *ptep;
> > +
> > +	if (!(vma->vm_flags & VM_MAYSHARE))
> > +		return;
> > +
> > +	start = ALIGN(vma->vm_start, PUD_SIZE);
> > +	end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
> > +
> > +	if (start >= end)
> > +		return;
> > +
> > +	/*
> > +	 * No need to call adjust_range_if_pmd_sharing_possible(), because
> > +	 * we're going to operate on the whole vma
> 
> not necessary, but perhaps change to:
> 	 * we're going to operate on every PUD_SIZE aligned sized range
> 	 * within the vma.
> 
> > +	 * we're going to operate on the whole vma
> > +	 */
> > +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
> > +				vma->vm_start, vma->vm_end);
> 
> Should we use start, end here instead of vma->vm_start, vma->vm_end ?
> 
> > +	mmu_notifier_invalidate_range_start(&range);
> > +	i_mmap_lock_write(vma->vm_file->f_mapping);
> > +	for (address = start; address < end; address += PUD_SIZE) {
> > +		unsigned long tmp = address;
> > +
> > +		ptep = huge_pte_offset(mm, address, sz);
> > +		if (!ptep)
> > +			continue;
> > +		ptl = huge_pte_lock(h, mm, ptep);
> > +		/* We don't want 'address' to be changed */
> > +		huge_pmd_unshare(mm, vma, &tmp, ptep);
> > +		spin_unlock(ptl);
> > +	}
> > +	flush_hugetlb_tlb_range(vma, vma->vm_start, vma->vm_end);
> 
> start, end ?

Right, we can even shrink the notifier range. I'll respin shortly.

Thanks,

-- 
Peter Xu
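
For reference, a rough sketch of how hugetlb_unshare_all_pmds() might look with
both suggestions above folded in: the comment reworded as Mike proposed, and the
mmu notifier plus the TLB flush shrunk to the PUD-aligned [start, end) range
instead of the whole vma. This is only an illustration of the suggested change,
not the actual respin; the unlock and notifier-end calls at the bottom are
assumed to mirror the lock and notifier-start calls quoted above.

void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
	struct hstate *h = hstate_vma(vma);
	unsigned long sz = huge_page_size(h);
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	unsigned long address, start, end;
	spinlock_t *ptl;
	pte_t *ptep;

	if (!(vma->vm_flags & VM_MAYSHARE))
		return;

	/* Only the PUD_SIZE-aligned part of the vma can have shared pmds */
	start = ALIGN(vma->vm_start, PUD_SIZE);
	end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);

	if (start >= end)
		return;

	/*
	 * No need to call adjust_range_if_pmd_sharing_possible(), because
	 * we're going to operate on every PUD_SIZE aligned sized range
	 * within the vma.  The notifier and the TLB flush below therefore
	 * only need to cover [start, end) rather than the whole vma.
	 */
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				start, end);
	mmu_notifier_invalidate_range_start(&range);
	i_mmap_lock_write(vma->vm_file->f_mapping);
	for (address = start; address < end; address += PUD_SIZE) {
		unsigned long tmp = address;

		ptep = huge_pte_offset(mm, address, sz);
		if (!ptep)
			continue;
		ptl = huge_pte_lock(h, mm, ptep);
		/* We don't want 'address' to be changed */
		huge_pmd_unshare(mm, vma, &tmp, ptep);
		spin_unlock(ptl);
	}
	flush_hugetlb_tlb_range(vma, start, end);
	i_mmap_unlock_write(vma->vm_file->f_mapping);
	mmu_notifier_invalidate_range_end(&range);
}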