On 2/17/21 12:46 PM, Peter Xu wrote: > Huge pmd sharing for hugetlbfs is racy with userfaultfd-wp because > userfaultfd-wp is always based on pgtable entries, so they cannot be shared. > > Walk the hugetlb range and unshare all such mappings if there is, right before > UFFDIO_REGISTER will succeed and return to userspace. > > This will pair with want_pmd_share() in hugetlb code so that huge pmd sharing > is completely disabled for userfaultfd-wp registered range. > > Signed-off-by: Peter Xu <peterx@xxxxxxxxxx> > --- > fs/userfaultfd.c | 4 ++++ > include/linux/hugetlb.h | 1 + > mm/hugetlb.c | 51 +++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 56 insertions(+) > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > index 894cc28142e7..e259318fcae1 100644 > --- a/fs/userfaultfd.c > +++ b/fs/userfaultfd.c > @@ -15,6 +15,7 @@ > #include <linux/sched/signal.h> > #include <linux/sched/mm.h> > #include <linux/mm.h> > +#include <linux/mmu_notifier.h> > #include <linux/poll.h> > #include <linux/slab.h> > #include <linux/seq_file.h> > @@ -1448,6 +1449,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, > vma->vm_flags = new_flags; > vma->vm_userfaultfd_ctx.ctx = ctx; > > + if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) > + hugetlb_unshare_all_pmds(vma); > + > skip: > prev = vma; > start = vma->vm_end; > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index 3b4104021dd3..97ecfd4c20b2 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -188,6 +188,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, > unsigned long address, unsigned long end, pgprot_t newprot); > > bool is_hugetlb_entry_migration(pte_t pte); > +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); > > #else /* !CONFIG_HUGETLB_PAGE */ > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index f53a0b852ed8..83c006ea3ff9 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -5723,4 +5723,55 @@ void __init hugetlb_cma_check(void) > pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); > } > > +/* > + * This function will unconditionally remove all the shared pmd pgtable entries > + * within the specific vma for a hugetlbfs memory range. > + */ Thanks for updating this! > +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) > +{ > + struct hstate *h = hstate_vma(vma); > + unsigned long sz = huge_page_size(h); > + struct mm_struct *mm = vma->vm_mm; > + struct mmu_notifier_range range; > + unsigned long address, start, end; > + spinlock_t *ptl; > + pte_t *ptep; > + > + if (!(vma->vm_flags & VM_MAYSHARE)) > + return; > + > + start = ALIGN(vma->vm_start, PUD_SIZE); > + end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); > + > + if (start >= end) > + return; > + > + /* > + * No need to call adjust_range_if_pmd_sharing_possible(), because > + * we're going to operate on the whole vma not necessary, but perhaps change to: * we're going to operate on ever PUD_SIZE aligned sized range * within the vma. > + * we're going to operate on the whole vma > + */ > + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, > + vma->vm_start, vma->vm_end); Should we use start, end here instead of vma->vm_start, vma->vm_end ? > + mmu_notifier_invalidate_range_start(&range); > + i_mmap_lock_write(vma->vm_file->f_mapping); > + for (address = start; address < end; address += PUD_SIZE) { > + unsigned long tmp = address; > + > + ptep = huge_pte_offset(mm, address, sz); > + if (!ptep) > + continue; > + ptl = huge_pte_lock(h, mm, ptep); > + /* We don't want 'address' to be changed */ > + huge_pmd_unshare(mm, vma, &tmp, ptep); > + spin_unlock(ptl); > + } > + flush_hugetlb_tlb_range(vma, vma->vm_start, vma->vm_end); start, end ? -- Mike Kravetz > + i_mmap_unlock_write(vma->vm_file->f_mapping); > + /* > + * No need to call mmu_notifier_invalidate_range(), see > + * Documentation/vm/mmu_notifier.rst. > + */ > + mmu_notifier_invalidate_range_end(&range); > +} > + > #endif /* CONFIG_CMA */ >