On 04/09/2024 11:09, Dev Jain wrote: > Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and > replace it with a PMD-mapped THP. Change the helpers introduced in the > previous patch to flush TLB entry corresponding to the hugezeropage, > and preserve PMD uffd-wp marker. In case of failure, fallback to > splitting the PMD. > > Signed-off-by: Dev Jain <dev.jain@xxxxxxx> > --- > include/linux/huge_mm.h | 6 ++++ > mm/huge_memory.c | 79 +++++++++++++++++++++++++++++++++++------ > mm/memory.c | 5 +-- > 3 files changed, 78 insertions(+), 12 deletions(-) > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index e25d9ebfdf89..fdd2cf473a3c 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -9,6 +9,12 @@ > #include <linux/kobject.h> > > vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); > +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma, > + unsigned long haddr, struct folio **foliop, > + unsigned long addr); > +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, > + struct vm_area_struct *vma, unsigned long haddr, > + pgtable_t pgtable); I don't think you are using either of these outside of huge_memory.c, so not sure you need to declare them here or make them non-static? > int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, > pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, > struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 58125fbcc532..150163ad77d3 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, > } > EXPORT_SYMBOL_GPL(thp_get_unmapped_area); > > -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma, > - unsigned long haddr, struct folio **foliop, > - unsigned long addr) > +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma, > + unsigned long haddr, struct folio **foliop, > + unsigned long addr) > { > struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true); > > @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order) > count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); > } > > -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, > - struct vm_area_struct *vma, unsigned long haddr, > - pgtable_t pgtable) > +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, > + struct vm_area_struct *vma, unsigned long haddr, > + pgtable_t pgtable) > { > - pmd_t entry; > + pmd_t entry, old_pmd; > + bool is_pmd_none = pmd_none(*vmf->pmd); > > entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); > entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); > folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); > folio_add_lru_vma(folio, vma); > - pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); > + if (!is_pmd_none) { > + old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd); > + if (pmd_uffd_wp(old_pmd)) > + entry = pmd_mkuffd_wp(entry); I don't really get this; entry is writable, so I wouldn't expect to also be setting uffd-wp here? That combination is not allowed and is checked for in page_table_check_pte_flags(). It looks like you expect to get here in the uffd-wp-async case, which used to cause the pmd to be split to ptes. I'm guessing (but don't know for sure) that would cause the uffd-wp bit to be set in each of the new ptes, then during fallback to handling the wp fault on the pte, uffd would handle it? > + } > + if (pgtable) > + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); Should this call be moved outside of here? It doesn't really feel like it belongs. Could it be called before calling map_pmd_thp() for the site that has a pgtable? > set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); > update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); > add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); > - mm_inc_nr_ptes(vma->vm_mm); > + if (is_pmd_none) > + mm_inc_nr_ptes(vma->vm_mm); > } > > static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) > @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf) > spin_unlock(vmf->ptl); > } > > +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf, > + unsigned long haddr, > + struct folio *folio) > +{ > + struct vm_area_struct *vma = vmf->vma; > + vm_fault_t ret = 0; > + > + ret = check_stable_address_space(vma->vm_mm); > + if (ret) > + goto out; > + map_pmd_thp(folio, vmf, vma, haddr, NULL); > +out: > + return ret; > +} > + > +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr) > +{ > + struct vm_area_struct *vma = vmf->vma; > + gfp_t gfp = vma_thp_gfp_mask(vma); > + struct mmu_notifier_range range; > + struct folio *folio = NULL; > + vm_fault_t ret = 0; > + > + ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio, > + vmf->address); Just checking: the PTE table was already allocated during the read fault, right? So we don't have to allocate it here. > + if (ret) > + goto out; > + > + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr, > + haddr + HPAGE_PMD_SIZE); > + mmu_notifier_invalidate_range_start(&range); > + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); > + if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) > + goto unlock; > + ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio); > + if (!ret) > + __thp_fault_success_stats(vma, HPAGE_PMD_ORDER); > +unlock: > + spin_unlock(vmf->ptl); > + mmu_notifier_invalidate_range_end(&range); I'll confess I don't understand all the mmu notifier rules. But the doc at Documentation/mm/mmu_notifier.rst implies that the notification must be done while holding the PTL. Although that's not how wp_page_copy(). Are you confident what you have done is correct? Thanks, Ryan > +out: > + return ret; > +} > + > vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) > { > const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; > @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) > vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); > VM_BUG_ON_VMA(!vma->anon_vma, vma); > > - if (is_huge_zero_pmd(orig_pmd)) > + if (is_huge_zero_pmd(orig_pmd)) { > + vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr); > + > + if (!(ret & VM_FAULT_FALLBACK)) > + return ret; > + > + /* Fallback to splitting PMD if THP cannot be allocated */ > goto fallback; > + } > > spin_lock(vmf->ptl); > > diff --git a/mm/memory.c b/mm/memory.c > index 3c01d68065be..c081a25f5173 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) > if (vma_is_anonymous(vma)) { > if (likely(!unshare) && > userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { > - if (userfaultfd_wp_async(vmf->vma)) > + if (!userfaultfd_wp_async(vmf->vma)) > + return handle_userfault(vmf, VM_UFFD_WP); > + if (!is_huge_zero_pmd(vmf->orig_pmd)) > goto split; > - return handle_userfault(vmf, VM_UFFD_WP); > } > return do_huge_pmd_wp_page(vmf); > }