On Fri, Dec 20, 2024 at 07:52:43PM +0100, David Hildenbrand wrote:
> On 17.12.24 06:12, Alistair Popple wrote:
> > Currently DAX folio/page reference counts are managed differently to
> > normal pages. To allow these to be managed the same as normal pages
> > introduce vmf_insert_folio_pud. This will map the entire PUD-sized folio
> > and take references as it would for a normally mapped page.
> >
> > This is distinct from the current mechanism, vmf_insert_pfn_pud, which
> > simply inserts a special devmap PUD entry into the page table without
> > holding a reference to the page for the mapping.
> >
> > Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
> > ---
> >  include/linux/huge_mm.h | 11 +++++-
> >  mm/huge_memory.c        | 96 ++++++++++++++++++++++++++++++++++++------
> >  2 files changed, 95 insertions(+), 12 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 93e509b..012137b 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -39,6 +39,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >  vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
> >  vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
> > +vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, bool write);
> >  enum transparent_hugepage_flag {
> >  	TRANSPARENT_HUGEPAGE_UNSUPPORTED,
> > @@ -458,6 +459,11 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
> >  	return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
> >  }
> > +static inline bool is_huge_zero_pud(pud_t pud)
> > +{
> > +	return false;
> > +}
> > +
> >  struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
> >  void mm_put_huge_zero_folio(struct mm_struct *mm);
> > @@ -604,6 +610,11 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
> >  	return false;
> >  }
> > +static inline bool is_huge_zero_pud(pud_t pud)
> > +{
> > +	return false;
> > +}
> > +
>
> I'm really not a fan of these, because I assume we will never ever implement
> these any time soon. (who will waste 1 GiG or more on faster reading of 0s?)

Ok, I will drop these.

> >  static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
> >  {
> >  	return;
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 120cd2c..5081808 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1482,19 +1482,17 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
> >  	struct mm_struct *mm = vma->vm_mm;
> >  	pgprot_t prot = vma->vm_page_prot;
> >  	pud_t entry;
> > -	spinlock_t *ptl;
> > -	ptl = pud_lock(mm, pud);
> >  	if (!pud_none(*pud)) {
> >  		if (write) {
> >  			if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
> > -				goto out_unlock;
> > +				return;
> >  			entry = pud_mkyoung(*pud);
> >  			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
> >  			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
> >  				update_mmu_cache_pud(vma, addr, pud);
> >  		}
> > -		goto out_unlock;
> > +		return;
> >  	}
> >  	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
> > @@ -1508,9 +1506,6 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
> >  	}
> >  	set_pud_at(mm, addr, pud, entry);
> >  	update_mmu_cache_pud(vma, addr, pud);
> > -
> > -out_unlock:
> > -	spin_unlock(ptl);
> >  }
> >  /**
> > @@ -1528,6 +1523,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
> >  	unsigned long addr = vmf->address & PUD_MASK;
> >  	struct vm_area_struct *vma = vmf->vma;
> >  	pgprot_t pgprot = vma->vm_page_prot;
> > +	spinlock_t *ptl;
> >  	/*
> >  	 * If we had pud_special, we could avoid all these restrictions,
> > @@ -1545,10 +1541,55 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
> >  	track_pfn_insert(vma, &pgprot, pfn);
> > +	ptl = pud_lock(vma->vm_mm, vmf->pud);
> >  	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
> > +	spin_unlock(ptl);
> > +
> >  	return VM_FAULT_NOPAGE;
> >  }
> >  EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
> > +
> > +/**
> > + * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
> > + * @vmf: Structure describing the fault
> > + * @pfn: pfn of the page to insert
> > + * @write: whether it's a write fault
> > + *
> > + * Return: vm_fault_t value.
> > + */
> > +vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, bool write)
> > +{
> > +	struct vm_area_struct *vma = vmf->vma;
> > +	unsigned long addr = vmf->address & PUD_MASK;
> > +	pfn_t pfn = pfn_to_pfn_t(folio_pfn(folio));
> > +	pud_t *pud = vmf->pud;
> > +	pgprot_t prot = vma->vm_page_prot;
>
> See below, pfn, prot and page can likely go.
>
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	spinlock_t *ptl;
> > +	struct page *page;
> > +
> > +	if (addr < vma->vm_start || addr >= vma->vm_end)
> > +		return VM_FAULT_SIGBUS;
> > +
> > +	if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
> > +		return VM_FAULT_SIGBUS;
> > +
> > +	track_pfn_insert(vma, &prot, pfn);
>
> Oh, why is that required? We are inserting a folio and start messing with
> VM_PAT on x86 that only applies to VM_PFNMAP mappings? :)
>
> > +
> > +	ptl = pud_lock(mm, pud);
> > +	if (pud_none(*vmf->pud)) {
> > +		page = pfn_t_to_page(pfn);
>
> Why are we suddenly working with that pfn_t witchcraft? :)

Heh. Mostly because insert_pfn_{pmd|pud}() still requires such witchcraft. Of
course once this series is merged there is a reasonable chance we will be able
to cast a spell to remove pfn_t entirely -
https://lore.kernel.org/linux-mm/172721874675.497781.3277495908107141898.stgit@xxxxxxxxxxxxxxxxxxxxxxxxx/

> >
> > +		folio = page_folio(page);
>
> Ehm, you got the folio ... passed into this function?
>
> Why can't that simply be
>
> folio_get(folio);
> folio_add_file_rmap_pud(folio, folio_page(folio, 0), vma);

No good reason. Likely it is a hangover from my original implementation, where
this function was DAX specific and took a pfn_t instead of a folio. Peter Xu
asked for a more generic version that took a folio and clearly I did a
somewhat lazy implementation of it. Will clean it up as suggested.

> > +		folio_get(folio);
> > +		folio_add_file_rmap_pud(folio, page, vma);
> > +		add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
> > +	}
> > +	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
> > +	spin_unlock(ptl);
> > +
> > +	return VM_FAULT_NOPAGE;
> > +}
> > +EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
> >  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
> >  void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
> > @@ -2146,7 +2187,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >  		zap_deposited_table(tlb->mm, pmd);
> >  		spin_unlock(ptl);
> >  	} else if (is_huge_zero_pmd(orig_pmd)) {
> > -		zap_deposited_table(tlb->mm, pmd);
> > +		if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
> > +			zap_deposited_table(tlb->mm, pmd);
> >  		spin_unlock(ptl);
> >  	} else {
> >  		struct folio *folio = NULL;
> > @@ -2634,12 +2676,24 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >  	orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
> >  	arch_check_zapped_pud(vma, orig_pud);
> >  	tlb_remove_pud_tlb_entry(tlb, pud, addr);
> > -	if (vma_is_special_huge(vma)) {
> > +	if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
> >  		spin_unlock(ptl);
> >  		/* No zero page support yet */
> >  	} else {
> > -		/* No support for anonymous PUD pages yet */
> > -		BUG();
> > +		struct page *page = NULL;
> > +		struct folio *folio;
> > +
> > +		/* No support for anonymous PUD pages or migration yet */
> > +		BUG_ON(vma_is_anonymous(vma) || !pud_present(orig_pud));
>
> VM_WARN_ON_ONCE().
>
> > +
> > +		page = pud_page(orig_pud);
> > +		folio = page_folio(page);
> > +		folio_remove_rmap_pud(folio, page, vma);
> > +		VM_BUG_ON_PAGE(!PageHead(page), page);
>
> Please drop that or do something like
>
> VM_WARN_ON_ONCE(page != folio_page(folio, 0));
>
> > +		add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
> > +
> > +		spin_unlock(ptl);
> > +		tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
> >  	}
> >  	return 1;
> >  }
> > @@ -2647,6 +2701,8 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >  static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
> >  		unsigned long haddr)
> >  {
> > +	pud_t old_pud;
> > +
> >  	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
> >  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
> >  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
> > @@ -2654,7 +2710,23 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
> >  	count_vm_event(THP_SPLIT_PUD);
> > -	pudp_huge_clear_flush(vma, haddr, pud);
> > +	old_pud = pudp_huge_clear_flush(vma, haddr, pud);
> > +	if (is_huge_zero_pud(old_pud))
> > +		return;
> > +
> > +	if (vma_is_dax(vma)) {
>
> Maybe you want
>
> if (!vma_is_dax(vma))
> 	return;
>
> To then reduce alignment. I suspect all other splitting code (besides anon
> memory) will want to do the same thing here in the future.

Yeah, that looks more readable. Thanks!

> --
> Cheers,
>
> David / dhildenb
>
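For completeness, the simplified mapping path in vmf_insert_folio_pud() that I
have in mind for the next revision, based on your suggestion, is roughly the
following. This is only an untested sketch and the final form may still change
(for example depending on whether track_pfn_insert() and the pfn/prot/page
locals go away entirely):

	ptl = pud_lock(mm, pud);
	if (pud_none(*vmf->pud)) {
		/* Take a reference and rmap on the passed-in folio directly */
		folio_get(folio);
		folio_add_file_rmap_pud(folio, folio_page(folio, 0), vma);
		add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
	}
	insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)), write);
	spin_unlock(ptl);

i.e. no pfn_t_to_page()/page_folio() round-trip on the folio we were already
given.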