Hey David,

Thanks for taking the time to review.

On Wed, Apr 17, 2024 at 11:02 PM David Hildenbrand <david@xxxxxxxxxx> wrote:
>
> On 17.04.24 16:11, Lance Yang wrote:
> > When the user no longer requires the pages, they would use
> > madvise(MADV_FREE) to mark the pages as lazy free. IMO, they would
> > not typically write to the given range again.
> >
> > At present, a PMD-mapped THP that is marked as lazyfree is
> > unconditionally split during shrink_folio_list(), which may be
> > unnecessary. If the THP is exclusively mapped and clean, and the
> > PMD associated with it is also clean, then we can attempt to remove
> > the PMD mapping from it. This change will improve the efficiency of
> > memory reclamation in this case.
> >
> > On an Intel i5 CPU, reclaiming 1GiB of PMD-mapped THPs using
> > mem_cgroup_force_empty() results in the following runtimes in
> > seconds (shorter is better):
> >
> > --------------------------------------------
> > |    Old     |    New     |    Change     |
> > --------------------------------------------
> > |  0.683426  |  0.049197  |    -92.80%    |
> > --------------------------------------------
> >
> > Signed-off-by: Lance Yang <ioworker0@xxxxxxxxx>
> > ---
> >  include/linux/huge_mm.h |  1 +
> >  include/linux/rmap.h    |  1 +
> >  mm/huge_memory.c        |  2 +-
> >  mm/rmap.c               | 81 +++++++++++++++++++++++++++++++++++++++++
> >  mm/vmscan.c             |  7 ++++
> >  5 files changed, 91 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 7cd07b83a3d0..02a71c05f68a 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -36,6 +36,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
> >  int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >  		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
> >  		    unsigned long cp_flags);
> > +inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd);
> >
> >  vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
> >  vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
> > diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> > index 0f906dc6d280..8c2f45713351 100644
> > --- a/include/linux/rmap.h
> > +++ b/include/linux/rmap.h
> > @@ -100,6 +100,7 @@ enum ttu_flags {
> >  					 * do a final flush if necessary */
> >  	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
> >  					 * caller holds it */
> > +	TTU_LAZYFREE_THP	= 0x100, /* avoid splitting PMD-mapped THP */
> >  };
> >
> >  #ifdef CONFIG_MMU
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 58f2c4745d80..309fba9624c2 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1801,7 +1801,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >  	return ret;
> >  }
> >
> > -static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
> > +inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
> >  {
> >  	pgtable_t pgtable;
> >
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index 2608c40dffad..4994f9e402d4 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -77,6 +77,7 @@
> >  #include <linux/mm_inline.h>
> >
> >  #include <asm/tlbflush.h>
> > +#include <asm/tlb.h>
> >
> >  #define CREATE_TRACE_POINTS
> >  #include <trace/events/tlb.h>
> > @@ -1606,6 +1607,80 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
> >  #endif
> >  }
> >
> > +static bool __try_to_unmap_lazyfree_thp(struct vm_area_struct *vma,
> > +					unsigned long address,
> > +					struct folio *folio)
> > +{
> > +	spinlock_t *ptl;
> > +	pmd_t *pmdp, orig_pmd;
> > +	struct mmu_notifier_range range;
> > +	struct mmu_gather tlb;
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	struct page *page;
> > +	bool ret = false;
> > +
> > +	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
> > +	VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
> > +	VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
> > +	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
> > +
> > +	/*
> > +	 * If we encounter a PMD-mapped THP that is marked as lazyfree,
> > +	 * we will try to unmap it without splitting.
> > +	 *
> > +	 * An exclusively mapped folio should only have two refs:
> > +	 * one from the isolation and one from the rmap.
> > +	 */
> > +	if (folio_entire_mapcount(folio) != 1 || folio_test_dirty(folio) ||
> > +	    folio_ref_count(folio) != 2)
>
> folio_mapcount() == 1 is a bit nicer. But I assume you can drop that
> completely and only check the refcount?

Thanks for your suggestion!

+	if (folio_test_dirty(folio) || folio_ref_count(folio) != 2)

I'm not sure it's safe without checking the folio_mapcount(), though.
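Just to be sure I read your first suggestion right, the folio_mapcount()
variant would look something like this (sketch only; whether the mapcount
test is truly redundant once the refcount is pinned down is exactly the
part I can't convince myself of):

+	/*
+	 * folio_mapcount() == 1 instead of folio_entire_mapcount() == 1;
+	 * dropping it entirely would rely on refcount == 2 (isolation +
+	 * rmap) implying a single mapping.
+	 */
+	if (folio_mapcount(folio) != 1 || folio_test_dirty(folio) ||
+	    folio_ref_count(folio) != 2)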
> > +		return false;
> > +
> > +	pmdp = mm_find_pmd(mm, address);
> > +	if (unlikely(!pmdp))
> > +		return false;
> > +	if (pmd_dirty(*pmdp))
> > +		return false;
> > +
> > +	tlb_gather_mmu(&tlb, mm);
> > +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> > +				address & HPAGE_PMD_MASK,
> > +				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
> > +	mmu_notifier_invalidate_range_start(&range);
> > +
> > +	ptl = pmd_lock(mm, pmdp);
> > +	orig_pmd = *pmdp;
> > +	if (unlikely(!pmd_present(orig_pmd) || !pmd_trans_huge(orig_pmd)))
> > +		goto out;
> > +
> > +	page = pmd_page(orig_pmd);
> > +	if (unlikely(page_folio(page) != folio))
> > +		goto out;
> > +
> > +	orig_pmd = pmdp_huge_get_and_clear(mm, address, pmdp);
> > +	tlb_remove_pmd_tlb_entry(&tlb, pmdp, address);
>
> Until this point, the page could have been pinned (including GUP-fast)
> and we might be in trouble if we drop it.

Thanks for pointing that out!

+	if (pmd_dirty(orig_pmd) || folio_maybe_dma_pinned(folio) ||
+	    folio_ref_count(folio) != 2) {
+		set_pmd_at(mm, address, pmdp, orig_pmd);
+	} else {

Could I check folio->_pincount using folio_maybe_dma_pinned() and then
re-check the refcount here? Or should I just re-check the refcount?
IIUC, this folio has already been unlinked from the PMD, so the process
cannot take an additional pin on it. (I've put a fuller sketch of what
I mean in a P.S. below.)

Thanks again for the review!
Lance

>
> --
> Cheers,
>
> David / dhildenb
>
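P.S. To make the re-check concrete, here is the rough shape I have in
mind for the window after the PMD has been cleared. This is only an
untested sketch, and the success path (rmap removal, the deposited page
table, the counter update, and the ref drop) is my guess at the
bookkeeping that would be needed:

	orig_pmd = pmdp_huge_get_and_clear(mm, address, pmdp);
	tlb_remove_pmd_tlb_entry(&tlb, pmdp, address);

	/*
	 * Re-check under the PMD lock: a concurrent GUP-fast could have
	 * pinned the folio, or the hardware could have set the dirty
	 * bit, at any point before the PMD was cleared above.
	 */
	if (pmd_dirty(orig_pmd) || folio_maybe_dma_pinned(folio) ||
	    folio_ref_count(folio) != 2) {
		/* We raced with a pin or a write; restore the mapping. */
		set_pmd_at(mm, address, pmdp, orig_pmd);
	} else {
		folio_remove_rmap_pmd(folio, page, vma);
		zap_deposited_table(mm, pmdp);
		add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		folio_put(folio);
		ret = true;
	}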