This patch optimizes lazyfreeing with PTE-mapped mTHP[1] (inspired by
David Hildenbrand[2]). We aim to avoid unnecessary folio splitting if the
large folio is fully mapped within the given range.

On an Intel i5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
the same size results in the following runtimes for madvise(MADV_FREE) in
seconds (shorter is better):

Folio Size |   Old    |   New    | Change
------------------------------------------
      4KiB | 0.590251 | 0.590259 |    0%
     16KiB | 2.990447 | 0.185655 |  -94%
     32KiB | 2.547831 | 0.104870 |  -95%
     64KiB | 2.457796 | 0.052812 |  -97%
    128KiB | 2.281034 | 0.032777 |  -99%
    256KiB | 2.230387 | 0.017496 |  -99%
    512KiB | 2.189106 | 0.010781 |  -99%
   1024KiB | 2.183949 | 0.007753 |  -99%
   2048KiB | 0.002799 | 0.002804 |    0%

[1] https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@xxxxxxx
[2] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@xxxxxxxxxx

Signed-off-by: Lance Yang <ioworker0@xxxxxxxxx>
---
This patch is rebased on Ryan's patchset[3] and follows a similar pattern
to madvise_cold_or_pageout_pte_range() for madvise_free_pte_range().

[3] https://lore.kernel.org/linux-mm/20240311150058.1122862-1-ryan.roberts@xxxxxxx

The performance numbers are from v2. I did a quick benchmark run of v3 and
nothing changed significantly.

v2 -> v3:
 * Only skip all the PTEs for nr_pages when the number of batched PTEs
   matches nr_pages, per Barry Song
 * Change folio_pte_batch() to take optional *any_young and *any_dirty
   pointers, per David Hildenbrand
 * Move the ptep_get_and_clear_full() loop into refresh_full_ptes(), per
   David Hildenbrand
 * Follow a similar pattern for madvise_free_pte_range(), per Ryan Roberts
 * https://lore.kernel.org/linux-mm/20240307061425.21013-1-ioworker0@xxxxxxxxx

v1 -> v2:
 * Update the performance numbers
 * Update the changelog, suggested by Ryan Roberts
 * Check the COW folio, suggested by Yin Fengwei
 * Check if we are mapping all subpages, suggested by Barry Song,
   David Hildenbrand, Ryan Roberts
 * https://lore.kernel.org/linux-mm/20240225123215.86503-1-ioworker0@xxxxxxxxx

 include/linux/pgtable.h |  48 ++++++++++++
 mm/internal.h           |  17 ++++-
 mm/madvise.c            | 159 +++++++++++++++++++++-------------------
 mm/memory.c             |   4 +-
 4 files changed, 149 insertions(+), 79 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 8cf1f2fe2c25..3ca9ae29e653 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -736,6 +736,54 @@ static inline void clear_not_present_full_ptes(struct mm_struct *mm,
 }
 #endif
 
+/* Flags for refresh_full_ptes(). */
+typedef int __bitwise rfp_t;
+
+/* Refresh PTEs after pte_mkold(), dropping the accessed bit. */
+#define RFP_CLEAR_YOUNG		((__force rfp_t)BIT(0))
+
+/* Refresh PTEs after pte_mkclean(), dropping the dirty bit. */
+#define RFP_CLEAR_DIRTY		((__force rfp_t)BIT(1))
+
+/**
+ * refresh_full_ptes - refresh present PTEs that map consecutive pages of the
+ *		       same folio.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to refresh.
+ * @flags: Flags to indicate the PTE refresh semantics.
+ * @full: Whether we are clearing a full mm.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void refresh_full_ptes(struct mm_struct *mm, unsigned long addr,
+		pte_t *ptep, unsigned int nr, rfp_t flags, int full)
+{
+	pte_t ptent;
+
+	for (;;) {
+		ptent = ptep_get_and_clear_full(mm, addr, ptep, full);
+
+		if (flags & RFP_CLEAR_YOUNG)
+			ptent = pte_mkold(ptent);
+
+		if (flags & RFP_CLEAR_DIRTY)
+			ptent = pte_mkclean(ptent);
+
+		set_pte_at(mm, addr, ptep, ptent);
+
+		if (--nr == 0)
+			break;
+		ptep++;
+		addr += PAGE_SIZE;
+	}
+}
+
 #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
 extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
 			      unsigned long address,
diff --git a/mm/internal.h b/mm/internal.h
index 9256d440a080..3610f2f65f8e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -130,16 +130,20 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
  */
 static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
 		pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
-		bool *any_writable)
+		bool *any_writable, bool *any_young, bool *any_dirty)
 {
 	unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
 	const pte_t *end_ptep = start_ptep + max_nr;
 	pte_t expected_pte, *ptep;
-	bool writable;
+	bool writable, young, dirty;
 	int nr;
 
 	if (any_writable)
 		*any_writable = false;
+	if (any_young)
+		*any_young = false;
+	if (any_dirty)
+		*any_dirty = false;
 
 	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
 	VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
@@ -153,6 +157,11 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
 		pte = ptep_get(ptep);
 		if (any_writable)
 			writable = !!pte_write(pte);
+		if (any_young)
+			young = !!pte_young(pte);
+		if (any_dirty)
+			dirty = !!pte_dirty(pte);
+
 		pte = __pte_batch_clear_ignored(pte, flags);
 
 		if (!pte_same(pte, expected_pte))
@@ -168,6 +177,10 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
 
 		if (any_writable)
 			*any_writable |= writable;
+		if (any_young)
+			*any_young |= young;
+		if (any_dirty)
+			*any_dirty |= dirty;
 
 		nr = pte_batch_hint(ptep, pte);
 		expected_pte = pte_advance_pfn(expected_pte, nr);
diff --git a/mm/madvise.c b/mm/madvise.c
index 56c7ba7bd558..f88b4d7d75cb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -321,6 +321,38 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
 	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
+static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
+					  struct folio *folio, pte_t *pte,
+					  bool *any_young, bool *any_dirty)
+{
+	int max_nr = (end - addr) / PAGE_SIZE;
+	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+
+	return folio_pte_batch(folio, addr, pte, ptep_get(pte), max_nr,
+			       fpb_flags, NULL, any_young, any_dirty);
+}
+
+static inline bool madvise_split_folio(struct mm_struct *mm, pmd_t *pmd,
+				       unsigned long addr, struct folio *folio,
+				       pte_t **pte, spinlock_t **ptl)
+{
+	int err;
+
+	if (!folio_trylock(folio))
+		return false;
+
+	folio_get(folio);
+	pte_unmap_unlock(*pte, *ptl);
+	*pte = NULL;
+	err = split_folio(folio);
+	folio_unlock(folio);
+	folio_put(folio);
+
+	*pte = pte_offset_map_lock(mm, pmd, addr, ptl);
+
+	return err == 0;
+}
+
 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct mm_walk *walk)
@@ -334,9 +366,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	spinlock_t *ptl;
 	struct folio *folio = NULL;
 	LIST_HEAD(folio_list);
-	bool pageout_anon_only_filter;
+	bool pageout_anon_only_filter, any_young;
 	unsigned int batch_count = 0;
 	int nr;
+	const rfp_t rfp_flags = RFP_CLEAR_YOUNG;
 
 	if (fatal_signal_pending(current))
 		return -EINTR;
@@ -456,39 +489,25 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		 * next pte in the range.
 		 */
 		if (folio_test_large(folio)) {
-			const fpb_t fpb_flags = FPB_IGNORE_DIRTY |
-						FPB_IGNORE_SOFT_DIRTY;
-			int max_nr = (end - addr) / PAGE_SIZE;
-
-			nr = folio_pte_batch(folio, addr, pte, ptent, max_nr,
-					     fpb_flags, NULL);
-
+			nr = madvise_folio_pte_batch(addr, end, folio, pte,
+						     &any_young, NULL);
+			if (folio_estimated_sharers(folio) > 1)
+				continue;
 			if (nr < folio_nr_pages(folio)) {
-				int err;
-
-				if (folio_estimated_sharers(folio) > 1)
-					continue;
 				if (pageout_anon_only_filter && !folio_test_anon(folio))
 					continue;
-				if (!folio_trylock(folio))
-					continue;
-				folio_get(folio);
 				arch_leave_lazy_mmu_mode();
-				pte_unmap_unlock(start_pte, ptl);
-				start_pte = NULL;
-				err = split_folio(folio);
-				folio_unlock(folio);
-				folio_put(folio);
-				if (err)
-					continue;
-				start_pte = pte =
-					pte_offset_map_lock(mm, pmd, addr, &ptl);
+				if (madvise_split_folio(mm, pmd, addr, folio,
+							&start_pte, &ptl))
+					nr = 0;
 				if (!start_pte)
 					break;
+				pte = start_pte;
 				arch_enter_lazy_mmu_mode();
-				nr = 0;
 				continue;
 			}
+			if (any_young)
+				ptent = pte_mkyoung(ptent);
 		}
 
 		/*
@@ -504,11 +523,9 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		if (pageout_anon_only_filter && !folio_test_anon(folio))
 			continue;
 
-		if (!pageout) {
-			for (; nr != 0; nr--, pte++, addr += PAGE_SIZE) {
-				if (ptep_test_and_clear_young(vma, addr, pte))
-					tlb_remove_tlb_entry(tlb, pte, addr);
-			}
+		if (!pageout && pte_young(ptent)) {
+			refresh_full_ptes(mm, addr, pte, nr, rfp_flags, tlb->fullmm);
+			tlb_remove_tlb_entries(tlb, pte, nr, addr);
 		}
 
 		/*
@@ -642,6 +659,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	int nr_swap = 0;
 	unsigned long next;
 	int nr, max_nr;
+	bool any_young, any_dirty;
+	const rfp_t rfp_flags = RFP_CLEAR_YOUNG | RFP_CLEAR_DIRTY;
 
 	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd))
@@ -654,7 +673,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		return 0;
 	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
-	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
+	for (; addr < end; pte += nr, addr += PAGE_SIZE * nr) {
 		nr = 1;
 		ptent = ptep_get(pte);
 
@@ -687,57 +706,52 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			continue;
 
 		/*
-		 * If pmd isn't transhuge but the folio is large and
-		 * is owned by only this process, split it and
-		 * deactivate all pages.
+		 * If we encounter a large folio, only split it if it is not
+		 * fully mapped within the range we are operating on. Otherwise
+		 * leave it as is so that it can be marked as lazyfree. If we
+		 * fail to split a folio, leave it in place and advance to the
+		 * next pte in the range.
 		 */
 		if (folio_test_large(folio)) {
-			int err;
+			nr = madvise_folio_pte_batch(addr, end, folio, pte,
+						     &any_young, &any_dirty);
+			if (folio_estimated_sharers(folio) > 1)
+				continue;
+			if (nr < folio_nr_pages(folio)) {
+				arch_leave_lazy_mmu_mode();
+				if (madvise_split_folio(mm, pmd, addr, folio,
+							&start_pte, &ptl))
+					nr = 0;
+				if (!start_pte)
+					break;
+				pte = start_pte;
+				arch_enter_lazy_mmu_mode();
+				continue;
+			}
+			if (any_young || any_dirty)
+				ptent = pte_mkyoung(pte_mkdirty(ptent));
+		}
-			if (folio_estimated_sharers(folio) != 1)
-				break;
-			if (!folio_trylock(folio))
-				break;
-			folio_get(folio);
-			arch_leave_lazy_mmu_mode();
-			pte_unmap_unlock(start_pte, ptl);
-			start_pte = NULL;
-			err = split_folio(folio);
+		if (!folio_trylock(folio))
+			continue;
+		/*
+		 * If we have a large folio at this point, we know it is fully mapped
+		 * so if its mapcount is the same as its number of pages, it must be
+		 * exclusive.
+		 */
+		if (folio_mapcount(folio) != folio_nr_pages(folio)) {
 			folio_unlock(folio);
-			folio_put(folio);
-			if (err)
-				break;
-			start_pte = pte =
-				pte_offset_map_lock(mm, pmd, addr, &ptl);
-			if (!start_pte)
-				break;
-			arch_enter_lazy_mmu_mode();
-			pte--;
-			addr -= PAGE_SIZE;
 			continue;
 		}
 
-		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
-			if (!folio_trylock(folio))
-				continue;
-			/*
-			 * If folio is shared with others, we mustn't clear
-			 * the folio's dirty flag.
-			 */
-			if (folio_mapcount(folio) != 1) {
-				folio_unlock(folio);
-				continue;
-			}
-
 		if (folio_test_swapcache(folio) && !folio_free_swap(folio)) {
 			folio_unlock(folio);
 			continue;
 		}
 
-			folio_clear_dirty(folio);
-			folio_unlock(folio);
-		}
+		folio_unlock(folio);
 
 		if (pte_young(ptent) || pte_dirty(ptent)) {
 			/*
@@ -746,13 +760,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			 * the portability, remap the pte with old|clean
 			 * after pte clearing.
 			 */
-			ptent = ptep_get_and_clear_full(mm, addr, pte,
-							tlb->fullmm);
-
-			ptent = pte_mkold(ptent);
-			ptent = pte_mkclean(ptent);
-			set_pte_at(mm, addr, pte, ptent);
-			tlb_remove_tlb_entry(tlb, pte, addr);
+			refresh_full_ptes(mm, addr, pte, nr, rfp_flags, tlb->fullmm);
+			tlb_remove_tlb_entries(tlb, pte, nr, addr);
 		}
 		folio_mark_lazyfree(folio);
 	}
diff --git a/mm/memory.c b/mm/memory.c
index 25c0ef1c7ff3..7121e5cc74d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -989,7 +989,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 			flags |= FPB_IGNORE_SOFT_DIRTY;
 
 		nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
-				     &any_writable);
+				     &any_writable, NULL, NULL);
 		folio_ref_add(folio, nr);
 		if (folio_test_anon(folio)) {
 			if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
@@ -1551,7 +1551,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
 	 */
 	if (unlikely(folio_test_large(folio) && max_nr != 1)) {
 		nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
-				     NULL);
+				     NULL, NULL, NULL);
 		zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
 				       addr, details, rss, force_flush,
-- 
2.33.1
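
P.S. For anyone wanting to reproduce the table above, the benchmark flow is
roughly: map a 1GiB anonymous region, fault it in so it is backed by
PTE-mapped folios, then time madvise(MADV_FREE) over the whole VMA. The
sketch below is illustrative only, not the harness used for the numbers,
and it assumes mTHP for anonymous memory has already been enabled via
/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled.

/*
 * Illustrative sketch only -- not the actual benchmark harness.
 * Assumes anonymous mTHP of the desired size is enabled beforehand.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <sys/mman.h>

int main(void)
{
	const size_t len = 1UL << 30;	/* 1GiB VMA */
	struct timespec t0, t1;
	char *buf;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 1, len);		/* fault in the whole range */

	clock_gettime(CLOCK_MONOTONIC, &t0);
	madvise(buf, len, MADV_FREE);	/* lazyfree the entire VMA */
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("MADV_FREE: %.6f s\n", (t1.tv_sec - t0.tv_sec) +
				      (t1.tv_nsec - t0.tv_nsec) / 1e9);
	return 0;
}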