From: Zi Yan <ziy@xxxxxxxxxx> It mimics PMD-level THP split. In addition, to support PMD-mapped PUD THP, PMDPageInPUD() is used. For the mapcount of PMD-mapped PUD THP, sub_compound_mapcount() is used, which uses (head_page+3).compound_mapcount, since each base page's mapcount is used for PTE mapping. PagePUDDoubleMap() is used for both PUD-mapped and PMD-mapped PUD THPs. page_xxx_rmap() functions now have an extra page order parameter to distinguish different THP sizes. Signed-off-by: Zi Yan <ziy@xxxxxxxxxx> --- arch/x86/include/asm/pgtable.h | 21 ++ include/linux/huge_mm.h | 31 +- include/linux/memcontrol.h | 5 + include/linux/mm.h | 25 +- include/linux/page-flags.h | 47 +++ include/linux/pgtable.h | 17 ++ include/linux/rmap.h | 9 +- include/linux/swap.h | 2 + include/linux/vm_event_item.h | 4 + kernel/events/uprobes.c | 4 +- mm/huge_memory.c | 536 +++++++++++++++++++++++++++++++-- mm/hugetlb.c | 4 +- mm/khugepaged.c | 6 +- mm/ksm.c | 4 +- mm/memcontrol.c | 13 + mm/memory.c | 18 +- mm/migrate.c | 10 +- mm/page_alloc.c | 20 +- mm/pgtable-generic.c | 11 + mm/rmap.c | 106 +++++-- mm/swap.c | 31 ++ mm/swapfile.c | 4 +- mm/userfaultfd.c | 2 +- mm/util.c | 16 +- mm/vmstat.c | 4 + 25 files changed, 852 insertions(+), 98 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 15334f5ba172..fe4600256bc7 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -630,6 +630,12 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd) __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } +static inline pud_t pud_mknotpresent(pud_t pud) +{ + return pfn_pud(pud_pfn(pud), + __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); +} + static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) @@ -1246,6 +1252,21 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) } #endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#ifndef pudp_establish 
+#define pudp_establish pudp_establish +static inline pud_t pudp_establish(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, pud_t pud) +{ + if (IS_ENABLED(CONFIG_SMP)) { + return xchg(pudp, pud); + } else { + pud_t old = *pudp; + *pudp = pud; + return old; + } +} +#endif + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0c20a8ea6911..589e5af5a1c2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -227,17 +227,27 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page); +bool can_split_huge_pud_page(struct page *page, int *pextra_pins); +int split_huge_pud_page_to_list(struct page *page, struct list_head *list); +static inline int split_huge_pud_page(struct page *page) +{ + return split_huge_pud_page_to_list(page, NULL); +} void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, - unsigned long address); + unsigned long address, bool freeze, struct page *page); #define split_huge_pud(__vma, __pud, __address) \ do { \ pud_t *____pud = (__pud); \ if (pud_trans_huge(*____pud) \ || pud_devmap(*____pud)) \ - __split_huge_pud(__vma, __pud, __address); \ + __split_huge_pud(__vma, __pud, __address, \ + false, NULL); \ } while (0) +void split_huge_pud_address(struct vm_area_struct *vma, unsigned long address, + bool freeze, struct page *page); + extern int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); extern void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -427,8 +437,25 @@ static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) {} +static inline bool +can_split_huge_pud_page(struct page *page, int *pextra_pins) +{ + BUILD_BUG(); + return false; +} 
+static inline int +split_huge_pud_page_to_list(struct page *page, struct list_head *list) +{ + return 0; +} +static inline int split_huge_pud_page(struct page *page) +{ + return 0; +} #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) +static inline void split_huge_pud_address(struct vm_area_struct *vma, + unsigned long address, bool freeze, struct page *page) {} static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d0b036123c6a..3ccff298d4b2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -929,6 +929,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, #ifdef CONFIG_TRANSPARENT_HUGEPAGE void mem_cgroup_split_huge_fixup(struct page *head); +void mem_cgroup_split_huge_pud_fixup(struct page *head); #endif #else /* CONFIG_MEMCG */ @@ -1261,6 +1262,10 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head) { } +static inline void mem_cgroup_split_huge_pud_fixup(struct page *head) +{ +} + static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) diff --git a/include/linux/mm.h b/include/linux/mm.h index cb1ccf804404..8a85d96ab7e5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -797,6 +797,24 @@ static inline int compound_mapcount(struct page *page) return head_compound_mapcount(page); } +static inline unsigned int compound_order(struct page *page); +static inline atomic_t *sub_compound_mapcount_ptr(struct page *page, int sub_level) +{ + struct page *head = compound_head(page); + + VM_BUG_ON_PAGE(!PageCompound(page), page); + VM_BUG_ON_PAGE(compound_order(head) != HPAGE_PUD_ORDER, page); + VM_BUG_ON_PAGE((page - head) % HPAGE_PMD_NR, page); + VM_BUG_ON_PAGE(sub_level != 1, page); + return &page[2 + sub_level].compound_mapcount; +} + +/* Only works for PUD pages */ +static inline int sub_compound_mapcount(struct page 
*page) +{ + return atomic_read(sub_compound_mapcount_ptr(page, 1)) + 1; +} + /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -889,13 +907,6 @@ static inline void destroy_compound_page(struct page *page) compound_page_dtors[page[1].compound_dtor](page); } -static inline unsigned int compound_order(struct page *page) -{ - if (!PageHead(page)) - return 0; - return page[1].compound_order; -} - static inline bool hpage_pincount_available(struct page *page) { /* diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index fbbb841a9346..cdca0165d2db 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -235,6 +235,9 @@ static inline void page_init_poison(struct page *page, size_t size) * * PF_SECOND: * the page flag is stored in the first tail page. + * + * PF_THIRD: + * the page flag is stored in the second tail page. */ #define PF_POISONED_CHECK(page) ({ \ VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \ @@ -253,6 +256,9 @@ static inline void page_init_poison(struct page *page, size_t size) #define PF_SECOND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) +#define PF_THIRD(page, enforce) ({ \ + VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ + PF_POISONED_CHECK(&page[2]); }) /* * Macros to create function definitions for page flags @@ -674,6 +680,29 @@ static inline int PageTransTail(struct page *page) return PageTail(page); } +#define HPAGE_PMD_SHIFT PMD_SHIFT +#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) +#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) + +#define HPAGE_PUD_SHIFT PUD_SHIFT +#define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT) +#define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER) + +static inline unsigned int compound_order(struct page *page) +{ + if (!PageHead(page)) + return 0; + return page[1].compound_order; +} + + +static inline int PMDPageInPUD(struct page *page) +{ + struct page *head = 
compound_head(page); + return (PageCompound(page) && compound_order(head) == HPAGE_PUD_ORDER && + ((page - head) % HPAGE_PMD_NR == 0)); +} + /* * PageDoubleMap indicates that the compound page is mapped with PTEs as well * as PMDs. @@ -689,13 +718,31 @@ static inline int PageTransTail(struct page *page) */ PAGEFLAG(DoubleMap, double_map, PF_SECOND) TESTSCFLAG(DoubleMap, double_map, PF_SECOND) +/* + * PagePUDDoubleMap indicates that the compound page is mapped with PMDs as well + * as PUDs. + * + * This is required for optimization of rmap operations for THP: we can postpone + * per small page mapcount accounting (and its overhead from atomic operations) + * until the first PUD split. + * + * For the page PagePUDDoubleMap means ->_mapcount in all sub-PMD pages is + * offset up by one. This reference will go away with last sub_compound_mapcount. + * + * See also __split_huge_pud_locked() and page_remove_anon_compound_rmap(). + */ +PAGEFLAG(PUDDoubleMap, double_map, PF_THIRD) + TESTSCFLAG(PUDDoubleMap, double_map, PF_THIRD) #else TESTPAGEFLAG_FALSE(TransHuge) TESTPAGEFLAG_FALSE(TransCompound) TESTPAGEFLAG_FALSE(TransCompoundMap) TESTPAGEFLAG_FALSE(TransTail) +TESTPAGEFLAG_FALSE(PMDPageInPUD) PAGEFLAG_FALSE(DoubleMap) TESTSCFLAG_FALSE(DoubleMap) +PAGEFLAG_FALSE(PUDDoubleMap) + TESTSETFLAG_FALSE(PUDDoubleMap) #endif /* diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 8ef358c386af..7acf218a8879 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -505,6 +505,11 @@ extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif +#ifndef __HAVE_ARCH_PUDP_INVALIDATE +extern pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp); +#endif + #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { @@ -1158,6 +1163,18 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) } #endif +#ifndef pud_read_atomic +static inline pud_t pud_read_atomic(pud_t 
*pudp) +{ + /* + * Depend on compiler for an atomic pud read. NOTE: this is + * only going to work, if the pudval_t isn't larger than + * an unsigned long. + */ + return *pudp; +} +#endif + #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 0af61dd193d2..c43da5919354 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -99,6 +99,7 @@ enum ttu_flags { TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock: * caller holds it */ TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */ + TTU_SPLIT_HUGE_PUD = 0x200, /* split huge PUD if any */ }; #ifdef CONFIG_MMU @@ -171,13 +172,13 @@ struct anon_vma *page_get_anon_vma(struct page *page); */ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); + unsigned long, bool, int); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, int); + unsigned long, int, int); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); + unsigned long, bool, int); void page_add_file_rmap(struct page *, bool); -void page_remove_rmap(struct page *, bool); +void page_remove_rmap(struct page *, bool, int); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); diff --git a/include/linux/swap.h b/include/linux/swap.h index 5c48713221fe..871c62211ecd 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -340,6 +340,8 @@ extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); +extern void lru_add_pud_page_tail(struct page *page, struct page *page_tail, + struct lruvec *lruvec, struct list_head *head); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern 
void lru_add_drain_cpu(int cpu); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a3f1093a55bb..b336de64586c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -96,6 +96,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_FAULT_FALLBACK_PUD, THP_FAULT_FALLBACK_PUD_CHARGE, THP_SPLIT_PUD, + THP_SPLIT_PUD_PAGE, + THP_SPLIT_PUD_PAGE_FAILED, + THP_ZERO_PUD_PAGE_ALLOC, + THP_ZERO_PUD_PAGE_ALLOC_FAILED, #endif THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0e18aaf23a7b..834b350a49f6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -183,7 +183,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); + page_add_new_anon_rmap(new_page, vma, addr, false, 0); lru_cache_add_inactive_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ @@ -200,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, false, 0); if (!page_mapped(old_page)) try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6da9b02501b7..398f1b52f789 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -618,7 +618,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr, true); + page_add_new_anon_rmap(page, vma, haddr, true, HPAGE_PMD_ORDER); lru_cache_add_inactive_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); @@ -991,7 +991,7 
@@ static int __do_huge_pud_anonymous_page(struct vm_fault *vmf, struct page *page, entry = mk_huge_pud(page, vma->vm_page_prot); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr, true); + page_add_new_anon_rmap(page, vma, haddr, true, HPAGE_PUD_ORDER); lru_cache_add_inactive_or_unevictable(page, vma); pgtable_trans_huge_pud_deposit(vma->vm_mm, vmf->pud, virt_to_page(pmd_pgtable)); @@ -1384,7 +1384,7 @@ vm_fault_t do_huge_pud_wp_page(struct vm_fault *vmf, pud_t orig_pud) unlock_page(page); spin_unlock(vmf->ptl); fallback: - __split_huge_pud(vma, vmf->pud, vmf->address); + __split_huge_pud(vma, vmf->pud, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } @@ -1825,9 +1825,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_present(orig_pmd)) { page = pmd_page(orig_pmd); - page_remove_rmap(page, true); + page_remove_rmap(page, true, HPAGE_PMD_ORDER); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_PAGE(!PageHead(page) && !PMDPageInPUD(page), page); } else if (thp_migration_supported()) { swp_entry_t entry; @@ -2111,7 +2111,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pud_present(orig_pud)) { page = pud_page(orig_pud); - page_remove_rmap(page, true); + page_remove_rmap(page, true, HPAGE_PUD_ORDER); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else @@ -2134,8 +2134,16 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, } static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, - unsigned long haddr) + unsigned long haddr, bool freeze) { + struct mm_struct *mm = vma->vm_mm; + struct page *page; + pgtable_t pgtable; + pud_t _pud, old_pud; + bool young, write, dirty, soft_dirty; + unsigned long addr; + int i; + VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + 
HPAGE_PUD_SIZE, vma); @@ -2143,23 +2151,141 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, count_vm_event(THP_SPLIT_PUD); - pudp_huge_clear_flush_notify(vma, haddr, pud); + if (!vma_is_anonymous(vma)) { + _pud = pudp_huge_clear_flush_notify(vma, haddr, pud); + /* + * We are going to unmap this huge page. So + * just go ahead and zap it + */ + if (arch_needs_pgtable_deposit()) + zap_pud_deposited_table(mm, pud); + if (vma_is_dax(vma)) + return; + page = pud_page(_pud); + if (!PageReferenced(page) && pud_young(_pud)) + SetPageReferenced(page); + page_remove_rmap(page, true, HPAGE_PUD_ORDER); + put_page(page); + add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PUD_NR); + return; + } + + /* See the comment above pmdp_invalidate() in __split_huge_pmd_locked() */ + old_pud = pudp_invalidate(vma, haddr, pud); + + page = pud_page(old_pud); + VM_BUG_ON_PAGE(!page_count(page), page); + page_ref_add(page, (1<<(HPAGE_PUD_ORDER-HPAGE_PMD_ORDER)) - 1); + if (pud_dirty(old_pud)) + SetPageDirty(page); + write = pud_write(old_pud); + young = pud_young(old_pud); + dirty = pud_dirty(old_pud); + soft_dirty = pud_soft_dirty(old_pud); + + pgtable = pgtable_trans_huge_pud_withdraw(mm, pud); + pud_populate_with_pgtable(mm, &_pud, pgtable); + + for (i = 0, addr = haddr; i < HPAGE_PUD_NR; + i += HPAGE_PMD_NR, addr += PMD_SIZE) { + pmd_t entry, *pmd; + /* + * Note that NUMA hinting access restrictions are not + * transferred to avoid any possibility of altering + * permissions across VMAs. 
+ */ + if (freeze) { + swp_entry_t swp_entry; + + swp_entry = make_migration_entry(page + i, write); + entry = swp_entry_to_pmd(swp_entry); + if (soft_dirty) + entry = pmd_swp_mksoft_dirty(entry); + } else { + entry = mk_huge_pmd(page + i, READ_ONCE(vma->vm_page_prot)); + entry = maybe_pmd_mkwrite(entry, vma); + if (!write) + entry = pmd_wrprotect(entry); + if (!young) + entry = pmd_mkold(entry); + if (soft_dirty) + entry = pmd_mksoft_dirty(entry); + } + pmd = pmd_offset(&_pud, addr); + VM_BUG_ON(!pmd_none(*pmd)); + set_pmd_at(mm, addr, pmd, entry); + /* distinguish between pud compound_mapcount and pmd compound_mapcount */ + if (atomic_inc_and_test(sub_compound_mapcount_ptr(&page[i], 1))) { + /* first pmd-mapped pud page */ + lock_page_memcg(page); + __inc_lruvec_page_state(page, NR_ANON_THPS); + unlock_page_memcg(page); + } + } + + /* + * Set PG_double_map before dropping compound_mapcount to avoid + * false-negative page_mapped(). + */ + if (compound_mapcount(page) > 1 && !TestSetPagePUDDoubleMap(page)) { + for (i = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) + /* distinguish between pud compound_mapcount and pmd compound_mapcount */ + atomic_inc(sub_compound_mapcount_ptr(&page[i], 1)); + } + + lock_page_memcg(page); + if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { + /* Last compound_mapcount is gone. 
*/ + __dec_lruvec_page_state(page, NR_ANON_THPS_PUD); + if (TestClearPagePUDDoubleMap(page)) { + /* No need in mapcount reference anymore */ + for (i = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) + /* distinguish between pud compound_mapcount and pmd compound_mapcount */ + atomic_dec(sub_compound_mapcount_ptr(&page[i], 1)); + } + } + unlock_page_memcg(page); + + smp_wmb(); /* make pmd visible before pud */ + pud_populate_with_pgtable(mm, pud, pgtable); + + if (freeze) { + for (i = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) { + page_remove_rmap(page + i, true, HPAGE_PMD_ORDER); + put_page(page + i); + } + } } void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, - unsigned long address) + unsigned long address, bool freeze, struct page *page) { spinlock_t *ptl; + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & HPAGE_PUD_MASK; struct mmu_notifier_range range; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); - ptl = pud_lock(vma->vm_mm, pud); - if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) + ptl = pud_lock(mm, pud); + + /* + * If the caller asks to set up migration entries, we need a page to check + * the pud against. Otherwise we can end up replacing the wrong page. 
+ */ + VM_BUG_ON(freeze && !page); + if (page && page != pud_page(*pud)) goto out; - __split_huge_pud_locked(vma, pud, range.start); + + if (pud_trans_huge(*pud)) { + page = pud_page(*pud); + if (PageMlocked(page)) + clear_page_mlock(page); + } else if (unlikely(!pud_devmap(*pud))) + goto out; + __split_huge_pud_locked(vma, pud, haddr, freeze); out: spin_unlock(ptl); @@ -2169,6 +2295,281 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, */ mmu_notifier_invalidate_range_only_end(&range); } + +void split_huge_pud_address(struct vm_area_struct *vma, unsigned long address, + bool freeze, struct page *page) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset(vma->vm_mm, address); + if (!pgd_present(*pgd)) + return; + + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return; + + pud = pud_offset(p4d, address); + + __split_huge_pud(vma, pud, address, freeze, page); +} + +static void unmap_pud_page(struct page *page) +{ + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | + TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PUD; + bool unmap_success; + + VM_BUG_ON_PAGE(!PageHead(page), page); + + if (PageAnon(page)) + ttu_flags |= TTU_SPLIT_FREEZE; + + unmap_success = try_to_unmap(page, ttu_flags); + VM_BUG_ON_PAGE(!unmap_success, page); +} + +static void remap_pud_page(struct page *page) +{ + int i; + + VM_BUG_ON(!PageTransHuge(page)); + if (compound_order(page) == HPAGE_PUD_ORDER) { + remove_migration_ptes(page, page, true); + } else if (compound_order(page) == HPAGE_PMD_ORDER) { + for (i = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) + remove_migration_ptes(page + i, page + i, true); + } else + VM_BUG_ON_PAGE(1, page); +} + +static void __split_huge_pud_page_tail(struct page *head, int tail, + struct lruvec *lruvec, struct list_head *list) +{ + struct page *page_tail = head + tail; + + VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); + + /* + * Clone page flags before unfreezing refcount. 
+ * + * After successful get_page_unless_zero() might follow flags change, + * for example lock_page() which set PG_waiters. + */ + + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (head->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_swapcache) | + (1L << PG_mlocked) | + (1L << PG_uptodate) | + (1L << PG_active) | + (1L << PG_locked) | + (1L << PG_unevictable) | + (1L << PG_dirty) | + /* preserve THP */ + (1L << PG_head))); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, + page_tail); + page_tail->mapping = head->mapping; + page_tail->index = head->index + tail; + + /* Page flags also must be visible before we make the page PMD-compound. */ + smp_wmb(); + + clear_compound_head(page_tail); + prep_compound_page(page_tail, HPAGE_PMD_ORDER); + prep_transhuge_page(page_tail); + + /* Finally unfreeze refcount. Additional reference from page cache. */ + page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || + PageSwapCache(head))); + + if (page_is_young(head)) + set_page_young(page_tail); + if (page_is_idle(head)) + set_page_idle(page_tail); + + page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + lru_add_pud_page_tail(head, page_tail, lruvec, list); +} + +static void __split_huge_pud_page(struct page *page, struct list_head *list, + unsigned long flags) +{ + struct page *head = compound_head(page); + pg_data_t *pgdat = page_pgdat(head); + struct lruvec *lruvec; + int i; + + lruvec = mem_cgroup_page_lruvec(head, pgdat); + + /* complete memcg works before add pages to LRU */ + mem_cgroup_split_huge_pud_fixup(head); + + /* no file-back page support yet */ + VM_BUG_ON(!PageAnon(page)); + + for (i = HPAGE_PUD_NR - HPAGE_PMD_NR; i >= 1; i -= HPAGE_PMD_NR) { + __split_huge_pud_page_tail(head, i, lruvec, list); + } + /* reset head page order */ + prep_compound_page(head, HPAGE_PMD_ORDER); + prep_transhuge_page(head); + + page_ref_inc(head); + + 
spin_unlock_irqrestore(&pgdat->lru_lock, flags); + + remap_pud_page(head); + + for (i = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) { + struct page *subpage = head + i; + + if (subpage == page) + continue; + unlock_page(subpage); + + /* + * Subpages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(subpage); + } +} +/* Racy check whether the huge page can be split */ +bool can_split_huge_pud_page(struct page *page, int *pextra_pins) +{ + int extra_pins; + + VM_BUG_ON(!PageAnon(page)); + + extra_pins = PageSwapCache(page) ? HPAGE_PUD_NR : 0; + + if (pextra_pins) + *pextra_pins = extra_pins; + return total_mapcount(page) == page_count(page) - extra_pins - 1; +} + +/* + * This function splits huge page into normal pages. @page can point to any + * subpage of huge page to split. Split doesn't change the position of @page. + * + * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. + * The huge page must be locked. + * + * If @list is null, tail pages will be added to LRU list, otherwise, to @list. + * + * Both head page and tail pages will inherit mapping, flags, and so on from + * the hugepage. + * + * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if + * they are not mapped. + * + * Returns 0 if the hugepage is split successfully. + * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under + * us. 
+ */ +int split_huge_pud_page_to_list(struct page *page, struct list_head *list) +{ + struct page *head = compound_head(page); + struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); + struct deferred_split *ds_queue = get_deferred_split_queue(head); + struct anon_vma *anon_vma = NULL; + struct address_space *mapping = NULL; + int count, mapcount, extra_pins, ret; + bool mlocked; + unsigned long flags; + + VM_BUG_ON_PAGE(is_huge_zero_page(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageCompound(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + + if (PageWriteback(page)) + return -EBUSY; + + /* + * The caller does not necessarily hold an mmap_sem that would + * prevent the anon_vma disappearing so we first take a + * reference to it and then lock the anon_vma for write. This + * is similar to page_lock_anon_vma_read except the write lock + * is taken to serialise against parallel split or collapse + * operations. + */ + anon_vma = page_get_anon_vma(head); + if (!anon_vma) { + ret = -EBUSY; + goto out; + } + mapping = NULL; + anon_vma_lock_write(anon_vma); + /* + * Racy check if we can split the page, before unmap_pud_page() will + * split PUDs + */ + if (!can_split_huge_pud_page(head, &extra_pins)) { + ret = -EBUSY; + goto out_unlock; + } + + mlocked = PageMlocked(page); + unmap_pud_page(head); + VM_BUG_ON_PAGE(compound_mapcount(head), head); + + /* Make sure the page is not on per-CPU pagevec as it takes pin */ + if (mlocked) + lru_add_drain(); + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irqsave(&pgdata->lru_lock, flags); + + /* Prevent deferred_split_scan() touching ->_refcount */ + spin_lock(&ds_queue->split_queue_lock); + count = page_count(head); + mapcount = total_mapcount(head); + if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { + if (!list_empty(page_deferred_list(head))) { + ds_queue->split_queue_len--; + list_del(page_deferred_list(head)); + } + if (mapping) { + 
__dec_node_page_state(page, NR_SHMEM_THPS); + } + spin_unlock(&ds_queue->split_queue_lock); + __split_huge_pud_page(page, list, flags); + ret = 0; + } else { + if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { + pr_alert("total_mapcount: %u, page_count(): %u\n", + mapcount, count); + if (PageTail(page)) + dump_page(head, NULL); + dump_page(page, "total_mapcount(head) > 0"); + } + spin_unlock(&ds_queue->split_queue_lock); + spin_unlock_irqrestore(&pgdata->lru_lock, flags); + remap_pud_page(head); + ret = -EBUSY; + } + +out_unlock: + if (anon_vma) { + anon_vma_unlock_write(anon_vma); + put_anon_vma(anon_vma); + } +out: + count_vm_event(!ret ? THP_SPLIT_PUD_PAGE : THP_SPLIT_PUD_PAGE_FAILED); + return ret; +} #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, @@ -2209,7 +2610,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, unsigned long haddr, bool freeze) { struct mm_struct *mm = vma->vm_mm; - struct page *page; + struct page *page, *head; pgtable_t pgtable; pmd_t old_pmd, _pmd; bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; @@ -2239,7 +2640,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, set_page_dirty(page); if (!PageReferenced(page) && pmd_young(_pmd)) SetPageReferenced(page); - page_remove_rmap(page, true); + page_remove_rmap(page, true, HPAGE_PMD_ORDER); put_page(page); add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); return; @@ -2298,7 +2699,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, uffd_wp = pmd_uffd_wp(old_pmd); } VM_BUG_ON_PAGE(!page_count(page), page); - page_ref_add(page, HPAGE_PMD_NR - 1); + head = compound_head(page); + page_ref_add(head, HPAGE_PMD_NR - 1); /* * Withdraw the table only after we mark the pmd entry invalid. 
@@ -2344,14 +2746,24 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, /* * Set PG_double_map before dropping compound_mapcount to avoid * false-negative page_mapped(). + * Don't set it if the PUD page is mapped at PUD level, since + * page_mapped() is true in that case. */ - if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { + if (((PMDPageInPUD(page) && + sub_compound_mapcount(page) > + (1 + PagePUDDoubleMap(compound_head(page)))) || + (!PMDPageInPUD(page) && + compound_mapcount(page) > 1)) + && !TestSetPageDoubleMap(page)) { for (i = 0; i < HPAGE_PMD_NR; i++) atomic_inc(&page[i]._mapcount); } lock_page_memcg(page); - if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { + if ((PMDPageInPUD(page) && + atomic_add_negative(-1, sub_compound_mapcount_ptr(page, 1))) || + (!PMDPageInPUD(page) && + atomic_add_negative(-1, compound_mapcount_ptr(page)))) { /* Last compound_mapcount is gone. */ __dec_lruvec_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { @@ -2367,7 +2779,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (freeze) { for (i = 0; i < HPAGE_PMD_NR; i++) { - page_remove_rmap(page + i, false); + page_remove_rmap(page + i, false, 0); put_page(page + i); } } @@ -2478,6 +2890,11 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, * previously contain an hugepage: check if we need to split * an huge pmd. */ + if (start & ~HPAGE_PUD_MASK && + (start & HPAGE_PUD_MASK) >= vma->vm_start && + (start & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE <= vma->vm_end) + split_huge_pud_address(vma, start, false, NULL); + if (start & ~HPAGE_PMD_MASK && (start & HPAGE_PMD_MASK) >= vma->vm_start && (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) @@ -2488,6 +2905,11 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, * previously contain an hugepage: check if we need to split * an huge pmd. 
*/ + if (end & ~HPAGE_PUD_MASK && + (end & HPAGE_PUD_MASK) >= vma->vm_start && + (end & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE <= vma->vm_end) + split_huge_pud_address(vma, end, false, NULL); + if (end & ~HPAGE_PMD_MASK && (end & HPAGE_PMD_MASK) >= vma->vm_start && (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) @@ -2502,6 +2924,11 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, struct vm_area_struct *next = vma->vm_next; unsigned long nstart = next->vm_start; nstart += adjust_next << PAGE_SHIFT; + if (nstart & ~HPAGE_PUD_MASK && + (nstart & HPAGE_PUD_MASK) >= next->vm_start && + (nstart & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE <= next->vm_end) + split_huge_pud_address(next, nstart, false, NULL); + if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) @@ -2691,12 +3118,23 @@ int total_mapcount(struct page *page) if (PageHuge(page)) return compound; ret = compound; - for (i = 0; i < HPAGE_PMD_NR; i++) - ret += atomic_read(&page[i]._mapcount) + 1; + /* if PMD, read all base page, if PUD, read the sub_compound_mapcount()*/ + if (compound_order(page) == HPAGE_PMD_ORDER) { + for (i = 0; i < thp_nr_pages(page); i++) + ret += atomic_read(&page[i]._mapcount) + 1; + } else if (compound_order(page) == HPAGE_PUD_ORDER) { + for (i = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) + ret += sub_compound_mapcount(&page[i]); + for (i = 0; i < thp_nr_pages(page); i++) + ret += atomic_read(&page[i]._mapcount) + 1; + } else + VM_BUG_ON_PAGE(1, page); /* File pages has compound_mapcount included in _mapcount */ + /* both PUD and PMD has HPAGE_PMD_NR sub pages */ if (!PageAnon(page)) return ret - compound * HPAGE_PMD_NR; - if (PageDoubleMap(page)) + /* both PUD and PMD has HPAGE_PMD_NR sub pages */ + if (PagePUDDoubleMap(page) || PageDoubleMap(page)) ret -= HPAGE_PMD_NR; return ret; } @@ -2742,13 +3180,38 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount) page = compound_head(page); 
_total_mapcount = ret = 0; - for (i = 0; i < HPAGE_PMD_NR; i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; - ret = max(ret, mapcount); - _total_mapcount += mapcount; - } - if (PageDoubleMap(page)) { + /* if PMD, read all base page, if PUD, read the sub_compound_mapcount()*/ + if (compound_order(page) == HPAGE_PMD_ORDER) { + for (i = 0; i < thp_nr_pages(page); i++) { + mapcount = atomic_read(&page[i]._mapcount) + 1; + ret = max(ret, mapcount); + _total_mapcount += mapcount; + } + } else if (compound_order(page) == HPAGE_PUD_ORDER) { + for (i = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) { + int j; + + mapcount = sub_compound_mapcount(&page[i]); + ret = max(ret, mapcount); + _total_mapcount += mapcount; + + /* Triple mapped at base page size */ + for (j = 0; j < HPAGE_PMD_NR; j++) { + mapcount = atomic_read(&page[i + j]._mapcount) + 1; + ret = max(ret, mapcount); + _total_mapcount += mapcount; + } + + if (PageDoubleMap(&page[i])) { + ret -= 1; + _total_mapcount -= HPAGE_PMD_NR; + } + } + } else + VM_BUG_ON_PAGE(1, page); + if (PageDoubleMap(page) || PagePUDDoubleMap(page)) { ret -= 1; + /* both PUD and PMD has HPAGE_PMD_NR sub pages */ _total_mapcount -= HPAGE_PMD_NR; } mapcount = compound_mapcount(page); @@ -2994,6 +3457,9 @@ static unsigned long deferred_split_count(struct shrinker *shrink, return READ_ONCE(ds_queue->split_queue_len); } +#define deferred_list_entry(x) (compound_head(list_entry((void *)x, \ + struct page, mapping))) + static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { @@ -3027,12 +3493,18 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); list_for_each_safe(pos, next, &list) { - page = list_entry((void *)pos, struct page, mapping); + page = deferred_list_entry(pos); if (!trylock_page(page)) goto next; /* split_huge_page() removes page from list on success */ - if (!split_huge_page(page)) - split++; + if (compound_order(page) 
== HPAGE_PUD_ORDER) { + if (!split_huge_pud_page(page)) + split++; + } else if (compound_order(page) == HPAGE_PMD_ORDER) { + if (!split_huge_page(page)) + split++; + } else + VM_BUG_ON_PAGE(1, page); unlock_page(page); next: put_page(page); @@ -3135,7 +3607,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); - page_remove_rmap(page, true); + page_remove_rmap(page, true, HPAGE_PMD_ORDER); put_page(page); } @@ -3161,7 +3633,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); if (PageAnon(new)) - page_add_anon_rmap(new, vma, mmun_start, true); + page_add_anon_rmap(new, vma, mmun_start, true, HPAGE_PMD_ORDER); else page_add_file_rmap(new, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 27a51b202d1f..4113d7b66fee 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3993,7 +3993,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, set_page_dirty(page); hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page, true); + page_remove_rmap(page, true, huge_page_order(h)); spin_unlock(ptl); tlb_remove_page_size(tlb, page, huge_page_size(h)); @@ -4218,7 +4218,7 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, mmu_notifier_invalidate_range(mm, range.start, range.end); set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); - page_remove_rmap(old_page, true); + page_remove_rmap(old_page, true, huge_page_order(h)); hugepage_add_new_anon_rmap(new_page, vma, haddr); set_page_huge_active(new_page); /* Make the old page be freed below */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e749e568e1ea..84ce39652282 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -762,7 +762,7 @@ static void __collapse_huge_page_copy(pte_t 
*pte, struct page *page, * superfluous. */ pte_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page, false); + page_remove_rmap(src_page, false, 0); spin_unlock(ptl); free_page_and_swap_cache(src_page); } @@ -1172,7 +1172,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address, true); + page_add_new_anon_rmap(new_page, vma, address, true, HPAGE_PMD_ORDER); lru_cache_add_inactive_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -1475,7 +1475,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (pte_none(*pte)) continue; page = vm_normal_page(vma, addr, *pte); - page_remove_rmap(page, false); + page_remove_rmap(page, false, HPAGE_PMD_ORDER); } pte_unmap_unlock(start_pte, ptl); diff --git a/mm/ksm.c b/mm/ksm.c index 0aa2247bddd7..d778b4d1b626 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1153,7 +1153,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, */ if (!is_zero_pfn(page_to_pfn(kpage))) { get_page(kpage); - page_add_anon_rmap(kpage, vma, addr, false); + page_add_anon_rmap(kpage, vma, addr, false, 0); newpte = mk_pte(kpage, vma->vm_page_prot); } else { newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage), @@ -1177,7 +1177,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); - page_remove_rmap(page, false); + page_remove_rmap(page, false, 0); if (!page_mapped(page)) try_to_free_swap(page); put_page(page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dc892a3c4b17..5d5be3b7c739 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3232,6 +3232,19 @@ void mem_cgroup_split_huge_fixup(struct page *head) head[i].mem_cgroup = memcg; } } + +void mem_cgroup_split_huge_pud_fixup(struct page *head) +{ + int i; + + if (mem_cgroup_disabled()) + return; + + for 
(i = HPAGE_PMD_NR; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) + head[i].mem_cgroup = head->mem_cgroup; + + /*__mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PUD_NR);*/ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_MEMCG_SWAP diff --git a/mm/memory.c b/mm/memory.c index b88587256bc1..184d8eb2d060 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1090,7 +1090,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page, false); + page_remove_rmap(page, false, 0); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { @@ -1118,7 +1118,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; - page_remove_rmap(page, false); + page_remove_rmap(page, false, 0); put_page(page); continue; } @@ -2725,7 +2725,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * thread doing COW. */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); - page_add_new_anon_rmap(new_page, vma, vmf->address, false); + page_add_new_anon_rmap(new_page, vma, vmf->address, false, 0); lru_cache_add_inactive_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -2757,7 +2757,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, false, 0); } /* Free the old page.. 
*/ @@ -3273,10 +3273,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { - page_add_new_anon_rmap(page, vma, vmf->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false, 0); lru_cache_add_inactive_or_unevictable(page, vma); } else { - do_page_add_anon_rmap(page, vma, vmf->address, exclusive); + do_page_add_anon_rmap(page, vma, vmf->address, exclusive, 0); } swap_free(entry); @@ -3420,7 +3420,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false, 0); lru_cache_add_inactive_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -3678,7 +3678,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false, 0); lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); @@ -4155,7 +4155,7 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) return ret; } /* COW or write-notify not handled on PUD level: split pud.*/ - __split_huge_pud(vmf->vma, vmf->pud, vmf->address); + split_huge_pud(vmf->vma, vmf->pud, vmf->address); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } diff --git a/mm/migrate.c b/mm/migrate.c index 0b945c8031be..be0e80b32686 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -270,7 +270,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) - page_add_anon_rmap(new, vma, pvmw.address, false); + page_add_anon_rmap(new, vma, 
pvmw.address, false, 0); else page_add_file_rmap(new, false); } @@ -2194,7 +2194,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * new page and page_add_new_anon_rmap guarantee the copy is * visible before the pagetable update. */ - page_add_anon_rmap(new_page, vma, start, true); + page_add_anon_rmap(new_page, vma, start, true, HPAGE_PMD_ORDER); /* * At this point the pmd is numa/protnone (i.e. non present) and the TLB * has already been flushed globally. So no TLB can be currently @@ -2211,7 +2211,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, page_ref_unfreeze(page, 2); mlock_migrate_page(new_page, page); - page_remove_rmap(page, true); + page_remove_rmap(page, true, HPAGE_PMD_ORDER); set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); @@ -2455,7 +2455,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, * drop page refcount. Page won't be freed, as we took * a reference just above. */ - page_remove_rmap(page, false); + page_remove_rmap(page, false, 0); put_page(page); if (pte_present(pte)) @@ -2940,7 +2940,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, addr, false); + page_add_new_anon_rmap(page, vma, addr, false, 0); if (!is_zone_device_page(page)) lru_cache_add_inactive_or_unevictable(page, vma); get_page(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 763acbed66f1..97a4c7e4a579 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -679,6 +679,9 @@ void prep_compound_page(struct page *page, unsigned int order) atomic_set(compound_mapcount_ptr(page), -1); if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); + if (order == HPAGE_PUD_ORDER) + for (i = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) + atomic_set(sub_compound_mapcount_ptr(&page[i], 1), -1); } #ifdef CONFIG_DEBUG_PAGEALLOC @@ -1132,6 +1135,15 @@ static int free_tail_pages_check(struct page *head_page, 
struct page *page) */ break; default: + /* sub_compound_map_ptr store here */ + if (compound_order(head_page) == HPAGE_PUD_ORDER && + (page - head_page) % HPAGE_PMD_NR == 3) { + if (unlikely(atomic_read(&page->compound_mapcount) != -1)) { + pr_err("sub_compound_mapcount: %d\n", atomic_read(&page->compound_mapcount) + 1); + bad_page(page, "nonzero sub_compound_mapcount"); + } + break; + } if (page->mapping != TAIL_MAPPING) { bad_page(page, "corrupted mapping in tail page"); goto out; @@ -1183,8 +1195,14 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); - if (compound) + if (compound) { ClearPageDoubleMap(page); + if (order == HPAGE_PUD_ORDER) { + ClearPagePUDDoubleMap(page); + for (i = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) + ClearPageDoubleMap(&page[i]); + } + } for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index ef218b0f5d74..a8529afc55e5 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -245,6 +245,17 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, } #endif +#ifndef __HAVE_ARCH_PUDP_INVALIDATE +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + pud_t old = pudp_establish(vma, address, pudp, pud_mknotpresent(*pudp)); + + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return old; +} +#endif + #ifndef pmdp_collapse_flush pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) diff --git a/mm/rmap.c b/mm/rmap.c index 77cec0658b76..0bbaaa891b3c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1108,9 +1108,9 @@ static void __page_check_anon_rmap(struct page *page, * (but PageKsm is never downgraded to PageAnon). 
*/ void page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, bool compound) + struct vm_area_struct *vma, unsigned long address, bool compound, int order) { - do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0); + do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0, order); } /* @@ -1119,7 +1119,7 @@ void page_add_anon_rmap(struct page *page, * Everybody else should continue to use page_add_anon_rmap above. */ void do_page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, int flags) + struct vm_area_struct *vma, unsigned long address, int flags, int order) { bool compound = flags & RMAP_COMPOUND; bool first; @@ -1130,10 +1130,21 @@ void do_page_add_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); if (compound) { - atomic_t *mapcount; + atomic_t *mapcount = NULL; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageTransHuge(page), page); - mapcount = compound_mapcount_ptr(page); + if (compound_order(page) == HPAGE_PUD_ORDER) { + if (order == HPAGE_PUD_ORDER) { + mapcount = compound_mapcount_ptr(page); + } else if (order == HPAGE_PMD_ORDER) { + VM_BUG_ON(!PMDPageInPUD(page)); + mapcount = sub_compound_mapcount_ptr(page, 1); + } else + VM_BUG_ON(1); + } else if (compound_order(page) == HPAGE_PMD_ORDER) { + mapcount = compound_mapcount_ptr(page); + } else + VM_BUG_ON(1); first = atomic_inc_and_test(mapcount); } else { first = atomic_inc_and_test(&page->_mapcount); @@ -1148,7 +1159,7 @@ void do_page_add_anon_rmap(struct page *page, * disabled. */ if (compound) { - if (nr == HPAGE_PMD_NR) + if (order == HPAGE_PMD_ORDER) __inc_lruvec_page_state(page, NR_ANON_THPS); else __inc_lruvec_page_state(page, NR_ANON_THPS_PUD); @@ -1181,7 +1192,7 @@ void do_page_add_anon_rmap(struct page *page, * Page does not have to be locked. 
*/ void page_add_new_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, bool compound) + struct vm_area_struct *vma, unsigned long address, bool compound, int order) { int nr = compound ? thp_nr_pages(page) : 1; @@ -1194,10 +1205,15 @@ void page_add_new_anon_rmap(struct page *page, if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); - if (nr == HPAGE_PMD_NR) - __inc_lruvec_page_state(page, NR_ANON_THPS); - else + if (order == HPAGE_PUD_ORDER) { + VM_BUG_ON(compound_order(page) != HPAGE_PUD_ORDER); + /* Anon THP always mapped first with PMD */ __inc_lruvec_page_state(page, NR_ANON_THPS_PUD); + } else if (order == HPAGE_PMD_ORDER) { + VM_BUG_ON(compound_order(page) != HPAGE_PMD_ORDER); + __inc_lruvec_page_state(page, NR_ANON_THPS); + } else + VM_BUG_ON(1); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); @@ -1289,12 +1305,40 @@ static void page_remove_file_rmap(struct page *page, bool compound) clear_page_mlock(page); } -static void page_remove_anon_compound_rmap(struct page *page) +static void page_remove_anon_compound_rmap(struct page *page, int order) { - int i, nr; - - if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - return; + int i, nr = 0; + struct page *head = compound_head(page); + + if (compound_order(head) == HPAGE_PUD_ORDER) { + if (order == HPAGE_PMD_ORDER) { + VM_BUG_ON(!PMDPageInPUD(page)); + if (atomic_add_negative(-1, sub_compound_mapcount_ptr(page, 1))) { + if (TestClearPageDoubleMap(page)) { + /* + * Subpages can be mapped with PTEs too. Check how many of + * them are still mapped. 
+ */ + for (i = 0; i < thp_nr_pages(head); i++) { + if (atomic_add_negative(-1, &head[i]._mapcount)) + nr++; + } + } + __dec_node_page_state(page, NR_ANON_THPS); + } + nr += HPAGE_PMD_NR; + __mod_node_page_state(page_pgdat(head), NR_ANON_MAPPED, -nr); + return; + } else { + VM_BUG_ON(order != HPAGE_PUD_ORDER); + if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) + return; + } + } else if (compound_order(head) == HPAGE_PMD_ORDER) { + if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) + return; + } else + VM_BUG_ON_PAGE(1, page); /* Hugepages are not counted in NR_ANON_PAGES for now. */ if (unlikely(PageHuge(page))) @@ -1303,12 +1347,26 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - if (thp_nr_pages(page) == HPAGE_PMD_NR) + if (order == HPAGE_PMD_ORDER) __dec_lruvec_page_state(page, NR_ANON_THPS); - else + else if (order == HPAGE_PUD_ORDER) __dec_lruvec_page_state(page, NR_ANON_THPS_PUD); + else + VM_BUG_ON(1); - if (TestClearPageDoubleMap(page)) { + /* PMD-mapped PUD THP is handled above */ + if (TestClearPagePUDDoubleMap(head)) { + VM_BUG_ON(!(compound_order(head) == HPAGE_PUD_ORDER || head == page)); + /* + * Subpages can be mapped with PMDs too. Check how many of + * them are still mapped. + */ + for (i = 0, nr = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) { + if (atomic_add_negative(-1, sub_compound_mapcount_ptr(&head[i], 1))) + nr += HPAGE_PMD_NR; + } + } else if (TestClearPageDoubleMap(head)) { + VM_BUG_ON(compound_order(head) != HPAGE_PMD_ORDER); /* * Subpages can be mapped with PTEs too. Check how many of * them are still mapped. 
@@ -1332,8 +1390,10 @@ static void page_remove_anon_compound_rmap(struct page *page) if (unlikely(PageMlocked(page))) clear_page_mlock(page); - if (nr) - __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); + if (nr) { + __mod_lruvec_page_state(head, NR_ANON_MAPPED, -nr); + deferred_split_huge_page(head); + } } /** @@ -1343,7 +1403,7 @@ static void page_remove_anon_compound_rmap(struct page *page) * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page, bool compound) +void page_remove_rmap(struct page *page, bool compound, int order) { lock_page_memcg(page); @@ -1353,7 +1413,7 @@ void page_remove_rmap(struct page *page, bool compound) } if (compound) { - page_remove_anon_compound_rmap(page); + page_remove_anon_compound_rmap(page, order); goto out; } @@ -1734,7 +1794,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); + page_remove_rmap(subpage, PageHuge(page), 0); put_page(page); } diff --git a/mm/swap.c b/mm/swap.c index 999a84dbe12c..b70631c71171 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -964,6 +964,37 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, page_lru(page_tail)); } } + +/* used by __split_pud_huge_page_tail() */ +void lru_add_pud_page_tail(struct page *page, struct page *page_tail, + struct lruvec *lruvec, struct list_head *list) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_PAGE(PageLRU(page_tail), page); + VM_BUG_ON(NR_CPUS != 1 && + !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock)); + + if (!list) + SetPageLRU(page_tail); + + if (likely(PageLRU(page))) + list_add_tail(&page_tail->lru, &page->lru); + else if (list) { + /* page reclaim is reclaiming a huge page */ + get_page(page_tail); + list_add_tail(&page_tail->lru, list); + } else { + /* + * Head page has not yet been counted, as an hpage, + * so we must account for each subpage individually. 
+ * + * Put page_tail on the list at the correct position + * so they all end up in order. + */ + add_page_to_lru_list_tail(page_tail, lruvec, + page_lru(page_tail)); + } +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, diff --git a/mm/swapfile.c b/mm/swapfile.c index e3f771c2ad83..285edbcb5e22 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1921,9 +1921,9 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { - page_add_anon_rmap(page, vma, addr, false); + page_add_anon_rmap(page, vma, addr, false, 0); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, addr, false); + page_add_new_anon_rmap(page, vma, addr, false, 0); lru_cache_add_inactive_or_unevictable(page, vma); } swap_free(entry); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9a3d451402d7..9b31d9beaa46 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -122,7 +122,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_uncharge_unlock; inc_mm_counter(dst_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + page_add_new_anon_rmap(page, dst_vma, dst_addr, false, 0); lru_cache_add_inactive_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); diff --git a/mm/util.c b/mm/util.c index bb902f5a6582..410f1ca0932a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -713,17 +713,27 @@ struct address_space *page_mapping_file(struct page *page) int __page_mapcount(struct page *page) { int ret; + struct page *head = compound_head(page); + /* base page mapping */ ret = atomic_read(&page->_mapcount) + 1; + + /* PMDInPUD mapping */ + if (compound_order(head) == HPAGE_PUD_ORDER) { + struct page *sub_compound_page = head + + (((page - head) / HPAGE_PMD_NR) * HPAGE_PMD_NR); + + ret += sub_compound_mapcount(sub_compound_page); + } /* * 
For file THP page->_mapcount contains total number of mapping * of the page: no need to look into compound_mapcount. */ if (!PageAnon(page) && !PageHuge(page)) return ret; - page = compound_head(page); - ret += atomic_read(compound_mapcount_ptr(page)) + 1; - if (PageDoubleMap(page)) + /* highest compound mapping */ + ret += atomic_read(compound_mapcount_ptr(head)) + 1; + if (PageDoubleMap(head)) ret--; return ret; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 3a01212b652c..dc7c2cec9102 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1330,6 +1330,10 @@ const char * const vmstat_text[] = { "thp_fault_fallback_pud", "thp_fault_fallback_pud_charge", "thp_split_pud", + "thp_split_pud_page", + "thp_split_pud_page_failed", + "thp_zero_pud_page_alloc", + "thp_zero_pud_page_alloc_failed", #endif "thp_zero_page_alloc", "thp_zero_page_alloc_failed", -- 2.28.0