From: Zi Yan <ziy@xxxxxxxxxx> This adds 1GB THP support for anonymous pages. Applications can get 1GB pages during page faults when their VMAs are larger than 1GB. For read-only 1GB zero THP, a shared 1GB zero THP is created for all readers. Signed-off-by: Zi Yan <ziy@xxxxxxxxxx> --- arch/x86/include/asm/pgalloc.h | 59 +++++++++++ arch/x86/include/asm/pgtable.h | 2 + arch/x86/mm/pgtable.c | 25 +++++ drivers/base/node.c | 3 + fs/proc/meminfo.c | 2 + include/linux/huge_mm.h | 13 ++- include/linux/mm.h | 4 + include/linux/mm_types.h | 1 + include/linux/mmzone.h | 1 + include/linux/pgtable.h | 3 + include/linux/vm_event_item.h | 3 + kernel/fork.c | 5 + mm/huge_memory.c | 188 +++++++++++++++++++++++++++++++-- mm/memory.c | 29 ++++- mm/page_alloc.c | 3 +- mm/pgtable-generic.c | 45 ++++++++ mm/rmap.c | 30 ++++-- mm/vmstat.c | 4 + 18 files changed, 396 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 62ad61d6fefc..fae13467d3e1 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -52,6 +52,18 @@ extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); extern pgtable_t pte_alloc_one(struct mm_struct *); +extern pgtable_t pte_alloc_order(struct mm_struct *, unsigned long, int); + +static inline void pte_free_order(struct mm_struct *mm, struct page *pte, + int order) +{ + int i; + + for (i = 0; i < (1<<order); i++) { + pgtable_pte_page_dtor(&pte[i]); + __free_page(&pte[i]); + } +} extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); @@ -87,6 +99,53 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #define pmd_pgtable(pmd) pmd_page(pmd) #if CONFIG_PGTABLE_LEVELS > 2 +static inline pmd_t *pmd_alloc_one_page_with_ptes(struct mm_struct *mm, unsigned long addr) +{ + pgtable_t pte_pgtables; + pmd_t *pmd; + spinlock_t *pmd_ptl; + int i; + + pte_pgtables = pte_alloc_order(mm, addr, + HPAGE_PUD_ORDER - 
HPAGE_PMD_ORDER); + if (!pte_pgtables) + return NULL; + + pmd = pmd_alloc_one(mm, addr); + if (unlikely(!pmd)) { + pte_free_order(mm, pte_pgtables, + HPAGE_PUD_ORDER - HPAGE_PMD_ORDER); + return NULL; + } + pmd_ptl = pmd_lock(mm, pmd); + + for (i = 0; i < (1<<(HPAGE_PUD_ORDER - HPAGE_PMD_ORDER)); i++) + pgtable_trans_huge_deposit(mm, pmd, pte_pgtables + i); + + spin_unlock(pmd_ptl); + + return pmd; +} + +static inline void pmd_free_page_with_ptes(struct mm_struct *mm, pmd_t *pmd) +{ + spinlock_t *pmd_ptl; + int i; + + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + pmd_ptl = pmd_lock(mm, pmd); + + for (i = 0; i < (1<<(HPAGE_PUD_ORDER - HPAGE_PMD_ORDER)); i++) { + pgtable_t pte_pgtable; + + pte_pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pte_free(mm, pte_pgtable); + } + + spin_unlock(pmd_ptl); + pmd_free(mm, pmd); +} + extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5e0dcc20614d..26255cac78c0 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1141,6 +1141,8 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long return native_pmdp_get_and_clear(pmdp); } +#define mk_pud(page, pgprot) pfn_pud(page_to_pfn(page), (pgprot)) + #define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pud_t *pudp) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index dfd82f51ba66..7be73aee6183 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -33,6 +33,31 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) return __pte_alloc_one(mm, __userpte_alloc_gfp); } +pgtable_t pte_alloc_order(struct mm_struct *mm, unsigned long address, int order) +{ + struct page *pte; + int i; + + pte = alloc_pages(__userpte_alloc_gfp, order); + if (!pte) + return NULL; + split_page(pte, 
order); + for (i = 1; i < (1 << order); i++) + set_page_private(pte + i, 0); + + for (i = 0; i < (1<<order); i++) { + if (!pgtable_pte_page_ctor(&pte[i])) { + __free_page(&pte[i]); + while (--i >= 0) { + pgtable_pte_page_dtor(&pte[i]); + __free_page(&pte[i]); + } + return NULL; + } + } + return pte; +} + static int __init setup_userpte(char *arg) { if (!arg) diff --git a/drivers/base/node.c b/drivers/base/node.c index 508b80f6329b..f11b4d88911c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -428,6 +428,7 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d SUnreclaim: %8lu kB\n" #ifdef CONFIG_TRANSPARENT_HUGEPAGE "Node %d AnonHugePages: %8lu kB\n" + "Node %d AnonHugePUDPages: %8lu kB\n" "Node %d ShmemHugePages: %8lu kB\n" "Node %d ShmemPmdMapped: %8lu kB\n" "Node %d FileHugePages: %8lu kB\n" @@ -457,6 +458,8 @@ static ssize_t node_read_meminfo(struct device *dev, , nid, K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_ANON_THPS_PUD) * + HPAGE_PUD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 887a5532e449..b60e0c241015 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -130,6 +130,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_TRANSPARENT_HUGEPAGE show_val_kb(m, "AnonHugePages: ", global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR); + show_val_kb(m, "AnonHugePUDPages: ", + global_node_page_state(NR_ANON_THPS_PUD) * HPAGE_PUD_NR); show_val_kb(m, "ShmemHugePages: ", global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); show_val_kb(m, "ShmemPmdMapped: ", diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8a8bc46a2432..7528652400e4 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -18,10 +18,15 @@ extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, #ifdef 
CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); +extern int do_huge_pud_anonymous_page(struct vm_fault *vmf); #else static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { } +static inline int do_huge_pud_anonymous_page(struct vm_fault *vmf) +{ + return VM_FAULT_FALLBACK; /* PUD THP not built in: make the caller fall back */ +} #endif extern vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); @@ -115,6 +120,9 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) +#define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT) +#define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER) + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) @@ -276,7 +284,7 @@ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) - return HPAGE_PMD_ORDER; + return page[1].compound_order; return 0; } @@ -288,7 +296,7 @@ static inline int thp_nr_pages(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) - return HPAGE_PMD_NR; + return (1<<page[1].compound_order); return 1; } @@ -320,6 +328,7 @@ struct page *mm_get_huge_zero_page(struct mm_struct *mm); void mm_put_huge_zero_page(struct mm_struct *mm); #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) +#define mk_huge_pud(page, prot) pud_mkhuge(mk_pud(page, prot)) static inline bool thp_migration_supported(void) { diff --git a/include/linux/mm.h b/include/linux/mm.h index f3a4f099fb1b..cb1ccf804404 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -31,6 +31,7 @@ #include <linux/sizes.h> #include <linux/sched.h> #include <linux/pgtable.h> +#include <linux/pagechain.h> struct mempolicy; struct anon_vma; @@ -2184,6 +2185,7 @@ static inline void pgtable_init(void) { ptlock_cache_init(); pgtable_cache_init(); + pagechain_cache_init(); } static inline
bool pgtable_pte_page_ctor(struct page *page) @@ -2316,6 +2318,8 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) return ptl; } +#define pud_huge_pte(mm, pud) ((mm)->pud_huge_pte) + extern void __init pagecache_init(void); extern void __init free_area_init_memoryless_node(int nid); extern void free_initmem(void); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 496c3ff97cce..4c1839366af4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -513,6 +513,7 @@ struct mm_struct { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif + struct list_head pud_huge_pte; /* protected by page_table_lock */ #ifdef CONFIG_NUMA_BALANCING /* * numa_next_scan is the next time that the PTEs will be marked diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0a404552ecc1..3a8f54a2c5a7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -196,6 +196,7 @@ enum node_stat_item { NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, + NR_ANON_THPS_PUD, NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e8cbc2e795d5..255275d5b73e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -462,10 +462,13 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); +extern void pgtable_trans_huge_pud_deposit(struct mm_struct *mm, pud_t *pudp, + pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +extern pgtable_t pgtable_trans_huge_pud_withdraw(struct mm_struct *mm, pud_t *pudp); #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git 
a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 2e6ca53b9bbd..a3f1093a55bb 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -92,6 +92,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_DEFERRED_SPLIT_PAGE, THP_SPLIT_PMD, #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + THP_FAULT_ALLOC_PUD, + THP_FAULT_FALLBACK_PUD, + THP_FAULT_FALLBACK_PUD_CHARGE, THP_SPLIT_PUD, #endif THP_ZERO_PAGE_ALLOC, diff --git a/kernel/fork.c b/kernel/fork.c index 3f281814a3d3..842fdc4ae5fc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -663,6 +663,10 @@ static void check_mm(struct mm_struct *mm) #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif + VM_BUG_ON_MM(!list_empty(&mm->pud_huge_pte) && + !pagechain_empty(list_first_entry(&mm->pud_huge_pte, + struct pagechain, list)), + mm); } #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) @@ -1023,6 +1027,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif + INIT_LIST_HEAD(&mm->pud_huge_pte); mm_init_uprobes_state(mm); if (current->mm) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 90733cefa528..ec3847392208 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -933,6 +933,112 @@ vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot); + +static int __do_huge_pud_anonymous_page(struct vm_fault *vmf, struct page *page, + gfp_t gfp) +{ + struct vm_area_struct *vma = vmf->vma; + pmd_t *pmd_pgtable; + unsigned long haddr = vmf->address & HPAGE_PUD_MASK; + int ret = 0; + + VM_BUG_ON_PAGE(!PageCompound(page), page); + + if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK_PUD); + count_vm_event(THP_FAULT_FALLBACK_PUD_CHARGE); + return
VM_FAULT_FALLBACK; + } + cgroup_throttle_swaprate(page, gfp); + + pmd_pgtable = pmd_alloc_one_page_with_ptes(vma->vm_mm, haddr); + if (unlikely(!pmd_pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } + + clear_huge_page(page, vmf->address, HPAGE_PUD_NR); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * clear_huge_page writes become visible before the set_pud_at() + * write. + */ + __SetPageUptodate(page); + + vmf->ptl = pud_lock(vma->vm_mm, vmf->pud); + if (unlikely(!pud_none(*vmf->pud))) { + goto unlock_release; + } else { + pud_t entry; + int i; + + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + + /* Deliver the page fault to userland */ + if (userfaultfd_missing(vma)) { + vm_fault_t ret2; + + spin_unlock(vmf->ptl); + put_page(page); + pmd_free_page_with_ptes(vma->vm_mm, pmd_pgtable); + ret2 = handle_userfault(vmf, VM_UFFD_MISSING); + VM_BUG_ON(ret2 & VM_FAULT_FALLBACK); + return ret2; + } + + entry = mk_huge_pud(page, vma->vm_page_prot); + entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); + page_add_new_anon_rmap(page, vma, haddr, true); + lru_cache_add_inactive_or_unevictable(page, vma); + pgtable_trans_huge_pud_deposit(vma->vm_mm, vmf->pud, + virt_to_page(pmd_pgtable)); + set_pud_at(vma->vm_mm, haddr, vmf->pud, entry); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PUD_NR); + mm_inc_nr_pmds(vma->vm_mm); + for (i = 0; i < (1<<(HPAGE_PUD_ORDER - HPAGE_PMD_ORDER)); i++) + mm_inc_nr_ptes(vma->vm_mm); + spin_unlock(vmf->ptl); + count_vm_event(THP_FAULT_ALLOC_PUD); + } + + return 0; +unlock_release: + spin_unlock(vmf->ptl); +release: + if (pmd_pgtable) + pmd_free_page_with_ptes(vma->vm_mm, pmd_pgtable); + put_page(page); + return ret; + +} + +int do_huge_pud_anonymous_page(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + gfp_t gfp; + struct page *page; + unsigned long haddr = vmf->address & HPAGE_PUD_MASK; + + if (haddr < vma->vm_start || haddr + HPAGE_PUD_SIZE > vma->vm_end) +
return VM_FAULT_FALLBACK; + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma, vma->vm_flags))) + return VM_FAULT_OOM; + + gfp = alloc_hugepage_direct_gfpmask(vma); + page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PUD_ORDER); + if (unlikely(!page)) { + count_vm_event(THP_FAULT_FALLBACK_PUD); + return VM_FAULT_FALLBACK; + } + prep_transhuge_page(page); + return __do_huge_pud_anonymous_page(vmf, page, gfp); +} + #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -1159,7 +1265,12 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, { spinlock_t *dst_ptl, *src_ptl; pud_t pud; - int ret; + pmd_t *pmd_pgtable = NULL; + int ret = -ENOMEM; + + pmd_pgtable = pmd_alloc_one_page_with_ptes(dst_mm, addr); + if (unlikely(!pmd_pgtable)) + goto out; dst_ptl = pud_lock(dst_mm, dst_pud); src_ptl = pud_lockptr(src_mm, src_pud); @@ -1167,16 +1278,28 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pud = *src_pud; + + /* only transparent huge pud page needs extra page table pages for + * possible huge page split */ + if (!pud_trans_huge(pud)) + pmd_free_page_with_ptes(dst_mm, pmd_pgtable); + if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) goto out_unlock; - /* - * When page table lock is held, the huge zero pud should not be - * under splitting since we don't split the page itself, only pud to - * a page table.
- */ - if (is_huge_zero_pud(pud)) { - /* No huge zero pud yet */ + if (pud_trans_huge(pud)) { + struct page *src_page; + int i; + + src_page = pud_page(pud); + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + get_page(src_page); + page_dup_rmap(src_page, true); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PUD_NR); + mm_inc_nr_pmds(dst_mm); + for (i = 0; i < (1<<(HPAGE_PUD_ORDER - HPAGE_PMD_ORDER)); i++) + mm_inc_nr_ptes(dst_mm); + pgtable_trans_huge_pud_deposit(dst_mm, dst_pud, virt_to_page(pmd_pgtable)); } pudp_set_wrprotect(src_mm, addr, src_pud); @@ -1187,6 +1310,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, out_unlock: spin_unlock(src_ptl); spin_unlock(dst_ptl); +out: return ret; } @@ -1887,11 +2011,27 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline void zap_pud_deposited_table(struct mm_struct *mm, pud_t *pud) +{ + pgtable_t pgtable; + int i; + + pgtable = pgtable_trans_huge_pud_withdraw(mm, pud); + pmd_free_page_with_ptes(mm, (pmd_t *)page_address(pgtable)); + + mm_dec_nr_pmds(mm); + for (i = 0; i < (1<<(HPAGE_PUD_ORDER - HPAGE_PMD_ORDER)); i++) + mm_dec_nr_ptes(mm); +} + int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr) { + pud_t orig_pud; spinlock_t *ptl; + tlb_change_page_size(tlb, HPAGE_PUD_SIZE); + ptl = __pud_trans_huge_lock(pud, vma); if (!ptl) return 0; @@ -1901,14 +2041,40 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, * pgtable_trans_huge_withdraw after finishing pudp related * operations. 
*/ - pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); + orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, + tlb->fullmm); tlb_remove_pud_tlb_entry(tlb, pud, addr); if (vma_is_special_huge(vma)) { spin_unlock(ptl); /* No zero page support yet */ + } else if (is_huge_zero_pud(orig_pud)) { + zap_pud_deposited_table(tlb->mm, pud); + spin_unlock(ptl); + tlb_remove_page_size(tlb, pud_page(orig_pud), HPAGE_PUD_SIZE); } else { - /* No support for anonymous PUD pages yet */ - BUG(); + struct page *page = NULL; + int flush_needed = 1; + + if (pud_present(orig_pud)) { + page = pud_page(orig_pud); + page_remove_rmap(page, true); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + VM_BUG_ON_PAGE(!PageHead(page), page); + } else + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + + if (PageAnon(page)) { + zap_pud_deposited_table(tlb->mm, pud); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PUD_NR); + } else { + if (arch_needs_pgtable_deposit()) + zap_pud_deposited_table(tlb->mm, pud); + add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PUD_NR); + } + + spin_unlock(ptl); + if (flush_needed) + tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE); } return 1; } diff --git a/mm/memory.c b/mm/memory.c index fb5463153351..6f86294438fd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4147,14 +4147,13 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vmf->vma)) - goto split; + return do_huge_pud_anonymous_page(vmf); if (vmf->vma->vm_ops->huge_fault) { vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); if (!(ret & VM_FAULT_FALLBACK)) return ret; } -split: /* COW or write-notify not handled on PUD level: split pud.*/ __split_huge_pud(vmf->vma, vmf->pud, vmf->address); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -5098,3 +5097,29 @@ void ptlock_free(struct page *page) kmem_cache_free(page_ptl_cachep, 
page->ptl); } #endif + +static struct kmem_cache *pagechain_cachep; + +void __init pagechain_cache_init(void) +{ + pagechain_cachep = kmem_cache_create("pagechain", + sizeof(struct pagechain), 0, SLAB_PANIC, NULL); +} + +struct pagechain *pagechain_alloc(void) +{ + struct pagechain *chain; + + chain = kmem_cache_alloc(pagechain_cachep, GFP_ATOMIC); + + if (!chain) + return NULL; + + pagechain_init(chain); + return chain; +} + +void pagechain_free(struct pagechain *pchain) +{ + kmem_cache_free(pagechain_cachep, pchain); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0d9f9bd0e06c..763acbed66f1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5443,7 +5443,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), - K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR + + node_page_state(pgdat, NR_ANON_THPS_PUD) * HPAGE_PUD_NR), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), node_page_state(pgdat, NR_KERNEL_STACK_KB), diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 9578db83e312..ef218b0f5d74 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/hugetlb.h> #include <linux/pgtable.h> +#include <linux/pagechain.h> #include <asm/tlb.h> /* @@ -170,6 +171,23 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); pmd_huge_pte(mm, pmdp) = pgtable; } + +void pgtable_trans_huge_pud_deposit(struct mm_struct *mm, pud_t *pudp, + pgtable_t pgtable) +{ + struct pagechain *chain = NULL; + + assert_spin_locked(pud_lockptr(mm, pudp)); + /* FIFO */ + chain = list_first_entry_or_null(&pud_huge_pte(mm, pudp), + struct pagechain, list); + + if (!chain || !pagechain_space(chain)) { + chain = pagechain_alloc(); + list_add(&chain->list, 
&pud_huge_pte(mm, pudp)); + } + pagechain_deposit(chain, pgtable); +} #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW @@ -188,6 +206,33 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) list_del(&pgtable->lru); return pgtable; } + +pgtable_t pgtable_trans_huge_pud_withdraw(struct mm_struct *mm, pud_t *pudp) +{ + pgtable_t pgtable; + struct pagechain *chain = NULL; + + assert_spin_locked(pud_lockptr(mm, pudp)); + + /* FIFO */ +retry: + chain = list_first_entry_or_null(&pud_huge_pte(mm, pudp), + struct pagechain, list); + + if (!chain) + return NULL; + + if (pagechain_empty(chain)) { + if (list_is_singular(&chain->list)) + return NULL; + list_del(&chain->list); + pagechain_free(chain); + goto retry; + } + + pgtable = pagechain_withdraw(chain); + return pgtable; +} #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE diff --git a/mm/rmap.c b/mm/rmap.c index 9425260774a1..10195a2421cf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -726,6 +726,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) pgd_t *pgd; p4d_t *p4d; pud_t *pud; + pud_t pude; pmd_t *pmd = NULL; pmd_t pmde; @@ -738,7 +739,10 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) goto out; pud = pud_offset(p4d, address); - if (!pud_present(*pud)) + + pude = *pud; + barrier(); + if (!pud_present(pude) || pud_trans_huge(pude)) goto out; pmd = pmd_offset(pud, address); @@ -1033,7 +1037,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) * __page_set_anon_rmap - set up new anonymous rmap * @page: Page or Hugepage to add to rmap * @vma: VM area to add page to. - * @address: User virtual address of the mapping + * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct page *page, @@ -1137,8 +1141,12 @@ void do_page_add_anon_rmap(struct page *page, * pte lock(a spinlock) is held, which implies preemption * disabled. 
*/ - if (compound) - __inc_lruvec_page_state(page, NR_ANON_THPS); + if (compound) { + if (nr == HPAGE_PMD_NR) + __inc_lruvec_page_state(page, NR_ANON_THPS); + else + __inc_lruvec_page_state(page, NR_ANON_THPS_PUD); + } __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } @@ -1180,7 +1188,10 @@ void page_add_new_anon_rmap(struct page *page, if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); - __inc_lruvec_page_state(page, NR_ANON_THPS); + if (nr == HPAGE_PMD_NR) + __inc_lruvec_page_state(page, NR_ANON_THPS); + else + __inc_lruvec_page_state(page, NR_ANON_THPS_PUD); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); @@ -1286,14 +1297,17 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __dec_lruvec_page_state(page, NR_ANON_THPS); + if (thp_nr_pages(page) == HPAGE_PMD_NR) + __dec_lruvec_page_state(page, NR_ANON_THPS); + else + __dec_lruvec_page_state(page, NR_ANON_THPS_PUD); if (TestClearPageDoubleMap(page)) { /* * Subpages can be mapped with PTEs too. Check how many of * them are still mapped. 
*/ - for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { if (atomic_add_negative(-1, &page[i]._mapcount)) nr++; } @@ -1306,7 +1320,7 @@ static void page_remove_anon_compound_rmap(struct page *page) if (nr && nr < HPAGE_PMD_NR) deferred_split_huge_page(page); } else { - nr = HPAGE_PMD_NR; + nr = thp_nr_pages(page); } if (unlikely(PageMlocked(page))) diff --git a/mm/vmstat.c b/mm/vmstat.c index 06fd13ebc2b8..3a01212b652c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1209,6 +1209,7 @@ const char * const vmstat_text[] = { "nr_file_hugepages", "nr_file_pmdmapped", "nr_anon_transparent_hugepages", + "nr_anon_transparent_pud_hugepages", "nr_vmscan_write", "nr_vmscan_immediate_reclaim", "nr_dirtied", @@ -1325,6 +1326,9 @@ const char * const vmstat_text[] = { "thp_deferred_split_page", "thp_split_pmd", #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + "thp_fault_alloc_pud", + "thp_fault_fallback_pud", + "thp_fault_fallback_pud_charge", "thp_split_pud", #endif "thp_zero_page_alloc", -- 2.28.0