From: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx> Let's modify __do_fault() to handle transhuge pages. To indicate that a huge page is required, the caller passes flags with FAULT_FLAG_TRANSHUGE set. __do_fault() now returns VM_FAULT_FALLBACK to indicate that a fallback to small pages is required. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> --- include/linux/huge_mm.h | 41 +++++++++++++ include/linux/mm.h | 5 ++ mm/huge_memory.c | 22 ------- mm/memory.c | 148 ++++++++++++++++++++++++++++++++++++++++------- 4 files changed, 172 insertions(+), 44 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d688271..b20334a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -188,6 +188,28 @@ static inline struct page *compound_trans_head(struct page *page) return page; } +static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) +{ + return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; +} + +static inline struct page *alloc_hugepage_vma(int defrag, + struct vm_area_struct *vma, + unsigned long haddr, int nd, + gfp_t extra_gfp) +{ + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), + HPAGE_PMD_ORDER, vma, haddr, nd); +} + +static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) +{ + pmd_t entry; + entry = mk_pmd(page, prot); + entry = pmd_mkhuge(entry); + return entry; +} + extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp); @@ -200,12 +222,15 @@ extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vm #define HPAGE_CACHE_NR ({ BUILD_BUG(); 0; }) #define HPAGE_CACHE_INDEX_MASK ({ BUILD_BUG(); 0; }) +#define THP_FAULT_ALLOC ({ BUILD_BUG(); 0; }) +#define THP_FAULT_FALLBACK ({ BUILD_BUG(); 0; }) #define THP_WRITE_ALLOC ({ BUILD_BUG(); 0; }) #define THP_WRITE_ALLOC_FAILED ({ BUILD_BUG(); 0; }) #define hpage_nr_pages(x) 1 #define 
transparent_hugepage_enabled(__vma) 0 +#define transparent_hugepage_defrag(__vma) 0 #define transparent_hugepage_flags 0UL static inline int @@ -242,6 +267,22 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, return 0; } +static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) +{ + pmd_t entry; + BUILD_BUG(); + return entry; +} + +static inline struct page *alloc_hugepage_vma(int defrag, + struct vm_area_struct *vma, + unsigned long haddr, int nd, + gfp_t extra_gfp) +{ + BUILD_BUG(); + return NULL; +} + static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 280b414..563c8b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -167,6 +167,11 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ #define FAULT_FLAG_TRIED 0x40 /* second try */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_PAGECACHE +#define FAULT_FLAG_TRANSHUGE 0x80 /* Try to allocate transhuge page */ +#else +#define FAULT_FLAG_TRANSHUGE 0 /* Optimize out THP code if disabled */ +#endif /* * vm_fault is filled by the the pagefault handler and passed to the vma's diff --git a/mm/huge_memory.c b/mm/huge_memory.c index facfdac..893cc69 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -709,14 +709,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } -static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) -{ - pmd_t entry; - entry = mk_pmd(page, prot); - entry = pmd_mkhuge(entry); - return entry; -} - static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, @@ -758,20 +750,6 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return 0; } -static inline gfp_t 
alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) -{ - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; -} - -static inline struct page *alloc_hugepage_vma(int defrag, - struct vm_area_struct *vma, - unsigned long haddr, int nd, - gfp_t extra_gfp) -{ - return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), - HPAGE_PMD_ORDER, vma, haddr, nd); -} - #ifndef CONFIG_NUMA static inline struct page *alloc_hugepage(int defrag) { diff --git a/mm/memory.c b/mm/memory.c index 97b22c7..8997cd8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -59,6 +59,7 @@ #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> +#include <linux/khugepaged.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -167,6 +168,7 @@ static void check_sync_rss_stat(struct task_struct *task) } #else /* SPLIT_RSS_COUNTING */ +#define add_mm_counter_fast(mm, member, val) add_mm_counter(mm, member, val) #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) @@ -3282,6 +3284,38 @@ oom: return VM_FAULT_OOM; } +static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, + unsigned long addr) +{ + unsigned long haddr = addr & HPAGE_PMD_MASK; + + if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) != + (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK)) + return false; + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + return false; + return true; +} + +static struct page *alloc_fault_page_vma(struct vm_area_struct *vma, + unsigned long addr, unsigned int flags) +{ + + if (flags & FAULT_FLAG_TRANSHUGE) { + struct page *page; + unsigned long haddr = addr & HPAGE_PMD_MASK; + + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr, numa_node_id(), 0); + if (page) + count_vm_event(THP_FAULT_ALLOC); + else + count_vm_event(THP_FAULT_FALLBACK); + return page; + } + return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr); +} + /* * __do_fault() tries 
to create a new page mapping. It aggressively * tries to share with existing pages, but makes a separate copy if @@ -3301,12 +3335,23 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, { pte_t *page_table; spinlock_t *ptl; + pgtable_t pgtable = NULL; struct page *page, *cow_page, *dirty_page = NULL; - pte_t entry; bool anon = false, page_mkwrite = false; bool write = flags & FAULT_FLAG_WRITE; + bool thp = flags & FAULT_FLAG_TRANSHUGE; + unsigned long addr_aligned; struct vm_fault vmf; - int ret; + int nr, ret; + + if (thp) { + if (!transhuge_vma_suitable(vma, address)) + return VM_FAULT_FALLBACK; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; + addr_aligned = address & HPAGE_PMD_MASK; + } else + addr_aligned = address & PAGE_MASK; /* * If we do COW later, allocate page befor taking lock_page() @@ -3316,17 +3361,25 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + cow_page = alloc_fault_page_vma(vma, address, flags); if (!cow_page) - return VM_FAULT_OOM; + return VM_FAULT_OOM | VM_FAULT_FALLBACK; if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { page_cache_release(cow_page); - return VM_FAULT_OOM; + return VM_FAULT_OOM | VM_FAULT_FALLBACK; } } else cow_page = NULL; + if (thp) { + pgtable = pte_alloc_one(mm, address); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto uncharge_out; + } + } + vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = pgoff; vmf.flags = flags; @@ -3353,6 +3406,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, VM_BUG_ON(!PageLocked(vmf.page)); page = vmf.page; + + /* + * If we asked for huge page we expect to get it or VM_FAULT_FALLBACK. + * If we didn't ask for a huge page, it must be split in ->fault(). 
+ */ + BUG_ON(PageTransHuge(page) != thp); + if (!write) goto update_pgtable; @@ -3362,7 +3422,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!(vma->vm_flags & VM_SHARED)) { page = cow_page; anon = true; - copy_user_highpage(page, vmf.page, address, vma); + if (thp) + copy_user_huge_page(page, vmf.page, addr_aligned, vma, + HPAGE_PMD_NR); + else + copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else if (vma->vm_ops->page_mkwrite) { /* @@ -3373,6 +3437,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(page); vmf.flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE; + if (thp) + vmf.flags |= FAULT_FLAG_TRANSHUGE; tmp = vma->vm_ops->page_mkwrite(vma, &vmf); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { ret = tmp; @@ -3391,19 +3457,30 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, } update_pgtable: - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); /* Only go through if we didn't race with anybody else... */ - if (unlikely(!pte_same(*page_table, orig_pte))) { - pte_unmap_unlock(page_table, ptl); - goto race_out; + if (thp) { + spin_lock(&mm->page_table_lock); + if (!pmd_none(*pmd)) { + spin_unlock(&mm->page_table_lock); + goto race_out; + } + /* make GCC happy */ + ptl = NULL; page_table = NULL; + } else { + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*page_table, orig_pte))) { + pte_unmap_unlock(page_table, ptl); + goto race_out; + } } flush_icache_page(vma, page); + nr = thp ? 
HPAGE_PMD_NR : 1; if (anon) { - inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); + add_mm_counter_fast(mm, MM_ANONPAGES, nr); + page_add_new_anon_rmap(page, vma, addr_aligned); } else { - inc_mm_counter_fast(mm, MM_FILEPAGES); + add_mm_counter_fast(mm, MM_FILEPAGES, nr); page_add_file_rmap(page); if (write) { dirty_page = page; @@ -3419,15 +3496,23 @@ update_pgtable: * exclusive copy of the page, or this is a shared mapping, so we can * make it writable and dirty to avoid having to handle that later. */ - entry = mk_pte(page, vma->vm_page_prot); - if (write) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - set_pte_at(mm, address, page_table, entry); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, page_table); - - pte_unmap_unlock(page_table, ptl); + if (thp) { + pmd_t entry = mk_huge_pmd(page, vma->vm_page_prot); + if (flags & FAULT_FLAG_WRITE) + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + set_pmd_at(mm, address, pmd, entry); + pgtable_trans_huge_deposit(mm, pgtable); + mm->nr_ptes++; + update_mmu_cache_pmd(vma, address, pmd); + spin_unlock(&mm->page_table_lock); + } else { + pte_t entry = mk_pte(page, vma->vm_page_prot); + if (write) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + set_pte_at(mm, address, page_table, entry); + update_mmu_cache(vma, address, page_table); + pte_unmap_unlock(page_table, ptl); + } if (dirty_page) { struct address_space *mapping = page->mapping; @@ -3457,9 +3542,13 @@ update_pgtable: return ret; unwritable_page: + if (pgtable) + pte_free(mm, pgtable); page_cache_release(page); return ret; uncharge_out: + if (pgtable) + pte_free(mm, pgtable); /* fs's fault handler get error */ if (cow_page) { mem_cgroup_uncharge_page(cow_page); @@ -3467,6 +3556,8 @@ uncharge_out: } return ret; race_out: + if (pgtable) + pte_free(mm, pgtable); if (cow_page) mem_cgroup_uncharge_page(cow_page); if (anon) @@ -3519,6 +3610,19 @@ static int 
do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } +static int do_huge_linear_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, + unsigned int flags) +{ + pgoff_t pgoff = (((address & PAGE_MASK) + - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + pte_t __unused; /* unused with FAULT_FLAG_TRANSHUGE */ + + flags |= FAULT_FLAG_TRANSHUGE; + + return __do_fault(mm, vma, address, pmd, pgoff, flags, __unused); +} + int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int current_nid) { -- 1.7.10.4 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>