From: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx> Subject: khugepaged: add support of collapse for tmpfs/shmem pages This patch extends khugepaged to support collapse of tmpfs/shmem pages. We share a fair amount of infrastructure with anon-THP collapse. A few design points: - First we look for a VMA which is suitable for mapping a huge page; - If the VMA maps a shmem file, the rest of the scan/collapse operations operate on the page cache, not on page tables as in the anon VMA case. - khugepaged_scan_shmem() finds a range which is suitable for a huge page. The scan is lockless and shouldn't disturb the system too much. - Once a candidate for collapse is found, collapse_shmem() attempts to create a huge page: + scan over the radix tree, making the range point to the new huge page; + the new huge page is not-uptodate, locked and frozen (refcount is 0), so nobody can touch it until we say so. + we swap in pages during the scan. khugepaged_scan_shmem() filters out ranges with more than khugepaged_max_ptes_swap swapped-out pages. It's HPAGE_PMD_NR/8 by default. + old pages are isolated, unmapped and put on a local list so that they can be restored if the collapse fails. - If the collapse succeeds, we retract pte page tables from VMAs where huge page mapping is possible. The huge page will be mapped as PMD on the next minor fault into the range. Link: http://lkml.kernel.org/r/1466021202-61880-35-git-send-email-kirill.shutemov@xxxxxxxxxxxxxxx Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- include/linux/shmem_fs.h | 23 + include/trace/events/huge_memory.h | 3 mm/khugepaged.c | 435 ++++++++++++++++++++++++++- mm/shmem.c | 56 ++- 4 files changed, 500 insertions(+), 17 deletions(-) diff -puN include/linux/shmem_fs.h~khugepaged-add-support-of-collapse-for-tmpfs-shmem-pages include/linux/shmem_fs.h --- a/include/linux/shmem_fs.h~khugepaged-add-support-of-collapse-for-tmpfs-shmem-pages +++ a/include/linux/shmem_fs.h @@ -54,6 +54,7 @@ extern unsigned long shmem_get_unmapped_ unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); extern bool shmem_mapping(struct address_space *mapping); +extern bool shmem_huge_enabled(struct vm_area_struct *vma); extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); @@ -64,6 +65,19 @@ extern unsigned long shmem_swap_usage(st extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); +/* Flag allocation requirements to shmem_getpage */ +enum sgp_type { + SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_CACHE, /* don't exceed i_size, may allocate page */ + SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ + SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ + SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ + SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ +}; + +extern int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp); + static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) { @@ -71,6 +85,15 @@ static inline struct page *shmem_read_ma mapping_gfp_mask(mapping)); } +static inline bool shmem_file(struct file *file) +{ + if 
(!IS_ENABLED(CONFIG_SHMEM)) + return false; + if (!file || !file->f_mapping) + return false; + return shmem_mapping(file->f_mapping); +} + extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); diff -puN include/trace/events/huge_memory.h~khugepaged-add-support-of-collapse-for-tmpfs-shmem-pages include/trace/events/huge_memory.h --- a/include/trace/events/huge_memory.h~khugepaged-add-support-of-collapse-for-tmpfs-shmem-pages +++ a/include/trace/events/huge_memory.h @@ -29,7 +29,8 @@ EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ - EMe( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") + EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ + EMe(SCAN_TRUNCATED, "truncated") \ #undef EM #undef EMe diff -puN mm/khugepaged.c~khugepaged-add-support-of-collapse-for-tmpfs-shmem-pages mm/khugepaged.c --- a/mm/khugepaged.c~khugepaged-add-support-of-collapse-for-tmpfs-shmem-pages +++ a/mm/khugepaged.c @@ -14,6 +14,7 @@ #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> #include <linux/swapops.h> +#include <linux/shmem_fs.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -42,7 +43,8 @@ enum scan_result { SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, - SCAN_EXCEED_SWAP_PTE + SCAN_EXCEED_SWAP_PTE, + SCAN_TRUNCATED, }; #define CREATE_TRACE_POINTS @@ -294,7 +296,7 @@ struct attribute_group khugepaged_attr_g .name = "khugepaged", }; -#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) +#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) @@ -816,6 +818,10 @@ static bool hugepage_vma_check(struct vm if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || (vma->vm_flags & VM_NOHUGEPAGE)) return false; + if (shmem_file(vma->vm_file)) { + return 
IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + HPAGE_PMD_NR); + } if (!vma->anon_vma || vma->vm_ops) return false; if (is_vma_temporary_stack(vma)) @@ -1216,6 +1222,412 @@ static void collect_mm_slot(struct mm_sl } } +#ifdef CONFIG_SHMEM +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) +{ + struct vm_area_struct *vma; + unsigned long addr; + pmd_t *pmd, _pmd; + + i_mmap_lock_write(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + /* probably overkill */ + if (vma->anon_vma) + continue; + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (addr & ~HPAGE_PMD_MASK) + continue; + if (vma->vm_end < addr + HPAGE_PMD_SIZE) + continue; + pmd = mm_find_pmd(vma->vm_mm, addr); + if (!pmd) + continue; + /* + * We need exclusive mmap_sem to retract page table. + * If trylock fails we would end up with pte-mapped THP after + * re-fault. Not ideal, but it's more important to not disturb + * the system too much. + */ + if (down_write_trylock(&vma->vm_mm->mmap_sem)) { + spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); + /* assume page table is clear */ + _pmd = pmdp_collapse_flush(vma, addr, pmd); + spin_unlock(ptl); + up_write(&vma->vm_mm->mmap_sem); + atomic_long_dec(&vma->vm_mm->nr_ptes); + pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + } + } + i_mmap_unlock_write(mapping); +} + +/** + * collapse_shmem - collapse small tmpfs/shmem pages into huge one. 
+ * + * Basic scheme is simple, details are more complex: + * - allocate and freeze a new huge page; + * - scan over radix tree replacing old pages the new one + * + swap in pages if necessary; + * + fill in gaps; + * + keep old pages around in case if rollback is required; + * - if replacing succeed: + * + copy data over; + * + free old pages; + * + unfreeze huge page; + * - if replacing failed; + * + put all pages back and unfreeze them; + * + restore gaps in the radix-tree; + * + free huge page; + */ +static void collapse_shmem(struct mm_struct *mm, + struct address_space *mapping, pgoff_t start, + struct page **hpage, int node) +{ + gfp_t gfp; + struct page *page, *new_page, *tmp; + struct mem_cgroup *memcg; + pgoff_t index, end = start + HPAGE_PMD_NR; + LIST_HEAD(pagelist); + struct radix_tree_iter iter; + void **slot; + int nr_none = 0, result = SCAN_SUCCEED; + + VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); + + /* Only allocate from the target node */ + gfp = alloc_hugepage_khugepaged_gfpmask() | + __GFP_OTHER_NODE | __GFP_THISNODE; + + new_page = khugepaged_alloc_page(hpage, gfp, node); + if (!new_page) { + result = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out; + } + + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + result = SCAN_CGROUP_CHARGE_FAIL; + goto out; + } + + new_page->index = start; + new_page->mapping = mapping; + __SetPageSwapBacked(new_page); + __SetPageLocked(new_page); + BUG_ON(!page_ref_freeze(new_page, 1)); + + + /* + * At this point the new_page is 'frozen' (page_count() is zero), locked + * and not up-to-date. It's safe to insert it into radix tree, because + * nobody would be able to map it or use it in other way until we + * unfreeze it. 
+ */ + + index = start; + spin_lock_irq(&mapping->tree_lock); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + int n = min(iter.index, end) - index; + + /* + * Handle holes in the radix tree: charge it from shmem and + * insert relevant subpage of new_page into the radix-tree. + */ + if (n && !shmem_charge(mapping->host, n)) { + result = SCAN_FAIL; + break; + } + nr_none += n; + for (; index < min(iter.index, end); index++) { + radix_tree_insert(&mapping->page_tree, index, + new_page + (index % HPAGE_PMD_NR)); + } + + /* We are done. */ + if (index >= end) + break; + + page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { + spin_unlock_irq(&mapping->tree_lock); + /* swap in or instantiate fallocated page */ + if (shmem_getpage(mapping->host, index, &page, + SGP_NOHUGE)) { + result = SCAN_FAIL; + goto tree_unlocked; + } + spin_lock_irq(&mapping->tree_lock); + } else if (trylock_page(page)) { + get_page(page); + } else { + result = SCAN_PAGE_LOCK; + break; + } + + /* + * The page must be locked, so we can drop the tree_lock + * without racing with truncate. 
+ */ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageUptodate(page), page); + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (page_mapping(page) != mapping) { + result = SCAN_TRUNCATED; + goto out_unlock; + } + spin_unlock_irq(&mapping->tree_lock); + + if (isolate_lru_page(page)) { + result = SCAN_DEL_PAGE_LRU; + goto out_isolate_failed; + } + + if (page_mapped(page)) + unmap_mapping_range(mapping, index << PAGE_SHIFT, + PAGE_SIZE, 0); + + spin_lock_irq(&mapping->tree_lock); + + VM_BUG_ON_PAGE(page_mapped(page), page); + + /* + * The page is expected to have page_count() == 3: + * - we hold a pin on it; + * - one reference from radix tree; + * - one from isolate_lru_page; + */ + if (!page_ref_freeze(page, 3)) { + result = SCAN_PAGE_COUNT; + goto out_lru; + } + + /* + * Add the page to the list to be able to undo the collapse if + * something go wrong. + */ + list_add_tail(&page->lru, &pagelist); + + /* Finally, replace with the new page. */ + radix_tree_replace_slot(slot, + new_page + (index % HPAGE_PMD_NR)); + + index++; + continue; +out_lru: + spin_unlock_irq(&mapping->tree_lock); + putback_lru_page(page); +out_isolate_failed: + unlock_page(page); + put_page(page); + goto tree_unlocked; +out_unlock: + unlock_page(page); + put_page(page); + break; + } + + /* + * Handle hole in radix tree at the end of the range. + * This code only triggers if there's nothing in radix tree + * beyond 'end'. 
+ */ + if (result == SCAN_SUCCEED && index < end) { + int n = end - index; + + if (!shmem_charge(mapping->host, n)) { + result = SCAN_FAIL; + goto tree_locked; + } + + for (; index < end; index++) { + radix_tree_insert(&mapping->page_tree, index, + new_page + (index % HPAGE_PMD_NR)); + } + nr_none += n; + } + +tree_locked: + spin_unlock_irq(&mapping->tree_lock); +tree_unlocked: + + if (result == SCAN_SUCCEED) { + unsigned long flags; + struct zone *zone = page_zone(new_page); + + /* + * Replacing old pages with new one has succeed, now we need to + * copy the content and free old pages. + */ + list_for_each_entry_safe(page, tmp, &pagelist, lru) { + copy_highpage(new_page + (page->index % HPAGE_PMD_NR), + page); + list_del(&page->lru); + unlock_page(page); + page_ref_unfreeze(page, 1); + page->mapping = NULL; + ClearPageActive(page); + ClearPageUnevictable(page); + put_page(page); + } + + local_irq_save(flags); + __inc_zone_page_state(new_page, NR_SHMEM_THPS); + if (nr_none) { + __mod_zone_page_state(zone, NR_FILE_PAGES, nr_none); + __mod_zone_page_state(zone, NR_SHMEM, nr_none); + } + local_irq_restore(flags); + + /* + * Remove pte page tables, so we can re-faulti + * the page as huge. 
+ */ + retract_page_tables(mapping, start); + + /* Everything is ready, let's unfreeze the new_page */ + set_page_dirty(new_page); + SetPageUptodate(new_page); + page_ref_unfreeze(new_page, HPAGE_PMD_NR); + mem_cgroup_commit_charge(new_page, memcg, false, true); + lru_cache_add_anon(new_page); + unlock_page(new_page); + + *hpage = NULL; + } else { + /* Something went wrong: rollback changes to the radix-tree */ + shmem_uncharge(mapping->host, nr_none); + spin_lock_irq(&mapping->tree_lock); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, + start) { + if (iter.index >= end) + break; + page = list_first_entry_or_null(&pagelist, + struct page, lru); + if (!page || iter.index < page->index) { + if (!nr_none) + break; + /* Put holes back where they were */ + radix_tree_replace_slot(slot, NULL); + nr_none--; + continue; + } + + VM_BUG_ON_PAGE(page->index != iter.index, page); + + /* Unfreeze the page. */ + list_del(&page->lru); + page_ref_unfreeze(page, 2); + radix_tree_replace_slot(slot, page); + spin_unlock_irq(&mapping->tree_lock); + putback_lru_page(page); + unlock_page(page); + spin_lock_irq(&mapping->tree_lock); + } + VM_BUG_ON(nr_none); + spin_unlock_irq(&mapping->tree_lock); + + /* Unfreeze new_page, caller would take care about freeing it */ + page_ref_unfreeze(new_page, 1); + mem_cgroup_cancel_charge(new_page, memcg, true); + unlock_page(new_page); + new_page->mapping = NULL; + } +out: + VM_BUG_ON(!list_empty(&pagelist)); + /* TODO: tracepoints */ +} + +static void khugepaged_scan_shmem(struct mm_struct *mm, + struct address_space *mapping, + pgoff_t start, struct page **hpage) +{ + struct page *page = NULL; + struct radix_tree_iter iter; + void **slot; + int present, swap; + int node = NUMA_NO_NODE; + int result = SCAN_SUCCEED; + + present = 0; + swap = 0; + memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + rcu_read_lock(); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + if (iter.index >= start + HPAGE_PMD_NR) 
+ break; + + page = radix_tree_deref_slot(slot); + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + if (radix_tree_exception(page)) { + if (++swap > khugepaged_max_ptes_swap) { + result = SCAN_EXCEED_SWAP_PTE; + break; + } + continue; + } + + if (PageTransCompound(page)) { + result = SCAN_PAGE_COMPOUND; + break; + } + + node = page_to_nid(page); + if (khugepaged_scan_abort(node)) { + result = SCAN_SCAN_ABORT; + break; + } + khugepaged_node_load[node]++; + + if (!PageLRU(page)) { + result = SCAN_PAGE_LRU; + break; + } + + if (page_count(page) != 1 + page_mapcount(page)) { + result = SCAN_PAGE_COUNT; + break; + } + + /* + * We probably should check if the page is referenced here, but + * nobody would transfer pte_young() to PageReferenced() for us. + * And rmap walk here is just too costly... + */ + + present++; + + if (need_resched()) { + cond_resched_rcu(); + slot = radix_tree_iter_next(&iter); + } + } + rcu_read_unlock(); + + if (result == SCAN_SUCCEED) { + if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + result = SCAN_EXCEED_NONE_PTE; + } else { + node = khugepaged_find_target_node(); + collapse_shmem(mm, mapping, start, hpage, node); + } + } + + /* TODO: tracepoints */ +} +#else +static void khugepaged_scan_shmem(struct mm_struct *mm, + struct address_space *mapping, + pgoff_t start, struct page **hpage) +{ + BUILD_BUG(); +} +#endif + static unsigned int khugepaged_scan_mm_slot(unsigned int pages, struct page **hpage) __releases(&khugepaged_mm_lock) @@ -1269,6 +1681,8 @@ skip: if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma)) + goto skip; while (khugepaged_scan.address < hend) { int ret; @@ -1279,9 +1693,20 @@ skip: VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - ret = khugepaged_scan_pmd(mm, vma, - khugepaged_scan.address, 
- hpage); + if (shmem_file(vma->vm_file)) { + struct file *file = get_file(vma->vm_file); + pgoff_t pgoff = linear_page_index(vma, + khugepaged_scan.address); + up_read(&mm->mmap_sem); + ret = 1; + khugepaged_scan_shmem(mm, file->f_mapping, + pgoff, hpage); + fput(file); + } else { + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + } /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; diff -puN mm/shmem.c~khugepaged-add-support-of-collapse-for-tmpfs-shmem-pages mm/shmem.c --- a/mm/shmem.c~khugepaged-add-support-of-collapse-for-tmpfs-shmem-pages +++ a/mm/shmem.c @@ -32,6 +32,7 @@ #include <linux/export.h> #include <linux/swap.h> #include <linux/uio.h> +#include <linux/khugepaged.h> static struct vfsmount *shm_mnt; @@ -97,16 +98,6 @@ struct shmem_falloc { pgoff_t nr_unswapped; /* how often writepage refused to swap out */ }; -/* Flag allocation requirements to shmem_getpage */ -enum sgp_type { - SGP_READ, /* don't exceed i_size, don't allocate page */ - SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ - SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ - SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ - SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ -}; - #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { @@ -126,7 +117,7 @@ static int shmem_getpage_gfp(struct inod struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); -static inline int shmem_getpage(struct inode *inode, pgoff_t index, +int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp) { return shmem_getpage_gfp(inode, index, pagep, sgp, @@ -1899,6 +1890,11 @@ static int shmem_mmap(struct file *file, { file_accessed(file); vma->vm_ops = &shmem_vm_ops; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + ((vma->vm_start + ~HPAGE_PMD_MASK) & 
HPAGE_PMD_MASK) < + (vma->vm_end & HPAGE_PMD_MASK)) { + khugepaged_enter(vma, vma->vm_flags); + } return 0; } @@ -3803,6 +3799,37 @@ static ssize_t shmem_enabled_store(struc struct kobj_attribute shmem_enabled_attr = __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); + +bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + loff_t i_size; + pgoff_t off; + + if (shmem_huge == SHMEM_HUGE_FORCE) + return true; + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + switch (sbinfo->huge) { + case SHMEM_HUGE_NEVER: + return false; + case SHMEM_HUGE_ALWAYS: + return true; + case SHMEM_HUGE_WITHIN_SIZE: + off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && + i_size >> PAGE_SHIFT >= off) + return true; + case SHMEM_HUGE_ADVISE: + /* TODO: implement fadvise() hints */ + return (vma->vm_flags & VM_HUGEPAGE); + default: + VM_BUG_ON(1); + return false; + } +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ #else /* !CONFIG_SHMEM */ @@ -3982,6 +4009,13 @@ int shmem_zero_setup(struct vm_area_stru fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < + (vma->vm_end & HPAGE_PMD_MASK)) { + khugepaged_enter(vma, vma->vm_flags); + } + return 0; } _ -- To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html