The patch titled Subject: khugepaged: enable collapse pmd for pte-mapped THP has been removed from the -mm tree. Its filename was khugepaged-enable-collapse-pmd-for-pte-mapped-thp.patch This patch was dropped because an updated version will be merged ------------------------------------------------------ From: Song Liu <songliubraving@xxxxxx> Subject: khugepaged: enable collapse pmd for pte-mapped THP khugepaged needs exclusive mmap_sem to access page table. When it fails to lock mmap_sem, the page will fault in as pte-mapped THP. As the page is already a THP, khugepaged will not handle this pmd again. This patch enables the khugepaged to retry collapse the page table. struct mm_slot (in khugepaged.c) is extended with an array, containing addresses of pte-mapped THPs. We use array here for simplicity. We can easily replace it with more advanced data structures when needed. This array is protected by khugepaged_mm_lock. In khugepaged_scan_mm_slot(), if the mm contains pte-mapped THP, we try to collapse the page table. Since collapse may happen at an later time, some pages may already fault in. collapse_pte_mapped_thp() is added to properly handle these pages. collapse_pte_mapped_thp() also double checks whether all ptes in this pmd are mapping to the same THP. This is necessary because some subpage of the THP may be replaced, for example by uprobe. In such cases, it is not possible to collapse the pmd. Link: http://lkml.kernel.org/r/20190731183331.2565608-2-songliubraving@xxxxxx Signed-off-by: Song Liu <songliubraving@xxxxxx> Cc: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> Cc: Matthew Wilcox <matthew.wilcox@xxxxxxxxxx> Cc: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx> Cc: Oleg Nesterov <oleg@xxxxxxxxxx> Cc: Steven Rostedt (VMware) <rostedt@xxxxxxxxxxx> Cc: William Kucharski <william.kucharski@xxxxxxxxxx> Cc: Srikar Dronamraju <srikar@xxxxxxxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- include/linux/khugepaged.h | 12 +++ mm/khugepaged.c | 140 +++++++++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+) --- a/include/linux/khugepaged.h~khugepaged-enable-collapse-pmd-for-pte-mapped-thp +++ a/include/linux/khugepaged.h @@ -15,6 +15,14 @@ extern int __khugepaged_enter(struct mm_ extern void __khugepaged_exit(struct mm_struct *mm); extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags); +#ifdef CONFIG_SHMEM +extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long haddr); +#else +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long haddr) +{ +} +#endif #define khugepaged_enabled() \ (transparent_hugepage_flags & \ @@ -73,6 +81,10 @@ static inline int khugepaged_enter_vma_m { return 0; } +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long haddr) +{ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_KHUGEPAGED_H */ --- a/mm/khugepaged.c~khugepaged-enable-collapse-pmd-for-pte-mapped-thp +++ a/mm/khugepaged.c @@ -76,6 +76,8 @@ static __read_mostly DEFINE_HASHTABLE(mm static struct kmem_cache *mm_slot_cache __read_mostly; +#define MAX_PTE_MAPPED_THP 8 + /** * struct mm_slot - hash lookup from mm to mm_slot * @hash: hash collision list @@ -86,6 +88,10 @@ struct mm_slot { struct hlist_node hash; struct list_head mm_node; struct mm_struct *mm; + + /* pte-mapped THP in this mm */ + int nr_pte_mapped_thp; + unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP]; }; /** @@ -1248,6 +1254,135 @@ static void collect_mm_slot(struct mm_sl } #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +/* + * Notify khugepaged that given addr of the mm is pte-mapped THP. Then + * khugepaged should try to collapse the page table. + */ +static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ + struct mm_slot *mm_slot; + int ret = 0; + + /* hold mmap_sem for khugepaged_test_exit() */ + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + + if (unlikely(khugepaged_test_exit(mm))) + return 0; + + if (!test_bit(MMF_VM_HUGEPAGE, &mm->flags) && + !test_bit(MMF_DISABLE_THP, &mm->flags)) { + ret = __khugepaged_enter(mm); + if (ret) + return ret; + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) + mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; + spin_unlock(&khugepaged_mm_lock); + return 0; +} + +/** + * Try to collapse a pte-mapped THP for mm at address haddr. + * + * This function checks whether all the PTEs in the PMD are pointing to the + * right THP. If so, retract the page table so the THP can refault in with + * as pmd-mapped. + */ +void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long haddr) +{ + struct vm_area_struct *vma = find_vma(mm, haddr); + pmd_t *pmd = mm_find_pmd(mm, haddr); + struct page *hpage = NULL; + unsigned long addr; + spinlock_t *ptl; + int count = 0; + pmd_t _pmd; + int i; + + VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); + + if (!vma || !pmd || pmd_trans_huge(*pmd)) + return; + + /* step 1: check all mapped PTEs are to the right huge page */ + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + pte_t *pte = pte_offset_map(pmd, addr); + struct page *page; + + if (pte_none(*pte)) + continue; + + page = vm_normal_page(vma, addr, *pte); + + if (!PageCompound(page)) + return; + + if (!hpage) { + hpage = compound_head(page); + if (hpage->mapping != vma->vm_file->f_mapping) + return; + } + + if (hpage + i != page) + return; + count++; + } + + /* step 2: adjust rmap */ + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + pte_t *pte = pte_offset_map(pmd, addr); + struct page *page; + + if (pte_none(*pte)) + continue; + page = vm_normal_page(vma, addr, *pte); + page_remove_rmap(page, false); + } + + /* step 3: set proper refcount and mm_counters. */ + if (hpage) { + page_ref_sub(hpage, count); + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); + } + + /* step 4: collapse pmd */ + ptl = pmd_lock(vma->vm_mm, pmd); + _pmd = pmdp_collapse_flush(vma, addr, pmd); + spin_unlock(ptl); + mm_dec_nr_ptes(mm); + pte_free(mm, pmd_pgtable(_pmd)); +} + +static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + int i; + + lockdep_assert_held(&khugepaged_mm_lock); + + if (likely(mm_slot->nr_pte_mapped_thp == 0)) + return 0; + + if (!down_write_trylock(&mm->mmap_sem)) + return -EBUSY; + + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) + collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]); + +out: + mm_slot->nr_pte_mapped_thp = 0; + up_write(&mm->mmap_sem); + return 0; +} + static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; @@ -1281,6 +1416,10 @@ static void retract_page_tables(struct a up_write(&vma->vm_mm->mmap_sem); mm_dec_nr_ptes(vma->vm_mm); pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + } else if (down_read_trylock(&vma->vm_mm->mmap_sem)) { + /* need down_read for khugepaged_test_exit() */ + khugepaged_add_pte_mapped_thp(vma->vm_mm, addr); + up_read(&vma->vm_mm->mmap_sem); } } i_mmap_unlock_write(mapping); @@ -1667,6 +1806,7 @@ static unsigned int khugepaged_scan_mm_s khugepaged_scan.address = 0; khugepaged_scan.mm_slot = mm_slot; } + khugepaged_collapse_pte_mapped_thps(mm_slot); spin_unlock(&khugepaged_mm_lock); mm = mm_slot->mm; _ Patches currently in -mm which might be from songliubraving@xxxxxx are uprobe-collapse-thp-pmd-after-removing-all-uprobes.patch filemap-check-compound_headpage-mapping-in-filemap_fault.patch filemap-check-compound_headpage-mapping-in-pagecache_get_page.patch filemap-update-offset-check-in-filemap_fault.patch mmthp-stats-for-file-backed-thp.patch khugepaged-rename-collapse_shmem-and-khugepaged_scan_shmem.patch mmthp-add-read-only-thp-support-for-non-shmem-fs.patch mmthp-avoid-writes-to-file-with-thp-in-pagecache.patch