> On Jun 24, 2019, at 5:47 AM, Kirill A. Shutemov <kirill@xxxxxxxxxxxxx> wrote:
> 
> On Sat, Jun 22, 2019 at 10:47:48PM -0700, Song Liu wrote:
>> This patch is (hopefully) the first step toward enabling THP for
>> non-shmem filesystems.
>> 
>> This patch enables an application to put parts of its text section
>> into THP via madvise, for example:
>> 
>>     madvise((void *)0x600000, 0x200000, MADV_HUGEPAGE);
>> 
>> We try to reuse the existing logic for THP on tmpfs.
>> 
>> Currently, writes are not supported for non-shmem THP; khugepaged
>> will only process VMAs with VM_DENYWRITE set. The next patch will
>> handle writes, which can only happen once the VMA with VM_DENYWRITE
>> is unmapped.
>> 
>> An EXPERIMENTAL config, READ_ONLY_THP_FOR_FS, is added to gate this
>> feature.
>> 
>> Acked-by: Rik van Riel <riel@xxxxxxxxxxx>
>> Signed-off-by: Song Liu <songliubraving@xxxxxx>
>> ---
>>  mm/Kconfig      | 11 ++++++
>>  mm/filemap.c    |  4 +--
>>  mm/khugepaged.c | 90 ++++++++++++++++++++++++++++++++++++++++---------
>>  mm/rmap.c       | 12 ++++---
>>  4 files changed, 96 insertions(+), 21 deletions(-)
>> 
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index f0c76ba47695..0a8fd589406d 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -762,6 +762,17 @@ config GUP_BENCHMARK
>> 
>>  	  See tools/testing/selftests/vm/gup_benchmark.c
>> 
>> +config READ_ONLY_THP_FOR_FS
>> +	bool "Read-only THP for filesystems (EXPERIMENTAL)"
>> +	depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
>> +
>> +	help
>> +	  Allow khugepaged to put read-only file-backed pages in THP.
>> +
>> +	  This is marked experimental because it is a new feature. Write
>> +	  support of file THPs will be developed in the next few release
>> +	  cycles.
>> +
>>  config ARCH_HAS_PTE_SPECIAL
>>  	bool
>> 
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index 5f072a113535..e79ceccdc6df 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -203,8 +203,8 @@ static void unaccount_page_cache_page(struct address_space *mapping,
>>  		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
>>  		if (PageTransHuge(page))
>>  			__dec_node_page_state(page, NR_SHMEM_THPS);
>> -	} else {
>> -		VM_BUG_ON_PAGE(PageTransHuge(page), page);
>> +	} else if (PageTransHuge(page)) {
>> +		__dec_node_page_state(page, NR_FILE_THPS);
>>  	}
>> 
>>  	/*
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 158cad542627..090127e4e185 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -48,6 +48,7 @@ enum scan_result {
>>  	SCAN_CGROUP_CHARGE_FAIL,
>>  	SCAN_EXCEED_SWAP_PTE,
>>  	SCAN_TRUNCATED,
>> +	SCAN_PAGE_HAS_PRIVATE,
>>  };
>> 
>>  #define CREATE_TRACE_POINTS
>> @@ -404,7 +405,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
>>  	    (vm_flags & VM_NOHUGEPAGE) ||
>>  	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
>>  		return false;
>> -	if (shmem_file(vma->vm_file)) {
>> +
>> +	if (shmem_file(vma->vm_file) ||
>> +	    (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
>> +	     vma->vm_file &&
>> +	     (vm_flags & VM_DENYWRITE))) {
>>  		if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
>>  			return false;
>>  		return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
>> @@ -456,8 +461,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
>>  	unsigned long hstart, hend;
>> 
>>  	/*
>> -	 * khugepaged does not yet work on non-shmem files or special
>> -	 * mappings. And file-private shmem THP is not supported.
>> +	 * khugepaged only supports read-only files for non-shmem files.
>> +	 * khugepaged does not yet work on special mappings. And
>> +	 * file-private shmem THP is not supported.
>>  	 */
>>  	if (!hugepage_vma_check(vma, vm_flags))
>>  		return 0;
>> @@ -1287,12 +1293,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>>  }
>> 
>>  /**
>> - * collapse_file - collapse small tmpfs/shmem pages into huge one.
>> + * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
>>   *
>>   * Basic scheme is simple, details are more complex:
>>   *  - allocate and lock a new huge page;
>>   *  - scan page cache replacing old pages with the new one
>> - *    + swap in pages if necessary;
>> + *    + swap/gup in pages if necessary;
>>   *    + fill in gaps;
>>   *    + keep old pages around in case rollback is required;
>>   *  - if replacing succeeds:
>> @@ -1316,7 +1322,11 @@ static void collapse_file(struct mm_struct *mm,
>>  	LIST_HEAD(pagelist);
>>  	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
>>  	int nr_none = 0, result = SCAN_SUCCEED;
>> +	bool is_shmem = shmem_file(file);
>> 
>> +#ifndef CONFIG_READ_ONLY_THP_FOR_FS
>> +	VM_BUG_ON(!is_shmem);
>> +#endif
> 
> VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);

Will fix.

> 
>>  	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
>> 
>>  	/* Only allocate from the target node */
>> @@ -1348,7 +1358,8 @@ static void collapse_file(struct mm_struct *mm,
>>  	} while (1);
>> 
>>  	__SetPageLocked(new_page);
>> -	__SetPageSwapBacked(new_page);
>> +	if (is_shmem)
>> +		__SetPageSwapBacked(new_page);
>>  	new_page->index = start;
>>  	new_page->mapping = mapping;
>> 
>> @@ -1363,7 +1374,7 @@ static void collapse_file(struct mm_struct *mm,
>>  		struct page *page = xas_next(&xas);
>> 
>>  		VM_BUG_ON(index != xas.xa_index);
>> -		if (!page) {
>> +		if (is_shmem && !page) {
>>  			/*
>>  			 * Stop if extent has been truncated or hole-punched,
>>  			 * and is now completely empty.
>> @@ -1384,7 +1395,7 @@
>>  			continue;
>>  		}
>> 
>> -		if (xa_is_value(page) || !PageUptodate(page)) {
>> +		if (is_shmem && (xa_is_value(page) || !PageUptodate(page))) {
>>  			xas_unlock_irq(&xas);
>>  			/* swap in or instantiate fallocated page */
>>  			if (shmem_getpage(mapping->host, index, &page,
>> @@ -1392,6 +1403,23 @@
>>  				result = SCAN_FAIL;
>>  				goto xa_unlocked;
>>  			}
>> +		} else if (!page || xa_is_value(page)) {
>> +			xas_unlock_irq(&xas);
>> +			page_cache_sync_readahead(mapping, &file->f_ra, file,
>> +						  index, PAGE_SIZE);
>> +			lru_add_drain();
> 
> Why?

isolate_lru_page() is likely to fail if we don't drain the pagevecs.

> 
>> +			page = find_lock_page(mapping, index);
>> +			if (unlikely(page == NULL)) {
>> +				result = SCAN_FAIL;
>> +				goto xa_unlocked;
>> +			}
>> +		} else if (!PageUptodate(page)) {
> 
> Maybe we should try wait_on_page_locked() here before giving up?

Are you referring to the "if (!PageUptodate(page))" case?

> 
>> +			VM_BUG_ON(is_shmem);
>> +			result = SCAN_FAIL;
>> +			goto xa_locked;
>> +		} else if (!is_shmem && PageDirty(page)) {
>> +			result = SCAN_FAIL;
>> +			goto xa_locked;
>>  		} else if (trylock_page(page)) {
>>  			get_page(page);
>>  			xas_unlock_irq(&xas);
> 
> -- 
> Kirill A. Shutemov
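P.S. For anyone who wants to try this: the madvise() usage from the commit
message boils down to the sketch below. It is illustrative only, and it
assumes a non-PIE binary whose text segment covers the PMD-aligned range
starting at 0x600000 (the same example address as above), a kernel built
with CONFIG_READ_ONLY_THP_FOR_FS=y, and khugepaged running.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/*
	 * Hint khugepaged to collapse one PMD-sized (2MB) extent of our
	 * own text mapping. Only the executable's text mapping carries
	 * VM_DENYWRITE, which hugepage_vma_check() requires for
	 * non-shmem files.
	 */
	if (madvise((void *)0x600000, 0x200000, MADV_HUGEPAGE))
		perror("madvise");

	pause();	/* stay mapped so khugepaged gets a chance to scan */
	return 0;
}

Whether the collapse happened is reflected in the NR_FILE_THPS counter
referenced in the mm/filemap.c hunk above.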