The patch titled Subject: mm, THP, swap: support to read a huge swap cluster for swapin a THP has been added to the -mm tree. Its filename is mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp.patch and later at http://ozlabs.org/~akpm/mmotm/broken-out/mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Huang Ying <ying.huang@xxxxxxxxx> Subject: mm, THP, swap: support to read a huge swap cluster for swapin a THP To swap in a THP as a whole, we need to read a huge swap cluster from the swap device. This patch revises __read_swap_cache_async() and its callers and callees to support this. If __read_swap_cache_async() finds that the swap cluster of the specified swap entry is huge, it will try to allocate a THP and add it into the swap cache, so that the contents of the huge swap cluster can later be read into the THP. Link: http://lkml.kernel.org/r/20180622035151.6676-9-ying.huang@xxxxxxxxx Signed-off-by: "Huang, Ying" <ying.huang@xxxxxxxxx> Cc: "Kirill A. 
Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: Johannes Weiner <hannes@xxxxxxxxxxx> Cc: Shaohua Li <shli@xxxxxxxxxx> Cc: Hugh Dickins <hughd@xxxxxxxxxx> Cc: Minchan Kim <minchan@xxxxxxxxxx> Cc: Rik van Riel <riel@xxxxxxxxxx> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx> Cc: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx> Cc: Zi Yan <zi.yan@xxxxxxxxxxxxxx> Cc: Daniel Jordan <daniel.m.jordan@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- diff -puN include/linux/huge_mm.h~mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp include/linux/huge_mm.h --- a/include/linux/huge_mm.h~mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp +++ a/include/linux/huge_mm.h @@ -250,6 +250,39 @@ static inline bool thp_migration_support return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } +/* + * always: directly stall for all thp allocations + * defer: wake kswapd and fail if not immediately available + * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise + * fail if not immediately available + * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately + * available + * never: never stall for any thp allocation + */ +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +{ + bool vma_madvised; + + if (!vma) + return GFP_TRANSHUGE_LIGHT; + vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + &transparent_hugepage_flags)) + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, + &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? 
__GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + return GFP_TRANSHUGE_LIGHT; +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -362,6 +395,11 @@ static inline bool thp_migration_support { return false; } + +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff -puN include/linux/swap.h~mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp include/linux/swap.h --- a/include/linux/swap.h~mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp +++ a/include/linux/swap.h @@ -462,7 +462,7 @@ extern sector_t map_swap_page(struct pag extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); extern int __swap_count(swp_entry_t entry); -extern int __swp_swapcount(swp_entry_t entry); +extern int __swp_swapcount(swp_entry_t entry, bool *huge_cluster); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); @@ -589,7 +589,7 @@ static inline int __swap_count(swp_entry return 0; } -static inline int __swp_swapcount(swp_entry_t entry) +static inline int __swp_swapcount(swp_entry_t entry, bool *huge_cluster) { return 0; } diff -puN mm/huge_memory.c~mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp mm/huge_memory.c --- a/mm/huge_memory.c~mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp +++ a/mm/huge_memory.c @@ -620,32 +620,6 @@ release: } -/* - * always: directly stall for all thp allocations - * defer: wake kswapd and fail if not immediately available - * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, 
otherwise - * fail if not immediately available - * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately - * available - * never: never stall for any thp allocation - */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) -{ - const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - 0); - return GFP_TRANSHUGE_LIGHT; -} - /* Caller must hold page table lock. 
*/ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, diff -puN mm/swapfile.c~mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp mm/swapfile.c --- a/mm/swapfile.c~mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp +++ a/mm/swapfile.c @@ -1497,7 +1497,8 @@ int __swap_count(swp_entry_t entry) return count; } -static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry, + bool *huge_cluster) { int count = 0; pgoff_t offset = swp_offset(entry); @@ -1505,6 +1506,8 @@ static int swap_swapcount(struct swap_in ci = lock_cluster_or_swap_info(si, offset); count = swap_count(si->swap_map[offset]); + if (huge_cluster && ci) + *huge_cluster = cluster_is_huge(ci); unlock_cluster_or_swap_info(si, ci); return count; } @@ -1514,14 +1517,14 @@ static int swap_swapcount(struct swap_in * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. 
*/ -int __swp_swapcount(swp_entry_t entry) +int __swp_swapcount(swp_entry_t entry, bool *huge_cluster) { int count = 0; struct swap_info_struct *si; si = get_swap_device(entry); if (si) { - count = swap_swapcount(si, entry); + count = swap_swapcount(si, entry, huge_cluster); put_swap_device(si); } return count; @@ -1681,7 +1684,7 @@ static int page_trans_huge_map_swapcount return map_swapcount; } #else -#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) +#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry, NULL) #define page_swapped(page) (page_swapcount(page) != 0) static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, diff -puN mm/swap_state.c~mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp mm/swap_state.c --- a/mm/swap_state.c~mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp +++ a/mm/swap_state.c @@ -386,6 +386,9 @@ struct page *__read_swap_cache_async(swp struct page *found_page = NULL, *new_page = NULL; struct swap_info_struct *si; int err; + bool huge_cluster = false; + swp_entry_t hentry; + *new_page_allocated = false; do { @@ -411,14 +414,32 @@ struct page *__read_swap_cache_async(swp * as SWAP_HAS_CACHE. That's done in later part of code or * else swap_off will be aborted if we return NULL. */ - if (!__swp_swapcount(entry) && swap_slot_cache_enabled) + if (!__swp_swapcount(entry, &huge_cluster) && + swap_slot_cache_enabled) break; /* * Get a new page to read into from swap. 
*/ - if (!new_page) { - new_page = alloc_page_vma(gfp_mask, vma, addr); + if (!new_page || + (thp_swap_supported() && + !!PageTransCompound(new_page) != huge_cluster)) { + if (new_page) + put_page(new_page); + if (thp_swap_supported() && huge_cluster) { + gfp_t gfp = alloc_hugepage_direct_gfpmask(vma); + + new_page = alloc_hugepage_vma(gfp, vma, + addr, HPAGE_PMD_ORDER); + if (new_page) + prep_transhuge_page(new_page); + hentry = swp_entry(swp_type(entry), + round_down(swp_offset(entry), + HPAGE_PMD_NR)); + } else { + new_page = alloc_page_vma(gfp_mask, vma, addr); + hentry = entry; + } if (!new_page) break; /* Out of memory */ } @@ -426,33 +447,37 @@ struct page *__read_swap_cache_async(swp /* * call radix_tree_preload() while we can wait. */ - err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); + err = radix_tree_maybe_preload_order(gfp_mask & GFP_KERNEL, + compound_order(new_page)); if (err) break; /* * Swap entry may have been freed since our caller observed it. */ - err = swapcache_prepare(entry, false); - if (err == -EEXIST) { - radix_tree_preload_end(); - /* - * We might race against get_swap_page() and stumble - * across a SWAP_HAS_CACHE swap_map entry whose page - * has not been brought into the swapcache yet. - */ - cond_resched(); - continue; - } - if (err) { /* swp entry is obsolete ? */ + err = swapcache_prepare(hentry, huge_cluster); + if (err) { radix_tree_preload_end(); - break; + if (err == -EEXIST) { + /* + * We might race against get_swap_page() and + * stumble across a SWAP_HAS_CACHE swap_map + * entry whose page has not been brought into + * the swapcache yet. + */ + cond_resched(); + continue; + } else if (err == -ENOTDIR) { + /* huge swap cluster is split under us */ + continue; + } else /* swp entry is obsolete ? */ + break; } /* May fail (-ENOMEM) if radix-tree node allocation failed. 
*/ __SetPageLocked(new_page); __SetPageSwapBacked(new_page); - err = __add_to_swap_cache(new_page, entry); + err = __add_to_swap_cache(new_page, hentry); if (likely(!err)) { radix_tree_preload_end(); /* @@ -460,6 +485,9 @@ struct page *__read_swap_cache_async(swp */ lru_cache_add_anon(new_page); *new_page_allocated = true; + if (thp_swap_supported() && huge_cluster) + new_page += swp_offset(entry) & + (HPAGE_PMD_NR - 1); return new_page; } radix_tree_preload_end(); @@ -468,7 +496,7 @@ struct page *__read_swap_cache_async(swp * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - put_swap_page(new_page, entry); + put_swap_page(new_page, hentry); } while (err != -ENOMEM); if (new_page) @@ -490,7 +518,7 @@ struct page *read_swap_cache_async(swp_e vma, addr, &page_was_allocated); if (page_was_allocated) - swap_readpage(retpage, do_poll); + swap_readpage(compound_head(retpage), do_poll); return retpage; } @@ -609,8 +637,9 @@ struct page *swap_cluster_readahead(swp_ if (!page) continue; if (page_allocated) { - swap_readpage(page, false); - if (offset != entry_offset) { + swap_readpage(compound_head(page), false); + if (offset != entry_offset && + !PageTransCompound(page)) { SetPageReadahead(page); count_vm_event(SWAP_RA); } @@ -771,8 +800,8 @@ static struct page *swap_vma_readahead(s if (!page) continue; if (page_allocated) { - swap_readpage(page, false); - if (i != ra_info.offset) { + swap_readpage(compound_head(page), false); + if (i != ra_info.offset && !PageTransCompound(page)) { SetPageReadahead(page); count_vm_event(SWAP_RA); } _ Patches currently in -mm which might be from ying.huang@xxxxxxxxx are mm-clear_huge_page-move-order-algorithm-into-a-separate-function.patch mm-huge-page-copy-target-sub-page-last-when-copy-huge-page.patch mm-hugetlbfs-rename-address-to-haddr-in-hugetlb_cow.patch mm-hugetlbfs-pass-fault-address-to-cow-handler.patch mm-swap-fix-race-between-swapoff-and-some-swap-operations.patch 
mm-swap-fix-race-between-swapoff-and-some-swap-operations-v6.patch mm-fix-race-between-swapoff-and-mincore.patch mm-thp-swap-enable-pmd-swap-operations-for-config_thp_swap.patch mm-thp-swap-make-config_thp_swap-depends-on-config_swap.patch mm-thp-swap-support-pmd-swap-mapping-in-swap_duplicate.patch mm-thp-swap-support-pmd-swap-mapping-in-swapcache_free_cluster.patch mm-thp-swap-support-pmd-swap-mapping-in-free_swap_and_cache-swap_free.patch mm-thp-swap-support-pmd-swap-mapping-when-splitting-huge-pmd.patch mm-thp-swap-support-pmd-swap-mapping-in-split_swap_cluster.patch mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp.patch mm-thp-swap-swapin-a-thp-as-a-whole.patch mm-thp-swap-support-to-count-thp-swapin-and-its-fallback.patch mm-thp-swap-add-sysfs-interface-to-configure-thp-swapin.patch mm-thp-swap-support-pmd-swap-mapping-in-swapoff.patch mm-thp-swap-support-pmd-swap-mapping-in-madvise_free.patch mm-cgroup-thp-swap-support-to-move-swap-account-for-pmd-swap-mapping.patch mm-thp-swap-support-to-copy-pmd-swap-mapping-when-fork.patch mm-thp-swap-free-pmd-swap-mapping-when-zap_huge_pmd.patch mm-thp-swap-support-pmd-swap-mapping-for-madv_willneed.patch mm-thp-swap-support-pmd-swap-mapping-in-mincore.patch mm-thp-swap-support-pmd-swap-mapping-in-common-path.patch mm-thp-swap-create-pmd-swap-mapping-when-unmap-the-thp.patch mm-thp-avoid-to-split-thp-when-reclaim-madv_free-thp.patch -- To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html