The patch titled Subject: mm/hugetlb: use __GFP_COMP for gigantic folios has been added to the -mm mm-unstable branch. Its filename is mm-hugetlb-use-__gfp_comp-for-gigantic-folios.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-hugetlb-use-__gfp_comp-for-gigantic-folios.patch This patch will later appear in the mm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Yu Zhao <yuzhao@xxxxxxxxxx> Subject: mm/hugetlb: use __GFP_COMP for gigantic folios Date: Sun, 11 Aug 2024 15:21:29 -0600 Use __GFP_COMP for gigantic folios to greatly reduce not only the code but also the allocation and free time. LOC (approximately): -200, +50 Allocate and free 500 1GB hugeTLB memory without HVO by: time echo 500 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages time echo 0 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages Before After Alloc ~13s ~10s Free ~15s <1s The above magnitude generally holds for multiple x86 and arm64 CPU models. Link: https://lkml.kernel.org/r/20240811212129.3074314-4-yuzhao@xxxxxxxxxx Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx> Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> Cc: Muchun Song <muchun.song@xxxxxxxxx> Cc: Zi Yan <ziy@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- include/linux/hugetlb.h | 9 - mm/hugetlb.c | 239 ++++++-------------------------------- 2 files changed, 48 insertions(+), 200 deletions(-) --- a/include/linux/hugetlb.h~mm-hugetlb-use-__gfp_comp-for-gigantic-folios +++ a/include/linux/hugetlb.h @@ -896,10 +896,11 @@ static inline bool hugepage_movable_supp /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepage_movable_supported(h)) - return GFP_HIGHUSER_MOVABLE; - else - return GFP_HIGHUSER; + gfp_t gfp = __GFP_COMP | __GFP_NOWARN; + + gfp |= hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE : GFP_HIGHUSER; + + return gfp; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) --- a/mm/hugetlb.c~mm-hugetlb-use-__gfp_comp-for-gigantic-folios +++ a/mm/hugetlb.c @@ -1512,43 +1512,7 @@ static int hstate_next_node_to_free(stru ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -/* used to demote non-gigantic_huge pages as well */ -static void __destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order, bool demote) -{ - int i; - int nr_pages = 1 << order; - struct page *p; - - atomic_set(&folio->_entire_mapcount, 0); - atomic_set(&folio->_large_mapcount, 0); - atomic_set(&folio->_pincount, 0); - - for (i = 1; i < nr_pages; i++) { - p = folio_page(folio, i); - p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; - p->mapping = NULL; - clear_compound_head(p); - if (!demote) - set_page_refcounted(p); - } - - __folio_clear_head(folio); -} - -static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, - unsigned int order) -{ - __destroy_compound_gigantic_folio(folio, order, true); -} - #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static void destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order) -{ - __destroy_compound_gigantic_folio(folio, order, false); -} - static void free_gigantic_folio(struct folio *folio, unsigned int order) { /* @@ -1569,38 +1533,52 @@ static void free_gigantic_folio(struct f static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - struct page *page; - unsigned long nr_pages = pages_per_huge_page(h); + struct folio *folio; + int order = huge_page_order(h); + bool retry = false; + if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - +retry: + folio = NULL; #ifdef CONFIG_CMA { int node; - if (hugetlb_cma[nid]) { - page = cma_alloc(hugetlb_cma[nid], nr_pages, - huge_page_order(h), true); - if (page) - return page_folio(page); - } + if (hugetlb_cma[nid]) + folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask); - if (!(gfp_mask & __GFP_THISNODE)) { + if (!folio && !(gfp_mask & __GFP_THISNODE)) { for_each_node_mask(node, *nodemask) { if (node == nid || !hugetlb_cma[node]) continue; - page = cma_alloc(hugetlb_cma[node], nr_pages, - huge_page_order(h), true); - if (page) - return page_folio(page); + folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask); + if (folio) + break; } } } #endif + if (!folio) { + struct page *page = alloc_contig_pages(1 << order, gfp_mask, nid, nodemask); + + if (!page) + return NULL; - page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); - return page ? page_folio(page) : NULL; + folio = page_folio(page); + } + + if (folio_ref_freeze(folio, 1)) + return folio; + + pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio)); + free_gigantic_folio(folio, order); + if (!retry) { + retry = true; + goto retry; + } + return NULL; } #else /* !CONFIG_CONTIG_ALLOC */ @@ -1619,8 +1597,6 @@ static struct folio *alloc_gigantic_foli } static inline void free_gigantic_folio(struct folio *folio, unsigned int order) { } -static inline void destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order) { } #endif /* @@ -1747,6 +1723,7 @@ static void __update_and_free_hugetlb_fo folio_clear_hugetlb_hwpoison(folio); folio_ref_unfreeze(folio, 1); + INIT_LIST_HEAD(&folio->_deferred_list); /* * Non-gigantic pages demoted from CMA allocated gigantic pages @@ -1754,10 +1731,8 @@ static void __update_and_free_hugetlb_fo */ if (hstate_is_gigantic(h) || hugetlb_cma_folio(folio, huge_page_order(h))) { - destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_folio(folio, huge_page_order(h)); } else { - INIT_LIST_HEAD(&folio->_deferred_list); folio_clear_partially_mapped(folio); folio_put(folio); } @@ -2033,95 +2008,6 @@ static void prep_new_hugetlb_folio(struc spin_unlock_irq(&hugetlb_lock); } -static bool __prep_compound_gigantic_folio(struct folio *folio, - unsigned int order, bool demote) -{ - int i, j; - int nr_pages = 1 << order; - struct page *p; - - __folio_clear_reserved(folio); - for (i = 0; i < nr_pages; i++) { - p = folio_page(folio, i); - - /* - * For gigantic hugepages allocated through bootmem at - * boot, it's safer to be consistent with the not-gigantic - * hugepages and clear the PG_reserved bit from all tail pages - * too. Otherwise drivers using get_user_pages() to access tail - * pages may get the reference counting wrong if they see - * PG_reserved set on a tail page (despite the head page not - * having PG_reserved set). Enforcing this consistency between - * head and tail pages allows drivers to optimize away a check - * on the head page when they need know if put_page() is needed - * after get_user_pages(). - */ - if (i != 0) /* head page cleared above */ - __ClearPageReserved(p); - /* - * Subtle and very unlikely - * - * Gigantic 'page allocators' such as memblock or cma will - * return a set of pages with each page ref counted. We need - * to turn this set of pages into a compound page with tail - * page ref counts set to zero. Code such as speculative page - * cache adding could take a ref on a 'to be' tail page. - * We need to respect any increased ref count, and only set - * the ref count to zero if count is currently 1. If count - * is not 1, we return an error. An error return indicates - * the set of pages can not be converted to a gigantic page. - * The caller who allocated the pages should then discard the - * pages using the appropriate free interface. - * - * In the case of demote, the ref count will be zero. - */ - if (!demote) { - if (!page_ref_freeze(p, 1)) { - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); - goto out_error; - } - } else { - VM_BUG_ON_PAGE(page_count(p), p); - } - if (i != 0) - set_compound_head(p, &folio->page); - } - __folio_set_head(folio); - /* we rely on prep_new_hugetlb_folio to set the hugetlb flag */ - folio_set_order(folio, order); - atomic_set(&folio->_entire_mapcount, -1); - atomic_set(&folio->_large_mapcount, -1); - atomic_set(&folio->_pincount, 0); - return true; - -out_error: - /* undo page modifications made above */ - for (j = 0; j < i; j++) { - p = folio_page(folio, j); - if (j != 0) - clear_compound_head(p); - set_page_refcounted(p); - } - /* need to clear PG_reserved on remaining tail pages */ - for (; j < nr_pages; j++) { - p = folio_page(folio, j); - __ClearPageReserved(p); - } - return false; -} - -static bool prep_compound_gigantic_folio(struct folio *folio, - unsigned int order) -{ - return __prep_compound_gigantic_folio(folio, order, false); -} - -static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, - unsigned int order) -{ - return __prep_compound_gigantic_folio(folio, order, true); -} - /* * Find and lock address space (mapping) in write mode. * @@ -2160,7 +2046,6 @@ static struct folio *alloc_buddy_hugetlb */ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) alloc_try_hard = false; - gfp_mask |= __GFP_COMP|__GFP_NOWARN; if (alloc_try_hard) gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) @@ -2207,48 +2092,14 @@ retry: return folio; } -static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) -{ - struct folio *folio; - bool retry = false; - -retry: - if (hstate_is_gigantic(h)) - folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); - else - folio = alloc_buddy_hugetlb_folio(h, gfp_mask, - nid, nmask, node_alloc_noretry); - if (!folio) - return NULL; - - if (hstate_is_gigantic(h)) { - if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { - /* - * Rare failure to convert pages to compound page. - * Free pages and try again - ONCE! - */ - free_gigantic_folio(folio, huge_page_order(h)); - if (!retry) { - retry = true; - goto retry; - } - return NULL; - } - } - - return folio; -} - static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; - folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, - node_alloc_noretry); + folio = hstate_is_gigantic(h) ? alloc_gigantic_folio(h, gfp_mask, nid, nmask) : + alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); if (folio) init_new_hugetlb_folio(h, folio); return folio; @@ -2266,7 +2117,8 @@ static struct folio *alloc_fresh_hugetlb { struct folio *folio; - folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + folio = hstate_is_gigantic(h) ? alloc_gigantic_folio(h, gfp_mask, nid, nmask) : + alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; @@ -2550,9 +2402,8 @@ struct folio *alloc_buddy_hugetlb_folio_ nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { - gfp_t gfp = gfp_mask | __GFP_NOWARN; + gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ @@ -3334,6 +3185,7 @@ static void __init hugetlb_folio_init_ta for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); + __ClearPageReserved(folio_page(folio, pfn - head_pfn)); __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); ret = page_ref_freeze(page, 1); @@ -3950,21 +3802,16 @@ static long demote_free_hugetlb_folios(s continue; list_del(&folio->lru); - /* - * Use destroy_compound_hugetlb_folio_for_demote for all huge page - * sizes as it will not ref count folios. - */ - destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(src)); + + split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); + pgalloc_tag_split(&folio->page, 1 << huge_page_order(src)); for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { struct page *page = folio_page(folio, i); - if (hstate_is_gigantic(dst)) - prep_compound_gigantic_folio_for_demote(page_folio(page), - dst->order); - else - prep_compound_page(page, dst->order); - set_page_private(page, 0); + page->mapping = NULL; + clear_compound_head(page); + prep_compound_page(page, dst->order); init_new_hugetlb_folio(dst, page_folio(page)); list_add(&page->lru, &dst_list); _ Patches currently in -mm which might be from yuzhao@xxxxxxxxxx are mm-hugetlb_vmemmap-dont-synchronize_rcu-without-hvo.patch mm-swap-reduce-indentation-level.patch mm-swap-rename-cpu_fbatches-activate.patch mm-swap-fold-lru_rotate-into-cpu_fbatches.patch mm-swap-remove-remaining-_fn-suffix.patch mm-swap-remove-boilerplate.patch mm-swap-remove-boilerplate-fix.patch mm-free-zapped-tail-pages-when-splitting-isolated-thp.patch mm-remap-unused-subpages-to-shared-zeropage-when-splitting-isolated-thp.patch mm-hugetlb_vmemmap-batch-hvo-work-when-demoting.patch mm-contig_alloc-support-__gfp_comp.patch mm-cma-add-cma_alloc_folio.patch mm-hugetlb-use-__gfp_comp-for-gigantic-folios.patch