[v3 4/4] mm: hugetlb: Skip initialization of gigantic tail struct pages if freed by HVO

Usama Arif <usama.arif@xxxxxxxxxxxxx> · Fri, 25 Aug 2023 12:18:36 +0100

The new boot flow when it comes to initialization of gigantic pages
is as follows:
- At boot time, for a gigantic page during __alloc_bootmem_hugepage,
the region after the first struct page is marked as noinit.
- This results in only the first struct page to be
initialized in reserve_bootmem_region. As the tail struct pages are
not initialized at this point, there can be a significant saving
in boot time if HVO succeeds later on.
- Later on in the boot, HVO is attempted. If its successful, only the first
HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page) - 1 tail struct pages
after the head struct page are initialized. If it is not successful,
then all of the tail struct pages are initialized.

Signed-off-by: Usama Arif <usama.arif@xxxxxxxxxxxxx>
---
 mm/hugetlb.c         | 52 +++++++++++++++++++++++++++++++++++---------
 mm/hugetlb_vmemmap.h |  8 +++----
 mm/internal.h        |  3 +++
 mm/mm_init.c         |  2 +-
 4 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6da626bfb52e..964f7a2b693e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1953,7 +1953,6 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
 
 static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
 {
-	hugetlb_vmemmap_optimize(h, &folio->page);
 	INIT_LIST_HEAD(&folio->lru);
 	folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
 	hugetlb_set_folio_subpool(folio, NULL);
@@ -2225,6 +2224,7 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
 			return NULL;
 		}
 	}
+	hugetlb_vmemmap_optimize(h, &folio->page);
 	prep_new_hugetlb_folio(h, folio, folio_nid(folio));
 
 	return folio;
@@ -2943,6 +2943,7 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
 	new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL);
 	if (!new_folio)
 		return -ENOMEM;
+	hugetlb_vmemmap_optimize(h, &new_folio->page);
 	__prep_new_hugetlb_folio(h, new_folio);
 
 retry:
@@ -3206,6 +3207,15 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
 	}
 
 found:
+
+	/*
+	 * Only initialize the head struct page in memmap_init_reserved_pages,
+	 * rest of the struct pages will be initialized by the HugeTLB subsystem itself.
+	 * The head struct page is used to get folio information by the HugeTLB
+	 * subsystem like zone id and node id.
+	 */
+	memblock_reserved_mark_noinit_vmemmap(virt_to_phys((void *)m + PAGE_SIZE),
+		huge_page_size(h) - PAGE_SIZE);
 	/* Put them into a private list first because mem_map is not up yet */
 	INIT_LIST_HEAD(&m->list);
 	list_add(&m->list, &huge_boot_pages);
@@ -3213,6 +3223,27 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
 	return 1;
 }
 
+static void __init hugetlb_folio_init_vmemmap(struct hstate *h, struct folio *folio,
+					       unsigned long nr_pages)
+{
+	enum zone_type zone = zone_idx(folio_zone(folio));
+	int nid = folio_nid(folio);
+	unsigned long head_pfn = folio_pfn(folio);
+	unsigned long pfn, end_pfn = head_pfn + nr_pages;
+
+	__folio_clear_reserved(folio);
+	__folio_set_head(folio);
+
+	for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
+		struct page *page = pfn_to_page(pfn);
+
+		__init_single_page(page, pfn, zone, nid);
+		prep_compound_tail((struct page *)folio, pfn - head_pfn);
+		set_page_count(page, 0);
+	}
+	prep_compound_head((struct page *)folio, huge_page_order(h));
+}
+
 /*
  * Put bootmem huge pages into the standard lists after mem_map is up.
  * Note: This only applies to gigantic (order > MAX_ORDER) pages.
@@ -3223,19 +3254,19 @@ static void __init gather_bootmem_prealloc(void)
 
 	list_for_each_entry(m, &huge_boot_pages, list) {
 		struct page *page = virt_to_page(m);
-		struct folio *folio = page_folio(page);
+		struct folio *folio = (void *)page;
 		struct hstate *h = m->hstate;
+		unsigned long nr_pages = pages_per_huge_page(h);
 
 		VM_BUG_ON(!hstate_is_gigantic(h));
 		WARN_ON(folio_ref_count(folio) != 1);
-		if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
-			WARN_ON(folio_test_reserved(folio));
-			prep_new_hugetlb_folio(h, folio, folio_nid(folio));
-			free_huge_page(page); /* add to the hugepage allocator */
-		} else {
-			/* VERY unlikely inflated ref count on a tail page */
-			free_gigantic_folio(folio, huge_page_order(h));
-		}
+
+		hugetlb_vmemmap_optimize(h, &folio->page);
+		if (HPageVmemmapOptimized(&folio->page))
+			nr_pages = HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page);
+		hugetlb_folio_init_vmemmap(h, folio, nr_pages);
+		prep_new_hugetlb_folio(h, folio, folio_nid(folio));
+		free_huge_page(page); /* add to the hugepage allocator */
 
 		/*
 		 * We need to restore the 'stolen' pages to totalram_pages
@@ -3656,6 +3687,7 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
 		else
 			prep_compound_page(subpage, target_hstate->order);
 		folio_change_private(inner_folio, NULL);
+		hugetlb_vmemmap_optimize(h, &folio->page);
 		prep_new_hugetlb_folio(target_hstate, inner_folio, nid);
 		free_huge_page(subpage);
 	}
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 25bd0e002431..d30aff8f3573 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -10,16 +10,16 @@
 #define _LINUX_HUGETLB_VMEMMAP_H
 #include <linux/hugetlb.h>
 
-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
-void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
-
 /*
  * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
  * Documentation/vm/vmemmap_dedup.rst.
  */
 #define HUGETLB_VMEMMAP_RESERVE_SIZE	PAGE_SIZE
 
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
+void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
+
 static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
 {
 	return pages_per_huge_page(h) * sizeof(struct page);
diff --git a/mm/internal.h b/mm/internal.h
index a7d9e980429a..31b3d45f4609 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1102,4 +1102,7 @@ struct vma_prepare {
 	struct vm_area_struct *remove;
 	struct vm_area_struct *remove2;
 };
+
+void __meminit __init_single_page(struct page *page, unsigned long pfn,
+				unsigned long zone, int nid);
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/mm_init.c b/mm/mm_init.c
index a1963c3322af..3d4ab595ca7d 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -551,7 +551,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 	node_states[N_MEMORY] = saved_node_state;
 }
 
-static void __meminit __init_single_page(struct page *page, unsigned long pfn,
+void __meminit __init_single_page(struct page *page, unsigned long pfn,
 				unsigned long zone, int nid)
 {
 	mm_zero_struct_page(page);
-- 
2.25.1