Today with `hugetlb_free_vmemmap=on` the struct page memory that is freed
back to the page allocator is as follows: for a 2M hugetlb page it will
reuse the first 4K vmemmap page to remap the remaining 7 vmemmap pages, and
for a 1G hugetlb page it will remap the remaining 4095 vmemmap pages.
Essentially, that means it breaks the first 4K of a potentially contiguous
chunk of memory of 32K (for 2M hugetlb pages) or 16M (for 1G hugetlb
pages). For this reason, the memory that is freed back to the page
allocator cannot be used by hugetlb to allocate huge pages of the same
size, but only huge pages of a smaller size:

Try to assign a 64G node to hugetlb (on a 128G 2node guest, each node
having 64G):

* Before allocation:

Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10
...
Node    0, zone   Normal, type      Movable      0      0      1      0      1      0      0      0      0      0  15561

$ echo 32768 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
$ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
31974

* After:

Node    0, zone   Normal, type      Movable  32174  31999  31642    104     58     24     16      4      2      0      0

Notice how the memory freed back is put into the 4K / 8K / 16K page pools,
and that a total of 31974 hugepages (63948M) is allocated.

To fix this behaviour, rather than leaving one page in place (thus breaking
the contiguous block of memory backing the struct pages), repopulate the
head vmemmap page with a newly allocated page, copy the data from the
currently mapped vmemmap page into it, and then remap the head to this new
page.

Additionally, change the remap_pte callback to take a rw bool parameter,
given that the head page needs r/w permissions whereas the tail page
vmemmap remap is r/o. vmemmap_pte_range() sets it accordingly when calling
remap_pte() as it first tries to set @reuse_page.

The new head page is allocated by the caller of vmemmap_remap_free(), given
that on restore it should still use the same code path as before. Note
that, because one hugepage is remapped at a time, only one free 4K page at
a time is needed to remap the head page. Should it fail to allocate said
new page, it reuses the one that is already mapped, just like before. As a
result, for every 64G of contiguous hugepages it can give back 1G more of
contiguous memory, while needing in total 128M of new 4K pages (for 2M
hugetlb) or 256K (for 1G hugetlb).

After the changes, try to assign a 64G node to hugetlb (on a 128G 2node
guest, each node with 64G):

* Before allocation:

Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10
...
Node    0, zone   Normal, type      Movable      0      1      1      0      0      0      0      1      1      1  15564

$ echo 32768 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
$ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
32390

* After:

Node    0, zone   Normal, type      Movable      0      1      0     70    106     91     78     48     17      0      0

In the example above, 416 more 2M hugetlb pages are allocated, i.e. 832M
out of the 32390 (64780M) allocated. So the memory freed back is indeed
being used by hugetlb, and there are no order-0..order-2 pages accumulating
unused.
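For reference, the arithmetic behind the numbers above can be reproduced
with a small userspace sketch (illustration only, not part of the patch;
it assumes 4K base pages and sizeof(struct page) == 64):

  #include <stdio.h>

  int main(void)
  {
          unsigned long long page_sz = 4096, struct_page_sz = 64;

          /* 2M hugetlb: 512 struct pages -> 32K of vmemmap -> 8 base pages */
          unsigned long long vmemmap_2m = ((2ULL << 20) / page_sz) * struct_page_sz / page_sz;
          /* 1G hugetlb: 262144 struct pages -> 16M of vmemmap -> 4096 base pages */
          unsigned long long vmemmap_1g = ((1ULL << 30) / page_sz) * struct_page_sz / page_sz;
          /* new 4K head pages needed to cover 64G worth of hugetlb pages */
          unsigned long long heads_2m = (64ULL << 30) / (2ULL << 20);   /* 32768 -> 128M */
          unsigned long long heads_1g = (64ULL << 30) / (1ULL << 30);   /*    64 -> 256K */

          printf("vmemmap pages per hugepage: 2M=%llu 1G=%llu\n", vmemmap_2m, vmemmap_1g);
          printf("new 4K head pages per 64G:  2M=%llu (%lluM) 1G=%llu (%lluK)\n",
                 heads_2m, heads_2m * page_sz >> 20, heads_1g, heads_1g * page_sz >> 10);
          return 0;
  }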
Signed-off-by: Joao Martins <joao.m.martins@xxxxxxxxxx>
---
 mm/hugetlb_vmemmap.c | 47 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 20f414c0379f..2b97df8115fe 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -28,9 +28,11 @@
  */
 struct vmemmap_remap_walk {
         void (*remap_pte)(pte_t *pte, unsigned long addr,
-                          struct vmemmap_remap_walk *walk);
+                          struct vmemmap_remap_walk *walk,
+                          bool rw);
         unsigned long nr_walked;
         struct page *reuse_page;
+        struct page *head_page;
         unsigned long reuse_addr;
         struct list_head *vmemmap_pages;
 };
@@ -104,10 +106,25 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
          * remapping (which is calling @walk->remap_pte).
          */
         if (!walk->reuse_page) {
-                walk->reuse_page = pte_page(*pte);
+                struct page *ptpage = pte_page(*pte);
+                struct page *page = walk->head_page ? walk->head_page : ptpage;
+
+                walk->reuse_page = page;
+
+                /*
+                 * Copy the data from the original head, and remap to
+                 * the newly allocated page.
+                 */
+                if (page != ptpage) {
+                        memcpy(page_address(page), page_address(ptpage),
+                               PAGE_SIZE);
+                        walk->remap_pte(pte, addr, walk, true);
+                }
+
                 /*
-                 * Because the reuse address is part of the range that we are
-                 * walking, skip the reuse address range.
+                 * Because the reuse address is part of the range that
+                 * we are walking, skip the reuse address range. Or we
+                 * already remapped the head page to a new page.
                  */
                 addr += PAGE_SIZE;
                 pte++;
@@ -115,7 +132,7 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
         }
 
         for (; addr != end; addr += PAGE_SIZE, pte++) {
-                walk->remap_pte(pte, addr, walk);
+                walk->remap_pte(pte, addr, walk, false);
                 walk->nr_walked++;
         }
 }
@@ -238,13 +255,13 @@ static void free_vmemmap_page_list(struct list_head *list)
 }
 
 static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
-                              struct vmemmap_remap_walk *walk)
+                              struct vmemmap_remap_walk *walk, bool rw)
 {
         /*
          * Remap the tail pages as read-only to catch illegal write operation
          * to the tail pages.
          */
-        pgprot_t pgprot = PAGE_KERNEL_RO;
+        pgprot_t pgprot = rw ? PAGE_KERNEL : PAGE_KERNEL_RO;
         pte_t entry = mk_pte(walk->reuse_page, pgprot);
         struct page *page = pte_page(*pte);
 
@@ -273,7 +290,7 @@ static inline void reset_struct_pages(struct page *start)
 }
 
 static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
-                                struct vmemmap_remap_walk *walk)
+                                struct vmemmap_remap_walk *walk, bool rw)
 {
         pgprot_t pgprot = PAGE_KERNEL;
         struct page *page;
@@ -312,6 +329,20 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
                 .reuse_addr     = reuse,
                 .vmemmap_pages  = &vmemmap_pages,
         };
+        gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+        int nid = page_to_nid((struct page *)start);
+        struct page *page;
+
+        /*
+         * Allocate a new head vmemmap page to avoid breaking a contiguous
+         * block of struct page memory when freeing it back to the page
+         * allocator in free_vmemmap_page_list(). This keeps the likely
+         * contiguous struct page backing memory contiguous, allowing for
+         * more allocations of hugepages. Fall back to the currently
+         * mapped head page should the allocation fail.
+         */
+        page = alloc_pages_node(nid, gfp_mask, 0);
+        walk.head_page = page;
 
         /*
          * In order to make remapping routine most efficient for the huge pages,
-- 
2.17.2