Re: [PATCH v1] mm/hugetlb_vmemmap: remap head page to newly allocated page

On Tue, Aug 02, 2022 at 07:03:09PM +0100, Joao Martins wrote:
> Today with `hugetlb_free_vmemmap=on` the struct page memory that is
> freed back to the page allocator is as follows: for a 2M hugetlb page it
> will reuse the first 4K vmemmap page to remap the remaining 7 vmemmap
> pages, and for a 1G hugetlb page it will remap the remaining 4095 vmemmap
> pages. Essentially, that means it breaks the first 4K of a potentially
> contiguous chunk of memory of 32K (for 2M hugetlb pages) or 16M (for 1G
> hugetlb pages). For this reason the memory that is freed back to the
> page allocator cannot be used by hugetlb to allocate huge pages of the
> same size, but only of a smaller huge page size:
>

Hi Joao,

Thanks for your work on this. You are right: the current mechanism
prevents the freed vmemmap pages from being merged into a potentially
contiguous chunk. Allocating a new head page is a straightforward approach;
however, it is very dangerous at runtime, after the system has booted. Why
dangerous? Because you must first 1) copy the content from the old head
vmemmap page to the targeted (newly allocated) page, and then 2) change the
PTE entry to point to the new page. However, the content (especially the
refcount) of the old head vmemmap page could be changed elsewhere (e.g. by
other modules) between steps 1) and 2). In that case, the newly allocated
vmemmap page ends up corrupted. Unfortunately, we don't have an easy way to
prevent it.

I have also thought about solving this issue, but I didn't find any easy
way to do it after the system has booted. However, it is possible at boot
time: in a very early initialization stage, no one should be accessing
struct pages yet. I implemented this a month ago but didn't send it out,
since some additional preparation work is required. The main preparation
is moving the HugeTLB allocation to a very early initialization stage (I
want to put it into pure_initcall). Because the HugeTLB allocation is
parsed from the cmdline, whose logic is quite complex, I have sent a patch
to clean it up [1]. After that cleanup, it will be easy to move the
allocation to an early stage. Sorry, I have been busy lately and haven't
had time to send a new version, but I will send it ASAP.

[1] https://lore.kernel.org/all/20220616071827.3480-1-songmuchun@xxxxxxxxxxxxx/

The following diff shows the main work needed to remap the head vmemmap
page to a newly allocated page.

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 20f414c0379f..71f2d7335e6f 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -15,6 +15,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "hugetlb_vmemmap.h"
+#include "internal.h"

 /**
  * struct vmemmap_remap_walk - walk vmemmap page table
@@ -227,14 +228,37 @@ static inline void free_vmemmap_page(struct page *page)
 }

 /* Free a list of the vmemmap pages */
-static void free_vmemmap_page_list(struct list_head *list)
+static void free_vmemmap_pages(struct list_head *list)
 {
        struct page *page, *next;

+       list_for_each_entry_safe(page, next, list, lru)
+               free_vmemmap_page(page);
+}
+
+/*
+ * Free a list of vmemmap pages but skip per-cpu free list of buddy
+ * allocator.
+ */
+static void free_vmemmap_pages_nonpcp(struct list_head *list)
+{
+       struct zone *zone;
+       struct page *page, *next;
+
+       if (list_empty(list))
+               return;
+
+       zone = page_zone(list_first_entry(list, struct page, lru));
+       zone_pcp_disable(zone);
        list_for_each_entry_safe(page, next, list, lru) {
-               list_del(&page->lru);
+               if (zone != page_zone(page)) {
+                       zone_pcp_enable(zone);
+                       zone = page_zone(page);
+                       zone_pcp_disable(zone);
+               }
                free_vmemmap_page(page);
        }
+       zone_pcp_enable(zone);
 }

 static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
@@ -244,12 +268,28 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
         * Remap the tail pages as read-only to catch illegal write operation
         * to the tail pages.
         */
-       pgprot_t pgprot = PAGE_KERNEL_RO;
+       pgprot_t pgprot = addr == walk->reuse_addr ? PAGE_KERNEL : PAGE_KERNEL_RO;
        pte_t entry = mk_pte(walk->reuse_page, pgprot);
        struct page *page = pte_page(*pte);

        list_add_tail(&page->lru, walk->vmemmap_pages);
        set_pte_at(&init_mm, addr, pte, entry);
+
+       if (unlikely(addr == walk->reuse_addr)) {
+               void *old = page_to_virt(page);
+
+               /* Remove it from the vmemmap_pages list to avoid being freed. */
+               list_del(&walk->reuse_page->lru);
+               flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+               /*
+                * If we reach here, the system is in a very early
+                * initialization stage and no one should be accessing
+                * struct pages. If something unexpected happens, the head
+                * struct page is the most likely to be written (usually
+                * ->_refcount). Use BUG_ON() to catch this unexpected case.
+                */
+               BUG_ON(memcmp(old, (void *)addr, sizeof(struct page)));
+       }
 }

 /*
@@ -298,7 +338,10 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
  *             to remap.
  * @end:       end address of the vmemmap virtual address range that we want to
  *             remap.
- * @reuse:     reuse address.
+ * @reuse:     reuse address. If @reuse is equal to @start, the page frame
+ *             which the @reuse address is mapped to will be replaced with
+ *             a new page frame, and the previous page frame will be freed;
+ *             this reduces memory fragmentation.
  *
  * Return: %0 on success, negative error code otherwise.
  */
@@ -319,14 +362,26 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
         * (see more details from the vmemmap_pte_range()):
         *
         * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
-        *   should be continuous.
+        *   should be continuous or @start is equal to @reuse.
         * - The @reuse address is part of the range [@reuse, @end) that we are
         *   walking which is passed to vmemmap_remap_range().
         * - The @reuse address is the first in the complete range.
         *
         * So we need to make sure that @start and @reuse meet the above rules.
         */
-       BUG_ON(start - reuse != PAGE_SIZE);
+       BUG_ON(start - reuse != PAGE_SIZE && start != reuse);
+
+       if (unlikely(reuse == start)) {
+               int nid = page_to_nid((struct page *)start);
+               gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
+                                __GFP_NOWARN;
+
+               walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
+               if (walk.reuse_page) {
+                       copy_page(page_to_virt(walk.reuse_page), (void *)reuse);
+                       list_add(&walk.reuse_page->lru, &vmemmap_pages);
+               }
+       }

        mmap_read_lock(&init_mm);
        ret = vmemmap_remap_range(reuse, end, &walk);
@@ -348,7 +403,10 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
        }
        mmap_read_unlock(&init_mm);

-       free_vmemmap_page_list(&vmemmap_pages);
+       if (unlikely(reuse == start))
+               free_vmemmap_pages_nonpcp(&vmemmap_pages);
+       else
+               free_vmemmap_pages(&vmemmap_pages);

        return ret;
 }
@@ -512,6 +570,22 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h
        return true;
 }

+/*
+ * Control whether the page frame backing the head vmemmap page of a HugeTLB
+ * page should be replaced with a newly allocated page. The vmemmap pages
+ * are usually mapped with a huge PMD mapping; the head vmemmap page frames
+ * are best freed to the buddy allocator once, at an early stage of system
+ * boot, to reduce memory fragmentation.
+ */
+static bool vmemmap_remap_head __ro_after_init = true;
+
+static int __init vmemmap_remap_head_init(void)
+{
+       vmemmap_remap_head = false;
+       return 0;
+}
+core_initcall(vmemmap_remap_head_init);
+
 /**
  * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
  * @h:         struct hstate.
@@ -537,6 +611,19 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
        vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
+        * The vmemmap pages are usually mapped with a huge PMD mapping. If
+        * the head vmemmap page is not freed to the buddy allocator, the
+        * freed tail vmemmap pages cannot be merged into a big order chunk.
+        * If the head vmemmap page frame is replaced with a newly allocated
+        * page and freed to the buddy allocator, the freed vmemmap pages
+        * have the opportunity to be merged into larger contiguous pages,
+        * reducing memory fragmentation. vmemmap_remap_free() will do this
+        * when @vmemmap_start is equal to @vmemmap_reuse.
+        */
+       if (unlikely(vmemmap_remap_head))
+               vmemmap_start = vmemmap_reuse;
+
+       /*
         * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
         * to the page which @vmemmap_reuse is mapped to, then free the pages
         * which the range [@vmemmap_start, @vmemmap_end] is mapped to.

Thanks.

> Try to assign a 64G node to hugetlb (on a 128G 2-node guest, each node
> having 64G):
> 
> * Before allocation:
> Free pages count per migrate type at order       0      1      2      3
> 4      5      6      7      8      9     10
> ...
> Node    0, zone   Normal, type      Movable      0      0      1      0
> 1      0      0      0      0      0  15561
> 
> $ echo 32768 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
> $ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
>  31974
> 
> * After:
> 
> Node    0, zone   Normal, type      Movable  32174  31999  31642    104
> 58     24     16      4      2      0      0
> 
> Notice how the memory freed back is put back into the 4K / 8K / 16K page pools.
> And it allocates a total of 31974 pages (63948M).
> 
> To fix this behaviour, rather than leaving one page behind (thus breaking
> the contiguous block of memory backing the struct pages), repopulate the
> head vmemmap page with a newly allocated page: copy the data from the
> currently mapped vmemmap page into it, then remap the vmemmap to this new
> page. Additionally, change the remap_pte callback to take a rw bool
> parameter, given that the head page needs r/w permission whereas the
> tail-page vmemmap remap is r/o. vmemmap_pte_range() will set it
> accordingly when calling remap_pte() as it first tries to set
> @reuse_page. The new head page is allocated by the caller of
> vmemmap_remap_free(), so that the restore path still uses the same code
> path as before. Note that, because one hugepage is remapped at a time,
> only one free 4K page at a time is needed to remap the head page. Should
> it fail to allocate the new page, it reuses the one that's already
> mapped, just like before. As a result, for every 64G of contiguous
> hugepages it can give back 1G more of contiguous memory, while needing in
> total 128M of new 4K pages (for 2M hugetlb) or 256K (for 1G hugetlb).
> 
> After the changes, try to assign a 64G node to hugetlb (on a 128G 2node guest,
> each node with 64G):
> 
> * Before allocation
> Free pages count per migrate type at order       0      1      2      3
> 4      5      6      7      8      9     10
> ...
> Node    0, zone   Normal, type      Movable      0      1      1      0
> 0      0      0      1      1      1  15564
> 
> $ echo 32768  > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
> $ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
> 32390
> 
> * After:
> 
> Node    0, zone   Normal, type      Movable      0      1      0     70
> 106     91     78     48     17      0      0
> 
> In the example above, 416 more hugetlb 2M pages are allocated, i.e. 832M
> out of the 32390 (64780M) allocated. So the memory freed back is indeed
> being reused by hugetlb, and no order-0..order-2 pages accumulate
> unused.
> 
> Signed-off-by: Joao Martins <joao.m.martins@xxxxxxxxxx>
> ---
>  mm/hugetlb_vmemmap.c | 47 ++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 39 insertions(+), 8 deletions(-)
> 
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index 20f414c0379f..2b97df8115fe 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -28,9 +28,11 @@
>   */
>  struct vmemmap_remap_walk {
>  	void			(*remap_pte)(pte_t *pte, unsigned long addr,
> -					     struct vmemmap_remap_walk *walk);
> +					     struct vmemmap_remap_walk *walk,
> +					     bool rw);
>  	unsigned long		nr_walked;
>  	struct page		*reuse_page;
> +	struct page		*head_page;
>  	unsigned long		reuse_addr;
>  	struct list_head	*vmemmap_pages;
>  };
> @@ -104,10 +106,25 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
>  	 * remapping (which is calling @walk->remap_pte).
>  	 */
>  	if (!walk->reuse_page) {
> -		walk->reuse_page = pte_page(*pte);
> +		struct page *ptpage = pte_page(*pte);
> +		struct page *page = walk->head_page ? walk->head_page : ptpage;
> +
> +		walk->reuse_page = page;
> +
> +		/*
> +		 * Copy the data from the original head, and remap to
> +		 * the newly allocated page.
> +		 */
> +		if (page != ptpage) {
> +			memcpy(page_address(page), page_address(ptpage),
> +			       PAGE_SIZE);
> +			walk->remap_pte(pte, addr, walk, true);
> +		}
> +
>  		/*
> -		 * Because the reuse address is part of the range that we are
> -		 * walking, skip the reuse address range.
> 		 * Because the reuse address is part of the range that
> 		 * we are walking, skip the reuse address range. Or we
> 		 * already remapped the head page to a new page.
>  		 */
>  		addr += PAGE_SIZE;
>  		pte++;
> @@ -115,7 +132,7 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
>  	}
>  
>  	for (; addr != end; addr += PAGE_SIZE, pte++) {
> -		walk->remap_pte(pte, addr, walk);
> +		walk->remap_pte(pte, addr, walk, false);
>  		walk->nr_walked++;
>  	}
>  }
> @@ -238,13 +255,13 @@ static void free_vmemmap_page_list(struct list_head *list)
>  }
>  
>  static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
> -			      struct vmemmap_remap_walk *walk)
> +			      struct vmemmap_remap_walk *walk, bool rw)
>  {
>  	/*
>  	 * Remap the tail pages as read-only to catch illegal write operation
>  	 * to the tail pages.
>  	 */
> -	pgprot_t pgprot = PAGE_KERNEL_RO;
> +	pgprot_t pgprot = rw ? PAGE_KERNEL : PAGE_KERNEL_RO;
>  	pte_t entry = mk_pte(walk->reuse_page, pgprot);
>  	struct page *page = pte_page(*pte);
>  
> @@ -273,7 +290,7 @@ static inline void reset_struct_pages(struct page *start)
>  }
>  
>  static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> -				struct vmemmap_remap_walk *walk)
> +				struct vmemmap_remap_walk *walk, bool rw)
>  {
>  	pgprot_t pgprot = PAGE_KERNEL;
>  	struct page *page;
> @@ -312,6 +329,20 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
>  		.reuse_addr	= reuse,
>  		.vmemmap_pages	= &vmemmap_pages,
>  	};
> +	gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
> +	int nid = page_to_nid((struct page *)start);
> +	struct page *page;
> +
> +	/*
> +	 * Allocate a new head vmemmap page to avoid breaking a contiguous
> +	 * block of struct page memory when freeing it back to page allocator
> +	 * in free_vmemmap_page_list(). This will allow the likely contiguous
> +	 * struct page backing memory to be kept contiguous and allowing for
> 	 * more allocations of hugepages. Fall back to the currently
> 	 * mapped head page should the allocation fail.
> +	 */
> +	page = alloc_pages_node(nid, gfp_mask, 0);
> +	walk.head_page = page;
>  
>  	/*
>  	 * In order to make remapping routine most efficient for the huge pages,
> -- 
> 2.17.2
> 
> 



