The patch titled Subject: hugetlb: batch PMD split for bulk vmemmap dedup has been added to the -mm mm-unstable branch. Its filename is hugetlb-batch-pmd-split-for-bulk-vmemmap-dedup.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/hugetlb-batch-pmd-split-for-bulk-vmemmap-dedup.patch This patch will later appear in the mm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Joao Martins <joao.m.martins@xxxxxxxxxx> Subject: hugetlb: batch PMD split for bulk vmemmap dedup Date: Thu, 5 Oct 2023 20:20:08 -0700 In an effort to minimize amount of TLB flushes, batch all PMD splits belonging to a range of pages in order to perform only 1 (global) TLB flush. Add a flags field to the walker and pass whether it's a bulk allocation or just a single page to decide to remap. First value (VMEMMAP_SPLIT_NO_TLB_FLUSH) designates the request to not do the TLB flush when we split the PMD. Rebased and updated by Mike Kravetz Link: https://lkml.kernel.org/r/20231006032012.296473-7-mike.kravetz@xxxxxxxxxx Signed-off-by: Joao Martins <joao.m.martins@xxxxxxxxxx> Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx> Reviewed-by: Muchun Song <songmuchun@xxxxxxxxxxxxx> Cc: Anshuman Khandual <anshuman.khandual@xxxxxxx> Cc: Barry Song <21cnbao@xxxxxxxxx> Cc: David Hildenbrand <david@xxxxxxxxxx> Cc: David Rientjes <rientjes@xxxxxxxxxx> Cc: James Houghton <jthoughton@xxxxxxxxxx> Cc: Konrad Dybcio <konradybcio@xxxxxxxxxx> Cc: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx> Cc: Miaohe Lin <linmiaohe@xxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: Naoya Horiguchi <naoya.horiguchi@xxxxxxxxx> Cc: Oscar Salvador <osalvador@xxxxxxx> Cc: Xiongchun Duan <duanxiongchun@xxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- mm/hugetlb_vmemmap.c | 92 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 4 deletions(-) --- a/mm/hugetlb_vmemmap.c~hugetlb-batch-pmd-split-for-bulk-vmemmap-dedup +++ a/mm/hugetlb_vmemmap.c @@ -27,6 +27,8 @@ * @reuse_addr: the virtual address of the @reuse_page page. * @vmemmap_pages: the list head of the vmemmap pages that can be freed * or is mapped from. + * @flags: used to modify behavior in vmemmap page table walking + * operations. */ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, @@ -35,9 +37,13 @@ struct vmemmap_remap_walk { struct page *reuse_page; unsigned long reuse_addr; struct list_head *vmemmap_pages; + +/* Skip the TLB flush when we split the PMD */ +#define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) + unsigned long flags; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush) { pmd_t __pmd; int i; @@ -80,7 +86,8 @@ static int split_vmemmap_huge_pmd(pmd_t /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); - flush_tlb_kernel_range(start, start + PMD_SIZE); + if (flush) + flush_tlb_kernel_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); } @@ -127,11 +134,20 @@ static int vmemmap_pmd_range(pud_t *pud, do { int ret; - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, + !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)); if (ret) return ret; next = pmd_addr_end(addr, end); + + /* + * We are only splitting, not remapping the hugetlb vmemmap + * pages. + */ + if (!walk->remap_pte) + continue; + vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -198,7 +214,8 @@ static int vmemmap_remap_range(unsigned return ret; } while (pgd++, addr = next, addr != end); - flush_tlb_kernel_range(start, end); + if (walk->remap_pte) + flush_tlb_kernel_range(start, end); return 0; } @@ -298,6 +315,36 @@ static void vmemmap_restore_pte(pte_t *p } /** + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end) + * backing PMDs of the directmap into PTEs + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * + * Return: %0 on success, negative error code otherwise. + */ +static int vmemmap_remap_split(unsigned long start, unsigned long end, + unsigned long reuse) +{ + int ret; + struct vmemmap_remap_walk walk = { + .remap_pte = NULL, + .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, + }; + + /* See the comment in the vmemmap_remap_free(). */ + BUG_ON(start - reuse != PAGE_SIZE); + + mmap_read_lock(&init_mm); + ret = vmemmap_remap_range(reuse, end, &walk); + mmap_read_unlock(&init_mm); + + return ret; +} + +/** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) * to the page which @reuse is mapped to, then free vmemmap * which the range are mapped to. @@ -320,6 +367,7 @@ static int vmemmap_remap_free(unsigned l .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, + .flags = 0, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; @@ -368,6 +416,7 @@ static int vmemmap_remap_free(unsigned l .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, + .flags = 0, }; vmemmap_remap_range(reuse, end, &walk); @@ -419,6 +468,7 @@ static int vmemmap_remap_alloc(unsigned .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, + .flags = 0, }; /* See the comment in the vmemmap_remap_free(). */ @@ -628,12 +678,46 @@ void hugetlb_vmemmap_optimize(const stru free_vmemmap_page_list(&vmemmap_pages); } +static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head) +{ + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; + + if (!vmemmap_should_optimize(h, head)) + return 0; + + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; + + /* + * Split PMDs on the vmemmap virtual address range [@vmemmap_start, + * @vmemmap_end] + */ + return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); +} + void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { struct folio *folio; LIST_HEAD(vmemmap_pages); list_for_each_entry(folio, folio_list, lru) { + int ret = hugetlb_vmemmap_split(h, &folio->page); + + /* + * Spliting the PMD requires allocating a page, thus lets fail + * early once we encounter the first OOM. No point in retrying + * as it can be dynamically done on remap with the memory + * we get back from the vmemmap deduplication. + */ + if (ret == -ENOMEM) + break; + } + + flush_tlb_all(); + + list_for_each_entry(folio, folio_list, lru) { int ret = __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages); _ Patches currently in -mm which might be from joao.m.martins@xxxxxxxxxx are hugetlb-batch-pmd-split-for-bulk-vmemmap-dedup.patch hugetlb-batch-tlb-flushes-when-freeing-vmemmap.patch