The patch reduces the process-visible downtime during hugepage collapse.
This is achieved by pre-zeroing the hugepage (with __GFP_ZERO) before
acquiring mmap_lock in write mode when nr_ptes_none >= 256 (the new
khugepaged_min_ptes_none_prezero threshold, HPAGE_PMD_NR / 2 by
default), without affecting the efficiency of khugepaged.

On an Intel Core i5 CPU, the process-visible downtime during hugepage
collapse is as follows (a rough userspace sketch for reproducing this
measurement follows the diffstat below):

| nr_ptes_none | w/o __GFP_ZERO | w/ __GFP_ZERO | Change  |
|--------------|----------------|---------------|---------|
|     511      |     233us      |      95us     | -59.21% |
|     384      |     376us      |     219us     | -41.20% |
|     256      |     421us      |     323us     | -23.28% |
|     128      |     523us      |     507us     |  -3.06% |

Of course, alloc_charge_hpage() takes longer to run with the
__GFP_ZERO flag:

| Func               | w/o __GFP_ZERO | w/ __GFP_ZERO |
|--------------------|----------------|---------------|
| alloc_charge_hpage |     198us      |     295us     |

That is not a problem, though. The zeroing now happens at allocation
time, before mmap_lock is taken in write mode, instead of page by page
via clear_user_highpage()/clear_highpage() while the collapse is in
progress, so the total time khugepaged spends collapsing a hugepage is
not increased; if anything, it decreases.

Signed-off-by: Lance Yang <ioworker0@xxxxxxxxx>
---
 mm/khugepaged.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)
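
For reference only, and not part of the patch: below is a rough
userspace sketch of how the "process visible downtime" above can be
sampled. It faults in a subset of the 512 PTEs to control nr_ptes_none
and then times accesses to the region while khugepaged collapses it.
The region size, the iteration count, and the assumption that
khugepaged_max_ptes_none permits the chosen nr_ptes_none are test-setup
choices, not kernel interfaces.

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <sys/mman.h>

#define HPAGE_SIZE	(2UL << 20)
#define PAGE_SZ		4096UL
#define NR_PTES		(HPAGE_SIZE / PAGE_SZ)	/* 512 */

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	unsigned long nr_ptes_none = 256;	/* try 128/256/384/511 */
	uint64_t worst = 0;
	unsigned long i;
	void *raw;
	char *buf;

	/* Over-allocate so we can pick a 2M-aligned region. */
	raw = mmap(NULL, 2 * HPAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED)
		return 1;
	buf = (char *)(((uintptr_t)raw + HPAGE_SIZE - 1) &
		       ~(HPAGE_SIZE - 1));
	madvise(buf, HPAGE_SIZE, MADV_HUGEPAGE);

	/* Populate only the leading PTEs; the rest stay pte_none. */
	for (i = 0; i < NR_PTES - nr_ptes_none; i++)
		buf[i * PAGE_SZ] = 1;

	/*
	 * Hammer the first page and record the worst access latency.
	 * The spike is the stall seen when khugepaged collapses the
	 * range and the access faults on the cleared PMD.
	 */
	for (i = 0; i < 100000000UL; i++) {
		uint64_t t0 = now_ns(), dt;

		*(volatile char *)buf;
		dt = now_ns() - t0;
		if (dt > worst)
			worst = dt;
	}
	printf("worst access latency: %luus\n",
	       (unsigned long)(worst / 1000));
	return 0;
}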
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 38830174608f..a2872596b865 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -88,6 +88,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
 static unsigned int khugepaged_max_ptes_none __read_mostly;
 static unsigned int khugepaged_max_ptes_swap __read_mostly;
 static unsigned int khugepaged_max_ptes_shared __read_mostly;
+static unsigned int khugepaged_min_ptes_none_prezero __read_mostly;
 
 #define MM_SLOTS_HASH_BITS 10
 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -96,6 +97,7 @@ static struct kmem_cache *mm_slot_cache __ro_after_init;
 
 struct collapse_control {
 	bool is_khugepaged;
+	bool alloc_zeroed_hpage;
 
 	/* Num pages scanned per node */
 	u32 node_load[MAX_NUMNODES];
@@ -396,6 +398,7 @@ int __init khugepaged_init(void)
 	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
 	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
 	khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
+	khugepaged_min_ptes_none_prezero = HPAGE_PMD_NR / 2;
 
 	return 0;
 }
@@ -782,6 +785,7 @@ static int __collapse_huge_page_copy(pte_t *pte,
 				     struct vm_area_struct *vma,
 				     unsigned long address,
 				     spinlock_t *ptl,
+				     struct collapse_control *cc,
 				     struct list_head *compound_pagelist)
 {
 	struct page *src_page;
@@ -797,7 +801,8 @@ static int __collapse_huge_page_copy(pte_t *pte,
 	     _pte++, page++, _address += PAGE_SIZE) {
 		pteval = ptep_get(_pte);
 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
-			clear_user_highpage(page, _address);
+			if (!cc->alloc_zeroed_hpage)
+				clear_user_highpage(page, _address);
 			continue;
 		}
 		src_page = pte_page(pteval);
@@ -1067,6 +1072,9 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
 	int node = hpage_collapse_find_target_node(cc);
 	struct folio *folio;
 
+	if (cc->alloc_zeroed_hpage)
+		gfp |= __GFP_ZERO;
+
 	if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) {
 		*hpage = NULL;
 		return SCAN_ALLOC_HUGE_PAGE_FAIL;
@@ -1209,7 +1217,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	anon_vma_unlock_write(vma->anon_vma);
 
 	result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
-					   vma, address, pte_ptl,
+					   vma, address, pte_ptl, cc,
 					   &compound_pagelist);
 	pte_unmap(pte);
 	if (unlikely(result != SCAN_SUCCEED))
@@ -1272,6 +1280,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 
 	memset(cc->node_load, 0, sizeof(cc->node_load));
 	nodes_clear(cc->alloc_nmask);
+	cc->alloc_zeroed_hpage = false;
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (!pte) {
 		result = SCAN_PMD_NULL;
@@ -1408,6 +1417,10 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (result == SCAN_SUCCEED) {
+		if (cc->is_khugepaged &&
+		    none_or_zero >= khugepaged_min_ptes_none_prezero)
+			cc->alloc_zeroed_hpage = true;
+
 		result = collapse_huge_page(mm, address, referenced,
 					    unmapped, cc);
 		/* collapse_huge_page will return with the mmap_lock released */
@@ -2054,7 +2067,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 	index = start;
 	list_for_each_entry(page, &pagelist, lru) {
 		while (index < page->index) {
-			clear_highpage(hpage + (index % HPAGE_PMD_NR));
+			if (!cc->alloc_zeroed_hpage)
+				clear_highpage(hpage + (index % HPAGE_PMD_NR));
 			index++;
 		}
 		if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) {
@@ -2064,7 +2078,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 		index++;
 	}
 	while (index < end) {
-		clear_highpage(hpage + (index % HPAGE_PMD_NR));
+		if (!cc->alloc_zeroed_hpage)
+			clear_highpage(hpage + (index % HPAGE_PMD_NR));
 		index++;
 	}
 
@@ -2234,6 +2249,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 	swap = 0;
 	memset(cc->node_load, 0, sizeof(cc->node_load));
 	nodes_clear(cc->alloc_nmask);
+	cc->alloc_zeroed_hpage = false;
 	rcu_read_lock();
 	xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
 		if (xas_retry(&xas, page))
@@ -2305,11 +2321,16 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 	rcu_read_unlock();
 
 	if (result == SCAN_SUCCEED) {
-		if (cc->is_khugepaged &&
-		    present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+		if (!cc->is_khugepaged)
+			result = collapse_file(mm, addr, file, start, cc);
+		else if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
 			result = SCAN_EXCEED_NONE_PTE;
 			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 		} else {
+			if (HPAGE_PMD_NR - present >=
+			    khugepaged_min_ptes_none_prezero)
+				cc->alloc_zeroed_hpage = true;
+
 			result = collapse_file(mm, addr, file, start, cc);
 		}
 	}
-- 
2.33.1