Modularize huge page collapse by introducing struct collapse_control. This structure serves to describe the properties of the requested collapse, as well as serve as a local scratch pad to use during the collapse itself. Later in the series when we introduce the madvise collapse context, we will want to be able to ignore khugepaged_max_ptes_[none|swap|shared] in said context, and so is included here as a property of the requested collapse. Signed-off-by: Zach O'Keefe <zokeefe@xxxxxxxxxx> --- mm/khugepaged.c | 120 ++++++++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 44 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a4e5eaf3eb01..36fc0099c445 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -85,6 +85,24 @@ static struct kmem_cache *mm_slot_cache __read_mostly; #define MAX_PTE_MAPPED_THP 8 +struct collapse_control { + /* Respect khugepaged_max_ptes_[none|swap|shared] */ + bool enforce_pte_scan_limits; + + /* Num pages scanned per node */ + int node_load[MAX_NUMNODES]; + + /* Last target selected in khugepaged_find_target_node() for this scan */ + int last_target_node; +}; + +static void collapse_control_init(struct collapse_control *cc, + bool enforce_pte_scan_limits) +{ + cc->enforce_pte_scan_limits = enforce_pte_scan_limits; + cc->last_target_node = NUMA_NO_NODE; +} + /** * struct mm_slot - hash lookup from mm to mm_slot * @hash: hash collision list @@ -601,6 +619,7 @@ static bool is_refcount_suitable(struct page *page) static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, + bool enforce_pte_scan_limits, struct list_head *compound_pagelist) { struct page *page = NULL; @@ -614,7 +633,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (++none_or_zero <= khugepaged_max_ptes_none || + !enforce_pte_scan_limits)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -634,8 +654,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { + if (page_mapcount(page) > 1 && enforce_pte_scan_limits && + ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; @@ -785,9 +805,7 @@ static void khugepaged_alloc_sleep(void) remove_wait_queue(&khugepaged_wait, &wait); } -static int khugepaged_node_load[MAX_NUMNODES]; - -static bool khugepaged_scan_abort(int nid) +static bool khugepaged_scan_abort(int nid, struct collapse_control *cc) { int i; @@ -799,11 +817,11 @@ static bool khugepaged_scan_abort(int nid) return false; /* If there is a count for this node already, it must be acceptable */ - if (khugepaged_node_load[nid]) + if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { - if (!khugepaged_node_load[i]) + if (!cc->node_load[i]) continue; if (node_distance(nid, i) > node_reclaim_distance) return true; @@ -818,28 +836,28 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) } #ifdef CONFIG_NUMA -static int khugepaged_find_target_node(void) +static int khugepaged_find_target_node(struct collapse_control *cc) { - static int last_khugepaged_target_node = NUMA_NO_NODE; int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) - if (khugepaged_node_load[nid] > max_value) { - max_value = khugepaged_node_load[nid]; + if (cc->node_load[nid] > max_value) { + max_value = cc->node_load[nid]; target_node = nid; } /* do some balance if several nodes have the same hit record */ - if (target_node <= last_khugepaged_target_node) - for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; - nid++) - if (max_value == khugepaged_node_load[nid]) { + if (target_node <= cc->last_target_node) + for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES; + nid++) { + if (max_value == cc->node_load[nid]) { target_node = nid; break; } + } - last_khugepaged_target_node = target_node; + cc->last_target_node = target_node; return target_node; } @@ -877,7 +895,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) return *hpage; } #else -static int khugepaged_find_target_node(void) +static int khugepaged_find_target_node(struct collapse_control *cc) { return 0; } @@ -1043,7 +1061,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - int node, int referenced, int unmapped) + int node, int referenced, int unmapped, + int enforce_pte_scan_limits) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; @@ -1141,7 +1160,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pte_ptl); isolated = __collapse_huge_page_isolate(vma, address, pte, - &compound_pagelist); + enforce_pte_scan_limits, &compound_pagelist); spin_unlock(pte_ptl); if (unlikely(!isolated)) { @@ -1206,7 +1225,8 @@ static void collapse_huge_page(struct mm_struct *mm, static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, - struct page **hpage) + struct page **hpage, + struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; @@ -1226,13 +1246,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out; } - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { - if (++unmapped <= khugepaged_max_ptes_swap) { + if (++unmapped <= khugepaged_max_ptes_swap || + !cc->enforce_pte_scan_limits) { /* * Always be strict with uffd-wp * enabled swap entries. Please see @@ -1251,7 +1272,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (++none_or_zero <= khugepaged_max_ptes_none || + !cc->enforce_pte_scan_limits)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -1282,7 +1304,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { + ++shared > khugepaged_max_ptes_shared && + cc->enforce_pte_scan_limits) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; @@ -1292,16 +1315,16 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* * Record which node the original page is from and save this - * information to khugepaged_node_load[]. + * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (khugepaged_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; goto out_unmap; @@ -1352,10 +1375,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, out_unmap: pte_unmap_unlock(pte, ptl); if (ret) { - node = khugepaged_find_target_node(); + node = khugepaged_find_target_node(cc); /* collapse_huge_page will return with the mmap_lock released */ collapse_huge_page(mm, address, hpage, node, - referenced, unmapped); + referenced, unmapped, + cc->enforce_pte_scan_limits); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, @@ -1992,7 +2016,8 @@ static void collapse_file(struct mm_struct *mm, } static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) + struct file *file, pgoff_t start, struct page **hpage, + struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -2003,14 +2028,15 @@ static void khugepaged_scan_file(struct mm_struct *mm, present = 0; swap = 0; - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); rcu_read_lock(); xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) { - if (++swap > khugepaged_max_ptes_swap) { + if (cc->enforce_pte_scan_limits && + ++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; @@ -2028,11 +2054,11 @@ static void khugepaged_scan_file(struct mm_struct *mm, } node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (khugepaged_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; @@ -2061,11 +2087,12 @@ static void khugepaged_scan_file(struct mm_struct *mm, rcu_read_unlock(); if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none && + cc->enforce_pte_scan_limits) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - node = khugepaged_find_target_node(); + node = khugepaged_find_target_node(cc); collapse_file(mm, file, start, hpage, node); } } @@ -2074,7 +2101,8 @@ static void khugepaged_scan_file(struct mm_struct *mm, } #else static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) + struct file *file, pgoff_t start, struct page **hpage, + struct collapse_control *cc) { BUILD_BUG(); } @@ -2085,7 +2113,8 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) #endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, - struct page **hpage) + struct page **hpage, + struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { @@ -2161,12 +2190,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, mmap_read_unlock(mm); ret = 1; - khugepaged_scan_file(mm, file, pgoff, hpage); + khugepaged_scan_file(mm, file, pgoff, hpage, cc); fput(file); } else { ret = khugepaged_scan_pmd(mm, vma, khugepaged_scan.address, - hpage); + hpage, cc); } /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; @@ -2222,7 +2251,7 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_do_scan(void) +static void khugepaged_do_scan(struct collapse_control *cc) { struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; @@ -2246,7 +2275,7 @@ static void khugepaged_do_scan(void) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - &hpage); + &hpage, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); @@ -2285,12 +2314,15 @@ static void khugepaged_wait_work(void) static int khugepaged(void *none) { struct mm_slot *mm_slot; + struct collapse_control cc; + + collapse_control_init(&cc, /* enforce_pte_scan_limits= */ 1); set_freezable(); set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { - khugepaged_do_scan(); + khugepaged_do_scan(&cc); khugepaged_wait_work(); } -- 2.35.1.616.g0bdcbb4464-goog