Add struct collapse_result which aggregates data from a single
khugepaged_scan_pmd() or khugepaged_scan_file() request.  Change
khugepaged to take action based on this returned data instead of deep
within the collapsing functions themselves.

Signed-off-by: Zach O'Keefe <zokeefe@xxxxxxxxxx>
---
 mm/khugepaged.c | 187 ++++++++++++++++++++++++++----------------------
 1 file changed, 101 insertions(+), 86 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c4962191d6e1..0e4f5fbe00d2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -99,6 +99,14 @@ struct collapse_control {
 			      int node);
 };
 
+/* Gather information from one khugepaged_scan_[pmd|file]() request */
+struct collapse_result {
+	enum scan_result result;
+
+	/* Was mmap_lock dropped during request? */
+	bool dropped_mmap_lock;
+};
+
 /**
  * struct mm_slot - hash lookup from mm to mm_slot
  * @hash: hash collision list
@@ -743,13 +751,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		result = SCAN_SUCCEED;
 		trace_mm_collapse_huge_page_isolate(page, none_or_zero,
 						    referenced, writable, result);
-		return 1;
+		return SCAN_SUCCEED;
 	}
 out:
 	release_pte_pages(pte, _pte, compound_pagelist);
 	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
 					    referenced, writable, result);
-	return 0;
+	return result;
 }
 
 static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -1087,7 +1095,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 
 static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 			       struct collapse_control *cc, int referenced,
-			       int unmapped)
+			       int unmapped, struct collapse_result *cr)
 {
 	LIST_HEAD(compound_pagelist);
 	pmd_t *pmd, _pmd;
@@ -1095,7 +1103,6 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	pgtable_t pgtable;
 	struct page *new_page;
 	spinlock_t *pmd_ptl, *pte_ptl;
-	int isolated = 0, result = 0;
 	struct vm_area_struct *vma;
 	struct mmu_notifier_range range;
 	gfp_t gfp;
@@ -1103,6 +1110,7 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	int node;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	cr->result = SCAN_FAIL;
 
 	/* Only allocate from the target node */
 	gfp = cc->gfp() | __GFP_THISNODE;
@@ -1114,6 +1122,7 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 * that. We will recheck the vma after taking it again in write mode.
 	 */
 	mmap_read_unlock(mm);
+	cr->dropped_mmap_lock = true;
 
 	node = khugepaged_find_target_node(cc);
 	/* sched to specified node before huage page memory copy */
@@ -1124,26 +1133,26 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	}
 	new_page = cc->alloc_hpage(cc, gfp, node);
 	if (!new_page) {
-		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+		cr->result = SCAN_ALLOC_HUGE_PAGE_FAIL;
 		goto out_nolock;
 	}
 
 	if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
-		result = SCAN_CGROUP_CHARGE_FAIL;
+		cr->result = SCAN_CGROUP_CHARGE_FAIL;
 		goto out_nolock;
 	}
 	count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
 
 	mmap_read_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, &vma);
-	if (result) {
+	cr->result = hugepage_vma_revalidate(mm, address, &vma);
+	if (cr->result) {
 		mmap_read_unlock(mm);
 		goto out_nolock;
 	}
 
 	pmd = mm_find_pmd(mm, address);
 	if (!pmd) {
-		result = SCAN_PMD_NULL;
+		cr->result = SCAN_PMD_NULL;
 		mmap_read_unlock(mm);
 		goto out_nolock;
 	}
@@ -1166,8 +1175,8 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 * handled by the anon_vma lock + PG_lock.
 	 */
 	mmap_write_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, &vma);
-	if (result)
+	cr->result = hugepage_vma_revalidate(mm, address, &vma);
+	if (cr->result)
 		goto out_up_write;
 	/* check if the pmd is still valid */
 	if (mm_find_pmd(mm, address) != pmd)
@@ -1194,11 +1203,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	mmu_notifier_invalidate_range_end(&range);
 
 	spin_lock(pte_ptl);
-	isolated = __collapse_huge_page_isolate(vma, address, pte,
-			&compound_pagelist);
+	cr->result = __collapse_huge_page_isolate(vma, address, pte,
+						  &compound_pagelist);
 	spin_unlock(pte_ptl);
 
-	if (unlikely(!isolated)) {
+	if (unlikely(cr->result != SCAN_SUCCEED)) {
 		pte_unmap(pte);
 		spin_lock(pmd_ptl);
 		BUG_ON(!pmd_none(*pmd));
@@ -1210,7 +1219,7 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
 		spin_unlock(pmd_ptl);
 		anon_vma_unlock_write(vma->anon_vma);
-		result = SCAN_FAIL;
+		cr->result = SCAN_FAIL;
 		goto out_up_write;
 	}
 
@@ -1246,25 +1255,25 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 
 	cc->hpage = NULL;
 
-	khugepaged_pages_collapsed++;
-	result = SCAN_SUCCEED;
+	cr->result = SCAN_SUCCEED;
 out_up_write:
 	mmap_write_unlock(mm);
out_nolock:
 	if (!IS_ERR_OR_NULL(cc->hpage))
 		mem_cgroup_uncharge(page_folio(cc->hpage));
-	trace_mm_collapse_huge_page(mm, isolated, result);
+	trace_mm_collapse_huge_page(mm, cr->result == SCAN_SUCCEED, cr->result);
 	return;
 }
 
-static int khugepaged_scan_pmd(struct mm_struct *mm,
-			       struct vm_area_struct *vma,
-			       unsigned long address,
-			       struct collapse_control *cc)
+static void khugepaged_scan_pmd(struct mm_struct *mm,
+				struct vm_area_struct *vma,
+				unsigned long address,
+				struct collapse_control *cc,
+				struct collapse_result *cr)
 {
 	pmd_t *pmd;
 	pte_t *pte, *_pte;
-	int ret = 0, result = 0, referenced = 0;
+	int referenced = 0;
 	int none_or_zero = 0, shared = 0;
 	struct page *page = NULL;
 	unsigned long _address;
@@ -1273,9 +1282,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	bool writable = false;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	cr->result = SCAN_FAIL;
 
-	result = find_pmd_or_thp_or_none(mm, address, &pmd);
-	if (result != SCAN_SUCCEED)
+	cr->result = find_pmd_or_thp_or_none(mm, address, &pmd);
+	if (cr->result != SCAN_SUCCEED)
 		goto out;
 
 	memset(cc->node_load, 0, sizeof(cc->node_load));
@@ -1291,12 +1301,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 				 * comment below for pte_uffd_wp().
 				 */
 				if (pte_swp_uffd_wp(pteval)) {
-					result = SCAN_PTE_UFFD_WP;
+					cr->result = SCAN_PTE_UFFD_WP;
 					goto out_unmap;
 				}
 				continue;
 			} else {
-				result = SCAN_EXCEED_SWAP_PTE;
+				cr->result = SCAN_EXCEED_SWAP_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
 				goto out_unmap;
 			}
@@ -1306,7 +1316,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 			    ++none_or_zero <= khugepaged_max_ptes_none) {
 				continue;
 			} else {
-				result = SCAN_EXCEED_NONE_PTE;
+				cr->result = SCAN_EXCEED_NONE_PTE;
 				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 				goto out_unmap;
 			}
@@ -1321,7 +1331,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 				 * userfault messages that falls outside of
 				 * the registered range.  So, just be simple.
 				 */
-				result = SCAN_PTE_UFFD_WP;
+				cr->result = SCAN_PTE_UFFD_WP;
 				goto out_unmap;
 			}
 			if (pte_write(pteval))
@@ -1329,13 +1339,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 				writable = true;
 
 		page = vm_normal_page(vma, _address, pteval);
 		if (unlikely(!page)) {
-			result = SCAN_PAGE_NULL;
+			cr->result = SCAN_PAGE_NULL;
 			goto out_unmap;
 		}
 
 		if (page_mapcount(page) > 1 &&
 				++shared > khugepaged_max_ptes_shared) {
-			result = SCAN_EXCEED_SHARED_PTE;
+			cr->result = SCAN_EXCEED_SHARED_PTE;
 			count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
 			goto out_unmap;
 		}
@@ -1350,20 +1360,20 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		 */
 		node = page_to_nid(page);
 		if (khugepaged_scan_abort(node, cc)) {
-			result = SCAN_SCAN_ABORT;
+			cr->result = SCAN_SCAN_ABORT;
 			goto out_unmap;
 		}
 		cc->node_load[node]++;
 		if (!PageLRU(page)) {
-			result = SCAN_PAGE_LRU;
+			cr->result = SCAN_PAGE_LRU;
 			goto out_unmap;
 		}
 		if (PageLocked(page)) {
-			result = SCAN_PAGE_LOCK;
+			cr->result = SCAN_PAGE_LOCK;
 			goto out_unmap;
 		}
 		if (!PageAnon(page)) {
-			result = SCAN_PAGE_ANON;
+			cr->result = SCAN_PAGE_ANON;
 			goto out_unmap;
 		}
@@ -1385,7 +1395,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		 * will be done again later the risk seems low.
 		 */
 		if (!is_refcount_suitable(page)) {
-			result = SCAN_PAGE_COUNT;
+			cr->result = SCAN_PAGE_COUNT;
 			goto out_unmap;
 		}
 		if (pte_young(pteval) ||
@@ -1394,23 +1404,20 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		referenced++;
 	}
 	if (!writable) {
-		result = SCAN_PAGE_RO;
+		cr->result = SCAN_PAGE_RO;
 	} else if (!referenced ||
 		   (unmapped && referenced < HPAGE_PMD_NR/2)) {
-		result = SCAN_LACK_REFERENCED_PAGE;
+		cr->result = SCAN_LACK_REFERENCED_PAGE;
 	} else {
-		result = SCAN_SUCCEED;
-		ret = 1;
+		cr->result = SCAN_SUCCEED;
 	}
out_unmap:
 	pte_unmap_unlock(pte, ptl);
-	if (ret) {
+	if (cr->result == SCAN_SUCCEED)
 		/* collapse_huge_page will return with the mmap_lock released */
-		collapse_huge_page(mm, address, cc, referenced, unmapped);
-	}
+		collapse_huge_page(mm, address, cc, referenced, unmapped, cr);
out:
 	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
-				     none_or_zero, result, unmapped);
-	return ret;
+				     none_or_zero, cr->result, unmapped);
 }
 
 static void collect_mm_slot(struct mm_slot *mm_slot)
@@ -1671,6 +1678,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
  * @file: file that collapse on
  * @start: collapse start address
  * @cc: collapse context and scratchpad
+ * @cr: aggregate result information of collapse
  *
  * Basic scheme is simple, details are more complex:
  * - allocate and lock a new huge page;
@@ -1689,7 +1697,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
  */
 static void collapse_file(struct mm_struct *mm,
 			  struct file *file, pgoff_t start,
-			  struct collapse_control *cc)
+			  struct collapse_control *cc,
+			  struct collapse_result *cr)
+
 {
 	struct address_space *mapping = file->f_mapping;
 	gfp_t gfp;
@@ -1697,25 +1707,27 @@ static void collapse_file(struct mm_struct *mm,
 	pgoff_t index, end = start + HPAGE_PMD_NR;
 	LIST_HEAD(pagelist);
 	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
-	int nr_none = 0, result = SCAN_SUCCEED;
+	int nr_none = 0;
 	bool is_shmem = shmem_file(file);
 	int nr, node;
 
 	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
 	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
 
+	cr->result = SCAN_SUCCEED;
+
 	/* Only allocate from the target node */
 	gfp = cc->gfp() | __GFP_THISNODE;
 	node = khugepaged_find_target_node(cc);
 
 	new_page = cc->alloc_hpage(cc, gfp, node);
 	if (!new_page) {
-		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+		cr->result = SCAN_ALLOC_HUGE_PAGE_FAIL;
 		goto out;
 	}
 
 	if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
-		result = SCAN_CGROUP_CHARGE_FAIL;
+		cr->result = SCAN_CGROUP_CHARGE_FAIL;
 		goto out;
 	}
 	count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
@@ -1731,7 +1743,7 @@ static void collapse_file(struct mm_struct *mm,
 			break;
 		xas_unlock_irq(&xas);
 		if (!xas_nomem(&xas, GFP_KERNEL)) {
-			result = SCAN_FAIL;
+			cr->result = SCAN_FAIL;
 			goto out;
 		}
 	} while (1);
@@ -1762,13 +1774,13 @@ static void collapse_file(struct mm_struct *mm,
 		 */
 		if (index == start) {
 			if (!xas_next_entry(&xas, end - 1)) {
-				result = SCAN_TRUNCATED;
+				cr->result = SCAN_TRUNCATED;
 				goto xa_locked;
 			}
 			xas_set(&xas, index);
 		}
 		if (!shmem_charge(mapping->host, 1)) {
-			result = SCAN_FAIL;
+			cr->result = SCAN_FAIL;
 			goto xa_locked;
 		}
 		xas_store(&xas, new_page);
@@ -1781,14 +1793,14 @@ static void collapse_file(struct mm_struct *mm,
 				/* swap in or instantiate fallocated page */
 				if (shmem_getpage(mapping->host, index, &page,
 						  SGP_NOALLOC)) {
-					result = SCAN_FAIL;
+					cr->result = SCAN_FAIL;
 					goto xa_unlocked;
 				}
 			} else if (trylock_page(page)) {
 				get_page(page);
 				xas_unlock_irq(&xas);
 			} else {
-				result = SCAN_PAGE_LOCK;
+				cr->result = SCAN_PAGE_LOCK;
 				goto xa_locked;
 			}
 		} else {	/* !is_shmem */
@@ -1801,7 +1813,7 @@ static void collapse_file(struct mm_struct *mm,
 				lru_add_drain();
 				page = find_lock_page(mapping, index);
 				if (unlikely(page == NULL)) {
-					result = SCAN_FAIL;
+					cr->result = SCAN_FAIL;
 					goto xa_unlocked;
 				}
 			} else if (PageDirty(page)) {
@@ -1820,17 +1832,17 @@ static void collapse_file(struct mm_struct *mm,
 				 */
 				xas_unlock_irq(&xas);
 				filemap_flush(mapping);
-				result = SCAN_FAIL;
+				cr->result = SCAN_FAIL;
 				goto xa_unlocked;
 			} else if (PageWriteback(page)) {
 				xas_unlock_irq(&xas);
-				result = SCAN_FAIL;
+				cr->result = SCAN_FAIL;
 				goto xa_unlocked;
 			} else if (trylock_page(page)) {
 				get_page(page);
 				xas_unlock_irq(&xas);
 			} else {
-				result = SCAN_PAGE_LOCK;
+				cr->result = SCAN_PAGE_LOCK;
 				goto xa_locked;
 			}
 		}
@@ -1843,7 +1855,7 @@ static void collapse_file(struct mm_struct *mm,
 
 		/* make sure the page is up to date */
 		if (unlikely(!PageUptodate(page))) {
-			result = SCAN_FAIL;
+			cr->result = SCAN_FAIL;
 			goto out_unlock;
 		}
 
@@ -1852,12 +1864,12 @@ static void collapse_file(struct mm_struct *mm,
 		 * we locked the first page, then a THP might be there already.
 		 */
 		if (PageTransCompound(page)) {
-			result = SCAN_PAGE_COMPOUND;
+			cr->result = SCAN_PAGE_COMPOUND;
 			goto out_unlock;
 		}
 
 		if (page_mapping(page) != mapping) {
-			result = SCAN_TRUNCATED;
+			cr->result = SCAN_TRUNCATED;
 			goto out_unlock;
 		}
 
@@ -1868,18 +1880,18 @@ static void collapse_file(struct mm_struct *mm,
 			 * page is dirty because it hasn't been flushed
 			 * since first write.
 			 */
-			result = SCAN_FAIL;
+			cr->result = SCAN_FAIL;
 			goto out_unlock;
 		}
 
 		if (isolate_lru_page(page)) {
-			result = SCAN_DEL_PAGE_LRU;
+			cr->result = SCAN_DEL_PAGE_LRU;
 			goto out_unlock;
 		}
 
 		if (page_has_private(page) &&
 		    !try_to_release_page(page, GFP_KERNEL)) {
-			result = SCAN_PAGE_HAS_PRIVATE;
+			cr->result = SCAN_PAGE_HAS_PRIVATE;
 			putback_lru_page(page);
 			goto out_unlock;
 		}
@@ -1900,7 +1912,7 @@ static void collapse_file(struct mm_struct *mm,
 		 * - one from isolate_lru_page;
 		 */
 		if (!page_ref_freeze(page, 3)) {
-			result = SCAN_PAGE_COUNT;
+			cr->result = SCAN_PAGE_COUNT;
 			xas_unlock_irq(&xas);
 			putback_lru_page(page);
 			goto out_unlock;
@@ -1935,7 +1947,7 @@ static void collapse_file(struct mm_struct *mm,
 		 */
 		smp_mb();
 		if (inode_is_open_for_write(mapping->host)) {
-			result = SCAN_FAIL;
+			cr->result = SCAN_FAIL;
 			__mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
 			filemap_nr_thps_dec(mapping);
 			goto xa_locked;
@@ -1962,7 +1974,7 @@ static void collapse_file(struct mm_struct *mm,
 	 */
 	try_to_unmap_flush();
 
-	if (result == SCAN_SUCCEED) {
+	if (cr->result == SCAN_SUCCEED) {
 		struct page *page, *tmp;
 
 		/*
@@ -2002,8 +2014,6 @@ static void collapse_file(struct mm_struct *mm,
 		 */
 		retract_page_tables(mapping, start);
 		cc->hpage = NULL;
-
-		khugepaged_pages_collapsed++;
 	} else {
 		struct page *page;
 
@@ -2055,15 +2065,16 @@ static void collapse_file(struct mm_struct *mm,
 static void khugepaged_scan_file(struct mm_struct *mm,
 				 struct file *file, pgoff_t start,
-				 struct collapse_control *cc)
+				 struct collapse_control *cc,
+				 struct collapse_result *cr)
 {
 	struct page *page = NULL;
 	struct address_space *mapping = file->f_mapping;
 	XA_STATE(xas, &mapping->i_pages, start);
 	int present, swap;
 	int node = NUMA_NO_NODE;
-	int result = SCAN_SUCCEED;
 
+	cr->result = SCAN_SUCCEED;
 	present = 0;
 	swap = 0;
 	memset(cc->node_load, 0, sizeof(cc->node_load));
@@ -2074,7 +2085,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
 
 		if (xa_is_value(page)) {
 			if (++swap > khugepaged_max_ptes_swap) {
-				result = SCAN_EXCEED_SWAP_PTE;
+				cr->result = SCAN_EXCEED_SWAP_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
 				break;
 			}
@@ -2086,25 +2097,25 @@ static void khugepaged_scan_file(struct mm_struct *mm,
 		 * into a PMD sized page
 		 */
 		if (PageTransCompound(page)) {
-			result = SCAN_PAGE_COMPOUND;
+			cr->result = SCAN_PAGE_COMPOUND;
 			break;
 		}
 
 		node = page_to_nid(page);
 		if (khugepaged_scan_abort(node, cc)) {
-			result = SCAN_SCAN_ABORT;
+			cr->result = SCAN_SCAN_ABORT;
 			break;
 		}
 		cc->node_load[node]++;
 
 		if (!PageLRU(page)) {
-			result = SCAN_PAGE_LRU;
+			cr->result = SCAN_PAGE_LRU;
 			break;
 		}
 
 		if (page_count(page) !=
 		    1 + page_mapcount(page) + page_has_private(page)) {
-			result = SCAN_PAGE_COUNT;
+			cr->result = SCAN_PAGE_COUNT;
 			break;
 		}
 
@@ -2123,12 +2134,12 @@ static void khugepaged_scan_file(struct mm_struct *mm,
 	}
 	rcu_read_unlock();
 
-	if (result == SCAN_SUCCEED) {
+	if (cr->result == SCAN_SUCCEED) {
 		if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
-			result = SCAN_EXCEED_NONE_PTE;
+			cr->result = SCAN_EXCEED_NONE_PTE;
 			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 		} else {
-			collapse_file(mm, file, start, cc);
+			collapse_file(mm, file, start, cc, cr);
 		}
 	}
 
@@ -2137,7 +2148,8 @@ static void khugepaged_scan_file(struct mm_struct *mm,
 #else
 static void khugepaged_scan_file(struct mm_struct *mm,
 				 struct file *file, pgoff_t start,
-				 struct collapse_control *cc)
+				 struct collapse_control *cc,
+				 struct collapse_result *cr)
 {
 	BUILD_BUG();
 }
@@ -2209,7 +2221,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 			goto skip;
 
 		while (khugepaged_scan.address < hend) {
-			int ret;
+			struct collapse_result cr = {0};
 			cond_resched();
 			if (unlikely(khugepaged_test_exit(mm)))
 				goto breakouterloop;
@@ -2223,17 +2235,20 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 						khugepaged_scan.address);
 
 				mmap_read_unlock(mm);
-				ret = 1;
-				khugepaged_scan_file(mm, file, pgoff, cc);
+				cr.dropped_mmap_lock = true;
+				khugepaged_scan_file(mm, file, pgoff, cc, &cr);
 				fput(file);
 			} else {
-				ret = khugepaged_scan_pmd(mm, vma,
-							  khugepaged_scan.address, cc);
+				khugepaged_scan_pmd(mm, vma,
						    khugepaged_scan.address,
+						    cc, &cr);
 			}
+			if (cr.result == SCAN_SUCCEED)
+				++khugepaged_pages_collapsed;
 			/* move to next address */
 			khugepaged_scan.address += HPAGE_PMD_SIZE;
 			progress += HPAGE_PMD_NR;
-			if (ret)
+			if (cr.dropped_mmap_lock)
 				/* we released mmap_lock so break loop */
 				goto breakouterloop_mmap_lock;
 			if (progress >= pages)
-- 
2.36.0.rc2.479.g8af0fa9b8e-goog
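
For readers tracing the new control flow: the scan helpers now only fill in a
caller-owned struct collapse_result, and khugepaged_scan_mm_slot() alone
interprets it, bumping khugepaged_pages_collapsed on SCAN_SUCCEED and
restarting the VMA walk when mmap_lock was dropped.  Below is a minimal
standalone userspace sketch of that shape; every identifier in it is a
simplified, hypothetical stand-in (not kernel code), and the fake scan helper
"succeeds" on even addresses purely for illustration.

	/*
	 * Minimal sketch of the collapse_result pattern.  All names and the
	 * scan logic are hypothetical stand-ins, not kernel code.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	enum scan_result { SCAN_FAIL, SCAN_SUCCEED };

	struct collapse_result {
		enum scan_result result;
		bool dropped_mmap_lock;	/* did the helper drop the lock? */
	};

	/* stand-in for khugepaged_pages_collapsed */
	static unsigned long pages_collapsed;

	/* Stand-in scan helper: reports through *cr, returns nothing. */
	static void scan_one_region(unsigned long addr, struct collapse_result *cr)
	{
		cr->result = SCAN_FAIL;
		if (addr & 1)
			return;			/* scan failed; lock never dropped */
		cr->dropped_mmap_lock = true;	/* the collapse path drops the lock */
		cr->result = SCAN_SUCCEED;
	}

	int main(void)
	{
		for (unsigned long addr = 0; addr < 4; addr++) {
			struct collapse_result cr = {0};

			scan_one_region(addr, &cr);
			/* The caller, not the helper, acts on the result. */
			if (cr.result == SCAN_SUCCEED)
				++pages_collapsed;
			if (cr.dropped_mmap_lock)
				printf("addr %lu: lock dropped, would restart walk\n",
				       addr);
		}
		printf("collapsed: %lu\n", pages_collapsed);
		return 0;
	}

The point of this shape, per the changelog, is that the success counter and
the lock-drop bookkeeping move out of collapse_huge_page()/collapse_file()
into the one caller that owns the loop, which presumably lets other callers
of the same scan helpers apply their own policy on the aggregated result.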