In the anonymous collapse path, khugepaged always collapses a pte-mapped
hugepage by allocating a new hugepage and copying into it.

In some scenarios, we only need to update the page tables that map an
anonymous pte-mapped THP, in the same way as for file/shmem-backed
pte-mapped THPs, as done in commit 58ac9a8993a1 ("mm/khugepaged: attempt
to map file/shmem-backed pte-mapped THPs by pmds").

The simplest scenario that satisfies the conditions, as David points out,
is when no subpage is PageAnonExclusive (so all PTEs must be R/O): in that
case we can collapse into a R/O PMD without further action.

Let's start with this simplest scenario.

Signed-off-by: Xu Yu <xuyu@xxxxxxxxxxxxxxxxx>
---
Note: an illustrative userspace sketch of the scenario targeted here is
appended after the patch.

 mm/khugepaged.c | 212 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 212 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 88433cc25d8a..57e261387124 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1237,6 +1237,196 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	return result;
 }
 
+static struct folio *find_lock_pte_mapped_folio(struct vm_area_struct *vma,
+						unsigned long addr, pmd_t *pmd)
+{
+	pte_t *pte, pteval;
+	struct folio *folio = NULL;
+
+	pte = pte_offset_map(pmd, addr);
+	if (!pte)
+		return NULL;
+
+	pteval = ptep_get_lockless(pte);
+	if (pte_none(pteval) || !pte_present(pteval))
+		goto out;
+
+	folio = vm_normal_folio(vma, addr, pteval);
+	if (unlikely(!folio) || unlikely(folio_is_zone_device(folio)))
+		goto out;
+
+	if (!folio_trylock(folio)) {
+		folio = NULL;
+		goto out;
+	}
+
+	if (!folio_try_get(folio)) {
+		folio_unlock(folio);
+		folio = NULL;
+		goto out;
+	}
+
+out:
+	pte_unmap(pte);
+	return folio;
+}
+
+static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
+		struct vm_area_struct *vma,
+		unsigned long haddr, bool *mmap_locked,
+		struct collapse_control *cc)
+{
+	struct mmu_notifier_range range;
+	struct folio *folio;
+	pte_t *start_pte, *pte;
+	pmd_t *pmd, pmdval;
+	spinlock_t *pml, *ptl;
+	pgtable_t pgtable;
+	unsigned long addr;
+	int exclusive = 0;
+	bool writable = false;
+	int result, i;
+
+	/* Fast check before locking folio if already PMD-mapped */
+	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
+	if (result == SCAN_PMD_MAPPED)
+		return result;
+
+	folio = find_lock_pte_mapped_folio(vma, haddr, pmd);
+	if (!folio)
+		return SCAN_PAGE_NULL;
+	if (!folio_test_large(folio)) {
+		result = SCAN_FAIL;
+		goto drop_folio;
+	}
+	if (folio_order(folio) != HPAGE_PMD_ORDER) {
+		result = SCAN_PAGE_COMPOUND;
+		goto drop_folio;
+	}
+
+	mmap_read_unlock(mm);
+	*mmap_locked = false;
+
+	/* Prevent all access to pagetables */
+	mmap_write_lock(mm);
+	vma_start_write(vma);
+
+	result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	result = check_pmd_still_valid(mm, haddr, pmd);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	/* Recheck with mmap write lock */
+	result = SCAN_SUCCEED;
+	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+	if (!start_pte)
+		goto up_write;
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *subpage;
+		pte_t pteval = ptep_get(pte);
+
+		if (pte_none(pteval) || !pte_present(pteval)) {
+			result = SCAN_PTE_NON_PRESENT;
+			break;
+		}
+
+		if (pte_uffd_wp(pteval)) {
+			result = SCAN_PTE_UFFD_WP;
+			break;
+		}
+
+		if (pte_write(pteval))
+			writable = true;
+
+		subpage = vm_normal_page(vma, addr, pteval);
+
+		if (unlikely(!subpage) ||
+		    unlikely(is_zone_device_page(subpage))) {
+			result = SCAN_PAGE_NULL;
+			break;
+		}
+
+		if (folio_page(folio, i) != subpage) {
+			result = SCAN_FAIL;
+			break;
+		}
+
+		if (PageAnonExclusive(subpage))
+			exclusive++;
+	}
+	pte_unmap_unlock(start_pte, ptl);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	/*
+	 * Case 1:
+	 * No subpages are PageAnonExclusive (PTEs must be R/O), we can
+	 * collapse into a R/O PMD without further action.
+	 */
+	if (!(exclusive == 0 && !writable))
+		goto up_write;
+
+	/* Collapse pmd entry */
+	anon_vma_lock_write(vma->anon_vma);
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	pml = pmd_lock(mm, pmd); /* probably unnecessary */
+	pmdval = pmdp_collapse_flush(vma, haddr, pmd);
+	spin_unlock(pml);
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_remove_table_sync_one();
+
+	anon_vma_unlock_write(vma->anon_vma);
+
+	/*
+	 * Obtain a new pmd rmap before dropping pte rmaps to avoid
+	 * false-negative page_mapped().
+	 */
+	folio_get(folio);
+	page_add_anon_rmap(&folio->page, vma, haddr, RMAP_COMPOUND);
+
+	start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *subpage;
+		pte_t pteval = ptep_get(pte);
+
+		ptep_clear(mm, addr, pte);
+		subpage = vm_normal_page(vma, addr, pteval);
+		page_remove_rmap(subpage, vma, false);
+	}
+	pte_unmap_unlock(start_pte, ptl);
+	folio_ref_sub(folio, HPAGE_PMD_NR);
+
+	/* Install pmd entry */
+	pgtable = pmd_pgtable(pmdval);
+	pmdval = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+	spin_lock(pml);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	set_pmd_at(mm, haddr, pmd, pmdval);
+	update_mmu_cache_pmd(vma, haddr, pmd);
+	spin_unlock(pml);
+
+	result = SCAN_SUCCEED;
+
+up_write:
+	mmap_write_unlock(mm);
+
+drop_folio:
+	folio_unlock(folio);
+	folio_put(folio);
+
+	/* TODO: tracepoints */
+	return result;
+}
+
 static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 				   struct vm_area_struct *vma,
 				   unsigned long address, bool *mmap_locked,
@@ -1251,6 +1441,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE, unmapped = 0;
 	bool writable = false;
+	int exclusive = 0;
+	bool is_hpage = false;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1333,8 +1525,14 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 			}
 		}
 
+		if (PageAnonExclusive(page))
+			exclusive++;
+
 		page = compound_head(page);
 
+		if (compound_order(page) == HPAGE_PMD_ORDER)
+			is_hpage = true;
+
 		/*
 		 * Record which node the original page is from and save this
 		 * information to cc->node_load[].
@@ -1396,7 +1594,21 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	}
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+
+	if (is_hpage && (exclusive == 0 && !writable)) {
+		int res;
+
+		res = collapse_pte_mapped_anon_thp(mm, vma, address,
+						   mmap_locked, cc);
+		if (res == SCAN_PMD_MAPPED || res == SCAN_SUCCEED) {
+			result = res;
+			goto out;
+		}
+	}
+
 	if (result == SCAN_SUCCEED) {
+		if (!*mmap_locked)
+			mmap_read_lock(mm);
 		result = collapse_huge_page(mm, address, referenced,
 					    unmapped, cc);
 		/* collapse_huge_page will return with the mmap_lock released */
--
2.37.1
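
For illustration only, not part of the patch: below is a rough, untested
userspace sketch of one way to construct the "Case 1" scenario this path
targets, i.e. a pte-mapped anonymous THP whose PTEs are all R/O and whose
subpages are not PageAnonExclusive. It assumes 2MiB PMDs, THP enabled at
least in "madvise" mode, and MADV_COLLAPSE support; whether the collapse
then actually takes the new in-place path still depends on the scan checks
above.

/*
 * Sketch: build a R/O, non-exclusive pte-mapped anonymous THP and ask
 * for a collapse.  Error handling is minimal.
 */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25
#endif

#define THP_SIZE	(2UL << 20)

int main(void)
{
	char *map, *thp;
	pid_t pid;

	/* Over-allocate so a 2MiB-aligned candidate region exists. */
	map = mmap(NULL, 2 * THP_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED)
		return 1;
	thp = (char *)(((unsigned long)map + THP_SIZE - 1) & ~(THP_SIZE - 1));

	madvise(thp, THP_SIZE, MADV_HUGEPAGE);
	memset(thp, 1, THP_SIZE);	/* fault in, ideally as a PMD-mapped THP */

	/* mprotect() on a single subpage splits the PMD: pte-mapped THP. */
	mprotect(thp, getpagesize(), PROT_READ);
	mprotect(thp, getpagesize(), PROT_READ | PROT_WRITE);

	/*
	 * fork() write-protects the PTEs and clears PageAnonExclusive on the
	 * now-shared subpages; as long as the parent does not write to the
	 * range again, no PTE is writable and no subpage is exclusive.
	 */
	pid = fork();
	if (pid == 0) {
		pause();		/* keep the subpages shared */
		_exit(0);
	}

	/* Request a collapse; with this patch it may happen in place. */
	if (madvise(thp, THP_SIZE, MADV_COLLAPSE))
		perror("MADV_COLLAPSE");

	kill(pid, SIGKILL);
	waitpid(pid, NULL, 0);
	return 0;
}

With the child keeping the subpages shared, the scan should see
exclusive == 0 and writable == false, so MADV_COLLAPSE can exercise
collapse_pte_mapped_anon_thp() instead of the allocate-and-copy path.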