On Wed, Jul 6, 2022 at 5:06 PM Zach O'Keefe <zokeefe@xxxxxxxxxx> wrote: > > When scanning an anon pmd to see if it's eligible for collapse, return > SCAN_PMD_MAPPED if the pmd already maps a hugepage. Note that > SCAN_PMD_MAPPED is different from SCAN_PAGE_COMPOUND used in the > file-collapse path, since the latter might identify pte-mapped compound > pages. This is required by MADV_COLLAPSE which necessarily needs to > know what hugepage-aligned/sized regions are already pmd-mapped. > > In order to determine if a pmd already maps a hugepage, refactor > mm_find_pmd(): > > Return mm_find_pmd() to it's pre-commit f72e7dcdd252 ("mm: let mm_find_pmd > fix buggy race with THP fault") behavior. ksm was the only caller that > explicitly wanted a pte-mapping pmd, so open code the pte-mapping logic > there (pmd_present() and pmd_trans_huge() checks). > > Undo revert change in commit f72e7dcdd252 ("mm: let mm_find_pmd fix buggy race > with THP fault") that open-coded split_huge_pmd_address() pmd lookup and > use mm_find_pmd() instead. > > Signed-off-by: Zach O'Keefe <zokeefe@xxxxxxxxxx> Reviewed-by: Yang Shi <shy828301@xxxxxxxxx> > --- > include/trace/events/huge_memory.h | 1 + > mm/huge_memory.c | 18 +-------- > mm/internal.h | 2 +- > mm/khugepaged.c | 60 ++++++++++++++++++++++++------ > mm/ksm.c | 10 +++++ > mm/rmap.c | 15 +++----- > 6 files changed, 67 insertions(+), 39 deletions(-) > > diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h > index d651f3437367..55392bf30a03 100644 > --- a/include/trace/events/huge_memory.h > +++ b/include/trace/events/huge_memory.h > @@ -11,6 +11,7 @@ > EM( SCAN_FAIL, "failed") \ > EM( SCAN_SUCCEED, "succeeded") \ > EM( SCAN_PMD_NULL, "pmd_null") \ > + EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \ > EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ > EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ > EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \ > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 4fbe43dc1568..fb76db6c703e 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -2363,25 +2363,11 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, > void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, > bool freeze, struct folio *folio) > { > - pgd_t *pgd; > - p4d_t *p4d; > - pud_t *pud; > - pmd_t *pmd; > + pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); > > - pgd = pgd_offset(vma->vm_mm, address); > - if (!pgd_present(*pgd)) > + if (!pmd) > return; > > - p4d = p4d_offset(pgd, address); > - if (!p4d_present(*p4d)) > - return; > - > - pud = pud_offset(p4d, address); > - if (!pud_present(*pud)) > - return; > - > - pmd = pmd_offset(pud, address); > - > __split_huge_pmd(vma, pmd, address, freeze, folio); > } > > diff --git a/mm/internal.h b/mm/internal.h > index 6e14749ad1e5..ef8c23fb678f 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -188,7 +188,7 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason > /* > * in mm/rmap.c: > */ > -extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); > +pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); > > /* > * in mm/page_alloc.c > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index b0e20db3f805..c7a09cc9a0e8 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -28,6 +28,7 @@ enum scan_result { > SCAN_FAIL, > SCAN_SUCCEED, > SCAN_PMD_NULL, > + SCAN_PMD_MAPPED, > SCAN_EXCEED_NONE_PTE, > SCAN_EXCEED_SWAP_PTE, > SCAN_EXCEED_SHARED_PTE, > @@ -871,6 +872,45 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, > return SCAN_SUCCEED; > } > > +static int find_pmd_or_thp_or_none(struct mm_struct *mm, > + unsigned long address, > + pmd_t **pmd) > +{ > + pmd_t pmde; > + > + *pmd = mm_find_pmd(mm, address); > + if (!*pmd) > + return SCAN_PMD_NULL; > + > + pmde = pmd_read_atomic(*pmd); > + > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ > + barrier(); > +#endif > + if (!pmd_present(pmde)) > + return SCAN_PMD_NULL; > + if (pmd_trans_huge(pmde)) > + return SCAN_PMD_MAPPED; > + if (pmd_bad(pmde)) > + return SCAN_PMD_NULL; > + return SCAN_SUCCEED; > +} > + > +static int check_pmd_still_valid(struct mm_struct *mm, > + unsigned long address, > + pmd_t *pmd) > +{ > + pmd_t *new_pmd; > + int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); > + > + if (result != SCAN_SUCCEED) > + return result; > + if (new_pmd != pmd) > + return SCAN_FAIL; > + return SCAN_SUCCEED; > +} > + > /* > * Bring missing pages in from swap, to complete THP collapse. > * Only done if khugepaged_scan_pmd believes it is worthwhile. > @@ -982,9 +1022,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > goto out_nolock; > } > > - pmd = mm_find_pmd(mm, address); > - if (!pmd) { > - result = SCAN_PMD_NULL; > + result = find_pmd_or_thp_or_none(mm, address, &pmd); > + if (result != SCAN_SUCCEED) { > mmap_read_unlock(mm); > goto out_nolock; > } > @@ -1012,7 +1051,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > if (result != SCAN_SUCCEED) > goto out_up_write; > /* check if the pmd is still valid */ > - if (mm_find_pmd(mm, address) != pmd) > + result = check_pmd_still_valid(mm, address, pmd); > + if (result != SCAN_SUCCEED) > goto out_up_write; > > anon_vma_lock_write(vma->anon_vma); > @@ -1115,11 +1155,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, > > VM_BUG_ON(address & ~HPAGE_PMD_MASK); > > - pmd = mm_find_pmd(mm, address); > - if (!pmd) { > - result = SCAN_PMD_NULL; > + result = find_pmd_or_thp_or_none(mm, address, &pmd); > + if (result != SCAN_SUCCEED) > goto out; > - } > > memset(cc->node_load, 0, sizeof(cc->node_load)); > pte = pte_offset_map_lock(mm, pmd, address, &ptl); > @@ -1373,8 +1411,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) > if (!PageHead(hpage)) > goto drop_hpage; > > - pmd = mm_find_pmd(mm, haddr); > - if (!pmd) > + if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED) > goto drop_hpage; > > start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); > @@ -1492,8 +1529,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) > if (vma->vm_end < addr + HPAGE_PMD_SIZE) > continue; > mm = vma->vm_mm; > - pmd = mm_find_pmd(mm, addr); > - if (!pmd) > + if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) > continue; > /* > * We need exclusive mmap_lock to retract page table. > diff --git a/mm/ksm.c b/mm/ksm.c > index 075123602bd0..3e0a0a42fa1f 100644 > --- a/mm/ksm.c > +++ b/mm/ksm.c > @@ -1136,6 +1136,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, > { > struct mm_struct *mm = vma->vm_mm; > pmd_t *pmd; > + pmd_t pmde; > pte_t *ptep; > pte_t newpte; > spinlock_t *ptl; > @@ -1150,6 +1151,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, > pmd = mm_find_pmd(mm, addr); > if (!pmd) > goto out; > + /* > + * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() > + * without holding anon_vma lock for write. So when looking for a > + * genuine pmde (in which to find pte), test present and !THP together. > + */ > + pmde = *pmd; > + barrier(); > + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) > + goto out; > > mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, > addr + PAGE_SIZE); > diff --git a/mm/rmap.c b/mm/rmap.c > index edc06c52bc82..af775855e58f 100644 > --- a/mm/rmap.c > +++ b/mm/rmap.c > @@ -767,13 +767,17 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) > return vma_address(page, vma); > } > > +/* > + * Returns the actual pmd_t* where we expect 'address' to be mapped from, or > + * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* > + * represents. > + */ > pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) > { > pgd_t *pgd; > p4d_t *p4d; > pud_t *pud; > pmd_t *pmd = NULL; > - pmd_t pmde; > > pgd = pgd_offset(mm, address); > if (!pgd_present(*pgd)) > @@ -788,15 +792,6 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) > goto out; > > pmd = pmd_offset(pud, address); > - /* > - * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() > - * without holding anon_vma lock for write. So when looking for a > - * genuine pmde (in which to find pte), test present and !THP together. > - */ > - pmde = *pmd; > - barrier(); > - if (!pmd_present(pmde) || pmd_trans_huge(pmde)) > - pmd = NULL; > out: > return pmd; > } > -- > 2.37.0.rc0.161.g10f37bed90-goog >