On 2/24/2023 10:51 AM, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Thu, Feb 23, 2023 at 05:28:10PM +0000, Matthew Wilcox wrote:
>> On Thu, Feb 23, 2023 at 04:31:56PM +0800, Yin Fengwei wrote:
>>> It's to prepare the batched rmap update for large folio.
>>> No need to looped handle hugetlb. Just handle hugetlb and
>>> bail out early.
>>>
>>> Almost no functional change. Just one change to mm counter
>>> update.
>>
>> This looks like a very worthwhile cleanup in its own right. Adding
>> various hugetlb & memory poisoning experts for commentary on the mm
>> counter change (search for three asterisks)
>>
>>> Signed-off-by: Yin Fengwei <fengwei.yin@xxxxxxxxx>
>>> ---
>>>  mm/rmap.c | 205 +++++++++++++++++++++++++++++++++---------------------
>>>  1 file changed, 126 insertions(+), 79 deletions(-)
>>>
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 15ae24585fc4..e7aa63b800f7 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1443,6 +1443,108 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
>>>  	munlock_vma_folio(folio, vma, compound);
>>>  }
>>>
>>> +static bool try_to_unmap_one_hugetlb(struct folio *folio,
>>> +		struct vm_area_struct *vma, struct mmu_notifier_range range,
>>> +		struct page_vma_mapped_walk pvmw, unsigned long address,
>>> +		enum ttu_flags flags)
>>> +{
>>> +	struct mm_struct *mm = vma->vm_mm;
>>> +	pte_t pteval;
>>> +	bool ret = true, anon = folio_test_anon(folio);
>>> +
>>> +	/*
>>> +	 * The try_to_unmap() is only passed a hugetlb page
>>> +	 * in the case where the hugetlb page is poisoned.
>>> +	 */
>>> +	VM_BUG_ON_FOLIO(!folio_test_hwpoison(folio), folio);
>>> +	/*
>>> +	 * huge_pmd_unshare may unmap an entire PMD page.
>>> +	 * There is no way of knowing exactly which PMDs may
>>> +	 * be cached for this mm, so we must flush them all.
>>> +	 * start/end were already adjusted above to cover this
>>> +	 * range.
>>> +	 */
>>> +	flush_cache_range(vma, range.start, range.end);
>>> +
>>> +	/*
>>> +	 * To call huge_pmd_unshare, i_mmap_rwsem must be
>>> +	 * held in write mode. Caller needs to explicitly
>>> +	 * do this outside rmap routines.
>>> +	 *
>>> +	 * We also must hold hugetlb vma_lock in write mode.
>>> +	 * Lock order dictates acquiring vma_lock BEFORE
>>> +	 * i_mmap_rwsem. We can only try lock here and fail
>>> +	 * if unsuccessful.
>>> +	 */
>>> +	if (!anon) {
>>> +		VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
>>> +		if (!hugetlb_vma_trylock_write(vma)) {
>>> +			ret = false;
>>> +			goto out;
>>> +		}
>>> +		if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
>>> +			hugetlb_vma_unlock_write(vma);
>>> +			flush_tlb_range(vma,
>>> +				range.start, range.end);
>>> +			mmu_notifier_invalidate_range(mm,
>>> +				range.start, range.end);
>>> +			/*
>>> +			 * The ref count of the PMD page was
>>> +			 * dropped which is part of the way map
>>> +			 * counting is done for shared PMDs.
>>> +			 * Return 'true' here. When there is
>>> +			 * no other sharing, huge_pmd_unshare
>>> +			 * returns false and we will unmap the
>>> +			 * actual page and drop map count
>>> +			 * to zero.
>>> +			 */
>>> +			goto out;
>>> +		}
>>> +		hugetlb_vma_unlock_write(vma);
>>> +	}
>>> +	pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
>>> +
>>> +	/*
>>> +	 * Now the pte is cleared. If this pte was uffd-wp armed,
>>> +	 * we may want to replace a none pte with a marker pte if
>>> +	 * it's file-backed, so we don't lose the tracking info.
>>> +	 */
>>> +	pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
>>> +
>>> +	/* Set the dirty flag on the folio now the pte is gone. */
>>> +	if (pte_dirty(pteval))
>>> +		folio_mark_dirty(folio);
>>> +
>>> +	/* Update high watermark before we lower rss */
>>> +	update_hiwater_rss(mm);
>>> +
>>> +	if (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) {
>>> +		pteval = swp_entry_to_pte(make_hwpoison_entry(&folio->page));
>>> +		set_huge_pte_at(mm, address, pvmw.pte, pteval);
>>> +	}
>>> +
>>> +	/*** try_to_unmap_one() called dec_mm_counter for
>>> +	 * (folio_test_hwpoison(folio) && !(flags & TTU_IGNORE_HWPOISON)) not
>>> +	 * true case, looks incorrect. Change it to hugetlb_count_sub() here.
>>> +	 */
>>> +	hugetlb_count_sub(folio_nr_pages(folio), mm);
>
> I have no objection to this change (moving hugetlb_count_sub() outside the
> if), but I have a question related to this.
>
> Generally TTU_IGNORE_HWPOISON is used to control the pte-conversion based
> on page dirtiness. But actually what it depends on is whether data lost
> happens when the page is forcibly dropped. And for hugetlb pages, that's
> true regardless of PageDirty flag on it.
> So I think we can assume that try_to_unmap_one_hugetlb() is called with
> TTU_IGNORE_HWPOISON clear. So maybe we don't need the if-check?

Thanks a lot for the detailed explanation. I will remove the if check if
there is no objection from other reviewers.


Regards
Yin, Fengwei

>
> Otherwise, the cleanup looks good to me.
>
> Thanks,
> Naoya Horiguchi
>
>>> +
>>> +	/*
>>> +	 * No need to call mmu_notifier_invalidate_range() it has be
>>> +	 * done above for all cases requiring it to happen under page
>>> +	 * table lock before mmu_notifier_invalidate_range_end()
>>> +	 *
>>> +	 * See Documentation/mm/mmu_notifier.rst
>>> +	 */
>>> +	page_remove_rmap(&folio->page, vma, folio_test_hugetlb(folio));
>>> +	if (vma->vm_flags & VM_LOCKED)
>>> +		mlock_drain_local();
>>> +	folio_put(folio);
>>> +
>>> +out:
>>> +	return ret;
>>> +}
>>> +
>>>  /*
>>>   * @arg: enum ttu_flags will be passed to this argument
>>>   */
>>> @@ -1506,86 +1608,37 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>>>  			break;
>>>  		}
>>>
>>> +		address = pvmw.address;
>>> +		if (folio_test_hugetlb(folio)) {
>>> +			ret = try_to_unmap_one_hugetlb(folio, vma, range,
>>> +					pvmw, address, flags);
>>> +
>>> +			/* no need to loop for hugetlb */
>>> +			page_vma_mapped_walk_done(&pvmw);
>>> +			break;
>>> +		}
>>> +
>>>  		subpage = folio_page(folio,
>>>  				pte_pfn(*pvmw.pte) - folio_pfn(folio));
>>> -		address = pvmw.address;
>>>  		anon_exclusive = folio_test_anon(folio) &&
>>>  				PageAnonExclusive(subpage);
>>>
>>> -		if (folio_test_hugetlb(folio)) {
>>> -			bool anon = folio_test_anon(folio);
>>> -
>>> +		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
>>> +		/* Nuke the page table entry. */
>>> +		if (should_defer_flush(mm, flags)) {
>>>  			/*
>>> -			 * The try_to_unmap() is only passed a hugetlb page
>>> -			 * in the case where the hugetlb page is poisoned.
>>> +			 * We clear the PTE but do not flush so potentially
>>> +			 * a remote CPU could still be writing to the folio.
>>> +			 * If the entry was previously clean then the
>>> +			 * architecture must guarantee that a clear->dirty
>>> +			 * transition on a cached TLB entry is written through
>>> +			 * and traps if the PTE is unmapped.
>>>  			 */
>>> -			VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
>>> -			/*
>>> -			 * huge_pmd_unshare may unmap an entire PMD page.
>>> -			 * There is no way of knowing exactly which PMDs may
>>> -			 * be cached for this mm, so we must flush them all.
>>> -			 * start/end were already adjusted above to cover this
>>> -			 * range.
>>> -			 */
>>> -			flush_cache_range(vma, range.start, range.end);
>>> +			pteval = ptep_get_and_clear(mm, address, pvmw.pte);
>>>
>>> -			/*
>>> -			 * To call huge_pmd_unshare, i_mmap_rwsem must be
>>> -			 * held in write mode. Caller needs to explicitly
>>> -			 * do this outside rmap routines.
>>> -			 *
>>> -			 * We also must hold hugetlb vma_lock in write mode.
>>> -			 * Lock order dictates acquiring vma_lock BEFORE
>>> -			 * i_mmap_rwsem. We can only try lock here and fail
>>> -			 * if unsuccessful.
>>> -			 */
>>> -			if (!anon) {
>>> -				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
>>> -				if (!hugetlb_vma_trylock_write(vma)) {
>>> -					page_vma_mapped_walk_done(&pvmw);
>>> -					ret = false;
>>> -					break;
>>> -				}
>>> -				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
>>> -					hugetlb_vma_unlock_write(vma);
>>> -					flush_tlb_range(vma,
>>> -						range.start, range.end);
>>> -					mmu_notifier_invalidate_range(mm,
>>> -						range.start, range.end);
>>> -					/*
>>> -					 * The ref count of the PMD page was
>>> -					 * dropped which is part of the way map
>>> -					 * counting is done for shared PMDs.
>>> -					 * Return 'true' here. When there is
>>> -					 * no other sharing, huge_pmd_unshare
>>> -					 * returns false and we will unmap the
>>> -					 * actual page and drop map count
>>> -					 * to zero.
>>> -					 */
>>> -					page_vma_mapped_walk_done(&pvmw);
>>> -					break;
>>> -				}
>>> -				hugetlb_vma_unlock_write(vma);
>>> -			}
>>> -			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
>>> +			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
>>>  		} else {
>>> -			flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
>>> -			/* Nuke the page table entry. */
>>> -			if (should_defer_flush(mm, flags)) {
>>> -				/*
>>> -				 * We clear the PTE but do not flush so potentially
>>> -				 * a remote CPU could still be writing to the folio.
>>> -				 * If the entry was previously clean then the
>>> -				 * architecture must guarantee that a clear->dirty
>>> -				 * transition on a cached TLB entry is written through
>>> -				 * and traps if the PTE is unmapped.
>>> -				 */
>>> -				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
>>> -
>>> -				set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
>>> -			} else {
>>> -				pteval = ptep_clear_flush(vma, address, pvmw.pte);
>>> -			}
>>> +			pteval = ptep_clear_flush(vma, address, pvmw.pte);
>>>  		}
>>>
>>>  		/*
>>> @@ -1604,14 +1657,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>>>
>>>  		if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) {
>>>  			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
>>> -			if (folio_test_hugetlb(folio)) {
>>> -				hugetlb_count_sub(folio_nr_pages(folio), mm);
>>> -				set_huge_pte_at(mm, address, pvmw.pte, pteval);
>>> -			} else {
>>> -				dec_mm_counter(mm, mm_counter(&folio->page));
>>> -				set_pte_at(mm, address, pvmw.pte, pteval);
>>> -			}
>>> -
>>> +			dec_mm_counter(mm, mm_counter(&folio->page));
>>> +			set_pte_at(mm, address, pvmw.pte, pteval);
>>>  		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
>>>  			/*
>>>  			 * The guest indicated that the page content is of no
>>> --
>>> 2.30.2
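
For reference, a rough sketch (not the final patch, just to confirm the
direction) of what the hwpoison handling in try_to_unmap_one_hugetlb()
could look like with the if-check dropped, assuming, per Naoya's
reasoning, that this path is only reached for hwpoisoned hugetlb folios
with TTU_IGNORE_HWPOISON clear:

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	/*
	 * The folio is known to be hwpoisoned here (see the
	 * VM_BUG_ON_FOLIO() at the top of the function), so install the
	 * hwpoison swap entry unconditionally instead of testing
	 * TTU_IGNORE_HWPOISON.
	 */
	pteval = swp_entry_to_pte(make_hwpoison_entry(&folio->page));
	set_huge_pte_at(mm, address, pvmw.pte, pteval);
	hugetlb_count_sub(folio_nr_pages(folio), mm);

That would keep the TTU_IGNORE_HWPOISON check in try_to_unmap_one() for
the non-hugetlb case only, while the hugetlb path always installs the
hwpoison entry and accounts the unmap via hugetlb_count_sub().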