On 7/17/23 22:31, Ryan Roberts wrote:
> This allows batching the rmap removal with folio_remove_rmap_range(),
> which means we avoid spuriously adding a partially unmapped folio to the
> deferred split queue in the common case, which reduces split queue lock
> contention.
>
> Previously each page was removed from the rmap individually with
> page_remove_rmap(). If the first page belonged to a large folio, this
> would cause page_remove_rmap() to conclude that the folio was now
> partially mapped and add the folio to the deferred split queue. But
> subsequent calls would cause the folio to become fully unmapped, meaning
> there is no value to adding it to the split queue.
>
> Signed-off-by: Ryan Roberts <ryan.roberts@xxxxxxx>
> ---
>  mm/memory.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 119 insertions(+)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 01f39e8144ef..6facb8c8807a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1391,6 +1391,95 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
>  		pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
>  }
>
> +static inline unsigned long page_addr(struct page *page,
> +				struct page *anchor, unsigned long anchor_addr)
> +{
> +	unsigned long offset;
> +	unsigned long addr;
> +
> +	offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
> +	addr = anchor_addr + offset;
> +
> +	if (anchor > page) {
> +		if (addr > anchor_addr)
> +			return 0;
> +	} else {
> +		if (addr < anchor_addr)
> +			return ULONG_MAX;
> +	}
> +
> +	return addr;
> +}
> +
> +static int calc_anon_folio_map_pgcount(struct folio *folio,
> +				struct page *page, pte_t *pte,
> +				unsigned long addr, unsigned long end)
> +{
> +	pte_t ptent;
> +	int floops;
> +	int i;
> +	unsigned long pfn;
> +
> +	end = min(page_addr(&folio->page + folio_nr_pages(folio), page, addr),
> +		  end);
> +	floops = (end - addr) >> PAGE_SHIFT;
> +	pfn = page_to_pfn(page);
> +	pfn++;
> +	pte++;
> +
> +	for (i = 1; i < floops; i++) {
> +		ptent = ptep_get(pte);
> +
> +		if (!pte_present(ptent) ||
> +		    pte_pfn(ptent) != pfn) {
> +			return i;
> +		}
> +
> +		pfn++;
> +		pte++;
> +	}
> +
> +	return floops;
> +}
> +
> +static unsigned long zap_anon_pte_range(struct mmu_gather *tlb,
> +					struct vm_area_struct *vma,
> +					struct page *page, pte_t *pte,
> +					unsigned long addr, unsigned long end,
> +					bool *full_out)
> +{
> +	struct folio *folio = page_folio(page);
> +	struct mm_struct *mm = tlb->mm;
> +	pte_t ptent;
> +	int pgcount;
> +	int i;
> +	bool full;
> +
> +	pgcount = calc_anon_folio_map_pgcount(folio, page, pte, addr, end);
> +
> +	for (i = 0; i < pgcount;) {
> +		ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
> +		tlb_remove_tlb_entry(tlb, pte, addr);
> +		full = __tlb_remove_page(tlb, page, 0);
> +
> +		if (unlikely(page_mapcount(page) < 1))
> +			print_bad_pte(vma, addr, ptent, page);
> +
> +		i++;
> +		page++;
> +		pte++;
> +		addr += PAGE_SIZE;
> +
> +		if (unlikely(full))
> +			break;
> +	}
> +
> +	folio_remove_rmap_range(folio, page - i, i, vma);
> +
> +	*full_out = full;
> +	return i;
> +}
> +
>  static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  				struct vm_area_struct *vma, pmd_t *pmd,
>  				unsigned long addr, unsigned long end,
> @@ -1428,6 +1517,36 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  			page = vm_normal_page(vma, addr, ptent);
>  			if (unlikely(!should_zap_page(details, page)))
>  				continue;
> +
> +			/*
> +			 * Batch zap large anonymous folio mappings. This allows
> +			 * batching the rmap removal, which means we avoid
> +			 * spuriously adding a partially unmapped folio to the
> +			 * deferred split queue in the common case, which
> +			 * reduces split queue lock contention. Require the VMA
> +			 * to be anonymous to ensure that none of the PTEs in
> +			 * the range require zap_install_uffd_wp_if_needed().
> +			 */
> +			if (page && PageAnon(page) && vma_is_anonymous(vma)) {

Why is this only for anonymous pages? I suppose it could support file
mappings also.


Regards
Yin, Fengwei

> +				bool full;
> +				int pgcount;
> +
> +				pgcount = zap_anon_pte_range(tlb, vma,
> +						page, pte, addr, end, &full);
> +
> +				rss[mm_counter(page)] -= pgcount;
> +				pgcount--;
> +				pte += pgcount;
> +				addr += pgcount << PAGE_SHIFT;
> +
> +				if (unlikely(full)) {
> +					force_flush = 1;
> +					addr += PAGE_SIZE;
> +					break;
> +				}
> +				continue;
> +			}
> +
>  			ptent = ptep_get_and_clear_full(mm, addr, pte,
>  							tlb->fullmm);
>  			tlb_remove_tlb_entry(tlb, pte, addr);
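For reference, the deferred-split argument from the commit message restated
as code (illustration only, not part of the patch): assume a fully mapped
large folio being zapped, with folio/page/vma as in zap_anon_pte_range()
above, and nr standing in for the number of PTEs covered.

	/*
	 * Before: one rmap call per page. The first call observes the
	 * folio going from fully mapped to partially mapped and adds it
	 * to the deferred split queue, even though the remaining
	 * iterations unmap it completely anyway.
	 */
	for (i = 0; i < nr; i++)
		page_remove_rmap(page + i, vma, false);

	/*
	 * After: one call covering the whole range, as done at the end
	 * of zap_anon_pte_range(). The folio goes from fully mapped to
	 * fully unmapped in a single step, so it is never queued for
	 * deferred split.
	 */
	folio_remove_rmap_range(folio, page, nr, vma);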
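To make the file-mapping question above concrete, a minimal, untested sketch
of how the gate might be relaxed. userfaultfd_wp() is the existing per-VMA
uffd-wp predicate; the dirty/accessed bookkeeping that the current per-pte
path does for !PageAnon pages is deliberately not shown here.

	/*
	 * Hypothetical only: batch any normal page when the VMA is not
	 * uffd-wp armed, so no PTE in the range needs
	 * zap_install_uffd_wp_if_needed() to install a marker. A real
	 * version would also have to fold in the pte_dirty()/
	 * set_page_dirty() and mark_page_accessed() handling for
	 * file-backed pages.
	 */
	if (page && !userfaultfd_wp(vma)) {
		/* same batching as zap_anon_pte_range() */
	}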