On Fri, Nov 18, 2022 at 01:12:03AM -0800, Hugh Dickins wrote: > Following suggestion from Linus, instead of counting every PTE map of a > compound page in subpages_mapcount, just count how many of its subpages > are PTE-mapped: this yields the exact number needed for NR_ANON_MAPPED > and NR_FILE_MAPPED stats, without any need for a locked scan of subpages; > and requires updating the count less often. > > This does then revert total_mapcount() and folio_mapcount() to needing a > scan of subpages; but they are inherently racy, and need no locking, so > Linus is right that the scans are much better done there. Plus (unlike > in 6.1 and previous) subpages_mapcount lets us avoid the scan in the > common case of no PTE maps. And page_mapped() and folio_mapped() remain > scanless and just as efficient with the new meaning of subpages_mapcount: > those are the functions which I most wanted to remove the scan from. > > The updated page_dup_compound_rmap() is no longer suitable for use by > anon THP's __split_huge_pmd_locked(); but page_add_anon_rmap() can be > used for that, so long as its VM_BUG_ON_PAGE(!PageLocked) is deleted. > > Evidence is that this way goes slightly faster than the previous > implementation for most cases; but significantly faster in the (now > scanless) pmds after ptes case, which started out at 870ms and was > brought down to 495ms by the previous series, now takes around 105ms. > > Suggested-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> > Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx> Acked-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> Few minor nitpicks below. > --- > Documentation/mm/transhuge.rst | 3 +- > include/linux/mm.h | 52 ++++++----- > include/linux/rmap.h | 8 +- > mm/huge_memory.c | 2 +- > mm/rmap.c | 155 ++++++++++++++------------------- > 5 files changed, 103 insertions(+), 117 deletions(-) > > diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst > index 1e2a637cc607..af4c9d70321d 100644 > --- a/Documentation/mm/transhuge.rst > +++ b/Documentation/mm/transhuge.rst > @@ -122,7 +122,8 @@ pages: > > - map/unmap of sub-pages with PTE entry increment/decrement ->_mapcount > on relevant sub-page of the compound page, and also increment/decrement > - ->subpages_mapcount, stored in first tail page of the compound page. > + ->subpages_mapcount, stored in first tail page of the compound page, when > + _mapcount goes from -1 to 0 or 0 to -1: counting sub-pages mapped by PTE. > In order to have race-free accounting of sub-pages mapped, changes to > sub-page ->_mapcount, ->subpages_mapcount and ->compound_mapcount are > are all locked by bit_spin_lock of PG_locked in the first tail ->flags. > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 8fe6276d8cc2..c9e46d4d46f2 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -828,7 +828,7 @@ static inline int head_compound_mapcount(struct page *head) > } > > /* > - * Sum of mapcounts of sub-pages, does not include compound mapcount. > + * Number of sub-pages mapped by PTE, does not include compound mapcount. > * Must be called only on head of compound page. > */ > static inline int head_subpages_mapcount(struct page *head) > @@ -864,23 +864,7 @@ static inline int page_mapcount(struct page *page) > return head_compound_mapcount(page) + mapcount; > } > > -static inline int total_mapcount(struct page *page) > -{ > - if (likely(!PageCompound(page))) > - return atomic_read(&page->_mapcount) + 1; > - page = compound_head(page); > - return head_compound_mapcount(page) + head_subpages_mapcount(page); > -} > - > -/* > - * Return true if this page is mapped into pagetables. > - * For compound page it returns true if any subpage of compound page is mapped, > - * even if this particular subpage is not itself mapped by any PTE or PMD. > - */ > -static inline bool page_mapped(struct page *page) > -{ > - return total_mapcount(page) > 0; > -} > +int total_compound_mapcount(struct page *head); > > /** > * folio_mapcount() - Calculate the number of mappings of this folio. > @@ -897,8 +881,20 @@ static inline int folio_mapcount(struct folio *folio) > { > if (likely(!folio_test_large(folio))) > return atomic_read(&folio->_mapcount) + 1; > - return atomic_read(folio_mapcount_ptr(folio)) + 1 + > - atomic_read(folio_subpages_mapcount_ptr(folio)); > + return total_compound_mapcount(&folio->page); > +} > + > +static inline int total_mapcount(struct page *page) > +{ > + if (likely(!PageCompound(page))) > + return atomic_read(&page->_mapcount) + 1; > + return total_compound_mapcount(compound_head(page)); > +} > + > +static inline bool folio_large_is_mapped(struct folio *folio) > +{ > + return atomic_read(folio_mapcount_ptr(folio)) + > + atomic_read(folio_subpages_mapcount_ptr(folio)) >= 0; > } > > /** > @@ -909,7 +905,21 @@ static inline int folio_mapcount(struct folio *folio) > */ > static inline bool folio_mapped(struct folio *folio) > { > - return folio_mapcount(folio) > 0; > + if (likely(!folio_test_large(folio))) > + return atomic_read(&folio->_mapcount) >= 0; > + return folio_large_is_mapped(folio); > +} > + > +/* > + * Return true if this page is mapped into pagetables. > + * For compound page it returns true if any sub-page of compound page is mapped, > + * even if this particular sub-page is not itself mapped by any PTE or PMD. > + */ > +static inline bool page_mapped(struct page *page) > +{ > + if (likely(!PageCompound(page))) > + return atomic_read(&page->_mapcount) >= 0; > + return folio_large_is_mapped(page_folio(page)); > } > > static inline struct page *virt_to_head_page(const void *x) > diff --git a/include/linux/rmap.h b/include/linux/rmap.h > index 011a7530dc76..860f558126ac 100644 > --- a/include/linux/rmap.h > +++ b/include/linux/rmap.h > @@ -204,14 +204,14 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, > void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, > unsigned long address); > > -void page_dup_compound_rmap(struct page *page, bool compound); > +void page_dup_compound_rmap(struct page *page); > > static inline void page_dup_file_rmap(struct page *page, bool compound) > { > - if (PageCompound(page)) > - page_dup_compound_rmap(page, compound); > - else > + if (likely(!compound /* page is mapped by PTE */)) I'm not a fan of this kind of comments. Maybe move above the line (here and below)? > atomic_inc(&page->_mapcount); > + else > + page_dup_compound_rmap(page); > } > > /** > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 30056efc79ad..3dee8665c585 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -2215,7 +2215,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, > BUG_ON(!pte_none(*pte)); > set_pte_at(mm, addr, pte, entry); > if (!pmd_migration) > - page_dup_compound_rmap(page + i, false); > + page_add_anon_rmap(page + i, vma, addr, false); > pte_unmap(pte); > } > > diff --git a/mm/rmap.c b/mm/rmap.c > index 4833d28c5e1a..66be8cae640f 100644 > --- a/mm/rmap.c > +++ b/mm/rmap.c > @@ -1117,55 +1117,36 @@ static void unlock_compound_mapcounts(struct page *head, > bit_spin_unlock(PG_locked, &head[1].flags); > } > > -/* > - * When acting on a compound page under lock_compound_mapcounts(), avoid the > - * unnecessary overhead of an actual atomic operation on its subpage mapcount. > - * Return true if this is the first increment or the last decrement > - * (remembering that page->_mapcount -1 represents logical mapcount 0). > - */ > -static bool subpage_mapcount_inc(struct page *page) > -{ > - int orig_mapcount = atomic_read(&page->_mapcount); > - > - atomic_set(&page->_mapcount, orig_mapcount + 1); > - return orig_mapcount < 0; > -} > - > -static bool subpage_mapcount_dec(struct page *page) > -{ > - int orig_mapcount = atomic_read(&page->_mapcount); > - > - atomic_set(&page->_mapcount, orig_mapcount - 1); > - return orig_mapcount == 0; > -} > - > -/* > - * When mapping a THP's first pmd, or unmapping its last pmd, if that THP > - * also has pte mappings, then those must be discounted: in order to maintain > - * NR_ANON_MAPPED and NR_FILE_MAPPED statistics exactly, without any drift, > - * and to decide when an anon THP should be put on the deferred split queue. > - * This function must be called between lock_ and unlock_compound_mapcounts(). > - */ > -static int nr_subpages_unmapped(struct page *head, int nr_subpages) > +int total_compound_mapcount(struct page *head) > { > - int nr = nr_subpages; > + int mapcount = head_compound_mapcount(head); > + int nr_subpages; > int i; > > - /* Discount those subpages mapped by pte */ > + /* In the common case, avoid the loop when no subpages mapped by PTE */ > + if (head_subpages_mapcount(head) == 0) > + return mapcount; > + /* > + * Add all the PTE mappings of those subpages mapped by PTE. > + * Limit the loop, knowing that only subpages_mapcount are mapped? > + * Perhaps: given all the raciness, that may be a good or a bad idea. > + */ > + nr_subpages = thp_nr_pages(head); > for (i = 0; i < nr_subpages; i++) > - if (atomic_read(&head[i]._mapcount) >= 0) > - nr--; > - return nr; > + mapcount += atomic_read(&head[i]._mapcount); > + > + /* But each of those _mapcounts was based on -1 */ > + mapcount += nr_subpages; > + return mapcount; > } > > /* > - * page_dup_compound_rmap(), used when copying mm, or when splitting pmd, > + * page_dup_compound_rmap(), used when copying mm, > * provides a simple example of using lock_ and unlock_compound_mapcounts(). > */ > -void page_dup_compound_rmap(struct page *page, bool compound) > +void page_dup_compound_rmap(struct page *head) > { > struct compound_mapcounts mapcounts; > - struct page *head; > > /* > * Hugetlb pages could use lock_compound_mapcounts(), like THPs do; > @@ -1176,20 +1157,16 @@ void page_dup_compound_rmap(struct page *page, bool compound) > * Note that hugetlb does not call page_add_file_rmap(): > * here is where hugetlb shared page mapcount is raised. > */ > - if (PageHuge(page)) { > - atomic_inc(compound_mapcount_ptr(page)); > - return; > - } > + if (PageHuge(head)) { > + atomic_inc(compound_mapcount_ptr(head)); > Remove the newline? > - head = compound_head(page); > - lock_compound_mapcounts(head, &mapcounts); > - if (compound) { > + } else if (PageTransHuge(head)) { > + /* That test is redundant: it's for safety or to optimize out */ > + > + lock_compound_mapcounts(head, &mapcounts); > mapcounts.compound_mapcount++; > - } else { > - mapcounts.subpages_mapcount++; > - subpage_mapcount_inc(page); > + unlock_compound_mapcounts(head, &mapcounts); > } > - unlock_compound_mapcounts(head, &mapcounts); > } > > /** > @@ -1308,31 +1285,29 @@ void page_add_anon_rmap(struct page *page, > > if (unlikely(PageKsm(page))) > lock_page_memcg(page); > - else > - VM_BUG_ON_PAGE(!PageLocked(page), page); > > - if (likely(!PageCompound(page))) { > + if (likely(!compound /* page is mapped by PTE */)) { > first = atomic_inc_and_test(&page->_mapcount); > nr = first; > + if (first && PageCompound(page)) { > + struct page *head = compound_head(page); > + > + lock_compound_mapcounts(head, &mapcounts); > + mapcounts.subpages_mapcount++; > + nr = !mapcounts.compound_mapcount; > + unlock_compound_mapcounts(head, &mapcounts); > + } > + } else if (PageTransHuge(page)) { > + /* That test is redundant: it's for safety or to optimize out */ > > - } else if (compound && PageTransHuge(page)) { > lock_compound_mapcounts(page, &mapcounts); > first = !mapcounts.compound_mapcount; > mapcounts.compound_mapcount++; > if (first) { > - nr = nr_pmdmapped = thp_nr_pages(page); > - if (mapcounts.subpages_mapcount) > - nr = nr_subpages_unmapped(page, nr_pmdmapped); > + nr_pmdmapped = thp_nr_pages(page); > + nr = nr_pmdmapped - mapcounts.subpages_mapcount; > } > unlock_compound_mapcounts(page, &mapcounts); > - } else { > - struct page *head = compound_head(page); > - > - lock_compound_mapcounts(head, &mapcounts); > - mapcounts.subpages_mapcount++; > - first = subpage_mapcount_inc(page); > - nr = first && !mapcounts.compound_mapcount; > - unlock_compound_mapcounts(head, &mapcounts); > } > > VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); > @@ -1411,28 +1386,28 @@ void page_add_file_rmap(struct page *page, > VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); > lock_page_memcg(page); > > - if (likely(!PageCompound(page))) { > + if (likely(!compound /* page is mapped by PTE */)) { > first = atomic_inc_and_test(&page->_mapcount); > nr = first; > + if (first && PageCompound(page)) { > + struct page *head = compound_head(page); > + > + lock_compound_mapcounts(head, &mapcounts); > + mapcounts.subpages_mapcount++; > + nr = !mapcounts.compound_mapcount; > + unlock_compound_mapcounts(head, &mapcounts); > + } > + } else if (PageTransHuge(page)) { > + /* That test is redundant: it's for safety or to optimize out */ > > - } else if (compound && PageTransHuge(page)) { > lock_compound_mapcounts(page, &mapcounts); > first = !mapcounts.compound_mapcount; > mapcounts.compound_mapcount++; > if (first) { > - nr = nr_pmdmapped = thp_nr_pages(page); > - if (mapcounts.subpages_mapcount) > - nr = nr_subpages_unmapped(page, nr_pmdmapped); > + nr_pmdmapped = thp_nr_pages(page); > + nr = nr_pmdmapped - mapcounts.subpages_mapcount; > } > unlock_compound_mapcounts(page, &mapcounts); > - } else { > - struct page *head = compound_head(page); > - > - lock_compound_mapcounts(head, &mapcounts); > - mapcounts.subpages_mapcount++; > - first = subpage_mapcount_inc(page); > - nr = first && !mapcounts.compound_mapcount; > - unlock_compound_mapcounts(head, &mapcounts); > } > > if (nr_pmdmapped) > @@ -1472,28 +1447,28 @@ void page_remove_rmap(struct page *page, > lock_page_memcg(page); > > /* page still mapped by someone else? */ > - if (likely(!PageCompound(page))) { > + if (likely(!compound /* page is mapped by PTE */)) { > last = atomic_add_negative(-1, &page->_mapcount); > nr = last; > + if (last && PageCompound(page)) { > + struct page *head = compound_head(page); > + > + lock_compound_mapcounts(head, &mapcounts); > + mapcounts.subpages_mapcount--; > + nr = !mapcounts.compound_mapcount; > + unlock_compound_mapcounts(head, &mapcounts); > + } > + } else if (PageTransHuge(page)) { > + /* That test is redundant: it's for safety or to optimize out */ > > - } else if (compound && PageTransHuge(page)) { > lock_compound_mapcounts(page, &mapcounts); > mapcounts.compound_mapcount--; > last = !mapcounts.compound_mapcount; > if (last) { > - nr = nr_pmdmapped = thp_nr_pages(page); > - if (mapcounts.subpages_mapcount) > - nr = nr_subpages_unmapped(page, nr_pmdmapped); > + nr_pmdmapped = thp_nr_pages(page); > + nr = nr_pmdmapped - mapcounts.subpages_mapcount; > } > unlock_compound_mapcounts(page, &mapcounts); > - } else { > - struct page *head = compound_head(page); > - > - lock_compound_mapcounts(head, &mapcounts); > - mapcounts.subpages_mapcount--; > - last = subpage_mapcount_dec(page); > - nr = last && !mapcounts.compound_mapcount; > - unlock_compound_mapcounts(head, &mapcounts); > } > > if (nr_pmdmapped) { > -- > 2.35.3 > -- Kiryl Shutsemau / Kirill A. Shutemov