On Fri, May 26, 2023 at 10:41:40PM +0100, David Howells wrote: > Make pin_user_pages*() leave a ZERO_PAGE unpinned if it extracts a pointer > to it from the page tables and make unpin_user_page*() correspondingly > ignore a ZERO_PAGE when unpinning. We don't want to risk overrunning a > zero page's refcount as we're only allowed ~2 million pins on it - > something that userspace can conceivably trigger. > > Add a pair of functions to test whether a page or a folio is a ZERO_PAGE. > > Signed-off-by: David Howells <dhowells@xxxxxxxxxx> > cc: Christoph Hellwig <hch@xxxxxxxxxxxxx> > cc: David Hildenbrand <david@xxxxxxxxxx> > cc: Lorenzo Stoakes <lstoakes@xxxxxxxxx> > cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> > cc: Jens Axboe <axboe@xxxxxxxxx> > cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx> > cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> > cc: Jan Kara <jack@xxxxxxx> > cc: Jeff Layton <jlayton@xxxxxxxxxx> > cc: Jason Gunthorpe <jgg@xxxxxxxxxx> > cc: Logan Gunthorpe <logang@xxxxxxxxxxxx> > cc: Hillf Danton <hdanton@xxxxxxxx> > cc: Christian Brauner <brauner@xxxxxxxxxx> > cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> > cc: linux-fsdevel@xxxxxxxxxxxxxxx > cc: linux-block@xxxxxxxxxxxxxxx > cc: linux-kernel@xxxxxxxxxxxxxxx > cc: linux-mm@xxxxxxxxx > --- > > Notes: > ver #3) > - Move is_zero_page() and is_zero_folio() to mm.h for dependency reasons. > - Add more comments and adjust the docs. > > ver #2) > - Fix use of ZERO_PAGE(). > - Add is_zero_page() and is_zero_folio() wrappers. > - Return the zero page obtained, not ZERO_PAGE(0) unconditionally. > > Documentation/core-api/pin_user_pages.rst | 6 +++++ > include/linux/mm.h | 26 +++++++++++++++++-- > mm/gup.c | 31 ++++++++++++++++++++++- > 3 files changed, 60 insertions(+), 3 deletions(-) > > diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst > index 9fb0b1080d3b..d3c1f6d8c0e0 100644 > --- a/Documentation/core-api/pin_user_pages.rst > +++ b/Documentation/core-api/pin_user_pages.rst > @@ -112,6 +112,12 @@ pages: > This also leads to limitations: there are only 31-10==21 bits available for a > counter that increments 10 bits at a time. > > +* Because of that limitation, special handling is applied to the zero pages > + when using FOLL_PIN. We only pretend to pin a zero page - we don't alter its > + refcount or pincount at all (it is permanent, so there's no need). The > + unpinning functions also don't do anything to a zero page. This is > + transparent to the caller. > + Great! Cheers. :) > * Callers must specifically request "dma-pinned tracking of pages". In other > words, just calling get_user_pages() will not suffice; a new set of functions, > pin_user_page() and related, must be used. > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 27ce77080c79..3c2f6b452586 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1910,6 +1910,28 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, > return page_maybe_dma_pinned(page); > } > > +/** > + * is_zero_page - Query if a page is a zero page > + * @page: The page to query > + * > + * This returns true if @page is one of the permanent zero pages. > + */ > +static inline bool is_zero_page(const struct page *page) > +{ > + return is_zero_pfn(page_to_pfn(page)); > +} > + > +/** > + * is_zero_folio - Query if a folio is a zero page > + * @folio: The folio to query > + * > + * This returns true if @folio is one of the permanent zero pages. > + */ > +static inline bool is_zero_folio(const struct folio *folio) > +{ > + return is_zero_page(&folio->page); > +} > + > /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ > #ifdef CONFIG_MIGRATION > static inline bool is_longterm_pinnable_page(struct page *page) > @@ -1920,8 +1942,8 @@ static inline bool is_longterm_pinnable_page(struct page *page) > if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) > return false; > #endif > - /* The zero page may always be pinned */ > - if (is_zero_pfn(page_to_pfn(page))) > + /* The zero page can be "pinned" but gets special handling. */ > + if (is_zero_page(page)) > return true; > > /* Coherent device memory must always allow eviction. */ > diff --git a/mm/gup.c b/mm/gup.c > index bbe416236593..ad28261dcafd 100644 > --- a/mm/gup.c > +++ b/mm/gup.c > @@ -51,7 +51,8 @@ static inline void sanity_check_pinned_pages(struct page **pages, > struct page *page = *pages; > struct folio *folio = page_folio(page); > > - if (!folio_test_anon(folio)) > + if (is_zero_page(page) || > + !folio_test_anon(folio)) > continue; > if (!folio_test_large(folio) || folio_test_hugetlb(folio)) > VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page); > @@ -131,6 +132,13 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) > else if (flags & FOLL_PIN) { > struct folio *folio; > > + /* > + * Don't take a pin on the zero page - it's not going anywhere > + * and it is used in a *lot* of places. > + */ > + if (is_zero_page(page)) > + return page_folio(page); > + > /* > * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a > * right zone, so fail and let the caller fall back to the slow > @@ -180,6 +188,8 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) > static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) > { > if (flags & FOLL_PIN) { > + if (is_zero_folio(folio)) > + return; > node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); > if (folio_test_large(folio)) > atomic_sub(refs, &folio->_pincount); > @@ -224,6 +234,13 @@ int __must_check try_grab_page(struct page *page, unsigned int flags) > if (flags & FOLL_GET) > folio_ref_inc(folio); > else if (flags & FOLL_PIN) { > + /* > + * Don't take a pin on the zero page - it's not going anywhere > + * and it is used in a *lot* of places. > + */ > + if (is_zero_page(page)) > + return 0; > + > /* > * Similar to try_grab_folio(): be sure to *also* > * increment the normal page refcount field at least once, > @@ -3079,6 +3096,9 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); > * > * FOLL_PIN means that the pages must be released via unpin_user_page(). Please > * see Documentation/core-api/pin_user_pages.rst for further details. > + * > + * Note that if a zero_page is amongst the returned pages, it will not have > + * pins in it and unpin_user_page() will not remove pins from it. > */ > int pin_user_pages_fast(unsigned long start, int nr_pages, > unsigned int gup_flags, struct page **pages) > @@ -3110,6 +3130,9 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); > * > * FOLL_PIN means that the pages must be released via unpin_user_page(). Please > * see Documentation/core-api/pin_user_pages.rst for details. > + * > + * Note that if a zero_page is amongst the returned pages, it will not have > + * pins in it and unpin_user_page*() will not remove pins from it. > */ > long pin_user_pages_remote(struct mm_struct *mm, > unsigned long start, unsigned long nr_pages, > @@ -3143,6 +3166,9 @@ EXPORT_SYMBOL(pin_user_pages_remote); > * > * FOLL_PIN means that the pages must be released via unpin_user_page(). Please > * see Documentation/core-api/pin_user_pages.rst for details. > + * > + * Note that if a zero_page is amongst the returned pages, it will not have > + * pins in it and unpin_user_page*() will not remove pins from it. > */ > long pin_user_pages(unsigned long start, unsigned long nr_pages, > unsigned int gup_flags, struct page **pages, > @@ -3161,6 +3187,9 @@ EXPORT_SYMBOL(pin_user_pages); > * pin_user_pages_unlocked() is the FOLL_PIN variant of > * get_user_pages_unlocked(). Behavior is the same, except that this one sets > * FOLL_PIN and rejects FOLL_GET. > + * > + * Note that if a zero_page is amongst the returned pages, it will not have > + * pins in it and unpin_user_page*() will not remove pins from it. > */ > long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, > struct page **pages, unsigned int gup_flags) > All looks good, thanks for adding comments/updating doc! Reviewed-by: Lorenzo Stoakes <lstoakes@xxxxxxxxx>