On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
> We are above to use page_frag_alloc_*() API to not just

"about to use", not "above to use"

> allocate memory for skb->data, but also use them to do
> the memory allocation for skb frag too. Currently the
> implementation of page_frag in mm subsystem is running
> the offset as a countdown rather than count-up value,
> there may have several advantages to that as mentioned
> in [1], but it may have some disadvantages, for example,
> it may disable skb frag coaleasing and more correct cache
> prefetching
>
> We have a trade-off to make in order to have a unified
> implementation and API for page_frag, so use a initial zero
> offset in this patch, and the following patch will try to
> make some optimization to aovid the disadvantages as much
> as possible.
>
> As offsets is added due to alignment requirement before
> actually checking if the cache is enough, which might make
> it exploitable if caller passes a align value bigger than
> 32K mistakenly. As we are allowing order 3 page allocation
> to fail easily under low memory condition, align value bigger
> than PAGE_SIZE is not really allowed, so add a 'align >
> PAGE_SIZE' checking in page_frag_alloc_va_align() to catch
> that.
>
> 1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@xxxxxxxxx/
>
> CC: Alexander Duyck <alexander.duyck@xxxxxxxxx>
> Signed-off-by: Yunsheng Lin <linyunsheng@xxxxxxxxxx>
> ---
>  include/linux/page_frag_cache.h |  2 +-
>  include/linux/skbuff.h          |  4 ++--
>  mm/page_frag_cache.c            | 26 +++++++++++---------------
>  3 files changed, 14 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index 3a44bfc99750..b9411f0db25a 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -32,7 +32,7 @@ static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
>                                             unsigned int fragsz, gfp_t gfp_mask,
>                                             unsigned int align)
>  {
> -        WARN_ON_ONCE(!is_power_of_2(align));
> +        WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
>          return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
>  }
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index eb8ae8292c48..d1fea23ec386 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3320,7 +3320,7 @@ static inline void *netdev_alloc_frag(unsigned int fragsz)
>  static inline void *netdev_alloc_frag_align(unsigned int fragsz,
>                                              unsigned int align)
>  {
> -        WARN_ON_ONCE(!is_power_of_2(align));
> +        WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
>          return __netdev_alloc_frag_align(fragsz, -align);
>  }
>
> @@ -3391,7 +3391,7 @@ static inline void *napi_alloc_frag(unsigned int fragsz)
>  static inline void *napi_alloc_frag_align(unsigned int fragsz,
>                                            unsigned int align)
>  {
> -        WARN_ON_ONCE(!is_power_of_2(align));
> +        WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
>          return __napi_alloc_frag_align(fragsz, -align);
>  }
>
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index 88f567ef0e29..da244851b8a4 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -72,10 +72,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>                  if (!page)
>                          return NULL;
>
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -                /* if size can vary use size else just use PAGE_SIZE */
> -                size = nc->size;
> -#endif
>                  /* Even if we own the page, we do not use atomic_set().
>                   * This would break get_page_unless_zero() users.
>                   */
> @@ -84,11 +80,16 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>                  /* reset page count bias and offset to start of new frag */
>                  nc->pfmemalloc = page_is_pfmemalloc(page);
>                  nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -                nc->offset = size;
> +                nc->offset = 0;
>          }
>
> -        offset = nc->offset - fragsz;
> -        if (unlikely(offset < 0)) {
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +        /* if size can vary use size else just use PAGE_SIZE */
> +        size = nc->size;
> +#endif
> +
> +        offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
> +        if (unlikely(offset + fragsz > size)) {

The fragsz check below could be moved up to here.

>                  page = virt_to_page(nc->va);
>
>                  if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>                          goto refill;
>
> @@ -99,17 +100,13 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>                          goto refill;
>                  }
>
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -                /* if size can vary use size else just use PAGE_SIZE */
> -                size = nc->size;
> -#endif
>                  /* OK, page count is 0, we can safely set it */
>                  set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>
>                  /* reset page count bias and offset to start of new frag */
>                  nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -                offset = size - fragsz;
> -                if (unlikely(offset < 0)) {
> +                offset = 0;
> +                if (unlikely(fragsz > PAGE_SIZE)) {

Since we aren't taking advantage of the flag that is left after the
subtraction, we might just want to look at moving this piece up to just
after the "offset + fragsz > size" check. That should prevent us from
trying to refill if we have a request that is larger than a single
page. In addition we could probably just drop the 3 PAGE_SIZE checks
above as they would be redundant.

>                          /*
>                           * The caller is trying to allocate a fragment
>                           * with fragsz > PAGE_SIZE but the cache isn't big
> @@ -124,8 +121,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>          }
>
>          nc->pagecnt_bias--;
> -        offset &= align_mask;
> -        nc->offset = offset;
> +        nc->offset = offset + fragsz;
>
>          return nc->va + offset;
>  }
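To make that concrete: with the check hoisted, the slow path could look
roughly like the untested sketch below. It assumes the "size" setup, the
"refill" label, and the pfmemalloc handling from the patch, with the
unchanged parts elided:

        /* fast path: carve the next fragment out of the current page */
        offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
        if (unlikely(offset + fragsz > size)) {
                /*
                 * Fail oversized requests up front: a fragment larger
                 * than PAGE_SIZE isn't guaranteed to ever fit (the
                 * order-3 refill is allowed to fail), so don't bother
                 * recycling the current page or refilling the cache.
                 */
                if (unlikely(fragsz > PAGE_SIZE))
                        return NULL;

                page = virt_to_page(nc->va);
                if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
                        goto refill;

                /* ... pfmemalloc handling unchanged ... */

                /* OK, page count is 0, we can safely set it */
                set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);

                /* reset page count bias and offset to start of new frag */
                nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
                offset = 0;
        }

        nc->pagecnt_bias--;
        nc->offset = offset + fragsz;

        return nc->va + offset;

With that ordering, the recycle/refill path is never entered for a
request that cannot succeed anyway, and whether the three
WARN_ON_ONCE() align checks in the headers are still worth keeping
becomes a separate question.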