Please ignore this one, the title has been changed to: "page_pool: add frag page recycling support in page pool".

On 2021/7/12 17:19, Yunsheng Lin wrote:
> Currently page pool only supports page recycling when there is only
> one user of the page, and the split-page reuse implemented in most
> drivers cannot use the page pool, as the ping-pong way of reusing
> requires elevated refcnt support.
>
> Those reuse/recycling schemes have the limitations below:
> 1. A page from the page pool can only be used by one user in order
>    for page recycling to happen.
> 2. The ping-pong way of reusing in most drivers does not allow
>    multiple descriptors to use different parts of the same page in
>    order to save memory.
>
> So add elevated refcnt support in page pool in order to overcome
> the above limitations.
>
> This is a preparation for supporting page frag allocation in the
> page pool.
>
> Signed-off-by: Yunsheng Lin <linyunsheng@xxxxxxxxxx>
> ---
>  include/net/page_pool.h |  22 ++++++++-
>  net/core/page_pool.c    | 121 ++++++++++++++++++++++++++++++++++++++++++------
>  2 files changed, 129 insertions(+), 14 deletions(-)
>
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 84cd972..d9a736f 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -45,7 +45,10 @@
>   * Please note DMA-sync-for-CPU is still
>   * device driver responsibility
>   */
> -#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)
> +#define PP_FLAG_PAGE_FRAG	BIT(2) /* for page frag feature */
> +#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP |\
> +				 PP_FLAG_DMA_SYNC_DEV |\
> +				 PP_FLAG_PAGE_FRAG)
>
>  /*
>   * Fast allocation side cache array/stack
> @@ -88,6 +91,9 @@ struct page_pool {
>  	unsigned long defer_warn;
>
>  	u32 pages_state_hold_cnt;
> +	unsigned int frag_offset;
> +	int frag_bias;
> +	struct page *frag_page;
>
>  	/*
>  	 * Data structure for allocation side
> @@ -137,6 +143,20 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
>  	return page_pool_alloc_pages(pool, gfp);
>  }
>
> +struct page *page_pool_alloc_frag(struct page_pool *pool,
> +				  unsigned int *offset,
> +				  unsigned int size,
> +				  gfp_t gfp);
> +
> +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
> +						    unsigned int *offset,
> +						    unsigned int size)
> +{
> +	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
> +
> +	return page_pool_alloc_frag(pool, offset, size, gfp);
> +}
> +
>  /* get the stored dma direction. A driver might decide to treat this locally and
>   * avoid the extra cache line from page_pool to determine the direction
>   */
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 1abefc6..9f518dc 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -24,6 +24,8 @@
>  #define DEFER_TIME (msecs_to_jiffies(1000))
>  #define DEFER_WARN_INTERVAL (60 * HZ)
>
> +#define BIAS_MAX	(PAGE_SIZE - 1)
> +
>  static int page_pool_init(struct page_pool *pool,
>  			  const struct page_pool_params *params)
>  {
> @@ -304,6 +306,33 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
>  	return page;
>  }
>
> +/* nr could be negative */
> +static int page_pool_atomic_add_bias(struct page *page, int nr)
> +{
> +	unsigned long *bias_ptr = page_pool_pagecnt_bias_ptr(page);
> +	unsigned long old_bias = READ_ONCE(*bias_ptr);
> +	unsigned long new_bias;
> +
> +	do {
> +		int bias = (int)(old_bias & ~PAGE_MASK);
> +
> +		/* Warn when page_pool_dev_alloc_pages() is called
> +		 * with the PP_FLAG_PAGE_FRAG flag in the driver.
> +		 */
> +		WARN_ON(!bias);
> +
> +		/* already the last user */
> +		if (!(bias + nr))
> +			return 0;
> +
> +		new_bias = old_bias + nr;
> +	} while (!try_cmpxchg(bias_ptr, &old_bias, new_bias));
> +
> +	WARN_ON((new_bias & PAGE_MASK) != (old_bias & PAGE_MASK));
> +
> +	return new_bias & ~PAGE_MASK;
> +}
> +
>  /* For using page_pool replace: alloc_pages() API calls, but provide
>   * synchronization guarantee for allocation side.
>   */
> @@ -425,6 +454,11 @@ static __always_inline struct page *
>  __page_pool_put_page(struct page_pool *pool, struct page *page,
>  		     unsigned int dma_sync_size, bool allow_direct)
>  {
> +	/* It is not the last user for the page frag case */
> +	if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
> +	    page_pool_atomic_add_bias(page, -1))
> +		return NULL;
> +
>  	/* This allocator is optimized for the XDP mode that uses
>  	 * one-frame-per-page, but have fallbacks that act like the
>  	 * regular page allocator APIs.
> @@ -448,19 +482,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>  		/* Page found as candidate for recycling */
>  		return page;
>  	}
> -	/* Fallback/non-XDP mode: API user have elevated refcnt.
> -	 *
> -	 * Many drivers split up the page into fragments, and some
> -	 * want to keep doing this to save memory and do refcnt based
> -	 * recycling. Support this use case too, to ease drivers
> -	 * switching between XDP/non-XDP.
> -	 *
> -	 * In-case page_pool maintains the DMA mapping, API user must
> -	 * call page_pool_put_page once. In this elevated refcnt
> -	 * case, the DMA is unmapped/released, as driver is likely
> -	 * doing refcnt based recycle tricks, meaning another process
> -	 * will be invoking put_page.
> -	 */
> +
>  	/* Do not replace this with page_pool_return_page() */
>  	page_pool_release_page(pool, page);
>  	put_page(page);
> @@ -517,6 +539,77 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
>  }
>  EXPORT_SYMBOL(page_pool_put_page_bulk);
>
> +/* BIAS_RESERVE is used to avoid the frag page being recycled back
> + * to the page pool while the frag page is still in pool->frag_page
> + * waiting for more users. As the minimum DMA alignment seems to be
> + * 32, we support a max size of 2047 * 32 for a 4K page size.
> + */
> +#define BIAS_RESERVE		((int)(BIAS_MAX / 2 + 1))
> +#define BIAS_NEGATIVE_RESERVE	(0 - BIAS_RESERVE)
> +
> +static struct page *page_pool_drain_frag(struct page_pool *pool,
> +					 struct page *page)
> +{
> +	/* page pool is not the last user */
> +	if (page_pool_atomic_add_bias(page, pool->frag_bias +
> +				      BIAS_NEGATIVE_RESERVE))
> +		return NULL;
> +	else
> +		return page;
> +}
> +
> +static void page_pool_free_frag(struct page_pool *pool)
> +{
> +	struct page *page = pool->frag_page;
> +
> +	if (!page ||
> +	    page_pool_atomic_add_bias(page, pool->frag_bias +
> +				      BIAS_NEGATIVE_RESERVE))
> +		return;
> +
> +	page_pool_return_page(pool, page);
> +	pool->frag_page = NULL;
> +}
> +
> +struct page *page_pool_alloc_frag(struct page_pool *pool,
> +				  unsigned int *offset,
> +				  unsigned int size,
> +				  gfp_t gfp)
> +{
> +	unsigned int max_size = PAGE_SIZE << pool->p.order;
> +	unsigned int frag_offset = pool->frag_offset;
> +	struct page *frag_page = pool->frag_page;
> +
> +	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
> +		    size > max_size))
> +		return NULL;
> +
> +	size = ALIGN(size, dma_get_cache_alignment());
> +
> +	if (frag_page && frag_offset + size > max_size)
> +		frag_page = page_pool_drain_frag(pool, frag_page);
> +
> +	if (!frag_page) {
> +		frag_page = page_pool_alloc_pages(pool, gfp);
> +		if (unlikely(!frag_page)) {
> +			pool->frag_page = NULL;
> +			return NULL;
> +		}
> +
> +		pool->frag_page = frag_page;
> +		pool->frag_bias = 0;
> +		frag_offset = 0;
> +		page_pool_set_pagecnt_bias(frag_page, BIAS_RESERVE);
> +	}
> +
> +	pool->frag_bias++;
> +	*offset = frag_offset;
> +	pool->frag_offset = frag_offset + size;
> +
> +	return frag_page;
> +}
> +EXPORT_SYMBOL(page_pool_alloc_frag);
> +
>  static void page_pool_empty_ring(struct page_pool *pool)
>  {
>  	struct page *page;
> @@ -622,6 +715,8 @@ void page_pool_destroy(struct page_pool *pool)
>  	if (!page_pool_put(pool))
>  		return;
>
> +	page_pool_free_frag(pool);
> +
>  	if (!page_pool_release(pool))
>  		return;
>
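To spell out the BIAS_RESERVE accounting for anyone reviewing the renamed series,
on a 4K page: BIAS_MAX is 4095 and BIAS_RESERVE is 2048. A fresh frag page starts
with its pagecnt bias set to 2048 and pool->frag_bias at 0; every
page_pool_alloc_frag() bumps frag_bias by one, and every user's
page_pool_put_page() drops the stored bias by one. When the page is drained
(because the next frag does not fit, or from page_pool_free_frag() on
page_pool_destroy()), the pool adds frag_bias - 2048, so what is left is exactly
the number of frags still held by users: e.g. 5 frags handed out with 3 already
returned gives (2048 - 3) + 5 - 2048 = 2. Only when that result reaches 0 is the
pool the last user and the page returned/recycled. The reserve also has to stay
ahead of the number of frags a page can be split into, and with the 32-byte
minimum alignment that is where the 2047 * 32 size limit in the comment comes
from.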
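To give a feel for how a driver is expected to consume the new API, here is a
rough usage sketch; the rx_ring/rx_desc_cb types, the 2K buffer size and the
pool parameters below are made up purely for illustration and are not taken
from this patch or from any real driver:

#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/numa.h>
#include <net/page_pool.h>

/* Made-up driver-side types, only to keep the sketch self-contained */
struct rx_ring {
	struct device *dev;
	struct page_pool *page_pool;
};

struct rx_desc_cb {
	struct page *page;
	unsigned int offset;
};

static int rx_ring_setup_page_pool(struct rx_ring *ring)
{
	struct page_pool_params pp_params = {
		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG,
		.order		= 0,
		.pool_size	= 1024,		/* illustrative only */
		.nid		= NUMA_NO_NODE,
		.dev		= ring->dev,
		.dma_dir	= DMA_FROM_DEVICE,
	};

	ring->page_pool = page_pool_create(&pp_params);
	return PTR_ERR_OR_ZERO(ring->page_pool);
}

static int rx_ring_refill_one(struct rx_ring *ring, struct rx_desc_cb *cb)
{
	unsigned int offset;
	struct page *page;

	/* Carve a 2K rx buffer out of a (possibly shared) pool page;
	 * the pool tracks how many frag users the page has.
	 */
	page = page_pool_dev_alloc_frag(ring->page_pool, &offset, 2048);
	if (!page)
		return -ENOMEM;

	cb->page = page;
	cb->offset = offset;
	return 0;
}

Each frag user then returns its buffer through the usual
page_pool_put_full_page()/page_pool_put_page() path, and the page is only
unmapped and recycled once the last user has put it back.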