Re: [PATCH 9/9] net: page_pool: use alloc_pages_bulk in refill code path

Alexander Lobakin <alobakin@xxxxx> · Thu, 25 Mar 2021 13:33:57 +0000

From: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Date: Thu, 25 Mar 2021 11:42:28 +0000

> From: Jesper Dangaard Brouer <brouer@xxxxxxxxxx>
>
> There are cases where the page_pool needs to refill with pages from the
> page allocator. Some workloads cause the page_pool to release pages
> instead of recycling them.
>
> For these workloads, it can improve performance to bulk-allocate pages
> from the page allocator to refill the alloc cache.
>
> One example is an XDP-redirect workload with the 100G mlx5 driver (which
> uses page_pool) redirecting xdp_frame packets into a veth that does
> XDP_PASS to create an SKB from the xdp_frame, which then cannot return
> the page to the page_pool.
>
> Performance results under GitHub xdp-project[1]:
>  [1] https://github.com/xdp-project/xdp-project/blob/master/areas/mem/page_pool06_alloc_pages_bulk.org
>
> Mel: The patch "net: page_pool: convert to use alloc_pages_bulk_array
> variant" was squashed with this patch. From the test page, the array
> variant was superior with one of the test results as follows.
>
> 	Kernel		XDP stats       CPU     pps           Delta
> 	Baseline	XDP-RX CPU      total   3,771,046       n/a
> 	List		XDP-RX CPU      total   3,940,242    +4.49%
> 	Array		XDP-RX CPU      total   4,249,224   +12.68%
>
> Signed-off-by: Jesper Dangaard Brouer <brouer@xxxxxxxxxx>
> Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>

I've tested it a lot over the past two weeks and I'm very satisfied with
the results, especially with the new array-based version.
I haven't had a chance to test this particular set yet, but still.

Reviewed-by: Alexander Lobakin <alobakin@xxxxx>

Great work, thank you all guys!

> ---
>  include/net/page_pool.h |  2 +-
>  net/core/page_pool.c    | 82 ++++++++++++++++++++++++++++-------------
>  2 files changed, 57 insertions(+), 27 deletions(-)
>
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index b5b195305346..6d517a37c18b 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -65,7 +65,7 @@
>  #define PP_ALLOC_CACHE_REFILL	64
>  struct pp_alloc_cache {
>  	u32 count;
> -	void *cache[PP_ALLOC_CACHE_SIZE];
> +	struct page *cache[PP_ALLOC_CACHE_SIZE];
>  };
>
>  struct page_pool_params {
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 40e1b2beaa6c..9ec1aa9640ad 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -203,38 +203,17 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
>  	return true;
>  }
>
> -/* slow path */
> -noinline
> -static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
> -						 gfp_t _gfp)
> +static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
> +						 gfp_t gfp)
>  {
> -	unsigned int pp_flags = pool->p.flags;
>  	struct page *page;
> -	gfp_t gfp = _gfp;
> -
> -	/* We could always set __GFP_COMP, and avoid this branch, as
> -	 * prep_new_page() can handle order-0 with __GFP_COMP.
> -	 */
> -	if (pool->p.order)
> -		gfp |= __GFP_COMP;
> -
> -	/* FUTURE development:
> -	 *
> -	 * Current slow-path essentially falls back to single page
> -	 * allocations, which doesn't improve performance.  This code
> -	 * need bulk allocation support from the page allocator code.
> -	 */
>
> -	/* Cache was empty, do real allocation */
> -#ifdef CONFIG_NUMA
> +	gfp |= __GFP_COMP;
>  	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
> -#else
> -	page = alloc_pages(gfp, pool->p.order);
> -#endif
> -	if (!page)
> +	if (unlikely(!page))
>  		return NULL;
>
> -	if ((pp_flags & PP_FLAG_DMA_MAP) &&
> +	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
>  	    unlikely(!page_pool_dma_map(pool, page))) {
>  		put_page(page);
>  		return NULL;
> @@ -243,6 +222,57 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
>  	/* Track how many pages are held 'in-flight' */
>  	pool->pages_state_hold_cnt++;
>  	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
> +	return page;
> +}
> +
> +/* slow path */
> +noinline
> +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
> +						 gfp_t gfp)
> +{
> +	const int bulk = PP_ALLOC_CACHE_REFILL;
> +	unsigned int pp_flags = pool->p.flags;
> +	unsigned int pp_order = pool->p.order;
> +	struct page *page;
> +	int i, nr_pages;
> +
> +	/* Don't support bulk alloc for high-order pages */
> +	if (unlikely(pp_order))
> +		return __page_pool_alloc_page_order(pool, gfp);
> +
> +	/* Unnecessary as alloc cache is empty, but guarantees zero count */
> +	if (unlikely(pool->alloc.count > 0))
> +		return pool->alloc.cache[--pool->alloc.count];
> +
> +	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
> +	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
> +
> +	nr_pages = alloc_pages_bulk_array(gfp, bulk, pool->alloc.cache);
> +	if (unlikely(!nr_pages))
> +		return NULL;
> +
> +	/* Pages have been filled into alloc.cache array, but count is zero and
> +	 * page element have not been (possibly) DMA mapped.
> +	 */
> +	for (i = 0; i < nr_pages; i++) {
> +		page = pool->alloc.cache[i];
> +		if ((pp_flags & PP_FLAG_DMA_MAP) &&
> +		    unlikely(!page_pool_dma_map(pool, page))) {
> +			put_page(page);
> +			continue;
> +		}
> +		pool->alloc.cache[pool->alloc.count++] = page;
> +		/* Track how many pages are held 'in-flight' */
> +		pool->pages_state_hold_cnt++;
> +		trace_page_pool_state_hold(pool, page,
> +					   pool->pages_state_hold_cnt);
> +	}
> +
> +	/* Return last page */
> +	if (likely(pool->alloc.count > 0))
> +		page = pool->alloc.cache[--pool->alloc.count];
> +	else
> +		page = NULL;
>
>  	/* When page just alloc'ed is should/must have refcnt 1. */
>  	return page;
> --
> 2.26.2

Al