On Sun, Mar 9, 2025 at 5:50 AM Toke Høiland-Jørgensen <toke@xxxxxxxxxx> wrote:
>
> When enabling DMA mapping in page_pool, pages are kept DMA mapped until
> they are released from the pool, to avoid the overhead of re-mapping the
> pages every time they are used. This causes problems when a device is
> torn down, because the page pool can't unmap the pages until they are
> returned to the pool. This causes resource leaks and/or crashes when
> there are pages still outstanding while the device is torn down, because
> page_pool will attempt an unmap of a non-existent DMA device on the
> subsequent page return.
>
> To fix this, implement a simple tracking of outstanding dma-mapped pages
> in page pool using an xarray. This was first suggested by Mina[0], and
> turns out to be fairly straightforward: We simply store pointers to
> pages directly in the xarray with xa_alloc() when they are first DMA
> mapped, and remove them from the array on unmap. Then, when a page pool
> is torn down, it can simply walk the xarray and unmap all pages still
> present there before returning, which also allows us to get rid of the
> get/put_device() calls in page_pool. Using xa_cmpxchg(), no additional
> synchronisation is needed, as a page will only ever be unmapped once.
>
> To avoid having to walk the entire xarray on unmap to find the page
> reference, we stash the ID assigned by xa_alloc() into the page
> structure itself, using the upper bits of the pp_magic field. This
> requires a couple of defines to avoid conflicting with the
> POISON_POINTER_DELTA define, but this is all evaluated at compile-time,
> so should not affect run-time performance.
>
> Since all the tracking is performed on DMA map/unmap, no additional code
> is needed in the fast path, meaning the performance overhead of this
> tracking is negligible. The extra memory needed to track the pages is
> neatly encapsulated inside xarray, which uses the 'struct xa_node'
> structure to track items. This structure is 576 bytes long, with slots
> for 64 items, meaning that a full node incurs only 9 bytes of overhead
> per slot it tracks (in practice, it probably won't be this efficient,
> but in any case it should be an acceptable overhead).
>
> [0] https://lore.kernel.org/all/CAHS8izPg7B5DwKfSuzz-iOop_YRbk3Sd6Y4rX7KBG9DcVJcyWg@xxxxxxxxxxxxxx/
>
> Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code")
> Reported-by: Yonglong Liu <liuyonglong@xxxxxxxxxx>
> Suggested-by: Mina Almasry <almasrymina@xxxxxxxxxx>
> Reviewed-by: Jesper Dangaard Brouer <hawk@xxxxxxxxxx>
> Tested-by: Jesper Dangaard Brouer <hawk@xxxxxxxxxx>
> Signed-off-by: Toke Høiland-Jørgensen <toke@xxxxxxxxxx>

I only have nits and suggestions for improvement. With or without those:

Reviewed-by: Mina Almasry <almasrymina@xxxxxxxxxx>

> ---
> v2:
> - Stash the id in the pp_magic field of struct page instead of
>   overwriting the mapping field. This version is compile-tested only.
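
To make sure I'm reading the lifecycle correctly, this is roughly how I
understand the scheme described in the commit message (my own paraphrase,
not the actual patch hunks; the helper names below are made up for
illustration):

static void example_release_dma(struct page_pool *pool, netmem_ref netmem)
{
	struct page *page = netmem_to_page(netmem);
	unsigned long id = netmem_get_dma_index(netmem);

	/* Whoever wins the cmpxchg owns the unmap, so a page is never
	 * unmapped twice and no extra locking is needed.
	 */
	if (xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0) != page)
		return;

	dma_unmap_page_attrs(pool->p.dev,
			     page_pool_get_dma_addr_netmem(netmem),
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
}

static void example_destroy_unmap_all(struct page_pool *pool)
{
	struct page *page;
	unsigned long id;

	/* On teardown, unmap anything still outstanding so the device can
	 * go away without leaking DMA mappings.
	 */
	xa_for_each(&pool->dma_mapped, id, page)
		example_release_dma(pool, page_to_netmem(page));
}

If that matches your intent, no changes needed; I'm just writing it down
to check my understanding.
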
>
>  include/net/page_pool/types.h | 31 +++++++++++++++++++++++
>  mm/page_alloc.c               |  3 ++-
>  net/core/netmem_priv.h        | 35 +++++++++++++++++++++++++-
>  net/core/page_pool.c          | 46 +++++++++++++++++++++++++++++------
>  4 files changed, 105 insertions(+), 10 deletions(-)
>
> diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
> index 36eb57d73abc..d879a505ca4d 100644
> --- a/include/net/page_pool/types.h
> +++ b/include/net/page_pool/types.h
> @@ -6,6 +6,7 @@
>  #include <linux/dma-direction.h>
>  #include <linux/ptr_ring.h>
>  #include <linux/types.h>
> +#include <linux/xarray.h>
>  #include <net/netmem.h>
>
>  #define PP_FLAG_DMA_MAP		BIT(0) /* Should page_pool do the DMA
> @@ -54,6 +55,34 @@ struct pp_alloc_cache {
>  	netmem_ref cache[PP_ALLOC_CACHE_SIZE];
>  };
>
> +/*
> + * DMA mapping IDs
> + *
> + * When DMA-mapping a page, we allocate an ID (from an xarray) and stash this in
> + * the upper bits of page->pp_magic. The number of bits available here is
> + * constrained by the size of an unsigned long, and the definition of
> + * PP_SIGNATURE.
> + */
> +#define PP_DMA_INDEX_SHIFT	(1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA))
> +#define _PP_DMA_INDEX_BITS	MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 1)
> +
> +#if POISON_POINTER_DELTA > 0
> +/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA
> + * index to not overlap with that if set
> + */
> +#define PP_DMA_INDEX_BITS	MIN(_PP_DMA_INDEX_BITS, \
> +				    __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT)
> +#else
> +#define PP_DMA_INDEX_BITS	_PP_DMA_INDEX_BITS
> +#endif
> +
> +#define PP_DMA_INDEX_MASK	GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \
> +				        PP_DMA_INDEX_SHIFT)
> +#define PP_DMA_INDEX_LIMIT	XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1)
> +
> +/* For check in page_alloc.c */
> +#define PP_MAGIC_MASK		(~(PP_DMA_INDEX_MASK | 0x3))
> +
>  /**
>   * struct page_pool_params - page pool parameters
>   * @fast:	params accessed frequently on hotpath
> @@ -221,6 +250,8 @@ struct page_pool {
>  	void *mp_priv;
>  	const struct memory_provider_ops *mp_ops;
>
> +	struct xarray dma_mapped;
> +
>  #ifdef CONFIG_PAGE_POOL_STATS
>  	/* recycle stats are per-cpu to avoid locking */
>  	struct page_pool_recycle_stats __percpu *recycle_stats;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 579789600a3c..96776e7b2301 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -55,6 +55,7 @@
>  #include <linux/delayacct.h>
>  #include <linux/cacheinfo.h>
>  #include <linux/pgalloc_tag.h>
> +#include <net/page_pool/types.h>
>  #include <asm/div64.h>
>  #include "internal.h"
>  #include "shuffle.h"
> @@ -873,7 +874,7 @@ static inline bool page_expected_state(struct page *page,
>  			page->memcg_data |
>  #endif
>  #ifdef CONFIG_PAGE_POOL
> -			((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
> +			((page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE) |
>  #endif
>  			(page->flags & check_flags)))
>  		return false;
> diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
> index 7eadb8393e00..59face70f40d 100644
> --- a/net/core/netmem_priv.h
> +++ b/net/core/netmem_priv.h
> @@ -3,10 +3,19 @@
>  #ifndef __NETMEM_PRIV_H
>  #define __NETMEM_PRIV_H
>
> -static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
> +static inline unsigned long _netmem_get_pp_magic(netmem_ref netmem)

Nit: maybe __netmem_get...() To be consistent (I used double underscore
elsewhere).

>  {
>  	return __netmem_clear_lsb(netmem)->pp_magic;
>  }

nit: newline between function definitions.
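
One more thought on the PP_DMA_INDEX_* defines further up: this is how I
read the pack/unpack logic, as a toy user-space sketch (the shift, width
and signature values below are made up for illustration; the real ones are
derived from PP_SIGNATURE and POISON_POINTER_DELTA at compile time):

#include <stdio.h>

#define EX_SIGNATURE	0x40UL	/* stand-in for PP_SIGNATURE */
#define EX_INDEX_SHIFT	8	/* stand-in for PP_DMA_INDEX_SHIFT */
#define EX_INDEX_BITS	24	/* stand-in for PP_DMA_INDEX_BITS */
#define EX_INDEX_MASK	(((1UL << EX_INDEX_BITS) - 1) << EX_INDEX_SHIFT)

int main(void)
{
	unsigned long pp_magic = EX_SIGNATURE;
	unsigned long id = 42;

	/* set side: OR the xa_alloc() id into the upper bits */
	pp_magic |= id << EX_INDEX_SHIFT;

	/* get side: mask and shift the id back out */
	printf("id  = %lu\n", (pp_magic & EX_INDEX_MASK) >> EX_INDEX_SHIFT);

	/* page_alloc.c check: ignore the id bits when testing the signature */
	printf("sig = %d\n",
	       (pp_magic & ~(EX_INDEX_MASK | 0x3UL)) == EX_SIGNATURE);
	return 0;
}

If that is the intended layout, the masking in the accessor below and the
PP_MAGIC_MASK check in page_alloc.c both make sense to me.
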
> +static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
> +{
> +	return _netmem_get_pp_magic(netmem) & ~PP_DMA_INDEX_MASK;
> +}
> +
> +static inline void netmem_set_pp_magic(netmem_ref netmem, unsigned long pp_magic)
> +{
> +	__netmem_clear_lsb(netmem)->pp_magic = pp_magic;
> +}
>
>  static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
>  {
> @@ -28,4 +37,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem,
>  {
>  	__netmem_clear_lsb(netmem)->dma_addr = dma_addr;
>  }
> +
> +static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
> +{
> +	unsigned long magic;
> +
> +	if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
> +		return 0;
> +
> +	magic = _netmem_get_pp_magic(netmem);
> +
> +	return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT;
> +}
> +
> +static inline void netmem_set_dma_index(netmem_ref netmem,
> +					unsigned long id)
> +{
> +	unsigned long magic;
> +
> +	if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
> +		return;
> +
> +	magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT);
> +	netmem_set_pp_magic(netmem, magic);
> +}
>  #endif
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index acef1fcd8ddc..dceef9b82198 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -226,6 +226,8 @@ static int page_pool_init(struct page_pool *pool,
>  			return -EINVAL;
>
>  		pool->dma_map = true;
> +
> +		xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1);
>  	}
>
>  	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
> @@ -275,9 +277,6 @@ static int page_pool_init(struct page_pool *pool,
>  	/* Driver calling page_pool_create() also call page_pool_destroy() */
>  	refcount_set(&pool->user_cnt, 1);
>
> -	if (pool->dma_map)
> -		get_device(pool->p.dev);
> -
>  	if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
>  		/* We rely on rtnl_lock()ing to make sure netdev_rx_queue
>  		 * configuration doesn't change while we're initializing
> @@ -325,7 +324,7 @@ static void page_pool_uninit(struct page_pool *pool)
>  	ptr_ring_cleanup(&pool->ring, NULL);
>
>  	if (pool->dma_map)
> -		put_device(pool->p.dev);
> +		xa_destroy(&pool->dma_mapped);
>
>  #ifdef CONFIG_PAGE_POOL_STATS
>  	if (!pool->system)
> @@ -470,9 +469,11 @@ page_pool_dma_sync_for_device(const struct page_pool *pool,
>  		__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
>  }
>
> -static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
> +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp)
>  {
>  	dma_addr_t dma;
> +	int err;
> +	u32 id;
>
>  	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
>  	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
> @@ -486,9 +487,19 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
>  	if (dma_mapping_error(pool->p.dev, dma))
>  		return false;
>
> +	if (in_softirq())
> +		err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem),
> +			       PP_DMA_INDEX_LIMIT, gfp);
> +	else
> +		err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem),
> +				  PP_DMA_INDEX_LIMIT, gfp);

I think there is a bit of discussion on this thread on whether
PP_DMA_INDEX_LIMIT is going to be large enough. xa_alloc() returns -EBUSY
if there are no available entries in the xarray. How about we do a
rate-limited pr_err or DEBUG_NET_WARN_ON_ONCE when that happens?
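
Something like the below is what I had in mind, FWIW (completely untested
sketch; the unmap-and-bail error path is my assumption about how the
failure would need to be handled so we don't leave a mapped page behind
without a tracking entry):

	if (err) {
		/* ID space exhausted (-EBUSY) or allocation failure: make
		 * it visible, drop the mapping and fail the allocation.
		 */
		DEBUG_NET_WARN_ON_ONCE(err == -EBUSY);
		dma_unmap_page_attrs(pool->p.dev, dma,
				     PAGE_SIZE << pool->p.order,
				     pool->p.dma_dir,
				     DMA_ATTR_SKIP_CPU_SYNC);
		return false;
	}

A rate-limited pr_err would work just as well if you prefer something that
also fires on non-debug builds.
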