On Sun, Mar 9, 2025 at 5:50 AM Toke Høiland-Jørgensen <toke@xxxxxxxxxx> wrote:
>
> When enabling DMA mapping in page_pool, pages are kept DMA mapped until
> they are released from the pool, to avoid the overhead of re-mapping the
> pages every time they are used. This causes problems when a device is
> torn down, because the page pool can't unmap the pages until they are
> returned to the pool. This causes resource leaks and/or crashes when
> there are pages still outstanding while the device is torn down, because
> page_pool will attempt an unmap of a non-existent DMA device on the
> subsequent page return.
>
> To fix this, implement a simple tracking of outstanding dma-mapped pages
> in page pool using an xarray. This was first suggested by Mina[0], and
> turns out to be fairly straightforward: We simply store pointers to
> pages directly in the xarray with xa_alloc() when they are first DMA
> mapped, and remove them from the array on unmap. Then, when a page pool
> is torn down, it can simply walk the xarray and unmap all pages still
> present there before returning, which also allows us to get rid of the
> get/put_device() calls in page_pool. Using xa_cmpxchg(), no additional
> synchronisation is needed, as a page will only ever be unmapped once.
>
> To avoid having to walk the entire xarray on unmap to find the page
> reference, we stash the ID assigned by xa_alloc() into the page
> structure itself, using the upper bits of the pp_magic field. This
> requires a couple of defines to avoid conflicting with the
> POISON_POINTER_DELTA define, but this is all evaluated at compile-time,
> so should not affect run-time performance.
>
> Since all the tracking is performed on DMA map/unmap, no additional code
> is needed in the fast path, meaning the performance overhead of this
> tracking is negligible. The extra memory needed to track the pages is
> neatly encapsulated inside xarray, which uses the 'struct xa_node'
> structure to track items. This structure is 576 bytes long, with slots
> for 64 items, meaning that a full node incurs only 9 bytes of overhead
> per slot it tracks (in practice, it probably won't be this efficient,
> but in any case it should be an acceptable overhead).
>
> [0] https://lore.kernel.org/all/CAHS8izPg7B5DwKfSuzz-iOop_YRbk3Sd6Y4rX7KBG9DcVJcyWg@xxxxxxxxxxxxxx/
>
> Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code")
> Reported-by: Yonglong Liu <liuyonglong@xxxxxxxxxx>
> Suggested-by: Mina Almasry <almasrymina@xxxxxxxxxx>
> Reviewed-by: Jesper Dangaard Brouer <hawk@xxxxxxxxxx>
> Tested-by: Jesper Dangaard Brouer <hawk@xxxxxxxxxx>
> Signed-off-by: Toke Høiland-Jørgensen <toke@xxxxxxxxxx>

I only have nits and suggestions for improvement. With or without those:

Reviewed-by: Mina Almasry <almasrymina@xxxxxxxxxx>

> ---
> v2:
> - Stash the id in the pp_magic field of struct page instead of
>   overwriting the mapping field. This version is compile-tested only.
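
To make sure I'm reading the lifecycle correctly, this is roughly how I
understand the scheme described in the commit message (my own paraphrase,
not the actual patch hunks; the helper names below are made up for
illustration):

static void example_release_dma(struct page_pool *pool, netmem_ref netmem)
{
	struct page *page = netmem_to_page(netmem);
	unsigned long id = netmem_get_dma_index(netmem);

	/* Whoever wins the cmpxchg owns the unmap, so a page is never
	 * unmapped twice and no extra locking is needed.
	 */
	if (xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0) != page)
		return;

	dma_unmap_page_attrs(pool->p.dev,
			     page_pool_get_dma_addr_netmem(netmem),
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
}

static void example_destroy_unmap_all(struct page_pool *pool)
{
	struct page *page;
	unsigned long id;

	/* On teardown, unmap anything still outstanding so the device can
	 * go away without leaking DMA mappings.
	 */
	xa_for_each(&pool->dma_mapped, id, page)
		example_release_dma(pool, page_to_netmem(page));
}

If that matches your intent, no changes needed; I'm just writing it down
to check my understanding.
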
>
>  include/net/page_pool/types.h | 31 +++++++++++++++++++++++
>  mm/page_alloc.c               |  3 ++-
>  net/core/netmem_priv.h        | 35 +++++++++++++++++++++++++-
>  net/core/page_pool.c          | 46 +++++++++++++++++++++++++++++------
>  4 files changed, 105 insertions(+), 10 deletions(-)
>
> diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
> index 36eb57d73abc..d879a505ca4d 100644
> --- a/include/net/page_pool/types.h
> +++ b/include/net/page_pool/types.h
> @@ -6,6 +6,7 @@
>  #include <linux/dma-direction.h>
>  #include <linux/ptr_ring.h>
>  #include <linux/types.h>
> +#include <linux/xarray.h>
>  #include <net/netmem.h>
>
>  #define PP_FLAG_DMA_MAP		BIT(0) /* Should page_pool do the DMA
> @@ -54,6 +55,34 @@ struct pp_alloc_cache {
>  	netmem_ref cache[PP_ALLOC_CACHE_SIZE];
>  };
>
> +/*
> + * DMA mapping IDs
> + *
> + * When DMA-mapping a page, we allocate an ID (from an xarray) and stash this in
> + * the upper bits of page->pp_magic. The number of bits available here is
> + * constrained by the size of an unsigned long, and the definition of
> + * PP_SIGNATURE.
> + */
> +#define PP_DMA_INDEX_SHIFT	(1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA))
> +#define _PP_DMA_INDEX_BITS	MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 1)
> +
> +#if POISON_POINTER_DELTA > 0
> +/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA
> + * index to not overlap with that if set
> + */
> +#define PP_DMA_INDEX_BITS	MIN(_PP_DMA_INDEX_BITS, \
> +				    __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT)
> +#else
> +#define PP_DMA_INDEX_BITS	_PP_DMA_INDEX_BITS
> +#endif
> +
> +#define PP_DMA_INDEX_MASK	GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \
> +				        PP_DMA_INDEX_SHIFT)
> +#define PP_DMA_INDEX_LIMIT	XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1)
> +
> +/* For check in page_alloc.c */
> +#define PP_MAGIC_MASK		(~(PP_DMA_INDEX_MASK | 0x3))
> +
>  /**
>   * struct page_pool_params - page pool parameters
>   * @fast:	params accessed frequently on hotpath
> @@ -221,6 +250,8 @@ struct page_pool {
>  	void *mp_priv;
>  	const struct memory_provider_ops *mp_ops;
>
> +	struct xarray dma_mapped;
> +
>  #ifdef CONFIG_PAGE_POOL_STATS
>  	/* recycle stats are per-cpu to avoid locking */
>  	struct page_pool_recycle_stats __percpu *recycle_stats;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 579789600a3c..96776e7b2301 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -55,6 +55,7 @@
>  #include <linux/delayacct.h>
>  #include <linux/cacheinfo.h>
>  #include <linux/pgalloc_tag.h>
> +#include <net/page_pool/types.h>
>  #include <asm/div64.h>
>  #include "internal.h"
>  #include "shuffle.h"
> @@ -873,7 +874,7 @@ static inline bool page_expected_state(struct page *page,
>  			page->memcg_data |
>  #endif
>  #ifdef CONFIG_PAGE_POOL
> -			((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
> +			((page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE) |
>  #endif
>  			(page->flags & check_flags)))
>  		return false;
> diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
> index 7eadb8393e00..59face70f40d 100644
> --- a/net/core/netmem_priv.h
> +++ b/net/core/netmem_priv.h
> @@ -3,10 +3,19 @@
>  #ifndef __NETMEM_PRIV_H
>  #define __NETMEM_PRIV_H
>
> -static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
> +static inline unsigned long _netmem_get_pp_magic(netmem_ref netmem)

Nit: maybe __netmem_get...() To be consistent (I used double underscore
elsewhere).

>  {
>  	return __netmem_clear_lsb(netmem)->pp_magic;
>  }

nit: newline between function definitions.
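
One more thought on the PP_DMA_INDEX_* defines further up: this is how I
read the pack/unpack logic, as a toy user-space sketch (the shift, width
and signature values below are made up for illustration; the real ones are
derived from PP_SIGNATURE and POISON_POINTER_DELTA at compile time):

#include <stdio.h>

#define EX_SIGNATURE	0x40UL	/* stand-in for PP_SIGNATURE */
#define EX_INDEX_SHIFT	8	/* stand-in for PP_DMA_INDEX_SHIFT */
#define EX_INDEX_BITS	24	/* stand-in for PP_DMA_INDEX_BITS */
#define EX_INDEX_MASK	(((1UL << EX_INDEX_BITS) - 1) << EX_INDEX_SHIFT)

int main(void)
{
	unsigned long pp_magic = EX_SIGNATURE;
	unsigned long id = 42;

	/* set side: OR the xa_alloc() id into the upper bits */
	pp_magic |= id << EX_INDEX_SHIFT;

	/* get side: mask and shift the id back out */
	printf("id  = %lu\n", (pp_magic & EX_INDEX_MASK) >> EX_INDEX_SHIFT);

	/* page_alloc.c check: ignore the id bits when testing the signature */
	printf("sig = %d\n",
	       (pp_magic & ~(EX_INDEX_MASK | 0x3UL)) == EX_SIGNATURE);
	return 0;
}

If that is the intended layout, the masking in the accessor below and the
PP_MAGIC_MASK check in page_alloc.c both make sense to me.
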
> +static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
> +{
> +	return _netmem_get_pp_magic(netmem) & ~PP_DMA_INDEX_MASK;
> +}
> +
> +static inline void netmem_set_pp_magic(netmem_ref netmem, unsigned long pp_magic)
> +{
> +	__netmem_clear_lsb(netmem)->pp_magic = pp_magic;
> +}
>
>  static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
>  {
> @@ -28,4 +37,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem,
>  {
>  	__netmem_clear_lsb(netmem)->dma_addr = dma_addr;
>  }
> +
> +static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
> +{
> +	unsigned long magic;
> +
> +	if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
> +		return 0;
> +
> +	magic = _netmem_get_pp_magic(netmem);
> +
> +	return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT;
> +}
> +
> +static inline void netmem_set_dma_index(netmem_ref netmem,
> +					unsigned long id)
> +{
> +	unsigned long magic;
> +
> +	if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
> +		return;
> +
> +	magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT);
> +	netmem_set_pp_magic(netmem, magic);
> +}
>  #endif
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index acef1fcd8ddc..dceef9b82198 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -226,6 +226,8 @@ static int page_pool_init(struct page_pool *pool,
>  			return -EINVAL;
>
>  		pool->dma_map = true;
> +
> +		xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1);
>  	}
>
>  	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
> @@ -275,9 +277,6 @@ static int page_pool_init(struct page_pool *pool,
>  	/* Driver calling page_pool_create() also call page_pool_destroy() */
>  	refcount_set(&pool->user_cnt, 1);
>
> -	if (pool->dma_map)
> -		get_device(pool->p.dev);
> -
>  	if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
>  		/* We rely on rtnl_lock()ing to make sure netdev_rx_queue
>  		 * configuration doesn't change while we're initializing
> @@ -325,7 +324,7 @@ static void page_pool_uninit(struct page_pool *pool)
>  	ptr_ring_cleanup(&pool->ring, NULL);
>
>  	if (pool->dma_map)
> -		put_device(pool->p.dev);
> +		xa_destroy(&pool->dma_mapped);
>
>  #ifdef CONFIG_PAGE_POOL_STATS
>  	if (!pool->system)
> @@ -470,9 +469,11 @@ page_pool_dma_sync_for_device(const struct page_pool *pool,
>  		__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
>  }
>
> -static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
> +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp)
>  {
>  	dma_addr_t dma;
> +	int err;
> +	u32 id;
>
>  	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
>  	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
> @@ -486,9 +487,19 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
>  	if (dma_mapping_error(pool->p.dev, dma))
>  		return false;
>
> +	if (in_softirq())
> +		err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem),
> +			       PP_DMA_INDEX_LIMIT, gfp);
> +	else
> +		err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem),
> +				  PP_DMA_INDEX_LIMIT, gfp);

I think there is a bit of discussion on this thread on whether
PP_DMA_INDEX_LIMIT is going to be large enough. xa_alloc() returns -EBUSY
if there are no available entries in the xarray. How about we do a
rate-limited pr_err or DEBUG_NET_WARN_ON_ONCE when that happens?
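
Something like the below is what I had in mind, FWIW (completely untested
sketch; the unmap-and-bail error path is my assumption about how the
failure would need to be handled so we don't leave a mapped page behind
without a tracking entry):

	if (err) {
		/* ID space exhausted (-EBUSY) or allocation failure: make
		 * it visible, drop the mapping and fail the allocation.
		 */
		DEBUG_NET_WARN_ON_ONCE(err == -EBUSY);
		dma_unmap_page_attrs(pool->p.dev, dma,
				     PAGE_SIZE << pool->p.order,
				     pool->p.dma_dir,
				     DMA_ATTR_SKIP_CPU_SYNC);
		return false;
	}

A rate-limited pr_err would work just as well if you prefer something that
also fires on non-debug builds.
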