Hi Yunsheng,

Thanks for looking into this!

On Wed, 18 Sept 2024 at 14:24, Yunsheng Lin <linyunsheng@xxxxxxxxxx> wrote:
>
> A networking driver with page_pool support may hand over a page that
> still has a dma mapping to the network stack, and try to reuse that
> page after the network stack is done with it and passes it back to
> the page_pool, to avoid the penalty of dma mapping/unmapping.

I think you can shorten this to "If recycling and DMA mapping are
enabled during the pool creation".

> With all the caching in the network stack, some pages may be held in
> the network stack without returning to the page_pool soon enough, and
> with VF disable causing the driver to be unbound, the page_pool does
> not stop the driver from doing its unbinding work; instead the
> page_pool uses a workqueue to periodically check whether pages are
> coming back from the network stack, and if there are any, it does the
> dma unmapping related cleanup work.
>
> As mentioned in [1], attempting DMA unmaps after the driver has
> already unbound may leak resources or at worst corrupt memory.
> Fundamentally, the page pool code cannot allow DMA mappings to
> outlive the driver they belong to.
>
> Currently it seems there are at least two cases where the page is not
> released fast enough, causing the dma unmapping to be done after the
> driver has already unbound:
> 1. ipv4 packet defragmentation timeout: this seems to cause delay of
>    up to 30 secs:
>
> 2. skb_defer_free_flush(): this may cause infinite delay if there is
>    no triggering of net_rx_action().
>
> In order not to do the dma unmapping after the driver has already
> unbound, and not to stall the unloading of the networking driver, add
> the pool->items array to record all the pages, including the ones
> which are handed over to the network stack, so the page_pool can do
> the dma unmapping for those pages when page_pool_destroy() is called.

So, I was thinking of a very similar idea. But what do you mean by
"all"? The pages that are still in the caches (slow or fast) of the
pool will be unmapped during page_pool_destroy(). Don't we 'just' need
a list of the in-flight packets and their pages or fragments? What we
could do then is go through that list and unmap those pages during
page_pool_destroy(), roughly along the lines of the (untested) sketch
I have appended below the quoted patch.

I'll have a closer look at the patch tomorrow.

Thanks!
/Ilias

> Note, the devmem patchset seems to make the bug harder to fix, and
> may make backporting harder too. As there is no actual user for
> devmem and the fix for devmem is unclear for now, this patch does not
> consider fixing the devmem case yet.
>
> 1. 
https://lore.kernel.org/lkml/8067f204-1380-4d37-8ffd-007fc6f26738@xxxxxxxxxx/T/ > > Fixes: f71fec47c2df ("page_pool: make sure struct device is stable") > Signed-off-by: Yunsheng Lin <linyunsheng@xxxxxxxxxx> > CC: Robin Murphy <robin.murphy@xxxxxxx> > CC: Alexander Duyck <alexander.duyck@xxxxxxxxx> > CC: IOMMU <iommu@xxxxxxxxxxxxxxx> > --- > drivers/net/ethernet/freescale/fec_main.c | 8 +- > drivers/net/ethernet/intel/iavf/iavf_txrx.c | 6 +- > drivers/net/ethernet/intel/idpf/idpf_txrx.c | 14 +- > drivers/net/ethernet/intel/libeth/rx.c | 2 +- > .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 3 +- > drivers/net/netdevsim/netdev.c | 6 +- > drivers/net/wireless/mediatek/mt76/mt76.h | 2 +- > include/linux/mm_types.h | 2 +- > include/linux/skbuff.h | 1 + > include/net/libeth/rx.h | 3 +- > include/net/netmem.h | 10 +- > include/net/page_pool/helpers.h | 11 ++ > include/net/page_pool/types.h | 15 +- > net/core/devmem.c | 4 +- > net/core/netmem_priv.h | 5 +- > net/core/page_pool.c | 161 +++++++++++++++--- > net/core/page_pool_priv.h | 10 +- > net/core/skbuff.c | 3 +- > net/core/xdp.c | 3 +- > 19 files changed, 212 insertions(+), 57 deletions(-) > > diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c > index acbb627d51bf..c00f8c460759 100644 > --- a/drivers/net/ethernet/freescale/fec_main.c > +++ b/drivers/net/ethernet/freescale/fec_main.c > @@ -1009,7 +1009,8 @@ static void fec_enet_bd_init(struct net_device *dev) > struct page *page = txq->tx_buf[i].buf_p; > > if (page) > - page_pool_put_page(page->pp, page, 0, false); > + page_pool_put_page(page_pool_to_pp(page), > + page, 0, false); > } > > txq->tx_buf[i].buf_p = NULL; > @@ -1538,7 +1539,7 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id, int budget) > xdp_return_frame_rx_napi(xdpf); > } else { /* recycle pages of XDP_TX frames */ > /* The dma_sync_size = 0 as XDP_TX has already synced DMA for_device */ > - page_pool_put_page(page->pp, page, 0, true); > + page_pool_put_page(page_pool_to_pp(page), page, 0, true); > } > > txq->tx_buf[index].buf_p = NULL; > @@ -3300,7 +3301,8 @@ static void fec_enet_free_buffers(struct net_device *ndev) > } else { > struct page *page = txq->tx_buf[i].buf_p; > > - page_pool_put_page(page->pp, page, 0, false); > + page_pool_put_page(page_pool_to_pp(page), > + page, 0, false); > } > > txq->tx_buf[i].buf_p = NULL; > diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c > index 26b424fd6718..658d8f9a6abb 100644 > --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c > +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c > @@ -1050,7 +1050,8 @@ static void iavf_add_rx_frag(struct sk_buff *skb, > const struct libeth_fqe *rx_buffer, > unsigned int size) > { > - u32 hr = rx_buffer->page->pp->p.offset; > + struct page_pool *pool = page_pool_to_pp(rx_buffer->page); > + u32 hr = pool->p.offset; > > skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page, > rx_buffer->offset + hr, size, rx_buffer->truesize); > @@ -1067,7 +1068,8 @@ static void iavf_add_rx_frag(struct sk_buff *skb, > static struct sk_buff *iavf_build_skb(const struct libeth_fqe *rx_buffer, > unsigned int size) > { > - u32 hr = rx_buffer->page->pp->p.offset; > + struct page_pool *pool = page_pool_to_pp(rx_buffer->page); > + u32 hr = pool->p.offset; > struct sk_buff *skb; > void *va; > > diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c > index d4e6f0e10487..e3389f1a215f 100644 > --- 
a/drivers/net/ethernet/intel/idpf/idpf_txrx.c > +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c > @@ -385,7 +385,8 @@ static void idpf_rx_page_rel(struct libeth_fqe *rx_buf) > if (unlikely(!rx_buf->page)) > return; > > - page_pool_put_full_page(rx_buf->page->pp, rx_buf->page, false); > + page_pool_put_full_page(page_pool_to_pp(rx_buf->page), rx_buf->page, > + false); > > rx_buf->page = NULL; > rx_buf->offset = 0; > @@ -3097,7 +3098,8 @@ idpf_rx_process_skb_fields(struct idpf_rx_queue *rxq, struct sk_buff *skb, > void idpf_rx_add_frag(struct idpf_rx_buf *rx_buf, struct sk_buff *skb, > unsigned int size) > { > - u32 hr = rx_buf->page->pp->p.offset; > + struct page_pool *pool = page_pool_to_pp(rx_buf->page); > + u32 hr = pool->p.offset; > > skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buf->page, > rx_buf->offset + hr, size, rx_buf->truesize); > @@ -3129,8 +3131,10 @@ static u32 idpf_rx_hsplit_wa(const struct libeth_fqe *hdr, > if (!libeth_rx_sync_for_cpu(buf, copy)) > return 0; > > - dst = page_address(hdr->page) + hdr->offset + hdr->page->pp->p.offset; > - src = page_address(buf->page) + buf->offset + buf->page->pp->p.offset; > + dst = page_address(hdr->page) + hdr->offset + > + page_pool_to_pp(hdr->page)->p.offset; > + src = page_address(buf->page) + buf->offset + > + page_pool_to_pp(buf->page)->p.offset; > memcpy(dst, src, LARGEST_ALIGN(copy)); > > buf->offset += copy; > @@ -3148,7 +3152,7 @@ static u32 idpf_rx_hsplit_wa(const struct libeth_fqe *hdr, > */ > struct sk_buff *idpf_rx_build_skb(const struct libeth_fqe *buf, u32 size) > { > - u32 hr = buf->page->pp->p.offset; > + u32 hr = page_pool_to_pp(buf->page)->p.offset; > struct sk_buff *skb; > void *va; > > diff --git a/drivers/net/ethernet/intel/libeth/rx.c b/drivers/net/ethernet/intel/libeth/rx.c > index f20926669318..385afca0e61d 100644 > --- a/drivers/net/ethernet/intel/libeth/rx.c > +++ b/drivers/net/ethernet/intel/libeth/rx.c > @@ -207,7 +207,7 @@ EXPORT_SYMBOL_NS_GPL(libeth_rx_fq_destroy, LIBETH); > */ > void libeth_rx_recycle_slow(struct page *page) > { > - page_pool_recycle_direct(page->pp, page); > + page_pool_recycle_direct(page_pool_to_pp(page), page); > } > EXPORT_SYMBOL_NS_GPL(libeth_rx_recycle_slow, LIBETH); > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c > index 4610621a340e..83511a45a6dc 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c > @@ -716,7 +716,8 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, > /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) > * as we know this is a page_pool page. 
> */ > - page_pool_recycle_direct(page->pp, page); > + page_pool_recycle_direct(page_pool_to_pp(page), > + page); > } while (++n < num); > > break; > diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c > index 017a6102be0a..9bfa593cd5dd 100644 > --- a/drivers/net/netdevsim/netdev.c > +++ b/drivers/net/netdevsim/netdev.c > @@ -593,7 +593,8 @@ nsim_pp_hold_write(struct file *file, const char __user *data, > if (!ns->page) > ret = -ENOMEM; > } else { > - page_pool_put_full_page(ns->page->pp, ns->page, false); > + page_pool_put_full_page(page_pool_to_pp(ns->page), ns->page, > + false); > ns->page = NULL; > } > rtnl_unlock(); > @@ -788,7 +789,8 @@ void nsim_destroy(struct netdevsim *ns) > > /* Put this intentionally late to exercise the orphaning path */ > if (ns->page) { > - page_pool_put_full_page(ns->page->pp, ns->page, false); > + page_pool_put_full_page(page_pool_to_pp(ns->page), ns->page, > + false); > ns->page = NULL; > } > > diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h > index 0b75a45ad2e8..94a277290909 100644 > --- a/drivers/net/wireless/mediatek/mt76/mt76.h > +++ b/drivers/net/wireless/mediatek/mt76/mt76.h > @@ -1688,7 +1688,7 @@ static inline void mt76_put_page_pool_buf(void *buf, bool allow_direct) > { > struct page *page = virt_to_head_page(buf); > > - page_pool_put_full_page(page->pp, page, allow_direct); > + page_pool_put_full_page(page_pool_to_pp(page), page, allow_direct); > } > > static inline void * > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 485424979254..410187133d27 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -120,7 +120,7 @@ struct page { > * page_pool allocated pages. > */ > unsigned long pp_magic; > - struct page_pool *pp; > + struct page_pool_item *pp_item; > unsigned long _pp_mapping_pad; > unsigned long dma_addr; > atomic_long_t pp_ref_count; > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 39f1d16f3628..64d1ecb7a7fc 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -38,6 +38,7 @@ > #include <net/net_debug.h> > #include <net/dropreason-core.h> > #include <net/netmem.h> > +#include <net/page_pool/types.h> > > /** > * DOC: skb checksums > diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h > index 43574bd6612f..beee7ddd77a5 100644 > --- a/include/net/libeth/rx.h > +++ b/include/net/libeth/rx.h > @@ -137,7 +137,8 @@ static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe, > return false; > } > > - page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len); > + page_pool_dma_sync_for_cpu(page_pool_to_pp(page), page, fqe->offset, > + len); > > return true; > } > diff --git a/include/net/netmem.h b/include/net/netmem.h > index 8a6e20be4b9d..27f5d284285e 100644 > --- a/include/net/netmem.h > +++ b/include/net/netmem.h > @@ -23,7 +23,7 @@ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); > struct net_iov { > unsigned long __unused_padding; > unsigned long pp_magic; > - struct page_pool *pp; > + struct page_pool_item *pp_item; > struct dmabuf_genpool_chunk_owner *owner; > unsigned long dma_addr; > atomic_long_t pp_ref_count; > @@ -33,7 +33,7 @@ struct net_iov { > * > * struct { > * unsigned long pp_magic; > - * struct page_pool *pp; > + * struct page_pool *pp_item; > * unsigned long _pp_mapping_pad; > * unsigned long dma_addr; > * atomic_long_t pp_ref_count; > @@ -49,7 +49,7 @@ struct net_iov { > static_assert(offsetof(struct page, pg) == \ > offsetof(struct 
net_iov, iov)) > NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic); > -NET_IOV_ASSERT_OFFSET(pp, pp); > +NET_IOV_ASSERT_OFFSET(pp_item, pp_item); > NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); > NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); > #undef NET_IOV_ASSERT_OFFSET > @@ -127,9 +127,9 @@ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) > return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); > } > > -static inline struct page_pool *netmem_get_pp(netmem_ref netmem) > +static inline struct page_pool_item *netmem_get_pp_item(netmem_ref netmem) > { > - return __netmem_clear_lsb(netmem)->pp; > + return __netmem_clear_lsb(netmem)->pp_item; > } > > static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem) > diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h > index 793e6fd78bc5..ed068b3cee3b 100644 > --- a/include/net/page_pool/helpers.h > +++ b/include/net/page_pool/helpers.h > @@ -83,6 +83,17 @@ static inline u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) > } > #endif > > +static inline struct page_pool *page_pool_to_pp(struct page *page) > +{ > + struct page_pool_item *item = page->pp_item; > + struct page_pool *pool; > + > + item -= item->pp_idx; > + pool = (struct page_pool *)item; > + > + return --pool; > +} > + > /** > * page_pool_dev_alloc_pages() - allocate a page. > * @pool: pool from which to allocate > diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h > index c022c410abe3..526250ed812a 100644 > --- a/include/net/page_pool/types.h > +++ b/include/net/page_pool/types.h > @@ -142,6 +142,11 @@ struct page_pool_stats { > }; > #endif > > +struct page_pool_item { > + netmem_ref pp_netmem; > + unsigned int pp_idx; > +}; > + > /* The whole frag API block must stay within one cacheline. On 32-bit systems, > * sizeof(long) == sizeof(int), so that the block size is ``3 * sizeof(long)``. > * On 64-bit systems, the actual size is ``2 * sizeof(long) + sizeof(int)``. > @@ -161,6 +166,8 @@ struct page_pool { > > int cpuid; > u32 pages_state_hold_cnt; > + unsigned int item_mask; > + unsigned int item_idx; > > bool has_init_callback:1; /* slow::init_callback is set */ > bool dma_map:1; /* Perform DMA mapping */ > @@ -228,7 +235,11 @@ struct page_pool { > */ > refcount_t user_cnt; > > - u64 destroy_cnt; > + /* Lock to avoid doing dma unmapping concurrently when > + * destroy_cnt > 0. 
> + */ > + spinlock_t destroy_lock; > + unsigned int destroy_cnt; > > /* Slow/Control-path information follows */ > struct page_pool_params_slow slow; > @@ -239,6 +250,8 @@ struct page_pool { > u32 napi_id; > u32 id; > } user; > + > + struct page_pool_item items[] ____cacheline_aligned_in_smp; > }; > > struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); > diff --git a/net/core/devmem.c b/net/core/devmem.c > index 11b91c12ee11..09c5aa83f12a 100644 > --- a/net/core/devmem.c > +++ b/net/core/devmem.c > @@ -85,7 +85,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) > niov = &owner->niovs[index]; > > niov->pp_magic = 0; > - niov->pp = NULL; > + niov->pp_item = NULL; > atomic_long_set(&niov->pp_ref_count, 0); > > return niov; > @@ -380,7 +380,7 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) > if (WARN_ON_ONCE(refcount != 1)) > return false; > > - page_pool_clear_pp_info(netmem); > + page_pool_clear_pp_info(pool, netmem); > > net_devmem_free_dmabuf(netmem_to_net_iov(netmem)); > > diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h > index 7eadb8393e00..3173f6070cf7 100644 > --- a/net/core/netmem_priv.h > +++ b/net/core/netmem_priv.h > @@ -18,9 +18,10 @@ static inline void netmem_clear_pp_magic(netmem_ref netmem) > __netmem_clear_lsb(netmem)->pp_magic = 0; > } > > -static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) > +static inline void netmem_set_pp_item(netmem_ref netmem, > + struct page_pool_item *item) > { > - __netmem_clear_lsb(netmem)->pp = pool; > + __netmem_clear_lsb(netmem)->pp_item = item; > } > > static inline void netmem_set_dma_addr(netmem_ref netmem, > diff --git a/net/core/page_pool.c b/net/core/page_pool.c > index bec6e717cd22..1f3017a2e0a0 100644 > --- a/net/core/page_pool.c > +++ b/net/core/page_pool.c > @@ -267,14 +267,12 @@ static int page_pool_init(struct page_pool *pool, > return -ENOMEM; > } > > + spin_lock_init(&pool->destroy_lock); > atomic_set(&pool->pages_state_release_cnt, 0); > > /* Driver calling page_pool_create() also call page_pool_destroy() */ > refcount_set(&pool->user_cnt, 1); > > - if (pool->dma_map) > - get_device(pool->p.dev); > - > if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { > /* We rely on rtnl_lock()ing to make sure netdev_rx_queue > * configuration doesn't change while we're initializing > @@ -312,15 +310,93 @@ static void page_pool_uninit(struct page_pool *pool) > { > ptr_ring_cleanup(&pool->ring, NULL); > > - if (pool->dma_map) > - put_device(pool->p.dev); > - > #ifdef CONFIG_PAGE_POOL_STATS > if (!pool->system) > free_percpu(pool->recycle_stats); > #endif > } > > +static void page_pool_item_init(struct page_pool *pool, unsigned int item_cnt) > +{ > + struct page_pool_item *items = pool->items; > + unsigned int i; > + > + WARN_ON_ONCE(!is_power_of_2(item_cnt)); > + > + for (i = 0; i < item_cnt; i++) > + items[i].pp_idx = i; > + > + pool->item_mask = item_cnt - 1; > +} > + > +static void page_pool_item_uninit(struct page_pool *pool) > +{ > + struct page_pool_item *items = pool->items; > + unsigned int mask = pool->item_mask; > + unsigned int i; > + > + if (!pool->dma_map || pool->mp_priv) > + return; > + > + spin_lock_bh(&pool->destroy_lock); > + > + for (i = 0; i <= mask; i++) { > + struct page *page; > + > + page = netmem_to_page(READ_ONCE(items[i].pp_netmem)); > + if (!page) > + continue; > + > + WARN_ONCE(1, "dma unmapping in %s: %p for %p\n", __func__, page, > + pool); > + > + dma_unmap_page_attrs(pool->p.dev, 
page_pool_get_dma_addr(page), > + PAGE_SIZE << pool->p.order, > + pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | > + DMA_ATTR_WEAK_ORDERING); > + page_pool_set_dma_addr(page, 0); > + } > + > + pool->dma_map = false; > + spin_unlock_bh(&pool->destroy_lock); > +} > + > +static bool page_pool_item_add(struct page_pool *pool, netmem_ref netmem) > +{ > + struct page_pool_item *items = pool->items; > + unsigned int mask = pool->item_mask; > + unsigned int idx = pool->item_idx; > + unsigned int i; > + > + for (i = 0; i <= mask; i++) { > + unsigned int mask_idx = idx++ & mask; > + > + if (!READ_ONCE(items[mask_idx].pp_netmem)) { > + WRITE_ONCE(items[mask_idx].pp_netmem, netmem); > + netmem_set_pp_item(netmem, &items[mask_idx]); > + pool->item_idx = idx; > + return true; > + } > + } > + > + pool->item_idx = idx; > + DEBUG_NET_WARN_ON_ONCE(true); > + return false; > +} > + > +static void page_pool_item_del(struct page_pool *pool, netmem_ref netmem) > +{ > + struct page_pool_item *item = netmem_to_page(netmem)->pp_item; > + struct page_pool_item *items = pool->items; > + unsigned int idx = item->pp_idx; > + > + DEBUG_NET_WARN_ON_ONCE(items[idx].pp_netmem != netmem); > + WRITE_ONCE(items[idx].pp_netmem, (unsigned long __bitwise)NULL); > + netmem_set_pp_item(netmem, NULL); > +} > + > +#define PAGE_POOL_MIN_ITEM_CNT 512 > + > /** > * page_pool_create_percpu() - create a page pool for a given cpu. > * @params: parameters, see struct page_pool_params > @@ -329,10 +405,14 @@ static void page_pool_uninit(struct page_pool *pool) > struct page_pool * > page_pool_create_percpu(const struct page_pool_params *params, int cpuid) > { > + unsigned int item_cnt = (params->pool_size ? : 1024) + > + PP_ALLOC_CACHE_SIZE + PAGE_POOL_MIN_ITEM_CNT; > struct page_pool *pool; > int err; > > - pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); > + item_cnt = roundup_pow_of_two(item_cnt); > + pool = kvzalloc_node(struct_size(pool, items, item_cnt), GFP_KERNEL, > + params->nid); > if (!pool) > return ERR_PTR(-ENOMEM); > > @@ -340,6 +420,8 @@ page_pool_create_percpu(const struct page_pool_params *params, int cpuid) > if (err < 0) > goto err_free; > > + page_pool_item_init(pool, item_cnt); > + > err = page_pool_list(pool); > if (err) > goto err_uninit; > @@ -350,7 +432,7 @@ page_pool_create_percpu(const struct page_pool_params *params, int cpuid) > page_pool_uninit(pool); > err_free: > pr_warn("%s() gave up with errno %d\n", __func__, err); > - kfree(pool); > + kvfree(pool); > return ERR_PTR(err); > } > EXPORT_SYMBOL(page_pool_create_percpu); > @@ -499,19 +581,24 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, > if (unlikely(!page)) > return NULL; > > - if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) { > - put_page(page); > - return NULL; > - } > + if (unlikely(!page_pool_set_pp_info(pool, page_to_netmem(page)))) > + goto err_alloc; > + > + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) > + goto err_set_info; > > alloc_stat_inc(pool, slow_high_order); > - page_pool_set_pp_info(pool, page_to_netmem(page)); > > /* Track how many pages are held 'in-flight' */ > pool->pages_state_hold_cnt++; > trace_page_pool_state_hold(pool, page_to_netmem(page), > pool->pages_state_hold_cnt); > return page; > +err_set_info: > + page_pool_clear_pp_info(pool, page_to_netmem(page)); > +err_alloc: > + put_page(page); > + return NULL; > } > > /* slow path */ > @@ -546,12 +633,18 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool 
*pool, > */ > for (i = 0; i < nr_pages; i++) { > netmem = pool->alloc.cache[i]; > + > + if (unlikely(!page_pool_set_pp_info(pool, netmem))) { > + put_page(netmem_to_page(netmem)); > + continue; > + } > + > if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) { > + page_pool_clear_pp_info(pool, netmem); > put_page(netmem_to_page(netmem)); > continue; > } > > - page_pool_set_pp_info(pool, netmem); > pool->alloc.cache[pool->alloc.count++] = netmem; > /* Track how many pages are held 'in-flight' */ > pool->pages_state_hold_cnt++; > @@ -623,9 +716,13 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict) > return inflight; > } > > -void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) > +bool page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) > { > - netmem_set_pp(netmem, pool); > + if (unlikely(!page_pool_item_add(pool, netmem))) > + return false; > + > + DEBUG_NET_WARN_ON_ONCE(page_pool_to_pp(netmem_to_page(netmem)) != pool); > + > netmem_or_pp_magic(netmem, PP_SIGNATURE); > > /* Ensuring all pages have been split into one fragment initially: > @@ -637,12 +734,14 @@ void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) > page_pool_fragment_netmem(netmem, 1); > if (pool->has_init_callback) > pool->slow.init_callback(netmem, pool->slow.init_arg); > + > + return true; > } > > -void page_pool_clear_pp_info(netmem_ref netmem) > +void page_pool_clear_pp_info(struct page_pool *pool, netmem_ref netmem) > { > netmem_clear_pp_magic(netmem); > - netmem_set_pp(netmem, NULL); > + page_pool_item_del(pool, netmem); > } > > static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, > @@ -672,9 +771,13 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, > */ > void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) > { > + unsigned int destroy_cnt = READ_ONCE(pool->destroy_cnt); > int count; > bool put; > > + if (unlikely(destroy_cnt)) > + spin_lock_bh(&pool->destroy_lock); > + > put = true; > if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) > put = mp_dmabuf_devmem_release_page(pool, netmem); > @@ -688,9 +791,13 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) > trace_page_pool_state_release(pool, netmem, count); > > if (put) { > - page_pool_clear_pp_info(netmem); > + page_pool_clear_pp_info(pool, netmem); > put_page(netmem_to_page(netmem)); > } > + > + if (unlikely(destroy_cnt)) > + spin_unlock_bh(&pool->destroy_lock); > + > /* An optimization would be to call __free_pages(page, pool->p.order) > * knowing page is not part of page-cache (thus avoiding a > * __page_cache_release() call). > @@ -1034,14 +1141,14 @@ static void __page_pool_destroy(struct page_pool *pool) > static_branch_dec(&page_pool_mem_providers); > } > > - kfree(pool); > + kvfree(pool); > } > > static void page_pool_empty_alloc_cache_once(struct page_pool *pool) > { > netmem_ref netmem; > > - if (pool->destroy_cnt) > + if (pool->destroy_cnt > 1) > return; > > /* Empty alloc cache, assume caller made sure this is > @@ -1057,7 +1164,7 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) > static void page_pool_scrub(struct page_pool *pool) > { > page_pool_empty_alloc_cache_once(pool); > - pool->destroy_cnt++; > + WRITE_ONCE(pool->destroy_cnt, pool->destroy_cnt + 1); > > /* No more consumers should exist, but producers could still > * be in-flight. 
> @@ -1139,10 +1246,14 @@ void page_pool_destroy(struct page_pool *pool) > if (!page_pool_put(pool)) > return; > > + /* disable dma_sync_for_device */ > + pool->dma_sync = false; > + > page_pool_disable_direct_recycling(pool); > + WRITE_ONCE(pool->destroy_cnt, 1); > > - /* Wait for the freeing side see the disabling direct recycling setting > - * to avoid the concurrent access to the pool->alloc cache. > + /* Wait for the freeing side to see the new pool->dma_sync, > + * disable_direct and pool->destroy_cnt in page_pool_put_page. > */ > synchronize_rcu(); > > @@ -1151,6 +1262,8 @@ void page_pool_destroy(struct page_pool *pool) > if (!page_pool_release(pool)) > return; > > + page_pool_item_uninit(pool); > + > page_pool_detached(pool); > pool->defer_start = jiffies; > pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; > diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h > index 57439787b9c2..5d85f862a30a 100644 > --- a/net/core/page_pool_priv.h > +++ b/net/core/page_pool_priv.h > @@ -36,16 +36,18 @@ static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) > } > > #if defined(CONFIG_PAGE_POOL) > -void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem); > -void page_pool_clear_pp_info(netmem_ref netmem); > +bool page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem); > +void page_pool_clear_pp_info(struct page_pool *pool, netmem_ref netmem); > int page_pool_check_memory_provider(struct net_device *dev, > struct netdev_rx_queue *rxq); > #else > -static inline void page_pool_set_pp_info(struct page_pool *pool, > +static inline bool page_pool_set_pp_info(struct page_pool *pool, > netmem_ref netmem) > { > + return true; > } > -static inline void page_pool_clear_pp_info(netmem_ref netmem) > +static inline void page_pool_clear_pp_info(struct page_pool *pool, > + netmem_ref netmem) > { > } > static inline int page_pool_check_memory_provider(struct net_device *dev, > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 74149dc4ee31..d4295353ca6e 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -1033,7 +1033,8 @@ bool napi_pp_put_page(netmem_ref netmem) > if (unlikely(!is_pp_netmem(netmem))) > return false; > > - page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); > + page_pool_put_full_netmem(page_pool_to_pp(netmem_to_page(netmem)), > + netmem, false); > > return true; > } > diff --git a/net/core/xdp.c b/net/core/xdp.c > index bcc5551c6424..e8582036b411 100644 > --- a/net/core/xdp.c > +++ b/net/core/xdp.c > @@ -384,7 +384,8 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, > /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) > * as mem->type knows this a page_pool page > */ > - page_pool_put_full_page(page->pp, page, napi_direct); > + page_pool_put_full_page(page_pool_to_pp(page), page, > + napi_direct); > break; > case MEM_TYPE_PAGE_SHARED: > page_frag_free(data); > -- > 2.33.0 >
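
P.S. To make the suggestion above a bit more concrete, here is a rough,
untested sketch of the "walk the in-flight pages and unmap them in
page_pool_destroy()" idea. It is only meant to illustrate the direction,
not to replace the patch: the pp_inflight_item struct and the
inflight_list/inflight_lock fields do not exist today and are made up
for illustration, while the unmap call itself mirrors what
page_pool_item_uninit() does in the patch above.

struct pp_inflight_item {
	struct list_head	list;
	netmem_ref		netmem;
};

/* Called from page_pool_destroy() once no new pages can be handed out;
 * walks the (hypothetical) list of pages still held by the stack and
 * unmaps them, so that no DMA mapping outlives the driver.
 */
static void page_pool_unmap_inflight(struct page_pool *pool)
{
	struct pp_inflight_item *item, *tmp;

	if (!pool->dma_map)
		return;

	spin_lock_bh(&pool->inflight_lock);	/* hypothetical lock */
	list_for_each_entry_safe(item, tmp, &pool->inflight_list, list) {
		struct page *page = netmem_to_page(item->netmem);

		dma_unmap_page_attrs(pool->p.dev,
				     page_pool_get_dma_addr(page),
				     PAGE_SIZE << pool->p.order,
				     pool->p.dma_dir,
				     DMA_ATTR_SKIP_CPU_SYNC |
				     DMA_ATTR_WEAK_ORDERING);
		page_pool_set_dma_addr(page, 0);
		list_del(&item->list);
	}
	/* pages returned later must not be unmapped again */
	pool->dma_map = false;
	spin_unlock_bh(&pool->inflight_lock);
}

The bookkeeping for adding/removing entries when pages are allocated
and returned is elided here; the point is just that only the pages
still held by the stack need to be walked at destroy time, and that
nothing gets unmapped twice once pool->dma_map is cleared.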