Use Jakub's memory provider PoC API
(https://github.com/kuba-moo/linux/tree/pp-providers) to implement a
dmabuf devmem memory provider. The provider allocates NET_RX dmabuf
pages to the page pool. This abstracts any custom memory allocation or
freeing changes for devmem TCP away from drivers using the page pool.

The memory provider allocates NET_RX pages from the dmabuf pages
provided by the driver. These pages are ZONE_DEVICE pages with the sg
dma_addrs stored in the zone_device_data entry in the page. The page
pool entries in struct page are in a union with the ZONE_DEVICE
entries, and, without special handling, the page pool would
accidentally overwrite the data in the ZONE_DEVICE fields.

To solve this, the memory provider converts the page from a ZONE_DEVICE
page to a ZONE_NORMAL page upon giving it to the page pool, and
converts it back to a ZONE_DEVICE page upon getting it back from the
page pool. This is safe to do because the NET_RX pages are dmabuf pages
created to hold the dma_addrs of the dma_buf_map_attachment() sg_table
entries, and are only used by code that handles them specifically.

However, since dmabuf pages can now also be page pool pages, we need to
update 2 places to detect this correctly:

1. is_dma_buf_page() needs to be updated to correctly detect dmabuf
   pages after they've been inserted into the pool.

2. dma_buf_page_to_dma_addr() needs to be updated. For page pool pages,
   the dma_addr exists in page->dma_addr. For non page pool pages, the
   dma_addr exists in page->zone_device_data.

Signed-off-by: Mina Almasry <almasrymina@xxxxxxxxxx>
---
 include/linux/dma-buf.h |  29 ++++++++++-
 include/net/page_pool.h |  20 ++++++++
 net/core/page_pool.c    | 104 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 143 insertions(+), 10 deletions(-)

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 93228a2fec47..896359fa998d 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -692,15 +692,26 @@ static inline bool is_dma_buf_pages_file(struct file *file)
 
 struct page *dma_buf_pages_net_rx_alloc(struct dma_buf_pages *priv);
 
+static inline bool is_dma_buf_page_net_rx(struct page *page)
+{
+	struct dma_buf_pages *priv;
+
+	return (is_page_pool_page(page) && (priv = page->pp->mp_priv) &&
+		priv->pgmap.ops == &dma_buf_pgmap_ops);
+}
+
 static inline bool is_dma_buf_page(struct page *page)
 {
 	return (is_zone_device_page(page) && page->pgmap &&
-		page->pgmap->ops == &dma_buf_pgmap_ops);
+		page->pgmap->ops == &dma_buf_pgmap_ops) ||
+	       is_dma_buf_page_net_rx(page);
 }
 
 static inline dma_addr_t dma_buf_page_to_dma_addr(struct page *page)
 {
-	return (dma_addr_t)page->zone_device_data;
+	return is_dma_buf_page_net_rx(page) ?
+	       (dma_addr_t)page->dma_addr :
+	       (dma_addr_t)page->zone_device_data;
 }
 
 static inline int dma_buf_map_sg(struct device *dev, struct scatterlist *sg,
@@ -718,6 +729,16 @@ static inline int dma_buf_map_sg(struct device *dev, struct scatterlist *sg,
 	return nents;
 }
 
+
+static inline bool is_dma_buf_pages_priv(void *ptr)
+{
+	struct dma_buf_pages *priv = (struct dma_buf_pages *)ptr;
+
+	if (!priv || priv->pgmap.ops != &dma_buf_pgmap_ops)
+		return false;
+
+	return true;
+}
 #else
 static inline bool is_dma_buf_page(struct page *page)
 {
@@ -745,6 +766,10 @@ static inline struct page *dma_buf_pages_net_rx_alloc(struct dma_buf_pages *priv
 	return NULL;
 }
 
+static inline bool is_dma_buf_pages_priv(void *ptr)
+{
+	return false;
+}
 #endif
 
 
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 7b6668479baf..a57757a13cc8 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -157,6 +157,7 @@ enum pp_memory_provider_type {
 	PP_MP_HUGE_SPLIT,	/* 2MB, online page alloc */
 	PP_MP_HUGE,		/* 2MB, all memory pre-allocated */
 	PP_MP_HUGE_1G,		/* 1G pages, MEP, pre-allocated */
+	PP_MP_DMABUF_DEVMEM,	/* dmabuf devmem provider */
 };
 
 struct pp_memory_provider_ops {
@@ -170,6 +171,7 @@ extern const struct pp_memory_provider_ops basic_ops;
 extern const struct pp_memory_provider_ops hugesp_ops;
 extern const struct pp_memory_provider_ops huge_ops;
 extern const struct pp_memory_provider_ops huge_1g_ops;
+extern const struct pp_memory_provider_ops dmabuf_devmem_ops;
 
 struct page_pool {
 	struct page_pool_params p;
@@ -420,4 +422,22 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid)
 		page_pool_update_nid(pool, new_nid);
 }
 
+static inline bool is_page_pool_page(struct page *page)
+{
+	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
+	 * in order to preserve any existing bits, such as bit 0 for the
+	 * head page of compound page and bit 1 for pfmemalloc page, so
+	 * mask those bits for freeing side when doing below checking,
+	 * and page_is_pfmemalloc() is checked in __page_pool_put_page()
+	 * to avoid recycling the pfmemalloc page.
+	 */
+	if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
+		return false;
+
+	if (!page->pp)
+		return false;
+
+	return true;
+}
+
 #endif /* _NET_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index df3f431fcff3..e626d4e309c1 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -236,6 +236,9 @@ static int page_pool_init(struct page_pool *pool,
 	case PP_MP_HUGE_1G:
 		pool->mp_ops = &huge_1g_ops;
 		break;
+	case PP_MP_DMABUF_DEVMEM:
+		pool->mp_ops = &dmabuf_devmem_ops;
+		break;
 	default:
 		err = -EINVAL;
 		goto free_ptr_ring;
@@ -975,14 +978,7 @@ bool page_pool_return_skb_page(struct page *page, bool napi_safe)
 
 	page = compound_head(page);
 
-	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
-	 * in order to preserve any existing bits, such as bit 0 for the
-	 * head page of compound page and bit 1 for pfmemalloc page, so
-	 * mask those bits for freeing side when doing below checking,
-	 * and page_is_pfmemalloc() is checked in __page_pool_put_page()
-	 * to avoid recycling the pfmemalloc page.
-	 */
-	if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
+	if (!is_page_pool_page(page))
 		return false;
 
 	pp = page->pp;
@@ -1538,3 +1534,95 @@ const struct pp_memory_provider_ops huge_1g_ops = {
 	.alloc_pages	= mp_huge_1g_alloc_pages,
 	.release_page	= mp_huge_1g_release,
 };
+
+/*** "Dmabuf devmem page" ***/
+
+/* Dmabuf devmem memory provider allocates DMA_BUF_PAGES_NET_RX pages which are
+ * backing the dma_buf_map_attachment() from the NIC to the device memory.
+ *
+ * These pages are wrappers around the dma_addr of the sg entries in the
+ * sg_table returned from dma_buf_map_attachment(). They can be passed to the
+ * networking stack, which will generate devmem skbs from them and process them
+ * correctly.
+ */
+static int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+	struct dma_buf_pages *priv;
+
+	priv = pool->mp_priv;
+	if (!is_dma_buf_pages_priv(priv))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{
+}
+
+static struct page *mp_dmabuf_devmem_alloc_pages(struct page_pool *pool,
+						 gfp_t gfp)
+{
+	struct dma_buf_pages *priv = pool->mp_priv;
+	dma_addr_t dma_addr;
+	struct page *page;
+
+	page = dma_buf_pages_net_rx_alloc(priv);
+	if (!page)
+		return page;
+
+	/* It shouldn't be possible for the allocation to give us a page not
+	 * belonging to this page_pool's pgmap.
+	 */
+	BUG_ON(page->pgmap != &priv->pgmap);
+
+	/* netdev_rxq_alloc_dma_buf_page() allocates a ZONE_DEVICE page.
+	 * Prepare to convert it into a page_pool page. We need to hold pgmap
+	 * and zone_device_data (which holds the dma_addr).
+	 *
+	 * DMA_BUF_PAGES_NET_RX are dmabuf pages created specifically to wrap
+	 * the dma_addr of the sg_table into a struct page. These pages are
+	 * used by code specifically equipped to handle them, so this
+	 * conversion from ZONE_DEVICE page to page pool page should be safe.
+	 */
+	dma_addr = (dma_addr_t)page->zone_device_data;
+
+	set_page_zone(page, ZONE_NORMAL);
+	page->pp_magic = 0;
+	page_pool_set_pp_info(pool, page);
+
+	page->dma_addr = dma_addr;
+
+	return page;
+}
+
+static bool mp_dmabuf_devmem_release_page(struct page_pool *pool,
+					  struct page *page)
+{
+	struct dma_buf_pages *priv = pool->mp_priv;
+	unsigned long dma_addr = page->dma_addr;
+
+	page_pool_clear_pp_info(page);
+
+	/* As the page pool releases the page, restore it back to a ZONE_DEVICE
+	 * page so it gets freed according to
+	 * page->pgmap->ops->page_free().
+	 */
+	set_page_zone(page, ZONE_DEVICE);
+	page->zone_device_data = (void *)dma_addr;
+	page->pgmap = &priv->pgmap;
+	put_page(page);
+
+	/* Return false here as we don't want the page pool touching the page
+	 * after it's released to us.
+	 */
+	return false;
+}
+
+const struct pp_memory_provider_ops dmabuf_devmem_ops = {
+	.init		= mp_dmabuf_devmem_init,
+	.destroy	= mp_dmabuf_devmem_destroy,
+	.alloc_pages	= mp_dmabuf_devmem_alloc_pages,
+	.release_page	= mp_dmabuf_devmem_release_page,
+};
+EXPORT_SYMBOL(dmabuf_devmem_ops);
-- 
2.41.0.390.g38632f3daf-goog
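
For anyone wanting to try the provider, below is a minimal driver-side
sketch of how an RX queue might select it when creating its page pool.
It assumes the pp-providers PoC exposes the provider selection through
memory_provider and mp_priv fields in struct page_pool_params (those
field names are assumptions here, as is how the driver obtained its
struct dma_buf_pages pointer); everything else is the stock page pool
API, with PP_MP_DMABUF_DEVMEM and dmabuf_devmem_ops coming from this
patch.

/* Hypothetical sketch, not part of the patch. Fields marked "assumed"
 * are guesses at the PoC branch interface and may differ.
 */
#include <linux/dma-mapping.h>
#include <linux/numa.h>
#include <net/page_pool.h>

static struct page_pool *rxq_create_devmem_pool(struct device *dev,
						struct dma_buf_pages *dbp,
						unsigned int pool_size)
{
	struct page_pool_params pp_params = {
		.order		= 0,
		.pool_size	= pool_size,
		.nid		= NUMA_NO_NODE,
		.dev		= dev,
		.dma_dir	= DMA_FROM_DEVICE,
		.memory_provider = PP_MP_DMABUF_DEVMEM,	/* assumed field name */
		.mp_priv	= dbp,			/* assumed field name */
	};

	/* page_pool_create() -> page_pool_init() picks dmabuf_devmem_ops and
	 * validates mp_priv via is_dma_buf_pages_priv(). Subsequent
	 * allocations from this pool go through
	 * mp_dmabuf_devmem_alloc_pages() and hand back pages whose dma_addr
	 * already points at the bound device memory.
	 */
	return page_pool_create(&pp_params);
}

With a pool like this, the driver fills its RX ring exactly as it would
from a normal page pool; only the provider selection changes, which is
the abstraction the commit message describes.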