Overload the LSB of struct page* to indicate that it's a page_pool_iov. Refactor mm calls on struct page * into helpers, and add page_pool_iov handling on those helpers. Modify callers of these mm APIs with calls to these helpers instead. In areas where struct page* is dereferenced, add a check for special handling of page_pool_iov. The memory providers producing page_pool_iov can set the LSB on the struct page* returned to the page pool. Note that instead of overloading the LSB of page pointers, we can instead define a new union between struct page & struct page_pool_iov and compact it in a new type. However, we'd need to implement the code churn to modify the page_pool & drivers to use this new type. For this POC that is not implemented (feedback welcome). I have a sample implementation of adding a new page_pool_token type in the page_pool to give a general idea here: https://github.com/torvalds/linux/commit/3a7628700eb7fd02a117db036003bca50779608d Full branch here: https://github.com/torvalds/linux/compare/master...mina:linux:tcpdevmem-pp-tokens (In the branches above, page_pool_iov is called devmem_slice). Could also add static_branch to speed up the checks in page_pool_iov memory providers are being used. Signed-off-by: Mina Almasry <almasrymina@xxxxxxxxxx> --- include/net/page_pool.h | 74 ++++++++++++++++++++++++++++++++++- net/core/page_pool.c | 85 ++++++++++++++++++++++++++++------------- 2 files changed, 131 insertions(+), 28 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 537eb36115ed..f08ca230d68e 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -282,6 +282,64 @@ static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page) return NULL; } +static inline int page_pool_page_ref_count(struct page *page) +{ + if (page_is_page_pool_iov(page)) + return page_pool_iov_refcount(page_to_page_pool_iov(page)); + + return page_ref_count(page); +} + +static inline void page_pool_page_get_many(struct page *page, + unsigned int count) +{ + if (page_is_page_pool_iov(page)) + return page_pool_iov_get_many(page_to_page_pool_iov(page), + count); + + return page_ref_add(page, count); +} + +static inline void page_pool_page_put_many(struct page *page, + unsigned int count) +{ + if (page_is_page_pool_iov(page)) + return page_pool_iov_put_many(page_to_page_pool_iov(page), + count); + + if (count > 1) + page_ref_sub(page, count - 1); + + put_page(page); +} + +static inline bool page_pool_page_is_pfmemalloc(struct page *page) +{ + if (page_is_page_pool_iov(page)) + return false; + + return page_is_pfmemalloc(page); +} + +static inline bool page_pool_page_is_pref_nid(struct page *page, int pref_nid) +{ + /* Assume page_pool_iov are on the preferred node without actually + * checking... + * + * This check is only used to check for recycling memory in the page + * pool's fast paths. Currently the only implementation of page_pool_iov + * is dmabuf device memory. It's a deliberate decision by the user to + * bind a certain dmabuf to a certain netdev, and the netdev rx queue + * would not be able to reallocate memory from another dmabuf that + * exists on the preferred node, so, this check doesn't make much sense + * in this case. Assume all page_pool_iovs can be recycled for now. + */ + if (page_is_page_pool_iov(page)) + return true; + + return page_to_nid(page) == pref_nid; +} + struct page_pool { struct page_pool_params p; @@ -434,6 +492,9 @@ static inline long page_pool_defrag_page(struct page *page, long nr) { long ret; + if (page_is_page_pool_iov(page)) + return -EINVAL; + /* If nr == pp_frag_count then we have cleared all remaining * references to the page. No need to actually overwrite it, instead * we can leave this to be overwritten by the calling function. @@ -494,7 +555,12 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { - dma_addr_t ret = page->dma_addr; + dma_addr_t ret; + + if (page_is_page_pool_iov(page)) + return page_pool_iov_dma_addr(page_to_page_pool_iov(page)); + + ret = page->dma_addr; if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16; @@ -504,6 +570,12 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page) static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) { + /* page_pool_iovs are mapped and their dma-addr can't be modified. */ + if (page_is_page_pool_iov(page)) { + DEBUG_NET_WARN_ON_ONCE(true); + return; + } + page->dma_addr = addr; if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) page->dma_addr_upper = upper_32_bits(addr); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 0a7c08d748b8..20c1f74fd844 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -318,7 +318,7 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) if (unlikely(!page)) break; - if (likely(page_to_nid(page) == pref_nid)) { + if (likely(page_pool_page_is_pref_nid(page, pref_nid))) { pool->alloc.cache[pool->alloc.count++] = page; } else { /* NUMA mismatch; @@ -363,7 +363,15 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool, struct page *page, unsigned int dma_sync_size) { - dma_addr_t dma_addr = page_pool_get_dma_addr(page); + dma_addr_t dma_addr; + + /* page_pool_iov memory provider do not support PP_FLAG_DMA_SYNC_DEV */ + if (page_is_page_pool_iov(page)) { + DEBUG_NET_WARN_ON_ONCE(true); + return; + } + + dma_addr = page_pool_get_dma_addr(page); dma_sync_size = min(dma_sync_size, pool->p.max_len); dma_sync_single_range_for_device(pool->p.dev, dma_addr, @@ -375,6 +383,12 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) { dma_addr_t dma; + if (page_is_page_pool_iov(page)) { + /* page_pool_iovs are already mapped */ + DEBUG_NET_WARN_ON_ONCE(true); + return true; + } + /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit * into page private data (i.e 32bit cpu with 64bit DMA caps) @@ -398,14 +412,24 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) static void page_pool_set_pp_info(struct page_pool *pool, struct page *page) { - page->pp = pool; - page->pp_magic |= PP_SIGNATURE; + if (!page_is_page_pool_iov(page)) { + page->pp = pool; + page->pp_magic |= PP_SIGNATURE; + } else { + page_to_page_pool_iov(page)->pp = pool; + } + if (pool->p.init_callback) pool->p.init_callback(page, pool->p.init_arg); } static void page_pool_clear_pp_info(struct page *page) { + if (page_is_page_pool_iov(page)) { + page_to_page_pool_iov(page)->pp = NULL; + return; + } + page->pp_magic = 0; page->pp = NULL; } @@ -615,7 +639,7 @@ static bool page_pool_recycle_in_cache(struct page *page, return false; } - /* Caller MUST have verified/know (page_ref_count(page) == 1) */ + /* Caller MUST have verified/know (page_pool_page_ref_count(page) == 1) */ pool->alloc.cache[pool->alloc.count++] = page; recycle_stat_inc(pool, cached); return true; @@ -638,9 +662,10 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * refcnt == 1 means page_pool owns page, and can recycle it. * * page is NOT reusable when allocated when system is under - * some pressure. (page_is_pfmemalloc) + * some pressure. (page_pool_page_is_pfmemalloc) */ - if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { + if (likely(page_pool_page_ref_count(page) == 1 && + !page_pool_page_is_pfmemalloc(page))) { /* Read barrier done in page_ref_count / READ_ONCE */ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) @@ -741,7 +766,8 @@ static struct page *page_pool_drain_frag(struct page_pool *pool, if (likely(page_pool_defrag_page(page, drain_count))) return NULL; - if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { + if (page_pool_page_ref_count(page) == 1 && + !page_pool_page_is_pfmemalloc(page)) { if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) page_pool_dma_sync_for_device(pool, page, -1); @@ -818,9 +844,9 @@ static void page_pool_empty_ring(struct page_pool *pool) /* Empty recycle ring */ while ((page = ptr_ring_consume_bh(&pool->ring))) { /* Verify the refcnt invariant of cached pages */ - if (!(page_ref_count(page) == 1)) + if (!(page_pool_page_ref_count(page) == 1)) pr_crit("%s() page_pool refcnt %d violation\n", - __func__, page_ref_count(page)); + __func__, page_pool_page_ref_count(page)); page_pool_return_page(pool, page); } @@ -977,19 +1003,24 @@ bool page_pool_return_skb_page(struct page *page, bool napi_safe) struct page_pool *pp; bool allow_direct; - page = compound_head(page); + if (!page_is_page_pool_iov(page)) { + page = compound_head(page); - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. - */ - if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) - return false; + /* page->pp_magic is OR'ed with PP_SIGNATURE after the + * allocation in order to preserve any existing bits, such as + * bit 0 for the head page of compound page and bit 1 for + * pfmemalloc page, so mask those bits for freeing side when + * doing below checking, and page_pool_page_is_pfmemalloc() is + * checked in __page_pool_put_page() to avoid recycling the + * pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + return false; - pp = page->pp; + pp = page->pp; + } else { + pp = page_to_page_pool_iov(page)->pp; + } /* Allow direct recycle if we have reasons to believe that we are * in the same context as the consumer would run, so there's @@ -1273,9 +1304,9 @@ static bool mp_huge_busy(struct mp_huge *hu, unsigned int idx) for (j = 0; j < (1 << MP_HUGE_ORDER); j++) { page = hu->page[idx] + j; - if (page_ref_count(page) != 1) { + if (page_pool_page_ref_count(page) != 1) { pr_warn("Page with ref count %d at %u, %u. Can't safely destory, leaking memory!\n", - page_ref_count(page), idx, j); + page_pool_page_ref_count(page), idx, j); return true; } } @@ -1330,7 +1361,7 @@ static struct page *mp_huge_alloc_pages(struct page_pool *pool, gfp_t gfp) continue; if ((page->pp_magic & ~0x3UL) == PP_SIGNATURE || - page_ref_count(page) != 1) { + page_pool_page_ref_count(page) != 1) { atomic_inc(&mp_huge_ins_b); continue; } @@ -1458,9 +1489,9 @@ static void mp_huge_1g_destroy(struct page_pool *pool) free = true; for (i = 0; i < MP_HUGE_1G_CNT; i++) { page = hu->page + i; - if (page_ref_count(page) != 1) { + if (page_pool_page_ref_count(page) != 1) { pr_warn("Page with ref count %d at %u. Can't safely destory, leaking memory!\n", - page_ref_count(page), i); + page_pool_page_ref_count(page), i); free = false; break; } @@ -1489,7 +1520,7 @@ static struct page *mp_huge_1g_alloc_pages(struct page_pool *pool, gfp_t gfp) page = hu->page + page_i; if ((page->pp_magic & ~0x3UL) == PP_SIGNATURE || - page_ref_count(page) != 1) { + page_pool_page_ref_count(page) != 1) { atomic_inc(&mp_huge_ins_b); continue; } -- 2.41.0.640.ga95def55d0-goog