On 2023/5/5 8:54, Yunsheng Lin wrote:
> It is not exactly the kind of refcnt bias trick in my mind, I was thinking
> about using pool->pages_state_hold_cnt as refcnt bias and merge it to
> pool->pages_state_release_cnt as needed, maybe I need to try to implement
> that to see if it turn out to be what I want it to be.

I did try implementing the above idea. I am not sure whether anything is
still missing, as I have only compile-tested it so far.
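To spell out the arithmetic before the patch: both counters start at
INT_MAX / 2. The allocation side owns pages_state_hold_cnt and decrements
it without any atomic; each time a window of INT_MAX / 2 allocations is
used up, that bias is folded into the atomic pages_state_release_cnt with
a single atomic_add(). Every released page decrements
pages_state_release_cnt, and page_pool_destroy() subtracts whatever is
left in pages_state_hold_cnt, so pages_state_release_cnt hits zero exactly
when the last inflight page comes back, in whatever context that happens.
Below is a minimal user-space sketch of just that counting scheme, not
kernel code; BIAS and the *_sim() helpers are made-up stand-ins:

#include <assert.h>
#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BIAS	(INT_MAX / 2)

static unsigned int hold_cnt = BIAS;	/* producer-side, no atomics */
static atomic_int release_cnt = BIAS;	/* shared with releasing contexts */

/* Mirrors page_pool_hold_cnt_dec(): called once per allocated page */
static void alloc_page_sim(void)
{
	if (--hold_cnt)
		return;

	/* Window of BIAS allocations used up: refill the hold side and
	 * move the consumed bias to the release side in one atomic add.
	 */
	hold_cnt = BIAS;
	atomic_fetch_add(&release_cnt, BIAS);
}

/* Mirrors page_pool_release_page(): true when the zero crossing says
 * the pool can be freed (only reachable after destroy_sim()).
 */
static bool release_page_sim(void)
{
	return atomic_fetch_sub(&release_cnt, 1) == 1;
}

/* Mirrors the tail of the reworked page_pool_destroy() */
static bool destroy_sim(void)
{
	int hold = (int)hold_cnt;

	return atomic_fetch_sub(&release_cnt, hold) == hold;
}

int main(void)
{
	for (int i = 0; i < 5; i++)	/* 5 pages allocated */
		alloc_page_sim();
	for (int i = 0; i < 3; i++)	/* 3 pages returned */
		assert(!release_page_sim());

	assert(!destroy_sim());		/* 2 pages still inflight */
	assert(!release_page_sim());	/* 1 left */
	assert(release_page_sim());	/* last release frees the pool */
	puts("pool freed by last release");
	return 0;
}

The zero crossing doubles as the free signal, which is what lets the
delayed-work retry machinery go away: destroy either sees zero itself or
hands the job to whichever release path returns the last page.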
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index c8ec2f34722b..bd8dacc2adfd 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -50,6 +50,10 @@
 				 PP_FLAG_DMA_SYNC_DEV |\
 				 PP_FLAG_PAGE_FRAG)
 
+
+/* Internal flag: PP in shutdown phase, waiting for inflight pages */
+#define PP_FLAG_SHUTDOWN	BIT(8)
+
 /*
  * Fast allocation side cache array/stack
  *
@@ -151,11 +155,6 @@ static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
 struct page_pool {
 	struct page_pool_params p;
 
-	struct delayed_work release_dw;
-	void (*disconnect)(void *);
-	unsigned long defer_start;
-	unsigned long defer_warn;
-
 	u32 pages_state_hold_cnt;
 	unsigned int frag_offset;
 	struct page *frag_page;
@@ -165,6 +164,7 @@ struct page_pool {
 	/* these stats are incremented while in softirq context */
 	struct page_pool_alloc_stats alloc_stats;
 #endif
+	void (*disconnect)(void *);
 	u32 xdp_mem_id;
 
 	/*
@@ -206,8 +206,6 @@ struct page_pool {
 	 * refcnt serves purpose is to simplify drivers error handling.
 	 */
 	refcount_t user_cnt;
-
-	u64 destroy_cnt;
 };
 
 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h
index ca534501158b..200f539f1a70 100644
--- a/include/trace/events/page_pool.h
+++ b/include/trace/events/page_pool.h
@@ -23,7 +23,6 @@ TRACE_EVENT(page_pool_release,
 		__field(s32, inflight)
 		__field(u32, hold)
 		__field(u32, release)
-		__field(u64, cnt)
 	),
 
 	TP_fast_assign(
@@ -31,12 +30,11 @@ TRACE_EVENT(page_pool_release,
 		__entry->inflight = inflight;
 		__entry->hold = hold;
 		__entry->release = release;
-		__entry->cnt = pool->destroy_cnt;
 	),
 
-	TP_printk("page_pool=%p inflight=%d hold=%u release=%u cnt=%llu",
+	TP_printk("page_pool=%p inflight=%d hold=%u release=%u",
 		  __entry->pool, __entry->inflight, __entry->hold,
-		  __entry->release, __entry->cnt)
+		  __entry->release)
 );
 
 TRACE_EVENT(page_pool_state_release,
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index e212e9d7edcb..6b3393f0c194 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -190,7 +190,8 @@ static int page_pool_init(struct page_pool *pool,
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 
-	atomic_set(&pool->pages_state_release_cnt, 0);
+	atomic_set(&pool->pages_state_release_cnt, INT_MAX / 2);
+	pool->pages_state_hold_cnt = INT_MAX / 2;
 
 	/* Driver calling page_pool_create() also call page_pool_destroy() */
 	refcount_set(&pool->user_cnt, 1);
@@ -344,6 +345,15 @@ static void page_pool_clear_pp_info(struct page *page)
 	page->pp = NULL;
 }
 
+static void page_pool_hold_cnt_dec(struct page_pool *pool)
+{
+	if (likely(--pool->pages_state_hold_cnt))
+		return;
+
+	pool->pages_state_hold_cnt = INT_MAX / 2;
+	atomic_add(INT_MAX / 2, &pool->pages_state_release_cnt);
+}
+
 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 						 gfp_t gfp)
 {
@@ -364,7 +374,8 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 	page_pool_set_pp_info(pool, page);
 
 	/* Track how many pages are held 'in-flight' */
-	pool->pages_state_hold_cnt++;
+	page_pool_hold_cnt_dec(pool);
+
 	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
 	return page;
 }
@@ -410,7 +421,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 		page_pool_set_pp_info(pool, page);
 		pool->alloc.cache[pool->alloc.count++] = page;
 		/* Track how many pages are held 'in-flight' */
-		pool->pages_state_hold_cnt++;
+		page_pool_hold_cnt_dec(pool);
 		trace_page_pool_state_hold(pool, page,
 					   pool->pages_state_hold_cnt);
 	}
@@ -445,24 +456,8 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
 
-/* Calculate distance between two u32 values, valid if distance is below 2^(31)
- * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
- */
-#define _distance(a, b) (s32)((a) - (b))
-
-static s32 page_pool_inflight(struct page_pool *pool)
-{
-	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
-	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
-	s32 inflight;
-
-	inflight = _distance(hold_cnt, release_cnt);
-	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
-	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
-
-	return inflight;
-}
+static void page_pool_free(struct page_pool *pool);
 
 /* Disconnects a page (from a page_pool).  API users can have a need
  * to disconnect a page (from a page_pool), to allow it to be used as
@@ -493,8 +488,12 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 	/* This may be the last page returned, releasing the pool, so
 	 * it is not safe to reference pool afterwards.
 	 */
-	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
+	count = atomic_dec_return_relaxed(&pool->pages_state_release_cnt);
+
 	trace_page_pool_state_release(pool, page, count);
+
+	if (unlikely(!count))
+		page_pool_free(pool);
 }
 EXPORT_SYMBOL(page_pool_release_page);
 
@@ -513,11 +512,20 @@ static void page_pool_return_page(struct page_pool *pool, struct page *page)
 static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
 {
 	int ret;
-	/* BH protection not needed if current is softirq */
-	if (in_softirq())
-		ret = ptr_ring_produce(&pool->ring, page);
-	else
-		ret = ptr_ring_produce_bh(&pool->ring, page);
+
+	page_pool_ring_lock(pool);
+
+	/* Do the check within spinlock to pair with flags setting
+	 * in page_pool_destroy().
+	 */
+	if (unlikely(pool->p.flags & PP_FLAG_SHUTDOWN)) {
+		page_pool_ring_unlock(pool);
+		return false;
+	}
+
+	ret = __ptr_ring_produce(&pool->ring, page);
+
+	page_pool_ring_unlock(pool);
 
 	if (!ret) {
 		recycle_stat_inc(pool, ring);
@@ -634,9 +642,20 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 	if (unlikely(!bulk_len))
 		return;
 
+	i = 0;
+
 	/* Bulk producer into ptr_ring page_pool cache */
 	page_pool_ring_lock(pool);
-	for (i = 0; i < bulk_len; i++) {
+
+	/* Do the check within spinlock to pair with flags setting
+	 * in page_pool_destroy().
+	 */
+	if (unlikely(pool->p.flags & PP_FLAG_SHUTDOWN)) {
+		page_pool_ring_unlock(pool);
+		goto return_page;
+	}
+
+	for (; i < bulk_len; i++) {
 		if (__ptr_ring_produce(&pool->ring, data[i])) {
 			/* ring full */
 			recycle_stat_inc(pool, ring_full);
@@ -650,8 +669,10 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 	if (likely(i == bulk_len))
 		return;
 
-	/* ptr_ring cache full, free remaining pages outside producer lock
-	 * since put_page() with refcnt == 1 can be an expensive operation
+return_page:
+	/* ptr_ring cache full or in shutdown mode, free remaining pages
+	 * outside producer lock since put_page() with refcnt == 1 can
+	 * be an expensive operation
 	 */
 	for (; i < bulk_len; i++)
 		page_pool_return_page(pool, data[i]);
@@ -768,13 +789,10 @@ static void page_pool_free(struct page_pool *pool)
 	kfree(pool);
 }
 
-static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
+static void page_pool_scrub(struct page_pool *pool)
 {
 	struct page *page;
 
-	if (pool->destroy_cnt)
-		return;
-
 	/* Empty alloc cache, assume caller made sure this is
 	 * no-longer in use, and page_pool_alloc_pages() cannot be
 	 * call concurrently.
@@ -785,52 +803,6 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
 	}
 }
 
-static void page_pool_scrub(struct page_pool *pool)
-{
-	page_pool_empty_alloc_cache_once(pool);
-	pool->destroy_cnt++;
-
-	/* No more consumers should exist, but producers could still
-	 * be in-flight.
-	 */
-	page_pool_empty_ring(pool);
-}
-
-static int page_pool_release(struct page_pool *pool)
-{
-	int inflight;
-
-	page_pool_scrub(pool);
-	inflight = page_pool_inflight(pool);
-	if (!inflight)
-		page_pool_free(pool);
-
-	return inflight;
-}
-
-static void page_pool_release_retry(struct work_struct *wq)
-{
-	struct delayed_work *dwq = to_delayed_work(wq);
-	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
-	int inflight;
-
-	inflight = page_pool_release(pool);
-	if (!inflight)
-		return;
-
-	/* Periodic warning */
-	if (time_after_eq(jiffies, pool->defer_warn)) {
-		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
-
-		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
-			__func__, inflight, sec);
-		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
-	}
-
-	/* Still not ready to be disconnected, retry later */
-	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
-}
-
 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 			   struct xdp_mem_info *mem)
 {
@@ -864,15 +836,25 @@ void page_pool_destroy(struct page_pool *pool)
 
 	page_pool_unlink_napi(pool);
 	page_pool_free_frag(pool);
+	page_pool_scrub(pool);
 
-	if (!page_pool_release(pool))
-		return;
+	/* when page_pool_destroy() is called, it is assumed that no
+	 * page will be recycled to pool->alloc cache, we only need to
+	 * make sure no page will be recycled to pool->ring by setting
+	 * PP_FLAG_SHUTDOWN within spinlock to pair with flags checking
+	 * within spinlock.
+	 */
+	page_pool_ring_lock(pool);
+	pool->p.flags |= PP_FLAG_SHUTDOWN;
+	page_pool_ring_unlock(pool);
 
-	pool->defer_start = jiffies;
-	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
+	page_pool_empty_ring(pool);
+
+	if (atomic_sub_return_relaxed(pool->pages_state_hold_cnt,
+				      &pool->pages_state_release_cnt))
+		return;
 
-	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
-	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+	page_pool_free(pool);
 }
 EXPORT_SYMBOL(page_pool_destroy);
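
To make the PP_FLAG_SHUTDOWN handshake concrete: the producer-side flag
check and the flag setting in page_pool_destroy() both run under the
ptr_ring producer lock, so a recycled page either lands in the ring before
the flag is set (and the final page_pool_empty_ring() frees it) or sees
the flag and is bounced back for a direct page_pool_return_page(). Here is
a small user-space sketch of that pattern, illustrative only; a pthread
mutex stands in for page_pool_ring_lock() and the *_sim() names are made
up:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define RING_SIZE	64

static pthread_mutex_t ring_lock = PTHREAD_MUTEX_INITIALIZER;
static bool shutdown_flag;
static void *ring[RING_SIZE];
static unsigned int ring_len;

/* Producer path: mirrors page_pool_recycle_in_ring() above */
static bool recycle_in_ring_sim(void *page)
{
	bool ok = false;

	pthread_mutex_lock(&ring_lock);
	/* Check the flag under the same lock destroy uses to set it: a
	 * page is either enqueued before the flag is set (and the final
	 * drain frees it) or rejected here (and the caller frees it).
	 * It can never be parked in the ring after the drain has run.
	 */
	if (!shutdown_flag && ring_len < RING_SIZE) {
		ring[ring_len++] = page;
		ok = true;
	}
	pthread_mutex_unlock(&ring_lock);

	return ok;	/* false: caller must release the page itself */
}

/* Destroy path: mirrors the tail of the reworked page_pool_destroy() */
static void destroy_sim(void)
{
	pthread_mutex_lock(&ring_lock);
	shutdown_flag = true;	/* no producer can enqueue after this */
	pthread_mutex_unlock(&ring_lock);

	/* One-shot, race-free drain of whatever made it into the ring */
	while (ring_len)
		printf("draining page %p\n", ring[--ring_len]);
}

int main(void)
{
	int page;

	recycle_in_ring_sim(&page);		/* accepted: pre-shutdown */
	destroy_sim();				/* sets flag, drains ring */
	if (!recycle_in_ring_sim(&page))	/* rejected: post-shutdown */
		puts("late recycle falls back to direct free");
	return 0;
}

Since nothing can enter the ring once the flag is observed under the lock,
the single page_pool_empty_ring() in the destroy path is final, and the
remaining inflight pages are handed over to the release-counter zero
crossing described above.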