Several comments on different things below for this patch that I just
noticed.

On Thu, Sep 12, 2024 at 10:07:13AM +0000, Joe Damato wrote:
> Add a persistent NAPI config area for NAPI configuration to the core.
> Drivers opt-in to setting the storage for a NAPI by passing an index
> when calling netif_napi_add_storage.
>
> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted), and set to 0 when napi_enable is called.

Forgot to re-read all the commit messages. I will do that for rfcv4 and
make sure they are all correct; this message is not correct.

> Drivers which implement call netif_napi_add_storage will have persistent
> NAPI IDs.
>
> Signed-off-by: Joe Damato <jdamato@xxxxxxxxxx>
> ---
>  .../networking/net_cachelines/net_device.rst |  1 +
>  include/linux/netdevice.h                    | 34 +++++++++
>  net/core/dev.c                               | 74 +++++++++++++++++--
>  net/core/dev.h                               | 12 +++
>  4 files changed, 113 insertions(+), 8 deletions(-)
>

[...]

> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 3e07ab8e0295..08afc96179f9 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h

[...]

> @@ -2685,6 +2717,8 @@ void __netif_napi_del(struct napi_struct *napi);
>   */
>  static inline void netif_napi_del(struct napi_struct *napi)
>  {
> +	napi->config = NULL;
> +	napi->index = -1;
>  	__netif_napi_del(napi);
>  	synchronize_net();
>  }

I don't quite see how, but occasionally when changing the queue count
with ethtool -L, I seem to trigger a page_pool issue. I assumed it was
related to either my changes above in netif_napi_del, or below in
__netif_napi_del, but I'm not so sure because the issue does not happen
reliably and I can't seem to figure out how my change would cause this.

When it does happen, this is the stack trace:

page_pool_empty_ring() page_pool refcnt -30528 violation
------------[ cut here ]------------
Negative(-1) inflight packet-pages
WARNING: CPU: 1 PID: 5117 at net/core/page_pool.c:617 page_pool_inflight+0x4c/0x90
[...]
CPU: 1 UID: 0 PID: 5117 Comm: ethtool Tainted: G W [...]
RIP: 0010:page_pool_inflight+0x4c/0x90
Code: e4 b8 00 00 00 00 44 0f 48 e0 44 89 e0 41 5c e9 8a c1 1b 00 66 90 45 85 e4 79 ef 44 89 e6 48 c7 c7 78 56 26 82 e8 14 63 78 ff <0f> 0b eb dc 65 8b 05 b5 af 71 7e 48 0f a3 05 21 d9 3b 01 73 d7 48
RSP: 0018:ffffc9001d01b640 EFLAGS: 00010282
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027
RDX: 0000000000000027 RSI: 00000000ffefffff RDI: ffff88bf4f45c988
RBP: ffff8900da55a800 R08: 0000000000000000 R09: ffffc9001d01b480
R10: 0000000000000001 R11: 0000000000000001 R12: 00000000ffffffff
R13: ffffffff82cd35b0 R14: ffff890062550f00 R15: ffff8881b0e85400
FS:  00007fa9cb382740(0000) GS:ffff88bf4f440000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000558baa9d3b38 CR3: 000000011222a000 CR4: 0000000000350ef0
Call Trace:
 <TASK>
 ? __warn+0x80/0x110
 ? page_pool_inflight+0x4c/0x90
 ? report_bug+0x19c/0x1b0
 ? handle_bug+0x3c/0x70
 ? exc_invalid_op+0x18/0x70
 ? asm_exc_invalid_op+0x1a/0x20
 ? page_pool_inflight+0x4c/0x90
 page_pool_release+0x10e/0x1d0
 page_pool_destroy+0x66/0x160
 mlx5e_free_rq+0x69/0xb0 [mlx5_core]
 mlx5e_close_queues+0x39/0x150 [mlx5_core]
 mlx5e_close_channel+0x1c/0x60 [mlx5_core]
 mlx5e_close_channels+0x49/0xa0 [mlx5_core]
 mlx5e_switch_priv_channels+0xa9/0x120 [mlx5_core]
 ? __pfx_mlx5e_num_channels_changed_ctx+0x10/0x10 [mlx5_core]
 mlx5e_safe_switch_params+0xb8/0xf0 [mlx5_core]
 mlx5e_ethtool_set_channels+0x17a/0x290 [mlx5_core]
 ethnl_set_channels+0x243/0x310
 ethnl_default_set_doit+0xc1/0x170
 genl_family_rcv_msg_doit+0xd9/0x130
 genl_rcv_msg+0x18f/0x2c0
 ? __pfx_ethnl_default_set_doit+0x10/0x10
 ? __pfx_genl_rcv_msg+0x10/0x10
 netlink_rcv_skb+0x5a/0x110
 genl_rcv+0x28/0x40
 netlink_unicast+0x1aa/0x260
 netlink_sendmsg+0x1e9/0x420
 __sys_sendto+0x1d5/0x1f0
 ? handle_mm_fault+0x1cb/0x290
 ? do_user_addr_fault+0x558/0x7c0
 __x64_sys_sendto+0x29/0x30
 do_syscall_64+0x5d/0x170
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

> diff --git a/net/core/dev.c b/net/core/dev.c
> index f2fd503516de..ca2227d0b8ed 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c

[...]

> @@ -6736,7 +6776,13 @@ void __netif_napi_del(struct napi_struct *napi)
>  	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
>  		return;
>
> -	napi_hash_del(napi);
> +	if (!napi->config) {
> +		napi_hash_del(napi);
> +	} else {
> +		napi->index = -1;
> +		napi->config = NULL;
> +	}
> +

See above; perhaps something related to this change is triggering the
page pool warning occasionally.

>  	list_del_rcu(&napi->dev_list);
>  	napi_free_frags(napi);
>
> @@ -11049,6 +11095,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
>  		unsigned int txqs, unsigned int rxqs)
>  {
>  	struct net_device *dev;
> +	size_t napi_config_sz;
> +	unsigned int maxqs;
>
>  	BUG_ON(strlen(name) >= sizeof(dev->name));
>
> @@ -11062,6 +11110,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
>  		return NULL;
>  	}
>
> +	WARN_ON_ONCE(txqs != rxqs);

This warning triggers for me on boot every time with mlx5 NICs. The code
in mlx5 seems to get the rxq and txq maximums in
drivers/net/ethernet/mellanox/mlx5/core/en_main.c, in
mlx5e_create_netdev, which does:

  txqs = mlx5e_get_max_num_txqs(mdev, profile);
  rxqs = mlx5e_get_max_num_rxqs(mdev, profile);

  netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv), txqs, rxqs);

In my case, for my device, txqs: 760 and rxqs: 63. I would guess this
warning will trigger every time for mlx5 NICs, which would be quite
annoying.

We may just want to change the allocation logic to allocate txqs + rxqs
entries, remove the WARN_ON_ONCE, and be OK with some wasted space?
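Concretely, something along these lines is what I'm picturing (an
untested sketch that just reuses the names from this patch to
illustrate the idea, nothing more):

  /* Size the config array for every queue the device could create
   * instead of warning when txqs != rxqs; the unused entries are the
   * wasted space mentioned above.
   */
  napi_config_sz = array_size(txqs + rxqs, sizeof(*dev->napi_config));
  dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
  if (!dev->napi_config)
          goto free_all;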
> +	maxqs = max(txqs, rxqs);
> +
>  	dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
>  		       GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
>  	if (!dev)
> @@ -11136,6 +11187,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
>  	if (!dev->ethtool)
>  		goto free_all;
>
> +	napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
> +	dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
> +	if (!dev->napi_config)
> +		goto free_all;
> +

[...]

> diff --git a/net/core/dev.h b/net/core/dev.h
> index a9d5f678564a..9eb3f559275c 100644
> --- a/net/core/dev.h
> +++ b/net/core/dev.h
> @@ -167,11 +167,17 @@ static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
>  static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
>  					       u32 defer)
>  {
> +	unsigned int count = max(netdev->num_rx_queues,
> +				 netdev->num_tx_queues);
>  	struct napi_struct *napi;
> +	int i;
>
>  	WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
>  	list_for_each_entry(napi, &netdev->napi_list, dev_list)
>  		napi_set_defer_hard_irqs(napi, defer);
> +
> +	for (i = 0; i < count; i++)
> +		netdev->napi_config[i].defer_hard_irqs = defer;

The above is incorrect. Some devices may have netdev->napi_config = NULL
if they don't call the add_storage wrapper.

Unless there is major feedback/changes requested that affect this code,
in the rfcv4 branch I plan to fix this by adding a NULL check:

  if (netdev->napi_config)
          for (....)
                  netdev->napi_config[i]....

>
>  /**
> @@ -206,11 +212,17 @@ static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
>  static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
>  						 unsigned long timeout)
>  {
> +	unsigned int count = max(netdev->num_rx_queues,
> +				 netdev->num_tx_queues);
>  	struct napi_struct *napi;
> +	int i;
>
>  	WRITE_ONCE(netdev->gro_flush_timeout, timeout);
>  	list_for_each_entry(napi, &netdev->napi_list, dev_list)
>  		napi_set_gro_flush_timeout(napi, timeout);
> +
> +	for (i = 0; i < count; i++)
> +		netdev->napi_config[i].gro_flush_timeout = timeout;

Same as above.
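For reference, the fix I'm planning for both helpers would look roughly
like the following untested sketch on top of this patch; the
netdev_set_gro_flush_timeout() helper would get the identical check:

  static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
                                                u32 defer)
  {
          unsigned int count = max(netdev->num_rx_queues,
                                   netdev->num_tx_queues);
          struct napi_struct *napi;
          int i;

          WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
          list_for_each_entry(napi, &netdev->napi_list, dev_list)
                  napi_set_defer_hard_irqs(napi, defer);

          /* Devices that never call the add_storage wrapper have no
           * persistent config area, so guard against NULL before
           * touching napi_config.
           */
          if (netdev->napi_config) {
                  for (i = 0; i < count; i++)
                          netdev->napi_config[i].defer_hard_irqs = defer;
          }
  }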