On Thu, Aug 08, 2024 at 11:35:52AM -0700, Tony Nguyen wrote: > From: Sriram Yagnaraman <sriram.yagnaraman@xxxxxxxx> > > Add the following ring flags > - IGB_RING_FLAG_TX_DISABLED (when xsk pool is being setup) > - IGB_RING_FLAG_AF_XDP_ZC (xsk pool is active) > > Add a xdp_buff array for use with XSK receive batch API, and a pointer > to xsk_pool in igb_adapter. > > Add enable/disable functions for TX and RX rings > Add enable/disable functions for XSK pool > Add xsk wakeup function > > None of the above functionality will be active until > NETDEV_XDP_ACT_XSK_ZEROCOPY is advertised in netdev->xdp_features. > > Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@xxxxxxxx> > Signed-off-by: Kurt Kanzenbach <kurt@xxxxxxxxxxxxx> > Tested-by: Chandan Kumar Rout <chandanx.rout@xxxxxxxxx> (A Contingent Worker at Intel) > Signed-off-by: Tony Nguyen <anthony.l.nguyen@xxxxxxxxx> > --- > drivers/net/ethernet/intel/igb/Makefile | 2 +- > drivers/net/ethernet/intel/igb/igb.h | 14 +- > drivers/net/ethernet/intel/igb/igb_main.c | 9 + > drivers/net/ethernet/intel/igb/igb_xsk.c | 210 ++++++++++++++++++++++ > 4 files changed, 233 insertions(+), 2 deletions(-) > create mode 100644 drivers/net/ethernet/intel/igb/igb_xsk.c > > diff --git a/drivers/net/ethernet/intel/igb/Makefile b/drivers/net/ethernet/intel/igb/Makefile > index 463c0d26b9d4..6c1b702fd992 100644 > --- a/drivers/net/ethernet/intel/igb/Makefile > +++ b/drivers/net/ethernet/intel/igb/Makefile > @@ -8,4 +8,4 @@ obj-$(CONFIG_IGB) += igb.o > > igb-y := igb_main.o igb_ethtool.o e1000_82575.o \ > e1000_mac.o e1000_nvm.o e1000_phy.o e1000_mbx.o \ > - e1000_i210.o igb_ptp.o igb_hwmon.o > + e1000_i210.o igb_ptp.o igb_hwmon.o igb_xsk.o > diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h > index 0de71ec324ed..053130c01480 100644 > --- a/drivers/net/ethernet/intel/igb/igb.h > +++ b/drivers/net/ethernet/intel/igb/igb.h > @@ -20,6 +20,7 @@ > #include <linux/mdio.h> > > #include <net/xdp.h> > 
+#include <net/xdp_sock_drv.h> > > struct igb_adapter; > > @@ -320,6 +321,7 @@ struct igb_ring { > union { /* array of buffer info structs */ > struct igb_tx_buffer *tx_buffer_info; > struct igb_rx_buffer *rx_buffer_info; > + struct xdp_buff **rx_buffer_info_zc; > }; > void *desc; /* descriptor ring memory */ > unsigned long flags; /* ring specific flags */ > @@ -357,6 +359,7 @@ struct igb_ring { > }; > }; > struct xdp_rxq_info xdp_rxq; > + struct xsk_buff_pool *xsk_pool; > } ____cacheline_internodealigned_in_smp; > > struct igb_q_vector { > @@ -384,7 +387,9 @@ enum e1000_ring_flags_t { > IGB_RING_FLAG_RX_SCTP_CSUM, > IGB_RING_FLAG_RX_LB_VLAN_BSWAP, > IGB_RING_FLAG_TX_CTX_IDX, > - IGB_RING_FLAG_TX_DETECT_HANG > + IGB_RING_FLAG_TX_DETECT_HANG, > + IGB_RING_FLAG_TX_DISABLED, > + IGB_RING_FLAG_AF_XDP_ZC > }; > > #define ring_uses_large_buffer(ring) \ > @@ -822,4 +827,11 @@ int igb_add_mac_steering_filter(struct igb_adapter *adapter, > int igb_del_mac_steering_filter(struct igb_adapter *adapter, > const u8 *addr, u8 queue, u8 flags); > > +struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter, > + struct igb_ring *ring); > +int igb_xsk_pool_setup(struct igb_adapter *adapter, > + struct xsk_buff_pool *pool, > + u16 qid); > +int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags); > + > #endif /* _IGB_H_ */ > diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c > index bdb7637559b8..b6f23bbeff71 100644 > --- a/drivers/net/ethernet/intel/igb/igb_main.c > +++ b/drivers/net/ethernet/intel/igb/igb_main.c > @@ -2904,9 +2904,14 @@ static int igb_xdp_setup(struct net_device *dev, struct netdev_bpf *bpf) > > static int igb_xdp(struct net_device *dev, struct netdev_bpf *xdp) > { > + struct igb_adapter *adapter = netdev_priv(dev); > + > switch (xdp->command) { > case XDP_SETUP_PROG: > return igb_xdp_setup(dev, xdp); > + case XDP_SETUP_XSK_POOL: > + return igb_xsk_pool_setup(adapter, xdp->xsk.pool, > + xdp->xsk.queue_id); 
> default: > return -EINVAL; > } > @@ -3033,6 +3038,7 @@ static const struct net_device_ops igb_netdev_ops = { > .ndo_setup_tc = igb_setup_tc, > .ndo_bpf = igb_xdp, > .ndo_xdp_xmit = igb_xdp_xmit, > + .ndo_xsk_wakeup = igb_xsk_wakeup, > }; > > /** > @@ -4355,6 +4361,8 @@ void igb_configure_tx_ring(struct igb_adapter *adapter, > u64 tdba = ring->dma; > int reg_idx = ring->reg_idx; > > + ring->xsk_pool = igb_xsk_pool(adapter, ring); Use WRITE_ONCE() for this assignment. > + > wr32(E1000_TDLEN(reg_idx), > ring->count * sizeof(union e1000_adv_tx_desc)); > wr32(E1000_TDBAL(reg_idx), > @@ -4750,6 +4758,7 @@ void igb_configure_rx_ring(struct igb_adapter *adapter, > xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); > WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, > MEM_TYPE_PAGE_SHARED, NULL)); > + ring->xsk_pool = igb_xsk_pool(adapter, ring); Ditto: use WRITE_ONCE() here as well. I was recently addressing issues around xsk in ice, see: [0]: https://lore.kernel.org/netdev/172239123450.15322.12860347838208396251.git-patchwork-notify@xxxxxxxxxx/ > > /* disable the queue */ > wr32(E1000_RXDCTL(reg_idx), 0); > diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c > new file mode 100644 > index 000000000000..925bf97f7caa > --- /dev/null > +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c > @@ -0,0 +1,210 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* Copyright(c) 2018 Intel Corporation. */ > + > +#include <linux/bpf_trace.h> > +#include <net/xdp_sock_drv.h> > +#include <net/xdp.h> > + > +#include "e1000_hw.h" > +#include "igb.h" > + > +static int igb_realloc_rx_buffer_info(struct igb_ring *ring, bool pool_present) > +{ > + int size = pool_present ? > + sizeof(*ring->rx_buffer_info_zc) * ring->count : > + sizeof(*ring->rx_buffer_info) * ring->count; > + void *buff_info = vmalloc(size); You need to take rx_buffer_info_zc into account in the memset in igb_configure_rx_ring(). Also, why vmalloc?
> + > + if (!buff_info) > + return -ENOMEM; > + > + if (pool_present) { > + vfree(ring->rx_buffer_info); > + ring->rx_buffer_info = NULL; > + ring->rx_buffer_info_zc = buff_info; > + } else { > + vfree(ring->rx_buffer_info_zc); > + ring->rx_buffer_info_zc = NULL; > + ring->rx_buffer_info = buff_info; > + } > + > + return 0; > +} > + > +static void igb_txrx_ring_disable(struct igb_adapter *adapter, u16 qid) > +{ > + struct igb_ring *tx_ring = adapter->tx_ring[qid]; > + struct igb_ring *rx_ring = adapter->rx_ring[qid]; > + struct e1000_hw *hw = &adapter->hw; > + > + set_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags); > + > + wr32(E1000_TXDCTL(tx_ring->reg_idx), 0); > + wr32(E1000_RXDCTL(rx_ring->reg_idx), 0); > + Should there be a synchronize_net() here to let the NAPI finish its current job? > + /* Rx/Tx share the same napi context. */ > + napi_disable(&rx_ring->q_vector->napi); > + > + igb_clean_tx_ring(tx_ring); > + igb_clean_rx_ring(rx_ring); > + > + memset(&rx_ring->rx_stats, 0, sizeof(rx_ring->rx_stats)); > + memset(&tx_ring->tx_stats, 0, sizeof(tx_ring->tx_stats)); > +} > + > +static void igb_txrx_ring_enable(struct igb_adapter *adapter, u16 qid) > +{ > + struct igb_ring *tx_ring = adapter->tx_ring[qid]; > + struct igb_ring *rx_ring = adapter->rx_ring[qid]; > + > + igb_configure_tx_ring(adapter, tx_ring); > + igb_configure_rx_ring(adapter, rx_ring); > + Add a synchronize_net() here, after updating the xsk_pool pointers. > + clear_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags); > + > + /* call igb_desc_unused which always leaves > + * at least 1 descriptor unused to make sure > + * next_to_use != next_to_clean > + */ > + igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring)); > + > + /* Rx/Tx share the same napi context.
*/ > + napi_enable(&rx_ring->q_vector->napi); > +} > + > +struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter, > + struct igb_ring *ring) > +{ > + int qid = ring->queue_index; > + > + if (!igb_xdp_is_enabled(adapter) || > + !test_bit(IGB_RING_FLAG_AF_XDP_ZC, &ring->flags)) See [1] for how to avoid the introduction of IGB_RING_FLAG_AF_XDP_ZC altogether. [1]: https://lore.kernel.org/netdev/20240603-net-2024-05-30-intel-net-fixes-v2-3-e3563aa89b0c@xxxxxxxxx/ > + return NULL; > + > + return xsk_get_pool_from_qid(adapter->netdev, qid); > +} > + > +static int igb_xsk_pool_enable(struct igb_adapter *adapter, > + struct xsk_buff_pool *pool, > + u16 qid) > +{ > + struct net_device *netdev = adapter->netdev; > + struct igb_ring *tx_ring, *rx_ring; > + bool if_running; > + int err; > + > + if (qid >= adapter->num_rx_queues) > + return -EINVAL; > + > + if (qid >= netdev->real_num_rx_queues || > + qid >= netdev->real_num_tx_queues) > + return -EINVAL; > + > + err = xsk_pool_dma_map(pool, &adapter->pdev->dev, IGB_RX_DMA_ATTR); > + if (err) > + return err; > + > + tx_ring = adapter->tx_ring[qid]; > + rx_ring = adapter->rx_ring[qid]; > + if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter); > + if (if_running) > + igb_txrx_ring_disable(adapter, qid); > + > + set_bit(IGB_RING_FLAG_AF_XDP_ZC, &tx_ring->flags); > + set_bit(IGB_RING_FLAG_AF_XDP_ZC, &rx_ring->flags); > + > + if (if_running) { > + err = igb_realloc_rx_buffer_info(rx_ring, true); > + if (!err) { > + igb_txrx_ring_enable(adapter, qid); > + /* Kick start the NAPI context so that receiving will start */ > + err = igb_xsk_wakeup(adapter->netdev, qid, XDP_WAKEUP_RX); > + } > + > + if (err) { > + clear_bit(IGB_RING_FLAG_AF_XDP_ZC, &tx_ring->flags); > + clear_bit(IGB_RING_FLAG_AF_XDP_ZC, &rx_ring->flags); > + xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR); > + return err; > + } > + } > + > + return 0; > +} > + > +static int igb_xsk_pool_disable(struct igb_adapter *adapter, u16 qid) > +{ > + struct igb_ring
*tx_ring, *rx_ring; > + struct xsk_buff_pool *pool; > + bool if_running; > + int err; > + > + pool = xsk_get_pool_from_qid(adapter->netdev, qid); > + if (!pool) > + return -EINVAL; > + > + tx_ring = adapter->tx_ring[qid]; > + rx_ring = adapter->rx_ring[qid]; > + if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter); > + if (if_running) > + igb_txrx_ring_disable(adapter, qid); > + > + xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR); > + clear_bit(IGB_RING_FLAG_AF_XDP_ZC, &tx_ring->flags); > + clear_bit(IGB_RING_FLAG_AF_XDP_ZC, &rx_ring->flags); > + > + if (if_running) { > + err = igb_realloc_rx_buffer_info(rx_ring, false); > + if (err) > + return err; > + > + igb_txrx_ring_enable(adapter, qid); > + } > + > + return 0; > +} > + > +int igb_xsk_pool_setup(struct igb_adapter *adapter, > + struct xsk_buff_pool *pool, > + u16 qid) > +{ > + return pool ? igb_xsk_pool_enable(adapter, pool, qid) : > + igb_xsk_pool_disable(adapter, qid); > +} > + > +int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) > +{ > + struct igb_adapter *adapter = netdev_priv(dev); > + struct e1000_hw *hw = &adapter->hw; > + struct igb_ring *ring; > + u32 eics = 0; > + > + if (test_bit(__IGB_DOWN, &adapter->state)) > + return -ENETDOWN; > + > + if (!igb_xdp_is_enabled(adapter)) > + return -EINVAL; > + > + if (qid >= adapter->num_tx_queues) > + return -EINVAL; > + > + ring = adapter->tx_ring[qid]; > + > + if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags)) > + return -ENETDOWN; > + > + if (!ring->xsk_pool) Use READ_ONCE() here. Also, please test this patchset against a scenario where you do Tx ZC from every available queue and toggle the interface down and up. We had a nasty case, fixed by [0], where we were producing Tx descriptors to the wire when the interface was either already going down or not yet brought up.
> + return -EINVAL; > + > + if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) { > + /* Cause software interrupt to ensure Rx ring is cleaned */ > + if (adapter->flags & IGB_FLAG_HAS_MSIX) { > + eics |= ring->q_vector->eims_value; > + wr32(E1000_EICS, eics); > + } else { > + wr32(E1000_ICS, E1000_ICS_RXDMT0); > + } > + } > + > + return 0; > +} > -- > 2.42.0 >