Hi Maciej, On Mon Oct 07 2024, Maciej Fijalkowski wrote: > On Mon, Oct 07, 2024 at 02:31:26PM +0200, Kurt Kanzenbach wrote: >> From: Sriram Yagnaraman <sriram.yagnaraman@xxxxxxxx> >> >> Add support for AF_XDP zero-copy receive path. >> >> When AF_XDP zero-copy is enabled, the rx buffers are allocated from the >> xsk buff pool using igb_alloc_rx_buffers_zc(). >> >> Use xsk_pool_get_rx_frame_size() to set SRRCTL rx buf size when zero-copy >> is enabled. >> >> Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@xxxxxxxx> >> [Kurt: Port to v6.10 and provide napi_id for xdp_rxq_info_reg(), >> RCT, remove NETDEV_XDP_ACT_XSK_ZEROCOPY, update NTC handling, >> move stats update and xdp finalize to common functions, >> READ_ONCE() xsk_pool, likelyfy for XDP_REDIRECT case] >> Signed-off-by: Kurt Kanzenbach <kurt@xxxxxxxxxxxxx> > > Hi Kurt, > > Sorry but still have comments :< see below. No worries :) > >> --- >> drivers/net/ethernet/intel/igb/igb.h | 8 + >> drivers/net/ethernet/intel/igb/igb_main.c | 132 +++++++++---- >> drivers/net/ethernet/intel/igb/igb_xsk.c | 296 +++++++++++++++++++++++++++++- >> 3 files changed, 398 insertions(+), 38 deletions(-) >> >> diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h >> index c30d6f9708f8..ea3977b313fc 100644 >> --- a/drivers/net/ethernet/intel/igb/igb.h >> +++ b/drivers/net/ethernet/intel/igb/igb.h >> @@ -88,6 +88,7 @@ struct igb_adapter; >> #define IGB_XDP_CONSUMED BIT(0) >> #define IGB_XDP_TX BIT(1) >> #define IGB_XDP_REDIR BIT(2) >> +#define IGB_XDP_EXIT BIT(3) >> >> struct vf_data_storage { >> unsigned char vf_mac_addresses[ETH_ALEN]; >> @@ -740,6 +741,9 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring); >> void igb_clean_rx_ring(struct igb_ring *rx_ring); >> void igb_configure_tx_ring(struct igb_adapter *, struct igb_ring *); >> void igb_configure_rx_ring(struct igb_adapter *, struct igb_ring *); >> +void igb_finalize_xdp(struct igb_adapter *adapter, unsigned int status); >> +void 
igb_update_rx_stats(struct igb_q_vector *q_vector, unsigned int packets, >> + unsigned int bytes); >> void igb_setup_tctl(struct igb_adapter *); >> void igb_setup_rctl(struct igb_adapter *); >> void igb_setup_srrctl(struct igb_adapter *, struct igb_ring *); >> @@ -850,6 +854,10 @@ struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter, >> int igb_xsk_pool_setup(struct igb_adapter *adapter, >> struct xsk_buff_pool *pool, >> u16 qid); >> +bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, u16 count); >> +void igb_clean_rx_ring_zc(struct igb_ring *rx_ring); >> +int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, >> + struct xsk_buff_pool *xsk_pool, const int budget); >> int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags); >> >> #endif /* _IGB_H_ */ >> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c >> index bdba5c5861be..449ee794b3c9 100644 >> --- a/drivers/net/ethernet/intel/igb/igb_main.c >> +++ b/drivers/net/ethernet/intel/igb/igb_main.c >> @@ -472,12 +472,17 @@ static void igb_dump(struct igb_adapter *adapter) >> >> for (i = 0; i < rx_ring->count; i++) { >> const char *next_desc; >> - struct igb_rx_buffer *buffer_info; >> - buffer_info = &rx_ring->rx_buffer_info[i]; >> + dma_addr_t dma = (dma_addr_t)0; >> + struct igb_rx_buffer *buffer_info = NULL; >> rx_desc = IGB_RX_DESC(rx_ring, i); >> u0 = (struct my_u0 *)rx_desc; >> staterr = le32_to_cpu(rx_desc->wb.upper.status_error); >> >> + if (!rx_ring->xsk_pool) { >> + buffer_info = &rx_ring->rx_buffer_info[i]; >> + dma = buffer_info->dma; >> + } >> + >> if (i == rx_ring->next_to_use) >> next_desc = " NTU"; >> else if (i == rx_ring->next_to_clean) >> @@ -497,11 +502,11 @@ static void igb_dump(struct igb_adapter *adapter) >> "R ", i, >> le64_to_cpu(u0->a), >> le64_to_cpu(u0->b), >> - (u64)buffer_info->dma, >> + (u64)dma, >> next_desc); >> >> if (netif_msg_pktdata(adapter) && >> - buffer_info->dma && buffer_info->page) { >> + buffer_info && dma 
&& buffer_info->page) { >> print_hex_dump(KERN_INFO, "", >> DUMP_PREFIX_ADDRESS, >> 16, 1, >> @@ -1983,7 +1988,10 @@ static void igb_configure(struct igb_adapter *adapter) >> */ >> for (i = 0; i < adapter->num_rx_queues; i++) { >> struct igb_ring *ring = adapter->rx_ring[i]; >> - igb_alloc_rx_buffers(ring, igb_desc_unused(ring)); >> + if (ring->xsk_pool) >> + igb_alloc_rx_buffers_zc(ring, igb_desc_unused(ring)); >> + else >> + igb_alloc_rx_buffers(ring, igb_desc_unused(ring)); >> } >> } >> >> @@ -4405,7 +4413,8 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring) >> if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) >> xdp_rxq_info_unreg(&rx_ring->xdp_rxq); >> res = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, >> - rx_ring->queue_index, 0); >> + rx_ring->queue_index, >> + rx_ring->q_vector->napi.napi_id); >> if (res < 0) { >> dev_err(dev, "Failed to register xdp_rxq index %u\n", >> rx_ring->queue_index); >> @@ -4701,12 +4710,17 @@ void igb_setup_srrctl(struct igb_adapter *adapter, struct igb_ring *ring) >> struct e1000_hw *hw = &adapter->hw; >> int reg_idx = ring->reg_idx; >> u32 srrctl = 0; >> + u32 buf_size; >> >> - srrctl = IGB_RX_HDR_LEN << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT; >> - if (ring_uses_large_buffer(ring)) >> - srrctl |= IGB_RXBUFFER_3072 >> E1000_SRRCTL_BSIZEPKT_SHIFT; >> + if (ring->xsk_pool) >> + buf_size = xsk_pool_get_rx_frame_size(ring->xsk_pool); >> + else if (ring_uses_large_buffer(ring)) >> + buf_size = IGB_RXBUFFER_3072; >> else >> - srrctl |= IGB_RXBUFFER_2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT; >> + buf_size = IGB_RXBUFFER_2048; >> + >> + srrctl = IGB_RX_HDR_LEN << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT; >> + srrctl |= buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT; >> srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF; >> if (hw->mac.type >= e1000_82580) >> srrctl |= E1000_SRRCTL_TIMESTAMP; >> @@ -4738,9 +4752,17 @@ void igb_configure_rx_ring(struct igb_adapter *adapter, >> u32 rxdctl = 0; >> >> xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); >> - 
WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, >> - MEM_TYPE_PAGE_SHARED, NULL)); >> WRITE_ONCE(ring->xsk_pool, igb_xsk_pool(adapter, ring)); >> + if (ring->xsk_pool) { >> + WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, >> + MEM_TYPE_XSK_BUFF_POOL, >> + NULL)); >> + xsk_pool_set_rxq_info(ring->xsk_pool, &ring->xdp_rxq); >> + } else { >> + WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, >> + MEM_TYPE_PAGE_SHARED, >> + NULL)); >> + } >> >> /* disable the queue */ >> wr32(E1000_RXDCTL(reg_idx), 0); >> @@ -4767,9 +4789,12 @@ void igb_configure_rx_ring(struct igb_adapter *adapter, >> rxdctl |= IGB_RX_HTHRESH << 8; >> rxdctl |= IGB_RX_WTHRESH << 16; >> >> - /* initialize rx_buffer_info */ >> - memset(ring->rx_buffer_info, 0, >> - sizeof(struct igb_rx_buffer) * ring->count); >> + if (ring->xsk_pool) >> + memset(ring->rx_buffer_info_zc, 0, >> + sizeof(*ring->rx_buffer_info_zc) * ring->count); >> + else >> + memset(ring->rx_buffer_info, 0, >> + sizeof(*ring->rx_buffer_info) * ring->count); >> >> /* initialize Rx descriptor 0 */ >> rx_desc = IGB_RX_DESC(ring, 0); >> @@ -4957,8 +4982,13 @@ void igb_free_rx_resources(struct igb_ring *rx_ring) >> >> rx_ring->xdp_prog = NULL; >> xdp_rxq_info_unreg(&rx_ring->xdp_rxq); >> - vfree(rx_ring->rx_buffer_info); >> - rx_ring->rx_buffer_info = NULL; >> + if (rx_ring->xsk_pool) { >> + vfree(rx_ring->rx_buffer_info_zc); >> + rx_ring->rx_buffer_info_zc = NULL; >> + } else { >> + vfree(rx_ring->rx_buffer_info); >> + rx_ring->rx_buffer_info = NULL; >> + } >> >> /* if not set, then don't free */ >> if (!rx_ring->desc) >> @@ -4996,6 +5026,11 @@ void igb_clean_rx_ring(struct igb_ring *rx_ring) >> dev_kfree_skb(rx_ring->skb); >> rx_ring->skb = NULL; >> >> + if (rx_ring->xsk_pool) { >> + igb_clean_rx_ring_zc(rx_ring); >> + goto skip_for_xsk; >> + } >> + >> /* Free all the Rx ring sk_buffs */ >> while (i != rx_ring->next_to_alloc) { >> struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i]; >> @@ -5023,6 +5058,7 @@ void 
igb_clean_rx_ring(struct igb_ring *rx_ring) >> i = 0; >> } >> >> +skip_for_xsk: >> rx_ring->next_to_alloc = 0; >> rx_ring->next_to_clean = 0; >> rx_ring->next_to_use = 0; >> @@ -8177,6 +8213,7 @@ static int igb_poll(struct napi_struct *napi, int budget) >> struct igb_q_vector *q_vector = container_of(napi, >> struct igb_q_vector, >> napi); >> + struct xsk_buff_pool *xsk_pool; >> bool clean_complete = true; >> int work_done = 0; >> >> @@ -8188,7 +8225,12 @@ static int igb_poll(struct napi_struct *napi, int budget) >> clean_complete = igb_clean_tx_irq(q_vector, budget); >> >> if (q_vector->rx.ring) { >> - int cleaned = igb_clean_rx_irq(q_vector, budget); >> + int cleaned; >> + >> + xsk_pool = READ_ONCE(q_vector->rx.ring->xsk_pool); >> + cleaned = xsk_pool ? >> + igb_clean_rx_irq_zc(q_vector, xsk_pool, budget) : >> + igb_clean_rx_irq(q_vector, budget); >> >> work_done += cleaned; >> if (cleaned >= budget) >> @@ -8852,6 +8894,38 @@ static void igb_put_rx_buffer(struct igb_ring *rx_ring, >> rx_buffer->page = NULL; >> } >> >> +void igb_finalize_xdp(struct igb_adapter *adapter, unsigned int status) >> +{ >> + int cpu = smp_processor_id(); >> + struct netdev_queue *nq; >> + >> + if (status & IGB_XDP_REDIR) >> + xdp_do_flush(); >> + >> + if (status & IGB_XDP_TX) { >> + struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter); >> + >> + nq = txring_txq(tx_ring); >> + __netif_tx_lock(nq, cpu); >> + igb_xdp_ring_update_tail(tx_ring); >> + __netif_tx_unlock(nq); >> + } >> +} >> + >> +void igb_update_rx_stats(struct igb_q_vector *q_vector, unsigned int packets, >> + unsigned int bytes) >> +{ >> + struct igb_ring *ring = q_vector->rx.ring; >> + >> + u64_stats_update_begin(&ring->rx_syncp); >> + ring->rx_stats.packets += packets; >> + ring->rx_stats.bytes += bytes; >> + u64_stats_update_end(&ring->rx_syncp); >> + >> + q_vector->rx.total_packets += packets; >> + q_vector->rx.total_bytes += bytes; >> +} >> + >> static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int 
budget) >> { >> unsigned int total_bytes = 0, total_packets = 0; >> @@ -8859,9 +8933,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) >> struct igb_ring *rx_ring = q_vector->rx.ring; >> u16 cleaned_count = igb_desc_unused(rx_ring); >> struct sk_buff *skb = rx_ring->skb; >> - int cpu = smp_processor_id(); >> unsigned int xdp_xmit = 0; >> - struct netdev_queue *nq; >> struct xdp_buff xdp; >> u32 frame_sz = 0; >> int rx_buf_pgcnt; >> @@ -8983,24 +9055,10 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) >> /* place incomplete frames back on ring for completion */ >> rx_ring->skb = skb; >> >> - if (xdp_xmit & IGB_XDP_REDIR) >> - xdp_do_flush(); >> - >> - if (xdp_xmit & IGB_XDP_TX) { >> - struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter); >> - >> - nq = txring_txq(tx_ring); >> - __netif_tx_lock(nq, cpu); >> - igb_xdp_ring_update_tail(tx_ring); >> - __netif_tx_unlock(nq); >> - } >> + if (xdp_xmit) >> + igb_finalize_xdp(adapter, xdp_xmit); > > Nit: given you would be sending next revision, IMHO this is a candidate > for a separate patch. Not a big deal but would reduce the noise in this > one. Yes, makes sense. > >> >> - u64_stats_update_begin(&rx_ring->rx_syncp); >> - rx_ring->rx_stats.packets += total_packets; >> - rx_ring->rx_stats.bytes += total_bytes; >> - u64_stats_update_end(&rx_ring->rx_syncp); >> - q_vector->rx.total_packets += total_packets; >> - q_vector->rx.total_bytes += total_bytes; >> + igb_update_rx_stats(q_vector, total_packets, total_bytes); > > This also. 
> >> >> if (cleaned_count) >> igb_alloc_rx_buffers(rx_ring, cleaned_count); >> diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c >> index 7b632be3e7e3..9fd094a799fa 100644 >> --- a/drivers/net/ethernet/intel/igb/igb_xsk.c >> +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c >> @@ -70,7 +70,10 @@ static void igb_txrx_ring_enable(struct igb_adapter *adapter, u16 qid) >> * at least 1 descriptor unused to make sure >> * next_to_use != next_to_clean >> */ >> - igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring)); >> + if (rx_ring->xsk_pool) >> + igb_alloc_rx_buffers_zc(rx_ring, igb_desc_unused(rx_ring)); >> + else >> + igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring)); >> >> /* Rx/Tx share the same napi context. */ >> napi_enable(&rx_ring->q_vector->napi); >> @@ -169,6 +172,297 @@ int igb_xsk_pool_setup(struct igb_adapter *adapter, >> igb_xsk_pool_disable(adapter, qid); >> } >> >> +static u16 igb_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp, >> + union e1000_adv_rx_desc *rx_desc, u16 count) >> +{ >> + dma_addr_t dma; >> + u16 buffs; >> + int i; >> + >> + /* nothing to do */ >> + if (!count) >> + return 0; >> + >> + buffs = xsk_buff_alloc_batch(pool, xdp, count); >> + for (i = 0; i < buffs; i++) { >> + dma = xsk_buff_xdp_get_dma(*xdp); >> + rx_desc->read.pkt_addr = cpu_to_le64(dma); >> + rx_desc->wb.upper.length = 0; >> + >> + rx_desc++; >> + xdp++; >> + } >> + >> + return buffs; >> +} >> + >> +bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, u16 count) >> +{ >> + u32 nb_buffs_extra = 0, nb_buffs = 0; >> + union e1000_adv_rx_desc *rx_desc; >> + u16 ntu = rx_ring->next_to_use; >> + u16 total_count = count; >> + struct xdp_buff **xdp; >> + >> + rx_desc = IGB_RX_DESC(rx_ring, ntu); >> + xdp = &rx_ring->rx_buffer_info_zc[ntu]; >> + >> + if (ntu + count >= rx_ring->count) { >> + nb_buffs_extra = igb_fill_rx_descs(rx_ring->xsk_pool, xdp, >> + rx_desc, >> + rx_ring->count - ntu); > > Ehh wanted to ack 
this finally, but I believe that here you need to work > on the pool pointer that was read with READ_ONCE() in igb_poll() in the hot path, and > in igb_configure() pass rx_ring->xsk_pool as an argument. Good catch, thanks! Thanks, Kurt
Attachment:
signature.asc
Description: PGP signature