> >  		} else {
> > -			xdp_return_frame(xdpf);
> > +			xdp_return_frame_rx_napi(xdpf);
> >
> > If you implement Jesper's syncing suggestions, I think you can use
> >
> >   page_pool_put_page(pool, page, 0, true);
> >
> > for XDP_TX here to avoid the DMA sync on page recycle.

I tried Jesper's syncing suggestion and used page_pool_put_page() to
recycle pages, but the result does not seem to improve the performance
of XDP_TX; it even degrades the speed.

The results with the current modification:

root@imx8mpevk:~# ./xdp2 eth0
proto 17: 260180 pkt/s
proto 17: 260373 pkt/s
proto 17: 260363 pkt/s
proto 17: 259036 pkt/s
proto 17: 260180 pkt/s
proto 17: 260048 pkt/s
proto 17: 260029 pkt/s
proto 17: 260133 pkt/s
proto 17: 260021 pkt/s
proto 17: 260203 pkt/s
proto 17: 260293 pkt/s
proto 17: 259418 pkt/s

After applying the sync suggestion, the results are as follows:

root@imx8mpevk:~# ./xdp2 eth0
proto 17: 255956 pkt/s
proto 17: 255841 pkt/s
proto 17: 255835 pkt/s
proto 17: 255381 pkt/s
proto 17: 255736 pkt/s
proto 17: 255779 pkt/s
proto 17: 254135 pkt/s
proto 17: 255584 pkt/s
proto 17: 255855 pkt/s
proto 17: 255664 pkt/s
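For reference, my reading of the two extra arguments in the suggested
call, based on the page_pool code (a sketch for context, not part of
the change itself):

	/* page_pool_put_page(pool, page, dma_sync_size, allow_direct)
	 *
	 * dma_sync_size == 0: the pool skips the DMA sync for_device it
	 * would otherwise perform when recycling the page, since the
	 * XDP_TX path already synced the bytes the CPU touched.
	 *
	 * allow_direct == true: Tx completion runs in NAPI context here,
	 * so the page may be returned to the pool's lockless direct
	 * cache instead of going through the ptr_ring.
	 */
	page_pool_put_page(page->pp, page, 0, true);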
Below are my changes; I don't know what causes the regression. Based on
the results, it's better to keep the current modification.

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index d5fda24a4c52..415c0cb83f84 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -77,7 +77,8 @@ static void set_multicast_list(struct net_device *ndev);
 static void fec_enet_itr_coal_set(struct net_device *ndev);
 
 static int fec_enet_xdp_tx_xmit(struct net_device *ndev,
-				struct xdp_buff *xdp);
+				struct xdp_buff *xdp,
+				u32 dma_sync_len);
 
 #define DRIVER_NAME	"fec"
 
@@ -1487,7 +1488,14 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id, int budget)
 			/* Free the sk buffer associated with this last transmit */
 			dev_kfree_skb_any(skb);
 		} else {
-			xdp_return_frame_rx_napi(xdpf);
+			if (txq->tx_buf[index].type == FEC_TXBUF_T_XDP_NDO)
+				xdp_return_frame_rx_napi(xdpf);
+			else {
+				struct page *page;
+
+				page = virt_to_head_page(xdpf->data);
+				page_pool_put_page(page->pp, page, 0, true);
+			}
 
 			txq->tx_buf[index].xdp = NULL;
 			/* restore default tx buffer type: FEC_TXBUF_T_SKB */
@@ -1557,7 +1565,8 @@ fec_enet_run_xdp(struct fec_enet_private *fep, struct bpf_prog *prog,
 	act = bpf_prog_run_xdp(prog, xdp);
 
 	/* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
-	sync = xdp->data_end - xdp->data_hard_start - FEC_ENET_XDP_HEADROOM;
+	sync = xdp->data_end - xdp->data;
 	sync = max(sync, len);
 
 	switch (act) {
@@ -1579,7 +1588,7 @@ fec_enet_run_xdp(struct fec_enet_private *fep, struct bpf_prog *prog,
 		break;
 
 	case XDP_TX:
-		err = fec_enet_xdp_tx_xmit(fep->netdev, xdp);
+		err = fec_enet_xdp_tx_xmit(fep->netdev, xdp, sync);
 		if (unlikely(err)) {
 			ret = FEC_ENET_XDP_CONSUMED;
 			page = virt_to_head_page(xdp->data);
@@ -3807,6 +3816,7 @@ fec_enet_xdp_get_tx_queue(struct fec_enet_private *fep, int index)
 static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
 				   struct fec_enet_priv_tx_q *txq,
 				   struct xdp_frame *frame,
+				   u32 dma_sync_len,
 				   bool ndo_xmit)
 {
 	unsigned int index, status, estatus;
@@ -3840,7 +3850,7 @@ static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
 		dma_addr = page_pool_get_dma_addr(page) + sizeof(*frame) +
 			   frame->headroom;
 		dma_sync_single_for_device(&fep->pdev->dev, dma_addr,
-					   frame->len, DMA_BIDIRECTIONAL);
+					   dma_sync_len, DMA_BIDIRECTIONAL);
 		txq->tx_buf[index].type = FEC_TXBUF_T_XDP_TX;
 	}
 
@@ -3889,7 +3899,8 @@ static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
 }
 
 static int fec_enet_xdp_tx_xmit(struct net_device *ndev,
-				struct xdp_buff *xdp)
+				struct xdp_buff *xdp,
+				u32 dma_sync_len)
 {
 	struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp);
 	struct fec_enet_private *fep = netdev_priv(ndev);
@@ -3909,7 +3920,7 @@ static int fec_enet_xdp_tx_xmit(struct net_device *ndev,
 
 	/* Avoid tx timeout as XDP shares the queue with kernel stack */
 	txq_trans_cond_update(nq);
-	ret = fec_enet_txq_xmit_frame(fep, txq, xdpf, false);
+	ret = fec_enet_txq_xmit_frame(fep, txq, xdpf, dma_sync_len, false);
 
 	__netif_tx_unlock(nq);
 
@@ -3938,7 +3949,7 @@ static int fec_enet_xdp_xmit(struct net_device *dev,
 	/* Avoid tx timeout as XDP shares the queue with kernel stack */
 	txq_trans_cond_update(nq);
 	for (i = 0; i < num_frames; i++) {
-		if (fec_enet_txq_xmit_frame(fep, txq, frames[i], true) < 0)
+		if (fec_enet_txq_xmit_frame(fep, txq, frames[i], 0, true) < 0)
 			break;
 		sent_frames++;
 	}
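As a footnote, the dma_sync_len idea the diff implements, restated with
my own comments (same variable names as above, nothing new):

	/* Rx, after bpf_prog_run_xdp(): the program may write anywhere
	 * in [xdp->data, xdp->data_end), and xdp_adjust_tail() can move
	 * data_end, so never sync less than the original Rx length.
	 */
	sync = xdp->data_end - xdp->data;
	sync = max(sync, len);

	/* Tx, XDP_TX only: sync just those bytes for the device instead
	 * of the full frame->len.
	 */
	dma_sync_single_for_device(&fep->pdev->dev, dma_addr,
				   dma_sync_len, DMA_BIDIRECTIONAL);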