> > On 03/08/2023 05.58, Wei Fang wrote: > >>> } else { > >>> - xdp_return_frame(xdpf); > >>> + xdp_return_frame_rx_napi(xdpf); > >> > >> If you implement Jesper's syncing suggestions, I think you can use > >> > >> page_pool_put_page(pool, page, 0, true); > > To Jakub, using 0 here you are trying to bypass the DMA-sync (which is valid > as driver knows XDP_TX have already done the sync). > The code will still call into DMA-sync calls with zero as size, so wonder if we > should detect size zero and skip that call? > (I mean is this something page_pool should support.) > > diff --git a/net/core/page_pool.c b/net/core/page_pool.c index > 7ca456bfab71..778d061e4f2c 100644 > --- a/net/core/page_pool.c > +++ b/net/core/page_pool.c > @@ -323,7 +323,8 @@ static void page_pool_dma_sync_for_device(struct > page_pool *pool, > dma_addr_t dma_addr = page_pool_get_dma_addr(page); > > dma_sync_size = min(dma_sync_size, pool->p.max_len); > - dma_sync_single_range_for_device(pool->p.dev, dma_addr, > + if (dma_sync_size) > + dma_sync_single_range_for_device(pool->p.dev, > dma_addr, > pool->p.offset, > dma_sync_size, > pool->p.dma_dir); > > > > >> > >> for XDP_TX here to avoid the DMA sync on page recycle. > > > > I tried Jesper's syncing suggestion and used page_pool_put_page() to > > recycle pages, but the results do not seem to improve the > > performance of XDP_TX, > > The optimization will only have effect on those devices which have > dev->dma_coherent=false else DMA function [1] (e.g. > dma_direct_sync_single_for_device) will skip the sync calls. 
> > [1] > https://elixir.bootlin.com/linux/v6.5-rc4/source/kernel/dma/direct.h#L63 > > (Cc. Andrew Lunn) > Do any of the imx generations have dma-noncoherent memory? > > And do any of these use the fec NIC driver? > > > it even degrades the speed. > > Could the low runs simply be a variation between your test runs? > Maybe, I just tested once before. So I tested several times again, the results of the two methods do not seem to be much different so far, both about 255000 pkt/s. > The specific device (imx8mpevk) this was tested on, clearly has > dma_coherent=true, or else we would have seen a difference. > But the code change should not have any overhead for the > dma_coherent=true case, the only extra overhead is the extra empty DMA > sync call with size zero (as discussed at the top). > The FEC of i.MX8MP-EVK has dma_coherent=false, and as I mentioned above, I did not see an obvious difference in the performance. :( > > > > The result of the current modification. > > root@imx8mpevk:~# ./xdp2 eth0 > > proto 17: 260180 pkt/s > > These results are *significantly* better than reported in patch-1. > What happened?!? > The test environment is slightly different: in patch-1, the FEC port was directly connected to the port of another board. But in the latest test, the ports of the two boards were connected to a switch, so the ports of the two boards are not directly connected. > e.g. 
> root@imx8mpevk:~# ./xdp2 eth0 > proto 17: 135817 pkt/s > proto 17: 142776 pkt/s > > > proto 17: 260373 pkt/s > > proto 17: 260363 pkt/s > > proto 17: 259036 pkt/s > > proto 17: 260180 pkt/s > > proto 17: 260048 pkt/s > > proto 17: 260029 pkt/s > > proto 17: 260133 pkt/s > > proto 17: 260021 pkt/s > > proto 17: 260203 pkt/s > > proto 17: 260293 pkt/s > > proto 17: 259418 pkt/s > > > > After using the sync suggestion, the result shows as follow. > > root@imx8mpevk:~# ./xdp2 eth0 > > proto 17: 255956 pkt/s > > proto 17: 255841 pkt/s > > proto 17: 255835 pkt/s > > proto 17: 255381 pkt/s > > proto 17: 255736 pkt/s > > proto 17: 255779 pkt/s > > proto 17: 254135 pkt/s > > proto 17: 255584 pkt/s > > proto 17: 255855 pkt/s > > proto 17: 255664 pkt/s > > > > Below are my changes, I don't know what cause it. Based on the > > results, it's better to keep the current modification. > > > > diff --git a/drivers/net/ethernet/freescale/fec_main.c > > b/drivers/net/ethernet/freescale/fec_main.c > > index d5fda24a4c52..415c0cb83f84 100644 > > --- a/drivers/net/ethernet/freescale/fec_main.c > > +++ b/drivers/net/ethernet/freescale/fec_main.c > > @@ -77,7 +77,8 @@ > > static void set_multicast_list(struct net_device *ndev); > > static void fec_enet_itr_coal_set(struct net_device *ndev); > > static int fec_enet_xdp_tx_xmit(struct net_device *ndev, > > - struct xdp_buff *xdp); > > + struct xdp_buff *xdp, > > + u32 dma_sync_len); > > > > #define DRIVER_NAME "fec" > > > > @@ -1487,7 +1488,14 @@ fec_enet_tx_queue(struct net_device *ndev, > u16 queue_id, int budget) > > /* Free the sk buffer associated with this last > transmit */ > > dev_kfree_skb_any(skb); > > } else { > > - xdp_return_frame_rx_napi(xdpf); > > + if (txq->tx_buf[index].type == > FEC_TXBUF_T_XDP_NDO) > > + xdp_return_frame_rx_napi(xdpf); > > + else { > > + struct page *page; > > + > > + page = > virt_to_head_page(xdpf->data); > > + page_pool_put_page(page->pp, page, > 0, true); > > + } > > > > txq->tx_buf[index].xdp = 
NULL; > > /* restore default tx buffer type: > > FEC_TXBUF_T_SKB */ @@ -1557,7 +1565,8 @@ fec_enet_run_xdp(struct > fec_enet_private *fep, struct bpf_prog *prog, > > act = bpf_prog_run_xdp(prog, xdp); > > > > /* Due xdp_adjust_tail: DMA sync for_device cover max len CPU > touch */ > > - sync = xdp->data_end - xdp->data_hard_start - > FEC_ENET_XDP_HEADROOM; > > + sync = xdp->data_end - xdp->data; > > sync = max(sync, len); > > > > switch (act) { > > @@ -1579,7 +1588,7 @@ fec_enet_run_xdp(struct fec_enet_private *fep, > struct bpf_prog *prog, > > break; > > > > case XDP_TX: > > - err = fec_enet_xdp_tx_xmit(fep->netdev, xdp); > > + err = fec_enet_xdp_tx_xmit(fep->netdev, xdp, sync); > > if (unlikely(err)) { > > ret = FEC_ENET_XDP_CONSUMED; > > page = virt_to_head_page(xdp->data); @@ > > -3807,6 +3816,7 @@ fec_enet_xdp_get_tx_queue(struct fec_enet_private > *fep, int index) > > static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep, > > struct fec_enet_priv_tx_q *txq, > > struct xdp_frame *frame, > > + u32 dma_sync_len, > > bool ndo_xmit) > > { > > unsigned int index, status, estatus; @@ -3840,7 +3850,7 @@ > > static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep, > > dma_addr = page_pool_get_dma_addr(page) + > sizeof(*frame) + > > frame->headroom; > > dma_sync_single_for_device(&fep->pdev->dev, > dma_addr, > > - frame->len, > DMA_BIDIRECTIONAL); > > + dma_sync_len, > > + DMA_BIDIRECTIONAL); > > txq->tx_buf[index].type = FEC_TXBUF_T_XDP_TX; > > } > > > > @@ -3889,7 +3899,8 @@ static int fec_enet_txq_xmit_frame(struct > fec_enet_private *fep, > > } > > > > static int fec_enet_xdp_tx_xmit(struct net_device *ndev, > > - struct xdp_buff *xdp) > > + struct xdp_buff *xdp, > > + u32 dma_sync_len) > > { > > struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp); > > struct fec_enet_private *fep = netdev_priv(ndev); @@ -3909,7 > > +3920,7 @@ static int fec_enet_xdp_tx_xmit(struct net_device *ndev, > > > > /* Avoid tx timeout as XDP shares the queue with 
kernel stack > */ > > txq_trans_cond_update(nq); > > - ret = fec_enet_txq_xmit_frame(fep, txq, xdpf, false); > > + ret = fec_enet_txq_xmit_frame(fep, txq, xdpf, dma_sync_len, > > + false); > > > > __netif_tx_unlock(nq); > > > > @@ -3938,7 +3949,7 @@ static int fec_enet_xdp_xmit(struct net_device > *dev, > > /* Avoid tx timeout as XDP shares the queue with kernel stack > */ > > txq_trans_cond_update(nq); > > for (i = 0; i < num_frames; i++) { > > - if (fec_enet_txq_xmit_frame(fep, txq, frames[i], true) < 0) > > + if (fec_enet_txq_xmit_frame(fep, txq, frames[i], 0, > > + true) < 0) > > break; > > sent_frames++; > > } > >