On Wed, 17 Apr 2019 20:49:42 +0300, Ivan Khoronzhuk wrote:
> Add XDP support based on the rx page_pool allocator, one frame per page.
> This patch was verified with af_xdp and xdp drop. The page pool
> allocator is used with the assumption that only one rx_handler is
> running simultaneously. DMA map/unmap is reused from the page pool even
> though there is no need to map the whole page.
>
> Due to the specifics of cpsw, the same TX/RX handler can be used by 2
> network devices, so special fields are added to the buffer to identify
> the interface the frame is destined to.
>
> The XDP prog is common for all channels until appropriate changes are
> added to the XDP infrastructure.
>
> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@xxxxxxxxxx>
> @@ -902,22 +947,169 @@ static void cpsw_rx_vlan_encap(struct sk_buff *skb)
>  	}
>  }
>  
> +static inline int cpsw_tx_submit_xdpf(struct cpsw_priv *priv,
> +				      struct xdp_frame *xdpf,
> +				      struct cpdma_chan *txch)
> +{
> +	struct cpsw_common *cpsw = priv->cpsw;
> +
> +	return cpdma_chan_submit(txch, cpsw_xdpf_to_handle(xdpf), xdpf->data,
> +				 xdpf->len,
> +				 priv->emac_port + cpsw->data.dual_emac);
> +}
> +
> +static int cpsw_xdp_tx_frame(struct cpsw_priv *priv, struct xdp_frame *frame)
> +{
> +	struct cpsw_common *cpsw = priv->cpsw;
> +	struct cpsw_meta_xdp *xmeta;
> +	struct cpdma_chan *txch;
> +	int ret = 0;
> +
> +	frame->metasize = sizeof(struct cpsw_meta_xdp);
> +	xmeta = frame->data - frame->metasize;
> +	xmeta->ndev = priv->ndev;
> +	xmeta->ch = 0;
> +
> +	txch = cpsw->txv[0].ch;
> +	ret = cpsw_tx_submit_xdpf(priv, frame, txch);
> +	if (ret) {
> +		xdp_return_frame_rx_napi(frame);
> +		ret = -1;
> +	}
> +
> +	/* If there is no more tx desc left free then we need to
> +	 * tell the kernel to stop sending us tx frames.
> +	 */

So you're using the same TX ring for XDP and the stack? How does that
work? The stack's TX ring has a lock, and can be used from any CPU,
while XDP TX rings are per-CPU, no?

> +	if (unlikely(!cpdma_check_free_tx_desc(txch))) {
> +		struct netdev_queue *txq = netdev_get_tx_queue(priv->ndev, 0);
> +
> +		netif_tx_stop_queue(txq);
> +
> +		/* Barrier, so that stop_queue is visible to other cpus */
> +		smp_mb__after_atomic();
> +
> +		if (cpdma_check_free_tx_desc(txch))
> +			netif_tx_wake_queue(txq);
> +	}
> +
> +	return ret;
> +}
> +static struct page_pool *cpsw_create_rx_pool(struct cpsw_common *cpsw)
> +{
> +	struct page_pool_params pp_params = { 0 };
> +
> +	pp_params.order = 0;
> +	pp_params.flags = PP_FLAG_DMA_MAP;
> +
> +	/* set it to number of descriptors to be cached from init? */
> +	pp_params.pool_size = descs_pool_size;
> +	pp_params.nid = NUMA_NO_NODE; /* no numa */
> +	pp_params.dma_dir = DMA_FROM_DEVICE;

DMA_FROM_DEVICE looks suspicious if you support TX; shouldn't this be
BIDIRECTIONAL?
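If frames from this pool can also be transmitted (XDP_TX or redirected
out of this port), I'd expect the pages to be mapped bidirectionally up
front, roughly (untested sketch, same fields as in your patch):

	pp_params.flags = PP_FLAG_DMA_MAP;
	/* buffers may be flipped to TX for XDP_TX / XDP_REDIRECT, so map
	 * them DMA_BIDIRECTIONAL rather than DMA_FROM_DEVICE
	 */
	pp_params.dma_dir = DMA_BIDIRECTIONAL;

And coming back to the TX ring question above: .ndo_xdp_xmit() may run
on any CPU, potentially concurrently with the stack transmitting, so
with a single shared ring the submit has to be serialized somehow. One
option, unless the cpdma channel already has internal locking that makes
this safe, is to take the netdev TX queue lock around the submit (rough
sketch, untested, reusing the names from this function):

	struct netdev_queue *txq = netdev_get_tx_queue(priv->ndev, 0);

	__netif_tx_lock(txq, smp_processor_id());
	ret = cpsw_tx_submit_xdpf(priv, frame, txch);
	__netif_tx_unlock(txq);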
> +	pp_params.dev = cpsw->dev;
> +
> +	return page_pool_create(&pp_params);
> +}
>  static int cpsw_rx_handler(void *token, int len, int status)
>  {
> -	struct cpdma_chan *ch;
> -	struct sk_buff *skb = token;
> -	struct sk_buff *new_skb;
> -	struct net_device *ndev = skb->dev;
> -	int ret = 0, port;
> -	struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
> +	struct page *new_page, *page = token;
> +	struct cpsw_meta_xdp *new_xmeta, *xmeta = page_address(page);
> +	struct cpsw_common *cpsw = ndev_to_cpsw(xmeta->ndev);
> +	int pkt_size = cpsw->rx_packet_max;
> +	int ret = 0, port, ch = xmeta->ch;
> +	struct page_pool *pool = cpsw->rx_page_pool;
> +	int headroom = CPSW_HEADROOM;
> +	struct net_device *ndev = xmeta->ndev;
> +	int flush = 0;
>  	struct cpsw_priv *priv;
> +	struct sk_buff *skb;
> +	struct xdp_buff xdp;
> +	dma_addr_t dma;
>  
>  	if (cpsw->data.dual_emac) {
>  		port = CPDMA_RX_SOURCE_PORT(status);
> -		if (port) {
> +		if (port)
>  			ndev = cpsw->slaves[--port].ndev;
> -			skb->dev = ndev;
> -		}
>  	}
>  
>  	if (unlikely(status < 0) || unlikely(!netif_running(ndev))) {
> @@ -930,47 +1122,105 @@ static int cpsw_rx_handler(void *token, int len, int status)
>  			 * in reducing of the number of rx descriptor in
>  			 * DMA engine, requeue skb back to cpdma.
>  			 */
> -			new_skb = skb;
> +			new_page = page;
> +			new_xmeta = xmeta;
>  			goto requeue;
>  		}
>  
>  		/* the interface is going down, skbs are purged */
> -		dev_kfree_skb_any(skb);
> +		page_pool_recycle_direct(pool, page);
>  		return 0;
>  	}
>  
> -	new_skb = netdev_alloc_skb_ip_align(ndev, cpsw->rx_packet_max);
> -	if (new_skb) {
> -		skb_copy_queue_mapping(new_skb, skb);
> -		skb_put(skb, len);
> -		if (status & CPDMA_RX_VLAN_ENCAP)
> -			cpsw_rx_vlan_encap(skb);
> -		priv = netdev_priv(ndev);
> -		if (priv->rx_ts_enabled)
> -			cpts_rx_timestamp(cpsw->cpts, skb);
> -		skb->protocol = eth_type_trans(skb, ndev);
> -		netif_receive_skb(skb);
> -		ndev->stats.rx_bytes += len;
> -		ndev->stats.rx_packets++;
> -		kmemleak_not_leak(new_skb);
> -	} else {
> +	new_page = cpsw_alloc_page(cpsw);
> +	if (unlikely(!new_page)) {
> +		new_page = page;
> +		new_xmeta = xmeta;
>  		ndev->stats.rx_dropped++;
> -		new_skb = skb;
> +		goto requeue;
>  	}
> +	new_xmeta = page_address(new_page);
> +
> +	priv = netdev_priv(ndev);
> +	if (priv->xdp_prog) {
> +		xdp_set_data_meta_invalid(&xdp);
> +
> +		if (status & CPDMA_RX_VLAN_ENCAP) {
> +			xdp.data = (u8 *)xmeta + CPSW_HEADROOM +
> +				   CPSW_RX_VLAN_ENCAP_HDR_SIZE;
> +			xdp.data_end = xdp.data + len -
> +				       CPSW_RX_VLAN_ENCAP_HDR_SIZE;
> +		} else {
> +			xdp.data = (u8 *)xmeta + CPSW_HEADROOM;
> +			xdp.data_end = xdp.data + len;
> +		}
> +
> +		xdp.data_hard_start = xmeta;
> +		xdp.rxq = &priv->xdp_rxq[ch];
> +
> +		ret = cpsw_run_xdp(priv, &cpsw->rxv[ch], &xdp);
> +		if (ret) {
> +			if (ret == 2)
> +				flush = 1;
> +
> +			goto requeue;
> +		}
> +
> +		/* XDP prog might have changed packet data and boundaries */
> +		len = xdp.data_end - xdp.data;
> +		headroom = xdp.data - xdp.data_hard_start;
> +	}
> +
> +	/* Build the skb and pass it to the networking stack if XDP is off
> +	 * or the XDP prog returned XDP_PASS
> +	 */
> +	skb = build_skb(xmeta, cpsw_rxbuf_total_len(pkt_size));
> +	if (!skb) {
> +		ndev->stats.rx_dropped++;
> +		page_pool_recycle_direct(pool, page);
> +		goto requeue;
> +	}
> +
> +	skb_reserve(skb, headroom);
> +	skb_put(skb, len);
> +	skb->dev = ndev;
> +	if (status & CPDMA_RX_VLAN_ENCAP)
> +		cpsw_rx_vlan_encap(skb);
> +	if (priv->rx_ts_enabled)
> +		cpts_rx_timestamp(cpsw->cpts, skb);
> +	skb->protocol = eth_type_trans(skb, ndev);
> +
> +	/* as cpsw handles one packet per NAPI, recycle the page before
> +	 * increasing its refcounter, holding it in the page pool cache
> +	 */
> +	page_pool_recycle_direct(pool, page);
> +
> +	/* the ref is decremented by the netstack, after which the page can
> +	 * be allocated again in cpsw_alloc_page()
> +	 */
> +	page_ref_inc(page);
> +	netif_receive_skb(skb);
> +
> +	ndev->stats.rx_bytes += len;
> +	ndev->stats.rx_packets++;
>  
>  requeue:
>  	if (netif_dormant(ndev)) {
> -		dev_kfree_skb_any(new_skb);
> -		return 0;
> +		page_pool_recycle_direct(pool, new_page);
> +		return flush;
>  	}
>  
> -	ch = cpsw->rxv[skb_get_queue_mapping(new_skb)].ch;
> -	ret = cpdma_chan_submit(ch, new_skb, new_skb->data,
> -				skb_tailroom(new_skb), 0);
> +	new_xmeta->ndev = ndev;
> +	new_xmeta->ch = ch;
> +	dma = new_page->dma_addr + CPSW_HEADROOM;
> +	ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, (void *)dma,
> +				       pkt_size, 0);
>  	if (WARN_ON(ret < 0))
> -		dev_kfree_skb_any(new_skb);
> +		page_pool_recycle_direct(pool, new_page);
> +	else
> +		kmemleak_not_leak(new_xmeta); /* Is it needed? */
>  
> -	return 0;
> +	return flush;
>  }

On a quick scan I don't see any DMA syncs; does the DMA driver take care
of making sure the DMA sync happens?
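If it doesn't, I'd expect a sync before the XDP prog (or the stack)
looks at the data, and one before the requeued buffer is handed back to
the hardware. A minimal sketch, reusing the names from this function and
assuming page->dma_addr is the page_pool mapping:

	/* make the received bytes visible to the CPU before the XDP
	 * prog / netstack read them
	 */
	dma_sync_single_for_cpu(cpsw->dev, page->dma_addr + CPSW_HEADROOM,
				len, DMA_FROM_DEVICE);

	/* ... and hand the requeued buffer back to the device */
	dma_sync_single_for_device(cpsw->dev,
				   new_page->dma_addr + CPSW_HEADROOM,
				   pkt_size, DMA_FROM_DEVICE);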
>  static void cpsw_split_res(struct net_device *ndev)
> @@ -2684,6 +2949,63 @@ static int cpsw_ndo_setup_tc(struct net_device *ndev, enum tc_setup_type type,
>  	}
>  }
>  
> +static int cpsw_xdp_prog_setup(struct net_device *ndev, struct bpf_prog *prog)
> +{
> +	struct cpsw_priv *priv = netdev_priv(ndev);
> +	struct bpf_prog *old_prog;
> +
> +	if (!priv->xdp_prog && !prog)
> +		return 0;
> +
> +	old_prog = xchg(&priv->xdp_prog, prog);
> +	if (old_prog)
> +		bpf_prog_put(old_prog);
> +
> +	return 0;
> +}
> +
> +static int cpsw_ndo_bpf(struct net_device *ndev, struct netdev_bpf *bpf)
> +{
> +	struct cpsw_priv *priv = netdev_priv(ndev);
> +
> +	switch (bpf->command) {
> +	case XDP_SETUP_PROG:
> +		return cpsw_xdp_prog_setup(ndev, bpf->prog);
> +
> +	case XDP_QUERY_PROG:
> +		bpf->prog_id = priv->xdp_prog ? priv->xdp_prog->aux->id : 0;

Consider using xdp_attachment_query() and friends. This way you'll also
return the flags.
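Roughly like this (untested sketch; assumes a struct xdp_attachment_info
xdpi member is added to struct cpsw_priv):

static int cpsw_xdp_prog_setup(struct net_device *ndev,
			       struct netdev_bpf *bpf)
{
	struct cpsw_priv *priv = netdev_priv(ndev);

	if (!priv->xdpi.prog && !bpf->prog)
		return 0;

	if (!xdp_attachment_flags_ok(&priv->xdpi, bpf))
		return -EBUSY;

	WRITE_ONCE(priv->xdp_prog, bpf->prog);
	/* stores prog + flags, drops the ref on the old prog */
	xdp_attachment_setup(&priv->xdpi, bpf);

	return 0;
}

static int cpsw_ndo_bpf(struct net_device *ndev, struct netdev_bpf *bpf)
{
	struct cpsw_priv *priv = netdev_priv(ndev);

	switch (bpf->command) {
	case XDP_SETUP_PROG:
		return cpsw_xdp_prog_setup(ndev, bpf);
	case XDP_QUERY_PROG:
		return xdp_attachment_query(&priv->xdpi, bpf);
	default:
		return -EINVAL;
	}
}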
> +		return 0;
> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static int cpsw_ndo_xdp_xmit(struct net_device *ndev, int n,
> +			     struct xdp_frame **frames, u32 flags)
> +{
> +	struct cpsw_priv *priv = netdev_priv(ndev);
> +	struct xdp_frame *xdpf;
> +	int i, drops = 0;
> +
> +	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
> +		return -EINVAL;
> +
> +	for (i = 0; i < n; i++) {
> +		xdpf = frames[i];
> +		if (xdpf->len < CPSW_MIN_PACKET_SIZE) {
> +			xdp_return_frame_rx_napi(xdpf);
> +			drops++;
> +			continue;
> +		}
> +
> +		if (cpsw_xdp_tx_frame(priv, xdpf))
> +			drops++;
> +	}
> +
> +	return n - drops;
> +}
> +
>  static const struct net_device_ops cpsw_netdev_ops = {
>  	.ndo_open = cpsw_ndo_open,
>  	.ndo_stop = cpsw_ndo_stop,
> @@ -2700,6 +3022,8 @@ static const struct net_device_ops cpsw_netdev_ops = {
>  	.ndo_vlan_rx_add_vid = cpsw_ndo_vlan_rx_add_vid,
>  	.ndo_vlan_rx_kill_vid = cpsw_ndo_vlan_rx_kill_vid,
>  	.ndo_setup_tc = cpsw_ndo_setup_tc,
> +	.ndo_bpf = cpsw_ndo_bpf,
> +	.ndo_xdp_xmit = cpsw_ndo_xdp_xmit,
>  };
>  
>  static int cpsw_get_regs_len(struct net_device *ndev)
> @@ -2920,6 +3244,57 @@ static int cpsw_check_ch_settings(struct cpsw_common *cpsw,
>  	return 0;
>  }
>  
> +static void cpsw_xdp_rxq_unreg(struct cpsw_common *cpsw, int ch)
> +{
> +	struct cpsw_slave *slave;
> +	struct cpsw_priv *priv;
> +	int i;
> +
> +	for (i = cpsw->data.slaves, slave = cpsw->slaves; i; i--, slave++) {
> +		if (!slave->ndev)
> +			continue;
> +
> +		priv = netdev_priv(slave->ndev);
> +		xdp_rxq_info_unreg(&priv->xdp_rxq[ch]);
> +	}
> +}
> +
> +static int cpsw_xdp_rxq_reg(struct cpsw_common *cpsw, int ch)
> +{
> +	struct cpsw_slave *slave;
> +	struct cpsw_priv *priv;
> +	int i, ret;
> +
> +	/* As channels are common for both ports sharing same queues, xdp_rxq
> +	 * information also becomes shared and used by every packet on this
> +	 * channel. But each xdp_rxq holds a link on a netdev, which in theory
> +	 * can have a different memory model, so each network device must hold
> +	 * its own set of rxqs and thus both netdevs should be prepared
> +	 */
> +	for (i = cpsw->data.slaves, slave = cpsw->slaves; i; i--, slave++) {
> +		if (!slave->ndev)
> +			continue;
> +
> +		priv = netdev_priv(slave->ndev);
> +
> +		ret = xdp_rxq_info_reg(&priv->xdp_rxq[ch], priv->ndev, ch);
> +		if (ret)
> +			goto err;
> +
> +		ret = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq[ch],
> +						 MEM_TYPE_PAGE_POOL,
> +						 cpsw->rx_page_pool);
> +		if (ret)
> +			goto err;
> +	}
> +
> +	return ret;
> +
> +err:
> +	cpsw_xdp_rxq_unreg(cpsw, ch);
> +	return ret;
> +}
> +
>  static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
>  {
>  	struct cpsw_common *cpsw = priv->cpsw;
> @@ -2950,6 +3325,11 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
>  		if (!vec[*ch].ch)
>  			return -EINVAL;
>  
> +		if (rx && cpsw_xdp_rxq_reg(cpsw, *ch)) {
> +			cpdma_chan_destroy(vec[*ch].ch);
> +			return -EINVAL;
> +		}
> +
>  		cpsw_info(priv, ifup, "created new %d %s channel\n", *ch,
>  			  (rx ? "rx" : "tx"));
>  		(*ch)++;
> @@ -2958,6 +3338,9 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
>  	while (*ch > ch_num) {
>  		(*ch)--;
>  
> +		if (rx)
> +			cpsw_xdp_rxq_unreg(cpsw, *ch);
> +
>  		ret = cpdma_chan_destroy(vec[*ch].ch);
>  		if (ret)
>  			return ret;
> @@ -3446,6 +3829,15 @@ static int cpsw_probe_dual_emac(struct cpsw_priv *priv)
>  	ndev->netdev_ops = &cpsw_netdev_ops;
>  	ndev->ethtool_ops = &cpsw_ethtool_ops;
>  
> +	ret = xdp_rxq_info_reg(&priv_sl2->xdp_rxq[0], ndev, 0);
> +	if (ret)
> +		return ret;
> +
> +	ret = xdp_rxq_info_reg_mem_model(&priv_sl2->xdp_rxq[0],
> +					 MEM_TYPE_PAGE_SHARED, NULL);
> +	if (ret)
> +		return ret;
> +
>  	/* register the network device */
>  	SET_NETDEV_DEV(ndev, cpsw->dev);
>  	ret = register_netdev(ndev);
> @@ -3517,6 +3909,12 @@ static int cpsw_probe(struct platform_device *pdev)
>  		goto clean_ndev_ret;
>  	}
>  
> +	cpsw->rx_page_pool = cpsw_create_rx_pool(cpsw);
> +	if (IS_ERR(cpsw->rx_page_pool)) {
> +		dev_err(&pdev->dev, "create rx page pool\n");
> +		goto clean_ndev_ret;
> +	}
> +
>  	/*
>  	 * This may be required here for child devices.
>  	 */
> @@ -3663,20 +4061,31 @@ static int cpsw_probe(struct platform_device *pdev)
>  		cpsw->quirk_irq = 1;
>  
>  	ch = cpsw->quirk_irq ? 0 : 7;
> -	cpsw->txv[0].ch = cpdma_chan_create(cpsw->dma, ch, cpsw_tx_handler, 0);
> +	cpsw->txv[0].ch =
> +		cpdma_chan_create(cpsw->dma, ch, cpsw_tx_handler, 0);
>  	if (IS_ERR(cpsw->txv[0].ch)) {
>  		dev_err(priv->dev, "error initializing tx dma channel\n");
>  		ret = PTR_ERR(cpsw->txv[0].ch);
>  		goto clean_dma_ret;
>  	}
>  
> -	cpsw->rxv[0].ch = cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
> +	cpsw->rxv[0].ch =
> +		cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
>  	if (IS_ERR(cpsw->rxv[0].ch)) {
>  		dev_err(priv->dev, "error initializing rx dma channel\n");
>  		ret = PTR_ERR(cpsw->rxv[0].ch);
>  		goto clean_dma_ret;
>  	}
>  
> +	ret = xdp_rxq_info_reg(&priv->xdp_rxq[0], ndev, 0);
> +	if (ret)
> +		goto clean_dma_ret;
> +
> +	ret = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq[0], MEM_TYPE_PAGE_POOL,
> +					 cpsw->rx_page_pool);
> +	if (ret)
> +		goto clean_dma_ret;
> +
>  	ale_params.dev = &pdev->dev;
>  	ale_params.ale_ageout = ale_ageout;
>  	ale_params.ale_entries = data->ale_entries;

I think you need to unreg the mem model somewhere on the failure path,
no?

> @@ -3786,6 +4195,7 @@ static int cpsw_probe(struct platform_device *pdev)
>  	pm_runtime_put_sync(&pdev->dev);
>  clean_runtime_disable_ret:
>  	pm_runtime_disable(&pdev->dev);
> +	page_pool_destroy(cpsw->rx_page_pool);
>  clean_ndev_ret:
>  	free_netdev(priv->ndev);
>  	return ret;
> @@ -3809,6 +4219,7 @@ static int cpsw_remove(struct platform_device *pdev)
>  
>  	cpts_release(cpsw->cpts);
>  	cpdma_ctlr_destroy(cpsw->dma);
> +	page_pool_destroy(cpsw->rx_page_pool);
>  	cpsw_remove_dt(pdev);
>  	pm_runtime_put_sync(&pdev->dev);
>  	pm_runtime_disable(&pdev->dev);