Hangbin Liu wrote: > From: Jesper Dangaard Brouer <brouer@xxxxxxxxxx> > > This changes the devmap XDP program support to run the program when the > bulk queue is flushed instead of before the frame is enqueued. This has > a couple of benefits: > > - It "sorts" the packets by destination devmap entry, and then runs the > same BPF program on all the packets in sequence. This ensures that we > keep the XDP program and destination device properties hot in I-cache. > > - It makes the multicast implementation simpler because it can just > enqueue packets using bq_enqueue() without having to deal with the > devmap program at all. > > The drawback is that if the devmap program drops the packet, the enqueue > step is redundant. However, arguably this is mostly visible in a > micro-benchmark, and with more mixed traffic the I-cache benefit should > win out. The performance impact of just this patch is as follows: > > The bq_xmit_all's logic is also refactored and error label is removed. > When bq_xmit_all() is called from bq_enqueue(), another packet will > always be enqueued immediately after, so clearing dev_rx, xdp_prog and > flush_node in bq_xmit_all() is redundant. Let's move the clear to > __dev_flush(), and only check them once in bq_enqueue() since they are > all modified together. > > By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd: > ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64 > > There are about +/- 0.1M deviation for native testing, the performance > improved for the base-case, but some drop back with xdp devmap prog attached. > > Version | Test | Generic | Native | Native + 2nd xdp_prog > 5.10 rc6 | xdp_redirect_map i40e->i40e | 2.0M | 9.1M | 8.0M > 5.10 rc6 | xdp_redirect_map i40e->veth | 1.7M | 11.0M | 9.7M > 5.10 rc6 + patch | xdp_redirect_map i40e->i40e | 2.0M | 9.5M | 7.5M > 5.10 rc6 + patch | xdp_redirect_map i40e->veth | 1.7M | 11.6M | 9.1M > [...] > +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, > + struct xdp_frame **frames, int n, > + struct net_device *dev) > +{ > + struct xdp_txq_info txq = { .dev = dev }; > + struct xdp_buff xdp; > + int i, nframes = 0; > + > + for (i = 0; i < n; i++) { > + struct xdp_frame *xdpf = frames[i]; > + u32 act; > + int err; > + > + xdp_convert_frame_to_buff(xdpf, &xdp); > + xdp.txq = &txq; > + > + act = bpf_prog_run_xdp(xdp_prog, &xdp); > + switch (act) { > + case XDP_PASS: > + err = xdp_update_frame_from_buff(&xdp, xdpf); > + if (unlikely(err < 0)) > + xdp_return_frame_rx_napi(xdpf); > + else > + frames[nframes++] = xdpf; > + break; > + default: > + bpf_warn_invalid_xdp_action(act); > + fallthrough; > + case XDP_ABORTED: > + trace_xdp_exception(dev, xdp_prog, act); > + fallthrough; > + case XDP_DROP: > + xdp_return_frame_rx_napi(xdpf); > + break; > + } > + } > + return nframes; /* sent frames count */ > +} > + > static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) > { > struct net_device *dev = bq->dev; > - int sent = 0, drops = 0, err = 0; > + unsigned int cnt = bq->count; > + int drops = 0, err = 0; > + int to_send = cnt; > + int sent = cnt; > int i; > > - if (unlikely(!bq->count)) > + if (unlikely(!cnt)) > return; > > - for (i = 0; i < bq->count; i++) { > + for (i = 0; i < cnt; i++) { > struct xdp_frame *xdpf = bq->q[i]; > > prefetch(xdpf); > } > > - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); > + if (bq->xdp_prog) { > + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); > + if (!to_send) { > + sent = 0; > + goto out; > + } > + drops = cnt - to_send; > + } I might be missing something about how *bq works here. What happens when dev_map_bpf_prog_run returns to_send < cnt? So I read this as it will send [0, to_send] and [to_send, cnt] will be dropped? How do we know the bpf prog would have dropped the set, [to_send+1, cnt]? > + > + sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags); > if (sent < 0) { > err = sent; > sent = 0; > - goto error; > + > + /* If ndo_xdp_xmit fails with an errno, no frames have been > + * xmit'ed and it's our responsibility to them free all. > + */ > + for (i = 0; i < cnt - drops; i++) { > + struct xdp_frame *xdpf = bq->q[i]; > + > + xdp_return_frame_rx_napi(xdpf); > + } > } > - drops = bq->count - sent; > out: > + drops = cnt - sent; > bq->count = 0; > > trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); I gather the remaining [to_send+1, cnt] packets are in fact dropped because we set bq->count=0 and trace_xdp_devmap_xmit seems to think they are dropped. Also update the comment in trace_xdp_devmap_xmit, /* Catch API error of drv ndo_xdp_xmit sent more than count */ so that it reads to account for devmap progs as well? > - bq->dev_rx = NULL; > - __list_del_clearprev(&bq->flush_node); > return; > -error: > - /* If ndo_xdp_xmit fails with an errno, no frames have been > - * xmit'ed and it's our responsibility to them free all. > - */ > - for (i = 0; i < bq->count; i++) { > - struct xdp_frame *xdpf = bq->q[i]; > - > - xdp_return_frame_rx_napi(xdpf); > - drops++; > - } > - goto out; > } [...] > -static struct xdp_buff *dev_map_run_prog(struct net_device *dev, > - struct xdp_buff *xdp, > - struct bpf_prog *xdp_prog) > -{ > - struct xdp_txq_info txq = { .dev = dev }; > - u32 act; > - > - xdp_set_data_meta_invalid(xdp); > - xdp->txq = &txq; > - > - act = bpf_prog_run_xdp(xdp_prog, xdp); > - switch (act) { > - case XDP_PASS: > - return xdp; > - case XDP_DROP: > - break; > - default: > - bpf_warn_invalid_xdp_action(act); > - fallthrough; > - case XDP_ABORTED: > - trace_xdp_exception(dev, xdp_prog, act); > - break; > - } > - > - xdp_return_buff(xdp); > - return NULL; > -} > - > int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, > struct net_device *dev_rx) > { > - return __xdp_enqueue(dev, xdp, dev_rx); > + return __xdp_enqueue(dev, xdp, dev_rx, NULL); > } > > int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > @@ -489,12 +516,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > { > struct net_device *dev = dst->dev; > > - if (dst->xdp_prog) { > - xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); > - if (!xdp) > - return 0; So here it looks like dev_map_run_prog will not drop extra packets, but will see every single packet. Are we changing the semantics subtle here? This looks like a problem to me. We should not drop packets in the new case unless bpf program tells us to. > - } > - return __xdp_enqueue(dev, xdp, dev_rx); > + return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); > } Did I miss something? If not maybe a comment in the commit message would help or in the code itself. Thanks, John