This improves XDP_TX performance by about 8%. Instead of calling
veth_xdp_xmit() once per frame, XDP_TX frames are now queued on a
per-CPU bulk queue and flushed with a single veth_xdp_xmit() call,
which amortizes the lock taken per transmit; the drop in
_raw_spin_lock time below reflects this.

Here are single-core XDP_TX test results. CPU consumption figures are
taken from "perf report --no-children".

- Before:

  7.26 Mpps

  _raw_spin_lock  7.83%
  veth_xdp_xmit  12.23%

- After:

  7.84 Mpps

  _raw_spin_lock  1.17%
  veth_xdp_xmit   6.45%

Signed-off-by: Toshiaki Makita <makita.toshiaki@xxxxxxxxxxxxx>
---
 drivers/net/veth.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 52110e5..4edc75f 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
 	return ret;
 }
 
+static void veth_xdp_flush_bq(struct net_device *dev)
+{
+	struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq);
+	int sent, i, err = 0;
+
+	sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);
+	if (sent < 0) {
+		err = sent;
+		sent = 0;
+		for (i = 0; i < bq->count; i++)
+			xdp_return_frame(bq->q[i]);
+	}
+	trace_xdp_bulk_tx(dev, sent, bq->count - sent, err);
+
+	bq->count = 0;
+}
+
 static void veth_xdp_flush(struct net_device *dev)
 {
 	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
@@ -449,6 +466,7 @@ static void veth_xdp_flush(struct net_device *dev)
 	struct veth_rq *rq;
 
 	rcu_read_lock();
+	veth_xdp_flush_bq(dev);
 	rcv = rcu_dereference(priv->peer);
 	if (unlikely(!rcv))
 		goto out;
@@ -466,12 +484,18 @@ static void veth_xdp_flush(struct net_device *dev)
 
 static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 {
+	struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq);
 	struct xdp_frame *frame = convert_to_xdp_frame(xdp);
 
 	if (unlikely(!frame))
 		return -EOVERFLOW;
 
-	return veth_xdp_xmit(dev, 1, &frame, 0);
+	if (unlikely(bq->count == XDP_TX_BULK_SIZE))
+		veth_xdp_flush_bq(dev);
+
+	bq->q[bq->count++] = frame;
+
+	return 0;
 }
 
 static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
-- 
1.8.3.1
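
Note: struct xdp_tx_bulk_queue, the per-CPU variable xdp_tx_bq and
XDP_TX_BULK_SIZE are used but not defined in this diff, so they are
presumably introduced elsewhere in the series. A minimal sketch of
what this patch assumes them to look like (the queue depth of 16 is
an assumption, not taken from this posting):

	/* Sketch only: names match the diff above, but the real
	 * definitions live outside this patch.
	 */
	#define XDP_TX_BULK_SIZE	16	/* assumed depth */

	struct xdp_tx_bulk_queue {
		struct xdp_frame *q[XDP_TX_BULK_SIZE];
		unsigned int count;
	};

	static DEFINE_PER_CPU(struct xdp_tx_bulk_queue, xdp_tx_bq);

Because the queue is per-CPU and only touched from NAPI (softirq)
context, no lock is needed on the queue itself; the implied contract
is that veth_xdp_tx() and the final veth_xdp_flush() run on the same
CPU within one poll, so the queue is always drained before leaving
NAPI context.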
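
For context on where the gain comes from: veth_xdp_xmit() takes the
peer ring's producer lock once per call and produces all n frames
under it, so going from one call per frame to one call per bulk cuts
lock acquisitions by up to a factor of XDP_TX_BULK_SIZE. A simplified
sketch of that pre-existing transmit loop (reconstructed from memory
of veth.c around this time, so treat the details as approximate):

	spin_lock(&rq->xdp_ring.producer_lock);	/* once per bulk */
	for (i = 0; i < n; i++) {
		struct xdp_frame *frame = frames[i];

		/* oversized frames or a full ring count as drops */
		if (unlikely(frame->len > max_len ||
			     __ptr_ring_produce(&rq->xdp_ring,
						veth_xdp_to_ptr(frame)))) {
			xdp_return_frame_rx_napi(frame);
			drops++;
		}
	}
	spin_unlock(&rq->xdp_ring.producer_lock);

__ptr_ring_produce() is the variant that assumes the caller already
holds the producer lock, which is what makes a single lock/unlock
pair around the whole bulk valid.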