XDP_REDIRECT support for mergeable buffers was removed by commit
7324f5399b06 ("virtio_net: disable XDP_REDIRECT in receive_mergeable()
case") because we do not reserve enough tailroom for struct
skb_shared_info, which breaks XDP's assumptions. A further complaint is
that the complex linearization logic, combined with the EWMA buffer size
estimation, increases the likelihood of having to linearize.

Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
---
 drivers/net/virtio_net.c | 107 +++++++++++++++++++++++++++++------------------
 1 file changed, 67 insertions(+), 40 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 9bb9e56..81190ba 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -537,6 +537,26 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 	return NULL;
 }
 
+static struct sk_buff *virtnet_skb_xdp(struct receive_queue *rq,
+				       struct sk_buff *skb)
+{
+	struct bpf_prog *xdp_prog;
+	int ret;
+
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(rq->xdp_prog);
+	if (xdp_prog) {
+		ret = do_xdp_generic(xdp_prog, skb);
+		if (ret != XDP_PASS) {
+			rcu_read_unlock();
+			return NULL;
+		}
+	}
+	rcu_read_unlock();
+
+	return skb;
+}
+
 static struct sk_buff *receive_small(struct net_device *dev,
 				     struct virtnet_info *vi,
 				     struct receive_queue *rq,
@@ -689,31 +709,30 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	struct bpf_prog *xdp_prog;
 	unsigned int truesize;
 	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
-	bool sent;
+	bool sent, skb_xdp = false;
+	int err;
 
 	head_skb = NULL;
 
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(rq->xdp_prog);
 	if (xdp_prog) {
-		struct page *xdp_page;
 		struct xdp_buff xdp;
 		void *data;
 		u32 act;
 
-		/* This happens when rx buffer size is underestimated */
+		/* This happens when rx buffer size is underestimated
+		 * or headroom is not enough because of the buffer
+		 * was refilled before XDP is set. In both cases,
+		 * for simplicity, we will offload them to generic
+		 * XDP routine. This should only happen for the first
+		 * several packets, so we don't care much about its
+		 * performance.
+		 */
 		if (unlikely(num_buf > 1 ||
 			     headroom < virtnet_get_headroom(vi))) {
-			/* linearize data for XDP */
-			xdp_page = xdp_linearize_page(rq, &num_buf,
-						      page, offset,
-						      VIRTIO_XDP_HEADROOM,
-						      &len);
-			if (!xdp_page)
-				goto err_xdp;
-			offset = VIRTIO_XDP_HEADROOM;
-		} else {
-			xdp_page = page;
+			skb_xdp = true;
+			goto skb_xdp;
 		}
 
 		/* Transient failure which in theory could occur if
@@ -727,7 +746,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		/* Allow consuming headroom but reserve enough space to push
 		 * the descriptor on if we get an XDP_TX return code.
 		 */
-		data = page_address(xdp_page) + offset;
+		data = page_address(page) + offset;
 		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
 		xdp.data = data + vi->hdr_len;
 		xdp_set_data_meta_invalid(&xdp);
@@ -736,9 +755,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
-		if (act != XDP_PASS)
-			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
-
 		switch (act) {
 		case XDP_PASS:
 			/* recalculate offset to account for any header
@@ -746,28 +762,22 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			 * skb and avoid using offset
 			 */
 			offset = xdp.data -
-					page_address(xdp_page) - vi->hdr_len;
-
-			/* We can only create skb based on xdp_page.
-			 */
-			if (unlikely(xdp_page != page)) {
-				rcu_read_unlock();
-				put_page(page);
-				head_skb = page_to_skb(vi, rq, xdp_page,
-						       offset, len, PAGE_SIZE);
-				return head_skb;
-			}
+					page_address(page) - vi->hdr_len;
 			break;
 		case XDP_TX:
 			sent = __virtnet_xdp_xmit(vi, &xdp);
 			if (unlikely(!sent)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
-				if (unlikely(xdp_page != page))
-					put_page(xdp_page);
 				goto err_xdp;
 			}
 			*xdp_xmit = true;
-			if (unlikely(xdp_page != page))
+			rcu_read_unlock();
+			goto xdp_xmit;
+		case XDP_REDIRECT:
+			err = xdp_do_redirect(dev, &xdp, xdp_prog);
+			if (err)
 				goto err_xdp;
+			*xdp_xmit = true;
 			rcu_read_unlock();
 			goto xdp_xmit;
 		default:
@@ -775,13 +785,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		case XDP_ABORTED:
 			trace_xdp_exception(vi->dev, xdp_prog, act);
 		case XDP_DROP:
-			if (unlikely(xdp_page != page))
-				__free_pages(xdp_page, 0);
 			goto err_xdp;
 		}
 	}
 	rcu_read_unlock();
 
+skb_xdp:
 	truesize = mergeable_ctx_to_truesize(ctx);
 	if (unlikely(len > truesize)) {
 		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
@@ -848,7 +857,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		}
 	}
 
-	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
+	if (skb_xdp)
+		head_skb = virtnet_skb_xdp(rq, head_skb);
+	else
+		ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
+
 	return head_skb;
 
 err_xdp:
@@ -1013,13 +1026,18 @@ static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
 }
 
 static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
-					  struct ewma_pkt_len *avg_pkt_len)
+					  struct ewma_pkt_len *avg_pkt_len,
+					  unsigned int room)
 {
 	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	unsigned int len;
 
-	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
+	if (room)
+		return PAGE_SIZE - room;
+
+	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
 				rq->min_buf_len, PAGE_SIZE - hdr_len);
+
 	return ALIGN(len, L1_CACHE_BYTES);
 }
 
@@ -1028,21 +1046,27 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
 {
 	struct page_frag *alloc_frag = &rq->alloc_frag;
 	unsigned int headroom = virtnet_get_headroom(vi);
+	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
+	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
 	char *buf;
 	void *ctx;
 	int err;
 	unsigned int len, hole;
 
-	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len);
-	if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
+	/* Extra tailroom is needed to satisfy XDP's assumption. This
+	 * means rx frags coalescing won't work, but consider we've
+	 * disabled GSO for XDP, it won't be a big issue.
+	 */
+	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
+	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
 		return -ENOMEM;
 
 	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
 	buf += headroom; /* advance address leaving hole at front of pkt */
 	get_page(alloc_frag->page);
-	alloc_frag->offset += len + headroom;
+	alloc_frag->offset += len + room;
 	hole = alloc_frag->size - alloc_frag->offset;
-	if (hole < len + headroom) {
+	if (hole < len + room) {
 		/* To avoid internal fragmentation, if there is very likely not
 		 * enough space for another buffer, add the remaining space to
 		 * the current buffer.
@@ -2576,12 +2600,15 @@ static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
 {
 	struct virtnet_info *vi = netdev_priv(queue->dev);
 	unsigned int queue_index = get_netdev_rx_queue_index(queue);
+	unsigned int headroom = virtnet_get_headroom(vi);
+	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
 	struct ewma_pkt_len *avg;
 
 	BUG_ON(queue_index >= vi->max_queue_pairs);
 	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
 	return sprintf(buf, "%u\n",
-		       get_mergeable_buf_len(&vi->rq[queue_index], avg));
+		       get_mergeable_buf_len(&vi->rq[queue_index], avg,
+				       SKB_DATA_ALIGN(headroom + tailroom)));
 }
 
 static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
-- 
2.7.4
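P.S. As a side note for reviewers, here is a rough userspace sketch of the buffer
sizing this patch introduces. Once an XDP program is attached (non-zero headroom),
add_recvbuf_mergeable() reserves room = SKB_DATA_ALIGN(headroom + tailroom) per
buffer and get_mergeable_buf_len() simply returns PAGE_SIZE - room instead of the
EWMA estimate, so struct skb_shared_info always fits behind the frame. The
constants below (4 KiB page, 256-byte VIRTIO_XDP_HEADROOM, an assumed 320-byte
skb_shared_info, 64-byte cache lines) are illustrative stand-ins, not values taken
from this patch.

#include <stdio.h>

/* Illustrative stand-ins; the real values come from the running kernel. */
#define PAGE_SIZE            4096u
#define VIRTIO_XDP_HEADROOM   256u  /* headroom used when XDP is attached */
#define SKB_SHINFO_SIZE       320u  /* assumed sizeof(struct skb_shared_info) */
#define SMP_CACHE_BYTES        64u
#define SKB_DATA_ALIGN(x)    (((x) + SMP_CACHE_BYTES - 1) & ~(SMP_CACHE_BYTES - 1))

/* Mirrors the new get_mergeable_buf_len() fast path: with XDP attached
 * (room != 0) every receive buffer is PAGE_SIZE minus head+tail room, so the
 * EWMA estimate can no longer under-size the buffer and force linearizing. */
static unsigned int mergeable_buf_len(unsigned int room)
{
	if (room)
		return PAGE_SIZE - room;
	return 0;  /* non-XDP case: the EWMA-based estimate would be used */
}

int main(void)
{
	unsigned int headroom = VIRTIO_XDP_HEADROOM;
	unsigned int tailroom = SKB_SHINFO_SIZE;
	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);

	printf("room = %u bytes, per-buffer len = %u bytes\n",
	       room, mergeable_buf_len(room));
	return 0;
}

With those assumed values the reserved room works out to 576 bytes, leaving 3520
bytes for the frame itself within each 4 KiB page: 256 bytes of headroom in front
of the buffer handed to the device and 320 bytes of tailroom behind it.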