Due to long-term development, the current code structure of page_to_skb() and the semantic expression of variables are rather chaotic, so it is necessary to reconstruct this piece of logic. The code has been tested with the following test code. Call test_merge and test_big to test "merge" and "big" modes respectively. ================================= test.sh ======================== function test() { flag=$1 max=$2 inc=$3 if [ "x$max" == "x" ] then max=4096 fi echo $max for s in $(seq 64 $inc $max) do echo >> log echo $flag :UDP_STREAM $s >> log netperf -H 192.168.122.202 -l 5 -t UDP_STREAM -- -m $s >> log echo $flag :UDP_STREAM $s $? done for s in $(seq 64 $inc $max) do echo >> log echo $flag :TCP_STREAM $s >> log netperf -H 192.168.122.202 -l 5 -t TCP_STREAM -- -m $s >> log echo $flag :TCP_STREAM $s $? done } function test_merge() { XDP='no-xdp' ssh root@192.168.122.202 ip link set dev eth0 xdp off test $XDP XDP='xdp-pass' ssh root@192.168.122.202 ip link set dev eth0 xdp off ssh root@192.168.122.202 ip link set dev eth0 xdp object xdp.o sec xdp test $XDP XDP='xdp-meta' ssh root@192.168.122.202 ip link set dev eth0 xdp off ssh root@192.168.122.202 ip link set dev eth0 xdp object xdp_meta.o sec xdp_mark test $XDP } function test_big() { # set host net dev mtu to 60000 test "big" 5000 test "big" 60000 10 } ssh root@192.168.122.202 ./netserver echo '' > log test_merge # or test_big ================================== xdp.c ================================= SEC("xdp") int _xdp(struct xdp_md *xdp) { return XDP_PASS; } char _license[] SEC("license") = "GPL"; ================================== xdp_meta.c ================================= static long (*bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) 54; struct meta_info { __u32 mark; } __attribute__((aligned(4))); SEC("xdp_mark") int _xdp_mark(struct xdp_md *ctx) { struct meta_info *meta; void *data, *data_end; int ret; ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)); if (ret < 0) return XDP_ABORTED; data = (void *)(unsigned long)ctx->data; /* Check data_meta have room for meta_info struct */ meta = (void *)(unsigned long)ctx->data_meta; if ((void *)(meta + 1) > data) return XDP_ABORTED; meta->mark = 42; return XDP_PASS; } char _license[] SEC("license") = "GPL"; =================================================================== Signed-off-by: Xuan Zhuo <xuanzhuo@xxxxxxxxxxxxxxxxx> --- drivers/net/virtio_net.c | 309 +++++++++++++++++++++++---------------- 1 file changed, 185 insertions(+), 124 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 7fda2ae4c40f..a117b3496653 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -249,6 +249,35 @@ struct padded_vnet_hdr { char padding[4]; }; +struct virtnet_page_info { + struct virtnet_info *vi; + struct receive_queue *rq; + + /* this may be the head_page, buf not starts with this page */ + struct page *page; + + /* the allcated buf. this may point to the headroom */ + char *buf; + + /* the size of the buf */ + unsigned int buf_size; + + /* OUT. the offset of the remaining data in the page */ + unsigned int offset; + + char *virtnet_hdr; + + /* packet data. generally point to eth header */ + char *packet; + + /* IN. packet len without virtnet hdr + * OUT. the size of the remaining data + */ + unsigned int len; + + unsigned int metasize; +}; + static bool is_xdp_frame(void *ptr) { return (unsigned long)ptr & VIRTIO_XDP_FLAG; @@ -357,6 +386,89 @@ static void skb_xmit_done(struct virtqueue *vq) netif_wake_subqueue(vi->dev, vq2txq(vq)); } +static struct sk_buff *virtnet_page_to_skb(struct virtnet_page_info *pinfo) +{ + int shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct virtio_net_hdr_mrg_rxbuf *hdr; + struct sk_buff *skb; + int tailroom, copy; + + /* In the case of "big", tailroom may be negative, because len can be + * greater than PAGE_SIZE. + */ + tailroom = pinfo->buf + pinfo->buf_size - (pinfo->packet + pinfo->len); + + if (!NET_IP_ALIGN && tailroom >= shinfo_size) { + skb = build_skb(pinfo->buf, pinfo->buf_size); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, pinfo->packet - pinfo->buf); + skb_put(skb, pinfo->len); + + /* mark. page has been used. */ + pinfo->page = NULL; + } else { + /* copy small data so we can reuse these pages for small data + * + * GOOD_COPY_LEN is used to save network headers, such as eth + * header, ip header, tcp header. If you want to save metadata + * information, we should apply for a larger space. Prevent the + * network header cannot fit in the linear space. + */ + skb = napi_alloc_skb(&pinfo->rq->napi, + pinfo->metasize + GOOD_COPY_LEN); + if (unlikely(!skb)) + return NULL; + + /* Copy all frame if it fits skb->head, otherwise + * we let virtio_net_hdr_to_skb() and GRO pull headers as needed. + */ + if (pinfo->len <= GOOD_COPY_LEN) + copy = pinfo->len; + else + copy = ETH_HLEN; + + skb_put_data(skb, pinfo->packet - pinfo->metasize, + copy + pinfo->metasize); + __skb_pull(skb, pinfo->metasize); + pinfo->len -= copy; + pinfo->offset = pinfo->packet + copy - + (char *)page_address(pinfo->page); + } + + if (pinfo->metasize) + skb_metadata_set(skb, pinfo->metasize); + + if (pinfo->virtnet_hdr) { + hdr = skb_vnet_hdr(skb); + memcpy(hdr, pinfo->virtnet_hdr, pinfo->vi->hdr_len); + } + + return skb; +} + +static struct sk_buff *virtnet_merge_page_to_skb(struct virtnet_page_info *pinfo) +{ + struct sk_buff *skb; + + skb = virtnet_page_to_skb(pinfo); + if (unlikely(!skb)) + return NULL; + + /* page has been used by build_skb() */ + if (!pinfo->page) + return skb; + + if (pinfo->len) + skb_add_rx_frag(skb, 0, pinfo->page, pinfo->offset, pinfo->len, + pinfo->buf_size); + else + put_page(pinfo->page); + + return skb; +} + #define MRG_CTX_HEADER_SHIFT 22 static void *mergeable_len_to_ctx(unsigned int truesize, unsigned int headroom) @@ -375,86 +487,30 @@ static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx) } /* Called from bottom half context */ -static struct sk_buff *page_to_skb(struct virtnet_info *vi, - struct receive_queue *rq, - struct page *page, unsigned int offset, - unsigned int len, unsigned int truesize, - bool hdr_valid, unsigned int metasize, - unsigned int headroom) +static struct sk_buff *virtnet_big_page_to_skb(struct virtnet_page_info *pinfo) { + unsigned int len, offset, truesize; + struct receive_queue *rq; struct sk_buff *skb; - struct virtio_net_hdr_mrg_rxbuf *hdr; - unsigned int copy, hdr_len, hdr_padded_len; - struct page *page_to_free = NULL; - int tailroom, shinfo_size; - char *p, *hdr_p, *buf; + struct page *page; - p = page_address(page) + offset; - hdr_p = p; + /* save next page */ + page = (struct page *)pinfo->page->private; - hdr_len = vi->hdr_len; - if (vi->mergeable_rx_bufs) - hdr_padded_len = sizeof(*hdr); - else - hdr_padded_len = sizeof(struct padded_vnet_hdr); - - /* If headroom is not 0, there is an offset between the beginning of the - * data and the allocated space, otherwise the data and the allocated - * space are aligned. - */ - if (headroom) { - /* Buffers with headroom use PAGE_SIZE as alloc size, - * see add_recvbuf_mergeable() + get_mergeable_buf_len() - */ - truesize = PAGE_SIZE; - tailroom = truesize - len - offset; - buf = page_address(page); - } else { - tailroom = truesize - len; - buf = p; - } - - len -= hdr_len; - offset += hdr_padded_len; - p += hdr_padded_len; - - shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - - /* copy small packet so we can reuse these pages */ - if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) { - skb = build_skb(buf, truesize); - if (unlikely(!skb)) - return NULL; - - skb_reserve(skb, p - buf); - skb_put(skb, len); - goto ok; - } - - /* copy small packet so we can reuse these pages for small data */ - skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN); + skb = virtnet_page_to_skb(pinfo); if (unlikely(!skb)) return NULL; - /* Copy all frame if it fits skb->head, otherwise - * we let virtio_net_hdr_to_skb() and GRO pull headers as needed. - */ - if (len <= skb_tailroom(skb)) - copy = len; - else - copy = ETH_HLEN + metasize; - skb_put_data(skb, p, copy); + rq = pinfo->rq; - len -= copy; - offset += copy; + /* page has been used by build_skb() */ + if (!pinfo->page) + goto end; - if (vi->mergeable_rx_bufs) { - if (len) - skb_add_rx_frag(skb, 0, page, offset, len, truesize); - else - page_to_free = page; - goto ok; - } + page = pinfo->page; + len = pinfo->len; + offset = pinfo->offset; + truesize = pinfo->buf_size; /* * Verify that we can indeed put this data into a skb. @@ -477,23 +533,10 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, offset = 0; } +end: if (page) give_pages(rq, page); -ok: - /* hdr_valid means no XDP, so we can copy the vnet header */ - if (hdr_valid) { - hdr = skb_vnet_hdr(skb); - memcpy(hdr, hdr_p, hdr_len); - } - if (page_to_free) - put_page(page_to_free); - - if (metasize) { - __skb_pull(skb, metasize); - skb_metadata_set(skb, metasize); - } - return skb; } @@ -654,17 +697,17 @@ static unsigned int virtnet_get_headroom(struct virtnet_info *vi) */ static struct page *xdp_linearize_page(struct receive_queue *rq, u16 *num_buf, - struct page *p, - int offset, + void *buf, int page_off, unsigned int *len) { struct page *page = alloc_page(GFP_ATOMIC); + struct page *p; if (!page) return NULL; - memcpy(page_address(page) + page_off, page_address(p) + offset, *len); + memcpy(page_address(page) + page_off, buf, *len); page_off += *len; while (--*num_buf) { @@ -739,18 +782,18 @@ static struct sk_buff *receive_small(struct net_device *dev, goto err_xdp; if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) { - int offset = buf - page_address(page) + header_offset; unsigned int tlen = len + vi->hdr_len; u16 num_buf = 1; + buf += header_offset; + xdp_headroom = virtnet_get_headroom(vi); header_offset = VIRTNET_RX_PAD + xdp_headroom; headroom = vi->hdr_len + header_offset; buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - xdp_page = xdp_linearize_page(rq, &num_buf, page, - offset, header_offset, - &tlen); + xdp_page = xdp_linearize_page(rq, &num_buf, buf, + header_offset, &tlen); if (!xdp_page) goto err_xdp; @@ -842,9 +885,21 @@ static struct sk_buff *receive_big(struct net_device *dev, unsigned int len, struct virtnet_rq_stats *stats) { + struct virtnet_page_info pinfo; struct page *page = buf; - struct sk_buff *skb = - page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0); + struct sk_buff *skb; + + pinfo.rq = rq; + pinfo.vi = vi; + pinfo.page = page; + pinfo.buf = page_address(page); + pinfo.buf_size = PAGE_SIZE; + pinfo.virtnet_hdr = pinfo.buf; + pinfo.packet = pinfo.virtnet_hdr + sizeof(struct padded_vnet_hdr); + pinfo.len = len - vi->hdr_len; + pinfo.metasize = 0; + + skb = virtnet_big_page_to_skb(&pinfo); stats->bytes += len - vi->hdr_len; if (unlikely(!skb)) @@ -870,12 +925,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, struct virtio_net_hdr_mrg_rxbuf *hdr = buf; u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); struct page *page = virt_to_head_page(buf); - int offset = buf - page_address(page); struct sk_buff *head_skb, *curr_skb; + struct virtnet_page_info pinfo; struct bpf_prog *xdp_prog; unsigned int truesize = mergeable_ctx_to_truesize(ctx); unsigned int headroom = mergeable_ctx_to_headroom(ctx); - unsigned int metasize = 0; unsigned int frame_sz; int err; @@ -887,8 +941,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, if (xdp_prog) { struct xdp_frame *xdpf; struct page *xdp_page; + void *hard_start; struct xdp_buff xdp; - void *data; u32 act; /* Transient failure which in theory could occur if @@ -912,54 +966,47 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, if (unlikely(num_buf > 1 || headroom < virtnet_get_headroom(vi))) { /* linearize data for XDP */ - xdp_page = xdp_linearize_page(rq, &num_buf, - page, offset, + xdp_page = xdp_linearize_page(rq, &num_buf, buf, VIRTIO_XDP_HEADROOM, &len); frame_sz = PAGE_SIZE; if (!xdp_page) goto err_xdp; - offset = VIRTIO_XDP_HEADROOM; + + hard_start = page_address(xdp_page) + vi->hdr_len; } else { xdp_page = page; + hard_start = buf + vi->hdr_len - VIRTIO_XDP_HEADROOM; } /* Allow consuming headroom but reserve enough space to push * the descriptor on if we get an XDP_TX return code. */ - data = page_address(xdp_page) + offset; xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq); - xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len, - VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true); + xdp_prepare_buff(&xdp, hard_start, VIRTIO_XDP_HEADROOM, + len - vi->hdr_len, true); act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; switch (act) { case XDP_PASS: - metasize = xdp.data - xdp.data_meta; - - /* recalculate offset to account for any header - * adjustments and minus the metasize to copy the - * metadata in page_to_skb(). Note other cases do not - * build an skb and avoid using offset - */ - offset = xdp.data - page_address(xdp_page) - - vi->hdr_len - metasize; - - /* recalculate len if xdp.data, xdp.data_end or - * xdp.data_meta were adjusted - */ - len = xdp.data_end - xdp.data + vi->hdr_len + metasize; + pinfo.rq = rq; + pinfo.vi = vi; + pinfo.page = xdp_page; + pinfo.buf = xdp.data_hard_start - vi->hdr_len; + pinfo.buf_size = PAGE_SIZE; + pinfo.virtnet_hdr = NULL; + pinfo.packet = xdp.data; + pinfo.len = xdp.data_end - xdp.data; + pinfo.metasize = xdp.data - xdp.data_meta; /* We can only create skb based on xdp_page. */ if (unlikely(xdp_page != page)) { rcu_read_unlock(); put_page(page); - head_skb = page_to_skb(vi, rq, xdp_page, offset, - len, PAGE_SIZE, false, - metasize, headroom); - return head_skb; + + return virtnet_merge_page_to_skb(&pinfo); } break; case XDP_TX: @@ -1005,8 +1052,22 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, __free_pages(xdp_page, 0); goto err_xdp; } + rcu_read_unlock(); + + /* pinfo has been filled inside XDP_PASS */ + } else { + rcu_read_unlock(); + + pinfo.rq = rq; + pinfo.vi = vi; + pinfo.page = page; + pinfo.buf = buf - headroom; + pinfo.buf_size = headroom ? PAGE_SIZE : truesize; + pinfo.virtnet_hdr = buf; + pinfo.packet = buf + sizeof(*hdr); + pinfo.len = len - sizeof(*hdr); + pinfo.metasize = 0; } - rcu_read_unlock(); if (unlikely(len > truesize)) { pr_debug("%s: rx error: len %u exceeds truesize %lu\n", @@ -1015,14 +1076,14 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, goto err_skb; } - head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog, - metasize, headroom); + head_skb = virtnet_merge_page_to_skb(&pinfo); curr_skb = head_skb; if (unlikely(!curr_skb)) goto err_skb; while (--num_buf) { int num_skb_frags; + int offset; buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx); if (unlikely(!buf)) { -- 2.31.0