On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@xxxxxxxxxx> wrote: > > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@xxxxxxxxx> wrote: > > > > The implementation at the moment uses one page per packet in both the > > normal and XDP path. > > It's better to explain why we need a page pool and how it can help the > performance. > Sure, I will include that in v2. > > In addition, introduce a module parameter to enable > > or disable the usage of page pool (disabled by default). > > If page pool wins for most of the cases, any reason to disable it by default? > Thank you for raising the point. It does make sense to enable it by default. > > > > In single-core vm testing environments, it gives a modest performance gain > > in the normal path. > > Upstream codebase: 47.5 Gbits/sec > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > In multi-core vm testing environments, the most significant performance > > gain is observed in XDP cpumap: > > Upstream codebase: 1.38 Gbits/sec > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > Please show more details on the test. E.g., which kinds of tests have > you measured? > > Btw, it would be better to measure PPS as well. > Sure. It will be added in v2. > > > > With this foundation, we can further integrate page pool fragmentation and > > DMA map/unmap support. > > > > Signed-off-by: Liang Chen <liangchen.linux@xxxxxxxxx> > > --- > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > I believe we should make virtio-net select CONFIG_PAGE_POOL or do > the ifdef tricks at least. > Sure. It will be done in v2. 
> > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > index c5dca0d92e64..99c0ca0c1781 100644 > > --- a/drivers/net/virtio_net.c > > +++ b/drivers/net/virtio_net.c > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > module_param(gso, bool, 0444); > > module_param(napi_tx, bool, 0644); > > > > +static bool page_pool_enabled; > > +module_param(page_pool_enabled, bool, 0400); > > + > > /* FIXME: MTU in config. */ > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > #define GOOD_COPY_LEN 128 > > @@ -159,6 +162,9 @@ struct receive_queue { > > /* Chain pages by the private ptr. */ > > struct page *pages; > > > > + /* Page pool */ > > + struct page_pool *page_pool; > > + > > /* Average packet length for mergeable receive buffers. */ > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > return skb; > > } > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > +{ > > + if (rq->page_pool) > > + page_pool_put_full_page(rq->page_pool, page, true); > > + else > > + put_page(page); > > +} > > + > > /* Called from bottom half context */ > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > struct receive_queue *rq, > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > hdr = skb_vnet_hdr(skb); > > memcpy(hdr, hdr_p, hdr_len); > > if (page_to_free) > > - put_page(page_to_free); > > + virtnet_put_page(rq, page_to_free); > > > > return skb; > > } > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > return ret; > > } > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > { > > rq could be fetched from xdp_rxq_info? Yeah, it has the queue_index there. 
> > > struct skb_shared_info *shinfo; > > struct page *xdp_page; > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > shinfo = xdp_get_shared_info_from_buff(xdp); > > for (i = 0; i < shinfo->nr_frags; i++) { > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > - put_page(xdp_page); > > + virtnet_put_page(rq, xdp_page); > > } > > } > > } > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > if (page_off + *len + tailroom > PAGE_SIZE) > > return NULL; > > > > - page = alloc_page(GFP_ATOMIC); > > + if (rq->page_pool) > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > + else > > + page = alloc_page(GFP_ATOMIC); > > + > > if (!page) > > return NULL; > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > * is sending packet larger than the MTU. > > */ > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > - put_page(p); > > + virtnet_put_page(rq, p); > > goto err_buf; > > } > > > > memcpy(page_address(page) + page_off, > > page_address(p) + off, buflen); > > page_off += buflen; > > - put_page(p); > > + virtnet_put_page(rq, p); > > } > > > > /* Headroom does not contribute to packet length */ > > *len = page_off - VIRTIO_XDP_HEADROOM; > > return page; > > err_buf: > > - __free_pages(page, 0); > > + if (rq->page_pool) > > + page_pool_put_full_page(rq->page_pool, page, true); > > + else > > + __free_pages(page, 0); > > return NULL; > > } > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > } > > stats->bytes += len; > > page = virt_to_head_page(buf); > > - put_page(page); > > + virtnet_put_page(rq, page); > > } > > } > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > cur_frag_size = truesize; > > xdp_frags_truesz += cur_frag_size; > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > - put_page(page); > > + virtnet_put_page(rq, page); > > 
pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > dev->name, len, (unsigned long)(truesize - room)); > > dev->stats.rx_length_errors++; > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > return 0; > > > > err: > > - put_xdp_frags(xdp); > > + put_xdp_frags(xdp, rq); > > return -EINVAL; > > } > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > if (*len + xdp_room > PAGE_SIZE) > > return NULL; > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > + if (rq->page_pool) > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > + else > > + xdp_page = alloc_page(GFP_ATOMIC); > > if (!xdp_page) > > return NULL; > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > *frame_sz = PAGE_SIZE; > > > > - put_page(*page); > > + virtnet_put_page(rq, *page); > > > > *page = xdp_page; > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > if (unlikely(!head_skb)) > > break; > > + if (rq->page_pool) > > + skb_mark_for_recycle(head_skb); > > return head_skb; > > > > case XDP_TX: > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > break; > > } > > > > - put_xdp_frags(&xdp); > > + put_xdp_frags(&xdp, rq); > > > > err_xdp: > > - put_page(page); > > + virtnet_put_page(rq, page); > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > stats->xdp_drops++; > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > curr_skb = head_skb; > > > > + if (rq->page_pool) > > + skb_mark_for_recycle(curr_skb); > > + > > if (unlikely(!curr_skb)) > > goto err_skb; > > while (--num_buf) { > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > curr_skb = 
nskb; > > head_skb->truesize += nskb->truesize; > > num_skb_frags = 0; > > + if (rq->page_pool) > > + skb_mark_for_recycle(curr_skb); > > } > > if (curr_skb != head_skb) { > > head_skb->data_len += len; > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > } > > offset = buf - page_address(page); > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > - put_page(page); > > + virtnet_put_page(rq, page); > > I wonder why we can't do this during buffer allocation like other drivers? > Sorry, I don't quite understand the point here. Would you please elaborate a bit more? > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > len, truesize); > > } else { > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > return head_skb; > > > > err_skb: > > - put_page(page); > > + virtnet_put_page(rq, page); > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > err_buf: > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > * disabled GSO for XDP, it won't be a big issue. > > */ > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > - return -ENOMEM; > > + if (rq->page_pool) { > > + struct page *page; > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > - get_page(alloc_frag->page); > > - alloc_frag->offset += len + room; > > - hole = alloc_frag->size - alloc_frag->offset; > > - if (hole < len + room) { > > - /* To avoid internal fragmentation, if there is very likely not > > - * enough space for another buffer, add the remaining space to > > - * the current buffer. > > - * XDP core assumes that frame_size of xdp_buff and the length > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > - */ > > - if (!headroom) > > - len += hole; > > - alloc_frag->offset += hole; > > - } > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > + if (unlikely(!page)) > > + return -ENOMEM; > > + buf = (char *)page_address(page); > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > + } else { > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > Why not simply use a helper like virtnet_page_frag_refill() and add > the page_pool allocation logic there? It helps to reduce the > changeset. > Sure. Will do that on v2. > > + return -ENOMEM; > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > + get_page(alloc_frag->page); > > + alloc_frag->offset += len + room; > > + hole = alloc_frag->size - alloc_frag->offset; > > + if (hole < len + room) { > > + /* To avoid internal fragmentation, if there is very likely not > > + * enough space for another buffer, add the remaining space to > > + * the current buffer. > > + * XDP core assumes that frame_size of xdp_buff and the length > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > + */ > > + if (!headroom) > > + len += hole; > > + alloc_frag->offset += hole; > > + } > > + } > > sg_init_one(rq->sg, buf, len); > > ctx = mergeable_len_to_ctx(len + room, headroom); > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > if (err < 0) > > - put_page(virt_to_head_page(buf)); > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > return err; > > } > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > if (err < 0) > > return err; > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > - MEM_TYPE_PAGE_SHARED, NULL); > > + if (vi->rq[qp_index].page_pool) > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > + MEM_TYPE_PAGE_POOL, > > + vi->rq[qp_index].page_pool); > > + else > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > + MEM_TYPE_PAGE_SHARED, > > + NULL); > > + > > if (err < 0) > > goto err_xdp_reg_mem_model; > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > virtnet_sq_stats_desc[j].desc); > > } > > + page_pool_ethtool_stats_get_strings(p); > > break; > > } > > } > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > switch (sset) { > > case ETH_SS_STATS: > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > - VIRTNET_SQ_STATS_LEN); > > + VIRTNET_SQ_STATS_LEN + > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > + page_pool_ethtool_stats_get_count() : 0)); > > default: > > return -EOPNOTSUPP; > > } > > } > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > +{ > > +#ifdef CONFIG_PAGE_POOL_STATS > > + struct virtnet_info *vi = netdev_priv(dev); > > + struct page_pool_stats pp_stats = {}; > > + int i; > > + > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > + if (!vi->rq[i].page_pool) > > + continue; > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > + } > > + page_pool_ethtool_stats_get(data, &pp_stats); > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > +} > > + > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > struct ethtool_stats *stats, u64 *data) > > { > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > idx += VIRTNET_SQ_STATS_LEN; > > } > > + > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > } > > > > static void virtnet_get_channels(struct net_device *dev, > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > for (i = 0; i < vi->max_queue_pairs; i++) { > > __netif_napi_del(&vi->rq[i].napi); > > __netif_napi_del(&vi->sq[i].napi); > > + if (vi->rq[i].page_pool) > > + page_pool_destroy(vi->rq[i].page_pool); > > } > > > > /* We called __netif_napi_del(), > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > struct virtnet_info *vi = vq->vdev->priv; > > int i = vq2rxq(vq); > > > > - if (vi->mergeable_rx_bufs) > > - put_page(virt_to_head_page(buf)); > > - else if (vi->big_packets) > > + if (vi->mergeable_rx_bufs) { > > + if (vi->rq[i].page_pool) { > > + page_pool_put_full_page(vi->rq[i].page_pool, > > + virt_to_head_page(buf), > > + true); > > + } else { > > + put_page(virt_to_head_page(buf)); > > + } > > + } else if (vi->big_packets) { > > give_pages(&vi->rq[i], buf); > > Any reason only mergeable were modified but 
not for small and big? > > Thanks > Big mode uses the page chain to recycle pages, hence its use of the "private" field of the buffer page. I will take a further look into that to see if it is better to use page pool in these cases. Thanks! > > - else > > + } else { > > put_page(virt_to_head_page(buf)); > > + } > > } > > > > static void free_unused_bufs(struct virtnet_info *vi) > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > virtnet_free_queues(vi); > > } > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > +{ > > + struct virtio_device *vdev = rq->vq->vdev; > > + > > + struct page_pool_params pp_params = { > > + .order = 0, > > + .pool_size = rq->vq->num_max, > > + .nid = dev_to_node(vdev->dev.parent), > > + .dev = vdev->dev.parent, > > + .offset = 0, > > + }; > > + > > + rq->page_pool = page_pool_create(&pp_params); > > + if (IS_ERR(rq->page_pool)) { > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > + PTR_ERR(rq->page_pool)); > > + rq->page_pool = NULL; > > + } > > +} > > + > > /* How large should a single buffer be so a queue full of these can fit at > > * least one full packet? > > * Logic below assumes the mergeable buffer header is used. > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > vi->sq[i].vq = vqs[txq2vq(i)]; > > + > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > + virtnet_alloc_page_pool(&vi->rq[i]); > > + else > > + dev_warn(&vi->vdev->dev, > > + "page pool only support mergeable mode\n"); > > + > > } > > > > /* run here: ret == 0. */ > > -- > > 2.31.1 > > > _______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/virtualization