On Mon, Jul 10, 2023 at 11:43 AM Xuan Zhuo <xuanzhuo@xxxxxxxxxxxxxxxxx> wrote: > I'd suggest tweaking the title to something like: "merge dma operations when refilling mergeable buffers" > Currently, the virtio core will perform a dma operation for each > operation. "for each buffer"? > Although, the same page may be operated multiple times. > > The driver does the dma operation and manages the dma address based the > feature premapped of virtio core. > > This way, we can perform only one dma operation for the same page. In > the case of mtu 1500, this can reduce a lot of dma operations. > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps > increased from 1893766 to 1901105. An increase of 0.4%. Btw, it looks to me like the code to deal with XDP_TX/REDIRECT for linearized pages is missing. > > Signed-off-by: Xuan Zhuo <xuanzhuo@xxxxxxxxxxxxxxxxx> > --- > drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++--- > 1 file changed, 267 insertions(+), 16 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 486b5849033d..4de845d35bed 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = { > #define VIRTNET_SQ_STATS_LEN ARRAY_SIZE(virtnet_sq_stats_desc) > #define VIRTNET_RQ_STATS_LEN ARRAY_SIZE(virtnet_rq_stats_desc) > > +/* The bufs on the same page may share this struct. */ > +struct virtnet_rq_dma { > + struct virtnet_rq_dma *next; > + > + dma_addr_t addr; > + > + void *buf; > + u32 len; > + > + u32 ref; > +}; > + > +/* Record the dma and buf. 
*/ > +struct virtnet_rq_data { > + struct virtnet_rq_data *next; > + > + void *buf; > + > + struct virtnet_rq_dma *dma; > +}; > + > /* Internal representation of a send virtqueue */ > struct send_queue { > /* Virtqueue associated with this send _queue */ > @@ -175,6 +196,13 @@ struct receive_queue { > char name[16]; > > struct xdp_rxq_info xdp_rxq; > + > + struct virtnet_rq_data *data_array; > + struct virtnet_rq_data *data_free; > + > + struct virtnet_rq_dma *dma_array; > + struct virtnet_rq_dma *dma_free; > + struct virtnet_rq_dma *last_dma; > }; > > /* This structure can contain rss message with maximum settings for indirection table and keysize > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > return skb; > } > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma) > +{ > + struct device *dev; > + > + --dma->ref; > + > + if (dma->ref) > + return; > + > + dev = virtqueue_dma_dev(rq->vq); > + > + dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE); > + > + dma->next = rq->dma_free; > + rq->dma_free = dma; > +} > + > +static void *virtnet_rq_recycle_data(struct receive_queue *rq, > + struct virtnet_rq_data *data) > +{ > + void *buf; > + > + buf = data->buf; > + > + data->next = rq->data_free; > + rq->data_free = data; > + > + return buf; > +} > + > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq, > + void *buf, > + struct virtnet_rq_dma *dma) > +{ > + struct virtnet_rq_data *data; > + > + data = rq->data_free; > + rq->data_free = data->next; > + > + data->buf = buf; > + data->dma = dma; > + > + return data; > +} > + > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx) > +{ > + struct virtnet_rq_data *data; > + void *buf; > + > + buf = virtqueue_get_buf_ctx(rq->vq, len, ctx); > + if (!buf || !rq->data_array) > + return buf; > + > + data = buf; > + > + virtnet_rq_unmap(rq, data->dma); > + > + return virtnet_rq_recycle_data(rq, data); > 
+} > + > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq) > +{ > + struct virtnet_rq_data *data; > + void *buf; > + > + buf = virtqueue_detach_unused_buf(rq->vq); > + if (!buf || !rq->data_array) > + return buf; > + > + data = buf; > + > + virtnet_rq_unmap(rq, data->dma); > + > + return virtnet_rq_recycle_data(rq, data); > +} > + > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len) > +{ > + struct virtnet_rq_dma *dma = rq->last_dma; > + struct device *dev; > + u32 off, map_len; > + dma_addr_t addr; > + void *end; > + > + if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) { > + ++dma->ref; > + addr = dma->addr + (buf - dma->buf); > + goto ok; > + } > + > + end = buf + len - 1; > + off = offset_in_page(end); > + map_len = len + PAGE_SIZE - off; This assumes a PAGE_SIZE which seems sub-optimal as page frag could be larger than this. > + > + dev = virtqueue_dma_dev(rq->vq); > + > + addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf), > + map_len, DMA_FROM_DEVICE, 0); > + if (addr == DMA_MAPPING_ERROR) > + return -ENOMEM; > + > + dma = rq->dma_free; > + rq->dma_free = dma->next; > + > + dma->ref = 1; > + dma->buf = buf; > + dma->addr = addr; > + dma->len = map_len; > + > + rq->last_dma = dma; > + > +ok: > + sg_init_table(rq->sg, 1); > + rq->sg[0].dma_address = addr; > + rq->sg[0].length = len; > + > + return 0; > +} > + > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi) > +{ > + struct receive_queue *rq; > + int i, err, j, num; > + > + /* disable for big mode */ > + if (!vi->mergeable_rx_bufs && vi->big_packets) > + return 0; > + > + for (i = 0; i < vi->max_queue_pairs; i++) { > + err = virtqueue_set_premapped(vi->rq[i].vq); > + if (err) > + continue; > + > + rq = &vi->rq[i]; > + > + num = virtqueue_get_vring_size(rq->vq); > + > + rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL); > + if (!rq->data_array) Can we avoid those allocations when we don't 
use the DMA API? > + goto err; > + > + rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL); > + if (!rq->dma_array) > + goto err; > + > + for (j = 0; j < num; ++j) { > + rq->data_array[j].next = rq->data_free; > + rq->data_free = &rq->data_array[j]; > + > + rq->dma_array[j].next = rq->dma_free; > + rq->dma_free = &rq->dma_array[j]; > + } > + } > + > + return 0; > + > +err: > + for (i = 0; i < vi->max_queue_pairs; i++) { > + struct receive_queue *rq; > + > + rq = &vi->rq[i]; > + > + kfree(rq->dma_array); > + kfree(rq->data_array); > + } > + > + return -ENOMEM; > +} > + > static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi) > { > unsigned int len; > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > void *buf; > int off; > > - buf = virtqueue_get_buf(rq->vq, &buflen); > + buf = virtnet_rq_get_buf(rq, &buflen, NULL); > if (unlikely(!buf)) > goto err_buf; > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > return -EINVAL; > > while (--*num_buf > 0) { > - buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx); > + buf = virtnet_rq_get_buf(rq, &len, &ctx); > if (unlikely(!buf)) { > pr_debug("%s: rx error: %d buffers out of %d missing\n", > dev->name, *num_buf, > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > while (--num_buf) { > int num_skb_frags; > > - buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx); > + buf = virtnet_rq_get_buf(rq, &len, &ctx); > if (unlikely(!buf)) { > pr_debug("%s: rx error: %d buffers out of %d missing\n", > dev->name, num_buf, > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > err_skb: > put_page(page); > while (num_buf-- > 1) { > - buf = virtqueue_get_buf(rq->vq, &len); > + buf = virtnet_rq_get_buf(rq, &len, NULL); > if (unlikely(!buf)) { > pr_debug("%s: rx error: %d buffers missing\n", > dev->name, num_buf); > @@ -1529,6 +1727,7 @@ static int 
add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq, > unsigned int xdp_headroom = virtnet_get_headroom(vi); > void *ctx = (void *)(unsigned long)xdp_headroom; > int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom; > + struct virtnet_rq_data *data; > int err; > > len = SKB_DATA_ALIGN(len) + > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq, > buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > get_page(alloc_frag->page); > alloc_frag->offset += len; > - sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom, > - vi->hdr_len + GOOD_PACKET_LEN); > - err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > + > + if (rq->data_array) { > + err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom, > + vi->hdr_len + GOOD_PACKET_LEN); Thanks to the compound page, I wonder if everything could be simplified if we just reused page->private to store metadata such as the dma address and refcnt. Then we wouldn't need any extra structures to track anything else? Thanks > + if (err) > + goto map_err; > + > + data = virtnet_rq_get_data(rq, buf, rq->last_dma); > + } else { > + sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom, > + vi->hdr_len + GOOD_PACKET_LEN); > + data = (void *)buf; > + } > + > + err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp); > if (err < 0) > - put_page(virt_to_head_page(buf)); > + goto add_err; > + > + return err; > + > +add_err: > + if (rq->data_array) { > + virtnet_rq_unmap(rq, data->dma); > + virtnet_rq_recycle_data(rq, data); > + } > + > +map_err: > + put_page(virt_to_head_page(buf)); > return err; > } > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > unsigned int headroom = virtnet_get_headroom(vi); > unsigned int tailroom = headroom ? 
sizeof(struct skb_shared_info) : 0; > unsigned int room = SKB_DATA_ALIGN(headroom + tailroom); > + struct virtnet_rq_data *data; > char *buf; > void *ctx; > int err; > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > alloc_frag->offset += hole; > } > > - sg_init_one(rq->sg, buf, len); > + if (rq->data_array) { > + err = virtnet_rq_map_sg(rq, buf, len); > + if (err) > + goto map_err; > + > + data = virtnet_rq_get_data(rq, buf, rq->last_dma); > + } else { > + sg_init_one(rq->sg, buf, len); > + data = (void *)buf; > + } > + > ctx = mergeable_len_to_ctx(len + room, headroom); > - err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > + err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp); > if (err < 0) > - put_page(virt_to_head_page(buf)); > + goto add_err; > + > + return 0; > + > +add_err: > + if (rq->data_array) { > + virtnet_rq_unmap(rq, data->dma); > + virtnet_rq_recycle_data(rq, data); > + } > > +map_err: > + put_page(virt_to_head_page(buf)); > return err; > } > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget, > void *ctx; > > while (stats.packets < budget && > - (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) { > + (buf = virtnet_rq_get_buf(rq, &len, &ctx))) { > receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats); > stats.packets++; > } > } else { > while (stats.packets < budget && > - (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) { > + (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) { > receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats); > stats.packets++; > } > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi) > for (i = 0; i < vi->max_queue_pairs; i++) { > __netif_napi_del(&vi->rq[i].napi); > __netif_napi_del(&vi->sq[i].napi); > + > + kfree(vi->rq[i].data_array); > + kfree(vi->rq[i].dma_array); > } > > /* We called __netif_napi_del(), > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi) > } > 
> for (i = 0; i < vi->max_queue_pairs; i++) { > - struct virtqueue *vq = vi->rq[i].vq; > - while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) > - virtnet_rq_free_unused_buf(vq, buf); > + struct receive_queue *rq = &vi->rq[i]; > + > + while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL) > + virtnet_rq_free_unused_buf(rq->vq, buf); > cond_resched(); > } > } > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi) > if (ret) > goto err_free; > > + ret = virtnet_rq_merge_map_init(vi); > + if (ret) > + goto err_free; > + > cpus_read_lock(); > virtnet_set_affinity(vi); > cpus_read_unlock(); > -- > 2.32.0.3.g01195cf9f >