The following conditions must be satisfied to achieve zero copy:
1. The TX descriptor has enough headroom to store the xdp_frame and
   skb_shared_info.
2. The memory referenced by the TX descriptor lies within a single page.

Zero copy was tested with libxdp; a minimal test sketch follows the
patch.

Performance:
                      | MSS (bytes) | Packet rate (PPS)
AF_XDP                | 1300        | 480K
AF_XDP with zero copy | 1300        | 540K

That is roughly a 12.5% improvement in packet rate.

Signed-off-by: huangjie.albert <huangjie.albert@xxxxxxxxxxxxx>
---
 drivers/net/veth.c | 207 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 178 insertions(+), 29 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 600225e27e9e..e4f1a8345f42 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -103,6 +103,11 @@ struct veth_xdp_tx_bq {
 	unsigned int count;
 };
 
+struct veth_seg_info {
+	u32 segs;
+	u64 desc[] ____cacheline_aligned_in_smp;
+};
+
 /*
  * ethtool interface
  */
@@ -645,6 +650,100 @@ static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
 	return 0;
 }
 
+static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
+				      int buflen)
+{
+	struct sk_buff *skb;
+
+	skb = build_skb(head, buflen);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, headroom);
+	skb_put(skb, len);
+
+	return skb;
+}
+
+static void veth_xsk_destruct_skb(struct sk_buff *skb)
+{
+	struct veth_seg_info *seg_info = skb_shinfo(skb)->destructor_arg;
+	struct xsk_buff_pool *pool = skb_shinfo(skb)->destructor_arg_xsk_pool;
+	unsigned long flags;
+	u32 index;
+	u64 addr;
+
+	/* return the descriptors to the completion queue */
+	spin_lock_irqsave(&pool->cq_lock, flags);
+	for (index = 0; index < seg_info->segs; index++) {
+		addr = seg_info->desc[index];
+		xsk_tx_completed_addr(pool, addr);
+	}
+	spin_unlock_irqrestore(&pool->cq_lock, flags);
+
+	kfree(seg_info);
+	skb_shinfo(skb)->destructor_arg = NULL;
+	skb_shinfo(skb)->destructor_arg_xsk_pool = NULL;
+}
+
+static struct sk_buff *veth_build_skb_zerocopy(struct net_device *dev, struct xsk_buff_pool *pool,
+					       struct xdp_desc *desc)
+{
+	struct veth_seg_info *seg_info;
+	struct sk_buff *skb;
+	struct page *page;
+	void *hard_start;
+	u32 len, ts;
+	void *buffer;
+	int headroom;
+	u64 addr;
+	u32 index;
+
+	addr = desc->addr;
+	len = desc->len;
+	buffer = xsk_buff_raw_get_data(pool, addr);
+	ts = pool->unaligned ? len : pool->chunk_size;
+
+	headroom = offset_in_page(buffer);
+
+	/* offset within the umem pool */
+	addr = buffer - pool->addrs;
+
+	/* get the page backing the descriptor */
+	page = pool->umem->pgs[addr >> PAGE_SHIFT];
+
+	/* hold a reference so the page is not freed by kfree_skb() */
+	get_page(page);
+
+	hard_start = page_to_virt(page);
+
+	skb = veth_build_skb(hard_start, headroom, len, ts);
+	if (!skb) {
+		put_page(page);
+		return NULL;
+	}
+
+	seg_info = kmalloc(struct_size(seg_info, desc, MAX_SKB_FRAGS), GFP_KERNEL);
+	if (!seg_info) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	/* GSO is not supported yet, so there is a single segment */
+	index = skb_shinfo(skb)->gso_segs;
+	seg_info->desc[index] = desc->addr;
+	seg_info->segs = ++index;
+
+	skb->truesize += ts;
+	skb->dev = dev;
+	skb_shinfo(skb)->destructor_arg = seg_info;
+	skb_shinfo(skb)->destructor_arg_xsk_pool = pool;
+	skb->destructor = veth_xsk_destruct_skb;
+
+	/* set the mac header */
+	skb->protocol = eth_type_trans(skb, dev);
+
+	/* TODO: charge the skb to the socket; this may not be necessary:
+	 * refcount_add(ts, &xs->sk.sk_wmem_alloc);
+	 */
+	return skb;
+}
+
 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
 					   struct xdp_frame *frame,
 					   struct veth_xdp_tx_bq *bq,
@@ -1063,6 +1162,20 @@ static int veth_poll(struct napi_struct *napi, int budget)
 	return done;
 }
 
+/* true if the buffer fits entirely within a single page */
+static inline bool buffer_in_page(void *buffer, u32 len)
+{
+	u32 offset = offset_in_page(buffer);
+
+	return PAGE_SIZE - offset >= len;
+}
+
 static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool, int budget)
 {
 	struct veth_priv *priv, *peer_priv;
@@ -1073,6 +1186,9 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 	struct veth_xdp_tx_bq bq;
 	struct xdp_desc desc;
 	void *xdpf;
+	struct sk_buff *skb = NULL;
+	bool zc = xsk_pool->umem->zc;
+	u32 xsk_headroom = xsk_pool->headroom;
 	int done = 0;
 
 	bq.count = 0;
@@ -1102,12 +1218,6 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 			break;
 		}
 
-		/*
-		 * Get a xmit addr
-		 * desc.addr is a offset, so we should to convert to real virtual address
-		 */
-		addr = xsk_buff_raw_get_data(xsk_pool, desc.addr);
-
 		/* can not hold all data in a page */
 		truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + desc.len + sizeof(struct xdp_frame);
 		if (truesize > PAGE_SIZE) {
@@ -1116,16 +1226,39 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 			continue;
 		}
 
-		page = dev_alloc_page();
-		if (!page) {
-			/*
-			 * error , release xdp frame and increase drops
-			 */
-			xsk_tx_completed_addr(xsk_pool, desc.addr);
-			stats.xdp_drops++;
-			break;
+		/*
+		 * Get a xmit addr.
+		 * desc.addr is an offset, so convert it to a real virtual address.
+		 */
+		addr = xsk_buff_raw_get_data(xsk_pool, desc.addr);
+
+		/*
+		 * To support zero copy, the headroom must be large enough
+		 * to hold an xdp_frame.
+		 */
+		if (zc && (xsk_headroom < sizeof(struct xdp_frame)))
+			zc = false;
+
+		/*
+		 * Zero copy is also unsupported if the descriptor data
+		 * does not fit within a single page.
+		 */
+		if (!buffer_in_page(addr, desc.len))
+			zc = false;
+
+		if (zc) {
+			/* the headroom is reserved for the xdp_frame */
+			new_addr = addr - sizeof(struct xdp_frame);
+		} else {
+			page = dev_alloc_page();
+			if (!page) {
+				/*
+				 * error: release the descriptor and count the drop
+				 */
+				xsk_tx_completed_addr(xsk_pool, desc.addr);
+				stats.xdp_drops++;
+				break;
+			}
+			new_addr = page_to_virt(page);
 		}
-		new_addr = page_to_virt(page);
 
 		p_frame = new_addr;
 		new_addr += sizeof(struct xdp_frame);
@@ -1137,19 +1270,37 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 		 */
 		p_frame->headroom = 0;
 		p_frame->metasize = 0;
-		p_frame->frame_sz = PAGE_SIZE;
 		p_frame->flags = 0;
-		p_frame->mem.type = MEM_TYPE_PAGE_SHARED;
-		memcpy(p_frame->data, addr, p_frame->len);
-		xsk_tx_completed_addr(xsk_pool, desc.addr);
-
-		/* if peer have xdp prog, if it has ,just send to peer */
-		p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
-		/* if no xdp with this queue, convert to skb to xmit*/
-		if (p_frame) {
-			xdpf = p_frame;
-			veth_xdp_rcv_bulk_skb(peer_rq, &xdpf, 1, &bq, &peer_stats);
-			p_frame = NULL;
+
+		if (zc) {
+			p_frame->frame_sz = xsk_pool->frame_len;
+			/* TODO: how to recycle the TX desc if an XDP program consumes the frame */
+			p_frame->mem.type = MEM_TYPE_XSK_BUFF_POOL_TX;
+			/* no data copy is needed for AF_XDP zero copy */
+			p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
+			if (p_frame) {
+				skb = veth_build_skb_zerocopy(peer_dev,
+							      xsk_pool, &desc);
+				if (skb) {
+					napi_gro_receive(&peer_rq->xdp_napi, skb);
+					skb = NULL;
+				} else {
+					xsk_tx_completed_addr(xsk_pool, desc.addr);
+				}
+			}
+		} else {
+			p_frame->frame_sz = PAGE_SIZE;
+			p_frame->mem.type = MEM_TYPE_PAGE_SHARED;
+			memcpy(p_frame->data, addr, p_frame->len);
+			xsk_tx_completed_addr(xsk_pool, desc.addr);
+
+			/* if the peer has an XDP program, hand the frame to it */
+			p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
+			/* if this queue has no XDP program, convert to an skb and transmit it */
+			if (p_frame) {
+				xdpf = p_frame;
+				veth_xdp_rcv_bulk_skb(peer_rq, &xdpf, 1, &bq, &peer_stats);
+				p_frame = NULL;
+			}
 		}
 
 		stats.xdp_bytes += desc.len;
@@ -1163,8 +1314,6 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 		xsk_tx_release(xsk_pool);
 	}
 
-
-	/* just for peer rq */
 	if (peer_stats.xdp_tx > 0)
 		veth_xdp_flush(peer_rq, &bq);
-- 
2.20.1
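
P.S. For reviewers who want to reproduce the numbers above, here is a
minimal, untested userspace sketch of the libxdp test mentioned in the
commit message. The interface name ("veth0"), queue id (0), frame count
and the 256-byte frame_headroom are illustrative assumptions, not part
of this patch; the only requirements the patch imposes are binding with
XDP_ZEROCOPY and reserving enough umem headroom for an xdp_frame
(condition 1 above).

/* minimal sketch: bind an AF_XDP socket on a veth peer with
 * XDP_ZEROCOPY and push one frame through the TX/completion rings */
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>
#include <xdp/xsk.h>

#define NUM_FRAMES 4096
#define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE

int main(void)
{
	struct xsk_ring_prod fq, tx;
	struct xsk_ring_cons cq, rx;
	struct xsk_umem *umem;
	struct xsk_socket *xsk;
	__u64 size = (__u64)NUM_FRAMES * FRAME_SIZE;
	void *bufs;
	__u32 idx;

	/* reserve umem headroom so the driver can prepend an xdp_frame
	 * in front of the packet data (assumed value, >= sizeof(struct
	 * xdp_frame)) */
	struct xsk_umem_config ucfg = {
		.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
		.frame_size = FRAME_SIZE,
		.frame_headroom = 256,
	};
	struct xsk_socket_config scfg = {
		.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
		.bind_flags = XDP_ZEROCOPY,	/* fail rather than fall back to copy mode */
	};

	if (posix_memalign(&bufs, getpagesize(), size))
		return 1;
	if (xsk_umem__create(&umem, bufs, size, &fq, &cq, &ucfg))
		return 1;
	/* fails here if the driver rejects XDP_ZEROCOPY */
	if (xsk_socket__create(&xsk, "veth0", 0, umem, &rx, &tx, &scfg))
		return 1;

	/* queue one 1300-byte frame (the MSS used in the measurements);
	 * the data sits after the headroom of chunk 0 */
	if (xsk_ring_prod__reserve(&tx, 1, &idx) == 1) {
		struct xdp_desc *desc = xsk_ring_prod__tx_desc(&tx, idx);

		desc->addr = ucfg.frame_headroom;
		desc->len = 1300;
		memset(xsk_umem__get_data(bufs, desc->addr), 0xab, desc->len);
		xsk_ring_prod__submit(&tx, 1);
		sendto(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
	}

	/* wait for the descriptor to come back on the completion ring,
	 * i.e. until the peer has freed the zero-copy skb */
	while (!xsk_ring_cons__peek(&cq, 1, &idx))
		;
	xsk_ring_cons__release(&cq, 1);

	xsk_socket__delete(xsk);
	xsk_umem__delete(umem);
	free(bufs);
	return 0;
}

Build with something like "gcc test.c -lxdp -lbpf" (linker flags depend
on how libxdp is installed). A real benchmark would batch descriptors
and loop; this only demonstrates the bind flags and the headroom setup
the zero-copy path depends on.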