> -----Original Message-----
> From: Jason Wang [mailto:jasowang@xxxxxxxxxx]
> Sent: Monday, March 4, 2024 2:56 PM
> To: wangyunjian <wangyunjian@xxxxxxxxxx>
> Cc: mst@xxxxxxxxxx; willemdebruijn.kernel@xxxxxxxxx; kuba@xxxxxxxxxx;
> bjorn@xxxxxxxxxx; magnus.karlsson@xxxxxxxxx; maciej.fijalkowski@xxxxxxxxx;
> jonathan.lemon@xxxxxxxxx; davem@xxxxxxxxxxxxx; bpf@xxxxxxxxxxxxxxx;
> netdev@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx;
> virtualization@xxxxxxxxxxxxxxx; xudingke <xudingke@xxxxxxxxxx>; liwei (DT)
> <liwei395@xxxxxxxxxx>
> Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
>
> On Wed, Feb 28, 2024 at 7:06 PM Yunjian Wang <wangyunjian@xxxxxxxxxx> wrote:
> >
> > This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
> > which can significantly reduce CPU utilization for XDP programs.
> >
> > Since commit fc72d1d54dd9 ("tuntap: XDP transmission"), the pointer
> > ring has been utilized to queue different types of pointers by
> > encoding the type into the lower bits. Therefore, we introduce a new
> > flag, TUN_XDP_DESC_FLAG (0x2UL), which allows us to enqueue XDP
> > descriptors and differentiate them from XDP buffers and sk_buffs.
> > Additionally, a spin lock is added to protect the enable and disable
> > operations on the xsk pool.
> >
> > The performance testing was performed on an Intel E5-2620 2.40GHz
> > machine. Traffic was generated/sent through TUN (testpmd txonly with
> > AF_XDP) to a VM (testpmd rxonly in guest).
> >
> > +------+---------+---------+---------+
> > |      |  copy   |zero-copy| speedup |
> > +------+---------+---------+---------+
> > | UDP  |  Mpps   |  Mpps   |    %    |
> > | 64   |  2.5    |  4.0    |   60%   |
> > | 512  |  2.1    |  3.6    |   71%   |
> > | 1024 |  1.9    |  3.3    |   73%   |
> > +------+---------+---------+---------+
> >
> > Signed-off-by: Yunjian Wang <wangyunjian@xxxxxxxxxx>
> > ---
> >  drivers/net/tun.c      | 177 +++++++++++++++++++++++++++++++++++++++--
> >  drivers/vhost/net.c    |   4 +
> >  include/linux/if_tun.h |  32 ++++++++
> >  3 files changed, 208 insertions(+), 5 deletions(-)
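For readers following the hunks below: the ptr_ring carries raw pointers with
the type encoded in the two low bits, which are always zero for aligned
allocations. A rough sketch of the extended tagging scheme — TUN_XDP_FLAG
already exists in include/linux/if_tun.h and tun_is_xdp_desc_frame() shows up
in the tun.c hunks, but the other helper names and the mask here are
illustrative, since the if_tun.h hunk is not quoted in this excerpt:

#define TUN_XDP_FLAG       0x1UL   /* existing: ptr is a struct xdp_frame */
#define TUN_XDP_DESC_FLAG  0x2UL   /* new: ptr refers to an AF_XDP tx descriptor */
#define TUN_PTR_FLAG_MASK  (TUN_XDP_FLAG | TUN_XDP_DESC_FLAG)

/* Tag a descriptor pointer before it is queued on the ptr_ring. */
static inline void *tun_xdp_desc_to_ptr(void *desc)
{
        return (void *)((unsigned long)desc | TUN_XDP_DESC_FLAG);
}

/* Test the tag when a pointer is consumed from the ring. */
static inline bool tun_is_xdp_desc_frame(void *ptr)
{
        return ((unsigned long)ptr & TUN_PTR_FLAG_MASK) == TUN_XDP_DESC_FLAG;
}

/* Strip the tag to recover the original pointer. */
static inline void *tun_ptr_to_xdp_desc(void *ptr)
{
        return (void *)((unsigned long)ptr & ~TUN_PTR_FLAG_MASK);
}

tun_ptr_free() and the read path can then branch on these tags to tell
descriptors, XDP frames and sk_buffs apart.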
> >
> > diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > index bc80fc1d576e..7f4ff50b532c 100644
> > --- a/drivers/net/tun.c
> > +++ b/drivers/net/tun.c
> > @@ -63,6 +63,7 @@
> >  #include <net/rtnetlink.h>
> >  #include <net/sock.h>
> >  #include <net/xdp.h>
> > +#include <net/xdp_sock_drv.h>
> >  #include <net/ip_tunnels.h>
> >  #include <linux/seq_file.h>
> >  #include <linux/uio.h>
> > @@ -86,6 +87,7 @@ static void tun_default_link_ksettings(struct net_device *dev,
> >                                         struct ethtool_link_ksettings *cmd);
> >
> >  #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
> > +#define TUN_XDP_BATCH 64
> >
> >  /* TUN device flags */
> >
> > @@ -146,6 +148,9 @@ struct tun_file {
> >         struct tun_struct *detached;
> >         struct ptr_ring tx_ring;
> >         struct xdp_rxq_info xdp_rxq;
> > +       struct xsk_buff_pool *xsk_pool;
> > +       spinlock_t pool_lock;   /* Protects xsk pool enable/disable */
> > +       u32 nb_descs;
> >  };
> >
> >  struct tun_page {
> > @@ -614,6 +619,8 @@ void tun_ptr_free(void *ptr)
> >                 struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
> >
> >                 xdp_return_frame(xdpf);
> > +       } else if (tun_is_xdp_desc_frame(ptr)) {
> > +               return;
> >         } else {
> >                 __skb_array_destroy_skb(ptr);
> >         }
> > @@ -631,6 +638,37 @@ static void tun_queue_purge(struct tun_file *tfile)
> >         skb_queue_purge(&tfile->sk.sk_error_queue);
> >  }
> >
> > +static void tun_set_xsk_pool(struct tun_file *tfile, struct xsk_buff_pool *pool)
> > +{
> > +       if (!pool)
> > +               return;
> > +
> > +       spin_lock(&tfile->pool_lock);
> > +       xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
> > +       tfile->xsk_pool = pool;
> > +       spin_unlock(&tfile->pool_lock);
> > +}
> > +
> > +static void tun_clean_xsk_pool(struct tun_file *tfile)
> > +{
> > +       spin_lock(&tfile->pool_lock);
> > +       if (tfile->xsk_pool) {
> > +               void *ptr;
> > +
> > +               while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
> > +                       tun_ptr_free(ptr);
> > +
> > +               if (tfile->nb_descs) {
> > +                       xsk_tx_completed(tfile->xsk_pool, tfile->nb_descs);
> > +                       if (xsk_uses_need_wakeup(tfile->xsk_pool))
> > +                               xsk_set_tx_need_wakeup(tfile->xsk_pool);
> > +                       tfile->nb_descs = 0;
> > +               }
> > +               tfile->xsk_pool = NULL;
> > +       }
> > +       spin_unlock(&tfile->pool_lock);
> > +}
> > +
> >  static void __tun_detach(struct tun_file *tfile, bool clean)
> >  {
> >         struct tun_file *ntfile;
> > @@ -648,6 +686,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
> >                 u16 index = tfile->queue_index;
> >                 BUG_ON(index >= tun->numqueues);
> >
> > +               ntfile = rtnl_dereference(tun->tfiles[tun->numqueues - 1]);
> > +               /* Stop xsk zc xmit */
> > +               tun_clean_xsk_pool(tfile);
> > +               tun_clean_xsk_pool(ntfile);
> > +
> >                 rcu_assign_pointer(tun->tfiles[index],
> >                                    tun->tfiles[tun->numqueues - 1]);
> >                 ntfile = rtnl_dereference(tun->tfiles[index]);
> > @@ -668,6 +711,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
> >                 tun_flow_delete_by_queue(tun, tun->numqueues + 1);
> >                 /* Drop read queue */
> >                 tun_queue_purge(tfile);
> > +               tun_set_xsk_pool(ntfile, xsk_get_pool_from_qid(tun->dev, index));
> >                 tun_set_real_num_queues(tun);
> >         } else if (tfile->detached && clean) {
> >                 tun = tun_enable_queue(tfile);
> > @@ -801,6 +845,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
> >
> >                 if (tfile->xdp_rxq.queue_index != tfile->queue_index)
> >                         tfile->xdp_rxq.queue_index = tfile->queue_index;
> > +               tun_set_xsk_pool(tfile, xsk_get_pool_from_qid(dev, tfile->queue_index));
> >         } else {
> >                 /* Setup XDP RX-queue info, for new tfile getting attached */
> >                 err = xdp_rxq_info_reg(&tfile->xdp_rxq,
> > @@ -1221,11 +1266,50 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
> >         return 0;
> >  }
> >
> > +static int tun_xsk_pool_enable(struct net_device *netdev,
> > +                              struct xsk_buff_pool *pool,
> > +                              u16 qid)
> > +{
> > +       struct tun_struct *tun = netdev_priv(netdev);
> > +       struct tun_file *tfile;
> > +
> > +       if (qid >= tun->numqueues)
> > +               return -EINVAL;
> > +
> > +       tfile = rtnl_dereference(tun->tfiles[qid]);
> > +       tun_set_xsk_pool(tfile, pool);
> > +
> > +       return 0;
> > +}
> > +
> > +static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid)
> > +{
> > +       struct tun_struct *tun = netdev_priv(netdev);
> > +       struct tun_file *tfile;
> > +
> > +       if (qid >= MAX_TAP_QUEUES)
> > +               return -EINVAL;
> > +
> > +       tfile = rtnl_dereference(tun->tfiles[qid]);
> > +       if (tfile)
> > +               tun_clean_xsk_pool(tfile);
> > +       return 0;
> > +}
> > +
> > +static int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
> > +                             u16 qid)
> > +{
> > +       return pool ? tun_xsk_pool_enable(dev, pool, qid) :
> > +                     tun_xsk_pool_disable(dev, qid);
> > +}
> > +
> >  static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> >  {
> >         switch (xdp->command) {
> >         case XDP_SETUP_PROG:
> >                 return tun_xdp_set(dev, xdp->prog, xdp->extack);
> > +       case XDP_SETUP_XSK_POOL:
> > +               return tun_xsk_pool_setup(dev, xdp->xsk.pool, xdp->xsk.queue_id);
> >         default:
> >                 return -EINVAL;
> >         }
> > @@ -1330,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
> >         return nxmit;
> >  }
> >
> > +static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> > +{
> > +       struct tun_struct *tun = netdev_priv(dev);
> > +       struct tun_file *tfile;
> > +
> > +       rcu_read_lock();
> > +       tfile = rcu_dereference(tun->tfiles[qid]);
> > +       if (tfile)
> > +               __tun_xdp_flush_tfile(tfile);
> > +       rcu_read_unlock();
> > +       return 0;
> > +}
>
> I may miss something, but why not simply queue xdp frames into the ptr
> ring, so we don't need tricks for peek?

The current implementation follows other NIC drivers (ixgbe, ice, dpaa2,
mlx5), which use XDP descriptors directly. Converting XDP descriptors to
XDP frames does not seem to be very beneficial.

Thanks

>
> Thanks
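For completeness, a rough sketch (hypothetical, not from the patch; the helper
name is made up) of what the suggested alternative would involve. Because the
umem slot has to be completed back to userspace, the payload would have to be
copied out of the umem first, at which point the zero-copy benefit is gone —
essentially what the kernel's generic xdp_convert_zc_to_xdp_frame() copy path
does for xsk buffers:

static struct xdp_frame *tun_desc_to_xdp_frame(struct tun_file *tfile,
                                               const struct xdp_desc *desc)
{
        void *data = xsk_buff_raw_get_data(tfile->xsk_pool, desc->addr);
        unsigned int truesize = SKB_DATA_ALIGN(XDP_PACKET_HEADROOM + desc->len) +
                                SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        struct xdp_frame *frame;
        struct xdp_buff xdp;
        void *buf;

        buf = napi_alloc_frag(truesize);
        if (!buf)
                return NULL;

        /* Build a private copy of the packet with normal headroom/tailroom. */
        xdp_init_buff(&xdp, truesize, &tfile->xdp_rxq);
        xdp_prepare_buff(&xdp, buf, XDP_PACKET_HEADROOM, desc->len, false);
        memcpy(xdp.data, data, desc->len);

        frame = xdp_convert_buff_to_frame(&xdp);
        if (!frame) {
                skb_free_frag(buf);
                return NULL;
        }

        /* The umem slot is no longer referenced and can be completed now. */
        xsk_tx_completed(tfile->xsk_pool, 1);
        return frame;
}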