> -----Original Message-----
> From: Willem de Bruijn [mailto:willemdebruijn.kernel@xxxxxxxxx]
> Sent: Thursday, January 25, 2024 3:05 AM
> To: wangyunjian <wangyunjian@xxxxxxxxxx>; mst@xxxxxxxxxx;
> willemdebruijn.kernel@xxxxxxxxx; jasowang@xxxxxxxxxx; kuba@xxxxxxxxxx;
> davem@xxxxxxxxxxxxx; magnus.karlsson@xxxxxxxxx
> Cc: netdev@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx;
> kvm@xxxxxxxxxxxxxxx; virtualization@xxxxxxxxxxxxxxx; xudingke
> <xudingke@xxxxxxxxxx>; wangyunjian <wangyunjian@xxxxxxxxxx>
> Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
>
> Yunjian Wang wrote:
> > Now the zero-copy feature of AF_XDP socket is supported by some
> > drivers, which can reduce CPU utilization on the xdp program.
> > This patch set allows tun to support AF_XDP Rx zero-copy feature.
> >
> > This patch tries to address this by:
> > - Use peek_len to consume a xsk->desc and get xsk->desc length.
> > - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> >   So add a check for empty vq's array in vhost_net_buf_produce().
> > - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> > - add tun_put_user_desc function to copy the Rx data to VM
> >
> > Signed-off-by: Yunjian Wang <wangyunjian@xxxxxxxxxx>
>
> I don't fully understand the higher level design of this feature yet.

The high level goal is to save one memory copy: with AF_XDP zero-copy
enabled, tun consumes descriptors directly from the xsk pool's Tx ring
instead of having each frame first copied into a kernel buffer that is
queued on the ptr_ring.

> But some initial comments at the code level.
>
> > ---
> >  drivers/net/tun.c   | 165 +++++++++++++++++++++++++++++++++++++++++++-
> >  drivers/vhost/net.c |  18 +++--
> >  2 files changed, 176 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > index afa5497f7c35..248b0f8e07d1 100644
> > --- a/drivers/net/tun.c
> > +++ b/drivers/net/tun.c
> > @@ -77,6 +77,7 @@
> >  #include <net/ax25.h>
> >  #include <net/rose.h>
> >  #include <net/6lowpan.h>
> > +#include <net/xdp_sock_drv.h>
> >
> >  #include <linux/uaccess.h>
> >  #include <linux/proc_fs.h>
> > @@ -145,6 +146,10 @@ struct tun_file {
> >  	struct tun_struct *detached;
> >  	struct ptr_ring tx_ring;
> >  	struct xdp_rxq_info xdp_rxq;
> > +	struct xdp_desc desc;
> > +	/* protects xsk pool */
> > +	spinlock_t pool_lock;
> > +	struct xsk_buff_pool *pool;
> >  };
> >
> >  struct tun_page {
> > @@ -208,6 +213,8 @@ struct tun_struct {
> >  	struct bpf_prog __rcu *xdp_prog;
> >  	struct tun_prog __rcu *steering_prog;
> >  	struct tun_prog __rcu *filter_prog;
> > +	/* tracks AF_XDP ZC enabled queues */
> > +	unsigned long *af_xdp_zc_qps;
> >  	struct ethtool_link_ksettings link_ksettings;
> >  	/* init args */
> >  	struct file *file;
> > @@ -795,6 +802,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
> >
> >  	tfile->queue_index = tun->numqueues;
> >  	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
> > +	tfile->desc.len = 0;
> > +	tfile->pool = NULL;
> >
> >  	if (tfile->detached) {
> >  		/* Re-attach detached tfile, updating XDP queue_index */
> > @@ -989,6 +998,13 @@ static int tun_net_init(struct net_device *dev)
> >  		return err;
> >  	}
> >
> > +	tun->af_xdp_zc_qps = bitmap_zalloc(MAX_TAP_QUEUES, GFP_KERNEL);
> > +	if (!tun->af_xdp_zc_qps) {
> > +		security_tun_dev_free_security(tun->security);
> > +		free_percpu(dev->tstats);
> > +		return -ENOMEM;
> > +	}
> > +
> >  	tun_flow_init(tun);
> >
> >  	dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
> > @@ -1009,6 +1025,7 @@ static int tun_net_init(struct net_device *dev)
> >  		tun_flow_uninit(tun);
> >  		security_tun_dev_free_security(tun->security);
> >  		free_percpu(dev->tstats);
> > +		bitmap_free(tun->af_xdp_zc_qps);
>
> Please release state in inverse order of acquire.

Will do, thanks.
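Concretely, the error path will become something like this (a sketch;
it assumes the acquire order in tun_net_init() stays tstats, then
security, then the zc bitmap, then flow state):

	/* Release in inverse order of acquire. */
	tun_flow_uninit(tun);
	bitmap_free(tun->af_xdp_zc_qps);
	security_tun_dev_free_security(tun->security);
	free_percpu(dev->tstats);
	return err;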
> >  		return err;
> >  	}
> >  	return 0;
> > @@ -1222,11 +1239,77 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
> >  	return 0;
> >  }
> >
> > +static int tun_xsk_pool_enable(struct net_device *netdev,
> > +			       struct xsk_buff_pool *pool,
> > +			       u16 qid)
> > +{
> > +	struct tun_struct *tun = netdev_priv(netdev);
> > +	struct tun_file *tfile;
> > +	unsigned long flags;
> > +
> > +	rcu_read_lock();
> > +	tfile = rtnl_dereference(tun->tfiles[qid]);
> > +	if (!tfile) {
> > +		rcu_read_unlock();
> > +		return -ENODEV;
> > +	}
>
> No need for rcu_read_lock with rtnl_dereference.
>
> Consider ASSERT_RTNL() if unsure whether this path could be reached
> without the rtnl held.

Will do, thanks.

> > +
> > +	spin_lock_irqsave(&tfile->pool_lock, flags);
> > +	xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
> > +	tfile->pool = pool;
> > +	spin_unlock_irqrestore(&tfile->pool_lock, flags);
> > +
> > +	rcu_read_unlock();
> > +	set_bit(qid, tun->af_xdp_zc_qps);
>
> What are the concurrency semantics: there's a spinlock to make the
> update to xdp_rxq and pool a critical section, but the bitmap is not
> part of this? Please also then document why the irqsave.

The bitmap only indicates whether an XDP pool is enabled on a given
queue, so that queues without a pool take the existing code paths at no
extra cost. The pool itself may be released (via tun_xsk_pool_disable())
while packets are still being received, so the spinlock protects only
the pool pointer and the in-flight descriptor, not the bitmap.

> > +
> > +	return 0;
> > +}
> > +
> > +static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid)
> > +{
> > +	struct tun_struct *tun = netdev_priv(netdev);
> > +	struct tun_file *tfile;
> > +	unsigned long flags;
> > +
> > +	if (!test_bit(qid, tun->af_xdp_zc_qps))
> > +		return 0;
> > +
> > +	clear_bit(qid, tun->af_xdp_zc_qps);
>
> Time of check to time of use race between test and clear? Or is there
> no race because anything that clears will hold the RTNL? If so, please
> add a comment.
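If I read the xsk core correctly there is no race: both
tun_xsk_pool_enable() and tun_xsk_pool_disable() are reached only via
ndo_bpf (XDP_SETUP_XSK_POOL), which is invoked with the rtnl lock held.
I will add a comment in the next version, along these lines (a sketch):

	/* No race between test_bit() and clear_bit(): enable and
	 * disable both run via ndo_bpf (XDP_SETUP_XSK_POOL) under
	 * the rtnl lock.
	 */
	if (!test_bit(qid, tun->af_xdp_zc_qps))
		return 0;

	clear_bit(qid, tun->af_xdp_zc_qps);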
> > +
> > +	rcu_read_lock();
> > +	tfile = rtnl_dereference(tun->tfiles[qid]);
> > +	if (!tfile) {
> > +		rcu_read_unlock();
> > +		return 0;
> > +	}
> > +
> > +	spin_lock_irqsave(&tfile->pool_lock, flags);
> > +	if (tfile->desc.len) {
> > +		xsk_tx_completed(tfile->pool, 1);
> > +		tfile->desc.len = 0;
> > +	}
> > +	tfile->pool = NULL;
> > +	spin_unlock_irqrestore(&tfile->pool_lock, flags);
> > +
> > +	rcu_read_unlock();
> > +	return 0;
> > +}
> > +
> > +int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
> > +		       u16 qid)
> > +{
> > +	return pool ? tun_xsk_pool_enable(dev, pool, qid) :
> > +		      tun_xsk_pool_disable(dev, qid);
> > +}
> > +
> >  static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> >  {
> >  	switch (xdp->command) {
> >  	case XDP_SETUP_PROG:
> >  		return tun_xdp_set(dev, xdp->prog, xdp->extack);
> > +	case XDP_SETUP_XSK_POOL:
> > +		return tun_xsk_pool_setup(dev, xdp->xsk.pool,
> > +					  xdp->xsk.queue_id);
> >  	default:
> >  		return -EINVAL;
> >  	}
> > @@ -1331,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
> >  	return nxmit;
> >  }
> >
> > +static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> > +{
> > +	struct tun_struct *tun = netdev_priv(dev);
> > +	struct tun_file *tfile;
> > +
> > +	rcu_read_lock();
> > +	tfile = rcu_dereference(tun->tfiles[qid]);
> > +	if (tfile)
> > +		__tun_xdp_flush_tfile(tfile);
> > +	rcu_read_unlock();
> > +	return 0;
> > +}
> > +
> >  static const struct net_device_ops tap_netdev_ops = {
> >  	.ndo_init		= tun_net_init,
> >  	.ndo_uninit		= tun_net_uninit,
> > @@ -1347,6 +1443,7 @@ static const struct net_device_ops tap_netdev_ops = {
> >  	.ndo_get_stats64	= dev_get_tstats64,
> >  	.ndo_bpf		= tun_xdp,
> >  	.ndo_xdp_xmit		= tun_xdp_xmit,
> > +	.ndo_xsk_wakeup		= tun_xsk_wakeup,
> >  	.ndo_change_carrier	= tun_net_change_carrier,
> >  };
> >
> > @@ -1404,7 +1501,8 @@ static void tun_net_initialize(struct net_device *dev)
> >  		/* Currently tun does not support XDP, only tap does. */
> >  		dev->xdp_features = NETDEV_XDP_ACT_BASIC |
> >  				    NETDEV_XDP_ACT_REDIRECT |
> > -				    NETDEV_XDP_ACT_NDO_XMIT;
> > +				    NETDEV_XDP_ACT_NDO_XMIT |
> > +				    NETDEV_XDP_ACT_XSK_ZEROCOPY;
> >
> >  		break;
> >  	}
> > @@ -2213,6 +2311,37 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
> >  	return ptr;
> >  }
> >
> > +static ssize_t tun_put_user_desc(struct tun_struct *tun,
> > +				 struct tun_file *tfile,
> > +				 struct xdp_desc *desc,
> > +				 struct iov_iter *iter)
> > +{
> > +	size_t size = desc->len;
> > +	int vnet_hdr_sz = 0;
> > +	size_t ret;
> > +
> > +	if (tun->flags & IFF_VNET_HDR) {
> > +		struct virtio_net_hdr_mrg_rxbuf gso = { 0 };
> > +
> > +		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
> > +		if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
> > +			return -EINVAL;
> > +		if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
> > +			     sizeof(gso)))
> > +			return -EFAULT;
> > +		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
> > +	}
> > +
> > +	ret = copy_to_iter(xsk_buff_raw_get_data(tfile->pool, desc->addr),
> > +			   size, iter) + vnet_hdr_sz;
> > +
> > +	preempt_disable();
> > +	dev_sw_netstats_tx_add(tun->dev, 1, ret);
> > +	preempt_enable();
> > +
> > +	return ret;
> > +}
> > +
>
> This is almost a copy of tun_put_user_xdp. Can we refactor to avoid
> duplicating code (here and elsewhere this may apply)?

Will do.

> >  static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
> >  			   struct iov_iter *to,
> >  			   int noblock, void *ptr)
> > @@ -2226,6 +2355,22 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
> >  	}
> >
> >  	if (!ptr) {
> > +		/* Read frames from xsk's desc */
> > +		if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
> > +			spin_lock(&tfile->pool_lock);
> > +			if (tfile->pool) {
> > +				ret = tun_put_user_desc(tun, tfile, &tfile->desc, to);
> > +				xsk_tx_completed(tfile->pool, 1);
> > +				if (xsk_uses_need_wakeup(tfile->pool))
> > +					xsk_set_tx_need_wakeup(tfile->pool);
>
> For the xsk maintainers if they're reading along: this two line
> pattern is seen quite often. Deserves a helper in a header file?
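Agreed, a small inline helper would remove the duplication. A possible
sketch (the name and a home in include/net/xdp_sock_drv.h are only a
suggestion):

	/* Complete @nb_entries Tx descriptors and, if the socket asked
	 * for it, rearm the need_wakeup flag in one step.
	 */
	static inline void xsk_tx_completed_wakeup(struct xsk_buff_pool *pool,
						   u32 nb_entries)
	{
		xsk_tx_completed(pool, nb_entries);
		if (xsk_uses_need_wakeup(pool))
			xsk_set_tx_need_wakeup(pool);
	}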
> > +				tfile->desc.len = 0;
> > +			} else {
> > +				ret = -EBADFD;
> > +			}
> > +			spin_unlock(&tfile->pool_lock);
> > +			return ret;
> > +		}
> > +
> >  		/* Read frames from ring */
> >  		ptr = tun_ring_recv(tfile, noblock, &err);
> >  		if (!ptr)
> > @@ -2311,6 +2456,7 @@ static void tun_free_netdev(struct net_device *dev)
> >
> >  	BUG_ON(!(list_empty(&tun->disabled)));
> >
> > +	bitmap_free(tun->af_xdp_zc_qps);
> >  	free_percpu(dev->tstats);
> >  	tun_flow_uninit(tun);
> >  	security_tun_dev_free_security(tun->security);
> > @@ -2666,7 +2812,19 @@ static int tun_peek_len(struct socket *sock)
> >  	if (!tun)
> >  		return 0;
> >
> > -	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
> > +	if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
> > +		spin_lock(&tfile->pool_lock);
> > +		if (tfile->pool && xsk_tx_peek_desc(tfile->pool, &tfile->desc)) {
> > +			xsk_tx_release(tfile->pool);
> > +			ret = tfile->desc.len;
> > +			/* The length of desc must be greater than 0 */
> > +			if (!ret)
> > +				xsk_tx_completed(tfile->pool, 1);
>
> Peek semantics usually don't result in releasing the buffer. Am I
> misunderstanding this operation?
>
> > +		}
> > +		spin_unlock(&tfile->pool_lock);
> > +	} else {
> > +		ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
> > +	}
> >  	tun_put(tun);
> >
> >  	return ret;
> > @@ -3469,8 +3627,11 @@ static int tun_chr_open(struct inode *inode, struct file * file)
> >
> >  	mutex_init(&tfile->napi_mutex);
> >  	RCU_INIT_POINTER(tfile->tun, NULL);
> > +	spin_lock_init(&tfile->pool_lock);
> >  	tfile->flags = 0;
> >  	tfile->ifindex = 0;
> > +	tfile->pool = NULL;
> > +	tfile->desc.len = 0;
> >
> >  	init_waitqueue_head(&tfile->socket.wq.wait);
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index f2ed7167c848..a1f143ad2341 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
>
> For virtio maintainer: is it okay to have tun and vhost/net changes in
> the same patch, or is it better to split them?
>
> > @@ -169,9 +169,10 @@ static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
> >
> >  static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
> >  {
> > -	void *ret = vhost_net_buf_get_ptr(rxq);
> > -	++rxq->head;
> > -	return ret;
> > +	if (rxq->tail == rxq->head)
> > +		return NULL;
> > +
> > +	return rxq->queue[rxq->head++];
>
> Why this change?
>
> >  }
> >
> >  static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
> > @@ -993,12 +994,19 @@ static void handle_tx(struct vhost_net *net)
> >
> >  static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
> >  {
> > +	struct socket *sock = sk->sk_socket;
> >  	struct sk_buff *head;
> >  	int len = 0;
> >  	unsigned long flags;
> >
> > -	if (rvq->rx_ring)
> > -		return vhost_net_buf_peek(rvq);
> > +	if (rvq->rx_ring) {
> > +		len = vhost_net_buf_peek(rvq);
> > +		if (likely(len))
> > +			return len;
> > +	}
> > +
> > +	if (sock->ops->peek_len)
> > +		return sock->ops->peek_len(sock);
> >
> >  	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
> >  	head = skb_peek(&sk->sk_receive_queue);
> > --
> > 2.33.0

Thanks