On Wed, Jan 24, 2024 at 5:38 PM Yunjian Wang <wangyunjian@xxxxxxxxxx> wrote: > > Now the zero-copy feature of AF_XDP socket is supported by some > drivers, which can reduce CPU utilization on the xdp program. > This patch set allows tun to support AF_XDP Rx zero-copy feature. > > This patch tries to address this by: > - Use peek_len to consume a xsk->desc and get xsk->desc length. > - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty. > So add a check for empty vq's array in vhost_net_buf_produce(). > - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support > - add tun_put_user_desc function to copy the Rx data to VM Code explains themselves, let's explain why you need to do this. 1) why you want to use peek_len 2) for "vq's array", what does it mean? 3) from the view of TUN/TAP tun_put_user_desc() is the TX path, so I guess you meant TX zerocopy instead of RX (as I don't see codes for RX?) A big question is how could you handle GSO packets from userspace/guests? > > Signed-off-by: Yunjian Wang <wangyunjian@xxxxxxxxxx> > --- > drivers/net/tun.c | 165 +++++++++++++++++++++++++++++++++++++++++++- > drivers/vhost/net.c | 18 +++-- > 2 files changed, 176 insertions(+), 7 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index afa5497f7c35..248b0f8e07d1 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -77,6 +77,7 @@ > #include <net/ax25.h> > #include <net/rose.h> > #include <net/6lowpan.h> > +#include <net/xdp_sock_drv.h> > > #include <linux/uaccess.h> > #include <linux/proc_fs.h> > @@ -145,6 +146,10 @@ struct tun_file { > struct tun_struct *detached; > struct ptr_ring tx_ring; > struct xdp_rxq_info xdp_rxq; > + struct xdp_desc desc; > + /* protects xsk pool */ > + spinlock_t pool_lock; > + struct xsk_buff_pool *pool; > }; > > struct tun_page { > @@ -208,6 +213,8 @@ struct tun_struct { > struct bpf_prog __rcu *xdp_prog; > struct tun_prog __rcu *steering_prog; > struct tun_prog __rcu *filter_prog; > + /* tracks AF_XDP ZC enabled queues */ > + unsigned long *af_xdp_zc_qps; > struct ethtool_link_ksettings link_ksettings; > /* init args */ > struct file *file; > @@ -795,6 +802,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file, > > tfile->queue_index = tun->numqueues; > tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; > + tfile->desc.len = 0; > + tfile->pool = NULL; > > if (tfile->detached) { > /* Re-attach detached tfile, updating XDP queue_index */ > @@ -989,6 +998,13 @@ static int tun_net_init(struct net_device *dev) > return err; > } > > + tun->af_xdp_zc_qps = bitmap_zalloc(MAX_TAP_QUEUES, GFP_KERNEL); > + if (!tun->af_xdp_zc_qps) { > + security_tun_dev_free_security(tun->security); > + free_percpu(dev->tstats); > + return -ENOMEM; > + } > + > tun_flow_init(tun); > > dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | > @@ -1009,6 +1025,7 @@ static int tun_net_init(struct net_device *dev) > tun_flow_uninit(tun); > security_tun_dev_free_security(tun->security); > free_percpu(dev->tstats); > + bitmap_free(tun->af_xdp_zc_qps); > return err; > } > return 0; > @@ -1222,11 +1239,77 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, > return 0; > } > > +static int tun_xsk_pool_enable(struct net_device *netdev, > + struct xsk_buff_pool *pool, > + u16 qid) > +{ > + struct tun_struct *tun = netdev_priv(netdev); > + struct tun_file *tfile; > + unsigned long flags; > + > + rcu_read_lock(); > + tfile = rtnl_dereference(tun->tfiles[qid]); > + if (!tfile) { > + rcu_read_unlock(); > + return -ENODEV; > + } > + > + spin_lock_irqsave(&tfile->pool_lock, flags); > + xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq); > + tfile->pool = pool; > + spin_unlock_irqrestore(&tfile->pool_lock, flags); > + > + rcu_read_unlock(); > + set_bit(qid, tun->af_xdp_zc_qps); > + > + return 0; > +} > + > +static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid) > +{ > + struct tun_struct *tun = netdev_priv(netdev); > + struct tun_file *tfile; > + unsigned long flags; > + > + if (!test_bit(qid, tun->af_xdp_zc_qps)) > + return 0; > + > + clear_bit(qid, tun->af_xdp_zc_qps); > + > + rcu_read_lock(); > + tfile = rtnl_dereference(tun->tfiles[qid]); > + if (!tfile) { > + rcu_read_unlock(); > + return 0; > + } > + > + spin_lock_irqsave(&tfile->pool_lock, flags); > + if (tfile->desc.len) { > + xsk_tx_completed(tfile->pool, 1); > + tfile->desc.len = 0; > + } > + tfile->pool = NULL; > + spin_unlock_irqrestore(&tfile->pool_lock, flags); > + > + rcu_read_unlock(); > + return 0; > +} > + > +int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool, > + u16 qid) > +{ > + return pool ? tun_xsk_pool_enable(dev, pool, qid) : > + tun_xsk_pool_disable(dev, qid); > +} > + > static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) > { > switch (xdp->command) { > case XDP_SETUP_PROG: > return tun_xdp_set(dev, xdp->prog, xdp->extack); > + case XDP_SETUP_XSK_POOL: > + return tun_xsk_pool_setup(dev, xdp->xsk.pool, > + xdp->xsk.queue_id); > default: > return -EINVAL; > } > @@ -1331,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) > return nxmit; > } > > +static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) > +{ > + struct tun_struct *tun = netdev_priv(dev); > + struct tun_file *tfile; > + > + rcu_read_lock(); > + tfile = rcu_dereference(tun->tfiles[qid]); > + if (tfile) > + __tun_xdp_flush_tfile(tfile); > + rcu_read_unlock(); > + return 0; > +} > + > static const struct net_device_ops tap_netdev_ops = { > .ndo_init = tun_net_init, > .ndo_uninit = tun_net_uninit, > @@ -1347,6 +1443,7 @@ static const struct net_device_ops tap_netdev_ops = { > .ndo_get_stats64 = dev_get_tstats64, > .ndo_bpf = tun_xdp, > .ndo_xdp_xmit = tun_xdp_xmit, > + .ndo_xsk_wakeup = tun_xsk_wakeup, > .ndo_change_carrier = tun_net_change_carrier, > }; > > @@ -1404,7 +1501,8 @@ static void tun_net_initialize(struct net_device *dev) > /* Currently tun does not support XDP, only tap does. */ > dev->xdp_features = NETDEV_XDP_ACT_BASIC | > NETDEV_XDP_ACT_REDIRECT | > - NETDEV_XDP_ACT_NDO_XMIT; > + NETDEV_XDP_ACT_NDO_XMIT | > + NETDEV_XDP_ACT_XSK_ZEROCOPY; > > break; > } > @@ -2213,6 +2311,37 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) > return ptr; > } > > +static ssize_t tun_put_user_desc(struct tun_struct *tun, > + struct tun_file *tfile, > + struct xdp_desc *desc, > + struct iov_iter *iter) > +{ > + size_t size = desc->len; > + int vnet_hdr_sz = 0; > + size_t ret; > + > + if (tun->flags & IFF_VNET_HDR) { > + struct virtio_net_hdr_mrg_rxbuf gso = { 0 }; > + > + vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); > + if (unlikely(iov_iter_count(iter) < vnet_hdr_sz)) > + return -EINVAL; > + if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) != > + sizeof(gso))) > + return -EFAULT; > + iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); > + } > + > + ret = copy_to_iter(xsk_buff_raw_get_data(tfile->pool, desc->addr), > + size, iter) + vnet_hdr_sz; > + > + preempt_disable(); > + dev_sw_netstats_tx_add(tun->dev, 1, ret); > + preempt_enable(); > + > + return ret; > +} > + > static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, > struct iov_iter *to, > int noblock, void *ptr) > @@ -2226,6 +2355,22 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, > } > > if (!ptr) { > + /* Read frames from xsk's desc */ > + if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) { > + spin_lock(&tfile->pool_lock); > + if (tfile->pool) { > + ret = tun_put_user_desc(tun, tfile, &tfile->desc, to); > + xsk_tx_completed(tfile->pool, 1); > + if (xsk_uses_need_wakeup(tfile->pool)) > + xsk_set_tx_need_wakeup(tfile->pool); > + tfile->desc.len = 0; > + } else { > + ret = -EBADFD; > + } > + spin_unlock(&tfile->pool_lock); > + return ret; > + } > + > /* Read frames from ring */ > ptr = tun_ring_recv(tfile, noblock, &err); > if (!ptr) > @@ -2311,6 +2456,7 @@ static void tun_free_netdev(struct net_device *dev) > > BUG_ON(!(list_empty(&tun->disabled))); > > + bitmap_free(tun->af_xdp_zc_qps); > free_percpu(dev->tstats); > tun_flow_uninit(tun); > security_tun_dev_free_security(tun->security); > @@ -2666,7 +2812,19 @@ static int tun_peek_len(struct socket *sock) > if (!tun) > return 0; > > - ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len); > + if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) { > + spin_lock(&tfile->pool_lock); > + if (tfile->pool && xsk_tx_peek_desc(tfile->pool, &tfile->desc)) { Does it mean if userspace doesn't peek, we can't read anything? We need to make sure syscall read works as well as vhost_net. > + xsk_tx_release(tfile->pool); > + ret = tfile->desc.len; > + /* The length of desc must be greater than 0 */ > + if (!ret) > + xsk_tx_completed(tfile->pool, 1); > + } > + spin_unlock(&tfile->pool_lock); > + } else { > + ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len); > + } > tun_put(tun); > > return ret; > @@ -3469,8 +3627,11 @@ static int tun_chr_open(struct inode *inode, struct file * file) > > mutex_init(&tfile->napi_mutex); > RCU_INIT_POINTER(tfile->tun, NULL); > + spin_lock_init(&tfile->pool_lock); > tfile->flags = 0; > tfile->ifindex = 0; > + tfile->pool = NULL; > + tfile->desc.len = 0; > > init_waitqueue_head(&tfile->socket.wq.wait); > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index f2ed7167c848..a1f143ad2341 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -169,9 +169,10 @@ static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq) > > static void *vhost_net_buf_consume(struct vhost_net_buf *rxq) > { > - void *ret = vhost_net_buf_get_ptr(rxq); > - ++rxq->head; > - return ret; > + if (rxq->tail == rxq->head) > + return NULL; > + > + return rxq->queue[rxq->head++]; > } > > static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) > @@ -993,12 +994,19 @@ static void handle_tx(struct vhost_net *net) > > static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) > { > + struct socket *sock = sk->sk_socket; > struct sk_buff *head; > int len = 0; > unsigned long flags; > > - if (rvq->rx_ring) > - return vhost_net_buf_peek(rvq); > + if (rvq->rx_ring) { > + len = vhost_net_buf_peek(rvq); > + if (likely(len)) > + return len; > + } > + > + if (sock->ops->peek_len) > + return sock->ops->peek_len(sock); What prevents you from reusing the ptr_ring here? Then you don't need the above tricks. Thanks > > spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); > head = skb_peek(&sk->sk_receive_queue); > -- > 2.33.0 >