Several drivers now support the zero-copy feature of AF_XDP sockets, which can reduce CPU utilization for XDP programs. This patch adds AF_XDP Rx zero-copy support to tun. It does so by:

- Using peek_len to consume an xsk->desc and obtain its length.
- Adding a check for an empty vq array in vhost_net_buf_consume(),
  since the array may be empty when tun supports AF_XDP Rx zero-copy.
- Adding XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support.
- Adding a tun_put_user_desc() function to copy the Rx data to the VM.

Signed-off-by: Yunjian Wang <wangyunjian@xxxxxxxxxx>
---
 drivers/net/tun.c   | 165 +++++++++++++++++++++++++++++++++++++++++++-
 drivers/vhost/net.c |  18 +++--
 2 files changed, 176 insertions(+), 7 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index afa5497f7c35..248b0f8e07d1 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -77,6 +77,7 @@
 #include <net/ax25.h>
 #include <net/rose.h>
 #include <net/6lowpan.h>
+#include <net/xdp_sock_drv.h>
 #include <linux/uaccess.h>
 #include <linux/proc_fs.h>
@@ -145,6 +146,10 @@ struct tun_file {
	struct tun_struct *detached;
	struct ptr_ring tx_ring;
	struct xdp_rxq_info xdp_rxq;
+	struct xdp_desc desc;
+	/* protects xsk pool */
+	spinlock_t pool_lock;
+	struct xsk_buff_pool *pool;
 };
 
 struct tun_page {
@@ -208,6 +213,8 @@ struct tun_struct {
	struct bpf_prog __rcu *xdp_prog;
	struct tun_prog __rcu *steering_prog;
	struct tun_prog __rcu *filter_prog;
+	/* tracks AF_XDP ZC enabled queues */
+	unsigned long *af_xdp_zc_qps;
	struct ethtool_link_ksettings link_ksettings;
	/* init args */
	struct file *file;
@@ -795,6 +802,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 
	tfile->queue_index = tun->numqueues;
	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
+	tfile->desc.len = 0;
+	tfile->pool = NULL;
 
	if (tfile->detached) {
		/* Re-attach detached tfile, updating XDP queue_index */
@@ -989,6 +998,13 @@ static int tun_net_init(struct net_device *dev)
		return err;
	}
 
+	tun->af_xdp_zc_qps = bitmap_zalloc(MAX_TAP_QUEUES, GFP_KERNEL);
+	if (!tun->af_xdp_zc_qps) {
+		security_tun_dev_free_security(tun->security);
+		free_percpu(dev->tstats);
+		return -ENOMEM;
+	}
+
	tun_flow_init(tun);
 
	dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
@@ -1009,6 +1025,7 @@ static int tun_net_init(struct net_device *dev)
		tun_flow_uninit(tun);
		security_tun_dev_free_security(tun->security);
		free_percpu(dev->tstats);
+		bitmap_free(tun->af_xdp_zc_qps);
		return err;
	}
	return 0;
@@ -1222,11 +1239,77 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
	return 0;
 }
 
+static int tun_xsk_pool_enable(struct net_device *netdev,
+			       struct xsk_buff_pool *pool,
+			       u16 qid)
+{
+	struct tun_struct *tun = netdev_priv(netdev);
+	struct tun_file *tfile;
+	unsigned long flags;
+
+	rcu_read_lock();
+	tfile = rtnl_dereference(tun->tfiles[qid]);
+	if (!tfile) {
+		rcu_read_unlock();
+		return -ENODEV;
+	}
+
+	spin_lock_irqsave(&tfile->pool_lock, flags);
+	xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
+	tfile->pool = pool;
+	spin_unlock_irqrestore(&tfile->pool_lock, flags);
+
+	rcu_read_unlock();
+	set_bit(qid, tun->af_xdp_zc_qps);
+
+	return 0;
+}
+
+static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid)
+{
+	struct tun_struct *tun = netdev_priv(netdev);
+	struct tun_file *tfile;
+	unsigned long flags;
+
+	if (!test_bit(qid, tun->af_xdp_zc_qps))
+		return 0;
+
+	clear_bit(qid, tun->af_xdp_zc_qps);
+
+	rcu_read_lock();
+	tfile = rtnl_dereference(tun->tfiles[qid]);
+	if (!tfile) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+	spin_lock_irqsave(&tfile->pool_lock, flags);
+	if (tfile->desc.len) {
+		xsk_tx_completed(tfile->pool, 1);
+		tfile->desc.len = 0;
+	}
+	tfile->pool = NULL;
+	spin_unlock_irqrestore(&tfile->pool_lock, flags);
+
+	rcu_read_unlock();
+	return 0;
+}
+
+static int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
+			      u16 qid)
+{
+	return pool ? tun_xsk_pool_enable(dev, pool, qid) :
+		tun_xsk_pool_disable(dev, qid);
+}
+
 static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return tun_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_SETUP_XSK_POOL:
+		return tun_xsk_pool_setup(dev, xdp->xsk.pool,
+					  xdp->xsk.queue_id);
	default:
		return -EINVAL;
	}
@@ -1331,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
	return nxmit;
 }
 
+static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile;
+
+	rcu_read_lock();
+	tfile = rcu_dereference(tun->tfiles[qid]);
+	if (tfile)
+		__tun_xdp_flush_tfile(tfile);
+	rcu_read_unlock();
+	return 0;
+}
+
 static const struct net_device_ops tap_netdev_ops = {
	.ndo_init		= tun_net_init,
	.ndo_uninit		= tun_net_uninit,
@@ -1347,6 +1443,7 @@ static const struct net_device_ops tap_netdev_ops = {
	.ndo_get_stats64	= dev_get_tstats64,
	.ndo_bpf		= tun_xdp,
	.ndo_xdp_xmit		= tun_xdp_xmit,
+	.ndo_xsk_wakeup		= tun_xsk_wakeup,
	.ndo_change_carrier	= tun_net_change_carrier,
 };
 
@@ -1404,7 +1501,8 @@ static void tun_net_initialize(struct net_device *dev)
		/* Currently tun does not support XDP, only tap does. */
		dev->xdp_features = NETDEV_XDP_ACT_BASIC |
				    NETDEV_XDP_ACT_REDIRECT |
-				    NETDEV_XDP_ACT_NDO_XMIT;
+				    NETDEV_XDP_ACT_NDO_XMIT |
+				    NETDEV_XDP_ACT_XSK_ZEROCOPY;
		break;
	}
 
@@ -2213,6 +2311,37 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
	return ptr;
 }
 
+static ssize_t tun_put_user_desc(struct tun_struct *tun,
+				 struct tun_file *tfile,
+				 struct xdp_desc *desc,
+				 struct iov_iter *iter)
+{
+	size_t size = desc->len;
+	int vnet_hdr_sz = 0;
+	size_t ret;
+
+	if (tun->flags & IFF_VNET_HDR) {
+		struct virtio_net_hdr_mrg_rxbuf gso = { 0 };
+
+		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
+		if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
+			return -EINVAL;
+		if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
+			     sizeof(gso)))
+			return -EFAULT;
+		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
+	}
+
+	ret = copy_to_iter(xsk_buff_raw_get_data(tfile->pool, desc->addr),
+			   size, iter) + vnet_hdr_sz;
+
+	preempt_disable();
+	dev_sw_netstats_tx_add(tun->dev, 1, ret);
+	preempt_enable();
+
+	return ret;
+}
+
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
			   struct iov_iter *to,
			   int noblock, void *ptr)
@@ -2226,6 +2355,22 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
	}
 
	if (!ptr) {
+		/* Read frames from xsk's desc */
+		if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
+			spin_lock(&tfile->pool_lock);
+			if (tfile->pool) {
+				ret = tun_put_user_desc(tun, tfile, &tfile->desc, to);
+				xsk_tx_completed(tfile->pool, 1);
+				if (xsk_uses_need_wakeup(tfile->pool))
+					xsk_set_tx_need_wakeup(tfile->pool);
+				tfile->desc.len = 0;
+			} else {
+				ret = -EBADFD;
+			}
+			spin_unlock(&tfile->pool_lock);
+			return ret;
+		}
+
		/* Read frames from ring */
		ptr = tun_ring_recv(tfile, noblock, &err);
		if (!ptr)
@@ -2311,6 +2456,7 @@ static void tun_free_netdev(struct net_device *dev)
 
	BUG_ON(!(list_empty(&tun->disabled)));
 
+	bitmap_free(tun->af_xdp_zc_qps);
	free_percpu(dev->tstats);
	tun_flow_uninit(tun);
	security_tun_dev_free_security(tun->security);
@@ -2666,7 +2812,19 @@ static int tun_peek_len(struct socket *sock)
	if (!tun)
		return 0;
 
-	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
+	if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
+		spin_lock(&tfile->pool_lock);
+		if (tfile->pool && xsk_tx_peek_desc(tfile->pool, &tfile->desc)) {
+			xsk_tx_release(tfile->pool);
+			ret = tfile->desc.len;
+			/* The length of desc must be greater than 0 */
+			if (!ret)
+				xsk_tx_completed(tfile->pool, 1);
+		}
+		spin_unlock(&tfile->pool_lock);
+	} else {
+		ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
+	}
 
	tun_put(tun);
	return ret;
@@ -3469,8 +3627,11 @@ static int tun_chr_open(struct inode *inode, struct file * file)
	mutex_init(&tfile->napi_mutex);
	RCU_INIT_POINTER(tfile->tun, NULL);
+	spin_lock_init(&tfile->pool_lock);
	tfile->flags = 0;
	tfile->ifindex = 0;
+	tfile->pool = NULL;
+	tfile->desc.len = 0;
 
	init_waitqueue_head(&tfile->socket.wq.wait);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..a1f143ad2341 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -169,9 +169,10 @@ static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
 
 static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
 {
-	void *ret = vhost_net_buf_get_ptr(rxq);
-	++rxq->head;
-	return ret;
+	if (rxq->tail == rxq->head)
+		return NULL;
+
+	return rxq->queue[rxq->head++];
 }
 
 static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
@@ -993,12 +994,19 @@ static void handle_tx(struct vhost_net *net)
 
 static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 {
+	struct socket *sock = sk->sk_socket;
	struct sk_buff *head;
	int len = 0;
	unsigned long flags;
 
-	if (rvq->rx_ring)
-		return vhost_net_buf_peek(rvq);
+	if (rvq->rx_ring) {
+		len = vhost_net_buf_peek(rvq);
+		if (likely(len))
+			return len;
+	}
+
+	if (sock->ops->peek_len)
+		return sock->ops->peek_len(sock);
 
	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
	head = skb_peek(&sk->sk_receive_queue);
-- 
2.33.0
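
[Editor's note, not part of the patch: a minimal userspace sketch of how
an application could bind a zero-copy AF_XDP socket to a tun queue and
push one frame toward the guest once this series is applied. It assumes
the xsk helpers from libxdp (<xdp/xsk.h>; <bpf/xsk.h> with older
libbpf), a tap-mode device named "tun0" and queue id 0 (both
placeholders), and omits all error handling.]

#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>
#include <xdp/xsk.h>

#define NUM_FRAMES	4096
#define FRAME_SIZE	XSK_UMEM__DEFAULT_FRAME_SIZE

int main(void)
{
	struct xsk_socket_config cfg = {
		.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
		/* request the zero-copy mode advertised by this series
		 * via NETDEV_XDP_ACT_XSK_ZEROCOPY */
		.bind_flags = XDP_ZEROCOPY | XDP_USE_NEED_WAKEUP,
	};
	struct xsk_ring_prod fill, tx;
	struct xsk_ring_cons comp, rx;
	struct xsk_socket *xsk;
	struct xsk_umem *umem;
	void *bufs;
	__u32 idx;

	bufs = mmap(NULL, NUM_FRAMES * FRAME_SIZE, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* register the UMEM, then bind the socket to tun0, queue 0 */
	xsk_umem__create(&umem, bufs, NUM_FRAMES * FRAME_SIZE,
			 &fill, &comp, NULL);
	xsk_socket__create(&xsk, "tun0", 0, umem, &rx, &tx, &cfg);

	/* queue one 64-byte frame on the TX ring: tun_peek_len() then
	 * consumes the descriptor via xsk_tx_peek_desc(), and vhost-net
	 * copies the payload into the guest via tun_put_user_desc() */
	xsk_ring_prod__reserve(&tx, 1, &idx);
	xsk_ring_prod__tx_desc(&tx, idx)->addr = 0;
	xsk_ring_prod__tx_desc(&tx, idx)->len = 64;
	xsk_ring_prod__submit(&tx, 1);
	if (xsk_ring_prod__needs_wakeup(&tx))
		sendto(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT,
		       NULL, 0);
	return 0;
}

[Whether libxdp's default XSK-redirect program attaches cleanly to a
tap device is not shown here; the sketch only illustrates the bind and
TX-ring flow that the new ndo_bpf/ndo_xsk_wakeup callbacks serve.]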