This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
which can significantly reduce CPU utilization for XDP programs.

Since commit fc72d1d54dd9 ("tuntap: XDP transmission"), the pointer ring
has been used to queue different types of pointers by encoding the type
into the lower bits. Therefore, we introduce a new flag,
TUN_XDP_DESC_FLAG (0x2UL), which allows us to enqueue XDP descriptors
and differentiate them from XDP buffers and sk_buffs. Additionally, a
spin lock is added to protect the enable and disable operations on the
xsk pool.

Performance testing was done on an Intel E5-2620 2.40GHz machine.
Traffic was generated and sent through TUN (testpmd txonly with AF_XDP)
to a VM (testpmd rxonly in the guest).

+----------+--------+-----------+---------+
| UDP size |  copy  | zero-copy | speedup |
| (bytes)  | (Mpps) |  (Mpps)   |         |
+----------+--------+-----------+---------+
|       64 |    2.5 |       4.0 |     60% |
|      512 |    2.1 |       3.6 |     71% |
|     1024 |    1.9 |       3.3 |     73% |
+----------+--------+-----------+---------+

Signed-off-by: Yunjian Wang <wangyunjian@xxxxxxxxxx>
---
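Note: the two sketches below are illustrative only and are not part of
the patch.

First, the resulting tx_ring tagging scheme at a glance. It relies on
ring entries being at least 4-byte aligned, so pointer bits 0x1 and 0x2
are free to carry the type tag; the handle_*() consumers are
hypothetical placeholders:

	void *entry = tun_xdp_desc_to_ptr(desc);   /* tags with 0x2 */

	if (tun_is_xdp_frame(entry))               /* 0x1: XDP frame */
		handle_frame(tun_ptr_to_xdp(entry));
	else if (tun_is_xdp_desc_frame(entry))     /* 0x2: XSK Tx descriptor */
		handle_desc(tun_ptr_to_xdp_desc(entry));
	else                                       /* untagged: sk_buff */
		handle_skb((struct sk_buff *)entry);

Second, a minimal sketch of how userspace would exercise the new Tx
path, assuming the libxdp xsk helpers are available; the device name,
queue id and ring sizes are illustrative:

	#include <linux/if_xdp.h> /* XDP_ZEROCOPY, XDP_USE_NEED_WAKEUP */
	#include <xdp/xsk.h>      /* libxdp xsk helper library */

	struct tun_zc_sock {
		struct xsk_ring_prod fq, tx;
		struct xsk_ring_cons cq, rx;
		struct xsk_umem *umem;
		struct xsk_socket *xsk;
	};

	static int tun_zc_bind(struct tun_zc_sock *s, void *umem_area,
			       __u64 size)
	{
		struct xsk_socket_config cfg = {
			.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
			.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
			/* Request the zero-copy path added by this patch;
			 * bind() fails if the driver cannot honor it.
			 */
			.bind_flags = XDP_ZEROCOPY | XDP_USE_NEED_WAKEUP,
		};

		if (xsk_umem__create(&s->umem, umem_area, size,
				     &s->fq, &s->cq, NULL))
			return -1;
		/* "tun0"/queue 0 are illustrative; any attached queue works */
		return xsk_socket__create(&s->xsk, "tun0", 0, s->umem,
					  &s->rx, &s->tx, &cfg);
	}

After userspace fills s->tx, a sendto() kick on xsk_socket__fd(s->xsk)
reaches the new tun_xsk_wakeup() via ndo_xsk_wakeup when the
need-wakeup flag is set.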
 drivers/net/tun.c      | 177 ++++++++++++++++++++++++++++++++++++++++--
 drivers/vhost/net.c    |   4 +
 include/linux/if_tun.h |  32 ++++++++
 3 files changed, 208 insertions(+), 5 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bc80fc1d576e..7f4ff50b532c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -63,6 +63,7 @@
 #include <net/rtnetlink.h>
 #include <net/sock.h>
 #include <net/xdp.h>
+#include <net/xdp_sock_drv.h>
 #include <net/ip_tunnels.h>
 #include <linux/seq_file.h>
 #include <linux/uio.h>
@@ -86,6 +87,7 @@ static void tun_default_link_ksettings(struct net_device *dev,
 					   struct ethtool_link_ksettings *cmd);

 #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+#define TUN_XDP_BATCH 64

 /* TUN device flags */
@@ -146,6 +148,9 @@ struct tun_file {
 	struct tun_struct *detached;
 	struct ptr_ring tx_ring;
 	struct xdp_rxq_info xdp_rxq;
+	struct xsk_buff_pool *xsk_pool;
+	spinlock_t pool_lock;	/* Protects xsk pool enable/disable */
+	u32 nb_descs;
 };

 struct tun_page {
@@ -614,6 +619,8 @@ void tun_ptr_free(void *ptr)
 		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

 		xdp_return_frame(xdpf);
+	} else if (tun_is_xdp_desc_frame(ptr)) {
+		return;
 	} else {
 		__skb_array_destroy_skb(ptr);
 	}
@@ -631,6 +638,37 @@ static void tun_queue_purge(struct tun_file *tfile)
 	skb_queue_purge(&tfile->sk.sk_error_queue);
 }

+static void tun_set_xsk_pool(struct tun_file *tfile, struct xsk_buff_pool *pool)
+{
+	if (!pool)
+		return;
+
+	spin_lock(&tfile->pool_lock);
+	xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
+	tfile->xsk_pool = pool;
+	spin_unlock(&tfile->pool_lock);
+}
+
+static void tun_clean_xsk_pool(struct tun_file *tfile)
+{
+	spin_lock(&tfile->pool_lock);
+	if (tfile->xsk_pool) {
+		void *ptr;
+
+		while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
+			tun_ptr_free(ptr);
+
+		if (tfile->nb_descs) {
+			xsk_tx_completed(tfile->xsk_pool, tfile->nb_descs);
+			if (xsk_uses_need_wakeup(tfile->xsk_pool))
+				xsk_set_tx_need_wakeup(tfile->xsk_pool);
+			tfile->nb_descs = 0;
+		}
+		tfile->xsk_pool = NULL;
+	}
+	spin_unlock(&tfile->pool_lock);
+}
+
 static void __tun_detach(struct tun_file *tfile, bool clean)
 {
 	struct tun_file *ntfile;
@@ -648,6 +686,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 		u16 index = tfile->queue_index;
 		BUG_ON(index >= tun->numqueues);

+		ntfile = rtnl_dereference(tun->tfiles[tun->numqueues - 1]);
+		/* Stop xsk zc xmit */
+		tun_clean_xsk_pool(tfile);
+		tun_clean_xsk_pool(ntfile);
+
 		rcu_assign_pointer(tun->tfiles[index],
 				   tun->tfiles[tun->numqueues - 1]);
 		ntfile = rtnl_dereference(tun->tfiles[index]);
@@ -668,6 +711,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 		tun_flow_delete_by_queue(tun, tun->numqueues + 1);
 		/* Drop read queue */
 		tun_queue_purge(tfile);
+		tun_set_xsk_pool(ntfile, xsk_get_pool_from_qid(tun->dev, index));
 		tun_set_real_num_queues(tun);
 	} else if (tfile->detached && clean) {
 		tun = tun_enable_queue(tfile);
@@ -801,6 +845,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,

 		if (tfile->xdp_rxq.queue_index != tfile->queue_index)
 			tfile->xdp_rxq.queue_index = tfile->queue_index;
+		tun_set_xsk_pool(tfile, xsk_get_pool_from_qid(dev, tfile->queue_index));
 	} else {
 		/* Setup XDP RX-queue info, for new tfile getting attached */
 		err = xdp_rxq_info_reg(&tfile->xdp_rxq,
@@ -1221,11 +1266,50 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 	return 0;
 }

+static int tun_xsk_pool_enable(struct net_device *netdev,
+			       struct xsk_buff_pool *pool,
+			       u16 qid)
+{
+	struct tun_struct *tun = netdev_priv(netdev);
+	struct tun_file *tfile;
+
+	if (qid >= tun->numqueues)
+		return -EINVAL;
+
+	tfile = rtnl_dereference(tun->tfiles[qid]);
+	tun_set_xsk_pool(tfile, pool);
+
+	return 0;
+}
+
+static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid)
+{
+	struct tun_struct *tun = netdev_priv(netdev);
+	struct tun_file *tfile;
+
+	if (qid >= MAX_TAP_QUEUES)
+		return -EINVAL;
+
+	tfile = rtnl_dereference(tun->tfiles[qid]);
+	if (tfile)
+		tun_clean_xsk_pool(tfile);
+	return 0;
+}
+
+static int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
+			      u16 qid)
+{
+	return pool ? tun_xsk_pool_enable(dev, pool, qid) :
+		      tun_xsk_pool_disable(dev, qid);
+}
+
 static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
 	switch (xdp->command) {
 	case XDP_SETUP_PROG:
 		return tun_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_SETUP_XSK_POOL:
+		return tun_xsk_pool_setup(dev, xdp->xsk.pool, xdp->xsk.queue_id);
 	default:
 		return -EINVAL;
 	}
@@ -1330,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	return nxmit;
 }

+static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile;
+
+	rcu_read_lock();
+	tfile = rcu_dereference(tun->tfiles[qid]);
+	if (tfile)
+		__tun_xdp_flush_tfile(tfile);
+	rcu_read_unlock();
+	return 0;
+}
+
 static const struct net_device_ops tap_netdev_ops = {
 	.ndo_init		= tun_net_init,
 	.ndo_uninit		= tun_net_uninit,
@@ -1346,6 +1443,7 @@ static const struct net_device_ops tap_netdev_ops = {
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_bpf		= tun_xdp,
 	.ndo_xdp_xmit		= tun_xdp_xmit,
+	.ndo_xsk_wakeup		= tun_xsk_wakeup,
 	.ndo_change_carrier	= tun_net_change_carrier,
 };
@@ -1403,7 +1501,8 @@ static void tun_net_initialize(struct net_device *dev)
 		/* Currently tun does not support XDP, only tap does. */
 		dev->xdp_features = NETDEV_XDP_ACT_BASIC |
 				    NETDEV_XDP_ACT_REDIRECT |
-				    NETDEV_XDP_ACT_NDO_XMIT;
+				    NETDEV_XDP_ACT_NDO_XMIT |
+				    NETDEV_XDP_ACT_XSK_ZEROCOPY;
 		break;
 	}
@@ -2058,11 +2157,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)

 static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 				struct tun_file *tfile,
-				struct xdp_frame *xdp_frame,
+				void *addr,
+				size_t size,
 				struct iov_iter *iter)
 {
 	int vnet_hdr_sz = 0;
-	size_t size = xdp_frame->len;
 	size_t ret;

 	if (tun->flags & IFF_VNET_HDR) {
@@ -2077,7 +2176,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
 	}

-	ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
+	ret = copy_to_iter(addr, size, iter) + vnet_hdr_sz;

 	preempt_disable();
 	dev_sw_netstats_tx_add(tun->dev, 1, ret);
@@ -2240,8 +2339,20 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	if (tun_is_xdp_frame(ptr)) {
 		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

-		ret = tun_put_user_xdp(tun, tfile, xdpf, to);
+		ret = tun_put_user_xdp(tun, tfile, xdpf->data, xdpf->len, to);
 		xdp_return_frame(xdpf);
+	} else if (tun_is_xdp_desc_frame(ptr)) {
+		struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
+		void *data;
+
+		spin_lock(&tfile->pool_lock);
+		if (tfile->xsk_pool) {
+			data = xsk_buff_raw_get_data(tfile->xsk_pool, desc->addr);
+			ret = tun_put_user_xdp(tun, tfile, data, desc->len, to);
+		} else {
+			ret = 0;
+		}
+		spin_unlock(&tfile->pool_lock);
 	} else {
 		struct sk_buff *skb = ptr;
@@ -2654,6 +2765,10 @@ static int tun_ptr_peek_len(void *ptr)
 			struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

 			return xdpf->len;
+		} else if (tun_is_xdp_desc_frame(ptr)) {
+			struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
+
+			return desc->len;
 		}
 		return __skb_array_len_with_tag(ptr);
 	} else {
@@ -2661,6 +2776,54 @@
 	}
 }

+static void tun_peek_xsk(struct tun_file *tfile)
+{
+	struct xsk_buff_pool *pool;
+	u32 i, batch, budget;
+	void *frame;
+
+	if (!ptr_ring_empty(&tfile->tx_ring))
+		return;
+
+	spin_lock(&tfile->pool_lock);
+	pool = tfile->xsk_pool;
+	if (!pool) {
+		spin_unlock(&tfile->pool_lock);
+		return;
+	}
+
+	if (tfile->nb_descs) {
+		xsk_tx_completed(pool, tfile->nb_descs);
+		if (xsk_uses_need_wakeup(pool))
+			xsk_set_tx_need_wakeup(pool);
+	}
+
+	spin_lock(&tfile->tx_ring.producer_lock);
+	budget = min_t(u32, tfile->tx_ring.size, TUN_XDP_BATCH);
+
+	batch = xsk_tx_peek_release_desc_batch(pool, budget);
+	if (!batch) {
+		tfile->nb_descs = 0;
+		spin_unlock(&tfile->tx_ring.producer_lock);
+		spin_unlock(&tfile->pool_lock);
+		return;
+	}
+
+	tfile->nb_descs = batch;
+	for (i = 0; i < batch; i++) {
+		/* Encode the XDP DESC flag into the low bits so the consumer
+		 * can tell an XDP desc apart from an XDP buffer and an sk_buff.
+		 */
+		frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
+		/* The budget must be less than or equal to tx_ring.size,
+		 * so enqueuing will not fail.
+		 */
+		__ptr_ring_produce(&tfile->tx_ring, frame);
+	}
+	spin_unlock(&tfile->tx_ring.producer_lock);
+	spin_unlock(&tfile->pool_lock);
+}
+
 static int tun_peek_len(struct socket *sock)
 {
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
@@ -2671,6 +2834,9 @@ static int tun_peek_len(struct socket *sock)
 	if (!tun)
 		return 0;

+	if (sock_flag(&tfile->sk, SOCK_XDP))
+		tun_peek_xsk(tfile);
+
 	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
 	tun_put(tun);
@@ -3473,6 +3639,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	}

 	mutex_init(&tfile->napi_mutex);
+	spin_lock_init(&tfile->pool_lock);
 	RCU_INIT_POINTER(tfile->tun, NULL);
 	tfile->flags = 0;
 	tfile->ifindex = 0;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 077e74421558..eb83764be26c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -202,6 +202,10 @@ static int vhost_net_buf_peek_len(void *ptr)
 		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

 		return xdpf->len;
+	} else if (tun_is_xdp_desc_frame(ptr)) {
+		struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
+
+		return desc->len;
 	}

 	return __skb_array_len_with_tag(ptr);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 043d442994b0..4142453b5e52 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -6,10 +6,12 @@
 #ifndef __IF_TUN_H
 #define __IF_TUN_H

+#include <uapi/linux/if_xdp.h>
 #include <uapi/linux/if_tun.h>
 #include <uapi/linux/virtio_net.h>

 #define TUN_XDP_FLAG 0x1UL
+#define TUN_XDP_DESC_FLAG 0x2UL

 #define TUN_MSG_UBUF 1
 #define TUN_MSG_PTR  2
@@ -43,6 +45,21 @@ static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
 	return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
 }

+static inline bool tun_is_xdp_desc_frame(void *ptr)
+{
+	return (unsigned long)ptr & TUN_XDP_DESC_FLAG;
+}
+
+static inline void *tun_xdp_desc_to_ptr(struct xdp_desc *desc)
+{
+	return (void *)((unsigned long)desc | TUN_XDP_DESC_FLAG);
+}
+
+static inline struct xdp_desc *tun_ptr_to_xdp_desc(void *ptr)
+{
+	return (void *)((unsigned long)ptr & ~TUN_XDP_DESC_FLAG);
+}
+
 void tun_ptr_free(void *ptr);
 #else
 #include <linux/err.h>
@@ -75,6 +92,21 @@ static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
 	return NULL;
 }

+static inline bool tun_is_xdp_desc_frame(void *ptr)
+{
+	return false;
+}
+
+static inline void *tun_xdp_desc_to_ptr(struct xdp_desc *desc)
+{
+	return NULL;
+}
+
+static inline struct xdp_desc *tun_ptr_to_xdp_desc(void *ptr)
+{
+	return NULL;
+}
+
 static inline void tun_ptr_free(void *ptr)
 {
 }
-- 
2.41.0