On Fri, Jan 06, 2017 at 10:13:17AM +0800, Jason Wang wrote: > We can only process 1 packet at a time during sendmsg(). This often > leads to bad cache utilization under heavy load. So this patch tries to do > some batching during rx before submitting the packets to the host network > stack. This is done by accepting MSG_MORE as a hint from the > sendmsg() caller: if it is set, the packet is batched temporarily in a > linked list, and all batched packets are submitted once MSG_MORE is cleared. > > Tests were done with pktgen (burst=128) in guest over mlx4(noqueue) on host: > > Mpps -+% > rx-frames = 0 0.91 +0% > rx-frames = 4 1.00 +9.8% > rx-frames = 8 1.00 +9.8% > rx-frames = 16 1.01 +10.9% > rx-frames = 32 1.07 +17.5% > rx-frames = 48 1.07 +17.5% > rx-frames = 64 1.08 +18.6% > rx-frames = 64 (no MSG_MORE) 0.91 +0% > > Users are allowed to change the per-device number of batched packets through > ethtool -C rx-frames. NAPI_POLL_WEIGHT is used as the upper limit > to prevent bh from being disabled for too long. > > Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx> > --- > drivers/net/tun.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++----- > 1 file changed, 70 insertions(+), 6 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index cd8e02c..6c93926 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -218,6 +218,7 @@ struct tun_struct { > struct list_head disabled; > void *security; > u32 flow_count; > + u32 rx_batched; > struct tun_pcpu_stats __percpu *pcpu_stats; > }; > > @@ -522,6 +523,7 @@ static void tun_queue_purge(struct tun_file *tfile) > while ((skb = skb_array_consume(&tfile->tx_array)) != NULL) > kfree_skb(skb); > > + skb_queue_purge(&tfile->sk.sk_write_queue); > skb_queue_purge(&tfile->sk.sk_error_queue); > } > > @@ -1140,10 +1142,45 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, > return skb; > } > > +static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile, > + struct sk_buff *skb, int more) > +{ > + struct sk_buff_head *queue = 
&tfile->sk.sk_write_queue; > + struct sk_buff_head process_queue; > + u32 rx_batched = tun->rx_batched; > + bool rcv = false; > + > + if (!rx_batched || (!more && skb_queue_empty(queue))) { > + local_bh_disable(); > + netif_receive_skb(skb); > + local_bh_enable(); > + return; > + } > + > + spin_lock(&queue->lock); > + if (!more || skb_queue_len(queue) == rx_batched) { > + __skb_queue_head_init(&process_queue); > + skb_queue_splice_tail_init(queue, &process_queue); > + rcv = true; > + } else { > + __skb_queue_tail(queue, skb); > + } > + spin_unlock(&queue->lock); > + > + if (rcv) { > + struct sk_buff *nskb; > + local_bh_disable(); > + while ((nskb = __skb_dequeue(&process_queue))) > + netif_receive_skb(nskb); > + netif_receive_skb(skb); > + local_bh_enable(); > + } > +} > + > /* Get packet from user space buffer */ > static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, > void *msg_control, struct iov_iter *from, > - int noblock) > + int noblock, bool more) > { > struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; > struct sk_buff *skb; > @@ -1283,10 +1320,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, > skb_probe_transport_header(skb, 0); > > rxhash = skb_get_hash(skb); > + > #ifndef CONFIG_4KSTACKS > - local_bh_disable(); > - netif_receive_skb(skb); > - local_bh_enable(); > + tun_rx_batched(tun, tfile, skb, more); > #else > netif_rx_ni(skb); > #endif > @@ -1312,7 +1348,8 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) > if (!tun) > return -EBADFD; > > - result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK); > + result = tun_get_user(tun, tfile, NULL, from, > + file->f_flags & O_NONBLOCK, false); > > tun_put(tun); > return result; > @@ -1570,7 +1607,8 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) > return -EBADFD; > > ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter, > - m->msg_flags & MSG_DONTWAIT); > + 
m->msg_flags & MSG_DONTWAIT, > + m->msg_flags & MSG_MORE); > tun_put(tun); > return ret; > } > @@ -1771,6 +1809,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) > tun->align = NET_SKB_PAD; > tun->filter_attached = false; > tun->sndbuf = tfile->socket.sk->sk_sndbuf; > + tun->rx_batched = 0; > > tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); > if (!tun->pcpu_stats) { > @@ -2439,6 +2478,29 @@ static void tun_set_msglevel(struct net_device *dev, u32 value) > #endif > } > > +static int tun_get_coalesce(struct net_device *dev, > + struct ethtool_coalesce *ec) > +{ > + struct tun_struct *tun = netdev_priv(dev); > + > + ec->rx_max_coalesced_frames = tun->rx_batched; > + > + return 0; > +} > + > +static int tun_set_coalesce(struct net_device *dev, > + struct ethtool_coalesce *ec) > +{ > + struct tun_struct *tun = netdev_priv(dev); > + > + if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT) > + return -EINVAL; So what should userspace do when this fails? Keep retrying until it succeeds? I think it's better to just clamp the value to NAPI_POLL_WEIGHT instead of returning -EINVAL, and do the right thing (DTRT) here. > + > + tun->rx_batched = ec->rx_max_coalesced_frames; > + > + return 0; > +} > + > static const struct ethtool_ops tun_ethtool_ops = { > .get_settings = tun_get_settings, > .get_drvinfo = tun_get_drvinfo, > @@ -2446,6 +2508,8 @@ static const struct ethtool_ops tun_ethtool_ops = { > .set_msglevel = tun_set_msglevel, > .get_link = ethtool_op_get_link, > .get_ts_info = ethtool_op_get_ts_info, > + .get_coalesce = tun_get_coalesce, > + .set_coalesce = tun_set_coalesce, > }; > > static int tun_queue_resize(struct tun_struct *tun) > -- > 2.7.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html