Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available

"Michael S. Tsirkin" <mst@xxxxxxxxxx> · Thu, 21 Dec 2017 02:14:46 +0200

On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:
> This patch enables virtio to switch over to a VF datapath when a VF netdev
> is present with the same MAC address.

I would prefer to say "a passthrough device" here; it does not have to be
a VF at all.

>  It allows live migration of a VM
> with a direct attached VF without the need to setup a bond/team between a
> VF and virtio net device in the guest.
> 
> The hypervisor needs to unplug the VF device from the guest on the source
> host and reset the MAC filter of the VF to initiate failover of datapath to
> virtio before starting the migration. After the migration is completed, the
> destination hypervisor sets the MAC filter on the VF and plugs it back to
> the guest to switch over to VF datapath.
> 
> It is entirely based on netvsc implementation and it should be possible to
> make this code generic and move it to a common location that can be shared
> by netvsc and virtio.
> 
> Also, i think we should make this a negotiated feature that is off by
> default via a new feature bit.

So please include this (the new feature bit) in the patch. A copy needs to
go to the virtio TC to reserve the bit. Enabling this by default risks
breaking too many configurations.

> 
> This patch is based on the discussion initiated by Jesse on this thread.
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
> 
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@xxxxxxxxx>
> Reviewed-by: Jesse Brandeburg <jesse.brandeburg@xxxxxxxxx>
> ---
>  drivers/net/virtio_net.c | 341 ++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 339 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 559b215c0169..a34c717bb15b 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -31,6 +31,8 @@
>  #include <linux/average.h>
>  #include <linux/filter.h>
>  #include <net/route.h>
> +#include <linux/netdevice.h>
> +#include <linux/netpoll.h>
>  
>  static int napi_weight = NAPI_POLL_WEIGHT;
>  module_param(napi_weight, int, 0444);
> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
>   */
>  DECLARE_EWMA(pkt_len, 0, 64)
>  
> +#define VF_TAKEOVER_INT	(HZ / 10)
> +
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>  
>  static const unsigned long guest_offloads[] = {
> @@ -117,6 +121,15 @@ struct receive_queue {
>  	char name[40];
>  };
>  
> +struct virtnet_vf_pcpu_stats {
> +	u64	rx_packets;
> +	u64	rx_bytes;
> +	u64	tx_packets;
> +	u64	tx_bytes;
> +	struct u64_stats_sync   syncp;
> +	u32	tx_dropped;
> +};
> +
>  struct virtnet_info {
>  	struct virtio_device *vdev;
>  	struct virtqueue *cvq;
> @@ -179,6 +192,11 @@ struct virtnet_info {
>  	u32 speed;
>  
>  	unsigned long guest_offloads;
> +
> +	/* State to manage the associated VF interface. */
> +	struct net_device __rcu *vf_netdev;
> +	struct virtnet_vf_pcpu_stats __percpu *vf_stats;
> +	struct delayed_work vf_takeover;
>  };
>  
>  struct padded_vnet_hdr {
> @@ -1300,16 +1318,51 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
>  	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
>  }
>  
> +/* Send skb on the slave VF device. */
> +static int virtnet_vf_xmit(struct net_device *dev, struct net_device *vf_netdev,
> +			   struct sk_buff *skb)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	unsigned int len = skb->len;
> +	int rc;
> +
> +	skb->dev = vf_netdev;
> +	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
> +
> +	rc = dev_queue_xmit(skb);
> +	if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
> +		struct virtnet_vf_pcpu_stats *pcpu_stats
> +			= this_cpu_ptr(vi->vf_stats);
> +
> +		u64_stats_update_begin(&pcpu_stats->syncp);
> +		pcpu_stats->tx_packets++;
> +		pcpu_stats->tx_bytes += len;
> +		u64_stats_update_end(&pcpu_stats->syncp);
> +	} else {
> +		this_cpu_inc(vi->vf_stats->tx_dropped);
> +	}
> +
> +	return rc;
> +}
> +
>  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
>  	int qnum = skb_get_queue_mapping(skb);
>  	struct send_queue *sq = &vi->sq[qnum];
> +	struct net_device *vf_netdev;
>  	int err;
>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>  	bool kick = !skb->xmit_more;
>  	bool use_napi = sq->napi.weight;
>  
> +	/* if VF is present and up then redirect packets
> +	 * called with rcu_read_lock_bh
> +	 */
> +	vf_netdev = rcu_dereference_bh(vi->vf_netdev);
> +	if (vf_netdev && netif_running(vf_netdev) && !netpoll_tx_running(dev))
> +		return virtnet_vf_xmit(dev, vf_netdev, skb);
> +
>  	/* Free up any pending old buffers before queueing new ones. */
>  	free_old_xmit_skbs(sq);
>  
> @@ -1456,10 +1509,41 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
>  	return ret;
>  }
>  
> +static void virtnet_get_vf_stats(struct net_device *dev,
> +				 struct virtnet_vf_pcpu_stats *tot)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	int i;
> +
> +	memset(tot, 0, sizeof(*tot));
> +
> +	for_each_possible_cpu(i) {
> +		const struct virtnet_vf_pcpu_stats *stats
> +				= per_cpu_ptr(vi->vf_stats, i);
> +		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
> +		unsigned int start;
> +
> +		do {
> +			start = u64_stats_fetch_begin_irq(&stats->syncp);
> +			rx_packets = stats->rx_packets;
> +			tx_packets = stats->tx_packets;
> +			rx_bytes = stats->rx_bytes;
> +			tx_bytes = stats->tx_bytes;
> +		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
> +
> +		tot->rx_packets += rx_packets;
> +		tot->tx_packets += tx_packets;
> +		tot->rx_bytes   += rx_bytes;
> +		tot->tx_bytes   += tx_bytes;
> +		tot->tx_dropped += stats->tx_dropped;
> +	}
> +}
> +
>  static void virtnet_stats(struct net_device *dev,
>  			  struct rtnl_link_stats64 *tot)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
> +	struct virtnet_vf_pcpu_stats vf_stats;
>  	int cpu;
>  	unsigned int start;
>  
> @@ -1490,6 +1574,13 @@ static void virtnet_stats(struct net_device *dev,
>  	tot->rx_dropped = dev->stats.rx_dropped;
>  	tot->rx_length_errors = dev->stats.rx_length_errors;
>  	tot->rx_frame_errors = dev->stats.rx_frame_errors;
> +
> +	virtnet_get_vf_stats(dev, &vf_stats);
> +	tot->rx_packets += vf_stats.rx_packets;
> +	tot->tx_packets += vf_stats.tx_packets;
> +	tot->rx_bytes += vf_stats.rx_bytes;
> +	tot->tx_bytes += vf_stats.tx_bytes;
> +	tot->tx_dropped += vf_stats.tx_dropped;
>  }
>  
>  #ifdef CONFIG_NET_POLL_CONTROLLER
> @@ -2508,6 +2599,47 @@ static int virtnet_validate(struct virtio_device *vdev)
>  	return 0;
>  }
>  
> +static void __virtnet_vf_setup(struct net_device *ndev,
> +			       struct net_device *vf_netdev)
> +{
> +	int ret;
> +
> +	/* Align MTU of VF with master */
> +	ret = dev_set_mtu(vf_netdev, ndev->mtu);
> +	if (ret)
> +		netdev_warn(vf_netdev,
> +			    "unable to change mtu to %u\n", ndev->mtu);
> +
> +	if (netif_running(ndev)) {
> +		ret = dev_open(vf_netdev);
> +		if (ret)
> +			netdev_warn(vf_netdev,
> +				    "unable to open: %d\n", ret);
> +	}
> +}
> +
> +/* Setup VF as slave of the virtio device.
> + * Runs in workqueue to avoid recursion in netlink callbacks.
> + */
> +static void virtnet_vf_setup(struct work_struct *w)
> +{
> +	struct virtnet_info *vi
> +		= container_of(w, struct virtnet_info, vf_takeover.work);
> +	struct net_device *ndev = vi->dev;
> +	struct net_device *vf_netdev;
> +
> +	if (!rtnl_trylock()) {
> +		schedule_delayed_work(&vi->vf_takeover, 0);
> +		return;
> +	}
> +
> +	vf_netdev = rtnl_dereference(vi->vf_netdev);
> +	if (vf_netdev)
> +		__virtnet_vf_setup(ndev, vf_netdev);
> +
> +	rtnl_unlock();
> +}
> +
>  static int virtnet_probe(struct virtio_device *vdev)
>  {
>  	int i, err;
> @@ -2600,6 +2732,11 @@ static int virtnet_probe(struct virtio_device *vdev)
>  	}
>  
>  	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
> +	INIT_DELAYED_WORK(&vi->vf_takeover, virtnet_vf_setup);
> +
> +	vi->vf_stats = netdev_alloc_pcpu_stats(struct virtnet_vf_pcpu_stats);
> +	if (!vi->vf_stats)
> +		goto free_stats;
>  
>  	/* If we can receive ANY GSO packets, we must allocate large ones. */
>  	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> @@ -2634,7 +2771,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>  			 */
>  			dev_err(&vdev->dev, "device MTU appears to have changed "
>  				"it is now %d < %d", mtu, dev->min_mtu);
> -			goto free_stats;
> +			goto free_vf_stats;
>  		}
>  
>  		dev->mtu = mtu;
> @@ -2658,7 +2795,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>  	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
>  	err = init_vqs(vi);
>  	if (err)
> -		goto free_stats;
> +		goto free_vf_stats;
>  
>  #ifdef CONFIG_SYSFS
>  	if (vi->mergeable_rx_bufs)
> @@ -2712,6 +2849,8 @@ static int virtnet_probe(struct virtio_device *vdev)
>  	cancel_delayed_work_sync(&vi->refill);
>  	free_receive_page_frags(vi);
>  	virtnet_del_vqs(vi);
> +free_vf_stats:
> +	free_percpu(vi->vf_stats);
>  free_stats:
>  	free_percpu(vi->stats);
>  free:
> @@ -2733,19 +2872,178 @@ static void remove_vq_common(struct virtnet_info *vi)
>  	virtnet_del_vqs(vi);
>  }
>  
> +static struct net_device *get_virtio_bymac(const u8 *mac)
> +{
> +	struct net_device *dev;
> +
> +	ASSERT_RTNL();
> +
> +	for_each_netdev(&init_net, dev) {
> +		if (dev->netdev_ops != &virtnet_netdev)
> +			continue;       /* not a virtio_net device */
> +
> +		if (ether_addr_equal(mac, dev->perm_addr))
> +			return dev;
> +	}
> +
> +	return NULL;
> +}
> +
> +static struct net_device *get_virtio_byref(struct net_device *vf_netdev)
> +{
> +	struct net_device *dev;
> +
> +	ASSERT_RTNL();
> +
> +	for_each_netdev(&init_net, dev) {
> +		struct virtnet_info *vi;
> +
> +		if (dev->netdev_ops != &virtnet_netdev)
> +			continue;	/* not a virtio_net device */
> +
> +		vi = netdev_priv(dev);
> +		if (rtnl_dereference(vi->vf_netdev) == vf_netdev)
> +			return dev;	/* a match */
> +	}
> +
> +	return NULL;
> +}
> +
> +/* Called when VF is injecting data into network stack.
> + * Change the associated network device from VF to virtio.
> + * note: already called with rcu_read_lock
> + */
> +static rx_handler_result_t virtnet_vf_handle_frame(struct sk_buff **pskb)
> +{
> +	struct sk_buff *skb = *pskb;
> +	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
> +	struct virtnet_info *vi = netdev_priv(ndev);
> +	struct virtnet_vf_pcpu_stats *pcpu_stats =
> +				this_cpu_ptr(vi->vf_stats);
> +
> +	skb->dev = ndev;
> +
> +	u64_stats_update_begin(&pcpu_stats->syncp);
> +	pcpu_stats->rx_packets++;
> +	pcpu_stats->rx_bytes += skb->len;
> +	u64_stats_update_end(&pcpu_stats->syncp);
> +
> +	return RX_HANDLER_ANOTHER;
> +}
> +
> +static int virtnet_vf_join(struct net_device *vf_netdev,
> +			   struct net_device *ndev)
> +{
> +	struct virtnet_info *vi = netdev_priv(ndev);
> +	int ret;
> +
> +	ret = netdev_rx_handler_register(vf_netdev,
> +					 virtnet_vf_handle_frame, ndev);
> +	if (ret != 0) {
> +		netdev_err(vf_netdev,
> +			   "can not register virtio VF receive handler (err = %d)\n",
> +			   ret);
> +		goto rx_handler_failed;
> +	}
> +
> +	ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
> +	if (ret != 0) {
> +		netdev_err(vf_netdev,
> +			   "can not set master device %s (err = %d)\n",
> +			   ndev->name, ret);
> +		goto upper_link_failed;
> +	}
> +
> +	/* set slave flag before open to prevent IPv6 addrconf */
> +	vf_netdev->flags |= IFF_SLAVE;
> +
> +	schedule_delayed_work(&vi->vf_takeover, VF_TAKEOVER_INT);
> +
> +	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
> +
> +	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
> +	return 0;
> +
> +upper_link_failed:
> +	netdev_rx_handler_unregister(vf_netdev);
> +rx_handler_failed:
> +	return ret;
> +}
> +
> +static int virtnet_register_vf(struct net_device *vf_netdev)
> +{
> +	struct net_device *ndev;
> +	struct virtnet_info *vi;
> +
> +	if (vf_netdev->addr_len != ETH_ALEN)
> +		return NOTIFY_DONE;
> +
> +	/* We will use the MAC address to locate the virtio_net interface to
> +	 * associate with the VF interface. If we don't find a matching
> +	 * virtio interface, move on.
> +	 */
> +	ndev = get_virtio_bymac(vf_netdev->perm_addr);
> +	if (!ndev)
> +		return NOTIFY_DONE;
> +
> +	vi = netdev_priv(ndev);
> +	if (rtnl_dereference(vi->vf_netdev))
> +		return NOTIFY_DONE;
> +
> +	if (virtnet_vf_join(vf_netdev, ndev) != 0)
> +		return NOTIFY_DONE;
> +
> +	netdev_info(ndev, "VF registering %s\n", vf_netdev->name);
> +
> +	dev_hold(vf_netdev);
> +	rcu_assign_pointer(vi->vf_netdev, vf_netdev);
> +
> +	return NOTIFY_OK;
> +}
> +
> +static int virtnet_unregister_vf(struct net_device *vf_netdev)
> +{
> +	struct net_device *ndev;
> +	struct virtnet_info *vi;
> +
> +	ndev = get_virtio_byref(vf_netdev);
> +	if (!ndev)
> +		return NOTIFY_DONE;
> +
> +	vi = netdev_priv(ndev);
> +	cancel_delayed_work_sync(&vi->vf_takeover);
> +
> +	netdev_info(ndev, "VF unregistering %s\n", vf_netdev->name);
> +
> +	netdev_rx_handler_unregister(vf_netdev);
> +	netdev_upper_dev_unlink(vf_netdev, ndev);
> +	RCU_INIT_POINTER(vi->vf_netdev, NULL);
> +	dev_put(vf_netdev);
> +
> +	return NOTIFY_OK;
> +}
> +
>  static void virtnet_remove(struct virtio_device *vdev)
>  {
>  	struct virtnet_info *vi = vdev->priv;
> +	struct net_device *vf_netdev;
>  
>  	virtnet_cpu_notif_remove(vi);
>  
>  	/* Make sure no work handler is accessing the device. */
>  	flush_work(&vi->config_work);
>  
> +	rtnl_lock();
> +	vf_netdev = rtnl_dereference(vi->vf_netdev);
> +	if (vf_netdev)
> +		virtnet_unregister_vf(vf_netdev);
> +	rtnl_unlock();
> +
>  	unregister_netdev(vi->dev);
>  
>  	remove_vq_common(vi);
>  
> +	free_percpu(vi->vf_stats);
>  	free_percpu(vi->stats);
>  	free_netdev(vi->dev);
>  }
> @@ -2823,6 +3121,42 @@ static struct virtio_driver virtio_net_driver = {
>  #endif
>  };
>  
> +static int virtio_netdev_event(struct notifier_block *this,
> +			       unsigned long event, void *ptr)
> +{
> +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> +
> +	/* Skip our own events */
> +	if (event_dev->netdev_ops == &virtnet_netdev)
> +		return NOTIFY_DONE;
> +
> +	/* Avoid non-Ethernet type devices */
> +	if (event_dev->type != ARPHRD_ETHER)
> +		return NOTIFY_DONE;
> +
> +	/* Avoid Vlan dev with same MAC registering as VF */
> +	if (is_vlan_dev(event_dev))
> +		return NOTIFY_DONE;
> +
> +	/* Avoid Bonding master dev with same MAC registering as VF */
> +	if ((event_dev->priv_flags & IFF_BONDING) &&
> +	    (event_dev->flags & IFF_MASTER))
> +		return NOTIFY_DONE;
> +
> +	switch (event) {
> +	case NETDEV_REGISTER:
> +		return virtnet_register_vf(event_dev);
> +	case NETDEV_UNREGISTER:
> +		return virtnet_unregister_vf(event_dev);
> +	default:
> +		return NOTIFY_DONE;
> +	}
> +}
> +
> +static struct notifier_block virtio_netdev_notifier = {
> +	.notifier_call = virtio_netdev_event,
> +};
> +
>  static __init int virtio_net_driver_init(void)
>  {
>  	int ret;
> @@ -2841,6 +3175,8 @@ static __init int virtio_net_driver_init(void)
>          ret = register_virtio_driver(&virtio_net_driver);
>  	if (ret)
>  		goto err_virtio;
> +
> +	register_netdevice_notifier(&virtio_netdev_notifier);
>  	return 0;
>  err_virtio:
>  	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
> @@ -2853,6 +3189,7 @@ module_init(virtio_net_driver_init);
>  
>  static __exit void virtio_net_driver_exit(void)
>  {
> +	unregister_netdevice_notifier(&virtio_netdev_notifier);
>  	unregister_virtio_driver(&virtio_net_driver);
>  	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
>  	cpuhp_remove_multi_state(virtionet_online);
> -- 
> 2.14.3
_______________________________________________
Virtualization mailing list
Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/virtualization