On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote: > This patch enables virtio to switch over to a VF datapath when a VF netdev > is present with the same MAC address. I prefer saying "a passthrough device" here. Does not have to be a VF at all. > It allows live migration of a VM > with a direct attached VF without the need to setup a bond/team between a > VF and virtio net device in the guest. > > The hypervisor needs to unplug the VF device from the guest on the source > host and reset the MAC filter of the VF to initiate failover of datapath to > virtio before starting the migration. After the migration is completed, the > destination hypervisor sets the MAC filter on the VF and plugs it back to > the guest to switch over to VF datapath. > > It is entirely based on netvsc implementation and it should be possible to > make this code generic and move it to a common location that can be shared > by netvsc and virtio. > > Also, i think we should make this a negotiated feature that is off by > default via a new feature bit. So please include this. A copy needs to go to virtio TC to reserve the bit. Enabling this by default risks breaking too many configurations. > > This patch is based on the discussion initiated by Jesse on this thread. > https://marc.info/?l=linux-virtualization&m=151189725224231&w=2 > > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@xxxxxxxxx> > Reviewed-by: Jesse Brandeburg <jesse.brandeburg@xxxxxxxxx> > --- > drivers/net/virtio_net.c | 341 ++++++++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 339 insertions(+), 2 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 559b215c0169..a34c717bb15b 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -31,6 +31,8 @@ > #include <linux/average.h> > #include <linux/filter.h> > #include <net/route.h> > +#include <linux/netdevice.h> > +#include <linux/netpoll.h> > > static int napi_weight = NAPI_POLL_WEIGHT; > module_param(napi_weight, int, 0444); > @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644); > */ > DECLARE_EWMA(pkt_len, 0, 64) > > +#define VF_TAKEOVER_INT (HZ / 10) > + > #define VIRTNET_DRIVER_VERSION "1.0.0" > > static const unsigned long guest_offloads[] = { > @@ -117,6 +121,15 @@ struct receive_queue { > char name[40]; > }; > > +struct virtnet_vf_pcpu_stats { > + u64 rx_packets; > + u64 rx_bytes; > + u64 tx_packets; > + u64 tx_bytes; > + struct u64_stats_sync syncp; > + u32 tx_dropped; > +}; > + > struct virtnet_info { > struct virtio_device *vdev; > struct virtqueue *cvq; > @@ -179,6 +192,11 @@ struct virtnet_info { > u32 speed; > > unsigned long guest_offloads; > + > + /* State to manage the associated VF interface. */ > + struct net_device __rcu *vf_netdev; > + struct virtnet_vf_pcpu_stats __percpu *vf_stats; > + struct delayed_work vf_takeover; > }; > > struct padded_vnet_hdr { > @@ -1300,16 +1318,51 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) > return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC); > } > > +/* Send skb on the slave VF device. */ > +static int virtnet_vf_xmit(struct net_device *dev, struct net_device *vf_netdev, > + struct sk_buff *skb) > +{ > + struct virtnet_info *vi = netdev_priv(dev); > + unsigned int len = skb->len; > + int rc; > + > + skb->dev = vf_netdev; > + skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping; > + > + rc = dev_queue_xmit(skb); > + if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) { > + struct virtnet_vf_pcpu_stats *pcpu_stats > + = this_cpu_ptr(vi->vf_stats); > + > + u64_stats_update_begin(&pcpu_stats->syncp); > + pcpu_stats->tx_packets++; > + pcpu_stats->tx_bytes += len; > + u64_stats_update_end(&pcpu_stats->syncp); > + } else { > + this_cpu_inc(vi->vf_stats->tx_dropped); > + } > + > + return rc; > +} > + > static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > { > struct virtnet_info *vi = netdev_priv(dev); > int qnum = skb_get_queue_mapping(skb); > struct send_queue *sq = &vi->sq[qnum]; > + struct net_device *vf_netdev; > int err; > struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); > bool kick = !skb->xmit_more; > bool use_napi = sq->napi.weight; > > + /* if VF is present and up then redirect packets > + * called with rcu_read_lock_bh > + */ > + vf_netdev = rcu_dereference_bh(vi->vf_netdev); > + if (vf_netdev && netif_running(vf_netdev) && !netpoll_tx_running(dev)) > + return virtnet_vf_xmit(dev, vf_netdev, skb); > + > /* Free up any pending old buffers before queueing new ones. */ > free_old_xmit_skbs(sq); > > @@ -1456,10 +1509,41 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) > return ret; > } > > +static void virtnet_get_vf_stats(struct net_device *dev, > + struct virtnet_vf_pcpu_stats *tot) > +{ > + struct virtnet_info *vi = netdev_priv(dev); > + int i; > + > + memset(tot, 0, sizeof(*tot)); > + > + for_each_possible_cpu(i) { > + const struct virtnet_vf_pcpu_stats *stats > + = per_cpu_ptr(vi->vf_stats, i); > + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; > + unsigned int start; > + > + do { > + start = u64_stats_fetch_begin_irq(&stats->syncp); > + rx_packets = stats->rx_packets; > + tx_packets = stats->tx_packets; > + rx_bytes = stats->rx_bytes; > + tx_bytes = stats->tx_bytes; > + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); > + > + tot->rx_packets += rx_packets; > + tot->tx_packets += tx_packets; > + tot->rx_bytes += rx_bytes; > + tot->tx_bytes += tx_bytes; > + tot->tx_dropped += stats->tx_dropped; > + } > +} > + > static void virtnet_stats(struct net_device *dev, > struct rtnl_link_stats64 *tot) > { > struct virtnet_info *vi = netdev_priv(dev); > + struct virtnet_vf_pcpu_stats vf_stats; > int cpu; > unsigned int start; > > @@ -1490,6 +1574,13 @@ static void virtnet_stats(struct net_device *dev, > tot->rx_dropped = dev->stats.rx_dropped; > tot->rx_length_errors = dev->stats.rx_length_errors; > tot->rx_frame_errors = dev->stats.rx_frame_errors; > + > + virtnet_get_vf_stats(dev, &vf_stats); > + tot->rx_packets += vf_stats.rx_packets; > + tot->tx_packets += vf_stats.tx_packets; > + tot->rx_bytes += vf_stats.rx_bytes; > + tot->tx_bytes += vf_stats.tx_bytes; > + tot->tx_dropped += vf_stats.tx_dropped; > } > > #ifdef CONFIG_NET_POLL_CONTROLLER > @@ -2508,6 +2599,47 @@ static int virtnet_validate(struct virtio_device *vdev) > return 0; > } > > +static void __virtnet_vf_setup(struct net_device *ndev, > + struct net_device *vf_netdev) > +{ > + int ret; > + > + /* Align MTU of VF with master */ > + ret = dev_set_mtu(vf_netdev, ndev->mtu); > + if (ret) > + netdev_warn(vf_netdev, > + "unable to change mtu to %u\n", ndev->mtu); > + > + if (netif_running(ndev)) { > + ret = dev_open(vf_netdev); > + if (ret) > + netdev_warn(vf_netdev, > + "unable to open: %d\n", ret); > + } > +} > + > +/* Setup VF as slave of the virtio device. > + * Runs in workqueue to avoid recursion in netlink callbacks. > + */ > +static void virtnet_vf_setup(struct work_struct *w) > +{ > + struct virtnet_info *vi > + = container_of(w, struct virtnet_info, vf_takeover.work); > + struct net_device *ndev = vi->dev; > + struct net_device *vf_netdev; > + > + if (!rtnl_trylock()) { > + schedule_delayed_work(&vi->vf_takeover, 0); > + return; > + } > + > + vf_netdev = rtnl_dereference(vi->vf_netdev); > + if (vf_netdev) > + __virtnet_vf_setup(ndev, vf_netdev); > + > + rtnl_unlock(); > +} > + > static int virtnet_probe(struct virtio_device *vdev) > { > int i, err; > @@ -2600,6 +2732,11 @@ static int virtnet_probe(struct virtio_device *vdev) > } > > INIT_WORK(&vi->config_work, virtnet_config_changed_work); > + INIT_DELAYED_WORK(&vi->vf_takeover, virtnet_vf_setup); > + > + vi->vf_stats = netdev_alloc_pcpu_stats(struct virtnet_vf_pcpu_stats); > + if (!vi->vf_stats) > + goto free_stats; > > /* If we can receive ANY GSO packets, we must allocate large ones. */ > if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || > @@ -2634,7 +2771,7 @@ static int virtnet_probe(struct virtio_device *vdev) > */ > dev_err(&vdev->dev, "device MTU appears to have changed " > "it is now %d < %d", mtu, dev->min_mtu); > - goto free_stats; > + goto free_vf_stats; > } > > dev->mtu = mtu; > @@ -2658,7 +2795,7 @@ static int virtnet_probe(struct virtio_device *vdev) > /* Allocate/initialize the rx/tx queues, and invoke find_vqs */ > err = init_vqs(vi); > if (err) > - goto free_stats; > + goto free_vf_stats; > > #ifdef CONFIG_SYSFS > if (vi->mergeable_rx_bufs) > @@ -2712,6 +2849,8 @@ static int virtnet_probe(struct virtio_device *vdev) > cancel_delayed_work_sync(&vi->refill); > free_receive_page_frags(vi); > virtnet_del_vqs(vi); > +free_vf_stats: > + free_percpu(vi->vf_stats); > free_stats: > free_percpu(vi->stats); > free: > @@ -2733,19 +2872,178 @@ static void remove_vq_common(struct virtnet_info *vi) > virtnet_del_vqs(vi); > } > > +static struct net_device *get_virtio_bymac(const u8 *mac) > +{ > + struct net_device *dev; > + > + ASSERT_RTNL(); > + > + for_each_netdev(&init_net, dev) { > + if (dev->netdev_ops != &virtnet_netdev) > + continue; /* not a virtio_net device */ > + > + if (ether_addr_equal(mac, dev->perm_addr)) > + return dev; > + } > + > + return NULL; > +} > + > +static struct net_device *get_virtio_byref(struct net_device *vf_netdev) > +{ > + struct net_device *dev; > + > + ASSERT_RTNL(); > + > + for_each_netdev(&init_net, dev) { > + struct virtnet_info *vi; > + > + if (dev->netdev_ops != &virtnet_netdev) > + continue; /* not a virtio_net device */ > + > + vi = netdev_priv(dev); > + if (rtnl_dereference(vi->vf_netdev) == vf_netdev) > + return dev; /* a match */ > + } > + > + return NULL; > +} > + > +/* Called when VF is injecting data into network stack. > + * Change the associated network device from VF to virtio. > + * note: already called with rcu_read_lock > + */ > +static rx_handler_result_t virtnet_vf_handle_frame(struct sk_buff **pskb) > +{ > + struct sk_buff *skb = *pskb; > + struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data); > + struct virtnet_info *vi = netdev_priv(ndev); > + struct virtnet_vf_pcpu_stats *pcpu_stats = > + this_cpu_ptr(vi->vf_stats); > + > + skb->dev = ndev; > + > + u64_stats_update_begin(&pcpu_stats->syncp); > + pcpu_stats->rx_packets++; > + pcpu_stats->rx_bytes += skb->len; > + u64_stats_update_end(&pcpu_stats->syncp); > + > + return RX_HANDLER_ANOTHER; > +} > + > +static int virtnet_vf_join(struct net_device *vf_netdev, > + struct net_device *ndev) > +{ > + struct virtnet_info *vi = netdev_priv(ndev); > + int ret; > + > + ret = netdev_rx_handler_register(vf_netdev, > + virtnet_vf_handle_frame, ndev); > + if (ret != 0) { > + netdev_err(vf_netdev, > + "can not register virtio VF receive handler (err = %d)\n", > + ret); > + goto rx_handler_failed; > + } > + > + ret = netdev_upper_dev_link(vf_netdev, ndev, NULL); > + if (ret != 0) { > + netdev_err(vf_netdev, > + "can not set master device %s (err = %d)\n", > + ndev->name, ret); > + goto upper_link_failed; > + } > + > + /* set slave flag before open to prevent IPv6 addrconf */ > + vf_netdev->flags |= IFF_SLAVE; > + > + schedule_delayed_work(&vi->vf_takeover, VF_TAKEOVER_INT); > + > + call_netdevice_notifiers(NETDEV_JOIN, vf_netdev); > + > + netdev_info(vf_netdev, "joined to %s\n", ndev->name); > + return 0; > + > +upper_link_failed: > + netdev_rx_handler_unregister(vf_netdev); > +rx_handler_failed: > + return ret; > +} > + > +static int virtnet_register_vf(struct net_device *vf_netdev) > +{ > + struct net_device *ndev; > + struct virtnet_info *vi; > + > + if (vf_netdev->addr_len != ETH_ALEN) > + return NOTIFY_DONE; > + > + /* We will use the MAC address to locate the virtio_net interface to > + * associate with the VF interface. If we don't find a matching > + * virtio interface, move on. > + */ > + ndev = get_virtio_bymac(vf_netdev->perm_addr); > + if (!ndev) > + return NOTIFY_DONE; > + > + vi = netdev_priv(ndev); > + if (rtnl_dereference(vi->vf_netdev)) > + return NOTIFY_DONE; > + > + if (virtnet_vf_join(vf_netdev, ndev) != 0) > + return NOTIFY_DONE; > + > + netdev_info(ndev, "VF registering %s\n", vf_netdev->name); > + > + dev_hold(vf_netdev); > + rcu_assign_pointer(vi->vf_netdev, vf_netdev); > + > + return NOTIFY_OK; > +} > + > +static int virtnet_unregister_vf(struct net_device *vf_netdev) > +{ > + struct net_device *ndev; > + struct virtnet_info *vi; > + > + ndev = get_virtio_byref(vf_netdev); > + if (!ndev) > + return NOTIFY_DONE; > + > + vi = netdev_priv(ndev); > + cancel_delayed_work_sync(&vi->vf_takeover); > + > + netdev_info(ndev, "VF unregistering %s\n", vf_netdev->name); > + > + netdev_rx_handler_unregister(vf_netdev); > + netdev_upper_dev_unlink(vf_netdev, ndev); > + RCU_INIT_POINTER(vi->vf_netdev, NULL); > + dev_put(vf_netdev); > + > + return NOTIFY_OK; > +} > + > static void virtnet_remove(struct virtio_device *vdev) > { > struct virtnet_info *vi = vdev->priv; > + struct net_device *vf_netdev; > > virtnet_cpu_notif_remove(vi); > > /* Make sure no work handler is accessing the device. */ > flush_work(&vi->config_work); > > + rtnl_lock(); > + vf_netdev = rtnl_dereference(vi->vf_netdev); > + if (vf_netdev) > + virtnet_unregister_vf(vf_netdev); > + rtnl_unlock(); > + > unregister_netdev(vi->dev); > > remove_vq_common(vi); > > + free_percpu(vi->vf_stats); > free_percpu(vi->stats); > free_netdev(vi->dev); > } > @@ -2823,6 +3121,42 @@ static struct virtio_driver virtio_net_driver = { > #endif > }; > > +static int virtio_netdev_event(struct notifier_block *this, > + unsigned long event, void *ptr) > +{ > + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); > + > + /* Skip our own events */ > + if (event_dev->netdev_ops == &virtnet_netdev) > + return NOTIFY_DONE; > + > + /* Avoid non-Ethernet type devices */ > + if (event_dev->type != ARPHRD_ETHER) > + return NOTIFY_DONE; > + > + /* Avoid Vlan dev with same MAC registering as VF */ > + if (is_vlan_dev(event_dev)) > + return NOTIFY_DONE; > + > + /* Avoid Bonding master dev with same MAC registering as VF */ > + if ((event_dev->priv_flags & IFF_BONDING) && > + (event_dev->flags & IFF_MASTER)) > + return NOTIFY_DONE; > + > + switch (event) { > + case NETDEV_REGISTER: > + return virtnet_register_vf(event_dev); > + case NETDEV_UNREGISTER: > + return virtnet_unregister_vf(event_dev); > + default: > + return NOTIFY_DONE; > + } > +} > + > +static struct notifier_block virtio_netdev_notifier = { > + .notifier_call = virtio_netdev_event, > +}; > + > static __init int virtio_net_driver_init(void) > { > int ret; > @@ -2841,6 +3175,8 @@ static __init int virtio_net_driver_init(void) > ret = register_virtio_driver(&virtio_net_driver); > if (ret) > goto err_virtio; > + > + register_netdevice_notifier(&virtio_netdev_notifier); > return 0; > err_virtio: > cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); > @@ -2853,6 +3189,7 @@ module_init(virtio_net_driver_init); > > static __exit void virtio_net_driver_exit(void) > { > + unregister_netdevice_notifier(&virtio_netdev_notifier); > unregister_virtio_driver(&virtio_net_driver); > cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); > cpuhp_remove_multi_state(virtionet_online); > -- > 2.14.3 _______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/virtualization