First off, as mentioned in another thread, the model of stacking up virt-bond functionality over virtio seems a wrong direction to me. Essentially the migration process would need to carry over all guest side configurations previously done on the VF/PT and get them moved to the new device being it virtio or VF/PT. Without the help of a new upper layer bond driver that enslaves virtio and VF/PT devices underneath, virtio will be overloaded with too much specifics being a VF/PT backup in the future. I hope you're already aware of the issue in longer term and move to that model as soon as possible. See more inline. On Thu, Jan 11, 2018 at 9:58 PM, Sridhar Samudrala <sridhar.samudrala@xxxxxxxxx> wrote: > This patch enables virtio_net to switch over to a VF datapath when a VF > netdev is present with the same MAC address. The VF datapath is only used > for unicast traffic. Broadcasts/multicasts go via virtio datapath so that > east-west broadcasts don't use the PCI bandwidth. Why not making an this an option/optimization rather than being the only means? The problem of east-west broadcast eating PCI bandwidth depends on specifics of the (virtual) network setup, while some users won't want to lose VF's merits such as latency. Why restricting broadcast/multicast xmit to virtio only which potentially regresses the performance against raw VF? > It allows live migration > of a VM with a direct attached VF without the need to setup a bond/team > between a VF and virtio net device in the guest. > > The hypervisor needs to unplug the VF device from the guest on the source > host and reset the MAC filter of the VF to initiate failover of datapath to > virtio before starting the migration. After the migration is completed, the > destination hypervisor sets the MAC filter on the VF and plugs it back to > the guest to switch over to VF datapath. Is there a host side patch (planned) for this MAC filter switching process? As said in another thread, that simple script won't work for macvtap backend. Thanks, -Siwei > > This patch is based on the discussion initiated by Jesse on this thread. > https://marc.info/?l=linux-virtualization&m=151189725224231&w=2 > > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@xxxxxxxxx> > Reviewed-by: Jesse Brandeburg <jesse.brandeburg@xxxxxxxxx> > --- > drivers/net/virtio_net.c | 307 ++++++++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 305 insertions(+), 2 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index f149a160a8c5..0e58d364fde9 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -30,6 +30,8 @@ > #include <linux/cpu.h> > #include <linux/average.h> > #include <linux/filter.h> > +#include <linux/netdevice.h> > +#include <linux/netpoll.h> > #include <net/route.h> > #include <net/xdp.h> > > @@ -120,6 +122,15 @@ struct receive_queue { > struct xdp_rxq_info xdp_rxq; > }; > > +struct virtnet_vf_pcpu_stats { > + u64 rx_packets; > + u64 rx_bytes; > + u64 tx_packets; > + u64 tx_bytes; > + struct u64_stats_sync syncp; > + u32 tx_dropped; > +}; > + > struct virtnet_info { > struct virtio_device *vdev; > struct virtqueue *cvq; > @@ -182,6 +193,10 @@ struct virtnet_info { > u32 speed; > > unsigned long guest_offloads; > + > + /* State to manage the associated VF interface. */ > + struct net_device __rcu *vf_netdev; > + struct virtnet_vf_pcpu_stats __percpu *vf_stats; > }; > > struct padded_vnet_hdr { > @@ -1314,16 +1329,53 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) > return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC); > } > > +/* Send skb on the slave VF device. */ > +static int virtnet_vf_xmit(struct net_device *dev, struct net_device *vf_netdev, > + struct sk_buff *skb) > +{ > + struct virtnet_info *vi = netdev_priv(dev); > + unsigned int len = skb->len; > + int rc; > + > + skb->dev = vf_netdev; > + skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping; > + > + rc = dev_queue_xmit(skb); > + if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) { > + struct virtnet_vf_pcpu_stats *pcpu_stats > + = this_cpu_ptr(vi->vf_stats); > + > + u64_stats_update_begin(&pcpu_stats->syncp); > + pcpu_stats->tx_packets++; > + pcpu_stats->tx_bytes += len; > + u64_stats_update_end(&pcpu_stats->syncp); > + } else { > + this_cpu_inc(vi->vf_stats->tx_dropped); > + } > + > + return rc; > +} > + > static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > { > struct virtnet_info *vi = netdev_priv(dev); > int qnum = skb_get_queue_mapping(skb); > struct send_queue *sq = &vi->sq[qnum]; > + struct net_device *vf_netdev; > int err; > struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); > bool kick = !skb->xmit_more; > bool use_napi = sq->napi.weight; > > + /* If VF is present and up then redirect packets > + * called with rcu_read_lock_bh > + */ > + vf_netdev = rcu_dereference_bh(vi->vf_netdev); > + if (vf_netdev && netif_running(vf_netdev) && > + !netpoll_tx_running(dev) && > + is_unicast_ether_addr(eth_hdr(skb)->h_dest)) > + return virtnet_vf_xmit(dev, vf_netdev, skb); > + > /* Free up any pending old buffers before queueing new ones. */ > free_old_xmit_skbs(sq); > > @@ -1470,10 +1522,41 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) > return ret; > } > > +static void virtnet_get_vf_stats(struct net_device *dev, > + struct virtnet_vf_pcpu_stats *tot) > +{ > + struct virtnet_info *vi = netdev_priv(dev); > + int i; > + > + memset(tot, 0, sizeof(*tot)); > + > + for_each_possible_cpu(i) { > + const struct virtnet_vf_pcpu_stats *stats > + = per_cpu_ptr(vi->vf_stats, i); > + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; > + unsigned int start; > + > + do { > + start = u64_stats_fetch_begin_irq(&stats->syncp); > + rx_packets = stats->rx_packets; > + tx_packets = stats->tx_packets; > + rx_bytes = stats->rx_bytes; > + tx_bytes = stats->tx_bytes; > + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); > + > + tot->rx_packets += rx_packets; > + tot->tx_packets += tx_packets; > + tot->rx_bytes += rx_bytes; > + tot->tx_bytes += tx_bytes; > + tot->tx_dropped += stats->tx_dropped; > + } > +} > + > static void virtnet_stats(struct net_device *dev, > struct rtnl_link_stats64 *tot) > { > struct virtnet_info *vi = netdev_priv(dev); > + struct virtnet_vf_pcpu_stats vf_stats; > int cpu; > unsigned int start; > > @@ -1504,6 +1587,13 @@ static void virtnet_stats(struct net_device *dev, > tot->rx_dropped = dev->stats.rx_dropped; > tot->rx_length_errors = dev->stats.rx_length_errors; > tot->rx_frame_errors = dev->stats.rx_frame_errors; > + > + virtnet_get_vf_stats(dev, &vf_stats); > + tot->rx_packets += vf_stats.rx_packets; > + tot->tx_packets += vf_stats.tx_packets; > + tot->rx_bytes += vf_stats.rx_bytes; > + tot->tx_bytes += vf_stats.tx_bytes; > + tot->tx_dropped += vf_stats.tx_dropped; > } > > #ifdef CONFIG_NET_POLL_CONTROLLER > @@ -2635,6 +2725,13 @@ static int virtnet_probe(struct virtio_device *vdev) > > INIT_WORK(&vi->config_work, virtnet_config_changed_work); > > + if (virtio_has_feature(vdev, VIRTIO_NET_F_BACKUP)) { > + vi->vf_stats = > + netdev_alloc_pcpu_stats(struct virtnet_vf_pcpu_stats); > + if (!vi->vf_stats) > + goto free_stats; > + } > + > /* If we can receive ANY GSO packets, we must allocate large ones. */ > if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || > virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) || > @@ -2668,7 +2765,7 @@ static int virtnet_probe(struct virtio_device *vdev) > */ > dev_err(&vdev->dev, "device MTU appears to have changed " > "it is now %d < %d", mtu, dev->min_mtu); > - goto free_stats; > + goto free_vf_stats; > } > > dev->mtu = mtu; > @@ -2692,7 +2789,7 @@ static int virtnet_probe(struct virtio_device *vdev) > /* Allocate/initialize the rx/tx queues, and invoke find_vqs */ > err = init_vqs(vi); > if (err) > - goto free_stats; > + goto free_vf_stats; > > #ifdef CONFIG_SYSFS > if (vi->mergeable_rx_bufs) > @@ -2747,6 +2844,8 @@ static int virtnet_probe(struct virtio_device *vdev) > cancel_delayed_work_sync(&vi->refill); > free_receive_page_frags(vi); > virtnet_del_vqs(vi); > +free_vf_stats: > + free_percpu(vi->vf_stats); > free_stats: > free_percpu(vi->stats); > free: > @@ -2768,19 +2867,184 @@ static void remove_vq_common(struct virtnet_info *vi) > virtnet_del_vqs(vi); > } > > +static struct net_device *get_virtio_bymac(const u8 *mac) > +{ > + struct net_device *dev; > + > + ASSERT_RTNL(); > + > + for_each_netdev(&init_net, dev) { > + if (dev->netdev_ops != &virtnet_netdev) > + continue; /* not a virtio_net device */ > + > + if (ether_addr_equal(mac, dev->perm_addr)) > + return dev; > + } > + > + return NULL; > +} > + > +static struct net_device *get_virtio_byref(struct net_device *vf_netdev) > +{ > + struct net_device *dev; > + > + ASSERT_RTNL(); > + > + for_each_netdev(&init_net, dev) { > + struct virtnet_info *vi; > + > + if (dev->netdev_ops != &virtnet_netdev) > + continue; /* not a virtio_net device */ > + > + vi = netdev_priv(dev); > + if (rtnl_dereference(vi->vf_netdev) == vf_netdev) > + return dev; /* a match */ > + } > + > + return NULL; > +} > + > +/* Called when VF is injecting data into network stack. > + * Change the associated network device from VF to virtio. > + * note: already called with rcu_read_lock > + */ > +static rx_handler_result_t virtnet_vf_handle_frame(struct sk_buff **pskb) > +{ > + struct sk_buff *skb = *pskb; > + struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data); > + struct virtnet_info *vi = netdev_priv(ndev); > + struct virtnet_vf_pcpu_stats *pcpu_stats = > + this_cpu_ptr(vi->vf_stats); > + > + skb->dev = ndev; > + > + u64_stats_update_begin(&pcpu_stats->syncp); > + pcpu_stats->rx_packets++; > + pcpu_stats->rx_bytes += skb->len; > + u64_stats_update_end(&pcpu_stats->syncp); > + > + return RX_HANDLER_ANOTHER; > +} > + > +static int virtnet_vf_join(struct net_device *vf_netdev, > + struct net_device *ndev) > +{ > + int ret; > + > + ret = netdev_rx_handler_register(vf_netdev, > + virtnet_vf_handle_frame, ndev); > + if (ret != 0) { > + netdev_err(vf_netdev, > + "can not register virtio VF receive handler (err = %d)\n", > + ret); > + goto rx_handler_failed; > + } > + > + ret = netdev_upper_dev_link(vf_netdev, ndev, NULL); > + if (ret != 0) { > + netdev_err(vf_netdev, > + "can not set master device %s (err = %d)\n", > + ndev->name, ret); > + goto upper_link_failed; > + } > + > + vf_netdev->flags |= IFF_SLAVE; > + > + /* Align MTU of VF with master */ > + ret = dev_set_mtu(vf_netdev, ndev->mtu); > + if (ret) > + netdev_warn(vf_netdev, > + "unable to change mtu to %u\n", ndev->mtu); > + > + call_netdevice_notifiers(NETDEV_JOIN, vf_netdev); > + > + netdev_info(vf_netdev, "joined to %s\n", ndev->name); > + return 0; > + > +upper_link_failed: > + netdev_rx_handler_unregister(vf_netdev); > +rx_handler_failed: > + return ret; > +} > + > +static int virtnet_register_vf(struct net_device *vf_netdev) > +{ > + struct net_device *ndev; > + struct virtnet_info *vi; > + > + if (vf_netdev->addr_len != ETH_ALEN) > + return NOTIFY_DONE; > + > + /* We will use the MAC address to locate the virtio_net interface to > + * associate with the VF interface. If we don't find a matching > + * virtio interface, move on. > + */ > + ndev = get_virtio_bymac(vf_netdev->perm_addr); > + if (!ndev) > + return NOTIFY_DONE; > + > + vi = netdev_priv(ndev); > + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP)) > + return NOTIFY_DONE; > + > + if (rtnl_dereference(vi->vf_netdev)) > + return NOTIFY_DONE; > + > + if (virtnet_vf_join(vf_netdev, ndev) != 0) > + return NOTIFY_DONE; > + > + netdev_info(ndev, "VF registering %s\n", vf_netdev->name); > + > + dev_hold(vf_netdev); > + rcu_assign_pointer(vi->vf_netdev, vf_netdev); > + > + return NOTIFY_OK; > +} > + > +static int virtnet_unregister_vf(struct net_device *vf_netdev) > +{ > + struct net_device *ndev; > + struct virtnet_info *vi; > + > + ndev = get_virtio_byref(vf_netdev); > + if (!ndev) > + return NOTIFY_DONE; > + > + vi = netdev_priv(ndev); > + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP)) > + return NOTIFY_DONE; > + > + netdev_info(ndev, "VF unregistering %s\n", vf_netdev->name); > + > + netdev_rx_handler_unregister(vf_netdev); > + netdev_upper_dev_unlink(vf_netdev, ndev); > + RCU_INIT_POINTER(vi->vf_netdev, NULL); > + dev_put(vf_netdev); > + > + return NOTIFY_OK; > +} > + > static void virtnet_remove(struct virtio_device *vdev) > { > struct virtnet_info *vi = vdev->priv; > + struct net_device *vf_netdev; > > virtnet_cpu_notif_remove(vi); > > /* Make sure no work handler is accessing the device. */ > flush_work(&vi->config_work); > > + rtnl_lock(); > + vf_netdev = rtnl_dereference(vi->vf_netdev); > + if (vf_netdev) > + virtnet_unregister_vf(vf_netdev); > + rtnl_unlock(); > + > unregister_netdev(vi->dev); > > remove_vq_common(vi); > > + free_percpu(vi->vf_stats); > free_percpu(vi->stats); > free_netdev(vi->dev); > } > @@ -2859,6 +3123,42 @@ static struct virtio_driver virtio_net_driver = { > #endif > }; > > +static int virtio_netdev_event(struct notifier_block *this, > + unsigned long event, void *ptr) > +{ > + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); > + > + /* Skip our own events */ > + if (event_dev->netdev_ops == &virtnet_netdev) > + return NOTIFY_DONE; > + > + /* Avoid non-Ethernet type devices */ > + if (event_dev->type != ARPHRD_ETHER) > + return NOTIFY_DONE; > + > + /* Avoid Vlan dev with same MAC registering as VF */ > + if (is_vlan_dev(event_dev)) > + return NOTIFY_DONE; > + > + /* Avoid Bonding master dev with same MAC registering as VF */ > + if ((event_dev->priv_flags & IFF_BONDING) && > + (event_dev->flags & IFF_MASTER)) > + return NOTIFY_DONE; > + > + switch (event) { > + case NETDEV_REGISTER: > + return virtnet_register_vf(event_dev); > + case NETDEV_UNREGISTER: > + return virtnet_unregister_vf(event_dev); > + default: > + return NOTIFY_DONE; > + } > +} > + > +static struct notifier_block virtio_netdev_notifier = { > + .notifier_call = virtio_netdev_event, > +}; > + > static __init int virtio_net_driver_init(void) > { > int ret; > @@ -2877,6 +3177,8 @@ static __init int virtio_net_driver_init(void) > ret = register_virtio_driver(&virtio_net_driver); > if (ret) > goto err_virtio; > + > + register_netdevice_notifier(&virtio_netdev_notifier); > return 0; > err_virtio: > cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); > @@ -2889,6 +3191,7 @@ module_init(virtio_net_driver_init); > > static __exit void virtio_net_driver_exit(void) > { > + unregister_netdevice_notifier(&virtio_netdev_notifier); > unregister_virtio_driver(&virtio_net_driver); > cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); > cpuhp_remove_multi_state(virtionet_online); > -- > 2.14.3 > _______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/virtualization