This provides a generic interface for paravirtual drivers to listen for netdev register/unregister/link change events from pci ethernet devices with the same MAC and takeover their datapath. The notifier and event handling code is based on the existing netvsc implementation. It exposes 2 sets of interfaces to the paravirtual drivers. 1. existing netvsc driver that uses 2 netdev model. In this model, no master netdev is created. The paravirtual driver registers each bypass instance along with a set of ops to manage the slave events. bypass_master_register() bypass_master_unregister() 2. new virtio_net based solution that uses 3 netdev model. In this model, the bypass module provides interfaces to create/destroy additional master netdev and all the slave events are managed internally. bypass_master_create() bypass_master_destroy() Signed-off-by: Sridhar Samudrala <sridhar.samudrala@xxxxxxxxx> --- include/linux/netdevice.h | 14 + include/net/bypass.h | 96 ++++++ net/Kconfig | 18 + net/core/Makefile | 1 + net/core/bypass.c | 844 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 973 insertions(+) create mode 100644 include/net/bypass.h create mode 100644 net/core/bypass.c diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf44503ea81a..587293728f70 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1430,6 +1430,8 @@ enum netdev_priv_flags { IFF_PHONY_HEADROOM = 1<<24, IFF_MACSEC = 1<<25, IFF_NO_RX_HANDLER = 1<<26, + IFF_BYPASS = 1 << 27, + IFF_BYPASS_SLAVE = 1 << 28, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1458,6 +1460,8 @@ enum netdev_priv_flags { #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED #define IFF_MACSEC IFF_MACSEC #define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER +#define IFF_BYPASS IFF_BYPASS +#define IFF_BYPASS_SLAVE IFF_BYPASS_SLAVE /** * struct net_device - The DEVICE structure. @@ -4308,6 +4312,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev) return dev->priv_flags & IFF_RXFH_CONFIGURED; } +static inline bool netif_is_bypass_master(const struct net_device *dev) +{ + return dev->priv_flags & IFF_BYPASS; +} + +static inline bool netif_is_bypass_slave(const struct net_device *dev) +{ + return dev->priv_flags & IFF_BYPASS_SLAVE; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { diff --git a/include/net/bypass.h b/include/net/bypass.h new file mode 100644 index 000000000000..86b02cb894cf --- /dev/null +++ b/include/net/bypass.h @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018, Intel Corporation. */ + +#ifndef _NET_BYPASS_H +#define _NET_BYPASS_H + +#include <linux/netdevice.h> + +struct bypass_ops { + int (*slave_pre_register)(struct net_device *slave_netdev, + struct net_device *bypass_netdev); + int (*slave_join)(struct net_device *slave_netdev, + struct net_device *bypass_netdev); + int (*slave_pre_unregister)(struct net_device *slave_netdev, + struct net_device *bypass_netdev); + int (*slave_release)(struct net_device *slave_netdev, + struct net_device *bypass_netdev); + int (*slave_link_change)(struct net_device *slave_netdev, + struct net_device *bypass_netdev); + rx_handler_result_t (*handle_frame)(struct sk_buff **pskb); +}; + +struct bypass_master { + struct list_head list; + struct net_device __rcu *bypass_netdev; + struct bypass_ops __rcu *ops; +}; + +/* bypass state */ +struct bypass_info { + /* passthru netdev with same MAC */ + struct net_device __rcu *active_netdev; + + /* virtio_net netdev */ + struct net_device __rcu *backup_netdev; + + /* active netdev stats */ + struct rtnl_link_stats64 active_stats; + + /* backup netdev stats */ + struct rtnl_link_stats64 backup_stats; + + /* aggregated stats */ + struct rtnl_link_stats64 bypass_stats; + + /* spinlock while updating stats */ + spinlock_t stats_lock; +}; + +#if IS_ENABLED(CONFIG_NET_BYPASS) + +int bypass_master_create(struct net_device *backup_netdev, + struct bypass_master **pbypass_master); +void bypass_master_destroy(struct bypass_master *bypass_master); + +int bypass_master_register(struct net_device *dev, struct bypass_ops *ops, + struct bypass_master **pbypass_master); +void bypass_master_unregister(struct bypass_master *bypass_master); + +int bypass_slave_unregister(struct net_device *slave_netdev); + +#else + +static inline +int bypass_master_create(struct net_device *backup_netdev, + struct bypass_master **pbypass_master); +{ + return 0; +} + +static inline +void bypass_master_destroy(struct bypass_master *bypass_master) +{ +} + +static inline +int bypass_master_register(struct net_device *dev, struct bypass_ops *ops, + struct pbypass_master **pbypass_master); +{ + return 0; +} + +static inline +void bypass_master_unregister(struct bypass_master *bypass_master) +{ +} + +static inline +int bypass_slave_unregister(struct net_device *slave_netdev) +{ + return 0; +} + +#endif + +#endif /* _NET_BYPASS_H */ diff --git a/net/Kconfig b/net/Kconfig index 0428f12c25c2..994445f4a96a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -423,6 +423,24 @@ config MAY_USE_DEVLINK on MAY_USE_DEVLINK to ensure they do not cause link errors when devlink is a loadable module and the driver using it is built-in. +config NET_BYPASS + tristate "Bypass interface" + ---help--- + This provides a generic interface for paravirtual drivers to listen + for netdev register/unregister/link change events from pci ethernet + devices with the same MAC and takeover their datapath. This also + enables live migration of a VM with direct attached VF by failing + over to the paravirtual datapath when the VF is unplugged. + +config MAY_USE_BYPASS + tristate + default m if NET_BYPASS=m + default y if NET_BYPASS=y || NET_BYPASS=n + help + Drivers using the bypass infrastructure should have a dependency + on MAY_USE_BYPASS to ensure they do not cause link errors when + bypass is a loadable module and the driver using it is built-in. + endif # if NET # Used by archs to tell that they support BPF JIT compiler plus which flavour. diff --git a/net/core/Makefile b/net/core/Makefile index 6dbbba8c57ae..a9727ed1c8fc 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -30,3 +30,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o +obj-$(CONFIG_NET_BYPASS) += bypass.o diff --git a/net/core/bypass.c b/net/core/bypass.c new file mode 100644 index 000000000000..b5b9cb554c3f --- /dev/null +++ b/net/core/bypass.c @@ -0,0 +1,844 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018, Intel Corporation. */ + +/* A common module to handle registrations and notifications for paravirtual + * drivers to enable accelerated datapath and support VF live migration. + * + * The notifier and event handling code is based on netvsc driver. + */ + +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#include <linux/netpoll.h> +#include <linux/rtnetlink.h> +#include <linux/if_vlan.h> +#include <linux/pci.h> +#include <net/sch_generic.h> +#include <uapi/linux/if_arp.h> +#include <net/bypass.h> + +static LIST_HEAD(bypass_master_list); +static DEFINE_SPINLOCK(bypass_lock); + +static int bypass_slave_pre_register(struct net_device *slave_netdev, + struct net_device *bypass_netdev, + struct bypass_ops *bypass_ops) +{ + struct bypass_info *bi; + bool backup; + + if (bypass_ops) { + if (!bypass_ops->slave_pre_register) + return -EINVAL; + + return bypass_ops->slave_pre_register(slave_netdev, + bypass_netdev); + } + + bi = netdev_priv(bypass_netdev); + backup = (slave_netdev->dev.parent == bypass_netdev->dev.parent); + if (backup ? rtnl_dereference(bi->backup_netdev) : + rtnl_dereference(bi->active_netdev)) { + netdev_err(bypass_netdev, "%s attempting to register as slave dev when %s already present\n", + slave_netdev->name, backup ? "backup" : "active"); + return -EEXIST; + } + + /* Avoid non pci devices as active netdev */ + if (!backup && (!slave_netdev->dev.parent || + !dev_is_pci(slave_netdev->dev.parent))) + return -EINVAL; + + return 0; +} + +static int bypass_slave_join(struct net_device *slave_netdev, + struct net_device *bypass_netdev, + struct bypass_ops *bypass_ops) +{ + struct bypass_info *bi; + bool backup; + + if (bypass_ops) { + if (!bypass_ops->slave_join) + return -EINVAL; + + return bypass_ops->slave_join(slave_netdev, bypass_netdev); + } + + bi = netdev_priv(bypass_netdev); + backup = (slave_netdev->dev.parent == bypass_netdev->dev.parent); + + dev_hold(slave_netdev); + + if (backup) { + rcu_assign_pointer(bi->backup_netdev, slave_netdev); + dev_get_stats(bi->backup_netdev, &bi->backup_stats); + } else { + rcu_assign_pointer(bi->active_netdev, slave_netdev); + dev_get_stats(bi->active_netdev, &bi->active_stats); + bypass_netdev->min_mtu = slave_netdev->min_mtu; + bypass_netdev->max_mtu = slave_netdev->max_mtu; + } + + netdev_info(bypass_netdev, "bypass slave:%s joined\n", + slave_netdev->name); + + return 0; +} + +/* Called when slave dev is injecting data into network stack. + * Change the associated network device from lower dev to virtio. + * note: already called with rcu_read_lock + */ +static rx_handler_result_t bypass_handle_frame(struct sk_buff **pskb) +{ + struct sk_buff *skb = *pskb; + struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data); + + skb->dev = ndev; + + return RX_HANDLER_ANOTHER; +} + +static struct net_device *bypass_master_get_bymac(u8 *mac, + struct bypass_ops **ops) +{ + struct bypass_master *bypass_master; + struct net_device *bypass_netdev; + + spin_lock(&bypass_lock); + list_for_each_entry(bypass_master, &bypass_master_list, list) { + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev); + if (ether_addr_equal(bypass_netdev->perm_addr, mac)) { + *ops = rcu_dereference(bypass_master->ops); + spin_unlock(&bypass_lock); + return bypass_netdev; + } + } + spin_unlock(&bypass_lock); + return NULL; +} + +static int bypass_slave_register(struct net_device *slave_netdev) +{ + struct net_device *bypass_netdev; + struct bypass_ops *bypass_ops; + int ret, orig_mtu; + + ASSERT_RTNL(); + + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr, + &bypass_ops); + if (!bypass_netdev) + goto done; + + ret = bypass_slave_pre_register(slave_netdev, bypass_netdev, + bypass_ops); + if (ret != 0) + goto done; + + ret = netdev_rx_handler_register(slave_netdev, + bypass_ops ? bypass_ops->handle_frame : + bypass_handle_frame, bypass_netdev); + if (ret != 0) { + netdev_err(slave_netdev, "can not register bypass rx handler (err = %d)\n", + ret); + goto done; + } + + ret = netdev_upper_dev_link(slave_netdev, bypass_netdev, NULL); + if (ret != 0) { + netdev_err(slave_netdev, "can not set master device %s (err = %d)\n", + bypass_netdev->name, ret); + goto upper_link_failed; + } + + slave_netdev->priv_flags |= IFF_BYPASS_SLAVE; + + if (netif_running(bypass_netdev)) { + ret = dev_open(slave_netdev); + if (ret && (ret != -EBUSY)) { + netdev_err(bypass_netdev, "Opening slave %s failed ret:%d\n", + slave_netdev->name, ret); + goto err_interface_up; + } + } + + /* Align MTU of slave with master */ + orig_mtu = slave_netdev->mtu; + ret = dev_set_mtu(slave_netdev, bypass_netdev->mtu); + if (ret != 0) { + netdev_err(bypass_netdev, "unable to change mtu of %s to %u register failed\n", + slave_netdev->name, bypass_netdev->mtu); + goto err_set_mtu; + } + + ret = bypass_slave_join(slave_netdev, bypass_netdev, bypass_ops); + if (ret != 0) + goto err_join; + + call_netdevice_notifiers(NETDEV_JOIN, slave_netdev); + + netdev_info(bypass_netdev, "bypass slave:%s registered\n", + slave_netdev->name); + + goto done; + +err_join: + dev_set_mtu(slave_netdev, orig_mtu); +err_set_mtu: + dev_close(slave_netdev); +err_interface_up: + netdev_upper_dev_unlink(slave_netdev, bypass_netdev); + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE; +upper_link_failed: + netdev_rx_handler_unregister(slave_netdev); +done: + return NOTIFY_DONE; +} + +static int bypass_slave_pre_unregister(struct net_device *slave_netdev, + struct net_device *bypass_netdev, + struct bypass_ops *bypass_ops) +{ + struct net_device *backup_netdev, *active_netdev; + struct bypass_info *bi; + + if (bypass_ops) { + if (!bypass_ops->slave_pre_unregister) + return -EINVAL; + + return bypass_ops->slave_pre_unregister(slave_netdev, + bypass_netdev); + } + + bi = netdev_priv(bypass_netdev); + active_netdev = rtnl_dereference(bi->active_netdev); + backup_netdev = rtnl_dereference(bi->backup_netdev); + + if (slave_netdev != active_netdev && slave_netdev != backup_netdev) + return -EINVAL; + + return 0; +} + +static int bypass_slave_release(struct net_device *slave_netdev, + struct net_device *bypass_netdev, + struct bypass_ops *bypass_ops) +{ + struct net_device *backup_netdev, *active_netdev; + struct bypass_info *bi; + + if (bypass_ops) { + if (!bypass_ops->slave_release) + return -EINVAL; + + return bypass_ops->slave_release(slave_netdev, bypass_netdev); + } + + bi = netdev_priv(bypass_netdev); + active_netdev = rtnl_dereference(bi->active_netdev); + backup_netdev = rtnl_dereference(bi->backup_netdev); + + if (slave_netdev == backup_netdev) { + RCU_INIT_POINTER(bi->backup_netdev, NULL); + } else { + RCU_INIT_POINTER(bi->active_netdev, NULL); + if (backup_netdev) { + bypass_netdev->min_mtu = backup_netdev->min_mtu; + bypass_netdev->max_mtu = backup_netdev->max_mtu; + } + } + + dev_put(slave_netdev); + + netdev_info(bypass_netdev, "bypass slave:%s released\n", + slave_netdev->name); + + return 0; +} + +int bypass_slave_unregister(struct net_device *slave_netdev) +{ + struct net_device *bypass_netdev; + struct bypass_ops *bypass_ops; + int ret; + + if (!netif_is_bypass_slave(slave_netdev)) + goto done; + + ASSERT_RTNL(); + + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr, + &bypass_ops); + if (!bypass_netdev) + goto done; + + ret = bypass_slave_pre_unregister(slave_netdev, bypass_netdev, + bypass_ops); + if (ret != 0) + goto done; + + netdev_rx_handler_unregister(slave_netdev); + netdev_upper_dev_unlink(slave_netdev, bypass_netdev); + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE; + + bypass_slave_release(slave_netdev, bypass_netdev, bypass_ops); + + netdev_info(bypass_netdev, "bypass slave:%s unregistered\n", + slave_netdev->name); + +done: + return NOTIFY_DONE; +} +EXPORT_SYMBOL_GPL(bypass_slave_unregister); + +static bool bypass_xmit_ready(struct net_device *dev) +{ + return netif_running(dev) && netif_carrier_ok(dev); +} + +static int bypass_slave_link_change(struct net_device *slave_netdev) +{ + struct net_device *bypass_netdev, *active_netdev, *backup_netdev; + struct bypass_ops *bypass_ops; + struct bypass_info *bi; + + if (!netif_is_bypass_slave(slave_netdev)) + goto done; + + ASSERT_RTNL(); + + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr, + &bypass_ops); + if (!bypass_netdev) + goto done; + + if (bypass_ops) { + if (!bypass_ops->slave_link_change) + goto done; + + return bypass_ops->slave_link_change(slave_netdev, + bypass_netdev); + } + + if (!netif_running(bypass_netdev)) + return 0; + + bi = netdev_priv(bypass_netdev); + + active_netdev = rtnl_dereference(bi->active_netdev); + backup_netdev = rtnl_dereference(bi->backup_netdev); + + if (slave_netdev != active_netdev && slave_netdev != backup_netdev) + goto done; + + if ((active_netdev && bypass_xmit_ready(active_netdev)) || + (backup_netdev && bypass_xmit_ready(backup_netdev))) { + netif_carrier_on(bypass_netdev); + netif_tx_wake_all_queues(bypass_netdev); + } else { + netif_carrier_off(bypass_netdev); + netif_tx_stop_all_queues(bypass_netdev); + } + +done: + return NOTIFY_DONE; +} + +static bool bypass_validate_event_dev(struct net_device *dev) +{ + /* Skip parent events */ + if (netif_is_bypass_master(dev)) + return false; + + /* Avoid non-Ethernet type devices */ + if (dev->type != ARPHRD_ETHER) + return false; + + /* Avoid Vlan dev with same MAC registering as VF */ + if (is_vlan_dev(dev)) + return false; + + /* Avoid Bonding master dev with same MAC registering as slave dev */ + if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER)) + return false; + + return true; +} + +static int +bypass_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + + if (!bypass_validate_event_dev(event_dev)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + return bypass_slave_register(event_dev); + case NETDEV_UNREGISTER: + return bypass_slave_unregister(event_dev); + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + return bypass_slave_link_change(event_dev); + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block bypass_notifier = { + .notifier_call = bypass_event, +}; + +int bypass_open(struct net_device *dev) +{ + struct bypass_info *bi = netdev_priv(dev); + struct net_device *active_netdev, *backup_netdev; + int err; + + netif_carrier_off(dev); + netif_tx_wake_all_queues(dev); + + active_netdev = rtnl_dereference(bi->active_netdev); + if (active_netdev) { + err = dev_open(active_netdev); + if (err) + goto err_active_open; + } + + backup_netdev = rtnl_dereference(bi->backup_netdev); + if (backup_netdev) { + err = dev_open(backup_netdev); + if (err) + goto err_backup_open; + } + + return 0; + +err_backup_open: + dev_close(active_netdev); +err_active_open: + netif_tx_disable(dev); + return err; +} +EXPORT_SYMBOL_GPL(bypass_open); + +int bypass_close(struct net_device *dev) +{ + struct bypass_info *vi = netdev_priv(dev); + struct net_device *slave_netdev; + + netif_tx_disable(dev); + + slave_netdev = rtnl_dereference(vi->active_netdev); + if (slave_netdev) + dev_close(slave_netdev); + + slave_netdev = rtnl_dereference(vi->backup_netdev); + if (slave_netdev) + dev_close(slave_netdev); + + return 0; +} +EXPORT_SYMBOL_GPL(bypass_close); + +static netdev_tx_t bypass_drop_xmit(struct sk_buff *skb, struct net_device *dev) +{ + atomic_long_inc(&dev->tx_dropped); + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; +} + +netdev_tx_t bypass_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct bypass_info *bi = netdev_priv(dev); + struct net_device *xmit_dev; + + /* Try xmit via active netdev followed by backup netdev */ + xmit_dev = rcu_dereference_bh(bi->active_netdev); + if (!xmit_dev || !bypass_xmit_ready(xmit_dev)) { + xmit_dev = rcu_dereference_bh(bi->backup_netdev); + if (!xmit_dev || !bypass_xmit_ready(xmit_dev)) + return bypass_drop_xmit(skb, dev); + } + + skb->dev = xmit_dev; + skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping; + + return dev_queue_xmit(skb); +} +EXPORT_SYMBOL_GPL(bypass_start_xmit); + +u16 bypass_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + /* This helper function exists to help dev_pick_tx get the correct + * destination queue. Using a helper function skips a call to + * skb_tx_hash and will put the skbs in the queue we expect on their + * way down to the bonding driver. + */ + u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0; + + /* Save the original txq to restore before passing to the driver */ + qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping; + + if (unlikely(txq >= dev->real_num_tx_queues)) { + do { + txq -= dev->real_num_tx_queues; + } while (txq >= dev->real_num_tx_queues); + } + + return txq; +} +EXPORT_SYMBOL_GPL(bypass_select_queue); + +/* fold stats, assuming all rtnl_link_stats64 fields are u64, but + * that some drivers can provide 32bit values only. + */ +static void bypass_fold_stats(struct rtnl_link_stats64 *_res, + const struct rtnl_link_stats64 *_new, + const struct rtnl_link_stats64 *_old) +{ + const u64 *new = (const u64 *)_new; + const u64 *old = (const u64 *)_old; + u64 *res = (u64 *)_res; + int i; + + for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) { + u64 nv = new[i]; + u64 ov = old[i]; + s64 delta = nv - ov; + + /* detects if this particular field is 32bit only */ + if (((nv | ov) >> 32) == 0) + delta = (s64)(s32)((u32)nv - (u32)ov); + + /* filter anomalies, some drivers reset their stats + * at down/up events. + */ + if (delta > 0) + res[i] += delta; + } +} + +void bypass_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + struct bypass_info *bi = netdev_priv(dev); + const struct rtnl_link_stats64 *new; + struct rtnl_link_stats64 temp; + struct net_device *slave_netdev; + + spin_lock(&bi->stats_lock); + memcpy(stats, &bi->bypass_stats, sizeof(*stats)); + + rcu_read_lock(); + + slave_netdev = rcu_dereference(bi->active_netdev); + if (slave_netdev) { + new = dev_get_stats(slave_netdev, &temp); + bypass_fold_stats(stats, new, &bi->active_stats); + memcpy(&bi->active_stats, new, sizeof(*new)); + } + + slave_netdev = rcu_dereference(bi->backup_netdev); + if (slave_netdev) { + new = dev_get_stats(slave_netdev, &temp); + bypass_fold_stats(stats, new, &bi->backup_stats); + memcpy(&bi->backup_stats, new, sizeof(*new)); + } + + rcu_read_unlock(); + + memcpy(&bi->bypass_stats, stats, sizeof(*stats)); + spin_unlock(&bi->stats_lock); +} +EXPORT_SYMBOL_GPL(bypass_get_stats); + +int bypass_change_mtu(struct net_device *dev, int new_mtu) +{ + struct bypass_info *bi = netdev_priv(dev); + struct net_device *active_netdev, *backup_netdev; + int ret = 0; + + active_netdev = rcu_dereference(bi->active_netdev); + if (active_netdev) { + ret = dev_set_mtu(active_netdev, new_mtu); + if (ret) + return ret; + } + + backup_netdev = rcu_dereference(bi->backup_netdev); + if (backup_netdev) { + ret = dev_set_mtu(backup_netdev, new_mtu); + if (ret) { + dev_set_mtu(active_netdev, dev->mtu); + return ret; + } + } + + dev->mtu = new_mtu; + return 0; +} +EXPORT_SYMBOL_GPL(bypass_change_mtu); + +void bypass_set_rx_mode(struct net_device *dev) +{ + struct bypass_info *bi = netdev_priv(dev); + struct net_device *slave_netdev; + + rcu_read_lock(); + + slave_netdev = rcu_dereference(bi->active_netdev); + if (slave_netdev) { + dev_uc_sync_multiple(slave_netdev, dev); + dev_mc_sync_multiple(slave_netdev, dev); + } + + slave_netdev = rcu_dereference(bi->backup_netdev); + if (slave_netdev) { + dev_uc_sync_multiple(slave_netdev, dev); + dev_mc_sync_multiple(slave_netdev, dev); + } + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(bypass_set_rx_mode); + +static const struct net_device_ops bypass_netdev_ops = { + .ndo_open = bypass_open, + .ndo_stop = bypass_close, + .ndo_start_xmit = bypass_start_xmit, + .ndo_select_queue = bypass_select_queue, + .ndo_get_stats64 = bypass_get_stats, + .ndo_change_mtu = bypass_change_mtu, + .ndo_set_rx_mode = bypass_set_rx_mode, + .ndo_validate_addr = eth_validate_addr, + .ndo_features_check = passthru_features_check, +}; + +#define BYPASS_DRV_NAME "bypass" +#define BYPASS_DRV_VERSION "0.1" + +static void bypass_ethtool_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + strlcpy(drvinfo->driver, BYPASS_DRV_NAME, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, BYPASS_DRV_VERSION, sizeof(drvinfo->version)); +} + +int bypass_ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) +{ + struct bypass_info *bi = netdev_priv(dev); + struct net_device *slave_netdev; + + slave_netdev = rtnl_dereference(bi->active_netdev); + if (!slave_netdev || !bypass_xmit_ready(slave_netdev)) { + slave_netdev = rtnl_dereference(bi->backup_netdev); + if (!slave_netdev || !bypass_xmit_ready(slave_netdev)) { + cmd->base.duplex = DUPLEX_UNKNOWN; + cmd->base.port = PORT_OTHER; + cmd->base.speed = SPEED_UNKNOWN; + + return 0; + } + } + + return __ethtool_get_link_ksettings(slave_netdev, cmd); +} +EXPORT_SYMBOL_GPL(bypass_ethtool_get_link_ksettings); + +static const struct ethtool_ops bypass_ethtool_ops = { + .get_drvinfo = bypass_ethtool_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_link_ksettings = bypass_ethtool_get_link_ksettings, +}; + +static void bypass_register_existing_slave(struct net_device *bypass_netdev) +{ + struct net *net = dev_net(bypass_netdev); + struct net_device *dev; + + rtnl_lock(); + for_each_netdev(net, dev) { + if (dev == bypass_netdev) + continue; + if (!bypass_validate_event_dev(dev)) + continue; + if (ether_addr_equal(bypass_netdev->perm_addr, dev->perm_addr)) + bypass_slave_register(dev); + } + rtnl_unlock(); +} + +int bypass_master_register(struct net_device *dev, struct bypass_ops *ops, + struct bypass_master **pbypass_master) +{ + struct bypass_master *bypass_master; + + bypass_master = kzalloc(sizeof(*bypass_master), GFP_KERNEL); + if (!bypass_master) + return -ENOMEM; + + rcu_assign_pointer(bypass_master->ops, ops); + dev_hold(dev); + dev->priv_flags |= IFF_BYPASS; + rcu_assign_pointer(bypass_master->bypass_netdev, dev); + + spin_lock(&bypass_lock); + list_add_tail(&bypass_master->list, &bypass_master_list); + spin_unlock(&bypass_lock); + + bypass_register_existing_slave(dev); + + *pbypass_master = bypass_master; + return 0; +} +EXPORT_SYMBOL_GPL(bypass_master_register); + +void bypass_master_unregister(struct bypass_master *bypass_master) +{ + struct net_device *bypass_netdev; + + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev); + + bypass_netdev->priv_flags &= ~IFF_BYPASS; + dev_put(bypass_netdev); + + spin_lock(&bypass_lock); + list_del(&bypass_master->list); + spin_unlock(&bypass_lock); + + kfree(bypass_master); +} +EXPORT_SYMBOL_GPL(bypass_master_unregister); + +int bypass_master_create(struct net_device *backup_netdev, + struct bypass_master **pbypass_master) +{ + struct device *dev = backup_netdev->dev.parent; + struct net_device *bypass_netdev; + int err; + + /* Alloc at least 2 queues, for now we are going with 16 assuming + * that most devices being bonded won't have too many queues. + */ + bypass_netdev = alloc_etherdev_mq(sizeof(struct bypass_info), 16); + if (!bypass_netdev) { + dev_err(dev, "Unable to allocate bypass_netdev!\n"); + return -ENOMEM; + } + + dev_net_set(bypass_netdev, dev_net(backup_netdev)); + SET_NETDEV_DEV(bypass_netdev, dev); + + bypass_netdev->netdev_ops = &bypass_netdev_ops; + bypass_netdev->ethtool_ops = &bypass_ethtool_ops; + + /* Initialize the device options */ + bypass_netdev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE; + bypass_netdev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | + IFF_TX_SKB_SHARING); + + /* don't acquire bypass netdev's netif_tx_lock when transmitting */ + bypass_netdev->features |= NETIF_F_LLTX; + + /* Don't allow bypass devices to change network namespaces. */ + bypass_netdev->features |= NETIF_F_NETNS_LOCAL; + + bypass_netdev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG | + NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | + NETIF_F_HIGHDMA | NETIF_F_LRO; + + bypass_netdev->hw_features |= NETIF_F_GSO_ENCAP_ALL; + bypass_netdev->features |= bypass_netdev->hw_features; + + memcpy(bypass_netdev->dev_addr, backup_netdev->dev_addr, + bypass_netdev->addr_len); + + bypass_netdev->min_mtu = backup_netdev->min_mtu; + bypass_netdev->max_mtu = backup_netdev->max_mtu; + + err = register_netdev(bypass_netdev); + if (err < 0) { + dev_err(dev, "Unable to register bypass_netdev!\n"); + goto err_register_netdev; + } + + netif_carrier_off(bypass_netdev); + + err = bypass_master_register(bypass_netdev, NULL, pbypass_master); + if (err < 0) + goto err_bypass; + + return 0; + +err_bypass: + unregister_netdev(bypass_netdev); +err_register_netdev: + free_netdev(bypass_netdev); + + return err; +} +EXPORT_SYMBOL_GPL(bypass_master_create); + +void bypass_master_destroy(struct bypass_master *bypass_master) +{ + struct net_device *bypass_netdev; + struct net_device *slave_netdev; + struct bypass_info *bi; + + if (!bypass_master) + return; + + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev); + bi = netdev_priv(bypass_netdev); + + netif_device_detach(bypass_netdev); + + rtnl_lock(); + + slave_netdev = rtnl_dereference(bi->active_netdev); + if (slave_netdev) + bypass_slave_unregister(slave_netdev); + + slave_netdev = rtnl_dereference(bi->backup_netdev); + if (slave_netdev) + bypass_slave_unregister(slave_netdev); + + bypass_master_unregister(bypass_master); + + unregister_netdevice(bypass_netdev); + + rtnl_unlock(); + + free_netdev(bypass_netdev); +} +EXPORT_SYMBOL_GPL(bypass_master_destroy); + +static __init int +bypass_init(void) +{ + register_netdevice_notifier(&bypass_notifier); + + return 0; +} +module_init(bypass_init); + +static __exit +void bypass_exit(void) +{ + unregister_netdevice_notifier(&bypass_notifier); +} +module_exit(bypass_exit); + +MODULE_DESCRIPTION("Bypass infrastructure/interface for Paravirtual drivers"); +MODULE_LICENSE("GPL v2"); -- 2.14.3 _______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/virtualization