From: Parav Pandit <parav@xxxxxxxxxxxx> Implement compatibility layer sysfs entries of ib_core so that non init_net net namespaces can also discover rdma devices. Each non init_net net namespace has an ib_core_device created in it. Such an ib_core_device sysfs tree resembles the rdma devices found in the init_net namespace. This allows discovering rdma devices in multiple non init_net net namespaces via sysfs entries and is helpful to rdma-core userspace. Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx> Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx> --- drivers/infiniband/core/core_priv.h | 3 + drivers/infiniband/core/device.c | 289 +++++++++++++++++++++++++++- include/rdma/ib_verbs.h | 6 + 3 files changed, 294 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index eeabe9ca8427..7705aa6861b5 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -333,4 +333,7 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec, const struct ib_gid_attr *attr); struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); + +void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev, + struct net *net); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 35b4bfec91c3..0ab28ab801af 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -38,6 +38,8 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/netdevice.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #include <linux/security.h> #include <linux/notifier.h> #include <rdma/rdma_netlink.h> @@ -100,6 +102,50 @@ static DECLARE_RWSEM(clients_rwsem); * be registered. 
*/ #define CLIENT_DATA_REGISTERED XA_MARK_1 + +/** + * ib_compat_device - rdma compat device per net namespace + * @coredev: IB core device + * @id: xarray id to identify the compat device; same id as that of + * net namespace xarray. + */ +struct ib_compat_device { + struct ib_core_device coredev; + u32 id; /* xarray id same as that of rdma net namespace */ +}; + +/** + * rdma_dev_net - rdma net namespace metadata for a net + * @net: Pointer to owner net namespace + * @id: xarray id to identify the net namespace. + */ +struct rdma_dev_net { + possible_net_t net; + u32 id; +}; + +/* + * If netns is registered then the corresponding compat device must also + * be registered. + */ +#define NET_NS_REGISTERED XA_MARK_1 + +static unsigned int rdma_dev_net_id; +/* + * Shadow net namespace entries maintained in xarray, which are referred + * by net life cycle routines (init_net/exit_net) and device life cycle + * routines (reg_dev/unreg_dev). + * Without this shadow list, if device life cycle routines access the + * net stack's net ns list, it can miss out to consider a net ns whose + * init_net() is executed but entry is not part of the net ns list in + * setup_net(). + */ +static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_TRACK_FREE); +/* + * rwsem to protect accessing the rdma_nets xarray entries. + */ +static DECLARE_RWSEM(rdma_nets_rwsem); + /* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. 
So we need our own iterator to see all values stored in @@ -226,6 +272,26 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } +static int rename_compat_devs(struct ib_device *device) +{ + struct ib_compat_device *cdev; + unsigned long index; + int ret = 0; + + mutex_lock(&device->compat_devs_mutex); + xa_for_each (&device->compat_devs, index, cdev) { + ret = device_rename(&cdev->coredev.dev, dev_name(&device->dev)); + if (ret) { + dev_warn(&cdev->coredev.dev, + "Fail to rename compatdev to new name %s\n", + dev_name(&device->dev)); + break; + } + } + mutex_unlock(&device->compat_devs_mutex); + return ret; +} + int ib_device_rename(struct ib_device *ibdev, const char *name) { int ret; @@ -245,6 +311,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) if (ret) goto out; strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); + ret = rename_compat_devs(ibdev); out: up_write(&devices_rwsem); return ret; @@ -314,7 +381,10 @@ static int ib_device_uevent(struct device *device, static const void *net_namespace(struct device *d) { - return &init_net; + struct ib_core_device *coredev = + container_of(d, struct ib_core_device, dev); + + return read_pnet(&coredev->rdma_net); } static struct class ib_class = { @@ -325,8 +395,8 @@ static struct class ib_class = { .namespace = net_namespace, }; -static void rdma_init_coredev(struct ib_core_device *coredev, - struct ib_device *dev) +void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev, + struct net *net) { /* This BUILD_BUG_ON is intended to catch layout change * of union of ib_core_device and device. 
@@ -342,6 +412,7 @@ static void rdma_init_coredev(struct ib_core_device *coredev, device_initialize(&coredev->dev); coredev->owner = dev; INIT_LIST_HEAD(&coredev->port_list); + write_pnet(&coredev->rdma_net, net); } /** @@ -371,7 +442,7 @@ struct ib_device *_ib_alloc_device(size_t size) } device->groups[0] = &ib_dev_attr_group; - rdma_init_coredev(&device->coredev, device); + rdma_init_coredev(&device->coredev, device, &init_net); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); @@ -381,6 +452,8 @@ struct ib_device *_ib_alloc_device(size_t size) */ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); init_rwsem(&device->client_data_rwsem); + xa_init_flags(&device->compat_devs, XA_FLAGS_TRACK_FREE); + mutex_init(&device->compat_devs_mutex); init_completion(&device->unreg_completion); return device; @@ -395,6 +468,7 @@ EXPORT_SYMBOL(_ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { + WARN_ON(!xa_empty(&device->compat_devs)); WARN_ON(!xa_empty(&device->client_data)); WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); @@ -593,6 +667,182 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, return NOTIFY_OK; } +static void compatdev_release(struct device *dev) +{ + struct ib_compat_device *cdev = + container_of(dev, struct ib_compat_device, coredev.dev); + + kfree(cdev); +} + +static int add_one_compat_dev(struct ib_device *device, + struct rdma_dev_net *rnet) +{ + struct ib_compat_device *cdev; + int ret; + + /* create and add compat device in all namespaces other than + * where it is currently bound to. + */ + if (net_eq(read_pnet(&rnet->net), + read_pnet(&device->coredev.rdma_net))) + return 0; + + /* Whichever path among init_net() or ib_register_device() takes the + * compat_devs_mutex first, will be adding the compat devices. So if it's + * already added, don't add again. 
+ */ + mutex_lock(&device->compat_devs_mutex); + cdev = xa_load(&device->compat_devs, rnet->id); + if (cdev) { + /* entry already exist for a net, no need to add again. */ + ret = 0; + goto done; + } + + cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); + if (!cdev) { + ret = -ENOMEM; + goto done; + } + cdev->coredev.dev.parent = device->dev.parent; + cdev->id = rnet->id; + + rdma_init_coredev(&cdev->coredev, device, read_pnet(&rnet->net)); + cdev->coredev.dev.release = compatdev_release; + dev_set_name(&cdev->coredev.dev, "%s", dev_name(&device->dev)); + + ret = device_add(&cdev->coredev.dev); + if (ret) + goto add_err; + + ret = xa_insert(&device->compat_devs, rnet->id, cdev, GFP_KERNEL); + if (ret) + goto insert_err; + + mutex_unlock(&device->compat_devs_mutex); + return 0; + +insert_err: + device_del(&cdev->coredev.dev); +add_err: + put_device(&cdev->coredev.dev); +done: + mutex_unlock(&device->compat_devs_mutex); + return ret; +} + +static void remove_one_compat_dev(struct ib_device *device, u32 id) +{ + struct ib_compat_device *cdev; + + mutex_lock(&device->compat_devs_mutex); + cdev = xa_erase(&device->compat_devs, id); + mutex_unlock(&device->compat_devs_mutex); + if (cdev) { + device_del(&cdev->coredev.dev); + put_device(&cdev->coredev.dev); + } +} + +static void remove_compat_devs(struct ib_device *device) +{ + struct ib_compat_device *cdev; + unsigned long index; + + xa_for_each (&device->compat_devs, index, cdev) + remove_one_compat_dev(device, index); +} + +static int add_compat_devs(struct ib_device *device) +{ + struct rdma_dev_net *rnet = NULL; + unsigned long index; + int ret = 0; + + down_read(&rdma_nets_rwsem); + xa_for_each_marked (&rdma_nets, index, rnet, NET_NS_REGISTERED) { + ret = add_one_compat_dev(device, rnet); + if (ret) + break; + } + up_read(&rdma_nets_rwsem); + return ret; +} + +static void rdma_dev_exit_net(struct net *net) +{ + struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); + struct ib_device *dev; + unsigned long index; + 
+ /* Clear the net for REGISTERED state, so that any new device + * registration doesn't use this net to add compat devices. + */ + down_write(&rdma_nets_rwsem); + xa_clear_mark(&rdma_nets, rnet->id, NET_NS_REGISTERED); + up_write(&rdma_nets_rwsem); + + down_read(&devices_rwsem); + xa_for_each (&devices, index, dev) { + /* hold device reference to not free the device + * while working on removing its compat devices. + */ + get_device(&dev->dev); + /* Release the devices_rwsem so that potentially + * blocking device_del, doesn't hold the devices_rwsem + * for too long. + */ + up_read(&devices_rwsem); + remove_one_compat_dev(dev, rnet->id); + put_device(&dev->dev); + down_read(&devices_rwsem); + } + up_read(&devices_rwsem); + + down_write(&rdma_nets_rwsem); + xa_erase(&rdma_nets, rnet->id); + up_write(&rdma_nets_rwsem); +} + +static __net_init int rdma_dev_init_net(struct net *net) +{ + struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); + unsigned long index; + struct ib_device *dev; + int ret; + + /* No need to create any compat devices in default init_net. */ + if (net_eq(net, &init_net)) + return 0; + + write_pnet(&rnet->net, net); + + down_write(&rdma_nets_rwsem); + ret = xa_alloc(&rdma_nets, &rnet->id, U32_MAX, rnet, GFP_KERNEL); + if (!ret) + xa_set_mark(&rdma_nets, rnet->id, NET_NS_REGISTERED); + up_write(&rdma_nets_rwsem); + if (ret) + return ret; + + /* Hold devices_rwsem to synchronize with disable_device(), + * rename_device() so that we don't add compat devices for a device + * which may be undergoing unregistration sequence. + */ + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + ret = add_one_compat_dev(dev, rnet); + if (ret) + break; + } + up_read(&devices_rwsem); + + if (ret) + rdma_dev_exit_net(net); + return ret; +} + /* * Assign the unique string device name and the unique device index. 
*/ @@ -810,8 +1060,18 @@ int ib_register_device(struct ib_device *device, const char *name) if (ret) goto sysfs_cleanup; + ret = add_compat_devs(device); + if (ret) + goto disable_cleanup; + return 0; +disable_cleanup: + disable_device(device); + /* Cleanup any compat devices which are added by init_net() before + * disabling the device. + */ + remove_compat_devs(device); sysfs_cleanup: ib_device_unregister_sysfs(device); dev_cleanup: @@ -834,6 +1094,11 @@ EXPORT_SYMBOL(ib_register_device); void ib_unregister_device(struct ib_device *device) { disable_device(device); + /* compat devices must be removed after device refcount drops to zero. + * Otherwise init_net() may add more compatdevs after removing + * compat devices and before device is disabled. + */ + remove_compat_devs(device); ib_device_unregister_sysfs(device); device_del(&device->dev); ib_device_unregister_rdmacg(device); @@ -842,6 +1107,13 @@ void ib_unregister_device(struct ib_device *device) } EXPORT_SYMBOL(ib_unregister_device); +static struct pernet_operations rdma_dev_net_ops = { + .init = rdma_dev_init_net, + .exit = rdma_dev_exit_net, + .id = &rdma_dev_net_id, + .size = sizeof(struct rdma_dev_net), +}; + static int assign_client_id(struct ib_client *client) { int ret; @@ -1531,12 +1803,20 @@ static int __init ib_core_init(void) goto err_sa; } + ret = register_pernet_device(&rdma_dev_net_ops); + if (ret) { + pr_warn("Couldn't init compat dev. 
ret %d\n", ret); + goto err_compat; + } + nldev_init(); rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); roce_gid_mgmt_init(); return 0; +err_compat: + unregister_lsm_notifier(&ibdev_lsm_nb); err_sa: ib_sa_cleanup(); err_mad: @@ -1561,6 +1841,7 @@ static void __exit ib_core_cleanup(void) roce_gid_mgmt_cleanup(); nldev_exit(); rdma_nl_unregister(RDMA_NL_LS); + unregister_pernet_device(&rdma_dev_net_ops); unregister_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 30314376d032..3e78551b0bfa 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2539,6 +2539,7 @@ struct ib_core_device { * union of ib_core_device and device exists in ib_device. */ struct device dev; + possible_net_t rdma_net; struct kobject *ports_kobj; struct list_head port_list; struct ib_device *owner; /* reach back to owner ib_device */ @@ -2613,6 +2614,11 @@ struct ib_device { */ refcount_t refcount; struct completion unreg_completion; + + /* Protects compat_devs xarray modifications */ + struct mutex compat_devs_mutex; + /* Maintains compat devices for each net namespace */ + struct xarray compat_devs; }; struct ib_client { -- 2.19.1