From: Parav Pandit <parav@xxxxxxxxxxxx>

Implement a compatibility layer for the ib_core sysfs entries so that
non init_net net namespaces can also discover rdma devices.

An ib_core_device is created in each non init_net net namespace. Its
sysfs tree resembles that of the rdma devices found in the init_net
namespace.

This allows rdma devices to be discovered in multiple non init_net net
namespaces via sysfs entries, which is helpful to rdma-core userspace.

Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx>
Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
---
 drivers/infiniband/core/Makefile       |   2 +-
 drivers/infiniband/core/compat_sysfs.c | 183 +++++++++++++++++++++++++
 drivers/infiniband/core/core_priv.h    |  11 ++
 drivers/infiniband/core/device.c       |  30 +++-
 include/rdma/ib_verbs.h                |  16 +++
 5 files changed, 235 insertions(+), 7 deletions(-)
 create mode 100644 drivers/infiniband/core/compat_sysfs.c

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 867cee5e27b2..781b3172e596 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
 				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
-				nldev.o restrack.o
+				nldev.o restrack.o compat_sysfs.o
 
 ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
diff --git a/drivers/infiniband/core/compat_sysfs.c b/drivers/infiniband/core/compat_sysfs.c
new file mode 100644
index 000000000000..6fa330678063
--- /dev/null
+++ b/drivers/infiniband/core/compat_sysfs.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved.
+ */
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <rdma/ib_verbs.h>
+#include "core_priv.h"
+
+/**
+ * struct ib_compat_device - rdma compat device per net namespace
+ * @coredev: IB core device
+ * @list: list entry
+ */
+struct ib_compat_device {
+	struct ib_core_device coredev;
+	struct list_head list;
+};
+
+struct rdma_compatdev_net {
+	/* List of compat devices of a net namespace (except for init_net).
+	 * It is used to destroy compat devices when either the parent rdma
+	 * device is removed, or when a given net namespace is removed.
+	 */
+	struct list_head compatdev_list;
+
+	/* Semaphore to protect the compat devices during device add/remove
+	 * and during net namespace add/remove operations.
+	 */
+	struct rw_semaphore compat_rwsem;
+};
+
+static unsigned int rdma_compatdev_net_id;
+
+static void rdma_compatdev_release(struct device *dev)
+{
+	struct ib_core_device *coredev =
+		container_of(dev, struct ib_core_device, dev);
+	struct ib_compat_device *cdev =
+		container_of(coredev, struct ib_compat_device, coredev);
+
+	kfree(cdev);
+}
+
+static void rdma_compatdev_create(struct ib_device *device, struct net *net)
+{
+	struct rdma_compatdev_net *rdma_net =
+		net_generic(net, rdma_compatdev_net_id);
+	struct ib_compat_device *cdev;
+	int ret;
+
+	cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
+	if (!cdev)
+		return;
+	cdev->coredev.dev.parent = device->dev.parent;
+
+	rdma_init_coredev(&cdev->coredev, device);
+	rdma_dev_net_set(&cdev->coredev, net);
+	cdev->coredev.dev.release = rdma_compatdev_release;
+	dev_set_name(&cdev->coredev.dev, "%s", device->name);
+
+	ret = device_add(&cdev->coredev.dev);
+	if (ret) {
+		kfree(cdev);
+		return;
+	}
+
+	down_write(&rdma_net->compat_rwsem);
+	list_add_tail(&cdev->list, &rdma_net->compatdev_list);
+	up_write(&rdma_net->compat_rwsem);
+}
+
+void rdma_compatdev_add(struct ib_device *device)
+{
+	struct net *net;
+
+	down_read(&net_rwsem);
+	/* Create and add a compat device in all namespaces other than the
+	 * one the device is currently bound to.
+	 */
+	for_each_net(net) {
+		if (!net_eq(net, rdma_dev_net(&device->coredev))) {
+			/* device_add() and the list operation need to be done
+			 * under the net_rwsem lock because they need to
+			 * synchronize with the net namespace del operation.
+			 */
+			rdma_compatdev_create(device, net);
+		}
+	}
+	up_read(&net_rwsem);
+}
+
+static void remove_one_compatdev(struct ib_compat_device *cdev)
+{
+	list_del(&cdev->list);
+	device_unregister(&cdev->coredev.dev);
+}
+
+void rdma_compatdev_remove(struct ib_device *device)
+{
+	struct rdma_compatdev_net *rdma_net;
+	struct ib_compat_device *cur, *tmp;
+	struct net *net;
+
+	/* Hold net_rwsem while removing compat dev entries to synchronize
+	 * with _exit_net()/_init_net().
+	 */
+	down_read(&net_rwsem);
+	for_each_net(net) {
+		rdma_net = net_generic(net, rdma_compatdev_net_id);
+		down_write(&rdma_net->compat_rwsem);
+		list_for_each_entry_safe(cur, tmp,
+					 &rdma_net->compatdev_list, list) {
+			if (device == rdma_device_to_ibdev(&cur->coredev.dev)) {
+				/* Found the matching compat device, cleanup */
+				remove_one_compatdev(cur);
+				break;
+			}
+		}
+		up_write(&rdma_net->compat_rwsem);
+	}
+	up_read(&net_rwsem);
+}
+
+static __net_init int rdma_compatdev_init_net(struct net *net)
+{
+	struct rdma_compatdev_net *rdma_net =
+		net_generic(net, rdma_compatdev_net_id);
+	struct ib_device *device;
+
+	INIT_LIST_HEAD(&rdma_net->compatdev_list);
+	init_rwsem(&rdma_net->compat_rwsem);
+
+	/* No need to create any compat devices in init_net. */
+	if (net_eq(net, &init_net))
+		return 0;
+
+	/* Hold the device mutex to synchronize with ib_register_device(),
+	 * which also tries to add compat devices.
+	 */
+	mutex_lock(&ib_device_mutex);
+	/* Hold the ib_lists_rwsem read lock so that we do not assume that
+	 * ib_device_mutex is always held while accessing ib_device_list.
+	 */
+	down_read(&ib_lists_rwsem);
+	list_for_each_entry(device, &ib_device_list, core_list)
+		rdma_compatdev_create(device, net);
+	up_read(&ib_lists_rwsem);
+	mutex_unlock(&ib_device_mutex);
+	return 0;
+}
+
+static void rdma_compatdev_exit_net(struct net *net)
+{
+	struct rdma_compatdev_net *rdma_net =
+		net_generic(net, rdma_compatdev_net_id);
+	struct ib_compat_device *cur, *tmp;
+
+	mutex_lock(&ib_device_mutex);
+	down_write(&rdma_net->compat_rwsem);
+	list_for_each_entry_safe(cur, tmp, &rdma_net->compatdev_list, list)
+		remove_one_compatdev(cur);
+	up_write(&rdma_net->compat_rwsem);
+	mutex_unlock(&ib_device_mutex);
+}
+
+static struct pernet_operations rdma_compat_net_ops = {
+	.init = rdma_compatdev_init_net,
+	.exit = rdma_compatdev_exit_net,
+	.id = &rdma_compatdev_net_id,
+	.size = sizeof(struct rdma_compatdev_net),
+};
+
+int __init rdma_compat_dev_init(void)
+{
+	return register_pernet_device(&rdma_compat_net_ops);
+}
+
+void __exit rdma_compat_dev_cleanup(void)
+{
+	unregister_pernet_device(&rdma_compat_net_ops);
+}
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 0da0a26c02b9..ea04926f6b0c 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -55,6 +55,9 @@ struct pkey_index_qp_list {
 };
 
 extern const struct attribute_group ib_dev_attr_group;
+extern struct list_head ib_device_list;
+extern struct mutex ib_device_mutex;
+extern struct rw_semaphore ib_lists_rwsem;
 
 int ib_device_register_sysfs(struct ib_device *device,
 			     int (*port_callback)(struct ib_device *,
@@ -323,4 +326,12 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec,
 				 const struct ib_gid_attr *attr);
 
 struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr);
+
+void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev);
+
+int __init rdma_compat_dev_init(void);
+void __exit rdma_compat_dev_cleanup(void);
+
+void rdma_compatdev_add(struct ib_device *device);
+void rdma_compatdev_remove(struct ib_device *device);
 #endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index e8ae8699cb87..2d3939dc6d22 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL_GPL(ib_wq);
 /* The ib_device_list and client_list contain devices and clients after their
  * registration has completed, and the devices and clients are removed
  * during unregistration. */
-static LIST_HEAD(ib_device_list);
+LIST_HEAD(ib_device_list);
 static LIST_HEAD(client_list);
 
 /*
@@ -82,8 +82,8 @@ static LIST_HEAD(client_list);
  *
  * ib_lists_rwsem also protects access to the client data list.
 */
-static DEFINE_MUTEX(ib_device_mutex);
-static DECLARE_RWSEM(ib_lists_rwsem);
+DEFINE_MUTEX(ib_device_mutex);
+DECLARE_RWSEM(ib_lists_rwsem);
 
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
 			      void *lsm_data);
@@ -267,7 +267,10 @@ static int ib_device_uevent(struct device *device,
 
 static const void *net_namespace(struct device *d)
 {
-	return &init_net;
+	struct ib_core_device *coredev =
+		container_of(d, struct ib_core_device, dev);
+
+	return rdma_dev_net(coredev);
 }
 
 static struct class ib_class = {
@@ -278,8 +281,7 @@ static struct class ib_class = {
 	.namespace = net_namespace,
 };
 
-static void rdma_init_coredev(struct ib_core_device *coredev,
-			      struct ib_device *dev)
+void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev)
 {
 	/* This BUILD_BUG_ON is intended to catch layout change
 	 * of union of ib_core_device and device.
@@ -319,6 +321,7 @@ struct ib_device *ib_alloc_device(size_t size)
 
 	device->groups[0] = &ib_dev_attr_group;
 	rdma_init_coredev(&device->coredev, device);
+	rdma_dev_net_set(&device->coredev, &init_net);
 
 	INIT_LIST_HEAD(&device->event_handler_list);
 	spin_lock_init(&device->event_handler_lock);
@@ -642,6 +645,11 @@ int ib_register_device(struct ib_device *device, const char *name,
 		goto cg_cleanup;
 	}
 
+	/* Perform this under device mutex lock, so that it can synchronize
+	 * with _init_net() to avoid duplicate additions for a given device.
+	 */
+	rdma_compatdev_add(device);
+
 	device->reg_state = IB_DEV_REGISTERED;
 
 	list_for_each_entry(client, &client_list, list)
@@ -698,6 +706,7 @@ void ib_unregister_device(struct ib_device *device)
 	}
 	up_read(&ib_lists_rwsem);
 
+	rdma_compatdev_remove(device);
 	ib_device_unregister_sysfs(device);
 	ib_device_unregister_rdmacg(device);
 
@@ -1314,12 +1323,20 @@ static int __init ib_core_init(void)
 		goto err_sa;
 	}
 
+	ret = rdma_compat_dev_init();
+	if (ret) {
+		pr_warn("Couldn't init compat dev. ret %d\n", ret);
+		goto err_compat;
+	}
+
 	nldev_init();
 	rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
 	roce_gid_mgmt_init();
 
 	return 0;
 
+err_compat:
+	unregister_lsm_notifier(&ibdev_lsm_nb);
 err_sa:
 	ib_sa_cleanup();
 err_mad:
@@ -1344,6 +1361,7 @@ static void __exit ib_core_cleanup(void)
 	roce_gid_mgmt_cleanup();
 	nldev_exit();
 	rdma_nl_unregister(RDMA_NL_LS);
+	rdma_compat_dev_cleanup();
 	unregister_lsm_notifier(&ibdev_lsm_nb);
 	ib_sa_cleanup();
 	ib_mad_cleanup();
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index f9bedf28e292..6beb3a4f3c22 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2262,6 +2262,7 @@ struct ib_core_device {
 	 * union of ib_core_device and device exists in ib_device.
 	 */
 	struct device dev;
+	possible_net_t rdma_net;
 	struct kobject *ports_kobj;
 	struct list_head port_list;
 	struct ib_device *owner; /* reach back to owner ib_device */
@@ -2625,6 +2626,21 @@ struct ib_device {
 	struct completion unreg_completion;
 };
 
+/*
+ * Net namespace inlines
+ */
+static inline
+struct net *rdma_dev_net(const struct ib_core_device *coredev)
+{
+	return read_pnet(&coredev->rdma_net);
+}
+
+static inline
+void rdma_dev_net_set(struct ib_core_device *coredev, struct net *net)
+{
+	write_pnet(&coredev->rdma_net, net);
+}
+
 struct ib_client {
 	char *name;
 	void (*add)    (struct ib_device *);
-- 
2.19.1