From: Parav Pandit <parav@xxxxxxxxxxxx> Introduce a helper function that changes rdma device's net namespace which performs mini disable enable sequence to have device visible only in assigned net namespace. device unregistration, device rename and device change net namespace may be invoked concurrently. (a) device unregistration needs to wait until device change (rename or net namespace change) operation is in progress. (b) device net namespace change should not proceed if the unregistration has started. (c) while one cpu is changing device net namespace, other cpu should not be able to rename or change net namespace. To address above concurrency, (a) Use unreg_mutex to synchronize between ib_unregister_device() and net namespace change operation (b) In cases where unregister_device() has started unregistration before change_netns got chance to acquire unreg_mutex, validate the refcount - if it dropped to zero, abort the net namespace change operation. Finally use the helper function to change net namespace of ib device to move the device back to init_net when such net is deleted. Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx> Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx> --- drivers/infiniband/core/device.c | 121 ++++++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 477b787bd083..48805e00b115 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -201,6 +201,8 @@ static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; +static int __rdma_dev_change_netns(struct ib_device *device, struct net *net); + /* Pointer to the RCU head at the start of the ib_port_data array */ struct ib_port_data_rcu { struct rcu_head rcu_head; @@ -861,6 +863,14 @@ static int add_compat_devs(struct ib_device *device) unsigned long index; int ret = 0; + /* If we didn't get a reference for a registered it must be + * undergoing a net namespace change via netlink command. + * In such case its not an error and no need to add any compat + * devices either. + */ + if (!ib_device_try_get(device)) + return 0; + down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, index, rnet) { ret = add_one_compat_dev(device, rnet); @@ -868,6 +878,7 @@ static int add_compat_devs(struct ib_device *device) break; } up_read(&rdma_nets_rwsem); + ib_device_put(device); return ret; } @@ -952,6 +963,29 @@ int rdma_compatdev_set(u8 enable) return ret; } +static void devices_to_default_net(struct net *net) +{ + struct ib_device *dev; + unsigned long index; + + down_read(&devices_rwsem); + /* If net namespace change is initiated by the time we reach here, + * or if device unregistration has started, don't consider those + * devices. + */ + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + if (!net_eq(net, read_pnet(&dev->coredev.rdma_net))) + continue; + + up_read(&devices_rwsem); + mutex_lock(&dev->unregistration_lock); + __rdma_dev_change_netns(dev, &init_net); + mutex_unlock(&dev->unregistration_lock); + down_read(&devices_rwsem); + } + up_read(&devices_rwsem); +} + static void rdma_dev_exit_net(struct net *net) { struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); @@ -983,7 +1017,7 @@ static void rdma_dev_exit_net(struct net *net) } up_read(&devices_rwsem); - xa_erase(&rdma_nets, rnet->id); + devices_to_default_net(net); } static __net_init int rdma_dev_init_net(struct net *net) @@ -1433,6 +1467,91 @@ void ib_unregister_device_queued(struct ib_device *ib_dev) } EXPORT_SYMBOL(ib_unregister_device_queued); +static int __rdma_dev_change_netns(struct ib_device *device, struct net *net) +{ + int ret; + + disable_device(device); + + /* At this point no one would be using the device, + * so it is safe to change the device net namespace. + */ + write_pnet(&device->coredev.rdma_net, net); + /* It is safe to rename the device here without holding + * devices_rwsem, because all references to the device would + * have dropped by now. + */ + ret = device_rename(&device->dev, dev_name(&device->dev)); + if (ret) { + dev_warn(&device->dev, "%s Couldn't rename device\n", __func__); + return ret; + } + ret = enable_device_and_get(device); + if (ret) { + /* This shouldn't really happen, but if it does, let + * user retry at later point. So don't disable the + * device. + */ + dev_warn(&device->dev, + "%s Couldn't re-enable device\n", __func__); + } + ib_device_put(device); + return ret; +} + +static int rdma_dev_change_netns_with_put(struct ib_device *dev, + struct net *net) +{ + int ret; + + /* + * Currently supported only for those providers which supports + * disassociation and those which doesn't do port specific sysfs + * init. Once a port_cleanup infrastructure is implemented, this + * limitation will be removed. + */ + if (!dev->ops.disassociate_ucontext || dev->ops.init_port) { + ib_device_put(dev); + return -EOPNOTSUPP; + } + + /* We must drop the device reference; otherwise disable_device() + * who is waiting for refcount to drop to zero while holding + * unregistration_lock won't happen. This can lead to deadlock. + * Also if we give up the ib_device reference here, + * ib_unregister_device() followed by ib_dealloc_device() can proceeed + * to free this device. Therefore, this thread must hold a reference + * to the device structure so that it cannot get freed, while this + * thread works on it. + */ + get_device(&dev->dev); + ib_device_put(dev); + + mutex_lock(&dev->unregistration_lock); + /* If the refcount is zero, it means disable_device() has progressed + * first to disable the device. There is no point in changing net + * namespace of device which is on path of removal. So fail the request. + */ + if (refcount_read(&dev->refcount) == 0) { + ret = -ENODEV; + goto done; + } + /* If net namespace was changed by the time this thread gets to it, + * return an error. + */ + if (!net_eq(current->nsproxy->net_ns, + read_pnet(&dev->coredev.rdma_net))) { + ret = -ENODEV; + goto done; + } + ret = __rdma_dev_change_netns(dev, net); + +done: + mutex_unlock(&dev->unregistration_lock); + put_device(&dev->dev); + return ret; +} + static struct pernet_operations rdma_dev_net_ops = { .init = rdma_dev_init_net, .exit = rdma_dev_exit_net, -- 2.19.1