On Sun, Mar 31, 2019 at 07:36:43PM +0300, Leon Romanovsky wrote: > From: Parav Pandit <parav@xxxxxxxxxxxx> > > Introduce a helper function that changes rdma device's net namespace > which performs mini disable enable sequence to have device visible > only in assigned net namespace. > > device unregistration, device rename and device change net namespace > may be invoked concurrently. > (a) device unregistration needs to wait until device change (rename > or net namespace change) operation is in progress. > (b) device net namespace change should not proceed if the > unregistration has started. > (c) while one cpu is changing device net namespace, other cpu should > not be able to rename or change net namespace. > > To address above concurrency, > (a) Use unreg_mutex to synchronize between ib_unregister_device() and > net namespace change operation > (b) In cases where unregister_device() has started unregistration before > change_netns got chance to acquire unreg_mutex, validate the refcount - > if it dropped to zero, abort the net namespace change operation. > > Finally use the helper function to change net namespace of ib device to > move the device back to init_net when such net is deleted. > > Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx> > Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx> > --- > drivers/infiniband/core/device.c | 123 +++++++++++++++++++++++++++++++ > 1 file changed, 123 insertions(+) > > -- > 2.20.1 > > diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c > index 55fccbb0aabd..307e729cb8f4 100644 > --- a/drivers/infiniband/core/device.c > +++ b/drivers/infiniband/core/device.c > @@ -201,6 +201,8 @@ static struct notifier_block ibdev_lsm_nb = { > .notifier_call = ib_security_change, > }; > > +static int __rdma_dev_change_netns(struct ib_device *device, struct net *net); > + > /* Pointer to the RCU head at the start of the ib_port_data array */ > struct ib_port_data_rcu { > struct rcu_head rcu_head; > @@ -861,6 +863,8 @@ static int add_compat_devs(struct ib_device *device) > unsigned long index; > int ret = 0; > > + lockdep_assert_held(&devices_rwsem); > + > down_read(&rdma_nets_rwsem); > xa_for_each (&rdma_nets, index, rnet) { > ret = add_one_compat_dev(device, rnet); > @@ -952,6 +956,32 @@ int rdma_compatdev_set(u8 enable) > return ret; > } > > +static void devices_to_default_net(struct net *net) > +{ > + struct ib_device *dev; > + unsigned long index; > + > + down_read(&devices_rwsem); > + /* If net namespace change is initiated by the time we reach here, > + * or if device unregistration has started, don't consider those > + * devices. > + */ infiniband/core uses the /* * foo */ Style of multi-line comment for some reason. > + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { > + if (!net_eq(net, read_pnet(&dev->coredev.rdma_net))) > + continue; > + > + /* Hold parent device reference to avoid getting freed */ > + get_device(&dev->dev); > + up_read(&devices_rwsem); > + mutex_lock(&dev->unregistration_lock); > + __rdma_dev_change_netns(dev, &init_net); > + mutex_unlock(&dev->unregistration_lock); > + put_device(&dev->dev); > + down_read(&devices_rwsem); > + } > + up_read(&devices_rwsem); > +} This is better just as part of the loop in rdma_dev_exit_net() > + > static void rdma_dev_exit_net(struct net *net) > { > struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); > @@ -983,6 +1013,7 @@ static void rdma_dev_exit_net(struct net *net) > } > up_read(&devices_rwsem); > > + devices_to_default_net(net); > xa_erase(&rdma_nets, rnet->id); > } > > @@ -1425,6 +1456,98 @@ void ib_unregister_device_queued(struct ib_device *ib_dev) > } > EXPORT_SYMBOL(ib_unregister_device_queued); > > +static int __rdma_dev_change_netns(struct ib_device *device, struct net *net) > +{ > + int ret; > + > + /* If the refcount is zero, it means disable_device() has progressed > + * first during unregistration phase. There is no point in changing net > + * namespace of device which is on path of removal. > + */ > + if (refcount_read(&device->refcount) == 0) > + return -ENODEV; > + > + disable_device(device); > + > + /* At this point no one would be using the device, > + * so it is safe to change the device net namespace. > + */ > + write_pnet(&device->coredev.rdma_net, net); > + /* Currently rdma devices are system wide unique. So device > + * name is guaranteed free in new namespace. Publish new > + * namespace at sysfs level. > + */ > + ret = device_rename(&device->dev, dev_name(&device->dev)); > + if (ret) { > + dev_warn(&device->dev, "%s Couldn't rename device\n", __func__); > + return ret; Hurm.. Technically we are supposed to hold the devices_rwsem if we are manipulating the name. Inferring that we don't have to because the refcount is 0 is a little over optimizing for my tastes .. and if this fails the device is now permanently stuck with a 0 refcount.. So we should re-enable it anyhow. The sysfs will be screwed up but what can you do.. > + if (ret) { > + /* This shouldn't really happen, but if it does, let > + * user retry at later point. So don't disable the > + * device. > + */ > + dev_warn(&device->dev, > + "%s Couldn't re-enable device\n", __func__); > + } > + > + ib_device_put(device); > + return ret; > +} > + > +int rdma_dev_change_netns_with_put(struct ib_device *dev, struct net *net) > +{ Missing static? > + int ret; > + > + /* > + * Currently supported only for those providers which supports > + * disassociation and those which doesn't do port specific sysfs > + * init. Once a port_cleanup infrastructure is implemented, this > + * limitation will be removed. > + */ > + if (!dev->ops.disassociate_ucontext || dev->ops.init_port) { > + ib_device_put(dev); > + return -EOPNOTSUPP; > + } I feel like this should also be prevented if ib_devices_shared_netns?? > + /* We must drop the device reference; otherwise disable_device() > + * who is waiting for refcount to drop to zero while holding > + * unregistration_lock won't happen. This can lead to deadlock. > + * Also if we give up the ib_device reference here, > + * ib_unregister_device() followed by ib_dealloc_device() can proceeed > + * to free this device. Therefore, this thread must hold a reference > + * to the device structure so that it cannot get freed, while this > + * thread works on it. > + */ > + get_device(&dev->dev); > + ib_device_put(dev); > + > + mutex_lock(&dev->unregistration_lock); > + /* If the refcount is zero, it means disable_device() has progressed > + * first to disable the device. There is no point in changing net > + * namespace of device which is on path of removal. So fail the request. > + */ > + if (refcount_read(&dev->refcount) == 0) { > + ret = -ENODEV; > + goto done; > + } This is redundant, __rdma_dev_change_netns already does this test.. > + /* If net namespace was changed by the time this thread gets to it, > + * return an error. > + */ > + if (!net_eq(current->nsproxy->net_ns, > + read_pnet(&dev->coredev.rdma_net))) { > + ret = -ENODEV; > + goto done; > + } And this needs to be inside __rdma_dev_change_netns as the other caller technically needs it too. .. and if we are doing that then the lock should be absorbed into __rdma_dev_change_netns as well so it is a self-consistent function. What about the kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); kobject_uevent(&dev->dev.kobj, KOBJ_ADD); Stuff that netdev does? Shouldn't we do that too? KOBJ_REMOVE is part of triggering userspace to let go of its processes too.. I ended up with a delta like this: diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 2b8254dd4c67ac..4458443b96a72e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -201,7 +201,8 @@ static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; -static int __rdma_dev_change_netns(struct ib_device *device, struct net *net); +static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, + struct net *net); /* Pointer to the RCU head at the start of the ib_port_data array */ struct ib_port_data_rcu { @@ -956,32 +957,6 @@ int rdma_compatdev_set(u8 enable) return ret; } -static void devices_to_default_net(struct net *net) -{ - struct ib_device *dev; - unsigned long index; - - down_read(&devices_rwsem); - /* If net namespace change is initiated by the time we reach here, - * or if device unregistration has started, don't consider those - * devices. - */ - xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { - if (!net_eq(net, read_pnet(&dev->coredev.rdma_net))) - continue; - - /* Hold parent device reference to avoid getting freed */ - get_device(&dev->dev); - up_read(&devices_rwsem); - mutex_lock(&dev->unregistration_lock); - __rdma_dev_change_netns(dev, &init_net); - mutex_unlock(&dev->unregistration_lock); - put_device(&dev->dev); - down_read(&devices_rwsem); - } - up_read(&devices_rwsem); -} - static void rdma_dev_exit_net(struct net *net) { struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); @@ -1008,12 +983,16 @@ static void rdma_dev_exit_net(struct net *net) remove_one_compat_dev(dev, rnet->id); + /* + * If the real device is in the NS then move it back to init. + */ + rdma_dev_change_netns(dev, net, &init_net); + put_device(&dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); - devices_to_default_net(net); xa_erase(&rdma_nets, rnet->id); } @@ -1459,96 +1438,74 @@ void ib_unregister_device_queued(struct ib_device *ib_dev) } EXPORT_SYMBOL(ib_unregister_device_queued); -static int __rdma_dev_change_netns(struct ib_device *device, struct net *net) +/* + * The caller must pass in a device that has the kref held and the refcount + * released. If the device is in cur_net and still registered then it is moved + * into net. + */ +static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, + struct net *net) { + int ret2 = -EINVAL; int ret; - /* If the refcount is zero, it means disable_device() has progressed - * first during unregistration phase. There is no point in changing net - * namespace of device which is on path of removal. + mutex_lock(&device->unregistration_lock); + + /* + * If a device not under ib_device_get() or the unregistration_lock + * the namespace can be changed, or it can be unregistered. Check + * again under the lock. */ - if (refcount_read(&device->refcount) == 0) - return -ENODEV; + if (!net_eq(current->nsproxy->net_ns, + read_pnet(&device->coredev.rdma_net)) || + refcount_read(&device->refcount) == 0) { + ret = -ENODEV; + goto out; + } + kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); disable_device(device); - /* At this point no one would be using the device, - * so it is safe to change the device net namespace. + /* + * At this point no one can be using the device, so it is safe to + * change the namespace. */ write_pnet(&device->coredev.rdma_net, net); - /* Currently rdma devices are system wide unique. So device - * name is guaranteed free in new namespace. Publish new - * namespace at sysfs level. + + /* + * Currently rdma devices are system wide unique. So the device name + * is guaranteed free in the new namespace. Publish the new namespace + * at the sysfs level. */ + down_read(&devices_rwsem); ret = device_rename(&device->dev, dev_name(&device->dev)); + up_read(&devices_rwsem); if (ret) { - dev_warn(&device->dev, "%s Couldn't rename device\n", __func__); - return ret; - } - ret = enable_device_and_get(device); - if (ret) { - /* This shouldn't really happen, but if it does, let - * user retry at later point. So don't disable the - * device. - */ dev_warn(&device->dev, - "%s Couldn't re-enable device\n", __func__); - } - - ib_device_put(device); - return ret; -} - -int rdma_dev_change_netns_with_put(struct ib_device *dev, struct net *net) -{ - int ret; - - /* - * Currently supported only for those providers which supports - * disassociation and those which doesn't do port specific sysfs - * init. Once a port_cleanup infrastructure is implemented, this - * limitation will be removed. - */ - if (!dev->ops.disassociate_ucontext || dev->ops.init_port) { - ib_device_put(dev); - return -EOPNOTSUPP; + "%s: Couldn't rename device after namespace change\n", + __func__); + /* Try and put things back and re-enable the device */ + write_pnet(&device->coredev.rdma_net, cur_net); } - /* We must drop the device reference; otherwise disable_device() - * who is waiting for refcount to drop to zero while holding - * unregistration_lock won't happen. This can lead to deadlock. - * Also if we give up the ib_device reference here, - * ib_unregister_device() followed by ib_dealloc_device() can proceeed - * to free this device. Therefore, this thread must hold a reference - * to the device structure so that it cannot get freed, while this - * thread works on it. - */ - get_device(&dev->dev); - ib_device_put(dev); - - mutex_lock(&dev->unregistration_lock); - /* If the refcount is zero, it means disable_device() has progressed - * first to disable the device. There is no point in changing net - * namespace of device which is on path of removal. So fail the request. - */ - if (refcount_read(&dev->refcount) == 0) { - ret = -ENODEV; - goto done; - } - /* If net namespace was changed by the time this thread gets to it, - * return an error. - */ - if (!net_eq(current->nsproxy->net_ns, - read_pnet(&dev->coredev.rdma_net))) { - ret = -ENODEV; - goto done; - } - ret = __rdma_dev_change_netns(dev, net); + ret2 = enable_device_and_get(device); + if (ret2) + /* + * This shouldn't really happen, but if it does, let the user + * retry at later point. So don't disable the device. + */ + dev_warn( + &device->dev, + "%s: Couldn't re-enable device after namespace change\n", + __func__); + kobject_uevent(&device->dev.kobj, KOBJ_ADD); -done: - mutex_unlock(&dev->unregistration_lock); - put_device(&dev->dev); - return ret; + ib_device_put(device); +out: + mutex_unlock(&device->unregistration_lock); + if (ret) + return ret; + return ret2; } int ib_device_set_netns_put(struct sk_buff *skb, @@ -1568,7 +1525,23 @@ int ib_device_set_netns_put(struct sk_buff *skb, goto ns_err; } - ret = rdma_dev_change_netns_with_put(dev, net); + /* + * Currently supported only for those providers which support + * disassociation and don't do port specific sysfs init. Once a + * port_cleanup infrastructure is implemented, this limitation will be + * removed. + */ + if (!dev->ops.disassociate_ucontext || dev->ops.init_port || + ib_devices_shared_netns) { + ret = -EOPNOTSUPP; + goto ns_err; + } + + get_device(&dev->dev); + ib_device_put(dev); + ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); + put_device(&dev->dev); + put_net(net); return ret;