Re: [PATCH rdma-next v1 2/3] RDMA/core: Introduce a helper function to change net namespace of rdma device

Jason Gunthorpe <jgg@xxxxxxxx> · Wed, 10 Apr 2019 14:41:55 -0300

On Sun, Mar 31, 2019 at 07:36:43PM +0300, Leon Romanovsky wrote:
> From: Parav Pandit <parav@xxxxxxxxxxxx>
> 
> Introduce a helper function that changes rdma device's net namespace
> which performs mini disable enable sequence to have device visible
> only in assigned net namespace.
> 
> device unregistration, device rename and device change net namespace
> may be invoked concurrently.
> (a) device unregistration needs to wait until device change (rename
> or net namespace change) operation is in progress.
> (b) device net namespace change should not proceed if the
> unregistration has started.
> (c) while one cpu is changing device net namespace, other cpu should
> not be able to rename or change net namespace.
> 
> To address above concurrency,
> (a) Use unreg_mutex to synchronize between ib_unregister_device() and
> net namespace change operation
> (b) In cases where unregister_device() has started unregistration before
> change_netns got chance to acquire unreg_mutex, validate the refcount -
> if it dropped to zero, abort the net namespace change operation.
> 
> Finally use the helper function to change net namespace of ib device to
> move the device back to init_net when such net is deleted.
> 
> Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx>
> Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
> ---
>  drivers/infiniband/core/device.c | 123 +++++++++++++++++++++++++++++++
>  1 file changed, 123 insertions(+)
> 
> --
> 2.20.1
> 
> diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> index 55fccbb0aabd..307e729cb8f4 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -201,6 +201,8 @@ static struct notifier_block ibdev_lsm_nb = {
>  	.notifier_call = ib_security_change,
>  };
> 
> +static int __rdma_dev_change_netns(struct ib_device *device, struct net *net);
> +
>  /* Pointer to the RCU head at the start of the ib_port_data array */
>  struct ib_port_data_rcu {
>  	struct rcu_head rcu_head;
> @@ -861,6 +863,8 @@ static int add_compat_devs(struct ib_device *device)
>  	unsigned long index;
>  	int ret = 0;
> 
> +	lockdep_assert_held(&devices_rwsem);
> +
>  	down_read(&rdma_nets_rwsem);
>  	xa_for_each (&rdma_nets, index, rnet) {
>  		ret = add_one_compat_dev(device, rnet);
> @@ -952,6 +956,32 @@ int rdma_compatdev_set(u8 enable)
>  	return ret;
>  }
> 
> +static void devices_to_default_net(struct net *net)
> +{
> +	struct ib_device *dev;
> +	unsigned long index;
> +
> +	down_read(&devices_rwsem);
> +	/* If net namespace change is initiated by the time we reach here,
> +	 * or if device unregistration has started, don't consider those
> +	 * devices.
> +	 */

infiniband/core uses the 
/*
 * foo
 */

Style of multi-line comment for some reason.

> +	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
> +		if (!net_eq(net, read_pnet(&dev->coredev.rdma_net)))
> +			continue;
> +
> +		/* Hold parent device reference to avoid getting freed */
> +		get_device(&dev->dev);
> +		up_read(&devices_rwsem);
> +		mutex_lock(&dev->unregistration_lock);
> +		__rdma_dev_change_netns(dev, &init_net);
> +		mutex_unlock(&dev->unregistration_lock);
> +		put_device(&dev->dev);
> +		down_read(&devices_rwsem);
> +	}
> +	up_read(&devices_rwsem);
> +}

This is better just as part of the loop in rdma_dev_exit_net()

> +
>  static void rdma_dev_exit_net(struct net *net)
>  {
>  	struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
> @@ -983,6 +1013,7 @@ static void rdma_dev_exit_net(struct net *net)
>  	}
>  	up_read(&devices_rwsem);
> 
> +	devices_to_default_net(net);
>  	xa_erase(&rdma_nets, rnet->id);
>  }
> 
> @@ -1425,6 +1456,98 @@ void ib_unregister_device_queued(struct ib_device *ib_dev)
>  }
>  EXPORT_SYMBOL(ib_unregister_device_queued);
> 
> +static int __rdma_dev_change_netns(struct ib_device *device, struct net *net)
> +{
> +	int ret;
> +
> +	/* If the refcount is zero, it means disable_device() has progressed
> +	 * first during unregistration phase. There is no point in changing net
> +	 * namespace of device which is on path of removal.
> +	 */
> +	if (refcount_read(&device->refcount) == 0)
> +		return -ENODEV;
> +
> +	disable_device(device);
> +
> +	/* At this point no one would be using the device,
> +	 * so it is safe to change the device net namespace.
> +	 */
> +	write_pnet(&device->coredev.rdma_net, net);
> +	/* Currently rdma devices are system wide unique. So device
> +	 * name is guaranteed free in new namespace. Publish new
> +	 * namespace at sysfs level.
> +	 */
> +	ret = device_rename(&device->dev, dev_name(&device->dev));
> +	if (ret) {
> +		dev_warn(&device->dev, "%s Couldn't rename device\n", __func__);
> +		return ret;

Hurm.. Technically we are supposed to hold the devices_rwsem if we are
manipulating the name. Inferring that we don't have to because the
refcount is 0 is a little over optimizing for my tastes

.. and if this fails the device is now permanently stuck with a 0
refcount.. So we should re-enable it anyhow. The sysfs will be screwed
up but what can you do..


> +	if (ret) {
> +		/* This shouldn't really happen, but if it does, let
> +		 * user retry at later point. So don't disable the
> +		 * device.
> +		 */
> +		dev_warn(&device->dev,
> +			 "%s Couldn't re-enable device\n", __func__);
> +	}
> +
> +	ib_device_put(device);
> +	return ret;
> +}
> +
> +int rdma_dev_change_netns_with_put(struct ib_device *dev, struct net *net)
> +{

Missing static?

> +	int ret;
> +
> +	/*
> +	 * Currently supported only for those providers which supports
> +	 * disassociation and those which doesn't do port specific sysfs
> +	 * init. Once a port_cleanup infrastructure is implemented, this
> +	 * limitation will be removed.
> +	 */
> +	if (!dev->ops.disassociate_ucontext || dev->ops.init_port) {
> +		ib_device_put(dev);
> +		return -EOPNOTSUPP;
> +	}

I feel like this should also be prevented if ib_devices_shared_netns??

> +	/* We must drop the device reference; otherwise disable_device()
> +	 * who is waiting for refcount to drop to zero while holding
> +	 * unregistration_lock won't happen. This can lead to deadlock.
> +	 * Also if we give up the ib_device reference here,
> +	 * ib_unregister_device() followed by ib_dealloc_device() can proceeed
> +	 * to free this device. Therefore, this thread must hold a reference
> +	 * to the device structure so that it cannot get freed, while this
> +	 * thread works on it.
> +	 */
> +	get_device(&dev->dev);
> +	ib_device_put(dev);
> +
> +	mutex_lock(&dev->unregistration_lock);
> +	/* If the refcount is zero, it means disable_device() has progressed
> +	 * first to disable the device. There is no point in changing net
> +	 * namespace of device which is on path of removal. So fail the request.
> +	 */
> +	if (refcount_read(&dev->refcount) == 0) {
> +		ret = -ENODEV;
> +		goto done;
> +	}

This is redundant, __rdma_dev_change_netns already does this test..

> +	/* If net namespace was changed by the time this thread gets to it,
> +	 * return an error.
> +	 */
> +	if (!net_eq(current->nsproxy->net_ns,
> +		    read_pnet(&dev->coredev.rdma_net))) {
> +		ret = -ENODEV;
> +		goto done;
> +	}

And this needs to be inside __rdma_dev_change_netns as the other
caller technically needs it too. .. and if we are doing that then the
lock should be absorbed into __rdma_dev_change_netns as well so it is
a self-consistent function.

What about the 
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);

Stuff that netdev does? Shouldn't we do that too? KOBJ_REMOVE is part
of triggering userspace to let go of its processes too..

I ended up with a delta like this:

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 2b8254dd4c67ac..4458443b96a72e 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -201,7 +201,8 @@ static struct notifier_block ibdev_lsm_nb = {
 	.notifier_call = ib_security_change,
 };
 
-static int __rdma_dev_change_netns(struct ib_device *device, struct net *net);
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
+				 struct net *net);
 
 /* Pointer to the RCU head at the start of the ib_port_data array */
 struct ib_port_data_rcu {
@@ -956,32 +957,6 @@ int rdma_compatdev_set(u8 enable)
 	return ret;
 }
 
-static void devices_to_default_net(struct net *net)
-{
-	struct ib_device *dev;
-	unsigned long index;
-
-	down_read(&devices_rwsem);
-	/* If net namespace change is initiated by the time we reach here,
-	 * or if device unregistration has started, don't consider those
-	 * devices.
-	 */
-	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
-		if (!net_eq(net, read_pnet(&dev->coredev.rdma_net)))
-			continue;
-
-		/* Hold parent device reference to avoid getting freed */
-		get_device(&dev->dev);
-		up_read(&devices_rwsem);
-		mutex_lock(&dev->unregistration_lock);
-		__rdma_dev_change_netns(dev, &init_net);
-		mutex_unlock(&dev->unregistration_lock);
-		put_device(&dev->dev);
-		down_read(&devices_rwsem);
-	}
-	up_read(&devices_rwsem);
-}
-
 static void rdma_dev_exit_net(struct net *net)
 {
 	struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
@@ -1008,12 +983,16 @@ static void rdma_dev_exit_net(struct net *net)
 
 		remove_one_compat_dev(dev, rnet->id);
 
+		/*
+		 * If the real device is in the NS then move it back to init.
+		 */
+		rdma_dev_change_netns(dev, net, &init_net);
+
 		put_device(&dev->dev);
 		down_read(&devices_rwsem);
 	}
 	up_read(&devices_rwsem);
 
-	devices_to_default_net(net);
 	xa_erase(&rdma_nets, rnet->id);
 }
 
@@ -1459,96 +1438,74 @@ void ib_unregister_device_queued(struct ib_device *ib_dev)
 }
 EXPORT_SYMBOL(ib_unregister_device_queued);
 
-static int __rdma_dev_change_netns(struct ib_device *device, struct net *net)
+/*
+ * The caller must pass in a device that has the kref held and the refcount
+ * released. If the device is in cur_net and still registered then it is moved
+ * into net.
+ */
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
+				 struct net *net)
 {
+	int ret2 = -EINVAL;
 	int ret;
 
-	/* If the refcount is zero, it means disable_device() has progressed
-	 * first during unregistration phase. There is no point in changing net
-	 * namespace of device which is on path of removal.
+	mutex_lock(&device->unregistration_lock);
+
+	/*
+	 * If a device not under ib_device_get() or the unregistration_lock
+	 * the namespace can be changed, or it can be unregistered. Check
+	 * again under the lock.
 	 */
-	if (refcount_read(&device->refcount) == 0)
-		return -ENODEV;
+	if (!net_eq(current->nsproxy->net_ns,
+		    read_pnet(&device->coredev.rdma_net)) ||
+	    refcount_read(&device->refcount) == 0) {
+		ret = -ENODEV;
+		goto out;
+	}
 
+	kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
 	disable_device(device);
 
-	/* At this point no one would be using the device,
-	 * so it is safe to change the device net namespace.
+	/*
+	 * At this point no one can be using the device, so it is safe to
+	 * change the namespace.
 	 */
 	write_pnet(&device->coredev.rdma_net, net);
-	/* Currently rdma devices are system wide unique. So device
-	 * name is guaranteed free in new namespace. Publish new
-	 * namespace at sysfs level.
+
+	/*
+	 * Currently rdma devices are system wide unique. So the device name
+	 * is guaranteed free in the new namespace. Publish the new namespace
+	 * at the sysfs level.
 	 */
+	down_read(&devices_rwsem);
 	ret = device_rename(&device->dev, dev_name(&device->dev));
+	up_read(&devices_rwsem);
 	if (ret) {
-		dev_warn(&device->dev, "%s Couldn't rename device\n", __func__);
-		return ret;
-	}
-	ret = enable_device_and_get(device);
-	if (ret) {
-		/* This shouldn't really happen, but if it does, let
-		 * user retry at later point. So don't disable the
-		 * device.
-		 */
 		dev_warn(&device->dev,
-			 "%s Couldn't re-enable device\n", __func__);
-	}
-
-	ib_device_put(device);
-	return ret;
-}
-
-int rdma_dev_change_netns_with_put(struct ib_device *dev, struct net *net)
-{
-	int ret;
-
-	/*
-	 * Currently supported only for those providers which supports
-	 * disassociation and those which doesn't do port specific sysfs
-	 * init. Once a port_cleanup infrastructure is implemented, this
-	 * limitation will be removed.
-	 */
-	if (!dev->ops.disassociate_ucontext || dev->ops.init_port) {
-		ib_device_put(dev);
-		return -EOPNOTSUPP;
+			 "%s: Couldn't rename device after namespace change\n",
+			 __func__);
+		/* Try and put things back and re-enable the device */
+		write_pnet(&device->coredev.rdma_net, cur_net);
 	}
 
-	/* We must drop the device reference; otherwise disable_device()
-	 * who is waiting for refcount to drop to zero while holding
-	 * unregistration_lock won't happen. This can lead to deadlock.
-	 * Also if we give up the ib_device reference here,
-	 * ib_unregister_device() followed by ib_dealloc_device() can proceeed
-	 * to free this device. Therefore, this thread must hold a reference
-	 * to the device structure so that it cannot get freed, while this
-	 * thread works on it.
-	 */
-	get_device(&dev->dev);
-	ib_device_put(dev);
-
-	mutex_lock(&dev->unregistration_lock);
-	/* If the refcount is zero, it means disable_device() has progressed
-	 * first to disable the device. There is no point in changing net
-	 * namespace of device which is on path of removal. So fail the request.
-	 */
-	if (refcount_read(&dev->refcount) == 0) {
-		ret = -ENODEV;
-		goto done;
-	}
-	/* If net namespace was changed by the time this thread gets to it,
-	 * return an error.
-	 */
-	if (!net_eq(current->nsproxy->net_ns,
-		    read_pnet(&dev->coredev.rdma_net))) {
-		ret = -ENODEV;
-		goto done;
-	}
-	ret = __rdma_dev_change_netns(dev, net);
+	ret2 = enable_device_and_get(device);
+	if (ret2)
+		/*
+		 * This shouldn't really happen, but if it does, let the user
+		 * retry at later point. So don't disable the device.
+		 */
+		dev_warn(
+			&device->dev,
+			"%s: Couldn't re-enable device after namespace change\n",
+			__func__);
+	kobject_uevent(&device->dev.kobj, KOBJ_ADD);
 
-done:
-	mutex_unlock(&dev->unregistration_lock);
-	put_device(&dev->dev);
-	return ret;
+	ib_device_put(device);
+out:
+	mutex_unlock(&device->unregistration_lock);
+	if (ret)
+		return ret;
+	return ret2;
 }
 
 int ib_device_set_netns_put(struct sk_buff *skb,
@@ -1568,7 +1525,23 @@ int ib_device_set_netns_put(struct sk_buff *skb,
 		goto ns_err;
 	}
 
-	ret = rdma_dev_change_netns_with_put(dev, net);
+	/*
+	 * Currently supported only for those providers which support
+	 * disassociation and don't do port specific sysfs init. Once a
+	 * port_cleanup infrastructure is implemented, this limitation will be
+	 * removed.
+	 */
+	if (!dev->ops.disassociate_ucontext || dev->ops.init_port ||
+	    ib_devices_shared_netns) {
+		ret = -EOPNOTSUPP;
+		goto ns_err;
+	}
+
+	get_device(&dev->dev);
+	ib_device_put(dev);
+	ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
+	put_device(&dev->dev);
+
 	put_net(net);
 	return ret;