[PATCH rdma-next v1 2/3] RDMA/core: Introduce a helper function to change net namespace of rdma device

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Parav Pandit <parav@xxxxxxxxxxxx>

Introduce a helper function that changes an rdma device's net namespace.
It performs a mini disable/enable sequence so that the device is visible
only in the assigned net namespace.

Device unregistration, device rename and device net namespace change
may be invoked concurrently.
(a) device unregistration needs to wait while a device change (rename
or net namespace change) operation is in progress.
(b) device net namespace change should not proceed if the
unregistration has started.
(c) while one cpu is changing a device's net namespace, another cpu
should not be able to rename it or change its net namespace.

To address the above concurrency,
(a) Use unregistration_lock to synchronize between
ib_unregister_device() and the net namespace change operation.
(b) In cases where ib_unregister_device() has started unregistration
before change_netns got a chance to acquire unregistration_lock,
validate the refcount - if it dropped to zero, abort the net namespace
change operation.

Finally, use the helper function to move an ib device back to init_net
when the net namespace it belongs to is deleted.

Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx>
Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
---
 drivers/infiniband/core/device.c | 123 +++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 55fccbb0aabd..307e729cb8f4 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -201,6 +201,8 @@ static struct notifier_block ibdev_lsm_nb = {
 	.notifier_call = ib_security_change,
 };

+static int __rdma_dev_change_netns(struct ib_device *device, struct net *net);
+
 /* Pointer to the RCU head at the start of the ib_port_data array */
 struct ib_port_data_rcu {
 	struct rcu_head rcu_head;
@@ -861,6 +863,8 @@ static int add_compat_devs(struct ib_device *device)
 	unsigned long index;
 	int ret = 0;

+	lockdep_assert_held(&devices_rwsem);
+
 	down_read(&rdma_nets_rwsem);
 	xa_for_each (&rdma_nets, index, rnet) {
 		ret = add_one_compat_dev(device, rnet);
@@ -952,6 +956,32 @@ int rdma_compatdev_set(u8 enable)
 	return ret;
 }

+static void devices_to_default_net(struct net *net)
+{
+	struct ib_device *dev;
+	unsigned long index;
+
+	down_read(&devices_rwsem);
+	/* Move each DEVICE_REGISTERED device still owned by @net to init_net.
+	 * Devices with a net namespace change or unregistration already in
+	 * progress are meant to be skipped (presumably via the mark; confirm).
+	 */
+	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+		if (!net_eq(net, read_pnet(&dev->coredev.rdma_net)))
+			continue;
+
+		/* Pin struct device; devices_rwsem is dropped while moving */
+		get_device(&dev->dev);
+		up_read(&devices_rwsem);
+		mutex_lock(&dev->unregistration_lock);
+		__rdma_dev_change_netns(dev, &init_net);
+		mutex_unlock(&dev->unregistration_lock);
+		put_device(&dev->dev);
+		down_read(&devices_rwsem);
+	}
+	up_read(&devices_rwsem);
+}
+
 static void rdma_dev_exit_net(struct net *net)
 {
 	struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
@@ -983,6 +1013,7 @@ static void rdma_dev_exit_net(struct net *net)
 	}
 	up_read(&devices_rwsem);

+	devices_to_default_net(net);
 	xa_erase(&rdma_nets, rnet->id);
 }

@@ -1425,6 +1456,98 @@ void ib_unregister_device_queued(struct ib_device *ib_dev)
 }
 EXPORT_SYMBOL(ib_unregister_device_queued);

+/* Move @device to net namespace @net: disable it, switch rdma_net,
+ * republish it in sysfs and re-enable it. Both callers in this patch hold
+ * device->unregistration_lock. Returns -ENODEV if the device is being
+ * unregistered, otherwise the device_rename() error (after restoring the
+ * old namespace and re-enabling) or the enable_device_and_get() result.
+ */
+static int __rdma_dev_change_netns(struct ib_device *device, struct net *net)
+{
+	struct net *cur_net = read_pnet(&device->coredev.rdma_net);
+	int ret, ret2;
+
+	/* A zero refcount means disable_device() already ran for
+	 * unregistration; do not move a device that is being removed.
+	 */
+	if (refcount_read(&device->refcount) == 0)
+		return -ENODEV;
+
+	disable_device(device);
+
+	/* No one is using the device now, so its namespace can change */
+	write_pnet(&device->coredev.rdma_net, net);
+	/* rdma device names are system wide unique, so the name is free
+	 * in the new namespace. Publish the new namespace in sysfs.
+	 */
+	ret = device_rename(&device->dev, dev_name(&device->dev));
+	if (ret) {
+		dev_warn(&device->dev, "%s Couldn't rename device\n", __func__);
+		/* Restore the old namespace; re-enable the device below */
+		write_pnet(&device->coredev.rdma_net, cur_net);
+	}
+	ret2 = enable_device_and_get(device);
+	if (ret2) {
+		/* Shouldn't happen; keep going so the user can retry later */
+		dev_warn(&device->dev,
+			 "%s Couldn't re-enable device\n", __func__);
+	}
+	ib_device_put(device);
+	return ret ? ret : ret2;
+}
+
+/* Move @dev to net namespace @net, consuming one ib_device reference on
+ * @dev: ib_device_put() runs on every path. Returns -EOPNOTSUPP for
+ * unsupported providers, -ENODEV if @dev is being unregistered or is not
+ * in the caller's net namespace, else the __rdma_dev_change_netns() result.
+ */
+int rdma_dev_change_netns_with_put(struct ib_device *dev, struct net *net)
+{
+	int ret;
+
+	/* Only providers that support disassociation and do no port
+	 * specific sysfs init are supported. This limitation goes away
+	 * once a port_cleanup infrastructure is implemented.
+	 */
+	if (!dev->ops.disassociate_ucontext || dev->ops.init_port) {
+		ib_device_put(dev);
+		return -EOPNOTSUPP;
+	}
+
+	/* Drop the ib_device reference before taking unregistration_lock:
+	 * disable_device() waits for the refcount to reach zero while
+	 * holding that lock, so keeping the reference could deadlock.
+	 * Without it, ib_unregister_device() followed by
+	 * ib_dealloc_device() can proceed to free the device, so hold a
+	 * struct device reference to keep the memory valid meanwhile.
+	 */
+	get_device(&dev->dev);
+	ib_device_put(dev);
+
+	mutex_lock(&dev->unregistration_lock);
+	/* A zero refcount means disable_device() got here first; do not
+	 * move a device that is on its way to removal. Fail the request.
+	 */
+	if (refcount_read(&dev->refcount) == 0) {
+		ret = -ENODEV;
+		goto done;
+	}
+	/* Fail if the device is no longer in the caller's net namespace,
+	 * i.e. it was moved before this thread acquired the lock.
+	 */
+	if (!net_eq(current->nsproxy->net_ns,
+		    read_pnet(&dev->coredev.rdma_net))) {
+		ret = -ENODEV;
+		goto done;
+	}
+	ret = __rdma_dev_change_netns(dev, net);
+
+done:
+	mutex_unlock(&dev->unregistration_lock);
+	put_device(&dev->dev);
+	return ret;
+}
+
 static struct pernet_operations rdma_dev_net_ops = {
 	.init = rdma_dev_init_net,
 	.exit = rdma_dev_exit_net,
--
2.20.1




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux