Re: [PATCH rdma-next v1 2/3] RDMA/core: Introduce a helper function to change net namespace of rdma device

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Apr 10, 2019 at 02:41:55PM -0300, Jason Gunthorpe wrote:
> On Sun, Mar 31, 2019 at 07:36:43PM +0300, Leon Romanovsky wrote:
> > From: Parav Pandit <parav@xxxxxxxxxxxx>
> >
> > Introduce a helper function that changes rdma device's net namespace
> > which performs mini disable enable sequence to have device visible
> > only in assigned net namespace.
> >
> > device unregistration, device rename and device change net namespace
> > may be invoked concurrently.
> > (a) device unregistration needs to wait until device change (rename
> > or net namespace change) operation is in progress.
> > (b) device net namespace change should not proceed if the
> > unregistration has started.
> > (c) while one cpu is changing device net namespace, other cpu should
> > not be able to rename or change net namespace.
> >
> > To address above concurrency,
> > (a) Use unreg_mutex to synchronize between ib_unregister_device() and
> > net namespace change operation
> > (b) In cases where unregister_device() has started unregistration before
> > change_netns got chance to acquire unreg_mutex, validate the refcount -
> > if it dropped to zero, abort the net namespace change operation.
> >
> > Finally use the helper function to change net namespace of ib device to
> > move the device back to init_net when such net is deleted.
> >
> > Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx>
> > Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
> > ---
> >  drivers/infiniband/core/device.c | 123 +++++++++++++++++++++++++++++++
> >  1 file changed, 123 insertions(+)
> >
> > --
> > 2.20.1
> >
> > diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> > index 55fccbb0aabd..307e729cb8f4 100644
> > --- a/drivers/infiniband/core/device.c
> > +++ b/drivers/infiniband/core/device.c
> > @@ -201,6 +201,8 @@ static struct notifier_block ibdev_lsm_nb = {
> >  	.notifier_call = ib_security_change,
> >  };
> >
> > +static int __rdma_dev_change_netns(struct ib_device *device, struct net *net);
> > +
> >  /* Pointer to the RCU head at the start of the ib_port_data array */
> >  struct ib_port_data_rcu {
> >  	struct rcu_head rcu_head;
> > @@ -861,6 +863,8 @@ static int add_compat_devs(struct ib_device *device)
> >  	unsigned long index;
> >  	int ret = 0;
> >
> > +	lockdep_assert_held(&devices_rwsem);
> > +
> >  	down_read(&rdma_nets_rwsem);
> >  	xa_for_each (&rdma_nets, index, rnet) {
> >  		ret = add_one_compat_dev(device, rnet);
> > @@ -952,6 +956,32 @@ int rdma_compatdev_set(u8 enable)
> >  	return ret;
> >  }
> >
> > +static void devices_to_default_net(struct net *net)
> > +{
> > +	struct ib_device *dev;
> > +	unsigned long index;
> > +
> > +	down_read(&devices_rwsem);
> > +	/* If net namespace change is initiated by the time we reach here,
> > +	 * or if device unregistration has started, don't consider those
> > +	 * devices.
> > +	 */
>
> infiniband/core uses the
> /*
>  * foo
>  */
>
> Style of multi-line comment for some reason.

It is preferable coding style for whole kernel, except netdev
and Parav used netdev coding style.

During my reviews, I'm allowing to use netdev codding so people will be
able easily submit their patches to netdev without need to respin due to
differences in coding style.

>
> > +	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
> > +		if (!net_eq(net, read_pnet(&dev->coredev.rdma_net)))
> > +			continue;
> > +
> > +		/* Hold parent device reference to avoid getting freed */
> > +		get_device(&dev->dev);
> > +		up_read(&devices_rwsem);
> > +		mutex_lock(&dev->unregistration_lock);
> > +		__rdma_dev_change_netns(dev, &init_net);
> > +		mutex_unlock(&dev->unregistration_lock);
> > +		put_device(&dev->dev);
> > +		down_read(&devices_rwsem);
> > +	}
> > +	up_read(&devices_rwsem);
> > +}
>
> This is better just as part of the loop in rdma_dev_exit_net()
>
> > +
> >  static void rdma_dev_exit_net(struct net *net)
> >  {
> >  	struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
> > @@ -983,6 +1013,7 @@ static void rdma_dev_exit_net(struct net *net)
> >  	}
> >  	up_read(&devices_rwsem);
> >
> > +	devices_to_default_net(net);
> >  	xa_erase(&rdma_nets, rnet->id);
> >  }
> >
> > @@ -1425,6 +1456,98 @@ void ib_unregister_device_queued(struct ib_device *ib_dev)
> >  }
> >  EXPORT_SYMBOL(ib_unregister_device_queued);
> >
> > +static int __rdma_dev_change_netns(struct ib_device *device, struct net *net)
> > +{
> > +	int ret;
> > +
> > +	/* If the refcount is zero, it means disable_device() has progressed
> > +	 * first during unregistration phase. There is no point in changing net
> > +	 * namespace of device which is on path of removal.
> > +	 */
> > +	if (refcount_read(&device->refcount) == 0)
> > +		return -ENODEV;
> > +
> > +	disable_device(device);
> > +
> > +	/* At this point no one would be using the device,
> > +	 * so it is safe to change the device net namespace.
> > +	 */
> > +	write_pnet(&device->coredev.rdma_net, net);
> > +	/* Currently rdma devices are system wide unique. So device
> > +	 * name is guaranteed free in new namespace. Publish new
> > +	 * namespace at sysfs level.
> > +	 */
> > +	ret = device_rename(&device->dev, dev_name(&device->dev));
> > +	if (ret) {
> > +		dev_warn(&device->dev, "%s Couldn't rename device\n", __func__);
> > +		return ret;
>
> Hurm.. Technically we are supposed to hold the devices_rwsem if we are
> manipulating the name. Inferring that we don't have to because the
> refcount is 0 is a little over optimizing for my tastes
>
> .. and if this fails the device is now permanently stuck with a 0
> refcount.. So we should re-enable it anyhow. The sysfs will be screwed
> up but what can you do..

Why? We will enter device_rename only in case refcount != 0, see check
above.

>
>
> > +	if (ret) {
> > +		/* This shouldn't really happen, but if it does, let
> > +		 * user retry at later point. So don't disable the
> > +		 * device.
> > +		 */
> > +		dev_warn(&device->dev,
> > +			 "%s Couldn't re-enable device\n", __func__);
> > +	}
> > +
> > +	ib_device_put(device);
> > +	return ret;
> > +}
> > +
> > +int rdma_dev_change_netns_with_put(struct ib_device *dev, struct net *net)
> > +{
>
> Missing static?

Yes

>
> > +	int ret;
> > +
> > +	/*
> > +	 * Currently supported only for those providers which supports
> > +	 * disassociation and those which doesn't do port specific sysfs
> > +	 * init. Once a port_cleanup infrastructure is implemented, this
> > +	 * limitation will be removed.
> > +	 */
> > +	if (!dev->ops.disassociate_ucontext || dev->ops.init_port) {
> > +		ib_device_put(dev);
> > +		return -EOPNOTSUPP;
> > +	}
>
> I feel like this should also be prevented if ib_devices_shared_netns??
>
> > +	/* We must drop the device reference; otherwise disable_device()
> > +	 * who is waiting for refcount to drop to zero while holding
> > +	 * unregistration_lock won't happen. This can lead to deadlock.
> > +	 * Also if we give up the ib_device reference here,
> > +	 * ib_unregister_device() followed by ib_dealloc_device() can proceeed
> > +	 * to free this device. Therefore, this thread must hold a reference
> > +	 * to the device structure so that it cannot get freed, while this
> > +	 * thread works on it.
> > +	 */
> > +	get_device(&dev->dev);
> > +	ib_device_put(dev);
> > +
> > +	mutex_lock(&dev->unregistration_lock);
> > +	/* If the refcount is zero, it means disable_device() has progressed
> > +	 * first to disable the device. There is no point in changing net
> > +	 * namespace of device which is on path of removal. So fail the request.
> > +	 */
> > +	if (refcount_read(&dev->refcount) == 0) {
> > +		ret = -ENODEV;
> > +		goto done;
> > +	}
>
> This is redundant, __rdma_dev_change_netns already does this test..
>
> > +	/* If net namespace was changed by the time this thread gets to it,
> > +	 * return an error.
> > +	 */
> > +	if (!net_eq(current->nsproxy->net_ns,
> > +		    read_pnet(&dev->coredev.rdma_net))) {
> > +		ret = -ENODEV;
> > +		goto done;
> > +	}
>
> And this needs to be inside __rdma_dev_change_netns as the other
> caller technically needs it too. .. and if we are doing that then the
> lock should be absorbed into __rdma_dev_change_netns as well so it is
> a self-consistent function.
>
> What about the
> 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
> 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
>
> Stuff that netdev does? Shouldn't we do that too? KOBJ_REMOVE is part
> of triggering userspace to let go of its processes too..
>
> I ended up with a delta like this:
>
> diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> index 2b8254dd4c67ac..4458443b96a72e 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -201,7 +201,8 @@ static struct notifier_block ibdev_lsm_nb = {
>  	.notifier_call = ib_security_change,
>  };
>
> -static int __rdma_dev_change_netns(struct ib_device *device, struct net *net);
> +static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
> +				 struct net *net);
>
>  /* Pointer to the RCU head at the start of the ib_port_data array */
>  struct ib_port_data_rcu {
> @@ -956,32 +957,6 @@ int rdma_compatdev_set(u8 enable)
>  	return ret;
>  }
>
> -static void devices_to_default_net(struct net *net)
> -{
> -	struct ib_device *dev;
> -	unsigned long index;
> -
> -	down_read(&devices_rwsem);
> -	/* If net namespace change is initiated by the time we reach here,
> -	 * or if device unregistration has started, don't consider those
> -	 * devices.
> -	 */
> -	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
> -		if (!net_eq(net, read_pnet(&dev->coredev.rdma_net)))
> -			continue;
> -
> -		/* Hold parent device reference to avoid getting freed */
> -		get_device(&dev->dev);
> -		up_read(&devices_rwsem);
> -		mutex_lock(&dev->unregistration_lock);
> -		__rdma_dev_change_netns(dev, &init_net);
> -		mutex_unlock(&dev->unregistration_lock);
> -		put_device(&dev->dev);
> -		down_read(&devices_rwsem);
> -	}
> -	up_read(&devices_rwsem);
> -}
> -
>  static void rdma_dev_exit_net(struct net *net)
>  {
>  	struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
> @@ -1008,12 +983,16 @@ static void rdma_dev_exit_net(struct net *net)
>
>  		remove_one_compat_dev(dev, rnet->id);
>
> +		/*
> +		 * If the real device is in the NS then move it back to init.
> +		 */
> +		rdma_dev_change_netns(dev, net, &init_net);
> +
>  		put_device(&dev->dev);
>  		down_read(&devices_rwsem);
>  	}
>  	up_read(&devices_rwsem);
>
> -	devices_to_default_net(net);
>  	xa_erase(&rdma_nets, rnet->id);
>  }
>
> @@ -1459,96 +1438,74 @@ void ib_unregister_device_queued(struct ib_device *ib_dev)
>  }
>  EXPORT_SYMBOL(ib_unregister_device_queued);
>
> -static int __rdma_dev_change_netns(struct ib_device *device, struct net *net)
> +/*
> + * The caller must pass in a device that has the kref held and the refcount
> + * released. If the device is in cur_net and still registered then it is moved
> + * into net.
> + */
> +static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
> +				 struct net *net)
>  {
> +	int ret2 = -EINVAL;
>  	int ret;
>
> -	/* If the refcount is zero, it means disable_device() has progressed
> -	 * first during unregistration phase. There is no point in changing net
> -	 * namespace of device which is on path of removal.
> +	mutex_lock(&device->unregistration_lock);
> +
> +	/*
> +	 * If a device not under ib_device_get() or the unregistration_lock
> +	 * the namespace can be changed, or it can be unregistered. Check
> +	 * again under the lock.
>  	 */
> -	if (refcount_read(&device->refcount) == 0)
> -		return -ENODEV;
> +	if (!net_eq(current->nsproxy->net_ns,
> +		    read_pnet(&device->coredev.rdma_net)) ||
> +	    refcount_read(&device->refcount) == 0) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
>
> +	kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
>  	disable_device(device);
>
> -	/* At this point no one would be using the device,
> -	 * so it is safe to change the device net namespace.
> +	/*
> +	 * At this point no one can be using the device, so it is safe to
> +	 * change the namespace.
>  	 */
>  	write_pnet(&device->coredev.rdma_net, net);
> -	/* Currently rdma devices are system wide unique. So device
> -	 * name is guaranteed free in new namespace. Publish new
> -	 * namespace at sysfs level.
> +
> +	/*
> +	 * Currently rdma devices are system wide unique. So the device name
> +	 * is guaranteed free in the new namespace. Publish the new namespace
> +	 * at the sysfs level.
>  	 */
> +	down_read(&devices_rwsem);
>  	ret = device_rename(&device->dev, dev_name(&device->dev));
> +	up_read(&devices_rwsem);
>  	if (ret) {
> -		dev_warn(&device->dev, "%s Couldn't rename device\n", __func__);
> -		return ret;
> -	}
> -	ret = enable_device_and_get(device);
> -	if (ret) {
> -		/* This shouldn't really happen, but if it does, let
> -		 * user retry at later point. So don't disable the
> -		 * device.
> -		 */
>  		dev_warn(&device->dev,
> -			 "%s Couldn't re-enable device\n", __func__);
> -	}
> -
> -	ib_device_put(device);
> -	return ret;
> -}
> -
> -int rdma_dev_change_netns_with_put(struct ib_device *dev, struct net *net)
> -{
> -	int ret;
> -
> -	/*
> -	 * Currently supported only for those providers which supports
> -	 * disassociation and those which doesn't do port specific sysfs
> -	 * init. Once a port_cleanup infrastructure is implemented, this
> -	 * limitation will be removed.
> -	 */
> -	if (!dev->ops.disassociate_ucontext || dev->ops.init_port) {
> -		ib_device_put(dev);
> -		return -EOPNOTSUPP;
> +			 "%s: Couldn't rename device after namespace change\n",
> +			 __func__);
> +		/* Try and put things back and re-enable the device */
> +		write_pnet(&device->coredev.rdma_net, cur_net);
>  	}
>
> -	/* We must drop the device reference; otherwise disable_device()
> -	 * who is waiting for refcount to drop to zero while holding
> -	 * unregistration_lock won't happen. This can lead to deadlock.
> -	 * Also if we give up the ib_device reference here,
> -	 * ib_unregister_device() followed by ib_dealloc_device() can proceeed
> -	 * to free this device. Therefore, this thread must hold a reference
> -	 * to the device structure so that it cannot get freed, while this
> -	 * thread works on it.
> -	 */
> -	get_device(&dev->dev);
> -	ib_device_put(dev);
> -
> -	mutex_lock(&dev->unregistration_lock);
> -	/* If the refcount is zero, it means disable_device() has progressed
> -	 * first to disable the device. There is no point in changing net
> -	 * namespace of device which is on path of removal. So fail the request.
> -	 */
> -	if (refcount_read(&dev->refcount) == 0) {
> -		ret = -ENODEV;
> -		goto done;
> -	}
> -	/* If net namespace was changed by the time this thread gets to it,
> -	 * return an error.
> -	 */
> -	if (!net_eq(current->nsproxy->net_ns,
> -		    read_pnet(&dev->coredev.rdma_net))) {
> -		ret = -ENODEV;
> -		goto done;
> -	}
> -	ret = __rdma_dev_change_netns(dev, net);
> +	ret2 = enable_device_and_get(device);
> +	if (ret2)
> +		/*
> +		 * This shouldn't really happen, but if it does, let the user
> +		 * retry at later point. So don't disable the device.
> +		 */
> +		dev_warn(
> +			&device->dev,
> +			"%s: Couldn't re-enable device after namespace change\n",
> +			__func__);
> +	kobject_uevent(&device->dev.kobj, KOBJ_ADD);
>
> -done:
> -	mutex_unlock(&dev->unregistration_lock);
> -	put_device(&dev->dev);
> -	return ret;
> +	ib_device_put(device);
> +out:
> +	mutex_unlock(&device->unregistration_lock);
> +	if (ret)
> +		return ret;
> +	return ret2;
>  }
>
>  int ib_device_set_netns_put(struct sk_buff *skb,
> @@ -1568,7 +1525,23 @@ int ib_device_set_netns_put(struct sk_buff *skb,
>  		goto ns_err;
>  	}
>
> -	ret = rdma_dev_change_netns_with_put(dev, net);
> +	/*
> +	 * Currently supported only for those providers which support
> +	 * disassociation and don't do port specific sysfs init. Once a
> +	 * port_cleanup infrastructure is implemented, this limitation will be
> +	 * removed.
> +	 */
> +	if (!dev->ops.disassociate_ucontext || dev->ops.init_port ||
> +	    ib_devices_shared_netns) {
> +		ret = -EOPNOTSUPP;
> +		goto ns_err;
> +	}
> +
> +	get_device(&dev->dev);
> +	ib_device_put(dev);
> +	ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
> +	put_device(&dev->dev);
> +
>  	put_net(net);
>  	return ret;
>

Attachment: signature.asc
Description: PGP signature


[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux