Re: [PATCH rdma-next] RDMA/core: Sync unregistration with netlink commands

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 15-Nov-18 04:05, Leon Romanovsky wrote:
> From: Parav Pandit <parav@xxxxxxxxxxxx>
> 
> Currently when rdma device is getting removed, get resource info can
> race with device removal example below.
> 
>       CPU-0                                  CPU-1
>     --------                               --------
>     rdma_nl_rcv_msg()
>        nldev_res_get_cq_dumpit()
>           mutex_lock(device_lock);
>           get device reference
>           mutex_unlock(device_lock);        [..]
>                                             ib_unregister_device()
>                                             /* Valid reference to
>                                              * device->dev exists.
>                                              */
>                                              ib_dealloc_device()
> 
>           [..]
>           provider->fill_res_entry();
> 
> Even though device object is not freed, fill_res_entry() can get called
> on device which doesn't from provider driver side.
> Kernel core device reference count is not sufficient.
> 
> Similar race can occur with device renaming and device removal, where
> device_rename() tries to rename a unregistered device. While this is fine
> for devices of a class which are not net namespace aware, but it is
> incorrect for net namespace aware class coming in subsequent series.
> If a class is net namespace aware, than below [1] call trace is observed
> in above situation.
> 
> Therefore, to avoid the such race, keep a reference count and let device
> unregistration wait until all netlink users drop the reference.
> 
> [1] Call trace:
> kernfs: ns required in 'infiniband' for 'mlx5_0'
> WARNING: CPU: 18 PID: 44270 at fs/kernfs/dir.c:842 kernfs_find_ns+0x104/0x120
> libahci i2c_core mlxfw libata dca [last unloaded: devlink]
> RIP: 0010:kernfs_find_ns+0x104/0x120
> Call Trace:
> kernfs_find_and_get_ns+0x2e/0x50
> sysfs_rename_link_ns+0x40/0xb0
> device_rename+0xb2/0xf0
> ib_device_rename+0xb3/0x100 [ib_core]
> nldev_set_doit+0x165/0x190 [ib_core]
> rdma_nl_rcv_msg+0x249/0x250 [ib_core]
> ? netlink_deliver_tap+0x8f/0x3e0
> rdma_nl_rcv+0xd6/0x120 [ib_core]
> netlink_unicast+0x17c/0x230
> netlink_sendmsg+0x2f0/0x3e0
> sock_sendmsg+0x30/0x40
> __sys_sendto+0xdc/0x160
> 
> Fixes: da5c85078215 ("RDMA/nldev: add driver-specific resource tracking")
> Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx>
> Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
> ---
>  drivers/infiniband/core/core_priv.h |  1 +
>  drivers/infiniband/core/device.c    | 27 +++++++++++++++++++++++----
>  drivers/infiniband/core/nldev.c     | 20 ++++++++++----------
>  include/rdma/ib_verbs.h             |  8 +++++++-
>  4 files changed, 41 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
> index 0b0ba44e773d..0da0a26c02b9 100644
> --- a/drivers/infiniband/core/core_priv.h
> +++ b/drivers/infiniband/core/core_priv.h
> @@ -269,6 +269,7 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
>  #endif
>  
>  struct ib_device *ib_device_get_by_index(u32 ifindex);
> +void ib_device_put(struct ib_device *device);
>  /* RDMA device netlink */
>  void nldev_init(void);
>  void nldev_exit(void);
> diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> index 1a783724fcba..842853b16941 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -145,7 +145,8 @@ static struct ib_device *__ib_device_get_by_index(u32 index)
>  }
>  
>  /*
> - * Caller is responsible to return refrerence count by calling put_device()
> + * Caller must perform ib_device_put to return device refrerence count when

Typo in reference.

> + * ib_device_get_by_index() returns valid device pointer.
>   */
>  struct ib_device *ib_device_get_by_index(u32 index)
>  {
> @@ -153,13 +154,23 @@ struct ib_device *ib_device_get_by_index(u32 index)
>  
>  	down_read(&lists_rwsem);
>  	device = __ib_device_get_by_index(index);
> -	if (device)
> -		get_device(&device->dev);
> -
> +	if (device) {
> +		/* If unregistration has started, than better not

This should probably be "then" (maybe just remove the word).

> +		 * return a device.
> +		 */
> +		if (!refcount_inc_not_zero(&device->refcount))
> +			device = NULL;
> +	}
>  	up_read(&lists_rwsem);
>  	return device;
>  }
>  
> +void ib_device_put(struct ib_device *device)
> +{
> +	if (refcount_dec_and_test(&device->refcount))
> +		complete(&device->unreg_completion);
> +}
> +
>  static struct ib_device *__ib_device_get_by_name(const char *name)
>  {
>  	struct ib_device *device;
> @@ -307,6 +318,8 @@ struct ib_device *ib_alloc_device(size_t size)
>  	rwlock_init(&device->client_data_lock);
>  	INIT_LIST_HEAD(&device->client_data_list);
>  	rdma_restrack_init(&device->res);
> +	refcount_set(&device->refcount, 1);
> +	init_completion(&device->unreg_completion);
>  
>  	return device;
>  }
> @@ -655,6 +668,12 @@ void ib_unregister_device(struct ib_device *device)
>  	struct ib_client_data *context, *tmp;
>  	unsigned long flags;
>  
> +	/* Wait for all netlink command callers to finish working on the
> +	 * device.
> +	 */
> +	ib_device_put(device);
> +	wait_for_completion(&device->unreg_completion);
> +
>  	mutex_lock(&device_mutex);
>  
>  	down_write(&lists_rwsem);
> diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
> index 573399e3ccc1..63cc74483188 100644
> --- a/drivers/infiniband/core/nldev.c
> +++ b/drivers/infiniband/core/nldev.c
> @@ -636,13 +636,13 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
>  
>  	nlmsg_end(msg, nlh);
>  
> -	put_device(&device->dev);
> +	ib_device_put(device);
>  	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
>  
>  err_free:
>  	nlmsg_free(msg);
>  err:
> -	put_device(&device->dev);
> +	ib_device_put(device);
>  	return err;
>  }
>  
> @@ -672,7 +672,7 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
>  		err = ib_device_rename(device, name);
>  	}
>  
> -	put_device(&device->dev);
> +	ib_device_put(device);
>  	return err;
>  }
>  
> @@ -756,14 +756,14 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
>  		goto err_free;
>  
>  	nlmsg_end(msg, nlh);
> -	put_device(&device->dev);
> +	ib_device_put(device);
>  
>  	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
>  
>  err_free:
>  	nlmsg_free(msg);
>  err:
> -	put_device(&device->dev);
> +	ib_device_put(device);
>  	return err;
>  }
>  
> @@ -820,7 +820,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
>  	}
>  
>  out:
> -	put_device(&device->dev);
> +	ib_device_put(device);
>  	cb->args[0] = idx;
>  	return skb->len;
>  }
> @@ -859,13 +859,13 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
>  		goto err_free;
>  
>  	nlmsg_end(msg, nlh);
> -	put_device(&device->dev);
> +	ib_device_put(device);
>  	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
>  
>  err_free:
>  	nlmsg_free(msg);
>  err:
> -	put_device(&device->dev);
> +	ib_device_put(device);
>  	return ret;
>  }
>  
> @@ -1058,7 +1058,7 @@ next:		idx++;
>  	if (!filled)
>  		goto err;
>  
> -	put_device(&device->dev);
> +	ib_device_put(device);
>  	return skb->len;
>  
>  res_err:
> @@ -1069,7 +1069,7 @@ next:		idx++;
>  	nlmsg_cancel(skb, nlh);
>  
>  err_index:
> -	put_device(&device->dev);
> +	ib_device_put(device);
>  	return ret;
>  }
>  
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 3d727dcc007d..ccb279cc5d98 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -53,7 +53,7 @@
>  #include <linux/string.h>
>  #include <linux/slab.h>
>  #include <linux/netdevice.h>
> -
> +#include <linux/refcount.h>
>  #include <linux/if_link.h>
>  #include <linux/atomic.h>
>  #include <linux/mmu_notifier.h>
> @@ -2612,6 +2612,12 @@ struct ib_device {
>  
>  	const struct uapi_definition   *driver_def;
>  	enum rdma_driver_id		driver_id;
> +	/*
> +	 * Provides synchronization between device unregistration and netlink
> +	 * commands on a device. To be used only by core.
> +	 */
> +	refcount_t			refcount;
> +	struct completion		unreg_completion;
>  };
>  
>  struct ib_client {
> 




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux