On 15-Nov-18 04:05, Leon Romanovsky wrote: > From: Parav Pandit <parav@xxxxxxxxxxxx> > > Currently when rdma device is getting removed, get resource info can > race with device removal example below. > > CPU-0 CPU-1 > -------- -------- > rdma_nl_rcv_msg() > nldev_res_get_cq_dumpit() > mutex_lock(device_lock); > get device reference > mutex_unlock(device_lock); [..] > ib_unregister_device() > /* Valid reference to > * device->dev exists. > */ > ib_dealloc_device() > > [..] > provider->fill_res_entry(); > > Even though device object is not freed, fill_res_entry() can get called > on device which doesn't from provider driver side. > Kernel core device reference count is not sufficient. > > Similar race can occur with device renaming and device removal, where > device_rename() tries to rename a unregistered device. While this is fine > for devices of a class which are not net namespace aware, but it is > incorrect for net namespace aware class coming in subsequent series. > If a class is net namespace aware, than below [1] call trace is observed > in above situation. > > Therefore, to avoid the such race, keep a reference count and let device > unregistration wait until all netlink users drop the reference. > > [1] Call trace: > kernfs: ns required in 'infiniband' for 'mlx5_0' > WARNING: CPU: 18 PID: 44270 at fs/kernfs/dir.c:842 kernfs_find_ns+0x104/0x120 > libahci i2c_core mlxfw libata dca [last unloaded: devlink] > RIP: 0010:kernfs_find_ns+0x104/0x120 > Call Trace: > kernfs_find_and_get_ns+0x2e/0x50 > sysfs_rename_link_ns+0x40/0xb0 > device_rename+0xb2/0xf0 > ib_device_rename+0xb3/0x100 [ib_core] > nldev_set_doit+0x165/0x190 [ib_core] > rdma_nl_rcv_msg+0x249/0x250 [ib_core] > ? netlink_deliver_tap+0x8f/0x3e0 > rdma_nl_rcv+0xd6/0x120 [ib_core] > netlink_unicast+0x17c/0x230 > netlink_sendmsg+0x2f0/0x3e0 > sock_sendmsg+0x30/0x40 > __sys_sendto+0xdc/0x160 > > Fixes: da5c85078215 ("RDMA/nldev: add driver-specific resource tracking") > Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx> > Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx> > --- > drivers/infiniband/core/core_priv.h | 1 + > drivers/infiniband/core/device.c | 27 +++++++++++++++++++++++---- > drivers/infiniband/core/nldev.c | 20 ++++++++++---------- > include/rdma/ib_verbs.h | 8 +++++++- > 4 files changed, 41 insertions(+), 15 deletions(-) > > diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h > index 0b0ba44e773d..0da0a26c02b9 100644 > --- a/drivers/infiniband/core/core_priv.h > +++ b/drivers/infiniband/core/core_priv.h > @@ -269,6 +269,7 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, > #endif > > struct ib_device *ib_device_get_by_index(u32 ifindex); > +void ib_device_put(struct ib_device *device); > /* RDMA device netlink */ > void nldev_init(void); > void nldev_exit(void); > diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c > index 1a783724fcba..842853b16941 100644 > --- a/drivers/infiniband/core/device.c > +++ b/drivers/infiniband/core/device.c > @@ -145,7 +145,8 @@ static struct ib_device *__ib_device_get_by_index(u32 index) > } > > /* > - * Caller is responsible to return refrerence count by calling put_device() > + * Caller must perform ib_device_put to return device refrerence count when Typo in reference. > + * ib_device_get_by_index() returns valid device pointer. > */ > struct ib_device *ib_device_get_by_index(u32 index) > { > @@ -153,13 +154,23 @@ struct ib_device *ib_device_get_by_index(u32 index) > > down_read(&lists_rwsem); > device = __ib_device_get_by_index(index); > - if (device) > - get_device(&device->dev); > - > + if (device) { > + /* If unregistration has started, than better not This should probably be "then" (maybe just remove the word). > + * return a device. > + */ > + if (!refcount_inc_not_zero(&device->refcount)) > + device = NULL; > + } > up_read(&lists_rwsem); > return device; > } > > +void ib_device_put(struct ib_device *device) > +{ > + if (refcount_dec_and_test(&device->refcount)) > + complete(&device->unreg_completion); > +} > + > static struct ib_device *__ib_device_get_by_name(const char *name) > { > struct ib_device *device; > @@ -307,6 +318,8 @@ struct ib_device *ib_alloc_device(size_t size) > rwlock_init(&device->client_data_lock); > INIT_LIST_HEAD(&device->client_data_list); > rdma_restrack_init(&device->res); > + refcount_set(&device->refcount, 1); > + init_completion(&device->unreg_completion); > > return device; > } > @@ -655,6 +668,12 @@ void ib_unregister_device(struct ib_device *device) > struct ib_client_data *context, *tmp; > unsigned long flags; > > + /* Wait for all netlink command callers to finish working on the > + * device. > + */ > + ib_device_put(device); > + wait_for_completion(&device->unreg_completion); > + > mutex_lock(&device_mutex); > > down_write(&lists_rwsem); > diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c > index 573399e3ccc1..63cc74483188 100644 > --- a/drivers/infiniband/core/nldev.c > +++ b/drivers/infiniband/core/nldev.c > @@ -636,13 +636,13 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, > > nlmsg_end(msg, nlh); > > - put_device(&device->dev); > + ib_device_put(device); > return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); > > err_free: > nlmsg_free(msg); > err: > - put_device(&device->dev); > + ib_device_put(device); > return err; > } > > @@ -672,7 +672,7 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, > err = ib_device_rename(device, name); > } > > - put_device(&device->dev); > + ib_device_put(device); > return err; > } > > @@ -756,14 +756,14 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, > goto err_free; > > nlmsg_end(msg, nlh); > - put_device(&device->dev); > + ib_device_put(device); > > return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); > > err_free: > nlmsg_free(msg); > err: > - put_device(&device->dev); > + ib_device_put(device); > return err; > } > > @@ -820,7 +820,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, > } > > out: > - put_device(&device->dev); > + ib_device_put(device); > cb->args[0] = idx; > return skb->len; > } > @@ -859,13 +859,13 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, > goto err_free; > > nlmsg_end(msg, nlh); > - put_device(&device->dev); > + ib_device_put(device); > return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); > > err_free: > nlmsg_free(msg); > err: > - put_device(&device->dev); > + ib_device_put(device); > return ret; > } > > @@ -1058,7 +1058,7 @@ next: idx++; > if (!filled) > goto err; > > - put_device(&device->dev); > + ib_device_put(device); > return skb->len; > > res_err: > @@ -1069,7 +1069,7 @@ next: idx++; > nlmsg_cancel(skb, nlh); > > err_index: > - put_device(&device->dev); > + ib_device_put(device); > return ret; > } > > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h > index 3d727dcc007d..ccb279cc5d98 100644 > --- a/include/rdma/ib_verbs.h > +++ b/include/rdma/ib_verbs.h > @@ -53,7 +53,7 @@ > #include <linux/string.h> > #include <linux/slab.h> > #include <linux/netdevice.h> > - > +#include <linux/refcount.h> > #include <linux/if_link.h> > #include <linux/atomic.h> > #include <linux/mmu_notifier.h> > @@ -2612,6 +2612,12 @@ struct ib_device { > > const struct uapi_definition *driver_def; > enum rdma_driver_id driver_id; > + /* > + * Provides synchronization between device unregistration and netlink > + * commands on a device. To be used only by core. > + */ > + refcount_t refcount; > + struct completion unreg_completion; > }; > > struct ib_client { >