> -----Original Message----- > From: linux-rdma-owner@xxxxxxxxxxxxxxx <linux-rdma- > owner@xxxxxxxxxxxxxxx> On Behalf Of Jason Gunthorpe > Sent: Friday, December 14, 2018 10:06 AM > To: dledford@xxxxxxxxxx; Jason Gunthorpe <jgg@xxxxxxxxxxxx> > Cc: linux-rdma@xxxxxxxxxxxxxxx; BMT@xxxxxxxxxxxxxx; leon@xxxxxxxxxx; > Mark Bloch <markb@xxxxxxxxxxxx>; yanjun.zhu@xxxxxxxxxx > Subject: [PATCH v7 1/5] verbs/rxe: Use core services to keep track of RXE > things > > Replace > - The nonsense rxe_dev.kref with rxe_dev.ib_dev.dev.kref > - rxe_dev_list with core's device_list. > - Buggy net_to_rxe with ib_device_get_by_netdev that does proper > lifetime management > - Buggy get_rxe_by_name with ib_device_get_by_name > > Add > - ib_unregister_driver so drivers like rxe don't have to keep track > of things to do module unload > - ib_unregister_device_and_put to work with potiners returned by > that various ib_device_get_* functions > > The refcount change to the pool might need more work, but fundamentally > the driver cannot have krefs open in the pool across unregistration or it > contains module unlolad races. > > Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxxxx> > --- > drivers/infiniband/core/core_priv.h | 1 - > drivers/infiniband/core/device.c | 213 > ++++++++++++++++++++++++++++++++-- > drivers/infiniband/sw/rxe/rxe.c | 38 +----- > drivers/infiniband/sw/rxe/rxe.h | 14 ++- > drivers/infiniband/sw/rxe/rxe_loc.h | 2 +- > drivers/infiniband/sw/rxe/rxe_net.c | 83 +++---------- > drivers/infiniband/sw/rxe/rxe_pool.c | 4 - > drivers/infiniband/sw/rxe/rxe_sysfs.c | 39 +++---- > drivers/infiniband/sw/rxe/rxe_verbs.c | 7 -- > drivers/infiniband/sw/rxe/rxe_verbs.h | 3 - > include/rdma/ib_verbs.h | 10 ++ > 11 files changed, 261 insertions(+), 153 deletions(-) > > diff --git a/drivers/infiniband/core/core_priv.h > b/drivers/infiniband/core/core_priv.h > index cc7535c5e192..1b2575430032 100644 > --- a/drivers/infiniband/core/core_priv.h > +++ b/drivers/infiniband/core/core_priv.h > @@ -267,7 +267,6 @@ static inline int ib_mad_enforce_security(struct > ib_mad_agent_private *map, #endif > > struct ib_device *ib_device_get_by_index(u32 ifindex); -void > ib_device_put(struct ib_device *device); > /* RDMA device netlink */ > void nldev_init(void); > void nldev_exit(void); > diff --git a/drivers/infiniband/core/device.c > b/drivers/infiniband/core/device.c > index 348a7fb1f945..a2523605e02a 100644 > --- a/drivers/infiniband/core/device.c > +++ b/drivers/infiniband/core/device.c > @@ -85,6 +85,9 @@ struct ib_client_data { static > DEFINE_MUTEX(device_mutex); static DECLARE_RWSEM(lists_rwsem); > > +/* Used to serialize ib_unregister_driver */ static > +DECLARE_RWSEM(unregister_rwsem); > + > static int ib_security_change(struct notifier_block *nb, unsigned long event, > void *lsm_data); > static void ib_policy_change_task(struct work_struct *work); @@ -168,6 > +171,7 @@ void ib_device_put(struct ib_device *device) > if (refcount_dec_and_test(&device->refcount)) > complete(&device->unreg_completion); > } > +EXPORT_SYMBOL(ib_device_put); > > static struct ib_device *__ib_device_get_by_name(const char *name) { @@ > -180,6 +184,27 @@ static struct ib_device *__ib_device_get_by_name(const > char *name) > return NULL; > } > > +struct ib_device *ib_device_get_by_name(const char *name, > + enum rdma_driver_id driver_id) > +{ > + struct ib_device *device; > + > + down_read(&lists_rwsem); > + device = __ib_device_get_by_name(name); > + if (device && driver_id != RDMA_DRIVER_UNKNOWN && > + device->driver_id != driver_id) > + device = 
NULL; > + > + if (device) { > + /* Do not return a device if unregistration has started. */ > + if (!refcount_inc_not_zero(&device->refcount)) > + device = NULL; > + } > + up_read(&lists_rwsem); > + return device; > +} > +EXPORT_SYMBOL(ib_device_get_by_name); > + > int ib_device_rename(struct ib_device *ibdev, const char *name) { > struct ib_device *device; > @@ -641,28 +666,41 @@ int ib_register_device(struct ib_device *device, > const char *name, } EXPORT_SYMBOL(ib_register_device); > > -/** > - * ib_unregister_device - Unregister an IB device > - * @device:Device to unregister > +/* Returns false if the device is already undergoing unregistration in > +another > + * thread, and the device pointer may be invalid upon return. > * > - * Unregister an IB device. All clients will receive a remove callback. > + * If true is returned then the caller owns a kref associated with the > + device > + * (taken from the kref owned by the list). > + * > + * Upon return the device cannot be returned by any 'get' functions. > */ > -void ib_unregister_device(struct ib_device *device) > +static bool ib_pre_unregister(struct ib_device *device) { > + bool already_removed; > + > + mutex_lock(&device_mutex); > + down_write(&lists_rwsem); > + already_removed = list_empty(&device->core_list); > + list_del_init(&device->core_list); > + up_write(&lists_rwsem); > + mutex_unlock(&device_mutex);

No, you cannot unlock device_mutex here. Once the device is removed from the list, a concurrent ib_register_device() can try to create a device with the same name and will then fail to create its sysfs entries (see the interleaving sketched at the end of this mail).

> + > + /* Matches the recound_set in ib_alloc_device */

Did you mean refcount_set()?

> + ib_device_put(device); > + > + return !already_removed; > +} > + > +static void __ib_unregister_device(struct ib_device *device) > { > struct ib_client_data *context, *tmp; > unsigned long flags; > > - /* > - * Wait for all netlink command callers to finish working on the > - * device. > - */ > - ib_device_put(device); > wait_for_completion(&device->unreg_completion); > > mutex_lock(&device_mutex); > > down_write(&lists_rwsem); > - list_del(&device->core_list); > write_lock_irq(&device->client_data_lock); > list_for_each_entry(context, &device->client_data_list, list) > context->going_down = true; > @@ -696,10 +734,116 @@ void ib_unregister_device(struct ib_device > *device) > up_write(&lists_rwsem); > > device->reg_state = IB_DEV_UNREGISTERED; > + > + /* > + * Drivers using the new flow may not call ib_dealloc_device except > + * in error unwind prior to registration success. > + */ > + if (device->driver_unregister) { > + device->driver_unregister(device); > + ib_dealloc_device(device); > + } > +} > + > +/** > + * ib_unregister_device - Unregister an IB device > + * @device: The device to unregister > + * > + * Unregister an IB device. All clients will receive a remove callback. > + * > + * Callers can call this routine only once, and must protect against > + * races. Typically it should only be called as part of a remove > +callback in > + * an implementation of driver core's struct device_driver and related. > + * > + * A driver must choose to use either only ib_unregister_device or > + * ib_unregister_device_and_put & ib_unregister_driver.
> + */ > +void ib_unregister_device(struct ib_device *device) { > + down_read(&unregister_rwsem); > + if (ib_pre_unregister(device)) > + __ib_unregister_device(device); > + else > + WARN(true, "Driver mixing %s with _and_put", __func__); > + up_read(&unregister_rwsem); > } > EXPORT_SYMBOL(ib_unregister_device); > > /** > + * ib_unregister_device_and_put - Unregister a device while holding a 'get' > + * device: The device to unregister > + * > + * This is the same as ib_unregister_device(), except it includes an > +internal > + * ib_device_put() that should match a 'get' obtained by the caller. > + * > + * It is safe to call this routine concurrently from multiple threads > +while > + * holding the 'get', however in this case return from the function > +does not > + * guarantee the device is destroyed, only that destruction is in-progress. > + * > + * Drivers using this flow MUST use the driver_unregister callback to > +clean up > + * their resources associated with the device and dealloc it. > + */ > +void ib_unregister_device_and_put(struct ib_device *device) { > + down_read(&unregister_rwsem); > + if (ib_pre_unregister(device)) { > + /* > + * Caller's get can be returned now that we hold the kref, > + * otherwise ib_pre_unregister returned the caller's get > + */ > + ib_device_put(device); > + __ib_unregister_device(device); > + } > + up_read(&unregister_rwsem); > +} > +EXPORT_SYMBOL(ib_unregister_device_and_put); > + > +/** > + * ib_unregister_driver - Unregister all IB devices for a driver > + * @driver_id: The driver to unregister > + * > + * This implements a fence for device unregistration. It only returns > +once all > + * devices associated with the driver_id have fully completed their > + * unregistration and returned from ib_unregister_device*(). > + * > + * If device's are not yet unregistered it goes ahead and starts > +unregistering > + * them. > + * > + * The caller must ensure that no devices can be be added while this is > + * running (or at all for the module_unload case) */ void > +ib_unregister_driver(enum rdma_driver_id driver_id) { > + struct ib_device *device, *tmp; > + LIST_HEAD(driver_devs); > + > + down_write(&unregister_rwsem); > + mutex_lock(&device_mutex); > + down_write(&lists_rwsem); > + list_for_each_entry_safe(device, tmp, &device_list, core_list) { > + if (device->driver_id == driver_id) > + list_move(&device->core_list, &driver_devs); > + } > + up_write(&lists_rwsem); > + > + while ((device = list_first_entry_or_null( > + &driver_devs, struct ib_device, core_list))) { > + mutex_unlock(&device_mutex); > + /* > + * We are holding the unregister_rwsem, so it is impossible > + * for another thread to be doing registration. registration or unregistration? 
> + */ > + if (!ib_pre_unregister(device)) > + WARN(true, "Impossible failure of > ib_pre_unregister"); > + __ib_unregister_device(device); > + mutex_lock(&device_mutex); > + } > + mutex_unlock(&device_mutex); > + up_write(&unregister_rwsem); > +} > +EXPORT_SYMBOL(ib_unregister_driver); > + > +/** > * ib_register_client - Register an IB client > * @client:Client to register > * > @@ -981,6 +1125,53 @@ void ib_enum_roce_netdev(struct ib_device > *ib_dev, > } > } > > +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, > + enum rdma_driver_id driver_id) > +{ > + struct ib_device *ib_dev; > + > + down_read(&lists_rwsem); > + list_for_each_entry(ib_dev, &device_list, core_list) { > + unsigned int port; > + > + if (driver_id != RDMA_DRIVER_UNKNOWN && > + ib_dev->driver_id != driver_id) > + continue; > + > + if (!ib_dev->get_netdev) > + continue; > + > + for (port = rdma_start_port(ib_dev); > + port <= rdma_end_port(ib_dev); port++) { > + struct net_device *idev; > + > + idev = ib_dev->get_netdev(ib_dev, port); > + if (idev != ndev) { > + if (idev) > + dev_put(idev); > + continue; > + } > + > + /* > + * Do not return a device if unregistration has > + * started. > + */ > + if (!refcount_inc_not_zero(&ib_dev->refcount)) { > + dev_put(idev); > + continue; > + } > + > + up_read(&lists_rwsem); > + dev_put(idev); > + return ib_dev; > + } > + } > + up_read(&lists_rwsem); > + > + return NULL; > +} > +EXPORT_SYMBOL(ib_device_get_by_netdev); > + > /** > * ib_enum_all_roce_netdevs - enumerate all RoCE devices > * @filter: Should we call the callback? > diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c > index 383e65c7bbc0..4069463f9a66 100644 > --- a/drivers/infiniband/sw/rxe/rxe.c > +++ b/drivers/infiniband/sw/rxe/rxe.c > @@ -50,8 +50,10 @@ static void rxe_cleanup_ports(struct rxe_dev *rxe) > /* free resources for a rxe device all objects created for this device must > * have been destroyed > */ > -static void rxe_cleanup(struct rxe_dev *rxe) > +void rxe_unregistered(struct ib_device *ib_dev) > { > + struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); > + > rxe_pool_cleanup(&rxe->uc_pool); > rxe_pool_cleanup(&rxe->pd_pool); > rxe_pool_cleanup(&rxe->ah_pool); > @@ -68,15 +70,6 @@ static void rxe_cleanup(struct rxe_dev *rxe) > crypto_free_shash(rxe->tfm); > } > > -/* called when all references have been dropped */ -void rxe_release(struct > kref *kref) -{ > - struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt); > - > - rxe_cleanup(rxe); > - ib_dealloc_device(&rxe->ib_dev); > -} > - > /* initialize rxe device parameters */ > static void rxe_init_device_param(struct rxe_dev *rxe) { @@ -279,7 +272,6 > @@ static int rxe_init(struct rxe_dev *rxe) > spin_lock_init(&rxe->mmap_offset_lock); > spin_lock_init(&rxe->pending_lock); > INIT_LIST_HEAD(&rxe->pending_mmaps); > - INIT_LIST_HEAD(&rxe->list); > > mutex_init(&rxe->usdev_lock); > > @@ -312,31 +304,13 @@ int rxe_add(struct rxe_dev *rxe, unsigned int mtu) > { > int err; > > - kref_init(&rxe->ref_cnt); > - > err = rxe_init(rxe); > if (err) > - goto err1; > + return err; > > rxe_set_mtu(rxe, mtu); > > - err = rxe_register_device(rxe); > - if (err) > - goto err1; > - > - return 0; > - > -err1: > - rxe_dev_put(rxe); > - return err; > -} > - > -/* called by the ifc layer to remove a device */ -void rxe_remove(struct > rxe_dev *rxe) -{ > - rxe_unregister_device(rxe); > - > - rxe_dev_put(rxe); > + return rxe_register_device(rxe); > } > > static int __init rxe_module_init(void) @@ -360,7 +334,7 @@ 
static int > __init rxe_module_init(void) > > static void __exit rxe_module_exit(void) { > - rxe_remove_all(); > + ib_unregister_driver(RDMA_DRIVER_RXE); > rxe_net_exit(); > rxe_cache_exit(); > > diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h > index 8f79bd86d033..75e7113a756d 100644 > --- a/drivers/infiniband/sw/rxe/rxe.h > +++ b/drivers/infiniband/sw/rxe/rxe.h > @@ -96,17 +96,19 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe, void > rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); > > int rxe_add(struct rxe_dev *rxe, unsigned int mtu); -void rxe_remove(struct > rxe_dev *rxe); -void rxe_remove_all(void); > > void rxe_rcv(struct sk_buff *skb); > > -static inline void rxe_dev_put(struct rxe_dev *rxe) > +/* The caller must do a matching ib_device_put(&dev->ib_dev) */ static > +inline struct rxe_dev *rxe_get_dev_from_net(struct net_device *ndev) > { > - kref_put(&rxe->ref_cnt, rxe_release); > + struct ib_device *ibdev = > + ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE); > + > + if (!ibdev) > + return NULL; > + return container_of(ibdev, struct rxe_dev, ib_dev); > } > -struct rxe_dev *net_to_rxe(struct net_device *ndev); -struct rxe_dev > *get_rxe_by_name(const char *name); > > void rxe_port_up(struct rxe_dev *rxe); > void rxe_port_down(struct rxe_dev *rxe); diff --git > a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h > index a675c9f2b427..e830d56a15c2 100644 > --- a/drivers/infiniband/sw/rxe/rxe_loc.h > +++ b/drivers/infiniband/sw/rxe/rxe_loc.h > @@ -231,7 +231,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct > rxe_srq *srq, > struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, > struct rxe_modify_srq_cmd *ucmd); > > -void rxe_release(struct kref *kref); > +void rxe_unregistered(struct ib_device *ib_dev); > > int rxe_completer(void *arg); > int rxe_requester(void *arg); > diff --git a/drivers/infiniband/sw/rxe/rxe_net.c > b/drivers/infiniband/sw/rxe/rxe_net.c > index b26a8141f3ed..c7637ad5e584 100644 > --- a/drivers/infiniband/sw/rxe/rxe_net.c > +++ b/drivers/infiniband/sw/rxe/rxe_net.c > @@ -45,43 +45,6 @@ > #include "rxe_net.h" > #include "rxe_loc.h" > > -static LIST_HEAD(rxe_dev_list); > -static DEFINE_SPINLOCK(dev_list_lock); /* spinlock for device list */ > - > -struct rxe_dev *net_to_rxe(struct net_device *ndev) -{ > - struct rxe_dev *rxe; > - struct rxe_dev *found = NULL; > - > - spin_lock_bh(&dev_list_lock); > - list_for_each_entry(rxe, &rxe_dev_list, list) { > - if (rxe->ndev == ndev) { > - found = rxe; > - break; > - } > - } > - spin_unlock_bh(&dev_list_lock); > - > - return found; > -} > - > -struct rxe_dev *get_rxe_by_name(const char *name) -{ > - struct rxe_dev *rxe; > - struct rxe_dev *found = NULL; > - > - spin_lock_bh(&dev_list_lock); > - list_for_each_entry(rxe, &rxe_dev_list, list) { > - if (!strcmp(name, dev_name(&rxe->ib_dev.dev))) { > - found = rxe; > - break; > - } > - } > - spin_unlock_bh(&dev_list_lock); > - return found; > -} > - > - > static struct rxe_recv_sockets recv_sockets; > > struct device *rxe_dma_device(struct rxe_dev *rxe) @@ -229,18 +192,19 > @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) > struct udphdr *udph; > struct net_device *ndev = skb->dev; > struct net_device *rdev = ndev; > - struct rxe_dev *rxe = net_to_rxe(ndev); > + struct rxe_dev *rxe = rxe_get_dev_from_net(ndev); > struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); > > if (!rxe && is_vlan_dev(rdev)) { > rdev = vlan_dev_real_dev(ndev); > - rxe = net_to_rxe(rdev); > + rxe = 
rxe_get_dev_from_net(rdev); Please, let's not lock-unlock a read semaphore on every incoming RoCE udp packet that too in NAPI's non-blocking context. > } > if (!rxe) > goto drop; > > if (skb_linearize(skb)) { > pr_err("skb_linearize failed\n"); > + ib_device_put(&rxe->ib_dev); > goto drop; > } > > @@ -253,6 +217,12 @@ static int rxe_udp_encap_recv(struct sock *sk, struct > sk_buff *skb) > > rxe_rcv(skb); > > + /* > + * FIXME: this is in the wrong place, it needs to be done when pkt is > + * destroyed > + */ > + ib_device_put(&rxe->ib_dev); > + > return 0; > drop: > kfree_skb(skb); > @@ -558,36 +528,18 @@ struct rxe_dev *rxe_net_add(struct net_device > *ndev) > rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe)); > if (!rxe) > return NULL; > - > + rxe->ib_dev.driver_unregister = rxe_unregistered; > rxe->ndev = ndev; > > err = rxe_add(rxe, ndev->mtu); > if (err) { > - ib_dealloc_device(&rxe->ib_dev); Unbalanced ib_alloc_device and ib_dealloc_device doesn't sound like a good approach. There should be a better way to do this. > + rxe_unregistered(&rxe->ib_dev); > return NULL; > } > > - spin_lock_bh(&dev_list_lock); > - list_add_tail(&rxe->list, &rxe_dev_list); > - spin_unlock_bh(&dev_list_lock); > return rxe; > } > > -void rxe_remove_all(void) > -{ > - spin_lock_bh(&dev_list_lock); > - while (!list_empty(&rxe_dev_list)) { > - struct rxe_dev *rxe = > - list_first_entry(&rxe_dev_list, struct rxe_dev, list); > - > - list_del(&rxe->list); > - spin_unlock_bh(&dev_list_lock); > - rxe_remove(rxe); > - spin_lock_bh(&dev_list_lock); > - } > - spin_unlock_bh(&dev_list_lock); > -} > - > static void rxe_port_event(struct rxe_dev *rxe, > enum ib_event_type event) > { > @@ -630,16 +582,16 @@ static int rxe_notify(struct notifier_block *not_blk, > void *arg) > { > struct net_device *ndev = netdev_notifier_info_to_dev(arg); > - struct rxe_dev *rxe = net_to_rxe(ndev); > + struct rxe_dev *rxe = rxe_get_dev_from_net(ndev); > > if (!rxe) > - goto out; > + return NOTIFY_OK; > > switch (event) { > case NETDEV_UNREGISTER: > - list_del(&rxe->list); > - rxe_remove(rxe); > - break; > + ib_unregister_device_and_put(&rxe->ib_dev); > + return NOTIFY_OK; > + > case NETDEV_UP: > rxe_port_up(rxe); > break; > @@ -666,7 +618,8 @@ static int rxe_notify(struct notifier_block *not_blk, > event, ndev->name); > break; > } > -out: > + > + ib_device_put(&rxe->ib_dev); > return NOTIFY_OK; > } > > diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c > b/drivers/infiniband/sw/rxe/rxe_pool.c > index a04a076e03af..bb7e3268713e 100644 > --- a/drivers/infiniband/sw/rxe/rxe_pool.c > +++ b/drivers/infiniband/sw/rxe/rxe_pool.c > @@ -390,8 +390,6 @@ void *rxe_alloc(struct rxe_pool *pool) > kref_get(&pool->ref_cnt); > read_unlock_irqrestore(&pool->pool_lock, flags); > > - kref_get(&pool->rxe->ref_cnt); > - > if (atomic_inc_return(&pool->num_elem) > pool->max_elem) > goto out_put_pool; > > @@ -408,7 +406,6 @@ void *rxe_alloc(struct rxe_pool *pool) > > out_put_pool: > atomic_dec(&pool->num_elem); > - rxe_dev_put(pool->rxe); > rxe_pool_put(pool); > return NULL; > } > @@ -424,7 +421,6 @@ void rxe_elem_release(struct kref *kref) > > kmem_cache_free(pool_cache(pool), elem); > atomic_dec(&pool->num_elem); > - rxe_dev_put(pool->rxe); > rxe_pool_put(pool); > } > > diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c > b/drivers/infiniband/sw/rxe/rxe_sysfs.c > index 73a19f808e1b..87c17675cf9d 100644 > --- a/drivers/infiniband/sw/rxe/rxe_sysfs.c > +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c > @@ -53,20 +53,14 @@ static int sanitize_arg(const char *val, char 
*intf, int > intf_len) > return len; > } > > -static void rxe_set_port_state(struct net_device *ndev) > +static void rxe_set_port_state(struct rxe_dev *rxe, struct net_device > +*ndev) > { > - struct rxe_dev *rxe = net_to_rxe(ndev); > bool is_up = netif_running(ndev) && netif_carrier_ok(ndev); > > - if (!rxe) > - goto out; > - > if (is_up) > rxe_port_up(rxe); > else > rxe_port_down(rxe); /* down for unknown state */ > -out: > - return; > } > > static int rxe_param_set_add(const char *val, const struct kernel_param > *kp) @@ -74,24 +68,25 @@ static int rxe_param_set_add(const char *val, > const struct kernel_param *kp) > int len; > int err = 0; > char intf[32]; > - struct net_device *ndev = NULL; > + struct net_device *ndev; > + struct rxe_dev *exists; > struct rxe_dev *rxe; > > len = sanitize_arg(val, intf, sizeof(intf)); > if (!len) { > pr_err("add: invalid interface name\n"); > - err = -EINVAL; > - goto err; > + return -EINVAL; > } > > ndev = dev_get_by_name(&init_net, intf); > if (!ndev) { > pr_err("interface %s not found\n", intf); > - err = -EINVAL; > - goto err; > + return -EINVAL; > } > > - if (net_to_rxe(ndev)) { > + exists = rxe_get_dev_from_net(ndev); > + if (exists) { > + ib_device_put(&exists->ib_dev); > pr_err("already configured on %s\n", intf); > err = -EINVAL; > goto err; > @@ -104,11 +99,11 @@ static int rxe_param_set_add(const char *val, const > struct kernel_param *kp) > goto err; > } > > - rxe_set_port_state(ndev); > + rxe_set_port_state(rxe, ndev); > dev_info(&rxe->ib_dev.dev, "added %s\n", intf); > + > err: > - if (ndev) > - dev_put(ndev); > + dev_put(ndev); > return err; > } > > @@ -116,7 +111,7 @@ static int rxe_param_set_remove(const char *val, > const struct kernel_param *kp) { > int len; > char intf[32]; > - struct rxe_dev *rxe; > + struct ib_device *ib_dev; > > len = sanitize_arg(val, intf, sizeof(intf)); > if (!len) { > @@ -126,20 +121,18 @@ static int rxe_param_set_remove(const char *val, > const struct kernel_param *kp) > > if (strncmp("all", intf, len) == 0) { > pr_info("rxe_sys: remove all"); > - rxe_remove_all(); > + ib_unregister_driver(RDMA_DRIVER_RXE); > return 0; > } > > - rxe = get_rxe_by_name(intf); > + ib_dev = ib_device_get_by_name(intf, RDMA_DRIVER_RXE); > > - if (!rxe) { > + if (!ib_dev) { > pr_err("not configured on %s\n", intf); > return -EINVAL; > } > > - list_del(&rxe->list); > - rxe_remove(rxe); > - > + ib_unregister_device_and_put(ib_dev); > return 0; > } > > diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c > b/drivers/infiniband/sw/rxe/rxe_verbs.c > index 30817c79ba96..28beee8f5838 100644 > --- a/drivers/infiniband/sw/rxe/rxe_verbs.c > +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c > @@ -1286,10 +1286,3 @@ int rxe_register_device(struct rxe_dev *rxe) > > return err; > } > - > -void rxe_unregister_device(struct rxe_dev *rxe) -{ > - struct ib_device *dev = &rxe->ib_dev; > - > - ib_unregister_device(dev); > -} > diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h > b/drivers/infiniband/sw/rxe/rxe_verbs.h > index 831381b7788d..a0b635812784 100644 > --- a/drivers/infiniband/sw/rxe/rxe_verbs.h > +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h > @@ -385,7 +385,6 @@ struct rxe_dev { > struct ib_device_attr attr; > int max_ucontext; > int max_inline_data; > - struct kref ref_cnt; > struct mutex usdev_lock; > > struct net_device *ndev; > @@ -412,7 +411,6 @@ struct rxe_dev { > u64 stats_counters[RXE_NUM_OF_COUNTERS]; > > struct rxe_port port; > - struct list_head list; > struct crypto_shash *tfm; > }; > > @@ -467,7 +465,6 @@ static inline struct rxe_mem 
*to_rmw(struct ib_mw > *mw) } > > int rxe_register_device(struct rxe_dev *rxe); -void > rxe_unregister_device(struct rxe_dev *rxe); > > void rxe_mc_cleanup(struct rxe_pool_entry *arg); > > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index > 85021451eee0..5fbeedacb3aa 100644 > --- a/include/rdma/ib_verbs.h > +++ b/include/rdma/ib_verbs.h > @@ -2534,6 +2534,9 @@ struct ib_device { > struct ib_counters_read_attr > *counters_read_attr, > struct uverbs_attr_bundle *attrs); > > + /* Called after all clients have been destroyed*/ > + void (*driver_unregister)(struct ib_device *dev); > + > /** > * rdma netdev operation > * > @@ -2653,6 +2656,8 @@ int ib_register_device(struct ib_device *device, > const char *name, > int (*port_callback)(struct ib_device *, u8, > struct kobject *)); > void ib_unregister_device(struct ib_device *device); > +void ib_unregister_driver(enum rdma_driver_id driver_id); void > +ib_unregister_device_and_put(struct ib_device *device); > > int ib_register_client (struct ib_client *client); > void ib_unregister_client(struct ib_client *client); @@ -3937,6 +3942,11 @@ > static inline bool ib_access_writable(int access_flags) int > ib_check_mr_status(struct ib_mr *mr, u32 check_mask, > struct ib_mr_status *mr_status); > > +void ib_device_put(struct ib_device *device); struct ib_device > +*ib_device_get_by_netdev(struct net_device *ndev, > + enum rdma_driver_id driver_id); > +struct ib_device *ib_device_get_by_name(const char *name, > + enum rdma_driver_id driver_id); > struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 > port, > u16 pkey, const union ib_gid *gid, > const struct sockaddr *addr); > -- > 1.8.3.1

I think the driver-unload and device-unregistration complexity should be left in the rxe driver. I do not have a better locking suggestion at this point, as I haven't followed the email discussion closely. But the core is clearly picking up undesired locking here, along with unbalanced deallocs, and that deserves a better method.
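To spell out the device_mutex race I flagged above in ib_pre_unregister(), here is a rough interleaving. It is only a sketch: the function names are the ones used in this patch, and the exact failure point inside ib_register_device() is my reading of the current sysfs registration path.

    CPU0: ib_pre_unregister(old "rxe0")        CPU1: ib_register_device(new, "rxe0")

    mutex_lock(&device_mutex);
    list_del_init(&old->core_list);
    mutex_unlock(&device_mutex);
                                               mutex_lock(&device_mutex);
                                               /* "rxe0" is no longer on device_list,
                                                * so the duplicate-name check passes */
                                               /* sysfs registration then fails because
                                                * the old "rxe0" kobject still exists */
    ...
    __ib_unregister_device(old);
      /* only here is the old sysfs entry removed */

Which is why I don't think device_mutex can be dropped at that point; the list removal needs to stay serialized against registration until the old sysfs entries are gone.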