On Thu, May 31, 2018 at 11:19 AM, Sagi Grimberg <sagi@xxxxxxxxxxx> wrote: > >> +struct iser_global ig = { >> + .rdma_ib_client = { > > > .ib_client > >> + .name = "iser_rdma_ib", > > > "iser_ib_client" > > >> + .add = iser_ib_client_add_one, >> + .remove = iser_ib_client_remove_one >> + } >> +}; >> int iser_debug_level = 0; >> module_param_named(debug_level, iser_debug_level, int, S_IRUGO | >> S_IWUSR); >> @@ -1064,8 +1071,6 @@ static int __init iser_init(void) >> return -EINVAL; >> } >> - memset(&ig, 0, sizeof(struct iser_global)); >> - >> ig.desc_cache = kmem_cache_create("iser_descriptors", >> sizeof(struct iser_tx_desc), >> 0, SLAB_HWCACHE_ALIGN, >> @@ -1074,11 +1079,14 @@ static int __init iser_init(void) >> return -ENOMEM; >> /* device init is called only after the first addr resolution */ >> - mutex_init(&ig.device_list_mutex); >> - INIT_LIST_HEAD(&ig.device_list); >> mutex_init(&ig.connlist_mutex); >> INIT_LIST_HEAD(&ig.connlist); >> + err = ib_register_client(&ig.rdma_ib_client); >> + if (unlikely(err)) { >> + iser_err("ib_register_client(): %d\n", err); >> + goto err_register_client; >> + } >> release_wq = alloc_workqueue("release workqueue", 0, 0); >> if (!release_wq) { >> iser_err("failed to allocate release workqueue\n"); >> @@ -1099,6 +1107,8 @@ static int __init iser_init(void) >> err_reg: >> destroy_workqueue(release_wq); >> err_alloc_wq: >> + ib_unregister_client(&ig.rdma_ib_client); >> +err_register_client: >> kmem_cache_destroy(ig.desc_cache); >> return err; >> @@ -1127,6 +1137,7 @@ static void __exit iser_exit(void) >> iscsi_unregister_transport(&iscsi_iser_transport); >> kmem_cache_destroy(ig.desc_cache); >> + ib_unregister_client(&ig.rdma_ib_client); >> } >> module_init(iser_init); >> diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h >> b/drivers/infiniband/ulp/iser/iscsi_iser.h >> index c1ae4aeae2f9..bd87bae65d59 100644 >> --- a/drivers/infiniband/ulp/iser/iscsi_iser.h >> +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h >> @@ -363,7 +363,6 @@ struct iser_reg_ops { >> * @pd: Protection Domain for this device >> * @mr: Global DMA memory region >> * @event_handler: IB events handle routine >> - * @ig_list: entry in devices list >> * @refcount: Reference counter, dominated by open iser connections >> * @comps_used: Number of completion contexts used, Min between >> online >> * cpus and device max completion vectors >> @@ -375,8 +374,7 @@ struct iser_device { >> struct ib_device *ib_device; >> struct ib_pd *pd; >> struct ib_event_handler event_handler; >> - struct list_head ig_list; >> - int refcount; >> + refcount_t refcount; >> int comps_used; >> struct iser_comp *comps; >> const struct iser_reg_ops *reg_ops; >> @@ -557,15 +555,13 @@ struct iser_page_vec { >> /** >> * struct iser_global: iSER global context >> * >> - * @device_list_mutex: protects device_list >> - * @device_list: iser devices global list >> + * @rdma_ib_client: IB client >> * @connlist_mutex: protects connlist >> * @connlist: iser connections global list >> * @desc_cache: kmem cache for tx dataout >> */ >> struct iser_global { >> - struct mutex device_list_mutex; >> - struct list_head device_list; >> + struct ib_client rdma_ib_client; >> struct mutex connlist_mutex; >> struct list_head connlist; >> struct kmem_cache *desc_cache; >> @@ -578,6 +574,9 @@ extern int iser_pi_guard; >> extern unsigned int iser_max_sectors; >> extern bool iser_always_reg; >> +void iser_ib_client_add_one(struct ib_device *ib_device); >> +void iser_ib_client_remove_one(struct ib_device *ib_device, void >> *client_data); >> + >> int iser_assign_reg_ops(struct iser_device *device); >> int iser_send_control(struct iscsi_conn *conn, >> diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c >> b/drivers/infiniband/ulp/iser/iser_verbs.c >> index 56b7240a3fc3..b2d2650fefb4 100644 >> --- a/drivers/infiniband/ulp/iser/iser_verbs.c >> +++ b/drivers/infiniband/ulp/iser/iser_verbs.c >> @@ -491,55 +491,70 @@ static int iser_create_ib_conn_res(struct ib_conn >> *ib_conn) >> return ret; >> } >> -/** >> - * based on the resolved device node GUID see if there already allocated >> - * device for this device. If there's no such, create one. >> - */ >> static >> struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id >> *cma_id) >> { >> struct iser_device *device; >> - mutex_lock(&ig.device_list_mutex); >> + /* Paired with iser_ib_client_remove_one() */ >> + rcu_read_lock(); > > > Why is this not needed for others as well? Well, I can only consider nvme case, why it is not needed there, but in other ULPs that can be probably a bug or they have other protections, I do not know (seems net/rds/ib.c does nice things). *_remove_one() callback will be firstly called if device is dying and only then cm_handler() will get RDMA_CM_EVENT_DEVICE_REMOVAL. Order matters. NVMe host and target do flush_scheduled_work() in nvme[t]_rdma_remove_one(), thus it is guaranteed that *_remove_one() is the last one who puts the reference on the device, i.e.: nvmet_rdma_remove_one(...) { ... flush_scheduled_work(); WARN_ON(!nvmet_rdma_dev_put(ndev)); } I added the WARN_ON line to spot violators. Seems it works as expected, at least according to my smoke test: map 128 devices, unload mlx4_ib to invoke *_remove_one(). If you do not have such guarantees, that *_remove_one() puts the last reference, then you have to have some other strict protections to avoid a race between setting the NULL pointer and others who can observe that pointer. > >> + device = ib_get_client_data(cma_id->device, &ig.rdma_ib_client); >> + if (device && WARN_ON(!refcount_inc_not_zero(&device->refcount))) >> + device = NULL; >> + rcu_read_unlock(); >> + >> + return device; >> +} >> - list_for_each_entry(device, &ig.device_list, ig_list) >> - /* find if there's a match using the node GUID */ >> - if (device->ib_device->node_guid == >> cma_id->device->node_guid) >> - goto inc_refcnt; >> +static struct iser_device *iser_device_alloc(struct ib_device *ib_device) >> +{ >> + struct iser_device *device; >> device = kzalloc(sizeof *device, GFP_KERNEL); >> - if (device == NULL) >> - goto out; >> + if (unlikely(device == NULL)) >> + return NULL; >> - /* assign this device to the device */ >> - device->ib_device = cma_id->device; >> - /* init the device and link it into ig device list */ >> + refcount_set(&device->refcount, 1); > > > Nice cleanup! > > Would be good if the refcount change is splitted out of this > patch as its not strictly related to the patch. Yep. -- Roman > > >> + device->ib_device = ib_device; >> if (iser_create_device_ib_res(device)) { >> kfree(device); >> - device = NULL; >> - goto out; >> + return NULL; >> } >> - list_add(&device->ig_list, &ig.device_list); >> -inc_refcnt: >> - device->refcount++; >> -out: >> - mutex_unlock(&ig.device_list_mutex); >> return device; >> } >> /* if there's no demand for this device, release it */ >> static void iser_device_try_release(struct iser_device *device) >> { >> - mutex_lock(&ig.device_list_mutex); >> - device->refcount--; >> - iser_info("device %p refcount %d\n", device, device->refcount); >> - if (!device->refcount) { >> + iser_info("device %p refcount %d\n", device, >> + refcount_read(&device->refcount)); >> + if (refcount_dec_and_test(&device->refcount)) { >> iser_free_device_ib_res(device); >> - list_del(&device->ig_list); >> kfree(device); >> } >> - mutex_unlock(&ig.device_list_mutex); >> +} >> + >> +void iser_ib_client_add_one(struct ib_device *ib_device) >> +{ >> + struct iser_device *device; >> + >> + device = iser_device_alloc(ib_device); >> + if (!WARN_ON(!device)) >> + ib_set_client_data(ib_device, &ig.rdma_ib_client, device); >> +} >> + >> +void iser_ib_client_remove_one(struct ib_device *ib_device, void >> *client_data) >> +{ >> + struct iser_device *device = client_data; >> + >> + if (unlikely(!device)) >> + return; >> + >> + ib_set_client_data(ib_device, &ig.rdma_ib_client, NULL); >> + /* Paired with iser_device_find_by_ib_device() */ >> + synchronize_rcu(); >> + iser_device_try_release(device); >> } >> /** >> > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html