Re: [RFC 3/5] IB/iser: use ib_client API to wrap ib_device

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, May 31, 2018 at 11:19 AM, Sagi Grimberg <sagi@xxxxxxxxxxx> wrote:
>
>> +struct iser_global ig = {
>> +       .rdma_ib_client = {
>
>
> .ib_client
>
>> +               .name   = "iser_rdma_ib",
>
>
> "iser_ib_client"
>
>
>> +               .add    = iser_ib_client_add_one,
>> +               .remove = iser_ib_client_remove_one
>> +       }
>> +};
>>     int iser_debug_level = 0;
>>   module_param_named(debug_level, iser_debug_level, int, S_IRUGO |
>> S_IWUSR);
>> @@ -1064,8 +1071,6 @@ static int __init iser_init(void)
>>                 return -EINVAL;
>>         }
>>   -     memset(&ig, 0, sizeof(struct iser_global));
>> -
>>         ig.desc_cache = kmem_cache_create("iser_descriptors",
>>                                           sizeof(struct iser_tx_desc),
>>                                           0, SLAB_HWCACHE_ALIGN,
>> @@ -1074,11 +1079,14 @@ static int __init iser_init(void)
>>                 return -ENOMEM;
>>         /* device init is called only after the first addr resolution */
>> -       mutex_init(&ig.device_list_mutex);
>> -       INIT_LIST_HEAD(&ig.device_list);
>>         mutex_init(&ig.connlist_mutex);
>>         INIT_LIST_HEAD(&ig.connlist);
>>   +     err = ib_register_client(&ig.rdma_ib_client);
>> +       if (unlikely(err)) {
>> +               iser_err("ib_register_client(): %d\n", err);
>> +               goto err_register_client;
>> +       }
>>         release_wq = alloc_workqueue("release workqueue", 0, 0);
>>         if (!release_wq) {
>>                 iser_err("failed to allocate release workqueue\n");
>> @@ -1099,6 +1107,8 @@ static int __init iser_init(void)
>>   err_reg:
>>         destroy_workqueue(release_wq);
>>   err_alloc_wq:
>> +       ib_unregister_client(&ig.rdma_ib_client);
>> +err_register_client:
>>         kmem_cache_destroy(ig.desc_cache);
>>         return err;
>> @@ -1127,6 +1137,7 @@ static void __exit iser_exit(void)
>>         iscsi_unregister_transport(&iscsi_iser_transport);
>>         kmem_cache_destroy(ig.desc_cache);
>> +       ib_unregister_client(&ig.rdma_ib_client);
>>   }
>>     module_init(iser_init);
>> diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h
>> b/drivers/infiniband/ulp/iser/iscsi_iser.h
>> index c1ae4aeae2f9..bd87bae65d59 100644
>> --- a/drivers/infiniband/ulp/iser/iscsi_iser.h
>> +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
>> @@ -363,7 +363,6 @@ struct iser_reg_ops {
>>    * @pd:            Protection Domain for this device
>>    * @mr:            Global DMA memory region
>>    * @event_handler: IB events handle routine
>> - * @ig_list:      entry in devices list
>>    * @refcount:      Reference counter, dominated by open iser connections
>>    * @comps_used:    Number of completion contexts used, Min between
>> online
>>    *                 cpus and device max completion vectors
>> @@ -375,8 +374,7 @@ struct iser_device {
>>         struct ib_device             *ib_device;
>>         struct ib_pd                 *pd;
>>         struct ib_event_handler      event_handler;
>> -       struct list_head             ig_list;
>> -       int                          refcount;
>> +       refcount_t                   refcount;
>>         int                          comps_used;
>>         struct iser_comp             *comps;
>>         const struct iser_reg_ops    *reg_ops;
>> @@ -557,15 +555,13 @@ struct iser_page_vec {
>>   /**
>>    * struct iser_global: iSER global context
>>    *
>> - * @device_list_mutex:    protects device_list
>> - * @device_list:          iser devices global list
>> + * @rdma_ib_client:       IB client
>>    * @connlist_mutex:       protects connlist
>>    * @connlist:             iser connections global list
>>    * @desc_cache:           kmem cache for tx dataout
>>    */
>>   struct iser_global {
>> -       struct mutex      device_list_mutex;
>> -       struct list_head  device_list;
>> +       struct ib_client  rdma_ib_client;
>>         struct mutex      connlist_mutex;
>>         struct list_head  connlist;
>>         struct kmem_cache *desc_cache;
>> @@ -578,6 +574,9 @@ extern int iser_pi_guard;
>>   extern unsigned int iser_max_sectors;
>>   extern bool iser_always_reg;
>>   +void iser_ib_client_add_one(struct ib_device *ib_device);
>> +void iser_ib_client_remove_one(struct ib_device *ib_device, void
>> *client_data);
>> +
>>   int iser_assign_reg_ops(struct iser_device *device);
>>     int iser_send_control(struct iscsi_conn *conn,
>> diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c
>> b/drivers/infiniband/ulp/iser/iser_verbs.c
>> index 56b7240a3fc3..b2d2650fefb4 100644
>> --- a/drivers/infiniband/ulp/iser/iser_verbs.c
>> +++ b/drivers/infiniband/ulp/iser/iser_verbs.c
>> @@ -491,55 +491,70 @@ static int iser_create_ib_conn_res(struct ib_conn
>> *ib_conn)
>>         return ret;
>>   }
>>   -/**
>> - * based on the resolved device node GUID see if there already allocated
>> - * device for this device. If there's no such, create one.
>> - */
>>   static
>>   struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id
>> *cma_id)
>>   {
>>         struct iser_device *device;
>>   -     mutex_lock(&ig.device_list_mutex);
>> +       /* Paired with iser_ib_client_remove_one() */
>> +       rcu_read_lock();
>
>
> Why is this not needed for others as well?

Well, I can only consider nvme case, why it is not needed there, but in
other ULPs that can be probably a bug or they have other protections,
I do not know (seems net/rds/ib.c does nice things).

*_remove_one() callback will be firstly called if device is dying and
only then cm_handler() will get RDMA_CM_EVENT_DEVICE_REMOVAL. Order
matters.

NVMe host and target do flush_scheduled_work() in nvme[t]_rdma_remove_one(),
thus it is guaranteed that *_remove_one() is the last one who puts the
reference on the device, i.e.:

nvmet_rdma_remove_one(...)
{
     ...
     flush_scheduled_work();
     WARN_ON(!nvmet_rdma_dev_put(ndev));
}

I added the WARN_ON line to spot violators.  Seems it works as expected,
at least according to my smoke test: map 128 devices, unload mlx4_ib to
invoke *_remove_one().

If you do not have such guarantees, that *_remove_one() puts the last
reference, then you have to have some other strict protections to avoid
a race between setting the NULL pointer and others who can observe
that pointer.


>
>> +       device = ib_get_client_data(cma_id->device, &ig.rdma_ib_client);
>> +       if (device && WARN_ON(!refcount_inc_not_zero(&device->refcount)))
>> +               device = NULL;
>> +       rcu_read_unlock();
>> +
>> +       return device;
>> +}
>>   -     list_for_each_entry(device, &ig.device_list, ig_list)
>> -               /* find if there's a match using the node GUID */
>> -               if (device->ib_device->node_guid ==
>> cma_id->device->node_guid)
>> -                       goto inc_refcnt;
>> +static struct iser_device *iser_device_alloc(struct ib_device *ib_device)
>> +{
>> +       struct iser_device *device;
>>         device = kzalloc(sizeof *device, GFP_KERNEL);
>> -       if (device == NULL)
>> -               goto out;
>> +       if (unlikely(device == NULL))
>> +               return NULL;
>>   -     /* assign this device to the device */
>> -       device->ib_device = cma_id->device;
>> -       /* init the device and link it into ig device list */
>> +       refcount_set(&device->refcount, 1);
>
>
> Nice cleanup!
>
> Would be good if the refcount change is splitted out of this
> patch as its not strictly related to the patch.

Yep.

--
Roman

>
>
>> +       device->ib_device = ib_device;
>>         if (iser_create_device_ib_res(device)) {
>>                 kfree(device);
>> -               device = NULL;
>> -               goto out;
>> +               return NULL;
>>         }
>> -       list_add(&device->ig_list, &ig.device_list);
>>   -inc_refcnt:
>> -       device->refcount++;
>> -out:
>> -       mutex_unlock(&ig.device_list_mutex);
>>         return device;
>>   }
>>     /* if there's no demand for this device, release it */
>>   static void iser_device_try_release(struct iser_device *device)
>>   {
>> -       mutex_lock(&ig.device_list_mutex);
>> -       device->refcount--;
>> -       iser_info("device %p refcount %d\n", device, device->refcount);
>> -       if (!device->refcount) {
>> +       iser_info("device %p refcount %d\n", device,
>> +                 refcount_read(&device->refcount));
>> +       if (refcount_dec_and_test(&device->refcount)) {
>>                 iser_free_device_ib_res(device);
>> -               list_del(&device->ig_list);
>>                 kfree(device);
>>         }
>> -       mutex_unlock(&ig.device_list_mutex);
>> +}
>> +
>> +void iser_ib_client_add_one(struct ib_device *ib_device)
>> +{
>> +       struct iser_device *device;
>> +
>> +       device = iser_device_alloc(ib_device);
>> +       if (!WARN_ON(!device))
>> +               ib_set_client_data(ib_device, &ig.rdma_ib_client, device);
>> +}
>> +
>> +void iser_ib_client_remove_one(struct ib_device *ib_device, void
>> *client_data)
>> +{
>> +       struct iser_device *device = client_data;
>> +
>> +       if (unlikely(!device))
>> +               return;
>> +
>> +       ib_set_client_data(ib_device, &ig.rdma_ib_client, NULL);
>> +       /* Paired with iser_device_find_by_ib_device() */
>> +       synchronize_rcu();
>> +       iser_device_try_release(device);
>>   }
>>     /**
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux