On Tue, Jun 18, 2019 at 08:15:38PM +0300, Leon Romanovsky wrote: > From: Yishai Hadas <yishaih@xxxxxxxxxxxx> > > Enable subscription for device events over DEVX. > > Each subscription is added to the two level XA data structure according > to its event number and the DEVX object information in case was given > with the given target fd. > > Those events will be reported over the given fd once will occur. > Downstream patches will mange the dispatching to any subscription. > > Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxxxx> > Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx> > drivers/infiniband/hw/mlx5/devx.c | 564 ++++++++++++++++++++++- > include/uapi/rdma/mlx5_user_ioctl_cmds.h | 9 + > 2 files changed, 566 insertions(+), 7 deletions(-) > > diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c > index e9b9ba5a3e9a..304b13e7a265 100644 > +++ b/drivers/infiniband/hw/mlx5/devx.c > @@ -14,6 +14,7 @@ > #include <linux/mlx5/driver.h> > #include <linux/mlx5/fs.h> > #include "mlx5_ib.h" > +#include <linux/xarray.h> > > #define UVERBS_MODULE_NAME mlx5_ib > #include <rdma/uverbs_named_ioctl.h> > @@ -33,6 +34,37 @@ struct devx_async_data { > struct mlx5_ib_uapi_devx_async_cmd_hdr hdr; > }; > > +/* first level XA value data structure */ > +struct devx_event { > + struct xarray object_ids; /* second XA level, Key = object id */ > + struct list_head unaffiliated_list; > +}; > + > +/* second level XA value data structure */ > +struct devx_obj_event { > + struct rcu_head rcu; > + struct list_head obj_sub_list; > +}; > + > +struct devx_event_subscription { > + struct list_head file_list; /* headed in private_data-> > + * subscribed_events_list > + */ > + struct list_head xa_list; /* headed in devx_event->unaffiliated_list or > + * devx_obj_event->obj_sub_list > + */ > + struct list_head obj_list; /* headed in devx_object */ > + > + u32 xa_key_level1; > + u32 xa_key_level2; > + struct rcu_head rcu; > + u64 cookie; > + bool is_obj_related; > + struct ib_uobject *fd_uobj; > + void *object; /* May need direct access upon hot unplug */ This should be a 'struct file *' and have a better name. And I'm unclear why we need to store both the ib_uobject and the struct file for the same thing? And why are we storing the uobj here instead of the struct devx_async_event_file *? Since uobj->object == flip && filp->private_data == uobj, I have a hard time to understand why we need both things, it seems to me that if we get the fget on the filp then we can rely on the filp->private_data to get back to the devx_async_event_file. > + struct eventfd_ctx *eventfd; > +}; > + > /* > * As the obj_id in the firmware is not globally unique the object type > * must be considered upon checking for a valid object id. > @@ -1143,14 +1275,53 @@ static void devx_cleanup_mkey(struct devx_obj *obj) > write_unlock_irqrestore(&table->lock, flags); > } > > +static void devx_cleanup_subscription(struct mlx5_ib_dev *dev, > + struct devx_event_subscription *sub) > +{ > + list_del_rcu(&sub->file_list); > + list_del_rcu(&sub->xa_list); > + > + if (sub->is_obj_related) { is_obj_related looks like it is just list_empty(obj_list)?? Success oriented flow > @@ -1523,6 +1700,350 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)( > return err; > } > > +static void > +subscribe_event_xa_dealloc(struct mlx5_devx_event_table *devx_event_table, > + u32 key_level1, > + u32 key_level2, > + struct devx_obj_event *alloc_obj_event) > +{ > + struct devx_event *event; > + > + /* Level 1 is valid for future use - no need to free */ > + if (!alloc_obj_event) > + return; > + > + event = xa_load(&devx_event_table->event_xa, key_level1); > + WARN_ON(!event); > + > + xa_erase(&event->object_ids, key_level2); Shoulnd't this only erase if the value stored is NULL? > + kfree(alloc_obj_event); > +} > + > +static int > +subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table, > + u32 key_level1, > + bool is_level2, > + u32 key_level2, > + struct devx_obj_event **alloc_obj_event) > +{ > + struct devx_obj_event *obj_event; > + struct devx_event *event; > + bool new_entry_level1 = false; > + int err; > + > + event = xa_load(&devx_event_table->event_xa, key_level1); > + if (!event) { > + event = kzalloc(sizeof(*event), GFP_KERNEL); > + if (!event) > + return -ENOMEM; > + > + new_entry_level1 = true; > + INIT_LIST_HEAD(&event->unaffiliated_list); > + xa_init(&event->object_ids); > + > + err = xa_insert(&devx_event_table->event_xa, > + key_level1, > + event, > + GFP_KERNEL); > + if (err) > + goto end; > + } > + > + if (!is_level2) > + return 0; > + > + obj_event = xa_load(&event->object_ids, key_level2); > + if (!obj_event) { > + err = xa_reserve(&event->object_ids, key_level2, GFP_KERNEL); > + if (err) > + goto err_level1; > + > + obj_event = kzalloc(sizeof(*obj_event), GFP_KERNEL); > + if (!obj_event) { > + err = -ENOMEM; > + goto err_level2; > + } > + > + INIT_LIST_HEAD(&obj_event->obj_sub_list); > + *alloc_obj_event = obj_event; This is goofy, just store the empty obj_event in the xa instead of using xa_reserve, and when you go to do the error unwind just delete any level2' devx_obj_event' that has a list_empty(obj_sub_list), get rid of the wonky alloc_obj_event stuff. The best configuration would be to use devx_cleanup_subscription to undo the partially ready subscription. > + } > + > + return 0; > + > +err_level2: > + xa_erase(&event->object_ids, key_level2); > + > +err_level1: > + if (new_entry_level1) > + xa_erase(&devx_event_table->event_xa, key_level1); > +end: > + if (new_entry_level1) > + kfree(event); Can't do this, once the level1 is put in the tree it could be referenced by the irqs. At least it needs a kfree_rcu, most likely it is simpler to just leave it. > +#define MAX_NUM_EVENTS 16 > +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)( > + struct uverbs_attr_bundle *attrs) > +{ > + struct ib_uobject *devx_uobj = uverbs_attr_get_uobject( > + attrs, > + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE); > + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( > + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); > + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); > + struct ib_uobject *fd_uobj; > + struct devx_obj *obj = NULL; > + struct devx_async_event_file *ev_file; > + struct mlx5_devx_event_table *devx_event_table = &dev->devx_event_table; > + u16 *event_type_num_list; > + struct devx_event_subscription **event_sub_arr; > + struct devx_obj_event **event_obj_array_alloc; > + int redirect_fd; > + bool use_eventfd = false; > + int num_events; > + int num_alloc_xa_entries = 0; > + u16 obj_type = 0; > + u64 cookie = 0; > + u32 obj_id = 0; > + int err; > + int i; > + > + if (!c->devx_uid) > + return -EINVAL; > + > + if (!IS_ERR(devx_uobj)) { > + obj = (struct devx_obj *)devx_uobj->object; > + if (obj) > + obj_id = get_dec_obj_id(obj->obj_id); > + } > + > + fd_uobj = uverbs_attr_get_uobject(attrs, > + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE); > + if (IS_ERR(fd_uobj)) > + return PTR_ERR(fd_uobj); > + > + ev_file = container_of(fd_uobj, struct devx_async_event_file, > + uobj); > + > + if (uverbs_attr_is_valid(attrs, > + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM)) { > + err = uverbs_copy_from(&redirect_fd, attrs, > + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM); > + if (err) > + return err; > + > + use_eventfd = true; > + } > + > + if (uverbs_attr_is_valid(attrs, > + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE)) { > + if (use_eventfd) > + return -EINVAL; > + > + err = uverbs_copy_from(&cookie, attrs, > + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE); > + if (err) > + return err; > + } > + > + num_events = uverbs_attr_ptr_get_array_size( > + attrs, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, > + sizeof(u16)); > + > + if (num_events < 0) > + return num_events; > + > + if (num_events > MAX_NUM_EVENTS) > + return -EINVAL; > + > + event_type_num_list = uverbs_attr_get_alloced_ptr(attrs, > + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST); > + > + if (!is_valid_events(dev->mdev, num_events, event_type_num_list, obj)) > + return -EINVAL; > + > + event_sub_arr = uverbs_zalloc(attrs, > + MAX_NUM_EVENTS * sizeof(struct devx_event_subscription *)); > + event_obj_array_alloc = uverbs_zalloc(attrs, > + MAX_NUM_EVENTS * sizeof(struct devx_obj_event *)); There are so many list_heads in the devx_event_subscription, why not use just one of them to store the allocated events instead of this temp array? ie event_list looks good for this purpose. > + > + if (!event_sub_arr || !event_obj_array_alloc) > + return -ENOMEM; > + > + /* Protect from concurrent subscriptions to same XA entries to allow > + * both to succeed > + */ > + mutex_lock(&devx_event_table->event_xa_lock); > + for (i = 0; i < num_events; i++) { > + u32 key_level1; > + > + if (obj) > + obj_type = get_dec_obj_type(obj, > + event_type_num_list[i]); > + key_level1 = event_type_num_list[i] | obj_type << 16; > + > + err = subscribe_event_xa_alloc(devx_event_table, > + key_level1, > + obj ? true : false, > + obj_id, > + &event_obj_array_alloc[i]); Usless ?: > + if (err) > + goto err; > + > + num_alloc_xa_entries++; > + event_sub_arr[i] = kzalloc(sizeof(*event_sub_arr[i]), > + GFP_KERNEL); > + if (!event_sub_arr[i]) > + goto err; > + > + if (use_eventfd) { > + event_sub_arr[i]->eventfd = > + eventfd_ctx_fdget(redirect_fd); > + > + if (IS_ERR(event_sub_arr[i]->eventfd)) { > + err = PTR_ERR(event_sub_arr[i]->eventfd); > + event_sub_arr[i]->eventfd = NULL; > + goto err; > + } > + } > + > + event_sub_arr[i]->cookie = cookie; > + event_sub_arr[i]->fd_uobj = fd_uobj; > + event_sub_arr[i]->object = fd_uobj->object; > + /* May be needed upon cleanup the devx object/subscription */ > + event_sub_arr[i]->xa_key_level1 = key_level1; > + event_sub_arr[i]->xa_key_level2 = obj_id; > + event_sub_arr[i]->is_obj_related = obj ? true : false; Unneeded ?: Jason