On Thu, Mar 18, 2021 at 01:15:47PM +0200, Leon Romanovsky wrote:
> From: Maor Gottlieb <maorg@xxxxxxxxxx>
>
> The MEMIC buffer, in addition to regular read and write operations, can
> support atomic operations from the host.
>
> Introduce and implement a new UAPI to allocate address space for MEMIC
> operations such as atomic. This includes:
>
> 1. Expose a new IOCTL to request mapping of a MEMIC operation.
> 2. Hold the operation addresses in a list, so the same operation on the
>    same DM is allocated only once.
> 3. Manage a refcount on the mlx5_ib_dm object, so it is kept valid
>    until all addresses are unmapped.
>
> Signed-off-by: Maor Gottlieb <maorg@xxxxxxxxxx>
> Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxx>
> ---
>  drivers/infiniband/hw/mlx5/dm.c          | 196 +++++++++++++++++++++--
>  drivers/infiniband/hw/mlx5/dm.h          |   2 +
>  drivers/infiniband/hw/mlx5/main.c        |   7 +-
>  drivers/infiniband/hw/mlx5/mlx5_ib.h     |  16 +-
>  include/uapi/rdma/mlx5_user_ioctl_cmds.h |  11 ++
>  5 files changed, 214 insertions(+), 18 deletions(-)
>
> --
> 2.30.2
>
> diff --git a/drivers/infiniband/hw/mlx5/dm.c b/drivers/infiniband/hw/mlx5/dm.c
> index 97a925d43312..ee4ee197a626 100644
> --- a/drivers/infiniband/hw/mlx5/dm.c
> +++ b/drivers/infiniband/hw/mlx5/dm.c
> @@ -150,12 +150,14 @@ static int mlx5_cmd_alloc_memic_op(struct mlx5_dm *dm, phys_addr_t addr,
>  }
>
>  static int add_dm_mmap_entry(struct ib_ucontext *context,
> -                             struct mlx5_ib_dm *mdm, u64 address)
> +                             struct mlx5_user_mmap_entry *mentry, u8 mmap_flag,
> +                             size_t size, u64 address)
>  {
> -        mdm->mentry.mmap_flag = MLX5_IB_MMAP_TYPE_MEMIC;
> -        mdm->mentry.address = address;
> +        mentry->mmap_flag = mmap_flag;
> +        mentry->address = address;
> +
>          return rdma_user_mmap_entry_insert_range(
> -                context, &mdm->mentry.rdma_entry, mdm->size,
> +                context, &mentry->rdma_entry, size,
>                  MLX5_IB_MMAP_DEVICE_MEM << 16,
>                  (MLX5_IB_MMAP_DEVICE_MEM << 16) + (1UL << 16) - 1);
>  }
> @@ -183,6 +185,114 @@ static inline int check_dm_type_support(struct mlx5_ib_dev *dev, u32 type)
>          return 0;
>  }
>
> +void mlx5_ib_dm_memic_free(struct kref *kref)
> +{
> +        struct mlx5_ib_dm *dm =
> +                container_of(kref, struct mlx5_ib_dm, memic.ref);
> +        struct mlx5_ib_dev *dev = to_mdev(dm->ibdm.device);
> +
> +        mlx5_cmd_dealloc_memic(&dev->dm, dm->dev_addr, dm->size);
> +        kfree(dm);
> +}
> +
> +static int copy_op_to_user(struct mlx5_ib_dm_op_entry *op_entry,
> +                           struct uverbs_attr_bundle *attrs)
> +{
> +        u64 start_offset;
> +        u16 page_idx;
> +        int err;
> +
> +        page_idx = op_entry->mentry.rdma_entry.start_pgoff & 0xFFFF;
> +        start_offset = op_entry->op_addr & ~PAGE_MASK;
> +        err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_PAGE_INDEX,
> +                             &page_idx, sizeof(page_idx));
> +        if (err)
> +                return err;
> +
> +        return uverbs_copy_to(attrs,
> +                              MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_START_OFFSET,
> +                              &start_offset, sizeof(start_offset));
> +}
> +
> +static int map_existing_op(struct mlx5_ib_dm *dm, u8 op,
> +                           struct uverbs_attr_bundle *attrs)
> +{
> +        struct mlx5_ib_dm_op_entry *op_entry;
> +
> +        op_entry = xa_load(&dm->memic.ops, op);
> +        if (!op_entry)
> +                return -ENOENT;
> +
> +        return copy_op_to_user(op_entry, attrs);
> +}
> +
> +static int UVERBS_HANDLER(MLX5_IB_METHOD_DM_MAP_OP_ADDR)(
> +        struct uverbs_attr_bundle *attrs)
> +{
> +        struct ib_uobject *uobj = uverbs_attr_get_uobject(
> +                attrs, MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_HANDLE);
> +        struct mlx5_ib_dev *dev = to_mdev(uobj->context->device);
> +        struct ib_dm *ibdm = uobj->object;
> +        struct mlx5_ib_dm *dm = to_mdm(ibdm);
> +        struct mlx5_ib_dm_op_entry *op_entry;
> +        int err;
> +        u8 op;
> +
> +        err = uverbs_copy_from(&op, attrs, MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_OP);
> +        if (err)
> +                return err;
> +
> +        if (!(MLX5_CAP_DEV_MEM(dev->mdev, memic_operations) & BIT(op)))
> +                return -EOPNOTSUPP;
> +
> +        mutex_lock(&dm->memic.ops_xa_lock);
> +        err = map_existing_op(dm, op, attrs);
> +        if (!err || err != -ENOENT)
> +                goto err_unlock;
> +
> +        op_entry = kzalloc(sizeof(*op_entry), GFP_KERNEL);
> +        if (!op_entry)
> +                goto err_unlock;
> +
> +        err = mlx5_cmd_alloc_memic_op(&dev->dm, dm->dev_addr, op,
> +                                      &op_entry->op_addr);
> +        if (err) {
> +                kfree(op_entry);
> +                goto err_unlock;
> +        }
> +        op_entry->op = op;
> +        op_entry->dm = dm;
> +
> +        err = add_dm_mmap_entry(uobj->context, &op_entry->mentry,
> +                                MLX5_IB_MMAP_TYPE_MEMIC_OP, dm->size,
> +                                op_entry->op_addr & PAGE_MASK);
> +        if (err) {
> +                mlx5_cmd_dealloc_memic_op(&dev->dm, dm->dev_addr, op);
> +                kfree(op_entry);
> +                goto err_unlock;
> +        }
> +        /* From this point, entry will be freed by mmap_free */
> +        kref_get(&dm->memic.ref);
> +
> +        err = copy_op_to_user(op_entry, attrs);
> +        if (err)
> +                goto err_remove;
> +
> +        err = xa_insert(&dm->memic.ops, op, op_entry, GFP_KERNEL);
> +        if (err)
> +                goto err_remove;
> +        mutex_unlock(&dm->memic.ops_xa_lock);
> +
> +        return 0;
> +
> +err_remove:
> +        rdma_user_mmap_entry_remove(&op_entry->mentry.rdma_entry);
> +err_unlock:
> +        mutex_unlock(&dm->memic.ops_xa_lock);
> +
> +        return err;
> +}
> +
>  static int handle_alloc_dm_memic(struct ib_ucontext *ctx, struct mlx5_ib_dm *dm,
>                                   struct ib_dm_alloc_attr *attr,
>                                   struct uverbs_attr_bundle *attrs)
> @@ -193,6 +303,9 @@ static int handle_alloc_dm_memic(struct ib_ucontext *ctx, struct mlx5_ib_dm *dm,
>          int err;
>          u64 address;
>
> +        kref_init(&dm->memic.ref);
> +        xa_init(&dm->memic.ops);
> +        mutex_init(&dm->memic.ops_xa_lock);
>          dm->size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
>
>          err = mlx5_cmd_alloc_memic(dm_db, &dm->dev_addr,
> @@ -203,18 +316,17 @@ static int handle_alloc_dm_memic(struct ib_ucontext *ctx, struct mlx5_ib_dm *dm,
>          }
>
>          address = dm->dev_addr & PAGE_MASK;
> -        err = add_dm_mmap_entry(ctx, dm, address);
> +        err = add_dm_mmap_entry(ctx, &dm->memic.mentry, MLX5_IB_MMAP_TYPE_MEMIC,
> +                                dm->size, address);
>          if (err) {
>                  mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
>                  kfree(dm);
>                  return err;
>          }
>
> -        page_idx = dm->mentry.rdma_entry.start_pgoff & 0xFFFF;
> -        err = uverbs_copy_to(attrs,
> -                             MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
> -                             &page_idx,
> -                             sizeof(page_idx));
> +        page_idx = dm->memic.mentry.rdma_entry.start_pgoff & 0xFFFF;
> +        err = uverbs_copy_to(attrs, MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
> +                             &page_idx, sizeof(page_idx));
>          if (err)
>                  goto err_copy;
>
> @@ -228,7 +340,7 @@ static int handle_alloc_dm_memic(struct ib_ucontext *ctx, struct mlx5_ib_dm *dm,
>          return 0;
>
>  err_copy:
> -        rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
> +        rdma_user_mmap_entry_remove(&dm->memic.mentry.rdma_entry);
>
>          return err;
>  }
> @@ -292,6 +404,7 @@ struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
>                  return ERR_PTR(-ENOMEM);
>
>          dm->type = type;
> +        dm->ibdm.device = ibdev;
>
>          switch (type) {
>          case MLX5_IB_UAPI_DM_TYPE_MEMIC:
> @@ -323,6 +436,19 @@ struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
>          return ERR_PTR(err);
>  }
>
> +static void dm_memic_remove_ops(struct mlx5_ib_dm *dm)
> +{
> +        struct mlx5_ib_dm_op_entry *entry;
> +        unsigned long idx;
> +
> +        mutex_lock(&dm->memic.ops_xa_lock);
> +        xa_for_each(&dm->memic.ops, idx, entry) {
> +                xa_erase(&dm->memic.ops, idx);
> +                rdma_user_mmap_entry_remove(&entry->mentry.rdma_entry);
> +        }
> +        mutex_unlock(&dm->memic.ops_xa_lock);
> +}
> +
>  int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs)
>  {
>          struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
> @@ -333,7 +459,8 @@ int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs)
>
>          switch (dm->type) {
>          case MLX5_IB_UAPI_DM_TYPE_MEMIC:
> -                rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
> +                dm_memic_remove_ops(dm);
> +                rdma_user_mmap_entry_remove(&dm->memic.mentry.rdma_entry);
>                  return 0;
>          case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
>                  ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_STEERING,
> @@ -359,6 +486,31 @@ int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs)
>          return 0;
>  }
>
> +void mlx5_ib_dm_mmap_free(struct mlx5_ib_dev *dev,
> +                          struct mlx5_user_mmap_entry *mentry)
> +{
> +        struct mlx5_ib_dm_op_entry *op_entry;
> +        struct mlx5_ib_dm *mdm;
> +
> +        switch (mentry->mmap_flag) {
> +        case MLX5_IB_MMAP_TYPE_MEMIC:
> +                mdm = container_of(mentry, struct mlx5_ib_dm, memic.mentry);
> +                kref_put(&mdm->memic.ref, mlx5_ib_dm_memic_free);
> +                break;
> +        case MLX5_IB_MMAP_TYPE_MEMIC_OP:
> +                op_entry = container_of(mentry, struct mlx5_ib_dm_op_entry,
> +                                        mentry);
> +                mdm = op_entry->dm;
> +                mlx5_cmd_dealloc_memic_op(&dev->dm, mdm->dev_addr,
> +                                          op_entry->op);
> +                kfree(op_entry);
> +                kref_put(&mdm->memic.ref, mlx5_ib_dm_memic_free);
> +                break;
> +        default:
> +                WARN_ON(true);
> +        }
> +}
> +
>  ADD_UVERBS_ATTRIBUTES_SIMPLE(
>          mlx5_ib_dm, UVERBS_OBJECT_DM, UVERBS_METHOD_DM_ALLOC,
>          UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
> @@ -368,8 +520,28 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE(
>          UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
>                               enum mlx5_ib_uapi_dm_type, UA_OPTIONAL));
>
> +DECLARE_UVERBS_NAMED_METHOD(
> +        MLX5_IB_METHOD_DM_MAP_OP_ADDR,
> +        UVERBS_ATTR_IDR(MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_HANDLE,
> +                        UVERBS_OBJECT_DM,
> +                        UVERBS_ACCESS_READ,
> +                        UA_MANDATORY),
> +        UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_OP,
> +                           UVERBS_ATTR_TYPE(u8),
> +                           UA_MANDATORY),
> +        UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_START_OFFSET,
> +                            UVERBS_ATTR_TYPE(u64),
> +                            UA_MANDATORY),
> +        UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_PAGE_INDEX,
> +                            UVERBS_ATTR_TYPE(u16),
> +                            UA_OPTIONAL));
> +
> +DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DM,
> +                              &UVERBS_METHOD(MLX5_IB_METHOD_DM_MAP_OP_ADDR));
> +
>  const struct uapi_definition mlx5_ib_dm_defs[] = {
>          UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
> +        UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DM),
>          {},
>  };
>
> diff --git a/drivers/infiniband/hw/mlx5/dm.h b/drivers/infiniband/hw/mlx5/dm.h
> index adb39d3d8131..56cf1ba9c985 100644
> --- a/drivers/infiniband/hw/mlx5/dm.h
> +++ b/drivers/infiniband/hw/mlx5/dm.h
> @@ -8,6 +8,8 @@
>
>  #include "mlx5_ib.h"
>
> +void mlx5_ib_dm_mmap_free(struct mlx5_ib_dev *dev,
> +                          struct mlx5_user_mmap_entry *mentry);
>  void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr,
>                              u64 length);
>  void mlx5_cmd_dealloc_memic_op(struct mlx5_dm *dm, phys_addr_t addr,
> diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
> index 49c8c60d9520..6908db28b796 100644
> --- a/drivers/infiniband/hw/mlx5/main.c
> +++ b/drivers/infiniband/hw/mlx5/main.c
> @@ -2090,14 +2090,11 @@ static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
>          struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
>          struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device);
>          struct mlx5_var_table *var_table = &dev->var_table;
> -        struct mlx5_ib_dm *mdm;
>
>          switch (mentry->mmap_flag) {
>          case MLX5_IB_MMAP_TYPE_MEMIC:
> -                mdm = container_of(mentry, struct mlx5_ib_dm, mentry);
> -                mlx5_cmd_dealloc_memic(&dev->dm, mdm->dev_addr,
> -                                       mdm->size);
> -                kfree(mdm);
> +        case MLX5_IB_MMAP_TYPE_MEMIC_OP:
> +                mlx5_ib_dm_mmap_free(dev, mentry);
>                  break;
>          case MLX5_IB_MMAP_TYPE_VAR:
>                  mutex_lock(&var_table->bitmap_lock);
> diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> index ae971de6e934..b714131f87b7 100644
> --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
> +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> @@ -166,6 +166,7 @@ enum mlx5_ib_mmap_type {
>          MLX5_IB_MMAP_TYPE_VAR = 2,
>          MLX5_IB_MMAP_TYPE_UAR_WC = 3,
>          MLX5_IB_MMAP_TYPE_UAR_NC = 4,
> +        MLX5_IB_MMAP_TYPE_MEMIC_OP = 5,
>  };
>
>  struct mlx5_bfreg_info {
> @@ -618,18 +619,30 @@ struct mlx5_user_mmap_entry {
>          u32 page_idx;
>  };
>
> +struct mlx5_ib_dm_op_entry {
> +        struct mlx5_user_mmap_entry mentry;
> +        phys_addr_t op_addr;
> +        struct mlx5_ib_dm *dm;
> +        u8 op;
> +};
> +
>  struct mlx5_ib_dm {
>          struct ib_dm ibdm;
>          phys_addr_t dev_addr;
>          u32 type;
>          size_t size;
>          union {
> +                struct {
> +                        struct mlx5_user_mmap_entry mentry;
> +                        struct xarray ops;
> +                        struct mutex ops_xa_lock;
> +                        struct kref ref;
> +                } memic;
>                  struct {
>                          u32 obj_id;
>                  } icm_dm;

This union is making it much too difficult to read and understand now.
An optional kref inside a structure is too far.

Please split it into more types and have proper type safety throughout.

It looks mostly fine otherwise. The error flows are a bit hard to read,
though; when a new type is added this should also get reorganized so we
don't do stuff like:

err_free:
        /* In MEMIC error flow, dm will be freed internally */
        if (type != MLX5_IB_UAPI_DM_TYPE_MEMIC)
                kfree(dm);

I'd inline the checks from check_dm_type_support() into their respective
allocation functions too.

Jason
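
For illustration, the kind of split being asked for could look roughly
like the sketch below. It only assumes the existing mlx5_ib definitions;
the type and helper names (mlx5_ib_dm_memic, mlx5_ib_dm_icm, to_memic(),
to_icm()) are hypothetical, not something the patch or this reply defines:

/*
 * Sketch only: split the union into per-type structures that embed a
 * common base, so container_of() gives type safety and the kref lives
 * only in the MEMIC type that actually needs it.
 */
struct mlx5_ib_dm {
        struct ib_dm ibdm;
        u32 type;
        phys_addr_t dev_addr;
        size_t size;
};

struct mlx5_ib_dm_memic {
        struct mlx5_ib_dm base;
        struct mlx5_user_mmap_entry mentry;
        struct xarray ops;
        struct mutex ops_xa_lock;
        struct kref ref;
};

struct mlx5_ib_dm_icm {
        struct mlx5_ib_dm base;
        u32 obj_id;
};

static inline struct mlx5_ib_dm_memic *to_memic(struct ib_dm *ibdm)
{
        return container_of(ibdm, struct mlx5_ib_dm_memic, base.ibdm);
}

static inline struct mlx5_ib_dm_icm *to_icm(struct ib_dm *ibdm)
{
        return container_of(ibdm, struct mlx5_ib_dm_icm, base.ibdm);
}

With separate types, each per-type allocation handler could allocate and
free its own object, which would remove the "dm will be freed internally"
special case from the shared error path, and the per-type capability
check from check_dm_type_support() could move into the matching handler.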