> From: Yishai Hadas <yishaih@xxxxxxxxxxxxxxxxxx>
> Sent: Thursday, October 31, 2019 2:36 PM
>
> On 10/30/2019 11:44 AM, Michal Kalderon wrote:
> > Create some common APIs for adding entries to the mmap_xa, searching
> > for an entry and freeing one.
> >
> > Most of the code was copied from the efa driver almost as-is, just
> > renaming the functions to be generic and not efa-specific.
> > Moving this code to the core allows managing it differently, so that
> > entries can now be removed and deleted when both driver and user are
> > done with them. This also enabled changing the insert algorithm
> > compared to what was done in efa.
> >
> > Signed-off-by: Ariel Elior <ariel.elior@xxxxxxxxxxx>
> > Signed-off-by: Michal Kalderon <michal.kalderon@xxxxxxxxxxx>
> > ---
> >  drivers/infiniband/core/device.c         |   1 +
> >  drivers/infiniband/core/ib_core_uverbs.c | 201 +++++++++++++++++++++++++++++++
> >  drivers/infiniband/core/rdma_core.c      |   1 +
> >  drivers/infiniband/core/uverbs_cmd.c     |   2 +
> >  include/rdma/ib_verbs.h                  |  34 ++++++
> >  5 files changed, 239 insertions(+)
> >
> > diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> > index a667636f74bf..bf3a683057bc 100644
> > --- a/drivers/infiniband/core/device.c
> > +++ b/drivers/infiniband/core/device.c
> > @@ -2629,6 +2629,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
> >          SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
> >          SET_DEVICE_OP(dev_ops, map_phys_fmr);
> >          SET_DEVICE_OP(dev_ops, mmap);
> > +        SET_DEVICE_OP(dev_ops, mmap_free);
> >          SET_DEVICE_OP(dev_ops, modify_ah);
> >          SET_DEVICE_OP(dev_ops, modify_cq);
> >          SET_DEVICE_OP(dev_ops, modify_device);
> > diff --git a/drivers/infiniband/core/ib_core_uverbs.c b/drivers/infiniband/core/ib_core_uverbs.c
> > index b74d2a2fb342..1ffc89fd5d94 100644
> > --- a/drivers/infiniband/core/ib_core_uverbs.c
> > +++ b/drivers/infiniband/core/ib_core_uverbs.c
> > @@ -71,3 +71,204 @@ int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
> >          return 0;
> >  }
> >  EXPORT_SYMBOL(rdma_user_mmap_io);
> > +
> > +/**
> > + * rdma_user_mmap_entry_get() - Get an entry from the mmap_xa.
> > + *
> > + * @ucontext: associated user context.
> > + * @key: the key received from rdma_user_mmap_entry_insert which
> > + *       is provided by the user as the address to map.
> > + * @vma: the vma related to the current mmap call.
> > + *
> > + * This function is called when a user tries to mmap a key it
> > + * initially received from the driver. The key was created by
> > + * the function rdma_user_mmap_entry_insert.
> > + * This function increases the refcnt of the entry so that it won't
> > + * be deleted from the xa in the meantime.
> > + *
> > + * Return an entry if it exists or NULL if there is no match.
> > + */
> > +struct rdma_user_mmap_entry *
> > +rdma_user_mmap_entry_get(struct ib_ucontext *ucontext, u64 key,
> > +                         struct vm_area_struct *vma)
>
> Where is @vma used in this function? I would expect this API to return
> the entry pointed to by @key without any relation to @vma; wasn't that
> the plan?

Yes, it seems this is left over from the first version of the series, which
also tried making the remap common. The parameter can be removed.
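For reference, this is roughly how a driver's mmap handler would consume
these helpers (a hypothetical sketch; the mydrv_* names and the pgprot
choice are made up). Note that the lookup itself never touches the vma:

static int mydrv_mmap(struct ib_ucontext *ucontext,
                      struct vm_area_struct *vma)
{
        size_t length = vma->vm_end - vma->vm_start;
        struct rdma_user_mmap_entry *rdma_entry;
        u64 key = vma->vm_pgoff << PAGE_SHIFT;
        int err;

        /* The key alone identifies the entry */
        rdma_entry = rdma_user_mmap_entry_get(ucontext, key, vma);
        if (!rdma_entry)
                return -EINVAL;

        err = rdma_user_mmap_io(ucontext, vma,
                                PFN_DOWN(mydrv_entry_phys(rdma_entry)),
                                length,
                                pgprot_noncached(vma->vm_page_prot));

        /* Drop the reference taken by rdma_user_mmap_entry_get() */
        rdma_user_mmap_entry_put(ucontext, rdma_entry);
        return err;
}

(A driver that needs the entry to stay alive for as long as the vma exists
would instead keep the reference until the vma is closed and call
rdma_user_mmap_entry_put() from there.)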
>
> > +{
> > +        struct rdma_user_mmap_entry *entry;
> > +        u64 mmap_page;
> > +
> > +        mmap_page = key >> PAGE_SHIFT;
> > +        if (mmap_page > U32_MAX)
> > +                return NULL;
> > +
> > +        xa_lock(&ucontext->mmap_xa);
> > +
> > +        entry = xa_load(&ucontext->mmap_xa, mmap_page);
> > +
> > +        /* if the refcount is zero, the entry is already being deleted */
> > +        if (!entry || entry->invalid || !kref_get_unless_zero(&entry->ref))
> > +                goto err;
> > +
> > +        xa_unlock(&ucontext->mmap_xa);
> > +
> > +        ibdev_dbg(ucontext->device,
> > +                  "mmap: key[%#llx] npages[%#x] returned\n",
> > +                  key, entry->npages);
> > +
> > +        return entry;
> > +
> > +err:
> > +        xa_unlock(&ucontext->mmap_xa);
> > +        return NULL;
> > +}
> > +EXPORT_SYMBOL(rdma_user_mmap_entry_get);
> > +
> > +void rdma_user_mmap_entry_free(struct kref *kref)
> > +{
> > +        struct rdma_user_mmap_entry *entry =
> > +                container_of(kref, struct rdma_user_mmap_entry, ref);
> > +        struct ib_ucontext *ucontext = entry->ucontext;
> > +        unsigned long i;
> > +
> > +        /* need to erase all xarray indices occupied by this single entry */
> > +        xa_lock(&ucontext->mmap_xa);
> > +        for (i = 0; i < entry->npages; i++)
> > +                __xa_erase(&ucontext->mmap_xa, entry->mmap_page + i);
> > +        xa_unlock(&ucontext->mmap_xa);
> > +
> > +        ibdev_dbg(ucontext->device,
> > +                  "mmap: key[%#llx] npages[%#x] removed\n",
> > +                  rdma_user_mmap_get_key(entry),
> > +                  entry->npages);
> > +
> > +        if (ucontext->device->ops.mmap_free)
> > +                ucontext->device->ops.mmap_free(entry);
> > +}
> > +
> > +/**
> > + * rdma_user_mmap_entry_put() - Drop reference to the mmap entry
> > + *
> > + * @ucontext: associated user context.
> > + * @entry: an entry in the mmap_xa.
> > + *
> > + * This function is called when the mapping is closed if it was
> > + * an io mapping, or when the driver is done with the entry for
> > + * some other reason.
> > + * It should be called after rdma_user_mmap_entry_get was called
> > + * and the entry is no longer needed. This function will erase the
> > + * entry and free it if its refcnt reaches zero.
> > + */
> > +void rdma_user_mmap_entry_put(struct ib_ucontext *ucontext,
> > +                              struct rdma_user_mmap_entry *entry)
> > +{
> > +        kref_put(&entry->ref, rdma_user_mmap_entry_free);
> > +}
> > +EXPORT_SYMBOL(rdma_user_mmap_entry_put);
> > +
> > +/**
> > + * rdma_user_mmap_entry_remove() - Drop reference to entry and
> > + *                                 mark it as invalid.
> > + *
> > + * @ucontext: associated user context.
> > + * @entry: the entry to remove from the mmap_xa
> > + */
> > +void rdma_user_mmap_entry_remove(struct ib_ucontext *ucontext,
> > +                                 struct rdma_user_mmap_entry *entry)
> > +{
> > +        if (!entry)
> > +                return;
> > +
> > +        entry->invalid = true;
> > +        kref_put(&entry->ref, rdma_user_mmap_entry_free);
> > +}
> > +EXPORT_SYMBOL(rdma_user_mmap_entry_remove);
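As an aside, for anyone following along: the expectation is that a driver
embeds the core entry in its own structure and gets back to it with
container_of(), along these lines (again a hypothetical sketch; the
mydrv_* names and the resp.key field are made up, and the insert helper
is the one quoted just below):

struct mydrv_user_mmap_entry {
        struct rdma_user_mmap_entry rdma_entry;
        u64 address;    /* driver-private, e.g. a BAR or DMA address */
};

static inline struct mydrv_user_mmap_entry *
to_mydrv_mmap_entry(struct rdma_user_mmap_entry *rdma_entry)
{
        return container_of(rdma_entry, struct mydrv_user_mmap_entry,
                            rdma_entry);
}

        /* at object creation time: */
        err = rdma_user_mmap_entry_insert(ucontext, &entry->rdma_entry,
                                          length);
        if (err) {
                kfree(entry);
                return err;
        }
        /* the key is returned to userspace as the mmap offset to use */
        resp.key = rdma_user_mmap_get_key(&entry->rdma_entry);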
> > +
> > +/**
> > + * rdma_user_mmap_entry_insert() - Insert an entry to the mmap_xa.
> > + *
> > + * @ucontext: associated user context.
> > + * @entry: the entry to insert into the mmap_xa
> > + * @length: length of the address that will be mmapped
> > + *
> > + * This function should be called by drivers that use the rdma_user_mmap
> > + * interface for handling user mmapped addresses. The database is handled
> > + * in the core and helper functions are provided to insert entries into
> > + * the database and extract entries when the user calls mmap with the
> > + * given key. The function allocates a unique key that should be provided
> > + * to the user; the user will use the key to retrieve information such as
> > + * the address to be mapped and how.
> > + *
> > + * Return: 0 on success and -ENOMEM on failure
> > + */
> > +int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
> > +                                struct rdma_user_mmap_entry *entry,
> > +                                size_t length)
> > +{
> > +        struct ib_uverbs_file *ufile = ucontext->ufile;
> > +        XA_STATE(xas, &ucontext->mmap_xa, 0);
> > +        u32 xa_first, xa_last, npages;
> > +        int err, i;
> > +
> > +        if (!entry)
> > +                return -EINVAL;
> > +
> > +        kref_init(&entry->ref);
> > +        entry->ucontext = ucontext;
> > +
> > +        /* We want the whole allocation to be done without interruption
> > +         * from a different thread. The allocation requires finding a
> > +         * free range and storing. During the __xa_insert the lock could
> > +         * be released; we don't want another thread taking the gap.
> > +         */
> > +        mutex_lock(&ufile->umap_lock);
> > +
> > +        xa_lock(&ucontext->mmap_xa);
> > +
> > +        /* We want to find an empty range */
> > +        npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE);
> > +        entry->npages = npages;
> > +        while (true) {
> > +                /* First find an empty index */
> > +                xas_find_marked(&xas, U32_MAX, XA_FREE_MARK);
> > +                if (xas.xa_node == XAS_RESTART)
> > +                        goto err_unlock;
> > +
> > +                xa_first = xas.xa_index;
> > +
> > +                /* Is there enough room to have the range? */
> > +                if (check_add_overflow(xa_first, npages, &xa_last))
> > +                        goto err_unlock;
> > +
> > +                /* Now look for the next present entry. If such doesn't
> > +                 * exist, we found an empty range and can proceed.
> > +                 */
> > +                xas_next_entry(&xas, xa_last - 1);
> > +                if (xas.xa_node == XAS_BOUNDS || xas.xa_index >= xa_last)
> > +                        break;
> > +                /* otherwise, look for the next free entry */
> > +        }
> > +
> > +        for (i = xa_first; i < xa_last; i++) {
> > +                err = __xa_insert(&ucontext->mmap_xa, i, entry, GFP_KERNEL);
> > +                if (err)
> > +                        goto err_undo;
> > +        }
> > +
> > +        entry->mmap_page = xa_first;
> > +        xa_unlock(&ucontext->mmap_xa);
> > +
> > +        mutex_unlock(&ufile->umap_lock);
> > +        ibdev_dbg(ucontext->device,
> > +                  "mmap: key[%#llx] npages[%#x] inserted\n",
> > +                  rdma_user_mmap_get_key(entry), npages);
> > +
> > +        return 0;
> > +
> > +err_undo:
> > +        for (; i > xa_first; i--)
> > +                __xa_erase(&ucontext->mmap_xa, i - 1);
> > +
> > +err_unlock:
> > +        xa_unlock(&ucontext->mmap_xa);
> > +        mutex_unlock(&ufile->umap_lock);
> > +        return -ENOMEM;
> > +}
> > +EXPORT_SYMBOL(rdma_user_mmap_entry_insert);
> > diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
> > index ccf4d069c25c..6c72773faf29 100644
> > --- a/drivers/infiniband/core/rdma_core.c
> > +++ b/drivers/infiniband/core/rdma_core.c
> > @@ -817,6 +817,7 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
> >          rdma_restrack_del(&ucontext->res);
> >
> >          ib_dev->ops.dealloc_ucontext(ucontext);
> > +        WARN_ON(!xa_empty(&ucontext->mmap_xa));
> >          kfree(ucontext);
> >
> >          ufile->ucontext = NULL;
> > diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
> > index 14a80fd9f464..06ed32c8662f 100644
> > --- a/drivers/infiniband/core/uverbs_cmd.c
> > +++ b/drivers/infiniband/core/uverbs_cmd.c
> > @@ -252,6 +252,8 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
> >          ucontext->closing = false;
> >          ucontext->cleanup_retryable = false;
> >
> > +        xa_init_flags(&ucontext->mmap_xa, XA_FLAGS_ALLOC);
> > +
> >          ret = get_unused_fd_flags(O_CLOEXEC);
> >          if (ret < 0)
> >                  goto err_free;
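To make the free-range search in rdma_user_mmap_entry_insert() concrete,
here is a worked example (the numbers are illustrative only):

/* Inserting a 3-page entry when index 5 starts a large enough gap:
 *
 *   npages   = DIV_ROUND_UP(3 * PAGE_SIZE, PAGE_SIZE)   == 3
 *   xa_first = 5                 first free index found
 *   xa_last  = 5 + 3             first index past the range
 *   occupied = indices 5, 6, 7   all store the same entry pointer
 *   key      = 5 << PAGE_SHIFT   handed back to userspace
 *
 * rdma_user_mmap_entry_get() recovers index 5 by shifting the key
 * right by PAGE_SHIFT, and an mmap at key + PAGE_SIZE lands on
 * index 6, which maps to the same entry.
 */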
> > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> > index 6a47ba85c54c..8a87c9d442bc 100644
> > --- a/include/rdma/ib_verbs.h
> > +++ b/include/rdma/ib_verbs.h
> > @@ -1471,6 +1471,7 @@ struct ib_ucontext {
> >           * Implementation details of the RDMA core, don't use in drivers:
> >           */
> >          struct rdma_restrack_entry res;
> > +        struct xarray mmap_xa;
> >  };
> >
> >  struct ib_uobject {
> > @@ -2251,6 +2252,20 @@ struct iw_cm_conn_param;
> >
> >  #define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct
> >
> > +struct rdma_user_mmap_entry {
> > +        struct kref ref;
> > +        struct ib_ucontext *ucontext;
> > +        u32 npages;
> > +        u32 mmap_page;
> > +        bool invalid;
> > +};
> > +
> > +static inline u64
> > +rdma_user_mmap_get_key(const struct rdma_user_mmap_entry *entry)
> > +{
> > +        return (u64)entry->mmap_page << PAGE_SHIFT;
> > +}
> > +
> >  /**
> >   * struct ib_device_ops - InfiniBand device operations
> >   * This structure defines all the InfiniBand device operations, providers will
> > @@ -2363,6 +2378,13 @@ struct ib_device_ops {
> >                                struct ib_udata *udata);
> >          void (*dealloc_ucontext)(struct ib_ucontext *context);
> >          int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma);
> > +        /**
> > +         * This will be called once the refcount of an entry in the mmap_xa
> > +         * reaches zero. The type of the memory that was mapped may differ
> > +         * between entries and is opaque to the rdma_user_mmap interface.
> > +         * Therefore it needs to be implemented by the driver in mmap_free.
> > +         */
> > +        void (*mmap_free)(struct rdma_user_mmap_entry *entry);
> >          void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
> >          int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
> >          void (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
> > @@ -2801,6 +2823,18 @@ static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext,
> >          return -EINVAL;
> >  }
> >  #endif
> > +int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
> > +                                struct rdma_user_mmap_entry *entry,
> > +                                size_t length);
> > +struct rdma_user_mmap_entry *
> > +rdma_user_mmap_entry_get(struct ib_ucontext *ucontext, u64 key,
> > +                         struct vm_area_struct *vma);
> > +
> > +void rdma_user_mmap_entry_put(struct ib_ucontext *ucontext,
> > +                              struct rdma_user_mmap_entry *entry);
> > +
> > +void rdma_user_mmap_entry_remove(struct ib_ucontext *ucontext,
> > +                                 struct rdma_user_mmap_entry *entry);
> >
> >  static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
> >  {
> >
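For completeness, a sketch of what a driver's mmap_free could look like,
reusing the hypothetical embedding from above (the use of
free_pages_exact() is made up for the example; the right release action
depends on what the driver actually mapped):

static void mydrv_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
{
        struct mydrv_user_mmap_entry *entry =
                to_mydrv_mmap_entry(rdma_entry);

        /* The core calls this once the last kref is dropped, i.e. the
         * entry was removed and userspace no longer holds a mapping.
         */
        free_pages_exact(phys_to_virt(entry->address),
                         entry->rdma_entry.npages * PAGE_SIZE);
        kfree(entry);
}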