RE: [EXT] Re: [PATCH v7 rdma-next 2/7] RDMA/core: Create mmap database and cookie helper functions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



> From: Jason Gunthorpe <jgg@xxxxxxxx>
> Sent: Tuesday, August 20, 2019 4:21 PM
> 
> ----------------------------------------------------------------------
> On Tue, Aug 20, 2019 at 03:18:42PM +0300, Michal Kalderon wrote:
> > Create some common API's for adding entries to a xa_mmap.
> > Searching for an entry and freeing one.
> >
> > Most of the code was copied from the efa driver almost as is, just
> > renamed function to be generic and not efa specific.
> > In addition to original code, the xa_mmap entries are now linked to a
> > umap_priv object and reference counted according to umap operations.
> > The fact that this code moved to core enabled managing it differently,
> > so that now entries can be removed and deleted when driver+user are
> > done with them. This enabled changing the insert algorithm in
> > comparison to what was done in efa.
> >
> > Signed-off-by: Ariel Elior <ariel.elior@xxxxxxxxxxx>
> > Signed-off-by: Michal Kalderon <michal.kalderon@xxxxxxxxxxx>
> >  drivers/infiniband/core/core_priv.h      |  12 +-
> >  drivers/infiniband/core/device.c         |   1 +
> >  drivers/infiniband/core/ib_core_uverbs.c | 343
> +++++++++++++++++++++++++++++--
> >  drivers/infiniband/core/rdma_core.c      |   1 +
> >  drivers/infiniband/core/uverbs_cmd.c     |   1 +
> >  drivers/infiniband/core/uverbs_main.c    |  18 +-
> >  include/rdma/ib_verbs.h                  |  41 +++-
> >  7 files changed, 381 insertions(+), 36 deletions(-)
> >
> > diff --git a/drivers/infiniband/core/core_priv.h
> > b/drivers/infiniband/core/core_priv.h
> > index 6850e973401c..4951ecfbf133 100644
> > +++ b/drivers/infiniband/core/core_priv.h
> > @@ -388,9 +388,17 @@ void rdma_nl_net_exit(struct rdma_dev_net
> *rnet);
> > struct rdma_umap_priv {
> >  	struct vm_area_struct *vma;
> >  	struct list_head list;
> > +	struct rdma_user_mmap_entry *entry;
> >  };
> >
> > -void rdma_umap_priv_init(struct rdma_umap_priv *priv,
> > -			 struct vm_area_struct *vma);
> > +int rdma_umap_priv_init(struct vm_area_struct *vma,
> > +			struct rdma_user_mmap_entry *entry);
> > +
> > +void rdma_umap_priv_delete(struct ib_uverbs_file *ufile,
> > +			   struct rdma_umap_priv *priv);
> > +
> > +void rdma_user_mmap_entries_remove_free(struct ib_ucontext
> > +*ucontext); void rdma_user_mmap_entry_put(struct ib_ucontext
> *ucontext,
> > +			      struct rdma_user_mmap_entry *entry);
> >
> >  #endif /* _CORE_PRIV_H */
> > diff --git a/drivers/infiniband/core/device.c
> > b/drivers/infiniband/core/device.c
> > index 8892862fb759..229977237d1a 100644
> > +++ b/drivers/infiniband/core/device.c
> > @@ -2594,6 +2594,7 @@ void ib_set_device_ops(struct ib_device *dev,
> const struct ib_device_ops *ops)
> >  	SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
> >  	SET_DEVICE_OP(dev_ops, map_phys_fmr);
> >  	SET_DEVICE_OP(dev_ops, mmap);
> > +	SET_DEVICE_OP(dev_ops, mmap_free);
> >  	SET_DEVICE_OP(dev_ops, modify_ah);
> >  	SET_DEVICE_OP(dev_ops, modify_cq);
> >  	SET_DEVICE_OP(dev_ops, modify_device); diff --git
> > a/drivers/infiniband/core/ib_core_uverbs.c
> > b/drivers/infiniband/core/ib_core_uverbs.c
> > index cab7dc922cf0..cce20172cd71 100644
> > +++ b/drivers/infiniband/core/ib_core_uverbs.c
> > @@ -36,41 +36,98 @@
> >  #include "uverbs.h"
> >  #include "core_priv.h"
> >
> > -/*
> > - * Each time we map IO memory into user space this keeps track of the
> mapping.
> > - * When the device is hot-unplugged we 'zap' the mmaps in user space
> > to point
> > - * to the zero page and allow the hot unplug to proceed.
> > +/**
> > + * rdma_umap_priv_init() - Initialize the private data of a vma
> > + *
> > + * @vma: The vm area struct that needs private data
> > + * @entry: entry into the mmap_xa that needs to be linked with
> > + *       this vma
> > + *
> > + * Each time we map IO memory into user space this keeps track
> > + * of the mapping. When the device is hot-unplugged we 'zap' the
> > + * mmaps in user space to point to the zero page and allow the
> > + * hot unplug to proceed.
> >   *
> >   * This is necessary for cases like PCI physical hot unplug as the actual BAR
> >   * memory may vanish after this and access to it from userspace could
> MCE.
> >   *
> >   * RDMA drivers supporting disassociation must have their user space
> designed
> >   * to cope in some way with their IO pages going to the zero page.
> > + *
> > + * We extended the umap list usage to track all memory that was
> > + mapped by
> > + * user space and not only the IO memory. This will occur for drivers
> > + that use
> > + * the mmap_xa database and helper functions
> > + *
> > + * Return 0 on success or -ENOMEM if out of memory
> >   */
> > -void rdma_umap_priv_init(struct rdma_umap_priv *priv,
> > -			 struct vm_area_struct *vma)
> > +int rdma_umap_priv_init(struct vm_area_struct *vma,
> > +			struct rdma_user_mmap_entry *entry)
> >  {
> >  	struct ib_uverbs_file *ufile = vma->vm_file->private_data;
> > +	struct rdma_umap_priv *priv;
> > +
> > +	/* If the xa_mmap is used, private data will already be initialized.
> > +	 * this is required for the cases that rdma_user_mmap_io is called
> > +	 * from drivers that don't use the xa_mmap database yet
> > +	 */
> > +	if (vma->vm_private_data)
> > +		return 0;
> 
> ?? Still have to track the ufile->umaps though
If the driver uses the mmap_xa this function will be called in an earlier stage
And umaps will be updated, the vm_private_data will be initialized in this case
once the driver calls rdma_user_mmap_io and therefore there is no action
that needs to be taken. 


> 
> > +/**
> > + * rdma_user_mmap_entry_put() - drop reference to the mmap entry
> > + *
> > + * @ucontext: associated user context.
> > + * @entry: An entry in the mmap_xa.
> > + *
> > + * This function is called when the mapping is closed or when
> > + * the driver is done with the entry for some other reason.
> > + * Should be called after rdma_user_mmap_entry_get was called
> > + * and entry is no longer needed. This function will erase the
> > + * entry and free it if it's refcnt reaches zero.
> > + */
> > +void rdma_user_mmap_entry_put(struct ib_ucontext *ucontext,
> > +			      struct rdma_user_mmap_entry *entry) {
> > +	WARN_ON(!kref_read(&entry->ref));
> 
> kref_put does this internally when refcount debugging is enabled
Ok, will remove, thanks.
> 
> > +	kref_put(&entry->ref, rdma_user_mmap_entry_free); }
> > +EXPORT_SYMBOL(rdma_user_mmap_entry_put);
> > +
> > +/**
> > + * rdma_user_mmap_entry_remove() - Remove a key's entry from the
> > +mmap_xa
> > + *
> > + * @ucontext: associated user context.
> > + * @key: The key to be deleted
> > + *
> > + * This function will find if there is an entry matching the key and
> > +if so
> > + * decrease it's refcnt, which will in turn delete the entry if its
> > +refcount
> > + * reaches zero.
> > + */
> > +void rdma_user_mmap_entry_remove(struct ib_ucontext *ucontext,
> u64
> > +key) {
> > +	struct rdma_user_mmap_entry *entry;
> > +	u32 mmap_page;
> > +
> > +	if (key == RDMA_USER_MMAP_INVALID)
> > +		return;
> > +
> > +	mmap_page = key >> PAGE_SHIFT;
> > +	if (mmap_page > U32_MAX)
> > +		return;
> > +
> > +	entry = xa_load(&ucontext->mmap_xa, mmap_page);
> > +	if (!entry)
> > +		return;
> > +
> > +	rdma_user_mmap_entry_put(ucontext, entry); }
> > +EXPORT_SYMBOL(rdma_user_mmap_entry_remove);
> > +
> > +/**
> > + * rdma_user_mmap_entry_insert() - Allocate and insert an entry to the
> mmap_xa.
> > + *
> > + * @ucontext: associated user context.
> > + * @obj: opaque driver object that will be stored in the entry.
> > + * @address: The address that will be mmapped to the user
> > + * @length: Length of the address that will be mmapped
> > + * @mmap_flag: opaque driver flags related to the address (For
> > + *           example could be used for cachability)
> > + *
> > + * This function should be called by drivers that use the
> > +rdma_user_mmap
> > + * interface for handling user mmapped addresses. The database is
> > +handled in
> > + * the core and helper functions are provided to insert entries into
> > +the
> > + * database and extract entries when the user call mmap with the given
> key.
> > + * The function returns a unique key that should be provided to user,
> > +the user
> > + * will use the key to map the given address.
> > + *
> > + * Return: unique key or RDMA_USER_MMAP_INVALID if entry was not
> added.
> > + */
> > +u64 rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, void
> *obj,
> > +				u64 address, u64 length, u8 mmap_flag) {
> > +	XA_STATE(xas, &ucontext->mmap_xa, 0);
> > +	struct rdma_user_mmap_entry *entry;
> > +	unsigned long index = 0, index_max;
> > +	u32 xa_first, xa_last, npages;
> > +	int err, i;
> > +	void *ent;
> > +
> > +	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> > +	if (!entry)
> > +		return RDMA_USER_MMAP_INVALID;
> > +
> > +	entry->obj = obj;
> 
> It is more a kernel pattern to have the driver allocate a
> rdma_user_mmap_entry and extend it with its 'priv', then use container_of
then would we also want the driver to free the memory ? 
Or will it be ok to free it using the kref put callback ? 
> 
> 
> > +	entry->address = address;
> > +	entry->length = length;
> > +	kref_init(&entry->ref);
> > +	entry->mmap_flag = mmap_flag;
> > +	entry->ucontext = ucontext;
> > +
> > +	xa_lock(&ucontext->mmap_xa);
> > +
> > +	/* We want to find an empty range */
> > +	npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE);
> > +	do {
> > +		/* First find an empty index */
> > +		xas_find_marked(&xas, U32_MAX, XA_FREE_MARK);
> > +		if (xas.xa_node == XAS_RESTART)
> > +			goto err_unlock;
> > +
> > +		xa_first = xas.xa_index;
> > +
> > +		/* Is there enough room to have the range? */
> > +		if (check_add_overflow(xa_first, npages, &xa_last))
> > +			goto err_unlock;
> > +
> > +		/* Iterate over all present entries in the range. If a present
> > +		 * entry exists we will finish this with the largest index
> > +		 * occupied in the range which will serve as the start of the
> > +		 * new search
> > +		 */
> > +		index_max = xa_last;
> > +		xa_for_each_start(&ucontext->mmap_xa, index, ent,
> xa_first)
> 
> I think this can just be written as xas_next_entry() ?
> 
> And if it returns something we know the range xa_first -> xas.xa_index is not
> occupied, then check if it has the right size? Otherwise the range xa_first ->
> U32_MAX
Seems cleaner thanks, will take a look. 
> 
> 
> > +			if (index < xa_last)
> > +				index_max = index;
> > +			else
> > +				break;
> > +		if (index_max == xa_last) /* range is free */
> > +			break;
> > +		/* o/w start again from largest index found in range */
> > +		xas_set(&xas, index_max);
> > +	} while (true);
> > +
> > +	for (i = xa_first; i < xa_last; i++) {
> > +		err = __xa_insert(&ucontext->mmap_xa, i, entry,
> GFP_KERNEL);
> 
> Hum, keep in mind this is a bit tricky as the __xa_insert will drop the xa_lock
> lock to allocate and a parallel thread could jump into the gap
> 
> This seems undesirable, so we probably need to enclose the whole thing in a
> sleeping mutex. Can probably use the umap_lock
I didn't take this into account, thanks, will take a look.

> 
> > +void rdma_user_mmap_entries_remove_free(struct ib_ucontext
> *ucontext)
> > +{
> > +	struct rdma_user_mmap_entry *entry;
> > +	unsigned long mmap_page;
> > +
> > +	WARN_ON(!xa_empty(&ucontext->mmap_xa));
> > +	xa_for_each(&ucontext->mmap_xa, mmap_page, entry) {
> > +		ibdev_dbg(ucontext->device,
> > +			  "mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx]
> removed\n",
> > +			  entry->obj, rdma_user_mmap_get_key(entry),
> > +			  entry->address, entry->length);
> > +
> > +		/* override the refcnt to make sure entry is deleted */
> > +		kref_init(&entry->ref);
> 
> Yikes, no. The zap flow has to clean this up so the kref goes naturally to zero.
Ok thanks
> 
> > +		rdma_user_mmap_entry_put(ucontext, entry);
> > +	}
> > +}
> > +EXPORT_SYMBOL(rdma_user_mmap_entries_remove_free);
> > diff --git a/drivers/infiniband/core/rdma_core.c
> > b/drivers/infiniband/core/rdma_core.c
> > index ccf4d069c25c..7166741834c8 100644
> > +++ b/drivers/infiniband/core/rdma_core.c
> > @@ -817,6 +817,7 @@ static void ufile_destroy_ucontext(struct
> ib_uverbs_file *ufile,
> >  	rdma_restrack_del(&ucontext->res);
> >
> >  	ib_dev->ops.dealloc_ucontext(ucontext);
> > +	rdma_user_mmap_entries_remove_free(ucontext);
> >  	kfree(ucontext);
> >
> >  	ufile->ucontext = NULL;
> > diff --git a/drivers/infiniband/core/uverbs_cmd.c
> > b/drivers/infiniband/core/uverbs_cmd.c
> > index 7ddd0e5bc6b3..4903e6eee854 100644
> > +++ b/drivers/infiniband/core/uverbs_cmd.c
> > @@ -254,6 +254,7 @@ static int ib_uverbs_get_context(struct
> > uverbs_attr_bundle *attrs)
> >
> >  	mutex_init(&ucontext->per_mm_list_lock);
> >  	INIT_LIST_HEAD(&ucontext->per_mm_list);
> > +	xa_init_flags(&ucontext->mmap_xa, XA_FLAGS_ALLOC);
> >
> >  	ret = get_unused_fd_flags(O_CLOEXEC);
> >  	if (ret < 0)
> > diff --git a/drivers/infiniband/core/uverbs_main.c
> > b/drivers/infiniband/core/uverbs_main.c
> > index 180a5e0f70e4..80d0d3467d93 100644
> > +++ b/drivers/infiniband/core/uverbs_main.c
> > @@ -802,7 +802,7 @@ static void rdma_umap_open(struct
> vm_area_struct
> > *vma)  {
> >  	struct ib_uverbs_file *ufile = vma->vm_file->private_data;
> >  	struct rdma_umap_priv *opriv = vma->vm_private_data;
> > -	struct rdma_umap_priv *priv;
> > +	int ret;
> >
> >  	if (!opriv)
> >  		return;
> > @@ -816,10 +816,12 @@ static void rdma_umap_open(struct
> vm_area_struct *vma)
> >  	if (!ufile->ucontext)
> >  		goto out_unlock;
> >
> > -	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> > -	if (!priv)
> > +	if (opriv->entry)
> > +		kref_get(&opriv->entry->ref);
> > +
> > +	ret = rdma_umap_priv_init(vma, opriv->entry);
> > +	if (ret)
> >  		goto out_unlock;
> > -	rdma_umap_priv_init(priv, vma);
> >
> >  	up_read(&ufile->hw_destroy_rwsem);
> >  	return;
> > @@ -844,15 +846,15 @@ static void rdma_umap_close(struct
> vm_area_struct *vma)
> >  	if (!priv)
> >  		return;
> >
> > +	if (priv->entry)
> > +		rdma_user_mmap_entry_put(ufile->ucontext, priv->entry);
> > +
> >  	/*
> >  	 * The vma holds a reference on the struct file that created it, which
> >  	 * in turn means that the ib_uverbs_file is guaranteed to exist at
> >  	 * this point.
> >  	 */
> > -	mutex_lock(&ufile->umap_lock);
> > -	list_del(&priv->list);
> > -	mutex_unlock(&ufile->umap_lock);
> > -	kfree(priv);
> > +	rdma_umap_priv_delete(ufile, priv);
> >  }
> >
> >  /*
> > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
> > 391499008a22..b66c197a7079 100644
> > +++ b/include/rdma/ib_verbs.h
> > @@ -1479,6 +1479,7 @@ struct ib_ucontext {
> >  	 * Implementation details of the RDMA core, don't use in drivers:
> >  	 */
> >  	struct rdma_restrack_entry res;
> > +	struct xarray mmap_xa;
> >  };
> >
> >  struct ib_uobject {
> > @@ -2259,6 +2260,19 @@ struct iw_cm_conn_param;
> >
> >  #define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct
> >
> > +#define RDMA_USER_MMAP_FLAG_SHIFT 56
> > +#define RDMA_USER_MMAP_PAGE_MASK
> GENMASK(EFA_MMAP_FLAG_SHIFT - 1, 0)
> 
> Why is something called EFA_MMAP_FLAGS_SHIFT in ib_verbs.h?
I have nothing to say in my defense. Will remove. 
> 
> Jason




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux