There is absolutely nothing IB-specific here. If you want to support
anonymous mmaps to allocate large contiguous pages, work with the MM
folks on providing that in a generic fashion.

[Full quote of the original patch kept below for reference:]

On Tue, Dec 08, 2015 at 05:15:06PM +0200, Yishai Hadas wrote:
> New structure 'cmem' represents the contiguous allocated memory.
> It supports:
> Allocate, Free, 'Map to virtual address' operations, etc.
>
> Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxxxx>
> ---
>  drivers/infiniband/core/Makefile |   2 +-
>  drivers/infiniband/core/cmem.c   | 245 +++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_cmem.h           |  41 +++++++
>  3 files changed, 287 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/infiniband/core/cmem.c
>  create mode 100644 include/rdma/ib_cmem.h
>
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index d43a899..8549ea4 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
>  ib_core-y :=            packer.o ud_header.o verbs.o sysfs.o \
>                          device.o fmr_pool.o cache.o netlink.o \
>                          roce_gid_mgmt.o
> -ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> +ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o cmem.o
>  ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
>
>  ib_mad-y :=             mad.o smi.o agent.o mad_rmpp.o
> diff --git a/drivers/infiniband/core/cmem.c b/drivers/infiniband/core/cmem.c
> new file mode 100644
> index 0000000..21d8573
> --- /dev/null
> +++ b/drivers/infiniband/core/cmem.c
> @@ -0,0 +1,245 @@
> +#include <linux/mm.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/sched.h>
> +#include <linux/export.h>
> +#include <linux/dma-attrs.h>
> +#include <linux/slab.h>
> +#include <rdma/ib_cmem.h>
> +#include "uverbs.h"
> +
> +static void ib_cmem_release(struct kref *ref)
> +{
> +        struct ib_cmem *cmem;
> +        struct ib_cmem_block *cmem_block, *tmp;
> +        unsigned long ntotal_pages;
> +
> +        cmem = container_of(ref, struct ib_cmem, refcount);
> +
> +        list_for_each_entry_safe(cmem_block, tmp, &cmem->ib_cmem_block, list) {
> +                __free_pages(cmem_block->page, cmem->block_order);
> +                list_del(&cmem_block->list);
> +                kfree(cmem_block);
> +        }
> +        /* no locking is needed:
> +         * ib_cmem_release is called from vm_close which is always called
> +         * with mm->mmap_sem held for writing.
> +         * The only exception is when the process shutting down but in that case
> +         * counter not relevant any more.
> +         */
> +        if (current->mm) {
> +                ntotal_pages = PAGE_ALIGN(cmem->length) >> PAGE_SHIFT;
> +                current->mm->pinned_vm -= ntotal_pages;
> +        }
> +        kfree(cmem);
> +}
> +
> +/**
> + * ib_cmem_release_contiguous_pages - release memory allocated by
> + * ib_cmem_alloc_contiguous_pages.
> + * @cmem: cmem struct to release
> + */
> +void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
> +{
> +        kref_put(&cmem->refcount, ib_cmem_release);
> +}
> +EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);
> +
> +static void cmem_vma_open(struct vm_area_struct *area)
> +{
> +        struct ib_cmem *ib_cmem;
> +
> +        ib_cmem = (struct ib_cmem *)(area->vm_private_data);
> +
> +        /* vm_open and vm_close are always called with mm->mmap_sem held for
> +         * writing. The only exception is when the process is shutting down, at
> +         * which point vm_close is called with no locks held, but since it is
> +         * after the VMAs have been detached, it is impossible that vm_open will
> +         * be called.
> +         * Therefore, there is no need to synchronize the kref_get and
> +         * kref_put calls.
> +         */
> +        kref_get(&ib_cmem->refcount);
> +}
> +
> +static void cmem_vma_close(struct vm_area_struct *area)
> +{
> +        struct ib_cmem *cmem;
> +
> +        cmem = (struct ib_cmem *)(area->vm_private_data);
> +
> +        ib_cmem_release_contiguous_pages(cmem);
> +}
> +
> +static const struct vm_operations_struct cmem_contig_pages_vm_ops = {
> +        .open = cmem_vma_open,
> +        .close = cmem_vma_close
> +};
> +
> +/**
> + * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
> + * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
> + * @vma: VMA to inject pages into.
> + */
> +int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
> +                                        struct vm_area_struct *vma)
> +{
> +        int ret;
> +        unsigned long page_entry;
> +        unsigned long ntotal_pages;
> +        unsigned long ncontig_pages;
> +        unsigned long total_size;
> +        struct page *page;
> +        unsigned long vma_entry_number = 0;
> +        struct ib_cmem_block *ib_cmem_block = NULL;
> +
> +        total_size = vma->vm_end - vma->vm_start;
> +        if (ib_cmem->length != total_size)
> +                return -EINVAL;
> +
> +        if (total_size != PAGE_ALIGN(total_size)) {
> +                WARN(1,
> +                     "ib_cmem_map: total size %lu not aligned to page size\n",
> +                     total_size);
> +                return -EINVAL;
> +        }
> +
> +        ntotal_pages = total_size >> PAGE_SHIFT;
> +        ncontig_pages = 1 << ib_cmem->block_order;
> +
> +        list_for_each_entry(ib_cmem_block, &ib_cmem->ib_cmem_block, list) {
> +                page = ib_cmem_block->page;
> +                for (page_entry = 0; page_entry < ncontig_pages; page_entry++) {
> +                        /* We reached end of vma - going out from both loops */
> +                        if (vma_entry_number >= ntotal_pages)
> +                                goto end;
> +
> +                        ret = vm_insert_page(vma, vma->vm_start +
> +                                (vma_entry_number << PAGE_SHIFT), page);
> +                        if (ret < 0)
> +                                goto err_vm_insert;
> +
> +                        vma_entry_number++;
> +                        page++;
> +                }
> +        }
> +
> +end:
> +
> +        /* We expect to have enough pages */
> +        if (vma_entry_number >= ntotal_pages) {
> +                vma->vm_ops = &cmem_contig_pages_vm_ops;
> +                vma->vm_private_data = ib_cmem;
> +                return 0;
> +        }
> +        /* Not expected but if we reached here
> +         * not enough contiguous pages were registered
> +         */
> +        ret = -EINVAL;
> +
> +err_vm_insert:
> +
> +        zap_vma_ptes(vma, vma->vm_start, total_size);
> +        return ret;
> +}
> +EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);
> +
> +/**
> + * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
> + * @context: userspace context to allocate memory for
> + * @total_size: total required size for that allocation.
> + * @page_size_order: order of one contiguous page.
> + * @numa_nude: From which numa node to allocate memory
> + * when numa_nude < 0 use default numa_nude.
> + */
> +struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
> +                                               unsigned long total_size,
> +                                               unsigned long page_size_order,
> +                                               int numa_node)
> +{
> +        struct ib_cmem *cmem;
> +        unsigned long ntotal_pages;
> +        unsigned long ncontiguous_pages;
> +        unsigned long ncontiguous_groups;
> +        struct page *page;
> +        int i;
> +        int ncontiguous_pages_order;
> +        struct ib_cmem_block *ib_cmem_block;
> +        unsigned long locked;
> +        unsigned long lock_limit;
> +
> +        if (page_size_order < PAGE_SHIFT || page_size_order > 31)
> +                return ERR_PTR(-EINVAL);
> +
> +        cmem = kzalloc(sizeof(*cmem), GFP_KERNEL);
> +        if (!cmem)
> +                return ERR_PTR(-ENOMEM);
> +
> +        kref_init(&cmem->refcount);
> +        cmem->context = context;
> +        INIT_LIST_HEAD(&cmem->ib_cmem_block);
> +
> +        /* Total size is expected to be already page aligned -
> +         * verifying anyway.
> +         */
> +        ntotal_pages = PAGE_ALIGN(total_size) >> PAGE_SHIFT;
> +        /* ib_cmem_alloc_contiguous_pages is called as part of mmap
> +         * with mm->mmap_sem held for writing.
> +         * No need to lock
> +         */
> +        locked = ntotal_pages + current->mm->pinned_vm;
> +        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +        if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
> +                goto err_alloc;
> +
> +        /* How many contiguous pages do we need in 1 block */
> +        ncontiguous_pages = (1 << page_size_order) >> PAGE_SHIFT;
> +        ncontiguous_pages_order = ilog2(ncontiguous_pages);
> +        ncontiguous_groups = (ntotal_pages >> ncontiguous_pages_order) +
> +                (!!(ntotal_pages & (ncontiguous_pages - 1)));
> +
> +        /* Checking MAX_ORDER to prevent WARN via calling alloc_pages below */
> +        if (ncontiguous_pages_order >= MAX_ORDER)
> +                goto err_alloc;
> +        /* we set block_order before starting allocation to prevent
> +         * a leak in a failure flow in ib_cmem_release.
> +         * cmem->length has at that step value 0 from kzalloc as expected
> +         */
> +        cmem->block_order = ncontiguous_pages_order;
> +        for (i = 0; i < ncontiguous_groups; i++) {
> +                /* Allocating the managed entry */
> +                ib_cmem_block = kmalloc(sizeof(*ib_cmem_block),
> +                                        GFP_KERNEL);
> +                if (!ib_cmem_block)
> +                        goto err_alloc;
> +
> +                if (numa_node < 0)
> +                        page = alloc_pages(GFP_HIGHUSER | __GFP_ZERO |
> +                                           __GFP_COMP | __GFP_NOWARN,
> +                                           ncontiguous_pages_order);
> +                else
> +                        page = alloc_pages_node(numa_node,
> +                                                GFP_HIGHUSER | __GFP_ZERO |
> +                                                __GFP_COMP | __GFP_NOWARN,
> +                                                ncontiguous_pages_order);
> +
> +                if (!page) {
> +                        kfree(ib_cmem_block);
> +                        /* We should deallocate previous succeeded allocatations
> +                         * if exists.
> +                         */
> +                        goto err_alloc;
> +                }
> +
> +                ib_cmem_block->page = page;
> +                list_add_tail(&ib_cmem_block->list, &cmem->ib_cmem_block);
> +        }
> +
> +        cmem->length = total_size;
> +        current->mm->pinned_vm = locked;
> +        return cmem;
> +
> +err_alloc:
> +        ib_cmem_release_contiguous_pages(cmem);
> +        return ERR_PTR(-ENOMEM);
> +}
> +EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);
> diff --git a/include/rdma/ib_cmem.h b/include/rdma/ib_cmem.h
> new file mode 100644
> index 0000000..5f26a49
> --- /dev/null
> +++ b/include/rdma/ib_cmem.h
> @@ -0,0 +1,41 @@
> +#ifndef IB_CMEM_H
> +#define IB_CMEM_H
> +
> +#include <rdma/ib_umem.h>
> +#include <rdma/ib_verbs.h>
> +
> +/* contiguous memory structure */
> +struct ib_cmem {
> +        struct ib_ucontext *context;
> +        size_t length;
> +        /* Link list of contiguous blocks being part of that cmem */
> +        struct list_head ib_cmem_block;
> +
> +        /* Order of cmem block, 2^ block_order will equal number
> +         * of physical pages per block
> +         */
> +        unsigned long block_order;
> +        /* Refernce counter for that memory area
> +         * When value became 0 pages will be returned to the kernel.
> +         */
> +        struct kref refcount;
> +};
> +
> +struct ib_cmem_block {
> +        struct list_head list;
> +        /* page will point to the page struct of the head page
> +         * in the current compound page.
> +         * block order is saved once as part of ib_cmem.
> +         */
> +        struct page *page;
> +};
> +
> +int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
> +                                        struct vm_area_struct *vma);
> +struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
> +                                               unsigned long total_size,
> +                                               unsigned long page_size_order,
> +                                               int numa_node);
> +void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem);
> +
> +#endif
> --
> 1.8.3.1

---end quoted text---
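
To make the point at the top concrete: everything the quoted code does with
the VMA is plain MM machinery, with nothing InfiniBand-specific in it. Below
is a condensed, hedged sketch of the same allocate-and-map flow as a generic
character-device mmap handler. The name contig_mmap and the single fixed-order
allocation are made up for illustration, and error handling is abbreviated;
this is not the patch's implementation, just the same technique stripped of
the IB types.

#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/mm.h>

/* Hypothetical file_operations->mmap handler: back the whole VMA with one
 * physically contiguous compound allocation and insert it page by page,
 * which is what the patch does inside the IB core.
 */
static int contig_mmap(struct file *filp, struct vm_area_struct *vma)
{
        unsigned long size = vma->vm_end - vma->vm_start;
        unsigned int order = get_order(size);
        struct page *page;
        unsigned long i;
        int ret;

        if (order >= MAX_ORDER)         /* avoid the alloc_pages() warning */
                return -EINVAL;

        page = alloc_pages(GFP_HIGHUSER | __GFP_ZERO | __GFP_COMP |
                           __GFP_NOWARN, order);
        if (!page)
                return -ENOMEM;

        for (i = 0; i < (size >> PAGE_SHIFT); i++) {
                ret = vm_insert_page(vma, vma->vm_start + (i << PAGE_SHIFT),
                                     page + i);
                if (ret) {
                        zap_vma_ptes(vma, vma->vm_start, size);
                        __free_pages(page, order);
                        return ret;
                }
        }
        /* A real version would also need vm_operations open/close to
         * refcount the allocation and charge mm->pinned_vm, as the patch
         * does.
         */
        return 0;
}

Since none of this touches ib_ucontext or any other IB object, it is exactly
the kind of thing that could live in generic MM code rather than in
drivers/infiniband.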
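
On the "anonymous mmap" side, the closest generic facility that exists today
is hugetlb (or THP). It is not a drop-in replacement for the arbitrary-order,
NUMA-aware allocations in the patch, but a minimal userspace sketch, assuming
the administrator has reserved huge pages (e.g. via
/proc/sys/vm/nr_hugepages), shows the shape of an MM-owned interface:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define LEN (2UL * 1024 * 1024)         /* one 2 MiB huge page */

int main(void)
{
        /* Anonymous mapping backed by the hugetlb pool: physically
         * contiguous within each huge page, no driver code involved.
         */
        void *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap(MAP_HUGETLB)");
                return 1;
        }
        memset(p, 0, LEN);              /* touch it to fault the huge page in */
        munmap(p, LEN);
        return 0;
}

Anything beyond that, such as arbitrary block orders or explicit NUMA
placement as in the patch, is the part that would need to be designed together
with the MM folks.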