Re: [PATCH for-next v1 06/12] SIW application buffer management

On Sun, May 26, 2019 at 01:41:50PM +0200, Bernard Metzler wrote:
> Signed-off-by: Bernard Metzler <bmt@xxxxxxxxxxxxxx>
> ---
>  drivers/infiniband/sw/siw/siw_mem.c | 462 ++++++++++++++++++++++++++++
>  drivers/infiniband/sw/siw/siw_mem.h |  74 +++++
>  2 files changed, 536 insertions(+)
>  create mode 100644 drivers/infiniband/sw/siw/siw_mem.c
>  create mode 100644 drivers/infiniband/sw/siw/siw_mem.h
>
> diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
> new file mode 100644
> index 000000000000..e2961e6be1d9
> --- /dev/null
> +++ b/drivers/infiniband/sw/siw/siw_mem.c
> @@ -0,0 +1,462 @@
> +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
> +
> +/* Authors: Bernard Metzler <bmt@xxxxxxxxxxxxxx> */
> +/* Copyright (c) 2008-2019, IBM Corporation */
> +
> +#include <linux/version.h>
> +#include <linux/scatterlist.h>
> +#include <linux/gfp.h>
> +#include <rdma/ib_verbs.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/slab.h>
> +#include <linux/pid.h>
> +#include <linux/sched/mm.h>
> +
> +#include "siw.h"
> +#include "siw_debug.h"
> +#include "siw_mem.h"
> +
> +/*
> + * STag lookup is based on its index part only (24 bits).
> + * The code avoids the special STag of zero and randomizes
> + * STag values between 1 and SIW_STAG_MAX_INDEX.
> + */
> +int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
> +{
> +	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
> +	u32 id, next;
> +
> +	get_random_bytes(&next, 4);
> +	next &= 0x00ffffff;
> +
> +	if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
> +	    GFP_KERNEL) < 0)
> +		return -ENOMEM;
> +
> +	/* Set the STag index part */
> +	m->stag = id << 8;
> +
> +	siw_dbg_mem(m, "new MEM object\n");
> +
> +	return 0;
> +}
> +
> +/*
> + * siw_mem_id2obj()
> + *
> + * Resolves memory from STag index. May be called from:
> + * o process context before sending out of an SGL, or
> + * o softirq context when resolving target memory
> + */
> +struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
> +{
> +	struct siw_mem *mem;
> +
> +	rcu_read_lock();
> +	mem = xa_load(&sdev->mem_xa, stag_index);
> +	if (likely(mem && kref_get_unless_zero(&mem->ref))) {
> +		rcu_read_unlock();
> +		return mem;
> +	}
> +	rcu_read_unlock();
> +
> +	return NULL;
> +}
> +
> +static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
> +			   bool dirty)
> +{
> +	struct page **p = chunk->p;
> +
> +	while (num_pages--) {
> +		if (!PageDirty(*p) && dirty)
> +			set_page_dirty_lock(*p);
> +		put_page(*p);
> +		p++;
> +	}
> +}
> +
> +void siw_umem_release(struct siw_umem *umem, bool dirty)
> +{
> +	struct mm_struct *mm_s = umem->owning_mm;
> +	int i, num_pages = umem->num_pages;
> +
> +	for (i = 0; num_pages; i++) {
> +		int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);
> +
> +		siw_free_plist(&umem->page_chunk[i], to_free,
> +			       umem->writable && dirty);
> +		kfree(umem->page_chunk[i].p);
> +		num_pages -= to_free;
> +	}
> +	atomic64_sub(umem->num_pages, &mm_s->pinned_vm);
> +
> +	mmdrop(mm_s);
> +	kfree(umem->page_chunk);
> +	kfree(umem);
> +}
> +
> +int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
> +		   u64 start, u64 len, int rights)
> +{
> +	struct siw_device *sdev = to_siw_dev(pd->device);
> +	struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
> +	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
> +	u32 id, next;
> +
> +	if (!mem)
> +		return -ENOMEM;
> +
> +	mem->mem_obj = mem_obj;
> +	mem->stag_valid = 0;
> +	mem->sdev = sdev;
> +	mem->va = start;
> +	mem->len = len;
> +	mem->pd = pd;
> +	mem->perms = rights & IWARP_ACCESS_MASK;
> +	kref_init(&mem->ref);
> +
> +	mr->mem = mem;
> +
> +	get_random_bytes(&next, 4);
> +	next &= 0x00ffffff;
> +
> +	if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
> +	    GFP_KERNEL) < 0) {
> +		kfree(mem);
> +		return -ENOMEM;
> +	}
> +	/* Set the STag index part */
> +	mem->stag = id << 8;
> +	mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;
> +
> +	return 0;
> +}
> +
> +void siw_mr_drop_mem(struct siw_mr *mr)
> +{
> +	struct siw_mem *mem = mr->mem, *found;
> +
> +	mem->stag_valid = 0;
> +
> +	/* make STag invalid visible asap */
> +	smp_mb();
> +
> +	found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
> +	WARN_ON(found != mem);
> +	siw_mem_put(mem);
> +}
> +
> +void siw_free_mem(struct kref *ref)
> +{
> +	struct siw_mem *mem = container_of(ref, struct siw_mem, ref);
> +
> +	siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");
> +
> +	if (!mem->is_mw && mem->mem_obj) {
> +		if (mem->is_pbl == 0)
> +			siw_umem_release(mem->umem, true);
> +		else
> +			kfree(mem->pbl);
> +	}
> +	kfree(mem);
> +}
> +
> +/*
> + * siw_check_mem()
> + *
> + * Check protection domain, STag state, access permissions and
> + * address range for memory object.
> + *
> + * @pd:		Protection Domain memory should belong to
> + * @mem:	memory to be checked
> + * @addr:	starting addr of mem
> + * @perms:	requested access permissions
> + * @len:	len of memory interval to be checked
> + *
> + */
> +int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
> +		  enum ib_access_flags perms, int len)
> +{
> +	if (!mem->stag_valid) {
> +		siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
> +		return -E_STAG_INVALID;
> +	}
> +	if (mem->pd != pd) {
> +		siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
> +		return -E_PD_MISMATCH;
> +	}
> +	/*
> +	 * check access permissions
> +	 */
> +	if ((mem->perms & perms) < perms) {
> +		siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
> +			   mem->perms, perms);
> +		return -E_ACCESS_PERM;
> +	}
> +	/*
> +	 * Check if access falls into valid memory interval.
> +	 */
> +	if (addr < mem->va || addr + len > mem->va + mem->len) {
> +		siw_dbg_pd(pd, "MEM interval len %d\n", len);
> +		siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] out of bounds\n",
> +			   (unsigned long long)addr,
> +			   (unsigned long long)(addr + len));
> +		siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] STag=0x%08x\n",
> +			   (unsigned long long)mem->va,
> +			   (unsigned long long)(mem->va + mem->len),
> +			   mem->stag);
> +
> +		return -E_BASE_BOUNDS;
> +	}
> +	return E_ACCESS_OK;
> +}
> +
> +/*
> + * siw_check_sge()
> + *
> + * Check SGE for access rights in given interval
> + *
> + * @pd:		Protection Domain memory should belong to
> + * @sge:	SGE to be checked
> + * @mem:	location of memory reference within array
> + * @perms:	requested access permissions
> + * @off:	starting offset in SGE
> + * @len:	len of memory interval to be checked
> + *
> + * NOTE: Function references SGE's memory object (mem->obj)
> + * if not yet done. The new reference is kept if the check went ok,
> + * and released if it failed. If mem->obj is already valid, no new
> + * lookup is done and mem is not released if the check fails.
> + */
> +int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
> +		  enum ib_access_flags perms, u32 off, int len)
> +{
> +	struct siw_device *sdev = to_siw_dev(pd->device);
> +	struct siw_mem *new = NULL;
> +	int rv = E_ACCESS_OK;
> +
> +	if (len + off > sge->length) {
> +		rv = -E_BASE_BOUNDS;
> +		goto fail;
> +	}
> +	if (*mem == NULL) {
> +		new = siw_mem_id2obj(sdev, sge->lkey >> 8);
> +		if (unlikely(!new)) {
> +			siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
> +			rv = -E_STAG_INVALID;
> +			goto fail;
> +		}
> +		*mem = new;
> +	}
> +	/* Check if user re-registered with different STag key */
> +	if (unlikely((*mem)->stag != sge->lkey)) {
> +		siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
> +		rv = -E_STAG_INVALID;
> +		goto fail;
> +	}
> +	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
> +	if (unlikely(rv))
> +		goto fail;
> +
> +	return 0;
> +
> +fail:
> +	if (new) {
> +		*mem = NULL;
> +		siw_mem_put(new);
> +	}
> +	return rv;
> +}
> +
> +void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
> +{
> +	switch (op) {
> +	case SIW_OP_SEND:
> +	case SIW_OP_WRITE:
> +	case SIW_OP_SEND_WITH_IMM:
> +	case SIW_OP_SEND_REMOTE_INV:
> +	case SIW_OP_READ:
> +	case SIW_OP_READ_LOCAL_INV:
> +		if (!(wqe->sqe.flags & SIW_WQE_INLINE))
> +			siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
> +		break;
> +
> +	case SIW_OP_RECEIVE:
> +		siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
> +		break;
> +
> +	case SIW_OP_READ_RESPONSE:
> +		siw_unref_mem_sgl(wqe->mem, 1);
> +		break;
> +
> +	default:
> +		/*
> +		 * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
> +		 * do not hold memory references
> +		 */
> +		break;
> +	}
> +}
> +
> +int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
> +{
> +	struct siw_device *sdev = to_siw_dev(pd->device);
> +	struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8);
> +	int rv = 0;
> +
> +	if (unlikely(!mem)) {
> +		siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
> +		return -EINVAL;
> +	}
> +	if (unlikely(mem->pd != pd)) {
> +		siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
> +		rv = -EACCES;
> +		goto out;
> +	}
> +	/*
> +	 * Per RDMA verbs definition, an STag may already be in invalid
> +	 * state if invalidation is requested. So no state check here.
> +	 */
> +	mem->stag_valid = 0;
> +
> +	siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
> +out:
> +	siw_mem_put(mem);
> +	return rv;
> +}
> +
> +/*
> + * Gets physical address backed by PBL element. Address is referenced
> + * by linear byte offset into list of variably sized PB elements.
> + * Optionally, provides remaining len within current element, and
> + * current PBL index for later resume at same element.
> + */
> +u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
> +{
> +	int i = idx ? *idx : 0;
> +
> +	while (i < pbl->num_buf) {
> +		struct siw_pble *pble = &pbl->pbe[i];
> +
> +		if (pble->pbl_off + pble->size > off) {
> +			u64 pble_off = off - pble->pbl_off;
> +
> +			if (len)
> +				*len = pble->size - pble_off;
> +			if (idx)
> +				*idx = i;
> +
> +			return pble->addr + pble_off;
> +		}
> +		i++;
> +	}
> +	if (len)
> +		*len = 0;
> +	return 0;
> +}
> +
> +struct siw_pbl *siw_pbl_alloc(u32 num_buf)
> +{
> +	struct siw_pbl *pbl;
> +	int buf_size = sizeof(*pbl);
> +
> +	if (num_buf == 0)
> +		return ERR_PTR(-EINVAL);
> +
> +	buf_size += ((num_buf - 1) * sizeof(struct siw_pble));
> +
> +	pbl = kzalloc(buf_size, GFP_KERNEL);
> +	if (!pbl)
> +		return ERR_PTR(-ENOMEM);
> +
> +	pbl->max_buf = num_buf;
> +
> +	return pbl;
> +}
> +
> +struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
> +{
> +	struct siw_umem *umem;
> +	struct mm_struct *mm_s;
> +	u64 first_page_va;
> +	unsigned long mlock_limit;
> +	unsigned int foll_flags = FOLL_WRITE;
> +	int num_pages, num_chunks, i, rv = 0;
> +
> +	if (!can_do_mlock())
> +		return ERR_PTR(-EPERM);
> +
> +	if (!len)
> +		return ERR_PTR(-EINVAL);
> +
> +	first_page_va = start & PAGE_MASK;
> +	num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
> +	num_chunks = (num_pages >> CHUNK_SHIFT) + 1;
> +
> +	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
> +	if (!umem)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mm_s = current->mm;
> +	umem->owning_mm = mm_s;
> +	umem->writable = writable;
> +
> +	mmgrab(mm_s);
> +
> +	if (!writable)
> +		foll_flags |= FOLL_FORCE;
> +
> +	down_read(&mm_s->mmap_sem);
> +
> +	mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +	if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
> +		rv = -ENOMEM;
> +		goto out_sem_up;
> +	}
> +	umem->fp_addr = first_page_va;
> +
> +	umem->page_chunk =
> +		kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
> +	if (!umem->page_chunk) {
> +		rv = -ENOMEM;
> +		goto out_sem_up;
> +	}
> +	for (i = 0; num_pages; i++) {
> +		int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK);
> +
> +		umem->page_chunk[i].p =
> +			kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
> +		if (!umem->page_chunk[i].p) {
> +			rv = -ENOMEM;
> +			goto out_sem_up;
> +		}
> +		got = 0;
> +		while (nents) {
> +			struct page **plist = &umem->page_chunk[i].p[got];
> +
> +			rv = get_user_pages(first_page_va, nents,
> +					    foll_flags | FOLL_LONGTERM,
> +					    plist, NULL);
> +			if (rv < 0)
> +				goto out_sem_up;
> +
> +			umem->num_pages += rv;
> +			atomic64_add(rv, &mm_s->pinned_vm);
> +			first_page_va += rv * PAGE_SIZE;
> +			nents -= rv;
> +			got += rv;
> +		}
> +		num_pages -= got;
> +	}
> +out_sem_up:
> +	up_read(&mm_s->mmap_sem);
> +
> +	if (rv > 0)
> +		return umem;
> +
> +	siw_umem_release(umem, false);
> +
> +	return ERR_PTR(rv);
> +}

No, please use ib_umem_get().
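
i.e. drop the open-coded page pinning, chunking and pinned_vm
accounting and let the core umem code do it. A rough, untested
sketch (assuming the udata-based ib_umem_get() signature in the
current tree, plus <rdma/ib_umem.h>; siw_user_mem_get is a
hypothetical name):

	/* hypothetical replacement for siw_umem_get() */
	static struct ib_umem *siw_user_mem_get(struct ib_udata *udata,
						u64 start, u64 len,
						bool writable)
	{
		/*
		 * ib_umem_get() pins the pages, checks RLIMIT_MEMLOCK
		 * and does the pinned_vm accounting that this patch
		 * open-codes; the last argument is dmasync, unused here.
		 */
		return ib_umem_get(udata, start, len,
				   writable ? IB_ACCESS_LOCAL_WRITE : 0, 0);
	}

siw_umem_release() then collapses to ib_umem_release().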

Thanks


