On Sun, May 26, 2019 at 01:41:50PM +0200, Bernard Metzler wrote:
> Signed-off-by: Bernard Metzler <bmt@xxxxxxxxxxxxxx>
> ---
>  drivers/infiniband/sw/siw/siw_mem.c | 462 ++++++++++++++++++++++++++++
>  drivers/infiniband/sw/siw/siw_mem.h |  74 +++++
>  2 files changed, 536 insertions(+)
>  create mode 100644 drivers/infiniband/sw/siw/siw_mem.c
>  create mode 100644 drivers/infiniband/sw/siw/siw_mem.h
>
> diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
> new file mode 100644
> index 000000000000..e2961e6be1d9
> --- /dev/null
> +++ b/drivers/infiniband/sw/siw/siw_mem.c
> @@ -0,0 +1,462 @@
> +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
> +
> +/* Authors: Bernard Metzler <bmt@xxxxxxxxxxxxxx> */
> +/* Copyright (c) 2008-2019, IBM Corporation */
> +
> +#include <linux/version.h>
> +#include <linux/scatterlist.h>
> +#include <linux/gfp.h>
> +#include <rdma/ib_verbs.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/slab.h>
> +#include <linux/pid.h>
> +#include <linux/sched/mm.h>
> +
> +#include "siw.h"
> +#include "siw_debug.h"
> +#include "siw_mem.h"
> +
> +/*
> + * Stag lookup is based on its index part only (24 bits).
> + * The code avoids special Stag of zero and tries to randomize
> + * STag values between 1 and SIW_STAG_MAX_INDEX.
> + */
> +int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
> +{
> +	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
> +	u32 id, next;
> +
> +	get_random_bytes(&next, 4);
> +	next &= 0x00ffffff;
> +
> +	if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
> +			    GFP_KERNEL) < 0)
> +		return -ENOMEM;
> +
> +	/* Set the STag index part */
> +	m->stag = id << 8;
> +
> +	siw_dbg_mem(m, "new MEM object\n");
> +
> +	return 0;
> +}
> +
> +/*
> + * siw_mem_id2obj()
> + *
> + * resolves memory from stag given by id. might be called from:
> + * o process context before sending out of sgl, or
> + * o in softirq when resolving target memory
> + */
> +struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
> +{
> +	struct siw_mem *mem;
> +
> +	rcu_read_lock();
> +	mem = xa_load(&sdev->mem_xa, stag_index);
> +	if (likely(mem && kref_get_unless_zero(&mem->ref))) {
> +		rcu_read_unlock();
> +		return mem;
> +	}
> +	rcu_read_unlock();
> +
> +	return NULL;
> +}
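
For readers following along: the STag layout used throughout this file is a
24-bit xarray index in the high bits plus an 8-bit key in the low byte.
Conceptually (illustration only, these helpers are not part of the patch):

	/* illustration: STag = (24-bit index << 8) | 8-bit key */
	static inline u32 stag_make(u32 index, u8 key)
	{
		return (index << 8) | key;
	}

	static inline u32 stag_index(u32 stag)
	{
		return stag >> 8;	/* what xa_load() above is keyed by */
	}
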
> +static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
> +			   bool dirty)
> +{
> +	struct page **p = chunk->p;
> +
> +	while (num_pages--) {
> +		if (!PageDirty(*p) && dirty)
> +			set_page_dirty_lock(*p);
> +		put_page(*p);
> +		p++;
> +	}
> +}
> +
> +void siw_umem_release(struct siw_umem *umem, bool dirty)
> +{
> +	struct mm_struct *mm_s = umem->owning_mm;
> +	int i, num_pages = umem->num_pages;
> +
> +	for (i = 0; num_pages; i++) {
> +		int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);
> +
> +		siw_free_plist(&umem->page_chunk[i], to_free,
> +			       umem->writable && dirty);
> +		kfree(umem->page_chunk[i].p);
> +		num_pages -= to_free;
> +	}
> +	atomic64_sub(umem->num_pages, &mm_s->pinned_vm);
> +
> +	mmdrop(mm_s);
> +	kfree(umem->page_chunk);
> +	kfree(umem);
> +}
> +
> +int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
> +		   u64 start, u64 len, int rights)
> +{
> +	struct siw_device *sdev = to_siw_dev(pd->device);
> +	struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
> +	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
> +	u32 id, next;
> +
> +	if (!mem)
> +		return -ENOMEM;
> +
> +	mem->mem_obj = mem_obj;
> +	mem->stag_valid = 0;
> +	mem->sdev = sdev;
> +	mem->va = start;
> +	mem->len = len;
> +	mem->pd = pd;
> +	mem->perms = rights & IWARP_ACCESS_MASK;
> +	kref_init(&mem->ref);
> +
> +	mr->mem = mem;
> +
> +	get_random_bytes(&next, 4);
> +	next &= 0x00ffffff;
> +
> +	if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
> +			    GFP_KERNEL) < 0) {
> +		kfree(mem);
> +		return -ENOMEM;
> +	}
> +	/* Set the STag index part */
> +	mem->stag = id << 8;
> +	mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;
> +
> +	return 0;
> +}
> +
> +void siw_mr_drop_mem(struct siw_mr *mr)
> +{
> +	struct siw_mem *mem = mr->mem, *found;
> +
> +	mem->stag_valid = 0;
> +
> +	/* make STag invalid visible asap */
> +	smp_mb();
> +
> +	found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
> +	WARN_ON(found != mem);
> +	siw_mem_put(mem);
> +}
> +
> +void siw_free_mem(struct kref *ref)
> +{
> +	struct siw_mem *mem = container_of(ref, struct siw_mem, ref);
> +
> +	siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");
> +
> +	if (!mem->is_mw && mem->mem_obj) {
> +		if (mem->is_pbl == 0)
> +			siw_umem_release(mem->umem, true);
> +		else
> +			kfree(mem->pbl);
> +	}
> +	kfree(mem);
> +}
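
siw_mem_put() itself is not in this hunk; from its use above and the
kref_get_unless_zero() in siw_mem_id2obj(), it is presumably a thin kref
wrapper in siw_mem.h along these lines (sketch, not the actual definition):

	/* sketch: drop a reference taken by siw_mem_id2obj() */
	static inline void siw_mem_put(struct siw_mem *mem)
	{
		kref_put(&mem->ref, siw_free_mem);
	}
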
> +/*
> + * siw_check_mem()
> + *
> + * Check protection domain, STAG state, access permissions and
> + * address range for memory object.
> + *
> + * @pd:		Protection Domain memory should belong to
> + * @mem:	memory to be checked
> + * @addr:	starting addr of mem
> + * @perms:	requested access permissions
> + * @len:	len of memory interval to be checked
> + *
> + */
> +int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
> +		  enum ib_access_flags perms, int len)
> +{
> +	if (!mem->stag_valid) {
> +		siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
> +		return -E_STAG_INVALID;
> +	}
> +	if (mem->pd != pd) {
> +		siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
> +		return -E_PD_MISMATCH;
> +	}
> +	/*
> +	 * check access permissions
> +	 */
> +	if ((mem->perms & perms) < perms) {
> +		siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
> +			   mem->perms, perms);
> +		return -E_ACCESS_PERM;
> +	}
> +	/*
> +	 * Check if access falls into valid memory interval.
> +	 */
> +	if (addr < mem->va || addr + len > mem->va + mem->len) {
> +		siw_dbg_pd(pd, "MEM interval len %d\n", len);
> +		siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] out of bounds\n",
> +			   (unsigned long long)addr,
> +			   (unsigned long long)(addr + len));
> +		siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] STag=0x%08x\n",
> +			   (unsigned long long)mem->va,
> +			   (unsigned long long)(mem->va + mem->len),
> +			   mem->stag);
> +
> +		return -E_BASE_BOUNDS;
> +	}
> +	return E_ACCESS_OK;
> +}
> +
> +/*
> + * siw_check_sge()
> + *
> + * Check SGE for access rights in given interval
> + *
> + * @pd:		Protection Domain memory should belong to
> + * @sge:	SGE to be checked
> + * @mem:	location of memory reference within array
> + * @perms:	requested access permissions
> + * @off:	starting offset in SGE
> + * @len:	len of memory interval to be checked
> + *
> + * NOTE: Function references SGE's memory object (*mem)
> + * if not yet done. New reference is kept if check went ok and
> + * released if check failed. If *mem is already valid, no new
> + * lookup is being done and mem is not released if check fails.
> + */
> +int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
> +		  enum ib_access_flags perms, u32 off, int len)
> +{
> +	struct siw_device *sdev = to_siw_dev(pd->device);
> +	struct siw_mem *new = NULL;
> +	int rv = E_ACCESS_OK;
> +
> +	if (len + off > sge->length) {
> +		rv = -E_BASE_BOUNDS;
> +		goto fail;
> +	}
> +	if (*mem == NULL) {
> +		new = siw_mem_id2obj(sdev, sge->lkey >> 8);
> +		if (unlikely(!new)) {
> +			siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
> +			rv = -E_STAG_INVALID;
> +			goto fail;
> +		}
> +		*mem = new;
> +	}
> +	/* Check if user re-registered with different STag key */
> +	if (unlikely((*mem)->stag != sge->lkey)) {
> +		siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
> +		rv = -E_STAG_INVALID;
> +		goto fail;
> +	}
> +	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
> +	if (unlikely(rv))
> +		goto fail;
> +
> +	return 0;
> +
> +fail:
> +	if (new) {
> +		*mem = NULL;
> +		siw_mem_put(new);
> +	}
> +	return rv;
> +}
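
The caching behaviour the NOTE describes looks like this from the caller's
side (illustrative sketch only, not code from the patch):

	/* caller caches the resolved object in *mem across calls */
	struct siw_mem *mem = NULL;
	int rv;

	rv = siw_check_sge(pd, sge, &mem, IB_ACCESS_LOCAL_WRITE, 0, len);
	if (rv)
		return rv;	/* any freshly taken reference was dropped */

	/* ... access memory behind the SGE ... */

	siw_mem_put(mem);	/* pairs with the reference taken above */
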
> +void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
> +{
> +	switch (op) {
> +	case SIW_OP_SEND:
> +	case SIW_OP_WRITE:
> +	case SIW_OP_SEND_WITH_IMM:
> +	case SIW_OP_SEND_REMOTE_INV:
> +	case SIW_OP_READ:
> +	case SIW_OP_READ_LOCAL_INV:
> +		if (!(wqe->sqe.flags & SIW_WQE_INLINE))
> +			siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
> +		break;
> +
> +	case SIW_OP_RECEIVE:
> +		siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
> +		break;
> +
> +	case SIW_OP_READ_RESPONSE:
> +		siw_unref_mem_sgl(wqe->mem, 1);
> +		break;
> +
> +	default:
> +		/*
> +		 * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
> +		 * do not hold memory references
> +		 */
> +		break;
> +	}
> +}
> +
> +int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
> +{
> +	struct siw_device *sdev = to_siw_dev(pd->device);
> +	struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8);
> +	int rv = 0;
> +
> +	if (unlikely(!mem)) {
> +		siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
> +		return -EINVAL;
> +	}
> +	if (unlikely(mem->pd != pd)) {
> +		siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
> +		rv = -EACCES;
> +		goto out;
> +	}
> +	/*
> +	 * Per RDMA verbs definition, an STag may already be in invalid
> +	 * state if invalidation is requested. So no state check here.
> +	 */
> +	mem->stag_valid = 0;
> +
> +	siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
> +out:
> +	siw_mem_put(mem);
> +	return rv;
> +}
> +
> +/*
> + * Gets physical address backed by PBL element. Address is referenced
> + * by linear byte offset into list of variably sized PB elements.
> + * Optionally, provides remaining len within current element, and
> + * current PBL index for later resume at same element.
> + */
> +u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
> +{
> +	int i = idx ? *idx : 0;
> +
> +	while (i < pbl->num_buf) {
> +		struct siw_pble *pble = &pbl->pbe[i];
> +
> +		if (pble->pbl_off + pble->size > off) {
> +			u64 pble_off = off - pble->pbl_off;
> +
> +			if (len)
> +				*len = pble->size - pble_off;
> +			if (idx)
> +				*idx = i;
> +
> +			return pble->addr + pble_off;
> +		}
> +		i++;
> +	}
> +	if (len)
> +		*len = 0;
> +	return 0;
> +}
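
The resume-at-index behaviour is easiest to see in a caller loop (sketch
only; total_len is invented for the example):

	u64 paddr, offset = 0;
	int plen, pbl_idx = 0;

	while (offset < total_len) {
		paddr = siw_pbl_get_buffer(pbl, offset, &plen, &pbl_idx);
		if (!paddr)
			break;	/* offset runs past the end of the list */
		/* ... use up to plen bytes at paddr ... */
		offset += plen;
	}
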
> +struct siw_pbl *siw_pbl_alloc(u32 num_buf)
> +{
> +	struct siw_pbl *pbl;
> +	int buf_size = sizeof(*pbl);
> +
> +	if (num_buf == 0)
> +		return ERR_PTR(-EINVAL);
> +
> +	buf_size += ((num_buf - 1) * sizeof(struct siw_pble));
> +
> +	pbl = kzalloc(buf_size, GFP_KERNEL);
> +	if (!pbl)
> +		return ERR_PTR(-ENOMEM);
> +
> +	pbl->max_buf = num_buf;
> +
> +	return pbl;
> +}
> +
> +struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
> +{
> +	struct siw_umem *umem;
> +	struct mm_struct *mm_s;
> +	u64 first_page_va;
> +	unsigned long mlock_limit;
> +	unsigned int foll_flags = FOLL_WRITE;
> +	int num_pages, num_chunks, i, rv = 0;
> +
> +	if (!can_do_mlock())
> +		return ERR_PTR(-EPERM);
> +
> +	if (!len)
> +		return ERR_PTR(-EINVAL);
> +
> +	first_page_va = start & PAGE_MASK;
> +	num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
> +	num_chunks = (num_pages >> CHUNK_SHIFT) + 1;
> +
> +	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
> +	if (!umem)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mm_s = current->mm;
> +	umem->owning_mm = mm_s;
> +	umem->writable = writable;
> +
> +	mmgrab(mm_s);
> +
> +	if (!writable)
> +		foll_flags |= FOLL_FORCE;
> +
> +	down_read(&mm_s->mmap_sem);
> +
> +	mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +	if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
> +		rv = -ENOMEM;
> +		goto out_sem_up;
> +	}
> +	umem->fp_addr = first_page_va;
> +
> +	umem->page_chunk =
> +		kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
> +	if (!umem->page_chunk) {
> +		rv = -ENOMEM;
> +		goto out_sem_up;
> +	}
> +	for (i = 0; num_pages; i++) {
> +		int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK);
> +
> +		umem->page_chunk[i].p =
> +			kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
> +		if (!umem->page_chunk[i].p) {
> +			rv = -ENOMEM;
> +			goto out_sem_up;
> +		}
> +		got = 0;
> +		while (nents) {
> +			struct page **plist = &umem->page_chunk[i].p[got];
> +
> +			rv = get_user_pages(first_page_va, nents,
> +					    foll_flags | FOLL_LONGTERM,
> +					    plist, NULL);
> +			if (rv < 0)
> +				goto out_sem_up;
> +
> +			umem->num_pages += rv;
> +			atomic64_add(rv, &mm_s->pinned_vm);
> +			first_page_va += rv * PAGE_SIZE;
> +			nents -= rv;
> +			got += rv;
> +		}
> +		num_pages -= got;
> +	}
> +out_sem_up:
> +	up_read(&mm_s->mmap_sem);
> +
> +	if (rv > 0)
> +		return umem;
> +
> +	siw_umem_release(umem, false);
> +
> +	return ERR_PTR(rv);

No, please use ib_umem_get().
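
i.e. roughly, in the siw_reg_user_mr() path (sketch only, assuming the
current ib_umem_get(udata, addr, size, access, dmasync) signature; udata
and access_flags come from whatever reg_user_mr already has in hand):

	struct ib_umem *umem;

	/* the core helper does the FOLL_LONGTERM pinning and the
	 * pinned_vm / RLIMIT_MEMLOCK accounting this patch open codes
	 */
	umem = ib_umem_get(udata, start, len, access_flags, 0);
	if (IS_ERR(umem))
		return ERR_CAST(umem);

Thanks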