-----"Leon Romanovsky" <leon@xxxxxxxxxx> wrote: ----- >To: "Bernard Metzler" <bmt@xxxxxxxxxxxxxx> >From: "Leon Romanovsky" <leon@xxxxxxxxxx> >Date: 03/28/2019 04:36PM >Cc: linux-rdma@xxxxxxxxxxxxxxx >Subject: Re: [PATCH v6 06/13] SIW application interface > >On Mon, Mar 25, 2019 at 06:10:40PM +0100, Bernard Metzler wrote: >> Signed-off-by: Bernard Metzler <bmt@xxxxxxxxxxxxxx> >> --- >> drivers/infiniband/sw/siw/siw_ae.c | 87 ++ >> drivers/infiniband/sw/siw/siw_verbs.c | 1799 >++++++++++++++++++++++ >> drivers/infiniband/sw/siw/siw_verbs.h | 79 + >> include/uapi/rdma/rdma_user_ioctl_cmds.h | 1 + >> include/uapi/rdma/siw_user.h | 226 +++ >> 5 files changed, 2192 insertions(+) >> create mode 100644 drivers/infiniband/sw/siw/siw_ae.c >> create mode 100644 drivers/infiniband/sw/siw/siw_verbs.c >> create mode 100644 drivers/infiniband/sw/siw/siw_verbs.h >> create mode 100644 include/uapi/rdma/siw_user.h >> >> diff --git a/drivers/infiniband/sw/siw/siw_ae.c >b/drivers/infiniband/sw/siw/siw_ae.c >> new file mode 100644 >> index 000000000000..d1835a0b488c >> --- /dev/null >> +++ b/drivers/infiniband/sw/siw/siw_ae.c >> @@ -0,0 +1,87 @@ >> +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ >> + >> +/* Authors: Bernard Metzler <bmt@xxxxxxxxxxxxxx> */ >> +/* Copyright (c) 2008-2019, IBM Corporation */ >> + >> +#include <linux/errno.h> >> +#include <linux/types.h> >> +#include <linux/net.h> >> +#include <linux/scatterlist.h> >> +#include <linux/highmem.h> >> +#include <net/sock.h> >> +#include <net/tcp_states.h> >> +#include <net/tcp.h> >> + >> +#include <rdma/iw_cm.h> >> +#include <rdma/ib_verbs.h> >> +#include <rdma/ib_smi.h> >> +#include <rdma/ib_user_verbs.h> >> + >> +#include "siw.h" >> +#include "siw_obj.h" >> +#include "siw_cm.h" >> + >> +void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype) >> +{ >> + struct ib_event event; >> + struct ib_qp *base_qp = &qp->base_qp; >> + >> + /* >> + * Do not report asynchronous errors on QP which gets >> + * destroyed via verbs interface (siw_destroy_qp()) >> + */ >> + if (qp->attrs.flags & SIW_QP_IN_DESTROY) >> + return; >> + >> + event.event = etype; >> + event.device = base_qp->device; >> + event.element.qp = base_qp; >> + >> + if (base_qp->event_handler) { >> + siw_dbg_qp(qp, "reporting event %d\n", etype); >> + (*base_qp->event_handler)(&event, base_qp->qp_context); >> + } >> +} >> + >> +void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype) >> +{ >> + struct ib_event event; >> + struct ib_cq *base_cq = &cq->base_cq; >> + >> + event.event = etype; >> + event.device = base_cq->device; >> + event.element.cq = base_cq; >> + >> + if (base_cq->event_handler) { >> + siw_dbg(cq->hdr.sdev, "reporting CQ event %d\n", etype); >> + (*base_cq->event_handler)(&event, base_cq->cq_context); >> + } >> +} >> + >> +void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype) >> +{ >> + struct ib_event event; >> + struct ib_srq *base_srq = &srq->base_srq; >> + >> + event.event = etype; >> + event.device = base_srq->device; >> + event.element.srq = base_srq; >> + >> + if (base_srq->event_handler) { >> + siw_dbg(srq->pd->hdr.sdev, "reporting SRQ event %d\n", etype); >> + (*base_srq->event_handler)(&event, base_srq->srq_context); >> + } >> +} >> + >> +void siw_port_event(struct siw_device *sdev, u8 port, enum >ib_event_type etype) >> +{ >> + struct ib_event event; >> + >> + event.event = etype; >> + event.device = &sdev->base_dev; >> + event.element.port_num = port; >> + >> + siw_dbg(sdev, "reporting port event %d\n", etype); >> + >> + 
>> +	ib_dispatch_event(&event);
>> +}
>> diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
>> new file mode 100644
>> index 000000000000..3b4356474a79
>> --- /dev/null
>> +++ b/drivers/infiniband/sw/siw/siw_verbs.c
>> @@ -0,0 +1,1799 @@
>> +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
>> +
>> +/* Authors: Bernard Metzler <bmt@xxxxxxxxxxxxxx> */
>> +/* Copyright (c) 2008-2019, IBM Corporation */
>> +
>> +#include <linux/errno.h>
>> +#include <linux/types.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/vmalloc.h>
>> +
>> +#include <rdma/iw_cm.h>
>> +#include <rdma/ib_verbs.h>
>> +#include <rdma/ib_smi.h>
>> +#include <rdma/ib_user_verbs.h>
>> +#include <rdma/uverbs_ioctl.h>
>> +
>> +#include "siw.h"
>> +#include "siw_verbs.h"
>> +#include "siw_obj.h"
>> +#include "siw_cm.h"
>> +
>> +static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
>> +	[IB_QPS_RESET] = SIW_QP_STATE_IDLE,
>> +	[IB_QPS_INIT] = SIW_QP_STATE_IDLE,
>> +	[IB_QPS_RTR] = SIW_QP_STATE_RTR,
>> +	[IB_QPS_RTS] = SIW_QP_STATE_RTS,
>> +	[IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
>> +	[IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
>> +	[IB_QPS_ERR] = SIW_QP_STATE_ERROR
>> +};
>> +
>> +static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
>> +	[IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
>> +	[IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE",
>> +	[IB_QPS_ERR] = "ERR"
>> +};
>> +
>> +static u32 siw_insert_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size)
>> +{
>> +	struct siw_uobj *uobj;
>> +	u32 key;
>> +
>> +	uobj = kzalloc(sizeof(*uobj), GFP_KERNEL);
>> +	if (!uobj)
>> +		return SIW_INVAL_UOBJ_KEY;
>> +
>> +	size = PAGE_ALIGN(size);
>> +
>> +	spin_lock(&uctx->uobj_lock);
>> +
>> +	if (list_empty(&uctx->uobj_list))
>> +		uctx->uobj_key = 0;
>> +
>> +	key = uctx->uobj_key;
>> +	if (key > SIW_MAX_UOBJ_KEY) {
>> +		spin_unlock(&uctx->uobj_lock);
>> +		kfree(uobj);
>> +		return SIW_INVAL_UOBJ_KEY;
>> +	}
>> +	uobj->key = key;
>> +	uobj->size = size;
>> +	uobj->addr = vaddr;
>> +
>> +	uctx->uobj_key += size; /* advance for next object */
>> +
>> +	list_add_tail(&uobj->list, &uctx->uobj_list);
>> +
>> +	spin_unlock(&uctx->uobj_lock);
>> +
>> +	return key;
>> +}
>> +
>> +static struct siw_uobj *siw_remove_uobj(struct siw_ucontext *uctx, u32 key,
>> +					u32 size)
>> +{
>> +	struct list_head *pos, *nxt;
>> +
>> +	spin_lock(&uctx->uobj_lock);
>> +
>> +	list_for_each_safe(pos, nxt, &uctx->uobj_list) {
>> +		struct siw_uobj *uobj = list_entry(pos, struct siw_uobj, list);
>> +
>> +		if (uobj->key == key && uobj->size == size) {
>> +			list_del(&uobj->list);
>> +			spin_unlock(&uctx->uobj_lock);
>> +			return uobj;
>> +		}
>> +	}
>> +	spin_unlock(&uctx->uobj_lock);
>> +
>> +	return NULL;
>> +}
>> +
>> +int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
>> +{
>> +	struct siw_ucontext *uctx = to_siw_ctx(ctx);
>> +	struct siw_uobj *uobj;
>> +	u32 key = vma->vm_pgoff << PAGE_SHIFT;
>> +	int size = vma->vm_end - vma->vm_start;
>> +	int rv = -EINVAL;
>> +
>> +	/*
>> +	 * Must be page aligned
>> +	 */
>> +	if (vma->vm_start & (PAGE_SIZE - 1)) {
>> +		pr_warn("map not page aligned\n");
>> +		goto out;
>> +	}
>> +
>> +	uobj = siw_remove_uobj(uctx, key, size);
>> +	if (!uobj) {
>> +		pr_warn("mmap lookup failed: %u, %d\n", key, size);
>> +		goto out;
>> +	}
>> +	rv = remap_vmalloc_range(vma, uobj->addr, 0);
>> +	if (rv)
>> +		pr_warn("remap_vmalloc_range failed: %u, %d\n", key, size);
>> +
>> +	kfree(uobj);
>> +out:
>> +	return rv;
>> +}
>> +
>> +int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
>> +{
>> +	struct siw_device *sdev = to_siw_dev(base_ctx->device);
>> +	struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
>> +	struct siw_uresp_alloc_ctx uresp;
>> +	int rv;
>> +
>> +	if (!udata)
>> +		return 0;
>
>It is always correct.
>

OK

>> +
>> +	if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
>> +		rv = -ENOMEM;
>> +		goto err_out;
>> +	}
>> +	spin_lock_init(&ctx->uobj_lock);
>> +	INIT_LIST_HEAD(&ctx->uobj_list);
>> +	ctx->uobj_key = 0;
>> +	ctx->sdev = sdev;
>> +
>> +	memset(&uresp, 0, sizeof(uresp));
>
>Please declare uresp = {};

OK, right, will do that everywhere.

>
>> +	uresp.dev_id = sdev->vendor_part_id;
>> +
>> +	if (udata->outlen < sizeof(uresp)) {
>> +		rv = -EINVAL;
>> +		goto err_out;
>> +	}
>> +	rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
>> +	if (rv)
>> +		goto err_out;
>> +
>> +	siw_dbg(sdev, "success. now %d context(s)\n",
>> +		atomic_read(&sdev->num_ctx));
>> +
>> +	return 0;
>> +
>> +err_out:
>> +	atomic_dec(&sdev->num_ctx);
>> +	siw_dbg(sdev, "failure %d. now %d context(s)\n", rv,
>> +		atomic_read(&sdev->num_ctx));
>> +
>> +	return rv;
>> +}
>> +
>> +void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
>> +{
>> +	struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
>> +
>> +	atomic_dec(&ctx->sdev->num_ctx);
>> +}
>> +
>> +int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
>> +		     struct ib_udata *unused)
>> +{
>> +	struct siw_device *sdev = to_siw_dev(base_dev);
>> +	/*
>> +	 * A process context is needed to report avail memory resources.
>> +	 */
>> +	if (in_interrupt())
>> +		return -EINVAL;
>
>?????
>

The driver reports the maximum memory size available for reg_mr via the
current rlimit(RLIMIT_MEMLOCK), which is valid only within process
context. So far, I am not sure a kernel client would never call us from
interrupt context.

Alternatively, siw could just report ~0ull for max_mr_size. But I often
hit the case where user land applications tried to register more memory
than the current user credentials allow, and the device's
'attr.max_mr_size' was a good indication that ulimits should be
adapted. So let me report ~0ull for all calls which do not have udata
assigned (kernel client calls), and the current rlimit otherwise, as in
the first sketch below.

>> +
>> +	memset(attr, 0, sizeof(*attr));
>
>If I remember correctly, attr is already cleared prior to this call.

I didn't find evidence for that in rdma core, and as an iWARP device,
siw doesn't set all attributes. I now also checked other drivers, which
memset(0) as well.

>
>And please without siw_OBJ_get and siw_OBJ_put, they are not needed.
>If code is called in destroy paths, it means that the object exists
>and is ready to be released.
>

I understand the rdma mid layer code undergoes a substantial revision
to take over responsibility for rdma resource lifetimes. Let me try to
get rid of all reference counting where I am sure I can rely on the
rdma mid layer.

There are cases, though, where I do not see how that would work out:
assume a registered memory region gets invalidated while it is still
referenced by a current inbound WRITE or outbound READ. The driver must
guarantee the referenced memory region object stays intact, and that
should not require taking a heavy lock on that object on the fast path
(the second sketch below shows what I have in mind). The same problem
seems to hold for the lifetime of a QP: the mid layer cannot easily
make sure the QP object pointer remains valid for all CQEs referencing
that QP. That pointer shall remain valid until the CQE is polled by the
user, or the CQ gets flushed.
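For the max_mr_size reporting, this is roughly what I have in mind. An
untested sketch only: I renamed the 'unused' udata parameter to
'udata', the udata test replaces the in_interrupt() check, and the
setting of all other device attributes is left out:

#include <linux/sched/signal.h>	/* for rlimit() */

int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
		     struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	memset(attr, 0, sizeof(*attr));

	/*
	 * Kernel clients (no udata) may call from any context and have
	 * no per-process memlock limit: report "no limit" for those.
	 * User level clients run in process context, so report the
	 * current RLIMIT_MEMLOCK as a hint to how much memory reg_mr
	 * may actually be able to pin.
	 */
	if (udata)
		attr->max_mr_size = rlimit(RLIMIT_MEMLOCK);
	else
		attr->max_mr_size = ~0ull;

	/* (setting of the remaining device attributes omitted here) */

	return 0;
}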
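And to illustrate the MR invalidation case: the reference I would like
to keep is just a per-object kref, taken when a WRITE or READ starts
using the MR and dropped on completion. The structure layout and helper
names below are made up for illustration and are not the patch code:

#include <linux/kref.h>
#include <linux/slab.h>

struct siw_mr {
	struct ib_mr	base_mr;
	struct kref	ref;	/* pins object while work is in flight */
	/* remaining MR state left out */
};

static void siw_free_mr(struct kref *ref)
{
	struct siw_mr *mr = container_of(ref, struct siw_mr, ref);

	kfree(mr);
}

/* Taken when an inbound WRITE or outbound READ starts using the MR. */
static inline void siw_mr_get(struct siw_mr *mr)
{
	kref_get(&mr->ref);	/* atomic, no lock on the fast path */
}

/*
 * Dropped when the work completes; dereg_mr/invalidate drop the
 * initial reference. Whoever drops the last reference frees the MR,
 * so the object stays intact while still referenced.
 */
static inline void siw_mr_put(struct siw_mr *mr)
{
	kref_put(&mr->ref, siw_free_mr);
}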
So let me gradually get rid of all those references where I am
convinced they are just redundant to what the mid layer already
guarantees.

Thanks,
Bernard.