On 19-Feb-19 12:08, Bernard Metzler wrote: > Signed-off-by: Bernard Metzler <bmt@xxxxxxxxxxxxxx> > --- > drivers/infiniband/sw/siw/siw_ae.c | 121 ++ > drivers/infiniband/sw/siw/siw_verbs.c | 1851 ++++++++++++++++++++++ > drivers/infiniband/sw/siw/siw_verbs.h | 114 ++ > include/uapi/rdma/rdma_user_ioctl_cmds.h | 1 + > include/uapi/rdma/siw_user.h | 223 +++ > 5 files changed, 2310 insertions(+) > create mode 100644 drivers/infiniband/sw/siw/siw_ae.c > create mode 100644 drivers/infiniband/sw/siw/siw_verbs.c > create mode 100644 drivers/infiniband/sw/siw/siw_verbs.h > create mode 100644 include/uapi/rdma/siw_user.h > > diff --git a/drivers/infiniband/sw/siw/siw_ae.c b/drivers/infiniband/sw/siw/siw_ae.c > new file mode 100644 > index 000000000000..10907a8138b8 > --- /dev/null > +++ b/drivers/infiniband/sw/siw/siw_ae.c > @@ -0,0 +1,121 @@ > +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause > +/* > + * Software iWARP device driver > + * > + * Authors: Bernard Metzler <bmt@xxxxxxxxxxxxxx> > + * > + * Copyright (c) 2008-2018, IBM Corporation > + * > + * This software is available to you under a choice of one of two > + * licenses. You may choose to be licensed under the terms of the GNU > + * General Public License (GPL) Version 2, available from the file > + * COPYING in the main directory of this source tree, or the > + * BSD license below: > + * > + * Redistribution and use in source and binary forms, with or > + * without modification, are permitted provided that the following > + * conditions are met: > + * > + * - Redistributions of source code must retain the above copyright notice, > + * this list of conditions and the following disclaimer. > + * > + * - Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * - Neither the name of IBM nor the names of its contributors may be > + * used to endorse or promote products derived from this software without > + * specific prior written permission. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE > + * SOFTWARE. 
> + */ > + > +#include <linux/errno.h> > +#include <linux/types.h> > +#include <linux/net.h> > +#include <linux/scatterlist.h> > +#include <linux/highmem.h> > +#include <net/sock.h> > +#include <net/tcp_states.h> > +#include <net/tcp.h> > + > +#include <rdma/iw_cm.h> > +#include <rdma/ib_verbs.h> > +#include <rdma/ib_smi.h> > +#include <rdma/ib_user_verbs.h> > + > +#include "siw.h" > +#include "siw_obj.h" > +#include "siw_cm.h" > + > +void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype) > +{ > + struct ib_event event; > + struct ib_qp *base_qp = &qp->base_qp; > + > + /* > + * Do not report asynchronous errors on QP which gets > + * destroyed via verbs interface (siw_destroy_qp()) > + */ > + if (qp->attrs.flags & SIW_QP_IN_DESTROY) > + return; > + > + event.event = etype; > + event.device = base_qp->device; > + event.element.qp = base_qp; > + > + if (base_qp->event_handler) { > + siw_dbg_qp(qp, "reporting event %d\n", etype); > + (*base_qp->event_handler)(&event, base_qp->qp_context); > + } > +} > + > +void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype) > +{ > + struct ib_event event; > + struct ib_cq *base_cq = &cq->base_cq; > + > + event.event = etype; > + event.device = base_cq->device; > + event.element.cq = base_cq; > + > + if (base_cq->event_handler) { > + siw_dbg(cq->hdr.sdev, "reporting CQ event %d\n", etype); > + (*base_cq->event_handler)(&event, base_cq->cq_context); > + } > +} > + > +void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype) > +{ > + struct ib_event event; > + struct ib_srq *base_srq = &srq->base_srq; > + > + event.event = etype; > + event.device = base_srq->device; > + event.element.srq = base_srq; > + > + if (base_srq->event_handler) { > + siw_dbg(srq->pd->hdr.sdev, "reporting SRQ event %d\n", etype); > + (*base_srq->event_handler)(&event, base_srq->srq_context); > + } > +} > + > +void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype) > +{ > + struct ib_event event; > + > + event.event = etype; > + event.device = &sdev->base_dev; > + event.element.port_num = port; > + > + siw_dbg(sdev, "reporting port event %d\n", etype); > + > + ib_dispatch_event(&event); > +} > diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c > new file mode 100644 > index 000000000000..4c1fbcf66b5c > --- /dev/null > +++ b/drivers/infiniband/sw/siw/siw_verbs.c > @@ -0,0 +1,1851 @@ > +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause > +/* > + * Software iWARP device driver > + * > + * Authors: Bernard Metzler <bmt@xxxxxxxxxxxxxx> > + * > + * Copyright (c) 2008-2019, IBM Corporation > + * > + * This software is available to you under a choice of one of two > + * licenses. You may choose to be licensed under the terms of the GNU > + * General Public License (GPL) Version 2, available from the file > + * COPYING in the main directory of this source tree, or the > + * BSD license below: > + * > + * Redistribution and use in source and binary forms, with or > + * without modification, are permitted provided that the following > + * conditions are met: > + * > + * - Redistributions of source code must retain the above copyright notice, > + * this list of conditions and the following disclaimer. > + * > + * - Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. 
> + * > + * - Neither the name of IBM nor the names of its contributors may be > + * used to endorse or promote products derived from this software without > + * specific prior written permission. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE > + * SOFTWARE. > + */ > + > +#include <linux/errno.h> > +#include <linux/types.h> > +#include <linux/uaccess.h> > +#include <linux/vmalloc.h> > + > +#include <rdma/iw_cm.h> > +#include <rdma/ib_verbs.h> > +#include <rdma/ib_smi.h> > +#include <rdma/ib_user_verbs.h> > + > +#include "siw.h" > +#include "siw_verbs.h" > +#include "siw_obj.h" > +#include "siw_cm.h" > + > +static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR+1] = { > + [IB_QPS_RESET] = SIW_QP_STATE_IDLE, > + [IB_QPS_INIT] = SIW_QP_STATE_IDLE, > + [IB_QPS_RTR] = SIW_QP_STATE_RTR, > + [IB_QPS_RTS] = SIW_QP_STATE_RTS, > + [IB_QPS_SQD] = SIW_QP_STATE_CLOSING, > + [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE, > + [IB_QPS_ERR] = SIW_QP_STATE_ERROR > +}; > + > +static char ib_qp_state_to_string[IB_QPS_ERR+1][sizeof "RESET"] = { Paranthesis after sizeof please. > + [IB_QPS_RESET] = "RESET", > + [IB_QPS_INIT] = "INIT", > + [IB_QPS_RTR] = "RTR", > + [IB_QPS_RTS] = "RTS", > + [IB_QPS_SQD] = "SQD", > + [IB_QPS_SQE] = "SQE", > + [IB_QPS_ERR] = "ERR" > +}; > + > +static u32 siw_insert_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size) > +{ > + struct siw_uobj *uobj; > + u32 key; > + > + uobj = kzalloc(sizeof(*uobj), GFP_KERNEL); > + if (!uobj) > + return SIW_INVAL_UOBJ_KEY; > + > + size = PAGE_ALIGN(size); > + > + spin_lock(&uctx->uobj_lock); > + > + if (list_empty(&uctx->uobj_list)) > + uctx->uobj_key = 0; > + > + key = uctx->uobj_key; > + if (key > SIW_MAX_UOBJ_KEY) { > + spin_unlock(&uctx->uobj_lock); > + kfree(uobj); > + return SIW_INVAL_UOBJ_KEY; > + } > + uobj->key = key; > + uobj->size = size; > + uobj->addr = vaddr; > + > + uctx->uobj_key += size; /* advance for next object */ > + > + list_add_tail(&uobj->list, &uctx->uobj_list); > + > + spin_unlock(&uctx->uobj_lock); > + > + return key; > +} > + > +static struct siw_uobj *siw_remove_uobj(struct siw_ucontext *uctx, u32 key, > + u32 size) > +{ > + struct list_head *pos, *nxt; > + > + spin_lock(&uctx->uobj_lock); > + > + list_for_each_safe(pos, nxt, &uctx->uobj_list) { > + struct siw_uobj *uobj = list_entry(pos, struct siw_uobj, list); > + > + if (uobj->key == key && uobj->size == size) { > + list_del(&uobj->list); > + spin_unlock(&uctx->uobj_lock); > + return uobj; > + } > + } > + spin_unlock(&uctx->uobj_lock); > + > + return NULL; > +} > + > +int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) > +{ > + struct siw_ucontext *uctx = to_siw_ctx(ctx); > + struct siw_uobj *uobj; > + u32 key = vma->vm_pgoff << PAGE_SHIFT; > + int size = vma->vm_end - vma->vm_start; > + int rv = -EINVAL; > + > + /* > + * Must be page aligned > + */ > + if (vma->vm_start & (PAGE_SIZE - 1)) { > + pr_warn("map not page aligned\n"); > + goto out; > + } > + > + uobj = siw_remove_uobj(uctx, key, size); > + if (!uobj) { > + pr_warn("mmap lookup failed: %u, %d\n", key, size); > + goto out; > + } EFA used to remove the objects 
from the list on mmap and we were asked by Jason to keep them in the list until dealloc_ucontext. This way multiple mmaps could work as well. > + rv = remap_vmalloc_range(vma, uobj->addr, 0); > + if (rv) > + pr_warn("remap_vmalloc_range failed: %u, %d\n", key, size); > + > + kfree(uobj); > +out: > + return rv; > +} > + > +struct ib_ucontext *siw_alloc_ucontext(struct ib_device *base_dev, > + struct ib_udata *udata) > +{ > + struct siw_ucontext *ctx = NULL; > + struct siw_device *sdev = to_siw_dev(base_dev); > + int rv; > + > + if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) { > + rv = -ENOMEM; > + goto err_out; > + } > + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); > + if (!ctx) { > + rv = -ENOMEM; > + goto err_out; > + } > + spin_lock_init(&ctx->uobj_lock); > + INIT_LIST_HEAD(&ctx->uobj_list); > + ctx->uobj_key = 0; > + > + ctx->sdev = sdev; > + if (udata) { > + struct siw_uresp_alloc_ctx uresp; > + > + memset(&uresp, 0, sizeof(uresp)); > + uresp.dev_id = sdev->vendor_part_id; > + > + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); All ib_copy_from/to_udata should copy the minimum of sizeof() and udata->inlen/outlen. Applies to other places in the code as well. > + if (rv) > + goto err_out; > + } > + siw_dbg(sdev, "success. now %d context(s)\n", > + atomic_read(&sdev->num_ctx)); > + > + return &ctx->ib_ucontext; > + > +err_out: > + kfree(ctx); > + > + atomic_dec(&sdev->num_ctx); > + siw_dbg(sdev, "failure %d. now %d context(s)\n", > + rv, atomic_read(&sdev->num_ctx)); > + > + return ERR_PTR(rv); > +} > + > +int siw_dealloc_ucontext(struct ib_ucontext *base_ctx) > +{ > + struct siw_ucontext *ctx = to_siw_ctx(base_ctx); > + > + atomic_dec(&ctx->sdev->num_ctx); > + kfree(ctx); > + return 0; > +} > + > +int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, > + struct ib_udata *unused) > +{ > + struct siw_device *sdev = to_siw_dev(base_dev); > + /* > + * A process context is needed to report avail memory resources. > + */ > + if (in_interrupt()) > + return -EINVAL; > + > + memset(attr, 0, sizeof(*attr)); > + > + attr->max_mr_size = rlimit(RLIMIT_MEMLOCK); /* per process */ > + attr->vendor_id = SIW_VENDOR_ID; > + attr->vendor_part_id = sdev->vendor_part_id; > + attr->max_qp = sdev->attrs.max_qp; > + attr->max_qp_wr = sdev->attrs.max_qp_wr; > + > + attr->max_qp_rd_atom = sdev->attrs.max_ord; > + attr->max_qp_init_rd_atom = sdev->attrs.max_ird; > + attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird; > + attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS > + | IB_DEVICE_ALLOW_USER_UNREG; > + attr->max_send_sge = sdev->attrs.max_sge; > + attr->max_recv_sge = sdev->attrs.max_sge; > + attr->max_sge_rd = sdev->attrs.max_sge_rd; > + attr->max_cq = sdev->attrs.max_cq; > + attr->max_cqe = sdev->attrs.max_cqe; > + attr->max_mr = sdev->attrs.max_mr; > + attr->max_pd = sdev->attrs.max_pd; > + attr->max_mw = sdev->attrs.max_mw; > + attr->max_fmr = sdev->attrs.max_fmr; > + attr->max_srq = sdev->attrs.max_srq; > + attr->max_srq_wr = sdev->attrs.max_srq_wr; > + attr->max_srq_sge = sdev->attrs.max_srq_sge; > + attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL; > + attr->page_size_cap = PAGE_SIZE; > + /* Revisit if RFC 7306 gets supported */ > + attr->atomic_cap = 0; > + > + memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6); > + > + return 0; > +} > + > +/* > + * Approximate translation of real MTU for IB. 
> + */ > +static inline enum ib_mtu siw_mtu_net2base(unsigned short mtu) > +{ > + if (mtu >= 4096) > + return IB_MTU_4096; > + if (mtu >= 2048) > + return IB_MTU_2048; > + if (mtu >= 1024) > + return IB_MTU_1024; > + if (mtu >= 512) > + return IB_MTU_512; > + if (mtu >= 256) > + return IB_MTU_256; > + return IB_MTU_4096; > +} ib_mtu_int_to_enum()? > + > +int siw_query_port(struct ib_device *base_dev, u8 port, > + struct ib_port_attr *attr) > +{ > + struct siw_device *sdev = to_siw_dev(base_dev); > + > + memset(attr, 0, sizeof(*attr)); > + > + attr->state = sdev->state; > + attr->max_mtu = siw_mtu_net2base(sdev->netdev->mtu); > + attr->active_mtu = attr->max_mtu; > + attr->gid_tbl_len = 1; > + attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; > + attr->max_msg_sz = -1; > + attr->pkey_tbl_len = 1; > + attr->active_width = 2; > + attr->active_speed = 2; > + attr->phys_state = sdev->state == IB_PORT_ACTIVE ? 5 : 3; > + /* > + * All zero > + * > + * attr->lid = 0; > + * attr->bad_pkey_cntr = 0; > + * attr->qkey_viol_cntr = 0; > + * attr->sm_lid = 0; > + * attr->lmc = 0; > + * attr->max_vl_num = 0; > + * attr->sm_sl = 0; > + * attr->subnet_timeout = 0; > + * attr->init_type_repy = 0; > + */ > + return 0; > +} > + > +int siw_get_port_immutable(struct ib_device *base_dev, u8 port, > + struct ib_port_immutable *port_immutable) > +{ > + struct ib_port_attr attr; > + int rv = siw_query_port(base_dev, port, &attr); > + > + if (rv) > + return rv; > + > + port_immutable->pkey_tbl_len = attr.pkey_tbl_len; > + port_immutable->gid_tbl_len = attr.gid_tbl_len; > + port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; > + > + return 0; > +} > + > +int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey) > +{ > + /* Report the default pkey */ > + *pkey = 0xffff; > + return 0; > +} > + > +int siw_query_gid(struct ib_device *base_dev, u8 port, int idx, > + union ib_gid *gid) > +{ > + struct siw_device *sdev = to_siw_dev(base_dev); > + > + /* subnet_prefix == interface_id == 0; */ > + memset(gid, 0, sizeof(*gid)); > + memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6); > + > + return 0; > +} > + > +int siw_alloc_pd(struct ib_pd *base_pd, struct ib_ucontext *context, > + struct ib_udata *udata) > +{ > + struct siw_pd *pd = to_siw_pd(base_pd); > + struct siw_device *sdev = to_siw_dev(base_pd->device); > + > + if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) > + goto err_out; > + > + if (siw_pd_add(sdev, pd)) > + goto err_out; > + > + siw_dbg(sdev, "success. now %d PD's(s)\n", > + atomic_read(&sdev->num_pd)); > + > + return 0; > + > +err_out: > + atomic_dec(&sdev->num_pd); > + siw_dbg(sdev, "failed. now %d PD's(s)\n", > + atomic_read(&sdev->num_pd)); > + > + return -ENOMEM; > +} > + > +void siw_dealloc_pd(struct ib_pd *base_pd) > +{ > + struct siw_pd *pd = to_siw_pd(base_pd); > + struct siw_device *sdev = to_siw_dev(base_pd->device); > + > + siw_remove_obj(&sdev->lock, &sdev->pd_idr, &pd->hdr); > + siw_pd_put(pd); > +} > + > +void siw_qp_get_ref(struct ib_qp *base_qp) > +{ > + struct siw_qp *qp = to_siw_qp(base_qp); > + > + siw_dbg_qp(qp, "get user reference\n"); > + siw_qp_get(qp); > +} > + > +void siw_qp_put_ref(struct ib_qp *base_qp) > +{ > + struct siw_qp *qp = to_siw_qp(base_qp); > + > + siw_dbg_qp(qp, "put user reference\n"); > + siw_qp_put(qp); > +} > + > +/* > + * siw_create_qp() > + * > + * Create QP of requested size on given device. > + * > + * @base_pd: Base PD contained in siw PD > + * @attrs: Initial QP attributes. 
> + * @udata: used to provide QP ID, SQ and RQ size back to user. > + */ > + > +struct ib_qp *siw_create_qp(struct ib_pd *base_pd, > + struct ib_qp_init_attr *attrs, > + struct ib_udata *udata) > +{ > + struct siw_qp *qp = NULL; > + struct siw_pd *pd = to_siw_pd(base_pd); > + struct ib_device *base_dev = base_pd->device; > + struct siw_device *sdev = to_siw_dev(base_dev); > + struct siw_cq *scq = NULL, *rcq = NULL; > + > + unsigned long flags; > + int num_sqe, num_rqe, rv = 0; > + > + siw_dbg(sdev, "create new qp\n"); > + > + if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) { > + siw_dbg(sdev, "too many qp's\n"); > + rv = -ENOMEM; > + goto err_out; > + } > + if (attrs->qp_type != IB_QPT_RC) { > + siw_dbg(sdev, "only rc qp's supported\n"); > + rv = -EINVAL; > + goto err_out; > + } > + if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) || > + (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) || > + (attrs->cap.max_send_sge > SIW_MAX_SGE) || > + (attrs->cap.max_recv_sge > SIW_MAX_SGE)) { > + siw_dbg(sdev, "qp size error\n"); > + rv = -EINVAL; > + goto err_out; > + } > + if (attrs->cap.max_inline_data > SIW_MAX_INLINE) { > + siw_dbg(sdev, "max inline send: %d > %d\n", > + attrs->cap.max_inline_data, (int)SIW_MAX_INLINE); > + rv = -EINVAL; > + goto err_out; > + } > + /* > + * NOTE: we allow for zero element SQ and RQ WQE's SGL's > + * but not for a QP unable to hold any WQE (SQ + RQ) > + */ > + if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) { > + siw_dbg(sdev, "qp must have send or receive queue\n"); > + rv = -EINVAL; > + goto err_out; > + } > + > + scq = siw_cq_id2obj(sdev, ((struct siw_cq *)attrs->send_cq)->hdr.id); > + rcq = siw_cq_id2obj(sdev, ((struct siw_cq *)attrs->recv_cq)->hdr.id); > + > + if (!scq || (!rcq && !attrs->srq)) { > + siw_dbg(sdev, "send cq or receive cq invalid\n"); > + rv = -EINVAL; > + goto err_out; > + } > + qp = kzalloc(sizeof(*qp), GFP_KERNEL); > + if (!qp) { > + rv = -ENOMEM; > + goto err_out; > + } > + > + init_rwsem(&qp->state_lock); > + spin_lock_init(&qp->sq_lock); > + spin_lock_init(&qp->rq_lock); > + spin_lock_init(&qp->orq_lock); > + > + if (!base_pd->uobject) This should be tested using 'if (udata)'. > + qp->kernel_verbs = 1; > + > + rv = siw_qp_add(sdev, qp); > + if (rv) > + goto err_out; > + > + num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr); > + num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr); > + > + if (qp->kernel_verbs) > + qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe)); > + else > + qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe)); > + > + if (qp->sendq == NULL) { > + siw_dbg_qp(qp, "send queue size %d alloc failed\n", num_sqe); > + rv = -ENOMEM; > + goto err_out_idr; > + } > + if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) { > + if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) > + qp->attrs.flags |= SIW_SIGNAL_ALL_WR; > + else { > + rv = -EINVAL; > + goto err_out_idr; > + } > + } > + qp->pd = pd; > + qp->scq = scq; > + qp->rcq = rcq; > + > + if (attrs->srq) { > + /* > + * SRQ support. 
> + * Verbs 6.3.7: ignore RQ size, if SRQ present > + * Verbs 6.3.5: do not check PD of SRQ against PD of QP > + */ > + qp->srq = to_siw_srq(attrs->srq); > + qp->attrs.rq_size = 0; > + siw_dbg_qp(qp, "[SRQ 0x%p] attached\n", qp->srq); > + } else if (num_rqe) { > + if (qp->kernel_verbs) > + qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe)); > + else > + qp->recvq = vmalloc_user(num_rqe * > + sizeof(struct siw_rqe)); > + > + if (qp->recvq == NULL) { > + siw_dbg_qp(qp, "recv queue size %d alloc failed\n", > + num_rqe); > + rv = -ENOMEM; > + goto err_out_idr; > + } > + > + qp->attrs.rq_size = num_rqe; > + } > + qp->attrs.sq_size = num_sqe; > + qp->attrs.sq_max_sges = attrs->cap.max_send_sge; > + qp->attrs.rq_max_sges = attrs->cap.max_recv_sge; > + > + /* Make those two tunables fixed for now. */ > + qp->tx_ctx.gso_seg_limit = gso_seg_limit; > + qp->tx_ctx.zcopy_tx = zcopy_tx; > + > + qp->attrs.state = SIW_QP_STATE_IDLE; > + > + if (udata) { > + struct siw_uresp_create_qp uresp; > + struct siw_ucontext *ctx; > + > + memset(&uresp, 0, sizeof(uresp)); > + ctx = to_siw_ctx(base_pd->uobject->context); > + > + uresp.sq_key = uresp.rq_key = SIW_INVAL_UOBJ_KEY; > + uresp.num_sqe = num_sqe; > + uresp.num_rqe = num_rqe; > + uresp.qp_id = QP_ID(qp); > + > + if (qp->sendq) { > + uresp.sq_key = siw_insert_uobj(ctx, qp->sendq, > + num_sqe * sizeof(struct siw_sqe)); > + if (uresp.sq_key > SIW_MAX_UOBJ_KEY) > + siw_dbg_qp(qp, "preparing mmap sq failed\n"); > + } > + if (qp->recvq) { > + uresp.rq_key = siw_insert_uobj(ctx, qp->recvq, > + num_rqe * sizeof(struct siw_rqe)); > + if (uresp.rq_key > SIW_MAX_UOBJ_KEY) > + siw_dbg_qp(qp, "preparing mmap rq failed\n"); > + } > + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); > + if (rv) > + goto err_out_idr; > + } > + qp->tx_cpu = siw_get_tx_cpu(sdev); > + if (qp->tx_cpu < 0) { > + rv = -EINVAL; > + goto err_out_idr; > + } > + qp->base_qp.qp_num = QP_ID(qp); > + > + siw_pd_get(pd); > + > + INIT_LIST_HEAD(&qp->devq); > + spin_lock_irqsave(&sdev->lock, flags); > + list_add_tail(&qp->devq, &sdev->qp_list); > + spin_unlock_irqrestore(&sdev->lock, flags); > + > + return &qp->base_qp; > + > +err_out_idr: > + siw_remove_obj(&sdev->lock, &sdev->qp_idr, &qp->hdr); > +err_out: > + if (scq) > + siw_cq_put(scq); > + if (rcq) > + siw_cq_put(rcq); > + > + if (qp) { > + if (qp->sendq) > + vfree(qp->sendq); > + if (qp->recvq) > + vfree(qp->recvq); > + kfree(qp); > + } > + atomic_dec(&sdev->num_qp); > + > + return ERR_PTR(rv); > +} > + > +/* > + * Minimum siw_query_qp() verb interface. 
> + * > + * @qp_attr_mask is not used but all available information is provided > + */ > +int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, > + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) > +{ > + struct siw_qp *qp; > + struct siw_device *sdev; > + > + if (base_qp && qp_attr && qp_init_attr) { > + qp = to_siw_qp(base_qp); > + sdev = to_siw_dev(base_qp->device); > + } else > + return -EINVAL; > + > + qp_attr->cap.max_inline_data = SIW_MAX_INLINE; > + qp_attr->cap.max_send_wr = qp->attrs.sq_size; > + qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; > + qp_attr->cap.max_recv_wr = qp->attrs.rq_size; > + qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; > + qp_attr->path_mtu = siw_mtu_net2base(sdev->netdev->mtu); > + qp_attr->max_rd_atomic = qp->attrs.irq_size; > + qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; > + > + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | > + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; > + > + qp_init_attr->qp_type = base_qp->qp_type; > + qp_init_attr->send_cq = base_qp->send_cq; > + qp_init_attr->recv_cq = base_qp->recv_cq; > + qp_init_attr->srq = base_qp->srq; > + > + qp_init_attr->cap = qp_attr->cap; > + > + return 0; > +} > + > +int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, > + int attr_mask, struct ib_udata *udata) > +{ > + struct siw_qp_attrs new_attrs; > + enum siw_qp_attr_mask siw_attr_mask = 0; > + struct siw_qp *qp = to_siw_qp(base_qp); > + int rv = 0; > + > + if (!attr_mask) > + return 0; > + > + memset(&new_attrs, 0, sizeof(new_attrs)); > + > + if (attr_mask & IB_QP_ACCESS_FLAGS) { > + > + siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS; > + > + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) > + new_attrs.flags |= SIW_RDMA_READ_ENABLED; > + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) > + new_attrs.flags |= SIW_RDMA_WRITE_ENABLED; > + if (attr->qp_access_flags & IB_ACCESS_MW_BIND) > + new_attrs.flags |= SIW_RDMA_BIND_ENABLED; > + } > + if (attr_mask & IB_QP_STATE) { > + siw_dbg_qp(qp, "desired ib qp state: %s\n", > + ib_qp_state_to_string[attr->qp_state]); > + > + new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state]; > + > + if (new_attrs.state > SIW_QP_STATE_RTS) > + qp->tx_ctx.tx_suspend = 1; > + > + siw_attr_mask |= SIW_QP_ATTR_STATE; > + } > + if (!attr_mask) > + goto out; Remove, already checked at the beginning of the function. 
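On the earlier ib_copy_to_udata() remark: what I had in mind is the usual clamp against udata->outlen, e.g. for the alloc_ucontext response (untested sketch; the same applies to the create_qp/create_cq/create_srq responses and, with udata->inlen, to ib_copy_from_udata()):

	if (udata) {
		struct siw_uresp_alloc_ctx uresp;

		memset(&uresp, 0, sizeof(uresp));
		uresp.dev_id = sdev->vendor_part_id;

		rv = ib_copy_to_udata(udata, &uresp,
				      min_t(size_t, sizeof(uresp),
					    udata->outlen));
		if (rv)
			goto err_out;
	}

That way an older user library with a shorter response struct keeps working.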
> + > + down_write(&qp->state_lock); > + > + rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask); > + > + up_write(&qp->state_lock); > +out: > + return rv; > +} > + > +int siw_destroy_qp(struct ib_qp *base_qp) > +{ > + struct siw_qp *qp = to_siw_qp(base_qp); > + struct siw_qp_attrs qp_attrs; > + > + siw_dbg_qp(qp, "state %d, cep 0x%p\n", qp->attrs.state, qp->cep); > + > + /* > + * Mark QP as in process of destruction to prevent from > + * any async callbacks to RDMA core > + */ > + qp->attrs.flags |= SIW_QP_IN_DESTROY; > + qp->rx_ctx.rx_suspend = 1; > + > + down_write(&qp->state_lock); > + > + qp_attrs.state = SIW_QP_STATE_ERROR; > + (void)siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE); > + > + if (qp->cep) { > + siw_cep_put(qp->cep); > + qp->cep = NULL; > + } > + > + up_write(&qp->state_lock); > + > + kfree(qp->rx_ctx.mpa_crc_hd); > + kfree(qp->tx_ctx.mpa_crc_hd); > + > + /* Drop references */ > + siw_cq_put(qp->scq); > + siw_cq_put(qp->rcq); > + siw_pd_put(qp->pd); > + qp->scq = qp->rcq = NULL; > + > + siw_qp_put(qp); > + > + return 0; > +} > + > +/* > + * siw_copy_sgl() > + * > + * Copy SGL from RDMA core representation to local > + * representation. > + */ > +static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge *siw_sge, > + int num_sge) > +{ > + while (num_sge--) { > + siw_sge->laddr = sge->addr; > + siw_sge->length = sge->length; > + siw_sge->lkey = sge->lkey; > + > + siw_sge++; sge++; Split to two lines please. > + } > +} > + > +/* > + * siw_copy_inline_sgl() > + * > + * Prepare sgl of inlined data for sending. For userland callers > + * function checks if given buffer addresses and len's are within > + * process context bounds. > + * Data from all provided sge's are copied together into the wqe, > + * referenced by a single sge. > + */ > +static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, > + struct siw_sqe *sqe) > +{ > + struct ib_sge *core_sge = core_wr->sg_list; > + void *kbuf = &sqe->sge[1]; > + int num_sge = core_wr->num_sge, > + bytes = 0; > + > + sqe->sge[0].laddr = (u64)kbuf; > + sqe->sge[0].lkey = 0; > + > + while (num_sge--) { > + if (!core_sge->length) { > + core_sge++; > + continue; > + } > + bytes += core_sge->length; > + if (bytes > SIW_MAX_INLINE) { > + bytes = -EINVAL; > + break; > + } > + memcpy(kbuf, (void *)(uintptr_t)core_sge->addr, > + core_sge->length); > + > + kbuf += core_sge->length; > + core_sge++; > + } > + sqe->sge[0].length = bytes > 0 ? bytes : 0; > + sqe->num_sge = bytes > 0 ? 1 : 0; > + > + return bytes; > +} > + > +/* > + * siw_post_send() > + * > + * Post a list of S-WR's to a SQ. > + * > + * @base_qp: Base QP contained in siw QP > + * @wr: Null terminated list of user WR's > + * @bad_wr: Points to failing WR in case of synchronous failure. > + */ > +int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, > + const struct ib_send_wr **bad_wr) > +{ > + struct siw_qp *qp = to_siw_qp(base_qp); > + struct siw_wqe *wqe = tx_wqe(qp); > + > + unsigned long flags; > + int rv = 0; > + > + siw_dbg_qp(qp, "state %d\n", qp->attrs.state); > + > + /* > + * Try to acquire QP state lock. Must be non-blocking > + * to accommodate kernel clients needs. 
> + */ > + if (!down_read_trylock(&qp->state_lock)) { > + *bad_wr = wr; > + return -ENOTCONN; > + } > + > + if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { > + up_read(&qp->state_lock); > + *bad_wr = wr; > + return -ENOTCONN; > + } > + if (wr && !qp->kernel_verbs) { > + siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); > + up_read(&qp->state_lock); > + *bad_wr = wr; > + return -EINVAL; > + } > + > + spin_lock_irqsave(&qp->sq_lock, flags); > + > + while (wr) { > + u32 idx = qp->sq_put % qp->attrs.sq_size; > + struct siw_sqe *sqe = &qp->sendq[idx]; > + > + if (sqe->flags) { > + siw_dbg_qp(qp, "sq full\n"); > + rv = -ENOMEM; > + break; > + } > + if (wr->num_sge > qp->attrs.sq_max_sges) { > + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); > + rv = -EINVAL; > + break; > + } > + sqe->id = wr->wr_id; > + > + if ((wr->send_flags & IB_SEND_SIGNALED) || > + (qp->attrs.flags & SIW_SIGNAL_ALL_WR)) > + sqe->flags |= SIW_WQE_SIGNALLED; > + > + if (wr->send_flags & IB_SEND_FENCE) > + sqe->flags |= SIW_WQE_READ_FENCE; > + > + switch (wr->opcode) { > + > + case IB_WR_SEND: > + case IB_WR_SEND_WITH_INV: > + if (wr->send_flags & IB_SEND_SOLICITED) > + sqe->flags |= SIW_WQE_SOLICITED; > + > + if (!(wr->send_flags & IB_SEND_INLINE)) { > + siw_copy_sgl(wr->sg_list, sqe->sge, > + wr->num_sge); > + sqe->num_sge = wr->num_sge; > + } else { > + rv = siw_copy_inline_sgl(wr, sqe); > + if (rv <= 0) { > + rv = -EINVAL; > + break; > + } > + sqe->flags |= SIW_WQE_INLINE; > + sqe->num_sge = 1; > + } > + if (wr->opcode == IB_WR_SEND) > + sqe->opcode = SIW_OP_SEND; > + else { > + sqe->opcode = SIW_OP_SEND_REMOTE_INV; > + sqe->rkey = wr->ex.invalidate_rkey; > + } > + break; > + > + case IB_WR_RDMA_READ_WITH_INV: > + case IB_WR_RDMA_READ: > + /* > + * OFED WR restricts RREAD sink to SGL containing > + * 1 SGE only. we could relax to SGL with multiple > + * elements referring the SAME ltag or even sending > + * a private per-rreq tag referring to a checked > + * local sgl with MULTIPLE ltag's. would be easy > + * to do... > + */ > + if (unlikely(wr->num_sge != 1)) { > + rv = -EINVAL; > + break; > + } > + siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1); > + /* > + * NOTE: zero length RREAD is allowed! 
> + */ > + sqe->raddr = rdma_wr(wr)->remote_addr; > + sqe->rkey = rdma_wr(wr)->rkey; > + sqe->num_sge = 1; > + > + if (wr->opcode == IB_WR_RDMA_READ) > + sqe->opcode = SIW_OP_READ; > + else > + sqe->opcode = SIW_OP_READ_LOCAL_INV; > + break; > + > + case IB_WR_RDMA_WRITE: > + if (!(wr->send_flags & IB_SEND_INLINE)) { > + siw_copy_sgl(wr->sg_list, &sqe->sge[0], > + wr->num_sge); > + sqe->num_sge = wr->num_sge; > + } else { > + rv = siw_copy_inline_sgl(wr, sqe); > + if (unlikely(rv < 0)) { > + rv = -EINVAL; > + break; > + } > + sqe->flags |= SIW_WQE_INLINE; > + sqe->num_sge = 1; > + } > + sqe->raddr = rdma_wr(wr)->remote_addr; > + sqe->rkey = rdma_wr(wr)->rkey; > + sqe->opcode = SIW_OP_WRITE; > + > + break; > + > + case IB_WR_REG_MR: > + sqe->base_mr = (uint64_t)reg_wr(wr)->mr; > + sqe->rkey = reg_wr(wr)->key; > + sqe->access = SIW_MEM_LREAD; > + if (reg_wr(wr)->access & IB_ACCESS_LOCAL_WRITE) > + sqe->access |= SIW_MEM_LWRITE; > + if (reg_wr(wr)->access & IB_ACCESS_REMOTE_WRITE) > + sqe->access |= SIW_MEM_RWRITE; > + if (reg_wr(wr)->access & IB_ACCESS_REMOTE_READ) > + sqe->access |= SIW_MEM_RREAD; > + sqe->opcode = SIW_OP_REG_MR; > + > + break; > + > + case IB_WR_LOCAL_INV: > + sqe->rkey = wr->ex.invalidate_rkey; > + sqe->opcode = SIW_OP_INVAL_STAG; > + > + break; > + > + default: > + siw_dbg_qp(qp, "ib wr type %d unsupported\n", > + wr->opcode); > + rv = -EINVAL; > + break; > + } > + siw_dbg_qp(qp, "opcode %d, flags 0x%x\n", > + sqe->opcode, sqe->flags); > + > + if (unlikely(rv < 0)) > + break; > + > + /* make SQE only vaild after completely written */ > + smp_wmb(); > + sqe->flags |= SIW_WQE_VALID; > + > + qp->sq_put++; > + wr = wr->next; > + } > + > + /* > + * Send directly if SQ processing is not in progress. > + * Eventual immediate errors (rv < 0) do not affect the involved > + * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ > + * processing, if new work is already pending. But rv must be passed > + * to caller. > + */ > + if (wqe->wr_status != SIW_WR_IDLE) { > + spin_unlock_irqrestore(&qp->sq_lock, flags); > + goto skip_direct_sending; > + } > + rv = siw_activate_tx(qp); > + spin_unlock_irqrestore(&qp->sq_lock, flags); > + > + if (rv <= 0) > + goto skip_direct_sending; > + > + if (qp->kernel_verbs) { > + rv = siw_sq_start(qp); > + } else { > + qp->tx_ctx.in_syscall = 1; > + > + if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend)) > + siw_qp_cm_drop(qp, 0); > + > + qp->tx_ctx.in_syscall = 0; > + } > +skip_direct_sending: > + > + up_read(&qp->state_lock); > + > + if (rv >= 0) > + return 0; > + /* > + * Immediate error > + */ > + siw_dbg_qp(qp, "error %d\n", rv); > + > + *bad_wr = wr; > + return rv; > +} > + > +/* > + * siw_post_receive() > + * > + * Post a list of R-WR's to a RQ. > + * > + * @base_qp: Base QP contained in siw QP > + * @wr: Null terminated list of user WR's > + * @bad_wr: Points to failing WR in case of synchronous failure. > + */ > +int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, > + const struct ib_recv_wr **bad_wr) > +{ > + struct siw_qp *qp = to_siw_qp(base_qp); There's a tab instead of a space between siq_qp and *qp. > + unsigned long flags; > + int rv = 0; > + > + if (qp->srq) { > + *bad_wr = wr; > + return -EOPNOTSUPP; /* what else from errno.h? */ > + } > + /* > + * Try to acquire QP state lock. Must be non-blocking > + * to accommodate kernel clients needs. > + */ > + if (!down_read_trylock(&qp->state_lock)) { Perhaps this rwsemlock should be replaced with a spinlock. 
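One more nit in siw_post_send() while I am here: for publishing the SQE to the consumer,

	/* make SQE only vaild after completely written */
	smp_wmb();
	sqe->flags |= SIW_WQE_VALID;

smp_store_release() would express the intent a bit more directly, assuming the reader side pairs with smp_load_acquire()/READ_ONCE() (I have not checked siw_activate_tx() for that), e.g.

	/* publish the SQE only after it is completely written */
	smp_store_release(&sqe->flags, sqe->flags | SIW_WQE_VALID);

Also s/vaild/valid/ in that comment. Same remark for the RQE/SRQ RQE stores further down.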
> + *bad_wr = wr; > + return -ENOTCONN; > + } > + if (!qp->kernel_verbs) { > + siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n"); > + up_read(&qp->state_lock); > + *bad_wr = wr; > + return -EINVAL; > + } > + if (qp->attrs.state > SIW_QP_STATE_RTS) { > + up_read(&qp->state_lock); > + *bad_wr = wr; > + return -EINVAL; > + } > + /* > + * Serialize potentially multiple producers. > + * Not needed for single threaded consumer side. > + */ > + spin_lock_irqsave(&qp->rq_lock, flags); > + > + while (wr) { > + u32 idx = qp->rq_put % qp->attrs.rq_size; > + struct siw_rqe *rqe = &qp->recvq[idx]; > + > + if (rqe->flags) { > + siw_dbg_qp(qp, "Receive Queue full\n"); > + rv = -ENOMEM; > + break; > + } > + if (wr->num_sge > qp->attrs.rq_max_sges) { > + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); > + rv = -EINVAL; > + break; > + } > + rqe->id = wr->wr_id; > + rqe->num_sge = wr->num_sge; > + siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); > + > + /* make sure RQE is completely written before valid */ > + smp_wmb(); > + > + rqe->flags = SIW_WQE_VALID; > + > + qp->rq_put++; > + wr = wr->next; > + } > + spin_unlock_irqrestore(&qp->rq_lock, flags); > + > + up_read(&qp->state_lock); > + > + if (rv < 0) { > + siw_dbg_qp(qp, "error %d\n", rv); > + *bad_wr = wr; > + } > + return rv > 0 ? 0 : rv; > +} > + > +int siw_destroy_cq(struct ib_cq *base_cq) > +{ > + struct siw_cq *cq = to_siw_cq(base_cq); > + struct ib_device *base_dev = base_cq->device; > + struct siw_device *sdev = to_siw_dev(base_dev); > + > + siw_cq_flush(cq); > + > + siw_remove_obj(&sdev->lock, &sdev->cq_idr, &cq->hdr); > + siw_cq_put(cq); > + > + return 0; > +} > + > +/* > + * siw_create_cq() > + * > + * Create CQ of requested size on given device. > + * > + * @base_dev: RDMA device contained in siw device > + * @size: maximum number of CQE's allowed. > + * @ib_context: user context. > + * @udata: used to provide CQ ID back to user. > + */ > + > +struct ib_cq *siw_create_cq(struct ib_device *base_dev, > + const struct ib_cq_init_attr *attr, > + struct ib_ucontext *ib_context, > + struct ib_udata *udata) > +{ > + struct siw_cq *cq = NULL; > + struct siw_device *sdev = to_siw_dev(base_dev); > + struct siw_uresp_create_cq uresp; > + int rv, size = attr->cqe; > + > + if (!base_dev) { > + rv = -ENODEV; > + goto err_out; > + } Is this really needed? > + if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) { > + siw_dbg(sdev, "too many cq's\n"); > + rv = -ENOMEM; > + goto err_out; > + } > + if (size < 1 || size > sdev->attrs.max_cqe) { > + siw_dbg(sdev, "cq size error: %d\n", size); > + rv = -EINVAL; > + goto err_out; > + } > + cq = kzalloc(sizeof(*cq), GFP_KERNEL); > + if (!cq) { > + rv = -ENOMEM; > + goto err_out; > + } > + size = roundup_pow_of_two(size); > + cq->base_cq.cqe = size; > + cq->num_cqe = size; No reason to have another field for num_cqe, just use base_cq.cqe. 
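To make that concrete (sketch only; I did not chase down every reader of cq->num_cqe): keep

	size = roundup_pow_of_two(size);
	cq->base_cq.cqe = size;

drop the

	cq->num_cqe = size;

assignment plus the num_cqe member from struct siw_cq, and read cq->base_cq.cqe wherever the ring size is needed (siw_reap_cqe(), the queue allocation and mmap sizing below). One less copy to keep in sync.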
> + > + if (!ib_context) { > + cq->kernel_verbs = 1; > + cq->queue = vzalloc(size * sizeof(struct siw_cqe) > + + sizeof(struct siw_cq_ctrl)); > + } else { > + cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) > + + sizeof(struct siw_cq_ctrl)); > + } > + if (cq->queue == NULL) { > + rv = -ENOMEM; > + goto err_out; > + } > + > + rv = siw_cq_add(sdev, cq); > + if (rv) > + goto err_out; > + > + spin_lock_init(&cq->lock); > + > + cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify; > + > + if (!cq->kernel_verbs) { > + struct siw_ucontext *ctx = to_siw_ctx(ib_context); > + > + uresp.cq_key = siw_insert_uobj(ctx, cq->queue, > + size * sizeof(struct siw_cqe) + > + sizeof(struct siw_cq_ctrl)); > + > + if (uresp.cq_key > SIW_MAX_UOBJ_KEY) > + siw_dbg(sdev, "[CQ %d]: preparing mmap failed\n", > + OBJ_ID(cq)); > + > + uresp.cq_id = OBJ_ID(cq); > + uresp.num_cqe = size; > + > + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); > + if (rv) > + goto err_out_idr; > + } > + return &cq->base_cq; > + > +err_out_idr: > + siw_remove_obj(&sdev->lock, &sdev->cq_idr, &cq->hdr); > +err_out: > + siw_dbg(sdev, "cq creation failed: %d", rv); > + > + if (cq && cq->queue) > + vfree(cq->queue); > + > + kfree(cq); > + atomic_dec(&sdev->num_cq); > + > + return ERR_PTR(rv); > +} > + > +/* > + * siw_poll_cq() > + * > + * Reap CQ entries if available and copy work completion status into > + * array of WC's provided by caller. Returns number of reaped CQE's. > + * > + * @base_cq: Base CQ contained in siw CQ. > + * @num_cqe: Maximum number of CQE's to reap. > + * @wc: Array of work completions to be filled by siw. > + */ > +int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc) > +{ > + struct siw_cq *cq = to_siw_cq(base_cq); > + int i; > + > + for (i = 0; i < num_cqe; i++) { > + if (!(siw_reap_cqe(cq, wc))) Extra paranthesis. > + break; > + wc++; > + } > + return i; > +} > + > +/* > + * siw_req_notify_cq() > + * > + * Request notification for new CQE's added to that CQ. > + * Defined flags: > + * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification > + * event if a WQE with notification flag set enters the CQ > + * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification > + * event if a WQE enters the CQ. > + * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the > + * number of not reaped CQE's regardless of its notification > + * type and current or new CQ notification settings. > + * > + * @base_cq: Base CQ contained in siw CQ. > + * @flags: Requested notification flags. > + */ > +int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags) > +{ > + struct siw_cq *cq = to_siw_cq(base_cq); > + > + siw_dbg(cq->hdr.sdev, "[CQ %d]: flags: 0x%8x\n", OBJ_ID(cq), flags); > + > + if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) > + /* CQ event for next solicited completion */ > + smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED); > + else > + /* CQ event for any signalled completion */ > + smp_store_mb(*cq->notify, SIW_NOTIFY_ALL); > + > + if (flags & IB_CQ_REPORT_MISSED_EVENTS) > + return cq->cq_put - cq->cq_get; > + > + return 0; > +} > + > +/* > + * siw_dereg_mr() > + * > + * Release Memory Region. > + * > + * TODO: Update function if Memory Windows are supported by siw: > + * Is OFED core checking for MW dependencies for current > + * MR before calling MR deregistration?. > + * > + * @base_mr: Base MR contained in siw MR. 
> + */ > +int siw_dereg_mr(struct ib_mr *base_mr) > +{ > + struct siw_mr *mr; > + struct siw_device *sdev = to_siw_dev(base_mr->device); > + > + mr = to_siw_mr(base_mr); > + > + siw_dbg(sdev, "[MEM %d]: deregister mr, #ref's %d\n", > + mr->mem.hdr.id, kref_read(&mr->mem.hdr.ref)); > + > + mr->mem.stag_valid = 0; > + > + siw_remove_obj(&sdev->lock, &sdev->mem_idr, &mr->mem.hdr); > + siw_mem_put(&mr->mem); > + > + return 0; > +} > + > +static struct siw_mr *siw_create_mr(struct siw_device *sdev, void *mem_obj, > + u64 start, u64 len, int rights) > +{ > + struct siw_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL); > + unsigned long flags; > + > + if (!mr) > + return NULL; > + > + mr->mem.stag_valid = 0; > + > + if (siw_mem_add(sdev, &mr->mem) < 0) { > + kfree(mr); > + return NULL; > + } > + siw_dbg(sdev, "[MEM %d]: new mr, object 0x%p\n", > + mr->mem.hdr.id, mem_obj); > + > + mr->base_mr.lkey = mr->base_mr.rkey = mr->mem.hdr.id << 8; > + > + mr->mem.va = start; > + mr->mem.len = len; > + mr->mem.mr = NULL; > + mr->mem.perms = SIW_MEM_LREAD | /* not selectable in RDMA core */ > + (rights & IB_ACCESS_REMOTE_READ ? SIW_MEM_RREAD : 0) | > + (rights & IB_ACCESS_LOCAL_WRITE ? SIW_MEM_LWRITE : 0) | > + (rights & IB_ACCESS_REMOTE_WRITE ? SIW_MEM_RWRITE : 0); > + > + mr->mem_obj = mem_obj; > + > + INIT_LIST_HEAD(&mr->devq); > + spin_lock_irqsave(&sdev->lock, flags); > + list_add_tail(&mr->devq, &sdev->mr_list); > + spin_unlock_irqrestore(&sdev->lock, flags); > + > + return mr; > +} > + > +/* > + * siw_reg_user_mr() > + * > + * Register Memory Region. > + * > + * @base_pd: Base PD contained in siw PD. > + * @start: starting address of MR (virtual address) > + * @len: len of MR > + * @rnic_va: not used by siw > + * @rights: MR access rights > + * @udata: user buffer to communicate STag and Key. 
> + */ > +struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len, > + u64 rnic_va, int rights, struct ib_udata *udata) > +{ > + struct siw_mr *mr = NULL; > + struct siw_pd *pd = to_siw_pd(base_pd); > + struct siw_umem *umem = NULL; > + struct siw_ureq_reg_mr ureq; > + struct siw_uresp_reg_mr uresp; > + struct siw_device *sdev = pd->hdr.sdev; > + > + unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK); > + int rv; > + > + siw_dbg(sdev, "[PD %d]: start: 0x%016llx, va: 0x%016llx, len: %llu\n", > + OBJ_ID(pd), (unsigned long long)start, > + (unsigned long long)rnic_va, (unsigned long long)len); > + > + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { > + siw_dbg(sdev, "[PD %d]: too many mr's\n", OBJ_ID(pd)); > + rv = -ENOMEM; > + goto err_out; > + } > + if (!len) { > + rv = -EINVAL; > + goto err_out; > + } > + if (mem_limit != RLIM_INFINITY) { > + unsigned long num_pages = > + (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT; > + mem_limit >>= PAGE_SHIFT; > + > + if (num_pages > mem_limit - current->mm->locked_vm) { > + siw_dbg(sdev, > + "[PD %d]: pages req %lu, max %lu, lock %lu\n", > + OBJ_ID(pd), num_pages, mem_limit, > + current->mm->locked_vm); > + rv = -ENOMEM; > + goto err_out; > + } > + } > + umem = siw_umem_get(start, len); > + if (IS_ERR(umem)) { > + rv = PTR_ERR(umem); > + siw_dbg(sdev, "[PD %d]: getting user memory failed: %d\n", > + OBJ_ID(pd), rv); > + umem = NULL; > + goto err_out; > + } > + mr = siw_create_mr(sdev, umem, start, len, rights); > + if (!mr) { > + rv = -ENOMEM; > + goto err_out; > + } > + if (udata) { > + rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); > + if (rv) > + goto err_out; > + > + mr->base_mr.lkey |= ureq.stag_key; > + mr->base_mr.rkey |= ureq.stag_key; > + uresp.stag = mr->base_mr.lkey; > + > + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); > + if (rv) > + goto err_out; > + } > + mr->pd = pd; > + siw_pd_get(pd); > + > + mr->mem.stag_valid = 1; > + > + return &mr->base_mr; > + > +err_out: > + if (mr) { > + siw_remove_obj(&sdev->lock, &sdev->mem_idr, &mr->mem.hdr); > + siw_mem_put(&mr->mem); > + umem = NULL; > + } else > + atomic_dec(&sdev->num_mr); > + > + if (umem) > + siw_umem_release(umem); > + > + return ERR_PTR(rv); > +} > + > +struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type, > + u32 max_sge) > +{ > + struct siw_mr *mr; > + struct siw_pd *pd = to_siw_pd(base_pd); > + struct siw_device *sdev = pd->hdr.sdev; > + struct siw_pbl *pbl = NULL; > + int rv; > + > + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { > + siw_dbg(sdev, "[PD %d]: too many mr's\n", OBJ_ID(pd)); > + rv = -ENOMEM; > + goto err_out; > + } > + if (mr_type != IB_MR_TYPE_MEM_REG) { > + siw_dbg(sdev, "[PD %d]: mr type %d unsupported\n", > + OBJ_ID(pd), mr_type); > + rv = -EOPNOTSUPP; > + goto err_out; > + } > + if (max_sge > SIW_MAX_SGE_PBL) { > + siw_dbg(sdev, "[PD %d]: too many sge's: %d\n", > + OBJ_ID(pd), max_sge); > + rv = -ENOMEM; > + goto err_out; > + } > + pbl = siw_pbl_alloc(max_sge); > + if (IS_ERR(pbl)) { > + rv = PTR_ERR(pbl); > + siw_dbg(sdev, "[PD %d]: pbl allocation failed: %d\n", > + OBJ_ID(pd), rv); > + pbl = NULL; > + goto err_out; > + } > + mr = siw_create_mr(sdev, pbl, 0, max_sge * PAGE_SIZE, 0); > + if (!mr) { > + rv = -ENOMEM; > + goto err_out; > + } > + mr->mem.is_pbl = 1; > + mr->pd = pd; > + siw_pd_get(pd); > + > + siw_dbg(sdev, "[PD %d], [MEM %d]: success\n", > + OBJ_ID(pd), OBJ_ID(&mr->mem)); > + > + return &mr->base_mr; > + > +err_out: > + if (pbl) > + siw_pbl_free(pbl); > + > + siw_dbg(sdev, "[PD 
%d]: failed: %d\n", OBJ_ID(pd), rv); > + > + atomic_dec(&sdev->num_mr); > + > + return ERR_PTR(rv); > +} > + > +/* Just used to count number of pages being mapped */ > +static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr) > +{ > + return 0; > +} > + > +int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, > + unsigned int *sg_off) > +{ > + struct scatterlist *slp; > + struct siw_mr *mr = to_siw_mr(base_mr); > + struct siw_pbl *pbl = mr->pbl; > + struct siw_pble *pble = pbl->pbe; > + u64 pbl_size; > + int i, rv; > + > + if (!pbl) { You already dereferenced pbl for pble assignment. > + siw_dbg(mr->mem.hdr.sdev, "[MEM %d]: no pbl allocated\n", > + OBJ_ID(&mr->mem)); > + return -EINVAL; > + } > + if (pbl->max_buf < num_sle) { > + siw_dbg(mr->mem.hdr.sdev, "[MEM %d]: too many sge's: %d>%d\n", > + OBJ_ID(&mr->mem), mr->pbl->max_buf, num_sle); > + return -ENOMEM; > + } > + > + for_each_sg(sl, slp, num_sle, i) { > + if (sg_dma_len(slp) == 0) { > + siw_dbg(mr->mem.hdr.sdev, "[MEM %d]: empty sge\n", > + OBJ_ID(&mr->mem)); > + return -EINVAL; > + } > + if (i == 0) { > + pble->addr = sg_dma_address(slp); > + pble->size = sg_dma_len(slp); > + pble->pbl_off = 0; > + pbl_size = pble->size; > + pbl->num_buf = 1; > + > + continue; > + } > + /* Merge PBL entries if adjacent */ > + if (pble->addr + pble->size == sg_dma_address(slp)) > + pble->size += sg_dma_len(slp); > + else { > + pble++; > + pbl->num_buf++; > + pble->addr = sg_dma_address(slp); > + pble->size = sg_dma_len(slp); > + pble->pbl_off = pbl_size; > + } > + pbl_size += sg_dma_len(slp); > + > + siw_dbg(mr->mem.hdr.sdev, > + "[MEM %d]: sge[%d], size %llu, addr %p, total %llu\n", > + OBJ_ID(&mr->mem), i, pble->size, (void *)pble->addr, > + pbl_size); > + } > + rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page); > + if (rv > 0) { > + mr->mem.len = base_mr->length; > + mr->mem.va = base_mr->iova; > + siw_dbg(mr->mem.hdr.sdev, > + "[MEM %d]: %llu byte, %u SLE into %u entries\n", > + OBJ_ID(&mr->mem), mr->mem.len, num_sle, pbl->num_buf); > + } > + return rv; > +} > + > +/* > + * siw_get_dma_mr() > + * > + * Create a (empty) DMA memory region, where no umem is attached. > + */ > +struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights) > +{ > + struct siw_mr *mr; > + struct siw_pd *pd = to_siw_pd(base_pd); > + struct siw_device *sdev = pd->hdr.sdev; > + int rv; > + > + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { > + siw_dbg(sdev, "[PD %d]: too many mr's\n", OBJ_ID(pd)); > + rv = -ENOMEM; > + goto err_out; > + } > + mr = siw_create_mr(sdev, NULL, 0, ULONG_MAX, rights); > + if (!mr) { > + rv = -ENOMEM; > + goto err_out; > + } > + mr->mem.stag_valid = 1; > + > + mr->pd = pd; > + siw_pd_get(pd); > + > + siw_dbg(sdev, "[PD %d], [MEM %d]: success\n", > + OBJ_ID(pd), OBJ_ID(&mr->mem)); > + > + return &mr->base_mr; > + > +err_out: > + atomic_dec(&sdev->num_mr); > + > + return ERR_PTR(rv); > +} > + > +/* > + * siw_create_srq() > + * > + * Create Shared Receive Queue of attributes @init_attrs > + * within protection domain given by @base_pd. > + * > + * @base_pd: Base PD contained in siw PD. > + * @init_attrs: SRQ init attributes. > + * @udata: not used by siw. 
> + */ > +struct ib_srq *siw_create_srq(struct ib_pd *base_pd, > + struct ib_srq_init_attr *init_attrs, > + struct ib_udata *udata) > +{ > + struct siw_srq *srq = NULL; > + struct ib_srq_attr *attrs = &init_attrs->attr; > + struct siw_pd *pd = to_siw_pd(base_pd); > + struct siw_device *sdev = pd->hdr.sdev; > + > + int kernel_verbs = base_pd->uobject ? 0 : 1; > + int rv; > + > + if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) { > + siw_dbg(sdev, "too many SRQ's\n"); > + rv = -ENOMEM; > + goto err_out; > + } > + if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR || > + attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) { > + rv = -EINVAL; > + goto err_out; > + } > + > + srq = kzalloc(sizeof(*srq), GFP_KERNEL); > + if (!srq) { > + rv = -ENOMEM; > + goto err_out; > + } > + > + srq->max_sge = attrs->max_sge; > + srq->num_rqe = roundup_pow_of_two(attrs->max_wr); > + atomic_set(&srq->space, srq->num_rqe); > + > + srq->limit = attrs->srq_limit; > + if (srq->limit) > + srq->armed = 1; > + > + if (kernel_verbs) > + srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe)); > + else > + srq->recvq = vmalloc_user(srq->num_rqe * > + sizeof(struct siw_rqe)); > + > + if (srq->recvq == NULL) { > + rv = -ENOMEM; > + goto err_out; > + } > + if (kernel_verbs) { > + srq->kernel_verbs = 1; > + } else if (udata) { > + struct siw_uresp_create_srq uresp; > + struct siw_ucontext *ctx; > + > + memset(&uresp, 0, sizeof(uresp)); > + ctx = to_siw_ctx(base_pd->uobject->context); > + > + uresp.num_rqe = srq->num_rqe; > + uresp.srq_key = siw_insert_uobj(ctx, srq->recvq, > + srq->num_rqe * sizeof(struct siw_rqe)); > + > + if (uresp.srq_key > SIW_MAX_UOBJ_KEY) > + siw_dbg(sdev, "preparing mmap srq failed\n"); > + > + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); > + if (rv) > + goto err_out; > + } > + srq->pd = pd; > + siw_pd_get(pd); > + > + spin_lock_init(&srq->lock); > + > + siw_dbg(sdev, "[SRQ 0x%p]: success\n", srq); > + > + return &srq->base_srq; > + > +err_out: > + if (srq) { > + if (srq->recvq) > + vfree(srq->recvq); > + kfree(srq); > + } > + atomic_dec(&sdev->num_srq); > + > + return ERR_PTR(rv); > +} > + > +/* > + * siw_modify_srq() > + * > + * Modify SRQ. The caller may resize SRQ and/or set/reset notification > + * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification. > + * > + * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE > + * parameter. siw_modify_srq() does not check the attrs->max_sge param. > + */ > +int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs, > + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) > +{ > + struct siw_srq *srq = to_siw_srq(base_srq); > + unsigned long flags; > + int rv = 0; > + > + spin_lock_irqsave(&srq->lock, flags); > + > + if (attr_mask & IB_SRQ_MAX_WR) { > + /* resize request not yet supported */ > + rv = -EOPNOTSUPP; > + goto out; > + } > + if (attr_mask & IB_SRQ_LIMIT) { > + if (attrs->srq_limit) { > + if (unlikely(attrs->srq_limit > srq->num_rqe)) { > + rv = -EINVAL; > + goto out; > + } > + srq->armed = 1; > + } else > + srq->armed = 0; > + > + srq->limit = attrs->srq_limit; > + } > +out: > + spin_unlock_irqrestore(&srq->lock, flags); > + > + return rv; > +} > + > +/* > + * siw_query_srq() > + * > + * Query SRQ attributes. 
> + */ > +int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs) > +{ > + struct siw_srq *srq = to_siw_srq(base_srq); > + unsigned long flags; > + > + spin_lock_irqsave(&srq->lock, flags); > + > + attrs->max_wr = srq->num_rqe; > + attrs->max_sge = srq->max_sge; > + attrs->srq_limit = srq->limit; > + > + spin_unlock_irqrestore(&srq->lock, flags); > + > + return 0; > +} > + > +/* > + * siw_destroy_srq() > + * > + * Destroy SRQ. > + * It is assumed that the SRQ is not referenced by any > + * QP anymore - the code trusts the RDMA core environment to keep track > + * of QP references. > + */ > +int siw_destroy_srq(struct ib_srq *base_srq) > +{ > + struct siw_srq *srq = to_siw_srq(base_srq); > + struct siw_device *sdev = srq->pd->hdr.sdev; > + > + siw_pd_put(srq->pd); > + > + vfree(srq->recvq); > + kfree(srq); > + > + atomic_dec(&sdev->num_srq); > + > + return 0; > +} > + > +/* > + * siw_post_srq_recv() > + * > + * Post a list of receive queue elements to SRQ. > + * NOTE: The function does not check or lock a certain SRQ state > + * during the post operation. The code simply trusts the > + * RDMA core environment. > + * > + * @base_srq: Base SRQ contained in siw SRQ > + * @wr: List of R-WR's > + * @bad_wr: Updated to failing WR if posting fails. > + */ > +int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, > + const struct ib_recv_wr **bad_wr) > +{ > + struct siw_srq *srq = to_siw_srq(base_srq); > + unsigned long flags; > + int rv = 0; > + > + if (!srq->kernel_verbs) { > + siw_dbg(srq->pd->hdr.sdev, > + "[SRQ 0x%p]: no kernel post_recv for mapped srq\n", > + srq); > + rv = -EINVAL; > + goto out; > + } > + /* > + * Serialize potentially multiple producers. > + * Not needed for single threaded consumer side. > + */ > + spin_lock_irqsave(&srq->lock, flags); > + > + while (wr) { > + u32 idx = srq->rq_put % srq->num_rqe; > + struct siw_rqe *rqe = &srq->recvq[idx]; > + > + if (rqe->flags) { > + siw_dbg(srq->pd->hdr.sdev, "SRQ full\n"); > + rv = -ENOMEM; > + break; > + } > + if (wr->num_sge > srq->max_sge) { > + siw_dbg(srq->pd->hdr.sdev, > + "[SRQ 0x%p]: too many sge's: %d\n", > + srq, wr->num_sge); > + rv = -EINVAL; > + break; > + } > + rqe->id = wr->wr_id; > + rqe->num_sge = wr->num_sge; > + siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); > + > + /* Make sure S-RQE is completely written before valid */ > + smp_wmb(); > + > + rqe->flags = SIW_WQE_VALID; > + > + srq->rq_put++; > + wr = wr->next; > + } > + spin_unlock_irqrestore(&srq->lock, flags); > +out: > + if (unlikely(rv < 0)) { > + siw_dbg(srq->pd->hdr.sdev, "[SRQ 0x%p]: error %d\n", srq, rv); > + *bad_wr = wr; > + } > + return rv; > +} > diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h > new file mode 100644 > index 000000000000..5e108d98280c > --- /dev/null > +++ b/drivers/infiniband/sw/siw/siw_verbs.h > @@ -0,0 +1,114 @@ > +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause > +/* > + * Software iWARP device driver > + * > + * Authors: Bernard Metzler <bmt@xxxxxxxxxxxxxx> > + * > + * Copyright (c) 2008-2019, IBM Corporation > + * > + * This software is available to you under a choice of one of two > + * licenses. 
You may choose to be licensed under the terms of the GNU > + * General Public License (GPL) Version 2, available from the file > + * COPYING in the main directory of this source tree, or the > + * BSD license below: > + * > + * Redistribution and use in source and binary forms, with or > + * without modification, are permitted provided that the following > + * conditions are met: > + * > + * - Redistributions of source code must retain the above copyright notice, > + * this list of conditions and the following disclaimer. > + * > + * - Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * - Neither the name of IBM nor the names of its contributors may be > + * used to endorse or promote products derived from this software without > + * specific prior written permission. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE > + * SOFTWARE. > + */ > + > +#ifndef _SIW_VERBS_H > +#define _SIW_VERBS_H > + > +#include <linux/errno.h> > + > +#include <rdma/iw_cm.h> > +#include <rdma/ib_verbs.h> > +#include <rdma/ib_smi.h> > +#include <rdma/ib_user_verbs.h> > + > +#include "siw.h" > +#include "siw_cm.h" > + > + > +extern struct ib_ucontext *siw_alloc_ucontext(struct ib_device *ibdev, > + struct ib_udata *udata); > +extern int siw_dealloc_ucontext(struct ib_ucontext *ucontext); > +extern int siw_query_port(struct ib_device *ibdev, u8 port, > + struct ib_port_attr *attr); > +extern int siw_get_port_immutable(struct ib_device *ibdev, u8 port, > + struct ib_port_immutable *port_imm); > +extern int siw_query_device(struct ib_device *ibdev, > + struct ib_device_attr *attr, > + struct ib_udata *udata); > +extern struct ib_cq *siw_create_cq(struct ib_device *ibdev, > + const struct ib_cq_init_attr *attr, > + struct ib_ucontext *ucontext, > + struct ib_udata *udata); > +extern int siw_query_port(struct ib_device *ibdev, u8 port, > + struct ib_port_attr *attr); > +extern int siw_query_pkey(struct ib_device *ibdev, u8 port, > + u16 idx, u16 *pkey); > +extern int siw_query_gid(struct ib_device *ibdev, u8 port, int idx, > + union ib_gid *gid); > +extern int siw_alloc_pd(struct ib_pd *base_pd, struct ib_ucontext *context, > + struct ib_udata *udata); > +extern void siw_dealloc_pd(struct ib_pd *pd); > +extern struct ib_qp *siw_create_qp(struct ib_pd *pd, > + struct ib_qp_init_attr *attr, > + struct ib_udata *udata); > +extern int siw_query_qp(struct ib_qp *ofa_qp, struct ib_qp_attr *qp_attr, > + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); > +extern int siw_verbs_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, > + int attr_mask, struct ib_udata *udata); > +extern int siw_destroy_qp(struct ib_qp *ibqp); > +extern int siw_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, > + const struct ib_send_wr **bad_wr); > +extern int siw_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, > + const struct ib_recv_wr **bad_wr); > +extern int siw_destroy_cq(struct ib_cq *ibcq); > 
+extern int siw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); > +extern int siw_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); > +extern struct ib_mr *siw_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, > + u64 rnic_va, int rights, > + struct ib_udata *udata); > +extern struct ib_mr *siw_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, > + u32 max_sge); > +extern struct ib_mr *siw_get_dma_mr(struct ib_pd *ibpd, int rights); > +extern int siw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sl, > + int num_sle, unsigned int *sg_off); > +extern int siw_dereg_mr(struct ib_mr *ibmr); > +extern struct ib_srq *siw_create_srq(struct ib_pd *ibpd, > + struct ib_srq_init_attr *attr, > + struct ib_udata *udata); > +extern int siw_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, > + enum ib_srq_attr_mask mask, struct ib_udata *udata); > +extern int siw_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); > +extern int siw_destroy_srq(struct ib_srq *ibsrq); > +extern int siw_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, > + const struct ib_recv_wr **bad_wr); > +extern int siw_mmap(struct ib_ucontext *ibctx, struct vm_area_struct *vma);

Not a big deal, but most of the functions here use different argument names than the ones in the C file.

> + > +extern const struct dma_map_ops siw_dma_generic_ops; > + > +#endif > diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h > index 06c34d99be85..a5cb2af9b829 100644 > --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h > +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h > @@ -102,6 +102,7 @@ enum rdma_driver_id { > RDMA_DRIVER_RXE, > RDMA_DRIVER_HFI1, > RDMA_DRIVER_QIB, > + RDMA_DRIVER_SIW > }; > > #endif > diff --git a/include/uapi/rdma/siw_user.h b/include/uapi/rdma/siw_user.h > new file mode 100644 > index 000000000000..6300a10e809d > --- /dev/null > +++ b/include/uapi/rdma/siw_user.h > @@ -0,0 +1,223 @@ > +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause > +/* > + * Software iWARP device driver for Linux > + * > + * Authors: Bernard Metzler <bmt@xxxxxxxxxxxxxx> > + * > + * Copyright (c) 2008-2017, IBM Corporation > + * > + * This software is available to you under a choice of one of two > + * licenses. You may choose to be licensed under the terms of the GNU > + * General Public License (GPL) Version 2, available from the file > + * COPYING in the main directory of this source tree, or the > + * BSD license below: > + * > + * Redistribution and use in source and binary forms, with or > + * without modification, are permitted provided that the following > + * conditions are met: > + * > + * - Redistributions of source code must retain the above copyright notice, > + * this list of conditions and the following disclaimer. > + * > + * - Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * - Neither the name of IBM nor the names of its contributors may be > + * used to endorse or promote products derived from this software without > + * specific prior written permission. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND > + * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE > + * SOFTWARE. > + */ > + > +#ifndef _SIW_USER_H > +#define _SIW_USER_H > + > +#include <linux/types.h> > + > +/*Common string that is matched to accept the device by the user library*/ > +#define SIW_NODE_DESC_COMMON "Software iWARP stack" > + > +#define SIW_IBDEV_PREFIX "siw_" > + > +#define VERSION_ID_SOFTIWARP 2 > + > +#define SIW_MAX_SGE 6 > +#define SIW_MAX_UOBJ_KEY 0xffffff > +#define SIW_INVAL_UOBJ_KEY (SIW_MAX_UOBJ_KEY + 1) > + > +struct siw_uresp_create_cq { > + __u32 cq_id; > + __u32 num_cqe; > + __u32 cq_key; > + __u32 pad; > +}; > + > +struct siw_uresp_create_qp { > + __u32 qp_id; > + __u32 num_sqe; > + __u32 num_rqe; > + __u32 sq_key; > + __u32 rq_key; > + __u32 pad; > +}; > + > +struct siw_ureq_reg_mr { > + __u8 stag_key; > + __u8 reserved[3]; > + __u32 pad; > +}; > + > +struct siw_uresp_reg_mr { > + __u32 stag; > + __u32 pad; > +}; > + > +struct siw_uresp_create_srq { > + __u32 num_rqe; > + __u32 srq_key; > +}; > + > +struct siw_uresp_alloc_ctx { > + __u32 dev_id; > + __u32 pad; > +}; > + > +enum siw_opcode { > + SIW_OP_WRITE = 0, > + SIW_OP_READ = 1, > + SIW_OP_READ_LOCAL_INV = 2, > + SIW_OP_SEND = 3, > + SIW_OP_SEND_WITH_IMM = 4, > + SIW_OP_SEND_REMOTE_INV = 5, > + > + /* Unsupported */ > + SIW_OP_FETCH_AND_ADD = 6, > + SIW_OP_COMP_AND_SWAP = 7, > + > + SIW_OP_RECEIVE = 8, > + /* provider internal SQE */ > + SIW_OP_READ_RESPONSE = 9, > + /* > + * below opcodes valid for > + * in-kernel clients only > + */ > + SIW_OP_INVAL_STAG = 10, > + SIW_OP_REG_MR = 11, > + SIW_NUM_OPCODES = 12 > +}; > + > +/* Keep it same as ibv_sge to allow for memcpy */ > +struct siw_sge { > + __aligned_u64 laddr; > + __u32 length; > + __u32 lkey; > +}; > + > +/* > + * Inline data are kept within the work request itself occupying > + * the space of sge[1] .. sge[n]. Therefore, inline data cannot be > + * supported if SIW_MAX_SGE is below 2 elements. 
> + */ > +#define SIW_MAX_INLINE (sizeof(struct siw_sge) * (SIW_MAX_SGE - 1)) > + > +#if SIW_MAX_SGE < 2 > +#error "SIW_MAX_SGE must be at least 2" > +#endif > + > +enum siw_wqe_flags { > + SIW_WQE_VALID = 1, > + SIW_WQE_INLINE = (1 << 1), > + SIW_WQE_SIGNALLED = (1 << 2), > + SIW_WQE_SOLICITED = (1 << 3), > + SIW_WQE_READ_FENCE = (1 << 4), > + SIW_WQE_COMPLETED = (1 << 5) > +}; > + > +/* Send Queue Element */ > +struct siw_sqe { > + __aligned_u64 id; > + __u16 flags; > + __u8 num_sge; > + /* Contains enum siw_opcode values */ > + __u8 opcode; > + __u32 rkey; > + union { > + __aligned_u64 raddr; > + __aligned_u64 base_mr; > + }; > + union { > + struct siw_sge sge[SIW_MAX_SGE]; > + __aligned_u64 access; > + }; > +}; > + > +/* Receive Queue Element */ > +struct siw_rqe { > + __aligned_u64 id; > + __u16 flags; > + __u8 num_sge; > + /* > + * only used by kernel driver, > + * ignored if set by user > + */ > + __u8 opcode; > + __u32 unused; > + struct siw_sge sge[SIW_MAX_SGE]; > +}; > + > +enum siw_notify_flags { > + SIW_NOTIFY_NOT = (0), > + SIW_NOTIFY_SOLICITED = (1 << 0), > + SIW_NOTIFY_NEXT_COMPLETION = (1 << 1), > + SIW_NOTIFY_MISSED_EVENTS = (1 << 2), > + SIW_NOTIFY_ALL = SIW_NOTIFY_SOLICITED | > + SIW_NOTIFY_NEXT_COMPLETION | > + SIW_NOTIFY_MISSED_EVENTS > +}; > + > +enum siw_wc_status { > + SIW_WC_SUCCESS = 0, > + SIW_WC_LOC_LEN_ERR = 1, > + SIW_WC_LOC_PROT_ERR = 2, > + SIW_WC_LOC_QP_OP_ERR = 3, > + SIW_WC_WR_FLUSH_ERR = 4, > + SIW_WC_BAD_RESP_ERR = 5, > + SIW_WC_LOC_ACCESS_ERR = 6, > + SIW_WC_REM_ACCESS_ERR = 7, > + SIW_WC_REM_INV_REQ_ERR = 8, > + SIW_WC_GENERAL_ERR = 9, > + SIW_NUM_WC_STATUS = 10 > +}; > + > +struct siw_qp; > + > +struct siw_cqe { > + __aligned_u64 id; > + __u8 flags; > + __u8 opcode; > + __u16 status; > + __u32 bytes; > + __aligned_u64 imm_data; > + /* QP number or QP pointer */ > + union { > + struct siw_qp *qp; > + __aligned_u64 qp_id; > + }; > +}; > + > +/* > + * Shared structure between user and kernel > + * to control CQ arming. > + */ > +struct siw_cq_ctrl { > + __aligned_u64 notify; > +}; > +#endif > Reviewed-by: Gal Pressman <galpress@xxxxxxxxxx>
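
A few more notes below, none of them blocking; mostly small sketches I wrote to double-check my reading of the queue handling, so feel free to ignore.

The SRQ ring in siw_post_srq_recv() pairs an smp_wmb() with setting SIW_WQE_VALID, so I assume the reaping side issues the matching read barrier before it trusts the RQE contents. A minimal consumer-side sketch of how I read that protocol (siw_srq_fetch_rqe() and the rq_get index are hypothetical, not part of this patch; the real consumer presumably lives in the RX path patches):

/*
 * Hypothetical consumer counterpart to siw_post_srq_recv(),
 * relying only on the siw_srq/siw_rqe fields used above.
 */
static struct siw_rqe *siw_srq_fetch_rqe(struct siw_srq *srq, u32 *rq_get)
{
	struct siw_rqe *rqe = &srq->recvq[*rq_get % srq->num_rqe];

	/* Check the valid flag first ... */
	if (!(READ_ONCE(rqe->flags) & SIW_WQE_VALID))
		return NULL;

	/* ... and only then read the RQE body, pairing with the smp_wmb() */
	smp_rmb();

	(*rq_get)++;
	return rqe;
}

Once the RQE has been consumed I'd expect rqe->flags to be cleared again so the producer can reuse the slot, which seems to be what the 'if (rqe->flags)' full-queue check in siw_post_srq_recv() relies on.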
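
For the SRQ verbs declared in siw_verbs.h, here is roughly how I expect an in-kernel consumer to reach them through the RDMA core once the device is registered (a sketch only; error unwinding and ib_destroy_srq() omitted, attribute values arbitrary):

#include <linux/err.h>
#include <rdma/ib_verbs.h>

/*
 * The core routes ib_create_srq()/ib_post_srq_recv() into the
 * provider's siw_create_srq()/siw_post_srq_recv() entry points.
 */
static int example_srq_setup(struct ib_pd *pd, u64 buf_addr, u32 lkey)
{
	struct ib_srq_init_attr init_attr = {
		.attr = { .max_wr = 256, .max_sge = 1 },
	};
	struct ib_sge sge = {
		.addr = buf_addr, .length = 4096, .lkey = lkey,
	};
	struct ib_recv_wr wr = { .wr_id = 1, .sg_list = &sge, .num_sge = 1 };
	const struct ib_recv_wr *bad_wr;
	struct ib_srq *srq;

	srq = ib_create_srq(pd, &init_attr);
	if (IS_ERR(srq))
		return PTR_ERR(srq);

	return ib_post_srq_recv(srq, &wr, &bad_wr);
}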
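
On the inline data layout in siw_user.h: if I read the comment right, the payload reuses the space of sge[1]..sge[n], which is where SIW_MAX_INLINE comes from (5 * 16 bytes with SIW_MAX_SGE == 6). A user-space sketch of how a provider could fill such an SQE; where exactly the byte count is carried (sge[0].length here) is my assumption, since the consuming side is in later patches:

#include <string.h>
#include <rdma/siw_user.h>	/* assuming the uapi header gets installed */

/* Hypothetical helper: place an inline payload into a siw_sqe. */
static int siw_sqe_set_inline(struct siw_sqe *sqe, const void *buf, __u32 len)
{
	if (len > SIW_MAX_INLINE)
		return -1;

	memcpy(&sqe->sge[1], buf, len);	/* data overlays sge[1]..sge[n] */
	sqe->sge[0].length = len;	/* assumed location of the length */
	sqe->num_sge = 1;
	sqe->flags |= SIW_WQE_INLINE;
	/* caller still sets id/opcode and flips SIW_WQE_VALID last */

	return 0;
}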
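
Last one, on struct siw_cq_ctrl: my understanding is that CQ (re-)arming works by writing one of the SIW_NOTIFY_* values into that shared word, which the completion path then inspects. Purely under that assumption (the cq->notify pointer and the to_siw_cq() helper are not shown in this patch), the kernel side of req_notify_cq could look roughly like:

/*
 * Sketch only; assumes struct siw_cq carries a pointer to the
 * shared siw_cq_ctrl as 'notify'.
 */
static int example_req_notify_cq(struct ib_cq *base_cq,
				 enum ib_cq_notify_flags flags)
{
	struct siw_cq *cq = to_siw_cq(base_cq);

	if (flags & IB_CQ_SOLICITED)
		WRITE_ONCE(cq->notify->notify, SIW_NOTIFY_SOLICITED);
	else
		WRITE_ONCE(cq->notify->notify, SIW_NOTIFY_NEXT_COMPLETION);

	return 0;
}

If that matches the intent, a one-line comment in siw_user.h saying which side writes and which side clears the notify word would be helpful.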