On Tue, Feb 19, 2019 at 11:08:56AM +0100, Bernard Metzler wrote: > +/* > + * siw_create_qp() > + * > + * Create QP of requested size on given device. > + * > + * @base_pd: Base PD contained in siw PD > + * @attrs: Initial QP attributes. > + * @udata: used to provide QP ID, SQ and RQ size back to user. > + */ > + > +struct ib_qp *siw_create_qp(struct ib_pd *base_pd, > + struct ib_qp_init_attr *attrs, > + struct ib_udata *udata) > +{ > + struct siw_qp *qp = NULL; > + struct siw_pd *pd = to_siw_pd(base_pd); > + struct ib_device *base_dev = base_pd->device; > + struct siw_device *sdev = to_siw_dev(base_dev); > + struct siw_cq *scq = NULL, *rcq = NULL; > + > + unsigned long flags; > + int num_sqe, num_rqe, rv = 0; > + > + siw_dbg(sdev, "create new qp\n"); > + > + if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) { > + siw_dbg(sdev, "too many qp's\n"); > + rv = -ENOMEM; > + goto err_out; > + } > + if (attrs->qp_type != IB_QPT_RC) { > + siw_dbg(sdev, "only rc qp's supported\n"); > + rv = -EINVAL; > + goto err_out; > + } > + if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) || > + (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) || > + (attrs->cap.max_send_sge > SIW_MAX_SGE) || > + (attrs->cap.max_recv_sge > SIW_MAX_SGE)) { > + siw_dbg(sdev, "qp size error\n"); > + rv = -EINVAL; > + goto err_out; > + } > + if (attrs->cap.max_inline_data > SIW_MAX_INLINE) { > + siw_dbg(sdev, "max inline send: %d > %d\n", > + attrs->cap.max_inline_data, (int)SIW_MAX_INLINE); > + rv = -EINVAL; > + goto err_out; > + } > + /* > + * NOTE: we allow for zero element SQ and RQ WQE's SGL's > + * but not for a QP unable to hold any WQE (SQ + RQ) > + */ > + if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) { > + siw_dbg(sdev, "qp must have send or receive queue\n"); > + rv = -EINVAL; > + goto err_out; > + } > + > + scq = siw_cq_id2obj(sdev, ((struct siw_cq *)attrs->send_cq)->hdr.id); > + rcq = siw_cq_id2obj(sdev, ((struct siw_cq *)attrs->recv_cq)->hdr.id); > + > + if (!scq || (!rcq && !attrs->srq)) { > + 
siw_dbg(sdev, "send cq or receive cq invalid\n"); > + rv = -EINVAL; > + goto err_out; > + } > + qp = kzalloc(sizeof(*qp), GFP_KERNEL); > + if (!qp) { > + rv = -ENOMEM; > + goto err_out; > + } > + > + init_rwsem(&qp->state_lock); > + spin_lock_init(&qp->sq_lock); > + spin_lock_init(&qp->rq_lock); > + spin_lock_init(&qp->orq_lock); > + > + if (!base_pd->uobject) > + qp->kernel_verbs = 1; New drivers should not have the word 'uobject' in them. This is '!udata' > + rv = siw_qp_add(sdev, qp); > + if (rv) > + goto err_out; > + > + num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr); > + num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr); > + > + if (qp->kernel_verbs) > + qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe)); > + else > + qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe)); > + > + if (qp->sendq == NULL) { > + siw_dbg_qp(qp, "send queue size %d alloc failed\n", num_sqe); > + rv = -ENOMEM; > + goto err_out_idr; > + } > + if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) { > + if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) > + qp->attrs.flags |= SIW_SIGNAL_ALL_WR; > + else { > + rv = -EINVAL; > + goto err_out_idr; > + } > + } > + qp->pd = pd; > + qp->scq = scq; > + qp->rcq = rcq; > + > + if (attrs->srq) { > + /* > + * SRQ support. 
> + * Verbs 6.3.7: ignore RQ size, if SRQ present > + * Verbs 6.3.5: do not check PD of SRQ against PD of QP > + */ > + qp->srq = to_siw_srq(attrs->srq); > + qp->attrs.rq_size = 0; > + siw_dbg_qp(qp, "[SRQ 0x%p] attached\n", qp->srq); > + } else if (num_rqe) { > + if (qp->kernel_verbs) > + qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe)); > + else > + qp->recvq = vmalloc_user(num_rqe * > + sizeof(struct siw_rqe)); > + > + if (qp->recvq == NULL) { > + siw_dbg_qp(qp, "recv queue size %d alloc failed\n", > + num_rqe); > + rv = -ENOMEM; > + goto err_out_idr; > + } > + > + qp->attrs.rq_size = num_rqe; > + } > + qp->attrs.sq_size = num_sqe; > + qp->attrs.sq_max_sges = attrs->cap.max_send_sge; > + qp->attrs.rq_max_sges = attrs->cap.max_recv_sge; > + > + /* Make those two tunables fixed for now. */ > + qp->tx_ctx.gso_seg_limit = gso_seg_limit; > + qp->tx_ctx.zcopy_tx = zcopy_tx; > + > + qp->attrs.state = SIW_QP_STATE_IDLE; > + > + if (udata) { > + struct siw_uresp_create_qp uresp; > + struct siw_ucontext *ctx; > + > + memset(&uresp, 0, sizeof(uresp)); > + ctx = to_siw_ctx(base_pd->uobject->context); Here too, this is rdma_udata_to_drv_context() > + uresp.sq_key = uresp.rq_key = SIW_INVAL_UOBJ_KEY; > + uresp.num_sqe = num_sqe; > + uresp.num_rqe = num_rqe; > + uresp.qp_id = QP_ID(qp); > + > + if (qp->sendq) { > + uresp.sq_key = siw_insert_uobj(ctx, qp->sendq, > + num_sqe * sizeof(struct siw_sqe)); > + if (uresp.sq_key > SIW_MAX_UOBJ_KEY) > + siw_dbg_qp(qp, "preparing mmap sq failed\n"); > + } > + if (qp->recvq) { > + uresp.rq_key = siw_insert_uobj(ctx, qp->recvq, > + num_rqe * sizeof(struct siw_rqe)); > + if (uresp.rq_key > SIW_MAX_UOBJ_KEY) > + siw_dbg_qp(qp, "preparing mmap rq failed\n"); > + } > + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); > + if (rv) > + goto err_out_idr; > + } > + qp->tx_cpu = siw_get_tx_cpu(sdev); > + if (qp->tx_cpu < 0) { > + rv = -EINVAL; > + goto err_out_idr; > + } > + qp->base_qp.qp_num = QP_ID(qp); > + > + siw_pd_get(pd); > + > + 
INIT_LIST_HEAD(&qp->devq); > + spin_lock_irqsave(&sdev->lock, flags); > + list_add_tail(&qp->devq, &sdev->qp_list); > + spin_unlock_irqrestore(&sdev->lock, flags); > + > + return &qp->base_qp; > + > +err_out_idr: > + siw_remove_obj(&sdev->lock, &sdev->qp_idr, &qp->hdr); > +err_out: > + if (scq) > + siw_cq_put(scq); > + if (rcq) > + siw_cq_put(rcq); > + > + if (qp) { > + if (qp->sendq) > + vfree(qp->sendq); > + if (qp->recvq) > + vfree(qp->recvq); > + kfree(qp); > + } > + atomic_dec(&sdev->num_qp); > + > + return ERR_PTR(rv); > +} > + > +/* > + * Minimum siw_query_qp() verb interface. > + * > + * @qp_attr_mask is not used but all available information is provided > + */ > +int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, > + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) > +{ > + struct siw_qp *qp; > + struct siw_device *sdev; > + > + if (base_qp && qp_attr && qp_init_attr) { > + qp = to_siw_qp(base_qp); > + sdev = to_siw_dev(base_qp->device); > + } else > + return -EINVAL; > + > + qp_attr->cap.max_inline_data = SIW_MAX_INLINE; > + qp_attr->cap.max_send_wr = qp->attrs.sq_size; > + qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; > + qp_attr->cap.max_recv_wr = qp->attrs.rq_size; > + qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; > + qp_attr->path_mtu = siw_mtu_net2base(sdev->netdev->mtu); > + qp_attr->max_rd_atomic = qp->attrs.irq_size; > + qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; > + > + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | > + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; > + > + qp_init_attr->qp_type = base_qp->qp_type; > + qp_init_attr->send_cq = base_qp->send_cq; > + qp_init_attr->recv_cq = base_qp->recv_cq; > + qp_init_attr->srq = base_qp->srq; > + > + qp_init_attr->cap = qp_attr->cap; > + > + return 0; > +} > + > +int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, > + int attr_mask, struct ib_udata *udata) > +{ > + struct siw_qp_attrs new_attrs; > + enum siw_qp_attr_mask 
siw_attr_mask = 0; > + struct siw_qp *qp = to_siw_qp(base_qp); > + int rv = 0; > + > + if (!attr_mask) > + return 0; > + > + memset(&new_attrs, 0, sizeof(new_attrs)); > + > + if (attr_mask & IB_QP_ACCESS_FLAGS) { > + > + siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS; > + > + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) > + new_attrs.flags |= SIW_RDMA_READ_ENABLED; > + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) > + new_attrs.flags |= SIW_RDMA_WRITE_ENABLED; > + if (attr->qp_access_flags & IB_ACCESS_MW_BIND) > + new_attrs.flags |= SIW_RDMA_BIND_ENABLED; > + } > + if (attr_mask & IB_QP_STATE) { > + siw_dbg_qp(qp, "desired ib qp state: %s\n", > + ib_qp_state_to_string[attr->qp_state]); > + > + new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state]; > + > + if (new_attrs.state > SIW_QP_STATE_RTS) > + qp->tx_ctx.tx_suspend = 1; > + > + siw_attr_mask |= SIW_QP_ATTR_STATE; > + } > + if (!attr_mask) > + goto out; > + > + down_write(&qp->state_lock); > + > + rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask); > + > + up_write(&qp->state_lock); > +out: > + return rv; > +} > + > +int siw_destroy_qp(struct ib_qp *base_qp) > +{ > + struct siw_qp *qp = to_siw_qp(base_qp); > + struct siw_qp_attrs qp_attrs; > + > + siw_dbg_qp(qp, "state %d, cep 0x%p\n", qp->attrs.state, qp->cep); > + > + /* > + * Mark QP as in process of destruction to prevent from > + * any async callbacks to RDMA core > + */ > + qp->attrs.flags |= SIW_QP_IN_DESTROY; > + qp->rx_ctx.rx_suspend = 1; > + > + down_write(&qp->state_lock); > + > + qp_attrs.state = SIW_QP_STATE_ERROR; > + (void)siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE); > + > + if (qp->cep) { > + siw_cep_put(qp->cep); > + qp->cep = NULL; > + } > + > + up_write(&qp->state_lock); > + > + kfree(qp->rx_ctx.mpa_crc_hd); > + kfree(qp->tx_ctx.mpa_crc_hd); > + > + /* Drop references */ > + siw_cq_put(qp->scq); > + siw_cq_put(qp->rcq); > + siw_pd_put(qp->pd); > + qp->scq = qp->rcq = NULL; > + > + siw_qp_put(qp); > + > + return 0; > +} > + > 
+/* > + * siw_copy_sgl() > + * > + * Copy SGL from RDMA core representation to local > + * representation. > + */ > +static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge *siw_sge, > + int num_sge) > +{ > + while (num_sge--) { > + siw_sge->laddr = sge->addr; > + siw_sge->length = sge->length; > + siw_sge->lkey = sge->lkey; > + > + siw_sge++; sge++; > + } > +} > + > +/* > + * siw_copy_inline_sgl() > + * > + * Prepare sgl of inlined data for sending. For userland callers > + * function checks if given buffer addresses and len's are within > + * process context bounds. > + * Data from all provided sge's are copied together into the wqe, > + * referenced by a single sge. > + */ > +static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, > + struct siw_sqe *sqe) > +{ > + struct ib_sge *core_sge = core_wr->sg_list; > + void *kbuf = &sqe->sge[1]; > + int num_sge = core_wr->num_sge, > + bytes = 0; > + > + sqe->sge[0].laddr = (u64)kbuf; > + sqe->sge[0].lkey = 0; > + > + while (num_sge--) { > + if (!core_sge->length) { > + core_sge++; > + continue; > + } > + bytes += core_sge->length; > + if (bytes > SIW_MAX_INLINE) { > + bytes = -EINVAL; > + break; > + } > + memcpy(kbuf, (void *)(uintptr_t)core_sge->addr, > + core_sge->length); > + > + kbuf += core_sge->length; > + core_sge++; > + } > + sqe->sge[0].length = bytes > 0 ? bytes : 0; > + sqe->num_sge = bytes > 0 ? 1 : 0; > + > + return bytes; > +} > + > +/* > + * siw_post_send() > + * > + * Post a list of S-WR's to a SQ. > + * > + * @base_qp: Base QP contained in siw QP > + * @wr: Null terminated list of user WR's > + * @bad_wr: Points to failing WR in case of synchronous failure. 
> + */ > +int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, > + const struct ib_send_wr **bad_wr) > +{ > + struct siw_qp *qp = to_siw_qp(base_qp); > + struct siw_wqe *wqe = tx_wqe(qp); > + > + unsigned long flags; > + int rv = 0; > + > + siw_dbg_qp(qp, "state %d\n", qp->attrs.state); > + > + /* > + * Try to acquire QP state lock. Must be non-blocking > + * to accommodate kernel clients needs. > + */ > + if (!down_read_trylock(&qp->state_lock)) { > + *bad_wr = wr; > + return -ENOTCONN; > + } > + > + if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { > + up_read(&qp->state_lock); > + *bad_wr = wr; > + return -ENOTCONN; > + } > + if (wr && !qp->kernel_verbs) { > + siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); > + up_read(&qp->state_lock); > + *bad_wr = wr; > + return -EINVAL; > + } > + > + spin_lock_irqsave(&qp->sq_lock, flags); > + > + while (wr) { > + u32 idx = qp->sq_put % qp->attrs.sq_size; > + struct siw_sqe *sqe = &qp->sendq[idx]; > + > + if (sqe->flags) { > + siw_dbg_qp(qp, "sq full\n"); > + rv = -ENOMEM; > + break; > + } > + if (wr->num_sge > qp->attrs.sq_max_sges) { > + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); > + rv = -EINVAL; > + break; > + } > + sqe->id = wr->wr_id; > + > + if ((wr->send_flags & IB_SEND_SIGNALED) || > + (qp->attrs.flags & SIW_SIGNAL_ALL_WR)) > + sqe->flags |= SIW_WQE_SIGNALLED; > + > + if (wr->send_flags & IB_SEND_FENCE) > + sqe->flags |= SIW_WQE_READ_FENCE; > + > + switch (wr->opcode) { > + > + case IB_WR_SEND: > + case IB_WR_SEND_WITH_INV: > + if (wr->send_flags & IB_SEND_SOLICITED) > + sqe->flags |= SIW_WQE_SOLICITED; > + > + if (!(wr->send_flags & IB_SEND_INLINE)) { > + siw_copy_sgl(wr->sg_list, sqe->sge, > + wr->num_sge); > + sqe->num_sge = wr->num_sge; > + } else { > + rv = siw_copy_inline_sgl(wr, sqe); > + if (rv <= 0) { > + rv = -EINVAL; > + break; > + } > + sqe->flags |= SIW_WQE_INLINE; > + sqe->num_sge = 1; > + } > + if (wr->opcode == IB_WR_SEND) > + sqe->opcode = SIW_OP_SEND; > + else 
{ > + sqe->opcode = SIW_OP_SEND_REMOTE_INV; > + sqe->rkey = wr->ex.invalidate_rkey; > + } > + break; > + > + case IB_WR_RDMA_READ_WITH_INV: > + case IB_WR_RDMA_READ: > + /* > + * OFED WR restricts RREAD sink to SGL containing > + * 1 SGE only. we could relax to SGL with multiple > + * elements referring the SAME ltag or even sending > + * a private per-rreq tag referring to a checked > + * local sgl with MULTIPLE ltag's. would be easy > + * to do... > + */ > + if (unlikely(wr->num_sge != 1)) { > + rv = -EINVAL; > + break; > + } > + siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1); > + /* > + * NOTE: zero length RREAD is allowed! > + */ > + sqe->raddr = rdma_wr(wr)->remote_addr; > + sqe->rkey = rdma_wr(wr)->rkey; > + sqe->num_sge = 1; > + > + if (wr->opcode == IB_WR_RDMA_READ) > + sqe->opcode = SIW_OP_READ; > + else > + sqe->opcode = SIW_OP_READ_LOCAL_INV; > + break; > + > + case IB_WR_RDMA_WRITE: > + if (!(wr->send_flags & IB_SEND_INLINE)) { > + siw_copy_sgl(wr->sg_list, &sqe->sge[0], > + wr->num_sge); > + sqe->num_sge = wr->num_sge; > + } else { > + rv = siw_copy_inline_sgl(wr, sqe); > + if (unlikely(rv < 0)) { > + rv = -EINVAL; > + break; > + } > + sqe->flags |= SIW_WQE_INLINE; > + sqe->num_sge = 1; > + } > + sqe->raddr = rdma_wr(wr)->remote_addr; > + sqe->rkey = rdma_wr(wr)->rkey; > + sqe->opcode = SIW_OP_WRITE; > + > + break; > + > + case IB_WR_REG_MR: > + sqe->base_mr = (uint64_t)reg_wr(wr)->mr; > + sqe->rkey = reg_wr(wr)->key; > + sqe->access = SIW_MEM_LREAD; > + if (reg_wr(wr)->access & IB_ACCESS_LOCAL_WRITE) > + sqe->access |= SIW_MEM_LWRITE; > + if (reg_wr(wr)->access & IB_ACCESS_REMOTE_WRITE) > + sqe->access |= SIW_MEM_RWRITE; > + if (reg_wr(wr)->access & IB_ACCESS_REMOTE_READ) > + sqe->access |= SIW_MEM_RREAD; > + sqe->opcode = SIW_OP_REG_MR; > + > + break; > + > + case IB_WR_LOCAL_INV: > + sqe->rkey = wr->ex.invalidate_rkey; > + sqe->opcode = SIW_OP_INVAL_STAG; > + > + break; > + > + default: > + siw_dbg_qp(qp, "ib wr type %d unsupported\n", > + wr->opcode); 
> + rv = -EINVAL; > + break; > + } > + siw_dbg_qp(qp, "opcode %d, flags 0x%x\n", > + sqe->opcode, sqe->flags); > + > + if (unlikely(rv < 0)) > + break; > + > + /* make SQE only vaild after completely written */ > + smp_wmb(); > + sqe->flags |= SIW_WQE_VALID; > + > + qp->sq_put++; > + wr = wr->next; > + } > + > + /* > + * Send directly if SQ processing is not in progress. > + * Eventual immediate errors (rv < 0) do not affect the involved > + * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ > + * processing, if new work is already pending. But rv must be passed > + * to caller. > + */ > + if (wqe->wr_status != SIW_WR_IDLE) { > + spin_unlock_irqrestore(&qp->sq_lock, flags); > + goto skip_direct_sending; > + } > + rv = siw_activate_tx(qp); > + spin_unlock_irqrestore(&qp->sq_lock, flags); > + > + if (rv <= 0) > + goto skip_direct_sending; > + > + if (qp->kernel_verbs) { > + rv = siw_sq_start(qp); > + } else { > + qp->tx_ctx.in_syscall = 1; > + > + if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend)) > + siw_qp_cm_drop(qp, 0); > + > + qp->tx_ctx.in_syscall = 0; > + } > +skip_direct_sending: > + > + up_read(&qp->state_lock); > + > + if (rv >= 0) > + return 0; > + /* > + * Immediate error > + */ > + siw_dbg_qp(qp, "error %d\n", rv); > + > + *bad_wr = wr; > + return rv; > +} > + > +/* > + * siw_post_receive() > + * > + * Post a list of R-WR's to a RQ. > + * > + * @base_qp: Base QP contained in siw QP > + * @wr: Null terminated list of user WR's > + * @bad_wr: Points to failing WR in case of synchronous failure. > + */ > +int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, > + const struct ib_recv_wr **bad_wr) > +{ > + struct siw_qp *qp = to_siw_qp(base_qp); > + unsigned long flags; > + int rv = 0; > + > + if (qp->srq) { > + *bad_wr = wr; > + return -EOPNOTSUPP; /* what else from errno.h? */ > + } > + /* > + * Try to acquire QP state lock. Must be non-blocking > + * to accommodate kernel clients needs. 
> + */ > + if (!down_read_trylock(&qp->state_lock)) { > + *bad_wr = wr; > + return -ENOTCONN; > + } > + if (!qp->kernel_verbs) { > + siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n"); > + up_read(&qp->state_lock); > + *bad_wr = wr; > + return -EINVAL; > + } > + if (qp->attrs.state > SIW_QP_STATE_RTS) { > + up_read(&qp->state_lock); > + *bad_wr = wr; > + return -EINVAL; > + } > + /* > + * Serialize potentially multiple producers. > + * Not needed for single threaded consumer side. > + */ > + spin_lock_irqsave(&qp->rq_lock, flags); > + > + while (wr) { > + u32 idx = qp->rq_put % qp->attrs.rq_size; > + struct siw_rqe *rqe = &qp->recvq[idx]; > + > + if (rqe->flags) { > + siw_dbg_qp(qp, "Receive Queue full\n"); > + rv = -ENOMEM; > + break; > + } > + if (wr->num_sge > qp->attrs.rq_max_sges) { > + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); > + rv = -EINVAL; > + break; > + } > + rqe->id = wr->wr_id; > + rqe->num_sge = wr->num_sge; > + siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); > + > + /* make sure RQE is completely written before valid */ > + smp_wmb(); > + > + rqe->flags = SIW_WQE_VALID; > + > + qp->rq_put++; > + wr = wr->next; > + } > + spin_unlock_irqrestore(&qp->rq_lock, flags); > + > + up_read(&qp->state_lock); > + > + if (rv < 0) { > + siw_dbg_qp(qp, "error %d\n", rv); > + *bad_wr = wr; > + } > + return rv > 0 ? 0 : rv; > +} > + > +int siw_destroy_cq(struct ib_cq *base_cq) > +{ > + struct siw_cq *cq = to_siw_cq(base_cq); > + struct ib_device *base_dev = base_cq->device; > + struct siw_device *sdev = to_siw_dev(base_dev); > + > + siw_cq_flush(cq); > + > + siw_remove_obj(&sdev->lock, &sdev->cq_idr, &cq->hdr); > + siw_cq_put(cq); > + > + return 0; > +} > + > +/* > + * siw_create_cq() > + * > + * Create CQ of requested size on given device. > + * > + * @base_dev: RDMA device contained in siw device > + * @size: maximum number of CQE's allowed. > + * @ib_context: user context. > + * @udata: used to provide CQ ID back to user. 
> + */ > + > +struct ib_cq *siw_create_cq(struct ib_device *base_dev, > + const struct ib_cq_init_attr *attr, > + struct ib_ucontext *ib_context, > + struct ib_udata *udata) > +{ > + struct siw_cq *cq = NULL; > + struct siw_device *sdev = to_siw_dev(base_dev); > + struct siw_uresp_create_cq uresp; > + int rv, size = attr->cqe; > + > + if (!base_dev) { > + rv = -ENODEV; > + goto err_out; > + } > + if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) { > + siw_dbg(sdev, "too many cq's\n"); > + rv = -ENOMEM; > + goto err_out; > + } > + if (size < 1 || size > sdev->attrs.max_cqe) { > + siw_dbg(sdev, "cq size error: %d\n", size); > + rv = -EINVAL; > + goto err_out; > + } > + cq = kzalloc(sizeof(*cq), GFP_KERNEL); > + if (!cq) { > + rv = -ENOMEM; > + goto err_out; > + } > + size = roundup_pow_of_two(size); > + cq->base_cq.cqe = size; > + cq->num_cqe = size; > + > + if (!ib_context) { > + cq->kernel_verbs = 1; > + cq->queue = vzalloc(size * sizeof(struct siw_cqe) > + + sizeof(struct siw_cq_ctrl)); > + } else { > + cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) > + + sizeof(struct siw_cq_ctrl)); > + } > + if (cq->queue == NULL) { > + rv = -ENOMEM; > + goto err_out; > + } > + > + rv = siw_cq_add(sdev, cq); > + if (rv) > + goto err_out; > + > + spin_lock_init(&cq->lock); > + > + cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify; > + > + if (!cq->kernel_verbs) { > + struct siw_ucontext *ctx = to_siw_ctx(ib_context); > + > + uresp.cq_key = siw_insert_uobj(ctx, cq->queue, > + size * sizeof(struct siw_cqe) + > + sizeof(struct siw_cq_ctrl)); > + > + if (uresp.cq_key > SIW_MAX_UOBJ_KEY) > + siw_dbg(sdev, "[CQ %d]: preparing mmap failed\n", > + OBJ_ID(cq)); > + > + uresp.cq_id = OBJ_ID(cq); > + uresp.num_cqe = size; > + > + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); > + if (rv) > + goto err_out_idr; > + } > + return &cq->base_cq; > + > +err_out_idr: > + siw_remove_obj(&sdev->lock, &sdev->cq_idr, &cq->hdr); > +err_out: > + siw_dbg(sdev, "cq creation 
failed: %d", rv); > + > + if (cq && cq->queue) > + vfree(cq->queue); > + > + kfree(cq); > + atomic_dec(&sdev->num_cq); > + > + return ERR_PTR(rv); > +} > + > +/* > + * siw_poll_cq() > + * > + * Reap CQ entries if available and copy work completion status into > + * array of WC's provided by caller. Returns number of reaped CQE's. > + * > + * @base_cq: Base CQ contained in siw CQ. > + * @num_cqe: Maximum number of CQE's to reap. > + * @wc: Array of work completions to be filled by siw. > + */ > +int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc) > +{ > + struct siw_cq *cq = to_siw_cq(base_cq); > + int i; > + > + for (i = 0; i < num_cqe; i++) { > + if (!(siw_reap_cqe(cq, wc))) > + break; > + wc++; > + } > + return i; > +} > + > +/* > + * siw_req_notify_cq() > + * > + * Request notification for new CQE's added to that CQ. > + * Defined flags: > + * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification > + * event if a WQE with notification flag set enters the CQ > + * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification > + * event if a WQE enters the CQ. > + * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the > + * number of not reaped CQE's regardless of its notification > + * type and current or new CQ notification settings. > + * > + * @base_cq: Base CQ contained in siw CQ. > + * @flags: Requested notification flags. 
> + */ > +int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags) > +{ > + struct siw_cq *cq = to_siw_cq(base_cq); > + > + siw_dbg(cq->hdr.sdev, "[CQ %d]: flags: 0x%8x\n", OBJ_ID(cq), flags); > + > + if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) > + /* CQ event for next solicited completion */ > + smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED); > + else > + /* CQ event for any signalled completion */ > + smp_store_mb(*cq->notify, SIW_NOTIFY_ALL); > + > + if (flags & IB_CQ_REPORT_MISSED_EVENTS) > + return cq->cq_put - cq->cq_get; > + > + return 0; > +} > + > +/* > + * siw_dereg_mr() > + * > + * Release Memory Region. > + * > + * TODO: Update function if Memory Windows are supported by siw: > + * Is OFED core checking for MW dependencies for current > + * MR before calling MR deregistration?. > + * > + * @base_mr: Base MR contained in siw MR. > + */ > +int siw_dereg_mr(struct ib_mr *base_mr) > +{ > + struct siw_mr *mr; > + struct siw_device *sdev = to_siw_dev(base_mr->device); > + > + mr = to_siw_mr(base_mr); > + > + siw_dbg(sdev, "[MEM %d]: deregister mr, #ref's %d\n", > + mr->mem.hdr.id, kref_read(&mr->mem.hdr.ref)); > + > + mr->mem.stag_valid = 0; > + > + siw_remove_obj(&sdev->lock, &sdev->mem_idr, &mr->mem.hdr); > + siw_mem_put(&mr->mem); > + > + return 0; > +} > + > +static struct siw_mr *siw_create_mr(struct siw_device *sdev, void *mem_obj, > + u64 start, u64 len, int rights) > +{ > + struct siw_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL); > + unsigned long flags; > + > + if (!mr) > + return NULL; > + > + mr->mem.stag_valid = 0; > + > + if (siw_mem_add(sdev, &mr->mem) < 0) { > + kfree(mr); > + return NULL; > + } > + siw_dbg(sdev, "[MEM %d]: new mr, object 0x%p\n", > + mr->mem.hdr.id, mem_obj); > + > + mr->base_mr.lkey = mr->base_mr.rkey = mr->mem.hdr.id << 8; > + > + mr->mem.va = start; > + mr->mem.len = len; > + mr->mem.mr = NULL; > + mr->mem.perms = SIW_MEM_LREAD | /* not selectable in RDMA core */ > + (rights & 
IB_ACCESS_REMOTE_READ ? SIW_MEM_RREAD : 0) | > + (rights & IB_ACCESS_LOCAL_WRITE ? SIW_MEM_LWRITE : 0) | > + (rights & IB_ACCESS_REMOTE_WRITE ? SIW_MEM_RWRITE : 0); > + > + mr->mem_obj = mem_obj; > + > + INIT_LIST_HEAD(&mr->devq); > + spin_lock_irqsave(&sdev->lock, flags); > + list_add_tail(&mr->devq, &sdev->mr_list); > + spin_unlock_irqrestore(&sdev->lock, flags); > + > + return mr; > +} > + > +/* > + * siw_reg_user_mr() > + * > + * Register Memory Region. > + * > + * @base_pd: Base PD contained in siw PD. > + * @start: starting address of MR (virtual address) > + * @len: len of MR > + * @rnic_va: not used by siw > + * @rights: MR access rights > + * @udata: user buffer to communicate STag and Key. > + */ > +struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len, > + u64 rnic_va, int rights, struct ib_udata *udata) > +{ > + struct siw_mr *mr = NULL; > + struct siw_pd *pd = to_siw_pd(base_pd); > + struct siw_umem *umem = NULL; > + struct siw_ureq_reg_mr ureq; > + struct siw_uresp_reg_mr uresp; > + struct siw_device *sdev = pd->hdr.sdev; > + > + unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK); > + int rv; > + > + siw_dbg(sdev, "[PD %d]: start: 0x%016llx, va: 0x%016llx, len: %llu\n", > + OBJ_ID(pd), (unsigned long long)start, > + (unsigned long long)rnic_va, (unsigned long long)len); > + > + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { > + siw_dbg(sdev, "[PD %d]: too many mr's\n", OBJ_ID(pd)); > + rv = -ENOMEM; > + goto err_out; > + } > + if (!len) { > + rv = -EINVAL; > + goto err_out; > + } > + if (mem_limit != RLIM_INFINITY) { > + unsigned long num_pages = > + (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT; > + mem_limit >>= PAGE_SHIFT; > + > + if (num_pages > mem_limit - current->mm->locked_vm) { > + siw_dbg(sdev, > + "[PD %d]: pages req %lu, max %lu, lock %lu\n", > + OBJ_ID(pd), num_pages, mem_limit, > + current->mm->locked_vm); > + rv = -ENOMEM; > + goto err_out; > + } > + } > + umem = siw_umem_get(start, len); > + if 
(IS_ERR(umem)) { > + rv = PTR_ERR(umem); > + siw_dbg(sdev, "[PD %d]: getting user memory failed: %d\n", > + OBJ_ID(pd), rv); > + umem = NULL; > + goto err_out; > + } > + mr = siw_create_mr(sdev, umem, start, len, rights); > + if (!mr) { > + rv = -ENOMEM; > + goto err_out; > + } > + if (udata) { > + rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); > + if (rv) > + goto err_out; > + > + mr->base_mr.lkey |= ureq.stag_key; > + mr->base_mr.rkey |= ureq.stag_key; > + uresp.stag = mr->base_mr.lkey; > + > + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); > + if (rv) > + goto err_out; > + } > + mr->pd = pd; > + siw_pd_get(pd); > + > + mr->mem.stag_valid = 1; > + > + return &mr->base_mr; > + > +err_out: > + if (mr) { > + siw_remove_obj(&sdev->lock, &sdev->mem_idr, &mr->mem.hdr); > + siw_mem_put(&mr->mem); > + umem = NULL; > + } else > + atomic_dec(&sdev->num_mr); > + > + if (umem) > + siw_umem_release(umem); > + > + return ERR_PTR(rv); > +} > + > +struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type, > + u32 max_sge) > +{ > + struct siw_mr *mr; > + struct siw_pd *pd = to_siw_pd(base_pd); > + struct siw_device *sdev = pd->hdr.sdev; > + struct siw_pbl *pbl = NULL; > + int rv; > + > + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { > + siw_dbg(sdev, "[PD %d]: too many mr's\n", OBJ_ID(pd)); > + rv = -ENOMEM; > + goto err_out; > + } > + if (mr_type != IB_MR_TYPE_MEM_REG) { > + siw_dbg(sdev, "[PD %d]: mr type %d unsupported\n", > + OBJ_ID(pd), mr_type); > + rv = -EOPNOTSUPP; > + goto err_out; > + } > + if (max_sge > SIW_MAX_SGE_PBL) { > + siw_dbg(sdev, "[PD %d]: too many sge's: %d\n", > + OBJ_ID(pd), max_sge); > + rv = -ENOMEM; > + goto err_out; > + } > + pbl = siw_pbl_alloc(max_sge); > + if (IS_ERR(pbl)) { > + rv = PTR_ERR(pbl); > + siw_dbg(sdev, "[PD %d]: pbl allocation failed: %d\n", > + OBJ_ID(pd), rv); > + pbl = NULL; > + goto err_out; > + } > + mr = siw_create_mr(sdev, pbl, 0, max_sge * PAGE_SIZE, 0); > + if (!mr) { > + rv = -ENOMEM; > + 
goto err_out; > + } > + mr->mem.is_pbl = 1; > + mr->pd = pd; > + siw_pd_get(pd); > + > + siw_dbg(sdev, "[PD %d], [MEM %d]: success\n", > + OBJ_ID(pd), OBJ_ID(&mr->mem)); > + > + return &mr->base_mr; > + > +err_out: > + if (pbl) > + siw_pbl_free(pbl); > + > + siw_dbg(sdev, "[PD %d]: failed: %d\n", OBJ_ID(pd), rv); > + > + atomic_dec(&sdev->num_mr); > + > + return ERR_PTR(rv); > +} > + > +/* Just used to count number of pages being mapped */ > +static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr) > +{ > + return 0; > +} > + > +int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, > + unsigned int *sg_off) > +{ > + struct scatterlist *slp; > + struct siw_mr *mr = to_siw_mr(base_mr); > + struct siw_pbl *pbl = mr->pbl; > + struct siw_pble *pble = pbl->pbe; > + u64 pbl_size; > + int i, rv; > + > + if (!pbl) { > + siw_dbg(mr->mem.hdr.sdev, "[MEM %d]: no pbl allocated\n", > + OBJ_ID(&mr->mem)); > + return -EINVAL; > + } > + if (pbl->max_buf < num_sle) { > + siw_dbg(mr->mem.hdr.sdev, "[MEM %d]: too many sge's: %d>%d\n", > + OBJ_ID(&mr->mem), mr->pbl->max_buf, num_sle); > + return -ENOMEM; > + } > + > + for_each_sg(sl, slp, num_sle, i) { > + if (sg_dma_len(slp) == 0) { > + siw_dbg(mr->mem.hdr.sdev, "[MEM %d]: empty sge\n", > + OBJ_ID(&mr->mem)); > + return -EINVAL; > + } > + if (i == 0) { > + pble->addr = sg_dma_address(slp); > + pble->size = sg_dma_len(slp); > + pble->pbl_off = 0; > + pbl_size = pble->size; > + pbl->num_buf = 1; > + > + continue; > + } > + /* Merge PBL entries if adjacent */ > + if (pble->addr + pble->size == sg_dma_address(slp)) > + pble->size += sg_dma_len(slp); > + else { > + pble++; > + pbl->num_buf++; > + pble->addr = sg_dma_address(slp); > + pble->size = sg_dma_len(slp); > + pble->pbl_off = pbl_size; > + } > + pbl_size += sg_dma_len(slp); > + > + siw_dbg(mr->mem.hdr.sdev, > + "[MEM %d]: sge[%d], size %llu, addr %p, total %llu\n", > + OBJ_ID(&mr->mem), i, pble->size, (void *)pble->addr, > + pbl_size); > + } > + rv = 
ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page); > + if (rv > 0) { > + mr->mem.len = base_mr->length; > + mr->mem.va = base_mr->iova; > + siw_dbg(mr->mem.hdr.sdev, > + "[MEM %d]: %llu byte, %u SLE into %u entries\n", > + OBJ_ID(&mr->mem), mr->mem.len, num_sle, pbl->num_buf); > + } > + return rv; > +} > + > +/* > + * siw_get_dma_mr() > + * > + * Create a (empty) DMA memory region, where no umem is attached. > + */ > +struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights) > +{ > + struct siw_mr *mr; > + struct siw_pd *pd = to_siw_pd(base_pd); > + struct siw_device *sdev = pd->hdr.sdev; > + int rv; > + > + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { > + siw_dbg(sdev, "[PD %d]: too many mr's\n", OBJ_ID(pd)); > + rv = -ENOMEM; > + goto err_out; > + } > + mr = siw_create_mr(sdev, NULL, 0, ULONG_MAX, rights); > + if (!mr) { > + rv = -ENOMEM; > + goto err_out; > + } > + mr->mem.stag_valid = 1; > + > + mr->pd = pd; > + siw_pd_get(pd); > + > + siw_dbg(sdev, "[PD %d], [MEM %d]: success\n", > + OBJ_ID(pd), OBJ_ID(&mr->mem)); > + > + return &mr->base_mr; > + > +err_out: > + atomic_dec(&sdev->num_mr); > + > + return ERR_PTR(rv); > +} > + > +/* > + * siw_create_srq() > + * > + * Create Shared Receive Queue of attributes @init_attrs > + * within protection domain given by @base_pd. > + * > + * @base_pd: Base PD contained in siw PD. > + * @init_attrs: SRQ init attributes. > + * @udata: not used by siw. > + */ > +struct ib_srq *siw_create_srq(struct ib_pd *base_pd, > + struct ib_srq_init_attr *init_attrs, > + struct ib_udata *udata) > +{ > + struct siw_srq *srq = NULL; > + struct ib_srq_attr *attrs = &init_attrs->attr; > + struct siw_pd *pd = to_siw_pd(base_pd); > + struct siw_device *sdev = pd->hdr.sdev; > + > + int kernel_verbs = base_pd->uobject ? 0 : 1; This should be: bool kernel_verbs = !udata; > + memset(&uresp, 0, sizeof(uresp)); > + ctx = to_siw_ctx(base_pd->uobject->context); Here too, use rdma_udata_to_drv_context() instead of going through base_pd->uobject, and so on throughout the driver. Jason