-----"Jason Gunthorpe" <jgg@xxxxxxxx> wrote: ----- >To: "Bernard Metzler" <bmt@xxxxxxxxxxxxxx> >From: "Jason Gunthorpe" <jgg@xxxxxxxx> >Date: 03/08/2019 02:35PM >Cc: linux-rdma@xxxxxxxxxxxxxxx >Subject: Re: [PATCH v5 06/13] SIW application interface > >On Tue, Feb 19, 2019 at 11:08:56AM +0100, Bernard Metzler wrote: > >> +/* >> + * siw_create_qp() >> + * >> + * Create QP of requested size on given device. >> + * >> + * @base_pd: Base PD contained in siw PD >> + * @attrs: Initial QP attributes. >> + * @udata: used to provide QP ID, SQ and RQ size back to user. >> + */ >> + >> +struct ib_qp *siw_create_qp(struct ib_pd *base_pd, >> + struct ib_qp_init_attr *attrs, >> + struct ib_udata *udata) >> +{ >> + struct siw_qp *qp = NULL; >> + struct siw_pd *pd = to_siw_pd(base_pd); >> + struct ib_device *base_dev = base_pd->device; >> + struct siw_device *sdev = to_siw_dev(base_dev); >> + struct siw_cq *scq = NULL, *rcq = NULL; >> + >> + unsigned long flags; >> + int num_sqe, num_rqe, rv = 0; >> + >> + siw_dbg(sdev, "create new qp\n"); >> + >> + if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) { >> + siw_dbg(sdev, "too many qp's\n"); >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + if (attrs->qp_type != IB_QPT_RC) { >> + siw_dbg(sdev, "only rc qp's supported\n"); >> + rv = -EINVAL; >> + goto err_out; >> + } >> + if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) || >> + (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) || >> + (attrs->cap.max_send_sge > SIW_MAX_SGE) || >> + (attrs->cap.max_recv_sge > SIW_MAX_SGE)) { >> + siw_dbg(sdev, "qp size error\n"); >> + rv = -EINVAL; >> + goto err_out; >> + } >> + if (attrs->cap.max_inline_data > SIW_MAX_INLINE) { >> + siw_dbg(sdev, "max inline send: %d > %d\n", >> + attrs->cap.max_inline_data, (int)SIW_MAX_INLINE); >> + rv = -EINVAL; >> + goto err_out; >> + } >> + /* >> + * NOTE: we allow for zero element SQ and RQ WQE's SGL's >> + * but not for a QP unable to hold any WQE (SQ + RQ) >> + */ >> + if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) { >> + siw_dbg(sdev, "qp must have send or receive queue\n"); >> + rv = -EINVAL; >> + goto err_out; >> + } >> + >> + scq = siw_cq_id2obj(sdev, ((struct siw_cq >*)attrs->send_cq)->hdr.id); >> + rcq = siw_cq_id2obj(sdev, ((struct siw_cq >*)attrs->recv_cq)->hdr.id); >> + >> + if (!scq || (!rcq && !attrs->srq)) { >> + siw_dbg(sdev, "send cq or receive cq invalid\n"); >> + rv = -EINVAL; >> + goto err_out; >> + } >> + qp = kzalloc(sizeof(*qp), GFP_KERNEL); >> + if (!qp) { >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + >> + init_rwsem(&qp->state_lock); >> + spin_lock_init(&qp->sq_lock); >> + spin_lock_init(&qp->rq_lock); >> + spin_lock_init(&qp->orq_lock); >> + >> + if (!base_pd->uobject) >> + qp->kernel_verbs = 1; > >New drivers should not have the word 'uobject' in them. This is >'!udata' Thanks, yes. I already fixed that after Gal's review. 
> >> + rv = siw_qp_add(sdev, qp); >> + if (rv) >> + goto err_out; >> + >> + num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr); >> + num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr); >> + >> + if (qp->kernel_verbs) >> + qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe)); >> + else >> + qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe)); >> + >> + if (qp->sendq == NULL) { >> + siw_dbg_qp(qp, "send queue size %d alloc failed\n", num_sqe); >> + rv = -ENOMEM; >> + goto err_out_idr; >> + } >> + if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) { >> + if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) >> + qp->attrs.flags |= SIW_SIGNAL_ALL_WR; >> + else { >> + rv = -EINVAL; >> + goto err_out_idr; >> + } >> + } >> + qp->pd = pd; >> + qp->scq = scq; >> + qp->rcq = rcq; >> + >> + if (attrs->srq) { >> + /* >> + * SRQ support. >> + * Verbs 6.3.7: ignore RQ size, if SRQ present >> + * Verbs 6.3.5: do not check PD of SRQ against PD of QP >> + */ >> + qp->srq = to_siw_srq(attrs->srq); >> + qp->attrs.rq_size = 0; >> + siw_dbg_qp(qp, "[SRQ 0x%p] attached\n", qp->srq); >> + } else if (num_rqe) { >> + if (qp->kernel_verbs) >> + qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe)); >> + else >> + qp->recvq = vmalloc_user(num_rqe * >> + sizeof(struct siw_rqe)); >> + >> + if (qp->recvq == NULL) { >> + siw_dbg_qp(qp, "recv queue size %d alloc failed\n", >> + num_rqe); >> + rv = -ENOMEM; >> + goto err_out_idr; >> + } >> + >> + qp->attrs.rq_size = num_rqe; >> + } >> + qp->attrs.sq_size = num_sqe; >> + qp->attrs.sq_max_sges = attrs->cap.max_send_sge; >> + qp->attrs.rq_max_sges = attrs->cap.max_recv_sge; >> + >> + /* Make those two tunables fixed for now. */ >> + qp->tx_ctx.gso_seg_limit = gso_seg_limit; >> + qp->tx_ctx.zcopy_tx = zcopy_tx; >> + >> + qp->attrs.state = SIW_QP_STATE_IDLE; >> + >> + if (udata) { >> + struct siw_uresp_create_qp uresp; >> + struct siw_ucontext *ctx; >> + >> + memset(&uresp, 0, sizeof(uresp)); >> + ctx = to_siw_ctx(base_pd->uobject->context); > >Here too, this is rdma_udata_to_drv_context() Absolutely. I see I have to rebase to the latest greatest rdma-next. was not aware of it... 
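If I read the new helper right, the conversion here would be something like this (untested sketch; I still have to double-check the exact name of the embedded ib_ucontext member in struct siw_ucontext once I am on current rdma-next):

	ctx = rdma_udata_to_drv_context(udata, struct siw_ucontext,
					base_ucontext); /* member name to be confirmed */

which also removes the base_pd->uobject->context dereference.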
> >> + uresp.sq_key = uresp.rq_key = SIW_INVAL_UOBJ_KEY; >> + uresp.num_sqe = num_sqe; >> + uresp.num_rqe = num_rqe; >> + uresp.qp_id = QP_ID(qp); >> + >> + if (qp->sendq) { >> + uresp.sq_key = siw_insert_uobj(ctx, qp->sendq, >> + num_sqe * sizeof(struct siw_sqe)); >> + if (uresp.sq_key > SIW_MAX_UOBJ_KEY) >> + siw_dbg_qp(qp, "preparing mmap sq failed\n"); >> + } >> + if (qp->recvq) { >> + uresp.rq_key = siw_insert_uobj(ctx, qp->recvq, >> + num_rqe * sizeof(struct siw_rqe)); >> + if (uresp.rq_key > SIW_MAX_UOBJ_KEY) >> + siw_dbg_qp(qp, "preparing mmap rq failed\n"); >> + } >> + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); >> + if (rv) >> + goto err_out_idr; >> + } >> + qp->tx_cpu = siw_get_tx_cpu(sdev); >> + if (qp->tx_cpu < 0) { >> + rv = -EINVAL; >> + goto err_out_idr; >> + } >> + qp->base_qp.qp_num = QP_ID(qp); >> + >> + siw_pd_get(pd); >> + >> + INIT_LIST_HEAD(&qp->devq); >> + spin_lock_irqsave(&sdev->lock, flags); >> + list_add_tail(&qp->devq, &sdev->qp_list); >> + spin_unlock_irqrestore(&sdev->lock, flags); >> + >> + return &qp->base_qp; >> + >> +err_out_idr: >> + siw_remove_obj(&sdev->lock, &sdev->qp_idr, &qp->hdr); >> +err_out: >> + if (scq) >> + siw_cq_put(scq); >> + if (rcq) >> + siw_cq_put(rcq); >> + >> + if (qp) { >> + if (qp->sendq) >> + vfree(qp->sendq); >> + if (qp->recvq) >> + vfree(qp->recvq); >> + kfree(qp); >> + } >> + atomic_dec(&sdev->num_qp); >> + >> + return ERR_PTR(rv); >> +} >> + >> +/* >> + * Minimum siw_query_qp() verb interface. >> + * >> + * @qp_attr_mask is not used but all available information is >provided >> + */ >> +int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr >*qp_attr, >> + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) >> +{ >> + struct siw_qp *qp; >> + struct siw_device *sdev; >> + >> + if (base_qp && qp_attr && qp_init_attr) { >> + qp = to_siw_qp(base_qp); >> + sdev = to_siw_dev(base_qp->device); >> + } else >> + return -EINVAL; >> + >> + qp_attr->cap.max_inline_data = SIW_MAX_INLINE; >> + qp_attr->cap.max_send_wr = qp->attrs.sq_size; >> + qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; >> + qp_attr->cap.max_recv_wr = qp->attrs.rq_size; >> + qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; >> + qp_attr->path_mtu = siw_mtu_net2base(sdev->netdev->mtu); >> + qp_attr->max_rd_atomic = qp->attrs.irq_size; >> + qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; >> + >> + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | >> + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; >> + >> + qp_init_attr->qp_type = base_qp->qp_type; >> + qp_init_attr->send_cq = base_qp->send_cq; >> + qp_init_attr->recv_cq = base_qp->recv_cq; >> + qp_init_attr->srq = base_qp->srq; >> + >> + qp_init_attr->cap = qp_attr->cap; >> + >> + return 0; >> +} >> + >> +int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr >*attr, >> + int attr_mask, struct ib_udata *udata) >> +{ >> + struct siw_qp_attrs new_attrs; >> + enum siw_qp_attr_mask siw_attr_mask = 0; >> + struct siw_qp *qp = to_siw_qp(base_qp); >> + int rv = 0; >> + >> + if (!attr_mask) >> + return 0; >> + >> + memset(&new_attrs, 0, sizeof(new_attrs)); >> + >> + if (attr_mask & IB_QP_ACCESS_FLAGS) { >> + >> + siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS; >> + >> + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) >> + new_attrs.flags |= SIW_RDMA_READ_ENABLED; >> + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) >> + new_attrs.flags |= SIW_RDMA_WRITE_ENABLED; >> + if (attr->qp_access_flags & IB_ACCESS_MW_BIND) >> + new_attrs.flags |= SIW_RDMA_BIND_ENABLED; >> + } >> + if (attr_mask & 
IB_QP_STATE) { >> + siw_dbg_qp(qp, "desired ib qp state: %s\n", >> + ib_qp_state_to_string[attr->qp_state]); >> + >> + new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state]; >> + >> + if (new_attrs.state > SIW_QP_STATE_RTS) >> + qp->tx_ctx.tx_suspend = 1; >> + >> + siw_attr_mask |= SIW_QP_ATTR_STATE; >> + } >> + if (!attr_mask) >> + goto out; >> + >> + down_write(&qp->state_lock); >> + >> + rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask); >> + >> + up_write(&qp->state_lock); >> +out: >> + return rv; >> +} >> + >> +int siw_destroy_qp(struct ib_qp *base_qp) >> +{ >> + struct siw_qp *qp = to_siw_qp(base_qp); >> + struct siw_qp_attrs qp_attrs; >> + >> + siw_dbg_qp(qp, "state %d, cep 0x%p\n", qp->attrs.state, qp->cep); >> + >> + /* >> + * Mark QP as in process of destruction to prevent from >> + * any async callbacks to RDMA core >> + */ >> + qp->attrs.flags |= SIW_QP_IN_DESTROY; >> + qp->rx_ctx.rx_suspend = 1; >> + >> + down_write(&qp->state_lock); >> + >> + qp_attrs.state = SIW_QP_STATE_ERROR; >> + (void)siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE); >> + >> + if (qp->cep) { >> + siw_cep_put(qp->cep); >> + qp->cep = NULL; >> + } >> + >> + up_write(&qp->state_lock); >> + >> + kfree(qp->rx_ctx.mpa_crc_hd); >> + kfree(qp->tx_ctx.mpa_crc_hd); >> + >> + /* Drop references */ >> + siw_cq_put(qp->scq); >> + siw_cq_put(qp->rcq); >> + siw_pd_put(qp->pd); >> + qp->scq = qp->rcq = NULL; >> + >> + siw_qp_put(qp); >> + >> + return 0; >> +} >> + >> +/* >> + * siw_copy_sgl() >> + * >> + * Copy SGL from RDMA core representation to local >> + * representation. >> + */ >> +static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge >*siw_sge, >> + int num_sge) >> +{ >> + while (num_sge--) { >> + siw_sge->laddr = sge->addr; >> + siw_sge->length = sge->length; >> + siw_sge->lkey = sge->lkey; >> + >> + siw_sge++; sge++; >> + } >> +} >> + >> +/* >> + * siw_copy_inline_sgl() >> + * >> + * Prepare sgl of inlined data for sending. For userland callers >> + * function checks if given buffer addresses and len's are within >> + * process context bounds. >> + * Data from all provided sge's are copied together into the wqe, >> + * referenced by a single sge. >> + */ >> +static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, >> + struct siw_sqe *sqe) >> +{ >> + struct ib_sge *core_sge = core_wr->sg_list; >> + void *kbuf = &sqe->sge[1]; >> + int num_sge = core_wr->num_sge, >> + bytes = 0; >> + >> + sqe->sge[0].laddr = (u64)kbuf; >> + sqe->sge[0].lkey = 0; >> + >> + while (num_sge--) { >> + if (!core_sge->length) { >> + core_sge++; >> + continue; >> + } >> + bytes += core_sge->length; >> + if (bytes > SIW_MAX_INLINE) { >> + bytes = -EINVAL; >> + break; >> + } >> + memcpy(kbuf, (void *)(uintptr_t)core_sge->addr, >> + core_sge->length); >> + >> + kbuf += core_sge->length; >> + core_sge++; >> + } >> + sqe->sge[0].length = bytes > 0 ? bytes : 0; >> + sqe->num_sge = bytes > 0 ? 1 : 0; >> + >> + return bytes; >> +} >> + >> +/* >> + * siw_post_send() >> + * >> + * Post a list of S-WR's to a SQ. >> + * >> + * @base_qp: Base QP contained in siw QP >> + * @wr: Null terminated list of user WR's >> + * @bad_wr: Points to failing WR in case of synchronous failure. 
>> + */ >> +int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr >*wr, >> + const struct ib_send_wr **bad_wr) >> +{ >> + struct siw_qp *qp = to_siw_qp(base_qp); >> + struct siw_wqe *wqe = tx_wqe(qp); >> + >> + unsigned long flags; >> + int rv = 0; >> + >> + siw_dbg_qp(qp, "state %d\n", qp->attrs.state); >> + >> + /* >> + * Try to acquire QP state lock. Must be non-blocking >> + * to accommodate kernel clients needs. >> + */ >> + if (!down_read_trylock(&qp->state_lock)) { >> + *bad_wr = wr; >> + return -ENOTCONN; >> + } >> + >> + if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { >> + up_read(&qp->state_lock); >> + *bad_wr = wr; >> + return -ENOTCONN; >> + } >> + if (wr && !qp->kernel_verbs) { >> + siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); >> + up_read(&qp->state_lock); >> + *bad_wr = wr; >> + return -EINVAL; >> + } >> + >> + spin_lock_irqsave(&qp->sq_lock, flags); >> + >> + while (wr) { >> + u32 idx = qp->sq_put % qp->attrs.sq_size; >> + struct siw_sqe *sqe = &qp->sendq[idx]; >> + >> + if (sqe->flags) { >> + siw_dbg_qp(qp, "sq full\n"); >> + rv = -ENOMEM; >> + break; >> + } >> + if (wr->num_sge > qp->attrs.sq_max_sges) { >> + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); >> + rv = -EINVAL; >> + break; >> + } >> + sqe->id = wr->wr_id; >> + >> + if ((wr->send_flags & IB_SEND_SIGNALED) || >> + (qp->attrs.flags & SIW_SIGNAL_ALL_WR)) >> + sqe->flags |= SIW_WQE_SIGNALLED; >> + >> + if (wr->send_flags & IB_SEND_FENCE) >> + sqe->flags |= SIW_WQE_READ_FENCE; >> + >> + switch (wr->opcode) { >> + >> + case IB_WR_SEND: >> + case IB_WR_SEND_WITH_INV: >> + if (wr->send_flags & IB_SEND_SOLICITED) >> + sqe->flags |= SIW_WQE_SOLICITED; >> + >> + if (!(wr->send_flags & IB_SEND_INLINE)) { >> + siw_copy_sgl(wr->sg_list, sqe->sge, >> + wr->num_sge); >> + sqe->num_sge = wr->num_sge; >> + } else { >> + rv = siw_copy_inline_sgl(wr, sqe); >> + if (rv <= 0) { >> + rv = -EINVAL; >> + break; >> + } >> + sqe->flags |= SIW_WQE_INLINE; >> + sqe->num_sge = 1; >> + } >> + if (wr->opcode == IB_WR_SEND) >> + sqe->opcode = SIW_OP_SEND; >> + else { >> + sqe->opcode = SIW_OP_SEND_REMOTE_INV; >> + sqe->rkey = wr->ex.invalidate_rkey; >> + } >> + break; >> + >> + case IB_WR_RDMA_READ_WITH_INV: >> + case IB_WR_RDMA_READ: >> + /* >> + * OFED WR restricts RREAD sink to SGL containing >> + * 1 SGE only. we could relax to SGL with multiple >> + * elements referring the SAME ltag or even sending >> + * a private per-rreq tag referring to a checked >> + * local sgl with MULTIPLE ltag's. would be easy >> + * to do... >> + */ >> + if (unlikely(wr->num_sge != 1)) { >> + rv = -EINVAL; >> + break; >> + } >> + siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1); >> + /* >> + * NOTE: zero length RREAD is allowed! 
>> + */ >> + sqe->raddr = rdma_wr(wr)->remote_addr; >> + sqe->rkey = rdma_wr(wr)->rkey; >> + sqe->num_sge = 1; >> + >> + if (wr->opcode == IB_WR_RDMA_READ) >> + sqe->opcode = SIW_OP_READ; >> + else >> + sqe->opcode = SIW_OP_READ_LOCAL_INV; >> + break; >> + >> + case IB_WR_RDMA_WRITE: >> + if (!(wr->send_flags & IB_SEND_INLINE)) { >> + siw_copy_sgl(wr->sg_list, &sqe->sge[0], >> + wr->num_sge); >> + sqe->num_sge = wr->num_sge; >> + } else { >> + rv = siw_copy_inline_sgl(wr, sqe); >> + if (unlikely(rv < 0)) { >> + rv = -EINVAL; >> + break; >> + } >> + sqe->flags |= SIW_WQE_INLINE; >> + sqe->num_sge = 1; >> + } >> + sqe->raddr = rdma_wr(wr)->remote_addr; >> + sqe->rkey = rdma_wr(wr)->rkey; >> + sqe->opcode = SIW_OP_WRITE; >> + >> + break; >> + >> + case IB_WR_REG_MR: >> + sqe->base_mr = (uint64_t)reg_wr(wr)->mr; >> + sqe->rkey = reg_wr(wr)->key; >> + sqe->access = SIW_MEM_LREAD; >> + if (reg_wr(wr)->access & IB_ACCESS_LOCAL_WRITE) >> + sqe->access |= SIW_MEM_LWRITE; >> + if (reg_wr(wr)->access & IB_ACCESS_REMOTE_WRITE) >> + sqe->access |= SIW_MEM_RWRITE; >> + if (reg_wr(wr)->access & IB_ACCESS_REMOTE_READ) >> + sqe->access |= SIW_MEM_RREAD; >> + sqe->opcode = SIW_OP_REG_MR; >> + >> + break; >> + >> + case IB_WR_LOCAL_INV: >> + sqe->rkey = wr->ex.invalidate_rkey; >> + sqe->opcode = SIW_OP_INVAL_STAG; >> + >> + break; >> + >> + default: >> + siw_dbg_qp(qp, "ib wr type %d unsupported\n", >> + wr->opcode); >> + rv = -EINVAL; >> + break; >> + } >> + siw_dbg_qp(qp, "opcode %d, flags 0x%x\n", >> + sqe->opcode, sqe->flags); >> + >> + if (unlikely(rv < 0)) >> + break; >> + >> + /* make SQE only vaild after completely written */ >> + smp_wmb(); >> + sqe->flags |= SIW_WQE_VALID; >> + >> + qp->sq_put++; >> + wr = wr->next; >> + } >> + >> + /* >> + * Send directly if SQ processing is not in progress. >> + * Eventual immediate errors (rv < 0) do not affect the involved >> + * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ >> + * processing, if new work is already pending. But rv must be >passed >> + * to caller. >> + */ >> + if (wqe->wr_status != SIW_WR_IDLE) { >> + spin_unlock_irqrestore(&qp->sq_lock, flags); >> + goto skip_direct_sending; >> + } >> + rv = siw_activate_tx(qp); >> + spin_unlock_irqrestore(&qp->sq_lock, flags); >> + >> + if (rv <= 0) >> + goto skip_direct_sending; >> + >> + if (qp->kernel_verbs) { >> + rv = siw_sq_start(qp); >> + } else { >> + qp->tx_ctx.in_syscall = 1; >> + >> + if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend)) >> + siw_qp_cm_drop(qp, 0); >> + >> + qp->tx_ctx.in_syscall = 0; >> + } >> +skip_direct_sending: >> + >> + up_read(&qp->state_lock); >> + >> + if (rv >= 0) >> + return 0; >> + /* >> + * Immediate error >> + */ >> + siw_dbg_qp(qp, "error %d\n", rv); >> + >> + *bad_wr = wr; >> + return rv; >> +} >> + >> +/* >> + * siw_post_receive() >> + * >> + * Post a list of R-WR's to a RQ. >> + * >> + * @base_qp: Base QP contained in siw QP >> + * @wr: Null terminated list of user WR's >> + * @bad_wr: Points to failing WR in case of synchronous failure. >> + */ >> +int siw_post_receive(struct ib_qp *base_qp, const struct >ib_recv_wr *wr, >> + const struct ib_recv_wr **bad_wr) >> +{ >> + struct siw_qp *qp = to_siw_qp(base_qp); >> + unsigned long flags; >> + int rv = 0; >> + >> + if (qp->srq) { >> + *bad_wr = wr; >> + return -EOPNOTSUPP; /* what else from errno.h? */ >> + } >> + /* >> + * Try to acquire QP state lock. Must be non-blocking >> + * to accommodate kernel clients needs. 
>> + */ >> + if (!down_read_trylock(&qp->state_lock)) { >> + *bad_wr = wr; >> + return -ENOTCONN; >> + } >> + if (!qp->kernel_verbs) { >> + siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n"); >> + up_read(&qp->state_lock); >> + *bad_wr = wr; >> + return -EINVAL; >> + } >> + if (qp->attrs.state > SIW_QP_STATE_RTS) { >> + up_read(&qp->state_lock); >> + *bad_wr = wr; >> + return -EINVAL; >> + } >> + /* >> + * Serialize potentially multiple producers. >> + * Not needed for single threaded consumer side. >> + */ >> + spin_lock_irqsave(&qp->rq_lock, flags); >> + >> + while (wr) { >> + u32 idx = qp->rq_put % qp->attrs.rq_size; >> + struct siw_rqe *rqe = &qp->recvq[idx]; >> + >> + if (rqe->flags) { >> + siw_dbg_qp(qp, "Receive Queue full\n"); >> + rv = -ENOMEM; >> + break; >> + } >> + if (wr->num_sge > qp->attrs.rq_max_sges) { >> + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); >> + rv = -EINVAL; >> + break; >> + } >> + rqe->id = wr->wr_id; >> + rqe->num_sge = wr->num_sge; >> + siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); >> + >> + /* make sure RQE is completely written before valid */ >> + smp_wmb(); >> + >> + rqe->flags = SIW_WQE_VALID; >> + >> + qp->rq_put++; >> + wr = wr->next; >> + } >> + spin_unlock_irqrestore(&qp->rq_lock, flags); >> + >> + up_read(&qp->state_lock); >> + >> + if (rv < 0) { >> + siw_dbg_qp(qp, "error %d\n", rv); >> + *bad_wr = wr; >> + } >> + return rv > 0 ? 0 : rv; >> +} >> + >> +int siw_destroy_cq(struct ib_cq *base_cq) >> +{ >> + struct siw_cq *cq = to_siw_cq(base_cq); >> + struct ib_device *base_dev = base_cq->device; >> + struct siw_device *sdev = to_siw_dev(base_dev); >> + >> + siw_cq_flush(cq); >> + >> + siw_remove_obj(&sdev->lock, &sdev->cq_idr, &cq->hdr); >> + siw_cq_put(cq); >> + >> + return 0; >> +} >> + >> +/* >> + * siw_create_cq() >> + * >> + * Create CQ of requested size on given device. >> + * >> + * @base_dev: RDMA device contained in siw device >> + * @size: maximum number of CQE's allowed. >> + * @ib_context: user context. >> + * @udata: used to provide CQ ID back to user. 
>> + */ >> + >> +struct ib_cq *siw_create_cq(struct ib_device *base_dev, >> + const struct ib_cq_init_attr *attr, >> + struct ib_ucontext *ib_context, >> + struct ib_udata *udata) >> +{ >> + struct siw_cq *cq = NULL; >> + struct siw_device *sdev = to_siw_dev(base_dev); >> + struct siw_uresp_create_cq uresp; >> + int rv, size = attr->cqe; >> + >> + if (!base_dev) { >> + rv = -ENODEV; >> + goto err_out; >> + } >> + if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) { >> + siw_dbg(sdev, "too many cq's\n"); >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + if (size < 1 || size > sdev->attrs.max_cqe) { >> + siw_dbg(sdev, "cq size error: %d\n", size); >> + rv = -EINVAL; >> + goto err_out; >> + } >> + cq = kzalloc(sizeof(*cq), GFP_KERNEL); >> + if (!cq) { >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + size = roundup_pow_of_two(size); >> + cq->base_cq.cqe = size; >> + cq->num_cqe = size; >> + >> + if (!ib_context) { >> + cq->kernel_verbs = 1; >> + cq->queue = vzalloc(size * sizeof(struct siw_cqe) >> + + sizeof(struct siw_cq_ctrl)); >> + } else { >> + cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) >> + + sizeof(struct siw_cq_ctrl)); >> + } >> + if (cq->queue == NULL) { >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + >> + rv = siw_cq_add(sdev, cq); >> + if (rv) >> + goto err_out; >> + >> + spin_lock_init(&cq->lock); >> + >> + cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify; >> + >> + if (!cq->kernel_verbs) { >> + struct siw_ucontext *ctx = to_siw_ctx(ib_context); >> + >> + uresp.cq_key = siw_insert_uobj(ctx, cq->queue, >> + size * sizeof(struct siw_cqe) + >> + sizeof(struct siw_cq_ctrl)); >> + >> + if (uresp.cq_key > SIW_MAX_UOBJ_KEY) >> + siw_dbg(sdev, "[CQ %d]: preparing mmap failed\n", >> + OBJ_ID(cq)); >> + >> + uresp.cq_id = OBJ_ID(cq); >> + uresp.num_cqe = size; >> + >> + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); >> + if (rv) >> + goto err_out_idr; >> + } >> + return &cq->base_cq; >> + >> +err_out_idr: >> + siw_remove_obj(&sdev->lock, &sdev->cq_idr, &cq->hdr); >> +err_out: >> + siw_dbg(sdev, "cq creation failed: %d", rv); >> + >> + if (cq && cq->queue) >> + vfree(cq->queue); >> + >> + kfree(cq); >> + atomic_dec(&sdev->num_cq); >> + >> + return ERR_PTR(rv); >> +} >> + >> +/* >> + * siw_poll_cq() >> + * >> + * Reap CQ entries if available and copy work completion status >into >> + * array of WC's provided by caller. Returns number of reaped >CQE's. >> + * >> + * @base_cq: Base CQ contained in siw CQ. >> + * @num_cqe: Maximum number of CQE's to reap. >> + * @wc: Array of work completions to be filled by siw. >> + */ >> +int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc >*wc) >> +{ >> + struct siw_cq *cq = to_siw_cq(base_cq); >> + int i; >> + >> + for (i = 0; i < num_cqe; i++) { >> + if (!(siw_reap_cqe(cq, wc))) >> + break; >> + wc++; >> + } >> + return i; >> +} >> + >> +/* >> + * siw_req_notify_cq() >> + * >> + * Request notification for new CQE's added to that CQ. >> + * Defined flags: >> + * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification >> + * event if a WQE with notification flag set enters the CQ >> + * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification >> + * event if a WQE enters the CQ. >> + * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the >> + * number of not reaped CQE's regardless of its notification >> + * type and current or new CQ notification settings. >> + * >> + * @base_cq: Base CQ contained in siw CQ. >> + * @flags: Requested notification flags. 
>> + */ >> +int siw_req_notify_cq(struct ib_cq *base_cq, enum >ib_cq_notify_flags flags) >> +{ >> + struct siw_cq *cq = to_siw_cq(base_cq); >> + >> + siw_dbg(cq->hdr.sdev, "[CQ %d]: flags: 0x%8x\n", OBJ_ID(cq), >flags); >> + >> + if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) >> + /* CQ event for next solicited completion */ >> + smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED); >> + else >> + /* CQ event for any signalled completion */ >> + smp_store_mb(*cq->notify, SIW_NOTIFY_ALL); >> + >> + if (flags & IB_CQ_REPORT_MISSED_EVENTS) >> + return cq->cq_put - cq->cq_get; >> + >> + return 0; >> +} >> + >> +/* >> + * siw_dereg_mr() >> + * >> + * Release Memory Region. >> + * >> + * TODO: Update function if Memory Windows are supported by siw: >> + * Is OFED core checking for MW dependencies for current >> + * MR before calling MR deregistration?. >> + * >> + * @base_mr: Base MR contained in siw MR. >> + */ >> +int siw_dereg_mr(struct ib_mr *base_mr) >> +{ >> + struct siw_mr *mr; >> + struct siw_device *sdev = to_siw_dev(base_mr->device); >> + >> + mr = to_siw_mr(base_mr); >> + >> + siw_dbg(sdev, "[MEM %d]: deregister mr, #ref's %d\n", >> + mr->mem.hdr.id, kref_read(&mr->mem.hdr.ref)); >> + >> + mr->mem.stag_valid = 0; >> + >> + siw_remove_obj(&sdev->lock, &sdev->mem_idr, &mr->mem.hdr); >> + siw_mem_put(&mr->mem); >> + >> + return 0; >> +} >> + >> +static struct siw_mr *siw_create_mr(struct siw_device *sdev, void >*mem_obj, >> + u64 start, u64 len, int rights) >> +{ >> + struct siw_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL); >> + unsigned long flags; >> + >> + if (!mr) >> + return NULL; >> + >> + mr->mem.stag_valid = 0; >> + >> + if (siw_mem_add(sdev, &mr->mem) < 0) { >> + kfree(mr); >> + return NULL; >> + } >> + siw_dbg(sdev, "[MEM %d]: new mr, object 0x%p\n", >> + mr->mem.hdr.id, mem_obj); >> + >> + mr->base_mr.lkey = mr->base_mr.rkey = mr->mem.hdr.id << 8; >> + >> + mr->mem.va = start; >> + mr->mem.len = len; >> + mr->mem.mr = NULL; >> + mr->mem.perms = SIW_MEM_LREAD | /* not selectable in RDMA core */ >> + (rights & IB_ACCESS_REMOTE_READ ? SIW_MEM_RREAD : 0) | >> + (rights & IB_ACCESS_LOCAL_WRITE ? SIW_MEM_LWRITE : 0) | >> + (rights & IB_ACCESS_REMOTE_WRITE ? SIW_MEM_RWRITE : 0); >> + >> + mr->mem_obj = mem_obj; >> + >> + INIT_LIST_HEAD(&mr->devq); >> + spin_lock_irqsave(&sdev->lock, flags); >> + list_add_tail(&mr->devq, &sdev->mr_list); >> + spin_unlock_irqrestore(&sdev->lock, flags); >> + >> + return mr; >> +} >> + >> +/* >> + * siw_reg_user_mr() >> + * >> + * Register Memory Region. >> + * >> + * @base_pd: Base PD contained in siw PD. >> + * @start: starting address of MR (virtual address) >> + * @len: len of MR >> + * @rnic_va: not used by siw >> + * @rights: MR access rights >> + * @udata: user buffer to communicate STag and Key. 
>> + */ >> +struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, >u64 len, >> + u64 rnic_va, int rights, struct ib_udata *udata) >> +{ >> + struct siw_mr *mr = NULL; >> + struct siw_pd *pd = to_siw_pd(base_pd); >> + struct siw_umem *umem = NULL; >> + struct siw_ureq_reg_mr ureq; >> + struct siw_uresp_reg_mr uresp; >> + struct siw_device *sdev = pd->hdr.sdev; >> + >> + unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK); >> + int rv; >> + >> + siw_dbg(sdev, "[PD %d]: start: 0x%016llx, va: 0x%016llx, len: >%llu\n", >> + OBJ_ID(pd), (unsigned long long)start, >> + (unsigned long long)rnic_va, (unsigned long long)len); >> + >> + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { >> + siw_dbg(sdev, "[PD %d]: too many mr's\n", OBJ_ID(pd)); >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + if (!len) { >> + rv = -EINVAL; >> + goto err_out; >> + } >> + if (mem_limit != RLIM_INFINITY) { >> + unsigned long num_pages = >> + (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT; >> + mem_limit >>= PAGE_SHIFT; >> + >> + if (num_pages > mem_limit - current->mm->locked_vm) { >> + siw_dbg(sdev, >> + "[PD %d]: pages req %lu, max %lu, lock %lu\n", >> + OBJ_ID(pd), num_pages, mem_limit, >> + current->mm->locked_vm); >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + } >> + umem = siw_umem_get(start, len); >> + if (IS_ERR(umem)) { >> + rv = PTR_ERR(umem); >> + siw_dbg(sdev, "[PD %d]: getting user memory failed: %d\n", >> + OBJ_ID(pd), rv); >> + umem = NULL; >> + goto err_out; >> + } >> + mr = siw_create_mr(sdev, umem, start, len, rights); >> + if (!mr) { >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + if (udata) { >> + rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); >> + if (rv) >> + goto err_out; >> + >> + mr->base_mr.lkey |= ureq.stag_key; >> + mr->base_mr.rkey |= ureq.stag_key; >> + uresp.stag = mr->base_mr.lkey; >> + >> + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); >> + if (rv) >> + goto err_out; >> + } >> + mr->pd = pd; >> + siw_pd_get(pd); >> + >> + mr->mem.stag_valid = 1; >> + >> + return &mr->base_mr; >> + >> +err_out: >> + if (mr) { >> + siw_remove_obj(&sdev->lock, &sdev->mem_idr, &mr->mem.hdr); >> + siw_mem_put(&mr->mem); >> + umem = NULL; >> + } else >> + atomic_dec(&sdev->num_mr); >> + >> + if (umem) >> + siw_umem_release(umem); >> + >> + return ERR_PTR(rv); >> +} >> + >> +struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type >mr_type, >> + u32 max_sge) >> +{ >> + struct siw_mr *mr; >> + struct siw_pd *pd = to_siw_pd(base_pd); >> + struct siw_device *sdev = pd->hdr.sdev; >> + struct siw_pbl *pbl = NULL; >> + int rv; >> + >> + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { >> + siw_dbg(sdev, "[PD %d]: too many mr's\n", OBJ_ID(pd)); >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + if (mr_type != IB_MR_TYPE_MEM_REG) { >> + siw_dbg(sdev, "[PD %d]: mr type %d unsupported\n", >> + OBJ_ID(pd), mr_type); >> + rv = -EOPNOTSUPP; >> + goto err_out; >> + } >> + if (max_sge > SIW_MAX_SGE_PBL) { >> + siw_dbg(sdev, "[PD %d]: too many sge's: %d\n", >> + OBJ_ID(pd), max_sge); >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + pbl = siw_pbl_alloc(max_sge); >> + if (IS_ERR(pbl)) { >> + rv = PTR_ERR(pbl); >> + siw_dbg(sdev, "[PD %d]: pbl allocation failed: %d\n", >> + OBJ_ID(pd), rv); >> + pbl = NULL; >> + goto err_out; >> + } >> + mr = siw_create_mr(sdev, pbl, 0, max_sge * PAGE_SIZE, 0); >> + if (!mr) { >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + mr->mem.is_pbl = 1; >> + mr->pd = pd; >> + siw_pd_get(pd); >> + >> + siw_dbg(sdev, "[PD %d], [MEM %d]: success\n", >> + 
OBJ_ID(pd), OBJ_ID(&mr->mem)); >> + >> + return &mr->base_mr; >> + >> +err_out: >> + if (pbl) >> + siw_pbl_free(pbl); >> + >> + siw_dbg(sdev, "[PD %d]: failed: %d\n", OBJ_ID(pd), rv); >> + >> + atomic_dec(&sdev->num_mr); >> + >> + return ERR_PTR(rv); >> +} >> + >> +/* Just used to count number of pages being mapped */ >> +static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr) >> +{ >> + return 0; >> +} >> + >> +int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, >int num_sle, >> + unsigned int *sg_off) >> +{ >> + struct scatterlist *slp; >> + struct siw_mr *mr = to_siw_mr(base_mr); >> + struct siw_pbl *pbl = mr->pbl; >> + struct siw_pble *pble = pbl->pbe; >> + u64 pbl_size; >> + int i, rv; >> + >> + if (!pbl) { >> + siw_dbg(mr->mem.hdr.sdev, "[MEM %d]: no pbl allocated\n", >> + OBJ_ID(&mr->mem)); >> + return -EINVAL; >> + } >> + if (pbl->max_buf < num_sle) { >> + siw_dbg(mr->mem.hdr.sdev, "[MEM %d]: too many sge's: %d>%d\n", >> + OBJ_ID(&mr->mem), mr->pbl->max_buf, num_sle); >> + return -ENOMEM; >> + } >> + >> + for_each_sg(sl, slp, num_sle, i) { >> + if (sg_dma_len(slp) == 0) { >> + siw_dbg(mr->mem.hdr.sdev, "[MEM %d]: empty sge\n", >> + OBJ_ID(&mr->mem)); >> + return -EINVAL; >> + } >> + if (i == 0) { >> + pble->addr = sg_dma_address(slp); >> + pble->size = sg_dma_len(slp); >> + pble->pbl_off = 0; >> + pbl_size = pble->size; >> + pbl->num_buf = 1; >> + >> + continue; >> + } >> + /* Merge PBL entries if adjacent */ >> + if (pble->addr + pble->size == sg_dma_address(slp)) >> + pble->size += sg_dma_len(slp); >> + else { >> + pble++; >> + pbl->num_buf++; >> + pble->addr = sg_dma_address(slp); >> + pble->size = sg_dma_len(slp); >> + pble->pbl_off = pbl_size; >> + } >> + pbl_size += sg_dma_len(slp); >> + >> + siw_dbg(mr->mem.hdr.sdev, >> + "[MEM %d]: sge[%d], size %llu, addr %p, total %llu\n", >> + OBJ_ID(&mr->mem), i, pble->size, (void *)pble->addr, >> + pbl_size); >> + } >> + rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, >siw_set_pbl_page); >> + if (rv > 0) { >> + mr->mem.len = base_mr->length; >> + mr->mem.va = base_mr->iova; >> + siw_dbg(mr->mem.hdr.sdev, >> + "[MEM %d]: %llu byte, %u SLE into %u entries\n", >> + OBJ_ID(&mr->mem), mr->mem.len, num_sle, pbl->num_buf); >> + } >> + return rv; >> +} >> + >> +/* >> + * siw_get_dma_mr() >> + * >> + * Create a (empty) DMA memory region, where no umem is attached. >> + */ >> +struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights) >> +{ >> + struct siw_mr *mr; >> + struct siw_pd *pd = to_siw_pd(base_pd); >> + struct siw_device *sdev = pd->hdr.sdev; >> + int rv; >> + >> + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { >> + siw_dbg(sdev, "[PD %d]: too many mr's\n", OBJ_ID(pd)); >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + mr = siw_create_mr(sdev, NULL, 0, ULONG_MAX, rights); >> + if (!mr) { >> + rv = -ENOMEM; >> + goto err_out; >> + } >> + mr->mem.stag_valid = 1; >> + >> + mr->pd = pd; >> + siw_pd_get(pd); >> + >> + siw_dbg(sdev, "[PD %d], [MEM %d]: success\n", >> + OBJ_ID(pd), OBJ_ID(&mr->mem)); >> + >> + return &mr->base_mr; >> + >> +err_out: >> + atomic_dec(&sdev->num_mr); >> + >> + return ERR_PTR(rv); >> +} >> + >> +/* >> + * siw_create_srq() >> + * >> + * Create Shared Receive Queue of attributes @init_attrs >> + * within protection domain given by @base_pd. >> + * >> + * @base_pd: Base PD contained in siw PD. >> + * @init_attrs: SRQ init attributes. >> + * @udata: not used by siw. 
>> + */
>> +struct ib_srq *siw_create_srq(struct ib_pd *base_pd,
>> + struct ib_srq_init_attr *init_attrs,
>> + struct ib_udata *udata)
>> +{
>> + struct siw_srq *srq = NULL;
>> + struct ib_srq_attr *attrs = &init_attrs->attr;
>> + struct siw_pd *pd = to_siw_pd(base_pd);
>> + struct siw_device *sdev = pd->hdr.sdev;
>> +
>> + int kernel_verbs = base_pd->uobject ? 0 : 1;
>
>bool kernel_verbs = !udata;

Right. Thanks!

>
>> + memset(&uresp, 0, sizeof(uresp));
>> + ctx = to_siw_ctx(base_pd->uobject->context);
>
>.. and so on throughout.
>
>Jason
>
>