Hey Jason, > -----Original Message----- > From: linux-rdma-owner@xxxxxxxxxxxxxxx <linux-rdma- > owner@xxxxxxxxxxxxxxx> On Behalf Of Jason Gunthorpe > Sent: Thursday, July 26, 2018 12:21 PM > To: Potnuri Bharat Teja <bharat@xxxxxxxxxxx> > Cc: dledford@xxxxxxxxxx; swise@xxxxxxxxxxxxxxxxxxxxx; > rajur@xxxxxxxxxxx; linux-rdma@xxxxxxxxxxxxxxx > Subject: Re: [PATCH for-next v2 1/2] iw_cxgb4: RDMA write with immediate > support > > On Tue, Jul 17, 2018 at 01:08:17PM +0530, Potnuri Bharat Teja wrote: > > Adds iw_cxgb4 functionality to support RDMA_WRITE_WITH_IMMEDATE > opcode. > > > > Signed-off-by: Potnuri Bharat Teja <bharat@xxxxxxxxxxx> > > Signed-off-by: Steve Wise <swise@xxxxxxxxxxxxxxxxxxxxx> > > drivers/infiniband/hw/cxgb4/cq.c | 23 ++++++++++++++++--- > > drivers/infiniband/hw/cxgb4/qp.c | 37 ++++++++++++++++++++++++-- > ----- > > drivers/infiniband/hw/cxgb4/t4.h | 16 ++++++++++++- > > drivers/infiniband/hw/cxgb4/t4fw_ri_api.h | 18 ++++++++++++--- > > include/uapi/rdma/cxgb4-abi.h | 3 ++- > > 5 files changed, 81 insertions(+), 16 deletions(-) > > > > diff --git a/drivers/infiniband/hw/cxgb4/cq.c > b/drivers/infiniband/hw/cxgb4/cq.c > > index 64f70800945b..5792675150e0 100644 > > +++ b/drivers/infiniband/hw/cxgb4/cq.c > > @@ -816,15 +816,32 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, > struct ib_wc *wc) > > wc->byte_len = CQE_LEN(&cqe); > > else > > wc->byte_len = 0; > > - wc->opcode = IB_WC_RECV; > > - if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV || > > - CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) { > > + > > + switch (CQE_OPCODE(&cqe)) { > > + case FW_RI_SEND: > > + wc->opcode = IB_WC_RECV; > > + break; > > + case FW_RI_SEND_WITH_INV: > > + case FW_RI_SEND_WITH_SE_INV: > > + wc->opcode = IB_WC_RECV; > > wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe); > > wc->wc_flags |= IB_WC_WITH_INVALIDATE; > > c4iw_invalidate_mr(qhp->rhp, wc- > >ex.invalidate_rkey); > > + break; > > + case FW_RI_WRITE_IMMEDIATE: > > + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; > > + wc->ex.imm_data = CQE_IMM_DATA(&cqe); > > + wc->wc_flags |= IB_WC_WITH_IMM; > > + break; > > + default: > > + pr_err("Unexpected opcode %d in the CQE received > for QPID=0x%0x\n", > > + CQE_OPCODE(&cqe), CQE_QPID(&cqe)); > > + ret = -EINVAL; > > + goto out; > > } > > } else { > > switch (CQE_OPCODE(&cqe)) { > > + case FW_RI_WRITE_IMMEDIATE: > > case FW_RI_RDMA_WRITE: > > wc->opcode = IB_WC_RDMA_WRITE; > > break; > > diff --git a/drivers/infiniband/hw/cxgb4/qp.c > b/drivers/infiniband/hw/cxgb4/qp.c > > index 08dc555942af..88bd67d7dc77 100644 > > +++ b/drivers/infiniband/hw/cxgb4/qp.c > > @@ -555,7 +555,15 @@ static int build_rdma_write(struct t4_sq *sq, > union t4_wr *wqe, > > > > if (wr->num_sge > T4_MAX_SEND_SGE) > > return -EINVAL; > > - wqe->write.r2 = 0; > > + > > + /* > > + * iWARP protocol supports 64 bit immediate data but rdma api > > + * limits it to 32bit. > > + */ > > + if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) > > + wqe->write.iw_imm_data.ib_imm_data.imm_data32 = wr- > >ex.imm_data; > > + else > > + wqe->write.iw_imm_data.ib_imm_data.imm_data32 = 0; > > wqe->write.stag_sink = cpu_to_be32(rdma_wr(wr)->rkey); > > wqe->write.to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr); > > if (wr->num_sge) { > > @@ -846,6 +854,9 @@ static int ib_to_fw_opcode(int ib_opcode) > > case IB_WR_RDMA_WRITE: > > opcode = FW_RI_RDMA_WRITE; > > break; > > + case IB_WR_RDMA_WRITE_WITH_IMM: > > + opcode = FW_RI_WRITE_IMMEDIATE; > > + break; > > case IB_WR_RDMA_READ: > > case IB_WR_RDMA_READ_WITH_INV: > > opcode = FW_RI_READ_REQ; > > @@ -964,6 +975,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > enum fw_wr_opcodes fw_opcode = 0; > > enum fw_ri_wr_flags fw_flags; > > struct c4iw_qp *qhp; > > + struct c4iw_dev *rhp; > > union t4_wr *wqe = NULL; > > u32 num_wrs; > > struct t4_swsqe *swsqe; > > @@ -971,6 +983,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > u16 idx = 0; > > > > qhp = to_c4iw_qp(ibqp); > > + rhp = qhp->rhp; > > spin_lock_irqsave(&qhp->lock, flag); > > > > /* > > @@ -1015,6 +1028,13 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > swsqe->opcode = FW_RI_SEND_WITH_INV; > > err = build_rdma_send(&qhp->wq.sq, wqe, wr, > &len16); > > break; > > + case IB_WR_RDMA_WRITE_WITH_IMM: > > + if (unlikely(!rhp->rdev.lldi.write_w_imm_support)) { > > + err = -EINVAL; > > + break; > > + } > > + fw_flags |= > FW_RI_RDMA_WRITE_WITH_IMMEDIATE; > > + /*FALLTHROUGH*/ > > case IB_WR_RDMA_WRITE: > > fw_opcode = FW_RI_RDMA_WRITE_WR; > > swsqe->opcode = FW_RI_RDMA_WRITE; > > @@ -1025,8 +1045,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > fw_opcode = FW_RI_RDMA_READ_WR; > > swsqe->opcode = FW_RI_READ_REQ; > > if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) { > > - c4iw_invalidate_mr(qhp->rhp, > > - wr->sg_list[0].lkey); > > + c4iw_invalidate_mr(rhp, wr->sg_list[0].lkey); > > fw_flags = FW_RI_RDMA_READ_INVALIDATE; > > } else { > > fw_flags = 0; > > @@ -1042,7 +1061,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > struct c4iw_mr *mhp = to_c4iw_mr(reg_wr(wr)->mr); > > > > swsqe->opcode = FW_RI_FAST_REGISTER; > > - if (qhp->rhp->rdev.lldi.fr_nsmr_tpte_wr_support && > > + if (rhp->rdev.lldi.fr_nsmr_tpte_wr_support && > > !mhp->attr.state && mhp->mpl_len <= 2) { > > fw_opcode = FW_RI_FR_NSMR_TPTE_WR; > > build_tpte_memreg(&wqe->fr_tpte, > reg_wr(wr), > > @@ -1051,7 +1070,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > fw_opcode = FW_RI_FR_NSMR_WR; > > err = build_memreg(&qhp->wq.sq, wqe, > reg_wr(wr), > > mhp, &len16, > > - qhp->rhp- > >rdev.lldi.ulptx_memwrite_dsgl); > > + rhp->rdev.lldi.ulptx_memwrite_dsgl); > > if (err) > > break; > > } > > @@ -1064,7 +1083,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > fw_opcode = FW_RI_INV_LSTAG_WR; > > swsqe->opcode = FW_RI_LOCAL_INV; > > err = build_inv_stag(wqe, wr, &len16); > > - c4iw_invalidate_mr(qhp->rhp, wr- > >ex.invalidate_rkey); > > + c4iw_invalidate_mr(rhp, wr->ex.invalidate_rkey); > > break; > > default: > > pr_warn("%s post of type=%d TBD!\n", __func__, > > @@ -1083,7 +1102,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > swsqe->wr_id = wr->wr_id; > > if (c4iw_wr_log) { > > swsqe->sge_ts = cxgb4_read_sge_timestamp( > > - qhp->rhp->rdev.lldi.ports[0]); > > + rhp->rdev.lldi.ports[0]); > > swsqe->host_time = ktime_get(); > > } > > > > @@ -1097,7 +1116,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > t4_sq_produce(&qhp->wq, len16); > > idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); > > } > > - if (!qhp->rhp->rdev.status_page->db_off) { > > + if (!rhp->rdev.status_page->db_off) { > > t4_ring_sq_db(&qhp->wq, idx, wqe); > > spin_unlock_irqrestore(&qhp->lock, flag); > > } else { > > @@ -2092,6 +2111,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, > struct ib_qp_init_attr *attrs, > > uresp.flags = C4IW_QPF_ONCHIP; > > } else > > uresp.flags = 0; > > + if (rhp->rdev.lldi.write_w_imm_support) > > + uresp.flags |= C4IW_QPF_WRITE_W_IMM; > > uresp.qid_mask = rhp->rdev.qpmask; > > uresp.sqid = qhp->wq.sq.qid; > > uresp.sq_size = qhp->wq.sq.size; > > diff --git a/drivers/infiniband/hw/cxgb4/t4.h > b/drivers/infiniband/hw/cxgb4/t4.h > > index 11d55fc2ded7..3fdcc445ff16 100644 > > +++ b/drivers/infiniband/hw/cxgb4/t4.h > > @@ -190,7 +190,19 @@ struct t4_cqe { > > __be32 abs_rqe_idx; > > } srcqe; > > struct { > > - __be64 imm_data; > > + __be32 mo; > > + __be32 msn; > > + /* > > + * Use union for immediate data to be consistent with > > + * stack's 32 bit data and iWARP spec's 64 bit data. > > + */ > > + union { > > + struct { > > + u32 reserved; > > + __be32 imm_data32; > > + } ib_imm_data; > > + __be64 imm_data64; > > You guys should think carefully about this as this choice to put the > 32 bit version at the end becomes a permanent wire protocol ABI for > iWarp. > > The definition of 'imm data' is a memory image of what is put into the > packet, so placing the 32 bit version of this at the end of the array > on the wire seems like a strange choice. > > If we extend verbs, it will continue to be a memory image semantic, > and I would expect the 32 bits version to be the upper chunk of the > array, not the lower chunk. > Are you assuming an LE host? Which brings up a point that 'imm data' is not very platform independent. But I guess putting it in the upper chunk is fine. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html