On 30/12/2021 20:14, Xiao Yang wrote:
> This patch implements RDMA Atomic Write operation for RC service.
>
> Signed-off-by: Xiao Yang <yangx.jy@xxxxxxxxxxx>
> ---
>  drivers/infiniband/sw/rxe/rxe_comp.c   |  4 +++
>  drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++++
>  drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
>  drivers/infiniband/sw/rxe/rxe_qp.c     |  3 +-
>  drivers/infiniband/sw/rxe/rxe_req.c    | 10 ++++--
>  drivers/infiniband/sw/rxe/rxe_resp.c   | 49 +++++++++++++++++++++-----
>  include/rdma/ib_pack.h                 |  2 ++
>  include/rdma/ib_verbs.h                |  2 ++
>  include/uapi/rdma/ib_user_verbs.h      |  2 ++
>  9 files changed, 81 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
> index d771ba8449a1..5a5c0c3501de 100644
> --- a/drivers/infiniband/sw/rxe/rxe_comp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_comp.c
> @@ -104,6 +104,7 @@ static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode)
>  	case IB_WR_LOCAL_INV:			return IB_WC_LOCAL_INV;
>  	case IB_WR_REG_MR:			return IB_WC_REG_MR;
>  	case IB_WR_BIND_MW:			return IB_WC_BIND_MW;
> +	case IB_WR_RDMA_ATOMIC_WRITE:		return IB_WC_RDMA_ATOMIC_WRITE;
>
>  	default:
>  		return 0xff;
> @@ -256,6 +257,9 @@ static inline enum comp_state check_ack(struct rxe_qp *qp,
>  		if ((syn & AETH_TYPE_MASK) != AETH_ACK)
>  			return COMPST_ERROR;
>
> +		if (wqe->wr.opcode == IB_WR_RDMA_ATOMIC_WRITE)
> +			return COMPST_WRITE_SEND;
> +
>  		fallthrough;
>  		/* (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE doesn't have an AETH)
>  		 */
> diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c
> index 3ef5a10a6efd..bd3639534d5a 100644
> --- a/drivers/infiniband/sw/rxe/rxe_opcode.c
> +++ b/drivers/infiniband/sw/rxe/rxe_opcode.c
> @@ -103,6 +103,12 @@ struct rxe_wr_opcode_info rxe_wr_opcode_info[] = {
>  			[IB_QPT_UC]	= WR_LOCAL_OP_MASK,
>  		},
>  	},
> +	[IB_WR_RDMA_ATOMIC_WRITE]	= {
> +		.name	= "IB_WR_RDMA_ATOMIC_WRITE",
> +		.mask	= {
> +			[IB_QPT_RC]	= WR_INLINE_MASK | WR_ATOMIC_WRITE_MASK,
> +		},
> +	},
>  };
>
>  struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
> @@ -379,6 +385,18 @@ struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
>  					+ RXE_IETH_BYTES,
>  		}
>  	},
> +	[IB_OPCODE_RC_RDMA_ATOMIC_WRITE]	= {
> +		.name	= "IB_OPCODE_RC_RDMA_ATOMIC_WRITE",
> +		.mask	= RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
> +				| RXE_ATOMIC_WRITE_MASK | RXE_START_MASK
> +				| RXE_END_MASK,
> +		.length = RXE_BTH_BYTES + RXE_RETH_BYTES,
> +		.offset = {
> +			[RXE_BTH]	= 0,
> +			[RXE_RETH]	= RXE_BTH_BYTES,
> +			[RXE_PAYLOAD]	= RXE_BTH_BYTES + RXE_RETH_BYTES,
> +		}
> +	},
>
>  	/* UC */
>  	[IB_OPCODE_UC_SEND_FIRST]	= {
> diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h
> index 8f9aaaf260f2..a470e9b0b884 100644
> --- a/drivers/infiniband/sw/rxe/rxe_opcode.h
> +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h
> @@ -20,6 +20,7 @@ enum rxe_wr_mask {
>  	WR_READ_MASK			= BIT(3),
>  	WR_WRITE_MASK			= BIT(4),
>  	WR_LOCAL_OP_MASK		= BIT(5),
> +	WR_ATOMIC_WRITE_MASK		= BIT(7),
>
>  	WR_READ_OR_WRITE_MASK		= WR_READ_MASK | WR_WRITE_MASK,
>  	WR_WRITE_OR_SEND_MASK		= WR_WRITE_MASK | WR_SEND_MASK,
> @@ -81,6 +82,8 @@ enum rxe_hdr_mask {
>
>  	RXE_LOOPBACK_MASK	= BIT(NUM_HDR_TYPES + 12),
>
> +	RXE_ATOMIC_WRITE_MASK	= BIT(NUM_HDR_TYPES + 14),
> +
>  	RXE_READ_OR_ATOMIC_MASK	= (RXE_READ_MASK | RXE_ATOMIC_MASK),
>  	RXE_WRITE_OR_SEND_MASK	= (RXE_WRITE_MASK | RXE_SEND_MASK),
>  	RXE_READ_OR_WRITE_MASK	= (RXE_READ_MASK | RXE_WRITE_MASK),
> diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
> index f3eec350f95c..22a1c4bcfa60 100644
> --- a/drivers/infiniband/sw/rxe/rxe_qp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_qp.c
> @@ -135,7 +135,8 @@ static void free_rd_atomic_resources(struct rxe_qp *qp)
>
>  void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res)
>  {
> -	if (res->type == RXE_ATOMIC_MASK) {
> +	if (res->type == RXE_ATOMIC_MASK ||
> +	    res->type == RXE_ATOMIC_WRITE_MASK) {
>  		kfree_skb(res->resp.skb);
>  	} else if (res->type == RXE_READ_MASK) {
>  		if (res->read.mr)
> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
> index 0c9d2af15f3d..f024fb6e67c9 100644
> --- a/drivers/infiniband/sw/rxe/rxe_req.c
> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
> @@ -240,6 +240,10 @@ static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
>  		else
>  			return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE :
>  				IB_OPCODE_RC_SEND_FIRST;
> +
> +	case IB_WR_RDMA_ATOMIC_WRITE:
> +		return IB_OPCODE_RC_RDMA_ATOMIC_WRITE;
> +
>  	case IB_WR_REG_MR:
>  	case IB_WR_LOCAL_INV:
>  		return opcode;
> @@ -463,7 +467,7 @@ static int finish_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
>  	if (err)
>  		return err;
>
> -	if (pkt->mask & RXE_WRITE_OR_SEND_MASK) {
> +	if (pkt->mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) {
>  		if (wqe->wr.send_flags & IB_SEND_INLINE) {
>  			u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset];
>
> @@ -680,13 +684,13 @@ int rxe_requester(void *arg)
>  	}
>
>  	mask = rxe_opcode[opcode].mask;
> -	if (unlikely(mask & RXE_READ_OR_ATOMIC_MASK)) {
> +	if (unlikely(mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK))) {
>  		if (check_init_depth(qp, wqe))
>  			goto exit;
>  	}
>
>  	mtu = get_mtu(qp);
> -	payload = (mask & RXE_WRITE_OR_SEND_MASK) ? wqe->dma.resid : 0;
> +	payload = (mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) ? wqe->dma.resid : 0;
>  	if (payload > mtu) {
>  		if (qp_type(qp) == IB_QPT_UD) {
>  			/* C10-93.1.1: If the total sum of all the buffer lengths specified for a
> diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
> index 738e6b6335e5..dc9af8b06bd1 100644
> --- a/drivers/infiniband/sw/rxe/rxe_resp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_resp.c
> @@ -258,7 +258,7 @@ static enum resp_states check_op_valid(struct rxe_qp *qp,
>  	case IB_QPT_RC:
>  		if (((pkt->mask & RXE_READ_MASK) &&
>  		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) ||
> -		    ((pkt->mask & RXE_WRITE_MASK) &&
> +		    ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) &&
>  		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) ||
>  		    ((pkt->mask & RXE_ATOMIC_MASK) &&
>  		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) {
> @@ -362,7 +362,7 @@ static enum resp_states check_resource(struct rxe_qp *qp,
>  		}
>  	}
>
> -	if (pkt->mask & RXE_READ_OR_ATOMIC_MASK) {
> +	if (pkt->mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK)) {
>  		/* it is the requesters job to not send
>  		 * too many read/atomic ops, we just
>  		 * recycle the responder resource queue
> @@ -413,7 +413,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
>  	enum resp_states state;
>  	int access;
>
> -	if (pkt->mask & RXE_READ_OR_WRITE_MASK) {
> +	if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
>  		if (pkt->mask & RXE_RETH_MASK) {
>  			qp->resp.va = reth_va(pkt);
>  			qp->resp.offset = 0;
> @@ -479,7 +479,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
>  		goto err;
>  	}
>
> -	if (pkt->mask & RXE_WRITE_MASK) {
> +	if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
>  		if (resid > mtu) {
>  			if (pktlen != mtu || bth_pad(pkt)) {
>  				state = RESPST_ERR_LENGTH;
> @@ -591,6 +591,32 @@ static enum resp_states process_atomic(struct rxe_qp *qp,
>  	return ret;
>  }
>
> +
> +/* Guarantee atomicity of atomic write operations at the machine level. */
> +static DEFINE_SPINLOCK(atomic_write_ops_lock);
> +
> +static enum resp_states process_atomic_write(struct rxe_qp *qp,
> +					     struct rxe_pkt_info *pkt)
> +{
> +	u64 *src, *dst;
> +	struct rxe_mr *mr = qp->resp.mr;
> +
> +	src = payload_addr(pkt);
> +
> +	dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, sizeof(u64));
> +	if (!dst || (uintptr_t)dst & 7)
> +		return RESPST_ERR_MISALIGNED_ATOMIC;
> +
> +	spin_lock_bh(&atomic_write_ops_lock);
> +	*dst = *src;
> +	spin_unlock_bh(&atomic_write_ops_lock);
> +
> +	/* decrease resp.resid to zero */
> +	qp->resp.resid -= sizeof(u64);
> +
> +	return RESPST_NONE;
> +}
> +
>  static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
>  					  struct rxe_pkt_info *pkt,
>  					  struct rxe_pkt_info *ack,
> @@ -801,6 +827,10 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
>  		err = process_atomic(qp, pkt);
>  		if (err)
>  			return err;
> +	} else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) {
> +		err = process_atomic_write(qp, pkt);
> +		if (err)
> +			return err;
>  	} else {
>  		/* Unreachable */
>  		WARN_ON_ONCE(1);
> @@ -967,9 +997,12 @@ static int send_resp(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
>  	struct sk_buff *skb;
>  	struct resp_res *res;
>
> +	int opcode = pkt->mask & RXE_ATOMIC_MASK ?
> +			IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE :
> +			IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY;
> +
>  	skb = prepare_ack_packet(qp, pkt, &ack_pkt,
> -				 IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn,
> -				 syndrome);
> +				 opcode, 0, pkt->psn, syndrome);
>  	if (!skb) {
>  		rc = -ENOMEM;
>  		goto out;
> @@ -980,7 +1013,7 @@ static int send_resp(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
>  	rxe_advance_resp_resource(qp);
>
>  	skb_get(skb);
> -	res->type = RXE_ATOMIC_MASK;
> +	res->type = pkt->mask & (RXE_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK);
>  	res->resp.skb = skb;
>  	res->first_psn = ack_pkt.psn;
>  	res->last_psn = ack_pkt.psn;
> @@ -1003,7 +1036,7 @@ static enum resp_states acknowledge(struct rxe_qp *qp,
>
>  	if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED)
>  		send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn);
> -	else if (pkt->mask & RXE_ATOMIC_MASK)
> +	else if (pkt->mask & (RXE_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK))
>  		send_resp(qp, pkt, AETH_ACK_UNLIMITED);
>  	else if (bth_ack(pkt))
>  		send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn);
> diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
> index a9162f25beaf..bd12e1049ef8 100644
> --- a/include/rdma/ib_pack.h
> +++ b/include/rdma/ib_pack.h
> @@ -84,6 +84,7 @@ enum {
>  	/* opcode 0x15 is reserved */
>  	IB_OPCODE_SEND_LAST_WITH_INVALIDATE	= 0x16,
>  	IB_OPCODE_SEND_ONLY_WITH_INVALIDATE	= 0x17,
> +	IB_OPCODE_RDMA_ATOMIC_WRITE		= 0x19,

I think you didn't follow the SPEC.

SPEC: ATOMIC WRITE BTH shall hold the Opcode = 0x1D (valid only for RC,
RD and XRC, reserved for unreliable transport services).

Thanks
Zhijian

>
>  	/* real constants follow -- see comment about above IB_OPCODE()
>  	   macro for more details */
> @@ -112,6 +113,7 @@ enum {
>  	IB_OPCODE(RC, FETCH_ADD),
>  	IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
>  	IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
> +	IB_OPCODE(RC, RDMA_ATOMIC_WRITE),
>
>  	/* UC */
>  	IB_OPCODE(UC, SEND_FIRST),
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 6e9ad656ecb7..949b3586d35b 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -971,6 +971,7 @@ enum ib_wc_opcode {
>  	IB_WC_REG_MR,
>  	IB_WC_MASKED_COMP_SWAP,
>  	IB_WC_MASKED_FETCH_ADD,
> +	IB_WC_RDMA_ATOMIC_WRITE = IB_UVERBS_WC_RDMA_ATOMIC_WRITE,
>  	/*
>  	 * Set value of IB_WC_RECV so consumers can test if a completion is a
>  	 * receive by testing (opcode & IB_WC_RECV).
> @@ -1311,6 +1312,7 @@ enum ib_wr_opcode {
>  		IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
>  	IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
>  		IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
> +	IB_WR_RDMA_ATOMIC_WRITE = IB_UVERBS_WR_RDMA_ATOMIC_WRITE,
>
>  	/* These are kernel only and can not be issued by userspace */
>  	IB_WR_REG_MR = 0x20,
> diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
> index 7ee73a0652f1..3b0b509fb96f 100644
> --- a/include/uapi/rdma/ib_user_verbs.h
> +++ b/include/uapi/rdma/ib_user_verbs.h
> @@ -466,6 +466,7 @@ enum ib_uverbs_wc_opcode {
>  	IB_UVERBS_WC_BIND_MW = 5,
>  	IB_UVERBS_WC_LOCAL_INV = 6,
>  	IB_UVERBS_WC_TSO = 7,
> +	IB_UVERBS_WC_RDMA_ATOMIC_WRITE = 9,
>  };
>
>  struct ib_uverbs_wc {
> @@ -784,6 +785,7 @@ enum ib_uverbs_wr_opcode {
>  	IB_UVERBS_WR_RDMA_READ_WITH_INV = 11,
>  	IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12,
>  	IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13,
> +	IB_UVERBS_WR_RDMA_ATOMIC_WRITE = 15,
>  	/* Review enum ib_wr_opcode before modifying this */
>  };
>
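BTW, to spell out why the opcode issue above would be a one-line fix: the
per-transport constants in ib_pack.h are generated by the IB_OPCODE()
macro as "transport base + common opcode", and the RC base is 0x00, so
correcting the shared constant also corrects the RC wire opcode. A
sketch, assuming the 0x1D value quoted above:

	/* already in ib_pack.h, repeated here only for context: */
	#define IB_OPCODE(transport, op) \
		IB_OPCODE_ ## transport ## _ ## op = \
			IB_OPCODE_ ## transport + IB_OPCODE_ ## op

	/* hypothetical fixup, untested: */
	enum {
		IB_OPCODE_RDMA_ATOMIC_WRITE = 0x1d,
	};

	/*
	 * IB_OPCODE(RC, RDMA_ATOMIC_WRITE) then expands to
	 * IB_OPCODE_RC + IB_OPCODE_RDMA_ATOMIC_WRITE = 0x00 + 0x1d,
	 * i.e. 0x1d in the BTH on the wire, as the spec asks.
	 */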
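Independent of the opcode value, here is a rough, untested sketch of how
a kernel ULP might post the new WR once this lands. It assumes the WR
reuses struct ib_rdma_wr for the RETH fields the way RDMA WRITE does
(which would match the RXE_RETH_MASK handling in rxe_req.c);
post_atomic_write() and its qp/remote_addr/rkey/val parameters are
made-up names. Since the patch marks the WR inline-capable
(WR_INLINE_MASK) with a fixed 8-byte payload, the sketch posts it inline:

	#include <rdma/ib_verbs.h>

	static int post_atomic_write(struct ib_qp *qp, u64 remote_addr,
				     u32 rkey, u64 *val)
	{
		/* single 8-byte source buffer; lkey is not needed for inline */
		struct ib_sge sge = {
			.addr	= (uintptr_t)val,
			.length	= sizeof(*val),
		};
		struct ib_rdma_wr wr = {
			.wr = {
				.opcode		= IB_WR_RDMA_ATOMIC_WRITE,
				.send_flags	= IB_SEND_INLINE | IB_SEND_SIGNALED,
				.sg_list	= &sge,
				.num_sge	= 1,
			},
			/* responder rejects a misaligned destination
			 * (RESPST_ERR_MISALIGNED_ATOMIC), so remote_addr
			 * must be 8-byte aligned */
			.remote_addr	= remote_addr,
			.rkey		= rkey,
		};
		const struct ib_send_wr *bad_wr;

		return ib_post_send(qp, &wr.wr, &bad_wr);
	}

If I read the wr_to_wc_opcode() hunk right, the completion for such a WR
then surfaces as IB_WC_RDMA_ATOMIC_WRITE.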