Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation

On 30/12/2021 20:14, Xiao Yang wrote:
> This patch implements RDMA Atomic Write operation for RC service.
>
> Signed-off-by: Xiao Yang <yangx.jy@xxxxxxxxxxx>
> ---
>   drivers/infiniband/sw/rxe/rxe_comp.c   |  4 +++
>   drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++++
>   drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
>   drivers/infiniband/sw/rxe/rxe_qp.c     |  3 +-
>   drivers/infiniband/sw/rxe/rxe_req.c    | 10 ++++--
>   drivers/infiniband/sw/rxe/rxe_resp.c   | 49 +++++++++++++++++++++-----
>   include/rdma/ib_pack.h                 |  2 ++
>   include/rdma/ib_verbs.h                |  2 ++
>   include/uapi/rdma/ib_user_verbs.h      |  2 ++
>   9 files changed, 81 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
> index d771ba8449a1..5a5c0c3501de 100644
> --- a/drivers/infiniband/sw/rxe/rxe_comp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_comp.c
> @@ -104,6 +104,7 @@ static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode)
>   	case IB_WR_LOCAL_INV:			return IB_WC_LOCAL_INV;
>   	case IB_WR_REG_MR:			return IB_WC_REG_MR;
>   	case IB_WR_BIND_MW:			return IB_WC_BIND_MW;
> +	case IB_WR_RDMA_ATOMIC_WRITE:		return IB_WC_RDMA_ATOMIC_WRITE;
>   
>   	default:
>   		return 0xff;
> @@ -256,6 +257,9 @@ static inline enum comp_state check_ack(struct rxe_qp *qp,
>   		if ((syn & AETH_TYPE_MASK) != AETH_ACK)
>   			return COMPST_ERROR;
>   
> +		if (wqe->wr.opcode == IB_WR_RDMA_ATOMIC_WRITE)
> +			return COMPST_WRITE_SEND;
> +
>   		fallthrough;
>   		/* (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE doesn't have an AETH)
>   		 */
> diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c
> index 3ef5a10a6efd..bd3639534d5a 100644
> --- a/drivers/infiniband/sw/rxe/rxe_opcode.c
> +++ b/drivers/infiniband/sw/rxe/rxe_opcode.c
> @@ -103,6 +103,12 @@ struct rxe_wr_opcode_info rxe_wr_opcode_info[] = {
>   			[IB_QPT_UC]	= WR_LOCAL_OP_MASK,
>   		},
>   	},
> +	[IB_WR_RDMA_ATOMIC_WRITE]                       = {
> +		.name   = "IB_WR_RDMA_ATOMIC_WRITE",
> +		.mask   = {
> +			[IB_QPT_RC]     = WR_INLINE_MASK | WR_ATOMIC_WRITE_MASK,
> +		},
> +	},
>   };
>   
>   struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
> @@ -379,6 +385,18 @@ struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
>   						+ RXE_IETH_BYTES,
>   		}
>   	},
> +	[IB_OPCODE_RC_RDMA_ATOMIC_WRITE]                        = {
> +		.name   = "IB_OPCODE_RC_RDMA_ATOMIC_WRITE",
> +		.mask   = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
> +				| RXE_ATOMIC_WRITE_MASK | RXE_START_MASK
> +				| RXE_END_MASK,
> +		.length = RXE_BTH_BYTES + RXE_RETH_BYTES,
> +		.offset = {
> +			[RXE_BTH]       = 0,
> +			[RXE_RETH]      = RXE_BTH_BYTES,
> +			[RXE_PAYLOAD]   = RXE_BTH_BYTES + RXE_RETH_BYTES,
> +		}
> +	},
>   
>   	/* UC */
>   	[IB_OPCODE_UC_SEND_FIRST]			= {
> diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h
> index 8f9aaaf260f2..a470e9b0b884 100644
> --- a/drivers/infiniband/sw/rxe/rxe_opcode.h
> +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h
> @@ -20,6 +20,7 @@ enum rxe_wr_mask {
>   	WR_READ_MASK			= BIT(3),
>   	WR_WRITE_MASK			= BIT(4),
>   	WR_LOCAL_OP_MASK		= BIT(5),
> +	WR_ATOMIC_WRITE_MASK		= BIT(7),
>   
>   	WR_READ_OR_WRITE_MASK		= WR_READ_MASK | WR_WRITE_MASK,
>   	WR_WRITE_OR_SEND_MASK		= WR_WRITE_MASK | WR_SEND_MASK,
> @@ -81,6 +82,8 @@ enum rxe_hdr_mask {
>   
>   	RXE_LOOPBACK_MASK	= BIT(NUM_HDR_TYPES + 12),
>   
> +	RXE_ATOMIC_WRITE_MASK   = BIT(NUM_HDR_TYPES + 14),
> +
>   	RXE_READ_OR_ATOMIC_MASK	= (RXE_READ_MASK | RXE_ATOMIC_MASK),
>   	RXE_WRITE_OR_SEND_MASK	= (RXE_WRITE_MASK | RXE_SEND_MASK),
>   	RXE_READ_OR_WRITE_MASK	= (RXE_READ_MASK | RXE_WRITE_MASK),
> diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
> index f3eec350f95c..22a1c4bcfa60 100644
> --- a/drivers/infiniband/sw/rxe/rxe_qp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_qp.c
> @@ -135,7 +135,8 @@ static void free_rd_atomic_resources(struct rxe_qp *qp)
>   
>   void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res)
>   {
> -	if (res->type == RXE_ATOMIC_MASK) {
> +	if (res->type == RXE_ATOMIC_MASK ||
> +			res->type == RXE_ATOMIC_WRITE_MASK) {
>   		kfree_skb(res->resp.skb);
>   	} else if (res->type == RXE_READ_MASK) {
>   		if (res->read.mr)
> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
> index 0c9d2af15f3d..f024fb6e67c9 100644
> --- a/drivers/infiniband/sw/rxe/rxe_req.c
> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
> @@ -240,6 +240,10 @@ static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
>   		else
>   			return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE :
>   				IB_OPCODE_RC_SEND_FIRST;
> +
> +	case IB_WR_RDMA_ATOMIC_WRITE:
> +		return IB_OPCODE_RC_RDMA_ATOMIC_WRITE;
> +
>   	case IB_WR_REG_MR:
>   	case IB_WR_LOCAL_INV:
>   		return opcode;
> @@ -463,7 +467,7 @@ static int finish_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
>   	if (err)
>   		return err;
>   
> -	if (pkt->mask & RXE_WRITE_OR_SEND_MASK) {
> +	if (pkt->mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) {
>   		if (wqe->wr.send_flags & IB_SEND_INLINE) {
>   			u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset];
>   
> @@ -680,13 +684,13 @@ int rxe_requester(void *arg)
>   	}
>   
>   	mask = rxe_opcode[opcode].mask;
> -	if (unlikely(mask & RXE_READ_OR_ATOMIC_MASK)) {
> +	if (unlikely(mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK))) {
>   		if (check_init_depth(qp, wqe))
>   			goto exit;
>   	}
>   
>   	mtu = get_mtu(qp);
> -	payload = (mask & RXE_WRITE_OR_SEND_MASK) ? wqe->dma.resid : 0;
> +	payload = (mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) ? wqe->dma.resid : 0;
>   	if (payload > mtu) {
>   		if (qp_type(qp) == IB_QPT_UD) {
>   			/* C10-93.1.1: If the total sum of all the buffer lengths specified for a
> diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
> index 738e6b6335e5..dc9af8b06bd1 100644
> --- a/drivers/infiniband/sw/rxe/rxe_resp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_resp.c
> @@ -258,7 +258,7 @@ static enum resp_states check_op_valid(struct rxe_qp *qp,
>   	case IB_QPT_RC:
>   		if (((pkt->mask & RXE_READ_MASK) &&
>   		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) ||
> -		    ((pkt->mask & RXE_WRITE_MASK) &&
> +		    ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) &&
>   		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) ||
>   		    ((pkt->mask & RXE_ATOMIC_MASK) &&
>   		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) {
> @@ -362,7 +362,7 @@ static enum resp_states check_resource(struct rxe_qp *qp,
>   		}
>   	}
>   
> -	if (pkt->mask & RXE_READ_OR_ATOMIC_MASK) {
> +	if (pkt->mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK)) {
>   		/* it is the requesters job to not send
>   		 * too many read/atomic ops, we just
>   		 * recycle the responder resource queue
> @@ -413,7 +413,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
>   	enum resp_states state;
>   	int access;
>   
> -	if (pkt->mask & RXE_READ_OR_WRITE_MASK) {
> +	if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
>   		if (pkt->mask & RXE_RETH_MASK) {
>   			qp->resp.va = reth_va(pkt);
>   			qp->resp.offset = 0;
> @@ -479,7 +479,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
>   		goto err;
>   	}
>   
> -	if (pkt->mask & RXE_WRITE_MASK)	 {
> +	if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
>   		if (resid > mtu) {
>   			if (pktlen != mtu || bth_pad(pkt)) {
>   				state = RESPST_ERR_LENGTH;
> @@ -591,6 +591,32 @@ static enum resp_states process_atomic(struct rxe_qp *qp,
>   	return ret;
>   }
>   
> +
> +/* Guarantee atomicity of atomic write operations at the machine level. */
> +static DEFINE_SPINLOCK(atomic_write_ops_lock);
> +
> +static enum resp_states process_atomic_write(struct rxe_qp *qp,
> +					     struct rxe_pkt_info *pkt)
> +{
> +	u64 *src, *dst;
> +	struct rxe_mr *mr = qp->resp.mr;
> +
> +	src = payload_addr(pkt);
> +
> +	dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, sizeof(u64));
> +	if (!dst || (uintptr_t)dst & 7)
> +		return RESPST_ERR_MISALIGNED_ATOMIC;
> +
> +	spin_lock_bh(&atomic_write_ops_lock);
> +	*dst = *src;
> +	spin_unlock_bh(&atomic_write_ops_lock);
> +
> +	/* decrease resp.resid to zero */
> +	qp->resp.resid -= sizeof(u64);
> +
> +	return RESPST_NONE;
> +}
> +
>   static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
>   					  struct rxe_pkt_info *pkt,
>   					  struct rxe_pkt_info *ack,
> @@ -801,6 +827,10 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
>   		err = process_atomic(qp, pkt);
>   		if (err)
>   			return err;
> +	} else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) {
> +		err = process_atomic_write(qp, pkt);
> +		if (err)
> +			return err;
>   	} else {
>   		/* Unreachable */
>   		WARN_ON_ONCE(1);
> @@ -967,9 +997,12 @@ static int send_resp(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
>   	struct sk_buff *skb;
>   	struct resp_res *res;
>   
> +	int opcode = pkt->mask & RXE_ATOMIC_MASK ?
> +				IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE :
> +				IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY;
> +
>   	skb = prepare_ack_packet(qp, pkt, &ack_pkt,
> -				 IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn,
> -				 syndrome);
> +				 opcode, 0, pkt->psn, syndrome);
>   	if (!skb) {
>   		rc = -ENOMEM;
>   		goto out;
> @@ -980,7 +1013,7 @@ static int send_resp(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
>   	rxe_advance_resp_resource(qp);
>   
>   	skb_get(skb);
> -	res->type = RXE_ATOMIC_MASK;
> +	res->type = pkt->mask & (RXE_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK);
>   	res->resp.skb = skb;
>   	res->first_psn = ack_pkt.psn;
>   	res->last_psn  = ack_pkt.psn;
> @@ -1003,7 +1036,7 @@ static enum resp_states acknowledge(struct rxe_qp *qp,
>   
>   	if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED)
>   		send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn);
> -	else if (pkt->mask & RXE_ATOMIC_MASK)
> +	else if (pkt->mask & (RXE_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK))
>   		send_resp(qp, pkt, AETH_ACK_UNLIMITED);
>   	else if (bth_ack(pkt))
>   		send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn);
> diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
> index a9162f25beaf..bd12e1049ef8 100644
> --- a/include/rdma/ib_pack.h
> +++ b/include/rdma/ib_pack.h
> @@ -84,6 +84,7 @@ enum {
>   	/* opcode 0x15 is reserved */
>   	IB_OPCODE_SEND_LAST_WITH_INVALIDATE         = 0x16,
>   	IB_OPCODE_SEND_ONLY_WITH_INVALIDATE         = 0x17,
> +	IB_OPCODE_RDMA_ATOMIC_WRITE                 = 0x19,
I don't think this follows the spec. The spec says:

ATOMIC WRITE BTH shall hold the Opcode = 0x1D (valid only for RC, RD
and XRC, reserved for unreliable transport services).

so the opcode here should be 0x1D rather than 0x19.

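A possible sketch of the corresponding change in include/rdma/ib_pack.h
(untested, and keeping the opcode name used in this patch):

	/* opcode 0x15 is reserved */
	IB_OPCODE_SEND_LAST_WITH_INVALIDATE         = 0x16,
	IB_OPCODE_SEND_ONLY_WITH_INVALIDATE         = 0x17,
	/* 0x1d per the ATOMIC WRITE spec text quoted above */
	IB_OPCODE_RDMA_ATOMIC_WRITE                 = 0x1d,

With IB_OPCODE_RC being 0x00, IB_OPCODE(RC, RDMA_ATOMIC_WRITE) would then
expand to 0x1d on the wire for RC, which matches the spec.
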
Thanks
Zhijian

>   
>   	/* real constants follow -- see comment about above IB_OPCODE()
>   	   macro for more details */
> @@ -112,6 +113,7 @@ enum {
>   	IB_OPCODE(RC, FETCH_ADD),
>   	IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
>   	IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
> +	IB_OPCODE(RC, RDMA_ATOMIC_WRITE),
>   
>   	/* UC */
>   	IB_OPCODE(UC, SEND_FIRST),
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 6e9ad656ecb7..949b3586d35b 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -971,6 +971,7 @@ enum ib_wc_opcode {
>   	IB_WC_REG_MR,
>   	IB_WC_MASKED_COMP_SWAP,
>   	IB_WC_MASKED_FETCH_ADD,
> +	IB_WC_RDMA_ATOMIC_WRITE = IB_UVERBS_WC_RDMA_ATOMIC_WRITE,
>   /*
>    * Set value of IB_WC_RECV so consumers can test if a completion is a
>    * receive by testing (opcode & IB_WC_RECV).
> @@ -1311,6 +1312,7 @@ enum ib_wr_opcode {
>   		IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
>   	IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
>   		IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
> +	IB_WR_RDMA_ATOMIC_WRITE = IB_UVERBS_WR_RDMA_ATOMIC_WRITE,
>   
>   	/* These are kernel only and can not be issued by userspace */
>   	IB_WR_REG_MR = 0x20,
> diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
> index 7ee73a0652f1..3b0b509fb96f 100644
> --- a/include/uapi/rdma/ib_user_verbs.h
> +++ b/include/uapi/rdma/ib_user_verbs.h
> @@ -466,6 +466,7 @@ enum ib_uverbs_wc_opcode {
>   	IB_UVERBS_WC_BIND_MW = 5,
>   	IB_UVERBS_WC_LOCAL_INV = 6,
>   	IB_UVERBS_WC_TSO = 7,
> +	IB_UVERBS_WC_RDMA_ATOMIC_WRITE = 9,
>   };
>   
>   struct ib_uverbs_wc {
> @@ -784,6 +785,7 @@ enum ib_uverbs_wr_opcode {
>   	IB_UVERBS_WR_RDMA_READ_WITH_INV = 11,
>   	IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12,
>   	IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13,
> +	IB_UVERBS_WR_RDMA_ATOMIC_WRITE = 15,
>   	/* Review enum ib_wr_opcode before modifying this */
>   };
>   



