Re: [RFC PATCH] RDMA/siw: Experimental e2e negotiation of GSO usage.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wednesday, April 04/15/20, 2020 at 11:59:21 +0000, Bernard Metzler wrote:
Hi Bernard,

The attached patches enables the GSO negotiation code in SIW with
few modifications, and also allows hardware iwarp drivers to advertise
their max length(in 16/32/64KB granularity) that they can accept.
The logic is almost similar to how TCP SYN MSS announcements works while
3-way handshake.

Please see if this approach works better for softiwarp <=> hardiwarp
case.

Thanks,
Krishna. 
diff --git a/drivers/infiniband/sw/siw/iwarp.h b/drivers/infiniband/sw/siw/iwarp.h
index e8a04d9c89cb..fa7d93be5d18 100644
--- a/drivers/infiniband/sw/siw/iwarp.h
+++ b/drivers/infiniband/sw/siw/iwarp.h
@@ -18,12 +18,34 @@
 #define MPA_KEY_REQ "MPA ID Req Frame"
 #define MPA_KEY_REP "MPA ID Rep Frame"
 #define MPA_IRD_ORD_MASK 0x3fff
+#define MPA_FPDU_LEN_MASK 0x000c
 
 struct mpa_rr_params {
 	__be16 bits;
 	__be16 pd_len;
 };
 
+/*
+ * MPA reserved bits[4:5] are used to advertise the maximum FPDU length that an
+ * endpoint is willing to accept, like TCP SYN MSS announcement. This is an
+ * experimental enhancement to RDMA connection establishment. Both local and
+ * remote endpoints must use these flags, if at least, one endpoint wants to
+ * advertise it's capability to receive large FPDU length.
+ * As SIW driver sits on top of TCP/IP stack it can use GSO for sending large
+ * FPDUs(upto 64K - 1, capped due to 16bit MPA len), thus improves performance.
+ * Making use of bits[4:5] backwards compatibility with the original MPA Frame
+ * format, IE, rules defined in RFC5044, regarding FDPU length, should be
+ * followed when these bits are zero.
+ * TODO make this experimental feature tunable via module params, as the remote
+ * endpoint might also be using these reserve bits for other experiments.
+ */
+enum {
+	MPA_FPDU_LEN_DEFAULT = cpu_to_be16(0x0000),
+	MPA_FPDU_LEN_16K = cpu_to_be16(0x0400),
+	MPA_FPDU_LEN_32K = cpu_to_be16(0x0800),
+	MPA_FPDU_LEN_64K = cpu_to_be16(0x0c00)
+};
+
 /*
  * MPA request/response header bits & fields
  */
@@ -32,7 +54,6 @@ enum {
 	MPA_RR_FLAG_CRC = cpu_to_be16(0x4000),
 	MPA_RR_FLAG_REJECT = cpu_to_be16(0x2000),
 	MPA_RR_FLAG_ENHANCED = cpu_to_be16(0x1000),
-	MPA_RR_FLAG_GSO_EXP = cpu_to_be16(0x0800),
 	MPA_RR_MASK_REVISION = cpu_to_be16(0x00ff)
 };
 
diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
index dba4535494ab..a8e08f675cf4 100644
--- a/drivers/infiniband/sw/siw/siw.h
+++ b/drivers/infiniband/sw/siw/siw.h
@@ -415,7 +415,7 @@ struct siw_iwarp_tx {
 	u8 orq_fence : 1; /* ORQ full or Send fenced */
 	u8 in_syscall : 1; /* TX out of user context */
 	u8 zcopy_tx : 1; /* Use TCP_SENDPAGE if possible */
-	u8 gso_seg_limit; /* Maximum segments for GSO, 0 = unbound */
+	u16 large_fpdulen; /* max outbound FPDU len, capped by 16bit MPA len */
 
 	u16 fpdu_len; /* len of FPDU to tx */
 	unsigned int tcp_seglen; /* remaining tcp seg space */
@@ -505,7 +505,6 @@ struct iwarp_msg_info {
 
 /* Global siw parameters. Currently set in siw_main.c */
 extern const bool zcopy_tx;
-extern const bool try_gso;
 extern const bool loopback_enabled;
 extern const bool mpa_crc_required;
 extern const bool mpa_crc_strict;
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index 8c1931a57f4a..abeed777006f 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -712,6 +712,38 @@ static int siw_proc_mpareq(struct siw_cep *cep)
 	return -EOPNOTSUPP;
 }
 
+/*
+ * Sets/limits max outbound FPDU length based on the length advertised
+ * by the peer.
+ */
+static void siw_set_outbound_fpdulen(struct siw_cep *cep,
+				     struct mpa_rr_params *params,
+				     u16 *outbound_fpdulen)
+{
+	switch (params->bits & MPA_FPDU_LEN_MASK) {
+	case MPA_FPDU_LEN_16K:
+		*outbound_fpdulen = SZ_16K - 1;
+		break;
+	case MPA_FPDU_LEN_32K:
+		*outbound_fpdulen = SZ_32K - 1;
+		break;
+	case MPA_FPDU_LEN_64K:
+		*outbound_fpdulen = SZ_64K - 1;
+		break;
+	default:
+		WARN(1,
+		  "Peer advertised invalid FPDU len:0x%x, proceeding w/o GSO\n",
+		  (params->bits & MPA_FPDU_LEN_MASK) & 0xffff);
+
+		/* fpdulen '0' here disables sending large FPDUs */
+		*outbound_fpdulen = 0;
+		return;
+	}
+	siw_dbg_cep(cep, "Max outbound FPDU length set to: %d\n",
+			*outbound_fpdulen);
+	return;
+}
+
 static int siw_proc_mpareply(struct siw_cep *cep)
 {
 	struct siw_qp_attrs qp_attrs;
@@ -750,10 +782,12 @@ static int siw_proc_mpareply(struct siw_cep *cep)
 
 		return -ECONNRESET;
 	}
-	if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) {
-		siw_dbg_cep(cep, "peer allows GSO on TX\n");
-		qp->tx_ctx.gso_seg_limit = 0;
-	}
+	if (rep->params.bits & MPA_FPDU_LEN_MASK)
+		siw_set_outbound_fpdulen(cep, &rep->params,
+					 &qp->tx_ctx.large_fpdulen);
+	else
+		qp->tx_ctx.large_fpdulen = 0;
+
 	if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
 	    (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) ||
 	    (mpa_crc_strict && !mpa_crc_required &&
@@ -1469,8 +1503,7 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
 	}
 	__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);
 
-	if (try_gso)
-		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;
+	cep->mpa.hdr.params.bits |= MPA_FPDU_LEN_64K;
 
 	if (mpa_crc_required)
 		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;
@@ -1602,10 +1635,12 @@ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
 	}
 	siw_dbg_cep(cep, "[QP %d]\n", params->qpn);
 
-	if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) {
-		siw_dbg_cep(cep, "peer allows GSO on TX\n");
-		qp->tx_ctx.gso_seg_limit = 0;
-	}
+	if (cep->mpa.hdr.params.bits & MPA_FPDU_LEN_MASK)
+		siw_set_outbound_fpdulen(cep, &cep->mpa.hdr.params,
+					 &qp->tx_ctx.large_fpdulen);
+	else
+		qp->tx_ctx.large_fpdulen = 0;
+
 	if (params->ord > sdev->attrs.max_ord ||
 	    params->ird > sdev->attrs.max_ird) {
 		siw_dbg_cep(
@@ -1676,6 +1711,7 @@ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
 	qp_attrs.sk = cep->sock;
 	if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC)
 		qp_attrs.flags = SIW_MPA_CRC;
+	cep->mpa.hdr.params.bits |= MPA_FPDU_LEN_64K;
 	qp_attrs.state = SIW_QP_STATE_RTS;
 
 	siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp));
diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
index 05a92f997f60..28c256e52454 100644
--- a/drivers/infiniband/sw/siw/siw_main.c
+++ b/drivers/infiniband/sw/siw/siw_main.c
@@ -31,12 +31,6 @@ MODULE_LICENSE("Dual BSD/GPL");
 /* transmit from user buffer, if possible */
 const bool zcopy_tx = true;
 
-/* Restrict usage of GSO, if hardware peer iwarp is unable to process
- * large packets. try_gso = true lets siw try to use local GSO,
- * if peer agrees.  Not using GSO severly limits siw maximum tx bandwidth.
- */
-const bool try_gso;
-
 /* Attach siw also with loopback devices */
 const bool loopback_enabled = true;
 
diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c
index 5d97bba0ce6d..d95bb0ed0d7c 100644
--- a/drivers/infiniband/sw/siw/siw_qp_tx.c
+++ b/drivers/infiniband/sw/siw/siw_qp_tx.c
@@ -662,13 +662,17 @@ static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx,
 {
 	struct tcp_sock *tp = tcp_sk(s->sk);
 
-	if (tp->gso_segs) {
-		if (c_tx->gso_seg_limit == 0)
-			c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs;
-		else
+	if (tp->gso_segs && c_tx->large_fpdulen) {
+		if (tp->mss_cache >  c_tx->large_fpdulen) {
+			c_tx->tcp_seglen = c_tx->large_fpdulen;
+		} else {
+			u8 gso_seg_limit;
+			gso_seg_limit = c_tx->large_fpdulen / tp->mss_cache;
+
 			c_tx->tcp_seglen =
 				tp->mss_cache *
-				min_t(u16, c_tx->gso_seg_limit, tp->gso_segs);
+				min_t(u16, gso_seg_limit, tp->gso_segs);
+		}
 	} else {
 		c_tx->tcp_seglen = tp->mss_cache;
 	}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index b18a677832e1..208557f91ff4 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -444,8 +444,7 @@ struct ib_qp *siw_create_qp(struct ib_pd *pd,
 	qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
 	qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
 
-	/* Make those two tunables fixed for now. */
-	qp->tx_ctx.gso_seg_limit = 1;
+	/* Make this tunable fixed for now. */
 	qp->tx_ctx.zcopy_tx = zcopy_tx;
 
 	qp->attrs.state = SIW_QP_STATE_IDLE;
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index ee1182f..8941b64 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -135,6 +135,16 @@
 module_param(snd_win, int, 0644);
 MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=128KB)");
 
+/*
+ * Experimental large FPDU length support.
+ * This improves overall throughput while interop with SoftiWARP.
+ * Note that iw_cxgb4 advertises it's receiving capability only and ignores
+ * the remote peer's large FPDU length announcement, due to T6 HW limitaton.
+ */
+static int large_fpdulen;
+module_param(large_fpdulen, int, 0644);
+MODULE_PARM_DESC(large_fpdulen, "Experimental large FPDU length");
+
 static struct workqueue_struct *workq;
 
 static struct sk_buff_head rxq;
@@ -978,6 +988,12 @@ static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
 	mpa->flags = 0;
 	if (crc_enabled)
 		mpa->flags |= MPA_CRC;
+	if (large_fpdulen) {
+		if (CHELSIO_CHIP_VERSION(ep->com.dev->rdev.lldi.adapter_type) >=
+				CHELSIO_T6)
+			mpa->flags |= MPA_FPDU_LEN_16K;
+	}
+
 	if (markers_enabled) {
 		mpa->flags |= MPA_MARKERS;
 		ep->mpa_attr.recv_marker_enabled = 1;
@@ -1166,6 +1182,11 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
 		mpa->flags |= MPA_CRC;
 	if (ep->mpa_attr.recv_marker_enabled)
 		mpa->flags |= MPA_MARKERS;
+	if (large_fpdulen) {
+		if (CHELSIO_CHIP_VERSION(ep->com.dev->rdev.lldi.adapter_type) >=
+				CHELSIO_T6)
+			mpa->flags |= MPA_FPDU_LEN_16K;
+	}
 	mpa->revision = ep->mpa_attr.version;
 	mpa->private_data_size = htons(plen);
 
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 7d06b0f..051830e 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -688,6 +688,15 @@ enum c4iw_mmid_state {
 #define MPA_V2_RDMA_READ_RTR            0x4000
 #define MPA_V2_IRD_ORD_MASK             0x3FFF
 
+/* Following experimental bits should be in sync with SoftiWARP bits */
+#define MPA_FPDU_LEN_MASK 0x000c
+enum {
+	MPA_FPDU_LEN_DEFAULT = cpu_to_be16(0x0000),
+	MPA_FPDU_LEN_16K = cpu_to_be16(0x0400),
+	MPA_FPDU_LEN_32K = cpu_to_be16(0x0800),
+	MPA_FPDU_LEN_64K = cpu_to_be16(0x0c00)
+};
+
 #define c4iw_put_ep(ep) {						\
 	pr_debug("put_ep ep %p refcnt %d\n",		\
 		 ep, kref_read(&((ep)->kref)));				\

[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux