[PATCH v13 for-next 5/6] RDMA/rxe: For mcast copy qp list to temp array

Currently rxe_rcv_mcast_pkt performs most of its work while holding
rxe->mcg_lock, including the calls to rxe_rcv_pkt which queue the
packets to the responder and completer tasklets. Queuing work to the
tasklets while holding the spinlock is a very bad idea. This patch
walks the qp_list in the mcg and copies the qp pointers to a temporary
array under the lock, but does the rest of the work without holding
the lock. The critical section is now very small.

Signed-off-by: Bob Pearson <rpearsonhpe@xxxxxxxxx>
---
 drivers/infiniband/sw/rxe/rxe_recv.c | 107 ++++++++++++++++++----------
 1 file changed, 68 insertions(+), 39 deletions(-)
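
For readers unfamiliar with the snapshot-under-lock pattern this patch
applies, below is a minimal, self-contained userspace C sketch of the
same idea. It is not rxe code: struct member, snapshot_and_process(),
and the pthread mutex are illustrative stand-ins for the mcg qp_list,
rxe_rcv_mcast_pkt(), and rxe->mcg_lock. The point is the shape: size a
temporary array with a little headroom, copy the list entries into it
and take a reference per entry inside the critical section, then do
the slow per-entry work with the lock dropped.

	/* Illustrative sketch only; names are made up, not rxe code. */
	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct member {
		int id;
		int refcnt;		/* protected by lock in this sketch */
		struct member *next;
	};

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct member *head;	/* list protected by lock */
	static int nmembers;		/* list length, protected by lock */

	static void snapshot_and_process(void)
	{
		struct member **arr;
		struct member *m;
		int n = 0, nmax;

		/* size the array with headroom, as the patch does with
		 * atomic_read(&mcg->qp_num) + 4, in case entries are
		 * added before we walk the list
		 */
		pthread_mutex_lock(&lock);
		nmax = nmembers + 4;
		pthread_mutex_unlock(&lock);

		arr = calloc(nmax, sizeof(*arr));
		if (!arr)
			return;

		/* critical section: only copy pointers and take refs */
		pthread_mutex_lock(&lock);
		for (m = head; m && n < nmax; m = m->next) {
			m->refcnt++;	/* entry can't go away under us */
			arr[n++] = m;
		}
		pthread_mutex_unlock(&lock);

		/* the slow work runs with the lock dropped */
		for (int i = 0; i < n; i++) {
			printf("processing member %d\n", arr[i]->id);
			pthread_mutex_lock(&lock);
			arr[i]->refcnt--;	/* drop the snapshot's ref */
			pthread_mutex_unlock(&lock);
		}
		free(arr);
	}

	int main(void)
	{
		struct member a = { .id = 1, .refcnt = 0, .next = NULL };
		struct member b = { .id = 2, .refcnt = 0, .next = &a };

		head = &b;
		nmembers = 2;
		snapshot_and_process();
		return 0;
	}

The patch below follows the same shape, with rxe->mcg_lock as the
lock, rxe_add_ref()/rxe_drop_ref() as the per-entry references, and
rxe_rcv_pkt() as the slow work moved outside the critical section.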

diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c
index 53924453abef..9b21cbb22602 100644
--- a/drivers/infiniband/sw/rxe/rxe_recv.c
+++ b/drivers/infiniband/sw/rxe/rxe_recv.c
@@ -232,11 +232,15 @@ static inline void rxe_rcv_pkt(struct rxe_pkt_info *pkt, struct sk_buff *skb)
 
 static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
 {
+	struct sk_buff *skb_copy;
 	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+	struct rxe_pkt_info *pkt_copy;
 	struct rxe_mcg *mcg;
 	struct rxe_mca *mca;
 	struct rxe_qp *qp;
+	struct rxe_qp **qp_array;
 	union ib_gid dgid;
+	int n, nmax;
 	int err;
 
 	if (skb->protocol == htons(ETH_P_IP))
@@ -248,68 +252,93 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
 	/* lookup mcast group corresponding to mgid, takes a ref */
 	mcg = rxe_lookup_mcg(rxe, &dgid);
 	if (!mcg)
-		goto drop;	/* mcast group not registered */
+		goto err_drop;	/* mcast group not registered */
+
+	/* this is the current number of qp's attached to the mcg
+	 * plus a little headroom in case new qp's are attached
+	 * between here and when we finish walking the qp list. If
+	 * more than 4 new qp's are attached we will miss forwarding
+	 * packets to those qp's, which is acceptable since UD is
+	 * an unreliable service.
+	 */
+	nmax = atomic_read(&mcg->qp_num) + 4;
+	qp_array = kmalloc_array(nmax, sizeof(qp), GFP_KERNEL);
+	if (unlikely(!qp_array)) {
+		kref_put(&mcg->ref_cnt, rxe_cleanup_mcg);
+		goto err_drop;
+	}
+	n = 0;
 
 	spin_lock_bh(&rxe->mcg_lock);
-
-	/* this is unreliable datagram service so we let
-	 * failures to deliver a multicast packet to a
-	 * single QP happen and just move on and try
-	 * the rest of them on the list
-	 */
 	list_for_each_entry(mca, &mcg->qp_list, qp_list) {
-		qp = mca->qp;
+		/* take a ref so the qp stays valid after we drop the lock */
+		rxe_add_ref(mca->qp);
+		qp_array[n++] = mca->qp;
+		if (n == nmax)
+			break;
+	}
+	spin_unlock_bh(&rxe->mcg_lock);
+	nmax = n;
+	kref_put(&mcg->ref_cnt, rxe_cleanup_mcg);
 
-		/* validate qp for incoming packet */
+	for (n = 0; n < nmax; n++) {
+		qp = qp_array[n];
+
+		/* since this is an unreliable transport, if
+		 * one of the qp's fails to pass these checks
+		 * just skip forwarding the packet and continue
+		 * with the other qp's. If the last qp in the
+		 * list fails, drop the skb.
+		 */
 		err = check_type_state(rxe, pkt, qp);
-		if (err)
+		if (err) {
+			rxe_drop_ref(qp);
+			if (n == nmax - 1)
+				goto err_free;
 			continue;
+		}
 
 		err = check_keys(rxe, pkt, bth_qpn(pkt), qp);
-		if (err)
+		if (err) {
+			rxe_drop_ref(qp);
+			if (n == nmax - 1)
+				goto err_free;
 			continue;
+		}
 
-		/* for all but the last QP create a new clone of the
-		 * skb and pass to the QP. Pass the original skb to
-		 * the last QP in the list.
+		/* for all but the last qp create a new clone of
+		 * the skb and pass it to the qp. Pass the original
+		 * skb to the last qp in the list, unless it failed
+		 * the checks above.
 		 */
-		if (mca->qp_list.next != &mcg->qp_list) {
-			struct sk_buff *cskb;
-			struct rxe_pkt_info *cpkt;
-
-			cskb = skb_clone(skb, GFP_ATOMIC);
-			if (unlikely(!cskb))
+		if (n < nmax - 1) {
+			skb_copy = skb_clone(skb, GFP_KERNEL);
+			if (unlikely(!skb_copy)) {
+				rxe_drop_ref(qp);
 				continue;
+			}
 
 			if (WARN_ON(!ib_device_try_get(&rxe->ib_dev))) {
-				kfree_skb(cskb);
-				break;
+				kfree_skb(skb_copy);
+				rxe_drop_ref(qp);
+				continue;
 			}
 
-			cpkt = SKB_TO_PKT(cskb);
-			cpkt->qp = qp;
-			rxe_add_ref(qp);
-			rxe_rcv_pkt(cpkt, cskb);
+			pkt_copy = SKB_TO_PKT(skb_copy);
+			pkt_copy->qp = qp;
+			rxe_rcv_pkt(pkt_copy, skb_copy);
 		} else {
 			pkt->qp = qp;
-			rxe_add_ref(qp);
 			rxe_rcv_pkt(pkt, skb);
-			skb = NULL;	/* mark consumed */
 		}
 	}
 
-	spin_unlock_bh(&rxe->mcg_lock);
-
-	kref_put(&mcg->ref_cnt, rxe_cleanup_mcg);
-
-	if (likely(!skb))
-		return;
-
-	/* This only occurs if one of the checks fails on the last
-	 * QP in the list above
-	 */
+	kfree(qp_array);
+	return;
 
-drop:
+err_free:
+	kfree(qp_array);
+err_drop:
 	kfree_skb(skb);
 	ib_device_put(&rxe->ib_dev);
 }
-- 
2.32.0



