Patch "IB/mlx4: Fix starvation in paravirt mux/demux" has been added to the 4.19-stable tree

This is a note to let you know that I've just added the patch titled

    IB/mlx4: Fix starvation in paravirt mux/demux

to the 4.19-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     ib-mlx4-fix-starvation-in-paravirt-mux-demux.patch
and it can be found in the queue-4.19 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit ce74e13300c55e9e27a7f0eec2c3a078ef2a7888
Author: Håkon Bugge <haakon.bugge@xxxxxxxxxx>
Date:   Mon Aug 3 08:19:39 2020 +0200

    IB/mlx4: Fix starvation in paravirt mux/demux
    
    [ Upstream commit 7fd1507df7cee9c533f38152fcd1dd769fcac6ce ]
    
    The mlx4 driver will proxy MAD packets through the PF driver. A VM or an
    instantiated VF will send its MAD packets to the PF driver using
    loop-back. The PF driver will be informed by an interrupt, but defer the
    handling and polling of CQEs to a worker thread running on an ordered
    work-queue.
    
    Consider the following scenario: within a short period of time, for
    example due to a network event, the VMs send many MAD packets to the PF
    driver. Let's say there are K VMs, each sending N packets.
    
    The interrupt from the first VM will start the worker thread, which will
    poll N CQEs. In the common case, the PF driver then multiplexes the
    packets received from the VMs out on the wire QP.
    
    But before the wire QP has returned a send CQE and associated interrupt,
    the other K - 1 VMs have sent their N packets as well.
    
    The PF driver has to multiplex K * N packets out on the wire QP. But the
    send-queue on the wire QP has a finite capacity.
    
    So, in this scenario, if K * N is larger than the send-queue capacity of
    the wire QP, we will get MAD packets dropped on the floor with this
    dynamic debug message:
    
    mlx4_ib_multiplex_mad: failed sending GSI to wire on behalf of slave 2 (-11)
    
    and this despite the fact that the wire send-queue may well have free
    capacity; the PF driver simply isn't aware of it, because the wire send
    CQEs have not yet been polled.
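    
    For illustration (the numbers are assumed, not taken from the commit):
    with K = 8 VMs each sending N = 64 MADs, the PF driver has to multiplex
    K * N = 512 packets. If the wire QP's send-queue holds only 256 entries
    and no wire send CQEs have been polled yet, the last 256 sends fail with
    the -EAGAIN (-11) above, even though the queue drains shortly afterwards.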
    
    A similar scenario can occur inbound, with a wire recv-queue larger than
    the tunnel QP's send-queue. If many remote peers send MAD packets to the
    very same VM, the PF driver may wrongly conclude that the tunnel
    send-queue destined to that VM is full.
    
    This starvation is fixed by introducing separate work queues for the wire
    QPs vs. the tunnel QPs.
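    
    In other words, with separate ordered work queues, the work item that
    polls the wire send CQEs no longer has to wait behind queued tunnel-QP
    work (and vice versa), so the PF driver's view of the wire send-queue
    occupancy stays current.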
    
    With this fix, using a dual-ported HCA with 8 VFs instantiated, we could
    run cmtime on each of the 18 interfaces towards a similarly configured
    peer, each cmtime instance with 800 QPs (14400 QPs in total), without a
    single CM packet getting lost.
    
    Fixes: 3cf69cc8dbeb ("IB/mlx4: Add CM paravirtualization")
    Link: https://lore.kernel.org/r/20200803061941.1139994-5-haakon.bugge@xxxxxxxxxx
    Signed-off-by: Håkon Bugge <haakon.bugge@xxxxxxxxxx>
    Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxx>
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 5aaa2a6c431b6..418b9312fb2d7 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -1305,6 +1305,18 @@ static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg)
 	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
 }
 
+static void mlx4_ib_wire_comp_handler(struct ib_cq *cq, void *arg)
+{
+	unsigned long flags;
+	struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context;
+	struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE)
+		queue_work(ctx->wi_wq, &ctx->work);
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
 static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
 				  struct mlx4_ib_demux_pv_qp *tun_qp,
 				  int index)
@@ -2000,7 +2012,8 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
 		cq_size *= 2;
 
 	cq_attr.cqe = cq_size;
-	ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler,
+	ctx->cq = ib_create_cq(ctx->ib_dev,
+			       create_tun ? mlx4_ib_tunnel_comp_handler : mlx4_ib_wire_comp_handler,
 			       NULL, ctx, &cq_attr);
 	if (IS_ERR(ctx->cq)) {
 		ret = PTR_ERR(ctx->cq);
@@ -2037,6 +2050,7 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
 		INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker);
 
 	ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq;
+	ctx->wi_wq = to_mdev(ibdev)->sriov.demux[port - 1].wi_wq;
 
 	ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
 	if (ret) {
@@ -2180,7 +2194,7 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
 		goto err_mcg;
 	}
 
-	snprintf(name, sizeof name, "mlx4_ibt%d", port);
+	snprintf(name, sizeof(name), "mlx4_ibt%d", port);
 	ctx->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
 	if (!ctx->wq) {
 		pr_err("Failed to create tunnelling WQ for port %d\n", port);
@@ -2188,7 +2202,15 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
 		goto err_wq;
 	}
 
-	snprintf(name, sizeof name, "mlx4_ibud%d", port);
+	snprintf(name, sizeof(name), "mlx4_ibwi%d", port);
+	ctx->wi_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
+	if (!ctx->wi_wq) {
+		pr_err("Failed to create wire WQ for port %d\n", port);
+		ret = -ENOMEM;
+		goto err_wiwq;
+	}
+
+	snprintf(name, sizeof(name), "mlx4_ibud%d", port);
 	ctx->ud_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
 	if (!ctx->ud_wq) {
 		pr_err("Failed to create up/down WQ for port %d\n", port);
@@ -2199,6 +2221,10 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
 	return 0;
 
 err_udwq:
+	destroy_workqueue(ctx->wi_wq);
+	ctx->wi_wq = NULL;
+
+err_wiwq:
 	destroy_workqueue(ctx->wq);
 	ctx->wq = NULL;
 
@@ -2246,12 +2272,14 @@ static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx)
 				ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING;
 		}
 		flush_workqueue(ctx->wq);
+		flush_workqueue(ctx->wi_wq);
 		for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
 			destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0);
 			free_pv_object(dev, i, ctx->port);
 		}
 		kfree(ctx->tun);
 		destroy_workqueue(ctx->ud_wq);
+		destroy_workqueue(ctx->wi_wq);
 		destroy_workqueue(ctx->wq);
 	}
 }
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index e10dccc7958f1..76ca67aa40158 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -464,6 +464,7 @@ struct mlx4_ib_demux_pv_ctx {
 	struct ib_pd *pd;
 	struct work_struct work;
 	struct workqueue_struct *wq;
+	struct workqueue_struct *wi_wq;
 	struct mlx4_ib_demux_pv_qp qp[2];
 };
 
@@ -471,6 +472,7 @@ struct mlx4_ib_demux_ctx {
 	struct ib_device *ib_dev;
 	int port;
 	struct workqueue_struct *wq;
+	struct workqueue_struct *wi_wq;
 	struct workqueue_struct *ud_wq;
 	spinlock_t ud_lock;
 	atomic64_t subnet_prefix;


