Re: [bug report] blktests srp/002 hang

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 10/18/23 14:48, Bart Van Assche wrote:
> 
> On 10/18/23 12:17, Jason Gunthorpe wrote:
>> If siw hangs as well, I definitely comfortable continuing to debug and
>> leaving the work queues in-tree for now.
> 
> Regarding the KASAN complaint that I shared about one month ago, can that complaint have any other root cause than the patch "RDMA/rxe: Add
> workqueue support for rxe tasks"? That report shows a use-after-free by
> rxe code with a pointer to memory that was owned by the rxe driver and
> that was freed by the rxe driver. That memory is an skbuff. The rxe
> driver manages skbuffs. The SRP driver doesn't even know about these
> skbuff objects. See also https://lore.kernel.org/linux-rdma/8ee2869b-3f51-4195-9883-015cd30b4241@xxxxxxx/
> 
> Thanks,
> 
> Bart.
> 

And here are the patches to ib_srp I have. I will retry with a clean version to see what happens.

Bob
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 1574218764e0..1b9919e08176 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -512,9 +512,11 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
  */
 static void srp_destroy_qp(struct srp_rdma_ch *ch)
 {
-	spin_lock_irq(&ch->lock);
-	ib_process_cq_direct(ch->send_cq, -1);
-	spin_unlock_irq(&ch->lock);
+	//spin_lock_irq(&ch->lock);
+	//pr_info("qp#%03d: %s to ib_process_cq_direct\n",
+			//ch->qp->qp_num, __func__);
+	//ib_process_cq_direct(ch->send_cq, -1);
+	//spin_unlock_irq(&ch->lock);
 
 	ib_drain_qp(ch->qp);
 	ib_destroy_qp(ch->qp);
@@ -544,8 +546,10 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 		goto err;
 	}
 
+	//send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size,
+				//ch->comp_vector, IB_POLL_DIRECT);
 	send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size,
-				ch->comp_vector, IB_POLL_DIRECT);
+				ch->comp_vector, IB_POLL_SOFTIRQ);
 	if (IS_ERR(send_cq)) {
 		ret = PTR_ERR(send_cq);
 		goto err_recv_cq;
@@ -1154,12 +1158,19 @@ static int srp_connect_ch(struct srp_rdma_ch *ch, uint32_t max_iu_len,
 
 static void srp_inv_rkey_err_done(struct ib_cq *cq, struct ib_wc *wc)
 {
+	struct srp_rdma_ch *ch = cq->cq_context;
+
+	pr_err("qp#%03d: inv_rkey-done: opcode: %d status: %d: len: %d\n",
+			ch->qp->qp_num, wc->opcode, wc->status, wc->byte_len);
+
 	srp_handle_qp_err(cq, wc, "INV RKEY");
 }
 
 static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch,
 		u32 rkey)
 {
+	int ret;
+
 	struct ib_send_wr wr = {
 		.opcode		    = IB_WR_LOCAL_INV,
 		.next		    = NULL,
@@ -1170,7 +1181,12 @@ static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch,
 
 	wr.wr_cqe = &req->reg_cqe;
 	req->reg_cqe.done = srp_inv_rkey_err_done;
-	return ib_post_send(ch->qp, &wr, NULL);
+	ret = ib_post_send(ch->qp, &wr, NULL);
+	if (ret)
+		pr_err("qp#%03d: %s: ret = %d\n", ch->qp->qp_num, __func__, ret);
+	else
+		pr_info("qp#%03d: post-inv_rkey: %#x", ch->qp->qp_num, rkey);
+	return ret;
 }
 
 static void srp_unmap_data(struct scsi_cmnd *scmnd,
@@ -1408,6 +1424,11 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr,
 
 static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc)
 {
+	struct srp_rdma_ch *ch = cq->cq_context;
+
+	pr_err("qp#%03d: reg_mr-done: opcode: %d status: %d: len: %d\n",
+			ch->qp->qp_num, wc->opcode, wc->status, wc->byte_len);
+
 	srp_handle_qp_err(cq, wc, "FAST REG");
 }
 
@@ -1488,6 +1509,11 @@ static int srp_map_finish_fr(struct srp_map_state *state,
 		     desc->mr->length, desc->mr->rkey);
 
 	err = ib_post_send(ch->qp, &wr.wr, NULL);
+	if (err)
+		pr_err("qp#%03d: %s: err = %d\n", ch->qp->qp_num, __func__, err);
+	else
+		pr_info("qp#%03d: post-reg_mr: %#x\n", ch->qp->qp_num,
+				desc->mr->rkey);
 	if (unlikely(err)) {
 		WARN_ON_ONCE(err == -ENOMEM);
 		return err;
@@ -1840,10 +1866,14 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch,
 
 	lockdep_assert_held(&ch->lock);
 
-	ib_process_cq_direct(ch->send_cq, -1);
+	//pr_info("qp#%03d: %s to ib_process_cq_direct\n",
+			//ch->qp->qp_num, __func__);
+	//ib_process_cq_direct(ch->send_cq, -1);
 
-	if (list_empty(&ch->free_tx))
+	if (list_empty(&ch->free_tx)) {
+		pr_err("%s: ran out of iu\n", __func__);
 		return NULL;
+	}
 
 	/* Initiator responses to target requests do not consume credits */
 	if (iu_type != SRP_IU_RSP) {
@@ -1869,15 +1899,22 @@ static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc)
 {
 	struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe);
 	struct srp_rdma_ch *ch = cq->cq_context;
+	/**/unsigned long flags;
+
+	pr_info("qp#%03d: send-done: opcode: %d status: %d: len: %d\n",
+			ch->qp->qp_num, wc->opcode, wc->status, wc->byte_len);
 
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		srp_handle_qp_err(cq, wc, "SEND");
 		return;
 	}
 
+
+	/**/spin_lock_irqsave(&ch->lock, flags);
 	lockdep_assert_held(&ch->lock);
 
 	list_add(&iu->list, &ch->free_tx);
+	/**/spin_unlock_irqrestore(&ch->lock, flags);
 }
 
 /**
@@ -1890,6 +1927,7 @@ static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
 {
 	struct srp_target_port *target = ch->target;
 	struct ib_send_wr wr;
+	int ret;
 
 	if (WARN_ON_ONCE(iu->num_sge > SRP_MAX_SGE))
 		return -EINVAL;
@@ -1907,7 +1945,12 @@ static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
 	wr.opcode     = IB_WR_SEND;
 	wr.send_flags = IB_SEND_SIGNALED;
 
-	return ib_post_send(ch->qp, &wr, NULL);
+	ret = ib_post_send(ch->qp, &wr, NULL);
+	if (ret)
+		pr_err("qp#%03d: %s: ret = %d\n", ch->qp->qp_num, __func__, ret);
+	else
+		pr_info("qp#%03d: post-send:\n", ch->qp->qp_num);
+	return ret;
 }
 
 static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu)
@@ -1915,6 +1958,7 @@ static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu)
 	struct srp_target_port *target = ch->target;
 	struct ib_recv_wr wr;
 	struct ib_sge list;
+	int ret;
 
 	list.addr   = iu->dma;
 	list.length = iu->size;
@@ -1927,7 +1971,12 @@ static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu)
 	wr.sg_list  = &list;
 	wr.num_sge  = 1;
 
-	return ib_post_recv(ch->qp, &wr, NULL);
+	ret = ib_post_recv(ch->qp, &wr, NULL);
+	if (ret)
+		pr_err("qp#%03d: %s: ret = %d\n", ch->qp->qp_num, __func__, ret);
+	else
+		pr_info("qp#%03d: post-recv:\n", ch->qp->qp_num);
+	return ret;
 }
 
 static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp)
@@ -2004,8 +2053,9 @@ static int srp_response_common(struct srp_rdma_ch *ch, s32 req_delta,
 	spin_unlock_irqrestore(&ch->lock, flags);
 
 	if (!iu) {
-		shost_printk(KERN_ERR, target->scsi_host, PFX
-			     "no IU available to send response\n");
+		pr_err("%s: no iu to send response\n", __func__);
+		//shost_printk(KERN_ERR, target->scsi_host, PFX
+			     //"no IU available to send response\n");
 		return 1;
 	}
 
@@ -2034,8 +2084,9 @@ static void srp_process_cred_req(struct srp_rdma_ch *ch,
 	s32 delta = be32_to_cpu(req->req_lim_delta);
 
 	if (srp_response_common(ch, delta, &rsp, sizeof(rsp)))
-		shost_printk(KERN_ERR, ch->target->scsi_host, PFX
-			     "problems processing SRP_CRED_REQ\n");
+		pr_err("%s: problems with cred req\n", __func__);
+		//shost_printk(KERN_ERR, ch->target->scsi_host, PFX
+			     //"problems processing SRP_CRED_REQ\n");
 }
 
 static void srp_process_aer_req(struct srp_rdma_ch *ch,
@@ -2065,6 +2116,9 @@ static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 	int res;
 	u8 opcode;
 
+	pr_info("qp#%03d: recv-done: opcode: %d status: %d: len: %d\n",
+			ch->qp->qp_num, wc->opcode, wc->status, wc->byte_len);
+
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		srp_handle_qp_err(cq, wc, "RECV");
 		return;
@@ -2173,8 +2227,10 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
 	iu = __srp_get_tx_iu(ch, SRP_IU_CMD);
 	spin_unlock_irqrestore(&ch->lock, flags);
 
-	if (!iu)
+	if (!iu) {
+		pr_err("%s: no iu to queue command\n", __func__);
 		goto err;
+	}
 
 	dev = target->srp_host->srp_dev->dev;
 	ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_it_iu_len,
@@ -2240,6 +2296,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
 		scsi_done(scmnd);
 		ret = 0;
 	} else {
+		pr_err("%s: returned SCSI_MLQUEUE_HOST_BUSY\n", __func__);
 		ret = SCSI_MLQUEUE_HOST_BUSY;
 	}
 
@@ -2734,6 +2791,7 @@ static int srp_send_tsk_mgmt(struct srp_rdma_ch *ch, u64 req_tag, u64 lun,
 	spin_unlock_irq(&ch->lock);
 
 	if (!iu) {
+		pr_err("%s: no iu for task management\n", __func__);
 		mutex_unlock(&rport->mutex);
 
 		return -1;

[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux