On 10/18/23 14:48, Bart Van Assche wrote:
>
> On 10/18/23 12:17, Jason Gunthorpe wrote:
>> If siw hangs as well, I'm definitely comfortable continuing to debug and
>> leaving the work queues in-tree for now.
>
> Regarding the KASAN complaint that I shared about one month ago, can that complaint have any other root cause than the patch "RDMA/rxe: Add
> workqueue support for rxe tasks"? That report shows a use-after-free by
> rxe code with a pointer to memory that was owned by the rxe driver and
> that was freed by the rxe driver. That memory is an skbuff. The rxe
> driver manages skbuffs. The SRP driver doesn't even know about these
> skbuff objects. See also https://lore.kernel.org/linux-rdma/8ee2869b-3f51-4195-9883-015cd30b4241@xxxxxxx/
>
> Thanks,
>
> Bart.
>
And here are the ib_srp patches that I currently have applied. I will retry with a clean version to see what happens.
Bob
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 1574218764e0..1b9919e08176 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -512,9 +512,11 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
*/
static void srp_destroy_qp(struct srp_rdma_ch *ch)
{
- spin_lock_irq(&ch->lock);
- ib_process_cq_direct(ch->send_cq, -1);
- spin_unlock_irq(&ch->lock);
+ //spin_lock_irq(&ch->lock);
+ //pr_info("qp#%03d: %s to ib_process_cq_direct\n",
+ //ch->qp->qp_num, __func__);
+ //ib_process_cq_direct(ch->send_cq, -1);
+ //spin_unlock_irq(&ch->lock);
ib_drain_qp(ch->qp);
ib_destroy_qp(ch->qp);
@@ -544,8 +546,10 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
goto err;
}
+ //send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size,
+ //ch->comp_vector, IB_POLL_DIRECT);
send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size,
- ch->comp_vector, IB_POLL_DIRECT);
+ ch->comp_vector, IB_POLL_SOFTIRQ);
if (IS_ERR(send_cq)) {
ret = PTR_ERR(send_cq);
goto err_recv_cq;
@@ -1154,12 +1158,19 @@ static int srp_connect_ch(struct srp_rdma_ch *ch, uint32_t max_iu_len,
static void srp_inv_rkey_err_done(struct ib_cq *cq, struct ib_wc *wc)
{
+ struct srp_rdma_ch *ch = cq->cq_context;
+
+ pr_err("qp#%03d: inv_rkey-done: opcode: %d status: %d: len: %d\n",
+ ch->qp->qp_num, wc->opcode, wc->status, wc->byte_len);
+
srp_handle_qp_err(cq, wc, "INV RKEY");
}
static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch,
u32 rkey)
{
+ int ret;
+
struct ib_send_wr wr = {
.opcode = IB_WR_LOCAL_INV,
.next = NULL,
@@ -1170,7 +1181,12 @@ static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch,
wr.wr_cqe = &req->reg_cqe;
req->reg_cqe.done = srp_inv_rkey_err_done;
- return ib_post_send(ch->qp, &wr, NULL);
+ ret = ib_post_send(ch->qp, &wr, NULL);
+ if (ret)
+ pr_err("qp#%03d: %s: ret = %d\n", ch->qp->qp_num, __func__, ret);
+ else
+ pr_info("qp#%03d: post-inv_rkey: %#x", ch->qp->qp_num, rkey);
+ return ret;
}
static void srp_unmap_data(struct scsi_cmnd *scmnd,
@@ -1408,6 +1424,11 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr,
static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc)
{
+ struct srp_rdma_ch *ch = cq->cq_context;
+
+ pr_err("qp#%03d: reg_mr-done: opcode: %d status: %d: len: %d\n",
+ ch->qp->qp_num, wc->opcode, wc->status, wc->byte_len);
+
srp_handle_qp_err(cq, wc, "FAST REG");
}
@@ -1488,6 +1509,11 @@ static int srp_map_finish_fr(struct srp_map_state *state,
desc->mr->length, desc->mr->rkey);
err = ib_post_send(ch->qp, &wr.wr, NULL);
+ if (err)
+ pr_err("qp#%03d: %s: err = %d\n", ch->qp->qp_num, __func__, err);
+ else
+ pr_info("qp#%03d: post-reg_mr: %#x\n", ch->qp->qp_num,
+ desc->mr->rkey);
if (unlikely(err)) {
WARN_ON_ONCE(err == -ENOMEM);
return err;
@@ -1840,10 +1866,14 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch,
lockdep_assert_held(&ch->lock);
- ib_process_cq_direct(ch->send_cq, -1);
+ //pr_info("qp#%03d: %s to ib_process_cq_direct\n",
+ //ch->qp->qp_num, __func__);
+ //ib_process_cq_direct(ch->send_cq, -1);
- if (list_empty(&ch->free_tx))
+ if (list_empty(&ch->free_tx)) {
+ pr_err("%s: ran out of iu\n", __func__);
return NULL;
+ }
/* Initiator responses to target requests do not consume credits */
if (iu_type != SRP_IU_RSP) {
@@ -1869,15 +1899,22 @@ static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe);
struct srp_rdma_ch *ch = cq->cq_context;
+ /**/unsigned long flags;
+
+ pr_info("qp#%03d: send-done: opcode: %d status: %d: len: %d\n",
+ ch->qp->qp_num, wc->opcode, wc->status, wc->byte_len);
if (unlikely(wc->status != IB_WC_SUCCESS)) {
srp_handle_qp_err(cq, wc, "SEND");
return;
}
+
+ /**/spin_lock_irqsave(&ch->lock, flags);
lockdep_assert_held(&ch->lock);
list_add(&iu->list, &ch->free_tx);
+ /**/spin_unlock_irqrestore(&ch->lock, flags);
}
/**
@@ -1890,6 +1927,7 @@ static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
{
struct srp_target_port *target = ch->target;
struct ib_send_wr wr;
+ int ret;
if (WARN_ON_ONCE(iu->num_sge > SRP_MAX_SGE))
return -EINVAL;
@@ -1907,7 +1945,12 @@ static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
wr.opcode = IB_WR_SEND;
wr.send_flags = IB_SEND_SIGNALED;
- return ib_post_send(ch->qp, &wr, NULL);
+ ret = ib_post_send(ch->qp, &wr, NULL);
+ if (ret)
+ pr_err("qp#%03d: %s: ret = %d\n", ch->qp->qp_num, __func__, ret);
+ else
+ pr_info("qp#%03d: post-send:\n", ch->qp->qp_num);
+ return ret;
}
static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu)
@@ -1915,6 +1958,7 @@ static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu)
struct srp_target_port *target = ch->target;
struct ib_recv_wr wr;
struct ib_sge list;
+ int ret;
list.addr = iu->dma;
list.length = iu->size;
@@ -1927,7 +1971,12 @@ static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu)
wr.sg_list = &list;
wr.num_sge = 1;
- return ib_post_recv(ch->qp, &wr, NULL);
+ ret = ib_post_recv(ch->qp, &wr, NULL);
+ if (ret)
+ pr_err("qp#%03d: %s: ret = %d\n", ch->qp->qp_num, __func__, ret);
+ else
+ pr_info("qp#%03d: post-recv:\n", ch->qp->qp_num);
+ return ret;
}
static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp)
@@ -2004,8 +2053,9 @@ static int srp_response_common(struct srp_rdma_ch *ch, s32 req_delta,
spin_unlock_irqrestore(&ch->lock, flags);
if (!iu) {
- shost_printk(KERN_ERR, target->scsi_host, PFX
- "no IU available to send response\n");
+ pr_err("%s: no iu to send response\n", __func__);
+ //shost_printk(KERN_ERR, target->scsi_host, PFX
+ //"no IU available to send response\n");
return 1;
}
@@ -2034,8 +2084,9 @@ static void srp_process_cred_req(struct srp_rdma_ch *ch,
s32 delta = be32_to_cpu(req->req_lim_delta);
if (srp_response_common(ch, delta, &rsp, sizeof(rsp)))
- shost_printk(KERN_ERR, ch->target->scsi_host, PFX
- "problems processing SRP_CRED_REQ\n");
+ pr_err("%s: problems with cred req\n", __func__);
+ //shost_printk(KERN_ERR, ch->target->scsi_host, PFX
+ //"problems processing SRP_CRED_REQ\n");
}
static void srp_process_aer_req(struct srp_rdma_ch *ch,
@@ -2065,6 +2116,9 @@ static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc)
int res;
u8 opcode;
+ pr_info("qp#%03d: recv-done: opcode: %d status: %d: len: %d\n",
+ ch->qp->qp_num, wc->opcode, wc->status, wc->byte_len);
+
if (unlikely(wc->status != IB_WC_SUCCESS)) {
srp_handle_qp_err(cq, wc, "RECV");
return;
@@ -2173,8 +2227,10 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
iu = __srp_get_tx_iu(ch, SRP_IU_CMD);
spin_unlock_irqrestore(&ch->lock, flags);
- if (!iu)
+ if (!iu) {
+ pr_err("%s: no iu to queue command\n", __func__);
goto err;
+ }
dev = target->srp_host->srp_dev->dev;
ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_it_iu_len,
@@ -2240,6 +2296,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
scsi_done(scmnd);
ret = 0;
} else {
+ pr_err("%s: returned SCSI_MLQUEUE_HOST_BUSY\n", __func__);
ret = SCSI_MLQUEUE_HOST_BUSY;
}
@@ -2734,6 +2791,7 @@ static int srp_send_tsk_mgmt(struct srp_rdma_ch *ch, u64 req_tag, u64 lun,
spin_unlock_irq(&ch->lock);
if (!iu) {
+ pr_err("%s: no iu for task management\n", __func__);
mutex_unlock(&rport->mutex);
return -1;