Re: [PATCH v1 1/3] IB/srp: Fix crash when unmapping data loop

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 3/6/2014 5:32 PM, sagi grimberg wrote:
On 3/6/2014 11:10 AM, Bart Van Assche wrote:
On 02/27/14 12:32, Sagi Grimberg wrote:
As I recall (need to re-confirm this), the problem was that in unstable
ports condition srp_abort is
called invoking srp_free_req (unmap call #1) and reconnect process (or
reset_device or terminate_io)
finishes all requests in the request ring (unmap call #2). when FMRs are
used then nfmr goes to zero
the first time, but when FMRs are not supported nfmr goes from 0 to -1
causing the crash since nfmr condition
is not safe.
Hello Sagi,

Thinking further about this, it looks to me like the only way to avoid
that srp_terminate_io() triggers a race condition with the error path
in srp_queuecommand() is to set req->scmnd only if srp_post_send() has
succeeded. How about the patch below ? That patch should ensure that
srp_unmap_data() is only called once for each request and hence should
avoid triggering the crash you mentioned.

Hey Bart, Thanks for taking the time to think on this issue.

I see your point, and it might work. let me try it out (if I won't get to it today I'll do it on Sunday)
and reply if that resolves the issue.

Sagi.


Hey Bart (replying to my own email...),

So I took Roland latest 3.14-rc1 and tried to reproduce this issue using HCA with no FMRs support and was *NOT* able to reproduce this issue. This issue reproduced for me on RH6 backported srp and I can't tell where is the delta at the moment. Perhaps Sebastian can share his scenario that reproduces
this issue and was solved by the patch I proposed.

I suggest to try and reproduce it with your ib_srp-backport code over RH6, worth a look right?

If this is a false alarm, then I'm sorry for the bother...

Sagi.

Thanks,

Bart.

[PATCH] IB/srp: Fix a race condition between fast failing I/O and I/O queueing

Avoid that srp_unmap_data() can get invoked concurrently by
srp_terminate_io() and from the error path in srp_queuecommand()
if srp_post_send() fails.

Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx>
---
drivers/infiniband/ulp/srp/ib_srp.c | 46 +++++++++++++++++++++----------------
  1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 7858240..85eb59f 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -783,6 +783,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd,
* srp_claim_req - Take ownership of the scmnd associated with a request.
   * @target: SRP target port.
   * @req: SRP request.
+ * @sdev: If not NULL, only take ownership for this SCSI device.
* @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take
   *         ownership of @req->scmnd if it equals @scmnd.
   *
@@ -791,16 +792,17 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd,
   */
  static struct scsi_cmnd *srp_claim_req(struct srp_target_port *target,
                         struct srp_request *req,
+                       struct scsi_device *sdev,
                         struct scsi_cmnd *scmnd)
  {
      unsigned long flags;
        spin_lock_irqsave(&target->lock, flags);
-    if (!scmnd) {
+    if (req->scmnd &&
+        (!sdev || req->scmnd->device == sdev) &&
+        (!scmnd || req->scmnd == scmnd)) {
          scmnd = req->scmnd;
          req->scmnd = NULL;
-    } else if (req->scmnd == scmnd) {
-        req->scmnd = NULL;
      } else {
          scmnd = NULL;
      }
@@ -827,9 +829,10 @@ static void srp_free_req(struct srp_target_port *target,
  }
    static void srp_finish_req(struct srp_target_port *target,
-               struct srp_request *req, int result)
+               struct srp_request *req, struct scsi_device *sdev,
+               int result)
  {
-    struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL);
+    struct scsi_cmnd *scmnd = srp_claim_req(target, req, sdev, NULL);
        if (scmnd) {
          srp_free_req(target, req, scmnd, 0);
@@ -845,7 +848,7 @@ static void srp_terminate_io(struct srp_rport *rport)
        for (i = 0; i < target->req_ring_size; ++i) {
          struct srp_request *req = &target->req_ring[i];
-        srp_finish_req(target, req, DID_TRANSPORT_FAILFAST << 16);
+ srp_finish_req(target, req, NULL, DID_TRANSPORT_FAILFAST << 16);
      }
  }
@@ -882,7 +885,7 @@ static int srp_rport_reconnect(struct srp_rport *rport)
        for (i = 0; i < target->req_ring_size; ++i) {
          struct srp_request *req = &target->req_ring[i];
-        srp_finish_req(target, req, DID_RESET << 16);
+        srp_finish_req(target, req, NULL, DID_RESET << 16);
      }
        INIT_LIST_HEAD(&target->free_tx);
@@ -1290,7 +1293,7 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp)
          complete(&target->tsk_mgmt_done);
      } else {
          req = &target->req_ring[rsp->tag];
-        scmnd = srp_claim_req(target, req, NULL);
+        scmnd = srp_claim_req(target, req, NULL, NULL);
          if (!scmnd) {
              shost_printk(KERN_ERR, target->scsi_host,
                       "Null scmnd for RSP w/tag %016llx\n",
@@ -1509,7 +1512,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
      struct srp_cmd *cmd;
      struct ib_device *dev;
      unsigned long flags;
-    int len, result;
+    int len, result, ret = 0;
const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler;
        /*
@@ -1552,8 +1555,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
      cmd->tag    = req->index;
      memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len);
  -    req->scmnd    = scmnd;
-    req->cmd      = iu;
+    req->cmd = iu;
        len = srp_map_data(scmnd, target, req);
      if (len < 0) {
@@ -1565,7 +1567,14 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
      ib_dma_sync_single_for_device(dev, iu->dma, target->max_iu_len,
                        DMA_TO_DEVICE);
  -    if (srp_post_send(target, iu, len)) {
+    spin_lock_irqsave(&target->lock, flags);
+    if (srp_post_send(target, iu, len) == 0)
+        req->scmnd = scmnd;
+    else
+        ret = SCSI_MLQUEUE_HOST_BUSY;
+    spin_unlock_irqrestore(&target->lock, flags);
+
+    if (ret) {
shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n");
          goto err_unmap;
      }
@@ -1574,7 +1583,7 @@ unlock_rport:
      if (in_scsi_eh)
          mutex_unlock(&rport->mutex);
  -    return 0;
+    return ret;
    err_unmap:
      srp_unmap_data(scmnd, target, req);
@@ -1588,10 +1597,8 @@ err_iu:
  err_unlock:
      spin_unlock_irqrestore(&target->lock, flags);
  -    if (in_scsi_eh)
-        mutex_unlock(&rport->mutex);
-
-    return SCSI_MLQUEUE_HOST_BUSY;
+    ret = SCSI_MLQUEUE_HOST_BUSY;
+    goto unlock_rport;
  }
    /*
@@ -2008,7 +2015,7 @@ static int srp_abort(struct scsi_cmnd *scmnd)
        shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n");
  -    if (!req || !srp_claim_req(target, req, scmnd))
+    if (!req || !srp_claim_req(target, req, NULL, scmnd))
          return SUCCESS;
      if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun,
                    SRP_TSK_ABORT_TASK) == 0)
@@ -2039,8 +2046,7 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
        for (i = 0; i < target->req_ring_size; ++i) {
          struct srp_request *req = &target->req_ring[i];
-        if (req->scmnd && req->scmnd->device == scmnd->device)
-            srp_finish_req(target, req, DID_RESET << 16);
+        srp_finish_req(target, req, scmnd->device, DID_RESET << 16);
      }
        return SUCCESS;


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux