Re: [PATCH v3] iwcm: don't hold the irq disabled lock on iw_rem_ref

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wednesday, September 09/18/19, 2019 at 19:13:25 +0530, Krishnamraju Eraparaju wrote:
Hi,

I have restructured iw_cxgb4 driver to support the proposed iwcm
changes. Issues addressed by this patch:
-all iw_rem_ref calls from iwcm are outside the spinlock critical
section.
-fixes siw early freeing of siw_base_qp issue.
-the ib_destroy_qp for iw_cxgb4 does not block, even before CM getting
destroyed.
Currently we are testing these changes, will submit the patch next week.

PATCH:
-----
diff --git a/drivers/infiniband/core/iwcm.c
b/drivers/infiniband/core/iwcm.c
index 72141c5..7e1d834 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -372,6 +372,7 @@ int iw_cm_disconnect(struct iw_cm_id *cm_id, int
abrupt)
 static void destroy_cm_id(struct iw_cm_id *cm_id)
 {
        struct iwcm_id_private *cm_id_priv;
+       struct ib_qp *qp;
        unsigned long flags;

        cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
@@ -389,6 +390,9 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
        set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags);

        spin_lock_irqsave(&cm_id_priv->lock, flags);
+       qp = cm_id_priv->qp;
+       cm_id_priv->qp = NULL;
+
        switch (cm_id_priv->state) {
        case IW_CM_STATE_LISTEN:
                cm_id_priv->state = IW_CM_STATE_DESTROYING;
@@ -401,7 +405,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
                cm_id_priv->state = IW_CM_STATE_DESTROYING;
                spin_unlock_irqrestore(&cm_id_priv->lock, flags);
                /* Abrupt close of the connection */
-               (void)iwcm_modify_qp_err(cm_id_priv->qp);
+               (void)iwcm_modify_qp_err(qp);
                spin_lock_irqsave(&cm_id_priv->lock, flags);
                break;
        case IW_CM_STATE_IDLE:
@@ -426,11 +430,9 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
                BUG();
                break;
        }
-       if (cm_id_priv->qp) {
-               cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
-               cm_id_priv->qp = NULL;
-       }
        spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+       if (qp)
+               cm_id_priv->id.device->ops.iw_rem_ref(qp);

        if (cm_id->mapped) {
                iwpm_remove_mapinfo(&cm_id->local_addr,
&cm_id->m_local_addr);
@@ -671,11 +673,11 @@ int iw_cm_accept(struct iw_cm_id *cm_id,
                BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
                cm_id_priv->state = IW_CM_STATE_IDLE;
                spin_lock_irqsave(&cm_id_priv->lock, flags);
-               if (cm_id_priv->qp) {
-                       cm_id->device->ops.iw_rem_ref(qp);
-                       cm_id_priv->qp = NULL;
-               }
+               qp = cm_id_priv->qp;
+               cm_id_priv->qp = NULL;
                spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+               if (qp)
+                       cm_id->device->ops.iw_rem_ref(qp);
                clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
                wake_up_all(&cm_id_priv->connect_wait);
        }
@@ -696,7 +698,7 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct
iw_cm_conn_param *iw_param)
        struct iwcm_id_private *cm_id_priv;
        int ret;
        unsigned long flags;
-       struct ib_qp *qp;
+       struct ib_qp *qp = NULL;

        cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);

@@ -730,13 +732,13 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct
iw_cm_conn_param *iw_param)
                return 0;       /* success */

        spin_lock_irqsave(&cm_id_priv->lock, flags);
-       if (cm_id_priv->qp) {
-               cm_id->device->ops.iw_rem_ref(qp);
-               cm_id_priv->qp = NULL;
-       }
+       qp = cm_id_priv->qp;
+       cm_id_priv->qp = NULL;
        cm_id_priv->state = IW_CM_STATE_IDLE;
 err:
        spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+       if (qp)
+               cm_id->device->ops.iw_rem_ref(qp);
        clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
        wake_up_all(&cm_id_priv->connect_wait);
        return ret;
@@ -878,6 +880,7 @@ static int cm_conn_est_handler(struct
iwcm_id_private *cm_id_priv,
 static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
                               struct iw_cm_event *iw_event)
 {
+       struct ib_qp *qp = NULL;
        unsigned long flags;
        int ret;

@@ -896,11 +899,13 @@ static int cm_conn_rep_handler(struct
iwcm_id_private *cm_id_priv,
                cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
        } else {
                /* REJECTED or RESET */
-               cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
+               qp = cm_id_priv->qp;
                cm_id_priv->qp = NULL;
                cm_id_priv->state = IW_CM_STATE_IDLE;
        }
        spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+       if (qp)
+               cm_id_priv->id.device->ops.iw_rem_ref(qp);
        ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);

        if (iw_event->private_data_len)
@@ -942,14 +947,13 @@ static void cm_disconnect_handler(struct
iwcm_id_private *cm_id_priv,
 static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
                                  struct iw_cm_event *iw_event)
 {
+       struct ib_qp *qp;
        unsigned long flags;
        int ret = 0;
        spin_lock_irqsave(&cm_id_priv->lock, flags);
+       qp = cm_id_priv->qp;
+       cm_id_priv->qp = NULL;

-       if (cm_id_priv->qp) {
-               cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
-               cm_id_priv->qp = NULL;
-       }
        switch (cm_id_priv->state) {
        case IW_CM_STATE_ESTABLISHED:
        case IW_CM_STATE_CLOSING:
@@ -965,6 +969,8 @@ static int cm_close_handler(struct iwcm_id_private
*cm_id_priv,
        }
        spin_unlock_irqrestore(&cm_id_priv->lock, flags);

+       if (qp)
+               cm_id_priv->id.device->ops.iw_rem_ref(qp);
        return ret;
 }

diff --git a/drivers/infiniband/hw/cxgb4/device.c
b/drivers/infiniband/hw/cxgb4/device.c
index a8b9548..330483d 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -747,6 +747,7 @@ void c4iw_release_dev_ucontext(struct c4iw_rdev
*rdev,
        struct list_head *pos, *nxt;
        struct c4iw_qid_list *entry;

+       wait_event(uctx->wait, refcount_read(&uctx->refcnt) == 1);
        mutex_lock(&uctx->lock);
        list_for_each_safe(pos, nxt, &uctx->qpids) {
                entry = list_entry(pos, struct c4iw_qid_list, entry);
@@ -775,6 +776,8 @@ void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev,
        INIT_LIST_HEAD(&uctx->qpids);
        INIT_LIST_HEAD(&uctx->cqids);
        mutex_init(&uctx->lock);
+       init_waitqueue_head(&uctx->wait);
+       refcount_set(&uctx->refcnt, 1);
 }

 /* Caller takes care of locking if needed */
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 7d06b0f..441adc6 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -110,6 +110,8 @@ struct c4iw_dev_ucontext {
        struct list_head cqids;
        struct mutex lock;
        struct kref kref;
+       wait_queue_head_t wait;
+       refcount_t refcnt;
 };

 enum c4iw_rdev_flags {
@@ -490,13 +492,13 @@ struct c4iw_qp {
        struct t4_wq wq;
        spinlock_t lock;
        struct mutex mutex;
+       struct kref kref;
        wait_queue_head_t wait;
        int sq_sig_all;
        struct c4iw_srq *srq;
+       struct work_struct free_work;
        struct c4iw_ucontext *ucontext;
        struct c4iw_wr_wait *wr_waitp;
-       struct completion qp_rel_comp;
-       refcount_t qp_refcnt;
 };

 static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp)
@@ -531,6 +533,7 @@ struct c4iw_ucontext {
        spinlock_t mmap_lock;
        struct list_head mmaps;
        bool is_32b_cqe;
+       struct completion qp_comp;
 };

 static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext
*c)
diff --git a/drivers/infiniband/hw/cxgb4/qp.c
b/drivers/infiniband/hw/cxgb4/qp.c
index eb9368b..ca61193 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -889,17 +889,44 @@ static int build_inv_stag(union t4_wr *wqe, const
struct ib_send_wr *wr,
        return 0;
 }

+static void free_qp_work(struct work_struct *work)
+{
+       struct c4iw_dev_ucontext *uctx;
+       struct c4iw_qp *qhp;
+       struct c4iw_dev *rhp;
+
+       qhp = container_of(work, struct c4iw_qp, free_work);
+       rhp = qhp->rhp;
+       uctx = qhp->ucontext ? &qhp->ucontext->uctx : &rhp->rdev.uctx;
+
+       pr_debug("qhp %p ucontext %p\n", qhp, qhp->ucontext);
+       destroy_qp(&rhp->rdev, &qhp->wq, uctx, !qhp->srq);
+
+       c4iw_put_wr_wait(qhp->wr_waitp);
+       kfree(qhp);
+       refcount_dec(&uctx->refcnt);
+       wake_up(&uctx->wait);
+}
+
+static void queue_qp_free(struct kref *kref)
+{
+       struct c4iw_qp *qhp;
+
+       qhp = container_of(kref, struct c4iw_qp, kref);
+       pr_debug("qhp %p\n", qhp);
+       queue_work(qhp->rhp->rdev.free_workq, &qhp->free_work);
+}
+
 void c4iw_qp_add_ref(struct ib_qp *qp)
 {
        pr_debug("ib_qp %p\n", qp);
-       refcount_inc(&to_c4iw_qp(qp)->qp_refcnt);
+       kref_get(&to_c4iw_qp(qp)->kref);
 }

 void c4iw_qp_rem_ref(struct ib_qp *qp)
 {
        pr_debug("ib_qp %p\n", qp);
-       if (refcount_dec_and_test(&to_c4iw_qp(qp)->qp_refcnt))
-               complete(&to_c4iw_qp(qp)->qp_rel_comp);
+       kref_put(&to_c4iw_qp(qp)->kref, queue_qp_free);
 }

 static void add_to_fc_list(struct list_head *head, struct list_head
*entry)
@@ -2071,12 +2098,10 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp, struct
ib_udata *udata)
 {
        struct c4iw_dev *rhp;
        struct c4iw_qp *qhp;
-       struct c4iw_ucontext *ucontext;
        struct c4iw_qp_attributes attrs;

        qhp = to_c4iw_qp(ib_qp);
        rhp = qhp->rhp;
-       ucontext = qhp->ucontext;

        attrs.next_state = C4IW_QP_STATE_ERROR;
        if (qhp->attr.state == C4IW_QP_STATE_TERMINATE)
@@ -2094,17 +2119,7 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp, struct
ib_udata *udata)

        c4iw_qp_rem_ref(ib_qp);

-       wait_for_completion(&qhp->qp_rel_comp);
-
        pr_debug("ib_qp %p qpid 0x%0x\n", ib_qp, qhp->wq.sq.qid);
-       pr_debug("qhp %p ucontext %p\n", qhp, ucontext);
-
-       destroy_qp(&rhp->rdev, &qhp->wq,
-                  ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
                   !qhp->srq);
-
-       c4iw_put_wr_wait(qhp->wr_waitp);
-
-       kfree(qhp);
        return 0;
 }

@@ -2214,8 +2229,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd,
struct ib_qp_init_attr *attrs,
        spin_lock_init(&qhp->lock);
        mutex_init(&qhp->mutex);
        init_waitqueue_head(&qhp->wait);
-       init_completion(&qhp->qp_rel_comp);
-       refcount_set(&qhp->qp_refcnt, 1);
+       kref_init(&qhp->kref);
+       INIT_WORK(&qhp->free_work, free_qp_work);

        ret = xa_insert_irq(&rhp->qps, qhp->wq.sq.qid, qhp, GFP_KERNEL);
        if (ret)
@@ -2335,6 +2350,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd,
struct ib_qp_init_attr *attrs,
        if (attrs->srq)
                qhp->srq = to_c4iw_srq(attrs->srq);
        INIT_LIST_HEAD(&qhp->db_fc_entry);
+       refcount_inc(ucontext ? &ucontext->uctx.refcnt :
+                               &rhp->rdev.uctx.refcnt);
        pr_debug("sq id %u size %u memsize %zu num_entries %u rq id %u
size %u memsize %zu num_entries %u\n",
                 qhp->wq.sq.qid, qhp->wq.sq.size, qhp->wq.sq.memsize,
                 attrs->cap.max_send_wr, qhp->wq.rq.qid,
qhp->wq.rq.size,
diff --git a/drivers/infiniband/sw/siw/siw_qp.c
b/drivers/infiniband/sw/siw/siw_qp.c
index 0990307..743d3d2 100644
--- a/drivers/infiniband/sw/siw/siw_qp.c
+++ b/drivers/infiniband/sw/siw/siw_qp.c
@@ -1305,6 +1305,7 @@ int siw_qp_add(struct siw_device *sdev, struct
siw_qp *qp)
 void siw_free_qp(struct kref *ref)
 {
        struct siw_qp *found, *qp = container_of(ref, struct siw_qp,
ref);
+       struct siw_base_qp *siw_base_qp = to_siw_base_qp(qp->ib_qp);
        struct siw_device *sdev = qp->sdev;
        unsigned long flags;

@@ -1327,4 +1328,5 @@ void siw_free_qp(struct kref *ref)
        atomic_dec(&sdev->num_qp);
        siw_dbg_qp(qp, "free QP\n");
        kfree_rcu(qp, rcu);
+       kfree(siw_base_qp);
 }
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c
b/drivers/infiniband/sw/siw/siw_verbs.c
index 03176a3..35eee3f 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -605,7 +605,6 @@ int siw_verbs_modify_qp(struct ib_qp *base_qp,
struct ib_qp_attr *attr,
 int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
 {
        struct siw_qp *qp = to_siw_qp(base_qp);
-       struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp);
        struct siw_ucontext *uctx =
                rdma_udata_to_drv_context(udata, struct siw_ucontext,
                                          base_ucontext);
@@ -642,7 +641,6 @@ int siw_destroy_qp(struct ib_qp *base_qp, struct
ib_udata *udata)
        qp->scq = qp->rcq = NULL;

        siw_qp_put(qp);
-       kfree(siw_base_qp);

        return 0;
 }

> On Tuesday, September 09/17/19, 2019 at 22:50:27 +0530, Sagi Grimberg wrote:
> > 
> > >> To: "Krishnamraju Eraparaju" <krishna2@xxxxxxxxxxx>
> > >> From: "Jason Gunthorpe" <jgg@xxxxxxxx>
> > >> Date: 09/16/2019 06:28PM
> > >> Cc: "Steve Wise" <larrystevenwise@xxxxxxxxx>, "Bernard Metzler"
> > >> <BMT@xxxxxxxxxxxxxx>, "Sagi Grimberg" <sagi@xxxxxxxxxxx>,
> > >> "linux-rdma@xxxxxxxxxxxxxxx" <linux-rdma@xxxxxxxxxxxxxxx>
> > >> Subject: [EXTERNAL] Re: Re: [PATCH v3] iwcm: don't hold the irq
> > >> disabled lock on iw_rem_ref
> > >>
> > >> On Wed, Sep 11, 2019 at 09:28:16PM +0530, Krishnamraju Eraparaju
> > >> wrote:
> > >>> Hi Steve & Bernard,
> > >>>
> > >>> Thanks for the review comments.
> > >>> I will do those formating changes.
> > >>
> > >> I don't see anything in patchworks, but the consensus is to drop
> > >> Sagi's patch pending this future patch?
> > >>
> > >> Jason
> > >>
> > > This is my impression as well. But consensus should be
> > > explicit...Sagi, what do you think?
> > 
> > I don't really care, but given the changes from Krishnamraju cause other
> > problems I'd ask if my version is also offending his test.
> Hi Sagi,
> Your version holds good for rdma_destroy_id() path only, but there are
> other places where iw_rem_ref() is being called within spinlocks, here is
> the call trace when iw_rem_ref() is called in cm_close_handler() path:
> [  627.716339] Call Trace:
> [  627.716339]  ? load_new_mm_cr3+0xc0/0xc0
> [  627.716339]  on_each_cpu+0x23/0x40
> [  627.716339]  flush_tlb_kernel_range+0x74/0x80
> [  627.716340]  free_unmap_vmap_area+0x2a/0x40
> [  627.716340]  remove_vm_area+0x59/0x60
> [  627.716340]  __vunmap+0x46/0x210
> [  627.716340]  siw_free_qp+0x88/0x120 [siw]
> [  627.716340]  cm_work_handler+0x5b8/0xd00  <=====
> [  627.716340]  process_one_work+0x155/0x380
> [  627.716341]  worker_thread+0x41/0x3b0
> [  627.716341]  kthread+0xf3/0x130
> [  627.716341]  ? process_one_work+0x380/0x380
> [  627.716341]  ? kthread_bind+0x10/0x10
> [  627.716341]  ret_from_fork+0x35/0x40
> 
> Hence, I moved all the occurances of iw_rem_ref() outside of spinlocks
> critical section.
> > 
> > In general, I do not think that making resources free routines (both
> > explict or implicit via ref dec) under a spinlock is not a robust
> > design.
> > 
> > I would first make it clear and documented what cm_id_priv->lock is
> > protecting. In my mind, it should protect *its own* mutations of
> > cm_id_priv and by design leave all the ops calls outside the lock.
> > 
> > I don't understand what is causing the Chelsio issue observed, but
> > it looks like c4iw_destroy_qp blocks on a completion that depends on a
> > refcount that is taken also by iwcm, which means that I cannot call
> > ib_destroy_qp if the cm is not destroyed as well?
> > 
> > If that is madatory, I'd say that instead of blocking on this
> > completion, we can simply convert c4iw_qp_rem_ref if use a kref
> > which is not order dependent.
> 
> I agree,
> I'm exploring best possible ways to address this issue, will update my
> final resolution by this Friday.



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux