Re: [PATCH v3] iwcm: don't hold the irq disabled lock on iw_rem_ref

Submitted the final version of my proposed patch at:
https://patchwork.kernel.org/patch/11169293/

Please review.

Thanks,
Krishna.
On Sunday, September 22, 2019 at 00:05:08 +0530, Krishnamraju Eraparaju wrote:
> On Wednesday, September 18, 2019 at 19:13:25 +0530, Krishnamraju Eraparaju wrote:
> Hi,
> 
> I have restructured the iw_cxgb4 driver to support the proposed iwcm
> changes. Issues addressed by this patch:
> - All iw_rem_ref() calls from iwcm are now outside the spinlock
>   critical section (see the sketch after this list).
> - Fixes the early freeing of siw_base_qp in siw.
> - ib_destroy_qp() for iw_cxgb4 no longer blocks, even when called
>   before the CM id is destroyed.
> We are currently testing these changes and will submit the patch next
> week.
> 
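> A minimal sketch of that locking pattern (illustration only, not part
> of the patch below; the names mirror iwcm.c): detach the QP pointer
> while holding the lock, and drop the reference only after the lock is
> released, because the driver's release path may sleep:
> 
>     struct ib_qp *qp;
>     unsigned long flags;
> 
>     spin_lock_irqsave(&cm_id_priv->lock, flags);
>     qp = cm_id_priv->qp;    /* take over the reference under the lock */
>     cm_id_priv->qp = NULL;  /* so no other path drops it twice */
>     spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> 
>     if (qp)                 /* release outside the critical section */
>             cm_id_priv->id.device->ops.iw_rem_ref(qp);
> 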
> PATCH:
> -----
> diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
> index 72141c5..7e1d834 100644
> --- a/drivers/infiniband/core/iwcm.c
> +++ b/drivers/infiniband/core/iwcm.c
> @@ -372,6 +372,7 @@ int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
>  static void destroy_cm_id(struct iw_cm_id *cm_id)
>  {
>         struct iwcm_id_private *cm_id_priv;
> +       struct ib_qp *qp;
>         unsigned long flags;
> 
>         cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> @@ -389,6 +390,9 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
>         set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags);
> 
>         spin_lock_irqsave(&cm_id_priv->lock, flags);
> +       qp = cm_id_priv->qp;
> +       cm_id_priv->qp = NULL;
> +
>         switch (cm_id_priv->state) {
>         case IW_CM_STATE_LISTEN:
>                 cm_id_priv->state = IW_CM_STATE_DESTROYING;
> @@ -401,7 +405,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
>                 cm_id_priv->state = IW_CM_STATE_DESTROYING;
>                 spin_unlock_irqrestore(&cm_id_priv->lock, flags);
>                 /* Abrupt close of the connection */
> -               (void)iwcm_modify_qp_err(cm_id_priv->qp);
> +               (void)iwcm_modify_qp_err(qp);
>                 spin_lock_irqsave(&cm_id_priv->lock, flags);
>                 break;
>         case IW_CM_STATE_IDLE:
> @@ -426,11 +430,9 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
>                 BUG();
>                 break;
>         }
> -       if (cm_id_priv->qp) {
> -               cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
> -               cm_id_priv->qp = NULL;
> -       }
>         spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +       if (qp)
> +               cm_id_priv->id.device->ops.iw_rem_ref(qp);
> 
>         if (cm_id->mapped) {
>                 iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr);
> @@ -671,11 +673,11 @@ int iw_cm_accept(struct iw_cm_id *cm_id,
>                 BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
>                 cm_id_priv->state = IW_CM_STATE_IDLE;
>                 spin_lock_irqsave(&cm_id_priv->lock, flags);
> -               if (cm_id_priv->qp) {
> -                       cm_id->device->ops.iw_rem_ref(qp);
> -                       cm_id_priv->qp = NULL;
> -               }
> +               qp = cm_id_priv->qp;
> +               cm_id_priv->qp = NULL;
>                 spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +               if (qp)
> +                       cm_id->device->ops.iw_rem_ref(qp);
>                 clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
>                 wake_up_all(&cm_id_priv->connect_wait);
>         }
> @@ -696,7 +698,7 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
>         struct iwcm_id_private *cm_id_priv;
>         int ret;
>         unsigned long flags;
> -       struct ib_qp *qp;
> +       struct ib_qp *qp = NULL;
> 
>         cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> 
> @@ -730,13 +732,13 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
>                 return 0;       /* success */
> 
>         spin_lock_irqsave(&cm_id_priv->lock, flags);
> -       if (cm_id_priv->qp) {
> -               cm_id->device->ops.iw_rem_ref(qp);
> -               cm_id_priv->qp = NULL;
> -       }
> +       qp = cm_id_priv->qp;
> +       cm_id_priv->qp = NULL;
>         cm_id_priv->state = IW_CM_STATE_IDLE;
>  err:
>         spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +       if (qp)
> +               cm_id->device->ops.iw_rem_ref(qp);
>         clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
>         wake_up_all(&cm_id_priv->connect_wait);
>         return ret;
> @@ -878,6 +880,7 @@ static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
>  static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
>                                struct iw_cm_event *iw_event)
>  {
> +       struct ib_qp *qp = NULL;
>         unsigned long flags;
>         int ret;
> 
> @@ -896,11 +899,13 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
>                 cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
>         } else {
>                 /* REJECTED or RESET */
> -               cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
> +               qp = cm_id_priv->qp;
>                 cm_id_priv->qp = NULL;
>                 cm_id_priv->state = IW_CM_STATE_IDLE;
>         }
>         spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +       if (qp)
> +               cm_id_priv->id.device->ops.iw_rem_ref(qp);
>         ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
> 
>         if (iw_event->private_data_len)
> @@ -942,14 +947,13 @@ static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
>  static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
>                                   struct iw_cm_event *iw_event)
>  {
> +       struct ib_qp *qp;
>         unsigned long flags;
>         int ret = 0;
>         spin_lock_irqsave(&cm_id_priv->lock, flags);
> +       qp = cm_id_priv->qp;
> +       cm_id_priv->qp = NULL;
> 
> -       if (cm_id_priv->qp) {
> -               cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
> -               cm_id_priv->qp = NULL;
> -       }
>         switch (cm_id_priv->state) {
>         case IW_CM_STATE_ESTABLISHED:
>         case IW_CM_STATE_CLOSING:
> @@ -965,6 +969,8 @@ static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
>         }
>         spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> 
> +       if (qp)
> +               cm_id_priv->id.device->ops.iw_rem_ref(qp);
>         return ret;
>  }
> 
> diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
> index a8b9548..330483d 100644
> --- a/drivers/infiniband/hw/cxgb4/device.c
> +++ b/drivers/infiniband/hw/cxgb4/device.c
> @@ -747,6 +747,7 @@ void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
>         struct list_head *pos, *nxt;
>         struct c4iw_qid_list *entry;
> 
> +       wait_event(uctx->wait, refcount_read(&uctx->refcnt) == 1);
>         mutex_lock(&uctx->lock);
>         list_for_each_safe(pos, nxt, &uctx->qpids) {
>                 entry = list_entry(pos, struct c4iw_qid_list, entry);
> @@ -775,6 +776,8 @@ void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev,
>         INIT_LIST_HEAD(&uctx->qpids);
>         INIT_LIST_HEAD(&uctx->cqids);
>         mutex_init(&uctx->lock);
> +       init_waitqueue_head(&uctx->wait);
> +       refcount_set(&uctx->refcnt, 1);
>  }
> 
>  /* Caller takes care of locking if needed */
> diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
> index 7d06b0f..441adc6 100644
> --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
> +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
> @@ -110,6 +110,8 @@ struct c4iw_dev_ucontext {
>         struct list_head cqids;
>         struct mutex lock;
>         struct kref kref;
> +       wait_queue_head_t wait;
> +       refcount_t refcnt;
>  };
> 
>  enum c4iw_rdev_flags {
> @@ -490,13 +492,13 @@ struct c4iw_qp {
>         struct t4_wq wq;
>         spinlock_t lock;
>         struct mutex mutex;
> +       struct kref kref;
>         wait_queue_head_t wait;
>         int sq_sig_all;
>         struct c4iw_srq *srq;
> +       struct work_struct free_work;
>         struct c4iw_ucontext *ucontext;
>         struct c4iw_wr_wait *wr_waitp;
> -       struct completion qp_rel_comp;
> -       refcount_t qp_refcnt;
>  };
> 
>  static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp)
> @@ -531,6 +533,7 @@ struct c4iw_ucontext {
>         spinlock_t mmap_lock;
>         struct list_head mmaps;
>         bool is_32b_cqe;
> +       struct completion qp_comp;
>  };
> 
>  static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c)
> diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
> index eb9368b..ca61193 100644
> --- a/drivers/infiniband/hw/cxgb4/qp.c
> +++ b/drivers/infiniband/hw/cxgb4/qp.c
> @@ -889,17 +889,44 @@ static int build_inv_stag(union t4_wr *wqe, const struct ib_send_wr *wr,
>         return 0;
>  }
> 
> +static void free_qp_work(struct work_struct *work)
> +{
> +       struct c4iw_dev_ucontext *uctx;
> +       struct c4iw_qp *qhp;
> +       struct c4iw_dev *rhp;
> +
> +       qhp = container_of(work, struct c4iw_qp, free_work);
> +       rhp = qhp->rhp;
> +       uctx = qhp->ucontext ? &qhp->ucontext->uctx : &rhp->rdev.uctx;
> +
> +       pr_debug("qhp %p ucontext %p\n", qhp, qhp->ucontext);
> +       destroy_qp(&rhp->rdev, &qhp->wq, uctx, !qhp->srq);
> +
> +       c4iw_put_wr_wait(qhp->wr_waitp);
> +       kfree(qhp);
> +       refcount_dec(&uctx->refcnt);
> +       wake_up(&uctx->wait);
> +}
> +
> +static void queue_qp_free(struct kref *kref)
> +{
> +       struct c4iw_qp *qhp;
> +
> +       qhp = container_of(kref, struct c4iw_qp, kref);
> +       pr_debug("qhp %p\n", qhp);
> +       queue_work(qhp->rhp->rdev.free_workq, &qhp->free_work);
> +}
> +
>  void c4iw_qp_add_ref(struct ib_qp *qp)
>  {
>         pr_debug("ib_qp %p\n", qp);
> -       refcount_inc(&to_c4iw_qp(qp)->qp_refcnt);
> +       kref_get(&to_c4iw_qp(qp)->kref);
>  }
> 
>  void c4iw_qp_rem_ref(struct ib_qp *qp)
>  {
>         pr_debug("ib_qp %p\n", qp);
> -       if (refcount_dec_and_test(&to_c4iw_qp(qp)->qp_refcnt))
> -               complete(&to_c4iw_qp(qp)->qp_rel_comp);
> +       kref_put(&to_c4iw_qp(qp)->kref, queue_qp_free);
>  }
> 
>  static void add_to_fc_list(struct list_head *head, struct list_head *entry)
> @@ -2071,12 +2098,10 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
>  {
>         struct c4iw_dev *rhp;
>         struct c4iw_qp *qhp;
> -       struct c4iw_ucontext *ucontext;
>         struct c4iw_qp_attributes attrs;
> 
>         qhp = to_c4iw_qp(ib_qp);
>         rhp = qhp->rhp;
> -       ucontext = qhp->ucontext;
> 
>         attrs.next_state = C4IW_QP_STATE_ERROR;
>         if (qhp->attr.state == C4IW_QP_STATE_TERMINATE)
> @@ -2094,17 +2119,6 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
> 
>         c4iw_qp_rem_ref(ib_qp);
> 
> -       wait_for_completion(&qhp->qp_rel_comp);
> -
>         pr_debug("ib_qp %p qpid 0x%0x\n", ib_qp, qhp->wq.sq.qid);
> -       pr_debug("qhp %p ucontext %p\n", qhp, ucontext);
> -
> -       destroy_qp(&rhp->rdev, &qhp->wq,
> -                  ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
> -                  !qhp->srq);
> -
> -       c4iw_put_wr_wait(qhp->wr_waitp);
> -
> -       kfree(qhp);
>         return 0;
>  }
> 
> @@ -2214,8 +2229,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
>         spin_lock_init(&qhp->lock);
>         mutex_init(&qhp->mutex);
>         init_waitqueue_head(&qhp->wait);
> -       init_completion(&qhp->qp_rel_comp);
> -       refcount_set(&qhp->qp_refcnt, 1);
> +       kref_init(&qhp->kref);
> +       INIT_WORK(&qhp->free_work, free_qp_work);
> 
>         ret = xa_insert_irq(&rhp->qps, qhp->wq.sq.qid, qhp, GFP_KERNEL);
>         if (ret)
> @@ -2335,6 +2350,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
>         if (attrs->srq)
>                 qhp->srq = to_c4iw_srq(attrs->srq);
>         INIT_LIST_HEAD(&qhp->db_fc_entry);
> +       refcount_inc(ucontext ? &ucontext->uctx.refcnt :
> +                               &rhp->rdev.uctx.refcnt);
>         pr_debug("sq id %u size %u memsize %zu num_entries %u rq id %u
> size %u memsize %zu num_entries %u\n",
>                  qhp->wq.sq.qid, qhp->wq.sq.size, qhp->wq.sq.memsize,
>                  attrs->cap.max_send_wr, qhp->wq.rq.qid, qhp->wq.rq.size,
> diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c
> index 0990307..743d3d2 100644
> --- a/drivers/infiniband/sw/siw/siw_qp.c
> +++ b/drivers/infiniband/sw/siw/siw_qp.c
> @@ -1305,6 +1305,7 @@ int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp)
>  void siw_free_qp(struct kref *ref)
>  {
>         struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref);
> +       struct siw_base_qp *siw_base_qp = to_siw_base_qp(qp->ib_qp);
>         struct siw_device *sdev = qp->sdev;
>         unsigned long flags;
> 
> @@ -1327,4 +1328,5 @@ void siw_free_qp(struct kref *ref)
>         atomic_dec(&sdev->num_qp);
>         siw_dbg_qp(qp, "free QP\n");
>         kfree_rcu(qp, rcu);
> +       kfree(siw_base_qp);
>  }
> diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
> index 03176a3..35eee3f 100644
> --- a/drivers/infiniband/sw/siw/siw_verbs.c
> +++ b/drivers/infiniband/sw/siw/siw_verbs.c
> @@ -605,7 +605,6 @@ int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
>  int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
>  {
>         struct siw_qp *qp = to_siw_qp(base_qp);
> -       struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp);
>         struct siw_ucontext *uctx =
>                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
>                                           base_ucontext);
> @@ -642,7 +641,6 @@ int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
>         qp->scq = qp->rcq = NULL;
> 
>         siw_qp_put(qp);
> -       kfree(siw_base_qp);
> 
>         return 0;
>  }
> 
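> A note on the cxgb4 side of the patch: because the final iw_rem_ref()
> may now happen from the CM's work context, the actual QP free is
> deferred to a work item rather than having c4iw_destroy_qp() block on
> a completion. A generic, hedged sketch of the kref-to-workqueue idiom
> (simplified, hypothetical names, not the driver code itself):
> 
>     static void obj_free_work(struct work_struct *work)
>     {
>             struct obj *o = container_of(work, struct obj, free_work);
> 
>             /* Runs in process context, so sleeping teardown is safe. */
>             teardown_resources(o);  /* placeholder for destroy_qp() etc. */
>             kfree(o);
>     }
> 
>     static void obj_release(struct kref *kref)
>     {
>             struct obj *o = container_of(kref, struct obj, kref);
> 
>             /* May be called from atomic context: only queue the work. */
>             queue_work(obj_wq, &o->free_work);
>     }
> 
>     /* kref_put(&o->kref, obj_release) is then safe under a spinlock. */
> 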
> > On Tuesday, September 17, 2019 at 22:50:27 +0530, Sagi Grimberg wrote:
> > > 
> > > >> To: "Krishnamraju Eraparaju" <krishna2@xxxxxxxxxxx>
> > > >> From: "Jason Gunthorpe" <jgg@xxxxxxxx>
> > > >> Date: 09/16/2019 06:28PM
> > > >> Cc: "Steve Wise" <larrystevenwise@xxxxxxxxx>, "Bernard Metzler"
> > > >> <BMT@xxxxxxxxxxxxxx>, "Sagi Grimberg" <sagi@xxxxxxxxxxx>,
> > > >> "linux-rdma@xxxxxxxxxxxxxxx" <linux-rdma@xxxxxxxxxxxxxxx>
> > > >> Subject: [EXTERNAL] Re: Re: [PATCH v3] iwcm: don't hold the irq
> > > >> disabled lock on iw_rem_ref
> > > >>
> > > >> On Wed, Sep 11, 2019 at 09:28:16PM +0530, Krishnamraju Eraparaju
> > > >> wrote:
> > > >>> Hi Steve & Bernard,
> > > >>>
> > > >>> Thanks for the review comments.
> > > >>> I will do those formatting changes.
> > > >>
> > > >> I don't see anything in patchworks, but the consensus is to drop
> > > >> Sagi's patch pending this future patch?
> > > >>
> > > >> Jason
> > > >>
> > > This is my impression as well. But consensus should be
> > > explicit... Sagi, what do you think?
> > > 
> > > I don't really care, but given that the changes from Krishnamraju
> > > cause other problems, I'd ask whether my version also fails his test.
> > Hi Sagi,
> > Your version covers the rdma_destroy_id() path only, but there are
> > other places where iw_rem_ref() is called while holding a spinlock.
> > Here is the call trace when iw_rem_ref() is called in the
> > cm_close_handler() path:
> > [  627.716339] Call Trace:
> > [  627.716339]  ? load_new_mm_cr3+0xc0/0xc0
> > [  627.716339]  on_each_cpu+0x23/0x40
> > [  627.716339]  flush_tlb_kernel_range+0x74/0x80
> > [  627.716340]  free_unmap_vmap_area+0x2a/0x40
> > [  627.716340]  remove_vm_area+0x59/0x60
> > [  627.716340]  __vunmap+0x46/0x210
> > [  627.716340]  siw_free_qp+0x88/0x120 [siw]
> > [  627.716340]  cm_work_handler+0x5b8/0xd00  <=====
> > [  627.716340]  process_one_work+0x155/0x380
> > [  627.716341]  worker_thread+0x41/0x3b0
> > [  627.716341]  kthread+0xf3/0x130
> > [  627.716341]  ? process_one_work+0x380/0x380
> > [  627.716341]  ? kthread_bind+0x10/0x10
> > [  627.716341]  ret_from_fork+0x35/0x40
> > 
> > Hence, I moved all occurrences of iw_rem_ref() outside of the
> > spinlock critical sections.
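> > 
> > For clarity on why that trace is fatal: siw frees its QP memory with
> > vfree(), and the vunmap path ends in flush_tlb_kernel_range() ->
> > on_each_cpu(), which must not run with interrupts disabled. A minimal
> > sketch of the offending pattern (illustration only):
> > 
> >     spin_lock_irqsave(&cm_id_priv->lock, flags);
> >     /* last reference: may end in siw_free_qp() -> vfree() */
> >     cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
> >     spin_unlock_irqrestore(&cm_id_priv->lock, flags);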
> > > 
> > > In general, I do not think that calling resource-free routines
> > > (whether explicit, or implicit via a ref decrement) under a spinlock
> > > is a robust design.
> > > 
> > > I would first make clear, and document, what cm_id_priv->lock is
> > > protecting. In my mind, it should protect *its own* mutations of
> > > cm_id_priv and, by design, leave all the ops calls outside the lock.
> > > 
> > > I don't understand what is causing the observed Chelsio issue, but
> > > it looks like c4iw_destroy_qp blocks on a completion that depends on
> > > a refcount that is also taken by iwcm, which means that I cannot
> > > call ib_destroy_qp unless the cm is destroyed as well?
> > > 
> > > If that is mandatory, I'd say that instead of blocking on this
> > > completion, we can simply convert c4iw_qp_rem_ref to use a kref,
> > > which is not order dependent.
> > 
> > I agree.
> > I'm exploring the best possible ways to address this issue and will
> > post my final resolution by this Friday.


