Re: [PATCH for-next] i40iw: Add support to make destroy QP synchronous

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Sep 16, 2020 at 08:18:12AM -0500, Shiraz Saleem wrote:
> From: "Sindhu, Devale" <sindhu.devale@xxxxxxxxx>
>
> Occasionally ib_write_bw crash is seen due to
> access of a pd object in i40iw_sc_qp_destroy after it
> is freed. Destroy qp is not synchronous in i40iw and
> thus the iwqp object could be referencing a pd object
> that is freed by ib core as a result of successful
> return from i40iw_destroy_qp.
>
> Wait in i40iw_destroy_qp till all QP references are released
> and destroy the QP and its associated resources before returning.
> Switch to use the refcount API vs atomic API for lifetime
> management of the qp.
>
>  RIP: 0010:i40iw_sc_qp_destroy+0x4b/0x120 [i40iw]
>  [...]
>  RSP: 0018:ffffb4a7042e3ba8 EFLAGS: 00010002
>  RAX: 0000000000000000 RBX: 0000000000000001 RCX: dead000000000122
>  RDX: ffffb4a7042e3bac RSI: ffff8b7ef9b1e940 RDI: ffff8b7efbf09080
>  RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
>  R10: 8080808080808080 R11: 0000000000000010 R12: ffff8b7efbf08050
>  R13: 0000000000000001 R14: ffff8b7f15042928 R15: ffff8b7ef9b1e940
>  FS:  0000000000000000(0000) GS:ffff8b7f2fa00000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: 0000000000000400 CR3: 000000020d60a006 CR4: 00000000001606e0
>  Call Trace:
>   i40iw_exec_cqp_cmd+0x4d3/0x5c0 [i40iw]
>   ? try_to_wake_up+0x1ea/0x5d0
>   ? __switch_to_asm+0x40/0x70
>   i40iw_process_cqp_cmd+0x95/0xa0 [i40iw]
>   i40iw_handle_cqp_op+0x42/0x1a0 [i40iw]
>   ? cm_event_handler+0x13c/0x1f0 [iw_cm]
>   i40iw_rem_ref+0xa0/0xf0 [i40iw]
>   cm_work_handler+0x99c/0xd10 [iw_cm]
>   process_one_work+0x1a1/0x360
>   worker_thread+0x30/0x380
>   ? process_one_work+0x360/0x360
>   kthread+0x10c/0x130
>   ? kthread_park+0x80/0x80
>   ret_from_fork+0x35/0x40
>
> Fixes: d37498417947 ("i40iw: add files for iwarp interface")
> Reported-by: Kamal Heib <kheib@xxxxxxxxxx>
> Signed-off-by: Sindhu, Devale <sindhu.devale@xxxxxxxxx>
> Signed-off-by: Shiraz, Saleem <shiraz.saleem@xxxxxxxxx>
> ---
>  drivers/infiniband/hw/i40iw/i40iw.h       |  9 +++--
>  drivers/infiniband/hw/i40iw/i40iw_cm.c    | 10 +++---
>  drivers/infiniband/hw/i40iw/i40iw_hw.c    |  4 +--
>  drivers/infiniband/hw/i40iw/i40iw_utils.c | 59 ++++++-------------------------
>  drivers/infiniband/hw/i40iw/i40iw_verbs.c | 31 +++++++++++-----
>  drivers/infiniband/hw/i40iw/i40iw_verbs.h |  3 +-
>  6 files changed, 45 insertions(+), 71 deletions(-)
>
> diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h
> index 25747b8..832b80d 100644
> --- a/drivers/infiniband/hw/i40iw/i40iw.h
> +++ b/drivers/infiniband/hw/i40iw/i40iw.h
> @@ -409,8 +409,8 @@ static inline struct i40iw_qp *to_iwqp(struct ib_qp *ibqp)
>  }
>
>  /* i40iw.c */
> -void i40iw_add_ref(struct ib_qp *);
> -void i40iw_rem_ref(struct ib_qp *);
> +void i40iw_qp_add_ref(struct ib_qp *ibqp);
> +void i40iw_qp_rem_ref(struct ib_qp *ibqp);
>  struct ib_qp *i40iw_get_qp(struct ib_device *, int);
>
>  void i40iw_flush_wqes(struct i40iw_device *iwdev,
> @@ -554,9 +554,8 @@ enum i40iw_status_code i40iw_manage_qhash(struct i40iw_device *iwdev,
>  					  bool wait);
>  void i40iw_receive_ilq(struct i40iw_sc_vsi *vsi, struct i40iw_puda_buf *rbuf);
>  void i40iw_free_sqbuf(struct i40iw_sc_vsi *vsi, void *bufp);
> -void i40iw_free_qp_resources(struct i40iw_device *iwdev,
> -			     struct i40iw_qp *iwqp,
> -			     u32 qp_num);
> +void i40iw_free_qp_resources(struct i40iw_qp *iwqp);
> +
>  enum i40iw_status_code i40iw_obj_aligned_mem(struct i40iw_device *iwdev,
>  					     struct i40iw_dma_mem *memptr,
>  					     u32 size, u32 mask);
> diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c
> index a3b9580..3053c34 100644
> --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
> +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
> @@ -2322,7 +2322,7 @@ static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node)
>  	iwqp = cm_node->iwqp;
>  	if (iwqp) {
>  		iwqp->cm_node = NULL;
> -		i40iw_rem_ref(&iwqp->ibqp);
> +		i40iw_qp_rem_ref(&iwqp->ibqp);
>  		cm_node->iwqp = NULL;
>  	} else if (cm_node->qhash_set) {
>  		i40iw_get_addr_info(cm_node, &nfo);
> @@ -3452,7 +3452,7 @@ void i40iw_cm_disconn(struct i40iw_qp *iwqp)
>  		kfree(work);
>  		return;
>  	}
> -	i40iw_add_ref(&iwqp->ibqp);
> +	i40iw_qp_add_ref(&iwqp->ibqp);
>  	spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
>
>  	work->iwqp = iwqp;
> @@ -3623,7 +3623,7 @@ static void i40iw_disconnect_worker(struct work_struct *work)
>
>  	kfree(dwork);
>  	i40iw_cm_disconn_true(iwqp);
> -	i40iw_rem_ref(&iwqp->ibqp);
> +	i40iw_qp_rem_ref(&iwqp->ibqp);
>  }
>
>  /**
> @@ -3745,7 +3745,7 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
>  	cm_node->lsmm_size = accept.size + conn_param->private_data_len;
>  	i40iw_cm_init_tsa_conn(iwqp, cm_node);
>  	cm_id->add_ref(cm_id);
> -	i40iw_add_ref(&iwqp->ibqp);
> +	i40iw_qp_add_ref(&iwqp->ibqp);
>
>  	attr.qp_state = IB_QPS_RTS;
>  	cm_node->qhash_set = false;
> @@ -3908,7 +3908,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
>  	iwqp->cm_node = cm_node;
>  	cm_node->iwqp = iwqp;
>  	iwqp->cm_id = cm_id;
> -	i40iw_add_ref(&iwqp->ibqp);
> +	i40iw_qp_add_ref(&iwqp->ibqp);
>
>  	if (cm_node->state != I40IW_CM_STATE_OFFLOADED) {
>  		cm_node->state = I40IW_CM_STATE_SYN_SENT;
> diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c
> index e108563..56fdc16 100644
> --- a/drivers/infiniband/hw/i40iw/i40iw_hw.c
> +++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c
> @@ -313,7 +313,7 @@ void i40iw_process_aeq(struct i40iw_device *iwdev)
>  					    __func__, info->qp_cq_id);
>  				continue;
>  			}
> -			i40iw_add_ref(&iwqp->ibqp);
> +			i40iw_qp_add_ref(&iwqp->ibqp);
>  			spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
>  			qp = &iwqp->sc_qp;
>  			spin_lock_irqsave(&iwqp->lock, flags);
> @@ -426,7 +426,7 @@ void i40iw_process_aeq(struct i40iw_device *iwdev)
>  			break;
>  		}
>  		if (info->qp)
> -			i40iw_rem_ref(&iwqp->ibqp);
> +			i40iw_qp_rem_ref(&iwqp->ibqp);
>  	} while (1);
>
>  	if (aeqcnt)
> diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c
> index e07fb37a..5e196bd 100644
> --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
> +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c
> @@ -478,25 +478,6 @@ void i40iw_cleanup_pending_cqp_op(struct i40iw_device *iwdev)
>  }
>
>  /**
> - * i40iw_free_qp - callback after destroy cqp completes
> - * @cqp_request: cqp request for destroy qp
> - * @num: not used
> - */
> -static void i40iw_free_qp(struct i40iw_cqp_request *cqp_request, u32 num)
> -{
> -	struct i40iw_sc_qp *qp = (struct i40iw_sc_qp *)cqp_request->param;
> -	struct i40iw_qp *iwqp = (struct i40iw_qp *)qp->back_qp;
> -	struct i40iw_device *iwdev;
> -	u32 qp_num = iwqp->ibqp.qp_num;
> -
> -	iwdev = iwqp->iwdev;
> -
> -	i40iw_rem_pdusecount(iwqp->iwpd, iwdev);
> -	i40iw_free_qp_resources(iwdev, iwqp, qp_num);
> -	i40iw_rem_devusecount(iwdev);
> -}
> -
> -/**
>   * i40iw_wait_event - wait for completion
>   * @iwdev: iwarp device
>   * @cqp_request: cqp request to wait
> @@ -616,26 +597,23 @@ void i40iw_rem_pdusecount(struct i40iw_pd *iwpd, struct i40iw_device *iwdev)
>  }
>
>  /**
> - * i40iw_add_ref - add refcount for qp
> + * i40iw_qp_add_ref - add refcount for qp
>   * @ibqp: iqarp qp
>   */
> -void i40iw_add_ref(struct ib_qp *ibqp)
> +void i40iw_qp_add_ref(struct ib_qp *ibqp)
>  {
>  	struct i40iw_qp *iwqp = (struct i40iw_qp *)ibqp;
>
> -	atomic_inc(&iwqp->refcount);
> +	refcount_inc(&iwqp->refcount);
>  }
>
>  /**
> - * i40iw_rem_ref - rem refcount for qp and free if 0
> + * i40iw_qp_rem_ref - rem refcount for qp and free if 0
>   * @ibqp: iqarp qp
>   */
> -void i40iw_rem_ref(struct ib_qp *ibqp)
> +void i40iw_qp_rem_ref(struct ib_qp *ibqp)
>  {
>  	struct i40iw_qp *iwqp;
> -	enum i40iw_status_code status;
> -	struct i40iw_cqp_request *cqp_request;
> -	struct cqp_commands_info *cqp_info;
>  	struct i40iw_device *iwdev;
>  	u32 qp_num;
>  	unsigned long flags;
> @@ -643,7 +621,7 @@ void i40iw_rem_ref(struct ib_qp *ibqp)
>  	iwqp = to_iwqp(ibqp);
>  	iwdev = iwqp->iwdev;
>  	spin_lock_irqsave(&iwdev->qptable_lock, flags);
> -	if (!atomic_dec_and_test(&iwqp->refcount)) {
> +	if (!refcount_dec_and_test(&iwqp->refcount)) {
>  		spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
>  		return;
>  	}
> @@ -651,25 +629,8 @@ void i40iw_rem_ref(struct ib_qp *ibqp)
>  	qp_num = iwqp->ibqp.qp_num;
>  	iwdev->qp_table[qp_num] = NULL;
>  	spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
> -	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false);
> -	if (!cqp_request)
> -		return;
> -
> -	cqp_request->callback_fcn = i40iw_free_qp;
> -	cqp_request->param = (void *)&iwqp->sc_qp;
> -	cqp_info = &cqp_request->info;
> -	cqp_info->cqp_cmd = OP_QP_DESTROY;
> -	cqp_info->post_sq = 1;
> -	cqp_info->in.u.qp_destroy.qp = &iwqp->sc_qp;
> -	cqp_info->in.u.qp_destroy.scratch = (uintptr_t)cqp_request;
> -	cqp_info->in.u.qp_destroy.remove_hash_idx = true;
> -	status = i40iw_handle_cqp_op(iwdev, cqp_request);
> -	if (!status)
> -		return;
> +	complete(&iwqp->free_qp);
>
> -	i40iw_rem_pdusecount(iwqp->iwpd, iwdev);
> -	i40iw_free_qp_resources(iwdev, iwqp, qp_num);
> -	i40iw_rem_devusecount(iwdev);
>  }
>
>  /**
> @@ -936,7 +897,7 @@ static void i40iw_terminate_timeout(struct timer_list *t)
>  	struct i40iw_sc_qp *qp = (struct i40iw_sc_qp *)&iwqp->sc_qp;
>
>  	i40iw_terminate_done(qp, 1);
> -	i40iw_rem_ref(&iwqp->ibqp);
> +	i40iw_qp_rem_ref(&iwqp->ibqp);
>  }
>
>  /**
> @@ -948,7 +909,7 @@ void i40iw_terminate_start_timer(struct i40iw_sc_qp *qp)
>  	struct i40iw_qp *iwqp;
>
>  	iwqp = (struct i40iw_qp *)qp->back_qp;
> -	i40iw_add_ref(&iwqp->ibqp);
> +	i40iw_qp_add_ref(&iwqp->ibqp);
>  	timer_setup(&iwqp->terminate_timer, i40iw_terminate_timeout, 0);
>  	iwqp->terminate_timer.expires = jiffies + HZ;
>  	add_timer(&iwqp->terminate_timer);
> @@ -964,7 +925,7 @@ void i40iw_terminate_del_timer(struct i40iw_sc_qp *qp)
>
>  	iwqp = (struct i40iw_qp *)qp->back_qp;
>  	if (del_timer(&iwqp->terminate_timer))
> -		i40iw_rem_ref(&iwqp->ibqp);
> +		i40iw_qp_rem_ref(&iwqp->ibqp);
>  }
>
>  /**
> diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
> index 4511e17..6ade1ea 100644
> --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
> +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
> @@ -364,11 +364,11 @@ static struct i40iw_pbl *i40iw_get_pbl(unsigned long va,
>   * @iwqp: qp ptr (user or kernel)
>   * @qp_num: qp number assigned
>   */
> -void i40iw_free_qp_resources(struct i40iw_device *iwdev,
> -			     struct i40iw_qp *iwqp,
> -			     u32 qp_num)
> +void i40iw_free_qp_resources(struct i40iw_qp *iwqp)
>  {
>  	struct i40iw_pbl *iwpbl = &iwqp->iwpbl;
> +	struct i40iw_device *iwdev = iwqp->iwdev;
> +	u32 qp_num = iwqp->ibqp.qp_num;
>
>  	i40iw_ieq_cleanup_qp(iwdev->vsi.ieq, &iwqp->sc_qp);
>  	i40iw_dealloc_push_page(iwdev, &iwqp->sc_qp);
> @@ -402,6 +402,10 @@ static void i40iw_clean_cqes(struct i40iw_qp *iwqp, struct i40iw_cq *iwcq)
>  static int i40iw_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
>  {
>  	struct i40iw_qp *iwqp = to_iwqp(ibqp);
> +	struct ib_qp_attr attr;
> +	struct i40iw_device *iwdev = iwqp->iwdev;
> +
> +	memset(&attr, 0, sizeof(attr));
>
>  	iwqp->destroyed = 1;
>
> @@ -416,7 +420,15 @@ static int i40iw_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
>  		}
>  	}
>
> -	i40iw_rem_ref(&iwqp->ibqp);
> +	attr.qp_state = IB_QPS_ERR;
> +	i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL);
> +	i40iw_qp_rem_ref(&iwqp->ibqp);
> +	wait_for_completion(&iwqp->free_qp);

I always wanted to ask, why do iWARP devices have this gp_get_ref/qp_put_ref
logic? Does it come from iw_cm in-kernel implementation of from some iWARP
specification?

Thanks



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux