Re: [PATCH] RDMA/rxe: Restore tasklet call for rxe_cq.c

On Thu, Jul 11, 2024 at 9:41 AM Honggang LI <honggangli@xxxxxxx> wrote:
>
> If ib_req_notify_cq() is called from the completion handler, a deadlock
> occurs in the receive path.
>
> rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
> krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]

What is rdma_krping? What is the deadlock?
Please explain the deadlock in detail.
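
My guess, from the symbol name, is that rdma_krping is the out-of-tree krping
kernel RDMA ping/pong test module, and that its completion handler follows the
usual kernel ULP pattern of re-arming the CQ and then polling it, something
like the sketch below (illustrative only, not the actual krping code). If that
is the case, it should be spelled out in the commit message.

/* Illustrative sketch of a typical kernel ULP completion handler
 * (not the actual krping code): re-arm notification, then drain.
 */
#include <rdma/ib_verbs.h>

static void demo_cq_event_handler(struct ib_cq *cq, void *ctx)
{
        struct ib_wc wc;

        /* Re-arm the CQ; for rxe this lands in rxe_req_notify_cq(). */
        ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);

        /* Drain completions that are already queued. */
        while (ib_poll_cq(cq, 1, &wc) == 1) {
                /* handle wc.status / wc.opcode here */
        }
}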


> rxe_cq_post+0x128/0x1d0 [rdma_rxe]
> ? copy_data+0xe0/0x230 [rdma_rxe]
> rxe_responder+0xbe8/0x23a0 [rdma_rxe]
> do_task+0x68/0x1e0 [rdma_rxe]
> ? __pfx_rxe_udp_encap_recv+0x10/0x10 [rdma_rxe]
> rxe_udp_encap_recv+0x79/0xe0 [rdma_rxe]
> udp_queue_rcv_one_skb+0x272/0x540
> udp_unicast_rcv_skb+0x76/0x90
> __udp4_lib_rcv+0xab2/0xd60
> ? raw_local_deliver+0xd2/0x2a0
> ip_protocol_deliver_rcu+0xd1/0x1b0
> ip_local_deliver_finish+0x76/0xa0
> ip_local_deliver+0x68/0x100
> ? ip_rcv_finish_core.isra.0+0xc2/0x430
> ip_sublist_rcv+0x2a0/0x340
> ip_list_rcv+0x13b/0x170
> __netif_receive_skb_list_core+0x2bb/0x2e0
> netif_receive_skb_list_internal+0x1cd/0x300
> napi_complete_done+0x72/0x200
> e1000e_poll+0xcf/0x320 [e1000e]
> __napi_poll+0x2b/0x160
> net_rx_action+0x2c6/0x3b0
> ? enqueue_hrtimer+0x42/0xa0
> handle_softirqs+0xf7/0x340
> ? sched_clock_cpu+0xf/0x1f0
> __irq_exit_rcu+0x97/0xb0
> common_interrupt+0x85/0xa0
>
> Fixes: 0c7e314a6352 ("RDMA/rxe: Fix rxe_cq_post")
> Fixes: 78b26a335310 ("RDMA/rxe: Remove tasklet call from rxe_cq.c")
> Signed-off-by: Honggang LI <honggangli@xxxxxxx>
> ---
>  drivers/infiniband/sw/rxe/rxe_cq.c    | 35 ++++++++++++++++++++++++---
>  drivers/infiniband/sw/rxe/rxe_loc.h   |  2 ++
>  drivers/infiniband/sw/rxe/rxe_verbs.c |  2 ++
>  drivers/infiniband/sw/rxe/rxe_verbs.h |  2 ++
>  4 files changed, 37 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c
> index fec87c9030ab..97a537994aee 100644
> --- a/drivers/infiniband/sw/rxe/rxe_cq.c
> +++ b/drivers/infiniband/sw/rxe/rxe_cq.c
> @@ -39,6 +39,21 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
>         return -EINVAL;
>  }
>
> +static void rxe_send_complete(struct tasklet_struct *t)
> +{
> +       struct rxe_cq *cq = from_tasklet(cq, t, comp_task);
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&cq->cq_lock, flags);
> +       if (cq->is_dying) {
> +               spin_unlock_irqrestore(&cq->cq_lock, flags);
> +               return;
> +       }
> +       spin_unlock_irqrestore(&cq->cq_lock, flags);
> +
> +       cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
> +}
> +
>  int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
>                      int comp_vector, struct ib_udata *udata,
>                      struct rxe_create_cq_resp __user *uresp)
> @@ -64,6 +79,10 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
>
>         cq->is_user = uresp;
>
> +       cq->is_dying = false;
> +
> +       tasklet_setup(&cq->comp_task, rxe_send_complete);
> +
>         spin_lock_init(&cq->cq_lock);
>         cq->ibcq.cqe = cqe;
>         return 0;
> @@ -84,7 +103,6 @@ int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe,
>         return err;
>  }
>
> -/* caller holds reference to cq */
>  int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
>  {
>         struct ib_event ev;
> @@ -113,17 +131,26 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
>
>         queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT);
>
> +       spin_unlock_irqrestore(&cq->cq_lock, flags);
> +
>         if ((cq->notify & IB_CQ_NEXT_COMP) ||
>             (cq->notify & IB_CQ_SOLICITED && solicited)) {
>                 cq->notify = 0;
> -               cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
> +               tasklet_schedule(&cq->comp_task);
>         }
>
> -       spin_unlock_irqrestore(&cq->cq_lock, flags);
> -
>         return 0;
>  }
>
> +void rxe_cq_disable(struct rxe_cq *cq)
> +{
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&cq->cq_lock, flags);
> +       cq->is_dying = true;
> +       spin_unlock_irqrestore(&cq->cq_lock, flags);
> +}
> +
>  void rxe_cq_cleanup(struct rxe_pool_elem *elem)
>  {
>         struct rxe_cq *cq = container_of(elem, typeof(*cq), elem);
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index ded46119151b..ba84f780aa3d 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -31,6 +31,8 @@ int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe,
>
>  int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited);
>
> +void rxe_cq_disable(struct rxe_cq *cq);
> +
>  void rxe_cq_cleanup(struct rxe_pool_elem *elem);
>
>  /* rxe_mcast.c */
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index de6238ee4379..a964d86789f6 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -1205,6 +1205,8 @@ static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
>                 goto err_out;
>         }
>
> +       rxe_cq_disable(cq);
> +
>         err = rxe_cleanup(cq);
>         if (err)
>                 rxe_err_cq(cq, "cleanup failed, err = %d\n", err);
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index 3c1354f82283..03df97e83570 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -63,7 +63,9 @@ struct rxe_cq {
>         struct rxe_queue        *queue;
>         spinlock_t              cq_lock;
>         u8                      notify;
> +       bool                    is_dying;
>         bool                    is_user;
> +       struct tasklet_struct   comp_task;
>         atomic_t                num_wq;
>  };
>
> --
> 2.45.2
>
>
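From the quoted hunk itself, my reading of the deadlock (an inference on my
part, please confirm and put it in the changelog) is a plain self-deadlock on
cq->cq_lock: rxe_cq_post() currently calls the completion handler before
dropping the lock, and ib_req_notify_cq() maps to rxe_req_notify_cq(), which
takes the same lock on the same CPU. Condensed and simplified for
illustration, not a verbatim excerpt of the driver:

/* Receive path (softirq), holding cq_lock across the callback: */
int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
{
        unsigned long flags;

        spin_lock_irqsave(&cq->cq_lock, flags);
        /* ... post the CQE and advance the producer index ... */
        if ((cq->notify & IB_CQ_NEXT_COMP) ||
            (cq->notify & IB_CQ_SOLICITED && solicited)) {
                cq->notify = 0;
                /* handler runs with cq_lock still held */
                cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
        }
        spin_unlock_irqrestore(&cq->cq_lock, flags);
        return 0;
}

/* Re-entered from the handler via ib_req_notify_cq(): */
static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
{
        struct rxe_cq *cq = to_rcq(ibcq);
        unsigned long irq_flags;

        /* spins forever: this CPU already holds cq_lock in rxe_cq_post() */
        spin_lock_irqsave(&cq->cq_lock, irq_flags);
        cq->notify |= flags & IB_CQ_SOLICITED_MASK;
        spin_unlock_irqrestore(&cq->cq_lock, irq_flags);
        return 0;
}

Deferring the comp_handler call to the tasklet, as this patch does, moves the
callback out of the locked region and out of the inline packet-receive path,
so a handler that re-arms notification no longer recurses on cq_lock. That
reasoning, or whatever the real one is, belongs in the commit message.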




