For the case one process modify qp to error while the other process is posting send or recv, the race of flush cqe will happen. To solve this problem, the lock should be hold between post send, post recv with modify qp verbs separately. Further more, this patch uses workqueue to do flush cqe process for hip08 as under some cases post send or post recv may called under softirq context, and it will lead to following calltrace with current driver. [ 5343.812237] Call trace: [ 5343.815448] [<ffff00000808ab38>] dump_backtrace+0x0/0x280 [ 5343.821115] [<ffff00000808addc>] show_stack+0x24/0x30 [ 5343.826605] [<ffff000008d84cb4>] dump_stack+0x98/0xb8 [ 5343.831966] [<ffff0000080fda44>] __schedule_bug+0x64/0x80 [ 5343.837605] [<ffff000008d9b1ec>] __schedule+0x6bc/0x7fc [ 5343.843010] [<ffff000008d9b360>] schedule+0x34/0x8c [ 5343.848133] [<ffff000008d9ee80>] schedule_timeout+0x1d8/0x3cc [ 5343.854087] [<ffff000008d9d72c>] __down+0x84/0xdc [ 5343.859114] [<ffff000008124250>] down+0x54/0x6c [ 5343.866446] [<ffff000001025bd4>] hns_roce_cmd_mbox+0x68/0x2cc [hns_roce] [ 5343.874439] [<ffff000001063f70>] hns_roce_v2_modify_qp+0x4f4/0x1024 [hns_roce_pci] [ 5343.882594] [<ffff00000106570c>] hns_roce_v2_post_recv+0x2a4/0x330 [hns_roce_pci] [ 5343.890872] [<ffff0000010aa138>] nvme_rdma_post_recv+0x88/0xf8 [nvme_rdma] [ 5343.898156] [<ffff0000010ab3a8>] __nvme_rdma_recv_done.isra.40+0x110/0x1f0 [nvme_rdma] [ 5343.906453] [<ffff0000010ab4b4>] nvme_rdma_recv_done+0x2c/0x38 [nvme_rdma] [ 5343.918428] [<ffff000000e34e04>] __ib_process_cq+0x7c/0xf0 [ib_core] [ 5343.927135] [<ffff000000e34fb8>] ib_poll_handler+0x30/0x90 [ib_core] [ 5343.933900] [<ffff00000859db94>] irq_poll_softirq+0xf8/0x150 [ 5343.939825] [<ffff0000080818d0>] __do_softirq+0x140/0x2ec [ 5343.945573] [<ffff0000080d6f10>] run_ksoftirqd+0x48/0x5c [ 5343.951258] [<ffff0000080f9064>] smpboot_thread_fn+0x190/0x1d4 [ 5343.957311] [<ffff0000080f441c>] kthread+0x10c/0x138 [ 5343.962518] [<ffff0000080855dc>] ret_from_fork+0x10/0x18 Fixes: 0425e3e6e0c7 ("RDMA/hns: Support flush cqe for hip08 in kernel space") Signed-off-by: Yixian Liu <liuyixian@xxxxxxxxxx> Signed-off-by: Lijun Ou <oulijun@xxxxxxxxxx> --- drivers/infiniband/hw/hns/hns_roce_device.h | 14 +++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 144 ++++++++++++---------------- drivers/infiniband/hw/hns/hns_roce_main.c | 12 +++ drivers/infiniband/hw/hns/hns_roce_qp.c | 44 +++++++++ 4 files changed, 130 insertions(+), 84 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 9ee86da..530e7e4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -82,6 +82,8 @@ #define HNS_ROCE_MAX_GID_NUM 16 #define HNS_ROCE_GID_SIZE 16 +#define HNS_ROCE_WORKQ_NAME_LEN 32 + #define HNS_ROCE_HOP_NUM_0 0xff #define BITMAP_NO_RR 0 @@ -629,6 +631,8 @@ struct hns_roce_qp { u8 sl; u8 resp_depth; u8 state; + u8 next_state; /* record for flush cqe */ + int attr_mask; /* record for flush cqe */ u32 access_flags; u32 atomic_rd_en; u32 pkey_index; @@ -869,6 +873,12 @@ struct hns_roce_work { int sub_type; }; +struct hns_roce_flush_work { + struct hns_roce_dev *hr_dev; + struct work_struct work; + struct hns_roce_qp *hr_qp; +}; + struct hns_roce_hw { int (*reset)(struct hns_roce_dev *hr_dev, bool enable); int (*cmq_init)(struct hns_roce_dev *hr_dev); @@ -921,6 +931,8 @@ struct hns_roce_hw { int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*init_eq)(struct hns_roce_dev *hr_dev); void (*cleanup_eq)(struct hns_roce_dev *hr_dev); + int (*create_workq)(struct hns_roce_dev *hr_dev); + void (*destroy_workq)(struct hns_roce_dev *hr_dev); void (*write_srqc)(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, u32 pdn, u16 xrcd, u32 cqn, void *mb_buf, u64 *mtts_wqe, u64 *mtts_idx, @@ -985,6 +997,7 @@ struct hns_roce_dev { const struct hns_roce_hw *hw; void *priv; struct workqueue_struct *irq_workq; + struct workqueue_struct *flush_workq; }; static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev) @@ -1160,6 +1173,7 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *ib_pd, struct ib_udata *udata); int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); +void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); void *get_recv_wqe(struct hns_roce_qp *hr_qp, int n); void *get_send_wqe(struct hns_roce_qp *hr_qp, int n); void *get_send_extend_sge(struct hns_roce_qp *hr_qp, int n); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 1c54390..e49c7f1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -219,11 +219,6 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, return 0; } -static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, - const struct ib_qp_attr *attr, - int attr_mask, enum ib_qp_state cur_state, - enum ib_qp_state new_state); - static int hns_roce_v2_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr) @@ -236,14 +231,12 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct hns_roce_wqe_frmr_seg *fseg; struct device *dev = hr_dev->dev; struct hns_roce_v2_db sq_db; - struct ib_qp_attr attr; unsigned int sge_ind = 0; unsigned int owner_bit; unsigned long flags; unsigned int ind; void *wqe = NULL; bool loopback; - int attr_mask; u32 tmp_len; int ret = 0; u32 hr_op; @@ -592,18 +585,8 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, qp->sq_next_wqe = ind; qp->next_sge = sge_ind; - if (qp->state == IB_QPS_ERR) { - attr_mask = IB_QP_STATE; - attr.qp_state = IB_QPS_ERR; - - ret = hns_roce_v2_modify_qp(&qp->ibqp, &attr, attr_mask, - qp->state, IB_QPS_ERR); - if (ret) { - spin_unlock_irqrestore(&qp->sq.lock, flags); - *bad_wr = wr; - return ret; - } - } + if (qp->state == IB_QPS_ERR) + init_flush_work(hr_dev, qp); } spin_unlock_irqrestore(&qp->sq.lock, flags); @@ -620,10 +603,8 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct hns_roce_v2_wqe_data_seg *dseg; struct hns_roce_rinl_sge *sge_list; struct device *dev = hr_dev->dev; - struct ib_qp_attr attr; unsigned long flags; void *wqe = NULL; - int attr_mask; int ret = 0; int nreq; int ind; @@ -693,19 +674,8 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, *hr_qp->rdb.db_record = hr_qp->rq.head & 0xffff; - if (hr_qp->state == IB_QPS_ERR) { - attr_mask = IB_QP_STATE; - attr.qp_state = IB_QPS_ERR; - - ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, &attr, - attr_mask, hr_qp->state, - IB_QPS_ERR); - if (ret) { - spin_unlock_irqrestore(&hr_qp->rq.lock, flags); - *bad_wr = wr; - return ret; - } - } + if (hr_qp->state == IB_QPS_ERR) + init_flush_work(hr_dev, hr_qp); } spin_unlock_irqrestore(&hr_qp->rq.lock, flags); @@ -1929,6 +1899,11 @@ static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, u64 in_param, { struct hns_roce_cmq_desc desc; struct hns_roce_post_mbox *mb = (struct hns_roce_post_mbox *)desc.data; + struct hns_roce_qp *qp; + unsigned long sq_flags; + unsigned long rq_flags; + int to_be_err_state = false; + int ret; hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_POST_MB, false); @@ -1939,7 +1914,23 @@ static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, u64 in_param, mb->cmd_tag = cpu_to_le32(in_modifier << 8 | op); mb->token_event_en = cpu_to_le32(event << 16 | token); - return hns_roce_cmq_send(hr_dev, &desc, 1); + qp = __hns_roce_qp_lookup(hr_dev, in_modifier); + + if (qp && !qp->ibqp.uobject && + (qp->attr_mask & IB_QP_STATE) && qp->next_state == IB_QPS_ERR) { + spin_lock_irqsave(&qp->sq.lock, sq_flags); + spin_lock_irqsave(&qp->rq.lock, rq_flags); + to_be_err_state = true; + } + + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + + if (to_be_err_state) { + spin_unlock_irqrestore(&qp->rq.lock, rq_flags); + spin_unlock_irqrestore(&qp->sq.lock, sq_flags); + } + + return ret; } static int hns_roce_v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, @@ -2565,13 +2556,11 @@ static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe, static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, struct hns_roce_qp **cur_qp, struct ib_wc *wc) { + struct hns_roce_dev *hr_dev = to_hr_dev(hr_cq->ib_cq.device); struct hns_roce_srq *srq = NULL; - struct hns_roce_dev *hr_dev; struct hns_roce_v2_cqe *cqe; struct hns_roce_qp *hr_qp; struct hns_roce_wq *wq; - struct ib_qp_attr attr; - int attr_mask; int is_send; u16 wqe_ctr; u32 opcode; @@ -2595,7 +2584,6 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, V2_CQE_BYTE_16_LCL_QPN_S); if (!*cur_qp || (qpn & HNS_ROCE_V2_CQE_QPN_MASK) != (*cur_qp)->qpn) { - hr_dev = to_hr_dev(hr_cq->ib_cq.device); hr_qp = __hns_roce_qp_lookup(hr_dev, qpn); if (unlikely(!hr_qp)) { dev_err(hr_dev->dev, "CQ %06lx with entry for unknown QPN %06x\n", @@ -2690,13 +2678,12 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, } /* flush cqe if wc status is error, excluding flush error */ - if ((wc->status != IB_WC_SUCCESS) && - (wc->status != IB_WC_WR_FLUSH_ERR)) { - attr_mask = IB_QP_STATE; - attr.qp_state = IB_QPS_ERR; - return hns_roce_v2_modify_qp(&(*cur_qp)->ibqp, - &attr, attr_mask, - (*cur_qp)->state, IB_QPS_ERR); + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) { + dev_err(hr_dev->dev, "error cqe status is: 0x%x\n", + status & HNS_ROCE_V2_CQE_STATUS_MASK); + init_flush_work(hr_dev, *cur_qp); + return 0; } if (wc->status == IB_WC_WR_FLUSH_ERR) @@ -4606,39 +4593,6 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) return ret; } -static void hns_roce_set_qps_to_err(struct hns_roce_dev *hr_dev, u32 qpn) -{ - struct hns_roce_qp *hr_qp; - struct ib_qp_attr attr; - int attr_mask; - int ret; - - hr_qp = __hns_roce_qp_lookup(hr_dev, qpn); - if (!hr_qp) { - dev_warn(hr_dev->dev, "no hr_qp can be found!\n"); - return; - } - - if (hr_qp->ibqp.uobject) { - if (hr_qp->sdb_en == 1) { - hr_qp->sq.head = *(int *)(hr_qp->sdb.virt_addr); - if (hr_qp->rdb_en == 1) - hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr); - } else { - dev_warn(hr_dev->dev, "flush cqe is unsupported in userspace!\n"); - return; - } - } - - attr_mask = IB_QP_STATE; - attr.qp_state = IB_QPS_ERR; - ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, &attr, attr_mask, - hr_qp->state, IB_QPS_ERR); - if (ret) - dev_err(hr_dev->dev, "failed to modify qp %d to err state.\n", - qpn); -} - static void hns_roce_irq_work_handle(struct work_struct *work) { struct hns_roce_work *irq_work = @@ -4661,19 +4615,16 @@ static void hns_roce_irq_work_handle(struct work_struct *work) dev_warn(dev, "Send queue drained.\n"); break; case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: - dev_err(dev, "Local work queue 0x%x catas error, sub_type:%d\n", + dev_err(dev, "Local work queue 0x%x catast error, sub_event type is: %d\n", qpn, irq_work->sub_type); - hns_roce_set_qps_to_err(irq_work->hr_dev, qpn); break; case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: dev_err(dev, "Invalid request local work queue 0x%x error.\n", qpn); - hns_roce_set_qps_to_err(irq_work->hr_dev, qpn); break; case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: - dev_err(dev, "Local access violation work queue 0x%x error, sub_type:%d\n", + dev_err(dev, "Local access violation work queue 0x%x error, sub_event type is: %d\n", qpn, irq_work->sub_type); - hns_roce_set_qps_to_err(irq_work->hr_dev, qpn); break; case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: dev_warn(dev, "SRQ limit reach.\n"); @@ -5765,6 +5716,29 @@ static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev) destroy_workqueue(hr_dev->irq_workq); } +static int hns_roce_v2_create_workq(struct hns_roce_dev *hr_dev) +{ + char workq_name[HNS_ROCE_WORKQ_NAME_LEN]; + struct device *dev = hr_dev->dev; + + snprintf(workq_name, HNS_ROCE_WORKQ_NAME_LEN - 1, "%s_flush_wq", + hr_dev->ib_dev.name); + + hr_dev->flush_workq = create_singlethread_workqueue(workq_name); + if (!hr_dev->flush_workq) { + dev_err(dev, "Failed to create flush workqueue!\n"); + return -ENOMEM; + } + + return 0; +} + +static void hns_roce_v2_destroy_workq(struct hns_roce_dev *hr_dev) +{ + flush_workqueue(hr_dev->flush_workq); + destroy_workqueue(hr_dev->flush_workq); +} + static void hns_roce_v2_write_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, u32 pdn, u16 xrcd, u32 cqn, void *mb_buf, u64 *mtts_wqe, @@ -6089,6 +6063,8 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, .poll_cq = hns_roce_v2_poll_cq, .init_eq = hns_roce_v2_init_eq_table, .cleanup_eq = hns_roce_v2_cleanup_eq_table, + .create_workq = hns_roce_v2_create_workq, + .destroy_workq = hns_roce_v2_destroy_workq, .write_srqc = hns_roce_v2_write_srqc, .modify_srq = hns_roce_v2_modify_srq, .query_srq = hns_roce_v2_query_srq, diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index c929125..563c8b64 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -951,8 +951,17 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) if (ret) goto error_failed_register_device; + if (hr_dev->hw->create_workq) { + ret = hr_dev->hw->create_workq(hr_dev); + if (ret) + goto error_failed_create_workq; + } + return 0; +error_failed_create_workq: + hns_roce_unregister_device(hr_dev); + error_failed_register_device: if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev); @@ -991,6 +1000,9 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev) { hns_roce_unregister_device(hr_dev); + if (hr_dev->hw->destroy_workq) + hr_dev->hw->destroy_workq(hr_dev); + if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev); hns_roce_cleanup_bitmap(hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index b719c38..7a86ff5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -43,6 +43,41 @@ #define SQP_NUM (2 * HNS_ROCE_MAX_PORTS) +static void flush_work_handle(struct work_struct *work) +{ + struct hns_roce_flush_work *flush_work = container_of(work, + struct hns_roce_flush_work, work); + struct hns_roce_qp *hr_qp = flush_work->hr_qp; + struct device *dev = flush_work->hr_dev->dev; + struct ib_qp_attr attr; + int attr_mask; + int ret; + + attr_mask = IB_QP_STATE; + attr.qp_state = IB_QPS_ERR; + + ret = hns_roce_modify_qp(&hr_qp->ibqp, &attr, attr_mask, NULL); + if (ret) + dev_err(dev, "Modify qp to err for flush cqe fail(%d)\n", ret); + + kfree(flush_work); +} + +void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) +{ + struct hns_roce_flush_work *flush_work; + + flush_work = kzalloc(sizeof(struct hns_roce_flush_work), GFP_ATOMIC); + if (!flush_work) + return; + + flush_work->hr_dev = hr_dev; + flush_work->hr_qp = hr_qp; + INIT_WORK(&flush_work->work, flush_work_handle); + queue_work(hr_dev->flush_workq, &flush_work->work); +} +EXPORT_SYMBOL_GPL(init_flush_work); + void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type) { struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; @@ -62,6 +97,11 @@ void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type) return; } + if (event_type == HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR || + event_type == HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR || + event_type == HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR) + init_flush_work(hr_dev, qp); + qp->event(qp, (enum hns_roce_event)event_type); if (atomic_dec_and_test(&qp->refcount)) @@ -571,6 +611,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, spin_lock_init(&hr_qp->rq.lock); hr_qp->state = IB_QPS_RESET; + hr_qp->next_state = IB_QPS_RESET; hr_qp->ibqp.qp_type = init_attr->qp_type; @@ -983,6 +1024,9 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + hr_qp->next_state = new_state; + hr_qp->attr_mask = attr_mask; + if (ibqp->uobject && (attr_mask & IB_QP_STATE) && new_state == IB_QPS_ERR) { if (hr_qp->sdb_en == 1) { -- 1.9.1