To avoid leaving resources unreleased when a ULP aborts abnormally, the
hardware adds the capability of restoring the resources when the module
is removed. This patch enables that capability.

Signed-off-by: Lang Cheng <chenglang@xxxxxxxxxx>
Signed-off-by: Lijun Ou <oulijun@xxxxxxxxxx>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |   1 +
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 191 ++++++++++++++++++++++++++++
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  27 ++++-
 3 files changed, 218 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 563cf39..2e35469 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -990,6 +990,7 @@ struct hns_roce_dev {
 	void *priv;
 	struct workqueue_struct *irq_workq;
 	const struct hns_roce_dfx_hw *dfx;
+	u32 func_num;
 };
 
 static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index f155d2d..efaf4ee 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1129,6 +1129,192 @@ static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev)
 	return 0;
 }
 
+/*
+ * Report whether a reset affects this device: the reset counter differs from
+ * hr_dev->reset_cnt, or a hardware or software reset is in progress.
+ */
+static bool hns_roce_func_clr_chk_rst(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hnae3_handle *handle = priv->handle;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+	unsigned long reset_cnt;
+	bool sw_resetting;
+	bool hw_resetting;
+
+	reset_cnt = ops->ae_dev_reset_cnt(handle);
+	hw_resetting = ops->get_hw_reset_stat(handle);
+	sw_resetting = ops->ae_dev_resetting(handle);
+
+	if (reset_cnt != hr_dev->reset_cnt || hw_resetting || sw_resetting)
+		return true;
+
+	return false;
+}
+
+/*
+ * Handle a function clear that did not complete normally.
+ *
+ * In the reset branches doorbells are disabled (dis_db) and is_reset is set
+ * once the reset is known to have finished, waiting up to
+ * HNS_ROCE_V2_HW_RST_TIMEOUT ms for it. Otherwise the failure is only logged.
+ *
+ * @retval: CMQ return code from the caller, logged when non-zero and !@flag
+ * @flag: non-zero when the function clear write command already failed
+ */
+static void hns_roce_func_clr_rst_prc(struct hns_roce_dev *hr_dev, int retval,
+				      int flag)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hnae3_handle *handle = priv->handle;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+	unsigned long instance_stage;
+	unsigned long reset_cnt;
+	unsigned long end;
+	bool sw_resetting;
+	bool hw_resetting;
+
+	instance_stage = handle->rinfo.instance_state;
+	reset_cnt = ops->ae_dev_reset_cnt(handle);
+	hw_resetting = ops->get_hw_reset_stat(handle);
+	sw_resetting = ops->ae_dev_resetting(handle);
+
+	if (reset_cnt != hr_dev->reset_cnt) {
+		hr_dev->dis_db = true;
+		hr_dev->is_reset = true;
+		dev_info(hr_dev->dev, "Func clear success after reset.\n");
+	} else if (hw_resetting) {
+		hr_dev->dis_db = true;
+
+		dev_warn(hr_dev->dev,
+			 "Func clear is pending, device in resetting state.\n");
+		end = msecs_to_jiffies(HNS_ROCE_V2_HW_RST_TIMEOUT) + jiffies;
+		while (time_before(jiffies, end)) {
+			if (!ops->get_hw_reset_stat(handle)) {
+				hr_dev->is_reset = true;
+				dev_info(hr_dev->dev,
+					 "Func clear success after reset.\n");
+				return;
+			}
+			msleep(HNS_ROCE_V2_HW_RST_COMPLETION_WAIT);
+		}
+
+		dev_warn(hr_dev->dev, "Func clear failed.\n");
+	} else if (sw_resetting && instance_stage == HNS_ROCE_STATE_INIT) {
+		hr_dev->dis_db = true;
+
+		dev_warn(hr_dev->dev,
+			 "Func clear is pending, device in resetting state.\n");
+		end = msecs_to_jiffies(HNS_ROCE_V2_HW_RST_TIMEOUT) + jiffies;
+		while (time_before(jiffies, end)) {
+			if (ops->ae_dev_reset_cnt(handle) !=
+			    hr_dev->reset_cnt) {
+				hr_dev->is_reset = true;
+				dev_info(hr_dev->dev,
+					 "Func clear success after sw reset\n");
+				return;
+			}
+			msleep(HNS_ROCE_V2_HW_RST_COMPLETION_WAIT);
+		}
+
+		dev_warn(hr_dev->dev, "Func clear failed because of unfinished sw reset\n");
+	} else {
+		if (retval && !flag)
+			dev_warn(hr_dev->dev,
+				 "Func clear read failed, ret = %d.\n", retval);
+
+		dev_warn(hr_dev->dev, "Func clear failed.\n");
+	}
+}
+
+/*
+ * Query HNS_ROCE_OPC_QUERY_VF_NUM and cache pf_own_func_num in
+ * hr_dev->func_num. On command failure func_num is left untouched.
+ */
+static void hns_roce_query_func_num(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_pf_func_num *resp;
+	struct hns_roce_cmq_desc desc;
+	int ret;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_VF_NUM, true);
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret) {
+		dev_err(hr_dev->dev, "Query vf count fail, ret = %d.\n",
+			ret);
+		return;
+	}
+
+	resp = (struct hns_roce_pf_func_num *)desc.data;
+	hr_dev->func_num = le32_to_cpu(resp->pf_own_func_num);
+}
+
+/*
+ * Issue a function clear for @vf_id and poll until the hardware reports it
+ * done, a reset is detected, or HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS elapses.
+ * Anything other than "done" is handed to hns_roce_func_clr_rst_prc().
+ */
+static void hns_roce_clear_func(struct hns_roce_dev *hr_dev, int vf_id)
+{
+	bool fclr_write_fail_flag = false;
+	struct hns_roce_func_clear *resp;
+	struct hns_roce_cmq_desc desc;
+	unsigned long end;
+	int ret = 0;
+
+	if (hns_roce_func_clr_chk_rst(hr_dev))
+		goto out;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR, false);
+	resp = (struct hns_roce_func_clear *)desc.data;
+	resp->rst_funcid_en = cpu_to_le32(vf_id);
+
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret) {
+		fclr_write_fail_flag = true;
+		dev_err(hr_dev->dev, "Func clear write failed, ret = %d.\n",
+			ret);
+		goto out;
+	}
+
+	end = msecs_to_jiffies(HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS) + jiffies;
+
+	msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL);
+	while (time_before(jiffies, end)) {
+		if (hns_roce_func_clr_chk_rst(hr_dev))
+			goto out;
+
+		hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR,
+					      true);
+		resp->rst_funcid_en = cpu_to_le32(vf_id);
+
+		ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+		if (ret) {
+			msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT);
+			continue;
+		}
+
+		if (roce_get_bit(resp->func_done, FUNC_CLEAR_RST_FUN_DONE_S)) {
+			if (vf_id == 0)
+				hr_dev->is_reset = true;
+			return;
+		}
+
+		/* Not done yet: wait before polling again, don't spin. */
+		msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL);
+	}
+
+out:
+	dev_err(hr_dev->dev, "Func clear read vf_id %d fail.\n", vf_id);
+	hns_roce_func_clr_rst_prc(hr_dev, ret, fclr_write_fail_flag);
+}
+
+/* Run function clear with vf_id 0. */
+static void hns_roce_function_clear(struct hns_roce_dev *hr_dev)
+{
+	hns_roce_clear_func(hr_dev, 0);
+}
+
 static int hns_roce_query_fw_ver(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_query_fw_info *resp;
@@ -1479,6 +1665,8 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 		return ret;
 	}
 
+	hns_roce_query_func_num(hr_dev);
+
 	if (hr_dev->pci_dev->revision == 0x21) {
 		ret = hns_roce_query_pf_timer_resource(hr_dev);
 		if (ret) {
@@ -1890,6 +2078,9 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_v2_priv *priv = hr_dev->priv;
 
+	if (hr_dev->pci_dev->revision == 0x21)
+		hns_roce_function_clear(hr_dev);
+
 	hns_roce_free_link_table(hr_dev, &priv->tpq);
 	hns_roce_free_link_table(hr_dev, &priv->tsq);
 }
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index edfdbe2..3f2c85f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -96,7 +96,10 @@
 #define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE 2
 #define HNS_ROCE_V2_RSV_QPS 8
 
-#define HNS_ROCE_V2_HW_RST_TIMEOUT 1000
+/* Time out for hardware to complete reset */
+#define HNS_ROCE_V2_HW_RST_TIMEOUT 1000
+
+#define HNS_ROCE_V2_HW_RST_COMPLETION_WAIT 20
 
 #define HNS_ROCE_CONTEXT_HOP_NUM 1
 #define HNS_ROCE_SCCC_HOP_NUM 1
@@ -236,11 +239,13 @@ enum hns_roce_opcode_type {
 	HNS_ROCE_OPC_CFG_EXT_LLM = 0x8403,
 	HNS_ROCE_OPC_CFG_TMOUT_LLM = 0x8404,
 	HNS_ROCE_OPC_QUERY_PF_TIMER_RES = 0x8406,
+	HNS_ROCE_OPC_QUERY_VF_NUM = 0x8407,
 	HNS_ROCE_OPC_CFG_SGID_TB = 0x8500,
 	HNS_ROCE_OPC_CFG_SMAC_TB = 0x8501,
 	HNS_ROCE_OPC_POST_MB = 0x8504,
 	HNS_ROCE_OPC_QUERY_MB_ST = 0x8505,
 	HNS_ROCE_OPC_CFG_BT_ATTR = 0x8506,
+	HNS_ROCE_OPC_FUNC_CLEAR = 0x8508,
 	HNS_ROCE_OPC_CLR_SCCC = 0x8509,
 	HNS_ROCE_OPC_QUERY_SCCC = 0x850a,
 	HNS_ROCE_OPC_RESET_SCCC = 0x850b,
@@ -1226,6 +1231,26 @@ struct hns_roce_query_fw_info {
 	__le32 rsv[5];
 };
 
+struct hns_roce_func_clear {
+	__le32 rst_funcid_en;
+	__le32 func_done;
+	__le32 rsv[4];
+};
+
+struct hns_roce_pf_func_num {
+	__le32 pf_own_func_num;
+	__le32 func_done;
+	__le32 rsv[4];
+};
+
+#define FUNC_CLEAR_RST_FUN_EN_S 8
+
+#define FUNC_CLEAR_RST_FUN_DONE_S 0
+
+#define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS (512 * 100)
+#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL 40
+#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT 20
+
 struct hns_roce_cfg_llm_a {
 	__le32 base_addr_l;
 	__le32 base_addr_h;
-- 
1.9.1