The aca handle is introduced by the upper caller, so it is inappropriate to poll the aca handles to match and validate an aca bank, which will cause unexpected ras error reports. Signed-off-by: Xiang Liu <xiang.liu@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 122 ++++++++++-------------- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 10 +- 3 files changed, 58 insertions(+), 76 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index ffd4c64e123c..b07e101c545d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -122,6 +122,25 @@ static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, st idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]); } +static bool aca_bank_should_dump(struct amdgpu_device *adev, enum aca_smu_type type) +{ + struct amdgpu_aca *aca = &adev->aca; + bool ret = true; + + /* + * Because the UE Valid MCA count will only be cleared after reset, + * the aca bank is only dumped once during the gpu recovery stage. 
+ */ + if (type == ACA_SMU_TYPE_UE) { + if (amdgpu_ras_intr_triggered()) + ret = atomic_cmpxchg(&aca->ue_dump_flag, 0, 1) == 0; + else + atomic_set(&aca->ue_dump_flag, 0); + } + + return ret; +} + static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type, int start, int count, struct aca_banks *banks, struct ras_query_context *qctx) @@ -130,6 +149,7 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_ const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; struct aca_bank bank; int i, max_count, ret; + struct aca_bank_node *node; if (!count) return 0; @@ -159,14 +179,16 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_ return ret; bank.smu_err_type = type; - - aca_smu_bank_dump(adev, i, count, &bank, qctx); - ret = aca_banks_add_bank(banks, &bank); if (ret) return ret; } + i = 0; + if (aca_bank_should_dump(adev, type)) + list_for_each_entry(node, &banks->list, node) + aca_smu_bank_dump(adev, i++, count, &bank, qctx); + return 0; } @@ -318,72 +340,29 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank return 0; } -static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank, - enum aca_smu_type type, bank_handler_t handler, void *data) -{ - struct aca_handle *handle; - int ret; - - if (list_empty(&mgr->list)) - return 0; - - list_for_each_entry(handle, &mgr->list, node) { - if (!aca_bank_is_valid(handle, bank, type)) - continue; - - ret = handler(handle, bank, type, data); - if (ret) - return ret; - } - - return 0; -} - -static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks, +static int aca_dispatch_banks(struct aca_handle *handle, struct aca_banks *banks, enum aca_smu_type type, bank_handler_t handler, void *data) { struct aca_bank_node *node; struct aca_bank *bank; - int ret; - if (!mgr || !banks) + if (!handle || !banks) return -EINVAL; /* pre check to avoid unnecessary operations */ 
- if (list_empty(&mgr->list) || list_empty(&banks->list)) + if (list_empty(&banks->list)) return 0; list_for_each_entry(node, &banks->list, node) { bank = &node->bank; - ret = aca_dispatch_bank(mgr, bank, type, handler, data); - if (ret) - return ret; + if (aca_bank_is_valid(handle, bank, type)) + handler(handle, bank, type, data); } return 0; } -static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type) -{ - struct amdgpu_aca *aca = &adev->aca; - bool ret = true; - - /* - * Because the UE Valid MCA count will only be cleared after reset, - * in order to avoid repeated counting of the error count, - * the aca bank is only updated once during the gpu recovery stage. - */ - if (type == ACA_SMU_TYPE_UE) { - if (amdgpu_ras_intr_triggered()) - ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0; - else - atomic_set(&aca->ue_update_flag, 0); - } - - return ret; -} - static void aca_banks_generate_cper(struct amdgpu_device *adev, enum aca_smu_type type, struct aca_banks *banks, @@ -417,20 +396,14 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev, } } -static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type, - bank_handler_t handler, struct ras_query_context *qctx, void *data) +static int aca_banks_update(struct amdgpu_device *adev, struct aca_handle *handle, + enum aca_smu_type type, bank_handler_t handler, + struct ras_query_context *qctx, void *data) { - struct amdgpu_aca *aca = &adev->aca; struct aca_banks banks; u32 count = 0; int ret; - if (list_empty(&aca->mgr.list)) - return 0; - - if (!aca_bank_should_update(adev, type)) - return 0; - ret = aca_smu_get_valid_aca_count(adev, type, &count); if (ret) return ret; @@ -442,15 +415,12 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type, ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx); if (ret) - goto err_release_banks; + return ret; - if (list_empty(&banks.list)) { - ret = 0; - goto err_release_banks; 
- } + if (list_empty(&banks.list)) + return 0; - ret = aca_dispatch_banks(&aca->mgr, &banks, type, - handler, data); + ret = aca_dispatch_banks(handle, &banks, type, handler, data); if (ret) goto err_release_banks; @@ -537,7 +507,7 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h } /* update aca bank to aca source error_cache first */ - ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL); + ret = aca_banks_update(adev, handle, smu_type, handler_aca_log_bank_error, qctx, NULL); if (ret) return ret; @@ -730,7 +700,7 @@ int amdgpu_aca_init(struct amdgpu_device *adev) struct amdgpu_aca *aca = &adev->aca; int ret; - atomic_set(&aca->ue_update_flag, 0); + atomic_set(&aca->ue_dump_flag, 0); ret = aca_manager_init(&aca->mgr); if (ret) @@ -745,14 +715,14 @@ void amdgpu_aca_fini(struct amdgpu_device *adev) aca_manager_fini(&aca->mgr); - atomic_set(&aca->ue_update_flag, 0); + atomic_set(&aca->ue_dump_flag, 0); } int amdgpu_aca_reset(struct amdgpu_device *adev) { struct amdgpu_aca *aca = &adev->aca; - atomic_set(&aca->ue_update_flag, 0); + atomic_set(&aca->ue_dump_flag, 0); return 0; } @@ -880,12 +850,20 @@ static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *ban static int aca_dump_show(struct seq_file *m, enum aca_smu_type type) { struct amdgpu_device *adev = (struct amdgpu_device *)m->private; + struct aca_handle_manager *mgr = &adev->aca.mgr; + struct aca_handle *handle; struct aca_dump_context context = { .m = m, .idx = 0, }; - return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context); + if (list_empty(&mgr->list)) + return 0; + + list_for_each_entry(handle, &mgr->list, node) + aca_banks_update(adev, handle, type, handler_aca_bank_dump, NULL, (void *)&context); + + return 0; } static int aca_dump_ce_show(struct seq_file *m, void *unused) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index 6f62e5d80ed6..e71d6f5afaec 
100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -202,7 +202,7 @@ struct aca_smu_funcs { struct amdgpu_aca { struct aca_handle_manager mgr; const struct aca_smu_funcs *smu_funcs; - atomic_t ue_update_flag; + atomic_t ue_dump_flag; bool is_enabled; }; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index c0de682b7774..a4038e92c59e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -876,10 +876,14 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle, void *data) { struct aca_bank_info info; - u64 misc0; + u64 misc0, status; u32 instlo; int ret; + status = bank->regs[ACA_REG_IDX_STATUS]; + if (!ACA_REG__STATUS__VAL(status)) + return 0; + ret = aca_bank_info_decode(bank, &info); if (ret) return ret; @@ -894,8 +898,8 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle, switch (type) { case ACA_SMU_TYPE_UE: bank->aca_err_type = ACA_ERROR_TYPE_UE; - ret = aca_error_cache_log_bank_error(handle, &info, - ACA_ERROR_TYPE_UE, 1ULL); + if (ACA_REG__STATUS__UC(status) && ACA_REG__STATUS__PCC(status)) + ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, 1); break; case ACA_SMU_TYPE_CE: bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank); -- 2.34.1