[PATCH] drm/amdgpu: Use correct aca handle to validate aca bank

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



The aca handle is passed in by the upper-level caller, so it is
inappropriate to iterate over all aca handles to match and validate an
aca bank, as doing so causes unexpected ras error reports.

Signed-off-by: Xiang Liu <xiang.liu@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 122 ++++++++++--------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h |   2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c |  10 +-
 3 files changed, 58 insertions(+), 76 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index ffd4c64e123c..b07e101c545d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -122,6 +122,25 @@ static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, st
 			      idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
 }
 
+static bool aca_bank_should_dump(struct amdgpu_device *adev, enum aca_smu_type type)
+{
+	struct amdgpu_aca *aca = &adev->aca;
+	bool ret = true;
+
+	/*
+	 * Because the UE Valid MCA count will only be cleared after reset,
+	 * the aca bank is only dumped once during the gpu recovery stage.
+	 */
+	if (type == ACA_SMU_TYPE_UE) {
+		if (amdgpu_ras_intr_triggered())
+			ret = atomic_cmpxchg(&aca->ue_dump_flag, 0, 1) == 0;
+		else
+			atomic_set(&aca->ue_dump_flag, 0);
+	}
+
+	return ret;
+}
+
 static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
 				       int start, int count,
 				       struct aca_banks *banks, struct ras_query_context *qctx)
@@ -130,6 +149,7 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
 	const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
 	struct aca_bank bank;
 	int i, max_count, ret;
+	struct aca_bank_node *node;
 
 	if (!count)
 		return 0;
@@ -159,14 +179,16 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
 			return ret;
 
 		bank.smu_err_type = type;
-
-		aca_smu_bank_dump(adev, i, count, &bank, qctx);
-
 		ret = aca_banks_add_bank(banks, &bank);
 		if (ret)
 			return ret;
 	}
 
+	i = 0;
+	if (aca_bank_should_dump(adev, type))
+		list_for_each_entry(node, &banks->list, node)
+			aca_smu_bank_dump(adev, i++, count, &node->bank, qctx);
+
 	return 0;
 }
 
@@ -318,72 +340,29 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank
 	return 0;
 }
 
-static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank,
-			     enum aca_smu_type type, bank_handler_t handler, void *data)
-{
-	struct aca_handle *handle;
-	int ret;
-
-	if (list_empty(&mgr->list))
-		return 0;
-
-	list_for_each_entry(handle, &mgr->list, node) {
-		if (!aca_bank_is_valid(handle, bank, type))
-			continue;
-
-		ret = handler(handle, bank, type, data);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks,
+static int aca_dispatch_banks(struct aca_handle *handle, struct aca_banks *banks,
 			      enum aca_smu_type type, bank_handler_t handler, void *data)
 {
 	struct aca_bank_node *node;
 	struct aca_bank *bank;
-	int ret;
 
-	if (!mgr || !banks)
+	if (!handle || !banks)
 		return -EINVAL;
 
 	/* pre check to avoid unnecessary operations */
-	if (list_empty(&mgr->list) || list_empty(&banks->list))
+	if (list_empty(&banks->list))
 		return 0;
 
 	list_for_each_entry(node, &banks->list, node) {
 		bank = &node->bank;
 
-		ret = aca_dispatch_bank(mgr, bank, type, handler, data);
-		if (ret)
-			return ret;
+		if (aca_bank_is_valid(handle, bank, type))
+			handler(handle, bank, type, data);
 	}
 
 	return 0;
 }
 
-static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type)
-{
-	struct amdgpu_aca *aca = &adev->aca;
-	bool ret = true;
-
-	/*
-	 * Because the UE Valid MCA count will only be cleared after reset,
-	 * in order to avoid repeated counting of the error count,
-	 * the aca bank is only updated once during the gpu recovery stage.
-	 */
-	if (type == ACA_SMU_TYPE_UE) {
-		if (amdgpu_ras_intr_triggered())
-			ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0;
-		else
-			atomic_set(&aca->ue_update_flag, 0);
-	}
-
-	return ret;
-}
-
 static void aca_banks_generate_cper(struct amdgpu_device *adev,
 				    enum aca_smu_type type,
 				    struct aca_banks *banks,
@@ -417,20 +396,14 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,
 	}
 }
 
-static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
-			    bank_handler_t handler, struct ras_query_context *qctx, void *data)
+static int aca_banks_update(struct amdgpu_device *adev, struct aca_handle *handle,
+			    enum aca_smu_type type, bank_handler_t handler,
+			    struct ras_query_context *qctx, void *data)
 {
-	struct amdgpu_aca *aca = &adev->aca;
 	struct aca_banks banks;
 	u32 count = 0;
 	int ret;
 
-	if (list_empty(&aca->mgr.list))
-		return 0;
-
-	if (!aca_bank_should_update(adev, type))
-		return 0;
-
 	ret = aca_smu_get_valid_aca_count(adev, type, &count);
 	if (ret)
 		return ret;
@@ -442,15 +415,12 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
 
 	ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx);
 	if (ret)
-		goto err_release_banks;
+		return ret;
 
-	if (list_empty(&banks.list)) {
-		ret = 0;
-		goto err_release_banks;
-	}
+	if (list_empty(&banks.list))
+		return 0;
 
-	ret = aca_dispatch_banks(&aca->mgr, &banks, type,
-				 handler, data);
+	ret = aca_dispatch_banks(handle, &banks, type, handler, data);
 	if (ret)
 		goto err_release_banks;
 
@@ -537,7 +507,7 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h
 	}
 
 	/* update aca bank to aca source error_cache first */
-	ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL);
+	ret = aca_banks_update(adev, handle, smu_type, handler_aca_log_bank_error, qctx, NULL);
 	if (ret)
 		return ret;
 
@@ -730,7 +700,7 @@ int amdgpu_aca_init(struct amdgpu_device *adev)
 	struct amdgpu_aca *aca = &adev->aca;
 	int ret;
 
-	atomic_set(&aca->ue_update_flag, 0);
+	atomic_set(&aca->ue_dump_flag, 0);
 
 	ret = aca_manager_init(&aca->mgr);
 	if (ret)
@@ -745,14 +715,14 @@ void amdgpu_aca_fini(struct amdgpu_device *adev)
 
 	aca_manager_fini(&aca->mgr);
 
-	atomic_set(&aca->ue_update_flag, 0);
+	atomic_set(&aca->ue_dump_flag, 0);
 }
 
 int amdgpu_aca_reset(struct amdgpu_device *adev)
 {
 	struct amdgpu_aca *aca = &adev->aca;
 
-	atomic_set(&aca->ue_update_flag, 0);
+	atomic_set(&aca->ue_dump_flag, 0);
 
 	return 0;
 }
@@ -880,12 +850,20 @@ static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *ban
 static int aca_dump_show(struct seq_file *m, enum aca_smu_type type)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
+	struct aca_handle_manager *mgr = &adev->aca.mgr;
+	struct aca_handle *handle;
 	struct aca_dump_context context = {
 		.m = m,
 		.idx = 0,
 	};
 
-	return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context);
+	if (list_empty(&mgr->list))
+		return 0;
+
+	list_for_each_entry(handle, &mgr->list, node)
+		aca_banks_update(adev, handle, type, handler_aca_bank_dump, NULL, (void *)&context);
+
+	return 0;
 }
 
 static int aca_dump_ce_show(struct seq_file *m, void *unused)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
index 6f62e5d80ed6..e71d6f5afaec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
@@ -202,7 +202,7 @@ struct aca_smu_funcs {
 struct amdgpu_aca {
 	struct aca_handle_manager mgr;
 	const struct aca_smu_funcs *smu_funcs;
-	atomic_t ue_update_flag;
+	atomic_t ue_dump_flag;
 	bool is_enabled;
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index c0de682b7774..a4038e92c59e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -876,10 +876,14 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
 				      void *data)
 {
 	struct aca_bank_info info;
-	u64 misc0;
+	u64 misc0, status;
 	u32 instlo;
 	int ret;
 
+	status = bank->regs[ACA_REG_IDX_STATUS];
+	if (!ACA_REG__STATUS__VAL(status))
+		return 0;
+
 	ret = aca_bank_info_decode(bank, &info);
 	if (ret)
 		return ret;
@@ -894,8 +898,8 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
 	switch (type) {
 	case ACA_SMU_TYPE_UE:
 		bank->aca_err_type = ACA_ERROR_TYPE_UE;
-		ret = aca_error_cache_log_bank_error(handle, &info,
-						     ACA_ERROR_TYPE_UE, 1ULL);
+		if (ACA_REG__STATUS__UC(status) && ACA_REG__STATUS__PCC(status))
+			ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, 1);
 		break;
 	case ACA_SMU_TYPE_CE:
 		bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
-- 
2.34.1




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux