RE: [PATCH] drm/amdgpu: Use correct aca handle to validate aca bank

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[AMD Official Use Only - AMD Internal Distribution Only]

 
         list_for_each_entry(node, &banks->list, node) {
                 bank = &node->bank;
 
-               ret = aca_dispatch_bank(mgr, bank, type, handler, data);
-               if (ret)
-                       return ret;
+               if (aca_bank_is_valid(handle, bank, type))
+                       handler(handle, bank, type, data);
         }
 
         return 0;
}
 
The aca bank set returned by the SMU may contain banks of different ip types, which may result in incorrect statistics of aca bank information of some RAS ip blocks.
e.g:
The SMU returned 6 banks in total, including 3 umc aca banks, 2 xgmi banks, and 1 unsupported bank.
 
 
Best Regards,
Kevin
_____________________________________________
From: Zhang, Hawking <Hawking.Zhang@xxxxxxx>
Sent: Tuesday, March 18, 2025 17:19
To: Liu, Xiang(Dean) <Xiang.Liu@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>
Subject: RE: [PATCH] drm/amdgpu: Use correct aca handle to validate aca bank
 
 
[AMD Official Use Only - AMD Internal Distribution Only]

+ @Wang, Yang(Kevin)/@Zhou1, Tao/@Chai, Thomas for the review.
 
Regards,
Hawking
 
-----Original Message-----
From: Liu, Xiang(Dean) <Xiang.Liu@xxxxxxx>
Sent: Tuesday, March 18, 2025 17:15
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Liu, Xiang(Dean) <Xiang.Liu@xxxxxxx>
Subject: [PATCH] drm/amdgpu: Use correct aca handle to validate aca bank
 
The aca handle is introduced by upper caller, it's inappropriate to poll aca handle to match and validate aca bank, which will cause unexcepted ras error report.
 
Signed-off-by: Xiang Liu <xiang.liu@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 122 ++++++++++--------------
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h |   2 +-
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c |  10 +-
3 files changed, 58 insertions(+), 76 deletions(-)
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index ffd4c64e123c..b07e101c545d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -122,6 +122,25 @@ static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, st
                               idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);  }
 
+static bool aca_bank_should_dump(struct amdgpu_device *adev, enum
+aca_smu_type type) {
+       struct amdgpu_aca *aca = &adev->aca;
+       bool ret = true;
+
+       /*
+        * Because the UE Valid MCA count will only be cleared after reset,
+        * the aca bank is only dumped once during the gpu recovery stage.
+        */
+       if (type == ACA_SMU_TYPE_UE) {
+               if (amdgpu_ras_intr_triggered())
+                       ret = atomic_cmpxchg(&aca->ue_dump_flag, 0, 1) == 0;
+               else
+                       atomic_set(&aca->ue_dump_flag, 0);
+       }
+
+       return ret;
+}
+
static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
                                        int start, int count,
                                        struct aca_banks *banks, struct ras_query_context *qctx) @@ -130,6 +149,7 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
         const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
         struct aca_bank bank;
         int i, max_count, ret;
+       struct aca_bank_node *node;
 
         if (!count)
                 return 0;
@@ -159,14 +179,16 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
                         return ret;
 
                 bank.smu_err_type = type;
-
-               aca_smu_bank_dump(adev, i, count, &bank, qctx);
-
                 ret = aca_banks_add_bank(banks, &bank);
                 if (ret)
                         return ret;
         }
 
+       i = 0;
+       if (aca_bank_should_dump(adev, type))
+               list_for_each_entry(node, &banks->list, node)
+                       aca_smu_bank_dump(adev, i++, count, &bank, qctx);
+
         return 0;
}
 
@@ -318,72 +340,29 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank
         return 0;
}
 
-static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank,
-                            enum aca_smu_type type, bank_handler_t handler, void *data)
-{
-       struct aca_handle *handle;
-       int ret;
-
-       if (list_empty(&mgr->list))
-               return 0;
-
-       list_for_each_entry(handle, &mgr->list, node) {
-               if (!aca_bank_is_valid(handle, bank, type))
-                       continue;
-
-               ret = handler(handle, bank, type, data);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks,
+static int aca_dispatch_banks(struct aca_handle *handle, struct
+aca_banks *banks,
                               enum aca_smu_type type, bank_handler_t handler, void *data)  {
         struct aca_bank_node *node;
         struct aca_bank *bank;
-       int ret;
 
-       if (!mgr || !banks)
+       if (!handle || !banks)
                 return -EINVAL;
 
         /* pre check to avoid unnecessary operations */
-       if (list_empty(&mgr->list) || list_empty(&banks->list))
+       if (list_empty(&banks->list))
                 return 0;
 
         list_for_each_entry(node, &banks->list, node) {
                 bank = &node->bank;
 
-               ret = aca_dispatch_bank(mgr, bank, type, handler, data);
-               if (ret)
-                       return ret;
+               if (aca_bank_is_valid(handle, bank, type))
+                       handler(handle, bank, type, data);
         }
 
         return 0;
}
 
-static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type) -{
-       struct amdgpu_aca *aca = &adev->aca;
-       bool ret = true;
-
-       /*
-        * Because the UE Valid MCA count will only be cleared after reset,
-        * in order to avoid repeated counting of the error count,
-        * the aca bank is only updated once during the gpu recovery stage.
-        */
-       if (type == ACA_SMU_TYPE_UE) {
-               if (amdgpu_ras_intr_triggered())
-                       ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0;
-               else
-                       atomic_set(&aca->ue_update_flag, 0);
-       }
-
-       return ret;
-}
-
static void aca_banks_generate_cper(struct amdgpu_device *adev,
                                     enum aca_smu_type type,
                                     struct aca_banks *banks,
@@ -417,20 +396,14 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,
         }
}
 
-static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
-                           bank_handler_t handler, struct ras_query_context *qctx, void *data)
+static int aca_banks_update(struct amdgpu_device *adev, struct aca_handle *handle,
+                           enum aca_smu_type type, bank_handler_t handler,
+                           struct ras_query_context *qctx, void *data)
{
-       struct amdgpu_aca *aca = &adev->aca;
         struct aca_banks banks;
         u32 count = 0;
         int ret;
 
-       if (list_empty(&aca->mgr.list))
-               return 0;
-
-       if (!aca_bank_should_update(adev, type))
-               return 0;
-
         ret = aca_smu_get_valid_aca_count(adev, type, &count);
         if (ret)
                 return ret;
@@ -442,15 +415,12 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
 
         ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx);
         if (ret)
-               goto err_release_banks;
+               return ret;
 
-       if (list_empty(&banks.list)) {
-               ret = 0;
-               goto err_release_banks;
-       }
+       if (list_empty(&banks.list))
+               return 0;
 
-       ret = aca_dispatch_banks(&aca->mgr, &banks, type,
-                                handler, data);
+       ret = aca_dispatch_banks(handle, &banks, type, handler, data);
         if (ret)
                 goto err_release_banks;
 
@@ -537,7 +507,7 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h
         }
 
         /* update aca bank to aca source error_cache first */
-       ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL);
+       ret = aca_banks_update(adev, handle, smu_type,
+handler_aca_log_bank_error, qctx, NULL);
         if (ret)
                 return ret;
 
@@ -730,7 +700,7 @@ int amdgpu_aca_init(struct amdgpu_device *adev)
         struct amdgpu_aca *aca = &adev->aca;
         int ret;
 
-       atomic_set(&aca->ue_update_flag, 0);
+       atomic_set(&aca->ue_dump_flag, 0);
 
         ret = aca_manager_init(&aca->mgr);
         if (ret)
@@ -745,14 +715,14 @@ void amdgpu_aca_fini(struct amdgpu_device *adev)
 
         aca_manager_fini(&aca->mgr);
 
-       atomic_set(&aca->ue_update_flag, 0);
+       atomic_set(&aca->ue_dump_flag, 0);
}
 
int amdgpu_aca_reset(struct amdgpu_device *adev)  {
         struct amdgpu_aca *aca = &adev->aca;
 
-       atomic_set(&aca->ue_update_flag, 0);
+       atomic_set(&aca->ue_dump_flag, 0);
 
         return 0;
}
@@ -880,12 +850,20 @@ static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *ban  static int aca_dump_show(struct seq_file *m, enum aca_smu_type type)  {
         struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
+       struct aca_handle_manager *mgr = &adev->aca.mgr;
+       struct aca_handle *handle;
         struct aca_dump_context context = {
                 .m = m,
                 .idx = 0,
         };
 
-       return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context);
+       if (list_empty(&mgr->list))
+               return 0;
+
+       list_for_each_entry(handle, &mgr->list, node)
+               aca_banks_update(adev, handle, type, handler_aca_bank_dump, NULL,
+(void *)&context);
+
+       return 0;
}
 
static int aca_dump_ce_show(struct seq_file *m, void *unused) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
index 6f62e5d80ed6..e71d6f5afaec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
@@ -202,7 +202,7 @@ struct aca_smu_funcs {  struct amdgpu_aca {
         struct aca_handle_manager mgr;
         const struct aca_smu_funcs *smu_funcs;
-       atomic_t ue_update_flag;
+       atomic_t ue_dump_flag;
         bool is_enabled;
};
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index c0de682b7774..a4038e92c59e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -876,10 +876,14 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
                                       void *data)
{
         struct aca_bank_info info;
-       u64 misc0;
+       u64 misc0, status;
         u32 instlo;
         int ret;
 
+       status = bank->regs[ACA_REG_IDX_STATUS];
+       if (!ACA_REG__STATUS__VAL(status))
+               return 0;
+
         ret = aca_bank_info_decode(bank, &info);
         if (ret)
                 return ret;
@@ -894,8 +898,8 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
         switch (type) {
         case ACA_SMU_TYPE_UE:
                 bank->aca_err_type = ACA_ERROR_TYPE_UE;
-               ret = aca_error_cache_log_bank_error(handle, &info,
-                                                    ACA_ERROR_TYPE_UE, 1ULL);
+               if (ACA_REG__STATUS__UC(status) && ACA_REG__STATUS__PCC(status))
+                       ret = aca_error_cache_log_bank_error(handle, &info,
+ACA_ERROR_TYPE_UE, 1);
                 break;
         case ACA_SMU_TYPE_CE:
                 bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
--
2.34.1
 

[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux