RE: [PATCH 11/14] drm/amdgpu: add xgmi v6.4.0 ACA support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[AMD Official Use Only - General]

+       if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
+           (type == ACA_ERROR_TYPE_CE && ext_error_code == 6)) {
+               report->type = type;
+               report->count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
+       }

Gentle reminder: we should be able to extend the error logging to cover all the PCS errors. Just read back the config registers so we know which errors are configured as UE and which are configured as CE.

Regards,
Hawking


-----Original Message-----
From: Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>
Sent: Wednesday, January 3, 2024 16:02
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>
Subject: [PATCH 11/14] drm/amdgpu: add xgmi v6.4.0 ACA support

Add XGMI v6.4.0 ACA driver support.

Signed-off-by: Yang Wang <kevinyang.wang@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 63 +++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index a6c88f2fe6e5..61208ca94442 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1035,15 +1035,76 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
        return 0;
 }

+static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
+                                               struct aca_bank_report *report, void *data) {
+       struct amdgpu_device *adev = handle->adev;
+       const char *error_str;
+       u64 status;
+       int ret, ext_error_code;
+
+       ret = aca_bank_info_decode(bank, &report->info);
+       if (ret)
+               return ret;
+
+       status = bank->regs[MCA_REG_IDX_STATUS];
+       ext_error_code = MCA_REG__STATUS__ERRORCODEEXT(status);
+
+       error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
+               xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
+       if (error_str)
+               dev_info(adev->dev, "%s detected\n", error_str);
+
+       if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
+           (type == ACA_ERROR_TYPE_CE && ext_error_code == 6)) {
+               report->type = type;
+               report->count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
+       }
+
+       return 0;
+}
+
+static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = {
+       .aca_bank_generate_report = xgmi_v6_4_0_aca_bank_generate_report,
+};
+
+static const struct aca_info xgmi_v6_4_0_aca_info = {
+       .hwip = ACA_HWIP_TYPE_PCS_XGMI,
+       .mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK,
+       .bank_ops = &xgmi_v6_4_0_aca_bank_ops, };
+
 static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)  {
+       int r;
+
        if (!adev->gmc.xgmi.supported ||
            adev->gmc.xgmi.num_physical_nodes == 0)
                return 0;

        amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);

-       return amdgpu_ras_block_late_init(adev, ras_block);
+       r = amdgpu_ras_block_late_init(adev, ras_block);
+       if (r)
+               return r;
+
+       switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+       case IP_VERSION(6, 4, 0):
+               r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL,
+                                       &xgmi_v6_4_0_aca_info, NULL);
+               if (r)
+                       goto late_fini;
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+
+late_fini:
+       amdgpu_ras_block_late_fini(adev, ras_block);
+
+       return r;
 }

 uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
--
2.34.1





[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux