+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs-
message_smu_query_ras_error_count)
+ adev->umc.ras_funcs-
message_smu_query_ras_error_count(adev,
+err_data);
+
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs-
message_smu_query_ras_error_address)
+ adev->umc.ras_funcs-
message_smu_query_ras_error_address(adev, err_data);
+ } else {
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_count)
+ adev->umc.ras_funcs->query_ras_error_count(adev,
err_data);
+
+ /* umc query_ras_error_address is also responsible for
clearing
+ * error status
+ */
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_address)
+ adev->umc.ras_funcs-
query_ras_error_address(adev, err_data);
+ }
+}
+
/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info)
@@ -901,15 +933,7 @@ int amdgpu_ras_query_error_status(struct
amdgpu_device *adev,
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__UMC:
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_count)
- adev->umc.ras_funcs->query_ras_error_count(adev,
&err_data);
- /* umc query_ras_error_address is also responsible for
clearing
- * error status
- */
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_address)
- adev->umc.ras_funcs-
query_ras_error_address(adev, &err_data);
+ amdgpu_ras_get_ecc_info(adev, &err_data);
break;
case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->query_ras_error_count) { diff --git
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bcbf3264d92f..3f0de0cc8403 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -322,6 +322,12 @@ struct ras_common_if {
#define MAX_UMC_CHANNEL_NUM 32
+/*
+ * SMU support ECCTABLE since version 68.42.0,
+ * use this to decide query umc error info method */ #define
+SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
+
struct ecc_info_per_ch {
uint16_t ce_count_lo_chip;
uint16_t ce_count_hi_chip;
@@ -375,6 +381,7 @@ struct amdgpu_ras {
/* record umc error info queried from smu */
struct umc_ecc_info umc_ecc;
+ uint32_t smu_version;
};
struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 0c7c56a91b25..2c3e97c9410b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -97,28 +97,57 @@ int amdgpu_umc_process_ras_data_cb(struct
amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_count)
- adev->umc.ras_funcs->query_ras_error_count(adev,
ras_error_status);
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_address &&
- adev->umc.max_ras_err_cnt_per_query) {
- err_data->err_addr =
- kcalloc(adev->umc.max_ras_err_cnt_per_query,
- sizeof(struct eeprom_table_record),
GFP_KERNEL);
-
- /* still call query_ras_error_address to clear error status
- * even NOMEM error is encountered
- */
- if(!err_data->err_addr)
- dev_warn(adev->dev, "Failed to alloc memory for "
- "umc error address record!\n");
-
- /* umc query_ras_error_address is also responsible for
clearing
- * error status
- */
- adev->umc.ras_funcs->query_ras_error_address(adev,
ras_error_status);
+ if ((con->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
+ !smu_get_ecc_info(&adev->smu, (void *)&(con-
umc_ecc))) {
+
Same comment as above.
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs-
message_smu_query_ras_error_count)
+ adev->umc.ras_funcs-
message_smu_query_ras_error_count(adev,
+ras_error_status);
+
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs-
message_smu_query_ras_error_address &&
+ adev->umc.max_ras_err_cnt_per_query) {
+ err_data->err_addr =
+ kcalloc(adev-
umc.max_ras_err_cnt_per_query,
+ sizeof(struct eeprom_table_record),
GFP_KERNEL);
+
+ /* still call query_ras_error_address to clear error
status
+ * even NOMEM error is encountered
+ */
+ if(!err_data->err_addr)
+ dev_warn(adev->dev, "Failed to alloc
memory for "
+ "umc error address
record!\n");
+
+ /* umc query_ras_error_address is also responsible
for clearing
+ * error status
+ */
+ adev->umc.ras_funcs-
message_smu_query_ras_error_address(adev, ras_error_status);
+ }
+ } else {
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_count)
+ adev->umc.ras_funcs->query_ras_error_count(adev,
+ras_error_status);
+
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_address &&
+ adev->umc.max_ras_err_cnt_per_query) {
+ err_data->err_addr =
+ kcalloc(adev-
umc.max_ras_err_cnt_per_query,
+ sizeof(struct eeprom_table_record),
GFP_KERNEL);
+
+ /* still call query_ras_error_address to clear error
status
+ * even NOMEM error is encountered
+ */
+ if(!err_data->err_addr)
+ dev_warn(adev->dev, "Failed to alloc
memory for "
+ "umc error address
record!\n");
+
+ /* umc query_ras_error_address is also responsible
for clearing
+ * error status
+ */
+ adev->umc.ras_funcs-
query_ras_error_address(adev, ras_error_status);
+ }
}
/* only uncorrectable error needs gpu reset */ diff --git
a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index ea65de0160c3..7a06021a58f0 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -1404,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu,
bool enable);
int smu_wait_for_event(struct amdgpu_device *adev, enum
smu_event_type event,
uint64_t event_arg);
+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);
#endif
#endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 01168b8955bf..6340c079f35e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -3072,6 +3072,18 @@ int smu_set_light_sbr(struct smu_context *smu,
bool enable)
return ret;
}
+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc) {
+ int ret = -1;
+
+ if (smu->ppt_funcs &&
+ smu->ppt_funcs->get_ecc_info)
+ ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc);
+
Shouldn't return -1 if ppt func is not present. If ppt func is not present, that
means this method is not supported for the SOC; return ENOTSUPP.
+ return ret;
+
+}
+
Probably the above function should be clubbed with patch 3 - smu support
for getting ras ecc info.
static int smu_get_prv_buffer_details(void *handle, void **addr, size_t
*size)
{
struct smu_context *smu = handle;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 55421ea622fb..55ef10ca684a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -200,11 +200,15 @@ int smu_v13_0_check_fw_version(struct
smu_context *smu)
uint16_t smu_major;
uint8_t smu_minor, smu_debug;
int ret = 0;
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(smu->adev);
ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
if (ret)
return ret;
+ /* record smu interface version, help umc query error method */
+ ras->smu_version = smu_version;
+
This is not needed. ASIC specific functions can check the FW version for ECC
table support.
Thanks,
Lijo
smu_major = (smu_version >> 16) & 0xffff;
smu_minor = (smu_version >> 8) & 0xff;
smu_debug = (smu_version >> 0) & 0xff;