If the SMU supports ECCTABLE, the driver can message the SMU to get the
ecc_table and then query UMC error info from ECCTABLE. Apply a PMFW
version check to ensure backward compatibility.

Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 42 ++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  7 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c    | 71 +++++++++++++------
 drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h    |  1 +
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c  | 12 ++++
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c |  4 ++
 6 files changed, 107 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 90f0db3b4f65..6b0f2ba1e420 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -888,6 +888,38 @@ void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
 	}
 }
 
+static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	/*
+	 * choosing right query method according to
+	 * whether smu support query error information
+	 */
+	if ((ras->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
+	    !smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc))) {
+
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->message_smu_query_ras_error_count)
+			adev->umc.ras_funcs->message_smu_query_ras_error_count(adev, err_data);
+
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->message_smu_query_ras_error_address)
+			adev->umc.ras_funcs->message_smu_query_ras_error_address(adev, err_data);
+	} else {
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_count)
+			adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
+
+		/* umc query_ras_error_address is also responsible for clearing
+		 * error status
+		 */
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_address)
+			adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
+	}
+}
+
 /* query/inject/cure begin */
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 				  struct ras_query_if *info)
@@ -901,15 +933,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 
 	switch (info->head.block) {
 	case AMDGPU_RAS_BLOCK__UMC:
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_count)
-			adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
-		/* umc query_ras_error_address is also responsible for clearing
-		 * error status
-		 */
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_address)
-			adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
+		amdgpu_ras_get_ecc_info(adev, &err_data);
 		break;
 	case AMDGPU_RAS_BLOCK__SDMA:
 		if (adev->sdma.funcs->query_ras_error_count) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bcbf3264d92f..3f0de0cc8403 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -322,6 +322,12 @@ struct ras_common_if {
 
 #define MAX_UMC_CHANNEL_NUM 32
 
+/*
+ * SMU support ECCTABLE since version 68.42.0,
+ * use this to decide query umc error info method
+ */
+#define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
+
 struct ecc_info_per_ch {
 	uint16_t ce_count_lo_chip;
 	uint16_t ce_count_hi_chip;
@@ -375,6 +381,7 @@ struct amdgpu_ras {
 
 	/* record umc error info queried from smu */
 	struct umc_ecc_info umc_ecc;
+	uint32_t smu_version;
 };
 
 struct ras_fs_data {
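A note on the policy above: amdgpu_ras_get_ecc_info() takes the SMU ECCTABLE path only when the recorded PMFW version is at least SUPPORT_ECCTABLE_SMU_VERSION and the smu_get_ecc_info() call itself succeeds; every other case falls back to the legacy UMC register query. The standalone sketch below shows the same selection in isolation. It is illustrative only; the fake_* helpers and the get_ecc_info() wrapper are hypothetical stand-ins, not driver code.

/* Illustrative sketch of the version-gated query selection.
 * The fake_* helpers are hypothetical stand-ins, not driver functions.
 */
#include <stdint.h>
#include <stdio.h>

#define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00	/* 68.42.0, as in amdgpu_ras.h */

static int fake_smu_get_ecc_info(void) { return 0; }	/* 0 means success */
static void query_via_ecc_table(void) { puts("query UMC errors from SMU ECCTABLE"); }
static void query_via_umc_regs(void) { puts("query UMC errors from UMC registers"); }

static void get_ecc_info(uint32_t smu_version)
{
	/* Take the ECCTABLE path only when the PMFW is new enough AND the
	 * SMU call succeeds; otherwise keep the legacy register based query.
	 */
	if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
	    !fake_smu_get_ecc_info())
		query_via_ecc_table();
	else
		query_via_umc_regs();
}

int main(void)
{
	get_ecc_info(0x00442a00);	/* 68.42.0, ECCTABLE path */
	get_ecc_info(0x00442900);	/* 68.41.0, legacy fallback */
	return 0;
}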
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 0c7c56a91b25..2c3e97c9410b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -97,28 +97,57 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-	if (adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->query_ras_error_count)
-		adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
 
-	if (adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->query_ras_error_address &&
-	    adev->umc.max_ras_err_cnt_per_query) {
-		err_data->err_addr =
-			kcalloc(adev->umc.max_ras_err_cnt_per_query,
-				sizeof(struct eeprom_table_record), GFP_KERNEL);
-
-		/* still call query_ras_error_address to clear error status
-		 * even NOMEM error is encountered
-		 */
-		if(!err_data->err_addr)
-			dev_warn(adev->dev, "Failed to alloc memory for "
-					"umc error address record!\n");
-
-		/* umc query_ras_error_address is also responsible for clearing
-		 * error status
-		 */
-		adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
+	if ((con->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
+	    !smu_get_ecc_info(&adev->smu, (void *)&(con->umc_ecc))) {
+
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->message_smu_query_ras_error_count)
+			adev->umc.ras_funcs->message_smu_query_ras_error_count(adev, ras_error_status);
+
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->message_smu_query_ras_error_address &&
+		    adev->umc.max_ras_err_cnt_per_query) {
+			err_data->err_addr =
+				kcalloc(adev->umc.max_ras_err_cnt_per_query,
+					sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+			/* still call query_ras_error_address to clear error status
+			 * even NOMEM error is encountered
+			 */
+			if(!err_data->err_addr)
+				dev_warn(adev->dev, "Failed to alloc memory for "
+						"umc error address record!\n");
+
+			/* umc query_ras_error_address is also responsible for clearing
+			 * error status
+			 */
+			adev->umc.ras_funcs->message_smu_query_ras_error_address(adev, ras_error_status);
+		}
+	} else {
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_count)
+			adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
+
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_address &&
+		    adev->umc.max_ras_err_cnt_per_query) {
+			err_data->err_addr =
+				kcalloc(adev->umc.max_ras_err_cnt_per_query,
+					sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+			/* still call query_ras_error_address to clear error status
+			 * even NOMEM error is encountered
+			 */
+			if(!err_data->err_addr)
+				dev_warn(adev->dev, "Failed to alloc memory for "
+						"umc error address record!\n");
+
+			/* umc query_ras_error_address is also responsible for clearing
+			 * error status
+			 */
+			adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
+		}
 	}
 
 	/* only uncorrectable error needs gpu reset */
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index ea65de0160c3..7a06021a58f0 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -1404,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu, bool enable);
 
 int smu_wait_for_event(struct amdgpu_device *adev, enum smu_event_type event,
 		       uint64_t event_arg);
+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);
 
 #endif
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 01168b8955bf..6340c079f35e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -3072,6 +3072,18 @@ int smu_set_light_sbr(struct smu_context *smu, bool enable)
 	return ret;
 }
 
+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc)
+{
+	int ret = -1;
+
+	if (smu->ppt_funcs &&
+	    smu->ppt_funcs->get_ecc_info)
+		ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc);
+
+	return ret;
+
+}
+
 static int smu_get_prv_buffer_details(void *handle, void **addr, size_t *size)
 {
 	struct smu_context *smu = handle;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 55421ea622fb..55ef10ca684a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -200,11 +200,15 @@ int smu_v13_0_check_fw_version(struct smu_context *smu)
 	uint16_t smu_major;
 	uint8_t smu_minor, smu_debug;
 	int ret = 0;
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(smu->adev);
 
 	ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
 	if (ret)
 		return ret;
 
+	/* record smu interface version, help umc query error method */
+	ras->smu_version = smu_version;
+
 	smu_major = (smu_version >> 16) & 0xffff;
 	smu_minor = (smu_version >> 8) & 0xff;
 	smu_debug = (smu_version >> 0) & 0xff;
-- 
2.17.1
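A further note on the version encoding, illustrative only: smu_v13_0_check_fw_version() splits smu_version into major (bits 31:16), minor (bits 15:8) and debug (bits 7:0), and the same packed value is what gets stored in ras->smu_version, so the comparison against SUPPORT_ECCTABLE_SMU_VERSION in amdgpu_ras_get_ecc_info() compares like for like. The small standalone check below confirms that 0x00442a00 decodes to 68.42.0, matching the comment added in amdgpu_ras.h.

/* Standalone check, illustrative only: decode the ECCTABLE gate the same
 * way smu_v13_0_check_fw_version() decodes smu_version.
 */
#include <stdint.h>
#include <stdio.h>

#define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00

int main(void)
{
	uint32_t v = SUPPORT_ECCTABLE_SMU_VERSION;
	uint16_t smu_major = (v >> 16) & 0xffff;
	uint8_t smu_minor = (v >> 8) & 0xff;
	uint8_t smu_debug = (v >> 0) & 0xff;

	/* Prints "68.42.0" */
	printf("%u.%u.%u\n", (unsigned)smu_major, (unsigned)smu_minor,
	       (unsigned)smu_debug);
	return 0;
}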