SMU add a new variable mca_ceumc_addr to record umc correctable error address in EccInfo table, driver side add EccInfo_V2_t to support this feature Changed from V1: remove ecc_table_v2 and unnecessary table id, define union struct include EccInfo_t and EccInfo_V2_t. Changed from V2: sync patch verion Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + .../inc/pmfw_if/smu13_driver_if_aldebaran.h | 16 +++++- .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 53 ++++++++++++++----- 3 files changed, 57 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index b9a6fac2b8b2..28e603243b67 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -328,6 +328,7 @@ struct ecc_info_per_ch { uint16_t ce_count_hi_chip; uint64_t mca_umc_status; uint64_t mca_umc_addr; + uint64_t mca_ceumc_addr; }; struct umc_ecc_info { diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h index 0f67c56c2863..6f92038470ec 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h @@ -519,7 +519,21 @@ typedef struct { } EccInfo_t; typedef struct { - EccInfo_t EccInfo[ALDEBARAN_UMC_CHANNEL_NUM]; + uint64_t mca_umc_status; + uint64_t mca_umc_addr; + uint64_t mca_ceumc_addr; + + uint16_t ce_count_lo_chip; + uint16_t ce_count_hi_chip; + + uint32_t eccPadding; +} EccInfo_V2_t; + +typedef struct { + union { + EccInfo_t EccInfo[ALDEBARAN_UMC_CHANNEL_NUM]; + EccInfo_V2_t EccInfo_V2[ALDEBARAN_UMC_CHANNEL_NUM]; + }; } EccInfoTable_t; // These defines are used with the following messages: diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index 38af648cb857..9cdfeea58085 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -82,6 +82,12 @@ */ #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00 +/* + * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0, + * use this to check mca_ceumc_addr record whether support + */ +#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700 + /* * SMU support BAD CHENNEL info MSG since version 68.51.00, * use this to check ECCTALE feature whether support @@ -1802,7 +1808,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu, return sizeof(struct gpu_metrics_v1_3); } -static int aldebaran_check_ecc_table_support(struct smu_context *smu) +static int aldebaran_check_ecc_table_support(struct smu_context *smu, + int *ecctable_version) { uint32_t if_version = 0xff, smu_version = 0xff; int ret = 0; @@ -1815,6 +1822,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu) if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION) ret = -EOPNOTSUPP; + else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION && + smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION) + *ecctable_version = 1; + else + *ecctable_version = 2; return ret; } @@ -1826,9 +1838,10 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu, EccInfoTable_t *ecc_table = NULL; struct ecc_info_per_ch *ecc_info_per_channel = NULL; int i, ret = 0; + int table_version = 0; struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table; - ret = aldebaran_check_ecc_table_support(smu); + ret = aldebaran_check_ecc_table_support(smu, &table_version); if (ret) return ret; @@ -1844,16 +1857,32 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu, ecc_table = (EccInfoTable_t *)smu_table->ecc_table; - for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { - ecc_info_per_channel = &(eccinfo->ecc[i]); - ecc_info_per_channel->ce_count_lo_chip = - ecc_table->EccInfo[i].ce_count_lo_chip; - ecc_info_per_channel->ce_count_hi_chip = - ecc_table->EccInfo[i].ce_count_hi_chip; - ecc_info_per_channel->mca_umc_status = - ecc_table->EccInfo[i].mca_umc_status; - ecc_info_per_channel->mca_umc_addr = - ecc_table->EccInfo[i].mca_umc_addr; + if (table_version == 1) { + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { + ecc_info_per_channel = &(eccinfo->ecc[i]); + ecc_info_per_channel->ce_count_lo_chip = + ecc_table->EccInfo[i].ce_count_lo_chip; + ecc_info_per_channel->ce_count_hi_chip = + ecc_table->EccInfo[i].ce_count_hi_chip; + ecc_info_per_channel->mca_umc_status = + ecc_table->EccInfo[i].mca_umc_status; + ecc_info_per_channel->mca_umc_addr = + ecc_table->EccInfo[i].mca_umc_addr; + } + } else if (table_version == 2) { + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { + ecc_info_per_channel = &(eccinfo->ecc[i]); + ecc_info_per_channel->ce_count_lo_chip = + ecc_table->EccInfo_V2[i].ce_count_lo_chip; + ecc_info_per_channel->ce_count_hi_chip = + ecc_table->EccInfo_V2[i].ce_count_hi_chip; + ecc_info_per_channel->mca_umc_status = + ecc_table->EccInfo_V2[i].mca_umc_status; + ecc_info_per_channel->mca_umc_addr = + ecc_table->EccInfo_V2[i].mca_umc_addr; + ecc_info_per_channel->mca_ceumc_addr = + ecc_table->EccInfo_V2[i].mca_ceumc_addr; + } } return ret; -- 2.17.1