[AMD Official Use Only - General] From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> on behalf of Stanley.Yang <Stanley.Yang@xxxxxxx>
Sent: Monday, May 23, 2022 4:17 PM To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx <amd-gfx@xxxxxxxxxxxxxxxxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Quan, Evan <Evan.Quan@xxxxxxx>; Lazar, Lijo <Lijo.Lazar@xxxxxxx> Cc: Yang, Stanley <Stanley.Yang@xxxxxxx> Subject: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable SMU add a new variable mca_ceumc_addr to record
umc correctable error address in EccInfo table, driver side add ecctable_v2 to support this feature Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 2 + .../inc/pmfw_if/smu13_driver_if_aldebaran.h | 15 +++ .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 101 ++++++++++++++---- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 2 + 5 files changed, 98 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index b9a6fac2b8b2..28e603243b67 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -328,6 +328,7 @@ struct ecc_info_per_ch { uint16_t ce_count_hi_chip; uint64_t mca_umc_status; uint64_t mca_umc_addr; + uint64_t mca_ceumc_addr; }; struct umc_ecc_info { diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h index a6a7b6c33683..9f7257ada437 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -322,6 +322,7 @@ enum smu_table_id SMU_TABLE_PACE, SMU_TABLE_ECCINFO, SMU_TABLE_COMBO_PPTABLE, + SMU_TABLE_ECCINFO_V2, SMU_TABLE_COUNT, }; @@ -340,6 +341,7 @@ struct smu_table_context void *driver_pptable; void *combo_pptable; void *ecc_table; + void *ecc_table_v2; // adapt to smu support record mca_ceumc_addr void *driver_smu_config_table; struct smu_table tables[SMU_TABLE_COUNT]; /* diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h index 0f67c56c2863..2868604eff49 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h @@ -522,6 +522,21 @@ typedef struct { EccInfo_t EccInfo[ALDEBARAN_UMC_CHANNEL_NUM]; } EccInfoTable_t; +typedef struct { + uint64_t mca_umc_status; + uint64_t mca_umc_addr; + uint64_t mca_ceumc_addr; + + uint16_t ce_count_lo_chip; + uint16_t ce_count_hi_chip; + + uint32_t eccPadding; +} EccInfo_t_v2; + +typedef struct { + EccInfo_t_v2 EccInfo[ALDEBARAN_UMC_CHANNEL_NUM]; +} EccInfoTable_t_v2; + // These defines are used with the following messages: // SMC_MSG_TransferTableDram2Smu // SMC_MSG_TransferTableSmu2Dram diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index 38af648cb857..e58df9490cec 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -82,6 +82,12 @@ */ #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00 +/* + * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0, + * use this to check mca_ceumc_addr record whether support + */ +#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700 + /* * SMU support BAD CHENNEL info MSG since version 68.51.00, * use this to check ECCTALE feature whether support @@ -239,6 +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu) SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t), PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); + SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2, sizeof(EccInfoTable_t_v2), + PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); + [kevin]:
this table mapping is not needed, the reason as below.
smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
if (!smu_table->metrics_table) return -ENOMEM; @@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context *smu) if (!smu_table->ecc_table) return -ENOMEM; + smu_table->ecc_table_v2 = kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL); + if (!smu_table->ecc_table_v2) + return -ENOMEM;; + [kevin]:
add eccinfo table v2 is not needed for this case, this table is only used store table data from pmfw to driver,
you can create a large enough table which can save ecc table data directly.
e.g:
size = max(sizeof(EccInfoTable_t_v2), sizeof(EccInfoTable_t));
smu_table->ecc_table = kzalloc(size, GFP_KERNEL);
Best Regards,
Kevin
return 0;
} @@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu, return sizeof(struct gpu_metrics_v1_3); } -static int aldebaran_check_ecc_table_support(struct smu_context *smu) +static int aldebaran_check_ecc_table_support(struct smu_context *smu, + int *ecctable_version) { uint32_t if_version = 0xff, smu_version = 0xff; int ret = 0; @@ -1815,6 +1829,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu) if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION) ret = -EOPNOTSUPP; + else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION && + smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION) + *ecctable_version = 1; + else + *ecctable_version = 2; return ret; } @@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu, { struct smu_table_context *smu_table = &smu->smu_table; EccInfoTable_t *ecc_table = NULL; + EccInfoTable_t_v2 *ecc_table_v2 = NULL; struct ecc_info_per_ch *ecc_info_per_channel = NULL; int i, ret = 0; + int table_version = 0; struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table; - ret = aldebaran_check_ecc_table_support(smu); + ret = aldebaran_check_ecc_table_support(smu, &table_version); if (ret) return ret; - ret = smu_cmn_update_table(smu, - SMU_TABLE_ECCINFO, - 0, - smu_table->ecc_table, - false); - if (ret) { - dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n"); - return ret; - } + if (table_version == 1) { + ret = smu_cmn_update_table(smu, + SMU_TABLE_ECCINFO, + 0, + smu_table->ecc_table, + false); + if (ret) { + dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n"); + return ret; + } + + ecc_table = (EccInfoTable_t *)smu_table->ecc_table; + + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { + ecc_info_per_channel = &(eccinfo->ecc[i]); + ecc_info_per_channel->ce_count_lo_chip = + ecc_table->EccInfo[i].ce_count_lo_chip; + ecc_info_per_channel->ce_count_hi_chip = + ecc_table->EccInfo[i].ce_count_hi_chip; + ecc_info_per_channel->mca_umc_status = + ecc_table->EccInfo[i].mca_umc_status; + ecc_info_per_channel->mca_umc_addr = + ecc_table->EccInfo[i].mca_umc_addr; + } + } else if (table_version == 2) { + /* still use SMU_TABLE_ECC_INFO index, + * smu 68.55.0 add mca_ceumc_addr variable + * in EccInfo_t struct to report correctable + * error address and the table_id is not changed + */ + ret = smu_cmn_update_table(smu, + SMU_TABLE_ECCINFO, + 0, + smu_table->ecc_table_v2, + false); - ecc_table = (EccInfoTable_t *)smu_table->ecc_table; - - for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { - ecc_info_per_channel = &(eccinfo->ecc[i]); - ecc_info_per_channel->ce_count_lo_chip = - ecc_table->EccInfo[i].ce_count_lo_chip; - ecc_info_per_channel->ce_count_hi_chip = - ecc_table->EccInfo[i].ce_count_hi_chip; - ecc_info_per_channel->mca_umc_status = - ecc_table->EccInfo[i].mca_umc_status; - ecc_info_per_channel->mca_umc_addr = - ecc_table->EccInfo[i].mca_umc_addr; + if (ret) { + dev_info(smu->adev->dev, "Failed to export SMU ecc table_v2!\n"); + return ret; + } + + ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2; + + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { + ecc_info_per_channel = &(eccinfo->ecc[i]); + ecc_info_per_channel->ce_count_lo_chip = + ecc_table_v2->EccInfo[i].ce_count_lo_chip; + ecc_info_per_channel->ce_count_hi_chip = + ecc_table_v2->EccInfo[i].ce_count_hi_chip; + ecc_info_per_channel->mca_umc_status = + ecc_table_v2->EccInfo[i].mca_umc_status; + ecc_info_per_channel->mca_umc_addr = + ecc_table_v2->EccInfo[i].mca_umc_addr; + ecc_info_per_channel->mca_ceumc_addr = + ecc_table_v2->EccInfo[i].mca_ceumc_addr; + } } return ret; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index ae6321af9d88..af2d84a16f3e 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu) kfree(smu_table->hardcode_pptable); smu_table->hardcode_pptable = NULL; + kfree(smu_table->ecc_table_v2); kfree(smu_table->ecc_table); kfree(smu_table->metrics_table); kfree(smu_table->watermarks_table); + smu_table->ecc_table_v2 = NULL; smu_table->ecc_table = NULL; smu_table->metrics_table = NULL; smu_table->watermarks_table = NULL; -- 2.17.1 |