Re: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 5/23/2022 1:47 PM, Stanley.Yang wrote:
SMU add a new variable mca_ceumc_addr to record
umc correctable error address in EccInfo table,
driver side add ecctable_v2 to support this feature

Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |   1 +
  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |   2 +
  .../inc/pmfw_if/smu13_driver_if_aldebaran.h   |  15 +++
  .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 101 ++++++++++++++----
  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |   2 +
  5 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b9a6fac2b8b2..28e603243b67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -328,6 +328,7 @@ struct ecc_info_per_ch {
  	uint16_t ce_count_hi_chip;
  	uint64_t mca_umc_status;
  	uint64_t mca_umc_addr;
+	uint64_t mca_ceumc_addr;
  };
struct umc_ecc_info {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index a6a7b6c33683..9f7257ada437 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -322,6 +322,7 @@ enum smu_table_id
  	SMU_TABLE_PACE,
  	SMU_TABLE_ECCINFO,
  	SMU_TABLE_COMBO_PPTABLE,
+	SMU_TABLE_ECCINFO_V2,

Hi Stanley,

This is not the right approach. Need to ask FW team to fix this. There shouldn't be any new id with each version of table. You may check Sienna Cichlid smu metrics table as an example and ask FW team to follow something similar. I don't see 68.55 being released, so it's not late anyway. We don't need to keep defining pointers in table context per version of ECC table.

Thanks,
Lijo

  	SMU_TABLE_COUNT,
  };
@@ -340,6 +341,7 @@ struct smu_table_context
  	void				*driver_pptable;
  	void				*combo_pptable;
  	void                            *ecc_table;
+	void                            *ecc_table_v2;	// adapt to smu support record mca_ceumc_addr
  	void				*driver_smu_config_table;
  	struct smu_table		tables[SMU_TABLE_COUNT];
  	/*
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
index 0f67c56c2863..2868604eff49 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
@@ -522,6 +522,21 @@ typedef struct {
  	EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
  } EccInfoTable_t;
+typedef struct {
+	uint64_t mca_umc_status;
+	uint64_t mca_umc_addr;
+	uint64_t mca_ceumc_addr;
+
+	uint16_t ce_count_lo_chip;
+	uint16_t ce_count_hi_chip;
+
+	uint32_t eccPadding;
+} EccInfo_t_v2;
+
+typedef struct {
+	EccInfo_t_v2  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
+} EccInfoTable_t_v2;
+
  // These defines are used with the following messages:
  // SMC_MSG_TransferTableDram2Smu
  // SMC_MSG_TransferTableSmu2Dram
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 38af648cb857..e58df9490cec 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -82,6 +82,12 @@
   */
  #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
+/*
+ * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0,
+ * use this to check mca_ceumc_addr record whether support
+ */
+#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
+
  /*
   * SMU support BAD CHENNEL info MSG since version 68.51.00,
   * use this to check ECCTALE feature whether support
@@ -239,6 +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
  	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
  		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+ SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2, sizeof(EccInfoTable_t_v2),
+			PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+
  	smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
  	if (!smu_table->metrics_table)
  		return -ENOMEM;
@@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
  	if (!smu_table->ecc_table)
  		return -ENOMEM;
+ smu_table->ecc_table_v2 = kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL);
+	if (!smu_table->ecc_table_v2)
+		return -ENOMEM;;
+
  	return 0;
  }
@@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
  	return sizeof(struct gpu_metrics_v1_3);
  }
-static int aldebaran_check_ecc_table_support(struct smu_context *smu)
+static int aldebaran_check_ecc_table_support(struct smu_context *smu,
+		int *ecctable_version)
  {
  	uint32_t if_version = 0xff, smu_version = 0xff;
  	int ret = 0;
@@ -1815,6 +1829,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)
if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
  		ret = -EOPNOTSUPP;
+	else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
+			smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
+		*ecctable_version = 1;
+	else
+		*ecctable_version = 2;
return ret;
  }
@@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
  {
  	struct smu_table_context *smu_table = &smu->smu_table;
  	EccInfoTable_t *ecc_table = NULL;
+	EccInfoTable_t_v2 *ecc_table_v2 = NULL;
  	struct ecc_info_per_ch *ecc_info_per_channel = NULL;
  	int i, ret = 0;
+	int table_version = 0;
  	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
- ret = aldebaran_check_ecc_table_support(smu);
+	ret = aldebaran_check_ecc_table_support(smu, &table_version);
  	if (ret)
  		return ret;
- ret = smu_cmn_update_table(smu,
-			       SMU_TABLE_ECCINFO,
-			       0,
-			       smu_table->ecc_table,
-			       false);
-	if (ret) {
-		dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
-		return ret;
-	}
+	if (table_version == 1) {
+		ret = smu_cmn_update_table(smu,
+				       SMU_TABLE_ECCINFO,
+				       0,
+				       smu_table->ecc_table,
+				       false);
+		if (ret) {
+			dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
+			return ret;
+		}
+
+		ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
+
+		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+			ecc_info_per_channel = &(eccinfo->ecc[i]);
+			ecc_info_per_channel->ce_count_lo_chip =
+				ecc_table->EccInfo[i].ce_count_lo_chip;
+			ecc_info_per_channel->ce_count_hi_chip =
+				ecc_table->EccInfo[i].ce_count_hi_chip;
+			ecc_info_per_channel->mca_umc_status =
+				ecc_table->EccInfo[i].mca_umc_status;
+			ecc_info_per_channel->mca_umc_addr =
+				ecc_table->EccInfo[i].mca_umc_addr;
+		}
+	} else if (table_version == 2) {
+		/* still use SMU_TABLE_ECC_INFO index,
+		 * smu 68.55.0 add mca_ceumc_addr variable
+		 * in EccInfo_t struct to report correctable
+		 * error address and the table_id is not changed
+		 */
+		ret = smu_cmn_update_table(smu,
+				       SMU_TABLE_ECCINFO,
+				       0,
+				       smu_table->ecc_table_v2,
+					   false);
- ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
-
-	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
-		ecc_info_per_channel = &(eccinfo->ecc[i]);
-		ecc_info_per_channel->ce_count_lo_chip =
-			ecc_table->EccInfo[i].ce_count_lo_chip;
-		ecc_info_per_channel->ce_count_hi_chip =
-			ecc_table->EccInfo[i].ce_count_hi_chip;
-		ecc_info_per_channel->mca_umc_status =
-			ecc_table->EccInfo[i].mca_umc_status;
-		ecc_info_per_channel->mca_umc_addr =
-			ecc_table->EccInfo[i].mca_umc_addr;
+		if (ret) {
+			dev_info(smu->adev->dev, "Failed to export SMU ecc table_v2!\n");
+			return ret;
+		}
+
+		ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2;
+
+		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+			ecc_info_per_channel = &(eccinfo->ecc[i]);
+			ecc_info_per_channel->ce_count_lo_chip =
+				ecc_table_v2->EccInfo[i].ce_count_lo_chip;
+			ecc_info_per_channel->ce_count_hi_chip =
+				ecc_table_v2->EccInfo[i].ce_count_hi_chip;
+			ecc_info_per_channel->mca_umc_status =
+				ecc_table_v2->EccInfo[i].mca_umc_status;
+			ecc_info_per_channel->mca_umc_addr =
+				ecc_table_v2->EccInfo[i].mca_umc_addr;
+			ecc_info_per_channel->mca_ceumc_addr =
+				ecc_table_v2->EccInfo[i].mca_ceumc_addr;
+		}
  	}
return ret;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index ae6321af9d88..af2d84a16f3e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
  	kfree(smu_table->hardcode_pptable);
  	smu_table->hardcode_pptable = NULL;
+ kfree(smu_table->ecc_table_v2);
  	kfree(smu_table->ecc_table);
  	kfree(smu_table->metrics_table);
  	kfree(smu_table->watermarks_table);
+	smu_table->ecc_table_v2 = NULL;
  	smu_table->ecc_table = NULL;
  	smu_table->metrics_table = NULL;
  	smu_table->watermarks_table = NULL;




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux