Re: 回复: [PATCH Review 3/4] drm/amdgpu: add message smu to get ecc_table v2

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 11/18/2021 6:05 PM, Yang, Stanley wrote:
[AMD Official Use Only]



-----邮件原件-----
发件人: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
发送时间: Thursday, November 18, 2021 7:33 PM
收件人: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd-
gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>;
Clements, John <John.Clements@xxxxxxx>; Quan, Evan
<Evan.Quan@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>
主题: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get
ecc_table v2



On 11/18/2021 3:03 PM, Stanley.Yang wrote:
support ECC TABLE message, this table include umc ras error count and
error address

v2:
      add smu version check to query whether support ecctable
      call smu_cmn_update_table to get ecctable directly

Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx>
---
   drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h       |  8 +++
   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 14 ++++
   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 70
+++++++++++++++++++
   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  2 +
   4 files changed, 94 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index 3557f4e7fc30..7a06021a58f0 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -324,6 +324,7 @@ enum smu_table_id
   	SMU_TABLE_OVERDRIVE,
   	SMU_TABLE_I2C_COMMANDS,
   	SMU_TABLE_PACE,
+	SMU_TABLE_ECCINFO,
   	SMU_TABLE_COUNT,
   };

@@ -340,6 +341,7 @@ struct smu_table_context
   	void				*max_sustainable_clocks;
   	struct smu_bios_boot_up_values	boot_values;
   	void                            *driver_pptable;
+	void                            *ecc_table;
   	struct smu_table		tables[SMU_TABLE_COUNT];
   	/*
   	 * The driver table is just a staging buffer for @@ -1261,6
+1263,11 @@ struct pptable_funcs {
   	 *
		of SMUBUS table.
   	 */
   	int (*send_hbm_bad_pages_num)(struct smu_context *smu,
uint32_t
size);
+
+	/**
+	 * @get_ecc_table:  message SMU to get ECC INFO table.
+	 */
+	ssize_t (*get_ecc_info)(struct smu_context *smu, void *table);
   };

   typedef enum {
@@ -1397,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu,
bool enable);

   int smu_wait_for_event(struct amdgpu_device *adev, enum
smu_event_type event,
   		       uint64_t event_arg);
+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);

   #endif
   #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 01168b8955bf..fd3b6b460b12 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -3072,6 +3072,20 @@ int smu_set_light_sbr(struct smu_context *smu,
bool enable)
   	return ret;
   }

+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc) {
+	int ret = -EOPNOTSUPP;
+
+	mutex_lock(&smu->mutex);
+	if (smu->ppt_funcs &&
+		smu->ppt_funcs->get_ecc_info)
+		ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc);
+	mutex_unlock(&smu->mutex);
+
+	return ret;
+
+}
+
   static int smu_get_prv_buffer_details(void *handle, void **addr, size_t
*size)
   {
   	struct smu_context *smu = handle;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index f835d86cc2f5..4c21609ccea5 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -78,6 +78,12 @@

   #define smnPCIE_ESM_CTRL			0x111003D0

+/*
+ * SMU support ECCTABLE since version 68.42.0,
+ * use this to check ECCTALE feature whether support  */ #define
+SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
+
   static const struct smu_temperature_range smu13_thermal_policy[] =
   {
   	{-273150,  99000, 99000, -273150, 99000, 99000, -273150, 99000,
99000}, @@ -190,6 +196,7 @@ static const struct cmn2asic_mapping
aldebaran_table_map[SMU_TABLE_COUNT] = {
   	TAB_MAP(SMU_METRICS),
   	TAB_MAP(DRIVER_SMU_CONFIG),
   	TAB_MAP(I2C_COMMANDS),
+	TAB_MAP(ECCINFO),
   };

   static const uint8_t aldebaran_throttler_map[] = { @@ -223,6 +230,9
@@ static int aldebaran_tables_init(struct smu_context *smu)
   	SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS,
sizeof(SwI2cRequest_t),
   		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);

+	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO,
sizeof(EccInfoTable_t),
+		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+
   	smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t),
GFP_KERNEL);
   	if (!smu_table->metrics_table)
   		return -ENOMEM;
@@ -235,6 +245,10 @@ static int aldebaran_tables_init(struct smu_context
*smu)
   		return -ENOMEM;
   	}

+	smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size,
GFP_KERNEL);
+	if (!smu_table->ecc_table)
+		return -ENOMEM;
+
   	return 0;
   }

@@ -1765,6 +1779,61 @@ static ssize_t aldebaran_get_gpu_metrics(struct
smu_context *smu,
   	return sizeof(struct gpu_metrics_v1_3);
   }

+static int aldebaran_check_ecc_table_support(struct smu_context *smu)
+{
+	uint32_t if_version = 0xff, smu_version = 0xff;
+	int ret = 0;
+
+	ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
+	if (ret)
+		ret = -EOPNOTSUPP;	// return not support if failed get

Nitpick - comment style

smu_version
+
+	if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
+		ret = -EOPNOTSUPP;
+
+	return ret;
+}
+
+static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
+					 void *table)
+{
+	struct smu_table_context *smu_table = &smu->smu_table;
+	EccInfoTable_t *ecc_table = NULL;
+	struct ecc_info_per_ch *ecc_info_per_channel = NULL;
+	int i, ret = 0;
+	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
+

Missed to ask last time. Since umc_ecc_info is a common struct, do you also
want to pass back the number of channels having data?

Now this struct can hold max of 32 channel data. Let's say if the same
interface is going to be used on another ASIC X having only 16 channels.
Then the callback for ASIC X fills data only for 16 channels. Or, you expect that
to be taken care at the caller side?

[Yang, Stanley] : If ASIC X have only 16 channels, the callback only fill data for 16 channels, and caller side also need consider its own channel number to handle with umc_ecc_info.


Thanks for the details. With the nitpick above and Evan's comments on the patch subject addressed, patches 1 and 3 are

Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx>

2 and 4 look good to me. Hawking or John should a take look though.

Thanks,
Lijo


Thanks,
Lijo

+	ret = aldebaran_check_ecc_table_support(smu);
+	if (ret)
+		return ret;
+
+	ret = smu_cmn_update_table(smu,
+			       SMU_TABLE_ECCINFO,
+			       0,
+			       smu_table->ecc_table,
+			       false);
+	if (ret) {
+		dev_info(smu->adev->dev, "Failed to export SMU ecc
table!\n");
+		return ret;
+	}
+
+	ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
+
+	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+		ecc_info_per_channel = &(eccinfo->ecc[i]);
+		ecc_info_per_channel->ce_count_lo_chip =
+			ecc_table->EccInfo[i].ce_count_lo_chip;
+		ecc_info_per_channel->ce_count_hi_chip =
+			ecc_table->EccInfo[i].ce_count_hi_chip;
+		ecc_info_per_channel->mca_umc_status =
+			ecc_table->EccInfo[i].mca_umc_status;
+		ecc_info_per_channel->mca_umc_addr =
+			ecc_table->EccInfo[i].mca_umc_addr;
+	}
+
+	return ret;
+}
+
   static int aldebaran_mode1_reset(struct smu_context *smu)
   {
   	u32 smu_version, fatal_err, param;
@@ -1967,6 +2036,7 @@ static const struct pptable_funcs
aldebaran_ppt_funcs = {
   	.i2c_init = aldebaran_i2c_control_init,
   	.i2c_fini = aldebaran_i2c_control_fini,
   	.send_hbm_bad_pages_num =
aldebaran_smu_send_hbm_bad_page_num,
+	.get_ecc_info = aldebaran_get_ecc_info,
   };

   void aldebaran_set_ppt_funcs(struct smu_context *smu) diff --git
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 4d96099a9bb1..55421ea622fb 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct
smu_context *smu)
   	kfree(smu_table->hardcode_pptable);
   	smu_table->hardcode_pptable = NULL;

+	kfree(smu_table->ecc_table);
   	kfree(smu_table->metrics_table);
   	kfree(smu_table->watermarks_table);
+	smu_table->ecc_table = NULL;
   	smu_table->metrics_table = NULL;
   	smu_table->watermarks_table = NULL;
   	smu_table->metrics_time = 0;




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux