> -----邮件原件----- > 发件人: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > 发送时间: Wednesday, March 2, 2022 3:45 PM > 收件人: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd- > gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; > Joo, Maria <Maria.Joo@xxxxxxx> > 抄送: Yang, Stanley <Stanley.Yang@xxxxxxx> > 主题: RE: [PATCH Review 1/1] drm/amdgpu: support send bad channel info > to smu > > [AMD Official Use Only] > > > > > -----Original Message----- > > From: Stanley.Yang <Stanley.Yang@xxxxxxx> > > Sent: Tuesday, March 1, 2022 9:30 PM > > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking > > <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Joo, > Maria > > <Maria.Joo@xxxxxxx> > > Cc: Yang, Stanley <Stanley.Yang@xxxxxxx> > > Subject: [PATCH Review 1/1] drm/amdgpu: support send bad channel info > > to smu > > > > Message SMU bad channel information bitmap to update OOB table > > > > Change-Id: I49a79af64d5263c28db059ecb8b8405a471431b4 > > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++ > > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 25 ++++++++++- > > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 4 ++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 +++ > > drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 12 ++++++ > > drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 1 + > > drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 10 +++++ > > drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 7 +++ > > .../pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h | 3 +- > > drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 +- > > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 43 > +++++++++++++++++++ > > 12 files changed, 119 insertions(+), 4 deletions(-) > > [Tao] It's better to split the patch into two parts, one for amdgpu and one for > pm. [Yang, Stanley] : yeah, it makes sense, will update. 
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > index d3875618ebf5..f9104f99eb9c 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > @@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct > > amdgpu_device > > *adev) > > mutex_init(&con->recovery_lock); > > INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); > > atomic_set(&con->in_recovery, 0); > > + con->eeprom_control.bad_channel_bitmap = 0; > > > > max_eeprom_records_count = > > amdgpu_ras_eeprom_max_record_count(); > > amdgpu_ras_validate_threshold(adev, > max_eeprom_records_count); @@ > > -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device > > *adev) > > goto free; > > > > amdgpu_dpm_send_hbm_bad_pages_num(adev, con- > > >eeprom_control.ras_num_recs); > > + > > + if (con->update_channel_flag == true) { > [Tao] It can be simplified to "if (con->update_channel_flag)" [Yang, Stanley] : Yeah, both "if (con->update_channel_flag)" and "if (con->update_channel_flag == true)" are feasible. > > > + amdgpu_dpm_send_hbm_bad_channel_flag(adev, > con- > > >eeprom_control.bad_channel_bitmap); > > [Tao] do we need to check status of the function and stop recovery_init if it > fails? [Yang, Stanley] : No, it doesn't affect the RAS recovery process even if messaging the SMU fails. > > > + con->update_channel_flag = false; > > + } > > } > > > > #ifdef CONFIG_X86_MCE_AMD > > @@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > > goto release_con; > > } > > > > + con->update_channel_flag = false; > > con->features = 0; > > INIT_LIST_HEAD(&con->head); > > /* Might need get this flag from vbios. 
*/ diff --git > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > index 7cddaad90d6d..9314fde81e68 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > @@ -374,6 +374,9 @@ struct amdgpu_ras { > > > > /* record umc error info queried from smu */ > > struct umc_ecc_info umc_ecc; > > + > > + /* Indicates smu whether need update bad channel info */ > > + bool update_channel_flag; > > }; > > > > struct ras_fs_data { > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > > index 2b844a5aafdb..ad5d8667756d 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > > @@ -265,6 +265,7 @@ int amdgpu_ras_eeprom_reset_table(struct > > amdgpu_ras_eeprom_control *control) { > > struct amdgpu_device *adev = to_amdgpu_device(control); > > struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; > > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > > u8 csum; > > int res; > > > > @@ -285,6 +286,10 @@ int amdgpu_ras_eeprom_reset_table(struct > > amdgpu_ras_eeprom_control *control) > > > > amdgpu_dpm_send_hbm_bad_pages_num(adev, control- > > >ras_num_recs); > > > > + control->bad_channel_bitmap = 0; > > + amdgpu_dpm_send_hbm_bad_channel_flag(adev, control- > > >bad_channel_bitmap); > > + con->update_channel_flag = false; > > + > > amdgpu_ras_debugfs_set_ret_size(control); > > > > mutex_unlock(&control->ras_tbl_mutex); > > @@ -418,6 +423,7 @@ amdgpu_ras_eeprom_append_table(struct > > amdgpu_ras_eeprom_control *control, > > struct eeprom_table_record *record, > > const u32 num) > > { > > + struct amdgpu_ras *con = > > +amdgpu_ras_get_context(to_amdgpu_device(control)); > > u32 a, b, i; > > u8 *buf, *pp; > > int res; > > @@ -429,9 +435,16 @@ amdgpu_ras_eeprom_append_table(struct > > amdgpu_ras_eeprom_control *control, > > /* Encode all of them 
in one go. > > */ > > pp = buf; > > - for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) > > + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { > > __encode_table_record_to_buf(control, &record[i], pp); > > > > + /* update bad channel bitmap */ > > + if (!(control->bad_channel_bitmap & (1 << > > record[i].mem_channel))) { > > + control->bad_channel_bitmap |= 1 << > > record[i].mem_channel; > > + con->update_channel_flag = true; > > + } > > + } > > + > > /* a, first record index to write into. > > * b, last record index to write into. > > * a = first index to read (fri) + number of records in the table, > > @@ - > > 684,6 +697,7 @@ int amdgpu_ras_eeprom_read(struct > > amdgpu_ras_eeprom_control *control, > > const u32 num) > > { > > struct amdgpu_device *adev = to_amdgpu_device(control); > > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > > int i, res; > > u8 *buf, *pp; > > u32 g0, g1; > > @@ -751,8 +765,15 @@ int amdgpu_ras_eeprom_read(struct > > amdgpu_ras_eeprom_control *control, > > /* Read up everything? Then transform. > > */ > > pp = buf; > > - for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) > > + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { > > __decode_table_record_from_buf(control, &record[i], pp); > > + > > + /* update bad channel bitmap */ > > + if (!(control->bad_channel_bitmap & (1 << > > record[i].mem_channel))) { > > + control->bad_channel_bitmap |= 1 << > > record[i].mem_channel; > > + con->update_channel_flag = true; > > + } > > + } > > Out: > > kfree(buf); > > mutex_unlock(&control->ras_tbl_mutex); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > > index 6bb00578bfbb..54d9bfe0881d 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > > @@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control { > > /* Protect table access via this mutex. 
> > */ > > struct mutex ras_tbl_mutex; > > + > > + /* Record channel info which occurred bad pages > > + */ > > + u32 bad_channel_bitmap; > > }; > > > > /* > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > index 85da6cbaf3b7..aad3c8b4c810 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > @@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct > > amdgpu_device *adev, > > amdgpu_ras_save_bad_pages(adev); > > > > amdgpu_dpm_send_hbm_bad_pages_num(adev, > con- > > >eeprom_control.ras_num_recs); > > + > > + if (con->update_channel_flag == true) { > > + > > amdgpu_dpm_send_hbm_bad_channel_flag(adev, con- > > >eeprom_control.bad_channel_bitmap); > > + con->update_channel_flag = false; > > + } > > } > > > > if (reset) > > diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > > b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > > index 1d63f1e8884c..9a892d6d1d7a 100644 > > --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > > +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c > > @@ -507,6 +507,18 @@ int > amdgpu_dpm_send_hbm_bad_pages_num(struct > > amdgpu_device *adev, uint32_t size) > > return ret; > > } > > > > +int amdgpu_dpm_send_hbm_bad_channel_flag(struct amdgpu_device > *adev, > > +uint32_t size) { > > + struct smu_context *smu = adev->powerplay.pp_handle; > > + int ret = 0; > > + > > + mutex_lock(&adev->pm.mutex); > > + ret = smu_send_hbm_bad_channel_flag(smu, size); > > + mutex_unlock(&adev->pm.mutex); > > + > > + return ret; > > +} > > + > > int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev, > > enum pp_clock_type type, > > uint32_t *min, > > diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h > > b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h > > index ddfa55b59d02..3e78b3057277 100644 > > --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h > > +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h > > @@ -412,6 +412,7 @@ void amdgpu_dpm_enable_jpeg(struct > amdgpu_device 
> > *adev, bool enable); int amdgpu_pm_load_smu_firmware(struct > > amdgpu_device *adev, uint32_t *smu_version); int > > amdgpu_dpm_handle_passthrough_sbr(struct amdgpu_device *adev, > bool > > enable); int amdgpu_dpm_send_hbm_bad_pages_num(struct > amdgpu_device > > *adev, uint32_t size); > > +int amdgpu_dpm_send_hbm_bad_channel_flag(struct amdgpu_device > *adev, > > +uint32_t size); > > int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev, > > enum pp_clock_type type, > > uint32_t *min, > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > index 7e79a67bb8ef..f1544755d8b4 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > @@ -3052,3 +3052,13 @@ int smu_send_hbm_bad_pages_num(struct > > smu_context *smu, uint32_t size) > > > > return ret; > > } > > + > > +int smu_send_hbm_bad_channel_flag(struct smu_context *smu, > uint32_t > > +size) { > > + int ret = 0; > > + > > + if (smu->ppt_funcs && smu->ppt_funcs- > >send_hbm_bad_channel_flag) > > + ret = smu->ppt_funcs->send_hbm_bad_channel_flag(smu, > size); > > + > > + return ret; > > +} > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > > b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > > index fbef3ab8d487..ef57b6089c69 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > > @@ -1292,6 +1292,12 @@ struct pptable_funcs { > > * @set_config_table: Apply the input DriverSmuConfig table settings. > > */ > > int (*set_config_table)(struct smu_context *smu, struct > > config_table_setting *table); > > + > > + /** > > + * @sned_hbm_bad_channel_flag: message SMU to update bad > > channel info > > + * > > of SMUBUS table. 
> > + */ > > + int (*send_hbm_bad_channel_flag)(struct smu_context *smu, > uint32_t > > +size); > > }; > > > > typedef enum { > > @@ -1428,5 +1434,6 @@ int smu_get_ecc_info(struct smu_context *smu, > > void *umc_ecc); int smu_stb_collect_info(struct smu_context *smu, > > void *buff, uint32_t size); void amdgpu_smu_stb_debug_fs_init(struct > > amdgpu_device *adev); int smu_send_hbm_bad_pages_num(struct > > smu_context *smu, uint32_t size); > > +int smu_send_hbm_bad_channel_flag(struct smu_context *smu, > uint32_t > > +size); > > #endif > > #endif > > diff --git > > a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h > > b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h > > index ab66a4b9e438..0f498baf6838 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h > > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h > > @@ -103,7 +103,8 @@ > > #define PPSMC_MSG_GfxDriverResetRecovery 0x42 > > #define PPSMC_MSG_BoardPowerCalibration 0x43 > > #define PPSMC_MSG_HeavySBR 0x45 > > -#define PPSMC_Message_Count 0x46 > > +#define PPSMC_MSG_SetBadHBMPagesRetiredFlagsPerChannel 0x46 > > +#define PPSMC_Message_Count 0x47 > > > > > > //PPSMC Reset Types > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > > b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > > index d787c3b9fc52..9f6f306eeca0 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > > @@ -232,7 +232,8 @@ > > __SMU_DUMMY_MAP(ForceGfxVid), \ > > __SMU_DUMMY_MAP(Spare0), \ > > __SMU_DUMMY_MAP(UnforceGfxVid), \ > > - __SMU_DUMMY_MAP(HeavySBR), > > + __SMU_DUMMY_MAP(HeavySBR), \ > > + __SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel), > > > > #undef __SMU_DUMMY_MAP > > #define __SMU_DUMMY_MAP(type) SMU_MSG_##type > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > index 890acc4e2cb8..e5e249968244 100644 > > --- 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > @@ -82,6 +82,12 @@ > > */ > > #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00 > > > > +/* > > + * SMU support BAD CHENNEL info MSG since version 68.51.00, > > + * use this to check ECCTALE feature whether support */ #define > > +SUPPORT_BAD_CHANNEL_INFO_MSG_VERSION 0x00443300 > > + > > static const struct smu_temperature_range smu13_thermal_policy[] = { > > {-273150, 99000, 99000, -273150, 99000, 99000, -273150, 99000, > > 99000}, @@ -140,6 +146,7 @@ static const struct cmn2asic_msg_mapping > > aldebaran_message_map[SMU_MSG_MAX_COUNT > > MSG_MAP(GfxDriverResetRecovery, > > PPSMC_MSG_GfxDriverResetRecovery, 0), > > MSG_MAP(BoardPowerCalibration, > > PPSMC_MSG_BoardPowerCalibration, 0), > > MSG_MAP(HeavySBR, PPSMC_MSG_HeavySBR, > > 0), > > + MSG_MAP(SetBadHBMPagesRetiredFlagsPerChannel, > > PPSMC_MSG_SetBadHBMPagesRetiredFlagsPerChannel, 0), > > }; > > > > static const struct cmn2asic_mapping > aldebaran_clk_map[SMU_CLK_COUNT] > > = { @@ -1997,6 +2004,41 @@ static int > > aldebaran_smu_send_hbm_bad_page_num(struct smu_context *smu, > > return ret; > > } > > > > +static int aldebaran_check_bad_channel_info_support(struct > > +smu_context > > +*smu) { > > + uint32_t if_version = 0xff, smu_version = 0xff; > > + int ret = 0; > > + > > + ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version); > > + if (ret) { > > + /* return not support if failed get smu_version */ > > + ret = -EOPNOTSUPP; > > + } > > + > > + if (smu_version < SUPPORT_BAD_CHANNEL_INFO_MSG_VERSION) > > + ret = -EOPNOTSUPP; > > + > > + return ret; > > +} > > + > > +static int aldebaran_send_hbm_bad_channel_flag(struct smu_context > *smu, > > + uint32_t size) > > +{ > > + int ret = 0; > > + > > + ret = aldebaran_check_bad_channel_info_support(smu); > > + if (ret) > > + return ret; > > + > > + /* message SMU to update the bad channel info on SMUBUS */ > > + ret = 
smu_cmn_send_smc_msg_with_param(smu, > > SMU_MSG_SetBadHBMPagesRetiredFlagsPerChannel, size, NULL); > > + if (ret) > > + dev_err(smu->adev->dev, "[%s] failed to message SMU to > > update HBM bad channel info\n", > > + __func__); > > + > > + return ret; > > +} > > + > > static const struct pptable_funcs aldebaran_ppt_funcs = { > > /* init dpm */ > > .get_allowed_feature_mask = > aldebaran_get_allowed_feature_mask, > > @@ -2062,6 +2104,7 @@ static const struct pptable_funcs > > aldebaran_ppt_funcs = { > > .i2c_fini = aldebaran_i2c_control_fini, > > .send_hbm_bad_pages_num = > > aldebaran_smu_send_hbm_bad_page_num, > > .get_ecc_info = aldebaran_get_ecc_info, > > + .send_hbm_bad_channel_flag = > > aldebaran_send_hbm_bad_channel_flag, > > }; > > > > void aldebaran_set_ppt_funcs(struct smu_context *smu) > > -- > > 2.17.1