Message SMU bad channel information bitmap to update OOB table Change-Id: I49a79af64d5263c28db059ecb8b8405a471431b4 Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 25 ++++++++++- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 4 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 +++ drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 12 ++++++ drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 1 + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 10 +++++ drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 7 +++ .../pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h | 3 +- drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 +- .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 43 +++++++++++++++++++ 12 files changed, 119 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index d3875618ebf5..f9104f99eb9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) mutex_init(&con->recovery_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); atomic_set(&con->in_recovery, 0); + con->eeprom_control.bad_channel_bitmap = 0; max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(); amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); @@ -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) goto free; amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); + + if (con->update_channel_flag == true) { + amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); + con->update_channel_flag = false; + } } #ifdef CONFIG_X86_MCE_AMD @@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) goto release_con; } + con->update_channel_flag = false; con->features = 0; INIT_LIST_HEAD(&con->head); /* Might need get this flag from vbios. */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 7cddaad90d6d..9314fde81e68 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -374,6 +374,9 @@ struct amdgpu_ras { /* record umc error info queried from smu */ struct umc_ecc_info umc_ecc; + + /* Indicates smu whether need update bad channel info */ + bool update_channel_flag; }; struct ras_fs_data { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 2b844a5aafdb..ad5d8667756d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -265,6 +265,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) { struct amdgpu_device *adev = to_amdgpu_device(control); struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); u8 csum; int res; @@ -285,6 +286,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs); + control->bad_channel_bitmap = 0; + amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap); + con->update_channel_flag = false; + amdgpu_ras_debugfs_set_ret_size(control); mutex_unlock(&control->ras_tbl_mutex); @@ -418,6 +423,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *record, const u32 num) { + struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control)); u32 a, b, i; u8 *buf, *pp; int res; @@ -429,9 +435,16 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, /* Encode all of them in one go. */ pp = buf; - for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { __encode_table_record_to_buf(control, &record[i], pp); + /* update bad channel bitmap */ + if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { + control->bad_channel_bitmap |= 1 << record[i].mem_channel; + con->update_channel_flag = true; + } + } + /* a, first record index to write into. * b, last record index to write into. * a = first index to read (fri) + number of records in the table, @@ -684,6 +697,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, const u32 num) { struct amdgpu_device *adev = to_amdgpu_device(control); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int i, res; u8 *buf, *pp; u32 g0, g1; @@ -751,8 +765,15 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, /* Read up everything? Then transform. */ pp = buf; - for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { __decode_table_record_from_buf(control, &record[i], pp); + + /* update bad channel bitmap */ + if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { + control->bad_channel_bitmap |= 1 << record[i].mem_channel; + con->update_channel_flag = true; + } + } Out: kfree(buf); mutex_unlock(&control->ras_tbl_mutex); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 6bb00578bfbb..54d9bfe0881d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control { /* Protect table access via this mutex. */ struct mutex ras_tbl_mutex; + + /* Record channel info which occurred bad pages + */ + u32 bad_channel_bitmap; }; /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 85da6cbaf3b7..aad3c8b4c810 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, amdgpu_ras_save_bad_pages(adev); amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); + + if (con->update_channel_flag == true) { + amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); + con->update_channel_flag = false; + } } if (reset) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c index 1d63f1e8884c..9a892d6d1d7a 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c @@ -507,6 +507,18 @@ int amdgpu_dpm_send_hbm_bad_pages_num(struct amdgpu_device *adev, uint32_t size) return ret; } +int amdgpu_dpm_send_hbm_bad_channel_flag(struct amdgpu_device *adev, uint32_t size) +{ + struct smu_context *smu = adev->powerplay.pp_handle; + int ret = 0; + + mutex_lock(&adev->pm.mutex); + ret = smu_send_hbm_bad_channel_flag(smu, size); + mutex_unlock(&adev->pm.mutex); + + return ret; +} + int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev, enum pp_clock_type type, uint32_t *min, diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h index ddfa55b59d02..3e78b3057277 100644 --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h @@ -412,6 +412,7 @@ void amdgpu_dpm_enable_jpeg(struct amdgpu_device *adev, bool enable); int amdgpu_pm_load_smu_firmware(struct amdgpu_device *adev, uint32_t *smu_version); int amdgpu_dpm_handle_passthrough_sbr(struct amdgpu_device *adev, bool enable); int amdgpu_dpm_send_hbm_bad_pages_num(struct amdgpu_device *adev, uint32_t size); +int amdgpu_dpm_send_hbm_bad_channel_flag(struct amdgpu_device *adev, uint32_t size); int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev, enum pp_clock_type type, uint32_t *min, diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 7e79a67bb8ef..f1544755d8b4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -3052,3 +3052,13 @@ int smu_send_hbm_bad_pages_num(struct smu_context *smu, uint32_t size) return ret; } + +int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t size) +{ + int ret = 0; + + if (smu->ppt_funcs && smu->ppt_funcs->send_hbm_bad_channel_flag) + ret = smu->ppt_funcs->send_hbm_bad_channel_flag(smu, size); + + return ret; +} diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h index fbef3ab8d487..ef57b6089c69 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -1292,6 +1292,12 @@ struct pptable_funcs { * @set_config_table: Apply the input DriverSmuConfig table settings. */ int (*set_config_table)(struct smu_context *smu, struct config_table_setting *table); + + /** + * @sned_hbm_bad_channel_flag: message SMU to update bad channel info + * of SMUBUS table. + */ + int (*send_hbm_bad_channel_flag)(struct smu_context *smu, uint32_t size); }; typedef enum { @@ -1428,5 +1434,6 @@ int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc); int smu_stb_collect_info(struct smu_context *smu, void *buff, uint32_t size); void amdgpu_smu_stb_debug_fs_init(struct amdgpu_device *adev); int smu_send_hbm_bad_pages_num(struct smu_context *smu, uint32_t size); +int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t size); #endif #endif diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h index ab66a4b9e438..0f498baf6838 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h @@ -103,7 +103,8 @@ #define PPSMC_MSG_GfxDriverResetRecovery 0x42 #define PPSMC_MSG_BoardPowerCalibration 0x43 #define PPSMC_MSG_HeavySBR 0x45 -#define PPSMC_Message_Count 0x46 +#define PPSMC_MSG_SetBadHBMPagesRetiredFlagsPerChannel 0x46 +#define PPSMC_Message_Count 0x47 //PPSMC Reset Types diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h index d787c3b9fc52..9f6f306eeca0 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h @@ -232,7 +232,8 @@ __SMU_DUMMY_MAP(ForceGfxVid), \ __SMU_DUMMY_MAP(Spare0), \ __SMU_DUMMY_MAP(UnforceGfxVid), \ - __SMU_DUMMY_MAP(HeavySBR), + __SMU_DUMMY_MAP(HeavySBR), \ + __SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel), #undef __SMU_DUMMY_MAP #define __SMU_DUMMY_MAP(type) SMU_MSG_##type diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index 890acc4e2cb8..e5e249968244 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -82,6 +82,12 @@ */ #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00 +/* + * SMU support BAD CHENNEL info MSG since version 68.51.00, + * use this to check ECCTALE feature whether support + */ +#define SUPPORT_BAD_CHANNEL_INFO_MSG_VERSION 0x00443300 + static const struct smu_temperature_range smu13_thermal_policy[] = { {-273150, 99000, 99000, -273150, 99000, 99000, -273150, 99000, 99000}, @@ -140,6 +146,7 @@ static const struct cmn2asic_msg_mapping aldebaran_message_map[SMU_MSG_MAX_COUNT MSG_MAP(GfxDriverResetRecovery, PPSMC_MSG_GfxDriverResetRecovery, 0), MSG_MAP(BoardPowerCalibration, PPSMC_MSG_BoardPowerCalibration, 0), MSG_MAP(HeavySBR, PPSMC_MSG_HeavySBR, 0), + MSG_MAP(SetBadHBMPagesRetiredFlagsPerChannel, PPSMC_MSG_SetBadHBMPagesRetiredFlagsPerChannel, 0), }; static const struct cmn2asic_mapping aldebaran_clk_map[SMU_CLK_COUNT] = { @@ -1997,6 +2004,41 @@ static int aldebaran_smu_send_hbm_bad_page_num(struct smu_context *smu, return ret; } +static int aldebaran_check_bad_channel_info_support(struct smu_context *smu) +{ + uint32_t if_version = 0xff, smu_version = 0xff; + int ret = 0; + + ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version); + if (ret) { + /* return not support if failed get smu_version */ + ret = -EOPNOTSUPP; + } + + if (smu_version < SUPPORT_BAD_CHANNEL_INFO_MSG_VERSION) + ret = -EOPNOTSUPP; + + return ret; +} + +static int aldebaran_send_hbm_bad_channel_flag(struct smu_context *smu, + uint32_t size) +{ + int ret = 0; + + ret = aldebaran_check_bad_channel_info_support(smu); + if (ret) + return ret; + + /* message SMU to update the bad channel info on SMUBUS */ + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetBadHBMPagesRetiredFlagsPerChannel, size, NULL); + if (ret) + dev_err(smu->adev->dev, "[%s] failed to message SMU to update HBM bad channel info\n", + __func__); + + return ret; +} + static const struct pptable_funcs aldebaran_ppt_funcs = { /* init dpm */ .get_allowed_feature_mask = aldebaran_get_allowed_feature_mask, @@ -2062,6 +2104,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = { .i2c_fini = aldebaran_i2c_control_fini, .send_hbm_bad_pages_num = aldebaran_smu_send_hbm_bad_page_num, .get_ecc_info = aldebaran_get_ecc_info, + .send_hbm_bad_channel_flag = aldebaran_send_hbm_bad_channel_flag, }; void aldebaran_set_ppt_funcs(struct smu_context *smu) -- 2.17.1