[AMD Official Use Only] The series is: Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> > -----Original Message----- > From: Stanley.Yang <Stanley.Yang@xxxxxxx> > Sent: Friday, March 4, 2022 2:51 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking > <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Joo, Maria > <Maria.Joo@xxxxxxx> > Cc: Yang, Stanley <Stanley.Yang@xxxxxxx> > Subject: [PATCH Review 2/2] drm/amdgpu: message smu to update bad channel > info > > It should notify the SMU to update bad channel info when an uncorrectable > error is detected in the UMC block > > Change-Id: I2dc8848affdb53e52891013953ae9383fff5f20f > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 ++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 +++ > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 25 +++++++++++++++++-- > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 4 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 ++++ > 5 files changed, 42 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index d3875618ebf5..f9104f99eb9c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device > *adev) > mutex_init(&con->recovery_lock); > INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); > atomic_set(&con->in_recovery, 0); > + con->eeprom_control.bad_channel_bitmap = 0; > > max_eeprom_records_count = > amdgpu_ras_eeprom_max_record_count(); > amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); > @@ -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device > *adev) > goto free; > > amdgpu_dpm_send_hbm_bad_pages_num(adev, con- > >eeprom_control.ras_num_recs); > + > + if (con->update_channel_flag == true) { > + amdgpu_dpm_send_hbm_bad_channel_flag(adev, con- > >eeprom_control.bad_channel_bitmap); > + con->update_channel_flag = false; 
> + } > } > > #ifdef CONFIG_X86_MCE_AMD > @@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > goto release_con; > } > > + con->update_channel_flag = false; > con->features = 0; > INIT_LIST_HEAD(&con->head); > /* Might need get this flag from vbios. */ diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 7cddaad90d6d..9314fde81e68 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -374,6 +374,9 @@ struct amdgpu_ras { > > /* record umc error info queried from smu */ > struct umc_ecc_info umc_ecc; > + > + /* Indicates smu whether need update bad channel info */ > + bool update_channel_flag; > }; > > struct ras_fs_data { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > index 2b844a5aafdb..ad5d8667756d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > @@ -265,6 +265,7 @@ int amdgpu_ras_eeprom_reset_table(struct > amdgpu_ras_eeprom_control *control) { > struct amdgpu_device *adev = to_amdgpu_device(control); > struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > u8 csum; > int res; > > @@ -285,6 +286,10 @@ int amdgpu_ras_eeprom_reset_table(struct > amdgpu_ras_eeprom_control *control) > > amdgpu_dpm_send_hbm_bad_pages_num(adev, control- > >ras_num_recs); > > + control->bad_channel_bitmap = 0; > + amdgpu_dpm_send_hbm_bad_channel_flag(adev, control- > >bad_channel_bitmap); > + con->update_channel_flag = false; > + > amdgpu_ras_debugfs_set_ret_size(control); > > mutex_unlock(&control->ras_tbl_mutex); > @@ -418,6 +423,7 @@ amdgpu_ras_eeprom_append_table(struct > amdgpu_ras_eeprom_control *control, > struct eeprom_table_record *record, > const u32 num) > { > + struct amdgpu_ras *con = > +amdgpu_ras_get_context(to_amdgpu_device(control)); > u32 
a, b, i; > u8 *buf, *pp; > int res; > @@ -429,9 +435,16 @@ amdgpu_ras_eeprom_append_table(struct > amdgpu_ras_eeprom_control *control, > /* Encode all of them in one go. > */ > pp = buf; > - for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) > + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { > __encode_table_record_to_buf(control, &record[i], pp); > > + /* update bad channel bitmap */ > + if (!(control->bad_channel_bitmap & (1 << > record[i].mem_channel))) { > + control->bad_channel_bitmap |= 1 << > record[i].mem_channel; > + con->update_channel_flag = true; > + } > + } > + > /* a, first record index to write into. > * b, last record index to write into. > * a = first index to read (fri) + number of records in the table, @@ - > 684,6 +697,7 @@ int amdgpu_ras_eeprom_read(struct > amdgpu_ras_eeprom_control *control, > const u32 num) > { > struct amdgpu_device *adev = to_amdgpu_device(control); > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > int i, res; > u8 *buf, *pp; > u32 g0, g1; > @@ -751,8 +765,15 @@ int amdgpu_ras_eeprom_read(struct > amdgpu_ras_eeprom_control *control, > /* Read up everything? Then transform. 
> */ > pp = buf; > - for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) > + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { > __decode_table_record_from_buf(control, &record[i], pp); > + > + /* update bad channel bitmap */ > + if (!(control->bad_channel_bitmap & (1 << > record[i].mem_channel))) { > + control->bad_channel_bitmap |= 1 << > record[i].mem_channel; > + con->update_channel_flag = true; > + } > + } > Out: > kfree(buf); > mutex_unlock(&control->ras_tbl_mutex); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > index 6bb00578bfbb..54d9bfe0881d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > @@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control { > /* Protect table access via this mutex. > */ > struct mutex ras_tbl_mutex; > + > + /* Record channel info which occurred bad pages > + */ > + u32 bad_channel_bitmap; > }; > > /* > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > index 85da6cbaf3b7..aad3c8b4c810 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > @@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct > amdgpu_device *adev, > amdgpu_ras_save_bad_pages(adev); > > amdgpu_dpm_send_hbm_bad_pages_num(adev, con- > >eeprom_control.ras_num_recs); > + > + if (con->update_channel_flag == true) { > + > amdgpu_dpm_send_hbm_bad_channel_flag(adev, con- > >eeprom_control.bad_channel_bitmap); > + con->update_channel_flag = false; > + } > } > > if (reset) > -- > 2.17.1