[AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: Yang, Stanley <Stanley.Yang@xxxxxxx> > Sent: Thursday, May 23, 2024 9:57 PM > To: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Subject: RE: [PATCH 1/2] drm/amdgpu: add RAS is_rma flag > > [AMD Official Use Only - AMD Internal Distribution Only] > > > -----Original Message----- > > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Tao > > Zhou > > Sent: Thursday, May 23, 2024 6:02 PM > > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > > Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > > Subject: [PATCH 1/2] drm/amdgpu: add RAS is_rma flag > > > > Set the flag to true if bad page number reaches threshold. > > > > Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++---- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++++++---- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 +-- > > 4 files changed, 11 insertions(+), 10 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > index ecce022c657b..934dfb2bf9e5 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > @@ -2940,7 +2940,6 @@ int amdgpu_ras_recovery_init(struct > > amdgpu_device > > *adev) > > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > > struct ras_err_handler_data **data; > > u32 max_eeprom_records_count = 0; > > - bool exc_err_limit = false; > > int ret; > > > > if (!con || amdgpu_sriov_vf(adev)) @@ -2977,12 +2976,12 @@ int > > amdgpu_ras_recovery_init(struct amdgpu_device *adev) > > */ > > if (adev->gmc.xgmi.pending_reset) > > return 0; > > - ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); > > + ret = amdgpu_ras_eeprom_init(&con->eeprom_control); > > /* > > * This calling fails when exc_err_limit is 
true or > > * ret != 0. > > */ > > - if (exc_err_limit || ret) > > + if (con->is_rma || ret) > > goto free; > > > > if (con->eeprom_control.ras_num_recs) { @@ -3033,7 +3032,7 @@ > > int amdgpu_ras_recovery_init(struct amdgpu_device *adev) > > * Except error threshold exceeding case, other failure cases in this > > * function would not fail amdgpu driver init. > > */ > > - if (!exc_err_limit) > > + if (!con->is_rma) > > ret = 0; > > else > > ret = -EINVAL; > > [Stanley]: Should the device service be stopped if the device goes into RMA while it is running? The > amdgpu_ras_recovery_init function is only called during the process of loading the > driver. [Tao] Yes, I plan to stop service in the resume stage after mode-1 if a run-time RMA is reported. But I have no environment to verify the design right now, so this is a TODO for the time being. > > Regards, > Stanley > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > index d06c01b978cd..437c58c85639 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > @@ -521,6 +521,7 @@ struct amdgpu_ras { > > bool update_channel_flag; > > /* Record status of smu mca debug mode */ > > bool is_aca_debug_mode; > > + bool is_rma; > > > > /* Record special requirements of gpu reset caller */ > > uint32_t gpu_reset_flags; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > > index 9b789dcc2bd1..eae0a555df3c 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > > @@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct > > amdgpu_ras_eeprom_control *control) > > control->tbl_rai.health_percent = 0; > > } > > > > + if (amdgpu_bad_page_threshold != -1) > > + ras->is_rma = true; > > + > > /* ignore the -ENOTSUPP return value */ > > amdgpu_dpm_send_rma_reason(adev); > > } > > @@ -1321,8 +1324,7 @@ static int 
__read_table_ras_info(struct > > amdgpu_ras_eeprom_control *control) > > return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res; } > > > > -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, > > - bool *exceed_err_limit) > > +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) > > { > > struct amdgpu_device *adev = to_amdgpu_device(control); > > unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; @@ -1330,7 > > +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control > > *control, > > struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > > int res; > > > > - *exceed_err_limit = false; > > + ras->is_rma = false; > > > > if (!__is_ras_eeprom_supported(adev)) > > return 0; > > @@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct > > amdgpu_ras_eeprom_control *control, > > dev_warn(adev->dev, "GPU will be > > initialized due to bad_page_threshold = -1."); > > res = 0; > > } else { > > - *exceed_err_limit = true; > > + ras->is_rma = true; > > dev_err(adev->dev, > > "RAS records:%d exceed threshold:%d, " > > "GPU will not be initialized. > > Replace this GPU or increase the threshold", diff --git > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > > index 6dfd667f3013..b9ebda577797 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > > @@ -129,8 +129,7 @@ struct eeprom_table_record { > > unsigned char mcumc_id; > > } __packed; > > > > -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, > > - bool *exceed_err_limit); > > +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control > > +*control); > > > > int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control > > *control); > > > > -- > > 2.34.1 >