RE: [PATCH 1/2] drm/amdgpu: add RAS is_rma flag

"Zhou1, Tao" <Tao.Zhou1@xxxxxxx> · Mon, 27 May 2024 03:39:05 +0000

[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Yang, Stanley <Stanley.Yang@xxxxxxx>
> Sent: Thursday, May 23, 2024 9:57 PM
> To: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
> Subject: RE: [PATCH 1/2] drm/amdgpu: add RAS is_rma flag
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> > -----Original Message-----
> > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Tao
> > Zhou
> > Sent: Thursday, May 23, 2024 6:02 PM
> > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> > Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
> > Subject: [PATCH 1/2] drm/amdgpu: add RAS is_rma flag
> >
> > Set the flag to true if the number of bad pages reaches the threshold.
> >
> > Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c        |  7 +++----
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h        |  1 +
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++++++----
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h |  3 +--
> >  4 files changed, 11 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index ecce022c657b..934dfb2bf9e5 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -2940,7 +2940,6 @@ int amdgpu_ras_recovery_init(struct
> > amdgpu_device
> > *adev)
> >       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> >       struct ras_err_handler_data **data;
> >       u32  max_eeprom_records_count = 0;
> > -     bool exc_err_limit = false;
> >       int ret;
> >
> >       if (!con || amdgpu_sriov_vf(adev)) @@ -2977,12 +2976,12 @@ int
> > amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> >        */
> >       if (adev->gmc.xgmi.pending_reset)
> >               return 0;
> > -     ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
> > +     ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
> >       /*
> >        * This calling fails when exc_err_limit is true or
> >        * ret != 0.
> >        */
> > -     if (exc_err_limit || ret)
> > +     if (con->is_rma || ret)
> >               goto free;
> >
> >       if (con->eeprom_control.ras_num_recs) { @@ -3033,7 +3032,7 @@
> > int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> >        * Except error threshold exceeding case, other failure cases in this
> >        * function would not fail amdgpu driver init.
> >        */
> > -     if (!exc_err_limit)
> > +     if (!con->is_rma)
> >               ret = 0;
> >       else
> >               ret = -EINVAL;
>
> [Stanley]: Should we stop device service if the device enters the RMA state at
> run time? The amdgpu_ras_recovery_init function is only called while the
> driver is being loaded.

[Tao] Yes, I plan to stop service in the resume stage after a mode-1 reset if a run-time RMA is reported. However, I have no environment to verify the design right now, so this remains a TODO for the time being.

>
> Regards,
> Stanley
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index d06c01b978cd..437c58c85639 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -521,6 +521,7 @@ struct amdgpu_ras {
> >       bool update_channel_flag;
> >       /* Record status of smu mca debug mode */
> >       bool is_aca_debug_mode;
> > +     bool is_rma;
> >
> >       /* Record special requirements of gpu reset caller */
> >       uint32_t  gpu_reset_flags;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > index 9b789dcc2bd1..eae0a555df3c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > @@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct
> > amdgpu_ras_eeprom_control *control)
> >                       control->tbl_rai.health_percent = 0;
> >               }
> >
> > +             if (amdgpu_bad_page_threshold != -1)
> > +                     ras->is_rma = true;
> > +
> >               /* ignore the -ENOTSUPP return value */
> >               amdgpu_dpm_send_rma_reason(adev);
> >       }
> > @@ -1321,8 +1324,7 @@ static int __read_table_ras_info(struct
> > amdgpu_ras_eeprom_control *control)
> >       return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;  }
> >
> > -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> > -                        bool *exceed_err_limit)
> > +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
> >  {
> >       struct amdgpu_device *adev = to_amdgpu_device(control);
> >       unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; @@ -1330,7
> > +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control
> > *control,
> >       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> >       int res;
> >
> > -     *exceed_err_limit = false;
> > +     ras->is_rma = false;
> >
> >       if (!__is_ras_eeprom_supported(adev))
> >               return 0;
> > @@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct
> > amdgpu_ras_eeprom_control *control,
> >                               dev_warn(adev->dev, "GPU will be
> > initialized due to bad_page_threshold = -1.");
> >                               res = 0;
> >                       } else {
> > -                             *exceed_err_limit = true;
> > +                             ras->is_rma = true;
> >                               dev_err(adev->dev,
> >                                       "RAS records:%d exceed threshold:%d, "
> >                                       "GPU will not be initialized.
> > Replace this GPU or increase the threshold", diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > index 6dfd667f3013..b9ebda577797 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > @@ -129,8 +129,7 @@ struct eeprom_table_record {
> >       unsigned char mcumc_id;
> >  } __packed;
> >
> > -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> > -                        bool *exceed_err_limit);
> > +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control
> > +*control);
> >
> >  int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control
> > *control);
> >
> > --
> > 2.34.1
>