RE: [PATCH 1/2] drm/amdgpu: add RAS is_rma flag

"Yang, Stanley" <Stanley.Yang@xxxxxxx> · Thu, 23 May 2024 13:56:35 +0000

[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Tao Zhou
> Sent: Thursday, May 23, 2024 6:02 PM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
> Subject: [PATCH 1/2] drm/amdgpu: add RAS is_rma flag
>
> Set the flag to true if bad page number reaches threshold.
>
> Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c        |  7 +++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h        |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h |  3 +--
>  4 files changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index ecce022c657b..934dfb2bf9e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2940,7 +2940,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev)
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>       struct ras_err_handler_data **data;
>       u32  max_eeprom_records_count = 0;
> -     bool exc_err_limit = false;
>       int ret;
>
>       if (!con || amdgpu_sriov_vf(adev))
> @@ -2977,12 +2976,12 @@ int amdgpu_ras_recovery_init(struct
> amdgpu_device *adev)
>        */
>       if (adev->gmc.xgmi.pending_reset)
>               return 0;
> -     ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
> +     ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
>       /*
>        * This calling fails when exc_err_limit is true or
>        * ret != 0.
>        */
> -     if (exc_err_limit || ret)
> +     if (con->is_rma || ret)
>               goto free;
>
>       if (con->eeprom_control.ras_num_recs) { @@ -3033,7 +3032,7 @@ int
> amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>        * Except error threshold exceeding case, other failure cases in this
>        * function would not fail amdgpu driver init.
>        */
> -     if (!exc_err_limit)
> +     if (!con->is_rma)
>               ret = 0;
>       else
>               ret = -EINVAL;

[Stanley]: Should we also stop device service if the device enters the RMA state at runtime? The amdgpu_ras_recovery_init function is only called while the driver is being loaded.

Regards,
Stanley
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index d06c01b978cd..437c58c85639 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -521,6 +521,7 @@ struct amdgpu_ras {
>       bool update_channel_flag;
>       /* Record status of smu mca debug mode */
>       bool is_aca_debug_mode;
> +     bool is_rma;
>
>       /* Record special requirements of gpu reset caller */
>       uint32_t  gpu_reset_flags;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 9b789dcc2bd1..eae0a555df3c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct
> amdgpu_ras_eeprom_control *control)
>                       control->tbl_rai.health_percent = 0;
>               }
>
> +             if (amdgpu_bad_page_threshold != -1)
> +                     ras->is_rma = true;
> +
>               /* ignore the -ENOTSUPP return value */
>               amdgpu_dpm_send_rma_reason(adev);
>       }
> @@ -1321,8 +1324,7 @@ static int __read_table_ras_info(struct
> amdgpu_ras_eeprom_control *control)
>       return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;  }
>
> -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> -                        bool *exceed_err_limit)
> +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
>  {
>       struct amdgpu_device *adev = to_amdgpu_device(control);
>       unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; @@ -1330,7
> +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control
> *control,
>       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>       int res;
>
> -     *exceed_err_limit = false;
> +     ras->is_rma = false;
>
>       if (!__is_ras_eeprom_supported(adev))
>               return 0;
> @@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct
> amdgpu_ras_eeprom_control *control,
>                               dev_warn(adev->dev, "GPU will be initialized
> due to bad_page_threshold = -1.");
>                               res = 0;
>                       } else {
> -                             *exceed_err_limit = true;
> +                             ras->is_rma = true;
>                               dev_err(adev->dev,
>                                       "RAS records:%d exceed threshold:%d, "
>                                       "GPU will not be initialized. Replace this
> GPU or increase the threshold", diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index 6dfd667f3013..b9ebda577797 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -129,8 +129,7 @@ struct eeprom_table_record {
>       unsigned char mcumc_id;
>  } __packed;
>
> -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> -                        bool *exceed_err_limit);
> +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
>
>  int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control
> *control);
>
> --
> 2.34.1