RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

"Chai, Thomas" <YiPeng.Chai@xxxxxxx> · Mon, 22 Apr 2024 09:21:25 +0000

[AMD Official Use Only - General]

-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
Sent: Monday, April 22, 2024 4:14 PM
To: Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>
Subject: RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

[AMD Official Use Only - General]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@xxxxxxx>
> Sent: Thursday, April 18, 2024 10:59 AM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Chai, Thomas <YiPeng.Chai@xxxxxxx>; Zhang, Hawking
> <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice
> <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>;
> Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, Thomas
> <YiPeng.Chai@xxxxxxx>
> Subject: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
>
> Retire bad pages for umc v12_0.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57
> +++++++++++++++++++++++++-
>  1 file changed, 55 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index 6c2b61ef5b57..bd917eb6ea24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -28,6 +28,8 @@
>  #include "umc/umc_12_0_0_sh_mask.h"
>  #include "mp/mp_13_0_6_sh_mask.h"
>
> +#define MAX_ECC_NUM_PER_RETIREMENT  16

> [Tao] we already have UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL for the purposes

[Thomas]  This is from eeprom view to define the maximum number of eeprom table entries written each time.
The actual data items that need to be written to eeprom may be less than this value, or may be more than this value(e.g.  later, write the data items before address conversion).
If the value is less than this value, it will be written according to the actual value, and if it is more than this value, it will be written in multiple times.

> +
>  static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
>                                           uint32_t node_inst,
>                                           uint32_t umc_inst, @@ -633,6
> +635,58 @@ static int umc_v12_0_update_ecc_status(struct
> amdgpu_device *adev,
>       return 0;
>  }
>
> +static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
> +                             struct ras_ecc_err *ecc_err, void
> *ras_error_status) {
> +     struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> +     uint32_t i = 0;
> +     int ret = 0;
> +
> +     if (!err_data || !ecc_err)
> +             return -EINVAL;
> +
> +     for (i = 0; i < ecc_err->err_pages.count; i++) {
> +             ret = amdgpu_umc_fill_error_record(err_data,
> +                             ecc_err->addr,
> +                             ecc_err->err_pages.pfn[i] <<
> AMDGPU_GPU_PAGE_SHIFT,
> +                             MCA_IPID_2_UMC_CH(ecc_err->ipid),
> +                             MCA_IPID_2_UMC_INST(ecc_err->ipid));
> +             if (ret)
> +                     break;
> +     }
> +
> +     err_data->de_count++;
> +
> +     return ret;
> +}
> +
> +static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
> +                                     void *ras_error_status) {
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
> +     struct radix_tree_root *ecc_tree;
> +     int new_detected, ret, i;
> +
> +     ecc_tree = &con->umc_ecc_log.de_page_tree;
> +
> +     mutex_lock(&con->umc_ecc_log.lock);
> +     new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
> +                     0, ARRAY_SIZE(entries),
> UMC_ECC_NEW_DETECTED_TAG);
> +     for (i = 0; i < new_detected; i++) {
> +             if (!entries[i])
> +                     continue;
> +
> +             ret = umc_v12_0_fill_error_record(adev, entries[i],
> ras_error_status);
> +             if (ret) {
> +                     dev_err(adev->dev, "Fail to fill umc error
> + record,
> ret:%d\n", ret);
> +                     break;
> +             }
> +             radix_tree_tag_clear(ecc_tree, entries[i]->hash_index,
> UMC_ECC_NEW_DETECTED_TAG);
> +     }
> +     mutex_unlock(&con->umc_ecc_log.lock);
> +}
> +
>  struct amdgpu_umc_ras umc_v12_0_ras = {
>       .ras_block = {
>               .hw_ops = &umc_v12_0_ras_hw_ops, @@ -640,8 +694,7 @@
> struct amdgpu_umc_ras umc_v12_0_ras = {
>       },
>       .err_cnt_init = umc_v12_0_err_cnt_init,
>       .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
> -     .ecc_info_query_ras_error_count =
> umc_v12_0_ecc_info_query_ras_error_count,
> -     .ecc_info_query_ras_error_address =
> umc_v12_0_ecc_info_query_ras_error_address,
> +     .ecc_info_query_ras_error_address =
> umc_v12_0_query_ras_ecc_err_addr,
>       .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
>       .update_ecc_status = umc_v12_0_update_ecc_status,  };
> --
> 2.34.1