RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

"Zhou1, Tao" <Tao.Zhou1@xxxxxxx> · Mon, 22 Apr 2024 08:14:02 +0000



[AMD Official Use Only - General]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@xxxxxxx>
> Sent: Thursday, April 18, 2024 10:59 AM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Chai, Thomas <YiPeng.Chai@xxxxxxx>; Zhang, Hawking
> <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice
> <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang,
> Stanley <Stanley.Yang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>
> Subject: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
>
> Retire bad pages for umc v12_0.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57
> +++++++++++++++++++++++++-
>  1 file changed, 55 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index 6c2b61ef5b57..bd917eb6ea24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -28,6 +28,8 @@
>  #include "umc/umc_12_0_0_sh_mask.h"
>  #include "mp/mp_13_0_6_sh_mask.h"
>
> +#define MAX_ECC_NUM_PER_RETIREMENT  16

[Tao] We already have UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL for this purpose.

> +
>  static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
>                                           uint32_t node_inst,
>                                           uint32_t umc_inst,
> @@ -633,6 +635,58 @@ static int umc_v12_0_update_ecc_status(struct
> amdgpu_device *adev,
>       return 0;
>  }
>
> +static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
> +                             struct ras_ecc_err *ecc_err, void
> *ras_error_status) {
> +     struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> +     uint32_t i = 0;
> +     int ret = 0;
> +
> +     if (!err_data || !ecc_err)
> +             return -EINVAL;
> +
> +     for (i = 0; i < ecc_err->err_pages.count; i++) {
> +             ret = amdgpu_umc_fill_error_record(err_data,
> +                             ecc_err->addr,
> +                             ecc_err->err_pages.pfn[i] <<
> AMDGPU_GPU_PAGE_SHIFT,
> +                             MCA_IPID_2_UMC_CH(ecc_err->ipid),
> +                             MCA_IPID_2_UMC_INST(ecc_err->ipid));
> +             if (ret)
> +                     break;
> +     }
> +
> +     err_data->de_count++;
> +
> +     return ret;
> +}
> +
> +static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
> +                                     void *ras_error_status)
> +{
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
> +     struct radix_tree_root *ecc_tree;
> +     int new_detected, ret, i;
> +
> +     ecc_tree = &con->umc_ecc_log.de_page_tree;
> +
> +     mutex_lock(&con->umc_ecc_log.lock);
> +     new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
> +                     0, ARRAY_SIZE(entries),
> UMC_ECC_NEW_DETECTED_TAG);
> +     for (i = 0; i < new_detected; i++) {
> +             if (!entries[i])
> +                     continue;
> +
> +             ret = umc_v12_0_fill_error_record(adev, entries[i],
> ras_error_status);
> +             if (ret) {
> +                     dev_err(adev->dev, "Fail to fill umc error record,
> ret:%d\n", ret);
> +                     break;
> +             }
> +             radix_tree_tag_clear(ecc_tree, entries[i]->hash_index,
> UMC_ECC_NEW_DETECTED_TAG);
> +     }
> +     mutex_unlock(&con->umc_ecc_log.lock);
> +}
> +
>  struct amdgpu_umc_ras umc_v12_0_ras = {
>       .ras_block = {
>               .hw_ops = &umc_v12_0_ras_hw_ops,
> @@ -640,8 +694,7 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
>       },
>       .err_cnt_init = umc_v12_0_err_cnt_init,
>       .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
> -     .ecc_info_query_ras_error_count =
> umc_v12_0_ecc_info_query_ras_error_count,
> -     .ecc_info_query_ras_error_address =
> umc_v12_0_ecc_info_query_ras_error_address,
> +     .ecc_info_query_ras_error_address =
> umc_v12_0_query_ras_ecc_err_addr,
>       .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
>       .update_ecc_status = umc_v12_0_update_ecc_status,  };
> --
> 2.34.1