RE: [PATCH 2/3] drm/amdgpu: Refine bad page adding

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Xie, Patrick <Gangliang.Xie@xxxxxxx>
> Sent: Friday, February 21, 2025 11:19 AM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao
> <Tao.Zhou1@xxxxxxx>; Xie, Patrick <Gangliang.Xie@xxxxxxx>
> Subject: [PATCH 2/3] drm/amdgpu: Refine bad page adding
>
> Adding bad pages can be simplified by using the NPS info.
>
> Signed-off-by: ganglxie <ganglxie@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 196 +++++++++++++-----------
>  1 file changed, 105 insertions(+), 91 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 5420e2d6d244..439841a2d1c2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2801,20 +2801,101 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device
> *adev,
>               return  -EINVAL;
>  }
>
> +static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
> +                                     struct eeprom_table_record *bps, int count) {
> +     int j;
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     struct ras_err_handler_data *data = con->eh_data;
> +
> +     for (j = 0; j < count; j++) {
> +             if (amdgpu_ras_check_bad_page_unlock(con,
> +                     bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
> +                     continue;
> +
> +             if (!data->space_left &&
> +                     amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {

[Tao] The spaces used for indentation in the line above should be replaced with tabs.

> +                     return -ENOMEM;
> +             }
> +
> +             amdgpu_ras_reserve_page(adev, bps[j].retired_page);
> +
> +             memcpy(&data->bps[data->count], &(bps[j]),
> +                             sizeof(struct eeprom_table_record));
> +             data->count++;
> +             data->space_left--;
> +     }
> +
> +     return 0;
> +}
> +
> +static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device
> *adev,
> +                             struct eeprom_table_record *bps, struct ras_err_data
> *err_data,
> +                             enum amdgpu_memory_partition nps)
> +{
> +     int i = 0;
> +     int ret = 0;
> +     enum amdgpu_memory_partition save_nps;
> +
> +     save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) &
> UMC_NPS_MASK;
> +
> +     for (i = 0; i < adev->umc.retire_unit; i++)
> +             bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
> +
> +     if (save_nps) {
> +             if (save_nps == nps) {
> +                     if (amdgpu_umc_pages_in_a_row(adev, err_data,
> +                                     bps[0].retired_page <<
> AMDGPU_GPU_PAGE_SHIFT))
> +                             return -EINVAL;
> +             } else {
> +                     if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
> +                             return -EINVAL;
> +             }
> +     } else {
> +             if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
> +                     if (nps == AMDGPU_NPS1_PARTITION_MODE)
> +                             memcpy(err_data->err_addr, bps,
> +                                     sizeof(struct eeprom_table_record) * adev-
> >umc.retire_unit);
> +                     else
> +                             return -EOPNOTSUPP;
> +             }
> +     }
> +
> +     return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
> +adev->umc.retire_unit); }
> +
> +static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
> +                             struct eeprom_table_record *bps, struct ras_err_data
> *err_data,
> +                             enum amdgpu_memory_partition nps)
> +{
> +     enum amdgpu_memory_partition save_nps;
> +
> +     save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
> +     bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
> +
> +     if (save_nps == nps) {
> +             if (amdgpu_umc_pages_in_a_row(adev, err_data,
> +                             bps->retired_page <<
> AMDGPU_GPU_PAGE_SHIFT))
> +                     return -EINVAL;
> +     } else {
> +             if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
> +                     return -EINVAL;
> +     }
> +     return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
> +                                                                     adev-
> >umc.retire_unit);
> +}
> +
>  /* it deal with vram only. */
>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>               struct eeprom_table_record *bps, int pages, bool from_rom)  {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> -     struct ras_err_handler_data *data;
>       struct ras_err_data err_data;
> -     struct eeprom_table_record *err_rec;
>       struct amdgpu_ras_eeprom_control *control =
>                       &adev->psp.ras_context.ras->eeprom_control;
>       enum amdgpu_memory_partition nps =
> AMDGPU_NPS1_PARTITION_MODE;
>       int ret = 0;
> -     uint32_t i, j, loop_cnt = 1;
> -     bool find_pages_per_pa = false;
> +     uint32_t i;
>
>       if (!con || !con->eh_data || !bps || pages <= 0)
>               return 0;
> @@ -2825,108 +2906,41 @@ int amdgpu_ras_add_bad_pages(struct
> amdgpu_device *adev,
>                               sizeof(struct eeprom_table_record), GFP_KERNEL);
>               if (!err_data.err_addr) {
>                       dev_warn(adev->dev, "Failed to alloc UMC error address
> record in mca2pa conversion!\n");
> -                     ret = -ENOMEM;
> -                     goto out;
> +                     return -ENOMEM;
>               }
>
> -             err_rec = err_data.err_addr;
> -             loop_cnt = adev->umc.retire_unit;
>               if (adev->gmc.gmc_funcs->query_mem_partition_mode)
>                       nps = adev->gmc.gmc_funcs-
> >query_mem_partition_mode(adev);
>       }
>
>       mutex_lock(&con->recovery_lock);
> -     data = con->eh_data;
> -     if (!data) {
> -             /* Returning 0 as the absence of eh_data is acceptable */
> -             goto free;
> -     }
> -
> -     for (i = 0; i < pages; i++) {
> -             if (from_rom &&
> -                 control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
> -                     if (!find_pages_per_pa) {
> -                             if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i],
> &err_data)) {
> -                                     if (!i && nps ==
> AMDGPU_NPS1_PARTITION_MODE) {
> -                                             /* may use old RAS TA, use PA to find
> pages in
> -                                              * one row
> -                                              */
> -                                             if
> (amdgpu_umc_pages_in_a_row(adev, &err_data,
> -
> bps[i].retired_page <<
> -
> AMDGPU_GPU_PAGE_SHIFT)) {
> -                                                     ret = -EINVAL;
> -                                                     goto free;
> -                                             } else {
> -                                                     find_pages_per_pa = true;
> -                                             }
> -                                     } else {
> -                                             /* unsupported cases */
> -                                             ret = -EOPNOTSUPP;
> -                                             goto free;
> -                                     }
> -                             }
> -                     } else {
> -                             if (amdgpu_umc_pages_in_a_row(adev, &err_data,
> -                                             bps[i].retired_page <<
> AMDGPU_GPU_PAGE_SHIFT)) {
> -                                     ret = -EINVAL;
> -                                     goto free;
> -                             }
> -                     }
> -             } else {
> -                     if (from_rom && !find_pages_per_pa) {
> -                             if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
> -                                     /* bad page in any NPS mode in eeprom */
> -                                     if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i],
> &err_data)) {
> -                                             ret = -EINVAL;
> +
> +     if (from_rom) {
> +             for (i = 0; i < pages; i++) {
> +                     if (control->ras_num_recs - i >= adev->umc.retire_unit) {
> +                             if ((bps[i].address == bps[i + 1].address) &&
> +                                 (bps[i].mem_channel == bps[i + 1].mem_channel)) {
> +                                     //deal with retire_unit records a time
> +                                     ret =
> __amdgpu_ras_convert_rec_array_from_rom(adev,
> +                                                                     &bps[i],
> &err_data, nps);
> +                                     if (ret)
>                                               goto free;
> -                                     }
> +                                     i += (adev->umc.retire_unit - 1);
>                               } else {
> -                                     /* legacy bad page in eeprom, generated only
> in
> -                                      * NPS1 mode
> -                                      */
> -                                     if (amdgpu_ras_mca2pa(adev, &bps[i],
> &err_data)) {
> -                                             /* old RAS TA or ASICs which don't
> support to
> -                                              * convert addrss via mca address
> -                                              */
> -                                             if (!i && nps ==
> AMDGPU_NPS1_PARTITION_MODE) {
> -                                                     find_pages_per_pa = true;
> -                                                     err_rec = &bps[i];
> -                                                     loop_cnt = 1;
> -                                             } else {
> -                                                     /* non-nps1 mode, old RAS TA
> -                                                      * can't support it
> -                                                      */
> -                                                     ret = -EOPNOTSUPP;
> -                                                     goto free;
> -                                             }
> -                                     }
> +                                     break;
>                               }
> -
> -                             if (!find_pages_per_pa)
> -                                     i += (adev->umc.retire_unit - 1);
>                       } else {
> -                             err_rec = &bps[i];
> +                             break;
>                       }
>               }
> -
> -             for (j = 0; j < loop_cnt; j++) {
> -                     if (amdgpu_ras_check_bad_page_unlock(con,
> -                             err_rec[j].retired_page <<
> AMDGPU_GPU_PAGE_SHIFT))
> -                             continue;
> -
> -                     if (!data->space_left &&
> -                         amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
> -                             ret = -ENOMEM;
> +             for (; i < pages; i++) {
> +                     ret = __amdgpu_ras_convert_rec_from_rom(adev,
> +                             &bps[i], &err_data, nps);
> +                     if (ret)
>                               goto free;
> -                     }
> -
> -                     amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
> -
> -                     memcpy(&data->bps[data->count], &(err_rec[j]),
> -                                     sizeof(struct eeprom_table_record));
> -                     data->count++;
> -                     data->space_left--;
>               }
> +     } else {
> +             ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
>       }
>
>  free:
> --
> 2.34.1





[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux