[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Xie, Patrick <Gangliang.Xie@xxxxxxx>
> Sent: Friday, February 21, 2025 11:19 AM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao
> <Tao.Zhou1@xxxxxxx>; Xie, Patrick <Gangliang.Xie@xxxxxxx>
> Subject: [PATCH 2/3] drm/amdgpu: Refine bad page adding
>
> bad page adding can be simpler with nps info
>
> Signed-off-by: ganglxie <ganglxie@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 196 +++++++++++++-----------
>  1 file changed, 105 insertions(+), 91 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 5420e2d6d244..439841a2d1c2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2801,20 +2801,101 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device
> *adev,
>  	return -EINVAL;
>  }
>
> +static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
> +				struct eeprom_table_record *bps, int count) {
> +	int j;
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	struct ras_err_handler_data *data = con->eh_data;
> +
> +	for (j = 0; j < count; j++) {
> +		if (amdgpu_ras_check_bad_page_unlock(con,
> +			bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
> +			continue;
> +
> +		if (!data->space_left &&
> +		    amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {

[Tao] space should be replaced with tab

> +			return -ENOMEM;
> +		}
> +
> +		amdgpu_ras_reserve_page(adev, bps[j].retired_page);
> +
> +		memcpy(&data->bps[data->count], &(bps[j]),
> +				sizeof(struct eeprom_table_record));
> +		data->count++;
> +		data->space_left--;
> +	}
> +
> +	return 0;
> +}
> +
> +static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device
> *adev,
> +				struct eeprom_table_record *bps, struct ras_err_data
> *err_data,
> +				enum amdgpu_memory_partition nps)
> +{
> +	int i = 0;
> +	int ret = 0;
> +	enum amdgpu_memory_partition save_nps;
> +
> +	save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) &
> UMC_NPS_MASK;
> +
> +	for (i = 0; i < adev->umc.retire_unit; i++)
> +		bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
> +
> +	if (save_nps) {
> +		if (save_nps == nps) {
> +			if (amdgpu_umc_pages_in_a_row(adev, err_data,
> +					bps[0].retired_page <<
> AMDGPU_GPU_PAGE_SHIFT))
> +				return -EINVAL;
> +		} else {
> +			if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
> +				return -EINVAL;
> +		}
> +	} else {
> +		if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
> +			if (nps == AMDGPU_NPS1_PARTITION_MODE)
> +				memcpy(err_data->err_addr, bps,
> +					sizeof(struct eeprom_table_record) * adev-
> >umc.retire_unit);
> +			else
> +				return -EOPNOTSUPP;
> +		}
> +	}
> +
> +	return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
> +adev->umc.retire_unit); }
> +
> +static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
> +				struct eeprom_table_record *bps, struct ras_err_data
> *err_data,
> +				enum amdgpu_memory_partition nps)
> +{
> +	enum amdgpu_memory_partition save_nps;
> +
> +	save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
> +	bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
> +
> +	if (save_nps == nps) {
> +		if (amdgpu_umc_pages_in_a_row(adev, err_data,
> +				bps->retired_page <<
> AMDGPU_GPU_PAGE_SHIFT))
> +			return -EINVAL;
> +	} else {
> +		if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
> +			return -EINVAL;
> +	}
> +	return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
> +								adev-
> >umc.retire_unit);
> +}
> +
>  /* it deal with vram only. */
>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>  		struct eeprom_table_record *bps, int pages, bool from_rom) {
>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> -	struct ras_err_handler_data *data;
>  	struct ras_err_data err_data;
> -	struct eeprom_table_record *err_rec;
>  	struct amdgpu_ras_eeprom_control *control =
>  			&adev->psp.ras_context.ras->eeprom_control;
>  	enum amdgpu_memory_partition nps =
> AMDGPU_NPS1_PARTITION_MODE;
>  	int ret = 0;
> -	uint32_t i, j, loop_cnt = 1;
> -	bool find_pages_per_pa = false;
> +	uint32_t i;
>
>  	if (!con || !con->eh_data || !bps || pages <= 0)
>  		return 0;
> @@ -2825,108 +2906,41 @@ int amdgpu_ras_add_bad_pages(struct
> amdgpu_device *adev,
>  				sizeof(struct eeprom_table_record), GFP_KERNEL);
>  		if (!err_data.err_addr) {
>  			dev_warn(adev->dev, "Failed to alloc UMC error address
> record in mca2pa conversion!\n");
> -			ret = -ENOMEM;
> -			goto out;
> +			return -ENOMEM;
>  		}
>
> -		err_rec = err_data.err_addr;
> -		loop_cnt = adev->umc.retire_unit;
>  		if (adev->gmc.gmc_funcs->query_mem_partition_mode)
>  			nps = adev->gmc.gmc_funcs-
> >query_mem_partition_mode(adev);
>  	}
>
>  	mutex_lock(&con->recovery_lock);
> -	data = con->eh_data;
> -	if (!data) {
> -		/* Returning 0 as the absence of eh_data is acceptable */
> -		goto free;
> -	}
> -
> -	for (i = 0; i < pages; i++) {
> -		if (from_rom &&
> -		    control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
> -			if (!find_pages_per_pa) {
> -				if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i],
> &err_data)) {
> -					if (!i && nps ==
> AMDGPU_NPS1_PARTITION_MODE) {
> -						/* may use old RAS TA, use PA to find
> pages in
> -						 * one row
> -						 */
> -						if
> (amdgpu_umc_pages_in_a_row(adev, &err_data,
> -
> bps[i].retired_page <<
> -
> AMDGPU_GPU_PAGE_SHIFT)) {
> -							ret = -EINVAL;
> -							goto free;
> -						} else {
> -							find_pages_per_pa = true;
> -						}
> -					} else {
> -						/* unsupported cases */
> -						ret = -EOPNOTSUPP;
> -						goto free;
> -					}
> -				}
> -			} else {
> -				if (amdgpu_umc_pages_in_a_row(adev, &err_data,
> -						bps[i].retired_page <<
> AMDGPU_GPU_PAGE_SHIFT)) {
> -					ret = -EINVAL;
> -					goto free;
> -				}
> -			}
> -		} else {
> -			if (from_rom && !find_pages_per_pa) {
> -				if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
> -					/* bad page in any NPS mode in eeprom */
> -					if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i],
> &err_data)) {
> -						ret = -EINVAL;
> +
> +	if (from_rom) {
> +		for (i = 0; i < pages; i++) {
> +			if (control->ras_num_recs - i >= adev->umc.retire_unit) {
> +				if ((bps[i].address == bps[i + 1].address) &&
> +				    (bps[i].mem_channel == bps[i + 1].mem_channel)) {
> +					//deal with retire_unit records a time
> +					ret =
> __amdgpu_ras_convert_rec_array_from_rom(adev,
> +									&bps[i],
> &err_data, nps);
> +					if (ret)
>  						goto free;
> -					}
> +					i += (adev->umc.retire_unit - 1);
>  				} else {
> -					/* legacy bad page in eeprom, generated only
> in
> -					 * NPS1 mode
> -					 */
> -					if (amdgpu_ras_mca2pa(adev, &bps[i],
> &err_data)) {
> -						/* old RAS TA or ASICs which don't
> support to
> -						 * convert addrss via mca address
> -						 */
> -						if (!i && nps ==
> AMDGPU_NPS1_PARTITION_MODE) {
> -							find_pages_per_pa = true;
> -							err_rec = &bps[i];
> -							loop_cnt = 1;
> -						} else {
> -							/* non-nps1 mode, old RAS TA
> -							 * can't support it
> -							 */
> -							ret = -EOPNOTSUPP;
> -							goto free;
> -						}
> -					}
> +					break;
>  				}
> -
> -				if (!find_pages_per_pa)
> -					i += (adev->umc.retire_unit - 1);
>  			} else {
> -				err_rec = &bps[i];
> +				break;
>  			}
>  		}
> -
> -		for (j = 0; j < loop_cnt; j++) {
> -			if (amdgpu_ras_check_bad_page_unlock(con,
> -				err_rec[j].retired_page <<
> AMDGPU_GPU_PAGE_SHIFT))
> -				continue;
> -
> -			if (!data->space_left &&
> -			    amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
> -				ret = -ENOMEM;
> +		for (; i < pages; i++) {
> +			ret = __amdgpu_ras_convert_rec_from_rom(adev,
> +				&bps[i], &err_data, nps);
> +			if (ret)
>  				goto free;
> -			}
> -
> -			amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
> -
> -			memcpy(&data->bps[data->count], &(err_rec[j]),
> -					sizeof(struct eeprom_table_record));
> -			data->count++;
> -			data->space_left--;
>  		}
> +	} else {
> +		ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
>  	}
>
>  free:
> --
> 2.34.1