On Mon, Sep 2, 2024 at 3:53 AM Lijo Lazar <lijo.lazar@xxxxxxx> wrote: > > Add a separate function to read badpage data during initialization. > Reading bad pages will need hardware access and cannot be done during > reset. Hence in cases where device needs a full reset during > init itself, attempting to read will cause a deadlock. > > Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx> Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 56 +++++++++++++++------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +- > 3 files changed, 41 insertions(+), 21 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index e28227869307..468c4f590183 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2945,7 +2945,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) > * Note: theoretically, this should be called before all vram allocations > * to protect retired page from abusing > */ > - r = amdgpu_ras_recovery_init(adev); > + r = amdgpu_ras_recovery_init(adev, true); > if (r) > goto init_failed; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 2076f157cb6a..65c891b6b999 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3146,7 +3146,42 @@ static int amdgpu_ras_page_retirement_thread(void *param) > return 0; > } > > -int amdgpu_ras_recovery_init(struct amdgpu_device *adev) > +int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) > +{ > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + int ret; > + > + if (!con || amdgpu_sriov_vf(adev)) > + return 0; > + > + ret = amdgpu_ras_eeprom_init(&con->eeprom_control); > + > + if (ret) > + return ret; > + > + /* HW not usable */ > + if (amdgpu_ras_is_rma(adev)) > + return -EHWPOISON; > + > + if (con->eeprom_control.ras_num_recs) { > + ret = amdgpu_ras_load_bad_pages(adev); > + if (ret) > + return ret; > + > + amdgpu_dpm_send_hbm_bad_pages_num( > + adev, con->eeprom_control.ras_num_recs); > + > + if (con->update_channel_flag == true) { > + amdgpu_dpm_send_hbm_bad_channel_flag( > + adev, con->eeprom_control.bad_channel_bitmap); > + con->update_channel_flag = false; > + } > + } > + > + return ret; > +} > + > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) > { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct ras_err_handler_data **data; > @@ -3187,25 +3222,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) > */ > if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL) > return 0; > - ret = amdgpu_ras_eeprom_init(&con->eeprom_control); > - /* > - * This calling fails when is_rma is true or > - * ret != 0. > - */ > - if (amdgpu_ras_is_rma(adev) || ret) > - goto free; > - > - if (con->eeprom_control.ras_num_recs) { > - ret = amdgpu_ras_load_bad_pages(adev); > + if (init_bp_info) { > + ret = amdgpu_ras_init_badpage_info(adev); > if (ret) > goto free; > - > - amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); > - > - if (con->update_channel_flag == true) { > - amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); > - con->update_channel_flag = false; > - } > } > > mutex_init(&con->page_rsv_lock); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 669720a9c60a..871b2d6278e0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -736,8 +736,8 @@ struct amdgpu_ras_block_hw_ops { > * 8: feature disable > */ > > - > -int amdgpu_ras_recovery_init(struct amdgpu_device *adev); > +int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev); > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info); > > void amdgpu_ras_resume(struct amdgpu_device *adev); > void amdgpu_ras_suspend(struct amdgpu_device *adev); > -- > 2.25.1 >