[AMD Official Use Only - General] Let's drop the following message. + dev_info(adev->dev, "%ld uncorrectable hardware errors and " + "%ld deferred hardware errors detected in UMC block\n", + err_data->ue_count, err_data->de_count); With that fixed, the series is Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Regards, Hawking -----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Candice Li Sent: Wednesday, January 10, 2024 16:39 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Li, Candice <Candice.Li@xxxxxxx> Subject: [PATCH 2/2] drm/amdgpu: Do bad page retirement for deferred errors Bad page retirement also needs to be done for deferred errors, not only for uncorrectable errors. Signed-off-by: Candice Li <candice.li@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 848df7acdd3210..df61df7e9b155f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -93,6 +93,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int ret = 0; + unsigned long err_count; kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc)); @@ -147,16 +148,17 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, } /* only uncorrectable error needs gpu reset */ - if (err_data->ue_count) { - dev_info(adev->dev, "%ld uncorrectable hardware errors " - "detected in UMC block\n", - err_data->ue_count); + if (err_data->ue_count || err_data->de_count) { + dev_info(adev->dev, "%ld uncorrectable hardware errors and " + "%ld deferred hardware errors detected in UMC block\n", + err_data->ue_count, err_data->de_count); + err_count = err_data->ue_count + err_data->de_count; if 
((amdgpu_bad_page_threshold != 0) && err_data->err_addr_cnt) { amdgpu_ras_add_bad_pages(adev, err_data->err_addr, err_data->err_addr_cnt); - amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count)); + amdgpu_ras_save_bad_pages(adev, &err_count); amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); -- 2.25.1