If a UMC bad page is reserved but not freed by an application, the application may trigger uncorrectable error repeatly by accessing the page. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 ++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 6 +++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e85c4689ce2c..eafe01a24349 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2049,7 +2049,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; - int ret = 0; + int ret = 0, old_cnt; uint32_t i; if (!con || !con->eh_data || !bps || pages <= 0) @@ -2060,6 +2060,8 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, if (!data) goto out; + old_cnt = data->count; + for (i = 0; i < pages; i++) { if (amdgpu_ras_check_bad_page_unlock(con, bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) @@ -2079,6 +2081,11 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, data->count++; data->space_left--; } + + /* all pages have been reserved before, no new bad page */ + if (old_cnt == data->count) + ret = -EEXIST; + out: mutex_unlock(&con->recovery_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 1c7fcb4f2380..772c431e4065 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -145,8 +145,12 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, if ((amdgpu_bad_page_threshold != 0) && err_data->err_addr_cnt) { - amdgpu_ras_add_bad_pages(adev, err_data->err_addr, + ret = amdgpu_ras_add_bad_pages(adev, err_data->err_addr, err_data->err_addr_cnt); + /* if no new bad page is found, no need to increase ue count */ + if (ret == -EEXIST) + err_data->ue_count = 0; + amdgpu_ras_save_bad_pages(adev); amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); -- 2.35.1