v2: Trigger GPU reset in case of new bad address errors.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 296e2d9..f5f36ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -243,17 +243,40 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 		struct ras_err_data *err_data,
 		struct amdgpu_iv_entry *entry)
 {
+	unsigned long new_err_addr_cnt, old_err_addr_cnt;
+	new_err_addr_cnt = 0;
+	old_err_addr_cnt = err_data->err_addr_cnt;
+
 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 	if (adev->umc.funcs->query_ras_error_count)
 		adev->umc.funcs->query_ras_error_count(adev, err_data);
 	/* umc query_ras_error_address is also responsible for clearing
 	 * error status
 	 */
-	if (adev->umc.funcs->query_ras_error_address)
+	if (adev->umc.funcs->query_ras_error_address) {
+		unsigned long *bps;
+		int i;
+
 		adev->umc.funcs->query_ras_error_address(adev, err_data);
 
+		new_err_addr_cnt = err_data->err_addr_cnt - old_err_addr_cnt;
+
+		if (new_err_addr_cnt) {
+			bps = kcalloc(new_err_addr_cnt, sizeof(*bps), GFP_KERNEL);
+			if (!bps)
+				return -ENOMEM;
+
+			for (i = 0; i < new_err_addr_cnt; i++)
+				bps[i] = err_data->err_addr[old_err_addr_cnt + i] >> PAGE_SHIFT;
+
+			amdgpu_ras_add_bad_pages(adev, bps, new_err_addr_cnt);
+
+			kfree(bps);
+		}
+	}
+
 	/* only uncorrectable error needs gpu reset */
-	if (err_data->ue_count)
+	if (err_data->ue_count || new_err_addr_cnt)
 		amdgpu_ras_reset_gpu(adev, 0);
 
 	return AMDGPU_RAS_SUCCESS;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx