For MCA poison, if unmap queue fails, gpu reset should be triggered even no error count is queried. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 27 +++++++++++++++---------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index e97b1bd343ee..22995f371eae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -229,18 +229,23 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) { int ret; - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - struct ras_common_if head = { - .block = AMDGPU_RAS_BLOCK__UMC, - }; - struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); - ret = - amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset); - - if (ret == AMDGPU_RAS_SUCCESS && obj) { - obj->err_data.ue_count += err_data->ue_count; - obj->err_data.ce_count += err_data->ce_count; + if (adev->gmc.xgmi.connected_to_cpu) { + ret = amdgpu_umc_poison_handler_mca(adev, ras_error_status, reset); + } else { + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + struct ras_common_if head = { + .block = AMDGPU_RAS_BLOCK__UMC, + }; + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); + + ret = + amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset); + + if (ret == AMDGPU_RAS_SUCCESS && obj) { + obj->err_data.ue_count += err_data->ue_count; + obj->err_data.ce_count += err_data->ce_count; + } } return ret; -- 2.35.1