Check RMA status in bad page retirement flow.

When the RAS context is flagged as RMA (con->is_rma):
- the interrupt poison consumption handler and the page retirement
  paths issue a GPU reset;
- amdgpu_ras_poison_consumption_handler() skips its own reset branch;
- amdgpu_ras_reset_gpu() clears gpu_reset_flags and selects
  AMDGPU_RAS_GPU_RESET_MODE1_RESET only.

Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 +++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  7 +++----
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 934dfb2bf9e5..a6da44ac3fbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2049,8 +2049,9 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 	struct amdgpu_device *adev = obj->adev;
 	struct amdgpu_ras_block_object *block_obj =
 		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-	if (!block_obj)
+	if (!block_obj || !con)
 		return;
 
 	/* both query_poison_status and handle_poison_consumption are optional,
@@ -2074,7 +2075,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
 
 	/* gpu reset is fallback for failed and default cases */
-	if (poison_stat) {
+	if (poison_stat || con->is_rma) {
 		dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
 				block_obj->ras_comm.name);
 		amdgpu_ras_reset_gpu(adev);
@@ -2817,6 +2818,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
 		schedule_delayed_work(&con->page_retirement_dwork,
 			msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
 	mutex_unlock(&con->umc_ecc_log.lock);
+
+	if (err_data->err_addr_cnt && con->is_rma)
+		amdgpu_ras_reset_gpu(adev);
 }
 
 static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
@@ -2867,7 +2871,7 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
 	if (poison_msg->pasid_fn)
 		poison_msg->pasid_fn(adev, pasid, poison_msg->data);
 
-	if (reset) {
+	if (reset && !con->is_rma) {
 		flush_delayed_work(&con->page_retirement_dwork);
 
 		con->gpu_reset_flags |= reset;
@@ -3983,6 +3987,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+	/* mode1 is the only selection for RMA status */
+	if (ras->is_rma) {
+		ras->gpu_reset_flags = 0;
+		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+	}
+
 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
 		amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1dbe69eabb9a..5f3866548cb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -195,7 +195,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 	amdgpu_umc_handle_bad_pages(adev, ras_error_status);
 
-	if (err_data->ue_count && reset) {
+	if (err_data->ue_count && (reset || con->is_rma)) {
 		con->gpu_reset_flags |= reset;
 		amdgpu_ras_reset_gpu(adev);
 	}
@@ -211,6 +211,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
 		.block = AMDGPU_RAS_BLOCK__UMC,
 	};
 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	uint32_t timeout = timeout_ms;
 
 	memset(&err_data, 0, sizeof(err_data));
@@ -243,9 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
 
 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 
-	if (reset) {
-		struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
+	if (reset || (err_data.err_addr_cnt && con->is_rma)) {
 		con->gpu_reset_flags |= reset;
 		amdgpu_ras_reset_gpu(adev);
 	}
-- 
2.34.1