Add completion to wait for gpu to complete reset.

Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7dfb2e548d70..341c9bd0d1a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 
 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
 
+#define MAX_GPU_RESET_COMPLETION_TIME 120000 //ms
+
 #define RAS_POISON_FIFO_MSG_PENDING_THRESHOLD (AMDGPU_RAS_POISON_FIFO_SIZE/4)
 
 enum amdgpu_ras_retire_page_reservation {
@@ -2526,6 +2528,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 		atomic_set(&hive->ras_recovery, 0);
 		amdgpu_put_xgmi_hive(hive);
 	}
+
+	complete(&ras->gpu_reset_completion);
 }
 
 /* alloc/realloc bps array */
@@ -2946,7 +2950,14 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
 			con->gpu_reset_flags |= reset;
 		}
 
+		reinit_completion(&con->gpu_reset_completion);
+
 		amdgpu_ras_reset_gpu(adev);
+
+		if (!wait_for_completion_timeout(&con->gpu_reset_completion,
+				msecs_to_jiffies(MAX_GPU_RESET_COMPLETION_TIME)))
+			dev_err(adev->dev, "Waiting for GPU to complete reset timeout! reset:0x%x\n",
+				reset);
 	}
 
 	return 0;
@@ -3072,6 +3083,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 		}
 	}
 
+	init_completion(&con->gpu_reset_completion);
 	mutex_init(&con->page_rsv_lock);
 	INIT_KFIFO(con->poison_fifo);
 	mutex_init(&con->page_retirement_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 103436bb650e..d5ddd0ca5de1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -537,6 +537,7 @@ struct amdgpu_ras {
 	DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, AMDGPU_RAS_POISON_FIFO_SIZE);
 	struct ras_ecc_log_info umc_ecc_log;
 	struct delayed_work page_retirement_dwork;
+	struct completion gpu_reset_completion;
 
 	/* Fatal error detected flag */
 	atomic_t fed;
-- 
2.34.1