On 6/18/2024 12:03 PM, YiPeng Chai wrote: > Add completion to wait for ras reset to complete. > > Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + > 2 files changed, 12 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 898889600771..7f8e6ca07957 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) > > #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms > > +#define MAX_RAS_RECOVERY_COMPLETION_TIME 120000 //ms > + > enum amdgpu_ras_retire_page_reservation { > AMDGPU_RAS_RETIRE_PAGE_RESERVED, > AMDGPU_RAS_RETIRE_PAGE_PENDING, > @@ -2518,6 +2520,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) > atomic_set(&hive->ras_recovery, 0); > amdgpu_put_xgmi_hive(hive); > } > + > + complete_all(&ras->ras_recovery_completion); > } > > /* alloc/realloc bps array */ > @@ -2911,10 +2915,16 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, > > flush_delayed_work(&con->page_retirement_dwork); > > + reinit_completion(&con->ras_recovery_completion); > + > con->gpu_reset_flags |= reset; > amdgpu_ras_reset_gpu(adev); > > *gpu_reset = reset; > + if (!wait_for_completion_timeout(&con->ras_recovery_completion, > + msecs_to_jiffies(MAX_RAS_RECOVERY_COMPLETION_TIME))) > + dev_err(adev->dev, "Waiting for GPU to complete ras reset timeout! reset:0x%x\n", > + reset); If a mode-1 reset gets to execute first due to job timeout/HWS detect cases in poison timeout, then the RAS handler will never get executed. Why is this wait required? 
Thanks, Lijo > } > > return 0; > @@ -3041,6 +3051,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) > } > } > > + init_completion(&con->ras_recovery_completion); > mutex_init(&con->page_rsv_lock); > INIT_KFIFO(con->poison_fifo); > mutex_init(&con->page_retirement_lock); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 91daf48be03a..b47f03edac87 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -537,6 +537,7 @@ struct amdgpu_ras { > DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128); > struct ras_ecc_log_info umc_ecc_log; > struct delayed_work page_retirement_dwork; > + struct completion ras_recovery_completion; > > /* Fatal error detected flag */ > atomic_t fed;