[AMD Official Use Only - Internal Distribution Only] >-----Original Message----- >From: Das, Nirmoy <Nirmoy.Das@xxxxxxx> >Sent: Wednesday, August 12, 2020 8:18 PM >To: Deng, Emily <Emily.Deng@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx >Subject: Re: [PATCH] drm/amdgpu: Fix repeatly flr issue > > >On 8/12/20 11:19 AM, Emily.Deng wrote: >> From: jqdeng <Emily.Deng@xxxxxxx> >> >> Only for no job running test case need to do recover in flr >> notification. >> For having job in mirror list, then let guest driver to hit job >> timeout, and then do recover. >> >> Signed-off-by: jqdeng <Emily.Deng@xxxxxxx> >> Change-Id: Ic6234fce46fa1655ba81c4149235eeac75e75868 >> --- >> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 20 +++++++++++++++++++- >> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 22 ++++++++++++++++++++- >- >> 2 files changed, 39 insertions(+), 3 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >> index fe31cbeccfe9..12fe5164aaf3 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >> @@ -238,6 +238,9 @@ static void xgpu_ai_mailbox_flr_work(struct >work_struct *work) >> struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, >flr_work); >> struct amdgpu_device *adev = container_of(virt, struct >amdgpu_device, virt); >> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >> +int i; >> +bool need_do_recover = true; > > >We should find a better name for "need_do_recover", may be >"need_to_recover" ? Thanks, will modify later. > > >> +struct drm_sched_job *job; >> >> /* block amdgpu_gpu_recover till msg FLR COMPLETE received, >> * otherwise the mailbox msg will be ruined/reseted by >> @@ -258,10 +261,25 @@ static void xgpu_ai_mailbox_flr_work(struct >work_struct *work) >> >> flr_done: >> up_read(&adev->reset_sem); >> +for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> +struct amdgpu_ring *ring = adev->rings[i]; >> + >> +if (!ring || !ring->sched.thread) >> +continue; >> + >> +spin_lock(&ring->sched.job_list_lock); >> +job = list_first_entry_or_null(&ring->sched.ring_mirror_list, >> +struct drm_sched_job, node); >> +spin_unlock(&ring->sched.job_list_lock); >> +if (job) { >> +need_do_recover = false; >> +break; >> +} >> +} > > >This 1st job retrieval logic can move to a function as there are two >instance of it. >Sorry, I didn't get your point. > >> >> /* Trigger recovery for world switch failure if no TDR */ >> if (amdgpu_device_should_recover_gpu(adev) >> -&& adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT) >> +&& (need_do_recover || adev->sdma_timeout == >MAX_SCHEDULE_TIMEOUT)) >> amdgpu_device_gpu_recover(adev, NULL); >> } >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >> index 6f55172e8337..fc92c494df0b 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >> @@ -259,6 +259,9 @@ static void xgpu_nv_mailbox_flr_work(struct >work_struct *work) >> struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, >flr_work); >> struct amdgpu_device *adev = container_of(virt, struct >amdgpu_device, virt); >> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >> +int i; >> +bool need_do_recover = true; >> +struct drm_sched_job *job; >> >> /* block amdgpu_gpu_recover till msg FLR COMPLETE received, >> * otherwise the mailbox msg will be ruined/reseted by >> @@ -279,10 +282,25 @@ static void xgpu_nv_mailbox_flr_work(struct >work_struct *work) >> >> flr_done: >> up_read(&adev->reset_sem); >> +for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> +struct amdgpu_ring *ring = adev->rings[i]; >> + >> +if (!ring || !ring->sched.thread) >> +continue; >> + >> +spin_lock(&ring->sched.job_list_lock); >> +job = list_first_entry_or_null(&ring->sched.ring_mirror_list, >> +struct drm_sched_job, node); >> +spin_unlock(&ring->sched.job_list_lock); >> +if (job) { >> +need_do_recover = false; >> +break; >> +} >> +} >> >> /* Trigger recovery for world switch failure if no TDR */ >> -if (amdgpu_device_should_recover_gpu(adev) >> -&& (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT || >> +if (amdgpu_device_should_recover_gpu(adev) && (need_do_recover >|| >> +adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT || >> adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || >> adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || >> adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx