On 8/12/20 11:19 AM, Emily.Deng wrote:
From: jqdeng <Emily.Deng@xxxxxxx>
Only for no job running test case need to do recover in
flr notification.
For having job in mirror list, then let guest driver to
hit job timeout, and then do recover.
Signed-off-by: jqdeng <Emily.Deng@xxxxxxx>
Change-Id: Ic6234fce46fa1655ba81c4149235eeac75e75868
---
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 20 +++++++++++++++++++-
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 22 ++++++++++++++++++++--
2 files changed, 39 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index fe31cbeccfe9..12fe5164aaf3 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -238,6 +238,9 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
+ int i;
+ bool need_do_recover = true;
We should find a better name for "need_do_recover", may be
"need_to_recover" ?
+ struct drm_sched_job *job;
/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
* otherwise the mailbox msg will be ruined/reseted by
@@ -258,10 +261,25 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
flr_done:
up_read(&adev->reset_sem);
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+ struct amdgpu_ring *ring = adev->rings[i];
+
+ if (!ring || !ring->sched.thread)
+ continue;
+
+ spin_lock(&ring->sched.job_list_lock);
+ job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
+ struct drm_sched_job, node);
+ spin_unlock(&ring->sched.job_list_lock);
+ if (job) {
+ need_do_recover = false;
+ break;
+ }
+ }
This 1st job retrieval logic can move to a function as there are two
instance of it.
/* Trigger recovery for world switch failure if no TDR */
if (amdgpu_device_should_recover_gpu(adev)
- && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)
+ && (need_do_recover || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
amdgpu_device_gpu_recover(adev, NULL);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 6f55172e8337..fc92c494df0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -259,6 +259,9 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
+ int i;
+ bool need_do_recover = true;
+ struct drm_sched_job *job;
/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
* otherwise the mailbox msg will be ruined/reseted by
@@ -279,10 +282,25 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
flr_done:
up_read(&adev->reset_sem);
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+ struct amdgpu_ring *ring = adev->rings[i];
+
+ if (!ring || !ring->sched.thread)
+ continue;
+
+ spin_lock(&ring->sched.job_list_lock);
+ job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
+ struct drm_sched_job, node);
+ spin_unlock(&ring->sched.job_list_lock);
+ if (job) {
+ need_do_recover = false;
+ break;
+ }
+ }
/* Trigger recovery for world switch failure if no TDR */
- if (amdgpu_device_should_recover_gpu(adev)
- && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
+ if (amdgpu_device_should_recover_gpu(adev) && (need_do_recover ||
+ adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx