[PATCH 06/12] drm/amdgpu/sriov:handle more jobs hang in different ring case

Monk.Liu@xxxxxxx (Monk Liu) · Sat, 30 Sep 2017 14:03:46 +0800

If a GPU reset is already running, quit first and retry later. This
way we can handle different jobs hanging on different rings and
colliding with each other at the same time.

Change-Id: I0c6bc8d76959c5053e7523c41b2305032fc6b79a
Signed-off-by: Monk Liu <Monk.Liu at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 15 ++++++++++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 31a5608..9efbb33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2754,9 +2754,9 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
 	struct amdgpu_ring *ring;
 	struct dma_fence *fence = NULL, *next = NULL;
 
-	/* other thread is already into the gpu reset so just quit */
+	/* another thread is already in the gpu reset, so just quit and retry later */
 	if (!atomic_add_unless(&adev->in_sriov_reset, 1, 1))
-		return 0;
+		return -EAGAIN;
 
 	atomic_inc(&adev->gpu_reset_counter);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 4510627..0db81a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,10 +37,19 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
 		  atomic_read(&job->ring->fence_drv.last_seq),
 		  job->ring->fence_drv.sync_seq);
 
-	if (amdgpu_sriov_vf(job->adev))
-		amdgpu_sriov_gpu_reset(job->adev, job);
-	else
+	if (amdgpu_sriov_vf(job->adev)) {
+		int r;
+
+try_again:
+		r = amdgpu_sriov_gpu_reset(job->adev, job);
+		if (r == -EAGAIN) {
+			/* maybe two different schedulers both have a hung job, try later */
+			schedule();
+			goto try_again;
+		}
+	} else {
 		amdgpu_gpu_reset(job->adev);
+	}
 }
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
-- 
2.7.4