Quit first and try again later if a GPU reset is already running. This way we can handle different jobs hanging on different rings and crashing into each other at the same time. Change-Id: I0c6bc8d76959c5053e7523c41b2305032fc6b79a Signed-off-by: Monk Liu <Monk.Liu at amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 31a5608..9efbb33 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2754,9 +2754,9 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job) struct amdgpu_ring *ring; struct dma_fence *fence = NULL, *next = NULL; - /* other thread is already into the gpu reset so just quit */ + /* other thread is already into the gpu reset so just quit and come later */ if (!atomic_add_unless(&adev->in_sriov_reset, 1, 1)) - return 0; + return -EAGAIN; atomic_inc(&adev->gpu_reset_counter); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 4510627..0db81a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -37,10 +37,19 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job) atomic_read(&job->ring->fence_drv.last_seq), job->ring->fence_drv.sync_seq); - if (amdgpu_sriov_vf(job->adev)) - amdgpu_sriov_gpu_reset(job->adev, job); - else + if (amdgpu_sriov_vf(job->adev)) { + int r; + +try_again: + r = amdgpu_sriov_gpu_reset(job->adev, job); + if (r == -EAGAIN) { + /* maybe two different schedulers both have hung jobs, try later */ + schedule(); + goto try_again; + } + } else { amdgpu_gpu_reset(job->adev); + } } int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, -- 2.7.4