Am 30.09.2017 um 08:03 schrieb Monk Liu: > quit first and try later if gpu_reset is already running, this > way we can handle different jobs hung on different rings and > crashing each other at the same time Using schedule() like this is not good coding style; please use a lock or completion event instead. Christian. > > Change-Id: I0c6bc8d76959c5053e7523c41b2305032fc6b79a > Signed-off-by: Monk Liu <Monk.Liu at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 15 ++++++++++++--- > 2 files changed, 14 insertions(+), 5 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 31a5608..9efbb33 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2754,9 +2754,9 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job) > struct amdgpu_ring *ring; > struct dma_fence *fence = NULL, *next = NULL; > > - /* other thread is already into the gpu reset so just quit */ > + /* other thread is already into the gpu reset so just quit and come later */ > if (!atomic_add_unless(&adev->in_sriov_reset, 1, 1)) > - return 0; > + return -EAGAIN; > > atomic_inc(&adev->gpu_reset_counter); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index 4510627..0db81a4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -37,10 +37,19 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job) > atomic_read(&job->ring->fence_drv.last_seq), > job->ring->fence_drv.sync_seq); > > - if (amdgpu_sriov_vf(job->adev)) > - amdgpu_sriov_gpu_reset(job->adev, job); > - else > + if (amdgpu_sriov_vf(job->adev)) { > + int r; > + > +try_again: > + r = amdgpu_sriov_gpu_reset(job->adev, job); > + if (r == -EAGAIN) { > + /* maybe two different schedulers each have a hung job, try later */ > + schedule(); > 
+ goto try_again; > + } > + } else { > amdgpu_gpu_reset(job->adev); > + } > } > > int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,