Andrey please give this set a good testing as well. Am 28.02.2018 um 08:21 schrieb Monk Liu: > found recover_vram_from_shadow sometimes get executed > in parallel with SDMA scheduler, should stop all > schedulers before doing gpu reset/recover > > Change-Id: Ibaef3e3c015f3cf88f84b2eaf95cda95ae1a64e3 > Signed-off-by: Monk Liu <Monk.Liu at amd.com> For now this patch is Reviewed-by: Christian König <christian.koenig at amd.com>. Regards, Christian. > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 40 +++++++++++------------------- > 1 file changed, 15 insertions(+), 25 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 75d1733..e9d81a8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2649,22 +2649,23 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > > /* block TTM */ > resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); > + > /* store modesetting */ > if (amdgpu_device_has_dc_support(adev)) > state = drm_atomic_helper_suspend(adev->ddev); > > - /* block scheduler */ > + /* block all schedulers and reset given job's ring */ > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > struct amdgpu_ring *ring = adev->rings[i]; > > if (!ring || !ring->sched.thread) > continue; > > - /* only focus on the ring hit timeout if &job not NULL */ > + kthread_park(ring->sched.thread); > + > if (job && job->ring->idx != i) > continue; > > - kthread_park(ring->sched.thread); > drm_sched_hw_job_reset(&ring->sched, &job->base); > > /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ > @@ -2707,33 +2708,22 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > } > dma_fence_put(fence); > } > + } > > - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > - struct amdgpu_ring *ring = adev->rings[i]; > - > - if (!ring || !ring->sched.thread) > - continue; > + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > + struct 
amdgpu_ring *ring = adev->rings[i]; > > - /* only focus on the ring hit timeout if &job not NULL */ > - if (job && job->ring->idx != i) > - continue; > + if (!ring || !ring->sched.thread) > + continue; > > + /* only need recovery sched of the given job's ring > + * or all rings (in the case @job is NULL) > + * after above amdgpu_reset accomplished > + */ > + if ((!job || job->ring->idx == i) && !r) > drm_sched_job_recovery(&ring->sched); > - kthread_unpark(ring->sched.thread); > - } > - } else { > - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > - struct amdgpu_ring *ring = adev->rings[i]; > > - if (!ring || !ring->sched.thread) > - continue; > - > - /* only focus on the ring hit timeout if &job not NULL */ > - if (job && job->ring->idx != i) > - continue; > - > - kthread_unpark(adev->rings[i]->sched.thread); > - } > + kthread_unpark(ring->sched.thread); > } > > if (amdgpu_device_has_dc_support(adev)) {