Already sent -----Original Message----- From: Grodzovsky, Andrey Sent: 2018å¹´2æ??28æ?¥ 21:31 To: Koenig, Christian <Christian.Koenig at amd.com>; Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org Subject: Re: [PATCH 1/4] drm/amdgpu: stop all rings before doing gpu recover Will do once Monk sends V2 for [PATCH 4/4] drm/amdgpu: try again kiq access if not in IRQ Andrey On 02/28/2018 07:20 AM, Christian König wrote: > Andrey please give this set a good testing as well. > > Am 28.02.2018 um 08:21 schrieb Monk Liu: >> found recover_vram_from_shadow sometimes get executed in paralle with >> SDMA scheduler, should stop all schedulers before doing gpu >> reset/recover >> >> Change-Id: Ibaef3e3c015f3cf88f84b2eaf95cda95ae1a64e3 >> Signed-off-by: Monk Liu <Monk.Liu at amd.com> > > For now this patch is Reviewed-by: Christian König > <christian.koenig at amd.com>. > > Regards, > Christian. > >> --- >>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 40 >> +++++++++++------------------- >>  1 file changed, 15 insertions(+), 25 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 75d1733..e9d81a8 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -2649,22 +2649,23 @@ int amdgpu_device_gpu_recover(struct >> amdgpu_device *adev, >>       /* block TTM */ >>      resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); >> + >>      /* store modesetting */ >>      if (amdgpu_device_has_dc_support(adev)) >>          state = drm_atomic_helper_suspend(adev->ddev); >>  -   /* block scheduler */ >> +   /* block all schedulers and reset given job's ring */ >>      for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >>          struct amdgpu_ring *ring = adev->rings[i]; >>           if (!ring || !ring->sched.thread) >>              continue; >>  -       /* only focus on the ring hit timeout if &job not NULL */ >> +       kthread_park(ring->sched.thread); >> + >>          if (job && job->ring->idx != i) >>              continue; >>  -       kthread_park(ring->sched.thread); >>          drm_sched_hw_job_reset(&ring->sched, &job->base); >>           /* after all hw jobs are reset, hw fence is meaningless, >> so force_completion */ @@ -2707,33 +2708,22 @@ int >> amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>              } >>              dma_fence_put(fence); >>          } >> +   } >>  -       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> -           struct amdgpu_ring *ring = adev->rings[i]; >> - >> -           if (!ring || !ring->sched.thread) >> -               continue; >> +   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> +       struct amdgpu_ring *ring = adev->rings[i]; >>  -           /* only focus on the ring hit timeout if &job not NULL >> */ >> -           if (job && job->ring->idx != i) >> -               continue; >> +       if (!ring || !ring->sched.thread) >> +           continue; >>  +       /* only need recovery sched of the given job's ring >> +        * or all rings (in the case @job is NULL) >> +        * after above amdgpu_reset accomplished >> +        */ >> +       if ((!job || job->ring->idx == i) && !r) >>              drm_sched_job_recovery(&ring->sched); >> -           kthread_unpark(ring->sched.thread); >> -       } >> -   } else { >> -       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> -           struct amdgpu_ring *ring = adev->rings[i]; >>  -           if (!ring || !ring->sched.thread) >> -               continue; >> - >> -           /* only focus on the ring hit timeout if &job not NULL >> */ >> -           if (job && job->ring->idx != i) >> -               continue; >> - >> -           kthread_unpark(adev->rings[i]->sched.thread); >> -       } >> +       kthread_unpark(ring->sched.thread); >>      } >>       if (amdgpu_device_has_dc_support(adev)) { >