Am 23.08.2018 um 17:20 schrieb Zhu, Rex: > >> -----Original Message----- >> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of >> Christian König >> Sent: Thursday, August 23, 2018 7:24 PM >> To: amd-gfx at lists.freedesktop.org >> Subject: [PATCH 1/4] drm/amdgpu: add ring soft recovery v3 >> >> Instead of hammering hard on the GPU try a soft recovery first. >> >> v2: reorder code a bit >> v3: increase timeout to 10ms, increment GPU reset counter >> >> Signed-off-by: Christian König <christian.koenig at amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 6 ++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 25 >> +++++++++++++++++++++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 4 ++++ >> 3 files changed, 35 insertions(+) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> index 265ff90f4e01..d93e31a5c4e7 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct >> drm_sched_job *s_job) >> struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); >> struct amdgpu_job *job = to_amdgpu_job(s_job); >> >> + if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence- >>> parent)) { >> + DRM_ERROR("ring %s timeout, but soft recovered\n", >> + s_job->sched->name); >> + return; >> + } >> + >> DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n", >> job->base.sched->name, atomic_read(&ring- >>> fence_drv.last_seq), >> ring->fence_drv.sync_seq); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c >> index 5dfd26be1eec..d445acb3d956 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c >> @@ -383,6 +383,31 @@ void >> amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring, >> amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask); } >> >> +/** >> + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup >> + * >> + * @ring: ring to try the recovery on >> + * @vmid: VMID we try to get going again >> + * @fence: timedout fence >> + * >> + * Tries to get a ring proceeding again when it is stuck. >> + */ >> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int >> vmid, >> + struct dma_fence *fence) >> +{ >> + ktime_t deadline = ktime_add_us(ktime_get(), 10000); >> + >> + if (!ring->funcs->soft_recovery) >> + return false; >> + >> + atomic_inc(&adev->gpu_reset_counter); >> + while (!dma_fence_is_signaled(fence) && >> + ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0) >> + ring->funcs->soft_recovery(ring, vmid); > Hi Christian, > > Is it necessary to add a udelay() here? No, I don't think so. Christian. > > Regards > Rex >> + return dma_fence_is_signaled(fence); >> +} >> + >> /* >> * Debugfs info >> */ >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h >> index 409fdd9b9710..9cc239968e40 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h >> @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs { >> /* priority functions */ >> void (*set_priority) (struct amdgpu_ring *ring, >> enum drm_sched_priority priority); >> + /* Try to soft recover the ring to make the fence signal */ >> + void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid); >> }; >> >> struct amdgpu_ring { >> @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring); >> void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring >> *ring, >> uint32_t reg0, uint32_t val0, >> uint32_t reg1, uint32_t val1); >> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int >> vmid, >> + struct dma_fence *fence); >> >> static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring) { >> -- >> 2.14.1 >> >> _______________________________________________ >> amd-gfx mailing list >> amd-gfx at lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx