From: Rob Clark <robdclark@xxxxxxxxxxxx> Don't directly restart the hangcheck timer from the timer handler, but instead start it after the recover_worker replays remaining jobs. If the kthread is blocked for other reasons, there is no point in immediately restarting the timer. Fixes a random symptom of the problem fixed in the next patch. v2: Keep the hangcheck timer restart in the timer handler in the case where we aren't scheduling recover_worker. Signed-off-by: Rob Clark <robdclark@xxxxxxxxxxxx> --- drivers/gpu/drm/msm/msm_gpu.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index fba85f894314..6762001d9945 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -328,6 +328,7 @@ find_submit(struct msm_ringbuffer *ring, uint32_t fence) } static void retire_submits(struct msm_gpu *gpu); +static void hangcheck_timer_reset(struct msm_gpu *gpu); static void get_comm_cmdline(struct msm_gem_submit *submit, char **comm, char **cmd) { @@ -420,6 +421,8 @@ static void recover_worker(struct kthread_work *work) } if (msm_gpu_active(gpu)) { + bool restart_hangcheck = false; + /* retire completed submits, plus the one that hung: */ retire_submits(gpu); @@ -436,10 +439,15 @@ static void recover_worker(struct kthread_work *work) unsigned long flags; spin_lock_irqsave(&ring->submit_lock, flags); - list_for_each_entry(submit, &ring->submits, node) + list_for_each_entry(submit, &ring->submits, node) { gpu->funcs->submit(gpu, submit); + restart_hangcheck = true; + } spin_unlock_irqrestore(&ring->submit_lock, flags); } + + if (restart_hangcheck) + hangcheck_timer_reset(gpu); } mutex_unlock(&gpu->lock); @@ -498,6 +506,7 @@ static void hangcheck_handler(struct timer_list *t) struct drm_device *dev = gpu->dev; struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu); uint32_t fence = ring->memptrs->fence; + bool restart_hangcheck = true; if (fence != 
ring->hangcheck_fence) { /* some progress has been made.. ya! */ @@ -513,10 +522,16 @@ static void hangcheck_handler(struct timer_list *t) gpu->name, ring->fctx->last_fence); kthread_queue_work(gpu->worker, &gpu->recover_work); + + /* If we do recovery, we want to defer restarting the hangcheck + * timer until recovery completes and the remaining non-guilty + * jobs are re-played. + */ + restart_hangcheck = false; } /* if still more pending work, reset the hangcheck timer: */ - if (fence_after(ring->fctx->last_fence, ring->hangcheck_fence)) + if (restart_hangcheck && fence_after(ring->fctx->last_fence, ring->hangcheck_fence)) hangcheck_timer_reset(gpu); /* workaround for missing irq: */ -- 2.36.1