On 1 Nov 2022 15:33:10 -0700 Rob Clark <robdclark@xxxxxxxxxxxx> > --- a/drivers/gpu/drm/msm/msm_gpu.c > +++ b/drivers/gpu/drm/msm/msm_gpu.c > @@ -500,6 +500,21 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu) > round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period))); > } > > +static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring) > +{ > + if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES) > + return false; > + > + if (!gpu->funcs->progress) > + return false; Retrying cannot make a difference when no progress callback is provided. > + > + if (!gpu->funcs->progress(gpu, ring)) > + return false; > + > + ring->hangcheck_progress_retries++; > + return true; > +} > + > static void hangcheck_handler(struct timer_list *t) > { > struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer); > @@ -511,9 +526,12 @@ static void hangcheck_handler(struct timer_list *t) > if (fence != ring->hangcheck_fence) { > /* some progress has been made.. ya! */ > ring->hangcheck_fence = fence; > - } else if (fence_before(fence, ring->fctx->last_fence)) { > + ring->hangcheck_progress_retries = 0; > + } else if (fence_before(fence, ring->fctx->last_fence) && > + !made_progress(gpu, ring)) { > /* no progress and not done.. hung! */ > ring->hangcheck_fence = fence; > + ring->hangcheck_progress_retries = 0; > DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n", > gpu->name, ring->id); > DRM_DEV_ERROR(dev->dev, "%s: completed fence: %u\n", Cutting DRM_MSM_HANGCHECK_DEFAULT_PERIOD down to 250ms leads to false hang reports when ->progress is not implemented: made_progress() then always returns false, so no retries happen and the effective hang timeout is halved from 500ms to 250ms. 
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h > index 585fd9c8d45a..d8f355e9f0b2 100644 > --- a/drivers/gpu/drm/msm/msm_gpu.h > +++ b/drivers/gpu/drm/msm/msm_gpu.h > @@ -78,6 +78,8 @@ struct msm_gpu_funcs { > struct msm_gem_address_space *(*create_private_address_space) > (struct msm_gpu *gpu); > uint32_t (*get_rptr)(struct msm_gpu *gpu, struct msm_ringbuffer *ring); > + > + bool (*progress)(struct msm_gpu *gpu, struct msm_ringbuffer *ring); > }; > > /* Additional state for iommu faults: */ > @@ -236,7 +238,8 @@ struct msm_gpu { > */ > #define DRM_MSM_INACTIVE_PERIOD 66 /* in ms (roughly four frames) */ > > -#define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 500 /* in ms */ > +#define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 250 /* in ms */ > +#define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3