Mika Kuoppala <mika.kuoppala at linux.intel.com> writes: > If context has recently submitted a faulty batchbuffers guilty of > gpu hang and decides to keep submitting more crap, ban it permanently. > > Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com> > --- > drivers/gpu/drm/i915/i915_drv.c | 23 ++++++++++++++++++++++- > drivers/gpu/drm/i915/i915_drv.h | 7 +++++++ > drivers/gpu/drm/i915/i915_gem.c | 7 +++++-- > drivers/gpu/drm/i915/i915_gem_execbuffer.c | 13 +++++++++++++ > 4 files changed, 47 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index a5b8aa9..0928f11 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -852,6 +852,8 @@ int intel_gpu_reset(struct drm_device *dev) > int i915_reset(struct drm_device *dev) > { > drm_i915_private_t *dev_priv = dev->dev_private; > + struct i915_ctx_hang_stats *hs; > + bool do_wedge = true; > int ret; > > if (!i915_try_reset) > @@ -859,10 +861,29 @@ int i915_reset(struct drm_device *dev) > > mutex_lock(&dev->struct_mutex); > > + /* i915_gem_reset will set this if it finds guilty context */ > + dev_priv->gpu_error.hang_stats = NULL; > + > i915_gem_reset(dev); > > + hs = dev_priv->gpu_error.hang_stats; > + > + if (hs) { > + if (hs->batch_active == 1) { > + do_wedge = false; > + } else if (!hs->banned && > + get_seconds() - hs->batch_active_reset_ts < 5) { > + hs->banned = true; > + do_wedge = false; > + } > + > + hs->batch_active_reset_ts = get_seconds(); > + } > + > + dev_priv->gpu_error.hang_stats = NULL; > + > ret = -ENODEV; > - if (get_seconds() - dev_priv->gpu_error.last_reset < 5) > + if (do_wedge && get_seconds() - dev_priv->gpu_error.last_reset < 5) > DRM_ERROR("GPU hanging too fast, declaring wedged!\n"); > else > ret = intel_gpu_reset(dev); > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 8223908..30ba79c 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -458,6 +458,12 @@ struct i915_ctx_hang_stats { > > /* This context had batch active when hang was declared */ > unsigned batch_active; > + > + /* Time when this context was last blamed for a GPU reset */ > + unsigned long batch_active_reset_ts; > + > + /* This context is banned to submit more work */ > + bool banned; > }; > > /* This must match up with the value previously used for execbuf2.rsvd1. */ > @@ -831,6 +837,7 @@ struct i915_gpu_error { > struct work_struct work; > > unsigned long last_reset; > + struct i915_ctx_hang_stats *hang_stats; > > /** > * State variable and reset counter controlling the reset flow > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 475b6ad..ca5c9c3 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -2147,6 +2147,7 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring, > struct drm_i915_gem_request *request, > u32 acthd) > { > + struct drm_i915_private *dev_priv = ring->dev->dev_private; > struct i915_ctx_hang_stats *hs = NULL; > bool inside, guilty; > > @@ -2175,10 +2176,12 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring, > hs = &request->file_priv->hang_stats; > > if (hs) { > - if (guilty) > + if (guilty) { > hs->batch_active++; > - else > + dev_priv->gpu_error.hang_stats = hs; This is wrong. The context can be destroyed shortly after, before reset is handled making this a dangling pointer. > + } else { > hs->batch_pending++; > + } > } > } > > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c > index bd1750a..f1b1ea9 100644 > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c > @@ -844,6 +844,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, > struct drm_clip_rect *cliprects = NULL; > struct intel_ring_buffer *ring; > struct i915_hw_context *ctx; > + struct i915_ctx_hang_stats *hs; > u32 ctx_id = i915_execbuffer2_get_context_id(*args); > u32 exec_start, exec_len; > u32 mask, flags; > @@ -1026,6 +1027,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, > if (ret) > goto err; > > + hs = i915_gem_context_get_hang_stats(&dev_priv->ring[RCS], > + file, ctx_id); > + if (IS_ERR(hs)) { > + ret = PTR_ERR(hs); > + goto err; > + } > + > + if (hs->banned) { > + ret = -EIO; > + goto err; > + } > + > ctx = i915_switch_context(ring, file, ctx_id); > if (IS_ERR(ctx)) { > ret = PTR_ERR(ctx); > -- > 1.7.9.5