Now that we have a mechanism in place to track which context was guilty of hanging the gpu, it is possible to punish bad behaviour. If a context has recently submitted a faulty batchbuffer guilty of a gpu hang and submits another batch which hangs the gpu within a 16 second time window, ban it permanently. Ban means that batchbuffers from this fd/context will not be accepted anymore for execution. v2: Store guilty ban status bool in gpu_error instead of pointers that might become dangling before hang is declared. v3: Use return value for banned status instead of stashing state into gpu_error (Chris Wilson) v4: - rebase on top of fixed hang stats api - add define for ban period - rename commit and improve commit msg Signed-off-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_drv.c | 6 +++-- drivers/gpu/drm/i915/i915_drv.h | 10 +++++++- drivers/gpu/drm/i915/i915_gem.c | 35 +++++++++++++++++++++------- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 ++++++++++ 4 files changed, 51 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index e19dec5..8934365 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -804,6 +804,7 @@ int i915_reset(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; bool simulated; + bool ctx_banned; int ret; if (!i915_try_reset) @@ -811,11 +812,12 @@ int i915_reset(struct drm_device *dev) mutex_lock(&dev->struct_mutex); - i915_gem_reset(dev); + ctx_banned = i915_gem_reset(dev); simulated = dev_priv->gpu_error.stop_rings != 0; - if (!simulated && get_seconds() - dev_priv->gpu_error.last_reset < + if (!(simulated || ctx_banned) && + get_seconds() - dev_priv->gpu_error.last_reset < DRM_I915_WEDGE_PERIOD) { DRM_ERROR("GPU hanging too fast, declaring wedged!\n"); ret = -ENODEV; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 9c0ca78..9d99937 100644 --- a/drivers/gpu/drm/i915/i915_drv.h 
+++ b/drivers/gpu/drm/i915/i915_drv.h @@ -575,6 +575,12 @@ struct i915_ctx_hang_stats { /* This context had batch active when hang was declared */ unsigned batch_active; + + /* Time when this context was last blamed for a GPU reset */ + unsigned long batch_active_reset_ts; + + /* This context is banned to submit more work */ + bool banned; }; /* This must match up with the value previously used for execbuf2.rsvd1. */ @@ -978,6 +984,8 @@ struct i915_gpu_error { #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD) /* Hangcheck needs >2 periods to declare a hang */ #define DRM_I915_WEDGE_PERIOD DIV_ROUND_UP(5*DRM_I915_HANGCHECK_PERIOD, 1000) + /* Hang gpu twice in this window and your context gets banned */ +#define DRM_I915_CONTEXT_BAN_PERIOD (2 * DRM_I915_WEDGE_PERIOD) struct timer_list hangcheck_timer; @@ -1922,7 +1930,7 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error) return atomic_read(&error->reset_counter) == I915_WEDGED; } -void i915_gem_reset(struct drm_device *dev); +bool i915_gem_reset(struct drm_device *dev); bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force); int __must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj); int __must_check i915_gem_init(struct drm_device *dev); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 23c4256..0592966 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2201,16 +2201,16 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request, return false; } -static void i915_set_reset_status(struct intel_ring_buffer *ring, +static bool i915_set_reset_status(struct intel_ring_buffer *ring, struct drm_i915_gem_request *request, u32 acthd) { struct i915_ctx_hang_stats *hs = NULL; - bool inside, guilty; + bool inside, guilty, banned; unsigned long offset = 0; /* Innocent until proven guilty */ - guilty = false; + guilty = banned = false; if (request->batch_obj) 
offset = i915_gem_obj_offset(request->batch_obj, @@ -2237,11 +2237,21 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring, hs = &request->file_priv->hang_stats; if (hs) { - if (guilty) + if (guilty) { + if (!hs->banned && + get_seconds() - hs->batch_active_reset_ts < + DRM_I915_CONTEXT_BAN_PERIOD) { + hs->banned = banned = true; + DRM_ERROR("context hanging too fast, declaring banned\n"); + } hs->batch_active++; - else + hs->batch_active_reset_ts = get_seconds(); + } else { hs->batch_pending++; + } } + + return banned; } static void i915_gem_free_request(struct drm_i915_gem_request *request) @@ -2255,11 +2265,12 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request) kfree(request); } -static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv, +static bool i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv, struct intel_ring_buffer *ring) { u32 completed_seqno; u32 acthd; + bool ctx_banned = false; acthd = intel_ring_get_active_head(ring); completed_seqno = ring->get_seqno(ring, false); @@ -2272,7 +2283,8 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv, list); if (request->seqno > completed_seqno) - i915_set_reset_status(ring, request, acthd); + ctx_banned |= i915_set_reset_status(ring, + request, acthd); i915_gem_free_request(request); } @@ -2286,6 +2298,8 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv, i915_gem_object_move_to_inactive(obj); } + + return ctx_banned; } void i915_gem_restore_fences(struct drm_device *dev) @@ -2309,16 +2323,19 @@ void i915_gem_restore_fences(struct drm_device *dev) } } -void i915_gem_reset(struct drm_device *dev) +bool i915_gem_reset(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct intel_ring_buffer *ring; int i; + bool ctx_banned = false; for_each_ring(ring, dev_priv, i) - i915_gem_reset_ring_lists(dev_priv, ring); + ctx_banned |= i915_gem_reset_ring_lists(dev_priv, ring); 
i915_gem_restore_fences(dev); + + return ctx_banned; } /** diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 792c52a..50c6bf5 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -886,6 +886,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, struct drm_i915_gem_object *batch_obj; struct drm_clip_rect *cliprects = NULL; struct intel_ring_buffer *ring; + struct i915_ctx_hang_stats *hs; u32 ctx_id = i915_execbuffer2_get_context_id(*args); u32 exec_start, exec_len; u32 mask, flags; @@ -1077,6 +1078,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, if (ret) goto err; + hs = i915_gem_context_get_hang_stats(dev, file, ctx_id); + if (IS_ERR(hs)) { + ret = PTR_ERR(hs); + goto err; + } + + if (hs->banned) { + ret = -EIO; + goto err; + } + ret = i915_switch_context(ring, file, ctx_id); if (ret) goto err; -- 1.7.9.5 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx