[PATCH 2/2] drm/i915: ban badly behaving contexts

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Now when we have mechanism in place to track which context
was guilty of hanging the gpu, it is possible to punish
for bad behaviour.

If context has recently submitted a faulty batchbuffers guilty of
gpu hang and submits another batch which hangs gpu in a 16 seconds
time window, ban it permanently.

Ban means that batchbuffers from this fd/context will not
be accepted anymore for execution.

v2: Store guilty ban status bool in gpu_error instead of pointers
    that might become danling before hang is declared.

v3: Use return value for banned status instead of stashing state
    into gpu_error (Chris Wilson)

v4: - rebase on top of fixed hang stats api
    - add define for ban period
    - rename commit and improve commit msg

Signed-off-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_drv.c            |    6 +++--
 drivers/gpu/drm/i915/i915_drv.h            |   10 +++++++-
 drivers/gpu/drm/i915/i915_gem.c            |   35 +++++++++++++++++++++-------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   12 ++++++++++
 4 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index e19dec5..8934365 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -804,6 +804,7 @@ int i915_reset(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	bool simulated;
+	bool ctx_banned;
 	int ret;
 
 	if (!i915_try_reset)
@@ -811,11 +812,12 @@ int i915_reset(struct drm_device *dev)
 
 	mutex_lock(&dev->struct_mutex);
 
-	i915_gem_reset(dev);
+	ctx_banned = i915_gem_reset(dev);
 
 	simulated = dev_priv->gpu_error.stop_rings != 0;
 
-	if (!simulated && get_seconds() - dev_priv->gpu_error.last_reset <
+	if (!(simulated || ctx_banned) &&
+	    get_seconds() - dev_priv->gpu_error.last_reset <
 	    DRM_I915_WEDGE_PERIOD) {
 		DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
 		ret = -ENODEV;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9c0ca78..9d99937 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -575,6 +575,12 @@ struct i915_ctx_hang_stats {
 
 	/* This context had batch active when hang was declared */
 	unsigned batch_active;
+
+	/* Time when this context was last blamed for a GPU reset */
+	unsigned long batch_active_reset_ts;
+
+	/* This context is banned to submit more work */
+	bool banned;
 };
 
 /* This must match up with the value previously used for execbuf2.rsvd1. */
@@ -978,6 +984,8 @@ struct i915_gpu_error {
 #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
 	/* Hangcheck needs >2 periods to declare a hang */
 #define DRM_I915_WEDGE_PERIOD DIV_ROUND_UP(5*DRM_I915_HANGCHECK_PERIOD, 1000)
+	/* Hang gpu twice in this window and your context gets banned */
+#define DRM_I915_CONTEXT_BAN_PERIOD (2 * DRM_I915_WEDGE_PERIOD)
 
 	struct timer_list hangcheck_timer;
 
@@ -1922,7 +1930,7 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 	return atomic_read(&error->reset_counter) == I915_WEDGED;
 }
 
-void i915_gem_reset(struct drm_device *dev);
+bool i915_gem_reset(struct drm_device *dev);
 bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
 int __must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj);
 int __must_check i915_gem_init(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 23c4256..0592966 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2201,16 +2201,16 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request,
 	return false;
 }
 
-static void i915_set_reset_status(struct intel_ring_buffer *ring,
+static bool i915_set_reset_status(struct intel_ring_buffer *ring,
 				  struct drm_i915_gem_request *request,
 				  u32 acthd)
 {
 	struct i915_ctx_hang_stats *hs = NULL;
-	bool inside, guilty;
+	bool inside, guilty, banned;
 	unsigned long offset = 0;
 
 	/* Innocent until proven guilty */
-	guilty = false;
+	guilty = banned = false;
 
 	if (request->batch_obj)
 		offset = i915_gem_obj_offset(request->batch_obj,
@@ -2237,11 +2237,21 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
 		hs = &request->file_priv->hang_stats;
 
 	if (hs) {
-		if (guilty)
+		if (guilty) {
+			if (!hs->banned &&
+			    get_seconds() - hs->batch_active_reset_ts <
+			    DRM_I915_CONTEXT_BAN_PERIOD) {
+				hs->banned = banned = true;
+				DRM_ERROR("context hanging too fast, declaring banned\n");
+			}
 			hs->batch_active++;
-		else
+			hs->batch_active_reset_ts = get_seconds();
+		} else {
 			hs->batch_pending++;
+		}
 	}
+
+	return banned;
 }
 
 static void i915_gem_free_request(struct drm_i915_gem_request *request)
@@ -2255,11 +2265,12 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
 	kfree(request);
 }
 
-static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
+static bool i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
 				      struct intel_ring_buffer *ring)
 {
 	u32 completed_seqno;
 	u32 acthd;
+	bool ctx_banned = false;
 
 	acthd = intel_ring_get_active_head(ring);
 	completed_seqno = ring->get_seqno(ring, false);
@@ -2272,7 +2283,8 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
 					   list);
 
 		if (request->seqno > completed_seqno)
-			i915_set_reset_status(ring, request, acthd);
+			ctx_banned |= i915_set_reset_status(ring,
+							    request, acthd);
 
 		i915_gem_free_request(request);
 	}
@@ -2286,6 +2298,8 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
 
 		i915_gem_object_move_to_inactive(obj);
 	}
+
+	return ctx_banned;
 }
 
 void i915_gem_restore_fences(struct drm_device *dev)
@@ -2309,16 +2323,19 @@ void i915_gem_restore_fences(struct drm_device *dev)
 	}
 }
 
-void i915_gem_reset(struct drm_device *dev)
+bool i915_gem_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
 	int i;
+	bool ctx_banned = false;
 
 	for_each_ring(ring, dev_priv, i)
-		i915_gem_reset_ring_lists(dev_priv, ring);
+		ctx_banned |= i915_gem_reset_ring_lists(dev_priv, ring);
 
 	i915_gem_restore_fences(dev);
+
+	return ctx_banned;
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 792c52a..50c6bf5 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -886,6 +886,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct drm_i915_gem_object *batch_obj;
 	struct drm_clip_rect *cliprects = NULL;
 	struct intel_ring_buffer *ring;
+	struct i915_ctx_hang_stats *hs;
 	u32 ctx_id = i915_execbuffer2_get_context_id(*args);
 	u32 exec_start, exec_len;
 	u32 mask, flags;
@@ -1077,6 +1078,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	if (ret)
 		goto err;
 
+	hs = i915_gem_context_get_hang_stats(dev, file, ctx_id);
+	if (IS_ERR(hs)) {
+		ret = PTR_ERR(hs);
+		goto err;
+	}
+
+	if (hs->banned) {
+		ret = -EIO;
+		goto err;
+	}
+
 	ret = i915_switch_context(ring, file, ctx_id);
 	if (ret)
 		goto err;
-- 
1.7.9.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux