[CI-ping 3/5] drm/i915: Harden detection of missed interrupts

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Only declare a missed interrupt if we find that the GPU is idle with
waiters and a hangcheck interval has passed in which no new user
interrupts have been raised.

v2: Clear the stuck interrupt marker between successful batches

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>
Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_debugfs.c     | 11 ++++++----
 drivers/gpu/drm/i915/i915_irq.c         | 38 ++++++++++++++++++++++-----------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
 3 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 919c05ba9932..9640738aabf2 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -728,10 +728,10 @@ static int i915_gem_request_info(struct seq_file *m, void *data)
 static void i915_ring_seqno_info(struct seq_file *m,
 				 struct intel_engine_cs *engine)
 {
-	if (engine->get_seqno) {
-		seq_printf(m, "Current sequence (%s): %x\n",
-			   engine->name, engine->get_seqno(engine));
-	}
+	seq_printf(m, "Current sequence (%s): %x\n",
+		   engine->name, engine->get_seqno(engine));
+	seq_printf(m, "Current user interrupts (%s): %x\n",
+		   engine->name, READ_ONCE(engine->user_interrupts));
 }
 
 static int i915_gem_seqno_info(struct seq_file *m, void *data)
@@ -1367,6 +1367,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 			   engine->hangcheck.seqno,
 			   seqno[id],
 			   engine->last_submitted_seqno);
+		seq_printf(m, "\tuser interrupts = %x [current %x]\n",
+			   engine->hangcheck.user_interrupts,
+			   READ_ONCE(engine->user_interrupts));
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
 			   (long long)acthd[id]);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 3b946e1c7614..679f08c944ef 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1000,6 +1000,7 @@ static void notify_ring(struct intel_engine_cs *engine)
 		return;
 
 	trace_i915_gem_request_notify(engine);
+	engine->user_interrupts++;
 
 	wake_up_all(&engine->irq_queue);
 }
@@ -3054,6 +3055,24 @@ ring_stuck(struct intel_engine_cs *engine, u64 acthd)
 	return HANGCHECK_HUNG;
 }
 
+static unsigned kick_waiters(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *i915 = to_i915(engine->dev);
+	unsigned user_interrupts = READ_ONCE(engine->user_interrupts);
+
+	if (engine->hangcheck.user_interrupts == user_interrupts &&
+	    !test_and_set_bit(engine->id, &i915->gpu_error.missed_irq_rings)) {
+		if (!(i915->gpu_error.test_irq_rings & intel_engine_flag(engine)))
+			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
+				  engine->name);
+		else
+			DRM_INFO("Fake missed irq on %s\n",
+				 engine->name);
+		wake_up_all(&engine->irq_queue);
+	}
+
+	return user_interrupts;
+}
 /*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -3096,6 +3115,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	for_each_engine_id(engine, dev_priv, id) {
 		u64 acthd;
 		u32 seqno;
+		unsigned user_interrupts;
 		bool busy = true;
 
 		semaphore_clear_deadlocks(dev_priv);
@@ -3113,22 +3133,15 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		acthd = intel_ring_get_active_head(engine);
 		seqno = engine->get_seqno(engine);
 
+		/* Reset stuck interrupts between batch advances */
+		user_interrupts = 0;
+
 		if (engine->hangcheck.seqno == seqno) {
 			if (ring_idle(engine, seqno)) {
 				engine->hangcheck.action = HANGCHECK_IDLE;
-
 				if (waitqueue_active(&engine->irq_queue)) {
-					/* Issue a wake-up to catch stuck h/w. */
-					if (!test_and_set_bit(engine->id, &dev_priv->gpu_error.missed_irq_rings)) {
-						if (!(dev_priv->gpu_error.test_irq_rings & intel_engine_flag(engine)))
-							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-								  engine->name);
-						else
-							DRM_INFO("Fake missed irq on %s\n",
-								 engine->name);
-						wake_up_all(&engine->irq_queue);
-					}
 					/* Safeguard against driver failure */
+					user_interrupts = kick_waiters(engine);
 					engine->hangcheck.score += BUSY;
 				} else
 					busy = false;
@@ -3179,7 +3192,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 				engine->hangcheck.score = 0;
 
 			/* Clear head and subunit states on seqno movement */
-			engine->hangcheck.acthd = 0;
+			acthd = 0;
 
 			memset(engine->hangcheck.instdone, 0,
 			       sizeof(engine->hangcheck.instdone));
@@ -3187,6 +3200,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 
 		engine->hangcheck.seqno = seqno;
 		engine->hangcheck.acthd = acthd;
+		engine->hangcheck.user_interrupts = user_interrupts;
 		busy_count += busy;
 	}
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3f04906a081f..29c54cc1ee5c 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -87,6 +87,7 @@ enum intel_ring_hangcheck_action {
 struct intel_ring_hangcheck {
 	u64 acthd;
 	u32 seqno;
+	unsigned user_interrupts;
 	int score;
 	enum intel_ring_hangcheck_action action;
 	int deadlock;
@@ -305,6 +306,7 @@ struct  intel_engine_cs {
 	 * inspecting request list.
 	 */
 	u32 last_submitted_seqno;
+	unsigned user_interrupts;
 
 	bool gpu_caches_dirty;
 
-- 
2.8.0.rc3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux