[RFC 02/13] drm/i915: Improved hang detection logic

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



>From 53c34924274048bed18364ba71b83b2c8fcabe9b Mon Sep 17 00:00:00 2001
Message-Id: <53c34924274048bed18364ba71b83b2c8fcabe9b.1387201899.git.ian.lister@xxxxxxxxx>
In-Reply-To: <cover.1387201899.git.ian.lister@xxxxxxxxx>
References: <cover.1387201899.git.ian.lister@xxxxxxxxx>
From: ian-lister <ian.lister@xxxxxxxxx>
Date: Thu, 5 Dec 2013 15:52:37 +0000
Subject: [RFC 02/13] drm/i915: Improved hang detection logic

The i915_hangcheck_elapsed function has been split up to make it easier
to understand. The majority of the work is now done in
i915_hangcheck_ring_sample. This function is called for each ring and
it assesses the state to see if there has been any progress since the
last sample. If not it updates the hangcheck score and action.

The hang detection has been modified to detect hangs on commands
inserted directly to the ring by the driver. Previously it was oriented
around batch submissions with the check for progress based on whether
the seqno hadd changed since the previous sample. It now uses the head
and active head so that it can detect progress between individual
commands on the ring. To accommodate this change the queue_hangcheck
function is now called from the lowest level of ring submission
(__intel_ring_advance) so that hang detection will be started by adding
any work to a ring.

ring_stuck() will only be called if there has not been progress since
the last sample so the check for (hangcheck.acthd != acthd) has been
removed.

The semaphore_clear_deadlocks() function has been renamed to
semaphore_clear_deadlock_flags() to more accurately reflect its usage.

A TDR specific debug macro has been added so that TDR debug can be
selectively enabled at run time.

Signed-off-by: ian-lister <ian.lister@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_gem.c         |   2 -
 drivers/gpu/drm/i915/i915_irq.c         | 224 +++++++++++++++++++-------------
 drivers/gpu/drm/i915/intel_ringbuffer.c |   6 +
 drivers/gpu/drm/i915/intel_ringbuffer.h |   1 +
 include/drm/drmP.h                      |   7 +
 5 files changed, 147 insertions(+), 93 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 7ed1fab..976b9d8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2174,8 +2174,6 @@ int __i915_add_request(struct intel_ring_buffer *ring,
 	ring->preallocated_lazy_request = NULL;
 
 	if (!dev_priv->ums.mm_suspended) {
-		i915_queue_hangcheck(ring->dev);
-
 		if (was_empty) {
 			cancel_delayed_work_sync(&dev_priv->mm.idle_work);
 			queue_delayed_work(dev_priv->wq,
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 289985b..c2ecea1 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2674,7 +2674,7 @@ static int semaphore_passed(struct intel_ring_buffer *ring)
 	return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
 }
 
-static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
+static void semaphore_clear_deadlock_flags(struct drm_i915_private *dev_priv)
 {
 	struct intel_ring_buffer *ring;
 	int i;
@@ -2684,15 +2684,12 @@ static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
 }
 
 static enum intel_ring_hangcheck_action
-ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
+ring_stuck(struct intel_ring_buffer *ring)
 {
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 tmp;
 
-	if (ring->hangcheck.acthd != acthd)
-		return HANGCHECK_ACTIVE;
-
 	if (IS_GEN2(dev))
 		return HANGCHECK_HUNG;
 
@@ -2721,6 +2718,132 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
 	return HANGCHECK_HUNG;
 }
 
+
+/**
+ * Sample the current state of the ring and determine whether any
+ * progress has been made since the previous sample. A scoring system
+ * is used to decide when to trigger recovery work if the ring appears
+ * to have hung. Returns true if there is more work to do or a hang
+ * has been detected.
+ */
+static bool i915_hangcheck_ring_sample(struct intel_ring_buffer *ring)
+{
+	struct drm_device *dev = ring->dev;
+        drm_i915_private_t *dev_priv = dev->dev_private;
+	struct i915_gpu_error *gpu_error = &dev_priv->gpu_error;
+        uint32_t head, tail, acthd, seqno;
+        bool idle;
+	bool busy = true;
+#define BUSY 1
+#define KICK 5
+#define HUNG 20
+
+	semaphore_clear_deadlock_flags(dev_priv);
+
+	head = I915_READ_HEAD(ring) & HEAD_ADDR;
+	tail = I915_READ_TAIL(ring) & TAIL_ADDR;
+	seqno = ring->get_seqno(ring, false);
+	acthd = intel_ring_get_active_head(ring);
+
+	/* The ring is considered idle when there are no more commands
+	 * to execute and all outstanding requests have completed.
+	 * (There may still be requests waiting to be retired)
+	 */
+	idle = ((head == tail) && (ring_idle(ring, seqno)));
+
+	/* Determine if the ring has progressed since the last sample.
+	 * HEAD allows us to distinguish between commands on the ring
+	 * where as ACTHD is more likely to be progressing as it can
+	 * also point within a batch buffer. The combination of the two
+	 * prevents false positives by distinguishing between commands
+	 * and batch submissions and ensuring that we get good
+	 * visibility of progress.
+	 */
+	if ((head == ring->hangcheck.hd)
+	&& (acthd == ring->hangcheck.acthd)) {
+		/* The HW hasn't advanced since the last sample */
+		if (idle) {
+			ring->hangcheck.action = HANGCHECK_IDLE;
+
+			if (waitqueue_active(&ring->irq_queue)) {
+				/* Issue a wake-up to catch stuck h/w.*/
+				if (!test_and_set_bit(ring->id,
+					&gpu_error->missed_irq_rings)) {
+					if (!(gpu_error->test_irq_rings
+						 & intel_ring_flag(ring)))
+						DRM_ERROR("Timer %s idle\n",
+							ring->name);
+					else
+						DRM_INFO("Fake missed irq %s\n",
+							ring->name);
+					wake_up_all(&ring->irq_queue);
+				}
+
+				/* Safeguard against driver failure */
+				ring->hangcheck.score += BUSY;
+			} else
+				busy = false;
+		} else {
+			/* We always increment the hangcheck score
+			 * if the ring is busy and still processing
+			 * the same request, so that no single request
+			 * can run indefinitely (such as a chain of
+			 * batches). The only time we do not increment
+			 * the hangcheck score on this ring, if this
+			 * ring is in a legitimate wait for another
+			 * ring. In that case the waiting ring is a
+			 * victim and we want to be sure we catch the
+			 * right culprit. Then every time we do kick
+			 * the ring, add a small increment to the
+			 * score so that we can catch a batch that is
+			 * being repeatedly kicked and so responsible
+			 * for stalling the machine.
+			 */
+			ring->hangcheck.action = ring_stuck(ring);
+
+			switch (ring->hangcheck.action) {
+			case HANGCHECK_IDLE:
+			case HANGCHECK_WAIT:
+				break;
+			case HANGCHECK_ACTIVE:
+				ring->hangcheck.score += BUSY;
+				break;
+			case HANGCHECK_KICK:
+				ring->hangcheck.score += KICK;
+				break;
+			case HANGCHECK_HUNG:
+				ring->hangcheck.score += HUNG;
+				break;
+			}
+		}
+	} else {
+		ring->hangcheck.action = HANGCHECK_ACTIVE;
+
+		/* If the ring has progressed to a new command then
+		 * gradually decrease the score. This should help to
+		 * catch DoS attempts accross multiple batches. If we
+		 * are still processing the same ring command then
+		 * gradually increase the score so that no request is
+		 * allowed to run indefinitely.*/
+		if (ring->hangcheck.hd == head)
+			ring->hangcheck.score += BUSY;
+		else if (ring->hangcheck.score > 0)
+			ring->hangcheck.score--;
+	}
+
+	DRM_DEBUG_TDR(
+                "[%d] AHD: 0x%08x HD: 0x%08x TL: 0x%08x I: %d Sq: %d Sc: %d (%s)\n",
+                ring->id, acthd, head, tail, idle, seqno,
+       	        ring->hangcheck.score, ring->name);
+
+	/* Always update last sampled state */
+	ring->hangcheck.seqno = seqno;
+	ring->hangcheck.acthd = acthd;
+	ring->hangcheck.hd = head;
+
+	return busy;	
+}
+
 /**
  * This function is called periodically while the GPU has work to do.
  * It assesses the current state to see if the ring appears to be moving.
@@ -2734,105 +2857,24 @@ static void i915_hangcheck_elapsed(unsigned long data)
 	struct intel_ring_buffer *ring;
 	int i;
 	int busy_count = 0, rings_hung = 0;
-	bool stuck[I915_NUM_RINGS] = { 0 };
 	u32 tmp;
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
-#define FIRE 30
+#define FIRE 50
 
 	/* Clear the active flag *before* assessing the ring state
         * in case new work is added just after we sample the rings.
         * This will allow new work to re-trigger the timer even
         * if we see the rings as idle on this occasion.
-		*/
+	*/
         atomic_set(&dev_priv->gpu_error.active, 0);
 
 	if (!i915_enable_hangcheck)
 		return;
 
 	for_each_ring(ring, dev_priv, i) {
-		u32 seqno, acthd;
-		bool busy = true;
-
-		semaphore_clear_deadlocks(dev_priv);
-
-		seqno = ring->get_seqno(ring, false);
-		acthd = intel_ring_get_active_head(ring);
-
-		if (ring->hangcheck.seqno == seqno) {
-			if (ring_idle(ring, seqno)) {
-				ring->hangcheck.action = HANGCHECK_IDLE;
-
-				if (waitqueue_active(&ring->irq_queue)) {
-					/* Issue a wake-up to catch stuck h/w. */
-					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
-						if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
-							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-								  ring->name);
-						else
-							DRM_INFO("Fake missed irq on %s\n",
-								 ring->name);
-						wake_up_all(&ring->irq_queue);
-					}
-					/* Safeguard against driver failure */
-					ring->hangcheck.score += BUSY;
-				} else
-					busy = false;
-			} else {
-				/* We always increment the hangcheck score
-				 * if the ring is busy and still processing
-				 * the same request, so that no single request
-				 * can run indefinitely (such as a chain of
-				 * batches). The only time we do not increment
-				 * the hangcheck score on this ring, if this
-				 * ring is in a legitimate wait for another
-				 * ring. In that case the waiting ring is a
-				 * victim and we want to be sure we catch the
-				 * right culprit. Then every time we do kick
-				 * the ring, add a small increment to the
-				 * score so that we can catch a batch that is
-				 * being repeatedly kicked and so responsible
-				 * for stalling the machine.
-				 */
-				ring->hangcheck.action = ring_stuck(ring,
-								    acthd);
-
-				switch (ring->hangcheck.action) {
-				case HANGCHECK_IDLE:
-				case HANGCHECK_WAIT:
-					break;
-				case HANGCHECK_ACTIVE:
-					ring->hangcheck.score += BUSY;
-					break;
-				case HANGCHECK_KICK:
-					ring->hangcheck.score += KICK;
-					break;
-				case HANGCHECK_HUNG:
-					ring->hangcheck.score += HUNG;
-					stuck[i] = true;
-					break;
-				}
-			}
-		} else {
-			ring->hangcheck.action = HANGCHECK_ACTIVE;
-
-			/* Gradually reduce the count so that we catch DoS
-			 * attempts across multiple batches.
-			 */
-			if (ring->hangcheck.score > 0)
-				ring->hangcheck.score--;
-		}
+		busy_count += i915_hangcheck_ring_sample(ring);
 
-		ring->hangcheck.seqno = seqno;
-		ring->hangcheck.acthd = acthd;
-		busy_count += busy;
-	}
-
-	for_each_ring(ring, dev_priv, i) {
 		if (ring->hangcheck.score > FIRE) {
-			DRM_INFO("%s on %s\n",
-				 stuck[i] ? "stuck" : "no progress",
+			DRM_ERROR("Hang detected on %s\n",
 				 ring->name);
 
 			/* Is the chip hanging on a WAIT_FOR_EVENT?
@@ -2864,7 +2906,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
 		return i915_handle_error(dev, true);
 
 	if (busy_count)
-		/* Reset timer case chip hangs without another request
+		/* Restart the timer in case the chip hangs without another request
 		 * being added */
 		i915_queue_hangcheck(dev);
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 6d2f940..7614bef 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -45,6 +45,12 @@ void __intel_ring_advance(struct intel_ring_buffer *ring)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 
+	/* Work is being added to the ring so notify the hangcheck
+	 * logic to start the timer if it is not already running.
+	 */
+	if (!dev_priv->ums.mm_suspended)
+		i915_queue_hangcheck(ring->dev);
+
 	ring->tail &= ring->size - 1;
 	if (dev_priv->gpu_error.stop_rings & intel_ring_flag(ring))
 		return;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 71a73f4..f4744bf 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -45,6 +45,7 @@ struct intel_ring_hangcheck {
 	bool deadlock;
 	u32 seqno;
 	u32 acthd;
+	u32 hd;
 	int score;
 	enum intel_ring_hangcheck_action action;
 };
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index 1d4a920..73309c5 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -90,6 +90,7 @@ struct videomode;
 #define DRM_UT_DRIVER		0x02
 #define DRM_UT_KMS		0x04
 #define DRM_UT_PRIME		0x08
+#define DRM_UT_TDR		0x10
 /*
  * Three debug levels are defined.
  * drm_core, drm_driver, drm_kms
@@ -211,6 +212,11 @@ int drm_err(const char *func, const char *format, ...);
 		drm_ut_debug_printk(DRM_UT_PRIME, DRM_NAME,		\
 					__func__, fmt, ##args);		\
 	} while (0)
+#define DRM_DEBUG_TDR(fmt, args...)					\
+	do {								\
+		drm_ut_debug_printk(DRM_UT_TDR, DRM_NAME,		\
+					__func__, fmt, ##args);		\
+	} while (0)
 #define DRM_LOG(fmt, args...)						\
 	do {								\
 		drm_ut_debug_printk(DRM_UT_CORE, NULL,			\
@@ -235,6 +241,7 @@ int drm_err(const char *func, const char *format, ...);
 #define DRM_DEBUG_DRIVER(fmt, args...) do { } while (0)
 #define DRM_DEBUG_KMS(fmt, args...)	do { } while (0)
 #define DRM_DEBUG_PRIME(fmt, args...)	do { } while (0)
+#define DRM_DEBUG_TDR(fmt, args...)	do { } while (0)
 #define DRM_DEBUG(fmt, arg...)		 do { } while (0)
 #define DRM_LOG(fmt, arg...)		do { } while (0)
 #define DRM_LOG_KMS(fmt, args...) do { } while (0)
-- 
1.8.5.1


_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux