[RFC 11/11] drm/i915: TDR/watchdog trace points.

Tomas Elf <tomas.elf@xxxxxxxxx> · Mon, 8 Jun 2015 18:03:29 +0100

Defined trace points and sprinkled the usage of these throughout the
TDR/watchdog implementation.

The following trace points are supported:

	1. trace_i915_tdr_gpu_recovery:
	Called at the onset of the full GPU reset recovery path.

	2. trace_i915_tdr_engine_recovery:
	Called at the onset of the per-engine recovery path.

	3. i915_tdr_recovery_start:
	Called at the onset of hang recovery before recovery mode has been
	decided.

	4. i915_tdr_recovery_complete:
	Called at the point of hang recovery completion.

	5. i915_tdr_recovery_queued:
	Called once the error handler decides to schedule the actual hang
	recovery, which marks the end of the hang detection path.

	6. i915_tdr_engine_save:
	Called at the point of saving the engine state during per-engine hang
	recovery.

	7. i915_tdr_gpu_reset_complete:
	Called at the point of full GPU reset recovery completion.

	8. i915_tdr_engine_reset_complete:
	Called at the point of per-engine recovery completion.

	9. i915_tdr_forced_csb_check:
	Called at the completion of a forced CSB check.

	10. i915_tdr_hang_check:
	Called for every engine in the periodic hang checker loop before moving
	on to the next engine. Provides an overview of all hang check stats in
	real-time. The collected stats are:

		a. Engine name.

		b. Current engine seqno.

		c. Seqno of previous hang check iteration for that engine.

		d. ACTHD register value of given engine.

		e. Current hang check score of given engine (and whether or not
		the engine has been detected as hung).

		f. Current action for given engine.

		g. Busyness of given engine.

		h. Submission status of currently running context on given engine.

Signed-off-by: Tomas Elf <tomas.elf@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_drv.c       |    3 +
 drivers/gpu/drm/i915/i915_drv.h       |   19 +++
 drivers/gpu/drm/i915/i915_gpu_error.c |    2 +-
 drivers/gpu/drm/i915/i915_irq.c       |   10 +-
 drivers/gpu/drm/i915/i915_trace.h     |  298 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c      |    8 +-
 drivers/gpu/drm/i915/intel_uncore.c   |    4 +
 7 files changed, 340 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index e1629a6..8030b92 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -845,6 +845,7 @@ int i915_reset(struct drm_device *dev)
 	if (!i915.reset)
 		return 0;
 
+	trace_i915_tdr_gpu_recovery(dev);
 	intel_reset_gt_powersave(dev);
 
 	mutex_lock(&dev->struct_mutex);
@@ -952,6 +953,8 @@ int i915_reset_engine(struct intel_engine_cs *engine)
 
 	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
 
+	trace_i915_tdr_engine_recovery(engine);
+
         /* Take wake lock to prevent power saving mode */
 	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5139daa..c8d62d2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2323,6 +2323,24 @@ enum context_submission_status {
 	CONTEXT_SUBMISSION_STATUS_UNDEFINED
 };
 
+static inline const char*
+i915_context_submission_status_to_str(enum context_submission_status status)
+{
+	switch(status)
+	{
+		case CONTEXT_SUBMISSION_STATUS_OK:
+			return "ok";
+		case CONTEXT_SUBMISSION_STATUS_INCONSISTENT:
+			return "inconsistent";
+		case CONTEXT_SUBMISSION_STATUS_NONE_SUBMITTED:
+			return "none";
+		case CONTEXT_SUBMISSION_STATUS_UNDEFINED:
+			return "undefined";
+		default:
+			return "invalid";
+	}
+}
+
 /* Note that the (struct drm_i915_private *) cast is just to shut up gcc. */
 #define __I915__(p) ({ \
 	struct drm_i915_private *__p; \
@@ -3129,6 +3147,7 @@ void i915_destroy_error_state(struct drm_device *dev);
 
 void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone);
 const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
+const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a);
 
 /* i915_cmd_parser.c */
 int i915_cmd_parser_get_version(void);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index ac22614..cee1825 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -220,7 +220,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 	}
 }
 
-static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
+const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
 {
 	switch (a) {
 	case HANGCHECK_IDLE:
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 56bd967..66a8456 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2359,6 +2359,7 @@ static void i915_error_work_func(struct work_struct *work)
 
 	mutex_lock(&dev->struct_mutex);
 
+	trace_i915_tdr_recovery_start(dev);
 	kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event);
 
 	for_each_ring(ring, dev_priv, i) {
@@ -2488,6 +2489,8 @@ static void i915_error_work_func(struct work_struct *work)
 
 	kobject_uevent_env(&dev->primary->kdev->kobj,
 			   KOBJ_CHANGE, reset_done_event);
+
+	trace_i915_tdr_recovery_complete(dev);
 }
 
 static void i915_report_and_clear_eir(struct drm_device *dev)
@@ -2618,6 +2621,7 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	va_list args;
 	char error_msg[80];
+	bool full_reset = true;
 
 	struct intel_engine_cs *engine;
 
@@ -2635,7 +2639,6 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 		 *	2. The hardware does not support it (pre-gen7).
 		 *	3. We already tried per-engine reset recently.
 		 */
-		bool full_reset = true;
 
 		/*
 		 * TBD: We currently only support per-engine reset for gen8+.
@@ -2659,6 +2662,7 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 						i915.gpu_reset_promotion_time;
 
 					engine->hangcheck.last_engine_reset_time = now;
+
 				} else {
 					/*
 					 * Watchdog timeout always results
@@ -2707,6 +2711,8 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 		i915_error_wake_up(dev_priv, false);
 	}
 
+	trace_i915_tdr_recovery_queued(dev, engine_mask, watchdog, full_reset);
+
 	/*
 	 * Gen 7:
 	 *
@@ -3212,6 +3218,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		ring->hangcheck.seqno = seqno;
 		ring->hangcheck.acthd = acthd;
 		busy_count += busy;
+
+		trace_i915_tdr_hang_check(ring, seqno, acthd, busy, status[i]);
 	}
 
 	for_each_ring(ring, dev_priv, i) {
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 2aa140e..740033e 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -775,6 +775,304 @@ TRACE_EVENT(switch_mm,
 		  __entry->dev, __entry->ring, __entry->to, __entry->vm)
 );
 
+/**
+ * DOC: i915_tdr_gpu_recovery
+ *
+ * This tracepoint tracks the onset of the full GPU recovery path
+ */
+TRACE_EVENT(i915_tdr_gpu_recovery,
+	TP_PROTO(struct drm_device *dev),
+
+	TP_ARGS(dev),
+
+	TP_STRUCT__entry(
+			__field(u32, dev)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev->primary->index;
+	),
+
+	TP_printk("dev=%u, full GPU recovery started",
+		  __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_engine_recovery
+ *
+ * This tracepoint tracks the onset of the engine recovery path
+ */
+TRACE_EVENT(i915_tdr_engine_recovery,
+	TP_PROTO(struct intel_engine_cs *engine),
+
+	TP_ARGS(engine),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+	),
+
+	TP_printk("dev=%u, engine=%u, recovery of %s started",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->id,
+		  __entry->engine->name)
+);
+
+/**
+ * DOC: i915_tdr_recovery_start
+ *
+ * This tracepoint tracks hang recovery start
+ */
+TRACE_EVENT(i915_tdr_recovery_start,
+	TP_PROTO(struct drm_device *dev),
+
+	TP_ARGS(dev),
+
+	TP_STRUCT__entry(
+			__field(u32, dev)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev->primary->index;
+	),
+
+	TP_printk("dev=%u, hang recovery started",
+		  __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_recovery_complete
+ *
+ * This tracepoint tracks hang recovery completion
+ */
+TRACE_EVENT(i915_tdr_recovery_complete,
+	TP_PROTO(struct drm_device *dev),
+
+	TP_ARGS(dev),
+
+	TP_STRUCT__entry(
+			__field(u32, dev)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev->primary->index;
+	),
+
+	TP_printk("dev=%u, hang recovery completed",
+		  __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_recovery_queued
+ *
+ * This tracepoint tracks the point of queuing recovery from hang check.
+ * If engine recovery is requested engine name will be displayed, otherwise
+ * it will be set to "none". If too many engine reset was attempted in the
+ * previous history we promote to full GPU reset, which is remarked by appending
+ * the "[PROMOTED]" flag.
+ */
+TRACE_EVENT(i915_tdr_recovery_queued,
+	TP_PROTO(struct drm_device *dev,
+		 u32 hung_engines,
+		 bool watchdog,
+		 bool full_reset),
+
+	TP_ARGS(dev, hung_engines, watchdog, full_reset),
+
+	TP_STRUCT__entry(
+			__field(u32, dev)
+			__field(u32, hung_engines)
+			__field(bool, watchdog)
+			__field(bool, full_reset)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev->primary->index;
+			__entry->hung_engines = hung_engines;
+			__entry->watchdog = watchdog;
+			__entry->full_reset = full_reset;
+	),
+
+	TP_printk("dev=%u, hung_engines=0x%02x%s%s%s%s%s%s%s, watchdog=%s, full_reset=%s",
+		  __entry->dev,
+		  __entry->hung_engines,
+		  __entry->hung_engines ? " (":"",
+		  __entry->hung_engines & RENDER_RING ? " [RCS] " : "",
+		  __entry->hung_engines & BSD_RING ? 	" [VCS] " : "",
+		  __entry->hung_engines & BLT_RING ? 	" [BCS] " : "",
+		  __entry->hung_engines & VEBOX_RING ? 	" [VECS] " : "",
+		  __entry->hung_engines & BSD2_RING ? 	" [VCS2] " : "",
+		  __entry->hung_engines ? ")":"",
+		  __entry->watchdog ? "true" : "false",
+		  __entry->full_reset ?
+			(__entry->hung_engines ? "true [PROMOTED]" : "true") :
+				"false")
+);
+
+/**
+ * DOC: i915_tdr_engine_save
+ *
+ * This tracepoint tracks the point of engine state save during the engine
+ * recovery path. Logs the head pointer position at point of hang, the position
+ * after recovering and whether or not we forced a head pointer advancement or
+ * rounded up to an aligned QWORD position.
+ */
+TRACE_EVENT(i915_tdr_engine_save,
+	TP_PROTO(struct intel_engine_cs *engine,
+		 u32 old_head,
+		 u32 new_head,
+		 bool forced_advance),
+
+	TP_ARGS(engine, old_head, new_head, forced_advance),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+			__field(u32, old_head)
+			__field(u32, new_head)
+			__field(bool, forced_advance)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+			__entry->old_head = old_head;
+			__entry->new_head = new_head;
+			__entry->forced_advance = forced_advance;
+	),
+
+	TP_printk("dev=%u, engine=%s, old_head=%u, new_head=%u, forced_advance=%s",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->old_head,
+		  __entry->new_head,
+		  __entry->forced_advance ? "true" : "false")
+);
+
+/**
+ * DOC: i915_tdr_gpu_reset_complete
+ *
+ * This tracepoint tracks the point of full GPU reset completion
+ */
+TRACE_EVENT(i915_tdr_gpu_reset_complete,
+	TP_PROTO(struct drm_device *dev),
+
+	TP_ARGS(dev),
+
+	TP_STRUCT__entry(
+			__field(struct drm_device *, dev)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev;
+	),
+
+	TP_printk("dev=%u, resets=%u",
+		__entry->dev->primary->index,
+		i915_reset_count(&((struct drm_i915_private *)
+			(__entry->dev)->dev_private)->gpu_error) )
+);
+
+/**
+ * DOC: i915_tdr_engine_reset_complete
+ *
+ * This tracepoint tracks the point of engine reset completion
+ */
+TRACE_EVENT(i915_tdr_engine_reset_complete,
+	TP_PROTO(struct intel_engine_cs *engine),
+
+	TP_ARGS(engine),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+	),
+
+	TP_printk("dev=%u, engine=%s, resets=%u",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->engine->hangcheck.reset_count)
+);
+
+/**
+ * DOC: i915_tdr_forced_csb_check
+ *
+ * This tracepoint tracks the occurences of forced CSB checks
+ * that the driver does when detecting inconsistent context
+ * submission states between the driver state and the current
+ * CPU engine state.
+ */
+TRACE_EVENT(i915_tdr_forced_csb_check,
+	TP_PROTO(struct intel_engine_cs *engine,
+		 bool was_effective),
+
+	TP_ARGS(engine, was_effective),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+			__field(bool, was_effective)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+			__entry->was_effective = was_effective;
+	),
+
+	TP_printk("dev=%u, engine=%s, was_effective=%s",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->was_effective ? "yes" : "no")
+);
+
+/**
+ * DOC: i915_tdr_hang_check
+ *
+ * This tracepoint tracks hang checks on each engine.
+ */
+TRACE_EVENT(i915_tdr_hang_check,
+	TP_PROTO(struct intel_engine_cs *engine,
+		 u32 seqno,
+		 u64 acthd,
+		 bool busy,
+		 enum context_submission_status status),
+
+	TP_ARGS(engine, seqno, acthd, busy, status),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+			__field(u32, seqno)
+			__field(u64, acthd)
+			__field(bool, busy)
+			__field(enum context_submission_status, status)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+			__entry->seqno = seqno;
+			__entry->acthd = acthd;
+			__entry->busy = busy;
+			__entry->status = status;
+	),
+
+	TP_printk("dev=%u, engine=%s, seqno=%u, last seqno=%u, acthd=%lu, score=%d%s, action=%u [%s], busy=%s, status=%u [%s]",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->seqno,
+		  __entry->engine->hangcheck.seqno,
+		  (long unsigned int) __entry->acthd,
+		  __entry->engine->hangcheck.score,
+		  (__entry->engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) ? " [HUNG]" : "",
+		  (unsigned int) __entry->engine->hangcheck.action,
+		  hangcheck_action_to_str(__entry->engine->hangcheck.action),
+		  __entry->busy ? "yes" : "no",
+		  __entry->status,
+		  i915_context_submission_status_to_str(__entry->status))
+);
+
 #endif /* _I915_TRACE_H_ */
 
 /* This part must be outside protection */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 0d197fe..748c0fb 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1780,7 +1780,7 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req,
 	struct intel_context *ctx;
 	int ret = 0;
 	int clamp_to_tail = 0;
-	uint32_t head;
+	uint32_t head, old_head;
 	uint32_t tail;
 	uint32_t head_addr;
 	uint32_t tail_addr;
@@ -1795,7 +1795,7 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req,
 	 * Read head from MMIO register since it contains the
 	 * most up to date value of head at this point.
 	 */
-	head = I915_READ_HEAD(ring);
+	old_head = head = I915_READ_HEAD(ring);
 
 	/*
 	 * Read tail from the context because the execlist queue
@@ -1852,6 +1852,9 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req,
 	head |= (head_addr & HEAD_ADDR);
 	ring->saved_head = head;
 
+	trace_i915_tdr_engine_save(ring, old_head,
+		head, force_advance);
+
 	return 0;
 }
 
@@ -2781,6 +2784,7 @@ void intel_execlists_TDR_force_CSB_check(struct drm_i915_private *dev_priv,
 		DRM_ERROR("Forced CSB check of %s ineffective!\n", engine->name);
 	spin_unlock_irqrestore(&engine->execlist_lock, flags);
 
+	trace_i915_tdr_forced_csb_check(engine, !!was_effective);
 	wake_up_all(&engine->irq_queue);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 84774d2..f72cfe1 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1442,6 +1442,8 @@ static int gen6_do_reset(struct drm_device *dev)
 	/* Spin waiting for the device to ack the reset request */
 	ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST) & GEN6_GRDOM_FULL) == 0, 500);
 
+	trace_i915_tdr_gpu_reset_complete(dev);
+
 	intel_uncore_forcewake_reset(dev, true);
 
 	return ret;
@@ -1535,6 +1537,8 @@ static int do_engine_reset_nolock(struct intel_engine_cs *engine)
 		break;
 	}
 
+	trace_i915_tdr_engine_reset_complete(engine);
+
 	return ret;
 }
 
-- 
1.7.9.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx