Cc: Antonio Argenziano <antonio.argenziano@xxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx>
Signed-off-by: Michel Thierry <michel.thierry@xxxxxxxxx>
Signed-off-by: Carlos Santa <carlos.santa@xxxxxxxxx>
---
drivers/gpu/drm/i915/i915_gpu_error.h | 4 ++
drivers/gpu/drm/i915/i915_irq.c | 14 ++++--
drivers/gpu/drm/i915/i915_reg.h | 6 +++
drivers/gpu/drm/i915/i915_reset.c | 20 +++++++++
drivers/gpu/drm/i915/i915_reset.h | 6 +++
drivers/gpu/drm/i915/intel_engine_cs.c | 1 +
drivers/gpu/drm/i915/intel_engine_types.h | 5 +++
drivers/gpu/drm/i915/intel_hangcheck.c | 11 +----
drivers/gpu/drm/i915/intel_lrc.c | 52 +++++++++++++++++++++++
9 files changed, 107 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 99d6b7b270c2..6cf6a8679b26 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -203,6 +203,9 @@ struct i915_gpu_error {
* any global resources that may be clobber by the reset (such as
* FENCE registers).
*
+ * #I915_RESET_WATCHDOG - When hw detects a hang before us, we can use
+ * I915_RESET_WATCHDOG to report the hang detection cause accurately.
+ *
* #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
* acquire the struct_mutex to reset an engine, we need an explicit
* flag to prevent two concurrent reset attempts in the same engine.
@@ -218,6 +221,7 @@ struct i915_gpu_error {
#define I915_RESET_BACKOFF 0
#define I915_RESET_MODESET 1
#define I915_RESET_ENGINE 2
+#define I915_RESET_WATCHDOG 3
#define I915_WEDGED (BITS_PER_LONG - 1)
/** Number of times the device has been reset (global) */
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 82d487189a34..e64994be25c3 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1466,6 +1466,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
if (tasklet)
tasklet_hi_schedule(&engine->execlists.tasklet);
+
+ if (iir & GT_GEN8_WATCHDOG_INTERRUPT)
+ tasklet_schedule(&engine->execlists.watchdog_tasklet);
}
static void gen8_gt_irq_ack(struct drm_i915_private *i915,
@@ -3892,20 +3895,25 @@ static void gen8_gt_irq_postinstall(struct drm_i915_private *dev_priv)
u32 gt_interrupts[] = {
(GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
+ GT_GEN8_WATCHDOG_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT |
GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT),
-
(GT_RENDER_USER_INTERRUPT << GEN8_VCS0_IRQ_SHIFT |
GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS0_IRQ_SHIFT |
+ GT_GEN8_WATCHDOG_INTERRUPT << GEN8_VCS0_IRQ_SHIFT |
GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
- GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT),
-
+ GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
+ GT_GEN8_WATCHDOG_INTERRUPT << GEN8_VCS1_IRQ_SHIFT),
0,
(GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT |
GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT)
};
+ /* VECS watchdog is only available in skl+ */
+ if (INTEL_GEN(dev_priv) >= 9)
+ gt_interrupts[3] |= GT_GEN8_WATCHDOG_INTERRUPT;
+
dev_priv->pm_ier = 0x0;
dev_priv->pm_imr = ~dev_priv->pm_ier;
GEN8_IRQ_INIT_NDX(GT, 0, ~gt_interrupts[0], gt_interrupts[0]);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 9b69cec21f7b..ac8d984e16ae 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2363,6 +2363,11 @@ enum i915_power_well_id {
#define RING_START(base) _MMIO((base) + 0x38)
#define RING_CTL(base) _MMIO((base) + 0x3c)
#define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */
+#define RING_CNTR(base) _MMIO((base) + 0x178)
+#define GEN8_WATCHDOG_ENABLE 0
+#define GEN8_WATCHDOG_DISABLE 1
+#define GEN8_XCS_WATCHDOG_DISABLE 0xFFFFFFFF /* GEN8 & non-render only */
+#define RING_THRESH(base) _MMIO((base) + 0x17C)
#define RING_SYNC_0(base) _MMIO((base) + 0x40)
#define RING_SYNC_1(base) _MMIO((base) + 0x44)
#define RING_SYNC_2(base) _MMIO((base) + 0x48)
@@ -2925,6 +2930,7 @@ enum i915_power_well_id {
#define GT_BSD_USER_INTERRUPT (1 << 12)
#define GT_RENDER_L3_PARITY_ERROR_INTERRUPT_S1 (1 << 11) /* hsw+; rsvd on snb, ivb, vlv */
#define GT_CONTEXT_SWITCH_INTERRUPT (1 << 8)
+#define GT_GEN8_WATCHDOG_INTERRUPT (1 << 6) /* gen8+ */
#define GT_RENDER_L3_PARITY_ERROR_INTERRUPT (1 << 5) /* !snb */
#define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT (1 << 4)
#define GT_RENDER_CS_MASTER_ERROR_INTERRUPT (1 << 3)
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 861fe083e383..739fa5ad1a8d 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -1208,6 +1208,26 @@ void i915_clear_error_registers(struct drm_i915_private *dev_priv)
}
}
+void engine_reset_error_to_str(struct drm_i915_private *i915,
+ char *msg,
+ size_t sz,
+ unsigned int hung,
+ unsigned int stuck,
+ unsigned int watchdog)
+{
+ int len;
+ unsigned int tmp;
+ struct intel_engine_cs *engine;
+
+ len = scnprintf(msg, sz,
+ "%s on ", watchdog ? "watchdog timeout" :
+ stuck == hung ? "no_progress" : "hang");
+ for_each_engine_masked(engine, i915, hung, tmp)
+ len += scnprintf(msg + len, sz - len,
+ "%s, ", engine->name);
+ msg[len-2] = '\0';
+}
+
/**
* i915_handle_error - handle a gpu error
* @i915: i915 device private
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
index 16f2389f656f..8582d1242248 100644
--- a/drivers/gpu/drm/i915/i915_reset.h
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -20,6 +20,12 @@ void i915_handle_error(struct drm_i915_private *i915,
u32 engine_mask,
unsigned long flags,
const char *fmt, ...);
+void engine_reset_error_to_str(struct drm_i915_private *i915,
+ char *str,
+ size_t sz,
+ unsigned int hung,
+ unsigned int stuck,
+ unsigned int watchdog);
#define I915_ERROR_CAPTURE BIT(0)
void i915_clear_error_registers(struct drm_i915_private *i915);
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 652c1b3ba190..88cf0fc07623 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1149,6 +1149,7 @@ void intel_engines_park(struct drm_i915_private *i915)
/* Flush the residual irq tasklets first. */
intel_engine_disarm_breadcrumbs(engine);
tasklet_kill(&engine->execlists.tasklet);
+ tasklet_kill(&engine->execlists.watchdog_tasklet);
/*
* We are committed now to parking the engines, make sure there
diff --git a/drivers/gpu/drm/i915/intel_engine_types.h b/drivers/gpu/drm/i915/intel_engine_types.h
index b0aa1f0d4e47..c4f66b774e7c 100644
--- a/drivers/gpu/drm/i915/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/intel_engine_types.h
@@ -124,6 +124,11 @@ struct intel_engine_execlists {
*/
struct tasklet_struct tasklet;
+ /*
+ * @watchdog_tasklet: stop counter and reschedule hangcheck_work asap
+ */
+ struct tasklet_struct watchdog_tasklet;
+
/**
* @default_priolist: priority list for I915_PRIORITY_NORMAL
*/
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 57ed49dc19c4..4bf26863678c 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -220,22 +220,15 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
unsigned int hung,
unsigned int stuck)
{
- struct intel_engine_cs *engine;
char msg[80];
- unsigned int tmp;
- int len;
+ size_t len = sizeof(msg);
/* If some rings hung but others were still busy, only
* blame the hanging rings in the synopsis.
*/
if (stuck != hung)
hung &= ~stuck;
- len = scnprintf(msg, sizeof(msg),
- "%s on ", stuck == hung ? "no progress" : "hang");
- for_each_engine_masked(engine, i915, hung, tmp)
- len += scnprintf(msg + len, sizeof(msg) - len,
- "%s, ", engine->name);
- msg[len-2] = '\0';
+ engine_reset_error_to_str(i915, msg, len, hung, stuck, 0);
return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
}
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e54e0064b2d6..85785a94f6ae 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2195,6 +2195,40 @@ static int gen8_emit_flush_render(struct i915_request *request,
return 0;
}
+static void gen8_watchdog_tasklet(unsigned long data)
+{
+ struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+ struct drm_i915_private *dev_priv = engine->i915;
+ enum forcewake_domains fw_domains;
+ char msg[80];
+ size_t len = sizeof(msg);
+ unsigned long *lock = &engine->i915->gpu_error.flags;
+ unsigned int bit = I915_RESET_ENGINE + engine->id;
+
+ switch (engine->class) {
+ default:
+ MISSING_CASE(engine->id);
+ /* fall through */
+ case RENDER_CLASS:
+ fw_domains = FORCEWAKE_RENDER;
+ break;
+ case VIDEO_DECODE_CLASS:
+ case VIDEO_ENHANCEMENT_CLASS:
+ fw_domains = FORCEWAKE_MEDIA;
+ break;
+ }
+
+ intel_uncore_forcewake_get(dev_priv, fw_domains);
+
+ if (!test_and_set_bit(bit, lock)) {
+ unsigned int hung = engine->mask;
+ engine_reset_error_to_str(dev_priv, msg, len, hung, 0, 1);
+ i915_reset_engine(engine, msg);
+ clear_bit(bit, lock);
+ wake_up_bit(lock, bit);
+ }
+}
+
/*
* Reserve space for 2 NOOPs at the end of each request to be
* used as a workaround for not being allowed to do lite
@@ -2377,6 +2411,21 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
+
+ switch (engine->class) {
+ default:
+ /* BCS engine does not support hw watchdog */
+ break;
+ case RENDER_CLASS:
+ case VIDEO_DECODE_CLASS:
+ engine->irq_keep_mask |= GT_GEN8_WATCHDOG_INTERRUPT << shift;
+ break;
+ case VIDEO_ENHANCEMENT_CLASS:
+ if (INTEL_GEN(engine->i915) >= 9)
+ engine->irq_keep_mask |=
+ GT_GEN8_WATCHDOG_INTERRUPT << shift;
+ break;
+ }
}
static int
@@ -2394,6 +2443,9 @@ logical_ring_setup(struct intel_engine_cs *engine)
tasklet_init(&engine->execlists.tasklet,
execlists_submission_tasklet, (unsigned long)engine);
+ tasklet_init(&engine->execlists.watchdog_tasklet,
+ gen8_watchdog_tasklet, (unsigned long)engine);
+
logical_ring_default_vfuncs(engine);
logical_ring_default_irqs(engine);