1. The i915_wedged_set() function allows us to schedule three forms of hang recovery: a) Legacy hang recovery: By passing e.g. -1 we trigger the legacy full GPU reset recovery path. b) Single engine hang recovery: By passing an engine ID in the interval of [0, I915_NUM_RINGS) we can schedule hang recovery of any single engine assuming that the context submission consistency requirements are met (otherwise the hang recovery path will simply exit early and wait for another hang detection). The values are assumed to use up bits 3:0 only since we certainly do not support as many as 16 engines. This mode is supported since there are several legacy test applications that rely on this interface. c) Multiple engine hang recovery: By passing in an engine flag mask in bits 31:8 (bit 8 corresponds to engine 0 = RCS, bit 9 corresponds to engine 1 = VCS, etc.) we can schedule any combination of engine hang recoveries as we please. For example, by passing in the value 0x3 << 8 we would schedule hang recovery for engines 0 and 1 (RCS and VCS) at the same time. If bits in fields 3:0 and 31:8 are both used then single engine hang recovery mode takes precedence and bits 31:8 are ignored. 2. The i915_hangcheck_read() function produces a set of statistics related to: a) Number of engine hangs detected by the periodic hang checker. b) Number of watchdog timeout hangs detected. c) Number of full GPU resets carried out. d) Number of engine resets carried out. These statistics are presented in a very parser-friendly way and are used by the TDR ULT to poll system behaviour to validate test outcomes. * v2: (Chris Wilson) - After review comments by Chris Wilson we're dropping the dual-mode parameter value interpretation in i915_wedged_set(). In this version we only accept engine id flag masks that contain the engine id flags of all currently hung engines. Full GPU reset is most easily requested by passing an all zero engine id flag mask. 
- Moved TDR-specific engine metrics like number of detected engine hangs and number of per-engine resets into i915_hangcheck_info() from i915_hangcheck_read(). Signed-off-by: Tomas Elf <tomas.elf@xxxxxxxxx> Signed-off-by: Arun Siluvery <arun.siluvery@xxxxxxxxx> Signed-off-by: Ian Lister <ian.lister@xxxxxxxxx> Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_debugfs.c | 76 ++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index a89da48..d99c152 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -1302,6 +1302,8 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) } else seq_printf(m, "Hangcheck inactive\n"); + seq_printf(m, "Full GPU resets = %u\n", i915_reset_count(&dev_priv->gpu_error)); + for_each_ring(ring, dev_priv, i) { seq_printf(m, "%s:\n", ring->name); seq_printf(m, "\tseqno = %x [current %x]\n", @@ -1313,6 +1315,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) (long long)ring->hangcheck.max_acthd); seq_printf(m, "\tscore = %d\n", ring->hangcheck.score); seq_printf(m, "\taction = %d\n", ring->hangcheck.action); + seq_printf(m, "\tengine resets = %u\n", + ring->hangcheck.reset_count); + seq_printf(m, "\tengine hang detections = %u\n", + ring->hangcheck.tdr_count); + seq_printf(m, "\tengine watchdog timeout detections = %u\n", + ring->hangcheck.watchdog_count); } return 0; @@ -2030,7 +2038,7 @@ static int i915_execlists(struct seq_file *m, void *data) seq_printf(m, "%s\n", ring->name); status = I915_READ(RING_EXECLIST_STATUS(ring)); - ctx_id = I915_READ(RING_EXECLIST_STATUS(ring) + 4); + ctx_id = I915_READ(RING_EXECLIST_STATUS_CTX_ID(ring)); seq_printf(m, "\tExeclist status: 0x%08X, context: %u\n", status, ctx_id); @@ -4164,11 +4172,47 @@ i915_wedged_get(void *data, u64 *val) return 0; 
} +static const char *ringid_to_str(enum intel_ring_id ring_id) +{ + switch (ring_id) { + case RCS: + return "RCS"; + case VCS: + return "VCS"; + case BCS: + return "BCS"; + case VECS: + return "VECS"; + case VCS2: + return "VCS2"; + } + + return "unknown"; +} + static int i915_wedged_set(void *data, u64 val) { struct drm_device *dev = data; struct drm_i915_private *dev_priv = dev->dev_private; + struct intel_engine_cs *engine; + u32 i; +#define ENGINE_MSGLEN 64 + char msg[ENGINE_MSGLEN]; + + /* + * Val contains the engine flag mask of engines to be reset. + * + * Full GPU reset is implied in the following two cases: + * 1. val == 0x0 + * 2. val >= (1 << I915_NUM_RINGS) + * + * Bit 0: RCS engine + * Bit 1: VCS engine + * Bit 2: BCS engine + * Bit 3: VECS engine + * Bit 4: VCS2 engine (if available) + */ /* * There is no safeguard against this debugfs entry colliding @@ -4177,14 +4221,36 @@ i915_wedged_set(void *data, u64 val) * test harness is responsible enough not to inject gpu hangs * while it is writing to 'i915_wedged' */ - - if (i915_reset_in_progress(&dev_priv->gpu_error)) + if (i915_gem_check_wedge(dev_priv, NULL, true)) return -EAGAIN; intel_runtime_pm_get(dev_priv); - i915_handle_error(dev, 0x0, false, val, - "Manually setting wedged to %llu", val); + memset(msg, 0, sizeof(msg)); + + if (val) { + scnprintf(msg, sizeof(msg), "Manual reset:"); + + /* Assemble message string */ + for_each_ring(engine, dev_priv, i) + if (intel_ring_flag(engine) & val) { + DRM_INFO("Manual reset: %s\n", engine->name); + + scnprintf(msg, sizeof(msg), + "%s [%s]", + msg, + ringid_to_str(i)); + } + + } else { + scnprintf(msg, sizeof(msg), "Manual global reset"); + } + + i915_handle_error(dev, + val, + false, + true, + msg); intel_runtime_pm_put(dev_priv); -- 1.9.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx