We have a problem distinguishing intended hangs submitted by igt during CI/bat from unintended hangs happening in close proximity. As we know how igt constructs a batch intended to hang the gpu, we can use this to our advantage when the error state is constructed. The signature of an intended hang can be parsed out at this stage. Batches that are expected to hang can be watermarked by: batch[i++] = MI_BATCH_BUFFER_END; batch[i++] = MI_NOOP; batch[i++] = IGT_HANG_SIGNATURE; /* 0xc5c5c5c5 */ Note that we do the parsing only in gem debug builds. Later, when we print the notification about the hang to dmesg, we can use DRM_INFO for intended hangs and DRM_WARN for unintended ones. This way auxiliary tools can make a better judgement on what to consider important for reporting. Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Daniel Vetter <daniel.vetter@xxxxxxxx> Cc: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx> Signed-off-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 6 +- drivers/gpu/drm/i915/i915_gpu_error.c | 100 +++++++++++++++++++++-- drivers/gpu/drm/i915/i915_irq.c | 4 +- drivers/gpu/drm/i915/intel_hangcheck.c | 6 +- drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 2 +- 6 files changed, 104 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 39883cd915db..678f472a8956 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -4092,7 +4092,7 @@ i915_wedged_set(void *data, u64 val) engine->hangcheck.stalled = true; } - i915_handle_error(i915, val, "Manually setting wedged to %llu", val); + i915_handle_error(i915, true, val, "Manually setting wedged to %llu", val); wait_on_bit(&i915->gpu_error.flags, I915_RESET_HANDOFF, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index fe93115c4caa..91e0ab8509d2 100644 --- 
a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1004,6 +1004,8 @@ struct i915_gpu_state { u32 *pages[0]; } *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page; + bool batch_had_hang_signature; + struct drm_i915_error_object **user_bo; long user_bo_count; @@ -3372,8 +3374,9 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv) &dev_priv->gpu_error.hangcheck_work, delay); } -__printf(3, 4) +__printf(4, 5) void i915_handle_error(struct drm_i915_private *dev_priv, + bool intentional, u32 engine_mask, const char *fmt, ...); @@ -3960,6 +3963,7 @@ static inline void i915_error_state_buf_release( struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915); void i915_capture_error_state(struct drm_i915_private *dev_priv, + bool intentional, u32 engine_mask, const char *error_msg); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 5c2d83a838d8..a5bfefb94d6b 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -918,15 +918,52 @@ void __i915_gpu_state_free(struct kref *error_ref) kfree(error); } +#ifdef CONFIG_DRM_I915_DEBUG_GEM + +#define HANG_SIGNATURE_MAGIC 0xc5c5c5c5 + +static bool batch_has_hang_signature(const u32 * const batch, u64 len) +{ + struct marker { u32 val; u64 offset; } m[2] = { + { MI_BATCH_BUFFER_END, -1 }, + { HANG_SIGNATURE_MAGIC, -1 } + }; + u64 i, j; + + len = min_t(u64, len, PAGE_SIZE); + len = DIV_ROUND_UP(len, 4); + + for (i = 0, j = 0; i < len && j < ARRAY_SIZE(m); i++) { + if (m[j].offset == -1 && batch[i] == m[j].val) + m[j++].offset = i; + + if (m[0].offset != -1 && i - m[0].offset > 2) + break; + } + + if (m[0].offset == -1 || m[1].offset == -1) + return false; + + return m[1].offset - m[0].offset == 2; +} +#else +static bool batch_has_hang_signature(struct i915_vma *vma) +{ + return false; +} +#endif + static struct drm_i915_error_object * -i915_error_object_create(struct 
drm_i915_private *i915, - struct i915_vma *vma) +__i915_error_object_create(struct drm_i915_private *i915, + struct i915_vma *vma, + bool *hangsig) { struct i915_ggtt *ggtt = &i915->ggtt; const u64 slot = ggtt->error_capture.start; struct drm_i915_error_object *dst; struct compress compress; - unsigned long num_pages; + unsigned long num_pages, src_pages; + unsigned long page_count = 0; struct sgt_iter iter; dma_addr_t dma; @@ -934,6 +971,7 @@ i915_error_object_create(struct drm_i915_private *i915, return NULL; num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT; + src_pages = num_pages; num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */ dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), GFP_ATOMIC | __GFP_NOWARN); @@ -958,9 +996,16 @@ i915_error_object_create(struct drm_i915_private *i915, I915_CACHE_NONE, 0); s = io_mapping_map_atomic_wc(&ggtt->mappable, slot); + + /* Check last page only for intentional hang signature */ + if (hangsig && page_count == src_pages - 1) + *hangsig = batch_has_hang_signature((void __force *)s, + vma->size); + ret = compress_page(&compress, (void __force *)s, dst); io_mapping_unmap_atomic(s); + page_count++; if (ret) goto unwind; } @@ -978,6 +1023,21 @@ i915_error_object_create(struct drm_i915_private *i915, return dst; } +static struct drm_i915_error_object * +i915_error_object_create(struct drm_i915_private *i915, + struct i915_vma *vma) +{ + return __i915_error_object_create(i915, vma, NULL); +} + +static struct drm_i915_error_object * +i915_error_batch_object_create(struct drm_i915_private *i915, + struct i915_vma *vma, + bool *signature) +{ + return __i915_error_object_create(i915, vma, signature); +} + /* The error capture is special as tries to run underneath the normal * locking rules - so we use the raw version of the i915_gem_active lookup. */ @@ -1484,8 +1544,9 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, * by userspace. 
*/ ee->batchbuffer = - i915_error_object_create(dev_priv, - request->batch); + i915_error_batch_object_create(dev_priv, + request->batch, + &ee->batch_had_hang_signature); if (HAS_BROKEN_CS_TLB(dev_priv)) ee->wa_batchbuffer = @@ -1701,7 +1762,8 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv, static void i915_error_capture_msg(struct drm_i915_private *dev_priv, struct i915_gpu_state *error, u32 engine_mask, - const char *error_msg) + const char *error_msg, + bool simulated) { u32 ecode; int engine_id = -1, len; @@ -1709,7 +1771,8 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv, ecode = i915_error_generate_code(dev_priv, error, &engine_id); len = scnprintf(error->error_msg, sizeof(error->error_msg), - "GPU HANG: ecode %d:%d:0x%08x", + "GPU HANG%s: ecode %d:%d:0x%08x", + simulated ? "(simulated)" : "", INTEL_GEN(dev_priv), engine_id, ecode); if (engine_id != -1 && error->engine[engine_id].context.pid) @@ -1803,6 +1866,18 @@ i915_capture_gpu_state(struct drm_i915_private *i915) return error; } +static bool error_state_has_hang_signature(const struct i915_gpu_state *error) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(error->engine); i++) { + if (error->engine[i].batch_had_hang_signature) + return true; + } + + return false; +} + /** * i915_capture_error_state - capture an error record for later analysis * @dev: drm device @@ -1813,12 +1888,14 @@ i915_capture_gpu_state(struct drm_i915_private *i915) * to pick up. 
*/ void i915_capture_error_state(struct drm_i915_private *dev_priv, + bool intentional, u32 engine_mask, const char *error_msg) { static bool warned; struct i915_gpu_state *error; unsigned long flags; + bool expected; if (!i915_modparams.error_capture) return; @@ -1832,8 +1909,13 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, return; } - i915_error_capture_msg(dev_priv, error, engine_mask, error_msg); - DRM_INFO("%s\n", error->error_msg); + expected = intentional || error_state_has_hang_signature(error); + i915_error_capture_msg(dev_priv, error, engine_mask, error_msg, expected); + + if (expected) + DRM_INFO("%s\n", error->error_msg); + else + DRM_WARN("%s\n", error->error_msg); if (!error->simulated) { spin_lock_irqsave(&dev_priv->gpu_error.lock, flags); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index ff00e462697a..75f519f910cc 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2828,6 +2828,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) /** * i915_handle_error - handle a gpu error * @dev_priv: i915 device private + * @intentional: error was intentionally injected * @engine_mask: mask representing engines that are hung * @fmt: Error message format string * @@ -2838,6 +2839,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) * of a ring dump etc.). */ void i915_handle_error(struct drm_i915_private *dev_priv, + bool intentional, u32 engine_mask, const char *fmt, ...) 
{ @@ -2859,7 +2861,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv, */ intel_runtime_pm_get(dev_priv); - i915_capture_error_state(dev_priv, engine_mask, error_msg); + i915_capture_error_state(dev_priv, intentional, engine_mask, error_msg); i915_clear_error_registers(dev_priv); /* diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c index 12ac270a5f93..7b9e8dec7d65 100644 --- a/drivers/gpu/drm/i915/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/intel_hangcheck.c @@ -266,7 +266,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) */ tmp = I915_READ_CTL(engine); if (tmp & RING_WAIT) { - i915_handle_error(dev_priv, 0, + i915_handle_error(dev_priv, false, 0, "Kicking stuck wait on %s", engine->name); I915_WRITE_CTL(engine, tmp); @@ -278,7 +278,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) default: return ENGINE_DEAD; case 1: - i915_handle_error(dev_priv, 0, + i915_handle_error(dev_priv, false, 0, "Kicking stuck semaphore on %s", engine->name); I915_WRITE_CTL(engine, tmp); @@ -407,7 +407,7 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915, "%s, ", engine->name); msg[len-2] = '\0'; - return i915_handle_error(i915, hung, "%s", msg); + return i915_handle_error(i915, false, hung, "%s", msg); } /* diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c index 71ce06680d66..7900bba9fe16 100644 --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c @@ -846,7 +846,7 @@ static int igt_handle_error(void *arg) engine->hangcheck.stalled = true; engine->hangcheck.seqno = intel_engine_get_seqno(engine); - i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__); + i915_handle_error(i915, true, intel_engine_flag(engine), "%s", __func__); xchg(&i915->gpu_error.first_error, error); -- 2.11.0 _______________________________________________ Intel-gfx mailing list 
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx