We only hold the active spinlock while dumping the error state, and this does not prevent another thread from retiring the request -- as it is quite possible that despite us capturing the current state, the GPU has completed the request. As such, it is dangerous to dereference state below the request as it may already be freed, and the simplest way to avoid the danger is not include it in the error state. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1788 Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> Cc: Andi Shyti <andi.shyti@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_gpu_error.c | 18 ++++++++++++------ drivers/gpu/drm/i915/i915_gpu_error.h | 1 - 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 424ad975a360..6dd2fc0b0d47 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -467,14 +467,14 @@ static void error_print_request(struct drm_i915_error_state_buf *m, if (!erq->seqno) return; - err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, start %08x, head %08x, tail %08x\n", + err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, head %08x, tail %08x\n", prefix, erq->pid, erq->context, erq->seqno, test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &erq->flags) ? "!" : "", test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &erq->flags) ? "+" : "", erq->sched_attr.priority, - erq->start, erq->head, erq->tail); + erq->head, erq->tail); } static void error_print_context(struct drm_i915_error_state_buf *m, @@ -1204,16 +1204,18 @@ static void engine_record_registers(struct intel_engine_coredump *ee) } } -static void record_request(const struct i915_request *request, +static bool record_request(const struct i915_request *request, struct i915_request_coredump *erq) { const struct i915_gem_context *ctx; + if (i915_request_completed(request)) + return false; + erq->flags = request->fence.flags; erq->context = request->fence.context; erq->seqno = request->fence.seqno; erq->sched_attr = request->sched.attr; - erq->start = i915_ggtt_offset(request->ring->vma); erq->head = request->head; erq->tail = request->tail; @@ -1223,6 +1225,8 @@ static void record_request(const struct i915_request *request, if (ctx) erq->pid = pid_nr(ctx->pid); rcu_read_unlock(); + + return true; } static void engine_record_execlists(struct intel_engine_coredump *ee) @@ -1231,8 +1235,10 @@ static void engine_record_execlists(struct intel_engine_coredump *ee) struct i915_request * const *port = el->active; unsigned int n = 0; - while (*port) - record_request(*port++, &ee->execlist[n++]); + while (*port) { + if (record_request(*port++, &ee->execlist[n])) + n++; + } ee->num_ports = n; } diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index 0d1f6c8ff355..fa2d82a6de04 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -50,7 +50,6 @@ struct i915_request_coredump { pid_t pid; u32 context; u32 seqno; - u32 start; u32 head; u32 tail; struct i915_sched_attr sched_attr; -- 2.20.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx