Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > Include extra information such as the user_handle and hw_id so that > userspace can identify which of their contexts hung, useful if they are > performing self-diagnostics. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_drv.h | 14 +++++-- > drivers/gpu/drm/i915/i915_gpu_error.c | 77 ++++++++++++++++++++++------------- > 2 files changed, 59 insertions(+), 32 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index c1fde816db63..7e7bc4504c94 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -970,6 +970,16 @@ struct drm_i915_error_state { > u32 semaphore_mboxes[I915_NUM_ENGINES - 1]; > struct intel_instdone instdone; > > + struct drm_i915_error_context { > + char comm[TASK_COMM_LEN]; > + int pid; s/int/pid_t Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> > + u32 handle; > + u32 hw_id; > + int ban_score; > + int active; > + int guilty; > + } context; > + > struct drm_i915_error_object { > u64 gtt_offset; > u64 gtt_size; > @@ -1003,10 +1013,6 @@ struct drm_i915_error_state { > u32 pp_dir_base; > }; > } vm_info; > - > - pid_t pid; > - char comm[TASK_COMM_LEN]; > - int context_bans; > } engine[I915_NUM_ENGINES]; > > struct drm_i915_error_buffer { > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index e5375323eb06..5283fe815a4d 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -384,6 +384,15 @@ static void error_print_request(struct drm_i915_error_state_buf *m, > erq->head, erq->tail); > } > > +static void error_print_context(struct drm_i915_error_state_buf *m, > + const char *header, > + struct drm_i915_error_context *ctx) > +{ > + err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n", > + header, ctx->comm, 
ctx->pid, ctx->handle, ctx->hw_id, > + ctx->ban_score, ctx->guilty, ctx->active); > +} > + > static void error_print_engine(struct drm_i915_error_state_buf *m, > struct drm_i915_error_engine *ee) > { > @@ -457,6 +466,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, > > error_print_request(m, " ELSP[0]: ", &ee->execlist[0]); > error_print_request(m, " ELSP[1]: ", &ee->execlist[1]); > + error_print_context(m, " Active context: ", &ee->context); > } > > void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) > @@ -562,12 +572,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, > > for (i = 0; i < ARRAY_SIZE(error->engine); i++) { > if (error->engine[i].hangcheck_stalled && > - error->engine[i].pid != -1) { > - err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n", > + error->engine[i].context.pid) { > + err_printf(m, "Active process (on ring %s): %s [%d], score %d\n", > engine_str(i), > - error->engine[i].comm, > - error->engine[i].pid, > - error->engine[i].context_bans); > + error->engine[i].context.comm, > + error->engine[i].context.pid, > + error->engine[i].context.ban_score); > } > } > err_printf(m, "Reset count: %u\n", error->reset_count); > @@ -658,11 +668,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, > obj = ee->batchbuffer; > if (obj) { > err_puts(m, dev_priv->engine[i]->name); > - if (ee->pid != -1) > - err_printf(m, " (submitted by %s [%d], bans %d)", > - ee->comm, > - ee->pid, > - ee->context_bans); > + if (ee->context.pid) > + err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)", > + ee->context.comm, > + ee->context.pid, > + ee->context.handle, > + ee->context.hw_id, > + ee->context.ban_score); > err_printf(m, " --- gtt_offset = 0x%08x %08x\n", > upper_32_bits(obj->gtt_offset), > lower_32_bits(obj->gtt_offset)); > @@ -1267,6 +1279,28 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine, > &ee->execlist[n]); > } > > 
+static void record_context(struct drm_i915_error_context *e, > + struct i915_gem_context *ctx) > +{ > + if (ctx->pid) { > + struct task_struct *task; > + > + rcu_read_lock(); > + task = pid_task(ctx->pid, PIDTYPE_PID); > + if (task) { > + strcpy(e->comm, task->comm); > + e->pid = task->pid; > + } > + rcu_read_unlock(); > + } > + > + e->handle = ctx->user_handle; > + e->hw_id = ctx->hw_id; > + e->ban_score = ctx->ban_score; > + e->guilty = ctx->guilty_count; > + e->active = ctx->active_count; > +} > + > static void i915_gem_record_rings(struct drm_i915_private *dev_priv, > struct drm_i915_error_state *error) > { > @@ -1281,7 +1315,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, > struct drm_i915_error_engine *ee = &error->engine[i]; > struct drm_i915_gem_request *request; > > - ee->pid = -1; > ee->engine_id = -1; > > if (!engine) > @@ -1296,11 +1329,12 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, > request = i915_gem_find_active_request(engine); > if (request) { > struct intel_ring *ring; > - struct pid *pid; > > ee->vm = request->ctx->ppgtt ? > &request->ctx->ppgtt->base : &ggtt->base; > > + record_context(&ee->context, request->ctx); > + > /* We need to copy these to an anonymous buffer > * as the simplest method to avoid being overwritten > * by userspace. 
> @@ -1318,19 +1352,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, > i915_error_object_create(dev_priv, > request->ctx->engine[i].state); > > - pid = request->ctx->pid; > - if (pid) { > - struct task_struct *task; > - > - rcu_read_lock(); > - task = pid_task(pid, PIDTYPE_PID); > - if (task) { > - strcpy(ee->comm, task->comm); > - ee->pid = task->pid; > - } > - rcu_read_unlock(); > - } > - > error->simulated |= > i915_gem_context_no_error_capture(request->ctx); > > @@ -1534,12 +1555,12 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv, > "GPU HANG: ecode %d:%d:0x%08x", > INTEL_GEN(dev_priv), engine_id, ecode); > > - if (engine_id != -1 && error->engine[engine_id].pid != -1) > + if (engine_id != -1 && error->engine[engine_id].context.pid) > len += scnprintf(error->error_msg + len, > sizeof(error->error_msg) - len, > ", in %s [%d]", > - error->engine[engine_id].comm, > - error->engine[engine_id].pid); > + error->engine[engine_id].context.comm, > + error->engine[engine_id].context.pid); > > scnprintf(error->error_msg + len, sizeof(error->error_msg) - len, > ", reason: %s, action: %s", > -- > 2.11.0 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/intel-gfx _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx