After finding the guilty batch and request, we can use it to find the process that submitted the batch and then add the culprit into the error state. This is a slightly different approach from Ben's in that instead of adding the extra information into the struct i915_hw_context, we use the information already captured in struct drm_file which is then referenced from the request. Link: http://lists.freedesktop.org/archives/intel-gfx/2013-August/032280.html Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Ben Widawsky <ben@xxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_drv.h | 4 ++ drivers/gpu/drm/i915/i915_gem.c | 1 + drivers/gpu/drm/i915/i915_gpu_error.c | 69 ++++++++++++++++++++++------------- 3 files changed, 49 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 1d317f8c9f05..2007062e4a35 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -374,6 +374,9 @@ struct drm_i915_error_state { u32 pp_dir_base; }; } vm_info; + + pid_t pid; + char comm[TASK_COMM_LEN]; } ring[I915_NUM_RINGS]; struct drm_i915_error_buffer { @@ -1825,6 +1828,7 @@ struct drm_i915_gem_request { struct drm_i915_file_private { struct drm_i915_private *dev_priv; + struct drm_file *file; struct { spinlock_t lock; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 157877bd7a0b..605e4ab636e8 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -4981,6 +4981,7 @@ int i915_gem_open(struct drm_device *dev, struct drm_file *file) file->driver_priv = file_priv; file_priv->dev_priv = dev->dev_private; + file_priv->file = file; spin_lock_init(&file_priv->mm.lock); INIT_LIST_HEAD(&file_priv->mm.request_list); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index ea588c66ffc4..386bf52dd16a 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -309,6 +309,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, drm_i915_private_t *dev_priv = dev->dev_private; struct drm_i915_error_state *error = error_priv->error; int i, j, page, offset, elt; + int max_hangcheck_score; if (!error) { err_printf(m, "no error state collected\n"); @@ -318,6 +319,20 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec, error->time.tv_usec); err_printf(m, "Kernel: " UTS_RELEASE "\n"); + max_hangcheck_score = 0; + for (i = 0; i < ARRAY_SIZE(error->ring); i++) { + if (error->ring[i].hangcheck_score > max_hangcheck_score) + max_hangcheck_score = error->ring[i].hangcheck_score; + } + for (i = 0; i < ARRAY_SIZE(error->ring); i++) { + if (error->ring[i].hangcheck_score == max_hangcheck_score && + error->ring[i].pid != -1) { + err_printf(m, "Active process (on ring %s): %s [%d]\n", + ring_str(i), + error->ring[i].comm, + error->ring[i].pid); + } + } err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device); err_printf(m, "EIR: 0x%08x\n", error->eir); err_printf(m, "IER: 0x%08x\n", error->ier); @@ -363,8 +378,11 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, struct drm_i915_error_object *obj; if ((obj = error->ring[i].batchbuffer)) { - err_printf(m, "%s --- gtt_offset = 0x%08x\n", - dev_priv->ring[i].name, + err_puts(m, dev_priv->ring[i].name); + if (error->ring[i].pid != -1) + err_printf(m, " (submitted by %s [%d])", + error->ring[i].comm, error->ring[i].pid); + err_printf(m, " --- gtt_offset = 0x%08x\n", obj->gtt_offset); offset = 0; for (page = 0; page < obj->page_count; page++) { @@ -698,9 +716,9 @@ static void i915_gem_record_fences(struct drm_device *dev, } } -static struct drm_i915_error_object * -i915_error_first_batchbuffer(struct drm_i915_private *dev_priv, - struct intel_ring_buffer *ring) +static struct drm_i915_gem_request * +i915_error_first_request(struct drm_i915_private *dev_priv, + struct intel_ring_buffer *ring) { struct drm_i915_gem_request *request; u32 seqno; @@ -708,29 +726,12 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv, if (!ring->get_seqno) return NULL; - if (HAS_BROKEN_CS_TLB(dev_priv->dev)) { - struct drm_i915_gem_object *obj; - u32 acthd = I915_READ(ACTHD); - - if (WARN_ON(ring->id != RCS)) - return NULL; - - obj = ring->scratch.obj; - if (obj != NULL && - acthd >= i915_gem_obj_ggtt_offset(obj) && - acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size) - return i915_error_ggtt_object_create(dev_priv, obj); - } - seqno = ring->get_seqno(ring, false); list_for_each_entry(request, &ring->request_list, list) { if (i915_seqno_passed(seqno, request->seqno)) continue; - /* We need to copy these to an anonymous buffer as the simplest - * method to avoid being overwritten by userspace. - */ - return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm); + return request; } return NULL; @@ -884,8 +885,26 @@ static void i915_gem_record_rings(struct drm_device *dev, i915_record_ring_state(dev, ring, &error->ring[i]); - error->ring[i].batchbuffer = - i915_error_first_batchbuffer(dev_priv, ring); + error->ring[i].pid = -1; + request = i915_error_first_request(dev_priv, ring); + if (request) { + /* We need to copy these to an anonymous buffer as the simplest + * method to avoid being overwritten by userspace. + */ + error->ring[i].batchbuffer = + i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm); + if (request->file_priv) { + struct task_struct *task; + + rcu_read_lock(); + task = pid_task(request->file_priv->file->pid, PIDTYPE_PID); + if (task) { + strcpy(error->ring[i].comm, task->comm); + error->ring[i].pid = task->pid; + } + rcu_read_unlock(); + } + } error->ring[i].ringbuffer = i915_error_ggtt_object_create(dev_priv, ring->obj); -- 1.9.rc1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx