[PATCH] drm/i915: Record pid/comm of hanging task

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



After finding the guilty batch and request, we can use it to find the
process that submitted the batch and then add the culprit into the error
state.

This is a slightly different approach from Ben's in that instead of
adding the extra information into the struct i915_hw_context, we use the
information already captured in struct drm_file which is then referenced
from the request.

Link: http://lists.freedesktop.org/archives/intel-gfx/2013-August/032280.html
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Ben Widawsky <ben@xxxxxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_drv.h       |  4 ++
 drivers/gpu/drm/i915/i915_gem.c       |  1 +
 drivers/gpu/drm/i915/i915_gpu_error.c | 69 ++++++++++++++++++++++-------------
 3 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 1d317f8c9f05..2007062e4a35 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -374,6 +374,9 @@ struct drm_i915_error_state {
 				u32 pp_dir_base;
 			};
 		} vm_info;
+
+		pid_t pid;
+		char comm[TASK_COMM_LEN];
 	} ring[I915_NUM_RINGS];
 
 	struct drm_i915_error_buffer {
@@ -1825,6 +1828,7 @@ struct drm_i915_gem_request {
 
 struct drm_i915_file_private {
 	struct drm_i915_private *dev_priv;
+	struct drm_file *file;
 
 	struct {
 		spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 157877bd7a0b..605e4ab636e8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4981,6 +4981,7 @@ int i915_gem_open(struct drm_device *dev, struct drm_file *file)
 
 	file->driver_priv = file_priv;
 	file_priv->dev_priv = dev->dev_private;
+	file_priv->file = file;
 
 	spin_lock_init(&file_priv->mm.lock);
 	INIT_LIST_HEAD(&file_priv->mm.request_list);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index ea588c66ffc4..386bf52dd16a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -309,6 +309,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_error_state *error = error_priv->error;
 	int i, j, page, offset, elt;
+	int max_hangcheck_score;
 
 	if (!error) {
 		err_printf(m, "no error state collected\n");
@@ -318,6 +319,20 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
 		   error->time.tv_usec);
 	err_printf(m, "Kernel: " UTS_RELEASE "\n");
+	max_hangcheck_score = 0;
+	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+		if (error->ring[i].hangcheck_score > max_hangcheck_score)
+			max_hangcheck_score = error->ring[i].hangcheck_score;
+	}
+	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+		if (error->ring[i].hangcheck_score == max_hangcheck_score &&
+		    error->ring[i].pid != -1) {
+			err_printf(m, "Active process (on ring %s): %s [%d]\n",
+				   ring_str(i),
+				   error->ring[i].comm,
+				   error->ring[i].pid);
+		}
+	}
 	err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device);
 	err_printf(m, "EIR: 0x%08x\n", error->eir);
 	err_printf(m, "IER: 0x%08x\n", error->ier);
@@ -363,8 +378,11 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		struct drm_i915_error_object *obj;
 
 		if ((obj = error->ring[i].batchbuffer)) {
-			err_printf(m, "%s --- gtt_offset = 0x%08x\n",
-				   dev_priv->ring[i].name,
+			err_puts(m, dev_priv->ring[i].name);
+			if (error->ring[i].pid != -1)
+				err_printf(m, " (submitted by %s [%d])",
+					   error->ring[i].comm, error->ring[i].pid);
+			err_printf(m, " --- gtt_offset = 0x%08x\n",
 				   obj->gtt_offset);
 			offset = 0;
 			for (page = 0; page < obj->page_count; page++) {
@@ -698,9 +716,9 @@ static void i915_gem_record_fences(struct drm_device *dev,
 	}
 }
 
-static struct drm_i915_error_object *
-i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
-			     struct intel_ring_buffer *ring)
+static struct drm_i915_gem_request *
+i915_error_first_request(struct drm_i915_private *dev_priv,
+			 struct intel_ring_buffer *ring)
 {
 	struct drm_i915_gem_request *request;
 	u32 seqno;
@@ -708,29 +726,12 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
 	if (!ring->get_seqno)
 		return NULL;
 
-	if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
-		struct drm_i915_gem_object *obj;
-		u32 acthd = I915_READ(ACTHD);
-
-		if (WARN_ON(ring->id != RCS))
-			return NULL;
-
-		obj = ring->scratch.obj;
-		if (obj != NULL &&
-		    acthd >= i915_gem_obj_ggtt_offset(obj) &&
-		    acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
-			return i915_error_ggtt_object_create(dev_priv, obj);
-	}
-
 	seqno = ring->get_seqno(ring, false);
 	list_for_each_entry(request, &ring->request_list, list) {
 		if (i915_seqno_passed(seqno, request->seqno))
 			continue;
 
-		/* We need to copy these to an anonymous buffer as the simplest
-		 * method to avoid being overwritten by userspace.
-		 */
-		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
+		return request;
 	}
 
 	return NULL;
@@ -884,8 +885,26 @@ static void i915_gem_record_rings(struct drm_device *dev,
 
 		i915_record_ring_state(dev, ring, &error->ring[i]);
 
-		error->ring[i].batchbuffer =
-			i915_error_first_batchbuffer(dev_priv, ring);
+		error->ring[i].pid = -1;
+		request = i915_error_first_request(dev_priv, ring);
+		if (request) {
+			/* We need to copy these to an anonymous buffer as the simplest
+			 * method to avoid being overwritten by userspace.
+			 */
+			error->ring[i].batchbuffer =
+				i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
+			if (request->file_priv) {
+				struct task_struct *task;
+
+				rcu_read_lock();
+				task = pid_task(request->file_priv->file->pid, PIDTYPE_PID);
+				if (task) {
+					strcpy(error->ring[i].comm, task->comm);
+					error->ring[i].pid = task->pid;
+				}
+				rcu_read_unlock();
+			}
+		}
 
 		error->ring[i].ringbuffer =
 			i915_error_ggtt_object_create(dev_priv, ring->obj);
-- 
1.9.rc1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux