Re: [PATCH 06/10] drm/msm/gpu: Capture the GPU state on a GPU hang

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 4/18/2018 4:14 AM, Jordan Crouse wrote:
Capture the GPU state on a GPU hang and store it for later playback
via the devcoredump facility. Only one crash state is stored at a
time on the assumption that the first hang is usually the most
interesting. The existing crash state can be cleared after capturing
it and then a new one will be captured on the next hang.

Signed-off-by: Jordan Crouse <jcrouse@xxxxxxxxxxxxxx>
---
  drivers/gpu/drm/msm/Kconfig             |  1 +
  drivers/gpu/drm/msm/adreno/a3xx_gpu.c   |  2 +-
  drivers/gpu/drm/msm/adreno/a4xx_gpu.c   |  2 +-
  drivers/gpu/drm/msm/adreno/a5xx_gpu.c   |  4 +-
  drivers/gpu/drm/msm/adreno/adreno_gpu.c | 36 +++++++----
  drivers/gpu/drm/msm/adreno/adreno_gpu.h |  4 +-
  drivers/gpu/drm/msm/msm_debugfs.c       |  5 +-
  drivers/gpu/drm/msm/msm_gpu.c           | 83 ++++++++++++++++++++++++-
  drivers/gpu/drm/msm/msm_gpu.h           | 38 ++++++++++-
  9 files changed, 153 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
index 38cbde971b48..843a9d40c05e 100644
--- a/drivers/gpu/drm/msm/Kconfig
+++ b/drivers/gpu/drm/msm/Kconfig
@@ -12,6 +12,7 @@ config DRM_MSM
  	select SHMEM
  	select TMPFS
  	select QCOM_SCM
+	select WANT_DEV_COREDUMP
  	select SND_SOC_HDMI_CODEC if SND_SOC
  	select SYNC_FILE
  	select PM_OPP
diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
index 4cffec2b6adc..fc502e412132 100644
--- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
@@ -454,7 +454,7 @@ static const struct adreno_gpu_funcs funcs = {
  		.active_ring = adreno_active_ring,
  		.irq = a3xx_irq,
  		.destroy = a3xx_destroy,
-#ifdef CONFIG_DEBUG_FS
+#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
  		.show = adreno_show,
  #endif
  		.gpu_state_get = a3xx_gpu_state_get,
diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
index 95f08c22e8d7..8129cf037db1 100644
--- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
@@ -540,7 +540,7 @@ static const struct adreno_gpu_funcs funcs = {
  		.active_ring = adreno_active_ring,
  		.irq = a4xx_irq,
  		.destroy = a4xx_destroy,
-#ifdef CONFIG_DEBUG_FS
+#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
  		.show = adreno_show,
  #endif
  		.gpu_state_get = a4xx_gpu_state_get,
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index b0910bbe5190..836a1df1f257 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -1243,8 +1243,10 @@ static const struct adreno_gpu_funcs funcs = {
  		.active_ring = a5xx_active_ring,
  		.irq = a5xx_irq,
  		.destroy = a5xx_destroy,
-#ifdef CONFIG_DEBUG_FS
+#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
  		.show = adreno_show,
+#endif
+#if defined(CONFIG_DEBUG_FS)
  		.debugfs_init = a5xx_debugfs_init,
  #endif
  		.gpu_busy = a5xx_gpu_busy,
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index 522d47ac36e1..d46ae2ede616 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -378,6 +378,8 @@ struct msm_gpu_state *adreno_gpu_state_get(struct msm_gpu *gpu)
  	if (!state)
  		return ERR_PTR(-ENOMEM);
+ kref_init(&state->ref);
+
  	do_gettimeofday(&state->time);
for (i = 0; i < gpu->nr_rings; i++) {
@@ -413,18 +415,28 @@ struct msm_gpu_state *adreno_gpu_state_get(struct msm_gpu *gpu)
  	return state;
  }
-void adreno_gpu_state_put(struct msm_gpu_state *state)
+static void adreno_gpu_state_destroy(struct kref *kref)
  {
-	if (IS_ERR_OR_NULL(state))
-		return;
+	struct msm_gpu_state *state = container_of(kref,
+		struct msm_gpu_state, ref);
+ kfree(state->comm);
+	kfree(state->cmd);
  	kfree(state->registers);
  	kfree(state);
  }
-#ifdef CONFIG_DEBUG_FS
+int adreno_gpu_state_put(struct msm_gpu_state *state)
+{
+	if (IS_ERR_OR_NULL(state))
+		return 1;
+
+	return kref_put(&state->ref, adreno_gpu_state_destroy);
+}
+
+#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
  void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
-		struct seq_file *m)
+		struct drm_printer *p)
  {
  	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
  	int i;
@@ -432,23 +444,23 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
  	if (IS_ERR_OR_NULL(state))
  		return;
- seq_printf(m, "status: %08x\n", state->rbbm_status);
-	seq_printf(m, "revision: %d (%d.%d.%d.%d)\n",
+	drm_printf(p, "status:   %08x\n", state->rbbm_status);
+	drm_printf(p, "revision: %d (%d.%d.%d.%d)\n",
  			adreno_gpu->info->revn, adreno_gpu->rev.core,
  			adreno_gpu->rev.major, adreno_gpu->rev.minor,
  			adreno_gpu->rev.patchid);
for (i = 0; i < gpu->nr_rings; i++) {
-		seq_printf(m, "rb %d: fence:    %d/%d\n", i,
+		drm_printf(p, "rb %d: fence:    %d/%d\n", i,
  			state->ring[i].fence, state->ring[i].seqno);
- seq_printf(m, " rptr: %d\n", state->ring[i].rptr);
-		seq_printf(m, "rb wptr:  %d\n", state->ring[i].wptr);
+		drm_printf(p, "      rptr:     %d\n", state->ring[i].rptr);
+		drm_printf(p, "rb wptr:  %d\n", state->ring[i].wptr);
  	}
- seq_printf(m, "IO:region %s 00000000 00020000\n", gpu->name);
+	drm_printf(p, "IO:region %s 00000000 00020000\n", gpu->name);
  	for (i = 0; i < state->nr_registers; i++) {
-		seq_printf(m, "IO:R %08x %08x\n",
+		drm_printf(p, "IO:R %08x %08x\n",
  			state->registers[i * 2] << 2,
  			state->registers[(i * 2) + 1]);
  	}
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
index 815ae98c7fd1..077bf1149c0b 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
@@ -216,7 +216,7 @@ void adreno_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
  bool adreno_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
  #ifdef CONFIG_DEBUG_FS
  void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
-		struct seq_file *m);
+		struct drm_printer *p);
  #endif
  void adreno_dump_info(struct msm_gpu *gpu);
  void adreno_dump(struct msm_gpu *gpu);
@@ -229,7 +229,7 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
  void adreno_gpu_cleanup(struct adreno_gpu *gpu);
struct msm_gpu_state *adreno_gpu_state_get(struct msm_gpu *gpu);
-void adreno_gpu_state_put(struct msm_gpu_state *state);
+int adreno_gpu_state_put(struct msm_gpu_state *state);
/* ringbuffer helpers (the parts that are adreno specific) */ diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c
index ea20eb0ad747..9ae9e0a12b3a 100644
--- a/drivers/gpu/drm/msm/msm_debugfs.c
+++ b/drivers/gpu/drm/msm/msm_debugfs.c
@@ -29,6 +29,7 @@ struct msm_gpu_show_priv {
static int msm_gpu_show(struct seq_file *m, void *arg)
  {
+	struct drm_printer p = drm_seq_file_printer(m);
  	struct msm_gpu_show_priv *show_priv = m->private;
  	struct msm_drm_private *priv = show_priv->dev->dev_private;
  	struct msm_gpu *gpu = priv->gpu;
@@ -38,8 +39,8 @@ static int msm_gpu_show(struct seq_file *m, void *arg)
  	if (ret)
  		return ret;
- seq_printf(m, "%s Status:\n", gpu->name);
-	gpu->funcs->show(gpu, show_priv->state, m);
+	drm_printf(&p, "%s Status:\n", gpu->name);
+	gpu->funcs->show(gpu, show_priv->state, &p);
mutex_unlock(&show_priv->dev->struct_mutex); diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 2ca354047250..f36b415e123b 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -20,10 +20,11 @@
  #include "msm_mmu.h"
  #include "msm_fence.h"
+#include <generated/utsrelease.h>
  #include <linux/string_helpers.h>
  #include <linux/pm_opp.h>
  #include <linux/devfreq.h>
-
+#include <linux/devcoredump.h>
/*
   * Power Management:
@@ -273,6 +274,83 @@ int msm_gpu_hw_init(struct msm_gpu *gpu)
  	return ret;
  }
+#ifdef CONFIG_DEV_COREDUMP
+static ssize_t msm_gpu_devcoredump_read(char *buffer, loff_t offset,
+		size_t count, void *data, size_t datalen)
+{
+	struct msm_gpu *gpu = data;
+	struct drm_print_iterator iter;
+	struct drm_printer p;
+	struct msm_gpu_state *state;
+
+	state = msm_gpu_crashstate_get(gpu);
+	if (!state)
+		return 0;
+
+	iter.data = buffer;
+	iter.offset = 0;
+	iter.start = offset;
+	iter.remain = count;

I think I understood how this works. So this read could get called many times before we are done reading and every time we start from scratch. Why don't we copy everything to a buffer before hand(right when we capture the crash state) and then use the buffer here for copying to the user? We do get the offset to the buffer as input. This will probably save us from starting from the first print every time?
+
+	p = drm_coredump_printer(&iter);
+
+	drm_printf(&p, "---\n");
+	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
+	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
+	drm_printf(&p, "time: %ld.%ld\n",
+		state->time.tv_sec, state->time.tv_usec);
+	if (state->comm)
+		drm_printf(&p, "comm: %s\n", state->comm);
+	if (state->cmd)
+		drm_printf(&p, "cmdline: %s\n", state->cmd);
+
+	gpu->funcs->show(gpu, state, &p);
+
+	msm_gpu_crashstate_put(gpu);
+
+	return count - iter.remain;
+}
+
+static void msm_gpu_devcoredump_free(void *data)
+{
+	struct msm_gpu *gpu = data;
+
+	msm_gpu_crashstate_put(gpu);
+}
+
+static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, char *comm,
+		char *cmd)
+{
+	struct msm_gpu_state *state;
+
+	/* Only save one crash state at a a time */
remove extra 'a'
+	if (gpu->crashstate)
+		return;
+
+	state = gpu->funcs->gpu_state_get(gpu);
GPU might enter slumber by this time, so we may have to get/put sync run time pm ops here?
+	if (IS_ERR_OR_NULL(state))
+		return;
+
+	/* Fill in the additional crash state information */
+	state->comm = kstrdup(comm, GFP_KERNEL);
+	state->cmd = kstrdup(cmd, GFP_KERNEL);
+
+	kref_init(&state->ref);
This is not needed, adreno_gpu_state_get() takes care of this
+
+	/* Set the active crash state to be dumped on failure */
+	gpu->crashstate = state;
+
+	/* FIXME: Release the crashstate if this errors out? */
+	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
+		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
+}
+#else
+static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, char *comm,
+		char *cmd)
+{
+}
+#endif
+
  /*
   * Hangcheck detection for locked gpu:
   */
@@ -356,6 +434,9 @@ static void recover_worker(struct work_struct *work)
  			msm_rd_dump_submit(priv->hangrd, submit, NULL);
  	}
+ /* Record the crash state */
+	msm_gpu_crashstate_capture(gpu, comm, cmd);
+
  	kfree(cmd);
  	kfree(comm);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 470f3bb5f834..e65f507954c0 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -66,13 +66,13 @@ struct msm_gpu_funcs {
  #ifdef CONFIG_DEBUG_FS
  	/* show GPU status in debugfs: */
  	void (*show)(struct msm_gpu *gpu, struct msm_gpu_state *state,
-			struct seq_file *m);
+			struct drm_printer *p);
  	/* for generation specific debugfs: */
  	int (*debugfs_init)(struct msm_gpu *gpu, struct drm_minor *minor);
  #endif
  	int (*gpu_busy)(struct msm_gpu *gpu, uint64_t *value);
  	struct msm_gpu_state *(*gpu_state_get)(struct msm_gpu *gpu);
-	void (*gpu_state_put)(struct msm_gpu_state *state);
+	int (*gpu_state_put)(struct msm_gpu_state *state);
  };
struct msm_gpu {
@@ -133,6 +133,8 @@ struct msm_gpu {
  		u64 busy_cycles;
  		ktime_t time;
  	} devfreq;
+
+	struct msm_gpu_state *crashstate;
  };
/* It turns out that all targets use the same ringbuffer size */
@@ -180,6 +182,7 @@ struct msm_gpu_submitqueue {
  };
struct msm_gpu_state {
+	struct kref ref;
  	struct timeval time;
struct {
@@ -193,6 +196,9 @@ struct msm_gpu_state {
  	u32 *registers;
u32 rbbm_status;
+
+	char *comm;
+	char *cmd;
  };
static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
@@ -274,4 +280,32 @@ static inline void msm_submitqueue_put(struct msm_gpu_submitqueue *queue)
  		kref_put(&queue->ref, msm_submitqueue_destroy);
  }
+static inline struct msm_gpu_state *msm_gpu_crashstate_get(struct msm_gpu *gpu)
+{
+	struct msm_gpu_state *state = NULL;
+
+	mutex_lock(&gpu->dev->struct_mutex);
+
+	if (gpu->crashstate) {
+		kref_get(&gpu->crashstate->ref);
+		state = gpu->crashstate;
+	}
+
+	mutex_unlock(&gpu->dev->struct_mutex);
+
+	return state;
+}
+
+static inline void msm_gpu_crashstate_put(struct msm_gpu *gpu)
+{
+	mutex_lock(&gpu->dev->struct_mutex);
+
+	if (gpu->crashstate) {
+		if (gpu->funcs->gpu_state_put(gpu->crashstate))
+			gpu->crashstate = NULL;
+	}
+
+	mutex_unlock(&gpu->dev->struct_mutex);
+}
+
  #endif /* __MSM_GPU_H__ */


--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
Linux Foundation Collaborative Project
_______________________________________________
dri-devel mailing list
dri-devel@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/dri-devel




[Index of Archives]     [Linux DRI Users]     [Linux Intel Graphics]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [XFree86]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux