Capture the GPU state on a GPU hang and store it for later playback using the 'crash' node in the debugfs directory. Only one crash state is stored at a time on the assumption that the first hang is usually the most interesting. The existing crash state can be cleared by writing to the debugfs node and then a new one will be captured on the next hang. Signed-off-by: Jordan Crouse <jcrouse@xxxxxxxxxxxxxx> --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 18 ++++++++-- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 2 +- drivers/gpu/drm/msm/msm_debugfs.c | 61 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/msm/msm_gpu.c | 47 ++++++++++++++++++++----- drivers/gpu/drm/msm/msm_gpu.h | 36 ++++++++++++++++++- 5 files changed, 151 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 81da214..963fce3 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -372,6 +372,8 @@ struct msm_gpu_state *adreno_gpu_state_get(struct msm_gpu *gpu) if (!state) return ERR_PTR(-ENOMEM); + kref_init(&state->ref); + do_gettimeofday(&state->time); for (i = 0; i < gpu->nr_rings; i++) { @@ -407,15 +409,25 @@ struct msm_gpu_state *adreno_gpu_state_get(struct msm_gpu *gpu) return state; } -void adreno_gpu_state_put(struct msm_gpu_state *state) +static void adreno_gpu_state_destroy(struct kref *kref) { - if (IS_ERR_OR_NULL(state)) - return; + struct msm_gpu_state *state = container_of(kref, + struct msm_gpu_state, ref); + kfree(state->comm); + kfree(state->cmd); kfree(state->registers); kfree(state); } +int adreno_gpu_state_put(struct msm_gpu_state *state) +{ + if (IS_ERR_OR_NULL(state)) + return 1; + + return kref_put(&state->ref, adreno_gpu_state_destroy); +} + #ifdef CONFIG_DEBUG_FS void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state, struct seq_file *m) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index 
b44e0b9..bcf755e 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -221,7 +221,7 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, void adreno_gpu_cleanup(struct adreno_gpu *gpu); struct msm_gpu_state *adreno_gpu_state_get(struct msm_gpu *gpu); -void adreno_gpu_state_put(struct msm_gpu_state *state); +int adreno_gpu_state_put(struct msm_gpu_state *state); /* ringbuffer helpers (the parts that are adreno specific) */ diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c index 89ee74b..50e049c 100644 --- a/drivers/gpu/drm/msm/msm_debugfs.c +++ b/drivers/gpu/drm/msm/msm_debugfs.c @@ -16,11 +16,69 @@ */ #ifdef CONFIG_DEBUG_FS + +#include <generated/utsrelease.h> +#include <linux/debugfs.h> #include "msm_drv.h" #include "msm_gpu.h" #include "msm_kms.h" #include "msm_debugfs.h" +static int msm_gpu_crash_show(struct seq_file *m, void *data) +{ + struct msm_gpu *gpu = m->private; + struct msm_gpu_state *state; + + state = msm_gpu_crashstate_get(gpu); + if (!state) + return 0; + + seq_printf(m, "%s Crash Status:\n", gpu->name); + seq_puts(m, "Kernel: " UTS_RELEASE "\n"); + seq_printf(m, "Time: %ld s %ld us\n", + state->time.tv_sec, state->time.tv_usec); + if (state->comm) + seq_printf(m, "comm: %s\n", state->comm); + if (state->cmd) + seq_printf(m, "cmdline: %s\n", state->cmd); + + gpu->funcs->show(gpu, state, m); + + msm_gpu_crashstate_put(gpu); + + return 0; +} + +static ssize_t msm_gpu_crash_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + struct msm_gpu *gpu = ((struct seq_file *)file->private_data)->private; + + dev_err(gpu->dev->dev, "Releasing the GPU crash state\n"); + msm_gpu_crashstate_put(gpu); + + return count; +} + +static int msm_gpu_crash_open(struct inode *inode, struct file *file) +{ + struct msm_drm_private *priv = inode->i_private; + + if (!priv->gpu) + return -ENODEV; + + return single_open(file, 
msm_gpu_crash_show, priv->gpu); +} + +static const struct file_operations msm_gpu_crash_fops = { + .owner = THIS_MODULE, + .open = msm_gpu_crash_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = msm_gpu_crash_write, +}; + static int msm_gpu_show(struct drm_device *dev, struct seq_file *m) { struct msm_drm_private *priv = dev->dev_private; @@ -170,6 +228,9 @@ int msm_debugfs_init(struct drm_minor *minor) return ret; } + debugfs_create_file("crash", 0644, minor->debugfs_root, + priv, &msm_gpu_crash_fops); + if (priv->kms->funcs->debugfs_init) ret = priv->kms->funcs->debugfs_init(priv->kms, minor); diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index bd376f9..f8dff90 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -273,6 +273,30 @@ int msm_gpu_hw_init(struct msm_gpu *gpu) return ret; } +static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, char *comm, + char *cmd) +{ + struct msm_gpu_state *state; + + /* Only save one crash state at a time */ + if (gpu->crashstate) + return; + + state = gpu->funcs->gpu_state_get(gpu); + if (IS_ERR_OR_NULL(state)) + return; + + /* Fill in the additional crash state information */ + state->comm = kstrdup(comm, GFP_KERNEL); + state->cmd = kstrdup(cmd, GFP_KERNEL); + + kref_get(&state->ref); + + /* Set the active crash state to be dumped on failure */ + gpu->crashstate = state; +} + + /* * Hangcheck detection for locked gpu: */ @@ -314,6 +338,7 @@ static void recover_worker(struct work_struct *work) struct msm_drm_private *priv = dev->dev_private; struct msm_gem_submit *submit; struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu); + char *comm = NULL, *cmd = NULL; int i; mutex_lock(&dev->struct_mutex); @@ -326,8 +351,9 @@ static void recover_worker(struct work_struct *work) rcu_read_lock(); task = pid_task(submit->pid, PIDTYPE_PID); + if (task) { - char *cmd; + comm = kstrdup(task->comm, GFP_KERNEL); /* * So slightly 
annoying, in other paths like @@ -342,20 +368,25 @@ static void recover_worker(struct work_struct *work) mutex_unlock(&dev->struct_mutex); cmd = kstrdup_quotable_cmdline(task, GFP_KERNEL); mutex_lock(&dev->struct_mutex); + } + + rcu_read_unlock(); + if (comm && cmd) { dev_err(dev->dev, "%s: offending task: %s (%s)\n", - gpu->name, task->comm, cmd); + gpu->name, comm, cmd); msm_rd_dump_submit(priv->hangrd, submit, - "offending task: %s (%s)", task->comm, cmd); - - kfree(cmd); - } else { + "offending task: %s (%s)", comm, cmd); + } else msm_rd_dump_submit(priv->hangrd, submit, NULL); - } - rcu_read_unlock(); } + /* Record the crash state */ + msm_gpu_crashstate_capture(gpu, comm, cmd); + + kfree(cmd); + kfree(comm); /* * Update all the rings with the latest and greatest fence.. this diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 42853e9..23e3b06 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -70,7 +70,7 @@ struct msm_gpu_funcs { #endif int (*gpu_busy)(struct msm_gpu *gpu, uint64_t *value); struct msm_gpu_state *(*gpu_state_get)(struct msm_gpu *gpu); - void (*gpu_state_put)(struct msm_gpu_state *state); + int (*gpu_state_put)(struct msm_gpu_state *state); }; struct msm_gpu { @@ -131,6 +131,8 @@ struct msm_gpu { u64 busy_cycles; ktime_t time; } devfreq; + + struct msm_gpu_state *crashstate; }; /* It turns out that all targets use the same ringbuffer size */ @@ -178,6 +180,7 @@ struct msm_gpu_submitqueue { }; struct msm_gpu_state { + struct kref ref; struct timeval time; struct { @@ -191,6 +194,9 @@ struct msm_gpu_state { u32 *registers; u32 rbbm_status; + + char *comm; + char *cmd; }; static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data) @@ -272,4 +278,32 @@ static inline void msm_submitqueue_put(struct msm_gpu_submitqueue *queue) kref_put(&queue->ref, msm_submitqueue_destroy); } +static inline struct msm_gpu_state *msm_gpu_crashstate_get(struct msm_gpu *gpu) +{ + struct msm_gpu_state 
*state = NULL; + + mutex_lock(&gpu->dev->struct_mutex); + + if (gpu->crashstate) { + kref_get(&gpu->crashstate->ref); + state = gpu->crashstate; + } + + mutex_unlock(&gpu->dev->struct_mutex); + + return state; +} + +static inline void msm_gpu_crashstate_put(struct msm_gpu *gpu) +{ + mutex_lock(&gpu->dev->struct_mutex); + + if (gpu->crashstate) { + if (gpu->funcs->gpu_state_put(gpu->crashstate)) + gpu->crashstate = NULL; + } + + mutex_unlock(&gpu->dev->struct_mutex); +} + #endif /* __MSM_GPU_H__ */ -- 1.9.1 _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/dri-devel