Once upon a time before we had automated GPU state capture upon hangs, we had intel_gpu_dump. Now we come almost full circle and reinstate that view of the current GPU queues and registers by using the error capture facility to snapshot the GPU state when debugfs/.../i915_gpu_info is opened - which should provide useful debugging to both the error capture routines (without having to cause a hang and avoid the error state being eaten by igt) and generally. Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_debugfs.c | 123 +++++++++++++++++++--------------- drivers/gpu/drm/i915/i915_drv.h | 26 ++++--- drivers/gpu/drm/i915/i915_gpu_error.c | 71 ++++++++------------ drivers/gpu/drm/i915/i915_sysfs.c | 31 +++++---- 4 files changed, 132 insertions(+), 119 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 3ae06568df7b..124a184389a8 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -954,89 +954,103 @@ static int i915_gem_fence_regs_info(struct seq_file *m, void *data) } #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) - -static ssize_t -i915_error_state_write(struct file *filp, - const char __user *ubuf, - size_t cnt, - loff_t *ppos) +static ssize_t error_state_read(struct file *file, char __user *ubuf, + size_t count, loff_t *pos) { - struct i915_error_state_file_priv *error_priv = filp->private_data; - - DRM_DEBUG_DRIVER("Resetting error state\n"); - i915_destroy_error_state(error_priv->i915); - - return cnt; -} + struct drm_i915_error_state *error = file->private_data; + struct drm_i915_error_state_buf str; + ssize_t ret; + loff_t tmp; -static int i915_error_state_open(struct inode *inode, struct file *file) -{ - struct drm_i915_private *dev_priv = inode->i_private; - struct i915_error_state_file_priv *error_priv; + if (!error) + return 0; - error_priv = kzalloc(sizeof(*error_priv), GFP_KERNEL); - 
if (!error_priv) - return -ENOMEM; + ret = i915_error_state_buf_init(&str, error->i915, count, *pos); + if (ret) + return ret; - error_priv->i915 = dev_priv; + ret = i915_error_state_to_str(&str, error); + if (ret) + goto out; - i915_error_state_get(&dev_priv->drm, error_priv); + tmp = 0; + ret = simple_read_from_buffer(ubuf, count, &tmp, str.buf, str.bytes); + if (ret < 0) + goto out; - file->private_data = error_priv; + *pos = str.start + ret; +out: + i915_error_state_buf_release(&str); + return ret; +} +static int error_state_release(struct inode *inode, struct file *file) +{ + i915_error_state_put(file->private_data); return 0; } -static int i915_error_state_release(struct inode *inode, struct file *file) +static int i915_gpu_info_open(struct inode *inode, struct file *file) { - struct i915_error_state_file_priv *error_priv = file->private_data; + struct drm_i915_error_state *error; - i915_error_state_put(error_priv); - kfree(error_priv); + error = i915_error_state(inode->i_private); + if (!error) + return -ENOMEM; + file->private_data = error; return 0; } -static ssize_t i915_error_state_read(struct file *file, char __user *userbuf, - size_t count, loff_t *pos) +static const struct file_operations i915_gpu_info_fops = { + .owner = THIS_MODULE, + .open = i915_gpu_info_open, + .read = error_state_read, + .llseek = default_llseek, + .release = error_state_release, +}; + +static ssize_t +i915_error_state_write(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) { - struct i915_error_state_file_priv *error_priv = file->private_data; - struct drm_i915_error_state_buf error_str; - loff_t tmp_pos = 0; - ssize_t ret_count = 0; - int ret; + struct drm_i915_error_state *error = filp->private_data; - ret = i915_error_state_buf_init(&error_str, error_priv->i915, - count, *pos); - if (ret) - return ret; + if (!error) + return 0; - ret = i915_error_state_to_str(&error_str, error_priv); - if (ret) - goto out; + DRM_DEBUG_DRIVER("Resetting error 
state\n"); + i915_destroy_error_state(error->i915); - ret_count = simple_read_from_buffer(userbuf, count, &tmp_pos, - error_str.buf, - error_str.bytes); + return cnt; +} - if (ret_count < 0) - ret = ret_count; - else - *pos = error_str.start + ret_count; -out: - i915_error_state_buf_release(&error_str); - return ret ?: ret_count; +static int i915_error_state_open(struct inode *inode, struct file *file) +{ + struct drm_i915_private *i915 = inode->i_private; + struct drm_i915_error_state *error; + + spin_lock_irq(&i915->gpu_error.lock); + error = i915->gpu_error.first_error; + if (error) + i915_error_state_get(error); + spin_unlock_irq(&i915->gpu_error.lock); + + file->private_data = error; + + return 0; } static const struct file_operations i915_error_state_fops = { .owner = THIS_MODULE, .open = i915_error_state_open, - .read = i915_error_state_read, + .read = error_state_read, .write = i915_error_state_write, .llseek = default_llseek, - .release = i915_error_state_release, + .release = error_state_release, }; - #endif static int @@ -4707,6 +4721,7 @@ static const struct i915_debugfs_files { {"i915_gem_drop_caches", &i915_drop_caches_fops}, #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) {"i915_error_state", &i915_error_state_fops}, + {"i915_gpu_info", &i915_gpu_info_fops}, #endif {"i915_next_seqno", &i915_next_seqno_fops}, {"i915_display_crc_ctl", &i915_display_crc_ctl_fops}, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 27e74322ca70..cb688918a913 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1504,11 +1504,6 @@ struct drm_i915_error_state_buf { loff_t pos; }; -struct i915_error_state_file_priv { - struct drm_i915_private *i915; - struct drm_i915_error_state *error; -}; - #define I915_RESET_TIMEOUT (10 * HZ) /* 10s */ #define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */ @@ -3557,7 +3552,7 @@ static inline void intel_display_crc_init(struct drm_i915_private *dev_priv) {} __printf(2, 3) void 
i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...); int i915_error_state_to_str(struct drm_i915_error_state_buf *estr, - const struct i915_error_state_file_priv *error); + const struct drm_i915_error_state *error); int i915_error_state_buf_init(struct drm_i915_error_state_buf *eb, struct drm_i915_private *i915, size_t count, loff_t pos); @@ -3566,12 +3561,25 @@ static inline void i915_error_state_buf_release( { kfree(eb->buf); } +struct drm_i915_error_state *i915_error_state(struct drm_i915_private *i915); void i915_capture_error_state(struct drm_i915_private *dev_priv, u32 engine_mask, const char *error_msg); -void i915_error_state_get(struct drm_device *dev, - struct i915_error_state_file_priv *error_priv); -void i915_error_state_put(struct i915_error_state_file_priv *error_priv); + +static inline struct drm_i915_error_state * +i915_error_state_get(struct drm_i915_error_state *error) +{ + kref_get(&error->ref); + return error; +} + +void __i915_error_state_free(struct kref *error_ref); +static inline void i915_error_state_put(struct drm_i915_error_state *error) +{ + if (error) + kref_put(&error->ref, __i915_error_state_free); +} + void i915_destroy_error_state(struct drm_i915_private *dev_priv); #else diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 5283fe815a4d..f8f6dad00d14 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -342,7 +342,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m, } static void error_print_instdone(struct drm_i915_error_state_buf *m, - struct drm_i915_error_engine *ee) + const struct drm_i915_error_engine *ee) { int slice; int subslice; @@ -372,7 +372,7 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m, static void error_print_request(struct drm_i915_error_state_buf *m, const char *prefix, - struct drm_i915_error_request *erq) + const struct drm_i915_error_request *erq) { if 
(!erq->seqno) return; @@ -386,7 +386,7 @@ static void error_print_request(struct drm_i915_error_state_buf *m, static void error_print_context(struct drm_i915_error_state_buf *m, const char *header, - struct drm_i915_error_context *ctx) + const struct drm_i915_error_context *ctx) { err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n", header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id, @@ -394,7 +394,7 @@ static void error_print_context(struct drm_i915_error_state_buf *m, } static void error_print_engine(struct drm_i915_error_state_buf *m, - struct drm_i915_error_engine *ee) + const struct drm_i915_error_engine *ee) { err_printf(m, "%s command stream:\n", engine_str(ee->engine_id)); err_printf(m, " START: 0x%08x\n", ee->start); @@ -547,19 +547,13 @@ static void err_print_capabilities(struct drm_i915_error_state_buf *m, } int i915_error_state_to_str(struct drm_i915_error_state_buf *m, - const struct i915_error_state_file_priv *error_priv) + const struct drm_i915_error_state *error) { - struct drm_i915_private *dev_priv = error_priv->i915; + struct drm_i915_private *dev_priv = error->i915; struct pci_dev *pdev = dev_priv->drm.pdev; - struct drm_i915_error_state *error = error_priv->error; struct drm_i915_error_object *obj; int i, j; - if (!error) { - err_printf(m, "no error state collected\n"); - goto out; - } - err_printf(m, "%s\n", error->error_msg); err_printf(m, "Kernel: " UTS_RELEASE "\n"); err_printf(m, "Time: %ld s %ld us\n", @@ -663,7 +657,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, error->pinned_bo_count); for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - struct drm_i915_error_engine *ee = &error->engine[i]; + const struct drm_i915_error_engine *ee = &error->engine[i]; obj = ee->batchbuffer; if (obj) { @@ -730,7 +724,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, if (error->display) intel_display_print_error_state(m, dev_priv, error->display); -out: if (m->bytes == 0 && m->err) 
return m->err; @@ -782,7 +775,7 @@ static void i915_error_object_free(struct drm_i915_error_object *obj) kfree(obj); } -static void i915_error_state_free(struct kref *error_ref) +void __i915_error_state_free(struct kref *error_ref) { struct drm_i915_error_state *error = container_of(error_ref, typeof(*error), ref); @@ -1609,6 +1602,23 @@ static int capture(void *data) #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) +struct drm_i915_error_state * +i915_error_state(struct drm_i915_private *i915) +{ + struct drm_i915_error_state *error; + + error = kzalloc(sizeof(*error), GFP_ATOMIC); + if (!error) + return NULL; + + kref_init(&error->ref); + error->i915 = i915; + + stop_machine(capture, error, NULL); + + return error; +} + /** * i915_capture_error_state - capture an error record for later analysis * @dev: drm device @@ -1632,18 +1642,12 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, if (READ_ONCE(dev_priv->gpu_error.first_error)) return; - /* Account for pipe specific data like PIPE*STAT */ - error = kzalloc(sizeof(*error), GFP_ATOMIC); + error = i915_error_state(dev_priv); if (!error) { DRM_DEBUG_DRIVER("out of memory, not capturing error state\n"); return; } - kref_init(&error->ref); - error->i915 = dev_priv; - - stop_machine(capture, error, NULL); - i915_error_capture_msg(dev_priv, error, engine_mask, error_msg); DRM_INFO("%s\n", error->error_msg); @@ -1657,7 +1661,7 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, } if (error) { - i915_error_state_free(&error->ref); + __i915_error_state_free(&error->ref); return; } @@ -1673,24 +1677,6 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, } } -void i915_error_state_get(struct drm_device *dev, - struct i915_error_state_file_priv *error_priv) -{ - struct drm_i915_private *dev_priv = to_i915(dev); - - spin_lock_irq(&dev_priv->gpu_error.lock); - error_priv->error = dev_priv->gpu_error.first_error; - if (error_priv->error) - kref_get(&error_priv->error->ref); - 
spin_unlock_irq(&dev_priv->gpu_error.lock); -} - -void i915_error_state_put(struct i915_error_state_file_priv *error_priv) -{ - if (error_priv->error) - kref_put(&error_priv->error->ref, i915_error_state_free); -} - void i915_destroy_error_state(struct drm_i915_private *dev_priv) { struct drm_i915_error_state *error; @@ -1700,6 +1686,5 @@ void i915_destroy_error_state(struct drm_i915_private *dev_priv) dev_priv->gpu_error.first_error = NULL; spin_unlock_irq(&dev_priv->gpu_error.lock); - if (error) - kref_put(&error->ref, i915_error_state_free); + i915_error_state_put(error); } diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c index a721ff116101..3f1a825a9f24 100644 --- a/drivers/gpu/drm/i915/i915_sysfs.c +++ b/drivers/gpu/drm/i915/i915_sysfs.c @@ -522,30 +522,35 @@ static ssize_t error_state_read(struct file *filp, struct kobject *kobj, struct device *kdev = kobj_to_dev(kobj); struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev); - struct drm_device *dev = &dev_priv->drm; - struct i915_error_state_file_priv error_priv; struct drm_i915_error_state_buf error_str; - ssize_t ret_count = 0; + struct drm_i915_error_state *error; + ssize_t ret_count; int ret; - memset(&error_priv, 0, sizeof(error_priv)); - - ret = i915_error_state_buf_init(&error_str, to_i915(dev), count, off); + ret = i915_error_state_buf_init(&error_str, dev_priv, count, off); if (ret) return ret; - error_priv.i915 = dev_priv; - i915_error_state_get(dev, &error_priv); + spin_lock_irq(&dev_priv->gpu_error.lock); + error = dev_priv->gpu_error.first_error; + if (error) + i915_error_state_get(error); + spin_unlock_irq(&dev_priv->gpu_error.lock); - ret = i915_error_state_to_str(&error_str, &error_priv); - if (ret) - goto out; + if (error) { + ret = i915_error_state_to_str(&error_str, error); + if (ret) + goto out; + } else { + error_str.bytes = strlen("No error state collected\n"); + strcpy(error_str.buf, "No error state collected\n"); + } ret_count = count < 
error_str.bytes ? count : error_str.bytes; - memcpy(buf, error_str.buf, ret_count); + out: - i915_error_state_put(&error_priv); + i915_error_state_put(error); i915_error_state_buf_release(&error_str); return ret ?: ret_count; -- 2.11.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx