Once upon a time before we had automated GPU state capture upon hangs, we had intel_gpu_dump. Now we come almost full circle and reinstate that view of the current GPU queues and registers by using the error capture facility to snapshot the GPU state when debugfs/.../i915_gpu_info is opened - which should provide useful debugging both for the error capture routines (without having to cause a hang, and avoiding the error state being eaten by igt) and in general. Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_debugfs.c | 113 +++++++++++++++-------------- drivers/gpu/drm/i915/i915_drv.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 43 +++++++---- drivers/gpu/drm/i915/i915_gpu_error.c | 131 ++++++++++++++++++---------------- drivers/gpu/drm/i915/i915_sysfs.c | 26 +++---- drivers/gpu/drm/i915/intel_display.c | 2 +- 6 files changed, 172 insertions(+), 145 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 1ccc2978a21a..e7d3eb93848e 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -976,89 +976,93 @@ static int i915_gem_fence_regs_info(struct seq_file *m, void *data) } #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) - -static ssize_t -i915_error_state_write(struct file *filp, - const char __user *ubuf, - size_t cnt, - loff_t *ppos) +static ssize_t error_state_read(struct file *file, char __user *ubuf, + size_t count, loff_t *pos) { - struct i915_error_state_file_priv *error_priv = filp->private_data; + struct drm_i915_error_state *error = file->private_data; + struct drm_i915_error_state_buf str; + ssize_t ret; + loff_t tmp; - DRM_DEBUG_DRIVER("Resetting error state\n"); - i915_destroy_error_state(error_priv->i915); - - return cnt; -} - -static int i915_error_state_open(struct inode *inode, struct file *file) -{ - struct drm_i915_private *dev_priv = inode->i_private; - struct i915_error_state_file_priv 
*error_priv; + if (!error) + return 0; - error_priv = kzalloc(sizeof(*error_priv), GFP_KERNEL); - if (!error_priv) - return -ENOMEM; + ret = i915_error_state_buf_init(&str, error->i915, count, *pos); + if (ret) + return ret; - error_priv->i915 = dev_priv; + ret = i915_error_state_to_str(&str, error); + if (ret) + goto out; - i915_error_state_get(&dev_priv->drm, error_priv); + tmp = 0; + ret = simple_read_from_buffer(ubuf, count, &tmp, str.buf, str.bytes); + if (ret < 0) + goto out; - file->private_data = error_priv; + *pos = str.start + ret; +out: + i915_error_state_buf_release(&str); + return ret; +} +static int error_state_release(struct inode *inode, struct file *file) +{ + i915_error_state_put(file->private_data); return 0; } -static int i915_error_state_release(struct inode *inode, struct file *file) +static int i915_gpu_info_open(struct inode *inode, struct file *file) { - struct i915_error_state_file_priv *error_priv = file->private_data; + struct drm_i915_error_state *error; - i915_error_state_put(error_priv); - kfree(error_priv); + error = i915_error_state(inode->i_private); + if (!error) + return -ENOMEM; + file->private_data = error; return 0; } -static ssize_t i915_error_state_read(struct file *file, char __user *userbuf, - size_t count, loff_t *pos) +static const struct file_operations i915_gpu_info_fops = { + .owner = THIS_MODULE, + .open = i915_gpu_info_open, + .read = error_state_read, + .llseek = default_llseek, + .release = error_state_release, +}; + +static ssize_t +i915_error_state_write(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) { - struct i915_error_state_file_priv *error_priv = file->private_data; - struct drm_i915_error_state_buf error_str; - loff_t tmp_pos = 0; - ssize_t ret_count = 0; - int ret; + struct drm_i915_error_state *error = filp->private_data; - ret = i915_error_state_buf_init(&error_str, error_priv->i915, - count, *pos); - if (ret) - return ret; + if (!error) + return 0; - ret = 
i915_error_state_to_str(&error_str, error_priv); - if (ret) - goto out; + DRM_DEBUG_DRIVER("Resetting error state\n"); + i915_reset_error_state(error->i915); - ret_count = simple_read_from_buffer(userbuf, count, &tmp_pos, - error_str.buf, - error_str.bytes); + return cnt; +} - if (ret_count < 0) - ret = ret_count; - else - *pos = error_str.start + ret_count; -out: - i915_error_state_buf_release(&error_str); - return ret ?: ret_count; +static int i915_error_state_open(struct inode *inode, struct file *file) +{ + file->private_data = i915_first_error_state(inode->i_private); + return 0; } static const struct file_operations i915_error_state_fops = { .owner = THIS_MODULE, .open = i915_error_state_open, - .read = i915_error_state_read, + .read = error_state_read, .write = i915_error_state_write, .llseek = default_llseek, - .release = i915_error_state_release, + .release = error_state_release, }; - #endif static int @@ -4724,6 +4728,7 @@ static const struct i915_debugfs_files { {"i915_gem_drop_caches", &i915_drop_caches_fops}, #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) {"i915_error_state", &i915_error_state_fops}, + {"i915_gpu_info", &i915_gpu_info_fops}, #endif {"i915_next_seqno", &i915_next_seqno_fops}, {"i915_display_crc_ctl", &i915_display_crc_ctl_fops}, diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 0aa4ac2b43ca..eb21554453e2 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1364,7 +1364,7 @@ void i915_driver_unload(struct drm_device *dev) /* Free error state after interrupts are fully disabled. */ cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work); - i915_destroy_error_state(dev_priv); + i915_reset_error_state(dev_priv); /* Flush any outstanding unpin_work. 
*/ drain_workqueue(dev_priv->wq); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 58bb5ca0e8e3..621ab33af94a 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -913,7 +913,7 @@ struct drm_i915_error_state { u32 eir; u32 pgtbl_er; u32 ier; - u32 gtier[4]; + u32 gtier[4], ngtier; u32 ccid; u32 derrmr; u32 forcewake; @@ -927,6 +927,7 @@ struct drm_i915_error_state { u32 gab_ctl; u32 gfx_mode; + u32 nfence; u64 fence[I915_MAX_NUM_FENCES]; struct intel_overlay_error_state *overlay; struct intel_display_error_state *display; @@ -1508,11 +1509,6 @@ struct drm_i915_error_state_buf { loff_t pos; }; -struct i915_error_state_file_priv { - struct drm_i915_private *i915; - struct drm_i915_error_state *error; -}; - #define I915_RESET_TIMEOUT (10 * HZ) /* 10s */ #define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */ @@ -3574,7 +3570,7 @@ static inline void intel_display_crc_init(struct drm_i915_private *dev_priv) {} __printf(2, 3) void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...); int i915_error_state_to_str(struct drm_i915_error_state_buf *estr, - const struct i915_error_state_file_priv *error); + const struct drm_i915_error_state *error); int i915_error_state_buf_init(struct drm_i915_error_state_buf *eb, struct drm_i915_private *i915, size_t count, loff_t pos); @@ -3583,13 +3579,29 @@ static inline void i915_error_state_buf_release( { kfree(eb->buf); } + +struct drm_i915_error_state *i915_error_state(struct drm_i915_private *i915); void i915_capture_error_state(struct drm_i915_private *dev_priv, u32 engine_mask, const char *error_msg); -void i915_error_state_get(struct drm_device *dev, - struct i915_error_state_file_priv *error_priv); -void i915_error_state_put(struct i915_error_state_file_priv *error_priv); -void i915_destroy_error_state(struct drm_i915_private *dev_priv); + +static inline struct drm_i915_error_state * +i915_error_state_get(struct drm_i915_error_state *error) +{ + 
kref_get(&error->ref); + return error; +} + +void __i915_error_state_free(struct kref *error_ref); +static inline void i915_error_state_put(struct drm_i915_error_state *error) +{ + if (error) + kref_put(&error->ref, __i915_error_state_free); +} + +struct drm_i915_error_state * +i915_first_error_state(struct drm_i915_private *i915); +void i915_reset_error_state(struct drm_i915_private *i915); #else @@ -3599,7 +3611,13 @@ static inline void i915_capture_error_state(struct drm_i915_private *dev_priv, { } -static inline void i915_destroy_error_state(struct drm_i915_private *dev_priv) +static inline struct drm_i915_error_state * +i915_first_error_state(struct drm_i915_private *i915) +{ + return NULL; +} + +static inline void i915_reset_error_state(struct drm_i915_private *i915) { } @@ -3755,7 +3773,6 @@ extern void intel_overlay_print_error_state(struct drm_i915_error_state_buf *e, extern struct intel_display_error_state * intel_display_capture_error_state(struct drm_i915_private *dev_priv); extern void intel_display_print_error_state(struct drm_i915_error_state_buf *e, - struct drm_i915_private *dev_priv, struct intel_display_error_state *error); int sandybridge_pcode_read(struct drm_i915_private *dev_priv, u32 mbox, u32 *val); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index c28ccfef84ab..77008c42bed4 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -342,7 +342,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m, } static void error_print_instdone(struct drm_i915_error_state_buf *m, - struct drm_i915_error_engine *ee) + const struct drm_i915_error_engine *ee) { int slice; int subslice; @@ -372,7 +372,7 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m, static void error_print_request(struct drm_i915_error_state_buf *m, const char *prefix, - struct drm_i915_error_request *erq) + const struct drm_i915_error_request *erq) { if 
(!erq->seqno) return; @@ -386,7 +386,7 @@ static void error_print_request(struct drm_i915_error_state_buf *m, static void error_print_context(struct drm_i915_error_state_buf *m, const char *header, - struct drm_i915_error_context *ctx) + const struct drm_i915_error_context *ctx) { err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n", header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id, @@ -394,7 +394,7 @@ static void error_print_context(struct drm_i915_error_state_buf *m, } static void error_print_engine(struct drm_i915_error_state_buf *m, - struct drm_i915_error_engine *ee) + const struct drm_i915_error_engine *ee) { err_printf(m, "%s command stream:\n", engine_str(ee->engine_id)); err_printf(m, " START: 0x%08x\n", ee->start); @@ -569,21 +569,32 @@ static void err_print_params(struct drm_i915_error_state_buf *m, #undef PRINT } +static void err_print_pciid(struct drm_i915_error_state_buf *m, + struct drm_i915_private *i915) +{ + struct pci_dev *pdev = i915->drm.pdev; + + err_printf(m, "PCI ID: 0x%04x\n", pdev->device); + err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision); + err_printf(m, "PCI Subsystem: %04x:%04x\n", + pdev->subsystem_vendor, + pdev->subsystem_device); +} + int i915_error_state_to_str(struct drm_i915_error_state_buf *m, - const struct i915_error_state_file_priv *error_priv) + const struct drm_i915_error_state *error) { - struct drm_i915_private *dev_priv = error_priv->i915; - struct pci_dev *pdev = dev_priv->drm.pdev; - struct drm_i915_error_state *error = error_priv->error; + struct drm_i915_private *dev_priv = m->i915; struct drm_i915_error_object *obj; int i, j; if (!error) { - err_printf(m, "no error state collected\n"); - goto out; + err_printf(m, "No error state collected\n"); + return 0; } - err_printf(m, "%s\n", error->error_msg); + if (*error->error_msg) + err_printf(m, "%s\n", error->error_msg); err_printf(m, "Kernel: " UTS_RELEASE "\n"); err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec, 
error->time.tv_usec); @@ -605,11 +616,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, err_printf(m, "Reset count: %u\n", error->reset_count); err_printf(m, "Suspend count: %u\n", error->suspend_count); err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform)); - err_printf(m, "PCI ID: 0x%04x\n", pdev->device); - err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision); - err_printf(m, "PCI Subsystem: %04x:%04x\n", - pdev->subsystem_vendor, - pdev->subsystem_device); + err_print_pciid(m, error->i915); err_printf(m, "IOMMU enabled?: %d\n", error->iommu); @@ -625,19 +632,15 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, err_printf(m, "EIR: 0x%08x\n", error->eir); err_printf(m, "IER: 0x%08x\n", error->ier); - if (INTEL_GEN(dev_priv) >= 8) { - for (i = 0; i < 4; i++) - err_printf(m, "GTIER gt %d: 0x%08x\n", i, - error->gtier[i]); - } else if (HAS_PCH_SPLIT(dev_priv) || IS_VALLEYVIEW(dev_priv)) - err_printf(m, "GTIER: 0x%08x\n", error->gtier[0]); + for (i = 0; i < error->ngtier; i++) + err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]); err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er); err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); err_printf(m, "CCID: 0x%08x\n", error->ccid); err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings); - for (i = 0; i < dev_priv->num_fence_regs; i++) + for (i = 0; i < error->nfence; i++) err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); if (INTEL_GEN(dev_priv) >= 6) { @@ -686,7 +689,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, error->pinned_bo_count); for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - struct drm_i915_error_engine *ee = &error->engine[i]; + const struct drm_i915_error_engine *ee = &error->engine[i]; obj = ee->batchbuffer; if (obj) { @@ -751,12 +754,11 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, 
intel_overlay_print_error_state(m, error->overlay); if (error->display) - intel_display_print_error_state(m, dev_priv, error->display); + intel_display_print_error_state(m, error->display); err_print_capabilities(m, &error->device_info); err_print_params(m, &error->params); -out: if (m->bytes == 0 && m->err) return m->err; @@ -808,7 +810,7 @@ static void i915_error_object_free(struct drm_i915_error_object *obj) kfree(obj); } -static void i915_error_state_free(struct kref *error_ref) +void __i915_error_state_free(struct kref *error_ref) { struct drm_i915_error_state *error = container_of(error_ref, typeof(*error), ref); @@ -1005,16 +1007,17 @@ static void i915_gem_record_fences(struct drm_i915_private *dev_priv, { int i; - if (IS_GEN3(dev_priv) || IS_GEN2(dev_priv)) { + if (INTEL_GEN(dev_priv) >= 6) { for (i = 0; i < dev_priv->num_fence_regs; i++) - error->fence[i] = I915_READ(FENCE_REG(i)); - } else if (IS_GEN5(dev_priv) || IS_GEN4(dev_priv)) { + error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i)); + } else if (INTEL_GEN(dev_priv) >= 4) { for (i = 0; i < dev_priv->num_fence_regs; i++) error->fence[i] = I915_READ64(FENCE_REG_965_LO(i)); - } else if (INTEL_GEN(dev_priv) >= 6) { + } else { for (i = 0; i < dev_priv->num_fence_regs; i++) - error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i)); + error->fence[i] = I915_READ(FENCE_REG(i)); } + error->nfence = i; } static inline u32 @@ -1555,9 +1558,11 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv, error->ier = I915_READ(GEN8_DE_MISC_IER); for (i = 0; i < 4; i++) error->gtier[i] = I915_READ(GEN8_GT_IER(i)); + error->ngtier = 4; } else if (HAS_PCH_SPLIT(dev_priv)) { error->ier = I915_READ(DEIER); error->gtier[0] = I915_READ(GTIER); + error->ngtier = 1; } else if (IS_GEN2(dev_priv)) { error->ier = I915_READ16(IER); } else if (!IS_VALLEYVIEW(dev_priv)) { @@ -1637,6 +1642,23 @@ static int capture(void *data) #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) +struct drm_i915_error_state * 
+i915_error_state(struct drm_i915_private *i915) +{ + struct drm_i915_error_state *error; + + error = kzalloc(sizeof(*error), GFP_ATOMIC); + if (!error) + return NULL; + + kref_init(&error->ref); + error->i915 = i915; + + stop_machine(capture, error, NULL); + + return error; +} + /** * i915_capture_error_state - capture an error record for later analysis * @dev: drm device @@ -1660,18 +1682,12 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, if (READ_ONCE(dev_priv->gpu_error.first_error)) return; - /* Account for pipe specific data like PIPE*STAT */ - error = kzalloc(sizeof(*error), GFP_ATOMIC); + error = i915_error_state(dev_priv); if (!error) { DRM_DEBUG_DRIVER("out of memory, not capturing error state\n"); return; } - kref_init(&error->ref); - error->i915 = dev_priv; - - stop_machine(capture, error, NULL); - i915_error_capture_msg(dev_priv, error, engine_mask, error_msg); DRM_INFO("%s\n", error->error_msg); @@ -1685,7 +1701,7 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, } if (error) { - i915_error_state_free(&error->ref); + __i915_error_state_free(&error->ref); return; } @@ -1701,33 +1717,28 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, } } -void i915_error_state_get(struct drm_device *dev, - struct i915_error_state_file_priv *error_priv) +struct drm_i915_error_state * +i915_first_error_state(struct drm_i915_private *i915) { - struct drm_i915_private *dev_priv = to_i915(dev); + struct drm_i915_error_state *error; - spin_lock_irq(&dev_priv->gpu_error.lock); - error_priv->error = dev_priv->gpu_error.first_error; - if (error_priv->error) - kref_get(&error_priv->error->ref); - spin_unlock_irq(&dev_priv->gpu_error.lock); -} + spin_lock_irq(&i915->gpu_error.lock); + error = i915->gpu_error.first_error; + if (error) + i915_error_state_get(error); + spin_unlock_irq(&i915->gpu_error.lock); -void i915_error_state_put(struct i915_error_state_file_priv *error_priv) -{ - if (error_priv->error) - 
kref_put(&error_priv->error->ref, i915_error_state_free); + return error; } -void i915_destroy_error_state(struct drm_i915_private *dev_priv) +void i915_reset_error_state(struct drm_i915_private *i915) { struct drm_i915_error_state *error; - spin_lock_irq(&dev_priv->gpu_error.lock); - error = dev_priv->gpu_error.first_error; - dev_priv->gpu_error.first_error = NULL; - spin_unlock_irq(&dev_priv->gpu_error.lock); + spin_lock_irq(&i915->gpu_error.lock); + error = i915->gpu_error.first_error; + i915->gpu_error.first_error = NULL; + spin_unlock_irq(&i915->gpu_error.lock); - if (error) - kref_put(&error->ref, i915_error_state_free); + i915_error_state_put(error); } diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c index a721ff116101..5df052d1cbf3 100644 --- a/drivers/gpu/drm/i915/i915_sysfs.c +++ b/drivers/gpu/drm/i915/i915_sysfs.c @@ -522,33 +522,27 @@ static ssize_t error_state_read(struct file *filp, struct kobject *kobj, struct device *kdev = kobj_to_dev(kobj); struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev); - struct drm_device *dev = &dev_priv->drm; - struct i915_error_state_file_priv error_priv; struct drm_i915_error_state_buf error_str; - ssize_t ret_count = 0; - int ret; - - memset(&error_priv, 0, sizeof(error_priv)); + struct drm_i915_error_state *error; + ssize_t ret; - ret = i915_error_state_buf_init(&error_str, to_i915(dev), count, off); + ret = i915_error_state_buf_init(&error_str, dev_priv, count, off); if (ret) return ret; - error_priv.i915 = dev_priv; - i915_error_state_get(dev, &error_priv); - - ret = i915_error_state_to_str(&error_str, &error_priv); + error = i915_first_error_state(dev_priv); + ret = i915_error_state_to_str(&error_str, error); if (ret) goto out; - ret_count = count < error_str.bytes ? count : error_str.bytes; + ret = count < error_str.bytes ? 
count : error_str.bytes; + memcpy(buf, error_str.buf, ret); - memcpy(buf, error_str.buf, ret_count); out: - i915_error_state_put(&error_priv); + i915_error_state_put(error); i915_error_state_buf_release(&error_str); - return ret ?: ret_count; + return ret; } static ssize_t error_state_write(struct file *file, struct kobject *kobj, @@ -559,7 +553,7 @@ static ssize_t error_state_write(struct file *file, struct kobject *kobj, struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev); DRM_DEBUG_DRIVER("Resetting error state\n"); - i915_destroy_error_state(dev_priv); + i915_reset_error_state(dev_priv); return count; } diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index f6259c949da2..8b1b96bbd7c8 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -15856,9 +15856,9 @@ intel_display_capture_error_state(struct drm_i915_private *dev_priv) void intel_display_print_error_state(struct drm_i915_error_state_buf *m, - struct drm_i915_private *dev_priv, struct intel_display_error_state *error) { + struct drm_i915_private *dev_priv = m->i915; int i; if (!error) -- 2.11.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx