On Mon, Feb 10, 2014 at 04:30:51PM +0200, Mika Kuoppala wrote: > We capture error state not only when the GPU hangs but > also on other situations as in interrupt errors and > in situations where we can kick things forward without GPU reset. > There will be log entry on most of these cases. But as error state > capture might be only thing we have, if dmesg was not captured. Or as > in GEN4 case, interrupt error can trigger error state capture without log > entry, the exact reason why capture was made is hard to decipher. > > To avoid confusion why the error state was captured, print reason > along with error code into log and also store it into the error state. > I could have done with a better commit subject and message. We already capture error state, so "Record error state capture" is a bit weird. Also, I think the commit message glosses over the main point of why these cases are special as opposed to hangcheck. > References: https://bugs.freedesktop.org/show_bug.cgi?id=74193 > Signed-off-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 5 +- > drivers/gpu/drm/i915/i915_drv.h | 9 +++- > drivers/gpu/drm/i915/i915_gpu_error.c | 84 +++++++++++++++++++++++---------- > drivers/gpu/drm/i915/i915_irq.c | 45 +++++++++++------- > 4 files changed, 98 insertions(+), 45 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index 2dc05c3..175e524 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -3006,9 +3006,8 @@ i915_wedged_set(void *data, u64 val) > { > struct drm_device *dev = data; > > - DRM_INFO("Manually setting wedged to %llu\n", val); > - i915_handle_error(dev, val); > - > + i915_handle_error(dev, val, > + "Manually setting wedged to %llu", val); > return 0; > } > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 2a61a29..ad41c71 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -299,6 +299,8 @@ struct drm_i915_error_state { > struct kref ref; > struct timeval time; > > + char error_msg[128]; > + > /* Generic register state */ > u32 eir; > u32 pgtbl_er; > @@ -1994,7 +1996,9 @@ extern void intel_console_resume(struct work_struct *work); > > /* i915_irq.c */ > void i915_queue_hangcheck(struct drm_device *dev); > -void i915_handle_error(struct drm_device *dev, bool wedged); > +__printf(3, 4) > +void i915_handle_error(struct drm_device *dev, bool wedged, > + const char *fmt, ...); > > void gen6_set_pm_mask(struct drm_i915_private *dev_priv, u32 pm_iir, > int new_delay); > @@ -2464,7 +2468,8 @@ static inline void i915_error_state_buf_release( > { > kfree(eb->buf); > } > -void i915_capture_error_state(struct drm_device *dev); > +void i915_capture_error_state(struct drm_device *dev, bool wedge, > + const char *error_msg); > void i915_error_state_get(struct drm_device *dev, > struct i915_error_state_file_priv *error_priv); > void i915_error_state_put(struct i915_error_state_file_priv *error_priv); > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index a90971a..dce569b 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -329,6 +329,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, > goto out; > } > > + err_printf(m, "%s\n", error->error_msg); > err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec, > error->time.tv_usec); > err_printf(m, "Kernel: " UTS_RELEASE "\n"); > @@ -686,7 +687,8 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer *err, > * It's only a small step better than a random number in its current form. > */ > static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv, > - struct drm_i915_error_state *error) > + struct drm_i915_error_state *error, > + int *ring_id) > { > uint32_t error_code = 0; > int i; > @@ -696,9 +698,14 @@ static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv, > * synchronization commands which almost always appear in the case > * strictly a client bug. Use instdone to differentiate those some. > */ > - for (i = 0; i < I915_NUM_RINGS; i++) > - if (error->ring[i].hangcheck_action == HANGCHECK_HUNG) > + for (i = 0; i < I915_NUM_RINGS; i++) { > + if (error->ring[i].hangcheck_action == HANGCHECK_HUNG) { > + if (ring_id) > + *ring_id = i; > + > return error->ring[i].ipehr ^ error->ring[i].instdone; > + } > + } > > return error_code; > } > @@ -1075,6 +1082,39 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv, > i915_get_extra_instdone(dev, error->extra_instdone); > } > > +static void i915_error_generate_msg(struct drm_device *dev, > + struct drm_i915_error_state *error, > + bool wedge, > + const char *error_msg) > +{ > + struct drm_i915_private *dev_priv = dev->dev_private; > + u32 ecode; > + int ring_id = -1, len; > + > + ecode = i915_error_generate_code(dev_priv, error, &ring_id); > + > + len = scnprintf(error->error_msg, sizeof(error->error_msg), > + "GPU HANG: ecode %d:0x%08x", ring_id, ecode); > + > + if (ring_id != -1 && error->ring[ring_id].pid != -1) > + len += scnprintf(error->error_msg + len, > + sizeof(error->error_msg) - len, > + ", in %s [%d]", > + error->ring[ring_id].comm, > + error->ring[ring_id].pid); > + > + if (error_msg) I'd drop this and just always have an error_msg. But whatever you want is fine. > + len += scnprintf(error->error_msg + len, > + sizeof(error->error_msg) - len, > + ", reason: %s", > + error_msg); > + > + if (wedge) > + len += scnprintf(error->error_msg + len, > + sizeof(error->error_msg) - len, > + ", resetting GPU"); I also don't think the fact that we're resetting the GPU belongs here (for instance, if it's disabled via modparam). On the other hand, I'd be in favor of a taint on GPU reset for all future error states. > +} > + > /** > * i915_capture_error_state - capture an error record for later analysis > * @dev: drm device > @@ -1084,19 +1124,13 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv, > * out a structure which becomes available in debugfs for user level tools > * to pick up. > */ > -void i915_capture_error_state(struct drm_device *dev) > +void i915_capture_error_state(struct drm_device *dev, bool wedged, > + const char *error_msg) > { > static bool warned; > struct drm_i915_private *dev_priv = dev->dev_private; > struct drm_i915_error_state *error; > unsigned long flags; > - uint32_t ecode; > - > - spin_lock_irqsave(&dev_priv->gpu_error.lock, flags); > - error = dev_priv->gpu_error.first_error; > - spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags); > - if (error) > - return; This seems like it should be a separate patch, or else I'm not clear why you've introduced this change in this series. It doesn't seem to be related to what the rest of the patch does. > > /* Account for pipe specific data like PIPE*STAT */ > error = kzalloc(sizeof(*error), GFP_ATOMIC); > @@ -1105,30 +1139,21 @@ void i915_capture_error_state(struct drm_device *dev) > return; > } > > - DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", > - dev->primary->index); > kref_init(&error->ref); > > i915_capture_reg_state(dev_priv, error); > i915_gem_capture_buffers(dev_priv, error); > i915_gem_record_fences(dev, error); > i915_gem_record_rings(dev, error); > - ecode = i915_error_generate_code(dev_priv, error); > - > - if (!warned) { > - DRM_INFO("GPU HANG [%x]\n", ecode); > - DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); > - DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); > - DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); > - DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n"); > - warned = true; > - } > > do_gettimeofday(&error->time); > > error->overlay = intel_overlay_capture_error_state(dev); > error->display = intel_display_capture_error_state(dev); > > + i915_error_generate_msg(dev, error, wedged, error_msg); > + DRM_INFO("%s\n", error->error_msg); > + > spin_lock_irqsave(&dev_priv->gpu_error.lock, flags); > if (dev_priv->gpu_error.first_error == NULL) { > dev_priv->gpu_error.first_error = error; > @@ -1136,8 +1161,19 @@ void i915_capture_error_state(struct drm_device *dev) > } > spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags); > > - if (error) > + if (error) { > i915_error_state_free(&error->ref); > + return; > + } Same comment as above. > + > + if (!warned) { > + DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); > + DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); > + DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); > + DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n"); > + DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", dev->primary->index); > + warned = true; > + } > } > > void i915_error_state_get(struct drm_device *dev, > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index d4defd8..36bba16 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -1223,8 +1223,7 @@ static void snb_gt_irq_handler(struct drm_device *dev, > if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT | > GT_BSD_CS_ERROR_INTERRUPT | > GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) { > - DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir); > - i915_handle_error(dev, false); > + i915_handle_error(dev, false, "GT error interrupt 0x%08x", gt_iir); > } > > if (gt_iir & GT_PARITY_ERROR(dev)) > @@ -1471,8 +1470,9 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir) > notify_ring(dev_priv->dev, &dev_priv->ring[VECS]); > > if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) { > - DRM_ERROR("VEBOX CS error interrupt 0x%08x\n", pm_iir); > - i915_handle_error(dev_priv->dev, false); > + i915_handle_error(dev_priv->dev, false, > + "VEBOX CS error interrupt 0x%08x", > + pm_iir); > } > } > } > @@ -2174,11 +2174,18 @@ static void i915_report_and_clear_eir(struct drm_device *dev) > * so userspace knows something bad happened (should trigger collection > * of a ring dump etc.). > */ > -void i915_handle_error(struct drm_device *dev, bool wedged) > +void i915_handle_error(struct drm_device *dev, bool wedged, > + const char *fmt, ...) > { > struct drm_i915_private *dev_priv = dev->dev_private; > + va_list args; > + char error_msg[80]; > > - i915_capture_error_state(dev); > + va_start(args, fmt); > + vscnprintf(error_msg, sizeof(error_msg), fmt, args); > + va_end(args); > + > + i915_capture_error_state(dev, wedged, error_msg); As I said above regarding always having an error_msg, I'd be in favor of moving most of this to capture error state, and just passing along the string - but whatever you want is fine with me. > i915_report_and_clear_eir(dev); > > if (wedged) { > @@ -2481,9 +2488,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd) > */ > tmp = I915_READ_CTL(ring); > if (tmp & RING_WAIT) { > - DRM_ERROR("Kicking stuck wait on %s\n", > - ring->name); > - i915_handle_error(dev, false); > + i915_handle_error(dev, false, > + "Kicking stuck wait on %s", > + ring->name); > I915_WRITE_CTL(ring, tmp); > return HANGCHECK_KICK; > } > @@ -2493,9 +2500,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd) > default: > return HANGCHECK_HUNG; > case 1: > - DRM_ERROR("Kicking stuck semaphore on %s\n", > - ring->name); > - i915_handle_error(dev, false); > + i915_handle_error(dev, false, > + "Kicking stuck semaphore on %s", > + ring->name); > I915_WRITE_CTL(ring, tmp); > return HANGCHECK_KICK; > case 0: > @@ -2617,7 +2624,7 @@ static void i915_hangcheck_elapsed(unsigned long data) > } > > if (rings_hung) > - return i915_handle_error(dev, true); > + return i915_handle_error(dev, true, "Ring hung"); > > if (busy_count) > /* Reset timer case chip hangs without another request > @@ -3234,7 +3241,9 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg) > */ > spin_lock_irqsave(&dev_priv->irq_lock, irqflags); > if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT) > - i915_handle_error(dev, false); > + i915_handle_error(dev, false, > + "Command parser error, iir 0x%08x", > + iir); > > for_each_pipe(pipe) { > int reg = PIPESTAT(pipe); > @@ -3416,7 +3425,9 @@ static irqreturn_t i915_irq_handler(int irq, void *arg) > */ > spin_lock_irqsave(&dev_priv->irq_lock, irqflags); > if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT) > - i915_handle_error(dev, false); > + i915_handle_error(dev, false, > + "Command parser error, iir 0x%08x", > + iir); > > for_each_pipe(pipe) { > int reg = PIPESTAT(pipe); > @@ -3653,7 +3664,9 @@ static irqreturn_t i965_irq_handler(int irq, void *arg) > */ > spin_lock_irqsave(&dev_priv->irq_lock, irqflags); > if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT) > - i915_handle_error(dev, false); > + i915_handle_error(dev, false, > + "Command parser error, iir 0x%08x", > + iir); > Since you're not directly using any of the DRM_ printers, we lose the file/line. Would you care to add __FILE__/__LINE? I think it would be helpful - not sure what others thing. > for_each_pipe(pipe) { > int reg = PIPESTAT(pipe); > -- > 1.7.9.5 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > http://lists.freedesktop.org/mailman/listinfo/intel-gfx -- Ben Widawsky, Intel Open Source Technology Center _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx