Re: [PATCH 3/3] drm/i915: Record error state capture reason

Ben Widawsky <ben@xxxxxxxxxxxx> · Tue, 11 Feb 2014 18:32:24 -0800

On Mon, Feb 10, 2014 at 04:30:51PM +0200, Mika Kuoppala wrote:
> We capture error state not only when the GPU hangs but
> also on other situations as in interrupt errors and
> in situations where we can kick things forward without GPU reset.
> There will be log entry on most of these cases. But as error state
> capture might be only thing we have, if dmesg was not captured. Or as
> in GEN4 case, interrupt error can trigger error state capture without log
> entry, the exact reason why capture was made is hard to decipher.
> 
> To avoid confusion why the error state was captured, print reason
> along with error code into log and also store it into the error state.
> 

I could have done with a better commit subject and message. We already
capture error state, so "Record error state capture" is a bit weird.
Also, I think the commit message glosses over the main point of why
these cases are special as opposed to hangcheck.

> References: https://bugs.freedesktop.org/show_bug.cgi?id=74193
> Signed-off-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c   |    5 +-
>  drivers/gpu/drm/i915/i915_drv.h       |    9 +++-
>  drivers/gpu/drm/i915/i915_gpu_error.c |   84 +++++++++++++++++++++++----------
>  drivers/gpu/drm/i915/i915_irq.c       |   45 +++++++++++-------
>  4 files changed, 98 insertions(+), 45 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 2dc05c3..175e524 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3006,9 +3006,8 @@ i915_wedged_set(void *data, u64 val)
>  {
>  	struct drm_device *dev = data;
>  
> -	DRM_INFO("Manually setting wedged to %llu\n", val);
> -	i915_handle_error(dev, val);
> -
> +	i915_handle_error(dev, val,
> +			  "Manually setting wedged to %llu", val);
>  	return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 2a61a29..ad41c71 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -299,6 +299,8 @@ struct drm_i915_error_state {
>  	struct kref ref;
>  	struct timeval time;
>  
> +	char error_msg[128];
> +
>  	/* Generic register state */
>  	u32 eir;
>  	u32 pgtbl_er;
> @@ -1994,7 +1996,9 @@ extern void intel_console_resume(struct work_struct *work);
>  
>  /* i915_irq.c */
>  void i915_queue_hangcheck(struct drm_device *dev);
> -void i915_handle_error(struct drm_device *dev, bool wedged);
> +__printf(3, 4)
> +void i915_handle_error(struct drm_device *dev, bool wedged,
> +		       const char *fmt, ...);
>  
>  void gen6_set_pm_mask(struct drm_i915_private *dev_priv, u32 pm_iir,
>  							int new_delay);
> @@ -2464,7 +2468,8 @@ static inline void i915_error_state_buf_release(
>  {
>  	kfree(eb->buf);
>  }
> -void i915_capture_error_state(struct drm_device *dev);
> +void i915_capture_error_state(struct drm_device *dev, bool wedge,
> +			      const char *error_msg);
>  void i915_error_state_get(struct drm_device *dev,
>  			  struct i915_error_state_file_priv *error_priv);
>  void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index a90971a..dce569b 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -329,6 +329,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  		goto out;
>  	}
>  
> +	err_printf(m, "%s\n", error->error_msg);
>  	err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
>  		   error->time.tv_usec);
>  	err_printf(m, "Kernel: " UTS_RELEASE "\n");
> @@ -686,7 +687,8 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer *err,
>   * It's only a small step better than a random number in its current form.
>   */
>  static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
> -					 struct drm_i915_error_state *error)
> +					 struct drm_i915_error_state *error,
> +					 int *ring_id)
>  {
>  	uint32_t error_code = 0;
>  	int i;
> @@ -696,9 +698,14 @@ static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
>  	 * synchronization commands which almost always appear in the case
>  	 * strictly a client bug. Use instdone to differentiate those some.
>  	 */
> -	for (i = 0; i < I915_NUM_RINGS; i++)
> -		if (error->ring[i].hangcheck_action == HANGCHECK_HUNG)
> +	for (i = 0; i < I915_NUM_RINGS; i++) {
> +		if (error->ring[i].hangcheck_action == HANGCHECK_HUNG) {
> +			if (ring_id)
> +				*ring_id = i;
> +
>  			return error->ring[i].ipehr ^ error->ring[i].instdone;
> +		}
> +	}
>  
>  	return error_code;
>  }
> @@ -1075,6 +1082,39 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
>  	i915_get_extra_instdone(dev, error->extra_instdone);
>  }
>  
> +static void i915_error_generate_msg(struct drm_device *dev,
> +				    struct drm_i915_error_state *error,
> +				    bool wedge,
> +				    const char *error_msg)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	u32 ecode;
> +	int ring_id = -1, len;
> +
> +	ecode = i915_error_generate_code(dev_priv, error, &ring_id);
> +
> +	len = scnprintf(error->error_msg, sizeof(error->error_msg),
> +			"GPU HANG: ecode %d:0x%08x", ring_id, ecode);
> +
> +	if (ring_id != -1 && error->ring[ring_id].pid != -1)
> +			len += scnprintf(error->error_msg + len,
> +					 sizeof(error->error_msg) - len,
> +					 ", in %s [%d]",
> +					 error->ring[ring_id].comm,
> +					 error->ring[ring_id].pid);
> +
> +	if (error_msg)

I'd drop this and just always have an error_msg. But whatever you want
is fine.

> +		len += scnprintf(error->error_msg + len,
> +				 sizeof(error->error_msg) - len,
> +				 ", reason: %s",
> +				 error_msg);
> +
> +	if (wedge)
> +		len += scnprintf(error->error_msg + len,
> +				 sizeof(error->error_msg) - len,
> +				 ", resetting GPU");

I also don't think the fact that we're resetting the GPU belongs here
(for instance, if it's disabled via modparam). On the other hand, I'd be
in favor of a taint on GPU reset for all future error states.

> +}
> +
>  /**
>   * i915_capture_error_state - capture an error record for later analysis
>   * @dev: drm device
> @@ -1084,19 +1124,13 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
>   * out a structure which becomes available in debugfs for user level tools
>   * to pick up.
>   */
> -void i915_capture_error_state(struct drm_device *dev)
> +void i915_capture_error_state(struct drm_device *dev, bool wedged,
> +			      const char *error_msg)
>  {
>  	static bool warned;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	struct drm_i915_error_state *error;
>  	unsigned long flags;
> -	uint32_t ecode;
> -
> -	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
> -	error = dev_priv->gpu_error.first_error;
> -	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
> -	if (error)
> -		return;

This seems like it should be a separate patch, or else I'm not clear why
you've introduced this change in this series. It doesn't seem to be
related to what the rest of the patch does.

>  
>  	/* Account for pipe specific data like PIPE*STAT */
>  	error = kzalloc(sizeof(*error), GFP_ATOMIC);
> @@ -1105,30 +1139,21 @@ void i915_capture_error_state(struct drm_device *dev)
>  		return;
>  	}
>  
> -	DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
> -		 dev->primary->index);
>  	kref_init(&error->ref);
>  
>  	i915_capture_reg_state(dev_priv, error);
>  	i915_gem_capture_buffers(dev_priv, error);
>  	i915_gem_record_fences(dev, error);
>  	i915_gem_record_rings(dev, error);
> -	ecode = i915_error_generate_code(dev_priv, error);
> -
> -	if (!warned) {
> -		DRM_INFO("GPU HANG [%x]\n", ecode);
> -		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
> -		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
> -		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
> -		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
> -		warned = true;
> -	}
>  
>  	do_gettimeofday(&error->time);
>  
>  	error->overlay = intel_overlay_capture_error_state(dev);
>  	error->display = intel_display_capture_error_state(dev);
>  
> +	i915_error_generate_msg(dev, error, wedged, error_msg);
> +	DRM_INFO("%s\n", error->error_msg);
> +
>  	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
>  	if (dev_priv->gpu_error.first_error == NULL) {
>  		dev_priv->gpu_error.first_error = error;
> @@ -1136,8 +1161,19 @@ void i915_capture_error_state(struct drm_device *dev)
>  	}
>  	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
>  
> -	if (error)
> +	if (error) {
>  		i915_error_state_free(&error->ref);
> +		return;
> +	}

Same comment as above.

> +
> +	if (!warned) {
> +		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
> +		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
> +		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
> +		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
> +		DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", dev->primary->index);
> +		warned = true;
> +	}
>  }
>  
>  void i915_error_state_get(struct drm_device *dev,
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index d4defd8..36bba16 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1223,8 +1223,7 @@ static void snb_gt_irq_handler(struct drm_device *dev,
>  	if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
>  		      GT_BSD_CS_ERROR_INTERRUPT |
>  		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) {
> -		DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
> -		i915_handle_error(dev, false);
> +		i915_handle_error(dev, false, "GT error interrupt 0x%08x", gt_iir);
>  	}
>  
>  	if (gt_iir & GT_PARITY_ERROR(dev))
> @@ -1471,8 +1470,9 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
>  			notify_ring(dev_priv->dev, &dev_priv->ring[VECS]);
>  
>  		if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) {
> -			DRM_ERROR("VEBOX CS error interrupt 0x%08x\n", pm_iir);
> -			i915_handle_error(dev_priv->dev, false);
> +			i915_handle_error(dev_priv->dev, false,
> +					  "VEBOX CS error interrupt 0x%08x",
> +					  pm_iir);
>  		}
>  	}
>  }
> @@ -2174,11 +2174,18 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
>   * so userspace knows something bad happened (should trigger collection
>   * of a ring dump etc.).
>   */
> -void i915_handle_error(struct drm_device *dev, bool wedged)
> +void i915_handle_error(struct drm_device *dev, bool wedged,
> +		       const char *fmt, ...)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> +	va_list args;
> +	char error_msg[80];
>  
> -	i915_capture_error_state(dev);
> +	va_start(args, fmt);
> +	vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> +	va_end(args);
> +
> +	i915_capture_error_state(dev, wedged, error_msg);

As I said above regarding always having an error_msg, I'd be in favor of
moving most of this to capture error state, and just passing along the
string - but whatever you want is fine with me.

>  	i915_report_and_clear_eir(dev);
>  
>  	if (wedged) {
> @@ -2481,9 +2488,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
>  	 */
>  	tmp = I915_READ_CTL(ring);
>  	if (tmp & RING_WAIT) {
> -		DRM_ERROR("Kicking stuck wait on %s\n",
> -			  ring->name);
> -		i915_handle_error(dev, false);
> +		i915_handle_error(dev, false,
> +				  "Kicking stuck wait on %s",
> +				  ring->name);
>  		I915_WRITE_CTL(ring, tmp);
>  		return HANGCHECK_KICK;
>  	}
> @@ -2493,9 +2500,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
>  		default:
>  			return HANGCHECK_HUNG;
>  		case 1:
> -			DRM_ERROR("Kicking stuck semaphore on %s\n",
> -				  ring->name);
> -			i915_handle_error(dev, false);
> +			i915_handle_error(dev, false,
> +					  "Kicking stuck semaphore on %s",
> +					  ring->name);
>  			I915_WRITE_CTL(ring, tmp);
>  			return HANGCHECK_KICK;
>  		case 0:
> @@ -2617,7 +2624,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  	}
>  
>  	if (rings_hung)
> -		return i915_handle_error(dev, true);
> +		return i915_handle_error(dev, true, "Ring hung");
>  
>  	if (busy_count)
>  		/* Reset timer case chip hangs without another request
> @@ -3234,7 +3241,9 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
>  		 */
>  		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
>  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> -			i915_handle_error(dev, false);
> +			i915_handle_error(dev, false,
> +					  "Command parser error, iir 0x%08x",
> +					  iir);
>  
>  		for_each_pipe(pipe) {
>  			int reg = PIPESTAT(pipe);
> @@ -3416,7 +3425,9 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
>  		 */
>  		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
>  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> -			i915_handle_error(dev, false);
> +			i915_handle_error(dev, false,
> +					  "Command parser error, iir 0x%08x",
> +					  iir);
>  
>  		for_each_pipe(pipe) {
>  			int reg = PIPESTAT(pipe);
> @@ -3653,7 +3664,9 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
>  		 */
>  		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
>  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> -			i915_handle_error(dev, false);
> +			i915_handle_error(dev, false,
> +					  "Command parser error, iir 0x%08x",
> +					  iir);
>  

Since you're not directly using any of the DRM_ printers, we lose the
file/line. Would you care to add __FILE__/__LINE? I think it would be
helpful - not sure what others thing.

>  		for_each_pipe(pipe) {
>  			int reg = PIPESTAT(pipe);
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ben Widawsky, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx