Re: [PATCH] drm/i915: Decouple GPU error reporting from ring initialisation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Jan 27, 2014 at 01:52:34PM +0000, Chris Wilson wrote:
> Currently we report through our error state only the rings that have
> been initialised (as detected by ring->obj). This check is done after
> the GPU reset and ring re-initialisation, which means that the software
> state may not be the same as when we captured the hardware error and we
> may not print out any of the vital information for debugging the hang.
> 
> This (and the implied object leak) is a regression from
> 
> commit 3d57e5bd1284f44e325f3a52d966259ed42f9e05
> Author: Ben Widawsky <ben@xxxxxxxxxxxx>
> Date:   Mon Oct 14 10:01:36 2013 -0700
> 
>     drm/i915: Do a fuller init after reset
> 
> Note that we are already starting to get bug reports with incomplete
> error states from 3.13.
> 
> v2: Prevent a NULL dereference on 830gm/845g after a GPU reset where
>     the scratch obj may be NULL.
> 
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Ben Widawsky <ben@xxxxxxxxxxxx>
> Cc: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=74094
> Cc: stable@xxxxxxxxxxxxxxx

Looks OK to me.

Reviewed-by: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx>

> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  1 +
>  drivers/gpu/drm/i915/i915_gpu_error.c | 22 +++++++++++++++-------
>  2 files changed, 16 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 2e6c67d944eb..0249c9aa345a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -335,6 +335,7 @@ struct drm_i915_error_state {
>  	struct timeval time;
>  
>  	struct drm_i915_error_ring {
> +		bool valid;
>  		struct drm_i915_error_object {
>  			int page_count;
>  			u32 gtt_offset;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 6832473bc386..96e945c3d44f 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -240,6 +240,9 @@ static void i915_ring_error_state(struct drm_i915_error_state_buf *m,
>  				  unsigned ring)
>  {
>  	BUG_ON(ring >= I915_NUM_RINGS); /* shut up confused gcc */
> +	if (!error->ring[ring].valid)
> +		return;
> +
>  	err_printf(m, "%s command stream:\n", ring_str(ring));
>  	err_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
>  	err_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
> @@ -295,7 +298,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  	struct drm_device *dev = error_priv->dev;
>  	drm_i915_private_t *dev_priv = dev->dev_private;
>  	struct drm_i915_error_state *error = error_priv->error;
> -	struct intel_ring_buffer *ring;
>  	int i, j, page, offset, elt;
>  
>  	if (!error) {
> @@ -330,7 +332,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  	if (INTEL_INFO(dev)->gen == 7)
>  		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
>  
> -	for_each_ring(ring, dev_priv, i)
> +	for (i = 0; i < ARRAY_SIZE(error->ring); i++)
>  		i915_ring_error_state(m, dev, error, i);
>  
>  	for (i = 0; i < error->vm_count; i++) {
> @@ -405,8 +407,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  			}
>  		}
>  
> -		obj = error->ring[i].ctx;
> -		if (obj) {
> +		if ((obj = error->ring[i].ctx)) {
>  			err_printf(m, "%s --- HW Context = 0x%08x\n",
>  				   dev_priv->ring[i].name,
>  				   obj->gtt_offset);
> @@ -730,7 +731,8 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>  			return NULL;
>  
>  		obj = ring->scratch.obj;
> -		if (acthd >= i915_gem_obj_ggtt_offset(obj) &&
> +		if (obj != NULL &&
> +		    acthd >= i915_gem_obj_ggtt_offset(obj) &&
>  		    acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
>  			return i915_error_ggtt_object_create(dev_priv, obj);
>  	}
> @@ -875,11 +877,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
>  				  struct drm_i915_error_state *error)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> -	struct intel_ring_buffer *ring;
>  	struct drm_i915_gem_request *request;
>  	int i, count;
>  
> -	for_each_ring(ring, dev_priv, i) {
> +	for (i = 0; i < I915_NUM_RINGS; i++) {
> +		struct intel_ring_buffer *ring = &dev_priv->ring[i];
> +
> +		if (ring->dev == NULL)
> +			continue;
> +
> +		error->ring[i].valid = true;
> +
>  		i915_record_ring_state(dev, error, ring);
>  
>  		error->ring[i].batchbuffer =
> -- 
> 1.8.5.3

-- 
Ville Syrjälä
Intel OTC
--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux SCSI]