Re: [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches

Ben Widawsky <ben@xxxxxxxxxxxx> · Tue, 11 Feb 2014 17:57:19 -0800

On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> From: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> 
> In the past, it was possible to have multiple batches per request due to
> a stray signal or ENOMEM. As a result we had to scan each active object
> (filtered by those having the COMMAND domain) for the one that contained
> the ACTHD pointer. This was then made more complicated by the
> introduction of ppgtt, whereby ACTHD then pointed into the address space
> of the context and so also needed to be taken into account.
> 
> This is a fairly robust approach (though the implementation is a little
> fragile and depends upon the per-generation setup, registers and
> parameters). However, due to the requirements for hangstats, we needed a
> robust method for associating batches with a particular request and
> having that we can rely upon it for finding the associated batch object
> for error capture.
> 
> If the batch buffer tracking is not robust enough, that should become
> apparent quite quickly through an erroneous error capture. That should
> also help to make sure that the runtime reporting to userspace is
> robust. It also means that we then report the oldest incomplete batch on
> each ring, which can be useful for determining the state of userspace at
> the time of a hang.
> 
> v2: Use i915_gem_find_active_request (Mika)
> 
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> (v1)
> Signed-off-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> (v2)
> Cc: Ben Widawsky <benjamin.widawsky@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       |    3 ++
>  drivers/gpu/drm/i915/i915_gem.c       |   13 +++++--
>  drivers/gpu/drm/i915/i915_gpu_error.c |   66 ++++-----------------------------
>  3 files changed, 20 insertions(+), 62 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b1e91c3..eb260bc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2147,6 +2147,9 @@ i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
>  	}
>  }
>  
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring);
> +
>  bool i915_gem_retire_requests(struct drm_device *dev);
>  void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
>  int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b0a244a..20e55ef 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2298,11 +2298,16 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
>  	kfree(request);
>  }
>  
> -static struct drm_i915_gem_request *
> -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring)
>  {
>  	struct drm_i915_gem_request *request;
> -	const u32 completed_seqno = ring->get_seqno(ring, false);
> +	u32 completed_seqno;
> +
> +	if (WARN_ON(!ring->get_seqno))
> +		return NULL;

ring->get_seqno(ring, false) ?

This looks like it's currently wrong below as well.

> +
> +	completed_seqno = ring->get_seqno(ring, false);
>  
>  	list_for_each_entry(request, &ring->request_list, list) {
>  		if (i915_seqno_passed(completed_seqno, request->seqno))
> @@ -2320,7 +2325,7 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>  	struct drm_i915_gem_request *request;
>  	bool ring_hung;
>  
> -	request = i915_gem_find_first_non_complete(ring);
> +	request = i915_gem_find_active_request(ring);
>  
>  	if (request == NULL)
>  		return;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index dc47bb9..5bd075a 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -713,46 +713,14 @@ static void i915_gem_record_fences(struct drm_device *dev,
>  	}
>  }
>  
> -/* This assumes all batchbuffers are executed from the PPGTT. It might have to
> - * change in the future. */
> -static bool is_active_vm(struct i915_address_space *vm,
> -			 struct intel_ring_buffer *ring)
> -{
> -	struct drm_device *dev = vm->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	struct i915_hw_ppgtt *ppgtt;
> -
> -	if (INTEL_INFO(dev)->gen < 7)
> -		return i915_is_ggtt(vm);
> -
> -	/* FIXME: This ignores that the global gtt vm is also on this list. */
> -	ppgtt = container_of(vm, struct i915_hw_ppgtt, base);
> -
> -	if (INTEL_INFO(dev)->gen >= 8) {
> -		u64 pdp0 = (u64)I915_READ(GEN8_RING_PDP_UDW(ring, 0)) << 32;
> -		pdp0 |=  I915_READ(GEN8_RING_PDP_LDW(ring, 0));
> -		return pdp0 == ppgtt->pd_dma_addr[0];
> -	} else {
> -		u32 pp_db;
> -		pp_db = I915_READ(RING_PP_DIR_BASE(ring));
> -		return (pp_db >> 10) == ppgtt->pd_offset;
> -	}
> -}
> -
>  static struct drm_i915_error_object *
>  i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>  			     struct intel_ring_buffer *ring)
>  {
> -	struct i915_address_space *vm;
> -	struct i915_vma *vma;
> -	struct drm_i915_gem_object *obj;
> -	bool found_active = false;
> -	u32 seqno;
> -
> -	if (!ring->get_seqno)
> -		return NULL;
> +	struct drm_i915_gem_request *request;
>  
>  	if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
> +		struct drm_i915_gem_object *obj;
>  		u32 acthd = I915_READ(ACTHD);
>  
>  		if (WARN_ON(ring->id != RCS))
> @@ -765,32 +733,14 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>  			return i915_error_ggtt_object_create(dev_priv, obj);
>  	}
>  
> -	seqno = ring->get_seqno(ring, false);
> -	list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
> -		if (!is_active_vm(vm, ring))
> -			continue;
> -
> -		found_active = true;
> -
> -		list_for_each_entry(vma, &vm->active_list, mm_list) {
> -			obj = vma->obj;
> -			if (obj->ring != ring)
> -				continue;
> -
> -			if (i915_seqno_passed(seqno, obj->last_read_seqno))
> -				continue;
> -
> -			if ((obj->base.read_domains & I915_GEM_DOMAIN_COMMAND) == 0)
> -				continue;
> -
> -			/* We need to copy these to an anonymous buffer as the simplest
> -			 * method to avoid being overwritten by userspace.
> -			 */
> -			return i915_error_object_create(dev_priv, obj, vm);
> -		}
> +	request = i915_gem_find_active_request(ring);
> +	if (request) {
> +		/* We need to copy these to an anonymous buffer as the simplest
> +		 * method to avoid being overwritten by userspace.
> +		 */
> +		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);

Wrap this one. It's too long even for my undiscerning eye.

>  	}
>  
> -	WARN_ON(!found_active);
>  	return NULL;
>  }
>  

I still would have preferred to keep both methods for a while until we
felt really comfortable with this... the commit message's statement that
getting bad error state is inevitably a good thing is total bunk, I
think.

However, I'll act as a Daniel proxy here who seems in favor of this
path. The code is definitely simpler, I am just a chicken.

With the above fixed:
Reviewed-by: Ben Widawsky <ben@xxxxxxxxxxxx>

-- 
Ben Widawsky, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx