Re: [PATCH] drm/i915: Write RING_TAIL once per-request

Ben Widawsky <ben@xxxxxxxxxxxx> · Mon, 26 Aug 2013 13:42:12 -0700

On Sat, Aug 10, 2013 at 10:16:32PM +0100, Chris Wilson wrote:
> Ignoring the legacy DRI1 code, and a couple of special cases (to be
> discussed later), all access to the ring is mediated through requests.
> The first write to a ring will grab a seqno and mark the ring as having
> an outstanding_lazy_request. Either through explicitly adding a request
> after an execbuffer or through an implicit wait (either by the CPU or by
> a semaphore), that sequence of writes will be terminated with a request.
> So we can ellide all the intervening writes to the tail register and
> send the entire command stream to the GPU at once. This will reduce the
> number of *serialising* writes to the tail register by a factor or 3-5
> times (depending upon architecture and number of workarounds, context
> switches, etc involved). This becomes even more noticeable when the
> register write is overloaded with a number of debugging tools. The
> astute reader will wonder if it is then possible to overflow the ring
> with a single command. It is not. When we start a command sequence to
> the ring, we check for available space and issue a wait in case we have
> not. The ring wait will in this case be forced to flush the outstanding
> register write and then poll the ACTHD for sufficient space to continue.
> 
> The exception to the rule where everything is inside a request are a few
> initialisation cases where we may want to write GPU commands via the CS
> before userspace wakes up and page flips.
> 

I'm not a huge fan of having the second intel_ring_advance() that does something
other than it sounds like. I'd *much* prefer to not intel_ring_advance()
if you are certain more emits will be coming like in the case you
mention above. We can add a paranoia check whenever we're about to
return to userspace that tail == RING_TAIL

Also, without performance data, it's hard to say this indirection is
worth it.

> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_dma.c         |  2 +-
>  drivers/gpu/drm/i915/i915_gem_exec.c    |  2 +-
>  drivers/gpu/drm/i915/intel_display.c    | 10 +++++-----
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 30 ++++++++++++++++--------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  7 ++++++-
>  5 files changed, 29 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index de0b86a..a7a0eb86 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -52,7 +52,7 @@
>  	intel_ring_emit(LP_RING(dev_priv), x)
>  
>  #define ADVANCE_LP_RING() \
> -	intel_ring_advance(LP_RING(dev_priv))
> +	__intel_ring_advance(LP_RING(dev_priv))
>  
>  /**
>   * Lock test for when it's just for synchronization of ring access.
> diff --git a/drivers/gpu/drm/i915/i915_gem_exec.c b/drivers/gpu/drm/i915/i915_gem_exec.c
> index a2c6dbf..4da3704 100644
> --- a/drivers/gpu/drm/i915/i915_gem_exec.c
> +++ b/drivers/gpu/drm/i915/i915_gem_exec.c
> @@ -111,7 +111,7 @@ int i915_gem_exec_clear_object(struct drm_i915_gem_object *obj)
>  	intel_ring_emit(ring, 0);
>  	intel_ring_emit(ring, MI_NOOP);
>  
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  	i915_gem_exec_dirty_object(obj, ring);
>  
>  unpin:
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index b26b50b..6b7ce06 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -7547,7 +7547,7 @@ static int intel_gen2_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, 0); /* aux display base address, unused */
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  	return 0;
>  
>  err_unpin:
> @@ -7588,7 +7588,7 @@ static int intel_gen3_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, MI_NOOP);
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  	return 0;
>  
>  err_unpin:
> @@ -7636,7 +7636,7 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, pf | pipesrc);
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  	return 0;
>  
>  err_unpin:
> @@ -7680,7 +7680,7 @@ static int intel_gen6_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, pf | pipesrc);
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  	return 0;
>  
>  err_unpin:
> @@ -7736,7 +7736,7 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, (MI_NOOP));
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  	return 0;
>  
>  err_unpin:
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 7de95da..8d5dd65 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -51,6 +51,16 @@ static inline int ring_space(struct intel_ring_buffer *ring)
>  	return space;
>  }
>  
> +void __intel_ring_advance(struct intel_ring_buffer *ring)
> +{
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +
> +	ring->tail &= ring->size - 1;
> +	if (dev_priv->gpu_error.stop_rings & intel_ring_flag(ring))
> +		return;
> +	ring->write_tail(ring, ring->tail);
> +}
> +
>  static int
>  gen2_render_ring_flush(struct intel_ring_buffer *ring,
>  		       u32	invalidate_domains,
> @@ -673,7 +683,7 @@ gen6_add_request(struct intel_ring_buffer *ring)
>  	intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
>  	intel_ring_emit(ring, ring->outstanding_lazy_request);
>  	intel_ring_emit(ring, MI_USER_INTERRUPT);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  
>  	return 0;
>  }
> @@ -787,7 +797,7 @@ pc_render_add_request(struct intel_ring_buffer *ring)
>  	intel_ring_emit(ring, pc->gtt_offset | PIPE_CONTROL_GLOBAL_GTT);
>  	intel_ring_emit(ring, ring->outstanding_lazy_request);
>  	intel_ring_emit(ring, 0);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  
>  	return 0;
>  }
> @@ -1016,7 +1026,7 @@ i9xx_add_request(struct intel_ring_buffer *ring)
>  	intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
>  	intel_ring_emit(ring, ring->outstanding_lazy_request);
>  	intel_ring_emit(ring, MI_USER_INTERRUPT);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  
>  	return 0;
>  }
> @@ -1472,6 +1482,9 @@ static int ring_wait_for_space(struct intel_ring_buffer *ring, int n)
>  	if (ret != -ENOSPC)
>  		return ret;
>  
> +	/* force the tail write in case we have been skipping them */
> +	__intel_ring_advance(ring);
> +
>  	trace_i915_ring_wait_begin(ring);
>  	/* With GEM the hangcheck timer should kick us out of the loop,
>  	 * leaving it early runs the risk of corrupting GEM state (due
> @@ -1614,17 +1627,6 @@ void intel_ring_init_seqno(struct intel_ring_buffer *ring, u32 seqno)
>  	ring->hangcheck.seqno = seqno;
>  }
>  
> -void intel_ring_advance(struct intel_ring_buffer *ring)
> -{
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> -
> -	ring->tail &= ring->size - 1;
> -	if (dev_priv->gpu_error.stop_rings & intel_ring_flag(ring))
> -		return;
> -	ring->write_tail(ring, ring->tail);
> -}
> -
> -
>  static void gen6_bsd_ring_write_tail(struct intel_ring_buffer *ring,
>  				     u32 value)
>  {
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 6e38256..d40eff8 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -232,7 +232,12 @@ static inline void intel_ring_emit(struct intel_ring_buffer *ring,
>  	iowrite32(data, ring->virtual_start + ring->tail);
>  	ring->tail += 4;
>  }
> -void intel_ring_advance(struct intel_ring_buffer *ring);
> +static inline void intel_ring_advance(struct intel_ring_buffer *ring)
> +{
> +	ring->tail &= ring->size - 1;
> +}
> +void __intel_ring_advance(struct intel_ring_buffer *ring);
> +
>  int __must_check intel_ring_idle(struct intel_ring_buffer *ring);
>  void intel_ring_init_seqno(struct intel_ring_buffer *ring, u32 seqno);
>  int intel_ring_flush_all_caches(struct intel_ring_buffer *ring);

Functions you missed which I think should have an actual tail write:
ironlake_enable_rc6
intel_overlay_* (not sure about these, just a guess)

Also, this makes a nice opportunity to not actually switch back on
add_request fail in do_switch()

In any case it's (begrudgingly):
Reviewed-by: Ben Widawsky <ben@xxxxxxxxxxxx>

-- 
Ben Widawsky, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx