[PATCH 02/11] drm/i915: Avoid forcing relocations through the mappable GTT or CPU

daniel at ffwll.ch (Daniel Vetter) · Wed, 16 Jan 2013 17:07:24 +0100

On Tue, Jan 08, 2013 at 10:53:10AM +0000, Chris Wilson wrote:
> If the object lies outside of the mappable GTT aperture, do not force it
> through the CPU domain for relocations, but simply flush the writes as
> we perform them and then queue a chipset flush.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

A few bikesheds and comments on this one here. First the general ones:
- How much does this hurt when we're unlucky and a batch ends up in
  unmappable? Since we do unconditional clflushes and don't batch them up,
  I except copywin to go down the toilet a bit ...
- Do we need mitigation ala pwrite, where we try to move objects into
  mappable if it's possible without stalls?
- Performance data is missing ;-)

> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   87 ++++++++++++++++------------
>  1 file changed, 51 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 163bb52..daa5409 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -33,6 +33,9 @@
>  #include "intel_drv.h"
>  #include <linux/dma_remapping.h>
>  
> +#define __EXEC_OBJECT_HAS_PIN (1<<31)
> +#define __EXEC_OBJECT_HAS_FENCE (1<<30)
> +
>  struct eb_objects {
>  	int and;
>  	struct hlist_head buckets[0];
> @@ -95,10 +98,16 @@ eb_destroy(struct eb_objects *eb)
>  static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>  {
>  	return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
> -		!obj->map_and_fenceable ||
>  		obj->cache_level != I915_CACHE_NONE);
>  }
>  
> +static inline struct page *
> +gtt_offset_to_page(struct drm_i915_gem_object *obj, u32 offset)
> +{
> +	offset -= obj->gtt_space->start;
> +	return i915_gem_object_get_page(obj, offset >> PAGE_SHIFT);
> +}
> +
>  static int
>  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  				   struct eb_objects *eb,
> @@ -182,22 +191,20 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  		return -EFAULT;
>  
>  	reloc->delta += target_offset;
> +	reloc->offset += obj->gtt_offset;

Imo this adjustment here leads to messier code - we only really want the
gtt offset for the gtt relocs. With this we then have to substract the gtt
offset again in gtt_offset_to_page, which will be even more fun if we ever
get real ppgtt. I think a gtt_reloc_offset temp var instead and leaving
reloc->offset as would look better ...

>  	if (use_cpu_reloc(obj)) {
> -		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
>  		char *vaddr;
>  
> -		ret = i915_gem_object_set_to_cpu_domain(obj, 1);
> +		ret = i915_gem_object_set_to_cpu_domain(obj, true);
>  		if (ret)
>  			return ret;
>  
> -		vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> -							     reloc->offset >> PAGE_SHIFT));
> -		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> +		vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +		*(uint32_t *)(vaddr + offset_in_page(reloc->offset)) = reloc->delta;
>  		kunmap_atomic(vaddr);
>  	} else {
>  		struct drm_i915_private *dev_priv = dev->dev_private;
> -		uint32_t __iomem *reloc_entry;
> -		void __iomem *reloc_page;
> +		unsigned page_offset;
>  
>  		ret = i915_gem_object_set_to_gtt_domain(obj, true);
>  		if (ret)
> @@ -208,13 +215,28 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  			return ret;
>  
>  		/* Map the page containing the relocation we're going to perform.  */
> -		reloc->offset += obj->gtt_offset;
> -		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> -						      reloc->offset & PAGE_MASK);
> -		reloc_entry = (uint32_t __iomem *)
> -			(reloc_page + (reloc->offset & ~PAGE_MASK));
> -		iowrite32(reloc->delta, reloc_entry);
> -		io_mapping_unmap_atomic(reloc_page);
> +		page_offset = offset_in_page(reloc->offset);
> +
> +		if (reloc->offset < dev_priv->mm.gtt_mappable_end) {
> +			void __iomem *reloc_page;
> +
> +			reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> +							      reloc->offset & PAGE_MASK);
> +			iowrite32(reloc->delta, reloc_page + page_offset);
> +			io_mapping_unmap_atomic(reloc_page);

WARN_ON(!obj->has_global_gtt_mapping); for paranoia, see the reasoning for
it below ...

> +		} else {
> +			char *vaddr;
> +
> +			vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +
> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> +			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> +
> +			kunmap_atomic(vaddr);
> +
> +			obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
> +		}
>  	}
>  
>  	/* and update the user's relocation entry */
> @@ -312,16 +334,6 @@ i915_gem_execbuffer_relocate(struct drm_device *dev,
>  	return ret;
>  }
>  
> -#define  __EXEC_OBJECT_HAS_PIN (1<<31)
> -#define  __EXEC_OBJECT_HAS_FENCE (1<<30)
> -
> -static int
> -need_reloc_mappable(struct drm_i915_gem_object *obj)
> -{
> -	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -	return entry->relocation_count && !use_cpu_reloc(obj);

We take the && !use_cpu_reloc to make sure that when we need do gtt
relocs, the global gtt mapping is in place. Luckily we don't have any
platfroms currently where this matters, but if e.g. vlv enables ppgtt, we
have one. So we need to handle this somehow (or risk the warth of Jesse
when he tries to figure out why ppgtt is broken exactly on vlv).

> -}
> -
>  static int
>  i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  				   struct intel_ring_buffer *ring)
> @@ -329,16 +341,15 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
>  	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
>  	bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4;
> -	bool need_fence, need_mappable;
> +	bool need_fence;
>  	int ret;
>  
>  	need_fence =
>  		has_fenced_gpu_access &&
>  		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  		obj->tiling_mode != I915_TILING_NONE;
> -	need_mappable = need_fence || need_reloc_mappable(obj);
>  
> -	ret = i915_gem_object_pin(obj, entry->alignment, need_mappable, false);
> +	ret = i915_gem_object_pin(obj, entry->alignment, need_fence, false);
>  	if (ret)
>  		return ret;
>  
> @@ -401,7 +412,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  	INIT_LIST_HEAD(&ordered_objects);
>  	while (!list_empty(objects)) {
>  		struct drm_i915_gem_exec_object2 *entry;
> -		bool need_fence, need_mappable;
> +		bool need_fence;
>  
>  		obj = list_first_entry(objects,
>  				       struct drm_i915_gem_object,
> @@ -412,9 +423,8 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  			has_fenced_gpu_access &&
>  			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  			obj->tiling_mode != I915_TILING_NONE;
> -		need_mappable = need_fence || need_reloc_mappable(obj);
>  
> -		if (need_mappable)
> +		if (need_fence)
>  			list_move(&obj->exec_list, &ordered_objects);
>  		else
>  			list_move_tail(&obj->exec_list, &ordered_objects);
> @@ -444,7 +454,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  		/* Unbind any ill-fitting objects or pin. */
>  		list_for_each_entry(obj, objects, exec_list) {
>  			struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -			bool need_fence, need_mappable;
> +			bool need_fence;
>  
>  			if (!obj->gtt_space)
>  				continue;
> @@ -453,10 +463,9 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  				has_fenced_gpu_access &&
>  				entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  				obj->tiling_mode != I915_TILING_NONE;
> -			need_mappable = need_fence || need_reloc_mappable(obj);
>  
>  			if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
> -			    (need_mappable && !obj->map_and_fenceable))
> +			    (need_fence && !obj->map_and_fenceable))
>  				ret = i915_gem_object_unbind(obj);
>  			else
>  				ret = i915_gem_execbuffer_reserve_object(obj, ring);
> @@ -603,10 +612,16 @@ i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
>  		if (ret)
>  			return ret;
>  
> +		flush_domains |= obj->base.write_domain;
> +
>  		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
>  			i915_gem_clflush_object(obj);
>  
> -		flush_domains |= obj->base.write_domain;

Can we just drop this?

> +		/* Used as an internal marker during relocation processing */
> +		if (obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS) {
> +			flush_domains |= obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS;
> +			obj->base.pending_write_domain &= I915_GEM_GPU_DOMAINS;
> +		}

We /should/ filter out !GPU_DOMAINS already with return -EINVAL, so why
the changes here? Or can we just drop them?

>  	}
>  
>  	if (flush_domains & I915_GEM_DOMAIN_CPU)
> @@ -923,7 +938,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	}
>  
>  	/* Set the pending read domains for the batch buffer to COMMAND */
> -	if (batch_obj->base.pending_write_domain) {
> +	if (batch_obj->base.pending_write_domain & I915_GEM_GPU_DOMAINS) {

dito.

Cheers, Daniel

>  		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
>  		ret = -EINVAL;
>  		goto err;
> -- 
> 1.7.10.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch