On Thu, 2016-06-09 at 12:29 +0100, Chris Wilson wrote:
> When doing relocations, we have to obtain a mapping to the page
> containing the target address. This is either a kmap or iomap depending
> on GPU and its cache coherency. Neighbouring relocation entries are
> typically within the same page and so we can cache our kmapping between
> them and avoid those pesky TLB flushes.
> 
> Note that there is some sleight-of-hand in how the slow relocate works
> as the reloc_entry_cache implies pagefaults disabled (as we are inside a
> kmap_atomic section). However, the slow relocate code is meant to be the
> fallback from the atomic fast path failing. Fortunately it works as we
> already have performed the copy_from_user for the relocation array (no
> more pagefaults there) and the kmap_atomic cache is enabled after we
> have waited upon an active buffer (so no more sleeping in atomic).
> Magic!
> 

You could also mention that you mangle the relocation <-> page logic,
so this is not purely about caching. Or maybe even split it.

With the below comments addressed;

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx>

> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 160 +++++++++++++++++++----------
>  1 file changed, 106 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index a29c4b6fea28..318c71b663f4 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -302,9 +302,50 @@ relocation_target(struct drm_i915_gem_relocation_entry *reloc,
>  	return gen8_canonical_addr((int)reloc->delta + target_offset);
>  }
> 
> +struct reloc_cache {
> +	void *vaddr;
> +	unsigned page;
> +	enum { KMAP, IOMAP } type;
> +};
> +
> +static void reloc_cache_init(struct reloc_cache *cache)
> +{
> +	cache->page = -1;
> +	cache->vaddr = NULL;
> +}
> +
> +static void reloc_cache_fini(struct reloc_cache *cache)
> +{
> +	if (cache->vaddr == NULL)
> +		return;
> +
> +	switch (cache->type) {
> +	case KMAP: kunmap_atomic(cache->vaddr); break;
> +	case IOMAP: io_mapping_unmap_atomic(cache->vaddr); break;
> +	}
> +}
> +
> +static void *reloc_kmap(struct drm_i915_gem_object *obj,
> +			struct reloc_cache *cache,
> +			int page)
> +{
> +	if (cache->page == page)
> +		return cache->vaddr;
> +
> +	if (cache->vaddr)
> +		kunmap_atomic(cache->vaddr);

Maybe add a GEM_BUG_ON(cache->type != KMAP) here before running
kunmap_atomic? Because that assumption is made.
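Just to illustrate what I mean (a sketch only, not part of the patch;
the body below is the one you already have, with the assert as the only
addition):

static void *reloc_kmap(struct drm_i915_gem_object *obj,
			struct reloc_cache *cache,
			int page)
{
	if (cache->page == page)
		return cache->vaddr;

	if (cache->vaddr) {
		/* Tearing down the cached mapping assumes it was a kmap */
		GEM_BUG_ON(cache->type != KMAP);
		kunmap_atomic(cache->vaddr);
	}

	cache->page = page;
	cache->vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page));
	cache->type = KMAP;

	return cache->vaddr;
}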
> +
> +	cache->page = page;
> +	cache->vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page));
> +	cache->type = KMAP;
> +
> +	return cache->vaddr;
> +}
> +
>  static int
>  relocate_entry_cpu(struct drm_i915_gem_object *obj,
>  		   struct drm_i915_gem_relocation_entry *reloc,
> +		   struct reloc_cache *cache,
>  		   uint64_t target_offset)
>  {
>  	struct drm_device *dev = obj->base.dev;
> @@ -317,34 +358,47 @@ relocate_entry_cpu(struct drm_i915_gem_object *obj,
>  	if (ret)
>  		return ret;
> 
> -	vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj,
> -				reloc->offset >> PAGE_SHIFT));
> +	vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT);
>  	*(uint32_t *)(vaddr + page_offset) = lower_32_bits(delta);
> 
> -	if (INTEL_INFO(dev)->gen >= 8) {
> -		page_offset = offset_in_page(page_offset + sizeof(uint32_t));
> -
> -		if (page_offset == 0) {
> -			kunmap_atomic(vaddr);
> -			vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj,
> -				(reloc->offset + sizeof(uint32_t)) >> PAGE_SHIFT));
> +	if (INTEL_GEN(dev) >= 8) {
> +		page_offset += sizeof(uint32_t);
> +		if (page_offset == PAGE_SIZE) {
> +			vaddr = reloc_kmap(obj, cache, cache->page + 1);
> +			page_offset = 0;
>  		}
> -
>  		*(uint32_t *)(vaddr + page_offset) = upper_32_bits(delta);
>  	}
> 
> -	kunmap_atomic(vaddr);
> -
>  	return 0;
>  }
> 
> +static void *reloc_iomap(struct drm_i915_private *i915,
> +			 struct reloc_cache *cache,
> +			 uint64_t offset)
> +{
> +	if (cache->page == offset >> PAGE_SHIFT)
> +		return cache->vaddr;
> +
> +	if (cache->vaddr)
> +		io_mapping_unmap_atomic(cache->vaddr);
> +
> +	cache->page = offset >> PAGE_SHIFT;
> +	cache->vaddr =
> +		io_mapping_map_atomic_wc(i915->ggtt.mappable,
> +					 offset & PAGE_MASK);
> +	cache->type = IOMAP;
> +
> +	return cache->vaddr;
> +}
> +
>  static int
>  relocate_entry_gtt(struct drm_i915_gem_object *obj,
>  		   struct drm_i915_gem_relocation_entry *reloc,
> +		   struct reloc_cache *cache,
>  		   uint64_t target_offset)
>  {
>  	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
> -	struct i915_ggtt *ggtt = &dev_priv->ggtt;
>  	struct i915_vma *vma;
>  	uint64_t delta = relocation_target(reloc, target_offset);
>  	uint64_t offset;
> @@ -366,28 +420,19 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
>  	/* Map the page containing the relocation we're going to perform. */
>  	offset = vma->node.start;
>  	offset += reloc->offset;
> -	reloc_page = io_mapping_map_atomic_wc(ggtt->mappable,
> -					      offset & PAGE_MASK);
> +	reloc_page = reloc_iomap(dev_priv, cache, offset);
>  	iowrite32(lower_32_bits(delta), reloc_page + offset_in_page(offset));
> 
>  	if (INTEL_GEN(dev_priv) >= 8) {
>  		offset += sizeof(uint32_t);
> -
> -		if (offset_in_page(offset) == 0) {
> -			io_mapping_unmap_atomic(reloc_page);
> -			reloc_page =
> -				io_mapping_map_atomic_wc(ggtt->mappable,
> -							 offset);
> -		}
> -
> +		if (offset_in_page(offset) == 0)
> +			reloc_page = reloc_iomap(dev_priv, cache, offset);
>  		iowrite32(upper_32_bits(delta),
>  			  reloc_page + offset_in_page(offset));
>  	}
> 
> -	io_mapping_unmap_atomic(reloc_page);
> -
>  unpin:
> -	i915_vma_unpin(vma);
> +	__i915_vma_unpin(vma);
>  	return ret;
>  }
> 
> @@ -403,6 +448,7 @@ clflush_write32(void *addr, uint32_t value)
>  static int
>  relocate_entry_clflush(struct drm_i915_gem_object *obj,
>  		       struct drm_i915_gem_relocation_entry *reloc,
> +		       struct reloc_cache *cache,

For consistency with the rest of the patch, I would put the cache as the
last argument, as it could easily be removed again in the future, so it's
the least important one.
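I.e. something along these lines (sketch only), with relocate_entry_cpu()
and relocate_entry_gtt() reordered the same way and the call sites in
i915_gem_execbuffer_relocate_entry() updated to match:

static int
relocate_entry_clflush(struct drm_i915_gem_object *obj,
		       struct drm_i915_gem_relocation_entry *reloc,
		       uint64_t target_offset,
		       struct reloc_cache *cache)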
>  		       uint64_t target_offset)
>  {
>  	struct drm_device *dev = obj->base.dev;
> @@ -415,24 +461,18 @@ relocate_entry_clflush(struct drm_i915_gem_object *obj,
>  	if (ret)
>  		return ret;
> 
> -	vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj,
> -				reloc->offset >> PAGE_SHIFT));
> +	vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT);
>  	clflush_write32(vaddr + page_offset, lower_32_bits(delta));
> 
> -	if (INTEL_INFO(dev)->gen >= 8) {
> -		page_offset = offset_in_page(page_offset + sizeof(uint32_t));
> -
> -		if (page_offset == 0) {
> -			kunmap_atomic(vaddr);
> -			vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj,
> -				(reloc->offset + sizeof(uint32_t)) >> PAGE_SHIFT));
> +	if (INTEL_GEN(dev) >= 8) {
> +		page_offset += sizeof(uint32_t);
> +		if (page_offset == PAGE_SIZE) {
> +			vaddr = reloc_kmap(obj, cache, cache->page + 1);
> +			page_offset = 0;
>  		}
> -
>  		clflush_write32(vaddr + page_offset, upper_32_bits(delta));
>  	}
> 
> -	kunmap_atomic(vaddr);
> -
>  	return 0;
>  }
> 
> @@ -453,7 +493,8 @@ static bool object_is_idle(struct drm_i915_gem_object *obj)
>  static int
>  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  				   struct eb_vmas *eb,
> -				   struct drm_i915_gem_relocation_entry *reloc)
> +				   struct drm_i915_gem_relocation_entry *reloc,
> +				   struct reloc_cache *cache)
>  {
>  	struct drm_device *dev = obj->base.dev;
>  	struct drm_gem_object *target_obj;
> @@ -537,11 +578,11 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  		return -EFAULT;
> 
>  	if (use_cpu_reloc(obj))
> -		ret = relocate_entry_cpu(obj, reloc, target_offset);
> +		ret = relocate_entry_cpu(obj, reloc, cache, target_offset);
>  	else if (obj->map_and_fenceable)
> -		ret = relocate_entry_gtt(obj, reloc, target_offset);
> +		ret = relocate_entry_gtt(obj, reloc, cache, target_offset);
>  	else if (static_cpu_has(X86_FEATURE_CLFLUSH))
> -		ret = relocate_entry_clflush(obj, reloc, target_offset);
> +		ret = relocate_entry_clflush(obj, reloc, cache, target_offset);
>  	else {
>  		WARN_ONCE(1, "Impossible case in relocation handling\n");
>  		ret = -ENODEV;
> @@ -564,9 +605,11 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
>  	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
>  	struct drm_i915_gem_relocation_entry __user *user_relocs;
>  	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -	int remain, ret;
> +	struct reloc_cache cache;
> +	int remain, ret = 0;
> 
>  	user_relocs = u64_to_user_ptr(entry->relocs_ptr);
> +	reloc_cache_init(&cache);
> 
>  	remain = entry->relocation_count;
>  	while (remain) {
> @@ -576,19 +619,23 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
>  		count = ARRAY_SIZE(stack_reloc);
>  		remain -= count;
> 
> -		if (__copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0])))
> -			return -EFAULT;
> +		if (__copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]))) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> 
>  		do {
>  			u64 offset = r->presumed_offset;
> 
> -			ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r);
> +			ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r, &cache);
>  			if (ret)
> -				return ret;
> +				goto out;
> 
>  			if (r->presumed_offset != offset &&
> -			    __put_user(r->presumed_offset, &user_relocs->presumed_offset)) {
> -				return -EFAULT;
> +			    __put_user(r->presumed_offset,
> +				       &user_relocs->presumed_offset)) {
> +				ret = -EFAULT;
> +				goto out;
>  			}
> 
>  			user_relocs++;
> @@ -596,7 +643,9 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
>  		} while (--count);
>  	}
> 
> -	return 0;
> +out:
> +	reloc_cache_fini(&cache);

Add an empty line (<NL>) here, to keep this consistent with your next
change.
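I.e. (sketch, a blank line between the cleanup and the return):

out:
	reloc_cache_fini(&cache);

	return ret;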
Regards, Joonas

> +	return ret;
>  #undef N_RELOC
>  }
> 
> @@ -606,15 +655,18 @@ i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
>  				      struct drm_i915_gem_relocation_entry *relocs)
>  {
>  	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -	int i, ret;
> +	struct reloc_cache cache;
> +	int i, ret = 0;
> 
> +	reloc_cache_init(&cache);
>  	for (i = 0; i < entry->relocation_count; i++) {
> -		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i]);
> +		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], &cache);
>  		if (ret)
> -			return ret;
> +			break;
>  	}
> +	reloc_cache_fini(&cache);
> 
> -	return 0;
> +	return ret;
>  }
> 
>  static int

-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx