[PATCH] drm/i915: Batch copy_from_user for relocation processing

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sat, Mar 24, 2012 at 12:20:24AM +0000, Chris Wilson wrote:
> Originally the code tried to allocate a large enough array to perform
> the copy using vmalloc, performance wasn't great and throughput was
> improved by processing each individual relocation entry separately.
> This too is not as efficient as one would desire. A compromise is then
> to allocate a single page (since that is relatively cheap) and process
> the relocations in page size batches.
> 
> x11perf -copywinwin10:	n450/pnv	i3-330m		i5-2520m (cpu)
>                Before: 	  249000	 785000		 1280000 (80%)
>                 After:	  264000	 896000		 1280000 (65%)
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   92 +++++++++++++++++++++++-----
>  1 files changed, 77 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 2d5d41b..1da04db 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -407,28 +407,90 @@ i915_gem_execbuffer_relocate_object(struct drm_i915_gem_object *obj,
>  {
>  	struct drm_i915_gem_relocation_entry __user *user_relocs;
>  	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -	int i, ret;
> +	int i, ret = 0;
> +
> +	if (entry->relocation_count == 0)
> +		return 0;
> +
> +#define N_RELOC_PER_PAGE \
> +	(PAGE_SIZE / sizeof(struct drm_i915_gem_relocation_entry))
>  
>  	user_relocs = (void __user *)(uintptr_t)entry->relocs_ptr;
> -	for (i = 0; i < entry->relocation_count; i++) {
> -		struct drm_i915_gem_relocation_entry reloc;
>  
> -		if (__copy_from_user_inatomic(&reloc,
> -					      user_relocs+i,
> -					      sizeof(reloc)))
> -			return -EFAULT;
> +	if (entry->relocation_count < N_RELOC_PER_PAGE / 2) {
> +fallback:

Why don't we go slightly more overboard with optimizing this?
drm_i915_gem_relocation_entry is only 32 bytes, so we can fit a few onto the
stack. Hence what about this before the loop?

drm_i915_gem_relocation_entry stack_arr[512/sizeof(reloc_entry)];

if (relocation_count > N_RELOC_PER_PAGE) {
	tmp_stor = __get_free_page;
	if (!tmp_stor)
		goto fallback;
	tmp_stor_size = N_RELOC_PER_PAGE;
} else {
fallback:
	tmp_stor = stack_arr;
	tmp_stor_size = 512/sizeof(reloc_entry);
}

Then we also wouldn't have the not-so-beautiful duplication in code. Maybe
the added complexity of the page_alloc path isn't even worth it anymore.
Can I bother you to play around with this?

Cheers, Daniel

> +		for (i = 0; i < entry->relocation_count; i++) {
> +			struct drm_i915_gem_relocation_entry reloc;
> +			u64 offset;
>  
> -		ret = i915_gem_execbuffer_relocate_entry(obj, eb, &reloc);
> -		if (ret)
> -			return ret;
> +			if (__copy_from_user_inatomic(&reloc,
> +						      user_relocs+i,
> +						      sizeof(reloc)))
> +				return -EFAULT;
>  
> -		if (__copy_to_user_inatomic(&user_relocs[i].presumed_offset,
> -					    &reloc.presumed_offset,
> -					    sizeof(reloc.presumed_offset)))
> -			return -EFAULT;
> +			offset = reloc.presumed_offset;
> +
> +			ret = i915_gem_execbuffer_relocate_entry(obj, eb, &reloc);
> +			if (ret)
> +				return ret;
> +
> +			if (reloc.presumed_offset != offset &&
> +			    __copy_to_user_inatomic(&user_relocs[i].presumed_offset,
> +						    &reloc.presumed_offset,
> +						    sizeof(reloc.presumed_offset)))
> +				return -EFAULT;
> +		}
> +	} else {
> +		unsigned long page;
> +		int remain;
> +
> +		page = __get_free_page(GFP_TEMPORARY);
> +		if (unlikely(page == 0))
> +			goto fallback;
> +
> +		i = 0;
> +		remain = entry->relocation_count;
> +		do {
> +			struct drm_i915_gem_relocation_entry *r =
> +				(struct drm_i915_gem_relocation_entry *)page;
> +			int count = remain;
> +			if (count > N_RELOC_PER_PAGE)
> +				count = N_RELOC_PER_PAGE;
> +			remain -= count;
> +
> +			if (__copy_from_user_inatomic(r, user_relocs+i,
> +						      count*sizeof(r[0]))) {
> +				ret = -EFAULT;
> +				goto err;
> +			}
> +
> +			do {
> +				u64 offset = r->presumed_offset;
> +
> +				ret = i915_gem_execbuffer_relocate_entry(obj, eb, r);
> +				if (ret)
> +					goto err;
> +
> +				if (r->presumed_offset != offset &&
> +				    __copy_to_user_inatomic(&user_relocs[i].presumed_offset,
> +							    &r->presumed_offset,
> +							    sizeof(r->presumed_offset))) {
> +					ret = -EFAULT;
> +					goto err;
> +				}
> +
> +				i++;
> +				r++;
> +			} while (--count);
> +		} while (remain);
> +
> +		ret = 0;
> +err:
> +		free_page(page);
>  	}
>  
> -	return 0;
> +	return ret;
> +#undef N_RELOC_PER_PAGE
>  }
>  
>  static int
> -- 
> 1.7.9.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Mail: daniel at ffwll.ch
Mobile: +41 (0)79 365 57 48


[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux