Re: [CI 2/2] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Aug 12, 2016 at 12:39:59PM +0100, Chris Wilson wrote:
> This patch provides the infrastructure for performing a 16-byte aligned
> read from WC memory using non-temporal instructions introduced with sse4.1.
> Using movntdqa we can bypass the CPU caches and read directly from memory
> and ignoring the page attributes set on the CPU PTE i.e. negating the
> impact of an otherwise UC access. Copying using movntdqa from WC is almost
> as fast as reading from WB memory, modulo the possibility of both hitting
> the CPU cache or leaving the data in the CPU cache for the next consumer.
> (The CPU cache itself my be flushed for the region of the movntdqa and on
> later access the movntdqa reads from a separate internal buffer for the
> cacheline.) The write back to the memory is however cached.
> 
> This will be used in later patches to accelerate accessing WC memory.
> 
> v2: Report whether the accelerated copy is successful/possible.
> v3: Function alignment override was only necessary when using the
> function target("sse4.1") - which is not necessary for emitting movntdqa
> from __asm__.
> v4: Improve notes on CPU cache behaviour vs non-temporal stores.
> v5: Fix byte offsets for unrolled moves.
> v6: Find all remaining typos of "movntqda", use kernel_fpu_begin.
> 
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Akash Goel <akash.goel@xxxxxxxxx>
> Cc: Damien Lespiau <damien.lespiau@xxxxxxxxx>
> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/Makefile      |   3 ++
>  drivers/gpu/drm/i915/i915_drv.c    |   2 +
>  drivers/gpu/drm/i915/i915_drv.h    |   3 ++
>  drivers/gpu/drm/i915/i915_memcpy.c | 101 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 109 insertions(+)
>  create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index dda724f04445..3412413408c0 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -3,12 +3,15 @@
>  # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
>  
>  subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror
> +subdir-ccflags-y += \
> +	$(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA)
>  
>  # Please keep these build lists sorted!
>  
>  # core driver code
>  i915-y := i915_drv.o \
>  	  i915_irq.o \
> +	  i915_memcpy.o \
>  	  i915_params.o \
>  	  i915_pci.o \
>            i915_suspend.o \
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index d82f96b2a47e..13ae340ef1f3 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -827,6 +827,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
>  	mutex_init(&dev_priv->wm.wm_mutex);
>  	mutex_init(&dev_priv->pps_mutex);
>  
> +	i915_memcpy_init_early(dev_priv);
> +
>  	ret = i915_workqueues_init(dev_priv);
>  	if (ret < 0)
>  		return ret;
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 654aabe76efc..bf193ba1574e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3907,6 +3907,9 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
>  	return false;
>  }
>  
> +void i915_memcpy_init_early(struct drm_i915_private *dev_priv);
> +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> +
>  #define ptr_unpack_bits(ptr, bits) ({					\
>  	unsigned long __v = (unsigned long)(ptr);			\
>  	(bits) = __v & ~PAGE_MASK;					\
> diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
> new file mode 100644
> index 000000000000..6f1df0ec8a81
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_memcpy.c
> @@ -0,0 +1,101 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include <linux/kernel.h>
> +#include <asm/fpu/api.h>
> +
> +#include "i915_drv.h"
> +
> +DEFINE_STATIC_KEY_FALSE(has_movntdqa);
> +
> +#ifdef CONFIG_AS_MOVNTDQA
> +static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
> +{
> +	kernel_fpu_begin();
> +
> +	len >>= 4;
> +	while (len >= 4) {
> +		asm("movntdqa   (%0), %%xmm0\n"
> +		    "movntdqa 16(%0), %%xmm1\n"
> +		    "movntdqa 32(%0), %%xmm2\n"
> +		    "movntdqa 48(%0), %%xmm3\n"
> +		    "movaps %%xmm0,   (%1)\n"
> +		    "movaps %%xmm1, 16(%1)\n"
> +		    "movaps %%xmm2, 32(%1)\n"
> +		    "movaps %%xmm3, 48(%1)\n"

Not using sse2 movntdq for the store? No benefit or?

> +		    :: "r" (src), "r" (dst) : "memory");
> +		src += 64;
> +		dst += 64;
> +		len -= 4;
> +	}
> +	while (len--) {
> +		asm("movntdqa (%0), %%xmm0\n"
> +		    "movaps %%xmm0, (%1)\n"
> +		    :: "r" (src), "r" (dst) : "memory");
> +		src += 16;
> +		dst += 16;
> +	}
> +
> +	kernel_fpu_end();
> +}
> +#endif
> +
> +/**
> + * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
> + * @dst: destination pointer
> + * @src: source pointer
> + * @len: how many bytes to copy
> + *
> + * i915_memcpy_from_wc copies @len bytes from @src to @dst using
> + * non-temporal instructions where available. Note that all arguments
> + * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
> + * of 16.
> + *
> + * To test whether accelerated reads from WC are supported, use
> + * i915_memcpy_from_wc(NULL, NULL, 0);
> + *
> + * Returns true if the copy was successful, false if the preconditions
> + * are not met.
> + */
> +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> +	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
> +		return false;
> +
> +#ifdef CONFIG_AS_MOVNTDQA
> +	if (static_branch_likely(&has_movntdqa)) {
> +		if (likely(len))
> +			__memcpy_ntdqa(dst, src, len);
> +		return true;
> +	}
> +#endif
> +
> +	return false;
> +}
> +
> +void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
> +{
> +	if (static_cpu_has(X86_FEATURE_XMM4_1))
> +		static_branch_enable(&has_movntdqa);
> +}
> -- 
> 2.8.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux