On Fri, Aug 12, 2016 at 12:39:59PM +0100, Chris Wilson wrote: > This patch provides the infrastructure for performing a 16-byte aligned > read from WC memory using non-temporal instructions introduced with sse4.1. > Using movntdqa we can bypass the CPU caches and read directly from memory > and ignore the page attributes set on the CPU PTE i.e. negating the > impact of an otherwise UC access. Copying using movntdqa from WC is almost > as fast as reading from WB memory, modulo the possibility of both hitting > the CPU cache or leaving the data in the CPU cache for the next consumer. > (The CPU cache itself may be flushed for the region of the movntdqa and on > later access the movntdqa reads from a separate internal buffer for the > cacheline.) The write back to the memory is however cached. > > This will be used in later patches to accelerate accessing WC memory. > > v2: Report whether the accelerated copy is successful/possible. > v3: Function alignment override was only necessary when using the > function target("sse4.1") - which is not necessary for emitting movntdqa > from __asm__. > v4: Improve notes on CPU cache behaviour vs non-temporal stores. > v5: Fix byte offsets for unrolled moves. > v6: Find all remaining typos of "movntqda", use kernel_fpu_begin. 
> > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Akash Goel <akash.goel@xxxxxxxxx> > Cc: Damien Lespiau <damien.lespiau@xxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> > Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > --- > drivers/gpu/drm/i915/Makefile | 3 ++ > drivers/gpu/drm/i915/i915_drv.c | 2 + > drivers/gpu/drm/i915/i915_drv.h | 3 ++ > drivers/gpu/drm/i915/i915_memcpy.c | 101 +++++++++++++++++++++++++++++++++++++ > 4 files changed, 109 insertions(+) > create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c > > diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile > index dda724f04445..3412413408c0 100644 > --- a/drivers/gpu/drm/i915/Makefile > +++ b/drivers/gpu/drm/i915/Makefile > @@ -3,12 +3,15 @@ > # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher. > > subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror > +subdir-ccflags-y += \ > + $(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA) > > # Please keep these build lists sorted! 
> > # core driver code > i915-y := i915_drv.o \ > i915_irq.o \ > + i915_memcpy.o \ > i915_params.o \ > i915_pci.o \ > i915_suspend.o \ > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index d82f96b2a47e..13ae340ef1f3 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -827,6 +827,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv, > mutex_init(&dev_priv->wm.wm_mutex); > mutex_init(&dev_priv->pps_mutex); > > + i915_memcpy_init_early(dev_priv); > + > ret = i915_workqueues_init(dev_priv); > if (ret < 0) > return ret; > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 654aabe76efc..bf193ba1574e 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -3907,6 +3907,9 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req) > return false; > } > > +void i915_memcpy_init_early(struct drm_i915_private *dev_priv); > +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len); > + > #define ptr_unpack_bits(ptr, bits) ({ \ > unsigned long __v = (unsigned long)(ptr); \ > (bits) = __v & ~PAGE_MASK; \ > diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c > new file mode 100644 > index 000000000000..6f1df0ec8a81 > --- /dev/null > +++ b/drivers/gpu/drm/i915/i915_memcpy.c > @@ -0,0 +1,101 @@ > +/* > + * Copyright © 2016 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice 
and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS > + * IN THE SOFTWARE. > + * > + */ > + > +#include <linux/kernel.h> > +#include <asm/fpu/api.h> > + > +#include "i915_drv.h" > + > +DEFINE_STATIC_KEY_FALSE(has_movntdqa); > + > +#ifdef CONFIG_AS_MOVNTDQA > +static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len) > +{ > + kernel_fpu_begin(); > + > + len >>= 4; > + while (len >= 4) { > + asm("movntdqa (%0), %%xmm0\n" > + "movntdqa 16(%0), %%xmm1\n" > + "movntdqa 32(%0), %%xmm2\n" > + "movntdqa 48(%0), %%xmm3\n" > + "movaps %%xmm0, (%1)\n" > + "movaps %%xmm1, 16(%1)\n" > + "movaps %%xmm2, 32(%1)\n" > + "movaps %%xmm3, 48(%1)\n" Why not use the SSE2 movntdq for the store? Is there no benefit? > + :: "r" (src), "r" (dst) : "memory"); > + src += 64; > + dst += 64; > + len -= 4; > + } > + while (len--) { > + asm("movntdqa (%0), %%xmm0\n" > + "movaps %%xmm0, (%1)\n" > + :: "r" (src), "r" (dst) : "memory"); > + src += 16; > + dst += 16; > + } > + > + kernel_fpu_end(); > +} > +#endif > + > +/** > + * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC > + * @dst: destination pointer > + * @src: source pointer > + * @len: how many bytes to copy > + * > + * i915_memcpy_from_wc copies @len bytes from @src to @dst using > + * non-temporal instructions where available. Note that all arguments > + * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple > + * of 16. 
> + * > + * To test whether accelerated reads from WC are supported, use > + * i915_memcpy_from_wc(NULL, NULL, 0); > + * > + * Returns true if the copy was successful, false if the preconditions > + * are not met. > + */ > +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len) > +{ > + if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15)) > + return false; > + > +#ifdef CONFIG_AS_MOVNTDQA > + if (static_branch_likely(&has_movntdqa)) { > + if (likely(len)) > + __memcpy_ntdqa(dst, src, len); > + return true; > + } > +#endif > + > + return false; > +} > + > +void i915_memcpy_init_early(struct drm_i915_private *dev_priv) > +{ > + if (static_cpu_has(X86_FEATURE_XMM4_1)) > + static_branch_enable(&has_movntdqa); > +} > -- > 2.8.1 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/intel-gfx -- Ville Syrjälä Intel OTC _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx