On 18/07/16 16:06, Tvrtko Ursulin wrote:
On 18/07/16 14:46, Tvrtko Ursulin wrote:
[snip]
This version generates the smallest code:
/*
 * Copy @len 16-byte units from @src to @dst using movntdqa loads into
 * xmm0-xmm3 followed by movaps stores.
 *
 * @dst: destination buffer
 * @src: source buffer
 * @len: number of struct qw2 elements to copy
 *
 * Both movntdqa and movaps fault on addresses that are not 16-byte aligned,
 * so @src and @dst must be 16-byte aligned.
 *
 * The 16/32/48 byte displacements and the "+= 4" pointer steps assume
 * sizeof(struct qw2) == 16 -- TODO confirm against the type's definition.
 *
 * Each load/store group is a single asm statement. The xmm registers are
 * named directly in the template, so the compiler cannot see the dependency
 * between a load and the matching store; were they separate asm statements
 * the compiler would be free to reorder them. An asm with no output
 * operands is implicitly volatile, and the "memory" clobber tells the
 * compiler that memory is read and written behind its back.
 *
 * NOTE(review): xmm0-xmm3 are not listed as clobbers; this relies on the
 * compiler not allocating xmm registers in this code -- confirm for the
 * build flags in use.
 */
static void __memcpy_ntdqa(struct qw2 *dst, const struct qw2 *src, unsigned long len)
{
	unsigned long l4;

	kernel_fpu_begin();

	/* Bulk part: four units (64 bytes) per iteration. */
	for (l4 = len / 4; l4; l4--) {
		asm("movntdqa   (%0), %%xmm0\n\t"
		    "movntdqa 16(%0), %%xmm1\n\t"
		    "movntdqa 32(%0), %%xmm2\n\t"
		    "movntdqa 48(%0), %%xmm3\n\t"
		    "movaps %%xmm0,   (%1)\n\t"
		    "movaps %%xmm1, 16(%1)\n\t"
		    "movaps %%xmm2, 32(%1)\n\t"
		    "movaps %%xmm3, 48(%1)\n\t"
		    :: "r" (src), "r" (dst) : "memory");
		src += 4;
		dst += 4;
	}

	/* Tail: the remaining 0-3 units, one at a time. */
	for (len %= 4; len; len--) {
		asm("movntdqa (%0), %%xmm0\n\t"
		    "movaps %%xmm0, (%1)\n\t"
		    :: "r" (src), "r" (dst) : "memory");
		src++;
		dst++;
	}

	kernel_fpu_end();
}
Although I still haven't figured out a way to convince it to use
the same registers for src and dest between the two loops.
This reminded me of a famous interview question along the lines of "what
does the code below do?". Translated to this example:
/*
 * Copy @len units from @src to @dst, one movntdqa load plus one movaps store
 * per unit, with the loop unrolled four times in the style of Duff's device:
 * the switch jumps into the middle of the do/while body so that the first
 * pass handles the len % 4 leftover units and every later pass handles four.
 *
 * @dst: destination buffer
 * @src: source buffer
 * @len: number of struct qw2 elements to copy
 *
 * Each unit is moved through an xmm register, i.e. 16 bytes at a time; the
 * per-unit pointer increments assume sizeof(struct qw2) == 16 -- TODO confirm.
 */
static void __memcpy_ntdqa(struct qw2 *dst, const struct qw2 *src, unsigned long len)
{
	unsigned long n;

	/*
	 * Nothing to copy. Without this guard n would be 0 and len % 4 == 0
	 * would enter at case 0: four units would be copied and --n would
	 * wrap around to ULONG_MAX, running far past both buffers.
	 */
	if (len == 0)
		return;

	kernel_fpu_begin();

	/* Number of passes through the do/while body, rounding up. */
	n = (len + 3) / 4;

	switch (len % 4) {
	case 0:
		do {
			asm("movntdqa %1, %%xmm0\n"
			    "movaps %%xmm0, %0\n" : "=m" (*dst): "m" (*src));
			src++; dst++;
			/* fallthrough */
	case 3:
			asm("movntdqa %1, %%xmm1\n"
			    "movaps %%xmm1, %0\n" : "=m" (*dst): "m" (*src));
			src++; dst++;
			/* fallthrough */
	case 2:
			asm("movntdqa %1, %%xmm2\n"
			    "movaps %%xmm2, %0\n" : "=m" (*dst): "m" (*src));
			src++; dst++;
			/* fallthrough */
	case 1:
			asm("movntdqa %1, %%xmm3\n"
			    "movaps %%xmm3, %0\n" : "=m" (*dst): "m" (*src));
			src++; dst++;
		} while (--n > 0);
	}

	kernel_fpu_end();
}
:D
No idea if loads/stores can run async in this case.
Regards,
Tvrtko
Here's yet another variant, just to document other ways of writing it:
#include "asm/fpu/api.h"
/* 16-byte vector type, the width of one xmm register */
typedef double xmmd_t __attribute__ ((vector_size (16)));

/*
 * Copy @len bytes from @src to @dst in 16-byte chunks: each chunk is loaded
 * with movntdqa into a compiler-allocated SSE register ("x" constraint) and
 * written back with movaps.
 *
 * @dst: destination buffer
 * @src: source buffer
 * @len: length in bytes; only whole 16-byte chunks are copied, so any
 *       trailing len % 16 bytes are left untouched
 *
 * Because the temporaries are ordinary C variables bound to asm operands,
 * the compiler sees the dependency between each load and its store.
 */
__attribute__((target("sse4.1")))
void __memcpy_ntdqa(xmmd_t *dst, const xmmd_t *src, unsigned long len)
{
	xmmd_t a, b, c, d;
	unsigned long blocks = len / 64;	/* whole 64-byte blocks */
	unsigned long tail = (len & 63) >> 4;	/* leftover 16-byte chunks, 0-3 */

	kernel_fpu_begin();

	/* Four loads, then four stores, per 64-byte block */
	while (blocks--) {
		asm("movntdqa %1, %0" : "=x" (a) : "m" (src[0]));
		asm("movntdqa %1, %0" : "=x" (b) : "m" (src[1]));
		asm("movntdqa %1, %0" : "=x" (c) : "m" (src[2]));
		asm("movntdqa %1, %0" : "=x" (d) : "m" (src[3]));
		asm("movaps %1, %0" : "=m" (dst[0]) : "x" (a));
		asm("movaps %1, %0" : "=m" (dst[1]) : "x" (b));
		asm("movaps %1, %0" : "=m" (dst[2]) : "x" (c));
		asm("movaps %1, %0" : "=m" (dst[3]) : "x" (d));
		src += 4;
		dst += 4;
	}

	/* One load/store pair per remaining chunk */
	while (tail--) {
		asm("movntdqa %1, %0" : "=x" (a) : "m" (*src));
		asm("movaps %1, %0" : "=m" (*dst) : "x" (a));
		src++;
		dst++;
	}

	kernel_fpu_end();
}
I wondered whether we could get GCC to unroll the loops automatically,
i.e. just write a single loop and say we want it unrolled four times,
leaving the compiler to deal with the remainder; but I didn't find a way
to specify "unroll exactly 4 times" as opposed to just "unroll this loop by some amount".
.Dave.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx