Hello everyone!

Unlike scalar code generation for ppc, which seems to get better with
each gcc version, altivec code generation seems to get progressively
worse. I am not a powerpc guru (yet :P), so I'd like to discuss some
problems I've spotted before adding a new entry to bugzilla. I hope
there is someone who can tell me what else I should check.

NOTE: I've been using gcc versions patched by distributions, but
vanilla gcc 4.1.2 gave the same results as ppc-pld-linux-gcc-4.1.2 and
gcc-4.1, so I believe no breakage was introduced by the patching in
those gcc versions.

$ ppc-pld-linux-gcc-4.1.2 --version
ppc-pld-linux-gcc-4.1.2 (GCC) 4.1.2 (PLD-Linux)
$ ppc-pld-linux-gcc-4.2.0 --version
ppc-pld-linux-gcc-4.2.0 (GCC) 4.2.0 (PLD-Linux)
$ gcc-3.3 --version
gcc-3.3 (GCC) 3.3.6 (Debian 1:3.3.6-15)
$ gcc-4.1 --version
gcc-4.1 (GCC) 4.1.2 20061115 (prerelease) (Debian 4.1.1-21)

PROBLEM 1.

With altivec enabled, gcc moves the stack in every function by an
additional 16 bytes, whether altivec is used or not. An empty function:

$ cat empty.c
void empty(void) { }

compiled with -O2 gives:

empty:
        blr

But with -maltivec it adds a 'stwu' instruction at the beginning and an
'addi' at the end:

empty:
        stwu 1,-16(1)
        addi 1,1,16
        blr

I believe this is totally unnecessary and useless. It happens with all
gcc versions I've checked; in gcc 3.3 it is triggered by -mabi=altivec,
in gcc 4.* just adding -maltivec (or -mcpu=7450 or similar) activates
this "feature". Are those instructions supposed to do anything?
Moreover, those two instructions could easily be simplified to one:

        stw 1,-16(1)

Why aren't they?

PROBLEM 2.

Let's take a very simple memory copying function using altivec:

--BEGIN--
#include <altivec.h>
#define __need_size_t
#include <stddef.h>

void vec_memcpy( void *dest, const void *src, size_t vecs4 )
{
        vector unsigned char *vDest = dest;
        const vector unsigned char *vSrc = src;
        vector unsigned char tmp0, tmp1, tmp2, tmp3;
        int i;

        for ( i = 0; i < vecs4; i++ ) {
                tmp0 = vec_ld(  0, vSrc );
                tmp1 = vec_ld( 16, vSrc );
                tmp2 = vec_ld( 32, vSrc );
                tmp3 = vec_ld( 48, vSrc );
                vSrc += 4;
                vec_st( tmp0,  0, vDest );
                vec_st( tmp1, 16, vDest );
                vec_st( tmp2, 32, vDest );
                vec_st( tmp3, 48, vDest );
                vDest += 4;
        }
}
--END--

With gcc 3.3 it gives very nice code:

$ gcc-3.3 -O2 -maltivec vec_memcpy.c -S

--BEGIN--
vec_memcpy:
        cmpwi 0,5,0
        beqlr- 0
        mtctr 5
        li 8,0
        li 10,16
        li 11,32
        li 9,48
.L9:
        lvx 0,0,4
        lvx 1,10,4
        lvx 13,11,4
        lvx 12,9,4
        stvx 0,8,3
        addi 4,4,64
        stvx 1,10,3
        stvx 13,11,3
        stvx 12,9,3
        addi 3,3,64
        bdnz .L9
        blr
--END--

The only part I don't like about it is the 'li 8,0'. But gcc 4.1.2 and
4.2.0 generate code that calculates every load/store pointer manually
and always uses '0' as the index, so the main loop becomes:

--BEGIN--
.L4:
        addi 9,4,16
        lvx 0,0,4
        lvx 1,0,9
        addi 9,4,32
        lvx 13,0,9
        addi 9,4,48
        lvx 12,0,9
        addi 9,3,16
        stvx 0,0,3
        addi 4,4,64
        stvx 1,0,9
        addi 9,3,32
        stvx 13,0,9
        addi 9,3,48
        stvx 12,0,9
        addi 3,3,64
        bdnz .L4
--END--

and it is slower than the former version. I've tried using int and
register int as the index, but gcc always manages to use '0' as the
second lvx/stvx argument.
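To put a number on "slower", this is the kind of harness I use; a
minimal sketch (the bench.c name, the 1 MiB buffer and the loop count
are arbitrary choices of mine), built once with each compiler together
with vec_memcpy.c:

--BEGIN--
/* bench.c - build together with vec_memcpy.c, e.g.:
 *   gcc-3.3 -O2 -maltivec bench.c vec_memcpy.c -o bench-3.3
 *   gcc-4.1 -O2 -maltivec bench.c vec_memcpy.c -o bench-4.1
 * and compare the reported times. */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

void vec_memcpy( void *dest, const void *src, size_t vecs4 );

int main( void )
{
        const size_t size = 1 << 20;            /* 1 MiB buffers */
        const size_t vecs4 = size / 64;         /* 64 bytes copied per iteration */
        const int loops = 2000;
        void *src, *dest;
        struct timeval t0, t1;
        double elapsed;
        int i, bad;

        /* lvx/stvx ignore the low 4 address bits, so both buffers
         * must be 16-byte aligned */
        if ( posix_memalign( &src, 16, size ) || posix_memalign( &dest, 16, size ) )
                return 1;
        memset( src, 0x5a, size );

        gettimeofday( &t0, NULL );
        for ( i = 0; i < loops; i++ )
                vec_memcpy( dest, src, vecs4 );
        gettimeofday( &t1, NULL );

        elapsed = ( t1.tv_sec - t0.tv_sec ) + ( t1.tv_usec - t0.tv_usec ) / 1e6;
        printf( "%d x %lu bytes: %.3f s (%.1f MB/s)\n",
                loops, (unsigned long) size, elapsed,
                size * loops / elapsed / 1e6 );

        bad = memcmp( src, dest, size ) != 0;   /* sanity check the copy */
        free( src );
        free( dest );
        return bad;
}
--END--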
As a workaround I tried to use inline assembly for the loads and stores:

#define asm_lvx( vec, idx, ptr )        \
        asm ( "lvx %0,%1,%2"            \
                : "=v" (vec)            \
                : "r" (idx), "r" (ptr) )

#define asm_stvx( vec, idx, ptr )       \
        asm ( "stvx %0,%1,%2"           \
                :                       \
                : "v" (vec),            \
                  "r" (idx), "r" (ptr)  \
                : "memory"              \
        )

The whole code became:

--BEGIN--
#include <altivec.h>
#define __need_size_t
#include <stddef.h>

#define asm_lvx( vec, idx, ptr )        \
        asm ( "lvx %0,%1,%2"            \
                : "=v" (vec)            \
                : "r" (idx), "r" (ptr)  \
        )

#define asm_stvx( vec, idx, ptr )       \
        asm ( "stvx %0,%1,%2"           \
                :                       \
                : "v" (vec),            \
                  "r" (idx), "r" (ptr)  \
                : "memory"              \
        )

void vec_memcpy( void *dest, const void *src, size_t vecs4 )
{
        vector unsigned char *vDest = dest;
        const vector unsigned char *vSrc = src;
        vector unsigned char tmp0, tmp1, tmp2, tmp3;
        int i;

        for ( i = 0; i < vecs4; i++ ) {
                asm_lvx( tmp0,  0, vSrc );
                asm_lvx( tmp1, 16, vSrc );
                asm_lvx( tmp2, 32, vSrc );
                asm_lvx( tmp3, 48, vSrc );
                vSrc += 4;
                asm_stvx( tmp0,  0, vDest );
                asm_stvx( tmp1, 16, vDest );
                asm_stvx( tmp2, 32, vDest );
                asm_stvx( tmp3, 48, vDest );
                vDest += 4;
        }
}
--END--

With gcc 4.1.2 the result looks nice:

--BEGIN--
vec_memcpy:
        mr. 0,5
        stwu 1,-16(1)
        mtctr 0
        beq- 0,.L5
        li 10,0
        li 11,16
        li 9,32
        li 0,48
.L4:
#APP
        lvx 0,10,4
        lvx 1,11,4
        lvx 13,9,4
        lvx 12,0,4
#NO_APP
        addi 4,4,64
#APP
        stvx 0,10,3
        stvx 1,11,3
        stvx 13,9,3
        stvx 12,0,3
#NO_APP
        addi 3,3,64
        bdnz .L4
.L5:
        addi 1,1,16
        blr
--END--

But there's a problem: in lvx/stvx a '0' as the second operand acts as
a literal 0, not as a register number, so the 'li 0,48' above produces
wrong code. Shouldn't gcc understand those instructions and avoid r0
here? Anyway, I marked "r0" in the asm as a clobbered register to
prevent gcc from using it and defined additional macros for lvx and
stvx with a '0' index. Now the code works correctly and gives nice
results with gcc 4.1.2 and gcc 3.3, but it doesn't look as nice as
before and takes away gcc's freedom to reorganize the instruction order
(a sketch of an alternative using gcc's "b" constraint follows the
first attached file below). See the included file; the relevant part of
the generated code is:

--BEGIN--
        li 10,16
        li 11,32
        li 9,48
.L4:
#APP
        lvx 0,0,4
        lvx 1,10,4
        lvx 13,11,4
        lvx 12,9,4
#NO_APP
        addi 4,4,64
#APP
        stvx 0,0,3
        stvx 1,10,3
        stvx 13,11,3
        stvx 12,9,3
#NO_APP
        addi 3,3,64
        bdnz .L4
--END--

But there is still a problem with gcc 4.2.0: it loads the constants
into registers inside the loop, not before it. I tried defining the
indexes as:

        register int index1 = 16, index2 = 32, index3 = 48;

and forcing the values to be loaded before the loop:

        asm volatile ( "" : : "r" (index1), "r" (index2), "r" (index3) );

but the result is even worse, the values are loaded twice! Finally the
second included file did the trick (its gcc 4.2.0 output is attached as
well). But with all those asm inlines I would rather rewrite the whole
loop in assembly (a rough sketch of such a loop is appended after the
attached files); at least then I'll be sure no additional magic will be
needed for future gcc versions.

Maybe none of these are bugs and the code generated by gcc 4.2 and
4.1.2 should be as fast as that generated by gcc 3.3. If so, tell me,
why on my machine it isn't?

-- 
Sparky (Przemyslaw Iskra)                   Lang: Pl Ca Es En
WWW:  ppcrcd.pld-linux.org
JID:  sparky<at>jabberes.org
Mail: sparky<at>pld-linux.org
/* First attached file: vec_memcpy using asm macros that clobber "r0",
 * so gcc never picks r0 as the index or base register. */
#include <altivec.h>
#define __need_size_t
#include <stddef.h>

#define asm_lvx( vec, idx, ptr )        \
        asm ( "lvx %0,%1,%2"            \
                : "=v" (vec)            \
                : "r" (idx), "r" (ptr)  \
                : "r0"                  \
        )

#define asm_lvx0( vec, ptr )            \
        asm ( "lvx %0,0,%1"             \
                : "=v" (vec)            \
                : "r" (ptr)             \
                : "r0"                  \
        )

#define asm_stvx( vec, idx, ptr )       \
        asm ( "stvx %0,%1,%2"           \
                :                       \
                : "v" (vec),            \
                  "r" (idx), "r" (ptr)  \
                : "r0", "memory"        \
        )

#define asm_stvx0( vec, ptr )           \
        asm ( "stvx %0,0,%1"            \
                :                       \
                : "v" (vec),            \
                  "r" (ptr)             \
                : "r0", "memory"        \
        )

void vec_memcpy( void *dest, const void *src, size_t vecs4 )
{
        vector unsigned char *vDest = dest;
        const vector unsigned char *vSrc = src;
        vector unsigned char tmp0, tmp1, tmp2, tmp3;
        int i;

        for ( i = 0; i < vecs4; i++ ) {
                asm_lvx0( tmp0, vSrc );
                asm_lvx( tmp1, 16, vSrc );
                asm_lvx( tmp2, 32, vSrc );
                asm_lvx( tmp3, 48, vSrc );
                vSrc += 4;
                asm_stvx0( tmp0, vDest );
                asm_stvx( tmp1, 16, vDest );
                asm_stvx( tmp2, 32, vDest );
                asm_stvx( tmp3, 48, vDest );
                vDest += 4;
        }
}
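The alternative mentioned in the mail body, as a sketch (untested on my
side): gcc's machine constraints for PowerPC include "b", an "address
base register", i.e. any general register except r0. Using "b" instead
of "r" for the index and pointer operands should keep r0 out of the
address fields without clobbering it and without the separate
asm_lvx0/asm_stvx0 variants:

--BEGIN--
/* Sketch: with the "b" constraint gcc cannot substitute r0 for %1 or
 * %2, so no "r0" clobber is needed; a literal 0 index simply becomes
 * 'li rN,0', like the 'li 8,0' in the gcc 3.3 output. */
#define asm_lvx( vec, idx, ptr )        \
        asm ( "lvx %0,%1,%2"            \
                : "=v" (vec)            \
                : "b" (idx), "b" (ptr)  \
        )

#define asm_stvx( vec, idx, ptr )       \
        asm ( "stvx %0,%1,%2"           \
                :                       \
                : "v" (vec),            \
                  "b" (idx), "b" (ptr)  \
                : "memory"              \
        )
--END--

Without the hard "r0" clobber, gcc also keeps its freedom to schedule
the surrounding instructions.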
/* Second attached file: same macros as above, but the index values are
 * forced into registers with explicit 'li' instructions before the
 * loop, so gcc 4.2.0 does not reload them inside it. */
#include <altivec.h>
#define __need_size_t
#include <stddef.h>

#define asm_lvx( vec, idx, ptr )        \
        asm ( "lvx %0,%1,%2"            \
                : "=v" (vec)            \
                : "r" (idx), "r" (ptr)  \
                : "r0"                  \
        )

#define asm_lvx0( vec, ptr )            \
        asm ( "lvx %0,0,%1"             \
                : "=v" (vec)            \
                : "r" (ptr)             \
                : "r0"                  \
        )

#define asm_stvx( vec, idx, ptr )       \
        asm ( "stvx %0,%1,%2"           \
                :                       \
                : "v" (vec),            \
                  "r" (idx), "r" (ptr)  \
                : "r0", "memory"        \
        )

#define asm_stvx0( vec, ptr )           \
        asm ( "stvx %0,0,%1"            \
                :                       \
                : "v" (vec),            \
                  "r" (ptr)             \
                : "r0", "memory"        \
        )

void vec_memcpy( void *dest, const void *src, size_t vecs4 )
{
        register int index1, index2, index3;
        vector unsigned char *vDest = dest;
        const vector unsigned char *vSrc = src;
        vector unsigned char tmp0, tmp1, tmp2, tmp3;

        if ( vecs4 ) {
                asm ( "li %0,16" : "=r" (index1) );
                asm ( "li %0,32" : "=r" (index2) );
                asm ( "li %0,48" : "=r" (index3) );
                do {
                        asm_lvx0( tmp0, vSrc );
                        asm_lvx( tmp1, index1, vSrc );
                        asm_lvx( tmp2, index2, vSrc );
                        asm_lvx( tmp3, index3, vSrc );
                        vSrc += 4;
                        asm_stvx0( tmp0, vDest );
                        asm_stvx( tmp1, index1, vDest );
                        asm_stvx( tmp2, index2, vDest );
                        asm_stvx( tmp3, index3, vDest );
                        vDest += 4;
                } while ( --vecs4 );
        }
}
	.file	"vec_memcpy.c"
	.section	".text"
	.align 2
	.globl vec_memcpy
	.type	vec_memcpy, @function
vec_memcpy:
	mr. 0,5
	stwu 1,-16(1)
	mtctr 0
	beq- 0,.L5
#APP
	li 10,16
	li 11,32
	li 9,48
#NO_APP
.L4:
#APP
	lvx 0,0,4
	lvx 1,10,4
	lvx 13,11,4
	lvx 12,9,4
#NO_APP
	addi 4,4,64
#APP
	stvx 0,0,3
	stvx 1,10,3
	stvx 13,11,3
	stvx 12,9,3
#NO_APP
	addi 3,3,64
	bdnz .L4
.L5:
	addi 1,1,16
	blr
	.size	vec_memcpy, .-vec_memcpy
	.ident	"GCC: (GNU) 4.2.0 (PLD-Linux)"
	.section	.note.GNU-stack,"",@progbits
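And this is roughly what I mean by rewriting the whole loop in
assembly, as referenced in the mail body. A rough, untested sketch on
my part (the operand names, the "b" constraints and the if() guard are
my choices; everything inside the template is plain AltiVec/PowerPC
assembly, built with -O2 -maltivec):

--BEGIN--
#include <altivec.h>
#define __need_size_t
#include <stddef.h>

/* Sketch: the whole 64-bytes-per-iteration copy loop as one asm
 * statement.  gcc only has to pick registers; "b" guarantees none of
 * the address operands ends up in r0. */
void vec_memcpy( void *dest, const void *src, size_t vecs4 )
{
        vector unsigned char t0, t1, t2, t3;
        long o16 = 16, o32 = 32, o48 = 48;

        if ( !vecs4 )
                return;

        asm volatile (
                "       mtctr   %[cnt]           \n"
                "1:                              \n"
                "       lvx     %[t0],0,%[s]     \n"
                "       lvx     %[t1],%[o16],%[s]\n"
                "       lvx     %[t2],%[o32],%[s]\n"
                "       lvx     %[t3],%[o48],%[s]\n"
                "       addi    %[s],%[s],64     \n"
                "       stvx    %[t0],0,%[d]     \n"
                "       stvx    %[t1],%[o16],%[d]\n"
                "       stvx    %[t2],%[o32],%[d]\n"
                "       stvx    %[t3],%[o48],%[d]\n"
                "       addi    %[d],%[d],64     \n"
                "       bdnz    1b               \n"
                : [s] "+b" (src), [d] "+b" (dest),
                  [t0] "=&v" (t0), [t1] "=&v" (t1),
                  [t2] "=&v" (t2), [t3] "=&v" (t3)
                : [cnt] "r" (vecs4),
                  [o16] "b" (o16), [o32] "b" (o32), [o48] "b" (o48)
                : "ctr", "memory" );
}
--END--

The "ctr" and "memory" clobbers tell gcc the count register is used and
the buffers are touched; beyond register selection there is nothing
left for the compiler to rearrange, which is both the point and the
drawback of this approach.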