Hello everyone!

Unlike scalar code generation for ppc, which seems to get better with
each gcc version, altivec code generation seems to get progressively
worse. I am not a powerpc guru (yet :P), so I'd like to discuss some
problems I've spotted before adding a new entry to bugzilla. I hope
there is someone who can tell me what else I should check.

NOTE: I've been using gcc versions patched by distributions, but
vanilla gcc 4.1.2 gave the same results as ppc-pld-linux-gcc-4.1.2 and
gcc-4.1, so I believe no breakage was introduced by the patching in
those gcc versions.

$ ppc-pld-linux-gcc-4.1.2 --version
ppc-pld-linux-gcc-4.1.2 (GCC) 4.1.2 (PLD-Linux)
$ ppc-pld-linux-gcc-4.2.0 --version
ppc-pld-linux-gcc-4.2.0 (GCC) 4.2.0 (PLD-Linux)
$ gcc-3.3 --version
gcc-3.3 (GCC) 3.3.6 (Debian 1:3.3.6-15)
$ gcc-4.1 --version
gcc-4.1 (GCC) 4.1.2 20061115 (prerelease) (Debian 4.1.1-21)

PROBLEM 1.

With altivec enabled, gcc moves the stack in every function by an
additional 16 bytes, whether altivec is used or not. An empty function:

$ cat empty.c
void empty(void) { }

compiled with -O2 gives:

empty:
        blr

But with -maltivec it adds a 'stwu' instruction at the beginning and an
'addi' at the end:

empty:
        stwu 1,-16(1)
        addi 1,1,16
        blr

I believe this is totally unnecessary and useless. It happens with all
gcc versions I've checked; in gcc 3.3 it is triggered by -mabi=altivec,
in gcc 4.* just adding -maltivec (or -mcpu=7450 or similar) activates
this "feature". Are those instructions supposed to do anything?
Moreover, those two instructions could easily be simplified to one:

        stw 1,-16(1)

Why aren't they?

PROBLEM 2.

Let's take a very simple memory copying function using altivec:

--BEGIN--
#include <altivec.h>
#define __need_size_t
#include <stddef.h>

void vec_memcpy( void *dest, const void *src, size_t vecs4 )
{
        vector unsigned char *vDest = dest;
        const vector unsigned char *vSrc = src;
        vector unsigned char tmp0, tmp1, tmp2, tmp3;
        int i;

        for ( i = 0; i < vecs4; i++ ) {
                tmp0 = vec_ld(  0, vSrc );
                tmp1 = vec_ld( 16, vSrc );
                tmp2 = vec_ld( 32, vSrc );
                tmp3 = vec_ld( 48, vSrc );
                vSrc += 4;
                vec_st( tmp0,  0, vDest );
                vec_st( tmp1, 16, vDest );
                vec_st( tmp2, 32, vDest );
                vec_st( tmp3, 48, vDest );
                vDest += 4;
        }
}
--END--

With gcc 3.3 it gives very nice code:

$ gcc-3.3 -O2 -maltivec vec_memcpy.c -S

--BEGIN--
vec_memcpy:
        cmpwi 0,5,0
        beqlr- 0
        mtctr 5
        li 8,0
        li 10,16
        li 11,32
        li 9,48
.L9:
        lvx 0,0,4
        lvx 1,10,4
        lvx 13,11,4
        lvx 12,9,4
        stvx 0,8,3
        addi 4,4,64
        stvx 1,10,3
        stvx 13,11,3
        stvx 12,9,3
        addi 3,3,64
        bdnz .L9
        blr
--END--

The only part I don't like about it is the 'li 8,0'. But gcc 4.1.2 and
4.2.0 generate code that calculates every load/store pointer manually
and always uses '0' as the index, so the main loop becomes:

--BEGIN--
.L4:
        addi 9,4,16
        lvx 0,0,4
        lvx 1,0,9
        addi 9,4,32
        lvx 13,0,9
        addi 9,4,48
        lvx 12,0,9
        addi 9,3,16
        stvx 0,0,3
        addi 4,4,64
        stvx 1,0,9
        addi 9,3,32
        stvx 13,0,9
        addi 9,3,48
        stvx 12,0,9
        addi 3,3,64
        bdnz .L4
--END--

and it is slower than the former version. I've tried using int and
register int as the index, but gcc always manages to use '0' as the
second lvx/stvx argument.
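To put a number on "slower", this is the kind of harness I use; a
minimal sketch (the bench.c name, the 1 MiB buffer and the loop count
are arbitrary choices of mine), built once with each compiler together
with vec_memcpy.c:

--BEGIN--
/* bench.c - build together with vec_memcpy.c, e.g.:
 *   gcc-3.3 -O2 -maltivec bench.c vec_memcpy.c -o bench-3.3
 *   gcc-4.1 -O2 -maltivec bench.c vec_memcpy.c -o bench-4.1
 * and compare the reported times. */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

void vec_memcpy( void *dest, const void *src, size_t vecs4 );

int main( void )
{
        const size_t size = 1 << 20;            /* 1 MiB buffers */
        const size_t vecs4 = size / 64;         /* 64 bytes copied per iteration */
        const int loops = 2000;
        void *src, *dest;
        struct timeval t0, t1;
        double elapsed;
        int i, bad;

        /* lvx/stvx ignore the low 4 address bits, so both buffers
         * must be 16-byte aligned */
        if ( posix_memalign( &src, 16, size ) || posix_memalign( &dest, 16, size ) )
                return 1;
        memset( src, 0x5a, size );

        gettimeofday( &t0, NULL );
        for ( i = 0; i < loops; i++ )
                vec_memcpy( dest, src, vecs4 );
        gettimeofday( &t1, NULL );

        elapsed = ( t1.tv_sec - t0.tv_sec ) + ( t1.tv_usec - t0.tv_usec ) / 1e6;
        printf( "%d x %lu bytes: %.3f s (%.1f MB/s)\n",
                loops, (unsigned long) size, elapsed,
                size * loops / elapsed / 1e6 );

        bad = memcmp( src, dest, size ) != 0;   /* sanity check the copy */
        free( src );
        free( dest );
        return bad;
}
--END--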
As a workaround I tried to use inline assembly for the loads and stores:

#define asm_lvx( vec, idx, ptr )        \
        asm ( "lvx %0,%1,%2"            \
                : "=v" (vec)            \
                : "r" (idx), "r" (ptr) )

#define asm_stvx( vec, idx, ptr )       \
        asm ( "stvx %0,%1,%2"           \
                :                       \
                : "v" (vec),            \
                  "r" (idx), "r" (ptr)  \
                : "memory"              \
        )

The whole code became:

--BEGIN--
#include <altivec.h>
#define __need_size_t
#include <stddef.h>

#define asm_lvx( vec, idx, ptr )        \
        asm ( "lvx %0,%1,%2"            \
                : "=v" (vec)            \
                : "r" (idx), "r" (ptr)  \
        )

#define asm_stvx( vec, idx, ptr )       \
        asm ( "stvx %0,%1,%2"           \
                :                       \
                : "v" (vec),            \
                  "r" (idx), "r" (ptr)  \
                : "memory"              \
        )

void vec_memcpy( void *dest, const void *src, size_t vecs4 )
{
        vector unsigned char *vDest = dest;
        const vector unsigned char *vSrc = src;
        vector unsigned char tmp0, tmp1, tmp2, tmp3;
        int i;

        for ( i = 0; i < vecs4; i++ ) {
                asm_lvx( tmp0,  0, vSrc );
                asm_lvx( tmp1, 16, vSrc );
                asm_lvx( tmp2, 32, vSrc );
                asm_lvx( tmp3, 48, vSrc );
                vSrc += 4;
                asm_stvx( tmp0,  0, vDest );
                asm_stvx( tmp1, 16, vDest );
                asm_stvx( tmp2, 32, vDest );
                asm_stvx( tmp3, 48, vDest );
                vDest += 4;
        }
}
--END--

With gcc 4.1.2 the result looks nice:

--BEGIN--
vec_memcpy:
        mr. 0,5
        stwu 1,-16(1)
        mtctr 0
        beq- 0,.L5
        li 10,0
        li 11,16
        li 9,32
        li 0,48
.L4:
#APP
        lvx 0,10,4
        lvx 1,11,4
        lvx 13,9,4
        lvx 12,0,4
#NO_APP
        addi 4,4,64
#APP
        stvx 0,10,3
        stvx 1,11,3
        stvx 13,9,3
        stvx 12,0,3
#NO_APP
        addi 3,3,64
        bdnz .L4
.L5:
        addi 1,1,16
        blr
--END--

But there's a problem: in lvx/stvx a '0' as the second operand acts as
a literal 0, not as a register number, so the 'li 0,48' above produces
wrong code. Shouldn't gcc understand those instructions and avoid r0
here? Anyway, I marked "r0" in the asm as a clobbered register to
prevent gcc from using it and defined additional macros for lvx and
stvx with a '0' index. Now the code works correctly and gives nice
results with gcc 4.1.2 and gcc 3.3, but it doesn't look as nice as
before and takes away gcc's freedom to reorganize the instruction order
(a sketch of an alternative using gcc's "b" constraint follows the
first attached file below). See the included file; the relevant part of
the generated code is:

--BEGIN--
        li 10,16
        li 11,32
        li 9,48
.L4:
#APP
        lvx 0,0,4
        lvx 1,10,4
        lvx 13,11,4
        lvx 12,9,4
#NO_APP
        addi 4,4,64
#APP
        stvx 0,0,3
        stvx 1,10,3
        stvx 13,11,3
        stvx 12,9,3
#NO_APP
        addi 3,3,64
        bdnz .L4
--END--

But there is still a problem with gcc 4.2.0: it loads the constants
into registers inside the loop, not before it. I tried defining the
indexes as:

        register int index1 = 16, index2 = 32, index3 = 48;

and forcing the values to be loaded before the loop:

        asm volatile ( "" : : "r" (index1), "r" (index2), "r" (index3) );

but the result is even worse, the values are loaded twice! Finally the
second included file did the trick (its gcc 4.2.0 output is attached as
well). But with all those asm inlines I would rather rewrite the whole
loop in assembly (a rough sketch of such a loop is appended after the
attached files); at least then I'll be sure no additional magic will be
needed for future gcc versions.

Maybe none of these are bugs and the code generated by gcc 4.2 and
4.1.2 should be as fast as that generated by gcc 3.3. If so, tell me,
why on my machine it isn't?

-- 
Sparky (Przemyslaw Iskra)                   Lang: Pl Ca Es En
WWW:  ppcrcd.pld-linux.org
JID:  sparky<at>jabberes.org
Mail: sparky<at>pld-linux.org
/* First attached file: vec_memcpy using asm macros that clobber "r0",
 * so gcc never picks r0 as the index or base register. */
#include <altivec.h>
#define __need_size_t
#include <stddef.h>

#define asm_lvx( vec, idx, ptr )        \
        asm ( "lvx %0,%1,%2"            \
                : "=v" (vec)            \
                : "r" (idx), "r" (ptr)  \
                : "r0"                  \
        )

#define asm_lvx0( vec, ptr )            \
        asm ( "lvx %0,0,%1"             \
                : "=v" (vec)            \
                : "r" (ptr)             \
                : "r0"                  \
        )

#define asm_stvx( vec, idx, ptr )       \
        asm ( "stvx %0,%1,%2"           \
                :                       \
                : "v" (vec),            \
                  "r" (idx), "r" (ptr)  \
                : "r0", "memory"        \
        )

#define asm_stvx0( vec, ptr )           \
        asm ( "stvx %0,0,%1"            \
                :                       \
                : "v" (vec),            \
                  "r" (ptr)             \
                : "r0", "memory"        \
        )

void vec_memcpy( void *dest, const void *src, size_t vecs4 )
{
        vector unsigned char *vDest = dest;
        const vector unsigned char *vSrc = src;
        vector unsigned char tmp0, tmp1, tmp2, tmp3;
        int i;

        for ( i = 0; i < vecs4; i++ ) {
                asm_lvx0( tmp0, vSrc );
                asm_lvx( tmp1, 16, vSrc );
                asm_lvx( tmp2, 32, vSrc );
                asm_lvx( tmp3, 48, vSrc );
                vSrc += 4;
                asm_stvx0( tmp0, vDest );
                asm_stvx( tmp1, 16, vDest );
                asm_stvx( tmp2, 32, vDest );
                asm_stvx( tmp3, 48, vDest );
                vDest += 4;
        }
}
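The alternative mentioned in the mail body, as a sketch (untested on my
side): gcc's machine constraints for PowerPC include "b", an "address
base register", i.e. any general register except r0. Using "b" instead
of "r" for the index and pointer operands should keep r0 out of the
address fields without clobbering it and without the separate
asm_lvx0/asm_stvx0 variants:

--BEGIN--
/* Sketch: with the "b" constraint gcc cannot substitute r0 for %1 or
 * %2, so no "r0" clobber is needed; a literal 0 index simply becomes
 * 'li rN,0', like the 'li 8,0' in the gcc 3.3 output. */
#define asm_lvx( vec, idx, ptr )        \
        asm ( "lvx %0,%1,%2"            \
                : "=v" (vec)            \
                : "b" (idx), "b" (ptr)  \
        )

#define asm_stvx( vec, idx, ptr )       \
        asm ( "stvx %0,%1,%2"           \
                :                       \
                : "v" (vec),            \
                  "b" (idx), "b" (ptr)  \
                : "memory"              \
        )
--END--

Without the hard "r0" clobber, gcc also keeps its freedom to schedule
the surrounding instructions.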
/* Second attached file: same macros as above, but the index values are
 * forced into registers with explicit 'li' instructions before the
 * loop, so gcc 4.2.0 does not reload them inside it. */
#include <altivec.h>
#define __need_size_t
#include <stddef.h>

#define asm_lvx( vec, idx, ptr )        \
        asm ( "lvx %0,%1,%2"            \
                : "=v" (vec)            \
                : "r" (idx), "r" (ptr)  \
                : "r0"                  \
        )

#define asm_lvx0( vec, ptr )            \
        asm ( "lvx %0,0,%1"             \
                : "=v" (vec)            \
                : "r" (ptr)             \
                : "r0"                  \
        )

#define asm_stvx( vec, idx, ptr )       \
        asm ( "stvx %0,%1,%2"           \
                :                       \
                : "v" (vec),            \
                  "r" (idx), "r" (ptr)  \
                : "r0", "memory"        \
        )

#define asm_stvx0( vec, ptr )           \
        asm ( "stvx %0,0,%1"            \
                :                       \
                : "v" (vec),            \
                  "r" (ptr)             \
                : "r0", "memory"        \
        )

void vec_memcpy( void *dest, const void *src, size_t vecs4 )
{
        register int index1, index2, index3;
        vector unsigned char *vDest = dest;
        const vector unsigned char *vSrc = src;
        vector unsigned char tmp0, tmp1, tmp2, tmp3;

        if ( vecs4 ) {
                asm ( "li %0,16" : "=r" (index1) );
                asm ( "li %0,32" : "=r" (index2) );
                asm ( "li %0,48" : "=r" (index3) );
                do {
                        asm_lvx0( tmp0, vSrc );
                        asm_lvx( tmp1, index1, vSrc );
                        asm_lvx( tmp2, index2, vSrc );
                        asm_lvx( tmp3, index3, vSrc );
                        vSrc += 4;
                        asm_stvx0( tmp0, vDest );
                        asm_stvx( tmp1, index1, vDest );
                        asm_stvx( tmp2, index2, vDest );
                        asm_stvx( tmp3, index3, vDest );
                        vDest += 4;
                } while ( --vecs4 );
        }
}
	.file	"vec_memcpy.c"
	.section	".text"
	.align 2
	.globl vec_memcpy
	.type	vec_memcpy, @function
vec_memcpy:
	mr. 0,5
	stwu 1,-16(1)
	mtctr 0
	beq- 0,.L5
#APP
	li 10,16
	li 11,32
	li 9,48
#NO_APP
.L4:
#APP
	lvx 0,0,4
	lvx 1,10,4
	lvx 13,11,4
	lvx 12,9,4
#NO_APP
	addi 4,4,64
#APP
	stvx 0,0,3
	stvx 1,10,3
	stvx 13,11,3
	stvx 12,9,3
#NO_APP
	addi 3,3,64
	bdnz .L4
.L5:
	addi 1,1,16
	blr
	.size	vec_memcpy, .-vec_memcpy
	.ident	"GCC: (GNU) 4.2.0 (PLD-Linux)"
	.section	.note.GNU-stack,"",@progbits
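And this is roughly what I mean by rewriting the whole loop in
assembly, as referenced in the mail body. A rough, untested sketch on
my part (the operand names, the "b" constraints and the if() guard are
my choices; everything inside the template is plain AltiVec/PowerPC
assembly, built with -O2 -maltivec):

--BEGIN--
#include <altivec.h>
#define __need_size_t
#include <stddef.h>

/* Sketch: the whole 64-bytes-per-iteration copy loop as one asm
 * statement.  gcc only has to pick registers; "b" guarantees none of
 * the address operands ends up in r0. */
void vec_memcpy( void *dest, const void *src, size_t vecs4 )
{
        vector unsigned char t0, t1, t2, t3;
        long o16 = 16, o32 = 32, o48 = 48;

        if ( !vecs4 )
                return;

        asm volatile (
                "       mtctr   %[cnt]           \n"
                "1:                              \n"
                "       lvx     %[t0],0,%[s]     \n"
                "       lvx     %[t1],%[o16],%[s]\n"
                "       lvx     %[t2],%[o32],%[s]\n"
                "       lvx     %[t3],%[o48],%[s]\n"
                "       addi    %[s],%[s],64     \n"
                "       stvx    %[t0],0,%[d]     \n"
                "       stvx    %[t1],%[o16],%[d]\n"
                "       stvx    %[t2],%[o32],%[d]\n"
                "       stvx    %[t3],%[o48],%[d]\n"
                "       addi    %[d],%[d],64     \n"
                "       bdnz    1b               \n"
                : [s] "+b" (src), [d] "+b" (dest),
                  [t0] "=&v" (t0), [t1] "=&v" (t1),
                  [t2] "=&v" (t2), [t3] "=&v" (t3)
                : [cnt] "r" (vecs4),
                  [o16] "b" (o16), [o32] "b" (o32), [o48] "b" (o48)
                : "ctr", "memory" );
}
--END--

The "ctr" and "memory" clobbers tell gcc the count register is used and
the buffers are touched; beyond register selection there is nothing
left for the compiler to rearrange, which is both the point and the
drawback of this approach.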