On Thu, Apr 30, 2009 at 2:36 PM, Kjetil Barvik <barvik@xxxxxxxxxxxx> wrote: > * "Shawn O. Pearce" <spearce@xxxxxxxxxxx> writes: > |> 4) The "static inline void hashcpy(....)" in cache.h could then > |> maybe be written like this: > | > | Its already done as "memcpy(a, b, 20)" which most compilers will > | inline and probably reduce to 5 word moves anyway. That's why > | hashcpy() itself is inline. > > But would the compiler be able to trust that the hashcpy() is always > called with correct word alignment on variables a and b? > > I made a test and compiled git with: > > make USE_NSEC=1 CFLAGS="-march=core2 -mtune=core2 -O2 -g2 -fno-stack-protector" clean all > > compiler: gcc (Gentoo 4.3.3-r2 p1.1, pie-10.1.5) 4.3.3 > CPU: Intel(R) Core(TM)2 CPU T7200 @ 2.00GHz GenuineIntel > > Then used gdb to get the following: > > (gdb) disassemble write_sha1_file > Dump of assembler code for function write_sha1_file: > 0x080e3830 <write_sha1_file+0>: push %ebp > 0x080e3831 <write_sha1_file+1>: mov %esp,%ebp > 0x080e3833 <write_sha1_file+3>: sub $0x58,%esp > 0x080e3836 <write_sha1_file+6>: lea -0x10(%ebp),%eax > 0x080e3839 <write_sha1_file+9>: mov %ebx,-0xc(%ebp) > 0x080e383c <write_sha1_file+12>: mov %esi,-0x8(%ebp) > 0x080e383f <write_sha1_file+15>: mov %edi,-0x4(%ebp) > 0x080e3842 <write_sha1_file+18>: mov 0x14(%ebp),%ebx > 0x080e3845 <write_sha1_file+21>: mov %eax,0x8(%esp) > 0x080e3849 <write_sha1_file+25>: lea -0x44(%ebp),%edi > 0x080e384c <write_sha1_file+28>: lea -0x24(%ebp),%esi > 0x080e384f <write_sha1_file+31>: mov %edi,0x4(%esp) > 0x080e3853 <write_sha1_file+35>: mov %esi,(%esp) > 0x080e3856 <write_sha1_file+38>: mov 0x10(%ebp),%ecx > 0x080e3859 <write_sha1_file+41>: mov 0xc(%ebp),%edx > 0x080e385c <write_sha1_file+44>: mov 0x8(%ebp),%eax > 0x080e385f <write_sha1_file+47>: call 0x80e0350 <write_sha1_file_prepare> > 0x080e3864 <write_sha1_file+52>: test %ebx,%ebx > 0x080e3866 <write_sha1_file+54>: je 0x80e3885 <write_sha1_file+85> > > 0x080e3868 <write_sha1_file+56>: 
mov -0x24(%ebp),%eax > 0x080e386b <write_sha1_file+59>: mov %eax,(%ebx) > 0x080e386d <write_sha1_file+61>: mov -0x20(%ebp),%eax > 0x080e3870 <write_sha1_file+64>: mov %eax,0x4(%ebx) > 0x080e3873 <write_sha1_file+67>: mov -0x1c(%ebp),%eax > 0x080e3876 <write_sha1_file+70>: mov %eax,0x8(%ebx) > 0x080e3879 <write_sha1_file+73>: mov -0x18(%ebp),%eax > 0x080e387c <write_sha1_file+76>: mov %eax,0xc(%ebx) > 0x080e387f <write_sha1_file+79>: mov -0x14(%ebp),%eax > 0x080e3882 <write_sha1_file+82>: mov %eax,0x10(%ebx) > > I admit that I am not particularly familiar with Intel machine > instructions, but I guess that the above 10 mov instructions are the > result of the compiled inline hashcpy() in the write_sha1_file() > function in sha1_file.c > > Question: would it be possible for the compiler to compile it down to > just 5 mov instructions if we had used an unsigned 32-bit type? Or is > this the best we can reasonably hope for inside the write_sha1_file() > function? > > I checked 3 other outputs of "disassemble function_foo", and it seems > that those 3 functions I checked got 10 mov instructions for the > inline hashcpy(), as far as I can tell. 
> > 0x080e3885 <write_sha1_file+85>: mov %esi,(%esp) > 0x080e3888 <write_sha1_file+88>: call 0x80e3800 <has_sha1_file> > 0x080e388d <write_sha1_file+93>: xor %edx,%edx > 0x080e388f <write_sha1_file+95>: test %eax,%eax > 0x080e3891 <write_sha1_file+97>: jne 0x80e38b6 <write_sha1_file+134> > 0x080e3893 <write_sha1_file+99>: mov 0xc(%ebp),%eax > 0x080e3896 <write_sha1_file+102>: mov %edi,%edx > 0x080e3898 <write_sha1_file+104>: mov %eax,0x4(%esp) > 0x080e389c <write_sha1_file+108>: mov -0x10(%ebp),%ecx > 0x080e389f <write_sha1_file+111>: mov 0x8(%ebp),%eax > 0x080e38a2 <write_sha1_file+114>: movl $0x0,0x8(%esp) > 0x080e38aa <write_sha1_file+122>: mov %eax,(%esp) > 0x080e38ad <write_sha1_file+125>: mov %esi,%eax > 0x080e38af <write_sha1_file+127>: call 0x80e1e40 <write_loose_object> > 0x080e38b4 <write_sha1_file+132>: mov %eax,%edx > 0x080e38b6 <write_sha1_file+134>: mov %edx,%eax > 0x080e38b8 <write_sha1_file+136>: mov -0xc(%ebp),%ebx > 0x080e38bb <write_sha1_file+139>: mov -0x8(%ebp),%esi > 0x080e38be <write_sha1_file+142>: mov -0x4(%ebp),%edi > 0x080e38c1 <write_sha1_file+145>: leave > 0x080e38c2 <write_sha1_file+146>: ret > End of assembler dump. > (gdb) > > So, maybe the compiler is doing the right thing after all? > Well, I just tested this with GCC myself. 
I used this segment of code: #include <memory.h> void hashcpy(unsigned char *sha_dst, const unsigned char *sha_src) { memcpy(sha_dst, sha_src, 20); } I compiled using Apple's GCC 4.0.1 (note that GCC 4.3 and 4.4 vanilla yield the same code) with these parameters to get Intel assembly: gcc -O2 -arch i386 -march=pentium3 -mtune=pentium3 -fomit-frame-pointer -fno-strict-aliasing -S test.c and these parameters to get the equivalent PowerPC code: gcc -O2 -mcpu=G5 -arch ppc -fomit-frame-pointer -fno-strict-aliasing -S test.c Intel code: .text .align 4,0x90 .globl _hashcpy _hashcpy: subl $12, %esp movl 20(%esp), %edx movl 16(%esp), %ecx movl (%edx), %eax movl %eax, (%ecx) movl 4(%edx), %eax movl %eax, 4(%ecx) movl 8(%edx), %eax movl %eax, 8(%ecx) movl 12(%edx), %eax movl %eax, 12(%ecx) movl 16(%edx), %eax movl %eax, 16(%ecx) addl $12, %esp ret .subsections_via_symbols and the PowerPC code: .section __TEXT,__text,regular,pure_instructions .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 .machine ppc970 .text .align 2 .p2align 4,,15 .globl _hashcpy _hashcpy: lwz r0,0(r4) lwz r2,4(r4) lwz r9,8(r4) lwz r11,12(r4) stw r0,0(r3) stw r2,4(r3) stw r9,8(r3) stw r11,12(r3) lwz r0,16(r4) stw r0,16(r3) blr .subsections_via_symbols So it does look like GCC does what it should and it inlines the memcpy. A bit off topic, but the results are rather interesting to me, and I think I see a weakness in how GCC is doing this on Intel. Someone please correct me if I'm wrong, but the PowerPC code seems much better because it can yield very high instruction-level parallelism. It does 5 loads and then 5 stores, using 4 registers for temporary storage and 2 registers for pointers. I realize the Intel x86 architecture is quite constrained in that it has so few general purpose registers, but there has to be better code than what GCC emitted above. 
It seems like the processor would stall because of the quantity of sequential inter-dependent instructions that can't be done in parallel (mov to memory that depends on a mov to eax, etc.). I suppose the code might not be stalling if it's using the maximum number of registers and doing as many memory accesses as it can per clock, but based on known details about the architecture, does it seem to be doing that? - Steven -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html