Re: Why Git is so fast

Steven Noonan <steven@xxxxxxxxxxxxxx> · Thu, 30 Apr 2009 17:23:57 -0700

On Thu, Apr 30, 2009 at 2:36 PM, Kjetil Barvik <barvik@xxxxxxxxxxxx> wrote:
> * "Shawn O. Pearce" <spearce@xxxxxxxxxxx> writes:
> |>      4) The "static inline void hashcpy(....)" in cache.h could then
> |>         maybe be written like this:
> |
> | It's already done as "memcpy(a, b, 20)" which most compilers will
> | inline and probably reduce to 5 word moves anyway.  That's why
> | hashcpy() itself is inline.
>
>  But would the compiler be able to trust that the hashcpy() is always
>  called with correct word alignment on variables a and b?
>
>  I made a test and compiled git with:
>
>    make USE_NSEC=1 CFLAGS="-march=core2 -mtune=core2 -O2 -g2 -fno-stack-protector" clean all
>
>  compiler: gcc (Gentoo 4.3.3-r2 p1.1, pie-10.1.5) 4.3.3
>  CPU: Intel(R) Core(TM)2 CPU T7200 @ 2.00GHz GenuineIntel
>
>  Then used gdb to get the following:
>
> (gdb) disassemble write_sha1_file
> Dump of assembler code for function write_sha1_file:
> 0x080e3830 <write_sha1_file+0>: push   %ebp
> 0x080e3831 <write_sha1_file+1>: mov    %esp,%ebp
> 0x080e3833 <write_sha1_file+3>: sub    $0x58,%esp
> 0x080e3836 <write_sha1_file+6>: lea    -0x10(%ebp),%eax
> 0x080e3839 <write_sha1_file+9>: mov    %ebx,-0xc(%ebp)
> 0x080e383c <write_sha1_file+12>:        mov    %esi,-0x8(%ebp)
> 0x080e383f <write_sha1_file+15>:        mov    %edi,-0x4(%ebp)
> 0x080e3842 <write_sha1_file+18>:        mov    0x14(%ebp),%ebx
> 0x080e3845 <write_sha1_file+21>:        mov    %eax,0x8(%esp)
> 0x080e3849 <write_sha1_file+25>:        lea    -0x44(%ebp),%edi
> 0x080e384c <write_sha1_file+28>:        lea    -0x24(%ebp),%esi
> 0x080e384f <write_sha1_file+31>:        mov    %edi,0x4(%esp)
> 0x080e3853 <write_sha1_file+35>:        mov    %esi,(%esp)
> 0x080e3856 <write_sha1_file+38>:        mov    0x10(%ebp),%ecx
> 0x080e3859 <write_sha1_file+41>:        mov    0xc(%ebp),%edx
> 0x080e385c <write_sha1_file+44>:        mov    0x8(%ebp),%eax
> 0x080e385f <write_sha1_file+47>:        call   0x80e0350 <write_sha1_file_prepare>
> 0x080e3864 <write_sha1_file+52>:        test   %ebx,%ebx
> 0x080e3866 <write_sha1_file+54>:        je     0x80e3885 <write_sha1_file+85>
>
> 0x080e3868 <write_sha1_file+56>:        mov    -0x24(%ebp),%eax
> 0x080e386b <write_sha1_file+59>:        mov    %eax,(%ebx)
> 0x080e386d <write_sha1_file+61>:        mov    -0x20(%ebp),%eax
> 0x080e3870 <write_sha1_file+64>:        mov    %eax,0x4(%ebx)
> 0x080e3873 <write_sha1_file+67>:        mov    -0x1c(%ebp),%eax
> 0x080e3876 <write_sha1_file+70>:        mov    %eax,0x8(%ebx)
> 0x080e3879 <write_sha1_file+73>:        mov    -0x18(%ebp),%eax
> 0x080e387c <write_sha1_file+76>:        mov    %eax,0xc(%ebx)
> 0x080e387f <write_sha1_file+79>:        mov    -0x14(%ebp),%eax
> 0x080e3882 <write_sha1_file+82>:        mov    %eax,0x10(%ebx)
>
>  I admit that I am not particularly familiar with Intel machine
>  instructions, but I guess that the above 10 mov instructions are the
>  result of the compiled inline hashcpy() in the write_sha1_file()
>  function in sha1_file.c.
>
>  Question: would it be possible for the compiler to compile it down to
>  just 5 mov instructions if we had used an unsigned 32-bit type?  Or is
>  this the best we can reasonably hope for inside the write_sha1_file()
>  function?
>
>  I checked 3 other outputs of "disassemble function_foo", and it seems
>  that those 3 functions I checked got 10 mov instructions for the
>  inline hashcpy(), as far as I can tell.
>
> 0x080e3885 <write_sha1_file+85>:        mov    %esi,(%esp)
> 0x080e3888 <write_sha1_file+88>:        call   0x80e3800 <has_sha1_file>
> 0x080e388d <write_sha1_file+93>:        xor    %edx,%edx
> 0x080e388f <write_sha1_file+95>:        test   %eax,%eax
> 0x080e3891 <write_sha1_file+97>:        jne    0x80e38b6 <write_sha1_file+134>
> 0x080e3893 <write_sha1_file+99>:        mov    0xc(%ebp),%eax
> 0x080e3896 <write_sha1_file+102>:       mov    %edi,%edx
> 0x080e3898 <write_sha1_file+104>:       mov    %eax,0x4(%esp)
> 0x080e389c <write_sha1_file+108>:       mov    -0x10(%ebp),%ecx
> 0x080e389f <write_sha1_file+111>:       mov    0x8(%ebp),%eax
> 0x080e38a2 <write_sha1_file+114>:       movl   $0x0,0x8(%esp)
> 0x080e38aa <write_sha1_file+122>:       mov    %eax,(%esp)
> 0x080e38ad <write_sha1_file+125>:       mov    %esi,%eax
> 0x080e38af <write_sha1_file+127>:       call   0x80e1e40 <write_loose_object>
> 0x080e38b4 <write_sha1_file+132>:       mov    %eax,%edx
> 0x080e38b6 <write_sha1_file+134>:       mov    %edx,%eax
> 0x080e38b8 <write_sha1_file+136>:       mov    -0xc(%ebp),%ebx
> 0x080e38bb <write_sha1_file+139>:       mov    -0x8(%ebp),%esi
> 0x080e38be <write_sha1_file+142>:       mov    -0x4(%ebp),%edi
> 0x080e38c1 <write_sha1_file+145>:       leave
> 0x080e38c2 <write_sha1_file+146>:       ret
> End of assembler dump.
> (gdb)
>
>  So, maybe the compiler is doing the right thing after all?
>

Well, I just tested this with GCC myself. I used this segment of code:

        #include <memory.h>
        void hashcpy(unsigned char *sha_dst, const unsigned char *sha_src)
        {
                memcpy(sha_dst, sha_src, 20);
        }

I compiled using Apple's GCC 4.0.1 (note that GCC 4.3 and 4.4 vanilla
yield the same code) with these parameters to get Intel assembly:
        gcc -O2 -arch i386 -march=pentium3 -mtune=pentium3 -fomit-frame-pointer -fno-strict-aliasing -S test.c
and these parameters to get the equivalent PowerPC code:
        gcc -O2 -mcpu=G5 -arch ppc -fomit-frame-pointer -fno-strict-aliasing -S test.c

Intel code:
        .text
        .align 4,0x90
.globl _hashcpy
_hashcpy:
        subl    $12, %esp
        movl    20(%esp), %edx
        movl    16(%esp), %ecx
        movl    (%edx), %eax
        movl    %eax, (%ecx)
        movl    4(%edx), %eax
        movl    %eax, 4(%ecx)
        movl    8(%edx), %eax
        movl    %eax, 8(%ecx)
        movl    12(%edx), %eax
        movl    %eax, 12(%ecx)
        movl    16(%edx), %eax
        movl    %eax, 16(%ecx)
        addl    $12, %esp
        ret
        .subsections_via_symbols

and the PowerPC code:

        .section __TEXT,__text,regular,pure_instructions
        .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32
        .machine ppc970
        .text
        .align 2
        .p2align 4,,15
        .globl _hashcpy
_hashcpy:
        lwz r0,0(r4)
        lwz r2,4(r4)
        lwz r9,8(r4)
        lwz r11,12(r4)
        stw r0,0(r3)
        stw r2,4(r3)
        stw r9,8(r3)
        stw r11,12(r3)
        lwz r0,16(r4)
        stw r0,16(r3)
        blr
        .subsections_via_symbols

So it does look like GCC does what it should and it inlines the memcpy.

A bit off topic, but the results are rather interesting to me, and I
think I see a weakness in how GCC is doing this on Intel. Someone
please correct me if I'm wrong, but the PowerPC code seems much better
because it can yield very high instruction-level parallelism. It does
4 loads and then 4 stores, followed by a final load/store pair, using
4 registers for temporary storage and 2 registers for pointers.

I realize the Intel x86 architecture is quite constrained in that it
has so few general purpose registers, but there has to be better code
than what GCC emitted above. It seems like the processor would stall
because of the quantity of sequential inter-dependent instructions
that can't be done in parallel (mov to memory that depends on a mov to
eax, etc).

I suppose the code might not be stalling if it's using the maximum
number of registers and doing as many memory accesses as it can per
clock, but based on known details about the architecture, does it seem
to be doing that?

- Steven
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html