Hi, list.

I'm not sure whether I have found a bug or just made a mistake in my gcc inline assembly.
I think it is more likely a bug, because my code works well when I don't use compile-time optimization.
I have the following code for x86_64 target:
#include <stdio.h>

#define SPIN_LOCKED_V   1

/*
 * Lock object operated on by spin_lock() and spin_unlock(): one word that
 * holds either SPIN_LOCKED_V or SPIN_UNLOCKED_V.
 *
 * NOTE(review): the closing "};" of this definition is not present in this
 * copy of the listing.
 */
struct spin {
    unsigned long val;

/*
 * spin_lock - busy-wait until lock->val has been switched from
 * SPIN_UNLOCKED_V to SPIN_LOCKED_V.
 *
 * Operands: %0 is lock->val (read/write memory), %1 is SPIN_LOCKED_V in a
 * register, %2 is SPIN_UNLOCKED_V (register or immediate). The sequence
 * loads %2 into %rax, executes "lock cmpxchg %1, %0", and jumps back to
 * label 1 while %rax differs from %2. %rax and memory are declared clobbered.
 *
 * NOTE(review): label "1:" is placed after the initial movq, so %rax is not
 * reloaded with SPIN_UNLOCKED_V before a retry. A failed cmpxchg presumably
 * leaves the value it read from memory in %rax, in which case every retry
 * compares against that stale value -- confirm against the CMPXCHG
 * instruction reference.
 * NOTE(review): the "volatile" in the casts applies to a constant value, not
 * to an object, so it presumably has no effect on how the operand is treated.
 * NOTE(review): the function's braces are not present in this copy of the
 * listing.
 */
static inline void spin_lock(struct spin *lock)
    __asm__ volatile ("movq %2, %%rax\n\t"
                      "1: lock cmpxchg %1, %0\n\t"
                      "cmpq %2, %%rax\n\t"
                      "jnz 1b\n"
                      : "+m" (lock->val)
                      : "r" ((volatile long)SPIN_LOCKED_V),
                        "rI" ((volatile long)SPIN_UNLOCKED_V)
                      : "%rax", "memory");

/*
 * spin_unlock - release the lock by exchanging SPIN_UNLOCKED_V into
 * lock->val with "lock xchgq".
 *
 * Operands: %0 is lock->val (read/write memory), %1 is SPIN_UNLOCKED_V in a
 * register, declared as an input-only operand.
 *
 * NOTE(review): xchg swaps its two operands, so after the instruction the
 * register behind %1 holds the previous lock->val. Because %1 is declared
 * input-only ("r"), the compiler is entitled to assume that register still
 * contains SPIN_UNLOCKED_V and may reuse it for later calls (for example by
 * loading it once outside a loop); a later unlock would then write the stale
 * value back. Likely fix: pass a local variable through a "+r" operand so
 * the modification is visible to the compiler -- confirm against the GCC
 * Extended Asm documentation.
 * NOTE(review): unlike spin_lock(), no "memory" clobber is declared here.
 * NOTE(review): the function's braces are not present in this copy of the
 * listing.
 */
static inline void spin_unlock(struct spin *lock)
  __asm__ volatile ("lock xchgq %1, %0\n"
                    : "+m" (lock->val)
                    : "r" ((volatile long)SPIN_UNLOCKED_V));

/*
 * Test driver: sets spin.val to SPIN_UNLOCKED_V, then for ten iterations
 * prints spin.val three times, labelled [0][UNLOCKED], [1][LOCKED] and
 * [2][UNLOCKED]. Returns 0.
 *
 * NOTE(review): the labels suggest a spin_lock(&spin) call before the second
 * print and a spin_unlock(&spin) call before the third; those calls, and the
 * braces closing the loop and the function, are not present in this copy of
 * the listing.
 * NOTE(review): spin.val is unsigned long, while "%#x" expects unsigned int;
 * "%#lx" would match the argument type.
 */
int main(void)
    struct spin spin;
    int i = 0;

    spin.val = SPIN_UNLOCKED_V;
    for (i = 0; i < 10; i++) {
        printf("[0][UNLOCKED] spin_val = %#x\n", spin.val);
        printf("[1][LOCKED] spin_val = %#x\n", spin.val);
        printf("[2][UNLOCKED] spin_val = %#x\n", spin.val);

    return 0;

The code above works very well when I compile it with the -O0 or -g options, but when I ask gcc to optimize it using the -O1 or -O2 options,
my application hangs after the first iteration.
I have the following output:
[0][UNLOCKED] spin_val = 0
[1][LOCKED] spin_val = 0x1
[2][UNLOCKED] spin_val = 0
[0][UNLOCKED] spin_val = 0
[1][LOCKED] spin_val = 0x1
[2][UNLOCKED] spin_val = 0x1
[0][UNLOCKED] spin_val = 0x1

It hangs because gcc generates the following assembler code:
  4004f0:       41 54                   push   %r12
  4004f2:       41 bc 01 00 00 00       mov    $0x1,%r12d
  4004f8:       55                      push   %rbp
  4004f9:       31 ed                   xor    %ebp,%ebp
  4004fb:       53                      push   %rbx
  4004fc:       31 db                   xor    %ebx,%ebx
  4004fe:       48 83 ec 10             sub    $0x10,%rsp
  400502:       48 c7 04 24 00 00 00    movq   $0x0,(%rsp)
  400509:       00 
  40050a:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  400510:       48 8b 34 24             mov    (%rsp),%rsi        // <- Here is my for(i = 0; i < 10; i++) loop
  400514:       bf 5c 06 40 00          mov    $0x40065c,%edi
  400519:       31 c0                   xor    %eax,%eax
  40051b:       e8 c0 fe ff ff          callq  4003e0 <printf@plt>
  400520:       48 c7 c0 00 00 00 00    mov    $0x0,%rax
  400527:       f0 4c 0f b1 24 24       lock cmpxchg %r12,(%rsp)
  40052d:       48 83 f8 00             cmp    $0x0,%rax
  400531:       75 f4                   jne    400527 <main+0x37>
  400533:       48 8b 34 24             mov    (%rsp),%rsi
  400537:       bf 7a 06 40 00          mov    $0x40067a,%edi
  40053c:       31 c0                   xor    %eax,%eax
  40053e:       e8 9d fe ff ff          callq  4003e0 <printf@plt>
  400543:       f0 48 87 2c 24          lock xchg %rbp,(%rsp)
  400548:       48 8b 34 24             mov    (%rsp),%rsi
  40054c:       31 c0                   xor    %eax,%eax
  40054e:       bf 96 06 40 00          mov    $0x400696,%edi
  400553:       83 c3 01                add    $0x1,%ebx
  400556:       e8 85 fe ff ff          callq  4003e0 <printf@plt>
  40055b:       83 fb 0a                cmp    $0xa,%ebx
  40055e:       75 b0                   jne    400510 <main+0x20> // <- And here it ends

As you can see, gcc initializes the registers %r12 and %rbp, which are used by the spin_lock and spin_unlock functions respectively,
only once, before entering the loop. Looking at this assembler code, I can say that after my spinlock has been locked once it will never be unlocked,
because %r12 and %rbp aren't reinitialized after each iteration with the values I explicitly specified in the spin_lock and spin_unlock functions.
Is this abnormal behavior of the compiler, or did I make a mistake in my gcc inline assembly?

Extra information:
1) GCC: I tried both 4.3.4 and 3.4.6
1.1) gcc-3.4 --version
gcc-3.4 (GCC) 3.4.6 (Debian 3.4.6-10)
1.2) gcc --version
gcc (Debian 4.3.4-6) 4.3.4

2) uname -a
Linux godel 2.6.30-2-amd64 #1 SMP Mon Dec 7 05:21:45 UTC 2009 x86_64 GNU/Linux 

3) cat /proc/cpuinfo | grep 'model ' | head -1
model name	: Intel(R) Core(TM)2 Duo CPU     L7500  @ 1.60GHz

Kind regards.

