> On Apr 26, 2019, at 3:40 PM, Arnaldo Carvalho de Melo <arnaldo.melo@xxxxxxxxx> wrote: > > Em Fri, Apr 26, 2019 at 08:51:09PM +0000, Yonghong Song escreveu: >> On 4/26/19 12:26 PM, Arnaldo Carvalho de Melo wrote: >>> Em Fri, Apr 26, 2019 at 07:11:37PM +0000, Song Liu escreveu: >>>>> On Apr 26, 2019, at 11:37 AM, Arnaldo Carvalho de Melo <arnaldo.melo@xxxxxxxxx> wrote: >>>>> >>>>> tldr; more to the end there are some annotation for all sorts of cache level >>>>> misses trying to validate the 'perf annotation' output for disassembled >>>>> BPF programs. >>>>> >>>>> [root@quaco ~]# uname -a >>>>> Linux quaco 5.1.0-rc5+ #4 SMP Wed Apr 17 19:33:00 -03 2019 x86_64 x86_64 x86_64 GNU/Linux >>>>> >>>>> Running perf trace with: >>>>> >>>>> [root@quaco ~]# cat ~/.perfconfig >>>>> [llvm] >>>>> dump-obj = true >>>>> clang-opt = -g >>>>> [trace] >>>>> add_events = /home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.c >>>>> #add_events = /wb/augmented_raw_syscalls.o >>>>> show_zeros = yes >>>>> show_duration = no >>>>> no_inherit = yes >>>>> show_timestamp = no >>>>> show_arg_names = no >>>>> args_alignment = 40 >>>>> show_prefix = yes >>>>> >>>>> [annotate] >>>>> >>>>> hide_src_code = false >>>>> [root@quaco ~]# >>>>> >>>>> clang/llvm 9: >>>>> >>>>> [root@quaco perf]# eu-readelf -winfo /home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.o | grep compile_unit >>>>> [ b] compile_unit abbrev: 1 >>>>> [root@quaco perf]# eu-readelf -winfo /home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.o | grep compile_unit -B1 -A10 >>>>> Version: 4, Abbreviation section offset: 0, Address size: 8, Offset size: 4 >>>>> [ b] compile_unit abbrev: 1 >>>>> producer (strp) "clang version 9.0.0 (https://urldefense.proofpoint.com/v2/url?u=https-3A__git.llvm.org_git_clang.git_&d=DwIDaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=dR8692q0_uaizy0jkrBJQM5k2hfm4CiFxYT8KaysFrg&m=8ymLhVNE6vfRn0KCyz7xlPkm7i1H4ttMrMYmeHEP1Cg&s=3_k_sPaZo7mwxQMoYhgd4hBUmCoS3TboKrw15yws6mA&e= 
7906282d3afec5dfdc2b27943fd6c0309086c507) (https://urldefense.proofpoint.com/v2/url?u=https-3A__git.llvm.org_git_llvm.git_&d=DwIDaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=dR8692q0_uaizy0jkrBJQM5k2hfm4CiFxYT8KaysFrg&m=8ymLhVNE6vfRn0KCyz7xlPkm7i1H4ttMrMYmeHEP1Cg&s=jutQ4xaAa5FePXthz5gHAgdx6kVlrPI64pnpqWWt59Q&e= a1b5de1ff8ae8bc79dc8e86e1f82565229bd0500)" >>>>> language (data2) C99 (12) >>>>> name (strp) "/home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.c" >>>>> stmt_list (sec_offset) 0 >>>>> comp_dir (strp) "/root" >>>>> low_pc (addr) raw_syscalls:sys_enter+000000000000000000 <sys_enter> >>>>> ranges (sec_offset) range list [ 0] >>>>> [ 2a] variable abbrev: 2 >>>>> name (strp) "__augmented_syscalls__" >>>>> type (ref4) [ 3f] >>>>> [root@quaco perf]# >>>>> >>>>> I then leave this running doing system wide syscall tracing using that >>>>> augmented_raw_syscalls.c eBPF program hooked into the syscall enter and >>>>> exit tracepoints, so that I have something to profile and further test >>>>> the BPF support in perf top/annotate/report (slides for LSFMM anyone? >>>>> :-)) then I get: >>>>> >>>>> # perf trace -a sleep 100h >>>> >>>> wow, I never tried to run sleep with "100h". ;-) >>> >>> This is me dreaming of the long night sleeps I don't have for about 3.3 >>> years by now ;-) >>> >>>>> <SNIP> >>>>> tmux: server/11597 ioctl(8</dev/ptmx>, TIOCLINUX, 0x7ffe1b49abf8) = 0 >>>>> tmux: server/11597 readv(8, 0x7ffe1b49ac00, 1) = 101 >>>>> tmux: server/11597 ioctl(8</dev/ptmx>, TIOCSPGRP, 0x7ffe1b49a9f4) = 0 >>>>> tmux: server/11597 openat(AT_FDCWD, "/proc/30339/cmdline", O_RDONLY) = 5 >>>>> tmux: server/11597 fstat(5, 0x7ffe1b49a8e0) = 0 >>>>> tmux: server/11597 read(5, 0x5571aaeb2c10, 1024) = 10 >>>>> tmux: server/11597 close(5) = 0 >>>>> gnome-shell/19195 ... [continued]: ioctl()) = 0 >>>>> tmux: server/11597 poll(0x5571aad38e50, 6, 11997) ... 
>>>>> gnome-shell/19195 ioctl(11</dev/dri/card0>, DRM_MODE_RMFB, 0x7ffc2129c25c) = 0 >>>>> gnome-shell/19195 recvmsg(39<socket:[18671390]>, 0x7ffc2129c440, NONE) = -1 EAGAIN (Resource temporarily unavailable) >>>>> <SNIP> >>>>> >>>>> With that running, I do: >>>>> >>>>> [root@quaco perf]# perf record -a sleep 2s >>>>> [ perf record: Woken up 1 times to write data ] >>>>> [ perf record: Captured and wrote 1.647 MB perf.data (3698 samples) ] >>>>> [root@quaco perf]# >>>>> >>>>> I.e. collect 2 seconds worth of CPU cycles samples, system wide. >>>>> >>>>> [root@quaco perf]# perf record -a sleep 2s >>>>> [ perf record: Woken up 1 times to write data ] >>>>> [ perf record: Captured and wrote 2.328 MB perf.data (17764 samples) ] >>>>> [root@quaco perf]# perf report --header-only perf.data | grep -i b[pt]f >>>>> # event : name = cycles, , id = { 266608, 266609, 266610, 266611, 266612, 266613, 266614, 266615 }, size = 112, { sample_period, sample_freq } = 4000, sample_type = IP|TID|TIME|CPU|PERIOD, read_format = ID, disabled = 1, inherit = 1, mmap = 1, comm = 1, freq = 1, task = 1, precise_ip = 3, sample_id_all = 1, exclude_guest = 1, mmap2 = 1, comm_exec = 1, ksymbol = 1, bpf_event = 1 >>>>> # bpf_prog_info 13: bpf_prog_7be49e3934a125ba addr 0xffffffffc0437f01 size 229 >>>>> # bpf_prog_info 14: bpf_prog_2a142ef67aaad174 addr 0xffffffffc04393cd size 229 >>>>> # bpf_prog_info 17: bpf_prog_7be49e3934a125ba addr 0xffffffffc052ca3c size 229 >>>>> # bpf_prog_info 18: bpf_prog_2a142ef67aaad174 addr 0xffffffffc0531218 size 229 >>>>> # bpf_prog_info 21: bpf_prog_7be49e3934a125ba addr 0xffffffffc0cb8010 size 229 >>>>> # bpf_prog_info 22: bpf_prog_2a142ef67aaad174 addr 0xffffffffc0cba8be size 229 >>>>> # bpf_prog_info 65: bpf_prog_7be49e3934a125ba addr 0xffffffffc03f6bc2 size 229 >>>>> # bpf_prog_info 66: bpf_prog_2a142ef67aaad174 addr 0xffffffffc03f87f6 size 229 >>>>> # bpf_prog_info 125: bpf_prog_819967866022f1e1_sys_enter addr 0xffffffffc0c6e5a1 size 381 >>>>> # bpf_prog_info 126: 
bpf_prog_c1bd85c092d6e4aa_sys_exit addr 0xffffffffc0c953a8 size 191 >>>>> [root@quaco perf]# >>>>> >>>>> So everything seems in order, but then: >>>>> >>>>> [root@quaco perf]# perf report | grep bpf_prog >>>>> 0.14% perf bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit >>>>> 0.08% gnome-terminal- bpf_prog_819967866022f1e1_sys_enter [k] bpf_prog_819967866022f1e1_sys_enter >>>>> 0.06% perf bpf_prog_819967866022f1e1_sys_enter [k] bpf_prog_819967866022f1e1_sys_enter >>>>> 0.04% gnome-terminal- bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit >>>>> 0.01% alsa-sink-ALC25 bpf_prog_819967866022f1e1_sys_enter [k] bpf_prog_819967866022f1e1_sys_enter >>>>> 0.01% Web Content bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit >>>>> 0.01% lvmetad bpf_prog_819967866022f1e1_sys_enter [k] bpf_prog_819967866022f1e1_sys_enter >>>>> 0.00% alsa-sink-ALC25 bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit >>>>> 0.00% sssd_kcm bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit >>>>> 0.00% gnome-shell bpf_prog_819967866022f1e1_sys_enter [k] bpf_prog_819967866022f1e1_sys_enter >>>>> 0.00% firefox bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit >>>>> 0.00% weechat bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit >>>>> 0.00% WebExtensions bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit >>>>> 0.00% Timer bpf_prog_819967866022f1e1_sys_enter [k] bpf_prog_819967866022f1e1_sys_enter >>>>> 0.00% Web Content bpf_prog_819967866022f1e1_sys_enter [k] bpf_prog_819967866022f1e1_sys_enter >>>>> [root@quaco perf]# >>>>> >>>>> Ok, lets check that bpf_prog_819967866022f1e1_sys_enter symbol, 38% of >>>>> the samples at just at that %rbp, more in a previous run... 
>>>>> >>>>> [root@quaco perf]# perf annotate --stdio2 bpf_prog_819967866022f1e1_sys_enter >>>>> Samples: 25 of event 'cycles', 4000 Hz, Event count (approx.): 7248318, [percent: local period] >>>>> bpf_prog_819967866022f1e1_sys_enter() bpf_prog_819967866022f1e1_sys_enter >>>>> 38.00 push %rbp >>>>> >>>>> 2.72 mov %rsp,%rbp >>>>> sub $0x170,%rsp >>>>> 13.19 sub $0x28,%rbp >>>>> 3.30 mov %rbx,0x0(%rbp) >>>>> 20.33 mov %r13,0x8(%rbp) >>>>> mov %r14,0x10(%rbp) >>>>> mov %r15,0x18(%rbp) >>>>> xor %eax,%eax >>>>> mov %rax,0x20(%rbp) >>>>> mov %rdi,%rbx >>>>> 13.34 → callq *ffffffffd359487f >>>>> mov %eax,-0x148(%rbp) >>>>> >>>>> So I ended up trying with a event group to check what could be that: >>>>> >>>>> [root@quaco perf]# perf record -a -e '{cycles,instructions,cache-misses}' sleep 2s >>>>> [ perf record: Woken up 18 times to write data ] >>>>> [ perf record: Captured and wrote 6.619 MB perf.data (91964 samples) ] >>>>> [root@quaco perf]# perf annotate --stdio2 bpf_prog_819967866022f1e1_sys_enter >>>>> Samples: 314 of events 'anon group { cycles, instructions, cache-misses }', 4000 Hz, Event count (approx.): 118106063, [percent: local period] >>>>> bpf_prog_819967866022f1e1_sys_enter() bpf_prog_819967866022f1e1_sys_enter >>>>> 20.88 24.58 49.11 push %rbp >>>>> >>>>> 47.37 31.62 26.71 mov %rsp,%rbp >>>>> 1.16 0.00 0.00 sub $0x170,%rsp >>>>> sub $0x28,%rbp >>>>> 0.58 2.48 0.00 mov %rbx,0x0(%rbp) >>>>> 7.10 5.92 0.00 mov %r13,0x8(%rbp) >>>>> 0.51 0.96 7.96 mov %r14,0x10(%rbp) >>>>> 0.00 1.10 0.00 mov %r15,0x18(%rbp) >>>>> 0.58 0.00 0.00 xor %eax,%eax >>>>> mov %rax,0x20(%rbp) >>>>> 2.39 0.85 0.00 mov %rdi,%rbx >>>>> 0.58 1.96 0.00 → callq *ffffffffd359487f >>>>> mov %eax,-0x148(%rbp) >>>>> 2.88 0.89 3.50 mov %rbp,%rsi >>>>> add $0xfffffffffffffeb8,%rsi >>>>> >>>>> So seems ok? I.e. that big cost at right the start of the BPF program is >>>>> just cache pressure, probably, right? 
>>>>> >>>>> >>>>> [root@quaco perf]# perf record -a -e '{L1-icache-load-misses,L1-dcache-load-misses}' sleep 2s >>>>> [ perf record: Woken up 14 times to write data ] >>>>> [ perf record: Captured and wrote 5.557 MB perf.data (72026 samples) ] >>>>> [root@quaco perf]# perf annotate --stdio2 bpf_prog_819967866022f1e1_sys_enter >>>>> Samples: 376 of events 'anon group { L1-icache-load-misses, L1-dcache-load-misses }', 4000 Hz, Event count (approx.): 3715276, [percent: local p> >>>>> bpf_prog_819967866022f1e1_sys_enter() bpf_prog_819967866022f1e1_sys_enter >>>>> 1.54 63.01 push %rbp >>>>> >>>>> 53.09 28.13 mov %rsp,%rbp >>>>> 0.58 0.00 sub $0x170,%rsp >>>>> 0.00 0.55 sub $0x28,%rbp >>>>> 6.96 0.00 mov %rbx,0x0(%rbp) >>>>> 10.25 0.00 mov %r13,0x8(%rbp) >>>>> 0.73 0.00 mov %r14,0x10(%rbp) >>>>> 3.04 0.00 mov %r15,0x18(%rbp) >>>>> >>>>> Looks like. furthermore: >>>>> >>>>> [root@quaco perf]# perf record -a -e '{L1-icache-load-misses,L1-dcache-load-misses,dTLB-load-misses,iTLB-load-misses}' sleep 2s >>>>> [ perf record: Woken up 21 times to write data ] >>>>> [ perf record: Captured and wrote 7.708 MB perf.data (112323 samples) ] >>>>> [root@quaco perf]# perf annotate --stdio2 bpf_prog_819967866022f1e1_sys_enter >>>>> Samples: 391 of events 'anon group { L1-icache-load-misses, L1-dcache-load-misses, dTLB-load-misses, iTLB-load-misses }', 4000 Hz, Event count (> >>>>> bpf_prog_819967866022f1e1_sys_enter() bpf_prog_819967866022f1e1_sys_enter >>>>> 1.98 64.42 100.00 13.66 push %rbp >>>>> >>>>> 50.79 23.91 0.00 82.23 mov %rsp,%rbp >>>>> sub $0x170,%rsp >>>>> 0.24 0.00 0.00 0.00 sub $0x28,%rbp >>>>> 7.73 0.00 0.00 0.00 mov %rbx,0x0(%rbp) >>>>> 11.13 0.44 0.00 0.00 mov %r13,0x8(%rbp) >>>>> mov %r14,0x10(%rbp) >>>>> 1.64 0.46 0.00 0.00 mov %r15,0x18(%rbp) >>>>> >>>>> Do these numbers look sane to you guys? >>>> >>>> From my experiments, mov is often the top time consumer. But >>>> >>>> mov %rsp,%rbp >>>> >>>> is a little weird. It does not access cache, right? 
>>> >>> Right, but it's like the code for bpf_prog_819967866022f1e1_sys_enter is >>> being sucked into L1 icache at that point, no? >> >> do you need to enable pebs (precise event-based sampling)? > > So, when you don't specify an event, 'perf record' will use cycles:P, > which is the highest level of (PEBS, on x86) precision available. But > then that has some limitations, I'll try to check this later, to see if > cache events can be counted that way, how many events, etc. > > But I think that knowing that is on the first or second instruction is > enough, i.e. it seems that somehow the eBPF program is not staying in > the CPU cache all the time, which ends up making transferring control > to them cause cache misses? > > That was the reason for my message, to validate if this is the case or > if the BPF perf annotation or sample address resolution are buggy. > >>>>> Song, I also noticed that source code is not being intermixed for the >>>>> --stdio annotation, while it works, to some degree, for '--tui', i.e. >>>>> when you do 'perf top', press '/bpf' to show just symbols with that >>>>> substring and then press enter or 'A' to annotate, we can see the >>>>> original C source code for the BPF program, but it is mangling the >>>>> screen sometimes, I need to try and fix, please take a look if you have >>>>> the time. >>>> >>>> I will take a look at this, most likely after LSFMM. >>> >>> mmkay >>> >>>>> Also things like the callq targets need some work to tell what function >>>>> is that, which as I said isn't appearing on the --stdio2 output, but >>>>> appears on the --tui, i.e. we need to resolve that symbol to check how >>>>> to map back to a BPF helper or any other callq target. >>> >>>> I am aware of missing callq symbol. I was thinking they are less critical >>>> with source code. >>> >>> Some people dislike intermixing C and assembly, Linus is one, IIRC, so >>> if we can augment the assembly resolving those addresses to the helper, >>> that would be best. 
>>> >>>>> Also, what about those 'je 0', i.e. the target is being misinterpreted >>>>> or is this some BPF construct I should've know about? :) >>>> >>>> I didn't notice the je 0 before. Let me try reproduce it. >> >> Did you see these `je 0` when you use bpftool dump jited instructions? >> With dumping source annotated jited/xlated instructions, you can get >> rough idea where this `je 0` is coming from. Want to give a try? > > So, here is a fresh record: > > [root@quaco ~]# perf annotate --stdio2 bpf_prog_819967866022f1e1_sys_enter > Samples: 40 of event 'cycles', 4000 Hz, Event count (approx.): 12152834, [percent: local period] > bpf_prog_819967866022f1e1_sys_enter() bpf_prog_819967866022f1e1_sys_enter > 69.13 push %rbp > > mov %rsp,%rbp > sub $0x170,%rsp > sub $0x28,%rbp > 19.07 mov %rbx,0x0(%rbp) > 2.08 mov %r13,0x8(%rbp) > mov %r14,0x10(%rbp) > mov %r15,0x18(%rbp) > xor %eax,%eax > mov %rax,0x20(%rbp) > mov %rdi,%rbx > → callq *ffffffffd356dda0 > 3.03 mov %eax,-0x148(%rbp) > mov %rbp,%rsi > add $0xfffffffffffffeb8,%rsi > movabs $0xffff9d5b7133a400,%rdi > > 1.17 → callq *ffffffffd356f050 > cmp $0x0,%rax > → je 0 > add $0x38,%rax > xor %r13d,%r13d > cmp $0x0,%rax > → jne 0 > mov %rbp,%rdi > add $0xfffffffffffffeb8,%rdi > mov $0x40,%esi > mov %rbx,%rdx > → callq *ffffffffd354e490 > mov %rbp,%rsi > add $0xfffffffffffffec0,%rsi > movabs $0xffff9d5b6fe1e000,%rdi > > [root@quaco ~]# > > Plus the bpf prog dump jit, we're probably just parsing 32-bits, the > zeroed part... 
> > [root@quaco ~]# bpftool prog | tail -6 > 129: tracepoint name sys_enter tag 819967866022f1e1 gpl > loaded_at 2019-04-26T19:31:10-0300 uid 0 > xlated 528B jited 381B memlock 4096B map_ids 119,118,117 > 130: tracepoint name sys_exit tag c1bd85c092d6e4aa gpl > loaded_at 2019-04-26T19:31:10-0300 uid 0 > xlated 256B jited 191B memlock 4096B map_ids 119,118 > [root@quaco ~]# bpftool prog dump jit id 129 > 0xffffffffc0c95080: > 0: push %rbp > 1: mov %rsp,%rbp > 4: sub $0x170,%rsp > b: sub $0x28,%rbp > f: mov %rbx,0x0(%rbp) > 13: mov %r13,0x8(%rbp) > 17: mov %r14,0x10(%rbp) > 1b: mov %r15,0x18(%rbp) > 1f: xor %eax,%eax > 21: mov %rax,0x20(%rbp) > 25: mov %rdi,%rbx > 28: callq 0xffffffffd356dda0 > 2d: mov %eax,-0x148(%rbp) > 33: mov %rbp,%rsi > 36: add $0xfffffffffffffeb8,%rsi > 3d: movabs $0xffff9d5b7133a400,%rdi > 47: callq 0xffffffffd356f050 > 4c: cmp $0x0,%rax > 50: je 0x0000000000000056 > 52: add $0x38,%rax > 56: xor %r13d,%r13d > 59: cmp $0x0,%rax > 5d: jne 0x0000000000000164 > 63: mov %rbp,%rdi > 66: add $0xfffffffffffffeb8,%rdi > 6d: mov $0x40,%esi > 72: mov %rbx,%rdx > 75: callq 0xffffffffd354e490 > 7a: mov %rbp,%rsi > 7d: add $0xfffffffffffffec0,%rsi > 84: movabs $0xffff9d5b6fe1e000,%rdi > 8e: add $0xd0,%rdi > 95: mov 0x0(%rsi),%eax > 98: cmp $0x200,%rax > 9f: jae 0x00000000000000aa > a1: shl $0x3,%rax > a5: add %rdi,%rax > a8: jmp 0x00000000000000ac > aa: xor %eax,%eax > ac: cmp $0x0,%rax > b0: je 0x0000000000000164 > b6: movzbq 0x0(%rax),%rdi > bb: cmp $0x0,%rdi > bf: je 0x0000000000000164 > c5: mov $0x40,%r8d > cb: mov -0x140(%rbp),%rdi > d2: cmp $0x2,%rdi > d6: je 0x00000000000000e7 > d8: cmp $0x101,%rdi > df: je 0x00000000000000ed > e1: cmp $0x15,%rdi > e5: jne 0x0000000000000140 > e7: mov 0x10(%rbx),%rdx > eb: jmp 0x00000000000000f1 > ed: mov 0x18(%rbx),%rdx > f1: cmp $0x0,%rdx > f5: je 0x0000000000000140 > f7: xor %edi,%edi > f9: mov %edi,-0x104(%rbp) > ff: mov %rbp,%rdi > 102: add $0xffffffffffffff00,%rdi > 109: mov $0x100,%esi > 10e: callq 
0xffffffffd354e520 > 113: mov $0x148,%r8d > 119: mov %eax,-0x108(%rbp) > 11f: mov %rax,%rdi > 122: shl $0x20,%rdi > 126: shr $0x20,%rdi > 12a: cmp $0xff,%rdi > 131: ja 0x0000000000000140 > 133: add $0x48,%rax > 137: and $0xff,%rax > 13d: mov %rax,%r8 > 140: mov %rbp,%rcx > 143: add $0xfffffffffffffeb8,%rcx > 14a: mov %rbx,%rdi > 14d: movabs $0xffff9d58126aac00,%rsi > 157: mov $0xffffffff,%edx > 15c: callq 0xffffffffd354eb60 > 161: mov %rax,%r13 > 164: mov %r13,%rax > 167: mov 0x0(%rbp),%rbx > 16b: mov 0x8(%rbp),%r13 > 16f: mov 0x10(%rbp),%r14 > 173: mov 0x18(%rbp),%r15 > 177: add $0x28,%rbp > 17b: leaveq > 17c: retq > > [root@quaco ~]# > Hmm... it does look like some bug in my code... Will take a close look. Thanks Arnaldo and Yonghong! Song > >>> >>> Ok! >>> >>>> Thanks, >>>> Song >>>> >>>>> >>>>> 2.68 0.00 0.00 0.00 mov %rdi,%rbx >>>>> → callq *ffffffffd359487f >>>>> mov %eax,-0x148(%rbp) >>>>> 9.61 0.00 0.00 0.00 mov %rbp,%rsi >>>>> add $0xfffffffffffffeb8,%rsi >>>>> movabs $0xffff9d556c776c00,%rdi >>>>> >>>>> → callq *ffffffffd3595b2f >>>>> cmp $0x0,%rax >>>>> → je 0 >>>>> 0.00 1.25 0.00 0.00 add $0x38,%rax >>>>> 0.80 0.21 0.00 0.00 xor %r13d,%r13d >>>>> cmp $0x0,%rax >>>>> → jne 0 >>>>> mov %rbp,%rdi >>>>> add $0xfffffffffffffeb8,%rdi >>>>> >>>>> - Arnaldo >>>> >>> > > -- > > - Arnaldo