Hello,
I've run into an optimization issue, please help me understand the why's
and maybe to resolve it.
The reduced test case is at the end. It encodes data into a buffer in a
loop with variable length encoding (not a working real encoding). For
some reason, the write ptr is not kept in a register, but loaded/stored
when used/updated. There is a potential function call in the loop, but
there are __builtin_expect hints, so I think it would be possible to use
a register for the ptr and store just before the call, and load it back
right after the call. This would speed up the common code path: less
code, less loads and stores.I measured around 20-30% more runtime,
compared to a version where a pointer goes in and the updated ptr is
returned. However, passing/returning the ptr has other issues, esp for a
decoder, that would return the decoded value normally, not the ptr.
The platform is amd64 debian jessie, g++ version is 4.8.2, tried with
4.9.0, 4.7 and 4.6 the same code generated with little variations.
To compile:
g++ -c 20140325-reg_vs_mem.cpp -g -std=c++0x -Wall -Wextra -Werror
-Wundef -Wshadow -O3
g++ -o 20140325-reg_vs_mem 20140325-reg_vs_mem.o -g
Dump of assembler code for function encode_node_list(OutBuf&, Node*):
// rdi/rbp points to the out buffer, rsi/rbx to the node
0x00000000004005e0 <+0>: push %rbp
0x00000000004005e1 <+1>: test %rsi,%rsi
0x00000000004005e4 <+4>: mov %rdi,%rbp
0x00000000004005e7 <+7>: push %rbx
0x00000000004005e8 <+8>: mov %rsi,%rbx
0x00000000004005eb <+11>: je 0x400611
<encode_node_list(OutBuf&, Node*)+49>
// why not load cur before the loop?
0x00000000004005ed <+13>: nopl (%rax)
// load cur. this is the start of the loop, so will be loaded in each iter
0x00000000004005f0 <+16>: mov 0x0(%rbp),%rax
// load the data from the node
0x00000000004005f4 <+20>: mov (%rbx),%esi
// calc new cur value
0x00000000004005f6 <+22>: lea 0x4(%rax),%rdx
0x00000000004005fa <+26>: cmp $0xff,%esi
// and store it back, even though the correct new value is not known at
this point
0x0000000000400600 <+32>: mov %rdx,0x0(%rbp)
0x0000000000400604 <+36>: jg 0x400614
<encode_node_list(OutBuf&, Node*)+52>
// the most likely path: store the data, fetch the next node and loop
0x0000000000400606 <+38>: mov %esi,(%rax)
0x0000000000400608 <+40>: mov 0x8(%rbx),%rbx
0x000000000040060c <+44>: test %rbx,%rbx
0x000000000040060f <+47>: jne 0x4005f0
<encode_node_list(OutBuf&, Node*)+16>
// cur could be stored here, if loaded just once before the loop: could
be in %rax the whole time in the likely case
0x0000000000400611 <+49>: pop %rbx
0x0000000000400612 <+50>: pop %rbp
0x0000000000400613 <+51>: retq
// calc new cur value
0x0000000000400614 <+52>: lea 0x8(%rax),%rdx
0x0000000000400618 <+56>: cmp $0xffff,%esi
0x000000000040061e <+62>: movl $0x0,(%rax)
// and store it, again. prev store was at +32
0x0000000000400624 <+68>: mov %rdx,0x0(%rbp)
0x0000000000400628 <+72>: jg 0x40062f
<encode_node_list(OutBuf&, Node*)+79>
0x000000000040062a <+74>: mov %esi,0x4(%rax)
0x000000000040062d <+77>: jmp 0x400608
<encode_node_list(OutBuf&, Node*)+40>
0x000000000040062f <+79>: movl $0x0,0x4(%rax)
0x0000000000400636 <+86>: mov %rbp,%rdi
// why not store just here?
0x0000000000400639 <+89>: callq 0x400570
<encode_noinline(OutBuf&, int)>
// and load it back here?
0x000000000040063e <+94>: jmp 0x400608
<encode_node_list(OutBuf&, Node*)+40>
Is it an optimization issue in the compiler, or is there a way to change
the source to generate better code, without messing up the interfaces?
Thanks, Peter
----8<----8<----
#define LIKELY(x_) __builtin_expect(!!(x_), 1)
#define UNLIKELY(x_) __builtin_expect(!!(x_), 0)
typedef unsigned int uint;
struct OutBuf
{
uint* cur;
uint* end;
uint* beg;
};
__attribute__((noinline))
void encode_noinline(OutBuf& ob, int v)
{
if (LIKELY(v < 0x1000000)) {
*ob.cur++ = v;
return;
}
*ob.cur++ = 0;
*ob.cur++ = v;
}
void encode(OutBuf& ob, int v)
{
if (LIKELY(v < 0x100)) {
*ob.cur++ = v;
return;
}
*ob.cur++ = 0;
if (LIKELY(v < 0x10000)) {
*ob.cur++ = v;
return;
}
*ob.cur++ = 0;
encode_noinline(ob, v);
}
struct Node
{
int data;
Node* next;
};
void encode_node_list(OutBuf& ob, Node* n)
{
while (n) {
encode(ob, n->data);
n = n->next;
}
}
int main()
{
}