Hello,
recently, when writing a serializer for integers using variable length
encoding, I noticed several inefficiencies in the generated code, the
most severe being the reloading of the target ptr from the reference
after each store. This lead to reading about all the aliasing rules,
especially about char* aliasing _everything_. Now I understand why the
compiler generated the code the way it did, but I'd like to know whether
there is a way to tell it that 'I know what I'm doing and even though
it's a char*, it doesn't alias anything' ?
This has a severe inpact on the generated code. If the dst ptr is passed
via a reference (to be able to return the new value after a variable
number of bytes were written), the ptr is reloaded after each store.
This means the ptr is aliasing its own reference, which, for me sounds
like stupid, but rules are rules. If I pass a stream object, with the
dst and the end ptrs, then both will be reloaded after each store,
moreover, the updates to the dst will not be amortized to a single
update at the end.
Attached a little test case. Compiling with 4.8/4.9, -O3, the same
functions for uint8 and uint16 are quite different, because for uint8,
the 'unsigned char* aliases everything' rule kicks in. Adding
__restrict__ to the ptr type doesn't help.
The issues in detail:
#1 dead stores are not eliminated:
(gdb) disassemble dead_store3_u8
Dump of assembler code for function dead_store3_u8(unsigned char*&):
0x0000000000400710 <+0>: mov (%rdi),%rax
0x0000000000400713 <+3>: movb $0x0,(%rax)
0x0000000000400716 <+6>: mov (%rdi),%rax
0x0000000000400719 <+9>: movb $0x1,(%rax)
0x000000000040071c <+12>: mov (%rdi),%rax
0x000000000040071f <+15>: movb $0x2,(%rax)
0x0000000000400722 <+18>: mov (%rdi),%rax
0x0000000000400725 <+21>: movb $0x3,(%rax)
0x0000000000400728 <+24>: retq
vs the 'short' version: only the last store is generated:
(gdb) disassemble dead_store3_u16
Dump of assembler code for function dead_store3_u16(unsigned short*&):
0x00000000004007b0 <+0>: mov (%rdi),%rax
0x00000000004007b3 <+3>: mov $0x3,%edx
0x00000000004007b8 <+8>: mov %dx,(%rax)
0x00000000004007bb <+11>: retq
#2 reloading the variables
(gdb) disassemble write_v1v2_incstrm_u8
Dump of assembler code for function
write_v1v2_incstrm_u8(OutputStream<unsigned char>&, unsigned char,
unsigned char):
0x0000000000400590 <+0>: mov 0x8(%rdi),%rax
0x0000000000400594 <+4>: lea 0x1(%rax),%rcx
0x0000000000400598 <+8>: mov %rcx,0x8(%rdi)
0x000000000040059c <+12>: mov %sil,(%rax)
0x000000000040059f <+15>: mov 0x8(%rdi),%rax
0x00000000004005a3 <+19>: lea 0x1(%rax),%rcx
0x00000000004005a7 <+23>: mov %rcx,0x8(%rdi)
0x00000000004005ab <+27>: mov %dl,(%rax)
0x00000000004005ad <+29>: retq
with an end ptr check, the effect is more severe, as both members are
reloaded, and cur is stored multiple times (+31, +48):
(gdb) disassemble write_v1v2_chkincstrm_u8
Dump of assembler code for function
write_v1v2_chkincstrm_u8(OutputStream<unsigned char>&, unsigned char,
unsigned char):
0x00000000004005b0 <+0>: mov 0x10(%rdi),%r8
0x00000000004005b4 <+4>: mov 0x8(%rdi),%rcx
0x00000000004005b8 <+8>: cmp %rcx,%r8
0x00000000004005bb <+11>: jbe 0x4005c8
<write_v1v2_chkincstrm_u8(OutputStream<unsigned char>&, unsigned char,
unsigned char)+24>
0x00000000004005bd <+13>: mov %sil,(%rcx)
0x00000000004005c0 <+16>: mov 0x8(%rdi),%rcx
0x00000000004005c4 <+20>: mov 0x10(%rdi),%r8
0x00000000004005c8 <+24>: lea 0x1(%rcx),%rax
0x00000000004005cc <+28>: cmp %r8,%rax
0x00000000004005cf <+31>: mov %rax,0x8(%rdi)
0x00000000004005d3 <+35>: jae 0x4005dc
<write_v1v2_chkincstrm_u8(OutputStream<unsigned char>&, unsigned char,
unsigned char)+44>
0x00000000004005d5 <+37>: mov %dl,0x1(%rcx)
0x00000000004005d8 <+40>: mov 0x8(%rdi),%rax
0x00000000004005dc <+44>: add $0x1,%rax
0x00000000004005e0 <+48>: mov %rax,0x8(%rdi)
0x00000000004005e4 <+52>: retq
vs the 'short' version:
(gdb) disassemble write_v1v2_incstrm_u16
Dump of assembler code for function
write_v1v2_incstrm_u16(OutputStream<unsigned short>&, unsigned short,
unsigned short):
0x0000000000400650 <+0>: mov 0x8(%rdi),%rax
0x0000000000400654 <+4>: lea 0x4(%rax),%rcx
0x0000000000400658 <+8>: mov %si,(%rax)
0x000000000040065b <+11>: mov %rcx,0x8(%rdi)
0x000000000040065f <+15>: mov %dx,0x2(%rax)
0x0000000000400663 <+19>: retq
cur, end loaded only once, cur written only once, aggregating the two
++'s. this is fine.
(gdb) disassemble write_v1v2_chkincstrm_u16
Dump of assembler code for function
write_v1v2_chkincstrm_u16(OutputStream<unsigned short>&, unsigned short,
unsigned short):
0x0000000000400670 <+0>: mov 0x10(%rdi),%r8
0x0000000000400674 <+4>: mov 0x8(%rdi),%rax
0x0000000000400678 <+8>: cmp %rax,%r8
0x000000000040067b <+11>: jbe 0x400680
<write_v1v2_chkincstrm_u16(OutputStream<unsigned short>&, unsigned
short, unsigned short)+16>
0x000000000040067d <+13>: mov %si,(%rax)
0x0000000000400680 <+16>: lea 0x2(%rax),%rcx
0x0000000000400684 <+20>: cmp %rcx,%r8
0x0000000000400687 <+23>: mov %rcx,0x8(%rdi)
0x000000000040068b <+27>: jbe 0x400691
<write_v1v2_chkincstrm_u16(OutputStream<unsigned short>&, unsigned
short, unsigned short)+33>
0x000000000040068d <+29>: mov %dx,0x2(%rax)
0x0000000000400691 <+33>: add $0x4,%rax
0x0000000000400695 <+37>: mov %rax,0x8(%rdi)
0x0000000000400699 <+41>: retq
No multiple loads, but cur is stored twice, at +23 and +37. Is this a bug?
My major question is, given the char* aliasing rules, how can one write
efficient code (serializer, compressior, decompressor, etc) that writes
bytes, without all the artifacts listed above? In my varint encoder, I
dodged the multiple reloads by passing the ptrs as args, and using
template metaprogramming to calculate the offsets and the increment.
This worked for my specific case, but it's quite some effort and can't
be applied everywhere. Is there a generic solution? An attribute? A builtin?
Thanks, Peter
----8<----8<----8<----
template<typename T>
struct OutputStream
{
void Wr(T v)
{
*cur = v;
}
void WrInc(T v)
{
*cur++ = v;
}
void WrChk(T v)
{
if (cur < end) {
*cur = v;
}
}
void WrChkInc(T v)
{
if (cur < end) {
*cur = v;
}
++cur;
}
private:
T* beg;
T* cur;
T* end;
};
template<typename T>
struct InputStream
{
T Rd()
{
return *cur;
}
T RdInc()
{
return *cur++;
}
T RdChk()
{
return cur < end ? *cur : 0;
}
T RdChkInc()
{
T r = cur < end ? *cur : 0;
++cur;
return r;
}
private:
const T* beg;
const T* cur;
const T* end;
};
///////////////////////////////////////////////////////////////////////////////
template<typename T>
inline
void write_v_ptr(T* p, T v)
{
*p = v;
}
template<typename T>
inline
T read_v_ptr(const T* p)
{
return *p;
}
///////////////////////////////////////////////////////////////////////////////
template<typename T>
void write_v1v2_incref(T*& p, T v1, T v2)
{
write_v_ptr(p, v1);
++p;
write_v_ptr(p, v2);
++p;
}
template<typename T>
T read_v1v2_incref(const T*& p)
{
T v1 = read_v_ptr(p);
++p;
T v2 = read_v_ptr(p);
++p;
return v1 + v2;
}
///////////////////////////////////////////////////////////////////////////////
template<typename T>
void write_v1v2_incstrm(OutputStream<T>& s, T v1, T v2)
{
s.WrInc(v1);
s.WrInc(v2);
}
template<typename T>
void write_v1v2_chkincstrm(OutputStream<T>& s, T v1, T v2)
{
s.WrChkInc(v1);
s.WrChkInc(v2);
}
template<typename T>
T read_v1v2_incstrm(InputStream<T>& s)
{
T v1 = s.RdInc();
T v2 = s.RdInc();
return v1 + v2;
}
template<typename T>
T read_v1v2_chkincstrm(InputStream<T>& s)
{
T v1 = s.RdChkInc();
T v2 = s.RdChkInc();
return v1 + v2;
}
///////////////////////////////////////////////////////////////////////////////
template<typename T>
void dead_store(T* p)
{
*p = 0;
*p = 1;
*p = 2;
*p = 3;
}
template<typename T>
void dead_store2(T** p)
{
**p = 0;
**p = 1;
**p = 2;
**p = 3;
}
template<typename T>
void dead_store3(T*& __restrict__ p)
{
*p = 0;
*p = 1;
*p = 2;
*p = 3;
}
template<typename T>
void dead_store4(OutputStream<T>& s)
{
s.Wr(0);
s.Wr(1);
s.Wr(2);
s.Wr(3);
}
template<typename T>
void dead_store5(OutputStream<T>& s)
{
s.WrChk(0);
s.WrChk(1);
s.WrChk(2);
s.WrChk(3);
}
///////////////////////////////////////////////////////////////////////////////
void write_v1v2_incref_u8(unsigned char*& p, unsigned char v1, unsigned
char v2)
{
write_v1v2_incref<unsigned char>(p, v1, v2);
}
void write_v1v2_incstrm_u8(OutputStream<unsigned char>& s, unsigned char
v1, unsigned char v2)
{
write_v1v2_incstrm<unsigned char>(s, v1, v2);
}
void write_v1v2_chkincstrm_u8(OutputStream<unsigned char>& s, unsigned
char v1, unsigned char v2)
{
write_v1v2_chkincstrm<unsigned char>(s, v1, v2);
}
unsigned char read_v1v2_incref_u8(const unsigned char*& p)
{
return read_v1v2_incref<unsigned char>(p);
}
unsigned char read_v1v2_incstrm_u8(InputStream<unsigned char>& s)
{
return read_v1v2_incstrm<unsigned char>(s);
}
///////////////////////////////////////////////////////////////////////////////
void write_v1v2_incref_u16(unsigned short*& p, unsigned short v1,
unsigned short v2)
{
write_v1v2_incref<unsigned short>(p, v1, v2);
}
void write_v1v2_incstrm_u16(OutputStream<unsigned short>& s, unsigned
short v1, unsigned short v2)
{
write_v1v2_incstrm<unsigned short>(s, v1, v2);
}
void write_v1v2_chkincstrm_u16(OutputStream<unsigned short>& s, unsigned
short v1, unsigned short v2)
{
write_v1v2_chkincstrm<unsigned short>(s, v1, v2);
}
unsigned short read_v1v2_incref_u16(const unsigned short*& p)
{
return read_v1v2_incref<unsigned short>(p);
}
unsigned short read_v1v2_incstrm_u16(InputStream<unsigned short>& s)
{
return read_v1v2_incstrm<unsigned short>(s);
}
///////////////////////////////////////////////////////////////////////////////
void dead_store_u8(unsigned char* p)
{
dead_store<unsigned char>(p);
}
void dead_store2_u8(unsigned char** p)
{
dead_store2<unsigned char>(p);
}
void dead_store3_u8(unsigned char*& p)
{
dead_store3<unsigned char>(p);
}
void dead_store4_u8(OutputStream<unsigned char>& s)
{
dead_store4<unsigned char>(s);
}
void dead_store5_u8(OutputStream<unsigned char>& s)
{
dead_store5<unsigned char>(s);
}
///////////////////////////////////////////////////////////////////////////////
void dead_store_u16(unsigned short* p)
{
dead_store<unsigned short>(p);
}
void dead_store2_u16(unsigned short** p)
{
dead_store2<unsigned short>(p);
}
void dead_store3_u16(unsigned short*& p)
{
dead_store3<unsigned short>(p);
}
void dead_store4_u16(OutputStream<unsigned short>& s)
{
dead_store4<unsigned short>(s);
}
void dead_store5_u16(OutputStream<unsigned short>& s)
{
dead_store5<unsigned short>(s);
}
///////////////////////////////////////////////////////////////////////////////
int main()
{
}