char*, aliasing and writing efficient code

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello,

recently, when writing a serializer for integers using variable length encoding, I noticed several inefficiencies in the generated code, the most severe being the reloading of the target ptr from the reference after each store. This lead to reading about all the aliasing rules, especially about char* aliasing _everything_. Now I understand why the compiler generated the code the way it did, but I'd like to know whether there is a way to tell it that 'I know what I'm doing and even though it's a char*, it doesn't alias anything' ?

This has a severe inpact on the generated code. If the dst ptr is passed via a reference (to be able to return the new value after a variable number of bytes were written), the ptr is reloaded after each store. This means the ptr is aliasing its own reference, which, for me sounds like stupid, but rules are rules. If I pass a stream object, with the dst and the end ptrs, then both will be reloaded after each store, moreover, the updates to the dst will not be amortized to a single update at the end.

Attached a little test case. Compiling with 4.8/4.9, -O3, the same functions for uint8 and uint16 are quite different, because for uint8, the 'unsigned char* aliases everything' rule kicks in. Adding __restrict__ to the ptr type doesn't help.

The issues in detail:

#1 dead stores are not eliminated:
(gdb) disassemble dead_store3_u8
Dump of assembler code for function dead_store3_u8(unsigned char*&):
   0x0000000000400710 <+0>:    mov    (%rdi),%rax
   0x0000000000400713 <+3>:    movb   $0x0,(%rax)
   0x0000000000400716 <+6>:    mov    (%rdi),%rax
   0x0000000000400719 <+9>:    movb   $0x1,(%rax)
   0x000000000040071c <+12>:    mov    (%rdi),%rax
   0x000000000040071f <+15>:    movb   $0x2,(%rax)
   0x0000000000400722 <+18>:    mov    (%rdi),%rax
   0x0000000000400725 <+21>:    movb   $0x3,(%rax)
   0x0000000000400728 <+24>:    retq

vs the 'short' version: only the last store is generated:
(gdb) disassemble dead_store3_u16
Dump of assembler code for function dead_store3_u16(unsigned short*&):
   0x00000000004007b0 <+0>:    mov    (%rdi),%rax
   0x00000000004007b3 <+3>:    mov    $0x3,%edx
   0x00000000004007b8 <+8>:    mov    %dx,(%rax)
   0x00000000004007bb <+11>:    retq

#2 reloading the variables
(gdb) disassemble write_v1v2_incstrm_u8
Dump of assembler code for function write_v1v2_incstrm_u8(OutputStream<unsigned char>&, unsigned char, unsigned char):
   0x0000000000400590 <+0>:    mov    0x8(%rdi),%rax
   0x0000000000400594 <+4>:    lea    0x1(%rax),%rcx
   0x0000000000400598 <+8>:    mov    %rcx,0x8(%rdi)
   0x000000000040059c <+12>:    mov    %sil,(%rax)
   0x000000000040059f <+15>:    mov    0x8(%rdi),%rax
   0x00000000004005a3 <+19>:    lea    0x1(%rax),%rcx
   0x00000000004005a7 <+23>:    mov    %rcx,0x8(%rdi)
   0x00000000004005ab <+27>:    mov    %dl,(%rax)
   0x00000000004005ad <+29>:    retq

with an end ptr check, the effect is more severe, as both members are reloaded, and cur is stored multiple times (+31, +48):
(gdb) disassemble write_v1v2_chkincstrm_u8
Dump of assembler code for function write_v1v2_chkincstrm_u8(OutputStream<unsigned char>&, unsigned char, unsigned char):
   0x00000000004005b0 <+0>:    mov    0x10(%rdi),%r8
   0x00000000004005b4 <+4>:    mov    0x8(%rdi),%rcx
   0x00000000004005b8 <+8>:    cmp    %rcx,%r8
0x00000000004005bb <+11>: jbe 0x4005c8 <write_v1v2_chkincstrm_u8(OutputStream<unsigned char>&, unsigned char, unsigned char)+24>
   0x00000000004005bd <+13>:    mov    %sil,(%rcx)
   0x00000000004005c0 <+16>:    mov    0x8(%rdi),%rcx
   0x00000000004005c4 <+20>:    mov    0x10(%rdi),%r8
   0x00000000004005c8 <+24>:    lea    0x1(%rcx),%rax
   0x00000000004005cc <+28>:    cmp    %r8,%rax
   0x00000000004005cf <+31>:    mov    %rax,0x8(%rdi)
0x00000000004005d3 <+35>: jae 0x4005dc <write_v1v2_chkincstrm_u8(OutputStream<unsigned char>&, unsigned char, unsigned char)+44>
   0x00000000004005d5 <+37>:    mov    %dl,0x1(%rcx)
   0x00000000004005d8 <+40>:    mov    0x8(%rdi),%rax
   0x00000000004005dc <+44>:    add    $0x1,%rax
   0x00000000004005e0 <+48>:    mov    %rax,0x8(%rdi)
   0x00000000004005e4 <+52>:    retq

vs the 'short' version:
(gdb) disassemble write_v1v2_incstrm_u16
Dump of assembler code for function write_v1v2_incstrm_u16(OutputStream<unsigned short>&, unsigned short, unsigned short):
   0x0000000000400650 <+0>:    mov    0x8(%rdi),%rax
   0x0000000000400654 <+4>:    lea    0x4(%rax),%rcx
   0x0000000000400658 <+8>:    mov    %si,(%rax)
   0x000000000040065b <+11>:    mov    %rcx,0x8(%rdi)
   0x000000000040065f <+15>:    mov    %dx,0x2(%rax)
   0x0000000000400663 <+19>:    retq
cur, end loaded only once, cur written only once, aggregating the two ++'s. this is fine.

(gdb) disassemble write_v1v2_chkincstrm_u16
Dump of assembler code for function write_v1v2_chkincstrm_u16(OutputStream<unsigned short>&, unsigned short, unsigned short):
   0x0000000000400670 <+0>:    mov    0x10(%rdi),%r8
   0x0000000000400674 <+4>:    mov    0x8(%rdi),%rax
   0x0000000000400678 <+8>:    cmp    %rax,%r8
0x000000000040067b <+11>: jbe 0x400680 <write_v1v2_chkincstrm_u16(OutputStream<unsigned short>&, unsigned short, unsigned short)+16>
   0x000000000040067d <+13>:    mov    %si,(%rax)
   0x0000000000400680 <+16>:    lea    0x2(%rax),%rcx
   0x0000000000400684 <+20>:    cmp    %rcx,%r8
   0x0000000000400687 <+23>:    mov    %rcx,0x8(%rdi)
0x000000000040068b <+27>: jbe 0x400691 <write_v1v2_chkincstrm_u16(OutputStream<unsigned short>&, unsigned short, unsigned short)+33>
   0x000000000040068d <+29>:    mov    %dx,0x2(%rax)
   0x0000000000400691 <+33>:    add    $0x4,%rax
   0x0000000000400695 <+37>:    mov    %rax,0x8(%rdi)
   0x0000000000400699 <+41>:    retq
No multiple loads, but cur is stored twice, at +23 and +37. Is this a bug?

My major question is, given the char* aliasing rules, how can one write efficient code (serializer, compressior, decompressor, etc) that writes bytes, without all the artifacts listed above? In my varint encoder, I dodged the multiple reloads by passing the ptrs as args, and using template metaprogramming to calculate the offsets and the increment. This worked for my specific case, but it's quite some effort and can't be applied everywhere. Is there a generic solution? An attribute? A builtin?

Thanks, Peter

----8<----8<----8<----
template<typename T>
struct OutputStream
{
    void Wr(T v)
    {
        *cur = v;
    }

    void WrInc(T v)
    {
        *cur++ = v;
    }

    void WrChk(T v)
    {
        if (cur < end) {
            *cur = v;
        }
    }

    void WrChkInc(T v)
    {
        if (cur < end) {
            *cur = v;
        }
        ++cur;
    }

private:
    T*        beg;
    T*        cur;
    T*        end;
};

template<typename T>
struct InputStream
{
    T Rd()
    {
        return *cur;
    }

    T RdInc()
    {
        return *cur++;
    }

    T RdChk()
    {
        return cur < end ? *cur : 0;
    }

    T RdChkInc()
    {
        T r = cur < end ? *cur : 0;
        ++cur;
        return r;
    }

private:
    const T*    beg;
    const T*    cur;
    const T*    end;
};
///////////////////////////////////////////////////////////////////////////////
template<typename T>
inline
void write_v_ptr(T* p, T v)
{
    *p = v;
}

template<typename T>
inline
T read_v_ptr(const T* p)
{
    return *p;
}
///////////////////////////////////////////////////////////////////////////////
template<typename T>
void write_v1v2_incref(T*& p, T v1, T v2)
{
    write_v_ptr(p, v1);
    ++p;
    write_v_ptr(p, v2);
    ++p;
}

template<typename T>
T read_v1v2_incref(const T*& p)
{
    T v1 = read_v_ptr(p);
    ++p;
    T v2 = read_v_ptr(p);
    ++p;
    return v1 + v2;
}
///////////////////////////////////////////////////////////////////////////////
template<typename T>
void write_v1v2_incstrm(OutputStream<T>& s, T v1, T v2)
{
    s.WrInc(v1);
    s.WrInc(v2);
}

template<typename T>
void write_v1v2_chkincstrm(OutputStream<T>& s, T v1, T v2)
{
    s.WrChkInc(v1);
    s.WrChkInc(v2);
}

template<typename T>
T read_v1v2_incstrm(InputStream<T>& s)
{
    T v1 = s.RdInc();
    T v2 = s.RdInc();
    return v1 + v2;
}

template<typename T>
T read_v1v2_chkincstrm(InputStream<T>& s)
{
    T v1 = s.RdChkInc();
    T v2 = s.RdChkInc();
    return v1 + v2;
}
///////////////////////////////////////////////////////////////////////////////
template<typename T>
void dead_store(T* p)
{
    *p = 0;
    *p = 1;
    *p = 2;
    *p = 3;
}

template<typename T>
void dead_store2(T** p)
{
    **p = 0;
    **p = 1;
    **p = 2;
    **p = 3;
}

template<typename T>
void dead_store3(T*& __restrict__ p)
{
    *p = 0;
    *p = 1;
    *p = 2;
    *p = 3;
}

template<typename T>
void dead_store4(OutputStream<T>& s)
{
    s.Wr(0);
    s.Wr(1);
    s.Wr(2);
    s.Wr(3);
}

template<typename T>
void dead_store5(OutputStream<T>& s)
{
    s.WrChk(0);
    s.WrChk(1);
    s.WrChk(2);
    s.WrChk(3);
}
///////////////////////////////////////////////////////////////////////////////
void write_v1v2_incref_u8(unsigned char*& p, unsigned char v1, unsigned char v2)
{
    write_v1v2_incref<unsigned char>(p, v1, v2);
}

void write_v1v2_incstrm_u8(OutputStream<unsigned char>& s, unsigned char v1, unsigned char v2)
{
    write_v1v2_incstrm<unsigned char>(s, v1, v2);
}

void write_v1v2_chkincstrm_u8(OutputStream<unsigned char>& s, unsigned char v1, unsigned char v2)
{
    write_v1v2_chkincstrm<unsigned char>(s, v1, v2);
}

unsigned char read_v1v2_incref_u8(const unsigned char*& p)
{
    return read_v1v2_incref<unsigned char>(p);
}

unsigned char read_v1v2_incstrm_u8(InputStream<unsigned char>& s)
{
    return read_v1v2_incstrm<unsigned char>(s);
}
///////////////////////////////////////////////////////////////////////////////
void write_v1v2_incref_u16(unsigned short*& p, unsigned short v1, unsigned short v2)
{
    write_v1v2_incref<unsigned short>(p, v1, v2);
}

void write_v1v2_incstrm_u16(OutputStream<unsigned short>& s, unsigned short v1, unsigned short v2)
{
    write_v1v2_incstrm<unsigned short>(s, v1, v2);
}

void write_v1v2_chkincstrm_u16(OutputStream<unsigned short>& s, unsigned short v1, unsigned short v2)
{
    write_v1v2_chkincstrm<unsigned short>(s, v1, v2);
}

unsigned short read_v1v2_incref_u16(const unsigned short*& p)
{
    return read_v1v2_incref<unsigned short>(p);
}

unsigned short read_v1v2_incstrm_u16(InputStream<unsigned short>& s)
{
    return read_v1v2_incstrm<unsigned short>(s);
}
///////////////////////////////////////////////////////////////////////////////
void dead_store_u8(unsigned char* p)
{
    dead_store<unsigned char>(p);
}

void dead_store2_u8(unsigned char** p)
{
    dead_store2<unsigned char>(p);
}

void dead_store3_u8(unsigned char*& p)
{
    dead_store3<unsigned char>(p);
}

void dead_store4_u8(OutputStream<unsigned char>& s)
{
    dead_store4<unsigned char>(s);
}

void dead_store5_u8(OutputStream<unsigned char>& s)
{
    dead_store5<unsigned char>(s);
}
///////////////////////////////////////////////////////////////////////////////
void dead_store_u16(unsigned short* p)
{
    dead_store<unsigned short>(p);
}

void dead_store2_u16(unsigned short** p)
{
    dead_store2<unsigned short>(p);
}

void dead_store3_u16(unsigned short*& p)
{
    dead_store3<unsigned short>(p);
}

void dead_store4_u16(OutputStream<unsigned short>& s)
{
    dead_store4<unsigned short>(s);
}

void dead_store5_u16(OutputStream<unsigned short>& s)
{
    dead_store5<unsigned short>(s);
}
///////////////////////////////////////////////////////////////////////////////

int main()
{
}





[Index of Archives]     [Linux C Programming]     [Linux Kernel]     [eCos]     [Fedora Development]     [Fedora Announce]     [Autoconf]     [The DWARVES Debugging Tools]     [Yosemite Campsites]     [Yosemite News]     [Linux GCC]

  Powered by Linux