Empty destructor definition disables optimization.

Juan Cabrera <jjcp.91@xxxxxxxxx> · Tue, 20 Dec 2016 22:17:46 -0300

Hello,

I was reading the assembly output for a specific use case of a custom
optional<T> implementation I'm writing to see if the `optional<T>` abstraction
would incur more overhead than what I was expecting.

This is the smallest and simplest sample code I came up with that illustrates
the problem I'm having:

    #include <exception>

    // Minimal stand-in for the custom optional<T>: an int payload plus a flag
    // that get() checks before handing the payload out.
    class M
    {
    public:
        int  value;  // payload; get() returns it only when `valid` is true
        bool valid;  // tested by get() and by the caller in main()

        // User-provided, empty destructor. This is the line the whole example
        // turns on: with it present, the -O3 output re-tests `valid` before
        // each later get(); with it removed, those re-tests disappear.
        // NOTE(review): a user-provided destructor makes M non-trivially
        // destructible, which presumably is why the listings show func_1()
        // returning M through memory at [rsp] rather than in rax -- confirm
        // against the platform C++ ABI.
       ~M() { }

        // Returns `value` when `valid` is set; otherwise calls
        // std::terminate() instead of returning.
        int get() const
        {
            if (valid)
                return value;
            std::terminate();
        }
    };

    M    func_1();
    void func_2(int a);

    // Reproduction case: obtains an M from func_1() and, when it is valid,
    // passes its value to func_2() in three separate statements.
    int main(int, char**)
    {
        const auto i = func_1();
        if (i.valid)
        {
            // Every get() tests `valid` again internally. With ~M()
            // user-provided, the -O3 output keeps that test before the second
            // and third call even though this `if` already established it.
            func_2(i.get());
            func_2(i.get());
            func_2(i.get());
        }

        return 0;
    }

Now the issue is that the compiler checks `i.valid` to be true on each call to
`i.get()` even though the enclosing `if` statement has already done so.

This is the assembly output from g++ 6.2.1 (compiled with -O3)

    main:
            sub     rsp, 24
            mov     rdi, rsp
            call    func_1()
            cmp     BYTE PTR [rsp+4], 0
            jne     .L7
    .L2:
            xor     eax, eax
            add     rsp, 24
            ret
    .L7:
            mov     edi, DWORD PTR [rsp]
            call    func_2(int)
            cmp     BYTE PTR [rsp+4], 0
            mov     edi, DWORD PTR [rsp]
            je      .L3
            call    func_2(int)
            cmp     BYTE PTR [rsp+4], 0
            mov     edi, DWORD PTR [rsp]
            je      .L3
            call    func_2(int)
            jmp     .L2
    .L3:
            call    std::terminate()

The following slight modification produces better code:

    // class M ....

    M    func_1();
    void func_2(...);

    // Variant of the reproduction: the three get() results are passed to the
    // variadic func_2(...) in a single call instead of three separate ones.
    int main(int, char**)
    {
        const auto i = func_1();
        if (i.valid)
        {
            // All three get() calls are evaluated before func_2 runs; the -O3
            // output for this form tests `valid` only once.
            func_2(i.get(), i.get(), i.get());
        }

        return 0;
    }

--> (compiled with -O3)

    main:
            sub     rsp, 24
            mov     rdi, rsp
            call    func_1()
            cmp     BYTE PTR [rsp+4], 0
            je      .L2
            mov     edi, DWORD PTR [rsp]
            xor     eax, eax
            mov     edx, edi
            mov     esi, edi
            call    func_2(...)
    .L2:
            xor     eax, eax
            add     rsp, 24
            ret

After doing a bunch of tests I found out that by removing the destructor
definition from `M` the generated code for the first code example becomes:

    main:
            push    rbx
            call    func_1()
            mov     rbx, rax
            shr     rax, 32
            test    al, al
            je      .L2
            mov     edi, ebx
            call    func_2(int)
            mov     edi, ebx
            call    func_2(int)
            mov     edi, ebx
            call    func_2(int)
    .L2:
            xor     eax, eax
            pop     rbx
            ret

That's basically what I was expecting.
Of course removing the destructor definition is not an option for the real
code I'm working on (the `value` is inside a union, etc).

Why is the compiler not able to optimize those comparisons away?
clang++ also generates basically the same code as g++ in all cases which makes
me think I must be overlooking something important there.

Thanks a lot!
Juan.